red-candle 1.1.0 → 1.1.2
- checksums.yaml +4 -4
- data/README.md +65 -1
- data/Rakefile +40 -0
- data/ext/candle/src/llm/constrained_generation_test.rs +199 -6
- data/ext/candle/src/llm/gemma.rs +21 -5
- data/ext/candle/src/llm/generation_config.rs +6 -0
- data/ext/candle/src/llm/llama.rs +21 -5
- data/ext/candle/src/llm/mistral.rs +21 -5
- data/ext/candle/src/llm/phi.rs +21 -5
- data/ext/candle/src/llm/quantized_gguf.rs +35 -6
- data/ext/candle/src/llm/qwen.rs +21 -5
- data/ext/candle/src/llm/text_generation.rs +121 -28
- data/ext/candle/src/ner.rs +25 -51
- data/ext/candle/src/reranker.rs +41 -68
- data/ext/candle/src/ruby/device.rs +2 -1
- data/ext/candle/src/ruby/dtype.rs +1 -0
- data/ext/candle/src/ruby/errors.rs +1 -0
- data/ext/candle/src/ruby/llm.rs +81 -55
- data/ext/candle/src/ruby/tensor.rs +2 -1
- data/ext/candle/src/tokenizer/mod.rs +2 -1
- data/ext/candle/tests/device_tests.rs +43 -0
- data/ext/candle/tests/tensor_tests.rs +162 -0
- data/lib/candle/llm.rb +129 -34
- data/lib/candle/version.rb +1 -1
- metadata +4 -2
data/ext/candle/src/llm/quantized_gguf.rs
CHANGED
@@ -32,6 +32,10 @@ enum ModelType {
 }
 
 impl QuantizedGGUF {
+    pub fn eos_token_id(&self) -> u32 {
+        self.eos_token_id
+    }
+
     /// Get the tokenizer
     pub fn tokenizer(&self) -> &TokenizerWrapper {
         &self.tokenizer
@@ -316,7 +320,9 @@ impl QuantizedGGUF {
         // Check model name since Mistral GGUF reports as llama architecture
         let model_lower = self.model_id.to_lowercase();
 
-        if model_lower.contains("mistral") {
+        if model_lower.contains("tinyllama") {
+            self.apply_chatml_template(messages)
+        } else if model_lower.contains("mistral") {
             self.apply_mistral_template(messages)
         } else if model_lower.contains("gemma") {
             // Always use Gemma template for Gemma models, regardless of loader used
@@ -512,6 +518,20 @@ impl QuantizedGGUF {
         Ok(prompt)
     }
 
+    fn apply_chatml_template(&self, messages: &[serde_json::Value]) -> CandleResult<String> {
+        let mut prompt = String::new();
+
+        for message in messages {
+            let role = message["role"].as_str().unwrap_or("");
+            let content = message["content"].as_str().unwrap_or("");
+
+            prompt.push_str(&format!("<|{}|>\n{}</s>\n", role, content));
+        }
+
+        prompt.push_str("<|assistant|>");
+        Ok(prompt)
+    }
+
     fn apply_generic_template(&self, messages: &[serde_json::Value]) -> String {
         let mut prompt = String::new();
 
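For reference, a standalone sketch of what the new ChatML-style template produces. The helper mirrors `apply_chatml_template` above, minus the `CandleResult` wrapper; the sample messages are illustrative:

```rust
use serde_json::json;

// Mirrors the template logic from the hunk above.
fn chatml_prompt(messages: &[serde_json::Value]) -> String {
    let mut prompt = String::new();
    for message in messages {
        let role = message["role"].as_str().unwrap_or("");
        let content = message["content"].as_str().unwrap_or("");
        prompt.push_str(&format!("<|{}|>\n{}</s>\n", role, content));
    }
    prompt.push_str("<|assistant|>");
    prompt
}

fn main() {
    let messages = [
        json!({"role": "system", "content": "You are helpful."}),
        json!({"role": "user", "content": "Hello!"}),
    ];
    // Prints:
    // <|system|>
    // You are helpful.</s>
    // <|user|>
    // Hello!</s>
    // <|assistant|>
    println!("{}", chatml_prompt(&messages));
}
```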
@@ -538,7 +558,7 @@ impl QuantizedGGUF {
         config: &GenerationConfig,
         mut callback: Option<impl FnMut(&str)>,
     ) -> CandleResult<Vec<u32>> {
-        let mut text_gen = TextGeneration::from_config(config);
+        let mut text_gen = TextGeneration::new(config);
         text_gen.set_eos_token_id(self.eos_token_id);
         text_gen.set_tokens(prompt_tokens.clone());
 
@@ -571,10 +591,7 @@ impl QuantizedGGUF {
 
             let logits = logits.to_dtype(DType::F32)?;
 
-            let next_token = text_gen.sample_next_token(
-                &logits,
-                Some((config.repetition_penalty, config.repetition_penalty_last_n)),
-            )?;
+            let next_token = text_gen.sample_next_token(&logits)?;
 
             all_tokens.push(next_token);
 
@@ -596,6 +613,18 @@ impl QuantizedGGUF {
                 break;
             }
 
+            // Check if constraint is satisfied (early stopping)
+            if config.stop_on_constraint_satisfaction {
+                let satisfied = if config.stop_on_match {
+                    text_gen.is_constraint_satisfied_stop_on_match()
+                } else {
+                    text_gen.is_constraint_satisfied()
+                };
+                if satisfied {
+                    break;
+                }
+            }
+
             // Check stop sequences
             let generated_text = self.tokenizer.decode(&all_tokens[start_gen..], true)?;
             if text_gen.check_stop_sequences(&generated_text, &config.stop_sequences) {
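This early-stopping block is inserted into each model's generation loop; qwen.rs below receives the identical hunk, and the matching +21/-5 counts for gemma.rs, llama.rs, mistral.rs, and phi.rs in the file list suggest the same change there. Condensed, the decision it implements is:

```rust
// Self-contained restatement of the early-stop check; the flag names come
// from GenerationConfig in this diff.
fn early_stop(
    stop_on_constraint_satisfaction: bool,
    stop_on_match: bool,
    satisfied: bool,
    satisfied_on_match: bool,
) -> bool {
    stop_on_constraint_satisfaction
        && if stop_on_match { satisfied_on_match } else { satisfied }
}

fn main() {
    // With the feature flag off, generation never stops early.
    assert!(!early_stop(false, true, true, true));
    // With it on, stop_on_match selects which satisfaction check applies.
    assert!(early_stop(true, false, true, false));
}
```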
data/ext/candle/src/llm/qwen.rs
CHANGED
@@ -16,6 +16,10 @@ pub struct Qwen {
 }
 
 impl Qwen {
+    pub fn eos_token_id(&self) -> u32 {
+        self.eos_token_id
+    }
+
     /// Get the tokenizer
     pub fn tokenizer(&self) -> &TokenizerWrapper {
         &self.tokenizer
@@ -55,6 +59,9 @@ impl Qwen {
             .unwrap_or(151643); // Default Qwen3 EOS token
 
         // Download model weights
+        // NOTE: Qwen uses hardcoded shard counts based on model size rather than
+        // reading model.safetensors.index.json. This works for official Qwen models
+        // but may fail for custom configurations with different shard counts.
         let mut filenames = vec![];
         let num_shards = if model_id.contains("72b") || model_id.contains("72B") { 8 }
         else if model_id.contains("14b") || model_id.contains("14B") { 3 }
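The scheme that NOTE describes boils down to mapping the model id to a shard count and deriving filenames from it. A sketch, assuming the usual `model-XXXXX-of-XXXXX.safetensors` convention; the branches beyond 72B and 14B are cut off in this hunk, so the single-shard fallback here is an assumption:

```rust
// Hypothetical condensation of the shard selection shown above.
fn shard_filenames(model_id: &str) -> Vec<String> {
    let id = model_id.to_lowercase();
    let num_shards = if id.contains("72b") { 8 }
        else if id.contains("14b") { 3 }
        else { 1 }; // assumed fallback; the real chain has more branches
    if num_shards == 1 {
        vec!["model.safetensors".to_string()]
    } else {
        (1..=num_shards)
            .map(|i| format!("model-{:05}-of-{:05}.safetensors", i, num_shards))
            .collect()
    }
}

fn main() {
    assert_eq!(shard_filenames("Qwen/Qwen1.5-14B").len(), 3);
}
```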
@@ -124,7 +131,7 @@ impl Qwen {
         config: &GenerationConfig,
         mut callback: Option<impl FnMut(&str)>,
     ) -> CandleResult<Vec<u32>> {
-        let mut text_gen = TextGeneration::from_config(config);
+        let mut text_gen = TextGeneration::new(config);
         text_gen.set_eos_token_id(self.eos_token_id);
         text_gen.set_tokens(prompt_tokens.clone());
 
@@ -150,10 +157,7 @@ impl Qwen {
 
             let logits = logits.to_dtype(DType::F32)?;
 
-            let next_token = text_gen.sample_next_token(
-                &logits,
-                Some((config.repetition_penalty, config.repetition_penalty_last_n)),
-            )?;
+            let next_token = text_gen.sample_next_token(&logits)?;
 
             all_tokens.push(next_token);
 
@@ -173,6 +177,18 @@ impl Qwen {
                 break;
             }
 
+            // Check if constraint is satisfied (early stopping)
+            if config.stop_on_constraint_satisfaction {
+                let satisfied = if config.stop_on_match {
+                    text_gen.is_constraint_satisfied_stop_on_match()
+                } else {
+                    text_gen.is_constraint_satisfied()
+                };
+                if satisfied {
+                    break;
+                }
+            }
+
             // Check stop sequences
             let generated_text = self.tokenizer.decode(&all_tokens[start_gen..], true)?;
             if text_gen.check_stop_sequences(&generated_text, &config.stop_sequences) {
data/ext/candle/src/llm/text_generation.rs
CHANGED
@@ -10,39 +10,29 @@ pub struct TextGeneration {
     logits_processor: LogitsProcessor,
     tokens: Vec<u32>,
     eos_token_id: Option<u32>,
+    repetition_penalty: f32,
+    repetition_penalty_last_n: usize,
     constraint: Option<Arc<Index>>,
     constraint_state: Option<u32>,
+    constraint_completed: bool,
+    tokens_since_constraint_start: usize,
 }
 
 impl TextGeneration {
-    pub fn new(
-        seed: u64,
-        temperature: Option<f64>,
-        top_p: Option<f64>,
-        _top_k: Option<usize>,
-        _repetition_penalty: f32,
-        _repetition_penalty_last_n: usize,
-    ) -> Self {
-        let logits_processor = LogitsProcessor::new(seed, temperature, top_p);
-
-        Self {
+    pub fn new(config: &GenerationConfig) -> Self {
+        let logits_processor = LogitsProcessor::new(config.seed, Some(config.temperature), config.top_p);
+
+        let mut text_gen = Self {
             logits_processor,
             tokens: Vec::new(),
             eos_token_id: None,
+            repetition_penalty: config.repetition_penalty,
+            repetition_penalty_last_n: config.repetition_penalty_last_n,
             constraint: None,
             constraint_state: None,
-        }
-    }
-
-    pub fn from_config(config: &GenerationConfig) -> Self {
-        let mut text_gen = Self::new(
-            config.seed,
-            Some(config.temperature),
-            config.top_p,
-            config.top_k,
-            config.repetition_penalty,
-            config.repetition_penalty_last_n,
-        );
+            constraint_completed: false,
+            tokens_since_constraint_start: 0,
+        };
 
         // Set constraint if provided
         if let Some(ref constraint) = config.constraint {
@@ -72,6 +62,8 @@ impl TextGeneration {
         // Initialize with the first state
         self.constraint_state = Some(constraint.initial_state());
         self.constraint = Some(constraint);
+        self.constraint_completed = false;
+        self.tokens_since_constraint_start = self.tokens.len();
     }
 
     /// Apply constraints to logits by masking disallowed tokens
@@ -137,13 +129,12 @@ impl TextGeneration {
     pub fn sample_next_token(
         &mut self,
         logits: &Tensor,
-        repetition_penalty: Option<(f32, usize)>,
     ) -> CandleResult<u32> {
         let mut logits = logits.clone();
 
-        // Apply repetition penalty
-        if let Some((penalty, last_n)) = repetition_penalty {
-            self.apply_repetition_penalty(&mut logits, penalty, last_n)?;
+        // Apply repetition penalty using stored parameters
+        if self.repetition_penalty != 1.0 {
+            self.apply_repetition_penalty(&mut logits, self.repetition_penalty, self.repetition_penalty_last_n)?;
         }
 
         // Apply constraints if active
@@ -155,12 +146,114 @@ impl TextGeneration {
 
         // Update constraint state if active
         if let (Some(ref constraint_index), Some(current_state)) = (&self.constraint, self.constraint_state) {
-            self.constraint_state = constraint_index.next_state(&current_state, &next_token);
+            // Get the next state
+            let next_state = constraint_index.next_state(&current_state, &next_token);
+
+            // Check if we're transitioning to a state with no allowed tokens (completion)
+            if !self.constraint_completed && self.tokens.len() > self.tokens_since_constraint_start {
+                // Check if we've transitioned from a constrained state to an unconstrained state
+                // This happens when the pattern is complete and the FSM allows "anything"
+
+                let current_constrained = if let Some(allowed) = constraint_index.allowed_tokens(&current_state) {
+                    // Consider it constrained if we have a limited set of allowed tokens
+                    allowed.len() < 1000 // Arbitrary threshold for "constrained"
+                } else {
+                    true // No tokens allowed is definitely constrained
+                };
+
+                let next_constrained = if let Some(next_state_val) = next_state {
+                    if let Some(allowed) = constraint_index.allowed_tokens(&next_state_val) {
+                        allowed.is_empty() || allowed.len() < 1000
+                    } else {
+                        true
+                    }
+                } else {
+                    true
+                };
+
+                // If we're transitioning from constrained to unconstrained, we've completed the pattern
+                if current_constrained && !next_constrained {
+                    self.constraint_completed = true;
+                }
+
+                // Also check if next state has no allowed tokens at all
+                if let Some(next_state_val) = next_state {
+                    if let Some(allowed) = constraint_index.allowed_tokens(&next_state_val) {
+                        if allowed.is_empty() {
+                            self.constraint_completed = true;
+                        }
+                    } else {
+                        // None means no tokens allowed - constraint is complete
+                        self.constraint_completed = true;
+                    }
+                }
+            }
+
+            self.constraint_state = next_state;
         }
 
         Ok(next_token)
     }
 
+    /// Check if the constraint is satisfied (reached a valid completion state)
+    pub fn is_constraint_satisfied(&self) -> bool {
+        // If we've explicitly marked the constraint as completed, return true
+        if self.constraint_completed {
+            return true;
+        }
+
+        // Also check the current state
+        if let (Some(ref constraint_index), Some(state)) = (&self.constraint, self.constraint_state) {
+            // Check if the constraint has reached a state where it could validly end
+            // This happens when:
+            // 1. We have no more allowed tokens (constraint fully satisfied)
+            // 2. The EOS token is in the allowed tokens (optional ending)
+            if let Some(allowed) = constraint_index.allowed_tokens(&state) {
+                // If no tokens are allowed, the constraint is fully satisfied
+                if allowed.is_empty() {
+                    return true;
+                }
+
+                // If EOS token is allowed, we've reached an optional completion point
+                if let Some(eos) = self.eos_token_id {
+                    if allowed.contains(&eos) {
+                        return true;
+                    }
+                }
+            } else {
+                // None means no tokens allowed - constraint is satisfied
+                return true;
+            }
+        }
+        false
+    }
+
+    /// Check if the constraint is satisfied when stop_on_match is true
+    pub fn is_constraint_satisfied_stop_on_match(&self) -> bool {
+        // When stop_on_match is true, we stop as soon as the constraint is completed
+        if self.constraint_completed {
+            return true;
+        }
+
+        // Also check if we're currently in a state that could be a valid end
+        // This is important for patterns like phone numbers where after matching
+        // the pattern, the FSM might allow any token (including more numbers)
+        if let (Some(ref constraint_index), Some(state)) = (&self.constraint, self.constraint_state) {
+            // Check if we've generated at least one token since constraint start
+            if self.tokens.len() > self.tokens_since_constraint_start {
+                if let Some(allowed) = constraint_index.allowed_tokens(&state) {
+                    // If the allowed tokens set is very large (unconstrained),
+                    // it means the pattern has been satisfied
+                    if allowed.len() > 1000 {
+                        return true;
+                    }
+                }
+            }
+        }
+
+        false
+    }
+
     /// Check if we should stop generation
     pub fn should_stop(&self, token: u32, max_length: usize) -> bool {
         if self.tokens.len() >= max_length {
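Both checks lean on one heuristic: a state whose allowed-token set is small (the arbitrary 1000-token threshold) is treated as still inside the pattern, and completion is a constrained-to-unconstrained transition or an empty allowed set. A self-contained toy of that rule, with illustrative token sets rather than real outlines-core states:

```rust
// Toy restatement of the completion heuristic; `None` stands for a state
// with no allowed tokens at all.
fn is_constrained(allowed: Option<&[u32]>) -> bool {
    match allowed {
        Some(tokens) => tokens.is_empty() || tokens.len() < 1000,
        None => true,
    }
}

// The pattern counts as complete on a constrained -> unconstrained
// transition, or when the next state allows nothing.
fn completes(current: Option<&[u32]>, next: Option<&[u32]>) -> bool {
    let next_empty = next.map_or(true, |t| t.is_empty());
    (is_constrained(current) && !is_constrained(next)) || next_empty
}

fn main() {
    let big: Vec<u32> = (0..50_000).collect();
    let mid_pattern = Some(&[17u32, 42][..]); // few continuations: constrained
    let after_match = Some(&big[..]);         // huge set: effectively unconstrained
    assert!(completes(mid_pattern, after_match)); // pattern just finished
    assert!(!completes(after_match, after_match));
    assert!(completes(mid_pattern, None));        // dead end: complete
}
```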
data/ext/candle/src/ner.rs
CHANGED
@@ -39,13 +39,9 @@ impl NER {
     pub fn new(model_id: String, device: Option<Device>, tokenizer_id: Option<String>) -> Result<Self> {
         let device = device.unwrap_or(Device::Cpu).as_device()?;
 
-
-        let device_clone = device.clone();
-        let model_id_clone = model_id.clone();
-
-        let handle = std::thread::spawn(move || -> std::result::Result<(BertModel, TokenizerWrapper, Linear, NERConfig), Box<dyn std::error::Error + Send + Sync>> {
+        let result = (|| -> std::result::Result<(BertModel, TokenizerWrapper, Linear, NERConfig), Box<dyn std::error::Error + Send + Sync>> {
             let api = Api::new()?;
-            let repo = api.repo(Repo::new(model_id_clone, RepoType::Model));
+            let repo = api.repo(Repo::new(model_id.clone(), RepoType::Model));
 
             // Download model files
             let config_filename = repo.get("config.json")?;
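This hunk (and the matching one in reranker.rs below) replaces a spawned loader thread with an immediately-invoked closure: `?` still propagates errors over a boxed error type, but failures surface as an `Err` value instead of a thread panic and join. A minimal standalone sketch of the pattern:

```rust
// The closure runs on the spot; `?` propagates any error into `result`
// instead of unwinding across a thread boundary.
fn load() -> Result<u32, String> {
    let result = (|| -> Result<u32, Box<dyn std::error::Error + Send + Sync>> {
        let parsed: u32 = "42".parse()?; // fallible steps read naturally
        Ok(parsed)
    })();

    match result {
        Ok(value) => Ok(value),
        Err(e) => Err(format!("Failed to load: {}", e)),
    }
}

fn main() {
    assert_eq!(load(), Ok(42));
}
```

This also removes the second failure mode the old code had to handle (a panicked thread), which is why both files drop their "Thread panicked while loading" error arms below.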
@@ -92,7 +88,7 @@ impl NER {
 
         // Load model weights
         let vb = unsafe {
-            VarBuilder::from_mmaped_safetensors(&[weights_filename], DType::F32, &device_clone)?
+            VarBuilder::from_mmaped_safetensors(&[weights_filename], DType::F32, &device)?
         };
 
         // Load BERT model
@@ -106,10 +102,10 @@ impl NER {
             )?;
 
             Ok((model, TokenizerWrapper::new(tokenizer), classifier, ner_config))
-        });
+        })();
 
-        match handle.join() {
-            Ok(Ok((model, tokenizer, classifier, config))) => {
+        match result {
+            Ok((model, tokenizer, classifier, config)) => {
                 Ok(Self {
                     model,
                     tokenizer,
@@ -119,28 +115,20 @@ impl NER {
                     model_id,
                 })
             }
-            Ok(Err(e)) => Err(Error::new(
+            Err(e) => Err(Error::new(
                 magnus::exception::runtime_error(),
                 format!("Failed to load NER model: {}", e)
             )),
-            Err(_) => Err(Error::new(
-                magnus::exception::runtime_error(),
-                "Thread panicked while loading NER model"
-            )),
         }
     }
 
-    /// Extract entities from text with confidence scores
-    pub fn extract_entities(&self, text: String, confidence_threshold: Option<f64>) -> Result<RArray> {
-        let threshold = confidence_threshold.unwrap_or(0.9) as f32;
-
+    /// Common tokenization and prediction logic
+    fn tokenize_and_predict(&self, text: &str) -> Result<(tokenizers::Encoding, Vec<Vec<f32>>)> {
         // Tokenize the text
-        let encoding = self.tokenizer.inner().encode(text.as_str(), true)
+        let encoding = self.tokenizer.inner().encode(text, true)
             .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Tokenization failed: {}", e)))?;
 
         let token_ids = encoding.get_ids();
-        let tokens = encoding.get_tokens();
-        let offsets = encoding.get_offsets();
 
         // Convert to tensors
         let input_ids = Tensor::new(token_ids, &self.device)
@@ -171,6 +159,19 @@ impl NER {
             .to_vec2()
             .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
 
+        Ok((encoding, probs_vec))
+    }
+
+    /// Extract entities from text with confidence scores
+    pub fn extract_entities(&self, text: String, confidence_threshold: Option<f64>) -> Result<RArray> {
+        let threshold = confidence_threshold.unwrap_or(0.9) as f32;
+
+        // Use common tokenization and prediction logic
+        let (encoding, probs_vec) = self.tokenize_and_predict(&text)?;
+
+        let tokens = encoding.get_tokens();
+        let offsets = encoding.get_offsets();
+
         // Extract entities with BIO decoding
         let entities = self.decode_entities(
             &text,
@@ -199,38 +200,11 @@ impl NER {
 
     /// Get token-level predictions with labels and confidence scores
     pub fn predict_tokens(&self, text: String) -> Result<RArray> {
-        // Tokenize the text
-        let encoding = self.tokenizer.inner().encode(text.as_str(), true)
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Tokenization failed: {}", e)))?;
+        // Use common tokenization and prediction logic
+        let (encoding, probs_vec) = self.tokenize_and_predict(&text)?;
 
-        let token_ids = encoding.get_ids();
         let tokens = encoding.get_tokens();
 
-        // Convert to tensors
-        let input_ids = Tensor::new(token_ids, &self.device)
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?
-            .unsqueeze(0)
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
-
-        let attention_mask = Tensor::ones_like(&input_ids)
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
-        let token_type_ids = Tensor::zeros_like(&input_ids)
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
-
-        // Forward pass
-        let output = self.model.forward(&input_ids, &token_type_ids, Some(&attention_mask))
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
-        let logits = self.classifier.forward(&output)
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
-        let probs = candle_nn::ops::softmax(&logits, 2)
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
-
-        // Get predictions
-        let probs_vec: Vec<Vec<f32>> = probs.squeeze(0)
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?
-            .to_vec2()
-            .map_err(|e| Error::new(magnus::exception::runtime_error(), e.to_string()))?;
-
         // Build result array
         let result = RArray::new();
         for (i, (token, probs)) in tokens.iter().zip(probs_vec.iter()).enumerate() {
data/ext/candle/src/reranker.rs
CHANGED
@@ -4,7 +4,6 @@ use candle_core::{Device as CoreDevice, Tensor, IndexOp, DType};
 use candle_nn::{VarBuilder, Linear, Module, ops::sigmoid};
 use hf_hub::{api::sync::Api, Repo, RepoType};
 use tokenizers::{EncodeInput, Tokenizer};
-use std::thread;
 use crate::ruby::{Device, Result};
 use crate::tokenizer::{TokenizerWrapper, loader::TokenizerLoader};
 
@@ -24,8 +23,7 @@ impl Reranker {
     }
 
     fn new_with_core_device(model_id: String, device: CoreDevice) -> std::result::Result<Self, Error> {
-        let device_clone = device.clone();
-        let handle = thread::spawn(move || -> std::result::Result<(BertModel, TokenizerWrapper, Linear, Linear), Box<dyn std::error::Error + Send + Sync>> {
+        let result = (|| -> std::result::Result<(BertModel, TokenizerWrapper, Linear, Linear), Box<dyn std::error::Error + Send + Sync>> {
             let api = Api::new()?;
             let repo = api.repo(Repo::new(model_id.clone(), RepoType::Model));
 
@@ -44,7 +42,7 @@ impl Reranker {
 
         // Load model weights
         let vb = unsafe {
-            VarBuilder::from_mmaped_safetensors(&[weights_filename], DType::F32, &device_clone)?
+            VarBuilder::from_mmaped_safetensors(&[weights_filename], DType::F32, &device)?
         };
 
         // Load BERT model
@@ -57,17 +55,49 @@ impl Reranker {
         let classifier = candle_nn::linear(config.hidden_size, 1, vb.pp("classifier"))?;
 
         Ok((model, TokenizerWrapper::new(tokenizer), pooler, classifier))
-        });
+        })();
 
-        match handle.join() {
-            Ok(Ok((model, tokenizer, pooler, classifier))) => {
+        match result {
+            Ok((model, tokenizer, pooler, classifier)) => {
                 Ok(Self { model, tokenizer, pooler, classifier, device })
             }
-            Ok(Err(e)) => Err(Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e))),
-            Err(_) => Err(Error::new(magnus::exception::runtime_error(), "Thread panicked while loading model")),
+            Err(e) => Err(Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e))),
         }
     }
 
+    /// Extract CLS embeddings from the model output, handling Metal device workarounds
+    fn extract_cls_embeddings(&self, embeddings: &Tensor) -> std::result::Result<Tensor, Error> {
+        let cls_embeddings = if self.device.is_metal() {
+            // Metal has issues with tensor indexing, use a different approach
+            let (batch_size, seq_len, hidden_size) = embeddings.dims3()
+                .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to get dims: {}", e)))?;
+
+            // Reshape to [batch * seq_len, hidden] then take first hidden vectors for each batch
+            let reshaped = embeddings.reshape((batch_size * seq_len, hidden_size))
+                .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to reshape: {}", e)))?;
+
+            // Extract CLS tokens (first token of each sequence)
+            let mut cls_vecs = Vec::new();
+            for i in 0..batch_size {
+                let start_idx = i * seq_len;
+                let cls_vec = reshaped.narrow(0, start_idx, 1)
+                    .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to extract CLS: {}", e)))?;
+                cls_vecs.push(cls_vec);
+            }
+
+            // Stack the CLS vectors
+            Tensor::cat(&cls_vecs, 0)
+                .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to cat CLS tokens: {}", e)))?
+        } else {
+            embeddings.i((.., 0))
+                .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to extract CLS token: {}", e)))?
+        };
+
+        // Ensure tensor is contiguous for downstream operations
+        cls_embeddings.contiguous()
+            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to make CLS embeddings contiguous: {}", e)))
+    }
+
     pub fn debug_tokenization(&self, query: String, document: String) -> std::result::Result<magnus::RHash, Error> {
         // Create query-document pair for cross-encoder
         let query_doc_pair: EncodeInput = (query.clone(), document.clone()).into();
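A runnable sketch (assuming a candle-core dependency) showing that the two paths inside `extract_cls_embeddings` agree: plain indexing via `i((.., 0))`, and the reshape/narrow/cat workaround used on Metal. Both produce a `[batch, hidden]` tensor of first-token embeddings:

```rust
use candle_core::{Device, IndexOp, Tensor};

fn main() -> candle_core::Result<()> {
    let (batch, seq, hidden) = (2, 3, 4);
    let embeddings = Tensor::arange(0f32, (batch * seq * hidden) as f32, &Device::Cpu)?
        .reshape((batch, seq, hidden))?;

    // Direct path: first token of each sequence.
    let direct = embeddings.i((.., 0))?;

    // Metal workaround: flatten, then pull row i * seq for each batch element.
    let reshaped = embeddings.reshape((batch * seq, hidden))?;
    let rows: Vec<Tensor> = (0..batch)
        .map(|i| reshaped.narrow(0, i * seq, 1))
        .collect::<candle_core::Result<_>>()?;
    let workaround = Tensor::cat(&rows, 0)?;

    assert_eq!(direct.to_vec2::<f32>()?, workaround.to_vec2::<f32>()?);
    Ok(())
}
```

Factoring this into one helper removes the two near-identical inline copies that the next two hunks delete from the "pooler" and "cls" pooling branches.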
@@ -131,37 +161,7 @@ impl Reranker {
         let pooled_embeddings = match pooling_method.as_str() {
             "pooler" => {
                 // Extract [CLS] token and apply pooler (dense + tanh)
-
-                let cls_embeddings = if self.device.is_metal() {
-                    // Metal has issues with tensor indexing, use a different approach
-                    let (batch_size, _seq_len, hidden_size) = embeddings.dims3()
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to get dims: {}", e)))?;
-
-                    // Reshape to [batch * seq_len, hidden] then take first hidden vectors for each batch
-                    let reshaped = embeddings.reshape((batch_size * _seq_len, hidden_size))
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to reshape: {}", e)))?;
-
-                    // Extract CLS tokens (first token of each sequence)
-                    let mut cls_vecs = Vec::new();
-                    for i in 0..batch_size {
-                        let start_idx = i * _seq_len;
-                        let cls_vec = reshaped.narrow(0, start_idx, 1)
-                            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to extract CLS: {}", e)))?;
-                        cls_vecs.push(cls_vec);
-                    }
-
-                    // Stack the CLS vectors
-                    Tensor::cat(&cls_vecs, 0)
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to cat CLS tokens: {}", e)))?
-                        .contiguous()
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to make contiguous: {}", e)))?
-                } else {
-                    embeddings.i((.., 0))
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to extract CLS token: {}", e)))?
-                };
-                // Ensure tensor is contiguous before linear layer
-                let cls_embeddings = cls_embeddings.contiguous()
-                    .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to make cls_embeddings contiguous: {}", e)))?;
+                let cls_embeddings = self.extract_cls_embeddings(&embeddings)?;
                 let pooled = self.pooler.forward(&cls_embeddings)
                     .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Pooler forward failed: {}", e)))?;
                 pooled.tanh()
@@ -169,34 +169,7 @@ impl Reranker {
             },
             "cls" => {
                 // Just use the [CLS] token embeddings directly (no pooler layer)
-
-                let cls_embeddings = if self.device.is_metal() {
-                    // Use same approach as pooler method
-                    let (batch_size, _seq_len, hidden_size) = embeddings.dims3()
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to get dims: {}", e)))?;
-
-                    let reshaped = embeddings.reshape((batch_size * _seq_len, hidden_size))
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to reshape: {}", e)))?;
-
-                    let mut cls_vecs = Vec::new();
-                    for i in 0..batch_size {
-                        let start_idx = i * _seq_len;
-                        let cls_vec = reshaped.narrow(0, start_idx, 1)
-                            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to extract CLS: {}", e)))?;
-                        cls_vecs.push(cls_vec);
-                    }
-
-                    Tensor::cat(&cls_vecs, 0)
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to cat CLS tokens: {}", e)))?
-                        .contiguous()
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to make contiguous: {}", e)))?
-                } else {
-                    embeddings.i((.., 0))
-                        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to extract CLS token: {}", e)))?
-                };
-                // Ensure contiguous for classifier
-                cls_embeddings.contiguous()
-                    .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to make CLS embeddings contiguous: {}", e)))?
+                self.extract_cls_embeddings(&embeddings)?
             },
             "mean" => {
                 // Mean pooling across all tokens