red-candle 1.2.3 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +460 -379
- data/README.md +1 -1
- data/ext/candle/Cargo.toml +3 -3
- data/ext/candle/src/llm/constrained_generation_test.rs +79 -0
- data/ext/candle/src/llm/gemma.rs +24 -9
- data/ext/candle/src/llm/llama.rs +46 -10
- data/ext/candle/src/llm/mistral.rs +46 -10
- data/ext/candle/src/llm/phi.rs +76 -8
- data/ext/candle/src/llm/qwen.rs +23 -10
- data/ext/candle/src/llm/text_generation.rs +40 -50
- data/ext/candle/src/ruby/llm.rs +62 -29
- data/ext/candle/src/ruby/structured.rs +54 -10
- data/lib/candle/llm.rb +77 -3
- data/lib/candle/version.rb +1 -1
- metadata +11 -13
- data/ext/candle/target/release/build/bindgen-0f89ba23b9ca1395/out/host-target.txt +0 -1
- data/ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/common.rs +0 -355
- data/ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/dynamic.rs +0 -276
- data/ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/macros.rs +0 -49
- data/ext/candle/target/release/build/pulp-1b95cfe377eede97/out/x86_64_asm.rs +0 -2748
- data/ext/candle/target/release/build/rb-sys-f8ac4edc30ab3e53/out/bindings-0.9.116-mri-arm64-darwin24-3.3.0.rs +0 -8902
data/ext/candle/src/llm/text_generation.rs
CHANGED

@@ -148,47 +148,28 @@ impl TextGeneration {
         if let (Some(ref constraint_index), Some(current_state)) = (&self.constraint, self.constraint_state) {
             // Get the next state
             let next_state = constraint_index.next_state(&current_state, &next_token);
-
+
             // Check if we're transitioning to a state with no allowed tokens (completion)
             if !self.constraint_completed && self.tokens.len() > self.tokens_since_constraint_start {
-                // Check if
-                // This happens when the pattern is complete and the FSM allows "anything"
-
-                let current_constrained = if let Some(allowed) = constraint_index.allowed_tokens(&current_state) {
-                    // Consider it constrained if we have a limited set of allowed tokens
-                    allowed.len() < 1000 // Arbitrary threshold for "constrained"
-                } else {
-                    true // No tokens allowed is definitely constrained
-                };
-
-                let next_constrained = if let Some(next_state_val) = next_state {
-                    if let Some(allowed) = constraint_index.allowed_tokens(&next_state_val) {
-                        allowed.is_empty() || allowed.len() < 1000
-                    } else {
-                        true
-                    }
-                } else {
-                    true
-                };
-
-                // If we're transitioning from constrained to unconstrained, we've completed the pattern
-                if current_constrained && !next_constrained {
-                    self.constraint_completed = true;
-                }
-
-                // Also check if next state has no allowed tokens at all
+                // Check if next state has no allowed tokens at all - this is definitive completion
                 if let Some(next_state_val) = next_state {
                     if let Some(allowed) = constraint_index.allowed_tokens(&next_state_val) {
                         if allowed.is_empty() {
                             self.constraint_completed = true;
                         }
+                        // Only mark as complete if ONLY EOS is allowed (not just if EOS is one of many options)
+                        else if let Some(eos) = self.eos_token_id {
+                            if allowed.len() == 1 && allowed.contains(&eos) {
+                                self.constraint_completed = true;
+                            }
+                        }
                     } else {
                         // None means no tokens allowed - constraint is complete
                         self.constraint_completed = true;
                     }
                 }
             }
-
+
             self.constraint_state = next_state;
         }
 
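The net effect of this hunk: the old "fewer than 1000 allowed tokens" heuristic is removed, and completion is now signaled only by definitive FSM states. A minimal Ruby sketch of the new decision rule (names are illustrative, not the gem's API; `allowed` is the token set the FSM permits from the next state, `eos` the EOS token ID, and `nil` stands in for the Rust `None` case):

    # Returns true only on the two definitive completion signals.
    def constraint_completed?(allowed, eos)
      return true if allowed.nil?    # no reachable tokens at all: pattern finished
      return true if allowed.empty?  # empty allowed set: definitive completion
      allowed.length == 1 && allowed.include?(eos)  # only EOS remains
    end

    constraint_completed?([42], 42)     # => true  (only EOS is allowed)
    constraint_completed?([42, 7], 42)  # => false (EOS is one of many options)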
@@ -201,22 +182,22 @@ impl TextGeneration {
         if self.constraint_completed {
             return true;
         }
-
+
         // Also check the current state
         if let (Some(ref constraint_index), Some(state)) = (&self.constraint, self.constraint_state) {
-            // Check if the constraint has reached a state where it
-            // This happens when
-            // 1. We have no more allowed tokens (constraint fully satisfied)
-            // 2. The EOS token is in the allowed tokens (optional ending)
+            // Check if the constraint has reached a state where it MUST end
+            // This happens when there are no more allowed tokens (constraint fully satisfied)
             if let Some(allowed) = constraint_index.allowed_tokens(&state) {
                 // If no tokens are allowed, the constraint is fully satisfied
                 if allowed.is_empty() {
                     return true;
                 }
-
-                //
+
+                // For JSON schemas, check if ONLY the EOS token is allowed
+                // This means we've generated a complete, valid JSON structure
+                // Don't treat EOS as a satisfaction signal if other tokens are also allowed
                 if let Some(eos) = self.eos_token_id {
-                    if allowed.contains(&eos) {
+                    if allowed.len() == 1 && allowed.contains(&eos) {
                         return true;
                     }
                 }
@@ -229,28 +210,37 @@ impl TextGeneration {
     }
 
     /// Check if the constraint is satisfied when stop_on_match is true
+    /// NOTE: For JSON schemas, this should only return true when the JSON structure is complete,
+    /// not just because we're in a state with many allowed tokens (like inside a string).
     pub fn is_constraint_satisfied_stop_on_match(&self) -> bool {
         // When stop_on_match is true, we stop as soon as the constraint is completed
         if self.constraint_completed {
             return true;
         }
-
-        //
-        //
-        //
+
+        // For JSON and other structured outputs, don't use the "large allowed set" heuristic.
+        // Instead, only consider the constraint satisfied when:
+        // 1. There are no allowed tokens (definitive completion)
+        // 2. Only EOS is allowed (completion with optional termination)
         if let (Some(ref constraint_index), Some(state)) = (&self.constraint, self.constraint_state) {
-
-
-            if
-
-
-
+            if let Some(allowed) = constraint_index.allowed_tokens(&state) {
+                // No more tokens allowed - definitely complete
+                if allowed.is_empty() {
+                    return true;
+                }
+
+                // Only EOS is allowed - complete JSON structure
+                if let Some(eos) = self.eos_token_id {
+                    if allowed.len() == 1 && allowed.contains(&eos) {
                         return true;
                     }
                 }
+            } else {
+                // None means no tokens allowed - constraint is complete
+                return true;
             }
         }
-
+
         false
     }
 
@@ -259,13 +249,13 @@ impl TextGeneration {
         if self.tokens.len() >= max_length {
             return true;
         }
-
+
         if let Some(eos) = self.eos_token_id {
             if token == eos {
                 return true;
             }
         }
-
+
         // Check if we've reached a final state in constraint
         // A state is considered final if it has no allowed tokens
         if let (Some(ref constraint_index), Some(state)) = (&self.constraint, self.constraint_state) {
@@ -278,7 +268,7 @@ impl TextGeneration {
                 return true;
             }
         }
-
+
         false
     }
 
data/ext/candle/src/ruby/llm.rs
CHANGED

@@ -257,14 +257,15 @@ impl LLM {
         let model_lower = model_id.to_lowercase();
         let is_quantized = model_lower.contains("gguf") || model_lower.contains("-q4") || model_lower.contains("-q5") || model_lower.contains("-q8");
 
+        // Extract tokenizer source if provided in model_id (for both GGUF and regular models)
+        let (model_id_clean, tokenizer_source) = if let Some(pos) = model_id.find("@@") {
+            let (id, _tok) = model_id.split_at(pos);
+            (id.to_string(), Some(&model_id[pos+2..]))
+        } else {
+            (model_id.clone(), None)
+        };
+
         let model = if is_quantized {
-            // Extract tokenizer source if provided in model_id
-            let (model_id_clean, tokenizer_source) = if let Some(pos) = model_id.find("@@") {
-                let (id, _tok) = model_id.split_at(pos);
-                (id.to_string(), Some(&model_id[pos+2..]))
-            } else {
-                (model_id.clone(), None)
-            };
 
             // Use unified GGUF loader for all quantized models
             let gguf_model = rt.block_on(async {
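Note the "@@" convention in model_id: everything before "@@" names the model repo, everything after names an optional tokenizer source. Hoisting the split above the is_quantized branch makes the override available to non-quantized models too. A quick Ruby illustration of the same split (the repo IDs are invented for the example):

    model_id = "TheBloke/Mistral-7B-v0.1-GGUF@@mistralai/Mistral-7B-v0.1"
    base, tokenizer = model_id.split("@@", 2)
    base       # => "TheBloke/Mistral-7B-v0.1-GGUF"
    tokenizer  # => "mistralai/Mistral-7B-v0.1" (nil when no "@@" is present)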
@@ -273,41 +274,73 @@ impl LLM {
                 .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load GGUF model: {}", e)))?;
             ModelType::QuantizedGGUF(gguf_model)
         } else {
-            // Load non-quantized models
-
-
-
-
+            // Load non-quantized models based on type
+            let model_lower_clean = model_id_clean.to_lowercase();
+
+            if model_lower_clean.contains("mistral") {
+                let mistral = if tokenizer_source.is_some() {
+                    rt.block_on(async {
+                        RustMistral::from_pretrained_with_tokenizer(&model_id_clean, candle_device, tokenizer_source).await
+                    })
+                } else {
+                    rt.block_on(async {
+                        RustMistral::from_pretrained(&model_id_clean, candle_device).await
+                    })
+                }
                 .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
                 ModelType::Mistral(mistral)
-            } else if
-                let llama =
-
-
+            } else if model_lower_clean.contains("llama") || model_lower_clean.contains("meta-llama") || model_lower_clean.contains("tinyllama") {
+                let llama = if tokenizer_source.is_some() {
+                    rt.block_on(async {
+                        RustLlama::from_pretrained_with_tokenizer(&model_id_clean, candle_device, tokenizer_source).await
+                    })
+                } else {
+                    rt.block_on(async {
+                        RustLlama::from_pretrained(&model_id_clean, candle_device).await
+                    })
+                }
                 .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
                 ModelType::Llama(llama)
-            } else if
-                let gemma =
-
-
+            } else if model_lower_clean.contains("gemma") || model_lower_clean.contains("google/gemma") {
+                let gemma = if tokenizer_source.is_some() {
+                    rt.block_on(async {
+                        RustGemma::from_pretrained_with_tokenizer(&model_id_clean, candle_device, tokenizer_source).await
+                    })
+                } else {
+                    rt.block_on(async {
+                        RustGemma::from_pretrained(&model_id_clean, candle_device).await
+                    })
+                }
                 .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
                 ModelType::Gemma(gemma)
-            } else if
-                let qwen =
-
-
+            } else if model_lower_clean.contains("qwen") {
+                let qwen = if tokenizer_source.is_some() {
+                    rt.block_on(async {
+                        RustQwen::from_pretrained_with_tokenizer(&model_id_clean, candle_device, tokenizer_source).await
+                    })
+                } else {
+                    rt.block_on(async {
+                        RustQwen::from_pretrained(&model_id_clean, candle_device).await
+                    })
+                }
                .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
                ModelType::Qwen(qwen)
-            } else if
-                let phi =
-
-
+            } else if model_lower_clean.contains("phi") {
+                let phi = if tokenizer_source.is_some() {
+                    rt.block_on(async {
+                        RustPhi::from_pretrained_with_tokenizer(&model_id_clean, candle_device, tokenizer_source).await
+                    })
+                } else {
+                    rt.block_on(async {
+                        RustPhi::from_pretrained(&model_id_clean, candle_device).await
+                    })
+                }
                .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
                ModelType::Phi(phi)
            } else {
                return Err(Error::new(
                    magnus::exception::runtime_error(),
-                    format!("Unsupported model type: {}. Currently Mistral, Llama, Gemma, Qwen, and Phi models are supported.",
+                    format!("Unsupported model type: {}. Currently Mistral, Llama, Gemma, Qwen, and Phi models are supported.", model_id_clean),
                ));
            }
        };
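The loader above dispatches on substrings of the lowercased, cleaned model ID, in the order shown. An illustrative Ruby sketch of that matching order (not the gem's API; the model ID is an example):

    family = case "TinyLlama/TinyLlama-1.1B-Chat-v1.0".downcase
             when /mistral/                    then :mistral
             when /llama|meta-llama|tinyllama/ then :llama
             when /gemma/                      then :gemma
             when /qwen/                       then :qwen
             when /phi/                        then :phi
             else :unsupported
             end
    family # => :llama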
data/ext/candle/src/ruby/structured.rs
CHANGED

@@ -1,7 +1,7 @@
 use magnus::{Error, Module, RModule, function, Object};
 use std::sync::Arc;
 
-use crate::structured::{SchemaProcessor, VocabularyAdapter, Index};
+use crate::structured::{SchemaProcessor, VocabularyAdapter, Index, Vocabulary};
 use crate::ruby::{Result, tokenizer::Tokenizer};
 
 /// Ruby wrapper for structured generation constraints

@@ -12,36 +12,80 @@ pub struct StructuredConstraint {
 }
 
 impl StructuredConstraint {
-    /// Create a constraint from a JSON schema
+    /// Create a constraint from a JSON schema using a model ID
+    /// This uses Vocabulary::from_pretrained which handles tokenizer byte encoding correctly
+    pub fn from_schema_with_model(schema: String, model_id: String) -> Result<Self> {
+        // Use tokio runtime for async vocabulary loading
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to create runtime: {}", e)))?;
+
+        let vocabulary = rt.block_on(async {
+            Vocabulary::from_pretrained(&model_id, None)
+        })
+        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to create vocabulary from model '{}': {:?}", model_id, e)))?;
+
+        let processor = SchemaProcessor::new();
+        let index = processor.process_schema(&schema, &vocabulary)
+            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to process schema: {}", e)))?;
+
+        Ok(Self { index })
+    }
+
+    /// Create a constraint from a regex pattern using a model ID
+    pub fn from_regex_with_model(pattern: String, model_id: String) -> Result<Self> {
+        // Use tokio runtime for async vocabulary loading
+        let rt = tokio::runtime::Runtime::new()
+            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to create runtime: {}", e)))?;
+
+        let vocabulary = rt.block_on(async {
+            Vocabulary::from_pretrained(&model_id, None)
+        })
+        .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to create vocabulary from model '{}': {:?}", model_id, e)))?;
+
+        let processor = SchemaProcessor::new();
+        let index = processor.process_regex(&pattern, &vocabulary)
+            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to process regex: {}", e)))?;
+
+        Ok(Self { index })
+    }
+
+    /// Create a constraint from a JSON schema (legacy method using tokenizer directly)
+    /// Note: This may not handle all tokenizer byte encodings correctly
     pub fn from_schema(schema: String, tokenizer: &Tokenizer) -> Result<Self> {
         let vocabulary = VocabularyAdapter::from_tokenizer(&tokenizer.0)
             .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to create vocabulary: {}", e)))?;
-
+
         let processor = SchemaProcessor::new();
         let index = processor.process_schema(&schema, &vocabulary)
             .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to process schema: {}", e)))?;
-
+
         Ok(Self { index })
     }
-
-    /// Create a constraint from a regex pattern
+
+    /// Create a constraint from a regex pattern (legacy method using tokenizer directly)
+    /// Note: This may not handle all tokenizer byte encodings correctly
     pub fn from_regex(pattern: String, tokenizer: &Tokenizer) -> Result<Self> {
         let vocabulary = VocabularyAdapter::from_tokenizer(&tokenizer.0)
             .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to create vocabulary: {}", e)))?;
-
+
         let processor = SchemaProcessor::new();
         let index = processor.process_regex(&pattern, &vocabulary)
            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to process regex: {}", e)))?;
-
+
        Ok(Self { index })
    }
 }
 
 pub fn init_structured(rb_candle: RModule) -> Result<()> {
     let class = rb_candle.define_class("StructuredConstraint", magnus::class::object())?;
-
+
+    // New methods using model_id for proper vocabulary loading
+    class.define_singleton_method("from_schema_with_model", function!(StructuredConstraint::from_schema_with_model, 2))?;
+    class.define_singleton_method("from_regex_with_model", function!(StructuredConstraint::from_regex_with_model, 2))?;
+
+    // Legacy methods using tokenizer directly (may have byte encoding issues with some models)
     class.define_singleton_method("from_schema", function!(StructuredConstraint::from_schema, 2))?;
     class.define_singleton_method("from_regex", function!(StructuredConstraint::from_regex, 2))?;
-
+
     Ok(())
 }
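A usage sketch for the new binding, assuming the class is exposed as Candle::StructuredConstraint under the Candle module (the schema and model ID are illustrative):

    require "json"

    schema = JSON.generate({
      type: "object",
      properties: { name: { type: "string" } },
      required: ["name"]
    })
    # Builds the FSM index from the model's own vocabulary:
    constraint = Candle::StructuredConstraint.from_schema_with_model(
      schema, "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    )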
data/lib/candle/llm.rb
CHANGED

@@ -32,16 +32,90 @@ module Candle
       end
     end
 
     # Create a structured constraint from a JSON schema
+    # Uses the model's vocabulary with proper byte encoding handling
     def constraint_from_schema(schema)
       schema_str = schema.is_a?(String) ? schema : JSON.generate(schema)
-
+
+      # Extract the tokenizer source model ID for proper vocabulary loading
+      tokenizer_model = tokenizer_source_model
+      if tokenizer_model
+        begin
+          StructuredConstraint.from_schema_with_model(schema_str, tokenizer_model)
+        rescue RuntimeError => e
+          # Fall back to legacy method if from_pretrained fails
+          # (e.g., tokenizer doesn't have EOS token in expected format)
+          if e.message.include?("UnsupportedTokenizer")
+            StructuredConstraint.from_schema(schema_str, tokenizer)
+          else
+            raise
+          end
+        end
+      else
+        # Fall back to legacy method if we can't determine the model
+        StructuredConstraint.from_schema(schema_str, tokenizer)
+      end
     end
-
+
     # Create a structured constraint from a regex pattern
+    # Uses the model's vocabulary with proper byte encoding handling
     def constraint_from_regex(pattern)
       pattern_str = pattern.is_a?(Regexp) ? pattern.source : pattern.to_s
-
+
+      # Extract the tokenizer source model ID for proper vocabulary loading
+      tokenizer_model = tokenizer_source_model
+      if tokenizer_model
+        begin
+          StructuredConstraint.from_regex_with_model(pattern_str, tokenizer_model)
+        rescue RuntimeError => e
+          # Fall back to legacy method if from_pretrained fails
+          if e.message.include?("UnsupportedTokenizer")
+            StructuredConstraint.from_regex(pattern_str, tokenizer)
+          else
+            raise
+          end
+        end
+      else
+        # Fall back to legacy method if we can't determine the model
+        StructuredConstraint.from_regex(pattern_str, tokenizer)
+      end
     end
+
+    private
+
+    # Get the model ID to use for vocabulary loading
+    # This handles GGUF models by extracting the tokenizer source
+    def tokenizer_source_model
+      opts = options rescue {}
+
+      # For GGUF models, use the tokenizer source if available
+      if opts["tokenizer_source"]
+        return opts["tokenizer_source"]
+      end
+
+      # For regular models, use the base model ID
+      if opts["base_model"]
+        return opts["base_model"]
+      end
+
+      # Try model_id but strip GGUF parts
+      model = opts["model_id"] || (model_id rescue nil)
+      return nil unless model
+
+      # Remove GGUF file suffix if present
+      if model.include?("@")
+        model = model.split("@").first
+      end
+
+      # For GGUF repos, try to guess the tokenizer source
+      if model.downcase.include?("gguf")
+        guessed = self.class.guess_tokenizer(model)
+        return guessed if guessed && guessed != model
+      end
+
+      model
+    end
+
+    public
 
     # Generate with regex constraint
     def generate_regex(prompt, pattern:, stop_on_match: true, **options)
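An end-to-end sketch of the Ruby-side fallback path (the model ID is illustrative, and from_pretrained is assumed to be the gem's existing loader; generate_regex and constraint_from_schema are the methods shown in this diff):

    llm = Candle::LLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    # Tries the vocabulary-based constraint first, then silently falls back
    # to the tokenizer-based one on an UnsupportedTokenizer error:
    constraint = llm.constraint_from_schema({ type: "string" })

    # Regex-constrained generation; stops once the pattern is complete:
    llm.generate_regex("Call me at ", pattern: '\d{3}-\d{4}', stop_on_match: true)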
data/lib/candle/version.rb
CHANGED
metadata
CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: red-candle
 version: !ruby/object:Gem::Version
-  version: 1.2.3
+  version: 1.3.1
 platform: ruby
 authors:
 - Christopher Petersen

@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-
+date: 2025-12-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys

@@ -151,7 +151,9 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '3.13'
-description:
+description: Ruby gem for running state-of-the-art language models locally. Access
+  LLMs, embeddings, rerankers, and NER models directly from Ruby using Rust-powered
+  Candle with Metal/CUDA acceleration.
 email:
 - chris@petersen.io
 - 2xijok@gmail.com

@@ -204,12 +206,6 @@ files:
 - ext/candle/src/structured/vocabulary_adapter_simple_test.rs
 - ext/candle/src/tokenizer/loader.rs
 - ext/candle/src/tokenizer/mod.rs
-- ext/candle/target/release/build/bindgen-0f89ba23b9ca1395/out/host-target.txt
-- ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/common.rs
-- ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/dynamic.rs
-- ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/macros.rs
-- ext/candle/target/release/build/pulp-1b95cfe377eede97/out/x86_64_asm.rs
-- ext/candle/target/release/build/rb-sys-f8ac4edc30ab3e53/out/bindings-0.9.116-mri-arm64-darwin24-3.3.0.rs
 - ext/candle/tests/device_tests.rs
 - ext/candle/tests/tensor_tests.rs
 - lib/candle.rb

@@ -237,16 +233,18 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: 3.
+      version: 3.1.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: 3.3
+      version: '3.3'
 requirements:
 - Rust >= 1.85
-rubygems_version: 3.
+rubygems_version: 3.3.3
 signing_key:
 specification_version: 4
-summary:
+summary: Ruby gem for running state-of-the-art language models locally. Access LLMs,
+  embeddings, rerankers, and NER models directly from Ruby using Rust-powered Candle
+  with Metal/CUDA acceleration.
 test_files: []
data/ext/candle/target/release/build/bindgen-0f89ba23b9ca1395/out/host-target.txt
DELETED

@@ -1 +0,0 @@
-aarch64-apple-darwin