red-candle 1.0.0.pre.6 → 1.0.0.pre.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
  use magnus::{function, method, prelude::*, Error, Module, RArray, RHash, RModule, Ruby, TryConvert, Value};
  use std::cell::RefCell;

- use crate::llm::{GenerationConfig as RustGenerationConfig, TextGenerator, mistral::Mistral as RustMistral, llama::Llama as RustLlama, gemma::Gemma as RustGemma};
+ use crate::llm::{GenerationConfig as RustGenerationConfig, TextGenerator, mistral::Mistral as RustMistral, llama::Llama as RustLlama, gemma::Gemma as RustGemma, QuantizedGGUF as RustQuantizedGGUF};
  use crate::ruby::{Result as RbResult, Device as RbDevice};

  // Use an enum to handle different model types instead of trait objects
@@ -10,6 +10,7 @@ enum ModelType {
      Mistral(RustMistral),
      Llama(RustLlama),
      Gemma(RustGemma),
+     QuantizedGGUF(RustQuantizedGGUF),
  }

  impl ModelType {
@@ -18,6 +19,7 @@ impl ModelType {
              ModelType::Mistral(m) => m.generate(prompt, config),
              ModelType::Llama(m) => m.generate(prompt, config),
              ModelType::Gemma(m) => m.generate(prompt, config),
+             ModelType::QuantizedGGUF(m) => m.generate(prompt, config),
          }
      }

@@ -31,15 +33,7 @@ impl ModelType {
              ModelType::Mistral(m) => m.generate_stream(prompt, config, callback),
              ModelType::Llama(m) => m.generate_stream(prompt, config, callback),
              ModelType::Gemma(m) => m.generate_stream(prompt, config, callback),
-         }
-     }
-
-     #[allow(dead_code)]
-     fn model_name(&self) -> &str {
-         match self {
-             ModelType::Mistral(m) => m.model_name(),
-             ModelType::Llama(m) => m.model_name(),
-             ModelType::Gemma(m) => m.model_name(),
+             ModelType::QuantizedGGUF(m) => m.generate_stream(prompt, config, callback),
          }
      }

@@ -48,6 +42,7 @@ impl ModelType {
              ModelType::Mistral(m) => m.clear_cache(),
              ModelType::Llama(m) => m.clear_cache(),
              ModelType::Gemma(m) => m.clear_cache(),
+             ModelType::QuantizedGGUF(m) => m.clear_cache(),
          }
      }

@@ -72,6 +67,7 @@ impl ModelType {
              },
              ModelType::Llama(m) => m.apply_chat_template(messages),
              ModelType::Gemma(m) => m.apply_chat_template(messages),
+             ModelType::QuantizedGGUF(m) => m.apply_chat_template(messages),
          }
      }
  }
@@ -144,6 +140,12 @@ impl GenerationConfig {
              }
          }

+         if let Some(value) = kwargs.get(magnus::Symbol::new("debug_tokens")) {
+             if let Ok(v) = TryConvert::try_convert(value) {
+                 config.debug_tokens = v;
+             }
+         }
+
          Ok(Self { inner: config })
      }

@@ -185,6 +187,10 @@ impl GenerationConfig {
      pub fn include_prompt(&self) -> bool {
          self.inner.include_prompt
      }
+
+     pub fn debug_tokens(&self) -> bool {
+         self.inner.debug_tokens
+     }
  }

  #[derive(Clone, Debug)]
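Note: combined with the kwargs handling above, debug_tokens becomes an ordinary GenerationConfig option on the Ruby side. A minimal usage sketch, assuming GenerationConfig.new accepts keyword arguments as the kwargs parsing suggests (how the config is then threaded into generate is outside this diff):

    # Sketch only: constructor kwargs inferred from the kwargs.get calls above.
    config = Candle::GenerationConfig.new(debug_tokens: true)
    config.debug_tokens  # => true, via the reader registered in init_llm below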
@@ -206,31 +212,51 @@ impl LLM {
          let rt = tokio::runtime::Runtime::new()
              .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to create runtime: {}", e)))?;

-         // Determine model type from ID and load appropriately
+         // Determine model type from ID and whether it's quantized
          let model_lower = model_id.to_lowercase();
-         let model = if model_lower.contains("mistral") {
-             let mistral = rt.block_on(async {
-                 RustMistral::from_pretrained(&model_id, candle_device).await
-             })
-             .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
-             ModelType::Mistral(mistral)
-         } else if model_lower.contains("llama") || model_lower.contains("meta-llama") || model_lower.contains("tinyllama") {
-             let llama = rt.block_on(async {
-                 RustLlama::from_pretrained(&model_id, candle_device).await
-             })
-             .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
-             ModelType::Llama(llama)
-         } else if model_lower.contains("gemma") || model_lower.contains("google/gemma") {
-             let gemma = rt.block_on(async {
-                 RustGemma::from_pretrained(&model_id, candle_device).await
+         let is_quantized = model_lower.contains("gguf") || model_lower.contains("-q4") || model_lower.contains("-q5") || model_lower.contains("-q8");
+
+         let model = if is_quantized {
+             // Extract tokenizer source if provided in model_id
+             let (model_id_clean, tokenizer_source) = if let Some(pos) = model_id.find("@@") {
+                 let (id, _tok) = model_id.split_at(pos);
+                 (id.to_string(), Some(&model_id[pos+2..]))
+             } else {
+                 (model_id.clone(), None)
+             };
+
+             // Use unified GGUF loader for all quantized models
+             let gguf_model = rt.block_on(async {
+                 RustQuantizedGGUF::from_pretrained(&model_id_clean, candle_device, tokenizer_source).await
              })
-             .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
-             ModelType::Gemma(gemma)
+             .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load GGUF model: {}", e)))?;
+             ModelType::QuantizedGGUF(gguf_model)
          } else {
-             return Err(Error::new(
-                 magnus::exception::runtime_error(),
-                 format!("Unsupported model type: {}. Currently Mistral, Llama, and Gemma models are supported.", model_id),
-             ));
+             // Load non-quantized models
+             if model_lower.contains("mistral") {
+                 let mistral = rt.block_on(async {
+                     RustMistral::from_pretrained(&model_id, candle_device).await
+                 })
+                 .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
+                 ModelType::Mistral(mistral)
+             } else if model_lower.contains("llama") || model_lower.contains("meta-llama") || model_lower.contains("tinyllama") {
+                 let llama = rt.block_on(async {
+                     RustLlama::from_pretrained(&model_id, candle_device).await
+                 })
+                 .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
+                 ModelType::Llama(llama)
+             } else if model_lower.contains("gemma") || model_lower.contains("google/gemma") {
+                 let gemma = rt.block_on(async {
+                     RustGemma::from_pretrained(&model_id, candle_device).await
+                 })
+                 .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
+                 ModelType::Gemma(gemma)
+             } else {
+                 return Err(Error::new(
+                     magnus::exception::runtime_error(),
+                     format!("Unsupported model type: {}. Currently Mistral, Llama, and Gemma models are supported.", model_id),
+                 ));
+             }
          };

          Ok(Self {
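Note: a model id is routed to the quantized loader when its lowercased form contains "gguf", "-q4", "-q5", or "-q8", and anything after a literal "@@" is split off and forwarded to RustQuantizedGGUF::from_pretrained as the tokenizer source. The Ruby wrapper in data/lib/candle/llm.rb (see the hunks below) assembles that string; a sketch of the internal format, with an illustrative GGUF filename rather than one taken from this diff:

    # "<repo>[@<gguf_file>][@@<tokenizer_repo>]" -- the single-"@" gguf_file part is
    # presumably parsed inside quantized_gguf.rs, which is not shown in this diff.
    "TheBloke/Llama-2-7B-Chat-GGUF@llama-2-7b-chat.Q4_K_M.gguf@@meta-llama/Llama-2-7b-chat-hf"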
@@ -246,7 +272,10 @@ impl LLM {
              .map(|c| c.inner.clone())
              .unwrap_or_default();

-         let model = self.model.lock().unwrap();
+         let model = match self.model.lock() {
+             Ok(guard) => guard,
+             Err(poisoned) => poisoned.into_inner(),
+         };
          let mut model_ref = model.borrow_mut();

          model_ref.generate(&prompt, &config)
@@ -266,7 +295,10 @@ impl LLM {
          }
          let block = block.unwrap();

-         let model = self.model.lock().unwrap();
+         let model = match self.model.lock() {
+             Ok(guard) => guard,
+             Err(poisoned) => poisoned.into_inner(),
+         };
          let mut model_ref = model.borrow_mut();

          let result = model_ref.generate_stream(&prompt, &config, |token| {
@@ -289,7 +321,14 @@ impl LLM {

      /// Clear the model's cache (e.g., KV cache for transformers)
      pub fn clear_cache(&self) -> RbResult<()> {
-         let model = self.model.lock().unwrap();
+         let model = match self.model.lock() {
+             Ok(guard) => guard,
+             Err(poisoned) => {
+                 // If the mutex is poisoned, we can still recover the data
+                 // This happens when another thread panicked while holding the lock
+                 poisoned.into_inner()
+             }
+         };
          let mut model_ref = model.borrow_mut();
          model_ref.clear_cache();
          Ok(())
@@ -323,7 +362,10 @@ impl LLM {
              })
              .collect();

-         let model = self.model.lock().unwrap();
+         let model = match self.model.lock() {
+             Ok(guard) => guard,
+             Err(poisoned) => poisoned.into_inner(),
+         };
          let model_ref = model.borrow();

          model_ref.apply_chat_template(&json_messages)
@@ -363,6 +405,7 @@ pub fn init_llm(rb_candle: RModule) -> RbResult<()> {
      rb_generation_config.define_method("seed", method!(GenerationConfig::seed, 0))?;
      rb_generation_config.define_method("stop_sequences", method!(GenerationConfig::stop_sequences, 0))?;
      rb_generation_config.define_method("include_prompt", method!(GenerationConfig::include_prompt, 0))?;
+     rb_generation_config.define_method("debug_tokens", method!(GenerationConfig::debug_tokens, 0))?;

      let rb_llm = rb_candle.define_class("LLM", magnus::class::object())?;
      rb_llm.define_singleton_method("_from_pretrained", function!(from_pretrained_wrapper, -1))?;
data/lib/candle/llm.rb CHANGED
@@ -1,5 +1,65 @@
  module Candle
    class LLM
+     # Tokenizer registry for automatic detection
+     TOKENIZER_REGISTRY = {
+       # Exact model matches
+       "TheBloke/Mistral-7B-Instruct-v0.2-GGUF" => "mistralai/Mistral-7B-Instruct-v0.2",
+       "TheBloke/Mistral-7B-v0.1-GGUF" => "mistralai/Mistral-7B-v0.1",
+       "TheBloke/Llama-2-7B-Chat-GGUF" => "meta-llama/Llama-2-7b-chat-hf",
+       "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" => "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+
+       # Pattern-based fallbacks (evaluated in order)
+       :patterns => [
+         # Mistral models
+         [/mistral.*?7b.*?instruct.*?v0\.2/i, "mistralai/Mistral-7B-Instruct-v0.2"],
+         [/mistral.*?7b.*?instruct.*?v0\.1/i, "mistralai/Mistral-7B-Instruct-v0.1"],
+         [/mistral.*?7b/i, "mistralai/Mistral-7B-v0.1"],
+
+         # Llama models
+         [/llama.*?3.*?8b/i, "meta-llama/Meta-Llama-3-8B"],
+         [/llama.*?3.*?70b/i, "meta-llama/Meta-Llama-3-70B"],
+         [/llama.*?2.*?7b.*?chat/i, "meta-llama/Llama-2-7b-chat-hf"],
+         [/llama.*?2.*?13b.*?chat/i, "meta-llama/Llama-2-13b-chat-hf"],
+         [/llama.*?2.*?70b.*?chat/i, "meta-llama/Llama-2-70b-chat-hf"],
+         [/tinyllama/i, "TinyLlama/TinyLlama-1.1B-Chat-v1.0"],
+
+         # Gemma models
+         [/gemma.*?2.*?9b/i, "google/gemma-2-9b"],
+         [/gemma.*?2.*?2b/i, "google/gemma-2-2b"],
+         [/gemma.*?7b/i, "google/gemma-7b"],
+         [/gemma.*?2b/i, "google/gemma-2b"]
+       ]
+     }
+
+     # Allow users to register custom tokenizer mappings
+     def self.register_tokenizer(model_pattern, tokenizer_id)
+       if model_pattern.is_a?(String)
+         TOKENIZER_REGISTRY[model_pattern] = tokenizer_id
+       elsif model_pattern.is_a?(Regexp)
+         TOKENIZER_REGISTRY[:patterns] ||= []
+         TOKENIZER_REGISTRY[:patterns].unshift([model_pattern, tokenizer_id])
+       else
+         raise ArgumentError, "model_pattern must be a String or Regexp"
+       end
+     end
+
+     # Guess the tokenizer for a model
+     def self.guess_tokenizer(model_id)
+       # Check exact matches first
+       return TOKENIZER_REGISTRY[model_id] if TOKENIZER_REGISTRY[model_id]
+
+       # Check patterns
+       if patterns = TOKENIZER_REGISTRY[:patterns]
+         patterns.each do |pattern, tokenizer|
+           return tokenizer if model_id.match?(pattern)
+         end
+       end
+
+       # Default: try removing common GGUF suffixes
+       base_model = model_id.gsub(/-gguf|-q\d+_\w+$/i, "")
+       base_model
+     end
+
      # Simple chat interface for instruction models
      def chat(messages, **options)
        prompt = apply_chat_template(messages)
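Note: the registry gives exact repo matches priority over the regex fallbacks, and register_tokenizer prepends custom regex patterns so they are consulted ahead of the built-ins. For example, based on the entries above (the second repo name is illustrative):

    Candle::LLM.guess_tokenizer("TheBloke/Llama-2-7B-Chat-GGUF")
    # => "meta-llama/Llama-2-7b-chat-hf"        (exact-match entry)
    Candle::LLM.guess_tokenizer("SomeOrg/TinyLlama-1.1B-Chat-GGUF")
    # => "TinyLlama/TinyLlama-1.1B-Chat-v1.0"   (pattern fallback)

    # Custom mappings (hypothetical names) win over the built-in patterns:
    Candle::LLM.register_tokenizer(/my-finetune/i, "my-org/my-finetune-tokenizer")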
@@ -28,8 +88,37 @@ module Candle
        end
      end

-     def self.from_pretrained(model_id, device: Candle::Device.cpu)
-       _from_pretrained(model_id, device)
+     def self.from_pretrained(model_id, device: Candle::Device.cpu, gguf_file: nil, tokenizer: nil)
+       model_str = if gguf_file
+         "#{model_id}@#{gguf_file}"
+       else
+         model_id
+       end
+
+       # Handle GGUF models that need tokenizer
+       if model_str.downcase.include?("gguf") && tokenizer.nil?
+         # Try to load without tokenizer first
+         begin
+           _from_pretrained(model_str, device)
+         rescue => e
+           if e.message.include?("No tokenizer found")
+             # Auto-detect tokenizer
+             detected_tokenizer = guess_tokenizer(model_id)
+             warn "No tokenizer found in GGUF repo. Using tokenizer from: #{detected_tokenizer}"
+             model_str = "#{model_str}@@#{detected_tokenizer}"
+             _from_pretrained(model_str, device)
+           else
+             raise e
+           end
+         end
+       elsif tokenizer
+         # User specified tokenizer
+         model_str = "#{model_str}@@#{tokenizer}"
+         _from_pretrained(model_str, device)
+       else
+         # Non-GGUF model or GGUF with embedded tokenizer
+         _from_pretrained(model_str, device)
+       end
      end

      private
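Note: with the expanded signature, GGUF loading is normally a one-liner; the tokenizer is auto-detected via guess_tokenizer only when the native loader raises "No tokenizer found", and passing tokenizer: explicitly skips detection altogether. A usage sketch built from the new keyword arguments and the registry entries above:

    llm = Candle::LLM.from_pretrained(
      "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
      device: Candle::Device.cpu,
      tokenizer: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    )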
@@ -1,3 +1,3 @@
  module Candle
-   VERSION = "1.0.0.pre.6"
+   VERSION = "1.0.0.pre.7"
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: red-candle
  version: !ruby/object:Gem::Version
-   version: 1.0.0.pre.6
+   version: 1.0.0.pre.7
  platform: ruby
  authors:
  - Christopher Petersen
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2025-07-10 00:00:00.000000000 Z
+ date: 2025-07-13 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: rb_sys
@@ -52,6 +52,7 @@ files:
  - ext/candle/src/llm/llama.rs
  - ext/candle/src/llm/mistral.rs
  - ext/candle/src/llm/mod.rs
+ - ext/candle/src/llm/quantized_gguf.rs
  - ext/candle/src/llm/text_generation.rs
  - ext/candle/src/reranker.rs
  - ext/candle/src/ruby/device.rs