red-candle 1.0.0.pre.6 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +4 -4
  2. data/Gemfile +1 -10
  3. data/README.md +481 -4
  4. data/Rakefile +1 -3
  5. data/ext/candle/src/lib.rs +6 -3
  6. data/ext/candle/src/llm/gemma.rs +21 -79
  7. data/ext/candle/src/llm/generation_config.rs +3 -0
  8. data/ext/candle/src/llm/llama.rs +21 -79
  9. data/ext/candle/src/llm/mistral.rs +21 -89
  10. data/ext/candle/src/llm/mod.rs +3 -33
  11. data/ext/candle/src/llm/quantized_gguf.rs +501 -0
  12. data/ext/candle/src/llm/text_generation.rs +0 -4
  13. data/ext/candle/src/ner.rs +423 -0
  14. data/ext/candle/src/reranker.rs +24 -21
  15. data/ext/candle/src/ruby/device.rs +6 -6
  16. data/ext/candle/src/ruby/dtype.rs +4 -4
  17. data/ext/candle/src/ruby/embedding_model.rs +36 -34
  18. data/ext/candle/src/ruby/llm.rs +110 -49
  19. data/ext/candle/src/ruby/mod.rs +1 -2
  20. data/ext/candle/src/ruby/tensor.rs +66 -66
  21. data/ext/candle/src/ruby/tokenizer.rs +269 -0
  22. data/ext/candle/src/ruby/utils.rs +6 -24
  23. data/ext/candle/src/tokenizer/loader.rs +108 -0
  24. data/ext/candle/src/tokenizer/mod.rs +103 -0
  25. data/ext/candle/target/release/build/bindgen-0f89ba23b9ca1395/out/host-target.txt +1 -0
  26. data/ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/common.rs +355 -0
  27. data/ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/dynamic.rs +276 -0
  28. data/ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/macros.rs +49 -0
  29. data/ext/candle/target/release/build/pulp-1b95cfe377eede97/out/x86_64_asm.rs +2748 -0
  30. data/ext/candle/target/release/build/rb-sys-f8ac4edc30ab3e53/out/bindings-0.9.116-mri-arm64-darwin24-3.3.0.rs +8902 -0
  31. data/lib/candle/build_info.rb +2 -0
  32. data/lib/candle/device_utils.rb +2 -0
  33. data/lib/candle/llm.rb +91 -2
  34. data/lib/candle/ner.rb +345 -0
  35. data/lib/candle/reranker.rb +1 -1
  36. data/lib/candle/tensor.rb +2 -0
  37. data/lib/candle/tokenizer.rb +139 -0
  38. data/lib/candle/version.rb +4 -2
  39. data/lib/candle.rb +2 -0
  40. metadata +127 -3
  41. data/ext/candle/src/ruby/qtensor.rs +0 -69
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 07ca4e6eb0b65eac5b62f4b3622ed3189f203279265b7174936ccfd5ff3e5099
- data.tar.gz: f4970f5c4376453cde1ee18b93155f69ca634ccc3e4a359a45b49d7f20379f64
+ metadata.gz: 829a937851c782dfd58b8fb724dc7b08d524d26400047e9f5fc7a5bd0de9cb4b
+ data.tar.gz: e8a9420fc310e977968aa396a47e5e5269d107eb8cb7246ca9e2f980a0a28f4d
  SHA512:
- metadata.gz: 10ed0881ec2f67ab1e798401e857eac638049b254b20460bcb5565cee822b24ce2abe23d0ce00275dcb1d1ddebfd926d47eac7e6d54924937da4356a36211224
- data.tar.gz: d24fa67f74cd62c87ea1666e9488f12e8773d15e2d62b806bd38ca7cb20215d819b0502d352e4d310d1377b9ab64debbdd664148c70fc1ac70f1f2e23e9b516c
+ metadata.gz: 020e23df61d5679612a7892bdbfc7dbcf2d28055df9fc6b9199a8e09933e74d03a0a0fb97d6ddc998f8f8e70e856d63d7eb758638e072cd492734b21681267b7
+ data.tar.gz: 5e10f888c2bd74dfdf01c97ca18f6666440edb000d524784ad0ac0676208d9485b8d708fa72d8fe73672f018c038b3526109153540ce5bbc53a81e4e30deccd1
data/Gemfile CHANGED
@@ -1,12 +1,3 @@
  source "https://rubygems.org"

- gemspec
-
- gem "minitest"
- gem "rake"
- gem "rake-compiler"
-
- gem "yard", require: false
- gem "yard-rustdoc", require: false
-
- gem "redcarpet", "~> 3.6"
+ gemspec
data/README.md CHANGED
@@ -51,6 +51,42 @@ Red-Candle now supports Large Language Models (LLMs) with GPU acceleration!
  - **Llama**: Llama 2 and Llama 3 models (e.g., `TinyLlama/TinyLlama-1.1B-Chat-v1.0`, `meta-llama/Llama-2-7b-hf`, `NousResearch/Llama-2-7b-hf`)
  - **Mistral**: All Mistral models (e.g., `mistralai/Mistral-7B-Instruct-v0.1`)

+ ### Quantized Model Support (GGUF)
+
+ Red-Candle supports quantized models in GGUF format, offering 4-8x memory reduction:
+
+ > **Note on GGUF Support**: Red-Candle now uses a unified GGUF loader that automatically detects the model architecture from the GGUF file. This means all GGUF models (including Mistral models from TheBloke) should now work correctly! The loader automatically selects the appropriate tokenizer based on the model type to ensure proper text generation.
+
+ ```ruby
+ # Load quantized models - always specify the GGUF filename
+ llm = Candle::LLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGUF",
+                                   device: device,
+                                   gguf_file: "llama-2-7b-chat.Q4_K_M.gguf")
+
+ # Register custom tokenizer mappings for your models
+ Candle::LLM.register_tokenizer("my-org/my-model-GGUF", "my-org/my-tokenizer")
+
+ # Popular quantized model sources:
+ # - TheBloke: Extensive collection of GGUF models
+ # - Search HuggingFace for "GGUF" models
+ ```
+
+ **Memory usage comparison (7B models):**
+ - Full precision: ~28 GB
+ - Q8_0 (8-bit): ~7 GB - Best quality, larger size
+ - Q5_K_M (5-bit): ~4.5 GB - Very good quality
+ - Q4_K_M (4-bit): ~4 GB - Recommended default, best balance
+ - Q3_K_M (3-bit): ~3 GB - Good for memory-constrained systems
+
+ **Quantization levels explained:**
+ - **Q8_0**: Almost identical to full model, use when quality is paramount
+ - **Q5_K_M**: Excellent quality with good compression
+ - **Q4_K_M**: Best balance of quality/size/speed (recommended default)
+ - **Q3_K_M**: Noticeable quality reduction but very compact
+ - **Q2_K**: ⚠️ **Not recommended** - Can cause inference errors due to extreme quantization
+
+ > **Warning**: Q2_K quantization can lead to "weight is negative, too large or not a valid number" errors during inference. Use Q3_K_M or higher for stable operation.
+
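To make the tradeoff concrete, here is a minimal sketch of switching levels; the filenames follow TheBloke's `<model>.<LEVEL>.gguf` naming convention, so check the repository's file list for the exact names:

```ruby
# Only the gguf_file string changes when trading memory for quality.
# "Q4_K_M" is the recommended default; "Q5_K_M" or "Q8_0" give higher quality.
quant_level = "Q4_K_M"
llm = Candle::LLM.from_pretrained(
  "TheBloke/Llama-2-7B-Chat-GGUF",
  device: device,
  gguf_file: "llama-2-7b-chat.#{quant_level}.gguf"
)
```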
  > ### ⚠️ Huggingface login warning
  >
  > Many models, including the one below, require you to agree to the terms. You'll need to:
@@ -91,6 +127,8 @@ response = llm.chat(messages)

  ### GPU Acceleration

+ We see an 18x speedup running LLMs under CUDA vs CPU and a >3x speedup running under Metal vs CPU. Details [here](DEVICE_SUPPORT.md#performance-considerations).
+
  ```ruby
  # CPU works for all models
  device = Candle::Device.cpu
@@ -103,9 +141,38 @@ device = Candle::Device.metal
  device = Candle::Device.cuda # Linux/Windows with NVIDIA GPU
  ```

- ## ⚠️ Model Format Requirement: Safetensors Only
+ ### Debugging Token Generation
+
+ For debugging purposes, you can enable raw token output to see both token IDs and their raw representations:
+
+ ```ruby
+ # Enable debug mode to see raw tokens during generation
+ config = Candle::GenerationConfig.balanced(debug_tokens: true)
+
+ # Non-streaming generation with debug tokens
+ result = llm.generate("Hello, world!", config: config)
+ puts result
+ # Output: [15043:Hello][11:,][1917:world][0:!]
+
+ # Streaming generation with debug tokens
+ llm.generate_stream("Hello, world!", config: config) do |text|
+   print text # Will show each token as it's generated: [15043:Hello][11:,][1917:world][0:!]
+ end
+
+ # Works with all models (Llama, Mistral, Gemma, and quantized GGUF models)
+ ```
+
+ This is particularly useful for:
+ - Debugging tokenization issues
+ - Understanding how the model processes text
+ - Troubleshooting generation problems
+ - Analyzing model behavior
+
+ ## ⚠️ Model Format Requirements

- Red-Candle **only supports embedding models that provide their weights in the [safetensors](https://github.com/huggingface/safetensors) format** (i.e., the model repo must contain a `model.safetensors` file). If the model repo does not provide the required file, loading will fail with a clear error. Most official BERT and DistilBERT models do **not** provide safetensors; many Sentence Transformers and JinaBERT models do.
+ ### EmbeddingModels and Rerankers: Safetensors Only
+
+ Red-Candle **only supports embedding models and rerankers that provide their weights in the [safetensors](https://github.com/huggingface/safetensors) format** (i.e., the model repo must contain a `model.safetensors` file). If the model repo does not provide the required file, loading will fail with a clear error. Most official BERT and DistilBERT models do **not** provide safetensors; many Sentence Transformers and JinaBERT models do.

  **If you encounter an error like:**

@@ -115,13 +182,22 @@ RuntimeError: model.safetensors not found after download. Only safetensors model

  this means the selected model is not compatible. Please choose a model repo that provides the required file.

+ ### LLMs: Safetensors and GGUF Support
+
+ LLM models support two formats:
+ 1. **Safetensors format** - Standard HuggingFace models (e.g., `TinyLlama/TinyLlama-1.1B-Chat-v1.0`)
+ 2. **GGUF quantized format** - Memory-efficient quantized models (e.g., `TheBloke/Llama-2-7B-Chat-GGUF`)
+
+ See the [Quantized Model Support](#quantized-model-support-gguf) section for details on using GGUF models.
+
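Both formats load through the same `from_pretrained` call; a minimal sketch combining the examples above (`device` is a `Candle::Device` from the GPU Acceleration section):

```ruby
# 1. Safetensors: pass the HuggingFace repo id
llm = Candle::LLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", device: device)

# 2. GGUF: same call, with gguf_file selecting the quantization
llm = Candle::LLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGUF",
                                  device: device,
                                  gguf_file: "llama-2-7b-chat.Q4_K_M.gguf")
```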
  ## Supported Embedding Models

  Red-Candle supports the following embedding model types from Hugging Face:

  1. `Candle::EmbeddingModelType::JINA_BERT` - Jina BERT models (e.g., `jinaai/jina-embeddings-v2-base-en`) (**safetensors required**)
- 2. `Candle::EmbeddingModelType::STANDARD_BERT` - Standard BERT models (e.g., `sentence-transformers/all-MiniLM-L6-v2`) (**safetensors required**)
+ 2. `Candle::EmbeddingModelType::MINILM` - MiniLM models (e.g., `sentence-transformers/all-MiniLM-L6-v2`) (**safetensors required**)
  3. `Candle::EmbeddingModelType::DISTILBERT` - DistilBERT models (e.g., `distilbert-base-uncased-finetuned-sst-2-english`) (**safetensors required**)
+ 4. `Candle::EmbeddingModelType::STANDARD_BERT` - Standard BERT models (e.g., `scientistcom/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext`) (**safetensors required**)

  > **Note:** Most official BERT and DistilBERT models do _not_ provide safetensors. Please check the model repo before use.

@@ -197,7 +273,7 @@ ranked_results = reranker.rerank(query, documents, pooling_method: "pooler", app
  # Or apply sigmoid activation to get scores between 0 and 1
  sigmoid_results = reranker.rerank(query, documents, pooling_method: "pooler", apply_sigmoid: true)

- # The pooler method is the default and is recommended for cross-encoders, as is apply_sigmod, so the above is the same as:
+ # The pooler method is the default and is recommended for cross-encoders, as is apply_sigmoid, so the above is the same as:
  ranked_results = reranker.rerank(query, documents)

  # Results are returned as an array of hashes, sorted by relevance
@@ -288,6 +364,407 @@ The reranker uses a BERT-based architecture that:

  This joint processing allows cross-encoders to capture subtle semantic relationships between queries and documents, making them more accurate for reranking tasks, though at the cost of higher computational requirements.

+ ## Tokenizer
+
+ Red-Candle provides direct access to tokenizers for text preprocessing and analysis. This is useful for understanding how models process text, debugging issues, and building custom NLP pipelines.
+
+ ### Basic Usage
+
+ ```ruby
+ require 'candle'
+
+ # Load a tokenizer from HuggingFace
+ tokenizer = Candle::Tokenizer.from_pretrained("bert-base-uncased")
+
+ # Encode text to token IDs
+ token_ids = tokenizer.encode("Hello, world!")
+ # => [101, 7592, 1010, 2088, 999, 102]
+
+ # Decode token IDs back to text
+ text = tokenizer.decode(token_ids)
+ # => "hello, world!"
+
+ # Get token strings (subwords) - useful for visualization
+ tokens = tokenizer.encode_to_tokens("Hello, world!")
+ # => ["[CLS]", "hello", ",", "world", "!", "[SEP]"]
+
+ # Get both IDs and tokens together
+ result = tokenizer.encode_with_tokens("preprocessing")
+ # => {"ids" => [101, 3653, 22618, 2527, 102],
+ #     "tokens" => ["[CLS]", "prep", "##ro", "##ces", "##sing", "[SEP]"]}
+ ```
+
+ ### Batch Processing
+
+ ```ruby
+ # Encode multiple texts at once
+ texts = ["Hello world", "How are you?", "Tokenizers are cool"]
+ batch_ids = tokenizer.encode_batch(texts)
+
+ # Get token strings for multiple texts
+ batch_tokens = tokenizer.encode_batch_to_tokens(texts)
+ ```
+
+ ### Vocabulary Access
+
+ ```ruby
+ # Get vocabulary size
+ vocab_size = tokenizer.vocab_size
+ # => 30522
+
+ # Get full vocabulary as a hash
+ vocab = tokenizer.get_vocab
+ # vocab["hello"] => 7592
+
+ # Convert a specific token ID to its string
+ token_str = tokenizer.id_to_token(7592)
+ # => "hello"
+
+ # Get special tokens
+ special = tokenizer.get_special_tokens
+ # => {"cls_token" => 101, "sep_token" => 102, "pad_token" => 0, ...}
+ ```
+
+ ### Configuration
+
+ ```ruby
+ # Create a tokenizer with padding enabled
+ padded_tokenizer = tokenizer.with_padding(length: 128)
+
+ # Create a tokenizer with truncation
+ truncated_tokenizer = tokenizer.with_truncation(512)
+
+ # Configure padding with more options
+ padded_tokenizer = tokenizer.with_padding(
+   length: 128,        # Fixed length padding
+   direction: "right", # Pad on the right (default)
+   pad_token: "[PAD]"  # Padding token
+ )
+ ```
+
+ ### Model Integration
+
+ All models expose their tokenizers:
+
+ ```ruby
+ # From LLM
+ llm = Candle::LLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+ llm_tokenizer = llm.tokenizer
+
+ # From EmbeddingModel
+ embedding_model = Candle::EmbeddingModel.new
+ emb_tokenizer = embedding_model.tokenizer
+
+ # From Reranker
+ reranker = Candle::Reranker.new(model_path: "cross-encoder/ms-marco-MiniLM-L-12-v2")
+ rank_tokenizer = reranker.tokenizer
+ ```
+
+ ### Understanding Subword Tokenization
+
+ Modern tokenizers split unknown or rare words into subword pieces:
+
+ ```ruby
+ # See how words are split into subwords
+ result = tokenizer.encode_with_tokens("unbelievable")
+ # => {"ids" => [101, 4895, 6499, 102],
+ #     "tokens" => ["[CLS]", "un", "##believable", "[SEP]"]}
+
+ # The ## prefix indicates a continuation of the previous token
+ complex = tokenizer.encode_to_tokens("preprocessing tokenization")
+ # => ["[CLS]", "prep", "##ro", "##ces", "##sing", "token", "##ization", "[SEP]"]
+ ```
+
+ ### Use Cases
+
+ - **Token Analysis**: Understand how your text is being processed by models
+ - **Debugging**: See why certain inputs might cause unexpected model behavior
+ - **Custom Preprocessing**: Build your own text processing pipelines
+ - **Educational**: Teach how modern NLP models handle text
+ - **NER Preparation**: Get aligned tokens for named entity recognition tasks
+
+ ## Named Entity Recognition (NER)
+
+ Red-Candle includes comprehensive Named Entity Recognition capabilities for extracting entities like people, organizations, locations, and custom entity types from text.
+
+ ### Model-based NER
+
+ Load pre-trained NER models from HuggingFace:
+
+ ```ruby
+ require 'candle'
+
+ # Load a pre-trained NER model
+ ner = Candle::NER.from_pretrained("Babelscape/wikineural-multilingual-ner")
+
+ # Or load a model with a specific tokenizer (for models without tokenizer.json)
+ ner = Candle::NER.from_pretrained("dslim/bert-base-NER", tokenizer: "bert-base-cased")
+
+ # Extract entities from text
+ text = "Apple Inc. was founded by Steve Jobs and Steve Wozniak in Cupertino, California."
+ entities = ner.extract_entities(text)
+
+ entities.each do |entity|
+   puts "#{entity['text']} (#{entity['label']}) - confidence: #{entity['confidence'].round(2)}"
+ end
+ # Output:
+ # Apple Inc. (ORG) - confidence: 0.99
+ # Steve Jobs (PER) - confidence: 0.99
+ # Steve Wozniak (PER) - confidence: 0.98
+ # Cupertino (LOC) - confidence: 0.97
+ # California (LOC) - confidence: 0.98
+
+ # Adjust confidence threshold (default: 0.9)
+ entities = ner.extract_entities(text, confidence_threshold: 0.95)
+
+ # Get token-level predictions for detailed analysis
+ tokens = ner.predict_tokens(text)
+ ```
+
+ ### Pattern-based Recognition
+
+ For domain-specific entities, use regex patterns:
+
+ ```ruby
+ # Create pattern-based recognizers
+ email_recognizer = Candle::PatternEntityRecognizer.new("EMAIL", [
+   /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/
+ ])
+
+ phone_recognizer = Candle::PatternEntityRecognizer.new("PHONE", [
+   /\b\d{3}[-.]?\d{3}[-.]?\d{4}\b/,       # 555-123-4567
+   /\b\(\d{3}\)\s*\d{3}[-.]?\d{4}\b/,     # (555) 123-4567
+   /\b\+1\s*\d{3}[-.]?\d{3}[-.]?\d{4}\b/  # +1 555-123-4567
+ ])
+
+ # Extract entities
+ text = "Contact us at info@example.com or call 555-123-4567"
+ email_entities = email_recognizer.recognize(text)
+ phone_entities = phone_recognizer.recognize(text)
+ ```
+
+ ### Gazetteer-based Recognition
+
+ Use dictionaries for known entities:
+
+ ```ruby
+ # Create gazetteer recognizers
+ companies = ["Apple", "Google", "Microsoft", "Amazon", "Tesla"]
+ company_recognizer = Candle::GazetteerEntityRecognizer.new("COMPANY", companies)
+
+ # Load from file
+ drug_recognizer = Candle::GazetteerEntityRecognizer.new("DRUG")
+ drug_recognizer.load_from_file("drug_names.txt")
+
+ # Case-sensitive matching
+ product_recognizer = Candle::GazetteerEntityRecognizer.new("PRODUCT",
+   ["iPhone", "iPad", "MacBook"],
+   case_sensitive: true
+ )
+ ```
+
+ ### Hybrid NER
+
+ Combine ML models with rule-based approaches for best results:
+
+ ```ruby
+ # Create hybrid NER system
+ hybrid = Candle::HybridNER.new("Babelscape/wikineural-multilingual-ner")
+
+ # Add pattern recognizers
+ hybrid.add_pattern_recognizer("EMAIL", [/\b[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}\b/])
+ hybrid.add_pattern_recognizer("PHONE", [/\b\d{3}[-.]?\d{3}[-.]?\d{4}\b/])
+
+ # Add gazetteer recognizers
+ hybrid.add_gazetteer_recognizer("COMPANY", ["Apple", "Google", "Microsoft"])
+ hybrid.add_gazetteer_recognizer("PRODUCT", ["iPhone", "Android", "Windows"])
+
+ # Extract all entities
+ text = "John Smith (john@apple.com) from Apple called about the new iPhone. Reach him at 555-0123."
+ entities = hybrid.extract_entities(text)
+
+ # Results include entities from all recognizers
+ # Overlapping entities are automatically resolved (highest confidence wins)
+ ```
+
+ ### Custom Entity Types
+
+ Perfect for specialized domains:
+
+ ```ruby
+ # Biomedical entities
+ gene_patterns = [
+   /\b[A-Z][A-Z0-9]{2,}\b/,  # TP53, BRCA1, EGFR
+   /\bCD\d+\b/,              # CD4, CD8, CD34
+   /\b[A-Z]+\d[A-Z]\d*\b/    # RAD51C, PALB2
+ ]
+ gene_recognizer = Candle::PatternEntityRecognizer.new("GENE", gene_patterns)
+
+ # Financial entities
+ ticker_patterns = [
+   /\$[A-Z]{1,5}\b/,         # $AAPL, $GOOGL
+   /\b[A-Z]{1,5}\.NYSE\b/,   # AAPL.NYSE
+   /\b[A-Z]{1,5}\.NASDAQ\b/  # GOOGL.NASDAQ
+ ]
+ ticker_recognizer = Candle::PatternEntityRecognizer.new("TICKER", ticker_patterns)
+
+ # Legal entities
+ case_patterns = [
+   /\b\d+\s+F\.\d+d\s+\d+\b/,  # 123 F.3d 456
+   /\b\d+\s+U\.S\.\s+\d+\b/,   # 123 U.S. 456
+   /\bNo\.\s+\d+-\d+\b/        # No. 20-1234
+ ]
+ case_recognizer = Candle::PatternEntityRecognizer.new("CASE", case_patterns)
+ ```
+
+ ### Available Pre-trained Models
+
+ Popular NER models on HuggingFace:
+
+ ```ruby
+ # General multilingual NER (4 entity types: PER, ORG, LOC, MISC)
+ ner = Candle::NER.from_pretrained("Babelscape/wikineural-multilingual-ner")
+
+ # English NER (requires separate tokenizer)
+ ner = Candle::NER.from_pretrained("dslim/bert-base-NER", tokenizer: "bert-base-cased")
+
+ # Multilingual NER
+ ner = Candle::NER.from_pretrained("Davlan/bert-base-multilingual-cased-ner-hrl")
+
+ # OntoNotes 5 (18 entity types including DATE, TIME, MONEY, etc.)
+ ner = Candle::NER.from_pretrained("flair/ner-english-ontonotes-large")
+
+ # Biomedical NER
+ ner = Candle::NER.from_pretrained("dmis-lab/biobert-base-cased-v1.2")
+ ner = Candle::NER.from_pretrained("allenai/scibert_scivocab_uncased")
+ ```
+
+ ### Performance Tips
+
+ 1. **Device Selection**: Use GPU for faster inference
+    ```ruby
+    ner = Candle::NER.from_pretrained("Babelscape/wikineural-multilingual-ner", device: Candle::Device.metal)
+    ```
+
+ 2. **Batch Processing**: Process multiple texts together when possible (see the sketch after this list)
+
+ 3. **Confidence Threshold**: Balance precision/recall with appropriate thresholds
+
+ 4. **Entity Resolution**: The hybrid NER automatically handles overlapping entities
+
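On tip 2: the API shown here takes one string per call, so the simplest way to run over many texts is a plain loop; a sketch using only `extract_entities` from the examples above:

```ruby
texts = [
  "Apple Inc. was founded by Steve Jobs.",
  "Google is headquartered in Mountain View."
]
# Results line up index-for-index with the input texts.
entities_per_text = texts.map { |text| ner.extract_entities(text) }
```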
+ ### Output Format
+
+ All NER methods return entities in a consistent format:
+
+ ```ruby
+ {
+   "text" => "Apple Inc.",  # The entity text
+   "label" => "ORG",        # Entity type
+   "start" => 0,            # Character start position
+   "end" => 10,             # Character end position
+   "confidence" => 0.99,    # Confidence score (0-1)
+   "token_start" => 0,      # Token start index (model-based only)
+   "token_end" => 2,        # Token end index (model-based only)
+   "source" => "model"      # Source: "model", "pattern", or "gazetteer"
+ }
+ ```
+
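Because every recognizer returns this same hash shape, downstream code can filter entities uniformly regardless of their source; a small illustrative snippet:

```ruby
# Keep only high-confidence organizations, whichever recognizer found them.
orgs = entities.select { |e| e["label"] == "ORG" && e["confidence"] >= 0.95 }
orgs.each { |e| puts "#{e['text']} at #{e['start']}..#{e['end']} (via #{e['source']})" }
```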
+ ## Common Runtime Errors
+
+ ### 1. Weight is negative, too large or not a valid number
+
+ **Error:**
+ ```
+ /Users/cpetersen/src/scientist/red-candle/lib/candle/llm.rb:25:in `_generate_stream': Generation failed: A weight is negative, too large or not a valid number (RuntimeError)
+   from /Users/cpetersen/src/scientist/red-candle/lib/candle/llm.rb:25:in `generate_stream'
+   ...
+ ```
+
+ **Cause:** This error occurs when using overly aggressive quantization levels (particularly Q2_K) that result in numerical instability during inference. The 2-bit quantization can cause weights to become corrupted or produce NaN/Inf values.
+
+ **Solution:** Use a higher quantization level. Recommended options:
+ - Q4_K_M (4-bit) - Best balance of quality and size
+ - Q5_K_M (5-bit) - Higher quality with slightly larger size
+ - Q3_K_M (3-bit) - Minimum recommended quantization
+
+ ```ruby
+ # Instead of Q2_K:
+ llm = Candle::LLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+                                   device: device,
+                                   gguf_file: "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")
+ ```
+
+ ### 2. Cannot find tensor model.embed_tokens.weight
+
+ **Error:**
+ ```
+ Failed to load quantized model: cannot find tensor model.embed_tokens.weight (RuntimeError)
+ ```
+
+ **Cause:** This error was common in earlier versions when loading GGUF files with incompatible tensor naming conventions. The unified GGUF loader in version 1.0.0+ should handle most GGUF files correctly.
+
+ **If you still encounter this error:**
+ 1. Ensure you're using the latest version of red-candle (1.0.0 or higher)
+ 2. Make sure to specify the exact GGUF filename:
+    ```ruby
+    llm = Candle::LLM.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
+                                      device: device,
+                                      gguf_file: "mistral-7b-instruct-v0.2.Q4_K_M.gguf")
+    ```
+ 3. If the error persists, the GGUF file may use an unsupported architecture or format
+
+ ### 3. No GGUF file found in repository
+
+ **Error:**
+ ```
+ Failed to load quantized model: No GGUF file found in repository TheBloke/model-name-GGUF. Try specifying a quantization level like Q4_K_M, Q5_K_M, or Q8_0. (RuntimeError)
+ ```
+
+ **Cause:** The automatic GGUF file detection couldn't find a matching file, often due to naming variations.
+
+ **Solution:** Specify the exact GGUF filename:
+ ```ruby
+ # Visit the HuggingFace repository to find the exact filename
+ llm = Candle::LLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGUF",
+                                   device: device,
+                                   gguf_file: "llama-2-7b-chat.Q4_K_M.gguf")
+ ```
+
+ ### 4. Failed to download tokenizer
+
+ **Error:**
+ ```
+ Failed to load quantized model: Failed to download tokenizer: request error: HTTP status client error (404 Not Found)
+ ```
+
+ **Cause:** GGUF repositories often don't include separate tokenizer files since they're embedded in the GGUF format.
+
+ **Solution:** The code now includes fallback tokenizer loading. If you still encounter this error, ensure you're using the latest version of red-candle.
+
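If the fallback still cannot locate a tokenizer, you can map the GGUF repository to a tokenizer explicitly with `register_tokenizer` (shown in the Quantized Model Support section); the repository names below are illustrative:

```ruby
# Point the GGUF repo at the original model's tokenizer before loading.
Candle::LLM.register_tokenizer("TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
                               "mistralai/Mistral-7B-Instruct-v0.2")
llm = Candle::LLM.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
                                  device: device,
                                  gguf_file: "mistral-7b-instruct-v0.2.Q4_K_M.gguf")
```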
+ ### 5. Missing metadata in GGUF file
+
+ **Error:**
+ ```
+ Failed to load GGUF model: cannot find gemma3.attention.head_count in metadata (RuntimeError)
+ ```
+ or
+ ```
+ Failed to load GGUF model: cannot find llama.attention.head_count in metadata (RuntimeError)
+ ```
+
+ **Cause:** Some GGUF files may have been created with older conversion tools that don't include all required metadata fields.
+
+ **Solution:**
+ - Try a different GGUF file from the same model
+ - Look for GGUF files from TheBloke or other reputable sources
+ - Check if a newer version of the GGUF file is available
+ - Some Gemma GGUF files may not be compatible with the current loader
+
+ **Known compatibility issues:**
+ - `lmstudio-ai/gemma-2b-it-GGUF` - Missing required metadata fields
+ - Gemma 3 GGUF files may require specific tokenizers that are not publicly available
+ - For best compatibility, use Llama or Mistral GGUF files from TheBloke
+
  ## Development

  FORK IT!
data/Rakefile CHANGED
@@ -8,7 +8,7 @@ task default: :test
  Rake::TestTask.new do |t|
    t.deps << :compile
    t.libs << "test"
-   t.test_files = FileList["test/**/*_test.rb"]
+   t.test_files = FileList["test/**/*_test.rb"].exclude("test/benchmarks/**/*_test.rb")
  end

  spec = Bundler.load_gemspec("candle.gemspec")
@@ -36,7 +36,6 @@ end

  desc "Run benchmark tests"
  Rake::TestTask.new("test:benchmark") do |t|
-   ENV['CANDLE_RUN_BENCHMARKS'] = 'true'
    t.deps << :compile
    t.libs << "test"
    t.test_files = FileList["test/benchmarks/**/*_test.rb"]
@@ -59,7 +58,6 @@ end

  desc "Run benchmarks with device tests"
  task "test:device:benchmark" => :compile do
-   ENV['CANDLE_RUN_BENCHMARKS'] = 'true'
    ENV['CANDLE_TEST_VERBOSE'] = 'true'
    Rake::Task["test:device"].invoke
    Rake::Task["test:benchmark"].invoke
data/ext/candle/src/lib.rs CHANGED
@@ -1,11 +1,13 @@
  use magnus::{function, prelude::*, Ruby};

  use crate::ruby::candle_utils;
- use crate::ruby::Result as RbResult;
+ use crate::ruby::Result;

  pub mod llm;
+ pub mod ner;
  pub mod reranker;
  pub mod ruby;
+ pub mod tokenizer;

  // Configuration detection from build.rs
  #[cfg(all(has_metal, not(force_cpu)))]
@@ -33,7 +35,7 @@ pub fn get_build_info() -> magnus::RHash {
  }

  #[magnus::init]
- fn init(ruby: &Ruby) -> RbResult<()> {
+ fn init(ruby: &Ruby) -> Result<()> {
      let rb_candle = ruby.define_module("Candle")?;

      // Export build info
@@ -41,11 +43,12 @@ fn init(ruby: &Ruby) -> RbResult<()> {

      ruby::init_embedding_model(rb_candle)?;
      ruby::init_llm(rb_candle)?;
+     ner::init(rb_candle)?;
      reranker::init(rb_candle)?;
      ruby::dtype::init(rb_candle)?;
-     ruby::qtensor::init(rb_candle)?;
      ruby::device::init(rb_candle)?;
      ruby::tensor::init(rb_candle)?;
+     ruby::tokenizer::init(rb_candle)?;
      candle_utils(rb_candle)?;

      Ok(())