red-candle 1.1.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: a3678037fbb196c621c8e9df6a213b0d3dffbdb1b8b3dfd73eee4a7ea2feafca
- data.tar.gz: ada97ef81af854439622bdc12b796442be9e0f31e7c7d8a5df374c7bfb07ff2e
+ metadata.gz: 3f2d005688d7b0253060d087a9800ea8c1d2c7bbb6ff6c92cca3ebc238d99be3
+ data.tar.gz: 6296db628c2d13a39ef035fe45c41be39de2404e6ab72d9735109ad18879f65c
  SHA512:
- metadata.gz: d353177318c4599fa30974a676350087a8e5fd070fe3d317344a4e1b3ae022cb69adf742d62063c2da09dbab7e971cbfae1e53a87527ce7f1c18afd1223797e8
- data.tar.gz: df4b2f43f6fb1aa623053fd09d6e48eba0d8c2615f51dc2accdc4dc292fb3fb7d665553b04cae3747e001e04cd4b9cdbe5c022c3efd077daddf97e074a1e9e5c
+ metadata.gz: 4511b6f96d1356101e10547f740702479c894fb6d1e6f8cb04213b49b624ac8dc73d83b8b91bbab396e095fd31bbb4dd019ca967ce7f790447a4b77dd25d3356
+ data.tar.gz: 5a1ef095e2bbd9967317e0c416fb018da2673e18d76e00c98177cdba5dd2f9c5723fc5404b0d4221800227aab8b74e5560d420aff79f94e216365e6d3cee6f1e
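
These are the digests of the gem's internal metadata.gz and data.tar.gz archives. A quick way to spot-check one of them locally (a sketch; it assumes the .gem archive has already been unpacked so data.tar.gz is on disk):

```ruby
require 'digest'

# Compare a locally computed digest against the SHA256 recorded above for
# data.tar.gz in red-candle 1.2.0 (the file path is hypothetical).
expected = "6296db628c2d13a39ef035fe45c41be39de2404e6ab72d9735109ad18879f65c"
actual   = Digest::SHA256.file("data.tar.gz").hexdigest
puts(actual == expected ? "checksum OK" : "checksum mismatch")
```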
data/README.md CHANGED
@@ -1,4 +1,4 @@
- # `red-candle` Native LLMs for Ruby 🚀
+ <img src="/docs/assets/logo-title.png" alt="red-candle" height="80px">

  [![build](https://github.com/assaydepot/red-candle/actions/workflows/build.yml/badge.svg)](https://github.com/assaydepot/red-candle/actions/workflows/build.yml)
  [![Gem Version](https://badge.fury.io/rb/red-candle.svg)](https://badge.fury.io/rb/red-candle)
@@ -18,7 +18,7 @@ gem install red-candle
  require 'candle'

  # Download a model (one-time, ~650MB) - Mistral, Llama3, Gemma all work!
- llm = Candle::LLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+ llm = Candle::LLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    gguf_file: "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")

  # Chat with it - no API calls, running locally in your Ruby process!
@@ -27,8 +27,8 @@ messages = [
  ]

  puts llm.chat(messages)
- # => "Ruby is a dynamic, object-oriented programming language known for its
- #     simplicity, elegance, and productivity, often used for web development
+ # => "Ruby is a dynamic, object-oriented programming language known for its
+ #     simplicity, elegance, and productivity, often used for web development
  #     with frameworks like Rails."
  ```
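
For context around the whitespace-only hunks above, a minimal sketch of the quick-start chat flow they belong to, assuming the messages array uses role/content hashes (the keys the gem's chat templates read):

```ruby
require 'candle'

# Sketch of the README quick start; the message keys are an assumption,
# the model id and GGUF filename come from the README itself.
llm = Candle::LLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
                                  gguf_file: "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")

messages = [
  { role: "user", content: "Explain Ruby in one sentence." }
]
puts llm.chat(messages)
```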
34
34
 
@@ -99,22 +99,16 @@ x = x.reshape([3, 2])
  require 'candle'

  # Default model (JinaBERT) on CPU
- model = Candle::EmbeddingModel.new
+ model = Candle::EmbeddingModel.from_pretrained
  embedding = model.embedding("Hi there!")

  # Specify device (CPU, Metal, or CUDA)
  device = Candle::Device.cpu # or Candle::Device.metal, Candle::Device.cuda
- model = Candle::EmbeddingModel.new(
-   model_path: "jinaai/jina-embeddings-v2-base-en",
-   device: device
- )
+ model = Candle::EmbeddingModel.from_pretrained("jinaai/jina-embeddings-v2-base-en", device: device)
  embedding = model.embedding("Hi there!")

  # Reranker also supports device selection
- reranker = Candle::Reranker.new(
-   model_path: "cross-encoder/ms-marco-MiniLM-L-12-v2",
-   device: device
- )
+ reranker = Candle::Reranker.from_pretrained("cross-encoder/ms-marco-MiniLM-L-12-v2", device: device)
  results = reranker.rerank("query", ["doc1", "doc2", "doc3"])
  ```
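
Tying the two new constructors above together, a short sketch that embeds a query and reranks a toy corpus on the same device; every call appears in the README, and the structure of the rerank results is not assumed here:

```ruby
require 'candle'

# Build both models on one explicitly chosen device and run them over a toy corpus.
device   = Candle::Device.cpu
model    = Candle::EmbeddingModel.from_pretrained("jinaai/jina-embeddings-v2-base-en", device: device)
reranker = Candle::Reranker.from_pretrained("cross-encoder/ms-marco-MiniLM-L-12-v2", device: device)

docs = ["Ruby is a programming language", "London is a city in England"]
query_embedding = model.embedding("What is Ruby?")  # embedding of the query
results = reranker.rerank("What is Ruby?", docs)    # documents reordered by relevance
```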
@@ -140,8 +134,8 @@ Red-Candle supports quantized models in GGUF format, offering 4-8x memory reduct

  ```ruby
  # Load quantized models - always specify the GGUF filename
- llm = Candle::LLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGUF",
-   device: device,
+ llm = Candle::LLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGUF",
+   device: device,
    gguf_file: "llama-2-7b-chat.Q4_K_M.gguf")

  # Register custom tokenizer mappings for your models
@@ -155,7 +149,7 @@ Candle::LLM.register_tokenizer("my-org/my-model-GGUF", "my-org/my-tokenizer")
  **Memory usage comparison (7B models):**
  - Full precision: ~28 GB
  - Q8_0 (8-bit): ~7 GB - Best quality, larger size
- - Q5_K_M (5-bit): ~4.5 GB - Very good quality
+ - Q5_K_M (5-bit): ~4.5 GB - Very good quality
  - Q4_K_M (4-bit): ~4 GB - Recommended default, best balance
  - Q3_K_M (3-bit): ~3 GB - Good for memory-constrained systems

@@ -169,13 +163,13 @@ Candle::LLM.register_tokenizer("my-org/my-model-GGUF", "my-org/my-tokenizer")
  > **Warning**: Q2_K quantization can lead to "weight is negative, too large or not a valid number" errors during inference. Use Q3_K_M or higher for stable operation.

  > ### ⚠️ Huggingface login warning
- >
+ >
  > Many models, including the one below, require you to agree to the terms. You'll need to:
  > 1. Login to [Huggingface](https://huggingface.co)
  > 2. Agree to the terms. For example: [here](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)
  > 3. Authenticate your session. Simplest way is with `huggingface-cli login`. Detail here: [Huggingface CLI](https://huggingface.co/docs/huggingface_hub/en/guides/cli)
  >
- > More details here: [Huggingface Authentication](HUGGINGFACE.md)
+ > More details here: [Huggingface Authentication](docs/HUGGINGFACE.md)

  ```ruby
  require 'candle'
@@ -208,7 +202,7 @@ response = llm.chat(messages)

  ### GPU Acceleration

- We see an 18x speed up running LLMs under CUDA vs CPU and a >3x speed up running under Metal vs CPU. Details [here](DEVICE_SUPPORT.md#performance-considerations).
+ We see an 18x speed up running LLMs under CUDA vs CPU and a >3x speed up running under Metal vs CPU. Details [here](docs/DEVICE_SUPPORT.md#performance-considerations).

  ```ruby
  # CPU works for all models
@@ -216,7 +210,7 @@ device = Candle::Device.cpu
  llm = Candle::LLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", device: device)

  # Metal
- device = Candle::Device.metal
+ device = Candle::Device.metal

  # CUDA support (for NVIDIA GPUs)
  device = Candle::Device.cuda # Linux/Windows with NVIDIA GPU
@@ -325,9 +319,9 @@ The default model (`jinaai/jina-embeddings-v2-base-en` with the `sentence-transf
  ```ruby
  > require 'candle'
  # Ruby memory = 25.9 MB
- > model = Candle::EmbeddingModel.new
+ > model = Candle::EmbeddingModel.from_pretrained
  # Ruby memory = 3.50 GB
- > model2 = Candle::EmbeddingModel.new
+ > model2 = Candle::EmbeddingModel.from_pretrained
  # Ruby memory = 7.04 GB
  > model2 = nil
  > GC.start
@@ -353,7 +347,7 @@ And the following ruby:

  ```ruby
  require 'candle'
- model = Candle::EmbeddingModel.new
+ model = Candle::EmbeddingModel.from_pretrained
  embedding = model.embedding("Hi there!")
  ```

@@ -367,13 +361,13 @@ Red-Candle includes support for cross-encoder reranking models, which can be use
  require 'candle'

  # Initialize the reranker with a cross-encoder model
- reranker = Candle::Reranker.new(model_path: "cross-encoder/ms-marco-MiniLM-L-12-v2")
+ reranker = Candle::Reranker.from_pretrained("cross-encoder/ms-marco-MiniLM-L-12-v2")

  # Define your query and candidate documents
  query = "How many people live in London?"
  documents = [
    "London is known for its financial district",
-   "Around 9 Million people live in London",
+   "Around 9 Million people live in London",
    "The weather in London is often rainy",
    "London is the capital of England"
  ]
@@ -457,7 +451,7 @@ For faster inference on NVIDIA GPUs:

  ```ruby
  # Initialize with CUDA if available (falls back to CPU if not)
- reranker = Candle::Reranker.new(model_path: "cross-encoder/ms-marco-MiniLM-L-12-v2", cuda: true)
+ reranker = Candle::Reranker.from_pretrained("cross-encoder/ms-marco-MiniLM-L-12-v2", cuda: true)
  ```

  ### How It Works
@@ -501,7 +495,7 @@ tokens = tokenizer.encode_to_tokens("Hello, world!")

  # Get both IDs and tokens together
  result = tokenizer.encode_with_tokens("preprocessing")
- # => {"ids" => [101, 3653, 22618, 2527, 102],
+ # => {"ids" => [101, 3653, 22618, 2527, 102],
  #     "tokens" => ["[CLS]", "prep", "##ro", "##ces", "##sing", "[SEP]"]}
  ```

@@ -563,11 +557,11 @@ llm = Candle::LLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
  llm_tokenizer = llm.tokenizer

  # From EmbeddingModel
- embedding_model = Candle::EmbeddingModel.new
+ embedding_model = Candle::EmbeddingModel.from_pretrained
  emb_tokenizer = embedding_model.tokenizer

  # From Reranker
- reranker = Candle::Reranker.new(model_path: "cross-encoder/ms-marco-MiniLM-L-12-v2")
+ reranker = Candle::Reranker.from_pretrained("cross-encoder/ms-marco-MiniLM-L-12-v2")
  rank_tokenizer = reranker.tokenizer
  ```

@@ -578,7 +572,7 @@ Modern tokenizers split unknown or rare words into subword pieces:
  ```ruby
  # See how words are split into subwords
  result = tokenizer.encode_with_tokens("unbelievable")
- # => {"ids" => [101, 4895, 6499, 102],
+ # => {"ids" => [101, 4895, 6499, 102],
  #     "tokens" => ["[CLS]", "un", "##believable", "[SEP]"]}

  # The ## prefix indicates a continuation of the previous token
@@ -589,7 +583,7 @@ complex = tokenizer.encode_to_tokens("preprocessing tokenization")
  ### Use Cases

  - **Token Analysis**: Understand how your text is being processed by models
- - **Debugging**: See why certain inputs might cause unexpected model behavior
+ - **Debugging**: See why certain inputs might cause unexpected model behavior
  - **Custom Preprocessing**: Build your own text processing pipelines
  - **Educational**: Teach how modern NLP models handle text
  - **NER Preparation**: Get aligned tokens for named entity recognition tasks
@@ -616,7 +610,7 @@ text = "Apple Inc. was founded by Steve Jobs and Steve Wozniak in Cupertino, Cal
  entities = ner.extract_entities(text)

  entities.each do |entity|
-   puts "#{entity['text']} (#{entity['label']}) - confidence: #{entity['confidence'].round(2)}"
+   puts "#{entity[:text]} (#{entity[:label]}) - confidence: #{entity[:confidence].round(2)}"
  end
  # Output:
  # Apple Inc. (ORG) - confidence: 0.99
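
Given the switch to symbol keys shown in this hunk, a short sketch of filtering the returned entities (the keys come from the hunk itself; the 0.9 threshold is arbitrary):

```ruby
# Keep only high-confidence organizations from the entities array above.
orgs = entities.select { |e| e[:label] == "ORG" && e[:confidence] > 0.9 }
orgs.each { |e| puts "#{e[:text]} (#{e[:confidence].round(2)})" }
```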
@@ -668,8 +662,8 @@ drug_recognizer = Candle::GazetteerEntityRecognizer.new("DRUG")
  drug_recognizer.load_from_file("drug_names.txt")

  # Case-sensitive matching
- product_recognizer = Candle::GazetteerEntityRecognizer.new("PRODUCT",
-   ["iPhone", "iPad", "MacBook"],
+ product_recognizer = Candle::GazetteerEntityRecognizer.new("PRODUCT",
+   ["iPhone", "iPad", "MacBook"],
    case_sensitive: true
  )
  ```
@@ -686,7 +680,7 @@ hybrid = Candle::HybridNER.new("Babelscape/wikineural-multilingual-ner")
  hybrid.add_pattern_recognizer("EMAIL", [/\b[\w._%+-]+@[\w.-]+\.[A-Z|a-z]{2,}\b/])
  hybrid.add_pattern_recognizer("PHONE", [/\b\d{3}[-.]?\d{3}[-.]?\d{4}\b/])

- # Add gazetteer recognizers
+ # Add gazetteer recognizers
  hybrid.add_gazetteer_recognizer("COMPANY", ["Apple", "Google", "Microsoft"])
  hybrid.add_gazetteer_recognizer("PRODUCT", ["iPhone", "Android", "Windows"])

@@ -739,7 +733,7 @@ ner = Candle::NER.from_pretrained("Babelscape/wikineural-multilingual-ner")
  # English NER (requires separate tokenizer)
  ner = Candle::NER.from_pretrained("dslim/bert-base-NER", tokenizer: "bert-base-cased")

- # Multilingual NER
+ # Multilingual NER
  ner = Candle::NER.from_pretrained("Davlan/bert-base-multilingual-cased-ner-hrl")

  # OntoNotes 5 (18 entity types including DATE, TIME, MONEY, etc.)
@@ -758,9 +752,9 @@ ner = Candle::NER.from_pretrained("allenai/scibert_scivocab_uncased")
  ```

  2. **Batch Processing**: Process multiple texts together when possible
-
+
  3. **Confidence Threshold**: Balance precision/recall with appropriate thresholds
-
+
  4. **Entity Resolution**: The hybrid NER automatically handles overlapping entities

  ### Output Format
@@ -772,7 +766,7 @@ All NER methods return entities in a consistent format:
  "text" => "Apple Inc.",   # The entity text
  "label" => "ORG",         # Entity type
  "start" => 0,             # Character start position
- "end" => 10,              # Character end position
+ "end" => 10,              # Character end position
  "confidence" => 0.99,     # Confidence score (0-1)
  "token_start" => 0,       # Token start index (model-based only)
  "token_end" => 2,         # Token end index (model-based only)
@@ -799,8 +793,8 @@ All NER methods return entities in a consistent format:
  - Q3_K_M (3-bit) - Minimum recommended quantization

  ```ruby
- llm = Candle::LLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
-   device: device,
+ llm = Candle::LLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+   device: device,
    gguf_file: "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")
  ```

@@ -817,7 +811,7 @@ Failed to load quantized model: cannot find tensor model.embed_tokens.weight (Ru
  1. Ensure you're using the latest version of red-candle (1.0.0 or higher)
  2. Make sure to specify the exact GGUF filename:
  ```ruby
- llm = Candle::LLM.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
+ llm = Candle::LLM.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    device: device,
    gguf_file: "mistral-7b-instruct-v0.2.Q4_K_M.gguf")
  ```
@@ -835,8 +829,8 @@ Failed to load quantized model: No GGUF file found in repository TheBloke/model-
  **Solution:** Specify the exact GGUF filename:
  ```ruby
  # Visit the HuggingFace repository to find the exact filename
- llm = Candle::LLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGUF",
-   device: device,
+ llm = Candle::LLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGUF",
+   device: device,
    gguf_file: "llama-2-7b-chat.Q4_K_M.gguf")
  ```

@@ -864,7 +858,7 @@ Failed to load GGUF model: cannot find llama.attention.head_count in metadata (R

  **Cause:** Some GGUF files may have been created with older conversion tools that don't include all required metadata fields.

- **Solution:**
+ **Solution:**
  - Try a different GGUF file from the same model
  - Look for GGUF files from TheBloke or other reputable sources
  - Check if a newer version of the GGUF file is available
@@ -888,6 +882,69 @@ bundle exec rake compile

  Pull requests are welcome.

+ ## Testing
+
+ Red Candle has comprehensive tests at both the Ruby and Rust levels:
+
+ ### Ruby Tests
+ ```bash
+ # Run all Ruby tests
+ bundle exec rake test
+
+ # Run specific test suites
+ bundle exec rake test:device       # Device compatibility tests
+ bundle exec rake test:benchmark    # Benchmark tests
+ bundle exec rake test:llm:mistral  # Model-specific tests
+ ```
+
+ ### Rust Tests
+ ```bash
+ # Run Rust unit and integration tests
+ cd ext/candle && cargo test
+
+ # Or use the Rake task
+ bundle exec rake rust:test
+ ```
+
+ The Rust tests include:
+ - Unit tests within source files (using `#[cfg(test)]` modules)
+ - Integration tests for external dependencies (candle_core operations)
+ - Tests for structured generation, tokenization, and text generation
+
+ ### Code Coverage
+
+ #### Rust Code Coverage
+ Red Candle uses `cargo-llvm-cov` for Rust code coverage analysis:
+
+ ```bash
+ # Generate HTML coverage report (opens in target/llvm-cov/html/index.html)
+ bundle exec rake rust:coverage:html
+
+ # Show coverage summary in terminal
+ bundle exec rake rust:coverage:summary
+
+ # Generate detailed coverage report
+ bundle exec rake rust:coverage:report
+
+ # Generate LCOV format for CI integration
+ bundle exec rake rust:coverage:lcov
+
+ # Clean coverage data
+ bundle exec rake rust:coverage:clean
+ ```
+
+ **Note**: Overall Rust coverage shows ~17% because most code consists of Ruby FFI bindings that are tested through Ruby tests. The testable Rust components have high coverage:
+ - Constrained generation: 99.59%
+ - Schema processing: 90.99%
+ - Integration tests: 97.12%
+
+ #### Ruby Code Coverage
+ Ruby test coverage is generated automatically when running tests:
+ ```bash
+ bundle exec rake test
+ # Coverage report generated in coverage/index.html
+ ```
+
  ## Release

  1. Update version number in `lib/candle/version.rb` and commit.
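
The Ruby coverage note above, together with the spec:coverage task in the Rakefile diff below (which sets COVERAGE=true), is typically wired up in the spec helper along these lines. This is a hypothetical sketch, not the gem's actual spec_helper:

```ruby
# spec/spec_helper.rb (hypothetical sketch)
if ENV['COVERAGE'] == 'true'
  require 'simplecov'
  SimpleCov.start do
    add_filter '/spec/'      # exclude the specs themselves from the report
    coverage_dir 'coverage'  # writes coverage/index.html
  end
end

require 'candle'
```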
data/Rakefile CHANGED
@@ -1,22 +1,10 @@
  # frozen_string_literal: true

  require "bundler/gem_tasks"
- require "rake/testtask"
  require "rake/extensiontask"
+ require "rspec/core/rake_task"

- task default: :test
- Rake::TestTask.new do |t|
-   t.deps << :compile
-   t.libs << "test"
-   t.test_files = FileList["test/**/*_test.rb"]
-     .exclude("test/benchmarks/**/*_test.rb")
-     .exclude("test/llm/llm_test.rb")
-     .exclude("test/llm/gemma_test.rb")
-     .exclude("test/llm/mistral_test.rb")
-     .exclude("test/llm/llama_test.rb")
-     .exclude("test/llm/phi_test.rb")
-     .exclude("test/llm/qwen_test.rb")
- end
+ task default: :spec

  spec = Bundler.load_gemspec("candle.gemspec")
  Rake::ExtensionTask.new("candle", spec) do |c|
@@ -33,104 +21,147 @@ Rake::ExtensionTask.new("candle", spec) do |c|
  ]
  end

- desc "Run device compatibility tests"
- Rake::TestTask.new("test:device") do |t|
-   t.deps << :compile
-   t.libs << "test"
-   t.test_files = FileList["test/device_compatibility_test.rb"]
-   t.verbose = true
+
+ namespace :doc do
+   task default: %i[rustdoc yard]
+
+   desc "Generate YARD documentation"
+   task :yard do
+     sh <<~CMD
+       yard doc \
+         --plugin rustdoc -- lib tmp/doc/candle.json
+     CMD
+   end
+
+   desc "Generate Rust documentation as JSON"
+   task :rustdoc do
+     sh <<~CMD
+       cargo +nightly rustdoc \
+         --target-dir tmp/doc/target \
+         -p candle \
+         -- -Zunstable-options --output-format json \
+         --document-private-items
+     CMD
+
+     cp "tmp/doc/target/doc/candle.json", "tmp/doc/candle.json"
+   end
  end

- desc "Run benchmark tests"
- Rake::TestTask.new("test:benchmark") do |t|
-   t.deps << :compile
-   t.libs << "test"
-   t.test_files = FileList["test/benchmarks/**/*_test.rb"]
-   t.verbose = true
+ task doc: "doc:default"
+
+ namespace :rust do
+   desc "Run Rust tests with code coverage"
+   namespace :coverage do
+     desc "Generate HTML coverage report"
+     task :html do
+       sh "cd ext/candle && cargo llvm-cov --html"
+       puts "Coverage report generated in target/llvm-cov/html/index.html"
+     end
+
+     desc "Generate coverage report in terminal"
+     task :report do
+       sh "cd ext/candle && cargo llvm-cov"
+     end
+
+     desc "Show coverage summary"
+     task :summary do
+       sh "cd ext/candle && cargo llvm-cov --summary-only"
+     end
+
+     desc "Generate lcov format coverage report"
+     task :lcov do
+       sh "cd ext/candle && cargo llvm-cov --lcov --output-path ../../coverage/lcov.info"
+       puts "LCOV report generated in coverage/lcov.info"
+     end
+
+     desc "Clean coverage data"
+     task :clean do
+       sh "cd ext/candle && cargo llvm-cov clean"
+     end
+   end
+
+   desc "Run Rust tests"
+   task :test do
+     sh "cd ext/candle && cargo test"
+   end
  end

- desc "Run all tests including benchmarks"
- task "test:all" => [:test, "test:benchmark"]
+ desc "Run Rust tests with coverage (alias)"
+ task "coverage:rust" => "rust:coverage:html"

- desc "Run tests on specific devices"
- namespace :test do
+ # RSpec tasks
+ desc "Run RSpec tests"
+ RSpec::Core::RakeTask.new(:spec) do |t|
+   t.rspec_opts = "--format progress"
+ end
+
+ # Add compile as a dependency for spec task
+ task spec: :compile
+
+ namespace :spec do
+   desc "Run RSpec tests with all devices"
+   RSpec::Core::RakeTask.new(:device) do |t|
+     t.rspec_opts = "--format documentation --tag device"
+   end
+
+   desc "Run RSpec tests with coverage"
+   task :coverage do
+     ENV['COVERAGE'] = 'true'
+     Rake::Task["spec"].invoke
+   end
+
+   desc "Run RSpec tests in parallel (requires parallel_tests gem)"
+   task :parallel do
+     begin
+       require 'parallel_tests'
+       sh "parallel_rspec spec/"
+     rescue LoadError
+       puts "parallel_tests gem not installed. Run: gem install parallel_tests"
+     end
+   end
+
+   desc "Run specific device tests"
    %w[cpu metal cuda].each do |device|
      desc "Run tests on #{device.upcase} only"
      task "device:#{device}" => :compile do
        ENV['CANDLE_TEST_DEVICES'] = device
-       Rake::Task["test:device"].invoke
+       sh "rspec spec/device_compatibility_spec.rb --format documentation"
      end
    end
- end
-
- desc "Run benchmarks with device tests"
- task "test:device:benchmark" => :compile do
-   ENV['CANDLE_TEST_VERBOSE'] = 'true'
-   Rake::Task["test:device"].invoke
-   Rake::Task["test:benchmark"].invoke
- end
-
- desc "Run LLM tests for specific models"
- namespace :test do
+
+   desc "Run LLM tests for specific models"
    namespace :llm do
      desc "Run tests for Gemma models"
      task :gemma => :compile do
-       ruby "-Itest", "test/llm/gemma_test.rb"
+       sh "rspec spec/llm/gemma_spec.rb --format documentation"
      end

      desc "Run tests for Phi models"
      task :phi => :compile do
-       ruby "-Itest", "test/llm/phi_test.rb"
+       sh "rspec spec/llm/phi_spec.rb --format documentation"
      end

      desc "Run tests for Qwen models"
      task :qwen => :compile do
-       ruby "-Itest", "test/llm/qwen_test.rb"
+       sh "rspec spec/llm/qwen_spec.rb --format documentation"
      end

      desc "Run tests for Mistral models"
      task :mistral => :compile do
-       ruby "-Itest", "test/llm/mistral_test.rb"
+       sh "rspec spec/llm/mistral_spec.rb --format documentation"
      end

      desc "Run tests for Llama models"
      task :llama => :compile do
-       ruby "-Itest", "test/llm/llama_test.rb"
+       sh "rspec spec/llm/llama_spec.rb --format documentation"
      end

      desc "Run tests for TinyLlama models"
      task :tinyllama => :compile do
-       ruby "-Itest", "test/llm/tinyllama_test.rb"
+       sh "rspec spec/llm/tinyllama_spec.rb --format documentation"
      end

-     desc "Run all LLM tests (WARNING: downloads large models)"
-     task :all => [:gemma, :phi, :qwen, :mistral, :llama]
-   end
- end
-
- namespace :doc do
-   task default: %i[rustdoc yard]
-
-   desc "Generate YARD documentation"
-   task :yard do
-     sh <<~CMD
-       yard doc \
-         --plugin rustdoc -- lib tmp/doc/candle.json
-     CMD
-   end
-
-   desc "Generate Rust documentation as JSON"
-   task :rustdoc do
-     sh <<~CMD
-       cargo +nightly rustdoc \
-         --target-dir tmp/doc/target \
-         -p candle \
-         -- -Zunstable-options --output-format json \
-         --document-private-items
-     CMD
-
-     cp "tmp/doc/target/doc/candle.json", "tmp/doc/candle.json"
+     desc "Run all LLM tests (WARNING: requires large models already downloaded)"
+     task :all => [:gemma, :phi, :qwen, :mistral, :llama, :tinyllama]
    end
  end
-
- task doc: "doc:default"
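
The new spec:device task above filters by RSpec metadata (`--tag device`). A hypothetical sketch of a spec that such a filter would pick up (the file name comes from the Rakefile; the spec body is illustrative only, not the gem's actual spec):

```ruby
# spec/device_compatibility_spec.rb (illustrative sketch)
require 'candle'

RSpec.describe "device compatibility", :device do
  it "constructs a CPU device" do
    expect(Candle::Device.cpu).not_to be_nil
  end
end
```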
@@ -4,8 +4,6 @@ use crate::ruby::candle_utils;
  use crate::ruby::Result;

  pub mod llm;
- pub mod ner;
- pub mod reranker;
  pub mod ruby;
  pub mod structured;
  pub mod tokenizer;
@@ -44,8 +42,8 @@ fn init(ruby: &Ruby) -> Result<()> {

    ruby::init_embedding_model(rb_candle)?;
    ruby::init_llm(rb_candle)?;
-   ner::init(rb_candle)?;
-   reranker::init(rb_candle)?;
+   ruby::ner::init(rb_candle)?;
+   ruby::reranker::init(rb_candle)?;
    ruby::dtype::init(rb_candle)?;
    ruby::device::init(rb_candle)?;
    ruby::tensor::init(rb_candle)?;
@@ -18,7 +18,7 @@ pub struct QuantizedGGUF {
    device: Device,
    model_id: String,
    eos_token_id: u32,
-   architecture: String,
+   pub architecture: String,
    _chat_template: Option<String>,
  }

@@ -320,7 +320,9 @@ impl QuantizedGGUF {
    // Check model name since Mistral GGUF reports as llama architecture
    let model_lower = self.model_id.to_lowercase();

-   if model_lower.contains("mistral") {
+   if model_lower.contains("tinyllama") {
+       self.apply_chatml_template(messages)
+   } else if model_lower.contains("mistral") {
        self.apply_mistral_template(messages)
    } else if model_lower.contains("gemma") {
        // Always use Gemma template for Gemma models, regardless of loader used
@@ -516,6 +518,20 @@ impl QuantizedGGUF {
        Ok(prompt)
    }

+   fn apply_chatml_template(&self, messages: &[serde_json::Value]) -> CandleResult<String> {
+       let mut prompt = String::new();
+
+       for message in messages {
+           let role = message["role"].as_str().unwrap_or("");
+           let content = message["content"].as_str().unwrap_or("");
+
+           prompt.push_str(&format!("<|{}|>\n{}</s>\n", role, content));
+       }
+
+       prompt.push_str("<|assistant|>");
+       Ok(prompt)
+   }
+
    fn apply_generic_template(&self, messages: &[serde_json::Value]) -> String {
        let mut prompt = String::new();
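
The apply_chatml_template function added above builds a ChatML-style prompt for TinyLlama GGUF models. As an illustration of the string it produces, here is a Ruby sketch that mirrors the Rust format string (illustrative only, not code from the gem):

```ruby
# Mirror of the Rust template: "<|role|>\n<content></s>\n" per message, then "<|assistant|>".
messages = [
  { role: "system", content: "You are a helpful assistant." },
  { role: "user",   content: "Hi!" }
]
prompt = messages.map { |m| "<|#{m[:role]}|>\n#{m[:content]}</s>\n" }.join
prompt += "<|assistant|>"
# => "<|system|>\nYou are a helpful assistant.</s>\n<|user|>\nHi!</s>\n<|assistant|>"
```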