red-candle 1.0.0.pre.6 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +4 -4
  2. data/Gemfile +1 -10
  3. data/README.md +481 -4
  4. data/Rakefile +1 -3
  5. data/ext/candle/src/lib.rs +6 -3
  6. data/ext/candle/src/llm/gemma.rs +21 -79
  7. data/ext/candle/src/llm/generation_config.rs +3 -0
  8. data/ext/candle/src/llm/llama.rs +21 -79
  9. data/ext/candle/src/llm/mistral.rs +21 -89
  10. data/ext/candle/src/llm/mod.rs +3 -33
  11. data/ext/candle/src/llm/quantized_gguf.rs +501 -0
  12. data/ext/candle/src/llm/text_generation.rs +0 -4
  13. data/ext/candle/src/ner.rs +423 -0
  14. data/ext/candle/src/reranker.rs +24 -21
  15. data/ext/candle/src/ruby/device.rs +6 -6
  16. data/ext/candle/src/ruby/dtype.rs +4 -4
  17. data/ext/candle/src/ruby/embedding_model.rs +36 -34
  18. data/ext/candle/src/ruby/llm.rs +110 -49
  19. data/ext/candle/src/ruby/mod.rs +1 -2
  20. data/ext/candle/src/ruby/tensor.rs +66 -66
  21. data/ext/candle/src/ruby/tokenizer.rs +269 -0
  22. data/ext/candle/src/ruby/utils.rs +6 -24
  23. data/ext/candle/src/tokenizer/loader.rs +108 -0
  24. data/ext/candle/src/tokenizer/mod.rs +103 -0
  25. data/ext/candle/target/release/build/bindgen-0f89ba23b9ca1395/out/host-target.txt +1 -0
  26. data/ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/common.rs +355 -0
  27. data/ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/dynamic.rs +276 -0
  28. data/ext/candle/target/release/build/clang-sys-cac31d63c4694603/out/macros.rs +49 -0
  29. data/ext/candle/target/release/build/pulp-1b95cfe377eede97/out/x86_64_asm.rs +2748 -0
  30. data/ext/candle/target/release/build/rb-sys-f8ac4edc30ab3e53/out/bindings-0.9.116-mri-arm64-darwin24-3.3.0.rs +8902 -0
  31. data/lib/candle/build_info.rb +2 -0
  32. data/lib/candle/device_utils.rb +2 -0
  33. data/lib/candle/llm.rb +91 -2
  34. data/lib/candle/ner.rb +345 -0
  35. data/lib/candle/reranker.rb +1 -1
  36. data/lib/candle/tensor.rb +2 -0
  37. data/lib/candle/tokenizer.rb +139 -0
  38. data/lib/candle/version.rb +4 -2
  39. data/lib/candle.rb +2 -0
  40. metadata +127 -3
  41. data/ext/candle/src/ruby/qtensor.rs +0 -69
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 07ca4e6eb0b65eac5b62f4b3622ed3189f203279265b7174936ccfd5ff3e5099
- data.tar.gz: f4970f5c4376453cde1ee18b93155f69ca634ccc3e4a359a45b49d7f20379f64
+ metadata.gz: 829a937851c782dfd58b8fb724dc7b08d524d26400047e9f5fc7a5bd0de9cb4b
+ data.tar.gz: e8a9420fc310e977968aa396a47e5e5269d107eb8cb7246ca9e2f980a0a28f4d
  SHA512:
- metadata.gz: 10ed0881ec2f67ab1e798401e857eac638049b254b20460bcb5565cee822b24ce2abe23d0ce00275dcb1d1ddebfd926d47eac7e6d54924937da4356a36211224
- data.tar.gz: d24fa67f74cd62c87ea1666e9488f12e8773d15e2d62b806bd38ca7cb20215d819b0502d352e4d310d1377b9ab64debbdd664148c70fc1ac70f1f2e23e9b516c
+ metadata.gz: 020e23df61d5679612a7892bdbfc7dbcf2d28055df9fc6b9199a8e09933e74d03a0a0fb97d6ddc998f8f8e70e856d63d7eb758638e072cd492734b21681267b7
+ data.tar.gz: 5e10f888c2bd74dfdf01c97ca18f6666440edb000d524784ad0ac0676208d9485b8d708fa72d8fe73672f018c038b3526109153540ce5bbc53a81e4e30deccd1
data/Gemfile CHANGED
@@ -1,12 +1,3 @@
  source "https://rubygems.org"

- gemspec
-
- gem "minitest"
- gem "rake"
- gem "rake-compiler"
-
- gem "yard", require: false
- gem "yard-rustdoc", require: false
-
- gem "redcarpet", "~> 3.6"
+ gemspec
data/README.md CHANGED
@@ -51,6 +51,42 @@ Red-Candle now supports Large Language Models (LLMs) with GPU acceleration!
  - **Llama**: Llama 2 and Llama 3 models (e.g., `TinyLlama/TinyLlama-1.1B-Chat-v1.0`, `meta-llama/Llama-2-7b-hf`, `NousResearch/Llama-2-7b-hf`)
  - **Mistral**: All Mistral models (e.g., `mistralai/Mistral-7B-Instruct-v0.1`)

+ ### Quantized Model Support (GGUF)
+
+ Red-Candle supports quantized models in GGUF format, offering 4-8x memory reduction:
+
+ > **Note on GGUF Support**: Red-Candle now uses a unified GGUF loader that automatically detects the model architecture from the GGUF file. This means all GGUF models (including Mistral models from TheBloke) should now work correctly! The loader automatically selects the appropriate tokenizer based on the model type to ensure proper text generation.
+
+ ```ruby
+ # Load quantized models - always specify the GGUF filename
+ llm = Candle::LLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGUF",
+                                   device: device,
+                                   gguf_file: "llama-2-7b-chat.Q4_K_M.gguf")
+
+ # Register custom tokenizer mappings for your models
+ Candle::LLM.register_tokenizer("my-org/my-model-GGUF", "my-org/my-tokenizer")
+
+ # Popular quantized model sources:
+ # - TheBloke: Extensive collection of GGUF models
+ # - Search HuggingFace for "GGUF" models
+ ```
+
+ **Memory usage comparison (7B models):**
+ - Full precision: ~28 GB
+ - Q8_0 (8-bit): ~7 GB - Best quality, larger size
+ - Q5_K_M (5-bit): ~4.5 GB - Very good quality
+ - Q4_K_M (4-bit): ~4 GB - Recommended default, best balance
+ - Q3_K_M (3-bit): ~3 GB - Good for memory-constrained systems
+
+ **Quantization levels explained:**
+ - **Q8_0**: Almost identical to full model, use when quality is paramount
+ - **Q5_K_M**: Excellent quality with good compression
+ - **Q4_K_M**: Best balance of quality/size/speed (recommended default)
+ - **Q3_K_M**: Noticeable quality reduction but very compact
+ - **Q2_K**: ⚠️ **Not recommended** - Can cause inference errors due to extreme quantization
+
+ > **Warning**: Q2_K quantization can lead to "weight is negative, too large or not a valid number" errors during inference. Use Q3_K_M or higher for stable operation.
+
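To make the tradeoff concrete, here is a minimal sketch of switching levels; the filenames follow TheBloke's `<model>.<LEVEL>.gguf` naming convention, so check the repository's file list for the exact names:

```ruby
# Only the gguf_file string changes when trading memory for quality.
# "Q4_K_M" is the recommended default; "Q5_K_M" or "Q8_0" give higher quality.
quant_level = "Q4_K_M"
llm = Candle::LLM.from_pretrained(
  "TheBloke/Llama-2-7B-Chat-GGUF",
  device: device,
  gguf_file: "llama-2-7b-chat.#{quant_level}.gguf"
)
```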
  > ### ⚠️ Huggingface login warning
  >
  > Many models, including the one below, require you to agree to the terms. You'll need to:
@@ -91,6 +127,8 @@ response = llm.chat(messages)

  ### GPU Acceleration

+ We see an 18x speedup running LLMs under CUDA vs CPU and a >3x speedup running under Metal vs CPU. Details [here](DEVICE_SUPPORT.md#performance-considerations).
+
  ```ruby
  # CPU works for all models
  device = Candle::Device.cpu
@@ -103,9 +141,38 @@ device = Candle::Device.metal
  device = Candle::Device.cuda # Linux/Windows with NVIDIA GPU
  ```

- ## ⚠️ Model Format Requirement: Safetensors Only
+ ### Debugging Token Generation
+
+ For debugging purposes, you can enable raw token output to see both token IDs and their raw representations:
+
+ ```ruby
+ # Enable debug mode to see raw tokens during generation
+ config = Candle::GenerationConfig.balanced(debug_tokens: true)
+
+ # Non-streaming generation with debug tokens
+ result = llm.generate("Hello, world!", config: config)
+ puts result
+ # Output: [15043:Hello][11:,][1917:world][0:!]
+
+ # Streaming generation with debug tokens
+ llm.generate_stream("Hello, world!", config: config) do |text|
+   print text # Will show each token as it's generated: [15043:Hello][11:,][1917:world][0:!]
+ end
+
+ # Works with all models (Llama, Mistral, Gemma, and quantized GGUF models)
+ ```
+
+ This is particularly useful for:
+ - Debugging tokenization issues
+ - Understanding how the model processes text
+ - Troubleshooting generation problems
+ - Analyzing model behavior
+
+ ## ⚠️ Model Format Requirements

- Red-Candle **only supports embedding models that provide their weights in the [safetensors](https://github.com/huggingface/safetensors) format** (i.e., the model repo must contain a `model.safetensors` file). If the model repo does not provide the required file, loading will fail with a clear error. Most official BERT and DistilBERT models do **not** provide safetensors; many Sentence Transformers and JinaBERT models do.
+ ### EmbeddingModels and Rerankers: Safetensors Only
+
+ Red-Candle **only supports embedding models and rerankers that provide their weights in the [safetensors](https://github.com/huggingface/safetensors) format** (i.e., the model repo must contain a `model.safetensors` file). If the model repo does not provide the required file, loading will fail with a clear error. Most official BERT and DistilBERT models do **not** provide safetensors; many Sentence Transformers and JinaBERT models do.

  **If you encounter an error like:**

@@ -115,13 +182,22 @@ RuntimeError: model.safetensors not found after download. Only safetensors model

  this means the selected model is not compatible. Please choose a model repo that provides the required file.

+ ### LLMs: Safetensors and GGUF Support
+
+ LLM models support two formats:
+ 1. **Safetensors format** - Standard HuggingFace models (e.g., `TinyLlama/TinyLlama-1.1B-Chat-v1.0`)
+ 2. **GGUF quantized format** - Memory-efficient quantized models (e.g., `TheBloke/Llama-2-7B-Chat-GGUF`)
+
+ See the [Quantized Model Support](#quantized-model-support-gguf) section for details on using GGUF models.
+
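Both formats load through the same `from_pretrained` call; a minimal sketch combining the examples above (`device` is a `Candle::Device` from the GPU Acceleration section):

```ruby
# 1. Safetensors: pass the HuggingFace repo id
llm = Candle::LLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", device: device)

# 2. GGUF: same call, with gguf_file selecting the quantization
llm = Candle::LLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGUF",
                                  device: device,
                                  gguf_file: "llama-2-7b-chat.Q4_K_M.gguf")
```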
  ## Supported Embedding Models

  Red-Candle supports the following embedding model types from Hugging Face:

  1. `Candle::EmbeddingModelType::JINA_BERT` - Jina BERT models (e.g., `jinaai/jina-embeddings-v2-base-en`) (**safetensors required**)
- 2. `Candle::EmbeddingModelType::STANDARD_BERT` - Standard BERT models (e.g., `sentence-transformers/all-MiniLM-L6-v2`) (**safetensors required**)
+ 2. `Candle::EmbeddingModelType::MINILM` - MiniLM models (e.g., `sentence-transformers/all-MiniLM-L6-v2`) (**safetensors required**)
  3. `Candle::EmbeddingModelType::DISTILBERT` - DistilBERT models (e.g., `distilbert-base-uncased-finetuned-sst-2-english`) (**safetensors required**)
+ 4. `Candle::EmbeddingModelType::STANDARD_BERT` - Standard BERT models (e.g., `scientistcom/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext`) (**safetensors required**)

  > **Note:** Most official BERT and DistilBERT models do _not_ provide safetensors. Please check the model repo before use.

@@ -197,7 +273,7 @@ ranked_results = reranker.rerank(query, documents, pooling_method: "pooler", app
  # Or apply sigmoid activation to get scores between 0 and 1
  sigmoid_results = reranker.rerank(query, documents, pooling_method: "pooler", apply_sigmoid: true)

- # The pooler method is the default and is recommended for cross-encoders, as is apply_sigmod, so the above is the same as:
+ # The pooler method is the default and is recommended for cross-encoders, as is apply_sigmoid, so the above is the same as:
  ranked_results = reranker.rerank(query, documents)

  # Results are returned as an array of hashes, sorted by relevance
@@ -288,6 +364,407 @@ The reranker uses a BERT-based architecture that:

  This joint processing allows cross-encoders to capture subtle semantic relationships between queries and documents, making them more accurate for reranking tasks, though at the cost of higher computational requirements.

+ ## Tokenizer
+
+ Red-Candle provides direct access to tokenizers for text preprocessing and analysis. This is useful for understanding how models process text, debugging issues, and building custom NLP pipelines.
+
+ ### Basic Usage
+
+ ```ruby
+ require 'candle'
+
+ # Load a tokenizer from HuggingFace
+ tokenizer = Candle::Tokenizer.from_pretrained("bert-base-uncased")
+
+ # Encode text to token IDs
+ token_ids = tokenizer.encode("Hello, world!")
+ # => [101, 7592, 1010, 2088, 999, 102]
+
+ # Decode token IDs back to text
+ text = tokenizer.decode(token_ids)
+ # => "hello, world!"
+
+ # Get token strings (subwords) - useful for visualization
+ tokens = tokenizer.encode_to_tokens("Hello, world!")
+ # => ["[CLS]", "hello", ",", "world", "!", "[SEP]"]
+
+ # Get both IDs and tokens together
+ result = tokenizer.encode_with_tokens("preprocessing")
+ # => {"ids" => [101, 3653, 22618, 2527, 102],
+ #     "tokens" => ["[CLS]", "prep", "##ro", "##ces", "##sing", "[SEP]"]}
+ ```
+
+ ### Batch Processing
+
+ ```ruby
+ # Encode multiple texts at once
+ texts = ["Hello world", "How are you?", "Tokenizers are cool"]
+ batch_ids = tokenizer.encode_batch(texts)
+
+ # Get token strings for multiple texts
+ batch_tokens = tokenizer.encode_batch_to_tokens(texts)
+ ```
+
+ ### Vocabulary Access
+
+ ```ruby
+ # Get vocabulary size
+ vocab_size = tokenizer.vocab_size
+ # => 30522
+
+ # Get full vocabulary as a hash
+ vocab = tokenizer.get_vocab
+ # vocab["hello"] => 7592
+
+ # Convert a specific token ID to its string
+ token_str = tokenizer.id_to_token(7592)
+ # => "hello"
+
+ # Get special tokens
+ special = tokenizer.get_special_tokens
+ # => {"cls_token" => 101, "sep_token" => 102, "pad_token" => 0, ...}
+ ```
+
+ ### Configuration
+
+ ```ruby
+ # Create a tokenizer with padding enabled
+ padded_tokenizer = tokenizer.with_padding(length: 128)
+
+ # Create a tokenizer with truncation
+ truncated_tokenizer = tokenizer.with_truncation(512)
+
+ # Configure padding with more options
+ padded_tokenizer = tokenizer.with_padding(
+   length: 128,        # Fixed length padding
+   direction: "right", # Pad on the right (default)
+   pad_token: "[PAD]"  # Padding token
+ )
+ ```
+
+ ### Model Integration
+
+ All models expose their tokenizers:
+
+ ```ruby
+ # From LLM
+ llm = Candle::LLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+ llm_tokenizer = llm.tokenizer
+
+ # From EmbeddingModel
+ embedding_model = Candle::EmbeddingModel.new
+ emb_tokenizer = embedding_model.tokenizer
+
+ # From Reranker
+ reranker = Candle::Reranker.new(model_path: "cross-encoder/ms-marco-MiniLM-L-12-v2")
+ rank_tokenizer = reranker.tokenizer
+ ```
+
+ ### Understanding Subword Tokenization
+
+ Modern tokenizers split unknown or rare words into subword pieces:
+
+ ```ruby
+ # See how words are split into subwords
+ result = tokenizer.encode_with_tokens("unbelievable")
+ # => {"ids" => [101, 4895, 6499, 102],
+ #     "tokens" => ["[CLS]", "un", "##believable", "[SEP]"]}
+
+ # The ## prefix indicates a continuation of the previous token
+ complex = tokenizer.encode_to_tokens("preprocessing tokenization")
+ # => ["[CLS]", "prep", "##ro", "##ces", "##sing", "token", "##ization", "[SEP]"]
+ ```
+
+ ### Use Cases
+
+ - **Token Analysis**: Understand how your text is being processed by models
+ - **Debugging**: See why certain inputs might cause unexpected model behavior
+ - **Custom Preprocessing**: Build your own text processing pipelines
+ - **Educational**: Teach how modern NLP models handle text
+ - **NER Preparation**: Get aligned tokens for named entity recognition tasks
+
+ ## Named Entity Recognition (NER)
+
+ Red-Candle includes comprehensive Named Entity Recognition capabilities for extracting entities like people, organizations, locations, and custom entity types from text.
+
+ ### Model-based NER
+
+ Load pre-trained NER models from HuggingFace:
+
+ ```ruby
+ require 'candle'
+
+ # Load a pre-trained NER model
+ ner = Candle::NER.from_pretrained("Babelscape/wikineural-multilingual-ner")
+
+ # Or load a model with a specific tokenizer (for models without tokenizer.json)
+ ner = Candle::NER.from_pretrained("dslim/bert-base-NER", tokenizer: "bert-base-cased")
+
+ # Extract entities from text
+ text = "Apple Inc. was founded by Steve Jobs and Steve Wozniak in Cupertino, California."
+ entities = ner.extract_entities(text)
+
+ entities.each do |entity|
+   puts "#{entity['text']} (#{entity['label']}) - confidence: #{entity['confidence'].round(2)}"
+ end
+ # Output:
+ # Apple Inc. (ORG) - confidence: 0.99
+ # Steve Jobs (PER) - confidence: 0.99
+ # Steve Wozniak (PER) - confidence: 0.98
+ # Cupertino (LOC) - confidence: 0.97
+ # California (LOC) - confidence: 0.98
+
+ # Adjust confidence threshold (default: 0.9)
+ entities = ner.extract_entities(text, confidence_threshold: 0.95)
+
+ # Get token-level predictions for detailed analysis
+ tokens = ner.predict_tokens(text)
+ ```
+
+ ### Pattern-based Recognition
+
+ For domain-specific entities, use regex patterns:
+
+ ```ruby
+ # Create pattern-based recognizers
+ email_recognizer = Candle::PatternEntityRecognizer.new("EMAIL", [
+   /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/
+ ])
+
+ phone_recognizer = Candle::PatternEntityRecognizer.new("PHONE", [
+   /\b\d{3}[-.]?\d{3}[-.]?\d{4}\b/,       # 555-123-4567
+   /\b\(\d{3}\)\s*\d{3}[-.]?\d{4}\b/,     # (555) 123-4567
+   /\b\+1\s*\d{3}[-.]?\d{3}[-.]?\d{4}\b/  # +1 555-123-4567
+ ])
+
+ # Extract entities
+ text = "Contact us at info@example.com or call 555-123-4567"
+ email_entities = email_recognizer.recognize(text)
+ phone_entities = phone_recognizer.recognize(text)
+ ```
+
+ ### Gazetteer-based Recognition
+
+ Use dictionaries for known entities:
+
+ ```ruby
+ # Create gazetteer recognizers
+ companies = ["Apple", "Google", "Microsoft", "Amazon", "Tesla"]
+ company_recognizer = Candle::GazetteerEntityRecognizer.new("COMPANY", companies)
+
+ # Load from file
+ drug_recognizer = Candle::GazetteerEntityRecognizer.new("DRUG")
+ drug_recognizer.load_from_file("drug_names.txt")
+
+ # Case-sensitive matching
+ product_recognizer = Candle::GazetteerEntityRecognizer.new("PRODUCT",
+   ["iPhone", "iPad", "MacBook"],
+   case_sensitive: true
+ )
+ ```
+
+ ### Hybrid NER
+
+ Combine ML models with rule-based approaches for best results:
+
+ ```ruby
+ # Create hybrid NER system
+ hybrid = Candle::HybridNER.new("Babelscape/wikineural-multilingual-ner")
+
+ # Add pattern recognizers
+ hybrid.add_pattern_recognizer("EMAIL", [/\b[\w._%+-]+@[\w.-]+\.[A-Za-z]{2,}\b/])
+ hybrid.add_pattern_recognizer("PHONE", [/\b\d{3}[-.]?\d{3}[-.]?\d{4}\b/])
+
+ # Add gazetteer recognizers
+ hybrid.add_gazetteer_recognizer("COMPANY", ["Apple", "Google", "Microsoft"])
+ hybrid.add_gazetteer_recognizer("PRODUCT", ["iPhone", "Android", "Windows"])
+
+ # Extract all entities
+ text = "John Smith (john@apple.com) from Apple called about the new iPhone. Reach him at 555-0123."
+ entities = hybrid.extract_entities(text)
+
+ # Results include entities from all recognizers
+ # Overlapping entities are automatically resolved (highest confidence wins)
+ ```
+
+ ### Custom Entity Types
+
+ Perfect for specialized domains:
+
+ ```ruby
+ # Biomedical entities
+ gene_patterns = [
+   /\b[A-Z][A-Z0-9]{2,}\b/,  # TP53, BRCA1, EGFR
+   /\bCD\d+\b/,              # CD4, CD8, CD34
+   /\b[A-Z]+\d[A-Z]\d*\b/    # RAD51C, PALB2
+ ]
+ gene_recognizer = Candle::PatternEntityRecognizer.new("GENE", gene_patterns)
+
+ # Financial entities
+ ticker_patterns = [
+   /\$[A-Z]{1,5}\b/,         # $AAPL, $GOOGL
+   /\b[A-Z]{1,5}\.NYSE\b/,   # AAPL.NYSE
+   /\b[A-Z]{1,5}\.NASDAQ\b/  # GOOGL.NASDAQ
+ ]
+ ticker_recognizer = Candle::PatternEntityRecognizer.new("TICKER", ticker_patterns)
+
+ # Legal entities
+ case_patterns = [
+   /\b\d+\s+F\.\d+d\s+\d+\b/,  # 123 F.3d 456
+   /\b\d+\s+U\.S\.\s+\d+\b/,   # 123 U.S. 456
+   /\bNo\.\s+\d+-\d+\b/        # No. 20-1234
+ ]
+ case_recognizer = Candle::PatternEntityRecognizer.new("CASE", case_patterns)
+ ```
+
+ ### Available Pre-trained Models
+
+ Popular NER models on HuggingFace:
+
+ ```ruby
+ # General multilingual NER (4 entity types: PER, ORG, LOC, MISC)
+ ner = Candle::NER.from_pretrained("Babelscape/wikineural-multilingual-ner")
+
+ # English NER (requires separate tokenizer)
+ ner = Candle::NER.from_pretrained("dslim/bert-base-NER", tokenizer: "bert-base-cased")
+
+ # Multilingual NER
+ ner = Candle::NER.from_pretrained("Davlan/bert-base-multilingual-cased-ner-hrl")
+
+ # OntoNotes 5 (18 entity types including DATE, TIME, MONEY, etc.)
+ ner = Candle::NER.from_pretrained("flair/ner-english-ontonotes-large")
+
+ # Biomedical NER
+ ner = Candle::NER.from_pretrained("dmis-lab/biobert-base-cased-v1.2")
+ ner = Candle::NER.from_pretrained("allenai/scibert_scivocab_uncased")
+ ```
+
+ ### Performance Tips
+
+ 1. **Device Selection**: Use GPU for faster inference
+    ```ruby
+    ner = Candle::NER.from_pretrained("Babelscape/wikineural-multilingual-ner", device: Candle::Device.metal)
+    ```
+
+ 2. **Batch Processing**: Process multiple texts together when possible (see the sketch after this list)
+
+ 3. **Confidence Threshold**: Balance precision/recall with appropriate thresholds
+
+ 4. **Entity Resolution**: The hybrid NER automatically handles overlapping entities
+
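On tip 2: the API shown here takes one string per call, so the simplest way to run over many texts is a plain loop; a sketch using only `extract_entities` from the examples above:

```ruby
texts = [
  "Apple Inc. was founded by Steve Jobs.",
  "Google is headquartered in Mountain View."
]
# Results line up index-for-index with the input texts.
entities_per_text = texts.map { |text| ner.extract_entities(text) }
```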
+ ### Output Format
+
+ All NER methods return entities in a consistent format:
+
+ ```ruby
+ {
+   "text" => "Apple Inc.",  # The entity text
+   "label" => "ORG",        # Entity type
+   "start" => 0,            # Character start position
+   "end" => 10,             # Character end position
+   "confidence" => 0.99,    # Confidence score (0-1)
+   "token_start" => 0,      # Token start index (model-based only)
+   "token_end" => 2,        # Token end index (model-based only)
+   "source" => "model"      # Source: "model", "pattern", or "gazetteer"
+ }
+ ```
+
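Because every recognizer returns this same hash shape, downstream code can filter entities uniformly regardless of their source; a small illustrative snippet:

```ruby
# Keep only high-confidence organizations, whichever recognizer found them.
orgs = entities.select { |e| e["label"] == "ORG" && e["confidence"] >= 0.95 }
orgs.each { |e| puts "#{e['text']} at #{e['start']}..#{e['end']} (via #{e['source']})" }
```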
+ ## Common Runtime Errors
+
+ ### 1. Weight is negative, too large or not a valid number
+
+ **Error:**
+ ```
+ /Users/cpetersen/src/scientist/red-candle/lib/candle/llm.rb:25:in `_generate_stream': Generation failed: A weight is negative, too large or not a valid number (RuntimeError)
+   from /Users/cpetersen/src/scientist/red-candle/lib/candle/llm.rb:25:in `generate_stream'
+   ...
+ ```
+
+ **Cause:** This error occurs when using overly aggressive quantization levels (particularly Q2_K) that result in numerical instability during inference. The 2-bit quantization can cause weights to become corrupted or produce NaN/Inf values.
+
+ **Solution:** Use a higher quantization level. Recommended options:
+ - Q4_K_M (4-bit) - Best balance of quality and size
+ - Q5_K_M (5-bit) - Higher quality with slightly larger size
+ - Q3_K_M (3-bit) - Minimum recommended quantization
+
+ ```ruby
+ # Instead of Q2_K:
+ llm = Candle::LLM.from_pretrained("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+                                   device: device,
+                                   gguf_file: "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")
+ ```
+
+ ### 2. Cannot find tensor model.embed_tokens.weight
+
+ **Error:**
+ ```
+ Failed to load quantized model: cannot find tensor model.embed_tokens.weight (RuntimeError)
+ ```
+
+ **Cause:** This error was common in earlier versions when loading GGUF files with incompatible tensor naming conventions. The unified GGUF loader in version 1.0.0+ should handle most GGUF files correctly.
+
+ **If you still encounter this error:**
+ 1. Ensure you're using the latest version of red-candle (1.0.0 or higher)
+ 2. Make sure to specify the exact GGUF filename:
+    ```ruby
+    llm = Candle::LLM.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
+                                      device: device,
+                                      gguf_file: "mistral-7b-instruct-v0.2.Q4_K_M.gguf")
+    ```
+ 3. If the error persists, the GGUF file may use an unsupported architecture or format
+
+ ### 3. No GGUF file found in repository
+
+ **Error:**
+ ```
+ Failed to load quantized model: No GGUF file found in repository TheBloke/model-name-GGUF. Try specifying a quantization level like Q4_K_M, Q5_K_M, or Q8_0. (RuntimeError)
+ ```
+
+ **Cause:** The automatic GGUF file detection couldn't find a matching file, often due to naming variations.
+
+ **Solution:** Specify the exact GGUF filename:
+ ```ruby
+ # Visit the HuggingFace repository to find the exact filename
+ llm = Candle::LLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGUF",
+                                   device: device,
+                                   gguf_file: "llama-2-7b-chat.Q4_K_M.gguf")
+ ```
+
+ ### 4. Failed to download tokenizer
+
+ **Error:**
+ ```
+ Failed to load quantized model: Failed to download tokenizer: request error: HTTP status client error (404 Not Found)
+ ```
+
+ **Cause:** GGUF repositories often don't include separate tokenizer files since they're embedded in the GGUF format.
+
+ **Solution:** The code now includes fallback tokenizer loading. If you still encounter this error, ensure you're using the latest version of red-candle.
+
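If the fallback still cannot locate a tokenizer, you can map the GGUF repository to a tokenizer explicitly with `register_tokenizer` (shown in the Quantized Model Support section); the repository names below are illustrative:

```ruby
# Point the GGUF repo at the original model's tokenizer before loading.
Candle::LLM.register_tokenizer("TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
                               "mistralai/Mistral-7B-Instruct-v0.2")
llm = Candle::LLM.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
                                  device: device,
                                  gguf_file: "mistral-7b-instruct-v0.2.Q4_K_M.gguf")
```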
+ ### 5. Missing metadata in GGUF file
+
+ **Error:**
+ ```
+ Failed to load GGUF model: cannot find gemma3.attention.head_count in metadata (RuntimeError)
+ ```
+ or
+ ```
+ Failed to load GGUF model: cannot find llama.attention.head_count in metadata (RuntimeError)
+ ```
+
+ **Cause:** Some GGUF files may have been created with older conversion tools that don't include all required metadata fields.
+
+ **Solution:**
+ - Try a different GGUF file from the same model
+ - Look for GGUF files from TheBloke or other reputable sources
+ - Check if a newer version of the GGUF file is available
+ - Some Gemma GGUF files may not be compatible with the current loader
+
+ **Known compatibility issues:**
+ - `lmstudio-ai/gemma-2b-it-GGUF` - Missing required metadata fields
+ - Gemma 3 GGUF files may require specific tokenizers that are not publicly available
+ - For best compatibility, use Llama or Mistral GGUF files from TheBloke
+
  ## Development

  FORK IT!
data/Rakefile CHANGED
@@ -8,7 +8,7 @@ task default: :test
  Rake::TestTask.new do |t|
    t.deps << :compile
    t.libs << "test"
-   t.test_files = FileList["test/**/*_test.rb"]
+   t.test_files = FileList["test/**/*_test.rb"].exclude("test/benchmarks/**/*_test.rb")
  end

  spec = Bundler.load_gemspec("candle.gemspec")
@@ -36,7 +36,6 @@ end

  desc "Run benchmark tests"
  Rake::TestTask.new("test:benchmark") do |t|
-   ENV['CANDLE_RUN_BENCHMARKS'] = 'true'
    t.deps << :compile
    t.libs << "test"
    t.test_files = FileList["test/benchmarks/**/*_test.rb"]
@@ -59,7 +58,6 @@ end

  desc "Run benchmarks with device tests"
  task "test:device:benchmark" => :compile do
-   ENV['CANDLE_RUN_BENCHMARKS'] = 'true'
    ENV['CANDLE_TEST_VERBOSE'] = 'true'
    Rake::Task["test:device"].invoke
    Rake::Task["test:benchmark"].invoke
data/ext/candle/src/lib.rs CHANGED
@@ -1,11 +1,13 @@
  use magnus::{function, prelude::*, Ruby};

  use crate::ruby::candle_utils;
- use crate::ruby::Result as RbResult;
+ use crate::ruby::Result;

  pub mod llm;
+ pub mod ner;
  pub mod reranker;
  pub mod ruby;
+ pub mod tokenizer;

  // Configuration detection from build.rs
  #[cfg(all(has_metal, not(force_cpu)))]
@@ -33,7 +35,7 @@ pub fn get_build_info() -> magnus::RHash {
  }

  #[magnus::init]
- fn init(ruby: &Ruby) -> RbResult<()> {
+ fn init(ruby: &Ruby) -> Result<()> {
      let rb_candle = ruby.define_module("Candle")?;

      // Export build info
@@ -41,11 +43,12 @@ fn init(ruby: &Ruby) -> RbResult<()> {

      ruby::init_embedding_model(rb_candle)?;
      ruby::init_llm(rb_candle)?;
+     ner::init(rb_candle)?;
      reranker::init(rb_candle)?;
      ruby::dtype::init(rb_candle)?;
-     ruby::qtensor::init(rb_candle)?;
      ruby::device::init(rb_candle)?;
      ruby::tensor::init(rb_candle)?;
+     ruby::tokenizer::init(rb_candle)?;
      candle_utils(rb_candle)?;

      Ok(())