fine 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/CHANGELOG.md +38 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +167 -0
- data/LICENSE +21 -0
- data/README.md +212 -0
- data/Rakefile +6 -0
- data/docs/installation.md +151 -0
- data/docs/tutorials/llm-fine-tuning.md +246 -0
- data/docs/tutorials/model-export.md +200 -0
- data/docs/tutorials/siglip2-image-classification.md +130 -0
- data/docs/tutorials/siglip2-object-recognition.md +203 -0
- data/docs/tutorials/siglip2-similarity-search.md +152 -0
- data/docs/tutorials/text-classification.md +233 -0
- data/docs/tutorials/text-embeddings.md +211 -0
- data/examples/basic_classification.rb +70 -0
- data/examples/data/tool_calls.jsonl +30 -0
- data/examples/demo_training.rb +78 -0
- data/examples/finetune_gemma3_tools.rb +135 -0
- data/examples/real_llm_test.rb +128 -0
- data/examples/real_text_classification_test.rb +90 -0
- data/examples/real_text_embedder_test.rb +110 -0
- data/examples/real_training_test.rb +88 -0
- data/examples/test_export.rb +28 -0
- data/examples/test_image_classifier.rb +79 -0
- data/examples/test_llm.rb +100 -0
- data/examples/test_text_classifier.rb +59 -0
- data/lib/fine/callbacks/base.rb +140 -0
- data/lib/fine/callbacks/progress_bar.rb +66 -0
- data/lib/fine/configuration.rb +106 -0
- data/lib/fine/datasets/data_loader.rb +63 -0
- data/lib/fine/datasets/image_dataset.rb +203 -0
- data/lib/fine/datasets/instruction_dataset.rb +226 -0
- data/lib/fine/datasets/text_data_loader.rb +88 -0
- data/lib/fine/datasets/text_dataset.rb +266 -0
- data/lib/fine/error.rb +49 -0
- data/lib/fine/export/gguf_exporter.rb +424 -0
- data/lib/fine/export/onnx_exporter.rb +249 -0
- data/lib/fine/export.rb +53 -0
- data/lib/fine/hub/config_loader.rb +145 -0
- data/lib/fine/hub/model_downloader.rb +136 -0
- data/lib/fine/hub/safetensors_loader.rb +108 -0
- data/lib/fine/image_classifier.rb +256 -0
- data/lib/fine/llm.rb +336 -0
- data/lib/fine/models/base.rb +48 -0
- data/lib/fine/models/bert_encoder.rb +202 -0
- data/lib/fine/models/bert_for_sequence_classification.rb +226 -0
- data/lib/fine/models/causal_lm.rb +279 -0
- data/lib/fine/models/classification_head.rb +24 -0
- data/lib/fine/models/gemma3_decoder.rb +244 -0
- data/lib/fine/models/llama_decoder.rb +297 -0
- data/lib/fine/models/sentence_transformer.rb +202 -0
- data/lib/fine/models/siglip2_for_image_classification.rb +155 -0
- data/lib/fine/models/siglip2_vision_encoder.rb +190 -0
- data/lib/fine/text_classifier.rb +250 -0
- data/lib/fine/text_embedder.rb +221 -0
- data/lib/fine/tokenizers/auto_tokenizer.rb +208 -0
- data/lib/fine/training/llm_trainer.rb +212 -0
- data/lib/fine/training/text_trainer.rb +275 -0
- data/lib/fine/training/trainer.rb +194 -0
- data/lib/fine/transforms/compose.rb +28 -0
- data/lib/fine/transforms/normalize.rb +33 -0
- data/lib/fine/transforms/resize.rb +35 -0
- data/lib/fine/transforms/to_tensor.rb +53 -0
- data/lib/fine/version.rb +3 -0
- data/lib/fine.rb +112 -0
- data/mise.toml +2 -0
- metadata +240 -0
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
# Fine-tuning SigLIP2 for New Object Recognition
|
|
2
|
+
|
|
3
|
+
Teach the model to recognize specific objects, products, or concepts it hasn't seen before.
|
|
4
|
+
|
|
5
|
+
## When to Use This
|
|
6
|
+
|
|
7
|
+
- You want to detect your specific products, logos, or items
|
|
8
|
+
- You need to recognize custom objects not in standard datasets
|
|
9
|
+
- Examples: your product catalog, brand logos, custom equipment, specific animals/plants
|
|
10
|
+
|
|
11
|
+
## How It Works
|
|
12
|
+
|
|
13
|
+
SigLIP2 learns visual concepts from image-text pairs. For object recognition, we fine-tune it to associate your specific objects with labels, enabling it to recognize them in new images.
|
|
14
|
+
|
|
15
|
+
## Dataset Structure
|
|
16
|
+
|
|
17
|
+
Create folders for each object you want to recognize:
|
|
18
|
+
|
|
19
|
+
```
|
|
20
|
+
data/
|
|
21
|
+
train/
|
|
22
|
+
my_product_a/
|
|
23
|
+
product_a_01.jpg
|
|
24
|
+
product_a_02.jpg
|
|
25
|
+
product_a_angle1.jpg
|
|
26
|
+
product_a_lighting2.jpg
|
|
27
|
+
...
|
|
28
|
+
my_product_b/
|
|
29
|
+
product_b_01.jpg
|
|
30
|
+
...
|
|
31
|
+
background/ # Optional: negative examples
|
|
32
|
+
random_scene_01.jpg
|
|
33
|
+
...
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
**Tips for Object Recognition:**
|
|
37
|
+
- Capture objects from multiple angles
|
|
38
|
+
- Vary lighting conditions
|
|
39
|
+
- Include different backgrounds
|
|
40
|
+
- Show objects at different scales
|
|
41
|
+
- 30-100 images per object works well
|
|
42
|
+
|
|
43
|
+
## Training
|
|
44
|
+
|
|
45
|
+
```ruby
|
|
46
|
+
require "fine"
|
|
47
|
+
|
|
48
|
+
# Use the higher-resolution (384px) checkpoint for better object recognition
|
|
49
|
+
classifier = Fine::ImageClassifier.new("google/siglip2-base-patch16-384") do |config|
|
|
50
|
+
config.epochs = 5
|
|
51
|
+
config.batch_size = 16
|
|
52
|
+
config.learning_rate = 1e-4 # Slightly lower for fine-grained recognition
|
|
53
|
+
config.freeze_encoder = false # Full fine-tuning for new concepts
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
classifier.fit(
|
|
57
|
+
train_dir: "data/train",
|
|
58
|
+
val_dir: "data/val"
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
classifier.save("models/product_recognizer")
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Recognizing Objects in New Images
|
|
65
|
+
|
|
66
|
+
```ruby
|
|
67
|
+
recognizer = Fine::ImageClassifier.load("models/product_recognizer")
|
|
68
|
+
|
|
69
|
+
# Check what object is in an image
|
|
70
|
+
results = recognizer.predict("customer_photo.jpg")
|
|
71
|
+
|
|
72
|
+
# Get top prediction
|
|
73
|
+
top = results.first
|
|
74
|
+
if top[:score] > 0.7
|
|
75
|
+
puts "Detected: #{top[:label]} (#{(top[:score] * 100).round}% confident)"
|
|
76
|
+
else
|
|
77
|
+
puts "No confident match found"
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
# See all possibilities
|
|
81
|
+
results.each do |pred|
|
|
82
|
+
puts "#{pred[:label]}: #{(pred[:score] * 100).round}%"
|
|
83
|
+
end
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Including Negative Examples
|
|
87
|
+
|
|
88
|
+
For better precision, include a "background" or "other" class:
|
|
89
|
+
|
|
90
|
+
```
|
|
91
|
+
data/train/
|
|
92
|
+
product_a/
|
|
93
|
+
product_b/
|
|
94
|
+
other/ # Images without your products
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
This helps the model learn what your objects are NOT.
|
|
98
|
+
|
|
99
|
+
## Multi-Object Detection Strategy
|
|
100
|
+
|
|
101
|
+
If images might contain multiple objects, train separate binary classifiers:
|
|
102
|
+
|
|
103
|
+
```ruby
|
|
104
|
+
# Train one model per object type
|
|
105
|
+
products = ["product_a", "product_b", "product_c"]
|
|
106
|
+
|
|
107
|
+
products.each do |product|
|
|
108
|
+
classifier = Fine::ImageClassifier.new("google/siglip2-base-patch16-224")
|
|
109
|
+
|
|
110
|
+
# Binary classification: this_product vs everything_else
|
|
111
|
+
classifier.fit(train_dir: "data/binary/#{product}")
|
|
112
|
+
classifier.save("models/detect_#{product}")
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
# At inference time, run all detectors
|
|
116
|
+
def detect_all_products(products, image_path)
|
|
117
|
+
detected = []
|
|
118
|
+
|
|
119
|
+
products.each do |product|
|
|
120
|
+
detector = Fine::ImageClassifier.load("models/detect_#{product}")
|
|
121
|
+
results = detector.predict(image_path)
|
|
122
|
+
|
|
123
|
+
if results.first[:label] == product && results.first[:score] > 0.8
|
|
124
|
+
detected << product
|
|
125
|
+
end
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
detected
|
|
129
|
+
end
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Best Practices
|
|
133
|
+
|
|
134
|
+
### Image Collection
|
|
135
|
+
|
|
136
|
+
1. **Variety is key**: Same object, different conditions
|
|
137
|
+
2. **Real-world context**: Objects in actual use, not just product shots
|
|
138
|
+
3. **Scale variation**: Close-ups and distant shots
|
|
139
|
+
4. **Partial visibility**: Objects partially obscured (if that's realistic)
|
|
140
|
+
|
|
141
|
+
### Data Augmentation
|
|
142
|
+
|
|
143
|
+
Enable augmentation for more robust recognition:
|
|
144
|
+
|
|
145
|
+
```ruby
|
|
146
|
+
classifier = Fine::ImageClassifier.new("google/siglip2-base-patch16-384") do |config|
|
|
147
|
+
config.augmentation do |aug|
|
|
148
|
+
aug.random_horizontal_flip = true
|
|
149
|
+
aug.random_rotation = 15 # Degrees
|
|
150
|
+
aug.color_jitter = { brightness: 0.2, contrast: 0.2 }
|
|
151
|
+
end
|
|
152
|
+
end
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### Confidence Thresholds
|
|
156
|
+
|
|
157
|
+
Set appropriate thresholds based on your use case:
|
|
158
|
+
|
|
159
|
+
```ruby
|
|
160
|
+
results = recognizer.predict(image)
|
|
161
|
+
confidence = results.first[:score]
|
|
162
|
+
|
|
163
|
+
case
|
|
164
|
+
when confidence > 0.9
|
|
165
|
+
# High confidence - safe to act automatically
|
|
166
|
+
when confidence > 0.7
|
|
167
|
+
# Medium confidence - show to user for confirmation
|
|
168
|
+
else
|
|
169
|
+
# Low confidence - likely not a known object
|
|
170
|
+
end
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## Example: Product Catalog Recognition
|
|
174
|
+
|
|
175
|
+
```ruby
|
|
176
|
+
# Train on your product catalog
|
|
177
|
+
catalog_recognizer = Fine::ImageClassifier.new("google/siglip2-base-patch16-384") do |config|
|
|
178
|
+
config.epochs = 10
|
|
179
|
+
config.batch_size = 8
|
|
180
|
+
config.on_epoch_end do |epoch, metrics|
|
|
181
|
+
puts "Epoch #{epoch + 1}: val_accuracy=#{(metrics[:val_accuracy] * 100).round(1)}%"
|
|
182
|
+
end
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
catalog_recognizer.fit(
|
|
186
|
+
train_dir: "catalog/train",
|
|
187
|
+
val_dir: "catalog/val"
|
|
188
|
+
)
|
|
189
|
+
|
|
190
|
+
catalog_recognizer.save("models/catalog_v1")
|
|
191
|
+
|
|
192
|
+
# Use in production
|
|
193
|
+
def identify_product(photo_path)
|
|
194
|
+
recognizer = Fine::ImageClassifier.load("models/catalog_v1")
|
|
195
|
+
results = recognizer.predict(photo_path, top_k: 3)
|
|
196
|
+
|
|
197
|
+
{
|
|
198
|
+
product_id: results.first[:label],
|
|
199
|
+
confidence: results.first[:score],
|
|
200
|
+
alternatives: results[1..2]
|
|
201
|
+
}
|
|
202
|
+
end
|
|
203
|
+
```
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# Using SigLIP2 for Image Similarity Search
|
|
2
|
+
|
|
3
|
+
Find visually similar images using learned embeddings.
|
|
4
|
+
|
|
5
|
+
## When to Use This
|
|
6
|
+
|
|
7
|
+
- Find similar products in a catalog
|
|
8
|
+
- Detect near-duplicates
|
|
9
|
+
- Build "more like this" features
|
|
10
|
+
- Visual search engines
|
|
11
|
+
|
|
12
|
+
## How It Works
|
|
13
|
+
|
|
14
|
+
Instead of classifying images, we extract the embedding vectors from SigLIP2 and compare them. Similar images have similar embeddings.
|
|
15
|
+
|
|
16
|
+
## Extracting Embeddings
|
|
17
|
+
|
|
18
|
+
```ruby
|
|
19
|
+
require "fine"
|
|
20
|
+
|
|
21
|
+
# Load a pre-trained model (no fine-tuning needed for general similarity)
|
|
22
|
+
# Or load your fine-tuned model for domain-specific similarity
|
|
23
|
+
model = Fine::Models::SigLIP2ForImageClassification.from_pretrained(
|
|
24
|
+
"google/siglip2-base-patch16-224",
|
|
25
|
+
num_labels: 1 # Dummy value, we won't use the classifier
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
# Get the encoder only
|
|
29
|
+
encoder = model.encoder
|
|
30
|
+
|
|
31
|
+
# Prepare image transform
|
|
32
|
+
transforms = Fine::Transforms::Compose.new([
|
|
33
|
+
Fine::Transforms::Resize.new(224),
|
|
34
|
+
Fine::Transforms::ToTensor.new,
|
|
35
|
+
Fine::Transforms::Normalize.new
|
|
36
|
+
])
|
|
37
|
+
|
|
38
|
+
def get_embedding(encoder, transforms, image_path)
|
|
39
|
+
image = Vips::Image.new_from_file(image_path, access: :sequential)
|
|
40
|
+
tensor = transforms.call(image).unsqueeze(0) # Add batch dimension
|
|
41
|
+
|
|
42
|
+
encoder.eval
|
|
43
|
+
Torch.no_grad do
|
|
44
|
+
encoder.call(tensor).squeeze.to_a
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# Extract embeddings
|
|
49
|
+
embedding1 = get_embedding(encoder, transforms, "image1.jpg")
|
|
50
|
+
embedding2 = get_embedding(encoder, transforms, "image2.jpg")
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Computing Similarity
|
|
54
|
+
|
|
55
|
+
```ruby
|
|
56
|
+
def cosine_similarity(a, b)
|
|
57
|
+
dot = a.zip(b).sum { |x, y| x * y }
|
|
58
|
+
norm_a = Math.sqrt(a.sum { |x| x * x })
|
|
59
|
+
norm_b = Math.sqrt(b.sum { |x| x * x })
|
|
60
|
+
dot / (norm_a * norm_b)
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
similarity = cosine_similarity(embedding1, embedding2)
|
|
64
|
+
puts "Similarity: #{(similarity * 100).round(1)}%"
|
|
65
|
+
# > 90% = very similar
|
|
66
|
+
# 70-90% = somewhat similar
|
|
67
|
+
# < 70% = different
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Building a Search Index
|
|
71
|
+
|
|
72
|
+
For searching through many images:
|
|
73
|
+
|
|
74
|
+
```ruby
|
|
75
|
+
class ImageSearchIndex
|
|
76
|
+
def initialize(encoder, transforms)
|
|
77
|
+
@encoder = encoder
|
|
78
|
+
@transforms = transforms
|
|
79
|
+
@embeddings = {}
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
def add(image_path)
|
|
83
|
+
@embeddings[image_path] = get_embedding(image_path)
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
def search(query_path, top_k: 5)
|
|
87
|
+
query_emb = get_embedding(query_path)
|
|
88
|
+
|
|
89
|
+
results = @embeddings.map do |path, emb|
|
|
90
|
+
{ path: path, score: cosine_similarity(query_emb, emb) }
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
results.sort_by { |r| -r[:score] }.first(top_k)
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
private
|
|
97
|
+
|
|
98
|
+
def get_embedding(path)
|
|
99
|
+
image = Vips::Image.new_from_file(path, access: :sequential)
|
|
100
|
+
tensor = @transforms.call(image).unsqueeze(0)
|
|
101
|
+
|
|
102
|
+
@encoder.eval
|
|
103
|
+
Torch.no_grad do
|
|
104
|
+
@encoder.call(tensor).squeeze.to_a
|
|
105
|
+
end
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
def cosine_similarity(a, b)
|
|
109
|
+
dot = a.zip(b).sum { |x, y| x * y }
|
|
110
|
+
norm_a = Math.sqrt(a.sum { |x| x * x })
|
|
111
|
+
norm_b = Math.sqrt(b.sum { |x| x * x })
|
|
112
|
+
dot / (norm_a * norm_b)
|
|
113
|
+
end
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
# Usage
|
|
117
|
+
index = ImageSearchIndex.new(encoder, transforms)
|
|
118
|
+
|
|
119
|
+
# Index your images
|
|
120
|
+
Dir.glob("catalog/*.jpg").each { |path| index.add(path) }
|
|
121
|
+
|
|
122
|
+
# Search
|
|
123
|
+
results = index.search("query_image.jpg", top_k: 10)
|
|
124
|
+
results.each do |result|
|
|
125
|
+
puts "#{result[:path]}: #{(result[:score] * 100).round}% similar"
|
|
126
|
+
end
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## Domain-Specific Similarity
|
|
130
|
+
|
|
131
|
+
For better results on your specific domain, fine-tune first:
|
|
132
|
+
|
|
133
|
+
```ruby
|
|
134
|
+
# Fine-tune on your domain
|
|
135
|
+
classifier = Fine::ImageClassifier.new("google/siglip2-base-patch16-224")
|
|
136
|
+
classifier.fit(train_dir: "my_domain/train", epochs: 3)
|
|
137
|
+
classifier.save("models/my_domain")
|
|
138
|
+
|
|
139
|
+
# Load the encoder from fine-tuned model
|
|
140
|
+
model = Fine::Models::SigLIP2ForImageClassification.load("models/my_domain")
|
|
141
|
+
encoder = model.encoder
|
|
142
|
+
|
|
143
|
+
# Now use this encoder for similarity search
|
|
144
|
+
# It will be better at distinguishing items in your domain
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Performance Tips
|
|
148
|
+
|
|
149
|
+
For large catalogs (10k+ images):
|
|
150
|
+
- Pre-compute and cache embeddings
|
|
151
|
+
- Use approximate nearest neighbor libraries (e.g., Annoy, Faiss via FFI)
|
|
152
|
+
- Batch embedding computation for faster indexing
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
# Fine-tuning Text Classification Models
|
|
2
|
+
|
|
3
|
+
Classify text into categories—sentiment, spam, intent, topics.
|
|
4
|
+
|
|
5
|
+
## When to Use This
|
|
6
|
+
|
|
7
|
+
- Sentiment analysis (positive/negative/neutral reviews)
|
|
8
|
+
- Spam detection
|
|
9
|
+
- Intent classification for chatbots
|
|
10
|
+
- Support ticket routing
|
|
11
|
+
- Content moderation
|
|
12
|
+
- Topic categorization
|
|
13
|
+
|
|
14
|
+
## Supported Models
|
|
15
|
+
|
|
16
|
+
| Model | Parameters | Speed | Quality |
|
|
17
|
+
|-------|------------|-------|---------|
|
|
18
|
+
| `distilbert-base-uncased` | 66M | Fast | Good |
|
|
19
|
+
| `bert-base-uncased` | 110M | Medium | Better |
|
|
20
|
+
| `roberta-base` | 125M | Medium | Better |
|
|
21
|
+
| `microsoft/deberta-v3-small` | 44M | Fast | Great |
|
|
22
|
+
| `microsoft/deberta-v3-base` | 86M | Medium | Best |
|
|
23
|
+
|
|
24
|
+
## Dataset Format
|
|
25
|
+
|
|
26
|
+
JSONL file with `text` and `label` fields:
|
|
27
|
+
|
|
28
|
+
```
|
|
29
|
+
data/train.jsonl
|
|
30
|
+
{"text": "This product exceeded my expectations!", "label": "positive"}
|
|
31
|
+
{"text": "Terrible quality, broke after one day", "label": "negative"}
|
|
32
|
+
{"text": "It's okay, nothing special", "label": "neutral"}
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Or CSV:
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
data/train.csv
|
|
39
|
+
text,label
|
|
40
|
+
"This product exceeded my expectations!",positive
|
|
41
|
+
"Terrible quality, broke after one day",negative
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Basic Training
|
|
45
|
+
|
|
46
|
+
```ruby
|
|
47
|
+
require "fine"
|
|
48
|
+
|
|
49
|
+
classifier = Fine::TextClassifier.new("distilbert-base-uncased")
|
|
50
|
+
|
|
51
|
+
classifier.fit(
|
|
52
|
+
train_file: "data/train.jsonl",
|
|
53
|
+
val_file: "data/val.jsonl",
|
|
54
|
+
epochs: 3
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
classifier.save("models/sentiment")
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Making Predictions
|
|
61
|
+
|
|
62
|
+
```ruby
|
|
63
|
+
classifier = Fine::TextClassifier.load("models/sentiment")
|
|
64
|
+
|
|
65
|
+
# Single text
|
|
66
|
+
result = classifier.predict("I love this product!")
|
|
67
|
+
puts result.first[:label] # => "positive"
|
|
68
|
+
puts result.first[:score] # => 0.97
|
|
69
|
+
|
|
70
|
+
# Batch prediction
|
|
71
|
+
results = classifier.predict([
|
|
72
|
+
"Great service, highly recommend",
|
|
73
|
+
"Worst purchase ever",
|
|
74
|
+
"It works as described"
|
|
75
|
+
])
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Configuration
|
|
79
|
+
|
|
80
|
+
```ruby
|
|
81
|
+
classifier = Fine::TextClassifier.new("microsoft/deberta-v3-small") do |config|
|
|
82
|
+
config.epochs = 5
|
|
83
|
+
config.batch_size = 16
|
|
84
|
+
config.learning_rate = 2e-5 # Lower than vision models
|
|
85
|
+
config.max_length = 256 # Max tokens per text
|
|
86
|
+
config.warmup_ratio = 0.1 # 10% of steps for warmup
|
|
87
|
+
|
|
88
|
+
config.on_epoch_end do |epoch, metrics|
|
|
89
|
+
puts "Epoch #{epoch}: val_accuracy=#{(metrics[:val_accuracy] * 100).round(1)}%"
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Use Cases
|
|
95
|
+
|
|
96
|
+
### Sentiment Analysis
|
|
97
|
+
|
|
98
|
+
```ruby
|
|
99
|
+
# Train on product reviews
|
|
100
|
+
classifier = Fine::TextClassifier.new("distilbert-base-uncased")
|
|
101
|
+
classifier.fit(train_file: "reviews.jsonl", epochs: 3)
|
|
102
|
+
|
|
103
|
+
# Analyze new reviews
|
|
104
|
+
def analyze_sentiment(classifier, review_text)
|
|
105
|
+
result = classifier.predict(review_text).first
|
|
106
|
+
{
|
|
107
|
+
sentiment: result[:label],
|
|
108
|
+
confidence: result[:score],
|
|
109
|
+
needs_attention: result[:label] == "negative" && result[:score] > 0.8
|
|
110
|
+
}
|
|
111
|
+
end
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Spam Detection
|
|
115
|
+
|
|
116
|
+
```ruby
|
|
117
|
+
classifier = Fine::TextClassifier.new("distilbert-base-uncased")
|
|
118
|
+
classifier.fit(train_file: "spam_ham.jsonl", epochs: 3)
|
|
119
|
+
|
|
120
|
+
def is_spam?(classifier, message)
|
|
121
|
+
result = classifier.predict(message).first
|
|
122
|
+
result[:label] == "spam" && result[:score] > 0.7
|
|
123
|
+
end
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### Intent Classification
|
|
127
|
+
|
|
128
|
+
```ruby
|
|
129
|
+
# For chatbot / support routing
|
|
130
|
+
# Labels: billing, technical, shipping, general, cancel
|
|
131
|
+
|
|
132
|
+
classifier = Fine::TextClassifier.new("microsoft/deberta-v3-small")
|
|
133
|
+
classifier.fit(train_file: "support_intents.jsonl", epochs: 5)
|
|
134
|
+
|
|
135
|
+
def route_ticket(classifier, message)
|
|
136
|
+
result = classifier.predict(message).first
|
|
137
|
+
|
|
138
|
+
case result[:label]
|
|
139
|
+
when "billing"
|
|
140
|
+
assign_to_billing_team(message)
|
|
141
|
+
when "cancel"
|
|
142
|
+
assign_to_retention_team(message)
|
|
143
|
+
when "technical"
|
|
144
|
+
assign_to_tech_support(message)
|
|
145
|
+
else
|
|
146
|
+
assign_to_general_queue(message)
|
|
147
|
+
end
|
|
148
|
+
end
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
### Multi-label Classification
|
|
152
|
+
|
|
153
|
+
For texts that can have multiple labels:
|
|
154
|
+
|
|
155
|
+
```ruby
|
|
156
|
+
classifier = Fine::TextClassifier.new("distilbert-base-uncased") do |config|
|
|
157
|
+
config.multi_label = true
|
|
158
|
+
config.threshold = 0.5 # Predict labels above this confidence
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
# Data format for multi-label
|
|
162
|
+
# {"text": "Server crashed and lost data", "labels": ["technical", "urgent", "data_loss"]}
|
|
163
|
+
|
|
164
|
+
classifier.fit(train_file: "tickets_multilabel.jsonl")
|
|
165
|
+
|
|
166
|
+
result = classifier.predict("Payment failed and I can't login")
|
|
167
|
+
# => [{ label: "billing", score: 0.89 }, { label: "technical", score: 0.72 }]
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## Data Preparation Tips
|
|
171
|
+
|
|
172
|
+
### Minimum Data
|
|
173
|
+
|
|
174
|
+
- 100+ examples per class for decent results
|
|
175
|
+
- 500+ examples per class for good results
|
|
176
|
+
- Balance classes or use class weights
|
|
177
|
+
|
|
178
|
+
### Class Imbalance
|
|
179
|
+
|
|
180
|
+
```ruby
|
|
181
|
+
classifier = Fine::TextClassifier.new("distilbert-base-uncased") do |config|
|
|
182
|
+
config.class_weights = :balanced # Auto-compute from data
|
|
183
|
+
# Or manually: config.class_weights = { "positive" => 1.0, "negative" => 2.5 }
|
|
184
|
+
end
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
### Text Preprocessing
|
|
188
|
+
|
|
189
|
+
The tokenizer handles most preprocessing, but you might want to:
|
|
190
|
+
|
|
191
|
+
```ruby
|
|
192
|
+
def clean_text(text)
|
|
193
|
+
text
|
|
194
|
+
.gsub(/<[^>]+>/, ' ') # Remove HTML
|
|
195
|
+
.gsub(/https?:\S+/, '') # Remove URLs
|
|
196
|
+
.gsub(/\s+/, ' ') # Normalize whitespace
|
|
197
|
+
.strip
|
|
198
|
+
end
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## Evaluation
|
|
202
|
+
|
|
203
|
+
```ruby
|
|
204
|
+
# Load test set
|
|
205
|
+
test_data = File.readlines("data/test.jsonl").map { |l| JSON.parse(l) }
|
|
206
|
+
|
|
207
|
+
predictions = classifier.predict(test_data.map { |d| d["text"] })
|
|
208
|
+
actuals = test_data.map { |d| d["label"] }
|
|
209
|
+
|
|
210
|
+
# Calculate accuracy
|
|
211
|
+
correct = predictions.zip(actuals).count { |pred, actual| pred.first[:label] == actual }
|
|
212
|
+
accuracy = correct.to_f / predictions.size
|
|
213
|
+
puts "Test accuracy: #{(accuracy * 100).round(1)}%"
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
## Troubleshooting
|
|
217
|
+
|
|
218
|
+
**Low accuracy:**
|
|
219
|
+
- Add more training data
|
|
220
|
+
- Check for mislabeled examples
|
|
221
|
+
- Try a larger model (deberta-v3-base)
|
|
222
|
+
- Increase epochs
|
|
223
|
+
|
|
224
|
+
**Overfitting:**
|
|
225
|
+
- Add more data
|
|
226
|
+
- Increase dropout in the config
|
|
227
|
+
- Reduce epochs
|
|
228
|
+
- Use a smaller model
|
|
229
|
+
|
|
230
|
+
**Slow training:**
|
|
231
|
+
- Reduce `max_length`
|
|
232
|
+
- Use a smaller model (distilbert)
|
|
233
|
+
- Reduce batch size if memory-constrained
|