fine 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. checksums.yaml +4 -4
  2. data/README.md +20 -10
  3. data/docs/examples/image-classification-shapes.md +83 -0
  4. data/docs/examples/text-embeddings-faq.md +98 -0
  5. data/docs/quickstart.md +209 -0
  6. data/docs/tutorials/lora-tool-calling.md +306 -0
  7. data/examples/data/generate_tool_data.rb +261 -0
  8. data/examples/data/ollama_tool_calls.jsonl +40 -0
  9. data/examples/data/sentiment_reviews.jsonl +30 -0
  10. data/examples/data/shapes/circle/circle_1.jpg +0 -0
  11. data/examples/data/shapes/circle/circle_10.jpg +0 -0
  12. data/examples/data/shapes/circle/circle_2.jpg +0 -0
  13. data/examples/data/shapes/circle/circle_3.jpg +0 -0
  14. data/examples/data/shapes/circle/circle_4.jpg +0 -0
  15. data/examples/data/shapes/circle/circle_5.jpg +0 -0
  16. data/examples/data/shapes/circle/circle_6.jpg +0 -0
  17. data/examples/data/shapes/circle/circle_7.jpg +0 -0
  18. data/examples/data/shapes/circle/circle_8.jpg +0 -0
  19. data/examples/data/shapes/circle/circle_9.jpg +0 -0
  20. data/examples/data/shapes/square/square_1.jpg +0 -0
  21. data/examples/data/shapes/square/square_10.jpg +0 -0
  22. data/examples/data/shapes/square/square_2.jpg +0 -0
  23. data/examples/data/shapes/square/square_3.jpg +0 -0
  24. data/examples/data/shapes/square/square_4.jpg +0 -0
  25. data/examples/data/shapes/square/square_5.jpg +0 -0
  26. data/examples/data/shapes/square/square_6.jpg +0 -0
  27. data/examples/data/shapes/square/square_7.jpg +0 -0
  28. data/examples/data/shapes/square/square_8.jpg +0 -0
  29. data/examples/data/shapes/square/square_9.jpg +0 -0
  30. data/examples/data/shapes/triangle/triangle_1.jpg +0 -0
  31. data/examples/data/shapes/triangle/triangle_10.jpg +0 -0
  32. data/examples/data/shapes/triangle/triangle_2.jpg +0 -0
  33. data/examples/data/shapes/triangle/triangle_3.jpg +0 -0
  34. data/examples/data/shapes/triangle/triangle_4.jpg +0 -0
  35. data/examples/data/shapes/triangle/triangle_5.jpg +0 -0
  36. data/examples/data/shapes/triangle/triangle_6.jpg +0 -0
  37. data/examples/data/shapes/triangle/triangle_7.jpg +0 -0
  38. data/examples/data/shapes/triangle/triangle_8.jpg +0 -0
  39. data/examples/data/shapes/triangle/triangle_9.jpg +0 -0
  40. data/examples/data/support_faq_pairs.jsonl +30 -0
  41. data/examples/generate_shape_images.rb +94 -0
  42. data/examples/sentiment_classification.rb +87 -0
  43. data/examples/shape_classification.rb +87 -0
  44. data/examples/support_faq_embeddings.rb +105 -0
  45. data/examples/train_lora_tools.rb +218 -0
  46. data/lib/fine/configuration.rb +173 -15
  47. data/lib/fine/datasets/image_dataset.rb +14 -2
  48. data/lib/fine/datasets/instruction_dataset.rb +17 -2
  49. data/lib/fine/datasets/text_dataset.rb +15 -5
  50. data/lib/fine/hub/config_loader.rb +4 -4
  51. data/lib/fine/hub/safetensors_loader.rb +3 -2
  52. data/lib/fine/llm.rb +39 -10
  53. data/lib/fine/lora.rb +214 -0
  54. data/lib/fine/models/bert_encoder.rb +15 -6
  55. data/lib/fine/models/bert_for_sequence_classification.rb +35 -4
  56. data/lib/fine/models/causal_lm.rb +46 -5
  57. data/lib/fine/models/gemma3_decoder.rb +25 -6
  58. data/lib/fine/models/llama_decoder.rb +9 -8
  59. data/lib/fine/models/sentence_transformer.rb +1 -1
  60. data/lib/fine/tokenizers/auto_tokenizer.rb +15 -0
  61. data/lib/fine/training/text_trainer.rb +3 -1
  62. data/lib/fine/validators.rb +304 -0
  63. data/lib/fine/version.rb +1 -1
  64. data/lib/fine.rb +4 -0
  65. metadata +47 -2
@@ -0,0 +1,304 @@
1
+ # frozen_string_literal: true
2
+
3
module Fine
  # Data validation with helpful error messages.
  #
  # Each +validate_*!+ method raises ValidationError on the first problem it
  # finds. Validators.check wraps those methods and reports the outcome as a
  # hash instead of raising.
  module Validators
    # Raised when training data does not match the expected layout.
    #
    # NOTE(review): `Error` is resolved lexically and is presumably Fine::Error
    # from fine/error.rb; it is not defined in this file.
    class ValidationError < Error
      attr_reader :line_number, :expected_format

      # @param message [String] description of the problem
      # @param line_number [Integer, nil] 1-based line of the offending record
      # @param expected_format [String, nil] example of valid input; appended
      #   to the message when given
      def initialize(message, line_number: nil, expected_format: nil)
        @line_number = line_number
        @expected_format = expected_format
        super(build_message(message))
      end

      private

      # Joins the message, the optional line number and the optional format
      # example into the final exception message.
      def build_message(message)
        parts = [message]
        parts << "Line #{@line_number}" if @line_number
        parts << "\nExpected format:\n#{@expected_format}" if @expected_format
        parts.join(" ")
      end
    end

    # Format examples for error messages.
    #
    # These live on the module itself (not inside `class << self`) so they are
    # reachable as Fine::Validators::NAME; the singleton methods below still
    # resolve them through lexical scope.
    TEXT_CLASSIFICATION_FORMAT = <<~FORMAT
      {"text": "This product is great!", "label": "positive"}
      {"text": "Terrible experience", "label": "negative"}
    FORMAT

    TEXT_PAIRS_FORMAT = <<~FORMAT
      {"text_a": "How do I reset my password?", "text_b": "Click forgot password on login page"}

      Alternative field names: query/positive, anchor/positive, sentence1/sentence2
    FORMAT

    ALPACA_FORMAT = <<~FORMAT
      {"instruction": "Summarize this text", "input": "Long text here...", "output": "Summary here"}
      {"instruction": "Translate to French", "output": "Bonjour"}
    FORMAT

    # NOTE(review): nesting indentation in the two tree-shaped examples below
    # (SHAREGPT_FORMAT, IMAGE_DIRECTORY_FORMAT) was reconstructed; confirm
    # against the released file.
    SHAREGPT_FORMAT = <<~FORMAT
      {"conversations": [
        {"from": "human", "value": "Hello"},
        {"from": "assistant", "value": "Hi there!"}
      ]}
    FORMAT

    SIMPLE_FORMAT = <<~FORMAT
      {"prompt": "Question here", "completion": "Answer here"}
      {"text": "Full text for language modeling"}
    FORMAT

    INSTRUCTION_FORMATS = <<~FORMAT
      Alpaca: {"instruction": "...", "output": "..."}
      ShareGPT: {"conversations": [{"from": "human", "value": "..."}, ...]}
      Simple: {"prompt": "...", "completion": "..."}
    FORMAT

    IMAGE_DIRECTORY_FORMAT = <<~FORMAT
      data/
        cats/
          cat1.jpg
          cat2.jpg
        dogs/
          dog1.jpg
          dog2.jpg
    FORMAT

    # Accepted field names for the first / second member of a text pair.
    TEXT_A_KEYS = %i[text_a anchor sentence1 query].freeze
    TEXT_B_KEYS = %i[text_b positive sentence2].freeze

    # File-name pattern used to recognise images inside a class directory.
    IMAGE_GLOB = "*.{jpg,jpeg,png,gif,webp}"

    class << self
      # Validate text classification data.
      #
      # Every non-blank line must be a JSON object with `text` and `label` keys.
      #
      # @param path [String] Path to JSONL file
      # @raise [ValidationError] if validation fails
      def validate_text_classification!(path)
        validate_file_exists!(path)
        validate_jsonl!(path) do |data, line_num|
          # Records are parsed with symbolized keys, so only symbols can occur.
          %i[text label].each do |field|
            next if data.key?(field)

            raise ValidationError.new(
              "Missing '#{field}' field",
              line_number: line_num,
              expected_format: TEXT_CLASSIFICATION_FORMAT
            )
          end
        end
      end

      # Validate text pairs for embedding training.
      #
      # Every non-blank line must carry one key from TEXT_A_KEYS and one key
      # from TEXT_B_KEYS.
      #
      # @param path [String] Path to JSONL file
      # @raise [ValidationError] if validation fails
      def validate_text_pairs!(path)
        validate_file_exists!(path)
        validate_jsonl!(path) do |data, line_num|
          has_text_a = TEXT_A_KEYS.any? { |key| data.key?(key) }
          has_text_b = TEXT_B_KEYS.any? { |key| data.key?(key) }
          next if has_text_a && has_text_b

          raise ValidationError.new(
            "Missing text pair fields",
            line_number: line_num,
            expected_format: TEXT_PAIRS_FORMAT
          )
        end
      end

      # Validate instruction data for LLM fine-tuning.
      #
      # With `format: :auto` the format is detected from the first non-blank
      # record and then enforced on every record. A format other than
      # :alpaca, :sharegpt or :simple only gets the JSON-object check.
      #
      # @param path [String] Path to JSONL file
      # @param format [Symbol] Expected format (:alpaca, :sharegpt, :simple, :auto)
      # @return [Symbol] the format that was validated against
      # @raise [ValidationError] if validation fails, or if the format cannot
      #   be detected because the file holds no records
      def validate_instructions!(path, format: :auto)
        validate_file_exists!(path)

        auto_detect = format == :auto
        detected_format = auto_detect ? nil : format

        # Single pass: detection happens on the first record that
        # validate_jsonl! yields, so leading blank lines and a malformed first
        # line are reported as ValidationError rather than a raw JSON error.
        validate_jsonl!(path) do |data, line_num|
          if auto_detect && detected_format.nil?
            detected_format = detect_instruction_format(data)
          end

          case detected_format
          when :alpaca
            validate_alpaca_format!(data, line_num)
          when :sharegpt
            validate_sharegpt_format!(data, line_num)
          when :simple
            validate_simple_format!(data, line_num)
          end
        end

        if auto_detect && detected_format.nil?
          # The file is non-empty but contains only blank lines.
          raise ValidationError.new(
            "No JSON records found in #{path}",
            expected_format: INSTRUCTION_FORMATS
          )
        end

        detected_format
      end

      # Validate image directory structure.
      #
      # The directory must contain at least one non-hidden subdirectory, and
      # each of those must contain at least one file matching IMAGE_GLOB.
      #
      # @param path [String] Path to directory
      # @return [Array<String>] names of the class subdirectories
      # @raise [ValidationError] if validation fails
      def validate_image_directory!(path)
        unless File.directory?(path)
          raise ValidationError.new(
            "Directory not found: #{path}",
            expected_format: IMAGE_DIRECTORY_FORMAT
          )
        end

        subdirs = Dir.children(path)
                     .reject { |entry| entry.start_with?(".") }
                     .select { |entry| File.directory?(File.join(path, entry)) }

        if subdirs.empty?
          raise ValidationError.new(
            "No class subdirectories found in #{path}",
            expected_format: IMAGE_DIRECTORY_FORMAT
          )
        end

        # Check each subdirectory has images
        subdirs.each do |subdir|
          subdir_path = File.join(path, subdir)
          # `base:` keeps the directory name out of the pattern, so names
          # containing glob metacharacters ([, {, *, ?) are matched literally.
          next unless Dir.glob(IMAGE_GLOB, base: subdir_path).empty?

          raise ValidationError.new(
            "No images found in class directory: #{subdir_path}",
            expected_format: IMAGE_DIRECTORY_FORMAT
          )
        end

        subdirs
      end

      # Quick check if file looks valid (non-blocking, for warnings).
      #
      # Never raises StandardError: failures are reported through the result.
      # An unrecognised `type` performs no validation; `valid` stays true but
      # a warning is recorded so the gap is visible to the caller.
      #
      # @param path [String] Path to file or directory
      # @param type [Symbol] Type of data (:text_classification, :text_pairs,
      #   :instructions, :image_directory)
      # @return [Hash] { valid: true/false, warnings: [...], line_count: N }
      def check(path, type:)
        result = { valid: true, warnings: [], line_count: 0 }

        begin
          case type
          when :text_classification
            validate_text_classification!(path)
          when :text_pairs
            validate_text_pairs!(path)
          when :instructions
            validate_instructions!(path)
          when :image_directory
            validate_image_directory!(path)
          else
            result[:warnings] << "Unknown data type #{type.inspect}; no validation performed"
          end

          # Stream the file instead of loading every line into memory.
          result[:line_count] = File.foreach(path).count if File.file?(path)
        rescue ValidationError => e
          result[:valid] = false
          result[:warnings] << e.message
        rescue StandardError => e
          result[:valid] = false
          result[:warnings] << "Unexpected error: #{e.message}"
        end

        result
      end

      private

      # Raises unless `path` exists and is non-empty.
      def validate_file_exists!(path)
        raise ValidationError.new("File not found: #{path}") unless File.exist?(path)
        raise ValidationError.new("File is empty: #{path}") if File.empty?(path)
      end

      # Parses `path` line by line, skipping blank lines, and yields each
      # record as a Hash with symbolized keys together with its 1-based line
      # number.
      #
      # @raise [ValidationError] on malformed JSON or on a line whose JSON
      #   value is not an object
      def validate_jsonl!(path)
        File.foreach(path).with_index(1) do |line, line_num|
          next if line.strip.empty?

          begin
            data = JSON.parse(line, symbolize_names: true)
          rescue JSON::ParserError => e
            raise ValidationError.new(
              "Invalid JSON: #{e.message}",
              line_number: line_num
            )
          end

          # Arrays, strings and numbers are valid JSON but have no fields;
          # reject them here so callers can rely on Hash#key?.
          unless data.is_a?(Hash)
            raise ValidationError.new(
              "Expected a JSON object, got #{data.class}",
              line_number: line_num
            )
          end

          yield(data, line_num) if block_given?
        end
      end

      # Picks the instruction format from the keys of one record.
      #
      # @return [Symbol] :alpaca, :sharegpt or :simple
      # @raise [ValidationError] if no known key is present
      def detect_instruction_format(data)
        if data.key?(:instruction)
          :alpaca
        elsif data.key?(:conversations)
          :sharegpt
        elsif data.key?(:prompt) || data.key?(:text)
          :simple
        else
          raise ValidationError.new(
            "Cannot detect instruction format",
            expected_format: INSTRUCTION_FORMATS
          )
        end
      end

      # Alpaca records need `instruction` plus either `output` or `response`.
      def validate_alpaca_format!(data, line_num)
        unless data.key?(:instruction)
          raise ValidationError.new(
            "Missing 'instruction' field for Alpaca format",
            line_number: line_num,
            expected_format: ALPACA_FORMAT
          )
        end
        return if data.key?(:output) || data.key?(:response)

        raise ValidationError.new(
          "Missing 'output' or 'response' field for Alpaca format",
          line_number: line_num,
          expected_format: ALPACA_FORMAT
        )
      end

      # ShareGPT records need a `conversations` key holding an array.
      def validate_sharegpt_format!(data, line_num)
        unless data.key?(:conversations)
          raise ValidationError.new(
            "Missing 'conversations' field for ShareGPT format",
            line_number: line_num,
            expected_format: SHAREGPT_FORMAT
          )
        end
        return if data[:conversations].is_a?(Array)

        raise ValidationError.new(
          "'conversations' must be an array",
          line_number: line_num,
          expected_format: SHAREGPT_FORMAT
        )
      end

      # Simple records need either a `prompt` or a `text` key.
      def validate_simple_format!(data, line_num)
        return if data.key?(:prompt) || data.key?(:text)

        raise ValidationError.new(
          "Missing 'prompt' or 'text' field for simple format",
          line_number: line_num,
          expected_format: SIMPLE_FORMAT
        )
      end
    end
  end
end
data/lib/fine/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Fine
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
data/lib/fine.rb CHANGED
@@ -12,6 +12,7 @@ require "fileutils"
12
12
  require_relative "fine/version"
13
13
  require_relative "fine/error"
14
14
  require_relative "fine/configuration"
15
+ require_relative "fine/validators"
15
16
 
16
17
  # Hub
17
18
  require_relative "fine/hub/config_loader"
@@ -68,6 +69,9 @@ require_relative "fine/llm"
68
69
  # Export
69
70
  require_relative "fine/export"
70
71
 
72
+ # LoRA
73
+ require_relative "fine/lora"
74
+
71
75
  module Fine
72
76
  class << self
73
77
  attr_accessor :configuration
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fine
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Hasinski
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-01-26 00:00:00.000000000 Z
11
+ date: 2026-01-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: torch-rb
@@ -151,8 +151,12 @@ files:
151
151
  - LICENSE
152
152
  - README.md
153
153
  - Rakefile
154
+ - docs/examples/image-classification-shapes.md
155
+ - docs/examples/text-embeddings-faq.md
154
156
  - docs/installation.md
157
+ - docs/quickstart.md
155
158
  - docs/tutorials/llm-fine-tuning.md
159
+ - docs/tutorials/lora-tool-calling.md
156
160
  - docs/tutorials/model-export.md
157
161
  - docs/tutorials/siglip2-image-classification.md
158
162
  - docs/tutorials/siglip2-object-recognition.md
@@ -160,17 +164,56 @@ files:
160
164
  - docs/tutorials/text-classification.md
161
165
  - docs/tutorials/text-embeddings.md
162
166
  - examples/basic_classification.rb
167
+ - examples/data/generate_tool_data.rb
168
+ - examples/data/ollama_tool_calls.jsonl
169
+ - examples/data/sentiment_reviews.jsonl
170
+ - examples/data/shapes/circle/circle_1.jpg
171
+ - examples/data/shapes/circle/circle_10.jpg
172
+ - examples/data/shapes/circle/circle_2.jpg
173
+ - examples/data/shapes/circle/circle_3.jpg
174
+ - examples/data/shapes/circle/circle_4.jpg
175
+ - examples/data/shapes/circle/circle_5.jpg
176
+ - examples/data/shapes/circle/circle_6.jpg
177
+ - examples/data/shapes/circle/circle_7.jpg
178
+ - examples/data/shapes/circle/circle_8.jpg
179
+ - examples/data/shapes/circle/circle_9.jpg
180
+ - examples/data/shapes/square/square_1.jpg
181
+ - examples/data/shapes/square/square_10.jpg
182
+ - examples/data/shapes/square/square_2.jpg
183
+ - examples/data/shapes/square/square_3.jpg
184
+ - examples/data/shapes/square/square_4.jpg
185
+ - examples/data/shapes/square/square_5.jpg
186
+ - examples/data/shapes/square/square_6.jpg
187
+ - examples/data/shapes/square/square_7.jpg
188
+ - examples/data/shapes/square/square_8.jpg
189
+ - examples/data/shapes/square/square_9.jpg
190
+ - examples/data/shapes/triangle/triangle_1.jpg
191
+ - examples/data/shapes/triangle/triangle_10.jpg
192
+ - examples/data/shapes/triangle/triangle_2.jpg
193
+ - examples/data/shapes/triangle/triangle_3.jpg
194
+ - examples/data/shapes/triangle/triangle_4.jpg
195
+ - examples/data/shapes/triangle/triangle_5.jpg
196
+ - examples/data/shapes/triangle/triangle_6.jpg
197
+ - examples/data/shapes/triangle/triangle_7.jpg
198
+ - examples/data/shapes/triangle/triangle_8.jpg
199
+ - examples/data/shapes/triangle/triangle_9.jpg
200
+ - examples/data/support_faq_pairs.jsonl
163
201
  - examples/data/tool_calls.jsonl
164
202
  - examples/demo_training.rb
165
203
  - examples/finetune_gemma3_tools.rb
204
+ - examples/generate_shape_images.rb
166
205
  - examples/real_llm_test.rb
167
206
  - examples/real_text_classification_test.rb
168
207
  - examples/real_text_embedder_test.rb
169
208
  - examples/real_training_test.rb
209
+ - examples/sentiment_classification.rb
210
+ - examples/shape_classification.rb
211
+ - examples/support_faq_embeddings.rb
170
212
  - examples/test_export.rb
171
213
  - examples/test_image_classifier.rb
172
214
  - examples/test_llm.rb
173
215
  - examples/test_text_classifier.rb
216
+ - examples/train_lora_tools.rb
174
217
  - lib/fine.rb
175
218
  - lib/fine/callbacks/base.rb
176
219
  - lib/fine/callbacks/progress_bar.rb
@@ -189,6 +232,7 @@ files:
189
232
  - lib/fine/hub/safetensors_loader.rb
190
233
  - lib/fine/image_classifier.rb
191
234
  - lib/fine/llm.rb
235
+ - lib/fine/lora.rb
192
236
  - lib/fine/models/base.rb
193
237
  - lib/fine/models/bert_encoder.rb
194
238
  - lib/fine/models/bert_for_sequence_classification.rb
@@ -209,6 +253,7 @@ files:
209
253
  - lib/fine/transforms/normalize.rb
210
254
  - lib/fine/transforms/resize.rb
211
255
  - lib/fine/transforms/to_tensor.rb
256
+ - lib/fine/validators.rb
212
257
  - lib/fine/version.rb
213
258
  - mise.toml
214
259
  homepage: https://github.com/khasinski/fine