kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82):
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,434 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::Config::Extraction do
4
# Constructor behaviour: default values, explicit boolean flags, nested config
# objects or hashes, and integer coercion of max_concurrent_extractions.
describe '#initialize' do
  it 'creates config with default values' do
    defaults = described_class.new

    expect(defaults.use_cache).to be(true)
    expect(defaults.enable_quality_processing).to be(true)
    expect(defaults.force_ocr).to be(false)

    # Each optional section below reads back as nil on a bare config.
    %i[
      ocr chunking language_detection pdf_options image_extraction
      postprocessor token_reduction keywords html_options pages
      max_concurrent_extractions
    ].each do |reader|
      expect(defaults.public_send(reader)).to be_nil
    end
  end

  it 'creates config with custom boolean values' do
    flags = { use_cache: false, enable_quality_processing: true, force_ocr: true }
    built = described_class.new(**flags)

    expect(built.use_cache).to be(false)
    expect(built.enable_quality_processing).to be(true)
    expect(built.force_ocr).to be(true)
  end

  it 'accepts all nested config instances' do
    sections = {
      ocr: Kreuzberg::Config::OCR.new(backend: 'tesseract'),
      chunking: Kreuzberg::Config::Chunking.new(max_chars: 500),
      language_detection: Kreuzberg::Config::LanguageDetection.new(enabled: true)
    }

    built = described_class.new(**sections)

    # Identity check: the very same objects must be stored, not copies.
    expect(built.ocr).to be(sections[:ocr])
    expect(built.chunking).to be(sections[:chunking])
    expect(built.language_detection).to be(sections[:language_detection])
  end

  it 'converts nested config hashes to instances' do
    built = described_class.new(
      ocr: { backend: 'easyocr', language: 'fra' },
      chunking: { max_chars: 750 }
    )

    expect(built.ocr).to be_a(Kreuzberg::Config::OCR)
    expect(built.ocr.backend).to eq('easyocr')
    expect(built.chunking).to be_a(Kreuzberg::Config::Chunking)
    expect(built.chunking.max_chars).to eq(750)
  end

  it 'converts max_concurrent_extractions to integer' do
    built = described_class.new(max_concurrent_extractions: '4')

    expect(built.max_concurrent_extractions).to eq(4)
    expect(built.max_concurrent_extractions).to be_a(Integer)
  end
end
71
+
72
# Hash serialization: the three top-level flags are present on a bare config,
# while the :ocr and :chunking keys are absent when those sections are unset.
describe '#to_h' do
  it 'serializes to hash' do
    serialized = described_class.new(use_cache: true).to_h

    expect(serialized).to be_a(Hash)
    expect(serialized[:use_cache]).to be(true)
  end

  it 'includes all nested configs in hash' do
    serialized = described_class.new(
      ocr: { backend: 'tesseract' },
      chunking: { max_chars: 500 }
    ).to_h

    expect(serialized[:ocr]).to be_a(Hash)
    expect(serialized[:chunking]).to be_a(Hash)
  end

  it 'compacts nil nested configs from hash' do
    serialized = described_class.new(use_cache: true).to_h

    %i[ocr chunking].each do |section|
      expect(serialized.key?(section)).to be(false)
    end
  end

  it 'always includes top-level boolean values' do
    serialized = described_class.new.to_h

    expect(serialized[:use_cache]).to be(true)
    expect(serialized[:enable_quality_processing]).to be(true)
    expect(serialized[:force_ocr]).to be(false)
  end
end
109
+
110
# JSON serialization: output is a String that round-trips through JSON.parse
# with string keys, including nested sections.
describe '#to_json' do
  it 'serializes to JSON string' do
    payload = described_class.new(use_cache: true, force_ocr: false).to_json
    expect(payload).to be_a(String)

    decoded = JSON.parse(payload)
    expect(decoded['use_cache']).to be(true)
    expect(decoded['force_ocr']).to be(false)
  end

  it 'handles nested configs in JSON' do
    payload = described_class.new(ocr: { backend: 'tesseract' }).to_json
    decoded = JSON.parse(payload)

    expect(decoded['ocr']['backend']).to eq('tesseract')
  end
end
129
+
130
# Field lookup by name: string or symbol keys, dot-separated paths for nested
# sections, and nil for names that do not exist.
describe '#get_field' do
  it 'retrieves top-level field' do
    looked_up = described_class.new(use_cache: false).get_field('use_cache')

    expect(looked_up).to be(false)
  end

  it 'retrieves nested field with dot notation' do
    looked_up = described_class.new(ocr: { backend: 'tesseract' }).get_field('ocr.backend')

    expect(looked_up).to eq('tesseract')
  end

  it 'returns nil for non-existent field' do
    looked_up = described_class.new.get_field('nonexistent')

    expect(looked_up).to be_nil
  end

  it 'accepts symbol field names' do
    looked_up = described_class.new(use_cache: true).get_field(:use_cache)

    expect(looked_up).to be(true)
  end

  it 'handles deeply nested fields' do
    model_spec = { type: :preset, name: 'fast' }
    built = described_class.new(chunking: { embedding: { model: model_spec } })

    expect(built.get_field('chunking.embedding.model')).to be_a(Hash)
  end
end
163
+
164
# Non-destructive merging: the result carries the overriding values while the
# receiver keeps its own; the argument may be a config or a plain hash.
describe '#merge' do
  it 'merges two configs' do
    original = described_class.new(use_cache: true, force_ocr: false)
    overrides = described_class.new(force_ocr: true)

    combined = original.merge(overrides)

    expect(combined.use_cache).to be(true)
    expect(combined.force_ocr).to be(true)
  end

  it 'returns new config without modifying original' do
    original = described_class.new(use_cache: true)
    combined = original.merge(described_class.new(use_cache: false))

    expect(original.use_cache).to be(true)
    expect(combined.use_cache).to be(false)
  end

  it 'merges nested configs' do
    original = described_class.new(ocr: { backend: 'tesseract' })
    overrides = described_class.new(ocr: { language: 'fra' })

    combined = original.merge(overrides)

    expect(combined.ocr.backend).to eq('tesseract')
  end

  it 'accepts hash as merge argument' do
    original = described_class.new(use_cache: true)
    combined = original.merge({ use_cache: false })

    expect(combined.use_cache).to be(false)
  end
end
198
+
199
# In-place merging: merge! updates the receiver and returns that same object.
describe '#merge!' do
  it 'mutates config in-place' do
    config = described_class.new(use_cache: true, force_ocr: false)
    override = described_class.new(force_ocr: true)

    result = config.merge!(override)

    expect(config.force_ocr).to be(true)
    expect(result).to be(config)
  end

  it 'returns self' do
    config = described_class.new
    override = described_class.new

    expect(config.merge!(override)).to be(config)
  end

  # This example drives the []= writer and never calls merge!, so it is named
  # for what it actually verifies instead of claiming Hash support for merge!.
  # NOTE(review): merge! with a Hash argument is not exercised anywhere in this
  # group — confirm whether it is supported and add a dedicated example if so.
  it 'supports hash-style attribute assignment' do
    config = described_class.new(use_cache: true)

    config[:use_cache] = false
    config[:force_ocr] = true

    expect(config.use_cache).to be(false)
    expect(config.force_ocr).to be(true)
  end
end
225
+
226
# Type checks on nested sections: wrong types raise ArgumentError with an
# "Expected ... <Section>" message; proper instances are accepted.
describe 'validation' do
  it 'rejects invalid ocr type' do
    expect { described_class.new(ocr: 'invalid') }
      .to raise_error(ArgumentError, /Expected.*OCR/)
  end

  it 'rejects invalid chunking type' do
    expect { described_class.new(chunking: 123) }
      .to raise_error(ArgumentError, /Expected.*Chunking/)
  end

  it 'accepts valid nested instances' do
    build = lambda do
      described_class.new(
        ocr: Kreuzberg::Config::OCR.new,
        chunking: Kreuzberg::Config::Chunking.new
      )
    end

    expect(&build).not_to raise_error
  end
end
248
+
249
# Smoke check that the constructor takes every supported keyword at once.
describe 'keyword arguments' do
  it 'accepts all keyword arguments' do
    options = {
      use_cache: false,
      enable_quality_processing: true,
      force_ocr: true,
      ocr: { backend: 'tesseract' },
      chunking: { max_chars: 500 },
      language_detection: { enabled: true },
      pdf_options: { extract_images: true },
      image_extraction: { target_dpi: 600 },
      postprocessor: { enabled: true },
      token_reduction: { mode: 'light' },
      keywords: { algorithm: 'yake' },
      pages: { extract_pages: true },
      max_concurrent_extractions: 4
    }

    built = described_class.new(**options)

    expect(built.use_cache).to be(false)
    expect(built.enable_quality_processing).to be(true)
    expect(built.force_ocr).to be(true)
    expect(built.ocr).to be_a(Kreuzberg::Config::OCR)
    expect(built.max_concurrent_extractions).to eq(4)
  end
end
274
+
275
# Attribute-level comparison between two independently built configs
# (the config objects themselves are not compared with ==).
describe 'equality' do
  it 'compares configs with same values' do
    left = described_class.new(use_cache: true, force_ocr: false)
    right = described_class.new(use_cache: true, force_ocr: false)

    expect(left.use_cache).to eq(right.use_cache)
    expect(left.force_ocr).to eq(right.force_ocr)
  end

  it 'detects differences' do
    left = described_class.new(use_cache: true)
    right = described_class.new(use_cache: false)

    expect(left.use_cache).not_to eq(right.use_cache)
  end
end
291
+
292
# Loading from disk: TOML and YAML fixtures both yield a config with
# use_cache disabled; a missing path raises a ValidationError.
describe '.from_file' do
  # Builds the path to a fixture under ../../fixtures relative to this spec file.
  def fixture_path(file_name)
    File.join(__dir__, '../../fixtures', file_name)
  end

  it 'loads from TOML file' do
    loaded = described_class.from_file(fixture_path('config.toml'))

    expect(loaded).to be_a(described_class)
    expect(loaded.use_cache).to be(false)
  end

  it 'loads from YAML file' do
    loaded = described_class.from_file(fixture_path('config.yaml'))

    expect(loaded).to be_a(described_class)
    expect(loaded.use_cache).to be(false)
  end

  it 'raises error for non-existent file' do
    expect { described_class.from_file('/nonexistent/path/config.toml') }
      .to raise_error(Kreuzberg::Errors::ValidationError)
  end
end
315
+
316
# Config discovery: the outcome may vary by environment, so this group only
# pins the return type (either nil or an instance of the described class).
describe '.discover' do
  # The description states what is actually asserted; the previous wording
  # promised nil even though a discovered config was also accepted.
  it 'returns a config or nil depending on the environment' do
    discovered = described_class.discover

    # Compound matcher instead of folding into a boolean, so a failure reports
    # the offending value rather than "expected true, got false".
    expect(discovered).to be_nil.or be_a(described_class)
  end
end
325
+
326
# Boolean coercion of the three top-level flags. Each flag gets one example
# with a truthy non-boolean input and one with a literal false.
describe 'boolean conversion' do
  truthy_inputs = {
    use_cache: 1,
    enable_quality_processing: 'yes',
    force_ocr: [1]
  }

  truthy_inputs.each do |flag, truthy_input|
    it "converts truthy #{flag} to true" do
      built = described_class.new(**{ flag => truthy_input })

      expect(built.public_send(flag)).to be(true)
    end

    it "converts false #{flag} to false" do
      built = described_class.new(**{ flag => false })

      expect(built.public_send(flag)).to be(false)
    end
  end
end
363
+
364
# Multi-level construction: hashes nested several layers deep are turned into
# the matching config classes, and a fully populated config reads back intact.
describe 'complex nested configurations' do
  it 'handles deeply nested configs' do
    embedding_options = {
      model: { type: :preset, name: 'balanced' },
      batch_size: 64
    }
    built = described_class.new(chunking: { max_chars: 750, embedding: embedding_options })

    expect(built.chunking.embedding).to be_a(Kreuzberg::Config::Embedding)
    expect(built.chunking.embedding.batch_size).to eq(64)
  end

  it 'handles PDF with font and hierarchy configs' do
    pdf_options = {
      extract_images: true,
      font_config: { enabled: true, custom_font_dirs: ['/fonts'] },
      hierarchy: { k_clusters: 8 }
    }
    built = described_class.new(pdf_options: pdf_options)

    expect(built.pdf_options.font_config).to be_a(Kreuzberg::Config::FontConfig)
    expect(built.pdf_options.hierarchy).to be_a(Kreuzberg::Config::Hierarchy)
  end

  it 'handles complete extraction config' do
    options = {
      use_cache: false,
      force_ocr: true,
      ocr: { backend: 'tesseract', language: 'deu' },
      chunking: { max_chars: 500, preset: 'fast' },
      language_detection: { enabled: true, min_confidence: 0.9 },
      pdf_options: { extract_images: true, passwords: ['secret'] },
      image_extraction: { target_dpi: 600 },
      postprocessor: { enabled: true, enabled_processors: %w[quality] },
      token_reduction: { mode: 'light' },
      keywords: { algorithm: 'yake', max_keywords: 10 },
      pages: { extract_pages: true }
    }

    built = described_class.new(**options)

    expect(built.use_cache).to be(false)
    expect(built.force_ocr).to be(true)
    expect(built.ocr.language).to eq('deu')
    expect(built.chunking.max_chars).to eq(500)
    expect(built.language_detection.enabled).to be(true)
    expect(built.pdf_options.extract_images).to be(true)
    expect(built.image_extraction.target_dpi).to eq(600)
    expect(built.postprocessor.enabled).to be(true)
    expect(built.token_reduction.mode).to eq('light')
    expect(built.keywords.max_keywords).to eq(10)
    expect(built.pages.extract_pages).to be(true)
  end
end
421
+
422
# Kreuzberg::ExtractionConfig must be defined and build instances of the
# described class.
describe 'ExtractionConfig alias' do
  it 'exists as module constant' do
    alias_defined = Kreuzberg.const_defined?(:ExtractionConfig)

    expect(alias_defined).to be(true)
  end

  it 'can be instantiated through alias' do
    via_alias = Kreuzberg::ExtractionConfig.new(use_cache: false)

    expect(via_alias).to be_a(described_class)
    expect(via_alias.use_cache).to be(false)
  end
end
434
+ end
@@ -0,0 +1,285 @@
1
+ # frozen_string_literal: true
2
+
3
+ RSpec.describe Kreuzberg::Config::FontConfig do
4
# Constructor behaviour: enabled defaults to true, custom_font_dirs defaults
# to nil and is stored exactly as given (string or array).
describe '#initialize' do
  it 'creates config with default values' do
    defaults = described_class.new

    expect(defaults.enabled).to be(true)
    expect(defaults.custom_font_dirs).to be_nil
  end

  it 'creates config with enabled false' do
    expect(described_class.new(enabled: false).enabled).to be(false)
  end

  it 'creates config with custom_font_dirs' do
    font_paths = ['/usr/share/fonts', '/home/user/.fonts']
    built = described_class.new(custom_font_dirs: font_paths)

    expect(built.custom_font_dirs).to eq(font_paths)
  end

  it 'accepts single font directory as string' do
    built = described_class.new(custom_font_dirs: '/usr/share/fonts')

    expect(built.custom_font_dirs).to eq('/usr/share/fonts')
  end

  it 'accepts multiple directories as array' do
    font_paths = ['/fonts1', '/fonts2', '/fonts3']
    built = described_class.new(custom_font_dirs: font_paths)

    expect(built.custom_font_dirs).to eq(font_paths)
  end

  it 'converts enabled to boolean' do
    expect(described_class.new(enabled: 1).enabled).to be(true)
  end
end
44
+
45
# Hash serialization: :enabled is present, :custom_font_dirs appears only
# when it has been set.
describe '#to_h' do
  it 'serializes to hash with default values' do
    serialized = described_class.new.to_h

    expect(serialized).to be_a(Hash)
    expect(serialized[:enabled]).to be(true)
  end

  it 'includes custom_font_dirs when present' do
    font_paths = ['/fonts']
    serialized = described_class.new(custom_font_dirs: font_paths).to_h

    expect(serialized[:custom_font_dirs]).to eq(font_paths)
  end

  it 'compacts nil values from hash' do
    serialized = described_class.new(enabled: true).to_h

    expect(serialized.key?(:custom_font_dirs)).to be(false)
  end

  it 'includes both keys when both are present' do
    serialized = described_class.new(enabled: true, custom_font_dirs: ['/fonts']).to_h

    expect(serialized.keys).to match_array(%i[enabled custom_font_dirs])
  end
end
79
+
80
# Construction must not raise for either boolean value of enabled, nor for
# custom_font_dirs given as a string or as an array.
describe 'validation' do
  it 'accepts enabled true' do
    expect { described_class.new(enabled: true) }.not_to raise_error
  end

  it 'accepts enabled false' do
    expect { described_class.new(enabled: false) }.not_to raise_error
  end

  it 'accepts custom_font_dirs as string' do
    expect { described_class.new(custom_font_dirs: '/fonts') }.not_to raise_error
  end

  it 'accepts custom_font_dirs as array' do
    expect { described_class.new(custom_font_dirs: ['/fonts1', '/fonts2']) }.not_to raise_error
  end
end
105
+
106
# Both supported keywords can be supplied together and read back unchanged.
describe 'keyword arguments' do
  it 'accepts all keyword arguments' do
    font_paths = ['/fonts']
    options = { enabled: false, custom_font_dirs: font_paths }

    built = described_class.new(**options)

    expect(built.enabled).to be(false)
    expect(built.custom_font_dirs).to eq(font_paths)
  end
end
118
+
119
# Attribute-level comparison between two independently built configs
# (the config objects themselves are not compared with ==).
describe 'equality' do
  it 'compares configs by value' do
    left = described_class.new(enabled: true, custom_font_dirs: ['/fonts'])
    right = described_class.new(enabled: true, custom_font_dirs: ['/fonts'])

    expect(left.enabled).to eq(right.enabled)
    expect(left.custom_font_dirs).to eq(right.custom_font_dirs)
  end

  it 'detects differences in enabled' do
    left = described_class.new(enabled: true)
    right = described_class.new(enabled: false)

    expect(left.enabled).not_to eq(right.enabled)
  end

  it 'detects differences in custom_font_dirs' do
    left = described_class.new(custom_font_dirs: ['/fonts1'])
    right = described_class.new(custom_font_dirs: ['/fonts2'])

    expect(left.custom_font_dirs).not_to eq(right.custom_font_dirs)
  end
end
148
+
149
# Embedding inside parent configs: a font config (instance or hash) given to
# the PDF config, or reached through Extraction's pdf_options, is exposed as
# an instance of the described class.
describe 'nested config integration' do
  it 'can be nested in PDF config' do
    fonts = described_class.new(enabled: true, custom_font_dirs: ['/fonts'])
    nested = Kreuzberg::Config::PDF.new(font_config: fonts).font_config

    expect(nested).to be_a(described_class)
    expect(nested.enabled).to be(true)
    expect(nested.custom_font_dirs).to eq(['/fonts'])
  end

  it 'accepts hash in PDF config' do
    font_options = { enabled: true, custom_font_dirs: ['/fonts'] }
    nested = Kreuzberg::Config::PDF.new(font_config: font_options).font_config

    expect(nested).to be_a(described_class)
    expect(nested.enabled).to be(true)
    expect(nested.custom_font_dirs).to eq(['/fonts'])
  end

  it 'can be nested in Extraction config via PDF' do
    parent = Kreuzberg::Config::Extraction.new(
      pdf_options: { font_config: { enabled: true } }
    )

    expect(parent.pdf_options.font_config).to be_a(described_class)
    expect(parent.pdf_options.font_config.enabled).to be(true)
  end
end
178
+
179
# Values passed via symbol keywords come back unchanged from the readers.
describe 'symbol vs string key handling' do
  it 'converts symbol enabled to boolean' do
    expect(described_class.new(enabled: true).enabled).to be(true)
  end

  it 'preserves custom_font_dirs as array' do
    font_paths = ['/fonts1', '/fonts2']
    built = described_class.new(custom_font_dirs: font_paths)

    expect(built.custom_font_dirs).to eq(font_paths)
  end
end
193
+
194
# Coercion of enabled: truthy inputs (1, 'yes') become true, while false and
# nil become false.
describe 'boolean conversion' do
  it 'converts truthy enabled to true' do
    coerced = described_class.new(enabled: 1).enabled

    expect(coerced).to be(true)
  end

  it 'converts false enabled to false' do
    coerced = described_class.new(enabled: false).enabled

    expect(coerced).to be(false)
  end

  it 'converts string yes to true' do
    coerced = described_class.new(enabled: 'yes').enabled

    expect(coerced).to be(true)
  end

  it 'converts nil to false' do
    coerced = described_class.new(enabled: nil).enabled

    expect(coerced).to be(false)
  end
end
219
+
220
# custom_font_dirs is stored verbatim: a string stays a string, an array
# stays an array, and the paths themselves are not rewritten.
describe 'font directory handling' do
  it 'stores single directory path as string' do
    stored = described_class.new(custom_font_dirs: '/usr/share/fonts').custom_font_dirs

    expect(stored).to eq('/usr/share/fonts')
  end

  it 'stores multiple directories as array' do
    font_paths = ['/fonts1', '/fonts2', '/fonts3']
    built = described_class.new(custom_font_dirs: font_paths)

    expect(built.custom_font_dirs).to eq(font_paths)
    expect(built.custom_font_dirs).to be_a(Array)
  end

  it 'preserves exact directory paths' do
    font_path = '/home/user/.local/share/fonts'
    stored = described_class.new(custom_font_dirs: font_path).custom_font_dirs

    expect(stored).to eq(font_path)
  end

  it 'preserves array of directory paths' do
    font_paths = ['/usr/share/fonts', '/home/user/.fonts', '~/.local/share/fonts']
    stored = described_class.new(custom_font_dirs: font_paths).custom_font_dirs

    expect(stored).to eq(font_paths)
  end
end
249
+
250
# Both attributes expose writers, and the readers reflect the new values.
describe 'mutability' do
  it 'allows modification of enabled' do
    mutable = described_class.new(enabled: true)

    mutable.enabled = false

    expect(mutable.enabled).to be(false)
  end

  it 'allows modification of custom_font_dirs' do
    mutable = described_class.new(custom_font_dirs: ['/fonts1'])

    mutable.custom_font_dirs = ['/fonts2']

    expect(mutable.custom_font_dirs).to eq(['/fonts2'])
  end
end
265
+
266
# Defaults of a bare config, plus the explicit opt-out via enabled: false.
describe 'default behavior' do
  it 'defaults to enabled' do
    expect(described_class.new.enabled).to be(true)
  end

  it 'defaults to no custom font directories' do
    expect(described_class.new.custom_font_dirs).to be_nil
  end

  it 'allows disabling font support' do
    expect(described_class.new(enabled: false).enabled).to be(false)
  end
end
285
+ end