kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,732 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+ require 'tempfile'
5
+ require 'fileutils'
6
+
7
# Integration specs for image extraction through the Kreuzberg Ruby binding.
#
# Most examples extract the `pdf/with_images.pdf` fixture and inspect
# `result.images`. When the extraction call raises
# Kreuzberg::Errors::ValidationError the example is skipped with
# 'Test file not available'. Examples that inspect images only make
# assertions when the result actually contains images.
RSpec.describe 'Image Extraction' do
  # Image extraction enabled with otherwise default options.
  let(:basic_config) { image_extraction_config(extract_images: true) }

  # Image extraction enabled at 150 DPI, the setting most examples use.
  let(:standard_config) { image_extraction_config(extract_images: true, target_dpi: 150) }

  # Builds an Extraction config whose image_extraction section is created
  # from the given keyword options.
  #
  # @param image_options [Hash] keyword arguments forwarded to
  #   Kreuzberg::Config::ImageExtraction.new
  # @return [Kreuzberg::Config::Extraction]
  def image_extraction_config(**image_options)
    Kreuzberg::Config::Extraction.new(
      image_extraction: Kreuzberg::Config::ImageExtraction.new(**image_options)
    )
  end

  # Synchronously extracts a fixture document, skipping the running example
  # when the extraction raises a ValidationError.
  #
  # @param config [Kreuzberg::Config::Extraction] extraction configuration
  # @param document [String] fixture path passed to `test_document_path`
  # @return the value returned by Kreuzberg.extract_file_sync
  def extract_document(config, document = 'pdf/with_images.pdf')
    Kreuzberg.extract_file_sync(path: test_document_path(document), config: config)
  rescue Kreuzberg::Errors::ValidationError
    skip 'Test file not available'
  end

  # Returns the images of a result as an array; a nil `images` becomes [].
  def images_of(result)
    Array(result.images)
  end

  # Creates one temporary `.txt` file per entry in +contents+, yields their
  # paths and removes the files afterwards.
  #
  # The Tempfile objects are kept referenced for the whole block: Tempfile
  # deletes its file when the object is garbage-collected, so keeping only
  # the path string could let the file vanish before it is read. The
  # [prefix, suffix] form is required for the path to really end in `.txt`;
  # with a plain String basename the random part is appended after it.
  #
  # @param prefix [String] file name prefix
  # @param contents [Array<String>] text written to each file
  # @yieldparam paths [Array<String>] paths of the created files
  def with_text_tempfiles(prefix, contents)
    files = contents.each_with_index.map do |content, index|
      Tempfile.new(["#{prefix}_#{index}", '.txt']).tap do |file|
        file.write(content)
        file.close
      end
    end
    yield files.map(&:path)
  ensure
    files&.each(&:unlink)
  end

  describe 'PDF image extraction with metadata' do
    it 'extracts images with format and dimensions' do
      result = extract_document(standard_config)

      expect(result).not_to be_nil
      images = images_of(result)
      unless images.empty?
        image = images.first
        expect(image).to be_a(Kreuzberg::Result::Image)
        expect(image.format).not_to be_nil
        expect(image.width).to be > 0
        expect(image.height).to be > 0
      end
    end

    it 'includes page numbers in extracted images' do
      images_of(extract_document(standard_config)).each do |image|
        expect(image.page_number).to be > 0
      end
    end

    it 'respects target_dpi configuration parameter' do
      [150, 300, 600].each do |dpi|
        result = extract_document(image_extraction_config(extract_images: true, target_dpi: dpi))

        expect(result).not_to be_nil
      end
    end

    it 'includes colorspace information in image metadata' do
      images = images_of(extract_document(standard_config))

      unless images.empty?
        image = images.first
        expect(image).to respond_to(:colorspace)
        # Verify colorspace has meaningful value if present
        if image.colorspace
          expect(image.colorspace).not_to be_empty
          expect(image.colorspace).to be_a(String)
        end
      end
    end
  end

  describe 'Image handling in composite documents' do
    it 'extracts images from DOCX files' do
      result = extract_document(basic_config, 'docx/extraction_test.docx')

      expect(result).not_to be_nil
      expect(result.content).not_to be_nil
    end

    it 'extracts images from PPTX files' do
      result = extract_document(basic_config, 'pptx/simple.pptx')

      expect(result).not_to be_nil
      expect(result.content).not_to be_nil
    end

    it 'handles documents with multiple images across pages' do
      images = images_of(extract_document(standard_config))

      if images.length > 1
        page_numbers = images.map(&:page_number).uniq
        expect(page_numbers.length).to be > 1
      end
    end

    it 'preserves image index for sequential extraction' do
      images = images_of(extract_document(basic_config))

      if images.length > 1
        images.each do |image|
          expect(image.image_index).to be_a(Integer)
        end
      end
    end
  end

  describe 'Image format detection' do
    it 'detects PNG format in extracted images' do
      images = images_of(extract_document(standard_config))

      expect(images.filter_map(&:format)).to be_an(Array) unless images.empty?
    end

    it 'detects JPEG format in extracted images' do
      images_of(extract_document(standard_config)).each do |image|
        expect(image.format).not_to be_nil
        expect(image.format).to be_a(String)
      end
    end

    it 'handles WebP format detection if present' do
      expect(extract_document(basic_config)).not_to be_nil
    end

    it 'provides consistent format strings across extractions' do
      first_run = images_of(extract_document(basic_config))
      second_run = images_of(extract_document(basic_config))

      unless first_run.empty? || second_run.empty?
        expect(first_run.first.format).to eq(second_run.first.format)
      end
    end
  end

  describe 'Embedded vs referenced images' do
    it 'extracts embedded images from documents' do
      expect(extract_document(basic_config)).not_to be_nil
    end

    it 'handles image data field in extracted images' do
      images = images_of(extract_document(standard_config))

      expect(images.first).to respond_to(:data) unless images.empty?
    end

    it 'preserves image metadata when extraction enabled' do
      images = images_of(extract_document(basic_config))

      unless images.empty?
        image = images.first
        expect(image.width).to be_a(Integer)
        expect(image.height).to be_a(Integer)
      end
    end

    it 'returns nil for images when extraction disabled' do
      result = extract_document(image_extraction_config(extract_images: false))

      expect(result.images).to be_nil
    end
  end

  describe 'Error handling for corrupted images' do
    it 'gracefully handles documents with malformed images' do
      expect(extract_document(basic_config)).not_to be_nil
    end

    it 'continues extraction when encountering problematic images' do
      result = extract_document(basic_config)

      expect(result).not_to be_nil
      expect(result.content).not_to be_nil
    end

    it 'handles extraction with max_image_dimension constraint' do
      config = image_extraction_config(extract_images: true, max_image_dimension: 1000)

      images_of(extract_document(config)).each do |image|
        expect(image.width).to be_a(Integer)
        expect(image.height).to be_a(Integer)
      end
    end

    it 'respects auto_adjust_dpi configuration' do
      config = image_extraction_config(
        extract_images: true,
        auto_adjust_dpi: true,
        min_dpi: 150,
        max_dpi: 600
      )

      expect(extract_document(config)).not_to be_nil
    end
  end

  describe 'Batch image extraction from multi-page documents' do
    it 'extracts images from multi-page PDF in single operation' do
      expect(extract_document(standard_config)).not_to be_nil
    end

    it 'maintains correct page associations for extracted images' do
      images = images_of(extract_document(basic_config))

      if images.length > 1
        images.each do |image|
          expect(image.page_number).to be >= 1
        end
      end
    end

    it 'preserves image order within document' do
      images = images_of(extract_document(basic_config))

      # Adjacent pairs must be in non-decreasing image_index order.
      images.each_cons(2) do |current, following|
        expect(current.image_index).to be <= following.image_index
      end
    end

    it 'handles multiple file batch extraction with images' do
      contents = Array.new(2) { |i| "Image extraction test #{i}" }

      with_text_tempfiles('batch_image_test', contents) do |paths|
        results = Kreuzberg.batch_extract_files_sync(paths: paths, config: basic_config)

        expect(results).to be_a(Array)
        expect(results.length).to eq(2)
        expect(results).to all(be_a(Kreuzberg::Result))
      end
    end

    it 'maintains correct image count across batch operations' do
      contents = Array.new(2) { |i| "Content #{i}" }

      with_text_tempfiles('batch_count', contents) do |paths|
        results = Kreuzberg.batch_extract_files_sync(paths: paths, config: basic_config)

        expect(results.length).to eq(paths.length)
        expect(results).to all(be_a(Kreuzberg::Result))
      end
    end
  end

  describe 'ImageExtraction configuration integration' do
    it 'applies different DPI settings to affect extraction behavior' do
      config_low = image_extraction_config(extract_images: true, target_dpi: 72)
      config_high = image_extraction_config(extract_images: true, target_dpi: 300)

      result_low = extract_document(config_low)
      result_high = extract_document(config_high)

      # Both configurations should produce valid extraction
      expect(result_low).not_to be_nil
      expect(result_high).not_to be_nil
      # Different DPI settings should be accepted
      expect([result_low, result_high]).to all(be_a(Kreuzberg::Result))
    end

    it 'respects extract_images false disables extraction' do
      result_enabled = extract_document(basic_config)
      result_disabled = extract_document(image_extraction_config(extract_images: false))

      # Enabled should extract if images present
      expect(result_enabled).not_to be_nil
      # Disabled should return nil or empty images
      expect(result_disabled.images).to be_empty if result_disabled.images
    end

    it 'handles dimension constraints realistically' do
      config = image_extraction_config(extract_images: true, max_image_dimension: 1024)

      result = extract_document(config)

      expect(result).not_to be_nil
      # NOTE(review): only the presence of each image is checked here; the
      # 1024 limit itself is never compared against width/height.
      images_of(result).each do |image|
        expect(image).not_to be_nil
      end
    end
  end

  describe 'Integration with Extraction config' do
    it 'accepts ImageExtraction config in Extraction' do
      config = image_extraction_config(extract_images: true, target_dpi: 600)

      expect(config.image_extraction).to be_a(Kreuzberg::Config::ImageExtraction)
      expect(config.image_extraction.target_dpi).to eq(600)
    end

    it 'accepts image extraction config as hash in Extraction' do
      config = Kreuzberg::Config::Extraction.new(
        image_extraction: {
          extract_images: true,
          target_dpi: 600,
          max_image_dimension: 3000
        }
      )

      expect(config.image_extraction).to be_a(Kreuzberg::Config::ImageExtraction)
      expect(config.image_extraction.extract_images).to be true
      expect(config.image_extraction.target_dpi).to eq(600)
      expect(config.image_extraction.max_image_dimension).to eq(3000)
    end

    it 'includes image extraction config in to_h' do
      hash = image_extraction_config(extract_images: true, target_dpi: 600).to_h

      expect(hash).to include(:image_extraction)
      expect(hash[:image_extraction]).to be_a(Hash)
      expect(hash[:image_extraction][:extract_images]).to be true
      expect(hash[:image_extraction][:target_dpi]).to eq(600)
    end

    it 'combines image extraction with other configurations' do
      config = Kreuzberg::Config::Extraction.new(
        use_cache: true,
        force_ocr: true,
        image_extraction: Kreuzberg::Config::ImageExtraction.new(
          extract_images: true,
          target_dpi: 600
        ),
        ocr: Kreuzberg::Config::OCR.new(
          backend: 'tesseract',
          language: 'eng'
        )
      )

      expect(config.use_cache).to be true
      expect(config.force_ocr).to be true
      expect(config.image_extraction.target_dpi).to eq(600)
      expect(config.ocr.backend).to eq('tesseract')
    end

    it 'handles nil image extraction config' do
      config = Kreuzberg::Config::Extraction.new(image_extraction: nil)

      expect(config.image_extraction).to be_nil
    end
  end

  describe 'Image metadata validation in real extractions' do
    it 'validates extracted images have complete required metadata' do
      images_of(extract_document(basic_config)).each do |image|
        # All extracted images must have these fields populated
        expect(image).not_to be_nil
        expect(image.format).not_to be_nil, 'Format is required'
        expect(image.format).not_to be_empty
        expect(image.image_index).to be >= 0, 'Image index must be non-negative'
        expect(image.data).not_to be_nil, 'Image data is required'
      end
    end

    it 'includes optional metadata fields appropriately' do
      images_of(extract_document(standard_config)).each do |image|
        # Optional fields should be valid when present
        expect(image.width).to be > 0, 'Width should be positive when present' if image.width
        expect(image.height).to be > 0, 'Height should be positive when present' if image.height
        expect(image.page_number).to be > 0, 'Page number should be positive' if image.page_number
      end
    end

    it 'ensures multiple images have different indices' do
      images = images_of(extract_document(basic_config))

      if images.length > 1
        indices = images.map(&:image_index)
        expect(indices.uniq.length).to eq(indices.length), 'Each image should have unique index'
      end
    end
  end
end