kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
data/spec/binding/images_spec.rb
DELETED
|
@@ -1,738 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'spec_helper'
|
|
4
|
-
require 'tempfile'
|
|
5
|
-
require 'fileutils'
|
|
6
|
-
|
|
7
|
-
RSpec.describe 'Image Extraction' do
|
|
8
|
-
describe 'PDF image extraction with metadata' do
|
|
9
|
-
it 'extracts images with format and dimensions' do
|
|
10
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
11
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
12
|
-
extract_images: true,
|
|
13
|
-
target_dpi: 150
|
|
14
|
-
)
|
|
15
|
-
)
|
|
16
|
-
|
|
17
|
-
pdf_path = test_document_path('pdf/with_images.pdf')
|
|
18
|
-
begin
|
|
19
|
-
result = Kreuzberg.extract_file_sync(path: pdf_path, config: config)
|
|
20
|
-
|
|
21
|
-
expect(result).not_to be_nil
|
|
22
|
-
expect(result.images).not_to be_nil
|
|
23
|
-
if result.images && !result.images.empty?
|
|
24
|
-
image = result.images.first
|
|
25
|
-
expect(image).to be_a(Kreuzberg::Result::Image)
|
|
26
|
-
expect(image.format).not_to be_nil
|
|
27
|
-
expect(image.width).to be > 0
|
|
28
|
-
expect(image.height).to be > 0
|
|
29
|
-
end
|
|
30
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
31
|
-
skip 'Test file not available'
|
|
32
|
-
end
|
|
33
|
-
end
|
|
34
|
-
|
|
35
|
-
it 'includes page numbers in extracted images' do
|
|
36
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
37
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
38
|
-
extract_images: true,
|
|
39
|
-
target_dpi: 150
|
|
40
|
-
)
|
|
41
|
-
)
|
|
42
|
-
|
|
43
|
-
begin
|
|
44
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
45
|
-
|
|
46
|
-
expect(result.images).not_to be_nil
|
|
47
|
-
if result.images && !result.images.empty?
|
|
48
|
-
result.images.each do |image|
|
|
49
|
-
expect(image.page_number).to be > 0
|
|
50
|
-
end
|
|
51
|
-
end
|
|
52
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
53
|
-
skip 'Test file not available'
|
|
54
|
-
end
|
|
55
|
-
end
|
|
56
|
-
|
|
57
|
-
it 'respects target_dpi configuration parameter' do
|
|
58
|
-
dpi_values = [150, 300, 600]
|
|
59
|
-
|
|
60
|
-
dpi_values.each do |dpi|
|
|
61
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
62
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
63
|
-
extract_images: true,
|
|
64
|
-
target_dpi: dpi
|
|
65
|
-
)
|
|
66
|
-
)
|
|
67
|
-
|
|
68
|
-
begin
|
|
69
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
70
|
-
|
|
71
|
-
expect(result).not_to be_nil
|
|
72
|
-
expect(result.images).not_to be_nil
|
|
73
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
74
|
-
skip 'Test file not available'
|
|
75
|
-
end
|
|
76
|
-
end
|
|
77
|
-
end
|
|
78
|
-
|
|
79
|
-
it 'includes colorspace information in image metadata' do
|
|
80
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
81
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
82
|
-
extract_images: true,
|
|
83
|
-
target_dpi: 150
|
|
84
|
-
)
|
|
85
|
-
)
|
|
86
|
-
|
|
87
|
-
begin
|
|
88
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
89
|
-
|
|
90
|
-
if result.images && !result.images.empty?
|
|
91
|
-
image = result.images.first
|
|
92
|
-
expect(image).to respond_to(:colorspace)
|
|
93
|
-
# Verify colorspace has meaningful value if present
|
|
94
|
-
if image.colorspace
|
|
95
|
-
expect(image.colorspace).not_to be_empty
|
|
96
|
-
expect(image.colorspace).to be_a(String)
|
|
97
|
-
end
|
|
98
|
-
end
|
|
99
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
100
|
-
skip 'Test file not available'
|
|
101
|
-
end
|
|
102
|
-
end
|
|
103
|
-
end
|
|
104
|
-
|
|
105
|
-
describe 'Image handling in composite documents' do
|
|
106
|
-
it 'extracts images from DOCX files' do
|
|
107
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
108
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
109
|
-
extract_images: true
|
|
110
|
-
)
|
|
111
|
-
)
|
|
112
|
-
|
|
113
|
-
begin
|
|
114
|
-
docx_path = test_document_path('office/document.docx')
|
|
115
|
-
result = Kreuzberg.extract_file_sync(path: docx_path, config: config)
|
|
116
|
-
|
|
117
|
-
expect(result).not_to be_nil
|
|
118
|
-
expect(result.content).not_to be_nil
|
|
119
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
120
|
-
skip 'Test file not available'
|
|
121
|
-
end
|
|
122
|
-
end
|
|
123
|
-
|
|
124
|
-
it 'extracts images from PPTX files' do
|
|
125
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
126
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
127
|
-
extract_images: true
|
|
128
|
-
)
|
|
129
|
-
)
|
|
130
|
-
|
|
131
|
-
begin
|
|
132
|
-
pptx_path = test_document_path('presentations/simple.pptx')
|
|
133
|
-
result = Kreuzberg.extract_file_sync(path: pptx_path, config: config)
|
|
134
|
-
|
|
135
|
-
expect(result).not_to be_nil
|
|
136
|
-
expect(result.content).not_to be_nil
|
|
137
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
138
|
-
skip 'Test file not available'
|
|
139
|
-
end
|
|
140
|
-
end
|
|
141
|
-
|
|
142
|
-
it 'handles documents with multiple images across pages' do
|
|
143
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
144
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
145
|
-
extract_images: true,
|
|
146
|
-
target_dpi: 150
|
|
147
|
-
)
|
|
148
|
-
)
|
|
149
|
-
|
|
150
|
-
begin
|
|
151
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
152
|
-
|
|
153
|
-
expect(result.images).not_to be_nil
|
|
154
|
-
if result.images && result.images.length > 1
|
|
155
|
-
page_numbers = result.images.map(&:page_number).uniq
|
|
156
|
-
expect(page_numbers.length).to be > 1
|
|
157
|
-
end
|
|
158
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
159
|
-
skip 'Test file not available'
|
|
160
|
-
end
|
|
161
|
-
end
|
|
162
|
-
|
|
163
|
-
it 'preserves image index for sequential extraction' do
|
|
164
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
165
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
166
|
-
extract_images: true
|
|
167
|
-
)
|
|
168
|
-
)
|
|
169
|
-
|
|
170
|
-
begin
|
|
171
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
172
|
-
|
|
173
|
-
if result.images && result.images.length > 1
|
|
174
|
-
result.images.each_with_index do |image, _index|
|
|
175
|
-
expect(image.image_index).to be_a(Integer)
|
|
176
|
-
end
|
|
177
|
-
end
|
|
178
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
179
|
-
skip 'Test file not available'
|
|
180
|
-
end
|
|
181
|
-
end
|
|
182
|
-
end
|
|
183
|
-
|
|
184
|
-
describe 'Image format detection' do
|
|
185
|
-
it 'detects PNG format in extracted images' do
|
|
186
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
187
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
188
|
-
extract_images: true,
|
|
189
|
-
target_dpi: 150
|
|
190
|
-
)
|
|
191
|
-
)
|
|
192
|
-
|
|
193
|
-
begin
|
|
194
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
195
|
-
|
|
196
|
-
if result.images && !result.images.empty?
|
|
197
|
-
formats = result.images.filter_map(&:format)
|
|
198
|
-
expect(formats).to be_an(Array)
|
|
199
|
-
end
|
|
200
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
201
|
-
skip 'Test file not available'
|
|
202
|
-
end
|
|
203
|
-
end
|
|
204
|
-
|
|
205
|
-
it 'detects JPEG format in extracted images' do
|
|
206
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
207
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
208
|
-
extract_images: true,
|
|
209
|
-
target_dpi: 150
|
|
210
|
-
)
|
|
211
|
-
)
|
|
212
|
-
|
|
213
|
-
begin
|
|
214
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
215
|
-
|
|
216
|
-
if result.images && !result.images.empty?
|
|
217
|
-
result.images.each do |image|
|
|
218
|
-
expect(image.format).not_to be_nil
|
|
219
|
-
expect(image.format).to be_a(String)
|
|
220
|
-
end
|
|
221
|
-
end
|
|
222
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
223
|
-
skip 'Test file not available'
|
|
224
|
-
end
|
|
225
|
-
end
|
|
226
|
-
|
|
227
|
-
it 'handles WebP format detection if present' do
|
|
228
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
229
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
230
|
-
extract_images: true
|
|
231
|
-
)
|
|
232
|
-
)
|
|
233
|
-
|
|
234
|
-
begin
|
|
235
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
236
|
-
|
|
237
|
-
expect(result.images).not_to be_nil
|
|
238
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
239
|
-
skip 'Test file not available'
|
|
240
|
-
end
|
|
241
|
-
end
|
|
242
|
-
|
|
243
|
-
it 'provides consistent format strings across extractions' do
|
|
244
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
245
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
246
|
-
extract_images: true
|
|
247
|
-
)
|
|
248
|
-
)
|
|
249
|
-
|
|
250
|
-
begin
|
|
251
|
-
result1 = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
252
|
-
result2 = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
253
|
-
|
|
254
|
-
if result1.images && result2.images && !result1.images.empty? && !result2.images.empty?
|
|
255
|
-
expect(result1.images.first.format).to eq(result2.images.first.format)
|
|
256
|
-
end
|
|
257
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
258
|
-
skip 'Test file not available'
|
|
259
|
-
end
|
|
260
|
-
end
|
|
261
|
-
end
|
|
262
|
-
|
|
263
|
-
describe 'Embedded vs referenced images' do
|
|
264
|
-
it 'extracts embedded images from documents' do
|
|
265
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
266
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
267
|
-
extract_images: true
|
|
268
|
-
)
|
|
269
|
-
)
|
|
270
|
-
|
|
271
|
-
begin
|
|
272
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
273
|
-
|
|
274
|
-
expect(result.images).not_to be_nil
|
|
275
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
276
|
-
skip 'Test file not available'
|
|
277
|
-
end
|
|
278
|
-
end
|
|
279
|
-
|
|
280
|
-
it 'handles image data field in extracted images' do
|
|
281
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
282
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
283
|
-
extract_images: true,
|
|
284
|
-
target_dpi: 150
|
|
285
|
-
)
|
|
286
|
-
)
|
|
287
|
-
|
|
288
|
-
begin
|
|
289
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
290
|
-
|
|
291
|
-
if result.images && !result.images.empty?
|
|
292
|
-
image = result.images.first
|
|
293
|
-
expect(image).to respond_to(:data)
|
|
294
|
-
end
|
|
295
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
296
|
-
skip 'Test file not available'
|
|
297
|
-
end
|
|
298
|
-
end
|
|
299
|
-
|
|
300
|
-
it 'preserves image metadata when extraction enabled' do
|
|
301
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
302
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
303
|
-
extract_images: true
|
|
304
|
-
)
|
|
305
|
-
)
|
|
306
|
-
|
|
307
|
-
begin
|
|
308
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
309
|
-
|
|
310
|
-
if result.images && !result.images.empty?
|
|
311
|
-
image = result.images.first
|
|
312
|
-
expect(image.width).to be_a(Integer)
|
|
313
|
-
expect(image.height).to be_a(Integer)
|
|
314
|
-
end
|
|
315
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
316
|
-
skip 'Test file not available'
|
|
317
|
-
end
|
|
318
|
-
end
|
|
319
|
-
|
|
320
|
-
it 'returns nil for images when extraction disabled' do
|
|
321
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
322
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
323
|
-
extract_images: false
|
|
324
|
-
)
|
|
325
|
-
)
|
|
326
|
-
|
|
327
|
-
begin
|
|
328
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
329
|
-
|
|
330
|
-
expect(result.images).to be_nil
|
|
331
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
332
|
-
skip 'Test file not available'
|
|
333
|
-
end
|
|
334
|
-
end
|
|
335
|
-
end
|
|
336
|
-
|
|
337
|
-
describe 'Error handling for corrupted images' do
|
|
338
|
-
it 'gracefully handles documents with malformed images' do
|
|
339
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
340
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
341
|
-
extract_images: true
|
|
342
|
-
)
|
|
343
|
-
)
|
|
344
|
-
|
|
345
|
-
begin
|
|
346
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
347
|
-
expect(result).not_to be_nil
|
|
348
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
349
|
-
skip 'Test file not available'
|
|
350
|
-
end
|
|
351
|
-
end
|
|
352
|
-
|
|
353
|
-
it 'continues extraction when encountering problematic images' do
|
|
354
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
355
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
356
|
-
extract_images: true
|
|
357
|
-
)
|
|
358
|
-
)
|
|
359
|
-
|
|
360
|
-
begin
|
|
361
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
362
|
-
|
|
363
|
-
expect(result).not_to be_nil
|
|
364
|
-
expect(result.content).not_to be_nil
|
|
365
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
366
|
-
skip 'Test file not available'
|
|
367
|
-
end
|
|
368
|
-
end
|
|
369
|
-
|
|
370
|
-
it 'handles extraction with max_image_dimension constraint' do
|
|
371
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
372
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
373
|
-
extract_images: true,
|
|
374
|
-
max_image_dimension: 1000
|
|
375
|
-
)
|
|
376
|
-
)
|
|
377
|
-
|
|
378
|
-
begin
|
|
379
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
380
|
-
|
|
381
|
-
if result.images && !result.images.empty?
|
|
382
|
-
result.images.each do |image|
|
|
383
|
-
expect(image.width).to be_a(Integer)
|
|
384
|
-
expect(image.height).to be_a(Integer)
|
|
385
|
-
end
|
|
386
|
-
end
|
|
387
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
388
|
-
skip 'Test file not available'
|
|
389
|
-
end
|
|
390
|
-
end
|
|
391
|
-
|
|
392
|
-
it 'respects auto_adjust_dpi configuration' do
|
|
393
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
394
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
395
|
-
extract_images: true,
|
|
396
|
-
auto_adjust_dpi: true,
|
|
397
|
-
min_dpi: 150,
|
|
398
|
-
max_dpi: 600
|
|
399
|
-
)
|
|
400
|
-
)
|
|
401
|
-
|
|
402
|
-
begin
|
|
403
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
404
|
-
|
|
405
|
-
expect(result).not_to be_nil
|
|
406
|
-
expect(result.images).not_to be_nil
|
|
407
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
408
|
-
skip 'Test file not available'
|
|
409
|
-
end
|
|
410
|
-
end
|
|
411
|
-
end
|
|
412
|
-
|
|
413
|
-
describe 'Batch image extraction from multi-page documents' do
|
|
414
|
-
it 'extracts images from multi-page PDF in single operation' do
|
|
415
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
416
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
417
|
-
extract_images: true,
|
|
418
|
-
target_dpi: 150
|
|
419
|
-
)
|
|
420
|
-
)
|
|
421
|
-
|
|
422
|
-
begin
|
|
423
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
424
|
-
|
|
425
|
-
expect(result).not_to be_nil
|
|
426
|
-
expect(result.images).not_to be_nil
|
|
427
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
428
|
-
skip 'Test file not available'
|
|
429
|
-
end
|
|
430
|
-
end
|
|
431
|
-
|
|
432
|
-
it 'maintains correct page associations for extracted images' do
|
|
433
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
434
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
435
|
-
extract_images: true
|
|
436
|
-
)
|
|
437
|
-
)
|
|
438
|
-
|
|
439
|
-
begin
|
|
440
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
441
|
-
|
|
442
|
-
if result.images && result.images.length > 1
|
|
443
|
-
result.images.each do |image|
|
|
444
|
-
expect(image.page_number).to be >= 1
|
|
445
|
-
end
|
|
446
|
-
end
|
|
447
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
448
|
-
skip 'Test file not available'
|
|
449
|
-
end
|
|
450
|
-
end
|
|
451
|
-
|
|
452
|
-
it 'preserves image order within document' do
|
|
453
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
454
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
455
|
-
extract_images: true
|
|
456
|
-
)
|
|
457
|
-
)
|
|
458
|
-
|
|
459
|
-
begin
|
|
460
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
461
|
-
|
|
462
|
-
if result.images && result.images.length > 1
|
|
463
|
-
(0...(result.images.length - 1)).each do |i|
|
|
464
|
-
expect(result.images[i].image_index).to be <= result.images[i + 1].image_index
|
|
465
|
-
end
|
|
466
|
-
end
|
|
467
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
468
|
-
skip 'Test file not available'
|
|
469
|
-
end
|
|
470
|
-
end
|
|
471
|
-
|
|
472
|
-
it 'handles multiple file batch extraction with images' do
|
|
473
|
-
paths = []
|
|
474
|
-
2.times do |i|
|
|
475
|
-
file = Tempfile.new("batch_image_test_#{i}.txt")
|
|
476
|
-
file.write("Image extraction test #{i}")
|
|
477
|
-
file.close
|
|
478
|
-
paths << file.path
|
|
479
|
-
end
|
|
480
|
-
|
|
481
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
482
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
483
|
-
extract_images: true
|
|
484
|
-
)
|
|
485
|
-
)
|
|
486
|
-
|
|
487
|
-
results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
|
|
488
|
-
|
|
489
|
-
expect(results).to be_a(Array)
|
|
490
|
-
expect(results.length).to eq(2)
|
|
491
|
-
expect(results).to all(be_a(Kreuzberg::Result))
|
|
492
|
-
ensure
|
|
493
|
-
paths.each { |p| FileUtils.rm_f(p) }
|
|
494
|
-
end
|
|
495
|
-
|
|
496
|
-
it 'maintains correct image count across batch operations' do
|
|
497
|
-
paths = []
|
|
498
|
-
2.times do |i|
|
|
499
|
-
file = Tempfile.new("batch_count_#{i}.txt")
|
|
500
|
-
file.write("Content #{i}")
|
|
501
|
-
file.close
|
|
502
|
-
paths << file.path
|
|
503
|
-
end
|
|
504
|
-
|
|
505
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
506
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
507
|
-
extract_images: true
|
|
508
|
-
)
|
|
509
|
-
)
|
|
510
|
-
|
|
511
|
-
results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
|
|
512
|
-
|
|
513
|
-
expect(results.length).to eq(paths.length)
|
|
514
|
-
expect(results).to all(be_a(Kreuzberg::Result))
|
|
515
|
-
ensure
|
|
516
|
-
paths.each { |p| FileUtils.rm_f(p) }
|
|
517
|
-
end
|
|
518
|
-
end
|
|
519
|
-
|
|
520
|
-
describe 'ImageExtraction configuration integration' do
|
|
521
|
-
it 'applies different DPI settings to affect extraction behavior' do
|
|
522
|
-
config_low = Kreuzberg::Config::Extraction.new(
|
|
523
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
524
|
-
extract_images: true,
|
|
525
|
-
target_dpi: 72
|
|
526
|
-
)
|
|
527
|
-
)
|
|
528
|
-
config_high = Kreuzberg::Config::Extraction.new(
|
|
529
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
530
|
-
extract_images: true,
|
|
531
|
-
target_dpi: 300
|
|
532
|
-
)
|
|
533
|
-
)
|
|
534
|
-
|
|
535
|
-
begin
|
|
536
|
-
result_low = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config_low)
|
|
537
|
-
result_high = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config_high)
|
|
538
|
-
|
|
539
|
-
# Both configurations should produce valid extraction
|
|
540
|
-
expect(result_low).not_to be_nil
|
|
541
|
-
expect(result_high).not_to be_nil
|
|
542
|
-
# Different DPI settings should be accepted
|
|
543
|
-
expect([result_low, result_high]).to all(be_a(Kreuzberg::Result))
|
|
544
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
545
|
-
skip 'Test file not available'
|
|
546
|
-
end
|
|
547
|
-
end
|
|
548
|
-
|
|
549
|
-
it 'respects extract_images false disables extraction' do
|
|
550
|
-
config_enabled = Kreuzberg::Config::Extraction.new(
|
|
551
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
552
|
-
extract_images: true
|
|
553
|
-
)
|
|
554
|
-
)
|
|
555
|
-
config_disabled = Kreuzberg::Config::Extraction.new(
|
|
556
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
557
|
-
extract_images: false
|
|
558
|
-
)
|
|
559
|
-
)
|
|
560
|
-
|
|
561
|
-
begin
|
|
562
|
-
result_enabled = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config_enabled)
|
|
563
|
-
result_disabled = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config_disabled)
|
|
564
|
-
|
|
565
|
-
# Enabled should extract if images present
|
|
566
|
-
expect(result_enabled).not_to be_nil
|
|
567
|
-
# Disabled should return nil or empty images
|
|
568
|
-
expect(result_disabled.images).to be_empty if result_disabled.images
|
|
569
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
570
|
-
skip 'Test file not available'
|
|
571
|
-
end
|
|
572
|
-
end
|
|
573
|
-
|
|
574
|
-
it 'handles dimension constraints realistically' do
|
|
575
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
576
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
577
|
-
extract_images: true,
|
|
578
|
-
max_image_dimension: 1024
|
|
579
|
-
)
|
|
580
|
-
)
|
|
581
|
-
|
|
582
|
-
begin
|
|
583
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
584
|
-
|
|
585
|
-
expect(result).not_to be_nil
|
|
586
|
-
# Dimension constraint should be applied
|
|
587
|
-
if result.images && !result.images.empty?
|
|
588
|
-
result.images.each do |image|
|
|
589
|
-
# Image should respect dimension constraints
|
|
590
|
-
expect(image).not_to be_nil
|
|
591
|
-
end
|
|
592
|
-
end
|
|
593
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
594
|
-
skip 'Test file not available'
|
|
595
|
-
end
|
|
596
|
-
end
|
|
597
|
-
end
|
|
598
|
-
|
|
599
|
-
describe 'Integration with Extraction config' do
|
|
600
|
-
it 'accepts ImageExtraction config in Extraction' do
|
|
601
|
-
image_config = Kreuzberg::Config::ImageExtraction.new(
|
|
602
|
-
extract_images: true,
|
|
603
|
-
target_dpi: 600
|
|
604
|
-
)
|
|
605
|
-
config = Kreuzberg::Config::Extraction.new(image_extraction: image_config)
|
|
606
|
-
|
|
607
|
-
expect(config.image_extraction).to be_a(Kreuzberg::Config::ImageExtraction)
|
|
608
|
-
expect(config.image_extraction.target_dpi).to eq(600)
|
|
609
|
-
end
|
|
610
|
-
|
|
611
|
-
it 'accepts image extraction config as hash in Extraction' do
|
|
612
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
613
|
-
image_extraction: {
|
|
614
|
-
extract_images: true,
|
|
615
|
-
target_dpi: 600,
|
|
616
|
-
max_image_dimension: 3000
|
|
617
|
-
}
|
|
618
|
-
)
|
|
619
|
-
|
|
620
|
-
expect(config.image_extraction).to be_a(Kreuzberg::Config::ImageExtraction)
|
|
621
|
-
expect(config.image_extraction.extract_images).to be true
|
|
622
|
-
expect(config.image_extraction.target_dpi).to eq(600)
|
|
623
|
-
expect(config.image_extraction.max_image_dimension).to eq(3000)
|
|
624
|
-
end
|
|
625
|
-
|
|
626
|
-
it 'includes image extraction config in to_h' do
|
|
627
|
-
image_config = Kreuzberg::Config::ImageExtraction.new(
|
|
628
|
-
extract_images: true,
|
|
629
|
-
target_dpi: 600
|
|
630
|
-
)
|
|
631
|
-
config = Kreuzberg::Config::Extraction.new(image_extraction: image_config)
|
|
632
|
-
|
|
633
|
-
hash = config.to_h
|
|
634
|
-
|
|
635
|
-
expect(hash).to include(:image_extraction)
|
|
636
|
-
expect(hash[:image_extraction]).to be_a(Hash)
|
|
637
|
-
expect(hash[:image_extraction][:extract_images]).to be true
|
|
638
|
-
expect(hash[:image_extraction][:target_dpi]).to eq(600)
|
|
639
|
-
end
|
|
640
|
-
|
|
641
|
-
it 'combines image extraction with other configurations' do
|
|
642
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
643
|
-
use_cache: true,
|
|
644
|
-
force_ocr: true,
|
|
645
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
646
|
-
extract_images: true,
|
|
647
|
-
target_dpi: 600
|
|
648
|
-
),
|
|
649
|
-
ocr: Kreuzberg::Config::OCR.new(
|
|
650
|
-
backend: 'tesseract',
|
|
651
|
-
language: 'eng'
|
|
652
|
-
)
|
|
653
|
-
)
|
|
654
|
-
|
|
655
|
-
expect(config.use_cache).to be true
|
|
656
|
-
expect(config.force_ocr).to be true
|
|
657
|
-
expect(config.image_extraction.target_dpi).to eq(600)
|
|
658
|
-
expect(config.ocr.backend).to eq('tesseract')
|
|
659
|
-
end
|
|
660
|
-
|
|
661
|
-
it 'handles nil image extraction config' do
|
|
662
|
-
config = Kreuzberg::Config::Extraction.new(image_extraction: nil)
|
|
663
|
-
|
|
664
|
-
expect(config.image_extraction).to be_nil
|
|
665
|
-
end
|
|
666
|
-
end
|
|
667
|
-
|
|
668
|
-
describe 'Image metadata validation in real extractions' do
|
|
669
|
-
it 'validates extracted images have complete required metadata' do
|
|
670
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
671
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
672
|
-
extract_images: true
|
|
673
|
-
)
|
|
674
|
-
)
|
|
675
|
-
|
|
676
|
-
begin
|
|
677
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
678
|
-
|
|
679
|
-
if result.images && !result.images.empty?
|
|
680
|
-
result.images.each do |image|
|
|
681
|
-
# All extracted images must have these fields populated
|
|
682
|
-
expect(image).not_to be_nil
|
|
683
|
-
expect(image.format).not_to be_nil, 'Format is required'
|
|
684
|
-
expect(image.format).not_to be_empty
|
|
685
|
-
expect(image.image_index).to be >= 0, 'Image index must be non-negative'
|
|
686
|
-
expect(image.data).not_to be_nil, 'Image data is required'
|
|
687
|
-
end
|
|
688
|
-
end
|
|
689
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
690
|
-
skip 'Test file not available'
|
|
691
|
-
end
|
|
692
|
-
end
|
|
693
|
-
|
|
694
|
-
it 'includes optional metadata fields appropriately' do
|
|
695
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
696
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
697
|
-
extract_images: true,
|
|
698
|
-
target_dpi: 150
|
|
699
|
-
)
|
|
700
|
-
)
|
|
701
|
-
|
|
702
|
-
begin
|
|
703
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
704
|
-
|
|
705
|
-
if result.images && !result.images.empty?
|
|
706
|
-
result.images.each do |image|
|
|
707
|
-
# Optional fields should be valid when present
|
|
708
|
-
expect(image.width).to be > 0, 'Width should be positive when present' if image.width
|
|
709
|
-
expect(image.height).to be > 0, 'Height should be positive when present' if image.height
|
|
710
|
-
expect(image.page_number).to be > 0, 'Page number should be positive' if image.page_number
|
|
711
|
-
end
|
|
712
|
-
end
|
|
713
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
714
|
-
skip 'Test file not available'
|
|
715
|
-
end
|
|
716
|
-
end
|
|
717
|
-
|
|
718
|
-
it 'ensures multiple images have different indices' do
|
|
719
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
720
|
-
image_extraction: Kreuzberg::Config::ImageExtraction.new(
|
|
721
|
-
extract_images: true
|
|
722
|
-
)
|
|
723
|
-
)
|
|
724
|
-
|
|
725
|
-
begin
|
|
726
|
-
result = Kreuzberg.extract_file_sync(path: test_document_path('pdf/with_images.pdf'), config: config)
|
|
727
|
-
|
|
728
|
-
if result.images && result.images.length > 1
|
|
729
|
-
indices = result.images.map(&:image_index)
|
|
730
|
-
unique_indices = indices.uniq
|
|
731
|
-
expect(unique_indices.length).to eq(indices.length), 'Each image should have unique index'
|
|
732
|
-
end
|
|
733
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
734
|
-
skip 'Test file not available'
|
|
735
|
-
end
|
|
736
|
-
end
|
|
737
|
-
end
|
|
738
|
-
end
|