kreuzberg 4.0.0.pre.rc.8 → 4.0.0.pre.rc.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +4 -104
- data/README.md +454 -432
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6941 -6721
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3135
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -182
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -46
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -32
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -85
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -103
- data/lib/pdfium.dll +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -537
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +45 -0
- data/vendor/kreuzberg/Cargo.toml +61 -38
- data/vendor/kreuzberg/README.md +230 -221
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -891
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1171
- data/vendor/kreuzberg/src/embeddings.rs +500 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -569
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -562
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -673
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -328
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -66
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -417
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -161
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/build.rs +176 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
- data/vendor/kreuzberg-tesseract/LICENSE +22 -0
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1354 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +44 -81
- data/vendor/rb-sys/bin/release.sh +0 -21
|
@@ -1,698 +1,698 @@
|
|
|
1
|
-
//! OCR error handling and edge case tests.
|
|
2
|
-
//!
|
|
3
|
-
//! This module tests OCR error scenarios to ensure robust error handling:
|
|
4
|
-
//! - Invalid configurations (bad language codes, invalid PSM values)
|
|
5
|
-
//! - Corrupted or invalid image inputs
|
|
6
|
-
//! - Missing dependencies (Tesseract not installed)
|
|
7
|
-
//! - Cache-related errors
|
|
8
|
-
//! - Concurrent processing scenarios
|
|
9
|
-
//!
|
|
10
|
-
//! Test philosophy:
|
|
11
|
-
//! - Verify graceful handling of all error conditions
|
|
12
|
-
//! - Ensure error messages are informative
|
|
13
|
-
//! - Test recovery from transient failures
|
|
14
|
-
//! - Validate resource limits and constraints
|
|
15
|
-
|
|
16
|
-
#![cfg(feature = "ocr")]
|
|
17
|
-
|
|
18
|
-
mod helpers;
|
|
19
|
-
|
|
20
|
-
use helpers::*;
|
|
21
|
-
use kreuzberg::core::config::{ExtractionConfig, OcrConfig};
|
|
22
|
-
use kreuzberg::types::TesseractConfig;
|
|
23
|
-
use kreuzberg::{KreuzbergError, extract_bytes_sync, extract_file_sync};
|
|
24
|
-
|
|
25
|
-
#[test]
|
|
26
|
-
fn test_ocr_invalid_language_code() {
|
|
27
|
-
if skip_if_missing("images/test_hello_world.png") {
|
|
28
|
-
return;
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
32
|
-
let config = ExtractionConfig {
|
|
33
|
-
ocr: Some(OcrConfig {
|
|
34
|
-
backend: "tesseract".to_string(),
|
|
35
|
-
language: "invalid_lang_99999".to_string(),
|
|
36
|
-
tesseract_config: None,
|
|
37
|
-
}),
|
|
38
|
-
force_ocr: false,
|
|
39
|
-
..Default::default()
|
|
40
|
-
};
|
|
41
|
-
|
|
42
|
-
let result = extract_file_sync(&file_path, None, &config);
|
|
43
|
-
|
|
44
|
-
match result {
|
|
45
|
-
Err(KreuzbergError::Ocr { message, .. }) => {
|
|
46
|
-
tracing::debug!("Expected OCR error for invalid language: {}", message);
|
|
47
|
-
assert!(
|
|
48
|
-
message.contains("language") || message.contains("lang") || message.contains("invalid"),
|
|
49
|
-
"Error message should mention language issue: {}",
|
|
50
|
-
message
|
|
51
|
-
);
|
|
52
|
-
}
|
|
53
|
-
Err(e) => {
|
|
54
|
-
tracing::debug!("Invalid language produced error: {}", e);
|
|
55
|
-
}
|
|
56
|
-
Ok(_) => {
|
|
57
|
-
tracing::debug!("Invalid language was accepted (fallback behavior)");
|
|
58
|
-
}
|
|
59
|
-
}
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
#[test]
|
|
63
|
-
fn test_ocr_invalid_psm_mode() {
|
|
64
|
-
if skip_if_missing("images/test_hello_world.png") {
|
|
65
|
-
return;
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
69
|
-
let config = ExtractionConfig {
|
|
70
|
-
ocr: Some(OcrConfig {
|
|
71
|
-
backend: "tesseract".to_string(),
|
|
72
|
-
language: "eng".to_string(),
|
|
73
|
-
tesseract_config: Some(TesseractConfig {
|
|
74
|
-
psm: 999,
|
|
75
|
-
..Default::default()
|
|
76
|
-
}),
|
|
77
|
-
}),
|
|
78
|
-
force_ocr: false,
|
|
79
|
-
..Default::default()
|
|
80
|
-
};
|
|
81
|
-
|
|
82
|
-
let result = extract_file_sync(&file_path, None, &config);
|
|
83
|
-
|
|
84
|
-
match result {
|
|
85
|
-
Err(KreuzbergError::Ocr { message, .. }) | Err(KreuzbergError::Validation { message, .. }) => {
|
|
86
|
-
tracing::debug!("Expected error for invalid PSM: {}", message);
|
|
87
|
-
assert!(
|
|
88
|
-
message.contains("psm") || message.contains("segmentation") || message.contains("mode"),
|
|
89
|
-
"Error message should mention PSM issue: {}",
|
|
90
|
-
message
|
|
91
|
-
);
|
|
92
|
-
}
|
|
93
|
-
Err(e) => {
|
|
94
|
-
tracing::debug!("Invalid PSM produced error: {}", e);
|
|
95
|
-
}
|
|
96
|
-
Ok(result) => {
|
|
97
|
-
tracing::debug!("Invalid PSM was accepted (fallback behavior)");
|
|
98
|
-
assert_non_empty_content(&result);
|
|
99
|
-
}
|
|
100
|
-
}
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
#[test]
|
|
104
|
-
fn test_ocr_invalid_backend_name() {
|
|
105
|
-
if skip_if_missing("images/test_hello_world.png") {
|
|
106
|
-
return;
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
110
|
-
let config = ExtractionConfig {
|
|
111
|
-
ocr: Some(OcrConfig {
|
|
112
|
-
backend: "nonexistent_ocr_backend_xyz".to_string(),
|
|
113
|
-
language: "eng".to_string(),
|
|
114
|
-
tesseract_config: None,
|
|
115
|
-
}),
|
|
116
|
-
force_ocr: false,
|
|
117
|
-
..Default::default()
|
|
118
|
-
};
|
|
119
|
-
|
|
120
|
-
let result = extract_file_sync(&file_path, None, &config);
|
|
121
|
-
|
|
122
|
-
match result {
|
|
123
|
-
Ok(extraction_result) => {
|
|
124
|
-
tracing::debug!("Invalid backend name ignored, fallback to Tesseract (expected behavior in Rust core)");
|
|
125
|
-
assert_non_empty_content(&extraction_result);
|
|
126
|
-
}
|
|
127
|
-
Err(KreuzbergError::Ocr { message, .. }) => {
|
|
128
|
-
tracing::debug!("OCR error for invalid backend: {}", message);
|
|
129
|
-
}
|
|
130
|
-
Err(KreuzbergError::MissingDependency(msg)) => {
|
|
131
|
-
tracing::debug!("MissingDependency error for invalid backend: {}", msg);
|
|
132
|
-
}
|
|
133
|
-
Err(KreuzbergError::Validation { message, .. }) => {
|
|
134
|
-
tracing::debug!("Validation error for invalid backend: {}", message);
|
|
135
|
-
}
|
|
136
|
-
Err(e) => {
|
|
137
|
-
tracing::debug!("Invalid backend produced error: {}", e);
|
|
138
|
-
}
|
|
139
|
-
}
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
#[test]
|
|
143
|
-
fn test_ocr_corrupted_image_data() {
|
|
144
|
-
let corrupted_data = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10];
|
|
145
|
-
let config = ExtractionConfig {
|
|
146
|
-
ocr: Some(OcrConfig {
|
|
147
|
-
backend: "tesseract".to_string(),
|
|
148
|
-
language: "eng".to_string(),
|
|
149
|
-
tesseract_config: None,
|
|
150
|
-
}),
|
|
151
|
-
force_ocr: true,
|
|
152
|
-
..Default::default()
|
|
153
|
-
};
|
|
154
|
-
|
|
155
|
-
let result = extract_bytes_sync(&corrupted_data, "image/jpeg", &config);
|
|
156
|
-
|
|
157
|
-
match result {
|
|
158
|
-
Err(KreuzbergError::ImageProcessing { message, .. })
|
|
159
|
-
| Err(KreuzbergError::Parsing { message, .. })
|
|
160
|
-
| Err(KreuzbergError::Ocr { message, .. }) => {
|
|
161
|
-
tracing::debug!("Expected error for corrupted image: {}", message);
|
|
162
|
-
}
|
|
163
|
-
Err(e) => {
|
|
164
|
-
tracing::debug!("Corrupted image produced error: {}", e);
|
|
165
|
-
}
|
|
166
|
-
Ok(_) => {
|
|
167
|
-
tracing::debug!("Corrupted image was processed (partial success)");
|
|
168
|
-
}
|
|
169
|
-
}
|
|
170
|
-
}
|
|
171
|
-
|
|
172
|
-
#[test]
|
|
173
|
-
fn test_ocr_empty_image() {
|
|
174
|
-
let empty_data = vec![];
|
|
175
|
-
let config = ExtractionConfig {
|
|
176
|
-
ocr: Some(OcrConfig {
|
|
177
|
-
backend: "tesseract".to_string(),
|
|
178
|
-
language: "eng".to_string(),
|
|
179
|
-
tesseract_config: None,
|
|
180
|
-
}),
|
|
181
|
-
force_ocr: true,
|
|
182
|
-
..Default::default()
|
|
183
|
-
};
|
|
184
|
-
|
|
185
|
-
let result = extract_bytes_sync(&empty_data, "image/png", &config);
|
|
186
|
-
|
|
187
|
-
assert!(result.is_err(), "Empty image data should produce an error");
|
|
188
|
-
|
|
189
|
-
match result {
|
|
190
|
-
Err(KreuzbergError::Validation { message, .. })
|
|
191
|
-
| Err(KreuzbergError::Parsing { message, .. })
|
|
192
|
-
| Err(KreuzbergError::ImageProcessing { message, .. }) => {
|
|
193
|
-
tracing::debug!("Expected error for empty image: {}", message);
|
|
194
|
-
}
|
|
195
|
-
Err(e) => {
|
|
196
|
-
tracing::debug!("Empty image produced error: {}", e);
|
|
197
|
-
}
|
|
198
|
-
Ok(_) => unreachable!(),
|
|
199
|
-
}
|
|
200
|
-
}
|
|
201
|
-
|
|
202
|
-
#[test]
|
|
203
|
-
fn test_ocr_non_image_data() {
|
|
204
|
-
let text_data = b"This is plain text, not an image";
|
|
205
|
-
let config = ExtractionConfig {
|
|
206
|
-
ocr: Some(OcrConfig {
|
|
207
|
-
backend: "tesseract".to_string(),
|
|
208
|
-
language: "eng".to_string(),
|
|
209
|
-
tesseract_config: None,
|
|
210
|
-
}),
|
|
211
|
-
force_ocr: true,
|
|
212
|
-
..Default::default()
|
|
213
|
-
};
|
|
214
|
-
|
|
215
|
-
let result = extract_bytes_sync(text_data, "image/png", &config);
|
|
216
|
-
|
|
217
|
-
match result {
|
|
218
|
-
Err(KreuzbergError::Parsing { message, .. }) | Err(KreuzbergError::ImageProcessing { message, .. }) => {
|
|
219
|
-
tracing::debug!("Expected error for non-image data: {}", message);
|
|
220
|
-
}
|
|
221
|
-
Err(e) => {
|
|
222
|
-
tracing::debug!("Non-image data produced error: {}", e);
|
|
223
|
-
}
|
|
224
|
-
Ok(_) => {
|
|
225
|
-
tracing::debug!("Non-image data was accepted");
|
|
226
|
-
}
|
|
227
|
-
}
|
|
228
|
-
}
|
|
229
|
-
|
|
230
|
-
#[test]
|
|
231
|
-
fn test_ocr_extreme_table_threshold() {
|
|
232
|
-
if skip_if_missing("tables/simple_table.png") {
|
|
233
|
-
return;
|
|
234
|
-
}
|
|
235
|
-
|
|
236
|
-
let file_path = get_test_file_path("tables/simple_table.png");
|
|
237
|
-
let config = ExtractionConfig {
|
|
238
|
-
ocr: Some(OcrConfig {
|
|
239
|
-
backend: "tesseract".to_string(),
|
|
240
|
-
language: "eng".to_string(),
|
|
241
|
-
tesseract_config: Some(TesseractConfig {
|
|
242
|
-
enable_table_detection: true,
|
|
243
|
-
table_min_confidence: 1.5,
|
|
244
|
-
table_column_threshold: -50,
|
|
245
|
-
table_row_threshold_ratio: 10.0,
|
|
246
|
-
..Default::default()
|
|
247
|
-
}),
|
|
248
|
-
}),
|
|
249
|
-
force_ocr: false,
|
|
250
|
-
..Default::default()
|
|
251
|
-
};
|
|
252
|
-
|
|
253
|
-
let result = extract_file_sync(&file_path, None, &config);
|
|
254
|
-
|
|
255
|
-
match result {
|
|
256
|
-
Ok(extraction_result) => {
|
|
257
|
-
tracing::debug!("Extreme table config was accepted (values may be clamped)");
|
|
258
|
-
assert_non_empty_content(&extraction_result);
|
|
259
|
-
}
|
|
260
|
-
Err(KreuzbergError::Validation { message, .. }) => {
|
|
261
|
-
tracing::debug!("Configuration validation caught extreme values: {}", message);
|
|
262
|
-
}
|
|
263
|
-
Err(e) => {
|
|
264
|
-
tracing::debug!("Extreme table config produced error: {}", e);
|
|
265
|
-
}
|
|
266
|
-
}
|
|
267
|
-
}
|
|
268
|
-
|
|
269
|
-
#[test]
|
|
270
|
-
fn test_ocr_negative_psm() {
|
|
271
|
-
if skip_if_missing("images/test_hello_world.png") {
|
|
272
|
-
return;
|
|
273
|
-
}
|
|
274
|
-
|
|
275
|
-
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
276
|
-
let config = ExtractionConfig {
|
|
277
|
-
ocr: Some(OcrConfig {
|
|
278
|
-
backend: "tesseract".to_string(),
|
|
279
|
-
language: "eng".to_string(),
|
|
280
|
-
tesseract_config: Some(TesseractConfig {
|
|
281
|
-
psm: -5,
|
|
282
|
-
..Default::default()
|
|
283
|
-
}),
|
|
284
|
-
}),
|
|
285
|
-
force_ocr: false,
|
|
286
|
-
..Default::default()
|
|
287
|
-
};
|
|
288
|
-
|
|
289
|
-
let result = extract_file_sync(&file_path, None, &config);
|
|
290
|
-
|
|
291
|
-
match result {
|
|
292
|
-
Ok(_) => {
|
|
293
|
-
tracing::debug!("Negative PSM was accepted (clamped or default used)");
|
|
294
|
-
}
|
|
295
|
-
Err(e) => {
|
|
296
|
-
tracing::debug!("Negative PSM produced error: {}", e);
|
|
297
|
-
}
|
|
298
|
-
}
|
|
299
|
-
}
|
|
300
|
-
|
|
301
|
-
#[test]
|
|
302
|
-
fn test_ocr_empty_whitelist() {
|
|
303
|
-
if skip_if_missing("images/test_hello_world.png") {
|
|
304
|
-
return;
|
|
305
|
-
}
|
|
306
|
-
|
|
307
|
-
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
308
|
-
let config = ExtractionConfig {
|
|
309
|
-
ocr: Some(OcrConfig {
|
|
310
|
-
backend: "tesseract".to_string(),
|
|
311
|
-
language: "eng".to_string(),
|
|
312
|
-
tesseract_config: Some(TesseractConfig {
|
|
313
|
-
tessedit_char_whitelist: "".to_string(),
|
|
314
|
-
..Default::default()
|
|
315
|
-
}),
|
|
316
|
-
}),
|
|
317
|
-
force_ocr: false,
|
|
318
|
-
..Default::default()
|
|
319
|
-
};
|
|
320
|
-
|
|
321
|
-
let result = extract_file_sync(&file_path, None, &config);
|
|
322
|
-
|
|
323
|
-
match result {
|
|
324
|
-
Ok(extraction_result) => {
|
|
325
|
-
tracing::debug!(
|
|
326
|
-
"Empty whitelist accepted, content length: {}",
|
|
327
|
-
extraction_result.content.len()
|
|
328
|
-
);
|
|
329
|
-
}
|
|
330
|
-
Err(e) => {
|
|
331
|
-
tracing::debug!("Empty whitelist produced error: {}", e);
|
|
332
|
-
}
|
|
333
|
-
}
|
|
334
|
-
}
|
|
335
|
-
|
|
336
|
-
#[test]
|
|
337
|
-
fn test_ocr_conflicting_whitelist_blacklist() {
|
|
338
|
-
if skip_if_missing("images/test_hello_world.png") {
|
|
339
|
-
return;
|
|
340
|
-
}
|
|
341
|
-
|
|
342
|
-
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
343
|
-
let config = ExtractionConfig {
|
|
344
|
-
ocr: Some(OcrConfig {
|
|
345
|
-
backend: "tesseract".to_string(),
|
|
346
|
-
language: "eng".to_string(),
|
|
347
|
-
tesseract_config: Some(TesseractConfig {
|
|
348
|
-
tessedit_char_whitelist: "abc".to_string(),
|
|
349
|
-
tessedit_char_blacklist: "abc".to_string(),
|
|
350
|
-
..Default::default()
|
|
351
|
-
}),
|
|
352
|
-
}),
|
|
353
|
-
force_ocr: false,
|
|
354
|
-
..Default::default()
|
|
355
|
-
};
|
|
356
|
-
|
|
357
|
-
let result = extract_file_sync(&file_path, None, &config);
|
|
358
|
-
|
|
359
|
-
match result {
|
|
360
|
-
Ok(extraction_result) => {
|
|
361
|
-
tracing::debug!(
|
|
362
|
-
"Conflicting whitelist/blacklist accepted: {}",
|
|
363
|
-
extraction_result.content.len()
|
|
364
|
-
);
|
|
365
|
-
}
|
|
366
|
-
Err(e) => {
|
|
367
|
-
tracing::debug!("Conflicting config produced error: {}", e);
|
|
368
|
-
}
|
|
369
|
-
}
|
|
370
|
-
}
|
|
371
|
-
|
|
372
|
-
#[test]
|
|
373
|
-
fn test_ocr_empty_language() {
|
|
374
|
-
if skip_if_missing("images/test_hello_world.png") {
|
|
375
|
-
return;
|
|
376
|
-
}
|
|
377
|
-
|
|
378
|
-
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
379
|
-
let config = ExtractionConfig {
|
|
380
|
-
ocr: Some(OcrConfig {
|
|
381
|
-
backend: "tesseract".to_string(),
|
|
382
|
-
language: "".to_string(),
|
|
383
|
-
tesseract_config: None,
|
|
384
|
-
}),
|
|
385
|
-
force_ocr: false,
|
|
386
|
-
..Default::default()
|
|
387
|
-
};
|
|
388
|
-
|
|
389
|
-
let result = extract_file_sync(&file_path, None, &config);
|
|
390
|
-
|
|
391
|
-
match result {
|
|
392
|
-
Ok(_) => {
|
|
393
|
-
tracing::debug!("Empty language accepted (fallback to default)");
|
|
394
|
-
}
|
|
395
|
-
Err(KreuzbergError::Validation { message, .. }) | Err(KreuzbergError::Ocr { message, .. }) => {
|
|
396
|
-
tracing::debug!("Empty language rejected: {}", message);
|
|
397
|
-
}
|
|
398
|
-
Err(e) => {
|
|
399
|
-
tracing::debug!("Empty language produced error: {}", e);
|
|
400
|
-
}
|
|
401
|
-
}
|
|
402
|
-
}
|
|
403
|
-
|
|
404
|
-
#[test]
|
|
405
|
-
fn test_ocr_malformed_multi_language() {
|
|
406
|
-
if skip_if_missing("images/test_hello_world.png") {
|
|
407
|
-
return;
|
|
408
|
-
}
|
|
409
|
-
|
|
410
|
-
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
411
|
-
let config = ExtractionConfig {
|
|
412
|
-
ocr: Some(OcrConfig {
|
|
413
|
-
backend: "tesseract".to_string(),
|
|
414
|
-
language: "eng++deu++fra".to_string(),
|
|
415
|
-
tesseract_config: None,
|
|
416
|
-
}),
|
|
417
|
-
force_ocr: false,
|
|
418
|
-
..Default::default()
|
|
419
|
-
};
|
|
420
|
-
|
|
421
|
-
let result = extract_file_sync(&file_path, None, &config);
|
|
422
|
-
|
|
423
|
-
match result {
|
|
424
|
-
Ok(_) => {
|
|
425
|
-
tracing::debug!("Malformed multi-language accepted (parser tolerant)");
|
|
426
|
-
}
|
|
427
|
-
Err(e) => {
|
|
428
|
-
tracing::debug!("Malformed language string produced error: {}", e);
|
|
429
|
-
}
|
|
430
|
-
}
|
|
431
|
-
}
|
|
432
|
-
|
|
433
|
-
#[test]
|
|
434
|
-
fn test_ocr_cache_disabled_then_enabled() {
|
|
435
|
-
if skip_if_missing("images/ocr_image.jpg") {
|
|
436
|
-
return;
|
|
437
|
-
}
|
|
438
|
-
|
|
439
|
-
let file_path = get_test_file_path("images/ocr_image.jpg");
|
|
440
|
-
|
|
441
|
-
let config_no_cache = ExtractionConfig {
|
|
442
|
-
ocr: Some(OcrConfig {
|
|
443
|
-
backend: "tesseract".to_string(),
|
|
444
|
-
language: "eng".to_string(),
|
|
445
|
-
tesseract_config: Some(TesseractConfig {
|
|
446
|
-
use_cache: false,
|
|
447
|
-
..Default::default()
|
|
448
|
-
}),
|
|
449
|
-
}),
|
|
450
|
-
force_ocr: false,
|
|
451
|
-
use_cache: false,
|
|
452
|
-
..Default::default()
|
|
453
|
-
};
|
|
454
|
-
|
|
455
|
-
let result1 = extract_file_sync(&file_path, None, &config_no_cache);
|
|
456
|
-
if matches!(result1, Err(KreuzbergError::MissingDependency(_))) {
|
|
457
|
-
return;
|
|
458
|
-
}
|
|
459
|
-
assert!(result1.is_ok(), "First extraction should succeed");
|
|
460
|
-
|
|
461
|
-
let config_with_cache = ExtractionConfig {
|
|
462
|
-
ocr: Some(OcrConfig {
|
|
463
|
-
backend: "tesseract".to_string(),
|
|
464
|
-
language: "eng".to_string(),
|
|
465
|
-
tesseract_config: Some(TesseractConfig {
|
|
466
|
-
use_cache: true,
|
|
467
|
-
..Default::default()
|
|
468
|
-
}),
|
|
469
|
-
}),
|
|
470
|
-
force_ocr: false,
|
|
471
|
-
use_cache: true,
|
|
472
|
-
..Default::default()
|
|
473
|
-
};
|
|
474
|
-
|
|
475
|
-
let result2 = extract_file_sync(&file_path, None, &config_with_cache);
|
|
476
|
-
if matches!(result2, Err(KreuzbergError::MissingDependency(_))) {
|
|
477
|
-
return;
|
|
478
|
-
}
|
|
479
|
-
assert!(result2.is_ok(), "Second extraction should succeed");
|
|
480
|
-
|
|
481
|
-
assert_non_empty_content(&result1.unwrap());
|
|
482
|
-
assert_non_empty_content(&result2.unwrap());
|
|
483
|
-
}
|
|
484
|
-
|
|
485
|
-
#[test]
|
|
486
|
-
fn test_ocr_concurrent_same_file() {
|
|
487
|
-
if skip_if_missing("images/ocr_image.jpg") {
|
|
488
|
-
return;
|
|
489
|
-
}
|
|
490
|
-
|
|
491
|
-
use std::sync::Arc;
|
|
492
|
-
use std::thread;
|
|
493
|
-
|
|
494
|
-
let file_path = Arc::new(get_test_file_path("images/ocr_image.jpg"));
|
|
495
|
-
let config = Arc::new(ExtractionConfig {
|
|
496
|
-
ocr: Some(OcrConfig {
|
|
497
|
-
backend: "tesseract".to_string(),
|
|
498
|
-
language: "eng".to_string(),
|
|
499
|
-
tesseract_config: None,
|
|
500
|
-
}),
|
|
501
|
-
force_ocr: false,
|
|
502
|
-
use_cache: true,
|
|
503
|
-
..Default::default()
|
|
504
|
-
});
|
|
505
|
-
|
|
506
|
-
if matches!(
|
|
507
|
-
extract_file_sync(&*file_path, None, &config),
|
|
508
|
-
Err(KreuzbergError::MissingDependency(_))
|
|
509
|
-
) {
|
|
510
|
-
return;
|
|
511
|
-
}
|
|
512
|
-
|
|
513
|
-
let mut handles = vec![];
|
|
514
|
-
for i in 0..5 {
|
|
515
|
-
let file_path_clone = Arc::clone(&file_path);
|
|
516
|
-
let config_clone = Arc::clone(&config);
|
|
517
|
-
|
|
518
|
-
let handle = thread::spawn(move || {
|
|
519
|
-
let result = extract_file_sync(&*file_path_clone, None, &config_clone);
|
|
520
|
-
let success = result.is_ok();
|
|
521
|
-
match result {
|
|
522
|
-
Ok(extraction_result) => {
|
|
523
|
-
tracing::debug!("Thread {} succeeded", i);
|
|
524
|
-
assert_non_empty_content(&extraction_result);
|
|
525
|
-
}
|
|
526
|
-
Err(e) => {
|
|
527
|
-
tracing::debug!("Thread {} failed: {}", i, e);
|
|
528
|
-
}
|
|
529
|
-
}
|
|
530
|
-
success
|
|
531
|
-
});
|
|
532
|
-
|
|
533
|
-
handles.push(handle);
|
|
534
|
-
}
|
|
535
|
-
|
|
536
|
-
let successes: usize = handles.into_iter().map(|h| if h.join().unwrap() { 1 } else { 0 }).sum();
|
|
537
|
-
|
|
538
|
-
tracing::debug!("Concurrent processing: {}/5 threads succeeded", successes);
|
|
539
|
-
|
|
540
|
-
assert!(
|
|
541
|
-
successes >= 1,
|
|
542
|
-
"At least one concurrent thread should succeed (got {})",
|
|
543
|
-
successes
|
|
544
|
-
);
|
|
545
|
-
}
|
|
546
|
-
|
|
547
|
-
#[test]
|
|
548
|
-
fn test_ocr_concurrent_different_files() {
|
|
549
|
-
if skip_if_missing("images/ocr_image.jpg") || skip_if_missing("images/test_hello_world.png") {
|
|
550
|
-
return;
|
|
551
|
-
}
|
|
552
|
-
|
|
553
|
-
use std::sync::Arc;
|
|
554
|
-
use std::thread;
|
|
555
|
-
|
|
556
|
-
let files = Arc::new(vec![
|
|
557
|
-
get_test_file_path("images/ocr_image.jpg"),
|
|
558
|
-
get_test_file_path("images/test_hello_world.png"),
|
|
559
|
-
]);
|
|
560
|
-
|
|
561
|
-
let config = Arc::new(ExtractionConfig {
|
|
562
|
-
ocr: Some(OcrConfig {
|
|
563
|
-
backend: "tesseract".to_string(),
|
|
564
|
-
language: "eng".to_string(),
|
|
565
|
-
tesseract_config: None,
|
|
566
|
-
}),
|
|
567
|
-
force_ocr: false,
|
|
568
|
-
use_cache: true,
|
|
569
|
-
..Default::default()
|
|
570
|
-
});
|
|
571
|
-
|
|
572
|
-
if matches!(
|
|
573
|
-
extract_file_sync(&files[0], None, &config),
|
|
574
|
-
Err(KreuzbergError::MissingDependency(_))
|
|
575
|
-
) {
|
|
576
|
-
return;
|
|
577
|
-
}
|
|
578
|
-
|
|
579
|
-
let mut handles = vec![];
|
|
580
|
-
for (i, file_path) in files.iter().enumerate() {
|
|
581
|
-
let file_path_clone = file_path.clone();
|
|
582
|
-
let config_clone = Arc::clone(&config);
|
|
583
|
-
|
|
584
|
-
let handle = thread::spawn(move || {
|
|
585
|
-
let result = extract_file_sync(&file_path_clone, None, &config_clone);
|
|
586
|
-
match result {
|
|
587
|
-
Ok(extraction_result) => {
|
|
588
|
-
tracing::debug!("File {} extraction succeeded", i);
|
|
589
|
-
assert_non_empty_content(&extraction_result);
|
|
590
|
-
true
|
|
591
|
-
}
|
|
592
|
-
Err(e) => {
|
|
593
|
-
tracing::debug!("File {} extraction failed: {}", i, e);
|
|
594
|
-
false
|
|
595
|
-
}
|
|
596
|
-
}
|
|
597
|
-
});
|
|
598
|
-
|
|
599
|
-
handles.push(handle);
|
|
600
|
-
}
|
|
601
|
-
|
|
602
|
-
let successes: usize = handles.into_iter().map(|h| if h.join().unwrap() { 1 } else { 0 }).sum();
|
|
603
|
-
|
|
604
|
-
assert_eq!(
|
|
605
|
-
successes, 2,
|
|
606
|
-
"All concurrent threads should succeed with different files"
|
|
607
|
-
);
|
|
608
|
-
}
|
|
609
|
-
|
|
610
|
-
#[test]
|
|
611
|
-
fn test_ocr_with_preprocessing_extreme_dpi() {
|
|
612
|
-
if skip_if_missing("images/test_hello_world.png") {
|
|
613
|
-
return;
|
|
614
|
-
}
|
|
615
|
-
|
|
616
|
-
use kreuzberg::types::ImagePreprocessingConfig;
|
|
617
|
-
|
|
618
|
-
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
619
|
-
let config = ExtractionConfig {
|
|
620
|
-
ocr: Some(OcrConfig {
|
|
621
|
-
backend: "tesseract".to_string(),
|
|
622
|
-
language: "eng".to_string(),
|
|
623
|
-
tesseract_config: Some(TesseractConfig {
|
|
624
|
-
preprocessing: Some(ImagePreprocessingConfig {
|
|
625
|
-
target_dpi: 10000,
|
|
626
|
-
auto_rotate: true,
|
|
627
|
-
deskew: true,
|
|
628
|
-
denoise: false,
|
|
629
|
-
contrast_enhance: false,
|
|
630
|
-
binarization_method: "otsu".to_string(),
|
|
631
|
-
invert_colors: false,
|
|
632
|
-
}),
|
|
633
|
-
..Default::default()
|
|
634
|
-
}),
|
|
635
|
-
}),
|
|
636
|
-
force_ocr: false,
|
|
637
|
-
..Default::default()
|
|
638
|
-
};
|
|
639
|
-
|
|
640
|
-
let result = extract_file_sync(&file_path, None, &config);
|
|
641
|
-
|
|
642
|
-
match result {
|
|
643
|
-
Ok(extraction_result) => {
|
|
644
|
-
tracing::debug!("Extreme DPI accepted (clamped): {}", extraction_result.content.len());
|
|
645
|
-
}
|
|
646
|
-
Err(KreuzbergError::ImageProcessing { message, .. }) | Err(KreuzbergError::Validation { message, .. }) => {
|
|
647
|
-
tracing::debug!("Extreme DPI rejected: {}", message);
|
|
648
|
-
}
|
|
649
|
-
Err(e) => {
|
|
650
|
-
tracing::debug!("Extreme DPI produced error: {}", e);
|
|
651
|
-
}
|
|
652
|
-
}
|
|
653
|
-
}
|
|
654
|
-
|
|
655
|
-
#[test]
|
|
656
|
-
fn test_ocr_with_invalid_binarization_method() {
|
|
657
|
-
if skip_if_missing("images/test_hello_world.png") {
|
|
658
|
-
return;
|
|
659
|
-
}
|
|
660
|
-
|
|
661
|
-
use kreuzberg::types::ImagePreprocessingConfig;
|
|
662
|
-
|
|
663
|
-
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
664
|
-
let config = ExtractionConfig {
|
|
665
|
-
ocr: Some(OcrConfig {
|
|
666
|
-
backend: "tesseract".to_string(),
|
|
667
|
-
language: "eng".to_string(),
|
|
668
|
-
tesseract_config: Some(TesseractConfig {
|
|
669
|
-
preprocessing: Some(ImagePreprocessingConfig {
|
|
670
|
-
target_dpi: 300,
|
|
671
|
-
auto_rotate: true,
|
|
672
|
-
deskew: true,
|
|
673
|
-
denoise: false,
|
|
674
|
-
contrast_enhance: false,
|
|
675
|
-
binarization_method: "invalid_method_xyz".to_string(),
|
|
676
|
-
invert_colors: false,
|
|
677
|
-
}),
|
|
678
|
-
..Default::default()
|
|
679
|
-
}),
|
|
680
|
-
}),
|
|
681
|
-
force_ocr: false,
|
|
682
|
-
..Default::default()
|
|
683
|
-
};
|
|
684
|
-
|
|
685
|
-
let result = extract_file_sync(&file_path, None, &config);
|
|
686
|
-
|
|
687
|
-
match result {
|
|
688
|
-
Ok(_) => {
|
|
689
|
-
tracing::debug!("Invalid binarization method accepted (fallback used)");
|
|
690
|
-
}
|
|
691
|
-
Err(KreuzbergError::Validation { message, .. }) | Err(KreuzbergError::ImageProcessing { message, .. }) => {
|
|
692
|
-
tracing::debug!("Invalid binarization method rejected: {}", message);
|
|
693
|
-
}
|
|
694
|
-
Err(e) => {
|
|
695
|
-
tracing::debug!("Invalid binarization method produced error: {}", e);
|
|
696
|
-
}
|
|
697
|
-
}
|
|
698
|
-
}
|
|
1
|
+
//! OCR error handling and edge case tests.
|
|
2
|
+
//!
|
|
3
|
+
//! This module tests OCR error scenarios to ensure robust error handling:
|
|
4
|
+
//! - Invalid configurations (bad language codes, invalid PSM values)
|
|
5
|
+
//! - Corrupted or invalid image inputs
|
|
6
|
+
//! - Missing dependencies (Tesseract not installed)
|
|
7
|
+
//! - Cache-related errors
|
|
8
|
+
//! - Concurrent processing scenarios
|
|
9
|
+
//!
|
|
10
|
+
//! Test philosophy:
|
|
11
|
+
//! - Verify graceful handling of all error conditions
|
|
12
|
+
//! - Ensure error messages are informative
|
|
13
|
+
//! - Test recovery from transient failures
|
|
14
|
+
//! - Validate resource limits and constraints
|
|
15
|
+
|
|
16
|
+
#![cfg(feature = "ocr")]
|
|
17
|
+
|
|
18
|
+
mod helpers;
|
|
19
|
+
|
|
20
|
+
use helpers::*;
|
|
21
|
+
use kreuzberg::core::config::{ExtractionConfig, OcrConfig};
|
|
22
|
+
use kreuzberg::types::TesseractConfig;
|
|
23
|
+
use kreuzberg::{KreuzbergError, extract_bytes_sync, extract_file_sync};
|
|
24
|
+
|
|
25
|
+
#[test]
|
|
26
|
+
fn test_ocr_invalid_language_code() {
|
|
27
|
+
if skip_if_missing("images/test_hello_world.png") {
|
|
28
|
+
return;
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
32
|
+
let config = ExtractionConfig {
|
|
33
|
+
ocr: Some(OcrConfig {
|
|
34
|
+
backend: "tesseract".to_string(),
|
|
35
|
+
language: "invalid_lang_99999".to_string(),
|
|
36
|
+
tesseract_config: None,
|
|
37
|
+
}),
|
|
38
|
+
force_ocr: false,
|
|
39
|
+
..Default::default()
|
|
40
|
+
};
|
|
41
|
+
|
|
42
|
+
let result = extract_file_sync(&file_path, None, &config);
|
|
43
|
+
|
|
44
|
+
match result {
|
|
45
|
+
Err(KreuzbergError::Ocr { message, .. }) => {
|
|
46
|
+
tracing::debug!("Expected OCR error for invalid language: {}", message);
|
|
47
|
+
assert!(
|
|
48
|
+
message.contains("language") || message.contains("lang") || message.contains("invalid"),
|
|
49
|
+
"Error message should mention language issue: {}",
|
|
50
|
+
message
|
|
51
|
+
);
|
|
52
|
+
}
|
|
53
|
+
Err(e) => {
|
|
54
|
+
tracing::debug!("Invalid language produced error: {}", e);
|
|
55
|
+
}
|
|
56
|
+
Ok(_) => {
|
|
57
|
+
tracing::debug!("Invalid language was accepted (fallback behavior)");
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
#[test]
|
|
63
|
+
fn test_ocr_invalid_psm_mode() {
|
|
64
|
+
if skip_if_missing("images/test_hello_world.png") {
|
|
65
|
+
return;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
69
|
+
let config = ExtractionConfig {
|
|
70
|
+
ocr: Some(OcrConfig {
|
|
71
|
+
backend: "tesseract".to_string(),
|
|
72
|
+
language: "eng".to_string(),
|
|
73
|
+
tesseract_config: Some(TesseractConfig {
|
|
74
|
+
psm: 999,
|
|
75
|
+
..Default::default()
|
|
76
|
+
}),
|
|
77
|
+
}),
|
|
78
|
+
force_ocr: false,
|
|
79
|
+
..Default::default()
|
|
80
|
+
};
|
|
81
|
+
|
|
82
|
+
let result = extract_file_sync(&file_path, None, &config);
|
|
83
|
+
|
|
84
|
+
match result {
|
|
85
|
+
Err(KreuzbergError::Ocr { message, .. }) | Err(KreuzbergError::Validation { message, .. }) => {
|
|
86
|
+
tracing::debug!("Expected error for invalid PSM: {}", message);
|
|
87
|
+
assert!(
|
|
88
|
+
message.contains("psm") || message.contains("segmentation") || message.contains("mode"),
|
|
89
|
+
"Error message should mention PSM issue: {}",
|
|
90
|
+
message
|
|
91
|
+
);
|
|
92
|
+
}
|
|
93
|
+
Err(e) => {
|
|
94
|
+
tracing::debug!("Invalid PSM produced error: {}", e);
|
|
95
|
+
}
|
|
96
|
+
Ok(result) => {
|
|
97
|
+
tracing::debug!("Invalid PSM was accepted (fallback behavior)");
|
|
98
|
+
assert_non_empty_content(&result);
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
#[test]
|
|
104
|
+
fn test_ocr_invalid_backend_name() {
|
|
105
|
+
if skip_if_missing("images/test_hello_world.png") {
|
|
106
|
+
return;
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
110
|
+
let config = ExtractionConfig {
|
|
111
|
+
ocr: Some(OcrConfig {
|
|
112
|
+
backend: "nonexistent_ocr_backend_xyz".to_string(),
|
|
113
|
+
language: "eng".to_string(),
|
|
114
|
+
tesseract_config: None,
|
|
115
|
+
}),
|
|
116
|
+
force_ocr: false,
|
|
117
|
+
..Default::default()
|
|
118
|
+
};
|
|
119
|
+
|
|
120
|
+
let result = extract_file_sync(&file_path, None, &config);
|
|
121
|
+
|
|
122
|
+
match result {
|
|
123
|
+
Ok(extraction_result) => {
|
|
124
|
+
tracing::debug!("Invalid backend name ignored, fallback to Tesseract (expected behavior in Rust core)");
|
|
125
|
+
assert_non_empty_content(&extraction_result);
|
|
126
|
+
}
|
|
127
|
+
Err(KreuzbergError::Ocr { message, .. }) => {
|
|
128
|
+
tracing::debug!("OCR error for invalid backend: {}", message);
|
|
129
|
+
}
|
|
130
|
+
Err(KreuzbergError::MissingDependency(msg)) => {
|
|
131
|
+
tracing::debug!("MissingDependency error for invalid backend: {}", msg);
|
|
132
|
+
}
|
|
133
|
+
Err(KreuzbergError::Validation { message, .. }) => {
|
|
134
|
+
tracing::debug!("Validation error for invalid backend: {}", message);
|
|
135
|
+
}
|
|
136
|
+
Err(e) => {
|
|
137
|
+
tracing::debug!("Invalid backend produced error: {}", e);
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
#[test]
|
|
143
|
+
fn test_ocr_corrupted_image_data() {
|
|
144
|
+
let corrupted_data = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10];
|
|
145
|
+
let config = ExtractionConfig {
|
|
146
|
+
ocr: Some(OcrConfig {
|
|
147
|
+
backend: "tesseract".to_string(),
|
|
148
|
+
language: "eng".to_string(),
|
|
149
|
+
tesseract_config: None,
|
|
150
|
+
}),
|
|
151
|
+
force_ocr: true,
|
|
152
|
+
..Default::default()
|
|
153
|
+
};
|
|
154
|
+
|
|
155
|
+
let result = extract_bytes_sync(&corrupted_data, "image/jpeg", &config);
|
|
156
|
+
|
|
157
|
+
match result {
|
|
158
|
+
Err(KreuzbergError::ImageProcessing { message, .. })
|
|
159
|
+
| Err(KreuzbergError::Parsing { message, .. })
|
|
160
|
+
| Err(KreuzbergError::Ocr { message, .. }) => {
|
|
161
|
+
tracing::debug!("Expected error for corrupted image: {}", message);
|
|
162
|
+
}
|
|
163
|
+
Err(e) => {
|
|
164
|
+
tracing::debug!("Corrupted image produced error: {}", e);
|
|
165
|
+
}
|
|
166
|
+
Ok(_) => {
|
|
167
|
+
tracing::debug!("Corrupted image was processed (partial success)");
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
#[test]
|
|
173
|
+
fn test_ocr_empty_image() {
|
|
174
|
+
let empty_data = vec![];
|
|
175
|
+
let config = ExtractionConfig {
|
|
176
|
+
ocr: Some(OcrConfig {
|
|
177
|
+
backend: "tesseract".to_string(),
|
|
178
|
+
language: "eng".to_string(),
|
|
179
|
+
tesseract_config: None,
|
|
180
|
+
}),
|
|
181
|
+
force_ocr: true,
|
|
182
|
+
..Default::default()
|
|
183
|
+
};
|
|
184
|
+
|
|
185
|
+
let result = extract_bytes_sync(&empty_data, "image/png", &config);
|
|
186
|
+
|
|
187
|
+
assert!(result.is_err(), "Empty image data should produce an error");
|
|
188
|
+
|
|
189
|
+
match result {
|
|
190
|
+
Err(KreuzbergError::Validation { message, .. })
|
|
191
|
+
| Err(KreuzbergError::Parsing { message, .. })
|
|
192
|
+
| Err(KreuzbergError::ImageProcessing { message, .. }) => {
|
|
193
|
+
tracing::debug!("Expected error for empty image: {}", message);
|
|
194
|
+
}
|
|
195
|
+
Err(e) => {
|
|
196
|
+
tracing::debug!("Empty image produced error: {}", e);
|
|
197
|
+
}
|
|
198
|
+
Ok(_) => unreachable!(),
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
#[test]
|
|
203
|
+
fn test_ocr_non_image_data() {
|
|
204
|
+
let text_data = b"This is plain text, not an image";
|
|
205
|
+
let config = ExtractionConfig {
|
|
206
|
+
ocr: Some(OcrConfig {
|
|
207
|
+
backend: "tesseract".to_string(),
|
|
208
|
+
language: "eng".to_string(),
|
|
209
|
+
tesseract_config: None,
|
|
210
|
+
}),
|
|
211
|
+
force_ocr: true,
|
|
212
|
+
..Default::default()
|
|
213
|
+
};
|
|
214
|
+
|
|
215
|
+
let result = extract_bytes_sync(text_data, "image/png", &config);
|
|
216
|
+
|
|
217
|
+
match result {
|
|
218
|
+
Err(KreuzbergError::Parsing { message, .. }) | Err(KreuzbergError::ImageProcessing { message, .. }) => {
|
|
219
|
+
tracing::debug!("Expected error for non-image data: {}", message);
|
|
220
|
+
}
|
|
221
|
+
Err(e) => {
|
|
222
|
+
tracing::debug!("Non-image data produced error: {}", e);
|
|
223
|
+
}
|
|
224
|
+
Ok(_) => {
|
|
225
|
+
tracing::debug!("Non-image data was accepted");
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
#[test]
|
|
231
|
+
fn test_ocr_extreme_table_threshold() {
|
|
232
|
+
if skip_if_missing("tables/simple_table.png") {
|
|
233
|
+
return;
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
let file_path = get_test_file_path("tables/simple_table.png");
|
|
237
|
+
let config = ExtractionConfig {
|
|
238
|
+
ocr: Some(OcrConfig {
|
|
239
|
+
backend: "tesseract".to_string(),
|
|
240
|
+
language: "eng".to_string(),
|
|
241
|
+
tesseract_config: Some(TesseractConfig {
|
|
242
|
+
enable_table_detection: true,
|
|
243
|
+
table_min_confidence: 1.5,
|
|
244
|
+
table_column_threshold: -50,
|
|
245
|
+
table_row_threshold_ratio: 10.0,
|
|
246
|
+
..Default::default()
|
|
247
|
+
}),
|
|
248
|
+
}),
|
|
249
|
+
force_ocr: false,
|
|
250
|
+
..Default::default()
|
|
251
|
+
};
|
|
252
|
+
|
|
253
|
+
let result = extract_file_sync(&file_path, None, &config);
|
|
254
|
+
|
|
255
|
+
match result {
|
|
256
|
+
Ok(extraction_result) => {
|
|
257
|
+
tracing::debug!("Extreme table config was accepted (values may be clamped)");
|
|
258
|
+
assert_non_empty_content(&extraction_result);
|
|
259
|
+
}
|
|
260
|
+
Err(KreuzbergError::Validation { message, .. }) => {
|
|
261
|
+
tracing::debug!("Configuration validation caught extreme values: {}", message);
|
|
262
|
+
}
|
|
263
|
+
Err(e) => {
|
|
264
|
+
tracing::debug!("Extreme table config produced error: {}", e);
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
#[test]
|
|
270
|
+
fn test_ocr_negative_psm() {
|
|
271
|
+
if skip_if_missing("images/test_hello_world.png") {
|
|
272
|
+
return;
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
276
|
+
let config = ExtractionConfig {
|
|
277
|
+
ocr: Some(OcrConfig {
|
|
278
|
+
backend: "tesseract".to_string(),
|
|
279
|
+
language: "eng".to_string(),
|
|
280
|
+
tesseract_config: Some(TesseractConfig {
|
|
281
|
+
psm: -5,
|
|
282
|
+
..Default::default()
|
|
283
|
+
}),
|
|
284
|
+
}),
|
|
285
|
+
force_ocr: false,
|
|
286
|
+
..Default::default()
|
|
287
|
+
};
|
|
288
|
+
|
|
289
|
+
let result = extract_file_sync(&file_path, None, &config);
|
|
290
|
+
|
|
291
|
+
match result {
|
|
292
|
+
Ok(_) => {
|
|
293
|
+
tracing::debug!("Negative PSM was accepted (clamped or default used)");
|
|
294
|
+
}
|
|
295
|
+
Err(e) => {
|
|
296
|
+
tracing::debug!("Negative PSM produced error: {}", e);
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
#[test]
|
|
302
|
+
fn test_ocr_empty_whitelist() {
|
|
303
|
+
if skip_if_missing("images/test_hello_world.png") {
|
|
304
|
+
return;
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
308
|
+
let config = ExtractionConfig {
|
|
309
|
+
ocr: Some(OcrConfig {
|
|
310
|
+
backend: "tesseract".to_string(),
|
|
311
|
+
language: "eng".to_string(),
|
|
312
|
+
tesseract_config: Some(TesseractConfig {
|
|
313
|
+
tessedit_char_whitelist: "".to_string(),
|
|
314
|
+
..Default::default()
|
|
315
|
+
}),
|
|
316
|
+
}),
|
|
317
|
+
force_ocr: false,
|
|
318
|
+
..Default::default()
|
|
319
|
+
};
|
|
320
|
+
|
|
321
|
+
let result = extract_file_sync(&file_path, None, &config);
|
|
322
|
+
|
|
323
|
+
match result {
|
|
324
|
+
Ok(extraction_result) => {
|
|
325
|
+
tracing::debug!(
|
|
326
|
+
"Empty whitelist accepted, content length: {}",
|
|
327
|
+
extraction_result.content.len()
|
|
328
|
+
);
|
|
329
|
+
}
|
|
330
|
+
Err(e) => {
|
|
331
|
+
tracing::debug!("Empty whitelist produced error: {}", e);
|
|
332
|
+
}
|
|
333
|
+
}
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
#[test]
|
|
337
|
+
fn test_ocr_conflicting_whitelist_blacklist() {
|
|
338
|
+
if skip_if_missing("images/test_hello_world.png") {
|
|
339
|
+
return;
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
343
|
+
let config = ExtractionConfig {
|
|
344
|
+
ocr: Some(OcrConfig {
|
|
345
|
+
backend: "tesseract".to_string(),
|
|
346
|
+
language: "eng".to_string(),
|
|
347
|
+
tesseract_config: Some(TesseractConfig {
|
|
348
|
+
tessedit_char_whitelist: "abc".to_string(),
|
|
349
|
+
tessedit_char_blacklist: "abc".to_string(),
|
|
350
|
+
..Default::default()
|
|
351
|
+
}),
|
|
352
|
+
}),
|
|
353
|
+
force_ocr: false,
|
|
354
|
+
..Default::default()
|
|
355
|
+
};
|
|
356
|
+
|
|
357
|
+
let result = extract_file_sync(&file_path, None, &config);
|
|
358
|
+
|
|
359
|
+
match result {
|
|
360
|
+
Ok(extraction_result) => {
|
|
361
|
+
tracing::debug!(
|
|
362
|
+
"Conflicting whitelist/blacklist accepted: {}",
|
|
363
|
+
extraction_result.content.len()
|
|
364
|
+
);
|
|
365
|
+
}
|
|
366
|
+
Err(e) => {
|
|
367
|
+
tracing::debug!("Conflicting config produced error: {}", e);
|
|
368
|
+
}
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
#[test]
|
|
373
|
+
fn test_ocr_empty_language() {
|
|
374
|
+
if skip_if_missing("images/test_hello_world.png") {
|
|
375
|
+
return;
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
379
|
+
let config = ExtractionConfig {
|
|
380
|
+
ocr: Some(OcrConfig {
|
|
381
|
+
backend: "tesseract".to_string(),
|
|
382
|
+
language: "".to_string(),
|
|
383
|
+
tesseract_config: None,
|
|
384
|
+
}),
|
|
385
|
+
force_ocr: false,
|
|
386
|
+
..Default::default()
|
|
387
|
+
};
|
|
388
|
+
|
|
389
|
+
let result = extract_file_sync(&file_path, None, &config);
|
|
390
|
+
|
|
391
|
+
match result {
|
|
392
|
+
Ok(_) => {
|
|
393
|
+
tracing::debug!("Empty language accepted (fallback to default)");
|
|
394
|
+
}
|
|
395
|
+
Err(KreuzbergError::Validation { message, .. }) | Err(KreuzbergError::Ocr { message, .. }) => {
|
|
396
|
+
tracing::debug!("Empty language rejected: {}", message);
|
|
397
|
+
}
|
|
398
|
+
Err(e) => {
|
|
399
|
+
tracing::debug!("Empty language produced error: {}", e);
|
|
400
|
+
}
|
|
401
|
+
}
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
#[test]
|
|
405
|
+
fn test_ocr_malformed_multi_language() {
|
|
406
|
+
if skip_if_missing("images/test_hello_world.png") {
|
|
407
|
+
return;
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
411
|
+
let config = ExtractionConfig {
|
|
412
|
+
ocr: Some(OcrConfig {
|
|
413
|
+
backend: "tesseract".to_string(),
|
|
414
|
+
language: "eng++deu++fra".to_string(),
|
|
415
|
+
tesseract_config: None,
|
|
416
|
+
}),
|
|
417
|
+
force_ocr: false,
|
|
418
|
+
..Default::default()
|
|
419
|
+
};
|
|
420
|
+
|
|
421
|
+
let result = extract_file_sync(&file_path, None, &config);
|
|
422
|
+
|
|
423
|
+
match result {
|
|
424
|
+
Ok(_) => {
|
|
425
|
+
tracing::debug!("Malformed multi-language accepted (parser tolerant)");
|
|
426
|
+
}
|
|
427
|
+
Err(e) => {
|
|
428
|
+
tracing::debug!("Malformed language string produced error: {}", e);
|
|
429
|
+
}
|
|
430
|
+
}
|
|
431
|
+
}
|
|
432
|
+
|
|
433
|
+
#[test]
|
|
434
|
+
fn test_ocr_cache_disabled_then_enabled() {
|
|
435
|
+
if skip_if_missing("images/ocr_image.jpg") {
|
|
436
|
+
return;
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
let file_path = get_test_file_path("images/ocr_image.jpg");
|
|
440
|
+
|
|
441
|
+
let config_no_cache = ExtractionConfig {
|
|
442
|
+
ocr: Some(OcrConfig {
|
|
443
|
+
backend: "tesseract".to_string(),
|
|
444
|
+
language: "eng".to_string(),
|
|
445
|
+
tesseract_config: Some(TesseractConfig {
|
|
446
|
+
use_cache: false,
|
|
447
|
+
..Default::default()
|
|
448
|
+
}),
|
|
449
|
+
}),
|
|
450
|
+
force_ocr: false,
|
|
451
|
+
use_cache: false,
|
|
452
|
+
..Default::default()
|
|
453
|
+
};
|
|
454
|
+
|
|
455
|
+
let result1 = extract_file_sync(&file_path, None, &config_no_cache);
|
|
456
|
+
if matches!(result1, Err(KreuzbergError::MissingDependency(_))) {
|
|
457
|
+
return;
|
|
458
|
+
}
|
|
459
|
+
assert!(result1.is_ok(), "First extraction should succeed");
|
|
460
|
+
|
|
461
|
+
let config_with_cache = ExtractionConfig {
|
|
462
|
+
ocr: Some(OcrConfig {
|
|
463
|
+
backend: "tesseract".to_string(),
|
|
464
|
+
language: "eng".to_string(),
|
|
465
|
+
tesseract_config: Some(TesseractConfig {
|
|
466
|
+
use_cache: true,
|
|
467
|
+
..Default::default()
|
|
468
|
+
}),
|
|
469
|
+
}),
|
|
470
|
+
force_ocr: false,
|
|
471
|
+
use_cache: true,
|
|
472
|
+
..Default::default()
|
|
473
|
+
};
|
|
474
|
+
|
|
475
|
+
let result2 = extract_file_sync(&file_path, None, &config_with_cache);
|
|
476
|
+
if matches!(result2, Err(KreuzbergError::MissingDependency(_))) {
|
|
477
|
+
return;
|
|
478
|
+
}
|
|
479
|
+
assert!(result2.is_ok(), "Second extraction should succeed");
|
|
480
|
+
|
|
481
|
+
assert_non_empty_content(&result1.unwrap());
|
|
482
|
+
assert_non_empty_content(&result2.unwrap());
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
/// Runs OCR on the same image from five threads at once, sharing the path and
/// the configuration through `Arc`.
///
/// The test returns early when the fixture is missing or when a probe
/// extraction reports a missing dependency. It passes as long as at least one
/// worker thread extracts successfully; a worker that succeeds must also
/// produce non-empty content.
#[test]
fn test_ocr_concurrent_same_file() {
    use std::sync::Arc;
    use std::thread;

    if skip_if_missing("images/ocr_image.jpg") {
        return;
    }

    let shared_path = Arc::new(get_test_file_path("images/ocr_image.jpg"));
    let shared_config = Arc::new(ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            tesseract_config: None,
        }),
        force_ocr: false,
        use_cache: true,
        ..Default::default()
    });

    // Probe once on the current thread so the test can bail out before
    // spawning workers when the OCR dependency is unavailable.
    let dependency_missing = matches!(
        extract_file_sync(&*shared_path, None, &shared_config),
        Err(KreuzbergError::MissingDependency(_))
    );
    if dependency_missing {
        return;
    }

    // `collect` is eager, so every worker is spawned before any is joined.
    let workers: Vec<_> = (0..5)
        .map(|worker_id| {
            let path = Arc::clone(&shared_path);
            let config = Arc::clone(&shared_config);

            thread::spawn(move || match extract_file_sync(&*path, None, &config) {
                Ok(extracted) => {
                    tracing::debug!("Thread {} succeeded", worker_id);
                    assert_non_empty_content(&extracted);
                    true
                }
                Err(err) => {
                    tracing::debug!("Thread {} failed: {}", worker_id, err);
                    false
                }
            })
        })
        .collect();

    // A panicking worker propagates here through `unwrap`.
    let successes = workers
        .into_iter()
        .map(|worker| worker.join().unwrap())
        .filter(|&succeeded| succeeded)
        .count();

    tracing::debug!("Concurrent processing: {}/5 threads succeeded", successes);

    assert!(
        successes >= 1,
        "At least one concurrent thread should succeed (got {})",
        successes
    );
}
|
|
546
|
+
|
|
547
|
+
/// Runs OCR on two different images concurrently, one thread per file, with a
/// configuration shared through `Arc`.
///
/// The test returns early when either fixture is missing or when a probe
/// extraction of the first file reports a missing dependency. Both worker
/// threads must succeed and produce non-empty content.
#[test]
fn test_ocr_concurrent_different_files() {
    use std::sync::Arc;
    use std::thread;

    if skip_if_missing("images/ocr_image.jpg") || skip_if_missing("images/test_hello_world.png") {
        return;
    }

    let files = Arc::new(vec![
        get_test_file_path("images/ocr_image.jpg"),
        get_test_file_path("images/test_hello_world.png"),
    ]);

    let shared_config = Arc::new(ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            tesseract_config: None,
        }),
        force_ocr: false,
        use_cache: true,
        ..Default::default()
    });

    // Probe with the first file so the test can bail out before spawning
    // workers when the OCR dependency is unavailable.
    let dependency_missing = matches!(
        extract_file_sync(&files[0], None, &shared_config),
        Err(KreuzbergError::MissingDependency(_))
    );
    if dependency_missing {
        return;
    }

    // Each worker owns its own copy of the path; `collect` is eager, so all
    // workers are spawned before any is joined.
    let workers: Vec<_> = files
        .iter()
        .enumerate()
        .map(|(index, path)| {
            let owned_path = path.clone();
            let config = Arc::clone(&shared_config);

            thread::spawn(move || match extract_file_sync(&owned_path, None, &config) {
                Ok(extracted) => {
                    tracing::debug!("File {} extraction succeeded", index);
                    assert_non_empty_content(&extracted);
                    true
                }
                Err(err) => {
                    tracing::debug!("File {} extraction failed: {}", index, err);
                    false
                }
            })
        })
        .collect();

    // A panicking worker propagates here through `unwrap`.
    let successes = workers
        .into_iter()
        .map(|worker| worker.join().unwrap())
        .filter(|&succeeded| succeeded)
        .count();

    assert_eq!(
        successes, 2,
        "All concurrent threads should succeed with different files"
    );
}
|
|
609
|
+
|
|
610
|
+
/// Feeds an extreme `target_dpi` (10000) into the Tesseract preprocessing
/// configuration and runs extraction.
///
/// The test makes no assertion about the outcome: success, an
/// image-processing/validation error, and any other error are each only
/// logged. It therefore fails only if extraction panics.
#[test]
fn test_ocr_with_preprocessing_extreme_dpi() {
    use kreuzberg::types::ImagePreprocessingConfig;

    if skip_if_missing("images/test_hello_world.png") {
        return;
    }

    let file_path = get_test_file_path("images/test_hello_world.png");

    // Build the nested configuration from the inside out.
    let preprocessing = ImagePreprocessingConfig {
        target_dpi: 10000,
        auto_rotate: true,
        deskew: true,
        denoise: false,
        contrast_enhance: false,
        binarization_method: "otsu".to_string(),
        invert_colors: false,
    };
    let tesseract = TesseractConfig {
        preprocessing: Some(preprocessing),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            tesseract_config: Some(tesseract),
        }),
        force_ocr: false,
        ..Default::default()
    };

    match extract_file_sync(&file_path, None, &config) {
        Ok(extracted) => {
            tracing::debug!("Extreme DPI accepted (clamped): {}", extracted.content.len());
        }
        Err(KreuzbergError::ImageProcessing { message, .. } | KreuzbergError::Validation { message, .. }) => {
            tracing::debug!("Extreme DPI rejected: {}", message);
        }
        Err(other) => {
            tracing::debug!("Extreme DPI produced error: {}", other);
        }
    }
}
|
|
654
|
+
|
|
655
|
+
/// Passes an unrecognised `binarization_method` string into the Tesseract
/// preprocessing configuration and runs extraction.
///
/// The test makes no assertion about the outcome: success, a
/// validation/image-processing error, and any other error are each only
/// logged. It therefore fails only if extraction panics.
#[test]
fn test_ocr_with_invalid_binarization_method() {
    use kreuzberg::types::ImagePreprocessingConfig;

    if skip_if_missing("images/test_hello_world.png") {
        return;
    }

    let file_path = get_test_file_path("images/test_hello_world.png");

    // Build the nested configuration from the inside out.
    let preprocessing = ImagePreprocessingConfig {
        target_dpi: 300,
        auto_rotate: true,
        deskew: true,
        denoise: false,
        contrast_enhance: false,
        binarization_method: "invalid_method_xyz".to_string(),
        invert_colors: false,
    };
    let tesseract = TesseractConfig {
        preprocessing: Some(preprocessing),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            tesseract_config: Some(tesseract),
        }),
        force_ocr: false,
        ..Default::default()
    };

    match extract_file_sync(&file_path, None, &config) {
        Ok(_) => {
            tracing::debug!("Invalid binarization method accepted (fallback used)");
        }
        Err(KreuzbergError::Validation { message, .. } | KreuzbergError::ImageProcessing { message, .. }) => {
            tracing::debug!("Invalid binarization method rejected: {}", message);
        }
        Err(other) => {
            tracing::debug!("Invalid binarization method produced error: {}", other);
        }
    }
}
|