kreuzberg 4.0.0.rc1 → 4.0.0.rc2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -8
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -534
- data/.rubocop.yml +538 -0
- data/Gemfile +8 -9
- data/Gemfile.lock +9 -109
- data/README.md +426 -421
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -340
- data/ext/kreuzberg_rb/extconf.rb +45 -35
- data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +44 -36
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -17
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +2998 -2939
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +148 -105
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +46 -45
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +691 -684
- data/lib/kreuzberg/error_context.rb +32 -0
- data/lib/kreuzberg/errors.rb +118 -50
- data/lib/kreuzberg/extraction_api.rb +85 -84
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +216 -216
- data/lib/kreuzberg/setup_lib_path.rb +80 -79
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +103 -82
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +520 -468
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -87
- data/spec/binding/cli_spec.rb +55 -54
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -42
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/kreuzberg/Cargo.toml +204 -134
- data/vendor/kreuzberg/README.md +175 -175
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
- data/vendor/kreuzberg/build.rs +474 -460
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1143
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -677
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -35
- data/vendor/kreuzberg/src/core/config.rs +1032 -1032
- data/vendor/kreuzberg/src/core/extractor.rs +1024 -903
- data/vendor/kreuzberg/src/core/io.rs +329 -327
- data/vendor/kreuzberg/src/core/mime.rs +605 -615
- data/vendor/kreuzberg/src/core/mod.rs +45 -42
- data/vendor/kreuzberg/src/core/pipeline.rs +984 -906
- data/vendor/kreuzberg/src/embeddings.rs +432 -323
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -40
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +553 -553
- data/vendor/kreuzberg/src/extraction/image.rs +368 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -564
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -77
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -128
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +446 -425
- data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +367 -479
- data/vendor/kreuzberg/src/extractors/email.rs +143 -129
- data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +343 -344
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
- data/vendor/kreuzberg/src/extractors/html.rs +393 -410
- data/vendor/kreuzberg/src/extractors/image.rs +198 -195
- data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +365 -268
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
- data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +493 -496
- data/vendor/kreuzberg/src/extractors/pptx.rs +248 -234
- data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
- data/vendor/kreuzberg/src/extractors/security.rs +484 -0
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +140 -126
- data/vendor/kreuzberg/src/extractors/text.rs +260 -242
- data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +135 -128
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -294
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -942
- data/vendor/kreuzberg/src/lib.rs +105 -102
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -32
- data/vendor/kreuzberg/src/mcp/server.rs +1968 -1966
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -847
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -122
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +393 -420
- data/vendor/kreuzberg/src/pdf/text.rs +158 -161
- data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -1010
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +620 -629
- data/vendor/kreuzberg/src/plugins/processor.rs +642 -641
- data/vendor/kreuzberg/src/plugins/registry.rs +1337 -1324
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +956 -955
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +19 -19
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +903 -873
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -542
- data/vendor/kreuzberg/tests/batch_processing.rs +316 -304
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -509
- data/vendor/kreuzberg/tests/config_features.rs +598 -580
- data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -439
- data/vendor/kreuzberg/tests/core_integration.rs +510 -493
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -424
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -124
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -676
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -43
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -1412
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -561
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -607
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
- data/vendor/kreuzberg/tests/security_validation.rs +415 -404
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/rb-sys/.cargo-ok +1 -0
- data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
- data/vendor/rb-sys/Cargo.lock +393 -0
- data/vendor/rb-sys/Cargo.toml +70 -0
- data/vendor/rb-sys/Cargo.toml.orig +57 -0
- data/vendor/rb-sys/LICENSE-APACHE +190 -0
- data/vendor/rb-sys/LICENSE-MIT +21 -0
- data/vendor/rb-sys/bin/release.sh +21 -0
- data/vendor/rb-sys/build/features.rs +108 -0
- data/vendor/rb-sys/build/main.rs +246 -0
- data/vendor/rb-sys/build/stable_api_config.rs +153 -0
- data/vendor/rb-sys/build/version.rs +48 -0
- data/vendor/rb-sys/readme.md +36 -0
- data/vendor/rb-sys/src/bindings.rs +21 -0
- data/vendor/rb-sys/src/hidden.rs +11 -0
- data/vendor/rb-sys/src/lib.rs +34 -0
- data/vendor/rb-sys/src/macros.rs +371 -0
- data/vendor/rb-sys/src/memory.rs +53 -0
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
- data/vendor/rb-sys/src/special_consts.rs +31 -0
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
- data/vendor/rb-sys/src/stable_api.rs +261 -0
- data/vendor/rb-sys/src/symbol.rs +31 -0
- data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
- data/vendor/rb-sys/src/utils.rs +89 -0
- data/vendor/rb-sys/src/value_type.rs +7 -0
- metadata +90 -95
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/spec/examples.txt +0 -104
- data/vendor/kreuzberg/src/bin/profile_extract.rs +0 -455
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +0 -275
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +0 -178
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +0 -491
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +0 -496
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +0 -1188
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +0 -162
- data/vendor/kreuzberg/src/extractors/pandoc.rs +0 -201
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +0 -92
- data/vendor/kreuzberg/tests/pandoc_integration.rs +0 -503
|
@@ -1,510 +1,510 @@
|
|
|
1
|
-
//! OCR configuration integration tests.
|
|
2
|
-
//!
|
|
3
|
-
//! This module extensively tests Tesseract OCR configuration propagation
|
|
4
|
-
//! to ensure all settings are correctly passed through to the OCR engine.
|
|
5
|
-
//!
|
|
6
|
-
//! Test philosophy:
|
|
7
|
-
//! - Verify all TesseractConfig fields are propagated correctly
|
|
8
|
-
//! - Test different language settings with appropriate test files
|
|
9
|
-
//! - Test PSM (page segmentation mode) variations
|
|
10
|
-
//! - Test force_ocr mode
|
|
11
|
-
//! - Verify configuration changes actually affect output
|
|
12
|
-
//! - Test table detection with various settings
|
|
13
|
-
|
|
14
|
-
mod helpers;
|
|
15
|
-
|
|
16
|
-
use helpers::*;
|
|
17
|
-
use kreuzberg::core::config::{ExtractionConfig, OcrConfig};
|
|
18
|
-
use kreuzberg::extract_file_sync;
|
|
19
|
-
use kreuzberg::types::TesseractConfig;
|
|
20
|
-
|
|
21
|
-
#[test]
|
|
22
|
-
fn test_ocr_language_english() {
|
|
23
|
-
if skip_if_missing("images/test_hello_world.png") {
|
|
24
|
-
return;
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
28
|
-
let config = ExtractionConfig {
|
|
29
|
-
ocr: Some(OcrConfig {
|
|
30
|
-
backend: "tesseract".to_string(),
|
|
31
|
-
language: "eng".to_string(),
|
|
32
|
-
tesseract_config: None,
|
|
33
|
-
}),
|
|
34
|
-
force_ocr: false,
|
|
35
|
-
..Default::default()
|
|
36
|
-
};
|
|
37
|
-
|
|
38
|
-
let result = extract_file_sync(&file_path, None, &config).expect("Should extract with English OCR");
|
|
39
|
-
|
|
40
|
-
assert_mime_type(&result, "image/png");
|
|
41
|
-
|
|
42
|
-
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
43
|
-
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
#[test]
|
|
47
|
-
fn test_ocr_language_german() {
|
|
48
|
-
if skip_if_missing("images/test_hello_world.png") {
|
|
49
|
-
return;
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
53
|
-
let config = ExtractionConfig {
|
|
54
|
-
ocr: Some(OcrConfig {
|
|
55
|
-
backend: "tesseract".to_string(),
|
|
56
|
-
language: "deu".to_string(),
|
|
57
|
-
tesseract_config: None,
|
|
58
|
-
}),
|
|
59
|
-
force_ocr: false,
|
|
60
|
-
..Default::default()
|
|
61
|
-
};
|
|
62
|
-
|
|
63
|
-
let result = extract_file_sync(&file_path, None, &config);
|
|
64
|
-
|
|
65
|
-
match result {
|
|
66
|
-
Ok(extraction_result) => {
|
|
67
|
-
assert_mime_type(&extraction_result, "image/png");
|
|
68
|
-
|
|
69
|
-
assert!(
|
|
70
|
-
extraction_result.chunks.is_none(),
|
|
71
|
-
"Chunks should be None without chunking config"
|
|
72
|
-
);
|
|
73
|
-
assert!(
|
|
74
|
-
extraction_result.detected_languages.is_none(),
|
|
75
|
-
"Language detection not enabled"
|
|
76
|
-
);
|
|
77
|
-
}
|
|
78
|
-
Err(e) => {
|
|
79
|
-
tracing::debug!("German OCR failed (language pack may not be installed): {}", e);
|
|
80
|
-
}
|
|
81
|
-
}
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
#[test]
|
|
85
|
-
fn test_ocr_language_multiple() {
|
|
86
|
-
if skip_if_missing("images/english_and_korean.png") {
|
|
87
|
-
return;
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
let file_path = get_test_file_path("images/english_and_korean.png");
|
|
91
|
-
let config = ExtractionConfig {
|
|
92
|
-
ocr: Some(OcrConfig {
|
|
93
|
-
backend: "tesseract".to_string(),
|
|
94
|
-
language: "eng+kor".to_string(),
|
|
95
|
-
tesseract_config: None,
|
|
96
|
-
}),
|
|
97
|
-
force_ocr: false,
|
|
98
|
-
..Default::default()
|
|
99
|
-
};
|
|
100
|
-
|
|
101
|
-
let result = extract_file_sync(&file_path, None, &config);
|
|
102
|
-
|
|
103
|
-
match result {
|
|
104
|
-
Ok(extraction_result) => {
|
|
105
|
-
assert_mime_type(&extraction_result, "image/png");
|
|
106
|
-
|
|
107
|
-
assert!(
|
|
108
|
-
extraction_result.chunks.is_none(),
|
|
109
|
-
"Chunks should be None without chunking config"
|
|
110
|
-
);
|
|
111
|
-
assert!(
|
|
112
|
-
extraction_result.detected_languages.is_none(),
|
|
113
|
-
"Language detection not enabled"
|
|
114
|
-
);
|
|
115
|
-
}
|
|
116
|
-
Err(e) => {
|
|
117
|
-
tracing::debug!("Multi-language OCR failed (language pack may not be installed): {}", e);
|
|
118
|
-
}
|
|
119
|
-
}
|
|
120
|
-
}
|
|
121
|
-
|
|
122
|
-
#[test]
|
|
123
|
-
fn test_ocr_psm_auto() {
|
|
124
|
-
if skip_if_missing("images/ocr_image.jpg") {
|
|
125
|
-
return;
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
let file_path = get_test_file_path("images/ocr_image.jpg");
|
|
129
|
-
let config = ExtractionConfig {
|
|
130
|
-
ocr: Some(OcrConfig {
|
|
131
|
-
backend: "tesseract".to_string(),
|
|
132
|
-
language: "eng".to_string(),
|
|
133
|
-
tesseract_config: Some(TesseractConfig {
|
|
134
|
-
psm: 3,
|
|
135
|
-
..Default::default()
|
|
136
|
-
}),
|
|
137
|
-
}),
|
|
138
|
-
force_ocr: false,
|
|
139
|
-
..Default::default()
|
|
140
|
-
};
|
|
141
|
-
|
|
142
|
-
let result = extract_file_sync(&file_path, None, &config).expect("Should extract with PSM 3 (auto)");
|
|
143
|
-
|
|
144
|
-
assert_mime_type(&result, "image/jpeg");
|
|
145
|
-
|
|
146
|
-
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
147
|
-
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
148
|
-
}
|
|
149
|
-
|
|
150
|
-
#[test]
|
|
151
|
-
fn test_ocr_psm_single_block() {
|
|
152
|
-
if skip_if_missing("images/ocr_image.jpg") {
|
|
153
|
-
return;
|
|
154
|
-
}
|
|
155
|
-
|
|
156
|
-
let file_path = get_test_file_path("images/ocr_image.jpg");
|
|
157
|
-
let config = ExtractionConfig {
|
|
158
|
-
ocr: Some(OcrConfig {
|
|
159
|
-
backend: "tesseract".to_string(),
|
|
160
|
-
language: "eng".to_string(),
|
|
161
|
-
tesseract_config: Some(TesseractConfig {
|
|
162
|
-
psm: 6,
|
|
163
|
-
..Default::default()
|
|
164
|
-
}),
|
|
165
|
-
}),
|
|
166
|
-
force_ocr: false,
|
|
167
|
-
..Default::default()
|
|
168
|
-
};
|
|
169
|
-
|
|
170
|
-
let result = extract_file_sync(&file_path, None, &config).expect("Should extract with PSM 6 (single block)");
|
|
171
|
-
|
|
172
|
-
assert_mime_type(&result, "image/jpeg");
|
|
173
|
-
|
|
174
|
-
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
175
|
-
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
176
|
-
}
|
|
177
|
-
|
|
178
|
-
#[test]
|
|
179
|
-
fn test_ocr_psm_single_line() {
|
|
180
|
-
if skip_if_missing("images/test_hello_world.png") {
|
|
181
|
-
return;
|
|
182
|
-
}
|
|
183
|
-
|
|
184
|
-
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
185
|
-
let config = ExtractionConfig {
|
|
186
|
-
ocr: Some(OcrConfig {
|
|
187
|
-
backend: "tesseract".to_string(),
|
|
188
|
-
language: "eng".to_string(),
|
|
189
|
-
tesseract_config: Some(TesseractConfig {
|
|
190
|
-
psm: 7,
|
|
191
|
-
..Default::default()
|
|
192
|
-
}),
|
|
193
|
-
}),
|
|
194
|
-
force_ocr: false,
|
|
195
|
-
..Default::default()
|
|
196
|
-
};
|
|
197
|
-
|
|
198
|
-
let result = extract_file_sync(&file_path, None, &config).expect("Should extract with PSM 7 (single line)");
|
|
199
|
-
|
|
200
|
-
assert_mime_type(&result, "image/png");
|
|
201
|
-
|
|
202
|
-
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
203
|
-
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
#[test]
|
|
207
|
-
fn test_force_ocr_on_text_pdf() {
|
|
208
|
-
if skip_if_missing("pdfs/fake_memo.pdf") {
|
|
209
|
-
return;
|
|
210
|
-
}
|
|
211
|
-
|
|
212
|
-
let file_path = get_test_file_path("pdfs/fake_memo.pdf");
|
|
213
|
-
let config = ExtractionConfig {
|
|
214
|
-
ocr: Some(OcrConfig {
|
|
215
|
-
backend: "tesseract".to_string(),
|
|
216
|
-
language: "eng".to_string(),
|
|
217
|
-
tesseract_config: None,
|
|
218
|
-
}),
|
|
219
|
-
force_ocr: true,
|
|
220
|
-
..Default::default()
|
|
221
|
-
};
|
|
222
|
-
|
|
223
|
-
let result = extract_file_sync(&file_path, None, &config).expect("Should extract with force_ocr enabled");
|
|
224
|
-
|
|
225
|
-
assert_mime_type(&result, "application/pdf");
|
|
226
|
-
assert_non_empty_content(&result);
|
|
227
|
-
|
|
228
|
-
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
229
|
-
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
230
|
-
|
|
231
|
-
#[cfg(feature = "pdf")]
|
|
232
|
-
assert!(result.metadata.format.is_some(), "PDF should have metadata");
|
|
233
|
-
}
|
|
234
|
-
|
|
235
|
-
#[test]
|
|
236
|
-
fn test_force_ocr_disabled() {
|
|
237
|
-
if skip_if_missing("pdfs/fake_memo.pdf") {
|
|
238
|
-
return;
|
|
239
|
-
}
|
|
240
|
-
|
|
241
|
-
let file_path = get_test_file_path("pdfs/fake_memo.pdf");
|
|
242
|
-
let config = ExtractionConfig {
|
|
243
|
-
ocr: Some(OcrConfig {
|
|
244
|
-
backend: "tesseract".to_string(),
|
|
245
|
-
language: "eng".to_string(),
|
|
246
|
-
tesseract_config: None,
|
|
247
|
-
}),
|
|
248
|
-
force_ocr: false,
|
|
249
|
-
..Default::default()
|
|
250
|
-
};
|
|
251
|
-
|
|
252
|
-
let result = extract_file_sync(&file_path, None, &config).expect("Should extract without forcing OCR");
|
|
253
|
-
|
|
254
|
-
assert_mime_type(&result, "application/pdf");
|
|
255
|
-
assert_non_empty_content(&result);
|
|
256
|
-
|
|
257
|
-
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
258
|
-
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
259
|
-
|
|
260
|
-
#[cfg(feature = "pdf")]
|
|
261
|
-
assert!(result.metadata.format.is_some(), "PDF should have metadata");
|
|
262
|
-
}
|
|
263
|
-
|
|
264
|
-
#[test]
|
|
265
|
-
fn test_table_detection_enabled() {
|
|
266
|
-
if skip_if_missing("tables/simple_table.png") {
|
|
267
|
-
return;
|
|
268
|
-
}
|
|
269
|
-
|
|
270
|
-
let file_path = get_test_file_path("tables/simple_table.png");
|
|
271
|
-
let config = ExtractionConfig {
|
|
272
|
-
ocr: Some(OcrConfig {
|
|
273
|
-
backend: "tesseract".to_string(),
|
|
274
|
-
language: "eng".to_string(),
|
|
275
|
-
tesseract_config: Some(TesseractConfig {
|
|
276
|
-
enable_table_detection: true,
|
|
277
|
-
table_min_confidence: 0.5,
|
|
278
|
-
table_column_threshold: 10,
|
|
279
|
-
table_row_threshold_ratio: 0.5,
|
|
280
|
-
..Default::default()
|
|
281
|
-
}),
|
|
282
|
-
}),
|
|
283
|
-
force_ocr: false,
|
|
284
|
-
..Default::default()
|
|
285
|
-
};
|
|
286
|
-
|
|
287
|
-
let result = extract_file_sync(&file_path, None, &config).expect("Should extract with table detection enabled");
|
|
288
|
-
|
|
289
|
-
assert_mime_type(&result, "image/png");
|
|
290
|
-
|
|
291
|
-
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
292
|
-
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
293
|
-
}
|
|
294
|
-
|
|
295
|
-
#[test]
|
|
296
|
-
fn test_table_detection_disabled() {
|
|
297
|
-
if skip_if_missing("tables/simple_table.png") {
|
|
298
|
-
return;
|
|
299
|
-
}
|
|
300
|
-
|
|
301
|
-
let file_path = get_test_file_path("tables/simple_table.png");
|
|
302
|
-
let config = ExtractionConfig {
|
|
303
|
-
ocr: Some(OcrConfig {
|
|
304
|
-
backend: "tesseract".to_string(),
|
|
305
|
-
language: "eng".to_string(),
|
|
306
|
-
tesseract_config: Some(TesseractConfig {
|
|
307
|
-
enable_table_detection: false,
|
|
308
|
-
..Default::default()
|
|
309
|
-
}),
|
|
310
|
-
}),
|
|
311
|
-
force_ocr: false,
|
|
312
|
-
..Default::default()
|
|
313
|
-
};
|
|
314
|
-
|
|
315
|
-
let result = extract_file_sync(&file_path, None, &config).expect("Should extract with table detection disabled");
|
|
316
|
-
|
|
317
|
-
assert_mime_type(&result, "image/png");
|
|
318
|
-
|
|
319
|
-
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
320
|
-
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
321
|
-
}
|
|
322
|
-
|
|
323
|
-
#[test]
|
|
324
|
-
fn test_language_model_ngram_configuration() {
|
|
325
|
-
if skip_if_missing("images/ocr_image.jpg") {
|
|
326
|
-
return;
|
|
327
|
-
}
|
|
328
|
-
|
|
329
|
-
let file_path = get_test_file_path("images/ocr_image.jpg");
|
|
330
|
-
let config = ExtractionConfig {
|
|
331
|
-
ocr: Some(OcrConfig {
|
|
332
|
-
backend: "tesseract".to_string(),
|
|
333
|
-
language: "eng".to_string(),
|
|
334
|
-
tesseract_config: Some(TesseractConfig {
|
|
335
|
-
language_model_ngram_on: true,
|
|
336
|
-
..Default::default()
|
|
337
|
-
}),
|
|
338
|
-
}),
|
|
339
|
-
force_ocr: false,
|
|
340
|
-
..Default::default()
|
|
341
|
-
};
|
|
342
|
-
|
|
343
|
-
let result =
|
|
344
|
-
extract_file_sync(&file_path, None, &config).expect("Should extract with ngram language model enabled");
|
|
345
|
-
|
|
346
|
-
assert_mime_type(&result, "image/jpeg");
|
|
347
|
-
|
|
348
|
-
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
349
|
-
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
350
|
-
}
|
|
351
|
-
|
|
352
|
-
#[test]
|
|
353
|
-
fn test_dictionary_correction_enabled() {
|
|
354
|
-
if skip_if_missing("images/ocr_image.jpg") {
|
|
355
|
-
return;
|
|
356
|
-
}
|
|
357
|
-
|
|
358
|
-
let file_path = get_test_file_path("images/ocr_image.jpg");
|
|
359
|
-
let config = ExtractionConfig {
|
|
360
|
-
ocr: Some(OcrConfig {
|
|
361
|
-
backend: "tesseract".to_string(),
|
|
362
|
-
language: "eng".to_string(),
|
|
363
|
-
tesseract_config: Some(TesseractConfig {
|
|
364
|
-
tessedit_enable_dict_correction: true,
|
|
365
|
-
..Default::default()
|
|
366
|
-
}),
|
|
367
|
-
}),
|
|
368
|
-
force_ocr: false,
|
|
369
|
-
..Default::default()
|
|
370
|
-
};
|
|
371
|
-
|
|
372
|
-
let result =
|
|
373
|
-
extract_file_sync(&file_path, None, &config).expect("Should extract with dictionary correction enabled");
|
|
374
|
-
|
|
375
|
-
assert_mime_type(&result, "image/jpeg");
|
|
376
|
-
|
|
377
|
-
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
378
|
-
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
379
|
-
}
|
|
380
|
-
|
|
381
|
-
#[test]
|
|
382
|
-
fn test_character_whitelist() {
|
|
383
|
-
if skip_if_missing("images/test_hello_world.png") {
|
|
384
|
-
return;
|
|
385
|
-
}
|
|
386
|
-
|
|
387
|
-
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
388
|
-
let config = ExtractionConfig {
|
|
389
|
-
ocr: Some(OcrConfig {
|
|
390
|
-
backend: "tesseract".to_string(),
|
|
391
|
-
language: "eng".to_string(),
|
|
392
|
-
tesseract_config: Some(TesseractConfig {
|
|
393
|
-
tessedit_char_whitelist: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ".to_string(),
|
|
394
|
-
..Default::default()
|
|
395
|
-
}),
|
|
396
|
-
}),
|
|
397
|
-
force_ocr: false,
|
|
398
|
-
..Default::default()
|
|
399
|
-
};
|
|
400
|
-
|
|
401
|
-
let result = extract_file_sync(&file_path, None, &config).expect("Should extract with character whitelist");
|
|
402
|
-
|
|
403
|
-
assert_mime_type(&result, "image/png");
|
|
404
|
-
|
|
405
|
-
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
406
|
-
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
407
|
-
}
|
|
408
|
-
|
|
409
|
-
#[test]
|
|
410
|
-
fn test_ocr_cache_enabled() {
|
|
411
|
-
if skip_if_missing("images/ocr_image.jpg") {
|
|
412
|
-
return;
|
|
413
|
-
}
|
|
414
|
-
|
|
415
|
-
let file_path = get_test_file_path("images/ocr_image.jpg");
|
|
416
|
-
let config = ExtractionConfig {
|
|
417
|
-
ocr: Some(OcrConfig {
|
|
418
|
-
backend: "tesseract".to_string(),
|
|
419
|
-
language: "eng".to_string(),
|
|
420
|
-
tesseract_config: Some(TesseractConfig {
|
|
421
|
-
use_cache: true,
|
|
422
|
-
..Default::default()
|
|
423
|
-
}),
|
|
424
|
-
}),
|
|
425
|
-
force_ocr: false,
|
|
426
|
-
use_cache: true,
|
|
427
|
-
..Default::default()
|
|
428
|
-
};
|
|
429
|
-
|
|
430
|
-
let result1 = extract_file_sync(&file_path, None, &config).expect("First extraction should succeed");
|
|
431
|
-
let result2 = extract_file_sync(&file_path, None, &config).expect("Second extraction should succeed (cached)");
|
|
432
|
-
|
|
433
|
-
assert_mime_type(&result1, "image/jpeg");
|
|
434
|
-
assert_mime_type(&result2, "image/jpeg");
|
|
435
|
-
|
|
436
|
-
assert!(
|
|
437
|
-
result1.chunks.is_none(),
|
|
438
|
-
"Chunks should be None without chunking config"
|
|
439
|
-
);
|
|
440
|
-
assert!(result1.detected_languages.is_none(), "Language detection not enabled");
|
|
441
|
-
assert!(
|
|
442
|
-
result2.chunks.is_none(),
|
|
443
|
-
"Chunks should be None without chunking config"
|
|
444
|
-
);
|
|
445
|
-
assert!(result2.detected_languages.is_none(), "Language detection not enabled");
|
|
446
|
-
}
|
|
447
|
-
|
|
448
|
-
#[test]
|
|
449
|
-
fn test_ocr_cache_disabled() {
|
|
450
|
-
if skip_if_missing("images/ocr_image.jpg") {
|
|
451
|
-
return;
|
|
452
|
-
}
|
|
453
|
-
|
|
454
|
-
let file_path = get_test_file_path("images/ocr_image.jpg");
|
|
455
|
-
let config = ExtractionConfig {
|
|
456
|
-
ocr: Some(OcrConfig {
|
|
457
|
-
backend: "tesseract".to_string(),
|
|
458
|
-
language: "eng".to_string(),
|
|
459
|
-
tesseract_config: Some(TesseractConfig {
|
|
460
|
-
use_cache: false,
|
|
461
|
-
..Default::default()
|
|
462
|
-
}),
|
|
463
|
-
}),
|
|
464
|
-
force_ocr: false,
|
|
465
|
-
use_cache: false,
|
|
466
|
-
..Default::default()
|
|
467
|
-
};
|
|
468
|
-
|
|
469
|
-
let result = extract_file_sync(&file_path, None, &config).expect("Should extract without caching");
|
|
470
|
-
|
|
471
|
-
assert_mime_type(&result, "image/jpeg");
|
|
472
|
-
|
|
473
|
-
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
474
|
-
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
475
|
-
}
|
|
476
|
-
|
|
477
|
-
#[test]
|
|
478
|
-
fn test_complex_configuration_combination() {
|
|
479
|
-
if skip_if_missing("images/layout_parser_ocr.jpg") {
|
|
480
|
-
return;
|
|
481
|
-
}
|
|
482
|
-
|
|
483
|
-
let file_path = get_test_file_path("images/layout_parser_ocr.jpg");
|
|
484
|
-
let config = ExtractionConfig {
|
|
485
|
-
ocr: Some(OcrConfig {
|
|
486
|
-
backend: "tesseract".to_string(),
|
|
487
|
-
language: "eng".to_string(),
|
|
488
|
-
tesseract_config: Some(TesseractConfig {
|
|
489
|
-
psm: 3,
|
|
490
|
-
enable_table_detection: true,
|
|
491
|
-
table_min_confidence: 0.7,
|
|
492
|
-
language_model_ngram_on: true,
|
|
493
|
-
tessedit_enable_dict_correction: true,
|
|
494
|
-
use_cache: true,
|
|
495
|
-
..Default::default()
|
|
496
|
-
}),
|
|
497
|
-
}),
|
|
498
|
-
force_ocr: false,
|
|
499
|
-
use_cache: true,
|
|
500
|
-
..Default::default()
|
|
501
|
-
};
|
|
502
|
-
|
|
503
|
-
let result = extract_file_sync(&file_path, None, &config).expect("Should extract with complex configuration");
|
|
504
|
-
|
|
505
|
-
assert_mime_type(&result, "image/jpeg");
|
|
506
|
-
assert_non_empty_content(&result);
|
|
507
|
-
|
|
508
|
-
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
509
|
-
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
510
|
-
}
|
|
1
|
+
//! OCR configuration integration tests.
|
|
2
|
+
//!
|
|
3
|
+
//! This module extensively tests Tesseract OCR configuration propagation
|
|
4
|
+
//! to ensure all settings are correctly passed through to the OCR engine.
|
|
5
|
+
//!
|
|
6
|
+
//! Test philosophy:
|
|
7
|
+
//! - Verify all TesseractConfig fields are propagated correctly
|
|
8
|
+
//! - Test different language settings with appropriate test files
|
|
9
|
+
//! - Test PSM (page segmentation mode) variations
|
|
10
|
+
//! - Test force_ocr mode
|
|
11
|
+
//! - Verify configuration changes actually affect output
|
|
12
|
+
//! - Test table detection with various settings
|
|
13
|
+
|
|
14
|
+
mod helpers;
|
|
15
|
+
|
|
16
|
+
use helpers::*;
|
|
17
|
+
use kreuzberg::core::config::{ExtractionConfig, OcrConfig};
|
|
18
|
+
use kreuzberg::extract_file_sync;
|
|
19
|
+
use kreuzberg::types::TesseractConfig;
|
|
20
|
+
|
|
21
|
+
#[test]
|
|
22
|
+
fn test_ocr_language_english() {
|
|
23
|
+
if skip_if_missing("images/test_hello_world.png") {
|
|
24
|
+
return;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
28
|
+
let config = ExtractionConfig {
|
|
29
|
+
ocr: Some(OcrConfig {
|
|
30
|
+
backend: "tesseract".to_string(),
|
|
31
|
+
language: "eng".to_string(),
|
|
32
|
+
tesseract_config: None,
|
|
33
|
+
}),
|
|
34
|
+
force_ocr: false,
|
|
35
|
+
..Default::default()
|
|
36
|
+
};
|
|
37
|
+
|
|
38
|
+
let result = extract_file_sync(&file_path, None, &config).expect("Should extract with English OCR");
|
|
39
|
+
|
|
40
|
+
assert_mime_type(&result, "image/png");
|
|
41
|
+
|
|
42
|
+
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
43
|
+
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
#[test]
|
|
47
|
+
fn test_ocr_language_german() {
|
|
48
|
+
if skip_if_missing("images/test_hello_world.png") {
|
|
49
|
+
return;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
53
|
+
let config = ExtractionConfig {
|
|
54
|
+
ocr: Some(OcrConfig {
|
|
55
|
+
backend: "tesseract".to_string(),
|
|
56
|
+
language: "deu".to_string(),
|
|
57
|
+
tesseract_config: None,
|
|
58
|
+
}),
|
|
59
|
+
force_ocr: false,
|
|
60
|
+
..Default::default()
|
|
61
|
+
};
|
|
62
|
+
|
|
63
|
+
let result = extract_file_sync(&file_path, None, &config);
|
|
64
|
+
|
|
65
|
+
match result {
|
|
66
|
+
Ok(extraction_result) => {
|
|
67
|
+
assert_mime_type(&extraction_result, "image/png");
|
|
68
|
+
|
|
69
|
+
assert!(
|
|
70
|
+
extraction_result.chunks.is_none(),
|
|
71
|
+
"Chunks should be None without chunking config"
|
|
72
|
+
);
|
|
73
|
+
assert!(
|
|
74
|
+
extraction_result.detected_languages.is_none(),
|
|
75
|
+
"Language detection not enabled"
|
|
76
|
+
);
|
|
77
|
+
}
|
|
78
|
+
Err(e) => {
|
|
79
|
+
tracing::debug!("German OCR failed (language pack may not be installed): {}", e);
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
#[test]
|
|
85
|
+
fn test_ocr_language_multiple() {
|
|
86
|
+
if skip_if_missing("images/english_and_korean.png") {
|
|
87
|
+
return;
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
let file_path = get_test_file_path("images/english_and_korean.png");
|
|
91
|
+
let config = ExtractionConfig {
|
|
92
|
+
ocr: Some(OcrConfig {
|
|
93
|
+
backend: "tesseract".to_string(),
|
|
94
|
+
language: "eng+kor".to_string(),
|
|
95
|
+
tesseract_config: None,
|
|
96
|
+
}),
|
|
97
|
+
force_ocr: false,
|
|
98
|
+
..Default::default()
|
|
99
|
+
};
|
|
100
|
+
|
|
101
|
+
let result = extract_file_sync(&file_path, None, &config);
|
|
102
|
+
|
|
103
|
+
match result {
|
|
104
|
+
Ok(extraction_result) => {
|
|
105
|
+
assert_mime_type(&extraction_result, "image/png");
|
|
106
|
+
|
|
107
|
+
assert!(
|
|
108
|
+
extraction_result.chunks.is_none(),
|
|
109
|
+
"Chunks should be None without chunking config"
|
|
110
|
+
);
|
|
111
|
+
assert!(
|
|
112
|
+
extraction_result.detected_languages.is_none(),
|
|
113
|
+
"Language detection not enabled"
|
|
114
|
+
);
|
|
115
|
+
}
|
|
116
|
+
Err(e) => {
|
|
117
|
+
tracing::debug!("Multi-language OCR failed (language pack may not be installed): {}", e);
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
#[test]
|
|
123
|
+
fn test_ocr_psm_auto() {
|
|
124
|
+
if skip_if_missing("images/ocr_image.jpg") {
|
|
125
|
+
return;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
let file_path = get_test_file_path("images/ocr_image.jpg");
|
|
129
|
+
let config = ExtractionConfig {
|
|
130
|
+
ocr: Some(OcrConfig {
|
|
131
|
+
backend: "tesseract".to_string(),
|
|
132
|
+
language: "eng".to_string(),
|
|
133
|
+
tesseract_config: Some(TesseractConfig {
|
|
134
|
+
psm: 3,
|
|
135
|
+
..Default::default()
|
|
136
|
+
}),
|
|
137
|
+
}),
|
|
138
|
+
force_ocr: false,
|
|
139
|
+
..Default::default()
|
|
140
|
+
};
|
|
141
|
+
|
|
142
|
+
let result = extract_file_sync(&file_path, None, &config).expect("Should extract with PSM 3 (auto)");
|
|
143
|
+
|
|
144
|
+
assert_mime_type(&result, "image/jpeg");
|
|
145
|
+
|
|
146
|
+
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
147
|
+
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
#[test]
|
|
151
|
+
fn test_ocr_psm_single_block() {
|
|
152
|
+
if skip_if_missing("images/ocr_image.jpg") {
|
|
153
|
+
return;
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
let file_path = get_test_file_path("images/ocr_image.jpg");
|
|
157
|
+
let config = ExtractionConfig {
|
|
158
|
+
ocr: Some(OcrConfig {
|
|
159
|
+
backend: "tesseract".to_string(),
|
|
160
|
+
language: "eng".to_string(),
|
|
161
|
+
tesseract_config: Some(TesseractConfig {
|
|
162
|
+
psm: 6,
|
|
163
|
+
..Default::default()
|
|
164
|
+
}),
|
|
165
|
+
}),
|
|
166
|
+
force_ocr: false,
|
|
167
|
+
..Default::default()
|
|
168
|
+
};
|
|
169
|
+
|
|
170
|
+
let result = extract_file_sync(&file_path, None, &config).expect("Should extract with PSM 6 (single block)");
|
|
171
|
+
|
|
172
|
+
assert_mime_type(&result, "image/jpeg");
|
|
173
|
+
|
|
174
|
+
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
175
|
+
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
#[test]
|
|
179
|
+
fn test_ocr_psm_single_line() {
|
|
180
|
+
if skip_if_missing("images/test_hello_world.png") {
|
|
181
|
+
return;
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
185
|
+
let config = ExtractionConfig {
|
|
186
|
+
ocr: Some(OcrConfig {
|
|
187
|
+
backend: "tesseract".to_string(),
|
|
188
|
+
language: "eng".to_string(),
|
|
189
|
+
tesseract_config: Some(TesseractConfig {
|
|
190
|
+
psm: 7,
|
|
191
|
+
..Default::default()
|
|
192
|
+
}),
|
|
193
|
+
}),
|
|
194
|
+
force_ocr: false,
|
|
195
|
+
..Default::default()
|
|
196
|
+
};
|
|
197
|
+
|
|
198
|
+
let result = extract_file_sync(&file_path, None, &config).expect("Should extract with PSM 7 (single line)");
|
|
199
|
+
|
|
200
|
+
assert_mime_type(&result, "image/png");
|
|
201
|
+
|
|
202
|
+
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
203
|
+
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
#[test]
|
|
207
|
+
fn test_force_ocr_on_text_pdf() {
|
|
208
|
+
if skip_if_missing("pdfs/fake_memo.pdf") {
|
|
209
|
+
return;
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
let file_path = get_test_file_path("pdfs/fake_memo.pdf");
|
|
213
|
+
let config = ExtractionConfig {
|
|
214
|
+
ocr: Some(OcrConfig {
|
|
215
|
+
backend: "tesseract".to_string(),
|
|
216
|
+
language: "eng".to_string(),
|
|
217
|
+
tesseract_config: None,
|
|
218
|
+
}),
|
|
219
|
+
force_ocr: true,
|
|
220
|
+
..Default::default()
|
|
221
|
+
};
|
|
222
|
+
|
|
223
|
+
let result = extract_file_sync(&file_path, None, &config).expect("Should extract with force_ocr enabled");
|
|
224
|
+
|
|
225
|
+
assert_mime_type(&result, "application/pdf");
|
|
226
|
+
assert_non_empty_content(&result);
|
|
227
|
+
|
|
228
|
+
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
229
|
+
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
230
|
+
|
|
231
|
+
#[cfg(feature = "pdf")]
|
|
232
|
+
assert!(result.metadata.format.is_some(), "PDF should have metadata");
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
#[test]
|
|
236
|
+
fn test_force_ocr_disabled() {
|
|
237
|
+
if skip_if_missing("pdfs/fake_memo.pdf") {
|
|
238
|
+
return;
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
let file_path = get_test_file_path("pdfs/fake_memo.pdf");
|
|
242
|
+
let config = ExtractionConfig {
|
|
243
|
+
ocr: Some(OcrConfig {
|
|
244
|
+
backend: "tesseract".to_string(),
|
|
245
|
+
language: "eng".to_string(),
|
|
246
|
+
tesseract_config: None,
|
|
247
|
+
}),
|
|
248
|
+
force_ocr: false,
|
|
249
|
+
..Default::default()
|
|
250
|
+
};
|
|
251
|
+
|
|
252
|
+
let result = extract_file_sync(&file_path, None, &config).expect("Should extract without forcing OCR");
|
|
253
|
+
|
|
254
|
+
assert_mime_type(&result, "application/pdf");
|
|
255
|
+
assert_non_empty_content(&result);
|
|
256
|
+
|
|
257
|
+
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
258
|
+
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
259
|
+
|
|
260
|
+
#[cfg(feature = "pdf")]
|
|
261
|
+
assert!(result.metadata.format.is_some(), "PDF should have metadata");
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
#[test]
|
|
265
|
+
fn test_table_detection_enabled() {
|
|
266
|
+
if skip_if_missing("tables/simple_table.png") {
|
|
267
|
+
return;
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
let file_path = get_test_file_path("tables/simple_table.png");
|
|
271
|
+
let config = ExtractionConfig {
|
|
272
|
+
ocr: Some(OcrConfig {
|
|
273
|
+
backend: "tesseract".to_string(),
|
|
274
|
+
language: "eng".to_string(),
|
|
275
|
+
tesseract_config: Some(TesseractConfig {
|
|
276
|
+
enable_table_detection: true,
|
|
277
|
+
table_min_confidence: 0.5,
|
|
278
|
+
table_column_threshold: 10,
|
|
279
|
+
table_row_threshold_ratio: 0.5,
|
|
280
|
+
..Default::default()
|
|
281
|
+
}),
|
|
282
|
+
}),
|
|
283
|
+
force_ocr: false,
|
|
284
|
+
..Default::default()
|
|
285
|
+
};
|
|
286
|
+
|
|
287
|
+
let result = extract_file_sync(&file_path, None, &config).expect("Should extract with table detection enabled");
|
|
288
|
+
|
|
289
|
+
assert_mime_type(&result, "image/png");
|
|
290
|
+
|
|
291
|
+
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
292
|
+
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
#[test]
|
|
296
|
+
fn test_table_detection_disabled() {
|
|
297
|
+
if skip_if_missing("tables/simple_table.png") {
|
|
298
|
+
return;
|
|
299
|
+
}
|
|
300
|
+
|
|
301
|
+
let file_path = get_test_file_path("tables/simple_table.png");
|
|
302
|
+
let config = ExtractionConfig {
|
|
303
|
+
ocr: Some(OcrConfig {
|
|
304
|
+
backend: "tesseract".to_string(),
|
|
305
|
+
language: "eng".to_string(),
|
|
306
|
+
tesseract_config: Some(TesseractConfig {
|
|
307
|
+
enable_table_detection: false,
|
|
308
|
+
..Default::default()
|
|
309
|
+
}),
|
|
310
|
+
}),
|
|
311
|
+
force_ocr: false,
|
|
312
|
+
..Default::default()
|
|
313
|
+
};
|
|
314
|
+
|
|
315
|
+
let result = extract_file_sync(&file_path, None, &config).expect("Should extract with table detection disabled");
|
|
316
|
+
|
|
317
|
+
assert_mime_type(&result, "image/png");
|
|
318
|
+
|
|
319
|
+
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
320
|
+
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
#[test]
|
|
324
|
+
fn test_language_model_ngram_configuration() {
|
|
325
|
+
if skip_if_missing("images/ocr_image.jpg") {
|
|
326
|
+
return;
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
let file_path = get_test_file_path("images/ocr_image.jpg");
|
|
330
|
+
let config = ExtractionConfig {
|
|
331
|
+
ocr: Some(OcrConfig {
|
|
332
|
+
backend: "tesseract".to_string(),
|
|
333
|
+
language: "eng".to_string(),
|
|
334
|
+
tesseract_config: Some(TesseractConfig {
|
|
335
|
+
language_model_ngram_on: true,
|
|
336
|
+
..Default::default()
|
|
337
|
+
}),
|
|
338
|
+
}),
|
|
339
|
+
force_ocr: false,
|
|
340
|
+
..Default::default()
|
|
341
|
+
};
|
|
342
|
+
|
|
343
|
+
let result =
|
|
344
|
+
extract_file_sync(&file_path, None, &config).expect("Should extract with ngram language model enabled");
|
|
345
|
+
|
|
346
|
+
assert_mime_type(&result, "image/jpeg");
|
|
347
|
+
|
|
348
|
+
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
349
|
+
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
#[test]
|
|
353
|
+
fn test_dictionary_correction_enabled() {
|
|
354
|
+
if skip_if_missing("images/ocr_image.jpg") {
|
|
355
|
+
return;
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
let file_path = get_test_file_path("images/ocr_image.jpg");
|
|
359
|
+
let config = ExtractionConfig {
|
|
360
|
+
ocr: Some(OcrConfig {
|
|
361
|
+
backend: "tesseract".to_string(),
|
|
362
|
+
language: "eng".to_string(),
|
|
363
|
+
tesseract_config: Some(TesseractConfig {
|
|
364
|
+
tessedit_enable_dict_correction: true,
|
|
365
|
+
..Default::default()
|
|
366
|
+
}),
|
|
367
|
+
}),
|
|
368
|
+
force_ocr: false,
|
|
369
|
+
..Default::default()
|
|
370
|
+
};
|
|
371
|
+
|
|
372
|
+
let result =
|
|
373
|
+
extract_file_sync(&file_path, None, &config).expect("Should extract with dictionary correction enabled");
|
|
374
|
+
|
|
375
|
+
assert_mime_type(&result, "image/jpeg");
|
|
376
|
+
|
|
377
|
+
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
378
|
+
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
#[test]
|
|
382
|
+
fn test_character_whitelist() {
|
|
383
|
+
if skip_if_missing("images/test_hello_world.png") {
|
|
384
|
+
return;
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
let file_path = get_test_file_path("images/test_hello_world.png");
|
|
388
|
+
let config = ExtractionConfig {
|
|
389
|
+
ocr: Some(OcrConfig {
|
|
390
|
+
backend: "tesseract".to_string(),
|
|
391
|
+
language: "eng".to_string(),
|
|
392
|
+
tesseract_config: Some(TesseractConfig {
|
|
393
|
+
tessedit_char_whitelist: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ".to_string(),
|
|
394
|
+
..Default::default()
|
|
395
|
+
}),
|
|
396
|
+
}),
|
|
397
|
+
force_ocr: false,
|
|
398
|
+
..Default::default()
|
|
399
|
+
};
|
|
400
|
+
|
|
401
|
+
let result = extract_file_sync(&file_path, None, &config).expect("Should extract with character whitelist");
|
|
402
|
+
|
|
403
|
+
assert_mime_type(&result, "image/png");
|
|
404
|
+
|
|
405
|
+
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
406
|
+
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
407
|
+
}
|
|
408
|
+
|
|
409
|
+
#[test]
|
|
410
|
+
fn test_ocr_cache_enabled() {
|
|
411
|
+
if skip_if_missing("images/ocr_image.jpg") {
|
|
412
|
+
return;
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
let file_path = get_test_file_path("images/ocr_image.jpg");
|
|
416
|
+
let config = ExtractionConfig {
|
|
417
|
+
ocr: Some(OcrConfig {
|
|
418
|
+
backend: "tesseract".to_string(),
|
|
419
|
+
language: "eng".to_string(),
|
|
420
|
+
tesseract_config: Some(TesseractConfig {
|
|
421
|
+
use_cache: true,
|
|
422
|
+
..Default::default()
|
|
423
|
+
}),
|
|
424
|
+
}),
|
|
425
|
+
force_ocr: false,
|
|
426
|
+
use_cache: true,
|
|
427
|
+
..Default::default()
|
|
428
|
+
};
|
|
429
|
+
|
|
430
|
+
let result1 = extract_file_sync(&file_path, None, &config).expect("First extraction should succeed");
|
|
431
|
+
let result2 = extract_file_sync(&file_path, None, &config).expect("Second extraction should succeed (cached)");
|
|
432
|
+
|
|
433
|
+
assert_mime_type(&result1, "image/jpeg");
|
|
434
|
+
assert_mime_type(&result2, "image/jpeg");
|
|
435
|
+
|
|
436
|
+
assert!(
|
|
437
|
+
result1.chunks.is_none(),
|
|
438
|
+
"Chunks should be None without chunking config"
|
|
439
|
+
);
|
|
440
|
+
assert!(result1.detected_languages.is_none(), "Language detection not enabled");
|
|
441
|
+
assert!(
|
|
442
|
+
result2.chunks.is_none(),
|
|
443
|
+
"Chunks should be None without chunking config"
|
|
444
|
+
);
|
|
445
|
+
assert!(result2.detected_languages.is_none(), "Language detection not enabled");
|
|
446
|
+
}
|
|
447
|
+
|
|
448
|
+
#[test]
|
|
449
|
+
fn test_ocr_cache_disabled() {
|
|
450
|
+
if skip_if_missing("images/ocr_image.jpg") {
|
|
451
|
+
return;
|
|
452
|
+
}
|
|
453
|
+
|
|
454
|
+
let file_path = get_test_file_path("images/ocr_image.jpg");
|
|
455
|
+
let config = ExtractionConfig {
|
|
456
|
+
ocr: Some(OcrConfig {
|
|
457
|
+
backend: "tesseract".to_string(),
|
|
458
|
+
language: "eng".to_string(),
|
|
459
|
+
tesseract_config: Some(TesseractConfig {
|
|
460
|
+
use_cache: false,
|
|
461
|
+
..Default::default()
|
|
462
|
+
}),
|
|
463
|
+
}),
|
|
464
|
+
force_ocr: false,
|
|
465
|
+
use_cache: false,
|
|
466
|
+
..Default::default()
|
|
467
|
+
};
|
|
468
|
+
|
|
469
|
+
let result = extract_file_sync(&file_path, None, &config).expect("Should extract without caching");
|
|
470
|
+
|
|
471
|
+
assert_mime_type(&result, "image/jpeg");
|
|
472
|
+
|
|
473
|
+
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
474
|
+
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
475
|
+
}
|
|
476
|
+
|
|
477
|
+
#[test]
|
|
478
|
+
fn test_complex_configuration_combination() {
|
|
479
|
+
if skip_if_missing("images/layout_parser_ocr.jpg") {
|
|
480
|
+
return;
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
let file_path = get_test_file_path("images/layout_parser_ocr.jpg");
|
|
484
|
+
let config = ExtractionConfig {
|
|
485
|
+
ocr: Some(OcrConfig {
|
|
486
|
+
backend: "tesseract".to_string(),
|
|
487
|
+
language: "eng".to_string(),
|
|
488
|
+
tesseract_config: Some(TesseractConfig {
|
|
489
|
+
psm: 3,
|
|
490
|
+
enable_table_detection: true,
|
|
491
|
+
table_min_confidence: 0.7,
|
|
492
|
+
language_model_ngram_on: true,
|
|
493
|
+
tessedit_enable_dict_correction: true,
|
|
494
|
+
use_cache: true,
|
|
495
|
+
..Default::default()
|
|
496
|
+
}),
|
|
497
|
+
}),
|
|
498
|
+
force_ocr: false,
|
|
499
|
+
use_cache: true,
|
|
500
|
+
..Default::default()
|
|
501
|
+
};
|
|
502
|
+
|
|
503
|
+
let result = extract_file_sync(&file_path, None, &config).expect("Should extract with complex configuration");
|
|
504
|
+
|
|
505
|
+
assert_mime_type(&result, "image/jpeg");
|
|
506
|
+
assert_non_empty_content(&result);
|
|
507
|
+
|
|
508
|
+
assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
|
|
509
|
+
assert!(result.detected_languages.is_none(), "Language detection not enabled");
|
|
510
|
+
}
|