kreuzberg 4.0.0.pre.rc.13 → 4.0.0.pre.rc.14
This diff shows the content of publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +105 -2
- data/README.md +454 -454
- data/Rakefile +33 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6940 -6941
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3158
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -214
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -81
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -80
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -340
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -109
- data/lib/{pdfium.dll → libpdfium.dylib} +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -546
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +230 -230
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -843
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1184
- data/vendor/kreuzberg/src/embeddings.rs +500 -500
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -601
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -574
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -749
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -44
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -346
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -68
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -420
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -164
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +1 -1
- data/vendor/kreuzberg-ffi/README.md +851 -851
- data/vendor/kreuzberg-ffi/build.rs +176 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- data/vendor/kreuzberg-tesseract/LICENSE +22 -22
- data/vendor/kreuzberg-tesseract/README.md +399 -399
- data/vendor/kreuzberg-tesseract/build.rs +1354 -1354
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -211
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +73 -4
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +0 -12
data/vendor/kreuzberg/tests/ocr_stress.rs

@@ -1,469 +1,469 @@

All 469 lines of this file are removed and re-added with identical content; the file is shown once below.

```rust
//! Comprehensive OCR stress tests.
//!
//! Validates that Tesseract integration is thread-safe and performant under heavy load:
//! - Rayon parallel batch processing doesn't cause race conditions
//! - Multiple concurrent batch operations don't interfere
//! - Memory usage stays bounded under heavy OCR load
//! - Tesseract API calls are thread-safe
//! - Cache handles concurrent OCR operations correctly
//!
//! These tests ensure production workloads with heavy OCR usage work correctly.

#![cfg(feature = "ocr")]

use kreuzberg::core::config::{ExtractionConfig, OcrConfig};
use kreuzberg::core::extractor::extract_file_sync;
use kreuzberg::ocr::processor::OcrProcessor;
use kreuzberg::ocr::types::TesseractConfig;
use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::Instant;

mod helpers;

/// Stress test: Rayon parallel batch processing with many images.
///
/// Validates that:
/// - Rayon parallelization works correctly with Tesseract
/// - No race conditions in parallel OCR processing
/// - All results are correct with no cross-contamination
#[cfg(feature = "ocr")]
#[cfg_attr(coverage, ignore = "coverage instrumentation slows down rayon benchmarks")]
#[ignore = "flaky performance test dependent on CI runner speed"]
#[test]
fn test_rayon_batch_stress_many_images() {
    use helpers::{get_test_file_path, skip_if_missing};

    if skip_if_missing("images/ocr_image.jpg") {
        tracing::debug!("Skipping Rayon batch stress test: test file not available");
        return;
    }

    let processor = OcrProcessor::new(None).expect("Should create processor");
    let config = TesseractConfig::default();

    let file_path = get_test_file_path("images/ocr_image.jpg");
    let file_paths: Vec<String> = (0..100).map(|_| file_path.to_string_lossy().to_string()).collect();

    let start = Instant::now();
    let results = processor.process_files_batch(file_paths, &config);
    let duration = start.elapsed();

    let success_count = results.iter().filter(|r| r.success).count();
    assert_eq!(
        success_count, 100,
        "All 100 OCR operations should succeed, got {} successes",
        success_count
    );

    let first_content = results[0].result.as_ref().unwrap().content.clone();
    for (i, result) in results.iter().enumerate().skip(1) {
        assert!(result.success, "Result {} should succeed", i);
        let content = &result.result.as_ref().unwrap().content;
        assert_eq!(
            content, &first_content,
            "Result {} content differs - possible race condition",
            i
        );
    }

    println!(
        "Processed 100 images with Rayon in {:?} ({:.2} images/sec)",
        duration,
        100.0 / duration.as_secs_f64()
    );

    let images_per_sec = 100.0 / duration.as_secs_f64();
    assert!(
        images_per_sec > 5.0,
        "Parallel batch should process at least 5 images/sec, got {:.2}",
        images_per_sec
    );
}

/// Stress test: Multiple concurrent batch operations.
///
/// Validates that:
/// - Multiple threads can run batch_process simultaneously
/// - Rayon thread pool doesn't deadlock or starve
/// - Results remain correct under concurrent batch load
#[test]
fn test_concurrent_rayon_batches() {
    use helpers::{get_test_file_path, skip_if_missing};

    if skip_if_missing("images/ocr_image.jpg") {
        tracing::debug!("Skipping concurrent Rayon batches test: test file not available");
        return;
    }

    let processor = Arc::new(OcrProcessor::new(None).expect("Should create processor"));
    let config = Arc::new(TesseractConfig::default());

    let file_path = get_test_file_path("images/ocr_image.jpg");
    let file_paths: Vec<String> = (0..20).map(|_| file_path.to_string_lossy().to_string()).collect();

    let mut handles = vec![];
    let total_processed = Arc::new(AtomicUsize::new(0));

    for batch_id in 0..10 {
        let processor = Arc::clone(&processor);
        let config = Arc::clone(&config);
        let file_paths = file_paths.clone();
        let total = Arc::clone(&total_processed);

        handles.push(std::thread::spawn(move || {
            let results = processor.process_files_batch(file_paths, &config);

            let success_count = results.iter().filter(|r| r.success).count();
            assert_eq!(
                success_count, 20,
                "Batch {} should have 20 successes, got {}",
                batch_id, success_count
            );

            total.fetch_add(success_count, Ordering::Relaxed);
            results
        }));
    }

    let mut all_results = vec![];
    for handle in handles {
        let results = handle.join().expect("Thread should not panic");
        all_results.push(results);
    }

    let total = total_processed.load(Ordering::Relaxed);
    assert_eq!(total, 200, "Should process 200 total images (10 batches × 20 images)");

    println!("Successfully processed 200 images across 10 concurrent batches");
}

/// Stress test: High memory pressure with large batch.
///
/// Validates that:
/// - Memory usage stays bounded during large batch processing
/// - No memory leaks in Tesseract integration
/// - System remains stable under memory pressure
#[test]
fn test_rayon_batch_memory_pressure() {
    use helpers::{get_test_file_path, skip_if_missing};

    if skip_if_missing("images/ocr_image.jpg") {
        tracing::debug!("Skipping memory pressure test: test file not available");
        return;
    }

    let processor = OcrProcessor::new(None).expect("Should create processor");
    let config = TesseractConfig::default();

    let file_path = get_test_file_path("images/ocr_image.jpg");

    for wave in 0..5 {
        let file_paths: Vec<String> = (0..50).map(|_| file_path.to_string_lossy().to_string()).collect();

        let start = Instant::now();
        let results = processor.process_files_batch(file_paths, &config);
        let duration = start.elapsed();

        let success_count = results.iter().filter(|r| r.success).count();
        assert_eq!(
            success_count, 50,
            "Wave {} should process 50 images, got {} successes",
            wave, success_count
        );

        println!("Wave {} processed 50 images in {:?}", wave, duration);
    }

    println!("Successfully completed 5 waves of 50 images (250 total) without memory issues");
}

/// Stress test: Concurrent Tesseract API calls.
///
/// Validates that:
/// - TesseractAPI is thread-safe in Rust wrapper
/// - No crashes or corruption with concurrent API usage
/// - Results are deterministic across threads
#[test]
fn test_tesseract_api_thread_safety() {
    use helpers::{get_test_file_path, skip_if_missing};

    if skip_if_missing("images/ocr_image.jpg") {
        tracing::debug!("Skipping Tesseract API thread-safety test: test file not available");
        return;
    }

    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            tesseract_config: None,
        }),
        force_ocr: false,
        use_cache: false,
        ..Default::default()
    };

    let file_path = get_test_file_path("images/ocr_image.jpg");

    let mut handles = vec![];
    for thread_id in 0..50 {
        let file_path = file_path.clone();
        let config = config.clone();

        handles.push(std::thread::spawn(move || {
            let result = extract_file_sync(&file_path, None, &config);
            assert!(
                result.is_ok(),
                "Thread {} OCR should succeed: {:?}",
                thread_id,
                result.err()
            );
            result.unwrap()
        }));
    }

    let mut results = vec![];
    for handle in handles {
        let extraction = handle.join().expect("Thread should not panic");
        assert!(!extraction.content.is_empty(), "OCR should extract text");
        results.push(extraction);
    }

    let first_content = &results[0].content;
    for (i, result) in results.iter().enumerate().skip(1) {
        assert_eq!(
            &result.content, first_content,
            "Result {} differs from first - thread-safety issue",
            i
        );
    }

    println!("Successfully completed 50 concurrent Tesseract API calls with consistent results");
}

/// Stress test: Sustained concurrent OCR load over time.
///
/// Validates that:
/// - System remains stable under prolonged concurrent OCR
/// - No resource leaks or degradation over time
/// - Throughput remains consistent
#[test]
fn test_sustained_concurrent_ocr_load() {
    use helpers::{get_test_file_path, skip_if_missing};

    if skip_if_missing("images/ocr_image.jpg") {
        tracing::debug!("Skipping sustained load test: test file not available");
        return;
    }

    let processor = Arc::new(OcrProcessor::new(None).expect("Should create processor"));
    let config = Arc::new(TesseractConfig {
        use_cache: false,
        ..Default::default()
    });

    let file_path = get_test_file_path("images/ocr_image.jpg");
    let total_processed = Arc::new(AtomicUsize::new(0));

    let mut handles = vec![];
    for worker_id in 0..20 {
        let processor = Arc::clone(&processor);
        let config = Arc::clone(&config);
        let file_path = file_path.clone();
        let total = Arc::clone(&total_processed);

        handles.push(std::thread::spawn(move || {
            for batch in 0..2 {
                let file_paths: Vec<String> = (0..5).map(|_| file_path.to_string_lossy().to_string()).collect();

                let results = processor.process_files_batch(file_paths, &config);

                let success_count = results.iter().filter(|r| r.success).count();
                assert_eq!(
                    success_count, 5,
                    "Worker {} batch {} should process 5 images",
                    worker_id, batch
                );

                total.fetch_add(success_count, Ordering::Relaxed);
            }
        }));
    }

    for handle in handles {
        handle.join().expect("Worker should not panic");
    }

    let total = total_processed.load(Ordering::Relaxed);
    assert_eq!(total, 200, "Should process 200 total images (20 workers × 10 images)");

    println!("Successfully sustained 20 concurrent workers processing 200 total images");
}

/// Stress test: Concurrent cache access during batch OCR.
///
/// Validates that:
/// - Cache is thread-safe under concurrent batch operations
/// - Cache hits work correctly with Rayon parallelism
/// - No cache corruption or race conditions
#[test]
fn test_concurrent_batch_with_cache() {
    use helpers::{get_test_file_path, skip_if_missing};

    if skip_if_missing("images/ocr_image.jpg") {
        tracing::debug!("Skipping cache stress test: test file not available");
        return;
    }

    let temp_dir = tempfile::tempdir().expect("Should create temp dir");
    let processor = OcrProcessor::new(Some(temp_dir.path().to_path_buf())).expect("Should create processor");
    let config = TesseractConfig {
        use_cache: true,
        ..Default::default()
    };

    let file_path = get_test_file_path("images/ocr_image.jpg");

    let warm_paths: Vec<String> = (0..10).map(|_| file_path.to_string_lossy().to_string()).collect();
    let _ = processor.process_files_batch(warm_paths, &config);

    let processor = Arc::new(processor);
    let config = Arc::new(config);
    let mut handles = vec![];
    let total_successes = Arc::new(AtomicUsize::new(0));

    for _ in 0..10 {
        let processor = Arc::clone(&processor);
        let config = Arc::clone(&config);
        let file_path = file_path.clone();
        let total = Arc::clone(&total_successes);

        handles.push(std::thread::spawn(move || {
            let file_paths: Vec<String> = (0..5).map(|_| file_path.to_string_lossy().to_string()).collect();

            let results = processor.process_files_batch(file_paths, &config);

            let success_count = results.iter().filter(|r| r.success).count();
            total.fetch_add(success_count, Ordering::Relaxed);

            results
        }));
    }

    for handle in handles {
        let results = handle.join().expect("Thread should not panic");
        assert_eq!(results.len(), 5, "Each batch should process 5 images");

        let success_count = results.iter().filter(|r| r.success).count();
        assert_eq!(success_count, 5, "All 5 should succeed (from cache)");
    }

    let total = total_successes.load(Ordering::Relaxed);
    assert_eq!(total, 50, "Should process 50 total images (10 batches × 5 images)");

    println!("Successfully completed 10 concurrent cached batches with 50 total images");
}

/// Stress test: Rayon parallel performance comparison.
///
/// Validates that:
/// - Rayon parallelization provides significant speedup
/// - Parallel batch is faster than sequential
/// - Speedup scales reasonably with CPU cores
#[test]
fn test_rayon_parallel_speedup() {
    use helpers::{get_test_file_path, skip_if_missing};

    if std::env::var("CI").is_ok() {
        tracing::warn!("Skipping Rayon speedup test on CI to avoid flaky timing-based failures");
        return;
    }

    if skip_if_missing("images/ocr_image.jpg") {
        tracing::debug!("Skipping Rayon speedup test: test file not available");
        return;
    }

    let processor = OcrProcessor::new(None).expect("Should create processor");
    let config = TesseractConfig {
        use_cache: false,
        ..Default::default()
    };

    let file_path = get_test_file_path("images/ocr_image.jpg");
    let test_size = 20;

    let sequential_start = Instant::now();
    for _ in 0..test_size {
        let result = processor.process_file(&file_path.to_string_lossy(), &config);
        assert!(result.is_ok(), "Sequential OCR should succeed");
    }
    let sequential_duration = sequential_start.elapsed();

    let file_paths: Vec<String> = (0..test_size)
        .map(|_| file_path.to_string_lossy().to_string())
        .collect();

    let parallel_start = Instant::now();
    let results = processor.process_files_batch(file_paths, &config);
    let parallel_duration = parallel_start.elapsed();

    assert_eq!(results.len(), test_size as usize, "Should process all images");
    let success_count = results.iter().filter(|r| r.success).count();
    assert_eq!(success_count, test_size as usize, "All should succeed");

    let speedup = sequential_duration.as_secs_f64() / parallel_duration.as_secs_f64();

    println!(
        "Sequential: {:?}, Parallel (Rayon): {:?}, Speedup: {:.2}x",
        sequential_duration, parallel_duration, speedup
    );

    let cpu_cores = num_cpus::get().max(2) as f64;
    let dynamic_target = 1.0 + (cpu_cores.min(8.0) - 1.0) * 0.01;
    let floor = if cfg!(target_os = "macos") {
        // macOS runners throttle parallelism heavily, so keep the minimum bar very modest ~keep
        1.005
    } else {
        1.01
    };
    let required_speedup = dynamic_target.max(floor);

    assert!(
        speedup >= required_speedup,
        "Rayon parallel should be at least {:.2}x faster than sequential, got {:.2}x",
        required_speedup,
        speedup
    );
}

/// Stress test: Mixed valid and invalid files in batch.
///
/// Validates that:
/// - Rayon batch handles errors gracefully
/// - One failure doesn't affect other parallel operations
/// - Error reporting is correct under parallelism
#[test]
fn test_rayon_batch_error_handling() {
    let processor = OcrProcessor::new(None).expect("Should create processor");
    let config = TesseractConfig::default();

    let mut file_paths = vec![];

    for i in 0..10 {
        file_paths.push(format!("/nonexistent/file_{}.jpg", i));
    }

    let results = processor.process_files_batch(file_paths, &config);

    assert_eq!(results.len(), 10, "Should return results for all files");

    for (i, result) in results.iter().enumerate() {
        assert!(!result.success, "Result {} should fail (file doesn't exist)", i);
        assert!(result.error.is_some(), "Result {} should have error message", i);
        assert!(result.result.is_none(), "Result {} should not have OCR result", i);
    }

    println!("Successfully handled 10 file errors in parallel batch");
}
```