kreuzberg 4.0.0.rc2 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +391 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
//! Tests for smart OCR triggering based on text coverage.
|
|
2
|
+
//!
|
|
3
|
+
//! This module tests the logic for determining when OCR should be triggered
|
|
4
|
+
//! based on text block coverage of a PDF page. The tests use TDD approach
|
|
5
|
+
//! with edge cases for threshold boundary conditions.
|
|
6
|
+
//!
|
|
7
|
+
//! Test philosophy:
|
|
8
|
+
//! - Verify OCR is triggered when text coverage is below threshold
|
|
9
|
+
//! - Verify OCR is not triggered when text coverage is above threshold
|
|
10
|
+
//! - Test exact threshold edge cases (50% boundary)
|
|
11
|
+
//! - Use real PDF files for reliable page dimensions
|
|
12
|
+
//!
|
|
13
|
+
//! NOTE: Tests use real PDF files from test_documents directory to avoid
|
|
14
|
+
//! pdfium initialization issues that occur with dynamically created PDFs.
|
|
15
|
+
|
|
16
|
+
#![cfg(feature = "pdf")]
|
|
17
|
+
|
|
18
|
+
use kreuzberg::core::config::{ExtractionConfig, HierarchyConfig, PdfConfig};
|
|
19
|
+
use kreuzberg::pdf::hierarchy::{BoundingBox, TextBlock, should_trigger_ocr};
|
|
20
|
+
use pdfium_render::prelude::*;
|
|
21
|
+
use std::path::Path;
|
|
22
|
+
|
|
23
|
+
/// Helper function to get test PDF path
|
|
24
|
+
fn get_test_pdf_path() -> String {
|
|
25
|
+
"test_documents/pdfs_with_tables/tiny.pdf".to_string()
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
/// Helper function to skip test if PDF is unavailable
|
|
29
|
+
fn skip_if_no_pdf() -> bool {
|
|
30
|
+
let pdf_path = get_test_pdf_path();
|
|
31
|
+
if !Path::new(&pdf_path).exists() {
|
|
32
|
+
eprintln!("PDF file not found at {}, skipping test", pdf_path);
|
|
33
|
+
true
|
|
34
|
+
} else {
|
|
35
|
+
false
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
/// Test OCR triggering with low text coverage (20% coverage → should trigger).
|
|
40
|
+
///
|
|
41
|
+
/// Creates mock text blocks that cover only ~20% of the page.
|
|
42
|
+
/// Expected: OCR should trigger since 20% < 50%
|
|
43
|
+
#[test]
|
|
44
|
+
fn test_ocr_trigger_low_coverage() {
|
|
45
|
+
if skip_if_no_pdf() {
|
|
46
|
+
return;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
let pdfium = Pdfium;
|
|
50
|
+
let document = pdfium
|
|
51
|
+
.load_pdf_from_file(get_test_pdf_path().as_str(), None)
|
|
52
|
+
.expect("Should load PDF document");
|
|
53
|
+
|
|
54
|
+
let page = document.pages().get(0).expect("Should get first page");
|
|
55
|
+
|
|
56
|
+
// Create text blocks covering only ~20% of the page
|
|
57
|
+
let blocks = vec![
|
|
58
|
+
TextBlock {
|
|
59
|
+
text: "Small text block 1".to_string(),
|
|
60
|
+
bbox: BoundingBox {
|
|
61
|
+
left: 0.0,
|
|
62
|
+
top: 0.0,
|
|
63
|
+
right: 150.0,
|
|
64
|
+
bottom: 200.0,
|
|
65
|
+
},
|
|
66
|
+
font_size: 12.0,
|
|
67
|
+
},
|
|
68
|
+
TextBlock {
|
|
69
|
+
text: "Small text block 2".to_string(),
|
|
70
|
+
bbox: BoundingBox {
|
|
71
|
+
left: 200.0,
|
|
72
|
+
top: 200.0,
|
|
73
|
+
right: 300.0,
|
|
74
|
+
bottom: 350.0,
|
|
75
|
+
},
|
|
76
|
+
font_size: 12.0,
|
|
77
|
+
},
|
|
78
|
+
];
|
|
79
|
+
|
|
80
|
+
let config = ExtractionConfig::default();
|
|
81
|
+
|
|
82
|
+
// Should trigger OCR because coverage is below 50%
|
|
83
|
+
assert!(
|
|
84
|
+
should_trigger_ocr(&page, &blocks, &config),
|
|
85
|
+
"OCR should trigger when text coverage is below 50%"
|
|
86
|
+
);
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
/// Test OCR not triggering with high text coverage (60% coverage → don't trigger).
|
|
90
|
+
///
|
|
91
|
+
/// Creates mock text blocks that cover approximately 60% of the page.
|
|
92
|
+
/// Expected: OCR should not trigger since 60% > 50%
|
|
93
|
+
#[test]
|
|
94
|
+
fn test_ocr_no_trigger_high_coverage() {
|
|
95
|
+
if skip_if_no_pdf() {
|
|
96
|
+
return;
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
let pdfium = Pdfium;
|
|
100
|
+
let document = pdfium
|
|
101
|
+
.load_pdf_from_file(get_test_pdf_path().as_str(), None)
|
|
102
|
+
.expect("Should load PDF document");
|
|
103
|
+
|
|
104
|
+
let page = document.pages().get(0).expect("Should get first page");
|
|
105
|
+
|
|
106
|
+
// Create text blocks covering approximately 60% of the page
|
|
107
|
+
// For a typical page (~612 x 792 points), this is a large block
|
|
108
|
+
let blocks = vec![TextBlock {
|
|
109
|
+
text: "Large text block covering most of the page".to_string(),
|
|
110
|
+
bbox: BoundingBox {
|
|
111
|
+
left: 0.0,
|
|
112
|
+
top: 0.0,
|
|
113
|
+
right: 400.0,
|
|
114
|
+
bottom: 600.0,
|
|
115
|
+
},
|
|
116
|
+
font_size: 12.0,
|
|
117
|
+
}];
|
|
118
|
+
|
|
119
|
+
let config = ExtractionConfig::default();
|
|
120
|
+
|
|
121
|
+
// Should NOT trigger OCR because coverage is above 50%
|
|
122
|
+
assert!(
|
|
123
|
+
!should_trigger_ocr(&page, &blocks, &config),
|
|
124
|
+
"OCR should not trigger when text coverage is above 50%"
|
|
125
|
+
);
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
/// Test OCR triggering at exact threshold boundary (50% edge case).
|
|
129
|
+
///
|
|
130
|
+
/// Creates mock text blocks that cover exactly ~50% of the page.
|
|
131
|
+
/// Expected: OCR should trigger at boundary (uses < not <=)
|
|
132
|
+
#[test]
|
|
133
|
+
fn test_ocr_trigger_exact_threshold() {
|
|
134
|
+
if skip_if_no_pdf() {
|
|
135
|
+
return;
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
let pdfium = Pdfium;
|
|
139
|
+
let document = pdfium
|
|
140
|
+
.load_pdf_from_file(get_test_pdf_path().as_str(), None)
|
|
141
|
+
.expect("Should load PDF document");
|
|
142
|
+
|
|
143
|
+
let page = document.pages().get(0).expect("Should get first page");
|
|
144
|
+
let page_width = page.width().value;
|
|
145
|
+
let page_height = page.height().value;
|
|
146
|
+
|
|
147
|
+
// Create text block covering exactly 50% of the page
|
|
148
|
+
let coverage_height = page_height * 0.5;
|
|
149
|
+
let blocks = vec![TextBlock {
|
|
150
|
+
text: "Text block at 50% threshold".to_string(),
|
|
151
|
+
bbox: BoundingBox {
|
|
152
|
+
left: 0.0,
|
|
153
|
+
top: 0.0,
|
|
154
|
+
right: page_width,
|
|
155
|
+
bottom: coverage_height,
|
|
156
|
+
},
|
|
157
|
+
font_size: 12.0,
|
|
158
|
+
}];
|
|
159
|
+
|
|
160
|
+
let config = ExtractionConfig::default();
|
|
161
|
+
|
|
162
|
+
// At exactly 50%, OCR should trigger (using < not <=)
|
|
163
|
+
// Due to floating point precision, we allow a small margin
|
|
164
|
+
let result = should_trigger_ocr(&page, &blocks, &config);
|
|
165
|
+
assert!(
|
|
166
|
+
result,
|
|
167
|
+
"OCR should trigger at or very near 50% threshold boundary (using < not <=)"
|
|
168
|
+
);
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
/// Test OCR with overlapping text blocks.
|
|
172
|
+
///
|
|
173
|
+
/// Verifies that overlapping blocks are handled correctly.
|
|
174
|
+
/// Both blocks contribute to coverage calculation.
|
|
175
|
+
#[test]
|
|
176
|
+
fn test_ocr_overlapping_blocks() {
|
|
177
|
+
if skip_if_no_pdf() {
|
|
178
|
+
return;
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
let pdfium = Pdfium;
|
|
182
|
+
let document = pdfium
|
|
183
|
+
.load_pdf_from_file(get_test_pdf_path().as_str(), None)
|
|
184
|
+
.expect("Should load PDF document");
|
|
185
|
+
|
|
186
|
+
let page = document.pages().get(0).expect("Should get first page");
|
|
187
|
+
|
|
188
|
+
// Create overlapping text blocks
|
|
189
|
+
// Block 1: covers 150x200 area
|
|
190
|
+
// Block 2: covers 200x200 area, overlaps with Block 1
|
|
191
|
+
// Total area contribution: both areas count (not union)
|
|
192
|
+
let blocks = vec![
|
|
193
|
+
TextBlock {
|
|
194
|
+
text: "Block 1".to_string(),
|
|
195
|
+
bbox: BoundingBox {
|
|
196
|
+
left: 0.0,
|
|
197
|
+
top: 0.0,
|
|
198
|
+
right: 150.0,
|
|
199
|
+
bottom: 200.0,
|
|
200
|
+
},
|
|
201
|
+
font_size: 12.0,
|
|
202
|
+
},
|
|
203
|
+
TextBlock {
|
|
204
|
+
text: "Block 2".to_string(),
|
|
205
|
+
bbox: BoundingBox {
|
|
206
|
+
left: 100.0,
|
|
207
|
+
top: 150.0,
|
|
208
|
+
right: 300.0,
|
|
209
|
+
bottom: 350.0,
|
|
210
|
+
},
|
|
211
|
+
font_size: 12.0,
|
|
212
|
+
},
|
|
213
|
+
];
|
|
214
|
+
|
|
215
|
+
let config = ExtractionConfig::default();
|
|
216
|
+
|
|
217
|
+
// Should trigger OCR because combined areas are still below 50%
|
|
218
|
+
assert!(
|
|
219
|
+
should_trigger_ocr(&page, &blocks, &config),
|
|
220
|
+
"OCR should trigger with overlapping blocks below threshold"
|
|
221
|
+
);
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
/// Test OCR with empty blocks list (no text → should trigger).
|
|
225
|
+
///
|
|
226
|
+
/// When there are no text blocks, coverage is 0%, so OCR should trigger.
|
|
227
|
+
#[test]
|
|
228
|
+
fn test_ocr_empty_blocks() {
|
|
229
|
+
if skip_if_no_pdf() {
|
|
230
|
+
return;
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
let pdfium = Pdfium;
|
|
234
|
+
let document = pdfium
|
|
235
|
+
.load_pdf_from_file(get_test_pdf_path().as_str(), None)
|
|
236
|
+
.expect("Should load PDF document");
|
|
237
|
+
|
|
238
|
+
let page = document.pages().get(0).expect("Should get first page");
|
|
239
|
+
|
|
240
|
+
let blocks = vec![];
|
|
241
|
+
let config = ExtractionConfig::default();
|
|
242
|
+
|
|
243
|
+
// Should trigger OCR because there are no text blocks (0% coverage)
|
|
244
|
+
assert!(
|
|
245
|
+
should_trigger_ocr(&page, &blocks, &config),
|
|
246
|
+
"OCR should trigger with empty blocks (0% coverage)"
|
|
247
|
+
);
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
/// Test OCR triggering with custom threshold from config.
|
|
251
|
+
///
|
|
252
|
+
/// If HierarchyConfig specifies a custom ocr_coverage_threshold, respect it.
|
|
253
|
+
/// With 30% coverage and custom 25% threshold, OCR should NOT trigger.
|
|
254
|
+
#[test]
|
|
255
|
+
fn test_ocr_custom_threshold() {
|
|
256
|
+
if skip_if_no_pdf() {
|
|
257
|
+
return;
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
let pdfium = Pdfium;
|
|
261
|
+
let document = pdfium
|
|
262
|
+
.load_pdf_from_file(get_test_pdf_path().as_str(), None)
|
|
263
|
+
.expect("Should load PDF document");
|
|
264
|
+
|
|
265
|
+
let page = document.pages().get(0).expect("Should get first page");
|
|
266
|
+
|
|
267
|
+
// Create text blocks covering ~30% of the page
|
|
268
|
+
let blocks = vec![TextBlock {
|
|
269
|
+
text: "Text block 30%".to_string(),
|
|
270
|
+
bbox: BoundingBox {
|
|
271
|
+
left: 0.0,
|
|
272
|
+
top: 0.0,
|
|
273
|
+
right: 250.0,
|
|
274
|
+
bottom: 600.0,
|
|
275
|
+
},
|
|
276
|
+
font_size: 12.0,
|
|
277
|
+
}];
|
|
278
|
+
|
|
279
|
+
// Set custom threshold to 25% instead of default 50%
|
|
280
|
+
let config = ExtractionConfig {
|
|
281
|
+
pdf_options: Some(PdfConfig {
|
|
282
|
+
extract_images: false,
|
|
283
|
+
passwords: None,
|
|
284
|
+
extract_metadata: true,
|
|
285
|
+
hierarchy: Some(HierarchyConfig {
|
|
286
|
+
enabled: true,
|
|
287
|
+
k_clusters: 6,
|
|
288
|
+
include_bbox: true,
|
|
289
|
+
ocr_coverage_threshold: Some(0.25),
|
|
290
|
+
}),
|
|
291
|
+
}),
|
|
292
|
+
..Default::default()
|
|
293
|
+
};
|
|
294
|
+
|
|
295
|
+
// With 30% coverage and custom 25% threshold, should NOT trigger OCR
|
|
296
|
+
// (30% > 25% threshold)
|
|
297
|
+
assert!(
|
|
298
|
+
!should_trigger_ocr(&page, &blocks, &config),
|
|
299
|
+
"OCR should respect custom threshold from config"
|
|
300
|
+
);
|
|
301
|
+
}
|