kreuzberg 4.0.0.rc2 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +391 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
//! Page marker insertion tests.
|
|
2
|
+
//!
|
|
3
|
+
//! Tests the page marker feature that inserts markers before each page in extracted content.
|
|
4
|
+
//! This is critical for downstream applications that need to know where page boundaries are
|
|
5
|
+
//! in the text stream.
|
|
6
|
+
|
|
7
|
+
#![cfg(feature = "pdf")]
|
|
8
|
+
|
|
9
|
+
mod helpers;
|
|
10
|
+
|
|
11
|
+
use helpers::*;
|
|
12
|
+
use kreuzberg::core::config::{ExtractionConfig, PageConfig};
|
|
13
|
+
use kreuzberg::extract_file_sync;
|
|
14
|
+
|
|
15
|
+
/// Test that page markers are inserted when enabled.
///
/// Extracts `pdfs/sample.pdf` with `insert_page_markers` set to `true` (all other
/// options left at their defaults) and asserts that the default marker prefix
/// `<!-- PAGE` occurs in the extracted content. Returns early without asserting
/// anything when `skip_if_missing` returns `true` for the fixture.
#[test]
fn test_page_markers_inserted_when_enabled() {
    if skip_if_missing("pdfs/sample.pdf") {
        return;
    }

    let file_path = get_test_file_path("pdfs/sample.pdf");
    let config = ExtractionConfig {
        pages: Some(PageConfig {
            insert_page_markers: true,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file_sync(&file_path, None, &config).expect("Failed to extract PDF with page markers");

    // Default marker format is "\n\n<!-- PAGE {page_num} -->\n\n"
    assert!(
        result.content.contains("<!-- PAGE"),
        "Content should contain page markers when insert_page_markers is true. Content: {}",
        // Build the preview from characters, not a byte range: slicing a `str` at a byte
        // offset that falls inside a multi-byte UTF-8 sequence panics, which would replace
        // this diagnostic with an unrelated "not a char boundary" panic.
        result.content.chars().take(500).collect::<String>()
    );
}
|
|
40
|
+
|
|
41
|
+
/// Test that page 1 gets a marker (regression test for the bug where page 1 was skipped).
///
/// Extracts `pdfs/sample.pdf` with `insert_page_markers` set to `true` and asserts that
/// the literal `<!-- PAGE 1 -->` occurs in the extracted content. Returns early without
/// asserting anything when `skip_if_missing` returns `true` for the fixture.
#[test]
fn test_page_1_gets_marker() {
    if skip_if_missing("pdfs/sample.pdf") {
        return;
    }

    let file_path = get_test_file_path("pdfs/sample.pdf");
    let config = ExtractionConfig {
        pages: Some(PageConfig {
            insert_page_markers: true,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file_sync(&file_path, None, &config).expect("Failed to extract PDF with page markers");

    // Page 1 should have a marker somewhere in the content
    assert!(
        result.content.contains("<!-- PAGE 1 -->"),
        "Content should contain marker for page 1. Content start: {}",
        // Character-based preview: a byte-range slice would panic if the cut-off landed
        // in the middle of a multi-byte UTF-8 sequence and mask this failure message.
        result.content.chars().take(200).collect::<String>()
    );
}
|
|
66
|
+
|
|
67
|
+
/// Checks that a caller-supplied `marker_format` is used for the inserted markers.
///
/// The fixture is extracted with the format `=== Page {page_num} ===`, and the
/// content must then contain `=== Page 1 ===`. The test is a no-op when
/// `skip_if_missing` returns `true` for the fixture.
#[test]
fn test_custom_marker_format() {
    let fixture = "pdfs/sample.pdf";
    if skip_if_missing(fixture) {
        return;
    }

    let custom_format = "=== Page {page_num} ===";
    let page_settings = PageConfig {
        insert_page_markers: true,
        marker_format: custom_format.to_string(),
        ..Default::default()
    };
    let config = ExtractionConfig {
        pages: Some(page_settings),
        ..Default::default()
    };

    let file_path = get_test_file_path(fixture);
    let result = extract_file_sync(&file_path, None, &config).expect("Failed to extract PDF with custom markers");

    let has_custom_marker = result.content.contains("=== Page 1 ===");
    assert!(has_custom_marker, "Content should contain custom marker for page 1");
}
|
|
92
|
+
|
|
93
|
+
/// Checks that the `{page_num}` placeholder in `marker_format` is substituted.
///
/// With the format `[PAGE {page_num}]`, the extracted content must not contain the
/// raw placeholder text and must contain `[PAGE 1]`. The test is a no-op when
/// `skip_if_missing` returns `true` for the fixture.
#[test]
fn test_page_num_placeholder_replacement() {
    let fixture = "pdfs/sample.pdf";
    if skip_if_missing(fixture) {
        return;
    }

    let page_settings = PageConfig {
        insert_page_markers: true,
        marker_format: "[PAGE {page_num}]".to_string(),
        ..Default::default()
    };
    let config = ExtractionConfig {
        pages: Some(page_settings),
        ..Default::default()
    };

    let file_path = get_test_file_path(fixture);
    let result = extract_file_sync(&file_path, None, &config).expect("Failed to extract PDF with custom markers");
    let text = &result.content;

    // The literal placeholder must be gone from the output...
    let placeholder_left_over = text.contains("{page_num}");
    assert!(
        !placeholder_left_over,
        "Placeholder should be replaced, not appear in output"
    );

    // ...and the rendered marker for the first page must be there instead.
    let has_rendered_marker = text.contains("[PAGE 1]");
    assert!(has_rendered_marker, "Should contain marker with actual page number");
}
|
|
124
|
+
|
|
125
|
+
/// Checks that `insert_page_markers` and `extract_pages` can be enabled at the same time.
///
/// Both flags are set together with the format `--- PAGE {page_num} ---`. The result
/// must carry a `pages` value and the content must contain `--- PAGE 1 ---`. The test
/// is a no-op when `skip_if_missing` returns `true` for the fixture.
#[test]
fn test_markers_and_extract_pages_together() {
    let fixture = "pdfs/sample.pdf";
    if skip_if_missing(fixture) {
        return;
    }

    // Every PageConfig field is spelled out here, so no `..Default::default()` is used.
    let page_settings = PageConfig {
        insert_page_markers: true,
        extract_pages: true,
        marker_format: "--- PAGE {page_num} ---".to_string(),
    };
    let config = ExtractionConfig {
        pages: Some(page_settings),
        ..Default::default()
    };

    let file_path = get_test_file_path(fixture);
    let result = extract_file_sync(&file_path, None, &config).expect("Failed to extract PDF with both features");

    // Per-page output must be populated...
    let pages_present = result.pages.is_some();
    assert!(
        pages_present,
        "Pages array should be present when extract_pages is true"
    );

    // ...and the markers must still be written into the combined content.
    let marker_present = result.content.contains("--- PAGE 1 ---");
    assert!(marker_present, "Content should contain page markers");
}
|
|
155
|
+
|
|
156
|
+
/// Checks that no default marker is emitted when `insert_page_markers` is `false`.
///
/// The extracted content must not contain the default marker prefix `<!-- PAGE`.
/// The test is a no-op when `skip_if_missing` returns `true` for the fixture.
#[test]
fn test_no_markers_when_disabled() {
    let fixture = "pdfs/sample.pdf";
    if skip_if_missing(fixture) {
        return;
    }

    let page_settings = PageConfig {
        insert_page_markers: false,
        ..Default::default()
    };
    let config = ExtractionConfig {
        pages: Some(page_settings),
        ..Default::default()
    };

    let file_path = get_test_file_path(fixture);
    let result = extract_file_sync(&file_path, None, &config).expect("Failed to extract PDF without markers");

    // The default marker prefix must be absent from the output.
    let marker_found = result.content.contains("<!-- PAGE");
    assert!(
        !marker_found,
        "Content should not contain markers when insert_page_markers is false"
    );
}
|
|
180
|
+
|
|
181
|
+
/// Checks that the page 1 marker is placed ahead of the page text rather than after it.
///
/// Uses the format `[[PAGE {page_num}]]` and requires the byte offset of `[[PAGE 1]]`
/// in the extracted content to be below 50. The test is a no-op when `skip_if_missing`
/// returns `true` for the fixture.
#[test]
fn test_marker_appears_before_content() {
    let fixture = "pdfs/sample.pdf";
    if skip_if_missing(fixture) {
        return;
    }

    let page_settings = PageConfig {
        insert_page_markers: true,
        marker_format: "[[PAGE {page_num}]]".to_string(),
        ..Default::default()
    };
    let config = ExtractionConfig {
        pages: Some(page_settings),
        ..Default::default()
    };

    let file_path = get_test_file_path(fixture);
    let result = extract_file_sync(&file_path, None, &config).expect("Failed to extract PDF with markers");

    // The marker has to exist at all before its position can be judged.
    let marker_offset = result
        .content
        .find("[[PAGE 1]]")
        .expect("Marker should be present");

    // `find` yields a byte offset; the marker must sit within the first 50 bytes.
    assert!(
        marker_offset < 50,
        "Marker for page 1 should appear at the start, but found at position {}",
        marker_offset
    );
}
|
|
212
|
+
|
|
213
|
+
/// Test that multi-page PDFs get default-format markers for their leading pages.
///
/// Extraction runs with both `insert_page_markers` and `extract_pages`
/// enabled. The per-page array must be present and non-empty, so the marker
/// checks below can never be skipped silently. Only the first three pages
/// (or fewer, for shorter documents) are checked.
///
/// Returns early when the `pdfs/sample.pdf` fixture is unavailable.
#[test]
fn test_multi_page_markers() {
    if skip_if_missing("pdfs/sample.pdf") {
        return;
    }

    let file_path = get_test_file_path("pdfs/sample.pdf");
    let config = ExtractionConfig {
        pages: Some(PageConfig {
            insert_page_markers: true,
            extract_pages: true,
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file_sync(&file_path, None, &config).expect("Failed to extract PDF");

    // A missing pages array used to make this test pass without asserting
    // anything; fail loudly instead.
    let pages = result
        .pages
        .as_ref()
        .expect("Pages array should be present when extract_pages is true");
    let page_count = pages.len();
    assert!(page_count > 0, "At least one page should be extracted");

    // Check that we have markers for each of the leading pages (capped at three).
    for page_num in 1..=page_count.min(3) {
        let marker = format!("<!-- PAGE {} -->", page_num);
        assert!(
            result.content.contains(&marker),
            "Should contain marker for page {} (total pages: {})",
            page_num,
            page_count
        );
    }
}
|
|
247
|
+
|
|
248
|
+
/// Pins the marker template that `PageConfig::default()` produces.
#[test]
fn test_default_marker_format() {
    let expected = "\n\n<!-- PAGE {page_num} -->\n\n";
    let defaults = PageConfig::default();

    assert_eq!(
        defaults.marker_format, expected,
        "Default marker format should match expected value"
    );
}
|
|
257
|
+
|
|
258
|
+
/// Placeholder for the "empty page still gets a marker" scenario.
///
/// No extraction is performed here: covering the real behaviour would need a
/// fixture PDF containing an empty page. The test only confirms that a
/// `PageConfig` built with `insert_page_markers: true` reports the flag as set.
#[test]
fn test_empty_page_gets_marker() {
    let config = PageConfig {
        insert_page_markers: true,
        ..Default::default()
    };

    let markers_enabled = config.insert_page_markers;
    assert!(
        markers_enabled,
        "Config should enable markers regardless of page content"
    );
}
|
|
273
|
+
|
|
274
|
+
/// Edge case: a marker template that mentions `{page_num}` more than once.
///
/// Every occurrence of the placeholder is expected to be substituted, so the
/// page-1 marker must read `Page 1 of document (page 1)`. Returns early when
/// the `pdfs/sample.pdf` fixture is unavailable.
#[test]
fn test_marker_format_multiple_placeholders() {
    if skip_if_missing("pdfs/sample.pdf") {
        return;
    }

    let file_path = get_test_file_path("pdfs/sample.pdf");
    let page_config = PageConfig {
        insert_page_markers: true,
        marker_format: "Page {page_num} of document (page {page_num})".to_string(),
        ..Default::default()
    };
    let config = ExtractionConfig {
        pages: Some(page_config),
        ..Default::default()
    };

    let result = extract_file_sync(&file_path, None, &config).expect("Failed to extract PDF");

    let fully_substituted = result.content.contains("Page 1 of document (page 1)");
    assert!(
        fully_substituted,
        "Multiple {{page_num}} placeholders should all be replaced"
    );
}
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
//! Integration tests for PDF text hierarchy detection.
|
|
2
|
+
//!
|
|
3
|
+
//! Tests the extraction and detection of document hierarchy levels (H1-H6)
|
|
4
|
+
//! from PDF text using font size clustering and semantic analysis.
|
|
5
|
+
|
|
6
|
+
#![cfg(feature = "pdf")]
|
|
7
|
+
|
|
8
|
+
use kreuzberg::core::config::{ExtractionConfig, HierarchyConfig, PageConfig, PdfConfig};
|
|
9
|
+
use kreuzberg::extract_bytes;
|
|
10
|
+
use std::path::Path;
|
|
11
|
+
|
|
12
|
+
// Note: Pdfium can only be initialized once, so these tests cannot all run in the same test process.
|
|
13
|
+
// Using tokio::test with single_threaded doesn't work well, and the serial_test crate is not used in this file.
|
|
14
|
+
// For now, only one Pdfium-backed test runs by default; the remaining tests are marked #[ignore].
|
|
15
|
+
|
|
16
|
+
/// Test full hierarchy extraction from a real PDF.
|
|
17
|
+
///
|
|
18
|
+
/// Loads a PDF from test data directory, extracts with hierarchy detection enabled,
|
|
19
|
+
/// and verifies that PageContent.hierarchy is properly populated with expected
|
|
20
|
+
/// blocks and hierarchy levels.
|
|
21
|
+
#[tokio::test]
|
|
22
|
+
async fn test_full_hierarchy_extraction() {
|
|
23
|
+
// Use the embedded_images_tables.pdf which has clear text structure
|
|
24
|
+
// Path is relative to workspace root, not crate root
|
|
25
|
+
let pdf_path = "../../test_documents/pdfs/embedded_images_tables.pdf";
|
|
26
|
+
|
|
27
|
+
if !Path::new(pdf_path).exists() {
|
|
28
|
+
eprintln!("Test PDF not found at: {}", pdf_path);
|
|
29
|
+
// Skip the test if PDF doesn't exist
|
|
30
|
+
return;
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");
|
|
34
|
+
|
|
35
|
+
// Create extraction config with hierarchy detection enabled
|
|
36
|
+
let config = ExtractionConfig {
|
|
37
|
+
pages: Some(PageConfig {
|
|
38
|
+
extract_pages: true,
|
|
39
|
+
insert_page_markers: false,
|
|
40
|
+
marker_format: "\n\n<!-- PAGE {page_num} -->\n\n".to_string(),
|
|
41
|
+
}),
|
|
42
|
+
pdf_options: Some(PdfConfig {
|
|
43
|
+
extract_images: false,
|
|
44
|
+
passwords: None,
|
|
45
|
+
extract_metadata: true,
|
|
46
|
+
hierarchy: Some(HierarchyConfig {
|
|
47
|
+
enabled: true,
|
|
48
|
+
k_clusters: 6,
|
|
49
|
+
include_bbox: true,
|
|
50
|
+
ocr_coverage_threshold: None,
|
|
51
|
+
}),
|
|
52
|
+
}),
|
|
53
|
+
..Default::default()
|
|
54
|
+
};
|
|
55
|
+
|
|
56
|
+
// Extract the PDF
|
|
57
|
+
let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
|
|
58
|
+
.await
|
|
59
|
+
.expect("PDF extraction failed");
|
|
60
|
+
|
|
61
|
+
// Verify that pages were extracted
|
|
62
|
+
assert!(
|
|
63
|
+
result.pages.is_some(),
|
|
64
|
+
"Pages should be extracted when extract_pages is enabled"
|
|
65
|
+
);
|
|
66
|
+
|
|
67
|
+
let pages = result.pages.as_ref().unwrap();
|
|
68
|
+
assert!(!pages.is_empty(), "At least one page should be extracted");
|
|
69
|
+
|
|
70
|
+
// Check that the first page has hierarchy information
|
|
71
|
+
let first_page = &pages[0];
|
|
72
|
+
assert!(
|
|
73
|
+
first_page.hierarchy.is_some(),
|
|
74
|
+
"First page should have hierarchy information when hierarchy extraction is enabled"
|
|
75
|
+
);
|
|
76
|
+
|
|
77
|
+
let hierarchy = first_page.hierarchy.as_ref().unwrap();
|
|
78
|
+
|
|
79
|
+
// Verify hierarchy structure
|
|
80
|
+
assert!(hierarchy.block_count > 0, "Hierarchy should contain at least one block");
|
|
81
|
+
assert!(!hierarchy.blocks.is_empty(), "Hierarchy blocks should not be empty");
|
|
82
|
+
|
|
83
|
+
eprintln!("Extracted {} hierarchy blocks from page 1", hierarchy.block_count);
|
|
84
|
+
|
|
85
|
+
// Verify that we have multiple hierarchy levels
|
|
86
|
+
let levels: std::collections::HashSet<String> = hierarchy.blocks.iter().map(|b| b.level.clone()).collect();
|
|
87
|
+
|
|
88
|
+
eprintln!("Found hierarchy levels: {:?}", levels);
|
|
89
|
+
|
|
90
|
+
// Should have at least 1 level
|
|
91
|
+
assert!(!levels.is_empty(), "Should have at least one hierarchy level");
|
|
92
|
+
|
|
93
|
+
// Verify block structure
|
|
94
|
+
for block in &hierarchy.blocks {
|
|
95
|
+
assert!(!block.text.is_empty(), "Block text should not be empty");
|
|
96
|
+
assert!(block.font_size > 0.0, "Font size should be positive");
|
|
97
|
+
|
|
98
|
+
// Check that level is a valid heading level or body
|
|
99
|
+
let is_valid_level = matches!(block.level.as_str(), "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "body");
|
|
100
|
+
assert!(is_valid_level, "Invalid hierarchy level: {}", block.level);
|
|
101
|
+
|
|
102
|
+
// Verify bounding box if present
|
|
103
|
+
if let Some((left, top, right, bottom)) = block.bbox {
|
|
104
|
+
assert!(left < right, "Bounding box left should be less than right");
|
|
105
|
+
assert!(top < bottom, "Bounding box top should be less than bottom");
|
|
106
|
+
assert!(
|
|
107
|
+
left >= 0.0 && top >= 0.0,
|
|
108
|
+
"Bounding box coordinates should be non-negative"
|
|
109
|
+
);
|
|
110
|
+
eprintln!(
|
|
111
|
+
"Block '{}' (level: {}, font_size: {}) bbox: ({}, {}, {}, {})",
|
|
112
|
+
block.text.chars().take(30).collect::<String>(),
|
|
113
|
+
block.level,
|
|
114
|
+
block.font_size,
|
|
115
|
+
left,
|
|
116
|
+
top,
|
|
117
|
+
right,
|
|
118
|
+
bottom
|
|
119
|
+
);
|
|
120
|
+
} else {
|
|
121
|
+
eprintln!(
|
|
122
|
+
"Block '{}' (level: {}, font_size: {}) no bbox",
|
|
123
|
+
block.text.chars().take(30).collect::<String>(),
|
|
124
|
+
block.level,
|
|
125
|
+
block.font_size
|
|
126
|
+
);
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
eprintln!("Hierarchy extraction test passed!");
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
/// Test that hierarchy extraction respects the enabled flag.
|
|
134
|
+
/// Note: This test is combined with the full_hierarchy_extraction test due to Pdfium initialization constraints.
|
|
135
|
+
#[tokio::test]
|
|
136
|
+
#[ignore]
|
|
137
|
+
async fn test_hierarchy_disabled() {
|
|
138
|
+
let pdf_path = "../../test_documents/pdfs/embedded_images_tables.pdf";
|
|
139
|
+
|
|
140
|
+
if !Path::new(pdf_path).exists() {
|
|
141
|
+
eprintln!("Test PDF not found at: {}", pdf_path);
|
|
142
|
+
return;
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");
|
|
146
|
+
|
|
147
|
+
// Create extraction config with hierarchy detection disabled
|
|
148
|
+
let config = ExtractionConfig {
|
|
149
|
+
pages: Some(PageConfig {
|
|
150
|
+
extract_pages: true,
|
|
151
|
+
insert_page_markers: false,
|
|
152
|
+
marker_format: "\n\n<!-- PAGE {page_num} -->\n\n".to_string(),
|
|
153
|
+
}),
|
|
154
|
+
pdf_options: Some(PdfConfig {
|
|
155
|
+
extract_images: false,
|
|
156
|
+
passwords: None,
|
|
157
|
+
extract_metadata: true,
|
|
158
|
+
hierarchy: Some(HierarchyConfig {
|
|
159
|
+
enabled: false,
|
|
160
|
+
k_clusters: 6,
|
|
161
|
+
include_bbox: true,
|
|
162
|
+
ocr_coverage_threshold: None,
|
|
163
|
+
}),
|
|
164
|
+
}),
|
|
165
|
+
..Default::default()
|
|
166
|
+
};
|
|
167
|
+
|
|
168
|
+
let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
|
|
169
|
+
.await
|
|
170
|
+
.expect("PDF extraction failed");
|
|
171
|
+
|
|
172
|
+
// Verify that pages were extracted
|
|
173
|
+
assert!(result.pages.is_some(), "Pages should be extracted");
|
|
174
|
+
|
|
175
|
+
let pages = result.pages.as_ref().unwrap();
|
|
176
|
+
assert!(!pages.is_empty(), "At least one page should be extracted");
|
|
177
|
+
|
|
178
|
+
// Check that the first page does NOT have hierarchy information when disabled
|
|
179
|
+
let first_page = &pages[0];
|
|
180
|
+
assert!(
|
|
181
|
+
first_page.hierarchy.is_none(),
|
|
182
|
+
"First page should not have hierarchy when hierarchy extraction is disabled"
|
|
183
|
+
);
|
|
184
|
+
|
|
185
|
+
eprintln!("Hierarchy disabled test passed!");
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
/// Test different hierarchy configurations
|
|
189
|
+
/// Note: This test is ignored due to Pdfium initialization constraints (can only initialize once).
|
|
190
|
+
#[tokio::test]
|
|
191
|
+
#[ignore]
|
|
192
|
+
async fn test_hierarchy_with_explicit_disabled() {
|
|
193
|
+
let pdf_path = "../../test_documents/pdfs/embedded_images_tables.pdf";
|
|
194
|
+
|
|
195
|
+
if !Path::new(pdf_path).exists() {
|
|
196
|
+
eprintln!("Test PDF not found at: {}", pdf_path);
|
|
197
|
+
return;
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");
|
|
201
|
+
|
|
202
|
+
// Create extraction config with hierarchy extraction explicitly disabled
|
|
203
|
+
let config = ExtractionConfig {
|
|
204
|
+
pages: Some(PageConfig {
|
|
205
|
+
extract_pages: true,
|
|
206
|
+
insert_page_markers: false,
|
|
207
|
+
marker_format: "\n\n<!-- PAGE {page_num} -->\n\n".to_string(),
|
|
208
|
+
}),
|
|
209
|
+
pdf_options: Some(PdfConfig {
|
|
210
|
+
extract_images: false,
|
|
211
|
+
passwords: None,
|
|
212
|
+
extract_metadata: true,
|
|
213
|
+
hierarchy: Some(HierarchyConfig {
|
|
214
|
+
enabled: false,
|
|
215
|
+
k_clusters: 6,
|
|
216
|
+
include_bbox: true,
|
|
217
|
+
ocr_coverage_threshold: None,
|
|
218
|
+
}),
|
|
219
|
+
}),
|
|
220
|
+
..Default::default()
|
|
221
|
+
};
|
|
222
|
+
|
|
223
|
+
let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
|
|
224
|
+
.await
|
|
225
|
+
.expect("PDF extraction failed");
|
|
226
|
+
|
|
227
|
+
// Verify that pages were extracted
|
|
228
|
+
assert!(result.pages.is_some(), "Pages should be extracted");
|
|
229
|
+
|
|
230
|
+
let pages = result.pages.as_ref().unwrap();
|
|
231
|
+
assert!(!pages.is_empty(), "At least one page should be extracted");
|
|
232
|
+
|
|
233
|
+
// Check that the first page does NOT have hierarchy information when disabled
|
|
234
|
+
let first_page = &pages[0];
|
|
235
|
+
assert!(
|
|
236
|
+
first_page.hierarchy.is_none(),
|
|
237
|
+
"First page should not have hierarchy when hierarchy extraction is disabled"
|
|
238
|
+
);
|
|
239
|
+
|
|
240
|
+
eprintln!("Hierarchy with explicit disabled test passed!");
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
/// Test hierarchy extraction with different cluster configurations.
|
|
244
|
+
/// Note: This test is ignored due to Pdfium initialization constraints (can only initialize once).
|
|
245
|
+
#[tokio::test]
|
|
246
|
+
#[ignore]
|
|
247
|
+
async fn test_hierarchy_different_k_clusters() {
|
|
248
|
+
let pdf_path = "../../test_documents/pdfs/embedded_images_tables.pdf";
|
|
249
|
+
|
|
250
|
+
if !Path::new(pdf_path).exists() {
|
|
251
|
+
eprintln!("Test PDF not found at: {}", pdf_path);
|
|
252
|
+
return;
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
let pdf_bytes = std::fs::read(pdf_path).expect("Failed to read PDF file");
|
|
256
|
+
|
|
257
|
+
// Test with different k values
|
|
258
|
+
for k in &[2, 4, 6] {
|
|
259
|
+
let config = ExtractionConfig {
|
|
260
|
+
pages: Some(PageConfig {
|
|
261
|
+
extract_pages: true,
|
|
262
|
+
insert_page_markers: false,
|
|
263
|
+
marker_format: "\n\n<!-- PAGE {page_num} -->\n\n".to_string(),
|
|
264
|
+
}),
|
|
265
|
+
pdf_options: Some(PdfConfig {
|
|
266
|
+
extract_images: false,
|
|
267
|
+
passwords: None,
|
|
268
|
+
extract_metadata: true,
|
|
269
|
+
hierarchy: Some(HierarchyConfig {
|
|
270
|
+
enabled: true,
|
|
271
|
+
k_clusters: *k,
|
|
272
|
+
include_bbox: true,
|
|
273
|
+
ocr_coverage_threshold: None,
|
|
274
|
+
}),
|
|
275
|
+
}),
|
|
276
|
+
..Default::default()
|
|
277
|
+
};
|
|
278
|
+
|
|
279
|
+
let result = extract_bytes(&pdf_bytes, "application/pdf", &config)
|
|
280
|
+
.await
|
|
281
|
+
.expect("PDF extraction failed");
|
|
282
|
+
|
|
283
|
+
assert!(result.pages.is_some(), "Pages should be extracted");
|
|
284
|
+
|
|
285
|
+
let pages = result.pages.as_ref().unwrap();
|
|
286
|
+
assert!(!pages.is_empty(), "At least one page should be extracted");
|
|
287
|
+
|
|
288
|
+
let first_page = &pages[0];
|
|
289
|
+
assert!(
|
|
290
|
+
first_page.hierarchy.is_some(),
|
|
291
|
+
"Hierarchy should be present with k={}",
|
|
292
|
+
k
|
|
293
|
+
);
|
|
294
|
+
|
|
295
|
+
let hierarchy = first_page.hierarchy.as_ref().unwrap();
|
|
296
|
+
eprintln!("K={}: {} hierarchy blocks extracted", k, hierarchy.block_count);
|
|
297
|
+
assert!(hierarchy.block_count > 0, "Should have blocks with k={}", k);
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
eprintln!("Different k_clusters test passed!");
|
|
301
|
+
}
|