kreuzberg 4.0.0.rc2 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +391 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,589 @@
|
|
|
1
|
+
//! PDF hierarchy quality assessment tests.
|
|
2
|
+
//!
|
|
3
|
+
//! This module tests PDF text hierarchy extraction quality by comparing against ground truth annotations.
|
|
4
|
+
//! Measures precision, recall, F1 score, and level accuracy to ensure the hierarchy detection
|
|
5
|
+
//! algorithm works well on real document structures.
|
|
6
|
+
//!
|
|
7
|
+
//! Test philosophy:
|
|
8
|
+
//! - Define ground truth hierarchies for representative PDF documents
|
|
9
|
+
//! - Measure how well extracted hierarchies match ground truth
|
|
10
|
+
//! - Assert minimum quality thresholds for precision/recall/F1
|
|
11
|
+
//! - Verify correct hierarchy level assignments
|
|
12
|
+
|
|
13
|
+
#![cfg(feature = "pdf")]
|
|
14
|
+
|
|
15
|
+
use kreuzberg::pdf::hierarchy::{
|
|
16
|
+
BoundingBox, HierarchyLevel, KMeansResult, TextBlock, assign_hierarchy_levels,
|
|
17
|
+
assign_hierarchy_levels_from_clusters, cluster_font_sizes,
|
|
18
|
+
};
|
|
19
|
+
use serde::{Deserialize, Serialize};
|
|
20
|
+
use std::fs;
|
|
21
|
+
use std::path::Path;
|
|
22
|
+
|
|
23
|
+
/// A bounding box annotation from ground truth.
|
|
24
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
25
|
+
struct GroundTruthBBox {
|
|
26
|
+
left: f32,
|
|
27
|
+
top: f32,
|
|
28
|
+
right: f32,
|
|
29
|
+
bottom: f32,
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
impl GroundTruthBBox {
|
|
33
|
+
/// Convert to kreuzberg BoundingBox
|
|
34
|
+
fn to_bbox(&self) -> BoundingBox {
|
|
35
|
+
BoundingBox {
|
|
36
|
+
left: self.left,
|
|
37
|
+
top: self.top,
|
|
38
|
+
right: self.right,
|
|
39
|
+
bottom: self.bottom,
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
/// A ground truth text block with hierarchy level annotation.
|
|
45
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
46
|
+
struct GroundTruthBlock {
|
|
47
|
+
text: String,
|
|
48
|
+
level: String,
|
|
49
|
+
bbox: GroundTruthBBox,
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/// A page of ground truth annotations.
|
|
53
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
54
|
+
struct GroundTruthPage {
|
|
55
|
+
page_number: u32,
|
|
56
|
+
blocks: Vec<GroundTruthBlock>,
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
/// A document with ground truth hierarchy annotations.
|
|
60
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
61
|
+
struct GroundTruthDocument {
|
|
62
|
+
pdf_file: String,
|
|
63
|
+
pages: Vec<GroundTruthPage>,
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
/// Root structure for ground truth JSON file.
|
|
67
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
68
|
+
struct GroundTruthFile {
|
|
69
|
+
documents: Vec<GroundTruthDocument>,
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
/// Quality metrics for hierarchy extraction.
|
|
73
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
74
|
+
pub struct QualityMetrics {
|
|
75
|
+
/// Precision: (correctly identified hierarchies) / (total extracted hierarchies)
|
|
76
|
+
pub precision: f64,
|
|
77
|
+
/// Recall: (correctly identified hierarchies) / (total ground truth hierarchies)
|
|
78
|
+
pub recall: f64,
|
|
79
|
+
/// F1 Score: harmonic mean of precision and recall
|
|
80
|
+
pub f1_score: f64,
|
|
81
|
+
/// Level accuracy: percentage of blocks assigned to correct hierarchy level
|
|
82
|
+
pub level_accuracy: f64,
|
|
83
|
+
/// Number of correctly identified hierarchy blocks
|
|
84
|
+
pub true_positives: usize,
|
|
85
|
+
/// Number of incorrectly extracted hierarchy blocks
|
|
86
|
+
pub false_positives: usize,
|
|
87
|
+
/// Number of missed ground truth hierarchy blocks
|
|
88
|
+
pub false_negatives: usize,
|
|
89
|
+
/// Number of blocks with correct hierarchy level
|
|
90
|
+
pub correct_levels: usize,
|
|
91
|
+
/// Total number of blocks evaluated
|
|
92
|
+
pub total_blocks: usize,
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
impl QualityMetrics {
|
|
96
|
+
/// Create new quality metrics from test results.
|
|
97
|
+
fn new(
|
|
98
|
+
true_positives: usize,
|
|
99
|
+
false_positives: usize,
|
|
100
|
+
false_negatives: usize,
|
|
101
|
+
correct_levels: usize,
|
|
102
|
+
total_blocks: usize,
|
|
103
|
+
) -> Self {
|
|
104
|
+
let precision = if true_positives + false_positives > 0 {
|
|
105
|
+
true_positives as f64 / (true_positives + false_positives) as f64
|
|
106
|
+
} else {
|
|
107
|
+
0.0
|
|
108
|
+
};
|
|
109
|
+
|
|
110
|
+
let recall = if true_positives + false_negatives > 0 {
|
|
111
|
+
true_positives as f64 / (true_positives + false_negatives) as f64
|
|
112
|
+
} else {
|
|
113
|
+
0.0
|
|
114
|
+
};
|
|
115
|
+
|
|
116
|
+
let f1_score = if precision + recall > 0.0 {
|
|
117
|
+
2.0 * precision * recall / (precision + recall)
|
|
118
|
+
} else {
|
|
119
|
+
0.0
|
|
120
|
+
};
|
|
121
|
+
|
|
122
|
+
let level_accuracy = if total_blocks > 0 {
|
|
123
|
+
correct_levels as f64 / total_blocks as f64
|
|
124
|
+
} else {
|
|
125
|
+
0.0
|
|
126
|
+
};
|
|
127
|
+
|
|
128
|
+
Self {
|
|
129
|
+
precision,
|
|
130
|
+
recall,
|
|
131
|
+
f1_score,
|
|
132
|
+
level_accuracy,
|
|
133
|
+
true_positives,
|
|
134
|
+
false_positives,
|
|
135
|
+
false_negatives,
|
|
136
|
+
correct_levels,
|
|
137
|
+
total_blocks,
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/// Convert hierarchy level string to HierarchyLevel enum.
|
|
143
|
+
fn parse_level(level: &str) -> HierarchyLevel {
|
|
144
|
+
match level {
|
|
145
|
+
"H1" => HierarchyLevel::H1,
|
|
146
|
+
"H2" => HierarchyLevel::H2,
|
|
147
|
+
"H3" => HierarchyLevel::H3,
|
|
148
|
+
"H4" => HierarchyLevel::H4,
|
|
149
|
+
"H5" => HierarchyLevel::H5,
|
|
150
|
+
"H6" => HierarchyLevel::H6,
|
|
151
|
+
_ => HierarchyLevel::Body,
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
/// Load ground truth annotations from JSON file.
|
|
156
|
+
///
|
|
157
|
+
/// Reads the hierarchy_ground_truth.json file and parses document annotations.
|
|
158
|
+
///
|
|
159
|
+
/// # Arguments
|
|
160
|
+
///
|
|
161
|
+
/// * `path` - Path to the ground truth JSON file
|
|
162
|
+
///
|
|
163
|
+
/// # Returns
|
|
164
|
+
///
|
|
165
|
+
/// Result containing the parsed GroundTruthFile or error message
|
|
166
|
+
fn load_ground_truth<P: AsRef<Path>>(path: P) -> Result<GroundTruthFile, String> {
|
|
167
|
+
let content = fs::read_to_string(path).map_err(|e| format!("Failed to read file: {}", e))?;
|
|
168
|
+
serde_json::from_str(&content).map_err(|e| format!("Failed to parse JSON: {}", e))
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
/// Calculate quality metrics by comparing extracted hierarchies to ground truth.
|
|
172
|
+
///
|
|
173
|
+
/// Compares extracted text blocks with their hierarchy assignments to ground truth annotations.
|
|
174
|
+
/// Measures:
|
|
175
|
+
/// - Precision: correctly identified hierarchies / total extracted
|
|
176
|
+
/// - Recall: correctly identified hierarchies / total ground truth
|
|
177
|
+
/// - F1 Score: harmonic mean of precision and recall
|
|
178
|
+
/// - Level Accuracy: percentage of blocks with correct hierarchy level
|
|
179
|
+
///
|
|
180
|
+
/// # Arguments
|
|
181
|
+
///
|
|
182
|
+
/// * `extracted_blocks` - Vector of extracted HierarchyBlock objects
|
|
183
|
+
/// * `ground_truth_blocks` - Vector of ground truth blocks
|
|
184
|
+
///
|
|
185
|
+
/// # Returns
|
|
186
|
+
///
|
|
187
|
+
/// QualityMetrics struct with calculated precision, recall, F1, and level accuracy
|
|
188
|
+
fn calculate_quality_metrics(
|
|
189
|
+
extracted_blocks: &[kreuzberg::pdf::hierarchy::HierarchyBlock],
|
|
190
|
+
ground_truth_blocks: &[GroundTruthBlock],
|
|
191
|
+
) -> QualityMetrics {
|
|
192
|
+
let mut true_positives = 0;
|
|
193
|
+
let mut false_positives = 0;
|
|
194
|
+
let mut correct_levels = 0;
|
|
195
|
+
|
|
196
|
+
// For matching blocks, we use bounding box overlap and text similarity
|
|
197
|
+
let mut matched_gt_indices: Vec<bool> = vec![false; ground_truth_blocks.len()];
|
|
198
|
+
|
|
199
|
+
for extracted in extracted_blocks {
|
|
200
|
+
let mut best_match_idx: Option<usize> = None;
|
|
201
|
+
let mut best_overlap = 0.0;
|
|
202
|
+
|
|
203
|
+
// Find the best matching ground truth block by bounding box overlap
|
|
204
|
+
for (gt_idx, gt_block) in ground_truth_blocks.iter().enumerate() {
|
|
205
|
+
if matched_gt_indices[gt_idx] {
|
|
206
|
+
continue; // Already matched
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
let gt_bbox = gt_block.bbox.to_bbox();
|
|
210
|
+
let overlap = extracted.bbox.iou(>_bbox);
|
|
211
|
+
|
|
212
|
+
if overlap > best_overlap && overlap > 0.3 {
|
|
213
|
+
best_overlap = overlap;
|
|
214
|
+
best_match_idx = Some(gt_idx);
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
if let Some(gt_idx) = best_match_idx {
|
|
219
|
+
matched_gt_indices[gt_idx] = true;
|
|
220
|
+
true_positives += 1;
|
|
221
|
+
|
|
222
|
+
// Check if the hierarchy level matches
|
|
223
|
+
let gt_level = parse_level(&ground_truth_blocks[gt_idx].level);
|
|
224
|
+
if extracted.hierarchy_level == gt_level {
|
|
225
|
+
correct_levels += 1;
|
|
226
|
+
}
|
|
227
|
+
} else {
|
|
228
|
+
false_positives += 1;
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
// Count unmatched ground truth blocks as false negatives
|
|
233
|
+
let false_negatives = matched_gt_indices.iter().filter(|&&m| !m).count();
|
|
234
|
+
|
|
235
|
+
let total_blocks = extracted_blocks.len().max(ground_truth_blocks.len());
|
|
236
|
+
|
|
237
|
+
QualityMetrics::new(
|
|
238
|
+
true_positives,
|
|
239
|
+
false_positives,
|
|
240
|
+
false_negatives,
|
|
241
|
+
correct_levels,
|
|
242
|
+
total_blocks,
|
|
243
|
+
)
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
/// Create test text blocks from ground truth.
|
|
247
|
+
fn create_text_blocks_from_ground_truth(gt_blocks: &[GroundTruthBlock]) -> Vec<TextBlock> {
|
|
248
|
+
gt_blocks
|
|
249
|
+
.iter()
|
|
250
|
+
.enumerate()
|
|
251
|
+
.map(|(idx, gt_block)| {
|
|
252
|
+
// Estimate font size from bbox height
|
|
253
|
+
let bbox = gt_block.bbox.to_bbox();
|
|
254
|
+
let font_size = match gt_block.level.as_str() {
|
|
255
|
+
"H1" => 28.0,
|
|
256
|
+
"H2" => 24.0,
|
|
257
|
+
"H3" => 20.0,
|
|
258
|
+
"H4" => 16.0,
|
|
259
|
+
"H5" => 14.0,
|
|
260
|
+
"H6" => 12.0,
|
|
261
|
+
_ => 10.0, // Body
|
|
262
|
+
};
|
|
263
|
+
|
|
264
|
+
TextBlock {
|
|
265
|
+
text: if gt_block.text.len() > 50 {
|
|
266
|
+
format!("{} (Block {})", gt_block.text.chars().take(50).collect::<String>(), idx)
|
|
267
|
+
} else {
|
|
268
|
+
gt_block.text.clone()
|
|
269
|
+
},
|
|
270
|
+
bbox,
|
|
271
|
+
font_size,
|
|
272
|
+
}
|
|
273
|
+
})
|
|
274
|
+
.collect()
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
#[test]
fn test_hierarchy_quality_on_ground_truth() {
    /// Arithmetic mean of `count` per-page values.
    fn mean(values: impl Iterator<Item = f64>, count: usize) -> f64 {
        values.sum::<f64>() / count as f64
    }

    // The fixture path is relative to the directory the test binary runs in.
    let ground_truth_file =
        load_ground_truth("tests/data/hierarchy_ground_truth.json").expect("Failed to load ground truth file");

    println!(
        "\nLoaded {} documents from ground truth",
        ground_truth_file.documents.len()
    );

    // One entry per processed page, across all documents.
    let mut per_page: Vec<QualityMetrics> = Vec::new();

    for doc in &ground_truth_file.documents {
        println!("\nProcessing document: {}", doc.pdf_file);

        for page in &doc.pages {
            println!(" Page {}: {} blocks", page.page_number, page.blocks.len());

            // Synthesize input blocks from the annotations themselves.
            let text_blocks = create_text_blocks_from_ground_truth(&page.blocks);

            // Roughly one cluster per three blocks, kept within 1..=6.
            let k = (text_blocks.len() / 3).clamp(1, 6);
            let clusters = cluster_font_sizes(&text_blocks, k).expect("Failed to cluster font sizes");

            println!(
                " Created {} clusters from {} blocks",
                clusters.len(),
                text_blocks.len()
            );

            // Turn the clusters into per-block level assignments, then repackage
            // each (block, level) pair as a HierarchyBlock for scoring.
            let hierarchy_assignments = assign_hierarchy_levels_from_clusters(&text_blocks, &clusters);
            let extracted_blocks: Vec<kreuzberg::pdf::hierarchy::HierarchyBlock> = hierarchy_assignments
                .iter()
                .map(|(block, level)| kreuzberg::pdf::hierarchy::HierarchyBlock {
                    text: block.text.clone(),
                    bbox: block.bbox,
                    font_size: block.font_size,
                    hierarchy_level: *level,
                })
                .collect();

            let metrics = calculate_quality_metrics(&extracted_blocks, &page.blocks);

            println!(" Precision: {:.4}", metrics.precision);
            println!(" Recall: {:.4}", metrics.recall);
            println!(" F1 Score: {:.4}", metrics.f1_score);
            println!(" Level Accuracy: {:.4}", metrics.level_accuracy);

            per_page.push(metrics);
        }
    }

    // Nothing to average when the fixture contains no pages.
    if per_page.is_empty() {
        return;
    }

    let pages = per_page.len();
    let avg_precision = mean(per_page.iter().map(|m| m.precision), pages);
    let avg_recall = mean(per_page.iter().map(|m| m.recall), pages);
    let avg_f1 = mean(per_page.iter().map(|m| m.f1_score), pages);
    let avg_level_acc = mean(per_page.iter().map(|m| m.level_accuracy), pages);

    println!("\n=== AVERAGE METRICS ACROSS ALL PAGES ===");
    println!("Average Precision: {:.4}", avg_precision);
    println!("Average Recall: {:.4}", avg_recall);
    println!("Average F1 Score: {:.4}", avg_f1);
    println!("Average Level Accuracy: {:.4}", avg_level_acc);

    // Quality gate: the page-averaged F1 must exceed 0.85.
    assert!(
        avg_f1 > 0.85,
        "F1 score ({:.4}) must be greater than 0.85. Metrics: precision={:.4}, recall={:.4}, level_accuracy={:.4}",
        avg_f1,
        avg_precision,
        avg_recall,
        avg_level_acc
    );
}
|
|
359
|
+
|
|
360
|
+
#[test]
fn test_hierarchy_clustering_consistency() {
    // Arrange: four stacked blocks whose font sizes (28, 24, 20, 10) are all distinct.
    let blocks = vec![
        TextBlock {
            text: "Title".to_string(),
            bbox: BoundingBox {
                left: 0.0,
                top: 0.0,
                right: 100.0,
                bottom: 28.0,
            },
            font_size: 28.0,
        },
        TextBlock {
            text: "Subtitle".to_string(),
            bbox: BoundingBox {
                left: 0.0,
                top: 30.0,
                right: 100.0,
                bottom: 54.0,
            },
            font_size: 24.0,
        },
        TextBlock {
            text: "Section".to_string(),
            bbox: BoundingBox {
                left: 0.0,
                top: 60.0,
                right: 100.0,
                bottom: 80.0,
            },
            font_size: 20.0,
        },
        TextBlock {
            text: "Body paragraph".to_string(),
            bbox: BoundingBox {
                left: 0.0,
                top: 90.0,
                right: 100.0,
                bottom: 102.0,
            },
            font_size: 10.0,
        },
    ];

    // Act: one cluster per block, then derive the hierarchy levels.
    let clusters = cluster_font_sizes(&blocks, 4).expect("Clustering failed");
    let assignments = assign_hierarchy_levels_from_clusters(&blocks, &clusters);

    // Assert: levels follow descending font size, with the smallest text as Body.
    assert_eq!(assignments.len(), 4);
    assert_eq!(assignments[0].1, HierarchyLevel::H1, "Largest text should be H1");
    assert_eq!(assignments[1].1, HierarchyLevel::H2, "Second largest should be H2");
    assert_eq!(assignments[2].1, HierarchyLevel::H3, "Third largest should be H3");
    assert_eq!(assignments[3].1, HierarchyLevel::Body, "Smallest text should be Body");

    // Score the assignments against annotations that mirror the input blocks.
    let extracted = assignments
        .iter()
        .map(|(b, l)| kreuzberg::pdf::hierarchy::HierarchyBlock {
            text: b.text.clone(),
            bbox: b.bbox,
            font_size: b.font_size,
            hierarchy_level: *l,
        })
        .collect::<Vec<_>>();

    let ground_truth = [
        GroundTruthBlock {
            text: "Title".to_string(),
            level: "H1".to_string(),
            bbox: GroundTruthBBox {
                left: 0.0,
                top: 0.0,
                right: 100.0,
                bottom: 28.0,
            },
        },
        GroundTruthBlock {
            text: "Subtitle".to_string(),
            level: "H2".to_string(),
            bbox: GroundTruthBBox {
                left: 0.0,
                top: 30.0,
                right: 100.0,
                bottom: 54.0,
            },
        },
        GroundTruthBlock {
            text: "Section".to_string(),
            level: "H3".to_string(),
            bbox: GroundTruthBBox {
                left: 0.0,
                top: 60.0,
                right: 100.0,
                bottom: 80.0,
            },
        },
        GroundTruthBlock {
            text: "Body paragraph".to_string(),
            level: "Body".to_string(),
            bbox: GroundTruthBBox {
                left: 0.0,
                top: 90.0,
                right: 100.0,
                bottom: 102.0,
            },
        },
    ];

    let quality_metrics = calculate_quality_metrics(&extracted, &ground_truth);

    // The F1 score must reach at least 0.8 for this simple layout.
    println!("Consistency Test - F1 Score: {:.4}", quality_metrics.f1_score);
    assert!(
        quality_metrics.f1_score >= 0.8,
        "F1 score for simple hierarchy should be >= 0.8"
    );
}
|
|
478
|
+
|
|
479
|
+
#[test]
fn test_hierarchy_level_assignment() {
    // Arrange: three blocks with font sizes 28, 20 and 12, each given its own
    // cluster label (0, 1, 2) in a hand-built KMeans result.
    let main_title = TextBlock {
        text: "Main Title".to_string(),
        bbox: BoundingBox {
            left: 50.0,
            top: 50.0,
            right: 150.0,
            bottom: 100.0,
        },
        font_size: 28.0,
    };
    let section_title = TextBlock {
        text: "Section Title".to_string(),
        bbox: BoundingBox {
            left: 50.0,
            top: 120.0,
            right: 150.0,
            bottom: 160.0,
        },
        font_size: 20.0,
    };
    let body_text = TextBlock {
        text: "Regular body text".to_string(),
        bbox: BoundingBox {
            left: 50.0,
            top: 180.0,
            right: 200.0,
            bottom: 200.0,
        },
        font_size: 12.0,
    };
    let blocks = vec![main_title, section_title, body_text];

    let kmeans_result = KMeansResult { labels: vec![0, 1, 2] };

    // Act
    let result = assign_hierarchy_levels(&blocks, &kmeans_result);

    // Assert: one output per input, with levels H1, H2, H3 in input order.
    let expected = [HierarchyLevel::H1, HierarchyLevel::H2, HierarchyLevel::H3];
    assert_eq!(result.len(), 3);
    for (position, want) in expected.iter().enumerate() {
        assert_eq!(result[position].hierarchy_level, *want);
    }
}
|
|
526
|
+
|
|
527
|
+
#[test]
fn test_quality_metrics_calculation() {
    // Arrange: two extracted blocks whose boxes and levels coincide exactly
    // with the two ground-truth annotations below.
    let extracted_title = kreuzberg::pdf::hierarchy::HierarchyBlock {
        text: "Title".to_string(),
        bbox: BoundingBox {
            left: 0.0,
            top: 0.0,
            right: 100.0,
            bottom: 20.0,
        },
        font_size: 28.0,
        hierarchy_level: HierarchyLevel::H1,
    };
    let extracted_body = kreuzberg::pdf::hierarchy::HierarchyBlock {
        text: "Body".to_string(),
        bbox: BoundingBox {
            left: 0.0,
            top: 30.0,
            right: 100.0,
            bottom: 50.0,
        },
        font_size: 12.0,
        hierarchy_level: HierarchyLevel::Body,
    };
    let extracted = vec![extracted_title, extracted_body];

    let truth_title = GroundTruthBlock {
        text: "Title".to_string(),
        level: "H1".to_string(),
        bbox: GroundTruthBBox {
            left: 0.0,
            top: 0.0,
            right: 100.0,
            bottom: 20.0,
        },
    };
    let truth_body = GroundTruthBlock {
        text: "Body".to_string(),
        level: "Body".to_string(),
        bbox: GroundTruthBBox {
            left: 0.0,
            top: 30.0,
            right: 100.0,
            bottom: 50.0,
        },
    };
    let ground_truth = vec![truth_title, truth_body];

    // Act
    let metrics = calculate_quality_metrics(&extracted, &ground_truth);

    // Assert: both blocks match with the right level and nothing is left over.
    assert_eq!(metrics.true_positives, 2);
    assert_eq!(metrics.false_positives, 0);
    assert_eq!(metrics.false_negatives, 0);
    assert_eq!(metrics.correct_levels, 2);

    // Each ratio metric must be above 0.99.
    for ratio in [metrics.precision, metrics.recall, metrics.f1_score].iter() {
        assert!(*ratio > 0.99);
    }
}
|
|
@@ -1,43 +1,45 @@
|
|
|
1
|
-
//! PDF integration tests that remain specific to the Rust core.
|
|
2
|
-
//!
|
|
3
|
-
//! Positive-path scenarios live in the shared fixtures that back the
|
|
4
|
-
//! multi-language E2E generator. This module keeps only the cases that
|
|
5
|
-
//! exercise Rust-specific failure handling or error propagation.
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
use
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
}
|
|
1
|
+
//! PDF integration tests that remain specific to the Rust core.
|
|
2
|
+
//!
|
|
3
|
+
//! Positive-path scenarios live in the shared fixtures that back the
|
|
4
|
+
//! multi-language E2E generator. This module keeps only the cases that
|
|
5
|
+
//! exercise Rust-specific failure handling or error propagation.
|
|
6
|
+
|
|
7
|
+
#![cfg(feature = "pdf")]
|
|
8
|
+
|
|
9
|
+
mod helpers;
|
|
10
|
+
|
|
11
|
+
use helpers::*;
|
|
12
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
13
|
+
use kreuzberg::extract_file_sync;
|
|
14
|
+
|
|
15
|
+
#[test]
fn test_pdf_password_protected_fails_gracefully() {
    // Skip when the fixture is not available.
    if skip_if_missing("pdfs/copy_protected.pdf") {
        return;
    }

    let file_path = get_test_file_path("pdfs/copy_protected.pdf");
    let outcome = extract_file_sync(&file_path, None, &ExtractionConfig::default());

    // Either outcome is acceptable. A failure must explain itself in terms of
    // password protection or encryption.
    let extraction_result = match outcome {
        Ok(value) => value,
        Err(e) => {
            let error_msg = e.to_string().to_lowercase();
            let mentions_protection = ["password", "protected", "encrypted"]
                .iter()
                .any(|needle| error_msg.contains(*needle));
            assert!(
                mentions_protection,
                "Error message should indicate password/protection issue, got: {}",
                e
            );
            return;
        }
    };

    // On success the result must be a PDF, with the optional features
    // (chunking, language detection) absent under the default config.
    assert_mime_type(&extraction_result, "application/pdf");
    assert!(
        extraction_result.chunks.is_none(),
        "Chunks should be None without chunking config"
    );
    assert!(
        extraction_result.detected_languages.is_none(),
        "Language detection not enabled"
    );
}
|