kreuzberg 4.0.0.rc2 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +391 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -0,0 +1,475 @@
|
|
|
1
|
+
//! Tests for PDF text extraction with font information and bounding box operations.
|
|
2
|
+
//!
|
|
3
|
+
//! This module tests character extraction functionality following TDD principles,
|
|
4
|
+
//! verifying that characters are extracted with positions and font sizes,
|
|
5
|
+
//! and that bounding box operations work correctly.
|
|
6
|
+
|
|
7
|
+
#![cfg(feature = "pdf")]
|
|
8
|
+
|
|
9
|
+
mod helpers;
|
|
10
|
+
|
|
11
|
+
use helpers::get_test_file_path;
|
|
12
|
+
use kreuzberg::pdf::hierarchy::{BoundingBox, extract_chars_with_fonts};
|
|
13
|
+
use pdfium_render::prelude::*;
|
|
14
|
+
|
|
15
|
+
// ============================================================================
|
|
16
|
+
// Character Extraction Tests (Following TDD)
|
|
17
|
+
// ============================================================================
|
|
18
|
+
|
|
19
|
+
/// Smoke test for per-character extraction on the first page of the sample PDF.
///
/// Checks that:
/// - at least one character is returned
/// - every character carries non-empty text
/// - every character reports a positive font size
/// - every character sits at non-negative x/y coordinates
#[test]
fn test_extract_chars_basic() {
    let fixture = get_test_file_path("pdfs_with_tables/tiny.pdf");

    // Open the fixture; the binding must outlive the document that borrows from it.
    let pdfium = Pdfium;
    let document = pdfium
        .load_pdf_from_file(fixture.to_str().unwrap(), None)
        .expect("Failed to load test PDF");
    let first_page = document.pages().get(0).expect("Failed to get first page");

    // Run the extraction under test.
    let extracted = extract_chars_with_fonts(&first_page).expect("Failed to extract characters with fonts");
    assert!(!extracted.is_empty(), "Should extract at least one character from test PDF");

    // Each extracted character must have all of its basic fields populated sensibly.
    for glyph in extracted.iter() {
        assert!(!glyph.text.is_empty(), "Character text should not be empty");
        assert!(glyph.font_size > 0.0, "Font size should be positive");
        assert!(glyph.x >= 0.0, "X position should be non-negative");
        assert!(glyph.y >= 0.0, "Y position should be non-negative");
    }
}
|
|
52
|
+
|
|
53
|
+
/// Checks that extracted characters run left-to-right within a line.
///
/// Consecutive characters whose y-coordinates differ by at most
/// `Y_LINE_THRESHOLD` are treated as sharing a line. Within such a line, a
/// character may not sit more than one unit to the left of its predecessor.
#[test]
fn test_extract_chars_preserves_order() {
    /// Maximum y difference (in page units) for two characters to count as the same line.
    const Y_LINE_THRESHOLD: f32 = 5.0;

    let fixture = get_test_file_path("pdfs_with_tables/tiny.pdf");

    // Open the fixture; the binding must outlive the document that borrows from it.
    let pdfium = Pdfium;
    let document = pdfium
        .load_pdf_from_file(fixture.to_str().unwrap(), None)
        .expect("Failed to load test PDF");
    let first_page = document.pages().get(0).expect("Failed to get first page");

    let extracted = extract_chars_with_fonts(&first_page).expect("Failed to extract characters with fonts");
    assert!(!extracted.is_empty(), "Should extract at least one character");

    // Position of the previously visited character; -inf means "nothing seen on this line yet".
    let mut prev_x = f32::NEG_INFINITY;
    let mut prev_y = f32::NEG_INFINITY;

    for glyph in extracted.iter() {
        // A large vertical jump starts a new line, so forget the previous x position.
        if (glyph.y - prev_y).abs() > Y_LINE_THRESHOLD {
            prev_x = f32::NEG_INFINITY;
            prev_y = glyph.y;
        }

        // The 1.0 slack tolerates small measurement imprecision between neighbours.
        let on_same_line = (glyph.y - prev_y).abs() <= Y_LINE_THRESHOLD;
        let moved_backwards = glyph.x < prev_x - 1.0;
        let has_predecessor = prev_x != f32::NEG_INFINITY;

        assert!(
            !(on_same_line && moved_backwards && has_predecessor),
            "Characters should be left-to-right on same line: {} < {} at y={}",
            glyph.x,
            prev_x,
            glyph.y
        );

        prev_x = glyph.x;
        prev_y = glyph.y;
    }
}
|
|
104
|
+
|
|
105
|
+
// ============================================================================
|
|
106
|
+
// Bounding Box Tests
|
|
107
|
+
// ============================================================================
|
|
108
|
+
|
|
109
|
+
/// Helper function to create a BoundingBox from x, y, width, height
|
|
110
|
+
fn create_bbox(x: f32, y: f32, width: f32, height: f32) -> BoundingBox {
|
|
111
|
+
BoundingBox {
|
|
112
|
+
left: x,
|
|
113
|
+
top: y,
|
|
114
|
+
right: x + width,
|
|
115
|
+
bottom: y + height,
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
/// IOU of two 10x10 boxes where the second is shifted by (5, 5) relative to the first.
#[test]
fn test_iou_calculation() {
    let first = create_bbox(0.0, 0.0, 10.0, 10.0);
    let second = create_bbox(5.0, 5.0, 10.0, 10.0);

    // Overlap region: 5x5 = 25.
    // Union: 100 + 100 - 25 = 175.
    // IOU: 25/175 ≈ 0.1429.
    let iou = first.iou(&second);
    let error = (iou - 0.1429).abs();

    assert!(error < 0.001, "IOU calculation failed");
}
|
|
131
|
+
|
|
132
|
+
/// Exercises `weighted_distance` between 10x10 boxes at several offsets.
///
/// The expected values correspond to an X offset weighted by 5.0 and a
/// Y offset weighted by 1.0.
#[test]
fn test_weighted_distance_calculation() {
    // Offset of X=20, Y=5: 20*5.0 + 5*1.0 = 100 + 5 = 105.
    let combined = create_bbox(0.0, 0.0, 10.0, 10.0).weighted_distance(&create_bbox(20.0, 5.0, 10.0, 10.0));
    assert!(
        (combined - 105.0).abs() < 0.001,
        "Weighted distance calculation failed"
    );

    // Pure horizontal offset of 10: 10*5.0 = 50.
    let horizontal = create_bbox(0.0, 0.0, 10.0, 10.0).weighted_distance(&create_bbox(10.0, 0.0, 10.0, 10.0));

    // Pure vertical offset of 10: 10*1.0 = 10.
    let vertical = create_bbox(0.0, 0.0, 10.0, 10.0).weighted_distance(&create_bbox(0.0, 10.0, 10.0, 10.0));

    // The same offset must cost more along X than along Y.
    assert!(horizontal > vertical, "X weight should be greater than Y weight");
    assert!((horizontal - 50.0).abs() < 0.001, "X-only weighted distance failed");
    assert!((vertical - 10.0).abs() < 0.001, "Y-only weighted distance failed");
}
|
|
162
|
+
|
|
163
|
+
/// Checks `intersection_ratio` for two 10x10 boxes offset by (5, 5).
#[test]
fn test_intersection_ratio() {
    let subject = create_bbox(0.0, 0.0, 10.0, 10.0);
    let neighbour = create_bbox(5.0, 5.0, 10.0, 10.0);

    // Overlap is 5 * 5 = 25 and the first box's area is 100,
    // so the expected ratio is 25 / 100 = 0.25.
    let deviation = (subject.intersection_ratio(&neighbour) - 0.25).abs();
    assert!(deviation < 0.001, "Intersection ratio calculation failed");
}
|
|
175
|
+
|
|
176
|
+
/// Disjoint boxes are expected to yield zero for both `iou` and
/// `intersection_ratio`.
#[test]
fn test_edge_case_no_overlap() {
    let near = create_bbox(0.0, 0.0, 10.0, 10.0);
    let far = create_bbox(20.0, 20.0, 10.0, 10.0);

    // No shared area, so the IOU must be (approximately) zero.
    let iou_error = (near.iou(&far) - 0.0).abs();
    assert!(iou_error < 0.001, "Non-overlapping boxes should have IOU of 0");

    // The same holds for the intersection ratio.
    let ratio_error = (near.intersection_ratio(&far) - 0.0).abs();
    assert!(
        ratio_error < 0.001,
        "Non-overlapping boxes should have intersection ratio of 0"
    );
}
|
|
193
|
+
|
|
194
|
+
/// Checks `iou` and `intersection_ratio` when a 10x10 box lies entirely
/// inside a 20x20 box.
#[test]
fn test_edge_case_fully_contained() {
    let outer = create_bbox(0.0, 0.0, 20.0, 20.0);
    let inner = create_bbox(5.0, 5.0, 10.0, 10.0);

    // Overlap equals the inner box: 10 * 10 = 100.
    // Union is 400 + 100 - 100 = 400, so the IOU is 100 / 400 = 0.25.
    let iou_error = (outer.iou(&inner) - 0.25).abs();
    assert!(iou_error < 0.001, "Fully contained box IOU calculation failed");

    // Expected intersection ratio: 100 / 400 = 0.25.
    let ratio_error = (outer.intersection_ratio(&inner) - 0.25).abs();
    assert!(
        ratio_error < 0.001,
        "Fully contained box intersection ratio failed"
    );
}
|
|
213
|
+
|
|
214
|
+
// ============================================================================
|
|
215
|
+
// Character Merging Tests (Following TDD)
|
|
216
|
+
// ============================================================================
|
|
217
|
+
|
|
218
|
+
use kreuzberg::pdf::hierarchy::{CharData, merge_chars_into_blocks};
|
|
219
|
+
|
|
220
|
+
/// Factory helper to create a CharData with minimal parameters.
|
|
221
|
+
fn create_char(text: &str, x: f32, y: f32, font_size: f32) -> CharData {
|
|
222
|
+
CharData {
|
|
223
|
+
text: text.to_string(),
|
|
224
|
+
font_size,
|
|
225
|
+
x,
|
|
226
|
+
y,
|
|
227
|
+
width: font_size * 0.6,
|
|
228
|
+
height: font_size,
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
/// Horizontal merging: glyphs at (0,10), (10,10) and (20,10) are expected to
/// end up in a single block.
///
/// All three glyphs share the same `y` and font size and are spaced 10 units
/// apart in `x`; the merged block's text must read "Hey".
#[test]
fn test_merge_horizontal_text_merging() {
    let chars: Vec<CharData> = [("H", 0.0), ("e", 10.0), ("y", 20.0)]
        .iter()
        .map(|&(glyph, x)| create_char(glyph, x, 10.0, 12.0))
        .collect();

    let merged = merge_chars_into_blocks(chars);

    assert_eq!(merged.len(), 1, "Expected 1 block for horizontal text");
    assert_eq!(merged[0].text, "Hey", "Expected merged text 'Hey'");
}
|
|
249
|
+
|
|
250
|
+
/// Vertical separation: glyphs at (0,10) and (0,50) are expected to stay in
/// two distinct blocks.
///
/// The glyphs share the same `x` but are 40 units apart in `y`.
#[test]
fn test_merge_vertical_text_separation() {
    let upper = create_char("A", 0.0, 10.0, 12.0);
    let lower = create_char("B", 0.0, 50.0, 12.0);

    let separated = merge_chars_into_blocks(vec![upper, lower]);

    assert_eq!(separated.len(), 2, "Expected 2 blocks for vertically separated text");
    assert_eq!(separated[0].text, "A", "Expected first block to contain 'A'");
    assert_eq!(separated[1].text, "B", "Expected second block to contain 'B'");
}
|
|
263
|
+
|
|
264
|
+
/// Edge case: glyphs placed at negative coordinates.
///
/// Three glyphs on the line `y = -5`, spaced 10 units apart starting at
/// `x = -10`, are expected to merge into one block reading "XYZ".
#[test]
fn test_merge_edge_case_negative_coordinates() {
    let mut chars = Vec::new();
    for &(glyph, x) in [("X", -10.0), ("Y", 0.0), ("Z", 10.0)].iter() {
        chars.push(create_char(glyph, x, -5.0, 12.0));
    }

    let merged = merge_chars_into_blocks(chars);

    assert_eq!(merged.len(), 1, "Expected 1 block for negative coordinates");
    assert_eq!(merged[0].text, "XYZ", "Expected merged text 'XYZ'");
}
|
|
280
|
+
|
|
281
|
+
/// Edge case: glyphs whose boxes overlap or nearly coincide.
///
/// Three glyphs on the line `y = 0`, only 1 unit apart in `x`, are expected
/// to merge into one block reading "OVE".
#[test]
fn test_merge_edge_case_overlapping_blocks() {
    let first = create_char("O", 0.0, 0.0, 12.0);
    let second = create_char("V", 1.0, 0.0, 12.0);
    let third = create_char("E", 2.0, 0.0, 12.0);

    let merged = merge_chars_into_blocks(vec![first, second, third]);

    assert_eq!(merged.len(), 1, "Expected 1 block for overlapping characters");
    assert_eq!(merged[0].text, "OVE", "Expected merged text 'OVE'");
}
|
|
297
|
+
|
|
298
|
+
/// Maximum merge distance: widely spaced glyphs must not be merged.
///
/// Glyphs on the same line but 50 units apart are expected to land in more
/// than one block, and no glyph may be lost along the way.
#[test]
fn test_merge_max_merge_distance_threshold() {
    // Each glyph sits 50 units to the right of the previous one.
    let chars: Vec<CharData> = [("T", 0.0), ("e", 50.0), ("s", 100.0)]
        .iter()
        .map(|&(glyph, x)| create_char(glyph, x, 10.0, 12.0))
        .collect();

    let blocks = merge_chars_into_blocks(chars);

    // With a reasonable merge distance (around 2.5x the font size), gaps of
    // this size should split the glyphs into separate blocks.
    assert!(blocks.len() > 1, "Expected multiple blocks due to large gaps");

    let preserved: usize = blocks.iter().map(|block| block.text.len()).sum();
    assert_eq!(preserved, 3, "Expected all 3 characters to be preserved");
}
|
|
320
|
+
|
|
321
|
+
/// Edge case: glyphs whose font size, width and height are all zero.
///
/// The merge must complete without panicking (for example from a division
/// by zero) and must still return at least one block.
#[test]
fn test_merge_zero_font_size() {
    // Degenerate glyph on the line y = 10 with every size field set to zero.
    let degenerate = |text: &str, x: f32| CharData {
        text: text.to_string(),
        x,
        y: 10.0,
        font_size: 0.0,
        width: 0.0,
        height: 0.0,
    };
    let chars = vec![degenerate("A", 0.0), degenerate("B", 1.0)];

    // Reaching the assertion at all shows the call did not panic.
    let blocks = merge_chars_into_blocks(chars);

    assert!(!blocks.is_empty(), "Should produce blocks even with zero font size");
}
|
|
353
|
+
|
|
354
|
+
/// Degenerate input: `iou` and `intersection_ratio` on zero-area boxes.
///
/// Both calls must return without panicking and yield a value inside
/// `0.0..=1.0`.
#[test]
fn test_iou_zero_area_boxes() {
    // Zero width and height make both boxes collapse to a single point.
    let point_a = create_bbox(0.0, 0.0, 0.0, 0.0);
    let point_b = create_bbox(5.0, 5.0, 0.0, 0.0);

    let overlap = point_a.iou(&point_b);
    assert!(
        (0.0..=1.0).contains(&overlap),
        "IOU should be in valid range for zero-area boxes"
    );

    let share = point_a.intersection_ratio(&point_b);
    assert!(
        (0.0..=1.0).contains(&share),
        "Intersection ratio should be in valid range"
    );
}
|
|
378
|
+
|
|
379
|
+
/// Identical boxes: both `iou` and `intersection_ratio` are expected to be
/// 1.0 (perfect overlap).
#[test]
fn test_iou_identical_boxes() {
    let original = create_bbox(10.0, 20.0, 30.0, 40.0);
    let duplicate = create_bbox(10.0, 20.0, 30.0, 40.0);

    let iou = original.iou(&duplicate);
    let iou_error = (iou - 1.0).abs();
    assert!(
        iou_error < 0.001,
        "Identical boxes should have IOU of 1.0, got {}",
        iou
    );

    let ratio = original.intersection_ratio(&duplicate);
    let ratio_error = (ratio - 1.0).abs();
    assert!(
        ratio_error < 0.001,
        "Identical boxes should have intersection ratio of 1.0, got {}",
        ratio
    );
}
|
|
404
|
+
|
|
405
|
+
/// Exercises `contains()` with a nested box, the reversed pair, and a
/// disjoint box.
#[test]
fn test_contains_method() {
    let outer = create_bbox(0.0, 0.0, 100.0, 100.0);
    let inner = create_bbox(10.0, 10.0, 50.0, 50.0);
    let elsewhere = create_bbox(110.0, 110.0, 150.0, 150.0);

    // The inner box lies entirely within the outer one.
    assert!(outer.contains(&inner), "Large box should contain small box");

    // Containment is not symmetric.
    assert!(
        !inner.contains(&outer),
        "Small box should not contain large box"
    );

    // A box starting past the outer box's far corner is not contained.
    assert!(
        !outer.contains(&elsewhere),
        "Large box should not contain outside box"
    );
}
|
|
427
|
+
|
|
428
|
+
/// Exercises `center()` on a box anchored at the origin and on an offset box.
#[test]
fn test_center_method() {
    // Box spanning 0..100 on both axes.
    let midpoint = create_bbox(0.0, 0.0, 100.0, 100.0).center();

    assert_eq!(midpoint.0, 50.0, "Center X should be 50.0");
    assert_eq!(midpoint.1, 50.0, "Center Y should be 50.0");

    // Box spanning 20..100 horizontally and 30..100 vertically.
    let offset_midpoint = create_bbox(20.0, 30.0, 80.0, 70.0).center();

    assert_eq!(offset_midpoint.0, 60.0, "Offset center X should be 60.0");
    assert_eq!(offset_midpoint.1, 65.0, "Offset center Y should be 65.0");
}
|
|
444
|
+
|
|
445
|
+
/// Exercises `merge()` on two partially overlapping boxes.
///
/// The first box spans 0..50 and the second 30..130 on both axes, so the
/// merged box is expected to span 0..130.
#[test]
fn test_merge_method() {
    let lower = create_bbox(0.0, 0.0, 50.0, 50.0);
    let upper = create_bbox(30.0, 30.0, 100.0, 100.0);

    let combined = lower.merge(&upper);

    assert_eq!(combined.left, 0.0, "Merged left should be 0.0");
    assert_eq!(combined.top, 0.0, "Merged top should be 0.0");
    assert_eq!(combined.right, 130.0, "Merged right should be 130.0");
    assert_eq!(combined.bottom, 130.0, "Merged bottom should be 130.0");
}
|
|
458
|
+
|
|
459
|
+
/// Exercises `relaxed_iou()` with an expansion factor of 0.5 on two boxes
/// that do not overlap.
#[test]
fn test_relaxed_iou_method() {
    let near = create_bbox(0.0, 0.0, 10.0, 10.0);
    let far = create_bbox(15.0, 15.0, 25.0, 25.0);

    // The plain IOU of disjoint boxes should be (close to) zero.
    let strict = near.iou(&far);
    assert!(strict < 0.01, "Non-overlapping boxes should have near-zero IOU");

    // The relaxed variant is expected to score strictly higher.
    let relaxed = near.relaxed_iou(&far, 0.5);
    assert!(
        relaxed > strict,
        "Relaxed IOU should be greater than normal IOU"
    );
}
|