kreuzberg 4.0.0.rc2 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +543 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +194 -6
- data/README.md +396 -426
- data/Rakefile +34 -25
- data/Steepfile +51 -47
- data/examples/async_patterns.rb +283 -341
- data/ext/kreuzberg_rb/extconf.rb +65 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
- data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
- data/extconf.rb +60 -28
- data/kreuzberg.gemspec +199 -148
- data/lib/kreuzberg/api_proxy.rb +126 -142
- data/lib/kreuzberg/cache_api.rb +67 -46
- data/lib/kreuzberg/cli.rb +47 -55
- data/lib/kreuzberg/cli_proxy.rb +117 -127
- data/lib/kreuzberg/config.rb +936 -691
- data/lib/kreuzberg/error_context.rb +136 -32
- data/lib/kreuzberg/errors.rb +116 -118
- data/lib/kreuzberg/extraction_api.rb +313 -85
- data/lib/kreuzberg/mcp_proxy.rb +177 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
- data/lib/kreuzberg/post_processor_protocol.rb +15 -86
- data/lib/kreuzberg/result.rb +334 -216
- data/lib/kreuzberg/setup_lib_path.rb +99 -80
- data/lib/kreuzberg/types.rb +170 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +96 -103
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +561 -520
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +595 -0
- data/spec/binding/batch_spec.rb +359 -0
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -345
- data/spec/binding/config_validation_spec.rb +377 -283
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -213
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +738 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1228 -0
- data/spec/binding/pages_extraction_spec.rb +471 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +273 -274
- data/spec/binding/tables_spec.rb +641 -0
- data/spec/fixtures/config.toml +38 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +3 -4
- data/spec/smoke/package_spec.rb +177 -178
- data/spec/spec_helper.rb +40 -42
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +438 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- data/vendor/Cargo.toml +61 -0
- data/vendor/kreuzberg/Cargo.toml +259 -204
- data/vendor/kreuzberg/README.md +263 -175
- data/vendor/kreuzberg/build.rs +782 -474
- data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +320 -199
- data/vendor/kreuzberg/src/api/mod.rs +94 -79
- data/vendor/kreuzberg/src/api/server.rs +518 -353
- data/vendor/kreuzberg/src/api/types.rs +206 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
- data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
- data/vendor/kreuzberg/src/core/config.rs +1914 -1032
- data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
- data/vendor/kreuzberg/src/core/formats.rs +235 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +61 -45
- data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
- data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
- data/vendor/kreuzberg/src/embeddings.rs +471 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
- data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
- data/vendor/kreuzberg/src/extraction/email.rs +855 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
- data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
- data/vendor/kreuzberg/src/extraction/image.rs +492 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
- data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
- data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
- data/vendor/kreuzberg/src/extraction/table.rs +329 -328
- data/vendor/kreuzberg/src/extraction/text.rs +277 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
- data/vendor/kreuzberg/src/extractors/email.rs +157 -143
- data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
- data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
- data/vendor/kreuzberg/src/extractors/html.rs +419 -393
- data/vendor/kreuzberg/src/extractors/image.rs +219 -198
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
- data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
- data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
- data/vendor/kreuzberg/src/extractors/text.rs +265 -260
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
- data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
- data/vendor/kreuzberg/src/lib.rs +114 -105
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
- data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
- data/vendor/kreuzberg/src/pdf/error.rs +214 -122
- data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +417 -393
- data/vendor/kreuzberg/src/pdf/text.rs +553 -158
- data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +27 -19
- data/vendor/kreuzberg/src/text/quality.rs +710 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
- data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
- data/vendor/kreuzberg/src/types.rs +1713 -903
- data/vendor/kreuzberg/src/utils/mod.rs +31 -17
- data/vendor/kreuzberg/src/utils/pool.rs +503 -0
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +968 -959
- data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_embed.rs +360 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
- data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
- data/vendor/kreuzberg/tests/config_features.rs +612 -598
- data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
- data/vendor/kreuzberg/tests/core_integration.rs +519 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +165 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/page_markers.rs +297 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
- data/vendor/kreuzberg/tests/security_validation.rs +416 -415
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
- data/vendor/kreuzberg-ffi/build.rs +168 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
- data/vendor/kreuzberg-ffi/src/error.rs +901 -0
- data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
- data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
- data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
- data/vendor/kreuzberg-ffi/src/result.rs +510 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
- data/vendor/kreuzberg-ffi/src/types.rs +363 -0
- data/vendor/kreuzberg-ffi/src/util.rs +210 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
- data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1127 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +196 -45
- data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
- data/vendor/rb-sys/Cargo.lock +0 -393
- data/vendor/rb-sys/Cargo.toml +0 -70
- data/vendor/rb-sys/Cargo.toml.orig +0 -57
- data/vendor/rb-sys/LICENSE-APACHE +0 -190
- data/vendor/rb-sys/bin/release.sh +0 -21
- data/vendor/rb-sys/build/features.rs +0 -108
- data/vendor/rb-sys/build/main.rs +0 -246
- data/vendor/rb-sys/build/stable_api_config.rs +0 -153
- data/vendor/rb-sys/build/version.rs +0 -48
- data/vendor/rb-sys/readme.md +0 -36
- data/vendor/rb-sys/src/bindings.rs +0 -21
- data/vendor/rb-sys/src/hidden.rs +0 -11
- data/vendor/rb-sys/src/lib.rs +0 -34
- data/vendor/rb-sys/src/macros.rs +0 -371
- data/vendor/rb-sys/src/memory.rs +0 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
- data/vendor/rb-sys/src/special_consts.rs +0 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
- data/vendor/rb-sys/src/stable_api.rs +0 -261
- data/vendor/rb-sys/src/symbol.rs +0 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
- data/vendor/rb-sys/src/utils.rs +0 -89
- data/vendor/rb-sys/src/value_type.rs +0 -7
|
@@ -1,553 +1,1830 @@
|
|
|
1
|
-
//! HTML to Markdown conversion functions.
|
|
2
|
-
//!
|
|
3
|
-
//! This module provides HTML to Markdown conversion using the `html-to-markdown-rs` library.
|
|
4
|
-
//! It supports inline image extraction and YAML frontmatter parsing for HTML metadata.
|
|
5
|
-
//!
|
|
6
|
-
//! # Features
|
|
7
|
-
//!
|
|
8
|
-
//! - **HTML to Markdown conversion**: Clean, readable Markdown output
|
|
9
|
-
//! - **Inline image extraction**: Extract base64 and data URI images
|
|
10
|
-
//! - **YAML frontmatter**: Parse YAML metadata from Markdown output
|
|
11
|
-
//! - **Customizable conversion**: Full access to `html-to-markdown-rs` options
|
|
12
|
-
//!
|
|
13
|
-
//! # Example
|
|
14
|
-
//!
|
|
15
|
-
//! ```rust
|
|
16
|
-
//! use kreuzberg::extraction::html::convert_html_to_markdown;
|
|
17
|
-
//!
|
|
18
|
-
//! # fn example() -> kreuzberg::Result<()> {
|
|
19
|
-
//! let html = r#"<h1>Title</h1><p>This is <strong>bold</strong> text.</p>"#;
|
|
20
|
-
//! let markdown = convert_html_to_markdown(html, None)?;
|
|
21
|
-
//!
|
|
22
|
-
//! assert!(markdown.contains("# Title"));
|
|
23
|
-
//! assert!(markdown.contains("**bold**"));
|
|
24
|
-
//! # Ok(())
|
|
25
|
-
//! # }
|
|
26
|
-
//! ```
|
|
27
|
-
use crate::error::{KreuzbergError, Result};
|
|
28
|
-
use crate::types::HtmlMetadata;
|
|
29
|
-
use html_to_markdown_rs::{
|
|
30
|
-
ConversionOptions, HtmlExtraction, InlineImage, InlineImageConfig as LibInlineImageConfig, InlineImageFormat,
|
|
31
|
-
convert as convert_html, convert_with_inline_images,
|
|
32
|
-
};
|
|
33
|
-
use serde::{Deserialize, Serialize};
|
|
34
|
-
use std::
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
pub
|
|
57
|
-
pub
|
|
58
|
-
pub
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
candidate =
|
|
93
|
-
|
|
94
|
-
if candidate.
|
|
95
|
-
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
}
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
fn
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
)
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
)
|
|
153
|
-
|
|
154
|
-
}
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
(
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
}
|
|
184
|
-
}
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
///
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
}
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
(
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
(
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
}
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
||
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
}
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
assert!(result.contains("
|
|
397
|
-
}
|
|
398
|
-
|
|
399
|
-
#[test]
|
|
400
|
-
fn
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
let
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
let
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
<
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
<
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
1
|
+
//! HTML to Markdown conversion functions.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides HTML to Markdown conversion using the `html-to-markdown-rs` library.
|
|
4
|
+
//! It supports inline image extraction and YAML frontmatter parsing for HTML metadata.
|
|
5
|
+
//!
|
|
6
|
+
//! # Features
|
|
7
|
+
//!
|
|
8
|
+
//! - **HTML to Markdown conversion**: Clean, readable Markdown output
|
|
9
|
+
//! - **Inline image extraction**: Extract base64 and data URI images
|
|
10
|
+
//! - **YAML frontmatter**: Parse YAML metadata from Markdown output
|
|
11
|
+
//! - **Customizable conversion**: Full access to `html-to-markdown-rs` options
|
|
12
|
+
//!
|
|
13
|
+
//! # Example
|
|
14
|
+
//!
|
|
15
|
+
//! ```rust
|
|
16
|
+
//! use kreuzberg::extraction::html::convert_html_to_markdown;
|
|
17
|
+
//!
|
|
18
|
+
//! # fn example() -> kreuzberg::Result<()> {
|
|
19
|
+
//! let html = r#"<h1>Title</h1><p>This is <strong>bold</strong> text.</p>"#;
|
|
20
|
+
//! let markdown = convert_html_to_markdown(html, None)?;
|
|
21
|
+
//!
|
|
22
|
+
//! assert!(markdown.contains("# Title"));
|
|
23
|
+
//! assert!(markdown.contains("**bold**"));
|
|
24
|
+
//! # Ok(())
|
|
25
|
+
//! # }
|
|
26
|
+
//! ```
|
|
27
|
+
use crate::error::{KreuzbergError, Result};
|
|
28
|
+
use crate::types::HtmlMetadata;
|
|
29
|
+
use html_to_markdown_rs::{
|
|
30
|
+
ConversionOptions, HtmlExtraction, InlineImage, InlineImageConfig as LibInlineImageConfig, InlineImageFormat,
|
|
31
|
+
MetadataConfig, convert as convert_html, convert_with_inline_images, convert_with_metadata,
|
|
32
|
+
};
|
|
33
|
+
use serde::{Deserialize, Serialize};
|
|
34
|
+
use std::collections::HashMap;
|
|
35
|
+
|
|
36
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
37
|
+
use std::{any::Any, thread};
|
|
38
|
+
|
|
39
|
+
pub use html_to_markdown_rs::{
|
|
40
|
+
CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, PreprocessingOptions,
|
|
41
|
+
PreprocessingPreset, WhitespaceMode,
|
|
42
|
+
};
|
|
43
|
+
|
|
44
|
+
#[cfg(target_arch = "wasm32")]
|
|
45
|
+
/// Largest HTML input, in bytes (2 * 1024 * 1024), accepted on `wasm32` builds.
/// The public conversion functions in this module reject longer inputs with a
/// validation error before attempting any conversion.
const MAX_HTML_SIZE_BYTES: usize = 2 * 1024 * 1024;
|
|
46
|
+
|
|
47
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
48
|
+
/// Input length, in bytes (512 * 1024), at or above which a conversion is
/// considered to need a larger stack (see `html_requires_large_stack`).
const LARGE_HTML_STACK_THRESHOLD_BYTES: usize = 512 * 1024;
|
|
49
|
+
|
|
50
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
51
|
+
/// Stack size, in bytes (16 * 1024 * 1024), requested for the dedicated
/// conversion thread spawned by `run_on_dedicated_stack`.
const HTML_CONVERSION_STACK_SIZE_BYTES: usize = 16 * 1024 * 1024;
|
|
52
|
+
|
|
53
|
+
/// Result of HTML extraction with optional images and warnings.
|
|
54
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
55
|
+
pub struct HtmlExtractionResult {
|
|
56
|
+
/// Markdown text produced by the conversion.
pub markdown: String,
|
|
57
|
+
/// Inline images collected during conversion; empty when image extraction was not requested.
pub images: Vec<ExtractedInlineImage>,
|
|
58
|
+
/// Warning messages reported by the image-extracting conversion; empty when image extraction was not requested.
pub warnings: Vec<String>,
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
/// Extracted inline image with metadata.
|
|
62
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
63
|
+
pub struct ExtractedInlineImage {
|
|
64
|
+
/// Image bytes, taken unchanged from the library's `InlineImage::data`.
pub data: Vec<u8>,
|
|
65
|
+
/// Short lowercase format label (e.g. "png", "jpeg", "svg"); "bin" when no label could be derived.
pub format: String,
|
|
66
|
+
/// Filename reported by the library for this image, if any.
pub filename: Option<String>,
|
|
67
|
+
/// Description reported by the library for this image, if any.
pub description: Option<String>,
|
|
68
|
+
/// Image dimensions reported by the library, if any (presumably `(width, height)` — TODO confirm against `html-to-markdown-rs`).
pub dimensions: Option<(u32, u32)>,
|
|
69
|
+
/// Attribute name/value pairs collected from the library's `InlineImage::attributes`.
pub attributes: HashMap<String, String>,
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
fn inline_image_format_to_str(format: &InlineImageFormat) -> String {
|
|
73
|
+
match format {
|
|
74
|
+
InlineImageFormat::Png => "png".to_string(),
|
|
75
|
+
InlineImageFormat::Jpeg => "jpeg".to_string(),
|
|
76
|
+
InlineImageFormat::Gif => "gif".to_string(),
|
|
77
|
+
InlineImageFormat::Bmp => "bmp".to_string(),
|
|
78
|
+
InlineImageFormat::Webp => "webp".to_string(),
|
|
79
|
+
InlineImageFormat::Svg => "svg".to_string(),
|
|
80
|
+
InlineImageFormat::Other(custom) => {
|
|
81
|
+
let trimmed = custom.trim();
|
|
82
|
+
if trimmed.is_empty() {
|
|
83
|
+
return "bin".to_string();
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
let lower = trimmed.to_ascii_lowercase();
|
|
87
|
+
if lower.starts_with("svg") {
|
|
88
|
+
return "svg".to_string();
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
let mut result = String::with_capacity(10);
|
|
92
|
+
let mut candidate = lower.as_str();
|
|
93
|
+
|
|
94
|
+
if let Some(idx) = candidate.find(['+', ';']) {
|
|
95
|
+
candidate = &candidate[..idx];
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
if let Some(idx) = candidate.rfind('.') {
|
|
99
|
+
candidate = &candidate[idx + 1..];
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
candidate = candidate.trim_start_matches("x-");
|
|
103
|
+
|
|
104
|
+
if candidate.is_empty() {
|
|
105
|
+
"bin".to_string()
|
|
106
|
+
} else {
|
|
107
|
+
result.push_str(candidate);
|
|
108
|
+
result
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
fn inline_image_to_extracted(image: InlineImage) -> ExtractedInlineImage {
|
|
115
|
+
ExtractedInlineImage {
|
|
116
|
+
data: image.data,
|
|
117
|
+
format: inline_image_format_to_str(&image.format),
|
|
118
|
+
filename: image.filename,
|
|
119
|
+
description: image.description,
|
|
120
|
+
dimensions: image.dimensions,
|
|
121
|
+
attributes: image.attributes.into_iter().collect(),
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
fn resolve_conversion_options(options: Option<ConversionOptions>) -> ConversionOptions {
|
|
126
|
+
options.unwrap_or_else(|| ConversionOptions {
|
|
127
|
+
extract_metadata: true,
|
|
128
|
+
hocr_spatial_tables: false,
|
|
129
|
+
preprocessing: PreprocessingOptions {
|
|
130
|
+
enabled: false,
|
|
131
|
+
..Default::default()
|
|
132
|
+
},
|
|
133
|
+
..Default::default()
|
|
134
|
+
})
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
138
|
+
fn html_requires_large_stack(len: usize) -> bool {
|
|
139
|
+
len >= LARGE_HTML_STACK_THRESHOLD_BYTES
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
fn convert_html_with_options(html: &str, options: ConversionOptions) -> Result<String> {
|
|
143
|
+
convert_html(html, Some(options))
|
|
144
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to convert HTML to Markdown: {}", e)))
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
fn convert_inline_images_with_options(
|
|
148
|
+
html: &str,
|
|
149
|
+
options: ConversionOptions,
|
|
150
|
+
image_config: LibInlineImageConfig,
|
|
151
|
+
) -> Result<HtmlExtraction> {
|
|
152
|
+
convert_with_inline_images(html, Some(options), image_config)
|
|
153
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to convert HTML to Markdown with images: {}", e)))
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
157
|
+
fn convert_inline_images_with_large_stack(
|
|
158
|
+
html: String,
|
|
159
|
+
options: ConversionOptions,
|
|
160
|
+
image_config: LibInlineImageConfig,
|
|
161
|
+
) -> Result<HtmlExtraction> {
|
|
162
|
+
run_on_dedicated_stack(move || convert_inline_images_with_options(&html, options, image_config))
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
166
|
+
fn run_on_dedicated_stack<T, F>(job: F) -> Result<T>
|
|
167
|
+
where
|
|
168
|
+
T: Send + 'static,
|
|
169
|
+
F: FnOnce() -> Result<T> + Send + 'static,
|
|
170
|
+
{
|
|
171
|
+
let handle = thread::Builder::new()
|
|
172
|
+
.name("kreuzberg-html-conversion".to_string())
|
|
173
|
+
.stack_size(HTML_CONVERSION_STACK_SIZE_BYTES)
|
|
174
|
+
.spawn(job)
|
|
175
|
+
.map_err(|err| KreuzbergError::Other(format!("Failed to spawn HTML conversion thread: {}", err)))?;
|
|
176
|
+
|
|
177
|
+
match handle.join() {
|
|
178
|
+
Ok(result) => result,
|
|
179
|
+
Err(panic) => {
|
|
180
|
+
let reason = extract_panic_reason(&panic);
|
|
181
|
+
Err(KreuzbergError::Other(format!("HTML conversion panicked: {}", reason)))
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
187
|
+
/// Extracts a human-readable message from a panic payload.
///
/// Payloads that are a `&str` or a `String` are returned as text; any other
/// payload type yields `"unknown panic"`.
fn extract_panic_reason(panic: &Box<dyn Any + Send + 'static>) -> String {
    // Look through the box so the downcasts inspect the payload itself.
    let payload: &(dyn Any + Send) = &**panic;

    payload
        .downcast_ref::<&str>()
        .map(|text| (*text).to_owned())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic".to_string())
}
|
|
196
|
+
|
|
197
|
+
/// Convert HTML to markdown with optional configuration.
|
|
198
|
+
///
|
|
199
|
+
/// Uses sensible defaults if no configuration is provided:
|
|
200
|
+
/// - `extract_metadata = true` (parse YAML frontmatter)
|
|
201
|
+
/// - `hocr_spatial_tables = false` (disable hOCR table detection)
|
|
202
|
+
/// - `preprocessing.enabled = false` (disable HTML preprocessing)
|
|
203
|
+
///
|
|
204
|
+
/// # WASM Limitations
|
|
205
|
+
///
|
|
206
|
+
/// In WASM builds, HTML files larger than 2MB will be rejected with an error
|
|
207
|
+
/// to prevent stack overflow. For larger files, use the native library.
|
|
208
|
+
pub fn convert_html_to_markdown(html: &str, options: Option<ConversionOptions>) -> Result<String> {
|
|
209
|
+
#[cfg(target_arch = "wasm32")]
|
|
210
|
+
if html.len() > MAX_HTML_SIZE_BYTES {
|
|
211
|
+
return Err(KreuzbergError::validation(format!(
|
|
212
|
+
"HTML file size ({} bytes) exceeds WASM limit of {} bytes (2MB). \
|
|
213
|
+
Large HTML files cannot be processed in WASM due to stack constraints. \
|
|
214
|
+
Consider using the native library for files of this size.",
|
|
215
|
+
html.len(),
|
|
216
|
+
MAX_HTML_SIZE_BYTES
|
|
217
|
+
)));
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
let options = resolve_conversion_options(options);
|
|
221
|
+
|
|
222
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
223
|
+
if html_requires_large_stack(html.len()) {
|
|
224
|
+
let html = html.to_string();
|
|
225
|
+
return run_on_dedicated_stack(move || convert_html_with_options(&html, options));
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
convert_html_with_options(html, options)
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
/// Process HTML with optional image extraction.
|
|
232
|
+
///
|
|
233
|
+
/// # WASM Limitations
|
|
234
|
+
///
|
|
235
|
+
/// In WASM builds, HTML files larger than 2MB will be rejected to prevent stack overflow.
|
|
236
|
+
pub fn process_html(
|
|
237
|
+
html: &str,
|
|
238
|
+
options: Option<ConversionOptions>,
|
|
239
|
+
extract_images: bool,
|
|
240
|
+
max_image_size: u64,
|
|
241
|
+
) -> Result<HtmlExtractionResult> {
|
|
242
|
+
#[cfg(target_arch = "wasm32")]
|
|
243
|
+
if html.len() > MAX_HTML_SIZE_BYTES {
|
|
244
|
+
return Err(KreuzbergError::validation(format!(
|
|
245
|
+
"HTML file size ({} bytes) exceeds WASM limit of {} bytes (2MB). \
|
|
246
|
+
Large HTML files cannot be processed in WASM due to stack constraints.",
|
|
247
|
+
html.len(),
|
|
248
|
+
MAX_HTML_SIZE_BYTES
|
|
249
|
+
)));
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
if extract_images {
|
|
253
|
+
let options = resolve_conversion_options(options.clone());
|
|
254
|
+
let mut img_config = LibInlineImageConfig::new(max_image_size);
|
|
255
|
+
img_config.filename_prefix = Some("inline-image".to_string());
|
|
256
|
+
|
|
257
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
258
|
+
let extraction = if html_requires_large_stack(html.len()) {
|
|
259
|
+
convert_inline_images_with_large_stack(html.to_string(), options, img_config)?
|
|
260
|
+
} else {
|
|
261
|
+
convert_inline_images_with_options(html, options, img_config)?
|
|
262
|
+
};
|
|
263
|
+
|
|
264
|
+
#[cfg(target_arch = "wasm32")]
|
|
265
|
+
let extraction = convert_inline_images_with_options(html, options, img_config)?;
|
|
266
|
+
|
|
267
|
+
let images = extraction
|
|
268
|
+
.inline_images
|
|
269
|
+
.into_iter()
|
|
270
|
+
.map(inline_image_to_extracted)
|
|
271
|
+
.collect();
|
|
272
|
+
|
|
273
|
+
let warnings = extraction.warnings.into_iter().map(|w| w.message).collect();
|
|
274
|
+
|
|
275
|
+
Ok(HtmlExtractionResult {
|
|
276
|
+
markdown: extraction.markdown,
|
|
277
|
+
images,
|
|
278
|
+
warnings,
|
|
279
|
+
})
|
|
280
|
+
} else {
|
|
281
|
+
let options = resolve_conversion_options(options);
|
|
282
|
+
let markdown = convert_html_to_markdown(html, Some(options))?;
|
|
283
|
+
|
|
284
|
+
Ok(HtmlExtractionResult {
|
|
285
|
+
markdown,
|
|
286
|
+
images: Vec::new(),
|
|
287
|
+
warnings: Vec::new(),
|
|
288
|
+
})
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
/// Convert HTML to markdown with direct metadata extraction.
|
|
293
|
+
///
|
|
294
|
+
/// Extracts metadata directly from HTML using the metadata extraction
|
|
295
|
+
/// capabilities of the `html-to-markdown-rs` library, without relying
|
|
296
|
+
/// on YAML frontmatter in the converted markdown.
|
|
297
|
+
///
|
|
298
|
+
/// # WASM Limitations
|
|
299
|
+
///
|
|
300
|
+
/// In WASM builds, HTML files larger than 2MB will be rejected with an error
|
|
301
|
+
/// to prevent stack overflow. For larger files, use the native library.
|
|
302
|
+
pub fn convert_html_to_markdown_with_metadata(
|
|
303
|
+
html: &str,
|
|
304
|
+
options: Option<ConversionOptions>,
|
|
305
|
+
) -> Result<(String, Option<HtmlMetadata>)> {
|
|
306
|
+
#[cfg(target_arch = "wasm32")]
|
|
307
|
+
if html.len() > MAX_HTML_SIZE_BYTES {
|
|
308
|
+
return Err(KreuzbergError::validation(format!(
|
|
309
|
+
"HTML file size ({} bytes) exceeds WASM limit of {} bytes (2MB). \
|
|
310
|
+
Large HTML files cannot be processed in WASM due to stack constraints. \
|
|
311
|
+
Consider using the native library for files of this size.",
|
|
312
|
+
html.len(),
|
|
313
|
+
MAX_HTML_SIZE_BYTES
|
|
314
|
+
)));
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
let options = resolve_conversion_options(options);
|
|
318
|
+
let metadata_config = MetadataConfig::default();
|
|
319
|
+
|
|
320
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
321
|
+
if html_requires_large_stack(html.len()) {
|
|
322
|
+
let html = html.to_string();
|
|
323
|
+
return run_on_dedicated_stack(move || {
|
|
324
|
+
convert_with_metadata(&html, Some(options), metadata_config)
|
|
325
|
+
.map_err(|e| KreuzbergError::parsing(format!("HTML metadata extraction failed: {}", e)))
|
|
326
|
+
.map(|(markdown, extended_metadata)| {
|
|
327
|
+
let html_metadata = HtmlMetadata::from(extended_metadata);
|
|
328
|
+
(
|
|
329
|
+
markdown,
|
|
330
|
+
if html_metadata.is_empty() {
|
|
331
|
+
None
|
|
332
|
+
} else {
|
|
333
|
+
Some(html_metadata)
|
|
334
|
+
},
|
|
335
|
+
)
|
|
336
|
+
})
|
|
337
|
+
});
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
let (markdown, extended_metadata) = convert_with_metadata(html, Some(options), metadata_config)
|
|
341
|
+
.map_err(|e| KreuzbergError::parsing(format!("HTML metadata extraction failed: {}", e)))?;
|
|
342
|
+
|
|
343
|
+
let html_metadata = HtmlMetadata::from(extended_metadata);
|
|
344
|
+
|
|
345
|
+
Ok((
|
|
346
|
+
markdown,
|
|
347
|
+
if html_metadata.is_empty() {
|
|
348
|
+
None
|
|
349
|
+
} else {
|
|
350
|
+
Some(html_metadata)
|
|
351
|
+
},
|
|
352
|
+
))
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
#[cfg(test)]
|
|
356
|
+
mod tests {
|
|
357
|
+
use super::*;
|
|
358
|
+
use crate::types::{ImageType, LinkType, StructuredDataType, TextDirection};
|
|
359
|
+
|
|
360
|
+
#[test]
|
|
361
|
+
fn test_convert_simple_html() {
|
|
362
|
+
let html = "<h1>Hello World</h1><p>This is a test.</p>";
|
|
363
|
+
let result = convert_html_to_markdown(html, None).unwrap();
|
|
364
|
+
assert!(result.contains("# Hello World"));
|
|
365
|
+
assert!(result.contains("This is a test."));
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
#[test]
|
|
369
|
+
fn test_process_html_without_images() {
|
|
370
|
+
let html = "<h1>Test</h1><p>Content</p>";
|
|
371
|
+
let result = process_html(html, None, false, 1024 * 1024).unwrap();
|
|
372
|
+
assert!(result.markdown.contains("# Test"));
|
|
373
|
+
assert!(result.markdown.contains("Content"));
|
|
374
|
+
assert!(result.images.is_empty());
|
|
375
|
+
assert!(result.warnings.is_empty());
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
#[test]
|
|
379
|
+
fn test_html_with_inline_image() {
|
|
380
|
+
let html = r#"<p>Image: <img src="" alt="Test"></p>"#;
|
|
381
|
+
let mut options = ConversionOptions::default();
|
|
382
|
+
options.preprocessing.enabled = false;
|
|
383
|
+
let result = process_html(html, Some(options), true, 1024 * 1024).unwrap();
|
|
384
|
+
assert_eq!(result.images.len(), 1);
|
|
385
|
+
assert_eq!(result.images[0].format, "png");
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
#[test]
|
|
389
|
+
fn test_html_config_heading_style() {
|
|
390
|
+
let html = "<h1>Heading</h1>";
|
|
391
|
+
let options = ConversionOptions {
|
|
392
|
+
heading_style: HeadingStyle::Atx,
|
|
393
|
+
..Default::default()
|
|
394
|
+
};
|
|
395
|
+
let result = convert_html_to_markdown(html, Some(options)).unwrap();
|
|
396
|
+
assert!(result.contains("# Heading"));
|
|
397
|
+
}
|
|
398
|
+
|
|
399
|
+
#[test]
|
|
400
|
+
fn test_html_with_list() {
|
|
401
|
+
let html = "<ul><li>Item 1</li><li>Item 2</li></ul>";
|
|
402
|
+
let result = convert_html_to_markdown(html, None).unwrap();
|
|
403
|
+
assert!(result.contains("Item 1"));
|
|
404
|
+
assert!(result.contains("Item 2"));
|
|
405
|
+
}
|
|
406
|
+
|
|
407
|
+
#[test]
|
|
408
|
+
fn test_html_with_table() {
|
|
409
|
+
let html = "<table><tr><th>Header</th></tr><tr><td>Data</td></tr></table>";
|
|
410
|
+
let result = convert_html_to_markdown(html, None).unwrap();
|
|
411
|
+
assert!(result.contains("Header"));
|
|
412
|
+
assert!(result.contains("Data"));
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
#[test]
|
|
416
|
+
fn test_inline_image_format_conversion() {
|
|
417
|
+
assert_eq!(inline_image_format_to_str(&InlineImageFormat::Png), "png");
|
|
418
|
+
assert_eq!(inline_image_format_to_str(&InlineImageFormat::Jpeg), "jpeg");
|
|
419
|
+
assert_eq!(inline_image_format_to_str(&InlineImageFormat::Svg), "svg");
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
#[test]
|
|
423
|
+
fn test_preprocessing_config() {
|
|
424
|
+
let html = "<nav>Navigation</nav><p>Content</p>";
|
|
425
|
+
let mut options = ConversionOptions::default();
|
|
426
|
+
options.preprocessing.enabled = true;
|
|
427
|
+
options.preprocessing.preset = PreprocessingPreset::Standard;
|
|
428
|
+
options.preprocessing.remove_navigation = true;
|
|
429
|
+
let result = convert_html_to_markdown(html, Some(options)).unwrap();
|
|
430
|
+
assert!(result.contains("Content"));
|
|
431
|
+
}
|
|
432
|
+
|
|
433
|
+
#[test]
|
|
434
|
+
fn test_inline_image_format_other_with_extension() {
|
|
435
|
+
let format = InlineImageFormat::Other("image/x-custom.jpg".to_string());
|
|
436
|
+
assert_eq!(inline_image_format_to_str(&format), "jpg");
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
#[test]
|
|
440
|
+
fn test_inline_image_format_other_empty() {
|
|
441
|
+
let format = InlineImageFormat::Other("".to_string());
|
|
442
|
+
assert_eq!(inline_image_format_to_str(&format), "bin");
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
#[test]
|
|
446
|
+
fn test_inline_image_format_other_x_prefix() {
|
|
447
|
+
let format = InlineImageFormat::Other("x-custom".to_string());
|
|
448
|
+
assert_eq!(inline_image_format_to_str(&format), "custom");
|
|
449
|
+
}
|
|
450
|
+
|
|
451
|
+
#[test]
|
|
452
|
+
fn test_process_html_empty_string() {
|
|
453
|
+
let result = process_html("", None, false, 1024).unwrap();
|
|
454
|
+
assert!(result.markdown.is_empty() || result.markdown.trim().is_empty());
|
|
455
|
+
assert!(result.images.is_empty());
|
|
456
|
+
}
|
|
457
|
+
|
|
458
|
+
#[test]
|
|
459
|
+
fn test_preprocessing_keeps_main_content() {
|
|
460
|
+
let html = r#"
|
|
461
|
+
<!DOCTYPE html>
|
|
462
|
+
<html>
|
|
463
|
+
<body>
|
|
464
|
+
<nav><p>Skip me</p></nav>
|
|
465
|
+
<main id="content">
|
|
466
|
+
<article>
|
|
467
|
+
<h1>Taylor Swift</h1>
|
|
468
|
+
<p>Taylor Alison Swift is an American singer-songwriter.</p>
|
|
469
|
+
</article>
|
|
470
|
+
</main>
|
|
471
|
+
</body>
|
|
472
|
+
</html>
|
|
473
|
+
"#;
|
|
474
|
+
let markdown = convert_html_to_markdown(html, None).expect("conversion failed");
|
|
475
|
+
assert!(markdown.contains("Taylor Alison Swift"), "{markdown}");
|
|
476
|
+
}
|
|
477
|
+
|
|
478
|
+
/// Test extraction of core document metadata fields:
|
|
479
|
+
/// title, description, author, canonical_url, and base_href.
|
|
480
|
+
#[test]
|
|
481
|
+
fn test_metadata_document_fields() {
|
|
482
|
+
let html = r#"<!DOCTYPE html>
|
|
483
|
+
<html>
|
|
484
|
+
<head>
|
|
485
|
+
<title>Amazing Article</title>
|
|
486
|
+
<meta name="description" content="This is a description of the article">
|
|
487
|
+
<meta name="author" content="Jane Doe">
|
|
488
|
+
<link rel="canonical" href="https://example.com/article/amazing">
|
|
489
|
+
<base href="https://example.com/">
|
|
490
|
+
</head>
|
|
491
|
+
<body>
|
|
492
|
+
<h1>Amazing Article</h1>
|
|
493
|
+
<p>Content here.</p>
|
|
494
|
+
</body>
|
|
495
|
+
</html>"#;
|
|
496
|
+
|
|
497
|
+
let (_, metadata) = convert_html_to_markdown_with_metadata(html, None).unwrap();
|
|
498
|
+
let metadata = metadata.expect("metadata should be present");
|
|
499
|
+
|
|
500
|
+
assert_eq!(
|
|
501
|
+
metadata.title,
|
|
502
|
+
Some("Amazing Article".to_string()),
|
|
503
|
+
"Title should be extracted from <title> tag"
|
|
504
|
+
);
|
|
505
|
+
|
|
506
|
+
assert_eq!(
|
|
507
|
+
metadata.description,
|
|
508
|
+
Some("This is a description of the article".to_string()),
|
|
509
|
+
"Description should be extracted from meta description tag"
|
|
510
|
+
);
|
|
511
|
+
|
|
512
|
+
assert_eq!(
|
|
513
|
+
metadata.author,
|
|
514
|
+
Some("Jane Doe".to_string()),
|
|
515
|
+
"Author should be extracted from meta author tag"
|
|
516
|
+
);
|
|
517
|
+
|
|
518
|
+
assert_eq!(
|
|
519
|
+
metadata.canonical_url,
|
|
520
|
+
Some("https://example.com/article/amazing".to_string()),
|
|
521
|
+
"Canonical URL should be extracted from link[rel=canonical]"
|
|
522
|
+
);
|
|
523
|
+
|
|
524
|
+
assert_eq!(
|
|
525
|
+
metadata.base_href,
|
|
526
|
+
Some("https://example.com/".to_string()),
|
|
527
|
+
"Base href should be extracted from <base> tag"
|
|
528
|
+
);
|
|
529
|
+
}
|
|
530
|
+
|
|
531
|
+
/// Test that keywords are extracted as Vec<String>, not comma-separated string.
|
|
532
|
+
/// This validates the proper parsing of keyword metadata.
|
|
533
|
+
#[test]
|
|
534
|
+
fn test_metadata_keywords_as_vec() {
|
|
535
|
+
let html = r#"<!DOCTYPE html>
|
|
536
|
+
<html>
|
|
537
|
+
<head>
|
|
538
|
+
<meta name="keywords" content="rust, web, metadata, extraction">
|
|
539
|
+
</head>
|
|
540
|
+
<body>
|
|
541
|
+
<p>Test content</p>
|
|
542
|
+
</body>
|
|
543
|
+
</html>"#;
|
|
544
|
+
|
|
545
|
+
let (_, metadata) = convert_html_to_markdown_with_metadata(html, None).unwrap();
|
|
546
|
+
let metadata = metadata.expect("metadata should be present");
|
|
547
|
+
|
|
548
|
+
assert!(
|
|
549
|
+
!metadata.keywords.is_empty(),
|
|
550
|
+
"Keywords should be extracted as a vector"
|
|
551
|
+
);
|
|
552
|
+
assert!(
|
|
553
|
+
metadata.keywords.len() >= 4,
|
|
554
|
+
"Keywords should be split on comma separators"
|
|
555
|
+
);
|
|
556
|
+
|
|
557
|
+
let keyword_set: std::collections::HashSet<_> = metadata.keywords.iter().map(|k| k.trim()).collect();
|
|
558
|
+
assert!(
|
|
559
|
+
keyword_set.contains("rust") || keyword_set.iter().any(|k| k.contains("rust")),
|
|
560
|
+
"Keywords vector should contain 'rust'"
|
|
561
|
+
);
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
/// Test language extraction from the html lang attribute.
|
|
565
|
+
#[test]
|
|
566
|
+
fn test_metadata_language() {
|
|
567
|
+
let html = r#"<!DOCTYPE html>
|
|
568
|
+
<html lang="en-US">
|
|
569
|
+
<head>
|
|
570
|
+
<title>English Page</title>
|
|
571
|
+
</head>
|
|
572
|
+
<body>
|
|
573
|
+
<p>Content in English</p>
|
|
574
|
+
</body>
|
|
575
|
+
</html>"#;
|
|
576
|
+
|
|
577
|
+
let (_, metadata) = convert_html_to_markdown_with_metadata(html, None).unwrap();
|
|
578
|
+
let metadata = metadata.expect("metadata should be present");
|
|
579
|
+
|
|
580
|
+
assert_eq!(
|
|
581
|
+
metadata.language,
|
|
582
|
+
Some("en-US".to_string()),
|
|
583
|
+
"Language should be extracted from html lang attribute"
|
|
584
|
+
);
|
|
585
|
+
}
|
|
586
|
+
|
|
587
|
+
/// Test text direction extraction (ltr, rtl, auto).
|
|
588
|
+
/// Validates the detection of document text directionality.
|
|
589
|
+
#[test]
|
|
590
|
+
fn test_metadata_text_direction() {
|
|
591
|
+
let html_ltr = r#"<!DOCTYPE html>
|
|
592
|
+
<html dir="ltr">
|
|
593
|
+
<head><title>LTR</title></head>
|
|
594
|
+
<body><p>Left to right</p></body>
|
|
595
|
+
</html>"#;
|
|
596
|
+
|
|
597
|
+
let (_, metadata) = convert_html_to_markdown_with_metadata(html_ltr, None).unwrap();
|
|
598
|
+
let metadata = metadata.expect("metadata should be present");
|
|
599
|
+
assert_eq!(
|
|
600
|
+
metadata.text_direction,
|
|
601
|
+
Some(TextDirection::LeftToRight),
|
|
602
|
+
"Text direction should be extracted as LeftToRight"
|
|
603
|
+
);
|
|
604
|
+
|
|
605
|
+
let html_rtl = r#"<!DOCTYPE html>
|
|
606
|
+
<html dir="rtl">
|
|
607
|
+
<head><title>RTL</title></head>
|
|
608
|
+
<body><p>Right to left</p></body>
|
|
609
|
+
</html>"#;
|
|
610
|
+
|
|
611
|
+
let (_, metadata) = convert_html_to_markdown_with_metadata(html_rtl, None).unwrap();
|
|
612
|
+
let metadata = metadata.expect("metadata should be present");
|
|
613
|
+
assert_eq!(
|
|
614
|
+
metadata.text_direction,
|
|
615
|
+
Some(TextDirection::RightToLeft),
|
|
616
|
+
"Text direction should be extracted as RightToLeft"
|
|
617
|
+
);
|
|
618
|
+
|
|
619
|
+
let html_auto = r#"<!DOCTYPE html>
|
|
620
|
+
<html dir="auto">
|
|
621
|
+
<head><title>Auto</title></head>
|
|
622
|
+
<body><p>Auto direction</p></body>
|
|
623
|
+
</html>"#;
|
|
624
|
+
|
|
625
|
+
let (_, metadata) = convert_html_to_markdown_with_metadata(html_auto, None).unwrap();
|
|
626
|
+
let metadata = metadata.expect("metadata should be present");
|
|
627
|
+
assert_eq!(
|
|
628
|
+
metadata.text_direction,
|
|
629
|
+
Some(TextDirection::Auto),
|
|
630
|
+
"Text direction should be extracted as Auto"
|
|
631
|
+
);
|
|
632
|
+
}
|
|
633
|
+
|
|
634
|
+
/// Test Open Graph metadata extraction into BTreeMap.
|
|
635
|
+
/// Validates extraction of og:title, og:description, og:image, og:url,
|
|
636
|
+
/// og:type, and og:site_name.
|
|
637
|
+
#[test]
|
|
638
|
+
fn test_metadata_open_graph() {
|
|
639
|
+
let html = r#"<!DOCTYPE html>
|
|
640
|
+
<html>
|
|
641
|
+
<head>
|
|
642
|
+
<title>Social Article</title>
|
|
643
|
+
<meta property="og:title" content="Open Graph Title">
|
|
644
|
+
<meta property="og:description" content="Share this amazing article">
|
|
645
|
+
<meta property="og:image" content="https://example.com/image.jpg">
|
|
646
|
+
<meta property="og:url" content="https://example.com/article">
|
|
647
|
+
<meta property="og:type" content="article">
|
|
648
|
+
<meta property="og:site_name" content="My Website">
|
|
649
|
+
</head>
|
|
650
|
+
<body>
|
|
651
|
+
<h1>Article</h1>
|
|
652
|
+
</body>
|
|
653
|
+
</html>"#;
|
|
654
|
+
|
|
655
|
+
let (_, metadata) = convert_html_to_markdown_with_metadata(html, None).unwrap();
|
|
656
|
+
let metadata = metadata.expect("metadata should be present");
|
|
657
|
+
|
|
658
|
+
assert!(
|
|
659
|
+
!metadata.open_graph.is_empty(),
|
|
660
|
+
"Open Graph map should contain extracted OG tags"
|
|
661
|
+
);
|
|
662
|
+
|
|
663
|
+
assert!(
|
|
664
|
+
metadata.open_graph.contains_key("title")
|
|
665
|
+
|| metadata.open_graph.values().any(|v| v.contains("Open Graph Title")),
|
|
666
|
+
"Open Graph should contain title"
|
|
667
|
+
);
|
|
668
|
+
|
|
669
|
+
assert!(
|
|
670
|
+
metadata.open_graph.contains_key("description")
|
|
671
|
+
|| metadata.open_graph.values().any(|v| v.contains("Share this amazing")),
|
|
672
|
+
"Open Graph should contain description"
|
|
673
|
+
);
|
|
674
|
+
|
|
675
|
+
assert!(
|
|
676
|
+
metadata.open_graph.contains_key("image") || metadata.open_graph.values().any(|v| v.contains("image.jpg")),
|
|
677
|
+
"Open Graph should contain image URL"
|
|
678
|
+
);
|
|
679
|
+
|
|
680
|
+
assert!(
|
|
681
|
+
metadata.open_graph.contains_key("url")
|
|
682
|
+
|| metadata.open_graph.values().any(|v| v.contains("example.com/article")),
|
|
683
|
+
"Open Graph should contain URL"
|
|
684
|
+
);
|
|
685
|
+
|
|
686
|
+
assert!(
|
|
687
|
+
metadata.open_graph.contains_key("type") || metadata.open_graph.values().any(|v| v.contains("article")),
|
|
688
|
+
"Open Graph should contain type"
|
|
689
|
+
);
|
|
690
|
+
|
|
691
|
+
assert!(
|
|
692
|
+
metadata.open_graph.contains_key("site_name")
|
|
693
|
+
|| metadata.open_graph.values().any(|v| v.contains("My Website")),
|
|
694
|
+
"Open Graph should contain site name"
|
|
695
|
+
);
|
|
696
|
+
}
|
|
697
|
+
|
|
698
|
+
/// Test Twitter Card metadata extraction into BTreeMap.
|
|
699
|
+
/// Validates extraction of twitter:card, twitter:title, twitter:description,
|
|
700
|
+
/// twitter:image, twitter:site, and twitter:creator.
|
|
701
|
+
#[test]
|
|
702
|
+
fn test_metadata_twitter_card() {
|
|
703
|
+
let html = r#"<!DOCTYPE html>
|
|
704
|
+
<html>
|
|
705
|
+
<head>
|
|
706
|
+
<title>Tweetable Article</title>
|
|
707
|
+
<meta name="twitter:card" content="summary_large_image">
|
|
708
|
+
<meta name="twitter:title" content="Tweet-worthy Title">
|
|
709
|
+
<meta name="twitter:description" content="This deserves a retweet">
|
|
710
|
+
<meta name="twitter:image" content="https://example.com/tweet-image.jpg">
|
|
711
|
+
<meta name="twitter:site" content="@mywebsite">
|
|
712
|
+
<meta name="twitter:creator" content="@author">
|
|
713
|
+
</head>
|
|
714
|
+
<body>
|
|
715
|
+
<h1>Article</h1>
|
|
716
|
+
</body>
|
|
717
|
+
</html>"#;
|
|
718
|
+
|
|
719
|
+
let (_, metadata) = convert_html_to_markdown_with_metadata(html, None).unwrap();
|
|
720
|
+
let metadata = metadata.expect("metadata should be present");
|
|
721
|
+
|
|
722
|
+
assert!(
|
|
723
|
+
!metadata.twitter_card.is_empty(),
|
|
724
|
+
"Twitter Card map should contain extracted Twitter tags"
|
|
725
|
+
);
|
|
726
|
+
|
|
727
|
+
assert!(
|
|
728
|
+
metadata.twitter_card.contains_key("card")
|
|
729
|
+
|| metadata
|
|
730
|
+
.twitter_card
|
|
731
|
+
.values()
|
|
732
|
+
.any(|v| v.contains("summary_large_image")),
|
|
733
|
+
"Twitter Card should contain card type"
|
|
734
|
+
);
|
|
735
|
+
|
|
736
|
+
assert!(
|
|
737
|
+
metadata.twitter_card.contains_key("title")
|
|
738
|
+
|| metadata.twitter_card.values().any(|v| v.contains("Tweet-worthy Title")),
|
|
739
|
+
"Twitter Card should contain title"
|
|
740
|
+
);
|
|
741
|
+
|
|
742
|
+
assert!(
|
|
743
|
+
metadata.twitter_card.contains_key("description")
|
|
744
|
+
|| metadata.twitter_card.values().any(|v| v.contains("retweet")),
|
|
745
|
+
"Twitter Card should contain description"
|
|
746
|
+
);
|
|
747
|
+
|
|
748
|
+
assert!(
|
|
749
|
+
metadata.twitter_card.contains_key("image")
|
|
750
|
+
|| metadata.twitter_card.values().any(|v| v.contains("tweet-image.jpg")),
|
|
751
|
+
"Twitter Card should contain image"
|
|
752
|
+
);
|
|
753
|
+
|
|
754
|
+
assert!(
|
|
755
|
+
metadata.twitter_card.contains_key("site")
|
|
756
|
+
|| metadata.twitter_card.values().any(|v| v.contains("@mywebsite")),
|
|
757
|
+
"Twitter Card should contain site handle"
|
|
758
|
+
);
|
|
759
|
+
|
|
760
|
+
assert!(
|
|
761
|
+
metadata.twitter_card.contains_key("creator")
|
|
762
|
+
|| metadata.twitter_card.values().any(|v| v.contains("@author")),
|
|
763
|
+
"Twitter Card should contain creator handle"
|
|
764
|
+
);
|
|
765
|
+
}
|
|
766
|
+
|
|
767
|
+
/// Checks that generic `<meta>` tags are collected into the `meta_tags` map.
///
/// The fixture declares `viewport`, `robots`, `theme-color` and an `http-equiv` tag. A tag
/// counts as captured when either its name appears as a key or its content appears in a value.
#[test]
fn test_metadata_generic_meta_tags() {
    let html = r##"<!DOCTYPE html>
<html>
<head>
<title>Generic Tags</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="robots" content="index, follow">
<meta name="theme-color" content="#ffffff">
<meta http-equiv="X-UA-Compatible" content="ie=edge">
</head>
<body>
<p>Content</p>
</body>
</html>"##;

    let (_, extracted) = convert_html_to_markdown_with_metadata(html, None).unwrap();
    let extracted = extracted.expect("metadata should be present");
    let tags = &extracted.meta_tags;

    assert!(!tags.is_empty(), "Meta tags map should contain generic meta tags");

    let has_viewport = tags.contains_key("viewport") || tags.values().any(|v| v.contains("width=device-width"));
    assert!(has_viewport, "Meta tags should contain viewport");

    let has_robots = tags.contains_key("robots") || tags.values().any(|v| v.contains("index, follow"));
    assert!(has_robots, "Meta tags should contain robots directive");
}
|
|
806
|
+
|
|
807
|
+
/// Test heading extraction: level, text, `id` attribute, and `html_offset`.
///
/// The fixture contains one heading at each of levels 1 to 4; only the H1 and H3 carry an
/// `id` attribute.
#[test]
fn test_metadata_headers() {
    let html = r#"<!DOCTYPE html>
<html>
<head><title>Headers</title></head>
<body>
<h1 id="main-title">Main Title</h1>
<h2>Section Header</h2>
<h3 id="subsection">Subsection</h3>
<p>Some content</p>
<h4>Deep Heading</h4>
</body>
</html>"#;

    let (_, metadata) = convert_html_to_markdown_with_metadata(html, None).unwrap();
    let metadata = metadata.expect("metadata should be present");
    let headers = &metadata.headers;

    assert!(
        !headers.is_empty(),
        "Headers vector should contain extracted headings"
    );

    // `expect` fails with the same message the former `is_some()` assertion used, and
    // avoids unwrapping the same Option repeatedly.
    let h1 = headers
        .iter()
        .find(|h| h.level == 1)
        .expect("H1 header should be extracted");
    assert_eq!(h1.text, "Main Title", "H1 text should be correctly extracted");
    assert_eq!(
        h1.id.as_deref(),
        Some("main-title"),
        "H1 id attribute should be extracted"
    );
    // The whole fixture is far shorter than 1000 bytes, so any valid offset is below it.
    assert!(
        h1.html_offset < 1000,
        "H1 html_offset should be within reasonable range"
    );

    let h2 = headers
        .iter()
        .find(|h| h.level == 2)
        .expect("H2 header should be extracted");
    assert_eq!(
        h2.text, "Section Header",
        "H2 text should be correctly extracted"
    );

    let h3 = headers
        .iter()
        .find(|h| h.level == 3)
        .expect("H3 header should be extracted");
    assert_eq!(h3.id.as_deref(), Some("subsection"), "H3 id should be extracted");

    assert!(
        headers.iter().any(|h| h.level == 4),
        "H4 header should be extracted"
    );
}
|
|
862
|
+
|
|
863
|
+
/// Test link extraction with href, text, title, and `link_type` classification.
///
/// Validates classification of anchor, external, email, phone, and internal links.
#[test]
fn test_metadata_links() {
    let html = r##"<!DOCTYPE html>
<html>
<head><title>Links</title></head>
<body>
<a href="#section1">Anchor Link</a>
<a href="https://external.com/page">External Link</a>
<a href="/about" title="About Page">Internal Link</a>
<a href="mailto:test@example.com">Email Link</a>
<a href="tel:+1234567890">Phone Link</a>
<a href="https://example.com/page">Same Domain Link</a>
</body>
</html>"##;

    let (_, metadata) = convert_html_to_markdown_with_metadata(html, None).unwrap();
    let metadata = metadata.expect("metadata should be present");
    let links = &metadata.links;

    assert!(!links.is_empty(), "Links vector should contain extracted links");

    let anchor = links
        .iter()
        .find(|l| l.href.starts_with('#'))
        .expect("Anchor link should be extracted");
    assert_eq!(
        anchor.link_type,
        LinkType::Anchor,
        "Link starting with # should be classified as Anchor"
    );
    assert_eq!(anchor.text, "Anchor Link", "Link text should be extracted");

    let external = links
        .iter()
        .find(|l| l.href.contains("external.com"))
        .expect("External link should be extracted");
    assert_eq!(
        external.link_type,
        LinkType::External,
        "External domain link should be classified as External"
    );

    let email = links
        .iter()
        .find(|l| l.href.starts_with("mailto:"))
        .expect("Email link should be extracted");
    assert_eq!(
        email.link_type,
        LinkType::Email,
        "mailto: link should be classified as Email"
    );

    let phone = links
        .iter()
        .find(|l| l.href.starts_with("tel:"))
        .expect("Phone link should be extracted");
    assert_eq!(
        phone.link_type,
        LinkType::Phone,
        "tel: link should be classified as Phone"
    );

    let internal = links
        .iter()
        .find(|l| l.href == "/about")
        .expect("Internal link should be extracted");
    assert_eq!(
        internal.title.as_deref(),
        Some("About Page"),
        "Link title attribute should be extracted"
    );
    // The doc comment promises internal classification is validated, so check it.
    // NOTE(review): assumes root-relative hrefs are classified as Internal; confirm
    // against the link classifier.
    assert_eq!(
        internal.link_type,
        LinkType::Internal,
        "Root-relative link should be classified as Internal"
    );
}
|
|
930
|
+
|
|
931
|
+
/// Test image extraction with src, alt, title, dimensions, and `image_type` classification.
///
/// Covers an absolute URL, a root-relative path with explicit dimensions, a base64 PNG data
/// URI, an SVG data URI, and a `./`-relative path.
#[test]
fn test_metadata_images() {
    // NOTE(review): the PNG and SVG data-URI fixtures were reconstructed from the assertions
    // below (an image whose src starts with "data:image/png" and one whose src contains
    // "svg"); confirm the exact payloads against the upstream fixture.
    let html = r#"<!DOCTYPE html>
<html>
<head><title>Images</title></head>
<body>
<img src="https://example.com/photo.jpg" alt="Photo" title="A Photo">
<img src="/images/logo.png" alt="Logo" width="200" height="150">
<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==" alt="Data URI">
<img src="data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%3E%3C/svg%3E" alt="Inline SVG">
<img src="./relative/image.gif" alt="Relative Path">
</body>
</html>"#;

    let (_, metadata) = convert_html_to_markdown_with_metadata(html, None).unwrap();
    let metadata = metadata.expect("metadata should be present");
    let images = &metadata.images;

    assert!(
        !images.is_empty(),
        "Images vector should contain extracted images"
    );

    let external_img = images
        .iter()
        .find(|img| img.src.contains("example.com"))
        .expect("External image should be extracted");
    assert_eq!(
        external_img.alt.as_deref(),
        Some("Photo"),
        "Image alt text should be extracted"
    );
    assert_eq!(
        external_img.title.as_deref(),
        Some("A Photo"),
        "Image title should be extracted"
    );
    assert_eq!(
        external_img.image_type,
        ImageType::External,
        "External image should be classified as External"
    );

    let img_with_dims = images
        .iter()
        .find(|img| img.src.contains("logo.png"))
        .expect("Image with dimensions should be extracted");
    assert_eq!(
        img_with_dims.dimensions,
        Some((200, 150)),
        "Image dimensions should be extracted as (width, height)"
    );

    let svg_img = images
        .iter()
        .find(|img| img.src.contains("svg"))
        .expect("Inline SVG should be extracted");
    // An SVG delivered through a data URI may legitimately be reported as either kind.
    assert!(
        svg_img.image_type == ImageType::InlineSvg || svg_img.image_type == ImageType::DataUri,
        "SVG should be classified as either InlineSvg or DataUri"
    );

    let data_uri_img = images
        .iter()
        .find(|img| img.src.starts_with("data:image/png"))
        .expect("Data URI image should be extracted");
    assert_eq!(
        data_uri_img.image_type,
        ImageType::DataUri,
        "Base64 data URI should be classified as DataUri"
    );

    let relative_img = images
        .iter()
        .find(|img| img.src.contains("relative"))
        .expect("Relative path image should be extracted");
    assert_eq!(
        relative_img.image_type,
        ImageType::Relative,
        "Relative path should be classified as Relative"
    );
}
|
|
1004
|
+
|
|
1005
|
+
/// Test structured data extraction from a JSON-LD script block.
///
/// The fixture also contains a microdata-annotated `<article>`. Microdata support is treated
/// as optional: its content is only checked when a microdata entry is actually returned.
#[test]
fn test_metadata_structured_data() {
    let html = r#"<!DOCTYPE html>
<html>
<head>
<title>Structured Data</title>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "Example Article",
"author": "John Doe"
}
</script>
</head>
<body>
<article itemscope itemtype="https://schema.org/NewsArticle">
<h1 itemprop="headline">News Item</h1>
<p itemprop="articleBody">Content here</p>
</article>
</body>
</html>"#;

    let (_, metadata) = convert_html_to_markdown_with_metadata(html, None).unwrap();
    let metadata = metadata.expect("metadata should be present");
    let blocks = &metadata.structured_data;

    assert!(
        !blocks.is_empty(),
        "Structured data vector should contain extracted data blocks"
    );

    let json_ld = blocks
        .iter()
        .find(|sd| sd.data_type == StructuredDataType::JsonLd)
        .expect("JSON-LD should be extracted");
    assert!(
        json_ld.raw_json.contains("Article"),
        "JSON-LD raw_json should contain schema type"
    );
    assert_eq!(
        json_ld.schema_type.as_deref(),
        Some("Article"),
        "JSON-LD schema_type should be detected"
    );

    // Microdata extraction is optional; validate the entry only when one is present.
    if let Some(md) = blocks
        .iter()
        .find(|sd| sd.data_type == StructuredDataType::Microdata)
    {
        assert!(
            md.raw_json.contains("NewsArticle") || md.schema_type.as_deref() == Some("NewsArticle"),
            "Microdata schema_type should contain NewsArticle if extracted"
        );
    }
}
|
|
1064
|
+
|
|
1065
|
+
/// An empty input document must yield either no metadata at all or a metadata value that
/// reports itself as empty.
#[test]
fn test_metadata_empty_html() {
    let (_, metadata) = convert_html_to_markdown_with_metadata("", None).unwrap();

    let nothing_extracted = match &metadata {
        None => true,
        Some(meta) => meta.is_empty(),
    };

    assert!(
        nothing_extracted,
        "Empty HTML should return None or empty metadata"
    );
}
|
|
1077
|
+
|
|
1078
|
+
/// Test that HTML without any metadata tags yields empty Open Graph and Twitter Card maps.
///
/// The title is deliberately not checked: it may be absent or derived from the `<h1>`.
/// The assertions only run when a metadata value is returned at all.
#[test]
fn test_metadata_no_metadata() {
    let html = r#"<!DOCTYPE html>
<html>
<body>
<h1>Simple Page</h1>
<p>Just content, no metadata tags.</p>
</body>
</html>"#;

    let (_, metadata) = convert_html_to_markdown_with_metadata(html, None).unwrap();

    if let Some(meta) = metadata {
        // The former `title.is_none() || title.is_some()` check was always true and has
        // been removed; either outcome for the title is acceptable here.
        assert!(meta.open_graph.is_empty(), "Open Graph should be empty with no OG tags");
        assert!(
            meta.twitter_card.is_empty(),
            "Twitter Card should be empty with no Twitter tags"
        );
    }
}
|
|
1103
|
+
|
|
1104
|
+
/// Test that malformed HTML is handled gracefully: the conversion must return `Ok` and must
/// not panic.
///
/// The fixture contains an unclosed `<title>`, an attribute with a missing closing quote,
/// unclosed `<h1>`/`<p>` elements, and a stray closing `</div>`. Whether metadata is produced
/// for such input is intentionally left unspecified.
#[test]
fn test_metadata_malformed_html() {
    let html = r#"<!DOCTYPE html>
<html>
<head>
<title>Malformed
<meta name="author content="No closing quote
</head>
<body>
<h1>Title
<p>Unclosed paragraph
<div>Unmatched closing tag</div></div>
</body>
</html>"#;

    // Only success matters here. The former `is_some() || is_none()` check on the returned
    // metadata was a tautology and has been dropped. The error is included in the failure
    // message to make a regression diagnosable.
    if let Err(err) = convert_html_to_markdown_with_metadata(html, None) {
        panic!("Malformed HTML should be handled gracefully without error: {err:?}");
    }
}
|
|
1132
|
+
|
|
1133
|
+
/// Test handling of special characters and HTML entities in metadata values.
///
/// The fixture mixes named entities (`&eacute;`, `&amp;`, `&quot;`, `&lt;`, `&copy;`) with
/// literal non-ASCII characters. Each field is only checked when it was extracted.
#[test]
fn test_metadata_special_characters() {
    // Quotes and ampersands inside attribute values are entity-encoded so the fixture is
    // well-formed HTML and actually exercises entity handling.
    // NOTE(review): entity spellings were reconstructed from the decoded text; confirm
    // against the upstream fixture.
    let html = r#"<!DOCTYPE html>
<html>
<head>
<title>Caf&eacute; &amp; Restaurant &quot;Guide&quot;</title>
<meta name="description" content="5 stars ★★★★★ &lt; 50% off">
<meta name="author" content="José García-López">
<meta property="og:title" content="Quote &quot;Special&quot; &amp; Characters">
</head>
<body>
<h1>Article Title &copy; 2024</h1>
</body>
</html>"#;

    let (_, metadata) = convert_html_to_markdown_with_metadata(html, None).unwrap();
    let metadata = metadata.expect("metadata should be present");

    if let Some(title) = &metadata.title {
        assert!(!title.is_empty(), "Title should be extracted and decoded");
    }

    if let Some(author) = &metadata.author {
        // Accept either the accented or an ASCII-folded given name. The previous
        // alternative "Jose" could never match the accented "José".
        assert!(
            author.contains("García") || author.contains("José") || author.contains("Jose"),
            "Special characters should be handled correctly"
        );
    }

    if let Some(desc) = &metadata.description {
        assert!(!desc.is_empty(), "Description should be extracted");
    }
}
|
|
1167
|
+
|
|
1168
|
+
/// Duplicate `<meta>` tags: whichever value the extractor keeps, the resulting field must be
/// one of the declared values.
///
/// The fixture has a single `<title>` plus two `description` and two `author` tags. Each
/// field is only checked when it was extracted.
#[test]
fn test_metadata_duplicate_tags() {
    let html = r#"<!DOCTYPE html>
<html>
<head>
<title>First Title</title>
<meta name="description" content="First description">
<meta name="description" content="Second description (should override)">
<meta name="author" content="Author One">
<meta name="author" content="Author Two">
</head>
<body>
<p>Content</p>
</body>
</html>"#;

    let (_, extracted) = convert_html_to_markdown_with_metadata(html, None).unwrap();
    let extracted = extracted.expect("metadata should be present");

    if let Some(page_title) = extracted.title.as_deref() {
        assert_eq!(
            page_title, "First Title",
            "Title should be the single value from first title tag"
        );
    }

    if let Some(summary) = extracted.description.as_deref() {
        assert!(
            !summary.is_empty(),
            "Description should be populated even with duplicates"
        );

        let from_either_tag = ["First", "Second"].iter().any(|marker| summary.contains(marker));
        assert!(
            from_either_tag,
            "Description should contain one of the duplicate values"
        );
    }
}
|
|
1206
|
+
|
|
1207
|
+
/// Comprehensive test of a complete HTML document combining every metadata category.
///
/// Checks document-level fields (title, description, author, keywords, language, text
/// direction, canonical URL, base href), Open Graph and Twitter Card maps, headings, links,
/// images, JSON-LD structured data, and the generated Markdown.
#[test]
fn test_metadata_comprehensive() {
    // Every line ends with `\`, so the newline and the next line's leading whitespace are
    // skipped and the fixture is a single-line document.
    // NOTE(review): the PNG data-URI payload was reconstructed (the assertion below requires
    // an image classified as DataUri); confirm against the upstream fixture.
    let html = "<html lang=\"en\" dir=\"ltr\"><head>\
        <meta charset=\"UTF-8\">\
        <title>Complete Metadata Example</title>\
        <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\
        <meta name=\"description\" content=\"Comprehensive metadata extraction test page\">\
        <meta name=\"keywords\" content=\"metadata, extraction, rust, web\">\
        <meta name=\"author\" content=\"Test Author\">\
        <meta name=\"robots\" content=\"index, follow\">\
        <meta property=\"og:title\" content=\"OG Title\">\
        <meta property=\"og:description\" content=\"OG Description\">\
        <meta property=\"og:image\" content=\"https://example.com/og-image.jpg\">\
        <meta property=\"og:url\" content=\"https://example.com/article\">\
        <meta property=\"og:type\" content=\"article\">\
        <meta property=\"og:site_name\" content=\"Example Site\">\
        <meta name=\"twitter:card\" content=\"summary_large_image\">\
        <meta name=\"twitter:title\" content=\"Tweet Title\">\
        <meta name=\"twitter:description\" content=\"Tweet Description\">\
        <meta name=\"twitter:image\" content=\"https://example.com/tweet.jpg\">\
        <meta name=\"twitter:site\" content=\"@example\">\
        <link rel=\"canonical\" href=\"https://example.com/article/complete\">\
        <base href=\"https://example.com/\">\
        <script type=\"application/ld+json\">{\"@context\":\"https://schema.org\",\"@type\":\"Article\",\"headline\":\"Complete Metadata Example\",\"author\":\"Test Author\",\"datePublished\":\"2024-01-01\"}</script>\
        </head><body>\
        <header><h1 id=\"page-title\">Complete Metadata Example</h1><p>Test</p></header>\
        <nav><a href=\"#intro\">Intro</a><a href=\"https://external.com\">External</a></nav>\
        <main>\
        <section id=\"intro\"><h2>Introduction</h2><p>Purpose.</p><img src=\"https://example.com/intro.jpg\" alt=\"Intro image\" title=\"Intro\"></section>\
        <section id=\"content\">\
        <h3>Content</h3><h4>Sub</h4><p>Details.</p>\
        <h3>Gallery</h3>\
        <img src=\"/images/photo1.jpg\" alt=\"Photo 1\" width=\"400\" height=\"300\">\
        <img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==\" alt=\"Data URI\">\
        <img src=\"./relative/image.gif\" alt=\"Relative\">\
        </section>\
        <section id=\"links\">\
        <h3>Links</h3>\
        <a href=\"#top\">Top</a>\
        <a href=\"/about\" title=\"About\">Internal</a>\
        <a href=\"mailto:contact@example.com\">Email</a>\
        <a href=\"tel:+1-555-1234\">Phone</a>\
        </section>\
        </main>\
        <footer><p>2024 Example</p></footer>\
        </body></html>";

    let (markdown, metadata) = convert_html_to_markdown_with_metadata(html, None).unwrap();
    let metadata = metadata.expect("comprehensive HTML should have metadata");

    // Document-level fields.
    assert_eq!(
        metadata.title.as_deref(),
        Some("Complete Metadata Example"),
        "Title should be extracted"
    );
    assert_eq!(
        metadata.description.as_deref(),
        Some("Comprehensive metadata extraction test page"),
        "Description should be extracted"
    );
    assert_eq!(
        metadata.author.as_deref(),
        Some("Test Author"),
        "Author should be extracted"
    );
    assert!(!metadata.keywords.is_empty(), "Keywords should be extracted");
    assert_eq!(
        metadata.language.as_deref(),
        Some("en"),
        "Language should be extracted"
    );
    assert_eq!(
        metadata.text_direction,
        Some(TextDirection::LeftToRight),
        "Text direction should be extracted"
    );
    assert_eq!(
        metadata.canonical_url.as_deref(),
        Some("https://example.com/article/complete"),
        "Canonical URL should be extracted"
    );
    assert_eq!(
        metadata.base_href.as_deref(),
        Some("https://example.com/"),
        "Base href should be extracted"
    );

    // Social metadata maps.
    assert!(!metadata.open_graph.is_empty(), "Open Graph tags should be extracted");
    assert!(
        !metadata.twitter_card.is_empty(),
        "Twitter Card tags should be extracted"
    );

    // Headings.
    assert!(!metadata.headers.is_empty(), "Headers should be extracted");
    let h1_count = metadata.headers.iter().filter(|h| h.level == 1).count();
    assert_eq!(h1_count, 1, "Should have exactly one H1");
    assert!(metadata.headers.iter().any(|h| h.level == 2), "Should have H2 headers");
    assert!(metadata.headers.iter().any(|h| h.level == 3), "Should have H3 headers");

    // Links.
    assert!(!metadata.links.is_empty(), "Links should be extracted");
    assert!(
        metadata.links.iter().any(|l| l.link_type == LinkType::Anchor),
        "Anchor links should be present"
    );
    assert!(
        metadata.links.iter().any(|l| l.link_type == LinkType::Email),
        "Email links should be present"
    );
    assert!(
        metadata.links.iter().any(|l| l.link_type == LinkType::Phone),
        "Phone links should be present"
    );

    // Images.
    assert!(!metadata.images.is_empty(), "Images should be extracted");
    assert!(
        metadata.images.iter().any(|img| img.image_type == ImageType::External),
        "External images should be present"
    );
    assert!(
        metadata.images.iter().any(|img| img.image_type == ImageType::DataUri),
        "Data URI images should be present"
    );
    assert!(
        metadata.images.iter().any(|img| img.image_type == ImageType::Relative),
        "Relative images should be present"
    );

    // Only one fixture image declares width/height, so the first image with dimensions
    // must be that one.
    let img_with_dims = metadata
        .images
        .iter()
        .find(|img| img.dimensions.is_some())
        .expect("At least one image should have dimensions");
    assert_eq!(
        img_with_dims.dimensions,
        Some((400, 300)),
        "Image dimensions should be correctly extracted"
    );

    // Structured data and Markdown output.
    assert!(
        !metadata.structured_data.is_empty(),
        "Structured data should be extracted"
    );

    assert!(!markdown.is_empty(), "Markdown should be generated");
    assert!(
        markdown.contains("Complete Metadata Example"),
        "Markdown should contain heading text"
    );
}
|
|
1358
|
+
|
|
1359
|
+
/// Real-world-like webpage structure with realistic metadata patterns.
///
/// Models a blog post: document metadata, Open Graph and Twitter Card tags, JSON-LD
/// (`BlogPosting`), a navigation bar, nested H1–H3 headings, internal/external/email/phone/
/// anchor links, and a hero image plus two diagrams.
#[test]
fn test_metadata_real_world_webpage() {
    // Every line ends with `\`, so the newline and the next line's leading whitespace are
    // skipped and the fixture is a single-line document.
    let html = "<!DOCTYPE html>\
        <html lang=\"en\"><head>\
        <meta charset=\"UTF-8\">\
        <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\
        <title>How to Build Rust Web Applications | TechBlog</title>\
        <meta name=\"description\" content=\"Learn how to build scalable web applications using Rust\">\
        <meta name=\"keywords\" content=\"rust, web development, actix, async, tutorial\">\
        <meta name=\"author\" content=\"Sarah Chen\">\
        <link rel=\"canonical\" href=\"https://techblog.example.com/rust-web-apps\">\
        <base href=\"https://techblog.example.com/\">\
        <meta property=\"og:title\" content=\"How to Build Rust Web Applications\">\
        <meta property=\"og:description\" content=\"A comprehensive guide to building web apps with Rust\">\
        <meta property=\"og:image\" content=\"https://techblog.example.com/images/rust-web.jpg\">\
        <meta property=\"og:type\" content=\"article\">\
        <meta name=\"twitter:card\" content=\"summary_large_image\">\
        <meta name=\"twitter:title\" content=\"How to Build Rust Web Applications\">\
        <meta name=\"twitter:image\" content=\"https://techblog.example.com/images/rust-web-twitter.jpg\">\
        <meta name=\"twitter:creator\" content=\"@sarahcodes\">\
        <script type=\"application/ld+json\">{\"@context\":\"https://schema.org\",\"@type\":\"BlogPosting\",\"headline\":\"How to Build Rust Web Applications\"}</script>\
        </head><body>\
        <header><nav>\
        <a href=\"/\">Home</a><a href=\"/blog\">Blog</a><a href=\"/resources\">Resources</a><a href=\"/about\">About</a>\
        </nav></header>\
        <article>\
        <h1>How to Build Rust Web Applications</h1>\
        <img src=\"https://techblog.example.com/images/rust-web-hero.jpg\" alt=\"Rust web development\" title=\"Hero image\">\
        <p>Guide content here</p>\
        <h2>Getting Started</h2>\
        <p>Before diving in, install Rust.</p>\
        <h3>Installation</h3>\
        <p>Visit <a href=\"https://www.rust-lang.org/tools/install\">installation page</a>.</p>\
        <h3>Your First Project</h3>\
        <p>Create project with cargo</p>\
        <h2>Building</h2>\
        <h3>Dependencies</h3>\
        <p>Setup Cargo.toml</p>\
        <h3>Routes</h3>\
        <p>Learn <a href=\"/blog/rust-routing\">routing</a>.</p>\
        <h2>Advanced</h2>\
        <h3>Async</h3>\
        <p>See <a href=\"https://tokio.rs\" title=\"Tokio async runtime\">Tokio</a>.</p>\
        <h3>Database</h3>\
        <p>Contact <a href=\"mailto:hello@techblog.example.com\">hello@techblog.example.com</a></p>\
        <h2>Gallery</h2>\
        <img src=\"/images/diagram1.png\" alt=\"Architecture diagram\" width=\"600\" height=\"400\">\
        <img src=\"/images/diagram2.png\" alt=\"Flow chart\" width=\"600\" height=\"400\">\
        <h2>Conclusion</h2>\
        <p>Excellent choice. <a href=\"/blog/rust-deployment\">Deployment</a>.</p>\
        <footer><p>Questions? <a href=\"tel:+1-555-0100\">Call</a> or <a href=\"#contact\">contact</a>.</p></footer>\
        </article>\
        </body></html>";

    let (markdown, metadata) = convert_html_to_markdown_with_metadata(html, None).unwrap();
    let metadata = metadata.expect("real-world HTML should have metadata");

    // Document-level fields.
    assert_eq!(
        metadata.title.as_deref(),
        Some("How to Build Rust Web Applications | TechBlog"),
        "Real-world title with site name should be extracted"
    );
    assert!(metadata.description.is_some(), "Description should be present");
    assert_eq!(
        metadata.author.as_deref(),
        Some("Sarah Chen"),
        "Author should be extracted"
    );
    assert!(!metadata.keywords.is_empty(), "Keywords should be extracted");

    // Social metadata maps.
    assert!(!metadata.open_graph.is_empty(), "Article should have Open Graph tags");
    assert!(
        !metadata.twitter_card.is_empty(),
        "Article should have Twitter Card tags"
    );

    // Headings.
    assert!(metadata.headers.len() >= 5, "Should extract multiple heading levels");
    assert!(
        metadata.headers.iter().any(|h| h.level == 1),
        "Should have H1 (main title)"
    );
    assert!(
        metadata.headers.iter().any(|h| h.level == 2),
        "Should have H2 (sections)"
    );
    assert!(
        metadata.headers.iter().any(|h| h.level == 3),
        "Should have H3 (subsections)"
    );

    // Links.
    assert!(metadata.links.len() >= 3, "Should extract multiple links");
    assert!(
        metadata.links.iter().any(|l| l.link_type == LinkType::Internal),
        "Should have internal links"
    );
    assert!(
        metadata.links.iter().any(|l| l.link_type == LinkType::External),
        "Should have external links"
    );
    assert!(
        metadata
            .links
            .iter()
            .any(|l| l.link_type == LinkType::Email || l.link_type == LinkType::Phone),
        "Should have either email or phone links"
    );

    // Images. The hero image's alt text is "Rust web development", so the lookup below must
    // succeed. The previous fallback only repeated the non-empty check and verified nothing.
    assert!(!metadata.images.is_empty(), "Should extract images");
    let hero_image = metadata.images.iter().find(|img| {
        img.alt
            .as_ref()
            .is_some_and(|a| a.contains("Hero") || a.contains("development") || a.contains("hero"))
    });
    assert!(
        hero_image.is_some(),
        "Hero image should be identifiable by its alt text"
    );

    // Structured data.
    assert!(
        !metadata.structured_data.is_empty(),
        "Should extract structured data (JSON-LD)"
    );
    let json_ld = metadata
        .structured_data
        .iter()
        .find(|sd| sd.data_type == StructuredDataType::JsonLd)
        .expect("Should have JSON-LD structured data");
    assert_eq!(
        json_ld.schema_type.as_deref(),
        Some("BlogPosting"),
        "JSON-LD should identify as BlogPosting schema"
    );

    // Markdown output.
    assert!(!markdown.is_empty(), "Should generate Markdown from HTML");
    assert!(markdown.contains("Rust"), "Markdown should contain article content");
}
|
|
1495
|
+
|
|
1496
|
+
/// Performance smoke test for a multi-megabyte HTML document.
///
/// Builds a page with 10,000 `<article>` elements (asserted to exceed 1 MB),
/// runs it through `process_html`, and requires a successful, non-empty
/// Markdown result in under 30 seconds.
#[test]
fn test_large_html_performance() {
    // `write!` on a `String` needs the `fmt::Write` trait in scope.
    use std::fmt::Write as _;

    // Each generated article is roughly 500 bytes, so 10,000 of them come to
    // about 5 MB; reserve enough up front that the buffer need not regrow.
    let mut html = String::with_capacity(6_000_000);
    html.push_str(
        r#"<!DOCTYPE html>
<html>
<head>
<title>Large HTML Performance Test</title>
<meta name="description" content="Testing extraction performance on large documents">
</head>
<body>
<h1>Large Document Test</h1>"#,
    );

    for i in 0..10000 {
        // Format directly into the buffer instead of building a temporary
        // `String` per iteration with `format!`.
        write!(
            html,
            "<article><h2>Article {}</h2><p>Content block {} with expanded text content to increase document size. \
             This article contains multiple paragraphs describing various topics. \
             The goal is to create sufficient HTML content to test performance on large documents. \
             Here are some additional details: Section A covers fundamentals, Section B covers implementation, \
             and Section C covers optimization. Each section has multiple subsections.</p>\
             <p>Additional content paragraph {} to further expand the document.</p></article>\n",
            i, i, i
        )
        .expect("writing to a String cannot fail");
    }
    html.push_str("</body></html>");

    // Guard the premise of the test: the fixture really is "large".
    let html_size_bytes = html.len();
    assert!(
        html_size_bytes > 1_000_000,
        "Generated HTML should be >1MB (got {} bytes)",
        html_size_bytes
    );

    // Time only the extraction call, not fixture generation.
    let start = std::time::Instant::now();

    let result = process_html(&html, None, false, 1024 * 1024);

    let duration = start.elapsed();

    assert!(
        result.is_ok(),
        "Large HTML extraction should succeed. Error: {:?}",
        result.err()
    );

    let result = result.unwrap();
    assert!(!result.markdown.is_empty(), "Markdown should be generated");

    assert!(
        duration.as_secs() < 30,
        "Large HTML extraction took too long: {:.2}s (must be <30s)",
        duration.as_secs_f64()
    );
}
|
|
1554
|
+
|
|
1555
|
+
/// Checks how `process_html` treats documents on either side of 2 MiB.
///
/// Two padded documents are generated: one of 1,800,000 bytes plus closing
/// tags (below `2 * 1024 * 1024`) and one of 2,200,000 bytes plus closing tags
/// (above it). The smaller one must be accepted on every target. For the
/// larger one, `wasm32` builds must return an error mentioning "2MB" or
/// "WASM", while on other targets any error that does occur must not mention
/// either.
#[test]
fn test_wasm_size_limit_boundary() {
    /// Starts from `head`, appends `filler` until at least `target_len` bytes
    /// are present, cuts back to exactly `target_len`, then closes the body.
    fn padded_document(head: &str, filler: &str, target_len: usize) -> String {
        let mut doc = String::from(head);
        while doc.len() < target_len {
            doc.push_str(filler);
        }
        doc.truncate(target_len);
        doc.push_str("</body></html>");
        doc
    }

    let on_wasm = cfg!(target_arch = "wasm32");

    // --- Document below the limit -------------------------------------
    let html_under = padded_document(
        r#"<!DOCTYPE html>
<html>
<head><title>Just Under Limit</title></head>
<body><h1>Content</h1>"#,
        "<p>Padding content for size testing. This is test data to reach the target document size. Lorem ipsum dolor sit amet.</p>\n",
        1_800_000,
    );

    assert!(
        html_under.len() < 2 * 1024 * 1024,
        "HTML should be under 2MB limit (got {} bytes)",
        html_under.len()
    );

    let under_outcome = process_html(&html_under, None, false, 1024);
    let under_expectation = if on_wasm {
        "HTML under 2MB should be accepted in WASM"
    } else {
        "HTML under 2MB should always be accepted"
    };
    assert!(under_outcome.is_ok(), "{}", under_expectation);

    // --- Document above the limit -------------------------------------
    let html_over = padded_document(
        r#"<!DOCTYPE html>
<html>
<head><title>Over Limit</title></head>
<body><h1>Content</h1>"#,
        "<p>Oversized content for boundary testing. This section generates large HTML to exceed limits. Lorem ipsum dolor sit amet.</p>\n",
        2_200_000,
    );

    assert!(
        html_over.len() > 2 * 1024 * 1024,
        "HTML should be over 2MB limit (got {} bytes)",
        html_over.len()
    );

    let over_outcome = process_html(&html_over, None, false, 1024);
    if on_wasm {
        assert!(
            over_outcome.is_err(),
            "HTML over 2MB should be rejected in WASM with error"
        );
        let error_msg = format!("{:?}", over_outcome.err());
        assert!(
            error_msg.contains("2MB") || error_msg.contains("WASM"),
            "Error message should clearly indicate WASM size limit"
        );
    } else if let Err(native_err) = over_outcome {
        // Native targets may still fail for other reasons, but never
        // because of the WASM-specific size cap.
        let msg = format!("{:?}", native_err);
        assert!(
            !(msg.contains("WASM") || msg.contains("2MB")),
            "Native builds should not enforce WASM size limit"
        );
    }
}
|
|
1627
|
+
|
|
1628
|
+
/// Conversion must survive a syntactically invalid JSON-LD block.
///
/// The fixture's `application/ld+json` script is missing the comma after the
/// `author` field. The test requires that conversion still returns `Ok`, that
/// the Markdown is non-empty and contains the heading text, and, when
/// metadata is returned at all, that the `<title>` was still picked up.
#[test]
fn test_malformed_json_ld_graceful_handling() {
    const BROKEN_JSON_LD_PAGE: &str = r#"<!DOCTYPE html>
<html>
<head>
<title>Malformed JSON-LD Test</title>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "Test Article",
"author": "John Doe"
"datePublished": "2024-01-01"
}
</script>
</head>
<body>
<h1>Article Title</h1>
<p>This HTML contains invalid JSON-LD (missing comma after author field)</p>
</body>
</html>"#;

    let outcome = convert_html_to_markdown_with_metadata(BROKEN_JSON_LD_PAGE, None);
    assert!(
        outcome.is_ok(),
        "Malformed JSON-LD should not cause panic. Error: {:?}",
        outcome.err()
    );
    let (markdown, metadata) = outcome.unwrap();

    // The body content must come through regardless of the broken script.
    assert!(
        !markdown.is_empty(),
        "Markdown should be extracted despite invalid JSON-LD"
    );
    let mentions_heading = markdown.contains("Article Title") || markdown.contains("Article");
    assert!(mentions_heading, "Content should be properly converted to Markdown");

    // Metadata is optional here; only validate it when it was produced.
    let Some(meta) = metadata else {
        return;
    };
    assert_eq!(
        meta.title,
        Some("Malformed JSON-LD Test".to_string()),
        "Document metadata should be extracted from tags"
    );
}
|
|
1680
|
+
|
|
1681
|
+
/// Test handling of script-like content inside metadata fields.
///
/// The fixture embeds a `<script>` tag inside `<title>` next to ordinary
/// description, author and Open Graph meta tags. The test verifies that
/// conversion succeeds, that Markdown is produced and, when metadata is
/// returned, that the extracted fields are sensible.
///
/// Note: the actual sanitization is done by the html-to-markdown-rs library,
/// which may escape, strip, or preserve content depending on context, so the
/// title check accepts either the safe text or the preserved script text.
#[test]
fn test_metadata_xss_sanitization() {
    let html = r#"<!DOCTYPE html>
<html>
<head>
<title>Safe Title <script>alert('xss')</script></title>
<meta name="description" content="Description with encoded content">
<meta name="author" content="Author Name">
<meta property="og:title" content="OG Title">
<meta property="og:description" content="OG Description">
</head>
<body>
<h1>Title Section</h1>
<p>Content here</p>
</body>
</html>"#;

    let result = convert_html_to_markdown_with_metadata(html, None);
    assert!(
        result.is_ok(),
        "HTML with script-like content should not cause error. Error: {:?}",
        result.err()
    );

    let (markdown, metadata) = result.unwrap();

    assert!(!markdown.is_empty(), "Markdown should be generated");

    if let Some(meta) = metadata {
        if let Some(title) = &meta.title {
            assert!(!title.is_empty(), "Title should be extracted");
            assert!(
                title.contains("Safe") || title.contains("script"),
                "Title should extract content from title tag: {}",
                title
            );
        }

        if let Some(desc) = &meta.description {
            assert!(!desc.is_empty(), "Description should be extracted");
        }

        if let Some(author) = &meta.author {
            assert_eq!(author, "Author Name", "Author should be correctly extracted");
        }

        // Previously this was `if !is_empty() { assert!(len > 0) }`, which can
        // never fail. Assert directly so missing Open Graph data is reported.
        assert!(!meta.open_graph.is_empty(), "Open Graph tags should be extracted");
    }
}
|
|
1738
|
+
|
|
1739
|
+
/// Runs the same extraction from ten threads at once.
///
/// Every thread converts one shared HTML fixture and checks its own result:
/// conversion succeeds, Markdown is non-empty and, when metadata is present,
/// the title, headers, links, images and Open Graph data were all extracted.
/// The main thread then joins every worker and fails if any of them panicked.
#[test]
fn test_concurrent_html_extraction() {
    use std::sync::Arc;

    let shared_html = Arc::new(
        r#"<!DOCTYPE html>
<html lang="en">
<head>
<title>Concurrent Test Article</title>
<meta name="description" content="Testing concurrent extraction">
<meta name="author" content="Test Author">
<meta property="og:title" content="OG Title">
<meta property="og:description" content="OG Description">
<meta name="twitter:card" content="summary">
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "Concurrent Test",
"author": "Test Author"
}
</script>
</head>
<body>
<h1>Concurrent Extraction Test</h1>
<h2>Section 1</h2>
<p>Content 1</p>
<h2>Section 2</h2>
<p>Content 2</p>
<a href="https://example.com">External Link</a>
<a href="/about">Internal Link</a>
<img src="https://example.com/image.jpg" alt="Test Image">
</body>
</html>"#,
    );

    // Spawn all workers first so they actually overlap, then join below.
    let mut workers = Vec::with_capacity(10);
    for thread_id in 0..10 {
        let page = Arc::clone(&shared_html);
        workers.push(std::thread::spawn(move || {
            let outcome = convert_html_to_markdown_with_metadata(page.as_ref(), None);
            assert!(
                outcome.is_ok(),
                "Thread {} extraction failed: {:?}",
                thread_id,
                outcome.err()
            );
            let (markdown, metadata) = outcome.unwrap();

            assert!(
                !markdown.is_empty(),
                "Thread {} markdown should not be empty",
                thread_id
            );

            // Metadata is optional; without it there is nothing more to check.
            let Some(meta) = metadata else {
                return true;
            };

            assert_eq!(
                meta.title,
                Some("Concurrent Test Article".to_string()),
                "Thread {} should extract correct title",
                thread_id
            );
            assert!(!meta.headers.is_empty(), "Thread {} should extract headers", thread_id);
            assert!(!meta.links.is_empty(), "Thread {} should extract links", thread_id);
            assert!(!meta.images.is_empty(), "Thread {} should extract images", thread_id);
            assert!(
                !meta.open_graph.is_empty(),
                "Thread {} should extract OG metadata",
                thread_id
            );

            true
        }));
    }

    // A worker that panicked surfaces as `Err` from `join`.
    let all_succeeded = workers.into_iter().enumerate().all(|(i, worker)| {
        let joined = worker.join();
        assert!(joined.is_ok(), "Thread {} panicked: {:?}", i, joined.err());
        joined.unwrap()
    });

    assert!(all_succeeded, "All concurrent extraction threads should succeed");
}
|
|
1830
|
+
}
|