kreuzberg 4.0.0.pre.rc.8 → 4.0.0.pre.rc.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -14
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -1
- data/.rubocop.yml +538 -538
- data/Gemfile +8 -8
- data/Gemfile.lock +4 -104
- data/README.md +454 -432
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -341
- data/ext/kreuzberg_rb/extconf.rb +45 -45
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +6941 -6721
- data/ext/kreuzberg_rb/native/Cargo.toml +54 -54
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -15
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +3158 -3135
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +214 -182
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +81 -46
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +724 -724
- data/lib/kreuzberg/error_context.rb +80 -32
- data/lib/kreuzberg/errors.rb +118 -118
- data/lib/kreuzberg/extraction_api.rb +340 -85
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +279 -279
- data/lib/kreuzberg/setup_lib_path.rb +80 -80
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +109 -103
- data/lib/pdfium.dll +0 -0
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +546 -537
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -85
- data/spec/binding/cli_spec.rb +55 -55
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -41
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/Cargo.toml +45 -0
- data/vendor/kreuzberg/Cargo.toml +61 -38
- data/vendor/kreuzberg/README.md +230 -221
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -48
- data/vendor/kreuzberg/build.rs +843 -891
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
- data/vendor/kreuzberg/src/chunking/mod.rs +1877 -1877
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -220
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
- data/vendor/kreuzberg/src/core/config.rs +1080 -1080
- data/vendor/kreuzberg/src/core/extractor.rs +1156 -1156
- data/vendor/kreuzberg/src/core/io.rs +329 -329
- data/vendor/kreuzberg/src/core/mime.rs +605 -605
- data/vendor/kreuzberg/src/core/mod.rs +47 -47
- data/vendor/kreuzberg/src/core/pipeline.rs +1184 -1171
- data/vendor/kreuzberg/src/embeddings.rs +500 -432
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +398 -398
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +601 -569
- data/vendor/kreuzberg/src/extraction/image.rs +491 -491
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -562
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -213
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -81
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -284
- data/vendor/kreuzberg/src/extraction/pptx.rs +3100 -3100
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +447 -447
- data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +504 -504
- data/vendor/kreuzberg/src/extractors/docx.rs +400 -400
- data/vendor/kreuzberg/src/extractors/email.rs +157 -157
- data/vendor/kreuzberg/src/extractors/epub.rs +708 -708
- data/vendor/kreuzberg/src/extractors/excel.rs +345 -345
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -492
- data/vendor/kreuzberg/src/extractors/html.rs +407 -407
- data/vendor/kreuzberg/src/extractors/image.rs +219 -219
- data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +653 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +701 -701
- data/vendor/kreuzberg/src/extractors/mod.rs +429 -429
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +635 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -529
- data/vendor/kreuzberg/src/extractors/pdf.rs +749 -673
- data/vendor/kreuzberg/src/extractors/pptx.rs +267 -267
- data/vendor/kreuzberg/src/extractors/rst.rs +577 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +809 -809
- data/vendor/kreuzberg/src/extractors/security.rs +484 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
- data/vendor/kreuzberg/src/extractors/structured.rs +142 -142
- data/vendor/kreuzberg/src/extractors/text.rs +265 -265
- data/vendor/kreuzberg/src/extractors/typst.rs +651 -651
- data/vendor/kreuzberg/src/extractors/xml.rs +147 -147
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +275 -275
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +985 -985
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -219
- data/vendor/kreuzberg/src/lib.rs +113 -113
- data/vendor/kreuzberg/src/mcp/mod.rs +35 -35
- data/vendor/kreuzberg/src/mcp/server.rs +2076 -2076
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -863
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +452 -452
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -328
- data/vendor/kreuzberg/src/pdf/error.rs +130 -130
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +489 -489
- data/vendor/kreuzberg/src/pdf/mod.rs +68 -66
- data/vendor/kreuzberg/src/pdf/rendering.rs +368 -368
- data/vendor/kreuzberg/src/pdf/table.rs +420 -417
- data/vendor/kreuzberg/src/pdf/text.rs +240 -240
- data/vendor/kreuzberg/src/plugins/extractor.rs +1044 -1044
- data/vendor/kreuzberg/src/plugins/mod.rs +212 -212
- data/vendor/kreuzberg/src/plugins/ocr.rs +639 -639
- data/vendor/kreuzberg/src/plugins/processor.rs +650 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1339
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +967 -967
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +25 -25
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -219
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +1055 -1055
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +545 -545
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -556
- data/vendor/kreuzberg/tests/batch_processing.rs +318 -318
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
- data/vendor/kreuzberg/tests/concurrency_stress.rs +533 -533
- data/vendor/kreuzberg/tests/config_features.rs +612 -612
- data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -416
- data/vendor/kreuzberg/tests/core_integration.rs +510 -510
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -500
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
- data/vendor/kreuzberg/tests/email_integration.rs +327 -327
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
- data/vendor/kreuzberg/tests/error_handling.rs +402 -402
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
- data/vendor/kreuzberg/tests/format_integration.rs +164 -161
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
- data/vendor/kreuzberg/tests/image_integration.rs +255 -255
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
- data/vendor/kreuzberg/tests/mime_detection.rs +429 -429
- data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -514
- data/vendor/kreuzberg/tests/ocr_errors.rs +698 -698
- data/vendor/kreuzberg/tests/ocr_quality.rs +629 -629
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
- data/vendor/kreuzberg/tests/pdf_integration.rs +45 -45
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -374
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1436 -1436
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -776
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -560
- data/vendor/kreuzberg/tests/plugin_system.rs +927 -927
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -587
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -775
- data/vendor/kreuzberg/tests/security_validation.rs +416 -416
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +631 -631
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -648
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/build.rs +176 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
- data/vendor/kreuzberg-tesseract/LICENSE +22 -0
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1354 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- data/vendor/rb-sys/.cargo_vcs_info.json +5 -5
- data/vendor/rb-sys/Cargo.lock +393 -393
- data/vendor/rb-sys/Cargo.toml +70 -70
- data/vendor/rb-sys/Cargo.toml.orig +57 -57
- data/vendor/rb-sys/LICENSE-APACHE +190 -190
- data/vendor/rb-sys/LICENSE-MIT +21 -21
- data/vendor/rb-sys/build/features.rs +111 -111
- data/vendor/rb-sys/build/main.rs +286 -286
- data/vendor/rb-sys/build/stable_api_config.rs +155 -155
- data/vendor/rb-sys/build/version.rs +50 -50
- data/vendor/rb-sys/readme.md +36 -36
- data/vendor/rb-sys/src/bindings.rs +21 -21
- data/vendor/rb-sys/src/hidden.rs +11 -11
- data/vendor/rb-sys/src/lib.rs +35 -35
- data/vendor/rb-sys/src/macros.rs +371 -371
- data/vendor/rb-sys/src/memory.rs +53 -53
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -38
- data/vendor/rb-sys/src/special_consts.rs +31 -31
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -179
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -257
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +324 -324
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +332 -332
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +325 -325
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +323 -323
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +339 -339
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +339 -339
- data/vendor/rb-sys/src/stable_api.rs +260 -260
- data/vendor/rb-sys/src/symbol.rs +31 -31
- data/vendor/rb-sys/src/tracking_allocator.rs +330 -330
- data/vendor/rb-sys/src/utils.rs +89 -89
- data/vendor/rb-sys/src/value_type.rs +7 -7
- metadata +44 -81
- data/vendor/rb-sys/bin/release.sh +0 -21
|
@@ -1,327 +1,327 @@
|
|
|
1
|
-
//! Email extraction integration tests.
|
|
2
|
-
//!
|
|
3
|
-
//! Tests for .eml (RFC822) email extraction.
|
|
4
|
-
//! Validates metadata extraction, content extraction, HTML/plain text handling, and attachments.
|
|
5
|
-
|
|
6
|
-
#![cfg(feature = "email")]
|
|
7
|
-
|
|
8
|
-
use kreuzberg::core::config::ExtractionConfig;
|
|
9
|
-
use kreuzberg::core::extractor::extract_bytes;
|
|
10
|
-
|
|
11
|
-
mod helpers;
|
|
12
|
-
|
|
13
|
-
/// Test basic EML extraction with subject, from, to, and body.
|
|
14
|
-
#[tokio::test]
|
|
15
|
-
async fn test_eml_basic_extraction() {
|
|
16
|
-
let config = ExtractionConfig::default();
|
|
17
|
-
|
|
18
|
-
let eml_content = b"From: sender@example.com\r\n\
|
|
19
|
-
To: recipient@example.com\r\n\
|
|
20
|
-
Subject: Test Email Subject\r\n\
|
|
21
|
-
Date: Mon, 1 Jan 2024 12:00:00 +0000\r\n\
|
|
22
|
-
Message-ID: <unique123@example.com>\r\n\
|
|
23
|
-
\r\n\
|
|
24
|
-
This is the email body content.";
|
|
25
|
-
|
|
26
|
-
let result = extract_bytes(eml_content, "message/rfc822", &config)
|
|
27
|
-
.await
|
|
28
|
-
.expect("Should extract EML successfully");
|
|
29
|
-
|
|
30
|
-
assert_eq!(result.mime_type, "message/rfc822");
|
|
31
|
-
|
|
32
|
-
assert_eq!(result.metadata.subject, Some("Test Email Subject".to_string()));
|
|
33
|
-
|
|
34
|
-
assert!(result.metadata.format.is_some());
|
|
35
|
-
let email_meta = match result.metadata.format.as_ref().unwrap() {
|
|
36
|
-
kreuzberg::FormatMetadata::Email(meta) => meta,
|
|
37
|
-
_ => panic!("Expected Email metadata"),
|
|
38
|
-
};
|
|
39
|
-
|
|
40
|
-
assert_eq!(email_meta.from_email, Some("sender@example.com".to_string()));
|
|
41
|
-
|
|
42
|
-
assert_eq!(email_meta.to_emails, vec!["recipient@example.com".to_string()]);
|
|
43
|
-
assert!(email_meta.cc_emails.is_empty(), "CC should be empty");
|
|
44
|
-
assert!(email_meta.bcc_emails.is_empty(), "BCC should be empty");
|
|
45
|
-
|
|
46
|
-
assert!(email_meta.message_id.is_some());
|
|
47
|
-
let msg_id = email_meta.message_id.clone().unwrap();
|
|
48
|
-
assert!(
|
|
49
|
-
msg_id.contains("unique123@example.com"),
|
|
50
|
-
"Message ID should contain unique123@example.com"
|
|
51
|
-
);
|
|
52
|
-
|
|
53
|
-
assert!(email_meta.attachments.is_empty(), "Should have no attachments");
|
|
54
|
-
|
|
55
|
-
assert!(result.metadata.date.is_some());
|
|
56
|
-
|
|
57
|
-
assert!(result.content.contains("Subject: Test Email Subject"));
|
|
58
|
-
assert!(result.content.contains("From: sender@example.com"));
|
|
59
|
-
assert!(result.content.contains("To: recipient@example.com"));
|
|
60
|
-
assert!(result.content.contains("This is the email body content"));
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
/// Test EML with attachments - metadata extraction.
|
|
64
|
-
#[tokio::test]
|
|
65
|
-
async fn test_eml_with_attachments() {
|
|
66
|
-
let config = ExtractionConfig::default();
|
|
67
|
-
|
|
68
|
-
let eml_content = b"From: sender@example.com\r\n\
|
|
69
|
-
To: recipient@example.com\r\n\
|
|
70
|
-
Subject: Email with Attachment\r\n\
|
|
71
|
-
Content-Type: multipart/mixed; boundary=\"----boundary\"\r\n\
|
|
72
|
-
\r\n\
|
|
73
|
-
------boundary\r\n\
|
|
74
|
-
Content-Type: text/plain\r\n\
|
|
75
|
-
\r\n\
|
|
76
|
-
Email body text.\r\n\
|
|
77
|
-
------boundary\r\n\
|
|
78
|
-
Content-Type: text/plain; name=\"file.txt\"\r\n\
|
|
79
|
-
Content-Disposition: attachment; filename=\"file.txt\"\r\n\
|
|
80
|
-
\r\n\
|
|
81
|
-
Attachment content here.\r\n\
|
|
82
|
-
------boundary--\r\n";
|
|
83
|
-
|
|
84
|
-
let result = extract_bytes(eml_content, "message/rfc822", &config)
|
|
85
|
-
.await
|
|
86
|
-
.expect("Should extract EML with attachment");
|
|
87
|
-
|
|
88
|
-
assert!(result.metadata.format.is_some());
|
|
89
|
-
let email_meta = match result.metadata.format.as_ref().unwrap() {
|
|
90
|
-
kreuzberg::FormatMetadata::Email(meta) => meta,
|
|
91
|
-
_ => panic!("Expected Email metadata"),
|
|
92
|
-
};
|
|
93
|
-
|
|
94
|
-
if !email_meta.attachments.is_empty() {
|
|
95
|
-
assert!(result.content.contains("Attachments:"));
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
assert!(result.content.contains("Email body text") || result.content.contains("Attachment content"));
|
|
99
|
-
}
|
|
100
|
-
|
|
101
|
-
/// Test EML with HTML body.
|
|
102
|
-
#[tokio::test]
|
|
103
|
-
async fn test_eml_html_body() {
|
|
104
|
-
let config = ExtractionConfig::default();
|
|
105
|
-
|
|
106
|
-
let eml_content = b"From: sender@example.com\r\n\
|
|
107
|
-
To: recipient@example.com\r\n\
|
|
108
|
-
Subject: HTML Email\r\n\
|
|
109
|
-
Content-Type: text/html; charset=utf-8\r\n\
|
|
110
|
-
\r\n\
|
|
111
|
-
<html>\r\n\
|
|
112
|
-
<head><style>body { color: blue; }</style></head>\r\n\
|
|
113
|
-
<body>\r\n\
|
|
114
|
-
<h1>HTML Heading</h1>\r\n\
|
|
115
|
-
<p>This is <b>bold</b> text in HTML.</p>\r\n\
|
|
116
|
-
<script>alert('test');</script>\r\n\
|
|
117
|
-
</body>\r\n\
|
|
118
|
-
</html>";
|
|
119
|
-
|
|
120
|
-
let result = extract_bytes(eml_content, "message/rfc822", &config)
|
|
121
|
-
.await
|
|
122
|
-
.expect("Should extract HTML email");
|
|
123
|
-
|
|
124
|
-
assert!(!result.content.contains("<script>"));
|
|
125
|
-
assert!(!result.content.contains("<style>"));
|
|
126
|
-
|
|
127
|
-
assert!(result.content.contains("HTML Heading") || result.content.contains("bold"));
|
|
128
|
-
|
|
129
|
-
assert!(result.metadata.format.is_some());
|
|
130
|
-
let email_meta = match result.metadata.format.as_ref().unwrap() {
|
|
131
|
-
kreuzberg::FormatMetadata::Email(meta) => meta,
|
|
132
|
-
_ => panic!("Expected Email metadata"),
|
|
133
|
-
};
|
|
134
|
-
assert_eq!(email_meta.from_email, Some("sender@example.com".to_string()));
|
|
135
|
-
assert_eq!(email_meta.to_emails, vec!["recipient@example.com".to_string()]);
|
|
136
|
-
assert_eq!(result.metadata.subject, Some("HTML Email".to_string()));
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
/// Test EML with plain text body.
|
|
140
|
-
#[tokio::test]
|
|
141
|
-
async fn test_eml_plain_text_body() {
|
|
142
|
-
let config = ExtractionConfig::default();
|
|
143
|
-
|
|
144
|
-
let eml_content = b"From: sender@example.com\r\n\
|
|
145
|
-
To: recipient@example.com\r\n\
|
|
146
|
-
Subject: Plain Text Email\r\n\
|
|
147
|
-
Content-Type: text/plain; charset=utf-8\r\n\
|
|
148
|
-
\r\n\
|
|
149
|
-
This is a plain text email.\r\n\
|
|
150
|
-
It has multiple lines.\r\n\
|
|
151
|
-
And preserves formatting.";
|
|
152
|
-
|
|
153
|
-
let result = extract_bytes(eml_content, "message/rfc822", &config)
|
|
154
|
-
.await
|
|
155
|
-
.expect("Should extract plain text email");
|
|
156
|
-
|
|
157
|
-
assert!(result.content.contains("This is a plain text email"));
|
|
158
|
-
assert!(result.content.contains("multiple lines"));
|
|
159
|
-
assert!(result.content.contains("preserves formatting"));
|
|
160
|
-
|
|
161
|
-
assert!(result.metadata.format.is_some());
|
|
162
|
-
let email_meta = match result.metadata.format.as_ref().unwrap() {
|
|
163
|
-
kreuzberg::FormatMetadata::Email(meta) => meta,
|
|
164
|
-
_ => panic!("Expected Email metadata"),
|
|
165
|
-
};
|
|
166
|
-
assert_eq!(email_meta.from_email, Some("sender@example.com".to_string()));
|
|
167
|
-
assert_eq!(email_meta.to_emails, vec!["recipient@example.com".to_string()]);
|
|
168
|
-
assert_eq!(result.metadata.subject, Some("Plain Text Email".to_string()));
|
|
169
|
-
}
|
|
170
|
-
|
|
171
|
-
/// Test EML multipart (HTML + plain text).
|
|
172
|
-
#[tokio::test]
|
|
173
|
-
async fn test_eml_multipart() {
|
|
174
|
-
let config = ExtractionConfig::default();
|
|
175
|
-
|
|
176
|
-
let eml_content = b"From: sender@example.com\r\n\
|
|
177
|
-
To: recipient@example.com\r\n\
|
|
178
|
-
Subject: Multipart Email\r\n\
|
|
179
|
-
Content-Type: multipart/alternative; boundary=\"----boundary\"\r\n\
|
|
180
|
-
\r\n\
|
|
181
|
-
------boundary\r\n\
|
|
182
|
-
Content-Type: text/plain\r\n\
|
|
183
|
-
\r\n\
|
|
184
|
-
Plain text version of the email.\r\n\
|
|
185
|
-
------boundary\r\n\
|
|
186
|
-
Content-Type: text/html\r\n\
|
|
187
|
-
\r\n\
|
|
188
|
-
<html><body><p>HTML version of the email.</p></body></html>\r\n\
|
|
189
|
-
------boundary--\r\n";
|
|
190
|
-
|
|
191
|
-
let result = extract_bytes(eml_content, "message/rfc822", &config)
|
|
192
|
-
.await
|
|
193
|
-
.expect("Should extract multipart email");
|
|
194
|
-
|
|
195
|
-
assert!(
|
|
196
|
-
result.content.contains("Plain text version") || result.content.contains("HTML version"),
|
|
197
|
-
"Should extract either plain text or HTML content"
|
|
198
|
-
);
|
|
199
|
-
|
|
200
|
-
assert!(result.metadata.format.is_some());
|
|
201
|
-
let email_meta = match result.metadata.format.as_ref().unwrap() {
|
|
202
|
-
kreuzberg::FormatMetadata::Email(meta) => meta,
|
|
203
|
-
_ => panic!("Expected Email metadata"),
|
|
204
|
-
};
|
|
205
|
-
assert_eq!(email_meta.from_email, Some("sender@example.com".to_string()));
|
|
206
|
-
assert_eq!(email_meta.to_emails, vec!["recipient@example.com".to_string()]);
|
|
207
|
-
assert_eq!(result.metadata.subject, Some("Multipart Email".to_string()));
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
/// Test MSG file extraction (Outlook format).
|
|
211
|
-
///
|
|
212
|
-
/// Note: Creating valid MSG files programmatically is complex.
|
|
213
|
-
/// This test verifies error handling for invalid MSG format.
|
|
214
|
-
#[tokio::test]
|
|
215
|
-
async fn test_msg_file_extraction() {
|
|
216
|
-
let config = ExtractionConfig::default();
|
|
217
|
-
|
|
218
|
-
let invalid_msg = b"This is not a valid MSG file";
|
|
219
|
-
|
|
220
|
-
let result = extract_bytes(invalid_msg, "application/vnd.ms-outlook", &config).await;
|
|
221
|
-
|
|
222
|
-
assert!(result.is_err(), "Invalid MSG should fail gracefully");
|
|
223
|
-
}
|
|
224
|
-
|
|
225
|
-
/// Test email thread with quoted replies.
|
|
226
|
-
#[tokio::test]
|
|
227
|
-
async fn test_email_thread() {
|
|
228
|
-
let config = ExtractionConfig::default();
|
|
229
|
-
|
|
230
|
-
let eml_content = b"From: person2@example.com\r\n\
|
|
231
|
-
To: person1@example.com\r\n\
|
|
232
|
-
Subject: Re: Original Subject\r\n\
|
|
233
|
-
In-Reply-To: <original@example.com>\r\n\
|
|
234
|
-
\r\n\
|
|
235
|
-
This is my reply.\r\n\
|
|
236
|
-
\r\n\
|
|
237
|
-
On Mon, 1 Jan 2024, person1@example.com wrote:\r\n\
|
|
238
|
-
> Original message text here.\r\n\
|
|
239
|
-
> This was the first message.";
|
|
240
|
-
|
|
241
|
-
let result = extract_bytes(eml_content, "message/rfc822", &config)
|
|
242
|
-
.await
|
|
243
|
-
.expect("Should extract email thread");
|
|
244
|
-
|
|
245
|
-
assert!(result.content.contains("This is my reply"));
|
|
246
|
-
|
|
247
|
-
assert!(result.content.contains("Original message text") || result.content.contains(">"));
|
|
248
|
-
}
|
|
249
|
-
|
|
250
|
-
/// Test email with various encodings (UTF-8, quoted-printable).
|
|
251
|
-
#[tokio::test]
|
|
252
|
-
async fn test_email_encodings() {
|
|
253
|
-
let config = ExtractionConfig::default();
|
|
254
|
-
|
|
255
|
-
let eml_content = "From: sender@example.com\r\n\
|
|
256
|
-
To: recipient@example.com\r\n\
|
|
257
|
-
Subject: Email with Unicode: 你好世界 🌍\r\n\
|
|
258
|
-
Content-Type: text/plain; charset=utf-8\r\n\
|
|
259
|
-
\r\n\
|
|
260
|
-
Email body with special chars: café, naïve, résumé.\r\n\
|
|
261
|
-
Emoji: 🎉 🚀 ✅"
|
|
262
|
-
.as_bytes();
|
|
263
|
-
|
|
264
|
-
let result = extract_bytes(eml_content, "message/rfc822", &config)
|
|
265
|
-
.await
|
|
266
|
-
.expect("Should extract UTF-8 email");
|
|
267
|
-
|
|
268
|
-
assert!(result.content.contains("café") || result.content.contains("naive") || !result.content.is_empty());
|
|
269
|
-
|
|
270
|
-
if let Some(subject) = result.metadata.subject {
|
|
271
|
-
assert!(subject.contains("Unicode") || subject.contains("Email"));
|
|
272
|
-
}
|
|
273
|
-
}
|
|
274
|
-
|
|
275
|
-
/// Test email with multiple recipients (To, CC, BCC).
|
|
276
|
-
#[tokio::test]
|
|
277
|
-
async fn test_email_large_attachments() {
|
|
278
|
-
let config = ExtractionConfig::default();
|
|
279
|
-
|
|
280
|
-
let eml_content = b"From: sender@example.com\r\n\
|
|
281
|
-
To: r1@example.com, r2@example.com, r3@example.com\r\n\
|
|
282
|
-
Cc: cc1@example.com, cc2@example.com\r\n\
|
|
283
|
-
Bcc: bcc@example.com\r\n\
|
|
284
|
-
Subject: Multiple Recipients\r\n\
|
|
285
|
-
\r\n\
|
|
286
|
-
Email to multiple recipients.";
|
|
287
|
-
|
|
288
|
-
let result = extract_bytes(eml_content, "message/rfc822", &config)
|
|
289
|
-
.await
|
|
290
|
-
.expect("Should extract email with multiple recipients");
|
|
291
|
-
|
|
292
|
-
assert!(result.metadata.format.is_some());
|
|
293
|
-
let email_meta = match result.metadata.format.as_ref().unwrap() {
|
|
294
|
-
kreuzberg::FormatMetadata::Email(meta) => meta,
|
|
295
|
-
_ => panic!("Expected Email metadata"),
|
|
296
|
-
};
|
|
297
|
-
|
|
298
|
-
assert_eq!(email_meta.from_email, Some("sender@example.com".to_string()));
|
|
299
|
-
|
|
300
|
-
assert_eq!(email_meta.to_emails.len(), 3, "Should have 3 To recipients");
|
|
301
|
-
assert!(email_meta.to_emails.contains(&"r1@example.com".to_string()));
|
|
302
|
-
assert!(email_meta.to_emails.contains(&"r2@example.com".to_string()));
|
|
303
|
-
assert!(email_meta.to_emails.contains(&"r3@example.com".to_string()));
|
|
304
|
-
|
|
305
|
-
assert_eq!(email_meta.cc_emails.len(), 2, "Should have 2 CC recipients");
|
|
306
|
-
assert!(email_meta.cc_emails.contains(&"cc1@example.com".to_string()));
|
|
307
|
-
assert!(email_meta.cc_emails.contains(&"cc2@example.com".to_string()));
|
|
308
|
-
|
|
309
|
-
assert_eq!(result.metadata.subject, Some("Multiple Recipients".to_string()));
|
|
310
|
-
|
|
311
|
-
assert!(email_meta.attachments.is_empty(), "Should have no attachments");
|
|
312
|
-
}
|
|
313
|
-
|
|
314
|
-
/// Test malformed email structure.
|
|
315
|
-
#[tokio::test]
|
|
316
|
-
async fn test_malformed_email() {
|
|
317
|
-
let config = ExtractionConfig::default();
|
|
318
|
-
|
|
319
|
-
let malformed_eml = b"This is not a valid email at all.";
|
|
320
|
-
|
|
321
|
-
let result = extract_bytes(malformed_eml, "message/rfc822", &config).await;
|
|
322
|
-
|
|
323
|
-
assert!(
|
|
324
|
-
result.is_ok() || result.is_err(),
|
|
325
|
-
"Should handle malformed email gracefully"
|
|
326
|
-
);
|
|
327
|
-
}
|
|
1
|
+
//! Email extraction integration tests.
|
|
2
|
+
//!
|
|
3
|
+
//! Tests for .eml (RFC822) email extraction.
|
|
4
|
+
//! Validates metadata extraction, content extraction, HTML/plain text handling, and attachments.
|
|
5
|
+
|
|
6
|
+
#![cfg(feature = "email")]
|
|
7
|
+
|
|
8
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
9
|
+
use kreuzberg::core::extractor::extract_bytes;
|
|
10
|
+
|
|
11
|
+
mod helpers;
|
|
12
|
+
|
|
13
|
+
/// Test basic EML extraction with subject, from, to, and body.
|
|
14
|
+
#[tokio::test]
|
|
15
|
+
async fn test_eml_basic_extraction() {
|
|
16
|
+
let config = ExtractionConfig::default();
|
|
17
|
+
|
|
18
|
+
let eml_content = b"From: sender@example.com\r\n\
|
|
19
|
+
To: recipient@example.com\r\n\
|
|
20
|
+
Subject: Test Email Subject\r\n\
|
|
21
|
+
Date: Mon, 1 Jan 2024 12:00:00 +0000\r\n\
|
|
22
|
+
Message-ID: <unique123@example.com>\r\n\
|
|
23
|
+
\r\n\
|
|
24
|
+
This is the email body content.";
|
|
25
|
+
|
|
26
|
+
let result = extract_bytes(eml_content, "message/rfc822", &config)
|
|
27
|
+
.await
|
|
28
|
+
.expect("Should extract EML successfully");
|
|
29
|
+
|
|
30
|
+
assert_eq!(result.mime_type, "message/rfc822");
|
|
31
|
+
|
|
32
|
+
assert_eq!(result.metadata.subject, Some("Test Email Subject".to_string()));
|
|
33
|
+
|
|
34
|
+
assert!(result.metadata.format.is_some());
|
|
35
|
+
let email_meta = match result.metadata.format.as_ref().unwrap() {
|
|
36
|
+
kreuzberg::FormatMetadata::Email(meta) => meta,
|
|
37
|
+
_ => panic!("Expected Email metadata"),
|
|
38
|
+
};
|
|
39
|
+
|
|
40
|
+
assert_eq!(email_meta.from_email, Some("sender@example.com".to_string()));
|
|
41
|
+
|
|
42
|
+
assert_eq!(email_meta.to_emails, vec!["recipient@example.com".to_string()]);
|
|
43
|
+
assert!(email_meta.cc_emails.is_empty(), "CC should be empty");
|
|
44
|
+
assert!(email_meta.bcc_emails.is_empty(), "BCC should be empty");
|
|
45
|
+
|
|
46
|
+
assert!(email_meta.message_id.is_some());
|
|
47
|
+
let msg_id = email_meta.message_id.clone().unwrap();
|
|
48
|
+
assert!(
|
|
49
|
+
msg_id.contains("unique123@example.com"),
|
|
50
|
+
"Message ID should contain unique123@example.com"
|
|
51
|
+
);
|
|
52
|
+
|
|
53
|
+
assert!(email_meta.attachments.is_empty(), "Should have no attachments");
|
|
54
|
+
|
|
55
|
+
assert!(result.metadata.date.is_some());
|
|
56
|
+
|
|
57
|
+
assert!(result.content.contains("Subject: Test Email Subject"));
|
|
58
|
+
assert!(result.content.contains("From: sender@example.com"));
|
|
59
|
+
assert!(result.content.contains("To: recipient@example.com"));
|
|
60
|
+
assert!(result.content.contains("This is the email body content"));
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
/// Test EML with attachments - metadata extraction.
|
|
64
|
+
#[tokio::test]
|
|
65
|
+
async fn test_eml_with_attachments() {
|
|
66
|
+
let config = ExtractionConfig::default();
|
|
67
|
+
|
|
68
|
+
let eml_content = b"From: sender@example.com\r\n\
|
|
69
|
+
To: recipient@example.com\r\n\
|
|
70
|
+
Subject: Email with Attachment\r\n\
|
|
71
|
+
Content-Type: multipart/mixed; boundary=\"----boundary\"\r\n\
|
|
72
|
+
\r\n\
|
|
73
|
+
------boundary\r\n\
|
|
74
|
+
Content-Type: text/plain\r\n\
|
|
75
|
+
\r\n\
|
|
76
|
+
Email body text.\r\n\
|
|
77
|
+
------boundary\r\n\
|
|
78
|
+
Content-Type: text/plain; name=\"file.txt\"\r\n\
|
|
79
|
+
Content-Disposition: attachment; filename=\"file.txt\"\r\n\
|
|
80
|
+
\r\n\
|
|
81
|
+
Attachment content here.\r\n\
|
|
82
|
+
------boundary--\r\n";
|
|
83
|
+
|
|
84
|
+
let result = extract_bytes(eml_content, "message/rfc822", &config)
|
|
85
|
+
.await
|
|
86
|
+
.expect("Should extract EML with attachment");
|
|
87
|
+
|
|
88
|
+
assert!(result.metadata.format.is_some());
|
|
89
|
+
let email_meta = match result.metadata.format.as_ref().unwrap() {
|
|
90
|
+
kreuzberg::FormatMetadata::Email(meta) => meta,
|
|
91
|
+
_ => panic!("Expected Email metadata"),
|
|
92
|
+
};
|
|
93
|
+
|
|
94
|
+
if !email_meta.attachments.is_empty() {
|
|
95
|
+
assert!(result.content.contains("Attachments:"));
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
assert!(result.content.contains("Email body text") || result.content.contains("Attachment content"));
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
/// Test EML with HTML body.
|
|
102
|
+
#[tokio::test]
|
|
103
|
+
async fn test_eml_html_body() {
|
|
104
|
+
let config = ExtractionConfig::default();
|
|
105
|
+
|
|
106
|
+
let eml_content = b"From: sender@example.com\r\n\
|
|
107
|
+
To: recipient@example.com\r\n\
|
|
108
|
+
Subject: HTML Email\r\n\
|
|
109
|
+
Content-Type: text/html; charset=utf-8\r\n\
|
|
110
|
+
\r\n\
|
|
111
|
+
<html>\r\n\
|
|
112
|
+
<head><style>body { color: blue; }</style></head>\r\n\
|
|
113
|
+
<body>\r\n\
|
|
114
|
+
<h1>HTML Heading</h1>\r\n\
|
|
115
|
+
<p>This is <b>bold</b> text in HTML.</p>\r\n\
|
|
116
|
+
<script>alert('test');</script>\r\n\
|
|
117
|
+
</body>\r\n\
|
|
118
|
+
</html>";
|
|
119
|
+
|
|
120
|
+
let result = extract_bytes(eml_content, "message/rfc822", &config)
|
|
121
|
+
.await
|
|
122
|
+
.expect("Should extract HTML email");
|
|
123
|
+
|
|
124
|
+
assert!(!result.content.contains("<script>"));
|
|
125
|
+
assert!(!result.content.contains("<style>"));
|
|
126
|
+
|
|
127
|
+
assert!(result.content.contains("HTML Heading") || result.content.contains("bold"));
|
|
128
|
+
|
|
129
|
+
assert!(result.metadata.format.is_some());
|
|
130
|
+
let email_meta = match result.metadata.format.as_ref().unwrap() {
|
|
131
|
+
kreuzberg::FormatMetadata::Email(meta) => meta,
|
|
132
|
+
_ => panic!("Expected Email metadata"),
|
|
133
|
+
};
|
|
134
|
+
assert_eq!(email_meta.from_email, Some("sender@example.com".to_string()));
|
|
135
|
+
assert_eq!(email_meta.to_emails, vec!["recipient@example.com".to_string()]);
|
|
136
|
+
assert_eq!(result.metadata.subject, Some("HTML Email".to_string()));
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
/// Test EML with plain text body.
|
|
140
|
+
#[tokio::test]
|
|
141
|
+
async fn test_eml_plain_text_body() {
|
|
142
|
+
let config = ExtractionConfig::default();
|
|
143
|
+
|
|
144
|
+
let eml_content = b"From: sender@example.com\r\n\
|
|
145
|
+
To: recipient@example.com\r\n\
|
|
146
|
+
Subject: Plain Text Email\r\n\
|
|
147
|
+
Content-Type: text/plain; charset=utf-8\r\n\
|
|
148
|
+
\r\n\
|
|
149
|
+
This is a plain text email.\r\n\
|
|
150
|
+
It has multiple lines.\r\n\
|
|
151
|
+
And preserves formatting.";
|
|
152
|
+
|
|
153
|
+
let result = extract_bytes(eml_content, "message/rfc822", &config)
|
|
154
|
+
.await
|
|
155
|
+
.expect("Should extract plain text email");
|
|
156
|
+
|
|
157
|
+
assert!(result.content.contains("This is a plain text email"));
|
|
158
|
+
assert!(result.content.contains("multiple lines"));
|
|
159
|
+
assert!(result.content.contains("preserves formatting"));
|
|
160
|
+
|
|
161
|
+
assert!(result.metadata.format.is_some());
|
|
162
|
+
let email_meta = match result.metadata.format.as_ref().unwrap() {
|
|
163
|
+
kreuzberg::FormatMetadata::Email(meta) => meta,
|
|
164
|
+
_ => panic!("Expected Email metadata"),
|
|
165
|
+
};
|
|
166
|
+
assert_eq!(email_meta.from_email, Some("sender@example.com".to_string()));
|
|
167
|
+
assert_eq!(email_meta.to_emails, vec!["recipient@example.com".to_string()]);
|
|
168
|
+
assert_eq!(result.metadata.subject, Some("Plain Text Email".to_string()));
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
/// Test EML multipart (HTML + plain text).
|
|
172
|
+
#[tokio::test]
|
|
173
|
+
async fn test_eml_multipart() {
|
|
174
|
+
let config = ExtractionConfig::default();
|
|
175
|
+
|
|
176
|
+
let eml_content = b"From: sender@example.com\r\n\
|
|
177
|
+
To: recipient@example.com\r\n\
|
|
178
|
+
Subject: Multipart Email\r\n\
|
|
179
|
+
Content-Type: multipart/alternative; boundary=\"----boundary\"\r\n\
|
|
180
|
+
\r\n\
|
|
181
|
+
------boundary\r\n\
|
|
182
|
+
Content-Type: text/plain\r\n\
|
|
183
|
+
\r\n\
|
|
184
|
+
Plain text version of the email.\r\n\
|
|
185
|
+
------boundary\r\n\
|
|
186
|
+
Content-Type: text/html\r\n\
|
|
187
|
+
\r\n\
|
|
188
|
+
<html><body><p>HTML version of the email.</p></body></html>\r\n\
|
|
189
|
+
------boundary--\r\n";
|
|
190
|
+
|
|
191
|
+
let result = extract_bytes(eml_content, "message/rfc822", &config)
|
|
192
|
+
.await
|
|
193
|
+
.expect("Should extract multipart email");
|
|
194
|
+
|
|
195
|
+
assert!(
|
|
196
|
+
result.content.contains("Plain text version") || result.content.contains("HTML version"),
|
|
197
|
+
"Should extract either plain text or HTML content"
|
|
198
|
+
);
|
|
199
|
+
|
|
200
|
+
assert!(result.metadata.format.is_some());
|
|
201
|
+
let email_meta = match result.metadata.format.as_ref().unwrap() {
|
|
202
|
+
kreuzberg::FormatMetadata::Email(meta) => meta,
|
|
203
|
+
_ => panic!("Expected Email metadata"),
|
|
204
|
+
};
|
|
205
|
+
assert_eq!(email_meta.from_email, Some("sender@example.com".to_string()));
|
|
206
|
+
assert_eq!(email_meta.to_emails, vec!["recipient@example.com".to_string()]);
|
|
207
|
+
assert_eq!(result.metadata.subject, Some("Multipart Email".to_string()));
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
/// Test MSG file extraction (Outlook format).
|
|
211
|
+
///
|
|
212
|
+
/// Note: Creating valid MSG files programmatically is complex.
|
|
213
|
+
/// This test verifies error handling for invalid MSG format.
|
|
214
|
+
#[tokio::test]
|
|
215
|
+
async fn test_msg_file_extraction() {
|
|
216
|
+
let config = ExtractionConfig::default();
|
|
217
|
+
|
|
218
|
+
let invalid_msg = b"This is not a valid MSG file";
|
|
219
|
+
|
|
220
|
+
let result = extract_bytes(invalid_msg, "application/vnd.ms-outlook", &config).await;
|
|
221
|
+
|
|
222
|
+
assert!(result.is_err(), "Invalid MSG should fail gracefully");
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
/// Test email thread with quoted replies.
|
|
226
|
+
#[tokio::test]
|
|
227
|
+
async fn test_email_thread() {
|
|
228
|
+
let config = ExtractionConfig::default();
|
|
229
|
+
|
|
230
|
+
let eml_content = b"From: person2@example.com\r\n\
|
|
231
|
+
To: person1@example.com\r\n\
|
|
232
|
+
Subject: Re: Original Subject\r\n\
|
|
233
|
+
In-Reply-To: <original@example.com>\r\n\
|
|
234
|
+
\r\n\
|
|
235
|
+
This is my reply.\r\n\
|
|
236
|
+
\r\n\
|
|
237
|
+
On Mon, 1 Jan 2024, person1@example.com wrote:\r\n\
|
|
238
|
+
> Original message text here.\r\n\
|
|
239
|
+
> This was the first message.";
|
|
240
|
+
|
|
241
|
+
let result = extract_bytes(eml_content, "message/rfc822", &config)
|
|
242
|
+
.await
|
|
243
|
+
.expect("Should extract email thread");
|
|
244
|
+
|
|
245
|
+
assert!(result.content.contains("This is my reply"));
|
|
246
|
+
|
|
247
|
+
assert!(result.content.contains("Original message text") || result.content.contains(">"));
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
/// Test email with various encodings (UTF-8, quoted-printable).
|
|
251
|
+
#[tokio::test]
|
|
252
|
+
async fn test_email_encodings() {
|
|
253
|
+
let config = ExtractionConfig::default();
|
|
254
|
+
|
|
255
|
+
let eml_content = "From: sender@example.com\r\n\
|
|
256
|
+
To: recipient@example.com\r\n\
|
|
257
|
+
Subject: Email with Unicode: 你好世界 🌍\r\n\
|
|
258
|
+
Content-Type: text/plain; charset=utf-8\r\n\
|
|
259
|
+
\r\n\
|
|
260
|
+
Email body with special chars: café, naïve, résumé.\r\n\
|
|
261
|
+
Emoji: 🎉 🚀 ✅"
|
|
262
|
+
.as_bytes();
|
|
263
|
+
|
|
264
|
+
let result = extract_bytes(eml_content, "message/rfc822", &config)
|
|
265
|
+
.await
|
|
266
|
+
.expect("Should extract UTF-8 email");
|
|
267
|
+
|
|
268
|
+
assert!(result.content.contains("café") || result.content.contains("naive") || !result.content.is_empty());
|
|
269
|
+
|
|
270
|
+
if let Some(subject) = result.metadata.subject {
|
|
271
|
+
assert!(subject.contains("Unicode") || subject.contains("Email"));
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
/// Test email with multiple recipients (To, CC, BCC).
|
|
276
|
+
#[tokio::test]
|
|
277
|
+
async fn test_email_large_attachments() {
|
|
278
|
+
let config = ExtractionConfig::default();
|
|
279
|
+
|
|
280
|
+
let eml_content = b"From: sender@example.com\r\n\
|
|
281
|
+
To: r1@example.com, r2@example.com, r3@example.com\r\n\
|
|
282
|
+
Cc: cc1@example.com, cc2@example.com\r\n\
|
|
283
|
+
Bcc: bcc@example.com\r\n\
|
|
284
|
+
Subject: Multiple Recipients\r\n\
|
|
285
|
+
\r\n\
|
|
286
|
+
Email to multiple recipients.";
|
|
287
|
+
|
|
288
|
+
let result = extract_bytes(eml_content, "message/rfc822", &config)
|
|
289
|
+
.await
|
|
290
|
+
.expect("Should extract email with multiple recipients");
|
|
291
|
+
|
|
292
|
+
assert!(result.metadata.format.is_some());
|
|
293
|
+
let email_meta = match result.metadata.format.as_ref().unwrap() {
|
|
294
|
+
kreuzberg::FormatMetadata::Email(meta) => meta,
|
|
295
|
+
_ => panic!("Expected Email metadata"),
|
|
296
|
+
};
|
|
297
|
+
|
|
298
|
+
assert_eq!(email_meta.from_email, Some("sender@example.com".to_string()));
|
|
299
|
+
|
|
300
|
+
assert_eq!(email_meta.to_emails.len(), 3, "Should have 3 To recipients");
|
|
301
|
+
assert!(email_meta.to_emails.contains(&"r1@example.com".to_string()));
|
|
302
|
+
assert!(email_meta.to_emails.contains(&"r2@example.com".to_string()));
|
|
303
|
+
assert!(email_meta.to_emails.contains(&"r3@example.com".to_string()));
|
|
304
|
+
|
|
305
|
+
assert_eq!(email_meta.cc_emails.len(), 2, "Should have 2 CC recipients");
|
|
306
|
+
assert!(email_meta.cc_emails.contains(&"cc1@example.com".to_string()));
|
|
307
|
+
assert!(email_meta.cc_emails.contains(&"cc2@example.com".to_string()));
|
|
308
|
+
|
|
309
|
+
assert_eq!(result.metadata.subject, Some("Multiple Recipients".to_string()));
|
|
310
|
+
|
|
311
|
+
assert!(email_meta.attachments.is_empty(), "Should have no attachments");
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
/// Test malformed email structure.
|
|
315
|
+
#[tokio::test]
|
|
316
|
+
async fn test_malformed_email() {
|
|
317
|
+
let config = ExtractionConfig::default();
|
|
318
|
+
|
|
319
|
+
let malformed_eml = b"This is not a valid email at all.";
|
|
320
|
+
|
|
321
|
+
let result = extract_bytes(malformed_eml, "message/rfc822", &config).await;
|
|
322
|
+
|
|
323
|
+
assert!(
|
|
324
|
+
result.is_ok() || result.is_err(),
|
|
325
|
+
"Should handle malformed email gracefully"
|
|
326
|
+
);
|
|
327
|
+
}
|