kreuzberg 4.0.0.rc1 → 4.0.0.rc2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -8
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -534
- data/.rubocop.yml +538 -0
- data/Gemfile +8 -9
- data/Gemfile.lock +9 -109
- data/README.md +426 -421
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -340
- data/ext/kreuzberg_rb/extconf.rb +45 -35
- data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +44 -36
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -17
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +2998 -2939
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +148 -105
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +46 -45
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +691 -684
- data/lib/kreuzberg/error_context.rb +32 -0
- data/lib/kreuzberg/errors.rb +118 -50
- data/lib/kreuzberg/extraction_api.rb +85 -84
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +216 -216
- data/lib/kreuzberg/setup_lib_path.rb +80 -79
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +103 -82
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +520 -468
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -87
- data/spec/binding/cli_spec.rb +55 -54
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -42
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/kreuzberg/Cargo.toml +204 -134
- data/vendor/kreuzberg/README.md +175 -175
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
- data/vendor/kreuzberg/build.rs +474 -460
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1143
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -677
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -35
- data/vendor/kreuzberg/src/core/config.rs +1032 -1032
- data/vendor/kreuzberg/src/core/extractor.rs +1024 -903
- data/vendor/kreuzberg/src/core/io.rs +329 -327
- data/vendor/kreuzberg/src/core/mime.rs +605 -615
- data/vendor/kreuzberg/src/core/mod.rs +45 -42
- data/vendor/kreuzberg/src/core/pipeline.rs +984 -906
- data/vendor/kreuzberg/src/embeddings.rs +432 -323
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -40
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +553 -553
- data/vendor/kreuzberg/src/extraction/image.rs +368 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -564
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -77
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -128
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +446 -425
- data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +367 -479
- data/vendor/kreuzberg/src/extractors/email.rs +143 -129
- data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +343 -344
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
- data/vendor/kreuzberg/src/extractors/html.rs +393 -410
- data/vendor/kreuzberg/src/extractors/image.rs +198 -195
- data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +365 -268
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
- data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +493 -496
- data/vendor/kreuzberg/src/extractors/pptx.rs +248 -234
- data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
- data/vendor/kreuzberg/src/extractors/security.rs +484 -0
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +140 -126
- data/vendor/kreuzberg/src/extractors/text.rs +260 -242
- data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +135 -128
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -294
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -942
- data/vendor/kreuzberg/src/lib.rs +105 -102
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -32
- data/vendor/kreuzberg/src/mcp/server.rs +1968 -1966
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -847
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -122
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +393 -420
- data/vendor/kreuzberg/src/pdf/text.rs +158 -161
- data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -1010
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +620 -629
- data/vendor/kreuzberg/src/plugins/processor.rs +642 -641
- data/vendor/kreuzberg/src/plugins/registry.rs +1337 -1324
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +956 -955
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +19 -19
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +903 -873
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -542
- data/vendor/kreuzberg/tests/batch_processing.rs +316 -304
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -509
- data/vendor/kreuzberg/tests/config_features.rs +598 -580
- data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -439
- data/vendor/kreuzberg/tests/core_integration.rs +510 -493
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -424
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -124
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -676
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -43
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -1412
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -561
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -607
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
- data/vendor/kreuzberg/tests/security_validation.rs +415 -404
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/rb-sys/.cargo-ok +1 -0
- data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
- data/vendor/rb-sys/Cargo.lock +393 -0
- data/vendor/rb-sys/Cargo.toml +70 -0
- data/vendor/rb-sys/Cargo.toml.orig +57 -0
- data/vendor/rb-sys/LICENSE-APACHE +190 -0
- data/vendor/rb-sys/LICENSE-MIT +21 -0
- data/vendor/rb-sys/bin/release.sh +21 -0
- data/vendor/rb-sys/build/features.rs +108 -0
- data/vendor/rb-sys/build/main.rs +246 -0
- data/vendor/rb-sys/build/stable_api_config.rs +153 -0
- data/vendor/rb-sys/build/version.rs +48 -0
- data/vendor/rb-sys/readme.md +36 -0
- data/vendor/rb-sys/src/bindings.rs +21 -0
- data/vendor/rb-sys/src/hidden.rs +11 -0
- data/vendor/rb-sys/src/lib.rs +34 -0
- data/vendor/rb-sys/src/macros.rs +371 -0
- data/vendor/rb-sys/src/memory.rs +53 -0
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
- data/vendor/rb-sys/src/special_consts.rs +31 -0
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
- data/vendor/rb-sys/src/stable_api.rs +261 -0
- data/vendor/rb-sys/src/symbol.rs +31 -0
- data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
- data/vendor/rb-sys/src/utils.rs +89 -0
- data/vendor/rb-sys/src/value_type.rs +7 -0
- metadata +90 -95
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/spec/examples.txt +0 -104
- data/vendor/kreuzberg/src/bin/profile_extract.rs +0 -455
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +0 -275
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +0 -178
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +0 -491
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +0 -496
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +0 -1188
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +0 -162
- data/vendor/kreuzberg/src/extractors/pandoc.rs +0 -201
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +0 -92
- data/vendor/kreuzberg/tests/pandoc_integration.rs +0 -503
|
@@ -1,428 +1,428 @@
|
|
|
1
|
-
//! MIME type detection integration tests.
|
|
2
|
-
//!
|
|
3
|
-
//! Tests for MIME type detection from file extensions and content.
|
|
4
|
-
//! Validates detection accuracy, mismatch handling, and error cases.
|
|
5
|
-
|
|
6
|
-
use kreuzberg::core::mime::{detect_mime_type, validate_mime_type};
|
|
7
|
-
use std::io::Write;
|
|
8
|
-
use tempfile::NamedTempFile;
|
|
9
|
-
|
|
10
|
-
mod helpers;
|
|
11
|
-
|
|
12
|
-
/// Test MIME detection by file extension.
|
|
13
|
-
///
|
|
14
|
-
/// Validates that file extensions are correctly mapped to MIME types.
|
|
15
|
-
/// This is the primary MIME detection method (extension-first approach).
|
|
16
|
-
#[tokio::test]
|
|
17
|
-
async fn test_mime_detection_by_extension() {
|
|
18
|
-
use tempfile::TempDir;
|
|
19
|
-
|
|
20
|
-
let test_cases = vec![
|
|
21
|
-
("test.pdf", "application/pdf"),
|
|
22
|
-
(
|
|
23
|
-
"test.docx",
|
|
24
|
-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
25
|
-
),
|
|
26
|
-
(
|
|
27
|
-
"test.xlsx",
|
|
28
|
-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
29
|
-
),
|
|
30
|
-
(
|
|
31
|
-
"test.pptx",
|
|
32
|
-
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
33
|
-
),
|
|
34
|
-
("test.txt", "text/plain"),
|
|
35
|
-
("test.md", "text/markdown"),
|
|
36
|
-
("test.html", "text/html"),
|
|
37
|
-
("test.json", "application/json"),
|
|
38
|
-
("test.xml", "application/xml"),
|
|
39
|
-
("test.csv", "text/csv"),
|
|
40
|
-
("test.png", "image/png"),
|
|
41
|
-
("test.jpg", "image/jpeg"),
|
|
42
|
-
("test.gif", "image/gif"),
|
|
43
|
-
("test.eml", "message/rfc822"),
|
|
44
|
-
("test.zip", "application/zip"),
|
|
45
|
-
];
|
|
46
|
-
|
|
47
|
-
for (filename, expected_mime) in test_cases {
|
|
48
|
-
let temp_dir = TempDir::new().expect("Should create temp dir");
|
|
49
|
-
let temp_path = temp_dir.path().join(filename);
|
|
50
|
-
|
|
51
|
-
std::fs::write(&temp_path, b"test content").unwrap();
|
|
52
|
-
|
|
53
|
-
let detected = detect_mime_type(&temp_path, true);
|
|
54
|
-
|
|
55
|
-
assert!(detected.is_ok(), "Should detect MIME type for {}", filename);
|
|
56
|
-
assert_eq!(detected.unwrap(), expected_mime, "MIME type mismatch for {}", filename);
|
|
57
|
-
}
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
/// Test case-insensitive extension detection.
|
|
61
|
-
#[tokio::test]
|
|
62
|
-
async fn test_mime_detection_case_insensitive() {
|
|
63
|
-
use tempfile::TempDir;
|
|
64
|
-
|
|
65
|
-
let test_cases = vec![
|
|
66
|
-
("test.PDF", "application/pdf"),
|
|
67
|
-
(
|
|
68
|
-
"test.DOCX",
|
|
69
|
-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
70
|
-
),
|
|
71
|
-
("test.TXT", "text/plain"),
|
|
72
|
-
("test.Jpg", "image/jpeg"),
|
|
73
|
-
];
|
|
74
|
-
|
|
75
|
-
for (filename, expected_mime) in test_cases {
|
|
76
|
-
let temp_dir = TempDir::new().expect("Should create temp dir");
|
|
77
|
-
let temp_path = temp_dir.path().join(filename);
|
|
78
|
-
|
|
79
|
-
std::fs::write(&temp_path, b"test").unwrap();
|
|
80
|
-
|
|
81
|
-
let detected = detect_mime_type(&temp_path, true);
|
|
82
|
-
assert!(detected.is_ok(), "Should handle {} (case insensitive)", filename);
|
|
83
|
-
assert_eq!(detected.unwrap(), expected_mime);
|
|
84
|
-
}
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
/// Test MIME detection by content (magic bytes).
|
|
88
|
-
#[tokio::test]
|
|
89
|
-
async fn test_mime_detection_by_content() {
|
|
90
|
-
struct TestCase {
|
|
91
|
-
content: Vec<u8>,
|
|
92
|
-
filename: &'static str,
|
|
93
|
-
expected_fallback: Option<&'static str>,
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
let test_cases = vec![
|
|
97
|
-
TestCase {
|
|
98
|
-
content: b"%PDF-1.4\ntest content".to_vec(),
|
|
99
|
-
filename: "test",
|
|
100
|
-
expected_fallback: Some("application/pdf"),
|
|
101
|
-
},
|
|
102
|
-
TestCase {
|
|
103
|
-
content: vec![0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A],
|
|
104
|
-
filename: "test",
|
|
105
|
-
expected_fallback: Some("image/png"),
|
|
106
|
-
},
|
|
107
|
-
TestCase {
|
|
108
|
-
content: vec![0x50, 0x4B, 0x03, 0x04],
|
|
109
|
-
filename: "test",
|
|
110
|
-
expected_fallback: Some("application/zip"),
|
|
111
|
-
},
|
|
112
|
-
TestCase {
|
|
113
|
-
content: vec![0xFF, 0xD8, 0xFF, 0xE0],
|
|
114
|
-
filename: "test",
|
|
115
|
-
expected_fallback: Some("image/jpeg"),
|
|
116
|
-
},
|
|
117
|
-
];
|
|
118
|
-
|
|
119
|
-
for test_case in test_cases {
|
|
120
|
-
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
121
|
-
let temp_path = temp_file.path().parent().unwrap().join(test_case.filename);
|
|
122
|
-
|
|
123
|
-
temp_file.write_all(&test_case.content).unwrap();
|
|
124
|
-
temp_file.flush().unwrap();
|
|
125
|
-
std::fs::copy(temp_file.path(), &temp_path).unwrap();
|
|
126
|
-
|
|
127
|
-
let detected = detect_mime_type(&temp_path, true);
|
|
128
|
-
|
|
129
|
-
if let Some(expected) = test_case.expected_fallback {
|
|
130
|
-
if let Ok(mime) = &detected {
|
|
131
|
-
assert!(
|
|
132
|
-
mime == expected || mime.starts_with("application/") || mime.starts_with("image/"),
|
|
133
|
-
"For {}, expected {} or reasonable fallback, got {}",
|
|
134
|
-
test_case.filename,
|
|
135
|
-
expected,
|
|
136
|
-
mime
|
|
137
|
-
);
|
|
138
|
-
} else {
|
|
139
|
-
assert!(
|
|
140
|
-
detected.is_err(),
|
|
141
|
-
"Should fail gracefully for {} without extension",
|
|
142
|
-
test_case.filename
|
|
143
|
-
);
|
|
144
|
-
}
|
|
145
|
-
}
|
|
146
|
-
|
|
147
|
-
let _ = std::fs::remove_file(&temp_path);
|
|
148
|
-
}
|
|
149
|
-
}
|
|
150
|
-
|
|
151
|
-
/// Test validation of supported MIME types.
|
|
152
|
-
///
|
|
153
|
-
/// Validates that all documented supported MIME types pass validation.
|
|
154
|
-
/// This ensures the MIME type registry is correctly configured.
|
|
155
|
-
#[tokio::test]
|
|
156
|
-
async fn test_mime_type_validation() {
|
|
157
|
-
let supported = vec![
|
|
158
|
-
"application/pdf",
|
|
159
|
-
"text/plain",
|
|
160
|
-
"text/markdown",
|
|
161
|
-
"application/json",
|
|
162
|
-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
163
|
-
"image/png",
|
|
164
|
-
"image/jpeg",
|
|
165
|
-
"message/rfc822",
|
|
166
|
-
"text/csv",
|
|
167
|
-
"application/zip",
|
|
168
|
-
];
|
|
169
|
-
|
|
170
|
-
for mime_type in supported {
|
|
171
|
-
let result = validate_mime_type(mime_type);
|
|
172
|
-
assert!(result.is_ok(), "Should validate supported MIME type: {}", mime_type);
|
|
173
|
-
assert_eq!(result.unwrap(), mime_type);
|
|
174
|
-
}
|
|
175
|
-
}
|
|
176
|
-
|
|
177
|
-
/// Test validation of image MIME types (prefix matching).
|
|
178
|
-
#[tokio::test]
|
|
179
|
-
async fn test_mime_type_image_prefix_validation() {
|
|
180
|
-
let image_types = vec![
|
|
181
|
-
"image/png",
|
|
182
|
-
"image/jpeg",
|
|
183
|
-
"image/gif",
|
|
184
|
-
"image/webp",
|
|
185
|
-
"image/bmp",
|
|
186
|
-
"image/tiff",
|
|
187
|
-
"image/svg+xml",
|
|
188
|
-
"image/x-custom-format",
|
|
189
|
-
];
|
|
190
|
-
|
|
191
|
-
for mime_type in image_types {
|
|
192
|
-
let result = validate_mime_type(mime_type);
|
|
193
|
-
assert!(result.is_ok(), "Should validate image MIME type: {}", mime_type);
|
|
194
|
-
}
|
|
195
|
-
}
|
|
196
|
-
|
|
197
|
-
/// Test unknown/unsupported MIME type handling.
|
|
198
|
-
#[tokio::test]
|
|
199
|
-
async fn test_unknown_mime_type() {
|
|
200
|
-
let unsupported = vec![
|
|
201
|
-
"application/x-unknown-format",
|
|
202
|
-
"video/mp4",
|
|
203
|
-
"audio/mp3",
|
|
204
|
-
"application/octet-stream",
|
|
205
|
-
"text/x-unsupported",
|
|
206
|
-
];
|
|
207
|
-
|
|
208
|
-
for mime_type in unsupported {
|
|
209
|
-
let result = validate_mime_type(mime_type);
|
|
210
|
-
assert!(result.is_err(), "Should reject unsupported MIME type: {}", mime_type);
|
|
211
|
-
|
|
212
|
-
let error = result.unwrap_err();
|
|
213
|
-
assert!(
|
|
214
|
-
matches!(error, kreuzberg::KreuzbergError::UnsupportedFormat(_)),
|
|
215
|
-
"Should return UnsupportedFormat error for: {}",
|
|
216
|
-
mime_type
|
|
217
|
-
);
|
|
218
|
-
}
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
/// Test handling of MIME type mismatch (extension vs content).
|
|
222
|
-
#[tokio::test]
|
|
223
|
-
async fn test_mime_mismatch_warning() {
|
|
224
|
-
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
225
|
-
let temp_path = temp_file.path().parent().unwrap().join("document.pdf");
|
|
226
|
-
|
|
227
|
-
temp_file.write_all(&[0x50, 0x4B, 0x03, 0x04]).unwrap();
|
|
228
|
-
temp_file.flush().unwrap();
|
|
229
|
-
std::fs::copy(temp_file.path(), &temp_path).unwrap();
|
|
230
|
-
|
|
231
|
-
let detected = detect_mime_type(&temp_path, true);
|
|
232
|
-
|
|
233
|
-
assert!(detected.is_ok(), "Should detect MIME type even with mismatch");
|
|
234
|
-
|
|
235
|
-
assert_eq!(
|
|
236
|
-
detected.unwrap(),
|
|
237
|
-
"application/pdf",
|
|
238
|
-
"Extension-based detection should take precedence"
|
|
239
|
-
);
|
|
240
|
-
|
|
241
|
-
let _ = std::fs::remove_file(&temp_path);
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
/// Test file extension mismatch detection.
|
|
245
|
-
#[tokio::test]
|
|
246
|
-
async fn test_extension_content_mismatch() {
|
|
247
|
-
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
248
|
-
let temp_path = temp_file.path().parent().unwrap().join("document.txt");
|
|
249
|
-
|
|
250
|
-
temp_file.write_all(b"%PDF-1.4\n").unwrap();
|
|
251
|
-
temp_file.flush().unwrap();
|
|
252
|
-
std::fs::copy(temp_file.path(), &temp_path).unwrap();
|
|
253
|
-
|
|
254
|
-
let detected = detect_mime_type(&temp_path, true);
|
|
255
|
-
|
|
256
|
-
assert!(detected.is_ok(), "Should detect MIME type");
|
|
257
|
-
|
|
258
|
-
assert_eq!(
|
|
259
|
-
detected.unwrap(),
|
|
260
|
-
"text/plain",
|
|
261
|
-
"Should use extension for MIME detection"
|
|
262
|
-
);
|
|
263
|
-
|
|
264
|
-
let _ = std::fs::remove_file(&temp_path);
|
|
265
|
-
}
|
|
266
|
-
|
|
267
|
-
/// Test file without extension.
|
|
268
|
-
#[tokio::test]
|
|
269
|
-
async fn test_no_extension() {
|
|
270
|
-
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
271
|
-
let temp_path = temp_file.path().parent().unwrap().join("testfile");
|
|
272
|
-
|
|
273
|
-
temp_file.write_all(b"test content").unwrap();
|
|
274
|
-
temp_file.flush().unwrap();
|
|
275
|
-
std::fs::copy(temp_file.path(), &temp_path).unwrap();
|
|
276
|
-
|
|
277
|
-
let detected = detect_mime_type(&temp_path, true);
|
|
278
|
-
|
|
279
|
-
if detected.is_err() {
|
|
280
|
-
let error = detected.unwrap_err();
|
|
281
|
-
assert!(
|
|
282
|
-
matches!(
|
|
283
|
-
error,
|
|
284
|
-
kreuzberg::KreuzbergError::Validation { .. } | kreuzberg::KreuzbergError::UnsupportedFormat(_)
|
|
285
|
-
),
|
|
286
|
-
"Should return appropriate error for file without extension"
|
|
287
|
-
);
|
|
288
|
-
} else {
|
|
289
|
-
let mime = detected.unwrap();
|
|
290
|
-
assert!(
|
|
291
|
-
mime.contains('/'),
|
|
292
|
-
"Detected MIME type should be valid format: {}",
|
|
293
|
-
mime
|
|
294
|
-
);
|
|
295
|
-
}
|
|
296
|
-
|
|
297
|
-
let _ = std::fs::remove_file(&temp_path);
|
|
298
|
-
}
|
|
299
|
-
|
|
300
|
-
/// Test nonexistent file.
|
|
301
|
-
#[tokio::test]
|
|
302
|
-
async fn test_mime_detection_nonexistent_file() {
|
|
303
|
-
let nonexistent_path = "/nonexistent/path/to/file.pdf";
|
|
304
|
-
|
|
305
|
-
let result = detect_mime_type(nonexistent_path, true);
|
|
306
|
-
|
|
307
|
-
assert!(result.is_err(), "Should fail for nonexistent file");
|
|
308
|
-
|
|
309
|
-
let error = result.unwrap_err();
|
|
310
|
-
assert!(
|
|
311
|
-
matches!(error, kreuzberg::KreuzbergError::Validation { .. }),
|
|
312
|
-
"Should return Validation error for nonexistent file"
|
|
313
|
-
);
|
|
314
|
-
}
|
|
315
|
-
|
|
316
|
-
/// Test file existence check can be disabled.
|
|
317
|
-
#[tokio::test]
|
|
318
|
-
async fn test_mime_detection_skip_existence_check() {
|
|
319
|
-
let nonexistent_path = "/nonexistent/path/to/document.pdf";
|
|
320
|
-
|
|
321
|
-
let result = detect_mime_type(nonexistent_path, false);
|
|
322
|
-
|
|
323
|
-
assert!(result.is_ok(), "Should succeed when skipping existence check");
|
|
324
|
-
assert_eq!(result.unwrap(), "application/pdf");
|
|
325
|
-
}
|
|
326
|
-
|
|
327
|
-
/// Test multiple dots in filename.
|
|
328
|
-
#[tokio::test]
|
|
329
|
-
async fn test_filename_multiple_dots() {
|
|
330
|
-
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
331
|
-
let temp_path = temp_file.path().parent().unwrap().join("my.backup.file.pdf");
|
|
332
|
-
|
|
333
|
-
temp_file.write_all(b"test").unwrap();
|
|
334
|
-
temp_file.flush().unwrap();
|
|
335
|
-
std::fs::copy(temp_file.path(), &temp_path).unwrap();
|
|
336
|
-
|
|
337
|
-
let detected = detect_mime_type(&temp_path, true);
|
|
338
|
-
|
|
339
|
-
assert!(detected.is_ok(), "Should handle multiple dots in filename");
|
|
340
|
-
assert_eq!(detected.unwrap(), "application/pdf", "Should use last extension");
|
|
341
|
-
|
|
342
|
-
let _ = std::fs::remove_file(&temp_path);
|
|
343
|
-
}
|
|
344
|
-
|
|
345
|
-
/// Test special characters in filename.
|
|
346
|
-
#[tokio::test]
|
|
347
|
-
async fn test_filename_special_characters() {
|
|
348
|
-
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
349
|
-
let temp_path = temp_file.path().parent().unwrap().join("文档 (copy) [v2].pdf");
|
|
350
|
-
|
|
351
|
-
temp_file.write_all(b"test").unwrap();
|
|
352
|
-
temp_file.flush().unwrap();
|
|
353
|
-
std::fs::copy(temp_file.path(), &temp_path).unwrap();
|
|
354
|
-
|
|
355
|
-
let detected = detect_mime_type(&temp_path, true);
|
|
356
|
-
|
|
357
|
-
assert!(detected.is_ok(), "Should handle special characters in filename");
|
|
358
|
-
assert_eq!(detected.unwrap(), "application/pdf");
|
|
359
|
-
|
|
360
|
-
let _ = std::fs::remove_file(&temp_path);
|
|
361
|
-
}
|
|
362
|
-
|
|
363
|
-
/// Test MIME detection for all Pandoc-supported formats.
|
|
364
|
-
///
|
|
365
|
-
/// Validates that all document formats supported by Pandoc extractor
|
|
366
|
-
/// are correctly detected and mapped to their MIME types.
|
|
367
|
-
#[cfg(feature = "office")]
|
|
368
|
-
#[tokio::test]
|
|
369
|
-
async fn test_pandoc_formats_mime_detection() {
|
|
370
|
-
let pandoc_formats = vec![
|
|
371
|
-
("test.rst", "text/x-rst"),
|
|
372
|
-
("test.tex", "application/x-latex"),
|
|
373
|
-
("test.latex", "application/x-latex"),
|
|
374
|
-
("test.rtf", "application/rtf"),
|
|
375
|
-
("test.odt", "application/vnd.oasis.opendocument.text"),
|
|
376
|
-
("test.epub", "application/epub+zip"),
|
|
377
|
-
("test.org", "text/x-org"),
|
|
378
|
-
("test.typst", "application/x-typst"),
|
|
379
|
-
("test.commonmark", "text/x-commonmark"),
|
|
380
|
-
];
|
|
381
|
-
|
|
382
|
-
for (filename, expected_mime) in pandoc_formats {
|
|
383
|
-
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
384
|
-
let temp_path = temp_file.path().parent().unwrap().join(filename);
|
|
385
|
-
|
|
386
|
-
temp_file.write_all(b"test content").unwrap();
|
|
387
|
-
temp_file.flush().unwrap();
|
|
388
|
-
std::fs::copy(temp_file.path(), &temp_path).unwrap();
|
|
389
|
-
|
|
390
|
-
let detected = detect_mime_type(&temp_path, true);
|
|
391
|
-
|
|
392
|
-
assert!(
|
|
393
|
-
detected.is_ok(),
|
|
394
|
-
"Should detect MIME type for Pandoc format: {}",
|
|
395
|
-
filename
|
|
396
|
-
);
|
|
397
|
-
assert_eq!(
|
|
398
|
-
detected.unwrap(),
|
|
399
|
-
expected_mime,
|
|
400
|
-
"MIME type mismatch for Pandoc format: {}",
|
|
401
|
-
filename
|
|
402
|
-
);
|
|
403
|
-
|
|
404
|
-
let _ = std::fs::remove_file(&temp_path);
|
|
405
|
-
}
|
|
406
|
-
}
|
|
407
|
-
|
|
408
|
-
/// Test MIME validation for all Pandoc formats.
|
|
409
|
-
#[cfg(feature = "office")]
|
|
410
|
-
#[tokio::test]
|
|
411
|
-
async fn test_pandoc_mime_validation() {
|
|
412
|
-
let pandoc_mimes = vec![
|
|
413
|
-
"text/x-rst",
|
|
414
|
-
"application/x-latex",
|
|
415
|
-
"application/rtf",
|
|
416
|
-
"application/vnd.oasis.opendocument.text",
|
|
417
|
-
"application/epub+zip",
|
|
418
|
-
"text/x-org",
|
|
419
|
-
"application/x-typst",
|
|
420
|
-
"text/x-commonmark",
|
|
421
|
-
];
|
|
422
|
-
|
|
423
|
-
for mime_type in pandoc_mimes {
|
|
424
|
-
let result = validate_mime_type(mime_type);
|
|
425
|
-
assert!(result.is_ok(), "Pandoc MIME type should be supported: {}", mime_type);
|
|
426
|
-
assert_eq!(result.unwrap(), mime_type);
|
|
427
|
-
}
|
|
428
|
-
}
|
|
1
|
+
//! MIME type detection integration tests.
|
|
2
|
+
//!
|
|
3
|
+
//! Tests for MIME type detection from file extensions and content.
|
|
4
|
+
//! Validates detection accuracy, mismatch handling, and error cases.
|
|
5
|
+
|
|
6
|
+
use kreuzberg::core::mime::{detect_mime_type, validate_mime_type};
|
|
7
|
+
use std::io::Write;
|
|
8
|
+
use tempfile::NamedTempFile;
|
|
9
|
+
|
|
10
|
+
mod helpers;
|
|
11
|
+
|
|
12
|
+
/// Test MIME detection by file extension.
|
|
13
|
+
///
|
|
14
|
+
/// Validates that file extensions are correctly mapped to MIME types.
|
|
15
|
+
/// This is the primary MIME detection method (extension-first approach).
|
|
16
|
+
#[tokio::test]
|
|
17
|
+
async fn test_mime_detection_by_extension() {
|
|
18
|
+
use tempfile::TempDir;
|
|
19
|
+
|
|
20
|
+
let test_cases = vec![
|
|
21
|
+
("test.pdf", "application/pdf"),
|
|
22
|
+
(
|
|
23
|
+
"test.docx",
|
|
24
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
25
|
+
),
|
|
26
|
+
(
|
|
27
|
+
"test.xlsx",
|
|
28
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
29
|
+
),
|
|
30
|
+
(
|
|
31
|
+
"test.pptx",
|
|
32
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
33
|
+
),
|
|
34
|
+
("test.txt", "text/plain"),
|
|
35
|
+
("test.md", "text/markdown"),
|
|
36
|
+
("test.html", "text/html"),
|
|
37
|
+
("test.json", "application/json"),
|
|
38
|
+
("test.xml", "application/xml"),
|
|
39
|
+
("test.csv", "text/csv"),
|
|
40
|
+
("test.png", "image/png"),
|
|
41
|
+
("test.jpg", "image/jpeg"),
|
|
42
|
+
("test.gif", "image/gif"),
|
|
43
|
+
("test.eml", "message/rfc822"),
|
|
44
|
+
("test.zip", "application/zip"),
|
|
45
|
+
];
|
|
46
|
+
|
|
47
|
+
for (filename, expected_mime) in test_cases {
|
|
48
|
+
let temp_dir = TempDir::new().expect("Should create temp dir");
|
|
49
|
+
let temp_path = temp_dir.path().join(filename);
|
|
50
|
+
|
|
51
|
+
std::fs::write(&temp_path, b"test content").unwrap();
|
|
52
|
+
|
|
53
|
+
let detected = detect_mime_type(&temp_path, true);
|
|
54
|
+
|
|
55
|
+
assert!(detected.is_ok(), "Should detect MIME type for {}", filename);
|
|
56
|
+
assert_eq!(detected.unwrap(), expected_mime, "MIME type mismatch for {}", filename);
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
/// Test case-insensitive extension detection.
|
|
61
|
+
#[tokio::test]
|
|
62
|
+
async fn test_mime_detection_case_insensitive() {
|
|
63
|
+
use tempfile::TempDir;
|
|
64
|
+
|
|
65
|
+
let test_cases = vec![
|
|
66
|
+
("test.PDF", "application/pdf"),
|
|
67
|
+
(
|
|
68
|
+
"test.DOCX",
|
|
69
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
70
|
+
),
|
|
71
|
+
("test.TXT", "text/plain"),
|
|
72
|
+
("test.Jpg", "image/jpeg"),
|
|
73
|
+
];
|
|
74
|
+
|
|
75
|
+
for (filename, expected_mime) in test_cases {
|
|
76
|
+
let temp_dir = TempDir::new().expect("Should create temp dir");
|
|
77
|
+
let temp_path = temp_dir.path().join(filename);
|
|
78
|
+
|
|
79
|
+
std::fs::write(&temp_path, b"test").unwrap();
|
|
80
|
+
|
|
81
|
+
let detected = detect_mime_type(&temp_path, true);
|
|
82
|
+
assert!(detected.is_ok(), "Should handle {} (case insensitive)", filename);
|
|
83
|
+
assert_eq!(detected.unwrap(), expected_mime);
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
/// Test MIME detection by content (magic bytes).
|
|
88
|
+
#[tokio::test]
|
|
89
|
+
async fn test_mime_detection_by_content() {
|
|
90
|
+
struct TestCase {
|
|
91
|
+
content: Vec<u8>,
|
|
92
|
+
filename: &'static str,
|
|
93
|
+
expected_fallback: Option<&'static str>,
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
let test_cases = vec![
|
|
97
|
+
TestCase {
|
|
98
|
+
content: b"%PDF-1.4\ntest content".to_vec(),
|
|
99
|
+
filename: "test",
|
|
100
|
+
expected_fallback: Some("application/pdf"),
|
|
101
|
+
},
|
|
102
|
+
TestCase {
|
|
103
|
+
content: vec![0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A],
|
|
104
|
+
filename: "test",
|
|
105
|
+
expected_fallback: Some("image/png"),
|
|
106
|
+
},
|
|
107
|
+
TestCase {
|
|
108
|
+
content: vec![0x50, 0x4B, 0x03, 0x04],
|
|
109
|
+
filename: "test",
|
|
110
|
+
expected_fallback: Some("application/zip"),
|
|
111
|
+
},
|
|
112
|
+
TestCase {
|
|
113
|
+
content: vec![0xFF, 0xD8, 0xFF, 0xE0],
|
|
114
|
+
filename: "test",
|
|
115
|
+
expected_fallback: Some("image/jpeg"),
|
|
116
|
+
},
|
|
117
|
+
];
|
|
118
|
+
|
|
119
|
+
for test_case in test_cases {
|
|
120
|
+
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
121
|
+
let temp_path = temp_file.path().parent().unwrap().join(test_case.filename);
|
|
122
|
+
|
|
123
|
+
temp_file.write_all(&test_case.content).unwrap();
|
|
124
|
+
temp_file.flush().unwrap();
|
|
125
|
+
std::fs::copy(temp_file.path(), &temp_path).unwrap();
|
|
126
|
+
|
|
127
|
+
let detected = detect_mime_type(&temp_path, true);
|
|
128
|
+
|
|
129
|
+
if let Some(expected) = test_case.expected_fallback {
|
|
130
|
+
if let Ok(mime) = &detected {
|
|
131
|
+
assert!(
|
|
132
|
+
mime == expected || mime.starts_with("application/") || mime.starts_with("image/"),
|
|
133
|
+
"For {}, expected {} or reasonable fallback, got {}",
|
|
134
|
+
test_case.filename,
|
|
135
|
+
expected,
|
|
136
|
+
mime
|
|
137
|
+
);
|
|
138
|
+
} else {
|
|
139
|
+
assert!(
|
|
140
|
+
detected.is_err(),
|
|
141
|
+
"Should fail gracefully for {} without extension",
|
|
142
|
+
test_case.filename
|
|
143
|
+
);
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
let _ = std::fs::remove_file(&temp_path);
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
/// Test validation of supported MIME types.
|
|
152
|
+
///
|
|
153
|
+
/// Validates that all documented supported MIME types pass validation.
|
|
154
|
+
/// This ensures the MIME type registry is correctly configured.
|
|
155
|
+
#[tokio::test]
|
|
156
|
+
async fn test_mime_type_validation() {
|
|
157
|
+
let supported = vec![
|
|
158
|
+
"application/pdf",
|
|
159
|
+
"text/plain",
|
|
160
|
+
"text/markdown",
|
|
161
|
+
"application/json",
|
|
162
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
163
|
+
"image/png",
|
|
164
|
+
"image/jpeg",
|
|
165
|
+
"message/rfc822",
|
|
166
|
+
"text/csv",
|
|
167
|
+
"application/zip",
|
|
168
|
+
];
|
|
169
|
+
|
|
170
|
+
for mime_type in supported {
|
|
171
|
+
let result = validate_mime_type(mime_type);
|
|
172
|
+
assert!(result.is_ok(), "Should validate supported MIME type: {}", mime_type);
|
|
173
|
+
assert_eq!(result.unwrap(), mime_type);
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
/// Test validation of image MIME types (prefix matching).
|
|
178
|
+
#[tokio::test]
|
|
179
|
+
async fn test_mime_type_image_prefix_validation() {
|
|
180
|
+
let image_types = vec![
|
|
181
|
+
"image/png",
|
|
182
|
+
"image/jpeg",
|
|
183
|
+
"image/gif",
|
|
184
|
+
"image/webp",
|
|
185
|
+
"image/bmp",
|
|
186
|
+
"image/tiff",
|
|
187
|
+
"image/svg+xml",
|
|
188
|
+
"image/x-custom-format",
|
|
189
|
+
];
|
|
190
|
+
|
|
191
|
+
for mime_type in image_types {
|
|
192
|
+
let result = validate_mime_type(mime_type);
|
|
193
|
+
assert!(result.is_ok(), "Should validate image MIME type: {}", mime_type);
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
/// Test unknown/unsupported MIME type handling.
|
|
198
|
+
#[tokio::test]
|
|
199
|
+
async fn test_unknown_mime_type() {
|
|
200
|
+
let unsupported = vec![
|
|
201
|
+
"application/x-unknown-format",
|
|
202
|
+
"video/mp4",
|
|
203
|
+
"audio/mp3",
|
|
204
|
+
"application/octet-stream",
|
|
205
|
+
"text/x-unsupported",
|
|
206
|
+
];
|
|
207
|
+
|
|
208
|
+
for mime_type in unsupported {
|
|
209
|
+
let result = validate_mime_type(mime_type);
|
|
210
|
+
assert!(result.is_err(), "Should reject unsupported MIME type: {}", mime_type);
|
|
211
|
+
|
|
212
|
+
let error = result.unwrap_err();
|
|
213
|
+
assert!(
|
|
214
|
+
matches!(error, kreuzberg::KreuzbergError::UnsupportedFormat(_)),
|
|
215
|
+
"Should return UnsupportedFormat error for: {}",
|
|
216
|
+
mime_type
|
|
217
|
+
);
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
/// Test handling of MIME type mismatch (extension vs content).
|
|
222
|
+
#[tokio::test]
|
|
223
|
+
async fn test_mime_mismatch_warning() {
|
|
224
|
+
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
225
|
+
let temp_path = temp_file.path().parent().unwrap().join("document.pdf");
|
|
226
|
+
|
|
227
|
+
temp_file.write_all(&[0x50, 0x4B, 0x03, 0x04]).unwrap();
|
|
228
|
+
temp_file.flush().unwrap();
|
|
229
|
+
std::fs::copy(temp_file.path(), &temp_path).unwrap();
|
|
230
|
+
|
|
231
|
+
let detected = detect_mime_type(&temp_path, true);
|
|
232
|
+
|
|
233
|
+
assert!(detected.is_ok(), "Should detect MIME type even with mismatch");
|
|
234
|
+
|
|
235
|
+
assert_eq!(
|
|
236
|
+
detected.unwrap(),
|
|
237
|
+
"application/pdf",
|
|
238
|
+
"Extension-based detection should take precedence"
|
|
239
|
+
);
|
|
240
|
+
|
|
241
|
+
let _ = std::fs::remove_file(&temp_path);
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
/// Test file extension mismatch detection.
|
|
245
|
+
#[tokio::test]
|
|
246
|
+
async fn test_extension_content_mismatch() {
|
|
247
|
+
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
248
|
+
let temp_path = temp_file.path().parent().unwrap().join("document.txt");
|
|
249
|
+
|
|
250
|
+
temp_file.write_all(b"%PDF-1.4\n").unwrap();
|
|
251
|
+
temp_file.flush().unwrap();
|
|
252
|
+
std::fs::copy(temp_file.path(), &temp_path).unwrap();
|
|
253
|
+
|
|
254
|
+
let detected = detect_mime_type(&temp_path, true);
|
|
255
|
+
|
|
256
|
+
assert!(detected.is_ok(), "Should detect MIME type");
|
|
257
|
+
|
|
258
|
+
assert_eq!(
|
|
259
|
+
detected.unwrap(),
|
|
260
|
+
"text/plain",
|
|
261
|
+
"Should use extension for MIME detection"
|
|
262
|
+
);
|
|
263
|
+
|
|
264
|
+
let _ = std::fs::remove_file(&temp_path);
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
/// Test file without extension.
|
|
268
|
+
#[tokio::test]
|
|
269
|
+
async fn test_no_extension() {
|
|
270
|
+
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
271
|
+
let temp_path = temp_file.path().parent().unwrap().join("testfile");
|
|
272
|
+
|
|
273
|
+
temp_file.write_all(b"test content").unwrap();
|
|
274
|
+
temp_file.flush().unwrap();
|
|
275
|
+
std::fs::copy(temp_file.path(), &temp_path).unwrap();
|
|
276
|
+
|
|
277
|
+
let detected = detect_mime_type(&temp_path, true);
|
|
278
|
+
|
|
279
|
+
if detected.is_err() {
|
|
280
|
+
let error = detected.unwrap_err();
|
|
281
|
+
assert!(
|
|
282
|
+
matches!(
|
|
283
|
+
error,
|
|
284
|
+
kreuzberg::KreuzbergError::Validation { .. } | kreuzberg::KreuzbergError::UnsupportedFormat(_)
|
|
285
|
+
),
|
|
286
|
+
"Should return appropriate error for file without extension"
|
|
287
|
+
);
|
|
288
|
+
} else {
|
|
289
|
+
let mime = detected.unwrap();
|
|
290
|
+
assert!(
|
|
291
|
+
mime.contains('/'),
|
|
292
|
+
"Detected MIME type should be valid format: {}",
|
|
293
|
+
mime
|
|
294
|
+
);
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
let _ = std::fs::remove_file(&temp_path);
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
/// Test nonexistent file.
|
|
301
|
+
#[tokio::test]
|
|
302
|
+
async fn test_mime_detection_nonexistent_file() {
|
|
303
|
+
let nonexistent_path = "/nonexistent/path/to/file.pdf";
|
|
304
|
+
|
|
305
|
+
let result = detect_mime_type(nonexistent_path, true);
|
|
306
|
+
|
|
307
|
+
assert!(result.is_err(), "Should fail for nonexistent file");
|
|
308
|
+
|
|
309
|
+
let error = result.unwrap_err();
|
|
310
|
+
assert!(
|
|
311
|
+
matches!(error, kreuzberg::KreuzbergError::Validation { .. }),
|
|
312
|
+
"Should return Validation error for nonexistent file"
|
|
313
|
+
);
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
/// Test file existence check can be disabled.
|
|
317
|
+
#[tokio::test]
|
|
318
|
+
async fn test_mime_detection_skip_existence_check() {
|
|
319
|
+
let nonexistent_path = "/nonexistent/path/to/document.pdf";
|
|
320
|
+
|
|
321
|
+
let result = detect_mime_type(nonexistent_path, false);
|
|
322
|
+
|
|
323
|
+
assert!(result.is_ok(), "Should succeed when skipping existence check");
|
|
324
|
+
assert_eq!(result.unwrap(), "application/pdf");
|
|
325
|
+
}
|
|
326
|
+
|
|
327
|
+
/// Test multiple dots in filename.
|
|
328
|
+
#[tokio::test]
|
|
329
|
+
async fn test_filename_multiple_dots() {
|
|
330
|
+
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
331
|
+
let temp_path = temp_file.path().parent().unwrap().join("my.backup.file.pdf");
|
|
332
|
+
|
|
333
|
+
temp_file.write_all(b"test").unwrap();
|
|
334
|
+
temp_file.flush().unwrap();
|
|
335
|
+
std::fs::copy(temp_file.path(), &temp_path).unwrap();
|
|
336
|
+
|
|
337
|
+
let detected = detect_mime_type(&temp_path, true);
|
|
338
|
+
|
|
339
|
+
assert!(detected.is_ok(), "Should handle multiple dots in filename");
|
|
340
|
+
assert_eq!(detected.unwrap(), "application/pdf", "Should use last extension");
|
|
341
|
+
|
|
342
|
+
let _ = std::fs::remove_file(&temp_path);
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
/// Test special characters in filename.
|
|
346
|
+
#[tokio::test]
|
|
347
|
+
async fn test_filename_special_characters() {
|
|
348
|
+
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
349
|
+
let temp_path = temp_file.path().parent().unwrap().join("文档 (copy) [v2].pdf");
|
|
350
|
+
|
|
351
|
+
temp_file.write_all(b"test").unwrap();
|
|
352
|
+
temp_file.flush().unwrap();
|
|
353
|
+
std::fs::copy(temp_file.path(), &temp_path).unwrap();
|
|
354
|
+
|
|
355
|
+
let detected = detect_mime_type(&temp_path, true);
|
|
356
|
+
|
|
357
|
+
assert!(detected.is_ok(), "Should handle special characters in filename");
|
|
358
|
+
assert_eq!(detected.unwrap(), "application/pdf");
|
|
359
|
+
|
|
360
|
+
let _ = std::fs::remove_file(&temp_path);
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
/// Test MIME detection for all Pandoc-supported formats.
|
|
364
|
+
///
|
|
365
|
+
/// Validates that all document formats supported by Pandoc extractor
|
|
366
|
+
/// are correctly detected and mapped to their MIME types.
|
|
367
|
+
#[cfg(feature = "office")]
|
|
368
|
+
#[tokio::test]
|
|
369
|
+
async fn test_pandoc_formats_mime_detection() {
|
|
370
|
+
let pandoc_formats = vec![
|
|
371
|
+
("test.rst", "text/x-rst"),
|
|
372
|
+
("test.tex", "application/x-latex"),
|
|
373
|
+
("test.latex", "application/x-latex"),
|
|
374
|
+
("test.rtf", "application/rtf"),
|
|
375
|
+
("test.odt", "application/vnd.oasis.opendocument.text"),
|
|
376
|
+
("test.epub", "application/epub+zip"),
|
|
377
|
+
("test.org", "text/x-org"),
|
|
378
|
+
("test.typst", "application/x-typst"),
|
|
379
|
+
("test.commonmark", "text/x-commonmark"),
|
|
380
|
+
];
|
|
381
|
+
|
|
382
|
+
for (filename, expected_mime) in pandoc_formats {
|
|
383
|
+
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
384
|
+
let temp_path = temp_file.path().parent().unwrap().join(filename);
|
|
385
|
+
|
|
386
|
+
temp_file.write_all(b"test content").unwrap();
|
|
387
|
+
temp_file.flush().unwrap();
|
|
388
|
+
std::fs::copy(temp_file.path(), &temp_path).unwrap();
|
|
389
|
+
|
|
390
|
+
let detected = detect_mime_type(&temp_path, true);
|
|
391
|
+
|
|
392
|
+
assert!(
|
|
393
|
+
detected.is_ok(),
|
|
394
|
+
"Should detect MIME type for Pandoc format: {}",
|
|
395
|
+
filename
|
|
396
|
+
);
|
|
397
|
+
assert_eq!(
|
|
398
|
+
detected.unwrap(),
|
|
399
|
+
expected_mime,
|
|
400
|
+
"MIME type mismatch for Pandoc format: {}",
|
|
401
|
+
filename
|
|
402
|
+
);
|
|
403
|
+
|
|
404
|
+
let _ = std::fs::remove_file(&temp_path);
|
|
405
|
+
}
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
/// Test MIME validation for all Pandoc formats.
|
|
409
|
+
#[cfg(feature = "office")]
|
|
410
|
+
#[tokio::test]
|
|
411
|
+
async fn test_pandoc_mime_validation() {
|
|
412
|
+
let pandoc_mimes = vec![
|
|
413
|
+
"text/x-rst",
|
|
414
|
+
"application/x-latex",
|
|
415
|
+
"application/rtf",
|
|
416
|
+
"application/vnd.oasis.opendocument.text",
|
|
417
|
+
"application/epub+zip",
|
|
418
|
+
"text/x-org",
|
|
419
|
+
"application/x-typst",
|
|
420
|
+
"text/x-commonmark",
|
|
421
|
+
];
|
|
422
|
+
|
|
423
|
+
for mime_type in pandoc_mimes {
|
|
424
|
+
let result = validate_mime_type(mime_type);
|
|
425
|
+
assert!(result.is_ok(), "Pandoc MIME type should be supported: {}", mime_type);
|
|
426
|
+
assert_eq!(result.unwrap(), mime_type);
|
|
427
|
+
}
|
|
428
|
+
}
|