kreuzberg 4.0.0.rc1 → 4.0.0.rc2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +14 -8
- data/.rspec +3 -3
- data/.rubocop.yaml +1 -534
- data/.rubocop.yml +538 -0
- data/Gemfile +8 -9
- data/Gemfile.lock +9 -109
- data/README.md +426 -421
- data/Rakefile +25 -25
- data/Steepfile +47 -47
- data/examples/async_patterns.rb +341 -340
- data/ext/kreuzberg_rb/extconf.rb +45 -35
- data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +44 -36
- data/ext/kreuzberg_rb/native/README.md +425 -425
- data/ext/kreuzberg_rb/native/build.rs +15 -17
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
- data/ext/kreuzberg_rb/native/include/strings.h +20 -20
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
- data/ext/kreuzberg_rb/native/src/lib.rs +2998 -2939
- data/extconf.rb +28 -28
- data/kreuzberg.gemspec +148 -105
- data/lib/kreuzberg/api_proxy.rb +142 -142
- data/lib/kreuzberg/cache_api.rb +46 -45
- data/lib/kreuzberg/cli.rb +55 -55
- data/lib/kreuzberg/cli_proxy.rb +127 -127
- data/lib/kreuzberg/config.rb +691 -684
- data/lib/kreuzberg/error_context.rb +32 -0
- data/lib/kreuzberg/errors.rb +118 -50
- data/lib/kreuzberg/extraction_api.rb +85 -84
- data/lib/kreuzberg/mcp_proxy.rb +186 -186
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
- data/lib/kreuzberg/post_processor_protocol.rb +86 -86
- data/lib/kreuzberg/result.rb +216 -216
- data/lib/kreuzberg/setup_lib_path.rb +80 -79
- data/lib/kreuzberg/validator_protocol.rb +89 -89
- data/lib/kreuzberg/version.rb +5 -5
- data/lib/kreuzberg.rb +103 -82
- data/sig/kreuzberg/internal.rbs +184 -184
- data/sig/kreuzberg.rbs +520 -468
- data/spec/binding/cache_spec.rb +227 -227
- data/spec/binding/cli_proxy_spec.rb +85 -87
- data/spec/binding/cli_spec.rb +55 -54
- data/spec/binding/config_spec.rb +345 -345
- data/spec/binding/config_validation_spec.rb +283 -283
- data/spec/binding/error_handling_spec.rb +213 -213
- data/spec/binding/errors_spec.rb +66 -66
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
- data/spec/binding/plugins/postprocessor_spec.rb +269 -269
- data/spec/binding/plugins/validator_spec.rb +274 -274
- data/spec/fixtures/config.toml +39 -39
- data/spec/fixtures/config.yaml +41 -42
- data/spec/fixtures/invalid_config.toml +4 -4
- data/spec/smoke/package_spec.rb +178 -178
- data/spec/spec_helper.rb +42 -42
- data/vendor/kreuzberg/Cargo.toml +204 -134
- data/vendor/kreuzberg/README.md +175 -175
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
- data/vendor/kreuzberg/build.rs +474 -460
- data/vendor/kreuzberg/src/api/error.rs +81 -81
- data/vendor/kreuzberg/src/api/handlers.rs +199 -199
- data/vendor/kreuzberg/src/api/mod.rs +79 -79
- data/vendor/kreuzberg/src/api/server.rs +353 -353
- data/vendor/kreuzberg/src/api/types.rs +170 -170
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -1143
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -677
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -35
- data/vendor/kreuzberg/src/core/config.rs +1032 -1032
- data/vendor/kreuzberg/src/core/extractor.rs +1024 -903
- data/vendor/kreuzberg/src/core/io.rs +329 -327
- data/vendor/kreuzberg/src/core/mime.rs +605 -615
- data/vendor/kreuzberg/src/core/mod.rs +45 -42
- data/vendor/kreuzberg/src/core/pipeline.rs +984 -906
- data/vendor/kreuzberg/src/embeddings.rs +432 -323
- data/vendor/kreuzberg/src/error.rs +431 -431
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -40
- data/vendor/kreuzberg/src/extraction/email.rs +854 -854
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
- data/vendor/kreuzberg/src/extraction/html.rs +553 -553
- data/vendor/kreuzberg/src/extraction/image.rs +368 -368
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -564
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -77
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -128
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -3000
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
- data/vendor/kreuzberg/src/extraction/table.rs +328 -328
- data/vendor/kreuzberg/src/extraction/text.rs +269 -269
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
- data/vendor/kreuzberg/src/extractors/archive.rs +446 -425
- data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +367 -479
- data/vendor/kreuzberg/src/extractors/email.rs +143 -129
- data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +343 -344
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
- data/vendor/kreuzberg/src/extractors/html.rs +393 -410
- data/vendor/kreuzberg/src/extractors/image.rs +198 -195
- data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +365 -268
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
- data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +493 -496
- data/vendor/kreuzberg/src/extractors/pptx.rs +248 -234
- data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
- data/vendor/kreuzberg/src/extractors/security.rs +484 -0
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +140 -126
- data/vendor/kreuzberg/src/extractors/text.rs +260 -242
- data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +135 -128
- data/vendor/kreuzberg/src/image/dpi.rs +164 -164
- data/vendor/kreuzberg/src/image/mod.rs +6 -6
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
- data/vendor/kreuzberg/src/image/resize.rs +89 -89
- data/vendor/kreuzberg/src/keywords/config.rs +154 -154
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -267
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -294
- data/vendor/kreuzberg/src/keywords/types.rs +68 -68
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -942
- data/vendor/kreuzberg/src/lib.rs +105 -102
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -32
- data/vendor/kreuzberg/src/mcp/server.rs +1968 -1966
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
- data/vendor/kreuzberg/src/ocr/error.rs +37 -37
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -847
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -450
- data/vendor/kreuzberg/src/ocr/types.rs +393 -393
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
- data/vendor/kreuzberg/src/panic_context.rs +154 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -122
- data/vendor/kreuzberg/src/pdf/images.rs +139 -139
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -346
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -50
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
- data/vendor/kreuzberg/src/pdf/table.rs +393 -420
- data/vendor/kreuzberg/src/pdf/text.rs +158 -161
- data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -1010
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -209
- data/vendor/kreuzberg/src/plugins/ocr.rs +620 -629
- data/vendor/kreuzberg/src/plugins/processor.rs +642 -641
- data/vendor/kreuzberg/src/plugins/registry.rs +1337 -1324
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
- data/vendor/kreuzberg/src/plugins/validator.rs +956 -955
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
- data/vendor/kreuzberg/src/text/mod.rs +19 -19
- data/vendor/kreuzberg/src/text/quality.rs +697 -697
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
- data/vendor/kreuzberg/src/types.rs +903 -873
- data/vendor/kreuzberg/src/utils/mod.rs +17 -17
- data/vendor/kreuzberg/src/utils/quality.rs +959 -959
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -966
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -543
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -542
- data/vendor/kreuzberg/tests/batch_processing.rs +316 -304
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -509
- data/vendor/kreuzberg/tests/config_features.rs +598 -580
- data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -439
- data/vendor/kreuzberg/tests/core_integration.rs +510 -493
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -424
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -124
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -325
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -393
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -159
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -253
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -428
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -510
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -676
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -627
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -43
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -1412
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -771
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -561
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -921
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -607
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
- data/vendor/kreuzberg/tests/security_validation.rs +415 -404
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -609
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
- data/vendor/rb-sys/.cargo-ok +1 -0
- data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
- data/vendor/rb-sys/Cargo.lock +393 -0
- data/vendor/rb-sys/Cargo.toml +70 -0
- data/vendor/rb-sys/Cargo.toml.orig +57 -0
- data/vendor/rb-sys/LICENSE-APACHE +190 -0
- data/vendor/rb-sys/LICENSE-MIT +21 -0
- data/vendor/rb-sys/bin/release.sh +21 -0
- data/vendor/rb-sys/build/features.rs +108 -0
- data/vendor/rb-sys/build/main.rs +246 -0
- data/vendor/rb-sys/build/stable_api_config.rs +153 -0
- data/vendor/rb-sys/build/version.rs +48 -0
- data/vendor/rb-sys/readme.md +36 -0
- data/vendor/rb-sys/src/bindings.rs +21 -0
- data/vendor/rb-sys/src/hidden.rs +11 -0
- data/vendor/rb-sys/src/lib.rs +34 -0
- data/vendor/rb-sys/src/macros.rs +371 -0
- data/vendor/rb-sys/src/memory.rs +53 -0
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
- data/vendor/rb-sys/src/special_consts.rs +31 -0
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
- data/vendor/rb-sys/src/stable_api.rs +261 -0
- data/vendor/rb-sys/src/symbol.rs +31 -0
- data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
- data/vendor/rb-sys/src/utils.rs +89 -0
- data/vendor/rb-sys/src/value_type.rs +7 -0
- metadata +90 -95
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/spec/examples.txt +0 -104
- data/vendor/kreuzberg/src/bin/profile_extract.rs +0 -455
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +0 -275
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +0 -178
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +0 -491
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +0 -496
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +0 -1188
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +0 -162
- data/vendor/kreuzberg/src/extractors/pandoc.rs +0 -201
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +0 -92
- data/vendor/kreuzberg/tests/pandoc_integration.rs +0 -503
|
@@ -1,393 +1,393 @@
|
|
|
1
|
-
//! Error handling and edge case integration tests.
|
|
2
|
-
//!
|
|
3
|
-
//! Tests for corrupted files, edge cases, and invalid inputs.
|
|
4
|
-
//! Validates that the system handles errors gracefully without panics.
|
|
5
|
-
|
|
6
|
-
use kreuzberg::core::config::ExtractionConfig;
|
|
7
|
-
use kreuzberg::core::extractor::{extract_bytes, extract_file};
|
|
8
|
-
use std::io::Write;
|
|
9
|
-
use tempfile::NamedTempFile;
|
|
10
|
-
|
|
11
|
-
mod helpers;
|
|
12
|
-
|
|
13
|
-
/// Test truncated PDF - incomplete PDF file.
|
|
14
|
-
#[tokio::test]
|
|
15
|
-
async fn test_truncated_pdf() {
|
|
16
|
-
let config = ExtractionConfig::default();
|
|
17
|
-
|
|
18
|
-
let truncated_pdf = b"%PDF-1.4\n1 0 obj\n<<";
|
|
19
|
-
|
|
20
|
-
let result = extract_bytes(truncated_pdf, "application/pdf", &config).await;
|
|
21
|
-
|
|
22
|
-
assert!(result.is_err(), "Truncated PDF should fail gracefully");
|
|
23
|
-
|
|
24
|
-
let error = result.unwrap_err();
|
|
25
|
-
assert!(
|
|
26
|
-
matches!(error, kreuzberg::KreuzbergError::Parsing { .. }),
|
|
27
|
-
"Truncated PDF should produce Parsing error, got: {:?}",
|
|
28
|
-
error
|
|
29
|
-
);
|
|
30
|
-
}
|
|
31
|
-
|
|
32
|
-
/// Test corrupted ZIP - malformed archive.
|
|
33
|
-
#[tokio::test]
|
|
34
|
-
async fn test_corrupted_zip() {
|
|
35
|
-
let config = ExtractionConfig::default();
|
|
36
|
-
|
|
37
|
-
let corrupted_zip = vec![0x50, 0x4B, 0x03, 0x04, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00];
|
|
38
|
-
|
|
39
|
-
let result = extract_bytes(&corrupted_zip, "application/zip", &config).await;
|
|
40
|
-
|
|
41
|
-
assert!(result.is_err(), "Corrupted ZIP should fail gracefully");
|
|
42
|
-
|
|
43
|
-
let error = result.unwrap_err();
|
|
44
|
-
assert!(
|
|
45
|
-
matches!(error, kreuzberg::KreuzbergError::Parsing { .. }),
|
|
46
|
-
"Corrupted ZIP should produce Parsing error, got: {:?}",
|
|
47
|
-
error
|
|
48
|
-
);
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
/// Test invalid XML - bad XML syntax.
|
|
52
|
-
#[tokio::test]
|
|
53
|
-
async fn test_invalid_xml() {
|
|
54
|
-
let config = ExtractionConfig::default();
|
|
55
|
-
|
|
56
|
-
let invalid_xml = b"<?xml version=\"1.0\"?>\n\
|
|
57
|
-
<root>\n\
|
|
58
|
-
<unclosed>\n\
|
|
59
|
-
<another>text</wrong_tag>\n\
|
|
60
|
-
</root";
|
|
61
|
-
|
|
62
|
-
let result = extract_bytes(invalid_xml, "application/xml", &config).await;
|
|
63
|
-
|
|
64
|
-
match result {
|
|
65
|
-
Ok(extraction) => {
|
|
66
|
-
assert!(
|
|
67
|
-
extraction.chunks.is_none(),
|
|
68
|
-
"Chunks should be None without chunking config"
|
|
69
|
-
);
|
|
70
|
-
}
|
|
71
|
-
Err(error) => {
|
|
72
|
-
assert!(
|
|
73
|
-
matches!(error, kreuzberg::KreuzbergError::Parsing { .. }),
|
|
74
|
-
"Invalid XML error should be Parsing type, got: {:?}",
|
|
75
|
-
error
|
|
76
|
-
);
|
|
77
|
-
}
|
|
78
|
-
}
|
|
79
|
-
}
|
|
80
|
-
|
|
81
|
-
/// Test corrupted image - invalid image data.
|
|
82
|
-
#[tokio::test]
|
|
83
|
-
async fn test_corrupted_image() {
|
|
84
|
-
let config = ExtractionConfig::default();
|
|
85
|
-
|
|
86
|
-
let corrupted_png = vec![0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0xFF, 0xFF, 0xFF, 0xFF];
|
|
87
|
-
|
|
88
|
-
let result = extract_bytes(&corrupted_png, "image/png", &config).await;
|
|
89
|
-
|
|
90
|
-
match result {
|
|
91
|
-
Ok(extraction) => {
|
|
92
|
-
assert!(
|
|
93
|
-
extraction.chunks.is_none(),
|
|
94
|
-
"Chunks should be None without chunking config"
|
|
95
|
-
);
|
|
96
|
-
}
|
|
97
|
-
Err(error) => {
|
|
98
|
-
assert!(
|
|
99
|
-
matches!(error, kreuzberg::KreuzbergError::Parsing { .. })
|
|
100
|
-
|| matches!(error, kreuzberg::KreuzbergError::Ocr { .. }),
|
|
101
|
-
"Corrupted image error should be Parsing or OCR type, got: {:?}",
|
|
102
|
-
error
|
|
103
|
-
);
|
|
104
|
-
}
|
|
105
|
-
}
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
/// Test empty file - 0 bytes.
|
|
109
|
-
#[tokio::test]
|
|
110
|
-
async fn test_empty_file() {
|
|
111
|
-
let config = ExtractionConfig::default();
|
|
112
|
-
|
|
113
|
-
let empty_data = b"";
|
|
114
|
-
|
|
115
|
-
let result_pdf = extract_bytes(empty_data, "application/pdf", &config).await;
|
|
116
|
-
let result_text = extract_bytes(empty_data, "text/plain", &config).await;
|
|
117
|
-
let result_xml = extract_bytes(empty_data, "application/xml", &config).await;
|
|
118
|
-
|
|
119
|
-
match result_pdf {
|
|
120
|
-
Ok(extraction) => {
|
|
121
|
-
assert!(
|
|
122
|
-
extraction.content.is_empty(),
|
|
123
|
-
"Empty PDF should have empty content if it succeeds"
|
|
124
|
-
);
|
|
125
|
-
assert!(extraction.chunks.is_none(), "Chunks should be None");
|
|
126
|
-
}
|
|
127
|
-
Err(error) => {
|
|
128
|
-
assert!(
|
|
129
|
-
matches!(
|
|
130
|
-
error,
|
|
131
|
-
kreuzberg::KreuzbergError::Parsing { .. } | kreuzberg::KreuzbergError::Validation { .. }
|
|
132
|
-
),
|
|
133
|
-
"Empty PDF should produce Parsing or Validation error, got: {:?}",
|
|
134
|
-
error
|
|
135
|
-
);
|
|
136
|
-
}
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
match result_text {
|
|
140
|
-
Ok(extraction) => {
|
|
141
|
-
assert!(
|
|
142
|
-
extraction.content.is_empty(),
|
|
143
|
-
"Empty text file should have empty content"
|
|
144
|
-
);
|
|
145
|
-
assert!(extraction.chunks.is_none(), "Chunks should be None");
|
|
146
|
-
}
|
|
147
|
-
Err(error) => {
|
|
148
|
-
panic!("Empty text file should not fail, got error: {:?}", error);
|
|
149
|
-
}
|
|
150
|
-
}
|
|
151
|
-
|
|
152
|
-
match result_xml {
|
|
153
|
-
Ok(extraction) => {
|
|
154
|
-
assert!(
|
|
155
|
-
extraction.content.is_empty(),
|
|
156
|
-
"Empty XML should have empty content if it succeeds"
|
|
157
|
-
);
|
|
158
|
-
assert!(extraction.chunks.is_none(), "Chunks should be None");
|
|
159
|
-
}
|
|
160
|
-
Err(error) => {
|
|
161
|
-
assert!(
|
|
162
|
-
matches!(error, kreuzberg::KreuzbergError::Parsing { .. }),
|
|
163
|
-
"Empty XML error should be Parsing type, got: {:?}",
|
|
164
|
-
error
|
|
165
|
-
);
|
|
166
|
-
}
|
|
167
|
-
}
|
|
168
|
-
}
|
|
169
|
-
|
|
170
|
-
/// Test very large file - stress test with large content.
|
|
171
|
-
#[tokio::test]
|
|
172
|
-
async fn test_very_large_file() {
|
|
173
|
-
let config = ExtractionConfig::default();
|
|
174
|
-
|
|
175
|
-
let large_text = "This is a line of text that will be repeated many times.\n".repeat(200_000);
|
|
176
|
-
let large_bytes = large_text.as_bytes();
|
|
177
|
-
|
|
178
|
-
let result = extract_bytes(large_bytes, "text/plain", &config).await;
|
|
179
|
-
|
|
180
|
-
assert!(result.is_ok(), "Large file should be processed successfully");
|
|
181
|
-
let extraction = result.unwrap();
|
|
182
|
-
|
|
183
|
-
assert!(!extraction.content.is_empty(), "Large file content should not be empty");
|
|
184
|
-
assert!(extraction.content.len() > 1_000_000, "Content should be large");
|
|
185
|
-
assert!(
|
|
186
|
-
extraction.chunks.is_none(),
|
|
187
|
-
"Chunks should be None without chunking config"
|
|
188
|
-
);
|
|
189
|
-
assert!(
|
|
190
|
-
extraction.detected_languages.is_none(),
|
|
191
|
-
"Language detection not enabled"
|
|
192
|
-
);
|
|
193
|
-
assert!(extraction.tables.is_empty(), "Text file should not have tables");
|
|
194
|
-
|
|
195
|
-
assert!(
|
|
196
|
-
extraction.content.contains("This is a line of text"),
|
|
197
|
-
"Content should preserve original text"
|
|
198
|
-
);
|
|
199
|
-
}
|
|
200
|
-
|
|
201
|
-
/// Test unicode filenames - non-ASCII paths.
|
|
202
|
-
#[tokio::test]
|
|
203
|
-
async fn test_unicode_filenames() {
|
|
204
|
-
let config = ExtractionConfig::default();
|
|
205
|
-
|
|
206
|
-
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
207
|
-
temp_file.write_all(b"Test content with Unicode filename.").unwrap();
|
|
208
|
-
|
|
209
|
-
let result = extract_file(temp_file.path(), Some("text/plain"), &config).await;
|
|
210
|
-
|
|
211
|
-
assert!(result.is_ok(), "Unicode filename should be handled");
|
|
212
|
-
let extraction = result.unwrap();
|
|
213
|
-
|
|
214
|
-
assert!(
|
|
215
|
-
extraction.content.contains("Test content"),
|
|
216
|
-
"Content should be extracted"
|
|
217
|
-
);
|
|
218
|
-
assert!(
|
|
219
|
-
extraction.chunks.is_none(),
|
|
220
|
-
"Chunks should be None without chunking config"
|
|
221
|
-
);
|
|
222
|
-
assert!(
|
|
223
|
-
extraction.detected_languages.is_none(),
|
|
224
|
-
"Language detection not enabled"
|
|
225
|
-
);
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
/// Test special characters in content - emojis, RTL text.
|
|
229
|
-
#[tokio::test]
|
|
230
|
-
async fn test_special_characters_content() {
|
|
231
|
-
let config = ExtractionConfig::default();
|
|
232
|
-
|
|
233
|
-
let special_text = "Emojis: 🎉 🚀 ✅ 🌍\n\
|
|
234
|
-
Arabic (RTL): مرحبا بالعالم\n\
|
|
235
|
-
Chinese: 你好世界\n\
|
|
236
|
-
Japanese: こんにちは世界\n\
|
|
237
|
-
Special chars: © ® ™ € £ ¥\n\
|
|
238
|
-
Math symbols: ∑ ∫ √ ≈ ∞";
|
|
239
|
-
|
|
240
|
-
let result = extract_bytes(special_text.as_bytes(), "text/plain", &config).await;
|
|
241
|
-
|
|
242
|
-
assert!(result.is_ok(), "Special characters should be handled");
|
|
243
|
-
let extraction = result.unwrap();
|
|
244
|
-
|
|
245
|
-
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
246
|
-
assert!(extraction.content.len() > 10, "Should have substantial content");
|
|
247
|
-
assert!(
|
|
248
|
-
extraction.chunks.is_none(),
|
|
249
|
-
"Chunks should be None without chunking config"
|
|
250
|
-
);
|
|
251
|
-
assert!(
|
|
252
|
-
extraction.detected_languages.is_none(),
|
|
253
|
-
"Language detection not enabled"
|
|
254
|
-
);
|
|
255
|
-
|
|
256
|
-
assert!(
|
|
257
|
-
extraction.content.contains("Emojis")
|
|
258
|
-
|| extraction.content.contains("Arabic")
|
|
259
|
-
|| extraction.content.contains("Chinese"),
|
|
260
|
-
"Should preserve at least some special character text"
|
|
261
|
-
);
|
|
262
|
-
}
|
|
263
|
-
|
|
264
|
-
/// Test nonexistent file - file not found.
|
|
265
|
-
#[tokio::test]
|
|
266
|
-
async fn test_nonexistent_file() {
|
|
267
|
-
let config = ExtractionConfig::default();
|
|
268
|
-
|
|
269
|
-
let nonexistent_path = "/nonexistent/path/to/file.pdf";
|
|
270
|
-
|
|
271
|
-
let result = extract_file(nonexistent_path, Some("application/pdf"), &config).await;
|
|
272
|
-
|
|
273
|
-
assert!(result.is_err(), "Nonexistent file should return error");
|
|
274
|
-
|
|
275
|
-
let error = result.unwrap_err();
|
|
276
|
-
assert!(
|
|
277
|
-
matches!(error, kreuzberg::KreuzbergError::Io(_))
|
|
278
|
-
|| matches!(error, kreuzberg::KreuzbergError::Validation { .. }),
|
|
279
|
-
"Should be IO or Validation error for nonexistent file, got: {:?}",
|
|
280
|
-
error
|
|
281
|
-
);
|
|
282
|
-
}
|
|
283
|
-
|
|
284
|
-
/// Test unsupported format - unknown file type.
|
|
285
|
-
#[tokio::test]
|
|
286
|
-
async fn test_unsupported_format() {
|
|
287
|
-
let config = ExtractionConfig::default();
|
|
288
|
-
|
|
289
|
-
let data = b"Some random data";
|
|
290
|
-
|
|
291
|
-
let result = extract_bytes(data, "application/x-unknown-format", &config).await;
|
|
292
|
-
|
|
293
|
-
assert!(result.is_err(), "Unsupported format should return error");
|
|
294
|
-
|
|
295
|
-
let error = result.unwrap_err();
|
|
296
|
-
assert!(
|
|
297
|
-
matches!(error, kreuzberg::KreuzbergError::UnsupportedFormat(_)),
|
|
298
|
-
"Should be UnsupportedFormat error, got: {:?}",
|
|
299
|
-
error
|
|
300
|
-
);
|
|
301
|
-
}
|
|
302
|
-
|
|
303
|
-
/// Test permission denied - no read access (platform-specific).
|
|
304
|
-
#[tokio::test]
|
|
305
|
-
#[cfg(unix)]
|
|
306
|
-
async fn test_permission_denied() {
|
|
307
|
-
use std::fs;
|
|
308
|
-
use std::os::unix::fs::PermissionsExt;
|
|
309
|
-
|
|
310
|
-
let config = ExtractionConfig::default();
|
|
311
|
-
|
|
312
|
-
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
313
|
-
temp_file.write_all(b"Test content").unwrap();
|
|
314
|
-
|
|
315
|
-
let mut perms = fs::metadata(temp_file.path()).unwrap().permissions();
|
|
316
|
-
perms.set_mode(0o000);
|
|
317
|
-
fs::set_permissions(temp_file.path(), perms).unwrap();
|
|
318
|
-
|
|
319
|
-
let result = extract_file(temp_file.path(), Some("text/plain"), &config).await;
|
|
320
|
-
|
|
321
|
-
let mut perms = fs::metadata(temp_file.path()).unwrap().permissions();
|
|
322
|
-
perms.set_mode(0o644);
|
|
323
|
-
fs::set_permissions(temp_file.path(), perms).unwrap();
|
|
324
|
-
|
|
325
|
-
assert!(result.is_err(), "Permission denied should return error");
|
|
326
|
-
}
|
|
327
|
-
|
|
328
|
-
/// Test file extension mismatch - .pdf extension with DOCX content.
|
|
329
|
-
#[tokio::test]
|
|
330
|
-
async fn test_file_extension_mismatch() {
|
|
331
|
-
let config = ExtractionConfig::default();
|
|
332
|
-
|
|
333
|
-
let docx_magic = vec![0x50, 0x4B, 0x03, 0x04, 0x14, 0x00, 0x00, 0x00];
|
|
334
|
-
|
|
335
|
-
let result = extract_bytes(&docx_magic, "application/pdf", &config).await;
|
|
336
|
-
|
|
337
|
-
assert!(result.is_err(), "MIME type mismatch should fail");
|
|
338
|
-
}
|
|
339
|
-
|
|
340
|
-
/// Test extraction with null bytes in content.
|
|
341
|
-
#[tokio::test]
|
|
342
|
-
async fn test_null_bytes_in_content() {
|
|
343
|
-
let config = ExtractionConfig::default();
|
|
344
|
-
|
|
345
|
-
let data_with_nulls = b"Text before\x00null\x00bytes\x00after";
|
|
346
|
-
|
|
347
|
-
let result = extract_bytes(data_with_nulls, "text/plain", &config).await;
|
|
348
|
-
|
|
349
|
-
assert!(result.is_ok(), "Null bytes should be handled");
|
|
350
|
-
let extraction = result.unwrap();
|
|
351
|
-
|
|
352
|
-
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
353
|
-
assert!(
|
|
354
|
-
extraction.chunks.is_none(),
|
|
355
|
-
"Chunks should be None without chunking config"
|
|
356
|
-
);
|
|
357
|
-
|
|
358
|
-
assert!(
|
|
359
|
-
extraction.content.contains("Text before") || extraction.content.contains("after"),
|
|
360
|
-
"Should preserve at least some of the text content"
|
|
361
|
-
);
|
|
362
|
-
}
|
|
363
|
-
|
|
364
|
-
/// Test concurrent extractions of same file.
|
|
365
|
-
#[tokio::test]
|
|
366
|
-
async fn test_concurrent_extractions() {
|
|
367
|
-
let config = ExtractionConfig::default();
|
|
368
|
-
|
|
369
|
-
let text_data = b"Concurrent extraction test content.";
|
|
370
|
-
|
|
371
|
-
let handles: Vec<_> = (0..10)
|
|
372
|
-
.map(|_| {
|
|
373
|
-
let config = config.clone();
|
|
374
|
-
tokio::spawn(async move { extract_bytes(text_data, "text/plain", &config).await })
|
|
375
|
-
})
|
|
376
|
-
.collect();
|
|
377
|
-
|
|
378
|
-
for handle in handles {
|
|
379
|
-
let result = handle.await.expect("Task should complete");
|
|
380
|
-
assert!(result.is_ok(), "Concurrent extraction should succeed");
|
|
381
|
-
|
|
382
|
-
let extraction = result.unwrap();
|
|
383
|
-
assert!(
|
|
384
|
-
extraction.content.contains("Concurrent extraction"),
|
|
385
|
-
"Content should be extracted correctly"
|
|
386
|
-
);
|
|
387
|
-
assert!(extraction.chunks.is_none(), "Chunks should be None");
|
|
388
|
-
assert!(
|
|
389
|
-
extraction.detected_languages.is_none(),
|
|
390
|
-
"Language detection not enabled"
|
|
391
|
-
);
|
|
392
|
-
}
|
|
393
|
-
}
|
|
1
|
+
//! Error handling and edge case integration tests.
|
|
2
|
+
//!
|
|
3
|
+
//! Tests for corrupted files, edge cases, and invalid inputs.
|
|
4
|
+
//! Validates that the system handles errors gracefully without panics.
|
|
5
|
+
|
|
6
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
7
|
+
use kreuzberg::core::extractor::{extract_bytes, extract_file};
|
|
8
|
+
use std::io::Write;
|
|
9
|
+
use tempfile::NamedTempFile;
|
|
10
|
+
|
|
11
|
+
mod helpers;
|
|
12
|
+
|
|
13
|
+
/// Test truncated PDF - incomplete PDF file.
|
|
14
|
+
#[tokio::test]
|
|
15
|
+
async fn test_truncated_pdf() {
|
|
16
|
+
let config = ExtractionConfig::default();
|
|
17
|
+
|
|
18
|
+
let truncated_pdf = b"%PDF-1.4\n1 0 obj\n<<";
|
|
19
|
+
|
|
20
|
+
let result = extract_bytes(truncated_pdf, "application/pdf", &config).await;
|
|
21
|
+
|
|
22
|
+
assert!(result.is_err(), "Truncated PDF should fail gracefully");
|
|
23
|
+
|
|
24
|
+
let error = result.unwrap_err();
|
|
25
|
+
assert!(
|
|
26
|
+
matches!(error, kreuzberg::KreuzbergError::Parsing { .. }),
|
|
27
|
+
"Truncated PDF should produce Parsing error, got: {:?}",
|
|
28
|
+
error
|
|
29
|
+
);
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
/// Test corrupted ZIP - malformed archive.
|
|
33
|
+
#[tokio::test]
|
|
34
|
+
async fn test_corrupted_zip() {
|
|
35
|
+
let config = ExtractionConfig::default();
|
|
36
|
+
|
|
37
|
+
let corrupted_zip = vec![0x50, 0x4B, 0x03, 0x04, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00];
|
|
38
|
+
|
|
39
|
+
let result = extract_bytes(&corrupted_zip, "application/zip", &config).await;
|
|
40
|
+
|
|
41
|
+
assert!(result.is_err(), "Corrupted ZIP should fail gracefully");
|
|
42
|
+
|
|
43
|
+
let error = result.unwrap_err();
|
|
44
|
+
assert!(
|
|
45
|
+
matches!(error, kreuzberg::KreuzbergError::Parsing { .. }),
|
|
46
|
+
"Corrupted ZIP should produce Parsing error, got: {:?}",
|
|
47
|
+
error
|
|
48
|
+
);
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/// Test invalid XML - bad XML syntax.
|
|
52
|
+
#[tokio::test]
|
|
53
|
+
async fn test_invalid_xml() {
|
|
54
|
+
let config = ExtractionConfig::default();
|
|
55
|
+
|
|
56
|
+
let invalid_xml = b"<?xml version=\"1.0\"?>\n\
|
|
57
|
+
<root>\n\
|
|
58
|
+
<unclosed>\n\
|
|
59
|
+
<another>text</wrong_tag>\n\
|
|
60
|
+
</root";
|
|
61
|
+
|
|
62
|
+
let result = extract_bytes(invalid_xml, "application/xml", &config).await;
|
|
63
|
+
|
|
64
|
+
match result {
|
|
65
|
+
Ok(extraction) => {
|
|
66
|
+
assert!(
|
|
67
|
+
extraction.chunks.is_none(),
|
|
68
|
+
"Chunks should be None without chunking config"
|
|
69
|
+
);
|
|
70
|
+
}
|
|
71
|
+
Err(error) => {
|
|
72
|
+
assert!(
|
|
73
|
+
matches!(error, kreuzberg::KreuzbergError::Parsing { .. }),
|
|
74
|
+
"Invalid XML error should be Parsing type, got: {:?}",
|
|
75
|
+
error
|
|
76
|
+
);
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
/// Test corrupted image - invalid image data.
|
|
82
|
+
#[tokio::test]
|
|
83
|
+
async fn test_corrupted_image() {
|
|
84
|
+
let config = ExtractionConfig::default();
|
|
85
|
+
|
|
86
|
+
let corrupted_png = vec![0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0xFF, 0xFF, 0xFF, 0xFF];
|
|
87
|
+
|
|
88
|
+
let result = extract_bytes(&corrupted_png, "image/png", &config).await;
|
|
89
|
+
|
|
90
|
+
match result {
|
|
91
|
+
Ok(extraction) => {
|
|
92
|
+
assert!(
|
|
93
|
+
extraction.chunks.is_none(),
|
|
94
|
+
"Chunks should be None without chunking config"
|
|
95
|
+
);
|
|
96
|
+
}
|
|
97
|
+
Err(error) => {
|
|
98
|
+
assert!(
|
|
99
|
+
matches!(error, kreuzberg::KreuzbergError::Parsing { .. })
|
|
100
|
+
|| matches!(error, kreuzberg::KreuzbergError::Ocr { .. }),
|
|
101
|
+
"Corrupted image error should be Parsing or OCR type, got: {:?}",
|
|
102
|
+
error
|
|
103
|
+
);
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
/// Test empty file - 0 bytes.
|
|
109
|
+
#[tokio::test]
|
|
110
|
+
async fn test_empty_file() {
|
|
111
|
+
let config = ExtractionConfig::default();
|
|
112
|
+
|
|
113
|
+
let empty_data = b"";
|
|
114
|
+
|
|
115
|
+
let result_pdf = extract_bytes(empty_data, "application/pdf", &config).await;
|
|
116
|
+
let result_text = extract_bytes(empty_data, "text/plain", &config).await;
|
|
117
|
+
let result_xml = extract_bytes(empty_data, "application/xml", &config).await;
|
|
118
|
+
|
|
119
|
+
match result_pdf {
|
|
120
|
+
Ok(extraction) => {
|
|
121
|
+
assert!(
|
|
122
|
+
extraction.content.is_empty(),
|
|
123
|
+
"Empty PDF should have empty content if it succeeds"
|
|
124
|
+
);
|
|
125
|
+
assert!(extraction.chunks.is_none(), "Chunks should be None");
|
|
126
|
+
}
|
|
127
|
+
Err(error) => {
|
|
128
|
+
assert!(
|
|
129
|
+
matches!(
|
|
130
|
+
error,
|
|
131
|
+
kreuzberg::KreuzbergError::Parsing { .. } | kreuzberg::KreuzbergError::Validation { .. }
|
|
132
|
+
),
|
|
133
|
+
"Empty PDF should produce Parsing or Validation error, got: {:?}",
|
|
134
|
+
error
|
|
135
|
+
);
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
match result_text {
|
|
140
|
+
Ok(extraction) => {
|
|
141
|
+
assert!(
|
|
142
|
+
extraction.content.is_empty(),
|
|
143
|
+
"Empty text file should have empty content"
|
|
144
|
+
);
|
|
145
|
+
assert!(extraction.chunks.is_none(), "Chunks should be None");
|
|
146
|
+
}
|
|
147
|
+
Err(error) => {
|
|
148
|
+
panic!("Empty text file should not fail, got error: {:?}", error);
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
match result_xml {
|
|
153
|
+
Ok(extraction) => {
|
|
154
|
+
assert!(
|
|
155
|
+
extraction.content.is_empty(),
|
|
156
|
+
"Empty XML should have empty content if it succeeds"
|
|
157
|
+
);
|
|
158
|
+
assert!(extraction.chunks.is_none(), "Chunks should be None");
|
|
159
|
+
}
|
|
160
|
+
Err(error) => {
|
|
161
|
+
assert!(
|
|
162
|
+
matches!(error, kreuzberg::KreuzbergError::Parsing { .. }),
|
|
163
|
+
"Empty XML error should be Parsing type, got: {:?}",
|
|
164
|
+
error
|
|
165
|
+
);
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
/// Test very large file - stress test with large content.
|
|
171
|
+
#[tokio::test]
|
|
172
|
+
async fn test_very_large_file() {
|
|
173
|
+
let config = ExtractionConfig::default();
|
|
174
|
+
|
|
175
|
+
let large_text = "This is a line of text that will be repeated many times.\n".repeat(200_000);
|
|
176
|
+
let large_bytes = large_text.as_bytes();
|
|
177
|
+
|
|
178
|
+
let result = extract_bytes(large_bytes, "text/plain", &config).await;
|
|
179
|
+
|
|
180
|
+
assert!(result.is_ok(), "Large file should be processed successfully");
|
|
181
|
+
let extraction = result.unwrap();
|
|
182
|
+
|
|
183
|
+
assert!(!extraction.content.is_empty(), "Large file content should not be empty");
|
|
184
|
+
assert!(extraction.content.len() > 1_000_000, "Content should be large");
|
|
185
|
+
assert!(
|
|
186
|
+
extraction.chunks.is_none(),
|
|
187
|
+
"Chunks should be None without chunking config"
|
|
188
|
+
);
|
|
189
|
+
assert!(
|
|
190
|
+
extraction.detected_languages.is_none(),
|
|
191
|
+
"Language detection not enabled"
|
|
192
|
+
);
|
|
193
|
+
assert!(extraction.tables.is_empty(), "Text file should not have tables");
|
|
194
|
+
|
|
195
|
+
assert!(
|
|
196
|
+
extraction.content.contains("This is a line of text"),
|
|
197
|
+
"Content should preserve original text"
|
|
198
|
+
);
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
/// Test unicode filenames - non-ASCII paths.
|
|
202
|
+
#[tokio::test]
|
|
203
|
+
async fn test_unicode_filenames() {
|
|
204
|
+
let config = ExtractionConfig::default();
|
|
205
|
+
|
|
206
|
+
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
207
|
+
temp_file.write_all(b"Test content with Unicode filename.").unwrap();
|
|
208
|
+
|
|
209
|
+
let result = extract_file(temp_file.path(), Some("text/plain"), &config).await;
|
|
210
|
+
|
|
211
|
+
assert!(result.is_ok(), "Unicode filename should be handled");
|
|
212
|
+
let extraction = result.unwrap();
|
|
213
|
+
|
|
214
|
+
assert!(
|
|
215
|
+
extraction.content.contains("Test content"),
|
|
216
|
+
"Content should be extracted"
|
|
217
|
+
);
|
|
218
|
+
assert!(
|
|
219
|
+
extraction.chunks.is_none(),
|
|
220
|
+
"Chunks should be None without chunking config"
|
|
221
|
+
);
|
|
222
|
+
assert!(
|
|
223
|
+
extraction.detected_languages.is_none(),
|
|
224
|
+
"Language detection not enabled"
|
|
225
|
+
);
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
/// Test special characters in content - emojis, RTL text.
|
|
229
|
+
#[tokio::test]
|
|
230
|
+
async fn test_special_characters_content() {
|
|
231
|
+
let config = ExtractionConfig::default();
|
|
232
|
+
|
|
233
|
+
let special_text = "Emojis: 🎉 🚀 ✅ 🌍\n\
|
|
234
|
+
Arabic (RTL): مرحبا بالعالم\n\
|
|
235
|
+
Chinese: 你好世界\n\
|
|
236
|
+
Japanese: こんにちは世界\n\
|
|
237
|
+
Special chars: © ® ™ € £ ¥\n\
|
|
238
|
+
Math symbols: ∑ ∫ √ ≈ ∞";
|
|
239
|
+
|
|
240
|
+
let result = extract_bytes(special_text.as_bytes(), "text/plain", &config).await;
|
|
241
|
+
|
|
242
|
+
assert!(result.is_ok(), "Special characters should be handled");
|
|
243
|
+
let extraction = result.unwrap();
|
|
244
|
+
|
|
245
|
+
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
246
|
+
assert!(extraction.content.len() > 10, "Should have substantial content");
|
|
247
|
+
assert!(
|
|
248
|
+
extraction.chunks.is_none(),
|
|
249
|
+
"Chunks should be None without chunking config"
|
|
250
|
+
);
|
|
251
|
+
assert!(
|
|
252
|
+
extraction.detected_languages.is_none(),
|
|
253
|
+
"Language detection not enabled"
|
|
254
|
+
);
|
|
255
|
+
|
|
256
|
+
assert!(
|
|
257
|
+
extraction.content.contains("Emojis")
|
|
258
|
+
|| extraction.content.contains("Arabic")
|
|
259
|
+
|| extraction.content.contains("Chinese"),
|
|
260
|
+
"Should preserve at least some special character text"
|
|
261
|
+
);
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
/// Test nonexistent file - file not found.
|
|
265
|
+
#[tokio::test]
|
|
266
|
+
async fn test_nonexistent_file() {
|
|
267
|
+
let config = ExtractionConfig::default();
|
|
268
|
+
|
|
269
|
+
let nonexistent_path = "/nonexistent/path/to/file.pdf";
|
|
270
|
+
|
|
271
|
+
let result = extract_file(nonexistent_path, Some("application/pdf"), &config).await;
|
|
272
|
+
|
|
273
|
+
assert!(result.is_err(), "Nonexistent file should return error");
|
|
274
|
+
|
|
275
|
+
let error = result.unwrap_err();
|
|
276
|
+
assert!(
|
|
277
|
+
matches!(error, kreuzberg::KreuzbergError::Io(_))
|
|
278
|
+
|| matches!(error, kreuzberg::KreuzbergError::Validation { .. }),
|
|
279
|
+
"Should be IO or Validation error for nonexistent file, got: {:?}",
|
|
280
|
+
error
|
|
281
|
+
);
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
/// Test unsupported format - unknown file type.
|
|
285
|
+
#[tokio::test]
|
|
286
|
+
async fn test_unsupported_format() {
|
|
287
|
+
let config = ExtractionConfig::default();
|
|
288
|
+
|
|
289
|
+
let data = b"Some random data";
|
|
290
|
+
|
|
291
|
+
let result = extract_bytes(data, "application/x-unknown-format", &config).await;
|
|
292
|
+
|
|
293
|
+
assert!(result.is_err(), "Unsupported format should return error");
|
|
294
|
+
|
|
295
|
+
let error = result.unwrap_err();
|
|
296
|
+
assert!(
|
|
297
|
+
matches!(error, kreuzberg::KreuzbergError::UnsupportedFormat(_)),
|
|
298
|
+
"Should be UnsupportedFormat error, got: {:?}",
|
|
299
|
+
error
|
|
300
|
+
);
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
/// Test permission denied - no read access (platform-specific).
|
|
304
|
+
#[tokio::test]
|
|
305
|
+
#[cfg(unix)]
|
|
306
|
+
async fn test_permission_denied() {
|
|
307
|
+
use std::fs;
|
|
308
|
+
use std::os::unix::fs::PermissionsExt;
|
|
309
|
+
|
|
310
|
+
let config = ExtractionConfig::default();
|
|
311
|
+
|
|
312
|
+
let mut temp_file = NamedTempFile::new().expect("Should create temp file");
|
|
313
|
+
temp_file.write_all(b"Test content").unwrap();
|
|
314
|
+
|
|
315
|
+
let mut perms = fs::metadata(temp_file.path()).unwrap().permissions();
|
|
316
|
+
perms.set_mode(0o000);
|
|
317
|
+
fs::set_permissions(temp_file.path(), perms).unwrap();
|
|
318
|
+
|
|
319
|
+
let result = extract_file(temp_file.path(), Some("text/plain"), &config).await;
|
|
320
|
+
|
|
321
|
+
let mut perms = fs::metadata(temp_file.path()).unwrap().permissions();
|
|
322
|
+
perms.set_mode(0o644);
|
|
323
|
+
fs::set_permissions(temp_file.path(), perms).unwrap();
|
|
324
|
+
|
|
325
|
+
assert!(result.is_err(), "Permission denied should return error");
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
/// Test file extension mismatch - .pdf extension with DOCX content.
|
|
329
|
+
#[tokio::test]
|
|
330
|
+
async fn test_file_extension_mismatch() {
|
|
331
|
+
let config = ExtractionConfig::default();
|
|
332
|
+
|
|
333
|
+
let docx_magic = vec![0x50, 0x4B, 0x03, 0x04, 0x14, 0x00, 0x00, 0x00];
|
|
334
|
+
|
|
335
|
+
let result = extract_bytes(&docx_magic, "application/pdf", &config).await;
|
|
336
|
+
|
|
337
|
+
assert!(result.is_err(), "MIME type mismatch should fail");
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
/// Test extraction with null bytes in content.
|
|
341
|
+
#[tokio::test]
|
|
342
|
+
async fn test_null_bytes_in_content() {
|
|
343
|
+
let config = ExtractionConfig::default();
|
|
344
|
+
|
|
345
|
+
let data_with_nulls = b"Text before\x00null\x00bytes\x00after";
|
|
346
|
+
|
|
347
|
+
let result = extract_bytes(data_with_nulls, "text/plain", &config).await;
|
|
348
|
+
|
|
349
|
+
assert!(result.is_ok(), "Null bytes should be handled");
|
|
350
|
+
let extraction = result.unwrap();
|
|
351
|
+
|
|
352
|
+
assert!(!extraction.content.is_empty(), "Content should not be empty");
|
|
353
|
+
assert!(
|
|
354
|
+
extraction.chunks.is_none(),
|
|
355
|
+
"Chunks should be None without chunking config"
|
|
356
|
+
);
|
|
357
|
+
|
|
358
|
+
assert!(
|
|
359
|
+
extraction.content.contains("Text before") || extraction.content.contains("after"),
|
|
360
|
+
"Should preserve at least some of the text content"
|
|
361
|
+
);
|
|
362
|
+
}
|
|
363
|
+
|
|
364
|
+
/// Test concurrent extractions of same file.
|
|
365
|
+
#[tokio::test]
|
|
366
|
+
async fn test_concurrent_extractions() {
|
|
367
|
+
let config = ExtractionConfig::default();
|
|
368
|
+
|
|
369
|
+
let text_data = b"Concurrent extraction test content.";
|
|
370
|
+
|
|
371
|
+
let handles: Vec<_> = (0..10)
|
|
372
|
+
.map(|_| {
|
|
373
|
+
let config = config.clone();
|
|
374
|
+
tokio::spawn(async move { extract_bytes(text_data, "text/plain", &config).await })
|
|
375
|
+
})
|
|
376
|
+
.collect();
|
|
377
|
+
|
|
378
|
+
for handle in handles {
|
|
379
|
+
let result = handle.await.expect("Task should complete");
|
|
380
|
+
assert!(result.is_ok(), "Concurrent extraction should succeed");
|
|
381
|
+
|
|
382
|
+
let extraction = result.unwrap();
|
|
383
|
+
assert!(
|
|
384
|
+
extraction.content.contains("Concurrent extraction"),
|
|
385
|
+
"Content should be extracted correctly"
|
|
386
|
+
);
|
|
387
|
+
assert!(extraction.chunks.is_none(), "Chunks should be None");
|
|
388
|
+
assert!(
|
|
389
|
+
extraction.detected_languages.is_none(),
|
|
390
|
+
"Language detection not enabled"
|
|
391
|
+
);
|
|
392
|
+
}
|
|
393
|
+
}
|