RubyGems - kreuzberg - Versions diffs - 4.0.0.rc1 → 4.0.0.rc2 - Mend

kreuzberg 4.0.0.rc1 → 4.0.0.rc2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (342) hide show

checksums.yaml +4 -4
data/.gitignore +14 -8
data/.rspec +3 -3
data/.rubocop.yaml +1 -534
data/.rubocop.yml +538 -0
data/Gemfile +8 -9
data/Gemfile.lock +9 -109
data/README.md +426 -421
data/Rakefile +25 -25
data/Steepfile +47 -47
data/examples/async_patterns.rb +341 -340
data/ext/kreuzberg_rb/extconf.rb +45 -35
data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
data/ext/kreuzberg_rb/native/Cargo.toml +44 -36
data/ext/kreuzberg_rb/native/README.md +425 -425
data/ext/kreuzberg_rb/native/build.rs +15 -17
data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
data/ext/kreuzberg_rb/native/include/strings.h +20 -20
data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
data/ext/kreuzberg_rb/native/src/lib.rs +2998 -2939
data/extconf.rb +28 -28
data/kreuzberg.gemspec +148 -105
data/lib/kreuzberg/api_proxy.rb +142 -142
data/lib/kreuzberg/cache_api.rb +46 -45
data/lib/kreuzberg/cli.rb +55 -55
data/lib/kreuzberg/cli_proxy.rb +127 -127
data/lib/kreuzberg/config.rb +691 -684
data/lib/kreuzberg/error_context.rb +32 -0
data/lib/kreuzberg/errors.rb +118 -50
data/lib/kreuzberg/extraction_api.rb +85 -84
data/lib/kreuzberg/mcp_proxy.rb +186 -186
data/lib/kreuzberg/ocr_backend_protocol.rb +113 -113
data/lib/kreuzberg/post_processor_protocol.rb +86 -86
data/lib/kreuzberg/result.rb +216 -216
data/lib/kreuzberg/setup_lib_path.rb +80 -79
data/lib/kreuzberg/validator_protocol.rb +89 -89
data/lib/kreuzberg/version.rb +5 -5
data/lib/kreuzberg.rb +103 -82
data/sig/kreuzberg/internal.rbs +184 -184
data/sig/kreuzberg.rbs +520 -468
data/spec/binding/cache_spec.rb +227 -227
data/spec/binding/cli_proxy_spec.rb +85 -87
data/spec/binding/cli_spec.rb +55 -54
data/spec/binding/config_spec.rb +345 -345
data/spec/binding/config_validation_spec.rb +283 -283
data/spec/binding/error_handling_spec.rb +213 -213
data/spec/binding/errors_spec.rb +66 -66
data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
data/spec/binding/plugins/postprocessor_spec.rb +269 -269
data/spec/binding/plugins/validator_spec.rb +274 -274
data/spec/fixtures/config.toml +39 -39
data/spec/fixtures/config.yaml +41 -42
data/spec/fixtures/invalid_config.toml +4 -4
data/spec/smoke/package_spec.rb +178 -178
data/spec/spec_helper.rb +42 -42
data/vendor/kreuzberg/Cargo.toml +204 -134
data/vendor/kreuzberg/README.md +175 -175
data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
data/vendor/kreuzberg/build.rs +474 -460
data/vendor/kreuzberg/src/api/error.rs +81 -81
data/vendor/kreuzberg/src/api/handlers.rs +199 -199
data/vendor/kreuzberg/src/api/mod.rs +79 -79
data/vendor/kreuzberg/src/api/server.rs +353 -353
data/vendor/kreuzberg/src/api/types.rs +170 -170
data/vendor/kreuzberg/src/cache/mod.rs +1167 -1143
data/vendor/kreuzberg/src/chunking/mod.rs +677 -677
data/vendor/kreuzberg/src/core/batch_mode.rs +95 -35
data/vendor/kreuzberg/src/core/config.rs +1032 -1032
data/vendor/kreuzberg/src/core/extractor.rs +1024 -903
data/vendor/kreuzberg/src/core/io.rs +329 -327
data/vendor/kreuzberg/src/core/mime.rs +605 -615
data/vendor/kreuzberg/src/core/mod.rs +45 -42
data/vendor/kreuzberg/src/core/pipeline.rs +984 -906
data/vendor/kreuzberg/src/embeddings.rs +432 -323
data/vendor/kreuzberg/src/error.rs +431 -431
data/vendor/kreuzberg/src/extraction/archive.rs +954 -954
data/vendor/kreuzberg/src/extraction/docx.rs +40 -40
data/vendor/kreuzberg/src/extraction/email.rs +854 -854
data/vendor/kreuzberg/src/extraction/excel.rs +688 -688
data/vendor/kreuzberg/src/extraction/html.rs +553 -553
data/vendor/kreuzberg/src/extraction/image.rs +368 -368
data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -564
data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
data/vendor/kreuzberg/src/extraction/mod.rs +81 -77
data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -128
data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -3000
data/vendor/kreuzberg/src/extraction/structured.rs +490 -490
data/vendor/kreuzberg/src/extraction/table.rs +328 -328
data/vendor/kreuzberg/src/extraction/text.rs +269 -269
data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
data/vendor/kreuzberg/src/extractors/archive.rs +446 -425
data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
data/vendor/kreuzberg/src/extractors/docx.rs +367 -479
data/vendor/kreuzberg/src/extractors/email.rs +143 -129
data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
data/vendor/kreuzberg/src/extractors/excel.rs +343 -344
data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
data/vendor/kreuzberg/src/extractors/html.rs +393 -410
data/vendor/kreuzberg/src/extractors/image.rs +198 -195
data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
data/vendor/kreuzberg/src/extractors/mod.rs +365 -268
data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
data/vendor/kreuzberg/src/extractors/pdf.rs +493 -496
data/vendor/kreuzberg/src/extractors/pptx.rs +248 -234
data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
data/vendor/kreuzberg/src/extractors/security.rs +484 -0
data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
data/vendor/kreuzberg/src/extractors/structured.rs +140 -126
data/vendor/kreuzberg/src/extractors/text.rs +260 -242
data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
data/vendor/kreuzberg/src/extractors/xml.rs +135 -128
data/vendor/kreuzberg/src/image/dpi.rs +164 -164
data/vendor/kreuzberg/src/image/mod.rs +6 -6
data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
data/vendor/kreuzberg/src/image/resize.rs +89 -89
data/vendor/kreuzberg/src/keywords/config.rs +154 -154
data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
data/vendor/kreuzberg/src/keywords/processor.rs +267 -267
data/vendor/kreuzberg/src/keywords/rake.rs +293 -294
data/vendor/kreuzberg/src/keywords/types.rs +68 -68
data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
data/vendor/kreuzberg/src/language_detection/mod.rs +942 -942
data/vendor/kreuzberg/src/lib.rs +105 -102
data/vendor/kreuzberg/src/mcp/mod.rs +32 -32
data/vendor/kreuzberg/src/mcp/server.rs +1968 -1966
data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
data/vendor/kreuzberg/src/ocr/error.rs +37 -37
data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
data/vendor/kreuzberg/src/ocr/mod.rs +58 -58
data/vendor/kreuzberg/src/ocr/processor.rs +863 -847
data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -450
data/vendor/kreuzberg/src/ocr/types.rs +393 -393
data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
data/vendor/kreuzberg/src/panic_context.rs +154 -0
data/vendor/kreuzberg/src/pdf/error.rs +122 -122
data/vendor/kreuzberg/src/pdf/images.rs +139 -139
data/vendor/kreuzberg/src/pdf/metadata.rs +346 -346
data/vendor/kreuzberg/src/pdf/mod.rs +50 -50
data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
data/vendor/kreuzberg/src/pdf/table.rs +393 -420
data/vendor/kreuzberg/src/pdf/text.rs +158 -161
data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -1010
data/vendor/kreuzberg/src/plugins/mod.rs +209 -209
data/vendor/kreuzberg/src/plugins/ocr.rs +620 -629
data/vendor/kreuzberg/src/plugins/processor.rs +642 -641
data/vendor/kreuzberg/src/plugins/registry.rs +1337 -1324
data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
data/vendor/kreuzberg/src/plugins/validator.rs +956 -955
data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
data/vendor/kreuzberg/src/text/mod.rs +19 -19
data/vendor/kreuzberg/src/text/quality.rs +697 -697
data/vendor/kreuzberg/src/text/string_utils.rs +217 -217
data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -796
data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -902
data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -147
data/vendor/kreuzberg/src/types.rs +903 -873
data/vendor/kreuzberg/src/utils/mod.rs +17 -17
data/vendor/kreuzberg/src/utils/quality.rs +959 -959
data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
data/vendor/kreuzberg/tests/api_tests.rs +966 -966
data/vendor/kreuzberg/tests/archive_integration.rs +543 -543
data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -542
data/vendor/kreuzberg/tests/batch_processing.rs +316 -304
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -509
data/vendor/kreuzberg/tests/config_features.rs +598 -580
data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -439
data/vendor/kreuzberg/tests/core_integration.rs +510 -493
data/vendor/kreuzberg/tests/csv_integration.rs +414 -424
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -124
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
data/vendor/kreuzberg/tests/email_integration.rs +325 -325
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
data/vendor/kreuzberg/tests/error_handling.rs +393 -393
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
data/vendor/kreuzberg/tests/format_integration.rs +159 -159
data/vendor/kreuzberg/tests/helpers/mod.rs +142 -142
data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
data/vendor/kreuzberg/tests/image_integration.rs +253 -253
data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
data/vendor/kreuzberg/tests/mime_detection.rs +428 -428
data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -510
data/vendor/kreuzberg/tests/ocr_errors.rs +676 -676
data/vendor/kreuzberg/tests/ocr_quality.rs +627 -627
data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
data/vendor/kreuzberg/tests/pdf_integration.rs +43 -43
data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -1412
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -771
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -561
data/vendor/kreuzberg/tests/plugin_system.rs +921 -921
data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -607
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
data/vendor/kreuzberg/tests/security_validation.rs +415 -404
data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
data/vendor/kreuzberg/tests/test_fastembed.rs +609 -609
data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
data/vendor/rb-sys/.cargo-ok +1 -0
data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
data/vendor/rb-sys/Cargo.lock +393 -0
data/vendor/rb-sys/Cargo.toml +70 -0
data/vendor/rb-sys/Cargo.toml.orig +57 -0
data/vendor/rb-sys/LICENSE-APACHE +190 -0
data/vendor/rb-sys/LICENSE-MIT +21 -0
data/vendor/rb-sys/bin/release.sh +21 -0
data/vendor/rb-sys/build/features.rs +108 -0
data/vendor/rb-sys/build/main.rs +246 -0
data/vendor/rb-sys/build/stable_api_config.rs +153 -0
data/vendor/rb-sys/build/version.rs +48 -0
data/vendor/rb-sys/readme.md +36 -0
data/vendor/rb-sys/src/bindings.rs +21 -0
data/vendor/rb-sys/src/hidden.rs +11 -0
data/vendor/rb-sys/src/lib.rs +34 -0
data/vendor/rb-sys/src/macros.rs +371 -0
data/vendor/rb-sys/src/memory.rs +53 -0
data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
data/vendor/rb-sys/src/special_consts.rs +31 -0
data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
data/vendor/rb-sys/src/stable_api.rs +261 -0
data/vendor/rb-sys/src/symbol.rs +31 -0
data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
data/vendor/rb-sys/src/utils.rs +89 -0
data/vendor/rb-sys/src/value_type.rs +7 -0
metadata +90 -95
data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
data/spec/examples.txt +0 -104
data/vendor/kreuzberg/src/bin/profile_extract.rs +0 -455
data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +0 -275
data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +0 -178
data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +0 -491
data/vendor/kreuzberg/src/extraction/pandoc/server.rs +0 -496
data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +0 -1188
data/vendor/kreuzberg/src/extraction/pandoc/version.rs +0 -162
data/vendor/kreuzberg/src/extractors/pandoc.rs +0 -201
data/vendor/kreuzberg/tests/chunking_offset_demo.rs +0 -92
data/vendor/kreuzberg/tests/pandoc_integration.rs +0 -503

data/vendor/kreuzberg/tests/error_handling.rs CHANGED Viewed

@@ -1,393 +1,393 @@
-//! Error handling and edge case integration tests.
-//!
-//! Tests for corrupted files, edge cases, and invalid inputs.
-//! Validates that the system handles errors gracefully without panics.
-use kreuzberg::core::config::ExtractionConfig;
-use kreuzberg::core::extractor::{extract_bytes, extract_file};
-use std::io::Write;
-use tempfile::NamedTempFile;
-mod helpers;
-/// Test truncated PDF - incomplete PDF file.
-#[tokio::test]
-async fn test_truncated_pdf() {
-    let config = ExtractionConfig::default();
-    let truncated_pdf = b"%PDF-1.4\n1 0 obj\n<<";
-    let result = extract_bytes(truncated_pdf, "application/pdf", &config).await;
-    assert!(result.is_err(), "Truncated PDF should fail gracefully");
-    let error = result.unwrap_err();
-    assert!(
-        matches!(error, kreuzberg::KreuzbergError::Parsing { .. }),
-        "Truncated PDF should produce Parsing error, got: {:?}",
-        error
-    );
-}
-/// Test corrupted ZIP - malformed archive.
-#[tokio::test]
-async fn test_corrupted_zip() {
-    let config = ExtractionConfig::default();
-    let corrupted_zip = vec![0x50, 0x4B, 0x03, 0x04, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00];
-    let result = extract_bytes(&corrupted_zip, "application/zip", &config).await;
-    assert!(result.is_err(), "Corrupted ZIP should fail gracefully");
-    let error = result.unwrap_err();
-    assert!(
-        matches!(error, kreuzberg::KreuzbergError::Parsing { .. }),
-        "Corrupted ZIP should produce Parsing error, got: {:?}",
-        error
-    );
-}
-/// Test invalid XML - bad XML syntax.
-#[tokio::test]
-async fn test_invalid_xml() {
-    let config = ExtractionConfig::default();
-    let invalid_xml = b"<?xml version=\"1.0\"?>\n\
-<root>\n\
-<unclosed>\n\
-<another>text</wrong_tag>\n\
-</root";
-    let result = extract_bytes(invalid_xml, "application/xml", &config).await;
-    match result {
-        Ok(extraction) => {
-            assert!(
-                extraction.chunks.is_none(),
-                "Chunks should be None without chunking config"
-            );
-        }
-        Err(error) => {
-            assert!(
-                matches!(error, kreuzberg::KreuzbergError::Parsing { .. }),
-                "Invalid XML error should be Parsing type, got: {:?}",
-                error
-            );
-        }
-    }
-}
-/// Test corrupted image - invalid image data.
-#[tokio::test]
-async fn test_corrupted_image() {
-    let config = ExtractionConfig::default();
-    let corrupted_png = vec![0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0xFF, 0xFF, 0xFF, 0xFF];
-    let result = extract_bytes(&corrupted_png, "image/png", &config).await;
-    match result {
-        Ok(extraction) => {
-            assert!(
-                extraction.chunks.is_none(),
-                "Chunks should be None without chunking config"
-            );
-        }
-        Err(error) => {
-            assert!(
-                matches!(error, kreuzberg::KreuzbergError::Parsing { .. })
-                    || matches!(error, kreuzberg::KreuzbergError::Ocr { .. }),
-                "Corrupted image error should be Parsing or OCR type, got: {:?}",
-                error
-            );
-        }
-    }
-}
-/// Test empty file - 0 bytes.
-#[tokio::test]
-async fn test_empty_file() {
-    let config = ExtractionConfig::default();
-    let empty_data = b"";
-    let result_pdf = extract_bytes(empty_data, "application/pdf", &config).await;
-    let result_text = extract_bytes(empty_data, "text/plain", &config).await;
-    let result_xml = extract_bytes(empty_data, "application/xml", &config).await;
-    match result_pdf {
-        Ok(extraction) => {
-            assert!(
-                extraction.content.is_empty(),
-                "Empty PDF should have empty content if it succeeds"
-            );
-            assert!(extraction.chunks.is_none(), "Chunks should be None");
-        }
-        Err(error) => {
-            assert!(
-                matches!(
-                    error,
-                    kreuzberg::KreuzbergError::Parsing { .. } | kreuzberg::KreuzbergError::Validation { .. }
-                ),
-                "Empty PDF should produce Parsing or Validation error, got: {:?}",
-                error
-            );
-        }
-    }
-    match result_text {
-        Ok(extraction) => {
-            assert!(
-                extraction.content.is_empty(),
-                "Empty text file should have empty content"
-            );
-            assert!(extraction.chunks.is_none(), "Chunks should be None");
-        }
-        Err(error) => {
-            panic!("Empty text file should not fail, got error: {:?}", error);
-        }
-    }
-    match result_xml {
-        Ok(extraction) => {
-            assert!(
-                extraction.content.is_empty(),
-                "Empty XML should have empty content if it succeeds"
-            );
-            assert!(extraction.chunks.is_none(), "Chunks should be None");
-        }
-        Err(error) => {
-            assert!(
-                matches!(error, kreuzberg::KreuzbergError::Parsing { .. }),
-                "Empty XML error should be Parsing type, got: {:?}",
-                error
-            );
-        }
-    }
-}
-/// Test very large file - stress test with large content.
-#[tokio::test]
-async fn test_very_large_file() {
-    let config = ExtractionConfig::default();
-    let large_text = "This is a line of text that will be repeated many times.\n".repeat(200_000);
-    let large_bytes = large_text.as_bytes();
-    let result = extract_bytes(large_bytes, "text/plain", &config).await;
-    assert!(result.is_ok(), "Large file should be processed successfully");
-    let extraction = result.unwrap();
-    assert!(!extraction.content.is_empty(), "Large file content should not be empty");
-    assert!(extraction.content.len() > 1_000_000, "Content should be large");
-    assert!(
-        extraction.chunks.is_none(),
-        "Chunks should be None without chunking config"
-    );
-    assert!(
-        extraction.detected_languages.is_none(),
-        "Language detection not enabled"
-    );
-    assert!(extraction.tables.is_empty(), "Text file should not have tables");
-    assert!(
-        extraction.content.contains("This is a line of text"),
-        "Content should preserve original text"
-    );
-}
-/// Test unicode filenames - non-ASCII paths.
-#[tokio::test]
-async fn test_unicode_filenames() {
-    let config = ExtractionConfig::default();
-    let mut temp_file = NamedTempFile::new().expect("Should create temp file");
-    temp_file.write_all(b"Test content with Unicode filename.").unwrap();
-    let result = extract_file(temp_file.path(), Some("text/plain"), &config).await;
-    assert!(result.is_ok(), "Unicode filename should be handled");
-    let extraction = result.unwrap();
-    assert!(
-        extraction.content.contains("Test content"),
-        "Content should be extracted"
-    );
-    assert!(
-        extraction.chunks.is_none(),
-        "Chunks should be None without chunking config"
-    );
-    assert!(
-        extraction.detected_languages.is_none(),
-        "Language detection not enabled"
-    );
-}
-/// Test special characters in content - emojis, RTL text.
-#[tokio::test]
-async fn test_special_characters_content() {
-    let config = ExtractionConfig::default();
-    let special_text = "Emojis: 🎉 🚀 ✅ 🌍\n\
-Arabic (RTL): مرحبا بالعالم\n\
-Chinese: 你好世界\n\
-Japanese: こんにちは世界\n\
-Special chars: © ® ™ € £ ¥\n\
-Math symbols: ∑ ∫ √ ≈ ∞";
-    let result = extract_bytes(special_text.as_bytes(), "text/plain", &config).await;
-    assert!(result.is_ok(), "Special characters should be handled");
-    let extraction = result.unwrap();
-    assert!(!extraction.content.is_empty(), "Content should not be empty");
-    assert!(extraction.content.len() > 10, "Should have substantial content");
-    assert!(
-        extraction.chunks.is_none(),
-        "Chunks should be None without chunking config"
-    );
-    assert!(
-        extraction.detected_languages.is_none(),
-        "Language detection not enabled"
-    );
-    assert!(
-        extraction.content.contains("Emojis")
-            || extraction.content.contains("Arabic")
-            || extraction.content.contains("Chinese"),
-        "Should preserve at least some special character text"
-    );
-}
-/// Test nonexistent file - file not found.
-#[tokio::test]
-async fn test_nonexistent_file() {
-    let config = ExtractionConfig::default();
-    let nonexistent_path = "/nonexistent/path/to/file.pdf";
-    let result = extract_file(nonexistent_path, Some("application/pdf"), &config).await;
-    assert!(result.is_err(), "Nonexistent file should return error");
-    let error = result.unwrap_err();
-    assert!(
-        matches!(error, kreuzberg::KreuzbergError::Io(_))
-            || matches!(error, kreuzberg::KreuzbergError::Validation { .. }),
-        "Should be IO or Validation error for nonexistent file, got: {:?}",
-        error
-    );
-}
-/// Test unsupported format - unknown file type.
-#[tokio::test]
-async fn test_unsupported_format() {
-    let config = ExtractionConfig::default();
-    let data = b"Some random data";
-    let result = extract_bytes(data, "application/x-unknown-format", &config).await;
-    assert!(result.is_err(), "Unsupported format should return error");
-    let error = result.unwrap_err();
-    assert!(
-        matches!(error, kreuzberg::KreuzbergError::UnsupportedFormat(_)),
-        "Should be UnsupportedFormat error, got: {:?}",
-        error
-    );
-}
-/// Test permission denied - no read access (platform-specific).
-#[tokio::test]
-#[cfg(unix)]
-async fn test_permission_denied() {
-    use std::fs;
-    use std::os::unix::fs::PermissionsExt;
-    let config = ExtractionConfig::default();
-    let mut temp_file = NamedTempFile::new().expect("Should create temp file");
-    temp_file.write_all(b"Test content").unwrap();
-    let mut perms = fs::metadata(temp_file.path()).unwrap().permissions();
-    perms.set_mode(0o000);
-    fs::set_permissions(temp_file.path(), perms).unwrap();
-    let result = extract_file(temp_file.path(), Some("text/plain"), &config).await;
-    let mut perms = fs::metadata(temp_file.path()).unwrap().permissions();
-    perms.set_mode(0o644);
-    fs::set_permissions(temp_file.path(), perms).unwrap();
-    assert!(result.is_err(), "Permission denied should return error");
-}
-/// Test file extension mismatch - .pdf extension with DOCX content.
-#[tokio::test]
-async fn test_file_extension_mismatch() {
-    let config = ExtractionConfig::default();
-    let docx_magic = vec![0x50, 0x4B, 0x03, 0x04, 0x14, 0x00, 0x00, 0x00];
-    let result = extract_bytes(&docx_magic, "application/pdf", &config).await;
-    assert!(result.is_err(), "MIME type mismatch should fail");
-}
-/// Test extraction with null bytes in content.
-#[tokio::test]
-async fn test_null_bytes_in_content() {
-    let config = ExtractionConfig::default();
-    let data_with_nulls = b"Text before\x00null\x00bytes\x00after";
-    let result = extract_bytes(data_with_nulls, "text/plain", &config).await;
-    assert!(result.is_ok(), "Null bytes should be handled");
-    let extraction = result.unwrap();
-    assert!(!extraction.content.is_empty(), "Content should not be empty");
-    assert!(
-        extraction.chunks.is_none(),
-        "Chunks should be None without chunking config"
-    );
-    assert!(
-        extraction.content.contains("Text before") || extraction.content.contains("after"),
-        "Should preserve at least some of the text content"
-    );
-}
-/// Test concurrent extractions of same file.
-#[tokio::test]
-async fn test_concurrent_extractions() {
-    let config = ExtractionConfig::default();
-    let text_data = b"Concurrent extraction test content.";
-    let handles: Vec<_> = (0..10)
-        .map(|_| {
-            let config = config.clone();
-            tokio::spawn(async move { extract_bytes(text_data, "text/plain", &config).await })
-        })
-        .collect();
-    for handle in handles {
-        let result = handle.await.expect("Task should complete");
-        assert!(result.is_ok(), "Concurrent extraction should succeed");
-        let extraction = result.unwrap();
-        assert!(
-            extraction.content.contains("Concurrent extraction"),
-            "Content should be extracted correctly"
-        );
-        assert!(extraction.chunks.is_none(), "Chunks should be None");
-        assert!(
-            extraction.detected_languages.is_none(),
-            "Language detection not enabled"
-        );
-    }
-}
+//! Error handling and edge case integration tests.
+//!
+//! Tests for corrupted files, edge cases, and invalid inputs.
+//! Validates that the system handles errors gracefully without panics.
+use kreuzberg::core::config::ExtractionConfig;
+use kreuzberg::core::extractor::{extract_bytes, extract_file};
+use std::io::Write;
+use tempfile::NamedTempFile;
+mod helpers;
+/// Truncated PDF: the payload stops right after the first object header.
+///
+/// Extraction must return an error, and that error must be the `Parsing` variant.
+#[tokio::test]
+async fn test_truncated_pdf() {
+    use kreuzberg::KreuzbergError;
+    let cfg = ExtractionConfig::default();
+    let partial_pdf = b"%PDF-1.4\n1 0 obj\n<<";
+    let outcome = extract_bytes(partial_pdf, "application/pdf", &cfg).await;
+    // Bind the error directly; an `Ok` here is the failure case.
+    let Err(err) = outcome else {
+        panic!("Truncated PDF should fail gracefully");
+    };
+    assert!(
+        matches!(err, KreuzbergError::Parsing { .. }),
+        "Truncated PDF should produce Parsing error, got: {err:?}"
+    );
+}
+/// Corrupted ZIP: a local-file-header signature followed by junk bytes.
+///
+/// Extraction must return an error, and that error must be the `Parsing` variant.
+#[tokio::test]
+async fn test_corrupted_zip() {
+    use kreuzberg::KreuzbergError;
+    let cfg = ExtractionConfig::default();
+    // "PK\x03\x04" signature, then four 0xFF bytes and four zero bytes.
+    let broken_archive: [u8; 12] = [0x50, 0x4B, 0x03, 0x04, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00];
+    match extract_bytes(&broken_archive, "application/zip", &cfg).await {
+        Ok(_) => panic!("Corrupted ZIP should fail gracefully"),
+        Err(err) => assert!(
+            matches!(err, KreuzbergError::Parsing { .. }),
+            "Corrupted ZIP should produce Parsing error, got: {err:?}"
+        ),
+    }
+}
+/// Invalid XML: unclosed elements, a mismatched closing tag and a cut-off root.
+///
+/// Either outcome is accepted: a successful extraction must carry no chunks,
+/// and a failure must be the `Parsing` variant.
+#[tokio::test]
+async fn test_invalid_xml() {
+    use kreuzberg::KreuzbergError;
+    let cfg = ExtractionConfig::default();
+    let malformed = b"<?xml version=\"1.0\"?>\n\
+<root>\n\
+<unclosed>\n\
+<another>text</wrong_tag>\n\
+</root";
+    let outcome = extract_bytes(malformed, "application/xml", &cfg).await;
+    // Handle the failure path first and leave the success path unindented.
+    let extracted = match outcome {
+        Err(err) => {
+            assert!(
+                matches!(err, KreuzbergError::Parsing { .. }),
+                "Invalid XML error should be Parsing type, got: {err:?}"
+            );
+            return;
+        }
+        Ok(value) => value,
+    };
+    assert!(
+        extracted.chunks.is_none(),
+        "Chunks should be None without chunking config"
+    );
+}
+/// Corrupted image: a valid PNG signature followed by four 0xFF bytes.
+///
+/// Either outcome is accepted: a successful extraction must carry no chunks,
+/// and a failure must be the `Parsing` or `Ocr` variant.
+#[tokio::test]
+async fn test_corrupted_image() {
+    use kreuzberg::KreuzbergError;
+    let cfg = ExtractionConfig::default();
+    // The 8-byte PNG signature, then garbage where the first chunk should start.
+    let broken_png: [u8; 12] = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0xFF, 0xFF, 0xFF, 0xFF];
+    let outcome = extract_bytes(&broken_png, "image/png", &cfg).await;
+    if let Err(err) = outcome {
+        assert!(
+            matches!(err, KreuzbergError::Parsing { .. } | KreuzbergError::Ocr { .. }),
+            "Corrupted image error should be Parsing or OCR type, got: {err:?}"
+        );
+    } else if let Ok(extracted) = outcome {
+        assert!(
+            extracted.chunks.is_none(),
+            "Chunks should be None without chunking config"
+        );
+    }
+}
+/// Empty input: a zero-length payload extracted under three MIME types.
+///
+/// * PDF may succeed with empty content, or fail with `Parsing` / `Validation`.
+/// * Plain text must succeed with empty content.
+/// * XML may succeed with empty content, or fail with `Parsing`.
+#[tokio::test]
+async fn test_empty_file() {
+    use kreuzberg::KreuzbergError;
+    let cfg = ExtractionConfig::default();
+    let nothing = b"";
+    // Run all three extractions before checking any of the outcomes.
+    let pdf_outcome = extract_bytes(nothing, "application/pdf", &cfg).await;
+    let text_outcome = extract_bytes(nothing, "text/plain", &cfg).await;
+    let xml_outcome = extract_bytes(nothing, "application/xml", &cfg).await;
+    match pdf_outcome {
+        Err(err) => assert!(
+            matches!(
+                err,
+                KreuzbergError::Parsing { .. } | KreuzbergError::Validation { .. }
+            ),
+            "Empty PDF should produce Parsing or Validation error, got: {err:?}"
+        ),
+        Ok(extracted) => {
+            assert!(
+                extracted.content.is_empty(),
+                "Empty PDF should have empty content if it succeeds"
+            );
+            assert!(extracted.chunks.is_none(), "Chunks should be None");
+        }
+    }
+    // Plain text has no failure branch that is tolerated.
+    let text_extracted = text_outcome
+        .unwrap_or_else(|err| panic!("Empty text file should not fail, got error: {err:?}"));
+    assert!(
+        text_extracted.content.is_empty(),
+        "Empty text file should have empty content"
+    );
+    assert!(text_extracted.chunks.is_none(), "Chunks should be None");
+    match xml_outcome {
+        Err(err) => assert!(
+            matches!(err, KreuzbergError::Parsing { .. }),
+            "Empty XML error should be Parsing type, got: {err:?}"
+        ),
+        Ok(extracted) => {
+            assert!(
+                extracted.content.is_empty(),
+                "Empty XML should have empty content if it succeeds"
+            );
+            assert!(extracted.chunks.is_none(), "Chunks should be None");
+        }
+    }
+}
+/// Large input: one text line repeated 200,000 times, extracted as plain text.
+///
+/// Extraction must succeed, return more than 1,000,000 bytes of content that
+/// still contains the source line, and report no chunks, languages or tables.
+#[tokio::test]
+async fn test_very_large_file() {
+    let cfg = ExtractionConfig::default();
+    let big_text = "This is a line of text that will be repeated many times.\n".repeat(200_000);
+    let outcome = extract_bytes(big_text.as_bytes(), "text/plain", &cfg).await;
+    let Ok(extracted) = outcome else {
+        panic!("Large file should be processed successfully");
+    };
+    let body = &extracted.content;
+    assert!(!body.is_empty(), "Large file content should not be empty");
+    assert!(body.len() > 1_000_000, "Content should be large");
+    assert!(
+        extracted.chunks.is_none(),
+        "Chunks should be None without chunking config"
+    );
+    assert!(
+        extracted.detected_languages.is_none(),
+        "Language detection not enabled"
+    );
+    assert!(extracted.tables.is_empty(), "Text file should not have tables");
+    assert!(
+        body.contains("This is a line of text"),
+        "Content should preserve original text"
+    );
+}
+/// Test unicode filenames - non-ASCII paths.
+///
+/// The fixture is created through `tempfile::Builder` with a prefix containing
+/// Cyrillic, CJK and accented Latin characters, so the path handed to
+/// `extract_file` really is non-ASCII. (`NamedTempFile::new()` generates an
+/// ASCII-only name and would not exercise this case.)
+///
+/// Extraction must succeed, return the written text, and report neither chunks
+/// nor detected languages.
+#[tokio::test]
+async fn test_unicode_filenames() {
+    let config = ExtractionConfig::default();
+    let mut temp_file = tempfile::Builder::new()
+        .prefix("тест_文件_ñandú_")
+        .suffix(".txt")
+        .tempfile()
+        .expect("Should create temp file with Unicode name");
+    temp_file
+        .write_all(b"Test content with Unicode filename.")
+        .expect("Should write temp file content");
+    // Guard the fixture itself: the test is meaningless if the name is ASCII.
+    let file_name = temp_file
+        .path()
+        .file_name()
+        .expect("Temp file should have a file name")
+        .to_string_lossy()
+        .into_owned();
+    assert!(!file_name.is_ascii(), "Fixture file name should be non-ASCII");
+    let result = extract_file(temp_file.path(), Some("text/plain"), &config).await;
+    assert!(result.is_ok(), "Unicode filename should be handled");
+    let extraction = result.unwrap();
+    assert!(
+        extraction.content.contains("Test content"),
+        "Content should be extracted"
+    );
+    assert!(
+        extraction.chunks.is_none(),
+        "Chunks should be None without chunking config"
+    );
+    assert!(
+        extraction.detected_languages.is_none(),
+        "Language detection not enabled"
+    );
+}
+/// Special characters: emoji, right-to-left Arabic, CJK, currency and math symbols.
+///
+/// Extraction as plain text must succeed, return more than 10 bytes of content
+/// containing at least one of the ASCII section labels, and report neither
+/// chunks nor detected languages.
+#[tokio::test]
+async fn test_special_characters_content() {
+    let cfg = ExtractionConfig::default();
+    let mixed_script = "Emojis: 🎉 🚀 ✅ 🌍\n\
+Arabic (RTL): مرحبا بالعالم\n\
+Chinese: 你好世界\n\
+Japanese: こんにちは世界\n\
+Special chars: © ® ™ € £ ¥\n\
+Math symbols: ∑ ∫ √ ≈ ∞";
+    let outcome = extract_bytes(mixed_script.as_bytes(), "text/plain", &cfg).await;
+    let Ok(extracted) = outcome else {
+        panic!("Special characters should be handled");
+    };
+    let body = &extracted.content;
+    assert!(!body.is_empty(), "Content should not be empty");
+    assert!(body.len() > 10, "Should have substantial content");
+    assert!(
+        extracted.chunks.is_none(),
+        "Chunks should be None without chunking config"
+    );
+    assert!(
+        extracted.detected_languages.is_none(),
+        "Language detection not enabled"
+    );
+    // Only the ASCII labels are checked, so any one surviving label is enough.
+    let kept_a_label = ["Emojis", "Arabic", "Chinese"]
+        .iter()
+        .any(|label| body.contains(label));
+    assert!(kept_a_label, "Should preserve at least some special character text");
+}
+/// Nonexistent file: `extract_file` is pointed at a path that does not exist.
+///
+/// The call must fail with either the `Io` or the `Validation` variant.
+#[tokio::test]
+async fn test_nonexistent_file() {
+    use kreuzberg::KreuzbergError;
+    let cfg = ExtractionConfig::default();
+    let missing = "/nonexistent/path/to/file.pdf";
+    match extract_file(missing, Some("application/pdf"), &cfg).await {
+        Ok(_) => panic!("Nonexistent file should return error"),
+        Err(err) => assert!(
+            matches!(err, KreuzbergError::Io(_) | KreuzbergError::Validation { .. }),
+            "Should be IO or Validation error for nonexistent file, got: {err:?}"
+        ),
+    }
+}
+/// Unsupported format: bytes submitted under a MIME type with no extractor.
+///
+/// The call must fail with the `UnsupportedFormat` variant.
+#[tokio::test]
+async fn test_unsupported_format() {
+    use kreuzberg::KreuzbergError;
+    let cfg = ExtractionConfig::default();
+    let payload = b"Some random data";
+    let outcome = extract_bytes(payload, "application/x-unknown-format", &cfg).await;
+    let Err(err) = outcome else {
+        panic!("Unsupported format should return error");
+    };
+    assert!(
+        matches!(err, KreuzbergError::UnsupportedFormat(_)),
+        "Should be UnsupportedFormat error, got: {err:?}"
+    );
+}
+/// Test permission denied - no read access (platform-specific).
+///
+/// Writes a temp file, strips every permission bit, and expects `extract_file`
+/// to return an error. Permissions are restored before any assertion so the
+/// temp file can always be cleaned up.
+///
+/// Privileged users (for example root inside a CI container) bypass mode bits
+/// entirely. The test probes for that by opening the file itself after the
+/// chmod; if the open succeeds, the scenario cannot be reproduced and the test
+/// returns early instead of failing spuriously.
+#[tokio::test]
+#[cfg(unix)]
+async fn test_permission_denied() {
+    use std::fs;
+    use std::os::unix::fs::PermissionsExt;
+    let config = ExtractionConfig::default();
+    let mut temp_file = NamedTempFile::new().expect("Should create temp file");
+    temp_file
+        .write_all(b"Test content")
+        .expect("Should write temp file content");
+    fs::set_permissions(temp_file.path(), fs::Permissions::from_mode(0o000))
+        .expect("Should remove all permissions");
+    // If we can still open the file, permission bits are not enforced for this user.
+    let permissions_bypassed = fs::File::open(temp_file.path()).is_ok();
+    let result = if permissions_bypassed {
+        None
+    } else {
+        Some(extract_file(temp_file.path(), Some("text/plain"), &config).await)
+    };
+    // Restore access before asserting so a failure cannot leave an unreadable file behind.
+    fs::set_permissions(temp_file.path(), fs::Permissions::from_mode(0o644))
+        .expect("Should restore permissions");
+    let Some(result) = result else {
+        eprintln!("skipping test_permission_denied: permission bits are bypassed for this user");
+        return;
+    };
+    assert!(result.is_err(), "Permission denied should return error");
+}
+/// Content/type mismatch: ZIP-container magic bytes (as used by DOCX) are
+/// submitted with the `application/pdf` MIME type.
+///
+/// The call must return an error.
+#[tokio::test]
+async fn test_file_extension_mismatch() {
+    let cfg = ExtractionConfig::default();
+    // "PK\x03\x04" local-file-header signature plus four header bytes.
+    let zip_header: [u8; 8] = [0x50, 0x4B, 0x03, 0x04, 0x14, 0x00, 0x00, 0x00];
+    let outcome = extract_bytes(&zip_header, "application/pdf", &cfg).await;
+    if outcome.is_ok() {
+        panic!("MIME type mismatch should fail");
+    }
+}
+/// Embedded NULs: plain text with `\x00` bytes between words.
+///
+/// Extraction must succeed with non-empty content that still contains the
+/// leading or trailing text fragment, and must report no chunks.
+#[tokio::test]
+async fn test_null_bytes_in_content() {
+    let cfg = ExtractionConfig::default();
+    let nul_laced = b"Text before\x00null\x00bytes\x00after";
+    let outcome = extract_bytes(nul_laced, "text/plain", &cfg).await;
+    let Ok(extracted) = outcome else {
+        panic!("Null bytes should be handled");
+    };
+    let body = &extracted.content;
+    assert!(!body.is_empty(), "Content should not be empty");
+    assert!(
+        extracted.chunks.is_none(),
+        "Chunks should be None without chunking config"
+    );
+    let kept_a_fragment = ["Text before", "after"]
+        .iter()
+        .any(|fragment| body.contains(fragment));
+    assert!(kept_a_fragment, "Should preserve at least some of the text content");
+}
+/// Concurrency: ten tokio tasks extract the same static payload at once.
+///
+/// Every task must complete and succeed, return content containing
+/// "Concurrent extraction", and report neither chunks nor detected languages.
+#[tokio::test]
+async fn test_concurrent_extractions() {
+    let cfg = ExtractionConfig::default();
+    let payload = b"Concurrent extraction test content.";
+    // Spawn everything up front; nothing is awaited until all tasks exist.
+    let mut tasks = Vec::with_capacity(10);
+    for _ in 0..10 {
+        let task_cfg = cfg.clone();
+        tasks.push(tokio::spawn(async move {
+            extract_bytes(payload, "text/plain", &task_cfg).await
+        }));
+    }
+    for task in tasks {
+        let outcome = task.await.expect("Task should complete");
+        let Ok(extracted) = outcome else {
+            panic!("Concurrent extraction should succeed");
+        };
+        assert!(
+            extracted.content.contains("Concurrent extraction"),
+            "Content should be extracted correctly"
+        );
+        assert!(extracted.chunks.is_none(), "Chunks should be None");
+        assert!(
+            extracted.detected_languages.is_none(),
+            "Language detection not enabled"
+        );
+    }
+}