RubyGems - kreuzberg - Versions diffs - 4.0.0.pre.rc.6 - Mend

kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (330) hide show

checksums.yaml +7 -0
data/.gitignore +14 -0
data/.rspec +3 -0
data/.rubocop.yaml +1 -0
data/.rubocop.yml +538 -0
data/Gemfile +8 -0
data/Gemfile.lock +157 -0
data/README.md +426 -0
data/Rakefile +25 -0
data/Steepfile +47 -0
data/examples/async_patterns.rb +341 -0
data/ext/kreuzberg_rb/extconf.rb +45 -0
data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
data/ext/kreuzberg_rb/native/README.md +425 -0
data/ext/kreuzberg_rb/native/build.rs +15 -0
data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
data/ext/kreuzberg_rb/native/include/strings.h +20 -0
data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
data/extconf.rb +28 -0
data/kreuzberg.gemspec +148 -0
data/lib/kreuzberg/api_proxy.rb +142 -0
data/lib/kreuzberg/cache_api.rb +46 -0
data/lib/kreuzberg/cli.rb +55 -0
data/lib/kreuzberg/cli_proxy.rb +127 -0
data/lib/kreuzberg/config.rb +691 -0
data/lib/kreuzberg/error_context.rb +32 -0
data/lib/kreuzberg/errors.rb +118 -0
data/lib/kreuzberg/extraction_api.rb +85 -0
data/lib/kreuzberg/mcp_proxy.rb +186 -0
data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
data/lib/kreuzberg/post_processor_protocol.rb +86 -0
data/lib/kreuzberg/result.rb +216 -0
data/lib/kreuzberg/setup_lib_path.rb +80 -0
data/lib/kreuzberg/validator_protocol.rb +89 -0
data/lib/kreuzberg/version.rb +5 -0
data/lib/kreuzberg.rb +103 -0
data/sig/kreuzberg/internal.rbs +184 -0
data/sig/kreuzberg.rbs +520 -0
data/spec/binding/cache_spec.rb +227 -0
data/spec/binding/cli_proxy_spec.rb +85 -0
data/spec/binding/cli_spec.rb +55 -0
data/spec/binding/config_spec.rb +345 -0
data/spec/binding/config_validation_spec.rb +283 -0
data/spec/binding/error_handling_spec.rb +213 -0
data/spec/binding/errors_spec.rb +66 -0
data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
data/spec/binding/plugins/postprocessor_spec.rb +269 -0
data/spec/binding/plugins/validator_spec.rb +274 -0
data/spec/fixtures/config.toml +39 -0
data/spec/fixtures/config.yaml +41 -0
data/spec/fixtures/invalid_config.toml +4 -0
data/spec/smoke/package_spec.rb +178 -0
data/spec/spec_helper.rb +42 -0
data/vendor/kreuzberg/Cargo.toml +204 -0
data/vendor/kreuzberg/README.md +175 -0
data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
data/vendor/kreuzberg/build.rs +474 -0
data/vendor/kreuzberg/src/api/error.rs +81 -0
data/vendor/kreuzberg/src/api/handlers.rs +199 -0
data/vendor/kreuzberg/src/api/mod.rs +79 -0
data/vendor/kreuzberg/src/api/server.rs +353 -0
data/vendor/kreuzberg/src/api/types.rs +170 -0
data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
data/vendor/kreuzberg/src/core/config.rs +1032 -0
data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
data/vendor/kreuzberg/src/core/io.rs +329 -0
data/vendor/kreuzberg/src/core/mime.rs +605 -0
data/vendor/kreuzberg/src/core/mod.rs +45 -0
data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
data/vendor/kreuzberg/src/embeddings.rs +432 -0
data/vendor/kreuzberg/src/error.rs +431 -0
data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
data/vendor/kreuzberg/src/extraction/email.rs +854 -0
data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
data/vendor/kreuzberg/src/extraction/html.rs +553 -0
data/vendor/kreuzberg/src/extraction/image.rs +368 -0
data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
data/vendor/kreuzberg/src/extraction/table.rs +328 -0
data/vendor/kreuzberg/src/extraction/text.rs +269 -0
data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
data/vendor/kreuzberg/src/extractors/email.rs +143 -0
data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
data/vendor/kreuzberg/src/extractors/html.rs +393 -0
data/vendor/kreuzberg/src/extractors/image.rs +198 -0
data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
data/vendor/kreuzberg/src/extractors/security.rs +484 -0
data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
data/vendor/kreuzberg/src/extractors/text.rs +260 -0
data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
data/vendor/kreuzberg/src/image/dpi.rs +164 -0
data/vendor/kreuzberg/src/image/mod.rs +6 -0
data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
data/vendor/kreuzberg/src/image/resize.rs +89 -0
data/vendor/kreuzberg/src/keywords/config.rs +154 -0
data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
data/vendor/kreuzberg/src/keywords/types.rs +68 -0
data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
data/vendor/kreuzberg/src/lib.rs +105 -0
data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
data/vendor/kreuzberg/src/ocr/error.rs +37 -0
data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
data/vendor/kreuzberg/src/ocr/types.rs +393 -0
data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
data/vendor/kreuzberg/src/panic_context.rs +154 -0
data/vendor/kreuzberg/src/pdf/error.rs +122 -0
data/vendor/kreuzberg/src/pdf/images.rs +139 -0
data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
data/vendor/kreuzberg/src/pdf/table.rs +393 -0
data/vendor/kreuzberg/src/pdf/text.rs +158 -0
data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
data/vendor/kreuzberg/src/text/mod.rs +19 -0
data/vendor/kreuzberg/src/text/quality.rs +697 -0
data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
data/vendor/kreuzberg/src/types.rs +903 -0
data/vendor/kreuzberg/src/utils/mod.rs +17 -0
data/vendor/kreuzberg/src/utils/quality.rs +959 -0
data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
data/vendor/kreuzberg/tests/api_tests.rs +966 -0
data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
data/vendor/kreuzberg/tests/config_features.rs +598 -0
data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
data/vendor/kreuzberg/tests/core_integration.rs +510 -0
data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
data/vendor/kreuzberg/tests/email_integration.rs +325 -0
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
data/vendor/kreuzberg/tests/error_handling.rs +393 -0
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
data/vendor/kreuzberg/tests/format_integration.rs +159 -0
data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
data/vendor/kreuzberg/tests/image_integration.rs +253 -0
data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
data/vendor/kreuzberg/tests/security_validation.rs +415 -0
data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
data/vendor/rb-sys/.cargo-ok +1 -0
data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
data/vendor/rb-sys/Cargo.lock +393 -0
data/vendor/rb-sys/Cargo.toml +70 -0
data/vendor/rb-sys/Cargo.toml.orig +57 -0
data/vendor/rb-sys/LICENSE-APACHE +190 -0
data/vendor/rb-sys/LICENSE-MIT +21 -0
data/vendor/rb-sys/bin/release.sh +21 -0
data/vendor/rb-sys/build/features.rs +108 -0
data/vendor/rb-sys/build/main.rs +246 -0
data/vendor/rb-sys/build/stable_api_config.rs +153 -0
data/vendor/rb-sys/build/version.rs +48 -0
data/vendor/rb-sys/readme.md +36 -0
data/vendor/rb-sys/src/bindings.rs +21 -0
data/vendor/rb-sys/src/hidden.rs +11 -0
data/vendor/rb-sys/src/lib.rs +34 -0
data/vendor/rb-sys/src/macros.rs +371 -0
data/vendor/rb-sys/src/memory.rs +53 -0
data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
data/vendor/rb-sys/src/special_consts.rs +31 -0
data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
data/vendor/rb-sys/src/stable_api.rs +261 -0
data/vendor/rb-sys/src/symbol.rs +31 -0
data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
data/vendor/rb-sys/src/utils.rs +89 -0
data/vendor/rb-sys/src/value_type.rs +7 -0
metadata +536 -0

data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs ADDED Viewed

@@ -0,0 +1,275 @@
+//! Integration tests for the native EPUB extractor
+//!
+//! These tests validate the native Rust EPUB extractor (EpubExtractor)
+//! which uses zip + roxmltree + html-to-markdown-rs (permissive licenses).
+//!
+//! This test suite verifies the two-pass OPF parsing fix for a bug in which
+//! single-pass manifest/spine resolution caused 99.84% content loss.
+#![cfg(feature = "office")]
+use kreuzberg::core::config::ExtractionConfig;
+use kreuzberg::extractors::EpubExtractor;
+use kreuzberg::plugins::DocumentExtractor;
+use std::path::PathBuf;
+/// Builds the path of the EPUB fixture named `filename`.
+///
+/// The base directory is the grandparent of `CARGO_MANIFEST_DIR`; the fixture is
+/// looked up under `test_documents/epub/` below it.
+///
+/// # Panics
+///
+/// Panics if `CARGO_MANIFEST_DIR` does not have two parent directories.
+fn get_test_epub_path(filename: &str) -> PathBuf {
+    let relative = format!("test_documents/epub/{}", filename);
+    // `ancestors()` yields the path itself first, so index 2 is the grandparent.
+    std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+        .ancestors()
+        .nth(2)
+        .unwrap()
+        .join(relative)
+}
+/// Test 1: Basic EPUB extraction - wasteland.epub
+///
+/// Checks that extracting `wasteland.epub`:
+/// - succeeds and yields more than 2000 bytes of content
+/// - exposes a `title` entry equal to "The Waste Land" and a `creator` entry
+///   in the additional metadata
+/// - produces text containing "April" or "cruellest"
+///
+/// Prints a notice and returns early when the fixture file is missing.
+#[tokio::test]
+async fn test_native_epub_wasteland_extraction() {
+    let epub_path = get_test_epub_path("wasteland.epub");
+    if !epub_path.exists() {
+        println!("Skipping test: Test file not found at {:?}", epub_path);
+        return;
+    }
+    let data = std::fs::read(&epub_path).expect("Failed to read wasteland.epub");
+    let extractor = EpubExtractor::new();
+    let config = ExtractionConfig::default();
+    let extraction = extractor
+        .extract_bytes(&data, "application/epub+zip", &config)
+        .await
+        .expect("Should extract wasteland.epub successfully");
+    let content_len = extraction.content.len();
+    let additional = &extraction.metadata.additional;
+    assert!(
+        content_len > 2000,
+        "Should extract substantial content from Wasteland, got {} bytes",
+        content_len
+    );
+    assert!(additional.contains_key("title"), "Should extract title metadata");
+    let title = additional.get("title").and_then(|value| value.as_str());
+    assert_eq!(title, Some("The Waste Land"), "Should have correct title");
+    assert!(additional.contains_key("creator"), "Should extract creator metadata");
+    // Either phrase from the poem's opening is enough to show real text came through.
+    let has_key_phrase = ["April", "cruellest"]
+        .iter()
+        .any(|phrase| extraction.content.contains(*phrase));
+    assert!(has_key_phrase, "Should contain key phrases from The Waste Land");
+    println!("✅ Wasteland extraction test passed ({} bytes)", content_len);
+}
+/// Test 2: EPUB with images - img.epub
+///
+/// Checks that extracting `img.epub`:
+/// - succeeds and yields more than 50 bytes of content
+/// - exposes a `title` entry in the additional metadata
+///
+/// Prints a notice and returns early when the fixture file is missing.
+#[tokio::test]
+async fn test_native_epub_images_extraction() {
+    let epub_path = get_test_epub_path("img.epub");
+    if !epub_path.exists() {
+        println!("Skipping test: Test file not found at {:?}", epub_path);
+        return;
+    }
+    let data = std::fs::read(&epub_path).expect("Failed to read img.epub");
+    let extractor = EpubExtractor::new();
+    let config = ExtractionConfig::default();
+    let extraction = extractor
+        .extract_bytes(&data, "application/epub+zip", &config)
+        .await
+        .expect("Should extract img.epub successfully");
+    let content_len = extraction.content.len();
+    assert!(
+        content_len > 50,
+        "Should extract text content from EPUB with images, got {} bytes",
+        content_len
+    );
+    let has_title = extraction.metadata.additional.contains_key("title");
+    assert!(has_title, "Should extract title metadata");
+    println!("✅ Images EPUB extraction test passed ({} bytes)", content_len);
+}
+/// Test 3: Features EPUB - features.epub
+///
+/// Checks that extracting `features.epub` succeeds and yields more than
+/// 1000 bytes of content; a smaller result is reported as a sign that only
+/// part of the book was extracted.
+///
+/// Prints a notice and returns early when the fixture file is missing.
+#[tokio::test]
+async fn test_native_epub_features_extraction() {
+    let epub_path = get_test_epub_path("features.epub");
+    if !epub_path.exists() {
+        println!("Skipping test: Test file not found at {:?}", epub_path);
+        return;
+    }
+    let data = std::fs::read(&epub_path).expect("Failed to read features.epub");
+    let extractor = EpubExtractor::new();
+    let config = ExtractionConfig::default();
+    let extraction = extractor
+        .extract_bytes(&data, "application/epub+zip", &config)
+        .await
+        .expect("Should extract features.epub successfully");
+    let content_len = extraction.content.len();
+    assert!(
+        content_len > 1000,
+        "CRITICAL: Should extract from ALL chapters, got only {} bytes. \
+         This indicates the two-pass bug is not fixed!",
+        content_len
+    );
+    println!("✅ Features EPUB extraction test passed ({} bytes)", content_len);
+}
+/// Test 4: EPUB2 with cover - epub2_cover.epub
+///
+/// Checks that extracting `epub2_cover.epub`:
+/// - succeeds and yields more than 50 bytes of content
+/// - reports "Pandoc EPUB Test" as the `title` in the additional metadata
+///
+/// Prints a notice and returns early when the fixture file is missing.
+#[tokio::test]
+async fn test_native_epub2_cover_extraction() {
+    let epub_path = get_test_epub_path("epub2_cover.epub");
+    if !epub_path.exists() {
+        println!("Skipping test: Test file not found at {:?}", epub_path);
+        return;
+    }
+    let data = std::fs::read(&epub_path).expect("Failed to read epub2_cover.epub");
+    let extractor = EpubExtractor::new();
+    let config = ExtractionConfig::default();
+    let extraction = extractor
+        .extract_bytes(&data, "application/epub+zip", &config)
+        .await
+        .expect("Should extract epub2_cover.epub successfully");
+    let content_len = extraction.content.len();
+    assert!(
+        content_len > 50,
+        "Should extract content from EPUB2 with cover, got {} bytes",
+        content_len
+    );
+    let title = extraction
+        .metadata
+        .additional
+        .get("title")
+        .and_then(|value| value.as_str());
+    assert_eq!(title, Some("Pandoc EPUB Test"), "Should have correct title");
+    println!("✅ EPUB2 cover extraction test passed ({} bytes)", content_len);
+}
+/// Test 5: Deterministic extraction
+///
+/// Extracts `features.epub` twice with the same extractor and configuration
+/// and checks that both runs return equal content and equal additional
+/// metadata.
+///
+/// Prints a notice and returns early when the fixture file is missing.
+#[tokio::test]
+async fn test_native_epub_deterministic_extraction() {
+    let epub_path = get_test_epub_path("features.epub");
+    if !epub_path.exists() {
+        println!("Skipping test: Test file not found at {:?}", epub_path);
+        return;
+    }
+    let data = std::fs::read(&epub_path).expect("Failed to read features.epub");
+    let extractor = EpubExtractor::new();
+    let config = ExtractionConfig::default();
+    // The two runs are awaited one after the other, never concurrently.
+    let first = extractor
+        .extract_bytes(&data, "application/epub+zip", &config)
+        .await
+        .expect("First extraction should succeed");
+    let second = extractor
+        .extract_bytes(&data, "application/epub+zip", &config)
+        .await
+        .expect("Second extraction should succeed");
+    assert_eq!(
+        first.content, second.content,
+        "Extraction should be deterministic - same input should produce same output"
+    );
+    assert_eq!(
+        first.metadata.additional, second.metadata.additional,
+        "Metadata extraction should be deterministic"
+    );
+    println!("✅ Deterministic extraction test passed");
+}
+/// Test 6: No content loss across multiple EPUBs
+///
+/// Runs the extractor over a fixed table of EPUB fixtures and checks that each
+/// one yields at least its expected minimum number of content bytes.
+///
+/// Fixtures that are not present on disk are reported and skipped. A read or
+/// extraction failure panics with the fixture name and the underlying error.
+#[tokio::test]
+async fn test_native_epub_no_content_loss() {
+    // (fixture file name, minimum number of content bytes expected)
+    let epub_files = [
+        ("epub2_cover.epub", 50),
+        ("epub2_no_cover.epub", 50),
+        ("img.epub", 50),
+        ("features.epub", 1000),
+        ("wasteland.epub", 2000),
+    ];
+    let extractor = EpubExtractor::new();
+    let config = ExtractionConfig::default();
+    for (epub_file, min_bytes) in epub_files {
+        let test_file = get_test_epub_path(epub_file);
+        if !test_file.exists() {
+            println!("⚠ Skipping {}: not found", epub_file);
+            continue;
+        }
+        // Keep the underlying error in the panic message so a failure names its cause.
+        let bytes = std::fs::read(&test_file)
+            .unwrap_or_else(|err| panic!("Failed to read {}: {}", epub_file, err));
+        let result = extractor
+            .extract_bytes(&bytes, "application/epub+zip", &config)
+            .await
+            .unwrap_or_else(|err| panic!("Should extract {}: {:?}", epub_file, err));
+        let extracted = result.content.len();
+        assert!(
+            extracted >= min_bytes,
+            "CRITICAL: {} extracted only {} bytes (expected >= {}). Content loss bug?",
+            epub_file,
+            extracted,
+            min_bytes
+        );
+        println!("✓ {} - {} bytes extracted", epub_file, extracted);
+    }
+    println!("✅ All EPUBs extracted successfully - no content loss!");
+}

data/vendor/kreuzberg/tests/error_handling.rs ADDED Viewed

@@ -0,0 +1,393 @@
+//! Error handling and edge case integration tests.
+//!
+//! Tests for corrupted files, edge cases, and invalid inputs.
+//! Validates that the system handles errors gracefully without panics.
+use kreuzberg::core::config::ExtractionConfig;
+use kreuzberg::core::extractor::{extract_bytes, extract_file};
+use std::io::Write;
+use tempfile::NamedTempFile;
+mod helpers;
+/// Truncated PDF: the input stops right after the opening of the first object.
+///
+/// Extraction must return an error, and that error must be the `Parsing` variant.
+#[tokio::test]
+async fn test_truncated_pdf() {
+    let config = ExtractionConfig::default();
+    let truncated_pdf = b"%PDF-1.4\n1 0 obj\n<<";
+    let outcome = extract_bytes(truncated_pdf, "application/pdf", &config).await;
+    let err = match outcome {
+        Err(err) => err,
+        Ok(_) => panic!("Truncated PDF should fail gracefully"),
+    };
+    let is_parsing = matches!(err, kreuzberg::KreuzbergError::Parsing { .. });
+    assert!(
+        is_parsing,
+        "Truncated PDF should produce Parsing error, got: {:?}",
+        err
+    );
+}
+/// Corrupted ZIP: a local-file-header signature followed by garbage bytes.
+///
+/// Extraction must return an error, and that error must be the `Parsing` variant.
+#[tokio::test]
+async fn test_corrupted_zip() {
+    let config = ExtractionConfig::default();
+    // "PK\x03\x04" signature, then four 0xFF bytes and four zero bytes.
+    let corrupted_zip: [u8; 12] = [0x50, 0x4B, 0x03, 0x04, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00];
+    let outcome = extract_bytes(&corrupted_zip, "application/zip", &config).await;
+    let err = match outcome {
+        Err(err) => err,
+        Ok(_) => panic!("Corrupted ZIP should fail gracefully"),
+    };
+    let is_parsing = matches!(err, kreuzberg::KreuzbergError::Parsing { .. });
+    assert!(
+        is_parsing,
+        "Corrupted ZIP should produce Parsing error, got: {:?}",
+        err
+    );
+}
+/// Invalid XML: unclosed element, mismatched closing tag and a cut-off root tag.
+///
+/// Either outcome is accepted: a successful extraction must carry no chunks,
+/// and a failure must be the `Parsing` variant.
+#[tokio::test]
+async fn test_invalid_xml() {
+    let config = ExtractionConfig::default();
+    let invalid_xml = b"<?xml version=\"1.0\"?>\n\
+<root>\n\
+<unclosed>\n\
+<another>text</wrong_tag>\n\
+</root";
+    match extract_bytes(invalid_xml, "application/xml", &config).await {
+        Err(err) => {
+            let is_parsing = matches!(err, kreuzberg::KreuzbergError::Parsing { .. });
+            assert!(
+                is_parsing,
+                "Invalid XML error should be Parsing type, got: {:?}",
+                err
+            );
+        }
+        Ok(extracted) => {
+            // Default config has no chunking, so no chunks are expected.
+            assert!(
+                extracted.chunks.is_none(),
+                "Chunks should be None without chunking config"
+            );
+        }
+    }
+}
+/// Corrupted image: a valid PNG signature followed by four 0xFF bytes.
+///
+/// Either outcome is accepted: a successful extraction must carry no chunks,
+/// and a failure must be the `Parsing` or `Ocr` variant.
+#[tokio::test]
+async fn test_corrupted_image() {
+    let config = ExtractionConfig::default();
+    let corrupted_png: [u8; 12] = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0xFF, 0xFF, 0xFF, 0xFF];
+    match extract_bytes(&corrupted_png, "image/png", &config).await {
+        Err(err) => {
+            let acceptable = matches!(
+                err,
+                kreuzberg::KreuzbergError::Parsing { .. } | kreuzberg::KreuzbergError::Ocr { .. }
+            );
+            assert!(
+                acceptable,
+                "Corrupted image error should be Parsing or OCR type, got: {:?}",
+                err
+            );
+        }
+        Ok(extracted) => {
+            assert!(
+                extracted.chunks.is_none(),
+                "Chunks should be None without chunking config"
+            );
+        }
+    }
+}
+/// Empty input (0 bytes) sent through the PDF, plain-text and XML MIME types.
+///
+/// - PDF: may succeed with empty content, or fail with `Parsing`/`Validation`.
+/// - Plain text: must succeed with empty content.
+/// - XML: may succeed with empty content, or fail with `Parsing`.
+#[tokio::test]
+async fn test_empty_file() {
+    let config = ExtractionConfig::default();
+    let no_bytes = b"";
+    // All three extractions run first; the outcomes are inspected afterwards.
+    let pdf_outcome = extract_bytes(no_bytes, "application/pdf", &config).await;
+    let text_outcome = extract_bytes(no_bytes, "text/plain", &config).await;
+    let xml_outcome = extract_bytes(no_bytes, "application/xml", &config).await;
+
+    match pdf_outcome {
+        Err(err) => {
+            let acceptable = matches!(
+                err,
+                kreuzberg::KreuzbergError::Parsing { .. } | kreuzberg::KreuzbergError::Validation { .. }
+            );
+            assert!(
+                acceptable,
+                "Empty PDF should produce Parsing or Validation error, got: {:?}",
+                err
+            );
+        }
+        Ok(extracted) => {
+            assert!(
+                extracted.content.is_empty(),
+                "Empty PDF should have empty content if it succeeds"
+            );
+            assert!(extracted.chunks.is_none(), "Chunks should be None");
+        }
+    }
+
+    match text_outcome {
+        Err(err) => panic!("Empty text file should not fail, got error: {:?}", err),
+        Ok(extracted) => {
+            assert!(
+                extracted.content.is_empty(),
+                "Empty text file should have empty content"
+            );
+            assert!(extracted.chunks.is_none(), "Chunks should be None");
+        }
+    }
+
+    match xml_outcome {
+        Err(err) => {
+            let is_parsing = matches!(err, kreuzberg::KreuzbergError::Parsing { .. });
+            assert!(
+                is_parsing,
+                "Empty XML error should be Parsing type, got: {:?}",
+                err
+            );
+        }
+        Ok(extracted) => {
+            assert!(
+                extracted.content.is_empty(),
+                "Empty XML should have empty content if it succeeds"
+            );
+            assert!(extracted.chunks.is_none(), "Chunks should be None");
+        }
+    }
+}
+/// Large plain-text input: one line of text repeated 200,000 times.
+///
+/// Checks that extraction succeeds, that the content is larger than 1,000,000
+/// bytes and still contains the repeated line, and that no chunks, detected
+/// languages or tables are produced under the default configuration.
+#[tokio::test]
+async fn test_very_large_file() {
+    let config = ExtractionConfig::default();
+    let line = "This is a line of text that will be repeated many times.\n";
+    let large_text = line.repeat(200_000);
+    let outcome = extract_bytes(large_text.as_bytes(), "text/plain", &config).await;
+    let extracted = match outcome {
+        Ok(extracted) => extracted,
+        Err(_) => panic!("Large file should be processed successfully"),
+    };
+    let content_len = extracted.content.len();
+    assert!(content_len != 0, "Large file content should not be empty");
+    assert!(content_len > 1_000_000, "Content should be large");
+    assert!(
+        extracted.chunks.is_none(),
+        "Chunks should be None without chunking config"
+    );
+    assert!(
+        extracted.detected_languages.is_none(),
+        "Language detection not enabled"
+    );
+    assert!(extracted.tables.is_empty(), "Text file should not have tables");
+    assert!(
+        extracted.content.contains("This is a line of text"),
+        "Content should preserve original text"
+    );
+}
+/// Test unicode filenames - non-ASCII paths.
+///
+/// The temporary file is created with an explicit prefix containing Cyrillic,
+/// CJK and accented Latin characters, so the path handed to `extract_file`
+/// is guaranteed to be non-ASCII. (A name generated by `NamedTempFile::new()`
+/// is not given any Unicode characters, so it would not exercise this case.)
+#[tokio::test]
+async fn test_unicode_filenames() {
+    let config = ExtractionConfig::default();
+    let mut temp_file = tempfile::Builder::new()
+        .prefix("kreuzberg_тест_文档_café_")
+        .suffix(".txt")
+        .tempfile()
+        .expect("Should create temp file");
+    // Guard against the test silently degrading back to an ASCII-only path.
+    assert!(
+        !temp_file.path().to_string_lossy().is_ascii(),
+        "Temp file path should contain non-ASCII characters"
+    );
+    temp_file.write_all(b"Test content with Unicode filename.").unwrap();
+    let result = extract_file(temp_file.path(), Some("text/plain"), &config).await;
+    assert!(result.is_ok(), "Unicode filename should be handled");
+    let extraction = result.unwrap();
+    assert!(
+        extraction.content.contains("Test content"),
+        "Content should be extracted"
+    );
+    assert!(
+        extraction.chunks.is_none(),
+        "Chunks should be None without chunking config"
+    );
+    assert!(
+        extraction.detected_languages.is_none(),
+        "Language detection not enabled"
+    );
+}
+/// Plain text containing emojis, right-to-left script, CJK text and symbols.
+///
+/// Extraction must succeed with more than 10 bytes of content, no chunks and
+/// no detected languages, and at least one of the ASCII section labels
+/// ("Emojis", "Arabic", "Chinese") must still be present in the output.
+#[tokio::test]
+async fn test_special_characters_content() {
+    let config = ExtractionConfig::default();
+    let special_text = "Emojis: 🎉 🚀 ✅ 🌍\n\
+Arabic (RTL): مرحبا بالعالم\n\
+Chinese: 你好世界\n\
+Japanese: こんにちは世界\n\
+Special chars: © ® ™ € £ ¥\n\
+Math symbols: ∑ ∫ √ ≈ ∞";
+    let outcome = extract_bytes(special_text.as_bytes(), "text/plain", &config).await;
+    let extracted = match outcome {
+        Ok(extracted) => extracted,
+        Err(_) => panic!("Special characters should be handled"),
+    };
+    assert!(!extracted.content.is_empty(), "Content should not be empty");
+    assert!(extracted.content.len() > 10, "Should have substantial content");
+    assert!(
+        extracted.chunks.is_none(),
+        "Chunks should be None without chunking config"
+    );
+    assert!(
+        extracted.detected_languages.is_none(),
+        "Language detection not enabled"
+    );
+    let labels = ["Emojis", "Arabic", "Chinese"];
+    let any_label_kept = labels.iter().any(|label| extracted.content.contains(*label));
+    assert!(
+        any_label_kept,
+        "Should preserve at least some special character text"
+    );
+}
+/// Nonexistent file: extracting from a path that does not exist.
+///
+/// The call must fail with either the `Io` or the `Validation` variant.
+#[tokio::test]
+async fn test_nonexistent_file() {
+    let config = ExtractionConfig::default();
+    let missing_path = "/nonexistent/path/to/file.pdf";
+    let outcome = extract_file(missing_path, Some("application/pdf"), &config).await;
+    let err = match outcome {
+        Err(err) => err,
+        Ok(_) => panic!("Nonexistent file should return error"),
+    };
+    let acceptable = matches!(
+        err,
+        kreuzberg::KreuzbergError::Io(_) | kreuzberg::KreuzbergError::Validation { .. }
+    );
+    assert!(
+        acceptable,
+        "Should be IO or Validation error for nonexistent file, got: {:?}",
+        err
+    );
+}
+/// Unsupported format: bytes submitted under an unknown MIME type.
+///
+/// The call must fail with the `UnsupportedFormat` variant.
+#[tokio::test]
+async fn test_unsupported_format() {
+    let config = ExtractionConfig::default();
+    let payload = b"Some random data";
+    let outcome = extract_bytes(payload, "application/x-unknown-format", &config).await;
+    let err = match outcome {
+        Err(err) => err,
+        Ok(_) => panic!("Unsupported format should return error"),
+    };
+    let is_unsupported = matches!(err, kreuzberg::KreuzbergError::UnsupportedFormat(_));
+    assert!(
+        is_unsupported,
+        "Should be UnsupportedFormat error, got: {:?}",
+        err
+    );
+}
+/// Test permission denied - no read access (platform-specific).
+///
+/// A temporary file has all of its mode bits cleared and is then passed to
+/// `extract_file`, which is expected to return an error.
+///
+/// Some users (for example root, which is common in CI containers) can still
+/// read a mode-000 file. The test probes for that with a plain `File::open`
+/// and skips the assertion when the file remains readable, instead of
+/// reporting a spurious failure.
+#[tokio::test]
+#[cfg(unix)]
+async fn test_permission_denied() {
+    use std::fs;
+    use std::os::unix::fs::PermissionsExt;
+    let config = ExtractionConfig::default();
+    let mut temp_file = NamedTempFile::new().expect("Should create temp file");
+    temp_file.write_all(b"Test content").unwrap();
+    // Remember the file's own permissions so they can be restored exactly.
+    let original_perms = fs::metadata(temp_file.path()).unwrap().permissions();
+    let mut locked_perms = original_perms.clone();
+    locked_perms.set_mode(0o000);
+    fs::set_permissions(temp_file.path(), locked_perms).unwrap();
+    // If a direct open still works, mode bits are not enforced for this user.
+    let still_readable = fs::File::open(temp_file.path()).is_ok();
+    let result = if still_readable {
+        None
+    } else {
+        Some(extract_file(temp_file.path(), Some("text/plain"), &config).await)
+    };
+    // Restore permissions before asserting so a failed assertion cannot leave
+    // the file locked.
+    fs::set_permissions(temp_file.path(), original_perms).unwrap();
+    match result {
+        Some(result) => assert!(result.is_err(), "Permission denied should return error"),
+        None => println!("⚠ Skipping permission check: file is still readable with mode 000"),
+    }
+}
+/// MIME type / content mismatch: ZIP-signature bytes submitted as a PDF.
+///
+/// The bytes start with the "PK\x03\x04" signature but are declared as
+/// `application/pdf`; extraction must fail.
+#[tokio::test]
+async fn test_file_extension_mismatch() {
+    let config = ExtractionConfig::default();
+    let docx_magic: [u8; 8] = [0x50, 0x4B, 0x03, 0x04, 0x14, 0x00, 0x00, 0x00];
+    let outcome = extract_bytes(&docx_magic, "application/pdf", &config).await;
+    if outcome.is_ok() {
+        panic!("MIME type mismatch should fail");
+    }
+}
+/// Plain text with embedded NUL bytes between words.
+///
+/// Extraction must succeed with non-empty content and no chunks, and the
+/// output must still contain "Text before" or "after".
+#[tokio::test]
+async fn test_null_bytes_in_content() {
+    let config = ExtractionConfig::default();
+    let data_with_nulls = b"Text before\x00null\x00bytes\x00after";
+    let outcome = extract_bytes(data_with_nulls, "text/plain", &config).await;
+    let extracted = match outcome {
+        Ok(extracted) => extracted,
+        Err(_) => panic!("Null bytes should be handled"),
+    };
+    assert!(!extracted.content.is_empty(), "Content should not be empty");
+    assert!(
+        extracted.chunks.is_none(),
+        "Chunks should be None without chunking config"
+    );
+    let fragments = ["Text before", "after"];
+    let some_text_kept = fragments.iter().any(|fragment| extracted.content.contains(*fragment));
+    assert!(
+        some_text_kept,
+        "Should preserve at least some of the text content"
+    );
+}
+/// Ten extractions of the same bytes, each spawned as its own tokio task.
+///
+/// All tasks are spawned before any is awaited. Every task must complete,
+/// succeed, return the expected text, and produce no chunks or detected
+/// languages.
+#[tokio::test]
+async fn test_concurrent_extractions() {
+    let config = ExtractionConfig::default();
+    let text_data = b"Concurrent extraction test content.";
+    let mut tasks = Vec::with_capacity(10);
+    for _ in 0..10 {
+        // Each task owns its own copy of the configuration.
+        let task_config = config.clone();
+        let task = tokio::spawn(async move { extract_bytes(text_data, "text/plain", &task_config).await });
+        tasks.push(task);
+    }
+    for task in tasks {
+        let outcome = task.await.expect("Task should complete");
+        let extracted = match outcome {
+            Ok(extracted) => extracted,
+            Err(_) => panic!("Concurrent extraction should succeed"),
+        };
+        assert!(
+            extracted.content.contains("Concurrent extraction"),
+            "Content should be extracted correctly"
+        );
+        assert!(extracted.chunks.is_none(), "Chunks should be None");
+        assert!(
+            extracted.detected_languages.is_none(),
+            "Language detection not enabled"
+        );
+    }
+}