kreuzberg 4.1.2 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
  5. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  6. data/kreuzberg.gemspec +13 -1
  7. data/lib/kreuzberg/cli.rb +16 -6
  8. data/lib/kreuzberg/cli_proxy.rb +3 -1
  9. data/lib/kreuzberg/config.rb +121 -39
  10. data/lib/kreuzberg/djot_content.rb +225 -0
  11. data/lib/kreuzberg/extraction_api.rb +20 -4
  12. data/lib/kreuzberg/result.rb +12 -2
  13. data/lib/kreuzberg/version.rb +1 -1
  14. data/lib/kreuzberg.rb +1 -0
  15. data/sig/kreuzberg.rbs +28 -12
  16. data/spec/binding/batch_operations_spec.rb +80 -0
  17. data/spec/binding/batch_spec.rb +6 -5
  18. data/spec/binding/error_recovery_spec.rb +3 -3
  19. data/spec/binding/metadata_types_spec.rb +77 -57
  20. data/spec/binding/tables_spec.rb +11 -2
  21. data/spec/serialization_spec.rb +134 -0
  22. data/spec/unit/config/output_format_spec.rb +380 -0
  23. data/vendor/Cargo.toml +1 -1
  24. data/vendor/kreuzberg/Cargo.toml +1 -1
  25. data/vendor/kreuzberg/README.md +1 -1
  26. data/vendor/kreuzberg/src/api/startup.rs +15 -1
  27. data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
  28. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
  29. data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
  30. data/vendor/kreuzberg/src/core/io.rs +7 -7
  31. data/vendor/kreuzberg/src/core/mime.rs +4 -4
  32. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  33. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
  34. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  35. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  36. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  37. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  38. data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
  39. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
  40. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
  41. data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
  42. data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
  43. data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
  44. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  45. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  46. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  47. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  48. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  49. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  50. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  51. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  52. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  53. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  54. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  55. data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
  56. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  57. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  58. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  59. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  60. data/vendor/kreuzberg/tests/core_integration.rs +57 -57
  61. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  62. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  63. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  64. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  65. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  66. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  67. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  68. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  69. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  70. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  71. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  72. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  73. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  74. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  75. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  76. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  77. data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
  78. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  79. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  80. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  81. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  82. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  83. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  84. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  85. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  86. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  87. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  88. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  89. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  90. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  91. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  92. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
  93. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  94. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  95. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  96. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  97. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  98. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  99. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  100. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  101. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  102. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  103. metadata +12 -2
@@ -0,0 +1,314 @@
1
+ //! MCP contract tests - verify MCP config matches Rust core
2
+ //!
3
+ //! This test suite validates that MCP (Model Context Protocol) configuration
4
+ //! produces identical JSON to the Rust core library when parsing configuration.
5
+ //! This ensures that MCP users get the same configuration behavior as CLI and SDK users.
6
+
7
+ use kreuzberg::core::config::ExtractionConfig;
8
+ use kreuzberg::core::config::OutputFormat;
9
+ use serde_json::json;
10
+
11
+ #[test]
12
+ fn test_mcp_basic_config_json_matches_rust_core() {
13
+ // Create config via Rust core
14
+ let rust_config = ExtractionConfig {
15
+ use_cache: true,
16
+ enable_quality_processing: true,
17
+ force_ocr: false,
18
+ output_format: OutputFormat::Plain,
19
+ result_format: kreuzberg::types::OutputFormat::Unified,
20
+ ..Default::default()
21
+ };
22
+ let rust_json = serde_json::to_value(&rust_config).expect("Failed to serialize rust config");
23
+
24
+ // Simulate MCP config parameter deserialization
25
+ let mcp_json = json!({
26
+ "use_cache": true,
27
+ "enable_quality_processing": true,
28
+ "force_ocr": false,
29
+ "output_format": "plain",
30
+ "result_format": "unified"
31
+ });
32
+ let mcp_config: ExtractionConfig =
33
+ serde_json::from_value(mcp_json.clone()).expect("Failed to deserialize MCP config");
34
+ let mcp_serialized = serde_json::to_value(&mcp_config).expect("Failed to serialize MCP config");
35
+
36
+ // Verify they produce identical JSON for the relevant fields
37
+ assert_eq!(
38
+ rust_json.get("use_cache"),
39
+ mcp_serialized.get("use_cache"),
40
+ "MCP use_cache must match Rust core"
41
+ );
42
+ assert_eq!(
43
+ rust_json.get("enable_quality_processing"),
44
+ mcp_serialized.get("enable_quality_processing"),
45
+ "MCP enable_quality_processing must match Rust core"
46
+ );
47
+ assert_eq!(
48
+ rust_json.get("force_ocr"),
49
+ mcp_serialized.get("force_ocr"),
50
+ "MCP force_ocr must match Rust core"
51
+ );
52
+ assert_eq!(
53
+ rust_json.get("output_format"),
54
+ mcp_serialized.get("output_format"),
55
+ "MCP output_format must match Rust core"
56
+ );
57
+ }
58
+
59
+ #[test]
60
+ fn test_mcp_ocr_config_nested_matches_rust_core() {
61
+ let mcp_json = json!({
62
+ "ocr": {
63
+ "backend": "tesseract"
64
+ },
65
+ "force_ocr": true
66
+ });
67
+
68
+ let config: ExtractionConfig = serde_json::from_value(mcp_json).expect("Failed to deserialize OCR config");
69
+
70
+ // Verify OCR config deserialized correctly
71
+ assert!(config.ocr.is_some(), "OCR config should be present");
72
+ assert!(config.force_ocr, "force_ocr should be true");
73
+
74
+ if let Some(ocr) = &config.ocr {
75
+ assert_eq!(ocr.backend, "tesseract", "OCR backend should be tesseract");
76
+ }
77
+
78
+ // Verify roundtrip
79
+ let serialized = serde_json::to_value(&config).expect("Failed to serialize");
80
+ assert!(serialized.get("ocr").is_some(), "Serialized config should include ocr");
81
+ }
82
+
83
+ #[test]
84
+ fn test_mcp_chunking_config_nested_matches_rust_core() {
85
+ let mcp_json = json!({
86
+ "chunking": {
87
+ "max_chars": 500,
88
+ "max_overlap": 50,
89
+ "strategy": "sliding_window"
90
+ }
91
+ });
92
+
93
+ let config: ExtractionConfig = serde_json::from_value(mcp_json).expect("Failed to deserialize chunking config");
94
+
95
+ // Verify chunking config deserialized correctly
96
+ assert!(config.chunking.is_some(), "Chunking config should be present");
97
+
98
+ if let Some(chunking) = &config.chunking {
99
+ assert_eq!(chunking.max_chars, 500, "max_chars should be 500");
100
+ assert_eq!(chunking.max_overlap, 50, "max_overlap should be 50");
101
+ }
102
+
103
+ // Verify roundtrip
104
+ let serialized = serde_json::to_value(&config).expect("Failed to serialize");
105
+ assert!(
106
+ serialized.get("chunking").is_some(),
107
+ "Serialized config should include chunking"
108
+ );
109
+ }
110
+
111
+ #[test]
112
+ fn test_mcp_full_config_preserves_all_fields() {
113
+ let full_config_json = json!({
114
+ "use_cache": false,
115
+ "enable_quality_processing": true,
116
+ "force_ocr": true,
117
+ "output_format": "markdown",
118
+ "result_format": "unified",
119
+ "max_concurrent_extractions": 8,
120
+ "ocr": {
121
+ "backend": "tesseract"
122
+ },
123
+ "chunking": {
124
+ "max_chars": 1000,
125
+ "max_overlap": 200
126
+ }
127
+ });
128
+
129
+ let config: ExtractionConfig =
130
+ serde_json::from_value(full_config_json.clone()).expect("Failed to deserialize full config");
131
+ let roundtrip_json = serde_json::to_value(&config).expect("Failed to serialize");
132
+
133
+ // Verify all top-level fields preserved
134
+ assert!(!config.use_cache, "use_cache should be false");
135
+ assert!(
136
+ config.enable_quality_processing,
137
+ "enable_quality_processing should be true"
138
+ );
139
+ assert!(config.force_ocr, "force_ocr should be true");
140
+ assert_eq!(
141
+ config.max_concurrent_extractions,
142
+ Some(8),
143
+ "max_concurrent_extractions should be 8"
144
+ );
145
+
146
+ // Verify nested fields preserved
147
+ assert!(config.ocr.is_some(), "OCR config should be present");
148
+ assert!(config.chunking.is_some(), "Chunking config should be present");
149
+
150
+ // Verify roundtrip integrity
151
+ assert_eq!(
152
+ roundtrip_json.get("use_cache"),
153
+ full_config_json.get("use_cache"),
154
+ "use_cache should survive roundtrip"
155
+ );
156
+ assert_eq!(
157
+ roundtrip_json.get("force_ocr"),
158
+ full_config_json.get("force_ocr"),
159
+ "force_ocr should survive roundtrip"
160
+ );
161
+ assert_eq!(
162
+ roundtrip_json.get("max_concurrent_extractions"),
163
+ full_config_json.get("max_concurrent_extractions"),
164
+ "max_concurrent_extractions should survive roundtrip"
165
+ );
166
+ }
167
+
168
+ #[test]
169
+ fn test_mcp_default_config_matches_rust_core_defaults() {
170
+ // Create Rust core default
171
+ let rust_default = ExtractionConfig::default();
172
+ let rust_json = serde_json::to_value(&rust_default).expect("Failed to serialize default");
173
+
174
+ // Create empty JSON (simulates MCP with no overrides)
175
+ let mcp_json = json!({});
176
+ let mcp_config: ExtractionConfig = serde_json::from_value(mcp_json).expect("Failed to deserialize empty config");
177
+ let mcp_json_serialized = serde_json::to_value(&mcp_config).expect("Failed to serialize MCP default");
178
+
179
+ // Verify defaults match
180
+ assert_eq!(
181
+ mcp_json_serialized.get("use_cache"),
182
+ rust_json.get("use_cache"),
183
+ "use_cache default should match"
184
+ );
185
+ assert_eq!(
186
+ mcp_json_serialized.get("enable_quality_processing"),
187
+ rust_json.get("enable_quality_processing"),
188
+ "enable_quality_processing default should match"
189
+ );
190
+ assert_eq!(
191
+ mcp_json_serialized.get("force_ocr"),
192
+ rust_json.get("force_ocr"),
193
+ "force_ocr default should match"
194
+ );
195
+ assert_eq!(
196
+ mcp_json_serialized.get("result_format"),
197
+ rust_json.get("result_format"),
198
+ "result_format default should match"
199
+ );
200
+ assert_eq!(
201
+ mcp_json_serialized.get("output_format"),
202
+ rust_json.get("output_format"),
203
+ "output_format default should match"
204
+ );
205
+ }
206
+
207
+ #[test]
208
+ fn test_mcp_output_format_values_are_valid() {
209
+ // Test all valid output format values (lowercase, as per serde rename_all)
210
+ let valid_formats = vec!["plain", "markdown", "html"];
211
+
212
+ for format in valid_formats {
213
+ let mcp_json = json!({
214
+ "output_format": format
215
+ });
216
+
217
+ let result = serde_json::from_value::<ExtractionConfig>(mcp_json);
218
+ assert!(result.is_ok(), "Format '{}' should deserialize successfully", format);
219
+
220
+ let config = result.unwrap();
221
+ assert!(
222
+ !config.output_format.to_string().is_empty(),
223
+ "Deserialized format should have valid string representation"
224
+ );
225
+ }
226
+ }
227
+
228
+ #[test]
229
+ fn test_mcp_result_format_values_are_valid() {
230
+ // Test valid result format values (lowercase, as per serde rename_all)
231
+ let valid_formats = vec!["unified", "element_based"];
232
+
233
+ for format in valid_formats {
234
+ let mcp_json = json!({
235
+ "result_format": format
236
+ });
237
+
238
+ let result = serde_json::from_value::<ExtractionConfig>(mcp_json);
239
+ assert!(
240
+ result.is_ok(),
241
+ "Result format '{}' should deserialize successfully",
242
+ format
243
+ );
244
+ }
245
+ }
246
+
247
+ #[test]
248
+ fn test_mcp_partial_override_preserves_defaults() {
249
+ // Create a partial config that overrides only one field
250
+ let partial_json = json!({
251
+ "force_ocr": true
252
+ });
253
+
254
+ let config: ExtractionConfig = serde_json::from_value(partial_json).expect("Failed to deserialize partial config");
255
+
256
+ // Verify override applied
257
+ assert!(config.force_ocr, "force_ocr override should be applied");
258
+
259
+ // Verify defaults preserved for other fields
260
+ assert!(config.use_cache, "use_cache should retain default when not overridden");
261
+ assert!(
262
+ config.enable_quality_processing,
263
+ "enable_quality_processing should retain default when not overridden"
264
+ );
265
+ }
266
+
267
+ #[test]
268
+ fn test_mcp_error_handling_for_invalid_json() {
269
+ // Test that invalid format values produce errors (or are handled gracefully)
270
+ let invalid_json = json!({
271
+ "output_format": "InvalidFormat"
272
+ });
273
+
274
+ let result = serde_json::from_value::<ExtractionConfig>(invalid_json);
275
+ // The deserialization should either fail or parse to a valid state
276
+ // depending on how OutputFormat handles unknown values
277
+ if let Ok(config) = result {
278
+ let _ = config.output_format.to_string();
279
+ }
280
+ }
281
+
282
+ #[test]
283
+ fn test_mcp_concurrent_extractions_override() {
284
+ let mcp_json = json!({
285
+ "max_concurrent_extractions": 16
286
+ });
287
+
288
+ let config: ExtractionConfig =
289
+ serde_json::from_value(mcp_json).expect("Failed to deserialize config with concurrent extractions");
290
+
291
+ assert_eq!(
292
+ config.max_concurrent_extractions,
293
+ Some(16),
294
+ "max_concurrent_extractions should be overridden to 16"
295
+ );
296
+ }
297
+
298
+ #[test]
299
+ fn test_mcp_config_json_keys_case_sensitive() {
300
+ // Verify that config JSON keys are case-sensitive
301
+ let lowercase_json = json!({
302
+ "use_cache": true,
303
+ "force_ocr": false
304
+ });
305
+
306
+ let config: ExtractionConfig =
307
+ serde_json::from_value(lowercase_json).expect("Failed to deserialize lowercase config");
308
+
309
+ assert!(config.use_cache, "use_cache should be true");
310
+ assert!(!config.force_ocr, "force_ocr should be false");
311
+
312
+ // Note: serde by default fails on unknown fields, so camelCase would fail
313
+ // This test documents the expected behavior
314
+ }
@@ -26,16 +26,16 @@ fn assert_text_content(actual: &str, expected: &str) {
26
26
  /// Test basic file extraction with MIME detection.
27
27
  #[tokio::test]
28
28
  async fn test_extract_file_basic() {
29
- let dir = tempdir().unwrap();
29
+ let dir = tempdir().expect("Operation failed");
30
30
  let file_path = dir.path().join("test.txt");
31
- let mut file = File::create(&file_path).unwrap();
32
- file.write_all(b"Hello, Kreuzberg!").unwrap();
31
+ let mut file = File::create(&file_path).expect("Operation failed");
32
+ file.write_all(b"Hello, Kreuzberg!").expect("Operation failed");
33
33
 
34
34
  let config = ExtractionConfig::default();
35
35
  let result = extract_file(&file_path, None, &config).await;
36
36
 
37
37
  assert!(result.is_ok(), "Basic file extraction should succeed");
38
- let result = result.unwrap();
38
+ let result = result.expect("Operation failed");
39
39
 
40
40
  assert_text_content(&result.content, "Hello, Kreuzberg!");
41
41
  assert_eq!(result.mime_type, "text/plain");
@@ -47,16 +47,16 @@ async fn test_extract_file_basic() {
47
47
  /// Test extraction with explicit MIME type override.
48
48
  #[tokio::test]
49
49
  async fn test_extract_file_with_mime_override() {
50
- let dir = tempdir().unwrap();
50
+ let dir = tempdir().expect("Operation failed");
51
51
  let file_path = dir.path().join("data.bin");
52
- let mut file = File::create(&file_path).unwrap();
53
- file.write_all(b"Binary content").unwrap();
52
+ let mut file = File::create(&file_path).expect("Operation failed");
53
+ file.write_all(b"Binary content").expect("Operation failed");
54
54
 
55
55
  let config = ExtractionConfig::default();
56
56
  let result = extract_file(&file_path, Some("text/plain"), &config).await;
57
57
 
58
58
  assert!(result.is_ok(), "MIME override should work");
59
- let result = result.unwrap();
59
+ let result = result.expect("Operation failed");
60
60
 
61
61
  assert_eq!(result.mime_type, "text/plain");
62
62
  assert!(!result.content.is_empty(), "Should extract content");
@@ -66,7 +66,7 @@ async fn test_extract_file_with_mime_override() {
66
66
  /// Test extraction of multiple file types.
67
67
  #[tokio::test]
68
68
  async fn test_extract_multiple_file_types() {
69
- let dir = tempdir().unwrap();
69
+ let dir = tempdir().expect("Operation failed");
70
70
  let config = ExtractionConfig::default();
71
71
 
72
72
  let test_files: Vec<(&str, &[u8], &str)> = vec![
@@ -80,9 +80,11 @@ async fn test_extract_multiple_file_types() {
80
80
 
81
81
  for (filename, content, expected_mime) in test_files {
82
82
  let file_path = dir.path().join(filename);
83
- fs::write(&file_path, content).unwrap();
83
+ fs::write(&file_path, content).expect("Operation failed");
84
84
 
85
- let result = extract_file(&file_path, None, &config).await.unwrap();
85
+ let result = extract_file(&file_path, None, &config)
86
+ .await
87
+ .expect("Async operation failed");
86
88
 
87
89
  assert_eq!(result.mime_type, expected_mime, "MIME type mismatch for {}", filename);
88
90
  assert!(
@@ -115,7 +117,7 @@ async fn test_extract_bytes_various_mime_types() {
115
117
  let result = extract_bytes(content, mime_type, &config).await;
116
118
  assert!(result.is_ok(), "Extract bytes failed for MIME type: {}", mime_type);
117
119
 
118
- let result = result.unwrap();
120
+ let result = result.expect("Operation failed");
119
121
 
120
122
  assert_eq!(result.mime_type, mime_type, "MIME type mismatch");
121
123
  assert!(
@@ -131,7 +133,7 @@ async fn test_extract_bytes_various_mime_types() {
131
133
  /// Test batch extraction with concurrent processing.
132
134
  #[tokio::test]
133
135
  async fn test_batch_extract_file_concurrency() {
134
- let dir = tempdir().unwrap();
136
+ let dir = tempdir().expect("Operation failed");
135
137
  let config = ExtractionConfig::default();
136
138
 
137
139
  let num_files = 10;
@@ -139,14 +141,14 @@ async fn test_batch_extract_file_concurrency() {
139
141
 
140
142
  for i in 0..num_files {
141
143
  let file_path = dir.path().join(format!("test_{}.txt", i));
142
- fs::write(&file_path, format!("Content {}", i)).unwrap();
144
+ fs::write(&file_path, format!("Content {}", i)).expect("Operation failed");
143
145
  paths.push(file_path);
144
146
  }
145
147
 
146
148
  let results = batch_extract_file(paths.clone(), &config).await;
147
149
  assert!(results.is_ok());
148
150
 
149
- let results = results.unwrap();
151
+ let results = results.expect("Operation failed");
150
152
  assert_eq!(results.len(), num_files);
151
153
 
152
154
  for (i, result) in results.iter().enumerate() {
@@ -169,7 +171,7 @@ async fn test_batch_extract_empty() {
169
171
 
170
172
  let results = batch_extract_file(paths, &config).await;
171
173
  assert!(results.is_ok());
172
- assert_eq!(results.unwrap().len(), 0);
174
+ assert_eq!(results.expect("Operation failed").len(), 0);
173
175
  }
174
176
 
175
177
  /// Test batch_extract_bytes with concurrent processing.
@@ -193,7 +195,7 @@ async fn test_batch_extract_bytes_concurrency() {
193
195
  let results = batch_extract_bytes(owned_contents, &config).await;
194
196
  assert!(results.is_ok());
195
197
 
196
- let results = results.unwrap();
198
+ let results = results.expect("Operation failed");
197
199
  assert_eq!(results.len(), 5);
198
200
 
199
201
  for (i, result) in results.iter().enumerate() {
@@ -214,28 +216,28 @@ async fn test_batch_extract_bytes_concurrency() {
214
216
  /// Test sync wrappers for extraction functions.
215
217
  #[test]
216
218
  fn test_sync_wrappers() {
217
- let dir = tempdir().unwrap();
219
+ let dir = tempdir().expect("Operation failed");
218
220
  let file_path = dir.path().join("sync_test.txt");
219
- fs::write(&file_path, "sync content").unwrap();
221
+ fs::write(&file_path, "sync content").expect("Operation failed");
220
222
 
221
223
  let config = ExtractionConfig::default();
222
224
 
223
225
  let result = extract_file_sync(&file_path, None, &config);
224
226
  assert!(result.is_ok(), "Sync file extraction should succeed");
225
- let extraction = result.unwrap();
227
+ let extraction = result.expect("Operation failed");
226
228
  assert_text_content(&extraction.content, "sync content");
227
229
  assert!(extraction.chunks.is_none(), "Chunks should be None");
228
230
 
229
231
  let result = extract_bytes_sync(b"test bytes", "text/plain", &config);
230
232
  assert!(result.is_ok(), "Sync bytes extraction should succeed");
231
- let extraction = result.unwrap();
233
+ let extraction = result.expect("Operation failed");
232
234
  assert_text_content(&extraction.content, "test bytes");
233
235
  assert!(extraction.chunks.is_none(), "Chunks should be None");
234
236
 
235
237
  let paths = vec![file_path];
236
238
  let results = batch_extract_file_sync(paths, &config);
237
239
  assert!(results.is_ok(), "Batch sync file should succeed");
238
- let results = results.unwrap();
240
+ let results = results.expect("Operation failed");
239
241
  assert_eq!(results.len(), 1);
240
242
  assert_text_content(&results[0].content, "sync content");
241
243
  assert!(results[0].chunks.is_none(), "Chunks should be None");
@@ -247,7 +249,7 @@ fn test_sync_wrappers() {
247
249
  .collect();
248
250
  let results = batch_extract_bytes_sync(owned_contents, &config);
249
251
  assert!(results.is_ok(), "Batch bytes sync should succeed");
250
- let results = results.unwrap();
252
+ let results = results.expect("Operation failed");
251
253
  assert_eq!(results.len(), 1);
252
254
  assert_text_content(&results[0].content, "test");
253
255
  assert!(results[0].chunks.is_none(), "Chunks should be None");
@@ -256,7 +258,7 @@ fn test_sync_wrappers() {
256
258
  /// Test MIME type detection for various extensions.
257
259
  #[test]
258
260
  fn test_mime_detection_comprehensive() {
259
- let dir = tempdir().unwrap();
261
+ let dir = tempdir().expect("Operation failed");
260
262
 
261
263
  let test_cases = vec![
262
264
  ("test.txt", "text/plain"),
@@ -287,9 +289,9 @@ fn test_mime_detection_comprehensive() {
287
289
 
288
290
  for (filename, expected_mime) in test_cases {
289
291
  let file_path = dir.path().join(filename);
290
- File::create(&file_path).unwrap();
292
+ File::create(&file_path).expect("Operation failed");
291
293
 
292
- let detected = detect_mime_type(&file_path, true).unwrap();
294
+ let detected = detect_mime_type(&file_path, true).expect("Operation failed");
293
295
  assert_eq!(detected, expected_mime, "Failed for {}", filename);
294
296
 
295
297
  let validated = validate_mime_type(&detected);
@@ -312,7 +314,7 @@ fn test_mime_validation() {
312
314
  /// Test case-insensitive extension handling.
313
315
  #[test]
314
316
  fn test_case_insensitive_extensions() {
315
- let dir = tempdir().unwrap();
317
+ let dir = tempdir().expect("Operation failed");
316
318
 
317
319
  let test_cases = vec![
318
320
  ("test.PDF", "application/pdf"),
@@ -326,9 +328,9 @@ fn test_case_insensitive_extensions() {
326
328
 
327
329
  for (filename, expected_mime) in test_cases {
328
330
  let file_path = dir.path().join(filename);
329
- File::create(&file_path).unwrap();
331
+ File::create(&file_path).expect("Operation failed");
330
332
 
331
- let detected = detect_mime_type(&file_path, true).unwrap();
333
+ let detected = detect_mime_type(&file_path, true).expect("Operation failed");
332
334
  assert_eq!(detected, expected_mime, "Failed for {}", filename);
333
335
  }
334
336
  }
@@ -336,7 +338,7 @@ fn test_case_insensitive_extensions() {
336
338
  /// Test config loading from TOML file.
337
339
  #[test]
338
340
  fn test_config_loading() {
339
- let dir = tempdir().unwrap();
341
+ let dir = tempdir().expect("Operation failed");
340
342
  let config_path = dir.path().join("kreuzberg.toml");
341
343
 
342
344
  fs::write(
@@ -355,19 +357,19 @@ max_chars = 2000
355
357
  max_overlap = 300
356
358
  "#,
357
359
  )
358
- .unwrap();
360
+ .expect("Operation failed");
359
361
 
360
- let config = ExtractionConfig::from_toml_file(&config_path).unwrap();
362
+ let config = ExtractionConfig::from_toml_file(&config_path).expect("Operation failed");
361
363
 
362
364
  assert!(!config.use_cache);
363
365
  assert!(config.enable_quality_processing);
364
366
  assert!(!config.force_ocr);
365
367
 
366
- let ocr_config = config.ocr.unwrap();
368
+ let ocr_config = config.ocr.expect("Operation failed");
367
369
  assert_eq!(ocr_config.backend, "tesseract");
368
370
  assert_eq!(ocr_config.language, "deu");
369
371
 
370
- let chunking_config = config.chunking.unwrap();
372
+ let chunking_config = config.chunking.expect("Operation failed");
371
373
  assert_eq!(chunking_config.max_chars, 2000);
372
374
  assert_eq!(chunking_config.max_overlap, 300);
373
375
  }
@@ -375,9 +377,9 @@ max_overlap = 300
375
377
  /// Test config discovery in parent directories.
376
378
  #[test]
377
379
  fn test_config_discovery() {
378
- let dir = tempdir().unwrap();
380
+ let dir = tempdir().expect("Operation failed");
379
381
  let subdir = dir.path().join("subdir");
380
- fs::create_dir(&subdir).unwrap();
382
+ fs::create_dir(&subdir).expect("Operation failed");
381
383
 
382
384
  let config_path = dir.path().join("kreuzberg.toml");
383
385
  fs::write(
@@ -387,16 +389,16 @@ use_cache = false
387
389
  enable_quality_processing = true
388
390
  "#,
389
391
  )
390
- .unwrap();
392
+ .expect("Operation failed");
391
393
 
392
- let original_dir = std::env::current_dir().unwrap();
393
- std::env::set_current_dir(&subdir).unwrap();
394
+ let original_dir = std::env::current_dir().expect("Operation failed");
395
+ std::env::set_current_dir(&subdir).expect("Operation failed");
394
396
 
395
- let config = ExtractionConfig::discover().unwrap();
397
+ let config = ExtractionConfig::discover().expect("Operation failed");
396
398
  assert!(config.is_some());
397
- assert!(!config.unwrap().use_cache);
399
+ assert!(!config.expect("Operation failed").use_cache);
398
400
 
399
- std::env::set_current_dir(original_dir).unwrap();
401
+ std::env::set_current_dir(original_dir).expect("Operation failed");
400
402
  }
401
403
 
402
404
  /// Test error handling for nonexistent files.
@@ -406,10 +408,8 @@ async fn test_nonexistent_file_error() {
406
408
  let result = extract_file("/nonexistent/file.txt", None, &config).await;
407
409
 
408
410
  assert!(result.is_err());
409
- assert!(matches!(
410
- result.unwrap_err(),
411
- kreuzberg::KreuzbergError::Validation { .. }
412
- ));
411
+ // File validation returns Io error for missing files (NotFound)
412
+ assert!(matches!(result.unwrap_err(), kreuzberg::KreuzbergError::Io(_)));
413
413
  }
414
414
 
415
415
  /// Test error handling for unsupported MIME types.
@@ -428,9 +428,9 @@ async fn test_unsupported_mime_type_error() {
428
428
  /// Test pipeline execution (currently stub, will be expanded in Phase 2).
429
429
  #[tokio::test]
430
430
  async fn test_pipeline_execution() {
431
- let dir = tempdir().unwrap();
431
+ let dir = tempdir().expect("Operation failed");
432
432
  let file_path = dir.path().join("pipeline_test.txt");
433
- fs::write(&file_path, "pipeline content").unwrap();
433
+ fs::write(&file_path, "pipeline content").expect("Operation failed");
434
434
 
435
435
  let config = ExtractionConfig {
436
436
  enable_quality_processing: true,
@@ -440,7 +440,7 @@ async fn test_pipeline_execution() {
440
440
  let result = extract_file(&file_path, None, &config).await;
441
441
  assert!(result.is_ok(), "Pipeline execution should succeed");
442
442
 
443
- let result = result.unwrap();
443
+ let result = result.expect("Operation failed");
444
444
  assert_text_content(&result.content, "pipeline content");
445
445
  assert_eq!(result.mime_type, "text/plain");
446
446
  assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
@@ -450,9 +450,9 @@ async fn test_pipeline_execution() {
450
450
  /// Test extraction with OCR config (placeholder test for Phase 2).
451
451
  #[tokio::test]
452
452
  async fn test_extraction_with_ocr_config() {
453
- let dir = tempdir().unwrap();
453
+ let dir = tempdir().expect("Operation failed");
454
454
  let file_path = dir.path().join("ocr_test.txt");
455
- fs::write(&file_path, "ocr content").unwrap();
455
+ fs::write(&file_path, "ocr content").expect("Operation failed");
456
456
 
457
457
  let config = ExtractionConfig {
458
458
  ocr: Some(kreuzberg::OcrConfig {
@@ -473,11 +473,11 @@ async fn test_extraction_with_ocr_config() {
473
473
  #[cfg(feature = "chunking")]
474
474
  #[tokio::test]
475
475
  async fn test_extraction_with_chunking_config() {
476
- let dir = tempdir().unwrap();
476
+ let dir = tempdir().expect("Operation failed");
477
477
  let file_path = dir.path().join("chunking_test.txt");
478
478
 
479
479
  let long_content = "content for chunking. ".repeat(100);
480
- fs::write(&file_path, &long_content).unwrap();
480
+ fs::write(&file_path, &long_content).expect("Operation failed");
481
481
 
482
482
  let config = ExtractionConfig {
483
483
  chunking: Some(kreuzberg::ChunkingConfig {
@@ -492,21 +492,21 @@ async fn test_extraction_with_chunking_config() {
492
492
  let result = extract_file(&file_path, None, &config).await;
493
493
  assert!(result.is_ok(), "Extraction with chunking should succeed");
494
494
 
495
- let result = result.unwrap();
495
+ let result = result.expect("Operation failed");
496
496
 
497
497
  assert!(
498
498
  result.chunks.is_some(),
499
499
  "Chunks should be populated when chunking enabled"
500
500
  );
501
501
 
502
- let chunks = result.chunks.unwrap();
502
+ let chunks = result.chunks.expect("Operation failed");
503
503
  assert!(chunks.len() > 1, "Should have multiple chunks for long content");
504
504
 
505
505
  assert!(result.metadata.additional.contains_key("chunk_count"));
506
- let chunk_count = result.metadata.additional.get("chunk_count").unwrap();
506
+ let chunk_count = result.metadata.additional.get("chunk_count").expect("Value not found");
507
507
  assert_eq!(
508
508
  chunks.len(),
509
- chunk_count.as_u64().unwrap() as usize,
509
+ chunk_count.as_u64().expect("Operation failed") as usize,
510
510
  "chunk_count should match chunks length"
511
511
  );
512
512