RubyGems - kreuzberg - Versions diffs - 4.1.1 → 4.2.0 - Mend

kreuzberg 4.1.1 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (88) hide show

checksums.yaml +4 -4
data/Gemfile.lock +4 -4
data/README.md +8 -5
data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
data/ext/kreuzberg_rb/native/src/config/types.rs +23 -13
data/kreuzberg.gemspec +14 -2
data/lib/kreuzberg/api_proxy.rb +0 -1
data/lib/kreuzberg/cli_proxy.rb +0 -1
data/lib/kreuzberg/config.rb +70 -35
data/lib/kreuzberg/mcp_proxy.rb +0 -1
data/lib/kreuzberg/version.rb +1 -1
data/sig/kreuzberg.rbs +5 -1
data/spec/binding/batch_operations_spec.rb +80 -0
data/spec/binding/metadata_types_spec.rb +77 -57
data/spec/serialization_spec.rb +134 -0
data/spec/unit/config/output_format_spec.rb +380 -0
data/vendor/Cargo.toml +1 -1
data/vendor/kreuzberg/Cargo.toml +3 -3
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/embeddings.rs +4 -4
data/vendor/kreuzberg/src/mcp/format.rs +237 -39
data/vendor/kreuzberg/src/mcp/params.rs +26 -33
data/vendor/kreuzberg/src/mcp/server.rs +6 -3
data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
data/vendor/kreuzberg/tests/api_embed.rs +84 -50
data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
data/vendor/kreuzberg/tests/api_tests.rs +298 -139
data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
data/vendor/kreuzberg/tests/config_features.rs +19 -15
data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
data/vendor/kreuzberg/tests/core_integration.rs +55 -53
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
data/vendor/kreuzberg/tests/email_integration.rs +7 -7
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/error_handling.rs +13 -11
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/page_markers.rs +1 -1
data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
data/vendor/kreuzberg/tests/security_validation.rs +20 -19
data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
data/vendor/kreuzberg-tesseract/build.rs +4 -4
data/vendor/kreuzberg-tesseract/src/lib.rs +6 -6
data/vendor/kreuzberg-tesseract/tests/integration_test.rs +3 -3
metadata +13 -2

data/vendor/kreuzberg/tests/mcp_integration.rs ADDED Viewed

@@ -0,0 +1,849 @@
+//! MCP integration tests for API consistency and breaking changes.
+//!
+//! This test suite validates that:
+//! 1. MCP parameters properly handle extraction configuration
+//! 2. MCP parameter deserialization is consistent
+//! 3. Various config combinations work correctly
+//! 4. End-to-end MCP tool invocations work with real data
+//! 5. Error handling is consistent across MCP tools
+//!
+//! Note: These tests verify the parameter structures used by MCP.
+//! The build_config function in the MCP server should accept
+//! a config JSON field instead of separate enable_ocr/force_ocr flags
+//! to align with the new API consistency approach.
+#![allow(clippy::bool_assert_comparison)]
+#![allow(clippy::field_reassign_with_default)]
+use serde_json::json;
+/// Test that parameter structures can handle various JSON configurations
+#[test]
+fn test_extraction_config_parameter_structure() {
+    // This demonstrates the new approach: config JSON instead of separate flags
+    let config_json = json!({
+        "use_cache": true,
+        "force_ocr": true,
+        "output_format": "markdown",
+    });
+    let config: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(config_json).expect("Failed to parse config");
+    assert_eq!(config.use_cache, true);
+    assert_eq!(config.force_ocr, true);
+    assert_eq!(config.output_format, kreuzberg::core::config::OutputFormat::Markdown);
+}
+#[test]
+fn test_mcp_style_params_with_config() {
+    // This demonstrates how MCP params should accept full config JSON
+    let mcp_request = json!({
+        "path": "/test.pdf",
+        "mime_type": "application/pdf",
+        "config": {
+            "use_cache": false,
+            "force_ocr": true,
+            "output_format": "markdown",
+        }
+    });
+    // The config field should be parseable as ExtractionConfig
+    let config_obj = mcp_request.get("config").expect("Should have config field");
+    let config: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(config_obj.clone()).expect("Failed to parse config");
+    assert_eq!(config.force_ocr, true);
+    assert_eq!(config.use_cache, false);
+}
+#[test]
+fn test_mcp_params_backward_compatibility_minimal() {
+    // Minimal MCP params structure
+    let params = json!({
+        "path": "/test.pdf",
+    });
+    // Should be deserializable
+    let path = params.get("path").expect("Should have path");
+    assert_eq!(path, "/test.pdf");
+}
+#[test]
+fn test_mcp_params_with_all_fields() {
+    // Complete MCP params with config
+    let params = json!({
+        "path": "/test.pdf",
+        "mime_type": "application/pdf",
+        "config": {
+            "use_cache": true,
+            "enable_quality_processing": true,
+            "force_ocr": false,
+            "output_format": "plain",
+        }
+    });
+    // Extract and validate config
+    if let Some(config_obj) = params.get("config") {
+        let config: kreuzberg::core::config::ExtractionConfig =
+            serde_json::from_value(config_obj.clone()).expect("Failed to parse");
+        assert_eq!(config.use_cache, true);
+        assert_eq!(config.force_ocr, false);
+        assert_eq!(config.output_format, kreuzberg::core::config::OutputFormat::Plain);
+    }
+}
+#[test]
+fn test_batch_extraction_params_structure() {
+    // Batch extraction params with paths and config
+    let batch_params = json!({
+        "paths": ["/file1.pdf", "/file2.pdf", "/file3.pdf"],
+        "config": {
+            "force_ocr": true,
+            "max_concurrent_extractions": 4,
+        }
+    });
+    let paths = batch_params.get("paths").expect("Should have paths");
+    assert!(paths.is_array(), "paths field should be an array");
+    let path_array = paths.as_array().expect("paths should be deserializable as array");
+    assert_eq!(path_array.len(), 3, "paths array should contain exactly 3 elements");
+    if let Some(config_obj) = batch_params.get("config") {
+        let config: kreuzberg::core::config::ExtractionConfig =
+            serde_json::from_value(config_obj.clone()).expect("Failed to parse");
+        assert_eq!(config.force_ocr, true);
+        assert_eq!(config.max_concurrent_extractions, Some(4));
+    }
+}
+#[test]
+fn test_config_merge_in_mcp_context() {
+    // Test 1: Verify default config baseline
+    let default_config = kreuzberg::core::config::ExtractionConfig::default();
+    assert_eq!(default_config.use_cache, true, "Default cache should be enabled");
+    assert_eq!(default_config.force_ocr, false, "Default force_ocr should be false");
+    assert_eq!(
+        default_config.output_format,
+        kreuzberg::core::config::OutputFormat::Plain,
+        "Default output format should be Plain"
+    );
+    // Test 2: Request provides single field override - verify precedence
+    let request_config_json = json!({
+        "force_ocr": true,
+    });
+    let request_config: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(request_config_json).expect("Failed to parse request config");
+    // Request config should override that field
+    assert_eq!(request_config.force_ocr, true, "Request force_ocr should be true");
+    // But unspecified fields should use defaults
+    assert_eq!(
+        request_config.use_cache, true,
+        "Unspecified use_cache should default to true"
+    );
+    assert_eq!(
+        request_config.output_format,
+        kreuzberg::core::config::OutputFormat::Plain,
+        "Unspecified output_format should default to Plain"
+    );
+    // Test 3: Multiple field overrides - verify precedence chain
+    let multi_override_json = json!({
+        "use_cache": false,
+        "force_ocr": true,
+        "output_format": "markdown",
+    });
+    let multi_config: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(multi_override_json).expect("Failed to parse multi-field config");
+    // All specified fields should override defaults
+    assert_eq!(multi_config.use_cache, false, "Override use_cache should be false");
+    assert_eq!(multi_config.force_ocr, true, "Override force_ocr should be true");
+    assert_eq!(
+        multi_config.output_format,
+        kreuzberg::core::config::OutputFormat::Markdown,
+        "Override output_format should be Markdown"
+    );
+    // Unspecified numeric fields should still have defaults
+    if let Some(max_conc) = multi_config.max_concurrent_extractions {
+        panic!(
+            "max_concurrent_extractions should not be specified when not in request, got: {}",
+            max_conc
+        );
+    }
+    // Test 4: Verify config can be fully constructed with all fields
+    let full_json = json!({
+        "use_cache": false,
+        "enable_quality_processing": true,
+        "force_ocr": true,
+        "output_format": "html",
+        "max_concurrent_extractions": 8,
+    });
+    let full_config: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(full_json).expect("Failed to parse full config");
+    assert_eq!(full_config.use_cache, false, "Full config use_cache should be false");
+    assert_eq!(
+        full_config.enable_quality_processing, true,
+        "Full config quality processing should be true"
+    );
+    assert_eq!(full_config.force_ocr, true, "Full config force_ocr should be true");
+    assert_eq!(
+        full_config.output_format,
+        kreuzberg::core::config::OutputFormat::Html,
+        "Full config output_format should be Html"
+    );
+    assert_eq!(
+        full_config.max_concurrent_extractions,
+        Some(8),
+        "Full config max_concurrent should be 8"
+    );
+}
+#[test]
+fn test_config_json_flexibility() {
+    // Config JSON can have any combination of fields
+    let configs = vec![
+        json!({}),                                                             // Empty = all defaults
+        json!({"force_ocr": true}),                                            // Single field
+        json!({"force_ocr": true, "use_cache": false}),                        // Multiple fields
+        json!({"output_format": "markdown", "max_concurrent_extractions": 8}), // Various types
+    ];
+    for config_json in configs {
+        let config: Result<kreuzberg::core::config::ExtractionConfig, _> = serde_json::from_value(config_json);
+        assert!(config.is_ok(), "Config should deserialize successfully");
+    }
+}
+#[test]
+fn test_extraction_config_serialization_for_mcp() {
+    // MCP should be able to serialize config back to JSON
+    let mut config = kreuzberg::core::config::ExtractionConfig::default();
+    config.force_ocr = true;
+    config.output_format = kreuzberg::core::config::OutputFormat::Markdown;
+    let json = serde_json::to_value(&config).expect("Failed to serialize");
+    // Verify it round-trips
+    let restored: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(json).expect("Failed to deserialize");
+    assert_eq!(config.force_ocr, restored.force_ocr);
+    assert_eq!(config.output_format, restored.output_format);
+}
+// ============================================================================
+// E2E TEST CASES
+// ============================================================================
+/// Test MCP config with all options enabled
+#[test]
+fn test_mcp_config_full_extraction() {
+    let config_json = json!({
+        "use_cache": false,
+        "enable_quality_processing": true,
+        "force_ocr": false,
+        "output_format": "markdown",
+        "max_concurrent_extractions": 4,
+    });
+    let config: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(config_json).expect("Failed to parse full config");
+    // Verify all fields deserialized correctly
+    assert_eq!(config.use_cache, false);
+    assert_eq!(config.enable_quality_processing, true);
+    assert_eq!(config.force_ocr, false);
+    assert_eq!(config.output_format, kreuzberg::core::config::OutputFormat::Markdown);
+    assert_eq!(config.max_concurrent_extractions, Some(4));
+}
+/// Test MCP config with markdown output format
+#[test]
+fn test_mcp_config_output_format_markdown() {
+    let config_json = json!({
+        "output_format": "markdown",
+    });
+    let config: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(config_json).expect("Failed to parse markdown config");
+    assert_eq!(config.output_format, kreuzberg::core::config::OutputFormat::Markdown);
+}
+/// Test MCP config with element-based result structure
+#[test]
+fn test_mcp_config_result_format_element_based() {
+    let config_json = json!({
+        "output_format": "markdown",
+        "use_cache": true,
+        "enable_quality_processing": true,
+    });
+    let config: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(config_json).expect("Failed to parse element format");
+    assert_eq!(config.output_format, kreuzberg::core::config::OutputFormat::Markdown);
+    assert_eq!(config.use_cache, true);
+    assert_eq!(config.enable_quality_processing, true);
+}
+/// Test batch extraction with config applied to all files
+#[test]
+fn test_mcp_batch_with_config() {
+    let batch_request = json!({
+        "paths": ["/file1.txt", "/file2.txt", "/file3.txt"],
+        "config": {
+            "force_ocr": true,
+            "output_format": "plain",
+            "max_concurrent_extractions": 2,
+        }
+    });
+    // Verify paths are array
+    let paths = batch_request.get("paths").expect("Should have paths");
+    assert!(paths.is_array(), "paths field should be an array");
+    let path_array = paths.as_array().expect("paths should be deserializable as array");
+    assert_eq!(path_array.len(), 3, "paths array should contain exactly 3 elements");
+    // Verify config applies to batch
+    let config_obj = batch_request.get("config").expect("Should have config");
+    let config: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(config_obj.clone()).expect("Failed to parse batch config");
+    assert_eq!(config.force_ocr, true);
+    assert_eq!(config.output_format, kreuzberg::core::config::OutputFormat::Plain);
+    assert_eq!(config.max_concurrent_extractions, Some(2));
+}
+/// Test MCP error handling with invalid JSON config
+#[test]
+fn test_mcp_invalid_config_json_error() {
+    let invalid_config = "not a valid json object";
+    let result: Result<kreuzberg::core::config::ExtractionConfig, _> = serde_json::from_str(invalid_config);
+    assert!(result.is_err(), "Invalid JSON should produce error");
+}
+/// Test that MCP config field precedence is correct
+#[test]
+fn test_mcp_config_overrides() {
+    // Simulate MCP request with inline config
+    let mcp_params = json!({
+        "path": "/document.pdf",
+        "mime_type": "application/pdf",
+        "config": {
+            "force_ocr": true,
+            "use_cache": false,
+            "output_format": "markdown",
+        }
+    });
+    if let Some(config_obj) = mcp_params.get("config") {
+        let parsed_config: kreuzberg::core::config::ExtractionConfig =
+            serde_json::from_value(config_obj.clone()).expect("Failed to parse");
+        // Verify request config overrides defaults
+        assert_eq!(parsed_config.force_ocr, true);
+        assert_eq!(parsed_config.use_cache, false);
+        assert_eq!(
+            parsed_config.output_format,
+            kreuzberg::core::config::OutputFormat::Markdown
+        );
+    }
+}
+/// Test that deprecated parameters (enable_ocr, force_ocr as separate fields) are rejected
+#[test]
+fn test_mcp_no_deprecated_params() {
+    // This simulates MCP params that incorrectly use separate flags
+    let deprecated_params = json!({
+        "path": "/document.pdf",
+        "enable_ocr": true,  // deprecated!
+        "force_ocr": true,    // should be in config
+    });
+    // The correct approach: config field contains all options
+    let correct_params = json!({
+        "path": "/document.pdf",
+        "config": {
+            "force_ocr": true,
+        }
+    });
+    // Extract and verify correct params
+    if let Some(config_obj) = correct_params.get("config") {
+        let config: kreuzberg::core::config::ExtractionConfig =
+            serde_json::from_value(config_obj.clone()).expect("Failed to parse");
+        assert_eq!(config.force_ocr, true);
+    }
+    // Verify deprecated params are NOT in the correct structure
+    assert!(
+        deprecated_params.get("config").is_none(),
+        "Deprecated params should not be in config"
+    );
+}
+/// End-to-end test with real text extraction
+#[tokio::test]
+async fn test_mcp_real_pdf_extraction() {
+    // Create a simple test document in bytes
+    let test_content = b"Hello, MCP!";
+    // Create MCP request structure
+    let mcp_request = json!({
+        "mime_type": "text/plain",
+        "config": {
+            "output_format": "plain",
+            "use_cache": false,
+        }
+    });
+    // Extract config from request
+    if let Some(config_obj) = mcp_request.get("config") {
+        let config: kreuzberg::core::config::ExtractionConfig =
+            serde_json::from_value(config_obj.clone()).expect("Failed to parse config");
+        // Use async extract_bytes to process content
+        let result = kreuzberg::extract_bytes(test_content, "text/plain", &config)
+            .await
+            .expect("Extraction should succeed");
+        // Verify result has content
+        assert!(!result.content.is_empty());
+        assert!(result.content.contains("MCP") || result.content.contains("Hello"));
+    }
+}
+/// Test MCP batch extraction with mixed formats
+#[test]
+fn test_mcp_batch_mixed_formats() {
+    let batch_config = json!({
+        "files": [
+            {
+                "path": "/document.pdf",
+                "mime_type": "application/pdf",
+            },
+            {
+                "path": "/document.docx",
+                "mime_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            },
+            {
+                "path": "/document.txt",
+                "mime_type": "text/plain",
+            }
+        ],
+        "config": {
+            "output_format": "markdown",
+            "force_ocr": false,
+        }
+    });
+    let files = batch_config.get("files").expect("Should have files");
+    assert!(files.is_array(), "files field should be an array");
+    let file_array = files.as_array().expect("files should be deserializable as array");
+    assert_eq!(file_array.len(), 3, "files array should contain exactly 3 elements");
+    if let Some(config_obj) = batch_config.get("config") {
+        let config: kreuzberg::core::config::ExtractionConfig =
+            serde_json::from_value(config_obj.clone()).expect("Failed to parse batch config");
+        assert_eq!(config.output_format, kreuzberg::core::config::OutputFormat::Markdown);
+        assert_eq!(config.force_ocr, false);
+    }
+}
+/// Test MCP request with minimal config (all defaults)
+#[test]
+fn test_mcp_minimal_config() {
+    let minimal_request = json!({
+        "path": "/document.pdf",
+    });
+    // Path should exist and be correct
+    assert_eq!(
+        minimal_request.get("path"),
+        Some(&serde_json::Value::String("/document.pdf".to_string())),
+        "Path field should be present and set to /document.pdf"
+    );
+    // If no config, use defaults
+    let config = match minimal_request.get("config") {
+        Some(config_obj) => {
+            serde_json::from_value(config_obj.clone()).expect("Failed to parse config from minimal request")
+        }
+        None => kreuzberg::core::config::ExtractionConfig::default(),
+    };
+    // Verify defaults are applied
+    assert_eq!(config.use_cache, true);
+    assert_eq!(config.output_format, kreuzberg::core::config::OutputFormat::Plain);
+}
+/// Test MCP config with all output formats
+#[test]
+fn test_mcp_all_output_formats() {
+    let formats = vec!["plain", "markdown", "html"];
+    for format_str in formats {
+        let config_json = json!({
+            "output_format": format_str,
+        });
+        let config: kreuzberg::core::config::ExtractionConfig =
+            serde_json::from_value(config_json).expect("Failed to parse output format config");
+        // Verify format was set
+        let format_display = format!("{}", config.output_format);
+        assert_eq!(format_display, format_str);
+    }
+}
+/// Test MCP concurrent extraction config
+#[test]
+fn test_mcp_concurrent_extraction_config() {
+    let concurrent_configs = vec![1, 2, 4, 8, 16];
+    for max_concurrent in concurrent_configs {
+        let config_json = json!({
+            "max_concurrent_extractions": max_concurrent,
+        });
+        let config: kreuzberg::core::config::ExtractionConfig =
+            serde_json::from_value(config_json).expect("Failed to parse concurrent config");
+        assert_eq!(config.max_concurrent_extractions, Some(max_concurrent));
+    }
+}
+/// Test MCP config with cache disabled
+#[test]
+fn test_mcp_cache_disabled_config() {
+    let config_json = json!({
+        "use_cache": false,
+        "force_ocr": true,
+    });
+    let config: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(config_json).expect("Failed to parse cache config");
+    assert_eq!(config.use_cache, false);
+    assert_eq!(config.force_ocr, true);
+}
+/// Test MCP config round-trip serialization
+#[test]
+fn test_mcp_config_round_trip_serialization() {
+    let original_config = kreuzberg::core::config::ExtractionConfig {
+        use_cache: false,
+        enable_quality_processing: true,
+        force_ocr: true,
+        output_format: kreuzberg::core::config::OutputFormat::Markdown,
+        max_concurrent_extractions: Some(4),
+        ..Default::default()
+    };
+    // Serialize to JSON
+    let json_value = serde_json::to_value(&original_config).expect("Failed to serialize");
+    // Deserialize back
+    let restored_config: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(json_value).expect("Failed to deserialize");
+    // Verify round-trip
+    assert_eq!(original_config.use_cache, restored_config.use_cache);
+    assert_eq!(
+        original_config.enable_quality_processing,
+        restored_config.enable_quality_processing
+    );
+    assert_eq!(original_config.force_ocr, restored_config.force_ocr);
+    assert_eq!(original_config.output_format, restored_config.output_format);
+    assert_eq!(
+        original_config.max_concurrent_extractions,
+        restored_config.max_concurrent_extractions
+    );
+}
+/// Test MCP tool invocation with extract_bytes semantics
+#[tokio::test]
+async fn test_mcp_tool_extract_bytes_semantics() {
+    let test_bytes = b"Test content for MCP extraction";
+    let mime_type = "text/plain";
+    let config_json = json!({
+        "output_format": "plain",
+    });
+    let config: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(config_json).expect("Failed to parse config");
+    // Simulate MCP tool: extract_bytes
+    let result = kreuzberg::extract_bytes(test_bytes, mime_type, &config)
+        .await
+        .expect("Extraction should succeed");
+    assert!(!result.content.is_empty());
+    assert!(result.mime_type.contains("text"));
+}
+/// Test MCP tool invocation with file path semantics
+#[test]
+fn test_mcp_tool_extract_file_semantics() {
+    // Create temporary test file
+    let test_dir = tempfile::tempdir().expect("Failed to create temp dir");
+    let test_file = test_dir.path().join("test.txt");
+    std::fs::write(&test_file, b"Test content").expect("Failed to write test file");
+    let config_json = json!({
+        "output_format": "plain",
+    });
+    let config: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(config_json).expect("Failed to parse config");
+    // Simulate MCP tool: extract_file (sync)
+    if test_file.exists() {
+        let file_path = test_file.to_str().expect("test_file path should be valid UTF-8");
+        let result = kreuzberg::extract_file_sync(file_path, None, &config).expect("Extraction should succeed");
+        assert!(!result.content.is_empty());
+    }
+}
+/// Test MCP batch extraction semantics
+#[tokio::test]
+async fn test_mcp_batch_extraction_semantics() {
+    let test_bytes_1 = b"Content 1";
+    let test_bytes_2 = b"Content 2";
+    let mime_type = "text/plain";
+    let config_json = json!({
+        "output_format": "plain",
+    });
+    let config: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(config_json).expect("Failed to parse config");
+    // Simulate MCP batch tool: batch_extract_bytes
+    let test_data = vec![
+        (test_bytes_1.to_vec(), mime_type.to_string()),
+        (test_bytes_2.to_vec(), mime_type.to_string()),
+    ];
+    // Extract each item
+    for (bytes, mime) in test_data {
+        let result = kreuzberg::extract_bytes(&bytes, &mime, &config)
+            .await
+            .expect("Batch extraction should succeed");
+        assert!(!result.content.is_empty());
+    }
+}
+/// Test MCP error cases with invalid configurations
+#[test]
+fn test_mcp_error_invalid_format_field() {
+    let invalid_config = json!({
+        "output_format": "invalid_format_that_does_not_exist",
+    });
+    let result: Result<kreuzberg::core::config::ExtractionConfig, _> = serde_json::from_value(invalid_config);
+    // This should fail during deserialization
+    assert!(result.is_err());
+}
+/// Test MCP parameter validation with zero concurrent count
+#[test]
+fn test_mcp_validate_zero_concurrent() {
+    // Zero values should be accepted by serde, but MCP validation should flag
+    let config_json = json!({
+        "max_concurrent_extractions": 0,
+    });
+    let config: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(config_json).expect("Failed to parse");
+    // The config accepted the value; MCP server should validate semantically
+    assert_eq!(config.max_concurrent_extractions, Some(0));
+}
+/// Test MCP tool with empty batch
+#[test]
+fn test_mcp_empty_batch_handling() {
+    let empty_batch = json!({
+        "paths": [],
+        "config": {
+            "output_format": "plain",
+        }
+    });
+    let paths = empty_batch.get("paths").expect("Should have paths");
+    assert!(paths.is_array(), "paths field should be an array");
+    let path_array = paths.as_array().expect("paths should be deserializable as array");
+    assert_eq!(path_array.len(), 0, "paths array should be empty");
+}
+/// Test MCP parameter extraction with nested config
+#[test]
+fn test_mcp_nested_config_extraction() {
+    let nested_request = json!({
+        "tool": "extract_file",
+        "parameters": {
+            "path": "/document.pdf",
+            "config": {
+                "output_format": "markdown",
+                "force_ocr": true,
+            }
+        }
+    });
+    if let Some(params) = nested_request.get("parameters")
+        && let Some(config_obj) = params.get("config")
+    {
+        let config: kreuzberg::core::config::ExtractionConfig =
+            serde_json::from_value(config_obj.clone()).expect("Failed to parse nested config");
+        assert_eq!(config.output_format, kreuzberg::core::config::OutputFormat::Markdown);
+        assert_eq!(config.force_ocr, true);
+    }
+}
+/// Test MCP HTML output format
+#[test]
+fn test_mcp_html_output_format() {
+    let config_json = json!({
+        "output_format": "html",
+    });
+    let config: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(config_json).expect("Failed to parse HTML config");
+    assert_eq!(config.output_format, kreuzberg::core::config::OutputFormat::Html);
+}
+/// Test MCP config with all boolean combinations
+#[test]
+fn test_mcp_boolean_combinations() {
+    let combinations = vec![(true, true), (true, false), (false, true), (false, false)];
+    for (use_cache, quality_processing) in combinations {
+        let config_json = json!({
+            "use_cache": use_cache,
+            "enable_quality_processing": quality_processing,
+        });
+        let config: kreuzberg::core::config::ExtractionConfig =
+            serde_json::from_value(config_json).expect("Failed to parse config");
+        assert_eq!(config.use_cache, use_cache);
+        assert_eq!(config.enable_quality_processing, quality_processing);
+    }
+}
+/// Test MCP response structure with extraction result
+#[test]
+fn test_mcp_response_structure_validation() {
+    let mcp_response = json!({
+        "status": "success",
+        "data": {
+            "content": "Extracted text",
+            "mime_type": "text/plain",
+            "metadata": {
+                "source": "test",
+                "extracted_at": "2024-01-25",
+            }
+        }
+    });
+    assert_eq!(
+        mcp_response.get("status").expect("status field should exist"),
+        "success"
+    );
+    assert!(
+        mcp_response.get("data").is_some(),
+        "data field should be present in MCP response"
+    );
+}
+/// Test MCP request/response roundtrip with config
+#[test]
+fn test_mcp_request_response_roundtrip() {
+    let original_config = json!({
+        "use_cache": false,
+        "force_ocr": true,
+        "output_format": "markdown",
+        "max_concurrent_extractions": 4,
+    });
+    // Simulate sending to MCP and getting back
+    let config: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(original_config.clone()).expect("Failed to parse");
+    // Serialize back
+    let response_config = serde_json::to_value(&config).expect("Failed to serialize");
+    // Verify it matches
+    assert_eq!(original_config.get("use_cache"), response_config.get("use_cache"));
+    assert_eq!(original_config.get("force_ocr"), response_config.get("force_ocr"));
+    assert_eq!(
+        original_config.get("output_format"),
+        response_config.get("output_format")
+    );
+}
+/// Test MCP config with partial updates
+#[test]
+fn test_mcp_config_partial_updates() {
+    let mut base_config = kreuzberg::core::config::ExtractionConfig::default();
+    base_config.use_cache = true;
+    base_config.force_ocr = false;
+    // Partial update
+    let update_json = json!({
+        "force_ocr": true,
+    });
+    let update_config: kreuzberg::core::config::ExtractionConfig =
+        serde_json::from_value(update_json).expect("Failed to parse update");
+    // In MCP, updates replace config completely
+    let updated = update_config;
+    // New config has update applied
+    assert_eq!(updated.force_ocr, true);
+    // But other fields revert to defaults (not merged)
+    assert_eq!(updated.use_cache, true);
+}
+/// Test MCP API consistency for all formats
+#[test]
+fn test_mcp_api_consistency_all_formats() {
+    let formats = vec!["plain", "markdown", "html"];
+    for format_str in formats {
+        let config = json!({
+            "output_format": format_str,
+        });
+        let parsed: kreuzberg::core::config::ExtractionConfig =
+            serde_json::from_value(config).expect("Failed to parse");
+        // Verify format is consistent
+        let serialized = serde_json::to_value(&parsed).expect("Failed to serialize");
+        let reserialized: kreuzberg::core::config::ExtractionConfig =
+            serde_json::from_value(serialized).expect("Failed to deserialize");
+        let original_format = format!("{}", parsed.output_format);
+        let restored_format = format!("{}", reserialized.output_format);
+        assert_eq!(original_format, restored_format);
+    }
+}