RubyGems - kreuzberg - Versions diffs - 4.2.8 → 4.2.9 - Mend

kreuzberg 4.2.8 → 4.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

checksums.yaml +4 -4
data/Gemfile.lock +2 -2
data/README.md +1 -1
data/lib/kreuzberg/version.rb +1 -1
data/vendor/Cargo.toml +1 -1
data/vendor/kreuzberg/Cargo.toml +1 -1
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +5 -3
data/vendor/kreuzberg/src/extractors/pdf/mod.rs +168 -5
data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +27 -0
data/vendor/kreuzberg/src/mcp/params.rs +0 -16
data/vendor/kreuzberg/src/mcp/server.rs +29 -24
data/vendor/kreuzberg/src/mcp/tools/extraction.rs +21 -43
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2a1e253ec8ac69b4394b7f68cd0d9d2050e66885139219a664ee4cff4f7e4c2f
-  data.tar.gz: f6a6998344418328b89aa4a220a080b2708d0540ceb643a12e815fe5040c088b
+  metadata.gz: 1bdd32141526f545868c567acbc8e3a7caf94b4ff7e42bebf859fe33416669e4
+  data.tar.gz: 10da5a6da3a781b9676ba1213a535a69edde90b89ccad45489fab9fb593f5f73
 SHA512:
-  metadata.gz: c3b982490411f182ee4567e2c692c9cfc9d0f1a670db081d73ea20d707c4e8fb26a12da1e077f405cea1c85f219b3971c00ecee2ffc03bce740cfd0bbe2c65ec
-  data.tar.gz: c93cf5fb1319aac459129d5266abd02d46731574ddd741edb357ae30f1bb2a42d75eac0e5ff0dd4af8b26dc0fc0da646a1854438c0467ef299d8156f87e5d4c5
+  metadata.gz: e45428f1c646ed0683f51fa932c2432b0563d3258912fbe7b49f75acf0cdbc43c844c92b17cf7d4a5ddccb0b010d23cce4b20de950877fbe64ecafb858312bc5
+  data.tar.gz: f0abcd49fe46a4f0e3e2bf80e217ff36970b4a6037ecec6ea889230605a83178d76bff31d0960d50fb2ad4e1ea6f703c595bd43c244ff0e082ab365eb86bf02a

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    kreuzberg (4.2.8)
+    kreuzberg (4.2.9)
       rb_sys (~> 0.9.119)
 GEM
@@ -209,7 +209,7 @@ CHECKSUMS
   i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
   io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
   json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
-  kreuzberg (4.2.8)
+  kreuzberg (4.2.9)
   language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
   lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
   listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2

data/README.md CHANGED Viewed

@@ -22,7 +22,7 @@
     <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
   </a>
   <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
-    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.8" alt="Go">
+    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.9" alt="Go">
   </a>
   <a href="https://www.nuget.org/packages/Kreuzberg/">
     <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">

data/lib/kreuzberg/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Kreuzberg
-  VERSION = '4.2.8'
+  VERSION = '4.2.9'
 end

data/vendor/Cargo.toml CHANGED Viewed

@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
 resolver = "2"
 [workspace.package]
-version = "4.2.8"
+version = "4.2.9"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

data/vendor/kreuzberg/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg"
-version = "4.2.8"
+version = "4.2.9"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

data/vendor/kreuzberg/README.md CHANGED Viewed

@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
 This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
-> **🚀 Version 4.2.8 Release**
+> **🚀 Version 4.2.9 Release**
 > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
 >
 > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.

data/vendor/kreuzberg/src/extractors/pdf/extraction.rs CHANGED Viewed

@@ -4,7 +4,7 @@
 use crate::Result;
 use crate::core::config::ExtractionConfig;
-use crate::types::PageContent;
+use crate::types::{PageBoundary, PageContent};
 #[cfg(feature = "pdf")]
 use crate::types::Table;
@@ -17,6 +17,7 @@ pub(crate) type PdfExtractionPhaseResult = (
     String,
     Vec<Table>,
     Option<Vec<PageContent>>,
+    Option<Vec<PageBoundary>>,
 );
 /// Extract text, metadata, and tables from a PDF document using a single shared instance.
@@ -41,17 +42,18 @@ pub(crate) type PdfExtractionPhaseResult = (
 /// - Native extracted text (or empty if using OCR)
 /// - Extracted tables (if OCR feature enabled)
 /// - Per-page content (if page extraction configured)
+/// - Page boundaries for per-page OCR evaluation
 #[cfg(feature = "pdf")]
 pub(crate) fn extract_all_from_document(
     document: &PdfDocument,
     config: &ExtractionConfig,
 ) -> Result<PdfExtractionPhaseResult> {
-    let (native_text, _boundaries, page_contents, pdf_metadata) =
+    let (native_text, boundaries, page_contents, pdf_metadata) =
         crate::pdf::text::extract_text_and_metadata_from_pdf_document(document, Some(config))?;
     let tables = extract_tables_from_document(document, &pdf_metadata)?;
-    Ok((pdf_metadata, native_text, tables, page_contents))
+    Ok((pdf_metadata, native_text, tables, page_contents, boundaries))
 }
 /// Extract tables from PDF document using native text positions.

data/vendor/kreuzberg/src/extractors/pdf/mod.rs CHANGED Viewed

@@ -22,7 +22,7 @@ use crate::pdf::error::PdfError;
 // Re-export for backward compatibility
 #[cfg(feature = "ocr")]
-pub use ocr::{NativeTextStats, OcrFallbackDecision, evaluate_native_text_for_ocr};
+pub use ocr::{NativeTextStats, OcrFallbackDecision, evaluate_native_text_for_ocr, evaluate_per_page_ocr};
 use extraction::extract_all_from_document;
 #[cfg(feature = "ocr")]
@@ -78,7 +78,7 @@ impl DocumentExtractor for PdfExtractor {
         config: &ExtractionConfig,
     ) -> Result<ExtractionResult> {
         #[cfg(feature = "pdf")]
-        let (pdf_metadata, native_text, tables, page_contents) = {
+        let (pdf_metadata, native_text, tables, page_contents, _boundaries) = {
             #[cfg(target_arch = "wasm32")]
             {
                 let pdfium = crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")
@@ -128,7 +128,7 @@ impl DocumentExtractor for PdfExtractor {
                             }
                         })?;
-                        let (pdf_metadata, native_text, tables, page_contents) =
+                        let (pdf_metadata, native_text, tables, page_contents, _boundaries) =
                             extract_all_from_document(&document, &config_owned)?;
                         if let Some(page_cfg) = config_owned.pages.as_ref()
@@ -142,7 +142,13 @@ impl DocumentExtractor for PdfExtractor {
                             .into());
                         }
-                        Ok::<_, crate::error::KreuzbergError>((pdf_metadata, native_text, tables, page_contents))
+                        Ok::<_, crate::error::KreuzbergError>((
+                            pdf_metadata,
+                            native_text,
+                            tables,
+                            page_contents,
+                            _boundaries,
+                        ))
                     })
                     .await
                     .map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
@@ -188,7 +194,11 @@ impl DocumentExtractor for PdfExtractor {
                 native_text
             }
         } else if config.ocr.is_some() {
-            let decision = ocr::evaluate_native_text_for_ocr(&native_text, None);
+            let decision = ocr::evaluate_per_page_ocr(
+                &native_text,
+                _boundaries.as_deref(),
+                pdf_metadata.pdf_specific.page_count,
+            );
             if std::env::var("KREUZBERG_DEBUG_OCR").is_ok() {
                 eprintln!(
@@ -365,6 +375,159 @@ mod tests {
         assert!(ocr::evaluate_native_text_for_ocr(sample, Some(2)).fallback);
     }
+    /// With no page boundaries supplied, word-rich text must not request OCR fallback.
+    #[cfg(feature = "ocr")]
+    #[test]
+    fn test_per_page_ocr_no_boundaries_falls_back_to_whole_doc() {
+        let native = "This document has enough meaningful words for evaluation purposes here.";
+        let needs_ocr = ocr::evaluate_per_page_ocr(native, None, Some(1)).fallback;
+        assert!(!needs_ocr);
+    }
+    /// An empty boundary slice is handled like missing boundaries: no OCR fallback for word-rich text.
+    #[cfg(feature = "ocr")]
+    #[test]
+    fn test_per_page_ocr_empty_boundaries_falls_back_to_whole_doc() {
+        let native = "This document has enough meaningful words for evaluation purposes here.";
+        let needs_ocr = ocr::evaluate_per_page_ocr(native, Some(&[]), Some(1)).fallback;
+        assert!(!needs_ocr);
+    }
+    /// Two pages that both carry plenty of text must not trigger OCR fallback.
+    #[cfg(feature = "ocr")]
+    #[test]
+    fn test_per_page_ocr_all_pages_good() {
+        use crate::types::PageBoundary;
+        let first = "This first page has plenty of meaningful searchable text content here.";
+        let second = "This second page also has plenty of meaningful searchable text content.";
+        // Byte offset at which page 1 ends and page 2 begins.
+        let split = first.len();
+        let combined = [first, second].concat();
+        let pages = [
+            PageBoundary {
+                byte_start: 0,
+                byte_end: split,
+                page_number: 1,
+            },
+            PageBoundary {
+                byte_start: split,
+                byte_end: combined.len(),
+                page_number: 2,
+            },
+        ];
+        let decision = ocr::evaluate_per_page_ocr(&combined, Some(pages.as_slice()), Some(2));
+        assert!(!decision.fallback);
+    }
+    /// A page holding only punctuation and spaces must trigger fallback even when another page is fine.
+    #[cfg(feature = "ocr")]
+    #[test]
+    fn test_per_page_ocr_one_bad_page_triggers_fallback() {
+        use crate::types::PageBoundary;
+        let readable = "This page has plenty of meaningful searchable text content for extraction.";
+        let garbage = " . ; ";
+        let split = readable.len();
+        let combined = String::from(readable) + garbage;
+        let pages = [
+            PageBoundary {
+                byte_start: 0,
+                byte_end: split,
+                page_number: 1,
+            },
+            PageBoundary {
+                byte_start: split,
+                byte_end: combined.len(),
+                page_number: 2,
+            },
+        ];
+        let decision = ocr::evaluate_per_page_ocr(&combined, Some(pages.as_slice()), Some(2));
+        assert!(decision.fallback);
+    }
+    /// A zero-length second page must trigger fallback even though the first page is readable.
+    #[cfg(feature = "ocr")]
+    #[test]
+    fn test_per_page_ocr_empty_page_triggers_fallback() {
+        use crate::types::PageBoundary;
+        let readable = "This page has plenty of meaningful searchable text content for extraction.";
+        let blank = "";
+        let split = readable.len();
+        let combined = String::from(readable) + blank;
+        // The second boundary is empty: it starts and ends at the end of the text.
+        let pages = [
+            PageBoundary {
+                byte_start: 0,
+                byte_end: split,
+                page_number: 1,
+            },
+            PageBoundary {
+                byte_start: split,
+                byte_end: combined.len(),
+                page_number: 2,
+            },
+        ];
+        let decision = ocr::evaluate_per_page_ocr(&combined, Some(pages.as_slice()), Some(2));
+        assert!(decision.fallback);
+    }
+    /// When one bad page forces fallback, the returned stats must still be non-zero.
+    #[cfg(feature = "ocr")]
+    #[test]
+    fn test_per_page_ocr_preserves_document_stats_on_fallback() {
+        use crate::types::PageBoundary;
+        let readable = "This page has plenty of meaningful searchable text content for extraction.";
+        let garbage = " . ; ";
+        let split = readable.len();
+        let combined = String::from(readable) + garbage;
+        let pages = [
+            PageBoundary {
+                byte_start: 0,
+                byte_end: split,
+                page_number: 1,
+            },
+            PageBoundary {
+                byte_start: split,
+                byte_end: combined.len(),
+                page_number: 2,
+            },
+        ];
+        let decision = ocr::evaluate_per_page_ocr(&combined, Some(pages.as_slice()), Some(2));
+        assert!(decision.fallback);
+        // Stats would be near zero if they came from the bad page alone.
+        let stats = &decision.stats;
+        assert!(stats.non_whitespace > 0);
+        assert!(stats.meaningful_words > 0);
+    }
+    /// A boundary pointing past the end of the text must be ignored rather than counted as a bad page.
+    #[cfg(feature = "ocr")]
+    #[test]
+    fn test_per_page_ocr_invalid_boundaries_skipped() {
+        use crate::types::PageBoundary;
+        let native = "This page has plenty of meaningful searchable text content for extraction.";
+        let valid_page = PageBoundary {
+            byte_start: 0,
+            byte_end: native.len(),
+            page_number: 1,
+        };
+        // Both offsets lie far beyond the end of `native`.
+        let out_of_range_page = PageBoundary {
+            byte_start: 999,
+            byte_end: 9999,
+            page_number: 2,
+        };
+        let pages = [valid_page, out_of_range_page];
+        let decision = ocr::evaluate_per_page_ocr(native, Some(pages.as_slice()), Some(1));
+        assert!(!decision.fallback);
+    }
+    /// Supplying a page count of 20 must lower the per-page non-whitespace average
+    /// compared with supplying no page count for the same text.
+    #[cfg(feature = "ocr")]
+    #[test]
+    fn test_per_page_ocr_multi_page_correct_page_count() {
+        let sparse = "ab cd ef";
+        let avg_without_count = ocr::evaluate_native_text_for_ocr(sparse, None).avg_non_whitespace;
+        let avg_with_count = ocr::evaluate_native_text_for_ocr(sparse, Some(20)).avg_non_whitespace;
+        assert!(
+            avg_with_count < avg_without_count,
+            "Correct page count should produce lower per-page averages"
+        );
+    }
     #[tokio::test]
     #[cfg(feature = "pdf")]
     async fn test_pdf_batch_mode_validates_page_config_enabled() {

data/vendor/kreuzberg/src/extractors/pdf/ocr.rs CHANGED Viewed

@@ -139,6 +139,33 @@ pub fn evaluate_native_text_for_ocr(native_text: &str, page_count: Option<usize>
     }
 }
+/// Decide whether OCR fallback is needed, checking each page as well as the whole document.
+///
+/// The whole document is first evaluated with [`evaluate_native_text_for_ocr`] using
+/// `page_count`; the returned decision carries that document-level result. Each page
+/// slice described by `boundaries` is then evaluated on its own with a page count of 1.
+/// If any page asks for fallback, `fallback` is set to `true` on the document-level
+/// decision and the remaining pages are not evaluated.
+///
+/// When `boundaries` is `None` or empty, the result is exactly the whole-document
+/// evaluation.
+///
+/// Boundaries that cannot be sliced out of `native_text` are skipped. This covers an end
+/// past the text length, a start after the end, or an offset that is not on a UTF-8
+/// character boundary. Direct indexing would panic on the last case.
+#[cfg(feature = "ocr")]
+pub fn evaluate_per_page_ocr(
+    native_text: &str,
+    boundaries: Option<&[crate::types::PageBoundary]>,
+    page_count: Option<usize>,
+) -> OcrFallbackDecision {
+    let mut document_decision = evaluate_native_text_for_ocr(native_text, page_count);
+    let Some(boundaries) = boundaries else {
+        return document_decision;
+    };
+    // `str::get` returns `None` for every unsliceable range, so invalid boundaries drop out here.
+    let any_page_needs_ocr = boundaries
+        .iter()
+        .filter_map(|boundary| native_text.get(boundary.byte_start..boundary.byte_end))
+        .any(|page_text| evaluate_native_text_for_ocr(page_text, Some(1)).fallback);
+    if any_page_needs_ocr {
+        document_decision.fallback = true;
+    }
+    document_decision
+}
 /// Extract text from PDF using OCR.
 ///
 /// Renders all pages to images and processes them with OCR backend.

data/vendor/kreuzberg/src/mcp/params.rs CHANGED Viewed

@@ -15,9 +15,6 @@ pub struct ExtractFileParams {
     /// Extraction configuration (JSON object)
     #[serde(skip_serializing_if = "Option::is_none")]
     pub config: Option<serde_json::Value>,
-    /// Use async extraction (default: false for sync)
-    #[serde(default)]
-    pub r#async: bool,
 }
 /// Request parameters for bytes extraction.
@@ -31,9 +28,6 @@ pub struct ExtractBytesParams {
     /// Extraction configuration (JSON object)
     #[serde(skip_serializing_if = "Option::is_none")]
     pub config: Option<serde_json::Value>,
-    /// Use async extraction (default: false for sync)
-    #[serde(default)]
-    pub r#async: bool,
 }
 /// Request parameters for batch file extraction.
@@ -44,9 +38,6 @@ pub struct BatchExtractFilesParams {
     /// Extraction configuration (JSON object)
     #[serde(skip_serializing_if = "Option::is_none")]
     pub config: Option<serde_json::Value>,
-    /// Use async extraction (default: false for sync)
-    #[serde(default)]
-    pub r#async: bool,
 }
 /// Request parameters for MIME type detection.
@@ -75,7 +66,6 @@ mod tests {
         assert_eq!(params.path, "/test.pdf");
         assert_eq!(params.mime_type, None);
         assert_eq!(params.config, None);
-        assert!(!params.r#async);
     }
     #[test]
@@ -86,7 +76,6 @@ mod tests {
         assert_eq!(params.data, "SGVsbG8=");
         assert_eq!(params.mime_type, None);
         assert_eq!(params.config, None);
-        assert!(!params.r#async);
     }
     #[test]
@@ -96,7 +85,6 @@ mod tests {
         assert_eq!(params.paths.len(), 2);
         assert_eq!(params.config, None);
-        assert!(!params.r#async);
     }
     #[test]
@@ -131,7 +119,6 @@ mod tests {
             path: "/test.pdf".to_string(),
             mime_type: Some("application/pdf".to_string()),
             config: Some(serde_json::json!({"use_cache": false})),
-            r#async: true,
         };
         let json = serde_json::to_string(&params).unwrap();
@@ -140,7 +127,6 @@ mod tests {
         assert_eq!(params.path, deserialized.path);
         assert_eq!(params.mime_type, deserialized.mime_type);
         assert_eq!(params.config, deserialized.config);
-        assert_eq!(params.r#async, deserialized.r#async);
     }
     #[test]
@@ -149,7 +135,6 @@ mod tests {
             data: "SGVsbG8=".to_string(),
             mime_type: None,
             config: None,
-            r#async: false,
         };
         let json = serde_json::to_string(&params).unwrap();
@@ -163,7 +148,6 @@ mod tests {
         let params = BatchExtractFilesParams {
             paths: vec!["/a.pdf".to_string(), "/b.pdf".to_string()],
             config: Some(serde_json::json!({"use_cache": true})),
-            r#async: true,
         };
         let json = serde_json::to_string(&params).unwrap();

data/vendor/kreuzberg/src/mcp/server.rs CHANGED Viewed

@@ -68,6 +68,10 @@ impl KreuzbergMcp {
     ///
     /// This tool extracts text, metadata, and tables from documents in various formats
     /// including PDFs, Word documents, Excel spreadsheets, images (with OCR), and more.
+    ///
+    /// Note: The `async` parameter is accepted for API compatibility but ignored.
+    /// Extraction always runs asynchronously since the MCP server operates within
+    /// a Tokio runtime. Using sync wrappers would cause a nested runtime panic.
     #[tool(
         description = "Extract content from a file by path. Supports PDFs, Word, Excel, images (with OCR), HTML, and more.",
         annotations(title = "Extract File", read_only_hint = true, idempotent_hint = true)
@@ -78,18 +82,17 @@ impl KreuzbergMcp {
     ) -> Result<CallToolResult, rmcp::ErrorData> {
         use super::errors::map_kreuzberg_error_to_mcp;
         use super::format::{build_config, format_extraction_result};
-        use crate::{extract_file, extract_file_sync};
+        use crate::extract_file;
         let config =
             build_config(&self.default_config, params.config).map_err(|e| rmcp::ErrorData::invalid_params(e, None))?;
-        let result = if params.r#async {
-            extract_file(&params.path, params.mime_type.as_deref(), &config)
-                .await
-                .map_err(map_kreuzberg_error_to_mcp)?
-        } else {
-            extract_file_sync(&params.path, params.mime_type.as_deref(), &config).map_err(map_kreuzberg_error_to_mcp)?
-        };
+        // Always use async extraction - we're already in a Tokio runtime context.
+        // Calling sync wrappers (which use GLOBAL_RUNTIME.block_on()) from within
+        // an async context causes "Cannot start a runtime from within a runtime" panic.
+        let result = extract_file(&params.path, params.mime_type.as_deref(), &config)
+            .await
+            .map_err(map_kreuzberg_error_to_mcp)?;
         let response = format_extraction_result(&result);
         Ok(CallToolResult::success(vec![Content::text(response)]))
@@ -98,6 +101,10 @@ impl KreuzbergMcp {
     /// Extract content from base64-encoded bytes.
     ///
     /// This tool extracts text, metadata, and tables from base64-encoded document data.
+    ///
+    /// Note: The `async` parameter is accepted for API compatibility but ignored.
+    /// Extraction always runs asynchronously since the MCP server operates within
+    /// a Tokio runtime. Using sync wrappers would cause a nested runtime panic.
     #[tool(
         description = "Extract content from base64-encoded file data. Returns extracted text, metadata, and tables.",
         annotations(title = "Extract Bytes", read_only_hint = true, idempotent_hint = true)
@@ -108,7 +115,7 @@ impl KreuzbergMcp {
     ) -> Result<CallToolResult, rmcp::ErrorData> {
         use super::errors::map_kreuzberg_error_to_mcp;
         use super::format::{build_config, format_extraction_result};
-        use crate::{extract_bytes, extract_bytes_sync};
+        use crate::extract_bytes;
         use base64::prelude::*;
         let bytes = BASE64_STANDARD
@@ -120,13 +127,10 @@ impl KreuzbergMcp {
         let mime_type = params.mime_type.as_deref().unwrap_or("");
-        let result = if params.r#async {
-            extract_bytes(&bytes, mime_type, &config)
-                .await
-                .map_err(map_kreuzberg_error_to_mcp)?
-        } else {
-            extract_bytes_sync(&bytes, mime_type, &config).map_err(map_kreuzberg_error_to_mcp)?
-        };
+        // Always use async extraction - we're already in a Tokio runtime context.
+        let result = extract_bytes(&bytes, mime_type, &config)
+            .await
+            .map_err(map_kreuzberg_error_to_mcp)?;
         let response = format_extraction_result(&result);
         Ok(CallToolResult::success(vec![Content::text(response)]))
@@ -135,6 +139,10 @@ impl KreuzbergMcp {
     /// Extract content from multiple files in parallel.
     ///
     /// This tool efficiently processes multiple documents simultaneously, useful for batch operations.
+    ///
+    /// Note: The `async` parameter is accepted for API compatibility but ignored.
+    /// Extraction always runs asynchronously since the MCP server operates within
+    /// a Tokio runtime. Using sync wrappers would cause a nested runtime panic.
     #[tool(
         description = "Extract content from multiple files in parallel. Returns results for all files.",
         annotations(title = "Batch Extract Files", read_only_hint = true, idempotent_hint = true)
@@ -145,18 +153,15 @@ impl KreuzbergMcp {
     ) -> Result<CallToolResult, rmcp::ErrorData> {
         use super::errors::map_kreuzberg_error_to_mcp;
         use super::format::build_config;
-        use crate::{batch_extract_file, batch_extract_file_sync};
+        use crate::batch_extract_file;
         let config =
             build_config(&self.default_config, params.config).map_err(|e| rmcp::ErrorData::invalid_params(e, None))?;
-        let results = if params.r#async {
-            batch_extract_file(params.paths.clone(), &config)
-                .await
-                .map_err(map_kreuzberg_error_to_mcp)?
-        } else {
-            batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
-        };
+        // Always use async extraction - we're already in a Tokio runtime context.
+        let results = batch_extract_file(params.paths.clone(), &config)
+            .await
+            .map_err(map_kreuzberg_error_to_mcp)?;
         let response = serde_json::to_string_pretty(&results).unwrap_or_default();
         Ok(CallToolResult::success(vec![Content::text(response)]))

data/vendor/kreuzberg/src/mcp/tools/extraction.rs CHANGED Viewed

@@ -3,8 +3,8 @@
 use base64::prelude::*;
 use std::borrow::Cow;
 use crate::{
-    ExtractionConfig, batch_extract_file, batch_extract_file_sync, extract_bytes, extract_bytes_sync, extract_file,
-    extract_file_sync, mcp::errors::map_kreuzberg_error_to_mcp, mcp::format::{build_config, format_extraction_result},
+    ExtractionConfig, batch_extract_file, extract_bytes, extract_file,
+    mcp::errors::map_kreuzberg_error_to_mcp, mcp::format::{build_config, format_extraction_result},
     mcp::params::{BatchExtractFilesParams, ExtractBytesParams, ExtractFileParams},
 };
 use rmcp::{
@@ -34,13 +34,9 @@ pub(in crate::mcp) trait ExtractionTool {
         let config = build_config(self.default_config(), params.config)
             .map_err(|e| McpError::invalid_params(e, None))?;
-        let result = if params.r#async {
-            extract_file(&params.path, params.mime_type.as_deref(), &config)
-                .await
-                .map_err(map_kreuzberg_error_to_mcp)?
-        } else {
-            extract_file_sync(&params.path, params.mime_type.as_deref(), &config).map_err(map_kreuzberg_error_to_mcp)?
-        };
+        let result = extract_file(&params.path, params.mime_type.as_deref(), &config)
+            .await
+            .map_err(map_kreuzberg_error_to_mcp)?;
         let response = format_extraction_result(&result);
         Ok(CallToolResult::success(vec![Content::text(response)]))
@@ -66,13 +62,9 @@ pub(in crate::mcp) trait ExtractionTool {
         let mime_type = params.mime_type.as_deref().unwrap_or("");
-        let result = if params.r#async {
-            extract_bytes(&bytes, mime_type, &config)
-                .await
-                .map_err(map_kreuzberg_error_to_mcp)?
-        } else {
-            extract_bytes_sync(&bytes, mime_type, &config).map_err(map_kreuzberg_error_to_mcp)?
-        };
+        let result = extract_bytes(&bytes, mime_type, &config)
+            .await
+            .map_err(map_kreuzberg_error_to_mcp)?;
         let response = format_extraction_result(&result);
         Ok(CallToolResult::success(vec![Content::text(response)]))
@@ -92,13 +84,9 @@ pub(in crate::mcp) trait ExtractionTool {
         let config = build_config(self.default_config(), params.config)
             .map_err(|e| McpError::invalid_params(e, None))?;
-        let results = if params.r#async {
-            batch_extract_file(params.paths.clone(), &config)
-                .await
-                .map_err(map_kreuzberg_error_to_mcp)?
-        } else {
-            batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
-        };
+        let results = batch_extract_file(params.paths.clone(), &config)
+            .await
+            .map_err(map_kreuzberg_error_to_mcp)?;
         let response = serde_json::to_string_pretty(&results).unwrap_or_default();
         Ok(CallToolResult::success(vec![Content::text(response)]))
@@ -152,8 +140,7 @@ mod tests {
             path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
             mime_type: None,
             config: None,
-            r#async: true,
-        };
+                    };
         let result = server.extract_file(Parameters(params)).await;
@@ -179,8 +166,7 @@ mod tests {
             path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
             mime_type: None,
             config: None,
-            r#async: true,
-        };
+                    };
         let result = server.extract_file(Parameters(params)).await;
@@ -205,8 +191,7 @@ mod tests {
             path: "/nonexistent/file.pdf".to_string(),
             mime_type: None,
             config: None,
-            r#async: true,
-        };
+                    };
         let result = server.extract_file(Parameters(params)).await;
@@ -222,8 +207,7 @@ mod tests {
             path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
             mime_type: Some(Cow::Borrowed("application/pdf")),
             config: None,
-            r#async: true,
-        };
+                    };
         let result = server.extract_file(Parameters(params)).await;
@@ -241,8 +225,7 @@ mod tests {
             data: encoded,
             mime_type: Some(Cow::Borrowed("text/plain")),
             config: None,
-            r#async: true,
-        };
+                    };
         let result = server.extract_bytes(Parameters(params)).await;
@@ -268,8 +251,7 @@ mod tests {
             data: "not-valid-base64!!!".to_string(),
             mime_type: None,
             config: None,
-            r#async: true,
-        };
+                    };
         let result = server.extract_bytes(Parameters(params)).await;
@@ -285,8 +267,7 @@ mod tests {
         let params = BatchExtractFilesParams {
             paths: vec![get_test_path("pdfs_with_tables/tiny.pdf").to_string()],
             config: None,
-            r#async: true,
-        };
+                    };
         let result = server.batch_extract_files(Parameters(params)).await;
@@ -311,8 +292,7 @@ mod tests {
         let params = BatchExtractFilesParams {
             paths: vec![],
             config: None,
-            r#async: true,
-        };
+                    };
         let result = server.batch_extract_files(Parameters(params)).await;
@@ -341,8 +321,7 @@ mod tests {
                 path: test_file.to_string(),
                 mime_type: None,
                 config: None,
-                r#async: true,
-            };
+                            };
             let result = server.extract_file(Parameters(params)).await;
@@ -368,8 +347,7 @@ mod tests {
             let params = BatchExtractFilesParams {
                 paths: vec![file1.to_string(), file2.to_string()],
                 config: None,
-                r#async: true,
-            };
+                            };
             let result = server.batch_extract_files(Parameters(params)).await;

data/vendor/kreuzberg-tesseract/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg-tesseract"
-version = "4.2.8"
+version = "4.2.9"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kreuzberg
 version: !ruby/object:Gem::Version
-  version: 4.2.8
+  version: 4.2.9
 platform: ruby
 authors:
 - Na'aman Hirschfeld
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-02-02 00:00:00.000000000 Z
+date: 2026-02-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys