RubyGems - kreuzberg - Versions diffs - 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8 - Mend

kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127) hide show

checksums.yaml +4 -4
data/Gemfile.lock +5 -5
data/README.md +15 -9
data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
data/kreuzberg.gemspec +38 -4
data/lib/kreuzberg/config.rb +34 -1
data/lib/kreuzberg/result.rb +77 -14
data/lib/kreuzberg/version.rb +1 -1
data/sig/kreuzberg.rbs +23 -6
data/vendor/kreuzberg/Cargo.toml +32 -11
data/vendor/kreuzberg/README.md +54 -8
data/vendor/kreuzberg/build.rs +549 -132
data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
data/vendor/kreuzberg/src/core/config.rs +49 -1
data/vendor/kreuzberg/src/core/extractor.rs +134 -2
data/vendor/kreuzberg/src/core/mod.rs +4 -2
data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
data/vendor/kreuzberg/src/extraction/html.rs +24 -8
data/vendor/kreuzberg/src/extraction/image.rs +124 -1
data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
data/vendor/kreuzberg/src/extractors/email.rs +29 -15
data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
data/vendor/kreuzberg/src/extractors/html.rs +29 -15
data/vendor/kreuzberg/src/extractors/image.rs +25 -4
data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
data/vendor/kreuzberg/src/extractors/text.rs +7 -2
data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
data/vendor/kreuzberg/src/lib.rs +10 -2
data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
data/vendor/kreuzberg/src/mcp/server.rs +120 -12
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
data/vendor/kreuzberg/src/pdf/error.rs +8 -0
data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
data/vendor/kreuzberg/src/pdf/table.rs +26 -2
data/vendor/kreuzberg/src/pdf/text.rs +89 -7
data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
data/vendor/kreuzberg/src/text/mod.rs +6 -0
data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
data/vendor/kreuzberg/src/types.rs +173 -21
data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
data/vendor/kreuzberg/tests/config_features.rs +15 -1
data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
data/vendor/kreuzberg/tests/email_integration.rs +2 -0
data/vendor/kreuzberg/tests/error_handling.rs +43 -34
data/vendor/kreuzberg/tests/format_integration.rs +2 -0
data/vendor/kreuzberg/tests/image_integration.rs +2 -0
data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
data/vendor/kreuzberg/tests/security_validation.rs +1 -0
data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
data/vendor/rb-sys/Cargo.lock +15 -15
data/vendor/rb-sys/Cargo.toml +4 -4
data/vendor/rb-sys/Cargo.toml.orig +4 -4
data/vendor/rb-sys/build/features.rs +5 -2
data/vendor/rb-sys/build/main.rs +55 -15
data/vendor/rb-sys/build/stable_api_config.rs +4 -2
data/vendor/rb-sys/build/version.rs +3 -1
data/vendor/rb-sys/src/lib.rs +1 -0
data/vendor/rb-sys/src/macros.rs +2 -2
data/vendor/rb-sys/src/special_consts.rs +1 -1
data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
data/vendor/rb-sys/src/stable_api.rs +0 -1
data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
metadata +13 -10
data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
data/vendor/rb-sys/.cargo-ok +0 -1
data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316

data/vendor/kreuzberg/src/chunking/processor.rs ADDED Viewed

@@ -0,0 +1,220 @@
+//! Text chunking post-processor.
+//!
+//! This module provides a PostProcessor plugin that chunks text content in
+//! extraction results.
+use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
+use crate::{ExtractionConfig, ExtractionResult, KreuzbergError, Result};
+use async_trait::async_trait;
+/// Post-processor that splits the extracted text of a document into chunks.
+///
+/// This processor:
+/// - Runs in the `ProcessingStage::Middle` stage
+/// - Does nothing unless `config.chunking` is set
+/// - Writes the produced chunks to `result.chunks`, replacing any previous value
+/// - Takes chunk size and overlap from `config.chunking` (`max_chars`, `max_overlap`)
+/// - Always configures the underlying chunker with `trim: true` and `ChunkerType::Text`
+///
+/// # Example
+///
+/// ```rust,no_run
+/// use kreuzberg::plugins::{Plugin, PostProcessor};
+/// use kreuzberg::chunking::processor::ChunkingProcessor;
+///
+/// let processor = ChunkingProcessor;
+/// assert_eq!(processor.name(), "text-chunking");
+/// ```
+#[derive(Debug, Clone, Copy)]
+pub struct ChunkingProcessor;
+/// Plugin identity and lifecycle hooks for [`ChunkingProcessor`].
+///
+/// The processor is a stateless unit struct, so both lifecycle hooks are no-ops.
+impl Plugin for ChunkingProcessor {
+    /// Identifier under which this plugin is known.
+    fn name(&self) -> &str {
+        "text-chunking"
+    }
+    /// Version of the crate this processor was compiled from.
+    fn version(&self) -> String {
+        String::from(env!("CARGO_PKG_VERSION"))
+    }
+    /// Nothing to set up; always returns `Ok(())`.
+    fn initialize(&self) -> Result<()> {
+        Ok(())
+    }
+    /// Nothing to tear down; always returns `Ok(())`.
+    fn shutdown(&self) -> Result<()> {
+        Ok(())
+    }
+}
+#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
+#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
+impl PostProcessor for ChunkingProcessor {
+    /// Splits `result.content` into chunks and stores them in `result.chunks`.
+    ///
+    /// Returns `Ok(())` without touching `result` when `config.chunking` is `None`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `KreuzbergError::Other` wrapping the chunker's message if
+    /// `chunk_text` fails.
+    async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
+        if let Some(user_settings) = config.chunking.as_ref() {
+            // Translate the user-facing settings into the chunker's own config type.
+            let chunker_settings = crate::chunking::ChunkingConfig {
+                max_characters: user_settings.max_chars,
+                overlap: user_settings.max_overlap,
+                trim: true,
+                chunker_type: crate::chunking::ChunkerType::Text,
+            };
+            let outcome = crate::chunking::chunk_text(&result.content, &chunker_settings, None)
+                .map_err(|e| KreuzbergError::Other(format!("Chunking failed: {}", e)))?;
+            result.chunks = Some(outcome.chunks);
+        }
+        Ok(())
+    }
+    /// This processor runs in the middle stage of the pipeline.
+    fn processing_stage(&self) -> ProcessingStage {
+        ProcessingStage::Middle
+    }
+    /// The processor is applicable exactly when chunking is configured.
+    fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
+        config.chunking.is_some()
+    }
+    /// Rough cost estimate: about 1ms per 10KB of content, never less than 1ms.
+    fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
+        const BYTES_PER_MS: usize = 10240;
+        let estimate = result.content.len() / BYTES_PER_MS;
+        estimate.max(1) as u64
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::core::config::ChunkingConfig;
+    use crate::types::Metadata;
+    /// Builds a plain-text `ExtractionResult` that holds only `content`.
+    fn plain_text_result(content: impl Into<String>) -> ExtractionResult {
+        ExtractionResult {
+            content: content.into(),
+            mime_type: "text/plain".to_string(),
+            metadata: Metadata::default(),
+            tables: vec![],
+            detected_languages: None,
+            chunks: None,
+            images: None,
+            pages: None,
+        }
+    }
+    /// Extraction config with chunking enabled (`max_chars` 100, `max_overlap` 10).
+    fn chunking_enabled_config() -> ExtractionConfig {
+        ExtractionConfig {
+            chunking: Some(ChunkingConfig {
+                max_chars: 100,
+                max_overlap: 10,
+                embedding: None,
+                preset: None,
+            }),
+            ..Default::default()
+        }
+    }
+    /// With chunking configured, processing must populate `result.chunks`.
+    #[tokio::test]
+    async fn test_chunking_processor() {
+        let processor = ChunkingProcessor;
+        let config = chunking_enabled_config();
+        let mut result = plain_text_result(
+            "This is a longer text that should be split into multiple chunks to test the chunking processor functionality.",
+        );
+        processor.process(&mut result, &config).await.unwrap();
+        assert!(result.chunks.is_some());
+        let chunks = result.chunks.unwrap();
+        assert!(!chunks.is_empty());
+    }
+    /// Without a chunking config the processor must leave `result.chunks` untouched.
+    #[tokio::test]
+    async fn test_chunking_processor_no_config() {
+        let processor = ChunkingProcessor;
+        let config = ExtractionConfig::default();
+        let mut result = plain_text_result("Some text");
+        processor.process(&mut result, &config).await.unwrap();
+        assert!(result.chunks.is_none());
+    }
+    #[test]
+    fn test_chunking_processor_plugin_interface() {
+        let processor = ChunkingProcessor;
+        assert_eq!(processor.name(), "text-chunking");
+        assert!(!processor.version().is_empty());
+        assert!(processor.initialize().is_ok());
+        assert!(processor.shutdown().is_ok());
+    }
+    #[test]
+    fn test_chunking_processor_stage() {
+        let processor = ChunkingProcessor;
+        assert_eq!(processor.processing_stage(), ProcessingStage::Middle);
+    }
+    #[test]
+    fn test_chunking_processor_should_process() {
+        let processor = ChunkingProcessor;
+        let result = plain_text_result("Sample text");
+        let config_with_chunking = chunking_enabled_config();
+        assert!(processor.should_process(&result, &config_with_chunking));
+        let config_without_chunking = ExtractionConfig::default();
+        assert!(!processor.should_process(&result, &config_without_chunking));
+    }
+    /// Longer content must yield a strictly larger duration estimate.
+    #[test]
+    fn test_chunking_processor_estimated_duration() {
+        let processor = ChunkingProcessor;
+        let short_result = plain_text_result("Short");
+        let long_result = plain_text_result("a".repeat(100000));
+        let short_duration = processor.estimated_duration_ms(&short_result);
+        let long_duration = processor.estimated_duration_ms(&long_result);
+        assert!(long_duration > short_duration);
+    }
+}

data/vendor/kreuzberg/src/core/config.rs CHANGED Viewed

@@ -7,6 +7,40 @@ use crate::{KreuzbergError, Result};
 use serde::{Deserialize, Serialize};
 use std::path::Path;
+/// Page extraction and tracking configuration.
+///
+/// Controls how pages are extracted, tracked, and represented in the extraction results.
+/// Page tracking is disabled when `ExtractionConfig::pages` is `None`.
+///
+/// Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
+/// when page boundaries are available and chunking is configured.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(default)]
+pub struct PageConfig {
+    /// Extract pages as separate array (ExtractionResult.pages)
+    #[serde(default)]
+    pub extract_pages: bool,
+    /// Insert page markers in main content string
+    #[serde(default)]
+    pub insert_page_markers: bool,
+    /// Page marker format (use {page_num} placeholder)
+    /// Default: "\n\n<!-- PAGE {page_num} -->\n\n"
+    #[serde(default = "default_page_marker_format")]
+    pub marker_format: String,
+}
+impl Default for PageConfig {
+    /// Both page features are off; the marker format is the same value serde
+    /// falls back to when the field is missing.
+    fn default() -> Self {
+        Self {
+            extract_pages: false,
+            insert_page_markers: false,
+            // Reuse the serde default so the two defaults can never drift apart.
+            marker_format: default_page_marker_format(),
+        }
+    }
+}
 /// Main extraction configuration.
 ///
 /// This struct contains all configuration options for the extraction process.
@@ -50,6 +84,7 @@ pub struct ExtractionConfig {
     pub images: Option<ImageExtractionConfig>,
     /// PDF-specific options (None = use defaults)
+    #[cfg(feature = "pdf")]
     #[serde(default)]
     pub pdf_options: Option<PdfConfig>,
@@ -61,6 +96,10 @@ pub struct ExtractionConfig {
     #[serde(default)]
     pub language_detection: Option<LanguageDetectionConfig>,
+    /// Page extraction configuration (None = no page tracking)
+    #[serde(default)]
+    pub pages: Option<PageConfig>,
     /// Keyword extraction configuration (None = no keyword extraction)
     #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
     #[serde(default)]
@@ -225,6 +264,7 @@ pub struct ImageExtractionConfig {
 }
 /// PDF-specific configuration.
+#[cfg(feature = "pdf")]
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct PdfConfig {
     /// Extract images from PDF
@@ -277,6 +317,9 @@ fn default_eng() -> String {
 fn default_tesseract_backend() -> String {
     "tesseract".to_string()
 }
+/// Default page marker template; `{page_num}` is the placeholder for the page number.
+fn default_page_marker_format() -> String {
+    String::from("\n\n<!-- PAGE {page_num} -->\n\n")
+}
 fn default_chunk_size() -> usize {
     1000
 }
@@ -317,9 +360,11 @@ impl Default for ExtractionConfig {
             force_ocr: false,
             chunking: None,
             images: None,
+            #[cfg(feature = "pdf")]
             pdf_options: None,
             token_reduction: None,
             language_detection: None,
+            pages: None,
             #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
             keywords: None,
             postprocessor: None,
@@ -647,6 +692,7 @@ max_dpi = 600
     }
     #[test]
+    #[cfg(feature = "pdf")]
     fn test_config_with_pdf_options() {
         let dir = tempdir().unwrap();
         let config_path = dir.path().join("kreuzberg.toml");
@@ -770,9 +816,10 @@ enabled = true
         assert!(config.ocr.is_some());
         assert!(config.chunking.is_some());
         assert!(config.images.is_some());
-        assert!(config.pdf_options.is_some());
         assert!(config.token_reduction.is_some());
         assert!(config.language_detection.is_some());
+        #[cfg(feature = "pdf")]
+        assert!(config.pdf_options.is_some());
     }
     #[test]
@@ -838,6 +885,7 @@ enabled = true
     }
     #[test]
+    #[cfg(feature = "pdf")]
     fn test_pdf_config_defaults() {
         let dir = tempdir().unwrap();
         let config_path = dir.path().join("kreuzberg.toml");

data/vendor/kreuzberg/src/core/extractor.rs CHANGED Viewed

@@ -20,6 +20,7 @@ use crate::types::ExtractionResult;
 #[cfg(feature = "office")]
 use crate::types::LibreOfficeConversionResult;
 use crate::{KreuzbergError, Result};
+#[cfg(feature = "tokio-runtime")]
 use once_cell::sync::Lazy;
 #[cfg(feature = "office")]
 use serde_json::json;
@@ -97,6 +98,12 @@ fn sanitize_path(path: &Path) -> String {
 /// 2. If runtime creation fails, the process is already in a critical state
 /// 3. This is a one-time initialization - if it fails, nothing will work
 /// 4. Better to fail fast than return errors from every sync operation
+///
+/// # Availability
+///
+/// This static is only available when the `tokio-runtime` feature is enabled.
+/// For WASM targets, use the truly synchronous extraction functions instead.
+#[cfg(feature = "tokio-runtime")]
 static GLOBAL_RUNTIME: Lazy<tokio::runtime::Runtime> = Lazy::new(|| {
     tokio::runtime::Builder::new_multi_thread()
         .enable_all()
@@ -310,13 +317,13 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
 ///
 /// Individual file errors are captured in the result metadata. System errors
 /// (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
+#[cfg(feature = "tokio-runtime")]
 #[cfg_attr(feature = "otel", tracing::instrument(
     skip(config, paths),
     fields(
         extraction.batch_size = paths.len(),
     )
 ))]
-#[cfg(feature = "tokio-runtime")]
 pub async fn batch_extract_file(
     paths: Vec<impl AsRef<Path>>,
     config: &ExtractionConfig,
@@ -380,6 +387,7 @@ pub async fn batch_extract_file(
                     detected_languages: None,
                     chunks: None,
                     images: None,
+                    pages: None,
                 });
             }
             Err(join_err) => {
@@ -407,13 +415,13 @@ pub async fn batch_extract_file(
 /// # Returns
 ///
 /// A vector of `ExtractionResult` in the same order as the input.
+#[cfg(feature = "tokio-runtime")]
 #[cfg_attr(feature = "otel", tracing::instrument(
     skip(config, contents),
     fields(
         extraction.batch_size = contents.len(),
     )
 ))]
-#[cfg(feature = "tokio-runtime")]
 pub async fn batch_extract_bytes(
     contents: Vec<(&[u8], &str)>,
     config: &ExtractionConfig,
@@ -483,6 +491,7 @@ pub async fn batch_extract_bytes(
                     detected_languages: None,
                     chunks: None,
                     images: None,
+                    pages: None,
                 });
             }
             Err(join_err) => {
@@ -502,6 +511,10 @@ pub async fn batch_extract_bytes(
 ///
 /// Uses the global Tokio runtime for 100x+ performance improvement over creating
 /// a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
+///
+/// This function is only available with the `tokio-runtime` feature. For WASM targets,
+/// use a truly synchronous extraction approach instead.
+#[cfg(feature = "tokio-runtime")]
 pub fn extract_file_sync(
     path: impl AsRef<Path>,
     mime_type: Option<&str>,
@@ -514,14 +527,31 @@ pub fn extract_file_sync(
 ///
 /// Uses the global Tokio runtime for 100x+ performance improvement over creating
 /// a new runtime per call.
+///
+/// With the `tokio-runtime` feature, this blocks the current thread using the global
+/// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
+#[cfg(feature = "tokio-runtime")]
 pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
     GLOBAL_RUNTIME.block_on(extract_bytes(content, mime_type, config))
 }
+/// Synchronous counterpart of `extract_bytes` for builds without the `tokio-runtime` feature.
+///
+/// No async runtime is involved: the bytes, MIME type and configuration are copied
+/// into owned values and passed to `extract_bytes_sync_impl()`.
+#[cfg(not(feature = "tokio-runtime"))]
+pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
+    let owned_bytes = content.to_vec();
+    let owned_mime = mime_type.to_owned();
+    let owned_config = config.clone();
+    extract_bytes_sync_impl(owned_bytes, Some(owned_mime), Some(owned_config))
+}
 /// Synchronous wrapper for `batch_extract_file`.
 ///
 /// Uses the global Tokio runtime for 100x+ performance improvement over creating
 /// a new runtime per call.
+///
+/// This function is only available with the `tokio-runtime` feature. For WASM targets,
+/// use a truly synchronous extraction approach instead.
+#[cfg(feature = "tokio-runtime")]
 pub fn batch_extract_file_sync(
     paths: Vec<impl AsRef<Path>>,
     config: &ExtractionConfig,
@@ -533,6 +563,11 @@ pub fn batch_extract_file_sync(
 ///
 /// Uses the global Tokio runtime for 100x+ performance improvement over creating
 /// a new runtime per call.
+///
+/// With the `tokio-runtime` feature, this blocks the current thread using the global
+/// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
+/// that iterates through items and calls `extract_bytes_sync()`.
+#[cfg(feature = "tokio-runtime")]
 pub fn batch_extract_bytes_sync(
     contents: Vec<(&[u8], &str)>,
     config: &ExtractionConfig,
@@ -540,6 +575,103 @@ pub fn batch_extract_bytes_sync(
     GLOBAL_RUNTIME.block_on(batch_extract_bytes(contents, config))
 }
+/// Synchronous counterpart of `batch_extract_bytes` for builds without the `tokio-runtime` feature.
+///
+/// Each `(bytes, mime_type)` pair is extracted in order with `extract_bytes_sync()`.
+/// A failing item does not abort the batch: it is replaced by a `text/plain`
+/// placeholder result whose content and `metadata.error` describe the failure,
+/// so the returned vector always has one entry per input.
+#[cfg(not(feature = "tokio-runtime"))]
+pub fn batch_extract_bytes_sync(
+    contents: Vec<(&[u8], &str)>,
+    config: &ExtractionConfig,
+) -> Result<Vec<ExtractionResult>> {
+    use crate::types::{ErrorMetadata, Metadata};
+    let results: Vec<ExtractionResult> = contents
+        .into_iter()
+        .map(|(bytes, mime)| match extract_bytes_sync(bytes, mime, config) {
+            Ok(extracted) => extracted,
+            // Turn the error into a placeholder result instead of failing the batch.
+            Err(err) => ExtractionResult {
+                content: format!("Error: {}", err),
+                mime_type: "text/plain".to_string(),
+                metadata: Metadata {
+                    error: Some(ErrorMetadata {
+                        error_type: format!("{:?}", err),
+                        message: err.to_string(),
+                    }),
+                    ..Default::default()
+                },
+                tables: vec![],
+                detected_languages: None,
+                chunks: None,
+                images: None,
+                pages: None,
+            },
+        })
+        .collect();
+    Ok(results)
+}
+/// Runtime-free extraction used when the `tokio-runtime` feature is disabled.
+///
+/// Validates the MIME type, looks up the matching extractor, runs its synchronous
+/// extraction method and then the synchronous post-processing pipeline.
+///
+/// # Arguments
+///
+/// * `content` - The byte content to extract
+/// * `mime_type` - MIME type of `content`; `None` is rejected
+/// * `config` - Extraction configuration; `None` falls back to the default
+///
+/// # Errors
+///
+/// * `KreuzbergError::Validation` when `mime_type` is `None`
+/// * `KreuzbergError::UnsupportedFormat` when the extractor has no synchronous mode
+/// * Any error propagated from MIME validation, extractor initialization or lookup,
+///   the extraction itself, or the pipeline
+#[cfg(not(feature = "tokio-runtime"))]
+fn extract_bytes_sync_impl(
+    content: Vec<u8>,
+    mime_type: Option<String>,
+    config: Option<ExtractionConfig>,
+) -> Result<ExtractionResult> {
+    use crate::core::mime;
+    let config = config.unwrap_or_default();
+    // A MIME type is mandatory on this path; reject a missing one before validating it.
+    let requested_mime = mime_type.ok_or_else(|| KreuzbergError::Validation {
+        message: "MIME type is required for synchronous extraction".to_string(),
+        source: None,
+    })?;
+    let validated_mime = mime::validate_mime_type(&requested_mime)?;
+    // Extractors must be initialized before one can be looked up.
+    crate::extractors::ensure_initialized()?;
+    let extractor = get_extractor(&validated_mime)?;
+    // Only extractors exposing a synchronous interface can be used here.
+    let sync_extractor = match extractor.as_sync_extractor() {
+        Some(sync) => sync,
+        None => {
+            return Err(KreuzbergError::UnsupportedFormat(format!(
+                "Extractor for '{}' does not support synchronous extraction",
+                validated_mime
+            )));
+        }
+    };
+    let extracted = sync_extractor.extract_sync(&content, &validated_mime, &config)?;
+    // Finish with the synchronous post-processing pipeline.
+    let processed = crate::core::pipeline::run_pipeline_sync(extracted, &config)?;
+    Ok(processed)
+}
 async fn extract_file_with_extractor(
     path: &Path,
     mime_type: &str,

data/vendor/kreuzberg/src/core/mod.rs CHANGED Viewed

@@ -37,9 +37,11 @@ pub mod mime;
 pub mod pipeline;
 pub use config::{
-    ChunkingConfig, ExtractionConfig, ImageExtractionConfig, LanguageDetectionConfig, OcrConfig, PdfConfig,
-    TokenReductionConfig,
+    ChunkingConfig, ExtractionConfig, ImageExtractionConfig, LanguageDetectionConfig, OcrConfig, TokenReductionConfig,
 };
+#[cfg(feature = "pdf")]
+pub use config::PdfConfig;
 #[cfg(feature = "tokio-runtime")]
 pub use extractor::{batch_extract_bytes, batch_extract_file};
 pub use extractor::{extract_bytes, extract_file};