RubyGems - kreuzberg - Versions diffs - 4.0.0.pre.rc.6 → 4.0.0.rc1 - Mend

kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (175) hide show

checksums.yaml +4 -4
data/.gitignore +0 -6
data/.rubocop.yaml +534 -1
data/Gemfile +2 -1
data/Gemfile.lock +11 -11
data/README.md +5 -10
data/examples/async_patterns.rb +0 -1
data/ext/kreuzberg_rb/extconf.rb +0 -10
data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
data/ext/kreuzberg_rb/native/build.rs +2 -0
data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
data/ext/kreuzberg_rb/native/include/strings.h +2 -2
data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
data/kreuzberg.gemspec +14 -57
data/lib/kreuzberg/cache_api.rb +0 -1
data/lib/kreuzberg/cli.rb +2 -2
data/lib/kreuzberg/config.rb +2 -9
data/lib/kreuzberg/errors.rb +7 -75
data/lib/kreuzberg/extraction_api.rb +0 -1
data/lib/kreuzberg/setup_lib_path.rb +0 -1
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +0 -21
data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
data/sig/kreuzberg.rbs +3 -55
data/spec/binding/cli_proxy_spec.rb +4 -2
data/spec/binding/cli_spec.rb +11 -12
data/spec/examples.txt +104 -0
data/spec/fixtures/config.yaml +1 -0
data/spec/spec_helper.rb +1 -1
data/vendor/kreuzberg/Cargo.toml +42 -112
data/vendor/kreuzberg/README.md +2 -2
data/vendor/kreuzberg/build.rs +4 -18
data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
data/vendor/kreuzberg/src/cache/mod.rs +3 -27
data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
data/vendor/kreuzberg/src/core/extractor.rs +81 -202
data/vendor/kreuzberg/src/core/io.rs +2 -4
data/vendor/kreuzberg/src/core/mime.rs +12 -2
data/vendor/kreuzberg/src/core/mod.rs +1 -4
data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
data/vendor/kreuzberg/src/embeddings.rs +16 -125
data/vendor/kreuzberg/src/error.rs +1 -1
data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
data/vendor/kreuzberg/src/extraction/image.rs +13 -13
data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
data/vendor/kreuzberg/src/extractors/email.rs +0 -14
data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
data/vendor/kreuzberg/src/extractors/html.rs +154 -137
data/vendor/kreuzberg/src/extractors/image.rs +4 -7
data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
data/vendor/kreuzberg/src/extractors/text.rs +5 -23
data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
data/vendor/kreuzberg/src/lib.rs +1 -4
data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
data/vendor/kreuzberg/src/mcp/server.rs +3 -5
data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
data/vendor/kreuzberg/src/pdf/error.rs +1 -1
data/vendor/kreuzberg/src/pdf/table.rs +44 -17
data/vendor/kreuzberg/src/pdf/text.rs +3 -0
data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
data/vendor/kreuzberg/src/types.rs +12 -42
data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
data/vendor/kreuzberg/tests/config_features.rs +0 -18
data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
data/vendor/kreuzberg/tests/core_integration.rs +7 -24
data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
data/vendor/kreuzberg/tests/security_validation.rs +1 -12
metadata +25 -90
data/.rubocop.yml +0 -538
data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
data/lib/kreuzberg/error_context.rb +0 -32
data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
data/vendor/kreuzberg/src/extractors/security.rs +0 -484
data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
data/vendor/kreuzberg/src/panic_context.rs +0 -154
data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
data/vendor/rb-sys/.cargo-ok +0 -1
data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
data/vendor/rb-sys/Cargo.lock +0 -393
data/vendor/rb-sys/Cargo.toml +0 -70
data/vendor/rb-sys/Cargo.toml.orig +0 -57
data/vendor/rb-sys/LICENSE-APACHE +0 -190
data/vendor/rb-sys/LICENSE-MIT +0 -21
data/vendor/rb-sys/bin/release.sh +0 -21
data/vendor/rb-sys/build/features.rs +0 -108
data/vendor/rb-sys/build/main.rs +0 -246
data/vendor/rb-sys/build/stable_api_config.rs +0 -153
data/vendor/rb-sys/build/version.rs +0 -48
data/vendor/rb-sys/readme.md +0 -36
data/vendor/rb-sys/src/bindings.rs +0 -21
data/vendor/rb-sys/src/hidden.rs +0 -11
data/vendor/rb-sys/src/lib.rs +0 -34
data/vendor/rb-sys/src/macros.rs +0 -371
data/vendor/rb-sys/src/memory.rs +0 -53
data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
data/vendor/rb-sys/src/special_consts.rs +0 -31
data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
data/vendor/rb-sys/src/stable_api.rs +0 -261
data/vendor/rb-sys/src/symbol.rs +0 -31
data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
data/vendor/rb-sys/src/utils.rs +0 -89
data/vendor/rb-sys/src/value_type.rs +0 -7

data/vendor/kreuzberg/src/extractors/pandoc.rs ADDED Viewed

@@ -0,0 +1,201 @@
+//! Pandoc-based extractors for various document formats.
+//!
+//! Supports: DOCX, ODT, EPUB, LaTeX, RST, RTF, and many more formats via Pandoc.
+use crate::Result;
+use crate::core::config::ExtractionConfig;
+use crate::extraction::pandoc::extract_bytes_from_mime;
+use crate::plugins::{DocumentExtractor, Plugin};
+use crate::types::{ExtractionResult, Metadata};
+use async_trait::async_trait;
+/// Generic Pandoc extractor for all Pandoc-supported formats.
+///
+/// This extractor handles all document formats supported by Pandoc, including:
+/// - Microsoft Word (DOCX)
+/// - OpenDocument Text (ODT)
+/// - EPUB
+/// - LaTeX
+/// - reStructuredText (RST)
+/// - RTF
+/// - And many more
+pub struct PandocExtractor;
+impl PandocExtractor {
+    /// Create a new Pandoc extractor.
+    pub fn new() -> Self {
+        Self
+    }
+}
+impl Default for PandocExtractor {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+impl Plugin for PandocExtractor {
+    fn name(&self) -> &str {
+        "pandoc-extractor"
+    }
+    fn version(&self) -> String {
+        env!("CARGO_PKG_VERSION").to_string()
+    }
+    fn initialize(&self) -> Result<()> {
+        Ok(())
+    }
+    fn shutdown(&self) -> Result<()> {
+        Ok(())
+    }
+    fn description(&self) -> &str {
+        "Extracts content from Pandoc-supported formats (DOCX, ODT, EPUB, LaTeX, RST, RTF, etc.)"
+    }
+    fn author(&self) -> &str {
+        "Kreuzberg Team"
+    }
+}
+#[async_trait]
+impl DocumentExtractor for PandocExtractor {
+    async fn extract_bytes(
+        &self,
+        content: &[u8],
+        mime_type: &str,
+        _config: &ExtractionConfig,
+    ) -> Result<ExtractionResult> {
+        let pandoc_result = extract_bytes_from_mime(content, mime_type).await?;
+        let mut additional = std::collections::HashMap::new();
+        for (key, value) in pandoc_result.metadata {
+            additional.insert(key, value);
+        }
+        Ok(ExtractionResult {
+            content: pandoc_result.content,
+            mime_type: mime_type.to_string(),
+            metadata: Metadata {
+                additional,
+                ..Default::default()
+            },
+            tables: vec![],
+            detected_languages: None,
+            chunks: None,
+            images: None,
+        })
+    }
+    fn supported_mime_types(&self) -> &[&str] {
+        &[
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "application/vnd.oasis.opendocument.text",
+            "application/epub+zip",
+            "application/x-latex",
+            "text/x-tex",
+            "text/x-rst",
+            "text/prs.fallenstein.rst",
+            "application/rtf",
+            "text/rtf",
+            "application/x-typst",
+            "application/x-ipynb+json",
+            "application/x-fictionbook+xml",
+            "text/x-org",
+            "text/x-commonmark",
+            "text/x-gfm",
+            "text/x-multimarkdown",
+            "text/x-markdown-extra",
+            "application/docbook+xml",
+            "application/x-jats+xml",
+            "application/x-opml+xml",
+        ]
+    }
+    fn priority(&self) -> i32 {
+        40
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::extraction::pandoc::validate_pandoc_version;
+    #[tokio::test]
+    async fn test_pandoc_extractor_plugin_interface() {
+        let extractor = PandocExtractor::new();
+        assert_eq!(extractor.name(), "pandoc-extractor");
+        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
+        assert_eq!(extractor.priority(), 40);
+        assert!(!extractor.supported_mime_types().is_empty());
+    }
+    #[tokio::test]
+    async fn test_pandoc_extractor_supports_docx() {
+        let extractor = PandocExtractor::new();
+        assert!(
+            extractor
+                .supported_mime_types()
+                .contains(&"application/vnd.openxmlformats-officedocument.wordprocessingml.document")
+        );
+    }
+    #[tokio::test]
+    async fn test_pandoc_extractor_supports_odt() {
+        let extractor = PandocExtractor::new();
+        assert!(
+            extractor
+                .supported_mime_types()
+                .contains(&"application/vnd.oasis.opendocument.text")
+        );
+    }
+    #[tokio::test]
+    async fn test_pandoc_extractor_supports_epub() {
+        let extractor = PandocExtractor::new();
+        assert!(extractor.supported_mime_types().contains(&"application/epub+zip"));
+    }
+    #[tokio::test]
+    async fn test_pandoc_extractor_supports_latex() {
+        let extractor = PandocExtractor::new();
+        assert!(extractor.supported_mime_types().contains(&"application/x-latex"));
+    }
+    #[tokio::test]
+    async fn test_pandoc_extractor_supports_rst() {
+        let extractor = PandocExtractor::new();
+        assert!(extractor.supported_mime_types().contains(&"text/x-rst"));
+    }
+    #[tokio::test]
+    async fn test_pandoc_extractor_markdown() {
+        if validate_pandoc_version().await.is_err() {
+            return;
+        }
+        let extractor = PandocExtractor::new();
+        let markdown = b"# Hello World\n\nThis is a test.";
+        let config = ExtractionConfig::default();
+        let result = extractor.extract_bytes(markdown, "text/x-rst", &config).await;
+        let _ = result;
+    }
+    #[tokio::test]
+    async fn test_pandoc_extractor_default() {
+        let extractor = PandocExtractor;
+        assert_eq!(extractor.name(), "pandoc-extractor");
+    }
+    #[tokio::test]
+    async fn test_pandoc_extractor_initialize_shutdown() {
+        let extractor = PandocExtractor::new();
+        assert!(extractor.initialize().is_ok());
+        assert!(extractor.shutdown().is_ok());
+    }
+}

data/vendor/kreuzberg/src/extractors/pdf.rs CHANGED Viewed

@@ -147,25 +147,31 @@ fn extract_tables_from_document(
     let mut all_tables = Vec::new();
+    // Process each page
     for (page_index, page) in document.pages().iter().enumerate() {
-        let words = extract_words_from_page(&page, 0.0)?;
+        // Extract words with positions from the page
+        let words = extract_words_from_page(&page, 0.0)?; // Use 0.0 confidence for PDF (always high quality)
         if words.is_empty() {
             continue;
         }
+        // Use existing table reconstruction logic
+        // These thresholds match the defaults from TesseractConfig
         let column_threshold = 50;
         let row_threshold_ratio = 0.5;
+        // Reconstruct table from positioned words
         let table_cells = reconstruct_table(&words, column_threshold, row_threshold_ratio, true);
         if !table_cells.is_empty() {
+            // Generate markdown representation
             let markdown = table_to_markdown(&table_cells);
             all_tables.push(Table {
                 cells: table_cells,
                 markdown,
-                page_number: page_index + 1,
+                page_number: page_index + 1, // 1-indexed
             });
         }
     }
@@ -281,13 +287,6 @@ impl Plugin for PdfExtractor {
 #[async_trait]
 impl DocumentExtractor for PdfExtractor {
-    #[cfg_attr(feature = "otel", tracing::instrument(
-        skip(self, content, config),
-        fields(
-            extractor.name = self.name(),
-            content.size_bytes = content.len(),
-        )
-    ))]
     async fn extract_bytes(
         &self,
         content: &[u8],
@@ -296,10 +295,9 @@ impl DocumentExtractor for PdfExtractor {
     ) -> Result<ExtractionResult> {
         #[cfg(feature = "pdf")]
         let (pdf_metadata, native_text, tables) = if crate::core::batch_mode::is_batch_mode() {
+            // Batch mode: Move PDF extraction to blocking thread pool to enable parallelism
             let content_owned = content.to_vec();
-            let span = tracing::Span::current();
             tokio::task::spawn_blocking(move || {
-                let _guard = span.entered();
                 let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
                     .or_else(|_| Pdfium::bind_to_system_library())
                     .map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
@@ -318,6 +316,7 @@ impl DocumentExtractor for PdfExtractor {
                 let metadata = crate::pdf::metadata::extract_metadata_from_document(&document)?;
                 let native_text = crate::pdf::text::extract_text_from_pdf_document(&document)?;
+                // Extract tables from native PDF text (when not using OCR)
                 let tables = extract_tables_from_document(&document, &metadata)?;
                 Ok::<_, crate::error::KreuzbergError>((metadata, native_text, tables))
@@ -325,6 +324,7 @@ impl DocumentExtractor for PdfExtractor {
             .await
             .map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
         } else {
+            // Single-file mode: Direct extraction (no spawn overhead)
             let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
                 .or_else(|_| Pdfium::bind_to_system_library())
                 .map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
@@ -343,6 +343,7 @@ impl DocumentExtractor for PdfExtractor {
             let metadata = crate::pdf::metadata::extract_metadata_from_document(&document)?;
             let native_text = crate::pdf::text::extract_text_from_pdf_document(&document)?;
+            // Extract tables from native PDF text (when not using OCR)
             let tables = extract_tables_from_document(&document, &metadata)?;
             (metadata, native_text, tables)
@@ -415,6 +416,9 @@ impl DocumentExtractor for PdfExtractor {
             None
         };
+        // Tables were already extracted during the metadata/text extraction phase
+        // (see the extract_tables_from_document function above)
         Ok(ExtractionResult {
             content: text,
             mime_type: mime_type.to_string(),
@@ -430,7 +434,6 @@ impl DocumentExtractor for PdfExtractor {
         })
     }
-    #[cfg(feature = "tokio-runtime")]
     async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
         let bytes = tokio::fs::read(path).await?;
         self.extract_bytes(&bytes, mime_type, config).await

data/vendor/kreuzberg/src/extractors/pptx.rs CHANGED Viewed

@@ -43,10 +43,8 @@ impl PptxExtractor {
         for image in &mut images {
             let image_data = image.data.clone();
             let tess_config_clone = tess_config.clone();
-            let span = tracing::Span::current();
             let ocr_result = tokio::task::spawn_blocking(move || {
-                let _guard = span.entered();
                 let cache_dir = std::env::var("KREUZBERG_CACHE_DIR").ok().map(std::path::PathBuf::from);
                 let proc = OcrProcessor::new(cache_dir)?;
@@ -102,13 +100,6 @@ impl Plugin for PptxExtractor {
 #[async_trait]
 impl DocumentExtractor for PptxExtractor {
-    #[cfg_attr(feature = "otel", tracing::instrument(
-        skip(self, content, config),
-        fields(
-            extractor.name = self.name(),
-            content.size_bytes = content.len(),
-        )
-    ))]
     async fn extract_bytes(
         &self,
         content: &[u8],
@@ -117,16 +108,17 @@ impl DocumentExtractor for PptxExtractor {
     ) -> Result<ExtractionResult> {
         let extract_images = config.images.as_ref().is_some_and(|img| img.extract_images);
+        // Extract PPTX content
         let pptx_result = if crate::core::batch_mode::is_batch_mode() {
+            // Batch mode: Use spawn_blocking for parallelism
             let content_owned = content.to_vec();
-            let span = tracing::Span::current();
             tokio::task::spawn_blocking(move || {
-                let _guard = span.entered();
                 crate::extraction::pptx::extract_pptx_from_bytes(&content_owned, extract_images)
             })
             .await
             .map_err(|e| crate::error::KreuzbergError::parsing(format!("PPTX extraction task failed: {}", e)))??
         } else {
+            // Single-file mode: Direct extraction (no spawn overhead)
             crate::extraction::pptx::extract_pptx_from_bytes(content, extract_images)?
         };
@@ -164,12 +156,6 @@ impl DocumentExtractor for PptxExtractor {
         })
     }
-    #[cfg_attr(feature = "otel", tracing::instrument(
-        skip(self, path, config),
-        fields(
-            extractor.name = self.name(),
-        )
-    ))]
     async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
         let path_str = path
             .to_str()

data/vendor/kreuzberg/src/extractors/structured.rs CHANGED Viewed

@@ -42,13 +42,6 @@ impl Plugin for StructuredExtractor {
 #[async_trait]
 impl DocumentExtractor for StructuredExtractor {
-    #[cfg_attr(feature = "otel", tracing::instrument(
-        skip(self, content, _config),
-        fields(
-            extractor.name = self.name(),
-            content.size_bytes = content.len(),
-        )
-    ))]
     async fn extract_bytes(
         &self,
         content: &[u8],
@@ -87,13 +80,6 @@ impl DocumentExtractor for StructuredExtractor {
         })
     }
-    #[cfg(feature = "tokio-runtime")]
-    #[cfg_attr(feature = "otel", tracing::instrument(
-        skip(self, path, config),
-        fields(
-            extractor.name = self.name(),
-        )
-    ))]
     async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
         let bytes = tokio::fs::read(path).await?;
         self.extract_bytes(&bytes, mime_type, config).await

data/vendor/kreuzberg/src/extractors/text.rs CHANGED Viewed

@@ -53,33 +53,22 @@ impl Plugin for PlainTextExtractor {
 #[async_trait]
 impl DocumentExtractor for PlainTextExtractor {
-    #[cfg_attr(feature = "otel", tracing::instrument(
-        skip(self, content, _config),
-        fields(
-            extractor.name = self.name(),
-            content.size_bytes = content.len(),
-        )
-    ))]
     async fn extract_bytes(
         &self,
         content: &[u8],
         mime_type: &str,
         _config: &ExtractionConfig,
     ) -> Result<ExtractionResult> {
-        let text = String::from_utf8_lossy(content).into_owned();
-        let text = text.trim_end_matches('\n').trim_end_matches('\r').to_string();
-        let line_count = text.lines().count();
-        let word_count = text.split_whitespace().count();
-        let character_count = text.len();
+        let text_result = parse_text(content, false)?;
         Ok(ExtractionResult {
-            content: text,
+            content: text_result.content,
             mime_type: mime_type.to_string(),
             metadata: crate::types::Metadata {
                 format: Some(crate::types::FormatMetadata::Text(crate::types::TextMetadata {
-                    line_count,
-                    word_count,
-                    character_count,
+                    line_count: text_result.line_count,
+                    word_count: text_result.word_count,
+                    character_count: text_result.character_count,
                     headers: None,
                     links: None,
                     code_blocks: None,
@@ -149,13 +138,6 @@ impl Plugin for MarkdownExtractor {
 #[async_trait]
 impl DocumentExtractor for MarkdownExtractor {
-    #[cfg_attr(feature = "otel", tracing::instrument(
-        skip(self, content, _config),
-        fields(
-            extractor.name = self.name(),
-            content.size_bytes = content.len(),
-        )
-    ))]
     async fn extract_bytes(
         &self,
         content: &[u8],

data/vendor/kreuzberg/src/extractors/xml.rs CHANGED Viewed

@@ -53,13 +53,6 @@ impl Plugin for XmlExtractor {
 #[async_trait]
 impl DocumentExtractor for XmlExtractor {
-    #[cfg_attr(feature = "otel", tracing::instrument(
-        skip(self, content, _config),
-        fields(
-            extractor.name = self.name(),
-            content.size_bytes = content.len(),
-        )
-    ))]
     async fn extract_bytes(
         &self,
         content: &[u8],

data/vendor/kreuzberg/src/keywords/rake.rs CHANGED Viewed

@@ -248,6 +248,7 @@ mod tests {
         let english_text = "Natural language processing is a subfield of artificial intelligence.";
         let config = KeywordConfig::rake().with_language("fr");
         let keywords = extract_keywords_rake(english_text, &config).unwrap();
+        dbg!(&keywords);
         assert!(
             !keywords.is_empty(),
             "Should fall back to English stopwords and extract keywords"

data/vendor/kreuzberg/src/lib.rs CHANGED Viewed

@@ -39,7 +39,6 @@ pub mod core;
 pub mod error;
 pub mod extraction;
 pub mod extractors;
-pub mod panic_context;
 pub mod plugins;
 pub mod text;
 pub mod types;
@@ -80,9 +79,7 @@ pub mod pdf;
 pub use error::{KreuzbergError, Result};
 pub use types::*;
-#[cfg(feature = "tokio-runtime")]
-pub use core::extractor::{batch_extract_bytes, batch_extract_file};
-pub use core::extractor::{extract_bytes, extract_file};
+pub use core::extractor::{batch_extract_bytes, batch_extract_file, extract_bytes, extract_file};
 pub use core::extractor::{batch_extract_bytes_sync, batch_extract_file_sync, extract_bytes_sync, extract_file_sync};

data/vendor/kreuzberg/src/mcp/mod.rs CHANGED Viewed

@@ -16,7 +16,7 @@
 //! use kreuzberg::mcp::start_mcp_server;
 //!
 //! #[tokio::main]
-//! async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
 //!     start_mcp_server().await?;
 //!     Ok(())
 //! }

data/vendor/kreuzberg/src/mcp/server.rs CHANGED Viewed

@@ -428,12 +428,12 @@ impl Default for KreuzbergMcp {
 /// use kreuzberg::mcp::start_mcp_server;
 ///
 /// #[tokio::main]
-/// async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
 ///     start_mcp_server().await?;
 ///     Ok(())
 /// }
 /// ```
-pub async fn start_mcp_server() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+pub async fn start_mcp_server() -> Result<(), Box<dyn std::error::Error>> {
     let service = KreuzbergMcp::new()?.serve(stdio()).await?;
     service.waiting().await?;
@@ -444,9 +444,7 @@ pub async fn start_mcp_server() -> Result<(), Box<dyn std::error::Error + Send +
 ///
 /// This variant allows specifying a custom extraction configuration
 /// (e.g., loaded from a file) instead of using defaults.
-pub async fn start_mcp_server_with_config(
-    config: ExtractionConfig,
-) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+pub async fn start_mcp_server_with_config(config: ExtractionConfig) -> Result<(), Box<dyn std::error::Error>> {
     let service = KreuzbergMcp::with_config(config).serve(stdio()).await?;
     service.waiting().await?;

data/vendor/kreuzberg/src/ocr/processor.rs CHANGED Viewed

@@ -51,14 +51,6 @@ impl OcrProcessor {
         Ok(Self { cache })
     }
-    #[cfg_attr(feature = "otel", tracing::instrument(
-        skip(self, image_bytes),
-        fields(
-            ocr.backend = "tesseract",
-            ocr.language = %config.language,
-            image.size_bytes = image_bytes.len(),
-        )
-    ))]
     pub fn process_image(&self, image_bytes: &[u8], config: &TesseractConfig) -> Result<OcrExtractionResult, OcrError> {
         config.validate().map_err(OcrError::InvalidConfiguration)?;
@@ -72,14 +64,9 @@ impl OcrProcessor {
         if config.use_cache
             && let Some(cached_result) = self.cache.get_cached_result(&image_hash, "tesseract", &config_str)?
         {
-            #[cfg(feature = "otel")]
-            tracing::Span::current().record("cache.hit", true);
             return Ok(cached_result);
         }
-        #[cfg(feature = "otel")]
-        tracing::Span::current().record("cache.hit", false);
         let result = self.perform_ocr(image_bytes, config)?;
         if config.use_cache {
@@ -241,6 +228,7 @@ impl OcrProcessor {
         });
         // Validate language before initializing to prevent segfault ~keep
+        // tesseract-rs can crash on empty language or missing language files
         if config.language.trim().is_empty() {
             return Err(OcrError::TesseractInitializationFailed(
                 "Language cannot be empty. Please specify a valid language code (e.g., 'eng')".to_string(),
@@ -248,6 +236,7 @@ impl OcrProcessor {
         }
         // Validate language file exists before initializing to prevent segfault ~keep
+        // tesseract-rs can crash if language file is missing instead of returning error
         if !tessdata_path.is_empty() {
             let languages: Vec<&str> = config.language.split('+').collect();
             for lang in languages {
@@ -373,11 +362,6 @@ impl OcrProcessor {
             )
         });
-        api.recognize()
-            .map_err(|e| OcrError::ProcessingFailed(format!("Failed to recognize text: {}", e)))?;
-        log_ci_debug(ci_debug_enabled, "recognize", || "completed".to_string());
         let tsv_data_for_tables = if config.enable_table_detection || config.output_format == "tsv" {
             Some(
                 api.get_tsv_text(0)

data/vendor/kreuzberg/src/pdf/error.rs CHANGED Viewed

@@ -40,7 +40,7 @@ impl std::error::Error for PdfError {}
 impl From<lopdf::Error> for PdfError {
     fn from(err: lopdf::Error) -> Self {
         match err {
-            lopdf::Error::IO(io_err) => PdfError::IOError(io_err.to_string()),
+            lopdf::Error::IO(_) => panic!("lopdf IO errors should not be converted to PdfError - let them bubble up"),
             _ => PdfError::InvalidPdf(err.to_string()),
         }
     }