RubyGems - kreuzberg - Versions diffs - 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8 - Mend

kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127) hide show

checksums.yaml +4 -4
data/Gemfile.lock +5 -5
data/README.md +15 -9
data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
data/kreuzberg.gemspec +38 -4
data/lib/kreuzberg/config.rb +34 -1
data/lib/kreuzberg/result.rb +77 -14
data/lib/kreuzberg/version.rb +1 -1
data/sig/kreuzberg.rbs +23 -6
data/vendor/kreuzberg/Cargo.toml +32 -11
data/vendor/kreuzberg/README.md +54 -8
data/vendor/kreuzberg/build.rs +549 -132
data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
data/vendor/kreuzberg/src/core/config.rs +49 -1
data/vendor/kreuzberg/src/core/extractor.rs +134 -2
data/vendor/kreuzberg/src/core/mod.rs +4 -2
data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
data/vendor/kreuzberg/src/extraction/html.rs +24 -8
data/vendor/kreuzberg/src/extraction/image.rs +124 -1
data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
data/vendor/kreuzberg/src/extractors/email.rs +29 -15
data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
data/vendor/kreuzberg/src/extractors/html.rs +29 -15
data/vendor/kreuzberg/src/extractors/image.rs +25 -4
data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
data/vendor/kreuzberg/src/extractors/text.rs +7 -2
data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
data/vendor/kreuzberg/src/lib.rs +10 -2
data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
data/vendor/kreuzberg/src/mcp/server.rs +120 -12
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
data/vendor/kreuzberg/src/pdf/error.rs +8 -0
data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
data/vendor/kreuzberg/src/pdf/table.rs +26 -2
data/vendor/kreuzberg/src/pdf/text.rs +89 -7
data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
data/vendor/kreuzberg/src/text/mod.rs +6 -0
data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
data/vendor/kreuzberg/src/types.rs +173 -21
data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
data/vendor/kreuzberg/tests/config_features.rs +15 -1
data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
data/vendor/kreuzberg/tests/email_integration.rs +2 -0
data/vendor/kreuzberg/tests/error_handling.rs +43 -34
data/vendor/kreuzberg/tests/format_integration.rs +2 -0
data/vendor/kreuzberg/tests/image_integration.rs +2 -0
data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
data/vendor/kreuzberg/tests/security_validation.rs +1 -0
data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
data/vendor/rb-sys/Cargo.lock +15 -15
data/vendor/rb-sys/Cargo.toml +4 -4
data/vendor/rb-sys/Cargo.toml.orig +4 -4
data/vendor/rb-sys/build/features.rs +5 -2
data/vendor/rb-sys/build/main.rs +55 -15
data/vendor/rb-sys/build/stable_api_config.rs +4 -2
data/vendor/rb-sys/build/version.rs +3 -1
data/vendor/rb-sys/src/lib.rs +1 -0
data/vendor/rb-sys/src/macros.rs +2 -2
data/vendor/rb-sys/src/special_consts.rs +1 -1
data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
data/vendor/rb-sys/src/stable_api.rs +0 -1
data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
metadata +13 -10
data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
data/vendor/rb-sys/.cargo-ok +0 -1
data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316

data/vendor/kreuzberg/src/text/quality_processor.rs ADDED Viewed

@@ -0,0 +1,219 @@
+//! Quality processing post-processor.
+//!
+//! This module provides a PostProcessor plugin that computes a quality score for
+//! extraction results and records it in the result metadata.
+use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
+use crate::{ExtractionConfig, ExtractionResult, Result};
+use async_trait::async_trait;
+/// Post-processor that calculates a quality score for extracted text.
+///
+/// This processor:
+/// - Runs in the Early processing stage
+/// - Reports `should_process == true` only when `config.enable_quality_processing` is true
+/// - Stores quality score in `metadata.additional["quality_score"]`
+/// - Does not modify `result.content`; `process` only reads it to compute the score
+///
+/// # Example
+///
+/// ```rust,no_run
+/// use kreuzberg::plugins::{Plugin, PostProcessor};
+/// use kreuzberg::text::quality::processor::QualityProcessor; // NOTE(review): this file is text/quality_processor.rs - confirm this path resolves
+///
+/// let processor = QualityProcessor;
+/// assert_eq!(processor.name(), "quality-processing");
+/// ```
+#[derive(Debug, Clone, Copy)]
+pub struct QualityProcessor;
+impl Plugin for QualityProcessor {
+    fn name(&self) -> &str {
+        "quality-processing"
+    }
+    fn version(&self) -> String {
+        env!("CARGO_PKG_VERSION").to_string() // package version baked in at compile time by Cargo
+    }
+    fn initialize(&self) -> Result<()> {
+        Ok(()) // unit struct with no state: nothing to set up
+    }
+    fn shutdown(&self) -> Result<()> {
+        Ok(()) // nothing to release
+    }
+}
+#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
+#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
+impl PostProcessor for QualityProcessor {
+    async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
+        // Score the content unconditionally; the `enable_quality_processing` check lives in `should_process`.
+        let quality_score = crate::text::quality::calculate_quality_score(
+            &result.content,
+            Some(
+                &result
+                    .metadata
+                    .additional
+                    .iter()
+                    .map(|(k, v)| (k.clone(), v.to_string())) // each JSON value is passed along in its `to_string()` form
+                    .collect(),
+            ),
+        );
+        result.metadata.additional.insert(
+            "quality_score".to_string(),
+            serde_json::Value::Number(
+                serde_json::Number::from_f64(quality_score).unwrap_or(serde_json::Number::from(0)), // stores 0 when `from_f64` yields None (non-finite score)
+            ),
+        );
+        Ok(())
+    }
+    fn processing_stage(&self) -> ProcessingStage {
+        ProcessingStage::Early
+    }
+    fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
+        config.enable_quality_processing
+    }
+    fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
+        let text_length = result.content.len();
+        // Estimate ~1ms per 100 KiB (102400 bytes) of content, never less than 1ms
+        (text_length / 102400).max(1) as u64
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::types::Metadata;
+    #[tokio::test]
+    async fn test_quality_processor() {
+        let processor = QualityProcessor;
+        let config = ExtractionConfig {
+            enable_quality_processing: true,
+            ..Default::default()
+        };
+        let mut result = ExtractionResult {
+	            content: "This is a well-written paragraph with proper structure. It contains multiple sentences. The quality should be good.".to_string(),
+	            mime_type: "text/plain".to_string(),
+	            metadata: Metadata::default(),
+	            tables: vec![],
+	            detected_languages: None,
+	            chunks: None,
+	            images: None,
+	            pages: None,
+	        };
+        processor.process(&mut result, &config).await.unwrap();
+        assert!(result.metadata.additional.contains_key("quality_score"));
+        let score = result.metadata.additional.get("quality_score").unwrap();
+        assert!(score.is_number());
+    }
+    #[tokio::test]
+    async fn test_quality_processor_disabled() {
+        let processor = QualityProcessor;
+        let config = ExtractionConfig {
+            enable_quality_processing: false,
+            ..Default::default()
+        };
+        let mut result = ExtractionResult {
+            content: "Some text".to_string(),
+            mime_type: "text/plain".to_string(),
+            metadata: Metadata::default(),
+            tables: vec![],
+            detected_languages: None,
+            chunks: None,
+            images: None,
+            pages: None,
+        };
+        // `process` is invoked directly here, bypassing `should_process`, so it still runs even
+        // though the flag is false; this test only verifies that the call returns Ok.
+        processor.process(&mut result, &config).await.unwrap();
+    }
+    #[test]
+    fn test_quality_processor_plugin_interface() {
+        let processor = QualityProcessor;
+        assert_eq!(processor.name(), "quality-processing");
+        assert!(!processor.version().is_empty());
+        assert!(processor.initialize().is_ok());
+        assert!(processor.shutdown().is_ok());
+    }
+    #[test]
+    fn test_quality_processor_stage() {
+        let processor = QualityProcessor;
+        assert_eq!(processor.processing_stage(), ProcessingStage::Early);
+    }
+    #[test]
+    fn test_quality_processor_should_process() {
+        let processor = QualityProcessor;
+        let result = ExtractionResult {
+            content: "Sample text".to_string(),
+            mime_type: "text/plain".to_string(),
+            metadata: Metadata::default(),
+            tables: vec![],
+            detected_languages: None,
+            chunks: None,
+            images: None,
+            pages: None,
+        };
+        let config_with_quality = ExtractionConfig {
+            enable_quality_processing: true,
+            ..Default::default()
+        };
+        assert!(processor.should_process(&result, &config_with_quality));
+        let config_without_quality = ExtractionConfig {
+            enable_quality_processing: false,
+            ..Default::default()
+        };
+        assert!(!processor.should_process(&result, &config_without_quality));
+    }
+    #[test]
+    fn test_quality_processor_estimated_duration() {
+        let processor = QualityProcessor;
+        let short_result = ExtractionResult {
+            content: "Short".to_string(),
+            mime_type: "text/plain".to_string(),
+            metadata: Metadata::default(),
+            tables: vec![],
+            detected_languages: None,
+            chunks: None,
+            images: None,
+            pages: None,
+        };
+        let long_result = ExtractionResult {
+            content: "a".repeat(1000000),
+            mime_type: "text/plain".to_string(),
+            metadata: Metadata::default(),
+            tables: vec![],
+            detected_languages: None,
+            chunks: None,
+            images: None,
+            pages: None,
+        };
+        let short_duration = processor.estimated_duration_ms(&short_result);
+        let long_duration = processor.estimated_duration_ms(&long_result);
+        assert!(long_duration > short_duration);
+    }
+}

data/vendor/kreuzberg/src/types.rs CHANGED Viewed

@@ -34,6 +34,13 @@ pub struct ExtractionResult {
     /// Each image may optionally contain a nested `ocr_result` if OCR was performed.
     #[serde(skip_serializing_if = "Option::is_none")]
     pub images: Option<Vec<ExtractedImage>>,
+    /// Per-page content when page extraction is enabled.
+    ///
+    /// When page extraction is configured, the document is split into per-page content
+    /// with tables and images mapped to their respective pages.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub pages: Option<Vec<PageContent>>,
 }
 /// Format-specific metadata (discriminated union).
@@ -62,17 +69,49 @@ pub enum FormatMetadata {
 /// via a discriminated union, and additional custom fields from postprocessors.
 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
 pub struct Metadata {
-    /// Language of the document (ISO 639 code)
+    /// Document title
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub title: Option<String>,
+    /// Document subject or description
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub subject: Option<String>,
+    /// Primary author(s) - always Vec for consistency
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub authors: Option<Vec<String>>,
+    /// Keywords/tags - always Vec for consistency
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub keywords: Option<Vec<String>>,
+    /// Primary language (ISO 639 code)
     #[serde(skip_serializing_if = "Option::is_none")]
     pub language: Option<String>,
-    /// Document date (format varies by source)
+    /// Creation timestamp (ISO 8601 format)
     #[serde(skip_serializing_if = "Option::is_none")]
-    pub date: Option<String>,
+    pub created_at: Option<String>,
-    /// Document subject/description
+    /// Last modification timestamp (ISO 8601 format)
     #[serde(skip_serializing_if = "Option::is_none")]
-    pub subject: Option<String>,
+    pub modified_at: Option<String>,
+    /// User who created the document
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub created_by: Option<String>,
+    /// User who last modified the document
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub modified_by: Option<String>,
+    /// Page/slide/sheet structure with boundaries
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub pages: Option<PageStructure>,
+    /// Document date (DEPRECATED - use created_at/modified_at instead)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub date: Option<String>,
     /// Format-specific metadata (discriminated union)
     ///
@@ -102,6 +141,110 @@ pub struct Metadata {
     pub additional: HashMap<String, serde_json::Value>,
 }
+/// Unified page structure for documents.
+///
+/// Supports different page types (PDF pages, PPTX slides, Excel sheets)
+/// with byte offset boundaries for chunk-to-page mapping.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PageStructure {
+    /// Total number of pages/slides/sheets
+    pub total_count: usize,
+    /// Type of paginated unit
+    pub unit_type: PageUnitType,
+    /// Byte offset boundaries for each page
+    ///
+    /// Maps byte ranges in the extracted content to page numbers.
+    /// Used for chunk page range calculation.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub boundaries: Option<Vec<PageBoundary>>,
+    /// Detailed per-page metadata (optional, only when needed)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub pages: Option<Vec<PageInfo>>,
+}
+/// Type of paginated unit in a document.
+///
+/// Distinguishes PDF pages, presentation slides, and spreadsheet sheets; serialized in snake_case (`page`, `slide`, `sheet`).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum PageUnitType {
+    /// Standard document pages (PDF, DOCX, images)
+    Page,
+    /// Presentation slides (PPTX, ODP)
+    Slide,
+    /// Spreadsheet sheets (XLSX, ODS)
+    Sheet,
+}
+/// Byte offset boundary for a page.
+///
+/// Tracks the half-open byte range `[byte_start, byte_end)` a page's content occupies in the main content string,
+/// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
+/// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PageBoundary {
+    /// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
+    pub byte_start: usize,
+    /// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
+    pub byte_end: usize,
+    /// Page number (1-indexed)
+    pub page_number: usize,
+}
+/// Metadata for individual page/slide/sheet.
+///
+/// Captures per-page dimensions, content counts, and visibility state (for presentations).
+/// Every field except `number` is optional and is skipped during serialization when `None`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PageInfo {
+    /// Page number (1-indexed)
+    pub number: usize,
+    /// Page title (usually for presentations)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub title: Option<String>,
+    /// Dimensions in points (PDF) or pixels (images): (width, height)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub dimensions: Option<(f64, f64)>,
+    /// Number of images on this page
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_count: Option<usize>,
+    /// Number of tables on this page
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub table_count: Option<usize>,
+    /// Whether this page is hidden (e.g., in presentations)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub hidden: Option<bool>,
+}
+/// Content for a single page/slide.
+///
+/// When page extraction is enabled, documents are split into per-page content
+/// with associated tables and images mapped to each page.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PageContent {
+    /// Page number (1-indexed)
+    pub page_number: usize,
+    /// Text content for this page
+    pub content: String,
+    /// Tables found on this page (not serialized when empty; defaults to empty when absent on deserialization)
+    #[serde(skip_serializing_if = "Vec::is_empty", default)]
+    pub tables: Vec<Table>,
+    /// Images found on this page (not serialized when empty; defaults to empty when absent on deserialization)
+    #[serde(skip_serializing_if = "Vec::is_empty", default)]
+    pub images: Vec<ExtractedImage>,
+}
 /// Excel/spreadsheet metadata.
 ///
 /// Contains information about sheets in Excel, LibreOffice Calc, and other
@@ -348,11 +491,11 @@ pub struct Chunk {
 /// Metadata about a chunk's position in the original document.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ChunkMetadata {
-    /// Character offset where this chunk starts in the original text.
-    pub char_start: usize,
+    /// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
+    pub byte_start: usize,
-    /// Character offset where this chunk ends in the original text.
-    pub char_end: usize,
+    /// Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
+    pub byte_end: usize,
     /// Number of tokens in this chunk (if available).
     ///
@@ -365,6 +508,18 @@ pub struct ChunkMetadata {
     /// Total number of chunks in the document.
     pub total_chunks: usize,
+    /// First page number this chunk spans (1-indexed).
+    ///
+    /// Only populated when page tracking is enabled in extraction configuration.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub first_page: Option<usize>,
+    /// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
+    ///
+    /// Only populated when page tracking is enabled in extraction configuration.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub last_page: Option<usize>,
 }
 /// Extracted image from a document.
@@ -505,22 +660,22 @@ pub struct PptxExtractionResult {
     pub table_count: usize,
     /// Extracted images from the presentation
     pub images: Vec<ExtractedImage>,
+    /// Slide structure with boundaries (when page tracking is enabled)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub page_structure: Option<PageStructure>,
+    /// Per-slide content (when page tracking is enabled)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub page_contents: Option<Vec<PageContent>>,
 }
 /// PowerPoint presentation metadata.
 ///
-/// Contains document-level metadata extracted from the PPTX file.
+/// Contains PPTX-specific metadata. Common fields like title, author, and description
+/// are now in the base `Metadata` struct.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct PptxMetadata {
-    /// Presentation title
-    pub title: Option<String>,
-    /// Author name
-    pub author: Option<String>,
-    /// Description/comments
-    pub description: Option<String>,
-    /// Summary text
-    pub summary: Option<String>,
     /// List of fonts used in the presentation
+    #[serde(skip_serializing_if = "Vec::is_empty", default)]
     pub fonts: Vec<String>,
 }
@@ -885,19 +1040,16 @@ mod tests {
         let json = serde_json::to_value(&metadata).unwrap();
         println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
-        // Check that format_type is present
         assert!(
             json.get("format_type").is_some(),
             "format_type should be present in serialized JSON"
         );
         assert_eq!(json.get("format_type").unwrap(), "text");
-        // Check that Text metadata fields are present
         assert_eq!(json.get("line_count").unwrap(), 1);
         assert_eq!(json.get("word_count").unwrap(), 2);
         assert_eq!(json.get("character_count").unwrap(), 13);
-        // Check that additional field is merged
         assert_eq!(json.get("quality_score").unwrap(), 1.0);
     }
 }

data/vendor/kreuzberg/tests/archive_integration.rs CHANGED Viewed

@@ -3,6 +3,8 @@
 //! Tests for ZIP, TAR, TAR.GZ, and 7z archive extraction.
 //! Validates metadata extraction, content extraction, nested archives, and error handling.
+#![cfg(feature = "archives")]
 use kreuzberg::core::config::ExtractionConfig;
 use kreuzberg::core::extractor::{extract_bytes, extract_bytes_sync};
 use std::io::{Cursor, Write};

data/vendor/kreuzberg/tests/batch_processing.rs CHANGED Viewed

@@ -4,9 +4,9 @@
 //! Validates concurrent processing, error handling, and performance.
 use kreuzberg::core::config::ExtractionConfig;
-use kreuzberg::core::extractor::{
-    batch_extract_bytes, batch_extract_bytes_sync, batch_extract_file, batch_extract_file_sync,
-};
+#[cfg(feature = "pdf")]
+use kreuzberg::core::extractor::batch_extract_file_sync;
+use kreuzberg::core::extractor::{batch_extract_bytes, batch_extract_bytes_sync, batch_extract_file};
 use std::path::PathBuf;
 mod helpers;
@@ -26,6 +26,7 @@ fn assert_text_content(actual: &str, expected: &str) {
 /// Test batch extraction with multiple file formats (PDF, DOCX, TXT).
 #[tokio::test]
+#[cfg(all(feature = "pdf", feature = "office", feature = "tokio-runtime"))]
 async fn test_batch_extract_file_multiple_formats() {
     if !test_documents_available() {
         println!("Skipping test: test_documents/ directory not found");
@@ -73,6 +74,7 @@ async fn test_batch_extract_file_multiple_formats() {
 /// Test synchronous batch extraction variant.
 #[test]
+#[cfg(feature = "pdf")]
 fn test_batch_extract_file_sync_variant() {
     if !test_documents_available() {
         println!("Skipping test: test_documents/ directory not found");

data/vendor/kreuzberg/tests/concurrency_stress.rs CHANGED Viewed

@@ -18,7 +18,6 @@ use kreuzberg::plugins::registry::{get_document_extractor_registry, get_post_pro
 use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
 use kreuzberg::types::{ExtractionResult, Metadata};
 use std::sync::Arc;
-use std::sync::atomic::{AtomicUsize, Ordering};
 #[cfg(feature = "ocr")]
 use kreuzberg::core::config::OcrConfig;
@@ -52,13 +51,16 @@ fn assert_text_content(actual: &str, expected: &str) {
 async fn test_concurrent_extractions_mixed_formats() {
     let config = ExtractionConfig::default();
-    let test_cases = vec![
+    #[allow(unused_mut)]
+    let mut test_cases = vec![
         (b"Plain text content" as &[u8], "text/plain"),
         (b"{\"key\": \"value\"}", "application/json"),
-        (b"<root><item>XML content</item></root>", "application/xml"),
         (b"# Markdown\n\nContent here", "text/markdown"),
     ];
+    #[cfg(feature = "xml")]
+    test_cases.push((b"<root><item>XML content</item></root>" as &[u8], "application/xml"));
     let mut handles = vec![];
     for _ in 0..10 {
         for (data, mime_type) in &test_cases {
@@ -242,6 +244,7 @@ async fn test_concurrent_ocr_processing() {
 #[test]
 fn test_concurrent_ocr_cache_stress() {
     use helpers::{get_test_file_path, skip_if_missing};
+    use std::sync::atomic::Ordering;
     if skip_if_missing("images/ocr_image.jpg") {
         tracing::debug!("Skipping OCR cache stress test: test file not available");
@@ -366,6 +369,7 @@ async fn test_concurrent_pipeline_processing() {
                 detected_languages: None,
                 chunks: None,
                 images: None,
+                pages: None,
             };
             run_pipeline(result, &config).await
@@ -486,13 +490,16 @@ async fn test_high_concurrency_stress() {
         ..Default::default()
     };
-    let formats = vec![
+    #[allow(unused_mut)]
+    let mut formats = vec![
         (b"Text content" as &[u8], "text/plain"),
         (b"{\"json\": true}", "application/json"),
-        (b"<xml><item>content</item></xml>", "application/xml"),
         (b"# Markdown\n\nContent", "text/markdown"),
     ];
+    #[cfg(feature = "xml")]
+    formats.push((b"<xml><item>content</item></xml>" as &[u8], "application/xml"));
     let mut handles = vec![];
     for _ in 0..100 {
         for (data, mime_type) in &formats {
@@ -516,9 +523,10 @@ async fn test_high_concurrency_stress() {
     .await
     .expect("High-load stress test should complete within 60s");
+    let expected_successes = 100 * formats.len();
     let success_count = results.iter().filter(|r| r.is_ok()).count();
     assert_eq!(
-        success_count, 400,
+        success_count, expected_successes,
         "All extractions should succeed under stress, got {} successes",
         success_count
     );

data/vendor/kreuzberg/tests/config_features.rs CHANGED Viewed

@@ -3,13 +3,19 @@
 //! Tests for chunking, language detection, caching, token reduction, and quality processing.
 //! Validates that configuration options work correctly end-to-end.
-use kreuzberg::core::config::{ChunkingConfig, ExtractionConfig, LanguageDetectionConfig, TokenReductionConfig};
+#[cfg(feature = "chunking")]
+use kreuzberg::core::config::ChunkingConfig;
+use kreuzberg::core::config::ExtractionConfig;
+#[cfg(feature = "language-detection")]
+use kreuzberg::core::config::LanguageDetectionConfig;
+use kreuzberg::core::config::TokenReductionConfig;
 use kreuzberg::core::extractor::extract_bytes;
 mod helpers;
 /// Test chunking enabled - text split into chunks.
 #[tokio::test]
+#[cfg(feature = "chunking")]
 async fn test_chunking_enabled() {
     let config = ExtractionConfig {
         chunking: Some(ChunkingConfig {
@@ -52,6 +58,7 @@ async fn test_chunking_enabled() {
 /// Test chunking with overlap - overlap preserved between chunks.
 #[tokio::test]
+#[cfg(feature = "chunking")]
 async fn test_chunking_with_overlap() {
     let config = ExtractionConfig {
         chunking: Some(ChunkingConfig {
@@ -91,6 +98,7 @@ async fn test_chunking_with_overlap() {
 /// Test chunking with custom sizes - custom chunk size and overlap.
 #[tokio::test]
+#[cfg(feature = "chunking")]
 async fn test_chunking_custom_sizes() {
     let config = ExtractionConfig {
         chunking: Some(ChunkingConfig {
@@ -151,6 +159,7 @@ async fn test_chunking_disabled() {
 /// Test language detection for single language document.
 #[tokio::test]
+#[cfg(feature = "language-detection")]
 async fn test_language_detection_single() {
     let config = ExtractionConfig {
         language_detection: Some(LanguageDetectionConfig {
@@ -177,6 +186,7 @@ async fn test_language_detection_single() {
 /// Test language detection for multi-language document.
 #[cfg_attr(coverage, ignore = "coverage instrumentation affects multi-language heuristics")]
 #[tokio::test]
+#[cfg(feature = "language-detection")]
 async fn test_language_detection_multiple() {
     let config = ExtractionConfig {
         language_detection: Some(LanguageDetectionConfig {
@@ -201,6 +211,7 @@ async fn test_language_detection_multiple() {
 /// Test language detection with confidence threshold.
 #[tokio::test]
+#[cfg(feature = "language-detection")]
 async fn test_language_detection_confidence() {
     let config = ExtractionConfig {
         language_detection: Some(LanguageDetectionConfig {
@@ -225,6 +236,7 @@ async fn test_language_detection_confidence() {
 /// Test language detection disabled.
 #[tokio::test]
+#[cfg(feature = "language-detection")]
 async fn test_language_detection_disabled() {
     let config = ExtractionConfig {
         language_detection: Some(LanguageDetectionConfig {
@@ -397,6 +409,7 @@ async fn test_token_reduction_disabled() {
 /// Test quality processing enabled - quality scoring applied.
 #[tokio::test]
+#[cfg(feature = "quality")]
 async fn test_quality_processing_enabled() {
     let config = ExtractionConfig {
         enable_quality_processing: true,
@@ -420,6 +433,7 @@ async fn test_quality_processing_enabled() {
 /// Test quality processing calculates score for different text quality.
 #[tokio::test]
+#[cfg(feature = "quality")]
 async fn test_quality_threshold_filtering() {
     let config = ExtractionConfig {
         enable_quality_processing: true,

data/vendor/kreuzberg/tests/config_loading_tests.rs CHANGED Viewed

@@ -389,6 +389,7 @@ extract_images = true
         "Should have language detection config"
     );
     assert!(config.images.is_some(), "Should have image extraction config");
+    #[cfg(feature = "pdf")]
     assert!(config.pdf_options.is_some(), "Should have PDF config");
 }