RubyGems - kreuzberg - Versions diffs - 4.2.6 → 4.2.7 - Mend

kreuzberg 4.2.6 → 4.2.7

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Files changed (139)

checksums.yaml +4 -4
data/Gemfile.lock +7 -4
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
data/ext/kreuzberg_rb/native/src/result.rs +5 -3
data/lib/kreuzberg/version.rb +1 -1
data/sig/kreuzberg.rbs +228 -37
data/spec/binding/batch_operations_spec.rb +2 -0
data/vendor/Cargo.toml +3 -2
data/vendor/kreuzberg/Cargo.toml +2 -1
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/api/error.rs +29 -1
data/vendor/kreuzberg/src/api/handlers.rs +28 -25
data/vendor/kreuzberg/src/api/openapi.rs +14 -1
data/vendor/kreuzberg/src/chunking/config.rs +2 -37
data/vendor/kreuzberg/src/chunking/core.rs +78 -2
data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
data/vendor/kreuzberg/src/extraction/email.rs +31 -19
data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
data/vendor/kreuzberg/src/extractors/email.rs +5 -3
data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
data/vendor/kreuzberg/src/extractors/html.rs +1 -1
data/vendor/kreuzberg/src/extractors/image.rs +3 -3
data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
data/vendor/kreuzberg/src/extractors/text.rs +2 -2
data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
data/vendor/kreuzberg/src/lib.rs +1 -1
data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
data/vendor/kreuzberg/src/mcp/format.rs +5 -4
data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
data/vendor/kreuzberg/src/ocr/types.rs +3 -4
data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
data/vendor/kreuzberg/src/text/quality.rs +13 -13
data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
data/vendor/kreuzberg/src/types/djot.rs +15 -4
data/vendor/kreuzberg/src/types/extraction.rs +24 -4
data/vendor/kreuzberg/src/types/formats.rs +9 -5
data/vendor/kreuzberg/src/types/metadata.rs +68 -7
data/vendor/kreuzberg/src/types/mod.rs +7 -5
data/vendor/kreuzberg/src/types/page.rs +9 -0
data/vendor/kreuzberg/src/types/tables.rs +2 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
data/vendor/kreuzberg/tests/config_features.rs +19 -11
data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
data/vendor/kreuzberg/tests/core_integration.rs +5 -6
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
data/vendor/kreuzberg-ffi/src/error.rs +56 -0
data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
data/vendor/kreuzberg-ffi/src/result.rs +2 -1
data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +2 -2

data/vendor/kreuzberg/src/core/pipeline/tests.rs CHANGED Viewed

@@ -4,6 +4,7 @@ use super::*;
 use crate::core::config::OutputFormat;
 use crate::types::Metadata;
 use lazy_static::lazy_static;
+use std::borrow::Cow;
 const VALIDATION_MARKER_KEY: &str = "registry_validation_marker";
 #[cfg(feature = "quality")]
@@ -19,7 +20,7 @@ lazy_static! {
 async fn test_run_pipeline_basic() {
     let mut result = ExtractionResult {
         content: "test".to_string(),
-        mime_type: "text/plain".to_string(),
+        mime_type: Cow::Borrowed("text/plain"),
         metadata: Metadata::default(),
         tables: vec![],
         detected_languages: None,
@@ -30,7 +31,7 @@ async fn test_run_pipeline_basic() {
         elements: None,
     };
     result.metadata.additional.insert(
-        VALIDATION_MARKER_KEY.to_string(),
+        Cow::Borrowed(VALIDATION_MARKER_KEY),
         serde_json::json!(ORDER_VALIDATION_MARKER),
     );
     let config = ExtractionConfig::default();
@@ -44,7 +45,7 @@ async fn test_run_pipeline_basic() {
 async fn test_pipeline_with_quality_processing() {
     let result = ExtractionResult {
         content: "This is a test document with some meaningful content.".to_string(),
-        mime_type: "text/plain".to_string(),
+        mime_type: Cow::Borrowed("text/plain"),
         metadata: Metadata::default(),
         tables: vec![],
         detected_languages: None,
@@ -67,7 +68,7 @@ async fn test_pipeline_with_quality_processing() {
 async fn test_pipeline_without_quality_processing() {
     let result = ExtractionResult {
         content: "test".to_string(),
-        mime_type: "text/plain".to_string(),
+        mime_type: Cow::Borrowed("text/plain"),
         metadata: Metadata::default(),
         tables: vec![],
         detected_languages: None,
@@ -91,7 +92,7 @@ async fn test_pipeline_without_quality_processing() {
 async fn test_pipeline_with_chunking() {
     let result = ExtractionResult {
         content: "This is a long text that should be chunked. ".repeat(100),
-        mime_type: "text/plain".to_string(),
+        mime_type: Cow::Borrowed("text/plain"),
         metadata: Metadata::default(),
         tables: vec![],
         detected_languages: None,
@@ -103,8 +104,10 @@ async fn test_pipeline_with_chunking() {
     };
     let config = ExtractionConfig {
         chunking: Some(crate::ChunkingConfig {
-            max_chars: 500,
-            max_overlap: 50,
+            max_characters: 500,
+            overlap: 50,
+            trim: true,
+            chunker_type: crate::ChunkerType::Text,
             embedding: None,
             preset: None,
         }),
@@ -121,7 +124,7 @@ async fn test_pipeline_with_chunking() {
 async fn test_pipeline_without_chunking() {
     let result = ExtractionResult {
         content: "test".to_string(),
-        mime_type: "text/plain".to_string(),
+        mime_type: Cow::Borrowed("text/plain"),
         metadata: Metadata::default(),
         tables: vec![],
         detected_languages: None,
@@ -142,14 +145,14 @@ async fn test_pipeline_without_chunking() {
 #[tokio::test]
 async fn test_pipeline_preserves_metadata() {
-    use std::collections::HashMap;
-    let mut additional = HashMap::new();
-    additional.insert("source".to_string(), serde_json::json!("test"));
-    additional.insert("page".to_string(), serde_json::json!(1));
+    use ahash::AHashMap;
+    let mut additional = AHashMap::new();
+    additional.insert(Cow::Borrowed("source"), serde_json::json!("test"));
+    additional.insert(Cow::Borrowed("page"), serde_json::json!(1));
     let result = ExtractionResult {
         content: "test".to_string(),
-        mime_type: "text/plain".to_string(),
+        mime_type: Cow::Borrowed("text/plain"),
         metadata: Metadata {
             additional,
             ..Default::default()
@@ -187,7 +190,7 @@ async fn test_pipeline_preserves_tables() {
     let result = ExtractionResult {
         content: "test".to_string(),
-        mime_type: "text/plain".to_string(),
+        mime_type: Cow::Borrowed("text/plain"),
         metadata: Metadata::default(),
         tables: vec![table],
         detected_languages: None,
@@ -219,7 +222,7 @@ async fn test_pipeline_empty_content() {
     let result = ExtractionResult {
         content: String::new(),
-        mime_type: "text/plain".to_string(),
+        mime_type: Cow::Borrowed("text/plain"),
         metadata: Metadata::default(),
         tables: vec![],
         detected_languages: None,
@@ -242,7 +245,7 @@ async fn test_pipeline_empty_content() {
 async fn test_pipeline_with_all_features() {
     let result = ExtractionResult {
         content: "This is a comprehensive test document. ".repeat(50),
-        mime_type: "text/plain".to_string(),
+        mime_type: Cow::Borrowed("text/plain"),
         metadata: Metadata::default(),
         tables: vec![],
         detected_languages: None,
@@ -255,8 +258,10 @@ async fn test_pipeline_with_all_features() {
     let config = ExtractionConfig {
         enable_quality_processing: true,
         chunking: Some(crate::ChunkingConfig {
-            max_chars: 500,
-            max_overlap: 50,
+            max_characters: 500,
+            overlap: 50,
+            trim: true,
+            chunker_type: crate::ChunkerType::Text,
             embedding: None,
             preset: None,
         }),
@@ -295,7 +300,7 @@ machine learning that uses neural networks with multiple layers.
 Natural language processing enables computers to understand human language.
             "#
         .to_string(),
-        mime_type: "text/plain".to_string(),
+        mime_type: Cow::Borrowed("text/plain"),
         metadata: Metadata::default(),
         tables: vec![],
         detected_languages: None,
@@ -342,7 +347,7 @@ async fn test_pipeline_without_keyword_config() {
     }
     let result = ExtractionResult {
         content: "Machine learning and artificial intelligence.".to_string(),
-        mime_type: "text/plain".to_string(),
+        mime_type: Cow::Borrowed("text/plain"),
         metadata: Metadata::default(),
         tables: vec![],
         detected_languages: None,
@@ -380,7 +385,7 @@ async fn test_pipeline_keyword_extraction_short_content() {
     let result = ExtractionResult {
         content: "Short text".to_string(),
-        mime_type: "text/plain".to_string(),
+        mime_type: Cow::Borrowed("text/plain"),
         metadata: Metadata::default(),
         tables: vec![],
         detected_languages: None,
@@ -437,7 +442,7 @@ async fn test_postprocessor_runs_before_validator() {
             result
                 .metadata
                 .additional
-                .insert("processed".to_string(), serde_json::json!(true));
+                .insert(Cow::Borrowed("processed"), serde_json::json!(true));
             Ok(())
         }
@@ -517,7 +522,7 @@ async fn test_postprocessor_runs_before_validator() {
     let mut result = ExtractionResult {
         content: "test".to_string(),
-        mime_type: "text/plain".to_string(),
+        mime_type: Cow::Borrowed("text/plain"),
         metadata: Metadata::default(),
         tables: vec![],
         detected_languages: None,
@@ -528,7 +533,7 @@ async fn test_postprocessor_runs_before_validator() {
         elements: None,
     };
     result.metadata.additional.insert(
-        VALIDATION_MARKER_KEY.to_string(),
+        Cow::Borrowed(VALIDATION_MARKER_KEY),
         serde_json::json!(POSTPROCESSOR_VALIDATION_MARKER),
     );
@@ -614,7 +619,7 @@ async fn test_quality_processing_runs_before_validator() {
     let mut result = ExtractionResult {
         content: "This is meaningful test content for quality scoring.".to_string(),
-        mime_type: "text/plain".to_string(),
+        mime_type: Cow::Borrowed("text/plain"),
         metadata: Metadata::default(),
         tables: vec![],
         detected_languages: None,
@@ -625,7 +630,7 @@ async fn test_quality_processing_runs_before_validator() {
         elements: None,
     };
     result.metadata.additional.insert(
-        VALIDATION_MARKER_KEY.to_string(),
+        Cow::Borrowed(VALIDATION_MARKER_KEY),
         serde_json::json!(QUALITY_VALIDATION_MARKER),
     );
@@ -682,7 +687,7 @@ async fn test_multiple_postprocessors_run_before_validator() {
             result
                 .metadata
                 .additional
-                .insert("execution_order".to_string(), serde_json::json!(order));
+                .insert(Cow::Borrowed("execution_order"), serde_json::json!(order));
             Ok(())
         }
@@ -721,7 +726,7 @@ async fn test_multiple_postprocessors_run_before_validator() {
             result
                 .metadata
                 .additional
-                .insert("execution_order".to_string(), serde_json::json!(order));
+                .insert(Cow::Borrowed("execution_order"), serde_json::json!(order));
             Ok(())
         }
@@ -812,7 +817,7 @@ async fn test_multiple_postprocessors_run_before_validator() {
     let result = ExtractionResult {
         content: "test".to_string(),
-        mime_type: "text/plain".to_string(),
+        mime_type: Cow::Borrowed("text/plain"),
         metadata: Metadata::default(),
         tables: vec![],
         detected_languages: None,
@@ -839,7 +844,7 @@ async fn test_multiple_postprocessors_run_before_validator() {
 async fn test_run_pipeline_with_output_format_plain() {
     let result = ExtractionResult {
         content: "test content".to_string(),
-        mime_type: "text/plain".to_string(),
+        mime_type: Cow::Borrowed("text/plain"),
         metadata: Metadata::default(),
         tables: vec![],
         detected_languages: None,
@@ -865,7 +870,7 @@ async fn test_run_pipeline_with_output_format_djot() {
     let result = ExtractionResult {
         content: "test content".to_string(),
-        mime_type: "text/djot".to_string(),
+        mime_type: Cow::Borrowed("text/djot"),
         metadata: Metadata::default(),
         tables: vec![],
         detected_languages: None,
@@ -894,7 +899,7 @@ async fn test_run_pipeline_with_output_format_djot() {
             images: vec![],
             links: vec![],
             footnotes: vec![],
-            attributes: std::collections::HashMap::new(),
+            attributes: Vec::new(),
         }),
     };
@@ -912,7 +917,7 @@ async fn test_run_pipeline_with_output_format_djot() {
 async fn test_run_pipeline_with_output_format_html() {
     let result = ExtractionResult {
         content: "test content".to_string(),
-        mime_type: "text/plain".to_string(),
+        mime_type: Cow::Borrowed("text/plain"),
         metadata: Metadata::default(),
         tables: vec![],
         detected_languages: None,
@@ -942,7 +947,7 @@ async fn test_run_pipeline_applies_output_format_last() {
     let result = ExtractionResult {
         content: "test".to_string(),
-        mime_type: "text/plain".to_string(),
+        mime_type: Cow::Borrowed("text/plain"),
         metadata: Metadata::default(),
         tables: vec![],
         detected_languages: None,
@@ -958,7 +963,7 @@ async fn test_run_pipeline_applies_output_format_last() {
             images: vec![],
             links: vec![],
             footnotes: vec![],
-            attributes: std::collections::HashMap::new(),
+            attributes: Vec::new(),
         }),
     };

data/vendor/kreuzberg/src/extraction/email.rs CHANGED Viewed

@@ -24,6 +24,8 @@
 //! # Ok(())
 //! # }
 //! ```
+use bytes::Bytes;
 use crate::error::{KreuzbergError, Result};
 use crate::types::{EmailAttachment, EmailExtractionResult};
 use mail_parser::MimeHeaders;
@@ -101,7 +103,7 @@ pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
     let html_content = message.body_html(0).map(|s| s.to_string());
-    let cleaned_text = if let Some(plain) = &plain_text {
+    let cleaned_text = if let Some(ref plain) = plain_text {
         plain.clone()
     } else if let Some(html) = &html_content {
         clean_html_content(html)
@@ -132,7 +134,7 @@ pub fn parse_eml_content(data: &[u8]) -> Result<EmailExtractionResult> {
             mime_type: Some(mime_type),
             size: Some(size),
             is_image,
-            data: Some(data.to_vec()),
+            data: Some(Bytes::copy_from_slice(data)),
         });
     }
@@ -174,39 +176,49 @@ pub fn parse_msg_content(data: &[u8]) -> Result<EmailExtractionResult> {
     let to_emails = outlook
         .to
         .iter()
-        .map(|p| p.email.clone())
-        .filter(|e| !e.is_empty())
+        .filter_map(|p| {
+            if p.email.is_empty() {
+                None
+            } else {
+                Some(p.email.clone())
+            }
+        })
         .collect::<Vec<String>>();
     let cc_emails = outlook
         .cc
         .iter()
-        .map(|p| p.email.clone())
-        .filter(|e| !e.is_empty())
+        .filter_map(|p| {
+            if p.email.is_empty() {
+                None
+            } else {
+                Some(p.email.clone())
+            }
+        })
         .collect::<Vec<String>>();
-    let bcc_emails = if !outlook.bcc.is_empty() {
-        vec![outlook.bcc.clone()]
-    } else {
+    let bcc_emails = if outlook.bcc.is_empty() {
         vec![]
+    } else {
+        vec![outlook.bcc.clone()]
     };
-    let date = if !outlook.headers.date.is_empty() {
-        Some(outlook.headers.date.clone())
-    } else {
+    let date = if outlook.headers.date.is_empty() {
         None
+    } else {
+        Some(outlook.headers.date.clone())
     };
-    let message_id = if !outlook.headers.message_id.is_empty() {
-        Some(outlook.headers.message_id.clone())
-    } else {
+    let message_id = if outlook.headers.message_id.is_empty() {
         None
+    } else {
+        Some(outlook.headers.message_id.clone())
     };
-    let plain_text = if !outlook.body.is_empty() {
-        Some(outlook.body.clone())
-    } else {
+    let plain_text = if outlook.body.is_empty() {
         None
+    } else {
+        Some(outlook.body.clone())
     };
     let html_content = None;
@@ -231,7 +243,7 @@ pub fn parse_msg_content(data: &[u8]) -> Result<EmailExtractionResult> {
             };
             let data = if !att.payload.is_empty() {
-                hex::decode(&att.payload).ok()
+                hex::decode(&att.payload).ok().map(Bytes::from)
             } else {
                 None
             };

data/vendor/kreuzberg/src/extraction/excel.rs CHANGED Viewed

@@ -448,13 +448,13 @@ fn generate_markdown_and_cells(sheet_name: &str, range: &Range<Data>, capacity:
             markdown.push_str(" | ");
         }
         let cell_str = format_cell_to_string(cell);
-        header_cells.push(cell_str.clone());
         if cell_str.contains('|') || cell_str.contains('\\') {
             escape_markdown_into(&mut markdown, &cell_str);
         } else {
             markdown.push_str(&cell_str);
         }
+        header_cells.push(cell_str);
     }
     markdown.push_str(" |\n");
     cells.push(header_cells);
@@ -475,18 +475,19 @@ fn generate_markdown_and_cells(sheet_name: &str, range: &Range<Data>, capacity:
             if i > 0 {
                 markdown.push_str(" | ");
             }
-            if let Some(cell) = row.get(i) {
+            let cell_str = if let Some(cell) = row.get(i) {
                 let cell_str = format_cell_to_string(cell);
-                row_cells.push(cell_str.clone());
                 if cell_str.contains('|') || cell_str.contains('\\') {
                     escape_markdown_into(&mut markdown, &cell_str);
                 } else {
                     markdown.push_str(&cell_str);
                 }
+                cell_str
             } else {
-                row_cells.push(String::new());
-            }
+                String::new()
+            };
+            row_cells.push(cell_str);
         }
         markdown.push_str(" |\n");
         cells.push(row_cells);

data/vendor/kreuzberg/src/extraction/html/image_handling.rs CHANGED Viewed

@@ -1,5 +1,7 @@
 //! Image handling and conversion functionality for HTML extraction.
+use bytes::Bytes;
 use super::types::ExtractedInlineImage;
 use html_to_markdown_rs::{InlineImage, InlineImageFormat};
@@ -49,13 +51,16 @@ pub fn inline_image_format_to_str(format: &InlineImageFormat) -> String {
     }
 }
+// Note: This function returns String because ExtractedInlineImage.format is String (internal to HTML extraction).
+// For external ExtractedImage, use detect_image_format from pptx which returns Cow<'static, str>.
 /// Convert a library InlineImage to an ExtractedInlineImage.
 ///
 /// Maps the library's image representation to the extraction API's format,
 /// converting the format enum to a string representation.
 pub fn inline_image_to_extracted(image: InlineImage) -> ExtractedInlineImage {
     ExtractedInlineImage {
-        data: image.data,
+        data: Bytes::from(image.data),
         format: inline_image_format_to_str(&image.format),
         filename: image.filename,
         description: image.description,

data/vendor/kreuzberg/src/extraction/html/types.rs CHANGED Viewed

@@ -1,7 +1,7 @@
 //! Type definitions for HTML extraction.
+use bytes::Bytes;
 use serde::{Deserialize, Serialize};
-use std::collections::HashMap;
 pub use html_to_markdown_rs::{
     CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, PreprocessingOptions,
@@ -19,10 +19,11 @@ pub struct HtmlExtractionResult {
 /// Extracted inline image with metadata.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ExtractedInlineImage {
-    pub data: Vec<u8>,
+    /// Uses `bytes::Bytes` for cheap cloning of large buffers.
+    pub data: Bytes,
     pub format: String,
     pub filename: Option<String>,
     pub description: Option<String>,
     pub dimensions: Option<(u32, u32)>,
-    pub attributes: HashMap<String, String>,
+    pub attributes: Vec<(String, String)>,
 }

data/vendor/kreuzberg/src/extraction/libreoffice.rs CHANGED Viewed

@@ -45,6 +45,7 @@
 use crate::error::{KreuzbergError, Result};
 use crate::types::LibreOfficeConversionResult;
+use std::borrow::Cow;
 use std::collections::HashSet;
 use std::env;
 use std::fs as std_fs;
@@ -326,9 +327,9 @@ pub async fn convert_doc_to_docx(doc_bytes: &[u8]) -> Result<LibreOfficeConversi
     Ok(LibreOfficeConversionResult {
         converted_bytes,
-        original_format: "doc".to_string(),
-        target_format: "docx".to_string(),
-        target_mime: crate::core::mime::DOCX_MIME_TYPE.to_string(),
+        original_format: Cow::Borrowed("doc"),
+        target_format: Cow::Borrowed("docx"),
+        target_mime: Cow::Borrowed(crate::core::mime::DOCX_MIME_TYPE),
     })
 }
@@ -350,9 +351,9 @@ pub async fn convert_ppt_to_pptx(ppt_bytes: &[u8]) -> Result<LibreOfficeConversi
     Ok(LibreOfficeConversionResult {
         converted_bytes,
-        original_format: "ppt".to_string(),
-        target_format: "pptx".to_string(),
-        target_mime: crate::core::mime::POWER_POINT_MIME_TYPE.to_string(),
+        original_format: Cow::Borrowed("ppt"),
+        target_format: Cow::Borrowed("pptx"),
+        target_mime: Cow::Borrowed(crate::core::mime::POWER_POINT_MIME_TYPE),
     })
 }
@@ -505,9 +506,9 @@ mod tests {
     async fn test_conversion_result_structure() {
         let result = LibreOfficeConversionResult {
             converted_bytes: vec![1, 2, 3],
-            original_format: "doc".to_string(),
-            target_format: "docx".to_string(),
-            target_mime: crate::core::mime::DOCX_MIME_TYPE.to_string(),
+            original_format: Cow::Borrowed("doc"),
+            target_format: Cow::Borrowed("docx"),
+            target_mime: Cow::Borrowed(crate::core::mime::DOCX_MIME_TYPE),
         };
         assert_eq!(result.original_format, "doc");

data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs CHANGED Viewed

@@ -3,6 +3,8 @@
 //! This module handles image-related parsing from slide XML and
 //! detection of image formats from file data.
+use std::borrow::Cow;
 pub(super) fn html_escape(text: &str) -> String {
     text.replace('&', "&amp;")
         .replace('<', "&lt;")
@@ -11,21 +13,21 @@ pub(super) fn html_escape(text: &str) -> String {
         .replace('\'', "&#x27;")
 }
-pub(super) fn detect_image_format(data: &[u8]) -> String {
+pub(super) fn detect_image_format(data: &[u8]) -> Cow<'static, str> {
     if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
-        "jpeg".to_string()
+        Cow::Borrowed("jpeg")
     } else if data.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
-        "png".to_string()
+        Cow::Borrowed("png")
     } else if data.starts_with(b"GIF") {
-        "gif".to_string()
+        Cow::Borrowed("gif")
     } else if data.starts_with(b"BM") {
-        "bmp".to_string()
+        Cow::Borrowed("bmp")
     } else if data.starts_with(b"<svg") || data.starts_with(b"<?xml") {
-        "svg".to_string()
+        Cow::Borrowed("svg")
     } else if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
-        "tiff".to_string()
+        Cow::Borrowed("tiff")
     } else {
-        "unknown".to_string()
+        Cow::Borrowed("unknown")
     }
 }

data/vendor/kreuzberg/src/extraction/pptx/mod.rs CHANGED Viewed

@@ -45,6 +45,8 @@ mod image_handling;
 mod metadata;
 mod parser;
+use bytes::Bytes;
 use crate::error::Result;
 use crate::types::{ExtractedImage, PptxExtractionResult};
@@ -117,8 +119,8 @@ pub fn extract_pptx_from_path(
                 let image_index = extracted_images.len();
                 extracted_images.push(ExtractedImage {
-                    data,
-                    format,
+                    data: Bytes::from(data),
+                    format, // Already a Cow<'static, str> from detect_image_format
                     image_index,
                     page_number: Some(slide.slide_number as usize),
                     width: None,
@@ -333,11 +335,13 @@ mod tests {
 <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">"#,
             );
             for (i, _) in slides.iter().enumerate() {
-                rels_xml.push_str(&format!(
+                use std::fmt::Write;
+                let _ = write!(
+                    rels_xml,
                     r#"<Relationship Id="rId{}" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide{}.xml"/>"#,
                     i + 1,
                     i + 1
-                ));
+                );
             }
             rels_xml.push_str("</Relationships>");
             zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();

data/vendor/kreuzberg/src/extraction/structured.rs CHANGED Viewed

@@ -33,12 +33,13 @@
 use crate::error::{KreuzbergError, Result};
 use crate::text::utf8_validation;
 use serde::{Deserialize, Serialize};
+use std::borrow::Cow;
 use std::collections::HashMap;
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct StructuredDataResult {
     pub content: String,
-    pub format: String,
+    pub format: Cow<'static, str>,
     pub metadata: HashMap<String, String>,
     pub text_fields: Vec<String>,
 }
@@ -97,7 +98,7 @@ pub fn parse_json(data: &[u8], config: Option<JsonExtractionConfig>) -> Result<S
     Ok(StructuredDataResult {
         content,
-        format: "json".to_string(),
+        format: Cow::Borrowed("json"),
         metadata,
         text_fields,
     })
@@ -254,7 +255,7 @@ pub fn parse_yaml(data: &[u8]) -> Result<StructuredDataResult> {
     Ok(StructuredDataResult {
         content,
-        format: "yaml".to_string(),
+        format: Cow::Borrowed("yaml"),
         metadata,
         text_fields,
     })
@@ -326,7 +327,7 @@ pub fn parse_toml(data: &[u8]) -> Result<StructuredDataResult> {
     Ok(StructuredDataResult {
         content,
-        format: "toml".to_string(),
+        format: Cow::Borrowed("toml"),
         metadata,
         text_fields,
     })

data/vendor/kreuzberg/src/extraction/transform/content.rs CHANGED Viewed

@@ -167,7 +167,7 @@ pub(super) fn process_images(
                 element_index: Some(elements.len()),
                 additional: {
                     let mut m = HashMap::new();
-                    m.insert("format".to_string(), image.format.clone());
+                    m.insert("format".to_string(), image.format.to_string());
                     if let Some(width) = image.width {
                         m.insert("width".to_string(), width.to_string());
                     }