RubyGems - kreuzberg - Versions diffs - 4.2.6 → 4.2.7 - Mend

kreuzberg 4.2.6 → 4.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (139)

checksums.yaml +4 -4
data/Gemfile.lock +7 -4
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
data/ext/kreuzberg_rb/native/src/result.rs +5 -3
data/lib/kreuzberg/version.rb +1 -1
data/sig/kreuzberg.rbs +228 -37
data/spec/binding/batch_operations_spec.rb +2 -0
data/vendor/Cargo.toml +3 -2
data/vendor/kreuzberg/Cargo.toml +2 -1
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/api/error.rs +29 -1
data/vendor/kreuzberg/src/api/handlers.rs +28 -25
data/vendor/kreuzberg/src/api/openapi.rs +14 -1
data/vendor/kreuzberg/src/chunking/config.rs +2 -37
data/vendor/kreuzberg/src/chunking/core.rs +78 -2
data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
data/vendor/kreuzberg/src/extraction/email.rs +31 -19
data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
data/vendor/kreuzberg/src/extractors/email.rs +5 -3
data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
data/vendor/kreuzberg/src/extractors/html.rs +1 -1
data/vendor/kreuzberg/src/extractors/image.rs +3 -3
data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
data/vendor/kreuzberg/src/extractors/text.rs +2 -2
data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
data/vendor/kreuzberg/src/lib.rs +1 -1
data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
data/vendor/kreuzberg/src/mcp/format.rs +5 -4
data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
data/vendor/kreuzberg/src/ocr/types.rs +3 -4
data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
data/vendor/kreuzberg/src/text/quality.rs +13 -13
data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
data/vendor/kreuzberg/src/types/djot.rs +15 -4
data/vendor/kreuzberg/src/types/extraction.rs +24 -4
data/vendor/kreuzberg/src/types/formats.rs +9 -5
data/vendor/kreuzberg/src/types/metadata.rs +68 -7
data/vendor/kreuzberg/src/types/mod.rs +7 -5
data/vendor/kreuzberg/src/types/page.rs +9 -0
data/vendor/kreuzberg/src/types/tables.rs +2 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
data/vendor/kreuzberg/tests/config_features.rs +19 -11
data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
data/vendor/kreuzberg/tests/core_integration.rs +5 -6
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
data/vendor/kreuzberg-ffi/src/error.rs +56 -0
data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
data/vendor/kreuzberg-ffi/src/result.rs +2 -1
data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +2 -2

data/vendor/kreuzberg/src/core/config/extraction/loaders.rs CHANGED Viewed

@@ -44,10 +44,10 @@ impl ExtractionConfig {
         let config: Self = toml::from_str(&content)
             .map_err(|e| KreuzbergError::validation(format!("Invalid TOML in {}: {}", path.display(), e)))?;
-        let config_arc = Arc::new(config.clone());
-        CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
+        let config_arc = Arc::new(config);
+        CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc.clone()));
-        Ok(config)
+        Ok((*config_arc).clone())
     }
     /// Load configuration from a YAML file.
@@ -72,10 +72,10 @@ impl ExtractionConfig {
         let config: Self = serde_yaml_ng::from_str(&content)
             .map_err(|e| KreuzbergError::validation(format!("Invalid YAML in {}: {}", path.display(), e)))?;
-        let config_arc = Arc::new(config.clone());
-        CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
+        let config_arc = Arc::new(config);
+        CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc.clone()));
-        Ok(config)
+        Ok((*config_arc).clone())
     }
     /// Load configuration from a JSON file.
@@ -100,10 +100,10 @@ impl ExtractionConfig {
         let config: Self = serde_json::from_str(&content)
             .map_err(|e| KreuzbergError::validation(format!("Invalid JSON in {}: {}", path.display(), e)))?;
-        let config_arc = Arc::new(config.clone());
-        CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
+        let config_arc = Arc::new(config);
+        CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc.clone()));
-        Ok(config)
+        Ok((*config_arc).clone())
     }
     /// Load configuration from a file, auto-detecting format by extension.
@@ -169,10 +169,10 @@ impl ExtractionConfig {
             }
         };
-        let config_arc = Arc::new(config.clone());
-        CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
+        let config_arc = Arc::new(config);
+        CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc.clone()));
-        Ok(config)
+        Ok((*config_arc).clone())
     }
     /// Discover configuration file in parent directories.

data/vendor/kreuzberg/src/core/config/mod.rs CHANGED Viewed

@@ -17,4 +17,4 @@ pub use ocr::OcrConfig;
 pub use page::PageConfig;
 #[cfg(feature = "pdf")]
 pub use pdf::{HierarchyConfig, PdfConfig};
-pub use processing::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, PostProcessorConfig};
+pub use processing::{ChunkerType, ChunkingConfig, EmbeddingConfig, EmbeddingModelType, PostProcessorConfig};

data/vendor/kreuzberg/src/core/config/processing.rs CHANGED Viewed

@@ -7,6 +7,19 @@ use serde::{Deserialize, Serialize};
 use std::collections::HashSet;
 use std::path::PathBuf;
+/// Type of text chunker to use.
+///
+/// # Variants
+///
+/// * `Text` - Generic text splitter, splits on whitespace and punctuation
+/// * `Markdown` - Markdown-aware splitter, preserves formatting and structure
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
+pub enum ChunkerType {
+    #[default]
+    Text,
+    Markdown,
+}
 /// Post-processor configuration.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct PostProcessorConfig {
@@ -59,15 +72,34 @@ impl Default for PostProcessorConfig {
 }
 /// Chunking configuration.
+///
+/// Configures text chunking for document content, including chunk size,
+/// overlap, trimming behavior, and optional embeddings.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ChunkingConfig {
     /// Maximum characters per chunk
-    #[serde(default = "default_chunk_size")]
-    pub max_chars: usize,
+    ///
+    /// Default: 1000
+    #[serde(default = "default_chunk_size", rename = "max_chars", alias = "max_characters")]
+    pub max_characters: usize,
     /// Overlap between chunks in characters
-    #[serde(default = "default_chunk_overlap")]
-    pub max_overlap: usize,
+    ///
+    /// Default: 200
+    #[serde(default = "default_chunk_overlap", rename = "max_overlap", alias = "overlap")]
+    pub overlap: usize,
+    /// Whether to trim whitespace from chunk boundaries
+    ///
+    /// Default: true
+    #[serde(default = "default_trim")]
+    pub trim: bool,
+    /// Type of chunker to use (Text or Markdown)
+    ///
+    /// Default: Text
+    #[serde(default = "default_chunker_type")]
+    pub chunker_type: ChunkerType,
     /// Optional embedding configuration for chunk embeddings
     #[serde(skip_serializing_if = "Option::is_none")]
@@ -78,6 +110,19 @@ pub struct ChunkingConfig {
     pub preset: Option<String>,
 }
+impl Default for ChunkingConfig {
+    fn default() -> Self {
+        Self {
+            max_characters: 1000,
+            overlap: 200,
+            trim: true,
+            chunker_type: ChunkerType::Text,
+            embedding: None,
+            preset: None,
+        }
+    }
+}
 /// Embedding configuration for text chunks.
 ///
 /// Configures embedding generation using ONNX models via fastembed-rs.
@@ -149,6 +194,14 @@ fn default_chunk_overlap() -> usize {
     200
 }
+fn default_trim() -> bool {
+    true
+}
+fn default_chunker_type() -> ChunkerType {
+    ChunkerType::Text
+}
 fn default_normalize() -> bool {
     true
 }
@@ -196,13 +249,17 @@ mod tests {
     #[test]
     fn test_chunking_config_defaults() {
         let config = ChunkingConfig {
-            max_chars: 1000,
-            max_overlap: 200,
+            max_characters: 1000,
+            overlap: 200,
+            trim: true,
+            chunker_type: ChunkerType::Text,
             embedding: None,
             preset: None,
         };
-        assert_eq!(config.max_chars, 1000);
-        assert_eq!(config.max_overlap, 200);
+        assert_eq!(config.max_characters, 1000);
+        assert_eq!(config.overlap, 200);
+        assert!(config.trim);
+        assert_eq!(config.chunker_type, ChunkerType::Text);
     }
     #[test]

data/vendor/kreuzberg/src/core/config_validation/mod.rs CHANGED Viewed

@@ -141,6 +141,14 @@ mod tests {
         assert!(validate_language_code("DEU").is_ok());
     }
+    #[test]
+    fn test_validate_language_code_all_keyword() {
+        assert!(validate_language_code("all").is_ok());
+        assert!(validate_language_code("ALL").is_ok());
+        assert!(validate_language_code("All").is_ok());
+        assert!(validate_language_code("*").is_ok());
+    }
     #[test]
     fn test_validate_language_code_invalid() {
         let result = validate_language_code("invalid");

data/vendor/kreuzberg/src/core/config_validation/sections.rs CHANGED Viewed

@@ -167,6 +167,11 @@ pub fn validate_ocr_backend(backend: &str) -> Result<()> {
 pub fn validate_language_code(code: &str) -> Result<()> {
     let code_lower = code.to_lowercase();
+    // Accept "all" and "*" as special values to auto-detect installed languages
+    if code_lower == "all" || code_lower == "*" {
+        return Ok(());
+    }
     if VALID_LANGUAGE_CODES.contains(&code_lower.as_str()) {
         return Ok(());
     }

data/vendor/kreuzberg/src/core/extractor/batch.rs CHANGED Viewed

@@ -6,6 +6,7 @@
 use crate::core::config::ExtractionConfig;
 use crate::types::{ErrorMetadata, ExtractionResult, Metadata};
 use crate::{KreuzbergError, Result};
+use std::borrow::Cow;
 use std::path::Path;
 use std::sync::Arc;
@@ -65,9 +66,9 @@ pub async fn batch_extract_file(
         return Ok(vec![]);
     }
-    let config = Arc::new(config.clone());
+    let config_arc = Arc::new(config.clone());
-    let max_concurrent = config
+    let max_concurrent = config_arc
         .max_concurrent_extractions
         .unwrap_or_else(|| (num_cpus::get() as f64 * 1.5).ceil() as usize);
     let semaphore = Arc::new(Semaphore::new(max_concurrent));
@@ -76,7 +77,7 @@ pub async fn batch_extract_file(
     for (index, path) in paths.into_iter().enumerate() {
         let path_buf = path.as_ref().to_path_buf();
-        let config_clone = Arc::clone(&config);
+        let config_clone = Arc::clone(&config_arc);
         let semaphore_clone = Arc::clone(&semaphore);
         tasks.spawn(async move {
@@ -108,7 +109,7 @@ pub async fn batch_extract_file(
                 results[index] = Some(ExtractionResult {
                     content: format!("Error: {}", e),
-                    mime_type: "text/plain".to_string(),
+                    mime_type: Cow::Borrowed("text/plain"),
                     metadata,
                     tables: vec![],
                     detected_languages: None,
@@ -180,10 +181,9 @@ pub async fn batch_extract_bytes(
         return Ok(vec![]);
     }
-    let batch_config = config.clone();
-    let config = Arc::new(batch_config);
+    let config_arc = Arc::new(config.clone());
-    let max_concurrent = config
+    let max_concurrent = config_arc
         .max_concurrent_extractions
         .unwrap_or_else(|| (num_cpus::get() as f64 * 1.5).ceil() as usize);
     let semaphore = Arc::new(Semaphore::new(max_concurrent));
@@ -191,7 +191,7 @@ pub async fn batch_extract_bytes(
     let mut tasks = JoinSet::new();
     for (index, (bytes, mime_type)) in contents.into_iter().enumerate() {
-        let config_clone = Arc::clone(&config);
+        let config_clone = Arc::clone(&config_arc);
         let semaphore_clone = Arc::clone(&semaphore);
         tasks.spawn(async move {
@@ -224,7 +224,7 @@ pub async fn batch_extract_bytes(
                 results[index] = Some(ExtractionResult {
                     content: format!("Error: {}", e),
-                    mime_type: "text/plain".to_string(),
+                    mime_type: Cow::Borrowed("text/plain"),
                     metadata,
                     tables: vec![],
                     detected_languages: None,

data/vendor/kreuzberg/src/core/extractor/file.rs CHANGED Viewed

@@ -18,6 +18,8 @@ use crate::types::ExtractionResult;
 use crate::types::LibreOfficeConversionResult;
 #[cfg(feature = "office")]
 use serde_json::json;
+#[cfg(feature = "office")]
+use std::borrow::Cow;
 use std::path::Path;
 #[cfg(feature = "office")]
@@ -226,9 +228,9 @@ pub(in crate::core::extractor) fn apply_libreoffice_metadata(
     legacy_mime: &str,
     conversion: &LibreOfficeConversionResult,
 ) {
-    result.mime_type = pool_mime_type(legacy_mime);
+    result.mime_type = pool_mime_type(legacy_mime).into();
     result.metadata.additional.insert(
-        "libreoffice_conversion".to_string(),
+        Cow::Borrowed("libreoffice_conversion"),
         json!({
             "converter": "libreoffice",
             "original_format": conversion.original_format,

data/vendor/kreuzberg/src/core/extractor/legacy.rs CHANGED Viewed

@@ -24,18 +24,18 @@
 /// It replicates the logic of `extract_bytes` but uses synchronous extractor methods.
 #[cfg(not(feature = "tokio-runtime"))]
 pub(super) fn extract_bytes_sync_impl(
-    content: Vec<u8>,
-    mime_type: Option<String>,
-    config: Option<crate::core::config::ExtractionConfig>,
+    content: &[u8],
+    mime_type: Option<&str>,
+    config: Option<&crate::core::config::ExtractionConfig>,
 ) -> crate::Result<crate::types::ExtractionResult> {
     use crate::KreuzbergError;
     use crate::core::extractor::helpers::get_extractor;
     use crate::core::mime;
-    let config = config.unwrap_or_default();
+    let cfg = config.cloned().unwrap_or_default();
     let validated_mime = if let Some(mime) = mime_type {
-        mime::validate_mime_type(&mime)?
+        mime::validate_mime_type(mime)?
     } else {
         return Err(KreuzbergError::Validation {
             message: "MIME type is required for synchronous extraction".to_string(),
@@ -54,9 +54,9 @@ pub(super) fn extract_bytes_sync_impl(
         ))
     })?;
-    let mut result = sync_extractor.extract_sync(&content, &validated_mime, &config)?;
+    let mut result = sync_extractor.extract_sync(content, &validated_mime, &cfg)?;
-    result = crate::core::pipeline::run_pipeline_sync(result, &config)?;
+    result = crate::core::pipeline::run_pipeline_sync(result, &cfg)?;
     Ok(result)
 }

data/vendor/kreuzberg/src/core/extractor/sync.rs CHANGED Viewed

@@ -107,7 +107,7 @@ pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionCo
 /// It calls `extract_bytes_sync_impl()` to perform the extraction.
 #[cfg(not(feature = "tokio-runtime"))]
 pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
-    super::legacy::extract_bytes_sync_impl(content.to_vec(), Some(mime_type.to_string()), Some(config.clone()))
+    super::legacy::extract_bytes_sync_impl(content, Some(mime_type), Some(config))
 }
 /// Synchronous wrapper for `batch_extract_file`.
@@ -180,14 +180,14 @@ pub fn batch_extract_bytes_sync(
     config: &ExtractionConfig,
 ) -> Result<Vec<ExtractionResult>> {
     use crate::types::{ErrorMetadata, Metadata};
-    use crate::utils::intern_mime_type;
+    use std::borrow::Cow;
     let mut results = Vec::with_capacity(contents.len());
     for (content, mime_type) in contents {
         let result = extract_bytes_sync(&content, &mime_type, config);
         results.push(result.unwrap_or_else(|e| ExtractionResult {
             content: format!("Error: {}", e),
-            mime_type: intern_mime_type("text/plain").to_string(),
+            mime_type: Cow::Borrowed("text/plain"),
             metadata: Metadata {
                 error: Some(ErrorMetadata {
                     error_type: format!("{:?}", e),

data/vendor/kreuzberg/src/core/pipeline/execution.rs CHANGED Viewed

@@ -7,6 +7,7 @@ use crate::core::config::ExtractionConfig;
 use crate::plugins::ProcessingStage;
 use crate::types::ExtractionResult;
 use crate::{KreuzbergError, Result};
+use std::borrow::Cow;
 /// Execute all registered post-processors by stage.
 pub(super) async fn execute_processors(
@@ -37,7 +38,7 @@ pub(super) async fn execute_processors(
                     }
                     Err(err) => {
                         result.metadata.additional.insert(
-                            format!("processing_error_{processor_name}"),
+                            Cow::Owned(format!("processing_error_{processor_name}")),
                             serde_json::Value::String(err.to_string()),
                         );
                     }

data/vendor/kreuzberg/src/core/pipeline/features.rs CHANGED Viewed

@@ -6,27 +6,21 @@
 use crate::Result;
 use crate::core::config::ExtractionConfig;
 use crate::types::ExtractionResult;
+use std::borrow::Cow;
 /// Execute chunking if configured.
 pub(super) fn execute_chunking(result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
     #[cfg(feature = "chunking")]
     if let Some(ref chunking_config) = config.chunking {
-        let chunk_config = crate::chunking::ChunkingConfig {
-            max_characters: chunking_config.max_chars,
-            overlap: chunking_config.max_overlap,
-            trim: true,
-            chunker_type: crate::chunking::ChunkerType::Text,
-        };
         let page_boundaries = result.metadata.pages.as_ref().and_then(|ps| ps.boundaries.as_deref());
-        match crate::chunking::chunk_text(&result.content, &chunk_config, page_boundaries) {
+        match crate::chunking::chunk_text(&result.content, chunking_config, page_boundaries) {
             Ok(chunking_result) => {
                 result.chunks = Some(chunking_result.chunks);
                 if let Some(ref chunks) = result.chunks {
                     result.metadata.additional.insert(
-                        "chunk_count".to_string(),
+                        Cow::Borrowed("chunk_count"),
                         serde_json::Value::Number(serde_json::Number::from(chunks.len())),
                     );
                 }
@@ -40,13 +34,13 @@ pub(super) fn execute_chunking(result: &mut ExtractionResult, config: &Extractio
                             result
                                 .metadata
                                 .additional
-                                .insert("embeddings_generated".to_string(), serde_json::Value::Bool(true));
+                                .insert(Cow::Borrowed("embeddings_generated"), serde_json::Value::Bool(true));
                         }
                         Err(e) => {
-                            result
-                                .metadata
-                                .additional
-                                .insert("embedding_error".to_string(), serde_json::Value::String(e.to_string()));
+                            result.metadata.additional.insert(
+                                Cow::Borrowed("embedding_error"),
+                                serde_json::Value::String(e.to_string()),
+                            );
                         }
                     }
                 }
@@ -54,16 +48,16 @@ pub(super) fn execute_chunking(result: &mut ExtractionResult, config: &Extractio
                 #[cfg(not(feature = "embeddings"))]
                 if chunking_config.embedding.is_some() {
                     result.metadata.additional.insert(
-                        "embedding_error".to_string(),
+                        Cow::Borrowed("embedding_error"),
                         serde_json::Value::String("Embeddings feature not enabled".to_string()),
                     );
                 }
             }
             Err(e) => {
-                result
-                    .metadata
-                    .additional
-                    .insert("chunking_error".to_string(), serde_json::Value::String(e.to_string()));
+                result.metadata.additional.insert(
+                    Cow::Borrowed("chunking_error"),
+                    serde_json::Value::String(e.to_string()),
+                );
             }
         }
     }
@@ -71,7 +65,7 @@ pub(super) fn execute_chunking(result: &mut ExtractionResult, config: &Extractio
     #[cfg(not(feature = "chunking"))]
     if config.chunking.is_some() {
         result.metadata.additional.insert(
-            "chunking_error".to_string(),
+            Cow::Borrowed("chunking_error"),
             serde_json::Value::String("Chunking feature not enabled".to_string()),
         );
     }
@@ -89,7 +83,7 @@ pub(super) fn execute_language_detection(result: &mut ExtractionResult, config:
             }
             Err(e) => {
                 result.metadata.additional.insert(
-                    "language_detection_error".to_string(),
+                    Cow::Borrowed("language_detection_error"),
                     serde_json::Value::String(e.to_string()),
                 );
             }
@@ -99,7 +93,7 @@ pub(super) fn execute_language_detection(result: &mut ExtractionResult, config:
     #[cfg(not(feature = "language-detection"))]
     if config.language_detection.is_some() {
         result.metadata.additional.insert(
-            "language_detection_error".to_string(),
+            Cow::Borrowed("language_detection_error"),
             serde_json::Value::String("Language detection feature not enabled".to_string()),
         );
     }

data/vendor/kreuzberg/src/core/pipeline/format.rs CHANGED Viewed

@@ -5,6 +5,7 @@
 use crate::core::config::OutputFormat;
 use crate::types::ExtractionResult;
+use std::borrow::Cow;
 /// Apply output format conversion to the extraction result.
 ///
@@ -23,7 +24,7 @@ use crate::types::ExtractionResult;
 /// * `output_format` - The desired output format
 pub fn apply_output_format(result: &mut ExtractionResult, output_format: OutputFormat) {
     // Check if content was already formatted during extraction
-    let already_formatted = match result.mime_type.as_str() {
+    let already_formatted = match &*result.mime_type {
         "text/markdown" if output_format == OutputFormat::Markdown => true,
         "text/djot" if output_format == OutputFormat::Djot => true,
         _ => false,
@@ -46,7 +47,7 @@ pub fn apply_output_format(result: &mut ExtractionResult, output_format: OutputF
                 Err(e) => {
                     // Keep original content on error, record error in metadata
                     result.metadata.additional.insert(
-                        "output_format_error".to_string(),
+                        Cow::Borrowed("output_format_error"),
                         serde_json::Value::String(format!("Failed to convert to djot: {}", e)),
                     );
                 }
@@ -66,7 +67,7 @@ pub fn apply_output_format(result: &mut ExtractionResult, output_format: OutputF
                     Err(e) => {
                         // Keep original content on error, record error in metadata
                         result.metadata.additional.insert(
-                            "output_format_error".to_string(),
+                            Cow::Borrowed("output_format_error"),
                             serde_json::Value::String(format!("Failed to convert to markdown: {}", e)),
                         );
                     }
@@ -87,7 +88,7 @@ pub fn apply_output_format(result: &mut ExtractionResult, output_format: OutputF
                             Err(e) => {
                                 // Keep original content on error, record error in metadata
                                 result.metadata.additional.insert(
-                                    "output_format_error".to_string(),
+                                    Cow::Borrowed("output_format_error"),
                                     serde_json::Value::String(format!("Failed to convert djot to HTML: {}", e)),
                                 );
                             }
@@ -96,7 +97,7 @@ pub fn apply_output_format(result: &mut ExtractionResult, output_format: OutputF
                     Err(e) => {
                         // Keep original content on error, record error in metadata
                         result.metadata.additional.insert(
-                            "output_format_error".to_string(),
+                            Cow::Borrowed("output_format_error"),
                             serde_json::Value::String(format!("Failed to generate djot for HTML conversion: {}", e)),
                         );
                     }
@@ -128,7 +129,7 @@ mod tests {
     fn test_apply_output_format_plain() {
         let mut result = ExtractionResult {
             content: "Hello World".to_string(),
-            mime_type: "text/plain".to_string(),
+            mime_type: Cow::Borrowed("text/plain"),
             metadata: Metadata::default(),
             tables: vec![],
             detected_languages: None,
@@ -151,7 +152,7 @@ mod tests {
         let mut result = ExtractionResult {
             content: "Hello World".to_string(),
-            mime_type: "text/djot".to_string(),
+            mime_type: Cow::Borrowed("text/djot"),
             metadata: Metadata::default(),
             tables: vec![],
             detected_languages: None,
@@ -180,7 +181,7 @@ mod tests {
                 images: vec![],
                 links: vec![],
                 footnotes: vec![],
-                attributes: std::collections::HashMap::new(),
+                attributes: Vec::new(),
             }),
         };
@@ -194,7 +195,7 @@ mod tests {
     fn test_apply_output_format_djot_without_djot_content() {
         let mut result = ExtractionResult {
             content: "Hello World".to_string(),
-            mime_type: "text/plain".to_string(),
+            mime_type: Cow::Borrowed("text/plain"),
             metadata: Metadata::default(),
             tables: vec![],
             detected_languages: None,
@@ -216,7 +217,7 @@ mod tests {
     fn test_apply_output_format_html() {
         let mut result = ExtractionResult {
             content: "Hello World".to_string(),
-            mime_type: "text/plain".to_string(),
+            mime_type: Cow::Borrowed("text/plain"),
             metadata: Metadata::default(),
             tables: vec![],
             detected_languages: None,
@@ -239,7 +240,7 @@ mod tests {
     fn test_apply_output_format_html_escapes_special_chars() {
         let mut result = ExtractionResult {
             content: "<script>alert('XSS')</script>".to_string(),
-            mime_type: "text/plain".to_string(),
+            mime_type: Cow::Borrowed("text/plain"),
             metadata: Metadata::default(),
             tables: vec![],
             detected_languages: None,
@@ -262,7 +263,7 @@ mod tests {
     fn test_apply_output_format_markdown() {
         let mut result = ExtractionResult {
             content: "Hello World".to_string(),
-            mime_type: "text/plain".to_string(),
+            mime_type: Cow::Borrowed("text/plain"),
             metadata: Metadata::default(),
             tables: vec![],
             detected_languages: None,
@@ -281,8 +282,9 @@ mod tests {
     #[test]
     fn test_apply_output_format_preserves_metadata() {
-        let mut additional = std::collections::HashMap::new();
-        additional.insert("custom_key".to_string(), serde_json::json!("custom_value"));
+        use ahash::AHashMap;
+        let mut additional = AHashMap::new();
+        additional.insert(Cow::Borrowed("custom_key"), serde_json::json!("custom_value"));
         let metadata = Metadata {
             title: Some("Test Title".to_string()),
             additional,
@@ -291,7 +293,7 @@ mod tests {
         let mut result = ExtractionResult {
             content: "Hello World".to_string(),
-            mime_type: "text/plain".to_string(),
+            mime_type: Cow::Borrowed("text/plain"),
             metadata,
             tables: vec![],
             detected_languages: None,
@@ -324,7 +326,7 @@ mod tests {
         let mut result = ExtractionResult {
             content: "Hello World".to_string(),
-            mime_type: "text/plain".to_string(),
+            mime_type: Cow::Borrowed("text/plain"),
             metadata: Metadata::default(),
             tables: vec![table],
             detected_languages: None,
@@ -367,12 +369,12 @@ mod tests {
             images: vec![],
             links: vec![],
             footnotes: vec![],
-            attributes: std::collections::HashMap::new(),
+            attributes: Vec::new(),
         };
         let mut result = ExtractionResult {
             content: "test".to_string(),
-            mime_type: "text/djot".to_string(),
+            mime_type: Cow::Borrowed("text/djot"),
             metadata: Metadata::default(),
             tables: vec![],
             detected_languages: None,