RubyGems - kreuzberg - Versions diffs - 4.2.6 → 4.2.7 - Mend

kreuzberg 4.2.6 → 4.2.7

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (139)

checksums.yaml +4 -4
data/Gemfile.lock +7 -4
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
data/ext/kreuzberg_rb/native/src/result.rs +5 -3
data/lib/kreuzberg/version.rb +1 -1
data/sig/kreuzberg.rbs +228 -37
data/spec/binding/batch_operations_spec.rb +2 -0
data/vendor/Cargo.toml +3 -2
data/vendor/kreuzberg/Cargo.toml +2 -1
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/api/error.rs +29 -1
data/vendor/kreuzberg/src/api/handlers.rs +28 -25
data/vendor/kreuzberg/src/api/openapi.rs +14 -1
data/vendor/kreuzberg/src/chunking/config.rs +2 -37
data/vendor/kreuzberg/src/chunking/core.rs +78 -2
data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
data/vendor/kreuzberg/src/extraction/email.rs +31 -19
data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
data/vendor/kreuzberg/src/extractors/email.rs +5 -3
data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
data/vendor/kreuzberg/src/extractors/html.rs +1 -1
data/vendor/kreuzberg/src/extractors/image.rs +3 -3
data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
data/vendor/kreuzberg/src/extractors/text.rs +2 -2
data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
data/vendor/kreuzberg/src/lib.rs +1 -1
data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
data/vendor/kreuzberg/src/mcp/format.rs +5 -4
data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
data/vendor/kreuzberg/src/ocr/types.rs +3 -4
data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
data/vendor/kreuzberg/src/text/quality.rs +13 -13
data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
data/vendor/kreuzberg/src/types/djot.rs +15 -4
data/vendor/kreuzberg/src/types/extraction.rs +24 -4
data/vendor/kreuzberg/src/types/formats.rs +9 -5
data/vendor/kreuzberg/src/types/metadata.rs +68 -7
data/vendor/kreuzberg/src/types/mod.rs +7 -5
data/vendor/kreuzberg/src/types/page.rs +9 -0
data/vendor/kreuzberg/src/types/tables.rs +2 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
data/vendor/kreuzberg/tests/config_features.rs +19 -11
data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
data/vendor/kreuzberg/tests/core_integration.rs +5 -6
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
data/vendor/kreuzberg-ffi/src/error.rs +56 -0
data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
data/vendor/kreuzberg-ffi/src/result.rs +2 -1
data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +2 -2

data/vendor/kreuzberg/src/extraction/transform/mod.rs CHANGED Viewed

@@ -21,6 +21,8 @@ use crate::types::{Element, ExtractionResult};
 use content::{
     add_page_break, format_table_as_text, process_content, process_hierarchy, process_images, process_tables,
 };
+#[cfg(test)]
+use std::borrow::Cow;
 /// Transform an extraction result into semantic elements.
 ///
@@ -117,7 +119,7 @@ pub fn transform_extraction_result_to_elements(result: &ExtractionResult) -> Vec
                         element_index: Some(elements.len()),
                         additional: {
                             let mut m = std::collections::HashMap::new();
-                            m.insert("format".to_string(), image.format.clone());
+                            m.insert("format".to_string(), image.format.to_string());
                             if let Some(width) = image.width {
                                 m.insert("width".to_string(), width.to_string());
                             }
@@ -138,6 +140,7 @@ pub fn transform_extraction_result_to_elements(result: &ExtractionResult) -> Vec
 #[cfg(test)]
 mod tests {
     use super::*;
+    use bytes::Bytes;
     #[test]
     fn test_detect_bullet_items() {
@@ -262,7 +265,7 @@ mod tests {
         // Create a mock result with pages and hierarchy
         let result = ExtractionResult {
             content: "Full document content".to_string(),
-            mime_type: "application/pdf".to_string(),
+            mime_type: Cow::Borrowed("application/pdf"),
             metadata: test_metadata(Some("Test Document".to_string())),
             tables: vec![],
             detected_languages: None,
@@ -358,8 +361,8 @@ mod tests {
         };
         let image = ExtractedImage {
-            data: vec![1, 2, 3, 4],
-            format: "jpeg".to_string(),
+            data: Bytes::from_static(&[1, 2, 3, 4]),
+            format: std::borrow::Cow::Borrowed("jpeg"),
             image_index: 0,
             page_number: Some(1),
             width: Some(640),
@@ -373,7 +376,7 @@ mod tests {
         let result = ExtractionResult {
             content: "Test content".to_string(),
-            mime_type: "application/pdf".to_string(),
+            mime_type: Cow::Borrowed("application/pdf"),
             metadata: test_metadata(Some("Test".to_string())),
             tables: vec![],
             detected_languages: None,
@@ -421,7 +424,7 @@ mod tests {
         // Create a result without pages
         let result = ExtractionResult {
             content: "Simple text content\n\nSecond paragraph".to_string(),
-            mime_type: "text/plain".to_string(),
+            mime_type: Cow::Borrowed("text/plain"),
             metadata: test_metadata(Some("Simple Doc".to_string())),
             tables: vec![],
             detected_languages: None,
@@ -453,7 +456,7 @@ mod tests {
         let result = ExtractionResult {
             content: "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.".to_string(),
-            mime_type: "text/plain".to_string(),
+            mime_type: Cow::Borrowed("text/plain"),
             metadata: test_metadata(None),
             tables: vec![],
             detected_languages: None,

data/vendor/kreuzberg/src/extractors/archive.rs CHANGED Viewed

@@ -8,7 +8,9 @@ use crate::extraction::archive::{
 };
 use crate::plugins::{DocumentExtractor, Plugin};
 use crate::types::{ArchiveMetadata, ExtractionResult, Metadata};
+use ahash::AHashMap;
 use async_trait::async_trait;
+use std::borrow::Cow;
 use std::collections::HashMap;
 /// Build an ExtractionResult from archive metadata and text contents.
@@ -18,7 +20,7 @@ use std::collections::HashMap;
 fn build_archive_result(
     extraction_metadata: ExtractedMetadata,
     text_contents: HashMap<String, String>,
-    format_name: &str,
+    format_name: &'static str,
     mime_type: &str,
 ) -> ExtractionResult {
     let file_names: Vec<String> = extraction_metadata
@@ -28,14 +30,14 @@ fn build_archive_result(
         .collect();
     let archive_metadata = ArchiveMetadata {
-        format: format_name.to_string(),
+        format: Cow::Borrowed(format_name),
         file_count: extraction_metadata.file_count,
         file_list: file_names,
         total_size: extraction_metadata.total_size as usize,
         compressed_size: None,
     };
-    let mut additional = HashMap::new();
+    let mut additional = AHashMap::new();
     let file_details: Vec<serde_json::Value> = extraction_metadata
         .file_list
         .iter()
@@ -47,7 +49,7 @@ fn build_archive_result(
             })
         })
         .collect();
-    additional.insert("files".to_string(), serde_json::json!(file_details));
+    additional.insert(Cow::Borrowed("files"), serde_json::json!(file_details));
     let mut output = format!(
         "{} Archive ({} files, {} bytes)\n\n",
@@ -67,7 +69,7 @@ fn build_archive_result(
     ExtractionResult {
         content: output,
-        mime_type: mime_type.to_string(),
+        mime_type: mime_type.to_string().into(),
         metadata: Metadata {
             format: Some(crate::types::FormatMetadata::Archive(archive_metadata)),
             additional,

data/vendor/kreuzberg/src/extractors/bibtex.rs CHANGED Viewed

@@ -7,8 +7,10 @@ use crate::Result;
 use crate::core::config::ExtractionConfig;
 use crate::plugins::{DocumentExtractor, Plugin};
 use crate::types::{ExtractionResult, Metadata};
+use ahash::AHashMap;
 use async_trait::async_trait;
-use std::collections::{HashMap, HashSet};
+use std::borrow::Cow;
+use std::collections::HashSet;
 #[cfg(feature = "office")]
 use biblatex::{Bibliography, ChunksExt};
@@ -79,7 +81,7 @@ impl DocumentExtractor for BibtexExtractor {
         let mut entries_vec = Vec::new();
         let mut authors_set = HashSet::new();
         let mut years_set = HashSet::new();
-        let mut entry_types_map = HashMap::new();
+        let mut entry_types_map: AHashMap<String, i32> = AHashMap::new();
         let mut formatted_entries = String::new();
         match Bibliography::parse(&bibtex_str) {
@@ -129,19 +131,19 @@ impl DocumentExtractor for BibtexExtractor {
             }
         }
-        let mut additional = HashMap::new();
+        let mut additional: AHashMap<Cow<'static, str>, serde_json::Value> = AHashMap::new();
-        additional.insert("entry_count".to_string(), serde_json::json!(entries_vec.len()));
+        additional.insert(Cow::Borrowed("entry_count"), serde_json::json!(entries_vec.len()));
         let mut authors_list: Vec<String> = authors_set.into_iter().collect();
         authors_list.sort();
-        additional.insert("authors".to_string(), serde_json::json!(authors_list));
+        additional.insert(Cow::Borrowed("authors"), serde_json::json!(authors_list));
         if !years_set.is_empty() {
             let min_year = years_set.iter().min().copied().unwrap_or(0);
             let max_year = years_set.iter().max().copied().unwrap_or(0);
             additional.insert(
-                "year_range".to_string(),
+                Cow::Borrowed("year_range"),
                 serde_json::json!({
                     "min": min_year,
                     "max": max_year,
@@ -155,14 +157,14 @@ impl DocumentExtractor for BibtexExtractor {
             for (entry_type, count) in entry_types_map {
                 entry_types_json[entry_type] = serde_json::json!(count);
             }
-            additional.insert("entry_types".to_string(), entry_types_json);
+            additional.insert(Cow::Borrowed("entry_types"), entry_types_json);
         }
-        additional.insert("citation_keys".to_string(), serde_json::json!(entries_vec));
+        additional.insert(Cow::Borrowed("citation_keys"), serde_json::json!(entries_vec));
         Ok(ExtractionResult {
             content: formatted_entries,
-            mime_type: mime_type.to_string(),
+            mime_type: mime_type.to_string().into(),
             metadata: Metadata {
                 additional,
                 ..Default::default()
@@ -222,7 +224,10 @@ mod tests {
         assert!(result.content.contains("Sample Title"));
         let metadata = &result.metadata;
-        assert_eq!(metadata.additional.get("entry_count"), Some(&serde_json::json!(1)));
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("entry_count")),
+            Some(&serde_json::json!(1))
+        );
     }
     #[tokio::test]
@@ -258,15 +263,18 @@ mod tests {
         let metadata = &result.metadata;
-        assert_eq!(metadata.additional.get("entry_count"), Some(&serde_json::json!(3)));
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("entry_count")),
+            Some(&serde_json::json!(3))
+        );
-        if let Some(keys) = metadata.additional.get("citation_keys")
+        if let Some(keys) = metadata.additional.get(&Cow::Borrowed("citation_keys"))
             && let Some(keys_array) = keys.as_array()
         {
             assert_eq!(keys_array.len(), 3);
         }
-        if let Some(types) = metadata.additional.get("entry_types") {
+        if let Some(types) = metadata.additional.get(&Cow::Borrowed("entry_types")) {
             assert!(types.get("article").is_some());
             assert!(types.get("book").is_some());
             assert!(types.get("inproceedings").is_some());
@@ -330,7 +338,10 @@ mod tests {
         assert!(result.content.contains("The TeXbook"));
         let metadata = &result.metadata;
-        assert_eq!(metadata.additional.get("entry_count"), Some(&serde_json::json!(1)));
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("entry_count")),
+            Some(&serde_json::json!(1))
+        );
         if let Some(year_range) = metadata.additional.get("year_range") {
             assert_eq!(year_range.get("min"), Some(&serde_json::json!(1984)));
@@ -368,7 +379,10 @@ mod tests {
         let result = result.expect("Should extract valid metadata");
         let metadata = &result.metadata;
-        assert_eq!(metadata.additional.get("entry_count"), Some(&serde_json::json!(3)));
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("entry_count")),
+            Some(&serde_json::json!(3))
+        );
         if let Some(authors) = metadata.additional.get("authors")
             && let Some(authors_array) = authors.as_array()
@@ -381,7 +395,7 @@ mod tests {
             assert_eq!(year_range.get("max"), Some(&serde_json::json!(2021)));
         }
-        if let Some(types) = metadata.additional.get("entry_types") {
+        if let Some(types) = metadata.additional.get(&Cow::Borrowed("entry_types")) {
             assert_eq!(types.get("article"), Some(&serde_json::json!(2)));
             assert_eq!(types.get("book"), Some(&serde_json::json!(1)));
         }
@@ -401,7 +415,10 @@ mod tests {
         let result = result.expect("Should extract empty bibliography");
         let metadata = &result.metadata;
-        assert_eq!(metadata.additional.get("entry_count"), Some(&serde_json::json!(0)));
+        assert_eq!(
+            metadata.additional.get(&Cow::Borrowed("entry_count")),
+            Some(&serde_json::json!(0))
+        );
     }
     #[tokio::test]

data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs CHANGED Viewed

@@ -2,8 +2,6 @@
 //!
 //! Handles parsing of Djot attributes from jotdown events and string syntax.
-use std::collections::HashMap;
 /// Parse jotdown attributes into our Attributes representation.
 ///
 /// Converts jotdown's internal attribute representation to Kreuzberg's
@@ -14,7 +12,7 @@ pub fn parse_jotdown_attributes(attrs: &jotdown::Attributes) -> crate::types::At
     let mut id = None;
     let mut classes = Vec::new();
-    let mut key_values = HashMap::new();
+    let mut key_values = Vec::new();
     for (kind, value) in attrs.iter() {
         match kind {
@@ -26,7 +24,7 @@ pub fn parse_jotdown_attributes(attrs: &jotdown::Attributes) -> crate::types::At
                 classes.push(value.to_string());
             }
             AttributeKind::Pair { key } => {
-                key_values.insert(key.to_string(), value.to_string());
+                key_values.push((key.to_string(), value.to_string()));
             }
             AttributeKind::Comment => {
                 // Comments are ignored in our representation
@@ -49,7 +47,7 @@ pub fn parse_djot_attributes(attr_str: &str) -> crate::types::Attributes {
     let mut attrs = Attributes {
         id: None,
         classes: Vec::new(),
-        key_values: HashMap::new(),
+        key_values: Vec::new(),
     };
     // Simple parser for attribute syntax
@@ -66,7 +64,7 @@ pub fn parse_djot_attributes(attr_str: &str) -> crate::types::Attributes {
             // Key-value pair
             if let Some((key, value)) = token.split_once('=') {
                 let clean_value = value.trim_matches('"').trim_matches('\'');
-                attrs.key_values.insert(key.to_string(), clean_value.to_string());
+                attrs.key_values.push((key.to_string(), clean_value.to_string()));
             }
         }
     }
@@ -106,12 +104,11 @@ mod tests {
     #[test]
     fn test_render_attributes_with_all_parts() {
-        let mut attrs = crate::types::Attributes {
+        let attrs = crate::types::Attributes {
             id: Some("my-id".to_string()),
             classes: vec!["class1".to_string(), "class2".to_string()],
-            key_values: HashMap::new(),
+            key_values: vec![("data-test".to_string(), "value".to_string())],
         };
-        attrs.key_values.insert("data-test".to_string(), "value".to_string());
         let rendered = render_attributes(&attrs);
         assert!(rendered.contains("#my-id"));
@@ -125,7 +122,7 @@ mod tests {
         let attrs = crate::types::Attributes {
             id: None,
             classes: vec![],
-            key_values: HashMap::new(),
+            key_values: Vec::new(),
         };
         let rendered = render_attributes(&attrs);

data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs CHANGED Viewed

@@ -7,6 +7,8 @@
 use super::rendering::render_block_to_djot;
 use jotdown::Parser;
+#[cfg(test)]
+use std::borrow::Cow;
 /// Convert DjotContent back to djot markup.
 ///
@@ -150,7 +152,7 @@ mod tests {
     fn test_extraction_result_to_djot_with_djot_content() {
         let result = ExtractionResult {
             content: "Test content".to_string(),
-            mime_type: "text/djot".to_string(),
+            mime_type: Cow::Borrowed("text/djot"),
             metadata: Metadata::default(),
             tables: vec![],
             detected_languages: None,
@@ -191,7 +193,7 @@ mod tests {
     fn test_extraction_result_to_djot_without_djot_content() {
         let result = ExtractionResult {
             content: "Paragraph one\n\nParagraph two".to_string(),
-            mime_type: "text/plain".to_string(),
+            mime_type: Cow::Borrowed("text/plain"),
             metadata: Metadata::default(),
             tables: vec![],
             detected_languages: None,

data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs CHANGED Viewed

@@ -9,6 +9,7 @@ use crate::plugins::{DocumentExtractor, Plugin};
 use crate::types::{ExtractionResult, Metadata};
 use async_trait::async_trait;
 use jotdown::{Event, Parser};
+use std::borrow::Cow;
 /// Djot markup extractor with metadata and table support.
 ///
@@ -90,7 +91,7 @@ impl DocumentExtractor for DjotExtractor {
         if !metadata.additional.contains_key("title")
             && let Some(title) = crate::extractors::frontmatter_utils::extract_title_from_content(&remaining_content)
         {
-            metadata.additional.insert("title".to_string(), title.into());
+            metadata.additional.insert(Cow::Borrowed("title"), title.into());
         }
         // Parse with jotdown and collect events once for extraction
@@ -105,7 +106,7 @@ impl DocumentExtractor for DjotExtractor {
         Ok(ExtractionResult {
             content: extracted_text,
-            mime_type: mime_type.to_string(),
+            mime_type: mime_type.to_string().into(),
             metadata,
             tables,
             detected_languages: None,

data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs CHANGED Viewed

@@ -135,7 +135,7 @@ pub(super) fn handle_block_start(
         }
         Container::TaskListItem { checked } => {
             let mut attrs = parsed_attrs.unwrap_or_default();
-            attrs.key_values.insert("checked".to_string(), checked.to_string());
+            attrs.key_values.push(("checked".to_string(), checked.to_string()));
             push_block(
                 state,
                 FormattedBlock {

data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs CHANGED Viewed

@@ -14,8 +14,6 @@ use super::text_extraction::extract_text_from_events;
 use crate::extractors::djot_format::attributes::parse_jotdown_attributes;
 use crate::types::{Attributes, DjotContent, DjotImage, DjotLink, FormattedBlock};
 use jotdown::{Container, Event};
-use std::collections::HashMap;
 /// Extract complete djot content with 100% feature extraction.
 ///
 /// Processes ALL djot events to build a rich DjotContent structure including:
@@ -42,7 +40,7 @@ pub fn extract_complete_djot_content(
     let mut images = Vec::new();
     let mut links = Vec::new();
     let mut footnotes = Vec::new();
-    let attributes_map: HashMap<String, Attributes> = HashMap::new();
+    let attributes_map: Vec<(String, Attributes)> = Vec::new();
     let mut state = ExtractionState::new();
@@ -186,7 +184,7 @@ fn handle_start_event(
     };
     // Try block handlers first
-    if handle_block_start(state, container, attrs, parsed_attrs.clone(), footnotes) {
+    if handle_block_start(state, container, attrs, parsed_attrs.as_ref().cloned(), footnotes) {
         return;
     }

data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs CHANGED Viewed

@@ -9,7 +9,7 @@ use std::collections::HashMap;
 pub(super) fn handle_footnote_reference(state: &mut ExtractionState, label: &str) {
     state.flush_text();
-    let mut meta = HashMap::new();
+    let mut meta: HashMap<String, String> = HashMap::new();
     meta.insert("label".to_string(), label.to_string());
     state.current_inline_elements.push(InlineElement {

data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs CHANGED Viewed

@@ -3,7 +3,6 @@
 use super::state::ExtractionState;
 use crate::types::{DjotImage, DjotLink, InlineElement, InlineType};
 use jotdown::Container;
-use std::collections::HashMap;
 /// Handle start of inline elements.
 pub(super) fn handle_inline_start(
@@ -123,7 +122,7 @@ pub(super) fn handle_math_end(state: &mut ExtractionState, display: bool) {
     let math_text = std::mem::take(&mut state.math_content);
     state.inline_type_stack.pop();
-    let mut meta = HashMap::new();
+    let mut meta: std::collections::HashMap<String, String> = std::collections::HashMap::new();
     meta.insert("display".to_string(), display.to_string());
     state.current_inline_elements.push(InlineElement {
@@ -144,7 +143,7 @@ pub(super) fn finalize_inline_element(state: &mut ExtractionState, container: &C
         if matches!(container, Container::RawInline { .. })
             && let Some(fmt) = state.raw_format.take()
         {
-            let mut m = HashMap::new();
+            let mut m: std::collections::HashMap<String, String> = std::collections::HashMap::new();
             m.insert("format".to_string(), fmt);
             meta = Some(m);
         }
@@ -167,7 +166,7 @@ pub(super) fn handle_link_end(state: &mut ExtractionState, url: &str, links: &mu
         }
         state.inline_type_stack.pop();
-        let mut meta = HashMap::new();
+        let mut meta: std::collections::HashMap<String, String> = std::collections::HashMap::new();
         meta.insert("href".to_string(), url.to_string());
         state.current_inline_elements.push(InlineElement {
@@ -188,7 +187,7 @@ pub(super) fn handle_image_end(state: &mut ExtractionState, src: &str, images: &
         }
         state.inline_type_stack.pop();
-        let mut meta = HashMap::new();
+        let mut meta: std::collections::HashMap<String, String> = std::collections::HashMap::new();
         meta.insert("src".to_string(), src.to_string());
         state.current_inline_elements.push(InlineElement {

data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs CHANGED Viewed

@@ -43,7 +43,7 @@ pub fn extract_tables_from_events(events: &[Event]) -> Vec<Table> {
                 if !current_row.is_empty()
                     && let Some((ref mut rows, _)) = current_table
                 {
-                    rows.push(current_row.clone());
+                    rows.push(std::mem::take(&mut current_row));
                 }
                 current_row = Vec::new();
             }

data/vendor/kreuzberg/src/extractors/docbook.rs CHANGED Viewed

@@ -398,7 +398,7 @@ impl DocumentExtractor for DocbookExtractor {
         Ok(ExtractionResult {
             content: extracted_content,
-            mime_type: mime_type.to_string(),
+            mime_type: mime_type.to_string().into(),
             metadata,
             tables,
             detected_languages: None,

data/vendor/kreuzberg/src/extractors/docx.rs CHANGED Viewed

@@ -9,7 +9,9 @@ use crate::core::config::ExtractionConfig;
 use crate::extraction::{cells_to_markdown, office_metadata};
 use crate::plugins::{DocumentExtractor, Plugin};
 use crate::types::{ExtractionResult, Metadata, PageBoundary, PageInfo, PageStructure, PageUnitType, Table};
+use ahash::AHashMap;
 use async_trait::async_trait;
+use std::borrow::Cow;
 use std::io::Cursor;
 /// High-performance DOCX extractor using docx-lite.
@@ -181,22 +183,22 @@ impl DocumentExtractor for DocxExtractor {
                 .map_err(|e| crate::error::KreuzbergError::parsing(format!("Failed to open ZIP archive: {}", e)))?
         };
-        let mut metadata_map = std::collections::HashMap::new();
+        let mut metadata_map = AHashMap::new();
         let mut parsed_keywords: Option<Vec<String>> = None;
         if let Ok(core) = office_metadata::extract_core_properties(&mut archive) {
             if let Some(title) = core.title {
-                metadata_map.insert("title".to_string(), serde_json::Value::String(title));
+                metadata_map.insert(Cow::Borrowed("title"), serde_json::Value::String(title));
             }
             if let Some(creator) = core.creator {
                 metadata_map.insert(
-                    "authors".to_string(),
+                    Cow::Borrowed("authors"),
                     serde_json::Value::Array(vec![serde_json::Value::String(creator.clone())]),
                 );
-                metadata_map.insert("created_by".to_string(), serde_json::Value::String(creator));
+                metadata_map.insert(Cow::Borrowed("created_by"), serde_json::Value::String(creator));
             }
             if let Some(subject) = core.subject {
-                metadata_map.insert("subject".to_string(), serde_json::Value::String(subject));
+                metadata_map.insert(Cow::Borrowed("subject"), serde_json::Value::String(subject));
             }
             if let Some(keywords) = core.keywords {
                 // Parse comma-separated keywords into Vec<String>
@@ -209,70 +211,76 @@ impl DocumentExtractor for DocxExtractor {
                 );
             }
             if let Some(description) = core.description {
-                metadata_map.insert("description".to_string(), serde_json::Value::String(description));
+                metadata_map.insert(Cow::Borrowed("description"), serde_json::Value::String(description));
             }
             if let Some(modified_by) = core.last_modified_by {
-                metadata_map.insert("modified_by".to_string(), serde_json::Value::String(modified_by));
+                metadata_map.insert(Cow::Borrowed("modified_by"), serde_json::Value::String(modified_by));
             }
             if let Some(created) = core.created {
-                metadata_map.insert("created_at".to_string(), serde_json::Value::String(created));
+                metadata_map.insert(Cow::Borrowed("created_at"), serde_json::Value::String(created));
             }
             if let Some(modified) = core.modified {
-                metadata_map.insert("modified_at".to_string(), serde_json::Value::String(modified));
+                metadata_map.insert(Cow::Borrowed("modified_at"), serde_json::Value::String(modified));
             }
             if let Some(revision) = core.revision {
-                metadata_map.insert("revision".to_string(), serde_json::Value::String(revision));
+                metadata_map.insert(Cow::Borrowed("revision"), serde_json::Value::String(revision));
             }
             if let Some(category) = core.category {
-                metadata_map.insert("category".to_string(), serde_json::Value::String(category));
+                metadata_map.insert(Cow::Borrowed("category"), serde_json::Value::String(category));
             }
             if let Some(content_status) = core.content_status {
-                metadata_map.insert("content_status".to_string(), serde_json::Value::String(content_status));
+                metadata_map.insert(
+                    Cow::Borrowed("content_status"),
+                    serde_json::Value::String(content_status),
+                );
             }
             if let Some(language) = core.language {
-                metadata_map.insert("language".to_string(), serde_json::Value::String(language));
+                metadata_map.insert(Cow::Borrowed("language"), serde_json::Value::String(language));
             }
         }
         if let Ok(app) = office_metadata::extract_docx_app_properties(&mut archive) {
             if let Some(pages) = app.pages {
-                metadata_map.insert("page_count".to_string(), serde_json::Value::Number(pages.into()));
+                metadata_map.insert(Cow::Borrowed("page_count"), serde_json::Value::Number(pages.into()));
             }
             if let Some(words) = app.words {
-                metadata_map.insert("word_count".to_string(), serde_json::Value::Number(words.into()));
+                metadata_map.insert(Cow::Borrowed("word_count"), serde_json::Value::Number(words.into()));
             }
             if let Some(chars) = app.characters {
-                metadata_map.insert("character_count".to_string(), serde_json::Value::Number(chars.into()));
+                metadata_map.insert(
+                    Cow::Borrowed("character_count"),
+                    serde_json::Value::Number(chars.into()),
+                );
             }
             if let Some(lines) = app.lines {
-                metadata_map.insert("line_count".to_string(), serde_json::Value::Number(lines.into()));
+                metadata_map.insert(Cow::Borrowed("line_count"), serde_json::Value::Number(lines.into()));
             }
             if let Some(paragraphs) = app.paragraphs {
                 metadata_map.insert(
-                    "paragraph_count".to_string(),
+                    Cow::Borrowed("paragraph_count"),
                     serde_json::Value::Number(paragraphs.into()),
                 );
             }
             if let Some(template) = app.template {
-                metadata_map.insert("template".to_string(), serde_json::Value::String(template));
+                metadata_map.insert(Cow::Borrowed("template"), serde_json::Value::String(template));
             }
             if let Some(company) = app.company {
-                metadata_map.insert("organization".to_string(), serde_json::Value::String(company));
+                metadata_map.insert(Cow::Borrowed("organization"), serde_json::Value::String(company));
             }
             if let Some(time) = app.total_time {
                 metadata_map.insert(
-                    "total_editing_time_minutes".to_string(),
+                    Cow::Borrowed("total_editing_time_minutes"),
                     serde_json::Value::Number(time.into()),
                 );
             }
             if let Some(application) = app.application {
-                metadata_map.insert("application".to_string(), serde_json::Value::String(application));
+                metadata_map.insert(Cow::Borrowed("application"), serde_json::Value::String(application));
             }
         }
         if let Ok(custom) = office_metadata::extract_custom_properties(&mut archive) {
             for (key, value) in custom {
-                metadata_map.insert(format!("custom_{}", key), value);
+                metadata_map.insert(Cow::Owned(format!("custom_{}", key)), value);
             }
         }
@@ -301,7 +309,7 @@ impl DocumentExtractor for DocxExtractor {
         Ok(ExtractionResult {
             content: text,
-            mime_type: mime_type.to_string(),
+            mime_type: mime_type.to_string().into(),
             metadata: Metadata {
                 pages: page_structure,
                 keywords: parsed_keywords,