RubyGems - kreuzberg - Versions diffs - 4.2.6 → 4.2.7 - Mend

kreuzberg 4.2.6 → 4.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (139)

checksums.yaml +4 -4
data/Gemfile.lock +7 -4
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
data/ext/kreuzberg_rb/native/src/result.rs +5 -3
data/lib/kreuzberg/version.rb +1 -1
data/sig/kreuzberg.rbs +228 -37
data/spec/binding/batch_operations_spec.rb +2 -0
data/vendor/Cargo.toml +3 -2
data/vendor/kreuzberg/Cargo.toml +2 -1
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/api/error.rs +29 -1
data/vendor/kreuzberg/src/api/handlers.rs +28 -25
data/vendor/kreuzberg/src/api/openapi.rs +14 -1
data/vendor/kreuzberg/src/chunking/config.rs +2 -37
data/vendor/kreuzberg/src/chunking/core.rs +78 -2
data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
data/vendor/kreuzberg/src/extraction/email.rs +31 -19
data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
data/vendor/kreuzberg/src/extractors/email.rs +5 -3
data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
data/vendor/kreuzberg/src/extractors/html.rs +1 -1
data/vendor/kreuzberg/src/extractors/image.rs +3 -3
data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
data/vendor/kreuzberg/src/extractors/text.rs +2 -2
data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
data/vendor/kreuzberg/src/lib.rs +1 -1
data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
data/vendor/kreuzberg/src/mcp/format.rs +5 -4
data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
data/vendor/kreuzberg/src/ocr/types.rs +3 -4
data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
data/vendor/kreuzberg/src/text/quality.rs +13 -13
data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
data/vendor/kreuzberg/src/types/djot.rs +15 -4
data/vendor/kreuzberg/src/types/extraction.rs +24 -4
data/vendor/kreuzberg/src/types/formats.rs +9 -5
data/vendor/kreuzberg/src/types/metadata.rs +68 -7
data/vendor/kreuzberg/src/types/mod.rs +7 -5
data/vendor/kreuzberg/src/types/page.rs +9 -0
data/vendor/kreuzberg/src/types/tables.rs +2 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
data/vendor/kreuzberg/tests/config_features.rs +19 -11
data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
data/vendor/kreuzberg/tests/core_integration.rs +5 -6
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
data/vendor/kreuzberg-ffi/src/error.rs +56 -0
data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
data/vendor/kreuzberg-ffi/src/result.rs +2 -1
data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +2 -2

data/vendor/kreuzberg/src/types/djot.rs CHANGED Viewed

@@ -21,6 +21,8 @@ use super::metadata::Metadata;
 ///
 /// Available when the `djot` feature is enabled.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
+#[cfg_attr(feature = "api", schema(no_recursion))]
 pub struct DjotContent {
     /// Plain text representation for backwards compatibility
     pub plain_text: String,
@@ -44,14 +46,16 @@ pub struct DjotContent {
     pub footnotes: Vec<Footnote>,
     /// Attributes mapped by element identifier (if present)
-    #[serde(skip_serializing_if = "HashMap::is_empty", default)]
-    pub attributes: HashMap<String, Attributes>,
+    #[serde(skip_serializing_if = "Vec::is_empty", default)]
+    pub attributes: Vec<(String, Attributes)>,
 }
 /// Block-level element in a Djot document.
 ///
 /// Represents structural elements like headings, paragraphs, lists, code blocks, etc.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
+#[cfg_attr(feature = "api", schema(no_recursion))]
 pub struct FormattedBlock {
     /// Type of block element
     pub block_type: BlockType,
@@ -83,6 +87,7 @@ pub struct FormattedBlock {
 /// Types of block-level elements in Djot.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "snake_case")]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub enum BlockType {
     Paragraph,
     Heading,
@@ -106,6 +111,7 @@ pub enum BlockType {
 ///
 /// Represents text with formatting, links, images, etc.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct InlineElement {
     /// Type of inline element
     pub element_type: InlineType,
@@ -125,6 +131,7 @@ pub struct InlineElement {
 /// Types of inline elements in Djot.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "snake_case")]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub enum InlineType {
     Text,
     Strong,
@@ -148,6 +155,7 @@ pub enum InlineType {
 ///
 /// Represents the attributes attached to elements using {.class #id key="value"} syntax.
 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct Attributes {
     /// Element ID (#identifier)
     #[serde(skip_serializing_if = "Option::is_none")]
@@ -158,12 +166,13 @@ pub struct Attributes {
     pub classes: Vec<String>,
     /// Key-value pairs (key="value")
-    #[serde(skip_serializing_if = "HashMap::is_empty", default)]
-    pub key_values: HashMap<String, String>,
+    #[serde(skip_serializing_if = "Vec::is_empty", default)]
+    pub key_values: Vec<(String, String)>,
 }
 /// Image element in Djot.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct DjotImage {
     /// Image source URL or path
     pub src: String,
@@ -182,6 +191,7 @@ pub struct DjotImage {
 /// Link element in Djot.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct DjotLink {
     /// Link URL
     pub url: String,
@@ -200,6 +210,7 @@ pub struct DjotLink {
 /// Footnote in Djot.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct Footnote {
     /// Footnote label
     pub label: String,

data/vendor/kreuzberg/src/types/extraction.rs CHANGED Viewed

@@ -1,6 +1,8 @@
 //! Core extraction types and results.
+use bytes::Bytes;
 use serde::{Deserialize, Serialize};
+use std::borrow::Cow;
 use std::collections::HashMap;
 use super::djot::DjotContent;
@@ -12,9 +14,12 @@ use super::tables::Table;
 ///
 /// This is the main result type returned by all extraction functions.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
+#[cfg_attr(feature = "api", schema(no_recursion))]
 pub struct ExtractionResult {
     pub content: String,
-    pub mime_type: String,
+    #[cfg_attr(feature = "api", schema(value_type = String))]
+    pub mime_type: Cow<'static, str>,
     pub metadata: Metadata,
     pub tables: Vec<Table>,
     #[serde(skip_serializing_if = "Option::is_none")]
@@ -75,6 +80,7 @@ pub struct ExtractionResult {
 /// contains the text content, optional embedding vector (if embedding generation
 /// is configured), and metadata about its position in the document.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct Chunk {
     /// The text content of this chunk.
     pub content: String,
@@ -92,6 +98,7 @@ pub struct Chunk {
 /// Metadata about a chunk's position in the original document.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ChunkMetadata {
     /// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
     pub byte_start: usize,
@@ -130,12 +137,17 @@ pub struct ChunkMetadata {
 /// Raw bytes allow cross-language compatibility - users can convert to
 /// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ExtractedImage {
-    /// Raw image data (PNG, JPEG, WebP, etc. bytes)
-    pub data: Vec<u8>,
+    /// Raw image data (PNG, JPEG, WebP, etc. bytes).
+    /// Uses `bytes::Bytes` for cheap cloning of large buffers.
+    #[cfg_attr(feature = "api", schema(value_type = Vec<u8>, format = "binary"))]
+    pub data: Bytes,
     /// Image format (e.g., "jpeg", "png", "webp")
-    pub format: String,
+    /// Uses Cow<'static, str> to avoid allocation for static literals.
+    #[cfg_attr(feature = "api", schema(value_type = String))]
+    pub format: Cow<'static, str>,
     /// Zero-indexed position of this image in the document/page
     pub image_index: usize,
@@ -173,6 +185,7 @@ pub struct ExtractedImage {
     /// When OCR is performed on this image, the result is embedded here
     /// rather than in a separate collection, making the relationship explicit.
     #[serde(skip_serializing_if = "Option::is_none")]
+    #[cfg_attr(feature = "api", schema(value_type = Option<ExtractionResult>))]
     pub ocr_result: Option<Box<ExtractionResult>>,
 }
@@ -182,6 +195,7 @@ pub struct ExtractedImage {
 /// Output format selection for extraction results.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 #[serde(rename_all = "snake_case")]
 pub enum OutputFormat {
     /// Unified format with all content in `content` field
@@ -196,6 +210,8 @@ pub enum OutputFormat {
 /// Wraps a string identifier that is deterministically generated
 /// from element type, content, and page number.
 #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
+#[cfg_attr(feature = "api", schema(value_type = String))]
 pub struct ElementId(String);
 impl ElementId {
@@ -230,6 +246,7 @@ impl std::fmt::Display for ElementId {
 /// Categorizes text content into semantic units for downstream processing.
 /// Supports the element types commonly found in Unstructured documents.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 #[serde(rename_all = "snake_case")]
 pub enum ElementType {
     /// Document title
@@ -258,6 +275,7 @@ pub enum ElementType {
 /// Bounding box coordinates for element positioning.
 #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct BoundingBox {
     /// Left x-coordinate
     pub x0: f64,
@@ -271,6 +289,7 @@ pub struct BoundingBox {
 /// Metadata for a semantic element.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ElementMetadata {
     /// Page number (1-indexed)
     pub page_number: Option<usize>,
@@ -289,6 +308,7 @@ pub struct ElementMetadata {
 /// Represents a logical unit of content with semantic classification,
 /// unique identifier, and metadata for tracking origin and position.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct Element {
     /// Unique element identifier
     pub element_id: ElementId,

data/vendor/kreuzberg/src/types/formats.rs CHANGED Viewed

@@ -1,6 +1,8 @@
 //! Format-specific extraction results and OCR configuration types.
+use bytes::Bytes;
 use serde::{Deserialize, Serialize};
+use std::borrow::Cow;
 use std::collections::HashMap;
 use super::extraction::ExtractedImage;
@@ -153,8 +155,9 @@ pub struct EmailAttachment {
     pub size: Option<usize>,
     /// Whether this attachment is an image
     pub is_image: bool,
-    /// Attachment data (if extracted)
-    pub data: Option<Vec<u8>>,
+    /// Attachment data (if extracted).
+    /// Uses `bytes::Bytes` for cheap cloning of large buffers.
+    pub data: Option<Bytes>,
 }
 /// OCR extraction result.
@@ -351,6 +354,7 @@ impl Default for TesseractConfig {
 /// Tracks the transformations applied to an image during OCR preprocessing,
 /// including DPI normalization, resizing, and resampling.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ImagePreprocessingMetadata {
     /// Original image dimensions (width, height) in pixels
     pub original_dimensions: (usize, usize),
@@ -435,9 +439,9 @@ pub struct LibreOfficeConversionResult {
     /// Converted file bytes
     pub converted_bytes: Vec<u8>,
     /// Original format identifier
-    pub original_format: String,
+    pub original_format: Cow<'static, str>,
     /// Target format identifier
-    pub target_format: String,
+    pub target_format: Cow<'static, str>,
     /// Target MIME type after conversion
-    pub target_mime: String,
+    pub target_mime: Cow<'static, str>,
 }

data/vendor/kreuzberg/src/types/metadata.rs CHANGED Viewed

@@ -2,7 +2,10 @@
 //!
 //! This module defines metadata structures for various document formats.
-use serde::{Deserialize, Serialize};
+use std::borrow::Cow;
+use ahash::AHashMap;
+use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use std::collections::{BTreeMap, HashMap};
 #[cfg(feature = "pdf")]
@@ -11,11 +14,41 @@ use crate::pdf::metadata::PdfMetadata;
 use super::formats::ImagePreprocessingMetadata;
 use super::page::PageStructure;
+/// Custom serialization and deserialization for `AHashMap<Cow<'static, str>, Value>`.
+///
+/// The wire format is a plain string-keyed map. In memory the keys are
+/// `Cow<'static, str>` so that static string keys avoid allocation; keys read
+/// back from the wire are always `Cow::Owned`.
+mod additional_serde {
+    use super::*;
+    /// Serializes `map` as a string-keyed map.
+    ///
+    /// Entries are streamed into the serializer by reference (`Cow<str>`
+    /// serializes as a plain string), so no intermediate map is built and
+    /// neither keys nor values are cloned.
+    pub fn serialize<S>(map: &AHashMap<Cow<'static, str>, serde_json::Value>, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        serializer.collect_map(map.iter())
+    }
+    /// Deserializes a string-keyed map, wrapping every key in `Cow::Owned`.
+    pub fn deserialize<'de, D>(deserializer: D) -> Result<AHashMap<Cow<'static, str>, serde_json::Value>, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        // Read owned `String` keys first, then move them into `Cow::Owned`
+        // without re-allocating.
+        let map = HashMap::<String, serde_json::Value>::deserialize(deserializer)?;
+        Ok(map.into_iter().map(|(k, v)| (Cow::Owned(k), v)).collect())
+    }
+}
 /// Format-specific metadata (discriminated union).
 ///
 /// Only one format type can exist per extraction result. This provides
 /// type-safe, clean metadata without nested optionals.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 #[serde(tag = "format_type", rename_all = "snake_case")]
 pub enum FormatMetadata {
     #[cfg(feature = "pdf")]
@@ -27,6 +60,7 @@ pub enum FormatMetadata {
     Image(ImageMetadata),
     Xml(XmlMetadata),
     Text(TextMetadata),
+    #[cfg_attr(feature = "api", schema(value_type = HtmlMetadata))]
     Html(Box<HtmlMetadata>),
     Ocr(OcrMetadata),
 }
@@ -36,6 +70,7 @@ pub enum FormatMetadata {
 /// Contains common fields applicable to all formats, format-specific metadata
 /// via a discriminated union, and additional custom fields from postprocessors.
 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct Metadata {
     /// Document title
     #[serde(skip_serializing_if = "Option::is_none")]
@@ -82,6 +117,7 @@ pub struct Metadata {
     /// Contains detailed metadata specific to the document format.
     /// Serializes with a `format_type` discriminator field.
     #[serde(flatten, skip_serializing_if = "Option::is_none")]
+    #[cfg_attr(feature = "api", schema(value_type = Option<Object>))]
     pub format: Option<FormatMetadata>,
     /// Image preprocessing metadata (when OCR preprocessing was applied)
@@ -98,11 +134,17 @@ pub struct Metadata {
     /// Additional custom fields from postprocessors.
     ///
-    /// This flattened HashMap allows Python/TypeScript postprocessors to add
+    /// This flattened map allows Python/TypeScript postprocessors to add
     /// arbitrary fields (entity extraction, keyword extraction, etc.).
     /// Fields are merged at the root level during serialization.
-    #[serde(flatten)]
-    pub additional: HashMap<String, serde_json::Value>,
+    /// Uses `Cow<'static, str>` keys so static string keys avoid allocation.
+    #[serde(
+        flatten,
+        serialize_with = "additional_serde::serialize",
+        deserialize_with = "additional_serde::deserialize"
+    )]
+    #[cfg_attr(feature = "api", schema(value_type = HashMap<String, serde_json::Value>))]
+    pub additional: AHashMap<Cow<'static, str>, serde_json::Value>,
 }
 /// Excel/spreadsheet metadata.
@@ -110,6 +152,7 @@ pub struct Metadata {
 /// Contains information about sheets in Excel, LibreOffice Calc, and other
 /// spreadsheet formats (.xlsx, .xls, .ods, etc.).
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ExcelMetadata {
     /// Total number of sheets in the workbook
     pub sheet_count: usize,
@@ -121,6 +164,7 @@ pub struct ExcelMetadata {
 ///
 /// Includes sender/recipient information, message ID, and attachment list.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct EmailMetadata {
     /// Sender's email address
     #[serde(skip_serializing_if = "Option::is_none")]
@@ -149,9 +193,11 @@ pub struct EmailMetadata {
 ///
 /// Extracted from compressed archive files containing file lists and size information.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ArchiveMetadata {
     /// Archive format ("ZIP", "TAR", "7Z", etc.)
-    pub format: String,
+    #[cfg_attr(feature = "api", schema(value_type = String))]
+    pub format: Cow<'static, str>,
     /// Total number of files in the archive
     pub file_count: usize,
     /// List of file paths within the archive
@@ -168,6 +214,7 @@ pub struct ArchiveMetadata {
 ///
 /// Includes dimensions, format, and EXIF data.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ImageMetadata {
     /// Image width in pixels
     pub width: u32,
@@ -183,6 +230,7 @@ pub struct ImageMetadata {
 ///
 /// Provides statistics about XML document structure.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct XmlMetadata {
     /// Total number of XML elements processed
     pub element_count: usize,
@@ -195,6 +243,7 @@ pub struct XmlMetadata {
 /// Extracted from plain text and Markdown files. Includes word counts and,
 /// for Markdown, structural elements like headers and links.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct TextMetadata {
     /// Number of lines in the document
     pub line_count: usize,
@@ -218,6 +267,7 @@ pub struct TextMetadata {
 /// Text direction enumeration for HTML documents.
 #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 #[serde(rename_all = "lowercase")]
 pub enum TextDirection {
     /// Left-to-right text direction
@@ -233,6 +283,7 @@ pub enum TextDirection {
 /// Header/heading element metadata.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct HeaderMetadata {
     /// Header level: 1 (h1) through 6 (h6)
     pub level: u8,
@@ -249,6 +300,7 @@ pub struct HeaderMetadata {
 /// Link element metadata.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct LinkMetadata {
     /// The href URL value
     pub href: String,
@@ -262,11 +314,12 @@ pub struct LinkMetadata {
     /// Rel attribute values
     pub rel: Vec<String>,
     /// Additional attributes as key-value pairs
-    pub attributes: HashMap<String, String>,
+    pub attributes: Vec<(String, String)>,
 }
 /// Link type classification.
 #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 #[serde(rename_all = "lowercase")]
 pub enum LinkType {
     /// Anchor link (#section)
@@ -285,6 +338,7 @@ pub enum LinkType {
 /// Image element metadata.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ImageMetadataType {
     /// Image source (URL, data URI, or SVG content)
     pub src: String,
@@ -299,11 +353,12 @@ pub struct ImageMetadataType {
     /// Image type classification
     pub image_type: ImageType,
     /// Additional attributes as key-value pairs
-    pub attributes: HashMap<String, String>,
+    pub attributes: Vec<(String, String)>,
 }
 /// Image type classification.
 #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 #[serde(rename_all = "lowercase")]
 pub enum ImageType {
     /// Data URI image
@@ -320,6 +375,7 @@ pub enum ImageType {
 /// Structured data (Schema.org, microdata, RDFa) block.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct StructuredData {
     /// Type of structured data
     pub data_type: StructuredDataType,
@@ -332,6 +388,7 @@ pub struct StructuredData {
 /// Structured data type classification.
 #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 #[serde(rename_all = "lowercase")]
 pub enum StructuredDataType {
     /// JSON-LD structured data
@@ -349,6 +406,7 @@ pub enum StructuredDataType {
 /// Includes document-level metadata, Open Graph data, Twitter Card metadata,
 /// and extracted structural elements (headers, links, images, structured data).
 #[derive(Debug, Clone, Serialize, Deserialize, Default)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct HtmlMetadata {
     /// Document title from `<title>` tag
     #[serde(skip_serializing_if = "Option::is_none")]
@@ -524,6 +582,7 @@ impl From<html_to_markdown_rs::ExtendedMetadata> for HtmlMetadata {
 ///
 /// Captures information about OCR processing configuration and results.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct OcrMetadata {
     /// OCR language code(s) used
     pub language: String,
@@ -543,6 +602,7 @@ pub struct OcrMetadata {
 /// Error metadata (for batch operations).
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct ErrorMetadata {
     pub error_type: String,
     pub message: String,
@@ -552,6 +612,7 @@ pub struct ErrorMetadata {
 ///
 /// Extracted from PPTX files containing slide counts and presentation details.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct PptxMetadata {
     /// Total number of slides in the presentation
     pub slide_count: usize,

data/vendor/kreuzberg/src/types/mod.rs CHANGED Viewed

@@ -20,6 +20,8 @@ pub use tables::*;
 #[cfg(test)]
 mod tests {
     use super::*;
+    use bytes::Bytes;
+    use std::borrow::Cow;
     use std::sync::Arc;
     #[test]
@@ -38,7 +40,7 @@ mod tests {
         metadata
             .additional
-            .insert("quality_score".to_string(), serde_json::json!(1.0));
+            .insert(Cow::Borrowed("quality_score"), serde_json::json!(1.0));
         let json = serde_json::to_value(&metadata).unwrap();
         println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
@@ -162,8 +164,8 @@ mod tests {
     #[test]
     fn test_page_content_arc_images_roundtrip() {
         let image1 = Arc::new(ExtractedImage {
-            data: vec![0xFF, 0xD8, 0xFF],
-            format: "jpeg".to_string(),
+            data: Bytes::from_static(&[0xFF, 0xD8, 0xFF]),
+            format: Cow::Borrowed("jpeg"),
             image_index: 0,
             page_number: Some(1),
             width: Some(100),
@@ -176,8 +178,8 @@ mod tests {
         });
         let image2 = Arc::new(ExtractedImage {
-            data: vec![0x89, 0x50, 0x4E],
-            format: "png".to_string(),
+            data: Bytes::from_static(&[0x89, 0x50, 0x4E]),
+            format: Cow::Borrowed("png"),
             image_index: 1,
             page_number: Some(1),
             width: Some(300),

data/vendor/kreuzberg/src/types/page.rs CHANGED Viewed

@@ -15,6 +15,7 @@ use super::tables::Table;
 /// Supports different page types (PDF pages, PPTX slides, Excel sheets)
 /// with character offset boundaries for chunk-to-page mapping.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct PageStructure {
     /// Total number of pages/slides/sheets
     pub total_count: usize,
@@ -39,6 +40,7 @@ pub struct PageStructure {
 /// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 #[serde(rename_all = "snake_case")]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub enum PageUnitType {
     /// Standard document pages (PDF, DOCX, images)
     Page,
@@ -54,6 +56,7 @@ pub enum PageUnitType {
 /// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
 /// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct PageBoundary {
     /// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
     pub byte_start: usize,
@@ -68,6 +71,7 @@ pub struct PageBoundary {
 /// Captures per-page information including dimensions, content counts,
 /// and visibility state (for presentations).
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct PageInfo {
     /// Page number (1-indexed)
     pub number: usize,
@@ -108,6 +112,7 @@ pub struct PageInfo {
 /// This reduces memory overhead for documents with shared tables/images
 /// by avoiding redundant copies during serialization.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct PageContent {
     /// Page number (1-indexed)
     pub page_number: usize,
@@ -120,6 +125,7 @@ pub struct PageContent {
     /// Serializes as Vec<Table> for JSON compatibility while maintaining
     /// Arc semantics in-memory for zero-copy sharing.
     #[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
+    #[cfg_attr(feature = "api", schema(value_type = Vec<Table>))]
     pub tables: Vec<Arc<Table>>,
     /// Images found on this page (uses Arc for memory efficiency)
@@ -127,6 +133,7 @@ pub struct PageContent {
     /// Serializes as Vec<ExtractedImage> for JSON compatibility while maintaining
     /// Arc semantics in-memory for zero-copy sharing.
     #[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
+    #[cfg_attr(feature = "api", schema(value_type = Vec<ExtractedImage>))]
     pub images: Vec<Arc<ExtractedImage>>,
     /// Hierarchy information for the page (when hierarchy extraction is enabled)
@@ -141,6 +148,7 @@ pub struct PageContent {
 /// Used when PDF text hierarchy extraction is enabled. Contains hierarchical
 /// blocks with heading levels (H1-H6) for semantic document structure.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct PageHierarchy {
     /// Number of hierarchy blocks on this page
     pub block_count: usize,
@@ -155,6 +163,7 @@ pub struct PageHierarchy {
 /// Represents a block of text with semantic heading information extracted from
 /// font size clustering and hierarchical analysis.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct HierarchicalBlock {
     /// The text content of this block
     pub text: String,

data/vendor/kreuzberg/src/types/tables.rs CHANGED Viewed

@@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize};
 /// Represents a table detected and extracted from a document (PDF, image, etc.).
 /// Tables are converted to both structured cell data and Markdown format.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct Table {
     /// Table cells as a 2D vector (rows × columns)
     pub cells: Vec<Vec<String>>,
@@ -20,6 +21,7 @@ pub struct Table {
 ///
 /// Future extension point for rich table support with cell-level metadata.
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
 pub struct TableCell {
     /// Cell content as text
     pub content: String,

data/vendor/kreuzberg/tests/concurrency_stress.rs CHANGED Viewed

@@ -17,6 +17,7 @@ use kreuzberg::core::pipeline::run_pipeline;
 use kreuzberg::plugins::registry::{get_document_extractor_registry, get_post_processor_registry};
 use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
 use kreuzberg::types::{ExtractionResult, Metadata};
+use std::borrow::Cow;
 use std::sync::Arc;
 #[cfg(feature = "ocr")]
@@ -378,7 +379,7 @@ async fn test_concurrent_pipeline_processing() {
         handles.push(tokio::spawn(async move {
             let result = ExtractionResult {
                 content: format!("Content {}", i),
-                mime_type: "text/plain".to_string(),
+                mime_type: Cow::Borrowed("text/plain"),
                 metadata: Metadata::default(),
                 tables: vec![],
                 detected_languages: None,