kreuzberg 4.2.6 → 4.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +7 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
  5. data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
  7. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
  8. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
  9. data/ext/kreuzberg_rb/native/src/result.rs +5 -3
  10. data/lib/kreuzberg/version.rb +1 -1
  11. data/sig/kreuzberg.rbs +228 -37
  12. data/spec/binding/batch_operations_spec.rb +2 -0
  13. data/vendor/Cargo.toml +3 -2
  14. data/vendor/kreuzberg/Cargo.toml +2 -1
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/api/error.rs +29 -1
  17. data/vendor/kreuzberg/src/api/handlers.rs +28 -25
  18. data/vendor/kreuzberg/src/api/openapi.rs +14 -1
  19. data/vendor/kreuzberg/src/chunking/config.rs +2 -37
  20. data/vendor/kreuzberg/src/chunking/core.rs +78 -2
  21. data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
  22. data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
  23. data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
  24. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
  25. data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
  26. data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
  27. data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
  28. data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
  29. data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
  30. data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
  31. data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
  32. data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
  33. data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
  34. data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
  35. data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
  36. data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
  37. data/vendor/kreuzberg/src/extraction/email.rs +31 -19
  38. data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
  39. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
  40. data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
  41. data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
  42. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
  43. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
  44. data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
  45. data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
  47. data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
  48. data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
  49. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
  50. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
  51. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
  52. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
  53. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
  54. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
  55. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
  56. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
  57. data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
  58. data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
  59. data/vendor/kreuzberg/src/extractors/email.rs +5 -3
  60. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
  61. data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
  62. data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
  63. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
  64. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
  65. data/vendor/kreuzberg/src/extractors/html.rs +1 -1
  66. data/vendor/kreuzberg/src/extractors/image.rs +3 -3
  67. data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
  68. data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
  69. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
  70. data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
  71. data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
  72. data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
  73. data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
  74. data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
  75. data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
  76. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
  77. data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
  78. data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
  79. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
  80. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
  81. data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
  82. data/vendor/kreuzberg/src/extractors/text.rs +2 -2
  83. data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
  84. data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
  85. data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
  86. data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
  87. data/vendor/kreuzberg/src/lib.rs +1 -1
  88. data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
  89. data/vendor/kreuzberg/src/mcp/format.rs +5 -4
  90. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
  91. data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
  92. data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
  93. data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
  94. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
  95. data/vendor/kreuzberg/src/ocr/types.rs +3 -4
  96. data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
  97. data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
  98. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
  99. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
  100. data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
  101. data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
  102. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
  103. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
  104. data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
  105. data/vendor/kreuzberg/src/text/quality.rs +13 -13
  106. data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
  107. data/vendor/kreuzberg/src/types/djot.rs +15 -4
  108. data/vendor/kreuzberg/src/types/extraction.rs +24 -4
  109. data/vendor/kreuzberg/src/types/formats.rs +9 -5
  110. data/vendor/kreuzberg/src/types/metadata.rs +68 -7
  111. data/vendor/kreuzberg/src/types/mod.rs +7 -5
  112. data/vendor/kreuzberg/src/types/page.rs +9 -0
  113. data/vendor/kreuzberg/src/types/tables.rs +2 -0
  114. data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
  115. data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
  116. data/vendor/kreuzberg/tests/config_features.rs +19 -11
  117. data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
  118. data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
  119. data/vendor/kreuzberg/tests/core_integration.rs +5 -6
  120. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
  121. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
  122. data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
  123. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
  124. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
  125. data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
  126. data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
  127. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
  128. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  129. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
  130. data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
  131. data/vendor/kreuzberg-ffi/src/error.rs +56 -0
  132. data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
  133. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
  134. data/vendor/kreuzberg-ffi/src/result.rs +2 -1
  135. data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
  136. data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
  137. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
  138. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  139. metadata +2 -2
@@ -21,6 +21,8 @@ use super::metadata::Metadata;
21
21
  ///
22
22
  /// Available when the `djot` feature is enabled.
23
23
  #[derive(Debug, Clone, Serialize, Deserialize)]
24
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
25
+ #[cfg_attr(feature = "api", schema(no_recursion))]
24
26
  pub struct DjotContent {
25
27
  /// Plain text representation for backwards compatibility
26
28
  pub plain_text: String,
@@ -44,14 +46,16 @@ pub struct DjotContent {
44
46
  pub footnotes: Vec<Footnote>,
45
47
 
46
48
  /// Attributes mapped by element identifier (if present)
47
- #[serde(skip_serializing_if = "HashMap::is_empty", default)]
48
- pub attributes: HashMap<String, Attributes>,
49
+ #[serde(skip_serializing_if = "Vec::is_empty", default)]
50
+ pub attributes: Vec<(String, Attributes)>,
49
51
  }
50
52
 
51
53
  /// Block-level element in a Djot document.
52
54
  ///
53
55
  /// Represents structural elements like headings, paragraphs, lists, code blocks, etc.
54
56
  #[derive(Debug, Clone, Serialize, Deserialize)]
57
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
58
+ #[cfg_attr(feature = "api", schema(no_recursion))]
55
59
  pub struct FormattedBlock {
56
60
  /// Type of block element
57
61
  pub block_type: BlockType,
@@ -83,6 +87,7 @@ pub struct FormattedBlock {
83
87
  /// Types of block-level elements in Djot.
84
88
  #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
85
89
  #[serde(rename_all = "snake_case")]
90
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
86
91
  pub enum BlockType {
87
92
  Paragraph,
88
93
  Heading,
@@ -106,6 +111,7 @@ pub enum BlockType {
106
111
  ///
107
112
  /// Represents text with formatting, links, images, etc.
108
113
  #[derive(Debug, Clone, Serialize, Deserialize)]
114
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
109
115
  pub struct InlineElement {
110
116
  /// Type of inline element
111
117
  pub element_type: InlineType,
@@ -125,6 +131,7 @@ pub struct InlineElement {
125
131
  /// Types of inline elements in Djot.
126
132
  #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
127
133
  #[serde(rename_all = "snake_case")]
134
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
128
135
  pub enum InlineType {
129
136
  Text,
130
137
  Strong,
@@ -148,6 +155,7 @@ pub enum InlineType {
148
155
  ///
149
156
  /// Represents the attributes attached to elements using {.class #id key="value"} syntax.
150
157
  #[derive(Debug, Clone, Serialize, Deserialize, Default)]
158
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
151
159
  pub struct Attributes {
152
160
  /// Element ID (#identifier)
153
161
  #[serde(skip_serializing_if = "Option::is_none")]
@@ -158,12 +166,13 @@ pub struct Attributes {
158
166
  pub classes: Vec<String>,
159
167
 
160
168
  /// Key-value pairs (key="value")
161
- #[serde(skip_serializing_if = "HashMap::is_empty", default)]
162
- pub key_values: HashMap<String, String>,
169
+ #[serde(skip_serializing_if = "Vec::is_empty", default)]
170
+ pub key_values: Vec<(String, String)>,
163
171
  }
164
172
 
165
173
  /// Image element in Djot.
166
174
  #[derive(Debug, Clone, Serialize, Deserialize)]
175
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
167
176
  pub struct DjotImage {
168
177
  /// Image source URL or path
169
178
  pub src: String,
@@ -182,6 +191,7 @@ pub struct DjotImage {
182
191
 
183
192
  /// Link element in Djot.
184
193
  #[derive(Debug, Clone, Serialize, Deserialize)]
194
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
185
195
  pub struct DjotLink {
186
196
  /// Link URL
187
197
  pub url: String,
@@ -200,6 +210,7 @@ pub struct DjotLink {
200
210
 
201
211
  /// Footnote in Djot.
202
212
  #[derive(Debug, Clone, Serialize, Deserialize)]
213
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
203
214
  pub struct Footnote {
204
215
  /// Footnote label
205
216
  pub label: String,
@@ -1,6 +1,8 @@
1
1
  //! Core extraction types and results.
2
2
 
3
+ use bytes::Bytes;
3
4
  use serde::{Deserialize, Serialize};
5
+ use std::borrow::Cow;
4
6
  use std::collections::HashMap;
5
7
 
6
8
  use super::djot::DjotContent;
@@ -12,9 +14,12 @@ use super::tables::Table;
12
14
  ///
13
15
  /// This is the main result type returned by all extraction functions.
14
16
  #[derive(Debug, Clone, Serialize, Deserialize)]
17
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
18
+ #[cfg_attr(feature = "api", schema(no_recursion))]
15
19
  pub struct ExtractionResult {
16
20
  pub content: String,
17
- pub mime_type: String,
21
+ #[cfg_attr(feature = "api", schema(value_type = String))]
22
+ pub mime_type: Cow<'static, str>,
18
23
  pub metadata: Metadata,
19
24
  pub tables: Vec<Table>,
20
25
  #[serde(skip_serializing_if = "Option::is_none")]
@@ -75,6 +80,7 @@ pub struct ExtractionResult {
75
80
  /// contains the text content, optional embedding vector (if embedding generation
76
81
  /// is configured), and metadata about its position in the document.
77
82
  #[derive(Debug, Clone, Serialize, Deserialize)]
83
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
78
84
  pub struct Chunk {
79
85
  /// The text content of this chunk.
80
86
  pub content: String,
@@ -92,6 +98,7 @@ pub struct Chunk {
92
98
 
93
99
  /// Metadata about a chunk's position in the original document.
94
100
  #[derive(Debug, Clone, Serialize, Deserialize)]
101
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
95
102
  pub struct ChunkMetadata {
96
103
  /// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
97
104
  pub byte_start: usize,
@@ -130,12 +137,17 @@ pub struct ChunkMetadata {
130
137
  /// Raw bytes allow cross-language compatibility - users can convert to
131
138
  /// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
132
139
  #[derive(Debug, Clone, Serialize, Deserialize)]
140
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
133
141
  pub struct ExtractedImage {
134
- /// Raw image data (PNG, JPEG, WebP, etc. bytes)
135
- pub data: Vec<u8>,
142
+ /// Raw image data (PNG, JPEG, WebP, etc. bytes).
143
+ /// Uses `bytes::Bytes` for cheap cloning of large buffers.
144
+ #[cfg_attr(feature = "api", schema(value_type = Vec<u8>, format = "binary"))]
145
+ pub data: Bytes,
136
146
 
137
147
  /// Image format (e.g., "jpeg", "png", "webp")
138
- pub format: String,
148
+ /// Uses Cow<'static, str> to avoid allocation for static literals.
149
+ #[cfg_attr(feature = "api", schema(value_type = String))]
150
+ pub format: Cow<'static, str>,
139
151
 
140
152
  /// Zero-indexed position of this image in the document/page
141
153
  pub image_index: usize,
@@ -173,6 +185,7 @@ pub struct ExtractedImage {
173
185
  /// When OCR is performed on this image, the result is embedded here
174
186
  /// rather than in a separate collection, making the relationship explicit.
175
187
  #[serde(skip_serializing_if = "Option::is_none")]
188
+ #[cfg_attr(feature = "api", schema(value_type = Option<ExtractionResult>))]
176
189
  pub ocr_result: Option<Box<ExtractionResult>>,
177
190
  }
178
191
 
@@ -182,6 +195,7 @@ pub struct ExtractedImage {
182
195
 
183
196
  /// Output format selection for extraction results.
184
197
  #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
198
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
185
199
  #[serde(rename_all = "snake_case")]
186
200
  pub enum OutputFormat {
187
201
  /// Unified format with all content in `content` field
@@ -196,6 +210,8 @@ pub enum OutputFormat {
196
210
  /// Wraps a string identifier that is deterministically generated
197
211
  /// from element type, content, and page number.
198
212
  #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
213
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
214
+ #[cfg_attr(feature = "api", schema(value_type = String))]
199
215
  pub struct ElementId(String);
200
216
 
201
217
  impl ElementId {
@@ -230,6 +246,7 @@ impl std::fmt::Display for ElementId {
230
246
  /// Categorizes text content into semantic units for downstream processing.
231
247
  /// Supports the element types commonly found in Unstructured documents.
232
248
  #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
249
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
233
250
  #[serde(rename_all = "snake_case")]
234
251
  pub enum ElementType {
235
252
  /// Document title
@@ -258,6 +275,7 @@ pub enum ElementType {
258
275
 
259
276
  /// Bounding box coordinates for element positioning.
260
277
  #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
278
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
261
279
  pub struct BoundingBox {
262
280
  /// Left x-coordinate
263
281
  pub x0: f64,
@@ -271,6 +289,7 @@ pub struct BoundingBox {
271
289
 
272
290
  /// Metadata for a semantic element.
273
291
  #[derive(Debug, Clone, Serialize, Deserialize)]
292
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
274
293
  pub struct ElementMetadata {
275
294
  /// Page number (1-indexed)
276
295
  pub page_number: Option<usize>,
@@ -289,6 +308,7 @@ pub struct ElementMetadata {
289
308
  /// Represents a logical unit of content with semantic classification,
290
309
  /// unique identifier, and metadata for tracking origin and position.
291
310
  #[derive(Debug, Clone, Serialize, Deserialize)]
311
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
292
312
  pub struct Element {
293
313
  /// Unique element identifier
294
314
  pub element_id: ElementId,
@@ -1,6 +1,8 @@
1
1
  //! Format-specific extraction results and OCR configuration types.
2
2
 
3
+ use bytes::Bytes;
3
4
  use serde::{Deserialize, Serialize};
5
+ use std::borrow::Cow;
4
6
  use std::collections::HashMap;
5
7
 
6
8
  use super::extraction::ExtractedImage;
@@ -153,8 +155,9 @@ pub struct EmailAttachment {
153
155
  pub size: Option<usize>,
154
156
  /// Whether this attachment is an image
155
157
  pub is_image: bool,
156
- /// Attachment data (if extracted)
157
- pub data: Option<Vec<u8>>,
158
+ /// Attachment data (if extracted).
159
+ /// Uses `bytes::Bytes` for cheap cloning of large buffers.
160
+ pub data: Option<Bytes>,
158
161
  }
159
162
 
160
163
  /// OCR extraction result.
@@ -351,6 +354,7 @@ impl Default for TesseractConfig {
351
354
  /// Tracks the transformations applied to an image during OCR preprocessing,
352
355
  /// including DPI normalization, resizing, and resampling.
353
356
  #[derive(Debug, Clone, Serialize, Deserialize)]
357
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
354
358
  pub struct ImagePreprocessingMetadata {
355
359
  /// Original image dimensions (width, height) in pixels
356
360
  pub original_dimensions: (usize, usize),
@@ -435,9 +439,9 @@ pub struct LibreOfficeConversionResult {
435
439
  /// Converted file bytes
436
440
  pub converted_bytes: Vec<u8>,
437
441
  /// Original format identifier
438
- pub original_format: String,
442
+ pub original_format: Cow<'static, str>,
439
443
  /// Target format identifier
440
- pub target_format: String,
444
+ pub target_format: Cow<'static, str>,
441
445
  /// Target MIME type after conversion
442
- pub target_mime: String,
446
+ pub target_mime: Cow<'static, str>,
443
447
  }
@@ -2,7 +2,10 @@
2
2
  //!
3
3
  //! This module defines metadata structures for various document formats.
4
4
 
5
- use serde::{Deserialize, Serialize};
5
+ use std::borrow::Cow;
6
+
7
+ use ahash::AHashMap;
8
+ use serde::{Deserialize, Deserializer, Serialize, Serializer};
6
9
  use std::collections::{BTreeMap, HashMap};
7
10
 
8
11
  #[cfg(feature = "pdf")]
@@ -11,11 +14,41 @@ use crate::pdf::metadata::PdfMetadata;
11
14
  use super::formats::ImagePreprocessingMetadata;
12
15
  use super::page::PageStructure;
13
16
 
17
+ /// Custom serialization and deserialization for AHashMap<Cow<'static, str>, Value>.
18
+ ///
19
+ /// serde doesn't natively support serializing Cow keys, so we convert to/from
20
+ /// a HashMap<String, Value> for the wire format, while keeping the in-memory
21
+ /// representation optimized with Cow keys (avoiding allocations for static strings).
22
+ mod additional_serde {
23
+ use super::*;
24
+
25
+ pub fn serialize<S>(map: &AHashMap<Cow<'static, str>, serde_json::Value>, serializer: S) -> Result<S::Ok, S::Error>
26
+ where
27
+ S: Serializer,
28
+ {
29
+ // Convert to HashMap for serialization
30
+ let converted: HashMap<String, serde_json::Value> =
31
+ map.iter().map(|(k, v)| (k.to_string(), v.clone())).collect();
32
+ converted.serialize(serializer)
33
+ }
34
+
35
+ pub fn deserialize<'de, D>(deserializer: D) -> Result<AHashMap<Cow<'static, str>, serde_json::Value>, D::Error>
36
+ where
37
+ D: Deserializer<'de>,
38
+ {
39
+ // Deserialize from HashMap
40
+ let map = HashMap::<String, serde_json::Value>::deserialize(deserializer)?;
41
+ let result = map.into_iter().map(|(k, v)| (Cow::Owned(k), v)).collect();
42
+ Ok(result)
43
+ }
44
+ }
45
+
14
46
  /// Format-specific metadata (discriminated union).
15
47
  ///
16
48
  /// Only one format type can exist per extraction result. This provides
17
49
  /// type-safe, clean metadata without nested optionals.
18
50
  #[derive(Debug, Clone, Serialize, Deserialize)]
51
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
19
52
  #[serde(tag = "format_type", rename_all = "snake_case")]
20
53
  pub enum FormatMetadata {
21
54
  #[cfg(feature = "pdf")]
@@ -27,6 +60,7 @@ pub enum FormatMetadata {
27
60
  Image(ImageMetadata),
28
61
  Xml(XmlMetadata),
29
62
  Text(TextMetadata),
63
+ #[cfg_attr(feature = "api", schema(value_type = HtmlMetadata))]
30
64
  Html(Box<HtmlMetadata>),
31
65
  Ocr(OcrMetadata),
32
66
  }
@@ -36,6 +70,7 @@ pub enum FormatMetadata {
36
70
  /// Contains common fields applicable to all formats, format-specific metadata
37
71
  /// via a discriminated union, and additional custom fields from postprocessors.
38
72
  #[derive(Debug, Clone, Serialize, Deserialize, Default)]
73
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
39
74
  pub struct Metadata {
40
75
  /// Document title
41
76
  #[serde(skip_serializing_if = "Option::is_none")]
@@ -82,6 +117,7 @@ pub struct Metadata {
82
117
  /// Contains detailed metadata specific to the document format.
83
118
  /// Serializes with a `format_type` discriminator field.
84
119
  #[serde(flatten, skip_serializing_if = "Option::is_none")]
120
+ #[cfg_attr(feature = "api", schema(value_type = Option<Object>))]
85
121
  pub format: Option<FormatMetadata>,
86
122
 
87
123
  /// Image preprocessing metadata (when OCR preprocessing was applied)
@@ -98,11 +134,17 @@ pub struct Metadata {
98
134
 
99
135
  /// Additional custom fields from postprocessors.
100
136
  ///
101
- /// This flattened HashMap allows Python/TypeScript postprocessors to add
137
+ /// This flattened map allows Python/TypeScript postprocessors to add
102
138
  /// arbitrary fields (entity extraction, keyword extraction, etc.).
103
139
  /// Fields are merged at the root level during serialization.
104
- #[serde(flatten)]
105
- pub additional: HashMap<String, serde_json::Value>,
140
+ /// Uses `Cow<'static, str>` keys so static string keys avoid allocation.
141
+ #[serde(
142
+ flatten,
143
+ serialize_with = "additional_serde::serialize",
144
+ deserialize_with = "additional_serde::deserialize"
145
+ )]
146
+ #[cfg_attr(feature = "api", schema(value_type = HashMap<String, serde_json::Value>))]
147
+ pub additional: AHashMap<Cow<'static, str>, serde_json::Value>,
106
148
  }
107
149
 
108
150
  /// Excel/spreadsheet metadata.
@@ -110,6 +152,7 @@ pub struct Metadata {
110
152
  /// Contains information about sheets in Excel, LibreOffice Calc, and other
111
153
  /// spreadsheet formats (.xlsx, .xls, .ods, etc.).
112
154
  #[derive(Debug, Clone, Serialize, Deserialize)]
155
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
113
156
  pub struct ExcelMetadata {
114
157
  /// Total number of sheets in the workbook
115
158
  pub sheet_count: usize,
@@ -121,6 +164,7 @@ pub struct ExcelMetadata {
121
164
  ///
122
165
  /// Includes sender/recipient information, message ID, and attachment list.
123
166
  #[derive(Debug, Clone, Serialize, Deserialize)]
167
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
124
168
  pub struct EmailMetadata {
125
169
  /// Sender's email address
126
170
  #[serde(skip_serializing_if = "Option::is_none")]
@@ -149,9 +193,11 @@ pub struct EmailMetadata {
149
193
  ///
150
194
  /// Extracted from compressed archive files containing file lists and size information.
151
195
  #[derive(Debug, Clone, Serialize, Deserialize)]
196
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
152
197
  pub struct ArchiveMetadata {
153
198
  /// Archive format ("ZIP", "TAR", "7Z", etc.)
154
- pub format: String,
199
+ #[cfg_attr(feature = "api", schema(value_type = String))]
200
+ pub format: Cow<'static, str>,
155
201
  /// Total number of files in the archive
156
202
  pub file_count: usize,
157
203
  /// List of file paths within the archive
@@ -168,6 +214,7 @@ pub struct ArchiveMetadata {
168
214
  ///
169
215
  /// Includes dimensions, format, and EXIF data.
170
216
  #[derive(Debug, Clone, Serialize, Deserialize)]
217
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
171
218
  pub struct ImageMetadata {
172
219
  /// Image width in pixels
173
220
  pub width: u32,
@@ -183,6 +230,7 @@ pub struct ImageMetadata {
183
230
  ///
184
231
  /// Provides statistics about XML document structure.
185
232
  #[derive(Debug, Clone, Serialize, Deserialize)]
233
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
186
234
  pub struct XmlMetadata {
187
235
  /// Total number of XML elements processed
188
236
  pub element_count: usize,
@@ -195,6 +243,7 @@ pub struct XmlMetadata {
195
243
  /// Extracted from plain text and Markdown files. Includes word counts and,
196
244
  /// for Markdown, structural elements like headers and links.
197
245
  #[derive(Debug, Clone, Serialize, Deserialize)]
246
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
198
247
  pub struct TextMetadata {
199
248
  /// Number of lines in the document
200
249
  pub line_count: usize,
@@ -218,6 +267,7 @@ pub struct TextMetadata {
218
267
 
219
268
  /// Text direction enumeration for HTML documents.
220
269
  #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
270
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
221
271
  #[serde(rename_all = "lowercase")]
222
272
  pub enum TextDirection {
223
273
  /// Left-to-right text direction
@@ -233,6 +283,7 @@ pub enum TextDirection {
233
283
 
234
284
  /// Header/heading element metadata.
235
285
  #[derive(Debug, Clone, Serialize, Deserialize)]
286
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
236
287
  pub struct HeaderMetadata {
237
288
  /// Header level: 1 (h1) through 6 (h6)
238
289
  pub level: u8,
@@ -249,6 +300,7 @@ pub struct HeaderMetadata {
249
300
 
250
301
  /// Link element metadata.
251
302
  #[derive(Debug, Clone, Serialize, Deserialize)]
303
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
252
304
  pub struct LinkMetadata {
253
305
  /// The href URL value
254
306
  pub href: String,
@@ -262,11 +314,12 @@ pub struct LinkMetadata {
262
314
  /// Rel attribute values
263
315
  pub rel: Vec<String>,
264
316
  /// Additional attributes as key-value pairs
265
- pub attributes: HashMap<String, String>,
317
+ pub attributes: Vec<(String, String)>,
266
318
  }
267
319
 
268
320
  /// Link type classification.
269
321
  #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
322
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
270
323
  #[serde(rename_all = "lowercase")]
271
324
  pub enum LinkType {
272
325
  /// Anchor link (#section)
@@ -285,6 +338,7 @@ pub enum LinkType {
285
338
 
286
339
  /// Image element metadata.
287
340
  #[derive(Debug, Clone, Serialize, Deserialize)]
341
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
288
342
  pub struct ImageMetadataType {
289
343
  /// Image source (URL, data URI, or SVG content)
290
344
  pub src: String,
@@ -299,11 +353,12 @@ pub struct ImageMetadataType {
299
353
  /// Image type classification
300
354
  pub image_type: ImageType,
301
355
  /// Additional attributes as key-value pairs
302
- pub attributes: HashMap<String, String>,
356
+ pub attributes: Vec<(String, String)>,
303
357
  }
304
358
 
305
359
  /// Image type classification.
306
360
  #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
361
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
307
362
  #[serde(rename_all = "lowercase")]
308
363
  pub enum ImageType {
309
364
  /// Data URI image
@@ -320,6 +375,7 @@ pub enum ImageType {
320
375
 
321
376
  /// Structured data (Schema.org, microdata, RDFa) block.
322
377
  #[derive(Debug, Clone, Serialize, Deserialize)]
378
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
323
379
  pub struct StructuredData {
324
380
  /// Type of structured data
325
381
  pub data_type: StructuredDataType,
@@ -332,6 +388,7 @@ pub struct StructuredData {
332
388
 
333
389
  /// Structured data type classification.
334
390
  #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
391
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
335
392
  #[serde(rename_all = "lowercase")]
336
393
  pub enum StructuredDataType {
337
394
  /// JSON-LD structured data
@@ -349,6 +406,7 @@ pub enum StructuredDataType {
349
406
  /// Includes document-level metadata, Open Graph data, Twitter Card metadata,
350
407
  /// and extracted structural elements (headers, links, images, structured data).
351
408
  #[derive(Debug, Clone, Serialize, Deserialize, Default)]
409
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
352
410
  pub struct HtmlMetadata {
353
411
  /// Document title from `<title>` tag
354
412
  #[serde(skip_serializing_if = "Option::is_none")]
@@ -524,6 +582,7 @@ impl From<html_to_markdown_rs::ExtendedMetadata> for HtmlMetadata {
524
582
  ///
525
583
  /// Captures information about OCR processing configuration and results.
526
584
  #[derive(Debug, Clone, Serialize, Deserialize)]
585
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
527
586
  pub struct OcrMetadata {
528
587
  /// OCR language code(s) used
529
588
  pub language: String,
@@ -543,6 +602,7 @@ pub struct OcrMetadata {
543
602
 
544
603
  /// Error metadata (for batch operations).
545
604
  #[derive(Debug, Clone, Serialize, Deserialize)]
605
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
546
606
  pub struct ErrorMetadata {
547
607
  pub error_type: String,
548
608
  pub message: String,
@@ -552,6 +612,7 @@ pub struct ErrorMetadata {
552
612
  ///
553
613
  /// Extracted from PPTX files containing slide counts and presentation details.
554
614
  #[derive(Debug, Clone, Serialize, Deserialize)]
615
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
555
616
  pub struct PptxMetadata {
556
617
  /// Total number of slides in the presentation
557
618
  pub slide_count: usize,
@@ -20,6 +20,8 @@ pub use tables::*;
20
20
  #[cfg(test)]
21
21
  mod tests {
22
22
  use super::*;
23
+ use bytes::Bytes;
24
+ use std::borrow::Cow;
23
25
  use std::sync::Arc;
24
26
 
25
27
  #[test]
@@ -38,7 +40,7 @@ mod tests {
38
40
 
39
41
  metadata
40
42
  .additional
41
- .insert("quality_score".to_string(), serde_json::json!(1.0));
43
+ .insert(Cow::Borrowed("quality_score"), serde_json::json!(1.0));
42
44
 
43
45
  let json = serde_json::to_value(&metadata).unwrap();
44
46
  println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
@@ -162,8 +164,8 @@ mod tests {
162
164
  #[test]
163
165
  fn test_page_content_arc_images_roundtrip() {
164
166
  let image1 = Arc::new(ExtractedImage {
165
- data: vec![0xFF, 0xD8, 0xFF],
166
- format: "jpeg".to_string(),
167
+ data: Bytes::from_static(&[0xFF, 0xD8, 0xFF]),
168
+ format: Cow::Borrowed("jpeg"),
167
169
  image_index: 0,
168
170
  page_number: Some(1),
169
171
  width: Some(100),
@@ -176,8 +178,8 @@ mod tests {
176
178
  });
177
179
 
178
180
  let image2 = Arc::new(ExtractedImage {
179
- data: vec![0x89, 0x50, 0x4E],
180
- format: "png".to_string(),
181
+ data: Bytes::from_static(&[0x89, 0x50, 0x4E]),
182
+ format: Cow::Borrowed("png"),
181
183
  image_index: 1,
182
184
  page_number: Some(1),
183
185
  width: Some(300),
@@ -15,6 +15,7 @@ use super::tables::Table;
15
15
  /// Supports different page types (PDF pages, PPTX slides, Excel sheets)
16
16
  /// with character offset boundaries for chunk-to-page mapping.
17
17
  #[derive(Debug, Clone, Serialize, Deserialize)]
18
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
18
19
  pub struct PageStructure {
19
20
  /// Total number of pages/slides/sheets
20
21
  pub total_count: usize,
@@ -39,6 +40,7 @@ pub struct PageStructure {
39
40
  /// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
40
41
  #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
41
42
  #[serde(rename_all = "snake_case")]
43
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
42
44
  pub enum PageUnitType {
43
45
  /// Standard document pages (PDF, DOCX, images)
44
46
  Page,
@@ -54,6 +56,7 @@ pub enum PageUnitType {
54
56
  /// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
55
57
  /// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
56
58
  #[derive(Debug, Clone, Serialize, Deserialize)]
59
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
57
60
  pub struct PageBoundary {
58
61
  /// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
59
62
  pub byte_start: usize,
@@ -68,6 +71,7 @@ pub struct PageBoundary {
68
71
  /// Captures per-page information including dimensions, content counts,
69
72
  /// and visibility state (for presentations).
70
73
  #[derive(Debug, Clone, Serialize, Deserialize)]
74
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
71
75
  pub struct PageInfo {
72
76
  /// Page number (1-indexed)
73
77
  pub number: usize,
@@ -108,6 +112,7 @@ pub struct PageInfo {
108
112
  /// This reduces memory overhead for documents with shared tables/images
109
113
  /// by avoiding redundant copies during serialization.
110
114
  #[derive(Debug, Clone, Serialize, Deserialize)]
115
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
111
116
  pub struct PageContent {
112
117
  /// Page number (1-indexed)
113
118
  pub page_number: usize,
@@ -120,6 +125,7 @@ pub struct PageContent {
120
125
  /// Serializes as Vec<Table> for JSON compatibility while maintaining
121
126
  /// Arc semantics in-memory for zero-copy sharing.
122
127
  #[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
128
+ #[cfg_attr(feature = "api", schema(value_type = Vec<Table>))]
123
129
  pub tables: Vec<Arc<Table>>,
124
130
 
125
131
  /// Images found on this page (uses Arc for memory efficiency)
@@ -127,6 +133,7 @@ pub struct PageContent {
127
133
  /// Serializes as Vec<ExtractedImage> for JSON compatibility while maintaining
128
134
  /// Arc semantics in-memory for zero-copy sharing.
129
135
  #[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
136
+ #[cfg_attr(feature = "api", schema(value_type = Vec<ExtractedImage>))]
130
137
  pub images: Vec<Arc<ExtractedImage>>,
131
138
 
132
139
  /// Hierarchy information for the page (when hierarchy extraction is enabled)
@@ -141,6 +148,7 @@ pub struct PageContent {
141
148
  /// Used when PDF text hierarchy extraction is enabled. Contains hierarchical
142
149
  /// blocks with heading levels (H1-H6) for semantic document structure.
143
150
  #[derive(Debug, Clone, Serialize, Deserialize)]
151
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
144
152
  pub struct PageHierarchy {
145
153
  /// Number of hierarchy blocks on this page
146
154
  pub block_count: usize,
@@ -155,6 +163,7 @@ pub struct PageHierarchy {
155
163
  /// Represents a block of text with semantic heading information extracted from
156
164
  /// font size clustering and hierarchical analysis.
157
165
  #[derive(Debug, Clone, Serialize, Deserialize)]
166
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
158
167
  pub struct HierarchicalBlock {
159
168
  /// The text content of this block
160
169
  pub text: String,
@@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize};
7
7
  /// Represents a table detected and extracted from a document (PDF, image, etc.).
8
8
  /// Tables are converted to both structured cell data and Markdown format.
9
9
  #[derive(Debug, Clone, Serialize, Deserialize)]
10
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
10
11
  pub struct Table {
11
12
  /// Table cells as a 2D vector (rows × columns)
12
13
  pub cells: Vec<Vec<String>>,
@@ -20,6 +21,7 @@ pub struct Table {
20
21
  ///
21
22
  /// Future extension point for rich table support with cell-level metadata.
22
23
  #[derive(Debug, Clone, Serialize, Deserialize)]
24
+ #[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
23
25
  pub struct TableCell {
24
26
  /// Cell content as text
25
27
  pub content: String,
@@ -17,6 +17,7 @@ use kreuzberg::core::pipeline::run_pipeline;
17
17
  use kreuzberg::plugins::registry::{get_document_extractor_registry, get_post_processor_registry};
18
18
  use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
19
19
  use kreuzberg::types::{ExtractionResult, Metadata};
20
+ use std::borrow::Cow;
20
21
  use std::sync::Arc;
21
22
 
22
23
  #[cfg(feature = "ocr")]
@@ -378,7 +379,7 @@ async fn test_concurrent_pipeline_processing() {
378
379
  handles.push(tokio::spawn(async move {
379
380
  let result = ExtractionResult {
380
381
  content: format!("Content {}", i),
381
- mime_type: "text/plain".to_string(),
382
+ mime_type: Cow::Borrowed("text/plain"),
382
383
  metadata: Metadata::default(),
383
384
  tables: vec![],
384
385
  detected_languages: None,