kreuzberg 4.2.6 → 4.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
- data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
- data/ext/kreuzberg_rb/native/src/result.rs +5 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +228 -37
- data/spec/binding/batch_operations_spec.rb +2 -0
- data/vendor/Cargo.toml +3 -2
- data/vendor/kreuzberg/Cargo.toml +2 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +29 -1
- data/vendor/kreuzberg/src/api/handlers.rs +28 -25
- data/vendor/kreuzberg/src/api/openapi.rs +14 -1
- data/vendor/kreuzberg/src/chunking/config.rs +2 -37
- data/vendor/kreuzberg/src/chunking/core.rs +78 -2
- data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
- data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
- data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
- data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
- data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
- data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
- data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
- data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
- data/vendor/kreuzberg/src/extraction/email.rs +31 -19
- data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
- data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
- data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
- data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
- data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
- data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
- data/vendor/kreuzberg/src/extractors/email.rs +5 -3
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
- data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
- data/vendor/kreuzberg/src/extractors/html.rs +1 -1
- data/vendor/kreuzberg/src/extractors/image.rs +3 -3
- data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
- data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
- data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
- data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
- data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
- data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
- data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
- data/vendor/kreuzberg/src/extractors/text.rs +2 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
- data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
- data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
- data/vendor/kreuzberg/src/lib.rs +1 -1
- data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
- data/vendor/kreuzberg/src/mcp/format.rs +5 -4
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
- data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
- data/vendor/kreuzberg/src/ocr/types.rs +3 -4
- data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
- data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
- data/vendor/kreuzberg/src/text/quality.rs +13 -13
- data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
- data/vendor/kreuzberg/src/types/djot.rs +15 -4
- data/vendor/kreuzberg/src/types/extraction.rs +24 -4
- data/vendor/kreuzberg/src/types/formats.rs +9 -5
- data/vendor/kreuzberg/src/types/metadata.rs +68 -7
- data/vendor/kreuzberg/src/types/mod.rs +7 -5
- data/vendor/kreuzberg/src/types/page.rs +9 -0
- data/vendor/kreuzberg/src/types/tables.rs +2 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
- data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
- data/vendor/kreuzberg/tests/config_features.rs +19 -11
- data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
- data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
- data/vendor/kreuzberg/tests/core_integration.rs +5 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
- data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
- data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
- data/vendor/kreuzberg-ffi/src/error.rs +56 -0
- data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
- data/vendor/kreuzberg-ffi/src/result.rs +2 -1
- data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
- data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +2 -2
|
@@ -21,6 +21,8 @@ use super::metadata::Metadata;
|
|
|
21
21
|
///
|
|
22
22
|
/// Available when the `djot` feature is enabled.
|
|
23
23
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
24
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
25
|
+
#[cfg_attr(feature = "api", schema(no_recursion))]
|
|
24
26
|
pub struct DjotContent {
|
|
25
27
|
/// Plain text representation for backwards compatibility
|
|
26
28
|
pub plain_text: String,
|
|
@@ -44,14 +46,16 @@ pub struct DjotContent {
|
|
|
44
46
|
pub footnotes: Vec<Footnote>,
|
|
45
47
|
|
|
46
48
|
/// Attributes mapped by element identifier (if present)
|
|
47
|
-
#[serde(skip_serializing_if = "
|
|
48
|
-
pub attributes:
|
|
49
|
+
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
50
|
+
pub attributes: Vec<(String, Attributes)>,
|
|
49
51
|
}
|
|
50
52
|
|
|
51
53
|
/// Block-level element in a Djot document.
|
|
52
54
|
///
|
|
53
55
|
/// Represents structural elements like headings, paragraphs, lists, code blocks, etc.
|
|
54
56
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
57
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
58
|
+
#[cfg_attr(feature = "api", schema(no_recursion))]
|
|
55
59
|
pub struct FormattedBlock {
|
|
56
60
|
/// Type of block element
|
|
57
61
|
pub block_type: BlockType,
|
|
@@ -83,6 +87,7 @@ pub struct FormattedBlock {
|
|
|
83
87
|
/// Types of block-level elements in Djot.
|
|
84
88
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
|
85
89
|
#[serde(rename_all = "snake_case")]
|
|
90
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
86
91
|
pub enum BlockType {
|
|
87
92
|
Paragraph,
|
|
88
93
|
Heading,
|
|
@@ -106,6 +111,7 @@ pub enum BlockType {
|
|
|
106
111
|
///
|
|
107
112
|
/// Represents text with formatting, links, images, etc.
|
|
108
113
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
114
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
109
115
|
pub struct InlineElement {
|
|
110
116
|
/// Type of inline element
|
|
111
117
|
pub element_type: InlineType,
|
|
@@ -125,6 +131,7 @@ pub struct InlineElement {
|
|
|
125
131
|
/// Types of inline elements in Djot.
|
|
126
132
|
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
|
127
133
|
#[serde(rename_all = "snake_case")]
|
|
134
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
128
135
|
pub enum InlineType {
|
|
129
136
|
Text,
|
|
130
137
|
Strong,
|
|
@@ -148,6 +155,7 @@ pub enum InlineType {
|
|
|
148
155
|
///
|
|
149
156
|
/// Represents the attributes attached to elements using {.class #id key="value"} syntax.
|
|
150
157
|
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
158
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
151
159
|
pub struct Attributes {
|
|
152
160
|
/// Element ID (#identifier)
|
|
153
161
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
@@ -158,12 +166,13 @@ pub struct Attributes {
|
|
|
158
166
|
pub classes: Vec<String>,
|
|
159
167
|
|
|
160
168
|
/// Key-value pairs (key="value")
|
|
161
|
-
#[serde(skip_serializing_if = "
|
|
162
|
-
pub key_values:
|
|
169
|
+
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
170
|
+
pub key_values: Vec<(String, String)>,
|
|
163
171
|
}
|
|
164
172
|
|
|
165
173
|
/// Image element in Djot.
|
|
166
174
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
175
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
167
176
|
pub struct DjotImage {
|
|
168
177
|
/// Image source URL or path
|
|
169
178
|
pub src: String,
|
|
@@ -182,6 +191,7 @@ pub struct DjotImage {
|
|
|
182
191
|
|
|
183
192
|
/// Link element in Djot.
|
|
184
193
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
194
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
185
195
|
pub struct DjotLink {
|
|
186
196
|
/// Link URL
|
|
187
197
|
pub url: String,
|
|
@@ -200,6 +210,7 @@ pub struct DjotLink {
|
|
|
200
210
|
|
|
201
211
|
/// Footnote in Djot.
|
|
202
212
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
213
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
203
214
|
pub struct Footnote {
|
|
204
215
|
/// Footnote label
|
|
205
216
|
pub label: String,
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
//! Core extraction types and results.
|
|
2
2
|
|
|
3
|
+
use bytes::Bytes;
|
|
3
4
|
use serde::{Deserialize, Serialize};
|
|
5
|
+
use std::borrow::Cow;
|
|
4
6
|
use std::collections::HashMap;
|
|
5
7
|
|
|
6
8
|
use super::djot::DjotContent;
|
|
@@ -12,9 +14,12 @@ use super::tables::Table;
|
|
|
12
14
|
///
|
|
13
15
|
/// This is the main result type returned by all extraction functions.
|
|
14
16
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
17
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
18
|
+
#[cfg_attr(feature = "api", schema(no_recursion))]
|
|
15
19
|
pub struct ExtractionResult {
|
|
16
20
|
pub content: String,
|
|
17
|
-
|
|
21
|
+
#[cfg_attr(feature = "api", schema(value_type = String))]
|
|
22
|
+
pub mime_type: Cow<'static, str>,
|
|
18
23
|
pub metadata: Metadata,
|
|
19
24
|
pub tables: Vec<Table>,
|
|
20
25
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
@@ -75,6 +80,7 @@ pub struct ExtractionResult {
|
|
|
75
80
|
/// contains the text content, optional embedding vector (if embedding generation
|
|
76
81
|
/// is configured), and metadata about its position in the document.
|
|
77
82
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
83
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
78
84
|
pub struct Chunk {
|
|
79
85
|
/// The text content of this chunk.
|
|
80
86
|
pub content: String,
|
|
@@ -92,6 +98,7 @@ pub struct Chunk {
|
|
|
92
98
|
|
|
93
99
|
/// Metadata about a chunk's position in the original document.
|
|
94
100
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
101
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
95
102
|
pub struct ChunkMetadata {
|
|
96
103
|
/// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
|
|
97
104
|
pub byte_start: usize,
|
|
@@ -130,12 +137,17 @@ pub struct ChunkMetadata {
|
|
|
130
137
|
/// Raw bytes allow cross-language compatibility - users can convert to
|
|
131
138
|
/// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
|
|
132
139
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
140
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
133
141
|
pub struct ExtractedImage {
|
|
134
|
-
/// Raw image data (PNG, JPEG, WebP, etc. bytes)
|
|
135
|
-
|
|
142
|
+
/// Raw image data (PNG, JPEG, WebP, etc. bytes).
|
|
143
|
+
/// Uses `bytes::Bytes` for cheap cloning of large buffers.
|
|
144
|
+
#[cfg_attr(feature = "api", schema(value_type = Vec<u8>, format = "binary"))]
|
|
145
|
+
pub data: Bytes,
|
|
136
146
|
|
|
137
147
|
/// Image format (e.g., "jpeg", "png", "webp")
|
|
138
|
-
|
|
148
|
+
/// Uses Cow<'static, str> to avoid allocation for static literals.
|
|
149
|
+
#[cfg_attr(feature = "api", schema(value_type = String))]
|
|
150
|
+
pub format: Cow<'static, str>,
|
|
139
151
|
|
|
140
152
|
/// Zero-indexed position of this image in the document/page
|
|
141
153
|
pub image_index: usize,
|
|
@@ -173,6 +185,7 @@ pub struct ExtractedImage {
|
|
|
173
185
|
/// When OCR is performed on this image, the result is embedded here
|
|
174
186
|
/// rather than in a separate collection, making the relationship explicit.
|
|
175
187
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
188
|
+
#[cfg_attr(feature = "api", schema(value_type = Option<ExtractionResult>))]
|
|
176
189
|
pub ocr_result: Option<Box<ExtractionResult>>,
|
|
177
190
|
}
|
|
178
191
|
|
|
@@ -182,6 +195,7 @@ pub struct ExtractedImage {
|
|
|
182
195
|
|
|
183
196
|
/// Output format selection for extraction results.
|
|
184
197
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
|
|
198
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
185
199
|
#[serde(rename_all = "snake_case")]
|
|
186
200
|
pub enum OutputFormat {
|
|
187
201
|
/// Unified format with all content in `content` field
|
|
@@ -196,6 +210,8 @@ pub enum OutputFormat {
|
|
|
196
210
|
/// Wraps a string identifier that is deterministically generated
|
|
197
211
|
/// from element type, content, and page number.
|
|
198
212
|
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
|
213
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
214
|
+
#[cfg_attr(feature = "api", schema(value_type = String))]
|
|
199
215
|
pub struct ElementId(String);
|
|
200
216
|
|
|
201
217
|
impl ElementId {
|
|
@@ -230,6 +246,7 @@ impl std::fmt::Display for ElementId {
|
|
|
230
246
|
/// Categorizes text content into semantic units for downstream processing.
|
|
231
247
|
/// Supports the element types commonly found in Unstructured documents.
|
|
232
248
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
|
249
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
233
250
|
#[serde(rename_all = "snake_case")]
|
|
234
251
|
pub enum ElementType {
|
|
235
252
|
/// Document title
|
|
@@ -258,6 +275,7 @@ pub enum ElementType {
|
|
|
258
275
|
|
|
259
276
|
/// Bounding box coordinates for element positioning.
|
|
260
277
|
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
|
278
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
261
279
|
pub struct BoundingBox {
|
|
262
280
|
/// Left x-coordinate
|
|
263
281
|
pub x0: f64,
|
|
@@ -271,6 +289,7 @@ pub struct BoundingBox {
|
|
|
271
289
|
|
|
272
290
|
/// Metadata for a semantic element.
|
|
273
291
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
292
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
274
293
|
pub struct ElementMetadata {
|
|
275
294
|
/// Page number (1-indexed)
|
|
276
295
|
pub page_number: Option<usize>,
|
|
@@ -289,6 +308,7 @@ pub struct ElementMetadata {
|
|
|
289
308
|
/// Represents a logical unit of content with semantic classification,
|
|
290
309
|
/// unique identifier, and metadata for tracking origin and position.
|
|
291
310
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
311
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
292
312
|
pub struct Element {
|
|
293
313
|
/// Unique element identifier
|
|
294
314
|
pub element_id: ElementId,
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
//! Format-specific extraction results and OCR configuration types.
|
|
2
2
|
|
|
3
|
+
use bytes::Bytes;
|
|
3
4
|
use serde::{Deserialize, Serialize};
|
|
5
|
+
use std::borrow::Cow;
|
|
4
6
|
use std::collections::HashMap;
|
|
5
7
|
|
|
6
8
|
use super::extraction::ExtractedImage;
|
|
@@ -153,8 +155,9 @@ pub struct EmailAttachment {
|
|
|
153
155
|
pub size: Option<usize>,
|
|
154
156
|
/// Whether this attachment is an image
|
|
155
157
|
pub is_image: bool,
|
|
156
|
-
/// Attachment data (if extracted)
|
|
157
|
-
|
|
158
|
+
/// Attachment data (if extracted).
|
|
159
|
+
/// Uses `bytes::Bytes` for cheap cloning of large buffers.
|
|
160
|
+
pub data: Option<Bytes>,
|
|
158
161
|
}
|
|
159
162
|
|
|
160
163
|
/// OCR extraction result.
|
|
@@ -351,6 +354,7 @@ impl Default for TesseractConfig {
|
|
|
351
354
|
/// Tracks the transformations applied to an image during OCR preprocessing,
|
|
352
355
|
/// including DPI normalization, resizing, and resampling.
|
|
353
356
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
357
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
354
358
|
pub struct ImagePreprocessingMetadata {
|
|
355
359
|
/// Original image dimensions (width, height) in pixels
|
|
356
360
|
pub original_dimensions: (usize, usize),
|
|
@@ -435,9 +439,9 @@ pub struct LibreOfficeConversionResult {
|
|
|
435
439
|
/// Converted file bytes
|
|
436
440
|
pub converted_bytes: Vec<u8>,
|
|
437
441
|
/// Original format identifier
|
|
438
|
-
pub original_format:
|
|
442
|
+
pub original_format: Cow<'static, str>,
|
|
439
443
|
/// Target format identifier
|
|
440
|
-
pub target_format:
|
|
444
|
+
pub target_format: Cow<'static, str>,
|
|
441
445
|
/// Target MIME type after conversion
|
|
442
|
-
pub target_mime:
|
|
446
|
+
pub target_mime: Cow<'static, str>,
|
|
443
447
|
}
|
|
@@ -2,7 +2,10 @@
|
|
|
2
2
|
//!
|
|
3
3
|
//! This module defines metadata structures for various document formats.
|
|
4
4
|
|
|
5
|
-
use
|
|
5
|
+
use std::borrow::Cow;
|
|
6
|
+
|
|
7
|
+
use ahash::AHashMap;
|
|
8
|
+
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
|
6
9
|
use std::collections::{BTreeMap, HashMap};
|
|
7
10
|
|
|
8
11
|
#[cfg(feature = "pdf")]
|
|
@@ -11,11 +14,41 @@ use crate::pdf::metadata::PdfMetadata;
|
|
|
11
14
|
use super::formats::ImagePreprocessingMetadata;
|
|
12
15
|
use super::page::PageStructure;
|
|
13
16
|
|
|
17
|
+
/// Custom serialization and deserialization for AHashMap<Cow<'static, str>, Value>.
|
|
18
|
+
///
|
|
19
|
+
/// serde doesn't natively support serializing Cow keys, so we convert to/from
|
|
20
|
+
/// a HashMap<String, Value> for the wire format, while keeping the in-memory
|
|
21
|
+
/// representation optimized with Cow keys (avoiding allocations for static strings).
|
|
22
|
+
mod additional_serde {
|
|
23
|
+
use super::*;
|
|
24
|
+
|
|
25
|
+
pub fn serialize<S>(map: &AHashMap<Cow<'static, str>, serde_json::Value>, serializer: S) -> Result<S::Ok, S::Error>
|
|
26
|
+
where
|
|
27
|
+
S: Serializer,
|
|
28
|
+
{
|
|
29
|
+
// Convert to HashMap for serialization
|
|
30
|
+
let converted: HashMap<String, serde_json::Value> =
|
|
31
|
+
map.iter().map(|(k, v)| (k.to_string(), v.clone())).collect();
|
|
32
|
+
converted.serialize(serializer)
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
pub fn deserialize<'de, D>(deserializer: D) -> Result<AHashMap<Cow<'static, str>, serde_json::Value>, D::Error>
|
|
36
|
+
where
|
|
37
|
+
D: Deserializer<'de>,
|
|
38
|
+
{
|
|
39
|
+
// Deserialize from HashMap
|
|
40
|
+
let map = HashMap::<String, serde_json::Value>::deserialize(deserializer)?;
|
|
41
|
+
let result = map.into_iter().map(|(k, v)| (Cow::Owned(k), v)).collect();
|
|
42
|
+
Ok(result)
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
|
|
14
46
|
/// Format-specific metadata (discriminated union).
|
|
15
47
|
///
|
|
16
48
|
/// Only one format type can exist per extraction result. This provides
|
|
17
49
|
/// type-safe, clean metadata without nested optionals.
|
|
18
50
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
51
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
19
52
|
#[serde(tag = "format_type", rename_all = "snake_case")]
|
|
20
53
|
pub enum FormatMetadata {
|
|
21
54
|
#[cfg(feature = "pdf")]
|
|
@@ -27,6 +60,7 @@ pub enum FormatMetadata {
|
|
|
27
60
|
Image(ImageMetadata),
|
|
28
61
|
Xml(XmlMetadata),
|
|
29
62
|
Text(TextMetadata),
|
|
63
|
+
#[cfg_attr(feature = "api", schema(value_type = HtmlMetadata))]
|
|
30
64
|
Html(Box<HtmlMetadata>),
|
|
31
65
|
Ocr(OcrMetadata),
|
|
32
66
|
}
|
|
@@ -36,6 +70,7 @@ pub enum FormatMetadata {
|
|
|
36
70
|
/// Contains common fields applicable to all formats, format-specific metadata
|
|
37
71
|
/// via a discriminated union, and additional custom fields from postprocessors.
|
|
38
72
|
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
73
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
39
74
|
pub struct Metadata {
|
|
40
75
|
/// Document title
|
|
41
76
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
@@ -82,6 +117,7 @@ pub struct Metadata {
|
|
|
82
117
|
/// Contains detailed metadata specific to the document format.
|
|
83
118
|
/// Serializes with a `format_type` discriminator field.
|
|
84
119
|
#[serde(flatten, skip_serializing_if = "Option::is_none")]
|
|
120
|
+
#[cfg_attr(feature = "api", schema(value_type = Option<Object>))]
|
|
85
121
|
pub format: Option<FormatMetadata>,
|
|
86
122
|
|
|
87
123
|
/// Image preprocessing metadata (when OCR preprocessing was applied)
|
|
@@ -98,11 +134,17 @@ pub struct Metadata {
|
|
|
98
134
|
|
|
99
135
|
/// Additional custom fields from postprocessors.
|
|
100
136
|
///
|
|
101
|
-
/// This flattened
|
|
137
|
+
/// This flattened map allows Python/TypeScript postprocessors to add
|
|
102
138
|
/// arbitrary fields (entity extraction, keyword extraction, etc.).
|
|
103
139
|
/// Fields are merged at the root level during serialization.
|
|
104
|
-
|
|
105
|
-
|
|
140
|
+
/// Uses `Cow<'static, str>` keys so static string keys avoid allocation.
|
|
141
|
+
#[serde(
|
|
142
|
+
flatten,
|
|
143
|
+
serialize_with = "additional_serde::serialize",
|
|
144
|
+
deserialize_with = "additional_serde::deserialize"
|
|
145
|
+
)]
|
|
146
|
+
#[cfg_attr(feature = "api", schema(value_type = HashMap<String, serde_json::Value>))]
|
|
147
|
+
pub additional: AHashMap<Cow<'static, str>, serde_json::Value>,
|
|
106
148
|
}
|
|
107
149
|
|
|
108
150
|
/// Excel/spreadsheet metadata.
|
|
@@ -110,6 +152,7 @@ pub struct Metadata {
|
|
|
110
152
|
/// Contains information about sheets in Excel, LibreOffice Calc, and other
|
|
111
153
|
/// spreadsheet formats (.xlsx, .xls, .ods, etc.).
|
|
112
154
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
155
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
113
156
|
pub struct ExcelMetadata {
|
|
114
157
|
/// Total number of sheets in the workbook
|
|
115
158
|
pub sheet_count: usize,
|
|
@@ -121,6 +164,7 @@ pub struct ExcelMetadata {
|
|
|
121
164
|
///
|
|
122
165
|
/// Includes sender/recipient information, message ID, and attachment list.
|
|
123
166
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
167
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
124
168
|
pub struct EmailMetadata {
|
|
125
169
|
/// Sender's email address
|
|
126
170
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
@@ -149,9 +193,11 @@ pub struct EmailMetadata {
|
|
|
149
193
|
///
|
|
150
194
|
/// Extracted from compressed archive files containing file lists and size information.
|
|
151
195
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
196
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
152
197
|
pub struct ArchiveMetadata {
|
|
153
198
|
/// Archive format ("ZIP", "TAR", "7Z", etc.)
|
|
154
|
-
|
|
199
|
+
#[cfg_attr(feature = "api", schema(value_type = String))]
|
|
200
|
+
pub format: Cow<'static, str>,
|
|
155
201
|
/// Total number of files in the archive
|
|
156
202
|
pub file_count: usize,
|
|
157
203
|
/// List of file paths within the archive
|
|
@@ -168,6 +214,7 @@ pub struct ArchiveMetadata {
|
|
|
168
214
|
///
|
|
169
215
|
/// Includes dimensions, format, and EXIF data.
|
|
170
216
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
217
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
171
218
|
pub struct ImageMetadata {
|
|
172
219
|
/// Image width in pixels
|
|
173
220
|
pub width: u32,
|
|
@@ -183,6 +230,7 @@ pub struct ImageMetadata {
|
|
|
183
230
|
///
|
|
184
231
|
/// Provides statistics about XML document structure.
|
|
185
232
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
233
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
186
234
|
pub struct XmlMetadata {
|
|
187
235
|
/// Total number of XML elements processed
|
|
188
236
|
pub element_count: usize,
|
|
@@ -195,6 +243,7 @@ pub struct XmlMetadata {
|
|
|
195
243
|
/// Extracted from plain text and Markdown files. Includes word counts and,
|
|
196
244
|
/// for Markdown, structural elements like headers and links.
|
|
197
245
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
246
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
198
247
|
pub struct TextMetadata {
|
|
199
248
|
/// Number of lines in the document
|
|
200
249
|
pub line_count: usize,
|
|
@@ -218,6 +267,7 @@ pub struct TextMetadata {
|
|
|
218
267
|
|
|
219
268
|
/// Text direction enumeration for HTML documents.
|
|
220
269
|
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
|
|
270
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
221
271
|
#[serde(rename_all = "lowercase")]
|
|
222
272
|
pub enum TextDirection {
|
|
223
273
|
/// Left-to-right text direction
|
|
@@ -233,6 +283,7 @@ pub enum TextDirection {
|
|
|
233
283
|
|
|
234
284
|
/// Header/heading element metadata.
|
|
235
285
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
286
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
236
287
|
pub struct HeaderMetadata {
|
|
237
288
|
/// Header level: 1 (h1) through 6 (h6)
|
|
238
289
|
pub level: u8,
|
|
@@ -249,6 +300,7 @@ pub struct HeaderMetadata {
|
|
|
249
300
|
|
|
250
301
|
/// Link element metadata.
|
|
251
302
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
303
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
252
304
|
pub struct LinkMetadata {
|
|
253
305
|
/// The href URL value
|
|
254
306
|
pub href: String,
|
|
@@ -262,11 +314,12 @@ pub struct LinkMetadata {
|
|
|
262
314
|
/// Rel attribute values
|
|
263
315
|
pub rel: Vec<String>,
|
|
264
316
|
/// Additional attributes as key-value pairs
|
|
265
|
-
pub attributes:
|
|
317
|
+
pub attributes: Vec<(String, String)>,
|
|
266
318
|
}
|
|
267
319
|
|
|
268
320
|
/// Link type classification.
|
|
269
321
|
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
|
|
322
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
270
323
|
#[serde(rename_all = "lowercase")]
|
|
271
324
|
pub enum LinkType {
|
|
272
325
|
/// Anchor link (#section)
|
|
@@ -285,6 +338,7 @@ pub enum LinkType {
|
|
|
285
338
|
|
|
286
339
|
/// Image element metadata.
|
|
287
340
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
341
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
288
342
|
pub struct ImageMetadataType {
|
|
289
343
|
/// Image source (URL, data URI, or SVG content)
|
|
290
344
|
pub src: String,
|
|
@@ -299,11 +353,12 @@ pub struct ImageMetadataType {
|
|
|
299
353
|
/// Image type classification
|
|
300
354
|
pub image_type: ImageType,
|
|
301
355
|
/// Additional attributes as key-value pairs
|
|
302
|
-
pub attributes:
|
|
356
|
+
pub attributes: Vec<(String, String)>,
|
|
303
357
|
}
|
|
304
358
|
|
|
305
359
|
/// Image type classification.
|
|
306
360
|
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
|
|
361
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
307
362
|
#[serde(rename_all = "lowercase")]
|
|
308
363
|
pub enum ImageType {
|
|
309
364
|
/// Data URI image
|
|
@@ -320,6 +375,7 @@ pub enum ImageType {
|
|
|
320
375
|
|
|
321
376
|
/// Structured data (Schema.org, microdata, RDFa) block.
|
|
322
377
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
378
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
323
379
|
pub struct StructuredData {
|
|
324
380
|
/// Type of structured data
|
|
325
381
|
pub data_type: StructuredDataType,
|
|
@@ -332,6 +388,7 @@ pub struct StructuredData {
|
|
|
332
388
|
|
|
333
389
|
/// Structured data type classification.
|
|
334
390
|
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
|
|
391
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
335
392
|
#[serde(rename_all = "lowercase")]
|
|
336
393
|
pub enum StructuredDataType {
|
|
337
394
|
/// JSON-LD structured data
|
|
@@ -349,6 +406,7 @@ pub enum StructuredDataType {
|
|
|
349
406
|
/// Includes document-level metadata, Open Graph data, Twitter Card metadata,
|
|
350
407
|
/// and extracted structural elements (headers, links, images, structured data).
|
|
351
408
|
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
409
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
352
410
|
pub struct HtmlMetadata {
|
|
353
411
|
/// Document title from `<title>` tag
|
|
354
412
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
@@ -524,6 +582,7 @@ impl From<html_to_markdown_rs::ExtendedMetadata> for HtmlMetadata {
|
|
|
524
582
|
///
|
|
525
583
|
/// Captures information about OCR processing configuration and results.
|
|
526
584
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
585
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
527
586
|
pub struct OcrMetadata {
|
|
528
587
|
/// OCR language code(s) used
|
|
529
588
|
pub language: String,
|
|
@@ -543,6 +602,7 @@ pub struct OcrMetadata {
|
|
|
543
602
|
|
|
544
603
|
/// Error metadata (for batch operations).
|
|
545
604
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
605
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
546
606
|
pub struct ErrorMetadata {
|
|
547
607
|
pub error_type: String,
|
|
548
608
|
pub message: String,
|
|
@@ -552,6 +612,7 @@ pub struct ErrorMetadata {
|
|
|
552
612
|
///
|
|
553
613
|
/// Extracted from PPTX files containing slide counts and presentation details.
|
|
554
614
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
615
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
555
616
|
pub struct PptxMetadata {
|
|
556
617
|
/// Total number of slides in the presentation
|
|
557
618
|
pub slide_count: usize,
|
|
@@ -20,6 +20,8 @@ pub use tables::*;
|
|
|
20
20
|
#[cfg(test)]
|
|
21
21
|
mod tests {
|
|
22
22
|
use super::*;
|
|
23
|
+
use bytes::Bytes;
|
|
24
|
+
use std::borrow::Cow;
|
|
23
25
|
use std::sync::Arc;
|
|
24
26
|
|
|
25
27
|
#[test]
|
|
@@ -38,7 +40,7 @@ mod tests {
|
|
|
38
40
|
|
|
39
41
|
metadata
|
|
40
42
|
.additional
|
|
41
|
-
.insert("quality_score"
|
|
43
|
+
.insert(Cow::Borrowed("quality_score"), serde_json::json!(1.0));
|
|
42
44
|
|
|
43
45
|
let json = serde_json::to_value(&metadata).unwrap();
|
|
44
46
|
println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
|
|
@@ -162,8 +164,8 @@ mod tests {
|
|
|
162
164
|
#[test]
|
|
163
165
|
fn test_page_content_arc_images_roundtrip() {
|
|
164
166
|
let image1 = Arc::new(ExtractedImage {
|
|
165
|
-
data:
|
|
166
|
-
format: "jpeg"
|
|
167
|
+
data: Bytes::from_static(&[0xFF, 0xD8, 0xFF]),
|
|
168
|
+
format: Cow::Borrowed("jpeg"),
|
|
167
169
|
image_index: 0,
|
|
168
170
|
page_number: Some(1),
|
|
169
171
|
width: Some(100),
|
|
@@ -176,8 +178,8 @@ mod tests {
|
|
|
176
178
|
});
|
|
177
179
|
|
|
178
180
|
let image2 = Arc::new(ExtractedImage {
|
|
179
|
-
data:
|
|
180
|
-
format: "png"
|
|
181
|
+
data: Bytes::from_static(&[0x89, 0x50, 0x4E]),
|
|
182
|
+
format: Cow::Borrowed("png"),
|
|
181
183
|
image_index: 1,
|
|
182
184
|
page_number: Some(1),
|
|
183
185
|
width: Some(300),
|
|
@@ -15,6 +15,7 @@ use super::tables::Table;
|
|
|
15
15
|
/// Supports different page types (PDF pages, PPTX slides, Excel sheets)
|
|
16
16
|
/// with character offset boundaries for chunk-to-page mapping.
|
|
17
17
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
18
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
18
19
|
pub struct PageStructure {
|
|
19
20
|
/// Total number of pages/slides/sheets
|
|
20
21
|
pub total_count: usize,
|
|
@@ -39,6 +40,7 @@ pub struct PageStructure {
|
|
|
39
40
|
/// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
|
|
40
41
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
|
41
42
|
#[serde(rename_all = "snake_case")]
|
|
43
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
42
44
|
pub enum PageUnitType {
|
|
43
45
|
/// Standard document pages (PDF, DOCX, images)
|
|
44
46
|
Page,
|
|
@@ -54,6 +56,7 @@ pub enum PageUnitType {
|
|
|
54
56
|
/// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
|
|
55
57
|
/// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
|
|
56
58
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
59
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
57
60
|
pub struct PageBoundary {
|
|
58
61
|
/// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
|
|
59
62
|
pub byte_start: usize,
|
|
@@ -68,6 +71,7 @@ pub struct PageBoundary {
|
|
|
68
71
|
/// Captures per-page information including dimensions, content counts,
|
|
69
72
|
/// and visibility state (for presentations).
|
|
70
73
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
74
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
71
75
|
pub struct PageInfo {
|
|
72
76
|
/// Page number (1-indexed)
|
|
73
77
|
pub number: usize,
|
|
@@ -108,6 +112,7 @@ pub struct PageInfo {
|
|
|
108
112
|
/// This reduces memory overhead for documents with shared tables/images
|
|
109
113
|
/// by avoiding redundant copies during serialization.
|
|
110
114
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
115
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
111
116
|
pub struct PageContent {
|
|
112
117
|
/// Page number (1-indexed)
|
|
113
118
|
pub page_number: usize,
|
|
@@ -120,6 +125,7 @@ pub struct PageContent {
|
|
|
120
125
|
/// Serializes as Vec<Table> for JSON compatibility while maintaining
|
|
121
126
|
/// Arc semantics in-memory for zero-copy sharing.
|
|
122
127
|
#[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
|
|
128
|
+
#[cfg_attr(feature = "api", schema(value_type = Vec<Table>))]
|
|
123
129
|
pub tables: Vec<Arc<Table>>,
|
|
124
130
|
|
|
125
131
|
/// Images found on this page (uses Arc for memory efficiency)
|
|
@@ -127,6 +133,7 @@ pub struct PageContent {
|
|
|
127
133
|
/// Serializes as Vec<ExtractedImage> for JSON compatibility while maintaining
|
|
128
134
|
/// Arc semantics in-memory for zero-copy sharing.
|
|
129
135
|
#[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
|
|
136
|
+
#[cfg_attr(feature = "api", schema(value_type = Vec<ExtractedImage>))]
|
|
130
137
|
pub images: Vec<Arc<ExtractedImage>>,
|
|
131
138
|
|
|
132
139
|
/// Hierarchy information for the page (when hierarchy extraction is enabled)
|
|
@@ -141,6 +148,7 @@ pub struct PageContent {
|
|
|
141
148
|
/// Used when PDF text hierarchy extraction is enabled. Contains hierarchical
|
|
142
149
|
/// blocks with heading levels (H1-H6) for semantic document structure.
|
|
143
150
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
151
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
144
152
|
pub struct PageHierarchy {
|
|
145
153
|
/// Number of hierarchy blocks on this page
|
|
146
154
|
pub block_count: usize,
|
|
@@ -155,6 +163,7 @@ pub struct PageHierarchy {
|
|
|
155
163
|
/// Represents a block of text with semantic heading information extracted from
|
|
156
164
|
/// font size clustering and hierarchical analysis.
|
|
157
165
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
166
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
158
167
|
pub struct HierarchicalBlock {
|
|
159
168
|
/// The text content of this block
|
|
160
169
|
pub text: String,
|
|
@@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize};
|
|
|
7
7
|
/// Represents a table detected and extracted from a document (PDF, image, etc.).
|
|
8
8
|
/// Tables are converted to both structured cell data and Markdown format.
|
|
9
9
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
10
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
10
11
|
pub struct Table {
|
|
11
12
|
/// Table cells as a 2D vector (rows × columns)
|
|
12
13
|
pub cells: Vec<Vec<String>>,
|
|
@@ -20,6 +21,7 @@ pub struct Table {
|
|
|
20
21
|
///
|
|
21
22
|
/// Future extension point for rich table support with cell-level metadata.
|
|
22
23
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
24
|
+
#[cfg_attr(feature = "api", derive(utoipa::ToSchema))]
|
|
23
25
|
pub struct TableCell {
|
|
24
26
|
/// Cell content as text
|
|
25
27
|
pub content: String,
|
|
@@ -17,6 +17,7 @@ use kreuzberg::core::pipeline::run_pipeline;
|
|
|
17
17
|
use kreuzberg::plugins::registry::{get_document_extractor_registry, get_post_processor_registry};
|
|
18
18
|
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
19
19
|
use kreuzberg::types::{ExtractionResult, Metadata};
|
|
20
|
+
use std::borrow::Cow;
|
|
20
21
|
use std::sync::Arc;
|
|
21
22
|
|
|
22
23
|
#[cfg(feature = "ocr")]
|
|
@@ -378,7 +379,7 @@ async fn test_concurrent_pipeline_processing() {
|
|
|
378
379
|
handles.push(tokio::spawn(async move {
|
|
379
380
|
let result = ExtractionResult {
|
|
380
381
|
content: format!("Content {}", i),
|
|
381
|
-
mime_type: "text/plain"
|
|
382
|
+
mime_type: Cow::Borrowed("text/plain"),
|
|
382
383
|
metadata: Metadata::default(),
|
|
383
384
|
tables: vec![],
|
|
384
385
|
detected_languages: None,
|