kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -5
- data/README.md +15 -9
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
- data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
- data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
- data/kreuzberg.gemspec +38 -4
- data/lib/kreuzberg/config.rb +34 -1
- data/lib/kreuzberg/result.rb +77 -14
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +23 -6
- data/vendor/kreuzberg/Cargo.toml +32 -11
- data/vendor/kreuzberg/README.md +54 -8
- data/vendor/kreuzberg/build.rs +549 -132
- data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
- data/vendor/kreuzberg/src/core/config.rs +49 -1
- data/vendor/kreuzberg/src/core/extractor.rs +134 -2
- data/vendor/kreuzberg/src/core/mod.rs +4 -2
- data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
- data/vendor/kreuzberg/src/extraction/html.rs +24 -8
- data/vendor/kreuzberg/src/extraction/image.rs +124 -1
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
- data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
- data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
- data/vendor/kreuzberg/src/extractors/email.rs +29 -15
- data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
- data/vendor/kreuzberg/src/extractors/html.rs +29 -15
- data/vendor/kreuzberg/src/extractors/image.rs +25 -4
- data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
- data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
- data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
- data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
- data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +7 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
- data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
- data/vendor/kreuzberg/src/lib.rs +10 -2
- data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
- data/vendor/kreuzberg/src/mcp/server.rs +120 -12
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
- data/vendor/kreuzberg/src/pdf/error.rs +8 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
- data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
- data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
- data/vendor/kreuzberg/src/pdf/table.rs +26 -2
- data/vendor/kreuzberg/src/pdf/text.rs +89 -7
- data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
- data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
- data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
- data/vendor/kreuzberg/src/text/mod.rs +6 -0
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
- data/vendor/kreuzberg/src/types.rs +173 -21
- data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
- data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
- data/vendor/kreuzberg/tests/config_features.rs +15 -1
- data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/email_integration.rs +2 -0
- data/vendor/kreuzberg/tests/error_handling.rs +43 -34
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/image_integration.rs +2 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
- data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
- data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
- data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
- data/vendor/rb-sys/Cargo.lock +15 -15
- data/vendor/rb-sys/Cargo.toml +4 -4
- data/vendor/rb-sys/Cargo.toml.orig +4 -4
- data/vendor/rb-sys/build/features.rs +5 -2
- data/vendor/rb-sys/build/main.rs +55 -15
- data/vendor/rb-sys/build/stable_api_config.rs +4 -2
- data/vendor/rb-sys/build/version.rs +3 -1
- data/vendor/rb-sys/src/lib.rs +1 -0
- data/vendor/rb-sys/src/macros.rs +2 -2
- data/vendor/rb-sys/src/special_consts.rs +1 -1
- data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
- data/vendor/rb-sys/src/stable_api.rs +0 -1
- data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
- metadata +13 -10
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
//! Quality processing post-processor.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides a PostProcessor plugin that performs quality assessment and
|
|
4
|
+
//! text cleaning on extraction results.
|
|
5
|
+
|
|
6
|
+
use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
7
|
+
use crate::{ExtractionConfig, ExtractionResult, Result};
|
|
8
|
+
use async_trait::async_trait;
|
|
9
|
+
|
|
10
|
+
/// Post-processor that calculates a quality score for the extracted text.
|
|
11
|
+
///
|
|
12
|
+
/// This processor:
|
|
13
|
+
/// - Runs in the Early processing stage
|
|
14
|
+
/// - Calculates quality score when `config.enable_quality_processing` is true
|
|
15
|
+
/// - Stores quality score in `metadata.additional["quality_score"]`
|
|
16
|
+
/// - Leaves `result.content` unchanged (it is only read to compute the score)
|
|
17
|
+
///
|
|
18
|
+
/// # Example
|
|
19
|
+
///
|
|
20
|
+
/// ```rust,no_run
|
|
21
|
+
/// use kreuzberg::plugins::{Plugin, PostProcessor};
|
|
22
|
+
/// use kreuzberg::text::quality::processor::QualityProcessor;
|
|
23
|
+
///
|
|
24
|
+
/// let processor = QualityProcessor;
|
|
25
|
+
/// assert_eq!(processor.name(), "quality-processing");
|
|
26
|
+
/// ```
|
|
27
|
+
#[derive(Debug, Clone, Copy)]
|
|
28
|
+
pub struct QualityProcessor;
|
|
29
|
+
|
|
30
|
+
impl Plugin for QualityProcessor {
|
|
31
|
+
fn name(&self) -> &str {
|
|
32
|
+
"quality-processing"
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
fn version(&self) -> String {
|
|
36
|
+
env!("CARGO_PKG_VERSION").to_string()
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
fn initialize(&self) -> Result<()> {
|
|
40
|
+
Ok(())
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
fn shutdown(&self) -> Result<()> {
|
|
44
|
+
Ok(())
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
|
|
49
|
+
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
|
|
50
|
+
impl PostProcessor for QualityProcessor {
|
|
51
|
+
async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
52
|
+
// Calculate quality score
|
|
53
|
+
let quality_score = crate::text::quality::calculate_quality_score(
|
|
54
|
+
&result.content,
|
|
55
|
+
Some(
|
|
56
|
+
&result
|
|
57
|
+
.metadata
|
|
58
|
+
.additional
|
|
59
|
+
.iter()
|
|
60
|
+
.map(|(k, v)| (k.clone(), v.to_string()))
|
|
61
|
+
.collect(),
|
|
62
|
+
),
|
|
63
|
+
);
|
|
64
|
+
|
|
65
|
+
result.metadata.additional.insert(
|
|
66
|
+
"quality_score".to_string(),
|
|
67
|
+
serde_json::Value::Number(
|
|
68
|
+
serde_json::Number::from_f64(quality_score).unwrap_or(serde_json::Number::from(0)),
|
|
69
|
+
),
|
|
70
|
+
);
|
|
71
|
+
|
|
72
|
+
Ok(())
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
fn processing_stage(&self) -> ProcessingStage {
|
|
76
|
+
ProcessingStage::Early
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
|
|
80
|
+
config.enable_quality_processing
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
|
|
84
|
+
let text_length = result.content.len();
|
|
85
|
+
// Quality processing is relatively fast: ~1ms per 100KB
|
|
86
|
+
(text_length / 102400).max(1) as u64
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
#[cfg(test)]
|
|
91
|
+
mod tests {
|
|
92
|
+
use super::*;
|
|
93
|
+
use crate::types::Metadata;
|
|
94
|
+
|
|
95
|
+
#[tokio::test]
|
|
96
|
+
async fn test_quality_processor() {
|
|
97
|
+
let processor = QualityProcessor;
|
|
98
|
+
let config = ExtractionConfig {
|
|
99
|
+
enable_quality_processing: true,
|
|
100
|
+
..Default::default()
|
|
101
|
+
};
|
|
102
|
+
|
|
103
|
+
let mut result = ExtractionResult {
|
|
104
|
+
content: "This is a well-written paragraph with proper structure. It contains multiple sentences. The quality should be good.".to_string(),
|
|
105
|
+
mime_type: "text/plain".to_string(),
|
|
106
|
+
metadata: Metadata::default(),
|
|
107
|
+
tables: vec![],
|
|
108
|
+
detected_languages: None,
|
|
109
|
+
chunks: None,
|
|
110
|
+
images: None,
|
|
111
|
+
pages: None,
|
|
112
|
+
};
|
|
113
|
+
|
|
114
|
+
processor.process(&mut result, &config).await.unwrap();
|
|
115
|
+
|
|
116
|
+
assert!(result.metadata.additional.contains_key("quality_score"));
|
|
117
|
+
let score = result.metadata.additional.get("quality_score").unwrap();
|
|
118
|
+
assert!(score.is_number());
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
#[tokio::test]
|
|
122
|
+
async fn test_quality_processor_disabled() {
|
|
123
|
+
let processor = QualityProcessor;
|
|
124
|
+
let config = ExtractionConfig {
|
|
125
|
+
enable_quality_processing: false,
|
|
126
|
+
..Default::default()
|
|
127
|
+
};
|
|
128
|
+
|
|
129
|
+
let mut result = ExtractionResult {
|
|
130
|
+
content: "Some text".to_string(),
|
|
131
|
+
mime_type: "text/plain".to_string(),
|
|
132
|
+
metadata: Metadata::default(),
|
|
133
|
+
tables: vec![],
|
|
134
|
+
detected_languages: None,
|
|
135
|
+
chunks: None,
|
|
136
|
+
images: None,
|
|
137
|
+
pages: None,
|
|
138
|
+
};
|
|
139
|
+
|
|
140
|
+
// `should_process` is what gates on `enable_quality_processing`; `process` itself ignores the
|
|
141
|
+
// flag, so this direct call only verifies that it completes without error.
|
|
142
|
+
processor.process(&mut result, &config).await.unwrap();
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
#[test]
|
|
146
|
+
fn test_quality_processor_plugin_interface() {
|
|
147
|
+
let processor = QualityProcessor;
|
|
148
|
+
assert_eq!(processor.name(), "quality-processing");
|
|
149
|
+
assert!(!processor.version().is_empty());
|
|
150
|
+
assert!(processor.initialize().is_ok());
|
|
151
|
+
assert!(processor.shutdown().is_ok());
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
#[test]
|
|
155
|
+
fn test_quality_processor_stage() {
|
|
156
|
+
let processor = QualityProcessor;
|
|
157
|
+
assert_eq!(processor.processing_stage(), ProcessingStage::Early);
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
#[test]
|
|
161
|
+
fn test_quality_processor_should_process() {
|
|
162
|
+
let processor = QualityProcessor;
|
|
163
|
+
|
|
164
|
+
let result = ExtractionResult {
|
|
165
|
+
content: "Sample text".to_string(),
|
|
166
|
+
mime_type: "text/plain".to_string(),
|
|
167
|
+
metadata: Metadata::default(),
|
|
168
|
+
tables: vec![],
|
|
169
|
+
detected_languages: None,
|
|
170
|
+
chunks: None,
|
|
171
|
+
images: None,
|
|
172
|
+
pages: None,
|
|
173
|
+
};
|
|
174
|
+
|
|
175
|
+
let config_with_quality = ExtractionConfig {
|
|
176
|
+
enable_quality_processing: true,
|
|
177
|
+
..Default::default()
|
|
178
|
+
};
|
|
179
|
+
assert!(processor.should_process(&result, &config_with_quality));
|
|
180
|
+
|
|
181
|
+
let config_without_quality = ExtractionConfig {
|
|
182
|
+
enable_quality_processing: false,
|
|
183
|
+
..Default::default()
|
|
184
|
+
};
|
|
185
|
+
assert!(!processor.should_process(&result, &config_without_quality));
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
#[test]
|
|
189
|
+
fn test_quality_processor_estimated_duration() {
|
|
190
|
+
let processor = QualityProcessor;
|
|
191
|
+
|
|
192
|
+
let short_result = ExtractionResult {
|
|
193
|
+
content: "Short".to_string(),
|
|
194
|
+
mime_type: "text/plain".to_string(),
|
|
195
|
+
metadata: Metadata::default(),
|
|
196
|
+
tables: vec![],
|
|
197
|
+
detected_languages: None,
|
|
198
|
+
chunks: None,
|
|
199
|
+
images: None,
|
|
200
|
+
pages: None,
|
|
201
|
+
};
|
|
202
|
+
|
|
203
|
+
let long_result = ExtractionResult {
|
|
204
|
+
content: "a".repeat(1000000),
|
|
205
|
+
mime_type: "text/plain".to_string(),
|
|
206
|
+
metadata: Metadata::default(),
|
|
207
|
+
tables: vec![],
|
|
208
|
+
detected_languages: None,
|
|
209
|
+
chunks: None,
|
|
210
|
+
images: None,
|
|
211
|
+
pages: None,
|
|
212
|
+
};
|
|
213
|
+
|
|
214
|
+
let short_duration = processor.estimated_duration_ms(&short_result);
|
|
215
|
+
let long_duration = processor.estimated_duration_ms(&long_result);
|
|
216
|
+
|
|
217
|
+
assert!(long_duration > short_duration);
|
|
218
|
+
}
|
|
219
|
+
}
|
|
@@ -34,6 +34,13 @@ pub struct ExtractionResult {
|
|
|
34
34
|
/// Each image may optionally contain a nested `ocr_result` if OCR was performed.
|
|
35
35
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
36
36
|
pub images: Option<Vec<ExtractedImage>>,
|
|
37
|
+
|
|
38
|
+
/// Per-page content when page extraction is enabled.
|
|
39
|
+
///
|
|
40
|
+
/// When page extraction is configured, the document is split into per-page content
|
|
41
|
+
/// with tables and images mapped to their respective pages.
|
|
42
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
43
|
+
pub pages: Option<Vec<PageContent>>,
|
|
37
44
|
}
|
|
38
45
|
|
|
39
46
|
/// Format-specific metadata (discriminated union).
|
|
@@ -62,17 +69,49 @@ pub enum FormatMetadata {
|
|
|
62
69
|
/// via a discriminated union, and additional custom fields from postprocessors.
|
|
63
70
|
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
64
71
|
pub struct Metadata {
|
|
65
|
-
///
|
|
72
|
+
/// Document title
|
|
73
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
74
|
+
pub title: Option<String>,
|
|
75
|
+
|
|
76
|
+
/// Document subject or description
|
|
77
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
78
|
+
pub subject: Option<String>,
|
|
79
|
+
|
|
80
|
+
/// Primary author(s) - always Vec for consistency
|
|
81
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
82
|
+
pub authors: Option<Vec<String>>,
|
|
83
|
+
|
|
84
|
+
/// Keywords/tags - always Vec for consistency
|
|
85
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
86
|
+
pub keywords: Option<Vec<String>>,
|
|
87
|
+
|
|
88
|
+
/// Primary language (ISO 639 code)
|
|
66
89
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
67
90
|
pub language: Option<String>,
|
|
68
91
|
|
|
69
|
-
///
|
|
92
|
+
/// Creation timestamp (ISO 8601 format)
|
|
70
93
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
71
|
-
pub
|
|
94
|
+
pub created_at: Option<String>,
|
|
72
95
|
|
|
73
|
-
///
|
|
96
|
+
/// Last modification timestamp (ISO 8601 format)
|
|
74
97
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
75
|
-
pub
|
|
98
|
+
pub modified_at: Option<String>,
|
|
99
|
+
|
|
100
|
+
/// User who created the document
|
|
101
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
102
|
+
pub created_by: Option<String>,
|
|
103
|
+
|
|
104
|
+
/// User who last modified the document
|
|
105
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
106
|
+
pub modified_by: Option<String>,
|
|
107
|
+
|
|
108
|
+
/// Page/slide/sheet structure with boundaries
|
|
109
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
110
|
+
pub pages: Option<PageStructure>,
|
|
111
|
+
|
|
112
|
+
/// Document date (DEPRECATED - use created_at/modified_at instead)
|
|
113
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
114
|
+
pub date: Option<String>,
|
|
76
115
|
|
|
77
116
|
/// Format-specific metadata (discriminated union)
|
|
78
117
|
///
|
|
@@ -102,6 +141,110 @@ pub struct Metadata {
|
|
|
102
141
|
pub additional: HashMap<String, serde_json::Value>,
|
|
103
142
|
}
|
|
104
143
|
|
|
144
|
+
/// Unified page structure for documents.
|
|
145
|
+
///
|
|
146
|
+
/// Supports different page types (PDF pages, PPTX slides, Excel sheets)
|
|
147
|
+
/// with byte offset boundaries for chunk-to-page mapping.
|
|
148
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
149
|
+
pub struct PageStructure {
|
|
150
|
+
/// Total number of pages/slides/sheets
|
|
151
|
+
pub total_count: usize,
|
|
152
|
+
|
|
153
|
+
/// Type of paginated unit
|
|
154
|
+
pub unit_type: PageUnitType,
|
|
155
|
+
|
|
156
|
+
/// Byte offset boundaries for each page
|
|
157
|
+
///
|
|
158
|
+
/// Maps byte ranges in the extracted content to page numbers.
|
|
159
|
+
/// Used for chunk page range calculation.
|
|
160
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
161
|
+
pub boundaries: Option<Vec<PageBoundary>>,
|
|
162
|
+
|
|
163
|
+
/// Detailed per-page metadata (optional, only when needed)
|
|
164
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
165
|
+
pub pages: Option<Vec<PageInfo>>,
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
/// Type of paginated unit in a document.
|
|
169
|
+
///
|
|
170
|
+
/// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
|
|
171
|
+
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
|
172
|
+
#[serde(rename_all = "snake_case")]
|
|
173
|
+
pub enum PageUnitType {
|
|
174
|
+
/// Standard document pages (PDF, DOCX, images)
|
|
175
|
+
Page,
|
|
176
|
+
/// Presentation slides (PPTX, ODP)
|
|
177
|
+
Slide,
|
|
178
|
+
/// Spreadsheet sheets (XLSX, ODS)
|
|
179
|
+
Sheet,
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
/// Byte offset boundary for a page.
|
|
183
|
+
///
|
|
184
|
+
/// Tracks where a specific page's content starts and ends in the main content string,
|
|
185
|
+
/// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
|
|
186
|
+
/// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
|
|
187
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
188
|
+
pub struct PageBoundary {
|
|
189
|
+
/// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
|
|
190
|
+
pub byte_start: usize,
|
|
191
|
+
/// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
|
|
192
|
+
pub byte_end: usize,
|
|
193
|
+
/// Page number (1-indexed)
|
|
194
|
+
pub page_number: usize,
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
/// Metadata for individual page/slide/sheet.
|
|
198
|
+
///
|
|
199
|
+
/// Captures per-page information including dimensions, content counts,
|
|
200
|
+
/// and visibility state (for presentations).
|
|
201
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
202
|
+
pub struct PageInfo {
|
|
203
|
+
/// Page number (1-indexed)
|
|
204
|
+
pub number: usize,
|
|
205
|
+
|
|
206
|
+
/// Page title (usually for presentations)
|
|
207
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
208
|
+
pub title: Option<String>,
|
|
209
|
+
|
|
210
|
+
/// Dimensions in points (PDF) or pixels (images): (width, height)
|
|
211
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
212
|
+
pub dimensions: Option<(f64, f64)>,
|
|
213
|
+
|
|
214
|
+
/// Number of images on this page
|
|
215
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
216
|
+
pub image_count: Option<usize>,
|
|
217
|
+
|
|
218
|
+
/// Number of tables on this page
|
|
219
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
220
|
+
pub table_count: Option<usize>,
|
|
221
|
+
|
|
222
|
+
/// Whether this page is hidden (e.g., in presentations)
|
|
223
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
224
|
+
pub hidden: Option<bool>,
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
/// Content for a single page/slide.
|
|
228
|
+
///
|
|
229
|
+
/// When page extraction is enabled, documents are split into per-page content
|
|
230
|
+
/// with associated tables and images mapped to each page.
|
|
231
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
232
|
+
pub struct PageContent {
|
|
233
|
+
/// Page number (1-indexed)
|
|
234
|
+
pub page_number: usize,
|
|
235
|
+
|
|
236
|
+
/// Text content for this page
|
|
237
|
+
pub content: String,
|
|
238
|
+
|
|
239
|
+
/// Tables found on this page
|
|
240
|
+
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
241
|
+
pub tables: Vec<Table>,
|
|
242
|
+
|
|
243
|
+
/// Images found on this page
|
|
244
|
+
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
245
|
+
pub images: Vec<ExtractedImage>,
|
|
246
|
+
}
|
|
247
|
+
|
|
105
248
|
/// Excel/spreadsheet metadata.
|
|
106
249
|
///
|
|
107
250
|
/// Contains information about sheets in Excel, LibreOffice Calc, and other
|
|
@@ -348,11 +491,11 @@ pub struct Chunk {
|
|
|
348
491
|
/// Metadata about a chunk's position in the original document.
|
|
349
492
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
350
493
|
pub struct ChunkMetadata {
|
|
351
|
-
///
|
|
352
|
-
pub
|
|
494
|
+
/// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
|
|
495
|
+
pub byte_start: usize,
|
|
353
496
|
|
|
354
|
-
///
|
|
355
|
-
pub
|
|
497
|
+
/// Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
|
|
498
|
+
pub byte_end: usize,
|
|
356
499
|
|
|
357
500
|
/// Number of tokens in this chunk (if available).
|
|
358
501
|
///
|
|
@@ -365,6 +508,18 @@ pub struct ChunkMetadata {
|
|
|
365
508
|
|
|
366
509
|
/// Total number of chunks in the document.
|
|
367
510
|
pub total_chunks: usize,
|
|
511
|
+
|
|
512
|
+
/// First page number this chunk spans (1-indexed).
|
|
513
|
+
///
|
|
514
|
+
/// Only populated when page tracking is enabled in extraction configuration.
|
|
515
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
516
|
+
pub first_page: Option<usize>,
|
|
517
|
+
|
|
518
|
+
/// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
|
|
519
|
+
///
|
|
520
|
+
/// Only populated when page tracking is enabled in extraction configuration.
|
|
521
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
522
|
+
pub last_page: Option<usize>,
|
|
368
523
|
}
|
|
369
524
|
|
|
370
525
|
/// Extracted image from a document.
|
|
@@ -505,22 +660,22 @@ pub struct PptxExtractionResult {
|
|
|
505
660
|
pub table_count: usize,
|
|
506
661
|
/// Extracted images from the presentation
|
|
507
662
|
pub images: Vec<ExtractedImage>,
|
|
663
|
+
/// Slide structure with boundaries (when page tracking is enabled)
|
|
664
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
665
|
+
pub page_structure: Option<PageStructure>,
|
|
666
|
+
/// Per-slide content (when page tracking is enabled)
|
|
667
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
668
|
+
pub page_contents: Option<Vec<PageContent>>,
|
|
508
669
|
}
|
|
509
670
|
|
|
510
671
|
/// PowerPoint presentation metadata.
|
|
511
672
|
///
|
|
512
|
-
/// Contains
|
|
673
|
+
/// Contains PPTX-specific metadata. Common fields like title, author, and description
|
|
674
|
+
/// are now in the base `Metadata` struct.
|
|
513
675
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
514
676
|
pub struct PptxMetadata {
|
|
515
|
-
/// Presentation title
|
|
516
|
-
pub title: Option<String>,
|
|
517
|
-
/// Author name
|
|
518
|
-
pub author: Option<String>,
|
|
519
|
-
/// Description/comments
|
|
520
|
-
pub description: Option<String>,
|
|
521
|
-
/// Summary text
|
|
522
|
-
pub summary: Option<String>,
|
|
523
677
|
/// List of fonts used in the presentation
|
|
678
|
+
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
524
679
|
pub fonts: Vec<String>,
|
|
525
680
|
}
|
|
526
681
|
|
|
@@ -885,19 +1040,16 @@ mod tests {
|
|
|
885
1040
|
let json = serde_json::to_value(&metadata).unwrap();
|
|
886
1041
|
println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
|
|
887
1042
|
|
|
888
|
-
// Check that format_type is present
|
|
889
1043
|
assert!(
|
|
890
1044
|
json.get("format_type").is_some(),
|
|
891
1045
|
"format_type should be present in serialized JSON"
|
|
892
1046
|
);
|
|
893
1047
|
assert_eq!(json.get("format_type").unwrap(), "text");
|
|
894
1048
|
|
|
895
|
-
// Check that Text metadata fields are present
|
|
896
1049
|
assert_eq!(json.get("line_count").unwrap(), 1);
|
|
897
1050
|
assert_eq!(json.get("word_count").unwrap(), 2);
|
|
898
1051
|
assert_eq!(json.get("character_count").unwrap(), 13);
|
|
899
1052
|
|
|
900
|
-
// Check that additional field is merged
|
|
901
1053
|
assert_eq!(json.get("quality_score").unwrap(), 1.0);
|
|
902
1054
|
}
|
|
903
1055
|
}
|
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
//! Tests for ZIP, TAR, TAR.GZ, and 7z archive extraction.
|
|
4
4
|
//! Validates metadata extraction, content extraction, nested archives, and error handling.
|
|
5
5
|
|
|
6
|
+
#![cfg(feature = "archives")]
|
|
7
|
+
|
|
6
8
|
use kreuzberg::core::config::ExtractionConfig;
|
|
7
9
|
use kreuzberg::core::extractor::{extract_bytes, extract_bytes_sync};
|
|
8
10
|
use std::io::{Cursor, Write};
|
|
@@ -4,9 +4,9 @@
|
|
|
4
4
|
//! Validates concurrent processing, error handling, and performance.
|
|
5
5
|
|
|
6
6
|
use kreuzberg::core::config::ExtractionConfig;
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
};
|
|
7
|
+
#[cfg(feature = "pdf")]
|
|
8
|
+
use kreuzberg::core::extractor::batch_extract_file_sync;
|
|
9
|
+
use kreuzberg::core::extractor::{batch_extract_bytes, batch_extract_bytes_sync, batch_extract_file};
|
|
10
10
|
use std::path::PathBuf;
|
|
11
11
|
|
|
12
12
|
mod helpers;
|
|
@@ -26,6 +26,7 @@ fn assert_text_content(actual: &str, expected: &str) {
|
|
|
26
26
|
|
|
27
27
|
/// Test batch extraction with multiple file formats (PDF, DOCX, TXT).
|
|
28
28
|
#[tokio::test]
|
|
29
|
+
#[cfg(all(feature = "pdf", feature = "office", feature = "tokio-runtime"))]
|
|
29
30
|
async fn test_batch_extract_file_multiple_formats() {
|
|
30
31
|
if !test_documents_available() {
|
|
31
32
|
println!("Skipping test: test_documents/ directory not found");
|
|
@@ -73,6 +74,7 @@ async fn test_batch_extract_file_multiple_formats() {
|
|
|
73
74
|
|
|
74
75
|
/// Test synchronous batch extraction variant.
|
|
75
76
|
#[test]
|
|
77
|
+
#[cfg(feature = "pdf")]
|
|
76
78
|
fn test_batch_extract_file_sync_variant() {
|
|
77
79
|
if !test_documents_available() {
|
|
78
80
|
println!("Skipping test: test_documents/ directory not found");
|
|
@@ -18,7 +18,6 @@ use kreuzberg::plugins::registry::{get_document_extractor_registry, get_post_pro
|
|
|
18
18
|
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
19
19
|
use kreuzberg::types::{ExtractionResult, Metadata};
|
|
20
20
|
use std::sync::Arc;
|
|
21
|
-
use std::sync::atomic::{AtomicUsize, Ordering};
|
|
22
21
|
|
|
23
22
|
#[cfg(feature = "ocr")]
|
|
24
23
|
use kreuzberg::core::config::OcrConfig;
|
|
@@ -52,13 +51,16 @@ fn assert_text_content(actual: &str, expected: &str) {
|
|
|
52
51
|
async fn test_concurrent_extractions_mixed_formats() {
|
|
53
52
|
let config = ExtractionConfig::default();
|
|
54
53
|
|
|
55
|
-
|
|
54
|
+
#[allow(unused_mut)]
|
|
55
|
+
let mut test_cases = vec![
|
|
56
56
|
(b"Plain text content" as &[u8], "text/plain"),
|
|
57
57
|
(b"{\"key\": \"value\"}", "application/json"),
|
|
58
|
-
(b"<root><item>XML content</item></root>", "application/xml"),
|
|
59
58
|
(b"# Markdown\n\nContent here", "text/markdown"),
|
|
60
59
|
];
|
|
61
60
|
|
|
61
|
+
#[cfg(feature = "xml")]
|
|
62
|
+
test_cases.push((b"<root><item>XML content</item></root>" as &[u8], "application/xml"));
|
|
63
|
+
|
|
62
64
|
let mut handles = vec![];
|
|
63
65
|
for _ in 0..10 {
|
|
64
66
|
for (data, mime_type) in &test_cases {
|
|
@@ -242,6 +244,7 @@ async fn test_concurrent_ocr_processing() {
|
|
|
242
244
|
#[test]
|
|
243
245
|
fn test_concurrent_ocr_cache_stress() {
|
|
244
246
|
use helpers::{get_test_file_path, skip_if_missing};
|
|
247
|
+
use std::sync::atomic::Ordering;
|
|
245
248
|
|
|
246
249
|
if skip_if_missing("images/ocr_image.jpg") {
|
|
247
250
|
tracing::debug!("Skipping OCR cache stress test: test file not available");
|
|
@@ -366,6 +369,7 @@ async fn test_concurrent_pipeline_processing() {
|
|
|
366
369
|
detected_languages: None,
|
|
367
370
|
chunks: None,
|
|
368
371
|
images: None,
|
|
372
|
+
pages: None,
|
|
369
373
|
};
|
|
370
374
|
|
|
371
375
|
run_pipeline(result, &config).await
|
|
@@ -486,13 +490,16 @@ async fn test_high_concurrency_stress() {
|
|
|
486
490
|
..Default::default()
|
|
487
491
|
};
|
|
488
492
|
|
|
489
|
-
|
|
493
|
+
#[allow(unused_mut)]
|
|
494
|
+
let mut formats = vec![
|
|
490
495
|
(b"Text content" as &[u8], "text/plain"),
|
|
491
496
|
(b"{\"json\": true}", "application/json"),
|
|
492
|
-
(b"<xml><item>content</item></xml>", "application/xml"),
|
|
493
497
|
(b"# Markdown\n\nContent", "text/markdown"),
|
|
494
498
|
];
|
|
495
499
|
|
|
500
|
+
#[cfg(feature = "xml")]
|
|
501
|
+
formats.push((b"<xml><item>content</item></xml>" as &[u8], "application/xml"));
|
|
502
|
+
|
|
496
503
|
let mut handles = vec![];
|
|
497
504
|
for _ in 0..100 {
|
|
498
505
|
for (data, mime_type) in &formats {
|
|
@@ -516,9 +523,10 @@ async fn test_high_concurrency_stress() {
|
|
|
516
523
|
.await
|
|
517
524
|
.expect("High-load stress test should complete within 60s");
|
|
518
525
|
|
|
526
|
+
let expected_successes = 100 * formats.len();
|
|
519
527
|
let success_count = results.iter().filter(|r| r.is_ok()).count();
|
|
520
528
|
assert_eq!(
|
|
521
|
-
success_count,
|
|
529
|
+
success_count, expected_successes,
|
|
522
530
|
"All extractions should succeed under stress, got {} successes",
|
|
523
531
|
success_count
|
|
524
532
|
);
|
|
@@ -3,13 +3,19 @@
|
|
|
3
3
|
//! Tests for chunking, language detection, caching, token reduction, and quality processing.
|
|
4
4
|
//! Validates that configuration options work correctly end-to-end.
|
|
5
5
|
|
|
6
|
-
|
|
6
|
+
#[cfg(feature = "chunking")]
|
|
7
|
+
use kreuzberg::core::config::ChunkingConfig;
|
|
8
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
9
|
+
#[cfg(feature = "language-detection")]
|
|
10
|
+
use kreuzberg::core::config::LanguageDetectionConfig;
|
|
11
|
+
use kreuzberg::core::config::TokenReductionConfig;
|
|
7
12
|
use kreuzberg::core::extractor::extract_bytes;
|
|
8
13
|
|
|
9
14
|
mod helpers;
|
|
10
15
|
|
|
11
16
|
/// Test chunking enabled - text split into chunks.
|
|
12
17
|
#[tokio::test]
|
|
18
|
+
#[cfg(feature = "chunking")]
|
|
13
19
|
async fn test_chunking_enabled() {
|
|
14
20
|
let config = ExtractionConfig {
|
|
15
21
|
chunking: Some(ChunkingConfig {
|
|
@@ -52,6 +58,7 @@ async fn test_chunking_enabled() {
|
|
|
52
58
|
|
|
53
59
|
/// Test chunking with overlap - overlap preserved between chunks.
|
|
54
60
|
#[tokio::test]
|
|
61
|
+
#[cfg(feature = "chunking")]
|
|
55
62
|
async fn test_chunking_with_overlap() {
|
|
56
63
|
let config = ExtractionConfig {
|
|
57
64
|
chunking: Some(ChunkingConfig {
|
|
@@ -91,6 +98,7 @@ async fn test_chunking_with_overlap() {
|
|
|
91
98
|
|
|
92
99
|
/// Test chunking with custom sizes - custom chunk size and overlap.
|
|
93
100
|
#[tokio::test]
|
|
101
|
+
#[cfg(feature = "chunking")]
|
|
94
102
|
async fn test_chunking_custom_sizes() {
|
|
95
103
|
let config = ExtractionConfig {
|
|
96
104
|
chunking: Some(ChunkingConfig {
|
|
@@ -151,6 +159,7 @@ async fn test_chunking_disabled() {
|
|
|
151
159
|
|
|
152
160
|
/// Test language detection for single language document.
|
|
153
161
|
#[tokio::test]
|
|
162
|
+
#[cfg(feature = "language-detection")]
|
|
154
163
|
async fn test_language_detection_single() {
|
|
155
164
|
let config = ExtractionConfig {
|
|
156
165
|
language_detection: Some(LanguageDetectionConfig {
|
|
@@ -177,6 +186,7 @@ async fn test_language_detection_single() {
|
|
|
177
186
|
/// Test language detection for multi-language document.
|
|
178
187
|
#[cfg_attr(coverage, ignore = "coverage instrumentation affects multi-language heuristics")]
|
|
179
188
|
#[tokio::test]
|
|
189
|
+
#[cfg(feature = "language-detection")]
|
|
180
190
|
async fn test_language_detection_multiple() {
|
|
181
191
|
let config = ExtractionConfig {
|
|
182
192
|
language_detection: Some(LanguageDetectionConfig {
|
|
@@ -201,6 +211,7 @@ async fn test_language_detection_multiple() {
|
|
|
201
211
|
|
|
202
212
|
/// Test language detection with confidence threshold.
|
|
203
213
|
#[tokio::test]
|
|
214
|
+
#[cfg(feature = "language-detection")]
|
|
204
215
|
async fn test_language_detection_confidence() {
|
|
205
216
|
let config = ExtractionConfig {
|
|
206
217
|
language_detection: Some(LanguageDetectionConfig {
|
|
@@ -225,6 +236,7 @@ async fn test_language_detection_confidence() {
|
|
|
225
236
|
|
|
226
237
|
/// Test language detection disabled.
|
|
227
238
|
#[tokio::test]
|
|
239
|
+
#[cfg(feature = "language-detection")]
|
|
228
240
|
async fn test_language_detection_disabled() {
|
|
229
241
|
let config = ExtractionConfig {
|
|
230
242
|
language_detection: Some(LanguageDetectionConfig {
|
|
@@ -397,6 +409,7 @@ async fn test_token_reduction_disabled() {
|
|
|
397
409
|
|
|
398
410
|
/// Test quality processing enabled - quality scoring applied.
|
|
399
411
|
#[tokio::test]
|
|
412
|
+
#[cfg(feature = "quality")]
|
|
400
413
|
async fn test_quality_processing_enabled() {
|
|
401
414
|
let config = ExtractionConfig {
|
|
402
415
|
enable_quality_processing: true,
|
|
@@ -420,6 +433,7 @@ async fn test_quality_processing_enabled() {
|
|
|
420
433
|
|
|
421
434
|
/// Test quality processing calculates score for different text quality.
|
|
422
435
|
#[tokio::test]
|
|
436
|
+
#[cfg(feature = "quality")]
|
|
423
437
|
async fn test_quality_threshold_filtering() {
|
|
424
438
|
let config = ExtractionConfig {
|
|
425
439
|
enable_quality_processing: true,
|
|
@@ -389,6 +389,7 @@ extract_images = true
|
|
|
389
389
|
"Should have language detection config"
|
|
390
390
|
);
|
|
391
391
|
assert!(config.images.is_some(), "Should have image extraction config");
|
|
392
|
+
#[cfg(feature = "pdf")]
|
|
392
393
|
assert!(config.pdf_options.is_some(), "Should have PDF config");
|
|
393
394
|
}
|
|
394
395
|
|