kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8

Files changed (127)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +5 -5
  3. data/README.md +15 -9
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
  5. data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
  6. data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
  8. data/kreuzberg.gemspec +38 -4
  9. data/lib/kreuzberg/config.rb +34 -1
  10. data/lib/kreuzberg/result.rb +77 -14
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +23 -6
  13. data/vendor/kreuzberg/Cargo.toml +32 -11
  14. data/vendor/kreuzberg/README.md +54 -8
  15. data/vendor/kreuzberg/build.rs +549 -132
  16. data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
  17. data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
  18. data/vendor/kreuzberg/src/core/config.rs +49 -1
  19. data/vendor/kreuzberg/src/core/extractor.rs +134 -2
  20. data/vendor/kreuzberg/src/core/mod.rs +4 -2
  21. data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
  22. data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
  23. data/vendor/kreuzberg/src/extraction/html.rs +24 -8
  24. data/vendor/kreuzberg/src/extraction/image.rs +124 -1
  25. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
  26. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
  27. data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
  28. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  29. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  31. data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
  32. data/vendor/kreuzberg/src/extractors/email.rs +29 -15
  33. data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
  34. data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
  35. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  36. data/vendor/kreuzberg/src/extractors/html.rs +29 -15
  37. data/vendor/kreuzberg/src/extractors/image.rs +25 -4
  38. data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
  39. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  40. data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  42. data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
  43. data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
  44. data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
  45. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
  47. data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
  48. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  49. data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
  50. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  51. data/vendor/kreuzberg/src/extractors/text.rs +7 -2
  52. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  53. data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
  54. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  55. data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
  56. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
  57. data/vendor/kreuzberg/src/lib.rs +10 -2
  58. data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
  59. data/vendor/kreuzberg/src/mcp/server.rs +120 -12
  60. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  61. data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
  62. data/vendor/kreuzberg/src/pdf/error.rs +8 -0
  63. data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
  64. data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
  65. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
  66. data/vendor/kreuzberg/src/pdf/table.rs +26 -2
  67. data/vendor/kreuzberg/src/pdf/text.rs +89 -7
  68. data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
  69. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  70. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
  71. data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
  72. data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
  73. data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
  74. data/vendor/kreuzberg/src/text/mod.rs +6 -0
  75. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
  76. data/vendor/kreuzberg/src/types.rs +173 -21
  77. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  78. data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
  79. data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
  80. data/vendor/kreuzberg/tests/config_features.rs +15 -1
  81. data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
  82. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
  83. data/vendor/kreuzberg/tests/email_integration.rs +2 -0
  84. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  85. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  86. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  87. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  88. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  89. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  90. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  91. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
  92. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  93. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
  95. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  96. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  97. data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
  98. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
  99. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
  100. data/vendor/kreuzberg/tests/security_validation.rs +1 -0
  101. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  102. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
  103. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
  104. data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
  105. data/vendor/rb-sys/Cargo.lock +15 -15
  106. data/vendor/rb-sys/Cargo.toml +4 -4
  107. data/vendor/rb-sys/Cargo.toml.orig +4 -4
  108. data/vendor/rb-sys/build/features.rs +5 -2
  109. data/vendor/rb-sys/build/main.rs +55 -15
  110. data/vendor/rb-sys/build/stable_api_config.rs +4 -2
  111. data/vendor/rb-sys/build/version.rs +3 -1
  112. data/vendor/rb-sys/src/lib.rs +1 -0
  113. data/vendor/rb-sys/src/macros.rs +2 -2
  114. data/vendor/rb-sys/src/special_consts.rs +1 -1
  115. data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
  116. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
  117. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
  118. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
  119. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
  120. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
  121. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
  122. data/vendor/rb-sys/src/stable_api.rs +0 -1
  123. data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
  124. metadata +13 -10
  125. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  126. data/vendor/rb-sys/.cargo-ok +0 -1
  127. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
data/vendor/kreuzberg/src/text/quality_processor.rs
@@ -0,0 +1,219 @@
+ //! Quality processing post-processor.
+ //!
+ //! This module provides a PostProcessor plugin that performs quality assessment and
+ //! text cleaning on extraction results.
+
+ use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
+ use crate::{ExtractionConfig, ExtractionResult, Result};
+ use async_trait::async_trait;
+
+ /// Post-processor that calculates quality score and cleans text.
+ ///
+ /// This processor:
+ /// - Runs in the Early processing stage
+ /// - Calculates quality score when `config.enable_quality_processing` is true
+ /// - Stores quality score in `metadata.additional["quality_score"]`
+ /// - Cleans and normalizes extracted text
+ ///
+ /// # Example
+ ///
+ /// ```rust,no_run
+ /// use kreuzberg::plugins::{Plugin, PostProcessor};
+ /// use kreuzberg::text::quality::processor::QualityProcessor;
+ ///
+ /// let processor = QualityProcessor;
+ /// assert_eq!(processor.name(), "quality-processing");
+ /// ```
+ #[derive(Debug, Clone, Copy)]
+ pub struct QualityProcessor;
+
+ impl Plugin for QualityProcessor {
+     fn name(&self) -> &str {
+         "quality-processing"
+     }
+
+     fn version(&self) -> String {
+         env!("CARGO_PKG_VERSION").to_string()
+     }
+
+     fn initialize(&self) -> Result<()> {
+         Ok(())
+     }
+
+     fn shutdown(&self) -> Result<()> {
+         Ok(())
+     }
+ }
+
+ #[cfg_attr(not(target_arch = "wasm32"), async_trait)]
+ #[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
+ impl PostProcessor for QualityProcessor {
+     async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
+         // Calculate quality score
+         let quality_score = crate::text::quality::calculate_quality_score(
+             &result.content,
+             Some(
+                 &result
+                     .metadata
+                     .additional
+                     .iter()
+                     .map(|(k, v)| (k.clone(), v.to_string()))
+                     .collect(),
+             ),
+         );
+
+         result.metadata.additional.insert(
+             "quality_score".to_string(),
+             serde_json::Value::Number(
+                 serde_json::Number::from_f64(quality_score).unwrap_or(serde_json::Number::from(0)),
+             ),
+         );
+
+         Ok(())
+     }
+
+     fn processing_stage(&self) -> ProcessingStage {
+         ProcessingStage::Early
+     }
+
+     fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
+         config.enable_quality_processing
+     }
+
+     fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
+         let text_length = result.content.len();
+         // Quality processing is relatively fast: ~1ms per 100KB
+         (text_length / 102400).max(1) as u64
+     }
+ }
+
+ #[cfg(test)]
+ mod tests {
+     use super::*;
+     use crate::types::Metadata;
+
+     #[tokio::test]
+     async fn test_quality_processor() {
+         let processor = QualityProcessor;
+         let config = ExtractionConfig {
+             enable_quality_processing: true,
+             ..Default::default()
+         };
+
+         let mut result = ExtractionResult {
+             content: "This is a well-written paragraph with proper structure. It contains multiple sentences. The quality should be good.".to_string(),
+             mime_type: "text/plain".to_string(),
+             metadata: Metadata::default(),
+             tables: vec![],
+             detected_languages: None,
+             chunks: None,
+             images: None,
+             pages: None,
+         };
+
+         processor.process(&mut result, &config).await.unwrap();
+
+         assert!(result.metadata.additional.contains_key("quality_score"));
+         let score = result.metadata.additional.get("quality_score").unwrap();
+         assert!(score.is_number());
+     }
+
+     #[tokio::test]
+     async fn test_quality_processor_disabled() {
+         let processor = QualityProcessor;
+         let config = ExtractionConfig {
+             enable_quality_processing: false,
+             ..Default::default()
+         };
+
+         let mut result = ExtractionResult {
+             content: "Some text".to_string(),
+             mime_type: "text/plain".to_string(),
+             metadata: Metadata::default(),
+             tables: vec![],
+             detected_languages: None,
+             chunks: None,
+             images: None,
+             pages: None,
+         };
+
+         // When disabled, the processor should not run, so no quality_score should be added
+         // (because should_process returns false)
+         processor.process(&mut result, &config).await.unwrap();
+     }
+
+     #[test]
+     fn test_quality_processor_plugin_interface() {
+         let processor = QualityProcessor;
+         assert_eq!(processor.name(), "quality-processing");
+         assert!(!processor.version().is_empty());
+         assert!(processor.initialize().is_ok());
+         assert!(processor.shutdown().is_ok());
+     }
+
+     #[test]
+     fn test_quality_processor_stage() {
+         let processor = QualityProcessor;
+         assert_eq!(processor.processing_stage(), ProcessingStage::Early);
+     }
+
+     #[test]
+     fn test_quality_processor_should_process() {
+         let processor = QualityProcessor;
+
+         let result = ExtractionResult {
+             content: "Sample text".to_string(),
+             mime_type: "text/plain".to_string(),
+             metadata: Metadata::default(),
+             tables: vec![],
+             detected_languages: None,
+             chunks: None,
+             images: None,
+             pages: None,
+         };
+
+         let config_with_quality = ExtractionConfig {
+             enable_quality_processing: true,
+             ..Default::default()
+         };
+         assert!(processor.should_process(&result, &config_with_quality));
+
+         let config_without_quality = ExtractionConfig {
+             enable_quality_processing: false,
+             ..Default::default()
+         };
+         assert!(!processor.should_process(&result, &config_without_quality));
+     }
+
+     #[test]
+     fn test_quality_processor_estimated_duration() {
+         let processor = QualityProcessor;
+
+         let short_result = ExtractionResult {
+             content: "Short".to_string(),
+             mime_type: "text/plain".to_string(),
+             metadata: Metadata::default(),
+             tables: vec![],
+             detected_languages: None,
+             chunks: None,
+             images: None,
+             pages: None,
+         };
+
+         let long_result = ExtractionResult {
+             content: "a".repeat(1000000),
+             mime_type: "text/plain".to_string(),
+             metadata: Metadata::default(),
+             tables: vec![],
+             detected_languages: None,
+             chunks: None,
+             images: None,
+             pages: None,
+         };
+
+         let short_duration = processor.estimated_duration_ms(&short_result);
+         let long_duration = processor.estimated_duration_ms(&long_result);
+
+         assert!(long_duration > short_duration);
+     }
+ }
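
A minimal usage sketch (not part of the diff): reading the score that QualityProcessor writes. It assumes the default pipeline registers the processor when `enable_quality_processing` is set, and that `extract_bytes(data, mime_type, &config)` keeps the argument order used in the tests elsewhere in this diff.

use kreuzberg::core::config::ExtractionConfig;
use kreuzberg::core::extractor::extract_bytes;

// Sketch: extract, then read `metadata.additional["quality_score"]`.
async fn quality_of(data: &[u8]) -> kreuzberg::Result<Option<f64>> {
    let config = ExtractionConfig {
        enable_quality_processing: true,
        ..Default::default()
    };
    let result = extract_bytes(data, "text/plain", &config).await?;
    Ok(result
        .metadata
        .additional
        .get("quality_score")
        .and_then(|v| v.as_f64()))
}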
data/vendor/kreuzberg/src/types.rs
@@ -34,6 +34,13 @@ pub struct ExtractionResult {
      /// Each image may optionally contain a nested `ocr_result` if OCR was performed.
      #[serde(skip_serializing_if = "Option::is_none")]
      pub images: Option<Vec<ExtractedImage>>,
+
+     /// Per-page content when page extraction is enabled.
+     ///
+     /// When page extraction is configured, the document is split into per-page content
+     /// with tables and images mapped to their respective pages.
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub pages: Option<Vec<PageContent>>,
  }
 
  /// Format-specific metadata (discriminated union).
@@ -62,17 +69,49 @@ pub enum FormatMetadata {
  /// via a discriminated union, and additional custom fields from postprocessors.
  #[derive(Debug, Clone, Serialize, Deserialize, Default)]
  pub struct Metadata {
-     /// Language of the document (ISO 639 code)
+     /// Document title
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub title: Option<String>,
+
+     /// Document subject or description
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub subject: Option<String>,
+
+     /// Primary author(s) - always Vec for consistency
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub authors: Option<Vec<String>>,
+
+     /// Keywords/tags - always Vec for consistency
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub keywords: Option<Vec<String>>,
+
+     /// Primary language (ISO 639 code)
      #[serde(skip_serializing_if = "Option::is_none")]
      pub language: Option<String>,
 
-     /// Document date (format varies by source)
+     /// Creation timestamp (ISO 8601 format)
      #[serde(skip_serializing_if = "Option::is_none")]
-     pub date: Option<String>,
+     pub created_at: Option<String>,
 
-     /// Document subject/description
+     /// Last modification timestamp (ISO 8601 format)
      #[serde(skip_serializing_if = "Option::is_none")]
-     pub subject: Option<String>,
+     pub modified_at: Option<String>,
+
+     /// User who created the document
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub created_by: Option<String>,
+
+     /// User who last modified the document
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub modified_by: Option<String>,
+
+     /// Page/slide/sheet structure with boundaries
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub pages: Option<PageStructure>,
+
+     /// Document date (DEPRECATED - use created_at/modified_at instead)
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub date: Option<String>,
 
      /// Format-specific metadata (discriminated union)
      ///
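
Since every new field is an `Option` behind `skip_serializing_if`, absent fields disappear from the serialized JSON entirely. A small construction sketch (illustrative only; it assumes `Metadata` is exported at `kreuzberg::types`, as the test imports in this diff suggest):

use kreuzberg::types::Metadata;

// Sketch: only the populated fields survive serialization.
fn example_metadata_json() -> String {
    let meta = Metadata {
        title: Some("Quarterly Report".to_string()),
        authors: Some(vec!["A. Author".to_string()]),
        created_at: Some("2025-01-15T09:30:00Z".to_string()), // ISO 8601, per the field docs
        ..Default::default()
    };
    // No "subject", "keywords", or deprecated "date" keys appear in the output.
    serde_json::to_string(&meta).expect("Metadata is serializable")
}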
@@ -102,6 +141,110 @@ pub struct Metadata {
      pub additional: HashMap<String, serde_json::Value>,
  }
 
+ /// Unified page structure for documents.
+ ///
+ /// Supports different page types (PDF pages, PPTX slides, Excel sheets)
+ /// with character offset boundaries for chunk-to-page mapping.
+ #[derive(Debug, Clone, Serialize, Deserialize)]
+ pub struct PageStructure {
+     /// Total number of pages/slides/sheets
+     pub total_count: usize,
+
+     /// Type of paginated unit
+     pub unit_type: PageUnitType,
+
+     /// Character offset boundaries for each page
+     ///
+     /// Maps character ranges in the extracted content to page numbers.
+     /// Used for chunk page range calculation.
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub boundaries: Option<Vec<PageBoundary>>,
+
+     /// Detailed per-page metadata (optional, only when needed)
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub pages: Option<Vec<PageInfo>>,
+ }
+
+ /// Type of paginated unit in a document.
+ ///
+ /// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
+ #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+ #[serde(rename_all = "snake_case")]
+ pub enum PageUnitType {
+     /// Standard document pages (PDF, DOCX, images)
+     Page,
+     /// Presentation slides (PPTX, ODP)
+     Slide,
+     /// Spreadsheet sheets (XLSX, ODS)
+     Sheet,
+ }
+
+ /// Byte offset boundary for a page.
+ ///
+ /// Tracks where a specific page's content starts and ends in the main content string,
+ /// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
+ /// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
+ #[derive(Debug, Clone, Serialize, Deserialize)]
+ pub struct PageBoundary {
+     /// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
+     pub byte_start: usize,
+     /// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
+     pub byte_end: usize,
+     /// Page number (1-indexed)
+     pub page_number: usize,
+ }
+
+ /// Metadata for individual page/slide/sheet.
+ ///
+ /// Captures per-page information including dimensions, content counts,
+ /// and visibility state (for presentations).
+ #[derive(Debug, Clone, Serialize, Deserialize)]
+ pub struct PageInfo {
+     /// Page number (1-indexed)
+     pub number: usize,
+
+     /// Page title (usually for presentations)
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub title: Option<String>,
+
+     /// Dimensions in points (PDF) or pixels (images): (width, height)
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub dimensions: Option<(f64, f64)>,
+
+     /// Number of images on this page
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub image_count: Option<usize>,
+
+     /// Number of tables on this page
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub table_count: Option<usize>,
+
+     /// Whether this page is hidden (e.g., in presentations)
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub hidden: Option<bool>,
+ }
+
+ /// Content for a single page/slide.
+ ///
+ /// When page extraction is enabled, documents are split into per-page content
+ /// with associated tables and images mapped to each page.
+ #[derive(Debug, Clone, Serialize, Deserialize)]
+ pub struct PageContent {
+     /// Page number (1-indexed)
+     pub page_number: usize,
+
+     /// Text content for this page
+     pub content: String,
+
+     /// Tables found on this page
+     #[serde(skip_serializing_if = "Vec::is_empty", default)]
+     pub tables: Vec<Table>,
+
+     /// Images found on this page
+     #[serde(skip_serializing_if = "Vec::is_empty", default)]
+     pub images: Vec<ExtractedImage>,
+ }
+
  /// Excel/spreadsheet metadata.
  ///
  /// Contains information about sheets in Excel, LibreOffice Calc, and other
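
An illustrative sketch of how these types fit together (assumptions: the types are exported at `kreuzberg::types`, and the "\n\n" page separator is invented for the example, not taken from the crate). Appending with `push_str` is what keeps every recorded offset on a valid UTF-8 boundary:

use kreuzberg::types::{PageBoundary, PageStructure, PageUnitType};

// Sketch: join page texts and record byte boundaries as we go.
fn join_pages(pages: &[&str]) -> (String, PageStructure) {
    let mut content = String::new();
    let mut boundaries = Vec::new();
    for (i, page) in pages.iter().enumerate() {
        if i > 0 {
            content.push_str("\n\n"); // page separator (assumed for this example)
        }
        let byte_start = content.len(); // push_str only appends whole characters
        content.push_str(page);
        boundaries.push(PageBoundary {
            byte_start,
            byte_end: content.len(), // exclusive
            page_number: i + 1,      // 1-indexed
        });
    }
    let structure = PageStructure {
        total_count: boundaries.len(),
        unit_type: PageUnitType::Page,
        boundaries: Some(boundaries),
        pages: None,
    };
    (content, structure)
}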
@@ -348,11 +491,11 @@ pub struct Chunk {
  /// Metadata about a chunk's position in the original document.
  #[derive(Debug, Clone, Serialize, Deserialize)]
  pub struct ChunkMetadata {
-     /// Character offset where this chunk starts in the original text.
-     pub char_start: usize,
+     /// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
+     pub byte_start: usize,
 
-     /// Character offset where this chunk ends in the original text.
-     pub char_end: usize,
+     /// Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
+     pub byte_end: usize,
 
      /// Number of tokens in this chunk (if available).
      ///
@@ -365,6 +508,18 @@ pub struct ChunkMetadata {
 
      /// Total number of chunks in the document.
      pub total_chunks: usize,
+
+     /// First page number this chunk spans (1-indexed).
+     ///
+     /// Only populated when page tracking is enabled in extraction configuration.
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub first_page: Option<usize>,
+
+     /// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
+     ///
+     /// Only populated when page tracking is enabled in extraction configuration.
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub last_page: Option<usize>,
  }
 
  /// Extracted image from a document.
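
A hypothetical helper (not in the crate) showing how a chunk's new `byte_start`/`byte_end` could be resolved against a sorted `PageStructure::boundaries` list to fill `first_page`/`last_page`:

use kreuzberg::types::PageBoundary;

// Sketch: map a chunk's byte span to the 1-indexed pages it touches.
fn chunk_page_range(
    boundaries: &[PageBoundary], // assumed sorted by byte_start
    byte_start: usize,
    byte_end: usize,
) -> (Option<usize>, Option<usize>) {
    let first = boundaries
        .iter()
        .find(|b| byte_start < b.byte_end) // first page ending after the chunk starts
        .map(|b| b.page_number);
    let last = boundaries
        .iter()
        .rev()
        .find(|b| byte_end > b.byte_start) // last page starting before the chunk ends
        .map(|b| b.page_number);
    (first, last)
}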
@@ -505,22 +660,22 @@ pub struct PptxExtractionResult {
      pub table_count: usize,
      /// Extracted images from the presentation
      pub images: Vec<ExtractedImage>,
+     /// Slide structure with boundaries (when page tracking is enabled)
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub page_structure: Option<PageStructure>,
+     /// Per-slide content (when page tracking is enabled)
+     #[serde(skip_serializing_if = "Option::is_none")]
+     pub page_contents: Option<Vec<PageContent>>,
  }
 
  /// PowerPoint presentation metadata.
  ///
- /// Contains document-level metadata extracted from the PPTX file.
+ /// Contains PPTX-specific metadata. Common fields like title, author, and description
+ /// are now in the base `Metadata` struct.
  #[derive(Debug, Clone, Serialize, Deserialize)]
  pub struct PptxMetadata {
-     /// Presentation title
-     pub title: Option<String>,
-     /// Author name
-     pub author: Option<String>,
-     /// Description/comments
-     pub description: Option<String>,
-     /// Summary text
-     pub summary: Option<String>,
      /// List of fonts used in the presentation
+     #[serde(skip_serializing_if = "Vec::is_empty", default)]
      pub fonts: Vec<String>,
  }
 
@@ -885,19 +1040,16 @@ mod tests {
      let json = serde_json::to_value(&metadata).unwrap();
      println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
 
-     // Check that format_type is present
      assert!(
          json.get("format_type").is_some(),
          "format_type should be present in serialized JSON"
      );
      assert_eq!(json.get("format_type").unwrap(), "text");
 
-     // Check that Text metadata fields are present
      assert_eq!(json.get("line_count").unwrap(), 1);
      assert_eq!(json.get("word_count").unwrap(), 2);
      assert_eq!(json.get("character_count").unwrap(), 13);
 
-     // Check that additional field is merged
      assert_eq!(json.get("quality_score").unwrap(), 1.0);
  }
  }
data/vendor/kreuzberg/tests/archive_integration.rs
@@ -3,6 +3,8 @@
  //! Tests for ZIP, TAR, TAR.GZ, and 7z archive extraction.
  //! Validates metadata extraction, content extraction, nested archives, and error handling.
 
+ #![cfg(feature = "archives")]
+
  use kreuzberg::core::config::ExtractionConfig;
  use kreuzberg::core::extractor::{extract_bytes, extract_bytes_sync};
  use std::io::{Cursor, Write};
data/vendor/kreuzberg/tests/batch_processing.rs
@@ -4,9 +4,9 @@
  //! Validates concurrent processing, error handling, and performance.
 
  use kreuzberg::core::config::ExtractionConfig;
- use kreuzberg::core::extractor::{
-     batch_extract_bytes, batch_extract_bytes_sync, batch_extract_file, batch_extract_file_sync,
- };
+ #[cfg(feature = "pdf")]
+ use kreuzberg::core::extractor::batch_extract_file_sync;
+ use kreuzberg::core::extractor::{batch_extract_bytes, batch_extract_bytes_sync, batch_extract_file};
  use std::path::PathBuf;
 
  mod helpers;
@@ -26,6 +26,7 @@ fn assert_text_content(actual: &str, expected: &str) {
 
  /// Test batch extraction with multiple file formats (PDF, DOCX, TXT).
  #[tokio::test]
+ #[cfg(all(feature = "pdf", feature = "office", feature = "tokio-runtime"))]
  async fn test_batch_extract_file_multiple_formats() {
      if !test_documents_available() {
          println!("Skipping test: test_documents/ directory not found");
@@ -73,6 +74,7 @@ async fn test_batch_extract_file_multiple_formats() {
 
  /// Test synchronous batch extraction variant.
  #[test]
+ #[cfg(feature = "pdf")]
  fn test_batch_extract_file_sync_variant() {
      if !test_documents_available() {
          println!("Skipping test: test_documents/ directory not found");
data/vendor/kreuzberg/tests/concurrency_stress.rs
@@ -18,7 +18,6 @@ use kreuzberg::plugins::registry::{get_document_extractor_registry, get_post_processor_registry};
  use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
  use kreuzberg::types::{ExtractionResult, Metadata};
  use std::sync::Arc;
- use std::sync::atomic::{AtomicUsize, Ordering};
 
  #[cfg(feature = "ocr")]
  use kreuzberg::core::config::OcrConfig;
@@ -52,13 +51,16 @@ fn assert_text_content(actual: &str, expected: &str) {
  async fn test_concurrent_extractions_mixed_formats() {
      let config = ExtractionConfig::default();
 
-     let test_cases = vec![
+     #[allow(unused_mut)]
+     let mut test_cases = vec![
          (b"Plain text content" as &[u8], "text/plain"),
          (b"{\"key\": \"value\"}", "application/json"),
-         (b"<root><item>XML content</item></root>", "application/xml"),
          (b"# Markdown\n\nContent here", "text/markdown"),
      ];
 
+     #[cfg(feature = "xml")]
+     test_cases.push((b"<root><item>XML content</item></root>" as &[u8], "application/xml"));
+
      let mut handles = vec![];
      for _ in 0..10 {
          for (data, mime_type) in &test_cases {
@@ -242,6 +244,7 @@ async fn test_concurrent_ocr_processing() {
  #[test]
  fn test_concurrent_ocr_cache_stress() {
      use helpers::{get_test_file_path, skip_if_missing};
+     use std::sync::atomic::Ordering;
 
      if skip_if_missing("images/ocr_image.jpg") {
          tracing::debug!("Skipping OCR cache stress test: test file not available");
@@ -366,6 +369,7 @@ async fn test_concurrent_pipeline_processing() {
          detected_languages: None,
          chunks: None,
          images: None,
+         pages: None,
      };
 
      run_pipeline(result, &config).await
@@ -486,13 +490,16 @@ async fn test_high_concurrency_stress() {
          ..Default::default()
      };
 
-     let formats = vec![
+     #[allow(unused_mut)]
+     let mut formats = vec![
          (b"Text content" as &[u8], "text/plain"),
          (b"{\"json\": true}", "application/json"),
-         (b"<xml><item>content</item></xml>", "application/xml"),
          (b"# Markdown\n\nContent", "text/markdown"),
      ];
 
+     #[cfg(feature = "xml")]
+     formats.push((b"<xml><item>content</item></xml>" as &[u8], "application/xml"));
+
      let mut handles = vec![];
      for _ in 0..100 {
          for (data, mime_type) in &formats {
@@ -516,9 +523,10 @@
          .await
          .expect("High-load stress test should complete within 60s");
 
+     let expected_successes = 100 * formats.len();
      let success_count = results.iter().filter(|r| r.is_ok()).count();
      assert_eq!(
-         success_count, 400,
+         success_count, expected_successes,
          "All extractions should succeed under stress, got {} successes",
          success_count
      );
data/vendor/kreuzberg/tests/config_features.rs
@@ -3,13 +3,19 @@
  //! Tests for chunking, language detection, caching, token reduction, and quality processing.
  //! Validates that configuration options work correctly end-to-end.
 
- use kreuzberg::core::config::{ChunkingConfig, ExtractionConfig, LanguageDetectionConfig, TokenReductionConfig};
+ #[cfg(feature = "chunking")]
+ use kreuzberg::core::config::ChunkingConfig;
+ use kreuzberg::core::config::ExtractionConfig;
+ #[cfg(feature = "language-detection")]
+ use kreuzberg::core::config::LanguageDetectionConfig;
+ use kreuzberg::core::config::TokenReductionConfig;
  use kreuzberg::core::extractor::extract_bytes;
 
  mod helpers;
 
  /// Test chunking enabled - text split into chunks.
  #[tokio::test]
+ #[cfg(feature = "chunking")]
  async fn test_chunking_enabled() {
      let config = ExtractionConfig {
          chunking: Some(ChunkingConfig {
@@ -52,6 +58,7 @@ async fn test_chunking_enabled() {
 
  /// Test chunking with overlap - overlap preserved between chunks.
  #[tokio::test]
+ #[cfg(feature = "chunking")]
  async fn test_chunking_with_overlap() {
      let config = ExtractionConfig {
          chunking: Some(ChunkingConfig {
@@ -91,6 +98,7 @@ async fn test_chunking_with_overlap() {
 
  /// Test chunking with custom sizes - custom chunk size and overlap.
  #[tokio::test]
+ #[cfg(feature = "chunking")]
  async fn test_chunking_custom_sizes() {
      let config = ExtractionConfig {
          chunking: Some(ChunkingConfig {
@@ -151,6 +159,7 @@ async fn test_chunking_disabled() {
 
  /// Test language detection for single language document.
  #[tokio::test]
+ #[cfg(feature = "language-detection")]
  async fn test_language_detection_single() {
      let config = ExtractionConfig {
          language_detection: Some(LanguageDetectionConfig {
@@ -177,6 +186,7 @@ async fn test_language_detection_single() {
  /// Test language detection for multi-language document.
  #[cfg_attr(coverage, ignore = "coverage instrumentation affects multi-language heuristics")]
  #[tokio::test]
+ #[cfg(feature = "language-detection")]
  async fn test_language_detection_multiple() {
      let config = ExtractionConfig {
          language_detection: Some(LanguageDetectionConfig {
@@ -201,6 +211,7 @@ async fn test_language_detection_multiple() {
 
  /// Test language detection with confidence threshold.
  #[tokio::test]
+ #[cfg(feature = "language-detection")]
  async fn test_language_detection_confidence() {
      let config = ExtractionConfig {
          language_detection: Some(LanguageDetectionConfig {
@@ -225,6 +236,7 @@ async fn test_language_detection_confidence() {
 
  /// Test language detection disabled.
  #[tokio::test]
+ #[cfg(feature = "language-detection")]
  async fn test_language_detection_disabled() {
      let config = ExtractionConfig {
          language_detection: Some(LanguageDetectionConfig {
@@ -397,6 +409,7 @@ async fn test_token_reduction_disabled() {
 
  /// Test quality processing enabled - quality scoring applied.
  #[tokio::test]
+ #[cfg(feature = "quality")]
  async fn test_quality_processing_enabled() {
      let config = ExtractionConfig {
          enable_quality_processing: true,
@@ -420,6 +433,7 @@ async fn test_quality_processing_enabled() {
 
  /// Test quality processing calculates score for different text quality.
  #[tokio::test]
+ #[cfg(feature = "quality")]
  async fn test_quality_threshold_filtering() {
      let config = ExtractionConfig {
          enable_quality_processing: true,
data/vendor/kreuzberg/tests/config_loading_tests.rs
@@ -389,6 +389,7 @@ extract_images = true
          "Should have language detection config"
      );
      assert!(config.images.is_some(), "Should have image extraction config");
+     #[cfg(feature = "pdf")]
      assert!(config.pdf_options.is_some(), "Should have PDF config");
  }
 