kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +5 -5
  3. data/README.md +15 -9
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
  5. data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
  6. data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
  8. data/kreuzberg.gemspec +38 -4
  9. data/lib/kreuzberg/config.rb +34 -1
  10. data/lib/kreuzberg/result.rb +77 -14
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +23 -6
  13. data/vendor/kreuzberg/Cargo.toml +32 -11
  14. data/vendor/kreuzberg/README.md +54 -8
  15. data/vendor/kreuzberg/build.rs +549 -132
  16. data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
  17. data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
  18. data/vendor/kreuzberg/src/core/config.rs +49 -1
  19. data/vendor/kreuzberg/src/core/extractor.rs +134 -2
  20. data/vendor/kreuzberg/src/core/mod.rs +4 -2
  21. data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
  22. data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
  23. data/vendor/kreuzberg/src/extraction/html.rs +24 -8
  24. data/vendor/kreuzberg/src/extraction/image.rs +124 -1
  25. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
  26. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
  27. data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
  28. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  29. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  31. data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
  32. data/vendor/kreuzberg/src/extractors/email.rs +29 -15
  33. data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
  34. data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
  35. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  36. data/vendor/kreuzberg/src/extractors/html.rs +29 -15
  37. data/vendor/kreuzberg/src/extractors/image.rs +25 -4
  38. data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
  39. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  40. data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  42. data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
  43. data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
  44. data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
  45. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
  47. data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
  48. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  49. data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
  50. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  51. data/vendor/kreuzberg/src/extractors/text.rs +7 -2
  52. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  53. data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
  54. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  55. data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
  56. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
  57. data/vendor/kreuzberg/src/lib.rs +10 -2
  58. data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
  59. data/vendor/kreuzberg/src/mcp/server.rs +120 -12
  60. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  61. data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
  62. data/vendor/kreuzberg/src/pdf/error.rs +8 -0
  63. data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
  64. data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
  65. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
  66. data/vendor/kreuzberg/src/pdf/table.rs +26 -2
  67. data/vendor/kreuzberg/src/pdf/text.rs +89 -7
  68. data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
  69. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  70. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
  71. data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
  72. data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
  73. data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
  74. data/vendor/kreuzberg/src/text/mod.rs +6 -0
  75. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
  76. data/vendor/kreuzberg/src/types.rs +173 -21
  77. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  78. data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
  79. data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
  80. data/vendor/kreuzberg/tests/config_features.rs +15 -1
  81. data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
  82. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
  83. data/vendor/kreuzberg/tests/email_integration.rs +2 -0
  84. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  85. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  86. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  87. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  88. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  89. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  90. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  91. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
  92. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  93. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
  95. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  96. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  97. data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
  98. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
  99. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
  100. data/vendor/kreuzberg/tests/security_validation.rs +1 -0
  101. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  102. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
  103. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
  104. data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
  105. data/vendor/rb-sys/Cargo.lock +15 -15
  106. data/vendor/rb-sys/Cargo.toml +4 -4
  107. data/vendor/rb-sys/Cargo.toml.orig +4 -4
  108. data/vendor/rb-sys/build/features.rs +5 -2
  109. data/vendor/rb-sys/build/main.rs +55 -15
  110. data/vendor/rb-sys/build/stable_api_config.rs +4 -2
  111. data/vendor/rb-sys/build/version.rs +3 -1
  112. data/vendor/rb-sys/src/lib.rs +1 -0
  113. data/vendor/rb-sys/src/macros.rs +2 -2
  114. data/vendor/rb-sys/src/special_consts.rs +1 -1
  115. data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
  116. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
  117. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
  118. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
  119. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
  120. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
  121. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
  122. data/vendor/rb-sys/src/stable_api.rs +0 -1
  123. data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
  124. metadata +13 -10
  125. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  126. data/vendor/rb-sys/.cargo-ok +0 -1
  127. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
@@ -1,14 +1,19 @@
1
1
  use super::error::{PdfError, Result};
2
+ use crate::core::config::PageConfig;
3
+ use crate::types::{PageBoundary, PageContent};
2
4
  use pdfium_render::prelude::*;
3
5
 
6
+ /// Result type for PDF text extraction with optional page tracking.
7
+ #[allow(dead_code)]
8
+ type PdfTextExtractionResult = (String, Option<Vec<PageBoundary>>, Option<Vec<PageContent>>);
9
+
4
10
  pub struct PdfTextExtractor {
5
11
  pdfium: Pdfium,
6
12
  }
7
13
 
8
14
  impl PdfTextExtractor {
9
15
  pub fn new() -> Result<Self> {
10
- let binding = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
11
- .or_else(|_| Pdfium::bind_to_system_library())
16
+ let binding = Pdfium::bind_to_system_library()
12
17
  .map_err(|e| PdfError::TextExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
13
18
 
14
19
  let pdfium = Pdfium::new(binding);
@@ -31,7 +36,8 @@ impl PdfTextExtractor {
31
36
  }
32
37
  })?;
33
38
 
34
- extract_text_from_pdf_document(&document)
39
+ let (content, _, _) = extract_text_from_pdf_document(&document, None)?;
40
+ Ok(content)
35
41
  }
36
42
 
37
43
  pub fn extract_text_with_passwords(&self, pdf_bytes: &[u8], passwords: &[&str]) -> Result<String> {
@@ -89,28 +95,104 @@ pub fn extract_text_from_pdf_with_passwords(pdf_bytes: &[u8], passwords: &[&str]
89
95
  extractor.extract_text_with_passwords(pdf_bytes, passwords)
90
96
  }
91
97
 
92
- pub fn extract_text_from_pdf_document(document: &PdfDocument<'_>) -> Result<String> {
98
+ /// Extract text from PDF document with optional page boundary tracking.
99
+ ///
100
+ /// # Arguments
101
+ ///
102
+ /// * `document` - The PDF document to extract text from
103
+ /// * `page_config` - Optional page configuration for boundary tracking and page markers
104
+ ///
105
+ /// # Returns
106
+ ///
107
+ /// A tuple containing:
108
+ /// - The extracted text content (String)
109
+ /// - Optional page boundaries when page tracking is enabled (Vec<PageBoundary>)
110
+ /// - Optional per-page content when extract_pages is enabled (Vec<PageContent>)
111
+ ///
112
+ /// # Implementation Details
113
+ ///
114
+ /// When page_config is None, returns fast path with (content, None, None).
115
+ /// When page_config is Some, tracks byte offsets using .len() for O(1) performance (UTF-8 valid boundaries).
116
+ pub fn extract_text_from_pdf_document(
117
+ document: &PdfDocument<'_>,
118
+ page_config: Option<&PageConfig>,
119
+ ) -> Result<PdfTextExtractionResult> {
93
120
  let page_count = document.pages().len() as usize;
94
121
 
122
+ if page_config.is_none() {
123
+ let estimated_size = page_count * 2048;
124
+ let mut content = String::with_capacity(estimated_size);
125
+
126
+ for page in document.pages().iter() {
127
+ let text = page
128
+ .text()
129
+ .map_err(|e| PdfError::TextExtractionFailed(format!("Page text extraction failed: {}", e)))?;
130
+
131
+ let page_text = text.all();
132
+
133
+ if !content.is_empty() {
134
+ content.push_str("\n\n");
135
+ }
136
+ content.push_str(&page_text);
137
+ }
138
+
139
+ content.shrink_to_fit();
140
+ return Ok((content, None, None));
141
+ }
142
+
143
+ let config = page_config.unwrap();
95
144
  let estimated_size = page_count * 2048;
96
145
  let mut content = String::with_capacity(estimated_size);
146
+ let mut boundaries = Vec::with_capacity(page_count);
147
+ let mut page_contents = if config.extract_pages {
148
+ Some(Vec::with_capacity(page_count))
149
+ } else {
150
+ None
151
+ };
152
+
153
+ for (page_idx, page) in document.pages().iter().enumerate() {
154
+ let page_number = page_idx + 1;
97
155
 
98
- for page in document.pages().iter() {
99
156
  let text = page
100
157
  .text()
101
158
  .map_err(|e| PdfError::TextExtractionFailed(format!("Page text extraction failed: {}", e)))?;
102
159
 
103
160
  let page_text = text.all();
104
161
 
105
- if !content.is_empty() {
162
+ if page_number > 1 && config.insert_page_markers {
163
+ let marker = config.marker_format.replace("{page_num}", &page_number.to_string());
164
+ content.push_str(&marker);
165
+ }
166
+
167
+ if page_number > 1 && !config.insert_page_markers && !content.is_empty() {
106
168
  content.push_str("\n\n");
107
169
  }
170
+
171
+ let byte_start = content.len();
172
+
108
173
  content.push_str(&page_text);
174
+
175
+ let byte_end = content.len();
176
+
177
+ boundaries.push(PageBoundary {
178
+ byte_start,
179
+ byte_end,
180
+ page_number,
181
+ });
182
+
183
+ if let Some(ref mut pages) = page_contents {
184
+ pages.push(PageContent {
185
+ page_number,
186
+ content: page_text,
187
+ tables: Vec::new(),
188
+ images: Vec::new(),
189
+ });
190
+ }
109
191
  }
110
192
 
111
193
  content.shrink_to_fit();
112
194
 
113
- Ok(content)
195
+ Ok((content, Some(boundaries), page_contents))
114
196
  }
115
197
 
116
198
  #[cfg(test)]
@@ -10,6 +10,9 @@ use async_trait::async_trait;
10
10
  use std::path::Path;
11
11
  use std::sync::Arc;
12
12
 
13
+ #[cfg(not(feature = "tokio-runtime"))]
14
+ use crate::KreuzbergError;
15
+
13
16
  /// Trait for document extractor plugins.
14
17
  ///
15
18
  /// Implement this trait to add support for new document formats or to override
@@ -61,6 +64,7 @@ use std::sync::Arc;
61
64
  /// detected_languages: None,
62
65
  /// chunks: None,
63
66
  /// images: None,
67
+ /// pages: None,
64
68
  /// })
65
69
  /// }
66
70
  ///
@@ -139,6 +143,7 @@ pub trait DocumentExtractor: Plugin {
139
143
  /// detected_languages: None,
140
144
  /// chunks: None,
141
145
  /// images: None,
146
+ /// pages: None,
142
147
  /// })
143
148
  /// }
144
149
  /// # }
@@ -209,14 +214,27 @@ pub trait DocumentExtractor: Plugin {
209
214
  /// detected_languages: None,
210
215
  /// chunks: None,
211
216
  /// images: None,
217
+ /// pages: None,
212
218
  /// })
213
219
  /// }
214
220
  /// # }
215
221
  /// ```
216
222
  async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
217
- use crate::core::io;
218
- let bytes = io::read_file_async(path).await?;
219
- self.extract_bytes(&bytes, mime_type, config).await
223
+ #[cfg(feature = "tokio-runtime")]
224
+ {
225
+ use crate::core::io;
226
+ let bytes = io::read_file_async(path).await?;
227
+ self.extract_bytes(&bytes, mime_type, config).await
228
+ }
229
+ #[cfg(not(feature = "tokio-runtime"))]
230
+ {
231
+ let _ = (path, mime_type, config);
232
+ // For WASM and non-tokio environments, file extraction is not supported
233
+ // through the default implementation. Implementations must provide their own.
234
+ Err(KreuzbergError::Other(
235
+ "File-based extraction requires the tokio-runtime feature".to_string(),
236
+ ))
237
+ }
220
238
  }
221
239
 
222
240
  /// Get the list of MIME types supported by this extractor.
@@ -359,6 +377,14 @@ pub trait DocumentExtractor: Plugin {
359
377
  fn can_handle(&self, _path: &Path, _mime_type: &str) -> bool {
360
378
  true
361
379
  }
380
+
381
+ /// Attempt to get a reference to this extractor as a SyncExtractor.
382
+ ///
383
+ /// Returns None if the extractor doesn't support synchronous extraction.
384
+ /// This is used for WASM and other sync-only environments.
385
+ fn as_sync_extractor(&self) -> Option<&dyn crate::extractors::SyncExtractor> {
386
+ None
387
+ }
362
388
  }
363
389
 
364
390
  /// Register a document extractor with the global registry.
@@ -412,6 +438,7 @@ pub trait DocumentExtractor: Plugin {
412
438
  /// detected_languages: None,
413
439
  /// chunks: None,
414
440
  /// images: None,
441
+ /// pages: None,
415
442
  /// })
416
443
  /// }
417
444
  ///
@@ -577,6 +604,7 @@ mod tests {
577
604
  detected_languages: None,
578
605
  chunks: None,
579
606
  images: None,
607
+ pages: None,
580
608
  })
581
609
  }
582
610
 
@@ -749,6 +777,7 @@ mod tests {
749
777
  detected_languages: None,
750
778
  chunks: None,
751
779
  images: None,
780
+ pages: None,
752
781
  })
753
782
  }
754
783
 
@@ -953,6 +982,7 @@ mod tests {
953
982
  detected_languages: None,
954
983
  chunks: None,
955
984
  images: None,
985
+ pages: None,
956
986
  })
957
987
  }
958
988
 
@@ -998,6 +1028,7 @@ mod tests {
998
1028
  detected_languages: None,
999
1029
  chunks: None,
1000
1030
  images: None,
1031
+ pages: None,
1001
1032
  })
1002
1033
  }
1003
1034
 
@@ -47,6 +47,7 @@
47
47
  //! # detected_languages: None,
48
48
  //! # chunks: None,
49
49
  //! # images: None,
50
+ //! # pages: None,
50
51
  //! # })
51
52
  //! # }
52
53
  //! # async fn extract_file(&self, _: &std::path::Path, _: &str, _: &kreuzberg::ExtractionConfig)
@@ -59,6 +60,7 @@
59
60
  //! # detected_languages: None,
60
61
  //! # chunks: None,
61
62
  //! # images: None,
63
+ //! # pages: None,
62
64
  //! # })
63
65
  //! # }
64
66
  //! # fn supported_mime_types(&self) -> &[&str] { &[] }
@@ -120,6 +122,7 @@
120
122
  //! detected_languages: None,
121
123
  //! chunks: None,
122
124
  //! images: None,
125
+ //! pages: None,
123
126
  //! })
124
127
  //! }
125
128
  //!
@@ -10,6 +10,9 @@ use async_trait::async_trait;
10
10
  use std::path::Path;
11
11
  use std::sync::Arc;
12
12
 
13
+ #[cfg(not(feature = "tokio-runtime"))]
14
+ use crate::KreuzbergError;
15
+
13
16
  /// OCR backend types.
14
17
  #[derive(Debug, Clone, Copy, PartialEq, Eq)]
15
18
  pub enum OcrBackendType {
@@ -64,6 +67,7 @@ pub enum OcrBackendType {
64
67
  /// detected_languages: None,
65
68
  /// chunks: None,
66
69
  /// images: None,
70
+ /// pages: None,
67
71
  /// })
68
72
  /// }
69
73
  ///
@@ -142,6 +146,7 @@ pub trait OcrBackend: Plugin {
142
146
  /// detected_languages: None,
143
147
  /// chunks: None,
144
148
  /// images: None,
149
+ /// pages: None,
145
150
  /// })
146
151
  /// }
147
152
  /// # }
@@ -162,9 +167,21 @@ pub trait OcrBackend: Plugin {
162
167
  ///
163
168
  /// Same as `process_image`, plus file I/O errors.
164
169
  async fn process_file(&self, path: &Path, config: &OcrConfig) -> Result<ExtractionResult> {
165
- use crate::core::io;
166
- let bytes = io::read_file_async(path).await?;
167
- self.process_image(&bytes, config).await
170
+ #[cfg(feature = "tokio-runtime")]
171
+ {
172
+ use crate::core::io;
173
+ let bytes = io::read_file_async(path).await?;
174
+ self.process_image(&bytes, config).await
175
+ }
176
+ #[cfg(not(feature = "tokio-runtime"))]
177
+ {
178
+ let _ = (path, config);
179
+ // For WASM and non-tokio environments, file-based OCR is not supported
180
+ // through the default implementation. Implementations must provide their own.
181
+ Err(KreuzbergError::Other(
182
+ "File-based OCR processing requires the tokio-runtime feature".to_string(),
183
+ ))
184
+ }
168
185
  }
169
186
 
170
187
  /// Check if this backend supports a given language code.
@@ -302,6 +319,7 @@ pub trait OcrBackend: Plugin {
302
319
  /// detected_languages: None,
303
320
  /// chunks: None,
304
321
  /// images: None,
322
+ /// pages: None,
305
323
  /// })
306
324
  /// }
307
325
  /// fn supports_language(&self, _: &str) -> bool { true }
@@ -462,6 +480,7 @@ mod tests {
462
480
  detected_languages: None,
463
481
  chunks: None,
464
482
  images: None,
483
+ pages: None,
465
484
  })
466
485
  }
467
486
 
@@ -373,6 +373,7 @@ mod tests {
373
373
  detected_languages: None,
374
374
  chunks: None,
375
375
  images: None,
376
+ pages: None,
376
377
  };
377
378
 
378
379
  let config = ExtractionConfig::default();
@@ -422,6 +423,7 @@ mod tests {
422
423
  detected_languages: None,
423
424
  chunks: None,
424
425
  images: None,
426
+ pages: None,
425
427
  };
426
428
 
427
429
  let config = ExtractionConfig::default();
@@ -488,6 +490,7 @@ mod tests {
488
490
  detected_languages: None,
489
491
  chunks: None,
490
492
  images: None,
493
+ pages: None,
491
494
  };
492
495
 
493
496
  let config = ExtractionConfig::default();
@@ -513,6 +516,7 @@ mod tests {
513
516
  additional,
514
517
  ..Default::default()
515
518
  },
519
+ pages: None,
516
520
  tables: vec![],
517
521
  detected_languages: None,
518
522
  chunks: None,
@@ -543,6 +547,7 @@ mod tests {
543
547
  detected_languages: None,
544
548
  chunks: None,
545
549
  images: None,
550
+ pages: None,
546
551
  };
547
552
 
548
553
  assert_eq!(processor.estimated_duration_ms(&result), 0);
@@ -593,6 +598,7 @@ mod tests {
593
598
  detected_languages: None,
594
599
  chunks: None,
595
600
  images: None,
601
+ pages: None,
596
602
  };
597
603
 
598
604
  let txt_result = ExtractionResult {
@@ -603,6 +609,7 @@ mod tests {
603
609
  detected_languages: None,
604
610
  chunks: None,
605
611
  images: None,
612
+ pages: None,
606
613
  };
607
614
 
608
615
  assert!(processor.should_process(&pdf_result, &config));
@@ -631,6 +638,7 @@ mod tests {
631
638
  detected_languages: None,
632
639
  chunks: None,
633
640
  images: None,
641
+ pages: None,
634
642
  };
635
643
 
636
644
  let config = ExtractionConfig::default();
@@ -661,6 +661,7 @@ mod tests {
661
661
  detected_languages: None,
662
662
  chunks: None,
663
663
  images: None,
664
+ pages: None,
664
665
  })
665
666
  }
666
667
 
@@ -705,6 +706,7 @@ mod tests {
705
706
  detected_languages: None,
706
707
  chunks: None,
707
708
  images: None,
709
+ pages: None,
708
710
  })
709
711
  }
710
712
 
@@ -489,6 +489,7 @@ mod tests {
489
489
  detected_languages: None,
490
490
  chunks: None,
491
491
  images: None,
492
+ pages: None,
492
493
  };
493
494
 
494
495
  let config = ExtractionConfig::default();
@@ -507,6 +508,7 @@ mod tests {
507
508
  detected_languages: None,
508
509
  chunks: None,
509
510
  images: None,
511
+ pages: None,
510
512
  };
511
513
 
512
514
  let config = ExtractionConfig::default();
@@ -527,6 +529,7 @@ mod tests {
527
529
  detected_languages: None,
528
530
  chunks: None,
529
531
  images: None,
532
+ pages: None,
530
533
  };
531
534
 
532
535
  let config = ExtractionConfig::default();
@@ -562,6 +565,7 @@ mod tests {
562
565
  detected_languages: None,
563
566
  chunks: None,
564
567
  images: None,
568
+ pages: None,
565
569
  };
566
570
 
567
571
  let config = ExtractionConfig::default();
@@ -609,6 +613,7 @@ mod tests {
609
613
  detected_languages: None,
610
614
  chunks: None,
611
615
  images: None,
616
+ pages: None,
612
617
  };
613
618
 
614
619
  let txt_result = ExtractionResult {
@@ -619,6 +624,7 @@ mod tests {
619
624
  detected_languages: None,
620
625
  chunks: None,
621
626
  images: None,
627
+ pages: None,
622
628
  };
623
629
 
624
630
  assert!(validator.should_validate(&pdf_result, &config));
@@ -702,6 +708,7 @@ mod tests {
702
708
  detected_languages: None,
703
709
  chunks: None,
704
710
  images: None,
711
+ pages: None,
705
712
  };
706
713
 
707
714
  let config = ExtractionConfig::default();
@@ -729,6 +736,7 @@ mod tests {
729
736
  additional,
730
737
  ..Default::default()
731
738
  },
739
+ pages: None,
732
740
  tables: vec![],
733
741
  detected_languages: None,
734
742
  chunks: None,
@@ -759,6 +767,7 @@ mod tests {
759
767
  detected_languages: None,
760
768
  chunks: None,
761
769
  images: None,
770
+ pages: None,
762
771
  };
763
772
 
764
773
  let config = ExtractionConfig::default();
@@ -787,6 +796,7 @@ mod tests {
787
796
  detected_languages: None,
788
797
  chunks: None,
789
798
  images: None,
799
+ pages: None,
790
800
  };
791
801
 
792
802
  assert!(validator.validate(&result, &config).await.is_ok());
@@ -805,6 +815,7 @@ mod tests {
805
815
  detected_languages: None,
806
816
  chunks: None,
807
817
  images: None,
818
+ pages: None,
808
819
  };
809
820
 
810
821
  let config = ExtractionConfig::default();
@@ -7,9 +7,15 @@ pub mod string_utils;
7
7
  #[cfg(feature = "quality")]
8
8
  pub mod token_reduction;
9
9
 
10
+ #[cfg(feature = "quality")]
11
+ pub mod quality_processor;
12
+
10
13
  #[cfg(feature = "quality")]
11
14
  pub use quality::{calculate_quality_score, clean_extracted_text, normalize_spaces};
12
15
 
16
+ #[cfg(feature = "quality")]
17
+ pub use quality_processor::QualityProcessor;
18
+
13
19
  #[cfg(feature = "quality")]
14
20
  pub use string_utils::{calculate_text_confidence, fix_mojibake, get_encoding_cache_key, safe_decode};
15
21