kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +5 -3
  3. data/README.md +15 -9
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
  5. data/ext/kreuzberg_rb/native/Cargo.lock +516 -324
  6. data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
  8. data/kreuzberg.gemspec +38 -4
  9. data/lib/kreuzberg/config.rb +34 -1
  10. data/lib/kreuzberg/result.rb +77 -14
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +23 -6
  13. data/vendor/kreuzberg/Cargo.toml +25 -11
  14. data/vendor/kreuzberg/README.md +13 -8
  15. data/vendor/kreuzberg/build.rs +17 -6
  16. data/vendor/kreuzberg/src/api/mod.rs +2 -0
  17. data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
  18. data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
  19. data/vendor/kreuzberg/src/core/config.rs +49 -1
  20. data/vendor/kreuzberg/src/core/extractor.rs +134 -2
  21. data/vendor/kreuzberg/src/core/mod.rs +4 -2
  22. data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
  23. data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
  24. data/vendor/kreuzberg/src/extraction/html.rs +24 -8
  25. data/vendor/kreuzberg/src/extraction/image.rs +124 -1
  26. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
  27. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
  28. data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
  29. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  31. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  32. data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
  33. data/vendor/kreuzberg/src/extractors/email.rs +29 -15
  34. data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
  35. data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
  36. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  37. data/vendor/kreuzberg/src/extractors/html.rs +29 -15
  38. data/vendor/kreuzberg/src/extractors/image.rs +25 -4
  39. data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
  40. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
  42. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  43. data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
  44. data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
  45. data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  47. data/vendor/kreuzberg/src/extractors/pdf.rs +194 -17
  48. data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
  49. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  50. data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
  51. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  52. data/vendor/kreuzberg/src/extractors/text.rs +7 -2
  53. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  54. data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
  55. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  56. data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
  57. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
  58. data/vendor/kreuzberg/src/lib.rs +10 -2
  59. data/vendor/kreuzberg/src/mcp/mod.rs +2 -0
  60. data/vendor/kreuzberg/src/mcp/server.rs +14 -12
  61. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  62. data/vendor/kreuzberg/src/pdf/error.rs +8 -0
  63. data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
  64. data/vendor/kreuzberg/src/pdf/mod.rs +14 -2
  65. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
  66. data/vendor/kreuzberg/src/pdf/table.rs +26 -2
  67. data/vendor/kreuzberg/src/pdf/text.rs +89 -7
  68. data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
  69. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  70. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
  71. data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
  72. data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
  73. data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
  74. data/vendor/kreuzberg/src/text/mod.rs +6 -0
  75. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
  76. data/vendor/kreuzberg/src/types.rs +173 -21
  77. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  78. data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
  79. data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
  80. data/vendor/kreuzberg/tests/config_features.rs +15 -1
  81. data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
  82. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
  83. data/vendor/kreuzberg/tests/email_integration.rs +2 -0
  84. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  85. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  86. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  87. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  88. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  89. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  90. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  91. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
  92. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  93. data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
  94. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  95. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
  97. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
  98. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
  99. data/vendor/kreuzberg/tests/security_validation.rs +1 -0
  100. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  101. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
  102. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
  103. data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
  104. data/vendor/rb-sys/Cargo.lock +15 -15
  105. data/vendor/rb-sys/Cargo.toml +4 -4
  106. data/vendor/rb-sys/Cargo.toml.orig +4 -4
  107. data/vendor/rb-sys/bin/release.sh +9 -8
  108. data/vendor/rb-sys/build/features.rs +5 -2
  109. data/vendor/rb-sys/build/main.rs +55 -15
  110. data/vendor/rb-sys/build/stable_api_config.rs +4 -2
  111. data/vendor/rb-sys/build/version.rs +3 -1
  112. data/vendor/rb-sys/src/macros.rs +2 -2
  113. data/vendor/rb-sys/src/special_consts.rs +1 -1
  114. data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
  115. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
  116. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
  117. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
  118. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
  119. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
  120. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
  121. data/vendor/rb-sys/src/stable_api.rs +0 -1
  122. data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
  123. metadata +11 -10
  124. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  125. data/vendor/rb-sys/.cargo-ok +0 -1
  126. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
@@ -4,8 +4,13 @@
4
4
 
5
5
  use crate::Result;
6
6
  use crate::core::config::LanguageDetectionConfig;
7
+ use once_cell::sync::Lazy;
8
+ use std::sync::Arc;
7
9
  use whatlang::{Lang, detect};
8
10
 
11
+ pub mod processor;
12
+ pub use processor::LanguageDetector;
13
+
9
14
  /// Detect languages in text using whatlang.
10
15
  ///
11
16
  /// Returns a list of detected language codes (ISO 639-3 format).
@@ -940,3 +945,41 @@ mod tests {
940
945
  assert_eq!(langs[0], "eng");
941
946
  }
942
947
  }
948
+
949
+ /// Lazy-initialized flag that ensures language detection processor is registered exactly once.
950
+ ///
951
+ /// This static is accessed on first use to automatically register the
952
+ /// language detection processor with the plugin registry.
953
+ static PROCESSOR_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_language_detection_processor);
954
+
955
+ /// Ensure the language detection processor is registered.
956
+ ///
957
+ /// This function is called automatically when needed.
958
+ /// It's safe to call multiple times - registration only happens once.
959
+ pub fn ensure_initialized() -> Result<()> {
960
+ PROCESSOR_INITIALIZED
961
+ .as_ref()
962
+ .map(|_| ())
963
+ .map_err(|e| crate::KreuzbergError::Plugin {
964
+ message: format!("Failed to register language detection processor: {}", e),
965
+ plugin_name: "language-detection".to_string(),
966
+ })
967
+ }
968
+
969
+ /// Register the language detection processor with the global registry.
970
+ ///
971
+ /// This function should be called once at application startup to register
972
+ /// the language detection post-processor.
973
+ ///
974
+ /// **Note:** This is called automatically on first use.
975
+ /// Explicit calling is optional.
976
+ pub fn register_language_detection_processor() -> Result<()> {
977
+ let registry = crate::plugins::registry::get_post_processor_registry();
978
+ let mut registry = registry
979
+ .write()
980
+ .map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
981
+
982
+ registry.register(Arc::new(LanguageDetector), 40)?;
983
+
984
+ Ok(())
985
+ }
@@ -0,0 +1,219 @@
1
+ //! Language detection post-processor.
2
+ //!
3
+ //! This module provides a PostProcessor plugin that detects languages in
4
+ //! extraction results and stores them in the result.
5
+
6
+ use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
7
+ use crate::{ExtractionConfig, ExtractionResult, KreuzbergError, Result};
8
+ use async_trait::async_trait;
9
+
10
+ /// Post-processor that detects languages in document content.
11
+ ///
12
+ /// This processor:
13
+ /// - Runs in the Early processing stage
14
+ /// - Only processes when `config.language_detection` is configured
15
+ /// - Stores detected languages in `result.detected_languages`
16
+ /// - Uses the whatlang library for detection
17
+ ///
18
+ /// # Example
19
+ ///
20
+ /// ```rust,no_run
21
+ /// use kreuzberg::plugins::{Plugin, PostProcessor};
22
+ /// use kreuzberg::language_detection::processor::LanguageDetector;
23
+ ///
24
+ /// let processor = LanguageDetector;
25
+ /// assert_eq!(processor.name(), "language-detection");
26
+ /// ```
27
+ #[derive(Debug, Clone, Copy)]
28
+ pub struct LanguageDetector;
29
+
30
+ impl Plugin for LanguageDetector {
31
+ fn name(&self) -> &str {
32
+ "language-detection"
33
+ }
34
+
35
+ fn version(&self) -> String {
36
+ env!("CARGO_PKG_VERSION").to_string()
37
+ }
38
+
39
+ fn initialize(&self) -> Result<()> {
40
+ Ok(())
41
+ }
42
+
43
+ fn shutdown(&self) -> Result<()> {
44
+ Ok(())
45
+ }
46
+ }
47
+
48
+ #[cfg_attr(not(target_arch = "wasm32"), async_trait)]
49
+ #[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
50
+ impl PostProcessor for LanguageDetector {
51
+ async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
52
+ let lang_config = match &config.language_detection {
53
+ Some(cfg) => cfg,
54
+ None => return Ok(()),
55
+ };
56
+
57
+ match super::detect_languages(&result.content, lang_config)
58
+ .map_err(|e| KreuzbergError::Other(format!("Language detection failed: {}", e)))?
59
+ {
60
+ Some(languages) => {
61
+ result.detected_languages = Some(languages);
62
+ }
63
+ None => {
64
+ result.detected_languages = None;
65
+ }
66
+ }
67
+
68
+ Ok(())
69
+ }
70
+
71
+ fn processing_stage(&self) -> ProcessingStage {
72
+ ProcessingStage::Early
73
+ }
74
+
75
+ fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
76
+ config.language_detection.is_some()
77
+ }
78
+
79
+ fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
80
+ let text_length = result.content.len();
81
+ // Language detection is relatively fast: ~1ms per 1KB
82
+ (text_length / 1024).max(1) as u64
83
+ }
84
+ }
85
+
86
+ #[cfg(test)]
87
+ mod tests {
88
+ use super::*;
89
+ use crate::core::config::LanguageDetectionConfig;
90
+ use crate::types::Metadata;
91
+
92
+ #[tokio::test]
93
+ async fn test_language_detector_processor() {
94
+ let processor = LanguageDetector;
95
+ let config = ExtractionConfig {
96
+ language_detection: Some(LanguageDetectionConfig {
97
+ enabled: true,
98
+ min_confidence: 0.8,
99
+ detect_multiple: false,
100
+ }),
101
+ ..Default::default()
102
+ };
103
+
104
+ let mut result = ExtractionResult {
105
+ content: "Hello world! This is a test of the language detection system.".to_string(),
106
+ mime_type: "text/plain".to_string(),
107
+ metadata: Metadata::default(),
108
+ tables: vec![],
109
+ detected_languages: None,
110
+ chunks: None,
111
+ images: None,
112
+ pages: None,
113
+ };
114
+
115
+ processor.process(&mut result, &config).await.unwrap();
116
+
117
+ assert!(result.detected_languages.is_some());
118
+ let langs = result.detected_languages.unwrap();
119
+ assert!(!langs.is_empty());
120
+ assert_eq!(langs[0], "eng");
121
+ }
122
+
123
+ #[tokio::test]
124
+ async fn test_language_detector_no_config() {
125
+ let processor = LanguageDetector;
126
+ let config = ExtractionConfig::default();
127
+
128
+ let mut result = ExtractionResult {
129
+ content: "Hello world!".to_string(),
130
+ mime_type: "text/plain".to_string(),
131
+ metadata: Metadata::default(),
132
+ tables: vec![],
133
+ detected_languages: None,
134
+ chunks: None,
135
+ images: None,
136
+ pages: None,
137
+ };
138
+
139
+ processor.process(&mut result, &config).await.unwrap();
140
+
141
+ assert!(result.detected_languages.is_none());
142
+ }
143
+
144
+ #[test]
145
+ fn test_language_detector_plugin_interface() {
146
+ let processor = LanguageDetector;
147
+ assert_eq!(processor.name(), "language-detection");
148
+ assert!(!processor.version().is_empty());
149
+ assert!(processor.initialize().is_ok());
150
+ assert!(processor.shutdown().is_ok());
151
+ }
152
+
153
+ #[test]
154
+ fn test_language_detector_stage() {
155
+ let processor = LanguageDetector;
156
+ assert_eq!(processor.processing_stage(), ProcessingStage::Early);
157
+ }
158
+
159
+ #[test]
160
+ fn test_language_detector_should_process() {
161
+ let processor = LanguageDetector;
162
+
163
+ let result = ExtractionResult {
164
+ content: "Sample text".to_string(),
165
+ mime_type: "text/plain".to_string(),
166
+ metadata: Metadata::default(),
167
+ tables: vec![],
168
+ detected_languages: None,
169
+ chunks: None,
170
+ images: None,
171
+ pages: None,
172
+ };
173
+
174
+ let config_with_lang = ExtractionConfig {
175
+ language_detection: Some(LanguageDetectionConfig {
176
+ enabled: true,
177
+ min_confidence: 0.8,
178
+ detect_multiple: false,
179
+ }),
180
+ ..Default::default()
181
+ };
182
+ assert!(processor.should_process(&result, &config_with_lang));
183
+
184
+ let config_without_lang = ExtractionConfig::default();
185
+ assert!(!processor.should_process(&result, &config_without_lang));
186
+ }
187
+
188
+ #[test]
189
+ fn test_language_detector_estimated_duration() {
190
+ let processor = LanguageDetector;
191
+
192
+ let short_result = ExtractionResult {
193
+ content: "Short".to_string(),
194
+ mime_type: "text/plain".to_string(),
195
+ metadata: Metadata::default(),
196
+ tables: vec![],
197
+ detected_languages: None,
198
+ chunks: None,
199
+ images: None,
200
+ pages: None,
201
+ };
202
+
203
+ let long_result = ExtractionResult {
204
+ content: "a".repeat(10000),
205
+ mime_type: "text/plain".to_string(),
206
+ metadata: Metadata::default(),
207
+ tables: vec![],
208
+ detected_languages: None,
209
+ chunks: None,
210
+ images: None,
211
+ pages: None,
212
+ };
213
+
214
+ let short_duration = processor.estimated_duration_ms(&short_result);
215
+ let long_duration = processor.estimated_duration_ms(&long_result);
216
+
217
+ assert!(long_duration > short_duration);
218
+ }
219
+ }
@@ -84,13 +84,21 @@ pub use types::*;
84
84
  pub use core::extractor::{batch_extract_bytes, batch_extract_file};
85
85
  pub use core::extractor::{extract_bytes, extract_file};
86
86
 
87
- pub use core::extractor::{batch_extract_bytes_sync, batch_extract_file_sync, extract_bytes_sync, extract_file_sync};
87
+ // Available in WASM (bytes-based)
88
+ pub use core::extractor::{batch_extract_bytes_sync, extract_bytes_sync};
89
+
90
+ // Only available with filesystem access
91
+ #[cfg(feature = "tokio-runtime")]
92
+ pub use core::extractor::{batch_extract_file_sync, extract_file_sync};
88
93
 
89
94
  pub use core::config::{
90
95
  ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig, ImageExtractionConfig,
91
- LanguageDetectionConfig, OcrConfig, PdfConfig, PostProcessorConfig, TokenReductionConfig,
96
+ LanguageDetectionConfig, OcrConfig, PostProcessorConfig, TokenReductionConfig,
92
97
  };
93
98
 
99
+ #[cfg(feature = "pdf")]
100
+ pub use core::config::PdfConfig;
101
+
94
102
  pub use core::mime::{
95
103
  DOCX_MIME_TYPE, EXCEL_MIME_TYPE, HTML_MIME_TYPE, JSON_MIME_TYPE, MARKDOWN_MIME_TYPE, PDF_MIME_TYPE,
96
104
  PLAIN_TEXT_MIME_TYPE, POWER_POINT_MIME_TYPE, XML_MIME_TYPE, detect_mime_type, detect_mime_type_from_bytes,
@@ -1,3 +1,5 @@
1
+ #![cfg(feature = "mcp")]
2
+
1
3
  //! Model Context Protocol (MCP) server implementation.
2
4
  //!
3
5
  //! Provides an MCP server that exposes Kreuzberg's document extraction
@@ -750,6 +750,7 @@ mod tests {
750
750
  detected_languages: None,
751
751
  chunks: None,
752
752
  images: None,
753
+ pages: None,
753
754
  };
754
755
 
755
756
  let formatted = format_extraction_result(&result);
@@ -786,6 +787,7 @@ mod tests {
786
787
  detected_languages: None,
787
788
  chunks: None,
788
789
  images: None,
790
+ pages: None,
789
791
  };
790
792
 
791
793
  let formatted = format_extraction_result(&result);
@@ -807,6 +809,7 @@ mod tests {
807
809
  detected_languages: None,
808
810
  chunks: None,
809
811
  images: None,
812
+ pages: None,
810
813
  };
811
814
 
812
815
  let formatted = format_extraction_result(&result);
@@ -825,6 +828,7 @@ mod tests {
825
828
  detected_languages: None,
826
829
  chunks: None,
827
830
  images: None,
831
+ pages: None,
828
832
  };
829
833
 
830
834
  let formatted = format_extraction_result(&result);
@@ -1622,19 +1626,17 @@ mod tests {
1622
1626
 
1623
1627
  let result = server.batch_extract_files(Parameters(params)).await;
1624
1628
 
1625
- if result.is_ok() {
1626
- let call_result = result.unwrap();
1627
- if let Some(content) = call_result.content.first()
1628
- && let RawContent::Text(text) = &content.raw
1629
- {
1630
- assert!(text.text.contains("Document 1"));
1631
- assert!(text.text.contains("Document 2"));
1629
+ if let Ok(call_result) = result
1630
+ && let Some(content) = call_result.content.first()
1631
+ && let RawContent::Text(text) = &content.raw
1632
+ {
1633
+ assert!(text.text.contains("Document 1"));
1634
+ assert!(text.text.contains("Document 2"));
1632
1635
 
1633
- let doc1_pos = text.text.find("Document 1");
1634
- let doc2_pos = text.text.find("Document 2");
1635
- if let (Some(pos1), Some(pos2)) = (doc1_pos, doc2_pos) {
1636
- assert!(pos1 < pos2, "Documents should be in order");
1637
- }
1636
+ let doc1_pos = text.text.find("Document 1");
1637
+ let doc2_pos = text.text.find("Document 2");
1638
+ if let (Some(pos1), Some(pos2)) = (doc1_pos, doc2_pos) {
1639
+ assert!(pos1 < pos2, "Documents should be in order");
1638
1640
  }
1639
1641
  }
1640
1642
  }
@@ -161,6 +161,7 @@ impl OcrBackend for TesseractBackend {
161
161
  content: ocr_result.content,
162
162
  mime_type: ocr_result.mime_type,
163
163
  metadata,
164
+ pages: None,
164
165
  tables: ocr_result
165
166
  .tables
166
167
  .into_iter()
@@ -214,6 +215,7 @@ impl OcrBackend for TesseractBackend {
214
215
  content: ocr_result.content,
215
216
  mime_type: ocr_result.mime_type,
216
217
  metadata,
218
+ pages: None,
217
219
  tables: ocr_result
218
220
  .tables
219
221
  .into_iter()
@@ -10,6 +10,7 @@ pub enum PdfError {
10
10
  TextExtractionFailed(String),
11
11
  RenderingFailed(String),
12
12
  MetadataExtractionFailed(String),
13
+ ExtractionFailed(String),
13
14
  IOError(String),
14
15
  }
15
16
 
@@ -28,6 +29,7 @@ impl fmt::Display for PdfError {
28
29
  PdfError::MetadataExtractionFailed(msg) => {
29
30
  write!(f, "Metadata extraction failed: {}", msg)
30
31
  }
32
+ PdfError::ExtractionFailed(msg) => write!(f, "Extraction failed: {}", msg),
31
33
  PdfError::IOError(msg) => write!(f, "I/O error: {}", msg),
32
34
  }
33
35
  }
@@ -119,4 +121,10 @@ mod tests {
119
121
  let err2 = err1.clone();
120
122
  assert_eq!(err1.to_string(), err2.to_string());
121
123
  }
124
+
125
+ #[test]
126
+ fn test_extraction_failed_error() {
127
+ let err = PdfError::ExtractionFailed("page data mismatch".to_string());
128
+ assert_eq!(err.to_string(), "Extraction failed: page data mismatch");
129
+ }
122
130
  }