kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +5 -3
  3. data/README.md +15 -9
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
  5. data/ext/kreuzberg_rb/native/Cargo.lock +516 -324
  6. data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
  8. data/kreuzberg.gemspec +38 -4
  9. data/lib/kreuzberg/config.rb +34 -1
  10. data/lib/kreuzberg/result.rb +77 -14
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +23 -6
  13. data/vendor/kreuzberg/Cargo.toml +25 -11
  14. data/vendor/kreuzberg/README.md +13 -8
  15. data/vendor/kreuzberg/build.rs +17 -6
  16. data/vendor/kreuzberg/src/api/mod.rs +2 -0
  17. data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
  18. data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
  19. data/vendor/kreuzberg/src/core/config.rs +49 -1
  20. data/vendor/kreuzberg/src/core/extractor.rs +134 -2
  21. data/vendor/kreuzberg/src/core/mod.rs +4 -2
  22. data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
  23. data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
  24. data/vendor/kreuzberg/src/extraction/html.rs +24 -8
  25. data/vendor/kreuzberg/src/extraction/image.rs +124 -1
  26. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
  27. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
  28. data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
  29. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  31. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  32. data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
  33. data/vendor/kreuzberg/src/extractors/email.rs +29 -15
  34. data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
  35. data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
  36. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  37. data/vendor/kreuzberg/src/extractors/html.rs +29 -15
  38. data/vendor/kreuzberg/src/extractors/image.rs +25 -4
  39. data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
  40. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
  42. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  43. data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
  44. data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
  45. data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  47. data/vendor/kreuzberg/src/extractors/pdf.rs +194 -17
  48. data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
  49. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  50. data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
  51. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  52. data/vendor/kreuzberg/src/extractors/text.rs +7 -2
  53. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  54. data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
  55. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  56. data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
  57. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
  58. data/vendor/kreuzberg/src/lib.rs +10 -2
  59. data/vendor/kreuzberg/src/mcp/mod.rs +2 -0
  60. data/vendor/kreuzberg/src/mcp/server.rs +14 -12
  61. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  62. data/vendor/kreuzberg/src/pdf/error.rs +8 -0
  63. data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
  64. data/vendor/kreuzberg/src/pdf/mod.rs +14 -2
  65. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
  66. data/vendor/kreuzberg/src/pdf/table.rs +26 -2
  67. data/vendor/kreuzberg/src/pdf/text.rs +89 -7
  68. data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
  69. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  70. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
  71. data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
  72. data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
  73. data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
  74. data/vendor/kreuzberg/src/text/mod.rs +6 -0
  75. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
  76. data/vendor/kreuzberg/src/types.rs +173 -21
  77. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  78. data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
  79. data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
  80. data/vendor/kreuzberg/tests/config_features.rs +15 -1
  81. data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
  82. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
  83. data/vendor/kreuzberg/tests/email_integration.rs +2 -0
  84. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  85. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  86. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  87. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  88. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  89. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  90. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  91. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
  92. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  93. data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
  94. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  95. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
  97. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
  98. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
  99. data/vendor/kreuzberg/tests/security_validation.rs +1 -0
  100. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  101. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
  102. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
  103. data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
  104. data/vendor/rb-sys/Cargo.lock +15 -15
  105. data/vendor/rb-sys/Cargo.toml +4 -4
  106. data/vendor/rb-sys/Cargo.toml.orig +4 -4
  107. data/vendor/rb-sys/bin/release.sh +9 -8
  108. data/vendor/rb-sys/build/features.rs +5 -2
  109. data/vendor/rb-sys/build/main.rs +55 -15
  110. data/vendor/rb-sys/build/stable_api_config.rs +4 -2
  111. data/vendor/rb-sys/build/version.rs +3 -1
  112. data/vendor/rb-sys/src/macros.rs +2 -2
  113. data/vendor/rb-sys/src/special_consts.rs +1 -1
  114. data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
  115. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
  116. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
  117. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
  118. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
  119. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
  120. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
  121. data/vendor/rb-sys/src/stable_api.rs +0 -1
  122. data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
  123. metadata +11 -10
  124. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  125. data/vendor/rb-sys/.cargo-ok +0 -1
  126. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
@@ -23,6 +23,7 @@ use crate::types::{ExtractionResult, Metadata, Table};
23
23
  use async_trait::async_trait;
24
24
  use quick_xml::Reader;
25
25
  use quick_xml::events::Event;
26
+ #[cfg(feature = "tokio-runtime")]
26
27
  use std::path::Path;
27
28
 
28
29
  /// Strip namespace prefix from XML tag names.
@@ -403,6 +404,7 @@ impl DocumentExtractor for DocbookExtractor {
403
404
  detected_languages: None,
404
405
  chunks: None,
405
406
  images: None,
407
+ pages: None,
406
408
  })
407
409
  }
408
410
 
@@ -1,3 +1,5 @@
1
+ #![cfg(all(feature = "tokio-runtime", feature = "office"))]
2
+
1
3
  //! DOCX extractor using docx-lite for high-performance text extraction.
2
4
  //!
3
5
  //! Supports: Microsoft Word (.docx)
@@ -6,7 +8,7 @@ use crate::Result;
6
8
  use crate::core::config::ExtractionConfig;
7
9
  use crate::extraction::{cells_to_markdown, office_metadata};
8
10
  use crate::plugins::{DocumentExtractor, Plugin};
9
- use crate::types::{ExtractionResult, Metadata, Table};
11
+ use crate::types::{ExtractionResult, Metadata, PageBoundary, PageInfo, PageStructure, PageUnitType, Table};
10
12
  use async_trait::async_trait;
11
13
  use std::io::Cursor;
12
14
 
@@ -116,26 +118,30 @@ impl DocumentExtractor for DocxExtractor {
116
118
  mime_type: &str,
117
119
  _config: &ExtractionConfig,
118
120
  ) -> Result<ExtractionResult> {
119
- let (text, tables) = if crate::core::batch_mode::is_batch_mode() {
121
+ let (text, tables, page_boundaries) = if crate::core::batch_mode::is_batch_mode() {
120
122
  let content_owned = content.to_vec();
121
123
  let span = tracing::Span::current();
122
- tokio::task::spawn_blocking(move || -> crate::error::Result<(String, Vec<Table>)> {
123
- let _guard = span.entered();
124
- let cursor = Cursor::new(&content_owned);
125
- let doc = docx_lite::parse_document(cursor)
126
- .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
124
+ tokio::task::spawn_blocking(
125
+ move || -> crate::error::Result<(String, Vec<Table>, Option<Vec<PageBoundary>>)> {
126
+ let _guard = span.entered();
127
+ let cursor = Cursor::new(&content_owned);
128
+ let doc = docx_lite::parse_document(cursor)
129
+ .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX parsing failed: {}", e)))?;
127
130
 
128
- let text = doc.extract_text();
131
+ let text = doc.extract_text();
129
132
 
130
- let tables: Vec<Table> = doc
131
- .tables
132
- .iter()
133
- .enumerate()
134
- .map(|(idx, table)| convert_docx_table_to_table(table, idx))
135
- .collect();
133
+ let tables: Vec<Table> = doc
134
+ .tables
135
+ .iter()
136
+ .enumerate()
137
+ .map(|(idx, table)| convert_docx_table_to_table(table, idx))
138
+ .collect();
136
139
 
137
- Ok((text, tables))
138
- })
140
+ let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(&content_owned)?;
141
+
142
+ Ok((text, tables, page_boundaries))
143
+ },
144
+ )
139
145
  .await
140
146
  .map_err(|e| crate::error::KreuzbergError::parsing(format!("DOCX extraction task failed: {}", e)))??
141
147
  } else {
@@ -152,7 +158,9 @@ impl DocumentExtractor for DocxExtractor {
152
158
  .map(|(idx, table)| convert_docx_table_to_table(table, idx))
153
159
  .collect();
154
160
 
155
- (text, tables)
161
+ let page_boundaries = crate::extraction::docx::detect_page_breaks_from_docx(content)?;
162
+
163
+ (text, tables, page_boundaries)
156
164
  };
157
165
 
158
166
  let mut archive = if crate::core::batch_mode::is_batch_mode() {
@@ -260,13 +268,38 @@ impl DocumentExtractor for DocxExtractor {
260
268
  }
261
269
  }
262
270
 
271
+ let page_structure = if let Some(boundaries) = page_boundaries {
272
+ let total_count = boundaries.len();
273
+ Some(PageStructure {
274
+ total_count,
275
+ unit_type: PageUnitType::Page,
276
+ boundaries: Some(boundaries),
277
+ pages: Some(
278
+ (1..=total_count)
279
+ .map(|page_num| PageInfo {
280
+ number: page_num,
281
+ title: None,
282
+ dimensions: None,
283
+ image_count: None,
284
+ table_count: None,
285
+ hidden: None,
286
+ })
287
+ .collect(),
288
+ ),
289
+ })
290
+ } else {
291
+ None
292
+ };
293
+
263
294
  Ok(ExtractionResult {
264
295
  content: text,
265
296
  mime_type: mime_type.to_string(),
266
297
  metadata: Metadata {
298
+ pages: page_structure,
267
299
  additional: metadata_map,
268
300
  ..Default::default()
269
301
  },
302
+ pages: None,
270
303
  tables,
271
304
  detected_languages: None,
272
305
  chunks: None,
@@ -2,9 +2,11 @@
2
2
 
3
3
  use crate::Result;
4
4
  use crate::core::config::ExtractionConfig;
5
+ use crate::extractors::SyncExtractor;
5
6
  use crate::plugins::{DocumentExtractor, Plugin};
6
7
  use crate::types::{EmailMetadata, ExtractionResult, Metadata};
7
8
  use async_trait::async_trait;
9
+ #[cfg(feature = "tokio-runtime")]
8
10
  use std::path::Path;
9
11
 
10
12
  /// Email message extractor.
@@ -42,21 +44,8 @@ impl Plugin for EmailExtractor {
42
44
  }
43
45
  }
44
46
 
45
- #[async_trait]
46
- impl DocumentExtractor for EmailExtractor {
47
- #[cfg_attr(feature = "otel", tracing::instrument(
48
- skip(self, content, _config),
49
- fields(
50
- extractor.name = self.name(),
51
- content.size_bytes = content.len(),
52
- )
53
- ))]
54
- async fn extract_bytes(
55
- &self,
56
- content: &[u8],
57
- mime_type: &str,
58
- _config: &ExtractionConfig,
59
- ) -> Result<ExtractionResult> {
47
+ impl SyncExtractor for EmailExtractor {
48
+ fn extract_sync(&self, content: &[u8], mime_type: &str, _config: &ExtractionConfig) -> Result<ExtractionResult> {
60
49
  let email_result = crate::extraction::email::extract_email_content(content, mime_type)?;
61
50
 
62
51
  let text = crate::extraction::email::build_email_text_output(&email_result);
@@ -96,8 +85,28 @@ impl DocumentExtractor for EmailExtractor {
96
85
  detected_languages: None,
97
86
  chunks: None,
98
87
  images: None,
88
+ pages: None,
99
89
  })
100
90
  }
91
+ }
92
+
93
+ #[async_trait]
94
+ impl DocumentExtractor for EmailExtractor {
95
+ #[cfg_attr(feature = "otel", tracing::instrument(
96
+ skip(self, content, config),
97
+ fields(
98
+ extractor.name = self.name(),
99
+ content.size_bytes = content.len(),
100
+ )
101
+ ))]
102
+ async fn extract_bytes(
103
+ &self,
104
+ content: &[u8],
105
+ mime_type: &str,
106
+ config: &ExtractionConfig,
107
+ ) -> Result<ExtractionResult> {
108
+ self.extract_sync(content, mime_type, config)
109
+ }
101
110
 
102
111
  #[cfg(feature = "tokio-runtime")]
103
112
  #[cfg_attr(feature = "otel", tracing::instrument(
@@ -106,6 +115,7 @@ impl DocumentExtractor for EmailExtractor {
106
115
  extractor.name = self.name(),
107
116
  )
108
117
  ))]
118
+ #[cfg(feature = "tokio-runtime")]
109
119
  async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
110
120
  let bytes = tokio::fs::read(path).await?;
111
121
  self.extract_bytes(&bytes, mime_type, config).await
@@ -118,6 +128,10 @@ impl DocumentExtractor for EmailExtractor {
118
128
  fn priority(&self) -> i32 {
119
129
  50
120
130
  }
131
+
132
+ fn as_sync_extractor(&self) -> Option<&dyn crate::extractors::SyncExtractor> {
133
+ Some(self)
134
+ }
121
135
  }
122
136
 
123
137
  #[cfg(test)]
@@ -574,6 +574,7 @@ impl DocumentExtractor for EpubExtractor {
574
574
  additional: metadata_map,
575
575
  ..Default::default()
576
576
  },
577
+ pages: None,
577
578
  tables: vec![],
578
579
  detected_languages: None,
579
580
  chunks: None,
@@ -150,6 +150,7 @@ impl DocumentExtractor for ExcelExtractor {
150
150
  additional,
151
151
  ..Default::default()
152
152
  },
153
+ pages: None,
153
154
  tables,
154
155
  detected_languages: None,
155
156
  chunks: None,
@@ -193,6 +194,7 @@ impl DocumentExtractor for ExcelExtractor {
193
194
  additional,
194
195
  ..Default::default()
195
196
  },
197
+ pages: None,
196
198
  tables,
197
199
  detected_languages: None,
198
200
  chunks: None,
@@ -440,6 +440,7 @@ impl DocumentExtractor for FictionBookExtractor {
440
440
  detected_languages: None,
441
441
  chunks: None,
442
442
  images: None,
443
+ pages: None,
443
444
  })
444
445
  }
445
446
 
@@ -2,9 +2,11 @@
2
2
 
3
3
  use crate::Result;
4
4
  use crate::core::config::ExtractionConfig;
5
+ use crate::extractors::SyncExtractor;
5
6
  use crate::plugins::{DocumentExtractor, Plugin};
6
7
  use crate::types::{ExtractionResult, Metadata, Table};
7
8
  use async_trait::async_trait;
9
+ #[cfg(feature = "tokio-runtime")]
8
10
  use std::path::Path;
9
11
 
10
12
  // NOTE: scraper dependency has been removed in favor of html-to-markdown-rs
@@ -193,21 +195,8 @@ impl Plugin for HtmlExtractor {
193
195
  }
194
196
  }
195
197
 
196
- #[async_trait]
197
- impl DocumentExtractor for HtmlExtractor {
198
- #[cfg_attr(feature = "otel", tracing::instrument(
199
- skip(self, content, config),
200
- fields(
201
- extractor.name = self.name(),
202
- content.size_bytes = content.len(),
203
- )
204
- ))]
205
- async fn extract_bytes(
206
- &self,
207
- content: &[u8],
208
- mime_type: &str,
209
- config: &ExtractionConfig,
210
- ) -> Result<ExtractionResult> {
198
+ impl SyncExtractor for HtmlExtractor {
199
+ fn extract_sync(&self, content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
211
200
  let html = std::str::from_utf8(content)
212
201
  .map(|s| s.to_string())
213
202
  .unwrap_or_else(|_| String::from_utf8_lossy(content).to_string());
@@ -225,12 +214,32 @@ impl DocumentExtractor for HtmlExtractor {
225
214
  format: html_metadata.map(|m| crate::types::FormatMetadata::Html(Box::new(m))),
226
215
  ..Default::default()
227
216
  },
217
+ pages: None,
228
218
  tables,
229
219
  detected_languages: None,
230
220
  chunks: None,
231
221
  images: None,
232
222
  })
233
223
  }
224
+ }
225
+
226
+ #[async_trait]
227
+ impl DocumentExtractor for HtmlExtractor {
228
+ #[cfg_attr(feature = "otel", tracing::instrument(
229
+ skip(self, content, config),
230
+ fields(
231
+ extractor.name = self.name(),
232
+ content.size_bytes = content.len(),
233
+ )
234
+ ))]
235
+ async fn extract_bytes(
236
+ &self,
237
+ content: &[u8],
238
+ mime_type: &str,
239
+ config: &ExtractionConfig,
240
+ ) -> Result<ExtractionResult> {
241
+ self.extract_sync(content, mime_type, config)
242
+ }
234
243
 
235
244
  #[cfg(feature = "tokio-runtime")]
236
245
  #[cfg_attr(feature = "otel", tracing::instrument(
@@ -239,6 +248,7 @@ impl DocumentExtractor for HtmlExtractor {
239
248
  extractor.name = self.name(),
240
249
  )
241
250
  ))]
251
+ #[cfg(feature = "tokio-runtime")]
242
252
  async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
243
253
  let bytes = tokio::fs::read(path).await?;
244
254
  self.extract_bytes(&bytes, mime_type, config).await
@@ -251,6 +261,10 @@ impl DocumentExtractor for HtmlExtractor {
251
261
  fn priority(&self) -> i32 {
252
262
  50
253
263
  }
264
+
265
+ fn as_sync_extractor(&self) -> Option<&dyn crate::extractors::SyncExtractor> {
266
+ Some(self)
267
+ }
254
268
  }
255
269
 
256
270
  #[cfg(test)]
@@ -20,9 +20,14 @@ impl ImageExtractor {
20
20
  Self
21
21
  }
22
22
 
23
- /// Extract text from image using OCR.
23
+ /// Extract text from image using OCR with optional page tracking for multi-frame TIFFs.
24
24
  #[cfg(feature = "ocr")]
25
- async fn extract_with_ocr(&self, content: &[u8], config: &ExtractionConfig) -> Result<ExtractionResult> {
25
+ async fn extract_with_ocr(
26
+ &self,
27
+ content: &[u8],
28
+ mime_type: &str,
29
+ config: &ExtractionConfig,
30
+ ) -> Result<ExtractionResult> {
26
31
  use crate::plugins::registry::get_ocr_backend_registry;
27
32
 
28
33
  let ocr_config = config.ocr.as_ref().ok_or_else(|| crate::KreuzbergError::Parsing {
@@ -39,7 +44,21 @@ impl ImageExtractor {
39
44
  registry.get(&ocr_config.backend)?
40
45
  };
41
46
 
42
- backend.process_image(content, ocr_config).await
47
+ let ocr_result = backend.process_image(content, ocr_config).await?;
48
+
49
+ let ocr_text = ocr_result.content.clone();
50
+ let ocr_extraction_result = crate::extraction::image::extract_text_from_image_with_ocr(
51
+ content,
52
+ mime_type,
53
+ ocr_text,
54
+ config.pages.as_ref(),
55
+ )?;
56
+
57
+ let mut result = ocr_result;
58
+ result.content = ocr_extraction_result.content;
59
+ result.pages = ocr_extraction_result.page_contents;
60
+
61
+ Ok(result)
43
62
  }
44
63
  }
45
64
 
@@ -102,7 +121,7 @@ impl DocumentExtractor for ImageExtractor {
102
121
  if config.ocr.is_some() {
103
122
  #[cfg(feature = "ocr")]
104
123
  {
105
- let mut ocr_result = self.extract_with_ocr(content, config).await?;
124
+ let mut ocr_result = self.extract_with_ocr(content, mime_type, config).await?;
106
125
 
107
126
  ocr_result.metadata.format = Some(crate::types::FormatMetadata::Image(image_metadata));
108
127
  ocr_result.mime_type = mime_type.to_string();
@@ -123,6 +142,7 @@ impl DocumentExtractor for ImageExtractor {
123
142
  format: Some(crate::types::FormatMetadata::Image(image_metadata)),
124
143
  ..Default::default()
125
144
  },
145
+ pages: None,
126
146
  tables: vec![],
127
147
  detected_languages: None,
128
148
  chunks: None,
@@ -141,6 +161,7 @@ impl DocumentExtractor for ImageExtractor {
141
161
  format: Some(crate::types::FormatMetadata::Image(image_metadata)),
142
162
  ..Default::default()
143
163
  },
164
+ pages: None,
144
165
  tables: vec![],
145
166
  detected_languages: None,
146
167
  chunks: None,
@@ -21,6 +21,7 @@ use crate::types::{ExtractionResult, Metadata, Table};
21
21
  use async_trait::async_trait;
22
22
  use quick_xml::Reader;
23
23
  use quick_xml::events::Event;
24
+ #[cfg(feature = "tokio-runtime")]
24
25
  use std::path::Path;
25
26
 
26
27
  /// JATS document extractor.
@@ -569,6 +570,7 @@ impl DocumentExtractor for JatsExtractor {
569
570
  detected_languages: None,
570
571
  chunks: None,
571
572
  images: None,
573
+ pages: None,
572
574
  })
573
575
  }
574
576
 
@@ -582,6 +584,7 @@ impl DocumentExtractor for JatsExtractor {
582
584
  )
583
585
  )
584
586
  )]
587
+ #[cfg(feature = "tokio-runtime")]
585
588
  async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
586
589
  let bytes = tokio::fs::read(path).await?;
587
590
  self.extract_bytes(&bytes, mime_type, config).await
@@ -336,6 +336,7 @@ impl DocumentExtractor for JupyterExtractor {
336
336
  additional: metadata_additional,
337
337
  ..Default::default()
338
338
  },
339
+ pages: None,
339
340
  tables: vec![],
340
341
  detected_languages: None,
341
342
  chunks: None,
@@ -93,6 +93,7 @@ impl DocumentExtractor for LatexExtractor {
93
93
  detected_languages: None,
94
94
  chunks: None,
95
95
  images: None,
96
+ pages: None,
96
97
  })
97
98
  }
98
99
 
@@ -365,6 +365,7 @@ impl DocumentExtractor for MarkdownExtractor {
365
365
  detected_languages: None,
366
366
  chunks: None,
367
367
  images: None,
368
+ pages: None,
368
369
  })
369
370
  }
370
371
 
@@ -4,14 +4,69 @@
4
4
  //! All extractors implement the `DocumentExtractor` plugin trait.
5
5
 
6
6
  use crate::Result;
7
+ use crate::core::config::ExtractionConfig;
7
8
  use crate::plugins::registry::get_document_extractor_registry;
9
+ use crate::types::ExtractionResult;
8
10
  use once_cell::sync::Lazy;
9
11
  use std::sync::Arc;
10
12
 
11
- pub mod security;
13
+ /// Trait for extractors that can work synchronously (WASM-compatible).
14
+ ///
15
+ /// This trait defines the synchronous extraction interface for WASM targets and other
16
+ /// environments where async/tokio runtimes are not available or desirable.
17
+ ///
18
+ /// # Implementation
19
+ ///
20
+ /// Extractors that need to support WASM should implement this trait in addition to
21
+ /// the async `DocumentExtractor` trait. This allows the same extractor to work in both
22
+ /// environments by delegating to the sync implementation.
23
+ ///
24
+ /// # MIME Type Validation
25
+ ///
26
+ /// The `mime_type` parameter is guaranteed to be already validated.
27
+ ///
28
+ /// # Example
29
+ ///
30
+ /// ```rust,ignore
31
+ /// impl SyncExtractor for PlainTextExtractor {
32
+ /// fn extract_sync(&self, content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
33
+ /// let text = String::from_utf8_lossy(content).to_string();
34
+ /// Ok(ExtractionResult {
35
+ /// content: text,
36
+ /// mime_type: "text/plain".to_string(),
37
+ /// metadata: Metadata::default(),
38
+ /// tables: vec![],
39
+ /// detected_languages: None,
40
+ /// chunks: None,
41
+ /// images: None, pages: None,
42
+ /// })
43
+ /// }
44
+ /// }
45
+ /// ```
46
+ pub trait SyncExtractor {
47
+ /// Extract content from a byte array synchronously.
48
+ ///
49
+ /// This method performs extraction without requiring an async runtime.
50
+ /// It is called by `extract_bytes_sync()` when the `tokio-runtime` feature is disabled.
51
+ ///
52
+ /// # Arguments
53
+ ///
54
+ /// * `content` - Raw document bytes
55
+ /// * `mime_type` - MIME type of the document (already validated)
56
+ /// * `config` - Extraction configuration
57
+ ///
58
+ /// # Returns
59
+ ///
60
+ /// An `ExtractionResult` containing the extracted content and metadata.
61
+ fn extract_sync(&self, content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult>;
62
+ }
63
+
12
64
  pub mod structured;
13
65
  pub mod text;
14
66
 
67
+ #[cfg(feature = "archives")]
68
+ pub mod security;
69
+
15
70
  #[cfg(feature = "ocr")]
16
71
  pub mod image;
17
72
 
@@ -30,7 +85,7 @@ pub mod html;
30
85
  #[cfg(feature = "office")]
31
86
  pub mod bibtex;
32
87
 
33
- #[cfg(feature = "office")]
88
+ #[cfg(all(feature = "tokio-runtime", feature = "office"))]
34
89
  pub mod docx;
35
90
 
36
91
  #[cfg(feature = "office")]
@@ -54,7 +109,7 @@ pub mod jupyter;
54
109
  #[cfg(feature = "office")]
55
110
  pub mod orgmode;
56
111
 
57
- #[cfg(feature = "office")]
112
+ #[cfg(all(feature = "tokio-runtime", feature = "office"))]
58
113
  pub mod odt;
59
114
 
60
115
  #[cfg(feature = "office")]
@@ -69,7 +124,7 @@ pub mod jats;
69
124
  #[cfg(feature = "pdf")]
70
125
  pub mod pdf;
71
126
 
72
- #[cfg(feature = "office")]
127
+ #[cfg(all(feature = "tokio-runtime", feature = "office"))]
73
128
  pub mod pptx;
74
129
 
75
130
  #[cfg(feature = "office")]
@@ -102,7 +157,7 @@ pub use html::HtmlExtractor;
102
157
  #[cfg(feature = "office")]
103
158
  pub use bibtex::BibtexExtractor;
104
159
 
105
- #[cfg(feature = "office")]
160
+ #[cfg(all(feature = "tokio-runtime", feature = "office"))]
106
161
  pub use docx::DocxExtractor;
107
162
 
108
163
  #[cfg(feature = "office")]
@@ -126,7 +181,7 @@ pub use jupyter::JupyterExtractor;
126
181
  #[cfg(feature = "office")]
127
182
  pub use orgmode::OrgModeExtractor;
128
183
 
129
- #[cfg(feature = "office")]
184
+ #[cfg(all(feature = "tokio-runtime", feature = "office"))]
130
185
  pub use odt::OdtExtractor;
131
186
 
132
187
  #[cfg(feature = "xml")]
@@ -141,7 +196,7 @@ pub use typst::TypstExtractor;
141
196
  #[cfg(feature = "pdf")]
142
197
  pub use pdf::PdfExtractor;
143
198
 
144
- #[cfg(feature = "office")]
199
+ #[cfg(all(feature = "tokio-runtime", feature = "office"))]
145
200
  pub use pptx::PptxExtractor;
146
201
 
147
202
  #[cfg(feature = "office")]
@@ -230,11 +285,8 @@ pub fn register_default_extractors() -> Result<()> {
230
285
  {
231
286
  registry.register(Arc::new(EnhancedMarkdownExtractor::new()))?;
232
287
  registry.register(Arc::new(BibtexExtractor::new()))?;
233
- registry.register(Arc::new(DocxExtractor::new()))?;
234
288
  registry.register(Arc::new(EpubExtractor::new()))?;
235
289
  registry.register(Arc::new(FictionBookExtractor::new()))?;
236
- registry.register(Arc::new(PptxExtractor::new()))?;
237
- registry.register(Arc::new(OdtExtractor::new()))?;
238
290
  registry.register(Arc::new(RtfExtractor::new()))?;
239
291
  registry.register(Arc::new(RstExtractor::new()))?;
240
292
  registry.register(Arc::new(LatexExtractor::new()))?;
@@ -244,6 +296,13 @@ pub fn register_default_extractors() -> Result<()> {
244
296
  registry.register(Arc::new(TypstExtractor::new()))?;
245
297
  }
246
298
 
299
+ #[cfg(all(feature = "tokio-runtime", feature = "office"))]
300
+ {
301
+ registry.register(Arc::new(DocxExtractor::new()))?;
302
+ registry.register(Arc::new(PptxExtractor::new()))?;
303
+ registry.register(Arc::new(OdtExtractor::new()))?;
304
+ }
305
+
247
306
  #[cfg(feature = "email")]
248
307
  registry.register(Arc::new(EmailExtractor::new()))?;
249
308
 
@@ -313,14 +372,11 @@ mod tests {
313
372
 
314
373
  #[cfg(feature = "office")]
315
374
  {
316
- expected_count += 13;
375
+ expected_count += 10;
317
376
  assert!(extractor_names.contains(&"markdown-extractor".to_string()));
318
377
  assert!(extractor_names.contains(&"bibtex-extractor".to_string()));
319
- assert!(extractor_names.contains(&"docx-extractor".to_string()));
320
378
  assert!(extractor_names.contains(&"epub-extractor".to_string()));
321
379
  assert!(extractor_names.contains(&"fictionbook-extractor".to_string()));
322
- assert!(extractor_names.contains(&"pptx-extractor".to_string()));
323
- assert!(extractor_names.contains(&"odt-extractor".to_string()));
324
380
  assert!(extractor_names.contains(&"rtf-extractor".to_string()));
325
381
  assert!(extractor_names.contains(&"rst-extractor".to_string()));
326
382
  assert!(extractor_names.contains(&"latex-extractor".to_string()));
@@ -330,6 +386,14 @@ mod tests {
330
386
  assert!(extractor_names.contains(&"typst-extractor".to_string()));
331
387
  }
332
388
 
389
+ #[cfg(all(feature = "tokio-runtime", feature = "office"))]
390
+ {
391
+ expected_count += 3;
392
+ assert!(extractor_names.contains(&"docx-extractor".to_string()));
393
+ assert!(extractor_names.contains(&"pptx-extractor".to_string()));
394
+ assert!(extractor_names.contains(&"odt-extractor".to_string()));
395
+ }
396
+
333
397
  #[cfg(feature = "email")]
334
398
  {
335
399
  expected_count += 1;
@@ -1,3 +1,5 @@
1
+ #![cfg(all(feature = "tokio-runtime", feature = "office"))]
2
+
1
3
  //! ODT (OpenDocument Text) extractor using native Rust parsing.
2
4
  //!
3
5
  //! Supports: OpenDocument Text (.odt)
@@ -169,13 +171,10 @@ fn extract_content_text(archive: &mut zip::ZipArchive<Cursor<Vec<u8>>>) -> crate
169
171
 
170
172
  let mut text_parts: Vec<String> = Vec::new();
171
173
 
172
- // Find the office:text or text body element - this is the main document body
173
174
  for body_child in root.children() {
174
175
  if body_child.tag_name().name() == "body" {
175
- // Process the text element inside body
176
176
  for text_elem in body_child.children() {
177
177
  if text_elem.tag_name().name() == "text" {
178
- // Now process only direct children of the text element
179
178
  process_document_elements(text_elem, &mut text_parts);
180
179
  }
181
180
  }
@@ -563,6 +562,7 @@ impl DocumentExtractor for OdtExtractor {
563
562
  additional: metadata_map,
564
563
  ..Default::default()
565
564
  },
565
+ pages: None,
566
566
  tables,
567
567
  detected_languages: None,
568
568
  chunks: None,
@@ -191,6 +191,7 @@ impl DocumentExtractor for OpmlExtractor {
191
191
  additional: metadata_map,
192
192
  ..Default::default()
193
193
  },
194
+ pages: None,
194
195
  tables: vec![],
195
196
  detected_languages: None,
196
197
  chunks: None,
@@ -304,6 +304,7 @@ impl DocumentExtractor for OrgModeExtractor {
304
304
  detected_languages: None,
305
305
  chunks: None,
306
306
  images: None,
307
+ pages: None,
307
308
  })
308
309
  }
309
310