kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +5 -3
  3. data/README.md +15 -9
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
  5. data/ext/kreuzberg_rb/native/Cargo.lock +516 -324
  6. data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
  8. data/kreuzberg.gemspec +38 -4
  9. data/lib/kreuzberg/config.rb +34 -1
  10. data/lib/kreuzberg/result.rb +77 -14
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +23 -6
  13. data/vendor/kreuzberg/Cargo.toml +25 -11
  14. data/vendor/kreuzberg/README.md +13 -8
  15. data/vendor/kreuzberg/build.rs +17 -6
  16. data/vendor/kreuzberg/src/api/mod.rs +2 -0
  17. data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
  18. data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
  19. data/vendor/kreuzberg/src/core/config.rs +49 -1
  20. data/vendor/kreuzberg/src/core/extractor.rs +134 -2
  21. data/vendor/kreuzberg/src/core/mod.rs +4 -2
  22. data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
  23. data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
  24. data/vendor/kreuzberg/src/extraction/html.rs +24 -8
  25. data/vendor/kreuzberg/src/extraction/image.rs +124 -1
  26. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
  27. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
  28. data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
  29. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  31. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  32. data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
  33. data/vendor/kreuzberg/src/extractors/email.rs +29 -15
  34. data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
  35. data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
  36. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  37. data/vendor/kreuzberg/src/extractors/html.rs +29 -15
  38. data/vendor/kreuzberg/src/extractors/image.rs +25 -4
  39. data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
  40. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
  42. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  43. data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
  44. data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
  45. data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  47. data/vendor/kreuzberg/src/extractors/pdf.rs +194 -17
  48. data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
  49. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  50. data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
  51. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  52. data/vendor/kreuzberg/src/extractors/text.rs +7 -2
  53. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  54. data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
  55. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  56. data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
  57. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
  58. data/vendor/kreuzberg/src/lib.rs +10 -2
  59. data/vendor/kreuzberg/src/mcp/mod.rs +2 -0
  60. data/vendor/kreuzberg/src/mcp/server.rs +14 -12
  61. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  62. data/vendor/kreuzberg/src/pdf/error.rs +8 -0
  63. data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
  64. data/vendor/kreuzberg/src/pdf/mod.rs +14 -2
  65. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
  66. data/vendor/kreuzberg/src/pdf/table.rs +26 -2
  67. data/vendor/kreuzberg/src/pdf/text.rs +89 -7
  68. data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
  69. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  70. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
  71. data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
  72. data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
  73. data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
  74. data/vendor/kreuzberg/src/text/mod.rs +6 -0
  75. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
  76. data/vendor/kreuzberg/src/types.rs +173 -21
  77. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  78. data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
  79. data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
  80. data/vendor/kreuzberg/tests/config_features.rs +15 -1
  81. data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
  82. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
  83. data/vendor/kreuzberg/tests/email_integration.rs +2 -0
  84. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  85. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  86. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  87. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  88. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  89. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  90. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  91. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
  92. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  93. data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
  94. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  95. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
  97. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
  98. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
  99. data/vendor/kreuzberg/tests/security_validation.rs +1 -0
  100. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  101. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
  102. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
  103. data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
  104. data/vendor/rb-sys/Cargo.lock +15 -15
  105. data/vendor/rb-sys/Cargo.toml +4 -4
  106. data/vendor/rb-sys/Cargo.toml.orig +4 -4
  107. data/vendor/rb-sys/bin/release.sh +9 -8
  108. data/vendor/rb-sys/build/features.rs +5 -2
  109. data/vendor/rb-sys/build/main.rs +55 -15
  110. data/vendor/rb-sys/build/stable_api_config.rs +4 -2
  111. data/vendor/rb-sys/build/version.rs +3 -1
  112. data/vendor/rb-sys/src/macros.rs +2 -2
  113. data/vendor/rb-sys/src/special_consts.rs +1 -1
  114. data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
  115. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
  116. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
  117. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
  118. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
  119. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
  120. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
  121. data/vendor/rb-sys/src/stable_api.rs +0 -1
  122. data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
  123. metadata +11 -10
  124. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  125. data/vendor/rb-sys/.cargo-ok +0 -1
  126. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
@@ -0,0 +1,220 @@
1
+ //! Text chunking post-processor.
2
+ //!
3
+ //! This module provides a PostProcessor plugin that chunks text content in
4
+ //! extraction results.
5
+
6
+ use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
7
+ use crate::{ExtractionConfig, ExtractionResult, KreuzbergError, Result};
8
+ use async_trait::async_trait;
9
+
10
+ /// Post-processor that chunks text in document content.
11
+ ///
12
+ /// This processor:
13
+ /// - Runs in the Middle processing stage
14
+ /// - Only processes when `config.chunking` is configured
15
+ /// - Stores chunks in `result.chunks`
16
+ /// - Uses configurable chunk size and overlap
17
+ ///
18
+ /// # Example
19
+ ///
20
+ /// ```rust,no_run
21
+ /// use kreuzberg::plugins::{Plugin, PostProcessor};
22
+ /// use kreuzberg::chunking::processor::ChunkingProcessor;
23
+ ///
24
+ /// let processor = ChunkingProcessor;
25
+ /// assert_eq!(processor.name(), "text-chunking");
26
+ /// ```
27
+ #[derive(Debug, Clone, Copy)]
28
+ pub struct ChunkingProcessor;
29
+
30
+ impl Plugin for ChunkingProcessor {
31
+ fn name(&self) -> &str {
32
+ "text-chunking"
33
+ }
34
+
35
+ fn version(&self) -> String {
36
+ env!("CARGO_PKG_VERSION").to_string()
37
+ }
38
+
39
+ fn initialize(&self) -> Result<()> {
40
+ Ok(())
41
+ }
42
+
43
+ fn shutdown(&self) -> Result<()> {
44
+ Ok(())
45
+ }
46
+ }
47
+
48
+ #[cfg_attr(not(target_arch = "wasm32"), async_trait)]
49
+ #[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
50
+ impl PostProcessor for ChunkingProcessor {
51
+ async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
52
+ let chunking_config = match &config.chunking {
53
+ Some(cfg) => cfg,
54
+ None => return Ok(()),
55
+ };
56
+
57
+ let chunk_config = crate::chunking::ChunkingConfig {
58
+ max_characters: chunking_config.max_chars,
59
+ overlap: chunking_config.max_overlap,
60
+ trim: true,
61
+ chunker_type: crate::chunking::ChunkerType::Text,
62
+ };
63
+
64
+ let chunking_result = crate::chunking::chunk_text(&result.content, &chunk_config, None)
65
+ .map_err(|e| KreuzbergError::Other(format!("Chunking failed: {}", e)))?;
66
+ result.chunks = Some(chunking_result.chunks);
67
+
68
+ Ok(())
69
+ }
70
+
71
+ fn processing_stage(&self) -> ProcessingStage {
72
+ ProcessingStage::Middle
73
+ }
74
+
75
+ fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
76
+ config.chunking.is_some()
77
+ }
78
+
79
+ fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
80
+ let text_length = result.content.len();
81
+ // Chunking is fast: ~1ms per 10KB
82
+ (text_length / 10240).max(1) as u64
83
+ }
84
+ }
85
+
86
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::config::ChunkingConfig;
    use crate::types::Metadata;

    /// Builds a `text/plain` extraction result that carries only `content`.
    fn plain_text_result(content: impl Into<String>) -> ExtractionResult {
        ExtractionResult {
            content: content.into(),
            mime_type: "text/plain".to_string(),
            metadata: Metadata::default(),
            tables: vec![],
            detected_languages: None,
            chunks: None,
            images: None,
            pages: None,
        }
    }

    /// Default extraction config with chunking switched on (100 chars, overlap 10).
    fn chunking_enabled_config() -> ExtractionConfig {
        ExtractionConfig {
            chunking: Some(ChunkingConfig {
                max_chars: 100,
                max_overlap: 10,
                embedding: None,
                preset: None,
            }),
            ..Default::default()
        }
    }

    #[tokio::test]
    async fn test_chunking_processor() {
        let processor = ChunkingProcessor;
        let config = chunking_enabled_config();
        let mut result = plain_text_result(
            "This is a longer text that should be split into multiple chunks to test the chunking processor functionality.",
        );

        processor.process(&mut result, &config).await.unwrap();

        assert!(result.chunks.is_some());
        let chunks = result.chunks.unwrap();
        assert!(!chunks.is_empty());
    }

    #[tokio::test]
    async fn test_chunking_processor_no_config() {
        let processor = ChunkingProcessor;
        let config = ExtractionConfig::default();
        let mut result = plain_text_result("Some text");

        processor.process(&mut result, &config).await.unwrap();

        assert!(result.chunks.is_none());
    }

    #[test]
    fn test_chunking_processor_plugin_interface() {
        let processor = ChunkingProcessor;
        assert_eq!(processor.name(), "text-chunking");
        assert!(!processor.version().is_empty());
        assert!(processor.initialize().is_ok());
        assert!(processor.shutdown().is_ok());
    }

    #[test]
    fn test_chunking_processor_stage() {
        let processor = ChunkingProcessor;
        assert_eq!(processor.processing_stage(), ProcessingStage::Middle);
    }

    #[test]
    fn test_chunking_processor_should_process() {
        let processor = ChunkingProcessor;
        let result = plain_text_result("Sample text");

        let config_with_chunking = chunking_enabled_config();
        assert!(processor.should_process(&result, &config_with_chunking));

        let config_without_chunking = ExtractionConfig::default();
        assert!(!processor.should_process(&result, &config_without_chunking));
    }

    #[test]
    fn test_chunking_processor_estimated_duration() {
        let processor = ChunkingProcessor;

        let short_result = plain_text_result("Short");
        let long_result = plain_text_result("a".repeat(100000));

        let short_duration = processor.estimated_duration_ms(&short_result);
        let long_duration = processor.estimated_duration_ms(&long_result);

        assert!(long_duration > short_duration);
    }
}
@@ -7,6 +7,40 @@ use crate::{KreuzbergError, Result};
7
7
  use serde::{Deserialize, Serialize};
8
8
  use std::path::Path;
9
9
 
10
+ /// Page extraction and tracking configuration.
11
+ ///
12
+ /// Controls how pages are extracted, tracked, and represented in the extraction results.
13
+ /// When `None`, page tracking is disabled.
14
+ ///
15
+ /// Page range tracking in chunk metadata (first_page/last_page) is automatically enabled
16
+ /// when page boundaries are available and chunking is configured.
17
+ #[derive(Debug, Clone, Serialize, Deserialize)]
18
+ #[serde(default)]
19
+ pub struct PageConfig {
20
+ /// Extract pages as separate array (ExtractionResult.pages)
21
+ #[serde(default)]
22
+ pub extract_pages: bool,
23
+
24
+ /// Insert page markers in main content string
25
+ #[serde(default)]
26
+ pub insert_page_markers: bool,
27
+
28
+ /// Page marker format (use {page_num} placeholder)
29
+ /// Default: "\n\n<!-- PAGE {page_num} -->\n\n"
30
+ #[serde(default = "default_page_marker_format")]
31
+ pub marker_format: String,
32
+ }
33
+
34
+ impl Default for PageConfig {
35
+ fn default() -> Self {
36
+ Self {
37
+ extract_pages: false,
38
+ insert_page_markers: false,
39
+ marker_format: "\n\n<!-- PAGE {page_num} -->\n\n".to_string(),
40
+ }
41
+ }
42
+ }
43
+
10
44
  /// Main extraction configuration.
11
45
  ///
12
46
  /// This struct contains all configuration options for the extraction process.
@@ -50,6 +84,7 @@ pub struct ExtractionConfig {
50
84
  pub images: Option<ImageExtractionConfig>,
51
85
 
52
86
  /// PDF-specific options (None = use defaults)
87
+ #[cfg(feature = "pdf")]
53
88
  #[serde(default)]
54
89
  pub pdf_options: Option<PdfConfig>,
55
90
 
@@ -61,6 +96,10 @@ pub struct ExtractionConfig {
61
96
  #[serde(default)]
62
97
  pub language_detection: Option<LanguageDetectionConfig>,
63
98
 
99
+ /// Page extraction configuration (None = no page tracking)
100
+ #[serde(default)]
101
+ pub pages: Option<PageConfig>,
102
+
64
103
  /// Keyword extraction configuration (None = no keyword extraction)
65
104
  #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
66
105
  #[serde(default)]
@@ -225,6 +264,7 @@ pub struct ImageExtractionConfig {
225
264
  }
226
265
 
227
266
  /// PDF-specific configuration.
267
+ #[cfg(feature = "pdf")]
228
268
  #[derive(Debug, Clone, Serialize, Deserialize)]
229
269
  pub struct PdfConfig {
230
270
  /// Extract images from PDF
@@ -277,6 +317,9 @@ fn default_eng() -> String {
277
317
/// Returns the default backend name, `"tesseract"`.
fn default_tesseract_backend() -> String {
    String::from("tesseract")
}
320
/// Returns the default page marker template, used as the serde default for
/// `PageConfig::marker_format`. `{page_num}` is the placeholder for the page number.
fn default_page_marker_format() -> String {
    String::from("\n\n<!-- PAGE {page_num} -->\n\n")
}
280
323
/// Returns the default chunk size, `1000`.
fn default_chunk_size() -> usize {
    1_000
}
@@ -317,9 +360,11 @@ impl Default for ExtractionConfig {
317
360
  force_ocr: false,
318
361
  chunking: None,
319
362
  images: None,
363
+ #[cfg(feature = "pdf")]
320
364
  pdf_options: None,
321
365
  token_reduction: None,
322
366
  language_detection: None,
367
+ pages: None,
323
368
  #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
324
369
  keywords: None,
325
370
  postprocessor: None,
@@ -647,6 +692,7 @@ max_dpi = 600
647
692
  }
648
693
 
649
694
  #[test]
695
+ #[cfg(feature = "pdf")]
650
696
  fn test_config_with_pdf_options() {
651
697
  let dir = tempdir().unwrap();
652
698
  let config_path = dir.path().join("kreuzberg.toml");
@@ -770,9 +816,10 @@ enabled = true
770
816
  assert!(config.ocr.is_some());
771
817
  assert!(config.chunking.is_some());
772
818
  assert!(config.images.is_some());
773
- assert!(config.pdf_options.is_some());
774
819
  assert!(config.token_reduction.is_some());
775
820
  assert!(config.language_detection.is_some());
821
+ #[cfg(feature = "pdf")]
822
+ assert!(config.pdf_options.is_some());
776
823
  }
777
824
 
778
825
  #[test]
@@ -838,6 +885,7 @@ enabled = true
838
885
  }
839
886
 
840
887
  #[test]
888
+ #[cfg(feature = "pdf")]
841
889
  fn test_pdf_config_defaults() {
842
890
  let dir = tempdir().unwrap();
843
891
  let config_path = dir.path().join("kreuzberg.toml");
@@ -20,6 +20,7 @@ use crate::types::ExtractionResult;
20
20
  #[cfg(feature = "office")]
21
21
  use crate::types::LibreOfficeConversionResult;
22
22
  use crate::{KreuzbergError, Result};
23
+ #[cfg(feature = "tokio-runtime")]
23
24
  use once_cell::sync::Lazy;
24
25
  #[cfg(feature = "office")]
25
26
  use serde_json::json;
@@ -97,6 +98,12 @@ fn sanitize_path(path: &Path) -> String {
97
98
  /// 2. If runtime creation fails, the process is already in a critical state
98
99
  /// 3. This is a one-time initialization - if it fails, nothing will work
99
100
  /// 4. Better to fail fast than return errors from every sync operation
101
+ ///
102
+ /// # Availability
103
+ ///
104
+ /// This static is only available when the `tokio-runtime` feature is enabled.
105
+ /// For WASM targets, use the truly synchronous extraction functions instead.
106
+ #[cfg(feature = "tokio-runtime")]
100
107
  static GLOBAL_RUNTIME: Lazy<tokio::runtime::Runtime> = Lazy::new(|| {
101
108
  tokio::runtime::Builder::new_multi_thread()
102
109
  .enable_all()
@@ -310,13 +317,13 @@ pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionC
310
317
  ///
311
318
  /// Individual file errors are captured in the result metadata. System errors
312
319
  /// (IO, RuntimeError equivalents) will bubble up and fail the entire batch.
320
+ #[cfg(feature = "tokio-runtime")]
313
321
  #[cfg_attr(feature = "otel", tracing::instrument(
314
322
  skip(config, paths),
315
323
  fields(
316
324
  extraction.batch_size = paths.len(),
317
325
  )
318
326
  ))]
319
- #[cfg(feature = "tokio-runtime")]
320
327
  pub async fn batch_extract_file(
321
328
  paths: Vec<impl AsRef<Path>>,
322
329
  config: &ExtractionConfig,
@@ -380,6 +387,7 @@ pub async fn batch_extract_file(
380
387
  detected_languages: None,
381
388
  chunks: None,
382
389
  images: None,
390
+ pages: None,
383
391
  });
384
392
  }
385
393
  Err(join_err) => {
@@ -407,13 +415,13 @@ pub async fn batch_extract_file(
407
415
  /// # Returns
408
416
  ///
409
417
  /// A vector of `ExtractionResult` in the same order as the input.
418
+ #[cfg(feature = "tokio-runtime")]
410
419
  #[cfg_attr(feature = "otel", tracing::instrument(
411
420
  skip(config, contents),
412
421
  fields(
413
422
  extraction.batch_size = contents.len(),
414
423
  )
415
424
  ))]
416
- #[cfg(feature = "tokio-runtime")]
417
425
  pub async fn batch_extract_bytes(
418
426
  contents: Vec<(&[u8], &str)>,
419
427
  config: &ExtractionConfig,
@@ -483,6 +491,7 @@ pub async fn batch_extract_bytes(
483
491
  detected_languages: None,
484
492
  chunks: None,
485
493
  images: None,
494
+ pages: None,
486
495
  });
487
496
  }
488
497
  Err(join_err) => {
@@ -502,6 +511,10 @@ pub async fn batch_extract_bytes(
502
511
  ///
503
512
  /// Uses the global Tokio runtime for 100x+ performance improvement over creating
504
513
  /// a new runtime per call. Always uses the global runtime to avoid nested runtime issues.
514
+ ///
515
+ /// This function is only available with the `tokio-runtime` feature. For WASM targets,
516
+ /// use a truly synchronous extraction approach instead.
517
+ #[cfg(feature = "tokio-runtime")]
505
518
  pub fn extract_file_sync(
506
519
  path: impl AsRef<Path>,
507
520
  mime_type: Option<&str>,
@@ -514,14 +527,31 @@ pub fn extract_file_sync(
514
527
  ///
515
528
  /// Uses the global Tokio runtime for 100x+ performance improvement over creating
516
529
  /// a new runtime per call.
530
+ ///
531
+ /// With the `tokio-runtime` feature, this blocks the current thread using the global
532
+ /// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation.
533
+ #[cfg(feature = "tokio-runtime")]
517
534
  pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
518
535
  GLOBAL_RUNTIME.block_on(extract_bytes(content, mime_type, config))
519
536
  }
520
537
 
538
+ /// Synchronous wrapper for `extract_bytes` (WASM-compatible version).
539
+ ///
540
+ /// This is a truly synchronous implementation without tokio runtime dependency.
541
+ /// It calls `extract_bytes_sync_impl()` to perform the extraction.
542
+ #[cfg(not(feature = "tokio-runtime"))]
543
+ pub fn extract_bytes_sync(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
544
+ extract_bytes_sync_impl(content.to_vec(), Some(mime_type.to_string()), Some(config.clone()))
545
+ }
546
+
521
547
  /// Synchronous wrapper for `batch_extract_file`.
522
548
  ///
523
549
  /// Uses the global Tokio runtime for 100x+ performance improvement over creating
524
550
  /// a new runtime per call.
551
+ ///
552
+ /// This function is only available with the `tokio-runtime` feature. For WASM targets,
553
+ /// use a truly synchronous extraction approach instead.
554
+ #[cfg(feature = "tokio-runtime")]
525
555
  pub fn batch_extract_file_sync(
526
556
  paths: Vec<impl AsRef<Path>>,
527
557
  config: &ExtractionConfig,
@@ -533,6 +563,11 @@ pub fn batch_extract_file_sync(
533
563
  ///
534
564
  /// Uses the global Tokio runtime for 100x+ performance improvement over creating
535
565
  /// a new runtime per call.
566
+ ///
567
+ /// With the `tokio-runtime` feature, this blocks the current thread using the global
568
+ /// Tokio runtime. Without it (WASM), this calls a truly synchronous implementation
569
+ /// that iterates through items and calls `extract_bytes_sync()`.
570
+ #[cfg(feature = "tokio-runtime")]
536
571
  pub fn batch_extract_bytes_sync(
537
572
  contents: Vec<(&[u8], &str)>,
538
573
  config: &ExtractionConfig,
@@ -540,6 +575,103 @@ pub fn batch_extract_bytes_sync(
540
575
  GLOBAL_RUNTIME.block_on(batch_extract_bytes(contents, config))
541
576
  }
542
577
 
578
+ /// Synchronous wrapper for `batch_extract_bytes` (WASM-compatible version).
579
+ ///
580
+ /// This is a truly synchronous implementation that iterates through items
581
+ /// and calls `extract_bytes_sync()` for each.
582
+ #[cfg(not(feature = "tokio-runtime"))]
583
+ pub fn batch_extract_bytes_sync(
584
+ contents: Vec<(&[u8], &str)>,
585
+ config: &ExtractionConfig,
586
+ ) -> Result<Vec<ExtractionResult>> {
587
+ let mut results = Vec::with_capacity(contents.len());
588
+ for (content, mime_type) in contents {
589
+ let result = extract_bytes_sync(content, mime_type, config);
590
+ results.push(result.unwrap_or_else(|e| {
591
+ use crate::types::{ErrorMetadata, Metadata};
592
+ ExtractionResult {
593
+ content: format!("Error: {}", e),
594
+ mime_type: "text/plain".to_string(),
595
+ metadata: Metadata {
596
+ error: Some(ErrorMetadata {
597
+ error_type: format!("{:?}", e),
598
+ message: e.to_string(),
599
+ }),
600
+ ..Default::default()
601
+ },
602
+ tables: vec![],
603
+ detected_languages: None,
604
+ chunks: None,
605
+ images: None,
606
+ pages: None,
607
+ }
608
+ }));
609
+ }
610
+ Ok(results)
611
+ }
612
+
613
+ /// Synchronous extraction implementation for WASM compatibility.
614
+ ///
615
+ /// This function performs extraction without requiring a tokio runtime.
616
+ /// It calls the sync extractor methods directly.
617
+ ///
618
+ /// # Arguments
619
+ ///
620
+ /// * `content` - The byte content to extract
621
+ /// * `mime_type` - Optional MIME type to validate/use
622
+ /// * `config` - Optional extraction configuration
623
+ ///
624
+ /// # Returns
625
+ ///
626
+ /// An `ExtractionResult` or a `KreuzbergError`
627
+ ///
628
+ /// # Implementation Notes
629
+ ///
630
+ /// This is called when the `tokio-runtime` feature is disabled.
631
+ /// It replicates the logic of `extract_bytes` but uses synchronous extractor methods.
632
+ #[cfg(not(feature = "tokio-runtime"))]
633
+ fn extract_bytes_sync_impl(
634
+ content: Vec<u8>,
635
+ mime_type: Option<String>,
636
+ config: Option<ExtractionConfig>,
637
+ ) -> Result<ExtractionResult> {
638
+ use crate::core::mime;
639
+
640
+ let config = config.unwrap_or_default();
641
+
642
+ // Validate MIME type if provided
643
+ let validated_mime = if let Some(mime) = mime_type {
644
+ mime::validate_mime_type(&mime)?
645
+ } else {
646
+ return Err(KreuzbergError::Validation {
647
+ message: "MIME type is required for synchronous extraction".to_string(),
648
+ source: None,
649
+ });
650
+ };
651
+
652
+ // Ensure extractors are initialized
653
+ crate::extractors::ensure_initialized()?;
654
+
655
+ // Get the appropriate extractor
656
+ let extractor = get_extractor(&validated_mime)?;
657
+
658
+ // Check if extractor supports synchronous extraction
659
+ let sync_extractor = extractor.as_sync_extractor().ok_or_else(|| {
660
+ KreuzbergError::UnsupportedFormat(format!(
661
+ "Extractor for '{}' does not support synchronous extraction",
662
+ validated_mime
663
+ ))
664
+ })?;
665
+
666
+ // Call the sync extract method
667
+ let mut result = sync_extractor.extract_sync(&content, &validated_mime, &config)?;
668
+
669
+ // Run post-processing pipeline (sync version)
670
+ result = crate::core::pipeline::run_pipeline_sync(result, &config)?;
671
+
672
+ Ok(result)
673
+ }
674
+
543
675
  async fn extract_file_with_extractor(
544
676
  path: &Path,
545
677
  mime_type: &str,
@@ -37,9 +37,11 @@ pub mod mime;
37
37
  pub mod pipeline;
38
38
 
39
39
  pub use config::{
40
- ChunkingConfig, ExtractionConfig, ImageExtractionConfig, LanguageDetectionConfig, OcrConfig, PdfConfig,
41
- TokenReductionConfig,
40
+ ChunkingConfig, ExtractionConfig, ImageExtractionConfig, LanguageDetectionConfig, OcrConfig, TokenReductionConfig,
42
41
  };
42
+
43
+ #[cfg(feature = "pdf")]
44
+ pub use config::PdfConfig;
43
45
  #[cfg(feature = "tokio-runtime")]
44
46
  pub use extractor::{batch_extract_bytes, batch_extract_file};
45
47
  pub use extractor::{extract_bytes, extract_file};