kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +5 -5
  3. data/README.md +15 -9
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
  5. data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
  6. data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
  8. data/kreuzberg.gemspec +38 -4
  9. data/lib/kreuzberg/config.rb +34 -1
  10. data/lib/kreuzberg/result.rb +77 -14
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +23 -6
  13. data/vendor/kreuzberg/Cargo.toml +32 -11
  14. data/vendor/kreuzberg/README.md +54 -8
  15. data/vendor/kreuzberg/build.rs +549 -132
  16. data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
  17. data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
  18. data/vendor/kreuzberg/src/core/config.rs +49 -1
  19. data/vendor/kreuzberg/src/core/extractor.rs +134 -2
  20. data/vendor/kreuzberg/src/core/mod.rs +4 -2
  21. data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
  22. data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
  23. data/vendor/kreuzberg/src/extraction/html.rs +24 -8
  24. data/vendor/kreuzberg/src/extraction/image.rs +124 -1
  25. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
  26. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
  27. data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
  28. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  29. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  31. data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
  32. data/vendor/kreuzberg/src/extractors/email.rs +29 -15
  33. data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
  34. data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
  35. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  36. data/vendor/kreuzberg/src/extractors/html.rs +29 -15
  37. data/vendor/kreuzberg/src/extractors/image.rs +25 -4
  38. data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
  39. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  40. data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  42. data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
  43. data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
  44. data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
  45. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
  47. data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
  48. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  49. data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
  50. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  51. data/vendor/kreuzberg/src/extractors/text.rs +7 -2
  52. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  53. data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
  54. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  55. data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
  56. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
  57. data/vendor/kreuzberg/src/lib.rs +10 -2
  58. data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
  59. data/vendor/kreuzberg/src/mcp/server.rs +120 -12
  60. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  61. data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
  62. data/vendor/kreuzberg/src/pdf/error.rs +8 -0
  63. data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
  64. data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
  65. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
  66. data/vendor/kreuzberg/src/pdf/table.rs +26 -2
  67. data/vendor/kreuzberg/src/pdf/text.rs +89 -7
  68. data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
  69. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  70. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
  71. data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
  72. data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
  73. data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
  74. data/vendor/kreuzberg/src/text/mod.rs +6 -0
  75. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
  76. data/vendor/kreuzberg/src/types.rs +173 -21
  77. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  78. data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
  79. data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
  80. data/vendor/kreuzberg/tests/config_features.rs +15 -1
  81. data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
  82. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
  83. data/vendor/kreuzberg/tests/email_integration.rs +2 -0
  84. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  85. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  86. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  87. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  88. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  89. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  90. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  91. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
  92. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  93. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
  95. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  96. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  97. data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
  98. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
  99. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
  100. data/vendor/kreuzberg/tests/security_validation.rs +1 -0
  101. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  102. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
  103. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
  104. data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
  105. data/vendor/rb-sys/Cargo.lock +15 -15
  106. data/vendor/rb-sys/Cargo.toml +4 -4
  107. data/vendor/rb-sys/Cargo.toml.orig +4 -4
  108. data/vendor/rb-sys/build/features.rs +5 -2
  109. data/vendor/rb-sys/build/main.rs +55 -15
  110. data/vendor/rb-sys/build/stable_api_config.rs +4 -2
  111. data/vendor/rb-sys/build/version.rs +3 -1
  112. data/vendor/rb-sys/src/lib.rs +1 -0
  113. data/vendor/rb-sys/src/macros.rs +2 -2
  114. data/vendor/rb-sys/src/special_consts.rs +1 -1
  115. data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
  116. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
  117. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
  118. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
  119. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
  120. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
  121. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
  122. data/vendor/rb-sys/src/stable_api.rs +0 -1
  123. data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
  124. metadata +13 -10
  125. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  126. data/vendor/rb-sys/.cargo-ok +0 -1
  127. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
@@ -3,7 +3,7 @@
3
3
  use crate::Result;
4
4
  use crate::core::config::ExtractionConfig;
5
5
  use crate::plugins::{DocumentExtractor, Plugin};
6
- use crate::types::{ExtractionResult, Metadata};
6
+ use crate::types::{ExtractionResult, Metadata, PageContent};
7
7
  use async_trait::async_trait;
8
8
  use std::path::Path;
9
9
 
@@ -140,7 +140,7 @@ fn evaluate_native_text_for_ocr(native_text: &str, page_count: Option<usize>) ->
140
140
  #[cfg(all(feature = "pdf", feature = "ocr"))]
141
141
  fn extract_tables_from_document(
142
142
  document: &PdfDocument,
143
- _metadata: &crate::pdf::metadata::PdfMetadata,
143
+ _metadata: &crate::pdf::metadata::PdfExtractionMetadata,
144
144
  ) -> Result<Vec<Table>> {
145
145
  use crate::ocr::table::{reconstruct_table, table_to_markdown};
146
146
  use crate::pdf::table::extract_words_from_page;
@@ -177,11 +177,41 @@ fn extract_tables_from_document(
177
177
  #[cfg(all(feature = "pdf", not(feature = "ocr")))]
178
178
  fn extract_tables_from_document(
179
179
  _document: &PdfDocument,
180
- _metadata: &crate::pdf::metadata::PdfMetadata,
180
+ _metadata: &crate::pdf::metadata::PdfExtractionMetadata,
181
181
  ) -> Result<Vec<crate::types::Table>> {
182
182
  Ok(vec![])
183
183
  }
184
184
 
185
+ /// Helper function to assign tables and images to pages.
186
+ ///
187
+ /// If page_contents is None, returns None (no per-page tracking enabled).
188
+ /// Otherwise, iterates through tables and images, assigning them to pages based on page_number.
189
+ fn assign_tables_and_images_to_pages(
190
+ mut page_contents: Option<Vec<PageContent>>,
191
+ tables: &[crate::types::Table],
192
+ images: &[crate::types::ExtractedImage],
193
+ ) -> Option<Vec<PageContent>> {
194
+ let pages = page_contents.take()?;
195
+
196
+ let mut updated_pages = pages;
197
+
198
+ for table in tables {
199
+ if let Some(page) = updated_pages.iter_mut().find(|p| p.page_number == table.page_number) {
200
+ page.tables.push(table.clone());
201
+ }
202
+ }
203
+
204
+ for image in images {
205
+ if let Some(page_num) = image.page_number
206
+ && let Some(page) = updated_pages.iter_mut().find(|p| p.page_number == page_num)
207
+ {
208
+ page.images.push(image.clone());
209
+ }
210
+ }
211
+
212
+ Some(updated_pages)
213
+ }
214
+
185
215
  /// PDF document extractor using pypdfium2 and playa-pdf.
186
216
  pub struct PdfExtractor;
187
217
 
@@ -295,9 +325,10 @@ impl DocumentExtractor for PdfExtractor {
295
325
  config: &ExtractionConfig,
296
326
  ) -> Result<ExtractionResult> {
297
327
  #[cfg(feature = "pdf")]
298
- let (pdf_metadata, native_text, tables) = if crate::core::batch_mode::is_batch_mode() {
328
+ let (pdf_metadata, native_text, tables, page_contents) = if crate::core::batch_mode::is_batch_mode() {
299
329
  let content_owned = content.to_vec();
300
330
  let span = tracing::Span::current();
331
+ let pages_config = config.pages.clone();
301
332
  tokio::task::spawn_blocking(move || {
302
333
  let _guard = span.entered();
303
334
  let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
@@ -315,12 +346,25 @@ impl DocumentExtractor for PdfExtractor {
315
346
  }
316
347
  })?;
317
348
 
318
- let metadata = crate::pdf::metadata::extract_metadata_from_document(&document)?;
319
- let native_text = crate::pdf::text::extract_text_from_pdf_document(&document)?;
349
+ let (native_text, boundaries, page_contents) =
350
+ crate::pdf::text::extract_text_from_pdf_document(&document, pages_config.as_ref())?;
320
351
 
321
- let tables = extract_tables_from_document(&document, &metadata)?;
352
+ let pdf_metadata =
353
+ crate::pdf::metadata::extract_metadata_from_document(&document, boundaries.as_deref())?;
322
354
 
323
- Ok::<_, crate::error::KreuzbergError>((metadata, native_text, tables))
355
+ let tables = extract_tables_from_document(&document, &pdf_metadata)?;
356
+
357
+ if let Some(ref page_cfg) = pages_config
358
+ && page_cfg.extract_pages
359
+ && page_contents.is_none()
360
+ {
361
+ return Err(PdfError::ExtractionFailed(
362
+ "Page extraction was configured but no page data was extracted in batch mode".to_string(),
363
+ )
364
+ .into());
365
+ }
366
+
367
+ Ok::<_, crate::error::KreuzbergError>((pdf_metadata, native_text, tables, page_contents))
324
368
  })
325
369
  .await
326
370
  .map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
@@ -340,12 +384,14 @@ impl DocumentExtractor for PdfExtractor {
340
384
  }
341
385
  })?;
342
386
 
343
- let metadata = crate::pdf::metadata::extract_metadata_from_document(&document)?;
344
- let native_text = crate::pdf::text::extract_text_from_pdf_document(&document)?;
387
+ let (native_text, boundaries, page_contents) =
388
+ crate::pdf::text::extract_text_from_pdf_document(&document, config.pages.as_ref())?;
389
+
390
+ let pdf_metadata = crate::pdf::metadata::extract_metadata_from_document(&document, boundaries.as_deref())?;
345
391
 
346
- let tables = extract_tables_from_document(&document, &metadata)?;
392
+ let tables = extract_tables_from_document(&document, &pdf_metadata)?;
347
393
 
348
- (metadata, native_text, tables)
394
+ (pdf_metadata, native_text, tables, page_contents)
349
395
  };
350
396
 
351
397
  #[cfg(feature = "ocr")]
@@ -356,20 +402,19 @@ impl DocumentExtractor for PdfExtractor {
356
402
  native_text
357
403
  }
358
404
  } else if config.ocr.is_some() {
359
- let decision = evaluate_native_text_for_ocr(&native_text, pdf_metadata.page_count);
405
+ let decision = evaluate_native_text_for_ocr(&native_text, None);
360
406
 
361
407
  if std::env::var("KREUZBERG_DEBUG_OCR").is_ok() {
362
408
  eprintln!(
363
409
  "[kreuzberg::pdf::ocr] fallback={} non_whitespace={} alnum={} meaningful_words={} \
364
- avg_non_whitespace={:.2} avg_alnum={:.2} alnum_ratio={:.3} pages={}",
410
+ avg_non_whitespace={:.2} avg_alnum={:.2} alnum_ratio={:.3}",
365
411
  decision.fallback,
366
412
  decision.stats.non_whitespace,
367
413
  decision.stats.alnum,
368
414
  decision.stats.meaningful_words,
369
415
  decision.avg_non_whitespace,
370
416
  decision.avg_alnum,
371
- decision.stats.alnum_ratio,
372
- pdf_metadata.page_count.unwrap_or(0)
417
+ decision.stats.alnum_ratio
373
418
  );
374
419
  }
375
420
 
@@ -385,6 +430,20 @@ impl DocumentExtractor for PdfExtractor {
385
430
  #[cfg(not(feature = "ocr"))]
386
431
  let text = native_text;
387
432
 
433
+ #[cfg(feature = "pdf")]
434
+ if let Some(ref page_cfg) = config.pages
435
+ && page_cfg.insert_page_markers
436
+ {
437
+ let marker_placeholder = page_cfg.marker_format.replace("{page_num}", "");
438
+ if !marker_placeholder.is_empty() && !text.contains(&marker_placeholder) {
439
+ #[cfg(feature = "otel")]
440
+ tracing::warn!(
441
+ "Page markers were configured but none found in extracted content. \
442
+ This may indicate very short documents or incomplete extraction."
443
+ );
444
+ }
445
+ }
446
+
388
447
  let images = if config.images.is_some() {
389
448
  match crate::pdf::images::extract_images_from_pdf(content) {
390
449
  Ok(pdf_images) => Some(
@@ -415,14 +474,33 @@ impl DocumentExtractor for PdfExtractor {
415
474
  None
416
475
  };
417
476
 
477
+ let final_pages = assign_tables_and_images_to_pages(page_contents, &tables, images.as_deref().unwrap_or(&[]));
478
+
418
479
  Ok(ExtractionResult {
419
480
  content: text,
420
481
  mime_type: mime_type.to_string(),
421
482
  metadata: Metadata {
422
483
  #[cfg(feature = "pdf")]
423
- format: Some(crate::types::FormatMetadata::Pdf(pdf_metadata)),
484
+ title: pdf_metadata.title.clone(),
485
+ #[cfg(feature = "pdf")]
486
+ subject: pdf_metadata.subject.clone(),
487
+ #[cfg(feature = "pdf")]
488
+ authors: pdf_metadata.authors.clone(),
489
+ #[cfg(feature = "pdf")]
490
+ keywords: pdf_metadata.keywords.clone(),
491
+ #[cfg(feature = "pdf")]
492
+ created_at: pdf_metadata.created_at.clone(),
493
+ #[cfg(feature = "pdf")]
494
+ modified_at: pdf_metadata.modified_at.clone(),
495
+ #[cfg(feature = "pdf")]
496
+ created_by: pdf_metadata.created_by.clone(),
497
+ #[cfg(feature = "pdf")]
498
+ pages: pdf_metadata.page_structure.clone(),
499
+ #[cfg(feature = "pdf")]
500
+ format: Some(crate::types::FormatMetadata::Pdf(pdf_metadata.pdf_specific)),
424
501
  ..Default::default()
425
502
  },
503
+ pages: final_pages,
426
504
  tables,
427
505
  detected_languages: None,
428
506
  chunks: None,
@@ -490,4 +568,106 @@ mod tests {
490
568
  let sample = " . , ; : -- -- ";
491
569
  assert!(evaluate_native_text_for_ocr(sample, Some(2)).fallback);
492
570
  }
571
+
572
+ #[tokio::test]
573
+ #[cfg(feature = "pdf")]
574
+ async fn test_pdf_batch_mode_validates_page_config_enabled() {
575
+ use crate::core::config::PageConfig;
576
+
577
+ let extractor = PdfExtractor::new();
578
+
579
+ let config = ExtractionConfig {
580
+ pages: Some(PageConfig {
581
+ extract_pages: true,
582
+ insert_page_markers: false,
583
+ marker_format: "<!-- PAGE {page_num} -->".to_string(),
584
+ }),
585
+ ..Default::default()
586
+ };
587
+
588
+ let pdf_path =
589
+ std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdfs/google_doc_document.pdf");
590
+ if let Ok(content) = std::fs::read(pdf_path) {
591
+ let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
592
+ assert!(
593
+ result.is_ok(),
594
+ "Failed to extract PDF with page config: {:?}",
595
+ result.err()
596
+ );
597
+
598
+ let extraction_result = result.unwrap();
599
+ assert!(
600
+ extraction_result.pages.is_some(),
601
+ "Pages should be extracted when extract_pages is true"
602
+ );
603
+ }
604
+ }
605
+
606
+ #[tokio::test]
607
+ #[cfg(feature = "pdf")]
608
+ async fn test_pdf_batch_mode_validates_page_config_disabled() {
609
+ let extractor = PdfExtractor::new();
610
+ let config = ExtractionConfig::default();
611
+
612
+ let pdf_path =
613
+ std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdfs/google_doc_document.pdf");
614
+ if let Ok(content) = std::fs::read(pdf_path) {
615
+ let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
616
+ assert!(
617
+ result.is_ok(),
618
+ "Failed to extract PDF without page config: {:?}",
619
+ result.err()
620
+ );
621
+
622
+ let extraction_result = result.unwrap();
623
+ assert!(
624
+ extraction_result.pages.is_none(),
625
+ "Pages should not be extracted when pages config is None"
626
+ );
627
+ }
628
+ }
629
+
630
+ #[tokio::test]
631
+ #[cfg(feature = "pdf")]
632
+ async fn test_pdf_page_marker_validation() {
633
+ use crate::core::config::PageConfig;
634
+
635
+ let extractor = PdfExtractor::new();
636
+
637
+ let config = ExtractionConfig {
638
+ pages: Some(PageConfig {
639
+ extract_pages: true,
640
+ insert_page_markers: true,
641
+ marker_format: "\n\n<!-- PAGE {page_num} -->\n\n".to_string(),
642
+ }),
643
+ ..Default::default()
644
+ };
645
+
646
+ let pdf_path =
647
+ std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdfs/multi_page.pdf");
648
+ if let Ok(content) = std::fs::read(pdf_path) {
649
+ let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
650
+ assert!(
651
+ result.is_ok(),
652
+ "Failed to extract PDF with page markers: {:?}",
653
+ result.err()
654
+ );
655
+
656
+ let extraction_result = result.unwrap();
657
+ let marker_placeholder = "<!-- PAGE ";
658
+ if extraction_result.content.len() > 100 {
659
+ assert!(
660
+ extraction_result.content.contains(marker_placeholder),
661
+ "Page markers should be inserted when configured and document has multiple pages"
662
+ );
663
+ }
664
+ }
665
+ }
666
+
667
+ #[test]
668
+ #[cfg(feature = "pdf")]
669
+ fn test_pdf_extractor_without_feature_pdf() {
670
+ let extractor = PdfExtractor::new();
671
+ assert_eq!(extractor.name(), "pdf-extractor");
672
+ }
493
673
  }
@@ -1,3 +1,5 @@
1
+ #![cfg(all(feature = "tokio-runtime", feature = "office"))]
2
+
1
3
  //! PowerPoint presentation extractor.
2
4
 
3
5
  use crate::Result;
@@ -69,6 +71,7 @@ impl PptxExtractor {
69
71
  detected_languages: None,
70
72
  chunks: None,
71
73
  images: None,
74
+ pages: None,
72
75
  };
73
76
  image.ocr_result = Some(Box::new(extraction_result));
74
77
  }
@@ -117,17 +120,18 @@ impl DocumentExtractor for PptxExtractor {
117
120
  ) -> Result<ExtractionResult> {
118
121
  let extract_images = config.images.as_ref().is_some_and(|img| img.extract_images);
119
122
 
123
+ let pages_config = config.pages.clone();
120
124
  let pptx_result = if crate::core::batch_mode::is_batch_mode() {
121
125
  let content_owned = content.to_vec();
122
126
  let span = tracing::Span::current();
123
127
  tokio::task::spawn_blocking(move || {
124
128
  let _guard = span.entered();
125
- crate::extraction::pptx::extract_pptx_from_bytes(&content_owned, extract_images)
129
+ crate::extraction::pptx::extract_pptx_from_bytes(&content_owned, extract_images, pages_config.as_ref())
126
130
  })
127
131
  .await
128
132
  .map_err(|e| crate::error::KreuzbergError::parsing(format!("PPTX extraction task failed: {}", e)))??
129
133
  } else {
130
- crate::extraction::pptx::extract_pptx_from_bytes(content, extract_images)?
134
+ crate::extraction::pptx::extract_pptx_from_bytes(content, extract_images, config.pages.as_ref())?
131
135
  };
132
136
 
133
137
  let mut additional = std::collections::HashMap::new();
@@ -149,14 +153,21 @@ impl DocumentExtractor for PptxExtractor {
149
153
  None
150
154
  };
151
155
 
156
+ let mut metadata = Metadata {
157
+ format: Some(crate::types::FormatMetadata::Pptx(pptx_result.metadata)),
158
+ additional,
159
+ ..Default::default()
160
+ };
161
+
162
+ if let Some(page_structure) = pptx_result.page_structure {
163
+ metadata.pages = Some(page_structure);
164
+ }
165
+
152
166
  Ok(ExtractionResult {
153
167
  content: pptx_result.content,
154
168
  mime_type: mime_type.to_string(),
155
- metadata: Metadata {
156
- format: Some(crate::types::FormatMetadata::Pptx(pptx_result.metadata)),
157
- additional,
158
- ..Default::default()
159
- },
169
+ metadata,
170
+ pages: pptx_result.page_contents,
160
171
  tables: vec![],
161
172
  detected_languages: None,
162
173
  chunks: None,
@@ -177,7 +188,8 @@ impl DocumentExtractor for PptxExtractor {
177
188
 
178
189
  let extract_images = config.images.as_ref().is_some_and(|img| img.extract_images);
179
190
 
180
- let pptx_result = crate::extraction::pptx::extract_pptx_from_path(path_str, extract_images)?;
191
+ let pptx_result =
192
+ crate::extraction::pptx::extract_pptx_from_path(path_str, extract_images, config.pages.as_ref())?;
181
193
 
182
194
  let mut additional = std::collections::HashMap::new();
183
195
  additional.insert("slide_count".to_string(), serde_json::json!(pptx_result.slide_count));
@@ -198,14 +210,21 @@ impl DocumentExtractor for PptxExtractor {
198
210
  None
199
211
  };
200
212
 
213
+ let mut metadata = Metadata {
214
+ format: Some(crate::types::FormatMetadata::Pptx(pptx_result.metadata)),
215
+ additional,
216
+ ..Default::default()
217
+ };
218
+
219
+ if let Some(page_structure) = pptx_result.page_structure {
220
+ metadata.pages = Some(page_structure);
221
+ }
222
+
201
223
  Ok(ExtractionResult {
202
224
  content: pptx_result.content,
203
225
  mime_type: mime_type.to_string(),
204
- metadata: Metadata {
205
- format: Some(crate::types::FormatMetadata::Pptx(pptx_result.metadata)),
206
- additional,
207
- ..Default::default()
208
- },
226
+ metadata,
227
+ pages: pptx_result.page_contents,
209
228
  tables: vec![],
210
229
  detected_languages: None,
211
230
  chunks: None,
@@ -453,6 +453,7 @@ impl DocumentExtractor for RstExtractor {
453
453
  detected_languages: None,
454
454
  chunks: None,
455
455
  images: None,
456
+ pages: None,
456
457
  })
457
458
  }
458
459
 
@@ -391,9 +391,7 @@ fn extract_text_from_rtf(content: &str) -> (String, Vec<Table>) {
391
391
  if let Some(state) = table_state.as_ref()
392
392
  && !state.in_row
393
393
  && !state.rows.is_empty()
394
- {
395
- // We'll finalize once we see content outside the table
396
- }
394
+ {}
397
395
  }
398
396
  _ => {}
399
397
  }
@@ -571,7 +569,7 @@ fn extract_rtf_metadata(rtf_content: &str, extracted_text: &str) -> HashMap<Stri
571
569
  };
572
570
 
573
571
  let mut chars = cleaned_segment.chars().peekable();
574
- chars.next(); // consume the leading backslash
572
+ chars.next();
575
573
  let (keyword, numeric) = parse_rtf_control_word(&mut chars);
576
574
  let remaining: String = chars.collect();
577
575
  let trimmed = remaining.trim();
@@ -771,6 +769,7 @@ impl DocumentExtractor for RtfExtractor {
771
769
  additional: metadata_map,
772
770
  ..Default::default()
773
771
  },
772
+ pages: None,
774
773
  tables,
775
774
  detected_languages: None,
776
775
  chunks: None,
@@ -5,6 +5,7 @@ use crate::core::config::ExtractionConfig;
5
5
  use crate::plugins::{DocumentExtractor, Plugin};
6
6
  use crate::types::{ExtractionResult, Metadata};
7
7
  use async_trait::async_trait;
8
+ #[cfg(feature = "tokio-runtime")]
8
9
  use std::path::Path;
9
10
 
10
11
  /// Structured data extractor supporting JSON, YAML, and TOML.
@@ -80,6 +81,7 @@ impl DocumentExtractor for StructuredExtractor {
80
81
  additional,
81
82
  ..Default::default()
82
83
  },
84
+ pages: None,
83
85
  tables: vec![],
84
86
  detected_languages: None,
85
87
  chunks: None,
@@ -86,6 +86,7 @@ impl DocumentExtractor for PlainTextExtractor {
86
86
  })),
87
87
  ..Default::default()
88
88
  },
89
+ pages: None,
89
90
  tables: vec![],
90
91
  detected_languages: None,
91
92
  chunks: None,
@@ -94,7 +95,7 @@ impl DocumentExtractor for PlainTextExtractor {
94
95
  }
95
96
 
96
97
  fn supported_mime_types(&self) -> &[&str] {
97
- &["text/plain"]
98
+ &["text/plain", "text/csv", "text/tab-separated-values"]
98
99
  }
99
100
 
100
101
  fn priority(&self) -> i32 {
@@ -178,6 +179,7 @@ impl DocumentExtractor for MarkdownExtractor {
178
179
  })),
179
180
  ..Default::default()
180
181
  },
182
+ pages: None,
181
183
  tables: vec![],
182
184
  detected_languages: None,
183
185
  chunks: None,
@@ -245,7 +247,10 @@ mod tests {
245
247
  let extractor = PlainTextExtractor::new();
246
248
  assert_eq!(extractor.name(), "plain-text-extractor");
247
249
  assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
248
- assert_eq!(extractor.supported_mime_types(), &["text/plain"]);
250
+ assert_eq!(
251
+ extractor.supported_mime_types(),
252
+ &["text/plain", "text/csv", "text/tab-separated-values"]
253
+ );
249
254
  assert_eq!(extractor.priority(), 50);
250
255
  }
251
256
 
@@ -112,6 +112,7 @@ impl DocumentExtractor for TypstExtractor {
112
112
  detected_languages: None,
113
113
  chunks: None,
114
114
  images: None,
115
+ pages: None,
115
116
  })
116
117
  }
117
118
 
@@ -3,6 +3,7 @@
3
3
  use crate::Result;
4
4
  use crate::core::config::ExtractionConfig;
5
5
  use crate::extraction::xml::parse_xml;
6
+ use crate::extractors::SyncExtractor;
6
7
  use crate::plugins::{DocumentExtractor, Plugin};
7
8
  use crate::types::ExtractionResult;
8
9
  use async_trait::async_trait;
@@ -51,21 +52,8 @@ impl Plugin for XmlExtractor {
51
52
  }
52
53
  }
53
54
 
54
- #[async_trait]
55
- impl DocumentExtractor for XmlExtractor {
56
- #[cfg_attr(feature = "otel", tracing::instrument(
57
- skip(self, content, _config),
58
- fields(
59
- extractor.name = self.name(),
60
- content.size_bytes = content.len(),
61
- )
62
- ))]
63
- async fn extract_bytes(
64
- &self,
65
- content: &[u8],
66
- mime_type: &str,
67
- _config: &ExtractionConfig,
68
- ) -> Result<ExtractionResult> {
55
+ impl SyncExtractor for XmlExtractor {
56
+ fn extract_sync(&self, content: &[u8], mime_type: &str, _config: &ExtractionConfig) -> Result<ExtractionResult> {
69
57
  let xml_result = parse_xml(content, false)?;
70
58
 
71
59
  Ok(ExtractionResult {
@@ -82,8 +70,28 @@ impl DocumentExtractor for XmlExtractor {
82
70
  detected_languages: None,
83
71
  chunks: None,
84
72
  images: None,
73
+ pages: None,
85
74
  })
86
75
  }
76
+ }
77
+
78
+ #[async_trait]
79
+ impl DocumentExtractor for XmlExtractor {
80
+ #[cfg_attr(feature = "otel", tracing::instrument(
81
+ skip(self, content, config),
82
+ fields(
83
+ extractor.name = self.name(),
84
+ content.size_bytes = content.len(),
85
+ )
86
+ ))]
87
+ async fn extract_bytes(
88
+ &self,
89
+ content: &[u8],
90
+ mime_type: &str,
91
+ config: &ExtractionConfig,
92
+ ) -> Result<ExtractionResult> {
93
+ self.extract_sync(content, mime_type, config)
94
+ }
87
95
 
88
96
  fn supported_mime_types(&self) -> &[&str] {
89
97
  &["application/xml", "text/xml", "image/svg+xml"]
@@ -92,6 +100,10 @@ impl DocumentExtractor for XmlExtractor {
92
100
  fn priority(&self) -> i32 {
93
101
  50
94
102
  }
103
+
104
+ fn as_sync_extractor(&self) -> Option<&dyn crate::extractors::SyncExtractor> {
105
+ Some(self)
106
+ }
95
107
  }
96
108
 
97
109
  #[cfg(test)]
@@ -45,7 +45,8 @@ impl Plugin for KeywordExtractor {
45
45
  }
46
46
  }
47
47
 
48
- #[async_trait]
48
+ #[cfg_attr(not(target_arch = "wasm32"), async_trait)]
49
+ #[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
49
50
  impl PostProcessor for KeywordExtractor {
50
51
  async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
51
52
  let keyword_config = match &config.keywords {
@@ -112,6 +113,7 @@ machine learning that uses neural networks with multiple layers.
112
113
  detected_languages: None,
113
114
  chunks: None,
114
115
  images: None,
116
+ pages: None,
115
117
  };
116
118
 
117
119
  processor.process(&mut result, &config).await.unwrap();
@@ -140,6 +142,7 @@ machine learning that uses neural networks with multiple layers.
140
142
  detected_languages: None,
141
143
  chunks: None,
142
144
  images: None,
145
+ pages: None,
143
146
  };
144
147
 
145
148
  processor.process(&mut result, &config).await.unwrap();
@@ -164,6 +167,7 @@ machine learning that uses neural networks with multiple layers.
164
167
  detected_languages: None,
165
168
  chunks: None,
166
169
  images: None,
170
+ pages: None,
167
171
  };
168
172
 
169
173
  processor.process(&mut result, &config).await.unwrap();
@@ -188,6 +192,7 @@ machine learning that uses neural networks with multiple layers.
188
192
  detected_languages: None,
189
193
  chunks: None,
190
194
  images: None,
195
+ pages: None,
191
196
  };
192
197
 
193
198
  processor.process(&mut result, &config).await.unwrap();
@@ -223,6 +228,7 @@ machine learning that uses neural networks with multiple layers.
223
228
  detected_languages: None,
224
229
  chunks: None,
225
230
  images: None,
231
+ pages: None,
226
232
  };
227
233
 
228
234
  let config_with_keywords = ExtractionConfig {
@@ -247,6 +253,7 @@ machine learning that uses neural networks with multiple layers.
247
253
  detected_languages: None,
248
254
  chunks: None,
249
255
  images: None,
256
+ pages: None,
250
257
  };
251
258
 
252
259
  let long_result = ExtractionResult {
@@ -257,6 +264,7 @@ machine learning that uses neural networks with multiple layers.
257
264
  detected_languages: None,
258
265
  chunks: None,
259
266
  images: None,
267
+ pages: None,
260
268
  };
261
269
 
262
270
  let short_duration = processor.estimated_duration_ms(&short_result);