kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -5
- data/README.md +15 -9
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
- data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
- data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
- data/kreuzberg.gemspec +38 -4
- data/lib/kreuzberg/config.rb +34 -1
- data/lib/kreuzberg/result.rb +77 -14
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +23 -6
- data/vendor/kreuzberg/Cargo.toml +32 -11
- data/vendor/kreuzberg/README.md +54 -8
- data/vendor/kreuzberg/build.rs +549 -132
- data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
- data/vendor/kreuzberg/src/core/config.rs +49 -1
- data/vendor/kreuzberg/src/core/extractor.rs +134 -2
- data/vendor/kreuzberg/src/core/mod.rs +4 -2
- data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
- data/vendor/kreuzberg/src/extraction/html.rs +24 -8
- data/vendor/kreuzberg/src/extraction/image.rs +124 -1
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
- data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
- data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
- data/vendor/kreuzberg/src/extractors/email.rs +29 -15
- data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
- data/vendor/kreuzberg/src/extractors/html.rs +29 -15
- data/vendor/kreuzberg/src/extractors/image.rs +25 -4
- data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
- data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
- data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
- data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
- data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +7 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
- data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
- data/vendor/kreuzberg/src/lib.rs +10 -2
- data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
- data/vendor/kreuzberg/src/mcp/server.rs +120 -12
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
- data/vendor/kreuzberg/src/pdf/error.rs +8 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
- data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
- data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
- data/vendor/kreuzberg/src/pdf/table.rs +26 -2
- data/vendor/kreuzberg/src/pdf/text.rs +89 -7
- data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
- data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
- data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
- data/vendor/kreuzberg/src/text/mod.rs +6 -0
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
- data/vendor/kreuzberg/src/types.rs +173 -21
- data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
- data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
- data/vendor/kreuzberg/tests/config_features.rs +15 -1
- data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/email_integration.rs +2 -0
- data/vendor/kreuzberg/tests/error_handling.rs +43 -34
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/image_integration.rs +2 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
- data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
- data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
- data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
- data/vendor/rb-sys/Cargo.lock +15 -15
- data/vendor/rb-sys/Cargo.toml +4 -4
- data/vendor/rb-sys/Cargo.toml.orig +4 -4
- data/vendor/rb-sys/build/features.rs +5 -2
- data/vendor/rb-sys/build/main.rs +55 -15
- data/vendor/rb-sys/build/stable_api_config.rs +4 -2
- data/vendor/rb-sys/build/version.rs +3 -1
- data/vendor/rb-sys/src/lib.rs +1 -0
- data/vendor/rb-sys/src/macros.rs +2 -2
- data/vendor/rb-sys/src/special_consts.rs +1 -1
- data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
- data/vendor/rb-sys/src/stable_api.rs +0 -1
- data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
- metadata +13 -10
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
use crate::Result;
|
|
4
4
|
use crate::core::config::ExtractionConfig;
|
|
5
5
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
6
|
-
use crate::types::{ExtractionResult, Metadata};
|
|
6
|
+
use crate::types::{ExtractionResult, Metadata, PageContent};
|
|
7
7
|
use async_trait::async_trait;
|
|
8
8
|
use std::path::Path;
|
|
9
9
|
|
|
@@ -140,7 +140,7 @@ fn evaluate_native_text_for_ocr(native_text: &str, page_count: Option<usize>) ->
|
|
|
140
140
|
#[cfg(all(feature = "pdf", feature = "ocr"))]
|
|
141
141
|
fn extract_tables_from_document(
|
|
142
142
|
document: &PdfDocument,
|
|
143
|
-
_metadata: &crate::pdf::metadata::
|
|
143
|
+
_metadata: &crate::pdf::metadata::PdfExtractionMetadata,
|
|
144
144
|
) -> Result<Vec<Table>> {
|
|
145
145
|
use crate::ocr::table::{reconstruct_table, table_to_markdown};
|
|
146
146
|
use crate::pdf::table::extract_words_from_page;
|
|
@@ -177,11 +177,41 @@ fn extract_tables_from_document(
|
|
|
177
177
|
#[cfg(all(feature = "pdf", not(feature = "ocr")))]
|
|
178
178
|
fn extract_tables_from_document(
|
|
179
179
|
_document: &PdfDocument,
|
|
180
|
-
_metadata: &crate::pdf::metadata::
|
|
180
|
+
_metadata: &crate::pdf::metadata::PdfExtractionMetadata,
|
|
181
181
|
) -> Result<Vec<crate::types::Table>> {
|
|
182
182
|
Ok(vec![])
|
|
183
183
|
}
|
|
184
184
|
|
|
185
|
+
/// Helper function to assign tables and images to pages.
|
|
186
|
+
///
|
|
187
|
+
/// If page_contents is None, returns None (no per-page tracking enabled).
|
|
188
|
+
/// Otherwise, iterates through tables and images, assigning them to pages based on page_number.
|
|
189
|
+
fn assign_tables_and_images_to_pages(
|
|
190
|
+
mut page_contents: Option<Vec<PageContent>>,
|
|
191
|
+
tables: &[crate::types::Table],
|
|
192
|
+
images: &[crate::types::ExtractedImage],
|
|
193
|
+
) -> Option<Vec<PageContent>> {
|
|
194
|
+
let pages = page_contents.take()?;
|
|
195
|
+
|
|
196
|
+
let mut updated_pages = pages;
|
|
197
|
+
|
|
198
|
+
for table in tables {
|
|
199
|
+
if let Some(page) = updated_pages.iter_mut().find(|p| p.page_number == table.page_number) {
|
|
200
|
+
page.tables.push(table.clone());
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
for image in images {
|
|
205
|
+
if let Some(page_num) = image.page_number
|
|
206
|
+
&& let Some(page) = updated_pages.iter_mut().find(|p| p.page_number == page_num)
|
|
207
|
+
{
|
|
208
|
+
page.images.push(image.clone());
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
Some(updated_pages)
|
|
213
|
+
}
|
|
214
|
+
|
|
185
215
|
/// PDF document extractor using pypdfium2 and playa-pdf.
|
|
186
216
|
pub struct PdfExtractor;
|
|
187
217
|
|
|
@@ -295,9 +325,10 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
295
325
|
config: &ExtractionConfig,
|
|
296
326
|
) -> Result<ExtractionResult> {
|
|
297
327
|
#[cfg(feature = "pdf")]
|
|
298
|
-
let (pdf_metadata, native_text, tables) = if crate::core::batch_mode::is_batch_mode() {
|
|
328
|
+
let (pdf_metadata, native_text, tables, page_contents) = if crate::core::batch_mode::is_batch_mode() {
|
|
299
329
|
let content_owned = content.to_vec();
|
|
300
330
|
let span = tracing::Span::current();
|
|
331
|
+
let pages_config = config.pages.clone();
|
|
301
332
|
tokio::task::spawn_blocking(move || {
|
|
302
333
|
let _guard = span.entered();
|
|
303
334
|
let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
|
|
@@ -315,12 +346,25 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
315
346
|
}
|
|
316
347
|
})?;
|
|
317
348
|
|
|
318
|
-
let
|
|
319
|
-
|
|
349
|
+
let (native_text, boundaries, page_contents) =
|
|
350
|
+
crate::pdf::text::extract_text_from_pdf_document(&document, pages_config.as_ref())?;
|
|
320
351
|
|
|
321
|
-
let
|
|
352
|
+
let pdf_metadata =
|
|
353
|
+
crate::pdf::metadata::extract_metadata_from_document(&document, boundaries.as_deref())?;
|
|
322
354
|
|
|
323
|
-
|
|
355
|
+
let tables = extract_tables_from_document(&document, &pdf_metadata)?;
|
|
356
|
+
|
|
357
|
+
if let Some(ref page_cfg) = pages_config
|
|
358
|
+
&& page_cfg.extract_pages
|
|
359
|
+
&& page_contents.is_none()
|
|
360
|
+
{
|
|
361
|
+
return Err(PdfError::ExtractionFailed(
|
|
362
|
+
"Page extraction was configured but no page data was extracted in batch mode".to_string(),
|
|
363
|
+
)
|
|
364
|
+
.into());
|
|
365
|
+
}
|
|
366
|
+
|
|
367
|
+
Ok::<_, crate::error::KreuzbergError>((pdf_metadata, native_text, tables, page_contents))
|
|
324
368
|
})
|
|
325
369
|
.await
|
|
326
370
|
.map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
|
|
@@ -340,12 +384,14 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
340
384
|
}
|
|
341
385
|
})?;
|
|
342
386
|
|
|
343
|
-
let
|
|
344
|
-
|
|
387
|
+
let (native_text, boundaries, page_contents) =
|
|
388
|
+
crate::pdf::text::extract_text_from_pdf_document(&document, config.pages.as_ref())?;
|
|
389
|
+
|
|
390
|
+
let pdf_metadata = crate::pdf::metadata::extract_metadata_from_document(&document, boundaries.as_deref())?;
|
|
345
391
|
|
|
346
|
-
let tables = extract_tables_from_document(&document, &
|
|
392
|
+
let tables = extract_tables_from_document(&document, &pdf_metadata)?;
|
|
347
393
|
|
|
348
|
-
(
|
|
394
|
+
(pdf_metadata, native_text, tables, page_contents)
|
|
349
395
|
};
|
|
350
396
|
|
|
351
397
|
#[cfg(feature = "ocr")]
|
|
@@ -356,20 +402,19 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
356
402
|
native_text
|
|
357
403
|
}
|
|
358
404
|
} else if config.ocr.is_some() {
|
|
359
|
-
let decision = evaluate_native_text_for_ocr(&native_text,
|
|
405
|
+
let decision = evaluate_native_text_for_ocr(&native_text, None);
|
|
360
406
|
|
|
361
407
|
if std::env::var("KREUZBERG_DEBUG_OCR").is_ok() {
|
|
362
408
|
eprintln!(
|
|
363
409
|
"[kreuzberg::pdf::ocr] fallback={} non_whitespace={} alnum={} meaningful_words={} \
|
|
364
|
-
avg_non_whitespace={:.2} avg_alnum={:.2} alnum_ratio={:.3}
|
|
410
|
+
avg_non_whitespace={:.2} avg_alnum={:.2} alnum_ratio={:.3}",
|
|
365
411
|
decision.fallback,
|
|
366
412
|
decision.stats.non_whitespace,
|
|
367
413
|
decision.stats.alnum,
|
|
368
414
|
decision.stats.meaningful_words,
|
|
369
415
|
decision.avg_non_whitespace,
|
|
370
416
|
decision.avg_alnum,
|
|
371
|
-
decision.stats.alnum_ratio
|
|
372
|
-
pdf_metadata.page_count.unwrap_or(0)
|
|
417
|
+
decision.stats.alnum_ratio
|
|
373
418
|
);
|
|
374
419
|
}
|
|
375
420
|
|
|
@@ -385,6 +430,20 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
385
430
|
#[cfg(not(feature = "ocr"))]
|
|
386
431
|
let text = native_text;
|
|
387
432
|
|
|
433
|
+
#[cfg(feature = "pdf")]
|
|
434
|
+
if let Some(ref page_cfg) = config.pages
|
|
435
|
+
&& page_cfg.insert_page_markers
|
|
436
|
+
{
|
|
437
|
+
let marker_placeholder = page_cfg.marker_format.replace("{page_num}", "");
|
|
438
|
+
if !marker_placeholder.is_empty() && !text.contains(&marker_placeholder) {
|
|
439
|
+
#[cfg(feature = "otel")]
|
|
440
|
+
tracing::warn!(
|
|
441
|
+
"Page markers were configured but none found in extracted content. \
|
|
442
|
+
This may indicate very short documents or incomplete extraction."
|
|
443
|
+
);
|
|
444
|
+
}
|
|
445
|
+
}
|
|
446
|
+
|
|
388
447
|
let images = if config.images.is_some() {
|
|
389
448
|
match crate::pdf::images::extract_images_from_pdf(content) {
|
|
390
449
|
Ok(pdf_images) => Some(
|
|
@@ -415,14 +474,33 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
415
474
|
None
|
|
416
475
|
};
|
|
417
476
|
|
|
477
|
+
let final_pages = assign_tables_and_images_to_pages(page_contents, &tables, images.as_deref().unwrap_or(&[]));
|
|
478
|
+
|
|
418
479
|
Ok(ExtractionResult {
|
|
419
480
|
content: text,
|
|
420
481
|
mime_type: mime_type.to_string(),
|
|
421
482
|
metadata: Metadata {
|
|
422
483
|
#[cfg(feature = "pdf")]
|
|
423
|
-
|
|
484
|
+
title: pdf_metadata.title.clone(),
|
|
485
|
+
#[cfg(feature = "pdf")]
|
|
486
|
+
subject: pdf_metadata.subject.clone(),
|
|
487
|
+
#[cfg(feature = "pdf")]
|
|
488
|
+
authors: pdf_metadata.authors.clone(),
|
|
489
|
+
#[cfg(feature = "pdf")]
|
|
490
|
+
keywords: pdf_metadata.keywords.clone(),
|
|
491
|
+
#[cfg(feature = "pdf")]
|
|
492
|
+
created_at: pdf_metadata.created_at.clone(),
|
|
493
|
+
#[cfg(feature = "pdf")]
|
|
494
|
+
modified_at: pdf_metadata.modified_at.clone(),
|
|
495
|
+
#[cfg(feature = "pdf")]
|
|
496
|
+
created_by: pdf_metadata.created_by.clone(),
|
|
497
|
+
#[cfg(feature = "pdf")]
|
|
498
|
+
pages: pdf_metadata.page_structure.clone(),
|
|
499
|
+
#[cfg(feature = "pdf")]
|
|
500
|
+
format: Some(crate::types::FormatMetadata::Pdf(pdf_metadata.pdf_specific)),
|
|
424
501
|
..Default::default()
|
|
425
502
|
},
|
|
503
|
+
pages: final_pages,
|
|
426
504
|
tables,
|
|
427
505
|
detected_languages: None,
|
|
428
506
|
chunks: None,
|
|
@@ -490,4 +568,106 @@ mod tests {
|
|
|
490
568
|
let sample = " . , ; : -- -- ";
|
|
491
569
|
assert!(evaluate_native_text_for_ocr(sample, Some(2)).fallback);
|
|
492
570
|
}
|
|
571
|
+
|
|
572
|
+
#[tokio::test]
|
|
573
|
+
#[cfg(feature = "pdf")]
|
|
574
|
+
async fn test_pdf_batch_mode_validates_page_config_enabled() {
|
|
575
|
+
use crate::core::config::PageConfig;
|
|
576
|
+
|
|
577
|
+
let extractor = PdfExtractor::new();
|
|
578
|
+
|
|
579
|
+
let config = ExtractionConfig {
|
|
580
|
+
pages: Some(PageConfig {
|
|
581
|
+
extract_pages: true,
|
|
582
|
+
insert_page_markers: false,
|
|
583
|
+
marker_format: "<!-- PAGE {page_num} -->".to_string(),
|
|
584
|
+
}),
|
|
585
|
+
..Default::default()
|
|
586
|
+
};
|
|
587
|
+
|
|
588
|
+
let pdf_path =
|
|
589
|
+
std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdfs/google_doc_document.pdf");
|
|
590
|
+
if let Ok(content) = std::fs::read(pdf_path) {
|
|
591
|
+
let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
|
|
592
|
+
assert!(
|
|
593
|
+
result.is_ok(),
|
|
594
|
+
"Failed to extract PDF with page config: {:?}",
|
|
595
|
+
result.err()
|
|
596
|
+
);
|
|
597
|
+
|
|
598
|
+
let extraction_result = result.unwrap();
|
|
599
|
+
assert!(
|
|
600
|
+
extraction_result.pages.is_some(),
|
|
601
|
+
"Pages should be extracted when extract_pages is true"
|
|
602
|
+
);
|
|
603
|
+
}
|
|
604
|
+
}
|
|
605
|
+
|
|
606
|
+
#[tokio::test]
|
|
607
|
+
#[cfg(feature = "pdf")]
|
|
608
|
+
async fn test_pdf_batch_mode_validates_page_config_disabled() {
|
|
609
|
+
let extractor = PdfExtractor::new();
|
|
610
|
+
let config = ExtractionConfig::default();
|
|
611
|
+
|
|
612
|
+
let pdf_path =
|
|
613
|
+
std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdfs/google_doc_document.pdf");
|
|
614
|
+
if let Ok(content) = std::fs::read(pdf_path) {
|
|
615
|
+
let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
|
|
616
|
+
assert!(
|
|
617
|
+
result.is_ok(),
|
|
618
|
+
"Failed to extract PDF without page config: {:?}",
|
|
619
|
+
result.err()
|
|
620
|
+
);
|
|
621
|
+
|
|
622
|
+
let extraction_result = result.unwrap();
|
|
623
|
+
assert!(
|
|
624
|
+
extraction_result.pages.is_none(),
|
|
625
|
+
"Pages should not be extracted when pages config is None"
|
|
626
|
+
);
|
|
627
|
+
}
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
#[tokio::test]
|
|
631
|
+
#[cfg(feature = "pdf")]
|
|
632
|
+
async fn test_pdf_page_marker_validation() {
|
|
633
|
+
use crate::core::config::PageConfig;
|
|
634
|
+
|
|
635
|
+
let extractor = PdfExtractor::new();
|
|
636
|
+
|
|
637
|
+
let config = ExtractionConfig {
|
|
638
|
+
pages: Some(PageConfig {
|
|
639
|
+
extract_pages: true,
|
|
640
|
+
insert_page_markers: true,
|
|
641
|
+
marker_format: "\n\n<!-- PAGE {page_num} -->\n\n".to_string(),
|
|
642
|
+
}),
|
|
643
|
+
..Default::default()
|
|
644
|
+
};
|
|
645
|
+
|
|
646
|
+
let pdf_path =
|
|
647
|
+
std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../test_documents/pdfs/multi_page.pdf");
|
|
648
|
+
if let Ok(content) = std::fs::read(pdf_path) {
|
|
649
|
+
let result = extractor.extract_bytes(&content, "application/pdf", &config).await;
|
|
650
|
+
assert!(
|
|
651
|
+
result.is_ok(),
|
|
652
|
+
"Failed to extract PDF with page markers: {:?}",
|
|
653
|
+
result.err()
|
|
654
|
+
);
|
|
655
|
+
|
|
656
|
+
let extraction_result = result.unwrap();
|
|
657
|
+
let marker_placeholder = "<!-- PAGE ";
|
|
658
|
+
if extraction_result.content.len() > 100 {
|
|
659
|
+
assert!(
|
|
660
|
+
extraction_result.content.contains(marker_placeholder),
|
|
661
|
+
"Page markers should be inserted when configured and document has multiple pages"
|
|
662
|
+
);
|
|
663
|
+
}
|
|
664
|
+
}
|
|
665
|
+
}
|
|
666
|
+
|
|
667
|
+
#[test]
|
|
668
|
+
#[cfg(feature = "pdf")]
|
|
669
|
+
fn test_pdf_extractor_without_feature_pdf() {
|
|
670
|
+
let extractor = PdfExtractor::new();
|
|
671
|
+
assert_eq!(extractor.name(), "pdf-extractor");
|
|
672
|
+
}
|
|
493
673
|
}
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
#![cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
2
|
+
|
|
1
3
|
//! PowerPoint presentation extractor.
|
|
2
4
|
|
|
3
5
|
use crate::Result;
|
|
@@ -69,6 +71,7 @@ impl PptxExtractor {
|
|
|
69
71
|
detected_languages: None,
|
|
70
72
|
chunks: None,
|
|
71
73
|
images: None,
|
|
74
|
+
pages: None,
|
|
72
75
|
};
|
|
73
76
|
image.ocr_result = Some(Box::new(extraction_result));
|
|
74
77
|
}
|
|
@@ -117,17 +120,18 @@ impl DocumentExtractor for PptxExtractor {
|
|
|
117
120
|
) -> Result<ExtractionResult> {
|
|
118
121
|
let extract_images = config.images.as_ref().is_some_and(|img| img.extract_images);
|
|
119
122
|
|
|
123
|
+
let pages_config = config.pages.clone();
|
|
120
124
|
let pptx_result = if crate::core::batch_mode::is_batch_mode() {
|
|
121
125
|
let content_owned = content.to_vec();
|
|
122
126
|
let span = tracing::Span::current();
|
|
123
127
|
tokio::task::spawn_blocking(move || {
|
|
124
128
|
let _guard = span.entered();
|
|
125
|
-
crate::extraction::pptx::extract_pptx_from_bytes(&content_owned, extract_images)
|
|
129
|
+
crate::extraction::pptx::extract_pptx_from_bytes(&content_owned, extract_images, pages_config.as_ref())
|
|
126
130
|
})
|
|
127
131
|
.await
|
|
128
132
|
.map_err(|e| crate::error::KreuzbergError::parsing(format!("PPTX extraction task failed: {}", e)))??
|
|
129
133
|
} else {
|
|
130
|
-
crate::extraction::pptx::extract_pptx_from_bytes(content, extract_images)?
|
|
134
|
+
crate::extraction::pptx::extract_pptx_from_bytes(content, extract_images, config.pages.as_ref())?
|
|
131
135
|
};
|
|
132
136
|
|
|
133
137
|
let mut additional = std::collections::HashMap::new();
|
|
@@ -149,14 +153,21 @@ impl DocumentExtractor for PptxExtractor {
|
|
|
149
153
|
None
|
|
150
154
|
};
|
|
151
155
|
|
|
156
|
+
let mut metadata = Metadata {
|
|
157
|
+
format: Some(crate::types::FormatMetadata::Pptx(pptx_result.metadata)),
|
|
158
|
+
additional,
|
|
159
|
+
..Default::default()
|
|
160
|
+
};
|
|
161
|
+
|
|
162
|
+
if let Some(page_structure) = pptx_result.page_structure {
|
|
163
|
+
metadata.pages = Some(page_structure);
|
|
164
|
+
}
|
|
165
|
+
|
|
152
166
|
Ok(ExtractionResult {
|
|
153
167
|
content: pptx_result.content,
|
|
154
168
|
mime_type: mime_type.to_string(),
|
|
155
|
-
metadata
|
|
156
|
-
|
|
157
|
-
additional,
|
|
158
|
-
..Default::default()
|
|
159
|
-
},
|
|
169
|
+
metadata,
|
|
170
|
+
pages: pptx_result.page_contents,
|
|
160
171
|
tables: vec![],
|
|
161
172
|
detected_languages: None,
|
|
162
173
|
chunks: None,
|
|
@@ -177,7 +188,8 @@ impl DocumentExtractor for PptxExtractor {
|
|
|
177
188
|
|
|
178
189
|
let extract_images = config.images.as_ref().is_some_and(|img| img.extract_images);
|
|
179
190
|
|
|
180
|
-
let pptx_result =
|
|
191
|
+
let pptx_result =
|
|
192
|
+
crate::extraction::pptx::extract_pptx_from_path(path_str, extract_images, config.pages.as_ref())?;
|
|
181
193
|
|
|
182
194
|
let mut additional = std::collections::HashMap::new();
|
|
183
195
|
additional.insert("slide_count".to_string(), serde_json::json!(pptx_result.slide_count));
|
|
@@ -198,14 +210,21 @@ impl DocumentExtractor for PptxExtractor {
|
|
|
198
210
|
None
|
|
199
211
|
};
|
|
200
212
|
|
|
213
|
+
let mut metadata = Metadata {
|
|
214
|
+
format: Some(crate::types::FormatMetadata::Pptx(pptx_result.metadata)),
|
|
215
|
+
additional,
|
|
216
|
+
..Default::default()
|
|
217
|
+
};
|
|
218
|
+
|
|
219
|
+
if let Some(page_structure) = pptx_result.page_structure {
|
|
220
|
+
metadata.pages = Some(page_structure);
|
|
221
|
+
}
|
|
222
|
+
|
|
201
223
|
Ok(ExtractionResult {
|
|
202
224
|
content: pptx_result.content,
|
|
203
225
|
mime_type: mime_type.to_string(),
|
|
204
|
-
metadata
|
|
205
|
-
|
|
206
|
-
additional,
|
|
207
|
-
..Default::default()
|
|
208
|
-
},
|
|
226
|
+
metadata,
|
|
227
|
+
pages: pptx_result.page_contents,
|
|
209
228
|
tables: vec![],
|
|
210
229
|
detected_languages: None,
|
|
211
230
|
chunks: None,
|
|
@@ -391,9 +391,7 @@ fn extract_text_from_rtf(content: &str) -> (String, Vec<Table>) {
|
|
|
391
391
|
if let Some(state) = table_state.as_ref()
|
|
392
392
|
&& !state.in_row
|
|
393
393
|
&& !state.rows.is_empty()
|
|
394
|
-
{
|
|
395
|
-
// We'll finalize once we see content outside the table
|
|
396
|
-
}
|
|
394
|
+
{}
|
|
397
395
|
}
|
|
398
396
|
_ => {}
|
|
399
397
|
}
|
|
@@ -571,7 +569,7 @@ fn extract_rtf_metadata(rtf_content: &str, extracted_text: &str) -> HashMap<Stri
|
|
|
571
569
|
};
|
|
572
570
|
|
|
573
571
|
let mut chars = cleaned_segment.chars().peekable();
|
|
574
|
-
chars.next();
|
|
572
|
+
chars.next();
|
|
575
573
|
let (keyword, numeric) = parse_rtf_control_word(&mut chars);
|
|
576
574
|
let remaining: String = chars.collect();
|
|
577
575
|
let trimmed = remaining.trim();
|
|
@@ -771,6 +769,7 @@ impl DocumentExtractor for RtfExtractor {
|
|
|
771
769
|
additional: metadata_map,
|
|
772
770
|
..Default::default()
|
|
773
771
|
},
|
|
772
|
+
pages: None,
|
|
774
773
|
tables,
|
|
775
774
|
detected_languages: None,
|
|
776
775
|
chunks: None,
|
|
@@ -5,6 +5,7 @@ use crate::core::config::ExtractionConfig;
|
|
|
5
5
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
6
6
|
use crate::types::{ExtractionResult, Metadata};
|
|
7
7
|
use async_trait::async_trait;
|
|
8
|
+
#[cfg(feature = "tokio-runtime")]
|
|
8
9
|
use std::path::Path;
|
|
9
10
|
|
|
10
11
|
/// Structured data extractor supporting JSON, YAML, and TOML.
|
|
@@ -80,6 +81,7 @@ impl DocumentExtractor for StructuredExtractor {
|
|
|
80
81
|
additional,
|
|
81
82
|
..Default::default()
|
|
82
83
|
},
|
|
84
|
+
pages: None,
|
|
83
85
|
tables: vec![],
|
|
84
86
|
detected_languages: None,
|
|
85
87
|
chunks: None,
|
|
@@ -86,6 +86,7 @@ impl DocumentExtractor for PlainTextExtractor {
|
|
|
86
86
|
})),
|
|
87
87
|
..Default::default()
|
|
88
88
|
},
|
|
89
|
+
pages: None,
|
|
89
90
|
tables: vec![],
|
|
90
91
|
detected_languages: None,
|
|
91
92
|
chunks: None,
|
|
@@ -94,7 +95,7 @@ impl DocumentExtractor for PlainTextExtractor {
|
|
|
94
95
|
}
|
|
95
96
|
|
|
96
97
|
fn supported_mime_types(&self) -> &[&str] {
|
|
97
|
-
&["text/plain"]
|
|
98
|
+
&["text/plain", "text/csv", "text/tab-separated-values"]
|
|
98
99
|
}
|
|
99
100
|
|
|
100
101
|
fn priority(&self) -> i32 {
|
|
@@ -178,6 +179,7 @@ impl DocumentExtractor for MarkdownExtractor {
|
|
|
178
179
|
})),
|
|
179
180
|
..Default::default()
|
|
180
181
|
},
|
|
182
|
+
pages: None,
|
|
181
183
|
tables: vec![],
|
|
182
184
|
detected_languages: None,
|
|
183
185
|
chunks: None,
|
|
@@ -245,7 +247,10 @@ mod tests {
|
|
|
245
247
|
let extractor = PlainTextExtractor::new();
|
|
246
248
|
assert_eq!(extractor.name(), "plain-text-extractor");
|
|
247
249
|
assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
|
|
248
|
-
assert_eq!(
|
|
250
|
+
assert_eq!(
|
|
251
|
+
extractor.supported_mime_types(),
|
|
252
|
+
&["text/plain", "text/csv", "text/tab-separated-values"]
|
|
253
|
+
);
|
|
249
254
|
assert_eq!(extractor.priority(), 50);
|
|
250
255
|
}
|
|
251
256
|
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
use crate::Result;
|
|
4
4
|
use crate::core::config::ExtractionConfig;
|
|
5
5
|
use crate::extraction::xml::parse_xml;
|
|
6
|
+
use crate::extractors::SyncExtractor;
|
|
6
7
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
7
8
|
use crate::types::ExtractionResult;
|
|
8
9
|
use async_trait::async_trait;
|
|
@@ -51,21 +52,8 @@ impl Plugin for XmlExtractor {
|
|
|
51
52
|
}
|
|
52
53
|
}
|
|
53
54
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
57
|
-
skip(self, content, _config),
|
|
58
|
-
fields(
|
|
59
|
-
extractor.name = self.name(),
|
|
60
|
-
content.size_bytes = content.len(),
|
|
61
|
-
)
|
|
62
|
-
))]
|
|
63
|
-
async fn extract_bytes(
|
|
64
|
-
&self,
|
|
65
|
-
content: &[u8],
|
|
66
|
-
mime_type: &str,
|
|
67
|
-
_config: &ExtractionConfig,
|
|
68
|
-
) -> Result<ExtractionResult> {
|
|
55
|
+
impl SyncExtractor for XmlExtractor {
|
|
56
|
+
fn extract_sync(&self, content: &[u8], mime_type: &str, _config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
69
57
|
let xml_result = parse_xml(content, false)?;
|
|
70
58
|
|
|
71
59
|
Ok(ExtractionResult {
|
|
@@ -82,8 +70,28 @@ impl DocumentExtractor for XmlExtractor {
|
|
|
82
70
|
detected_languages: None,
|
|
83
71
|
chunks: None,
|
|
84
72
|
images: None,
|
|
73
|
+
pages: None,
|
|
85
74
|
})
|
|
86
75
|
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
#[async_trait]
|
|
79
|
+
impl DocumentExtractor for XmlExtractor {
|
|
80
|
+
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
81
|
+
skip(self, content, config),
|
|
82
|
+
fields(
|
|
83
|
+
extractor.name = self.name(),
|
|
84
|
+
content.size_bytes = content.len(),
|
|
85
|
+
)
|
|
86
|
+
))]
|
|
87
|
+
async fn extract_bytes(
|
|
88
|
+
&self,
|
|
89
|
+
content: &[u8],
|
|
90
|
+
mime_type: &str,
|
|
91
|
+
config: &ExtractionConfig,
|
|
92
|
+
) -> Result<ExtractionResult> {
|
|
93
|
+
self.extract_sync(content, mime_type, config)
|
|
94
|
+
}
|
|
87
95
|
|
|
88
96
|
fn supported_mime_types(&self) -> &[&str] {
|
|
89
97
|
&["application/xml", "text/xml", "image/svg+xml"]
|
|
@@ -92,6 +100,10 @@ impl DocumentExtractor for XmlExtractor {
|
|
|
92
100
|
fn priority(&self) -> i32 {
|
|
93
101
|
50
|
|
94
102
|
}
|
|
103
|
+
|
|
104
|
+
fn as_sync_extractor(&self) -> Option<&dyn crate::extractors::SyncExtractor> {
|
|
105
|
+
Some(self)
|
|
106
|
+
}
|
|
95
107
|
}
|
|
96
108
|
|
|
97
109
|
#[cfg(test)]
|
|
@@ -45,7 +45,8 @@ impl Plugin for KeywordExtractor {
|
|
|
45
45
|
}
|
|
46
46
|
}
|
|
47
47
|
|
|
48
|
-
#[async_trait]
|
|
48
|
+
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
|
|
49
|
+
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
|
|
49
50
|
impl PostProcessor for KeywordExtractor {
|
|
50
51
|
async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
|
|
51
52
|
let keyword_config = match &config.keywords {
|
|
@@ -112,6 +113,7 @@ machine learning that uses neural networks with multiple layers.
|
|
|
112
113
|
detected_languages: None,
|
|
113
114
|
chunks: None,
|
|
114
115
|
images: None,
|
|
116
|
+
pages: None,
|
|
115
117
|
};
|
|
116
118
|
|
|
117
119
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -140,6 +142,7 @@ machine learning that uses neural networks with multiple layers.
|
|
|
140
142
|
detected_languages: None,
|
|
141
143
|
chunks: None,
|
|
142
144
|
images: None,
|
|
145
|
+
pages: None,
|
|
143
146
|
};
|
|
144
147
|
|
|
145
148
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -164,6 +167,7 @@ machine learning that uses neural networks with multiple layers.
|
|
|
164
167
|
detected_languages: None,
|
|
165
168
|
chunks: None,
|
|
166
169
|
images: None,
|
|
170
|
+
pages: None,
|
|
167
171
|
};
|
|
168
172
|
|
|
169
173
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -188,6 +192,7 @@ machine learning that uses neural networks with multiple layers.
|
|
|
188
192
|
detected_languages: None,
|
|
189
193
|
chunks: None,
|
|
190
194
|
images: None,
|
|
195
|
+
pages: None,
|
|
191
196
|
};
|
|
192
197
|
|
|
193
198
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -223,6 +228,7 @@ machine learning that uses neural networks with multiple layers.
|
|
|
223
228
|
detected_languages: None,
|
|
224
229
|
chunks: None,
|
|
225
230
|
images: None,
|
|
231
|
+
pages: None,
|
|
226
232
|
};
|
|
227
233
|
|
|
228
234
|
let config_with_keywords = ExtractionConfig {
|
|
@@ -247,6 +253,7 @@ machine learning that uses neural networks with multiple layers.
|
|
|
247
253
|
detected_languages: None,
|
|
248
254
|
chunks: None,
|
|
249
255
|
images: None,
|
|
256
|
+
pages: None,
|
|
250
257
|
};
|
|
251
258
|
|
|
252
259
|
let long_result = ExtractionResult {
|
|
@@ -257,6 +264,7 @@ machine learning that uses neural networks with multiple layers.
|
|
|
257
264
|
detected_languages: None,
|
|
258
265
|
chunks: None,
|
|
259
266
|
images: None,
|
|
267
|
+
pages: None,
|
|
260
268
|
};
|
|
261
269
|
|
|
262
270
|
let short_duration = processor.estimated_duration_ms(&short_result);
|