kreuzberg 4.2.0 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
- data/lib/kreuzberg/cli.rb +16 -6
- data/lib/kreuzberg/cli_proxy.rb +3 -1
- data/lib/kreuzberg/config.rb +56 -9
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/extraction_api.rb +20 -4
- data/lib/kreuzberg/result.rb +12 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -0
- data/sig/kreuzberg.rbs +23 -11
- data/spec/binding/batch_spec.rb +6 -5
- data/spec/binding/error_recovery_spec.rb +3 -3
- data/spec/binding/tables_spec.rb +11 -2
- data/spec/unit/config/output_format_spec.rb +18 -18
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/startup.rs +15 -1
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
- data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
- data/vendor/kreuzberg/src/core/io.rs +7 -7
- data/vendor/kreuzberg/src/core/mime.rs +4 -4
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
- data/vendor/kreuzberg/tests/config_behavioral.rs +14 -12
- data/vendor/kreuzberg/tests/core_integration.rs +2 -4
- data/vendor/kreuzberg/tests/mime_detection.rs +3 -2
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +284 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +4 -2
|
@@ -1,13 +1,15 @@
|
|
|
1
1
|
//! Regression tests for PPTX/PPSX extraction bugs
|
|
2
2
|
//!
|
|
3
3
|
//! GitHub Issue #321: PPTX extraction fails on shapes without txBody (image placeholders) + PPSX not supported
|
|
4
|
+
//! GitHub Issue #329: Extracting images from PPTX results in reversed page numbers
|
|
4
5
|
//!
|
|
5
6
|
//! Bug 1: "No txBody found" - PPTX extraction fails when any shape lacks a text body
|
|
6
7
|
//! Bug 2: PPSX not supported - PowerPoint Show files rejected entirely
|
|
8
|
+
//! Bug 3: Image page numbers reversed - image on slide 1 reports page_number=2
|
|
7
9
|
|
|
8
10
|
#![cfg(feature = "office")]
|
|
9
11
|
|
|
10
|
-
use kreuzberg::{ExtractionConfig, extract_file};
|
|
12
|
+
use kreuzberg::{ExtractionConfig, ImageExtractionConfig, extract_file};
|
|
11
13
|
use std::io::Write;
|
|
12
14
|
use tempfile::NamedTempFile;
|
|
13
15
|
use zip::CompressionMethod;
|
|
@@ -512,3 +514,284 @@ async fn test_pptx_mixed_shapes_extraction() {
|
|
|
512
514
|
}
|
|
513
515
|
}
|
|
514
516
|
}
|
|
517
|
+
|
|
518
|
+
/// Test that images extracted from PPTX have correct page numbers.
|
|
519
|
+
///
|
|
520
|
+
/// When a PPTX has multiple slides and an image on slide 1, the extracted image
|
|
521
|
+
/// should have page_number=1 (not reversed).
|
|
522
|
+
///
|
|
523
|
+
/// GitHub Issue #329: Image on slide 1 of 2-slide PPTX reports page_number=2
|
|
524
|
+
#[tokio::test]
|
|
525
|
+
async fn test_pptx_image_page_numbers_not_reversed() {
|
|
526
|
+
// Create a PPTX with 2 slides, image on slide 1
|
|
527
|
+
let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
|
|
528
|
+
|
|
529
|
+
// A minimal 1x1 red PNG image (valid PNG format)
|
|
530
|
+
let png_image: &[u8] = &[
|
|
531
|
+
0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG signature
|
|
532
|
+
0x00, 0x00, 0x00, 0x0D, // IHDR chunk length
|
|
533
|
+
0x49, 0x48, 0x44, 0x52, // "IHDR"
|
|
534
|
+
0x00, 0x00, 0x00, 0x01, // width: 1
|
|
535
|
+
0x00, 0x00, 0x00, 0x01, // height: 1
|
|
536
|
+
0x08, 0x02, // bit depth: 8, color type: RGB
|
|
537
|
+
0x00, 0x00, 0x00, // compression, filter, interlace
|
|
538
|
+
0x90, 0x77, 0x53, 0xDE, // IHDR CRC
|
|
539
|
+
0x00, 0x00, 0x00, 0x0C, // IDAT chunk length
|
|
540
|
+
0x49, 0x44, 0x41, 0x54, // "IDAT"
|
|
541
|
+
0x08, 0xD7, 0x63, 0xF8, 0xCF, 0xC0, 0x00, 0x00, // compressed data
|
|
542
|
+
0x01, 0x01, 0x01, 0x00, // checksum
|
|
543
|
+
0x18, 0xDD, 0x8D, 0xB4, // IDAT CRC
|
|
544
|
+
0x00, 0x00, 0x00, 0x00, // IEND chunk length
|
|
545
|
+
0x49, 0x45, 0x4E, 0x44, // "IEND"
|
|
546
|
+
0xAE, 0x42, 0x60, 0x82, // IEND CRC
|
|
547
|
+
];
|
|
548
|
+
|
|
549
|
+
{
|
|
550
|
+
let mut zip = ZipWriter::new(&mut temp_file);
|
|
551
|
+
let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
|
|
552
|
+
|
|
553
|
+
// Add [Content_Types].xml
|
|
554
|
+
zip.start_file("[Content_Types].xml", options)
|
|
555
|
+
.expect("Operation failed");
|
|
556
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
557
|
+
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
|
558
|
+
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
|
559
|
+
<Default Extension="xml" ContentType="application/xml"/>
|
|
560
|
+
<Default Extension="png" ContentType="image/png"/>
|
|
561
|
+
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
|
|
562
|
+
<Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
|
|
563
|
+
<Override PartName="/ppt/slides/slide2.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
|
|
564
|
+
</Types>"#).expect("Operation failed");
|
|
565
|
+
|
|
566
|
+
// Add _rels/.rels
|
|
567
|
+
zip.start_file("_rels/.rels", options).expect("Operation failed");
|
|
568
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
569
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
570
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
|
|
571
|
+
</Relationships>"#).expect("Operation failed");
|
|
572
|
+
|
|
573
|
+
// Add ppt/presentation.xml
|
|
574
|
+
zip.start_file("ppt/presentation.xml", options)
|
|
575
|
+
.expect("Operation failed");
|
|
576
|
+
zip.write_all(
|
|
577
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
578
|
+
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
579
|
+
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
|
|
580
|
+
<p:sldIdLst>
|
|
581
|
+
<p:sldId id="256" r:id="rId2"/>
|
|
582
|
+
<p:sldId id="257" r:id="rId3"/>
|
|
583
|
+
</p:sldIdLst>
|
|
584
|
+
</p:presentation>"#,
|
|
585
|
+
)
|
|
586
|
+
.expect("Operation failed");
|
|
587
|
+
|
|
588
|
+
// Add ppt/_rels/presentation.xml.rels
|
|
589
|
+
// BUG REPRODUCTION: Slides listed in REVERSE order in XML (slide2 before slide1)
|
|
590
|
+
// This is valid XML - PowerPoint doesn't guarantee order in rels files
|
|
591
|
+
// GitHub Issue #329: This causes page numbers to be reversed
|
|
592
|
+
zip.start_file("ppt/_rels/presentation.xml.rels", options)
|
|
593
|
+
.expect("Operation failed");
|
|
594
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
595
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
596
|
+
<Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide2.xml"/>
|
|
597
|
+
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
|
|
598
|
+
</Relationships>"#).expect("Operation failed");
|
|
599
|
+
|
|
600
|
+
// Add the image file
|
|
601
|
+
zip.start_file("ppt/media/image1.png", options)
|
|
602
|
+
.expect("Operation failed");
|
|
603
|
+
zip.write_all(png_image).expect("Operation failed");
|
|
604
|
+
|
|
605
|
+
// Add slide 1 WITH an image
|
|
606
|
+
zip.start_file("ppt/slides/slide1.xml", options)
|
|
607
|
+
.expect("Operation failed");
|
|
608
|
+
zip.write_all(
|
|
609
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
610
|
+
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
611
|
+
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
|
|
612
|
+
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
|
|
613
|
+
<p:cSld>
|
|
614
|
+
<p:spTree>
|
|
615
|
+
<p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
|
|
616
|
+
<p:grpSpPr/>
|
|
617
|
+
<p:sp>
|
|
618
|
+
<p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
|
|
619
|
+
<p:spPr/>
|
|
620
|
+
<p:txBody><a:bodyPr/><a:lstStyle/><a:p><a:r><a:t>Slide 1 - Has Image</a:t></a:r></a:p></p:txBody>
|
|
621
|
+
</p:sp>
|
|
622
|
+
<p:pic>
|
|
623
|
+
<p:nvPicPr>
|
|
624
|
+
<p:cNvPr id="3" name="Picture 1"/>
|
|
625
|
+
<p:cNvPicPr><a:picLocks noChangeAspect="1"/></p:cNvPicPr>
|
|
626
|
+
<p:nvPr/>
|
|
627
|
+
</p:nvPicPr>
|
|
628
|
+
<p:blipFill>
|
|
629
|
+
<a:blip r:embed="rId2"/>
|
|
630
|
+
<a:stretch><a:fillRect/></a:stretch>
|
|
631
|
+
</p:blipFill>
|
|
632
|
+
<p:spPr>
|
|
633
|
+
<a:xfrm><a:off x="0" y="0"/><a:ext cx="100000" cy="100000"/></a:xfrm>
|
|
634
|
+
<a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
|
|
635
|
+
</p:spPr>
|
|
636
|
+
</p:pic>
|
|
637
|
+
</p:spTree>
|
|
638
|
+
</p:cSld>
|
|
639
|
+
</p:sld>"#,
|
|
640
|
+
)
|
|
641
|
+
.expect("Operation failed");
|
|
642
|
+
|
|
643
|
+
// Add slide 1 relationships (points to the image)
|
|
644
|
+
zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
|
|
645
|
+
.expect("Operation failed");
|
|
646
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
647
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
648
|
+
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image1.png"/>
|
|
649
|
+
</Relationships>"#).expect("Operation failed");
|
|
650
|
+
|
|
651
|
+
// Add slide 2 WITHOUT an image
|
|
652
|
+
zip.start_file("ppt/slides/slide2.xml", options)
|
|
653
|
+
.expect("Operation failed");
|
|
654
|
+
zip.write_all(
|
|
655
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
656
|
+
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
657
|
+
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
|
|
658
|
+
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
|
|
659
|
+
<p:cSld>
|
|
660
|
+
<p:spTree>
|
|
661
|
+
<p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
|
|
662
|
+
<p:grpSpPr/>
|
|
663
|
+
<p:sp>
|
|
664
|
+
<p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
|
|
665
|
+
<p:spPr/>
|
|
666
|
+
<p:txBody><a:bodyPr/><a:lstStyle/><a:p><a:r><a:t>Slide 2 - No Image</a:t></a:r></a:p></p:txBody>
|
|
667
|
+
</p:sp>
|
|
668
|
+
</p:spTree>
|
|
669
|
+
</p:cSld>
|
|
670
|
+
</p:sld>"#,
|
|
671
|
+
)
|
|
672
|
+
.expect("Operation failed");
|
|
673
|
+
|
|
674
|
+
// Add empty slide 2 relationships
|
|
675
|
+
zip.start_file("ppt/slides/_rels/slide2.xml.rels", options)
|
|
676
|
+
.expect("Operation failed");
|
|
677
|
+
zip.write_all(
|
|
678
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
679
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
680
|
+
</Relationships>"#,
|
|
681
|
+
)
|
|
682
|
+
.expect("Operation failed");
|
|
683
|
+
|
|
684
|
+
zip.finish().expect("Operation failed");
|
|
685
|
+
}
|
|
686
|
+
|
|
687
|
+
// Extract with images enabled
|
|
688
|
+
let config = ExtractionConfig {
|
|
689
|
+
images: Some(ImageExtractionConfig {
|
|
690
|
+
extract_images: true,
|
|
691
|
+
target_dpi: 300,
|
|
692
|
+
max_image_dimension: 4096,
|
|
693
|
+
auto_adjust_dpi: true,
|
|
694
|
+
min_dpi: 72,
|
|
695
|
+
max_dpi: 600,
|
|
696
|
+
}),
|
|
697
|
+
..Default::default()
|
|
698
|
+
};
|
|
699
|
+
|
|
700
|
+
let result = extract_file(
|
|
701
|
+
temp_file.path(),
|
|
702
|
+
Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
|
|
703
|
+
&config,
|
|
704
|
+
)
|
|
705
|
+
.await;
|
|
706
|
+
|
|
707
|
+
match result {
|
|
708
|
+
Ok(extraction) => {
|
|
709
|
+
// Verify text extraction works
|
|
710
|
+
assert!(extraction.content.contains("Slide 1"), "Should extract slide 1 text");
|
|
711
|
+
assert!(extraction.content.contains("Slide 2"), "Should extract slide 2 text");
|
|
712
|
+
|
|
713
|
+
// Verify we got an image
|
|
714
|
+
let images = extraction.images.as_ref().expect("Images should be present");
|
|
715
|
+
assert!(!images.is_empty(), "Should extract at least one image");
|
|
716
|
+
|
|
717
|
+
// THE CRITICAL TEST: Image on slide 1 should have page_number=1, NOT 2
|
|
718
|
+
let image = &images[0];
|
|
719
|
+
assert_eq!(
|
|
720
|
+
image.page_number,
|
|
721
|
+
Some(1),
|
|
722
|
+
"GitHub Issue #329: Image on slide 1 should have page_number=1, but got {:?}. \
|
|
723
|
+
The page numbers are reversed!",
|
|
724
|
+
image.page_number
|
|
725
|
+
);
|
|
726
|
+
|
|
727
|
+
println!("✅ PPTX image page numbers are correct!");
|
|
728
|
+
println!(" Image on slide 1 has page_number={:?}", image.page_number);
|
|
729
|
+
}
|
|
730
|
+
Err(e) => {
|
|
731
|
+
panic!("PPTX extraction failed: {:?}", e);
|
|
732
|
+
}
|
|
733
|
+
}
|
|
734
|
+
}
|
|
735
|
+
|
|
736
|
+
/// Test with actual user-provided PPTX file from GitHub Issue #329.
|
|
737
|
+
///
|
|
738
|
+
/// The user's file has slides listed in reverse order in presentation.xml.rels,
|
|
739
|
+
/// which caused images to have incorrect page numbers.
|
|
740
|
+
#[tokio::test]
|
|
741
|
+
async fn test_pptx_image_page_numbers_issue329_user_file() {
|
|
742
|
+
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
743
|
+
.parent()
|
|
744
|
+
.expect("Operation failed")
|
|
745
|
+
.parent()
|
|
746
|
+
.expect("Operation failed");
|
|
747
|
+
let test_file = workspace_root.join("test_documents/presentations/pptx_reversed_slide_order_issue329.pptx");
|
|
748
|
+
|
|
749
|
+
if !test_file.exists() {
|
|
750
|
+
println!("Skipping test: User file not found at {:?}", test_file);
|
|
751
|
+
return;
|
|
752
|
+
}
|
|
753
|
+
|
|
754
|
+
// Extract with images enabled
|
|
755
|
+
let config = ExtractionConfig {
|
|
756
|
+
images: Some(ImageExtractionConfig {
|
|
757
|
+
extract_images: true,
|
|
758
|
+
target_dpi: 300,
|
|
759
|
+
max_image_dimension: 4096,
|
|
760
|
+
auto_adjust_dpi: true,
|
|
761
|
+
min_dpi: 72,
|
|
762
|
+
max_dpi: 600,
|
|
763
|
+
}),
|
|
764
|
+
..Default::default()
|
|
765
|
+
};
|
|
766
|
+
|
|
767
|
+
let result = extract_file(&test_file, None, &config).await;
|
|
768
|
+
|
|
769
|
+
match result {
|
|
770
|
+
Ok(extraction) => {
|
|
771
|
+
// The user's file has an image on slide 1
|
|
772
|
+
let images = extraction.images.as_ref().expect("Images should be extracted");
|
|
773
|
+
|
|
774
|
+
if images.is_empty() {
|
|
775
|
+
println!("No images extracted from user file (may not have embedded images)");
|
|
776
|
+
return;
|
|
777
|
+
}
|
|
778
|
+
|
|
779
|
+
// All images should have page_number = 1 since they're on the first slide
|
|
780
|
+
for (idx, image) in images.iter().enumerate() {
|
|
781
|
+
assert_eq!(
|
|
782
|
+
image.page_number,
|
|
783
|
+
Some(1),
|
|
784
|
+
"GitHub Issue #329: Image {} should have page_number=1, but got {:?}",
|
|
785
|
+
idx,
|
|
786
|
+
image.page_number
|
|
787
|
+
);
|
|
788
|
+
}
|
|
789
|
+
|
|
790
|
+
println!("✅ User file from Issue #329 - image page numbers correct!");
|
|
791
|
+
println!(" Found {} images, all with page_number=1", images.len());
|
|
792
|
+
}
|
|
793
|
+
Err(e) => {
|
|
794
|
+
panic!("Failed to extract user file: {:?}", e);
|
|
795
|
+
}
|
|
796
|
+
}
|
|
797
|
+
}
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.2.0
|
|
4
|
+
version: 4.2.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-01-
|
|
11
|
+
date: 2026-01-27 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -233,6 +233,7 @@ files:
|
|
|
233
233
|
- lib/kreuzberg/cli.rb
|
|
234
234
|
- lib/kreuzberg/cli_proxy.rb
|
|
235
235
|
- lib/kreuzberg/config.rb
|
|
236
|
+
- lib/kreuzberg/djot_content.rb
|
|
236
237
|
- lib/kreuzberg/error_context.rb
|
|
237
238
|
- lib/kreuzberg/errors.rb
|
|
238
239
|
- lib/kreuzberg/extraction_api.rb
|
|
@@ -591,6 +592,7 @@ files:
|
|
|
591
592
|
- vendor/kreuzberg/src/plugins/registry/ocr.rs
|
|
592
593
|
- vendor/kreuzberg/src/plugins/registry/processor.rs
|
|
593
594
|
- vendor/kreuzberg/src/plugins/registry/validator.rs
|
|
595
|
+
- vendor/kreuzberg/src/plugins/startup_validation.rs
|
|
594
596
|
- vendor/kreuzberg/src/plugins/traits.rs
|
|
595
597
|
- vendor/kreuzberg/src/plugins/validator/mod.rs
|
|
596
598
|
- vendor/kreuzberg/src/plugins/validator/registry.rs
|