kreuzberg 4.2.0 → 4.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
- data/lib/kreuzberg/cli.rb +16 -6
- data/lib/kreuzberg/cli_proxy.rb +3 -1
- data/lib/kreuzberg/config.rb +59 -28
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/extraction_api.rb +20 -4
- data/lib/kreuzberg/result.rb +12 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -0
- data/sig/kreuzberg.rbs +23 -11
- data/spec/binding/batch_spec.rb +6 -5
- data/spec/binding/config_spec.rb +1 -1
- data/spec/binding/error_recovery_spec.rb +3 -3
- data/spec/binding/tables_spec.rb +11 -2
- data/spec/unit/config/extraction_config_spec.rb +2 -2
- data/spec/unit/config/output_format_spec.rb +18 -18
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -2
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +60 -0
- data/vendor/kreuzberg/src/api/handlers.rs +153 -32
- data/vendor/kreuzberg/src/api/mod.rs +2 -0
- data/vendor/kreuzberg/src/api/openapi.rs +141 -0
- data/vendor/kreuzberg/src/api/router.rs +24 -2
- data/vendor/kreuzberg/src/api/startup.rs +21 -1
- data/vendor/kreuzberg/src/api/types.rs +50 -4
- data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
- data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
- data/vendor/kreuzberg/src/core/io.rs +7 -7
- data/vendor/kreuzberg/src/core/mime.rs +4 -4
- data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
- data/vendor/kreuzberg/tests/config_behavioral.rs +14 -12
- data/vendor/kreuzberg/tests/core_integration.rs +2 -4
- data/vendor/kreuzberg/tests/mime_detection.rs +3 -2
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +284 -1
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
- data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
- data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
- data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
- data/vendor/kreuzberg-ffi/src/types.rs +8 -5
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +5 -2
|
@@ -1,13 +1,15 @@
|
|
|
1
1
|
//! Regression tests for PPTX/PPSX extraction bugs
|
|
2
2
|
//!
|
|
3
3
|
//! GitHub Issue #321: PPTX extraction fails on shapes without txBody (image placeholders) + PPSX not supported
|
|
4
|
+
//! GitHub Issue #329: Extracting images from PPTX results in reversed page numbers
|
|
4
5
|
//!
|
|
5
6
|
//! Bug 1: "No txBody found" - PPTX extraction fails when any shape lacks a text body
|
|
6
7
|
//! Bug 2: PPSX not supported - PowerPoint Show files rejected entirely
|
|
8
|
+
//! Bug 3: Image page numbers reversed - image on slide 1 reports page_number=2
|
|
7
9
|
|
|
8
10
|
#![cfg(feature = "office")]
|
|
9
11
|
|
|
10
|
-
use kreuzberg::{ExtractionConfig, extract_file};
|
|
12
|
+
use kreuzberg::{ExtractionConfig, ImageExtractionConfig, extract_file};
|
|
11
13
|
use std::io::Write;
|
|
12
14
|
use tempfile::NamedTempFile;
|
|
13
15
|
use zip::CompressionMethod;
|
|
@@ -512,3 +514,284 @@ async fn test_pptx_mixed_shapes_extraction() {
|
|
|
512
514
|
}
|
|
513
515
|
}
|
|
514
516
|
}
|
|
517
|
+
|
|
518
|
+
/// Test that images extracted from PPTX have correct page numbers.
|
|
519
|
+
///
|
|
520
|
+
/// When a PPTX has multiple slides and an image on slide 1, the extracted image
|
|
521
|
+
/// should have page_number=1 (not reversed).
|
|
522
|
+
///
|
|
523
|
+
/// GitHub Issue #329: Image on slide 1 of 2-slide PPTX reports page_number=2
|
|
524
|
+
#[tokio::test]
|
|
525
|
+
async fn test_pptx_image_page_numbers_not_reversed() {
|
|
526
|
+
// Create a PPTX with 2 slides, image on slide 1
|
|
527
|
+
let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
|
|
528
|
+
|
|
529
|
+
// A minimal 1x1 red PNG image (valid PNG format)
|
|
530
|
+
let png_image: &[u8] = &[
|
|
531
|
+
0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG signature
|
|
532
|
+
0x00, 0x00, 0x00, 0x0D, // IHDR chunk length
|
|
533
|
+
0x49, 0x48, 0x44, 0x52, // "IHDR"
|
|
534
|
+
0x00, 0x00, 0x00, 0x01, // width: 1
|
|
535
|
+
0x00, 0x00, 0x00, 0x01, // height: 1
|
|
536
|
+
0x08, 0x02, // bit depth: 8, color type: RGB
|
|
537
|
+
0x00, 0x00, 0x00, // compression, filter, interlace
|
|
538
|
+
0x90, 0x77, 0x53, 0xDE, // IHDR CRC
|
|
539
|
+
0x00, 0x00, 0x00, 0x0C, // IDAT chunk length
|
|
540
|
+
0x49, 0x44, 0x41, 0x54, // "IDAT"
|
|
541
|
+
0x08, 0xD7, 0x63, 0xF8, 0xCF, 0xC0, 0x00, 0x00, // compressed data
|
|
542
|
+
0x01, 0x01, 0x01, 0x00, // checksum
|
|
543
|
+
0x18, 0xDD, 0x8D, 0xB4, // IDAT CRC
|
|
544
|
+
0x00, 0x00, 0x00, 0x00, // IEND chunk length
|
|
545
|
+
0x49, 0x45, 0x4E, 0x44, // "IEND"
|
|
546
|
+
0xAE, 0x42, 0x60, 0x82, // IEND CRC
|
|
547
|
+
];
|
|
548
|
+
|
|
549
|
+
{
|
|
550
|
+
let mut zip = ZipWriter::new(&mut temp_file);
|
|
551
|
+
let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
|
|
552
|
+
|
|
553
|
+
// Add [Content_Types].xml
|
|
554
|
+
zip.start_file("[Content_Types].xml", options)
|
|
555
|
+
.expect("Operation failed");
|
|
556
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
557
|
+
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
|
558
|
+
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
|
559
|
+
<Default Extension="xml" ContentType="application/xml"/>
|
|
560
|
+
<Default Extension="png" ContentType="image/png"/>
|
|
561
|
+
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
|
|
562
|
+
<Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
|
|
563
|
+
<Override PartName="/ppt/slides/slide2.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
|
|
564
|
+
</Types>"#).expect("Operation failed");
|
|
565
|
+
|
|
566
|
+
// Add _rels/.rels
|
|
567
|
+
zip.start_file("_rels/.rels", options).expect("Operation failed");
|
|
568
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
569
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
570
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
|
|
571
|
+
</Relationships>"#).expect("Operation failed");
|
|
572
|
+
|
|
573
|
+
// Add ppt/presentation.xml
|
|
574
|
+
zip.start_file("ppt/presentation.xml", options)
|
|
575
|
+
.expect("Operation failed");
|
|
576
|
+
zip.write_all(
|
|
577
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
578
|
+
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
579
|
+
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
|
|
580
|
+
<p:sldIdLst>
|
|
581
|
+
<p:sldId id="256" r:id="rId2"/>
|
|
582
|
+
<p:sldId id="257" r:id="rId3"/>
|
|
583
|
+
</p:sldIdLst>
|
|
584
|
+
</p:presentation>"#,
|
|
585
|
+
)
|
|
586
|
+
.expect("Operation failed");
|
|
587
|
+
|
|
588
|
+
// Add ppt/_rels/presentation.xml.rels
|
|
589
|
+
// BUG REPRODUCTION: Slides listed in REVERSE order in XML (slide2 before slide1)
|
|
590
|
+
// This is valid XML - PowerPoint doesn't guarantee order in rels files
|
|
591
|
+
// GitHub Issue #329: This causes page numbers to be reversed
|
|
592
|
+
zip.start_file("ppt/_rels/presentation.xml.rels", options)
|
|
593
|
+
.expect("Operation failed");
|
|
594
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
595
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
596
|
+
<Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide2.xml"/>
|
|
597
|
+
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
|
|
598
|
+
</Relationships>"#).expect("Operation failed");
|
|
599
|
+
|
|
600
|
+
// Add the image file
|
|
601
|
+
zip.start_file("ppt/media/image1.png", options)
|
|
602
|
+
.expect("Operation failed");
|
|
603
|
+
zip.write_all(png_image).expect("Operation failed");
|
|
604
|
+
|
|
605
|
+
// Add slide 1 WITH an image
|
|
606
|
+
zip.start_file("ppt/slides/slide1.xml", options)
|
|
607
|
+
.expect("Operation failed");
|
|
608
|
+
zip.write_all(
|
|
609
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
610
|
+
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
611
|
+
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
|
|
612
|
+
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
|
|
613
|
+
<p:cSld>
|
|
614
|
+
<p:spTree>
|
|
615
|
+
<p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
|
|
616
|
+
<p:grpSpPr/>
|
|
617
|
+
<p:sp>
|
|
618
|
+
<p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
|
|
619
|
+
<p:spPr/>
|
|
620
|
+
<p:txBody><a:bodyPr/><a:lstStyle/><a:p><a:r><a:t>Slide 1 - Has Image</a:t></a:r></a:p></p:txBody>
|
|
621
|
+
</p:sp>
|
|
622
|
+
<p:pic>
|
|
623
|
+
<p:nvPicPr>
|
|
624
|
+
<p:cNvPr id="3" name="Picture 1"/>
|
|
625
|
+
<p:cNvPicPr><a:picLocks noChangeAspect="1"/></p:cNvPicPr>
|
|
626
|
+
<p:nvPr/>
|
|
627
|
+
</p:nvPicPr>
|
|
628
|
+
<p:blipFill>
|
|
629
|
+
<a:blip r:embed="rId2"/>
|
|
630
|
+
<a:stretch><a:fillRect/></a:stretch>
|
|
631
|
+
</p:blipFill>
|
|
632
|
+
<p:spPr>
|
|
633
|
+
<a:xfrm><a:off x="0" y="0"/><a:ext cx="100000" cy="100000"/></a:xfrm>
|
|
634
|
+
<a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
|
|
635
|
+
</p:spPr>
|
|
636
|
+
</p:pic>
|
|
637
|
+
</p:spTree>
|
|
638
|
+
</p:cSld>
|
|
639
|
+
</p:sld>"#,
|
|
640
|
+
)
|
|
641
|
+
.expect("Operation failed");
|
|
642
|
+
|
|
643
|
+
// Add slide 1 relationships (points to the image)
|
|
644
|
+
zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
|
|
645
|
+
.expect("Operation failed");
|
|
646
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
647
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
648
|
+
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image1.png"/>
|
|
649
|
+
</Relationships>"#).expect("Operation failed");
|
|
650
|
+
|
|
651
|
+
// Add slide 2 WITHOUT an image
|
|
652
|
+
zip.start_file("ppt/slides/slide2.xml", options)
|
|
653
|
+
.expect("Operation failed");
|
|
654
|
+
zip.write_all(
|
|
655
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
656
|
+
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
657
|
+
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
|
|
658
|
+
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
|
|
659
|
+
<p:cSld>
|
|
660
|
+
<p:spTree>
|
|
661
|
+
<p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
|
|
662
|
+
<p:grpSpPr/>
|
|
663
|
+
<p:sp>
|
|
664
|
+
<p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
|
|
665
|
+
<p:spPr/>
|
|
666
|
+
<p:txBody><a:bodyPr/><a:lstStyle/><a:p><a:r><a:t>Slide 2 - No Image</a:t></a:r></a:p></p:txBody>
|
|
667
|
+
</p:sp>
|
|
668
|
+
</p:spTree>
|
|
669
|
+
</p:cSld>
|
|
670
|
+
</p:sld>"#,
|
|
671
|
+
)
|
|
672
|
+
.expect("Operation failed");
|
|
673
|
+
|
|
674
|
+
// Add empty slide 2 relationships
|
|
675
|
+
zip.start_file("ppt/slides/_rels/slide2.xml.rels", options)
|
|
676
|
+
.expect("Operation failed");
|
|
677
|
+
zip.write_all(
|
|
678
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
679
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
680
|
+
</Relationships>"#,
|
|
681
|
+
)
|
|
682
|
+
.expect("Operation failed");
|
|
683
|
+
|
|
684
|
+
zip.finish().expect("Operation failed");
|
|
685
|
+
}
|
|
686
|
+
|
|
687
|
+
// Extract with images enabled
|
|
688
|
+
let config = ExtractionConfig {
|
|
689
|
+
images: Some(ImageExtractionConfig {
|
|
690
|
+
extract_images: true,
|
|
691
|
+
target_dpi: 300,
|
|
692
|
+
max_image_dimension: 4096,
|
|
693
|
+
auto_adjust_dpi: true,
|
|
694
|
+
min_dpi: 72,
|
|
695
|
+
max_dpi: 600,
|
|
696
|
+
}),
|
|
697
|
+
..Default::default()
|
|
698
|
+
};
|
|
699
|
+
|
|
700
|
+
let result = extract_file(
|
|
701
|
+
temp_file.path(),
|
|
702
|
+
Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
|
|
703
|
+
&config,
|
|
704
|
+
)
|
|
705
|
+
.await;
|
|
706
|
+
|
|
707
|
+
match result {
|
|
708
|
+
Ok(extraction) => {
|
|
709
|
+
// Verify text extraction works
|
|
710
|
+
assert!(extraction.content.contains("Slide 1"), "Should extract slide 1 text");
|
|
711
|
+
assert!(extraction.content.contains("Slide 2"), "Should extract slide 2 text");
|
|
712
|
+
|
|
713
|
+
// Verify we got an image
|
|
714
|
+
let images = extraction.images.as_ref().expect("Images should be present");
|
|
715
|
+
assert!(!images.is_empty(), "Should extract at least one image");
|
|
716
|
+
|
|
717
|
+
// THE CRITICAL TEST: Image on slide 1 should have page_number=1, NOT 2
|
|
718
|
+
let image = &images[0];
|
|
719
|
+
assert_eq!(
|
|
720
|
+
image.page_number,
|
|
721
|
+
Some(1),
|
|
722
|
+
"GitHub Issue #329: Image on slide 1 should have page_number=1, but got {:?}. \
|
|
723
|
+
The page numbers are reversed!",
|
|
724
|
+
image.page_number
|
|
725
|
+
);
|
|
726
|
+
|
|
727
|
+
println!("✅ PPTX image page numbers are correct!");
|
|
728
|
+
println!(" Image on slide 1 has page_number={:?}", image.page_number);
|
|
729
|
+
}
|
|
730
|
+
Err(e) => {
|
|
731
|
+
panic!("PPTX extraction failed: {:?}", e);
|
|
732
|
+
}
|
|
733
|
+
}
|
|
734
|
+
}
|
|
735
|
+
|
|
736
|
+
/// Test with actual user-provided PPTX file from GitHub Issue #329.
|
|
737
|
+
///
|
|
738
|
+
/// The user's file has slides listed in reverse order in presentation.xml.rels,
|
|
739
|
+
/// which caused images to have incorrect page numbers.
|
|
740
|
+
#[tokio::test]
|
|
741
|
+
async fn test_pptx_image_page_numbers_issue329_user_file() {
|
|
742
|
+
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
743
|
+
.parent()
|
|
744
|
+
.expect("Operation failed")
|
|
745
|
+
.parent()
|
|
746
|
+
.expect("Operation failed");
|
|
747
|
+
let test_file = workspace_root.join("test_documents/presentations/pptx_reversed_slide_order_issue329.pptx");
|
|
748
|
+
|
|
749
|
+
if !test_file.exists() {
|
|
750
|
+
println!("Skipping test: User file not found at {:?}", test_file);
|
|
751
|
+
return;
|
|
752
|
+
}
|
|
753
|
+
|
|
754
|
+
// Extract with images enabled
|
|
755
|
+
let config = ExtractionConfig {
|
|
756
|
+
images: Some(ImageExtractionConfig {
|
|
757
|
+
extract_images: true,
|
|
758
|
+
target_dpi: 300,
|
|
759
|
+
max_image_dimension: 4096,
|
|
760
|
+
auto_adjust_dpi: true,
|
|
761
|
+
min_dpi: 72,
|
|
762
|
+
max_dpi: 600,
|
|
763
|
+
}),
|
|
764
|
+
..Default::default()
|
|
765
|
+
};
|
|
766
|
+
|
|
767
|
+
let result = extract_file(&test_file, None, &config).await;
|
|
768
|
+
|
|
769
|
+
match result {
|
|
770
|
+
Ok(extraction) => {
|
|
771
|
+
// The user's file has an image on slide 1
|
|
772
|
+
let images = extraction.images.as_ref().expect("Images should be extracted");
|
|
773
|
+
|
|
774
|
+
if images.is_empty() {
|
|
775
|
+
println!("No images extracted from user file (may not have embedded images)");
|
|
776
|
+
return;
|
|
777
|
+
}
|
|
778
|
+
|
|
779
|
+
// All images should have page_number = 1 since they're on the first slide
|
|
780
|
+
for (idx, image) in images.iter().enumerate() {
|
|
781
|
+
assert_eq!(
|
|
782
|
+
image.page_number,
|
|
783
|
+
Some(1),
|
|
784
|
+
"GitHub Issue #329: Image {} should have page_number=1, but got {:?}",
|
|
785
|
+
idx,
|
|
786
|
+
image.page_number
|
|
787
|
+
);
|
|
788
|
+
}
|
|
789
|
+
|
|
790
|
+
println!("✅ User file from Issue #329 - image page numbers correct!");
|
|
791
|
+
println!(" Found {} images, all with page_number=1", images.len());
|
|
792
|
+
}
|
|
793
|
+
Err(e) => {
|
|
794
|
+
panic!("Failed to extract user file: {:?}", e);
|
|
795
|
+
}
|
|
796
|
+
}
|
|
797
|
+
}
|
|
@@ -88,3 +88,59 @@ fn test_xlsx_minimal_metadata_extraction() {
|
|
|
88
88
|
|
|
89
89
|
println!("✅ XLSX minimal metadata extraction test passed!");
|
|
90
90
|
}
|
|
91
|
+
|
|
92
|
+
/// Test for issue #331: OOM with XLSX files containing Excel Solver add-in data
|
|
93
|
+
///
|
|
94
|
+
/// This test reproduces the issue where Excel Solver stores configuration data
|
|
95
|
+
/// in cells at extreme positions (XFD1048550-1048575 = column 16384, rows near 1M).
|
|
96
|
+
/// The sheet dimension is set to "A1:XFD1048575", which could cause Kreuzberg
|
|
97
|
+
/// to attempt allocating memory for ~17 billion cells (16384 × 1048575).
|
|
98
|
+
///
|
|
99
|
+
/// Expected behavior: Should handle extreme dimensions gracefully without OOM.
|
|
100
|
+
/// The file is only 6.8KB and contains minimal actual data.
|
|
101
|
+
#[test]
|
|
102
|
+
fn test_xlsx_excel_solver_extreme_dimensions_no_oom() {
|
|
103
|
+
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
104
|
+
.parent()
|
|
105
|
+
.expect("Operation failed")
|
|
106
|
+
.parent()
|
|
107
|
+
.expect("Operation failed");
|
|
108
|
+
let test_file = workspace_root.join("tests/fixtures/xlsx-oom-repro/kreuzberg-oom-repro.xlsx");
|
|
109
|
+
|
|
110
|
+
if !test_file.exists() {
|
|
111
|
+
println!("Skipping test: Test file not found at {:?}", test_file);
|
|
112
|
+
println!("Run: node tests/fixtures/xlsx-oom-repro/generate-oom-xlsx.mjs");
|
|
113
|
+
return;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
let file_path = test_file.to_str().expect("File path should be valid UTF-8");
|
|
117
|
+
|
|
118
|
+
// This should NOT cause OOM even though dimension claims A1:XFD1048575
|
|
119
|
+
// The actual data is minimal (only ~26 cells with Solver metadata)
|
|
120
|
+
let result = read_excel_file(file_path).expect("Should extract XLSX with extreme dimensions without OOM");
|
|
121
|
+
|
|
122
|
+
// Verify we got the actual data, not a massive allocation
|
|
123
|
+
assert!(!result.sheets.is_empty(), "Should have at least one sheet");
|
|
124
|
+
|
|
125
|
+
// The file has normal cells A1, B1 plus Solver cells at extreme positions
|
|
126
|
+
// Verify we extracted something reasonable, not 17 billion cells
|
|
127
|
+
let sheet = &result.sheets[0];
|
|
128
|
+
assert!(
|
|
129
|
+
sheet.markdown.len() < 10000,
|
|
130
|
+
"Sheet markdown content should be small (< 10000 chars), not massive. Got {} chars",
|
|
131
|
+
sheet.markdown.len()
|
|
132
|
+
);
|
|
133
|
+
|
|
134
|
+
// Verify metadata was extracted
|
|
135
|
+
assert!(
|
|
136
|
+
result.metadata.contains_key("sheet_count"),
|
|
137
|
+
"Should have sheet_count metadata"
|
|
138
|
+
);
|
|
139
|
+
|
|
140
|
+
println!("✅ XLSX Excel Solver extreme dimensions test passed!");
|
|
141
|
+
println!(
|
|
142
|
+
" Sheet markdown length: {} chars (reasonable size)",
|
|
143
|
+
sheet.markdown.len()
|
|
144
|
+
);
|
|
145
|
+
println!(" Successfully handled dimension A1:XFD1048575 without OOM");
|
|
146
|
+
}
|
|
@@ -223,7 +223,7 @@ typedef struct CErrorDetails {
|
|
|
223
223
|
* # Memory Layout
|
|
224
224
|
*
|
|
225
225
|
* Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
|
|
226
|
-
* Field order:
|
|
226
|
+
* Field order: 13 pointers (8 bytes each) + 1 bool + 7 bytes padding = 112 bytes total
|
|
227
227
|
*
|
|
228
228
|
* The `#[repr(C)]` attribute ensures the struct follows C's memory layout rules:
|
|
229
229
|
* - Fields are laid out in order
|
|
@@ -284,6 +284,10 @@ typedef struct CExtractionResult {
|
|
|
284
284
|
* Per-page content as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
285
285
|
*/
|
|
286
286
|
char *pages_json;
|
|
287
|
+
/**
|
|
288
|
+
* Semantic elements as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
289
|
+
*/
|
|
290
|
+
char *elements_json;
|
|
287
291
|
/**
|
|
288
292
|
* Whether extraction was successful
|
|
289
293
|
*/
|
|
@@ -1608,7 +1612,7 @@ char *kreuzberg_clone_string(const char *s);
|
|
|
1608
1612
|
*
|
|
1609
1613
|
* # Memory Layout
|
|
1610
1614
|
*
|
|
1611
|
-
* This function frees all
|
|
1615
|
+
* This function frees all 13 string fields in CExtractionResult:
|
|
1612
1616
|
* 1. content
|
|
1613
1617
|
* 2. mime_type
|
|
1614
1618
|
* 3. language
|
|
@@ -1621,6 +1625,7 @@ char *kreuzberg_clone_string(const char *s);
|
|
|
1621
1625
|
* 10. images_json
|
|
1622
1626
|
* 11. page_structure_json (FIXED: was missing before PR #3)
|
|
1623
1627
|
* 12. pages_json (FIXED: was missing before PR #3)
|
|
1628
|
+
* 13. elements_json (ADDED: for element-based extraction support)
|
|
1624
1629
|
*
|
|
1625
1630
|
* # Example (C)
|
|
1626
1631
|
*
|
|
@@ -67,7 +67,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
|
|
|
67
67
|
images,
|
|
68
68
|
pages,
|
|
69
69
|
djot_content: _,
|
|
70
|
-
elements
|
|
70
|
+
elements,
|
|
71
71
|
} = result;
|
|
72
72
|
|
|
73
73
|
let sanitized_content = if content.contains('\0') {
|
|
@@ -179,6 +179,17 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
|
|
|
179
179
|
_ => None,
|
|
180
180
|
};
|
|
181
181
|
|
|
182
|
+
let elements_json_guard = match elements {
|
|
183
|
+
Some(elements) if !elements.is_empty() => {
|
|
184
|
+
let json =
|
|
185
|
+
serde_json::to_string(&elements).map_err(|e| format!("Failed to serialize elements to JSON: {}", e))?;
|
|
186
|
+
Some(CStringGuard::new(CString::new(json).map_err(|e| {
|
|
187
|
+
format!("Failed to convert elements JSON to C string: {}", e)
|
|
188
|
+
})?))
|
|
189
|
+
}
|
|
190
|
+
_ => None,
|
|
191
|
+
};
|
|
192
|
+
|
|
182
193
|
Ok(Box::into_raw(Box::new(CExtractionResult {
|
|
183
194
|
content: content_guard.into_raw(),
|
|
184
195
|
mime_type: mime_type_guard.into_raw(),
|
|
@@ -192,6 +203,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
|
|
|
192
203
|
images_json: images_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
193
204
|
page_structure_json: page_structure_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
194
205
|
pages_json: pages_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
206
|
+
elements_json: elements_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
195
207
|
success: true,
|
|
196
208
|
_padding1: [0u8; 7],
|
|
197
209
|
})))
|
|
@@ -134,8 +134,8 @@ mod tests {
|
|
|
134
134
|
// Test size
|
|
135
135
|
assert_eq!(
|
|
136
136
|
std::mem::size_of::<CExtractionResult>(),
|
|
137
|
-
|
|
138
|
-
"CExtractionResult must be exactly
|
|
137
|
+
112,
|
|
138
|
+
"CExtractionResult must be exactly 112 bytes"
|
|
139
139
|
);
|
|
140
140
|
|
|
141
141
|
// Test alignment
|
|
@@ -197,6 +197,7 @@ mod tests {
|
|
|
197
197
|
images_json: ptr::null_mut(),
|
|
198
198
|
page_structure_json: ptr::null_mut(),
|
|
199
199
|
pages_json: ptr::null_mut(),
|
|
200
|
+
elements_json: ptr::null_mut(),
|
|
200
201
|
success: true,
|
|
201
202
|
_padding1: [0u8; 7],
|
|
202
203
|
}))
|
|
@@ -510,6 +511,7 @@ mod tests {
|
|
|
510
511
|
images_json: ptr::null_mut(),
|
|
511
512
|
page_structure_json: ptr::null_mut(),
|
|
512
513
|
pages_json: ptr::null_mut(),
|
|
514
|
+
elements_json: ptr::null_mut(),
|
|
513
515
|
success: true,
|
|
514
516
|
_padding1: [0u8; 7],
|
|
515
517
|
}));
|
|
@@ -522,7 +524,7 @@ mod tests {
|
|
|
522
524
|
#[test]
|
|
523
525
|
fn test_extraction_result_free_all_fields_allocated() {
|
|
524
526
|
unsafe {
|
|
525
|
-
// Test freeing a result where ALL
|
|
527
|
+
// Test freeing a result where ALL 13 string fields are allocated
|
|
526
528
|
// This verifies that kreuzberg_free_result properly frees all fields
|
|
527
529
|
let result = Box::into_raw(Box::new(CExtractionResult {
|
|
528
530
|
content: CString::new("test content").unwrap().into_raw(),
|
|
@@ -537,11 +539,12 @@ mod tests {
|
|
|
537
539
|
images_json: CString::new("[{\"data\":\"base64\"}]").unwrap().into_raw(),
|
|
538
540
|
page_structure_json: CString::new("{\"pages\":1}").unwrap().into_raw(),
|
|
539
541
|
pages_json: CString::new("[{\"page\":1,\"content\":\"test\"}]").unwrap().into_raw(),
|
|
542
|
+
elements_json: CString::new("[]").unwrap().into_raw(),
|
|
540
543
|
success: true,
|
|
541
544
|
_padding1: [0u8; 7],
|
|
542
545
|
}));
|
|
543
546
|
|
|
544
|
-
// Should properly free all
|
|
547
|
+
// Should properly free all 13 allocated string fields without leaking memory
|
|
545
548
|
kreuzberg_free_result(result);
|
|
546
549
|
}
|
|
547
550
|
}
|
|
@@ -621,7 +624,7 @@ mod tests {
|
|
|
621
624
|
/// Test CExtractionResult size exactly matches FFI contract
|
|
622
625
|
#[test]
|
|
623
626
|
fn test_c_extraction_result_size() {
|
|
624
|
-
assert_eq!(std::mem::size_of::<CExtractionResult>(),
|
|
627
|
+
assert_eq!(std::mem::size_of::<CExtractionResult>(), 112);
|
|
625
628
|
assert_eq!(std::mem::align_of::<CExtractionResult>(), 8);
|
|
626
629
|
}
|
|
627
630
|
|
|
@@ -146,7 +146,7 @@ pub unsafe extern "C" fn kreuzberg_clone_string(s: *const c_char) -> *mut c_char
|
|
|
146
146
|
///
|
|
147
147
|
/// # Memory Layout
|
|
148
148
|
///
|
|
149
|
-
/// This function frees all
|
|
149
|
+
/// This function frees all 13 string fields in CExtractionResult:
|
|
150
150
|
/// 1. content
|
|
151
151
|
/// 2. mime_type
|
|
152
152
|
/// 3. language
|
|
@@ -159,6 +159,7 @@ pub unsafe extern "C" fn kreuzberg_clone_string(s: *const c_char) -> *mut c_char
|
|
|
159
159
|
/// 10. images_json
|
|
160
160
|
/// 11. page_structure_json (FIXED: was missing before PR #3)
|
|
161
161
|
/// 12. pages_json (FIXED: was missing before PR #3)
|
|
162
|
+
/// 13. elements_json (ADDED: for element-based extraction support)
|
|
162
163
|
///
|
|
163
164
|
/// # Example (C)
|
|
164
165
|
///
|
|
@@ -209,6 +210,9 @@ pub unsafe extern "C" fn kreuzberg_free_result(result: *mut CExtractionResult) {
|
|
|
209
210
|
if !result_box.pages_json.is_null() {
|
|
210
211
|
unsafe { drop(CString::from_raw(result_box.pages_json)) };
|
|
211
212
|
}
|
|
213
|
+
if !result_box.elements_json.is_null() {
|
|
214
|
+
unsafe { drop(CString::from_raw(result_box.elements_json)) };
|
|
215
|
+
}
|
|
212
216
|
}
|
|
213
217
|
}
|
|
214
218
|
|
|
@@ -232,6 +236,7 @@ mod tests {
|
|
|
232
236
|
images_json: CString::new("[]").unwrap().into_raw(),
|
|
233
237
|
page_structure_json: CString::new("{}").unwrap().into_raw(),
|
|
234
238
|
pages_json: CString::new("[]").unwrap().into_raw(),
|
|
239
|
+
elements_json: CString::new("[]").unwrap().into_raw(),
|
|
235
240
|
success: true,
|
|
236
241
|
_padding1: [0u8; 7],
|
|
237
242
|
}))
|
|
@@ -252,6 +257,7 @@ mod tests {
|
|
|
252
257
|
images_json: ptr::null_mut(),
|
|
253
258
|
page_structure_json: ptr::null_mut(),
|
|
254
259
|
pages_json: ptr::null_mut(),
|
|
260
|
+
elements_json: ptr::null_mut(),
|
|
255
261
|
success: true,
|
|
256
262
|
_padding1: [0u8; 7],
|
|
257
263
|
}))
|
|
@@ -343,6 +349,34 @@ mod tests {
|
|
|
343
349
|
images_json: ptr::null_mut(),
|
|
344
350
|
page_structure_json: CString::new("{\"pages\": []}").unwrap().into_raw(),
|
|
345
351
|
pages_json: CString::new("[{\"content\": \"page 1\"}]").unwrap().into_raw(),
|
|
352
|
+
elements_json: ptr::null_mut(),
|
|
353
|
+
success: true,
|
|
354
|
+
_padding1: [0u8; 7],
|
|
355
|
+
}));
|
|
356
|
+
|
|
357
|
+
unsafe { kreuzberg_free_result(result) };
|
|
358
|
+
// If we get here without crashing or leaking, the test passed
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
#[test]
|
|
362
|
+
fn test_free_result_elements_json() {
|
|
363
|
+
// Test: ensure elements_json is freed
|
|
364
|
+
let result = Box::into_raw(Box::new(CExtractionResult {
|
|
365
|
+
content: CString::new("test").unwrap().into_raw(),
|
|
366
|
+
mime_type: CString::new("text/plain").unwrap().into_raw(),
|
|
367
|
+
language: ptr::null_mut(),
|
|
368
|
+
date: ptr::null_mut(),
|
|
369
|
+
subject: ptr::null_mut(),
|
|
370
|
+
tables_json: ptr::null_mut(),
|
|
371
|
+
detected_languages_json: ptr::null_mut(),
|
|
372
|
+
metadata_json: ptr::null_mut(),
|
|
373
|
+
chunks_json: ptr::null_mut(),
|
|
374
|
+
images_json: ptr::null_mut(),
|
|
375
|
+
page_structure_json: ptr::null_mut(),
|
|
376
|
+
pages_json: ptr::null_mut(),
|
|
377
|
+
elements_json: CString::new(r#"[{"element_id":"abc","element_type":"title","text":"Hello"}]"#)
|
|
378
|
+
.unwrap()
|
|
379
|
+
.into_raw(),
|
|
346
380
|
success: true,
|
|
347
381
|
_padding1: [0u8; 7],
|
|
348
382
|
}));
|
|
@@ -51,7 +51,7 @@ impl Drop for CStringGuard {
|
|
|
51
51
|
/// # Memory Layout
|
|
52
52
|
///
|
|
53
53
|
/// Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
|
|
54
|
-
/// Field order:
|
|
54
|
+
/// Field order: 13 pointers (8 bytes each) + 1 bool + 7 bytes padding = 112 bytes total
|
|
55
55
|
///
|
|
56
56
|
/// The `#[repr(C)]` attribute ensures the struct follows C's memory layout rules:
|
|
57
57
|
/// - Fields are laid out in order
|
|
@@ -88,6 +88,8 @@ pub struct CExtractionResult {
|
|
|
88
88
|
pub page_structure_json: *mut c_char,
|
|
89
89
|
/// Per-page content as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
90
90
|
pub pages_json: *mut c_char,
|
|
91
|
+
/// Semantic elements as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
92
|
+
pub elements_json: *mut c_char,
|
|
91
93
|
/// Whether extraction was successful
|
|
92
94
|
pub success: bool,
|
|
93
95
|
/// Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
|
|
@@ -150,7 +152,7 @@ pub struct CBatchResult {
|
|
|
150
152
|
const _: () = {
|
|
151
153
|
const fn assert_c_extraction_result_size() {
|
|
152
154
|
const SIZE: usize = std::mem::size_of::<CExtractionResult>();
|
|
153
|
-
const _: () = assert!(SIZE ==
|
|
155
|
+
const _: () = assert!(SIZE == 112, "CExtractionResult size must be 112 bytes");
|
|
154
156
|
}
|
|
155
157
|
|
|
156
158
|
const fn assert_c_extraction_result_alignment() {
|
|
@@ -195,8 +197,8 @@ mod tests {
|
|
|
195
197
|
fn test_c_extraction_result_size() {
|
|
196
198
|
assert_eq!(
|
|
197
199
|
std::mem::size_of::<CExtractionResult>(),
|
|
198
|
-
|
|
199
|
-
"CExtractionResult must be exactly
|
|
200
|
+
112,
|
|
201
|
+
"CExtractionResult must be exactly 112 bytes"
|
|
200
202
|
);
|
|
201
203
|
}
|
|
202
204
|
|
|
@@ -327,7 +329,8 @@ mod tests {
|
|
|
327
329
|
assert_eq!(offset_of!(CExtractionResult, images_json), 72);
|
|
328
330
|
assert_eq!(offset_of!(CExtractionResult, page_structure_json), 80);
|
|
329
331
|
assert_eq!(offset_of!(CExtractionResult, pages_json), 88);
|
|
330
|
-
assert_eq!(offset_of!(CExtractionResult,
|
|
332
|
+
assert_eq!(offset_of!(CExtractionResult, elements_json), 96);
|
|
333
|
+
assert_eq!(offset_of!(CExtractionResult, success), 104);
|
|
331
334
|
}
|
|
332
335
|
|
|
333
336
|
/// Verify field offsets in CBatchResult match expectations
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.2.0
|
|
4
|
+
version: 4.2.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-01-
|
|
11
|
+
date: 2026-01-28 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -233,6 +233,7 @@ files:
|
|
|
233
233
|
- lib/kreuzberg/cli.rb
|
|
234
234
|
- lib/kreuzberg/cli_proxy.rb
|
|
235
235
|
- lib/kreuzberg/config.rb
|
|
236
|
+
- lib/kreuzberg/djot_content.rb
|
|
236
237
|
- lib/kreuzberg/error_context.rb
|
|
237
238
|
- lib/kreuzberg/errors.rb
|
|
238
239
|
- lib/kreuzberg/extraction_api.rb
|
|
@@ -362,6 +363,7 @@ files:
|
|
|
362
363
|
- vendor/kreuzberg/src/api/error.rs
|
|
363
364
|
- vendor/kreuzberg/src/api/handlers.rs
|
|
364
365
|
- vendor/kreuzberg/src/api/mod.rs
|
|
366
|
+
- vendor/kreuzberg/src/api/openapi.rs
|
|
365
367
|
- vendor/kreuzberg/src/api/router.rs
|
|
366
368
|
- vendor/kreuzberg/src/api/startup.rs
|
|
367
369
|
- vendor/kreuzberg/src/api/types.rs
|
|
@@ -591,6 +593,7 @@ files:
|
|
|
591
593
|
- vendor/kreuzberg/src/plugins/registry/ocr.rs
|
|
592
594
|
- vendor/kreuzberg/src/plugins/registry/processor.rs
|
|
593
595
|
- vendor/kreuzberg/src/plugins/registry/validator.rs
|
|
596
|
+
- vendor/kreuzberg/src/plugins/startup_validation.rs
|
|
594
597
|
- vendor/kreuzberg/src/plugins/traits.rs
|
|
595
598
|
- vendor/kreuzberg/src/plugins/validator/mod.rs
|
|
596
599
|
- vendor/kreuzberg/src/plugins/validator/registry.rs
|