kreuzberg 4.2.0 → 4.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
  5. data/lib/kreuzberg/cli.rb +16 -6
  6. data/lib/kreuzberg/cli_proxy.rb +3 -1
  7. data/lib/kreuzberg/config.rb +59 -28
  8. data/lib/kreuzberg/djot_content.rb +225 -0
  9. data/lib/kreuzberg/extraction_api.rb +20 -4
  10. data/lib/kreuzberg/result.rb +12 -2
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/lib/kreuzberg.rb +1 -0
  13. data/sig/kreuzberg.rbs +23 -11
  14. data/spec/binding/batch_spec.rb +6 -5
  15. data/spec/binding/config_spec.rb +1 -1
  16. data/spec/binding/error_recovery_spec.rb +3 -3
  17. data/spec/binding/tables_spec.rb +11 -2
  18. data/spec/unit/config/extraction_config_spec.rb +2 -2
  19. data/spec/unit/config/output_format_spec.rb +18 -18
  20. data/vendor/Cargo.toml +1 -1
  21. data/vendor/kreuzberg/Cargo.toml +3 -2
  22. data/vendor/kreuzberg/README.md +1 -1
  23. data/vendor/kreuzberg/src/api/error.rs +60 -0
  24. data/vendor/kreuzberg/src/api/handlers.rs +153 -32
  25. data/vendor/kreuzberg/src/api/mod.rs +2 -0
  26. data/vendor/kreuzberg/src/api/openapi.rs +141 -0
  27. data/vendor/kreuzberg/src/api/router.rs +24 -2
  28. data/vendor/kreuzberg/src/api/startup.rs +21 -1
  29. data/vendor/kreuzberg/src/api/types.rs +50 -4
  30. data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
  31. data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
  32. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
  33. data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
  34. data/vendor/kreuzberg/src/core/io.rs +7 -7
  35. data/vendor/kreuzberg/src/core/mime.rs +4 -4
  36. data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
  37. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
  38. data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
  39. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
  40. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
  41. data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
  42. data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
  43. data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
  44. data/vendor/kreuzberg/tests/config_behavioral.rs +14 -12
  45. data/vendor/kreuzberg/tests/core_integration.rs +2 -4
  46. data/vendor/kreuzberg/tests/mime_detection.rs +3 -2
  47. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +284 -1
  48. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
  49. data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
  50. data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
  51. data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
  52. data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
  53. data/vendor/kreuzberg-ffi/src/types.rs +8 -5
  54. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  55. metadata +5 -2
@@ -1,13 +1,15 @@
1
1
  //! Regression tests for PPTX/PPSX extraction bugs
2
2
  //!
3
3
  //! GitHub Issue #321: PPTX extraction fails on shapes without txBody (image placeholders) + PPSX not supported
4
+ //! GitHub Issue #329: Extracting images from PPTX results in reversed page numbers
4
5
  //!
5
6
  //! Bug 1: "No txBody found" - PPTX extraction fails when any shape lacks a text body
6
7
  //! Bug 2: PPSX not supported - PowerPoint Show files rejected entirely
8
+ //! Bug 3: Image page numbers reversed - image on slide 1 reports page_number=2
7
9
 
8
10
  #![cfg(feature = "office")]
9
11
 
10
- use kreuzberg::{ExtractionConfig, extract_file};
12
+ use kreuzberg::{ExtractionConfig, ImageExtractionConfig, extract_file};
11
13
  use std::io::Write;
12
14
  use tempfile::NamedTempFile;
13
15
  use zip::CompressionMethod;
@@ -512,3 +514,284 @@ async fn test_pptx_mixed_shapes_extraction() {
512
514
  }
513
515
  }
514
516
  }
517
+
518
+ /// Test that images extracted from PPTX have correct page numbers.
519
+ ///
520
+ /// When a PPTX has multiple slides and an image on slide 1, the extracted image
521
+ /// should have page_number=1 (not reversed).
522
+ ///
523
+ /// GitHub Issue #329: Image on slide 1 of 2-slide PPTX reports page_number=2
524
+ #[tokio::test]
525
+ async fn test_pptx_image_page_numbers_not_reversed() {
526
+ // Create a PPTX with 2 slides, image on slide 1
527
+ let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
528
+
529
+ // A minimal 1x1 red PNG image (valid PNG format)
530
+ let png_image: &[u8] = &[
531
+ 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG signature
532
+ 0x00, 0x00, 0x00, 0x0D, // IHDR chunk length
533
+ 0x49, 0x48, 0x44, 0x52, // "IHDR"
534
+ 0x00, 0x00, 0x00, 0x01, // width: 1
535
+ 0x00, 0x00, 0x00, 0x01, // height: 1
536
+ 0x08, 0x02, // bit depth: 8, color type: RGB
537
+ 0x00, 0x00, 0x00, // compression, filter, interlace
538
+ 0x90, 0x77, 0x53, 0xDE, // IHDR CRC
539
+ 0x00, 0x00, 0x00, 0x0C, // IDAT chunk length
540
+ 0x49, 0x44, 0x41, 0x54, // "IDAT"
541
+ 0x08, 0xD7, 0x63, 0xF8, 0xCF, 0xC0, 0x00, 0x00, // compressed data
542
+ 0x01, 0x01, 0x01, 0x00, // checksum
543
+ 0x18, 0xDD, 0x8D, 0xB4, // IDAT CRC
544
+ 0x00, 0x00, 0x00, 0x00, // IEND chunk length
545
+ 0x49, 0x45, 0x4E, 0x44, // "IEND"
546
+ 0xAE, 0x42, 0x60, 0x82, // IEND CRC
547
+ ];
548
+
549
+ {
550
+ let mut zip = ZipWriter::new(&mut temp_file);
551
+ let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
552
+
553
+ // Add [Content_Types].xml
554
+ zip.start_file("[Content_Types].xml", options)
555
+ .expect("Operation failed");
556
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
557
+ <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
558
+ <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
559
+ <Default Extension="xml" ContentType="application/xml"/>
560
+ <Default Extension="png" ContentType="image/png"/>
561
+ <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
562
+ <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
563
+ <Override PartName="/ppt/slides/slide2.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
564
+ </Types>"#).expect("Operation failed");
565
+
566
+ // Add _rels/.rels
567
+ zip.start_file("_rels/.rels", options).expect("Operation failed");
568
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
569
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
570
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
571
+ </Relationships>"#).expect("Operation failed");
572
+
573
+ // Add ppt/presentation.xml
574
+ zip.start_file("ppt/presentation.xml", options)
575
+ .expect("Operation failed");
576
+ zip.write_all(
577
+ br#"<?xml version="1.0" encoding="UTF-8"?>
578
+ <p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
579
+ xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
580
+ <p:sldIdLst>
581
+ <p:sldId id="256" r:id="rId2"/>
582
+ <p:sldId id="257" r:id="rId3"/>
583
+ </p:sldIdLst>
584
+ </p:presentation>"#,
585
+ )
586
+ .expect("Operation failed");
587
+
588
+ // Add ppt/_rels/presentation.xml.rels
589
+ // BUG REPRODUCTION: Slides listed in REVERSE order in XML (slide2 before slide1)
590
+ // This is valid XML - PowerPoint doesn't guarantee order in rels files
591
+ // GitHub Issue #329: This causes page numbers to be reversed
592
+ zip.start_file("ppt/_rels/presentation.xml.rels", options)
593
+ .expect("Operation failed");
594
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
595
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
596
+ <Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide2.xml"/>
597
+ <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
598
+ </Relationships>"#).expect("Operation failed");
599
+
600
+ // Add the image file
601
+ zip.start_file("ppt/media/image1.png", options)
602
+ .expect("Operation failed");
603
+ zip.write_all(png_image).expect("Operation failed");
604
+
605
+ // Add slide 1 WITH an image
606
+ zip.start_file("ppt/slides/slide1.xml", options)
607
+ .expect("Operation failed");
608
+ zip.write_all(
609
+ br#"<?xml version="1.0" encoding="UTF-8"?>
610
+ <p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
611
+ xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
612
+ xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
613
+ <p:cSld>
614
+ <p:spTree>
615
+ <p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
616
+ <p:grpSpPr/>
617
+ <p:sp>
618
+ <p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
619
+ <p:spPr/>
620
+ <p:txBody><a:bodyPr/><a:lstStyle/><a:p><a:r><a:t>Slide 1 - Has Image</a:t></a:r></a:p></p:txBody>
621
+ </p:sp>
622
+ <p:pic>
623
+ <p:nvPicPr>
624
+ <p:cNvPr id="3" name="Picture 1"/>
625
+ <p:cNvPicPr><a:picLocks noChangeAspect="1"/></p:cNvPicPr>
626
+ <p:nvPr/>
627
+ </p:nvPicPr>
628
+ <p:blipFill>
629
+ <a:blip r:embed="rId2"/>
630
+ <a:stretch><a:fillRect/></a:stretch>
631
+ </p:blipFill>
632
+ <p:spPr>
633
+ <a:xfrm><a:off x="0" y="0"/><a:ext cx="100000" cy="100000"/></a:xfrm>
634
+ <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
635
+ </p:spPr>
636
+ </p:pic>
637
+ </p:spTree>
638
+ </p:cSld>
639
+ </p:sld>"#,
640
+ )
641
+ .expect("Operation failed");
642
+
643
+ // Add slide 1 relationships (points to the image)
644
+ zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
645
+ .expect("Operation failed");
646
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
647
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
648
+ <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image1.png"/>
649
+ </Relationships>"#).expect("Operation failed");
650
+
651
+ // Add slide 2 WITHOUT an image
652
+ zip.start_file("ppt/slides/slide2.xml", options)
653
+ .expect("Operation failed");
654
+ zip.write_all(
655
+ br#"<?xml version="1.0" encoding="UTF-8"?>
656
+ <p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
657
+ xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
658
+ xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
659
+ <p:cSld>
660
+ <p:spTree>
661
+ <p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
662
+ <p:grpSpPr/>
663
+ <p:sp>
664
+ <p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
665
+ <p:spPr/>
666
+ <p:txBody><a:bodyPr/><a:lstStyle/><a:p><a:r><a:t>Slide 2 - No Image</a:t></a:r></a:p></p:txBody>
667
+ </p:sp>
668
+ </p:spTree>
669
+ </p:cSld>
670
+ </p:sld>"#,
671
+ )
672
+ .expect("Operation failed");
673
+
674
+ // Add empty slide 2 relationships
675
+ zip.start_file("ppt/slides/_rels/slide2.xml.rels", options)
676
+ .expect("Operation failed");
677
+ zip.write_all(
678
+ br#"<?xml version="1.0" encoding="UTF-8"?>
679
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
680
+ </Relationships>"#,
681
+ )
682
+ .expect("Operation failed");
683
+
684
+ zip.finish().expect("Operation failed");
685
+ }
686
+
687
+ // Extract with images enabled
688
+ let config = ExtractionConfig {
689
+ images: Some(ImageExtractionConfig {
690
+ extract_images: true,
691
+ target_dpi: 300,
692
+ max_image_dimension: 4096,
693
+ auto_adjust_dpi: true,
694
+ min_dpi: 72,
695
+ max_dpi: 600,
696
+ }),
697
+ ..Default::default()
698
+ };
699
+
700
+ let result = extract_file(
701
+ temp_file.path(),
702
+ Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
703
+ &config,
704
+ )
705
+ .await;
706
+
707
+ match result {
708
+ Ok(extraction) => {
709
+ // Verify text extraction works
710
+ assert!(extraction.content.contains("Slide 1"), "Should extract slide 1 text");
711
+ assert!(extraction.content.contains("Slide 2"), "Should extract slide 2 text");
712
+
713
+ // Verify we got an image
714
+ let images = extraction.images.as_ref().expect("Images should be present");
715
+ assert!(!images.is_empty(), "Should extract at least one image");
716
+
717
+ // THE CRITICAL TEST: Image on slide 1 should have page_number=1, NOT 2
718
+ let image = &images[0];
719
+ assert_eq!(
720
+ image.page_number,
721
+ Some(1),
722
+ "GitHub Issue #329: Image on slide 1 should have page_number=1, but got {:?}. \
723
+ The page numbers are reversed!",
724
+ image.page_number
725
+ );
726
+
727
+ println!("✅ PPTX image page numbers are correct!");
728
+ println!(" Image on slide 1 has page_number={:?}", image.page_number);
729
+ }
730
+ Err(e) => {
731
+ panic!("PPTX extraction failed: {:?}", e);
732
+ }
733
+ }
734
+ }
735
+
736
+ /// Test with actual user-provided PPTX file from GitHub Issue #329.
737
+ ///
738
+ /// The user's file has slides listed in reverse order in presentation.xml.rels,
739
+ /// which caused images to have incorrect page numbers.
740
+ #[tokio::test]
741
+ async fn test_pptx_image_page_numbers_issue329_user_file() {
742
+ let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
743
+ .parent()
744
+ .expect("Operation failed")
745
+ .parent()
746
+ .expect("Operation failed");
747
+ let test_file = workspace_root.join("test_documents/presentations/pptx_reversed_slide_order_issue329.pptx");
748
+
749
+ if !test_file.exists() {
750
+ println!("Skipping test: User file not found at {:?}", test_file);
751
+ return;
752
+ }
753
+
754
+ // Extract with images enabled
755
+ let config = ExtractionConfig {
756
+ images: Some(ImageExtractionConfig {
757
+ extract_images: true,
758
+ target_dpi: 300,
759
+ max_image_dimension: 4096,
760
+ auto_adjust_dpi: true,
761
+ min_dpi: 72,
762
+ max_dpi: 600,
763
+ }),
764
+ ..Default::default()
765
+ };
766
+
767
+ let result = extract_file(&test_file, None, &config).await;
768
+
769
+ match result {
770
+ Ok(extraction) => {
771
+ // The user's file has an image on slide 1
772
+ let images = extraction.images.as_ref().expect("Images should be extracted");
773
+
774
+ if images.is_empty() {
775
+ println!("No images extracted from user file (may not have embedded images)");
776
+ return;
777
+ }
778
+
779
+ // All images should have page_number = 1 since they're on the first slide
780
+ for (idx, image) in images.iter().enumerate() {
781
+ assert_eq!(
782
+ image.page_number,
783
+ Some(1),
784
+ "GitHub Issue #329: Image {} should have page_number=1, but got {:?}",
785
+ idx,
786
+ image.page_number
787
+ );
788
+ }
789
+
790
+ println!("✅ User file from Issue #329 - image page numbers correct!");
791
+ println!(" Found {} images, all with page_number=1", images.len());
792
+ }
793
+ Err(e) => {
794
+ panic!("Failed to extract user file: {:?}", e);
795
+ }
796
+ }
797
+ }
@@ -88,3 +88,59 @@ fn test_xlsx_minimal_metadata_extraction() {
88
88
 
89
89
  println!("✅ XLSX minimal metadata extraction test passed!");
90
90
  }
91
+
92
+ /// Test for issue #331: OOM with XLSX files containing Excel Solver add-in data
93
+ ///
94
+ /// This test reproduces the issue where Excel Solver stores configuration data
95
+ /// in cells at extreme positions (XFD1048550-1048575 = column 16384, rows near 1M).
96
+ /// The sheet dimension is set to "A1:XFD1048575", which could cause Kreuzberg
97
+ /// to attempt allocating memory for ~17 billion cells (16384 × 1048575).
98
+ ///
99
+ /// Expected behavior: Should handle extreme dimensions gracefully without OOM.
100
+ /// The file is only 6.8KB and contains minimal actual data.
101
+ #[test]
102
+ fn test_xlsx_excel_solver_extreme_dimensions_no_oom() {
103
+ let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
104
+ .parent()
105
+ .expect("Operation failed")
106
+ .parent()
107
+ .expect("Operation failed");
108
+ let test_file = workspace_root.join("tests/fixtures/xlsx-oom-repro/kreuzberg-oom-repro.xlsx");
109
+
110
+ if !test_file.exists() {
111
+ println!("Skipping test: Test file not found at {:?}", test_file);
112
+ println!("Run: node tests/fixtures/xlsx-oom-repro/generate-oom-xlsx.mjs");
113
+ return;
114
+ }
115
+
116
+ let file_path = test_file.to_str().expect("File path should be valid UTF-8");
117
+
118
+ // This should NOT cause OOM even though dimension claims A1:XFD1048575
119
+ // The actual data is minimal (only ~26 cells with Solver metadata)
120
+ let result = read_excel_file(file_path).expect("Should extract XLSX with extreme dimensions without OOM");
121
+
122
+ // Verify we got the actual data, not a massive allocation
123
+ assert!(!result.sheets.is_empty(), "Should have at least one sheet");
124
+
125
+ // The file has normal cells A1, B1 plus Solver cells at extreme positions
126
+ // Verify we extracted something reasonable, not 17 billion cells
127
+ let sheet = &result.sheets[0];
128
+ assert!(
129
+ sheet.markdown.len() < 10000,
130
+ "Sheet markdown content should be small (< 10000 chars), not massive. Got {} chars",
131
+ sheet.markdown.len()
132
+ );
133
+
134
+ // Verify metadata was extracted
135
+ assert!(
136
+ result.metadata.contains_key("sheet_count"),
137
+ "Should have sheet_count metadata"
138
+ );
139
+
140
+ println!("✅ XLSX Excel Solver extreme dimensions test passed!");
141
+ println!(
142
+ " Sheet markdown length: {} chars (reasonable size)",
143
+ sheet.markdown.len()
144
+ );
145
+ println!(" Successfully handled dimension A1:XFD1048575 without OOM");
146
+ }
@@ -223,7 +223,7 @@ typedef struct CErrorDetails {
223
223
  * # Memory Layout
224
224
  *
225
225
  * Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
226
- * Field order: 12 pointers (8 bytes each) + 1 bool + 7 bytes padding = 104 bytes total
226
+ * Field order: 13 pointers (8 bytes each) + 1 bool + 7 bytes padding = 112 bytes total
227
227
  *
228
228
  * The `#[repr(C)]` attribute ensures the struct follows C's memory layout rules:
229
229
  * - Fields are laid out in order
@@ -284,6 +284,10 @@ typedef struct CExtractionResult {
284
284
  * Per-page content as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
285
285
  */
286
286
  char *pages_json;
287
+ /**
288
+ * Semantic elements as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
289
+ */
290
+ char *elements_json;
287
291
  /**
288
292
  * Whether extraction was successful
289
293
  */
@@ -1608,7 +1612,7 @@ char *kreuzberg_clone_string(const char *s);
1608
1612
  *
1609
1613
  * # Memory Layout
1610
1614
  *
1611
- * This function frees all 12 string fields in CExtractionResult:
1615
+ * This function frees all 13 string fields in CExtractionResult:
1612
1616
  * 1. content
1613
1617
  * 2. mime_type
1614
1618
  * 3. language
@@ -1621,6 +1625,7 @@ char *kreuzberg_clone_string(const char *s);
1621
1625
  * 10. images_json
1622
1626
  * 11. page_structure_json (FIXED: was missing before PR #3)
1623
1627
  * 12. pages_json (FIXED: was missing before PR #3)
1628
+ * 13. elements_json (ADDED: for element-based extraction support)
1624
1629
  *
1625
1630
  * # Example (C)
1626
1631
  *
@@ -67,7 +67,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
67
67
  images,
68
68
  pages,
69
69
  djot_content: _,
70
- elements: _,
70
+ elements,
71
71
  } = result;
72
72
 
73
73
  let sanitized_content = if content.contains('\0') {
@@ -179,6 +179,17 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
179
179
  _ => None,
180
180
  };
181
181
 
182
+ let elements_json_guard = match elements {
183
+ Some(elements) if !elements.is_empty() => {
184
+ let json =
185
+ serde_json::to_string(&elements).map_err(|e| format!("Failed to serialize elements to JSON: {}", e))?;
186
+ Some(CStringGuard::new(CString::new(json).map_err(|e| {
187
+ format!("Failed to convert elements JSON to C string: {}", e)
188
+ })?))
189
+ }
190
+ _ => None,
191
+ };
192
+
182
193
  Ok(Box::into_raw(Box::new(CExtractionResult {
183
194
  content: content_guard.into_raw(),
184
195
  mime_type: mime_type_guard.into_raw(),
@@ -192,6 +203,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
192
203
  images_json: images_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
193
204
  page_structure_json: page_structure_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
194
205
  pages_json: pages_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
206
+ elements_json: elements_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
195
207
  success: true,
196
208
  _padding1: [0u8; 7],
197
209
  })))
@@ -134,8 +134,8 @@ mod tests {
134
134
  // Test size
135
135
  assert_eq!(
136
136
  std::mem::size_of::<CExtractionResult>(),
137
- 104,
138
- "CExtractionResult must be exactly 104 bytes"
137
+ 112,
138
+ "CExtractionResult must be exactly 112 bytes"
139
139
  );
140
140
 
141
141
  // Test alignment
@@ -197,6 +197,7 @@ mod tests {
197
197
  images_json: ptr::null_mut(),
198
198
  page_structure_json: ptr::null_mut(),
199
199
  pages_json: ptr::null_mut(),
200
+ elements_json: ptr::null_mut(),
200
201
  success: true,
201
202
  _padding1: [0u8; 7],
202
203
  }))
@@ -510,6 +511,7 @@ mod tests {
510
511
  images_json: ptr::null_mut(),
511
512
  page_structure_json: ptr::null_mut(),
512
513
  pages_json: ptr::null_mut(),
514
+ elements_json: ptr::null_mut(),
513
515
  success: true,
514
516
  _padding1: [0u8; 7],
515
517
  }));
@@ -522,7 +524,7 @@ mod tests {
522
524
  #[test]
523
525
  fn test_extraction_result_free_all_fields_allocated() {
524
526
  unsafe {
525
- // Test freeing a result where ALL 12 string fields are allocated
527
+ // Test freeing a result where ALL 13 string fields are allocated
526
528
  // This verifies that kreuzberg_free_result properly frees all fields
527
529
  let result = Box::into_raw(Box::new(CExtractionResult {
528
530
  content: CString::new("test content").unwrap().into_raw(),
@@ -537,11 +539,12 @@ mod tests {
537
539
  images_json: CString::new("[{\"data\":\"base64\"}]").unwrap().into_raw(),
538
540
  page_structure_json: CString::new("{\"pages\":1}").unwrap().into_raw(),
539
541
  pages_json: CString::new("[{\"page\":1,\"content\":\"test\"}]").unwrap().into_raw(),
542
+ elements_json: CString::new("[]").unwrap().into_raw(),
540
543
  success: true,
541
544
  _padding1: [0u8; 7],
542
545
  }));
543
546
 
544
- // Should properly free all 12 allocated string fields without leaking memory
547
+ // Should properly free all 13 allocated string fields without leaking memory
545
548
  kreuzberg_free_result(result);
546
549
  }
547
550
  }
@@ -621,7 +624,7 @@ mod tests {
621
624
  /// Test CExtractionResult size exactly matches FFI contract
622
625
  #[test]
623
626
  fn test_c_extraction_result_size() {
624
- assert_eq!(std::mem::size_of::<CExtractionResult>(), 104);
627
+ assert_eq!(std::mem::size_of::<CExtractionResult>(), 112);
625
628
  assert_eq!(std::mem::align_of::<CExtractionResult>(), 8);
626
629
  }
627
630
 
@@ -146,7 +146,7 @@ pub unsafe extern "C" fn kreuzberg_clone_string(s: *const c_char) -> *mut c_char
146
146
  ///
147
147
  /// # Memory Layout
148
148
  ///
149
- /// This function frees all 12 string fields in CExtractionResult:
149
+ /// This function frees all 13 string fields in CExtractionResult:
150
150
  /// 1. content
151
151
  /// 2. mime_type
152
152
  /// 3. language
@@ -159,6 +159,7 @@ pub unsafe extern "C" fn kreuzberg_clone_string(s: *const c_char) -> *mut c_char
159
159
  /// 10. images_json
160
160
  /// 11. page_structure_json (FIXED: was missing before PR #3)
161
161
  /// 12. pages_json (FIXED: was missing before PR #3)
162
+ /// 13. elements_json (ADDED: for element-based extraction support)
162
163
  ///
163
164
  /// # Example (C)
164
165
  ///
@@ -209,6 +210,9 @@ pub unsafe extern "C" fn kreuzberg_free_result(result: *mut CExtractionResult) {
209
210
  if !result_box.pages_json.is_null() {
210
211
  unsafe { drop(CString::from_raw(result_box.pages_json)) };
211
212
  }
213
+ if !result_box.elements_json.is_null() {
214
+ unsafe { drop(CString::from_raw(result_box.elements_json)) };
215
+ }
212
216
  }
213
217
  }
214
218
 
@@ -232,6 +236,7 @@ mod tests {
232
236
  images_json: CString::new("[]").unwrap().into_raw(),
233
237
  page_structure_json: CString::new("{}").unwrap().into_raw(),
234
238
  pages_json: CString::new("[]").unwrap().into_raw(),
239
+ elements_json: CString::new("[]").unwrap().into_raw(),
235
240
  success: true,
236
241
  _padding1: [0u8; 7],
237
242
  }))
@@ -252,6 +257,7 @@ mod tests {
252
257
  images_json: ptr::null_mut(),
253
258
  page_structure_json: ptr::null_mut(),
254
259
  pages_json: ptr::null_mut(),
260
+ elements_json: ptr::null_mut(),
255
261
  success: true,
256
262
  _padding1: [0u8; 7],
257
263
  }))
@@ -343,6 +349,34 @@ mod tests {
343
349
  images_json: ptr::null_mut(),
344
350
  page_structure_json: CString::new("{\"pages\": []}").unwrap().into_raw(),
345
351
  pages_json: CString::new("[{\"content\": \"page 1\"}]").unwrap().into_raw(),
352
+ elements_json: ptr::null_mut(),
353
+ success: true,
354
+ _padding1: [0u8; 7],
355
+ }));
356
+
357
+ unsafe { kreuzberg_free_result(result) };
358
+ // If we get here without crashing or leaking, the test passed
359
+ }
360
+
361
+ #[test]
362
+ fn test_free_result_elements_json() {
363
+ // Test: ensure elements_json is freed
364
+ let result = Box::into_raw(Box::new(CExtractionResult {
365
+ content: CString::new("test").unwrap().into_raw(),
366
+ mime_type: CString::new("text/plain").unwrap().into_raw(),
367
+ language: ptr::null_mut(),
368
+ date: ptr::null_mut(),
369
+ subject: ptr::null_mut(),
370
+ tables_json: ptr::null_mut(),
371
+ detected_languages_json: ptr::null_mut(),
372
+ metadata_json: ptr::null_mut(),
373
+ chunks_json: ptr::null_mut(),
374
+ images_json: ptr::null_mut(),
375
+ page_structure_json: ptr::null_mut(),
376
+ pages_json: ptr::null_mut(),
377
+ elements_json: CString::new(r#"[{"element_id":"abc","element_type":"title","text":"Hello"}]"#)
378
+ .unwrap()
379
+ .into_raw(),
346
380
  success: true,
347
381
  _padding1: [0u8; 7],
348
382
  }));
@@ -51,7 +51,7 @@ impl Drop for CStringGuard {
51
51
  /// # Memory Layout
52
52
  ///
53
53
  /// Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
54
- /// Field order: 12 pointers (8 bytes each) + 1 bool + 7 bytes padding = 104 bytes total
54
+ /// Field order: 13 pointers (8 bytes each) + 1 bool + 7 bytes padding = 112 bytes total
55
55
  ///
56
56
  /// The `#[repr(C)]` attribute ensures the struct follows C's memory layout rules:
57
57
  /// - Fields are laid out in order
@@ -88,6 +88,8 @@ pub struct CExtractionResult {
88
88
  pub page_structure_json: *mut c_char,
89
89
  /// Per-page content as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
90
90
  pub pages_json: *mut c_char,
91
+ /// Semantic elements as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
92
+ pub elements_json: *mut c_char,
91
93
  /// Whether extraction was successful
92
94
  pub success: bool,
93
95
  /// Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
@@ -150,7 +152,7 @@ pub struct CBatchResult {
150
152
  const _: () = {
151
153
  const fn assert_c_extraction_result_size() {
152
154
  const SIZE: usize = std::mem::size_of::<CExtractionResult>();
153
- const _: () = assert!(SIZE == 104, "CExtractionResult size must be 104 bytes");
155
+ const _: () = assert!(SIZE == 112, "CExtractionResult size must be 112 bytes");
154
156
  }
155
157
 
156
158
  const fn assert_c_extraction_result_alignment() {
@@ -195,8 +197,8 @@ mod tests {
195
197
  fn test_c_extraction_result_size() {
196
198
  assert_eq!(
197
199
  std::mem::size_of::<CExtractionResult>(),
198
- 104,
199
- "CExtractionResult must be exactly 104 bytes"
200
+ 112,
201
+ "CExtractionResult must be exactly 112 bytes"
200
202
  );
201
203
  }
202
204
 
@@ -327,7 +329,8 @@ mod tests {
327
329
  assert_eq!(offset_of!(CExtractionResult, images_json), 72);
328
330
  assert_eq!(offset_of!(CExtractionResult, page_structure_json), 80);
329
331
  assert_eq!(offset_of!(CExtractionResult, pages_json), 88);
330
- assert_eq!(offset_of!(CExtractionResult, success), 96);
332
+ assert_eq!(offset_of!(CExtractionResult, elements_json), 96);
333
+ assert_eq!(offset_of!(CExtractionResult, success), 104);
331
334
  }
332
335
 
333
336
  /// Verify field offsets in CBatchResult match expectations
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.2.0"
3
+ version = "4.2.2"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.2.0
4
+ version: 4.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-01-26 00:00:00.000000000 Z
11
+ date: 2026-01-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -233,6 +233,7 @@ files:
233
233
  - lib/kreuzberg/cli.rb
234
234
  - lib/kreuzberg/cli_proxy.rb
235
235
  - lib/kreuzberg/config.rb
236
+ - lib/kreuzberg/djot_content.rb
236
237
  - lib/kreuzberg/error_context.rb
237
238
  - lib/kreuzberg/errors.rb
238
239
  - lib/kreuzberg/extraction_api.rb
@@ -362,6 +363,7 @@ files:
362
363
  - vendor/kreuzberg/src/api/error.rs
363
364
  - vendor/kreuzberg/src/api/handlers.rs
364
365
  - vendor/kreuzberg/src/api/mod.rs
366
+ - vendor/kreuzberg/src/api/openapi.rs
365
367
  - vendor/kreuzberg/src/api/router.rs
366
368
  - vendor/kreuzberg/src/api/startup.rs
367
369
  - vendor/kreuzberg/src/api/types.rs
@@ -591,6 +593,7 @@ files:
591
593
  - vendor/kreuzberg/src/plugins/registry/ocr.rs
592
594
  - vendor/kreuzberg/src/plugins/registry/processor.rs
593
595
  - vendor/kreuzberg/src/plugins/registry/validator.rs
596
+ - vendor/kreuzberg/src/plugins/startup_validation.rs
594
597
  - vendor/kreuzberg/src/plugins/traits.rs
595
598
  - vendor/kreuzberg/src/plugins/validator/mod.rs
596
599
  - vendor/kreuzberg/src/plugins/validator/registry.rs