RubyGems - kreuzberg - Versions diffs - 4.2.0 → 4.2.2 - Mend

kreuzberg 4.2.0 → 4.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

checksums.yaml +4 -4
data/Gemfile.lock +2 -2
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
data/lib/kreuzberg/cli.rb +16 -6
data/lib/kreuzberg/cli_proxy.rb +3 -1
data/lib/kreuzberg/config.rb +59 -28
data/lib/kreuzberg/djot_content.rb +225 -0
data/lib/kreuzberg/extraction_api.rb +20 -4
data/lib/kreuzberg/result.rb +12 -2
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +1 -0
data/sig/kreuzberg.rbs +23 -11
data/spec/binding/batch_spec.rb +6 -5
data/spec/binding/config_spec.rb +1 -1
data/spec/binding/error_recovery_spec.rb +3 -3
data/spec/binding/tables_spec.rb +11 -2
data/spec/unit/config/extraction_config_spec.rb +2 -2
data/spec/unit/config/output_format_spec.rb +18 -18
data/vendor/Cargo.toml +1 -1
data/vendor/kreuzberg/Cargo.toml +3 -2
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/api/error.rs +60 -0
data/vendor/kreuzberg/src/api/handlers.rs +153 -32
data/vendor/kreuzberg/src/api/mod.rs +2 -0
data/vendor/kreuzberg/src/api/openapi.rs +141 -0
data/vendor/kreuzberg/src/api/router.rs +24 -2
data/vendor/kreuzberg/src/api/startup.rs +21 -1
data/vendor/kreuzberg/src/api/types.rs +50 -4
data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
data/vendor/kreuzberg/src/core/io.rs +7 -7
data/vendor/kreuzberg/src/core/mime.rs +4 -4
data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
data/vendor/kreuzberg/tests/config_behavioral.rs +14 -12
data/vendor/kreuzberg/tests/core_integration.rs +2 -4
data/vendor/kreuzberg/tests/mime_detection.rs +3 -2
data/vendor/kreuzberg/tests/pptx_regression_tests.rs +284 -1
data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
data/vendor/kreuzberg-ffi/src/types.rs +8 -5
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +5 -2

data/vendor/kreuzberg/tests/pptx_regression_tests.rs CHANGED Viewed

@@ -1,13 +1,15 @@
 //! Regression tests for PPTX/PPSX extraction bugs
 //!
 //! GitHub Issue #321: PPTX extraction fails on shapes without txBody (image placeholders) + PPSX not supported
+//! GitHub Issue #329: Extracting images from PPTX results in reversed page numbers
 //!
 //! Bug 1: "No txBody found" - PPTX extraction fails when any shape lacks a text body
 //! Bug 2: PPSX not supported - PowerPoint Show files rejected entirely
+//! Bug 3: Image page numbers reversed - image on slide 1 reports page_number=2
 #![cfg(feature = "office")]
-use kreuzberg::{ExtractionConfig, extract_file};
+use kreuzberg::{ExtractionConfig, ImageExtractionConfig, extract_file};
 use std::io::Write;
 use tempfile::NamedTempFile;
 use zip::CompressionMethod;
@@ -512,3 +514,284 @@ async fn test_pptx_mixed_shapes_extraction() {
         }
     }
 }
+/// Test that images extracted from PPTX have correct page numbers.
+///
+/// When a PPTX has multiple slides and an image on slide 1, the extracted image
+/// should have page_number=1 (not reversed).
+///
+/// GitHub Issue #329: Image on slide 1 of 2-slide PPTX reports page_number=2
+#[tokio::test]
+async fn test_pptx_image_page_numbers_not_reversed() {
+    // Create a PPTX with 2 slides, image on slide 1
+    let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
+    // A minimal 1x1 red PNG image (valid PNG format)
+    let png_image: &[u8] = &[
+        0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG signature
+        0x00, 0x00, 0x00, 0x0D, // IHDR chunk length
+        0x49, 0x48, 0x44, 0x52, // "IHDR"
+        0x00, 0x00, 0x00, 0x01, // width: 1
+        0x00, 0x00, 0x00, 0x01, // height: 1
+        0x08, 0x02, // bit depth: 8, color type: RGB
+        0x00, 0x00, 0x00, // compression, filter, interlace
+        0x90, 0x77, 0x53, 0xDE, // IHDR CRC
+        0x00, 0x00, 0x00, 0x0C, // IDAT chunk length
+        0x49, 0x44, 0x41, 0x54, // "IDAT"
+        0x08, 0xD7, 0x63, 0xF8, 0xCF, 0xC0, 0x00, 0x00, // compressed data
+        0x01, 0x01, 0x01, 0x00, // checksum
+        0x18, 0xDD, 0x8D, 0xB4, // IDAT CRC
+        0x00, 0x00, 0x00, 0x00, // IEND chunk length
+        0x49, 0x45, 0x4E, 0x44, // "IEND"
+        0xAE, 0x42, 0x60, 0x82, // IEND CRC
+    ];
+    {
+        let mut zip = ZipWriter::new(&mut temp_file);
+        let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
+        // Add [Content_Types].xml
+        zip.start_file("[Content_Types].xml", options)
+            .expect("Operation failed");
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+  <Default Extension="xml" ContentType="application/xml"/>
+  <Default Extension="png" ContentType="image/png"/>
+  <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
+  <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
+  <Override PartName="/ppt/slides/slide2.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
+</Types>"#).expect("Operation failed");
+        // Add _rels/.rels
+        zip.start_file("_rels/.rels", options).expect("Operation failed");
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
+</Relationships>"#).expect("Operation failed");
+        // Add ppt/presentation.xml
+        zip.start_file("ppt/presentation.xml", options)
+            .expect("Operation failed");
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
+                xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
+  <p:sldIdLst>
+    <p:sldId id="256" r:id="rId2"/>
+    <p:sldId id="257" r:id="rId3"/>
+  </p:sldIdLst>
+</p:presentation>"#,
+        )
+        .expect("Operation failed");
+        // Add ppt/_rels/presentation.xml.rels
+        // BUG REPRODUCTION: Slides listed in REVERSE order in XML (slide2 before slide1)
+        // This is valid XML - PowerPoint doesn't guarantee order in rels files
+        // GitHub Issue #329: This causes page numbers to be reversed
+        zip.start_file("ppt/_rels/presentation.xml.rels", options)
+            .expect("Operation failed");
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide2.xml"/>
+  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
+</Relationships>"#).expect("Operation failed");
+        // Add the image file
+        zip.start_file("ppt/media/image1.png", options)
+            .expect("Operation failed");
+        zip.write_all(png_image).expect("Operation failed");
+        // Add slide 1 WITH an image
+        zip.start_file("ppt/slides/slide1.xml", options)
+            .expect("Operation failed");
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
+       xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
+       xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
+  <p:cSld>
+    <p:spTree>
+      <p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
+      <p:grpSpPr/>
+      <p:sp>
+        <p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
+        <p:spPr/>
+        <p:txBody><a:bodyPr/><a:lstStyle/><a:p><a:r><a:t>Slide 1 - Has Image</a:t></a:r></a:p></p:txBody>
+      </p:sp>
+      <p:pic>
+        <p:nvPicPr>
+          <p:cNvPr id="3" name="Picture 1"/>
+          <p:cNvPicPr><a:picLocks noChangeAspect="1"/></p:cNvPicPr>
+          <p:nvPr/>
+        </p:nvPicPr>
+        <p:blipFill>
+          <a:blip r:embed="rId2"/>
+          <a:stretch><a:fillRect/></a:stretch>
+        </p:blipFill>
+        <p:spPr>
+          <a:xfrm><a:off x="0" y="0"/><a:ext cx="100000" cy="100000"/></a:xfrm>
+          <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
+        </p:spPr>
+      </p:pic>
+    </p:spTree>
+  </p:cSld>
+</p:sld>"#,
+        )
+        .expect("Operation failed");
+        // Add slide 1 relationships (points to the image)
+        zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
+            .expect("Operation failed");
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image1.png"/>
+</Relationships>"#).expect("Operation failed");
+        // Add slide 2 WITHOUT an image
+        zip.start_file("ppt/slides/slide2.xml", options)
+            .expect("Operation failed");
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
+       xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
+       xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
+  <p:cSld>
+    <p:spTree>
+      <p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
+      <p:grpSpPr/>
+      <p:sp>
+        <p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
+        <p:spPr/>
+        <p:txBody><a:bodyPr/><a:lstStyle/><a:p><a:r><a:t>Slide 2 - No Image</a:t></a:r></a:p></p:txBody>
+      </p:sp>
+    </p:spTree>
+  </p:cSld>
+</p:sld>"#,
+        )
+        .expect("Operation failed");
+        // Add empty slide 2 relationships
+        zip.start_file("ppt/slides/_rels/slide2.xml.rels", options)
+            .expect("Operation failed");
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+</Relationships>"#,
+        )
+        .expect("Operation failed");
+        zip.finish().expect("Operation failed");
+    }
+    // Extract with images enabled
+    let config = ExtractionConfig {
+        images: Some(ImageExtractionConfig {
+            extract_images: true,
+            target_dpi: 300,
+            max_image_dimension: 4096,
+            auto_adjust_dpi: true,
+            min_dpi: 72,
+            max_dpi: 600,
+        }),
+        ..Default::default()
+    };
+    let result = extract_file(
+        temp_file.path(),
+        Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
+        &config,
+    )
+    .await;
+    match result {
+        Ok(extraction) => {
+            // Verify text extraction works
+            assert!(extraction.content.contains("Slide 1"), "Should extract slide 1 text");
+            assert!(extraction.content.contains("Slide 2"), "Should extract slide 2 text");
+            // Verify we got an image
+            let images = extraction.images.as_ref().expect("Images should be present");
+            assert!(!images.is_empty(), "Should extract at least one image");
+            // THE CRITICAL TEST: Image on slide 1 should have page_number=1, NOT 2
+            let image = &images[0];
+            assert_eq!(
+                image.page_number,
+                Some(1),
+                "GitHub Issue #329: Image on slide 1 should have page_number=1, but got {:?}. \
+                 The page numbers are reversed!",
+                image.page_number
+            );
+            println!("✅ PPTX image page numbers are correct!");
+            println!("   Image on slide 1 has page_number={:?}", image.page_number);
+        }
+        Err(e) => {
+            panic!("PPTX extraction failed: {:?}", e);
+        }
+    }
+}
+/// Test with actual user-provided PPTX file from GitHub Issue #329.
+///
+/// The user's file has slides listed in reverse order in presentation.xml.rels,
+/// which caused images to have incorrect page numbers.
+#[tokio::test]
+async fn test_pptx_image_page_numbers_issue329_user_file() {
+    let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+        .parent()
+        .expect("Operation failed")
+        .parent()
+        .expect("Operation failed");
+    let test_file = workspace_root.join("test_documents/presentations/pptx_reversed_slide_order_issue329.pptx");
+    if !test_file.exists() {
+        println!("Skipping test: User file not found at {:?}", test_file);
+        return;
+    }
+    // Extract with images enabled
+    let config = ExtractionConfig {
+        images: Some(ImageExtractionConfig {
+            extract_images: true,
+            target_dpi: 300,
+            max_image_dimension: 4096,
+            auto_adjust_dpi: true,
+            min_dpi: 72,
+            max_dpi: 600,
+        }),
+        ..Default::default()
+    };
+    let result = extract_file(&test_file, None, &config).await;
+    match result {
+        Ok(extraction) => {
+            // The user's file has an image on slide 1
+            let images = extraction.images.as_ref().expect("Images should be extracted");
+            if images.is_empty() {
+                println!("No images extracted from user file (may not have embedded images)");
+                return;
+            }
+            // All images should have page_number = 1 since they're on the first slide
+            for (idx, image) in images.iter().enumerate() {
+                assert_eq!(
+                    image.page_number,
+                    Some(1),
+                    "GitHub Issue #329: Image {} should have page_number=1, but got {:?}",
+                    idx,
+                    image.page_number
+                );
+            }
+            println!("✅ User file from Issue #329 - image page numbers correct!");
+            println!("   Found {} images, all with page_number=1", images.len());
+        }
+        Err(e) => {
+            panic!("Failed to extract user file: {:?}", e);
+        }
+    }
+}

data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs CHANGED Viewed

@@ -88,3 +88,59 @@ fn test_xlsx_minimal_metadata_extraction() {
     println!("✅ XLSX minimal metadata extraction test passed!");
 }
+/// Test for issue #331: OOM with XLSX files containing Excel Solver add-in data
+///
+/// This test reproduces the issue where Excel Solver stores configuration data
+/// in cells at extreme positions (XFD1048550-1048575 = column 16384, rows near 1M).
+/// The sheet dimension is set to "A1:XFD1048575", which could cause Kreuzberg
+/// to attempt allocating memory for ~17 billion cells (16384 × 1048575).
+///
+/// Expected behavior: Should handle extreme dimensions gracefully without OOM.
+/// The file is only 6.8KB and contains minimal actual data.
+#[test]
+fn test_xlsx_excel_solver_extreme_dimensions_no_oom() {
+    let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+        .parent()
+        .expect("Operation failed")
+        .parent()
+        .expect("Operation failed");
+    let test_file = workspace_root.join("tests/fixtures/xlsx-oom-repro/kreuzberg-oom-repro.xlsx");
+    if !test_file.exists() {
+        println!("Skipping test: Test file not found at {:?}", test_file);
+        println!("Run: node tests/fixtures/xlsx-oom-repro/generate-oom-xlsx.mjs");
+        return;
+    }
+    let file_path = test_file.to_str().expect("File path should be valid UTF-8");
+    // This should NOT cause OOM even though dimension claims A1:XFD1048575
+    // The actual data is minimal (only ~26 cells with Solver metadata)
+    let result = read_excel_file(file_path).expect("Should extract XLSX with extreme dimensions without OOM");
+    // Verify we got the actual data, not a massive allocation
+    assert!(!result.sheets.is_empty(), "Should have at least one sheet");
+    // The file has normal cells A1, B1 plus Solver cells at extreme positions
+    // Verify we extracted something reasonable, not 17 billion cells
+    let sheet = &result.sheets[0];
+    assert!(
+        sheet.markdown.len() < 10000,
+        "Sheet markdown content should be small (< 10000 chars), not massive. Got {} chars",
+        sheet.markdown.len()
+    );
+    // Verify metadata was extracted
+    assert!(
+        result.metadata.contains_key("sheet_count"),
+        "Should have sheet_count metadata"
+    );
+    println!("✅ XLSX Excel Solver extreme dimensions test passed!");
+    println!(
+        "   Sheet markdown length: {} chars (reasonable size)",
+        sheet.markdown.len()
+    );
+    println!("   Successfully handled dimension A1:XFD1048575 without OOM");
+}

data/vendor/kreuzberg-ffi/kreuzberg.h CHANGED Viewed

@@ -223,7 +223,7 @@ typedef struct CErrorDetails {
  * # Memory Layout
  *
  * Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
- * Field order: 12 pointers (8 bytes each) + 1 bool + 7 bytes padding = 104 bytes total
+ * Field order: 13 pointers (8 bytes each) + 1 bool + 7 bytes padding = 112 bytes total
  *
  * The `#[repr(C)]` attribute ensures the struct follows C's memory layout rules:
  * - Fields are laid out in order
@@ -284,6 +284,10 @@ typedef struct CExtractionResult {
    * Per-page content as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
    */
   char *pages_json;
+  /**
+   * Semantic elements as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
+   */
+  char *elements_json;
   /**
    * Whether extraction was successful
    */
@@ -1608,7 +1612,7 @@ char *kreuzberg_clone_string(const char *s);
  *
  * # Memory Layout
  *
- * This function frees all 12 string fields in CExtractionResult:
+ * This function frees all 13 string fields in CExtractionResult:
  * 1. content
  * 2. mime_type
  * 3. language
@@ -1621,6 +1625,7 @@ char *kreuzberg_clone_string(const char *s);
  * 10. images_json
  * 11. page_structure_json (FIXED: was missing before PR #3)
  * 12. pages_json (FIXED: was missing before PR #3)
+ * 13. elements_json (ADDED: for element-based extraction support)
  *
  * # Example (C)
  *

data/vendor/kreuzberg-ffi/src/helpers.rs CHANGED Viewed

@@ -67,7 +67,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
         images,
         pages,
         djot_content: _,
-        elements: _,
+        elements,
     } = result;
     let sanitized_content = if content.contains('\0') {
@@ -179,6 +179,17 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
         _ => None,
     };
+    let elements_json_guard = match elements {
+        Some(elements) if !elements.is_empty() => {
+            let json =
+                serde_json::to_string(&elements).map_err(|e| format!("Failed to serialize elements to JSON: {}", e))?;
+            Some(CStringGuard::new(CString::new(json).map_err(|e| {
+                format!("Failed to convert elements JSON to C string: {}", e)
+            })?))
+        }
+        _ => None,
+    };
     Ok(Box::into_raw(Box::new(CExtractionResult {
         content: content_guard.into_raw(),
         mime_type: mime_type_guard.into_raw(),
@@ -192,6 +203,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
         images_json: images_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
         page_structure_json: page_structure_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
         pages_json: pages_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
+        elements_json: elements_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
         success: true,
         _padding1: [0u8; 7],
     })))

data/vendor/kreuzberg-ffi/src/lib.rs CHANGED Viewed

@@ -134,8 +134,8 @@ mod tests {
         // Test size
         assert_eq!(
             std::mem::size_of::<CExtractionResult>(),
-            104,
-            "CExtractionResult must be exactly 104 bytes"
+            112,
+            "CExtractionResult must be exactly 112 bytes"
         );
         // Test alignment
@@ -197,6 +197,7 @@ mod tests {
             images_json: ptr::null_mut(),
             page_structure_json: ptr::null_mut(),
             pages_json: ptr::null_mut(),
+            elements_json: ptr::null_mut(),
             success: true,
             _padding1: [0u8; 7],
         }))
@@ -510,6 +511,7 @@ mod tests {
                 images_json: ptr::null_mut(),
                 page_structure_json: ptr::null_mut(),
                 pages_json: ptr::null_mut(),
+                elements_json: ptr::null_mut(),
                 success: true,
                 _padding1: [0u8; 7],
             }));
@@ -522,7 +524,7 @@ mod tests {
     #[test]
     fn test_extraction_result_free_all_fields_allocated() {
         unsafe {
-            // Test freeing a result where ALL 12 string fields are allocated
+            // Test freeing a result where ALL 13 string fields are allocated
             // This verifies that kreuzberg_free_result properly frees all fields
             let result = Box::into_raw(Box::new(CExtractionResult {
                 content: CString::new("test content").unwrap().into_raw(),
@@ -537,11 +539,12 @@ mod tests {
                 images_json: CString::new("[{\"data\":\"base64\"}]").unwrap().into_raw(),
                 page_structure_json: CString::new("{\"pages\":1}").unwrap().into_raw(),
                 pages_json: CString::new("[{\"page\":1,\"content\":\"test\"}]").unwrap().into_raw(),
+                elements_json: CString::new("[]").unwrap().into_raw(),
                 success: true,
                 _padding1: [0u8; 7],
             }));
-            // Should properly free all 12 allocated string fields without leaking memory
+            // Should properly free all 13 allocated string fields without leaking memory
             kreuzberg_free_result(result);
         }
     }
@@ -621,7 +624,7 @@ mod tests {
     /// Test CExtractionResult size exactly matches FFI contract
     #[test]
     fn test_c_extraction_result_size() {
-        assert_eq!(std::mem::size_of::<CExtractionResult>(), 104);
+        assert_eq!(std::mem::size_of::<CExtractionResult>(), 112);
         assert_eq!(std::mem::align_of::<CExtractionResult>(), 8);
     }

data/vendor/kreuzberg-ffi/src/memory.rs CHANGED Viewed

@@ -146,7 +146,7 @@ pub unsafe extern "C" fn kreuzberg_clone_string(s: *const c_char) -> *mut c_char
 ///
 /// # Memory Layout
 ///
-/// This function frees all 12 string fields in CExtractionResult:
+/// This function frees all 13 string fields in CExtractionResult:
 /// 1. content
 /// 2. mime_type
 /// 3. language
@@ -159,6 +159,7 @@ pub unsafe extern "C" fn kreuzberg_clone_string(s: *const c_char) -> *mut c_char
 /// 10. images_json
 /// 11. page_structure_json (FIXED: was missing before PR #3)
 /// 12. pages_json (FIXED: was missing before PR #3)
+/// 13. elements_json (ADDED: for element-based extraction support)
 ///
 /// # Example (C)
 ///
@@ -209,6 +210,9 @@ pub unsafe extern "C" fn kreuzberg_free_result(result: *mut CExtractionResult) {
         if !result_box.pages_json.is_null() {
             unsafe { drop(CString::from_raw(result_box.pages_json)) };
         }
+        if !result_box.elements_json.is_null() {
+            unsafe { drop(CString::from_raw(result_box.elements_json)) };
+        }
     }
 }
@@ -232,6 +236,7 @@ mod tests {
             images_json: CString::new("[]").unwrap().into_raw(),
             page_structure_json: CString::new("{}").unwrap().into_raw(),
             pages_json: CString::new("[]").unwrap().into_raw(),
+            elements_json: CString::new("[]").unwrap().into_raw(),
             success: true,
             _padding1: [0u8; 7],
         }))
@@ -252,6 +257,7 @@ mod tests {
             images_json: ptr::null_mut(),
             page_structure_json: ptr::null_mut(),
             pages_json: ptr::null_mut(),
+            elements_json: ptr::null_mut(),
             success: true,
             _padding1: [0u8; 7],
         }))
@@ -343,6 +349,34 @@ mod tests {
             images_json: ptr::null_mut(),
             page_structure_json: CString::new("{\"pages\": []}").unwrap().into_raw(),
             pages_json: CString::new("[{\"content\": \"page 1\"}]").unwrap().into_raw(),
+            elements_json: ptr::null_mut(),
+            success: true,
+            _padding1: [0u8; 7],
+        }));
+        unsafe { kreuzberg_free_result(result) };
+        // If we get here without crashing or leaking, the test passed
+    }
+    #[test]
+    fn test_free_result_elements_json() {
+        // Test: ensure elements_json is freed
+        let result = Box::into_raw(Box::new(CExtractionResult {
+            content: CString::new("test").unwrap().into_raw(),
+            mime_type: CString::new("text/plain").unwrap().into_raw(),
+            language: ptr::null_mut(),
+            date: ptr::null_mut(),
+            subject: ptr::null_mut(),
+            tables_json: ptr::null_mut(),
+            detected_languages_json: ptr::null_mut(),
+            metadata_json: ptr::null_mut(),
+            chunks_json: ptr::null_mut(),
+            images_json: ptr::null_mut(),
+            page_structure_json: ptr::null_mut(),
+            pages_json: ptr::null_mut(),
+            elements_json: CString::new(r#"[{"element_id":"abc","element_type":"title","text":"Hello"}]"#)
+                .unwrap()
+                .into_raw(),
             success: true,
             _padding1: [0u8; 7],
         }));

data/vendor/kreuzberg-ffi/src/types.rs CHANGED Viewed

@@ -51,7 +51,7 @@ impl Drop for CStringGuard {
 /// # Memory Layout
 ///
 /// Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
-/// Field order: 12 pointers (8 bytes each) + 1 bool + 7 bytes padding = 104 bytes total
+/// Field order: 13 pointers (8 bytes each) + 1 bool + 7 bytes padding = 112 bytes total
 ///
 /// The `#[repr(C)]` attribute ensures the struct follows C's memory layout rules:
 /// - Fields are laid out in order
@@ -88,6 +88,8 @@ pub struct CExtractionResult {
     pub page_structure_json: *mut c_char,
     /// Per-page content as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
     pub pages_json: *mut c_char,
+    /// Semantic elements as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
+    pub elements_json: *mut c_char,
     /// Whether extraction was successful
     pub success: bool,
     /// Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
@@ -150,7 +152,7 @@ pub struct CBatchResult {
 const _: () = {
     const fn assert_c_extraction_result_size() {
         const SIZE: usize = std::mem::size_of::<CExtractionResult>();
-        const _: () = assert!(SIZE == 104, "CExtractionResult size must be 104 bytes");
+        const _: () = assert!(SIZE == 112, "CExtractionResult size must be 112 bytes");
     }
     const fn assert_c_extraction_result_alignment() {
@@ -195,8 +197,8 @@ mod tests {
     fn test_c_extraction_result_size() {
         assert_eq!(
             std::mem::size_of::<CExtractionResult>(),
-            104,
-            "CExtractionResult must be exactly 104 bytes"
+            112,
+            "CExtractionResult must be exactly 112 bytes"
         );
     }
@@ -327,7 +329,8 @@ mod tests {
         assert_eq!(offset_of!(CExtractionResult, images_json), 72);
         assert_eq!(offset_of!(CExtractionResult, page_structure_json), 80);
         assert_eq!(offset_of!(CExtractionResult, pages_json), 88);
-        assert_eq!(offset_of!(CExtractionResult, success), 96);
+        assert_eq!(offset_of!(CExtractionResult, elements_json), 96);
+        assert_eq!(offset_of!(CExtractionResult, success), 104);
     }
     /// Verify field offsets in CBatchResult match expectations

data/vendor/kreuzberg-tesseract/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg-tesseract"
-version = "4.2.0"
+version = "4.2.2"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kreuzberg
 version: !ruby/object:Gem::Version
-  version: 4.2.0
+  version: 4.2.2
 platform: ruby
 authors:
 - Na'aman Hirschfeld
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-01-26 00:00:00.000000000 Z
+date: 2026-01-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -233,6 +233,7 @@ files:
 - lib/kreuzberg/cli.rb
 - lib/kreuzberg/cli_proxy.rb
 - lib/kreuzberg/config.rb
+- lib/kreuzberg/djot_content.rb
 - lib/kreuzberg/error_context.rb
 - lib/kreuzberg/errors.rb
 - lib/kreuzberg/extraction_api.rb
@@ -362,6 +363,7 @@ files:
 - vendor/kreuzberg/src/api/error.rs
 - vendor/kreuzberg/src/api/handlers.rs
 - vendor/kreuzberg/src/api/mod.rs
+- vendor/kreuzberg/src/api/openapi.rs
 - vendor/kreuzberg/src/api/router.rs
 - vendor/kreuzberg/src/api/startup.rs
 - vendor/kreuzberg/src/api/types.rs
@@ -591,6 +593,7 @@ files:
 - vendor/kreuzberg/src/plugins/registry/ocr.rs
 - vendor/kreuzberg/src/plugins/registry/processor.rs
 - vendor/kreuzberg/src/plugins/registry/validator.rs
+- vendor/kreuzberg/src/plugins/startup_validation.rs
 - vendor/kreuzberg/src/plugins/traits.rs
 - vendor/kreuzberg/src/plugins/validator/mod.rs
 - vendor/kreuzberg/src/plugins/validator/registry.rs