RubyGems - kreuzberg - Versions diffs - 4.2.1 → 4.2.2 - Mend

kreuzberg 4.2.1 → 4.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

checksums.yaml +4 -4
data/Gemfile.lock +2 -2
data/README.md +1 -1
data/lib/kreuzberg/config.rb +4 -20
data/lib/kreuzberg/version.rb +1 -1
data/spec/binding/config_spec.rb +1 -1
data/spec/unit/config/extraction_config_spec.rb +2 -2
data/vendor/Cargo.toml +1 -1
data/vendor/kreuzberg/Cargo.toml +3 -2
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/api/error.rs +60 -0
data/vendor/kreuzberg/src/api/handlers.rs +153 -32
data/vendor/kreuzberg/src/api/mod.rs +2 -0
data/vendor/kreuzberg/src/api/openapi.rs +141 -0
data/vendor/kreuzberg/src/api/router.rs +24 -2
data/vendor/kreuzberg/src/api/startup.rs +11 -5
data/vendor/kreuzberg/src/api/types.rs +50 -4
data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
data/vendor/kreuzberg-ffi/src/types.rs +8 -5
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +3 -2

data/vendor/kreuzberg/src/extraction/excel.rs CHANGED Viewed

@@ -27,16 +27,23 @@
 //! # Ok(())
 //! # }
 //! ```
-use calamine::{Data, Range, Reader, open_workbook_auto};
+use calamine::{Data, DataRef, Range, Reader, open_workbook_auto};
 use std::collections::HashMap;
 use std::fmt::Write as FmtWrite;
-use std::io::Cursor;
+use std::io::{Cursor, Read, Seek};
 use std::path::Path;
 use crate::error::{KreuzbergError, Result};
 use crate::extraction::capacity;
 use crate::types::{ExcelSheet, ExcelWorkbook};
+/// Maximum number of cells in a Range's bounding box before we consider it pathological.
+/// This threshold is set to prevent OOM when processing files with sparse data at extreme
+/// positions (e.g., Excel Solver files that have cells at A1 and XFD1048575).
+///
+/// 100 million cells at ~64 bytes each = ~6.4 GB, which is a reasonable upper limit.
+const MAX_BOUNDING_BOX_CELLS: u64 = 100_000_000;
 #[cfg(feature = "office")]
 use crate::extraction::office_metadata::{
     extract_core_properties, extract_custom_properties, extract_xlsx_app_properties,
@@ -45,11 +52,13 @@ use crate::extraction::office_metadata::{
 use serde_json::Value;
 pub fn read_excel_file(file_path: &str) -> Result<ExcelWorkbook> {
+    let lower_path = file_path.to_lowercase();
     #[cfg(feature = "office")]
-    let office_metadata = if file_path.to_lowercase().ends_with(".xlsx")
-        || file_path.to_lowercase().ends_with(".xlsm")
-        || file_path.to_lowercase().ends_with(".xlam")
-        || file_path.to_lowercase().ends_with(".xltm")
+    let office_metadata = if lower_path.ends_with(".xlsx")
+        || lower_path.ends_with(".xlsm")
+        || lower_path.ends_with(".xlam")
+        || lower_path.ends_with(".xltm")
     {
         extract_xlsx_office_metadata_from_file(file_path).ok()
     } else {
@@ -59,7 +68,19 @@ pub fn read_excel_file(file_path: &str) -> Result<ExcelWorkbook> {
     #[cfg(not(feature = "office"))]
     let office_metadata: Option<HashMap<String, String>> = None;
-    // We analyze the error and only wrap format errors, letting real IO errors bubble up ~keep
+    // For XLSX files, use specialized handler with OOM protection
+    if lower_path.ends_with(".xlsx")
+        || lower_path.ends_with(".xlsm")
+        || lower_path.ends_with(".xlam")
+        || lower_path.ends_with(".xltm")
+    {
+        let file = std::fs::File::open(file_path)?;
+        let workbook = calamine::Xlsx::new(std::io::BufReader::new(file))
+            .map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLSX: {}", e)))?;
+        return process_xlsx_workbook(workbook, office_metadata);
+    }
+    // For other formats, use open_workbook_auto
     let workbook = match open_workbook_auto(Path::new(file_path)) {
         Ok(wb) => wb,
         Err(calamine::Error::Io(io_err)) => {
@@ -94,7 +115,7 @@ pub fn read_excel_bytes(data: &[u8], file_extension: &str) -> Result<ExcelWorkbo
         ".xlsx" | ".xlsm" | ".xlam" | ".xltm" => {
             let workbook = calamine::Xlsx::new(cursor)
                 .map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLSX: {}", e)))?;
-            process_workbook(workbook, office_metadata)
+            process_xlsx_workbook(workbook, office_metadata)
         }
         ".xls" | ".xla" => {
             let workbook = calamine::Xls::new(cursor)
@@ -118,6 +139,194 @@ pub fn read_excel_bytes(data: &[u8], file_extension: &str) -> Result<ExcelWorkbo
     }
 }
+/// Process XLSX workbooks with special handling for pathological sparse files.
+///
+/// This function uses calamine's `worksheet_cells_reader()` API to detect sheets with
+/// extreme bounding boxes BEFORE allocating memory for the full Range. This prevents
+/// OOM when processing files like Excel Solver files that have cells at both A1 and
+/// XFD1048575, creating a bounding box of ~17 billion cells.
+fn process_xlsx_workbook<RS: Read + Seek>(
+    mut workbook: calamine::Xlsx<RS>,
+    office_metadata: Option<HashMap<String, String>>,
+) -> Result<ExcelWorkbook> {
+    let sheet_names = workbook.sheet_names();
+    let mut sheets = Vec::with_capacity(sheet_names.len());
+    for name in &sheet_names {
+        // Use worksheet_cells_reader to stream cells and detect pathological bounding boxes
+        match process_xlsx_sheet_safe(&mut workbook, name) {
+            Ok(sheet) => sheets.push(sheet),
+            Err(e) => {
+                // Log but don't fail - continue with other sheets
+                tracing::warn!("Failed to process sheet '{}': {}", name, e);
+            }
+        }
+    }
+    let metadata = extract_metadata(&workbook, &sheet_names, office_metadata);
+    Ok(ExcelWorkbook { sheets, metadata })
+}
+/// Process a single XLSX sheet safely by pre-checking the bounding box.
+///
+/// This function streams cells to compute the actual bounding box without allocating
+/// a full Range, then only creates the Range if the bounding box is within safe limits.
+fn process_xlsx_sheet_safe<RS: Read + Seek>(workbook: &mut calamine::Xlsx<RS>, sheet_name: &str) -> Result<ExcelSheet> {
+    // First pass: stream cells to compute actual bounding box and collect cell data
+    let (cells, row_min, row_max, col_min, col_max) = {
+        let mut cell_reader = workbook
+            .worksheet_cells_reader(sheet_name)
+            .map_err(|e| KreuzbergError::parsing(format!("Failed to read sheet '{}': {}", sheet_name, e)))?;
+        let mut cells: Vec<((u32, u32), Data)> = Vec::new();
+        let mut row_min = u32::MAX;
+        let mut row_max = 0u32;
+        let mut col_min = u32::MAX;
+        let mut col_max = 0u32;
+        // Stream through all cells, tracking bounds
+        while let Ok(Some(cell)) = cell_reader.next_cell() {
+            let (row, col) = cell.get_position();
+            row_min = row_min.min(row);
+            row_max = row_max.max(row);
+            col_min = col_min.min(col);
+            col_max = col_max.max(col);
+            // Convert DataRef to owned Data
+            let data: Data = match cell.get_value() {
+                DataRef::Empty => Data::Empty,
+                DataRef::String(s) => Data::String(s.clone()),
+                DataRef::SharedString(s) => Data::String(s.to_string()),
+                DataRef::Float(f) => Data::Float(*f),
+                DataRef::Int(i) => Data::Int(*i),
+                DataRef::Bool(b) => Data::Bool(*b),
+                DataRef::DateTime(dt) => Data::DateTime(*dt),
+                DataRef::DateTimeIso(s) => Data::DateTimeIso(s.clone()),
+                DataRef::DurationIso(s) => Data::DurationIso(s.clone()),
+                DataRef::Error(e) => Data::Error(e.clone()),
+            };
+            cells.push(((row, col), data));
+        }
+        (cells, row_min, row_max, col_min, col_max)
+    }; // cell_reader is dropped here, releasing the borrow
+    // Check if sheet is empty
+    if cells.is_empty() {
+        return Ok(ExcelSheet {
+            name: sheet_name.to_owned(),
+            markdown: format!("## {}\n\n*Empty sheet*", sheet_name),
+            row_count: 0,
+            col_count: 0,
+            cell_count: 0,
+            table_cells: None,
+        });
+    }
+    // Calculate bounding box size
+    let bb_rows = (row_max - row_min + 1) as u64;
+    let bb_cols = (col_max - col_min + 1) as u64;
+    let bb_cells = bb_rows.saturating_mul(bb_cols);
+    // Check for pathological bounding box
+    if bb_cells > MAX_BOUNDING_BOX_CELLS {
+        // Sheet has sparse data at extreme positions - process directly from cells
+        return process_sparse_sheet_from_cells(sheet_name, cells, row_min, row_max, col_min, col_max);
+    }
+    // Safe to create a Range - bounding box is within limits
+    // Use calamine's normal worksheet_range which will create the Range
+    let range = workbook
+        .worksheet_range(sheet_name)
+        .map_err(|e| KreuzbergError::parsing(format!("Failed to parse sheet '{}': {}", sheet_name, e)))?;
+    Ok(process_sheet(sheet_name, &range))
+}
+/// Process a sparse sheet directly from collected cells without creating a full Range.
+///
+/// This is used when the bounding box would exceed MAX_BOUNDING_BOX_CELLS.
+/// Instead of creating a dense Range, we generate markdown directly from the sparse cells.
+fn process_sparse_sheet_from_cells(
+    sheet_name: &str,
+    cells: Vec<((u32, u32), Data)>,
+    row_min: u32,
+    row_max: u32,
+    col_min: u32,
+    col_max: u32,
+) -> Result<ExcelSheet> {
+    let cell_count = cells.len();
+    let bb_rows = (row_max - row_min + 1) as usize;
+    let bb_cols = (col_max - col_min + 1) as usize;
+    // Create a warning message about the sparse data
+    let mut markdown = String::with_capacity(500 + cell_count * 50);
+    write!(
+        markdown,
+        "## {}\n\n*Note: Sheet contains sparse data spanning {} rows x {} columns ({} actual cells). \
+         Bounding box too large for dense extraction. Showing actual cell data below.*\n\n",
+        sheet_name, bb_rows, bb_cols, cell_count
+    )
+    .expect("write to String cannot fail");
+    // Group cells by row so they can be emitted in row order as a bullet list
+    let mut cells_by_row: HashMap<u32, Vec<(u32, &Data)>> = HashMap::new();
+    for ((row, col), data) in &cells {
+        cells_by_row.entry(*row).or_default().push((*col, data));
+    }
+    // Sort rows and output as simple key-value pairs
+    let mut rows: Vec<_> = cells_by_row.keys().copied().collect();
+    rows.sort_unstable();
+    // Limit output to the first 1000 non-empty cells to avoid huge output
+    let mut output_count = 0;
+    const MAX_OUTPUT_CELLS: usize = 1000;
+    for row in rows {
+        if output_count >= MAX_OUTPUT_CELLS {
+            write!(markdown, "\n... ({} more cells not shown)\n", cell_count - output_count)
+                .expect("write to String cannot fail");
+            break;
+        }
+        let mut row_cells = cells_by_row.remove(&row).unwrap_or_default();
+        row_cells.sort_by_key(|(col, _)| *col);
+        for (col, data) in row_cells {
+            if output_count >= MAX_OUTPUT_CELLS {
+                break;
+            }
+            let cell_ref = col_to_excel_letter(col);
+            let cell_str = format_cell_to_string(data);
+            if !cell_str.is_empty() {
+                writeln!(markdown, "- **{}{}**: {}", cell_ref, row + 1, cell_str).expect("write to String cannot fail");
+                output_count += 1;
+            }
+        }
+    }
+    Ok(ExcelSheet {
+        name: sheet_name.to_owned(),
+        markdown,
+        row_count: bb_rows,
+        col_count: bb_cols,
+        cell_count,
+        table_cells: None, // No structured table for sparse sheets
+    })
+}
+/// Convert a 0-indexed column number to Excel-style letter(s) (A, B, ..., Z, AA, AB, ...).
+fn col_to_excel_letter(col: u32) -> String {
+    let mut result = String::new();
+    let mut n = col + 1; // 1-indexed for calculation
+    while n > 0 {
+        n -= 1;
+        result.insert(0, (b'A' + (n % 26) as u8) as char);
+        n /= 26;
+    }
+    result
+}
 fn process_workbook<RS, R>(mut workbook: R, office_metadata: Option<HashMap<String, String>>) -> Result<ExcelWorkbook>
 where
     RS: std::io::Read + std::io::Seek,
@@ -143,7 +352,10 @@ fn process_sheet(name: &str, range: &Range<Data>) -> ExcelSheet {
     let (rows, cols) = range.get_size();
     let cell_count = range.used_cells().count();
-    let estimated_capacity = 50 + (cols * 20) + (rows * cols * 12);
+    // Fix for issue #331: Use actual cell count instead of declared dimensions
+    // to avoid OOM on sparse sheets with extreme dimensions (e.g., Excel Solver files).
+    // Declared dimensions can claim A1:XFD1048575 (~17 billion cells) while actual data is minimal.
+    let estimated_capacity = 50 + (cols * 20) + (cell_count * 12);
     if rows == 0 || cols == 0 {
         let markdown = format!("## {}\n\n*Empty sheet*", name);
@@ -176,6 +388,31 @@ fn process_sheet(name: &str, range: &Range<Data>) -> ExcelSheet {
 ///
 /// Returns (markdown, table_cells) where table_cells is a 2D vector of strings.
 fn generate_markdown_and_cells(sheet_name: &str, range: &Range<Data>, capacity: usize) -> (String, Vec<Vec<String>>) {
+    // Fix for issue #331: Protect against extreme declared dimensions.
+    // Excel Solver files can declare A1:XFD1048575 (1M+ rows) but only have ~26 actual cells.
+    // Calling range.rows().collect() would iterate ALL declared rows causing OOM.
+    const MAX_REASONABLE_ROWS: usize = 100_000; // Cap at 100K rows for safety
+    let (declared_rows, _declared_cols) = range.get_size();
+    // If declared rows exceed reasonable limit, skip processing to avoid OOM
+    if declared_rows > MAX_REASONABLE_ROWS {
+        let actual_cell_count = range.used_cells().count();
+        // If actual data is minimal compared to declared size, it's a sparse/pathological file
+        if actual_cell_count < 10_000 {
+            // Return minimal output instead of OOM
+            let result_capacity = 100 + sheet_name.len();
+            let mut result = String::with_capacity(result_capacity);
+            write!(
+                result,
+                "## {}\n\n*Sheet has extreme declared dimensions ({} rows) with minimal actual data ({} cells). Skipping to prevent OOM.*",
+                sheet_name, declared_rows, actual_cell_count
+            ).unwrap();
+            return (result, Vec::new());
+        }
+    }
     let rows: Vec<_> = range.rows().collect();
     if rows.is_empty() {
         let result_capacity = 50 + sheet_name.len();

data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs CHANGED Viewed

@@ -88,3 +88,59 @@ fn test_xlsx_minimal_metadata_extraction() {
     println!("✅ XLSX minimal metadata extraction test passed!");
 }
+/// Test for issue #331: OOM with XLSX files containing Excel Solver add-in data
+///
+/// This test reproduces the issue where Excel Solver stores configuration data
+/// in cells at extreme positions (XFD1048550-1048575 = column 16384, rows near 1M).
+/// The sheet dimension is set to "A1:XFD1048575", which could cause Kreuzberg
+/// to attempt allocating memory for ~17 billion cells (16384 × 1048575 ≈ 1.7 × 10^10).
+///
+/// Expected behavior: Should handle extreme dimensions gracefully without OOM.
+/// The file is only 6.8KB and contains minimal actual data.
+#[test]
+fn test_xlsx_excel_solver_extreme_dimensions_no_oom() {
+    let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+        .parent()
+        .expect("Operation failed")
+        .parent()
+        .expect("Operation failed");
+    let test_file = workspace_root.join("tests/fixtures/xlsx-oom-repro/kreuzberg-oom-repro.xlsx");
+    if !test_file.exists() {
+        println!("Skipping test: Test file not found at {:?}", test_file);
+        println!("Run: node tests/fixtures/xlsx-oom-repro/generate-oom-xlsx.mjs");
+        return;
+    }
+    let file_path = test_file.to_str().expect("File path should be valid UTF-8");
+    // This should NOT cause OOM even though dimension claims A1:XFD1048575
+    // The actual data is minimal (only ~26 cells with Solver metadata)
+    let result = read_excel_file(file_path).expect("Should extract XLSX with extreme dimensions without OOM");
+    // Verify we got the actual data, not a massive allocation
+    assert!(!result.sheets.is_empty(), "Should have at least one sheet");
+    // The file has normal cells A1, B1 plus Solver cells at extreme positions
+    // Verify we extracted something reasonable, not ~17 billion cells
+    let sheet = &result.sheets[0];
+    assert!(
+        sheet.markdown.len() < 10000,
+        "Sheet markdown content should be small (< 10000 chars), not massive. Got {} chars",
+        sheet.markdown.len()
+    );
+    // Verify metadata was extracted
+    assert!(
+        result.metadata.contains_key("sheet_count"),
+        "Should have sheet_count metadata"
+    );
+    println!("✅ XLSX Excel Solver extreme dimensions test passed!");
+    println!(
+        "   Sheet markdown length: {} chars (reasonable size)",
+        sheet.markdown.len()
+    );
+    println!("   Successfully handled dimension A1:XFD1048575 without OOM");
+}

data/vendor/kreuzberg-ffi/kreuzberg.h CHANGED Viewed

@@ -223,7 +223,7 @@ typedef struct CErrorDetails {
  * # Memory Layout
  *
  * Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
- * Field order: 12 pointers (8 bytes each) + 1 bool + 7 bytes padding = 104 bytes total
+ * Field order: 13 pointers (8 bytes each) + 1 bool + 7 bytes padding = 112 bytes total
  *
  * The `#[repr(C)]` attribute ensures the struct follows C's memory layout rules:
  * - Fields are laid out in order
@@ -284,6 +284,10 @@ typedef struct CExtractionResult {
    * Per-page content as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
    */
   char *pages_json;
+  /**
+   * Semantic elements as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
+   */
+  char *elements_json;
   /**
    * Whether extraction was successful
    */
@@ -1608,7 +1612,7 @@ char *kreuzberg_clone_string(const char *s);
  *
  * # Memory Layout
  *
- * This function frees all 12 string fields in CExtractionResult:
+ * This function frees all 13 string fields in CExtractionResult:
  * 1. content
  * 2. mime_type
  * 3. language
@@ -1621,6 +1625,7 @@ char *kreuzberg_clone_string(const char *s);
  * 10. images_json
  * 11. page_structure_json (FIXED: was missing before PR #3)
  * 12. pages_json (FIXED: was missing before PR #3)
+ * 13. elements_json (ADDED: for element-based extraction support)
  *
  * # Example (C)
  *

data/vendor/kreuzberg-ffi/src/helpers.rs CHANGED Viewed

@@ -67,7 +67,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
         images,
         pages,
         djot_content: _,
-        elements: _,
+        elements,
     } = result;
     let sanitized_content = if content.contains('\0') {
@@ -179,6 +179,17 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
         _ => None,
     };
+    let elements_json_guard = match elements {
+        Some(elements) if !elements.is_empty() => {
+            let json =
+                serde_json::to_string(&elements).map_err(|e| format!("Failed to serialize elements to JSON: {}", e))?;
+            Some(CStringGuard::new(CString::new(json).map_err(|e| {
+                format!("Failed to convert elements JSON to C string: {}", e)
+            })?))
+        }
+        _ => None,
+    };
     Ok(Box::into_raw(Box::new(CExtractionResult {
         content: content_guard.into_raw(),
         mime_type: mime_type_guard.into_raw(),
@@ -192,6 +203,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
         images_json: images_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
         page_structure_json: page_structure_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
         pages_json: pages_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
+        elements_json: elements_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
         success: true,
         _padding1: [0u8; 7],
     })))

data/vendor/kreuzberg-ffi/src/lib.rs CHANGED Viewed

@@ -134,8 +134,8 @@ mod tests {
         // Test size
         assert_eq!(
             std::mem::size_of::<CExtractionResult>(),
-            104,
-            "CExtractionResult must be exactly 104 bytes"
+            112,
+            "CExtractionResult must be exactly 112 bytes"
         );
         // Test alignment
@@ -197,6 +197,7 @@ mod tests {
             images_json: ptr::null_mut(),
             page_structure_json: ptr::null_mut(),
             pages_json: ptr::null_mut(),
+            elements_json: ptr::null_mut(),
             success: true,
             _padding1: [0u8; 7],
         }))
@@ -510,6 +511,7 @@ mod tests {
                 images_json: ptr::null_mut(),
                 page_structure_json: ptr::null_mut(),
                 pages_json: ptr::null_mut(),
+                elements_json: ptr::null_mut(),
                 success: true,
                 _padding1: [0u8; 7],
             }));
@@ -522,7 +524,7 @@ mod tests {
     #[test]
     fn test_extraction_result_free_all_fields_allocated() {
         unsafe {
-            // Test freeing a result where ALL 12 string fields are allocated
+            // Test freeing a result where ALL 13 string fields are allocated
             // This verifies that kreuzberg_free_result properly frees all fields
             let result = Box::into_raw(Box::new(CExtractionResult {
                 content: CString::new("test content").unwrap().into_raw(),
@@ -537,11 +539,12 @@ mod tests {
                 images_json: CString::new("[{\"data\":\"base64\"}]").unwrap().into_raw(),
                 page_structure_json: CString::new("{\"pages\":1}").unwrap().into_raw(),
                 pages_json: CString::new("[{\"page\":1,\"content\":\"test\"}]").unwrap().into_raw(),
+                elements_json: CString::new("[]").unwrap().into_raw(),
                 success: true,
                 _padding1: [0u8; 7],
             }));
-            // Should properly free all 12 allocated string fields without leaking memory
+            // Should properly free all 13 allocated string fields without leaking memory
             kreuzberg_free_result(result);
         }
     }
@@ -621,7 +624,7 @@ mod tests {
     /// Test CExtractionResult size exactly matches FFI contract
     #[test]
     fn test_c_extraction_result_size() {
-        assert_eq!(std::mem::size_of::<CExtractionResult>(), 104);
+        assert_eq!(std::mem::size_of::<CExtractionResult>(), 112);
         assert_eq!(std::mem::align_of::<CExtractionResult>(), 8);
     }

data/vendor/kreuzberg-ffi/src/memory.rs CHANGED Viewed

@@ -146,7 +146,7 @@ pub unsafe extern "C" fn kreuzberg_clone_string(s: *const c_char) -> *mut c_char
 ///
 /// # Memory Layout
 ///
-/// This function frees all 12 string fields in CExtractionResult:
+/// This function frees all 13 string fields in CExtractionResult:
 /// 1. content
 /// 2. mime_type
 /// 3. language
@@ -159,6 +159,7 @@ pub unsafe extern "C" fn kreuzberg_clone_string(s: *const c_char) -> *mut c_char
 /// 10. images_json
 /// 11. page_structure_json (FIXED: was missing before PR #3)
 /// 12. pages_json (FIXED: was missing before PR #3)
+/// 13. elements_json (ADDED: for element-based extraction support)
 ///
 /// # Example (C)
 ///
@@ -209,6 +210,9 @@ pub unsafe extern "C" fn kreuzberg_free_result(result: *mut CExtractionResult) {
         if !result_box.pages_json.is_null() {
             unsafe { drop(CString::from_raw(result_box.pages_json)) };
         }
+        if !result_box.elements_json.is_null() {
+            unsafe { drop(CString::from_raw(result_box.elements_json)) };
+        }
     }
 }
@@ -232,6 +236,7 @@ mod tests {
             images_json: CString::new("[]").unwrap().into_raw(),
             page_structure_json: CString::new("{}").unwrap().into_raw(),
             pages_json: CString::new("[]").unwrap().into_raw(),
+            elements_json: CString::new("[]").unwrap().into_raw(),
             success: true,
             _padding1: [0u8; 7],
         }))
@@ -252,6 +257,7 @@ mod tests {
             images_json: ptr::null_mut(),
             page_structure_json: ptr::null_mut(),
             pages_json: ptr::null_mut(),
+            elements_json: ptr::null_mut(),
             success: true,
             _padding1: [0u8; 7],
         }))
@@ -343,6 +349,34 @@ mod tests {
             images_json: ptr::null_mut(),
             page_structure_json: CString::new("{\"pages\": []}").unwrap().into_raw(),
             pages_json: CString::new("[{\"content\": \"page 1\"}]").unwrap().into_raw(),
+            elements_json: ptr::null_mut(),
+            success: true,
+            _padding1: [0u8; 7],
+        }));
+        unsafe { kreuzberg_free_result(result) };
+        // If we get here without crashing or leaking, the test passed
+    }
+    #[test]
+    fn test_free_result_elements_json() {
+        // Test: ensure elements_json is freed
+        let result = Box::into_raw(Box::new(CExtractionResult {
+            content: CString::new("test").unwrap().into_raw(),
+            mime_type: CString::new("text/plain").unwrap().into_raw(),
+            language: ptr::null_mut(),
+            date: ptr::null_mut(),
+            subject: ptr::null_mut(),
+            tables_json: ptr::null_mut(),
+            detected_languages_json: ptr::null_mut(),
+            metadata_json: ptr::null_mut(),
+            chunks_json: ptr::null_mut(),
+            images_json: ptr::null_mut(),
+            page_structure_json: ptr::null_mut(),
+            pages_json: ptr::null_mut(),
+            elements_json: CString::new(r#"[{"element_id":"abc","element_type":"title","text":"Hello"}]"#)
+                .unwrap()
+                .into_raw(),
             success: true,
             _padding1: [0u8; 7],
         }));

data/vendor/kreuzberg-ffi/src/types.rs CHANGED Viewed

@@ -51,7 +51,7 @@ impl Drop for CStringGuard {
 /// # Memory Layout
 ///
 /// Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
-/// Field order: 12 pointers (8 bytes each) + 1 bool + 7 bytes padding = 104 bytes total
+/// Field order: 13 pointers (8 bytes each) + 1 bool + 7 bytes padding = 112 bytes total
 ///
 /// The `#[repr(C)]` attribute ensures the struct follows C's memory layout rules:
 /// - Fields are laid out in order
@@ -88,6 +88,8 @@ pub struct CExtractionResult {
     pub page_structure_json: *mut c_char,
     /// Per-page content as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
     pub pages_json: *mut c_char,
+    /// Semantic elements as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
+    pub elements_json: *mut c_char,
     /// Whether extraction was successful
     pub success: bool,
     /// Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
@@ -150,7 +152,7 @@ pub struct CBatchResult {
 const _: () = {
     const fn assert_c_extraction_result_size() {
         const SIZE: usize = std::mem::size_of::<CExtractionResult>();
-        const _: () = assert!(SIZE == 104, "CExtractionResult size must be 104 bytes");
+        const _: () = assert!(SIZE == 112, "CExtractionResult size must be 112 bytes");
     }
     const fn assert_c_extraction_result_alignment() {
@@ -195,8 +197,8 @@ mod tests {
     fn test_c_extraction_result_size() {
         assert_eq!(
             std::mem::size_of::<CExtractionResult>(),
-            104,
-            "CExtractionResult must be exactly 104 bytes"
+            112,
+            "CExtractionResult must be exactly 112 bytes"
         );
     }
@@ -327,7 +329,8 @@ mod tests {
         assert_eq!(offset_of!(CExtractionResult, images_json), 72);
         assert_eq!(offset_of!(CExtractionResult, page_structure_json), 80);
         assert_eq!(offset_of!(CExtractionResult, pages_json), 88);
-        assert_eq!(offset_of!(CExtractionResult, success), 96);
+        assert_eq!(offset_of!(CExtractionResult, elements_json), 96);
+        assert_eq!(offset_of!(CExtractionResult, success), 104);
     }
     /// Verify field offsets in CBatchResult match expectations

data/vendor/kreuzberg-tesseract/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg-tesseract"
-version = "4.2.1"
+version = "4.2.2"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kreuzberg
 version: !ruby/object:Gem::Version
-  version: 4.2.1
+  version: 4.2.2
 platform: ruby
 authors:
 - Na'aman Hirschfeld
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-01-27 00:00:00.000000000 Z
+date: 2026-01-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -363,6 +363,7 @@ files:
 - vendor/kreuzberg/src/api/error.rs
 - vendor/kreuzberg/src/api/handlers.rs
 - vendor/kreuzberg/src/api/mod.rs
+- vendor/kreuzberg/src/api/openapi.rs
 - vendor/kreuzberg/src/api/router.rs
 - vendor/kreuzberg/src/api/startup.rs
 - vendor/kreuzberg/src/api/types.rs