kreuzberg 4.2.1 → 4.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,16 +27,23 @@
27
27
  //! # Ok(())
28
28
  //! # }
29
29
  //! ```
30
- use calamine::{Data, Range, Reader, open_workbook_auto};
30
+ use calamine::{Data, DataRef, Range, Reader, open_workbook_auto};
31
31
  use std::collections::HashMap;
32
32
  use std::fmt::Write as FmtWrite;
33
- use std::io::Cursor;
33
+ use std::io::{Cursor, Read, Seek};
34
34
  use std::path::Path;
35
35
 
36
36
  use crate::error::{KreuzbergError, Result};
37
37
  use crate::extraction::capacity;
38
38
  use crate::types::{ExcelSheet, ExcelWorkbook};
39
39
 
40
+ /// Maximum number of cells in a Range's bounding box before we consider it pathological.
41
+ /// This threshold is set to prevent OOM when processing files with sparse data at extreme
42
+ /// positions (e.g., Excel Solver files that have cells at A1 and XFD1048575).
43
+ ///
44
+ /// 100 million cells at ~64 bytes each = ~6.4 GB, which is a reasonable upper limit.
45
+ const MAX_BOUNDING_BOX_CELLS: u64 = 100_000_000;
46
+
40
47
  #[cfg(feature = "office")]
41
48
  use crate::extraction::office_metadata::{
42
49
  extract_core_properties, extract_custom_properties, extract_xlsx_app_properties,
@@ -45,11 +52,13 @@ use crate::extraction::office_metadata::{
45
52
  use serde_json::Value;
46
53
 
47
54
  pub fn read_excel_file(file_path: &str) -> Result<ExcelWorkbook> {
55
+ let lower_path = file_path.to_lowercase();
56
+
48
57
  #[cfg(feature = "office")]
49
- let office_metadata = if file_path.to_lowercase().ends_with(".xlsx")
50
- || file_path.to_lowercase().ends_with(".xlsm")
51
- || file_path.to_lowercase().ends_with(".xlam")
52
- || file_path.to_lowercase().ends_with(".xltm")
58
+ let office_metadata = if lower_path.ends_with(".xlsx")
59
+ || lower_path.ends_with(".xlsm")
60
+ || lower_path.ends_with(".xlam")
61
+ || lower_path.ends_with(".xltm")
53
62
  {
54
63
  extract_xlsx_office_metadata_from_file(file_path).ok()
55
64
  } else {
@@ -59,7 +68,19 @@ pub fn read_excel_file(file_path: &str) -> Result<ExcelWorkbook> {
59
68
  #[cfg(not(feature = "office"))]
60
69
  let office_metadata: Option<HashMap<String, String>> = None;
61
70
 
62
- // We analyze the error and only wrap format errors, letting real IO errors bubble up ~keep
71
+ // For XLSX files, use specialized handler with OOM protection
72
+ if lower_path.ends_with(".xlsx")
73
+ || lower_path.ends_with(".xlsm")
74
+ || lower_path.ends_with(".xlam")
75
+ || lower_path.ends_with(".xltm")
76
+ {
77
+ let file = std::fs::File::open(file_path)?;
78
+ let workbook = calamine::Xlsx::new(std::io::BufReader::new(file))
79
+ .map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLSX: {}", e)))?;
80
+ return process_xlsx_workbook(workbook, office_metadata);
81
+ }
82
+
83
+ // For other formats, use open_workbook_auto
63
84
  let workbook = match open_workbook_auto(Path::new(file_path)) {
64
85
  Ok(wb) => wb,
65
86
  Err(calamine::Error::Io(io_err)) => {
@@ -94,7 +115,7 @@ pub fn read_excel_bytes(data: &[u8], file_extension: &str) -> Result<ExcelWorkbo
94
115
  ".xlsx" | ".xlsm" | ".xlam" | ".xltm" => {
95
116
  let workbook = calamine::Xlsx::new(cursor)
96
117
  .map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLSX: {}", e)))?;
97
- process_workbook(workbook, office_metadata)
118
+ process_xlsx_workbook(workbook, office_metadata)
98
119
  }
99
120
  ".xls" | ".xla" => {
100
121
  let workbook = calamine::Xls::new(cursor)
@@ -118,6 +139,194 @@ pub fn read_excel_bytes(data: &[u8], file_extension: &str) -> Result<ExcelWorkbo
118
139
  }
119
140
  }
120
141
 
142
+ /// Process XLSX workbooks with special handling for pathological sparse files.
143
+ ///
144
+ /// This function uses calamine's `worksheet_cells_reader()` API to detect sheets with
145
+ /// extreme bounding boxes BEFORE allocating memory for the full Range. This prevents
146
+ /// OOM when processing files like Excel Solver files that have cells at both A1 and
147
+ /// XFD1048575, creating a bounding box of ~17 billion cells.
148
+ fn process_xlsx_workbook<RS: Read + Seek>(
149
+ mut workbook: calamine::Xlsx<RS>,
150
+ office_metadata: Option<HashMap<String, String>>,
151
+ ) -> Result<ExcelWorkbook> {
152
+ let sheet_names = workbook.sheet_names();
153
+ let mut sheets = Vec::with_capacity(sheet_names.len());
154
+
155
+ for name in &sheet_names {
156
+ // Use worksheet_cells_reader to stream cells and detect pathological bounding boxes
157
+ match process_xlsx_sheet_safe(&mut workbook, name) {
158
+ Ok(sheet) => sheets.push(sheet),
159
+ Err(e) => {
160
+ // Log but don't fail - continue with other sheets
161
+ tracing::warn!("Failed to process sheet '{}': {}", name, e);
162
+ }
163
+ }
164
+ }
165
+
166
+ let metadata = extract_metadata(&workbook, &sheet_names, office_metadata);
167
+ Ok(ExcelWorkbook { sheets, metadata })
168
+ }
169
+
170
+ /// Process a single XLSX sheet safely by pre-checking the bounding box.
171
+ ///
172
+ /// This function streams cells to compute the actual bounding box without allocating
173
+ /// a full Range, then only creates the Range if the bounding box is within safe limits.
174
+ fn process_xlsx_sheet_safe<RS: Read + Seek>(workbook: &mut calamine::Xlsx<RS>, sheet_name: &str) -> Result<ExcelSheet> {
175
+ // First pass: stream cells to compute actual bounding box and collect cell data
176
+ let (cells, row_min, row_max, col_min, col_max) = {
177
+ let mut cell_reader = workbook
178
+ .worksheet_cells_reader(sheet_name)
179
+ .map_err(|e| KreuzbergError::parsing(format!("Failed to read sheet '{}': {}", sheet_name, e)))?;
180
+
181
+ let mut cells: Vec<((u32, u32), Data)> = Vec::new();
182
+ let mut row_min = u32::MAX;
183
+ let mut row_max = 0u32;
184
+ let mut col_min = u32::MAX;
185
+ let mut col_max = 0u32;
186
+
187
+ // Stream through all cells, tracking bounds
188
+ while let Ok(Some(cell)) = cell_reader.next_cell() {
189
+ let (row, col) = cell.get_position();
190
+ row_min = row_min.min(row);
191
+ row_max = row_max.max(row);
192
+ col_min = col_min.min(col);
193
+ col_max = col_max.max(col);
194
+
195
+ // Convert DataRef to owned Data
196
+ let data: Data = match cell.get_value() {
197
+ DataRef::Empty => Data::Empty,
198
+ DataRef::String(s) => Data::String(s.clone()),
199
+ DataRef::SharedString(s) => Data::String(s.to_string()),
200
+ DataRef::Float(f) => Data::Float(*f),
201
+ DataRef::Int(i) => Data::Int(*i),
202
+ DataRef::Bool(b) => Data::Bool(*b),
203
+ DataRef::DateTime(dt) => Data::DateTime(*dt),
204
+ DataRef::DateTimeIso(s) => Data::DateTimeIso(s.clone()),
205
+ DataRef::DurationIso(s) => Data::DurationIso(s.clone()),
206
+ DataRef::Error(e) => Data::Error(e.clone()),
207
+ };
208
+ cells.push(((row, col), data));
209
+ }
210
+ (cells, row_min, row_max, col_min, col_max)
211
+ }; // cell_reader is dropped here, releasing the borrow
212
+
213
+ // Check if sheet is empty
214
+ if cells.is_empty() {
215
+ return Ok(ExcelSheet {
216
+ name: sheet_name.to_owned(),
217
+ markdown: format!("## {}\n\n*Empty sheet*", sheet_name),
218
+ row_count: 0,
219
+ col_count: 0,
220
+ cell_count: 0,
221
+ table_cells: None,
222
+ });
223
+ }
224
+
225
+ // Calculate bounding box size
226
+ let bb_rows = (row_max - row_min + 1) as u64;
227
+ let bb_cols = (col_max - col_min + 1) as u64;
228
+ let bb_cells = bb_rows.saturating_mul(bb_cols);
229
+
230
+ // Check for pathological bounding box
231
+ if bb_cells > MAX_BOUNDING_BOX_CELLS {
232
+ // Sheet has sparse data at extreme positions - process directly from cells
233
+ return process_sparse_sheet_from_cells(sheet_name, cells, row_min, row_max, col_min, col_max);
234
+ }
235
+
236
+ // Safe to create a Range - bounding box is within limits
237
+ // Use calamine's normal worksheet_range which will create the Range
238
+ let range = workbook
239
+ .worksheet_range(sheet_name)
240
+ .map_err(|e| KreuzbergError::parsing(format!("Failed to parse sheet '{}': {}", sheet_name, e)))?;
241
+
242
+ Ok(process_sheet(sheet_name, &range))
243
+ }
244
+
245
+ /// Process a sparse sheet directly from collected cells without creating a full Range.
246
+ ///
247
+ /// This is used when the bounding box would exceed MAX_BOUNDING_BOX_CELLS.
248
+ /// Instead of creating a dense Range, we generate markdown directly from the sparse cells.
249
+ fn process_sparse_sheet_from_cells(
250
+ sheet_name: &str,
251
+ cells: Vec<((u32, u32), Data)>,
252
+ row_min: u32,
253
+ row_max: u32,
254
+ col_min: u32,
255
+ col_max: u32,
256
+ ) -> Result<ExcelSheet> {
257
+ let cell_count = cells.len();
258
+ let bb_rows = (row_max - row_min + 1) as usize;
259
+ let bb_cols = (col_max - col_min + 1) as usize;
260
+
261
+ // Create a warning message about the sparse data
262
+ let mut markdown = String::with_capacity(500 + cell_count * 50);
263
+ write!(
264
+ markdown,
265
+ "## {}\n\n*Note: Sheet contains sparse data spanning {} rows x {} columns ({} actual cells). \
266
+ Bounding box too large for dense extraction. Showing actual cell data below.*\n\n",
267
+ sheet_name, bb_rows, bb_cols, cell_count
268
+ )
269
+ .expect("write to String cannot fail");
270
+
271
+ // Group cells by row for tabular display
272
+ let mut cells_by_row: HashMap<u32, Vec<(u32, &Data)>> = HashMap::new();
273
+ for ((row, col), data) in &cells {
274
+ cells_by_row.entry(*row).or_default().push((*col, data));
275
+ }
276
+
277
+ // Sort rows and output as simple key-value pairs
278
+ let mut rows: Vec<_> = cells_by_row.keys().copied().collect();
279
+ rows.sort_unstable();
280
+
281
+ // Limit output to first 1000 cells to avoid huge output
282
+ let mut output_count = 0;
283
+ const MAX_OUTPUT_CELLS: usize = 1000;
284
+
285
+ for row in rows {
286
+ if output_count >= MAX_OUTPUT_CELLS {
287
+ write!(markdown, "\n... ({} more cells not shown)\n", cell_count - output_count)
288
+ .expect("write to String cannot fail");
289
+ break;
290
+ }
291
+
292
+ let mut row_cells = cells_by_row.remove(&row).unwrap_or_default();
293
+ row_cells.sort_by_key(|(col, _)| *col);
294
+
295
+ for (col, data) in row_cells {
296
+ if output_count >= MAX_OUTPUT_CELLS {
297
+ break;
298
+ }
299
+ let cell_ref = col_to_excel_letter(col);
300
+ let cell_str = format_cell_to_string(data);
301
+ if !cell_str.is_empty() {
302
+ writeln!(markdown, "- **{}{}**: {}", cell_ref, row + 1, cell_str).expect("write to String cannot fail");
303
+ output_count += 1;
304
+ }
305
+ }
306
+ }
307
+
308
+ Ok(ExcelSheet {
309
+ name: sheet_name.to_owned(),
310
+ markdown,
311
+ row_count: bb_rows,
312
+ col_count: bb_cols,
313
+ cell_count,
314
+ table_cells: None, // No structured table for sparse sheets
315
+ })
316
+ }
317
+
318
/// Convert a 0-indexed column number to Excel-style letter(s) (A, B, ..., Z, AA, AB, ...).
///
/// Column letters form a bijective base-26 numeral: `0 -> "A"`, `25 -> "Z"`, `26 -> "AA"`,
/// `16383 -> "XFD"`. Every `u32` input is supported, including `u32::MAX`.
fn col_to_excel_letter(col: u32) -> String {
    // Widen before switching to 1-based numbering: `col + 1` overflows u32 for u32::MAX.
    let mut n = u64::from(col) + 1;

    // Digits come out least-significant first; 7 letters are enough for any u32 column.
    let mut letters: Vec<u8> = Vec::with_capacity(7);
    while n > 0 {
        n -= 1;
        letters.push(b'A' + (n % 26) as u8);
        n /= 26;
    }

    letters.iter().rev().map(|&b| b as char).collect()
}
329
+
121
330
  fn process_workbook<RS, R>(mut workbook: R, office_metadata: Option<HashMap<String, String>>) -> Result<ExcelWorkbook>
122
331
  where
123
332
  RS: std::io::Read + std::io::Seek,
@@ -143,7 +352,10 @@ fn process_sheet(name: &str, range: &Range<Data>) -> ExcelSheet {
143
352
  let (rows, cols) = range.get_size();
144
353
  let cell_count = range.used_cells().count();
145
354
 
146
- let estimated_capacity = 50 + (cols * 20) + (rows * cols * 12);
355
+ // Fix for issue #331: Use actual cell count instead of declared dimensions
356
+ // to avoid OOM on sparse sheets with extreme dimensions (e.g., Excel Solver files).
357
+ // Declared dimensions can claim A1:XFD1048575 (~17 billion cells) while actual data is minimal.
358
+ let estimated_capacity = 50 + (cols * 20) + (cell_count * 12);
147
359
 
148
360
  if rows == 0 || cols == 0 {
149
361
  let markdown = format!("## {}\n\n*Empty sheet*", name);
@@ -176,6 +388,31 @@ fn process_sheet(name: &str, range: &Range<Data>) -> ExcelSheet {
176
388
  ///
177
389
  /// Returns (markdown, table_cells) where table_cells is a 2D vector of strings.
178
390
  fn generate_markdown_and_cells(sheet_name: &str, range: &Range<Data>, capacity: usize) -> (String, Vec<Vec<String>>) {
391
+ // Fix for issue #331: Protect against extreme declared dimensions.
392
+ // Excel Solver files can declare A1:XFD1048575 (1M+ rows) but only have ~26 actual cells.
393
+ // Calling range.rows().collect() would iterate ALL declared rows causing OOM.
394
+ const MAX_REASONABLE_ROWS: usize = 100_000; // Cap at 100K rows for safety
395
+
396
+ let (declared_rows, _declared_cols) = range.get_size();
397
+
398
+ // If declared rows exceed reasonable limit, skip processing to avoid OOM
399
+ if declared_rows > MAX_REASONABLE_ROWS {
400
+ let actual_cell_count = range.used_cells().count();
401
+
402
+ // If actual data is minimal compared to declared size, it's a sparse/pathological file
403
+ if actual_cell_count < 10_000 {
404
+ // Return minimal output instead of OOM
405
+ let result_capacity = 100 + sheet_name.len();
406
+ let mut result = String::with_capacity(result_capacity);
407
+ write!(
408
+ result,
409
+ "## {}\n\n*Sheet has extreme declared dimensions ({} rows) with minimal actual data ({} cells). Skipping to prevent OOM.*",
410
+ sheet_name, declared_rows, actual_cell_count
411
+ ).unwrap();
412
+ return (result, Vec::new());
413
+ }
414
+ }
415
+
179
416
  let rows: Vec<_> = range.rows().collect();
180
417
  if rows.is_empty() {
181
418
  let result_capacity = 50 + sheet_name.len();
@@ -88,3 +88,59 @@ fn test_xlsx_minimal_metadata_extraction() {
88
88
 
89
89
  println!("✅ XLSX minimal metadata extraction test passed!");
90
90
  }
91
+
92
+ /// Test for issue #331: OOM with XLSX files containing Excel Solver add-in data
93
+ ///
94
+ /// This test reproduces the issue where Excel Solver stores configuration data
95
+ /// in cells at extreme positions (XFD1048550-1048575 = column 16384, rows near 1M).
96
+ /// The sheet dimension is set to "A1:XFD1048575", which could cause Kreuzberg
97
+ /// to attempt allocating memory for ~17 billion cells (16384 × 1048575).
98
+ ///
99
+ /// Expected behavior: Should handle extreme dimensions gracefully without OOM.
100
+ /// The file is only 6.8KB and contains minimal actual data.
101
+ #[test]
102
+ fn test_xlsx_excel_solver_extreme_dimensions_no_oom() {
103
+ let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
104
+ .parent()
105
+ .expect("Operation failed")
106
+ .parent()
107
+ .expect("Operation failed");
108
+ let test_file = workspace_root.join("tests/fixtures/xlsx-oom-repro/kreuzberg-oom-repro.xlsx");
109
+
110
+ if !test_file.exists() {
111
+ println!("Skipping test: Test file not found at {:?}", test_file);
112
+ println!("Run: node tests/fixtures/xlsx-oom-repro/generate-oom-xlsx.mjs");
113
+ return;
114
+ }
115
+
116
+ let file_path = test_file.to_str().expect("File path should be valid UTF-8");
117
+
118
+ // This should NOT cause OOM even though dimension claims A1:XFD1048575
119
+ // The actual data is minimal (only ~26 cells with Solver metadata)
120
+ let result = read_excel_file(file_path).expect("Should extract XLSX with extreme dimensions without OOM");
121
+
122
+ // Verify we got the actual data, not a massive allocation
123
+ assert!(!result.sheets.is_empty(), "Should have at least one sheet");
124
+
125
+ // The file has normal cells A1, B1 plus Solver cells at extreme positions
126
+ // Verify we extracted something reasonable, not 17 billion cells
127
+ let sheet = &result.sheets[0];
128
+ assert!(
129
+ sheet.markdown.len() < 10000,
130
+ "Sheet markdown content should be small (< 10000 chars), not massive. Got {} chars",
131
+ sheet.markdown.len()
132
+ );
133
+
134
+ // Verify metadata was extracted
135
+ assert!(
136
+ result.metadata.contains_key("sheet_count"),
137
+ "Should have sheet_count metadata"
138
+ );
139
+
140
+ println!("✅ XLSX Excel Solver extreme dimensions test passed!");
141
+ println!(
142
+ " Sheet markdown length: {} chars (reasonable size)",
143
+ sheet.markdown.len()
144
+ );
145
+ println!(" Successfully handled dimension A1:XFD1048575 without OOM");
146
+ }
@@ -223,7 +223,7 @@ typedef struct CErrorDetails {
223
223
  * # Memory Layout
224
224
  *
225
225
  * Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
226
- * Field order: 12 pointers (8 bytes each) + 1 bool + 7 bytes padding = 104 bytes total
226
+ * Field order: 13 pointers (8 bytes each) + 1 bool + 7 bytes padding = 112 bytes total
227
227
  *
228
228
  * The `#[repr(C)]` attribute ensures the struct follows C's memory layout rules:
229
229
  * - Fields are laid out in order
@@ -284,6 +284,10 @@ typedef struct CExtractionResult {
284
284
  * Per-page content as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
285
285
  */
286
286
  char *pages_json;
287
+ /**
288
+ * Semantic elements as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
289
+ */
290
+ char *elements_json;
287
291
  /**
288
292
  * Whether extraction was successful
289
293
  */
@@ -1608,7 +1612,7 @@ char *kreuzberg_clone_string(const char *s);
1608
1612
  *
1609
1613
  * # Memory Layout
1610
1614
  *
1611
- * This function frees all 12 string fields in CExtractionResult:
1615
+ * This function frees all 13 string fields in CExtractionResult:
1612
1616
  * 1. content
1613
1617
  * 2. mime_type
1614
1618
  * 3. language
@@ -1621,6 +1625,7 @@ char *kreuzberg_clone_string(const char *s);
1621
1625
  * 10. images_json
1622
1626
  * 11. page_structure_json (FIXED: was missing before PR #3)
1623
1627
  * 12. pages_json (FIXED: was missing before PR #3)
1628
+ * 13. elements_json (ADDED: for element-based extraction support)
1624
1629
  *
1625
1630
  * # Example (C)
1626
1631
  *
@@ -67,7 +67,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
67
67
  images,
68
68
  pages,
69
69
  djot_content: _,
70
- elements: _,
70
+ elements,
71
71
  } = result;
72
72
 
73
73
  let sanitized_content = if content.contains('\0') {
@@ -179,6 +179,17 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
179
179
  _ => None,
180
180
  };
181
181
 
182
+ let elements_json_guard = match elements {
183
+ Some(elements) if !elements.is_empty() => {
184
+ let json =
185
+ serde_json::to_string(&elements).map_err(|e| format!("Failed to serialize elements to JSON: {}", e))?;
186
+ Some(CStringGuard::new(CString::new(json).map_err(|e| {
187
+ format!("Failed to convert elements JSON to C string: {}", e)
188
+ })?))
189
+ }
190
+ _ => None,
191
+ };
192
+
182
193
  Ok(Box::into_raw(Box::new(CExtractionResult {
183
194
  content: content_guard.into_raw(),
184
195
  mime_type: mime_type_guard.into_raw(),
@@ -192,6 +203,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
192
203
  images_json: images_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
193
204
  page_structure_json: page_structure_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
194
205
  pages_json: pages_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
206
+ elements_json: elements_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
195
207
  success: true,
196
208
  _padding1: [0u8; 7],
197
209
  })))
@@ -134,8 +134,8 @@ mod tests {
134
134
  // Test size
135
135
  assert_eq!(
136
136
  std::mem::size_of::<CExtractionResult>(),
137
- 104,
138
- "CExtractionResult must be exactly 104 bytes"
137
+ 112,
138
+ "CExtractionResult must be exactly 112 bytes"
139
139
  );
140
140
 
141
141
  // Test alignment
@@ -197,6 +197,7 @@ mod tests {
197
197
  images_json: ptr::null_mut(),
198
198
  page_structure_json: ptr::null_mut(),
199
199
  pages_json: ptr::null_mut(),
200
+ elements_json: ptr::null_mut(),
200
201
  success: true,
201
202
  _padding1: [0u8; 7],
202
203
  }))
@@ -510,6 +511,7 @@ mod tests {
510
511
  images_json: ptr::null_mut(),
511
512
  page_structure_json: ptr::null_mut(),
512
513
  pages_json: ptr::null_mut(),
514
+ elements_json: ptr::null_mut(),
513
515
  success: true,
514
516
  _padding1: [0u8; 7],
515
517
  }));
@@ -522,7 +524,7 @@ mod tests {
522
524
  #[test]
523
525
  fn test_extraction_result_free_all_fields_allocated() {
524
526
  unsafe {
525
- // Test freeing a result where ALL 12 string fields are allocated
527
+ // Test freeing a result where ALL 13 string fields are allocated
526
528
  // This verifies that kreuzberg_free_result properly frees all fields
527
529
  let result = Box::into_raw(Box::new(CExtractionResult {
528
530
  content: CString::new("test content").unwrap().into_raw(),
@@ -537,11 +539,12 @@ mod tests {
537
539
  images_json: CString::new("[{\"data\":\"base64\"}]").unwrap().into_raw(),
538
540
  page_structure_json: CString::new("{\"pages\":1}").unwrap().into_raw(),
539
541
  pages_json: CString::new("[{\"page\":1,\"content\":\"test\"}]").unwrap().into_raw(),
542
+ elements_json: CString::new("[]").unwrap().into_raw(),
540
543
  success: true,
541
544
  _padding1: [0u8; 7],
542
545
  }));
543
546
 
544
- // Should properly free all 12 allocated string fields without leaking memory
547
+ // Should properly free all 13 allocated string fields without leaking memory
545
548
  kreuzberg_free_result(result);
546
549
  }
547
550
  }
@@ -621,7 +624,7 @@ mod tests {
621
624
  /// Test CExtractionResult size exactly matches FFI contract
622
625
  #[test]
623
626
  fn test_c_extraction_result_size() {
624
- assert_eq!(std::mem::size_of::<CExtractionResult>(), 104);
627
+ assert_eq!(std::mem::size_of::<CExtractionResult>(), 112);
625
628
  assert_eq!(std::mem::align_of::<CExtractionResult>(), 8);
626
629
  }
627
630
 
@@ -146,7 +146,7 @@ pub unsafe extern "C" fn kreuzberg_clone_string(s: *const c_char) -> *mut c_char
146
146
  ///
147
147
  /// # Memory Layout
148
148
  ///
149
- /// This function frees all 12 string fields in CExtractionResult:
149
+ /// This function frees all 13 string fields in CExtractionResult:
150
150
  /// 1. content
151
151
  /// 2. mime_type
152
152
  /// 3. language
@@ -159,6 +159,7 @@ pub unsafe extern "C" fn kreuzberg_clone_string(s: *const c_char) -> *mut c_char
159
159
  /// 10. images_json
160
160
  /// 11. page_structure_json (FIXED: was missing before PR #3)
161
161
  /// 12. pages_json (FIXED: was missing before PR #3)
162
+ /// 13. elements_json (ADDED: for element-based extraction support)
162
163
  ///
163
164
  /// # Example (C)
164
165
  ///
@@ -209,6 +210,9 @@ pub unsafe extern "C" fn kreuzberg_free_result(result: *mut CExtractionResult) {
209
210
  if !result_box.pages_json.is_null() {
210
211
  unsafe { drop(CString::from_raw(result_box.pages_json)) };
211
212
  }
213
+ if !result_box.elements_json.is_null() {
214
+ unsafe { drop(CString::from_raw(result_box.elements_json)) };
215
+ }
212
216
  }
213
217
  }
214
218
 
@@ -232,6 +236,7 @@ mod tests {
232
236
  images_json: CString::new("[]").unwrap().into_raw(),
233
237
  page_structure_json: CString::new("{}").unwrap().into_raw(),
234
238
  pages_json: CString::new("[]").unwrap().into_raw(),
239
+ elements_json: CString::new("[]").unwrap().into_raw(),
235
240
  success: true,
236
241
  _padding1: [0u8; 7],
237
242
  }))
@@ -252,6 +257,7 @@ mod tests {
252
257
  images_json: ptr::null_mut(),
253
258
  page_structure_json: ptr::null_mut(),
254
259
  pages_json: ptr::null_mut(),
260
+ elements_json: ptr::null_mut(),
255
261
  success: true,
256
262
  _padding1: [0u8; 7],
257
263
  }))
@@ -343,6 +349,34 @@ mod tests {
343
349
  images_json: ptr::null_mut(),
344
350
  page_structure_json: CString::new("{\"pages\": []}").unwrap().into_raw(),
345
351
  pages_json: CString::new("[{\"content\": \"page 1\"}]").unwrap().into_raw(),
352
+ elements_json: ptr::null_mut(),
353
+ success: true,
354
+ _padding1: [0u8; 7],
355
+ }));
356
+
357
+ unsafe { kreuzberg_free_result(result) };
358
+ // If we get here without crashing or leaking, the test passed
359
+ }
360
+
361
+ #[test]
362
+ fn test_free_result_elements_json() {
363
+ // Test: ensure elements_json is freed
364
+ let result = Box::into_raw(Box::new(CExtractionResult {
365
+ content: CString::new("test").unwrap().into_raw(),
366
+ mime_type: CString::new("text/plain").unwrap().into_raw(),
367
+ language: ptr::null_mut(),
368
+ date: ptr::null_mut(),
369
+ subject: ptr::null_mut(),
370
+ tables_json: ptr::null_mut(),
371
+ detected_languages_json: ptr::null_mut(),
372
+ metadata_json: ptr::null_mut(),
373
+ chunks_json: ptr::null_mut(),
374
+ images_json: ptr::null_mut(),
375
+ page_structure_json: ptr::null_mut(),
376
+ pages_json: ptr::null_mut(),
377
+ elements_json: CString::new(r#"[{"element_id":"abc","element_type":"title","text":"Hello"}]"#)
378
+ .unwrap()
379
+ .into_raw(),
346
380
  success: true,
347
381
  _padding1: [0u8; 7],
348
382
  }));
@@ -51,7 +51,7 @@ impl Drop for CStringGuard {
51
51
  /// # Memory Layout
52
52
  ///
53
53
  /// Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
54
- /// Field order: 12 pointers (8 bytes each) + 1 bool + 7 bytes padding = 104 bytes total
54
+ /// Field order: 13 pointers (8 bytes each) + 1 bool + 7 bytes padding = 112 bytes total
55
55
  ///
56
56
  /// The `#[repr(C)]` attribute ensures the struct follows C's memory layout rules:
57
57
  /// - Fields are laid out in order
@@ -88,6 +88,8 @@ pub struct CExtractionResult {
88
88
  pub page_structure_json: *mut c_char,
89
89
  /// Per-page content as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
90
90
  pub pages_json: *mut c_char,
91
+ /// Semantic elements as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
92
+ pub elements_json: *mut c_char,
91
93
  /// Whether extraction was successful
92
94
  pub success: bool,
93
95
  /// Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
@@ -150,7 +152,7 @@ pub struct CBatchResult {
150
152
  const _: () = {
151
153
  const fn assert_c_extraction_result_size() {
152
154
  const SIZE: usize = std::mem::size_of::<CExtractionResult>();
153
- const _: () = assert!(SIZE == 104, "CExtractionResult size must be 104 bytes");
155
+ const _: () = assert!(SIZE == 112, "CExtractionResult size must be 112 bytes");
154
156
  }
155
157
 
156
158
  const fn assert_c_extraction_result_alignment() {
@@ -195,8 +197,8 @@ mod tests {
195
197
  fn test_c_extraction_result_size() {
196
198
  assert_eq!(
197
199
  std::mem::size_of::<CExtractionResult>(),
198
- 104,
199
- "CExtractionResult must be exactly 104 bytes"
200
+ 112,
201
+ "CExtractionResult must be exactly 112 bytes"
200
202
  );
201
203
  }
202
204
 
@@ -327,7 +329,8 @@ mod tests {
327
329
  assert_eq!(offset_of!(CExtractionResult, images_json), 72);
328
330
  assert_eq!(offset_of!(CExtractionResult, page_structure_json), 80);
329
331
  assert_eq!(offset_of!(CExtractionResult, pages_json), 88);
330
- assert_eq!(offset_of!(CExtractionResult, success), 96);
332
+ assert_eq!(offset_of!(CExtractionResult, elements_json), 96);
333
+ assert_eq!(offset_of!(CExtractionResult, success), 104);
331
334
  }
332
335
 
333
336
  /// Verify field offsets in CBatchResult match expectations
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.2.1"
3
+ version = "4.2.2"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.2.1
4
+ version: 4.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-01-27 00:00:00.000000000 Z
11
+ date: 2026-01-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -363,6 +363,7 @@ files:
363
363
  - vendor/kreuzberg/src/api/error.rs
364
364
  - vendor/kreuzberg/src/api/handlers.rs
365
365
  - vendor/kreuzberg/src/api/mod.rs
366
+ - vendor/kreuzberg/src/api/openapi.rs
366
367
  - vendor/kreuzberg/src/api/router.rs
367
368
  - vendor/kreuzberg/src/api/startup.rs
368
369
  - vendor/kreuzberg/src/api/types.rs