kreuzberg 4.2.1 → 4.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/lib/kreuzberg/config.rb +4 -20
- data/lib/kreuzberg/version.rb +1 -1
- data/spec/binding/config_spec.rb +1 -1
- data/spec/unit/config/extraction_config_spec.rb +2 -2
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -2
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +60 -0
- data/vendor/kreuzberg/src/api/handlers.rs +153 -32
- data/vendor/kreuzberg/src/api/mod.rs +2 -0
- data/vendor/kreuzberg/src/api/openapi.rs +141 -0
- data/vendor/kreuzberg/src/api/router.rs +24 -2
- data/vendor/kreuzberg/src/api/startup.rs +11 -5
- data/vendor/kreuzberg/src/api/types.rs +50 -4
- data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
- data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
- data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
- data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
- data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
- data/vendor/kreuzberg-ffi/src/types.rs +8 -5
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +3 -2
|
@@ -27,16 +27,23 @@
|
|
|
27
27
|
//! # Ok(())
|
|
28
28
|
//! # }
|
|
29
29
|
//! ```
|
|
30
|
-
use calamine::{Data, Range, Reader, open_workbook_auto};
|
|
30
|
+
use calamine::{Data, DataRef, Range, Reader, open_workbook_auto};
|
|
31
31
|
use std::collections::HashMap;
|
|
32
32
|
use std::fmt::Write as FmtWrite;
|
|
33
|
-
use std::io::Cursor;
|
|
33
|
+
use std::io::{Cursor, Read, Seek};
|
|
34
34
|
use std::path::Path;
|
|
35
35
|
|
|
36
36
|
use crate::error::{KreuzbergError, Result};
|
|
37
37
|
use crate::extraction::capacity;
|
|
38
38
|
use crate::types::{ExcelSheet, ExcelWorkbook};
|
|
39
39
|
|
|
40
|
+
/// Maximum number of cells in a Range's bounding box before we consider it pathological.
|
|
41
|
+
/// This threshold is set to prevent OOM when processing files with sparse data at extreme
|
|
42
|
+
/// positions (e.g., Excel Solver files that have cells at A1 and XFD1048575).
|
|
43
|
+
///
|
|
44
|
+
/// 100 million cells at ~64 bytes each = ~6.4 GB, which is a reasonable upper limit.
|
|
45
|
+
const MAX_BOUNDING_BOX_CELLS: u64 = 100_000_000;
|
|
46
|
+
|
|
40
47
|
#[cfg(feature = "office")]
|
|
41
48
|
use crate::extraction::office_metadata::{
|
|
42
49
|
extract_core_properties, extract_custom_properties, extract_xlsx_app_properties,
|
|
@@ -45,11 +52,13 @@ use crate::extraction::office_metadata::{
|
|
|
45
52
|
use serde_json::Value;
|
|
46
53
|
|
|
47
54
|
pub fn read_excel_file(file_path: &str) -> Result<ExcelWorkbook> {
|
|
55
|
+
let lower_path = file_path.to_lowercase();
|
|
56
|
+
|
|
48
57
|
#[cfg(feature = "office")]
|
|
49
|
-
let office_metadata = if
|
|
50
|
-
||
|
|
51
|
-
||
|
|
52
|
-
||
|
|
58
|
+
let office_metadata = if lower_path.ends_with(".xlsx")
|
|
59
|
+
|| lower_path.ends_with(".xlsm")
|
|
60
|
+
|| lower_path.ends_with(".xlam")
|
|
61
|
+
|| lower_path.ends_with(".xltm")
|
|
53
62
|
{
|
|
54
63
|
extract_xlsx_office_metadata_from_file(file_path).ok()
|
|
55
64
|
} else {
|
|
@@ -59,7 +68,19 @@ pub fn read_excel_file(file_path: &str) -> Result<ExcelWorkbook> {
|
|
|
59
68
|
#[cfg(not(feature = "office"))]
|
|
60
69
|
let office_metadata: Option<HashMap<String, String>> = None;
|
|
61
70
|
|
|
62
|
-
//
|
|
71
|
+
// For XLSX files, use specialized handler with OOM protection
|
|
72
|
+
if lower_path.ends_with(".xlsx")
|
|
73
|
+
|| lower_path.ends_with(".xlsm")
|
|
74
|
+
|| lower_path.ends_with(".xlam")
|
|
75
|
+
|| lower_path.ends_with(".xltm")
|
|
76
|
+
{
|
|
77
|
+
let file = std::fs::File::open(file_path)?;
|
|
78
|
+
let workbook = calamine::Xlsx::new(std::io::BufReader::new(file))
|
|
79
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLSX: {}", e)))?;
|
|
80
|
+
return process_xlsx_workbook(workbook, office_metadata);
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
// For other formats, use open_workbook_auto
|
|
63
84
|
let workbook = match open_workbook_auto(Path::new(file_path)) {
|
|
64
85
|
Ok(wb) => wb,
|
|
65
86
|
Err(calamine::Error::Io(io_err)) => {
|
|
@@ -94,7 +115,7 @@ pub fn read_excel_bytes(data: &[u8], file_extension: &str) -> Result<ExcelWorkbo
|
|
|
94
115
|
".xlsx" | ".xlsm" | ".xlam" | ".xltm" => {
|
|
95
116
|
let workbook = calamine::Xlsx::new(cursor)
|
|
96
117
|
.map_err(|e| KreuzbergError::parsing(format!("Failed to parse XLSX: {}", e)))?;
|
|
97
|
-
|
|
118
|
+
process_xlsx_workbook(workbook, office_metadata)
|
|
98
119
|
}
|
|
99
120
|
".xls" | ".xla" => {
|
|
100
121
|
let workbook = calamine::Xls::new(cursor)
|
|
@@ -118,6 +139,194 @@ pub fn read_excel_bytes(data: &[u8], file_extension: &str) -> Result<ExcelWorkbo
|
|
|
118
139
|
}
|
|
119
140
|
}
|
|
120
141
|
|
|
142
|
+
/// Process XLSX workbooks with special handling for pathological sparse files.
|
|
143
|
+
///
|
|
144
|
+
/// This function uses calamine's `worksheet_cells_reader()` API to detect sheets with
|
|
145
|
+
/// extreme bounding boxes BEFORE allocating memory for the full Range. This prevents
|
|
146
|
+
/// OOM when processing files like Excel Solver files that have cells at both A1 and
|
|
147
|
+
/// XFD1048575, creating a bounding box of ~17 billion cells.
|
|
148
|
+
fn process_xlsx_workbook<RS: Read + Seek>(
|
|
149
|
+
mut workbook: calamine::Xlsx<RS>,
|
|
150
|
+
office_metadata: Option<HashMap<String, String>>,
|
|
151
|
+
) -> Result<ExcelWorkbook> {
|
|
152
|
+
let sheet_names = workbook.sheet_names();
|
|
153
|
+
let mut sheets = Vec::with_capacity(sheet_names.len());
|
|
154
|
+
|
|
155
|
+
for name in &sheet_names {
|
|
156
|
+
// Use worksheet_cells_reader to stream cells and detect pathological bounding boxes
|
|
157
|
+
match process_xlsx_sheet_safe(&mut workbook, name) {
|
|
158
|
+
Ok(sheet) => sheets.push(sheet),
|
|
159
|
+
Err(e) => {
|
|
160
|
+
// Log but don't fail - continue with other sheets
|
|
161
|
+
tracing::warn!("Failed to process sheet '{}': {}", name, e);
|
|
162
|
+
}
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
let metadata = extract_metadata(&workbook, &sheet_names, office_metadata);
|
|
167
|
+
Ok(ExcelWorkbook { sheets, metadata })
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
/// Process a single XLSX sheet safely by pre-checking the bounding box.
|
|
171
|
+
///
|
|
172
|
+
/// This function streams cells to compute the actual bounding box without allocating
|
|
173
|
+
/// a full Range, then only creates the Range if the bounding box is within safe limits.
|
|
174
|
+
fn process_xlsx_sheet_safe<RS: Read + Seek>(workbook: &mut calamine::Xlsx<RS>, sheet_name: &str) -> Result<ExcelSheet> {
|
|
175
|
+
// First pass: stream cells to compute actual bounding box and collect cell data
|
|
176
|
+
let (cells, row_min, row_max, col_min, col_max) = {
|
|
177
|
+
let mut cell_reader = workbook
|
|
178
|
+
.worksheet_cells_reader(sheet_name)
|
|
179
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to read sheet '{}': {}", sheet_name, e)))?;
|
|
180
|
+
|
|
181
|
+
let mut cells: Vec<((u32, u32), Data)> = Vec::new();
|
|
182
|
+
let mut row_min = u32::MAX;
|
|
183
|
+
let mut row_max = 0u32;
|
|
184
|
+
let mut col_min = u32::MAX;
|
|
185
|
+
let mut col_max = 0u32;
|
|
186
|
+
|
|
187
|
+
// Stream through all cells, tracking bounds
|
|
188
|
+
while let Ok(Some(cell)) = cell_reader.next_cell() {
|
|
189
|
+
let (row, col) = cell.get_position();
|
|
190
|
+
row_min = row_min.min(row);
|
|
191
|
+
row_max = row_max.max(row);
|
|
192
|
+
col_min = col_min.min(col);
|
|
193
|
+
col_max = col_max.max(col);
|
|
194
|
+
|
|
195
|
+
// Convert DataRef to owned Data
|
|
196
|
+
let data: Data = match cell.get_value() {
|
|
197
|
+
DataRef::Empty => Data::Empty,
|
|
198
|
+
DataRef::String(s) => Data::String(s.clone()),
|
|
199
|
+
DataRef::SharedString(s) => Data::String(s.to_string()),
|
|
200
|
+
DataRef::Float(f) => Data::Float(*f),
|
|
201
|
+
DataRef::Int(i) => Data::Int(*i),
|
|
202
|
+
DataRef::Bool(b) => Data::Bool(*b),
|
|
203
|
+
DataRef::DateTime(dt) => Data::DateTime(*dt),
|
|
204
|
+
DataRef::DateTimeIso(s) => Data::DateTimeIso(s.clone()),
|
|
205
|
+
DataRef::DurationIso(s) => Data::DurationIso(s.clone()),
|
|
206
|
+
DataRef::Error(e) => Data::Error(e.clone()),
|
|
207
|
+
};
|
|
208
|
+
cells.push(((row, col), data));
|
|
209
|
+
}
|
|
210
|
+
(cells, row_min, row_max, col_min, col_max)
|
|
211
|
+
}; // cell_reader is dropped here, releasing the borrow
|
|
212
|
+
|
|
213
|
+
// Check if sheet is empty
|
|
214
|
+
if cells.is_empty() {
|
|
215
|
+
return Ok(ExcelSheet {
|
|
216
|
+
name: sheet_name.to_owned(),
|
|
217
|
+
markdown: format!("## {}\n\n*Empty sheet*", sheet_name),
|
|
218
|
+
row_count: 0,
|
|
219
|
+
col_count: 0,
|
|
220
|
+
cell_count: 0,
|
|
221
|
+
table_cells: None,
|
|
222
|
+
});
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
// Calculate bounding box size
|
|
226
|
+
let bb_rows = (row_max - row_min + 1) as u64;
|
|
227
|
+
let bb_cols = (col_max - col_min + 1) as u64;
|
|
228
|
+
let bb_cells = bb_rows.saturating_mul(bb_cols);
|
|
229
|
+
|
|
230
|
+
// Check for pathological bounding box
|
|
231
|
+
if bb_cells > MAX_BOUNDING_BOX_CELLS {
|
|
232
|
+
// Sheet has sparse data at extreme positions - process directly from cells
|
|
233
|
+
return process_sparse_sheet_from_cells(sheet_name, cells, row_min, row_max, col_min, col_max);
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
// Safe to create a Range - bounding box is within limits
|
|
237
|
+
// Use calamine's normal worksheet_range which will create the Range
|
|
238
|
+
let range = workbook
|
|
239
|
+
.worksheet_range(sheet_name)
|
|
240
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to parse sheet '{}': {}", sheet_name, e)))?;
|
|
241
|
+
|
|
242
|
+
Ok(process_sheet(sheet_name, &range))
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
/// Process a sparse sheet directly from collected cells without creating a full Range.
|
|
246
|
+
///
|
|
247
|
+
/// This is used when the bounding box would exceed MAX_BOUNDING_BOX_CELLS.
|
|
248
|
+
/// Instead of creating a dense Range, we generate markdown directly from the sparse cells.
|
|
249
|
+
fn process_sparse_sheet_from_cells(
|
|
250
|
+
sheet_name: &str,
|
|
251
|
+
cells: Vec<((u32, u32), Data)>,
|
|
252
|
+
row_min: u32,
|
|
253
|
+
row_max: u32,
|
|
254
|
+
col_min: u32,
|
|
255
|
+
col_max: u32,
|
|
256
|
+
) -> Result<ExcelSheet> {
|
|
257
|
+
let cell_count = cells.len();
|
|
258
|
+
let bb_rows = (row_max - row_min + 1) as usize;
|
|
259
|
+
let bb_cols = (col_max - col_min + 1) as usize;
|
|
260
|
+
|
|
261
|
+
// Create a warning message about the sparse data
|
|
262
|
+
let mut markdown = String::with_capacity(500 + cell_count * 50);
|
|
263
|
+
write!(
|
|
264
|
+
markdown,
|
|
265
|
+
"## {}\n\n*Note: Sheet contains sparse data spanning {} rows x {} columns ({} actual cells). \
|
|
266
|
+
Bounding box too large for dense extraction. Showing actual cell data below.*\n\n",
|
|
267
|
+
sheet_name, bb_rows, bb_cols, cell_count
|
|
268
|
+
)
|
|
269
|
+
.expect("write to String cannot fail");
|
|
270
|
+
|
|
271
|
+
// Group cells by row for tabular display
|
|
272
|
+
let mut cells_by_row: HashMap<u32, Vec<(u32, &Data)>> = HashMap::new();
|
|
273
|
+
for ((row, col), data) in &cells {
|
|
274
|
+
cells_by_row.entry(*row).or_default().push((*col, data));
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
// Sort rows and output as simple key-value pairs
|
|
278
|
+
let mut rows: Vec<_> = cells_by_row.keys().copied().collect();
|
|
279
|
+
rows.sort_unstable();
|
|
280
|
+
|
|
281
|
+
// Limit output to first 1000 cells to avoid huge output
|
|
282
|
+
let mut output_count = 0;
|
|
283
|
+
const MAX_OUTPUT_CELLS: usize = 1000;
|
|
284
|
+
|
|
285
|
+
for row in rows {
|
|
286
|
+
if output_count >= MAX_OUTPUT_CELLS {
|
|
287
|
+
write!(markdown, "\n... ({} more cells not shown)\n", cell_count - output_count)
|
|
288
|
+
.expect("write to String cannot fail");
|
|
289
|
+
break;
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
let mut row_cells = cells_by_row.remove(&row).unwrap_or_default();
|
|
293
|
+
row_cells.sort_by_key(|(col, _)| *col);
|
|
294
|
+
|
|
295
|
+
for (col, data) in row_cells {
|
|
296
|
+
if output_count >= MAX_OUTPUT_CELLS {
|
|
297
|
+
break;
|
|
298
|
+
}
|
|
299
|
+
let cell_ref = col_to_excel_letter(col);
|
|
300
|
+
let cell_str = format_cell_to_string(data);
|
|
301
|
+
if !cell_str.is_empty() {
|
|
302
|
+
writeln!(markdown, "- **{}{}**: {}", cell_ref, row + 1, cell_str).expect("write to String cannot fail");
|
|
303
|
+
output_count += 1;
|
|
304
|
+
}
|
|
305
|
+
}
|
|
306
|
+
}
|
|
307
|
+
|
|
308
|
+
Ok(ExcelSheet {
|
|
309
|
+
name: sheet_name.to_owned(),
|
|
310
|
+
markdown,
|
|
311
|
+
row_count: bb_rows,
|
|
312
|
+
col_count: bb_cols,
|
|
313
|
+
cell_count,
|
|
314
|
+
table_cells: None, // No structured table for sparse sheets
|
|
315
|
+
})
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
/// Convert a 0-indexed column number to Excel-style letter(s) (A, B, ..., Z, AA, AB, ...).
|
|
319
|
+
fn col_to_excel_letter(col: u32) -> String {
|
|
320
|
+
let mut result = String::new();
|
|
321
|
+
let mut n = col + 1; // 1-indexed for calculation
|
|
322
|
+
while n > 0 {
|
|
323
|
+
n -= 1;
|
|
324
|
+
result.insert(0, (b'A' + (n % 26) as u8) as char);
|
|
325
|
+
n /= 26;
|
|
326
|
+
}
|
|
327
|
+
result
|
|
328
|
+
}
|
|
329
|
+
|
|
121
330
|
fn process_workbook<RS, R>(mut workbook: R, office_metadata: Option<HashMap<String, String>>) -> Result<ExcelWorkbook>
|
|
122
331
|
where
|
|
123
332
|
RS: std::io::Read + std::io::Seek,
|
|
@@ -143,7 +352,10 @@ fn process_sheet(name: &str, range: &Range<Data>) -> ExcelSheet {
|
|
|
143
352
|
let (rows, cols) = range.get_size();
|
|
144
353
|
let cell_count = range.used_cells().count();
|
|
145
354
|
|
|
146
|
-
|
|
355
|
+
// Fix for issue #331: Use actual cell count instead of declared dimensions
|
|
356
|
+
// to avoid OOM on sparse sheets with extreme dimensions (e.g., Excel Solver files).
|
|
357
|
+
// Declared dimensions can claim A1:XFD1048575 (~17 billion cells) while actual data is minimal.
|
|
358
|
+
let estimated_capacity = 50 + (cols * 20) + (cell_count * 12);
|
|
147
359
|
|
|
148
360
|
if rows == 0 || cols == 0 {
|
|
149
361
|
let markdown = format!("## {}\n\n*Empty sheet*", name);
|
|
@@ -176,6 +388,31 @@ fn process_sheet(name: &str, range: &Range<Data>) -> ExcelSheet {
|
|
|
176
388
|
///
|
|
177
389
|
/// Returns (markdown, table_cells) where table_cells is a 2D vector of strings.
|
|
178
390
|
fn generate_markdown_and_cells(sheet_name: &str, range: &Range<Data>, capacity: usize) -> (String, Vec<Vec<String>>) {
|
|
391
|
+
// Fix for issue #331: Protect against extreme declared dimensions.
|
|
392
|
+
// Excel Solver files can declare A1:XFD1048575 (1M+ rows) but only have ~26 actual cells.
|
|
393
|
+
// Calling range.rows().collect() would iterate ALL declared rows causing OOM.
|
|
394
|
+
const MAX_REASONABLE_ROWS: usize = 100_000; // Cap at 100K rows for safety
|
|
395
|
+
|
|
396
|
+
let (declared_rows, _declared_cols) = range.get_size();
|
|
397
|
+
|
|
398
|
+
// If declared rows exceed reasonable limit, skip processing to avoid OOM
|
|
399
|
+
if declared_rows > MAX_REASONABLE_ROWS {
|
|
400
|
+
let actual_cell_count = range.used_cells().count();
|
|
401
|
+
|
|
402
|
+
// If actual data is minimal compared to declared size, it's a sparse/pathological file
|
|
403
|
+
if actual_cell_count < 10_000 {
|
|
404
|
+
// Return minimal output instead of OOM
|
|
405
|
+
let result_capacity = 100 + sheet_name.len();
|
|
406
|
+
let mut result = String::with_capacity(result_capacity);
|
|
407
|
+
write!(
|
|
408
|
+
result,
|
|
409
|
+
"## {}\n\n*Sheet has extreme declared dimensions ({} rows) with minimal actual data ({} cells). Skipping to prevent OOM.*",
|
|
410
|
+
sheet_name, declared_rows, actual_cell_count
|
|
411
|
+
).unwrap();
|
|
412
|
+
return (result, Vec::new());
|
|
413
|
+
}
|
|
414
|
+
}
|
|
415
|
+
|
|
179
416
|
let rows: Vec<_> = range.rows().collect();
|
|
180
417
|
if rows.is_empty() {
|
|
181
418
|
let result_capacity = 50 + sheet_name.len();
|
|
@@ -88,3 +88,59 @@ fn test_xlsx_minimal_metadata_extraction() {
|
|
|
88
88
|
|
|
89
89
|
println!("✅ XLSX minimal metadata extraction test passed!");
|
|
90
90
|
}
|
|
91
|
+
|
|
92
|
+
/// Test for issue #331: OOM with XLSX files containing Excel Solver add-in data
|
|
93
|
+
///
|
|
94
|
+
/// This test reproduces the issue where Excel Solver stores configuration data
|
|
95
|
+
/// in cells at extreme positions (XFD1048550-1048575 = column 16384, rows near 1M).
|
|
96
|
+
/// The sheet dimension is set to "A1:XFD1048575", which could cause Kreuzberg
|
|
97
|
+
/// to attempt allocating memory for ~17 billion cells (16384 × 1048575).
|
|
98
|
+
///
|
|
99
|
+
/// Expected behavior: Should handle extreme dimensions gracefully without OOM.
|
|
100
|
+
/// The file is only 6.8KB and contains minimal actual data.
|
|
101
|
+
#[test]
|
|
102
|
+
fn test_xlsx_excel_solver_extreme_dimensions_no_oom() {
|
|
103
|
+
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
104
|
+
.parent()
|
|
105
|
+
.expect("Operation failed")
|
|
106
|
+
.parent()
|
|
107
|
+
.expect("Operation failed");
|
|
108
|
+
let test_file = workspace_root.join("tests/fixtures/xlsx-oom-repro/kreuzberg-oom-repro.xlsx");
|
|
109
|
+
|
|
110
|
+
if !test_file.exists() {
|
|
111
|
+
println!("Skipping test: Test file not found at {:?}", test_file);
|
|
112
|
+
println!("Run: node tests/fixtures/xlsx-oom-repro/generate-oom-xlsx.mjs");
|
|
113
|
+
return;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
let file_path = test_file.to_str().expect("File path should be valid UTF-8");
|
|
117
|
+
|
|
118
|
+
// This should NOT cause OOM even though dimension claims A1:XFD1048575
|
|
119
|
+
// The actual data is minimal (only ~26 cells with Solver metadata)
|
|
120
|
+
let result = read_excel_file(file_path).expect("Should extract XLSX with extreme dimensions without OOM");
|
|
121
|
+
|
|
122
|
+
// Verify we got the actual data, not a massive allocation
|
|
123
|
+
assert!(!result.sheets.is_empty(), "Should have at least one sheet");
|
|
124
|
+
|
|
125
|
+
// The file has normal cells A1, B1 plus Solver cells at extreme positions
|
|
126
|
+
// Verify we extracted something reasonable, not 17 billion cells
|
|
127
|
+
let sheet = &result.sheets[0];
|
|
128
|
+
assert!(
|
|
129
|
+
sheet.markdown.len() < 10000,
|
|
130
|
+
"Sheet markdown content should be small (< 10000 chars), not massive. Got {} chars",
|
|
131
|
+
sheet.markdown.len()
|
|
132
|
+
);
|
|
133
|
+
|
|
134
|
+
// Verify metadata was extracted
|
|
135
|
+
assert!(
|
|
136
|
+
result.metadata.contains_key("sheet_count"),
|
|
137
|
+
"Should have sheet_count metadata"
|
|
138
|
+
);
|
|
139
|
+
|
|
140
|
+
println!("✅ XLSX Excel Solver extreme dimensions test passed!");
|
|
141
|
+
println!(
|
|
142
|
+
" Sheet markdown length: {} chars (reasonable size)",
|
|
143
|
+
sheet.markdown.len()
|
|
144
|
+
);
|
|
145
|
+
println!(" Successfully handled dimension A1:XFD1048575 without OOM");
|
|
146
|
+
}
|
|
@@ -223,7 +223,7 @@ typedef struct CErrorDetails {
|
|
|
223
223
|
* # Memory Layout
|
|
224
224
|
*
|
|
225
225
|
* Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
|
|
226
|
-
* Field order:
|
|
226
|
+
* Field order: 13 pointers (8 bytes each) + 1 bool + 7 bytes padding = 112 bytes total
|
|
227
227
|
*
|
|
228
228
|
* The `#[repr(C)]` attribute ensures the struct follows C's memory layout rules:
|
|
229
229
|
* - Fields are laid out in order
|
|
@@ -284,6 +284,10 @@ typedef struct CExtractionResult {
|
|
|
284
284
|
* Per-page content as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
285
285
|
*/
|
|
286
286
|
char *pages_json;
|
|
287
|
+
/**
|
|
288
|
+
* Semantic elements as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
289
|
+
*/
|
|
290
|
+
char *elements_json;
|
|
287
291
|
/**
|
|
288
292
|
* Whether extraction was successful
|
|
289
293
|
*/
|
|
@@ -1608,7 +1612,7 @@ char *kreuzberg_clone_string(const char *s);
|
|
|
1608
1612
|
*
|
|
1609
1613
|
* # Memory Layout
|
|
1610
1614
|
*
|
|
1611
|
-
* This function frees all
|
|
1615
|
+
* This function frees all 13 string fields in CExtractionResult:
|
|
1612
1616
|
* 1. content
|
|
1613
1617
|
* 2. mime_type
|
|
1614
1618
|
* 3. language
|
|
@@ -1621,6 +1625,7 @@ char *kreuzberg_clone_string(const char *s);
|
|
|
1621
1625
|
* 10. images_json
|
|
1622
1626
|
* 11. page_structure_json (FIXED: was missing before PR #3)
|
|
1623
1627
|
* 12. pages_json (FIXED: was missing before PR #3)
|
|
1628
|
+
* 13. elements_json (ADDED: for element-based extraction support)
|
|
1624
1629
|
*
|
|
1625
1630
|
* # Example (C)
|
|
1626
1631
|
*
|
|
@@ -67,7 +67,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
|
|
|
67
67
|
images,
|
|
68
68
|
pages,
|
|
69
69
|
djot_content: _,
|
|
70
|
-
elements
|
|
70
|
+
elements,
|
|
71
71
|
} = result;
|
|
72
72
|
|
|
73
73
|
let sanitized_content = if content.contains('\0') {
|
|
@@ -179,6 +179,17 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
|
|
|
179
179
|
_ => None,
|
|
180
180
|
};
|
|
181
181
|
|
|
182
|
+
let elements_json_guard = match elements {
|
|
183
|
+
Some(elements) if !elements.is_empty() => {
|
|
184
|
+
let json =
|
|
185
|
+
serde_json::to_string(&elements).map_err(|e| format!("Failed to serialize elements to JSON: {}", e))?;
|
|
186
|
+
Some(CStringGuard::new(CString::new(json).map_err(|e| {
|
|
187
|
+
format!("Failed to convert elements JSON to C string: {}", e)
|
|
188
|
+
})?))
|
|
189
|
+
}
|
|
190
|
+
_ => None,
|
|
191
|
+
};
|
|
192
|
+
|
|
182
193
|
Ok(Box::into_raw(Box::new(CExtractionResult {
|
|
183
194
|
content: content_guard.into_raw(),
|
|
184
195
|
mime_type: mime_type_guard.into_raw(),
|
|
@@ -192,6 +203,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
|
|
|
192
203
|
images_json: images_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
193
204
|
page_structure_json: page_structure_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
194
205
|
pages_json: pages_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
206
|
+
elements_json: elements_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
195
207
|
success: true,
|
|
196
208
|
_padding1: [0u8; 7],
|
|
197
209
|
})))
|
|
@@ -134,8 +134,8 @@ mod tests {
|
|
|
134
134
|
// Test size
|
|
135
135
|
assert_eq!(
|
|
136
136
|
std::mem::size_of::<CExtractionResult>(),
|
|
137
|
-
|
|
138
|
-
"CExtractionResult must be exactly
|
|
137
|
+
112,
|
|
138
|
+
"CExtractionResult must be exactly 112 bytes"
|
|
139
139
|
);
|
|
140
140
|
|
|
141
141
|
// Test alignment
|
|
@@ -197,6 +197,7 @@ mod tests {
|
|
|
197
197
|
images_json: ptr::null_mut(),
|
|
198
198
|
page_structure_json: ptr::null_mut(),
|
|
199
199
|
pages_json: ptr::null_mut(),
|
|
200
|
+
elements_json: ptr::null_mut(),
|
|
200
201
|
success: true,
|
|
201
202
|
_padding1: [0u8; 7],
|
|
202
203
|
}))
|
|
@@ -510,6 +511,7 @@ mod tests {
|
|
|
510
511
|
images_json: ptr::null_mut(),
|
|
511
512
|
page_structure_json: ptr::null_mut(),
|
|
512
513
|
pages_json: ptr::null_mut(),
|
|
514
|
+
elements_json: ptr::null_mut(),
|
|
513
515
|
success: true,
|
|
514
516
|
_padding1: [0u8; 7],
|
|
515
517
|
}));
|
|
@@ -522,7 +524,7 @@ mod tests {
|
|
|
522
524
|
#[test]
|
|
523
525
|
fn test_extraction_result_free_all_fields_allocated() {
|
|
524
526
|
unsafe {
|
|
525
|
-
// Test freeing a result where ALL
|
|
527
|
+
// Test freeing a result where ALL 13 string fields are allocated
|
|
526
528
|
// This verifies that kreuzberg_free_result properly frees all fields
|
|
527
529
|
let result = Box::into_raw(Box::new(CExtractionResult {
|
|
528
530
|
content: CString::new("test content").unwrap().into_raw(),
|
|
@@ -537,11 +539,12 @@ mod tests {
|
|
|
537
539
|
images_json: CString::new("[{\"data\":\"base64\"}]").unwrap().into_raw(),
|
|
538
540
|
page_structure_json: CString::new("{\"pages\":1}").unwrap().into_raw(),
|
|
539
541
|
pages_json: CString::new("[{\"page\":1,\"content\":\"test\"}]").unwrap().into_raw(),
|
|
542
|
+
elements_json: CString::new("[]").unwrap().into_raw(),
|
|
540
543
|
success: true,
|
|
541
544
|
_padding1: [0u8; 7],
|
|
542
545
|
}));
|
|
543
546
|
|
|
544
|
-
// Should properly free all
|
|
547
|
+
// Should properly free all 13 allocated string fields without leaking memory
|
|
545
548
|
kreuzberg_free_result(result);
|
|
546
549
|
}
|
|
547
550
|
}
|
|
@@ -621,7 +624,7 @@ mod tests {
|
|
|
621
624
|
/// Test CExtractionResult size exactly matches FFI contract
|
|
622
625
|
#[test]
|
|
623
626
|
fn test_c_extraction_result_size() {
|
|
624
|
-
assert_eq!(std::mem::size_of::<CExtractionResult>(),
|
|
627
|
+
assert_eq!(std::mem::size_of::<CExtractionResult>(), 112);
|
|
625
628
|
assert_eq!(std::mem::align_of::<CExtractionResult>(), 8);
|
|
626
629
|
}
|
|
627
630
|
|
|
@@ -146,7 +146,7 @@ pub unsafe extern "C" fn kreuzberg_clone_string(s: *const c_char) -> *mut c_char
|
|
|
146
146
|
///
|
|
147
147
|
/// # Memory Layout
|
|
148
148
|
///
|
|
149
|
-
/// This function frees all
|
|
149
|
+
/// This function frees all 13 string fields in CExtractionResult:
|
|
150
150
|
/// 1. content
|
|
151
151
|
/// 2. mime_type
|
|
152
152
|
/// 3. language
|
|
@@ -159,6 +159,7 @@ pub unsafe extern "C" fn kreuzberg_clone_string(s: *const c_char) -> *mut c_char
|
|
|
159
159
|
/// 10. images_json
|
|
160
160
|
/// 11. page_structure_json (FIXED: was missing before PR #3)
|
|
161
161
|
/// 12. pages_json (FIXED: was missing before PR #3)
|
|
162
|
+
/// 13. elements_json (ADDED: for element-based extraction support)
|
|
162
163
|
///
|
|
163
164
|
/// # Example (C)
|
|
164
165
|
///
|
|
@@ -209,6 +210,9 @@ pub unsafe extern "C" fn kreuzberg_free_result(result: *mut CExtractionResult) {
|
|
|
209
210
|
if !result_box.pages_json.is_null() {
|
|
210
211
|
unsafe { drop(CString::from_raw(result_box.pages_json)) };
|
|
211
212
|
}
|
|
213
|
+
if !result_box.elements_json.is_null() {
|
|
214
|
+
unsafe { drop(CString::from_raw(result_box.elements_json)) };
|
|
215
|
+
}
|
|
212
216
|
}
|
|
213
217
|
}
|
|
214
218
|
|
|
@@ -232,6 +236,7 @@ mod tests {
|
|
|
232
236
|
images_json: CString::new("[]").unwrap().into_raw(),
|
|
233
237
|
page_structure_json: CString::new("{}").unwrap().into_raw(),
|
|
234
238
|
pages_json: CString::new("[]").unwrap().into_raw(),
|
|
239
|
+
elements_json: CString::new("[]").unwrap().into_raw(),
|
|
235
240
|
success: true,
|
|
236
241
|
_padding1: [0u8; 7],
|
|
237
242
|
}))
|
|
@@ -252,6 +257,7 @@ mod tests {
|
|
|
252
257
|
images_json: ptr::null_mut(),
|
|
253
258
|
page_structure_json: ptr::null_mut(),
|
|
254
259
|
pages_json: ptr::null_mut(),
|
|
260
|
+
elements_json: ptr::null_mut(),
|
|
255
261
|
success: true,
|
|
256
262
|
_padding1: [0u8; 7],
|
|
257
263
|
}))
|
|
@@ -343,6 +349,34 @@ mod tests {
|
|
|
343
349
|
images_json: ptr::null_mut(),
|
|
344
350
|
page_structure_json: CString::new("{\"pages\": []}").unwrap().into_raw(),
|
|
345
351
|
pages_json: CString::new("[{\"content\": \"page 1\"}]").unwrap().into_raw(),
|
|
352
|
+
elements_json: ptr::null_mut(),
|
|
353
|
+
success: true,
|
|
354
|
+
_padding1: [0u8; 7],
|
|
355
|
+
}));
|
|
356
|
+
|
|
357
|
+
unsafe { kreuzberg_free_result(result) };
|
|
358
|
+
// If we get here without crashing or leaking, the test passed
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
#[test]
|
|
362
|
+
fn test_free_result_elements_json() {
|
|
363
|
+
// Test: ensure elements_json is freed
|
|
364
|
+
let result = Box::into_raw(Box::new(CExtractionResult {
|
|
365
|
+
content: CString::new("test").unwrap().into_raw(),
|
|
366
|
+
mime_type: CString::new("text/plain").unwrap().into_raw(),
|
|
367
|
+
language: ptr::null_mut(),
|
|
368
|
+
date: ptr::null_mut(),
|
|
369
|
+
subject: ptr::null_mut(),
|
|
370
|
+
tables_json: ptr::null_mut(),
|
|
371
|
+
detected_languages_json: ptr::null_mut(),
|
|
372
|
+
metadata_json: ptr::null_mut(),
|
|
373
|
+
chunks_json: ptr::null_mut(),
|
|
374
|
+
images_json: ptr::null_mut(),
|
|
375
|
+
page_structure_json: ptr::null_mut(),
|
|
376
|
+
pages_json: ptr::null_mut(),
|
|
377
|
+
elements_json: CString::new(r#"[{"element_id":"abc","element_type":"title","text":"Hello"}]"#)
|
|
378
|
+
.unwrap()
|
|
379
|
+
.into_raw(),
|
|
346
380
|
success: true,
|
|
347
381
|
_padding1: [0u8; 7],
|
|
348
382
|
}));
|
|
@@ -51,7 +51,7 @@ impl Drop for CStringGuard {
|
|
|
51
51
|
/// # Memory Layout
|
|
52
52
|
///
|
|
53
53
|
/// Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
|
|
54
|
-
/// Field order:
|
|
54
|
+
/// Field order: 13 pointers (8 bytes each) + 1 bool + 7 bytes padding = 112 bytes total
|
|
55
55
|
///
|
|
56
56
|
/// The `#[repr(C)]` attribute ensures the struct follows C's memory layout rules:
|
|
57
57
|
/// - Fields are laid out in order
|
|
@@ -88,6 +88,8 @@ pub struct CExtractionResult {
|
|
|
88
88
|
pub page_structure_json: *mut c_char,
|
|
89
89
|
/// Per-page content as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
90
90
|
pub pages_json: *mut c_char,
|
|
91
|
+
/// Semantic elements as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
92
|
+
pub elements_json: *mut c_char,
|
|
91
93
|
/// Whether extraction was successful
|
|
92
94
|
pub success: bool,
|
|
93
95
|
/// Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
|
|
@@ -150,7 +152,7 @@ pub struct CBatchResult {
|
|
|
150
152
|
const _: () = {
|
|
151
153
|
const fn assert_c_extraction_result_size() {
|
|
152
154
|
const SIZE: usize = std::mem::size_of::<CExtractionResult>();
|
|
153
|
-
const _: () = assert!(SIZE ==
|
|
155
|
+
const _: () = assert!(SIZE == 112, "CExtractionResult size must be 112 bytes");
|
|
154
156
|
}
|
|
155
157
|
|
|
156
158
|
const fn assert_c_extraction_result_alignment() {
|
|
@@ -195,8 +197,8 @@ mod tests {
|
|
|
195
197
|
fn test_c_extraction_result_size() {
|
|
196
198
|
assert_eq!(
|
|
197
199
|
std::mem::size_of::<CExtractionResult>(),
|
|
198
|
-
|
|
199
|
-
"CExtractionResult must be exactly
|
|
200
|
+
112,
|
|
201
|
+
"CExtractionResult must be exactly 112 bytes"
|
|
200
202
|
);
|
|
201
203
|
}
|
|
202
204
|
|
|
@@ -327,7 +329,8 @@ mod tests {
|
|
|
327
329
|
assert_eq!(offset_of!(CExtractionResult, images_json), 72);
|
|
328
330
|
assert_eq!(offset_of!(CExtractionResult, page_structure_json), 80);
|
|
329
331
|
assert_eq!(offset_of!(CExtractionResult, pages_json), 88);
|
|
330
|
-
assert_eq!(offset_of!(CExtractionResult,
|
|
332
|
+
assert_eq!(offset_of!(CExtractionResult, elements_json), 96);
|
|
333
|
+
assert_eq!(offset_of!(CExtractionResult, success), 104);
|
|
331
334
|
}
|
|
332
335
|
|
|
333
336
|
/// Verify field offsets in CBatchResult match expectations
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.2.1
|
|
4
|
+
version: 4.2.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-01-
|
|
11
|
+
date: 2026-01-28 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -363,6 +363,7 @@ files:
|
|
|
363
363
|
- vendor/kreuzberg/src/api/error.rs
|
|
364
364
|
- vendor/kreuzberg/src/api/handlers.rs
|
|
365
365
|
- vendor/kreuzberg/src/api/mod.rs
|
|
366
|
+
- vendor/kreuzberg/src/api/openapi.rs
|
|
366
367
|
- vendor/kreuzberg/src/api/router.rs
|
|
367
368
|
- vendor/kreuzberg/src/api/startup.rs
|
|
368
369
|
- vendor/kreuzberg/src/api/types.rs
|