kreuzberg 4.2.1 → 4.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +9 -9
- data/README.md +1 -1
- data/lib/kreuzberg/api_proxy.rb +3 -3
- data/lib/kreuzberg/cli_proxy.rb +2 -2
- data/lib/kreuzberg/config.rb +4 -20
- data/lib/kreuzberg/mcp_proxy.rb +3 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/spec/binding/config_spec.rb +1 -1
- data/spec/unit/config/extraction_config_spec.rb +2 -2
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -2
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +89 -0
- data/vendor/kreuzberg/src/api/handlers.rs +153 -32
- data/vendor/kreuzberg/src/api/mod.rs +2 -0
- data/vendor/kreuzberg/src/api/openapi.rs +141 -0
- data/vendor/kreuzberg/src/api/router.rs +24 -2
- data/vendor/kreuzberg/src/api/startup.rs +11 -5
- data/vendor/kreuzberg/src/api/types.rs +50 -4
- data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
- data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
- data/vendor/kreuzberg/src/mcp/format.rs +46 -57
- data/vendor/kreuzberg/src/mcp/server.rs +2 -8
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +1 -7
- data/vendor/kreuzberg/tests/api_chunk.rs +25 -0
- data/vendor/kreuzberg/tests/api_embed.rs +60 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
- data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
- data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
- data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
- data/vendor/kreuzberg-ffi/src/types.rs +8 -5
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +3 -2
|
@@ -144,7 +144,7 @@ impl KreuzbergMcp {
|
|
|
144
144
|
Parameters(params): Parameters<super::params::BatchExtractFilesParams>,
|
|
145
145
|
) -> Result<CallToolResult, rmcp::ErrorData> {
|
|
146
146
|
use super::errors::map_kreuzberg_error_to_mcp;
|
|
147
|
-
use super::format::{build_config, format_extraction_result};
|
|
147
|
+
use super::format::build_config;
|
|
148
148
|
use crate::{batch_extract_file, batch_extract_file_sync};
|
|
149
149
|
|
|
150
150
|
let config =
|
|
@@ -158,13 +158,7 @@ impl KreuzbergMcp {
|
|
|
158
158
|
batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
|
|
159
159
|
};
|
|
160
160
|
|
|
161
|
-
let mut response = String::new();
|
|
162
|
-
for (i, result) in results.iter().enumerate() {
|
|
163
|
-
response.push_str(&format!("=== Document {}: {} ===\n", i + 1, params.paths[i]));
|
|
164
|
-
response.push_str(&format_extraction_result(result));
|
|
165
|
-
response.push_str("\n\n");
|
|
166
|
-
}
|
|
167
|
-
|
|
161
|
+
let response = serde_json::to_string_pretty(&results).unwrap_or_default();
|
|
168
162
|
Ok(CallToolResult::success(vec![Content::text(response)]))
|
|
169
163
|
}
|
|
170
164
|
|
|
@@ -99,13 +99,7 @@ pub(in crate::mcp) trait ExtractionTool {
|
|
|
99
99
|
batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
|
|
100
100
|
};
|
|
101
101
|
|
|
102
|
-
let mut response = String::new();
|
|
103
|
-
for (i, result) in results.iter().enumerate() {
|
|
104
|
-
response.push_str(&format!("=== Document {}: {} ===\n", i + 1, params.paths[i]));
|
|
105
|
-
response.push_str(&format_extraction_result(result));
|
|
106
|
-
response.push_str("\n\n");
|
|
107
|
-
}
|
|
108
|
-
|
|
102
|
+
let response = serde_json::to_string_pretty(&results).unwrap_or_default();
|
|
109
103
|
Ok(CallToolResult::success(vec![Content::text(response)]))
|
|
110
104
|
}
|
|
111
105
|
}
|
|
@@ -321,3 +321,28 @@ async fn test_chunk_custom_config() {
|
|
|
321
321
|
assert_eq!(chunk_response.config.overlap, 5);
|
|
322
322
|
assert!(!chunk_response.config.trim);
|
|
323
323
|
}
|
|
324
|
+
|
|
325
|
+
#[tokio::test]
|
|
326
|
+
async fn test_chunk_rejects_json_array() {
|
|
327
|
+
let app = create_router(ExtractionConfig::default());
|
|
328
|
+
|
|
329
|
+
// Send a JSON array instead of object
|
|
330
|
+
let response = app
|
|
331
|
+
.oneshot(
|
|
332
|
+
Request::builder()
|
|
333
|
+
.uri("/chunk")
|
|
334
|
+
.method("POST")
|
|
335
|
+
.header("content-type", "application/json")
|
|
336
|
+
.body(Body::from(r#"[["text"], {"text": "content"}]"#))
|
|
337
|
+
.expect("Operation failed"),
|
|
338
|
+
)
|
|
339
|
+
.await
|
|
340
|
+
.expect("Operation failed");
|
|
341
|
+
|
|
342
|
+
// Should reject with 400 or 422, NOT 200
|
|
343
|
+
assert!(
|
|
344
|
+
response.status() == StatusCode::BAD_REQUEST || response.status() == StatusCode::UNPROCESSABLE_ENTITY,
|
|
345
|
+
"Expected 400 or 422, got {}",
|
|
346
|
+
response.status()
|
|
347
|
+
);
|
|
348
|
+
}
|
|
@@ -255,6 +255,66 @@ async fn test_embed_malformed_json() {
|
|
|
255
255
|
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
|
256
256
|
}
|
|
257
257
|
|
|
258
|
+
/// Test embed endpoint rejects JSON array at root level.
|
|
259
|
+
#[tokio::test]
|
|
260
|
+
async fn test_embed_rejects_json_array() {
|
|
261
|
+
let app = create_router(ExtractionConfig::default());
|
|
262
|
+
|
|
263
|
+
// Send a JSON array instead of object
|
|
264
|
+
let response = app
|
|
265
|
+
.oneshot(
|
|
266
|
+
Request::builder()
|
|
267
|
+
.method("POST")
|
|
268
|
+
.uri("/embed")
|
|
269
|
+
.header("content-type", "application/json")
|
|
270
|
+
.body(Body::from(r#"[["text1"], {"texts": ["text2"]}]"#))
|
|
271
|
+
.expect("Operation failed"),
|
|
272
|
+
)
|
|
273
|
+
.await
|
|
274
|
+
.expect("Operation failed");
|
|
275
|
+
|
|
276
|
+
// Should reject with 400 or 422, NOT 200
|
|
277
|
+
assert!(
|
|
278
|
+
response.status() == StatusCode::BAD_REQUEST || response.status() == StatusCode::UNPROCESSABLE_ENTITY,
|
|
279
|
+
"Expected 400 or 422, got {}",
|
|
280
|
+
response.status()
|
|
281
|
+
);
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
/// Test embed endpoint rejects simple JSON array with strings.
|
|
285
|
+
#[tokio::test]
|
|
286
|
+
async fn test_embed_rejects_simple_json_array() {
|
|
287
|
+
let app = create_router(ExtractionConfig::default());
|
|
288
|
+
|
|
289
|
+
// Send a simple string array instead of object with texts field
|
|
290
|
+
let response = app
|
|
291
|
+
.oneshot(
|
|
292
|
+
Request::builder()
|
|
293
|
+
.method("POST")
|
|
294
|
+
.uri("/embed")
|
|
295
|
+
.header("content-type", "application/json")
|
|
296
|
+
.body(Body::from(r#"["text1", "text2", "text3"]"#))
|
|
297
|
+
.expect("Operation failed"),
|
|
298
|
+
)
|
|
299
|
+
.await
|
|
300
|
+
.expect("Operation failed");
|
|
301
|
+
|
|
302
|
+
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
|
303
|
+
|
|
304
|
+
// Check that error response contains helpful message
|
|
305
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
306
|
+
.await
|
|
307
|
+
.expect("Failed to read response body");
|
|
308
|
+
let error_response: serde_json::Value = serde_json::from_slice(&body).expect("Failed to parse error response");
|
|
309
|
+
|
|
310
|
+
assert!(
|
|
311
|
+
error_response["message"]
|
|
312
|
+
.as_str()
|
|
313
|
+
.map(|msg| msg.contains("array") || msg.contains("object"))
|
|
314
|
+
.unwrap_or(false)
|
|
315
|
+
);
|
|
316
|
+
}
|
|
317
|
+
|
|
258
318
|
/// Test embed endpoint preserves embedding vector values across calls.
|
|
259
319
|
#[tokio::test]
|
|
260
320
|
async fn test_embed_deterministic() {
|
|
@@ -88,3 +88,59 @@ fn test_xlsx_minimal_metadata_extraction() {
|
|
|
88
88
|
|
|
89
89
|
println!("✅ XLSX minimal metadata extraction test passed!");
|
|
90
90
|
}
|
|
91
|
+
|
|
92
|
+
/// Test for issue #331: OOM with XLSX files containing Excel Solver add-in data
|
|
93
|
+
///
|
|
94
|
+
/// This test reproduces the issue where Excel Solver stores configuration data
|
|
95
|
+
/// in cells at extreme positions (XFD1048550-1048575 = column 16384, rows near 1M).
|
|
96
|
+
/// The sheet dimension is set to "A1:XFD1048575", which could cause Kreuzberg
|
|
97
|
+
/// to attempt allocating memory for ~17 trillion cells (16384 × 1048575).
|
|
98
|
+
///
|
|
99
|
+
/// Expected behavior: Should handle extreme dimensions gracefully without OOM.
|
|
100
|
+
/// The file is only 6.8KB and contains minimal actual data.
|
|
101
|
+
#[test]
|
|
102
|
+
fn test_xlsx_excel_solver_extreme_dimensions_no_oom() {
|
|
103
|
+
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
104
|
+
.parent()
|
|
105
|
+
.expect("Operation failed")
|
|
106
|
+
.parent()
|
|
107
|
+
.expect("Operation failed");
|
|
108
|
+
let test_file = workspace_root.join("tests/fixtures/xlsx-oom-repro/kreuzberg-oom-repro.xlsx");
|
|
109
|
+
|
|
110
|
+
if !test_file.exists() {
|
|
111
|
+
println!("Skipping test: Test file not found at {:?}", test_file);
|
|
112
|
+
println!("Run: node tests/fixtures/xlsx-oom-repro/generate-oom-xlsx.mjs");
|
|
113
|
+
return;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
let file_path = test_file.to_str().expect("File path should be valid UTF-8");
|
|
117
|
+
|
|
118
|
+
// This should NOT cause OOM even though dimension claims A1:XFD1048575
|
|
119
|
+
// The actual data is minimal (only ~26 cells with Solver metadata)
|
|
120
|
+
let result = read_excel_file(file_path).expect("Should extract XLSX with extreme dimensions without OOM");
|
|
121
|
+
|
|
122
|
+
// Verify we got the actual data, not a massive allocation
|
|
123
|
+
assert!(!result.sheets.is_empty(), "Should have at least one sheet");
|
|
124
|
+
|
|
125
|
+
// The file has normal cells A1, B1 plus Solver cells at extreme positions
|
|
126
|
+
// Verify we extracted something reasonable, not 17 trillion cells
|
|
127
|
+
let sheet = &result.sheets[0];
|
|
128
|
+
assert!(
|
|
129
|
+
sheet.markdown.len() < 10000,
|
|
130
|
+
"Sheet markdown content should be small (< 10000 chars), not massive. Got {} chars",
|
|
131
|
+
sheet.markdown.len()
|
|
132
|
+
);
|
|
133
|
+
|
|
134
|
+
// Verify metadata was extracted
|
|
135
|
+
assert!(
|
|
136
|
+
result.metadata.contains_key("sheet_count"),
|
|
137
|
+
"Should have sheet_count metadata"
|
|
138
|
+
);
|
|
139
|
+
|
|
140
|
+
println!("✅ XLSX Excel Solver extreme dimensions test passed!");
|
|
141
|
+
println!(
|
|
142
|
+
" Sheet markdown length: {} chars (reasonable size)",
|
|
143
|
+
sheet.markdown.len()
|
|
144
|
+
);
|
|
145
|
+
println!(" Successfully handled dimension A1:XFD1048575 without OOM");
|
|
146
|
+
}
|
|
@@ -223,7 +223,7 @@ typedef struct CErrorDetails {
|
|
|
223
223
|
* # Memory Layout
|
|
224
224
|
*
|
|
225
225
|
* Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
|
|
226
|
-
* Field order: 12 pointers (8 bytes each) + 1 bool + 7 bytes padding = 104 bytes total
|
|
226
|
+
* Field order: 13 pointers (8 bytes each) + 1 bool + 7 bytes padding = 112 bytes total
|
|
227
227
|
*
|
|
228
228
|
* The `#[repr(C)]` attribute ensures the struct follows C's memory layout rules:
|
|
229
229
|
* - Fields are laid out in order
|
|
@@ -284,6 +284,10 @@ typedef struct CExtractionResult {
|
|
|
284
284
|
* Per-page content as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
285
285
|
*/
|
|
286
286
|
char *pages_json;
|
|
287
|
+
/**
|
|
288
|
+
* Semantic elements as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
289
|
+
*/
|
|
290
|
+
char *elements_json;
|
|
287
291
|
/**
|
|
288
292
|
* Whether extraction was successful
|
|
289
293
|
*/
|
|
@@ -1608,7 +1612,7 @@ char *kreuzberg_clone_string(const char *s);
|
|
|
1608
1612
|
*
|
|
1609
1613
|
* # Memory Layout
|
|
1610
1614
|
*
|
|
1611
|
-
* This function frees all 12 string fields in CExtractionResult:
|
|
1615
|
+
* This function frees all 13 string fields in CExtractionResult:
|
|
1612
1616
|
* 1. content
|
|
1613
1617
|
* 2. mime_type
|
|
1614
1618
|
* 3. language
|
|
@@ -1621,6 +1625,7 @@ char *kreuzberg_clone_string(const char *s);
|
|
|
1621
1625
|
* 10. images_json
|
|
1622
1626
|
* 11. page_structure_json (FIXED: was missing before PR #3)
|
|
1623
1627
|
* 12. pages_json (FIXED: was missing before PR #3)
|
|
1628
|
+
* 13. elements_json (ADDED: for element-based extraction support)
|
|
1624
1629
|
*
|
|
1625
1630
|
* # Example (C)
|
|
1626
1631
|
*
|
|
@@ -67,7 +67,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
|
|
|
67
67
|
images,
|
|
68
68
|
pages,
|
|
69
69
|
djot_content: _,
|
|
70
|
-
elements
|
|
70
|
+
elements,
|
|
71
71
|
} = result;
|
|
72
72
|
|
|
73
73
|
let sanitized_content = if content.contains('\0') {
|
|
@@ -179,6 +179,17 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
|
|
|
179
179
|
_ => None,
|
|
180
180
|
};
|
|
181
181
|
|
|
182
|
+
let elements_json_guard = match elements {
|
|
183
|
+
Some(elements) if !elements.is_empty() => {
|
|
184
|
+
let json =
|
|
185
|
+
serde_json::to_string(&elements).map_err(|e| format!("Failed to serialize elements to JSON: {}", e))?;
|
|
186
|
+
Some(CStringGuard::new(CString::new(json).map_err(|e| {
|
|
187
|
+
format!("Failed to convert elements JSON to C string: {}", e)
|
|
188
|
+
})?))
|
|
189
|
+
}
|
|
190
|
+
_ => None,
|
|
191
|
+
};
|
|
192
|
+
|
|
182
193
|
Ok(Box::into_raw(Box::new(CExtractionResult {
|
|
183
194
|
content: content_guard.into_raw(),
|
|
184
195
|
mime_type: mime_type_guard.into_raw(),
|
|
@@ -192,6 +203,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
|
|
|
192
203
|
images_json: images_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
193
204
|
page_structure_json: page_structure_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
194
205
|
pages_json: pages_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
206
|
+
elements_json: elements_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
|
|
195
207
|
success: true,
|
|
196
208
|
_padding1: [0u8; 7],
|
|
197
209
|
})))
|
|
@@ -134,8 +134,8 @@ mod tests {
|
|
|
134
134
|
// Test size
|
|
135
135
|
assert_eq!(
|
|
136
136
|
std::mem::size_of::<CExtractionResult>(),
|
|
137
|
-
|
|
138
|
-
"CExtractionResult must be exactly 104 bytes"
|
|
137
|
+
112,
|
|
138
|
+
"CExtractionResult must be exactly 112 bytes"
|
|
139
139
|
);
|
|
140
140
|
|
|
141
141
|
// Test alignment
|
|
@@ -197,6 +197,7 @@ mod tests {
|
|
|
197
197
|
images_json: ptr::null_mut(),
|
|
198
198
|
page_structure_json: ptr::null_mut(),
|
|
199
199
|
pages_json: ptr::null_mut(),
|
|
200
|
+
elements_json: ptr::null_mut(),
|
|
200
201
|
success: true,
|
|
201
202
|
_padding1: [0u8; 7],
|
|
202
203
|
}))
|
|
@@ -510,6 +511,7 @@ mod tests {
|
|
|
510
511
|
images_json: ptr::null_mut(),
|
|
511
512
|
page_structure_json: ptr::null_mut(),
|
|
512
513
|
pages_json: ptr::null_mut(),
|
|
514
|
+
elements_json: ptr::null_mut(),
|
|
513
515
|
success: true,
|
|
514
516
|
_padding1: [0u8; 7],
|
|
515
517
|
}));
|
|
@@ -522,7 +524,7 @@ mod tests {
|
|
|
522
524
|
#[test]
|
|
523
525
|
fn test_extraction_result_free_all_fields_allocated() {
|
|
524
526
|
unsafe {
|
|
525
|
-
// Test freeing a result where ALL 12 string fields are allocated
|
|
527
|
+
// Test freeing a result where ALL 13 string fields are allocated
|
|
526
528
|
// This verifies that kreuzberg_free_result properly frees all fields
|
|
527
529
|
let result = Box::into_raw(Box::new(CExtractionResult {
|
|
528
530
|
content: CString::new("test content").unwrap().into_raw(),
|
|
@@ -537,11 +539,12 @@ mod tests {
|
|
|
537
539
|
images_json: CString::new("[{\"data\":\"base64\"}]").unwrap().into_raw(),
|
|
538
540
|
page_structure_json: CString::new("{\"pages\":1}").unwrap().into_raw(),
|
|
539
541
|
pages_json: CString::new("[{\"page\":1,\"content\":\"test\"}]").unwrap().into_raw(),
|
|
542
|
+
elements_json: CString::new("[]").unwrap().into_raw(),
|
|
540
543
|
success: true,
|
|
541
544
|
_padding1: [0u8; 7],
|
|
542
545
|
}));
|
|
543
546
|
|
|
544
|
-
// Should properly free all 12 allocated string fields without leaking memory
|
|
547
|
+
// Should properly free all 13 allocated string fields without leaking memory
|
|
545
548
|
kreuzberg_free_result(result);
|
|
546
549
|
}
|
|
547
550
|
}
|
|
@@ -621,7 +624,7 @@ mod tests {
|
|
|
621
624
|
/// Test CExtractionResult size exactly matches FFI contract
|
|
622
625
|
#[test]
|
|
623
626
|
fn test_c_extraction_result_size() {
|
|
624
|
-
assert_eq!(std::mem::size_of::<CExtractionResult>(), 104);
|
|
627
|
+
assert_eq!(std::mem::size_of::<CExtractionResult>(), 112);
|
|
625
628
|
assert_eq!(std::mem::align_of::<CExtractionResult>(), 8);
|
|
626
629
|
}
|
|
627
630
|
|
|
@@ -146,7 +146,7 @@ pub unsafe extern "C" fn kreuzberg_clone_string(s: *const c_char) -> *mut c_char
|
|
|
146
146
|
///
|
|
147
147
|
/// # Memory Layout
|
|
148
148
|
///
|
|
149
|
-
/// This function frees all 12 string fields in CExtractionResult:
|
|
149
|
+
/// This function frees all 13 string fields in CExtractionResult:
|
|
150
150
|
/// 1. content
|
|
151
151
|
/// 2. mime_type
|
|
152
152
|
/// 3. language
|
|
@@ -159,6 +159,7 @@ pub unsafe extern "C" fn kreuzberg_clone_string(s: *const c_char) -> *mut c_char
|
|
|
159
159
|
/// 10. images_json
|
|
160
160
|
/// 11. page_structure_json (FIXED: was missing before PR #3)
|
|
161
161
|
/// 12. pages_json (FIXED: was missing before PR #3)
|
|
162
|
+
/// 13. elements_json (ADDED: for element-based extraction support)
|
|
162
163
|
///
|
|
163
164
|
/// # Example (C)
|
|
164
165
|
///
|
|
@@ -209,6 +210,9 @@ pub unsafe extern "C" fn kreuzberg_free_result(result: *mut CExtractionResult) {
|
|
|
209
210
|
if !result_box.pages_json.is_null() {
|
|
210
211
|
unsafe { drop(CString::from_raw(result_box.pages_json)) };
|
|
211
212
|
}
|
|
213
|
+
if !result_box.elements_json.is_null() {
|
|
214
|
+
unsafe { drop(CString::from_raw(result_box.elements_json)) };
|
|
215
|
+
}
|
|
212
216
|
}
|
|
213
217
|
}
|
|
214
218
|
|
|
@@ -232,6 +236,7 @@ mod tests {
|
|
|
232
236
|
images_json: CString::new("[]").unwrap().into_raw(),
|
|
233
237
|
page_structure_json: CString::new("{}").unwrap().into_raw(),
|
|
234
238
|
pages_json: CString::new("[]").unwrap().into_raw(),
|
|
239
|
+
elements_json: CString::new("[]").unwrap().into_raw(),
|
|
235
240
|
success: true,
|
|
236
241
|
_padding1: [0u8; 7],
|
|
237
242
|
}))
|
|
@@ -252,6 +257,7 @@ mod tests {
|
|
|
252
257
|
images_json: ptr::null_mut(),
|
|
253
258
|
page_structure_json: ptr::null_mut(),
|
|
254
259
|
pages_json: ptr::null_mut(),
|
|
260
|
+
elements_json: ptr::null_mut(),
|
|
255
261
|
success: true,
|
|
256
262
|
_padding1: [0u8; 7],
|
|
257
263
|
}))
|
|
@@ -343,6 +349,34 @@ mod tests {
|
|
|
343
349
|
images_json: ptr::null_mut(),
|
|
344
350
|
page_structure_json: CString::new("{\"pages\": []}").unwrap().into_raw(),
|
|
345
351
|
pages_json: CString::new("[{\"content\": \"page 1\"}]").unwrap().into_raw(),
|
|
352
|
+
elements_json: ptr::null_mut(),
|
|
353
|
+
success: true,
|
|
354
|
+
_padding1: [0u8; 7],
|
|
355
|
+
}));
|
|
356
|
+
|
|
357
|
+
unsafe { kreuzberg_free_result(result) };
|
|
358
|
+
// If we get here without crashing or leaking, the test passed
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
#[test]
|
|
362
|
+
fn test_free_result_elements_json() {
|
|
363
|
+
// Test: ensure elements_json is freed
|
|
364
|
+
let result = Box::into_raw(Box::new(CExtractionResult {
|
|
365
|
+
content: CString::new("test").unwrap().into_raw(),
|
|
366
|
+
mime_type: CString::new("text/plain").unwrap().into_raw(),
|
|
367
|
+
language: ptr::null_mut(),
|
|
368
|
+
date: ptr::null_mut(),
|
|
369
|
+
subject: ptr::null_mut(),
|
|
370
|
+
tables_json: ptr::null_mut(),
|
|
371
|
+
detected_languages_json: ptr::null_mut(),
|
|
372
|
+
metadata_json: ptr::null_mut(),
|
|
373
|
+
chunks_json: ptr::null_mut(),
|
|
374
|
+
images_json: ptr::null_mut(),
|
|
375
|
+
page_structure_json: ptr::null_mut(),
|
|
376
|
+
pages_json: ptr::null_mut(),
|
|
377
|
+
elements_json: CString::new(r#"[{"element_id":"abc","element_type":"title","text":"Hello"}]"#)
|
|
378
|
+
.unwrap()
|
|
379
|
+
.into_raw(),
|
|
346
380
|
success: true,
|
|
347
381
|
_padding1: [0u8; 7],
|
|
348
382
|
}));
|
|
@@ -51,7 +51,7 @@ impl Drop for CStringGuard {
|
|
|
51
51
|
/// # Memory Layout
|
|
52
52
|
///
|
|
53
53
|
/// Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
|
|
54
|
-
/// Field order: 12 pointers (8 bytes each) + 1 bool + 7 bytes padding = 104 bytes total
|
|
54
|
+
/// Field order: 13 pointers (8 bytes each) + 1 bool + 7 bytes padding = 112 bytes total
|
|
55
55
|
///
|
|
56
56
|
/// The `#[repr(C)]` attribute ensures the struct follows C's memory layout rules:
|
|
57
57
|
/// - Fields are laid out in order
|
|
@@ -88,6 +88,8 @@ pub struct CExtractionResult {
|
|
|
88
88
|
pub page_structure_json: *mut c_char,
|
|
89
89
|
/// Per-page content as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
90
90
|
pub pages_json: *mut c_char,
|
|
91
|
+
/// Semantic elements as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
|
|
92
|
+
pub elements_json: *mut c_char,
|
|
91
93
|
/// Whether extraction was successful
|
|
92
94
|
pub success: bool,
|
|
93
95
|
/// Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
|
|
@@ -150,7 +152,7 @@ pub struct CBatchResult {
|
|
|
150
152
|
const _: () = {
|
|
151
153
|
const fn assert_c_extraction_result_size() {
|
|
152
154
|
const SIZE: usize = std::mem::size_of::<CExtractionResult>();
|
|
153
|
-
const _: () = assert!(SIZE == 104, "CExtractionResult size must be 104 bytes");
|
|
155
|
+
const _: () = assert!(SIZE == 112, "CExtractionResult size must be 112 bytes");
|
|
154
156
|
}
|
|
155
157
|
|
|
156
158
|
const fn assert_c_extraction_result_alignment() {
|
|
@@ -195,8 +197,8 @@ mod tests {
|
|
|
195
197
|
fn test_c_extraction_result_size() {
|
|
196
198
|
assert_eq!(
|
|
197
199
|
std::mem::size_of::<CExtractionResult>(),
|
|
198
|
-
|
|
199
|
-
"CExtractionResult must be exactly 104 bytes"
|
|
200
|
+
112,
|
|
201
|
+
"CExtractionResult must be exactly 112 bytes"
|
|
200
202
|
);
|
|
201
203
|
}
|
|
202
204
|
|
|
@@ -327,7 +329,8 @@ mod tests {
|
|
|
327
329
|
assert_eq!(offset_of!(CExtractionResult, images_json), 72);
|
|
328
330
|
assert_eq!(offset_of!(CExtractionResult, page_structure_json), 80);
|
|
329
331
|
assert_eq!(offset_of!(CExtractionResult, pages_json), 88);
|
|
330
|
-
assert_eq!(offset_of!(CExtractionResult, success), 96);
|
|
332
|
+
assert_eq!(offset_of!(CExtractionResult, elements_json), 96);
|
|
333
|
+
assert_eq!(offset_of!(CExtractionResult, success), 104);
|
|
331
334
|
}
|
|
332
335
|
|
|
333
336
|
/// Verify field offsets in CBatchResult match expectations
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.2.1
|
|
4
|
+
version: 4.2.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-01-
|
|
11
|
+
date: 2026-01-28 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -363,6 +363,7 @@ files:
|
|
|
363
363
|
- vendor/kreuzberg/src/api/error.rs
|
|
364
364
|
- vendor/kreuzberg/src/api/handlers.rs
|
|
365
365
|
- vendor/kreuzberg/src/api/mod.rs
|
|
366
|
+
- vendor/kreuzberg/src/api/openapi.rs
|
|
366
367
|
- vendor/kreuzberg/src/api/router.rs
|
|
367
368
|
- vendor/kreuzberg/src/api/startup.rs
|
|
368
369
|
- vendor/kreuzberg/src/api/types.rs
|