RubyGems - kreuzberg - Versions diffs - 4.2.1 → 4.2.3 - Mend

kreuzberg 4.2.1 → 4.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)

checksums.yaml +4 -4
data/Gemfile.lock +9 -9
data/README.md +1 -1
data/lib/kreuzberg/api_proxy.rb +3 -3
data/lib/kreuzberg/cli_proxy.rb +2 -2
data/lib/kreuzberg/config.rb +4 -20
data/lib/kreuzberg/mcp_proxy.rb +3 -3
data/lib/kreuzberg/version.rb +1 -1
data/spec/binding/config_spec.rb +1 -1
data/spec/unit/config/extraction_config_spec.rb +2 -2
data/vendor/Cargo.toml +1 -1
data/vendor/kreuzberg/Cargo.toml +3 -2
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/api/error.rs +89 -0
data/vendor/kreuzberg/src/api/handlers.rs +153 -32
data/vendor/kreuzberg/src/api/mod.rs +2 -0
data/vendor/kreuzberg/src/api/openapi.rs +141 -0
data/vendor/kreuzberg/src/api/router.rs +24 -2
data/vendor/kreuzberg/src/api/startup.rs +11 -5
data/vendor/kreuzberg/src/api/types.rs +50 -4
data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
data/vendor/kreuzberg/src/mcp/format.rs +46 -57
data/vendor/kreuzberg/src/mcp/server.rs +2 -8
data/vendor/kreuzberg/src/mcp/tools/extraction.rs +1 -7
data/vendor/kreuzberg/tests/api_chunk.rs +25 -0
data/vendor/kreuzberg/tests/api_embed.rs +60 -0
data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
data/vendor/kreuzberg-ffi/src/types.rs +8 -5
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +3 -2

data/vendor/kreuzberg/src/mcp/server.rs CHANGED Viewed

@@ -144,7 +144,7 @@ impl KreuzbergMcp {
         Parameters(params): Parameters<super::params::BatchExtractFilesParams>,
     ) -> Result<CallToolResult, rmcp::ErrorData> {
         use super::errors::map_kreuzberg_error_to_mcp;
-        use super::format::{build_config, format_extraction_result};
+        use super::format::build_config;
         use crate::{batch_extract_file, batch_extract_file_sync};
         let config =
@@ -158,13 +158,7 @@ impl KreuzbergMcp {
             batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
         };
-        let mut response = String::new();
-        for (i, result) in results.iter().enumerate() {
-            response.push_str(&format!("=== Document {}: {} ===\n", i + 1, params.paths[i]));
-            response.push_str(&format_extraction_result(result));
-            response.push_str("\n\n");
-        }
+        let response = serde_json::to_string_pretty(&results).unwrap_or_default();
         Ok(CallToolResult::success(vec![Content::text(response)]))
     }

data/vendor/kreuzberg/src/mcp/tools/extraction.rs CHANGED Viewed

@@ -99,13 +99,7 @@ pub(in crate::mcp) trait ExtractionTool {
             batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
         };
-        let mut response = String::new();
-        for (i, result) in results.iter().enumerate() {
-            response.push_str(&format!("=== Document {}: {} ===\n", i + 1, params.paths[i]));
-            response.push_str(&format_extraction_result(result));
-            response.push_str("\n\n");
-        }
+        let response = serde_json::to_string_pretty(&results).unwrap_or_default();
         Ok(CallToolResult::success(vec![Content::text(response)]))
     }
 }

data/vendor/kreuzberg/tests/api_chunk.rs CHANGED Viewed

@@ -321,3 +321,28 @@ async fn test_chunk_custom_config() {
     assert_eq!(chunk_response.config.overlap, 5);
     assert!(!chunk_response.config.trim);
 }
+#[tokio::test]
+async fn test_chunk_rejects_json_array() {
+    let app = create_router(ExtractionConfig::default());
+    // Send a JSON array instead of object
+    let response = app
+        .oneshot(
+            Request::builder()
+                .uri("/chunk")
+                .method("POST")
+                .header("content-type", "application/json")
+                .body(Body::from(r#"[["text"], {"text": "content"}]"#))
+                .expect("Operation failed"),
+        )
+        .await
+        .expect("Operation failed");
+    // Should reject with 400 or 422, NOT 200
+    assert!(
+        response.status() == StatusCode::BAD_REQUEST || response.status() == StatusCode::UNPROCESSABLE_ENTITY,
+        "Expected 400 or 422, got {}",
+        response.status()
+    );
+}

data/vendor/kreuzberg/tests/api_embed.rs CHANGED Viewed

@@ -255,6 +255,66 @@ async fn test_embed_malformed_json() {
     assert_eq!(response.status(), StatusCode::BAD_REQUEST);
 }
+/// Test embed endpoint rejects JSON array at root level.
+#[tokio::test]
+async fn test_embed_rejects_json_array() {
+    let app = create_router(ExtractionConfig::default());
+    // Send a JSON array instead of object
+    let response = app
+        .oneshot(
+            Request::builder()
+                .method("POST")
+                .uri("/embed")
+                .header("content-type", "application/json")
+                .body(Body::from(r#"[["text1"], {"texts": ["text2"]}]"#))
+                .expect("Operation failed"),
+        )
+        .await
+        .expect("Operation failed");
+    // Should reject with 400 or 422, NOT 200
+    assert!(
+        response.status() == StatusCode::BAD_REQUEST || response.status() == StatusCode::UNPROCESSABLE_ENTITY,
+        "Expected 400 or 422, got {}",
+        response.status()
+    );
+}
+/// Test embed endpoint rejects simple JSON array with strings.
+#[tokio::test]
+async fn test_embed_rejects_simple_json_array() {
+    let app = create_router(ExtractionConfig::default());
+    // Send a simple string array instead of object with texts field
+    let response = app
+        .oneshot(
+            Request::builder()
+                .method("POST")
+                .uri("/embed")
+                .header("content-type", "application/json")
+                .body(Body::from(r#"["text1", "text2", "text3"]"#))
+                .expect("Operation failed"),
+        )
+        .await
+        .expect("Operation failed");
+    assert_eq!(response.status(), StatusCode::BAD_REQUEST);
+    // Check that error response contains helpful message
+    let body = axum::body::to_bytes(response.into_body(), usize::MAX)
+        .await
+        .expect("Failed to read response body");
+    let error_response: serde_json::Value = serde_json::from_slice(&body).expect("Failed to parse error response");
+    assert!(
+        error_response["message"]
+            .as_str()
+            .map(|msg| msg.contains("array") || msg.contains("object"))
+            .unwrap_or(false)
+    );
+}
 /// Test embed endpoint preserves embedding vector values across calls.
 #[tokio::test]
 async fn test_embed_deterministic() {

data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs CHANGED Viewed

@@ -88,3 +88,59 @@ fn test_xlsx_minimal_metadata_extraction() {
     println!("✅ XLSX minimal metadata extraction test passed!");
 }
+/// Test for issue #331: OOM with XLSX files containing Excel Solver add-in data
+///
+/// This test reproduces the issue where Excel Solver stores configuration data
+/// in cells at extreme positions (XFD1048550-1048575 = column 16384, rows near 1M).
+/// The sheet dimension is set to "A1:XFD1048575", which could cause Kreuzberg
+/// to attempt allocating memory for ~17 trillion cells (16384 × 1048575).
+///
+/// Expected behavior: Should handle extreme dimensions gracefully without OOM.
+/// The file is only 6.8KB and contains minimal actual data.
+#[test]
+fn test_xlsx_excel_solver_extreme_dimensions_no_oom() {
+    let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+        .parent()
+        .expect("Operation failed")
+        .parent()
+        .expect("Operation failed");
+    let test_file = workspace_root.join("tests/fixtures/xlsx-oom-repro/kreuzberg-oom-repro.xlsx");
+    if !test_file.exists() {
+        println!("Skipping test: Test file not found at {:?}", test_file);
+        println!("Run: node tests/fixtures/xlsx-oom-repro/generate-oom-xlsx.mjs");
+        return;
+    }
+    let file_path = test_file.to_str().expect("File path should be valid UTF-8");
+    // This should NOT cause OOM even though dimension claims A1:XFD1048575
+    // The actual data is minimal (only ~26 cells with Solver metadata)
+    let result = read_excel_file(file_path).expect("Should extract XLSX with extreme dimensions without OOM");
+    // Verify we got the actual data, not a massive allocation
+    assert!(!result.sheets.is_empty(), "Should have at least one sheet");
+    // The file has normal cells A1, B1 plus Solver cells at extreme positions
+    // Verify we extracted something reasonable, not 17 trillion cells
+    let sheet = &result.sheets[0];
+    assert!(
+        sheet.markdown.len() < 10000,
+        "Sheet markdown content should be small (< 10000 chars), not massive. Got {} chars",
+        sheet.markdown.len()
+    );
+    // Verify metadata was extracted
+    assert!(
+        result.metadata.contains_key("sheet_count"),
+        "Should have sheet_count metadata"
+    );
+    println!("✅ XLSX Excel Solver extreme dimensions test passed!");
+    println!(
+        "   Sheet markdown length: {} chars (reasonable size)",
+        sheet.markdown.len()
+    );
+    println!("   Successfully handled dimension A1:XFD1048575 without OOM");
+}

data/vendor/kreuzberg-ffi/kreuzberg.h CHANGED Viewed

@@ -223,7 +223,7 @@ typedef struct CErrorDetails {
  * # Memory Layout
  *
  * Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
- * Field order: 12 pointers (8 bytes each) + 1 bool + 7 bytes padding = 104 bytes total
+ * Field order: 13 pointers (8 bytes each) + 1 bool + 7 bytes padding = 112 bytes total
  *
  * The `#[repr(C)]` attribute ensures the struct follows C's memory layout rules:
  * - Fields are laid out in order
@@ -284,6 +284,10 @@ typedef struct CExtractionResult {
    * Per-page content as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
    */
   char *pages_json;
+  /**
+   * Semantic elements as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
+   */
+  char *elements_json;
   /**
    * Whether extraction was successful
    */
@@ -1608,7 +1612,7 @@ char *kreuzberg_clone_string(const char *s);
  *
  * # Memory Layout
  *
- * This function frees all 12 string fields in CExtractionResult:
+ * This function frees all 13 string fields in CExtractionResult:
  * 1. content
  * 2. mime_type
  * 3. language
@@ -1621,6 +1625,7 @@ char *kreuzberg_clone_string(const char *s);
  * 10. images_json
  * 11. page_structure_json (FIXED: was missing before PR #3)
  * 12. pages_json (FIXED: was missing before PR #3)
+ * 13. elements_json (ADDED: for element-based extraction support)
  *
  * # Example (C)
  *

data/vendor/kreuzberg-ffi/src/helpers.rs CHANGED Viewed

@@ -67,7 +67,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
         images,
         pages,
         djot_content: _,
-        elements: _,
+        elements,
     } = result;
     let sanitized_content = if content.contains('\0') {
@@ -179,6 +179,17 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
         _ => None,
     };
+    let elements_json_guard = match elements {
+        Some(elements) if !elements.is_empty() => {
+            let json =
+                serde_json::to_string(&elements).map_err(|e| format!("Failed to serialize elements to JSON: {}", e))?;
+            Some(CStringGuard::new(CString::new(json).map_err(|e| {
+                format!("Failed to convert elements JSON to C string: {}", e)
+            })?))
+        }
+        _ => None,
+    };
     Ok(Box::into_raw(Box::new(CExtractionResult {
         content: content_guard.into_raw(),
         mime_type: mime_type_guard.into_raw(),
@@ -192,6 +203,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
         images_json: images_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
         page_structure_json: page_structure_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
         pages_json: pages_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
+        elements_json: elements_json_guard.map_or(ptr::null_mut(), |g| g.into_raw()),
         success: true,
         _padding1: [0u8; 7],
     })))

data/vendor/kreuzberg-ffi/src/lib.rs CHANGED Viewed

@@ -134,8 +134,8 @@ mod tests {
         // Test size
         assert_eq!(
             std::mem::size_of::<CExtractionResult>(),
-            104,
-            "CExtractionResult must be exactly 104 bytes"
+            112,
+            "CExtractionResult must be exactly 112 bytes"
         );
         // Test alignment
@@ -197,6 +197,7 @@ mod tests {
             images_json: ptr::null_mut(),
             page_structure_json: ptr::null_mut(),
             pages_json: ptr::null_mut(),
+            elements_json: ptr::null_mut(),
             success: true,
             _padding1: [0u8; 7],
         }))
@@ -510,6 +511,7 @@ mod tests {
                 images_json: ptr::null_mut(),
                 page_structure_json: ptr::null_mut(),
                 pages_json: ptr::null_mut(),
+                elements_json: ptr::null_mut(),
                 success: true,
                 _padding1: [0u8; 7],
             }));
@@ -522,7 +524,7 @@ mod tests {
     #[test]
     fn test_extraction_result_free_all_fields_allocated() {
         unsafe {
-            // Test freeing a result where ALL 12 string fields are allocated
+            // Test freeing a result where ALL 13 string fields are allocated
             // This verifies that kreuzberg_free_result properly frees all fields
             let result = Box::into_raw(Box::new(CExtractionResult {
                 content: CString::new("test content").unwrap().into_raw(),
@@ -537,11 +539,12 @@ mod tests {
                 images_json: CString::new("[{\"data\":\"base64\"}]").unwrap().into_raw(),
                 page_structure_json: CString::new("{\"pages\":1}").unwrap().into_raw(),
                 pages_json: CString::new("[{\"page\":1,\"content\":\"test\"}]").unwrap().into_raw(),
+                elements_json: CString::new("[]").unwrap().into_raw(),
                 success: true,
                 _padding1: [0u8; 7],
             }));
-            // Should properly free all 12 allocated string fields without leaking memory
+            // Should properly free all 13 allocated string fields without leaking memory
             kreuzberg_free_result(result);
         }
     }
@@ -621,7 +624,7 @@ mod tests {
     /// Test CExtractionResult size exactly matches FFI contract
     #[test]
     fn test_c_extraction_result_size() {
-        assert_eq!(std::mem::size_of::<CExtractionResult>(), 104);
+        assert_eq!(std::mem::size_of::<CExtractionResult>(), 112);
         assert_eq!(std::mem::align_of::<CExtractionResult>(), 8);
     }

data/vendor/kreuzberg-ffi/src/memory.rs CHANGED Viewed

@@ -146,7 +146,7 @@ pub unsafe extern "C" fn kreuzberg_clone_string(s: *const c_char) -> *mut c_char
 ///
 /// # Memory Layout
 ///
-/// This function frees all 12 string fields in CExtractionResult:
+/// This function frees all 13 string fields in CExtractionResult:
 /// 1. content
 /// 2. mime_type
 /// 3. language
@@ -159,6 +159,7 @@ pub unsafe extern "C" fn kreuzberg_clone_string(s: *const c_char) -> *mut c_char
 /// 10. images_json
 /// 11. page_structure_json (FIXED: was missing before PR #3)
 /// 12. pages_json (FIXED: was missing before PR #3)
+/// 13. elements_json (ADDED: for element-based extraction support)
 ///
 /// # Example (C)
 ///
@@ -209,6 +210,9 @@ pub unsafe extern "C" fn kreuzberg_free_result(result: *mut CExtractionResult) {
         if !result_box.pages_json.is_null() {
             unsafe { drop(CString::from_raw(result_box.pages_json)) };
         }
+        if !result_box.elements_json.is_null() {
+            unsafe { drop(CString::from_raw(result_box.elements_json)) };
+        }
     }
 }
@@ -232,6 +236,7 @@ mod tests {
             images_json: CString::new("[]").unwrap().into_raw(),
             page_structure_json: CString::new("{}").unwrap().into_raw(),
             pages_json: CString::new("[]").unwrap().into_raw(),
+            elements_json: CString::new("[]").unwrap().into_raw(),
             success: true,
             _padding1: [0u8; 7],
         }))
@@ -252,6 +257,7 @@ mod tests {
             images_json: ptr::null_mut(),
             page_structure_json: ptr::null_mut(),
             pages_json: ptr::null_mut(),
+            elements_json: ptr::null_mut(),
             success: true,
             _padding1: [0u8; 7],
         }))
@@ -343,6 +349,34 @@ mod tests {
             images_json: ptr::null_mut(),
             page_structure_json: CString::new("{\"pages\": []}").unwrap().into_raw(),
             pages_json: CString::new("[{\"content\": \"page 1\"}]").unwrap().into_raw(),
+            elements_json: ptr::null_mut(),
+            success: true,
+            _padding1: [0u8; 7],
+        }));
+        unsafe { kreuzberg_free_result(result) };
+        // If we get here without crashing or leaking, the test passed
+    }
+    #[test]
+    fn test_free_result_elements_json() {
+        // Test: ensure elements_json is freed
+        let result = Box::into_raw(Box::new(CExtractionResult {
+            content: CString::new("test").unwrap().into_raw(),
+            mime_type: CString::new("text/plain").unwrap().into_raw(),
+            language: ptr::null_mut(),
+            date: ptr::null_mut(),
+            subject: ptr::null_mut(),
+            tables_json: ptr::null_mut(),
+            detected_languages_json: ptr::null_mut(),
+            metadata_json: ptr::null_mut(),
+            chunks_json: ptr::null_mut(),
+            images_json: ptr::null_mut(),
+            page_structure_json: ptr::null_mut(),
+            pages_json: ptr::null_mut(),
+            elements_json: CString::new(r#"[{"element_id":"abc","element_type":"title","text":"Hello"}]"#)
+                .unwrap()
+                .into_raw(),
             success: true,
             _padding1: [0u8; 7],
         }));

data/vendor/kreuzberg-ffi/src/types.rs CHANGED Viewed

@@ -51,7 +51,7 @@ impl Drop for CStringGuard {
 /// # Memory Layout
 ///
 /// Must be kept in sync with the Java side's MemoryLayout definition in KreuzbergFFI.java
-/// Field order: 12 pointers (8 bytes each) + 1 bool + 7 bytes padding = 104 bytes total
+/// Field order: 13 pointers (8 bytes each) + 1 bool + 7 bytes padding = 112 bytes total
 ///
 /// The `#[repr(C)]` attribute ensures the struct follows C's memory layout rules:
 /// - Fields are laid out in order
@@ -88,6 +88,8 @@ pub struct CExtractionResult {
     pub page_structure_json: *mut c_char,
     /// Per-page content as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
     pub pages_json: *mut c_char,
+    /// Semantic elements as JSON array (null-terminated string, or NULL if not available, must be freed with kreuzberg_free_string)
+    pub elements_json: *mut c_char,
     /// Whether extraction was successful
     pub success: bool,
     /// Padding to match Java MemoryLayout (7 bytes padding to align to 8-byte boundary)
@@ -150,7 +152,7 @@ pub struct CBatchResult {
 const _: () = {
     const fn assert_c_extraction_result_size() {
         const SIZE: usize = std::mem::size_of::<CExtractionResult>();
-        const _: () = assert!(SIZE == 104, "CExtractionResult size must be 104 bytes");
+        const _: () = assert!(SIZE == 112, "CExtractionResult size must be 112 bytes");
     }
     const fn assert_c_extraction_result_alignment() {
@@ -195,8 +197,8 @@ mod tests {
     fn test_c_extraction_result_size() {
         assert_eq!(
             std::mem::size_of::<CExtractionResult>(),
-            104,
-            "CExtractionResult must be exactly 104 bytes"
+            112,
+            "CExtractionResult must be exactly 112 bytes"
         );
     }
@@ -327,7 +329,8 @@ mod tests {
         assert_eq!(offset_of!(CExtractionResult, images_json), 72);
         assert_eq!(offset_of!(CExtractionResult, page_structure_json), 80);
         assert_eq!(offset_of!(CExtractionResult, pages_json), 88);
-        assert_eq!(offset_of!(CExtractionResult, success), 96);
+        assert_eq!(offset_of!(CExtractionResult, elements_json), 96);
+        assert_eq!(offset_of!(CExtractionResult, success), 104);
     }
     /// Verify field offsets in CBatchResult match expectations

data/vendor/kreuzberg-tesseract/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg-tesseract"
-version = "4.2.1"
+version = "4.2.3"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kreuzberg
 version: !ruby/object:Gem::Version
-  version: 4.2.1
+  version: 4.2.3
 platform: ruby
 authors:
 - Na'aman Hirschfeld
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-01-27 00:00:00.000000000 Z
+date: 2026-01-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -363,6 +363,7 @@ files:
 - vendor/kreuzberg/src/api/error.rs
 - vendor/kreuzberg/src/api/handlers.rs
 - vendor/kreuzberg/src/api/mod.rs
+- vendor/kreuzberg/src/api/openapi.rs
 - vendor/kreuzberg/src/api/router.rs
 - vendor/kreuzberg/src/api/startup.rs
 - vendor/kreuzberg/src/api/types.rs