RubyGems - kreuzberg - Versions diffs - 4.2.2 → 4.2.3 - Mend

kreuzberg 4.2.2 → 4.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

checksums.yaml +4 -4
data/Gemfile.lock +9 -9
data/README.md +1 -1
data/lib/kreuzberg/api_proxy.rb +3 -3
data/lib/kreuzberg/cli_proxy.rb +2 -2
data/lib/kreuzberg/mcp_proxy.rb +3 -3
data/lib/kreuzberg/version.rb +1 -1
data/vendor/Cargo.toml +1 -1
data/vendor/kreuzberg/Cargo.toml +2 -2
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/api/error.rs +29 -0
data/vendor/kreuzberg/src/mcp/format.rs +46 -57
data/vendor/kreuzberg/src/mcp/server.rs +2 -8
data/vendor/kreuzberg/src/mcp/tools/extraction.rs +1 -7
data/vendor/kreuzberg/tests/api_chunk.rs +25 -0
data/vendor/kreuzberg/tests/api_embed.rs +60 -0
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1765714785cbe89567dcb13ed0c1e1b79c2da7a2143a0d0b4653c5578a3ada84
-  data.tar.gz: 64d6db5e4d88992920f37fe9a3e28ab08b5bdd0b28385da570e8207e67d90f34
+  metadata.gz: 2c6fc44b151014f7e56c82bd191f55244a4294a259b24b95fc494dba6f8eaba6
+  data.tar.gz: 6e40a732814ff3e2a164e718cdb1c7a6ae838b2b2210a66b232f8675c7f79a80
 SHA512:
-  metadata.gz: cbb71395a285ddb1a74101fc935ebb8266b81c6172043a128721ab37fad583c7202e559f9e8cb2534bf110721bf20e2d0cbe6838554c772831c56bc09583bf75
-  data.tar.gz: b752cf56da8810211e5efd5e5d69f136eb7d0a3d5e27e985b81dff18bde442f0033b15962823ad7e3c5a27e080d02b6a6df1726bffe4aa21eaf89f56a5c6b56f
+  metadata.gz: f9c3a45f31c3ad9e3857872d8705b397b40c4317844ef421f4da4c2918e57411f5a626df4f6706d7db4916f33b8644c736e7b41508b398fd0197f1a87170fa3c
+  data.tar.gz: 8b05a75be261dbe583c4873d9d21079efff97d6c9c0340bbd8a73a43c9d15955431f4de20cd8b4a8b7956872f52e4467c253f5da03177a1e7d3b6a10d202b59d

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    kreuzberg (4.2.2)
+    kreuzberg (4.2.3)
 GEM
   remote: https://rubygems.org/
@@ -61,7 +61,7 @@ GEM
     parser (3.3.10.1)
       ast (~> 2.4.1)
       racc
-    prism (1.8.0)
+    prism (1.9.0)
     pry (0.16.0)
       coderay (~> 1.1)
       method_source (~> 1.0)
@@ -98,7 +98,7 @@ GEM
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.13.0)
     rspec-support (3.13.6)
-    rubocop (1.82.1)
+    rubocop (1.84.0)
       json (~> 2.3)
       language_server-protocol (~> 3.17.0.2)
       lint_roller (~> 1.1.0)
@@ -106,7 +106,7 @@ GEM
       parser (>= 3.3.0.2)
       rainbow (>= 2.2.2, < 4.0)
       regexp_parser (>= 2.9.3, < 3.0)
-      rubocop-ast (>= 1.48.0, < 2.0)
+      rubocop-ast (>= 1.49.0, < 2.0)
       ruby-progressbar (~> 1.7)
       unicode-display_width (>= 2.4.0, < 4.0)
     rubocop-ast (1.49.0)
@@ -121,7 +121,7 @@ GEM
       rubocop (~> 1.81)
     ruby-progressbar (1.13.0)
     securerandom (0.4.1)
-    sorbet-runtime (0.6.12897)
+    sorbet-runtime (0.6.12903)
     steep (1.10.0)
       activesupport (>= 5.1)
       concurrent-ruby (>= 1.1.10)
@@ -207,7 +207,7 @@ CHECKSUMS
   i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
   io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
   json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
-  kreuzberg (4.2.2)
+  kreuzberg (4.2.3)
   language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
   lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
   listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -217,7 +217,7 @@ CHECKSUMS
   mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
   parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
   parser (3.3.10.1) sha256=06f6a725d2cd91e5e7f2b7c32ba143631e1f7c8ae2fb918fc4cebec187e6a688
-  prism (1.8.0) sha256=84453a16ef5530ea62c5f03ec16b52a459575ad4e7b9c2b360fd8ce2c39c1254
+  prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
   pry (0.16.0) sha256=d76c69065698ed1f85e717bd33d7942c38a50868f6b0673c636192b3d1b6054e
   pry-byebug (3.12.0) sha256=594e094ae8a8390a7ad4c7b36ae36e13304ed02664c67417d108dc5f7213d1b7
   racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
@@ -236,13 +236,13 @@ CHECKSUMS
   rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
   rspec-mocks (3.13.7) sha256=0979034e64b1d7a838aaaddf12bf065ea4dc40ef3d4c39f01f93ae2c66c62b1c
   rspec-support (3.13.6) sha256=2e8de3702427eab064c9352fe74488cc12a1bfae887ad8b91cba480ec9f8afb2
-  rubocop (1.82.1) sha256=09f1a6a654a960eda767aebea33e47603080f8e9c9a3f019bf9b94c9cab5e273
+  rubocop (1.84.0) sha256=88dec310153bb685a879f5a7cdb601f6287b8f0ee675d9dc63a17c7204c4190a
   rubocop-ast (1.49.0) sha256=49c3676d3123a0923d333e20c6c2dbaaae2d2287b475273fddee0c61da9f71fd
   rubocop-performance (1.26.1) sha256=cd19b936ff196df85829d264b522fd4f98b6c89ad271fa52744a8c11b8f71834
   rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
   ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
   securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
-  sorbet-runtime (0.6.12897) sha256=0348ab8803c4c3646977fee298083ded9b7e74d5b34b50c567c63eb7e36eb286
+  sorbet-runtime (0.6.12903) sha256=c23968c0dcf5a5db57f32c003fe3db7fb588c168cdd57d92ea4dceaba063118a
   steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
   strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
   terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2

data/README.md CHANGED Viewed

@@ -22,7 +22,7 @@
     <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
   </a>
   <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
-    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.2" alt="Go">
+    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.3" alt="Go">
   </a>
   <a href="https://www.nuget.org/packages/Kreuzberg/">
     <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">

data/lib/kreuzberg/api_proxy.rb CHANGED Viewed

@@ -6,9 +6,9 @@ module Kreuzberg
   # @example Start the server
   # @example With block
   module APIProxy
-    Error = Class.new(Kreuzberg::Errors::Error)
-    MissingBinaryError = Class.new(Error)
-    ServerError = Class.new(Error)
+    class Error < Kreuzberg::Errors::Error; end
+    class MissingBinaryError < Error; end
+    class ServerError < Error; end
     # API server instance
     class Server

data/lib/kreuzberg/cli_proxy.rb CHANGED Viewed

@@ -5,8 +5,8 @@ require 'open3'
 module Kreuzberg
   # @example
   module CLIProxy
-    Error = Class.new(Kreuzberg::Errors::Error)
-    MissingBinaryError = Class.new(Error)
+    class Error < Kreuzberg::Errors::Error; end
+    class MissingBinaryError < Error; end
     # CLI execution error with stderr and exit status
     class CLIExecutionError < Error

data/lib/kreuzberg/mcp_proxy.rb CHANGED Viewed

@@ -6,9 +6,9 @@ require 'json'
 module Kreuzberg
   # @example Start MCP server
   module MCPProxy
-    Error = Class.new(Kreuzberg::Errors::Error)
-    MissingBinaryError = Class.new(Error)
-    ServerError = Class.new(Error)
+    class Error < Kreuzberg::Errors::Error; end
+    class MissingBinaryError < Error; end
+    class ServerError < Error; end
     # MCP server instance
     class Server

data/lib/kreuzberg/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Kreuzberg
-  VERSION = '4.2.2'
+  VERSION = '4.2.3'
 end

data/vendor/Cargo.toml CHANGED Viewed

@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
 resolver = "2"
 [workspace.package]
-version = "4.2.2"
+version = "4.2.3"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

data/vendor/kreuzberg/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg"
-version = "4.2.2"
+version = "4.2.3"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -198,7 +198,7 @@ rake = { version = "0.3.6", optional = true }
 axum = { version = "0.8", features = ["macros", "json", "multipart"], optional = true }
 tower = { version = "0.5", optional = true }
 tower-http = { version = "0.6", features = ["cors", "trace", "limit"], optional = true }
-utoipa = { version = "5.3", features = ["axum_extras"], optional = true }
+utoipa = { version = "5.4", features = ["axum_extras"], optional = true }
 rmcp = { version = "0.14.0", features = [
     "server",
     "macros",

data/vendor/kreuzberg/README.md CHANGED Viewed

@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
 This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
-> **🚀 Version 4.2.2 Release**
+> **🚀 Version 4.2.3 Release**
 > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
 >
 > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.

data/vendor/kreuzberg/src/api/error.rs CHANGED Viewed

@@ -2,6 +2,7 @@
 use axum::{
     Json,
+    body::to_bytes,
     extract::{FromRequest, Request, rejection::JsonRejection},
     http::StatusCode,
     response::{IntoResponse, Response},
@@ -16,6 +17,9 @@ use super::types::ErrorResponse;
 ///
 /// This wraps axum's `Json` extractor but uses `ApiError` as the rejection type,
 /// ensuring that all JSON parsing errors are returned as JSON with proper content type.
+///
+/// Additionally, this extractor validates that the root JSON value is an object (not an array),
+/// which prevents serde from incorrectly deserializing JSON arrays into struct fields.
 #[derive(Debug, Clone, Copy, Default)]
 pub struct JsonApi<T>(pub T);
@@ -27,6 +31,31 @@ where
     type Rejection = ApiError;
     async fn from_request(req: Request, state: &S) -> Result<Self, Self::Rejection> {
+        // First, extract the body to check if it's a valid JSON object (not array)
+        let (parts, body) = req.into_parts();
+        let bytes = to_bytes(body, usize::MAX).await.map_err(|_| {
+            ApiError::new(
+                StatusCode::BAD_REQUEST,
+                KreuzbergError::Other("Failed to read request body".to_string()),
+            )
+        })?;
+        // Validate that the root JSON is an object, not an array
+        if !bytes.is_empty() {
+            let trimmed = std::str::from_utf8(&bytes).unwrap_or("").trim_start();
+            if trimmed.starts_with('[') {
+                return Err(ApiError::new(
+                    StatusCode::BAD_REQUEST,
+                    KreuzbergError::validation(
+                        "Expected JSON object, but received JSON array. \
+                         Please wrap your data in an object with appropriate fields.",
+                    ),
+                ));
+            }
+        }
+        // Reconstruct the request and use the standard Json extractor
+        let req = Request::from_parts(parts, axum::body::Body::from(bytes));
         match Json::<T>::from_request(req, state).await {
             Ok(Json(value)) => Ok(JsonApi(value)),
             Err(rejection) => Err(ApiError::from(rejection)),

data/vendor/kreuzberg/src/mcp/format.rs CHANGED Viewed

@@ -72,28 +72,12 @@ pub(super) fn build_config(
     }
 }
-/// Format extraction result as human-readable text.
+/// Format extraction result as JSON string.
+///
+/// Serializes the full `ExtractionResult` to JSON, ensuring 1:1 parity
+/// with the API and CLI JSON output.
 pub(super) fn format_extraction_result(result: &KreuzbergResult) -> String {
-    let mut response = String::new();
-    response.push_str(&format!("Content ({} characters):\n", result.content.len()));
-    response.push_str(&result.content);
-    response.push_str("\n\n");
-    response.push_str("Metadata:\n");
-    response.push_str(&serde_json::to_string_pretty(&result.metadata).unwrap_or_default());
-    response.push_str("\n\n");
-    if !result.tables.is_empty() {
-        response.push_str(&format!("Tables ({}):\n", result.tables.len()));
-        for (i, table) in result.tables.iter().enumerate() {
-            response.push_str(&format!("\nTable {} (page {}):\n", i + 1, table.page_number));
-            response.push_str(&table.markdown);
-            response.push('\n');
-        }
-    }
-    response
+    serde_json::to_string_pretty(result).unwrap_or_default()
 }
 #[cfg(test)]
@@ -303,7 +287,7 @@ mod tests {
     }
     #[test]
-    fn test_format_extraction_result_with_content() {
+    fn test_format_extraction_result_is_valid_json() {
         let result = KreuzbergResult {
             content: "Sample extracted text".to_string(),
             mime_type: "text/plain".to_string(),
@@ -318,36 +302,27 @@ mod tests {
         };
         let formatted = format_extraction_result(&result);
+        let parsed: serde_json::Value = serde_json::from_str(&formatted).expect("Should be valid JSON");
-        assert!(formatted.contains("Content (21 characters)"));
-        assert!(formatted.contains("Sample extracted text"));
-        assert!(formatted.contains("Metadata:"));
+        assert_eq!(parsed["content"], "Sample extracted text");
+        assert_eq!(parsed["mime_type"], "text/plain");
+        assert!(parsed["metadata"].is_object());
     }
     #[test]
-    fn test_format_extraction_result_with_tables() {
+    fn test_format_extraction_result_includes_tables() {
         let result = KreuzbergResult {
             content: "Document with tables".to_string(),
             mime_type: "application/pdf".to_string(),
             metadata: crate::Metadata::default(),
-            tables: vec![
-                crate::Table {
-                    cells: vec![
-                        vec!["Col1".to_string(), "Col2".to_string()],
-                        vec!["A".to_string(), "B".to_string()],
-                    ],
-                    page_number: 1,
-                    markdown: "| Col1 | Col2 |\n|------|------|\n| A    | B    |".to_string(),
-                },
-                crate::Table {
-                    cells: vec![
-                        vec!["X".to_string(), "Y".to_string()],
-                        vec!["1".to_string(), "2".to_string()],
-                    ],
-                    page_number: 2,
-                    markdown: "| X | Y |\n|---|---|\n| 1 | 2 |".to_string(),
-                },
-            ],
+            tables: vec![crate::Table {
+                cells: vec![
+                    vec!["Col1".to_string(), "Col2".to_string()],
+                    vec!["A".to_string(), "B".to_string()],
+                ],
+                page_number: 1,
+                markdown: "| Col1 | Col2 |\n|------|------|\n| A    | B    |".to_string(),
+            }],
             detected_languages: None,
             chunks: None,
             images: None,
@@ -357,23 +332,33 @@ mod tests {
         };
         let formatted = format_extraction_result(&result);
+        let parsed: serde_json::Value = serde_json::from_str(&formatted).expect("Should be valid JSON");
-        assert!(formatted.contains("Tables (2)"));
-        assert!(formatted.contains("Table 1 (page 1)"));
-        assert!(formatted.contains("Table 2 (page 2)"));
-        assert!(formatted.contains("| Col1 | Col2 |"));
-        assert!(formatted.contains("| X | Y |"));
+        assert_eq!(parsed["tables"].as_array().unwrap().len(), 1);
+        assert_eq!(parsed["tables"][0]["page_number"], 1);
     }
     #[test]
-    fn test_format_extraction_result_empty_content() {
+    fn test_format_extraction_result_includes_chunks_when_present() {
         let result = KreuzbergResult {
-            content: String::new(),
+            content: "Chunked text".to_string(),
             mime_type: "text/plain".to_string(),
             metadata: crate::Metadata::default(),
             tables: vec![],
             detected_languages: None,
-            chunks: None,
+            chunks: Some(vec![crate::Chunk {
+                content: "Chunk 1".to_string(),
+                embedding: None,
+                metadata: crate::ChunkMetadata {
+                    byte_start: 0,
+                    byte_end: 7,
+                    token_count: None,
+                    chunk_index: 0,
+                    total_chunks: 1,
+                    first_page: None,
+                    last_page: None,
+                },
+            }]),
             images: None,
             pages: None,
             elements: None,
@@ -381,13 +366,14 @@ mod tests {
         };
         let formatted = format_extraction_result(&result);
+        let parsed: serde_json::Value = serde_json::from_str(&formatted).expect("Should be valid JSON");
-        assert!(formatted.contains("Content (0 characters)"));
-        assert!(formatted.contains("Metadata:"));
+        assert_eq!(parsed["chunks"].as_array().unwrap().len(), 1);
+        assert_eq!(parsed["chunks"][0]["content"], "Chunk 1");
     }
     #[test]
-    fn test_format_extraction_result_no_tables() {
+    fn test_format_extraction_result_omits_none_fields() {
         let result = KreuzbergResult {
             content: "Simple text".to_string(),
             mime_type: "text/plain".to_string(),
@@ -402,8 +388,11 @@ mod tests {
         };
         let formatted = format_extraction_result(&result);
+        let parsed: serde_json::Value = serde_json::from_str(&formatted).expect("Should be valid JSON");
-        assert!(formatted.contains("Simple text"));
-        assert!(!formatted.contains("Tables"));
+        // None fields should be omitted via skip_serializing_if
+        assert!(parsed.get("chunks").is_none());
+        assert!(parsed.get("images").is_none());
+        assert!(parsed.get("detected_languages").is_none());
     }
 }

data/vendor/kreuzberg/src/mcp/server.rs CHANGED Viewed

@@ -144,7 +144,7 @@ impl KreuzbergMcp {
         Parameters(params): Parameters<super::params::BatchExtractFilesParams>,
     ) -> Result<CallToolResult, rmcp::ErrorData> {
         use super::errors::map_kreuzberg_error_to_mcp;
-        use super::format::{build_config, format_extraction_result};
+        use super::format::build_config;
         use crate::{batch_extract_file, batch_extract_file_sync};
         let config =
@@ -158,13 +158,7 @@ impl KreuzbergMcp {
             batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
         };
-        let mut response = String::new();
-        for (i, result) in results.iter().enumerate() {
-            response.push_str(&format!("=== Document {}: {} ===\n", i + 1, params.paths[i]));
-            response.push_str(&format_extraction_result(result));
-            response.push_str("\n\n");
-        }
+        let response = serde_json::to_string_pretty(&results).unwrap_or_default();
         Ok(CallToolResult::success(vec![Content::text(response)]))
     }

data/vendor/kreuzberg/src/mcp/tools/extraction.rs CHANGED Viewed

@@ -99,13 +99,7 @@ pub(in crate::mcp) trait ExtractionTool {
             batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
         };
-        let mut response = String::new();
-        for (i, result) in results.iter().enumerate() {
-            response.push_str(&format!("=== Document {}: {} ===\n", i + 1, params.paths[i]));
-            response.push_str(&format_extraction_result(result));
-            response.push_str("\n\n");
-        }
+        let response = serde_json::to_string_pretty(&results).unwrap_or_default();
         Ok(CallToolResult::success(vec![Content::text(response)]))
     }
 }

data/vendor/kreuzberg/tests/api_chunk.rs CHANGED Viewed

@@ -321,3 +321,28 @@ async fn test_chunk_custom_config() {
     assert_eq!(chunk_response.config.overlap, 5);
     assert!(!chunk_response.config.trim);
 }
+#[tokio::test]
+async fn test_chunk_rejects_json_array() {
+    let app = create_router(ExtractionConfig::default());
+    // Send a JSON array instead of object
+    let response = app
+        .oneshot(
+            Request::builder()
+                .uri("/chunk")
+                .method("POST")
+                .header("content-type", "application/json")
+                .body(Body::from(r#"[["text"], {"text": "content"}]"#))
+                .expect("Operation failed"),
+        )
+        .await
+        .expect("Operation failed");
+    // Should reject with 400 or 422, NOT 200
+    assert!(
+        response.status() == StatusCode::BAD_REQUEST || response.status() == StatusCode::UNPROCESSABLE_ENTITY,
+        "Expected 400 or 422, got {}",
+        response.status()
+    );
+}

data/vendor/kreuzberg/tests/api_embed.rs CHANGED Viewed

@@ -255,6 +255,66 @@ async fn test_embed_malformed_json() {
     assert_eq!(response.status(), StatusCode::BAD_REQUEST);
 }
+/// Test embed endpoint rejects JSON array at root level.
+#[tokio::test]
+async fn test_embed_rejects_json_array() {
+    let app = create_router(ExtractionConfig::default());
+    // Send a JSON array instead of object
+    let response = app
+        .oneshot(
+            Request::builder()
+                .method("POST")
+                .uri("/embed")
+                .header("content-type", "application/json")
+                .body(Body::from(r#"[["text1"], {"texts": ["text2"]}]"#))
+                .expect("Operation failed"),
+        )
+        .await
+        .expect("Operation failed");
+    // Should reject with 400 or 422, NOT 200
+    assert!(
+        response.status() == StatusCode::BAD_REQUEST || response.status() == StatusCode::UNPROCESSABLE_ENTITY,
+        "Expected 400 or 422, got {}",
+        response.status()
+    );
+}
+/// Test embed endpoint rejects simple JSON array with strings.
+#[tokio::test]
+async fn test_embed_rejects_simple_json_array() {
+    let app = create_router(ExtractionConfig::default());
+    // Send a simple string array instead of object with texts field
+    let response = app
+        .oneshot(
+            Request::builder()
+                .method("POST")
+                .uri("/embed")
+                .header("content-type", "application/json")
+                .body(Body::from(r#"["text1", "text2", "text3"]"#))
+                .expect("Operation failed"),
+        )
+        .await
+        .expect("Operation failed");
+    assert_eq!(response.status(), StatusCode::BAD_REQUEST);
+    // Check that error response contains helpful message
+    let body = axum::body::to_bytes(response.into_body(), usize::MAX)
+        .await
+        .expect("Failed to read response body");
+    let error_response: serde_json::Value = serde_json::from_slice(&body).expect("Failed to parse error response");
+    assert!(
+        error_response["message"]
+            .as_str()
+            .map(|msg| msg.contains("array") || msg.contains("object"))
+            .unwrap_or(false)
+    );
+}
 /// Test embed endpoint preserves embedding vector values across calls.
 #[tokio::test]
 async fn test_embed_deterministic() {

data/vendor/kreuzberg-tesseract/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg-tesseract"
-version = "4.2.2"
+version = "4.2.3"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: kreuzberg
 version: !ruby/object:Gem::Version
-  version: 4.2.2
+  version: 4.2.3
 platform: ruby
 authors:
 - Na'aman Hirschfeld