kreuzberg 4.2.2 → 4.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1765714785cbe89567dcb13ed0c1e1b79c2da7a2143a0d0b4653c5578a3ada84
4
- data.tar.gz: 64d6db5e4d88992920f37fe9a3e28ab08b5bdd0b28385da570e8207e67d90f34
3
+ metadata.gz: 2c6fc44b151014f7e56c82bd191f55244a4294a259b24b95fc494dba6f8eaba6
4
+ data.tar.gz: 6e40a732814ff3e2a164e718cdb1c7a6ae838b2b2210a66b232f8675c7f79a80
5
5
  SHA512:
6
- metadata.gz: cbb71395a285ddb1a74101fc935ebb8266b81c6172043a128721ab37fad583c7202e559f9e8cb2534bf110721bf20e2d0cbe6838554c772831c56bc09583bf75
7
- data.tar.gz: b752cf56da8810211e5efd5e5d69f136eb7d0a3d5e27e985b81dff18bde442f0033b15962823ad7e3c5a27e080d02b6a6df1726bffe4aa21eaf89f56a5c6b56f
6
+ metadata.gz: f9c3a45f31c3ad9e3857872d8705b397b40c4317844ef421f4da4c2918e57411f5a626df4f6706d7db4916f33b8644c736e7b41508b398fd0197f1a87170fa3c
7
+ data.tar.gz: 8b05a75be261dbe583c4873d9d21079efff97d6c9c0340bbd8a73a43c9d15955431f4de20cd8b4a8b7956872f52e4467c253f5da03177a1e7d3b6a10d202b59d
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.2.2)
4
+ kreuzberg (4.2.3)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -61,7 +61,7 @@ GEM
61
61
  parser (3.3.10.1)
62
62
  ast (~> 2.4.1)
63
63
  racc
64
- prism (1.8.0)
64
+ prism (1.9.0)
65
65
  pry (0.16.0)
66
66
  coderay (~> 1.1)
67
67
  method_source (~> 1.0)
@@ -98,7 +98,7 @@ GEM
98
98
  diff-lcs (>= 1.2.0, < 2.0)
99
99
  rspec-support (~> 3.13.0)
100
100
  rspec-support (3.13.6)
101
- rubocop (1.82.1)
101
+ rubocop (1.84.0)
102
102
  json (~> 2.3)
103
103
  language_server-protocol (~> 3.17.0.2)
104
104
  lint_roller (~> 1.1.0)
@@ -106,7 +106,7 @@ GEM
106
106
  parser (>= 3.3.0.2)
107
107
  rainbow (>= 2.2.2, < 4.0)
108
108
  regexp_parser (>= 2.9.3, < 3.0)
109
- rubocop-ast (>= 1.48.0, < 2.0)
109
+ rubocop-ast (>= 1.49.0, < 2.0)
110
110
  ruby-progressbar (~> 1.7)
111
111
  unicode-display_width (>= 2.4.0, < 4.0)
112
112
  rubocop-ast (1.49.0)
@@ -121,7 +121,7 @@ GEM
121
121
  rubocop (~> 1.81)
122
122
  ruby-progressbar (1.13.0)
123
123
  securerandom (0.4.1)
124
- sorbet-runtime (0.6.12897)
124
+ sorbet-runtime (0.6.12903)
125
125
  steep (1.10.0)
126
126
  activesupport (>= 5.1)
127
127
  concurrent-ruby (>= 1.1.10)
@@ -207,7 +207,7 @@ CHECKSUMS
207
207
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
208
208
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
209
209
  json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
210
- kreuzberg (4.2.2)
210
+ kreuzberg (4.2.3)
211
211
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
212
212
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
213
213
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -217,7 +217,7 @@ CHECKSUMS
217
217
  mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
218
218
  parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
219
219
  parser (3.3.10.1) sha256=06f6a725d2cd91e5e7f2b7c32ba143631e1f7c8ae2fb918fc4cebec187e6a688
220
- prism (1.8.0) sha256=84453a16ef5530ea62c5f03ec16b52a459575ad4e7b9c2b360fd8ce2c39c1254
220
+ prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
221
221
  pry (0.16.0) sha256=d76c69065698ed1f85e717bd33d7942c38a50868f6b0673c636192b3d1b6054e
222
222
  pry-byebug (3.12.0) sha256=594e094ae8a8390a7ad4c7b36ae36e13304ed02664c67417d108dc5f7213d1b7
223
223
  racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
@@ -236,13 +236,13 @@ CHECKSUMS
236
236
  rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
237
237
  rspec-mocks (3.13.7) sha256=0979034e64b1d7a838aaaddf12bf065ea4dc40ef3d4c39f01f93ae2c66c62b1c
238
238
  rspec-support (3.13.6) sha256=2e8de3702427eab064c9352fe74488cc12a1bfae887ad8b91cba480ec9f8afb2
239
- rubocop (1.82.1) sha256=09f1a6a654a960eda767aebea33e47603080f8e9c9a3f019bf9b94c9cab5e273
239
+ rubocop (1.84.0) sha256=88dec310153bb685a879f5a7cdb601f6287b8f0ee675d9dc63a17c7204c4190a
240
240
  rubocop-ast (1.49.0) sha256=49c3676d3123a0923d333e20c6c2dbaaae2d2287b475273fddee0c61da9f71fd
241
241
  rubocop-performance (1.26.1) sha256=cd19b936ff196df85829d264b522fd4f98b6c89ad271fa52744a8c11b8f71834
242
242
  rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
243
243
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
244
244
  securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
245
- sorbet-runtime (0.6.12897) sha256=0348ab8803c4c3646977fee298083ded9b7e74d5b34b50c567c63eb7e36eb286
245
+ sorbet-runtime (0.6.12903) sha256=c23968c0dcf5a5db57f32c003fe3db7fb588c168cdd57d92ea4dceaba063118a
246
246
  steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
247
247
  strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
248
248
  terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.2" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.3" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -6,9 +6,9 @@ module Kreuzberg
6
6
  # @example Start the server
7
7
  # @example With block
8
8
  module APIProxy
9
- Error = Class.new(Kreuzberg::Errors::Error)
10
- MissingBinaryError = Class.new(Error)
11
- ServerError = Class.new(Error)
9
+ class Error < Kreuzberg::Errors::Error; end
10
+ class MissingBinaryError < Error; end
11
+ class ServerError < Error; end
12
12
 
13
13
  # API server instance
14
14
  class Server
@@ -5,8 +5,8 @@ require 'open3'
5
5
  module Kreuzberg
6
6
  # @example
7
7
  module CLIProxy
8
- Error = Class.new(Kreuzberg::Errors::Error)
9
- MissingBinaryError = Class.new(Error)
8
+ class Error < Kreuzberg::Errors::Error; end
9
+ class MissingBinaryError < Error; end
10
10
 
11
11
  # CLI execution error with stderr and exit status
12
12
  class CLIExecutionError < Error
@@ -6,9 +6,9 @@ require 'json'
6
6
  module Kreuzberg
7
7
  # @example Start MCP server
8
8
  module MCPProxy
9
- Error = Class.new(Kreuzberg::Errors::Error)
10
- MissingBinaryError = Class.new(Error)
11
- ServerError = Class.new(Error)
9
+ class Error < Kreuzberg::Errors::Error; end
10
+ class MissingBinaryError < Error; end
11
+ class ServerError < Error; end
12
12
 
13
13
  # MCP server instance
14
14
  class Server
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.2.2'
4
+ VERSION = '4.2.3'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
3
3
  resolver = "2"
4
4
 
5
5
  [workspace.package]
6
- version = "4.2.2"
6
+ version = "4.2.3"
7
7
  edition = "2024"
8
8
  rust-version = "1.91"
9
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.2.2"
3
+ version = "4.2.3"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -198,7 +198,7 @@ rake = { version = "0.3.6", optional = true }
198
198
  axum = { version = "0.8", features = ["macros", "json", "multipart"], optional = true }
199
199
  tower = { version = "0.5", optional = true }
200
200
  tower-http = { version = "0.6", features = ["cors", "trace", "limit"], optional = true }
201
- utoipa = { version = "5.3", features = ["axum_extras"], optional = true }
201
+ utoipa = { version = "5.4", features = ["axum_extras"], optional = true }
202
202
  rmcp = { version = "0.14.0", features = [
203
203
  "server",
204
204
  "macros",
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.2.2 Release**
20
+ > **🚀 Version 4.2.3 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -2,6 +2,7 @@
2
2
 
3
3
  use axum::{
4
4
  Json,
5
+ body::to_bytes,
5
6
  extract::{FromRequest, Request, rejection::JsonRejection},
6
7
  http::StatusCode,
7
8
  response::{IntoResponse, Response},
@@ -16,6 +17,9 @@ use super::types::ErrorResponse;
16
17
  ///
17
18
  /// This wraps axum's `Json` extractor but uses `ApiError` as the rejection type,
18
19
  /// ensuring that all JSON parsing errors are returned as JSON with proper content type.
20
+ ///
21
+ /// Additionally, this extractor validates that the root JSON value is an object (not an array),
22
+ /// which prevents serde from incorrectly deserializing JSON arrays into struct fields.
19
23
  #[derive(Debug, Clone, Copy, Default)]
20
24
  pub struct JsonApi<T>(pub T);
21
25
 
@@ -27,6 +31,31 @@ where
27
31
  type Rejection = ApiError;
28
32
 
29
33
  async fn from_request(req: Request, state: &S) -> Result<Self, Self::Rejection> {
34
+ // First, extract the body to check if it's a valid JSON object (not array)
35
+ let (parts, body) = req.into_parts();
36
+ let bytes = to_bytes(body, usize::MAX).await.map_err(|_| {
37
+ ApiError::new(
38
+ StatusCode::BAD_REQUEST,
39
+ KreuzbergError::Other("Failed to read request body".to_string()),
40
+ )
41
+ })?;
42
+
43
+ // Validate that the root JSON is an object, not an array
44
+ if !bytes.is_empty() {
45
+ let trimmed = std::str::from_utf8(&bytes).unwrap_or("").trim_start();
46
+ if trimmed.starts_with('[') {
47
+ return Err(ApiError::new(
48
+ StatusCode::BAD_REQUEST,
49
+ KreuzbergError::validation(
50
+ "Expected JSON object, but received JSON array. \
51
+ Please wrap your data in an object with appropriate fields.",
52
+ ),
53
+ ));
54
+ }
55
+ }
56
+
57
+ // Reconstruct the request and use the standard Json extractor
58
+ let req = Request::from_parts(parts, axum::body::Body::from(bytes));
30
59
  match Json::<T>::from_request(req, state).await {
31
60
  Ok(Json(value)) => Ok(JsonApi(value)),
32
61
  Err(rejection) => Err(ApiError::from(rejection)),
@@ -72,28 +72,12 @@ pub(super) fn build_config(
72
72
  }
73
73
  }
74
74
 
75
- /// Format extraction result as human-readable text.
75
+ /// Format extraction result as JSON string.
76
+ ///
77
+ /// Serializes the full `ExtractionResult` to JSON, ensuring 1:1 parity
78
+ /// with the API and CLI JSON output.
76
79
  pub(super) fn format_extraction_result(result: &KreuzbergResult) -> String {
77
- let mut response = String::new();
78
-
79
- response.push_str(&format!("Content ({} characters):\n", result.content.len()));
80
- response.push_str(&result.content);
81
- response.push_str("\n\n");
82
-
83
- response.push_str("Metadata:\n");
84
- response.push_str(&serde_json::to_string_pretty(&result.metadata).unwrap_or_default());
85
- response.push_str("\n\n");
86
-
87
- if !result.tables.is_empty() {
88
- response.push_str(&format!("Tables ({}):\n", result.tables.len()));
89
- for (i, table) in result.tables.iter().enumerate() {
90
- response.push_str(&format!("\nTable {} (page {}):\n", i + 1, table.page_number));
91
- response.push_str(&table.markdown);
92
- response.push('\n');
93
- }
94
- }
95
-
96
- response
80
+ serde_json::to_string_pretty(result).unwrap_or_default()
97
81
  }
98
82
 
99
83
  #[cfg(test)]
@@ -303,7 +287,7 @@ mod tests {
303
287
  }
304
288
 
305
289
  #[test]
306
- fn test_format_extraction_result_with_content() {
290
+ fn test_format_extraction_result_is_valid_json() {
307
291
  let result = KreuzbergResult {
308
292
  content: "Sample extracted text".to_string(),
309
293
  mime_type: "text/plain".to_string(),
@@ -318,36 +302,27 @@ mod tests {
318
302
  };
319
303
 
320
304
  let formatted = format_extraction_result(&result);
305
+ let parsed: serde_json::Value = serde_json::from_str(&formatted).expect("Should be valid JSON");
321
306
 
322
- assert!(formatted.contains("Content (21 characters)"));
323
- assert!(formatted.contains("Sample extracted text"));
324
- assert!(formatted.contains("Metadata:"));
307
+ assert_eq!(parsed["content"], "Sample extracted text");
308
+ assert_eq!(parsed["mime_type"], "text/plain");
309
+ assert!(parsed["metadata"].is_object());
325
310
  }
326
311
 
327
312
  #[test]
328
- fn test_format_extraction_result_with_tables() {
313
+ fn test_format_extraction_result_includes_tables() {
329
314
  let result = KreuzbergResult {
330
315
  content: "Document with tables".to_string(),
331
316
  mime_type: "application/pdf".to_string(),
332
317
  metadata: crate::Metadata::default(),
333
- tables: vec![
334
- crate::Table {
335
- cells: vec![
336
- vec!["Col1".to_string(), "Col2".to_string()],
337
- vec!["A".to_string(), "B".to_string()],
338
- ],
339
- page_number: 1,
340
- markdown: "| Col1 | Col2 |\n|------|------|\n| A | B |".to_string(),
341
- },
342
- crate::Table {
343
- cells: vec![
344
- vec!["X".to_string(), "Y".to_string()],
345
- vec!["1".to_string(), "2".to_string()],
346
- ],
347
- page_number: 2,
348
- markdown: "| X | Y |\n|---|---|\n| 1 | 2 |".to_string(),
349
- },
350
- ],
318
+ tables: vec![crate::Table {
319
+ cells: vec![
320
+ vec!["Col1".to_string(), "Col2".to_string()],
321
+ vec!["A".to_string(), "B".to_string()],
322
+ ],
323
+ page_number: 1,
324
+ markdown: "| Col1 | Col2 |\n|------|------|\n| A | B |".to_string(),
325
+ }],
351
326
  detected_languages: None,
352
327
  chunks: None,
353
328
  images: None,
@@ -357,23 +332,33 @@ mod tests {
357
332
  };
358
333
 
359
334
  let formatted = format_extraction_result(&result);
335
+ let parsed: serde_json::Value = serde_json::from_str(&formatted).expect("Should be valid JSON");
360
336
 
361
- assert!(formatted.contains("Tables (2)"));
362
- assert!(formatted.contains("Table 1 (page 1)"));
363
- assert!(formatted.contains("Table 2 (page 2)"));
364
- assert!(formatted.contains("| Col1 | Col2 |"));
365
- assert!(formatted.contains("| X | Y |"));
337
+ assert_eq!(parsed["tables"].as_array().unwrap().len(), 1);
338
+ assert_eq!(parsed["tables"][0]["page_number"], 1);
366
339
  }
367
340
 
368
341
  #[test]
369
- fn test_format_extraction_result_empty_content() {
342
+ fn test_format_extraction_result_includes_chunks_when_present() {
370
343
  let result = KreuzbergResult {
371
- content: String::new(),
344
+ content: "Chunked text".to_string(),
372
345
  mime_type: "text/plain".to_string(),
373
346
  metadata: crate::Metadata::default(),
374
347
  tables: vec![],
375
348
  detected_languages: None,
376
- chunks: None,
349
+ chunks: Some(vec![crate::Chunk {
350
+ content: "Chunk 1".to_string(),
351
+ embedding: None,
352
+ metadata: crate::ChunkMetadata {
353
+ byte_start: 0,
354
+ byte_end: 7,
355
+ token_count: None,
356
+ chunk_index: 0,
357
+ total_chunks: 1,
358
+ first_page: None,
359
+ last_page: None,
360
+ },
361
+ }]),
377
362
  images: None,
378
363
  pages: None,
379
364
  elements: None,
@@ -381,13 +366,14 @@ mod tests {
381
366
  };
382
367
 
383
368
  let formatted = format_extraction_result(&result);
369
+ let parsed: serde_json::Value = serde_json::from_str(&formatted).expect("Should be valid JSON");
384
370
 
385
- assert!(formatted.contains("Content (0 characters)"));
386
- assert!(formatted.contains("Metadata:"));
371
+ assert_eq!(parsed["chunks"].as_array().unwrap().len(), 1);
372
+ assert_eq!(parsed["chunks"][0]["content"], "Chunk 1");
387
373
  }
388
374
 
389
375
  #[test]
390
- fn test_format_extraction_result_no_tables() {
376
+ fn test_format_extraction_result_omits_none_fields() {
391
377
  let result = KreuzbergResult {
392
378
  content: "Simple text".to_string(),
393
379
  mime_type: "text/plain".to_string(),
@@ -402,8 +388,11 @@ mod tests {
402
388
  };
403
389
 
404
390
  let formatted = format_extraction_result(&result);
391
+ let parsed: serde_json::Value = serde_json::from_str(&formatted).expect("Should be valid JSON");
405
392
 
406
- assert!(formatted.contains("Simple text"));
407
- assert!(!formatted.contains("Tables"));
393
+ // None fields should be omitted via skip_serializing_if
394
+ assert!(parsed.get("chunks").is_none());
395
+ assert!(parsed.get("images").is_none());
396
+ assert!(parsed.get("detected_languages").is_none());
408
397
  }
409
398
  }
@@ -144,7 +144,7 @@ impl KreuzbergMcp {
144
144
  Parameters(params): Parameters<super::params::BatchExtractFilesParams>,
145
145
  ) -> Result<CallToolResult, rmcp::ErrorData> {
146
146
  use super::errors::map_kreuzberg_error_to_mcp;
147
- use super::format::{build_config, format_extraction_result};
147
+ use super::format::build_config;
148
148
  use crate::{batch_extract_file, batch_extract_file_sync};
149
149
 
150
150
  let config =
@@ -158,13 +158,7 @@ impl KreuzbergMcp {
158
158
  batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
159
159
  };
160
160
 
161
- let mut response = String::new();
162
- for (i, result) in results.iter().enumerate() {
163
- response.push_str(&format!("=== Document {}: {} ===\n", i + 1, params.paths[i]));
164
- response.push_str(&format_extraction_result(result));
165
- response.push_str("\n\n");
166
- }
167
-
161
+ let response = serde_json::to_string_pretty(&results).unwrap_or_default();
168
162
  Ok(CallToolResult::success(vec![Content::text(response)]))
169
163
  }
170
164
 
@@ -99,13 +99,7 @@ pub(in crate::mcp) trait ExtractionTool {
99
99
  batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
100
100
  };
101
101
 
102
- let mut response = String::new();
103
- for (i, result) in results.iter().enumerate() {
104
- response.push_str(&format!("=== Document {}: {} ===\n", i + 1, params.paths[i]));
105
- response.push_str(&format_extraction_result(result));
106
- response.push_str("\n\n");
107
- }
108
-
102
+ let response = serde_json::to_string_pretty(&results).unwrap_or_default();
109
103
  Ok(CallToolResult::success(vec![Content::text(response)]))
110
104
  }
111
105
  }
@@ -321,3 +321,28 @@ async fn test_chunk_custom_config() {
321
321
  assert_eq!(chunk_response.config.overlap, 5);
322
322
  assert!(!chunk_response.config.trim);
323
323
  }
324
+
325
+ #[tokio::test]
326
+ async fn test_chunk_rejects_json_array() {
327
+ let app = create_router(ExtractionConfig::default());
328
+
329
+ // Send a JSON array instead of object
330
+ let response = app
331
+ .oneshot(
332
+ Request::builder()
333
+ .uri("/chunk")
334
+ .method("POST")
335
+ .header("content-type", "application/json")
336
+ .body(Body::from(r#"[["text"], {"text": "content"}]"#))
337
+ .expect("Operation failed"),
338
+ )
339
+ .await
340
+ .expect("Operation failed");
341
+
342
+ // Should reject with 400 or 422, NOT 200
343
+ assert!(
344
+ response.status() == StatusCode::BAD_REQUEST || response.status() == StatusCode::UNPROCESSABLE_ENTITY,
345
+ "Expected 400 or 422, got {}",
346
+ response.status()
347
+ );
348
+ }
@@ -255,6 +255,66 @@ async fn test_embed_malformed_json() {
255
255
  assert_eq!(response.status(), StatusCode::BAD_REQUEST);
256
256
  }
257
257
 
258
+ /// Test embed endpoint rejects JSON array at root level.
259
+ #[tokio::test]
260
+ async fn test_embed_rejects_json_array() {
261
+ let app = create_router(ExtractionConfig::default());
262
+
263
+ // Send a JSON array instead of object
264
+ let response = app
265
+ .oneshot(
266
+ Request::builder()
267
+ .method("POST")
268
+ .uri("/embed")
269
+ .header("content-type", "application/json")
270
+ .body(Body::from(r#"[["text1"], {"texts": ["text2"]}]"#))
271
+ .expect("Operation failed"),
272
+ )
273
+ .await
274
+ .expect("Operation failed");
275
+
276
+ // Should reject with 400 or 422, NOT 200
277
+ assert!(
278
+ response.status() == StatusCode::BAD_REQUEST || response.status() == StatusCode::UNPROCESSABLE_ENTITY,
279
+ "Expected 400 or 422, got {}",
280
+ response.status()
281
+ );
282
+ }
283
+
284
+ /// Test embed endpoint rejects simple JSON array with strings.
285
+ #[tokio::test]
286
+ async fn test_embed_rejects_simple_json_array() {
287
+ let app = create_router(ExtractionConfig::default());
288
+
289
+ // Send a simple string array instead of object with texts field
290
+ let response = app
291
+ .oneshot(
292
+ Request::builder()
293
+ .method("POST")
294
+ .uri("/embed")
295
+ .header("content-type", "application/json")
296
+ .body(Body::from(r#"["text1", "text2", "text3"]"#))
297
+ .expect("Operation failed"),
298
+ )
299
+ .await
300
+ .expect("Operation failed");
301
+
302
+ assert_eq!(response.status(), StatusCode::BAD_REQUEST);
303
+
304
+ // Check that error response contains helpful message
305
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
306
+ .await
307
+ .expect("Failed to read response body");
308
+ let error_response: serde_json::Value = serde_json::from_slice(&body).expect("Failed to parse error response");
309
+
310
+ assert!(
311
+ error_response["message"]
312
+ .as_str()
313
+ .map(|msg| msg.contains("array") || msg.contains("object"))
314
+ .unwrap_or(false)
315
+ );
316
+ }
317
+
258
318
  /// Test embed endpoint preserves embedding vector values across calls.
259
319
  #[tokio::test]
260
320
  async fn test_embed_deterministic() {
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.2.2"
3
+ version = "4.2.3"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.2.2
4
+ version: 4.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld