kreuzberg 4.2.2 → 4.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +9 -9
- data/README.md +1 -1
- data/lib/kreuzberg/api_proxy.rb +3 -3
- data/lib/kreuzberg/cli_proxy.rb +2 -2
- data/lib/kreuzberg/mcp_proxy.rb +3 -3
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +2 -2
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +29 -0
- data/vendor/kreuzberg/src/mcp/format.rs +46 -57
- data/vendor/kreuzberg/src/mcp/server.rs +2 -8
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +1 -7
- data/vendor/kreuzberg/tests/api_chunk.rs +25 -0
- data/vendor/kreuzberg/tests/api_embed.rs +60 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2c6fc44b151014f7e56c82bd191f55244a4294a259b24b95fc494dba6f8eaba6
|
|
4
|
+
data.tar.gz: 6e40a732814ff3e2a164e718cdb1c7a6ae838b2b2210a66b232f8675c7f79a80
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f9c3a45f31c3ad9e3857872d8705b397b40c4317844ef421f4da4c2918e57411f5a626df4f6706d7db4916f33b8644c736e7b41508b398fd0197f1a87170fa3c
|
|
7
|
+
data.tar.gz: 8b05a75be261dbe583c4873d9d21079efff97d6c9c0340bbd8a73a43c9d15955431f4de20cd8b4a8b7956872f52e4467c253f5da03177a1e7d3b6a10d202b59d
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.2.2)
|
|
4
|
+
kreuzberg (4.2.3)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -61,7 +61,7 @@ GEM
|
|
|
61
61
|
parser (3.3.10.1)
|
|
62
62
|
ast (~> 2.4.1)
|
|
63
63
|
racc
|
|
64
|
-
prism (1.
|
|
64
|
+
prism (1.9.0)
|
|
65
65
|
pry (0.16.0)
|
|
66
66
|
coderay (~> 1.1)
|
|
67
67
|
method_source (~> 1.0)
|
|
@@ -98,7 +98,7 @@ GEM
|
|
|
98
98
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
99
99
|
rspec-support (~> 3.13.0)
|
|
100
100
|
rspec-support (3.13.6)
|
|
101
|
-
rubocop (1.
|
|
101
|
+
rubocop (1.84.0)
|
|
102
102
|
json (~> 2.3)
|
|
103
103
|
language_server-protocol (~> 3.17.0.2)
|
|
104
104
|
lint_roller (~> 1.1.0)
|
|
@@ -106,7 +106,7 @@ GEM
|
|
|
106
106
|
parser (>= 3.3.0.2)
|
|
107
107
|
rainbow (>= 2.2.2, < 4.0)
|
|
108
108
|
regexp_parser (>= 2.9.3, < 3.0)
|
|
109
|
-
rubocop-ast (>= 1.
|
|
109
|
+
rubocop-ast (>= 1.49.0, < 2.0)
|
|
110
110
|
ruby-progressbar (~> 1.7)
|
|
111
111
|
unicode-display_width (>= 2.4.0, < 4.0)
|
|
112
112
|
rubocop-ast (1.49.0)
|
|
@@ -121,7 +121,7 @@ GEM
|
|
|
121
121
|
rubocop (~> 1.81)
|
|
122
122
|
ruby-progressbar (1.13.0)
|
|
123
123
|
securerandom (0.4.1)
|
|
124
|
-
sorbet-runtime (0.6.
|
|
124
|
+
sorbet-runtime (0.6.12903)
|
|
125
125
|
steep (1.10.0)
|
|
126
126
|
activesupport (>= 5.1)
|
|
127
127
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -207,7 +207,7 @@ CHECKSUMS
|
|
|
207
207
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
208
208
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
209
209
|
json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
|
|
210
|
-
kreuzberg (4.2.2)
|
|
210
|
+
kreuzberg (4.2.3)
|
|
211
211
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
212
212
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
213
213
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
@@ -217,7 +217,7 @@ CHECKSUMS
|
|
|
217
217
|
mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
|
|
218
218
|
parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
|
|
219
219
|
parser (3.3.10.1) sha256=06f6a725d2cd91e5e7f2b7c32ba143631e1f7c8ae2fb918fc4cebec187e6a688
|
|
220
|
-
prism (1.
|
|
220
|
+
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
|
|
221
221
|
pry (0.16.0) sha256=d76c69065698ed1f85e717bd33d7942c38a50868f6b0673c636192b3d1b6054e
|
|
222
222
|
pry-byebug (3.12.0) sha256=594e094ae8a8390a7ad4c7b36ae36e13304ed02664c67417d108dc5f7213d1b7
|
|
223
223
|
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
|
|
@@ -236,13 +236,13 @@ CHECKSUMS
|
|
|
236
236
|
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
|
|
237
237
|
rspec-mocks (3.13.7) sha256=0979034e64b1d7a838aaaddf12bf065ea4dc40ef3d4c39f01f93ae2c66c62b1c
|
|
238
238
|
rspec-support (3.13.6) sha256=2e8de3702427eab064c9352fe74488cc12a1bfae887ad8b91cba480ec9f8afb2
|
|
239
|
-
rubocop (1.
|
|
239
|
+
rubocop (1.84.0) sha256=88dec310153bb685a879f5a7cdb601f6287b8f0ee675d9dc63a17c7204c4190a
|
|
240
240
|
rubocop-ast (1.49.0) sha256=49c3676d3123a0923d333e20c6c2dbaaae2d2287b475273fddee0c61da9f71fd
|
|
241
241
|
rubocop-performance (1.26.1) sha256=cd19b936ff196df85829d264b522fd4f98b6c89ad271fa52744a8c11b8f71834
|
|
242
242
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
243
243
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
244
244
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
245
|
-
sorbet-runtime (0.6.
|
|
245
|
+
sorbet-runtime (0.6.12903) sha256=c23968c0dcf5a5db57f32c003fe3db7fb588c168cdd57d92ea4dceaba063118a
|
|
246
246
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
247
247
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
248
248
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.2" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.3" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
data/lib/kreuzberg/api_proxy.rb
CHANGED
|
@@ -6,9 +6,9 @@ module Kreuzberg
|
|
|
6
6
|
# @example Start the server
|
|
7
7
|
# @example With block
|
|
8
8
|
module APIProxy
|
|
9
|
-
Error
|
|
10
|
-
MissingBinaryError
|
|
11
|
-
ServerError
|
|
9
|
+
class Error < Kreuzberg::Errors::Error; end
|
|
10
|
+
class MissingBinaryError < Error; end
|
|
11
|
+
class ServerError < Error; end
|
|
12
12
|
|
|
13
13
|
# API server instance
|
|
14
14
|
class Server
|
data/lib/kreuzberg/cli_proxy.rb
CHANGED
|
@@ -5,8 +5,8 @@ require 'open3'
|
|
|
5
5
|
module Kreuzberg
|
|
6
6
|
# @example
|
|
7
7
|
module CLIProxy
|
|
8
|
-
Error
|
|
9
|
-
MissingBinaryError
|
|
8
|
+
class Error < Kreuzberg::Errors::Error; end
|
|
9
|
+
class MissingBinaryError < Error; end
|
|
10
10
|
|
|
11
11
|
# CLI execution error with stderr and exit status
|
|
12
12
|
class CLIExecutionError < Error
|
data/lib/kreuzberg/mcp_proxy.rb
CHANGED
|
@@ -6,9 +6,9 @@ require 'json'
|
|
|
6
6
|
module Kreuzberg
|
|
7
7
|
# @example Start MCP server
|
|
8
8
|
module MCPProxy
|
|
9
|
-
Error
|
|
10
|
-
MissingBinaryError
|
|
11
|
-
ServerError
|
|
9
|
+
class Error < Kreuzberg::Errors::Error; end
|
|
10
|
+
class MissingBinaryError < Error; end
|
|
11
|
+
class ServerError < Error; end
|
|
12
12
|
|
|
13
13
|
# MCP server instance
|
|
14
14
|
class Server
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.2.2"
|
|
3
|
+
version = "4.2.3"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -198,7 +198,7 @@ rake = { version = "0.3.6", optional = true }
|
|
|
198
198
|
axum = { version = "0.8", features = ["macros", "json", "multipart"], optional = true }
|
|
199
199
|
tower = { version = "0.5", optional = true }
|
|
200
200
|
tower-http = { version = "0.6", features = ["cors", "trace", "limit"], optional = true }
|
|
201
|
-
utoipa = { version = "5.
|
|
201
|
+
utoipa = { version = "5.4", features = ["axum_extras"], optional = true }
|
|
202
202
|
rmcp = { version = "0.14.0", features = [
|
|
203
203
|
"server",
|
|
204
204
|
"macros",
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.2.2 Release**
|
|
20
|
+
> **🚀 Version 4.2.3 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
use axum::{
|
|
4
4
|
Json,
|
|
5
|
+
body::to_bytes,
|
|
5
6
|
extract::{FromRequest, Request, rejection::JsonRejection},
|
|
6
7
|
http::StatusCode,
|
|
7
8
|
response::{IntoResponse, Response},
|
|
@@ -16,6 +17,9 @@ use super::types::ErrorResponse;
|
|
|
16
17
|
///
|
|
17
18
|
/// This wraps axum's `Json` extractor but uses `ApiError` as the rejection type,
|
|
18
19
|
/// ensuring that all JSON parsing errors are returned as JSON with proper content type.
|
|
20
|
+
///
|
|
21
|
+
/// Additionally, this extractor validates that the root JSON value is an object (not an array),
|
|
22
|
+
/// which prevents serde from incorrectly deserializing JSON arrays into struct fields.
|
|
19
23
|
#[derive(Debug, Clone, Copy, Default)]
|
|
20
24
|
pub struct JsonApi<T>(pub T);
|
|
21
25
|
|
|
@@ -27,6 +31,31 @@ where
|
|
|
27
31
|
type Rejection = ApiError;
|
|
28
32
|
|
|
29
33
|
async fn from_request(req: Request, state: &S) -> Result<Self, Self::Rejection> {
|
|
34
|
+
// First, extract the body to check if it's a valid JSON object (not array)
|
|
35
|
+
let (parts, body) = req.into_parts();
|
|
36
|
+
let bytes = to_bytes(body, usize::MAX).await.map_err(|_| {
|
|
37
|
+
ApiError::new(
|
|
38
|
+
StatusCode::BAD_REQUEST,
|
|
39
|
+
KreuzbergError::Other("Failed to read request body".to_string()),
|
|
40
|
+
)
|
|
41
|
+
})?;
|
|
42
|
+
|
|
43
|
+
// Validate that the root JSON is an object, not an array
|
|
44
|
+
if !bytes.is_empty() {
|
|
45
|
+
let trimmed = std::str::from_utf8(&bytes).unwrap_or("").trim_start();
|
|
46
|
+
if trimmed.starts_with('[') {
|
|
47
|
+
return Err(ApiError::new(
|
|
48
|
+
StatusCode::BAD_REQUEST,
|
|
49
|
+
KreuzbergError::validation(
|
|
50
|
+
"Expected JSON object, but received JSON array. \
|
|
51
|
+
Please wrap your data in an object with appropriate fields.",
|
|
52
|
+
),
|
|
53
|
+
));
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
// Reconstruct the request and use the standard Json extractor
|
|
58
|
+
let req = Request::from_parts(parts, axum::body::Body::from(bytes));
|
|
30
59
|
match Json::<T>::from_request(req, state).await {
|
|
31
60
|
Ok(Json(value)) => Ok(JsonApi(value)),
|
|
32
61
|
Err(rejection) => Err(ApiError::from(rejection)),
|
|
@@ -72,28 +72,12 @@ pub(super) fn build_config(
|
|
|
72
72
|
}
|
|
73
73
|
}
|
|
74
74
|
|
|
75
|
-
/// Format extraction result as
|
|
75
|
+
/// Format extraction result as JSON string.
|
|
76
|
+
///
|
|
77
|
+
/// Serializes the full `ExtractionResult` to JSON, ensuring 1:1 parity
|
|
78
|
+
/// with the API and CLI JSON output.
|
|
76
79
|
pub(super) fn format_extraction_result(result: &KreuzbergResult) -> String {
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
response.push_str(&format!("Content ({} characters):\n", result.content.len()));
|
|
80
|
-
response.push_str(&result.content);
|
|
81
|
-
response.push_str("\n\n");
|
|
82
|
-
|
|
83
|
-
response.push_str("Metadata:\n");
|
|
84
|
-
response.push_str(&serde_json::to_string_pretty(&result.metadata).unwrap_or_default());
|
|
85
|
-
response.push_str("\n\n");
|
|
86
|
-
|
|
87
|
-
if !result.tables.is_empty() {
|
|
88
|
-
response.push_str(&format!("Tables ({}):\n", result.tables.len()));
|
|
89
|
-
for (i, table) in result.tables.iter().enumerate() {
|
|
90
|
-
response.push_str(&format!("\nTable {} (page {}):\n", i + 1, table.page_number));
|
|
91
|
-
response.push_str(&table.markdown);
|
|
92
|
-
response.push('\n');
|
|
93
|
-
}
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
response
|
|
80
|
+
serde_json::to_string_pretty(result).unwrap_or_default()
|
|
97
81
|
}
|
|
98
82
|
|
|
99
83
|
#[cfg(test)]
|
|
@@ -303,7 +287,7 @@ mod tests {
|
|
|
303
287
|
}
|
|
304
288
|
|
|
305
289
|
#[test]
|
|
306
|
-
fn
|
|
290
|
+
fn test_format_extraction_result_is_valid_json() {
|
|
307
291
|
let result = KreuzbergResult {
|
|
308
292
|
content: "Sample extracted text".to_string(),
|
|
309
293
|
mime_type: "text/plain".to_string(),
|
|
@@ -318,36 +302,27 @@ mod tests {
|
|
|
318
302
|
};
|
|
319
303
|
|
|
320
304
|
let formatted = format_extraction_result(&result);
|
|
305
|
+
let parsed: serde_json::Value = serde_json::from_str(&formatted).expect("Should be valid JSON");
|
|
321
306
|
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
assert!(
|
|
307
|
+
assert_eq!(parsed["content"], "Sample extracted text");
|
|
308
|
+
assert_eq!(parsed["mime_type"], "text/plain");
|
|
309
|
+
assert!(parsed["metadata"].is_object());
|
|
325
310
|
}
|
|
326
311
|
|
|
327
312
|
#[test]
|
|
328
|
-
fn
|
|
313
|
+
fn test_format_extraction_result_includes_tables() {
|
|
329
314
|
let result = KreuzbergResult {
|
|
330
315
|
content: "Document with tables".to_string(),
|
|
331
316
|
mime_type: "application/pdf".to_string(),
|
|
332
317
|
metadata: crate::Metadata::default(),
|
|
333
|
-
tables: vec![
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
},
|
|
342
|
-
crate::Table {
|
|
343
|
-
cells: vec![
|
|
344
|
-
vec!["X".to_string(), "Y".to_string()],
|
|
345
|
-
vec!["1".to_string(), "2".to_string()],
|
|
346
|
-
],
|
|
347
|
-
page_number: 2,
|
|
348
|
-
markdown: "| X | Y |\n|---|---|\n| 1 | 2 |".to_string(),
|
|
349
|
-
},
|
|
350
|
-
],
|
|
318
|
+
tables: vec![crate::Table {
|
|
319
|
+
cells: vec![
|
|
320
|
+
vec!["Col1".to_string(), "Col2".to_string()],
|
|
321
|
+
vec!["A".to_string(), "B".to_string()],
|
|
322
|
+
],
|
|
323
|
+
page_number: 1,
|
|
324
|
+
markdown: "| Col1 | Col2 |\n|------|------|\n| A | B |".to_string(),
|
|
325
|
+
}],
|
|
351
326
|
detected_languages: None,
|
|
352
327
|
chunks: None,
|
|
353
328
|
images: None,
|
|
@@ -357,23 +332,33 @@ mod tests {
|
|
|
357
332
|
};
|
|
358
333
|
|
|
359
334
|
let formatted = format_extraction_result(&result);
|
|
335
|
+
let parsed: serde_json::Value = serde_json::from_str(&formatted).expect("Should be valid JSON");
|
|
360
336
|
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
assert!(formatted.contains("Table 2 (page 2)"));
|
|
364
|
-
assert!(formatted.contains("| Col1 | Col2 |"));
|
|
365
|
-
assert!(formatted.contains("| X | Y |"));
|
|
337
|
+
assert_eq!(parsed["tables"].as_array().unwrap().len(), 1);
|
|
338
|
+
assert_eq!(parsed["tables"][0]["page_number"], 1);
|
|
366
339
|
}
|
|
367
340
|
|
|
368
341
|
#[test]
|
|
369
|
-
fn
|
|
342
|
+
fn test_format_extraction_result_includes_chunks_when_present() {
|
|
370
343
|
let result = KreuzbergResult {
|
|
371
|
-
content:
|
|
344
|
+
content: "Chunked text".to_string(),
|
|
372
345
|
mime_type: "text/plain".to_string(),
|
|
373
346
|
metadata: crate::Metadata::default(),
|
|
374
347
|
tables: vec![],
|
|
375
348
|
detected_languages: None,
|
|
376
|
-
chunks:
|
|
349
|
+
chunks: Some(vec![crate::Chunk {
|
|
350
|
+
content: "Chunk 1".to_string(),
|
|
351
|
+
embedding: None,
|
|
352
|
+
metadata: crate::ChunkMetadata {
|
|
353
|
+
byte_start: 0,
|
|
354
|
+
byte_end: 7,
|
|
355
|
+
token_count: None,
|
|
356
|
+
chunk_index: 0,
|
|
357
|
+
total_chunks: 1,
|
|
358
|
+
first_page: None,
|
|
359
|
+
last_page: None,
|
|
360
|
+
},
|
|
361
|
+
}]),
|
|
377
362
|
images: None,
|
|
378
363
|
pages: None,
|
|
379
364
|
elements: None,
|
|
@@ -381,13 +366,14 @@ mod tests {
|
|
|
381
366
|
};
|
|
382
367
|
|
|
383
368
|
let formatted = format_extraction_result(&result);
|
|
369
|
+
let parsed: serde_json::Value = serde_json::from_str(&formatted).expect("Should be valid JSON");
|
|
384
370
|
|
|
385
|
-
|
|
386
|
-
|
|
371
|
+
assert_eq!(parsed["chunks"].as_array().unwrap().len(), 1);
|
|
372
|
+
assert_eq!(parsed["chunks"][0]["content"], "Chunk 1");
|
|
387
373
|
}
|
|
388
374
|
|
|
389
375
|
#[test]
|
|
390
|
-
fn
|
|
376
|
+
fn test_format_extraction_result_omits_none_fields() {
|
|
391
377
|
let result = KreuzbergResult {
|
|
392
378
|
content: "Simple text".to_string(),
|
|
393
379
|
mime_type: "text/plain".to_string(),
|
|
@@ -402,8 +388,11 @@ mod tests {
|
|
|
402
388
|
};
|
|
403
389
|
|
|
404
390
|
let formatted = format_extraction_result(&result);
|
|
391
|
+
let parsed: serde_json::Value = serde_json::from_str(&formatted).expect("Should be valid JSON");
|
|
405
392
|
|
|
406
|
-
|
|
407
|
-
assert!(
|
|
393
|
+
// None fields should be omitted via skip_serializing_if
|
|
394
|
+
assert!(parsed.get("chunks").is_none());
|
|
395
|
+
assert!(parsed.get("images").is_none());
|
|
396
|
+
assert!(parsed.get("detected_languages").is_none());
|
|
408
397
|
}
|
|
409
398
|
}
|
|
@@ -144,7 +144,7 @@ impl KreuzbergMcp {
|
|
|
144
144
|
Parameters(params): Parameters<super::params::BatchExtractFilesParams>,
|
|
145
145
|
) -> Result<CallToolResult, rmcp::ErrorData> {
|
|
146
146
|
use super::errors::map_kreuzberg_error_to_mcp;
|
|
147
|
-
use super::format::
|
|
147
|
+
use super::format::build_config;
|
|
148
148
|
use crate::{batch_extract_file, batch_extract_file_sync};
|
|
149
149
|
|
|
150
150
|
let config =
|
|
@@ -158,13 +158,7 @@ impl KreuzbergMcp {
|
|
|
158
158
|
batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
|
|
159
159
|
};
|
|
160
160
|
|
|
161
|
-
let
|
|
162
|
-
for (i, result) in results.iter().enumerate() {
|
|
163
|
-
response.push_str(&format!("=== Document {}: {} ===\n", i + 1, params.paths[i]));
|
|
164
|
-
response.push_str(&format_extraction_result(result));
|
|
165
|
-
response.push_str("\n\n");
|
|
166
|
-
}
|
|
167
|
-
|
|
161
|
+
let response = serde_json::to_string_pretty(&results).unwrap_or_default();
|
|
168
162
|
Ok(CallToolResult::success(vec![Content::text(response)]))
|
|
169
163
|
}
|
|
170
164
|
|
|
@@ -99,13 +99,7 @@ pub(in crate::mcp) trait ExtractionTool {
|
|
|
99
99
|
batch_extract_file_sync(params.paths.clone(), &config).map_err(map_kreuzberg_error_to_mcp)?
|
|
100
100
|
};
|
|
101
101
|
|
|
102
|
-
let
|
|
103
|
-
for (i, result) in results.iter().enumerate() {
|
|
104
|
-
response.push_str(&format!("=== Document {}: {} ===\n", i + 1, params.paths[i]));
|
|
105
|
-
response.push_str(&format_extraction_result(result));
|
|
106
|
-
response.push_str("\n\n");
|
|
107
|
-
}
|
|
108
|
-
|
|
102
|
+
let response = serde_json::to_string_pretty(&results).unwrap_or_default();
|
|
109
103
|
Ok(CallToolResult::success(vec![Content::text(response)]))
|
|
110
104
|
}
|
|
111
105
|
}
|
|
@@ -321,3 +321,28 @@ async fn test_chunk_custom_config() {
|
|
|
321
321
|
assert_eq!(chunk_response.config.overlap, 5);
|
|
322
322
|
assert!(!chunk_response.config.trim);
|
|
323
323
|
}
|
|
324
|
+
|
|
325
|
+
#[tokio::test]
|
|
326
|
+
async fn test_chunk_rejects_json_array() {
|
|
327
|
+
let app = create_router(ExtractionConfig::default());
|
|
328
|
+
|
|
329
|
+
// Send a JSON array instead of object
|
|
330
|
+
let response = app
|
|
331
|
+
.oneshot(
|
|
332
|
+
Request::builder()
|
|
333
|
+
.uri("/chunk")
|
|
334
|
+
.method("POST")
|
|
335
|
+
.header("content-type", "application/json")
|
|
336
|
+
.body(Body::from(r#"[["text"], {"text": "content"}]"#))
|
|
337
|
+
.expect("Operation failed"),
|
|
338
|
+
)
|
|
339
|
+
.await
|
|
340
|
+
.expect("Operation failed");
|
|
341
|
+
|
|
342
|
+
// Should reject with 400 or 422, NOT 200
|
|
343
|
+
assert!(
|
|
344
|
+
response.status() == StatusCode::BAD_REQUEST || response.status() == StatusCode::UNPROCESSABLE_ENTITY,
|
|
345
|
+
"Expected 400 or 422, got {}",
|
|
346
|
+
response.status()
|
|
347
|
+
);
|
|
348
|
+
}
|
|
@@ -255,6 +255,66 @@ async fn test_embed_malformed_json() {
|
|
|
255
255
|
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
|
256
256
|
}
|
|
257
257
|
|
|
258
|
+
/// Test embed endpoint rejects JSON array at root level.
|
|
259
|
+
#[tokio::test]
|
|
260
|
+
async fn test_embed_rejects_json_array() {
|
|
261
|
+
let app = create_router(ExtractionConfig::default());
|
|
262
|
+
|
|
263
|
+
// Send a JSON array instead of object
|
|
264
|
+
let response = app
|
|
265
|
+
.oneshot(
|
|
266
|
+
Request::builder()
|
|
267
|
+
.method("POST")
|
|
268
|
+
.uri("/embed")
|
|
269
|
+
.header("content-type", "application/json")
|
|
270
|
+
.body(Body::from(r#"[["text1"], {"texts": ["text2"]}]"#))
|
|
271
|
+
.expect("Operation failed"),
|
|
272
|
+
)
|
|
273
|
+
.await
|
|
274
|
+
.expect("Operation failed");
|
|
275
|
+
|
|
276
|
+
// Should reject with 400 or 422, NOT 200
|
|
277
|
+
assert!(
|
|
278
|
+
response.status() == StatusCode::BAD_REQUEST || response.status() == StatusCode::UNPROCESSABLE_ENTITY,
|
|
279
|
+
"Expected 400 or 422, got {}",
|
|
280
|
+
response.status()
|
|
281
|
+
);
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
/// Test embed endpoint rejects simple JSON array with strings.
|
|
285
|
+
#[tokio::test]
|
|
286
|
+
async fn test_embed_rejects_simple_json_array() {
|
|
287
|
+
let app = create_router(ExtractionConfig::default());
|
|
288
|
+
|
|
289
|
+
// Send a simple string array instead of object with texts field
|
|
290
|
+
let response = app
|
|
291
|
+
.oneshot(
|
|
292
|
+
Request::builder()
|
|
293
|
+
.method("POST")
|
|
294
|
+
.uri("/embed")
|
|
295
|
+
.header("content-type", "application/json")
|
|
296
|
+
.body(Body::from(r#"["text1", "text2", "text3"]"#))
|
|
297
|
+
.expect("Operation failed"),
|
|
298
|
+
)
|
|
299
|
+
.await
|
|
300
|
+
.expect("Operation failed");
|
|
301
|
+
|
|
302
|
+
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
|
303
|
+
|
|
304
|
+
// Check that error response contains helpful message
|
|
305
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
306
|
+
.await
|
|
307
|
+
.expect("Failed to read response body");
|
|
308
|
+
let error_response: serde_json::Value = serde_json::from_slice(&body).expect("Failed to parse error response");
|
|
309
|
+
|
|
310
|
+
assert!(
|
|
311
|
+
error_response["message"]
|
|
312
|
+
.as_str()
|
|
313
|
+
.map(|msg| msg.contains("array") || msg.contains("object"))
|
|
314
|
+
.unwrap_or(false)
|
|
315
|
+
);
|
|
316
|
+
}
|
|
317
|
+
|
|
258
318
|
/// Test embed endpoint preserves embedding vector values across calls.
|
|
259
319
|
#[tokio::test]
|
|
260
320
|
async fn test_embed_deterministic() {
|