kreuzberg 4.1.1 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +8 -5
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +23 -13
- data/kreuzberg.gemspec +14 -2
- data/lib/kreuzberg/api_proxy.rb +0 -1
- data/lib/kreuzberg/cli_proxy.rb +0 -1
- data/lib/kreuzberg/config.rb +70 -35
- data/lib/kreuzberg/mcp_proxy.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +5 -1
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +55 -53
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
- data/vendor/kreuzberg-tesseract/build.rs +4 -4
- data/vendor/kreuzberg-tesseract/src/lib.rs +6 -6
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +3 -3
- metadata +13 -2
|
@@ -4,25 +4,72 @@
|
|
|
4
4
|
|
|
5
5
|
use crate::{ExtractionConfig, ExtractionResult as KreuzbergResult};
|
|
6
6
|
|
|
7
|
+
/// Merge extraction configuration using JSON-level merge.
|
|
8
|
+
///
|
|
9
|
+
/// This function performs a JSON-level merge where fields present in the override
|
|
10
|
+
/// JSON take precedence over the base config. This approach correctly handles
|
|
11
|
+
/// boolean fields that are explicitly set to their default values.
|
|
12
|
+
///
|
|
13
|
+
/// # Strategy
|
|
14
|
+
///
|
|
15
|
+
/// 1. Serialize base config to JSON
|
|
16
|
+
/// 2. For each field in the override JSON, merge into base JSON (field-by-field override)
|
|
17
|
+
/// 3. Deserialize merged JSON back to ExtractionConfig
|
|
18
|
+
///
|
|
19
|
+
/// This ensures that explicitly provided values always take precedence, even if
|
|
20
|
+
/// they match the default value. Unspecified fields are preserved from base config.
|
|
21
|
+
///
|
|
22
|
+
/// # Examples
|
|
23
|
+
///
|
|
24
|
+
/// ```rust,ignore
|
|
25
|
+
/// use kreuzberg::{ExtractionConfig, OutputFormat};
|
|
26
|
+
/// use serde_json::json;
|
|
27
|
+
///
|
|
28
|
+
/// let mut base = ExtractionConfig::default();
|
|
29
|
+
/// base.use_cache = true;
|
|
30
|
+
///
|
|
31
|
+
/// let override_json = json!({
|
|
32
|
+
/// "force_ocr": true,
|
|
33
|
+
/// });
|
|
34
|
+
///
|
|
35
|
+
/// let merged = merge_configs(&base, override_json).unwrap();
|
|
36
|
+
/// assert_eq!(merged.use_cache, true); // from base
|
|
37
|
+
/// assert_eq!(merged.force_ocr, true); // from override
|
|
38
|
+
/// ```
|
|
39
|
+
fn merge_configs(base: &ExtractionConfig, override_json: serde_json::Value) -> Result<ExtractionConfig, String> {
|
|
40
|
+
// Serialize base config to JSON
|
|
41
|
+
let mut config_json =
|
|
42
|
+
serde_json::to_value(base).map_err(|e| format!("Failed to serialize base config to JSON: {}", e))?;
|
|
43
|
+
|
|
44
|
+
// Merge JSON value into config JSON (simple field-by-field merge); a non-object override is ignored and the base config is used as-is
|
|
45
|
+
// For each top-level key in the provided JSON, override the corresponding key in config JSON (shallow: a nested object replaces the whole base value, it is not merged recursively)
|
|
46
|
+
if let serde_json::Value::Object(json_obj) = override_json
|
|
47
|
+
&& let Some(config_obj) = config_json.as_object_mut()
|
|
48
|
+
{
|
|
49
|
+
for (key, value) in json_obj {
|
|
50
|
+
config_obj.insert(key, value);
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
// Deserialize merged JSON back to ExtractionConfig
|
|
55
|
+
serde_json::from_value(config_json).map_err(|e| format!("Failed to deserialize merged config: {}", e))
|
|
56
|
+
}
|
|
57
|
+
|
|
7
58
|
/// Build extraction config from MCP parameters.
|
|
8
59
|
///
|
|
9
|
-
///
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
output_format: None,
|
|
19
|
-
})
|
|
60
|
+
/// Merges the provided config JSON (if any) with the default config using JSON-level
|
|
61
|
+
/// merge semantics. Unspecified fields in the JSON preserve their values from the default config.
|
|
62
|
+
pub(super) fn build_config(
|
|
63
|
+
default_config: &ExtractionConfig,
|
|
64
|
+
config_json: Option<serde_json::Value>,
|
|
65
|
+
) -> Result<ExtractionConfig, String> {
|
|
66
|
+
if let Some(json) = config_json {
|
|
67
|
+
// Merge using JSON-level merge: provided JSON fields override default config
|
|
68
|
+
merge_configs(default_config, json)
|
|
20
69
|
} else {
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
config
|
|
70
|
+
// No config provided, use default
|
|
71
|
+
Ok(default_config.clone())
|
|
72
|
+
}
|
|
26
73
|
}
|
|
27
74
|
|
|
28
75
|
/// Format extraction result as human-readable text.
|
|
@@ -54,20 +101,35 @@ mod tests {
|
|
|
54
101
|
use super::*;
|
|
55
102
|
|
|
56
103
|
#[test]
|
|
57
|
-
fn
|
|
104
|
+
fn test_build_config_with_no_config() {
|
|
58
105
|
let default_config = ExtractionConfig::default();
|
|
59
106
|
|
|
60
|
-
let config = build_config(&default_config,
|
|
61
|
-
|
|
62
|
-
|
|
107
|
+
let config = build_config(&default_config, None).unwrap();
|
|
108
|
+
assert_eq!(config.use_cache, default_config.use_cache);
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
#[test]
|
|
112
|
+
fn test_build_config_with_config_json() {
|
|
113
|
+
let default_config = ExtractionConfig::default();
|
|
114
|
+
let config_json = serde_json::json!({
|
|
115
|
+
"use_cache": false
|
|
116
|
+
});
|
|
63
117
|
|
|
64
|
-
let config = build_config(&default_config,
|
|
65
|
-
assert!(config.
|
|
66
|
-
|
|
118
|
+
let config = build_config(&default_config, Some(config_json)).unwrap();
|
|
119
|
+
assert!(!config.use_cache);
|
|
120
|
+
}
|
|
67
121
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
122
|
+
#[test]
|
|
123
|
+
fn test_build_config_with_invalid_config_json() {
|
|
124
|
+
let default_config = ExtractionConfig::default();
|
|
125
|
+
// Provide invalid type for a field (string instead of boolean)
|
|
126
|
+
let config_json = serde_json::json!({
|
|
127
|
+
"use_cache": "not_a_boolean"
|
|
128
|
+
});
|
|
129
|
+
|
|
130
|
+
let result = build_config(&default_config, Some(config_json));
|
|
131
|
+
assert!(result.is_err());
|
|
132
|
+
assert!(result.unwrap_err().contains("Failed to deserialize"));
|
|
71
133
|
}
|
|
72
134
|
|
|
73
135
|
#[test]
|
|
@@ -77,31 +139,167 @@ mod tests {
|
|
|
77
139
|
..Default::default()
|
|
78
140
|
};
|
|
79
141
|
|
|
80
|
-
let config = build_config(&default_config,
|
|
142
|
+
let config = build_config(&default_config, None).unwrap();
|
|
81
143
|
|
|
82
144
|
assert!(!config.use_cache);
|
|
83
145
|
}
|
|
84
146
|
|
|
85
147
|
#[test]
|
|
86
|
-
fn
|
|
87
|
-
let default_config = ExtractionConfig
|
|
148
|
+
fn test_build_config_overrides_default_settings() {
|
|
149
|
+
let default_config = ExtractionConfig {
|
|
150
|
+
use_cache: true,
|
|
151
|
+
..Default::default()
|
|
152
|
+
};
|
|
88
153
|
|
|
89
|
-
let
|
|
154
|
+
let config_json = serde_json::json!({
|
|
155
|
+
"use_cache": false
|
|
156
|
+
});
|
|
90
157
|
|
|
91
|
-
|
|
92
|
-
assert!(!config.
|
|
158
|
+
let config = build_config(&default_config, Some(config_json)).unwrap();
|
|
159
|
+
assert!(!config.use_cache);
|
|
93
160
|
}
|
|
94
161
|
|
|
95
162
|
#[test]
|
|
96
|
-
fn
|
|
97
|
-
|
|
163
|
+
fn test_build_config_merges_partial_config() {
|
|
164
|
+
// Base config with custom use_cache setting
|
|
165
|
+
let default_config = ExtractionConfig {
|
|
166
|
+
use_cache: false,
|
|
167
|
+
enable_quality_processing: true,
|
|
168
|
+
force_ocr: false,
|
|
169
|
+
..Default::default()
|
|
170
|
+
};
|
|
171
|
+
|
|
172
|
+
// Override only force_ocr
|
|
173
|
+
let config_json = serde_json::json!({
|
|
174
|
+
"force_ocr": true
|
|
175
|
+
});
|
|
176
|
+
|
|
177
|
+
let config = build_config(&default_config, Some(config_json)).unwrap();
|
|
178
|
+
|
|
179
|
+
// use_cache should be preserved from default_config
|
|
180
|
+
assert!(!config.use_cache, "use_cache should be preserved from default config");
|
|
181
|
+
// enable_quality_processing should be preserved
|
|
182
|
+
assert!(
|
|
183
|
+
config.enable_quality_processing,
|
|
184
|
+
"enable_quality_processing should be preserved"
|
|
185
|
+
);
|
|
186
|
+
// force_ocr should be overridden
|
|
187
|
+
assert!(config.force_ocr, "force_ocr should be overridden to true");
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
#[test]
|
|
191
|
+
fn test_build_config_merges_nested_config() {
|
|
192
|
+
let default_config = ExtractionConfig {
|
|
193
|
+
use_cache: true,
|
|
194
|
+
..Default::default()
|
|
195
|
+
};
|
|
196
|
+
|
|
197
|
+
// Override output format only
|
|
198
|
+
let config_json = serde_json::json!({
|
|
199
|
+
"output_format": "markdown"
|
|
200
|
+
});
|
|
201
|
+
|
|
202
|
+
let config = build_config(&default_config, Some(config_json)).unwrap();
|
|
203
|
+
|
|
204
|
+
// use_cache should be preserved
|
|
205
|
+
assert!(config.use_cache, "use_cache should be preserved from default config");
|
|
206
|
+
// output_format should be overridden
|
|
207
|
+
assert_eq!(
|
|
208
|
+
config.output_format,
|
|
209
|
+
crate::core::config::formats::OutputFormat::Markdown,
|
|
210
|
+
"output_format should be overridden to markdown"
|
|
211
|
+
);
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
#[test]
|
|
215
|
+
fn test_build_config_merges_with_custom_defaults() {
|
|
216
|
+
// Create a default config with custom values
|
|
217
|
+
let default_config = ExtractionConfig {
|
|
218
|
+
use_cache: false,
|
|
219
|
+
enable_quality_processing: true,
|
|
220
|
+
force_ocr: false,
|
|
221
|
+
..Default::default()
|
|
222
|
+
};
|
|
223
|
+
|
|
224
|
+
// Provide partial override (only force_ocr)
|
|
225
|
+
let config_json = serde_json::json!({
|
|
226
|
+
"force_ocr": true,
|
|
227
|
+
});
|
|
228
|
+
|
|
229
|
+
let config = build_config(&default_config, Some(config_json)).unwrap();
|
|
230
|
+
|
|
231
|
+
// force_ocr should be overridden
|
|
232
|
+
assert!(config.force_ocr, "force_ocr should be overridden to true");
|
|
233
|
+
// use_cache should be preserved from default_config
|
|
234
|
+
assert!(
|
|
235
|
+
!config.use_cache,
|
|
236
|
+
"use_cache should be preserved from default config (false)"
|
|
237
|
+
);
|
|
238
|
+
// enable_quality_processing should be preserved
|
|
239
|
+
assert!(
|
|
240
|
+
config.enable_quality_processing,
|
|
241
|
+
"enable_quality_processing should be preserved (true)"
|
|
242
|
+
);
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
#[test]
|
|
246
|
+
fn test_build_config_merges_multiple_fields() {
|
|
247
|
+
let default_config = ExtractionConfig {
|
|
248
|
+
use_cache: true,
|
|
249
|
+
enable_quality_processing: false,
|
|
250
|
+
force_ocr: true,
|
|
251
|
+
..Default::default()
|
|
252
|
+
};
|
|
253
|
+
|
|
254
|
+
// Override multiple fields
|
|
255
|
+
let config_json = serde_json::json!({
|
|
256
|
+
"use_cache": false,
|
|
257
|
+
"output_format": "markdown",
|
|
258
|
+
});
|
|
259
|
+
|
|
260
|
+
let config = build_config(&default_config, Some(config_json)).unwrap();
|
|
261
|
+
|
|
262
|
+
// use_cache should be overridden
|
|
263
|
+
assert!(!config.use_cache, "use_cache should be overridden to false");
|
|
264
|
+
// output_format should be overridden
|
|
265
|
+
assert_eq!(
|
|
266
|
+
config.output_format,
|
|
267
|
+
crate::core::config::formats::OutputFormat::Markdown,
|
|
268
|
+
"output_format should be overridden to markdown"
|
|
269
|
+
);
|
|
270
|
+
// force_ocr should be preserved (not in override)
|
|
271
|
+
assert!(
|
|
272
|
+
config.force_ocr,
|
|
273
|
+
"force_ocr should be preserved from default config (true)"
|
|
274
|
+
);
|
|
275
|
+
// enable_quality_processing should be preserved
|
|
276
|
+
assert!(
|
|
277
|
+
!config.enable_quality_processing,
|
|
278
|
+
"enable_quality_processing should be preserved (false)"
|
|
279
|
+
);
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
#[test]
|
|
283
|
+
fn test_build_config_boolean_override_to_default_value() {
|
|
284
|
+
// This test validates the critical bug fix: when user explicitly sets a boolean
|
|
285
|
+
// to its default value, the merge logic should correctly use the override value,
|
|
286
|
+
// not fall back to the base config.
|
|
287
|
+
let base = ExtractionConfig {
|
|
288
|
+
use_cache: false,
|
|
289
|
+
..Default::default()
|
|
290
|
+
};
|
|
291
|
+
|
|
292
|
+
// User explicitly provides use_cache: true (which IS the default)
|
|
293
|
+
let override_json = serde_json::json!({"use_cache": true});
|
|
98
294
|
|
|
99
|
-
let
|
|
295
|
+
let merged = build_config(&base, Some(override_json)).unwrap();
|
|
100
296
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
297
|
+
// Before the fix: merged.use_cache would be false (WRONG - fell back to base)
|
|
298
|
+
// After the fix: merged.use_cache should be true (CORRECT - override applied)
|
|
299
|
+
assert!(
|
|
300
|
+
merged.use_cache,
|
|
301
|
+
"Should use explicit override even if it matches default"
|
|
302
|
+
);
|
|
105
303
|
}
|
|
106
304
|
|
|
107
305
|
#[test]
|
|
@@ -12,12 +12,9 @@ pub struct ExtractFileParams {
|
|
|
12
12
|
/// Optional MIME type hint (auto-detected if not provided)
|
|
13
13
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
14
14
|
pub mime_type: Option<String>,
|
|
15
|
-
///
|
|
16
|
-
#[serde(
|
|
17
|
-
pub
|
|
18
|
-
/// Force OCR even if text extraction succeeds
|
|
19
|
-
#[serde(default)]
|
|
20
|
-
pub force_ocr: bool,
|
|
15
|
+
/// Extraction configuration (JSON object)
|
|
16
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
17
|
+
pub config: Option<serde_json::Value>,
|
|
21
18
|
/// Use async extraction (default: false for sync)
|
|
22
19
|
#[serde(default)]
|
|
23
20
|
pub r#async: bool,
|
|
@@ -31,12 +28,9 @@ pub struct ExtractBytesParams {
|
|
|
31
28
|
/// Optional MIME type hint (auto-detected if not provided)
|
|
32
29
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
33
30
|
pub mime_type: Option<String>,
|
|
34
|
-
///
|
|
35
|
-
#[serde(
|
|
36
|
-
pub
|
|
37
|
-
/// Force OCR even if text extraction succeeds
|
|
38
|
-
#[serde(default)]
|
|
39
|
-
pub force_ocr: bool,
|
|
31
|
+
/// Extraction configuration (JSON object)
|
|
32
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
33
|
+
pub config: Option<serde_json::Value>,
|
|
40
34
|
/// Use async extraction (default: false for sync)
|
|
41
35
|
#[serde(default)]
|
|
42
36
|
pub r#async: bool,
|
|
@@ -47,12 +41,9 @@ pub struct ExtractBytesParams {
|
|
|
47
41
|
pub struct BatchExtractFilesParams {
|
|
48
42
|
/// Paths to files to extract
|
|
49
43
|
pub paths: Vec<String>,
|
|
50
|
-
///
|
|
51
|
-
#[serde(
|
|
52
|
-
pub
|
|
53
|
-
/// Force OCR even if text extraction succeeds
|
|
54
|
-
#[serde(default)]
|
|
55
|
-
pub force_ocr: bool,
|
|
44
|
+
/// Extraction configuration (JSON object)
|
|
45
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
46
|
+
pub config: Option<serde_json::Value>,
|
|
56
47
|
/// Use async extraction (default: false for sync)
|
|
57
48
|
#[serde(default)]
|
|
58
49
|
pub r#async: bool,
|
|
@@ -83,8 +74,7 @@ mod tests {
|
|
|
83
74
|
|
|
84
75
|
assert_eq!(params.path, "/test.pdf");
|
|
85
76
|
assert_eq!(params.mime_type, None);
|
|
86
|
-
|
|
87
|
-
assert!(!params.force_ocr);
|
|
77
|
+
assert_eq!(params.config, None);
|
|
88
78
|
assert!(!params.r#async);
|
|
89
79
|
}
|
|
90
80
|
|
|
@@ -95,8 +85,7 @@ mod tests {
|
|
|
95
85
|
|
|
96
86
|
assert_eq!(params.data, "SGVsbG8=");
|
|
97
87
|
assert_eq!(params.mime_type, None);
|
|
98
|
-
|
|
99
|
-
assert!(!params.force_ocr);
|
|
88
|
+
assert_eq!(params.config, None);
|
|
100
89
|
assert!(!params.r#async);
|
|
101
90
|
}
|
|
102
91
|
|
|
@@ -106,8 +95,7 @@ mod tests {
|
|
|
106
95
|
let params: BatchExtractFilesParams = serde_json::from_str(json).unwrap();
|
|
107
96
|
|
|
108
97
|
assert_eq!(params.paths.len(), 2);
|
|
109
|
-
|
|
110
|
-
assert!(!params.force_ocr);
|
|
98
|
+
assert_eq!(params.config, None);
|
|
111
99
|
assert!(!params.r#async);
|
|
112
100
|
}
|
|
113
101
|
|
|
@@ -128,13 +116,21 @@ mod tests {
|
|
|
128
116
|
assert!(!params.use_content);
|
|
129
117
|
}
|
|
130
118
|
|
|
119
|
+
#[test]
|
|
120
|
+
fn test_extract_file_params_with_config() {
|
|
121
|
+
let json = r#"{"path": "/test.pdf", "config": {"use_cache": false}}"#;
|
|
122
|
+
let params: ExtractFileParams = serde_json::from_str(json).unwrap();
|
|
123
|
+
|
|
124
|
+
assert_eq!(params.path, "/test.pdf");
|
|
125
|
+
assert!(params.config.is_some());
|
|
126
|
+
}
|
|
127
|
+
|
|
131
128
|
#[test]
|
|
132
129
|
fn test_extract_file_params_serialization() {
|
|
133
130
|
let params = ExtractFileParams {
|
|
134
131
|
path: "/test.pdf".to_string(),
|
|
135
132
|
mime_type: Some("application/pdf".to_string()),
|
|
136
|
-
|
|
137
|
-
force_ocr: false,
|
|
133
|
+
config: Some(serde_json::json!({"use_cache": false})),
|
|
138
134
|
r#async: true,
|
|
139
135
|
};
|
|
140
136
|
|
|
@@ -143,8 +139,7 @@ mod tests {
|
|
|
143
139
|
|
|
144
140
|
assert_eq!(params.path, deserialized.path);
|
|
145
141
|
assert_eq!(params.mime_type, deserialized.mime_type);
|
|
146
|
-
assert_eq!(params.
|
|
147
|
-
assert_eq!(params.force_ocr, deserialized.force_ocr);
|
|
142
|
+
assert_eq!(params.config, deserialized.config);
|
|
148
143
|
assert_eq!(params.r#async, deserialized.r#async);
|
|
149
144
|
}
|
|
150
145
|
|
|
@@ -153,8 +148,7 @@ mod tests {
|
|
|
153
148
|
let params = ExtractBytesParams {
|
|
154
149
|
data: "SGVsbG8=".to_string(),
|
|
155
150
|
mime_type: None,
|
|
156
|
-
|
|
157
|
-
force_ocr: false,
|
|
151
|
+
config: None,
|
|
158
152
|
r#async: false,
|
|
159
153
|
};
|
|
160
154
|
|
|
@@ -168,8 +162,7 @@ mod tests {
|
|
|
168
162
|
fn test_batch_extract_params_serialization() {
|
|
169
163
|
let params = BatchExtractFilesParams {
|
|
170
164
|
paths: vec!["/a.pdf".to_string(), "/b.pdf".to_string()],
|
|
171
|
-
|
|
172
|
-
force_ocr: true,
|
|
165
|
+
config: Some(serde_json::json!({"use_cache": true})),
|
|
173
166
|
r#async: true,
|
|
174
167
|
};
|
|
175
168
|
|
|
@@ -177,7 +170,7 @@ mod tests {
|
|
|
177
170
|
let deserialized: BatchExtractFilesParams = serde_json::from_str(&json).unwrap();
|
|
178
171
|
|
|
179
172
|
assert_eq!(params.paths, deserialized.paths);
|
|
180
|
-
assert_eq!(params.
|
|
173
|
+
assert_eq!(params.config, deserialized.config);
|
|
181
174
|
}
|
|
182
175
|
|
|
183
176
|
#[test]
|
|
@@ -80,7 +80,8 @@ impl KreuzbergMcp {
|
|
|
80
80
|
use super::format::{build_config, format_extraction_result};
|
|
81
81
|
use crate::{extract_file, extract_file_sync};
|
|
82
82
|
|
|
83
|
-
let config =
|
|
83
|
+
let config =
|
|
84
|
+
build_config(&self.default_config, params.config).map_err(|e| rmcp::ErrorData::invalid_params(e, None))?;
|
|
84
85
|
|
|
85
86
|
let result = if params.r#async {
|
|
86
87
|
extract_file(¶ms.path, params.mime_type.as_deref(), &config)
|
|
@@ -114,7 +115,8 @@ impl KreuzbergMcp {
|
|
|
114
115
|
.decode(¶ms.data)
|
|
115
116
|
.map_err(|e| rmcp::ErrorData::invalid_params(format!("Invalid base64: {}", e), None))?;
|
|
116
117
|
|
|
117
|
-
let config =
|
|
118
|
+
let config =
|
|
119
|
+
build_config(&self.default_config, params.config).map_err(|e| rmcp::ErrorData::invalid_params(e, None))?;
|
|
118
120
|
|
|
119
121
|
let mime_type = params.mime_type.as_deref().unwrap_or("");
|
|
120
122
|
|
|
@@ -145,7 +147,8 @@ impl KreuzbergMcp {
|
|
|
145
147
|
use super::format::{build_config, format_extraction_result};
|
|
146
148
|
use crate::{batch_extract_file, batch_extract_file_sync};
|
|
147
149
|
|
|
148
|
-
let config =
|
|
150
|
+
let config =
|
|
151
|
+
build_config(&self.default_config, params.config).map_err(|e| rmcp::ErrorData::invalid_params(e, None))?;
|
|
149
152
|
|
|
150
153
|
let results = if params.r#async {
|
|
151
154
|
batch_extract_file(params.paths.clone(), &config)
|
|
@@ -30,7 +30,8 @@ pub(in crate::mcp) trait ExtractionTool {
|
|
|
30
30
|
&self,
|
|
31
31
|
Parameters(params): Parameters<ExtractFileParams>,
|
|
32
32
|
) -> Result<CallToolResult, McpError> {
|
|
33
|
-
let config = build_config(self.default_config(), params.
|
|
33
|
+
let config = build_config(self.default_config(), params.config)
|
|
34
|
+
.map_err(|e| McpError::invalid_params(e, None))?;
|
|
34
35
|
|
|
35
36
|
let result = if params.r#async {
|
|
36
37
|
extract_file(¶ms.path, params.mime_type.as_deref(), &config)
|
|
@@ -59,7 +60,8 @@ pub(in crate::mcp) trait ExtractionTool {
|
|
|
59
60
|
.decode(¶ms.data)
|
|
60
61
|
.map_err(|e| McpError::invalid_params(format!("Invalid base64: {}", e), None))?;
|
|
61
62
|
|
|
62
|
-
let config = build_config(self.default_config(), params.
|
|
63
|
+
let config = build_config(self.default_config(), params.config)
|
|
64
|
+
.map_err(|e| McpError::invalid_params(e, None))?;
|
|
63
65
|
|
|
64
66
|
let mime_type = params.mime_type.as_deref().unwrap_or("");
|
|
65
67
|
|
|
@@ -86,7 +88,8 @@ pub(in crate::mcp) trait ExtractionTool {
|
|
|
86
88
|
&self,
|
|
87
89
|
Parameters(params): Parameters<BatchExtractFilesParams>,
|
|
88
90
|
) -> Result<CallToolResult, McpError> {
|
|
89
|
-
let config = build_config(self.default_config(), params.
|
|
91
|
+
let config = build_config(self.default_config(), params.config)
|
|
92
|
+
.map_err(|e| McpError::invalid_params(e, None))?;
|
|
90
93
|
|
|
91
94
|
let results = if params.r#async {
|
|
92
95
|
batch_extract_file(params.paths.clone(), &config)
|
|
@@ -153,8 +156,7 @@ mod tests {
|
|
|
153
156
|
let params = ExtractFileParams {
|
|
154
157
|
path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
|
|
155
158
|
mime_type: None,
|
|
156
|
-
|
|
157
|
-
force_ocr: false,
|
|
159
|
+
config: None,
|
|
158
160
|
r#async: true,
|
|
159
161
|
};
|
|
160
162
|
|
|
@@ -181,8 +183,7 @@ mod tests {
|
|
|
181
183
|
let params = ExtractFileParams {
|
|
182
184
|
path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
|
|
183
185
|
mime_type: None,
|
|
184
|
-
|
|
185
|
-
force_ocr: false,
|
|
186
|
+
config: None,
|
|
186
187
|
r#async: true,
|
|
187
188
|
};
|
|
188
189
|
|
|
@@ -208,8 +209,7 @@ mod tests {
|
|
|
208
209
|
let params = ExtractFileParams {
|
|
209
210
|
path: "/nonexistent/file.pdf".to_string(),
|
|
210
211
|
mime_type: None,
|
|
211
|
-
|
|
212
|
-
force_ocr: false,
|
|
212
|
+
config: None,
|
|
213
213
|
r#async: true,
|
|
214
214
|
};
|
|
215
215
|
|
|
@@ -226,8 +226,7 @@ mod tests {
|
|
|
226
226
|
let params = ExtractFileParams {
|
|
227
227
|
path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
|
|
228
228
|
mime_type: Some("application/pdf".to_string()),
|
|
229
|
-
|
|
230
|
-
force_ocr: false,
|
|
229
|
+
config: None,
|
|
231
230
|
r#async: true,
|
|
232
231
|
};
|
|
233
232
|
|
|
@@ -246,8 +245,7 @@ mod tests {
|
|
|
246
245
|
let params = ExtractBytesParams {
|
|
247
246
|
data: encoded,
|
|
248
247
|
mime_type: Some("text/plain".to_string()),
|
|
249
|
-
|
|
250
|
-
force_ocr: false,
|
|
248
|
+
config: None,
|
|
251
249
|
r#async: true,
|
|
252
250
|
};
|
|
253
251
|
|
|
@@ -274,8 +272,7 @@ mod tests {
|
|
|
274
272
|
let params = ExtractBytesParams {
|
|
275
273
|
data: "not-valid-base64!!!".to_string(),
|
|
276
274
|
mime_type: None,
|
|
277
|
-
|
|
278
|
-
force_ocr: false,
|
|
275
|
+
config: None,
|
|
279
276
|
r#async: true,
|
|
280
277
|
};
|
|
281
278
|
|
|
@@ -292,8 +289,7 @@ mod tests {
|
|
|
292
289
|
let server = TestMcpServer::new();
|
|
293
290
|
let params = BatchExtractFilesParams {
|
|
294
291
|
paths: vec![get_test_path("pdfs_with_tables/tiny.pdf").to_string()],
|
|
295
|
-
|
|
296
|
-
force_ocr: false,
|
|
292
|
+
config: None,
|
|
297
293
|
r#async: true,
|
|
298
294
|
};
|
|
299
295
|
|
|
@@ -319,8 +315,7 @@ mod tests {
|
|
|
319
315
|
let server = TestMcpServer::new();
|
|
320
316
|
let params = BatchExtractFilesParams {
|
|
321
317
|
paths: vec![],
|
|
322
|
-
|
|
323
|
-
force_ocr: false,
|
|
318
|
+
config: None,
|
|
324
319
|
r#async: true,
|
|
325
320
|
};
|
|
326
321
|
|
|
@@ -350,8 +345,7 @@ mod tests {
|
|
|
350
345
|
let params = ExtractFileParams {
|
|
351
346
|
path: test_file.to_string(),
|
|
352
347
|
mime_type: None,
|
|
353
|
-
|
|
354
|
-
force_ocr: false,
|
|
348
|
+
config: None,
|
|
355
349
|
r#async: true,
|
|
356
350
|
};
|
|
357
351
|
|
|
@@ -378,8 +372,7 @@ mod tests {
|
|
|
378
372
|
if std::path::Path::new(&file1).exists() && std::path::Path::new(&file2).exists() {
|
|
379
373
|
let params = BatchExtractFilesParams {
|
|
380
374
|
paths: vec![file1.to_string(), file2.to_string()],
|
|
381
|
-
|
|
382
|
-
force_ocr: false,
|
|
375
|
+
config: None,
|
|
383
376
|
r#async: true,
|
|
384
377
|
};
|
|
385
378
|
|