kreuzberg 4.1.1 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -4
  3. data/README.md +8 -5
  4. data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
  5. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +23 -13
  7. data/kreuzberg.gemspec +14 -2
  8. data/lib/kreuzberg/api_proxy.rb +0 -1
  9. data/lib/kreuzberg/cli_proxy.rb +0 -1
  10. data/lib/kreuzberg/config.rb +70 -35
  11. data/lib/kreuzberg/mcp_proxy.rb +0 -1
  12. data/lib/kreuzberg/version.rb +1 -1
  13. data/sig/kreuzberg.rbs +5 -1
  14. data/spec/binding/batch_operations_spec.rb +80 -0
  15. data/spec/binding/metadata_types_spec.rb +77 -57
  16. data/spec/serialization_spec.rb +134 -0
  17. data/spec/unit/config/output_format_spec.rb +380 -0
  18. data/vendor/Cargo.toml +1 -1
  19. data/vendor/kreuzberg/Cargo.toml +3 -3
  20. data/vendor/kreuzberg/README.md +1 -1
  21. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  22. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  23. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  24. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  25. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  26. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  27. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  28. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  29. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  30. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  31. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  32. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  33. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  34. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  35. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  36. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  37. data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
  38. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  39. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  40. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  41. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  42. data/vendor/kreuzberg/tests/core_integration.rs +55 -53
  43. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  44. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  45. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  46. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  47. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  48. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  49. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  50. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  51. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  52. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  53. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  54. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  55. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  56. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  57. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  58. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  59. data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
  60. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  61. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  62. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  63. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  64. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  65. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  66. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  67. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  68. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  69. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  70. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  71. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  72. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  73. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  74. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
  75. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  76. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  77. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  78. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  79. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  80. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  81. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  82. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  83. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  84. data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
  85. data/vendor/kreuzberg-tesseract/build.rs +4 -4
  86. data/vendor/kreuzberg-tesseract/src/lib.rs +6 -6
  87. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +3 -3
  88. metadata +13 -2
@@ -4,25 +4,72 @@
4
4
 
5
5
  use crate::{ExtractionConfig, ExtractionResult as KreuzbergResult};
6
6
 
7
+ /// Merge extraction configuration using JSON-level merge.
8
+ ///
9
+ /// This function performs a JSON-level merge where fields present in the override
10
+ /// JSON take precedence over the base config. This approach correctly handles
11
+ /// boolean fields that are explicitly set to their default values.
12
+ ///
13
+ /// # Strategy
14
+ ///
15
+ /// 1. Serialize base config to JSON
16
+ /// 2. For each top-level field in the override JSON, replace that field in the base JSON (shallow, field-by-field override; nested objects are replaced wholesale, not deep-merged)
17
+ /// 3. Deserialize merged JSON back to ExtractionConfig
18
+ ///
19
+ /// This ensures that explicitly provided values always take precedence, even if
20
+ /// they match the default value. Unspecified fields are preserved from base config.
21
+ ///
22
+ /// # Examples
23
+ ///
24
+ /// ```rust,no_run
25
+ /// use kreuzberg::{ExtractionConfig, OutputFormat};
26
+ /// use serde_json::json;
27
+ ///
28
+ /// let mut base = ExtractionConfig::default();
29
+ /// base.use_cache = true;
30
+ ///
31
+ /// let override_json = json!({
32
+ /// "force_ocr": true,
33
+ /// });
34
+ ///
35
+ /// let merged = merge_configs(&base, override_json).unwrap();
36
+ /// assert_eq!(merged.use_cache, true); // from base
37
+ /// assert_eq!(merged.force_ocr, true); // from override
38
+ /// ```
39
+ fn merge_configs(base: &ExtractionConfig, override_json: serde_json::Value) -> Result<ExtractionConfig, String> {
40
+ // Serialize base config to JSON
41
+ let mut config_json =
42
+ serde_json::to_value(base).map_err(|e| format!("Failed to serialize base config to JSON: {}", e))?;
43
+
44
+ // Merge JSON value into config JSON (simple field-by-field merge)
45
+ // For each top-level key in the provided JSON, replace the corresponding key in config JSON (nested objects are replaced as a whole)
46
+ if let serde_json::Value::Object(json_obj) = override_json
47
+ && let Some(config_obj) = config_json.as_object_mut()
48
+ {
49
+ for (key, value) in json_obj {
50
+ config_obj.insert(key, value);
51
+ }
52
+ }
53
+
54
+ // Deserialize merged JSON back to ExtractionConfig
55
+ serde_json::from_value(config_json).map_err(|e| format!("Failed to deserialize merged config: {}", e))
56
+ }
57
+
7
58
  /// Build extraction config from MCP parameters.
8
59
  ///
9
- /// Starts with the default config and overlays OCR settings from request parameters.
10
- pub(super) fn build_config(default_config: &ExtractionConfig, enable_ocr: bool, force_ocr: bool) -> ExtractionConfig {
11
- let mut config = default_config.clone();
12
-
13
- config.ocr = if enable_ocr {
14
- Some(crate::OcrConfig {
15
- backend: "tesseract".to_string(),
16
- language: "eng".to_string(),
17
- tesseract_config: None,
18
- output_format: None,
19
- })
60
+ /// Merges the provided config JSON (if any) with the default config using JSON-level
61
+ /// merge semantics. Unspecified fields in the JSON preserve their values from the default config.
62
+ pub(super) fn build_config(
63
+ default_config: &ExtractionConfig,
64
+ config_json: Option<serde_json::Value>,
65
+ ) -> Result<ExtractionConfig, String> {
66
+ if let Some(json) = config_json {
67
+ // Merge using JSON-level merge: provided JSON fields override default config
68
+ merge_configs(default_config, json)
20
69
  } else {
21
- None
22
- };
23
- config.force_ocr = force_ocr;
24
-
25
- config
70
+ // No config provided, use default
71
+ Ok(default_config.clone())
72
+ }
26
73
  }
27
74
 
28
75
  /// Format extraction result as human-readable text.
@@ -54,20 +101,35 @@ mod tests {
54
101
  use super::*;
55
102
 
56
103
  #[test]
57
- fn test_build_config() {
104
+ fn test_build_config_with_no_config() {
58
105
  let default_config = ExtractionConfig::default();
59
106
 
60
- let config = build_config(&default_config, false, false);
61
- assert!(config.ocr.is_none());
62
- assert!(!config.force_ocr);
107
+ let config = build_config(&default_config, None).unwrap();
108
+ assert_eq!(config.use_cache, default_config.use_cache);
109
+ }
110
+
111
+ #[test]
112
+ fn test_build_config_with_config_json() {
113
+ let default_config = ExtractionConfig::default();
114
+ let config_json = serde_json::json!({
115
+ "use_cache": false
116
+ });
63
117
 
64
- let config = build_config(&default_config, true, false);
65
- assert!(config.ocr.is_some());
66
- assert!(!config.force_ocr);
118
+ let config = build_config(&default_config, Some(config_json)).unwrap();
119
+ assert!(!config.use_cache);
120
+ }
67
121
 
68
- let config = build_config(&default_config, true, true);
69
- assert!(config.ocr.is_some());
70
- assert!(config.force_ocr);
122
+ #[test]
123
+ fn test_build_config_with_invalid_config_json() {
124
+ let default_config = ExtractionConfig::default();
125
+ // Provide invalid type for a field (string instead of boolean)
126
+ let config_json = serde_json::json!({
127
+ "use_cache": "not_a_boolean"
128
+ });
129
+
130
+ let result = build_config(&default_config, Some(config_json));
131
+ assert!(result.is_err());
132
+ assert!(result.unwrap_err().contains("Failed to deserialize"));
71
133
  }
72
134
 
73
135
  #[test]
@@ -77,31 +139,167 @@ mod tests {
77
139
  ..Default::default()
78
140
  };
79
141
 
80
- let config = build_config(&default_config, false, false);
142
+ let config = build_config(&default_config, None).unwrap();
81
143
 
82
144
  assert!(!config.use_cache);
83
145
  }
84
146
 
85
147
  #[test]
86
- fn test_build_config_ocr_disabled_by_default() {
87
- let default_config = ExtractionConfig::default();
148
+ fn test_build_config_overrides_default_settings() {
149
+ let default_config = ExtractionConfig {
150
+ use_cache: true,
151
+ ..Default::default()
152
+ };
88
153
 
89
- let config = build_config(&default_config, false, false);
154
+ let config_json = serde_json::json!({
155
+ "use_cache": false
156
+ });
90
157
 
91
- assert!(config.ocr.is_none());
92
- assert!(!config.force_ocr);
158
+ let config = build_config(&default_config, Some(config_json)).unwrap();
159
+ assert!(!config.use_cache);
93
160
  }
94
161
 
95
162
  #[test]
96
- fn test_build_config_ocr_enabled_creates_tesseract_config() {
97
- let default_config = ExtractionConfig::default();
163
+ fn test_build_config_merges_partial_config() {
164
+ // Base config with custom use_cache setting
165
+ let default_config = ExtractionConfig {
166
+ use_cache: false,
167
+ enable_quality_processing: true,
168
+ force_ocr: false,
169
+ ..Default::default()
170
+ };
171
+
172
+ // Override only force_ocr
173
+ let config_json = serde_json::json!({
174
+ "force_ocr": true
175
+ });
176
+
177
+ let config = build_config(&default_config, Some(config_json)).unwrap();
178
+
179
+ // use_cache should be preserved from default_config
180
+ assert!(!config.use_cache, "use_cache should be preserved from default config");
181
+ // enable_quality_processing should be preserved
182
+ assert!(
183
+ config.enable_quality_processing,
184
+ "enable_quality_processing should be preserved"
185
+ );
186
+ // force_ocr should be overridden
187
+ assert!(config.force_ocr, "force_ocr should be overridden to true");
188
+ }
189
+
190
+ #[test]
191
+ fn test_build_config_merges_nested_config() {
192
+ let default_config = ExtractionConfig {
193
+ use_cache: true,
194
+ ..Default::default()
195
+ };
196
+
197
+ // Override output format only
198
+ let config_json = serde_json::json!({
199
+ "output_format": "markdown"
200
+ });
201
+
202
+ let config = build_config(&default_config, Some(config_json)).unwrap();
203
+
204
+ // use_cache should be preserved
205
+ assert!(config.use_cache, "use_cache should be preserved from default config");
206
+ // output_format should be overridden
207
+ assert_eq!(
208
+ config.output_format,
209
+ crate::core::config::formats::OutputFormat::Markdown,
210
+ "output_format should be overridden to markdown"
211
+ );
212
+ }
213
+
214
+ #[test]
215
+ fn test_build_config_merges_with_custom_defaults() {
216
+ // Create a default config with custom values
217
+ let default_config = ExtractionConfig {
218
+ use_cache: false,
219
+ enable_quality_processing: true,
220
+ force_ocr: false,
221
+ ..Default::default()
222
+ };
223
+
224
+ // Provide partial override (only force_ocr)
225
+ let config_json = serde_json::json!({
226
+ "force_ocr": true,
227
+ });
228
+
229
+ let config = build_config(&default_config, Some(config_json)).unwrap();
230
+
231
+ // force_ocr should be overridden
232
+ assert!(config.force_ocr, "force_ocr should be overridden to true");
233
+ // use_cache should be preserved from default_config
234
+ assert!(
235
+ !config.use_cache,
236
+ "use_cache should be preserved from default config (false)"
237
+ );
238
+ // enable_quality_processing should be preserved
239
+ assert!(
240
+ config.enable_quality_processing,
241
+ "enable_quality_processing should be preserved (true)"
242
+ );
243
+ }
244
+
245
+ #[test]
246
+ fn test_build_config_merges_multiple_fields() {
247
+ let default_config = ExtractionConfig {
248
+ use_cache: true,
249
+ enable_quality_processing: false,
250
+ force_ocr: true,
251
+ ..Default::default()
252
+ };
253
+
254
+ // Override multiple fields
255
+ let config_json = serde_json::json!({
256
+ "use_cache": false,
257
+ "output_format": "markdown",
258
+ });
259
+
260
+ let config = build_config(&default_config, Some(config_json)).unwrap();
261
+
262
+ // use_cache should be overridden
263
+ assert!(!config.use_cache, "use_cache should be overridden to false");
264
+ // output_format should be overridden
265
+ assert_eq!(
266
+ config.output_format,
267
+ crate::core::config::formats::OutputFormat::Markdown,
268
+ "output_format should be overridden to markdown"
269
+ );
270
+ // force_ocr should be preserved (not in override)
271
+ assert!(
272
+ config.force_ocr,
273
+ "force_ocr should be preserved from default config (true)"
274
+ );
275
+ // enable_quality_processing should be preserved
276
+ assert!(
277
+ !config.enable_quality_processing,
278
+ "enable_quality_processing should be preserved (false)"
279
+ );
280
+ }
281
+
282
+ #[test]
283
+ fn test_build_config_boolean_override_to_default_value() {
284
+ // This test validates the critical bug fix: when user explicitly sets a boolean
285
+ // to its default value, the merge logic should correctly use the override value,
286
+ // not fall back to the base config.
287
+ let base = ExtractionConfig {
288
+ use_cache: false,
289
+ ..Default::default()
290
+ };
291
+
292
+ // User explicitly provides use_cache: true (which IS the default)
293
+ let override_json = serde_json::json!({"use_cache": true});
98
294
 
99
- let config = build_config(&default_config, true, false);
295
+ let merged = build_config(&base, Some(override_json)).unwrap();
100
296
 
101
- assert!(config.ocr.is_some());
102
- let ocr_config = config.ocr.unwrap();
103
- assert_eq!(ocr_config.backend, "tesseract");
104
- assert_eq!(ocr_config.language, "eng");
297
+ // Before the fix: merged.use_cache would be false (WRONG - fell back to base)
298
+ // After the fix: merged.use_cache should be true (CORRECT - override applied)
299
+ assert!(
300
+ merged.use_cache,
301
+ "Should use explicit override even if it matches default"
302
+ );
105
303
  }
106
304
 
107
305
  #[test]
@@ -12,12 +12,9 @@ pub struct ExtractFileParams {
12
12
  /// Optional MIME type hint (auto-detected if not provided)
13
13
  #[serde(skip_serializing_if = "Option::is_none")]
14
14
  pub mime_type: Option<String>,
15
- /// Enable OCR for scanned documents
16
- #[serde(default)]
17
- pub enable_ocr: bool,
18
- /// Force OCR even if text extraction succeeds
19
- #[serde(default)]
20
- pub force_ocr: bool,
15
+ /// Extraction configuration (JSON object)
16
+ #[serde(skip_serializing_if = "Option::is_none")]
17
+ pub config: Option<serde_json::Value>,
21
18
  /// Use async extraction (default: false for sync)
22
19
  #[serde(default)]
23
20
  pub r#async: bool,
@@ -31,12 +28,9 @@ pub struct ExtractBytesParams {
31
28
  /// Optional MIME type hint (auto-detected if not provided)
32
29
  #[serde(skip_serializing_if = "Option::is_none")]
33
30
  pub mime_type: Option<String>,
34
- /// Enable OCR for scanned documents
35
- #[serde(default)]
36
- pub enable_ocr: bool,
37
- /// Force OCR even if text extraction succeeds
38
- #[serde(default)]
39
- pub force_ocr: bool,
31
+ /// Extraction configuration (JSON object)
32
+ #[serde(skip_serializing_if = "Option::is_none")]
33
+ pub config: Option<serde_json::Value>,
40
34
  /// Use async extraction (default: false for sync)
41
35
  #[serde(default)]
42
36
  pub r#async: bool,
@@ -47,12 +41,9 @@ pub struct ExtractBytesParams {
47
41
  pub struct BatchExtractFilesParams {
48
42
  /// Paths to files to extract
49
43
  pub paths: Vec<String>,
50
- /// Enable OCR for scanned documents
51
- #[serde(default)]
52
- pub enable_ocr: bool,
53
- /// Force OCR even if text extraction succeeds
54
- #[serde(default)]
55
- pub force_ocr: bool,
44
+ /// Extraction configuration (JSON object)
45
+ #[serde(skip_serializing_if = "Option::is_none")]
46
+ pub config: Option<serde_json::Value>,
56
47
  /// Use async extraction (default: false for sync)
57
48
  #[serde(default)]
58
49
  pub r#async: bool,
@@ -83,8 +74,7 @@ mod tests {
83
74
 
84
75
  assert_eq!(params.path, "/test.pdf");
85
76
  assert_eq!(params.mime_type, None);
86
- assert!(!params.enable_ocr);
87
- assert!(!params.force_ocr);
77
+ assert_eq!(params.config, None);
88
78
  assert!(!params.r#async);
89
79
  }
90
80
 
@@ -95,8 +85,7 @@ mod tests {
95
85
 
96
86
  assert_eq!(params.data, "SGVsbG8=");
97
87
  assert_eq!(params.mime_type, None);
98
- assert!(!params.enable_ocr);
99
- assert!(!params.force_ocr);
88
+ assert_eq!(params.config, None);
100
89
  assert!(!params.r#async);
101
90
  }
102
91
 
@@ -106,8 +95,7 @@ mod tests {
106
95
  let params: BatchExtractFilesParams = serde_json::from_str(json).unwrap();
107
96
 
108
97
  assert_eq!(params.paths.len(), 2);
109
- assert!(!params.enable_ocr);
110
- assert!(!params.force_ocr);
98
+ assert_eq!(params.config, None);
111
99
  assert!(!params.r#async);
112
100
  }
113
101
 
@@ -128,13 +116,21 @@ mod tests {
128
116
  assert!(!params.use_content);
129
117
  }
130
118
 
119
+ #[test]
120
+ fn test_extract_file_params_with_config() {
121
+ let json = r#"{"path": "/test.pdf", "config": {"use_cache": false}}"#;
122
+ let params: ExtractFileParams = serde_json::from_str(json).unwrap();
123
+
124
+ assert_eq!(params.path, "/test.pdf");
125
+ assert!(params.config.is_some());
126
+ }
127
+
131
128
  #[test]
132
129
  fn test_extract_file_params_serialization() {
133
130
  let params = ExtractFileParams {
134
131
  path: "/test.pdf".to_string(),
135
132
  mime_type: Some("application/pdf".to_string()),
136
- enable_ocr: true,
137
- force_ocr: false,
133
+ config: Some(serde_json::json!({"use_cache": false})),
138
134
  r#async: true,
139
135
  };
140
136
 
@@ -143,8 +139,7 @@ mod tests {
143
139
 
144
140
  assert_eq!(params.path, deserialized.path);
145
141
  assert_eq!(params.mime_type, deserialized.mime_type);
146
- assert_eq!(params.enable_ocr, deserialized.enable_ocr);
147
- assert_eq!(params.force_ocr, deserialized.force_ocr);
142
+ assert_eq!(params.config, deserialized.config);
148
143
  assert_eq!(params.r#async, deserialized.r#async);
149
144
  }
150
145
 
@@ -153,8 +148,7 @@ mod tests {
153
148
  let params = ExtractBytesParams {
154
149
  data: "SGVsbG8=".to_string(),
155
150
  mime_type: None,
156
- enable_ocr: false,
157
- force_ocr: false,
151
+ config: None,
158
152
  r#async: false,
159
153
  };
160
154
 
@@ -168,8 +162,7 @@ mod tests {
168
162
  fn test_batch_extract_params_serialization() {
169
163
  let params = BatchExtractFilesParams {
170
164
  paths: vec!["/a.pdf".to_string(), "/b.pdf".to_string()],
171
- enable_ocr: true,
172
- force_ocr: true,
165
+ config: Some(serde_json::json!({"use_cache": true})),
173
166
  r#async: true,
174
167
  };
175
168
 
@@ -177,7 +170,7 @@ mod tests {
177
170
  let deserialized: BatchExtractFilesParams = serde_json::from_str(&json).unwrap();
178
171
 
179
172
  assert_eq!(params.paths, deserialized.paths);
180
- assert_eq!(params.enable_ocr, deserialized.enable_ocr);
173
+ assert_eq!(params.config, deserialized.config);
181
174
  }
182
175
 
183
176
  #[test]
@@ -80,7 +80,8 @@ impl KreuzbergMcp {
80
80
  use super::format::{build_config, format_extraction_result};
81
81
  use crate::{extract_file, extract_file_sync};
82
82
 
83
- let config = build_config(&self.default_config, params.enable_ocr, params.force_ocr);
83
+ let config =
84
+ build_config(&self.default_config, params.config).map_err(|e| rmcp::ErrorData::invalid_params(e, None))?;
84
85
 
85
86
  let result = if params.r#async {
86
87
  extract_file(&params.path, params.mime_type.as_deref(), &config)
@@ -114,7 +115,8 @@ impl KreuzbergMcp {
114
115
  .decode(&params.data)
115
116
  .map_err(|e| rmcp::ErrorData::invalid_params(format!("Invalid base64: {}", e), None))?;
116
117
 
117
- let config = build_config(&self.default_config, params.enable_ocr, params.force_ocr);
118
+ let config =
119
+ build_config(&self.default_config, params.config).map_err(|e| rmcp::ErrorData::invalid_params(e, None))?;
118
120
 
119
121
  let mime_type = params.mime_type.as_deref().unwrap_or("");
120
122
 
@@ -145,7 +147,8 @@ impl KreuzbergMcp {
145
147
  use super::format::{build_config, format_extraction_result};
146
148
  use crate::{batch_extract_file, batch_extract_file_sync};
147
149
 
148
- let config = build_config(&self.default_config, params.enable_ocr, params.force_ocr);
150
+ let config =
151
+ build_config(&self.default_config, params.config).map_err(|e| rmcp::ErrorData::invalid_params(e, None))?;
149
152
 
150
153
  let results = if params.r#async {
151
154
  batch_extract_file(params.paths.clone(), &config)
@@ -30,7 +30,8 @@ pub(in crate::mcp) trait ExtractionTool {
30
30
  &self,
31
31
  Parameters(params): Parameters<ExtractFileParams>,
32
32
  ) -> Result<CallToolResult, McpError> {
33
- let config = build_config(self.default_config(), params.enable_ocr, params.force_ocr);
33
+ let config = build_config(self.default_config(), params.config)
34
+ .map_err(|e| McpError::invalid_params(e, None))?;
34
35
 
35
36
  let result = if params.r#async {
36
37
  extract_file(&params.path, params.mime_type.as_deref(), &config)
@@ -59,7 +60,8 @@ pub(in crate::mcp) trait ExtractionTool {
59
60
  .decode(&params.data)
60
61
  .map_err(|e| McpError::invalid_params(format!("Invalid base64: {}", e), None))?;
61
62
 
62
- let config = build_config(self.default_config(), params.enable_ocr, params.force_ocr);
63
+ let config = build_config(self.default_config(), params.config)
64
+ .map_err(|e| McpError::invalid_params(e, None))?;
63
65
 
64
66
  let mime_type = params.mime_type.as_deref().unwrap_or("");
65
67
 
@@ -86,7 +88,8 @@ pub(in crate::mcp) trait ExtractionTool {
86
88
  &self,
87
89
  Parameters(params): Parameters<BatchExtractFilesParams>,
88
90
  ) -> Result<CallToolResult, McpError> {
89
- let config = build_config(self.default_config(), params.enable_ocr, params.force_ocr);
91
+ let config = build_config(self.default_config(), params.config)
92
+ .map_err(|e| McpError::invalid_params(e, None))?;
90
93
 
91
94
  let results = if params.r#async {
92
95
  batch_extract_file(params.paths.clone(), &config)
@@ -153,8 +156,7 @@ mod tests {
153
156
  let params = ExtractFileParams {
154
157
  path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
155
158
  mime_type: None,
156
- enable_ocr: false,
157
- force_ocr: false,
159
+ config: None,
158
160
  r#async: true,
159
161
  };
160
162
 
@@ -181,8 +183,7 @@ mod tests {
181
183
  let params = ExtractFileParams {
182
184
  path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
183
185
  mime_type: None,
184
- enable_ocr: false,
185
- force_ocr: false,
186
+ config: None,
186
187
  r#async: true,
187
188
  };
188
189
 
@@ -208,8 +209,7 @@ mod tests {
208
209
  let params = ExtractFileParams {
209
210
  path: "/nonexistent/file.pdf".to_string(),
210
211
  mime_type: None,
211
- enable_ocr: false,
212
- force_ocr: false,
212
+ config: None,
213
213
  r#async: true,
214
214
  };
215
215
 
@@ -226,8 +226,7 @@ mod tests {
226
226
  let params = ExtractFileParams {
227
227
  path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
228
228
  mime_type: Some("application/pdf".to_string()),
229
- enable_ocr: false,
230
- force_ocr: false,
229
+ config: None,
231
230
  r#async: true,
232
231
  };
233
232
 
@@ -246,8 +245,7 @@ mod tests {
246
245
  let params = ExtractBytesParams {
247
246
  data: encoded,
248
247
  mime_type: Some("text/plain".to_string()),
249
- enable_ocr: false,
250
- force_ocr: false,
248
+ config: None,
251
249
  r#async: true,
252
250
  };
253
251
 
@@ -274,8 +272,7 @@ mod tests {
274
272
  let params = ExtractBytesParams {
275
273
  data: "not-valid-base64!!!".to_string(),
276
274
  mime_type: None,
277
- enable_ocr: false,
278
- force_ocr: false,
275
+ config: None,
279
276
  r#async: true,
280
277
  };
281
278
 
@@ -292,8 +289,7 @@ mod tests {
292
289
  let server = TestMcpServer::new();
293
290
  let params = BatchExtractFilesParams {
294
291
  paths: vec![get_test_path("pdfs_with_tables/tiny.pdf").to_string()],
295
- enable_ocr: false,
296
- force_ocr: false,
292
+ config: None,
297
293
  r#async: true,
298
294
  };
299
295
 
@@ -319,8 +315,7 @@ mod tests {
319
315
  let server = TestMcpServer::new();
320
316
  let params = BatchExtractFilesParams {
321
317
  paths: vec![],
322
- enable_ocr: false,
323
- force_ocr: false,
318
+ config: None,
324
319
  r#async: true,
325
320
  };
326
321
 
@@ -350,8 +345,7 @@ mod tests {
350
345
  let params = ExtractFileParams {
351
346
  path: test_file.to_string(),
352
347
  mime_type: None,
353
- enable_ocr: false,
354
- force_ocr: false,
348
+ config: None,
355
349
  r#async: true,
356
350
  };
357
351
 
@@ -378,8 +372,7 @@ mod tests {
378
372
  if std::path::Path::new(&file1).exists() && std::path::Path::new(&file2).exists() {
379
373
  let params = BatchExtractFilesParams {
380
374
  paths: vec![file1.to_string(), file2.to_string()],
381
- enable_ocr: false,
382
- force_ocr: false,
375
+ config: None,
383
376
  r#async: true,
384
377
  };
385
378