kreuzberg 4.1.2 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
  5. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  6. data/kreuzberg.gemspec +13 -1
  7. data/lib/kreuzberg/cli.rb +16 -6
  8. data/lib/kreuzberg/cli_proxy.rb +3 -1
  9. data/lib/kreuzberg/config.rb +121 -39
  10. data/lib/kreuzberg/djot_content.rb +225 -0
  11. data/lib/kreuzberg/extraction_api.rb +20 -4
  12. data/lib/kreuzberg/result.rb +12 -2
  13. data/lib/kreuzberg/version.rb +1 -1
  14. data/lib/kreuzberg.rb +1 -0
  15. data/sig/kreuzberg.rbs +28 -12
  16. data/spec/binding/batch_operations_spec.rb +80 -0
  17. data/spec/binding/batch_spec.rb +6 -5
  18. data/spec/binding/error_recovery_spec.rb +3 -3
  19. data/spec/binding/metadata_types_spec.rb +77 -57
  20. data/spec/binding/tables_spec.rb +11 -2
  21. data/spec/serialization_spec.rb +134 -0
  22. data/spec/unit/config/output_format_spec.rb +380 -0
  23. data/vendor/Cargo.toml +1 -1
  24. data/vendor/kreuzberg/Cargo.toml +1 -1
  25. data/vendor/kreuzberg/README.md +1 -1
  26. data/vendor/kreuzberg/src/api/startup.rs +15 -1
  27. data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
  28. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
  29. data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
  30. data/vendor/kreuzberg/src/core/io.rs +7 -7
  31. data/vendor/kreuzberg/src/core/mime.rs +4 -4
  32. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  33. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
  34. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  35. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  36. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  37. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  38. data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
  39. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
  40. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
  41. data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
  42. data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
  43. data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
  44. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  45. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  46. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  47. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  48. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  49. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  50. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  51. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  52. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  53. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  54. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  55. data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
  56. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  57. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  58. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  59. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  60. data/vendor/kreuzberg/tests/core_integration.rs +57 -57
  61. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  62. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  63. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  64. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  65. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  66. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  67. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  68. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  69. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  70. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  71. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  72. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  73. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  74. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  75. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  76. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  77. data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
  78. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  79. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  80. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  81. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  82. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  83. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  84. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  85. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  86. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  87. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  88. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  89. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  90. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  91. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  92. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
  93. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  94. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  95. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  96. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  97. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  98. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  99. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  100. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  101. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  102. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  103. metadata +12 -2
@@ -0,0 +1,849 @@
1
+ //! MCP integration tests for API consistency and breaking changes.
2
+ //!
3
+ //! This test suite validates that:
4
+ //! 1. MCP parameters properly handle extraction configuration
5
+ //! 2. MCP parameter deserialization is consistent
6
+ //! 3. Various config combinations work correctly
7
+ //! 4. End-to-end MCP tool invocations work with real data
8
+ //! 5. Error handling is consistent across MCP tools
9
+ //!
10
+ //! Note: These tests verify the parameter structures used by MCP.
11
+ //! The build_config function in the MCP server should accept
12
+ //! a config JSON field instead of separate enable_ocr/force_ocr flags
13
+ //! to align with the new API consistency approach.
14
+
15
+ #![allow(clippy::bool_assert_comparison)]
16
+ #![allow(clippy::field_reassign_with_default)]
17
+
18
+ use serde_json::json;
19
+
20
/// Deserializes a flat JSON object directly into `ExtractionConfig` and checks
/// that every supplied option shows up on the resulting struct.
#[test]
fn test_extraction_config_parameter_structure() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // Options travel as one JSON document instead of as individual flags.
    let raw = json!({
        "use_cache": true,
        "force_ocr": true,
        "output_format": "markdown",
    });

    let parsed = serde_json::from_value::<ExtractionConfig>(raw).expect("Failed to parse config");

    assert!(parsed.use_cache);
    assert!(parsed.force_ocr);
    assert_eq!(parsed.output_format, OutputFormat::Markdown);
}
37
+
38
/// Pulls the nested `config` object out of an MCP-style request and
/// deserializes it as an `ExtractionConfig`.
#[test]
fn test_mcp_style_params_with_config() {
    use kreuzberg::core::config::ExtractionConfig;

    // Request shape: routing fields at the top level, options under `config`.
    let request = json!({
        "path": "/test.pdf",
        "mime_type": "application/pdf",
        "config": {
            "use_cache": false,
            "force_ocr": true,
            "output_format": "markdown",
        }
    });

    let raw_config = request.get("config").cloned().expect("Should have config field");
    let parsed = serde_json::from_value::<ExtractionConfig>(raw_config).expect("Failed to parse config");

    assert!(parsed.force_ocr);
    assert!(!parsed.use_cache);
}
59
+
60
/// A request carrying nothing but `path` must still expose that path.
#[test]
fn test_mcp_params_backward_compatibility_minimal() {
    let request = json!({
        "path": "/test.pdf",
    });

    let path_value = request.get("path").expect("Should have path");
    assert_eq!(path_value.as_str(), Some("/test.pdf"));
}
71
+
72
/// Parses the `config` object of a fully populated MCP request and verifies
/// every option that the request sets.
#[test]
fn test_mcp_params_with_all_fields() {
    // Complete MCP params with config
    let params = json!({
        "path": "/test.pdf",
        "mime_type": "application/pdf",
        "config": {
            "use_cache": true,
            "enable_quality_processing": true,
            "force_ocr": false,
            "output_format": "plain",
        }
    });

    // Fail loudly if the field is missing; a conditional `if let` would let
    // the test pass without running a single assertion.
    let config_obj = params.get("config").expect("Should have config field");
    let config: kreuzberg::core::config::ExtractionConfig =
        serde_json::from_value(config_obj.clone()).expect("Failed to parse");

    assert_eq!(config.use_cache, true);
    assert_eq!(config.enable_quality_processing, true);
    assert_eq!(config.force_ocr, false);
    assert_eq!(config.output_format, kreuzberg::core::config::OutputFormat::Plain);
}
96
+
97
/// Validates the shape of a batch request: an array of paths plus a shared
/// `config` object that deserializes into `ExtractionConfig`.
#[test]
fn test_batch_extraction_params_structure() {
    // Batch extraction params with paths and config
    let batch_params = json!({
        "paths": ["/file1.pdf", "/file2.pdf", "/file3.pdf"],
        "config": {
            "force_ocr": true,
            "max_concurrent_extractions": 4,
        }
    });

    let paths = batch_params.get("paths").expect("Should have paths");
    assert!(paths.is_array(), "paths field should be an array");
    let path_array = paths.as_array().expect("paths should be deserializable as array");
    assert_eq!(path_array.len(), 3, "paths array should contain exactly 3 elements");

    // `expect` rather than `if let`: a missing config must fail the test
    // instead of silently skipping the assertions below.
    let config_obj = batch_params.get("config").expect("Should have config");
    let config: kreuzberg::core::config::ExtractionConfig =
        serde_json::from_value(config_obj.clone()).expect("Failed to parse");
    assert_eq!(config.force_ocr, true);
    assert_eq!(config.max_concurrent_extractions, Some(4));
}
120
+
121
/// Checks how request-supplied options combine with defaults in four steps:
/// the default baseline, a single-field override, a multi-field override and
/// a config that sets every option used by this suite.
#[test]
fn test_config_merge_in_mcp_context() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    // Test 1: Verify default config baseline
    let default_config = ExtractionConfig::default();
    assert_eq!(default_config.use_cache, true, "Default cache should be enabled");
    assert_eq!(default_config.force_ocr, false, "Default force_ocr should be false");
    assert_eq!(
        default_config.output_format,
        OutputFormat::Plain,
        "Default output format should be Plain"
    );

    // Test 2: Request provides single field override - verify precedence
    let request_config_json = json!({
        "force_ocr": true,
    });
    let request_config: ExtractionConfig =
        serde_json::from_value(request_config_json).expect("Failed to parse request config");

    // Request config should override that field
    assert_eq!(request_config.force_ocr, true, "Request force_ocr should be true");

    // But unspecified fields should use defaults
    assert_eq!(
        request_config.use_cache, true,
        "Unspecified use_cache should default to true"
    );
    assert_eq!(
        request_config.output_format,
        OutputFormat::Plain,
        "Unspecified output_format should default to Plain"
    );

    // Test 3: Multiple field overrides - verify precedence chain
    let multi_override_json = json!({
        "use_cache": false,
        "force_ocr": true,
        "output_format": "markdown",
    });
    let multi_config: ExtractionConfig =
        serde_json::from_value(multi_override_json).expect("Failed to parse multi-field config");

    // All specified fields should override defaults
    assert_eq!(multi_config.use_cache, false, "Override use_cache should be false");
    assert_eq!(multi_config.force_ocr, true, "Override force_ocr should be true");
    assert_eq!(
        multi_config.output_format,
        OutputFormat::Markdown,
        "Override output_format should be Markdown"
    );

    // An optional field that the request omits must stay unset.
    assert!(
        multi_config.max_concurrent_extractions.is_none(),
        "max_concurrent_extractions should not be specified when not in request, got: {:?}",
        multi_config.max_concurrent_extractions
    );

    // Test 4: Verify config can be fully constructed with all fields
    let full_json = json!({
        "use_cache": false,
        "enable_quality_processing": true,
        "force_ocr": true,
        "output_format": "html",
        "max_concurrent_extractions": 8,
    });
    let full_config: ExtractionConfig = serde_json::from_value(full_json).expect("Failed to parse full config");

    assert_eq!(full_config.use_cache, false, "Full config use_cache should be false");
    assert_eq!(
        full_config.enable_quality_processing, true,
        "Full config quality processing should be true"
    );
    assert_eq!(full_config.force_ocr, true, "Full config force_ocr should be true");
    assert_eq!(
        full_config.output_format,
        OutputFormat::Html,
        "Full config output_format should be Html"
    );
    assert_eq!(
        full_config.max_concurrent_extractions,
        Some(8),
        "Full config max_concurrent should be 8"
    );
}
208
+
209
/// Confirms that `ExtractionConfig` deserializes from JSON objects carrying
/// any subset of its fields, including none at all.
#[test]
fn test_config_json_flexibility() {
    use kreuzberg::core::config::ExtractionConfig;

    // Config JSON can have any combination of fields
    let configs = [
        json!({}),                                                             // Empty = all defaults
        json!({"force_ocr": true}),                                            // Single field
        json!({"force_ocr": true, "use_cache": false}),                        // Multiple fields
        json!({"output_format": "markdown", "max_concurrent_extractions": 8}), // Various types
    ];

    for config_json in configs {
        // Report the offending input and the serde error so that a failure
        // identifies which combination broke and why.
        if let Err(err) = serde_json::from_value::<ExtractionConfig>(config_json.clone()) {
            panic!("Config should deserialize successfully, but {config_json} failed: {err}");
        }
    }
}
224
+
225
/// Serializes a customized `ExtractionConfig` to JSON and reads it back,
/// comparing the two fields that were changed from their defaults.
#[test]
fn test_extraction_config_serialization_for_mcp() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let original = ExtractionConfig {
        force_ocr: true,
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };

    let as_json = serde_json::to_value(&original).expect("Failed to serialize");

    // Round trip: JSON back into the struct.
    let round_tripped = serde_json::from_value::<ExtractionConfig>(as_json).expect("Failed to deserialize");

    assert_eq!(original.force_ocr, round_tripped.force_ocr);
    assert_eq!(original.output_format, round_tripped.output_format);
}
241
+
242
+ // ============================================================================
243
+ // E2E TEST CASES
244
+ // ============================================================================
245
+
246
/// Parses a config that sets five options at once and verifies each of them.
#[test]
fn test_mcp_config_full_extraction() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let raw = json!({
        "use_cache": false,
        "enable_quality_processing": true,
        "force_ocr": false,
        "output_format": "markdown",
        "max_concurrent_extractions": 4,
    });

    let parsed = serde_json::from_value::<ExtractionConfig>(raw).expect("Failed to parse full config");

    // Every field supplied above must come through unchanged.
    assert!(!parsed.use_cache);
    assert!(parsed.enable_quality_processing);
    assert!(!parsed.force_ocr);
    assert_eq!(parsed.output_format, OutputFormat::Markdown);
    assert_eq!(parsed.max_concurrent_extractions, Some(4));
}
267
+
268
/// `"output_format": "markdown"` must map to `OutputFormat::Markdown`.
#[test]
fn test_mcp_config_output_format_markdown() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let raw = json!({
        "output_format": "markdown",
    });

    let parsed = serde_json::from_value::<ExtractionConfig>(raw).expect("Failed to parse markdown config");

    assert_eq!(parsed.output_format, OutputFormat::Markdown);
}
280
+
281
/// Parses a config that combines markdown output with the cache and quality
/// processing flags.
///
/// NOTE(review): despite the test name, no result-format option is set here;
/// confirm whether an element-based setting was meant to be exercised.
#[test]
fn test_mcp_config_result_format_element_based() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let raw = json!({
        "output_format": "markdown",
        "use_cache": true,
        "enable_quality_processing": true,
    });

    let parsed = serde_json::from_value::<ExtractionConfig>(raw).expect("Failed to parse element format");

    assert_eq!(parsed.output_format, OutputFormat::Markdown);
    assert!(parsed.use_cache);
    assert!(parsed.enable_quality_processing);
}
297
+
298
/// Checks a batch request made of three paths and one shared config object.
#[test]
fn test_mcp_batch_with_config() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let request = json!({
        "paths": ["/file1.txt", "/file2.txt", "/file3.txt"],
        "config": {
            "force_ocr": true,
            "output_format": "plain",
            "max_concurrent_extractions": 2,
        }
    });

    // The paths field has to be a three-element array.
    let paths_value = request.get("paths").expect("Should have paths");
    assert!(paths_value.is_array(), "paths field should be an array");
    let entries = paths_value.as_array().expect("paths should be deserializable as array");
    assert_eq!(entries.len(), 3, "paths array should contain exactly 3 elements");

    // The single config object is the one shared by the whole batch.
    let raw_config = request.get("config").cloned().expect("Should have config");
    let parsed = serde_json::from_value::<ExtractionConfig>(raw_config).expect("Failed to parse batch config");

    assert!(parsed.force_ocr);
    assert_eq!(parsed.output_format, OutputFormat::Plain);
    assert_eq!(parsed.max_concurrent_extractions, Some(2));
}
325
+
326
/// Text that is not JSON at all must fail to parse as an `ExtractionConfig`.
#[test]
fn test_mcp_invalid_config_json_error() {
    use kreuzberg::core::config::ExtractionConfig;

    let outcome = serde_json::from_str::<ExtractionConfig>("not a valid json object");

    assert!(outcome.is_err(), "Invalid JSON should produce error");
}
335
+
336
/// Verifies that options supplied inline in an MCP request's `config` object
/// replace the corresponding default values.
#[test]
fn test_mcp_config_overrides() {
    // Simulate MCP request with inline config
    let mcp_params = json!({
        "path": "/document.pdf",
        "mime_type": "application/pdf",
        "config": {
            "force_ocr": true,
            "use_cache": false,
            "output_format": "markdown",
        }
    });

    // `expect` rather than `if let`: a missing config must fail the test
    // instead of silently skipping the assertions below.
    let config_obj = mcp_params.get("config").expect("Should have config");
    let parsed_config: kreuzberg::core::config::ExtractionConfig =
        serde_json::from_value(config_obj.clone()).expect("Failed to parse");

    // Verify request config overrides defaults
    assert_eq!(parsed_config.force_ocr, true);
    assert_eq!(parsed_config.use_cache, false);
    assert_eq!(
        parsed_config.output_format,
        kreuzberg::core::config::OutputFormat::Markdown
    );
}
363
+
364
/// Contrasts the deprecated request shape (top-level `enable_ocr` and
/// `force_ocr` flags) with the current one (options nested under `config`).
///
/// NOTE(review): this only inspects hand-built JSON. It does not demonstrate
/// that the MCP parameter types reject the deprecated flags; confirm that the
/// rejection itself is covered elsewhere.
#[test]
fn test_mcp_no_deprecated_params() {
    // This simulates MCP params that incorrectly use separate flags
    let deprecated_params = json!({
        "path": "/document.pdf",
        "enable_ocr": true,  // deprecated!
        "force_ocr": true,   // should be in config
    });

    // The correct approach: config field contains all options
    let correct_params = json!({
        "path": "/document.pdf",
        "config": {
            "force_ocr": true,
        }
    });

    // Fail loudly if the field is missing; a conditional `if let` would let
    // the test pass without checking anything.
    let config_obj = correct_params.get("config").expect("Should have config");
    let config: kreuzberg::core::config::ExtractionConfig =
        serde_json::from_value(config_obj.clone()).expect("Failed to parse");
    assert_eq!(config.force_ocr, true);

    // The deprecated shape carries no `config` object at all.
    assert!(
        deprecated_params.get("config").is_none(),
        "Deprecated params should not be in config"
    );
}
395
+
396
+ /// End-to-end test with real text extraction
397
+ #[tokio::test]
398
+ async fn test_mcp_real_pdf_extraction() {
399
+ // Create a simple test document in bytes
400
+ let test_content = b"Hello, MCP!";
401
+
402
+ // Create MCP request structure
403
+ let mcp_request = json!({
404
+ "mime_type": "text/plain",
405
+ "config": {
406
+ "output_format": "plain",
407
+ "use_cache": false,
408
+ }
409
+ });
410
+
411
+ // Extract config from request
412
+ if let Some(config_obj) = mcp_request.get("config") {
413
+ let config: kreuzberg::core::config::ExtractionConfig =
414
+ serde_json::from_value(config_obj.clone()).expect("Failed to parse config");
415
+
416
+ // Use async extract_bytes to process content
417
+ let result = kreuzberg::extract_bytes(test_content, "text/plain", &config)
418
+ .await
419
+ .expect("Extraction should succeed");
420
+
421
+ // Verify result has content
422
+ assert!(!result.content.is_empty());
423
+ assert!(result.content.contains("MCP") || result.content.contains("Hello"));
424
+ }
425
+ }
426
+
427
/// Validates a batch request whose `files` entries carry different MIME types
/// and share one `config` object.
#[test]
fn test_mcp_batch_mixed_formats() {
    let batch_config = json!({
        "files": [
            {
                "path": "/document.pdf",
                "mime_type": "application/pdf",
            },
            {
                "path": "/document.docx",
                "mime_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            },
            {
                "path": "/document.txt",
                "mime_type": "text/plain",
            }
        ],
        "config": {
            "output_format": "markdown",
            "force_ocr": false,
        }
    });

    let files = batch_config.get("files").expect("Should have files");
    assert!(files.is_array(), "files field should be an array");
    let file_array = files.as_array().expect("files should be deserializable as array");
    assert_eq!(file_array.len(), 3, "files array should contain exactly 3 elements");

    // `expect` rather than `if let`: a missing config must fail the test
    // instead of silently skipping the assertions below.
    let config_obj = batch_config.get("config").expect("Should have config");
    let config: kreuzberg::core::config::ExtractionConfig =
        serde_json::from_value(config_obj.clone()).expect("Failed to parse batch config");
    assert_eq!(config.output_format, kreuzberg::core::config::OutputFormat::Markdown);
    assert_eq!(config.force_ocr, false);
}
463
+
464
/// A request without a `config` object falls back to
/// `ExtractionConfig::default()`.
#[test]
fn test_mcp_minimal_config() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let request = json!({
        "path": "/document.pdf",
    });

    // The path must be present as exactly this string.
    assert_eq!(
        request.get("path").and_then(|value| value.as_str()),
        Some("/document.pdf"),
        "Path field should be present and set to /document.pdf"
    );

    // Parse the config when one is supplied; otherwise take the defaults.
    let effective: ExtractionConfig = request.get("config").map_or_else(ExtractionConfig::default, |raw| {
        serde_json::from_value(raw.clone()).expect("Failed to parse config from minimal request")
    });

    assert!(effective.use_cache);
    assert_eq!(effective.output_format, OutputFormat::Plain);
}
490
+
491
/// Each listed output format name must parse and then display as the same
/// string it was parsed from.
#[test]
fn test_mcp_all_output_formats() {
    use kreuzberg::core::config::ExtractionConfig;

    for name in ["plain", "markdown", "html"] {
        let raw = json!({
            "output_format": name,
        });

        let parsed = serde_json::from_value::<ExtractionConfig>(raw).expect("Failed to parse output format config");

        // The Display rendering must match the input name.
        assert_eq!(parsed.output_format.to_string(), name);
    }
}
509
+
510
/// Several `max_concurrent_extractions` values must survive deserialization
/// unchanged.
#[test]
fn test_mcp_concurrent_extraction_config() {
    use kreuzberg::core::config::ExtractionConfig;

    for limit in [1, 2, 4, 8, 16] {
        let raw = json!({
            "max_concurrent_extractions": limit,
        });

        let parsed = serde_json::from_value::<ExtractionConfig>(raw).expect("Failed to parse concurrent config");

        assert_eq!(parsed.max_concurrent_extractions, Some(limit));
    }
}
526
+
527
/// Disabling the cache while forcing OCR must set both flags on the parsed
/// config.
#[test]
fn test_mcp_cache_disabled_config() {
    use kreuzberg::core::config::ExtractionConfig;

    let raw = json!({
        "use_cache": false,
        "force_ocr": true,
    });

    let parsed = serde_json::from_value::<ExtractionConfig>(raw).expect("Failed to parse cache config");

    assert!(!parsed.use_cache);
    assert!(parsed.force_ocr);
}
541
+
542
/// Builds a config with five non-default options, serializes it to JSON,
/// reads it back and compares each of those options.
#[test]
fn test_mcp_config_round_trip_serialization() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let before = ExtractionConfig {
        use_cache: false,
        enable_quality_processing: true,
        force_ocr: true,
        output_format: OutputFormat::Markdown,
        max_concurrent_extractions: Some(4),
        ..Default::default()
    };

    // Struct -> JSON -> struct.
    let encoded = serde_json::to_value(&before).expect("Failed to serialize");
    let after = serde_json::from_value::<ExtractionConfig>(encoded).expect("Failed to deserialize");

    // Field-by-field comparison of everything that was customized.
    assert_eq!(before.use_cache, after.use_cache);
    assert_eq!(before.enable_quality_processing, after.enable_quality_processing);
    assert_eq!(before.force_ocr, after.force_ocr);
    assert_eq!(before.output_format, after.output_format);
    assert_eq!(before.max_concurrent_extractions, after.max_concurrent_extractions);
}
574
+
575
+ /// Test MCP tool invocation with extract_bytes semantics
576
+ #[tokio::test]
577
+ async fn test_mcp_tool_extract_bytes_semantics() {
578
+ let test_bytes = b"Test content for MCP extraction";
579
+ let mime_type = "text/plain";
580
+
581
+ let config_json = json!({
582
+ "output_format": "plain",
583
+ });
584
+
585
+ let config: kreuzberg::core::config::ExtractionConfig =
586
+ serde_json::from_value(config_json).expect("Failed to parse config");
587
+
588
+ // Simulate MCP tool: extract_bytes
589
+ let result = kreuzberg::extract_bytes(test_bytes, mime_type, &config)
590
+ .await
591
+ .expect("Extraction should succeed");
592
+
593
+ assert!(!result.content.is_empty());
594
+ assert!(result.mime_type.contains("text"));
595
+ }
596
+
597
/// Writes a small text file into a temporary directory and extracts it with
/// `kreuzberg::extract_file_sync`, using a config parsed from JSON.
#[test]
fn test_mcp_tool_extract_file_semantics() {
    // Create temporary test file
    let test_dir = tempfile::tempdir().expect("Failed to create temp dir");
    let test_file = test_dir.path().join("test.txt");
    std::fs::write(&test_file, b"Test content").expect("Failed to write test file");

    let config_json = json!({
        "output_format": "plain",
    });

    let config: kreuzberg::core::config::ExtractionConfig =
        serde_json::from_value(config_json).expect("Failed to parse config");

    // The write above succeeded, so the file must exist. Assert it instead of
    // guarding on it, so the extraction can never be skipped silently.
    assert!(test_file.exists(), "Test file should exist after being written");

    // Simulate MCP tool: extract_file (sync)
    let file_path = test_file.to_str().expect("test_file path should be valid UTF-8");
    let result = kreuzberg::extract_file_sync(file_path, None, &config).expect("Extraction should succeed");

    assert!(!result.content.is_empty());
}
620
+
621
+ /// Test MCP batch extraction semantics
622
+ #[tokio::test]
623
+ async fn test_mcp_batch_extraction_semantics() {
624
+ let test_bytes_1 = b"Content 1";
625
+ let test_bytes_2 = b"Content 2";
626
+ let mime_type = "text/plain";
627
+
628
+ let config_json = json!({
629
+ "output_format": "plain",
630
+ });
631
+
632
+ let config: kreuzberg::core::config::ExtractionConfig =
633
+ serde_json::from_value(config_json).expect("Failed to parse config");
634
+
635
+ // Simulate MCP batch tool: batch_extract_bytes
636
+ let test_data = vec![
637
+ (test_bytes_1.to_vec(), mime_type.to_string()),
638
+ (test_bytes_2.to_vec(), mime_type.to_string()),
639
+ ];
640
+
641
+ // Extract each item
642
+ for (bytes, mime) in test_data {
643
+ let result = kreuzberg::extract_bytes(&bytes, &mime, &config)
644
+ .await
645
+ .expect("Batch extraction should succeed");
646
+ assert!(!result.content.is_empty());
647
+ }
648
+ }
649
+
650
/// An unknown `output_format` value must be rejected at deserialization time.
#[test]
fn test_mcp_error_invalid_format_field() {
    use kreuzberg::core::config::ExtractionConfig;

    let raw = json!({
        "output_format": "invalid_format_that_does_not_exist",
    });

    let outcome = serde_json::from_value::<ExtractionConfig>(raw);

    // Deserialization itself is expected to fail.
    assert!(outcome.is_err());
}
662
+
663
/// A zero `max_concurrent_extractions` passes deserialization unchanged; any
/// semantic validation is left to a later layer.
#[test]
fn test_mcp_validate_zero_concurrent() {
    use kreuzberg::core::config::ExtractionConfig;

    // serde accepts zero here.
    let raw = json!({
        "max_concurrent_extractions": 0,
    });

    let parsed = serde_json::from_value::<ExtractionConfig>(raw).expect("Failed to parse");

    // The value is stored as given.
    assert_eq!(parsed.max_concurrent_extractions, Some(0));
}
677
+
678
/// A batch request may carry an empty `paths` array.
#[test]
fn test_mcp_empty_batch_handling() {
    let request = json!({
        "paths": [],
        "config": {
            "output_format": "plain",
        }
    });

    let paths_value = request.get("paths").expect("Should have paths");
    assert!(paths_value.is_array(), "paths field should be an array");

    let entries = paths_value.as_array().expect("paths should be deserializable as array");
    assert!(entries.is_empty(), "paths array should be empty");
}
693
+
694
/// Extracts a config nested two levels deep (`parameters.config`) from a
/// tool-call style request and verifies the parsed options.
#[test]
fn test_mcp_nested_config_extraction() {
    let nested_request = json!({
        "tool": "extract_file",
        "parameters": {
            "path": "/document.pdf",
            "config": {
                "output_format": "markdown",
                "force_ocr": true,
            }
        }
    });

    // Walk the nesting with `expect` so that a missing level fails the test
    // instead of silently skipping every assertion.
    let config_obj = nested_request
        .get("parameters")
        .expect("Should have parameters")
        .get("config")
        .expect("Should have config");

    let config: kreuzberg::core::config::ExtractionConfig =
        serde_json::from_value(config_obj.clone()).expect("Failed to parse nested config");

    assert_eq!(config.output_format, kreuzberg::core::config::OutputFormat::Markdown);
    assert_eq!(config.force_ocr, true);
}
718
+
719
/// Test MCP HTML output format
#[test]
fn test_mcp_html_output_format() {
    use kreuzberg::core::config::{ExtractionConfig, OutputFormat};

    let html_request = json!({
        "output_format": "html",
    });

    let parsed: ExtractionConfig = serde_json::from_value(html_request).expect("Failed to parse HTML config");

    // The lowercase string "html" must map to the `Html` variant.
    assert_eq!(parsed.output_format, OutputFormat::Html);
}
731
+
732
/// Test MCP config with all boolean combinations
#[test]
fn test_mcp_boolean_combinations() {
    // The nested loops visit the pairs in the order
    // (true, true), (true, false), (false, true), (false, false).
    for cache_flag in [true, false] {
        for quality_flag in [true, false] {
            let request = json!({
                "use_cache": cache_flag,
                "enable_quality_processing": quality_flag,
            });

            let parsed = serde_json::from_value::<kreuzberg::core::config::ExtractionConfig>(request)
                .expect("Failed to parse config");

            // Each flag must come back exactly as it was sent.
            assert_eq!(parsed.use_cache, cache_flag);
            assert_eq!(parsed.enable_quality_processing, quality_flag);
        }
    }
}
750
+
751
/// Test MCP response structure with extraction result
#[test]
fn test_mcp_response_structure_validation() {
    // Hand-built response value; no server is involved in this test.
    let response = json!({
        "status": "success",
        "data": {
            "content": "Extracted text",
            "mime_type": "text/plain",
            "metadata": {
                "source": "test",
                "extracted_at": "2024-01-25",
            }
        }
    });

    let status = response.get("status").expect("status field should exist");
    assert_eq!(status, "success");

    let has_data = response.get("data").is_some();
    assert!(has_data, "data field should be present in MCP response");
}
775
+
776
/// Test MCP request/response roundtrip with config
///
/// Deserializes a JSON config, serializes the result back to JSON, and checks
/// that every field supplied in the request comes back with the same value.
#[test]
fn test_mcp_request_response_roundtrip() {
    let original_config = json!({
        "use_cache": false,
        "force_ocr": true,
        "output_format": "markdown",
        "max_concurrent_extractions": 4,
    });

    // Simulate sending to MCP and getting back
    let config: kreuzberg::core::config::ExtractionConfig =
        serde_json::from_value(original_config.clone()).expect("Failed to parse");

    // Serialize back
    let response_config = serde_json::to_value(&config).expect("Failed to serialize");

    // Compare every requested field, so that adding a key to the request
    // above can never leave it silently unchecked.
    let requested_fields = original_config
        .as_object()
        .expect("request config should be a JSON object");
    for (field, expected) in requested_fields {
        assert_eq!(
            response_config.get(field.as_str()),
            Some(expected),
            "field `{field}` changed during roundtrip"
        );
    }
}
801
+
802
/// Test MCP config with partial updates
///
/// A partial JSON payload is deserialized into a complete config of its own;
/// this test models an update as a replacement of the previous config, not a
/// merge into it.
#[test]
fn test_mcp_config_partial_updates() {
    // Give the base config a `use_cache` value that differs from the one the
    // replacement is asserted to carry below; otherwise a merge and a
    // replacement would be indistinguishable.
    let mut base_config = kreuzberg::core::config::ExtractionConfig::default();
    base_config.use_cache = false;
    base_config.force_ocr = false;

    // Partial update
    let update_json = json!({
        "force_ocr": true,
    });

    let update_config: kreuzberg::core::config::ExtractionConfig =
        serde_json::from_value(update_json).expect("Failed to parse update");

    // The update replaces the config completely
    let updated = update_config;

    // New config has update applied
    assert!(updated.force_ocr);
    // Fields absent from the update fall back to their defaults...
    assert!(updated.use_cache);
    // ...and do not inherit the value held by the base config.
    assert_ne!(updated.use_cache, base_config.use_cache);
}
825
+
826
/// Test MCP API consistency for all formats
#[test]
fn test_mcp_api_consistency_all_formats() {
    use kreuzberg::core::config::ExtractionConfig;

    for format_name in ["plain", "markdown", "html"] {
        let request = json!({
            "output_format": format_name,
        });

        let first_pass: ExtractionConfig = serde_json::from_value(request).expect("Failed to parse");

        // Push the parsed config through JSON once more and parse it again.
        let as_json = serde_json::to_value(&first_pass).expect("Failed to serialize");
        let second_pass: ExtractionConfig = serde_json::from_value(as_json).expect("Failed to deserialize");

        // The displayed format name must survive the extra round trip.
        assert_eq!(
            first_pass.output_format.to_string(),
            second_pass.output_format.to_string()
        );
    }
}