kreuzberg 4.1.2 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
  5. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  6. data/kreuzberg.gemspec +13 -1
  7. data/lib/kreuzberg/cli.rb +16 -6
  8. data/lib/kreuzberg/cli_proxy.rb +3 -1
  9. data/lib/kreuzberg/config.rb +121 -39
  10. data/lib/kreuzberg/djot_content.rb +225 -0
  11. data/lib/kreuzberg/extraction_api.rb +20 -4
  12. data/lib/kreuzberg/result.rb +12 -2
  13. data/lib/kreuzberg/version.rb +1 -1
  14. data/lib/kreuzberg.rb +1 -0
  15. data/sig/kreuzberg.rbs +28 -12
  16. data/spec/binding/batch_operations_spec.rb +80 -0
  17. data/spec/binding/batch_spec.rb +6 -5
  18. data/spec/binding/error_recovery_spec.rb +3 -3
  19. data/spec/binding/metadata_types_spec.rb +77 -57
  20. data/spec/binding/tables_spec.rb +11 -2
  21. data/spec/serialization_spec.rb +134 -0
  22. data/spec/unit/config/output_format_spec.rb +380 -0
  23. data/vendor/Cargo.toml +1 -1
  24. data/vendor/kreuzberg/Cargo.toml +1 -1
  25. data/vendor/kreuzberg/README.md +1 -1
  26. data/vendor/kreuzberg/src/api/startup.rs +15 -1
  27. data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
  28. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
  29. data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
  30. data/vendor/kreuzberg/src/core/io.rs +7 -7
  31. data/vendor/kreuzberg/src/core/mime.rs +4 -4
  32. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  33. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
  34. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  35. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  36. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  37. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  38. data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
  39. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
  40. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
  41. data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
  42. data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
  43. data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
  44. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  45. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  46. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  47. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  48. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  49. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  50. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  51. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  52. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  53. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  54. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  55. data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
  56. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  57. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  58. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  59. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  60. data/vendor/kreuzberg/tests/core_integration.rs +57 -57
  61. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  62. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  63. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  64. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  65. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  66. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  67. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  68. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  69. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  70. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  71. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  72. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  73. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  74. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  75. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  76. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  77. data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
  78. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  79. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  80. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  81. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  82. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  83. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  84. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  85. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  86. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  87. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  88. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  89. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  90. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  91. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  92. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
  93. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  94. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  95. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  96. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  97. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  98. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  99. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  100. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  101. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  102. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  103. metadata +12 -2
@@ -0,0 +1,314 @@
1
+ //! MCP contract tests - verify MCP config matches Rust core
2
+ //!
3
+ //! This test suite validates that MCP (Model Context Protocol) configuration
4
+ //! produces identical JSON to the Rust core library when parsing configuration.
5
+ //! This ensures that MCP users get the same configuration behavior as CLI and SDK users.
6
+
7
+ use kreuzberg::core::config::ExtractionConfig;
8
+ use kreuzberg::core::config::OutputFormat;
9
+ use serde_json::json;
10
+
11
+ #[test]
12
+ fn test_mcp_basic_config_json_matches_rust_core() {
13
+ // Create config via Rust core
14
+ let rust_config = ExtractionConfig {
15
+ use_cache: true,
16
+ enable_quality_processing: true,
17
+ force_ocr: false,
18
+ output_format: OutputFormat::Plain,
19
+ result_format: kreuzberg::types::OutputFormat::Unified,
20
+ ..Default::default()
21
+ };
22
+ let rust_json = serde_json::to_value(&rust_config).expect("Failed to serialize rust config");
23
+
24
+ // Simulate MCP config parameter deserialization
25
+ let mcp_json = json!({
26
+ "use_cache": true,
27
+ "enable_quality_processing": true,
28
+ "force_ocr": false,
29
+ "output_format": "plain",
30
+ "result_format": "unified"
31
+ });
32
+ let mcp_config: ExtractionConfig =
33
+ serde_json::from_value(mcp_json.clone()).expect("Failed to deserialize MCP config");
34
+ let mcp_serialized = serde_json::to_value(&mcp_config).expect("Failed to serialize MCP config");
35
+
36
+ // Verify they produce identical JSON for the relevant fields
37
+ assert_eq!(
38
+ rust_json.get("use_cache"),
39
+ mcp_serialized.get("use_cache"),
40
+ "MCP use_cache must match Rust core"
41
+ );
42
+ assert_eq!(
43
+ rust_json.get("enable_quality_processing"),
44
+ mcp_serialized.get("enable_quality_processing"),
45
+ "MCP enable_quality_processing must match Rust core"
46
+ );
47
+ assert_eq!(
48
+ rust_json.get("force_ocr"),
49
+ mcp_serialized.get("force_ocr"),
50
+ "MCP force_ocr must match Rust core"
51
+ );
52
+ assert_eq!(
53
+ rust_json.get("output_format"),
54
+ mcp_serialized.get("output_format"),
55
+ "MCP output_format must match Rust core"
56
+ );
57
+ }
58
+
59
+ #[test]
60
+ fn test_mcp_ocr_config_nested_matches_rust_core() {
61
+ let mcp_json = json!({
62
+ "ocr": {
63
+ "backend": "tesseract"
64
+ },
65
+ "force_ocr": true
66
+ });
67
+
68
+ let config: ExtractionConfig = serde_json::from_value(mcp_json).expect("Failed to deserialize OCR config");
69
+
70
+ // Verify OCR config deserialized correctly
71
+ assert!(config.ocr.is_some(), "OCR config should be present");
72
+ assert!(config.force_ocr, "force_ocr should be true");
73
+
74
+ if let Some(ocr) = &config.ocr {
75
+ assert_eq!(ocr.backend, "tesseract", "OCR backend should be tesseract");
76
+ }
77
+
78
+ // Verify roundtrip
79
+ let serialized = serde_json::to_value(&config).expect("Failed to serialize");
80
+ assert!(serialized.get("ocr").is_some(), "Serialized config should include ocr");
81
+ }
82
+
83
+ #[test]
84
+ fn test_mcp_chunking_config_nested_matches_rust_core() {
85
+ let mcp_json = json!({
86
+ "chunking": {
87
+ "max_chars": 500,
88
+ "max_overlap": 50,
89
+ "strategy": "sliding_window"
90
+ }
91
+ });
92
+
93
+ let config: ExtractionConfig = serde_json::from_value(mcp_json).expect("Failed to deserialize chunking config");
94
+
95
+ // Verify chunking config deserialized correctly
96
+ assert!(config.chunking.is_some(), "Chunking config should be present");
97
+
98
+ if let Some(chunking) = &config.chunking {
99
+ assert_eq!(chunking.max_chars, 500, "max_chars should be 500");
100
+ assert_eq!(chunking.max_overlap, 50, "max_overlap should be 50");
101
+ }
102
+
103
+ // Verify roundtrip
104
+ let serialized = serde_json::to_value(&config).expect("Failed to serialize");
105
+ assert!(
106
+ serialized.get("chunking").is_some(),
107
+ "Serialized config should include chunking"
108
+ );
109
+ }
110
+
111
+ #[test]
112
+ fn test_mcp_full_config_preserves_all_fields() {
113
+ let full_config_json = json!({
114
+ "use_cache": false,
115
+ "enable_quality_processing": true,
116
+ "force_ocr": true,
117
+ "output_format": "markdown",
118
+ "result_format": "unified",
119
+ "max_concurrent_extractions": 8,
120
+ "ocr": {
121
+ "backend": "tesseract"
122
+ },
123
+ "chunking": {
124
+ "max_chars": 1000,
125
+ "max_overlap": 200
126
+ }
127
+ });
128
+
129
+ let config: ExtractionConfig =
130
+ serde_json::from_value(full_config_json.clone()).expect("Failed to deserialize full config");
131
+ let roundtrip_json = serde_json::to_value(&config).expect("Failed to serialize");
132
+
133
+ // Verify all top-level fields preserved
134
+ assert!(!config.use_cache, "use_cache should be false");
135
+ assert!(
136
+ config.enable_quality_processing,
137
+ "enable_quality_processing should be true"
138
+ );
139
+ assert!(config.force_ocr, "force_ocr should be true");
140
+ assert_eq!(
141
+ config.max_concurrent_extractions,
142
+ Some(8),
143
+ "max_concurrent_extractions should be 8"
144
+ );
145
+
146
+ // Verify nested fields preserved
147
+ assert!(config.ocr.is_some(), "OCR config should be present");
148
+ assert!(config.chunking.is_some(), "Chunking config should be present");
149
+
150
+ // Verify roundtrip integrity
151
+ assert_eq!(
152
+ roundtrip_json.get("use_cache"),
153
+ full_config_json.get("use_cache"),
154
+ "use_cache should survive roundtrip"
155
+ );
156
+ assert_eq!(
157
+ roundtrip_json.get("force_ocr"),
158
+ full_config_json.get("force_ocr"),
159
+ "force_ocr should survive roundtrip"
160
+ );
161
+ assert_eq!(
162
+ roundtrip_json.get("max_concurrent_extractions"),
163
+ full_config_json.get("max_concurrent_extractions"),
164
+ "max_concurrent_extractions should survive roundtrip"
165
+ );
166
+ }
167
+
168
+ #[test]
169
+ fn test_mcp_default_config_matches_rust_core_defaults() {
170
+ // Create Rust core default
171
+ let rust_default = ExtractionConfig::default();
172
+ let rust_json = serde_json::to_value(&rust_default).expect("Failed to serialize default");
173
+
174
+ // Create empty JSON (simulates MCP with no overrides)
175
+ let mcp_json = json!({});
176
+ let mcp_config: ExtractionConfig = serde_json::from_value(mcp_json).expect("Failed to deserialize empty config");
177
+ let mcp_json_serialized = serde_json::to_value(&mcp_config).expect("Failed to serialize MCP default");
178
+
179
+ // Verify defaults match
180
+ assert_eq!(
181
+ mcp_json_serialized.get("use_cache"),
182
+ rust_json.get("use_cache"),
183
+ "use_cache default should match"
184
+ );
185
+ assert_eq!(
186
+ mcp_json_serialized.get("enable_quality_processing"),
187
+ rust_json.get("enable_quality_processing"),
188
+ "enable_quality_processing default should match"
189
+ );
190
+ assert_eq!(
191
+ mcp_json_serialized.get("force_ocr"),
192
+ rust_json.get("force_ocr"),
193
+ "force_ocr default should match"
194
+ );
195
+ assert_eq!(
196
+ mcp_json_serialized.get("result_format"),
197
+ rust_json.get("result_format"),
198
+ "result_format default should match"
199
+ );
200
+ assert_eq!(
201
+ mcp_json_serialized.get("output_format"),
202
+ rust_json.get("output_format"),
203
+ "output_format default should match"
204
+ );
205
+ }
206
+
207
+ #[test]
208
+ fn test_mcp_output_format_values_are_valid() {
209
+ // Test all valid output format values (lowercase, as per serde rename_all)
210
+ let valid_formats = vec!["plain", "markdown", "html"];
211
+
212
+ for format in valid_formats {
213
+ let mcp_json = json!({
214
+ "output_format": format
215
+ });
216
+
217
+ let result = serde_json::from_value::<ExtractionConfig>(mcp_json);
218
+ assert!(result.is_ok(), "Format '{}' should deserialize successfully", format);
219
+
220
+ let config = result.unwrap();
221
+ assert!(
222
+ !config.output_format.to_string().is_empty(),
223
+ "Deserialized format should have valid string representation"
224
+ );
225
+ }
226
+ }
227
+
228
+ #[test]
229
+ fn test_mcp_result_format_values_are_valid() {
230
+ // Test valid result format values (lowercase, as per serde rename_all)
231
+ let valid_formats = vec!["unified", "element_based"];
232
+
233
+ for format in valid_formats {
234
+ let mcp_json = json!({
235
+ "result_format": format
236
+ });
237
+
238
+ let result = serde_json::from_value::<ExtractionConfig>(mcp_json);
239
+ assert!(
240
+ result.is_ok(),
241
+ "Result format '{}' should deserialize successfully",
242
+ format
243
+ );
244
+ }
245
+ }
246
+
247
+ #[test]
248
+ fn test_mcp_partial_override_preserves_defaults() {
249
+ // Create a partial config that overrides only one field
250
+ let partial_json = json!({
251
+ "force_ocr": true
252
+ });
253
+
254
+ let config: ExtractionConfig = serde_json::from_value(partial_json).expect("Failed to deserialize partial config");
255
+
256
+ // Verify override applied
257
+ assert!(config.force_ocr, "force_ocr override should be applied");
258
+
259
+ // Verify defaults preserved for other fields
260
+ assert!(config.use_cache, "use_cache should retain default when not overridden");
261
+ assert!(
262
+ config.enable_quality_processing,
263
+ "enable_quality_processing should retain default when not overridden"
264
+ );
265
+ }
266
+
267
+ #[test]
268
+ fn test_mcp_error_handling_for_invalid_json() {
269
+ // Test that invalid format values produce errors (or are handled gracefully)
270
+ let invalid_json = json!({
271
+ "output_format": "InvalidFormat"
272
+ });
273
+
274
+ let result = serde_json::from_value::<ExtractionConfig>(invalid_json);
275
+ // The deserialization should either fail or parse to a valid state
276
+ // depending on how OutputFormat handles unknown values
277
+ if let Ok(config) = result {
278
+ let _ = config.output_format.to_string();
279
+ }
280
+ }
281
+
282
+ #[test]
283
+ fn test_mcp_concurrent_extractions_override() {
284
+ let mcp_json = json!({
285
+ "max_concurrent_extractions": 16
286
+ });
287
+
288
+ let config: ExtractionConfig =
289
+ serde_json::from_value(mcp_json).expect("Failed to deserialize config with concurrent extractions");
290
+
291
+ assert_eq!(
292
+ config.max_concurrent_extractions,
293
+ Some(16),
294
+ "max_concurrent_extractions should be overridden to 16"
295
+ );
296
+ }
297
+
298
+ #[test]
299
+ fn test_mcp_config_json_keys_case_sensitive() {
300
+ // Verify that config JSON keys are case-sensitive
301
+ let lowercase_json = json!({
302
+ "use_cache": true,
303
+ "force_ocr": false
304
+ });
305
+
306
+ let config: ExtractionConfig =
307
+ serde_json::from_value(lowercase_json).expect("Failed to deserialize lowercase config");
308
+
309
+ assert!(config.use_cache, "use_cache should be true");
310
+ assert!(!config.force_ocr, "force_ocr should be false");
311
+
312
+ // Note: serde by default fails on unknown fields, so camelCase would fail
313
+ // This test documents the expected behavior
314
+ }
@@ -26,16 +26,16 @@ fn assert_text_content(actual: &str, expected: &str) {
26
26
  /// Test basic file extraction with MIME detection.
27
27
  #[tokio::test]
28
28
  async fn test_extract_file_basic() {
29
- let dir = tempdir().unwrap();
29
+ let dir = tempdir().expect("Operation failed");
30
30
  let file_path = dir.path().join("test.txt");
31
- let mut file = File::create(&file_path).unwrap();
32
- file.write_all(b"Hello, Kreuzberg!").unwrap();
31
+ let mut file = File::create(&file_path).expect("Operation failed");
32
+ file.write_all(b"Hello, Kreuzberg!").expect("Operation failed");
33
33
 
34
34
  let config = ExtractionConfig::default();
35
35
  let result = extract_file(&file_path, None, &config).await;
36
36
 
37
37
  assert!(result.is_ok(), "Basic file extraction should succeed");
38
- let result = result.unwrap();
38
+ let result = result.expect("Operation failed");
39
39
 
40
40
  assert_text_content(&result.content, "Hello, Kreuzberg!");
41
41
  assert_eq!(result.mime_type, "text/plain");
@@ -47,16 +47,16 @@ async fn test_extract_file_basic() {
47
47
  /// Test extraction with explicit MIME type override.
48
48
  #[tokio::test]
49
49
  async fn test_extract_file_with_mime_override() {
50
- let dir = tempdir().unwrap();
50
+ let dir = tempdir().expect("Operation failed");
51
51
  let file_path = dir.path().join("data.bin");
52
- let mut file = File::create(&file_path).unwrap();
53
- file.write_all(b"Binary content").unwrap();
52
+ let mut file = File::create(&file_path).expect("Operation failed");
53
+ file.write_all(b"Binary content").expect("Operation failed");
54
54
 
55
55
  let config = ExtractionConfig::default();
56
56
  let result = extract_file(&file_path, Some("text/plain"), &config).await;
57
57
 
58
58
  assert!(result.is_ok(), "MIME override should work");
59
- let result = result.unwrap();
59
+ let result = result.expect("Operation failed");
60
60
 
61
61
  assert_eq!(result.mime_type, "text/plain");
62
62
  assert!(!result.content.is_empty(), "Should extract content");
@@ -66,7 +66,7 @@ async fn test_extract_file_with_mime_override() {
66
66
  /// Test extraction of multiple file types.
67
67
  #[tokio::test]
68
68
  async fn test_extract_multiple_file_types() {
69
- let dir = tempdir().unwrap();
69
+ let dir = tempdir().expect("Operation failed");
70
70
  let config = ExtractionConfig::default();
71
71
 
72
72
  let test_files: Vec<(&str, &[u8], &str)> = vec![
@@ -80,9 +80,11 @@ async fn test_extract_multiple_file_types() {
80
80
 
81
81
  for (filename, content, expected_mime) in test_files {
82
82
  let file_path = dir.path().join(filename);
83
- fs::write(&file_path, content).unwrap();
83
+ fs::write(&file_path, content).expect("Operation failed");
84
84
 
85
- let result = extract_file(&file_path, None, &config).await.unwrap();
85
+ let result = extract_file(&file_path, None, &config)
86
+ .await
87
+ .expect("Async operation failed");
86
88
 
87
89
  assert_eq!(result.mime_type, expected_mime, "MIME type mismatch for {}", filename);
88
90
  assert!(
@@ -115,7 +117,7 @@ async fn test_extract_bytes_various_mime_types() {
115
117
  let result = extract_bytes(content, mime_type, &config).await;
116
118
  assert!(result.is_ok(), "Extract bytes failed for MIME type: {}", mime_type);
117
119
 
118
- let result = result.unwrap();
120
+ let result = result.expect("Operation failed");
119
121
 
120
122
  assert_eq!(result.mime_type, mime_type, "MIME type mismatch");
121
123
  assert!(
@@ -131,7 +133,7 @@ async fn test_extract_bytes_various_mime_types() {
131
133
  /// Test batch extraction with concurrent processing.
132
134
  #[tokio::test]
133
135
  async fn test_batch_extract_file_concurrency() {
134
- let dir = tempdir().unwrap();
136
+ let dir = tempdir().expect("Operation failed");
135
137
  let config = ExtractionConfig::default();
136
138
 
137
139
  let num_files = 10;
@@ -139,14 +141,14 @@ async fn test_batch_extract_file_concurrency() {
139
141
 
140
142
  for i in 0..num_files {
141
143
  let file_path = dir.path().join(format!("test_{}.txt", i));
142
- fs::write(&file_path, format!("Content {}", i)).unwrap();
144
+ fs::write(&file_path, format!("Content {}", i)).expect("Operation failed");
143
145
  paths.push(file_path);
144
146
  }
145
147
 
146
148
  let results = batch_extract_file(paths.clone(), &config).await;
147
149
  assert!(results.is_ok());
148
150
 
149
- let results = results.unwrap();
151
+ let results = results.expect("Operation failed");
150
152
  assert_eq!(results.len(), num_files);
151
153
 
152
154
  for (i, result) in results.iter().enumerate() {
@@ -169,7 +171,7 @@ async fn test_batch_extract_empty() {
169
171
 
170
172
  let results = batch_extract_file(paths, &config).await;
171
173
  assert!(results.is_ok());
172
- assert_eq!(results.unwrap().len(), 0);
174
+ assert_eq!(results.expect("Operation failed").len(), 0);
173
175
  }
174
176
 
175
177
  /// Test batch_extract_bytes with concurrent processing.
@@ -193,7 +195,7 @@ async fn test_batch_extract_bytes_concurrency() {
193
195
  let results = batch_extract_bytes(owned_contents, &config).await;
194
196
  assert!(results.is_ok());
195
197
 
196
- let results = results.unwrap();
198
+ let results = results.expect("Operation failed");
197
199
  assert_eq!(results.len(), 5);
198
200
 
199
201
  for (i, result) in results.iter().enumerate() {
@@ -214,28 +216,28 @@ async fn test_batch_extract_bytes_concurrency() {
214
216
  /// Test sync wrappers for extraction functions.
215
217
  #[test]
216
218
  fn test_sync_wrappers() {
217
- let dir = tempdir().unwrap();
219
+ let dir = tempdir().expect("Operation failed");
218
220
  let file_path = dir.path().join("sync_test.txt");
219
- fs::write(&file_path, "sync content").unwrap();
221
+ fs::write(&file_path, "sync content").expect("Operation failed");
220
222
 
221
223
  let config = ExtractionConfig::default();
222
224
 
223
225
  let result = extract_file_sync(&file_path, None, &config);
224
226
  assert!(result.is_ok(), "Sync file extraction should succeed");
225
- let extraction = result.unwrap();
227
+ let extraction = result.expect("Operation failed");
226
228
  assert_text_content(&extraction.content, "sync content");
227
229
  assert!(extraction.chunks.is_none(), "Chunks should be None");
228
230
 
229
231
  let result = extract_bytes_sync(b"test bytes", "text/plain", &config);
230
232
  assert!(result.is_ok(), "Sync bytes extraction should succeed");
231
- let extraction = result.unwrap();
233
+ let extraction = result.expect("Operation failed");
232
234
  assert_text_content(&extraction.content, "test bytes");
233
235
  assert!(extraction.chunks.is_none(), "Chunks should be None");
234
236
 
235
237
  let paths = vec![file_path];
236
238
  let results = batch_extract_file_sync(paths, &config);
237
239
  assert!(results.is_ok(), "Batch sync file should succeed");
238
- let results = results.unwrap();
240
+ let results = results.expect("Operation failed");
239
241
  assert_eq!(results.len(), 1);
240
242
  assert_text_content(&results[0].content, "sync content");
241
243
  assert!(results[0].chunks.is_none(), "Chunks should be None");
@@ -247,7 +249,7 @@ fn test_sync_wrappers() {
247
249
  .collect();
248
250
  let results = batch_extract_bytes_sync(owned_contents, &config);
249
251
  assert!(results.is_ok(), "Batch bytes sync should succeed");
250
- let results = results.unwrap();
252
+ let results = results.expect("Operation failed");
251
253
  assert_eq!(results.len(), 1);
252
254
  assert_text_content(&results[0].content, "test");
253
255
  assert!(results[0].chunks.is_none(), "Chunks should be None");
@@ -256,7 +258,7 @@ fn test_sync_wrappers() {
256
258
  /// Test MIME type detection for various extensions.
257
259
  #[test]
258
260
  fn test_mime_detection_comprehensive() {
259
- let dir = tempdir().unwrap();
261
+ let dir = tempdir().expect("Operation failed");
260
262
 
261
263
  let test_cases = vec![
262
264
  ("test.txt", "text/plain"),
@@ -287,9 +289,9 @@ fn test_mime_detection_comprehensive() {
287
289
 
288
290
  for (filename, expected_mime) in test_cases {
289
291
  let file_path = dir.path().join(filename);
290
- File::create(&file_path).unwrap();
292
+ File::create(&file_path).expect("Operation failed");
291
293
 
292
- let detected = detect_mime_type(&file_path, true).unwrap();
294
+ let detected = detect_mime_type(&file_path, true).expect("Operation failed");
293
295
  assert_eq!(detected, expected_mime, "Failed for {}", filename);
294
296
 
295
297
  let validated = validate_mime_type(&detected);
@@ -312,7 +314,7 @@ fn test_mime_validation() {
312
314
  /// Test case-insensitive extension handling.
313
315
  #[test]
314
316
  fn test_case_insensitive_extensions() {
315
- let dir = tempdir().unwrap();
317
+ let dir = tempdir().expect("Operation failed");
316
318
 
317
319
  let test_cases = vec![
318
320
  ("test.PDF", "application/pdf"),
@@ -326,9 +328,9 @@ fn test_case_insensitive_extensions() {
326
328
 
327
329
  for (filename, expected_mime) in test_cases {
328
330
  let file_path = dir.path().join(filename);
329
- File::create(&file_path).unwrap();
331
+ File::create(&file_path).expect("Operation failed");
330
332
 
331
- let detected = detect_mime_type(&file_path, true).unwrap();
333
+ let detected = detect_mime_type(&file_path, true).expect("Operation failed");
332
334
  assert_eq!(detected, expected_mime, "Failed for {}", filename);
333
335
  }
334
336
  }
@@ -336,7 +338,7 @@ fn test_case_insensitive_extensions() {
336
338
  /// Test config loading from TOML file.
337
339
  #[test]
338
340
  fn test_config_loading() {
339
- let dir = tempdir().unwrap();
341
+ let dir = tempdir().expect("Operation failed");
340
342
  let config_path = dir.path().join("kreuzberg.toml");
341
343
 
342
344
  fs::write(
@@ -355,19 +357,19 @@ max_chars = 2000
355
357
  max_overlap = 300
356
358
  "#,
357
359
  )
358
- .unwrap();
360
+ .expect("Operation failed");
359
361
 
360
- let config = ExtractionConfig::from_toml_file(&config_path).unwrap();
362
+ let config = ExtractionConfig::from_toml_file(&config_path).expect("Operation failed");
361
363
 
362
364
  assert!(!config.use_cache);
363
365
  assert!(config.enable_quality_processing);
364
366
  assert!(!config.force_ocr);
365
367
 
366
- let ocr_config = config.ocr.unwrap();
368
+ let ocr_config = config.ocr.expect("Operation failed");
367
369
  assert_eq!(ocr_config.backend, "tesseract");
368
370
  assert_eq!(ocr_config.language, "deu");
369
371
 
370
- let chunking_config = config.chunking.unwrap();
372
+ let chunking_config = config.chunking.expect("Operation failed");
371
373
  assert_eq!(chunking_config.max_chars, 2000);
372
374
  assert_eq!(chunking_config.max_overlap, 300);
373
375
  }
@@ -375,9 +377,9 @@ max_overlap = 300
375
377
  /// Test config discovery in parent directories.
376
378
  #[test]
377
379
  fn test_config_discovery() {
378
- let dir = tempdir().unwrap();
380
+ let dir = tempdir().expect("Operation failed");
379
381
  let subdir = dir.path().join("subdir");
380
- fs::create_dir(&subdir).unwrap();
382
+ fs::create_dir(&subdir).expect("Operation failed");
381
383
 
382
384
  let config_path = dir.path().join("kreuzberg.toml");
383
385
  fs::write(
@@ -387,16 +389,16 @@ use_cache = false
387
389
  enable_quality_processing = true
388
390
  "#,
389
391
  )
390
- .unwrap();
392
+ .expect("Operation failed");
391
393
 
392
- let original_dir = std::env::current_dir().unwrap();
393
- std::env::set_current_dir(&subdir).unwrap();
394
+ let original_dir = std::env::current_dir().expect("Operation failed");
395
+ std::env::set_current_dir(&subdir).expect("Operation failed");
394
396
 
395
- let config = ExtractionConfig::discover().unwrap();
397
+ let config = ExtractionConfig::discover().expect("Operation failed");
396
398
  assert!(config.is_some());
397
- assert!(!config.unwrap().use_cache);
399
+ assert!(!config.expect("Operation failed").use_cache);
398
400
 
399
- std::env::set_current_dir(original_dir).unwrap();
401
+ std::env::set_current_dir(original_dir).expect("Operation failed");
400
402
  }
401
403
 
402
404
  /// Test error handling for nonexistent files.
@@ -406,10 +408,8 @@ async fn test_nonexistent_file_error() {
406
408
  let result = extract_file("/nonexistent/file.txt", None, &config).await;
407
409
 
408
410
  assert!(result.is_err());
409
- assert!(matches!(
410
- result.unwrap_err(),
411
- kreuzberg::KreuzbergError::Validation { .. }
412
- ));
411
+ // File validation returns Io error for missing files (NotFound)
412
+ assert!(matches!(result.unwrap_err(), kreuzberg::KreuzbergError::Io(_)));
413
413
  }
414
414
 
415
415
  /// Test error handling for unsupported MIME types.
@@ -428,9 +428,9 @@ async fn test_unsupported_mime_type_error() {
428
428
  /// Test pipeline execution (currently stub, will be expanded in Phase 2).
429
429
  #[tokio::test]
430
430
  async fn test_pipeline_execution() {
431
- let dir = tempdir().unwrap();
431
+ let dir = tempdir().expect("Operation failed");
432
432
  let file_path = dir.path().join("pipeline_test.txt");
433
- fs::write(&file_path, "pipeline content").unwrap();
433
+ fs::write(&file_path, "pipeline content").expect("Operation failed");
434
434
 
435
435
  let config = ExtractionConfig {
436
436
  enable_quality_processing: true,
@@ -440,7 +440,7 @@ async fn test_pipeline_execution() {
440
440
  let result = extract_file(&file_path, None, &config).await;
441
441
  assert!(result.is_ok(), "Pipeline execution should succeed");
442
442
 
443
- let result = result.unwrap();
443
+ let result = result.expect("Operation failed");
444
444
  assert_text_content(&result.content, "pipeline content");
445
445
  assert_eq!(result.mime_type, "text/plain");
446
446
  assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
@@ -450,9 +450,9 @@ async fn test_pipeline_execution() {
450
450
  /// Test extraction with OCR config (placeholder test for Phase 2).
451
451
  #[tokio::test]
452
452
  async fn test_extraction_with_ocr_config() {
453
- let dir = tempdir().unwrap();
453
+ let dir = tempdir().expect("Operation failed");
454
454
  let file_path = dir.path().join("ocr_test.txt");
455
- fs::write(&file_path, "ocr content").unwrap();
455
+ fs::write(&file_path, "ocr content").expect("Operation failed");
456
456
 
457
457
  let config = ExtractionConfig {
458
458
  ocr: Some(kreuzberg::OcrConfig {
@@ -473,11 +473,11 @@ async fn test_extraction_with_ocr_config() {
473
473
  #[cfg(feature = "chunking")]
474
474
  #[tokio::test]
475
475
  async fn test_extraction_with_chunking_config() {
476
- let dir = tempdir().unwrap();
476
+ let dir = tempdir().expect("Operation failed");
477
477
  let file_path = dir.path().join("chunking_test.txt");
478
478
 
479
479
  let long_content = "content for chunking. ".repeat(100);
480
- fs::write(&file_path, &long_content).unwrap();
480
+ fs::write(&file_path, &long_content).expect("Operation failed");
481
481
 
482
482
  let config = ExtractionConfig {
483
483
  chunking: Some(kreuzberg::ChunkingConfig {
@@ -492,21 +492,21 @@ async fn test_extraction_with_chunking_config() {
492
492
  let result = extract_file(&file_path, None, &config).await;
493
493
  assert!(result.is_ok(), "Extraction with chunking should succeed");
494
494
 
495
- let result = result.unwrap();
495
+ let result = result.expect("Operation failed");
496
496
 
497
497
  assert!(
498
498
  result.chunks.is_some(),
499
499
  "Chunks should be populated when chunking enabled"
500
500
  );
501
501
 
502
- let chunks = result.chunks.unwrap();
502
+ let chunks = result.chunks.expect("Operation failed");
503
503
  assert!(chunks.len() > 1, "Should have multiple chunks for long content");
504
504
 
505
505
  assert!(result.metadata.additional.contains_key("chunk_count"));
506
- let chunk_count = result.metadata.additional.get("chunk_count").unwrap();
506
+ let chunk_count = result.metadata.additional.get("chunk_count").expect("Value not found");
507
507
  assert_eq!(
508
508
  chunks.len(),
509
- chunk_count.as_u64().unwrap() as usize,
509
+ chunk_count.as_u64().expect("Operation failed") as usize,
510
510
  "chunk_count should match chunks length"
511
511
  );
512
512