kreuzberg 4.1.2 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/kreuzberg.gemspec +13 -1
- data/lib/kreuzberg/config.rb +70 -35
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +5 -1
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +55 -53
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +10 -2
data/vendor/kreuzberg/tests/contract_mcp.rs
@@ -0,0 +1,314 @@
+//! MCP contract tests - verify MCP config matches Rust core
+//!
+//! This test suite validates that MCP (Model Context Protocol) configuration
+//! produces identical JSON to the Rust core library when parsing configuration.
+//! This ensures that MCP users get the same configuration behavior as CLI and SDK users.
+
+use kreuzberg::core::config::ExtractionConfig;
+use kreuzberg::core::config::OutputFormat;
+use serde_json::json;
+
+#[test]
+fn test_mcp_basic_config_json_matches_rust_core() {
+    // Create config via Rust core
+    let rust_config = ExtractionConfig {
+        use_cache: true,
+        enable_quality_processing: true,
+        force_ocr: false,
+        output_format: OutputFormat::Plain,
+        result_format: kreuzberg::types::OutputFormat::Unified,
+        ..Default::default()
+    };
+    let rust_json = serde_json::to_value(&rust_config).expect("Failed to serialize rust config");
+
+    // Simulate MCP config parameter deserialization
+    let mcp_json = json!({
+        "use_cache": true,
+        "enable_quality_processing": true,
+        "force_ocr": false,
+        "output_format": "plain",
+        "result_format": "unified"
+    });
+    let mcp_config: ExtractionConfig =
+        serde_json::from_value(mcp_json.clone()).expect("Failed to deserialize MCP config");
+    let mcp_serialized = serde_json::to_value(&mcp_config).expect("Failed to serialize MCP config");
+
+    // Verify they produce identical JSON for the relevant fields
+    assert_eq!(
+        rust_json.get("use_cache"),
+        mcp_serialized.get("use_cache"),
+        "MCP use_cache must match Rust core"
+    );
+    assert_eq!(
+        rust_json.get("enable_quality_processing"),
+        mcp_serialized.get("enable_quality_processing"),
+        "MCP enable_quality_processing must match Rust core"
+    );
+    assert_eq!(
+        rust_json.get("force_ocr"),
+        mcp_serialized.get("force_ocr"),
+        "MCP force_ocr must match Rust core"
+    );
+    assert_eq!(
+        rust_json.get("output_format"),
+        mcp_serialized.get("output_format"),
+        "MCP output_format must match Rust core"
+    );
+}
+
+#[test]
+fn test_mcp_ocr_config_nested_matches_rust_core() {
+    let mcp_json = json!({
+        "ocr": {
+            "backend": "tesseract"
+        },
+        "force_ocr": true
+    });
+
+    let config: ExtractionConfig = serde_json::from_value(mcp_json).expect("Failed to deserialize OCR config");
+
+    // Verify OCR config deserialized correctly
+    assert!(config.ocr.is_some(), "OCR config should be present");
+    assert!(config.force_ocr, "force_ocr should be true");
+
+    if let Some(ocr) = &config.ocr {
+        assert_eq!(ocr.backend, "tesseract", "OCR backend should be tesseract");
+    }
+
+    // Verify roundtrip
+    let serialized = serde_json::to_value(&config).expect("Failed to serialize");
+    assert!(serialized.get("ocr").is_some(), "Serialized config should include ocr");
+}
+
+#[test]
+fn test_mcp_chunking_config_nested_matches_rust_core() {
+    let mcp_json = json!({
+        "chunking": {
+            "max_chars": 500,
+            "max_overlap": 50,
+            "strategy": "sliding_window"
+        }
+    });
+
+    let config: ExtractionConfig = serde_json::from_value(mcp_json).expect("Failed to deserialize chunking config");
+
+    // Verify chunking config deserialized correctly
+    assert!(config.chunking.is_some(), "Chunking config should be present");
+
+    if let Some(chunking) = &config.chunking {
+        assert_eq!(chunking.max_chars, 500, "max_chars should be 500");
+        assert_eq!(chunking.max_overlap, 50, "max_overlap should be 50");
+    }
+
+    // Verify roundtrip
+    let serialized = serde_json::to_value(&config).expect("Failed to serialize");
+    assert!(
+        serialized.get("chunking").is_some(),
+        "Serialized config should include chunking"
+    );
+}
+
+#[test]
+fn test_mcp_full_config_preserves_all_fields() {
+    let full_config_json = json!({
+        "use_cache": false,
+        "enable_quality_processing": true,
+        "force_ocr": true,
+        "output_format": "markdown",
+        "result_format": "unified",
+        "max_concurrent_extractions": 8,
+        "ocr": {
+            "backend": "tesseract"
+        },
+        "chunking": {
+            "max_chars": 1000,
+            "max_overlap": 200
+        }
+    });
+
+    let config: ExtractionConfig =
+        serde_json::from_value(full_config_json.clone()).expect("Failed to deserialize full config");
+    let roundtrip_json = serde_json::to_value(&config).expect("Failed to serialize");
+
+    // Verify all top-level fields preserved
+    assert!(!config.use_cache, "use_cache should be false");
+    assert!(
+        config.enable_quality_processing,
+        "enable_quality_processing should be true"
+    );
+    assert!(config.force_ocr, "force_ocr should be true");
+    assert_eq!(
+        config.max_concurrent_extractions,
+        Some(8),
+        "max_concurrent_extractions should be 8"
+    );
+
+    // Verify nested fields preserved
+    assert!(config.ocr.is_some(), "OCR config should be present");
+    assert!(config.chunking.is_some(), "Chunking config should be present");
+
+    // Verify roundtrip integrity
+    assert_eq!(
+        roundtrip_json.get("use_cache"),
+        full_config_json.get("use_cache"),
+        "use_cache should survive roundtrip"
+    );
+    assert_eq!(
+        roundtrip_json.get("force_ocr"),
+        full_config_json.get("force_ocr"),
+        "force_ocr should survive roundtrip"
+    );
+    assert_eq!(
+        roundtrip_json.get("max_concurrent_extractions"),
+        full_config_json.get("max_concurrent_extractions"),
+        "max_concurrent_extractions should survive roundtrip"
+    );
+}
+
+#[test]
+fn test_mcp_default_config_matches_rust_core_defaults() {
+    // Create Rust core default
+    let rust_default = ExtractionConfig::default();
+    let rust_json = serde_json::to_value(&rust_default).expect("Failed to serialize default");
+
+    // Create empty JSON (simulates MCP with no overrides)
+    let mcp_json = json!({});
+    let mcp_config: ExtractionConfig = serde_json::from_value(mcp_json).expect("Failed to deserialize empty config");
+    let mcp_json_serialized = serde_json::to_value(&mcp_config).expect("Failed to serialize MCP default");
+
+    // Verify defaults match
+    assert_eq!(
+        mcp_json_serialized.get("use_cache"),
+        rust_json.get("use_cache"),
+        "use_cache default should match"
+    );
+    assert_eq!(
+        mcp_json_serialized.get("enable_quality_processing"),
+        rust_json.get("enable_quality_processing"),
+        "enable_quality_processing default should match"
+    );
+    assert_eq!(
+        mcp_json_serialized.get("force_ocr"),
+        rust_json.get("force_ocr"),
+        "force_ocr default should match"
+    );
+    assert_eq!(
+        mcp_json_serialized.get("result_format"),
+        rust_json.get("result_format"),
+        "result_format default should match"
+    );
+    assert_eq!(
+        mcp_json_serialized.get("output_format"),
+        rust_json.get("output_format"),
+        "output_format default should match"
+    );
+}
+
+#[test]
+fn test_mcp_output_format_values_are_valid() {
+    // Test all valid output format values (lowercase, as per serde rename_all)
+    let valid_formats = vec!["plain", "markdown", "html"];
+
+    for format in valid_formats {
+        let mcp_json = json!({
+            "output_format": format
+        });
+
+        let result = serde_json::from_value::<ExtractionConfig>(mcp_json);
+        assert!(result.is_ok(), "Format '{}' should deserialize successfully", format);
+
+        let config = result.unwrap();
+        assert!(
+            !config.output_format.to_string().is_empty(),
+            "Deserialized format should have valid string representation"
+        );
+    }
+}
+
+#[test]
+fn test_mcp_result_format_values_are_valid() {
+    // Test valid result format values (lowercase, as per serde rename_all)
+    let valid_formats = vec!["unified", "element_based"];
+
+    for format in valid_formats {
+        let mcp_json = json!({
+            "result_format": format
+        });
+
+        let result = serde_json::from_value::<ExtractionConfig>(mcp_json);
+        assert!(
+            result.is_ok(),
+            "Result format '{}' should deserialize successfully",
+            format
+        );
+    }
+}
+
+#[test]
+fn test_mcp_partial_override_preserves_defaults() {
+    // Create a partial config that overrides only one field
+    let partial_json = json!({
+        "force_ocr": true
+    });
+
+    let config: ExtractionConfig = serde_json::from_value(partial_json).expect("Failed to deserialize partial config");
+
+    // Verify override applied
+    assert!(config.force_ocr, "force_ocr override should be applied");
+
+    // Verify defaults preserved for other fields
+    assert!(config.use_cache, "use_cache should retain default when not overridden");
+    assert!(
+        config.enable_quality_processing,
+        "enable_quality_processing should retain default when not overridden"
+    );
+}
+
+#[test]
+fn test_mcp_error_handling_for_invalid_json() {
+    // Test that invalid format values produce errors (or are handled gracefully)
+    let invalid_json = json!({
+        "output_format": "InvalidFormat"
+    });
+
+    let result = serde_json::from_value::<ExtractionConfig>(invalid_json);
+    // The deserialization should either fail or parse to a valid state
+    // depending on how OutputFormat handles unknown values
+    if let Ok(config) = result {
+        let _ = config.output_format.to_string();
+    }
+}
+
+#[test]
+fn test_mcp_concurrent_extractions_override() {
+    let mcp_json = json!({
+        "max_concurrent_extractions": 16
+    });
+
+    let config: ExtractionConfig =
+        serde_json::from_value(mcp_json).expect("Failed to deserialize config with concurrent extractions");
+
+    assert_eq!(
+        config.max_concurrent_extractions,
+        Some(16),
+        "max_concurrent_extractions should be overridden to 16"
+    );
+}
+
+#[test]
+fn test_mcp_config_json_keys_case_sensitive() {
+    // Verify that config JSON keys are case-sensitive
+    let lowercase_json = json!({
+        "use_cache": true,
+        "force_ocr": false
+    });
+
+    let config: ExtractionConfig =
+        serde_json::from_value(lowercase_json).expect("Failed to deserialize lowercase config");
+
+    assert!(config.use_cache, "use_cache should be true");
+    assert!(!config.force_ocr, "force_ocr should be false");
+
+    // Note: serde by default fails on unknown fields, so camelCase would fail
+    // This test documents the expected behavior
+}
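Every one of these new contract tests leans on the same serde roundtrip pattern: deserialize the JSON an MCP client would send, re-serialize it, and compare the result field by field with a config constructed directly in Rust. A minimal self-contained sketch of that pattern, assuming only serde (with the derive feature) and serde_json; `MiniConfig` is a hypothetical stand-in for `ExtractionConfig`, not the real type:

use serde::{Deserialize, Serialize};
use serde_json::json;

// Hypothetical two-field mirror of the config shape under test.
#[derive(Serialize, Deserialize)]
#[serde(default)]
struct MiniConfig {
    use_cache: bool,
    force_ocr: bool,
}

impl Default for MiniConfig {
    fn default() -> Self {
        Self { use_cache: true, force_ocr: false }
    }
}

fn main() {
    // JSON as a client would send it; omitted keys fall back to defaults.
    let incoming = json!({ "force_ocr": true });
    let parsed: MiniConfig = serde_json::from_value(incoming).expect("deserialize");

    // Re-serialize and compare field by field against a core-constructed value.
    let roundtrip = serde_json::to_value(&parsed).expect("serialize");
    let core = serde_json::to_value(&MiniConfig { force_ocr: true, ..Default::default() })
        .expect("serialize core");
    assert_eq!(roundtrip.get("force_ocr"), core.get("force_ocr"));
    assert_eq!(roundtrip.get("use_cache"), core.get("use_cache"));
    println!("contract holds: {roundtrip}");
}

Because the container-level #[serde(default)] fills omitted keys from `Default`, a partial payload like {"force_ocr": true} keeps the defaults for everything else, which is the behavior test_mcp_partial_override_preserves_defaults pins down above.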
@@ -26,16 +26,16 @@ fn assert_text_content(actual: &str, expected: &str) {
 /// Test basic file extraction with MIME detection.
 #[tokio::test]
 async fn test_extract_file_basic() {
-    let dir = tempdir().
+    let dir = tempdir().expect("Operation failed");
     let file_path = dir.path().join("test.txt");
-    let mut file = File::create(&file_path).
-    file.write_all(b"Hello, Kreuzberg!").
+    let mut file = File::create(&file_path).expect("Operation failed");
+    file.write_all(b"Hello, Kreuzberg!").expect("Operation failed");

     let config = ExtractionConfig::default();
     let result = extract_file(&file_path, None, &config).await;

     assert!(result.is_ok(), "Basic file extraction should succeed");
-    let result = result.
+    let result = result.expect("Operation failed");

     assert_text_content(&result.content, "Hello, Kreuzberg!");
     assert_eq!(result.mime_type, "text/plain");
@@ -47,16 +47,16 @@ async fn test_extract_file_basic() {
 /// Test extraction with explicit MIME type override.
 #[tokio::test]
 async fn test_extract_file_with_mime_override() {
-    let dir = tempdir().
+    let dir = tempdir().expect("Operation failed");
     let file_path = dir.path().join("data.bin");
-    let mut file = File::create(&file_path).
-    file.write_all(b"Binary content").
+    let mut file = File::create(&file_path).expect("Operation failed");
+    file.write_all(b"Binary content").expect("Operation failed");

     let config = ExtractionConfig::default();
     let result = extract_file(&file_path, Some("text/plain"), &config).await;

     assert!(result.is_ok(), "MIME override should work");
-    let result = result.
+    let result = result.expect("Operation failed");

     assert_eq!(result.mime_type, "text/plain");
     assert!(!result.content.is_empty(), "Should extract content");
@@ -66,7 +66,7 @@ async fn test_extract_file_with_mime_override() {
 /// Test extraction of multiple file types.
 #[tokio::test]
 async fn test_extract_multiple_file_types() {
-    let dir = tempdir().
+    let dir = tempdir().expect("Operation failed");
     let config = ExtractionConfig::default();

     let test_files: Vec<(&str, &[u8], &str)> = vec![
@@ -80,9 +80,11 @@ async fn test_extract_multiple_file_types() {

     for (filename, content, expected_mime) in test_files {
         let file_path = dir.path().join(filename);
-        fs::write(&file_path, content).
+        fs::write(&file_path, content).expect("Operation failed");

-        let result = extract_file(&file_path, None, &config)
+        let result = extract_file(&file_path, None, &config)
+            .await
+            .expect("Async operation failed");

         assert_eq!(result.mime_type, expected_mime, "MIME type mismatch for {}", filename);
         assert!(
@@ -115,7 +117,7 @@ async fn test_extract_bytes_various_mime_types() {
         let result = extract_bytes(content, mime_type, &config).await;
         assert!(result.is_ok(), "Extract bytes failed for MIME type: {}", mime_type);

-        let result = result.
+        let result = result.expect("Operation failed");

         assert_eq!(result.mime_type, mime_type, "MIME type mismatch");
         assert!(
@@ -131,7 +133,7 @@ async fn test_extract_bytes_various_mime_types() {
 /// Test batch extraction with concurrent processing.
 #[tokio::test]
 async fn test_batch_extract_file_concurrency() {
-    let dir = tempdir().
+    let dir = tempdir().expect("Operation failed");
     let config = ExtractionConfig::default();

     let num_files = 10;
@@ -139,14 +141,14 @@ async fn test_batch_extract_file_concurrency() {

     for i in 0..num_files {
         let file_path = dir.path().join(format!("test_{}.txt", i));
-        fs::write(&file_path, format!("Content {}", i)).
+        fs::write(&file_path, format!("Content {}", i)).expect("Operation failed");
         paths.push(file_path);
     }

     let results = batch_extract_file(paths.clone(), &config).await;
     assert!(results.is_ok());

-    let results = results.
+    let results = results.expect("Operation failed");
     assert_eq!(results.len(), num_files);

     for (i, result) in results.iter().enumerate() {
@@ -169,7 +171,7 @@ async fn test_batch_extract_empty() {

     let results = batch_extract_file(paths, &config).await;
     assert!(results.is_ok());
-    assert_eq!(results.
+    assert_eq!(results.expect("Operation failed").len(), 0);
 }

 /// Test batch_extract_bytes with concurrent processing.
@@ -193,7 +195,7 @@ async fn test_batch_extract_bytes_concurrency() {
     let results = batch_extract_bytes(owned_contents, &config).await;
     assert!(results.is_ok());

-    let results = results.
+    let results = results.expect("Operation failed");
     assert_eq!(results.len(), 5);

     for (i, result) in results.iter().enumerate() {
@@ -214,28 +216,28 @@ async fn test_batch_extract_bytes_concurrency() {
 /// Test sync wrappers for extraction functions.
 #[test]
 fn test_sync_wrappers() {
-    let dir = tempdir().
+    let dir = tempdir().expect("Operation failed");
     let file_path = dir.path().join("sync_test.txt");
-    fs::write(&file_path, "sync content").
+    fs::write(&file_path, "sync content").expect("Operation failed");

     let config = ExtractionConfig::default();

     let result = extract_file_sync(&file_path, None, &config);
     assert!(result.is_ok(), "Sync file extraction should succeed");
-    let extraction = result.
+    let extraction = result.expect("Operation failed");
     assert_text_content(&extraction.content, "sync content");
     assert!(extraction.chunks.is_none(), "Chunks should be None");

     let result = extract_bytes_sync(b"test bytes", "text/plain", &config);
     assert!(result.is_ok(), "Sync bytes extraction should succeed");
-    let extraction = result.
+    let extraction = result.expect("Operation failed");
     assert_text_content(&extraction.content, "test bytes");
     assert!(extraction.chunks.is_none(), "Chunks should be None");

     let paths = vec![file_path];
     let results = batch_extract_file_sync(paths, &config);
     assert!(results.is_ok(), "Batch sync file should succeed");
-    let results = results.
+    let results = results.expect("Operation failed");
     assert_eq!(results.len(), 1);
     assert_text_content(&results[0].content, "sync content");
     assert!(results[0].chunks.is_none(), "Chunks should be None");
@@ -247,7 +249,7 @@ fn test_sync_wrappers() {
         .collect();
     let results = batch_extract_bytes_sync(owned_contents, &config);
     assert!(results.is_ok(), "Batch bytes sync should succeed");
-    let results = results.
+    let results = results.expect("Operation failed");
     assert_eq!(results.len(), 1);
     assert_text_content(&results[0].content, "test");
     assert!(results[0].chunks.is_none(), "Chunks should be None");
@@ -256,7 +258,7 @@ fn test_sync_wrappers() {
 /// Test MIME type detection for various extensions.
 #[test]
 fn test_mime_detection_comprehensive() {
-    let dir = tempdir().
+    let dir = tempdir().expect("Operation failed");

     let test_cases = vec![
         ("test.txt", "text/plain"),
@@ -287,9 +289,9 @@ fn test_mime_detection_comprehensive() {

     for (filename, expected_mime) in test_cases {
         let file_path = dir.path().join(filename);
-        File::create(&file_path).
+        File::create(&file_path).expect("Operation failed");

-        let detected = detect_mime_type(&file_path, true).
+        let detected = detect_mime_type(&file_path, true).expect("Operation failed");
         assert_eq!(detected, expected_mime, "Failed for {}", filename);

         let validated = validate_mime_type(&detected);
@@ -312,7 +314,7 @@ fn test_mime_validation() {
 /// Test case-insensitive extension handling.
 #[test]
 fn test_case_insensitive_extensions() {
-    let dir = tempdir().
+    let dir = tempdir().expect("Operation failed");

     let test_cases = vec![
         ("test.PDF", "application/pdf"),
@@ -326,9 +328,9 @@ fn test_case_insensitive_extensions() {

     for (filename, expected_mime) in test_cases {
         let file_path = dir.path().join(filename);
-        File::create(&file_path).
+        File::create(&file_path).expect("Operation failed");

-        let detected = detect_mime_type(&file_path, true).
+        let detected = detect_mime_type(&file_path, true).expect("Operation failed");
         assert_eq!(detected, expected_mime, "Failed for {}", filename);
     }
 }
@@ -336,7 +338,7 @@ fn test_case_insensitive_extensions() {
 /// Test config loading from TOML file.
 #[test]
 fn test_config_loading() {
-    let dir = tempdir().
+    let dir = tempdir().expect("Operation failed");
     let config_path = dir.path().join("kreuzberg.toml");

     fs::write(
@@ -355,19 +357,19 @@ max_chars = 2000
 max_overlap = 300
 "#,
     )
-    .
+    .expect("Operation failed");

-    let config = ExtractionConfig::from_toml_file(&config_path).
+    let config = ExtractionConfig::from_toml_file(&config_path).expect("Operation failed");

     assert!(!config.use_cache);
     assert!(config.enable_quality_processing);
     assert!(!config.force_ocr);

-    let ocr_config = config.ocr.
+    let ocr_config = config.ocr.expect("Operation failed");
     assert_eq!(ocr_config.backend, "tesseract");
     assert_eq!(ocr_config.language, "deu");

-    let chunking_config = config.chunking.
+    let chunking_config = config.chunking.expect("Operation failed");
     assert_eq!(chunking_config.max_chars, 2000);
     assert_eq!(chunking_config.max_overlap, 300);
 }
@@ -375,9 +377,9 @@ max_overlap = 300
 /// Test config discovery in parent directories.
 #[test]
 fn test_config_discovery() {
-    let dir = tempdir().
+    let dir = tempdir().expect("Operation failed");
     let subdir = dir.path().join("subdir");
-    fs::create_dir(&subdir).
+    fs::create_dir(&subdir).expect("Operation failed");

     let config_path = dir.path().join("kreuzberg.toml");
     fs::write(
@@ -387,16 +389,16 @@ use_cache = false
 enable_quality_processing = true
 "#,
     )
-    .
+    .expect("Operation failed");

-    let original_dir = std::env::current_dir().
-    std::env::set_current_dir(&subdir).
+    let original_dir = std::env::current_dir().expect("Operation failed");
+    std::env::set_current_dir(&subdir).expect("Operation failed");

-    let config = ExtractionConfig::discover().
+    let config = ExtractionConfig::discover().expect("Operation failed");
     assert!(config.is_some());
-    assert!(!config.
+    assert!(!config.expect("Operation failed").use_cache);

-    std::env::set_current_dir(original_dir).
+    std::env::set_current_dir(original_dir).expect("Operation failed");
 }

 /// Test error handling for nonexistent files.
@@ -428,9 +430,9 @@ async fn test_unsupported_mime_type_error() {
 /// Test pipeline execution (currently stub, will be expanded in Phase 2).
 #[tokio::test]
 async fn test_pipeline_execution() {
-    let dir = tempdir().
+    let dir = tempdir().expect("Operation failed");
     let file_path = dir.path().join("pipeline_test.txt");
-    fs::write(&file_path, "pipeline content").
+    fs::write(&file_path, "pipeline content").expect("Operation failed");

     let config = ExtractionConfig {
         enable_quality_processing: true,
@@ -440,7 +442,7 @@ async fn test_pipeline_execution() {
     let result = extract_file(&file_path, None, &config).await;
     assert!(result.is_ok(), "Pipeline execution should succeed");

-    let result = result.
+    let result = result.expect("Operation failed");
     assert_text_content(&result.content, "pipeline content");
     assert_eq!(result.mime_type, "text/plain");
     assert!(result.chunks.is_none(), "Chunks should be None without chunking config");
@@ -450,9 +452,9 @@ async fn test_pipeline_execution() {
 /// Test extraction with OCR config (placeholder test for Phase 2).
 #[tokio::test]
 async fn test_extraction_with_ocr_config() {
-    let dir = tempdir().
+    let dir = tempdir().expect("Operation failed");
     let file_path = dir.path().join("ocr_test.txt");
-    fs::write(&file_path, "ocr content").
+    fs::write(&file_path, "ocr content").expect("Operation failed");

     let config = ExtractionConfig {
         ocr: Some(kreuzberg::OcrConfig {
@@ -473,11 +475,11 @@ async fn test_extraction_with_ocr_config() {
 #[cfg(feature = "chunking")]
 #[tokio::test]
 async fn test_extraction_with_chunking_config() {
-    let dir = tempdir().
+    let dir = tempdir().expect("Operation failed");
     let file_path = dir.path().join("chunking_test.txt");

     let long_content = "content for chunking. ".repeat(100);
-    fs::write(&file_path, &long_content).
+    fs::write(&file_path, &long_content).expect("Operation failed");

     let config = ExtractionConfig {
         chunking: Some(kreuzberg::ChunkingConfig {
@@ -492,21 +494,21 @@ async fn test_extraction_with_chunking_config() {
     let result = extract_file(&file_path, None, &config).await;
     assert!(result.is_ok(), "Extraction with chunking should succeed");

-    let result = result.
+    let result = result.expect("Operation failed");

     assert!(
         result.chunks.is_some(),
         "Chunks should be populated when chunking enabled"
     );

-    let chunks = result.chunks.
+    let chunks = result.chunks.expect("Operation failed");
     assert!(chunks.len() > 1, "Should have multiple chunks for long content");

     assert!(result.metadata.additional.contains_key("chunk_count"));
-    let chunk_count = result.metadata.additional.get("chunk_count").
+    let chunk_count = result.metadata.additional.get("chunk_count").expect("Value not found");
     assert_eq!(
         chunks.len(),
-        chunk_count.as_u64().
+        chunk_count.as_u64().expect("Operation failed") as usize,
         "chunk_count should match chunks length"
     );
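Nearly every hunk above makes the same mechanical change: a bare `Result`/`Option` consumer becomes `.expect("Operation failed")`, so a failing test panics with a custom message identifying the operation, with the error's `Debug` form appended for `Result`. A standard-library-only illustration (the missing file path is made up):

use std::fs;

fn main() {
    // On failure, `expect` panics with the supplied message plus the error,
    // e.g.: Operation failed: Os { code: 2, kind: NotFound, ... }
    let contents = fs::read_to_string("surely-missing.txt").expect("Operation failed");
    println!("{contents}");
}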