kreuzberg 4.1.2 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/kreuzberg.gemspec +13 -1
- data/lib/kreuzberg/config.rb +70 -35
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +5 -1
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +55 -53
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +10 -2
|
@@ -26,10 +26,10 @@ async fn test_chunk_basic() {
|
|
|
26
26
|
})
|
|
27
27
|
.to_string(),
|
|
28
28
|
))
|
|
29
|
-
.
|
|
29
|
+
.expect("Operation failed"),
|
|
30
30
|
)
|
|
31
31
|
.await
|
|
32
|
-
.
|
|
32
|
+
.expect("Operation failed");
|
|
33
33
|
|
|
34
34
|
assert_eq!(response.status(), StatusCode::OK);
|
|
35
35
|
}
|
|
@@ -44,10 +44,10 @@ async fn test_chunk_empty_text_returns_400() {
|
|
|
44
44
|
.method("POST")
|
|
45
45
|
.header("content-type", "application/json")
|
|
46
46
|
.body(Body::from(json!({"text": ""}).to_string()))
|
|
47
|
-
.
|
|
47
|
+
.expect("Operation failed"),
|
|
48
48
|
)
|
|
49
49
|
.await
|
|
50
|
-
.
|
|
50
|
+
.expect("Operation failed");
|
|
51
51
|
|
|
52
52
|
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
|
53
53
|
}
|
|
@@ -68,10 +68,10 @@ async fn test_chunk_markdown_strategy() {
|
|
|
68
68
|
})
|
|
69
69
|
.to_string(),
|
|
70
70
|
))
|
|
71
|
-
.
|
|
71
|
+
.expect("Operation failed"),
|
|
72
72
|
)
|
|
73
73
|
.await
|
|
74
|
-
.
|
|
74
|
+
.expect("Operation failed");
|
|
75
75
|
|
|
76
76
|
assert_eq!(response.status(), StatusCode::OK);
|
|
77
77
|
}
|
|
@@ -99,15 +99,17 @@ async fn test_chunk_response_structure() {
|
|
|
99
99
|
})
|
|
100
100
|
.to_string(),
|
|
101
101
|
))
|
|
102
|
-
.
|
|
102
|
+
.expect("Operation failed"),
|
|
103
103
|
)
|
|
104
104
|
.await
|
|
105
|
-
.
|
|
105
|
+
.expect("Operation failed");
|
|
106
106
|
|
|
107
107
|
assert_eq!(response.status(), StatusCode::OK);
|
|
108
108
|
|
|
109
|
-
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
110
|
-
|
|
109
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
110
|
+
.await
|
|
111
|
+
.expect("Failed to convert to bytes");
|
|
112
|
+
let chunk_response: ChunkResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
|
111
113
|
|
|
112
114
|
// Verify response structure
|
|
113
115
|
assert!(chunk_response.chunk_count > 0);
|
|
@@ -143,10 +145,10 @@ async fn test_chunk_invalid_strategy_returns_400() {
|
|
|
143
145
|
})
|
|
144
146
|
.to_string(),
|
|
145
147
|
))
|
|
146
|
-
.
|
|
148
|
+
.expect("Operation failed"),
|
|
147
149
|
)
|
|
148
150
|
.await
|
|
149
|
-
.
|
|
151
|
+
.expect("Operation failed");
|
|
150
152
|
|
|
151
153
|
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
|
152
154
|
}
|
|
@@ -168,15 +170,17 @@ async fn test_chunk_with_defaults() {
|
|
|
168
170
|
})
|
|
169
171
|
.to_string(),
|
|
170
172
|
))
|
|
171
|
-
.
|
|
173
|
+
.expect("Operation failed"),
|
|
172
174
|
)
|
|
173
175
|
.await
|
|
174
|
-
.
|
|
176
|
+
.expect("Operation failed");
|
|
175
177
|
|
|
176
178
|
assert_eq!(response.status(), StatusCode::OK);
|
|
177
179
|
|
|
178
|
-
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
179
|
-
|
|
180
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
181
|
+
.await
|
|
182
|
+
.expect("Failed to convert to bytes");
|
|
183
|
+
let chunk_response: ChunkResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
|
180
184
|
|
|
181
185
|
// Verify defaults are applied
|
|
182
186
|
assert_eq!(chunk_response.config.max_characters, 2000);
|
|
@@ -195,10 +199,10 @@ async fn test_chunk_malformed_json_returns_400() {
|
|
|
195
199
|
.method("POST")
|
|
196
200
|
.header("content-type", "application/json")
|
|
197
201
|
.body(Body::from("{invalid json}"))
|
|
198
|
-
.
|
|
202
|
+
.expect("Operation failed"),
|
|
199
203
|
)
|
|
200
204
|
.await
|
|
201
|
-
.
|
|
205
|
+
.expect("Operation failed");
|
|
202
206
|
|
|
203
207
|
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
|
204
208
|
}
|
|
@@ -221,15 +225,17 @@ async fn test_chunk_case_insensitive_chunker_type() {
|
|
|
221
225
|
})
|
|
222
226
|
.to_string(),
|
|
223
227
|
))
|
|
224
|
-
.
|
|
228
|
+
.expect("Operation failed"),
|
|
225
229
|
)
|
|
226
230
|
.await
|
|
227
|
-
.
|
|
231
|
+
.expect("Operation failed");
|
|
228
232
|
|
|
229
233
|
assert_eq!(response.status(), StatusCode::OK);
|
|
230
234
|
|
|
231
|
-
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
232
|
-
|
|
235
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
236
|
+
.await
|
|
237
|
+
.expect("Failed to convert to bytes");
|
|
238
|
+
let chunk_response: ChunkResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
|
233
239
|
|
|
234
240
|
// Verify it's normalized to lowercase
|
|
235
241
|
assert_eq!(chunk_response.chunker_type, "markdown");
|
|
@@ -258,15 +264,17 @@ async fn test_chunk_long_text() {
|
|
|
258
264
|
})
|
|
259
265
|
.to_string(),
|
|
260
266
|
))
|
|
261
|
-
.
|
|
267
|
+
.expect("Operation failed"),
|
|
262
268
|
)
|
|
263
269
|
.await
|
|
264
|
-
.
|
|
270
|
+
.expect("Operation failed");
|
|
265
271
|
|
|
266
272
|
assert_eq!(response.status(), StatusCode::OK);
|
|
267
273
|
|
|
268
|
-
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
269
|
-
|
|
274
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
275
|
+
.await
|
|
276
|
+
.expect("Failed to convert to bytes");
|
|
277
|
+
let chunk_response: ChunkResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
|
270
278
|
|
|
271
279
|
// Should have multiple chunks
|
|
272
280
|
assert!(chunk_response.chunk_count > 1);
|
|
@@ -296,15 +304,17 @@ async fn test_chunk_custom_config() {
|
|
|
296
304
|
})
|
|
297
305
|
.to_string(),
|
|
298
306
|
))
|
|
299
|
-
.
|
|
307
|
+
.expect("Operation failed"),
|
|
300
308
|
)
|
|
301
309
|
.await
|
|
302
|
-
.
|
|
310
|
+
.expect("Operation failed");
|
|
303
311
|
|
|
304
312
|
assert_eq!(response.status(), StatusCode::OK);
|
|
305
313
|
|
|
306
|
-
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
307
|
-
|
|
314
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
315
|
+
.await
|
|
316
|
+
.expect("Failed to convert to bytes");
|
|
317
|
+
let chunk_response: ChunkResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
|
308
318
|
|
|
309
319
|
// Verify custom config was applied
|
|
310
320
|
assert_eq!(chunk_response.config.max_characters, 30);
|
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
//! API consistency tests for ExtractionConfig and related types.
|
|
2
|
+
//!
|
|
3
|
+
//! This test suite validates that:
|
|
4
|
+
//! 1. ExtractionConfig serialization is complete with all fields
|
|
5
|
+
//! 2. All required configuration fields are present
|
|
6
|
+
//! 3. Configuration types maintain consistency across different formats
|
|
7
|
+
//! 4. No configuration fields are accidentally hidden or lost
|
|
8
|
+
|
|
9
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
10
|
+
use kreuzberg::core::config::OutputFormat;
|
|
11
|
+
use serde_json::json;
|
|
12
|
+
|
|
13
|
+
/// Serializes the default `ExtractionConfig` to JSON and checks that each core
/// field is present as a top-level key.
#[test]
fn test_extraction_config_serialization_includes_all_fields() {
    let defaults = ExtractionConfig::default();
    let serialized = serde_json::to_value(&defaults).expect("Failed to serialize config");

    // Each entry pairs a core field with the message reported when it is absent.
    let required = [
        ("use_cache", "Missing 'use_cache' field in serialized config"),
        ("enable_quality_processing", "Missing 'enable_quality_processing' field"),
        ("force_ocr", "Missing 'force_ocr' field in serialized config"),
        ("max_concurrent_extractions", "Missing 'max_concurrent_extractions' field"),
        ("result_format", "Missing 'result_format' field in serialized config"),
        ("output_format", "Missing 'output_format' field in serialized config"),
    ];

    for &(field, message) in &required {
        assert!(serialized.get(field).is_some(), "{}", message);
    }
}
|
|
44
|
+
|
|
45
|
+
/// Pins the default values of the scalar `ExtractionConfig` fields.
#[test]
fn test_extraction_config_defaults_are_correct() {
    let defaults = ExtractionConfig::default();

    // Flags that are switched on by default.
    assert!(defaults.use_cache, "Default use_cache should be true");
    assert!(
        defaults.enable_quality_processing,
        "Default enable_quality_processing should be true"
    );

    // Flags and limits that are off / unset by default.
    assert!(!defaults.force_ocr, "Default force_ocr should be false");
    assert_eq!(
        defaults.max_concurrent_extractions, None,
        "Default max_concurrent_extractions should be None"
    );
}
|
|
60
|
+
|
|
61
|
+
/// Encodes the default `ExtractionConfig` as a JSON string, decodes it again,
/// and checks that the compared fields are unchanged.
#[test]
fn test_extraction_config_serialization_roundtrip() {
    let original = ExtractionConfig::default();

    // Encode to a JSON string, then decode that same string.
    let encoded = serde_json::to_string(&original).expect("Failed to serialize");
    let restored: ExtractionConfig =
        serde_json::from_str(&encoded).expect("Failed to deserialize config from JSON");

    // Compare field by field so that a failure names the field that changed.
    assert_eq!(
        original.use_cache, restored.use_cache,
        "use_cache should survive roundtrip"
    );
    assert_eq!(
        original.enable_quality_processing, restored.enable_quality_processing,
        "enable_quality_processing should survive roundtrip"
    );
    assert_eq!(
        original.force_ocr, restored.force_ocr,
        "force_ocr should survive roundtrip"
    );
    assert_eq!(
        original.result_format, restored.result_format,
        "result_format should survive roundtrip"
    );
    assert_eq!(
        original.output_format, restored.output_format,
        "output_format should survive roundtrip"
    );
}
|
|
94
|
+
|
|
95
|
+
/// The default config must serialize to a JSON object that contains every
/// expected core key.
#[test]
fn test_extraction_config_json_structure() {
    let serialized =
        serde_json::to_value(&ExtractionConfig::default()).expect("Failed to serialize config");
    let top_level = serialized.as_object().expect("Config should serialize as object");

    // Keys that have to be present at the top level of the object.
    let required_keys = [
        "use_cache",
        "enable_quality_processing",
        "force_ocr",
        "max_concurrent_extractions",
        "result_format",
        "output_format",
    ];

    for &name in &required_keys {
        assert!(top_level.contains_key(name), "Missing field in JSON: {}", name);
    }
}
|
|
116
|
+
|
|
117
|
+
/// Checks the JSON value kind (boolean or string) of the core fields in the
/// serialized default config.
#[test]
fn test_extraction_config_values_are_correct_types() {
    /// Fetches a top-level key, panicking with "Value not found" when it is absent.
    fn field<'a>(root: &'a serde_json::Value, name: &str) -> &'a serde_json::Value {
        root.get(name).expect("Value not found")
    }

    let defaults = ExtractionConfig::default();
    let serialized = serde_json::to_value(&defaults).expect("Failed to serialize config");

    // Boolean flags.
    assert!(
        field(&serialized, "use_cache").is_boolean(),
        "use_cache should be boolean"
    );
    assert!(
        field(&serialized, "enable_quality_processing").is_boolean(),
        "enable_quality_processing should be boolean"
    );
    assert!(
        field(&serialized, "force_ocr").is_boolean(),
        "force_ocr should be boolean"
    );

    // Format selectors are emitted as strings.
    assert!(
        field(&serialized, "result_format").is_string(),
        "result_format should be string"
    );
    assert!(
        field(&serialized, "output_format").is_string(),
        "output_format should be string"
    );
}
|
|
146
|
+
|
|
147
|
+
/// Non-default values set on the struct must appear unchanged in the
/// serialized JSON.
#[test]
fn test_extraction_config_with_custom_values() {
    // Override three fields and take everything else from the defaults.
    let customized = ExtractionConfig {
        use_cache: false,
        force_ocr: true,
        max_concurrent_extractions: Some(8),
        ..ExtractionConfig::default()
    };
    let serialized = serde_json::to_value(&customized).expect("Failed to serialize");

    let use_cache = serialized.get("use_cache").expect("Value not found");
    assert_eq!(use_cache, &json!(false));

    let force_ocr = serialized.get("force_ocr").expect("Value not found");
    assert_eq!(force_ocr, &json!(true));

    let max_concurrent = serialized
        .get("max_concurrent_extractions")
        .expect("Value not found");
    assert_eq!(max_concurrent, &json!(8));
}
|
|
165
|
+
|
|
166
|
+
/// A JSON object that sets only some fields must parse, with the omitted
/// fields taking their default values.
#[test]
fn test_extraction_config_partial_json_parsing() {
    // Only `use_cache` is supplied; every other field is left out.
    let parsed: ExtractionConfig = serde_json::from_value(json!({
        "use_cache": false,
    }))
    .expect("Failed to parse partial config");

    assert!(!parsed.use_cache, "Explicit use_cache should be respected");
    assert!(
        parsed.enable_quality_processing,
        "Omitted enable_quality_processing should use default"
    );
    assert!(!parsed.force_ocr, "Omitted force_ocr should use default");
}
|
|
182
|
+
|
|
183
|
+
/// Parsing `{}` must give the same values as `ExtractionConfig::default()`
/// for the compared fields.
#[test]
fn test_extraction_config_empty_json_uses_defaults() {
    let parsed: ExtractionConfig =
        serde_json::from_value(json!({})).expect("Failed to parse empty config");
    let expected = ExtractionConfig::default();

    // Every compared field of the parsed config should match the default.
    assert_eq!(parsed.use_cache, expected.use_cache);
    assert_eq!(
        parsed.enable_quality_processing,
        expected.enable_quality_processing
    );
    assert_eq!(parsed.force_ocr, expected.force_ocr);
    assert_eq!(parsed.result_format, expected.result_format);
    assert_eq!(parsed.output_format, expected.output_format);
}
|
|
200
|
+
|
|
201
|
+
/// Lowercase `output_format` strings must parse into the matching
/// `OutputFormat` variant.
#[test]
fn test_extraction_config_output_format_valid_values() {
    // (input string, expected variant, message reported if parsing fails).
    // NOTE(review): only lowercase spellings are exercised here; mixed-case
    // input is not covered by this test.
    let cases = [
        ("plain", OutputFormat::Plain, "Failed to parse plain output_format"),
        (
            "markdown",
            OutputFormat::Markdown,
            "Failed to parse markdown output_format",
        ),
        ("html", OutputFormat::Html, "Failed to parse html output_format"),
    ];

    for &(raw, expected, message) in &cases {
        let parsed: ExtractionConfig =
            serde_json::from_value(json!({ "output_format": raw })).expect(message);
        assert_eq!(parsed.output_format, expected);
    }
}
|
|
218
|
+
|
|
219
|
+
/// `"unified"` must be accepted as a `result_format` value.
#[test]
fn test_extraction_config_result_format_valid_values() {
    let parsed: ExtractionConfig = serde_json::from_value(json!({ "result_format": "unified" }))
        .expect("Failed to parse unified result_format");

    // result_format uses types::OutputFormat, not core::config::OutputFormat.
    // NOTE(review): the parsed value is only read, never asserted; successful
    // parsing is the sole check. Consider comparing against the expected variant.
    let _ = parsed.result_format;
}
|
|
228
|
+
|
|
229
|
+
/// Every top-level key produced by serializing the default config must be on
/// the list of known fields.
#[test]
fn test_extraction_config_no_unknown_fields_in_default() {
    let serialized =
        serde_json::to_value(&ExtractionConfig::default()).expect("Failed to serialize");
    let top_level = serialized.as_object().expect("Should be object");

    // Allow-list of field names (some may be null based on feature flags).
    let known_fields = [
        "use_cache",
        "enable_quality_processing",
        "ocr",
        "force_ocr",
        "chunking",
        "images",
        "pdf_options",
        "token_reduction",
        "language_detection",
        "pages",
        "keywords",
        "postprocessor",
        "html_options",
        "max_concurrent_extractions",
        "result_format",
        "output_format",
    ];

    for name in top_level.keys().map(String::as_str) {
        assert!(
            known_fields.contains(&name),
            "Unexpected field in config: {}",
            name
        );
    }
}
|
|
264
|
+
|
|
265
|
+
/// Exercises `ExtractionConfig::needs_image_processing` for the default
/// config, a config with OCR set, and a config with image extraction set.
#[test]
fn test_extraction_config_needs_image_processing() {
    let mut cfg = ExtractionConfig::default();

    // Case 1: the untouched default config.
    assert!(
        !cfg.needs_image_processing(),
        "Default config should not need image processing"
    );

    // Case 2: an OCR configuration is present.
    let ocr = kreuzberg::OcrConfig {
        backend: "tesseract".to_string(),
        language: "eng".to_string(),
        tesseract_config: None,
        output_format: None,
    };
    cfg.ocr = Some(ocr);
    assert!(
        cfg.needs_image_processing(),
        "Config with OCR should need image processing"
    );

    // Case 3: OCR is cleared again and image extraction is enabled instead.
    cfg.ocr = None;
    let images = kreuzberg::ImageExtractionConfig {
        extract_images: true,
        target_dpi: 150,
        max_image_dimension: 2000,
        auto_adjust_dpi: true,
        min_dpi: 72,
        max_dpi: 600,
    };
    cfg.images = Some(images);
    assert!(
        cfg.needs_image_processing(),
        "Config with image extraction should need image processing"
    );
}
|
|
303
|
+
|
|
304
|
+
/// An `output_format` parsed from `"markdown"` must serialize back to the
/// same lowercase string.
#[test]
fn test_output_format_serialization_lowercase() {
    let input = serde_json::json!({"output_format": "markdown"});
    let parsed: ExtractionConfig = serde_json::from_value(input).expect("Failed to parse");

    // Serialize the parsed config again and inspect the emitted spelling.
    let emitted = serde_json::to_value(&parsed).expect("Failed to reserialize");
    assert_eq!(emitted["output_format"], "markdown");
}
|
|
314
|
+
|
|
315
|
+
/// Serialized configs must expose the same top-level keys regardless of the
/// field values they carry.
///
/// Compares the default config with one that only flips `force_ocr`. The key
/// count is checked first, then the key lists themselves, so a renamed or
/// swapped key cannot pass just because the totals happen to match.
#[test]
fn test_extraction_config_field_presence_consistency() {
    let config = ExtractionConfig::default();
    let json1 = serde_json::to_value(&config).expect("Failed to serialize");

    let config2 = ExtractionConfig {
        force_ocr: true,
        ..ExtractionConfig::default()
    };
    let json2 = serde_json::to_value(&config2).expect("Failed to serialize");

    // Both values come from the same struct type, so their keys are expected
    // to be emitted in the same order and can be compared position by position.
    let keys1: Vec<_> = json1.as_object().expect("Expected object value").keys().collect();
    let keys2: Vec<_> = json2.as_object().expect("Expected object value").keys().collect();

    assert_eq!(keys1.len(), keys2.len(), "Configs should have same number of keys");
    assert_eq!(keys1, keys2, "Configs should have the same top-level keys");
}
|
|
333
|
+
|
|
334
|
+
/// Each listed `OutputFormat` variant must survive a serialize/deserialize
/// roundtrip through `serde_json::Value`.
#[test]
fn test_output_format_all_variants() {
    let variants = [
        OutputFormat::Plain,
        OutputFormat::Markdown,
        OutputFormat::Html,
        OutputFormat::Djot,
    ];

    for &variant in &variants {
        let encoded = serde_json::to_value(variant).expect("Failed to serialize");
        let decoded: OutputFormat = serde_json::from_value(encoded).expect("Failed to deserialize");
        assert_eq!(variant, decoded, "Format should survive roundtrip");
    }
}
|