kreuzberg 4.1.2 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/kreuzberg.gemspec +13 -1
- data/lib/kreuzberg/cli.rb +16 -6
- data/lib/kreuzberg/cli_proxy.rb +3 -1
- data/lib/kreuzberg/config.rb +121 -39
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/extraction_api.rb +20 -4
- data/lib/kreuzberg/result.rb +12 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -0
- data/sig/kreuzberg.rbs +28 -12
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/batch_spec.rb +6 -5
- data/spec/binding/error_recovery_spec.rb +3 -3
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/binding/tables_spec.rb +11 -2
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/startup.rs +15 -1
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
- data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
- data/vendor/kreuzberg/src/core/io.rs +7 -7
- data/vendor/kreuzberg/src/core/mime.rs +4 -4
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +57 -57
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +12 -2
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
//! API consistency tests for ExtractionConfig and related types.
|
|
2
|
+
//!
|
|
3
|
+
//! This test suite validates that:
|
|
4
|
+
//! 1. ExtractionConfig serialization is complete with all fields
|
|
5
|
+
//! 2. All required configuration fields are present
|
|
6
|
+
//! 3. Configuration types maintain consistency across different formats
|
|
7
|
+
//! 4. No configuration fields are accidentally hidden or lost
|
|
8
|
+
|
|
9
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
10
|
+
use kreuzberg::core::config::OutputFormat;
|
|
11
|
+
use serde_json::json;
|
|
12
|
+
|
|
13
|
+
/// Checks that a default `ExtractionConfig` exposes each core field when serialized to JSON.
#[test]
fn test_extraction_config_serialization_includes_all_fields() {
    let defaults = ExtractionConfig::default();
    let serialized = serde_json::to_value(&defaults).expect("Failed to serialize config");

    // Each core field, paired with the message reported when it is absent.
    let required = [
        ("use_cache", "Missing 'use_cache' field in serialized config"),
        ("enable_quality_processing", "Missing 'enable_quality_processing' field"),
        ("force_ocr", "Missing 'force_ocr' field in serialized config"),
        ("max_concurrent_extractions", "Missing 'max_concurrent_extractions' field"),
        ("result_format", "Missing 'result_format' field in serialized config"),
        ("output_format", "Missing 'output_format' field in serialized config"),
    ];

    for (field, message) in required {
        assert!(serialized.get(field).is_some(), "{}", message);
    }
}
|
|
44
|
+
|
|
45
|
+
/// Pins the default values of the scalar fields on `ExtractionConfig`.
#[test]
fn test_extraction_config_defaults_are_correct() {
    // Pull out only the fields under test; everything else is ignored here.
    let ExtractionConfig {
        use_cache,
        enable_quality_processing,
        force_ocr,
        max_concurrent_extractions,
        ..
    } = ExtractionConfig::default();

    assert!(use_cache, "Default use_cache should be true");
    assert!(
        enable_quality_processing,
        "Default enable_quality_processing should be true"
    );
    assert!(!force_ocr, "Default force_ocr should be false");
    assert_eq!(
        max_concurrent_extractions, None,
        "Default max_concurrent_extractions should be None"
    );
}
|
|
60
|
+
|
|
61
|
+
/// Verifies that a default `ExtractionConfig` is unchanged by a JSON string roundtrip.
#[test]
fn test_extraction_config_serialization_roundtrip() {
    let original = ExtractionConfig::default();

    // Encode to a JSON string, then decode that string back into a config.
    let encoded = serde_json::to_string(&original).expect("Failed to serialize");
    let restored: ExtractionConfig = serde_json::from_str(&encoded).expect("Failed to deserialize config from JSON");

    // Compares a single field of the original against the restored copy.
    macro_rules! assert_survives {
        ($field:ident, $message:literal) => {
            assert_eq!(original.$field, restored.$field, "{}", $message);
        };
    }

    assert_survives!(use_cache, "use_cache should survive roundtrip");
    assert_survives!(
        enable_quality_processing,
        "enable_quality_processing should survive roundtrip"
    );
    assert_survives!(force_ocr, "force_ocr should survive roundtrip");
    assert_survives!(result_format, "result_format should survive roundtrip");
    assert_survives!(output_format, "output_format should survive roundtrip");
}
|
|
94
|
+
|
|
95
|
+
/// Checks that a default `ExtractionConfig` serializes to a JSON object holding the core keys.
#[test]
fn test_extraction_config_json_structure() {
    let serialized = serde_json::to_value(&ExtractionConfig::default()).expect("Failed to serialize config");
    let top_level = serialized.as_object().expect("Config should serialize as object");

    // Keys that must be present at the top level of the serialized object.
    let required_keys = [
        "use_cache",
        "enable_quality_processing",
        "force_ocr",
        "max_concurrent_extractions",
        "result_format",
        "output_format",
    ];

    required_keys.iter().copied().for_each(|field| {
        assert!(top_level.contains_key(field), "Missing field in JSON: {}", field);
    });
}
|
|
116
|
+
|
|
117
|
+
/// Checks the JSON value type used for each core field of a serialized default config.
#[test]
fn test_extraction_config_values_are_correct_types() {
    let serialized = serde_json::to_value(&ExtractionConfig::default()).expect("Failed to serialize config");

    // Fields that must serialize as JSON booleans.
    let boolean_fields = [
        ("use_cache", "use_cache should be boolean"),
        ("enable_quality_processing", "enable_quality_processing should be boolean"),
        ("force_ocr", "force_ocr should be boolean"),
    ];
    for (field, message) in boolean_fields {
        let value = serialized.get(field).expect("Value not found");
        assert!(value.is_boolean(), "{}", message);
    }

    // Fields that must serialize as JSON strings.
    let string_fields = [
        ("result_format", "result_format should be string"),
        ("output_format", "output_format should be string"),
    ];
    for (field, message) in string_fields {
        let value = serialized.get(field).expect("Value not found");
        assert!(value.is_string(), "{}", message);
    }
}
|
|
146
|
+
|
|
147
|
+
/// Checks that non-default field values appear unchanged in the serialized JSON.
#[test]
fn test_extraction_config_with_custom_values() {
    let customized = ExtractionConfig {
        use_cache: false,
        force_ocr: true,
        max_concurrent_extractions: Some(8),
        ..ExtractionConfig::default()
    };
    let serialized = serde_json::to_value(&customized).expect("Failed to serialize");

    // Each overridden field with the JSON value it is expected to serialize to.
    let overrides = [
        ("use_cache", json!(false)),
        ("force_ocr", json!(true)),
        ("max_concurrent_extractions", json!(8)),
    ];

    for (field, expected) in overrides {
        assert_eq!(serialized.get(field).expect("Value not found"), &expected);
    }
}
|
|
165
|
+
|
|
166
|
+
/// Checks that a JSON object naming only some fields parses, with the rest taking defaults.
#[test]
fn test_extraction_config_partial_json_parsing() {
    // Only `use_cache` is supplied; every other field is left out on purpose.
    let parsed: ExtractionConfig =
        serde_json::from_value(json!({ "use_cache": false })).expect("Failed to parse partial config");

    let ExtractionConfig {
        use_cache,
        enable_quality_processing,
        force_ocr,
        ..
    } = parsed;

    assert!(!use_cache, "Explicit use_cache should be respected");
    assert!(
        enable_quality_processing,
        "Omitted enable_quality_processing should use default"
    );
    assert!(!force_ocr, "Omitted force_ocr should use default");
}
|
|
182
|
+
|
|
183
|
+
/// Checks that parsing `{}` yields the same core field values as `ExtractionConfig::default()`.
#[test]
fn test_extraction_config_empty_json_uses_defaults() {
    let from_empty: ExtractionConfig = serde_json::from_value(json!({})).expect("Failed to parse empty config");
    let baseline = ExtractionConfig::default();

    // Compares one field of the parsed config against the same field of the default config.
    macro_rules! assert_matches_default {
        ($field:ident) => {
            assert_eq!(from_empty.$field, baseline.$field);
        };
    }

    assert_matches_default!(use_cache);
    assert_matches_default!(enable_quality_processing);
    assert_matches_default!(force_ocr);
    assert_matches_default!(result_format);
    assert_matches_default!(output_format);
}
|
|
200
|
+
|
|
201
|
+
/// Checks that `output_format` accepts its lowercase spellings and maps each to the matching variant.
///
/// Only lowercase input is exercised here; this test makes no claim about other casings.
#[test]
fn test_extraction_config_output_format_valid_values() {
    // Input spelling, expected variant, and the message reported if parsing fails.
    let cases = [
        ("plain", OutputFormat::Plain, "Failed to parse plain output_format"),
        (
            "markdown",
            OutputFormat::Markdown,
            "Failed to parse markdown output_format",
        ),
        ("html", OutputFormat::Html, "Failed to parse html output_format"),
    ];

    for (spelling, expected, parse_failure) in cases {
        let parsed: ExtractionConfig =
            serde_json::from_value(json!({ "output_format": spelling })).expect(parse_failure);
        assert_eq!(parsed.output_format, expected);
    }
}
|
|
218
|
+
|
|
219
|
+
/// Checks that `result_format` accepts `"unified"` and that the parsed value survives a roundtrip.
///
/// `result_format` uses `types::OutputFormat`, not `core::config::OutputFormat`, so the
/// variant is not named here; the parsed value is compared against its own re-parsed form.
#[test]
fn test_extraction_config_result_format_valid_values() {
    let json_unified = json!({"result_format": "unified"});
    let config_unified: ExtractionConfig =
        serde_json::from_value(json_unified).expect("Failed to parse unified result_format");

    // Previously the parsed value was discarded; now it must at least reserialize and re-parse
    // to an equal value, so a silently ignored or mangled field would be caught.
    let reserialized = serde_json::to_value(&config_unified).expect("Failed to reserialize config");
    let reparsed: ExtractionConfig =
        serde_json::from_value(reserialized).expect("Failed to re-parse reserialized config");
    assert_eq!(
        config_unified.result_format, reparsed.result_format,
        "result_format should survive roundtrip"
    );
}
|
|
228
|
+
|
|
229
|
+
/// Checks that a serialized default config contains no top-level key outside the known set.
#[test]
fn test_extraction_config_no_unknown_fields_in_default() {
    // The full set of permitted keys (some may be null based on feature flags).
    const KNOWN_FIELDS: &[&str] = &[
        "use_cache",
        "enable_quality_processing",
        "ocr",
        "force_ocr",
        "chunking",
        "images",
        "pdf_options",
        "token_reduction",
        "language_detection",
        "pages",
        "keywords",
        "postprocessor",
        "html_options",
        "max_concurrent_extractions",
        "result_format",
        "output_format",
    ];

    let serialized = serde_json::to_value(&ExtractionConfig::default()).expect("Failed to serialize");
    let top_level = serialized.as_object().expect("Should be object");

    // Every key actually emitted must be one of the known fields.
    top_level.keys().for_each(|key| {
        assert!(
            KNOWN_FIELDS.contains(&key.as_str()),
            "Unexpected field in config: {}",
            key
        );
    });
}
|
|
264
|
+
|
|
265
|
+
/// Exercises `ExtractionConfig::needs_image_processing` for three configurations:
/// the default, one with OCR set, and one with image extraction set.
#[test]
fn test_extraction_config_needs_image_processing() {
    // Scenario 1: nothing configured.
    let plain = ExtractionConfig::default();
    assert!(
        !plain.needs_image_processing(),
        "Default config should not need image processing"
    );

    // Scenario 2: only OCR is configured.
    let with_ocr = ExtractionConfig {
        ocr: Some(kreuzberg::OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            tesseract_config: None,
            output_format: None,
        }),
        ..ExtractionConfig::default()
    };
    assert!(
        with_ocr.needs_image_processing(),
        "Config with OCR should need image processing"
    );

    // Scenario 3: only image extraction is configured (OCR left unset).
    let with_images = ExtractionConfig {
        ocr: None,
        images: Some(kreuzberg::ImageExtractionConfig {
            extract_images: true,
            target_dpi: 150,
            max_image_dimension: 2000,
            auto_adjust_dpi: true,
            min_dpi: 72,
            max_dpi: 600,
        }),
        ..ExtractionConfig::default()
    };
    assert!(
        with_images.needs_image_processing(),
        "Config with image extraction should need image processing"
    );
}
|
|
303
|
+
|
|
304
|
+
/// Checks that a config parsed with `"markdown"` serializes `output_format` back as `"markdown"`.
#[test]
fn test_output_format_serialization_lowercase() {
    let parsed: ExtractionConfig =
        serde_json::from_value(serde_json::json!({"output_format": "markdown"})).expect("Failed to parse");

    // The value written back out must be the same lowercase spelling that was read in.
    let written = serde_json::to_value(&parsed).expect("Failed to reserialize");
    assert_eq!(written["output_format"], "markdown");
}
|
|
314
|
+
|
|
315
|
+
/// Checks that changing a field value does not change which top-level keys are serialized.
///
/// The key lists themselves are compared, not just their lengths, so a config that drops
/// one key and gains another is reported as a mismatch.
#[test]
fn test_extraction_config_field_presence_consistency() {
    let config = ExtractionConfig::default();
    let json1 = serde_json::to_value(&config).expect("Failed to serialize");

    let config2 = ExtractionConfig {
        force_ocr: true,
        ..ExtractionConfig::default()
    };
    let json2 = serde_json::to_value(&config2).expect("Failed to serialize");

    // Both configs are the same type, so their keys are expected in the same order.
    let keys1: Vec<_> = json1.as_object().expect("Expected object value").keys().collect();
    let keys2: Vec<_> = json2.as_object().expect("Expected object value").keys().collect();

    assert_eq!(keys1, keys2, "Configs should have the same top-level keys");
}
|
|
333
|
+
|
|
334
|
+
/// Roundtrips each listed `OutputFormat` variant through JSON and checks it comes back unchanged.
#[test]
fn test_output_format_all_variants() {
    let variants = [
        OutputFormat::Plain,
        OutputFormat::Markdown,
        OutputFormat::Html,
        OutputFormat::Djot,
    ];

    for variant in variants {
        // Encode the variant as a JSON value, then decode that value again.
        let encoded = serde_json::to_value(variant).expect("Failed to serialize");
        let decoded: OutputFormat = serde_json::from_value(encoded).expect("Failed to deserialize");
        assert_eq!(variant, decoded, "Format should survive roundtrip");
    }
}
|
|
@@ -29,16 +29,20 @@ async fn test_embed_valid_texts() {
|
|
|
29
29
|
.method("POST")
|
|
30
30
|
.uri("/embed")
|
|
31
31
|
.header("content-type", "application/json")
|
|
32
|
-
.body(Body::from(
|
|
33
|
-
|
|
32
|
+
.body(Body::from(
|
|
33
|
+
serde_json::to_string(&request_body).expect("Operation failed"),
|
|
34
|
+
))
|
|
35
|
+
.expect("Operation failed"),
|
|
34
36
|
)
|
|
35
37
|
.await
|
|
36
|
-
.
|
|
38
|
+
.expect("Operation failed");
|
|
37
39
|
|
|
38
40
|
assert_eq!(response.status(), StatusCode::OK);
|
|
39
41
|
|
|
40
|
-
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
41
|
-
|
|
42
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
43
|
+
.await
|
|
44
|
+
.expect("Failed to convert to bytes");
|
|
45
|
+
let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
|
42
46
|
|
|
43
47
|
assert_eq!(embed_response.count, 2);
|
|
44
48
|
assert_eq!(embed_response.embeddings.len(), 2);
|
|
@@ -66,11 +70,13 @@ async fn test_embed_empty_texts() {
|
|
|
66
70
|
.method("POST")
|
|
67
71
|
.uri("/embed")
|
|
68
72
|
.header("content-type", "application/json")
|
|
69
|
-
.body(Body::from(
|
|
70
|
-
|
|
73
|
+
.body(Body::from(
|
|
74
|
+
serde_json::to_string(&request_body).expect("Operation failed"),
|
|
75
|
+
))
|
|
76
|
+
.expect("Operation failed"),
|
|
71
77
|
)
|
|
72
78
|
.await
|
|
73
|
-
.
|
|
79
|
+
.expect("Operation failed");
|
|
74
80
|
|
|
75
81
|
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
|
76
82
|
}
|
|
@@ -97,16 +103,20 @@ async fn test_embed_with_custom_config() {
|
|
|
97
103
|
.method("POST")
|
|
98
104
|
.uri("/embed")
|
|
99
105
|
.header("content-type", "application/json")
|
|
100
|
-
.body(Body::from(
|
|
101
|
-
|
|
106
|
+
.body(Body::from(
|
|
107
|
+
serde_json::to_string(&request_body).expect("Operation failed"),
|
|
108
|
+
))
|
|
109
|
+
.expect("Operation failed"),
|
|
102
110
|
)
|
|
103
111
|
.await
|
|
104
|
-
.
|
|
112
|
+
.expect("Operation failed");
|
|
105
113
|
|
|
106
114
|
assert_eq!(response.status(), StatusCode::OK);
|
|
107
115
|
|
|
108
|
-
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
109
|
-
|
|
116
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
117
|
+
.await
|
|
118
|
+
.expect("Failed to convert to bytes");
|
|
119
|
+
let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
|
110
120
|
|
|
111
121
|
assert_eq!(embed_response.count, 1);
|
|
112
122
|
assert_eq!(embed_response.embeddings.len(), 1);
|
|
@@ -128,16 +138,20 @@ async fn test_embed_single_text() {
|
|
|
128
138
|
.method("POST")
|
|
129
139
|
.uri("/embed")
|
|
130
140
|
.header("content-type", "application/json")
|
|
131
|
-
.body(Body::from(
|
|
132
|
-
|
|
141
|
+
.body(Body::from(
|
|
142
|
+
serde_json::to_string(&request_body).expect("Operation failed"),
|
|
143
|
+
))
|
|
144
|
+
.expect("Operation failed"),
|
|
133
145
|
)
|
|
134
146
|
.await
|
|
135
|
-
.
|
|
147
|
+
.expect("Operation failed");
|
|
136
148
|
|
|
137
149
|
assert_eq!(response.status(), StatusCode::OK);
|
|
138
150
|
|
|
139
|
-
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
140
|
-
|
|
151
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
152
|
+
.await
|
|
153
|
+
.expect("Failed to convert to bytes");
|
|
154
|
+
let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
|
141
155
|
|
|
142
156
|
assert_eq!(embed_response.count, 1);
|
|
143
157
|
assert_eq!(embed_response.embeddings.len(), 1);
|
|
@@ -160,16 +174,20 @@ async fn test_embed_batch() {
|
|
|
160
174
|
.method("POST")
|
|
161
175
|
.uri("/embed")
|
|
162
176
|
.header("content-type", "application/json")
|
|
163
|
-
.body(Body::from(
|
|
164
|
-
|
|
177
|
+
.body(Body::from(
|
|
178
|
+
serde_json::to_string(&request_body).expect("Operation failed"),
|
|
179
|
+
))
|
|
180
|
+
.expect("Operation failed"),
|
|
165
181
|
)
|
|
166
182
|
.await
|
|
167
|
-
.
|
|
183
|
+
.expect("Operation failed");
|
|
168
184
|
|
|
169
185
|
assert_eq!(response.status(), StatusCode::OK);
|
|
170
186
|
|
|
171
|
-
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
172
|
-
|
|
187
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
188
|
+
.await
|
|
189
|
+
.expect("Failed to convert to bytes");
|
|
190
|
+
let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
|
173
191
|
|
|
174
192
|
assert_eq!(embed_response.count, 10);
|
|
175
193
|
assert_eq!(embed_response.embeddings.len(), 10);
|
|
@@ -198,16 +216,20 @@ async fn test_embed_long_text() {
|
|
|
198
216
|
.method("POST")
|
|
199
217
|
.uri("/embed")
|
|
200
218
|
.header("content-type", "application/json")
|
|
201
|
-
.body(Body::from(
|
|
202
|
-
|
|
219
|
+
.body(Body::from(
|
|
220
|
+
serde_json::to_string(&request_body).expect("Operation failed"),
|
|
221
|
+
))
|
|
222
|
+
.expect("Operation failed"),
|
|
203
223
|
)
|
|
204
224
|
.await
|
|
205
|
-
.
|
|
225
|
+
.expect("Operation failed");
|
|
206
226
|
|
|
207
227
|
assert_eq!(response.status(), StatusCode::OK);
|
|
208
228
|
|
|
209
|
-
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
210
|
-
|
|
229
|
+
let body = axum::body::to_bytes(response.into_body(), usize::MAX)
|
|
230
|
+
.await
|
|
231
|
+
.expect("Failed to convert to bytes");
|
|
232
|
+
let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
|
|
211
233
|
|
|
212
234
|
assert_eq!(embed_response.count, 1);
|
|
213
235
|
assert_eq!(embed_response.embeddings.len(), 1);
|
|
@@ -225,10 +247,10 @@ async fn test_embed_malformed_json() {
|
|
|
225
247
|
.uri("/embed")
|
|
226
248
|
.header("content-type", "application/json")
|
|
227
249
|
.body(Body::from("{invalid json}"))
|
|
228
|
-
.
|
|
250
|
+
.expect("Operation failed"),
|
|
229
251
|
)
|
|
230
252
|
.await
|
|
231
|
-
.
|
|
253
|
+
.expect("Operation failed");
|
|
232
254
|
|
|
233
255
|
assert_eq!(response.status(), StatusCode::BAD_REQUEST);
|
|
234
256
|
}
|
|
@@ -250,16 +272,20 @@ async fn test_embed_deterministic() {
|
|
|
250
272
|
.method("POST")
|
|
251
273
|
.uri("/embed")
|
|
252
274
|
.header("content-type", "application/json")
|
|
253
|
-
.body(Body::from(
|
|
254
|
-
|
|
275
|
+
.body(Body::from(
|
|
276
|
+
serde_json::to_string(&request_body).expect("Operation failed"),
|
|
277
|
+
))
|
|
278
|
+
.expect("Operation failed"),
|
|
255
279
|
)
|
|
256
280
|
.await
|
|
257
|
-
.
|
|
281
|
+
.expect("Operation failed");
|
|
258
282
|
|
|
259
283
|
assert_eq!(response1.status(), StatusCode::OK);
|
|
260
284
|
|
|
261
|
-
let body1 = axum::body::to_bytes(response1.into_body(), usize::MAX)
|
|
262
|
-
|
|
285
|
+
let body1 = axum::body::to_bytes(response1.into_body(), usize::MAX)
|
|
286
|
+
.await
|
|
287
|
+
.expect("Failed to convert to bytes");
|
|
288
|
+
let embed_response1: EmbedResponse = serde_json::from_slice(&body1).expect("Failed to deserialize");
|
|
263
289
|
|
|
264
290
|
// Second call with same text
|
|
265
291
|
let response2 = app
|
|
@@ -268,16 +294,20 @@ async fn test_embed_deterministic() {
|
|
|
268
294
|
.method("POST")
|
|
269
295
|
.uri("/embed")
|
|
270
296
|
.header("content-type", "application/json")
|
|
271
|
-
.body(Body::from(
|
|
272
|
-
|
|
297
|
+
.body(Body::from(
|
|
298
|
+
serde_json::to_string(&request_body).expect("Operation failed"),
|
|
299
|
+
))
|
|
300
|
+
.expect("Operation failed"),
|
|
273
301
|
)
|
|
274
302
|
.await
|
|
275
|
-
.
|
|
303
|
+
.expect("Operation failed");
|
|
276
304
|
|
|
277
305
|
assert_eq!(response2.status(), StatusCode::OK);
|
|
278
306
|
|
|
279
|
-
let body2 = axum::body::to_bytes(response2.into_body(), usize::MAX)
|
|
280
|
-
|
|
307
|
+
let body2 = axum::body::to_bytes(response2.into_body(), usize::MAX)
|
|
308
|
+
.await
|
|
309
|
+
.expect("Failed to convert to bytes");
|
|
310
|
+
let embed_response2: EmbedResponse = serde_json::from_slice(&body2).expect("Failed to deserialize");
|
|
281
311
|
|
|
282
312
|
// Compare embeddings - they should be identical
|
|
283
313
|
assert_eq!(embed_response1.embeddings.len(), embed_response2.embeddings.len());
|
|
@@ -307,18 +337,20 @@ async fn test_embed_different_presets() {
|
|
|
307
337
|
.method("POST")
|
|
308
338
|
.uri("/embed")
|
|
309
339
|
.header("content-type", "application/json")
|
|
310
|
-
.body(Body::from(
|
|
311
|
-
|
|
340
|
+
.body(Body::from(
|
|
341
|
+
serde_json::to_string(&request_fast).expect("Operation failed"),
|
|
342
|
+
))
|
|
343
|
+
.expect("Operation failed"),
|
|
312
344
|
)
|
|
313
345
|
.await
|
|
314
|
-
.
|
|
346
|
+
.expect("Operation failed");
|
|
315
347
|
|
|
316
348
|
assert_eq!(response_fast.status(), StatusCode::OK);
|
|
317
349
|
|
|
318
350
|
let body_fast = axum::body::to_bytes(response_fast.into_body(), usize::MAX)
|
|
319
351
|
.await
|
|
320
|
-
.
|
|
321
|
-
let embed_fast: EmbedResponse = serde_json::from_slice(&body_fast).
|
|
352
|
+
.expect("Operation failed");
|
|
353
|
+
let embed_fast: EmbedResponse = serde_json::from_slice(&body_fast).expect("Failed to deserialize");
|
|
322
354
|
|
|
323
355
|
// Test with "balanced" preset
|
|
324
356
|
let request_balanced = json!({
|
|
@@ -337,18 +369,20 @@ async fn test_embed_different_presets() {
|
|
|
337
369
|
.method("POST")
|
|
338
370
|
.uri("/embed")
|
|
339
371
|
.header("content-type", "application/json")
|
|
340
|
-
.body(Body::from(
|
|
341
|
-
|
|
372
|
+
.body(Body::from(
|
|
373
|
+
serde_json::to_string(&request_balanced).expect("Operation failed"),
|
|
374
|
+
))
|
|
375
|
+
.expect("Operation failed"),
|
|
342
376
|
)
|
|
343
377
|
.await
|
|
344
|
-
.
|
|
378
|
+
.expect("Operation failed");
|
|
345
379
|
|
|
346
380
|
assert_eq!(response_balanced.status(), StatusCode::OK);
|
|
347
381
|
|
|
348
382
|
let body_balanced = axum::body::to_bytes(response_balanced.into_body(), usize::MAX)
|
|
349
383
|
.await
|
|
350
|
-
.
|
|
351
|
-
let embed_balanced: EmbedResponse = serde_json::from_slice(&body_balanced).
|
|
384
|
+
.expect("Operation failed");
|
|
385
|
+
let embed_balanced: EmbedResponse = serde_json::from_slice(&body_balanced).expect("Failed to deserialize");
|
|
352
386
|
|
|
353
387
|
// Different presets should have different dimensions
|
|
354
388
|
assert_ne!(embed_fast.dimensions, embed_balanced.dimensions);
|
|
@@ -93,7 +93,10 @@ startxref
|
|
|
93
93
|
.expect("Failed to read response body");
|
|
94
94
|
|
|
95
95
|
let parsed: Value = serde_json::from_slice(&body).expect("Failed to parse response");
|
|
96
|
-
eprintln!(
|
|
96
|
+
eprintln!(
|
|
97
|
+
"Extraction result: {}",
|
|
98
|
+
serde_json::to_string_pretty(&parsed).expect("Failed to parse")
|
|
99
|
+
);
|
|
97
100
|
}
|
|
98
101
|
|
|
99
102
|
/// Test extracting a 1MB text file (control test without PDF).
|
|
@@ -187,7 +190,10 @@ async fn test_find_size_breaking_point() {
|
|
|
187
190
|
.expect("Failed to read response body");
|
|
188
191
|
|
|
189
192
|
if let Ok(parsed) = serde_json::from_slice::<Value>(&body) {
|
|
190
|
-
eprintln!(
|
|
193
|
+
eprintln!(
|
|
194
|
+
"Error response: {}",
|
|
195
|
+
serde_json::to_string_pretty(&parsed).expect("Failed to parse")
|
|
196
|
+
);
|
|
191
197
|
} else {
|
|
192
198
|
eprintln!("Response body (not JSON): {}", String::from_utf8_lossy(&body));
|
|
193
199
|
}
|