kreuzberg 4.1.2 → 4.2.1

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (103) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
  5. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  6. data/kreuzberg.gemspec +13 -1
  7. data/lib/kreuzberg/cli.rb +16 -6
  8. data/lib/kreuzberg/cli_proxy.rb +3 -1
  9. data/lib/kreuzberg/config.rb +121 -39
  10. data/lib/kreuzberg/djot_content.rb +225 -0
  11. data/lib/kreuzberg/extraction_api.rb +20 -4
  12. data/lib/kreuzberg/result.rb +12 -2
  13. data/lib/kreuzberg/version.rb +1 -1
  14. data/lib/kreuzberg.rb +1 -0
  15. data/sig/kreuzberg.rbs +28 -12
  16. data/spec/binding/batch_operations_spec.rb +80 -0
  17. data/spec/binding/batch_spec.rb +6 -5
  18. data/spec/binding/error_recovery_spec.rb +3 -3
  19. data/spec/binding/metadata_types_spec.rb +77 -57
  20. data/spec/binding/tables_spec.rb +11 -2
  21. data/spec/serialization_spec.rb +134 -0
  22. data/spec/unit/config/output_format_spec.rb +380 -0
  23. data/vendor/Cargo.toml +1 -1
  24. data/vendor/kreuzberg/Cargo.toml +1 -1
  25. data/vendor/kreuzberg/README.md +1 -1
  26. data/vendor/kreuzberg/src/api/startup.rs +15 -1
  27. data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
  28. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
  29. data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
  30. data/vendor/kreuzberg/src/core/io.rs +7 -7
  31. data/vendor/kreuzberg/src/core/mime.rs +4 -4
  32. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  33. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
  34. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  35. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  36. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  37. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  38. data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
  39. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
  40. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
  41. data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
  42. data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
  43. data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
  44. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  45. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  46. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  47. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  48. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  49. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  50. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  51. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  52. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  53. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  54. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  55. data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
  56. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  57. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  58. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  59. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  60. data/vendor/kreuzberg/tests/core_integration.rs +57 -57
  61. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  62. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  63. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  64. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  65. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  66. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  67. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  68. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  69. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  70. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  71. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  72. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  73. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  74. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  75. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  76. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  77. data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
  78. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  79. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  80. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  81. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  82. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  83. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  84. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  85. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  86. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  87. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  88. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  89. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  90. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  91. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  92. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
  93. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  94. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  95. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  96. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  97. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  98. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  99. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  100. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  101. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  102. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  103. metadata +12 -2
@@ -0,0 +1,349 @@
1
+ //! API consistency tests for ExtractionConfig and related types.
2
+ //!
3
+ //! This test suite validates that:
4
+ //! 1. ExtractionConfig serialization is complete with all fields
5
+ //! 2. All required configuration fields are present
6
+ //! 3. Configuration types maintain consistency across different formats
7
+ //! 4. No configuration fields are accidentally hidden or lost
8
+
9
+ use kreuzberg::core::config::ExtractionConfig;
10
+ use kreuzberg::core::config::OutputFormat;
11
+ use serde_json::json;
12
+
13
+ #[test]
14
+ fn test_extraction_config_serialization_includes_all_fields() {
15
+ let config = ExtractionConfig::default();
16
+ let json = serde_json::to_value(&config).expect("Failed to serialize config");
17
+
18
+ // Verify core fields exist and are accessible
19
+ assert!(
20
+ json.get("use_cache").is_some(),
21
+ "Missing 'use_cache' field in serialized config"
22
+ );
23
+ assert!(
24
+ json.get("enable_quality_processing").is_some(),
25
+ "Missing 'enable_quality_processing' field"
26
+ );
27
+ assert!(
28
+ json.get("force_ocr").is_some(),
29
+ "Missing 'force_ocr' field in serialized config"
30
+ );
31
+ assert!(
32
+ json.get("max_concurrent_extractions").is_some(),
33
+ "Missing 'max_concurrent_extractions' field"
34
+ );
35
+ assert!(
36
+ json.get("result_format").is_some(),
37
+ "Missing 'result_format' field in serialized config"
38
+ );
39
+ assert!(
40
+ json.get("output_format").is_some(),
41
+ "Missing 'output_format' field in serialized config"
42
+ );
43
+ }
44
+
45
+ #[test]
46
+ fn test_extraction_config_defaults_are_correct() {
47
+ let config = ExtractionConfig::default();
48
+
49
+ assert!(config.use_cache, "Default use_cache should be true");
50
+ assert!(
51
+ config.enable_quality_processing,
52
+ "Default enable_quality_processing should be true"
53
+ );
54
+ assert!(!config.force_ocr, "Default force_ocr should be false");
55
+ assert_eq!(
56
+ config.max_concurrent_extractions, None,
57
+ "Default max_concurrent_extractions should be None"
58
+ );
59
+ }
60
+
61
+ #[test]
62
+ fn test_extraction_config_serialization_roundtrip() {
63
+ let config = ExtractionConfig::default();
64
+
65
+ // Serialize to JSON
66
+ let json_string = serde_json::to_string(&config).expect("Failed to serialize");
67
+
68
+ // Deserialize back
69
+ let deserialized: ExtractionConfig =
70
+ serde_json::from_str(&json_string).expect("Failed to deserialize config from JSON");
71
+
72
+ // Verify roundtrip integrity
73
+ assert_eq!(
74
+ config.use_cache, deserialized.use_cache,
75
+ "use_cache should survive roundtrip"
76
+ );
77
+ assert_eq!(
78
+ config.enable_quality_processing, deserialized.enable_quality_processing,
79
+ "enable_quality_processing should survive roundtrip"
80
+ );
81
+ assert_eq!(
82
+ config.force_ocr, deserialized.force_ocr,
83
+ "force_ocr should survive roundtrip"
84
+ );
85
+ assert_eq!(
86
+ config.result_format, deserialized.result_format,
87
+ "result_format should survive roundtrip"
88
+ );
89
+ assert_eq!(
90
+ config.output_format, deserialized.output_format,
91
+ "output_format should survive roundtrip"
92
+ );
93
+ }
94
+
95
+ #[test]
96
+ fn test_extraction_config_json_structure() {
97
+ let config = ExtractionConfig::default();
98
+ let json = serde_json::to_value(&config).expect("Failed to serialize config");
99
+
100
+ let obj = json.as_object().expect("Config should serialize as object");
101
+
102
+ // Verify all expected fields are present as keys
103
+ let expected_fields = vec![
104
+ "use_cache",
105
+ "enable_quality_processing",
106
+ "force_ocr",
107
+ "max_concurrent_extractions",
108
+ "result_format",
109
+ "output_format",
110
+ ];
111
+
112
+ for field in expected_fields {
113
+ assert!(obj.contains_key(field), "Missing field in JSON: {}", field);
114
+ }
115
+ }
116
+
117
+ #[test]
118
+ fn test_extraction_config_values_are_correct_types() {
119
+ let config = ExtractionConfig::default();
120
+ let json = serde_json::to_value(&config).expect("Failed to serialize config");
121
+
122
+ // Verify field types
123
+ assert!(
124
+ json.get("use_cache").expect("Value not found").is_boolean(),
125
+ "use_cache should be boolean"
126
+ );
127
+ assert!(
128
+ json.get("enable_quality_processing")
129
+ .expect("Value not found")
130
+ .is_boolean(),
131
+ "enable_quality_processing should be boolean"
132
+ );
133
+ assert!(
134
+ json.get("force_ocr").expect("Value not found").is_boolean(),
135
+ "force_ocr should be boolean"
136
+ );
137
+ assert!(
138
+ json.get("result_format").expect("Value not found").is_string(),
139
+ "result_format should be string"
140
+ );
141
+ assert!(
142
+ json.get("output_format").expect("Value not found").is_string(),
143
+ "output_format should be string"
144
+ );
145
+ }
146
+
147
+ #[test]
148
+ fn test_extraction_config_with_custom_values() {
149
+ let config = ExtractionConfig {
150
+ use_cache: false,
151
+ force_ocr: true,
152
+ max_concurrent_extractions: Some(8),
153
+ ..ExtractionConfig::default()
154
+ };
155
+
156
+ let json = serde_json::to_value(&config).expect("Failed to serialize");
157
+
158
+ assert_eq!(json.get("use_cache").expect("Value not found"), &json!(false));
159
+ assert_eq!(json.get("force_ocr").expect("Value not found"), &json!(true));
160
+ assert_eq!(
161
+ json.get("max_concurrent_extractions").expect("Value not found"),
162
+ &json!(8)
163
+ );
164
+ }
165
+
166
+ #[test]
167
+ fn test_extraction_config_partial_json_parsing() {
168
+ // Test that we can parse partial JSON and fields get defaults
169
+ let partial_json = json!({
170
+ "use_cache": false,
171
+ });
172
+
173
+ let config: ExtractionConfig = serde_json::from_value(partial_json).expect("Failed to parse partial config");
174
+
175
+ assert!(!config.use_cache, "Explicit use_cache should be respected");
176
+ assert!(
177
+ config.enable_quality_processing,
178
+ "Omitted enable_quality_processing should use default"
179
+ );
180
+ assert!(!config.force_ocr, "Omitted force_ocr should use default");
181
+ }
182
+
183
+ #[test]
184
+ fn test_extraction_config_empty_json_uses_defaults() {
185
+ // Empty object should use all defaults
186
+ let empty_json = json!({});
187
+
188
+ let config: ExtractionConfig = serde_json::from_value(empty_json).expect("Failed to parse empty config");
189
+
190
+ let default_config = ExtractionConfig::default();
191
+ assert_eq!(config.use_cache, default_config.use_cache);
192
+ assert_eq!(
193
+ config.enable_quality_processing,
194
+ default_config.enable_quality_processing
195
+ );
196
+ assert_eq!(config.force_ocr, default_config.force_ocr);
197
+ assert_eq!(config.result_format, default_config.result_format);
198
+ assert_eq!(config.output_format, default_config.output_format);
199
+ }
200
+
201
+ #[test]
202
+ fn test_extraction_config_output_format_valid_values() {
203
+ // Test that output_format accepts valid values (case-insensitive)
204
+ let json_plain = json!({"output_format": "plain"});
205
+ let config_plain: ExtractionConfig =
206
+ serde_json::from_value(json_plain).expect("Failed to parse plain output_format");
207
+ assert_eq!(config_plain.output_format, OutputFormat::Plain);
208
+
209
+ let json_markdown = json!({"output_format": "markdown"});
210
+ let config_markdown: ExtractionConfig =
211
+ serde_json::from_value(json_markdown).expect("Failed to parse markdown output_format");
212
+ assert_eq!(config_markdown.output_format, OutputFormat::Markdown);
213
+
214
+ let json_html = json!({"output_format": "html"});
215
+ let config_html: ExtractionConfig = serde_json::from_value(json_html).expect("Failed to parse html output_format");
216
+ assert_eq!(config_html.output_format, OutputFormat::Html);
217
+ }
218
+
219
+ #[test]
220
+ fn test_extraction_config_result_format_valid_values() {
221
+ // Test that result_format accepts valid values
222
+ let json_unified = json!({"result_format": "unified"});
223
+ let config_unified: ExtractionConfig =
224
+ serde_json::from_value(json_unified).expect("Failed to parse unified result_format");
225
+ // result_format uses types::OutputFormat, not core::config::OutputFormat
226
+ let _ = config_unified.result_format;
227
+ }
228
+
229
+ #[test]
230
+ fn test_extraction_config_no_unknown_fields_in_default() {
231
+ // Verify that the default config only has expected fields when serialized
232
+ let config = ExtractionConfig::default();
233
+ let json = serde_json::to_value(&config).expect("Failed to serialize");
234
+ let obj = json.as_object().expect("Should be object");
235
+
236
+ // These are the fields we expect (some may be null based on feature flags)
237
+ let expected_fields = vec![
238
+ "use_cache",
239
+ "enable_quality_processing",
240
+ "ocr",
241
+ "force_ocr",
242
+ "chunking",
243
+ "images",
244
+ "pdf_options",
245
+ "token_reduction",
246
+ "language_detection",
247
+ "pages",
248
+ "keywords",
249
+ "postprocessor",
250
+ "html_options",
251
+ "max_concurrent_extractions",
252
+ "result_format",
253
+ "output_format",
254
+ ];
255
+
256
+ for key in obj.keys() {
257
+ assert!(
258
+ expected_fields.contains(&key.as_str()),
259
+ "Unexpected field in config: {}",
260
+ key
261
+ );
262
+ }
263
+ }
264
+
265
+ #[test]
266
+ fn test_extraction_config_needs_image_processing() {
267
+ // Test the needs_image_processing helper method
268
+ let mut config = ExtractionConfig::default();
269
+
270
+ // By default, should not need image processing
271
+ assert!(
272
+ !config.needs_image_processing(),
273
+ "Default config should not need image processing"
274
+ );
275
+
276
+ // With OCR enabled, should need image processing
277
+ config.ocr = Some(kreuzberg::OcrConfig {
278
+ backend: "tesseract".to_string(),
279
+ language: "eng".to_string(),
280
+ tesseract_config: None,
281
+ output_format: None,
282
+ });
283
+ assert!(
284
+ config.needs_image_processing(),
285
+ "Config with OCR should need image processing"
286
+ );
287
+
288
+ // Reset for next test
289
+ config.ocr = None;
290
+ config.images = Some(kreuzberg::ImageExtractionConfig {
291
+ extract_images: true,
292
+ target_dpi: 150,
293
+ max_image_dimension: 2000,
294
+ auto_adjust_dpi: true,
295
+ min_dpi: 72,
296
+ max_dpi: 600,
297
+ });
298
+ assert!(
299
+ config.needs_image_processing(),
300
+ "Config with image extraction should need image processing"
301
+ );
302
+ }
303
+
304
+ #[test]
305
+ fn test_output_format_serialization_lowercase() {
306
+ // Verify that OutputFormat serializes to lowercase values
307
+ let json = serde_json::json!({"output_format": "markdown"});
308
+ let config: ExtractionConfig = serde_json::from_value(json).expect("Failed to parse");
309
+ let reserialized = serde_json::to_value(&config).expect("Failed to reserialize");
310
+
311
+ // Should serialize back to lowercase
312
+ assert_eq!(reserialized["output_format"], "markdown");
313
+ }
314
+
315
+ #[test]
316
+ fn test_extraction_config_field_presence_consistency() {
317
+ // Test that all serialized configs have the expected top-level fields
318
+ let config = ExtractionConfig::default();
319
+ let json1 = serde_json::to_value(&config).expect("Failed to serialize");
320
+
321
+ let config2 = ExtractionConfig {
322
+ force_ocr: true,
323
+ ..ExtractionConfig::default()
324
+ };
325
+ let json2 = serde_json::to_value(&config2).expect("Failed to serialize");
326
+
327
+ // Both should have the same top-level keys
328
+ let keys1: Vec<_> = json1.as_object().expect("Expected object value").keys().collect();
329
+ let keys2: Vec<_> = json2.as_object().expect("Expected object value").keys().collect();
330
+
331
+ assert_eq!(keys1.len(), keys2.len(), "Configs should have same number of keys");
332
+ }
333
+
334
+ #[test]
335
+ fn test_output_format_all_variants() {
336
+ // Test all output format variants can be serialized and deserialized
337
+ let formats = vec![
338
+ OutputFormat::Plain,
339
+ OutputFormat::Markdown,
340
+ OutputFormat::Html,
341
+ OutputFormat::Djot,
342
+ ];
343
+
344
+ for fmt in formats {
345
+ let serialized = serde_json::to_value(fmt).expect("Failed to serialize");
346
+ let deserialized: OutputFormat = serde_json::from_value(serialized).expect("Failed to deserialize");
347
+ assert_eq!(fmt, deserialized, "Format should survive roundtrip");
348
+ }
349
+ }
@@ -29,16 +29,20 @@ async fn test_embed_valid_texts() {
29
29
  .method("POST")
30
30
  .uri("/embed")
31
31
  .header("content-type", "application/json")
32
- .body(Body::from(serde_json::to_string(&request_body).unwrap()))
33
- .unwrap(),
32
+ .body(Body::from(
33
+ serde_json::to_string(&request_body).expect("Operation failed"),
34
+ ))
35
+ .expect("Operation failed"),
34
36
  )
35
37
  .await
36
- .unwrap();
38
+ .expect("Operation failed");
37
39
 
38
40
  assert_eq!(response.status(), StatusCode::OK);
39
41
 
40
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
41
- let embed_response: EmbedResponse = serde_json::from_slice(&body).unwrap();
42
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
43
+ .await
44
+ .expect("Failed to convert to bytes");
45
+ let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
42
46
 
43
47
  assert_eq!(embed_response.count, 2);
44
48
  assert_eq!(embed_response.embeddings.len(), 2);
@@ -66,11 +70,13 @@ async fn test_embed_empty_texts() {
66
70
  .method("POST")
67
71
  .uri("/embed")
68
72
  .header("content-type", "application/json")
69
- .body(Body::from(serde_json::to_string(&request_body).unwrap()))
70
- .unwrap(),
73
+ .body(Body::from(
74
+ serde_json::to_string(&request_body).expect("Operation failed"),
75
+ ))
76
+ .expect("Operation failed"),
71
77
  )
72
78
  .await
73
- .unwrap();
79
+ .expect("Operation failed");
74
80
 
75
81
  assert_eq!(response.status(), StatusCode::BAD_REQUEST);
76
82
  }
@@ -97,16 +103,20 @@ async fn test_embed_with_custom_config() {
97
103
  .method("POST")
98
104
  .uri("/embed")
99
105
  .header("content-type", "application/json")
100
- .body(Body::from(serde_json::to_string(&request_body).unwrap()))
101
- .unwrap(),
106
+ .body(Body::from(
107
+ serde_json::to_string(&request_body).expect("Operation failed"),
108
+ ))
109
+ .expect("Operation failed"),
102
110
  )
103
111
  .await
104
- .unwrap();
112
+ .expect("Operation failed");
105
113
 
106
114
  assert_eq!(response.status(), StatusCode::OK);
107
115
 
108
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
109
- let embed_response: EmbedResponse = serde_json::from_slice(&body).unwrap();
116
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
117
+ .await
118
+ .expect("Failed to convert to bytes");
119
+ let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
110
120
 
111
121
  assert_eq!(embed_response.count, 1);
112
122
  assert_eq!(embed_response.embeddings.len(), 1);
@@ -128,16 +138,20 @@ async fn test_embed_single_text() {
128
138
  .method("POST")
129
139
  .uri("/embed")
130
140
  .header("content-type", "application/json")
131
- .body(Body::from(serde_json::to_string(&request_body).unwrap()))
132
- .unwrap(),
141
+ .body(Body::from(
142
+ serde_json::to_string(&request_body).expect("Operation failed"),
143
+ ))
144
+ .expect("Operation failed"),
133
145
  )
134
146
  .await
135
- .unwrap();
147
+ .expect("Operation failed");
136
148
 
137
149
  assert_eq!(response.status(), StatusCode::OK);
138
150
 
139
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
140
- let embed_response: EmbedResponse = serde_json::from_slice(&body).unwrap();
151
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
152
+ .await
153
+ .expect("Failed to convert to bytes");
154
+ let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
141
155
 
142
156
  assert_eq!(embed_response.count, 1);
143
157
  assert_eq!(embed_response.embeddings.len(), 1);
@@ -160,16 +174,20 @@ async fn test_embed_batch() {
160
174
  .method("POST")
161
175
  .uri("/embed")
162
176
  .header("content-type", "application/json")
163
- .body(Body::from(serde_json::to_string(&request_body).unwrap()))
164
- .unwrap(),
177
+ .body(Body::from(
178
+ serde_json::to_string(&request_body).expect("Operation failed"),
179
+ ))
180
+ .expect("Operation failed"),
165
181
  )
166
182
  .await
167
- .unwrap();
183
+ .expect("Operation failed");
168
184
 
169
185
  assert_eq!(response.status(), StatusCode::OK);
170
186
 
171
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
172
- let embed_response: EmbedResponse = serde_json::from_slice(&body).unwrap();
187
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
188
+ .await
189
+ .expect("Failed to convert to bytes");
190
+ let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
173
191
 
174
192
  assert_eq!(embed_response.count, 10);
175
193
  assert_eq!(embed_response.embeddings.len(), 10);
@@ -198,16 +216,20 @@ async fn test_embed_long_text() {
198
216
  .method("POST")
199
217
  .uri("/embed")
200
218
  .header("content-type", "application/json")
201
- .body(Body::from(serde_json::to_string(&request_body).unwrap()))
202
- .unwrap(),
219
+ .body(Body::from(
220
+ serde_json::to_string(&request_body).expect("Operation failed"),
221
+ ))
222
+ .expect("Operation failed"),
203
223
  )
204
224
  .await
205
- .unwrap();
225
+ .expect("Operation failed");
206
226
 
207
227
  assert_eq!(response.status(), StatusCode::OK);
208
228
 
209
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
210
- let embed_response: EmbedResponse = serde_json::from_slice(&body).unwrap();
229
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
230
+ .await
231
+ .expect("Failed to convert to bytes");
232
+ let embed_response: EmbedResponse = serde_json::from_slice(&body).expect("Failed to deserialize");
211
233
 
212
234
  assert_eq!(embed_response.count, 1);
213
235
  assert_eq!(embed_response.embeddings.len(), 1);
@@ -225,10 +247,10 @@ async fn test_embed_malformed_json() {
225
247
  .uri("/embed")
226
248
  .header("content-type", "application/json")
227
249
  .body(Body::from("{invalid json}"))
228
- .unwrap(),
250
+ .expect("Operation failed"),
229
251
  )
230
252
  .await
231
- .unwrap();
253
+ .expect("Operation failed");
232
254
 
233
255
  assert_eq!(response.status(), StatusCode::BAD_REQUEST);
234
256
  }
@@ -250,16 +272,20 @@ async fn test_embed_deterministic() {
250
272
  .method("POST")
251
273
  .uri("/embed")
252
274
  .header("content-type", "application/json")
253
- .body(Body::from(serde_json::to_string(&request_body).unwrap()))
254
- .unwrap(),
275
+ .body(Body::from(
276
+ serde_json::to_string(&request_body).expect("Operation failed"),
277
+ ))
278
+ .expect("Operation failed"),
255
279
  )
256
280
  .await
257
- .unwrap();
281
+ .expect("Operation failed");
258
282
 
259
283
  assert_eq!(response1.status(), StatusCode::OK);
260
284
 
261
- let body1 = axum::body::to_bytes(response1.into_body(), usize::MAX).await.unwrap();
262
- let embed_response1: EmbedResponse = serde_json::from_slice(&body1).unwrap();
285
+ let body1 = axum::body::to_bytes(response1.into_body(), usize::MAX)
286
+ .await
287
+ .expect("Failed to convert to bytes");
288
+ let embed_response1: EmbedResponse = serde_json::from_slice(&body1).expect("Failed to deserialize");
263
289
 
264
290
  // Second call with same text
265
291
  let response2 = app
@@ -268,16 +294,20 @@ async fn test_embed_deterministic() {
268
294
  .method("POST")
269
295
  .uri("/embed")
270
296
  .header("content-type", "application/json")
271
- .body(Body::from(serde_json::to_string(&request_body).unwrap()))
272
- .unwrap(),
297
+ .body(Body::from(
298
+ serde_json::to_string(&request_body).expect("Operation failed"),
299
+ ))
300
+ .expect("Operation failed"),
273
301
  )
274
302
  .await
275
- .unwrap();
303
+ .expect("Operation failed");
276
304
 
277
305
  assert_eq!(response2.status(), StatusCode::OK);
278
306
 
279
- let body2 = axum::body::to_bytes(response2.into_body(), usize::MAX).await.unwrap();
280
- let embed_response2: EmbedResponse = serde_json::from_slice(&body2).unwrap();
307
+ let body2 = axum::body::to_bytes(response2.into_body(), usize::MAX)
308
+ .await
309
+ .expect("Failed to convert to bytes");
310
+ let embed_response2: EmbedResponse = serde_json::from_slice(&body2).expect("Failed to deserialize");
281
311
 
282
312
  // Compare embeddings - they should be identical
283
313
  assert_eq!(embed_response1.embeddings.len(), embed_response2.embeddings.len());
@@ -307,18 +337,20 @@ async fn test_embed_different_presets() {
307
337
  .method("POST")
308
338
  .uri("/embed")
309
339
  .header("content-type", "application/json")
310
- .body(Body::from(serde_json::to_string(&request_fast).unwrap()))
311
- .unwrap(),
340
+ .body(Body::from(
341
+ serde_json::to_string(&request_fast).expect("Operation failed"),
342
+ ))
343
+ .expect("Operation failed"),
312
344
  )
313
345
  .await
314
- .unwrap();
346
+ .expect("Operation failed");
315
347
 
316
348
  assert_eq!(response_fast.status(), StatusCode::OK);
317
349
 
318
350
  let body_fast = axum::body::to_bytes(response_fast.into_body(), usize::MAX)
319
351
  .await
320
- .unwrap();
321
- let embed_fast: EmbedResponse = serde_json::from_slice(&body_fast).unwrap();
352
+ .expect("Operation failed");
353
+ let embed_fast: EmbedResponse = serde_json::from_slice(&body_fast).expect("Failed to deserialize");
322
354
 
323
355
  // Test with "balanced" preset
324
356
  let request_balanced = json!({
@@ -337,18 +369,20 @@ async fn test_embed_different_presets() {
337
369
  .method("POST")
338
370
  .uri("/embed")
339
371
  .header("content-type", "application/json")
340
- .body(Body::from(serde_json::to_string(&request_balanced).unwrap()))
341
- .unwrap(),
372
+ .body(Body::from(
373
+ serde_json::to_string(&request_balanced).expect("Operation failed"),
374
+ ))
375
+ .expect("Operation failed"),
342
376
  )
343
377
  .await
344
- .unwrap();
378
+ .expect("Operation failed");
345
379
 
346
380
  assert_eq!(response_balanced.status(), StatusCode::OK);
347
381
 
348
382
  let body_balanced = axum::body::to_bytes(response_balanced.into_body(), usize::MAX)
349
383
  .await
350
- .unwrap();
351
- let embed_balanced: EmbedResponse = serde_json::from_slice(&body_balanced).unwrap();
384
+ .expect("Operation failed");
385
+ let embed_balanced: EmbedResponse = serde_json::from_slice(&body_balanced).expect("Failed to deserialize");
352
386
 
353
387
  // Different presets should have different dimensions
354
388
  assert_ne!(embed_fast.dimensions, embed_balanced.dimensions);
@@ -93,7 +93,10 @@ startxref
93
93
  .expect("Failed to read response body");
94
94
 
95
95
  let parsed: Value = serde_json::from_slice(&body).expect("Failed to parse response");
96
- eprintln!("Extraction result: {}", serde_json::to_string_pretty(&parsed).unwrap());
96
+ eprintln!(
97
+ "Extraction result: {}",
98
+ serde_json::to_string_pretty(&parsed).expect("Failed to parse")
99
+ );
97
100
  }
98
101
 
99
102
  /// Test extracting a 1MB text file (control test without PDF).
@@ -187,7 +190,10 @@ async fn test_find_size_breaking_point() {
187
190
  .expect("Failed to read response body");
188
191
 
189
192
  if let Ok(parsed) = serde_json::from_slice::<Value>(&body) {
190
- eprintln!("Error response: {}", serde_json::to_string_pretty(&parsed).unwrap());
193
+ eprintln!(
194
+ "Error response: {}",
195
+ serde_json::to_string_pretty(&parsed).expect("Failed to parse")
196
+ );
191
197
  } else {
192
198
  eprintln!("Response body (not JSON): {}", String::from_utf8_lossy(&body));
193
199
  }