kreuzberg 4.1.1 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -4
  3. data/README.md +8 -5
  4. data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
  5. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +23 -13
  7. data/kreuzberg.gemspec +14 -2
  8. data/lib/kreuzberg/api_proxy.rb +0 -1
  9. data/lib/kreuzberg/cli_proxy.rb +0 -1
  10. data/lib/kreuzberg/config.rb +70 -35
  11. data/lib/kreuzberg/mcp_proxy.rb +0 -1
  12. data/lib/kreuzberg/version.rb +1 -1
  13. data/sig/kreuzberg.rbs +5 -1
  14. data/spec/binding/batch_operations_spec.rb +80 -0
  15. data/spec/binding/metadata_types_spec.rb +77 -57
  16. data/spec/serialization_spec.rb +134 -0
  17. data/spec/unit/config/output_format_spec.rb +380 -0
  18. data/vendor/Cargo.toml +1 -1
  19. data/vendor/kreuzberg/Cargo.toml +3 -3
  20. data/vendor/kreuzberg/README.md +1 -1
  21. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  22. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  23. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  24. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  25. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  26. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  27. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  28. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  29. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  30. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  31. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  32. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  33. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  34. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  35. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  36. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  37. data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
  38. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  39. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  40. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  41. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  42. data/vendor/kreuzberg/tests/core_integration.rs +55 -53
  43. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  44. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  45. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  46. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  47. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  48. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  49. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  50. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  51. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  52. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  53. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  54. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  55. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  56. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  57. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  58. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  59. data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
  60. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  61. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  62. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  63. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  64. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  65. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  66. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  67. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  68. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  69. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  70. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  71. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  72. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  73. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  74. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
  75. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  76. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  77. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  78. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  79. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  80. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  81. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  82. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  83. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  84. data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
  85. data/vendor/kreuzberg-tesseract/build.rs +4 -4
  86. data/vendor/kreuzberg-tesseract/src/lib.rs +6 -6
  87. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +3 -3
  88. metadata +13 -2
@@ -20,14 +20,21 @@ async fn test_health_endpoint() {
20
20
  let app = create_router(ExtractionConfig::default());
21
21
 
22
22
  let response = app
23
- .oneshot(Request::builder().uri("/health").body(Body::empty()).unwrap())
23
+ .oneshot(
24
+ Request::builder()
25
+ .uri("/health")
26
+ .body(Body::empty())
27
+ .expect("Failed to create HTTP request body"),
28
+ )
24
29
  .await
25
- .unwrap();
30
+ .expect("Failed to send HTTP request");
26
31
 
27
32
  assert_eq!(response.status(), StatusCode::OK);
28
33
 
29
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
30
- let health: HealthResponse = serde_json::from_slice(&body).unwrap();
34
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
35
+ .await
36
+ .expect("Failed to read HTTP response body");
37
+ let health: HealthResponse = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
31
38
 
32
39
  assert_eq!(health.status, "healthy");
33
40
  assert!(!health.version.is_empty());
@@ -39,14 +46,21 @@ async fn test_info_endpoint() {
39
46
  let app = create_router(ExtractionConfig::default());
40
47
 
41
48
  let response = app
42
- .oneshot(Request::builder().uri("/info").body(Body::empty()).unwrap())
49
+ .oneshot(
50
+ Request::builder()
51
+ .uri("/info")
52
+ .body(Body::empty())
53
+ .expect("Failed to create HTTP request body"),
54
+ )
43
55
  .await
44
- .unwrap();
56
+ .expect("Failed to send HTTP request");
45
57
 
46
58
  assert_eq!(response.status(), StatusCode::OK);
47
59
 
48
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
49
- let info: InfoResponse = serde_json::from_slice(&body).unwrap();
60
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
61
+ .await
62
+ .expect("Failed to read HTTP response body");
63
+ let info: InfoResponse = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
50
64
 
51
65
  assert!(!info.version.is_empty());
52
66
  assert!(info.rust_backend);
@@ -67,10 +81,10 @@ async fn test_extract_no_files() {
67
81
  .uri("/extract")
68
82
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
69
83
  .body(Body::from(body_content))
70
- .unwrap(),
84
+ .expect("Operation failed"),
71
85
  )
72
86
  .await
73
- .unwrap();
87
+ .expect("Operation failed");
74
88
 
75
89
  assert_eq!(response.status(), StatusCode::BAD_REQUEST);
76
90
  }
@@ -100,19 +114,26 @@ async fn test_extract_text_file() {
100
114
  .uri("/extract")
101
115
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
102
116
  .body(Body::from(body_content))
103
- .unwrap(),
117
+ .expect("Operation failed"),
104
118
  )
105
119
  .await
106
- .unwrap();
120
+ .expect("Operation failed");
107
121
 
108
122
  assert_eq!(response.status(), StatusCode::OK);
109
123
 
110
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
111
- let results: Vec<serde_json::Value> = serde_json::from_slice(&body).unwrap();
124
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
125
+ .await
126
+ .expect("Failed to read HTTP response body");
127
+ let results: Vec<serde_json::Value> = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
112
128
 
113
129
  assert_eq!(results.len(), 1);
114
130
  assert_eq!(results[0]["mime_type"], "text/plain");
115
- assert!(results[0]["content"].as_str().unwrap().contains("Hello, world!"));
131
+ assert!(
132
+ results[0]["content"]
133
+ .as_str()
134
+ .expect("Failed to extract string from JSON value")
135
+ .contains("Hello, world!")
136
+ );
116
137
 
117
138
  assert!(
118
139
  results[0]["chunks"].is_null(),
@@ -158,19 +179,26 @@ async fn test_extract_with_config() {
158
179
  .uri("/extract")
159
180
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
160
181
  .body(Body::from(body_content))
161
- .unwrap(),
182
+ .expect("Operation failed"),
162
183
  )
163
184
  .await
164
- .unwrap();
185
+ .expect("Operation failed");
165
186
 
166
187
  assert_eq!(response.status(), StatusCode::OK);
167
188
 
168
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
169
- let results: Vec<serde_json::Value> = serde_json::from_slice(&body).unwrap();
189
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
190
+ .await
191
+ .expect("Failed to read HTTP response body");
192
+ let results: Vec<serde_json::Value> = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
170
193
 
171
194
  assert_eq!(results.len(), 1);
172
195
  assert_eq!(results[0]["mime_type"], "text/plain");
173
- assert!(results[0]["content"].as_str().unwrap().contains("Hello, world!"));
196
+ assert!(
197
+ results[0]["content"]
198
+ .as_str()
199
+ .expect("Failed to extract string from JSON value")
200
+ .contains("Hello, world!")
201
+ );
174
202
 
175
203
  assert!(
176
204
  results[0]["chunks"].is_null(),
@@ -214,10 +242,10 @@ async fn test_extract_invalid_config() {
214
242
  .uri("/extract")
215
243
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
216
244
  .body(Body::from(body_content))
217
- .unwrap(),
245
+ .expect("Operation failed"),
218
246
  )
219
247
  .await
220
- .unwrap();
248
+ .expect("Operation failed");
221
249
 
222
250
  assert_eq!(response.status(), StatusCode::BAD_REQUEST);
223
251
  }
@@ -253,19 +281,31 @@ async fn test_extract_multiple_files() {
253
281
  .uri("/extract")
254
282
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
255
283
  .body(Body::from(body_content))
256
- .unwrap(),
284
+ .expect("Operation failed"),
257
285
  )
258
286
  .await
259
- .unwrap();
287
+ .expect("Operation failed");
260
288
 
261
289
  assert_eq!(response.status(), StatusCode::OK);
262
290
 
263
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
264
- let results: Vec<serde_json::Value> = serde_json::from_slice(&body).unwrap();
291
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
292
+ .await
293
+ .expect("Failed to read HTTP response body");
294
+ let results: Vec<serde_json::Value> = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
265
295
 
266
296
  assert_eq!(results.len(), 2);
267
- assert!(results[0]["content"].as_str().unwrap().contains("First file"));
268
- assert!(results[1]["content"].as_str().unwrap().contains("Second file"));
297
+ assert!(
298
+ results[0]["content"]
299
+ .as_str()
300
+ .expect("Failed to extract string from JSON value")
301
+ .contains("First file")
302
+ );
303
+ assert!(
304
+ results[1]["content"]
305
+ .as_str()
306
+ .expect("Failed to extract string from JSON value")
307
+ .contains("Second file")
308
+ );
269
309
 
270
310
  for result in &results {
271
311
  assert!(
@@ -304,19 +344,26 @@ async fn test_extract_markdown_file() {
304
344
  .uri("/extract")
305
345
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
306
346
  .body(Body::from(body_content))
307
- .unwrap(),
347
+ .expect("Operation failed"),
308
348
  )
309
349
  .await
310
- .unwrap();
350
+ .expect("Operation failed");
311
351
 
312
352
  assert_eq!(response.status(), StatusCode::OK);
313
353
 
314
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
315
- let results: Vec<serde_json::Value> = serde_json::from_slice(&body).unwrap();
354
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
355
+ .await
356
+ .expect("Failed to read HTTP response body");
357
+ let results: Vec<serde_json::Value> = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
316
358
 
317
359
  assert_eq!(results.len(), 1);
318
360
  assert_eq!(results[0]["mime_type"], "text/markdown");
319
- assert!(results[0]["content"].as_str().unwrap().contains("Heading"));
361
+ assert!(
362
+ results[0]["content"]
363
+ .as_str()
364
+ .expect("Failed to extract string from JSON value")
365
+ .contains("Heading")
366
+ );
320
367
  }
321
368
 
322
369
  /// Test extract endpoint with JSON content.
@@ -344,15 +391,17 @@ async fn test_extract_json_file() {
344
391
  .uri("/extract")
345
392
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
346
393
  .body(Body::from(body_content))
347
- .unwrap(),
394
+ .expect("Operation failed"),
348
395
  )
349
396
  .await
350
- .unwrap();
397
+ .expect("Operation failed");
351
398
 
352
399
  assert_eq!(response.status(), StatusCode::OK);
353
400
 
354
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
355
- let results: Vec<serde_json::Value> = serde_json::from_slice(&body).unwrap();
401
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
402
+ .await
403
+ .expect("Failed to read HTTP response body");
404
+ let results: Vec<serde_json::Value> = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
356
405
 
357
406
  assert_eq!(results.len(), 1);
358
407
  assert_eq!(results[0]["mime_type"], "application/json");
@@ -384,19 +433,26 @@ async fn test_extract_xml_file() {
384
433
  .uri("/extract")
385
434
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
386
435
  .body(Body::from(body_content))
387
- .unwrap(),
436
+ .expect("Operation failed"),
388
437
  )
389
438
  .await
390
- .unwrap();
439
+ .expect("Operation failed");
391
440
 
392
441
  assert_eq!(response.status(), StatusCode::OK);
393
442
 
394
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
395
- let results: Vec<serde_json::Value> = serde_json::from_slice(&body).unwrap();
443
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
444
+ .await
445
+ .expect("Failed to read HTTP response body");
446
+ let results: Vec<serde_json::Value> = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
396
447
 
397
448
  assert_eq!(results.len(), 1);
398
449
  assert_eq!(results[0]["mime_type"], "application/xml");
399
- assert!(results[0]["content"].as_str().unwrap().contains("test"));
450
+ assert!(
451
+ results[0]["content"]
452
+ .as_str()
453
+ .expect("Failed to extract string from JSON value")
454
+ .contains("test")
455
+ );
400
456
  }
401
457
 
402
458
  /// Test extract endpoint with HTML content.
@@ -425,19 +481,26 @@ async fn test_extract_html_file() {
425
481
  .uri("/extract")
426
482
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
427
483
  .body(Body::from(body_content))
428
- .unwrap(),
484
+ .expect("Operation failed"),
429
485
  )
430
486
  .await
431
- .unwrap();
487
+ .expect("Operation failed");
432
488
 
433
489
  assert_eq!(response.status(), StatusCode::OK);
434
490
 
435
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
436
- let results: Vec<serde_json::Value> = serde_json::from_slice(&body).unwrap();
491
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
492
+ .await
493
+ .expect("Failed to read HTTP response body");
494
+ let results: Vec<serde_json::Value> = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
437
495
 
438
496
  assert_eq!(results.len(), 1);
439
497
  assert_eq!(results[0]["mime_type"], "text/html");
440
- assert!(results[0]["content"].as_str().unwrap().contains("Title"));
498
+ assert!(
499
+ results[0]["content"]
500
+ .as_str()
501
+ .expect("Failed to extract string from JSON value")
502
+ .contains("Title")
503
+ );
441
504
  }
442
505
 
443
506
  /// Test extract endpoint with missing Content-Type header.
@@ -451,10 +514,10 @@ async fn test_extract_missing_content_type() {
451
514
  .method("POST")
452
515
  .uri("/extract")
453
516
  .body(Body::from("some data"))
454
- .unwrap(),
517
+ .expect("Operation failed"),
455
518
  )
456
519
  .await
457
- .unwrap();
520
+ .expect("Operation failed");
458
521
 
459
522
  assert!(response.status() == StatusCode::BAD_REQUEST || response.status() == StatusCode::UNSUPPORTED_MEDIA_TYPE);
460
523
  }
@@ -484,15 +547,17 @@ async fn test_extract_empty_file() {
484
547
  .uri("/extract")
485
548
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
486
549
  .body(Body::from(body_content))
487
- .unwrap(),
550
+ .expect("Operation failed"),
488
551
  )
489
552
  .await
490
- .unwrap();
553
+ .expect("Operation failed");
491
554
 
492
555
  assert_eq!(response.status(), StatusCode::OK);
493
556
 
494
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
495
- let results: Vec<serde_json::Value> = serde_json::from_slice(&body).unwrap();
557
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
558
+ .await
559
+ .expect("Failed to read HTTP response body");
560
+ let results: Vec<serde_json::Value> = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
496
561
 
497
562
  assert_eq!(results.len(), 1);
498
563
  assert_eq!(results[0]["mime_type"], "text/plain");
@@ -523,10 +588,10 @@ async fn test_extract_unsupported_mime_type() {
523
588
  .uri("/extract")
524
589
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
525
590
  .body(Body::from(body_content))
526
- .unwrap(),
591
+ .expect("Operation failed"),
527
592
  )
528
593
  .await
529
- .unwrap();
594
+ .expect("Operation failed");
530
595
 
531
596
  assert!(
532
597
  response.status() == StatusCode::UNPROCESSABLE_ENTITY || response.status() == StatusCode::INTERNAL_SERVER_ERROR
@@ -558,10 +623,10 @@ async fn test_extract_without_filename() {
558
623
  .uri("/extract")
559
624
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
560
625
  .body(Body::from(body_content))
561
- .unwrap(),
626
+ .expect("Operation failed"),
562
627
  )
563
628
  .await
564
- .unwrap();
629
+ .expect("Operation failed");
565
630
 
566
631
  assert_eq!(response.status(), StatusCode::OK);
567
632
  }
@@ -581,10 +646,10 @@ async fn test_extract_malformed_multipart() {
581
646
  .uri("/extract")
582
647
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
583
648
  .body(Body::from(body_content))
584
- .unwrap(),
649
+ .expect("Operation failed"),
585
650
  )
586
651
  .await
587
- .unwrap();
652
+ .expect("Operation failed");
588
653
 
589
654
  assert!(response.status().is_client_error() || response.status().is_server_error());
590
655
  }
@@ -595,9 +660,14 @@ async fn test_cors_headers() {
595
660
  let app = create_router(ExtractionConfig::default());
596
661
 
597
662
  let response = app
598
- .oneshot(Request::builder().uri("/health").body(Body::empty()).unwrap())
663
+ .oneshot(
664
+ Request::builder()
665
+ .uri("/health")
666
+ .body(Body::empty())
667
+ .expect("Failed to create HTTP request body"),
668
+ )
599
669
  .await
600
- .unwrap();
670
+ .expect("Failed to send HTTP request");
601
671
 
602
672
  assert_eq!(response.status(), StatusCode::OK);
603
673
 
@@ -618,10 +688,10 @@ async fn test_cors_preflight() {
618
688
  .header("origin", "http://example.com")
619
689
  .header("access-control-request-method", "POST")
620
690
  .body(Body::empty())
621
- .unwrap(),
691
+ .expect("Operation failed"),
622
692
  )
623
693
  .await
624
- .unwrap();
694
+ .expect("Operation failed");
625
695
 
626
696
  assert!(response.status().is_success() || response.status() == StatusCode::NO_CONTENT);
627
697
  }
@@ -641,15 +711,17 @@ async fn test_error_response_format_validation() {
641
711
  .uri("/extract")
642
712
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
643
713
  .body(Body::from(body_content))
644
- .unwrap(),
714
+ .expect("Operation failed"),
645
715
  )
646
716
  .await
647
- .unwrap();
717
+ .expect("Operation failed");
648
718
 
649
719
  assert_eq!(response.status(), StatusCode::BAD_REQUEST);
650
720
 
651
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
652
- let error: serde_json::Value = serde_json::from_slice(&body).unwrap();
721
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
722
+ .await
723
+ .expect("Failed to read HTTP response body");
724
+ let error: serde_json::Value = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
653
725
 
654
726
  assert!(error["error_type"].is_string());
655
727
  assert!(error["message"].is_string());
@@ -686,18 +758,25 @@ async fn test_error_response_format_parsing() {
686
758
  .uri("/extract")
687
759
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
688
760
  .body(Body::from(body_content))
689
- .unwrap(),
761
+ .expect("Operation failed"),
690
762
  )
691
763
  .await
692
- .unwrap();
764
+ .expect("Operation failed");
693
765
 
694
766
  assert_eq!(response.status(), StatusCode::BAD_REQUEST);
695
767
 
696
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
697
- let error: serde_json::Value = serde_json::from_slice(&body).unwrap();
768
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
769
+ .await
770
+ .expect("Failed to read HTTP response body");
771
+ let error: serde_json::Value = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
698
772
 
699
773
  assert_eq!(error["error_type"], "ValidationError");
700
- assert!(error["message"].as_str().unwrap().contains("configuration"));
774
+ assert!(
775
+ error["message"]
776
+ .as_str()
777
+ .expect("Failed to extract string from JSON value")
778
+ .contains("configuration")
779
+ );
701
780
  }
702
781
 
703
782
  /// Test 404 error for non-existent endpoint.
@@ -706,9 +785,14 @@ async fn test_not_found_endpoint() {
706
785
  let app = create_router(ExtractionConfig::default());
707
786
 
708
787
  let response = app
709
- .oneshot(Request::builder().uri("/nonexistent").body(Body::empty()).unwrap())
788
+ .oneshot(
789
+ Request::builder()
790
+ .uri("/nonexistent")
791
+ .body(Body::empty())
792
+ .expect("Failed to create HTTP request body"),
793
+ )
710
794
  .await
711
- .unwrap();
795
+ .expect("Failed to send HTTP request");
712
796
 
713
797
  assert_eq!(response.status(), StatusCode::NOT_FOUND);
714
798
  }
@@ -738,15 +822,17 @@ async fn test_extract_large_file() {
738
822
  .uri("/extract")
739
823
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
740
824
  .body(Body::from(body_content))
741
- .unwrap(),
825
+ .expect("Operation failed"),
742
826
  )
743
827
  .await
744
- .unwrap();
828
+ .expect("Operation failed");
745
829
 
746
830
  assert_eq!(response.status(), StatusCode::OK);
747
831
 
748
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
749
- let results: Vec<serde_json::Value> = serde_json::from_slice(&body).unwrap();
832
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
833
+ .await
834
+ .expect("Failed to read HTTP response body");
835
+ let results: Vec<serde_json::Value> = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
750
836
 
751
837
  assert_eq!(results.len(), 1);
752
838
  assert_eq!(results[0]["mime_type"], "text/plain");
@@ -786,7 +872,7 @@ async fn test_concurrent_requests() {
786
872
  .uri("/extract")
787
873
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
788
874
  .body(Body::from(body_clone))
789
- .unwrap(),
875
+ .expect("Operation failed"),
790
876
  )
791
877
  .await
792
878
  });
@@ -795,7 +881,10 @@ async fn test_concurrent_requests() {
795
881
  }
796
882
 
797
883
  for handle in handles {
798
- let response = handle.await.unwrap().unwrap();
884
+ let response = handle
885
+ .await
886
+ .expect("Async operation failed")
887
+ .expect("Async operation failed");
799
888
  assert_eq!(response.status(), StatusCode::OK);
800
889
  }
801
890
  }
@@ -806,14 +895,21 @@ async fn test_cache_stats_endpoint() {
806
895
  let app = create_router(ExtractionConfig::default());
807
896
 
808
897
  let response = app
809
- .oneshot(Request::builder().uri("/cache/stats").body(Body::empty()).unwrap())
898
+ .oneshot(
899
+ Request::builder()
900
+ .uri("/cache/stats")
901
+ .body(Body::empty())
902
+ .expect("Failed to create HTTP request body"),
903
+ )
810
904
  .await
811
- .unwrap();
905
+ .expect("Failed to send HTTP request");
812
906
 
813
907
  assert_eq!(response.status(), StatusCode::OK);
814
908
 
815
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
816
- let stats: serde_json::Value = serde_json::from_slice(&body).unwrap();
909
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
910
+ .await
911
+ .expect("Failed to read HTTP response body");
912
+ let stats: serde_json::Value = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
817
913
 
818
914
  assert!(stats["directory"].is_string());
819
915
  assert!(stats["total_files"].is_number());
@@ -831,15 +927,17 @@ async fn test_cache_clear_endpoint() {
831
927
  .method("DELETE")
832
928
  .uri("/cache/clear")
833
929
  .body(Body::empty())
834
- .unwrap(),
930
+ .expect("Operation failed"),
835
931
  )
836
932
  .await
837
- .unwrap();
933
+ .expect("Operation failed");
838
934
 
839
935
  assert_eq!(response.status(), StatusCode::OK);
840
936
 
841
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
842
- let clear_result: serde_json::Value = serde_json::from_slice(&body).unwrap();
937
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
938
+ .await
939
+ .expect("Failed to read HTTP response body");
940
+ let clear_result: serde_json::Value = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
843
941
 
844
942
  assert!(clear_result["directory"].is_string());
845
943
  assert!(clear_result["removed_files"].is_number());
@@ -877,15 +975,17 @@ async fn test_extract_mixed_content_types() {
877
975
  .uri("/extract")
878
976
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
879
977
  .body(Body::from(body_content))
880
- .unwrap(),
978
+ .expect("Operation failed"),
881
979
  )
882
980
  .await
883
- .unwrap();
981
+ .expect("Operation failed");
884
982
 
885
983
  assert_eq!(response.status(), StatusCode::OK);
886
984
 
887
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
888
- let results: Vec<serde_json::Value> = serde_json::from_slice(&body).unwrap();
985
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
986
+ .await
987
+ .expect("Failed to read HTTP response body");
988
+ let results: Vec<serde_json::Value> = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
889
989
 
890
990
  assert_eq!(results.len(), 2);
891
991
  assert_eq!(results[0]["mime_type"], "text/plain");
@@ -921,10 +1021,10 @@ async fn test_extract_unknown_multipart_field() {
921
1021
  .uri("/extract")
922
1022
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
923
1023
  .body(Body::from(body_content))
924
- .unwrap(),
1024
+ .expect("Operation failed"),
925
1025
  )
926
1026
  .await
927
- .unwrap();
1027
+ .expect("Operation failed");
928
1028
 
929
1029
  assert_eq!(response.status(), StatusCode::OK);
930
1030
  }
@@ -953,10 +1053,10 @@ async fn test_extract_default_mime_type() {
953
1053
  .uri("/extract")
954
1054
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
955
1055
  .body(Body::from(body_content))
956
- .unwrap(),
1056
+ .expect("Operation failed"),
957
1057
  )
958
1058
  .await
959
- .unwrap();
1059
+ .expect("Operation failed");
960
1060
 
961
1061
  assert!(
962
1062
  response.status() == StatusCode::OK
@@ -997,10 +1097,10 @@ async fn test_size_limits_custom_limits() {
997
1097
  .uri("/extract")
998
1098
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
999
1099
  .body(Body::from(body_content))
1000
- .unwrap(),
1100
+ .expect("Operation failed"),
1001
1101
  )
1002
1102
  .await
1003
- .unwrap();
1103
+ .expect("Operation failed");
1004
1104
 
1005
1105
  assert_eq!(response.status(), StatusCode::OK);
1006
1106
  }
@@ -1081,10 +1181,10 @@ async fn test_extract_file_larger_than_2mb() {
1081
1181
  .uri("/extract")
1082
1182
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
1083
1183
  .body(Body::from(body_content))
1084
- .unwrap(),
1184
+ .expect("Operation failed"),
1085
1185
  )
1086
1186
  .await
1087
- .unwrap();
1187
+ .expect("Operation failed");
1088
1188
 
1089
1189
  assert_eq!(
1090
1190
  response.status(),
@@ -1092,12 +1192,19 @@ async fn test_extract_file_larger_than_2mb() {
1092
1192
  "3MB file should be accepted. If this fails with 400 or 413, the size limit fix is not working correctly."
1093
1193
  );
1094
1194
 
1095
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
1096
- let results: Vec<serde_json::Value> = serde_json::from_slice(&body).unwrap();
1195
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
1196
+ .await
1197
+ .expect("Failed to read HTTP response body");
1198
+ let results: Vec<serde_json::Value> = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
1097
1199
 
1098
1200
  assert_eq!(results.len(), 1);
1099
1201
  assert_eq!(results[0]["mime_type"], "text/plain");
1100
- assert!(results[0]["content"].as_str().unwrap().contains("A"));
1202
+ assert!(
1203
+ results[0]["content"]
1204
+ .as_str()
1205
+ .expect("Failed to extract string from JSON value")
1206
+ .contains("A")
1207
+ );
1101
1208
  }
1102
1209
 
1103
1210
  /// Test extracting a 2MB file (just above the old Axum limit).
@@ -1128,10 +1235,10 @@ async fn test_extract_2mb_file() {
1128
1235
  .uri("/extract")
1129
1236
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
1130
1237
  .body(Body::from(body_content))
1131
- .unwrap(),
1238
+ .expect("Operation failed"),
1132
1239
  )
1133
1240
  .await
1134
- .unwrap();
1241
+ .expect("Operation failed");
1135
1242
 
1136
1243
  assert_eq!(
1137
1244
  response.status(),
@@ -1139,12 +1246,19 @@ async fn test_extract_2mb_file() {
1139
1246
  "2MB file should be accepted (boundary case)"
1140
1247
  );
1141
1248
 
1142
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
1143
- let results: Vec<serde_json::Value> = serde_json::from_slice(&body).unwrap();
1249
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
1250
+ .await
1251
+ .expect("Failed to read HTTP response body");
1252
+ let results: Vec<serde_json::Value> = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
1144
1253
 
1145
1254
  assert_eq!(results.len(), 1);
1146
1255
  assert_eq!(results[0]["mime_type"], "text/plain");
1147
- assert!(results[0]["content"].as_str().unwrap().contains("X"));
1256
+ assert!(
1257
+ results[0]["content"]
1258
+ .as_str()
1259
+ .expect("Failed to extract string from JSON value")
1260
+ .contains("X")
1261
+ );
1148
1262
  }
1149
1263
 
1150
1264
  /// Test extracting a 5MB file.
@@ -1174,19 +1288,26 @@ async fn test_extract_5mb_file() {
1174
1288
  .uri("/extract")
1175
1289
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
1176
1290
  .body(Body::from(body_content))
1177
- .unwrap(),
1291
+ .expect("Operation failed"),
1178
1292
  )
1179
1293
  .await
1180
- .unwrap();
1294
+ .expect("Operation failed");
1181
1295
 
1182
1296
  assert_eq!(response.status(), StatusCode::OK, "5MB file should be accepted");
1183
1297
 
1184
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
1185
- let results: Vec<serde_json::Value> = serde_json::from_slice(&body).unwrap();
1298
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
1299
+ .await
1300
+ .expect("Failed to read HTTP response body");
1301
+ let results: Vec<serde_json::Value> = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
1186
1302
 
1187
1303
  assert_eq!(results.len(), 1);
1188
1304
  assert_eq!(results[0]["mime_type"], "text/plain");
1189
- assert!(results[0]["content"].as_str().unwrap().contains("B"));
1305
+ assert!(
1306
+ results[0]["content"]
1307
+ .as_str()
1308
+ .expect("Failed to extract string from JSON value")
1309
+ .contains("B")
1310
+ );
1190
1311
  }
1191
1312
 
1192
1313
  /// Test extracting a 10MB file.
@@ -1216,19 +1337,26 @@ async fn test_extract_10mb_file() {
1216
1337
  .uri("/extract")
1217
1338
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
1218
1339
  .body(Body::from(body_content))
1219
- .unwrap(),
1340
+ .expect("Operation failed"),
1220
1341
  )
1221
1342
  .await
1222
- .unwrap();
1343
+ .expect("Operation failed");
1223
1344
 
1224
1345
  assert_eq!(response.status(), StatusCode::OK, "10MB file should be accepted");
1225
1346
 
1226
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
1227
- let results: Vec<serde_json::Value> = serde_json::from_slice(&body).unwrap();
1347
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
1348
+ .await
1349
+ .expect("Failed to read HTTP response body");
1350
+ let results: Vec<serde_json::Value> = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
1228
1351
 
1229
1352
  assert_eq!(results.len(), 1);
1230
1353
  assert_eq!(results[0]["mime_type"], "text/plain");
1231
- assert!(results[0]["content"].as_str().unwrap().contains("C"));
1354
+ assert!(
1355
+ results[0]["content"]
1356
+ .as_str()
1357
+ .expect("Failed to extract string from JSON value")
1358
+ .contains("C")
1359
+ );
1232
1360
  }
1233
1361
 
1234
1362
  /// Test extracting a 50MB file (half the default limit).
@@ -1259,19 +1387,26 @@ async fn test_extract_50mb_file() {
1259
1387
  .uri("/extract")
1260
1388
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
1261
1389
  .body(Body::from(body_content))
1262
- .unwrap(),
1390
+ .expect("Operation failed"),
1263
1391
  )
1264
1392
  .await
1265
- .unwrap();
1393
+ .expect("Operation failed");
1266
1394
 
1267
1395
  assert_eq!(response.status(), StatusCode::OK, "50MB file should be accepted");
1268
1396
 
1269
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
1270
- let results: Vec<serde_json::Value> = serde_json::from_slice(&body).unwrap();
1397
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
1398
+ .await
1399
+ .expect("Failed to read HTTP response body");
1400
+ let results: Vec<serde_json::Value> = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
1271
1401
 
1272
1402
  assert_eq!(results.len(), 1);
1273
1403
  assert_eq!(results[0]["mime_type"], "text/plain");
1274
- assert!(results[0]["content"].as_str().unwrap().contains("D"));
1404
+ assert!(
1405
+ results[0]["content"]
1406
+ .as_str()
1407
+ .expect("Failed to extract string from JSON value")
1408
+ .contains("D")
1409
+ );
1275
1410
  }
1276
1411
 
1277
1412
  /// Test extracting a 90MB file (near the 100MB default limit).
@@ -1302,10 +1437,10 @@ async fn test_extract_90mb_file() {
1302
1437
  .uri("/extract")
1303
1438
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
1304
1439
  .body(Body::from(body_content))
1305
- .unwrap(),
1440
+ .expect("Operation failed"),
1306
1441
  )
1307
1442
  .await
1308
- .unwrap();
1443
+ .expect("Operation failed");
1309
1444
 
1310
1445
  assert_eq!(
1311
1446
  response.status(),
@@ -1313,12 +1448,19 @@ async fn test_extract_90mb_file() {
1313
1448
  "90MB file should be accepted (within default 100MB limit)"
1314
1449
  );
1315
1450
 
1316
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
1317
- let results: Vec<serde_json::Value> = serde_json::from_slice(&body).unwrap();
1451
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
1452
+ .await
1453
+ .expect("Failed to read HTTP response body");
1454
+ let results: Vec<serde_json::Value> = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
1318
1455
 
1319
1456
  assert_eq!(results.len(), 1);
1320
1457
  assert_eq!(results[0]["mime_type"], "text/plain");
1321
- assert!(results[0]["content"].as_str().unwrap().contains("E"));
1458
+ assert!(
1459
+ results[0]["content"]
1460
+ .as_str()
1461
+ .expect("Failed to extract string from JSON value")
1462
+ .contains("E")
1463
+ );
1322
1464
  }
1323
1465
 
1324
1466
  /// Test extracting a file over the 100MB default limit (HTTP 400/413).
@@ -1350,10 +1492,10 @@ async fn test_extract_file_over_default_limit() {
1350
1492
  .uri("/extract")
1351
1493
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
1352
1494
  .body(Body::from(body_content))
1353
- .unwrap(),
1495
+ .expect("Operation failed"),
1354
1496
  )
1355
1497
  .await
1356
- .unwrap();
1498
+ .expect("Operation failed");
1357
1499
 
1358
1500
  assert!(
1359
1501
  response.status() == StatusCode::BAD_REQUEST || response.status() == StatusCode::PAYLOAD_TOO_LARGE,
@@ -1402,10 +1544,10 @@ async fn test_extract_multiple_large_files_within_limit() {
1402
1544
  .uri("/extract")
1403
1545
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
1404
1546
  .body(Body::from(body_content))
1405
- .unwrap(),
1547
+ .expect("Operation failed"),
1406
1548
  )
1407
1549
  .await
1408
- .unwrap();
1550
+ .expect("Operation failed");
1409
1551
 
1410
1552
  assert_eq!(
1411
1553
  response.status(),
@@ -1413,16 +1555,33 @@ async fn test_extract_multiple_large_files_within_limit() {
1413
1555
  "Multiple files totaling 75MB should be accepted"
1414
1556
  );
1415
1557
 
1416
- let body = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
1417
- let results: Vec<serde_json::Value> = serde_json::from_slice(&body).unwrap();
1558
+ let body = axum::body::to_bytes(response.into_body(), usize::MAX)
1559
+ .await
1560
+ .expect("Failed to read HTTP response body");
1561
+ let results: Vec<serde_json::Value> = serde_json::from_slice(&body).expect("Failed to deserialize JSON response");
1418
1562
 
1419
1563
  assert_eq!(results.len(), 3, "Should have 3 results");
1420
1564
  for result in &results {
1421
1565
  assert_eq!(result["mime_type"], "text/plain");
1422
1566
  }
1423
- assert!(results[0]["content"].as_str().unwrap().contains("G"));
1424
- assert!(results[1]["content"].as_str().unwrap().contains("H"));
1425
- assert!(results[2]["content"].as_str().unwrap().contains("I"));
1567
+ assert!(
1568
+ results[0]["content"]
1569
+ .as_str()
1570
+ .expect("Failed to extract string from JSON value")
1571
+ .contains("G")
1572
+ );
1573
+ assert!(
1574
+ results[1]["content"]
1575
+ .as_str()
1576
+ .expect("Failed to extract string from JSON value")
1577
+ .contains("H")
1578
+ );
1579
+ assert!(
1580
+ results[2]["content"]
1581
+ .as_str()
1582
+ .expect("Failed to extract string from JSON value")
1583
+ .contains("I")
1584
+ );
1426
1585
  }
1427
1586
 
1428
1587
  /// Test extracting multiple large files exceeding limit (HTTP 400/413).
@@ -1459,10 +1618,10 @@ async fn test_extract_multiple_large_files_exceeding_limit() {
1459
1618
  .uri("/extract")
1460
1619
  .header("content-type", format!("multipart/form-data; boundary={}", boundary))
1461
1620
  .body(Body::from(body_content))
1462
- .unwrap(),
1621
+ .expect("Operation failed"),
1463
1622
  )
1464
1623
  .await
1465
- .unwrap();
1624
+ .expect("Operation failed");
1466
1625
 
1467
1626
  assert!(
1468
1627
  response.status() == StatusCode::BAD_REQUEST || response.status() == StatusCode::PAYLOAD_TOO_LARGE,