kreuzberg 4.1.1 → 4.2.0
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +8 -5
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +23 -13
- data/kreuzberg.gemspec +14 -2
- data/lib/kreuzberg/api_proxy.rb +0 -1
- data/lib/kreuzberg/cli_proxy.rb +0 -1
- data/lib/kreuzberg/config.rb +70 -35
- data/lib/kreuzberg/mcp_proxy.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +5 -1
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +55 -53
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
- data/vendor/kreuzberg-tesseract/build.rs +4 -4
- data/vendor/kreuzberg-tesseract/src/lib.rs +6 -6
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +3 -3
- metadata +13 -2
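
Most of the vendored-test churn below is one mechanical refactor: panicking accessors gain diagnostic messages via `.expect(...)`, and async call sites are reflowed to `.await` before unwrapping. The old-side lines are truncated to a trailing `.` in this rendering, so only the added side is certain; a minimal sketch of the pattern, assuming the removed calls were bare `.unwrap()`s:

```rust
// Sketch of the refactor pattern seen throughout the hunks below.
// Assumption: the removed (old-side) calls were bare `.unwrap()`s; the
// diff rendering truncates them, so only the new `.expect(...)` side
// is confirmed.
fn main() {
    let maybe: Option<&str> = Some("archive");

    // Old style (assumed): panics with a generic unwrap message.
    // let value = maybe.unwrap();

    // New style (confirmed by the + lines): panics with a named message,
    // which makes failures in a large test suite much easier to trace.
    let value = maybe.expect("Operation failed");
    println!("{value}");
}
```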
data/vendor/kreuzberg/tests/archive_integration.rs

@@ -34,7 +34,7 @@ async fn test_zip_basic_extraction() {
     assert!(result.content.contains("Hello from ZIP!"));

     assert!(result.metadata.format.is_some());
-    let archive_meta = match result.metadata.format.as_ref().
+    let archive_meta = match result.metadata.format.as_ref().expect("Operation failed") {
         kreuzberg::FormatMetadata::Archive(meta) => meta,
         _ => panic!("Expected Archive metadata"),
     };

@@ -54,16 +54,16 @@ async fn test_zip_multiple_files() {
     let mut zip = ZipWriter::new(&mut cursor);
     let options = FileOptions::<'_, ()>::default();

-    zip.start_file("file1.txt", options).
-    zip.write_all(b"Content 1").
+    zip.start_file("file1.txt", options).expect("Operation failed");
+    zip.write_all(b"Content 1").expect("Operation failed");

-    zip.start_file("file2.md", options).
-    zip.write_all(b"# Content 2").
+    zip.start_file("file2.md", options).expect("Operation failed");
+    zip.write_all(b"# Content 2").expect("Operation failed");

-    zip.start_file("file3.json", options).
-    zip.write_all(b"{\"key\": \"value\"}").
+    zip.start_file("file3.json", options).expect("Operation failed");
+    zip.write_all(b"{\"key\": \"value\"}").expect("Operation failed");

-    zip.finish().
+    zip.finish().expect("Operation failed");
 }

 let zip_bytes = cursor.into_inner();

@@ -84,7 +84,7 @@ async fn test_zip_multiple_files() {
     assert!(result.content.contains("value"));

     assert!(result.metadata.format.is_some());
-    let archive_meta = match result.metadata.format.as_ref().
+    let archive_meta = match result.metadata.format.as_ref().expect("Operation failed") {
         kreuzberg::FormatMetadata::Archive(meta) => meta,
         _ => panic!("Expected Archive metadata"),
     };

@@ -105,16 +105,17 @@ async fn test_zip_nested_directories() {
     let mut zip = ZipWriter::new(&mut cursor);
     let options = FileOptions::<'_, ()>::default();

-    zip.add_directory("dir1/", options).
-    zip.add_directory("dir1/subdir/", options).
+    zip.add_directory("dir1/", options).expect("Operation failed");
+    zip.add_directory("dir1/subdir/", options).expect("Operation failed");

-    zip.start_file("dir1/file.txt", options).
-    zip.write_all(b"File in dir1").
+    zip.start_file("dir1/file.txt", options).expect("Operation failed");
+    zip.write_all(b"File in dir1").expect("Operation failed");

-    zip.start_file("dir1/subdir/nested.txt", options)
-
+    zip.start_file("dir1/subdir/nested.txt", options)
+        .expect("Operation failed");
+    zip.write_all(b"Nested file").expect("Operation failed");

-    zip.finish().
+    zip.finish().expect("Operation failed");
 }

 let zip_bytes = cursor.into_inner();

@@ -134,7 +135,7 @@ async fn test_zip_nested_directories() {
     assert!(result.content.contains("Nested file"));

     assert!(result.metadata.format.is_some());
-    let archive_meta = match result.metadata.format.as_ref().
+    let archive_meta = match result.metadata.format.as_ref().expect("Operation failed") {
         kreuzberg::FormatMetadata::Archive(meta) => meta,
         _ => panic!("Expected Archive metadata"),
     };

@@ -172,7 +173,7 @@ async fn test_tar_extraction() {
     assert!(result.content.contains("Hello from TAR!"));

     assert!(result.metadata.format.is_some());
-    let archive_meta = match result.metadata.format.as_ref().
+    let archive_meta = match result.metadata.format.as_ref().expect("Operation failed") {
         kreuzberg::FormatMetadata::Archive(meta) => meta,
         _ => panic!("Expected Archive metadata"),
     };

@@ -202,7 +203,7 @@ async fn test_tar_gz_extraction() {
     assert!(result.content.contains("test.txt"));

     assert!(result.metadata.format.is_some());
-    let archive_meta = match result.metadata.format.as_ref().
+    let archive_meta = match result.metadata.format.as_ref().expect("Operation failed") {
         kreuzberg::FormatMetadata::Archive(meta) => meta,
         _ => panic!("Expected Archive metadata"),
     };

@@ -242,13 +243,14 @@ async fn test_nested_archive() {
     let mut zip = ZipWriter::new(&mut cursor);
     let options = FileOptions::<'_, ()>::default();

-    zip.start_file("inner.zip", options).
-    zip.write_all(&inner_zip).
+    zip.start_file("inner.zip", options).expect("Operation failed");
+    zip.write_all(&inner_zip).expect("Operation failed");

-    zip.start_file("readme.txt", options).
-    zip.write_all(b"This archive contains another archive")
+    zip.start_file("readme.txt", options).expect("Operation failed");
+    zip.write_all(b"This archive contains another archive")
+        .expect("Operation failed");

-    zip.finish().
+    zip.finish().expect("Operation failed");
 }

 let outer_zip_bytes = cursor.into_inner();

@@ -265,7 +267,7 @@ async fn test_nested_archive() {
     assert!(result.content.contains("This archive contains another archive"));

     assert!(result.metadata.format.is_some());
-    let archive_meta = match result.metadata.format.as_ref().
+    let archive_meta = match result.metadata.format.as_ref().expect("Operation failed") {
         kreuzberg::FormatMetadata::Archive(meta) => meta,
         _ => panic!("Expected Archive metadata"),
     };

@@ -284,19 +286,19 @@ async fn test_archive_mixed_formats() {
     let mut zip = ZipWriter::new(&mut cursor);
     let options = FileOptions::<'_, ()>::default();

-    zip.start_file("document.txt", options).
-    zip.write_all(b"Text document").
+    zip.start_file("document.txt", options).expect("Operation failed");
+    zip.write_all(b"Text document").expect("Operation failed");

-    zip.start_file("readme.md", options).
-    zip.write_all(b"# README").
+    zip.start_file("readme.md", options).expect("Operation failed");
+    zip.write_all(b"# README").expect("Operation failed");

-    zip.start_file("image.png", options).
-    zip.write_all(&[0x89, 0x50, 0x4E, 0x47]).
+    zip.start_file("image.png", options).expect("Operation failed");
+    zip.write_all(&[0x89, 0x50, 0x4E, 0x47]).expect("Operation failed");

-    zip.start_file("document.pdf", options).
-    zip.write_all(b"%PDF-1.4").
+    zip.start_file("document.pdf", options).expect("Operation failed");
+    zip.write_all(b"%PDF-1.4").expect("Operation failed");

-    zip.finish().
+    zip.finish().expect("Operation failed");
 }

 let zip_bytes = cursor.into_inner();

@@ -317,7 +319,7 @@ async fn test_archive_mixed_formats() {
     assert!(result.content.contains("# README"));

     assert!(result.metadata.format.is_some());
-    let archive_meta = match result.metadata.format.as_ref().
+    let archive_meta = match result.metadata.format.as_ref().expect("Operation failed") {
         kreuzberg::FormatMetadata::Archive(meta) => meta,
         _ => panic!("Expected Archive metadata"),
     };

@@ -373,11 +375,13 @@ async fn test_large_archive() {
     let options = FileOptions::<'_, ()>::default();

     for i in 0..100 {
-        zip.start_file(format!("file_{}.txt", i), options)
-
+        zip.start_file(format!("file_{}.txt", i), options)
+            .expect("Operation failed");
+        zip.write_all(format!("Content {}", i).as_bytes())
+            .expect("Failed to convert to bytes");
     }

-    zip.finish().
+    zip.finish().expect("Operation failed");
 }

 let zip_bytes = cursor.into_inner();

@@ -390,7 +394,7 @@ async fn test_large_archive() {
     assert!(result.tables.is_empty(), "Archive should not have tables");

     assert!(result.metadata.format.is_some());
-    let archive_meta = match result.metadata.format.as_ref().
+    let archive_meta = match result.metadata.format.as_ref().expect("Operation failed") {
         kreuzberg::FormatMetadata::Archive(meta) => meta,
         _ => panic!("Expected Archive metadata"),
     };

@@ -418,16 +422,19 @@ async fn test_archive_with_special_characters() {
     let mut zip = ZipWriter::new(&mut cursor);
     let options = FileOptions::<'_, ()>::default();

-    zip.start_file("测试文件.txt", options).
-    zip.write_all("Unicode content".as_bytes())
+    zip.start_file("测试文件.txt", options).expect("Operation failed");
+    zip.write_all("Unicode content".as_bytes())
+        .expect("Failed to convert to bytes");

-    zip.start_file("file with spaces.txt", options)
-
+    zip.start_file("file with spaces.txt", options)
+        .expect("Operation failed");
+    zip.write_all(b"Spaces in filename").expect("Operation failed");

-    zip.start_file("file-with-dashes.txt", options)
-
+    zip.start_file("file-with-dashes.txt", options)
+        .expect("Operation failed");
+    zip.write_all(b"Dashes").expect("Operation failed");

-    zip.finish().
+    zip.finish().expect("Operation failed");
 }

 let zip_bytes = cursor.into_inner();

@@ -444,7 +451,7 @@ async fn test_archive_with_special_characters() {
     assert!(result.content.contains("file-with-dashes.txt"));

     assert!(result.metadata.format.is_some());
-    let archive_meta = match result.metadata.format.as_ref().
+    let archive_meta = match result.metadata.format.as_ref().expect("Operation failed") {
         kreuzberg::FormatMetadata::Archive(meta) => meta,
         _ => panic!("Expected Archive metadata"),
     };

@@ -463,7 +470,7 @@ async fn test_empty_archive() {
     let mut cursor = Cursor::new(Vec::new());
     {
         let zip = ZipWriter::new(&mut cursor);
-        zip.finish().
+        zip.finish().expect("Operation failed");
     }

     let zip_bytes = cursor.into_inner();

@@ -477,7 +484,7 @@ async fn test_empty_archive() {

     assert!(result.content.contains("ZIP Archive"));
     assert!(result.metadata.format.is_some());
-    let archive_meta = match result.metadata.format.as_ref().
+    let archive_meta = match result.metadata.format.as_ref().expect("Operation failed") {
         kreuzberg::FormatMetadata::Archive(meta) => meta,
         _ => panic!("Expected Archive metadata"),
     };

@@ -503,7 +510,7 @@ fn test_archive_extraction_sync() {
     assert!(result.content.contains("Hello from ZIP!"));

     assert!(result.metadata.format.is_some(), "Should have archive metadata");
-    let archive_meta = match result.metadata.format.as_ref().
+    let archive_meta = match result.metadata.format.as_ref().expect("Operation failed") {
         kreuzberg::FormatMetadata::Archive(meta) => meta,
         _ => panic!("Expected Archive metadata"),
     };

@@ -519,10 +526,10 @@ fn create_simple_zip() -> Vec<u8> {
     let mut zip = ZipWriter::new(&mut cursor);
     let options = FileOptions::<'_, ()>::default();

-    zip.start_file("test.txt", options).
-    zip.write_all(b"Hello from ZIP!").
+    zip.start_file("test.txt", options).expect("Operation failed");
+    zip.write_all(b"Hello from ZIP!").expect("Operation failed");

-    zip.finish().
+    zip.finish().expect("Operation failed");
     }
     cursor.into_inner()
 }

@@ -534,12 +541,12 @@ fn create_simple_tar() -> Vec<u8> {

     let data = b"Hello from TAR!";
     let mut header = tar::Header::new_gnu();
-    header.set_path("test.txt").
+    header.set_path("test.txt").expect("Operation failed");
     header.set_size(data.len() as u64);
     header.set_cksum();
-    tar.append(&header, &data[..]).
+    tar.append(&header, &data[..]).expect("Operation failed");

-    tar.finish().
+    tar.finish().expect("Operation failed");
     }
     cursor.into_inner()
 }
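Assembled from the new-side lines of the final hunks, the patched `create_simple_zip` helper reads as below. A self-contained sketch: the `use` block is assumed (the diff omits the file's imports), though `FileOptions::<'_, ()>` matches the `zip` crate's conventional `zip::write` paths.

```rust
// create_simple_zip as reconstructed from the + lines above.
// Assumption: ZipWriter/FileOptions come from the `zip` crate's write
// module; the diff does not show the test file's use block.
use std::io::{Cursor, Write};
use zip::write::{FileOptions, ZipWriter};

fn create_simple_zip() -> Vec<u8> {
    let mut cursor = Cursor::new(Vec::new());
    {
        let mut zip = ZipWriter::new(&mut cursor);
        let options = FileOptions::<'_, ()>::default();

        zip.start_file("test.txt", options).expect("Operation failed");
        zip.write_all(b"Hello from ZIP!").expect("Operation failed");

        zip.finish().expect("Operation failed");
    }
    cursor.into_inner()
}

fn main() {
    // The tests feed these bytes to kreuzberg's extractor; here we just
    // confirm the archive is non-empty and starts with the ZIP magic.
    let bytes = create_simple_zip();
    assert_eq!(&bytes[..2], b"PK");
    println!("zip is {} bytes", bytes.len());
}
```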
data/vendor/kreuzberg/tests/batch_orchestration.rs

@@ -63,7 +63,7 @@ async fn test_batch_documents_parallel_execution() {
     let parallel_duration = parallel_start.elapsed();

     assert!(results.is_ok(), "Batch extraction should succeed");
-    let results = results.
+    let results = results.expect("Operation failed");
     assert_eq!(results.len(), 20, "Should process all 20 files");

     for result in &results {

@@ -102,7 +102,7 @@ async fn test_batch_documents_concurrency_limiting() {
     let results = batch_extract_file(paths, &config).await;

     assert!(results.is_ok());
-    let results = results.
+    let results = results.expect("Operation failed");
     assert_eq!(results.len(), 4);
 }

@@ -127,7 +127,7 @@ async fn test_batch_documents_default_concurrency() {
     let duration = start.elapsed();

     assert!(results.is_ok());
-    let results = results.
+    let results = results.expect("Operation failed");
     assert_eq!(results.len(), 50);

     println!("Processed 50 files in {:?}", duration);

@@ -152,7 +152,9 @@ async fn test_batch_documents_preserves_order() {
         get_test_file_path("xml/simple_note.xml"),
     ];

-    let results = batch_extract_file(paths, &config)
+    let results = batch_extract_file(paths, &config)
+        .await
+        .expect("Async operation failed");

     assert_eq!(results.len(), 3, "Should have 3 results");

@@ -201,7 +203,7 @@ async fn test_multipage_pdf_extraction() {
     let duration = start.elapsed();

     assert!(result.is_ok(), "Multi-page PDF extraction should succeed");
-    let extraction = result.
+    let extraction = result.expect("Operation failed");

     assert!(!extraction.content.is_empty(), "Should extract text from all pages");
     println!("Extracted multi-page PDF in {:?}", duration);

@@ -230,7 +232,7 @@ async fn test_concurrent_pdf_extractions() {
     let duration = start.elapsed();

     assert!(results.is_ok());
-    let results = results.
+    let results = results.expect("Operation failed");
     assert_eq!(results.len(), 10);

     println!("Processed 10 PDFs in {:?}", duration);

@@ -318,7 +320,7 @@ async fn test_batch_bytes_parallel_processing() {
     let duration = start.elapsed();

     assert!(results.is_ok());
-    let results = results.
+    let results = results.expect("Operation failed");
     assert_eq!(results.len(), 30);

     for (i, result) in results.iter().enumerate() {

@@ -350,7 +352,7 @@ async fn test_batch_bytes_mixed_valid_invalid() {
     let results = batch_extract_bytes(owned_contents, &config).await;

     assert!(results.is_ok());
-    let results = results.
+    let results = results.expect("Operation failed");
     assert_eq!(results.len(), 5);

     assert_text_content(&results[0].content, "valid content 1");

@@ -394,7 +396,7 @@ async fn test_batch_utilizes_multiple_cores() {
     let duration = start.elapsed();

     assert!(results.is_ok());
-    let results = results.
+    let results = results.expect("Operation failed");
     assert_eq!(results.len(), 20);

     println!(

@@ -437,7 +439,7 @@ async fn test_batch_memory_pressure_handling() {
     let duration = start.elapsed();

     assert!(results.is_ok());
-    let results = results.
+    let results = results.expect("Operation failed");
     assert_eq!(results.len(), 50);

     println!("Processed 50 large documents with concurrency limit in {:?}", duration);

@@ -469,7 +471,9 @@ async fn test_batch_scales_with_cpu_count() {
         .collect();

     let start = Instant::now();
-    let _ = batch_extract_bytes(owned_contents_1, &config_1)
+    let _ = batch_extract_bytes(owned_contents_1, &config_1)
+        .await
+        .expect("Async operation failed");
     let duration_1 = start.elapsed();

     let config_full = ExtractionConfig {

@@ -483,7 +487,9 @@ async fn test_batch_scales_with_cpu_count() {
         .collect();

     let start = Instant::now();
-    let _ = batch_extract_bytes(owned_contents_full, &config_full)
+    let _ = batch_extract_bytes(owned_contents_full, &config_full)
+        .await
+        .expect("Async operation failed");
     let duration_full = start.elapsed();

     println!(

@@ -522,7 +528,7 @@ async fn test_batch_mixed_document_types() {
     let results = batch_extract_file(paths, &config).await;

     assert!(results.is_ok());
-    let results = results.
+    let results = results.expect("Operation failed");
     assert_eq!(results.len(), 4);

     for (i, result) in results.iter().enumerate() {

@@ -572,7 +578,9 @@ async fn test_batch_accuracy_under_load() {
         .map(|(bytes, mime)| (bytes.to_vec(), mime.to_string()))
         .collect();

-    let results = batch_extract_bytes(owned_contents, &config)
+    let results = batch_extract_bytes(owned_contents, &config)
+        .await
+        .expect("Async operation failed");

     assert_eq!(results.len(), 100);
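The batch hunks above all funnel through the same two entry points. A hedged usage sketch follows: the names `batch_extract_bytes` / `ExtractionConfig`, the `(Vec<u8>, String)` content/MIME pairs, and the await-then-expect shape are taken from the diff; the crate-root import paths and the `Default` impl are assumptions.

```rust
// Hedged sketch of the batch API as these tests exercise it. Confirmed by
// the diff: the function takes a collection plus &ExtractionConfig and its
// future resolves to a Result over a Vec of results. Assumed: import paths
// and ExtractionConfig::default().
use kreuzberg::{batch_extract_bytes, ExtractionConfig};

#[tokio::main]
async fn main() {
    let config = ExtractionConfig::default();
    let owned_contents: Vec<(Vec<u8>, String)> = vec![
        (b"valid content 1".to_vec(), "text/plain".to_string()),
        (b"# heading".to_vec(), "text/markdown".to_string()),
    ];

    // The 4.2.0 style: await the future, then unwrap with a message.
    let results = batch_extract_bytes(owned_contents, &config)
        .await
        .expect("Async operation failed");
    assert_eq!(results.len(), 2);
}
```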
data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs

@@ -21,7 +21,7 @@ mod tests {

     let mut buffers = vec![];
     for _ in 0..3 {
-        let buf = pool.acquire().
+        let buf = pool.acquire().expect("Operation failed");
         buffers.push(buf);
     }

@@ -31,7 +31,7 @@ mod tests {

     let mut buffers = vec![];
     for _ in 0..3 {
-        let buf = pool.acquire().
+        let buf = pool.acquire().expect("Operation failed");
         buffers.push(buf);
     }
     drop(buffers);

@@ -47,8 +47,8 @@ mod tests {
     let mut results = vec![];

     for _i in 0..5 {
-        let string_buf = processor.string_pool().acquire().
-        let byte_buf = processor.byte_pool().acquire().
+        let string_buf = processor.string_pool().acquire().expect("Operation failed");
+        let byte_buf = processor.byte_pool().acquire().expect("Operation failed");

         results.push((string_buf, byte_buf));
     }

@@ -65,17 +65,17 @@ mod tests {
     let pool = create_string_buffer_pool(5, 4096);

     let capacity_initial = {
-        let buf = pool.acquire().
+        let buf = pool.acquire().expect("Operation failed");
         buf.capacity()
     };

     for _ in 0..10 {
-        let mut buf = pool.acquire().
+        let mut buf = pool.acquire().expect("Operation failed");
         buf.push_str("test data");
     }

     let capacity_final = {
-        let buf = pool.acquire().
+        let buf = pool.acquire().expect("Operation failed");
         buf.capacity()
     };

@@ -101,15 +101,15 @@ mod tests {
     let processor = BatchProcessor::new();

     {
-        let _s1 = processor.string_pool().acquire().
-        let _s2 = processor.string_pool().acquire().
-        let _b1 = processor.byte_pool().acquire().
+        let _s1 = processor.string_pool().acquire().expect("Operation failed");
+        let _s2 = processor.string_pool().acquire().expect("Operation failed");
+        let _b1 = processor.byte_pool().acquire().expect("Operation failed");
     }

     assert!(processor.string_pool_size() > 0);
     assert!(processor.byte_pool_size() > 0);

-    processor.clear_pools().
+    processor.clear_pools().expect("Operation failed");

     assert_eq!(processor.string_pool_size(), 0);
     assert_eq!(processor.byte_pool_size(), 0);

@@ -137,7 +137,7 @@ mod tests {
     }

     for handle in handles {
-        handle.join().
+        handle.join().expect("Operation failed");
     }

     assert!(processor.string_pool_size() <= 10);

@@ -148,7 +148,7 @@ mod tests {
 fn test_pool_respects_capacity_hints() {
     let pool = create_string_buffer_pool(3, 2048);

-    let buf = pool.acquire().
+    let buf = pool.acquire().expect("Operation failed");
     assert!(buf.capacity() >= 2048, "buffer should respect capacity hint");
 }
 }
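The pooling assertions above — pool sizes growing after guards drop, capacity hints respected, `clear_pools` emptying both pools — imply an acquire/return-on-drop design. Below is a minimal, self-contained illustration of those mechanics in plain std Rust; it is emphatically not kreuzberg's implementation, only a sketch of the behavior the tests check.

```rust
use std::sync::{Arc, Mutex};

// NOT kreuzberg's code: a std-only illustration of acquire/return-on-drop
// pooling (capacity hints, reuse after drop, pool-size accounting), which
// is the behavior the assertions above exercise.
struct StringPool {
    buffers: Arc<Mutex<Vec<String>>>,
    capacity_hint: usize,
}

struct PooledString {
    buf: Option<String>,
    home: Arc<Mutex<Vec<String>>>,
}

impl StringPool {
    fn new(capacity_hint: usize) -> Self {
        Self { buffers: Arc::new(Mutex::new(Vec::new())), capacity_hint }
    }

    // Reuse a returned buffer if one exists, else allocate to the hint.
    fn acquire(&self) -> PooledString {
        let buf = self
            .buffers
            .lock()
            .expect("Operation failed")
            .pop()
            .unwrap_or_else(|| String::with_capacity(self.capacity_hint));
        PooledString { buf: Some(buf), home: Arc::clone(&self.buffers) }
    }

    fn size(&self) -> usize {
        self.buffers.lock().expect("Operation failed").len()
    }
}

impl Drop for PooledString {
    fn drop(&mut self) {
        if let Some(mut buf) = self.buf.take() {
            buf.clear(); // reset contents but keep the allocated capacity
            self.home.lock().expect("Operation failed").push(buf);
        }
    }
}

fn main() {
    let pool = StringPool::new(2048);
    {
        let guard = pool.acquire();
        assert!(guard.buf.as_ref().expect("Operation failed").capacity() >= 2048);
    } // guard drops here, so the buffer returns to the pool
    assert_eq!(pool.size(), 1); // mirrors the string_pool_size() > 0 checks
}
```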
data/vendor/kreuzberg/tests/batch_processing.rs

@@ -51,7 +51,7 @@ async fn test_batch_extract_file_multiple_formats() {
     let results = batch_extract_file(paths, &config).await;

     assert!(results.is_ok(), "Batch extraction should succeed");
-    let results = results.
+    let results = results.expect("Operation failed");

     assert_eq!(results.len(), 3);

@@ -95,7 +95,7 @@ fn test_batch_extract_file_sync_variant() {
     let results = batch_extract_file_sync(paths, &config);

     assert!(results.is_ok(), "Sync batch extraction should succeed");
-    let results = results.
+    let results = results.expect("Operation failed");

     assert_eq!(results.len(), 2);

@@ -137,7 +137,7 @@ async fn test_batch_extract_bytes_multiple() {
     let results = batch_extract_bytes(owned_contents, &config).await;

     assert!(results.is_ok(), "Batch bytes extraction should succeed");
-    let results = results.
+    let results = results.expect("Operation failed");

     assert_eq!(results.len(), 3);

@@ -161,7 +161,11 @@ async fn test_batch_extract_empty_list() {
     let results = batch_extract_file(paths, &config).await;

     assert!(results.is_ok(), "Empty batch should succeed");
-    assert_eq!(
+    assert_eq!(
+        results.expect("Operation failed").len(),
+        0,
+        "Should return empty vector"
+    );
 }

 /// Test batch extraction when one file fails (others should succeed).

@@ -187,7 +191,7 @@ async fn test_batch_extract_one_file_fails() {
     let results = batch_extract_file(paths, &config).await;

     assert!(results.is_ok(), "Batch should succeed even with one failure");
-    let results = results.
+    let results = results.expect("Operation failed");

     assert_eq!(results.len(), 3);

@@ -216,7 +220,7 @@ async fn test_batch_extract_all_fail() {
     let results = batch_extract_file(paths, &config).await;

     assert!(results.is_ok(), "Batch should succeed (errors in metadata)");
-    let results = results.
+    let results = results.expect("Operation failed");

     assert_eq!(results.len(), 3);

@@ -251,7 +255,7 @@ async fn test_batch_extract_concurrent() {
     let duration = start.elapsed();

     assert!(results.is_ok(), "Concurrent batch should succeed");
-    let results = results.
+    let results = results.expect("Operation failed");

     assert_eq!(results.len(), 20);

@@ -289,7 +293,7 @@ async fn test_batch_extract_large_batch() {
     let results = batch_extract_file(paths, &config).await;

     assert!(results.is_ok(), "Large batch should succeed");
-    let results = results.
+    let results = results.expect("Operation failed");

     assert_eq!(results.len(), 50);

@@ -319,7 +323,7 @@ fn test_batch_extract_bytes_sync_variant() {
     let results = batch_extract_bytes_sync(owned_contents, &config);

     assert!(results.is_ok(), "Sync batch bytes extraction should succeed");
-    let results = results.
+    let results = results.expect("Operation failed");

     assert_eq!(results.len(), 3);
     assert_text_content(&results[0].content, "content 1");
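batch_processing.rs also covers the blocking twins of the async entry points. A sketch of the sync-variant call shape: `batch_extract_bytes_sync`, the `(Vec<u8>, String)` tuple payload, and the `.content` field are from the diff; the import path and `ExtractionConfig::default()` are assumptions.

```rust
// Sketch of the sync variant: same content/MIME pairs as the async API,
// but a plain Result instead of a Future. Names from the diff; import
// path and Default impl assumed.
use kreuzberg::{batch_extract_bytes_sync, ExtractionConfig};

fn main() {
    let config = ExtractionConfig::default();
    let owned_contents = vec![
        (b"content 1".to_vec(), "text/plain".to_string()),
    ];

    let results = batch_extract_bytes_sync(owned_contents, &config)
        .expect("Operation failed");
    assert_eq!(results.len(), 1);
    println!("{}", results[0].content);
}
```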
data/vendor/kreuzberg/tests/bibtex_parity_test.rs

@@ -65,7 +65,7 @@ async fn test_all_entry_types() {
         .await;

     assert!(result.is_ok(), "Failed to parse {} entry", expected_type);
-    let result = result.
+    let result = result.expect("Operation failed");

     if let Some(entry_types) = result.metadata.additional.get("entry_types") {
         assert!(entry_types.as_object().is_some(), "Entry types should be an object");

@@ -116,7 +116,7 @@ async fn test_all_common_fields() {
         .await;

     assert!(result.is_ok());
-    let result = result.
+    let result = result.expect("Operation failed");

     let content = &result.content;

@@ -183,7 +183,7 @@ async fn test_author_parsing() {
         .await;

     assert!(result.is_ok());
-    let result = result.
+    let result = result.expect("Operation failed");

     if let Some(authors) = result.metadata.additional.get("authors") {
         let authors_array = authors.as_array().expect("Authors should be an array");

@@ -221,7 +221,7 @@ async fn test_special_characters() {
         .await;

     assert!(result.is_ok());
-    let result = result.
+    let result = result.expect("Operation failed");

     assert_eq!(
         result.metadata.additional.get("entry_count"),

@@ -250,7 +250,7 @@ async fn test_year_range_extraction() {
         .await;

     assert!(result.is_ok());
-    let result = result.
+    let result = result.expect("Operation failed");

     if let Some(year_range) = result.metadata.additional.get("year_range") {
         assert_eq!(year_range.get("min"), Some(&serde_json::json!(1990)));

@@ -281,7 +281,7 @@ async fn test_citation_keys_extraction() {
         .await;

     assert!(result.is_ok());
-    let result = result.
+    let result = result.expect("Operation failed");

     if let Some(citation_keys) = result.metadata.additional.get("citation_keys") {
         let keys_array = citation_keys.as_array().expect("Citation keys should be an array");

@@ -316,7 +316,7 @@ async fn test_entry_type_distribution() {
         .await;

     assert!(result.is_ok());
-    let result = result.
+    let result = result.expect("Operation failed");

     if let Some(entry_types) = result.metadata.additional.get("entry_types") {
         let types_obj = entry_types.as_object().expect("Entry types should be an object");

@@ -348,7 +348,7 @@ async fn test_unicode_support() {
         .await;

     assert!(result.is_ok());
-    let result = result.
+    let result = result.expect("Operation failed");

     assert_eq!(
         result.metadata.additional.get("entry_count"),

@@ -376,7 +376,7 @@ async fn test_empty_fields() {
         .await;

     assert!(result.is_ok());
-    let result = result.
+    let result = result.expect("Operation failed");
     assert_eq!(
         result.metadata.additional.get("entry_count"),
         Some(&serde_json::json!(1))

@@ -397,7 +397,7 @@ async fn test_comprehensive_file() {
         .await;

     assert!(result.is_ok());
-    let result = result.
+    let result = result.expect("Operation failed");

     assert_eq!(
         result.metadata.additional.get("entry_count"),