kreuzberg 4.1.2 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/kreuzberg.gemspec +13 -1
- data/lib/kreuzberg/cli.rb +16 -6
- data/lib/kreuzberg/cli_proxy.rb +3 -1
- data/lib/kreuzberg/config.rb +121 -39
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/extraction_api.rb +20 -4
- data/lib/kreuzberg/result.rb +12 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -0
- data/sig/kreuzberg.rbs +28 -12
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/batch_spec.rb +6 -5
- data/spec/binding/error_recovery_spec.rb +3 -3
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/binding/tables_spec.rb +11 -2
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/startup.rs +15 -1
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
- data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
- data/vendor/kreuzberg/src/core/io.rs +7 -7
- data/vendor/kreuzberg/src/core/mime.rs +4 -4
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +57 -57
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +12 -2
|
@@ -187,26 +187,26 @@ fn test_register_custom_postprocessor() {
|
|
|
187
187
|
});
|
|
188
188
|
|
|
189
189
|
{
|
|
190
|
-
let mut reg = registry.write().
|
|
191
|
-
reg.shutdown_all().
|
|
190
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
191
|
+
reg.shutdown_all().expect("Operation failed");
|
|
192
192
|
}
|
|
193
193
|
|
|
194
194
|
{
|
|
195
|
-
let mut reg = registry.write().
|
|
195
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
196
196
|
let result = reg.register(Arc::clone(&processor) as Arc<dyn PostProcessor>, 100);
|
|
197
197
|
assert!(result.is_ok(), "Failed to register processor: {:?}", result.err());
|
|
198
198
|
}
|
|
199
199
|
|
|
200
200
|
let list = {
|
|
201
|
-
let reg = registry.read().
|
|
201
|
+
let reg = registry.read().expect("Operation failed");
|
|
202
202
|
reg.list()
|
|
203
203
|
};
|
|
204
204
|
|
|
205
205
|
assert!(list.contains(&"test-appender".to_string()));
|
|
206
206
|
|
|
207
207
|
{
|
|
208
|
-
let mut reg = registry.write().
|
|
209
|
-
reg.shutdown_all().
|
|
208
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
209
|
+
reg.shutdown_all().expect("Operation failed");
|
|
210
210
|
}
|
|
211
211
|
}
|
|
212
212
|
|
|
@@ -224,9 +224,9 @@ fn test_postprocessor_called_during_extraction() {
|
|
|
224
224
|
});
|
|
225
225
|
|
|
226
226
|
{
|
|
227
|
-
let mut reg = registry.write().
|
|
227
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
228
228
|
reg.register(Arc::clone(&processor) as Arc<dyn PostProcessor>, 100)
|
|
229
|
-
.
|
|
229
|
+
.expect("Operation failed");
|
|
230
230
|
}
|
|
231
231
|
|
|
232
232
|
let config = ExtractionConfig::default();
|
|
@@ -234,7 +234,7 @@ fn test_postprocessor_called_during_extraction() {
|
|
|
234
234
|
|
|
235
235
|
assert!(result.is_ok(), "Extraction failed: {:?}", result.err());
|
|
236
236
|
|
|
237
|
-
let extraction_result = result.
|
|
237
|
+
let extraction_result = result.expect("Operation failed");
|
|
238
238
|
assert!(
|
|
239
239
|
extraction_result.content.contains("[APPENDED BY PROCESSOR]"),
|
|
240
240
|
"Processor did not modify content. Content: {}",
|
|
@@ -248,8 +248,8 @@ fn test_postprocessor_called_during_extraction() {
|
|
|
248
248
|
);
|
|
249
249
|
|
|
250
250
|
{
|
|
251
|
-
let mut reg = registry.write().
|
|
252
|
-
reg.shutdown_all().
|
|
251
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
252
|
+
reg.shutdown_all().expect("Operation failed");
|
|
253
253
|
}
|
|
254
254
|
}
|
|
255
255
|
|
|
@@ -265,8 +265,9 @@ fn test_postprocessor_modifies_content() {
|
|
|
265
265
|
});
|
|
266
266
|
|
|
267
267
|
{
|
|
268
|
-
let mut reg = registry.write().
|
|
269
|
-
reg.register(processor as Arc<dyn PostProcessor>, 100)
|
|
268
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
269
|
+
reg.register(processor as Arc<dyn PostProcessor>, 100)
|
|
270
|
+
.expect("Operation failed");
|
|
270
271
|
}
|
|
271
272
|
|
|
272
273
|
let config = ExtractionConfig::default();
|
|
@@ -274,14 +275,14 @@ fn test_postprocessor_modifies_content() {
|
|
|
274
275
|
|
|
275
276
|
assert!(result.is_ok());
|
|
276
277
|
|
|
277
|
-
let extraction_result = result.
|
|
278
|
+
let extraction_result = result.expect("Operation failed");
|
|
278
279
|
let has_lowercase = extraction_result.content.chars().any(|c| c.is_lowercase());
|
|
279
280
|
|
|
280
281
|
assert!(!has_lowercase, "Content was not fully uppercased");
|
|
281
282
|
|
|
282
283
|
{
|
|
283
|
-
let mut reg = registry.write().
|
|
284
|
-
reg.shutdown_all().
|
|
284
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
285
|
+
reg.shutdown_all().expect("Operation failed");
|
|
285
286
|
}
|
|
286
287
|
}
|
|
287
288
|
|
|
@@ -298,9 +299,9 @@ fn test_postprocessor_adds_metadata() {
|
|
|
298
299
|
});
|
|
299
300
|
|
|
300
301
|
{
|
|
301
|
-
let mut reg = registry.write().
|
|
302
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
302
303
|
reg.register(Arc::clone(&processor) as Arc<dyn PostProcessor>, 100)
|
|
303
|
-
.
|
|
304
|
+
.expect("Operation failed");
|
|
304
305
|
}
|
|
305
306
|
|
|
306
307
|
assert!(
|
|
@@ -313,7 +314,7 @@ fn test_postprocessor_adds_metadata() {
|
|
|
313
314
|
|
|
314
315
|
assert!(result.is_ok());
|
|
315
316
|
|
|
316
|
-
let extraction_result = result.
|
|
317
|
+
let extraction_result = result.expect("Operation failed");
|
|
317
318
|
|
|
318
319
|
assert!(
|
|
319
320
|
extraction_result.metadata.additional.contains_key("processed_by"),
|
|
@@ -324,12 +325,19 @@ fn test_postprocessor_adds_metadata() {
|
|
|
324
325
|
"Metadata 'word_count' not added"
|
|
325
326
|
);
|
|
326
327
|
|
|
327
|
-
let processed_by = extraction_result
|
|
328
|
-
|
|
328
|
+
let processed_by = extraction_result
|
|
329
|
+
.metadata
|
|
330
|
+
.additional
|
|
331
|
+
.get("processed_by")
|
|
332
|
+
.expect("Operation failed");
|
|
333
|
+
assert_eq!(
|
|
334
|
+
processed_by.as_str().expect("Failed to extract string from value"),
|
|
335
|
+
"metadata-adder"
|
|
336
|
+
);
|
|
329
337
|
|
|
330
338
|
{
|
|
331
|
-
let mut reg = registry.write().
|
|
332
|
-
reg.shutdown_all().
|
|
339
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
340
|
+
reg.shutdown_all().expect("Operation failed");
|
|
333
341
|
}
|
|
334
342
|
|
|
335
343
|
assert!(
|
|
@@ -350,18 +358,18 @@ fn test_unregister_postprocessor() {
|
|
|
350
358
|
});
|
|
351
359
|
|
|
352
360
|
{
|
|
353
|
-
let mut reg = registry.write().
|
|
361
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
354
362
|
reg.register(Arc::clone(&processor) as Arc<dyn PostProcessor>, 100)
|
|
355
|
-
.
|
|
363
|
+
.expect("Operation failed");
|
|
356
364
|
}
|
|
357
365
|
|
|
358
366
|
{
|
|
359
|
-
let mut reg = registry.write().
|
|
360
|
-
reg.remove("unregister-test").
|
|
367
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
368
|
+
reg.remove("unregister-test").expect("Operation failed");
|
|
361
369
|
}
|
|
362
370
|
|
|
363
371
|
let list = {
|
|
364
|
-
let reg = registry.read().
|
|
372
|
+
let reg = registry.read().expect("Operation failed");
|
|
365
373
|
reg.list()
|
|
366
374
|
};
|
|
367
375
|
|
|
@@ -373,7 +381,7 @@ fn test_unregister_postprocessor() {
|
|
|
373
381
|
|
|
374
382
|
assert!(result.is_ok());
|
|
375
383
|
|
|
376
|
-
let extraction_result = result.
|
|
384
|
+
let extraction_result = result.expect("Operation failed");
|
|
377
385
|
assert!(
|
|
378
386
|
!extraction_result.content.contains("[SHOULD NOT APPEAR]"),
|
|
379
387
|
"Unregistered processor still modified content"
|
|
@@ -382,8 +390,8 @@ fn test_unregister_postprocessor() {
|
|
|
382
390
|
assert_eq!(processor.call_count.load(Ordering::SeqCst), 0);
|
|
383
391
|
|
|
384
392
|
{
|
|
385
|
-
let mut reg = registry.write().
|
|
386
|
-
reg.shutdown_all().
|
|
393
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
394
|
+
reg.shutdown_all().expect("Operation failed");
|
|
387
395
|
}
|
|
388
396
|
}
|
|
389
397
|
|
|
@@ -393,8 +401,8 @@ fn test_clear_all_postprocessors() {
|
|
|
393
401
|
let registry = get_post_processor_registry();
|
|
394
402
|
|
|
395
403
|
{
|
|
396
|
-
let mut reg = registry.write().
|
|
397
|
-
reg.shutdown_all().
|
|
404
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
405
|
+
reg.shutdown_all().expect("Operation failed");
|
|
398
406
|
}
|
|
399
407
|
|
|
400
408
|
let processor1 = Arc::new(AppendTextProcessor {
|
|
@@ -410,18 +418,20 @@ fn test_clear_all_postprocessors() {
|
|
|
410
418
|
});
|
|
411
419
|
|
|
412
420
|
{
|
|
413
|
-
let mut reg = registry.write().
|
|
414
|
-
reg.register(processor1 as Arc<dyn PostProcessor>, 100)
|
|
415
|
-
|
|
421
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
422
|
+
reg.register(processor1 as Arc<dyn PostProcessor>, 100)
|
|
423
|
+
.expect("Operation failed");
|
|
424
|
+
reg.register(processor2 as Arc<dyn PostProcessor>, 100)
|
|
425
|
+
.expect("Operation failed");
|
|
416
426
|
}
|
|
417
427
|
|
|
418
428
|
{
|
|
419
|
-
let mut reg = registry.write().
|
|
420
|
-
reg.shutdown_all().
|
|
429
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
430
|
+
reg.shutdown_all().expect("Operation failed");
|
|
421
431
|
}
|
|
422
432
|
|
|
423
433
|
let list = {
|
|
424
|
-
let reg = registry.read().
|
|
434
|
+
let reg = registry.read().expect("Operation failed");
|
|
425
435
|
reg.list()
|
|
426
436
|
};
|
|
427
437
|
|
|
@@ -440,8 +450,9 @@ fn test_postprocessor_error_handling() {
|
|
|
440
450
|
});
|
|
441
451
|
|
|
442
452
|
{
|
|
443
|
-
let mut reg = registry.write().
|
|
444
|
-
reg.register(failing_processor as Arc<dyn PostProcessor>, 100)
|
|
453
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
454
|
+
reg.register(failing_processor as Arc<dyn PostProcessor>, 100)
|
|
455
|
+
.expect("Operation failed");
|
|
445
456
|
}
|
|
446
457
|
|
|
447
458
|
let config = ExtractionConfig::default();
|
|
@@ -462,8 +473,8 @@ fn test_postprocessor_error_handling() {
|
|
|
462
473
|
}
|
|
463
474
|
|
|
464
475
|
{
|
|
465
|
-
let mut reg = registry.write().
|
|
466
|
-
reg.shutdown_all().
|
|
476
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
477
|
+
reg.shutdown_all().expect("Operation failed");
|
|
467
478
|
}
|
|
468
479
|
}
|
|
469
480
|
|
|
@@ -473,8 +484,8 @@ fn test_postprocessor_invalid_name() {
|
|
|
473
484
|
let registry = get_post_processor_registry();
|
|
474
485
|
|
|
475
486
|
{
|
|
476
|
-
let mut reg = registry.write().
|
|
477
|
-
reg.shutdown_all().
|
|
487
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
488
|
+
reg.shutdown_all().expect("Operation failed");
|
|
478
489
|
}
|
|
479
490
|
|
|
480
491
|
let processor = Arc::new(AppendTextProcessor {
|
|
@@ -484,16 +495,19 @@ fn test_postprocessor_invalid_name() {
|
|
|
484
495
|
});
|
|
485
496
|
|
|
486
497
|
{
|
|
487
|
-
let mut reg = registry.write().
|
|
498
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
488
499
|
let result = reg.register(processor, 100);
|
|
489
500
|
|
|
490
501
|
assert!(result.is_err());
|
|
491
|
-
assert!(matches!(
|
|
502
|
+
assert!(matches!(
|
|
503
|
+
result.expect_err("Operation should fail"),
|
|
504
|
+
KreuzbergError::Validation { .. }
|
|
505
|
+
));
|
|
492
506
|
}
|
|
493
507
|
|
|
494
508
|
{
|
|
495
|
-
let mut reg = registry.write().
|
|
496
|
-
reg.shutdown_all().
|
|
509
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
510
|
+
reg.shutdown_all().expect("Operation failed");
|
|
497
511
|
}
|
|
498
512
|
}
|
|
499
513
|
|
|
@@ -520,10 +534,13 @@ fn test_multiple_postprocessors_execution_order() {
|
|
|
520
534
|
});
|
|
521
535
|
|
|
522
536
|
{
|
|
523
|
-
let mut reg = registry.write().
|
|
524
|
-
reg.register(early_processor as Arc<dyn PostProcessor>, 100)
|
|
525
|
-
|
|
526
|
-
reg.register(
|
|
537
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
538
|
+
reg.register(early_processor as Arc<dyn PostProcessor>, 100)
|
|
539
|
+
.expect("Operation failed");
|
|
540
|
+
reg.register(middle_processor as Arc<dyn PostProcessor>, 100)
|
|
541
|
+
.expect("Operation failed");
|
|
542
|
+
reg.register(late_processor as Arc<dyn PostProcessor>, 100)
|
|
543
|
+
.expect("Operation failed");
|
|
527
544
|
}
|
|
528
545
|
|
|
529
546
|
let config = ExtractionConfig::default();
|
|
@@ -531,15 +548,15 @@ fn test_multiple_postprocessors_execution_order() {
|
|
|
531
548
|
|
|
532
549
|
assert!(result.is_ok());
|
|
533
550
|
|
|
534
|
-
let extraction_result = result.
|
|
551
|
+
let extraction_result = result.expect("Operation failed");
|
|
535
552
|
|
|
536
553
|
assert!(extraction_result.metadata.additional.contains_key("processed_by"));
|
|
537
554
|
assert!(!extraction_result.content.chars().any(|c| c.is_lowercase()));
|
|
538
555
|
assert!(extraction_result.content.contains("[LATE]"));
|
|
539
556
|
|
|
540
557
|
{
|
|
541
|
-
let mut reg = registry.write().
|
|
542
|
-
reg.shutdown_all().
|
|
558
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
559
|
+
reg.shutdown_all().expect("Operation failed");
|
|
543
560
|
}
|
|
544
561
|
}
|
|
545
562
|
|
|
@@ -557,9 +574,9 @@ fn test_postprocessor_preserves_mime_type() {
|
|
|
557
574
|
});
|
|
558
575
|
|
|
559
576
|
{
|
|
560
|
-
let mut reg = registry.write().
|
|
577
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
561
578
|
reg.register(Arc::clone(&processor) as Arc<dyn PostProcessor>, 100)
|
|
562
|
-
.
|
|
579
|
+
.expect("Operation failed");
|
|
563
580
|
}
|
|
564
581
|
|
|
565
582
|
let config = ExtractionConfig::default();
|
|
@@ -567,11 +584,11 @@ fn test_postprocessor_preserves_mime_type() {
|
|
|
567
584
|
|
|
568
585
|
assert!(result.is_ok());
|
|
569
586
|
|
|
570
|
-
let extraction_result = result.
|
|
587
|
+
let extraction_result = result.expect("Operation failed");
|
|
571
588
|
assert_eq!(extraction_result.mime_type, "text/plain");
|
|
572
589
|
|
|
573
590
|
{
|
|
574
|
-
let mut reg = registry.write().
|
|
575
|
-
reg.shutdown_all().
|
|
591
|
+
let mut reg = registry.write().expect("Operation failed");
|
|
592
|
+
reg.shutdown_all().expect("Operation failed");
|
|
576
593
|
}
|
|
577
594
|
}
|
|
@@ -202,9 +202,9 @@ async fn test_extractor_extraction_failure() {
|
|
|
202
202
|
should_fail_extract: true,
|
|
203
203
|
});
|
|
204
204
|
|
|
205
|
-
registry.register(failing_extractor).
|
|
205
|
+
registry.register(failing_extractor).expect("Operation failed");
|
|
206
206
|
|
|
207
|
-
let extractor = registry.get("text/plain").
|
|
207
|
+
let extractor = registry.get("text/plain").expect("Value not found");
|
|
208
208
|
let config = ExtractionConfig::default();
|
|
209
209
|
let result = extractor.extract_bytes(b"test", "text/plain", &config).await;
|
|
210
210
|
|
|
@@ -227,8 +227,8 @@ fn test_extractor_duplicate_registration() {
|
|
|
227
227
|
should_fail_extract: false,
|
|
228
228
|
});
|
|
229
229
|
|
|
230
|
-
registry.register(extractor1).
|
|
231
|
-
registry.register(extractor2).
|
|
230
|
+
registry.register(extractor1).expect("Operation failed");
|
|
231
|
+
registry.register(extractor2).expect("Operation failed");
|
|
232
232
|
|
|
233
233
|
let names = registry.list();
|
|
234
234
|
assert_eq!(names.len(), 1);
|
|
@@ -255,13 +255,13 @@ fn test_extractor_concurrent_registration() {
|
|
|
255
255
|
let mut reg = registry_clone
|
|
256
256
|
.write()
|
|
257
257
|
.expect("Failed to acquire write lock on registry in test");
|
|
258
|
-
reg.register(extractor).
|
|
258
|
+
reg.register(extractor).expect("Operation failed");
|
|
259
259
|
});
|
|
260
260
|
handles.push(handle);
|
|
261
261
|
}
|
|
262
262
|
|
|
263
263
|
for handle in handles {
|
|
264
|
-
handle.join().
|
|
264
|
+
handle.join().expect("Operation failed");
|
|
265
265
|
}
|
|
266
266
|
|
|
267
267
|
let reg = registry
|
|
@@ -323,10 +323,10 @@ fn test_extractor_priority_ordering_complex() {
|
|
|
323
323
|
name: format!("priority-{}", priority),
|
|
324
324
|
priority,
|
|
325
325
|
});
|
|
326
|
-
registry.register(extractor).
|
|
326
|
+
registry.register(extractor).expect("Operation failed");
|
|
327
327
|
}
|
|
328
328
|
|
|
329
|
-
let selected = registry.get("text/plain").
|
|
329
|
+
let selected = registry.get("text/plain").expect("Value not found");
|
|
330
330
|
assert_eq!(selected.name(), "priority-100");
|
|
331
331
|
assert_eq!(selected.priority(), 100);
|
|
332
332
|
}
|
|
@@ -382,10 +382,10 @@ fn test_extractor_wildcard_vs_exact_priority() {
|
|
|
382
382
|
should_fail_extract: false,
|
|
383
383
|
});
|
|
384
384
|
|
|
385
|
-
registry.register(wildcard_arc).
|
|
386
|
-
registry.register(exact).
|
|
385
|
+
registry.register(wildcard_arc).expect("Operation failed");
|
|
386
|
+
registry.register(exact).expect("Operation failed");
|
|
387
387
|
|
|
388
|
-
let selected = registry.get("text/plain").
|
|
388
|
+
let selected = registry.get("text/plain").expect("Value not found");
|
|
389
389
|
assert_eq!(selected.name(), "exact-low");
|
|
390
390
|
}
|
|
391
391
|
|
|
@@ -420,11 +420,11 @@ fn test_extractor_list_after_partial_removal() {
|
|
|
420
420
|
should_fail_init: false,
|
|
421
421
|
should_fail_extract: false,
|
|
422
422
|
});
|
|
423
|
-
registry.register(extractor).
|
|
423
|
+
registry.register(extractor).expect("Operation failed");
|
|
424
424
|
}
|
|
425
425
|
|
|
426
|
-
registry.remove("extractor-2").
|
|
427
|
-
registry.remove("extractor-3").
|
|
426
|
+
registry.remove("extractor-2").expect("Operation failed");
|
|
427
|
+
registry.remove("extractor-3").expect("Operation failed");
|
|
428
428
|
|
|
429
429
|
let names = registry.list();
|
|
430
430
|
assert_eq!(names.len(), 3);
|
|
@@ -452,9 +452,9 @@ async fn test_processor_execution_order_within_stage() {
|
|
|
452
452
|
stage: ProcessingStage::Early,
|
|
453
453
|
});
|
|
454
454
|
|
|
455
|
-
registry.register(low, 10).
|
|
456
|
-
registry.register(high, 100).
|
|
457
|
-
registry.register(medium, 50).
|
|
455
|
+
registry.register(low, 10).expect("Operation failed");
|
|
456
|
+
registry.register(high, 100).expect("Operation failed");
|
|
457
|
+
registry.register(medium, 50).expect("Operation failed");
|
|
458
458
|
|
|
459
459
|
let processors = registry.get_for_stage(ProcessingStage::Early);
|
|
460
460
|
assert_eq!(processors.len(), 3);
|
|
@@ -474,7 +474,10 @@ async fn test_processor_execution_order_within_stage() {
|
|
|
474
474
|
|
|
475
475
|
let config = ExtractionConfig::default();
|
|
476
476
|
for processor in processors {
|
|
477
|
-
processor
|
|
477
|
+
processor
|
|
478
|
+
.process(&mut result, &config)
|
|
479
|
+
.await
|
|
480
|
+
.expect("Async operation failed");
|
|
478
481
|
}
|
|
479
482
|
|
|
480
483
|
assert_eq!(result.content, "start [high] [medium] [low]");
|
|
@@ -488,7 +491,7 @@ async fn test_processor_error_propagation() {
|
|
|
488
491
|
name: "failing".to_string(),
|
|
489
492
|
});
|
|
490
493
|
|
|
491
|
-
registry.register(failing, 50).
|
|
494
|
+
registry.register(failing, 50).expect("Operation failed");
|
|
492
495
|
|
|
493
496
|
let processors = registry.get_for_stage(ProcessingStage::Early);
|
|
494
497
|
assert_eq!(processors.len(), 1);
|
|
@@ -531,9 +534,9 @@ fn test_processor_multiple_stages() {
|
|
|
531
534
|
stage: ProcessingStage::Late,
|
|
532
535
|
});
|
|
533
536
|
|
|
534
|
-
registry.register(early, 50).
|
|
535
|
-
registry.register(middle, 50).
|
|
536
|
-
registry.register(late, 50).
|
|
537
|
+
registry.register(early, 50).expect("Operation failed");
|
|
538
|
+
registry.register(middle, 50).expect("Operation failed");
|
|
539
|
+
registry.register(late, 50).expect("Operation failed");
|
|
537
540
|
|
|
538
541
|
assert_eq!(registry.get_for_stage(ProcessingStage::Early).len(), 1);
|
|
539
542
|
assert_eq!(registry.get_for_stage(ProcessingStage::Middle).len(), 1);
|
|
@@ -593,8 +596,8 @@ fn test_processor_same_priority_same_stage() {
|
|
|
593
596
|
stage: ProcessingStage::Early,
|
|
594
597
|
});
|
|
595
598
|
|
|
596
|
-
registry.register(proc1, 50).
|
|
597
|
-
registry.register(proc2, 50).
|
|
599
|
+
registry.register(proc1, 50).expect("Operation failed");
|
|
600
|
+
registry.register(proc2, 50).expect("Operation failed");
|
|
598
601
|
|
|
599
602
|
let processors = registry.get_for_stage(ProcessingStage::Early);
|
|
600
603
|
assert_eq!(processors.len(), 2);
|
|
@@ -609,10 +612,10 @@ fn test_processor_remove_from_specific_stage() {
|
|
|
609
612
|
stage: ProcessingStage::Early,
|
|
610
613
|
});
|
|
611
614
|
|
|
612
|
-
registry.register(early, 50).
|
|
615
|
+
registry.register(early, 50).expect("Operation failed");
|
|
613
616
|
assert_eq!(registry.get_for_stage(ProcessingStage::Early).len(), 1);
|
|
614
617
|
|
|
615
|
-
registry.remove("processor").
|
|
618
|
+
registry.remove("processor").expect("Operation failed");
|
|
616
619
|
assert_eq!(registry.get_for_stage(ProcessingStage::Early).len(), 0);
|
|
617
620
|
}
|
|
618
621
|
|
|
@@ -625,7 +628,7 @@ fn test_processor_list_across_stages() {
|
|
|
625
628
|
name: format!("{:?}-processor", stage),
|
|
626
629
|
stage,
|
|
627
630
|
});
|
|
628
|
-
registry.register(processor, 50).
|
|
631
|
+
registry.register(processor, 50).expect("Operation failed");
|
|
629
632
|
}
|
|
630
633
|
|
|
631
634
|
let names = registry.list();
|
|
@@ -641,10 +644,10 @@ fn test_processor_shutdown_clears_all_stages() {
|
|
|
641
644
|
name: format!("{:?}-processor", stage),
|
|
642
645
|
stage,
|
|
643
646
|
});
|
|
644
|
-
registry.register(processor, 50).
|
|
647
|
+
registry.register(processor, 50).expect("Operation failed");
|
|
645
648
|
}
|
|
646
649
|
|
|
647
|
-
registry.shutdown_all().
|
|
650
|
+
registry.shutdown_all().expect("Operation failed");
|
|
648
651
|
|
|
649
652
|
assert_eq!(registry.get_for_stage(ProcessingStage::Early).len(), 0);
|
|
650
653
|
assert_eq!(registry.get_for_stage(ProcessingStage::Middle).len(), 0);
|
|
@@ -660,7 +663,7 @@ async fn test_validator_content_validation() {
|
|
|
660
663
|
min_length: 10,
|
|
661
664
|
});
|
|
662
665
|
|
|
663
|
-
registry.register(strict).
|
|
666
|
+
registry.register(strict).expect("Operation failed");
|
|
664
667
|
|
|
665
668
|
let validators = registry.get_all();
|
|
666
669
|
assert_eq!(validators.len(), 1);
|
|
@@ -791,9 +794,9 @@ fn test_validator_priority_ordering() {
|
|
|
791
794
|
let low = Arc::new(LowPriorityValidator);
|
|
792
795
|
let high_priority = Arc::new(HighPriorityValidator);
|
|
793
796
|
|
|
794
|
-
registry.register(medium).
|
|
795
|
-
registry.register(low).
|
|
796
|
-
registry.register(high_priority).
|
|
797
|
+
registry.register(medium).expect("Operation failed");
|
|
798
|
+
registry.register(low).expect("Operation failed");
|
|
799
|
+
registry.register(high_priority).expect("Operation failed");
|
|
797
800
|
|
|
798
801
|
let validators = registry.get_all();
|
|
799
802
|
assert_eq!(validators.len(), 3);
|
|
@@ -857,13 +860,13 @@ fn test_validator_remove_and_reregister() {
|
|
|
857
860
|
min_length: 5,
|
|
858
861
|
});
|
|
859
862
|
|
|
860
|
-
registry.register(Arc::clone(&validator)).
|
|
863
|
+
registry.register(Arc::clone(&validator)).expect("Operation failed");
|
|
861
864
|
assert_eq!(registry.get_all().len(), 1);
|
|
862
865
|
|
|
863
|
-
registry.remove("validator").
|
|
866
|
+
registry.remove("validator").expect("Operation failed");
|
|
864
867
|
assert_eq!(registry.get_all().len(), 0);
|
|
865
868
|
|
|
866
|
-
registry.register(validator).
|
|
869
|
+
registry.register(validator).expect("Operation failed");
|
|
867
870
|
assert_eq!(registry.get_all().len(), 1);
|
|
868
871
|
}
|
|
869
872
|
|
|
@@ -890,9 +893,9 @@ fn test_multiple_registries_independence() {
|
|
|
890
893
|
min_length: 5,
|
|
891
894
|
});
|
|
892
895
|
|
|
893
|
-
extractor_registry.register(extractor).
|
|
894
|
-
processor_registry.register(processor, 50).
|
|
895
|
-
validator_registry.register(validator).
|
|
896
|
+
extractor_registry.register(extractor).expect("Operation failed");
|
|
897
|
+
processor_registry.register(processor, 50).expect("Operation failed");
|
|
898
|
+
validator_registry.register(validator).expect("Operation failed");
|
|
896
899
|
|
|
897
900
|
assert_eq!(ocr_registry.list().len(), 0);
|
|
898
901
|
assert_eq!(extractor_registry.list().len(), 1);
|
|
@@ -923,14 +926,14 @@ fn test_shutdown_all_registries() {
|
|
|
923
926
|
min_length: 5,
|
|
924
927
|
});
|
|
925
928
|
|
|
926
|
-
extractor_registry.register(extractor).
|
|
927
|
-
processor_registry.register(processor, 50).
|
|
928
|
-
validator_registry.register(validator).
|
|
929
|
+
extractor_registry.register(extractor).expect("Operation failed");
|
|
930
|
+
processor_registry.register(processor, 50).expect("Operation failed");
|
|
931
|
+
validator_registry.register(validator).expect("Operation failed");
|
|
929
932
|
|
|
930
|
-
ocr_registry.shutdown_all().
|
|
931
|
-
extractor_registry.shutdown_all().
|
|
932
|
-
processor_registry.shutdown_all().
|
|
933
|
-
validator_registry.shutdown_all().
|
|
933
|
+
ocr_registry.shutdown_all().expect("Operation failed");
|
|
934
|
+
extractor_registry.shutdown_all().expect("Operation failed");
|
|
935
|
+
processor_registry.shutdown_all().expect("Operation failed");
|
|
936
|
+
validator_registry.shutdown_all().expect("Operation failed");
|
|
934
937
|
|
|
935
938
|
assert_eq!(ocr_registry.list().len(), 0);
|
|
936
939
|
assert_eq!(extractor_registry.list().len(), 0);
|