kreuzberg 4.2.0 → 4.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
  5. data/lib/kreuzberg/cli.rb +16 -6
  6. data/lib/kreuzberg/cli_proxy.rb +3 -1
  7. data/lib/kreuzberg/config.rb +59 -28
  8. data/lib/kreuzberg/djot_content.rb +225 -0
  9. data/lib/kreuzberg/extraction_api.rb +20 -4
  10. data/lib/kreuzberg/result.rb +12 -2
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/lib/kreuzberg.rb +1 -0
  13. data/sig/kreuzberg.rbs +23 -11
  14. data/spec/binding/batch_spec.rb +6 -5
  15. data/spec/binding/config_spec.rb +1 -1
  16. data/spec/binding/error_recovery_spec.rb +3 -3
  17. data/spec/binding/tables_spec.rb +11 -2
  18. data/spec/unit/config/extraction_config_spec.rb +2 -2
  19. data/spec/unit/config/output_format_spec.rb +18 -18
  20. data/vendor/Cargo.toml +1 -1
  21. data/vendor/kreuzberg/Cargo.toml +3 -2
  22. data/vendor/kreuzberg/README.md +1 -1
  23. data/vendor/kreuzberg/src/api/error.rs +60 -0
  24. data/vendor/kreuzberg/src/api/handlers.rs +153 -32
  25. data/vendor/kreuzberg/src/api/mod.rs +2 -0
  26. data/vendor/kreuzberg/src/api/openapi.rs +141 -0
  27. data/vendor/kreuzberg/src/api/router.rs +24 -2
  28. data/vendor/kreuzberg/src/api/startup.rs +21 -1
  29. data/vendor/kreuzberg/src/api/types.rs +50 -4
  30. data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
  31. data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
  32. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
  33. data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
  34. data/vendor/kreuzberg/src/core/io.rs +7 -7
  35. data/vendor/kreuzberg/src/core/mime.rs +4 -4
  36. data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
  37. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
  38. data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
  39. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
  40. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
  41. data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
  42. data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
  43. data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
  44. data/vendor/kreuzberg/tests/config_behavioral.rs +14 -12
  45. data/vendor/kreuzberg/tests/core_integration.rs +2 -4
  46. data/vendor/kreuzberg/tests/mime_detection.rs +3 -2
  47. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +284 -1
  48. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
  49. data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
  50. data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
  51. data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
  52. data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
  53. data/vendor/kreuzberg-ffi/src/types.rs +8 -5
  54. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  55. metadata +5 -2
@@ -43,9 +43,26 @@ impl DocumentExtractorRegistry {
43
43
  let priority = extractor.priority();
44
44
  let mime_types: Vec<String> = extractor.supported_mime_types().iter().map(|s| s.to_string()).collect();
45
45
 
46
- super::validate_plugin_name(&name)?;
46
+ if let Err(e) = super::validate_plugin_name(&name) {
47
+ tracing::warn!(
48
+ "Failed to validate document extractor name '{}': {}. \
49
+ Registration aborted. Plugin names must be non-empty and contain only alphanumeric characters, hyphens, and underscores.",
50
+ name,
51
+ e
52
+ );
53
+ return Err(e);
54
+ }
47
55
 
48
- extractor.initialize()?;
56
+ if let Err(e) = extractor.initialize() {
57
+ tracing::error!(
58
+ "Failed to initialize document extractor '{}': {}. \
59
+ Extraction for MIME types {:?} will be unavailable.",
60
+ name,
61
+ e,
62
+ mime_types
63
+ );
64
+ return Err(e);
65
+ }
49
66
 
50
67
  let mut index_entries = Vec::new();
51
68
 
@@ -57,7 +74,13 @@ impl DocumentExtractorRegistry {
57
74
  index_entries.push((mime_type.clone(), priority));
58
75
  }
59
76
 
60
- self.name_index.insert(name, index_entries);
77
+ self.name_index.insert(name.clone(), index_entries);
78
+ tracing::debug!(
79
+ "Registered document extractor '{}' with priority {} for MIME types: {:?}",
80
+ name,
81
+ priority,
82
+ mime_types
83
+ );
61
84
 
62
85
  Ok(())
63
86
  }
@@ -128,7 +151,13 @@ impl DocumentExtractorRegistry {
128
151
  pub fn remove(&mut self, name: &str) -> Result<()> {
129
152
  let index_entries = match self.name_index.remove(name) {
130
153
  Some(entries) => entries,
131
- None => return Ok(()),
154
+ None => {
155
+ tracing::debug!(
156
+ "Document extractor '{}' not found in registry (already removed or never registered)",
157
+ name
158
+ );
159
+ return Ok(());
160
+ }
132
161
  };
133
162
 
134
163
  let mut extractor_to_shutdown: Option<Arc<dyn DocumentExtractor>> = None;
@@ -148,7 +177,16 @@ impl DocumentExtractorRegistry {
148
177
  }
149
178
 
150
179
  if let Some(extractor) = extractor_to_shutdown {
151
- extractor.shutdown()?;
180
+ if let Err(e) = extractor.shutdown() {
181
+ tracing::warn!(
182
+ "Failed to shutdown document extractor '{}': {}. \
183
+ Resources may not have been properly released.",
184
+ name,
185
+ e
186
+ );
187
+ return Err(e);
188
+ }
189
+ tracing::debug!("Successfully removed and shut down document extractor '{}'", name);
152
190
  }
153
191
 
154
192
  Ok(())
@@ -157,9 +195,19 @@ impl DocumentExtractorRegistry {
157
195
  /// Shutdown all extractors and clear the registry.
158
196
  pub fn shutdown_all(&mut self) -> Result<()> {
159
197
  let names = self.list();
198
+ let count = names.len();
199
+
200
+ if count > 0 {
201
+ tracing::debug!("Shutting down {} document extractors", count);
202
+ }
203
+
160
204
  for name in names {
161
205
  self.remove(&name)?;
162
206
  }
207
+
208
+ if count > 0 {
209
+ tracing::debug!("Successfully shut down all {} document extractors", count);
210
+ }
163
211
  Ok(())
164
212
  }
165
213
  }
@@ -413,4 +461,202 @@ mod tests {
413
461
  assert_eq!(registry.get("text/markdown").unwrap().name(), "multi-extractor");
414
462
  assert_eq!(registry.get("text/html").unwrap().name(), "multi-extractor");
415
463
  }
464
+
465
+ struct FailingExtractor {
466
+ name: String,
467
+ fail_on_init: bool,
468
+ }
469
+
470
+ impl Plugin for FailingExtractor {
471
+ fn name(&self) -> &str {
472
+ &self.name
473
+ }
474
+ fn version(&self) -> String {
475
+ "1.0.0".to_string()
476
+ }
477
+ fn initialize(&self) -> Result<()> {
478
+ if self.fail_on_init {
479
+ Err(KreuzbergError::Plugin {
480
+ message: "Extractor initialization failed".to_string(),
481
+ plugin_name: self.name.clone(),
482
+ })
483
+ } else {
484
+ Ok(())
485
+ }
486
+ }
487
+ fn shutdown(&self) -> Result<()> {
488
+ Ok(())
489
+ }
490
+ }
491
+
492
+ #[async_trait]
493
+ impl DocumentExtractor for FailingExtractor {
494
+ async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> {
495
+ Ok(ExtractionResult {
496
+ content: "test".to_string(),
497
+ mime_type: "text/plain".to_string(),
498
+ metadata: crate::types::Metadata::default(),
499
+ tables: vec![],
500
+ detected_languages: None,
501
+ chunks: None,
502
+ images: None,
503
+ djot_content: None,
504
+ pages: None,
505
+ elements: None,
506
+ })
507
+ }
508
+
509
+ fn supported_mime_types(&self) -> &[&str] {
510
+ &["text/plain"]
511
+ }
512
+
513
+ fn priority(&self) -> i32 {
514
+ 50
515
+ }
516
+ }
517
+
518
+ #[test]
519
+ fn test_document_extractor_initialization_failure_logs_error() {
520
+ let mut registry = DocumentExtractorRegistry::new();
521
+
522
+ let extractor = Arc::new(FailingExtractor {
523
+ name: "failing-extractor".to_string(),
524
+ fail_on_init: true,
525
+ });
526
+
527
+ let result = registry.register(extractor);
528
+ assert!(result.is_err());
529
+ assert_eq!(registry.list().len(), 0);
530
+ }
531
+
532
+ #[test]
533
+ fn test_document_extractor_invalid_name_empty_logs_warning() {
534
+ let mut registry = DocumentExtractorRegistry::new();
535
+
536
+ let extractor = Arc::new(MockExtractor {
537
+ name: "".to_string(),
538
+ mime_types: &["text/plain"],
539
+ priority: 50,
540
+ });
541
+
542
+ let result = registry.register(extractor);
543
+ assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
544
+ }
545
+
546
+ #[test]
547
+ fn test_document_extractor_invalid_name_with_spaces_logs_warning() {
548
+ let mut registry = DocumentExtractorRegistry::new();
549
+
550
+ let extractor = Arc::new(MockExtractor {
551
+ name: "invalid extractor".to_string(),
552
+ mime_types: &["text/plain"],
553
+ priority: 50,
554
+ });
555
+
556
+ let result = registry.register(extractor);
557
+ assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
558
+ }
559
+
560
+ #[test]
561
+ fn test_document_extractor_successful_registration_logs_debug() {
562
+ let mut registry = DocumentExtractorRegistry::new();
563
+
564
+ let extractor = Arc::new(MockExtractor {
565
+ name: "valid-pdf-extractor".to_string(),
566
+ mime_types: &["application/pdf"],
567
+ priority: 100,
568
+ });
569
+
570
+ let result = registry.register(extractor);
571
+ assert!(result.is_ok());
572
+ assert_eq!(registry.list().len(), 1);
573
+ }
574
+
575
+ #[test]
576
+ fn test_document_extractor_remove_nonexistent_logs_debug() {
577
+ let mut registry = DocumentExtractorRegistry::new();
578
+
579
+ let result = registry.remove("nonexistent-extractor");
580
+ assert!(result.is_ok());
581
+ assert_eq!(registry.list().len(), 0);
582
+ }
583
+
584
+ #[test]
585
+ fn test_document_extractor_shutdown_empty_registry() {
586
+ let mut registry = DocumentExtractorRegistry::new();
587
+ let result = registry.shutdown_all();
588
+ assert!(result.is_ok());
589
+ assert_eq!(registry.list().len(), 0);
590
+ }
591
+
592
+ #[test]
593
+ fn test_document_extractor_shutdown_with_multiple_extractors() {
594
+ let mut registry = DocumentExtractorRegistry::new();
595
+
596
+ let extractor1 = Arc::new(MockExtractor {
597
+ name: "extractor1".to_string(),
598
+ mime_types: &["text/plain"],
599
+ priority: 50,
600
+ });
601
+
602
+ let extractor2 = Arc::new(MockExtractor {
603
+ name: "extractor2".to_string(),
604
+ mime_types: &["application/pdf"],
605
+ priority: 100,
606
+ });
607
+
608
+ let extractor3 = Arc::new(MockExtractor {
609
+ name: "extractor3".to_string(),
610
+ mime_types: &["image/png"],
611
+ priority: 75,
612
+ });
613
+
614
+ registry.register(extractor1).unwrap();
615
+ registry.register(extractor2).unwrap();
616
+ registry.register(extractor3).unwrap();
617
+
618
+ assert_eq!(registry.list().len(), 3);
619
+
620
+ registry.shutdown_all().unwrap();
621
+ assert_eq!(registry.list().len(), 0);
622
+ }
623
+
624
+ #[test]
625
+ fn test_document_extractor_priority_ordering_complex() {
626
+ let mut registry = DocumentExtractorRegistry::new();
627
+
628
+ let extractors = vec![
629
+ (
630
+ Arc::new(MockExtractor {
631
+ name: "priority-1".to_string(),
632
+ mime_types: &["application/pdf"],
633
+ priority: 1,
634
+ }),
635
+ 1,
636
+ ),
637
+ (
638
+ Arc::new(MockExtractor {
639
+ name: "priority-100".to_string(),
640
+ mime_types: &["application/pdf"],
641
+ priority: 100,
642
+ }),
643
+ 100,
644
+ ),
645
+ (
646
+ Arc::new(MockExtractor {
647
+ name: "priority-50".to_string(),
648
+ mime_types: &["application/pdf"],
649
+ priority: 50,
650
+ }),
651
+ 50,
652
+ ),
653
+ ];
654
+
655
+ for (extractor, _priority) in &extractors {
656
+ registry.register(extractor.clone()).unwrap();
657
+ }
658
+
659
+ let retrieved = registry.get("application/pdf").unwrap();
660
+ assert_eq!(retrieved.name(), "priority-100");
661
+ }
416
662
  }
@@ -31,6 +31,8 @@ impl OcrBackendRegistry {
31
31
  /// Create a new OCR backend registry with default backends.
32
32
  ///
33
33
  /// Registers the Tesseract backend by default if the "ocr" feature is enabled.
34
+ /// Logs warnings if backend initialization fails (common in containerized environments
35
+ /// with missing dependencies or permission issues).
34
36
  pub fn new() -> Self {
35
37
  #[cfg(feature = "ocr")]
36
38
  let mut registry = Self {
@@ -45,8 +47,27 @@ impl OcrBackendRegistry {
45
47
  #[cfg(feature = "ocr")]
46
48
  {
47
49
  use crate::ocr::tesseract_backend::TesseractBackend;
48
- if let Ok(backend) = TesseractBackend::new() {
49
- let _ = registry.register(Arc::new(backend));
50
+ match TesseractBackend::new() {
51
+ Ok(backend) => {
52
+ if let Err(e) = registry.register(Arc::new(backend)) {
53
+ tracing::error!(
54
+ "Failed to register Tesseract OCR backend: {}. \
55
+ OCR functionality will be unavailable. \
56
+ Check TESSDATA_PREFIX environment variable and tessdata file permissions.",
57
+ e
58
+ );
59
+ }
60
+ }
61
+ Err(e) => {
62
+ tracing::warn!(
63
+ "Tesseract OCR backend initialization failed: {}. \
64
+ OCR functionality will be unavailable. \
65
+ Common causes: missing TESSDATA_PREFIX env var, \
66
+ tessdata files not found, or permission issues in containerized environments. \
67
+ See https://docs.kreuzberg.dev/guides/docker/ for Kubernetes troubleshooting.",
68
+ e
69
+ );
70
+ }
50
71
  }
51
72
  }
52
73
 
@@ -290,4 +311,131 @@ mod tests {
290
311
  registry.shutdown_all().unwrap();
291
312
  assert_eq!(registry.list().len(), 0);
292
313
  }
314
+
315
+ struct FailingOcrBackend {
316
+ name: String,
317
+ fail_on_init: bool,
318
+ }
319
+
320
+ impl Plugin for FailingOcrBackend {
321
+ fn name(&self) -> &str {
322
+ &self.name
323
+ }
324
+ fn version(&self) -> String {
325
+ "1.0.0".to_string()
326
+ }
327
+ fn initialize(&self) -> Result<()> {
328
+ if self.fail_on_init {
329
+ Err(KreuzbergError::Plugin {
330
+ message: "Backend initialization failed".to_string(),
331
+ plugin_name: self.name.clone(),
332
+ })
333
+ } else {
334
+ Ok(())
335
+ }
336
+ }
337
+ fn shutdown(&self) -> Result<()> {
338
+ Ok(())
339
+ }
340
+ }
341
+
342
+ #[async_trait]
343
+ impl OcrBackend for FailingOcrBackend {
344
+ async fn process_image(&self, _: &[u8], _: &OcrConfig) -> Result<ExtractionResult> {
345
+ Ok(ExtractionResult {
346
+ content: "test".to_string(),
347
+ mime_type: "text/plain".to_string(),
348
+ metadata: crate::types::Metadata::default(),
349
+ tables: vec![],
350
+ detected_languages: None,
351
+ chunks: None,
352
+ images: None,
353
+ djot_content: None,
354
+ pages: None,
355
+ elements: None,
356
+ })
357
+ }
358
+
359
+ fn supports_language(&self, _lang: &str) -> bool {
360
+ false
361
+ }
362
+
363
+ fn backend_type(&self) -> crate::plugins::ocr::OcrBackendType {
364
+ crate::plugins::ocr::OcrBackendType::Custom
365
+ }
366
+ }
367
+
368
+ #[test]
369
+ fn test_ocr_backend_initialization_failure_logs_error() {
370
+ let mut registry = OcrBackendRegistry::new_empty();
371
+
372
+ let backend = Arc::new(FailingOcrBackend {
373
+ name: "failing-ocr".to_string(),
374
+ fail_on_init: true,
375
+ });
376
+
377
+ let result = registry.register(backend);
378
+ assert!(result.is_err());
379
+ assert_eq!(registry.list().len(), 0);
380
+ }
381
+
382
+ #[test]
383
+ fn test_ocr_backend_invalid_name_empty_logs_warning() {
384
+ let mut registry = OcrBackendRegistry::new_empty();
385
+
386
+ let backend = Arc::new(MockOcrBackend {
387
+ name: "".to_string(),
388
+ languages: vec!["eng".to_string()],
389
+ });
390
+
391
+ let result = registry.register(backend);
392
+ assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
393
+ }
394
+
395
+ #[test]
396
+ fn test_ocr_backend_invalid_name_with_spaces_logs_warning() {
397
+ let mut registry = OcrBackendRegistry::new_empty();
398
+
399
+ let backend = Arc::new(MockOcrBackend {
400
+ name: "invalid ocr backend".to_string(),
401
+ languages: vec!["eng".to_string()],
402
+ });
403
+
404
+ let result = registry.register(backend);
405
+ assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
406
+ }
407
+
408
+ #[test]
409
+ fn test_ocr_backend_successful_registration_logs_debug() {
410
+ let mut registry = OcrBackendRegistry::new_empty();
411
+
412
+ let backend = Arc::new(MockOcrBackend {
413
+ name: "valid-ocr".to_string(),
414
+ languages: vec!["eng".to_string()],
415
+ });
416
+
417
+ let result = registry.register(backend);
418
+ assert!(result.is_ok());
419
+ assert_eq!(registry.list().len(), 1);
420
+ }
421
+
422
+ #[test]
423
+ fn test_ocr_backend_multiple_registrations() {
424
+ let mut registry = OcrBackendRegistry::new_empty();
425
+
426
+ let backend1 = Arc::new(MockOcrBackend {
427
+ name: "ocr-backend-1".to_string(),
428
+ languages: vec!["eng".to_string()],
429
+ });
430
+
431
+ let backend2 = Arc::new(MockOcrBackend {
432
+ name: "ocr-backend-2".to_string(),
433
+ languages: vec!["deu".to_string()],
434
+ });
435
+
436
+ registry.register(backend1).unwrap();
437
+ registry.register(backend2).unwrap();
438
+
439
+ assert_eq!(registry.list().len(), 2);
440
+ }
293
441
  }