kreuzberg 4.1.2 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
  5. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  6. data/kreuzberg.gemspec +13 -1
  7. data/lib/kreuzberg/cli.rb +16 -6
  8. data/lib/kreuzberg/cli_proxy.rb +3 -1
  9. data/lib/kreuzberg/config.rb +121 -39
  10. data/lib/kreuzberg/djot_content.rb +225 -0
  11. data/lib/kreuzberg/extraction_api.rb +20 -4
  12. data/lib/kreuzberg/result.rb +12 -2
  13. data/lib/kreuzberg/version.rb +1 -1
  14. data/lib/kreuzberg.rb +1 -0
  15. data/sig/kreuzberg.rbs +28 -12
  16. data/spec/binding/batch_operations_spec.rb +80 -0
  17. data/spec/binding/batch_spec.rb +6 -5
  18. data/spec/binding/error_recovery_spec.rb +3 -3
  19. data/spec/binding/metadata_types_spec.rb +77 -57
  20. data/spec/binding/tables_spec.rb +11 -2
  21. data/spec/serialization_spec.rb +134 -0
  22. data/spec/unit/config/output_format_spec.rb +380 -0
  23. data/vendor/Cargo.toml +1 -1
  24. data/vendor/kreuzberg/Cargo.toml +1 -1
  25. data/vendor/kreuzberg/README.md +1 -1
  26. data/vendor/kreuzberg/src/api/startup.rs +15 -1
  27. data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
  28. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
  29. data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
  30. data/vendor/kreuzberg/src/core/io.rs +7 -7
  31. data/vendor/kreuzberg/src/core/mime.rs +4 -4
  32. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  33. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
  34. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  35. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  36. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  37. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  38. data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
  39. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
  40. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
  41. data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
  42. data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
  43. data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
  44. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  45. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  46. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  47. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  48. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  49. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  50. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  51. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  52. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  53. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  54. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  55. data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
  56. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  57. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  58. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  59. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  60. data/vendor/kreuzberg/tests/core_integration.rs +57 -57
  61. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  62. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  63. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  64. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  65. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  66. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  67. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  68. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  69. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  70. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  71. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  72. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  73. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  74. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  75. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  76. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  77. data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
  78. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  79. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  80. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  81. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  82. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  83. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  84. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  85. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  86. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  87. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  88. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  89. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  90. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  91. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  92. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
  93. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  94. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  95. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  96. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  97. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  98. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  99. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  100. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  101. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  102. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  103. metadata +12 -2
@@ -80,7 +80,8 @@ impl KreuzbergMcp {
80
80
  use super::format::{build_config, format_extraction_result};
81
81
  use crate::{extract_file, extract_file_sync};
82
82
 
83
- let config = build_config(&self.default_config, params.enable_ocr, params.force_ocr);
83
+ let config =
84
+ build_config(&self.default_config, params.config).map_err(|e| rmcp::ErrorData::invalid_params(e, None))?;
84
85
 
85
86
  let result = if params.r#async {
86
87
  extract_file(&params.path, params.mime_type.as_deref(), &config)
@@ -114,7 +115,8 @@ impl KreuzbergMcp {
114
115
  .decode(&params.data)
115
116
  .map_err(|e| rmcp::ErrorData::invalid_params(format!("Invalid base64: {}", e), None))?;
116
117
 
117
- let config = build_config(&self.default_config, params.enable_ocr, params.force_ocr);
118
+ let config =
119
+ build_config(&self.default_config, params.config).map_err(|e| rmcp::ErrorData::invalid_params(e, None))?;
118
120
 
119
121
  let mime_type = params.mime_type.as_deref().unwrap_or("");
120
122
 
@@ -145,7 +147,8 @@ impl KreuzbergMcp {
145
147
  use super::format::{build_config, format_extraction_result};
146
148
  use crate::{batch_extract_file, batch_extract_file_sync};
147
149
 
148
- let config = build_config(&self.default_config, params.enable_ocr, params.force_ocr);
150
+ let config =
151
+ build_config(&self.default_config, params.config).map_err(|e| rmcp::ErrorData::invalid_params(e, None))?;
149
152
 
150
153
  let results = if params.r#async {
151
154
  batch_extract_file(params.paths.clone(), &config)
@@ -30,7 +30,8 @@ pub(in crate::mcp) trait ExtractionTool {
30
30
  &self,
31
31
  Parameters(params): Parameters<ExtractFileParams>,
32
32
  ) -> Result<CallToolResult, McpError> {
33
- let config = build_config(self.default_config(), params.enable_ocr, params.force_ocr);
33
+ let config = build_config(self.default_config(), params.config)
34
+ .map_err(|e| McpError::invalid_params(e, None))?;
34
35
 
35
36
  let result = if params.r#async {
36
37
  extract_file(&params.path, params.mime_type.as_deref(), &config)
@@ -59,7 +60,8 @@ pub(in crate::mcp) trait ExtractionTool {
59
60
  .decode(&params.data)
60
61
  .map_err(|e| McpError::invalid_params(format!("Invalid base64: {}", e), None))?;
61
62
 
62
- let config = build_config(self.default_config(), params.enable_ocr, params.force_ocr);
63
+ let config = build_config(self.default_config(), params.config)
64
+ .map_err(|e| McpError::invalid_params(e, None))?;
63
65
 
64
66
  let mime_type = params.mime_type.as_deref().unwrap_or("");
65
67
 
@@ -86,7 +88,8 @@ pub(in crate::mcp) trait ExtractionTool {
86
88
  &self,
87
89
  Parameters(params): Parameters<BatchExtractFilesParams>,
88
90
  ) -> Result<CallToolResult, McpError> {
89
- let config = build_config(self.default_config(), params.enable_ocr, params.force_ocr);
91
+ let config = build_config(self.default_config(), params.config)
92
+ .map_err(|e| McpError::invalid_params(e, None))?;
90
93
 
91
94
  let results = if params.r#async {
92
95
  batch_extract_file(params.paths.clone(), &config)
@@ -153,8 +156,7 @@ mod tests {
153
156
  let params = ExtractFileParams {
154
157
  path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
155
158
  mime_type: None,
156
- enable_ocr: false,
157
- force_ocr: false,
159
+ config: None,
158
160
  r#async: true,
159
161
  };
160
162
 
@@ -181,8 +183,7 @@ mod tests {
181
183
  let params = ExtractFileParams {
182
184
  path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
183
185
  mime_type: None,
184
- enable_ocr: false,
185
- force_ocr: false,
186
+ config: None,
186
187
  r#async: true,
187
188
  };
188
189
 
@@ -208,8 +209,7 @@ mod tests {
208
209
  let params = ExtractFileParams {
209
210
  path: "/nonexistent/file.pdf".to_string(),
210
211
  mime_type: None,
211
- enable_ocr: false,
212
- force_ocr: false,
212
+ config: None,
213
213
  r#async: true,
214
214
  };
215
215
 
@@ -226,8 +226,7 @@ mod tests {
226
226
  let params = ExtractFileParams {
227
227
  path: get_test_path("pdfs_with_tables/tiny.pdf").to_string(),
228
228
  mime_type: Some("application/pdf".to_string()),
229
- enable_ocr: false,
230
- force_ocr: false,
229
+ config: None,
231
230
  r#async: true,
232
231
  };
233
232
 
@@ -246,8 +245,7 @@ mod tests {
246
245
  let params = ExtractBytesParams {
247
246
  data: encoded,
248
247
  mime_type: Some("text/plain".to_string()),
249
- enable_ocr: false,
250
- force_ocr: false,
248
+ config: None,
251
249
  r#async: true,
252
250
  };
253
251
 
@@ -274,8 +272,7 @@ mod tests {
274
272
  let params = ExtractBytesParams {
275
273
  data: "not-valid-base64!!!".to_string(),
276
274
  mime_type: None,
277
- enable_ocr: false,
278
- force_ocr: false,
275
+ config: None,
279
276
  r#async: true,
280
277
  };
281
278
 
@@ -292,8 +289,7 @@ mod tests {
292
289
  let server = TestMcpServer::new();
293
290
  let params = BatchExtractFilesParams {
294
291
  paths: vec![get_test_path("pdfs_with_tables/tiny.pdf").to_string()],
295
- enable_ocr: false,
296
- force_ocr: false,
292
+ config: None,
297
293
  r#async: true,
298
294
  };
299
295
 
@@ -319,8 +315,7 @@ mod tests {
319
315
  let server = TestMcpServer::new();
320
316
  let params = BatchExtractFilesParams {
321
317
  paths: vec![],
322
- enable_ocr: false,
323
- force_ocr: false,
318
+ config: None,
324
319
  r#async: true,
325
320
  };
326
321
 
@@ -350,8 +345,7 @@ mod tests {
350
345
  let params = ExtractFileParams {
351
346
  path: test_file.to_string(),
352
347
  mime_type: None,
353
- enable_ocr: false,
354
- force_ocr: false,
348
+ config: None,
355
349
  r#async: true,
356
350
  };
357
351
 
@@ -378,8 +372,7 @@ mod tests {
378
372
  if std::path::Path::new(&file1).exists() && std::path::Path::new(&file2).exists() {
379
373
  let params = BatchExtractFilesParams {
380
374
  paths: vec![file1.to_string(), file2.to_string()],
381
- enable_ocr: false,
382
- force_ocr: false,
375
+ config: None,
383
376
  r#async: true,
384
377
  };
385
378
 
@@ -206,6 +206,7 @@ mod extractor;
206
206
  mod ocr;
207
207
  mod processor;
208
208
  pub mod registry;
209
+ pub mod startup_validation;
209
210
  mod traits;
210
211
  mod validator;
211
212
 
@@ -43,9 +43,26 @@ impl DocumentExtractorRegistry {
43
43
  let priority = extractor.priority();
44
44
  let mime_types: Vec<String> = extractor.supported_mime_types().iter().map(|s| s.to_string()).collect();
45
45
 
46
- super::validate_plugin_name(&name)?;
46
+ if let Err(e) = super::validate_plugin_name(&name) {
47
+ tracing::warn!(
48
+ "Failed to validate document extractor name '{}': {}. \
49
+ Registration aborted. Plugin names must be non-empty and contain only alphanumeric characters, hyphens, and underscores.",
50
+ name,
51
+ e
52
+ );
53
+ return Err(e);
54
+ }
47
55
 
48
- extractor.initialize()?;
56
+ if let Err(e) = extractor.initialize() {
57
+ tracing::error!(
58
+ "Failed to initialize document extractor '{}': {}. \
59
+ Extraction for MIME types {:?} will be unavailable.",
60
+ name,
61
+ e,
62
+ mime_types
63
+ );
64
+ return Err(e);
65
+ }
49
66
 
50
67
  let mut index_entries = Vec::new();
51
68
 
@@ -57,7 +74,13 @@ impl DocumentExtractorRegistry {
57
74
  index_entries.push((mime_type.clone(), priority));
58
75
  }
59
76
 
60
- self.name_index.insert(name, index_entries);
77
+ self.name_index.insert(name.clone(), index_entries);
78
+ tracing::debug!(
79
+ "Registered document extractor '{}' with priority {} for MIME types: {:?}",
80
+ name,
81
+ priority,
82
+ mime_types
83
+ );
61
84
 
62
85
  Ok(())
63
86
  }
@@ -128,7 +151,13 @@ impl DocumentExtractorRegistry {
128
151
  pub fn remove(&mut self, name: &str) -> Result<()> {
129
152
  let index_entries = match self.name_index.remove(name) {
130
153
  Some(entries) => entries,
131
- None => return Ok(()),
154
+ None => {
155
+ tracing::debug!(
156
+ "Document extractor '{}' not found in registry (already removed or never registered)",
157
+ name
158
+ );
159
+ return Ok(());
160
+ }
132
161
  };
133
162
 
134
163
  let mut extractor_to_shutdown: Option<Arc<dyn DocumentExtractor>> = None;
@@ -148,7 +177,16 @@ impl DocumentExtractorRegistry {
148
177
  }
149
178
 
150
179
  if let Some(extractor) = extractor_to_shutdown {
151
- extractor.shutdown()?;
180
+ if let Err(e) = extractor.shutdown() {
181
+ tracing::warn!(
182
+ "Failed to shutdown document extractor '{}': {}. \
183
+ Resources may not have been properly released.",
184
+ name,
185
+ e
186
+ );
187
+ return Err(e);
188
+ }
189
+ tracing::debug!("Successfully removed and shut down document extractor '{}'", name);
152
190
  }
153
191
 
154
192
  Ok(())
@@ -157,9 +195,19 @@ impl DocumentExtractorRegistry {
157
195
  /// Shutdown all extractors and clear the registry.
158
196
  pub fn shutdown_all(&mut self) -> Result<()> {
159
197
  let names = self.list();
198
+ let count = names.len();
199
+
200
+ if count > 0 {
201
+ tracing::debug!("Shutting down {} document extractors", count);
202
+ }
203
+
160
204
  for name in names {
161
205
  self.remove(&name)?;
162
206
  }
207
+
208
+ if count > 0 {
209
+ tracing::debug!("Successfully shut down all {} document extractors", count);
210
+ }
163
211
  Ok(())
164
212
  }
165
213
  }
@@ -413,4 +461,202 @@ mod tests {
413
461
  assert_eq!(registry.get("text/markdown").unwrap().name(), "multi-extractor");
414
462
  assert_eq!(registry.get("text/html").unwrap().name(), "multi-extractor");
415
463
  }
464
+
465
+ struct FailingExtractor {
466
+ name: String,
467
+ fail_on_init: bool,
468
+ }
469
+
470
+ impl Plugin for FailingExtractor {
471
+ fn name(&self) -> &str {
472
+ &self.name
473
+ }
474
+ fn version(&self) -> String {
475
+ "1.0.0".to_string()
476
+ }
477
+ fn initialize(&self) -> Result<()> {
478
+ if self.fail_on_init {
479
+ Err(KreuzbergError::Plugin {
480
+ message: "Extractor initialization failed".to_string(),
481
+ plugin_name: self.name.clone(),
482
+ })
483
+ } else {
484
+ Ok(())
485
+ }
486
+ }
487
+ fn shutdown(&self) -> Result<()> {
488
+ Ok(())
489
+ }
490
+ }
491
+
492
+ #[async_trait]
493
+ impl DocumentExtractor for FailingExtractor {
494
+ async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> {
495
+ Ok(ExtractionResult {
496
+ content: "test".to_string(),
497
+ mime_type: "text/plain".to_string(),
498
+ metadata: crate::types::Metadata::default(),
499
+ tables: vec![],
500
+ detected_languages: None,
501
+ chunks: None,
502
+ images: None,
503
+ djot_content: None,
504
+ pages: None,
505
+ elements: None,
506
+ })
507
+ }
508
+
509
+ fn supported_mime_types(&self) -> &[&str] {
510
+ &["text/plain"]
511
+ }
512
+
513
+ fn priority(&self) -> i32 {
514
+ 50
515
+ }
516
+ }
517
+
518
+ #[test]
519
+ fn test_document_extractor_initialization_failure_logs_error() {
520
+ let mut registry = DocumentExtractorRegistry::new();
521
+
522
+ let extractor = Arc::new(FailingExtractor {
523
+ name: "failing-extractor".to_string(),
524
+ fail_on_init: true,
525
+ });
526
+
527
+ let result = registry.register(extractor);
528
+ assert!(result.is_err());
529
+ assert_eq!(registry.list().len(), 0);
530
+ }
531
+
532
+ #[test]
533
+ fn test_document_extractor_invalid_name_empty_logs_warning() {
534
+ let mut registry = DocumentExtractorRegistry::new();
535
+
536
+ let extractor = Arc::new(MockExtractor {
537
+ name: "".to_string(),
538
+ mime_types: &["text/plain"],
539
+ priority: 50,
540
+ });
541
+
542
+ let result = registry.register(extractor);
543
+ assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
544
+ }
545
+
546
+ #[test]
547
+ fn test_document_extractor_invalid_name_with_spaces_logs_warning() {
548
+ let mut registry = DocumentExtractorRegistry::new();
549
+
550
+ let extractor = Arc::new(MockExtractor {
551
+ name: "invalid extractor".to_string(),
552
+ mime_types: &["text/plain"],
553
+ priority: 50,
554
+ });
555
+
556
+ let result = registry.register(extractor);
557
+ assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
558
+ }
559
+
560
+ #[test]
561
+ fn test_document_extractor_successful_registration_logs_debug() {
562
+ let mut registry = DocumentExtractorRegistry::new();
563
+
564
+ let extractor = Arc::new(MockExtractor {
565
+ name: "valid-pdf-extractor".to_string(),
566
+ mime_types: &["application/pdf"],
567
+ priority: 100,
568
+ });
569
+
570
+ let result = registry.register(extractor);
571
+ assert!(result.is_ok());
572
+ assert_eq!(registry.list().len(), 1);
573
+ }
574
+
575
+ #[test]
576
+ fn test_document_extractor_remove_nonexistent_logs_debug() {
577
+ let mut registry = DocumentExtractorRegistry::new();
578
+
579
+ let result = registry.remove("nonexistent-extractor");
580
+ assert!(result.is_ok());
581
+ assert_eq!(registry.list().len(), 0);
582
+ }
583
+
584
+ #[test]
585
+ fn test_document_extractor_shutdown_empty_registry() {
586
+ let mut registry = DocumentExtractorRegistry::new();
587
+ let result = registry.shutdown_all();
588
+ assert!(result.is_ok());
589
+ assert_eq!(registry.list().len(), 0);
590
+ }
591
+
592
+ #[test]
593
+ fn test_document_extractor_shutdown_with_multiple_extractors() {
594
+ let mut registry = DocumentExtractorRegistry::new();
595
+
596
+ let extractor1 = Arc::new(MockExtractor {
597
+ name: "extractor1".to_string(),
598
+ mime_types: &["text/plain"],
599
+ priority: 50,
600
+ });
601
+
602
+ let extractor2 = Arc::new(MockExtractor {
603
+ name: "extractor2".to_string(),
604
+ mime_types: &["application/pdf"],
605
+ priority: 100,
606
+ });
607
+
608
+ let extractor3 = Arc::new(MockExtractor {
609
+ name: "extractor3".to_string(),
610
+ mime_types: &["image/png"],
611
+ priority: 75,
612
+ });
613
+
614
+ registry.register(extractor1).unwrap();
615
+ registry.register(extractor2).unwrap();
616
+ registry.register(extractor3).unwrap();
617
+
618
+ assert_eq!(registry.list().len(), 3);
619
+
620
+ registry.shutdown_all().unwrap();
621
+ assert_eq!(registry.list().len(), 0);
622
+ }
623
+
624
+ #[test]
625
+ fn test_document_extractor_priority_ordering_complex() {
626
+ let mut registry = DocumentExtractorRegistry::new();
627
+
628
+ let extractors = vec![
629
+ (
630
+ Arc::new(MockExtractor {
631
+ name: "priority-1".to_string(),
632
+ mime_types: &["application/pdf"],
633
+ priority: 1,
634
+ }),
635
+ 1,
636
+ ),
637
+ (
638
+ Arc::new(MockExtractor {
639
+ name: "priority-100".to_string(),
640
+ mime_types: &["application/pdf"],
641
+ priority: 100,
642
+ }),
643
+ 100,
644
+ ),
645
+ (
646
+ Arc::new(MockExtractor {
647
+ name: "priority-50".to_string(),
648
+ mime_types: &["application/pdf"],
649
+ priority: 50,
650
+ }),
651
+ 50,
652
+ ),
653
+ ];
654
+
655
+ for (extractor, _priority) in &extractors {
656
+ registry.register(extractor.clone()).unwrap();
657
+ }
658
+
659
+ let retrieved = registry.get("application/pdf").unwrap();
660
+ assert_eq!(retrieved.name(), "priority-100");
661
+ }
416
662
  }
@@ -31,6 +31,8 @@ impl OcrBackendRegistry {
31
31
  /// Create a new OCR backend registry with default backends.
32
32
  ///
33
33
  /// Registers the Tesseract backend by default if the "ocr" feature is enabled.
34
+ /// Logs warnings if backend initialization fails (common in containerized environments
35
+ /// with missing dependencies or permission issues).
34
36
  pub fn new() -> Self {
35
37
  #[cfg(feature = "ocr")]
36
38
  let mut registry = Self {
@@ -45,8 +47,27 @@ impl OcrBackendRegistry {
45
47
  #[cfg(feature = "ocr")]
46
48
  {
47
49
  use crate::ocr::tesseract_backend::TesseractBackend;
48
- if let Ok(backend) = TesseractBackend::new() {
49
- let _ = registry.register(Arc::new(backend));
50
+ match TesseractBackend::new() {
51
+ Ok(backend) => {
52
+ if let Err(e) = registry.register(Arc::new(backend)) {
53
+ tracing::error!(
54
+ "Failed to register Tesseract OCR backend: {}. \
55
+ OCR functionality will be unavailable. \
56
+ Check TESSDATA_PREFIX environment variable and tessdata file permissions.",
57
+ e
58
+ );
59
+ }
60
+ }
61
+ Err(e) => {
62
+ tracing::warn!(
63
+ "Tesseract OCR backend initialization failed: {}. \
64
+ OCR functionality will be unavailable. \
65
+ Common causes: missing TESSDATA_PREFIX env var, \
66
+ tessdata files not found, or permission issues in containerized environments. \
67
+ See https://docs.kreuzberg.dev/guides/docker/ for Kubernetes troubleshooting.",
68
+ e
69
+ );
70
+ }
50
71
  }
51
72
  }
52
73
 
@@ -290,4 +311,131 @@ mod tests {
290
311
  registry.shutdown_all().unwrap();
291
312
  assert_eq!(registry.list().len(), 0);
292
313
  }
314
+
315
+ struct FailingOcrBackend {
316
+ name: String,
317
+ fail_on_init: bool,
318
+ }
319
+
320
+ impl Plugin for FailingOcrBackend {
321
+ fn name(&self) -> &str {
322
+ &self.name
323
+ }
324
+ fn version(&self) -> String {
325
+ "1.0.0".to_string()
326
+ }
327
+ fn initialize(&self) -> Result<()> {
328
+ if self.fail_on_init {
329
+ Err(KreuzbergError::Plugin {
330
+ message: "Backend initialization failed".to_string(),
331
+ plugin_name: self.name.clone(),
332
+ })
333
+ } else {
334
+ Ok(())
335
+ }
336
+ }
337
+ fn shutdown(&self) -> Result<()> {
338
+ Ok(())
339
+ }
340
+ }
341
+
342
+ #[async_trait]
343
+ impl OcrBackend for FailingOcrBackend {
344
+ async fn process_image(&self, _: &[u8], _: &OcrConfig) -> Result<ExtractionResult> {
345
+ Ok(ExtractionResult {
346
+ content: "test".to_string(),
347
+ mime_type: "text/plain".to_string(),
348
+ metadata: crate::types::Metadata::default(),
349
+ tables: vec![],
350
+ detected_languages: None,
351
+ chunks: None,
352
+ images: None,
353
+ djot_content: None,
354
+ pages: None,
355
+ elements: None,
356
+ })
357
+ }
358
+
359
+ fn supports_language(&self, _lang: &str) -> bool {
360
+ false
361
+ }
362
+
363
+ fn backend_type(&self) -> crate::plugins::ocr::OcrBackendType {
364
+ crate::plugins::ocr::OcrBackendType::Custom
365
+ }
366
+ }
367
+
368
+ #[test]
369
+ fn test_ocr_backend_initialization_failure_logs_error() {
370
+ let mut registry = OcrBackendRegistry::new_empty();
371
+
372
+ let backend = Arc::new(FailingOcrBackend {
373
+ name: "failing-ocr".to_string(),
374
+ fail_on_init: true,
375
+ });
376
+
377
+ let result = registry.register(backend);
378
+ assert!(result.is_err());
379
+ assert_eq!(registry.list().len(), 0);
380
+ }
381
+
382
+ #[test]
383
+ fn test_ocr_backend_invalid_name_empty_logs_warning() {
384
+ let mut registry = OcrBackendRegistry::new_empty();
385
+
386
+ let backend = Arc::new(MockOcrBackend {
387
+ name: "".to_string(),
388
+ languages: vec!["eng".to_string()],
389
+ });
390
+
391
+ let result = registry.register(backend);
392
+ assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
393
+ }
394
+
395
+ #[test]
396
+ fn test_ocr_backend_invalid_name_with_spaces_logs_warning() {
397
+ let mut registry = OcrBackendRegistry::new_empty();
398
+
399
+ let backend = Arc::new(MockOcrBackend {
400
+ name: "invalid ocr backend".to_string(),
401
+ languages: vec!["eng".to_string()],
402
+ });
403
+
404
+ let result = registry.register(backend);
405
+ assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
406
+ }
407
+
408
+ #[test]
409
+ fn test_ocr_backend_successful_registration_logs_debug() {
410
+ let mut registry = OcrBackendRegistry::new_empty();
411
+
412
+ let backend = Arc::new(MockOcrBackend {
413
+ name: "valid-ocr".to_string(),
414
+ languages: vec!["eng".to_string()],
415
+ });
416
+
417
+ let result = registry.register(backend);
418
+ assert!(result.is_ok());
419
+ assert_eq!(registry.list().len(), 1);
420
+ }
421
+
422
+ #[test]
423
+ fn test_ocr_backend_multiple_registrations() {
424
+ let mut registry = OcrBackendRegistry::new_empty();
425
+
426
+ let backend1 = Arc::new(MockOcrBackend {
427
+ name: "ocr-backend-1".to_string(),
428
+ languages: vec!["eng".to_string()],
429
+ });
430
+
431
+ let backend2 = Arc::new(MockOcrBackend {
432
+ name: "ocr-backend-2".to_string(),
433
+ languages: vec!["deu".to_string()],
434
+ });
435
+
436
+ registry.register(backend1).unwrap();
437
+ registry.register(backend2).unwrap();
438
+
439
+ assert_eq!(registry.list().len(), 2);
440
+ }
293
441
  }