kreuzberg 4.2.0 → 4.2.1

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
  5. data/lib/kreuzberg/cli.rb +16 -6
  6. data/lib/kreuzberg/cli_proxy.rb +3 -1
  7. data/lib/kreuzberg/config.rb +56 -9
  8. data/lib/kreuzberg/djot_content.rb +225 -0
  9. data/lib/kreuzberg/extraction_api.rb +20 -4
  10. data/lib/kreuzberg/result.rb +12 -2
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/lib/kreuzberg.rb +1 -0
  13. data/sig/kreuzberg.rbs +23 -11
  14. data/spec/binding/batch_spec.rb +6 -5
  15. data/spec/binding/error_recovery_spec.rb +3 -3
  16. data/spec/binding/tables_spec.rb +11 -2
  17. data/spec/unit/config/output_format_spec.rb +18 -18
  18. data/vendor/Cargo.toml +1 -1
  19. data/vendor/kreuzberg/Cargo.toml +1 -1
  20. data/vendor/kreuzberg/README.md +1 -1
  21. data/vendor/kreuzberg/src/api/startup.rs +15 -1
  22. data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
  23. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
  24. data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
  25. data/vendor/kreuzberg/src/core/io.rs +7 -7
  26. data/vendor/kreuzberg/src/core/mime.rs +4 -4
  27. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
  28. data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
  29. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
  30. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
  31. data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
  32. data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
  33. data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
  34. data/vendor/kreuzberg/tests/config_behavioral.rs +14 -12
  35. data/vendor/kreuzberg/tests/core_integration.rs +2 -4
  36. data/vendor/kreuzberg/tests/mime_detection.rs +3 -2
  37. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +284 -1
  38. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  39. metadata +4 -2
@@ -32,11 +32,33 @@ impl PostProcessorRegistry {
32
32
  let name = processor.name().to_string();
33
33
  let stage = processor.processing_stage();
34
34
 
35
- super::validate_plugin_name(&name)?;
35
+ if let Err(e) = super::validate_plugin_name(&name) {
36
+ tracing::warn!(
37
+ "Failed to validate post-processor name '{}': {}. \
38
+ Registration aborted. Plugin names must be non-empty and contain only alphanumeric characters, hyphens, and underscores.",
39
+ name,
40
+ e
41
+ );
42
+ return Err(e);
43
+ }
36
44
 
37
- processor.initialize()?;
45
+ if let Err(e) = processor.initialize() {
46
+ tracing::error!(
47
+ "Failed to initialize post-processor '{}' for processing stage {:?} with priority {}: {}. \
48
+ Post-processing step will not be executed.",
49
+ name,
50
+ stage,
51
+ priority,
52
+ e
53
+ );
54
+ return Err(e);
55
+ }
38
56
 
39
57
  if self.name_index.contains_key(&name) {
58
+ tracing::debug!(
59
+ "Post-processor '{}' is already registered. Removing old instance and registering new one.",
60
+ name
61
+ );
40
62
  self.remove(&name)?;
41
63
  }
42
64
 
@@ -47,7 +69,13 @@ impl PostProcessorRegistry {
47
69
  .or_default()
48
70
  .push(Arc::clone(&processor));
49
71
 
50
- self.name_index.insert(name, (stage, priority));
72
+ self.name_index.insert(name.clone(), (stage, priority));
73
+ tracing::debug!(
74
+ "Registered post-processor '{}' for stage {:?} with priority {}",
75
+ name,
76
+ stage,
77
+ priority
78
+ );
51
79
 
52
80
  Ok(())
53
81
  }
@@ -84,7 +112,13 @@ impl PostProcessorRegistry {
84
112
  pub fn remove(&mut self, name: &str) -> Result<()> {
85
113
  let (stage, priority) = match self.name_index.remove(name) {
86
114
  Some(location) => location,
87
- None => return Ok(()),
115
+ None => {
116
+ tracing::debug!(
117
+ "Post-processor '{}' not found in registry (already removed or never registered)",
118
+ name
119
+ );
120
+ return Ok(());
121
+ }
88
122
  };
89
123
 
90
124
  let processor_to_shutdown = if let Some(priority_map) = self.processors.get_mut(&stage) {
@@ -110,7 +144,16 @@ impl PostProcessorRegistry {
110
144
  };
111
145
 
112
146
  if let Some(processor) = processor_to_shutdown {
113
- processor.shutdown()?;
147
+ if let Err(e) = processor.shutdown() {
148
+ tracing::warn!(
149
+ "Failed to shutdown post-processor '{}': {}. \
150
+ Resources may not have been properly released.",
151
+ name,
152
+ e
153
+ );
154
+ return Err(e);
155
+ }
156
+ tracing::debug!("Successfully removed and shut down post-processor '{}'", name);
114
157
  }
115
158
 
116
159
  Ok(())
@@ -119,9 +162,19 @@ impl PostProcessorRegistry {
119
162
  /// Shutdown all processors and clear the registry.
120
163
  pub fn shutdown_all(&mut self) -> Result<()> {
121
164
  let names = self.list();
165
+ let count = names.len();
166
+
167
+ if count > 0 {
168
+ tracing::debug!("Shutting down {} post-processors", count);
169
+ }
170
+
122
171
  for name in names {
123
172
  self.remove(&name)?;
124
173
  }
174
+
175
+ if count > 0 {
176
+ tracing::debug!("Successfully shut down all {} post-processors", count);
177
+ }
125
178
  Ok(())
126
179
  }
127
180
  }
@@ -301,4 +354,159 @@ mod tests {
301
354
  let processors = registry.get_for_stage(ProcessingStage::Late);
302
355
  assert_eq!(processors.len(), 0);
303
356
  }
357
+
358
+ struct FailingPostProcessor {
359
+ name: String,
360
+ stage: ProcessingStage,
361
+ fail_on_init: bool,
362
+ }
363
+
364
+ impl Plugin for FailingPostProcessor {
365
+ fn name(&self) -> &str {
366
+ &self.name
367
+ }
368
+ fn version(&self) -> String {
369
+ "1.0.0".to_string()
370
+ }
371
+ fn initialize(&self) -> Result<()> {
372
+ if self.fail_on_init {
373
+ Err(KreuzbergError::Plugin {
374
+ message: "Processor initialization failed".to_string(),
375
+ plugin_name: self.name.clone(),
376
+ })
377
+ } else {
378
+ Ok(())
379
+ }
380
+ }
381
+ fn shutdown(&self) -> Result<()> {
382
+ Ok(())
383
+ }
384
+ }
385
+
386
+ #[async_trait]
387
+ impl PostProcessor for FailingPostProcessor {
388
+ async fn process(&self, _result: &mut ExtractionResult, _: &ExtractionConfig) -> Result<()> {
389
+ Ok(())
390
+ }
391
+
392
+ fn processing_stage(&self) -> ProcessingStage {
393
+ self.stage
394
+ }
395
+ }
396
+
397
+ #[test]
398
+ fn test_post_processor_initialization_failure_logs_error() {
399
+ let mut registry = PostProcessorRegistry::new();
400
+
401
+ let processor = Arc::new(FailingPostProcessor {
402
+ name: "failing-processor".to_string(),
403
+ stage: ProcessingStage::Early,
404
+ fail_on_init: true,
405
+ });
406
+
407
+ let result = registry.register(processor, 50);
408
+ assert!(result.is_err());
409
+ assert_eq!(registry.list().len(), 0);
410
+ }
411
+
412
+ #[test]
413
+ fn test_post_processor_invalid_name_empty_logs_warning() {
414
+ let mut registry = PostProcessorRegistry::new();
415
+
416
+ let processor = Arc::new(MockPostProcessor {
417
+ name: "".to_string(),
418
+ stage: ProcessingStage::Early,
419
+ });
420
+
421
+ let result = registry.register(processor, 50);
422
+ assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
423
+ }
424
+
425
+ #[test]
426
+ fn test_post_processor_invalid_name_with_spaces_logs_warning() {
427
+ let mut registry = PostProcessorRegistry::new();
428
+
429
+ let processor = Arc::new(MockPostProcessor {
430
+ name: "invalid processor".to_string(),
431
+ stage: ProcessingStage::Early,
432
+ });
433
+
434
+ let result = registry.register(processor, 50);
435
+ assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
436
+ }
437
+
438
+ #[test]
439
+ fn test_post_processor_successful_registration_logs_debug() {
440
+ let mut registry = PostProcessorRegistry::new();
441
+
442
+ let processor = Arc::new(MockPostProcessor {
443
+ name: "valid-processor".to_string(),
444
+ stage: ProcessingStage::Early,
445
+ });
446
+
447
+ let result = registry.register(processor, 50);
448
+ assert!(result.is_ok());
449
+ assert_eq!(registry.list().len(), 1);
450
+ }
451
+
452
+ #[test]
453
+ fn test_post_processor_remove_nonexistent_logs_debug() {
454
+ let mut registry = PostProcessorRegistry::new();
455
+
456
+ let result = registry.remove("nonexistent-processor");
457
+ assert!(result.is_ok());
458
+ assert_eq!(registry.list().len(), 0);
459
+ }
460
+
461
+ #[test]
462
+ fn test_post_processor_register_same_name_twice() {
463
+ let mut registry = PostProcessorRegistry::new();
464
+
465
+ let processor = Arc::new(MockPostProcessor {
466
+ name: "duplicate-processor".to_string(),
467
+ stage: ProcessingStage::Early,
468
+ });
469
+
470
+ registry.register(processor.clone(), 50).unwrap();
471
+ assert_eq!(registry.list().len(), 1);
472
+
473
+ registry.register(processor, 75).unwrap();
474
+ assert_eq!(registry.list().len(), 1);
475
+ }
476
+
477
+ #[test]
478
+ fn test_post_processor_multiple_stages() {
479
+ let mut registry = PostProcessorRegistry::new();
480
+
481
+ let early_processor = Arc::new(MockPostProcessor {
482
+ name: "early-proc".to_string(),
483
+ stage: ProcessingStage::Early,
484
+ });
485
+
486
+ let middle_processor = Arc::new(MockPostProcessor {
487
+ name: "middle-proc".to_string(),
488
+ stage: ProcessingStage::Middle,
489
+ });
490
+
491
+ let late_processor = Arc::new(MockPostProcessor {
492
+ name: "late-proc".to_string(),
493
+ stage: ProcessingStage::Late,
494
+ });
495
+
496
+ registry.register(early_processor, 100).unwrap();
497
+ registry.register(middle_processor, 50).unwrap();
498
+ registry.register(late_processor, 25).unwrap();
499
+
500
+ assert_eq!(registry.get_for_stage(ProcessingStage::Early).len(), 1);
501
+ assert_eq!(registry.get_for_stage(ProcessingStage::Middle).len(), 1);
502
+ assert_eq!(registry.get_for_stage(ProcessingStage::Late).len(), 1);
503
+ }
504
+
505
+ #[test]
506
+ fn test_post_processor_shutdown_empty_registry() {
507
+ let mut registry = PostProcessorRegistry::new();
508
+ let result = registry.shutdown_all();
509
+ assert!(result.is_ok());
510
+ assert_eq!(registry.list().len(), 0);
511
+ }
304
512
  }
@@ -30,11 +30,32 @@ impl ValidatorRegistry {
30
30
  let name = validator.name().to_string();
31
31
  let priority = validator.priority();
32
32
 
33
- super::validate_plugin_name(&name)?;
33
+ if let Err(e) = super::validate_plugin_name(&name) {
34
+ tracing::warn!(
35
+ "Failed to validate validator name '{}': {}. \
36
+ Registration aborted. Plugin names must be non-empty and contain only alphanumeric characters, hyphens, and underscores.",
37
+ name,
38
+ e
39
+ );
40
+ return Err(e);
41
+ }
34
42
 
35
- validator.initialize()?;
43
+ if let Err(e) = validator.initialize() {
44
+ tracing::error!(
45
+ "Failed to initialize validator '{}' with priority {}: {}. \
46
+ Validation step will not be executed.",
47
+ name,
48
+ priority,
49
+ e
50
+ );
51
+ return Err(e);
52
+ }
36
53
 
37
- self.validators.entry(priority).or_default().insert(name, validator);
54
+ self.validators
55
+ .entry(priority)
56
+ .or_default()
57
+ .insert(name.clone(), validator);
58
+ tracing::debug!("Registered validator '{}' with priority {}", name, priority);
38
59
 
39
60
  Ok(())
40
61
  }
@@ -68,17 +89,35 @@ impl ValidatorRegistry {
68
89
  /// Remove a validator from the registry.
69
90
  pub fn remove(&mut self, name: &str) -> Result<()> {
70
91
  let mut validator_to_shutdown: Option<Arc<dyn Validator>> = None;
92
+ let mut found = false;
71
93
 
72
94
  for validators in self.validators.values_mut() {
73
95
  if let Some(validator) = validators.shift_remove(name)
74
96
  && validator_to_shutdown.is_none()
75
97
  {
76
98
  validator_to_shutdown = Some(validator);
99
+ found = true;
77
100
  }
78
101
  }
79
102
 
103
+ if !found {
104
+ tracing::debug!(
105
+ "Validator '{}' not found in registry (already removed or never registered)",
106
+ name
107
+ );
108
+ }
109
+
80
110
  if let Some(validator) = validator_to_shutdown {
81
- validator.shutdown()?;
111
+ if let Err(e) = validator.shutdown() {
112
+ tracing::warn!(
113
+ "Failed to shutdown validator '{}': {}. \
114
+ Resources may not have been properly released.",
115
+ name,
116
+ e
117
+ );
118
+ return Err(e);
119
+ }
120
+ tracing::debug!("Successfully removed and shut down validator '{}'", name);
82
121
  }
83
122
 
84
123
  self.validators.retain(|_, validators| !validators.is_empty());
@@ -89,9 +128,19 @@ impl ValidatorRegistry {
89
128
  /// Shutdown all validators and clear the registry.
90
129
  pub fn shutdown_all(&mut self) -> Result<()> {
91
130
  let names = self.list();
131
+ let count = names.len();
132
+
133
+ if count > 0 {
134
+ tracing::debug!("Shutting down {} validators", count);
135
+ }
136
+
92
137
  for name in names {
93
138
  self.remove(&name)?;
94
139
  }
140
+
141
+ if count > 0 {
142
+ tracing::debug!("Successfully shut down all {} validators", count);
143
+ }
95
144
  Ok(())
96
145
  }
97
146
  }
@@ -235,4 +284,171 @@ mod tests {
235
284
  registry.shutdown_all().unwrap();
236
285
  assert_eq!(registry.get_all().len(), 0);
237
286
  }
287
+
288
+ struct FailingValidator {
289
+ name: String,
290
+ priority: i32,
291
+ fail_on_init: bool,
292
+ }
293
+
294
+ impl Plugin for FailingValidator {
295
+ fn name(&self) -> &str {
296
+ &self.name
297
+ }
298
+ fn version(&self) -> String {
299
+ "1.0.0".to_string()
300
+ }
301
+ fn initialize(&self) -> Result<()> {
302
+ if self.fail_on_init {
303
+ Err(KreuzbergError::Plugin {
304
+ message: "Validator initialization failed".to_string(),
305
+ plugin_name: self.name.clone(),
306
+ })
307
+ } else {
308
+ Ok(())
309
+ }
310
+ }
311
+ fn shutdown(&self) -> Result<()> {
312
+ Ok(())
313
+ }
314
+ }
315
+
316
+ #[async_trait]
317
+ impl Validator for FailingValidator {
318
+ async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
319
+ Ok(())
320
+ }
321
+
322
+ fn priority(&self) -> i32 {
323
+ self.priority
324
+ }
325
+ }
326
+
327
+ #[test]
328
+ fn test_validator_initialization_failure_logs_error() {
329
+ let mut registry = ValidatorRegistry::new();
330
+
331
+ let validator = Arc::new(FailingValidator {
332
+ name: "failing-validator".to_string(),
333
+ priority: 50,
334
+ fail_on_init: true,
335
+ });
336
+
337
+ let result = registry.register(validator);
338
+ assert!(result.is_err());
339
+ assert_eq!(registry.get_all().len(), 0);
340
+ }
341
+
342
+ #[test]
343
+ fn test_validator_invalid_name_empty_logs_warning() {
344
+ let mut registry = ValidatorRegistry::new();
345
+
346
+ let validator = Arc::new(MockValidator {
347
+ name: "".to_string(),
348
+ priority: 50,
349
+ });
350
+
351
+ let result = registry.register(validator);
352
+ assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
353
+ }
354
+
355
+ #[test]
356
+ fn test_validator_successful_registration_logs_debug() {
357
+ let mut registry = ValidatorRegistry::new();
358
+
359
+ let validator = Arc::new(MockValidator {
360
+ name: "valid-validator".to_string(),
361
+ priority: 50,
362
+ });
363
+
364
+ let result = registry.register(validator);
365
+ assert!(result.is_ok());
366
+ assert_eq!(registry.get_all().len(), 1);
367
+ }
368
+
369
+ #[test]
370
+ fn test_validator_remove_nonexistent_logs_debug() {
371
+ let mut registry = ValidatorRegistry::new();
372
+
373
+ let result = registry.remove("nonexistent-validator");
374
+ assert!(result.is_ok());
375
+ assert_eq!(registry.get_all().len(), 0);
376
+ }
377
+
378
+ #[test]
379
+ fn test_validator_priority_ordering_reversed() {
380
+ let mut registry = ValidatorRegistry::new();
381
+
382
+ let high = Arc::new(MockValidator {
383
+ name: "high-priority".to_string(),
384
+ priority: 100,
385
+ });
386
+
387
+ let low = Arc::new(MockValidator {
388
+ name: "low-priority".to_string(),
389
+ priority: 10,
390
+ });
391
+
392
+ registry.register(low).unwrap();
393
+ registry.register(high).unwrap();
394
+
395
+ let validators = registry.get_all();
396
+ assert_eq!(validators.len(), 2);
397
+ assert_eq!(validators[0].name(), "high-priority");
398
+ assert_eq!(validators[1].name(), "low-priority");
399
+ }
400
+
401
+ #[test]
402
+ fn test_validator_multiple_same_priority() {
403
+ let mut registry = ValidatorRegistry::new();
404
+
405
+ let validator1 = Arc::new(MockValidator {
406
+ name: "validator-a".to_string(),
407
+ priority: 50,
408
+ });
409
+
410
+ let validator2 = Arc::new(MockValidator {
411
+ name: "validator-b".to_string(),
412
+ priority: 50,
413
+ });
414
+
415
+ let validator3 = Arc::new(MockValidator {
416
+ name: "validator-c".to_string(),
417
+ priority: 50,
418
+ });
419
+
420
+ registry.register(validator1).unwrap();
421
+ registry.register(validator2).unwrap();
422
+ registry.register(validator3).unwrap();
423
+
424
+ let validators = registry.get_all();
425
+ assert_eq!(validators.len(), 3);
426
+ }
427
+
428
+ #[test]
429
+ fn test_validator_shutdown_empty_registry() {
430
+ let mut registry = ValidatorRegistry::new();
431
+ let result = registry.shutdown_all();
432
+ assert!(result.is_ok());
433
+ assert_eq!(registry.get_all().len(), 0);
434
+ }
435
+
436
+ #[test]
437
+ fn test_validator_remove_and_readd() {
438
+ let mut registry = ValidatorRegistry::new();
439
+
440
+ let validator = Arc::new(MockValidator {
441
+ name: "test-validator".to_string(),
442
+ priority: 50,
443
+ });
444
+
445
+ registry.register(validator.clone()).unwrap();
446
+ assert_eq!(registry.get_all().len(), 1);
447
+
448
+ registry.remove("test-validator").unwrap();
449
+ assert_eq!(registry.get_all().len(), 0);
450
+
451
+ registry.register(validator).unwrap();
452
+ assert_eq!(registry.get_all().len(), 1);
453
+ }
238
454
  }