kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +0 -6
  3. data/.rubocop.yaml +534 -1
  4. data/Gemfile +2 -1
  5. data/Gemfile.lock +11 -11
  6. data/README.md +5 -10
  7. data/examples/async_patterns.rb +0 -1
  8. data/ext/kreuzberg_rb/extconf.rb +0 -10
  9. data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
  10. data/ext/kreuzberg_rb/native/build.rs +2 -0
  11. data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
  12. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
  13. data/ext/kreuzberg_rb/native/include/strings.h +2 -2
  14. data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
  15. data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
  16. data/kreuzberg.gemspec +14 -57
  17. data/lib/kreuzberg/cache_api.rb +0 -1
  18. data/lib/kreuzberg/cli.rb +2 -2
  19. data/lib/kreuzberg/config.rb +2 -9
  20. data/lib/kreuzberg/errors.rb +7 -75
  21. data/lib/kreuzberg/extraction_api.rb +0 -1
  22. data/lib/kreuzberg/setup_lib_path.rb +0 -1
  23. data/lib/kreuzberg/version.rb +1 -1
  24. data/lib/kreuzberg.rb +0 -21
  25. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  26. data/sig/kreuzberg.rbs +3 -55
  27. data/spec/binding/cli_proxy_spec.rb +4 -2
  28. data/spec/binding/cli_spec.rb +11 -12
  29. data/spec/examples.txt +104 -0
  30. data/spec/fixtures/config.yaml +1 -0
  31. data/spec/spec_helper.rb +1 -1
  32. data/vendor/kreuzberg/Cargo.toml +42 -112
  33. data/vendor/kreuzberg/README.md +2 -2
  34. data/vendor/kreuzberg/build.rs +4 -18
  35. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  36. data/vendor/kreuzberg/src/cache/mod.rs +3 -27
  37. data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
  38. data/vendor/kreuzberg/src/core/extractor.rs +81 -202
  39. data/vendor/kreuzberg/src/core/io.rs +2 -4
  40. data/vendor/kreuzberg/src/core/mime.rs +12 -2
  41. data/vendor/kreuzberg/src/core/mod.rs +1 -4
  42. data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
  43. data/vendor/kreuzberg/src/embeddings.rs +16 -125
  44. data/vendor/kreuzberg/src/error.rs +1 -1
  45. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/image.rs +13 -13
  47. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
  48. data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
  49. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
  50. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  51. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  52. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  53. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  54. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  55. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  56. data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
  57. data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
  58. data/vendor/kreuzberg/src/extractors/email.rs +0 -14
  59. data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
  60. data/vendor/kreuzberg/src/extractors/html.rs +154 -137
  61. data/vendor/kreuzberg/src/extractors/image.rs +4 -7
  62. data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
  63. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  64. data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
  65. data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
  66. data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
  67. data/vendor/kreuzberg/src/extractors/text.rs +5 -23
  68. data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
  69. data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
  70. data/vendor/kreuzberg/src/lib.rs +1 -4
  71. data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
  72. data/vendor/kreuzberg/src/mcp/server.rs +3 -5
  73. data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
  74. data/vendor/kreuzberg/src/pdf/error.rs +1 -1
  75. data/vendor/kreuzberg/src/pdf/table.rs +44 -17
  76. data/vendor/kreuzberg/src/pdf/text.rs +3 -0
  77. data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
  78. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
  79. data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
  80. data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
  81. data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
  82. data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
  83. data/vendor/kreuzberg/src/types.rs +12 -42
  84. data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
  85. data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
  86. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  87. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
  88. data/vendor/kreuzberg/tests/config_features.rs +0 -18
  89. data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
  90. data/vendor/kreuzberg/tests/core_integration.rs +7 -24
  91. data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
  92. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
  93. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
  95. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
  97. data/vendor/kreuzberg/tests/security_validation.rs +1 -12
  98. metadata +25 -90
  99. data/.rubocop.yml +0 -538
  100. data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
  101. data/lib/kreuzberg/error_context.rb +0 -32
  102. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  103. data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
  104. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
  105. data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
  106. data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
  107. data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
  108. data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
  109. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  110. data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
  111. data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
  112. data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
  113. data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
  114. data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
  115. data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
  116. data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
  117. data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
  119. data/vendor/kreuzberg/src/extractors/security.rs +0 -484
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
  121. data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
  122. data/vendor/kreuzberg/src/panic_context.rs +0 -154
  123. data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
  124. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
  125. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
  126. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
  127. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
  128. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
  129. data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
  130. data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
  131. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
  132. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
  133. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
  134. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
  135. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
  136. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
  137. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
  138. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
  139. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
  140. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
  141. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
  142. data/vendor/rb-sys/.cargo-ok +0 -1
  143. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  144. data/vendor/rb-sys/Cargo.lock +0 -393
  145. data/vendor/rb-sys/Cargo.toml +0 -70
  146. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  147. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  148. data/vendor/rb-sys/LICENSE-MIT +0 -21
  149. data/vendor/rb-sys/bin/release.sh +0 -21
  150. data/vendor/rb-sys/build/features.rs +0 -108
  151. data/vendor/rb-sys/build/main.rs +0 -246
  152. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  153. data/vendor/rb-sys/build/version.rs +0 -48
  154. data/vendor/rb-sys/readme.md +0 -36
  155. data/vendor/rb-sys/src/bindings.rs +0 -21
  156. data/vendor/rb-sys/src/hidden.rs +0 -11
  157. data/vendor/rb-sys/src/lib.rs +0 -34
  158. data/vendor/rb-sys/src/macros.rs +0 -371
  159. data/vendor/rb-sys/src/memory.rs +0 -53
  160. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  161. data/vendor/rb-sys/src/special_consts.rs +0 -31
  162. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  163. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  164. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  165. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  166. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  167. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  168. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  169. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  170. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  171. data/vendor/rb-sys/src/stable_api.rs +0 -261
  172. data/vendor/rb-sys/src/symbol.rs +0 -31
  173. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  174. data/vendor/rb-sys/src/utils.rs +0 -89
  175. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -30,13 +30,6 @@ use crate::{KreuzbergError, Result};
30
30
  /// - Validator errors bubble up immediately
31
31
  /// - Post-processor errors are caught and recorded in metadata
32
32
  /// - System errors (IO, RuntimeError equivalents) always bubble up
33
- #[cfg_attr(feature = "otel", tracing::instrument(
34
- skip(result, config),
35
- fields(
36
- pipeline.stage = "post_processing",
37
- content.length = result.content.len(),
38
- )
39
- ))]
40
33
  pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfig) -> Result<ExtractionResult> {
41
34
  let pp_config = config.postprocessor.as_ref();
42
35
  let postprocessing_enabled = pp_config.is_none_or(|c| c.enabled);
@@ -234,18 +227,13 @@ mod tests {
234
227
  use crate::types::Metadata;
235
228
  use lazy_static::lazy_static;
236
229
 
237
- const VALIDATION_MARKER_KEY: &str = "registry_validation_marker";
238
- const QUALITY_VALIDATION_MARKER: &str = "quality_validation_test";
239
- const POSTPROCESSOR_VALIDATION_MARKER: &str = "postprocessor_validation_test";
240
- const ORDER_VALIDATION_MARKER: &str = "order_validation_test";
241
-
242
230
  lazy_static! {
243
231
  static ref REGISTRY_TEST_GUARD: std::sync::Mutex<()> = std::sync::Mutex::new(());
244
232
  }
245
233
 
246
234
  #[tokio::test]
247
235
  async fn test_run_pipeline_basic() {
248
- let mut result = ExtractionResult {
236
+ let result = ExtractionResult {
249
237
  content: "test".to_string(),
250
238
  mime_type: "text/plain".to_string(),
251
239
  metadata: Metadata::default(),
@@ -254,10 +242,6 @@ mod tests {
254
242
  chunks: None,
255
243
  images: None,
256
244
  };
257
- result.metadata.additional.insert(
258
- VALIDATION_MARKER_KEY.to_string(),
259
- serde_json::json!(ORDER_VALIDATION_MARKER),
260
- );
261
245
  let config = ExtractionConfig::default();
262
246
 
263
247
  let processed = run_pipeline(result, &config).await.unwrap();
@@ -413,17 +397,9 @@ mod tests {
413
397
 
414
398
  #[tokio::test]
415
399
  async fn test_pipeline_empty_content() {
416
- let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
417
-
418
- {
419
- let registry = crate::plugins::registry::get_post_processor_registry();
420
- registry.write().unwrap().shutdown_all().unwrap();
421
- }
422
400
  {
423
- let registry = crate::plugins::registry::get_validator_registry();
424
- registry.write().unwrap().shutdown_all().unwrap();
425
- }
426
-
401
+ let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
402
+ } // Drop guard before async operations
427
403
  let result = ExtractionResult {
428
404
  content: String::new(),
429
405
  mime_type: "text/plain".to_string(),
@@ -435,8 +411,6 @@ mod tests {
435
411
  };
436
412
  let config = ExtractionConfig::default();
437
413
 
438
- drop(_guard);
439
-
440
414
  let processed = run_pipeline(result, &config).await.unwrap();
441
415
  assert_eq!(processed.content, "");
442
416
  }
@@ -472,8 +446,6 @@ mod tests {
472
446
  #[tokio::test]
473
447
  #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
474
448
  async fn test_pipeline_with_keyword_extraction() {
475
- let _ = crate::keywords::register_keyword_processor();
476
-
477
449
  let result = ExtractionResult {
478
450
  content: r#"
479
451
  Machine learning is a branch of artificial intelligence that focuses on
@@ -544,18 +516,6 @@ Natural language processing enables computers to understand human language.
544
516
  #[tokio::test]
545
517
  #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
546
518
  async fn test_pipeline_keyword_extraction_short_content() {
547
- let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
548
- crate::plugins::registry::get_validator_registry()
549
- .write()
550
- .unwrap()
551
- .shutdown_all()
552
- .unwrap();
553
- crate::plugins::registry::get_post_processor_registry()
554
- .write()
555
- .unwrap()
556
- .shutdown_all()
557
- .unwrap();
558
-
559
519
  let result = ExtractionResult {
560
520
  content: "Short text".to_string(),
561
521
  mime_type: "text/plain".to_string(),
@@ -577,8 +537,6 @@ Natural language processing enables computers to understand human language.
577
537
  ..Default::default()
578
538
  };
579
539
 
580
- drop(_guard);
581
-
582
540
  let processed = run_pipeline(result, &config).await.unwrap();
583
541
 
584
542
  assert!(!processed.metadata.additional.contains_key("keywords"));
@@ -586,6 +544,9 @@ Natural language processing enables computers to understand human language.
586
544
 
587
545
  #[tokio::test]
588
546
  async fn test_postprocessor_runs_before_validator() {
547
+ {
548
+ let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
549
+ } // Drop guard before async operations
589
550
  use crate::plugins::{Plugin, PostProcessor, ProcessingStage, Validator};
590
551
  use async_trait::async_trait;
591
552
  use std::sync::Arc;
@@ -640,17 +601,6 @@ Natural language processing enables computers to understand human language.
640
601
  #[async_trait]
641
602
  impl Validator for TestValidator {
642
603
  async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
643
- let should_validate = result
644
- .metadata
645
- .additional
646
- .get(VALIDATION_MARKER_KEY)
647
- .and_then(|v| v.as_str())
648
- == Some(POSTPROCESSOR_VALIDATION_MARKER);
649
-
650
- if !should_validate {
651
- return Ok(());
652
- }
653
-
654
604
  let processed = result
655
605
  .metadata
656
606
  .additional
@@ -669,23 +619,18 @@ Natural language processing enables computers to understand human language.
669
619
  }
670
620
 
671
621
  let pp_registry = crate::plugins::registry::get_post_processor_registry();
672
- let val_registry = crate::plugins::registry::get_validator_registry();
673
-
674
- let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
675
- pp_registry.write().unwrap().shutdown_all().unwrap();
676
- val_registry.write().unwrap().shutdown_all().unwrap();
677
-
678
622
  {
679
623
  let mut registry = pp_registry.write().unwrap();
680
624
  registry.register(Arc::new(TestPostProcessor), 0).unwrap();
681
625
  }
682
626
 
627
+ let val_registry = crate::plugins::registry::get_validator_registry();
683
628
  {
684
629
  let mut registry = val_registry.write().unwrap();
685
630
  registry.register(Arc::new(TestValidator)).unwrap();
686
631
  }
687
632
 
688
- let mut result = ExtractionResult {
633
+ let result = ExtractionResult {
689
634
  content: "test".to_string(),
690
635
  mime_type: "text/plain".to_string(),
691
636
  metadata: Metadata::default(),
@@ -694,18 +639,18 @@ Natural language processing enables computers to understand human language.
694
639
  chunks: None,
695
640
  images: None,
696
641
  };
697
- result.metadata.additional.insert(
698
- VALIDATION_MARKER_KEY.to_string(),
699
- serde_json::json!(POSTPROCESSOR_VALIDATION_MARKER),
700
- );
701
642
 
702
643
  let config = ExtractionConfig::default();
703
- drop(_guard);
704
-
705
644
  let processed = run_pipeline(result, &config).await;
706
645
 
707
- pp_registry.write().unwrap().shutdown_all().unwrap();
708
- val_registry.write().unwrap().shutdown_all().unwrap();
646
+ {
647
+ let mut registry = pp_registry.write().unwrap();
648
+ registry.remove("test-processor").unwrap();
649
+ }
650
+ {
651
+ let mut registry = val_registry.write().unwrap();
652
+ registry.remove("test-validator").unwrap();
653
+ }
709
654
 
710
655
  assert!(processed.is_ok(), "Validator should have seen post-processor metadata");
711
656
  let processed = processed.unwrap();
@@ -719,7 +664,9 @@ Natural language processing enables computers to understand human language.
719
664
  #[tokio::test]
720
665
  #[cfg(feature = "quality")]
721
666
  async fn test_quality_processing_runs_before_validator() {
722
- let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
667
+ {
668
+ let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
669
+ } // Drop guard before async operations
723
670
  use crate::plugins::{Plugin, Validator};
724
671
  use async_trait::async_trait;
725
672
  use std::sync::Arc;
@@ -743,17 +690,6 @@ Natural language processing enables computers to understand human language.
743
690
  #[async_trait]
744
691
  impl Validator for QualityValidator {
745
692
  async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
746
- let should_validate = result
747
- .metadata
748
- .additional
749
- .get(VALIDATION_MARKER_KEY)
750
- .and_then(|v| v.as_str())
751
- == Some(QUALITY_VALIDATION_MARKER);
752
-
753
- if !should_validate {
754
- return Ok(());
755
- }
756
-
757
693
  if !result.metadata.additional.contains_key("quality_score") {
758
694
  return Err(crate::KreuzbergError::Validation {
759
695
  message: "Quality processing did not run before validator".to_string(),
@@ -770,7 +706,7 @@ Natural language processing enables computers to understand human language.
770
706
  registry.register(Arc::new(QualityValidator)).unwrap();
771
707
  }
772
708
 
773
- let mut result = ExtractionResult {
709
+ let result = ExtractionResult {
774
710
  content: "This is meaningful test content for quality scoring.".to_string(),
775
711
  mime_type: "text/plain".to_string(),
776
712
  metadata: Metadata::default(),
@@ -779,18 +715,12 @@ Natural language processing enables computers to understand human language.
779
715
  chunks: None,
780
716
  images: None,
781
717
  };
782
- result.metadata.additional.insert(
783
- VALIDATION_MARKER_KEY.to_string(),
784
- serde_json::json!(QUALITY_VALIDATION_MARKER),
785
- );
786
718
 
787
719
  let config = ExtractionConfig {
788
720
  enable_quality_processing: true,
789
721
  ..Default::default()
790
722
  };
791
723
 
792
- drop(_guard);
793
-
794
724
  let processed = run_pipeline(result, &config).await;
795
725
 
796
726
  {
@@ -803,6 +733,9 @@ Natural language processing enables computers to understand human language.
803
733
 
804
734
  #[tokio::test]
805
735
  async fn test_multiple_postprocessors_run_before_validator() {
736
+ {
737
+ let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
738
+ } // Drop guard before async operations
806
739
  use crate::plugins::{Plugin, PostProcessor, ProcessingStage, Validator};
807
740
  use async_trait::async_trait;
808
741
  use std::sync::Arc;
@@ -904,17 +837,6 @@ Natural language processing enables computers to understand human language.
904
837
  #[async_trait]
905
838
  impl Validator for OrderValidator {
906
839
  async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
907
- let should_validate = result
908
- .metadata
909
- .additional
910
- .get(VALIDATION_MARKER_KEY)
911
- .and_then(|v| v.as_str())
912
- == Some(ORDER_VALIDATION_MARKER);
913
-
914
- if !should_validate {
915
- return Ok(());
916
- }
917
-
918
840
  let order = result
919
841
  .metadata
920
842
  .additional
@@ -944,18 +866,13 @@ Natural language processing enables computers to understand human language.
944
866
  }
945
867
 
946
868
  let pp_registry = crate::plugins::registry::get_post_processor_registry();
947
- let val_registry = crate::plugins::registry::get_validator_registry();
948
- let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
949
-
950
- pp_registry.write().unwrap().shutdown_all().unwrap();
951
- val_registry.write().unwrap().shutdown_all().unwrap();
952
-
953
869
  {
954
870
  let mut registry = pp_registry.write().unwrap();
955
871
  registry.register(Arc::new(EarlyProcessor), 0).unwrap();
956
872
  registry.register(Arc::new(LateProcessor), 0).unwrap();
957
873
  }
958
874
 
875
+ let val_registry = crate::plugins::registry::get_validator_registry();
959
876
  {
960
877
  let mut registry = val_registry.write().unwrap();
961
878
  registry.register(Arc::new(OrderValidator)).unwrap();
@@ -972,12 +889,17 @@ Natural language processing enables computers to understand human language.
972
889
  };
973
890
 
974
891
  let config = ExtractionConfig::default();
975
- drop(_guard);
976
-
977
892
  let processed = run_pipeline(result, &config).await;
978
893
 
979
- pp_registry.write().unwrap().shutdown_all().unwrap();
980
- val_registry.write().unwrap().shutdown_all().unwrap();
894
+ {
895
+ let mut registry = pp_registry.write().unwrap();
896
+ registry.remove("early-proc").unwrap();
897
+ registry.remove("late-proc").unwrap();
898
+ }
899
+ {
900
+ let mut registry = val_registry.write().unwrap();
901
+ registry.remove("order-validator").unwrap();
902
+ }
981
903
 
982
904
  assert!(processed.is_ok(), "All processors should run before validator");
983
905
  }
@@ -45,104 +45,9 @@ use std::collections::HashMap;
45
45
  #[cfg(feature = "embeddings")]
46
46
  use lazy_static::lazy_static;
47
47
 
48
- /// Wrapper for TextEmbedding that prevents cleanup during process shutdown.
49
- ///
50
- /// # Problem
51
- ///
52
- /// When the process terminates, global static objects are dropped. The `TextEmbedding`
53
- /// objects from fastembed contain ONNX Runtime sessions (via `ort v2.0.0-rc.10`), and
54
- /// during their `Drop` implementation, ONNX Runtime's C++ destructor tries to acquire
55
- /// mutexes for cleanup.
56
- ///
57
- /// At process shutdown time, the C++ runtime may have already begun tearing down
58
- /// threading infrastructure, causing mutex operations to fail with:
59
- /// "mutex lock failed: Invalid argument"
60
- ///
61
- /// This manifests as:
62
- /// ```text
63
- /// libc++abi: terminating due to uncaught exception of type std::__1::system_error:
64
- /// mutex lock failed: Invalid argument
65
- /// ```
66
- ///
67
- /// This is a known issue in `ort` (see pykeio/ort#441), fixed in later versions via commit
68
- /// 317be20 ("fix: let `Environment` drop"), but we're using v2.0.0-rc.10 through fastembed
69
- /// v5.3.1 which predates the fix.
70
- ///
71
- /// # Solution
72
- ///
73
- /// We use `Box::leak` to intentionally leak `TextEmbedding` objects during process
74
- /// shutdown, preventing their `Drop` implementation from running. This is acceptable because:
75
- ///
76
- /// 1. The OS will reclaim all process memory anyway
77
- /// 2. Avoiding the crash is more important than cleanup
78
- /// 3. This only affects process termination, not runtime behavior
79
- /// 4. Models are long-lived and would survive until process exit anyway
80
- /// 5. The memory leak is bounded (one model per unique config)
81
- ///
82
- /// # Remaining Issue
83
- ///
84
- /// Even with this fix, you may still see the mutex error during final process cleanup.
85
- /// This is because `ort` v2.0.0-rc.10 also holds the ONNX Runtime `Environment` as a
86
- /// static variable, which gets dropped during C++ static destruction after Rust cleanup.
87
- /// This error occurs *after* all Rust code has finished and can be safely ignored - all
88
- /// tests pass before the error occurs.
89
- ///
90
- /// The error will be resolved when fastembed upgrades to ort >= 2.0.0 (post-rc.10) which
91
- /// contains the proper fix.
92
- ///
93
- /// # Safety
94
- ///
95
- /// The leak is contained to process shutdown and does not affect runtime behavior.
96
- /// All normal usage patterns (creating embeddings, caching models) work identically.
97
- /// We use static references to the leaked models, which is safe because:
98
- /// - The pointers are never null (we leak valid Box<TextEmbedding>)
99
- /// - The models live until process exit
100
- /// - We never manually deallocate the leaked memory
101
- /// - Mutex provides interior mutability for the embed() method
102
- ///
103
- /// Thread-safe wrapper for leaked TextEmbedding that allows interior mutability.
104
- ///
105
- /// This wrapper holds a raw pointer to a leaked `TextEmbedding` and provides
106
- /// safe access through the Mutex lock in MODEL_CACHE.
107
- #[cfg(feature = "embeddings")]
108
- pub(crate) struct LeakedModel {
109
- ptr: *mut TextEmbedding,
110
- }
111
-
112
- #[cfg(feature = "embeddings")]
113
- impl LeakedModel {
114
- fn new(model: TextEmbedding) -> Self {
115
- Self {
116
- ptr: Box::into_raw(Box::new(model)),
117
- }
118
- }
119
-
120
- /// Get a mutable reference to the model.
121
- ///
122
- /// # Safety
123
- ///
124
- /// This is safe to call only when:
125
- /// 1. The caller has exclusive access (guaranteed by Mutex in MODEL_CACHE)
126
- /// 2. The pointer is valid (guaranteed by Box::into_raw and never deallocating)
127
- #[allow(unsafe_code, clippy::mut_from_ref)]
128
- unsafe fn get_mut(&self) -> &mut TextEmbedding {
129
- unsafe { &mut *self.ptr }
130
- }
131
- }
132
-
133
- #[cfg(feature = "embeddings")]
134
- #[allow(unsafe_code)]
135
- unsafe impl Send for LeakedModel {}
136
- #[cfg(feature = "embeddings")]
137
- #[allow(unsafe_code)]
138
- unsafe impl Sync for LeakedModel {}
139
-
140
- #[cfg(feature = "embeddings")]
141
- type CachedEmbedding = Arc<Mutex<LeakedModel>>;
142
-
143
48
  #[cfg(feature = "embeddings")]
144
49
  lazy_static! {
145
- static ref MODEL_CACHE: RwLock<HashMap<String, CachedEmbedding>> = RwLock::new(HashMap::new());
50
+ static ref MODEL_CACHE: RwLock<HashMap<String, Arc<Mutex<TextEmbedding>>>> = RwLock::new(HashMap::new());
146
51
  }
147
52
 
148
53
  /// Get or initialize a text embedding model from cache.
@@ -150,11 +55,10 @@ lazy_static! {
150
55
  /// This function ensures models are initialized only once and reused across
151
56
  /// the application, avoiding redundant downloads and initialization overhead.
152
57
  #[cfg(feature = "embeddings")]
153
- #[allow(private_interfaces)]
154
58
  pub fn get_or_init_model(
155
59
  model: EmbeddingModel,
156
60
  cache_dir: Option<std::path::PathBuf>,
157
- ) -> crate::Result<CachedEmbedding> {
61
+ ) -> crate::Result<Arc<Mutex<TextEmbedding>>> {
158
62
  let cache_directory = cache_dir.unwrap_or_else(|| {
159
63
  let mut path = std::env::current_dir().unwrap_or_else(|_| std::path::PathBuf::from("."));
160
64
  path.push(".kreuzberg");
@@ -165,26 +69,21 @@ pub fn get_or_init_model(
165
69
  let model_key = format!("{:?}_{}", model, cache_directory.display());
166
70
 
167
71
  {
168
- match MODEL_CACHE.read() {
169
- Ok(cache) => {
170
- if let Some(cached_model) = cache.get(&model_key) {
171
- return Ok(Arc::clone(cached_model));
172
- }
173
- }
174
- Err(poison_error) => {
175
- let cache = poison_error.get_ref();
176
- if let Some(cached_model) = cache.get(&model_key) {
177
- return Ok(Arc::clone(cached_model));
178
- }
179
- }
72
+ let cache = MODEL_CACHE.read().map_err(|e| crate::KreuzbergError::Plugin {
73
+ message: format!("Failed to acquire model cache read lock: {}", e),
74
+ plugin_name: "embeddings".to_string(),
75
+ })?;
76
+
77
+ if let Some(cached_model) = cache.get(&model_key) {
78
+ return Ok(Arc::clone(cached_model));
180
79
  }
181
80
  }
182
81
 
183
82
  {
184
- let mut cache = match MODEL_CACHE.write() {
185
- Ok(guard) => guard,
186
- Err(poison_error) => poison_error.into_inner(),
187
- };
83
+ let mut cache = MODEL_CACHE.write().map_err(|e| crate::KreuzbergError::Plugin {
84
+ message: format!("Failed to acquire model cache write lock: {}", e),
85
+ plugin_name: "embeddings".to_string(),
86
+ })?;
188
87
 
189
88
  if let Some(cached_model) = cache.get(&model_key) {
190
89
  return Ok(Arc::clone(cached_model));
@@ -198,8 +97,7 @@ pub fn get_or_init_model(
198
97
  plugin_name: "embeddings".to_string(),
199
98
  })?;
200
99
 
201
- let leaked_model = LeakedModel::new(embedding_model);
202
- let arc_model = Arc::new(Mutex::new(leaked_model));
100
+ let arc_model = Arc::new(Mutex::new(embedding_model));
203
101
  cache.insert(model_key, Arc::clone(&arc_model));
204
102
 
205
103
  Ok(arc_model)
@@ -350,15 +248,12 @@ pub fn generate_embeddings_for_chunks(
350
248
  let texts: Vec<String> = chunks.iter().map(|chunk| chunk.content.clone()).collect();
351
249
 
352
250
  let embeddings_result = {
353
- let locked_model = model.lock().map_err(|e| crate::KreuzbergError::Plugin {
251
+ let mut locked_model = model.lock().map_err(|e| crate::KreuzbergError::Plugin {
354
252
  message: format!("Failed to acquire model lock: {}", e),
355
253
  plugin_name: "embeddings".to_string(),
356
254
  })?;
357
255
 
358
- #[allow(unsafe_code)]
359
- let model_mut = unsafe { locked_model.get_mut() };
360
-
361
- model_mut
256
+ locked_model
362
257
  .embed(texts, Some(config.batch_size))
363
258
  .map_err(|e| crate::KreuzbergError::Plugin {
364
259
  message: format!("Failed to generate embeddings: {}", e),
@@ -425,8 +320,4 @@ mod tests {
425
320
  assert_eq!(quality.chunk_size, 2000);
426
321
  assert_eq!(quality.overlap, 200);
427
322
  }
428
-
429
- #[cfg(feature = "embeddings")]
430
- #[test]
431
- fn test_lock_poisoning_recovery_semantics() {}
432
323
  }
@@ -60,7 +60,7 @@ pub type Result<T> = std::result::Result<T, KreuzbergError>;
60
60
  /// - `Cache` - Cache operation errors (non-fatal, can be ignored)
61
61
  /// - `ImageProcessing` - Image manipulation errors
62
62
  /// - `Serialization` - JSON/MessagePack serialization errors
63
- /// - `MissingDependency` - Missing optional dependencies (tesseract, etc.)
63
+ /// - `MissingDependency` - Missing optional dependencies (tesseract, pandoc, etc.)
64
64
  /// - `Plugin` - Plugin-specific errors
65
65
  /// - `LockPoisoned` - Mutex/RwLock poisoning (should not happen in normal operation)
66
66
  /// - `UnsupportedFormat` - Unsupported MIME type or file format
@@ -16,7 +16,7 @@ use crate::error::{KreuzbergError, Result};
16
16
  ///
17
17
  /// # Performance
18
18
  /// docx-lite uses streaming XML parsing for minimal memory overhead and high throughput
19
- /// (~160 MB/s average).
19
+ /// (~160 MB/s average). It eliminates subprocess overhead compared to Pandoc (~400x faster).
20
20
  pub fn extract_text(bytes: &[u8]) -> Result<String> {
21
21
  docx_lite::extract_text_from_bytes(bytes)
22
22
  .map_err(|e| KreuzbergError::parsing(format!("DOCX text extraction failed: {}", e)))
@@ -39,7 +39,7 @@ pub fn extract_image_metadata(bytes: &[u8]) -> Result<ImageMetadata> {
39
39
 
40
40
  let width = image.width();
41
41
  let height = image.height();
42
- let format_str = format!("{:?}", format).to_uppercase();
42
+ let format_str = format!("{:?}", format);
43
43
 
44
44
  let exif_data = extract_exif_data(bytes);
45
45
 
@@ -123,7 +123,7 @@ mod tests {
123
123
  let metadata = result.unwrap();
124
124
  assert_eq!(metadata.width, 100);
125
125
  assert_eq!(metadata.height, 80);
126
- assert_eq!(metadata.format, "PNG");
126
+ assert_eq!(metadata.format, "Png");
127
127
  }
128
128
 
129
129
  #[test]
@@ -135,7 +135,7 @@ mod tests {
135
135
  let metadata = result.unwrap();
136
136
  assert_eq!(metadata.width, 200);
137
137
  assert_eq!(metadata.height, 150);
138
- assert_eq!(metadata.format, "JPEG");
138
+ assert_eq!(metadata.format, "Jpeg");
139
139
  }
140
140
 
141
141
  #[test]
@@ -147,7 +147,7 @@ mod tests {
147
147
  let metadata = result.unwrap();
148
148
  assert_eq!(metadata.width, 120);
149
149
  assert_eq!(metadata.height, 90);
150
- assert_eq!(metadata.format, "WEBP");
150
+ assert_eq!(metadata.format, "WebP");
151
151
  }
152
152
 
153
153
  #[test]
@@ -159,7 +159,7 @@ mod tests {
159
159
  let metadata = result.unwrap();
160
160
  assert_eq!(metadata.width, 50);
161
161
  assert_eq!(metadata.height, 50);
162
- assert_eq!(metadata.format, "BMP");
162
+ assert_eq!(metadata.format, "Bmp");
163
163
  }
164
164
 
165
165
  #[test]
@@ -171,7 +171,7 @@ mod tests {
171
171
  let metadata = result.unwrap();
172
172
  assert_eq!(metadata.width, 180);
173
173
  assert_eq!(metadata.height, 120);
174
- assert_eq!(metadata.format, "TIFF");
174
+ assert_eq!(metadata.format, "Tiff");
175
175
  }
176
176
 
177
177
  #[test]
@@ -183,7 +183,7 @@ mod tests {
183
183
  let metadata = result.unwrap();
184
184
  assert_eq!(metadata.width, 64);
185
185
  assert_eq!(metadata.height, 64);
186
- assert_eq!(metadata.format, "GIF");
186
+ assert_eq!(metadata.format, "Gif");
187
187
  }
188
188
 
189
189
  #[test]
@@ -217,8 +217,8 @@ mod tests {
217
217
  let png_metadata = extract_image_metadata(&png_bytes).unwrap();
218
218
  let jpeg_metadata = extract_image_metadata(&jpeg_bytes).unwrap();
219
219
 
220
- assert_eq!(png_metadata.format, "PNG");
221
- assert_eq!(jpeg_metadata.format, "JPEG");
220
+ assert_eq!(png_metadata.format, "Png");
221
+ assert_eq!(jpeg_metadata.format, "Jpeg");
222
222
  }
223
223
 
224
224
  #[test]
@@ -284,7 +284,7 @@ mod tests {
284
284
  let metadata = result.unwrap();
285
285
  assert_eq!(metadata.width, 1);
286
286
  assert_eq!(metadata.height, 1);
287
- assert_eq!(metadata.format, "PNG");
287
+ assert_eq!(metadata.format, "Png");
288
288
  }
289
289
 
290
290
  #[test]
@@ -361,8 +361,8 @@ mod tests {
361
361
  let jpeg_meta = extract_image_metadata(&jpeg_bytes).unwrap();
362
362
  let webp_meta = extract_image_metadata(&webp_bytes).unwrap();
363
363
 
364
- assert_eq!(png_meta.format, "PNG");
365
- assert_eq!(jpeg_meta.format, "JPEG");
366
- assert_eq!(webp_meta.format, "WEBP");
364
+ assert_eq!(png_meta.format, "Png");
365
+ assert_eq!(jpeg_meta.format, "Jpeg");
366
+ assert_eq!(webp_meta.format, "WebP");
367
367
  }
368
368
  }
@@ -255,6 +255,7 @@ pub async fn convert_office_doc(
255
255
  let stderr = String::from_utf8_lossy(&output.stderr);
256
256
  let stdout = String::from_utf8_lossy(&output.stdout);
257
257
 
258
+ // Build detailed error message with both stdout and stderr
258
259
  let mut error_details = format!(
259
260
  "LibreOffice process failed with return code {}",
260
261
  output.status.code().unwrap_or(-1)
@@ -25,6 +25,9 @@ pub mod libreoffice;
25
25
  #[cfg(feature = "office")]
26
26
  pub mod office_metadata;
27
27
 
28
+ #[cfg(feature = "office")]
29
+ pub mod pandoc;
30
+
28
31
  #[cfg(feature = "office")]
29
32
  pub mod pptx;
30
33
 
@@ -34,9 +37,6 @@ pub mod table;
34
37
  #[cfg(feature = "xml")]
35
38
  pub mod xml;
36
39
 
37
- #[cfg(any(feature = "office", feature = "html"))]
38
- pub mod markdown;
39
-
40
40
  pub use structured::{JsonExtractionConfig, StructuredDataResult, parse_json, parse_toml, parse_yaml};
41
41
  pub use text::parse_text;
42
42
 
@@ -63,9 +63,8 @@ pub use libreoffice::{check_libreoffice_available, convert_doc_to_docx, convert_
63
63
 
64
64
  #[cfg(feature = "office")]
65
65
  pub use office_metadata::{
66
- CoreProperties, CustomProperties, DocxAppProperties, OdtProperties, PptxAppProperties, XlsxAppProperties,
67
- extract_core_properties, extract_custom_properties, extract_docx_app_properties, extract_odt_properties,
68
- extract_pptx_app_properties, extract_xlsx_app_properties,
66
+ CoreProperties, CustomProperties, DocxAppProperties, PptxAppProperties, XlsxAppProperties, extract_core_properties,
67
+ extract_custom_properties, extract_docx_app_properties, extract_pptx_app_properties, extract_xlsx_app_properties,
69
68
  };
70
69
 
71
70
  #[cfg(feature = "office")]
@@ -76,6 +75,3 @@ pub use table::table_from_arrow_to_markdown;
76
75
 
77
76
  #[cfg(feature = "xml")]
78
77
  pub use xml::parse_xml;
79
-
80
- #[cfg(any(feature = "office", feature = "html"))]
81
- pub use markdown::cells_to_markdown;
@@ -35,7 +35,6 @@
35
35
  pub mod app_properties;
36
36
  pub mod core_properties;
37
37
  pub mod custom_properties;
38
- pub mod odt_properties;
39
38
 
40
39
  pub use app_properties::{
41
40
  DocxAppProperties, PptxAppProperties, XlsxAppProperties, extract_docx_app_properties, extract_pptx_app_properties,
@@ -43,7 +42,6 @@ pub use app_properties::{
43
42
  };
44
43
  pub use core_properties::{CoreProperties, extract_core_properties};
45
44
  pub use custom_properties::{CustomProperties, extract_custom_properties};
46
- pub use odt_properties::{OdtProperties, extract_odt_properties};
47
45
 
48
46
  use roxmltree::Node;
49
47