RubyGems - kreuzberg - Versions diffs - 4.0.0.pre.rc.6 → 4.0.0.rc1 - Mend

kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (175) hide show

checksums.yaml +4 -4
data/.gitignore +0 -6
data/.rubocop.yaml +534 -1
data/Gemfile +2 -1
data/Gemfile.lock +11 -11
data/README.md +5 -10
data/examples/async_patterns.rb +0 -1
data/ext/kreuzberg_rb/extconf.rb +0 -10
data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
data/ext/kreuzberg_rb/native/build.rs +2 -0
data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
data/ext/kreuzberg_rb/native/include/strings.h +2 -2
data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
data/kreuzberg.gemspec +14 -57
data/lib/kreuzberg/cache_api.rb +0 -1
data/lib/kreuzberg/cli.rb +2 -2
data/lib/kreuzberg/config.rb +2 -9
data/lib/kreuzberg/errors.rb +7 -75
data/lib/kreuzberg/extraction_api.rb +0 -1
data/lib/kreuzberg/setup_lib_path.rb +0 -1
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +0 -21
data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
data/sig/kreuzberg.rbs +3 -55
data/spec/binding/cli_proxy_spec.rb +4 -2
data/spec/binding/cli_spec.rb +11 -12
data/spec/examples.txt +104 -0
data/spec/fixtures/config.yaml +1 -0
data/spec/spec_helper.rb +1 -1
data/vendor/kreuzberg/Cargo.toml +42 -112
data/vendor/kreuzberg/README.md +2 -2
data/vendor/kreuzberg/build.rs +4 -18
data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
data/vendor/kreuzberg/src/cache/mod.rs +3 -27
data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
data/vendor/kreuzberg/src/core/extractor.rs +81 -202
data/vendor/kreuzberg/src/core/io.rs +2 -4
data/vendor/kreuzberg/src/core/mime.rs +12 -2
data/vendor/kreuzberg/src/core/mod.rs +1 -4
data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
data/vendor/kreuzberg/src/embeddings.rs +16 -125
data/vendor/kreuzberg/src/error.rs +1 -1
data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
data/vendor/kreuzberg/src/extraction/image.rs +13 -13
data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
data/vendor/kreuzberg/src/extractors/email.rs +0 -14
data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
data/vendor/kreuzberg/src/extractors/html.rs +154 -137
data/vendor/kreuzberg/src/extractors/image.rs +4 -7
data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
data/vendor/kreuzberg/src/extractors/text.rs +5 -23
data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
data/vendor/kreuzberg/src/lib.rs +1 -4
data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
data/vendor/kreuzberg/src/mcp/server.rs +3 -5
data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
data/vendor/kreuzberg/src/pdf/error.rs +1 -1
data/vendor/kreuzberg/src/pdf/table.rs +44 -17
data/vendor/kreuzberg/src/pdf/text.rs +3 -0
data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
data/vendor/kreuzberg/src/types.rs +12 -42
data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
data/vendor/kreuzberg/tests/config_features.rs +0 -18
data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
data/vendor/kreuzberg/tests/core_integration.rs +7 -24
data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
data/vendor/kreuzberg/tests/security_validation.rs +1 -12
metadata +25 -90
data/.rubocop.yml +0 -538
data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
data/lib/kreuzberg/error_context.rb +0 -32
data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
data/vendor/kreuzberg/src/extractors/security.rs +0 -484
data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
data/vendor/kreuzberg/src/panic_context.rs +0 -154
data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
data/vendor/rb-sys/.cargo-ok +0 -1
data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
data/vendor/rb-sys/Cargo.lock +0 -393
data/vendor/rb-sys/Cargo.toml +0 -70
data/vendor/rb-sys/Cargo.toml.orig +0 -57
data/vendor/rb-sys/LICENSE-APACHE +0 -190
data/vendor/rb-sys/LICENSE-MIT +0 -21
data/vendor/rb-sys/bin/release.sh +0 -21
data/vendor/rb-sys/build/features.rs +0 -108
data/vendor/rb-sys/build/main.rs +0 -246
data/vendor/rb-sys/build/stable_api_config.rs +0 -153
data/vendor/rb-sys/build/version.rs +0 -48
data/vendor/rb-sys/readme.md +0 -36
data/vendor/rb-sys/src/bindings.rs +0 -21
data/vendor/rb-sys/src/hidden.rs +0 -11
data/vendor/rb-sys/src/lib.rs +0 -34
data/vendor/rb-sys/src/macros.rs +0 -371
data/vendor/rb-sys/src/memory.rs +0 -53
data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
data/vendor/rb-sys/src/special_consts.rs +0 -31
data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
data/vendor/rb-sys/src/stable_api.rs +0 -261
data/vendor/rb-sys/src/symbol.rs +0 -31
data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
data/vendor/rb-sys/src/utils.rs +0 -89
data/vendor/rb-sys/src/value_type.rs +0 -7

data/vendor/kreuzberg/src/core/pipeline.rs CHANGED Viewed

@@ -30,13 +30,6 @@ use crate::{KreuzbergError, Result};
 /// - Validator errors bubble up immediately
 /// - Post-processor errors are caught and recorded in metadata
 /// - System errors (IO, RuntimeError equivalents) always bubble up
-#[cfg_attr(feature = "otel", tracing::instrument(
-    skip(result, config),
-    fields(
-        pipeline.stage = "post_processing",
-        content.length = result.content.len(),
-    )
-))]
 pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfig) -> Result<ExtractionResult> {
     let pp_config = config.postprocessor.as_ref();
     let postprocessing_enabled = pp_config.is_none_or(|c| c.enabled);
@@ -234,18 +227,13 @@ mod tests {
     use crate::types::Metadata;
     use lazy_static::lazy_static;
-    const VALIDATION_MARKER_KEY: &str = "registry_validation_marker";
-    const QUALITY_VALIDATION_MARKER: &str = "quality_validation_test";
-    const POSTPROCESSOR_VALIDATION_MARKER: &str = "postprocessor_validation_test";
-    const ORDER_VALIDATION_MARKER: &str = "order_validation_test";
     lazy_static! {
         static ref REGISTRY_TEST_GUARD: std::sync::Mutex<()> = std::sync::Mutex::new(());
     }
     #[tokio::test]
     async fn test_run_pipeline_basic() {
-        let mut result = ExtractionResult {
+        let result = ExtractionResult {
             content: "test".to_string(),
             mime_type: "text/plain".to_string(),
             metadata: Metadata::default(),
@@ -254,10 +242,6 @@ mod tests {
             chunks: None,
             images: None,
         };
-        result.metadata.additional.insert(
-            VALIDATION_MARKER_KEY.to_string(),
-            serde_json::json!(ORDER_VALIDATION_MARKER),
-        );
         let config = ExtractionConfig::default();
         let processed = run_pipeline(result, &config).await.unwrap();
@@ -413,17 +397,9 @@ mod tests {
     #[tokio::test]
     async fn test_pipeline_empty_content() {
-        let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
-        {
-            let registry = crate::plugins::registry::get_post_processor_registry();
-            registry.write().unwrap().shutdown_all().unwrap();
-        }
         {
-            let registry = crate::plugins::registry::get_validator_registry();
-            registry.write().unwrap().shutdown_all().unwrap();
-        }
+            let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
+        } // Drop guard before async operations
         let result = ExtractionResult {
             content: String::new(),
             mime_type: "text/plain".to_string(),
@@ -435,8 +411,6 @@ mod tests {
         };
         let config = ExtractionConfig::default();
-        drop(_guard);
         let processed = run_pipeline(result, &config).await.unwrap();
         assert_eq!(processed.content, "");
     }
@@ -472,8 +446,6 @@ mod tests {
     #[tokio::test]
     #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
     async fn test_pipeline_with_keyword_extraction() {
-        let _ = crate::keywords::register_keyword_processor();
         let result = ExtractionResult {
             content: r#"
 Machine learning is a branch of artificial intelligence that focuses on
@@ -544,18 +516,6 @@ Natural language processing enables computers to understand human language.
     #[tokio::test]
     #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
     async fn test_pipeline_keyword_extraction_short_content() {
-        let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
-        crate::plugins::registry::get_validator_registry()
-            .write()
-            .unwrap()
-            .shutdown_all()
-            .unwrap();
-        crate::plugins::registry::get_post_processor_registry()
-            .write()
-            .unwrap()
-            .shutdown_all()
-            .unwrap();
         let result = ExtractionResult {
             content: "Short text".to_string(),
             mime_type: "text/plain".to_string(),
@@ -577,8 +537,6 @@ Natural language processing enables computers to understand human language.
             ..Default::default()
         };
-        drop(_guard);
         let processed = run_pipeline(result, &config).await.unwrap();
         assert!(!processed.metadata.additional.contains_key("keywords"));
@@ -586,6 +544,9 @@ Natural language processing enables computers to understand human language.
     #[tokio::test]
     async fn test_postprocessor_runs_before_validator() {
+        {
+            let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
+        } // Drop guard before async operations
         use crate::plugins::{Plugin, PostProcessor, ProcessingStage, Validator};
         use async_trait::async_trait;
         use std::sync::Arc;
@@ -640,17 +601,6 @@ Natural language processing enables computers to understand human language.
         #[async_trait]
         impl Validator for TestValidator {
             async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
-                let should_validate = result
-                    .metadata
-                    .additional
-                    .get(VALIDATION_MARKER_KEY)
-                    .and_then(|v| v.as_str())
-                    == Some(POSTPROCESSOR_VALIDATION_MARKER);
-                if !should_validate {
-                    return Ok(());
-                }
                 let processed = result
                     .metadata
                     .additional
@@ -669,23 +619,18 @@ Natural language processing enables computers to understand human language.
         }
         let pp_registry = crate::plugins::registry::get_post_processor_registry();
-        let val_registry = crate::plugins::registry::get_validator_registry();
-        let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
-        pp_registry.write().unwrap().shutdown_all().unwrap();
-        val_registry.write().unwrap().shutdown_all().unwrap();
         {
             let mut registry = pp_registry.write().unwrap();
             registry.register(Arc::new(TestPostProcessor), 0).unwrap();
         }
+        let val_registry = crate::plugins::registry::get_validator_registry();
         {
             let mut registry = val_registry.write().unwrap();
             registry.register(Arc::new(TestValidator)).unwrap();
         }
-        let mut result = ExtractionResult {
+        let result = ExtractionResult {
             content: "test".to_string(),
             mime_type: "text/plain".to_string(),
             metadata: Metadata::default(),
@@ -694,18 +639,18 @@ Natural language processing enables computers to understand human language.
             chunks: None,
             images: None,
         };
-        result.metadata.additional.insert(
-            VALIDATION_MARKER_KEY.to_string(),
-            serde_json::json!(POSTPROCESSOR_VALIDATION_MARKER),
-        );
         let config = ExtractionConfig::default();
-        drop(_guard);
         let processed = run_pipeline(result, &config).await;
-        pp_registry.write().unwrap().shutdown_all().unwrap();
-        val_registry.write().unwrap().shutdown_all().unwrap();
+        {
+            let mut registry = pp_registry.write().unwrap();
+            registry.remove("test-processor").unwrap();
+        }
+        {
+            let mut registry = val_registry.write().unwrap();
+            registry.remove("test-validator").unwrap();
+        }
         assert!(processed.is_ok(), "Validator should have seen post-processor metadata");
         let processed = processed.unwrap();
@@ -719,7 +664,9 @@ Natural language processing enables computers to understand human language.
     #[tokio::test]
     #[cfg(feature = "quality")]
     async fn test_quality_processing_runs_before_validator() {
-        let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
+        {
+            let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
+        } // Drop guard before async operations
         use crate::plugins::{Plugin, Validator};
         use async_trait::async_trait;
         use std::sync::Arc;
@@ -743,17 +690,6 @@ Natural language processing enables computers to understand human language.
         #[async_trait]
         impl Validator for QualityValidator {
             async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
-                let should_validate = result
-                    .metadata
-                    .additional
-                    .get(VALIDATION_MARKER_KEY)
-                    .and_then(|v| v.as_str())
-                    == Some(QUALITY_VALIDATION_MARKER);
-                if !should_validate {
-                    return Ok(());
-                }
                 if !result.metadata.additional.contains_key("quality_score") {
                     return Err(crate::KreuzbergError::Validation {
                         message: "Quality processing did not run before validator".to_string(),
@@ -770,7 +706,7 @@ Natural language processing enables computers to understand human language.
             registry.register(Arc::new(QualityValidator)).unwrap();
         }
-        let mut result = ExtractionResult {
+        let result = ExtractionResult {
             content: "This is meaningful test content for quality scoring.".to_string(),
             mime_type: "text/plain".to_string(),
             metadata: Metadata::default(),
@@ -779,18 +715,12 @@ Natural language processing enables computers to understand human language.
             chunks: None,
             images: None,
         };
-        result.metadata.additional.insert(
-            VALIDATION_MARKER_KEY.to_string(),
-            serde_json::json!(QUALITY_VALIDATION_MARKER),
-        );
         let config = ExtractionConfig {
             enable_quality_processing: true,
             ..Default::default()
         };
-        drop(_guard);
         let processed = run_pipeline(result, &config).await;
         {
@@ -803,6 +733,9 @@ Natural language processing enables computers to understand human language.
     #[tokio::test]
     async fn test_multiple_postprocessors_run_before_validator() {
+        {
+            let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
+        } // Drop guard before async operations
         use crate::plugins::{Plugin, PostProcessor, ProcessingStage, Validator};
         use async_trait::async_trait;
         use std::sync::Arc;
@@ -904,17 +837,6 @@ Natural language processing enables computers to understand human language.
         #[async_trait]
         impl Validator for OrderValidator {
             async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
-                let should_validate = result
-                    .metadata
-                    .additional
-                    .get(VALIDATION_MARKER_KEY)
-                    .and_then(|v| v.as_str())
-                    == Some(ORDER_VALIDATION_MARKER);
-                if !should_validate {
-                    return Ok(());
-                }
                 let order = result
                     .metadata
                     .additional
@@ -944,18 +866,13 @@ Natural language processing enables computers to understand human language.
         }
         let pp_registry = crate::plugins::registry::get_post_processor_registry();
-        let val_registry = crate::plugins::registry::get_validator_registry();
-        let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
-        pp_registry.write().unwrap().shutdown_all().unwrap();
-        val_registry.write().unwrap().shutdown_all().unwrap();
         {
             let mut registry = pp_registry.write().unwrap();
             registry.register(Arc::new(EarlyProcessor), 0).unwrap();
             registry.register(Arc::new(LateProcessor), 0).unwrap();
         }
+        let val_registry = crate::plugins::registry::get_validator_registry();
         {
             let mut registry = val_registry.write().unwrap();
             registry.register(Arc::new(OrderValidator)).unwrap();
@@ -972,12 +889,17 @@ Natural language processing enables computers to understand human language.
         };
         let config = ExtractionConfig::default();
-        drop(_guard);
         let processed = run_pipeline(result, &config).await;
-        pp_registry.write().unwrap().shutdown_all().unwrap();
-        val_registry.write().unwrap().shutdown_all().unwrap();
+        {
+            let mut registry = pp_registry.write().unwrap();
+            registry.remove("early-proc").unwrap();
+            registry.remove("late-proc").unwrap();
+        }
+        {
+            let mut registry = val_registry.write().unwrap();
+            registry.remove("order-validator").unwrap();
+        }
         assert!(processed.is_ok(), "All processors should run before validator");
     }

data/vendor/kreuzberg/src/embeddings.rs CHANGED Viewed

@@ -45,104 +45,9 @@ use std::collections::HashMap;
 #[cfg(feature = "embeddings")]
 use lazy_static::lazy_static;
-/// Wrapper for TextEmbedding that prevents cleanup during process shutdown.
-///
-/// # Problem
-///
-/// When the process terminates, global static objects are dropped. The `TextEmbedding`
-/// objects from fastembed contain ONNX Runtime sessions (via `ort v2.0.0-rc.10`), and
-/// during their `Drop` implementation, ONNX Runtime's C++ destructor tries to acquire
-/// mutexes for cleanup.
-///
-/// At process shutdown time, the C++ runtime may have already begun tearing down
-/// threading infrastructure, causing mutex operations to fail with:
-/// "mutex lock failed: Invalid argument"
-///
-/// This manifests as:
-/// ```text
-/// libc++abi: terminating due to uncaught exception of type std::__1::system_error:
-/// mutex lock failed: Invalid argument
-/// ```
-///
-/// This is a known issue in `ort` (see pykeio/ort#441), fixed in later versions via commit
-/// 317be20 ("fix: let `Environment` drop"), but we're using v2.0.0-rc.10 through fastembed
-/// v5.3.1 which predates the fix.
-///
-/// # Solution
-///
-/// We use `Box::leak` to intentionally leak `TextEmbedding` objects during process
-/// shutdown, preventing their `Drop` implementation from running. This is acceptable because:
-///
-/// 1. The OS will reclaim all process memory anyway
-/// 2. Avoiding the crash is more important than cleanup
-/// 3. This only affects process termination, not runtime behavior
-/// 4. Models are long-lived and would survive until process exit anyway
-/// 5. The memory leak is bounded (one model per unique config)
-///
-/// # Remaining Issue
-///
-/// Even with this fix, you may still see the mutex error during final process cleanup.
-/// This is because `ort` v2.0.0-rc.10 also holds the ONNX Runtime `Environment` as a
-/// static variable, which gets dropped during C++ static destruction after Rust cleanup.
-/// This error occurs *after* all Rust code has finished and can be safely ignored - all
-/// tests pass before the error occurs.
-///
-/// The error will be resolved when fastembed upgrades to ort >= 2.0.0 (post-rc.10) which
-/// contains the proper fix.
-///
-/// # Safety
-///
-/// The leak is contained to process shutdown and does not affect runtime behavior.
-/// All normal usage patterns (creating embeddings, caching models) work identically.
-/// We use static references to the leaked models, which is safe because:
-/// - The pointers are never null (we leak valid Box<TextEmbedding>)
-/// - The models live until process exit
-/// - We never manually deallocate the leaked memory
-/// - Mutex provides interior mutability for the embed() method
-///
-/// Thread-safe wrapper for leaked TextEmbedding that allows interior mutability.
-///
-/// This wrapper holds a raw pointer to a leaked `TextEmbedding` and provides
-/// safe access through the Mutex lock in MODEL_CACHE.
-#[cfg(feature = "embeddings")]
-pub(crate) struct LeakedModel {
-    ptr: *mut TextEmbedding,
-}
-#[cfg(feature = "embeddings")]
-impl LeakedModel {
-    fn new(model: TextEmbedding) -> Self {
-        Self {
-            ptr: Box::into_raw(Box::new(model)),
-        }
-    }
-    /// Get a mutable reference to the model.
-    ///
-    /// # Safety
-    ///
-    /// This is safe to call only when:
-    /// 1. The caller has exclusive access (guaranteed by Mutex in MODEL_CACHE)
-    /// 2. The pointer is valid (guaranteed by Box::into_raw and never deallocating)
-    #[allow(unsafe_code, clippy::mut_from_ref)]
-    unsafe fn get_mut(&self) -> &mut TextEmbedding {
-        unsafe { &mut *self.ptr }
-    }
-}
-#[cfg(feature = "embeddings")]
-#[allow(unsafe_code)]
-unsafe impl Send for LeakedModel {}
-#[cfg(feature = "embeddings")]
-#[allow(unsafe_code)]
-unsafe impl Sync for LeakedModel {}
-#[cfg(feature = "embeddings")]
-type CachedEmbedding = Arc<Mutex<LeakedModel>>;
 #[cfg(feature = "embeddings")]
 lazy_static! {
-    static ref MODEL_CACHE: RwLock<HashMap<String, CachedEmbedding>> = RwLock::new(HashMap::new());
+    static ref MODEL_CACHE: RwLock<HashMap<String, Arc<Mutex<TextEmbedding>>>> = RwLock::new(HashMap::new());
 }
 /// Get or initialize a text embedding model from cache.
@@ -150,11 +55,10 @@ lazy_static! {
 /// This function ensures models are initialized only once and reused across
 /// the application, avoiding redundant downloads and initialization overhead.
 #[cfg(feature = "embeddings")]
-#[allow(private_interfaces)]
 pub fn get_or_init_model(
     model: EmbeddingModel,
     cache_dir: Option<std::path::PathBuf>,
-) -> crate::Result<CachedEmbedding> {
+) -> crate::Result<Arc<Mutex<TextEmbedding>>> {
     let cache_directory = cache_dir.unwrap_or_else(|| {
         let mut path = std::env::current_dir().unwrap_or_else(|_| std::path::PathBuf::from("."));
         path.push(".kreuzberg");
@@ -165,26 +69,21 @@ pub fn get_or_init_model(
     let model_key = format!("{:?}_{}", model, cache_directory.display());
     {
-        match MODEL_CACHE.read() {
-            Ok(cache) => {
-                if let Some(cached_model) = cache.get(&model_key) {
-                    return Ok(Arc::clone(cached_model));
-                }
-            }
-            Err(poison_error) => {
-                let cache = poison_error.get_ref();
-                if let Some(cached_model) = cache.get(&model_key) {
-                    return Ok(Arc::clone(cached_model));
-                }
-            }
+        let cache = MODEL_CACHE.read().map_err(|e| crate::KreuzbergError::Plugin {
+            message: format!("Failed to acquire model cache read lock: {}", e),
+            plugin_name: "embeddings".to_string(),
+        })?;
+        if let Some(cached_model) = cache.get(&model_key) {
+            return Ok(Arc::clone(cached_model));
         }
     }
     {
-        let mut cache = match MODEL_CACHE.write() {
-            Ok(guard) => guard,
-            Err(poison_error) => poison_error.into_inner(),
-        };
+        let mut cache = MODEL_CACHE.write().map_err(|e| crate::KreuzbergError::Plugin {
+            message: format!("Failed to acquire model cache write lock: {}", e),
+            plugin_name: "embeddings".to_string(),
+        })?;
         if let Some(cached_model) = cache.get(&model_key) {
             return Ok(Arc::clone(cached_model));
@@ -198,8 +97,7 @@ pub fn get_or_init_model(
             plugin_name: "embeddings".to_string(),
         })?;
-        let leaked_model = LeakedModel::new(embedding_model);
-        let arc_model = Arc::new(Mutex::new(leaked_model));
+        let arc_model = Arc::new(Mutex::new(embedding_model));
         cache.insert(model_key, Arc::clone(&arc_model));
         Ok(arc_model)
@@ -350,15 +248,12 @@ pub fn generate_embeddings_for_chunks(
     let texts: Vec<String> = chunks.iter().map(|chunk| chunk.content.clone()).collect();
     let embeddings_result = {
-        let locked_model = model.lock().map_err(|e| crate::KreuzbergError::Plugin {
+        let mut locked_model = model.lock().map_err(|e| crate::KreuzbergError::Plugin {
             message: format!("Failed to acquire model lock: {}", e),
             plugin_name: "embeddings".to_string(),
         })?;
-        #[allow(unsafe_code)]
-        let model_mut = unsafe { locked_model.get_mut() };
-        model_mut
+        locked_model
             .embed(texts, Some(config.batch_size))
             .map_err(|e| crate::KreuzbergError::Plugin {
                 message: format!("Failed to generate embeddings: {}", e),
@@ -425,8 +320,4 @@ mod tests {
         assert_eq!(quality.chunk_size, 2000);
         assert_eq!(quality.overlap, 200);
     }
-    #[cfg(feature = "embeddings")]
-    #[test]
-    fn test_lock_poisoning_recovery_semantics() {}
 }

data/vendor/kreuzberg/src/error.rs CHANGED Viewed

@@ -60,7 +60,7 @@ pub type Result<T> = std::result::Result<T, KreuzbergError>;
 /// - `Cache` - Cache operation errors (non-fatal, can be ignored)
 /// - `ImageProcessing` - Image manipulation errors
 /// - `Serialization` - JSON/MessagePack serialization errors
-/// - `MissingDependency` - Missing optional dependencies (tesseract, etc.)
+/// - `MissingDependency` - Missing optional dependencies (tesseract, pandoc, etc.)
 /// - `Plugin` - Plugin-specific errors
 /// - `LockPoisoned` - Mutex/RwLock poisoning (should not happen in normal operation)
 /// - `UnsupportedFormat` - Unsupported MIME type or file format

data/vendor/kreuzberg/src/extraction/docx.rs CHANGED Viewed

@@ -16,7 +16,7 @@ use crate::error::{KreuzbergError, Result};
 ///
 /// # Performance
 /// docx-lite uses streaming XML parsing for minimal memory overhead and high throughput
-/// (~160 MB/s average).
+/// (~160 MB/s average). It eliminates subprocess overhead compared to Pandoc (~400x faster).
 pub fn extract_text(bytes: &[u8]) -> Result<String> {
     docx_lite::extract_text_from_bytes(bytes)
         .map_err(|e| KreuzbergError::parsing(format!("DOCX text extraction failed: {}", e)))

data/vendor/kreuzberg/src/extraction/image.rs CHANGED Viewed

@@ -39,7 +39,7 @@ pub fn extract_image_metadata(bytes: &[u8]) -> Result<ImageMetadata> {
     let width = image.width();
     let height = image.height();
-    let format_str = format!("{:?}", format).to_uppercase();
+    let format_str = format!("{:?}", format);
     let exif_data = extract_exif_data(bytes);
@@ -123,7 +123,7 @@ mod tests {
         let metadata = result.unwrap();
         assert_eq!(metadata.width, 100);
         assert_eq!(metadata.height, 80);
-        assert_eq!(metadata.format, "PNG");
+        assert_eq!(metadata.format, "Png");
     }
     #[test]
@@ -135,7 +135,7 @@ mod tests {
         let metadata = result.unwrap();
         assert_eq!(metadata.width, 200);
         assert_eq!(metadata.height, 150);
-        assert_eq!(metadata.format, "JPEG");
+        assert_eq!(metadata.format, "Jpeg");
     }
     #[test]
@@ -147,7 +147,7 @@ mod tests {
         let metadata = result.unwrap();
         assert_eq!(metadata.width, 120);
         assert_eq!(metadata.height, 90);
-        assert_eq!(metadata.format, "WEBP");
+        assert_eq!(metadata.format, "WebP");
     }
     #[test]
@@ -159,7 +159,7 @@ mod tests {
         let metadata = result.unwrap();
         assert_eq!(metadata.width, 50);
         assert_eq!(metadata.height, 50);
-        assert_eq!(metadata.format, "BMP");
+        assert_eq!(metadata.format, "Bmp");
     }
     #[test]
@@ -171,7 +171,7 @@ mod tests {
         let metadata = result.unwrap();
         assert_eq!(metadata.width, 180);
         assert_eq!(metadata.height, 120);
-        assert_eq!(metadata.format, "TIFF");
+        assert_eq!(metadata.format, "Tiff");
     }
     #[test]
@@ -183,7 +183,7 @@ mod tests {
         let metadata = result.unwrap();
         assert_eq!(metadata.width, 64);
         assert_eq!(metadata.height, 64);
-        assert_eq!(metadata.format, "GIF");
+        assert_eq!(metadata.format, "Gif");
     }
     #[test]
@@ -217,8 +217,8 @@ mod tests {
         let png_metadata = extract_image_metadata(&png_bytes).unwrap();
         let jpeg_metadata = extract_image_metadata(&jpeg_bytes).unwrap();
-        assert_eq!(png_metadata.format, "PNG");
-        assert_eq!(jpeg_metadata.format, "JPEG");
+        assert_eq!(png_metadata.format, "Png");
+        assert_eq!(jpeg_metadata.format, "Jpeg");
     }
     #[test]
@@ -284,7 +284,7 @@ mod tests {
         let metadata = result.unwrap();
         assert_eq!(metadata.width, 1);
         assert_eq!(metadata.height, 1);
-        assert_eq!(metadata.format, "PNG");
+        assert_eq!(metadata.format, "Png");
     }
     #[test]
@@ -361,8 +361,8 @@ mod tests {
         let jpeg_meta = extract_image_metadata(&jpeg_bytes).unwrap();
         let webp_meta = extract_image_metadata(&webp_bytes).unwrap();
-        assert_eq!(png_meta.format, "PNG");
-        assert_eq!(jpeg_meta.format, "JPEG");
-        assert_eq!(webp_meta.format, "WEBP");
+        assert_eq!(png_meta.format, "Png");
+        assert_eq!(jpeg_meta.format, "Jpeg");
+        assert_eq!(webp_meta.format, "WebP");
     }
 }

data/vendor/kreuzberg/src/extraction/libreoffice.rs CHANGED Viewed

@@ -255,6 +255,7 @@ pub async fn convert_office_doc(
         let stderr = String::from_utf8_lossy(&output.stderr);
         let stdout = String::from_utf8_lossy(&output.stdout);
+        // Build detailed error message with both stdout and stderr
         let mut error_details = format!(
             "LibreOffice process failed with return code {}",
             output.status.code().unwrap_or(-1)

data/vendor/kreuzberg/src/extraction/mod.rs CHANGED Viewed

@@ -25,6 +25,9 @@ pub mod libreoffice;
 #[cfg(feature = "office")]
 pub mod office_metadata;
+#[cfg(feature = "office")]
+pub mod pandoc;
 #[cfg(feature = "office")]
 pub mod pptx;
@@ -34,9 +37,6 @@ pub mod table;
 #[cfg(feature = "xml")]
 pub mod xml;
-#[cfg(any(feature = "office", feature = "html"))]
-pub mod markdown;
 pub use structured::{JsonExtractionConfig, StructuredDataResult, parse_json, parse_toml, parse_yaml};
 pub use text::parse_text;
@@ -63,9 +63,8 @@ pub use libreoffice::{check_libreoffice_available, convert_doc_to_docx, convert_
 #[cfg(feature = "office")]
 pub use office_metadata::{
-    CoreProperties, CustomProperties, DocxAppProperties, OdtProperties, PptxAppProperties, XlsxAppProperties,
-    extract_core_properties, extract_custom_properties, extract_docx_app_properties, extract_odt_properties,
-    extract_pptx_app_properties, extract_xlsx_app_properties,
+    CoreProperties, CustomProperties, DocxAppProperties, PptxAppProperties, XlsxAppProperties, extract_core_properties,
+    extract_custom_properties, extract_docx_app_properties, extract_pptx_app_properties, extract_xlsx_app_properties,
 };
 #[cfg(feature = "office")]
@@ -76,6 +75,3 @@ pub use table::table_from_arrow_to_markdown;
 #[cfg(feature = "xml")]
 pub use xml::parse_xml;
-#[cfg(any(feature = "office", feature = "html"))]
-pub use markdown::cells_to_markdown;

data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs CHANGED Viewed

@@ -35,7 +35,6 @@
 pub mod app_properties;
 pub mod core_properties;
 pub mod custom_properties;
-pub mod odt_properties;
 pub use app_properties::{
     DocxAppProperties, PptxAppProperties, XlsxAppProperties, extract_docx_app_properties, extract_pptx_app_properties,
@@ -43,7 +42,6 @@ pub use app_properties::{
 };
 pub use core_properties::{CoreProperties, extract_core_properties};
 pub use custom_properties::{CustomProperties, extract_custom_properties};
-pub use odt_properties::{OdtProperties, extract_odt_properties};
 use roxmltree::Node;