RubyGems - kreuzberg - Versions diffs - 4.0.0.pre.rc.6 → 4.0.0.rc1 - Mend

kreuzberg 4.0.0.pre.rc.6 → 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (175) hide show

checksums.yaml +4 -4
data/.gitignore +0 -6
data/.rubocop.yaml +534 -1
data/Gemfile +2 -1
data/Gemfile.lock +11 -11
data/README.md +5 -10
data/examples/async_patterns.rb +0 -1
data/ext/kreuzberg_rb/extconf.rb +0 -10
data/ext/kreuzberg_rb/native/Cargo.toml +15 -23
data/ext/kreuzberg_rb/native/build.rs +2 -0
data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
data/ext/kreuzberg_rb/native/include/strings.h +2 -2
data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
data/ext/kreuzberg_rb/native/src/lib.rs +16 -75
data/kreuzberg.gemspec +14 -57
data/lib/kreuzberg/cache_api.rb +0 -1
data/lib/kreuzberg/cli.rb +2 -2
data/lib/kreuzberg/config.rb +2 -9
data/lib/kreuzberg/errors.rb +7 -75
data/lib/kreuzberg/extraction_api.rb +0 -1
data/lib/kreuzberg/setup_lib_path.rb +0 -1
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +0 -21
data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
data/sig/kreuzberg.rbs +3 -55
data/spec/binding/cli_proxy_spec.rb +4 -2
data/spec/binding/cli_spec.rb +11 -12
data/spec/examples.txt +104 -0
data/spec/fixtures/config.yaml +1 -0
data/spec/spec_helper.rb +1 -1
data/vendor/kreuzberg/Cargo.toml +42 -112
data/vendor/kreuzberg/README.md +2 -2
data/vendor/kreuzberg/build.rs +4 -18
data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
data/vendor/kreuzberg/src/cache/mod.rs +3 -27
data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
data/vendor/kreuzberg/src/core/extractor.rs +81 -202
data/vendor/kreuzberg/src/core/io.rs +2 -4
data/vendor/kreuzberg/src/core/mime.rs +12 -2
data/vendor/kreuzberg/src/core/mod.rs +1 -4
data/vendor/kreuzberg/src/core/pipeline.rs +33 -111
data/vendor/kreuzberg/src/embeddings.rs +16 -125
data/vendor/kreuzberg/src/error.rs +1 -1
data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
data/vendor/kreuzberg/src/extraction/image.rs +13 -13
data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -0
data/vendor/kreuzberg/src/extraction/mod.rs +5 -9
data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
data/vendor/kreuzberg/src/extractors/archive.rs +0 -21
data/vendor/kreuzberg/src/extractors/docx.rs +128 -16
data/vendor/kreuzberg/src/extractors/email.rs +0 -14
data/vendor/kreuzberg/src/extractors/excel.rs +20 -19
data/vendor/kreuzberg/src/extractors/html.rs +154 -137
data/vendor/kreuzberg/src/extractors/image.rs +4 -7
data/vendor/kreuzberg/src/extractors/mod.rs +9 -106
data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
data/vendor/kreuzberg/src/extractors/pdf.rs +15 -12
data/vendor/kreuzberg/src/extractors/pptx.rs +3 -17
data/vendor/kreuzberg/src/extractors/structured.rs +0 -14
data/vendor/kreuzberg/src/extractors/text.rs +5 -23
data/vendor/kreuzberg/src/extractors/xml.rs +0 -7
data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
data/vendor/kreuzberg/src/lib.rs +1 -4
data/vendor/kreuzberg/src/mcp/mod.rs +1 -1
data/vendor/kreuzberg/src/mcp/server.rs +3 -5
data/vendor/kreuzberg/src/ocr/processor.rs +2 -18
data/vendor/kreuzberg/src/pdf/error.rs +1 -1
data/vendor/kreuzberg/src/pdf/table.rs +44 -17
data/vendor/kreuzberg/src/pdf/text.rs +3 -0
data/vendor/kreuzberg/src/plugins/extractor.rs +5 -8
data/vendor/kreuzberg/src/plugins/ocr.rs +11 -2
data/vendor/kreuzberg/src/plugins/processor.rs +1 -2
data/vendor/kreuzberg/src/plugins/registry.rs +0 -13
data/vendor/kreuzberg/src/plugins/validator.rs +8 -9
data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
data/vendor/kreuzberg/src/types.rs +12 -42
data/vendor/kreuzberg/tests/batch_orchestration.rs +5 -19
data/vendor/kreuzberg/tests/batch_processing.rs +3 -15
data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -17
data/vendor/kreuzberg/tests/config_features.rs +0 -18
data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -15
data/vendor/kreuzberg/tests/core_integration.rs +7 -24
data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
data/vendor/kreuzberg/tests/pipeline_integration.rs +1 -0
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -0
data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -1
data/vendor/kreuzberg/tests/security_validation.rs +1 -12
metadata +25 -90
data/.rubocop.yml +0 -538
data/ext/kreuzberg_rb/native/Cargo.lock +0 -6535
data/lib/kreuzberg/error_context.rb +0 -32
data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
data/vendor/kreuzberg/src/extraction/markdown.rs +0 -213
data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -287
data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -469
data/vendor/kreuzberg/src/extractors/docbook.rs +0 -502
data/vendor/kreuzberg/src/extractors/epub.rs +0 -707
data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -491
data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
data/vendor/kreuzberg/src/extractors/jats.rs +0 -1051
data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -367
data/vendor/kreuzberg/src/extractors/latex.rs +0 -652
data/vendor/kreuzberg/src/extractors/markdown.rs +0 -700
data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
data/vendor/kreuzberg/src/extractors/opml.rs +0 -634
data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -528
data/vendor/kreuzberg/src/extractors/rst.rs +0 -576
data/vendor/kreuzberg/src/extractors/rtf.rs +0 -810
data/vendor/kreuzberg/src/extractors/security.rs +0 -484
data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
data/vendor/kreuzberg/src/extractors/typst.rs +0 -650
data/vendor/kreuzberg/src/panic_context.rs +0 -154
data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -498
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -695
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -692
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -776
data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1259
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -647
data/vendor/rb-sys/.cargo-ok +0 -1
data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
data/vendor/rb-sys/Cargo.lock +0 -393
data/vendor/rb-sys/Cargo.toml +0 -70
data/vendor/rb-sys/Cargo.toml.orig +0 -57
data/vendor/rb-sys/LICENSE-APACHE +0 -190
data/vendor/rb-sys/LICENSE-MIT +0 -21
data/vendor/rb-sys/bin/release.sh +0 -21
data/vendor/rb-sys/build/features.rs +0 -108
data/vendor/rb-sys/build/main.rs +0 -246
data/vendor/rb-sys/build/stable_api_config.rs +0 -153
data/vendor/rb-sys/build/version.rs +0 -48
data/vendor/rb-sys/readme.md +0 -36
data/vendor/rb-sys/src/bindings.rs +0 -21
data/vendor/rb-sys/src/hidden.rs +0 -11
data/vendor/rb-sys/src/lib.rs +0 -34
data/vendor/rb-sys/src/macros.rs +0 -371
data/vendor/rb-sys/src/memory.rs +0 -53
data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
data/vendor/rb-sys/src/special_consts.rs +0 -31
data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
data/vendor/rb-sys/src/stable_api.rs +0 -261
data/vendor/rb-sys/src/symbol.rs +0 -31
data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
data/vendor/rb-sys/src/utils.rs +0 -89
data/vendor/rb-sys/src/value_type.rs +0 -7

data/vendor/kreuzberg/src/pdf/table.rs CHANGED Viewed

@@ -4,7 +4,7 @@
 //! allowing us to reuse the existing table reconstruction logic.
 use super::error::{PdfError, Result};
-use html_to_markdown_rs::hocr::HocrWord;
+use crate::ocr::table::HocrWord;
 use pdfium_render::prelude::*;
 /// Spacing threshold for word boundary detection (in PDF units).
@@ -35,24 +35,25 @@ const MIN_WORD_LENGTH: usize = 1;
 /// use kreuzberg::pdf::table::extract_words_from_page;
 /// use pdfium_render::prelude::*;
 ///
-/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
 /// let pdfium = Pdfium::default();
 /// let document = pdfium.load_pdf_from_file("example.pdf", None)?;
 /// let page = document.pages().get(0)?;
 /// let words = extract_words_from_page(&page, 90.0)?;
-/// # Ok(())
-/// # }
 /// ```
 pub fn extract_words_from_page(page: &PdfPage, min_confidence: f64) -> Result<Vec<HocrWord>> {
+    // Get page dimensions for coordinate system
     let page_width = page.width().value as i32;
     let page_height = page.height().value as i32;
+    // Get all text from page
     let page_text = page
         .text()
         .map_err(|e| PdfError::TextExtractionFailed(format!("Failed to get page text: {}", e)))?;
+    // Extract character-level information
     let chars = page_text.chars();
+    // Group characters into words based on spacing
     let words = group_chars_into_words(chars, page_width, page_height, min_confidence)?;
     Ok(words)
@@ -90,22 +91,26 @@ fn group_chars_into_words(
     let mut current_word_chars: Vec<CharInfo> = Vec::new();
     for pdf_char in chars.iter() {
+        // Get character bounds (use loose_bounds for table detection)
         let bounds = pdf_char
             .loose_bounds()
             .map_err(|e| PdfError::TextExtractionFailed(format!("Failed to get char bounds: {}", e)))?;
+        // Get unicode character (skip if invalid)
         let Some(ch) = pdf_char.unicode_char() else {
             continue;
         };
+        // Extract character information
         let char_info = CharInfo {
             text: ch,
             x: bounds.left().value,
-            y: bounds.bottom().value,
+            y: bounds.bottom().value, // PDF coordinates: bottom-left origin
             width: bounds.width().value,
             height: bounds.height().value,
         };
+        // Skip whitespace characters (they're used for word boundaries)
         if char_info.text.is_whitespace() {
             if !current_word_chars.is_empty() {
                 if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
@@ -116,6 +121,7 @@ fn group_chars_into_words(
             continue;
         }
+        // Check if this character should start a new word
         if should_start_new_word(&current_word_chars, &char_info) && !current_word_chars.is_empty() {
             if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
                 words.push(word);
@@ -126,10 +132,11 @@ fn group_chars_into_words(
         current_word_chars.push(char_info);
     }
-    if !current_word_chars.is_empty()
-        && let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence)
-    {
-        words.push(word);
+    // Finalize last word
+    if !current_word_chars.is_empty() {
+        if let Some(word) = finalize_word(&current_word_chars, page_height, min_confidence) {
+            words.push(word);
+        }
     }
     Ok(words)
@@ -146,11 +153,13 @@ fn should_start_new_word(current_word_chars: &[CharInfo], new_char: &CharInfo) -
     let last_char = &current_word_chars[current_word_chars.len() - 1];
+    // Check vertical distance (different lines)
     let vertical_distance = (new_char.y - last_char.y).abs();
     if vertical_distance > last_char.height * 0.5 {
         return true;
     }
+    // Check horizontal distance (word spacing)
     let horizontal_gap = new_char.x - (last_char.x + last_char.width);
     horizontal_gap > WORD_SPACING_THRESHOLD
 }
@@ -164,12 +173,14 @@ fn finalize_word(chars: &[CharInfo], page_height: i32, min_confidence: f64) -> O
         return None;
     }
+    // Build word text
     let text: String = chars.iter().map(|c| c.text).collect();
     if text.len() < MIN_WORD_LENGTH {
         return None;
     }
+    // Calculate bounding box (encompassing all characters)
     let left = chars
         .iter()
         .map(|c| c.x)
@@ -194,10 +205,14 @@ fn finalize_word(chars: &[CharInfo], page_height: i32, min_confidence: f64) -> O
     let width = (right - left).round() as i32;
     let height = (top - bottom).round() as i32;
+    // Convert PDF coordinates (bottom-left origin) to image coordinates (top-left origin)
+    // HocrWord expects top-left origin like images/OCR output
     let top_in_image_coords = (page_height as f32 - top).round() as i32;
+    // PDF text has high confidence (no OCR uncertainty)
     let confidence = 95.0;
+    // Apply confidence threshold
     if confidence < min_confidence {
         return None;
     }
@@ -255,18 +270,20 @@ mod tests {
             height: 12.0,
         }];
+        // Close character - same word
         let close_char = CharInfo {
             text: 'B',
-            x: 111.0,
+            x: 111.0, // 1 unit gap
             y: 50.0,
             width: 10.0,
             height: 12.0,
         };
         assert!(!should_start_new_word(&chars, &close_char));
+        // Far character - new word
         let far_char = CharInfo {
             text: 'C',
-            x: 120.0,
+            x: 120.0, // 10 unit gap (> WORD_SPACING_THRESHOLD)
             y: 50.0,
             width: 10.0,
             height: 12.0,
@@ -284,10 +301,11 @@ mod tests {
             height: 12.0,
         }];
+        // Character on different line
         let new_line_char = CharInfo {
             text: 'B',
             x: 100.0,
-            y: 70.0,
+            y: 70.0, // Different y
             width: 10.0,
             height: 12.0,
         };
@@ -318,7 +336,7 @@ mod tests {
         assert_eq!(word.text, "Hi");
         assert_eq!(word.left, 100);
-        assert_eq!(word.width, 18);
+        assert_eq!(word.width, 18); // 110 + 8 - 100
         assert_eq!(word.height, 12);
         assert_eq!(word.confidence, 95.0);
     }
@@ -340,19 +358,22 @@ mod tests {
             height: 12.0,
         }];
+        // Low threshold - should pass
         let word = finalize_word(&chars, 800, 90.0);
         assert!(word.is_some());
+        // High threshold - should fail
         let word = finalize_word(&chars, 800, 96.0);
         assert!(word.is_none());
     }
     #[test]
     fn test_coordinate_conversion() {
+        // Test PDF coordinate (bottom-left origin) to image coordinate (top-left origin)
         let chars = vec![CharInfo {
             text: 'A',
             x: 100.0,
-            y: 700.0,
+            y: 700.0, // PDF coordinates: bottom-left origin
             width: 10.0,
             height: 12.0,
         }];
@@ -360,11 +381,13 @@ mod tests {
         let page_height = 800;
         let word = finalize_word(&chars, page_height, 0.0).unwrap();
+        // top_in_image_coords = page_height - (y + height) = 800 - (700 + 12) = 88
         assert_eq!(word.top, 88);
     }
     #[test]
     fn test_word_bounding_box() {
+        // Test that bounding box encompasses all characters
         let chars = vec![
             CharInfo {
                 text: 'A',
@@ -376,18 +399,22 @@ mod tests {
             CharInfo {
                 text: 'B',
                 x: 110.0,
-                y: 51.0,
+                y: 51.0, // Slightly different y
                 width: 10.0,
-                height: 13.0,
+                height: 13.0, // Slightly different height
             },
         ];
         let word = finalize_word(&chars, 800, 0.0).unwrap();
+        // Left should be minimum x
         assert_eq!(word.left, 100);
-        assert_eq!(word.width, 20);
+        // Width should span from leftmost to rightmost character
+        assert_eq!(word.width, 20); // 120 - 100
+        // Height should encompass both characters
+        // max(y+height) - min(y) = max(51+13, 50+12) - 50 = 64 - 50 = 14
         assert_eq!(word.height, 14);
     }
 }

data/vendor/kreuzberg/src/pdf/text.rs CHANGED Viewed

@@ -92,6 +92,8 @@ pub fn extract_text_from_pdf_with_passwords(pdf_bytes: &[u8], passwords: &[&str]
 pub fn extract_text_from_pdf_document(document: &PdfDocument<'_>) -> Result<String> {
     let page_count = document.pages().len() as usize;
+    // Pre-allocate capacity based on estimated page size (average 2KB per page)
+    // This reduces memory reallocations during string concatenation
     let estimated_size = page_count * 2048;
     let mut content = String::with_capacity(estimated_size);
@@ -108,6 +110,7 @@ pub fn extract_text_from_pdf_document(document: &PdfDocument<'_>) -> Result<Stri
         content.push_str(&page_text);
     }
+    // Shrink to actual size to free unused capacity
     content.shrink_to_fit();
     Ok(content)

data/vendor/kreuzberg/src/plugins/extractor.rs CHANGED Viewed

@@ -361,6 +361,8 @@ pub trait DocumentExtractor: Plugin {
     }
 }
+// Public registration APIs
 /// Register a document extractor with the global registry.
 ///
 /// The extractor will be registered for all MIME types it supports and will be
@@ -536,7 +538,6 @@ pub fn clear_extractors() -> crate::Result<()> {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use serial_test::serial;
     struct MockExtractor {
         mime_types: Vec<&'static str>,
@@ -829,8 +830,9 @@ mod tests {
         assert_eq!(result.mime_type, "application/json");
     }
+    // Tests for public registration APIs
     #[test]
-    #[serial]
     fn test_register_extractor() {
         use std::sync::Arc;
@@ -845,7 +847,6 @@ mod tests {
     }
     #[test]
-    #[serial]
     fn test_unregister_extractor() {
         use std::sync::Arc;
@@ -860,14 +861,12 @@ mod tests {
     }
     #[test]
-    #[serial]
     fn test_unregister_nonexistent_extractor() {
         let result = super::unregister_extractor("nonexistent-extractor-xyz");
         assert!(result.is_ok());
     }
     #[test]
-    #[serial]
     fn test_list_extractors() {
         use std::sync::Arc;
@@ -889,6 +888,7 @@ mod tests {
         super::register_extractor(extractor2).unwrap();
         let list = super::list_extractors().unwrap();
+        // Both extractors have the same name, so only one will be registered
         assert_eq!(list.len(), 1);
         assert!(list.contains(&"mock-extractor".to_string()));
@@ -896,7 +896,6 @@ mod tests {
     }
     #[test]
-    #[serial]
     fn test_clear_extractors() {
         use std::sync::Arc;
@@ -922,7 +921,6 @@ mod tests {
     }
     #[test]
-    #[serial]
     fn test_register_extractor_with_invalid_name() {
         use std::sync::Arc;
@@ -967,7 +965,6 @@ mod tests {
     }
     #[test]
-    #[serial]
     fn test_register_extractor_with_empty_name() {
         use std::sync::Arc;

data/vendor/kreuzberg/src/plugins/ocr.rs CHANGED Viewed

@@ -81,8 +81,7 @@ pub enum OcrBackendType {
 ///     }
 /// }
 /// ```
-#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
-#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
+#[async_trait]
 pub trait OcrBackend: Plugin {
     /// Process an image and extract text via OCR.
     ///
@@ -253,6 +252,8 @@ pub trait OcrBackend: Plugin {
     }
 }
+// Public registration APIs
 /// Register an OCR backend with the global registry.
 ///
 /// The OCR backend will be registered with its name from the `name()` method
@@ -319,6 +320,8 @@ pub fn register_ocr_backend(backend: Arc<dyn OcrBackend>) -> crate::Result<()> {
     let registry = get_ocr_backend_registry();
     // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
+    // This is a critical runtime error (similar to OOM) that should bubble up
+    // as it indicates the registry is in an inconsistent state.
     let mut registry = registry
         .write()
         .expect("OCR backend registry lock poisoned - critical runtime error");
@@ -354,6 +357,8 @@ pub fn unregister_ocr_backend(name: &str) -> crate::Result<()> {
     let registry = get_ocr_backend_registry();
     // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
+    // This is a critical runtime error (similar to OOM) that should bubble up
+    // as it indicates the registry is in an inconsistent state.
     let mut registry = registry
         .write()
         .expect("OCR backend registry lock poisoned - critical runtime error");
@@ -387,6 +392,8 @@ pub fn list_ocr_backends() -> crate::Result<Vec<String>> {
     let registry = get_ocr_backend_registry();
     // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
+    // This is a critical runtime error (similar to OOM) that should bubble up
+    // as it indicates the registry is in an inconsistent state.
     let registry = registry
         .read()
         .expect("OCR backend registry lock poisoned - critical runtime error");
@@ -418,6 +425,8 @@ pub fn clear_ocr_backends() -> crate::Result<()> {
     let registry = get_ocr_backend_registry();
     // ~keep: Lock poisoning indicates a panic in another thread holding the lock.
+    // This is a critical runtime error (similar to OOM) that should bubble up
+    // as it indicates the registry is in an inconsistent state.
     let mut registry = registry
         .write()
         .expect("OCR backend registry lock poisoned - critical runtime error");

data/vendor/kreuzberg/src/plugins/processor.rs CHANGED Viewed

@@ -105,8 +105,7 @@ pub enum ProcessingStage {
 ///     }
 /// }
 /// ```
-#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
-#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
+#[async_trait]
 pub trait PostProcessor: Plugin {
     /// Process an extraction result.
     ///

data/vendor/kreuzberg/src/plugins/registry.rs CHANGED Viewed

@@ -264,19 +264,10 @@ impl DocumentExtractorRegistry {
     /// # Returns
     ///
     /// The highest priority extractor, or an error if none found.
-    #[cfg_attr(feature = "otel", tracing::instrument(
-        skip(self),
-        fields(
-            registry.mime_type = %mime_type,
-            registry.found = tracing::field::Empty,
-        )
-    ))]
     pub fn get(&self, mime_type: &str) -> Result<Arc<dyn DocumentExtractor>> {
         if let Some(priority_map) = self.extractors.get(mime_type)
             && let Some((_priority, extractor)) = priority_map.iter().next_back()
         {
-            #[cfg(feature = "otel")]
-            tracing::Span::current().record("registry.found", true);
             return Ok(Arc::clone(extractor));
         }
@@ -302,13 +293,9 @@ impl DocumentExtractorRegistry {
         }
         if let Some((_priority, extractor)) = best_match {
-            #[cfg(feature = "otel")]
-            tracing::Span::current().record("registry.found", true);
             return Ok(extractor);
         }
-        #[cfg(feature = "otel")]
-        tracing::Span::current().record("registry.found", false);
         Err(KreuzbergError::UnsupportedFormat(mime_type.to_string()))
     }

data/vendor/kreuzberg/src/plugins/validator.rs CHANGED Viewed

@@ -68,8 +68,7 @@ use std::sync::Arc;
 ///     }
 /// }
 /// ```
-#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
-#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
+#[async_trait]
 pub trait Validator: Plugin {
     /// Validate an extraction result.
     ///
@@ -276,6 +275,8 @@ pub trait Validator: Plugin {
     }
 }
+// Public registration APIs
 /// Register a validator with the global registry.
 ///
 /// The validator will be registered with its default priority and will be called
@@ -811,8 +812,9 @@ mod tests {
         assert!(validator.validate(&result, &config).await.is_ok());
     }
+    // Tests for public registration APIs
     #[test]
-    #[serial_test::serial]
     fn test_register_validator() {
         use std::sync::Arc;
@@ -824,7 +826,6 @@ mod tests {
     }
     #[test]
-    #[serial_test::serial]
     fn test_unregister_validator() {
         use std::sync::Arc;
@@ -836,20 +837,19 @@ mod tests {
     }
     #[test]
-    #[serial_test::serial]
     fn test_unregister_nonexistent_validator() {
         let result = super::unregister_validator("nonexistent-validator-xyz");
         assert!(result.is_ok());
     }
     #[test]
-    #[serial_test::serial]
     fn test_list_validators() {
         use std::sync::Arc;
         super::clear_validators().unwrap();
         let validator1 = Arc::new(MockValidator { should_fail: false });
+        // Both validators have the same name, so only one will be registered
         let validator2 = Arc::new(MockValidator { should_fail: false });
         let list_before = super::list_validators().unwrap();
@@ -859,6 +859,7 @@ mod tests {
         super::register_validator(validator2).unwrap();
         let list = super::list_validators().unwrap();
+        // Only 1 validator registered since they have the same name
         assert_eq!(list.len(), 1);
         assert!(list.contains(&"mock-validator".to_string()));
@@ -866,7 +867,6 @@ mod tests {
     }
     #[test]
-    #[serial_test::serial]
     fn test_clear_validators() {
         use std::sync::Arc;
@@ -878,6 +878,7 @@ mod tests {
         super::register_validator(validator1).unwrap();
         super::register_validator(validator2).unwrap();
+        // Verify at least one validator is registered
         let list_before = super::list_validators().unwrap();
         assert!(!list_before.is_empty());
@@ -889,7 +890,6 @@ mod tests {
     }
     #[test]
-    #[serial_test::serial]
     fn test_register_validator_with_invalid_name() {
         use std::sync::Arc;
@@ -922,7 +922,6 @@ mod tests {
     }
     #[test]
-    #[serial_test::serial]
     fn test_register_validator_with_empty_name() {
         use std::sync::Arc;

data/vendor/kreuzberg/src/stopwords/mod.rs CHANGED Viewed

@@ -100,7 +100,7 @@ macro_rules! embed_stopwords {
                         panic!(
                             "Failed to parse embedded stopwords for language '{}': {}. \
                             This indicates corrupted or malformed JSON in the embedded stopwords data. \
-                            Please report this issue at https://github.com/kreuzberg-dev/kreuzberg/issues",
+                            Please report this issue at https://github.com/Goldziher/kreuzberg/issues",
                             $lang, e
                         );
                     }
@@ -1437,7 +1437,7 @@ mod tests {
         let duration = start.elapsed();
         assert!(
-            duration.as_millis() < 500,
+            duration.as_millis() < 100,
             "30,000 lookups took too long: {:?}",
             duration
         );

data/vendor/kreuzberg/src/types.rs CHANGED Viewed

@@ -844,6 +844,18 @@ pub struct CacheStats {
     pub newest_file_age_days: f64,
 }
+/// Pandoc extraction result.
+///
+/// Result of extracting content from a document using Pandoc,
+/// including text and any metadata Pandoc was able to extract.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PandocExtractionResult {
+    /// Extracted text content
+    pub content: String,
+    /// Metadata extracted by Pandoc (varies by format)
+    pub metadata: HashMap<String, serde_json::Value>,
+}
 /// LibreOffice conversion result.
 ///
 /// Result of converting a legacy office document (e.g., .doc, .ppt)
@@ -859,45 +871,3 @@ pub struct LibreOfficeConversionResult {
     /// Target MIME type after conversion
     pub target_mime: String,
 }
-#[cfg(test)]
-mod tests {
-    use super::*;
-    #[test]
-    fn test_metadata_serialization_with_format() {
-        let mut metadata = Metadata {
-            format: Some(FormatMetadata::Text(TextMetadata {
-                line_count: 1,
-                word_count: 2,
-                character_count: 13,
-                headers: None,
-                links: None,
-                code_blocks: None,
-            })),
-            ..Default::default()
-        };
-        metadata
-            .additional
-            .insert("quality_score".to_string(), serde_json::json!(1.0));
-        let json = serde_json::to_value(&metadata).unwrap();
-        println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
-        // Check that format_type is present
-        assert!(
-            json.get("format_type").is_some(),
-            "format_type should be present in serialized JSON"
-        );
-        assert_eq!(json.get("format_type").unwrap(), "text");
-        // Check that Text metadata fields are present
-        assert_eq!(json.get("line_count").unwrap(), 1);
-        assert_eq!(json.get("word_count").unwrap(), 2);
-        assert_eq!(json.get("character_count").unwrap(), 13);
-        // Check that additional field is merged
-        assert_eq!(json.get("quality_score").unwrap(), 1.0);
-    }
-}

data/vendor/kreuzberg/tests/batch_orchestration.rs CHANGED Viewed

@@ -19,18 +19,6 @@ use kreuzberg::core::extractor::extract_file_sync;
 mod helpers;
-fn trim_trailing_newlines(value: &str) -> &str {
-    value.trim_end_matches(['\n', '\r'])
-}
-fn assert_text_content(actual: &str, expected: &str) {
-    assert_eq!(
-        trim_trailing_newlines(actual),
-        expected,
-        "Content mismatch after trimming trailing newlines"
-    );
-}
 /// Test that batch extraction processes documents in parallel.
 ///
 /// Validates:
@@ -317,8 +305,7 @@ async fn test_batch_bytes_parallel_processing() {
     assert_eq!(results.len(), 30);
     for (i, result) in results.iter().enumerate() {
-        let expected = format!("Test content number {}", i);
-        assert_text_content(&result.content, &expected);
+        assert_eq!(result.content, format!("Test content number {}", i));
     }
     println!("Batch processed 30 byte arrays in {:?}", duration);
@@ -343,9 +330,9 @@ async fn test_batch_bytes_mixed_valid_invalid() {
     let results = results.unwrap();
     assert_eq!(results.len(), 5);
-    assert_text_content(&results[0].content, "valid content 1");
-    assert_text_content(&results[2].content, "valid content 2");
-    assert_text_content(&results[4].content, "valid content 3");
+    assert_eq!(results[0].content, "valid content 1");
+    assert_eq!(results[2].content, "valid content 2");
+    assert_eq!(results[4].content, "valid content 3");
     assert!(results[1].metadata.error.is_some());
     assert!(results[3].metadata.error.is_some());
@@ -547,8 +534,7 @@ async fn test_batch_accuracy_under_load() {
     for (i, result) in results.iter().enumerate() {
         let expected = format!("Document number {} with unique content", i);
         assert_eq!(
-            trim_trailing_newlines(&result.content),
-            expected,
+            result.content, expected,
             "Document {} content mismatch - possible cross-contamination",
             i
         );