RubyGems - kreuzberg - Versions diffs - 4.0.0.pre.rc.14 → 4.0.0.pre.rc.15 - Mend

kreuzberg 4.0.0.pre.rc.14 → 4.0.0.pre.rc.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30)

checksums.yaml +4 -4
data/Gemfile.lock +1 -2
data/ext/kreuzberg_rb/native/Cargo.lock +25 -215
data/ext/kreuzberg_rb/native/Cargo.toml +1 -2
data/ext/kreuzberg_rb/native/build.rs +38 -1
data/lib/kreuzberg/result.rb +1 -1
data/lib/kreuzberg/version.rb +1 -1
data/lib/{libpdfium.dylib → libpdfium.so} +0 -0
data/vendor/Cargo.toml +2 -2
data/vendor/kreuzberg/Cargo.toml +1 -1
data/vendor/kreuzberg/build.rs +54 -10
data/vendor/kreuzberg/src/api/mod.rs +8 -0
data/vendor/kreuzberg/src/extraction/html.rs +40 -7
data/vendor/kreuzberg/src/pdf/bundled.rs +115 -9
data/vendor/kreuzberg/tests/format_integration.rs +1 -0
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +11 -21
data/vendor/kreuzberg-ffi/Cargo.toml +0 -63
data/vendor/kreuzberg-ffi/README.md +0 -851
data/vendor/kreuzberg-ffi/build.rs +0 -176
data/vendor/kreuzberg-ffi/cbindgen.toml +0 -27
data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
data/vendor/kreuzberg-ffi/kreuzberg.h +0 -1087
data/vendor/kreuzberg-ffi/src/lib.rs +0 -3616
data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -247
data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470

data/vendor/kreuzberg/src/extraction/html.rs CHANGED

@@ -41,11 +41,10 @@ pub use html_to_markdown_rs::{
     PreprocessingPreset, WhitespaceMode,
 };
-// WASM has a much smaller stack, so we need a lower threshold
-// In practice, WASM can't spawn threads anyway, so this threshold doesn't help much
-// We set it very high to avoid the overhead of the "large stack" path which is a no-op in WASM
+// WASM has a much smaller stack and cannot spawn threads for large documents
+// Set a conservative limit to prevent stack overflow in WASM builds
 #[cfg(target_arch = "wasm32")]
-const LARGE_HTML_STACK_THRESHOLD_BYTES: usize = usize::MAX;
+const MAX_HTML_SIZE_BYTES: usize = 2 * 1024 * 1024; // 2MB limit for WASM
 #[cfg(not(target_arch = "wasm32"))]
 const LARGE_HTML_STACK_THRESHOLD_BYTES: usize = 512 * 1024;
@@ -221,22 +220,56 @@ fn convert_inline_images_with_large_stack(
 /// - `extract_metadata = true` (parse YAML frontmatter)
 /// - `hocr_spatial_tables = false` (disable hOCR table detection)
 /// - `preprocessing.enabled = false` (disable HTML preprocessing)
+///
+/// # WASM Limitations
+///
+/// In WASM builds, HTML files larger than 2MB will be rejected with an error
+/// to prevent stack overflow. For larger files, use the native library.
 pub fn convert_html_to_markdown(html: &str, options: Option<ConversionOptions>) -> Result<String> {
+    // WASM builds have strict size limits due to limited stack space
+    #[cfg(target_arch = "wasm32")]
+    if html.len() > MAX_HTML_SIZE_BYTES {
+        return Err(KreuzbergError::validation(format!(
+            "HTML file size ({} bytes) exceeds WASM limit of {} bytes (2MB). \
+             Large HTML files cannot be processed in WASM due to stack constraints. \
+             Consider using the native library for files of this size.",
+            html.len(),
+            MAX_HTML_SIZE_BYTES
+        )));
+    }
     let options = resolve_conversion_options(options);
+    #[cfg(not(target_arch = "wasm32"))]
     if html_requires_large_stack(html.len()) {
-        convert_html_with_options_large_stack(html.to_string(), options)
-    } else {
-        convert_html_with_options(html, options)
+        return convert_html_with_options_large_stack(html.to_string(), options);
     }
+    convert_html_with_options(html, options)
 }
 /// Process HTML with optional image extraction.
+///
+/// # WASM Limitations
+///
+/// In WASM builds, HTML files larger than 2MB will be rejected to prevent stack overflow.
 pub fn process_html(
     html: &str,
     options: Option<ConversionOptions>,
     extract_images: bool,
     max_image_size: u64,
 ) -> Result<HtmlExtractionResult> {
+    // WASM builds have strict size limits due to limited stack space
+    #[cfg(target_arch = "wasm32")]
+    if html.len() > MAX_HTML_SIZE_BYTES {
+        return Err(KreuzbergError::validation(format!(
+            "HTML file size ({} bytes) exceeds WASM limit of {} bytes (2MB). \
+             Large HTML files cannot be processed in WASM due to stack constraints.",
+            html.len(),
+            MAX_HTML_SIZE_BYTES
+        )));
+    }
     if extract_images {
         let options = resolve_conversion_options(options.clone());
         let mut img_config = LibInlineImageConfig::new(max_image_size);

data/vendor/kreuzberg/src/pdf/bundled.rs CHANGED

@@ -4,13 +4,25 @@
 //! using `include_bytes!` during compilation. This module handles runtime extraction to a
 //! temporary directory and provides the path for dynamic loading.
 //!
+//! # Thread Safety
+//!
+//! Extraction is protected by a `Mutex` to prevent race conditions during concurrent access.
+//! The first thread to call `extract_bundled_pdfium()` will perform the extraction while
+//! others wait for completion.
+//!
+//! To prevent the "file too short" race condition where one thread loads a partially-written
+//! file, we use atomic file operations: write to a temporary file, then atomically rename to
+//! the final location. This ensures other threads never observe a partial file.
+//!
 //! # How It Works
 //!
 //! 1. During build (build.rs): PDFium is copied to OUT_DIR and the build script sets
 //!    `KREUZBERG_PDFIUM_BUNDLED_PATH` environment variable
 //! 2. At compile time: `include_bytes!` embeds the library binary in the executable
 //! 3. At runtime: `extract_bundled_pdfium()` extracts to `$TMPDIR/kreuzberg-pdfium/`
-//! 4. Library is reused if already present (based on modification time)
+//! 4. Library is reused if already present (based on file size validation)
+//! 5. Concurrent calls are serialized with a `Mutex` to prevent partial writes
+//! 6. Atomic rename (write temp file → rename) prevents "file too short" race conditions
 //!
 //! # Example
 //!
@@ -30,10 +42,17 @@
 use std::fs;
 use std::io;
 use std::path::{Path, PathBuf};
+use std::sync::Mutex;
 #[cfg(unix)]
 use std::os::unix::fs::PermissionsExt;
+// SAFETY: Global mutex protects against TOCTOU (time-of-check-time-of-use) race conditions
+// where multiple threads simultaneously check if the file exists, both find it missing,
+// and try to write concurrently. This mutex ensures only one thread performs extraction
+// while others wait for completion.
+static EXTRACTION_LOCK: Mutex<()> = Mutex::new(());
 /// Runtime library name and extraction directory for the bundled PDFium library.
 ///
 /// Returns tuple of (library_name, extraction_directory)
@@ -93,6 +112,13 @@ fn is_extracted_library_valid(lib_path: &Path, embedded_size: usize) -> bool {
 /// - Reuses extracted library if size matches
 /// - Sets permissions to 0755 on Unix
 /// - Returns path to extracted library
+/// - **Thread-safe**: Synchronized with a global `Mutex` to prevent concurrent writes
+///
+/// # Concurrency
+///
+/// This function is fully thread-safe. When multiple threads call it simultaneously,
+/// only the first thread performs the actual extraction while others wait. This prevents
+/// the "file too short" error that occurs when one thread reads a partially-written file.
 ///
 /// # WASM Handling
 ///
@@ -150,34 +176,68 @@ pub fn extract_bundled_pdfium() -> io::Result<PathBuf> {
         return Ok(lib_path);
     }
-    // Write library to disk
-    fs::write(&lib_path, bundled_lib).map_err(|e| {
+    // SAFETY: EXTRACTION_LOCK is a static Mutex that protects against concurrent writes.
+    // This serializes extraction across threads, preventing the "file too short" error
+    // that occurs when one thread reads a partially-written file.
+    let _guard = EXTRACTION_LOCK
+        .lock()
+        .map_err(|e| io::Error::other(format!("Failed to acquire extraction lock: {}", e)))?;
+    // Double-check after acquiring lock: another thread may have already extracted the file
+    if is_extracted_library_valid(&lib_path, bundled_lib.len()) {
+        return Ok(lib_path);
+    }
+    // Write to a temporary file first, then atomically rename to prevent other threads
+    // from reading a partially written file. This fixes the "file too short" race condition.
+    let temp_path = lib_path.with_extension(format!("tmp.{}", std::process::id()));
+    // Write library to temporary file
+    fs::write(&temp_path, bundled_lib).map_err(|e| {
         io::Error::new(
             e.kind(),
             format!(
-                "Failed to extract bundled pdfium library to '{}': {}",
-                lib_path.display(),
+                "Failed to write bundled pdfium library to temp file '{}': {}",
+                temp_path.display(),
                 e
             ),
         )
     })?;
-    // Set executable permissions on Unix
+    // Set executable permissions on Unix (before rename)
     #[cfg(unix)]
     {
         let perms = fs::Permissions::from_mode(0o755);
-        fs::set_permissions(&lib_path, perms).map_err(|e| {
+        fs::set_permissions(&temp_path, perms).map_err(|e| {
+            // Clean up temp file on error
+            let _ = fs::remove_file(&temp_path);
             io::Error::new(
                 e.kind(),
                 format!(
-                    "Failed to set permissions on bundled pdfium library '{}': {}",
-                    lib_path.display(),
+                    "Failed to set permissions on bundled pdfium temp file '{}': {}",
+                    temp_path.display(),
                     e
                 ),
             )
         })?;
     }
+    // Atomically rename temp file to final location
+    // This ensures other threads never see a partially written file
+    fs::rename(&temp_path, &lib_path).map_err(|e| {
+        // Clean up temp file on error
+        let _ = fs::remove_file(&temp_path);
+        io::Error::new(
+            e.kind(),
+            format!(
+                "Failed to rename bundled pdfium library from '{}' to '{}': {}",
+                temp_path.display(),
+                lib_path.display(),
+                e
+            ),
+        )
+    })?;
     Ok(lib_path)
 }
@@ -324,6 +384,52 @@ mod tests {
         assert_eq!(size1, size2, "Reused library should have same file size");
     }
+    #[test]
+    #[cfg(feature = "bundled-pdfium")]
+    fn test_extract_bundled_pdfium_concurrent_access() {
+        use std::thread;
+        // Spawn multiple threads that all try to extract simultaneously
+        let handles: Vec<_> = (0..10)
+            .map(|_| {
+                thread::spawn(|| {
+                    let result = extract_bundled_pdfium();
+                    assert!(result.is_ok(), "Concurrent extraction should succeed");
+                    result.unwrap()
+                })
+            })
+            .collect();
+        // Collect all results
+        let paths: Vec<PathBuf> = handles
+            .into_iter()
+            .map(|h| h.join().expect("Thread should complete"))
+            .collect();
+        // All paths should be identical
+        let first_path = &paths[0];
+        assert!(
+            paths.iter().all(|p| p == first_path),
+            "All concurrent extractions should return the same path"
+        );
+        // Verify file exists and is valid
+        assert!(
+            first_path.exists(),
+            "Extracted library should exist at: {}",
+            first_path.display()
+        );
+        // Verify file size is not truncated/partial
+        let metadata = fs::metadata(first_path).expect("Should be able to read metadata");
+        let file_size = metadata.len();
+        assert!(
+            file_size > 1_000_000,
+            "PDFium library should be at least 1MB, got {} bytes",
+            file_size
+        );
+    }
     #[test]
     #[cfg(unix)]
     #[cfg(feature = "bundled-pdfium")]

data/vendor/kreuzberg/tests/format_integration.rs CHANGED

@@ -49,6 +49,7 @@ async fn test_pdf_password_protected_async() {
 #[cfg(feature = "office")]
 #[tokio::test]
+#[cfg_attr(target_os = "windows", ignore = "LibreOffice tests timeout on Windows CI")]
 async fn test_legacy_doc_extraction_async() {
     if !test_documents_available() {
         return;

data/vendor/kreuzberg-tesseract/Cargo.toml CHANGED

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg-tesseract"
-version = "4.0.0-rc.14"
+version = "4.0.0-rc.15"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

metadata CHANGED

@@ -1,13 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kreuzberg
 version: !ruby/object:Gem::Version
-  version: 4.0.0.pre.rc.14
+  version: 4.0.0.pre.rc.15
 platform: ruby
 authors:
 - Na'aman Hirschfeld
+autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-12-20 00:00:00.000000000 Z
+date: 2025-12-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -213,7 +214,7 @@ files:
 - lib/kreuzberg/setup_lib_path.rb
 - lib/kreuzberg/validator_protocol.rb
 - lib/kreuzberg/version.rb
-- lib/libpdfium.dylib
+- lib/libpdfium.so
 - sig/kreuzberg.rbs
 - sig/kreuzberg/internal.rbs
 - spec/binding/cache_spec.rb
@@ -232,19 +233,6 @@ files:
 - spec/smoke/package_spec.rb
 - spec/spec_helper.rb
 - vendor/Cargo.toml
-- vendor/kreuzberg-ffi/Cargo.toml
-- vendor/kreuzberg-ffi/README.md
-- vendor/kreuzberg-ffi/build.rs
-- vendor/kreuzberg-ffi/cbindgen.toml
-- vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in
-- vendor/kreuzberg-ffi/kreuzberg.h
-- vendor/kreuzberg-ffi/src/lib.rs
-- vendor/kreuzberg-ffi/src/panic_shield.rs
-- vendor/kreuzberg-ffi/tests.disabled/README.md
-- vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs
-- vendor/kreuzberg-ffi/tests.disabled/config_tests.rs
-- vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs
-- vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs
 - vendor/kreuzberg-tesseract/.commitlintrc.json
 - vendor/kreuzberg-tesseract/.crate-ignore
 - vendor/kreuzberg-tesseract/Cargo.lock
@@ -544,13 +532,14 @@ homepage: https://github.com/kreuzberg-dev/kreuzberg
 licenses:
 - MIT
 metadata:
-  bug_tracker_uri: https://github.com/kreuzberg-dev/kreuzberg/issues
+  homepage_uri: https://github.com/kreuzberg-dev/kreuzberg
+  source_code_uri: https://github.com/kreuzberg-dev/kreuzberg
   changelog_uri: https://github.com/kreuzberg-dev/kreuzberg/blob/main/CHANGELOG.md
   documentation_uri: https://docs.kreuzberg.dev
-  homepage_uri: https://github.com/kreuzberg-dev/kreuzberg
-  keywords: document-intelligence,document-extraction,ocr,rust,bindings
+  bug_tracker_uri: https://github.com/kreuzberg-dev/kreuzberg/issues
   rubygems_mfa_required: 'true'
-  source_code_uri: https://github.com/kreuzberg-dev/kreuzberg
+  keywords: document-intelligence,document-extraction,ocr,rust,bindings
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -565,7 +554,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 4.0.2
+rubygems_version: 3.5.22
+signing_key:
 specification_version: 4
 summary: High-performance document intelligence framework
 test_files: []

data/vendor/kreuzberg-ffi/Cargo.toml DELETED

@@ -1,63 +0,0 @@
-[package]
-name = "kreuzberg-ffi"
-version = "4.0.0-rc.14"
-edition = "2024"
-rust-version = "1.91"
-authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
-description = "C FFI bindings for Kreuzberg document intelligence library"
-license = "MIT"
-repository = "https://github.com/kreuzberg-dev/kreuzberg"
-homepage = "https://kreuzberg.dev"
-documentation = "https://docs.rs/kreuzberg-ffi"
-readme = "README.md"
-keywords = ["ffi", "bindings", "document", "extraction", "api"]
-categories = ["development-tools::ffi", "text-processing"]
-[lib]
-crate-type = ["cdylib", "staticlib", "rlib"]
-[features]
-# Mirror embeddings feature availability from kreuzberg dependency
-embeddings = []
-[dependencies]
-# On Windows MinGW, disable embeddings/ort since ONNX Runtime is not available
-# in MinGW-compatible form. Use all other features but exclude embeddings.
-[target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
-kreuzberg = { path = "../kreuzberg", features = [
-    "pdf",
-    "excel",
-    "office",
-    "email",
-    "html",
-    "xml",
-    "archives",
-    "ocr",
-    "language-detection",
-    "chunking",
-    "quality",
-    "keywords",
-    "api",
-    "mcp",
-    "otel",
-    "bundled-pdfium",
-] }
-serde_json = "1.0.145"
-serde = { version = "1.0.228", features = ["derive"] }
-async-trait = "0.1.89"
-tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
-html-to-markdown-rs = { version = "2.15.0", default-features = false }
-[target.'cfg(not(all(windows, target_env = "gnu")))'.dependencies]
-kreuzberg = { path = "../kreuzberg", features = ["full", "bundled-pdfium"] }
-serde_json = "1.0.145"
-serde = { version = "1.0.228", features = ["derive"] }
-async-trait = "0.1.89"
-tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
-html-to-markdown-rs = { version = "2.15.0", default-features = false }
-[build-dependencies]
-cbindgen = "0.29"
-[dev-dependencies]
-tempfile = "3.23.0"