kreuzberg 4.0.0.pre.rc.14 → 4.0.0.pre.rc.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -41,11 +41,10 @@ pub use html_to_markdown_rs::{
41
41
  PreprocessingPreset, WhitespaceMode,
42
42
  };
43
43
 
44
- // WASM has a much smaller stack, so we need a lower threshold
45
- // In practice, WASM can't spawn threads anyway, so this threshold doesn't help much
46
- // We set it very high to avoid the overhead of the "large stack" path which is a no-op in WASM
44
+ // WASM has a much smaller stack and cannot spawn threads for large documents
45
+ // Set a conservative limit to prevent stack overflow in WASM builds
47
46
  #[cfg(target_arch = "wasm32")]
48
- const LARGE_HTML_STACK_THRESHOLD_BYTES: usize = usize::MAX;
47
+ const MAX_HTML_SIZE_BYTES: usize = 2 * 1024 * 1024; // 2MB limit for WASM
49
48
 
50
49
  #[cfg(not(target_arch = "wasm32"))]
51
50
  const LARGE_HTML_STACK_THRESHOLD_BYTES: usize = 512 * 1024;
@@ -221,22 +220,56 @@ fn convert_inline_images_with_large_stack(
221
220
  /// - `extract_metadata = true` (parse YAML frontmatter)
222
221
  /// - `hocr_spatial_tables = false` (disable hOCR table detection)
223
222
  /// - `preprocessing.enabled = false` (disable HTML preprocessing)
223
+ ///
224
+ /// # WASM Limitations
225
+ ///
226
+ /// In WASM builds, HTML files larger than 2MB will be rejected with an error
227
+ /// to prevent stack overflow. For larger files, use the native library.
224
228
  pub fn convert_html_to_markdown(html: &str, options: Option<ConversionOptions>) -> Result<String> {
229
+ // WASM builds have strict size limits due to limited stack space
230
+ #[cfg(target_arch = "wasm32")]
231
+ if html.len() > MAX_HTML_SIZE_BYTES {
232
+ return Err(KreuzbergError::validation(format!(
233
+ "HTML file size ({} bytes) exceeds WASM limit of {} bytes (2MB). \
234
+ Large HTML files cannot be processed in WASM due to stack constraints. \
235
+ Consider using the native library for files of this size.",
236
+ html.len(),
237
+ MAX_HTML_SIZE_BYTES
238
+ )));
239
+ }
240
+
225
241
  let options = resolve_conversion_options(options);
242
+
243
+ #[cfg(not(target_arch = "wasm32"))]
226
244
  if html_requires_large_stack(html.len()) {
227
- convert_html_with_options_large_stack(html.to_string(), options)
228
- } else {
229
- convert_html_with_options(html, options)
245
+ return convert_html_with_options_large_stack(html.to_string(), options);
230
246
  }
247
+
248
+ convert_html_with_options(html, options)
231
249
  }
232
250
 
233
251
  /// Process HTML with optional image extraction.
252
+ ///
253
+ /// # WASM Limitations
254
+ ///
255
+ /// In WASM builds, HTML files larger than 2MB will be rejected to prevent stack overflow.
234
256
  pub fn process_html(
235
257
  html: &str,
236
258
  options: Option<ConversionOptions>,
237
259
  extract_images: bool,
238
260
  max_image_size: u64,
239
261
  ) -> Result<HtmlExtractionResult> {
262
+ // WASM builds have strict size limits due to limited stack space
263
+ #[cfg(target_arch = "wasm32")]
264
+ if html.len() > MAX_HTML_SIZE_BYTES {
265
+ return Err(KreuzbergError::validation(format!(
266
+ "HTML file size ({} bytes) exceeds WASM limit of {} bytes (2MB). \
267
+ Large HTML files cannot be processed in WASM due to stack constraints.",
268
+ html.len(),
269
+ MAX_HTML_SIZE_BYTES
270
+ )));
271
+ }
272
+
240
273
  if extract_images {
241
274
  let options = resolve_conversion_options(options.clone());
242
275
  let mut img_config = LibInlineImageConfig::new(max_image_size);
@@ -4,13 +4,25 @@
4
4
  //! using `include_bytes!` during compilation. This module handles runtime extraction to a
5
5
  //! temporary directory and provides the path for dynamic loading.
6
6
  //!
7
+ //! # Thread Safety
8
+ //!
9
+ //! Extraction is protected by a `Mutex` to prevent race conditions during concurrent access.
10
+ //! The first thread to call `extract_bundled_pdfium()` will perform the extraction while
11
+ //! others wait for completion.
12
+ //!
13
+ //! To prevent the "file too short" race condition where one thread loads a partially-written
14
+ //! file, we use atomic file operations: write to a temporary file, then atomically rename to
15
+ //! the final location. This ensures other threads never observe a partial file.
16
+ //!
7
17
  //! # How It Works
8
18
  //!
9
19
  //! 1. During build (build.rs): PDFium is copied to OUT_DIR and the build script sets
10
20
  //! `KREUZBERG_PDFIUM_BUNDLED_PATH` environment variable
11
21
  //! 2. At compile time: `include_bytes!` embeds the library binary in the executable
12
22
  //! 3. At runtime: `extract_bundled_pdfium()` extracts to `$TMPDIR/kreuzberg-pdfium/`
13
- //! 4. Library is reused if already present (based on modification time)
23
+ //! 4. Library is reused if already present (based on file size validation)
24
+ //! 5. Concurrent calls are serialized with a `Mutex` to prevent partial writes
25
+ //! 6. Atomic rename (write temp file → rename) prevents "file too short" race conditions
14
26
  //!
15
27
  //! # Example
16
28
  //!
@@ -30,10 +42,17 @@
30
42
  use std::fs;
31
43
  use std::io;
32
44
  use std::path::{Path, PathBuf};
45
+ use std::sync::Mutex;
33
46
 
34
47
  #[cfg(unix)]
35
48
  use std::os::unix::fs::PermissionsExt;
36
49
 
50
+ // SAFETY: Global mutex protects against TOCTOU (time-of-check-time-of-use) race conditions
51
+ // where multiple threads simultaneously check if the file exists, both find it missing,
52
+ // and try to write concurrently. This mutex ensures only one thread performs extraction
53
+ // while others wait for completion.
54
+ static EXTRACTION_LOCK: Mutex<()> = Mutex::new(());
55
+
37
56
  /// Runtime library name and extraction directory for the bundled PDFium library.
38
57
  ///
39
58
  /// Returns tuple of (library_name, extraction_directory)
@@ -93,6 +112,13 @@ fn is_extracted_library_valid(lib_path: &Path, embedded_size: usize) -> bool {
93
112
  /// - Reuses extracted library if size matches
94
113
  /// - Sets permissions to 0755 on Unix
95
114
  /// - Returns path to extracted library
115
+ /// - **Thread-safe**: Synchronized with a global `Mutex` to prevent concurrent writes
116
+ ///
117
+ /// # Concurrency
118
+ ///
119
+ /// This function is fully thread-safe. When multiple threads call it simultaneously,
120
+ /// only the first thread performs the actual extraction while others wait. This prevents
121
+ /// the "file too short" error that occurs when one thread reads a partially-written file.
96
122
  ///
97
123
  /// # WASM Handling
98
124
  ///
@@ -150,34 +176,68 @@ pub fn extract_bundled_pdfium() -> io::Result<PathBuf> {
150
176
  return Ok(lib_path);
151
177
  }
152
178
 
153
- // Write library to disk
154
- fs::write(&lib_path, bundled_lib).map_err(|e| {
179
+ // SAFETY: EXTRACTION_LOCK is a static Mutex that protects against concurrent writes.
180
+ // This serializes extraction across threads, preventing the "file too short" error
181
+ // that occurs when one thread reads a partially-written file.
182
+ let _guard = EXTRACTION_LOCK
183
+ .lock()
184
+ .map_err(|e| io::Error::other(format!("Failed to acquire extraction lock: {}", e)))?;
185
+
186
+ // Double-check after acquiring lock: another thread may have already extracted the file
187
+ if is_extracted_library_valid(&lib_path, bundled_lib.len()) {
188
+ return Ok(lib_path);
189
+ }
190
+
191
+ // Write to a temporary file first, then atomically rename to prevent other threads
192
+ // from reading a partially written file. This fixes the "file too short" race condition.
193
+ let temp_path = lib_path.with_extension(format!("tmp.{}", std::process::id()));
194
+
195
+ // Write library to temporary file
196
+ fs::write(&temp_path, bundled_lib).map_err(|e| {
155
197
  io::Error::new(
156
198
  e.kind(),
157
199
  format!(
158
- "Failed to extract bundled pdfium library to '{}': {}",
159
- lib_path.display(),
200
+ "Failed to write bundled pdfium library to temp file '{}': {}",
201
+ temp_path.display(),
160
202
  e
161
203
  ),
162
204
  )
163
205
  })?;
164
206
 
165
- // Set executable permissions on Unix
207
+ // Set executable permissions on Unix (before rename)
166
208
  #[cfg(unix)]
167
209
  {
168
210
  let perms = fs::Permissions::from_mode(0o755);
169
- fs::set_permissions(&lib_path, perms).map_err(|e| {
211
+ fs::set_permissions(&temp_path, perms).map_err(|e| {
212
+ // Clean up temp file on error
213
+ let _ = fs::remove_file(&temp_path);
170
214
  io::Error::new(
171
215
  e.kind(),
172
216
  format!(
173
- "Failed to set permissions on bundled pdfium library '{}': {}",
174
- lib_path.display(),
217
+ "Failed to set permissions on bundled pdfium temp file '{}': {}",
218
+ temp_path.display(),
175
219
  e
176
220
  ),
177
221
  )
178
222
  })?;
179
223
  }
180
224
 
225
+ // Atomically rename temp file to final location
226
+ // This ensures other threads never see a partially written file
227
+ fs::rename(&temp_path, &lib_path).map_err(|e| {
228
+ // Clean up temp file on error
229
+ let _ = fs::remove_file(&temp_path);
230
+ io::Error::new(
231
+ e.kind(),
232
+ format!(
233
+ "Failed to rename bundled pdfium library from '{}' to '{}': {}",
234
+ temp_path.display(),
235
+ lib_path.display(),
236
+ e
237
+ ),
238
+ )
239
+ })?;
240
+
181
241
  Ok(lib_path)
182
242
  }
183
243
 
@@ -324,6 +384,52 @@ mod tests {
324
384
  assert_eq!(size1, size2, "Reused library should have same file size");
325
385
  }
326
386
 
387
+ #[test]
388
+ #[cfg(feature = "bundled-pdfium")]
389
+ fn test_extract_bundled_pdfium_concurrent_access() {
390
+ use std::thread;
391
+
392
+ // Spawn multiple threads that all try to extract simultaneously
393
+ let handles: Vec<_> = (0..10)
394
+ .map(|_| {
395
+ thread::spawn(|| {
396
+ let result = extract_bundled_pdfium();
397
+ assert!(result.is_ok(), "Concurrent extraction should succeed");
398
+ result.unwrap()
399
+ })
400
+ })
401
+ .collect();
402
+
403
+ // Collect all results
404
+ let paths: Vec<PathBuf> = handles
405
+ .into_iter()
406
+ .map(|h| h.join().expect("Thread should complete"))
407
+ .collect();
408
+
409
+ // All paths should be identical
410
+ let first_path = &paths[0];
411
+ assert!(
412
+ paths.iter().all(|p| p == first_path),
413
+ "All concurrent extractions should return the same path"
414
+ );
415
+
416
+ // Verify file exists and is valid
417
+ assert!(
418
+ first_path.exists(),
419
+ "Extracted library should exist at: {}",
420
+ first_path.display()
421
+ );
422
+
423
+ // Verify file size is not truncated/partial
424
+ let metadata = fs::metadata(first_path).expect("Should be able to read metadata");
425
+ let file_size = metadata.len();
426
+ assert!(
427
+ file_size > 1_000_000,
428
+ "PDFium library should be at least 1MB, got {} bytes",
429
+ file_size
430
+ );
431
+ }
432
+
327
433
  #[test]
328
434
  #[cfg(unix)]
329
435
  #[cfg(feature = "bundled-pdfium")]
@@ -49,6 +49,7 @@ async fn test_pdf_password_protected_async() {
49
49
 
50
50
  #[cfg(feature = "office")]
51
51
  #[tokio::test]
52
+ #[cfg_attr(target_os = "windows", ignore = "LibreOffice tests timeout on Windows CI")]
52
53
  async fn test_legacy_doc_extraction_async() {
53
54
  if !test_documents_available() {
54
55
  return;
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.0.0-rc.14"
3
+ version = "4.0.0-rc.15"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
metadata CHANGED
@@ -1,13 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.0.0.pre.rc.14
4
+ version: 4.0.0.pre.rc.15
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
+ autorequire:
8
9
  bindir: exe
9
10
  cert_chain: []
10
- date: 2025-12-20 00:00:00.000000000 Z
11
+ date: 2025-12-21 00:00:00.000000000 Z
11
12
  dependencies:
12
13
  - !ruby/object:Gem::Dependency
13
14
  name: bundler
@@ -213,7 +214,7 @@ files:
213
214
  - lib/kreuzberg/setup_lib_path.rb
214
215
  - lib/kreuzberg/validator_protocol.rb
215
216
  - lib/kreuzberg/version.rb
216
- - lib/libpdfium.dylib
217
+ - lib/libpdfium.so
217
218
  - sig/kreuzberg.rbs
218
219
  - sig/kreuzberg/internal.rbs
219
220
  - spec/binding/cache_spec.rb
@@ -232,19 +233,6 @@ files:
232
233
  - spec/smoke/package_spec.rb
233
234
  - spec/spec_helper.rb
234
235
  - vendor/Cargo.toml
235
- - vendor/kreuzberg-ffi/Cargo.toml
236
- - vendor/kreuzberg-ffi/README.md
237
- - vendor/kreuzberg-ffi/build.rs
238
- - vendor/kreuzberg-ffi/cbindgen.toml
239
- - vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in
240
- - vendor/kreuzberg-ffi/kreuzberg.h
241
- - vendor/kreuzberg-ffi/src/lib.rs
242
- - vendor/kreuzberg-ffi/src/panic_shield.rs
243
- - vendor/kreuzberg-ffi/tests.disabled/README.md
244
- - vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs
245
- - vendor/kreuzberg-ffi/tests.disabled/config_tests.rs
246
- - vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs
247
- - vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs
248
236
  - vendor/kreuzberg-tesseract/.commitlintrc.json
249
237
  - vendor/kreuzberg-tesseract/.crate-ignore
250
238
  - vendor/kreuzberg-tesseract/Cargo.lock
@@ -544,13 +532,14 @@ homepage: https://github.com/kreuzberg-dev/kreuzberg
544
532
  licenses:
545
533
  - MIT
546
534
  metadata:
547
- bug_tracker_uri: https://github.com/kreuzberg-dev/kreuzberg/issues
535
+ homepage_uri: https://github.com/kreuzberg-dev/kreuzberg
536
+ source_code_uri: https://github.com/kreuzberg-dev/kreuzberg
548
537
  changelog_uri: https://github.com/kreuzberg-dev/kreuzberg/blob/main/CHANGELOG.md
549
538
  documentation_uri: https://docs.kreuzberg.dev
550
- homepage_uri: https://github.com/kreuzberg-dev/kreuzberg
551
- keywords: document-intelligence,document-extraction,ocr,rust,bindings
539
+ bug_tracker_uri: https://github.com/kreuzberg-dev/kreuzberg/issues
552
540
  rubygems_mfa_required: 'true'
553
- source_code_uri: https://github.com/kreuzberg-dev/kreuzberg
541
+ keywords: document-intelligence,document-extraction,ocr,rust,bindings
542
+ post_install_message:
554
543
  rdoc_options: []
555
544
  require_paths:
556
545
  - lib
@@ -565,7 +554,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
565
554
  - !ruby/object:Gem::Version
566
555
  version: '0'
567
556
  requirements: []
568
- rubygems_version: 4.0.2
557
+ rubygems_version: 3.5.22
558
+ signing_key:
569
559
  specification_version: 4
570
560
  summary: High-performance document intelligence framework
571
561
  test_files: []
@@ -1,63 +0,0 @@
1
- [package]
2
- name = "kreuzberg-ffi"
3
- version = "4.0.0-rc.14"
4
- edition = "2024"
5
- rust-version = "1.91"
6
- authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
7
- description = "C FFI bindings for Kreuzberg document intelligence library"
8
- license = "MIT"
9
- repository = "https://github.com/kreuzberg-dev/kreuzberg"
10
- homepage = "https://kreuzberg.dev"
11
- documentation = "https://docs.rs/kreuzberg-ffi"
12
- readme = "README.md"
13
- keywords = ["ffi", "bindings", "document", "extraction", "api"]
14
- categories = ["development-tools::ffi", "text-processing"]
15
-
16
- [lib]
17
- crate-type = ["cdylib", "staticlib", "rlib"]
18
-
19
- [features]
20
- # Mirror embeddings feature availability from kreuzberg dependency
21
- embeddings = []
22
-
23
- [dependencies]
24
- # On Windows MinGW, disable embeddings/ort since ONNX Runtime is not available
25
- # in MinGW-compatible form. Use all other features but exclude embeddings.
26
- [target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
27
- kreuzberg = { path = "../kreuzberg", features = [
28
- "pdf",
29
- "excel",
30
- "office",
31
- "email",
32
- "html",
33
- "xml",
34
- "archives",
35
- "ocr",
36
- "language-detection",
37
- "chunking",
38
- "quality",
39
- "keywords",
40
- "api",
41
- "mcp",
42
- "otel",
43
- "bundled-pdfium",
44
- ] }
45
- serde_json = "1.0.145"
46
- serde = { version = "1.0.228", features = ["derive"] }
47
- async-trait = "0.1.89"
48
- tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
49
- html-to-markdown-rs = { version = "2.15.0", default-features = false }
50
-
51
- [target.'cfg(not(all(windows, target_env = "gnu")))'.dependencies]
52
- kreuzberg = { path = "../kreuzberg", features = ["full", "bundled-pdfium"] }
53
- serde_json = "1.0.145"
54
- serde = { version = "1.0.228", features = ["derive"] }
55
- async-trait = "0.1.89"
56
- tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
57
- html-to-markdown-rs = { version = "2.15.0", default-features = false }
58
-
59
- [build-dependencies]
60
- cbindgen = "0.29"
61
-
62
- [dev-dependencies]
63
- tempfile = "3.23.0"