kreuzberg 4.0.0.pre.rc.8 → 4.0.0.pre.rc.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +12 -9
  3. data/README.md +22 -0
  4. data/ext/kreuzberg_rb/native/Cargo.lock +397 -177
  5. data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
  6. data/ext/kreuzberg_rb/native/src/lib.rs +36 -13
  7. data/kreuzberg.gemspec +34 -2
  8. data/lib/kreuzberg/cache_api.rb +35 -0
  9. data/lib/kreuzberg/error_context.rb +49 -1
  10. data/lib/kreuzberg/extraction_api.rb +255 -0
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/lib/kreuzberg.rb +6 -0
  13. data/lib/libpdfium.dylib +0 -0
  14. data/sig/kreuzberg.rbs +9 -0
  15. data/vendor/Cargo.toml +44 -0
  16. data/vendor/kreuzberg/Cargo.toml +61 -38
  17. data/vendor/kreuzberg/README.md +36 -27
  18. data/vendor/kreuzberg/build.rs +197 -245
  19. data/vendor/kreuzberg/src/core/pipeline.rs +13 -0
  20. data/vendor/kreuzberg/src/embeddings.rs +71 -3
  21. data/vendor/kreuzberg/src/error.rs +1 -1
  22. data/vendor/kreuzberg/src/extraction/html.rs +37 -5
  23. data/vendor/kreuzberg/src/extractors/pdf.rs +93 -44
  24. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
  25. data/vendor/kreuzberg/src/pdf/bundled.rs +19 -1
  26. data/vendor/kreuzberg/src/pdf/metadata.rs +2 -2
  27. data/vendor/kreuzberg/src/pdf/mod.rs +2 -0
  28. data/vendor/kreuzberg/src/pdf/rendering.rs +2 -2
  29. data/vendor/kreuzberg/src/pdf/table.rs +3 -0
  30. data/vendor/kreuzberg/src/pdf/text.rs +2 -2
  31. data/vendor/kreuzberg/src/text/quality_processor.rs +1 -1
  32. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -1
  33. data/vendor/kreuzberg/tests/format_integration.rs +4 -1
  34. data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
  35. data/vendor/kreuzberg-ffi/README.md +851 -0
  36. data/vendor/kreuzberg-ffi/build.rs +176 -0
  37. data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
  38. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
  39. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  40. data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
  41. data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
  42. data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
  43. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  44. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  45. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  46. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  47. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  48. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  49. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  50. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  51. data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
  52. data/vendor/kreuzberg-tesseract/LICENSE +22 -0
  53. data/vendor/kreuzberg-tesseract/README.md +399 -0
  54. data/vendor/kreuzberg-tesseract/build.rs +1354 -0
  55. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  56. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  57. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  58. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  59. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  60. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  61. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  62. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  63. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  64. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  65. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  66. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  67. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  68. metadata +39 -3
  69. data/vendor/rb-sys/bin/release.sh +0 -21
@@ -11,6 +11,30 @@
11
11
  //! - Batch processing for efficient embedding generation
12
12
  //! - Optional GPU acceleration via ONNX Runtime execution providers
13
13
  //!
14
+ //! # ONNX Runtime Requirement
15
+ //!
16
+ //! **CRITICAL**: This module requires ONNX Runtime to be installed on the system.
17
+ //! The `embeddings` feature uses dynamic loading (`ort-load-dynamic`), which detects
18
+ //! the ONNX Runtime library at runtime.
19
+ //!
20
+ //! ## Installation Instructions
21
+ //!
22
+ //! - **macOS**: `brew install onnxruntime`
23
+ //! - **Linux (Ubuntu/Debian)**: `apt install libonnxruntime libonnxruntime-dev`
24
+ //! - **Linux (Fedora)**: `dnf install onnxruntime onnxruntime-devel`
25
+ //! - **Linux (Arch)**: `pacman -S onnxruntime`
26
+ //! - **Windows (MSVC)**: Download from <https://github.com/microsoft/onnxruntime/releases> and add to PATH
27
+ //!
28
+ //! Alternatively, set the `ORT_DYLIB_PATH` environment variable to the ONNX Runtime library path.
29
+ //!
30
+ //! For Docker/containers, install via package manager in your base image.
31
+ //! Verified packages: Ubuntu 22.04+, Fedora 38+, Arch Linux.
32
+ //!
33
+ //! ## Platform Limitations
34
+ //!
35
+ //! **Windows MinGW builds are not supported**. ONNX Runtime requires the MSVC toolchain on Windows.
36
+ //! Please use Windows MSVC builds or disable the embeddings feature.
37
+ //!
14
38
  //! # Example
15
39
  //!
16
40
  //! ```rust,ignore
@@ -145,6 +169,35 @@ lazy_static! {
145
169
  static ref MODEL_CACHE: RwLock<HashMap<String, CachedEmbedding>> = RwLock::new(HashMap::new());
146
170
  }
147
171
 
172
+ /// Returns installation instructions for ONNX Runtime.
173
+ #[cfg(feature = "embeddings")]
174
+ fn onnx_runtime_install_message() -> String {
175
+ #[cfg(all(windows, target_env = "gnu"))]
176
+ {
177
+ return "ONNX Runtime embeddings are not supported on Windows MinGW builds. \
178
+ ONNX Runtime requires MSVC toolchain. \
179
+ Please use Windows MSVC builds or disable embeddings feature."
180
+ .to_string();
181
+ }
182
+
183
+ #[cfg(not(all(windows, target_env = "gnu")))]
184
+ {
185
+ "ONNX Runtime is required for embeddings functionality. \
186
+ Install: \
187
+ macOS: 'brew install onnxruntime', \
188
+ Linux (Ubuntu/Debian): 'apt install libonnxruntime libonnxruntime-dev', \
189
+ Linux (Fedora): 'dnf install onnxruntime onnxruntime-devel', \
190
+ Linux (Arch): 'pacman -S onnxruntime', \
191
+ Windows (MSVC): Download from https://github.com/microsoft/onnxruntime/releases and add to PATH. \
192
+ \
193
+ Alternatively, set ORT_DYLIB_PATH environment variable to the ONNX Runtime library path. \
194
+ \
195
+ For Docker/containers: Install via package manager in your base image. \
196
+ Verified packages: Ubuntu 22.04+, Fedora 38+, Arch Linux."
197
+ .to_string()
198
+ }
199
+ }
200
+
148
201
  /// Get or initialize a text embedding model from cache.
149
202
  ///
150
203
  /// This function ensures models are initialized only once and reused across
@@ -193,9 +246,24 @@ pub fn get_or_init_model(
193
246
  let mut init_options = InitOptions::new(model);
194
247
  init_options = init_options.with_cache_dir(cache_directory);
195
248
 
196
- let embedding_model = TextEmbedding::try_new(init_options).map_err(|e| crate::KreuzbergError::Plugin {
197
- message: format!("Failed to initialize embedding model: {}", e),
198
- plugin_name: "embeddings".to_string(),
249
+ let embedding_model = TextEmbedding::try_new(init_options).map_err(|e| {
250
+ let error_msg = e.to_string();
251
+
252
+ // Detect ONNX Runtime loading errors by checking for common patterns
253
+ if error_msg.contains("onnxruntime")
254
+ || error_msg.contains("ORT")
255
+ || error_msg.contains("libonnxruntime")
256
+ || error_msg.contains("onnxruntime.dll")
257
+ || error_msg.contains("Unable to load")
258
+ || error_msg.contains("library load failed")
259
+ {
260
+ crate::KreuzbergError::MissingDependency(format!("ONNX Runtime - {}", onnx_runtime_install_message()))
261
+ } else {
262
+ crate::KreuzbergError::Plugin {
263
+ message: format!("Failed to initialize embedding model: {}", e),
264
+ plugin_name: "embeddings".to_string(),
265
+ }
266
+ }
199
267
  })?;
200
268
 
201
269
  let leaked_model = LeakedModel::new(embedding_model);
@@ -177,7 +177,7 @@ impl From<crate::pdf::error::PdfError> for KreuzbergError {
177
177
 
178
178
  macro_rules! error_constructor {
179
179
  ($name:ident, $variant:ident) => {
180
- paste::paste! {
180
+ pastey::paste! {
181
181
  #[doc = "Create a " $variant " error"]
182
182
  pub fn $name<S: Into<String>>(message: S) -> Self {
183
183
  Self::$variant {
@@ -31,14 +31,26 @@ use html_to_markdown_rs::{
31
31
  convert as convert_html, convert_with_inline_images,
32
32
  };
33
33
  use serde::{Deserialize, Serialize};
34
- use std::{any::Any, collections::HashMap, thread};
34
+ use std::collections::HashMap;
35
+
36
+ #[cfg(not(target_arch = "wasm32"))]
37
+ use std::{any::Any, thread};
35
38
 
36
39
  pub use html_to_markdown_rs::{
37
40
  CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, PreprocessingOptions,
38
41
  PreprocessingPreset, WhitespaceMode,
39
42
  };
40
43
 
44
+ // WASM has a much smaller stack, which would normally call for a lower threshold
45
+ // In practice, WASM can't spawn threads anyway, so this threshold doesn't help much
46
+ // We set it very high to avoid the overhead of the "large stack" path which is a no-op in WASM
47
+ #[cfg(target_arch = "wasm32")]
48
+ const LARGE_HTML_STACK_THRESHOLD_BYTES: usize = usize::MAX;
49
+
50
+ #[cfg(not(target_arch = "wasm32"))]
41
51
  const LARGE_HTML_STACK_THRESHOLD_BYTES: usize = 512 * 1024;
52
+
53
+ #[cfg(not(target_arch = "wasm32"))]
42
54
  const HTML_CONVERSION_STACK_SIZE_BYTES: usize = 16 * 1024 * 1024;
43
55
 
44
56
  /// Result of HTML extraction with optional images and warnings.
@@ -132,10 +144,6 @@ fn convert_html_with_options(html: &str, options: ConversionOptions) -> Result<S
132
144
  .map_err(|e| KreuzbergError::parsing(format!("Failed to convert HTML to Markdown: {}", e)))
133
145
  }
134
146
 
135
- fn convert_html_with_options_large_stack(html: String, options: ConversionOptions) -> Result<String> {
136
- run_on_dedicated_stack(move || convert_html_with_options(&html, options))
137
- }
138
-
139
147
  fn convert_inline_images_with_options(
140
148
  html: &str,
141
149
  options: ConversionOptions,
@@ -145,6 +153,13 @@ fn convert_inline_images_with_options(
145
153
  .map_err(|e| KreuzbergError::parsing(format!("Failed to convert HTML to Markdown with images: {}", e)))
146
154
  }
147
155
 
156
+ // Native (non-WASM) implementations use a dedicated thread stack for large HTML documents
157
+ #[cfg(not(target_arch = "wasm32"))]
158
+ fn convert_html_with_options_large_stack(html: String, options: ConversionOptions) -> Result<String> {
159
+ run_on_dedicated_stack(move || convert_html_with_options(&html, options))
160
+ }
161
+
162
+ #[cfg(not(target_arch = "wasm32"))]
148
163
  fn convert_inline_images_with_large_stack(
149
164
  html: String,
150
165
  options: ConversionOptions,
@@ -153,6 +168,7 @@ fn convert_inline_images_with_large_stack(
153
168
  run_on_dedicated_stack(move || convert_inline_images_with_options(&html, options, image_config))
154
169
  }
155
170
 
171
+ #[cfg(not(target_arch = "wasm32"))]
156
172
  fn run_on_dedicated_stack<T, F>(job: F) -> Result<T>
157
173
  where
158
174
  T: Send + 'static,
@@ -173,6 +189,7 @@ where
173
189
  }
174
190
  }
175
191
 
192
+ #[cfg(not(target_arch = "wasm32"))]
176
193
  fn extract_panic_reason(panic: &Box<dyn Any + Send + 'static>) -> String {
177
194
  if let Some(msg) = panic.downcast_ref::<&str>() {
178
195
  (*msg).to_string()
@@ -183,6 +200,21 @@ fn extract_panic_reason(panic: &Box<dyn Any + Send + 'static>) -> String {
183
200
  }
184
201
  }
185
202
 
203
+ // WASM implementations skip the dedicated stack (not supported) and process inline
204
+ #[cfg(target_arch = "wasm32")]
205
+ fn convert_html_with_options_large_stack(html: String, options: ConversionOptions) -> Result<String> {
206
+ convert_html_with_options(&html, options)
207
+ }
208
+
209
+ #[cfg(target_arch = "wasm32")]
210
+ fn convert_inline_images_with_large_stack(
211
+ html: String,
212
+ options: ConversionOptions,
213
+ image_config: LibInlineImageConfig,
214
+ ) -> Result<HtmlExtraction> {
215
+ convert_inline_images_with_options(&html, options, image_config)
216
+ }
217
+
186
218
  /// Convert HTML to markdown with optional configuration.
187
219
  ///
188
220
  /// Uses sensible defaults if no configuration is provided:
@@ -325,19 +325,33 @@ impl DocumentExtractor for PdfExtractor {
325
325
  config: &ExtractionConfig,
326
326
  ) -> Result<ExtractionResult> {
327
327
  #[cfg(feature = "pdf")]
328
- let (pdf_metadata, native_text, tables, page_contents) = if crate::core::batch_mode::is_batch_mode() {
329
- let content_owned = content.to_vec();
330
- let span = tracing::Span::current();
331
- let pages_config = config.pages.clone();
332
- tokio::task::spawn_blocking(move || {
333
- let _guard = span.entered();
334
- let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
335
- .or_else(|_| Pdfium::bind_to_system_library())
336
- .map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
337
-
328
+ let (pdf_metadata, native_text, tables, page_contents) = {
329
+ // WASM target: always synchronous (no tokio::task::spawn_blocking)
330
+ // Other targets: use spawn_blocking in batch mode for better parallelism
331
+ #[cfg(target_arch = "wasm32")]
332
+ {
333
+ // NOTE: For WASM targets, this code path should only be reached if the
334
+ // WASM environment has properly initialized PDFium. The error message
335
+ // will direct users to the documentation for setup requirements.
336
+ let bindings =
337
+ crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")
338
+ .map_err(|pdf_err| {
339
+ // Provide context-specific error for WASM PDF failures
340
+ if pdf_err.to_string().contains("WASM") || pdf_err.to_string().contains("Module") {
341
+ crate::error::KreuzbergError::Parsing {
342
+ message: "PDF extraction requires proper WASM module initialization. \
343
+ Ensure your WASM environment is set up with PDFium support. \
344
+ See: https://docs.kreuzberg.dev/wasm/pdf"
345
+ .to_string(),
346
+ source: None,
347
+ }
348
+ } else {
349
+ pdf_err.into()
350
+ }
351
+ })?;
338
352
  let pdfium = Pdfium::new(bindings);
339
353
 
340
- let document = pdfium.load_pdf_from_byte_slice(&content_owned, None).map_err(|e| {
354
+ let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
341
355
  let err_msg = e.to_string();
342
356
  if err_msg.contains("password") || err_msg.contains("Password") {
343
357
  PdfError::PasswordRequired
@@ -347,51 +361,86 @@ impl DocumentExtractor for PdfExtractor {
347
361
  })?;
348
362
 
349
363
  let (native_text, boundaries, page_contents) =
350
- crate::pdf::text::extract_text_from_pdf_document(&document, pages_config.as_ref())?;
364
+ crate::pdf::text::extract_text_from_pdf_document(&document, config.pages.as_ref())?;
351
365
 
352
366
  let pdf_metadata =
353
367
  crate::pdf::metadata::extract_metadata_from_document(&document, boundaries.as_deref())?;
354
368
 
355
369
  let tables = extract_tables_from_document(&document, &pdf_metadata)?;
356
370
 
357
- if let Some(ref page_cfg) = pages_config
358
- && page_cfg.extract_pages
359
- && page_contents.is_none()
360
- {
361
- return Err(PdfError::ExtractionFailed(
362
- "Page extraction was configured but no page data was extracted in batch mode".to_string(),
363
- )
364
- .into());
365
- }
366
-
367
- Ok::<_, crate::error::KreuzbergError>((pdf_metadata, native_text, tables, page_contents))
368
- })
369
- .await
370
- .map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
371
- } else {
372
- let bindings = Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path("./"))
373
- .or_else(|_| Pdfium::bind_to_system_library())
374
- .map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
371
+ (pdf_metadata, native_text, tables, page_contents)
372
+ }
373
+ #[cfg(not(target_arch = "wasm32"))]
374
+ {
375
+ if crate::core::batch_mode::is_batch_mode() {
376
+ let content_owned = content.to_vec();
377
+ let span = tracing::Span::current();
378
+ let pages_config = config.pages.clone();
379
+ tokio::task::spawn_blocking(move || {
380
+ let _guard = span.entered();
381
+ let bindings =
382
+ crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
383
+
384
+ let pdfium = Pdfium::new(bindings);
385
+
386
+ let document = pdfium.load_pdf_from_byte_slice(&content_owned, None).map_err(|e| {
387
+ let err_msg = e.to_string();
388
+ if err_msg.contains("password") || err_msg.contains("Password") {
389
+ PdfError::PasswordRequired
390
+ } else {
391
+ PdfError::InvalidPdf(err_msg)
392
+ }
393
+ })?;
394
+
395
+ let (native_text, boundaries, page_contents) =
396
+ crate::pdf::text::extract_text_from_pdf_document(&document, pages_config.as_ref())?;
397
+
398
+ let pdf_metadata =
399
+ crate::pdf::metadata::extract_metadata_from_document(&document, boundaries.as_deref())?;
400
+
401
+ let tables = extract_tables_from_document(&document, &pdf_metadata)?;
402
+
403
+ if let Some(ref page_cfg) = pages_config
404
+ && page_cfg.extract_pages
405
+ && page_contents.is_none()
406
+ {
407
+ return Err(PdfError::ExtractionFailed(
408
+ "Page extraction was configured but no page data was extracted in batch mode"
409
+ .to_string(),
410
+ )
411
+ .into());
412
+ }
413
+
414
+ Ok::<_, crate::error::KreuzbergError>((pdf_metadata, native_text, tables, page_contents))
415
+ })
416
+ .await
417
+ .map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
418
+ } else {
419
+ let bindings =
420
+ crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
375
421
 
376
- let pdfium = Pdfium::new(bindings);
422
+ let pdfium = Pdfium::new(bindings);
377
423
 
378
- let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
379
- let err_msg = e.to_string();
380
- if err_msg.contains("password") || err_msg.contains("Password") {
381
- PdfError::PasswordRequired
382
- } else {
383
- PdfError::InvalidPdf(err_msg)
384
- }
385
- })?;
424
+ let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
425
+ let err_msg = e.to_string();
426
+ if err_msg.contains("password") || err_msg.contains("Password") {
427
+ PdfError::PasswordRequired
428
+ } else {
429
+ PdfError::InvalidPdf(err_msg)
430
+ }
431
+ })?;
386
432
 
387
- let (native_text, boundaries, page_contents) =
388
- crate::pdf::text::extract_text_from_pdf_document(&document, config.pages.as_ref())?;
433
+ let (native_text, boundaries, page_contents) =
434
+ crate::pdf::text::extract_text_from_pdf_document(&document, config.pages.as_ref())?;
389
435
 
390
- let pdf_metadata = crate::pdf::metadata::extract_metadata_from_document(&document, boundaries.as_deref())?;
436
+ let pdf_metadata =
437
+ crate::pdf::metadata::extract_metadata_from_document(&document, boundaries.as_deref())?;
391
438
 
392
- let tables = extract_tables_from_document(&document, &pdf_metadata)?;
439
+ let tables = extract_tables_from_document(&document, &pdf_metadata)?;
393
440
 
394
- (pdf_metadata, native_text, tables, page_contents)
441
+ (pdf_metadata, native_text, tables, page_contents)
442
+ }
443
+ }
395
444
  };
396
445
 
397
446
  #[cfg(feature = "ocr")]
@@ -0,0 +1,44 @@
1
+ use super::error::PdfError;
2
+ use pdfium_render::prelude::*;
3
+
4
+ pub(crate) fn bind_pdfium(
5
+ map_err: fn(String) -> PdfError,
6
+ context: &'static str,
7
+ ) -> Result<Box<dyn PdfiumLibraryBindings>, PdfError> {
8
+ #[cfg(all(feature = "pdf", feature = "pdf-bundled"))]
9
+ {
10
+ // WASM target: use dynamic binding to WASM module
11
+ // NOTE: pdfium-render handles WASM module lifecycle internally.
12
+ // For WASM builds, the PDFium library is linked at compile time
13
+ // and the WASM runtime manages initialization.
14
+ #[cfg(target_arch = "wasm32")]
15
+ {
16
+ Pdfium::bind_to_system_library()
17
+ .map_err(|e| map_err(format!("Failed to initialize Pdfium for WASM ({}): {}", context, e)))
18
+ }
19
+
20
+ // Non-WASM targets: extract and link dynamically
21
+ #[cfg(not(target_arch = "wasm32"))]
22
+ {
23
+ let lib_path = crate::pdf::extract_bundled_pdfium()
24
+ .map_err(|e| map_err(format!("Failed to extract bundled Pdfium ({}): {}", context, e)))?;
25
+
26
+ let lib_dir = lib_path.parent().ok_or_else(|| {
27
+ map_err(format!(
28
+ "Failed to determine Pdfium extraction directory for '{}' ({})",
29
+ lib_path.display(),
30
+ context
31
+ ))
32
+ })?;
33
+
34
+ Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path(lib_dir))
35
+ .map_err(|e| map_err(format!("Failed to initialize Pdfium ({}): {}", context, e)))
36
+ }
37
+ }
38
+
39
+ #[cfg(all(feature = "pdf", not(feature = "pdf-bundled")))]
40
+ {
41
+ Pdfium::bind_to_system_library()
42
+ .map_err(|e| map_err(format!("Failed to initialize Pdfium ({}): {}", context, e)))
43
+ }
44
+ }
@@ -89,17 +89,24 @@ fn is_extracted_library_valid(lib_path: &Path, embedded_size: usize) -> bool {
89
89
  /// # Behavior
90
90
  ///
91
91
  /// - Embeds PDFium library using `include_bytes!`
92
- /// - Extracts to `$TMPDIR/kreuzberg-pdfium/`
92
+ /// - Extracts to `$TMPDIR/kreuzberg-pdfium/` (non-WASM only)
93
93
  /// - Reuses extracted library if size matches
94
94
  /// - Sets permissions to 0755 on Unix
95
95
  /// - Returns path to extracted library
96
96
  ///
97
+ /// # WASM Handling
98
+ ///
99
+ /// On WASM targets (wasm32-*), this function returns an error with a helpful
100
+ /// message directing users to use WASM-specific initialization. WASM PDFium
101
+ /// is initialized through the runtime, not via file extraction.
102
+ ///
97
103
  /// # Errors
98
104
  ///
99
105
  /// Returns `std::io::Error` if:
100
106
  /// - Cannot create extraction directory
101
107
  /// - Cannot write library file
102
108
  /// - Cannot set file permissions (Unix only)
109
+ /// - Target is WASM (filesystem access not available)
103
110
  ///
104
111
  /// # Platform-Specific Library Names
105
112
  ///
@@ -107,6 +114,17 @@ fn is_extracted_library_valid(lib_path: &Path, embedded_size: usize) -> bool {
107
114
  /// - macOS: `libpdfium.dylib`
108
115
  /// - Windows: `pdfium.dll`
109
116
  pub fn extract_bundled_pdfium() -> io::Result<PathBuf> {
117
+ // WASM targets cannot use file extraction
118
+ #[cfg(target_arch = "wasm32")]
119
+ {
120
+ return Err(io::Error::new(
121
+ io::ErrorKind::Unsupported,
122
+ "File extraction is not available in WASM. \
123
+ PDFium for WASM must be initialized via the WebAssembly runtime. \
124
+ Use a WASM-compatible environment with proper module initialization.",
125
+ ));
126
+ }
127
+
110
128
  let (lib_name, _) = bundled_library_info();
111
129
  let extract_dir = get_extraction_dir()?;
112
130
 
@@ -1,3 +1,4 @@
1
+ use super::bindings::bind_pdfium;
1
2
  use super::error::{PdfError, Result};
2
3
  use crate::types::{PageBoundary, PageInfo, PageStructure, PageUnitType};
3
4
  use pdfium_render::prelude::*;
@@ -85,8 +86,7 @@ pub fn extract_metadata(pdf_bytes: &[u8]) -> Result<PdfMetadata> {
85
86
  ///
86
87
  /// Returns only PDF-specific metadata (version, producer, encryption status, dimensions).
87
88
  pub fn extract_metadata_with_password(pdf_bytes: &[u8], password: Option<&str>) -> Result<PdfMetadata> {
88
- let bindings = Pdfium::bind_to_system_library()
89
- .map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
89
+ let bindings = bind_pdfium(PdfError::MetadataExtractionFailed, "metadata extraction")?;
90
90
 
91
91
  let pdfium = Pdfium::new(bindings);
92
92
 
@@ -35,6 +35,8 @@
35
35
  //!
36
36
  //! This module requires the `pdf` feature. The `ocr` feature enables additional
37
37
  //! functionality in the PDF extractor for rendering pages to images.
38
+ #[cfg(feature = "pdf")]
39
+ pub(crate) mod bindings;
38
40
  #[cfg(all(feature = "pdf", feature = "pdf-bundled"))]
39
41
  pub mod bundled;
40
42
  #[cfg(feature = "pdf")]
@@ -1,3 +1,4 @@
1
+ use super::bindings::bind_pdfium;
1
2
  use super::error::{PdfError, Result};
2
3
  use image::DynamicImage;
3
4
  use pdfium_render::prelude::*;
@@ -32,8 +33,7 @@ pub struct PdfRenderer {
32
33
 
33
34
  impl PdfRenderer {
34
35
  pub fn new() -> Result<Self> {
35
- let binding = Pdfium::bind_to_system_library()
36
- .map_err(|e| PdfError::RenderingFailed(format!("Failed to initialize Pdfium: {}", e)))?;
36
+ let binding = bind_pdfium(PdfError::RenderingFailed, "page rendering")?;
37
37
 
38
38
  let pdfium = Pdfium::new(binding);
39
39
  Ok(Self { pdfium })
@@ -13,9 +13,11 @@ use pdfium_render::prelude::*;
13
13
  /// Spacing threshold for word boundary detection (in PDF units).
14
14
  ///
15
15
  /// Characters separated by more than this distance are considered separate words.
16
+ #[cfg(feature = "ocr")]
16
17
  const WORD_SPACING_THRESHOLD: f32 = 3.0;
17
18
 
18
19
  /// Minimum word length for table detection (filter out noise).
20
+ #[cfg(feature = "ocr")]
19
21
  const MIN_WORD_LENGTH: usize = 1;
20
22
 
21
23
  /// Extract words with positions from PDF page for table detection.
@@ -80,6 +82,7 @@ pub fn extract_words_from_page(_page: &PdfPage, _min_confidence: f64) -> Result<
80
82
  }
81
83
 
82
84
  /// Character with position information extracted from PDF.
85
+ #[cfg(feature = "ocr")]
83
86
  #[derive(Debug, Clone)]
84
87
  struct CharInfo {
85
88
  text: char,
@@ -1,3 +1,4 @@
1
+ use super::bindings::bind_pdfium;
1
2
  use super::error::{PdfError, Result};
2
3
  use crate::core::config::PageConfig;
3
4
  use crate::types::{PageBoundary, PageContent};
@@ -13,8 +14,7 @@ pub struct PdfTextExtractor {
13
14
 
14
15
  impl PdfTextExtractor {
15
16
  pub fn new() -> Result<Self> {
16
- let binding = Pdfium::bind_to_system_library()
17
- .map_err(|e| PdfError::TextExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
17
+ let binding = bind_pdfium(PdfError::TextExtractionFailed, "text extraction")?;
18
18
 
19
19
  let pdfium = Pdfium::new(binding);
20
20
  Ok(Self { pdfium })
@@ -19,7 +19,7 @@ use async_trait::async_trait;
19
19
  ///
20
20
  /// ```rust,no_run
21
21
  /// use kreuzberg::plugins::{Plugin, PostProcessor};
22
- /// use kreuzberg::text::quality::processor::QualityProcessor;
22
+ /// use kreuzberg::text::QualityProcessor;
23
23
  ///
24
24
  /// let processor = QualityProcessor;
25
25
  /// assert_eq!(processor.name(), "quality-processing");
@@ -244,7 +244,7 @@ async fn test_concurrent_ocr_processing() {
244
244
  #[test]
245
245
  fn test_concurrent_ocr_cache_stress() {
246
246
  use helpers::{get_test_file_path, skip_if_missing};
247
- use std::sync::atomic::Ordering;
247
+ use std::sync::atomic::{AtomicUsize, Ordering};
248
248
 
249
249
  if skip_if_missing("images/ocr_image.jpg") {
250
250
  tracing::debug!("Skipping OCR cache stress test: test file not available");
@@ -9,7 +9,10 @@
9
9
 
10
10
  mod helpers;
11
11
 
12
- use helpers::{assert_mime_type, assert_non_empty_content, get_test_file_path, test_documents_available};
12
+ use helpers::{assert_mime_type, get_test_file_path, test_documents_available};
13
+
14
+ #[cfg(any(feature = "office", feature = "ocr"))]
15
+ use helpers::assert_non_empty_content;
13
16
  use kreuzberg::core::config::ExtractionConfig;
14
17
  use kreuzberg::core::extractor::extract_file;
15
18
 
@@ -0,0 +1,63 @@
1
+ [package]
2
+ name = "kreuzberg-ffi"
3
+ version = "4.0.0-rc.11"
4
+ edition = "2024"
5
+ rust-version = "1.91"
6
+ authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
7
+ description = "C FFI bindings for Kreuzberg document intelligence library"
8
+ license = "MIT"
9
+ repository = "https://github.com/kreuzberg-dev/kreuzberg"
10
+ homepage = "https://kreuzberg.dev"
11
+ documentation = "https://docs.rs/kreuzberg-ffi"
12
+ readme = "README.md"
13
+ keywords = ["ffi", "bindings", "document", "extraction", "api"]
14
+ categories = ["development-tools::ffi", "text-processing"]
15
+
16
+ [lib]
17
+ crate-type = ["cdylib", "staticlib", "rlib"]
18
+
19
+ [features]
20
+ # Mirror embeddings feature availability from kreuzberg dependency
21
+ embeddings = []
22
+
23
+ [dependencies]
24
+ # On Windows MinGW, disable embeddings/ort since ONNX Runtime is not available
25
+ # in MinGW-compatible form. Use all other features but exclude embeddings.
26
+ [target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
27
+ kreuzberg = { path = "../kreuzberg", features = [
28
+ "pdf",
29
+ "excel",
30
+ "office",
31
+ "email",
32
+ "html",
33
+ "xml",
34
+ "archives",
35
+ "ocr",
36
+ "language-detection",
37
+ "chunking",
38
+ "quality",
39
+ "keywords",
40
+ "api",
41
+ "mcp",
42
+ "otel",
43
+ "bundled-pdfium",
44
+ ] }
45
+ serde_json = "1.0.145"
46
+ serde = { version = "1.0.228", features = ["derive"] }
47
+ async-trait = "0.1.89"
48
+ tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
49
+ html-to-markdown-rs = { version = "2.14.11", default-features = false }
50
+
51
+ [target.'cfg(not(all(windows, target_env = "gnu")))'.dependencies]
52
+ kreuzberg = { path = "../kreuzberg", features = ["full", "bundled-pdfium"] }
53
+ serde_json = "1.0.145"
54
+ serde = { version = "1.0.228", features = ["derive"] }
55
+ async-trait = "0.1.89"
56
+ tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
57
+ html-to-markdown-rs = { version = "2.14.11", default-features = false }
58
+
59
+ [build-dependencies]
60
+ cbindgen = "0.29"
61
+
62
+ [dev-dependencies]
63
+ tempfile = "3.23.0"