kreuzberg 4.0.0.pre.rc.8 → 4.0.0.pre.rc.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +12 -9
- data/README.md +22 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +397 -177
- data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
- data/ext/kreuzberg_rb/native/src/lib.rs +36 -13
- data/kreuzberg.gemspec +34 -2
- data/lib/kreuzberg/cache_api.rb +35 -0
- data/lib/kreuzberg/error_context.rb +49 -1
- data/lib/kreuzberg/extraction_api.rb +255 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +6 -0
- data/lib/libpdfium.dylib +0 -0
- data/sig/kreuzberg.rbs +9 -0
- data/vendor/Cargo.toml +44 -0
- data/vendor/kreuzberg/Cargo.toml +61 -38
- data/vendor/kreuzberg/README.md +36 -27
- data/vendor/kreuzberg/build.rs +197 -245
- data/vendor/kreuzberg/src/core/pipeline.rs +13 -0
- data/vendor/kreuzberg/src/embeddings.rs +71 -3
- data/vendor/kreuzberg/src/error.rs +1 -1
- data/vendor/kreuzberg/src/extraction/html.rs +37 -5
- data/vendor/kreuzberg/src/extractors/pdf.rs +93 -44
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +19 -1
- data/vendor/kreuzberg/src/pdf/metadata.rs +2 -2
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +2 -2
- data/vendor/kreuzberg/src/pdf/table.rs +3 -0
- data/vendor/kreuzberg/src/pdf/text.rs +2 -2
- data/vendor/kreuzberg/src/text/quality_processor.rs +1 -1
- data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -1
- data/vendor/kreuzberg/tests/format_integration.rs +4 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/build.rs +176 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
- data/vendor/kreuzberg-tesseract/LICENSE +22 -0
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1354 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +39 -3
- data/vendor/rb-sys/bin/release.sh +0 -21
|
@@ -11,6 +11,30 @@
|
|
|
11
11
|
//! - Batch processing for efficient embedding generation
|
|
12
12
|
//! - Optional GPU acceleration via ONNX Runtime execution providers
|
|
13
13
|
//!
|
|
14
|
+
//! # ONNX Runtime Requirement
|
|
15
|
+
//!
|
|
16
|
+
//! **CRITICAL**: This module requires ONNX Runtime to be installed on the system.
|
|
17
|
+
//! The `embeddings` feature uses dynamic loading (`ort-load-dynamic`), which detects
|
|
18
|
+
//! the ONNX Runtime library at runtime.
|
|
19
|
+
//!
|
|
20
|
+
//! ## Installation Instructions
|
|
21
|
+
//!
|
|
22
|
+
//! - **macOS**: `brew install onnxruntime`
|
|
23
|
+
//! - **Linux (Ubuntu/Debian)**: `apt install libonnxruntime libonnxruntime-dev`
|
|
24
|
+
//! - **Linux (Fedora)**: `dnf install onnxruntime onnxruntime-devel`
|
|
25
|
+
//! - **Linux (Arch)**: `pacman -S onnxruntime`
|
|
26
|
+
//! - **Windows (MSVC)**: Download from https://github.com/microsoft/onnxruntime/releases and add to PATH
|
|
27
|
+
//!
|
|
28
|
+
//! Alternatively, set the `ORT_DYLIB_PATH` environment variable to the ONNX Runtime library path.
|
|
29
|
+
//!
|
|
30
|
+
//! For Docker/containers, install via package manager in your base image.
|
|
31
|
+
//! Verified packages: Ubuntu 22.04+, Fedora 38+, Arch Linux.
|
|
32
|
+
//!
|
|
33
|
+
//! ## Platform Limitations
|
|
34
|
+
//!
|
|
35
|
+
//! **Windows MinGW builds are not supported**. ONNX Runtime requires the MSVC toolchain on Windows.
|
|
36
|
+
//! Please use Windows MSVC builds or disable the embeddings feature.
|
|
37
|
+
//!
|
|
14
38
|
//! # Example
|
|
15
39
|
//!
|
|
16
40
|
//! ```rust,ignore
|
|
@@ -145,6 +169,35 @@ lazy_static! {
|
|
|
145
169
|
static ref MODEL_CACHE: RwLock<HashMap<String, CachedEmbedding>> = RwLock::new(HashMap::new());
|
|
146
170
|
}
|
|
147
171
|
|
|
172
|
+
/// Returns installation instructions for ONNX Runtime.
|
|
173
|
+
#[cfg(feature = "embeddings")]
|
|
174
|
+
fn onnx_runtime_install_message() -> String {
|
|
175
|
+
#[cfg(all(windows, target_env = "gnu"))]
|
|
176
|
+
{
|
|
177
|
+
return "ONNX Runtime embeddings are not supported on Windows MinGW builds. \
|
|
178
|
+
ONNX Runtime requires MSVC toolchain. \
|
|
179
|
+
Please use Windows MSVC builds or disable embeddings feature."
|
|
180
|
+
.to_string();
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
#[cfg(not(all(windows, target_env = "gnu")))]
|
|
184
|
+
{
|
|
185
|
+
"ONNX Runtime is required for embeddings functionality. \
|
|
186
|
+
Install: \
|
|
187
|
+
macOS: 'brew install onnxruntime', \
|
|
188
|
+
Linux (Ubuntu/Debian): 'apt install libonnxruntime libonnxruntime-dev', \
|
|
189
|
+
Linux (Fedora): 'dnf install onnxruntime onnxruntime-devel', \
|
|
190
|
+
Linux (Arch): 'pacman -S onnxruntime', \
|
|
191
|
+
Windows (MSVC): Download from https://github.com/microsoft/onnxruntime/releases and add to PATH. \
|
|
192
|
+
\
|
|
193
|
+
Alternatively, set ORT_DYLIB_PATH environment variable to the ONNX Runtime library path. \
|
|
194
|
+
\
|
|
195
|
+
For Docker/containers: Install via package manager in your base image. \
|
|
196
|
+
Verified packages: Ubuntu 22.04+, Fedora 38+, Arch Linux."
|
|
197
|
+
.to_string()
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
|
|
148
201
|
/// Get or initialize a text embedding model from cache.
|
|
149
202
|
///
|
|
150
203
|
/// This function ensures models are initialized only once and reused across
|
|
@@ -193,9 +246,24 @@ pub fn get_or_init_model(
|
|
|
193
246
|
let mut init_options = InitOptions::new(model);
|
|
194
247
|
init_options = init_options.with_cache_dir(cache_directory);
|
|
195
248
|
|
|
196
|
-
let embedding_model = TextEmbedding::try_new(init_options).map_err(|e|
|
|
197
|
-
|
|
198
|
-
|
|
249
|
+
let embedding_model = TextEmbedding::try_new(init_options).map_err(|e| {
|
|
250
|
+
let error_msg = e.to_string();
|
|
251
|
+
|
|
252
|
+
// Detect ONNX Runtime loading errors by checking for common patterns
|
|
253
|
+
if error_msg.contains("onnxruntime")
|
|
254
|
+
|| error_msg.contains("ORT")
|
|
255
|
+
|| error_msg.contains("libonnxruntime")
|
|
256
|
+
|| error_msg.contains("onnxruntime.dll")
|
|
257
|
+
|| error_msg.contains("Unable to load")
|
|
258
|
+
|| error_msg.contains("library load failed")
|
|
259
|
+
{
|
|
260
|
+
crate::KreuzbergError::MissingDependency(format!("ONNX Runtime - {}", onnx_runtime_install_message()))
|
|
261
|
+
} else {
|
|
262
|
+
crate::KreuzbergError::Plugin {
|
|
263
|
+
message: format!("Failed to initialize embedding model: {}", e),
|
|
264
|
+
plugin_name: "embeddings".to_string(),
|
|
265
|
+
}
|
|
266
|
+
}
|
|
199
267
|
})?;
|
|
200
268
|
|
|
201
269
|
let leaked_model = LeakedModel::new(embedding_model);
|
|
@@ -177,7 +177,7 @@ impl From<crate::pdf::error::PdfError> for KreuzbergError {
|
|
|
177
177
|
|
|
178
178
|
macro_rules! error_constructor {
|
|
179
179
|
($name:ident, $variant:ident) => {
|
|
180
|
-
|
|
180
|
+
pastey::paste! {
|
|
181
181
|
#[doc = "Create a " $variant " error"]
|
|
182
182
|
pub fn $name<S: Into<String>>(message: S) -> Self {
|
|
183
183
|
Self::$variant {
|
|
@@ -31,14 +31,26 @@ use html_to_markdown_rs::{
|
|
|
31
31
|
convert as convert_html, convert_with_inline_images,
|
|
32
32
|
};
|
|
33
33
|
use serde::{Deserialize, Serialize};
|
|
34
|
-
use std::
|
|
34
|
+
use std::collections::HashMap;
|
|
35
|
+
|
|
36
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
37
|
+
use std::{any::Any, thread};
|
|
35
38
|
|
|
36
39
|
pub use html_to_markdown_rs::{
|
|
37
40
|
CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, PreprocessingOptions,
|
|
38
41
|
PreprocessingPreset, WhitespaceMode,
|
|
39
42
|
};
|
|
40
43
|
|
|
44
|
+
// WASM has a much smaller stack, so we need a lower threshold
|
|
45
|
+
// In practice, WASM can't spawn threads anyway, so this threshold doesn't help much
|
|
46
|
+
// We set it very high to avoid the overhead of the "large stack" path which is a no-op in WASM
|
|
47
|
+
#[cfg(target_arch = "wasm32")]
|
|
48
|
+
const LARGE_HTML_STACK_THRESHOLD_BYTES: usize = usize::MAX;
|
|
49
|
+
|
|
50
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
41
51
|
const LARGE_HTML_STACK_THRESHOLD_BYTES: usize = 512 * 1024;
|
|
52
|
+
|
|
53
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
42
54
|
const HTML_CONVERSION_STACK_SIZE_BYTES: usize = 16 * 1024 * 1024;
|
|
43
55
|
|
|
44
56
|
/// Result of HTML extraction with optional images and warnings.
|
|
@@ -132,10 +144,6 @@ fn convert_html_with_options(html: &str, options: ConversionOptions) -> Result<S
|
|
|
132
144
|
.map_err(|e| KreuzbergError::parsing(format!("Failed to convert HTML to Markdown: {}", e)))
|
|
133
145
|
}
|
|
134
146
|
|
|
135
|
-
fn convert_html_with_options_large_stack(html: String, options: ConversionOptions) -> Result<String> {
|
|
136
|
-
run_on_dedicated_stack(move || convert_html_with_options(&html, options))
|
|
137
|
-
}
|
|
138
|
-
|
|
139
147
|
fn convert_inline_images_with_options(
|
|
140
148
|
html: &str,
|
|
141
149
|
options: ConversionOptions,
|
|
@@ -145,6 +153,13 @@ fn convert_inline_images_with_options(
|
|
|
145
153
|
.map_err(|e| KreuzbergError::parsing(format!("Failed to convert HTML to Markdown with images: {}", e)))
|
|
146
154
|
}
|
|
147
155
|
|
|
156
|
+
// Native (non-WASM) implementations use dedicated thread stack for large HTML documents
|
|
157
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
158
|
+
fn convert_html_with_options_large_stack(html: String, options: ConversionOptions) -> Result<String> {
|
|
159
|
+
run_on_dedicated_stack(move || convert_html_with_options(&html, options))
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
148
163
|
fn convert_inline_images_with_large_stack(
|
|
149
164
|
html: String,
|
|
150
165
|
options: ConversionOptions,
|
|
@@ -153,6 +168,7 @@ fn convert_inline_images_with_large_stack(
|
|
|
153
168
|
run_on_dedicated_stack(move || convert_inline_images_with_options(&html, options, image_config))
|
|
154
169
|
}
|
|
155
170
|
|
|
171
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
156
172
|
fn run_on_dedicated_stack<T, F>(job: F) -> Result<T>
|
|
157
173
|
where
|
|
158
174
|
T: Send + 'static,
|
|
@@ -173,6 +189,7 @@ where
|
|
|
173
189
|
}
|
|
174
190
|
}
|
|
175
191
|
|
|
192
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
176
193
|
fn extract_panic_reason(panic: &Box<dyn Any + Send + 'static>) -> String {
|
|
177
194
|
if let Some(msg) = panic.downcast_ref::<&str>() {
|
|
178
195
|
(*msg).to_string()
|
|
@@ -183,6 +200,21 @@ fn extract_panic_reason(panic: &Box<dyn Any + Send + 'static>) -> String {
|
|
|
183
200
|
}
|
|
184
201
|
}
|
|
185
202
|
|
|
203
|
+
// WASM implementations skip dedicated stack (not supported) and process inline
|
|
204
|
+
#[cfg(target_arch = "wasm32")]
|
|
205
|
+
fn convert_html_with_options_large_stack(html: String, options: ConversionOptions) -> Result<String> {
|
|
206
|
+
convert_html_with_options(&html, options)
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
#[cfg(target_arch = "wasm32")]
|
|
210
|
+
fn convert_inline_images_with_large_stack(
|
|
211
|
+
html: String,
|
|
212
|
+
options: ConversionOptions,
|
|
213
|
+
image_config: LibInlineImageConfig,
|
|
214
|
+
) -> Result<HtmlExtraction> {
|
|
215
|
+
convert_inline_images_with_options(&html, options, image_config)
|
|
216
|
+
}
|
|
217
|
+
|
|
186
218
|
/// Convert HTML to markdown with optional configuration.
|
|
187
219
|
///
|
|
188
220
|
/// Uses sensible defaults if no configuration is provided:
|
|
@@ -325,19 +325,33 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
325
325
|
config: &ExtractionConfig,
|
|
326
326
|
) -> Result<ExtractionResult> {
|
|
327
327
|
#[cfg(feature = "pdf")]
|
|
328
|
-
let (pdf_metadata, native_text, tables, page_contents) =
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
328
|
+
let (pdf_metadata, native_text, tables, page_contents) = {
|
|
329
|
+
// WASM target: always synchronous (no tokio::task::spawn_blocking)
|
|
330
|
+
// Other targets: use spawn_blocking in batch mode for better parallelism
|
|
331
|
+
#[cfg(target_arch = "wasm32")]
|
|
332
|
+
{
|
|
333
|
+
// SAFETY: For WASM targets, this code path should only be reached if the
|
|
334
|
+
// WASM environment has properly initialized PDFium. The error message
|
|
335
|
+
// will direct users to the documentation for setup requirements.
|
|
336
|
+
let bindings =
|
|
337
|
+
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")
|
|
338
|
+
.map_err(|pdf_err| {
|
|
339
|
+
// Provide context-specific error for WASM PDF failures
|
|
340
|
+
if pdf_err.to_string().contains("WASM") || pdf_err.to_string().contains("Module") {
|
|
341
|
+
crate::error::KreuzbergError::Parsing {
|
|
342
|
+
message: "PDF extraction requires proper WASM module initialization. \
|
|
343
|
+
Ensure your WASM environment is set up with PDFium support. \
|
|
344
|
+
See: https://docs.kreuzberg.dev/wasm/pdf"
|
|
345
|
+
.to_string(),
|
|
346
|
+
source: None,
|
|
347
|
+
}
|
|
348
|
+
} else {
|
|
349
|
+
pdf_err.into()
|
|
350
|
+
}
|
|
351
|
+
})?;
|
|
338
352
|
let pdfium = Pdfium::new(bindings);
|
|
339
353
|
|
|
340
|
-
let document = pdfium.load_pdf_from_byte_slice(
|
|
354
|
+
let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
|
|
341
355
|
let err_msg = e.to_string();
|
|
342
356
|
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
343
357
|
PdfError::PasswordRequired
|
|
@@ -347,51 +361,86 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
347
361
|
})?;
|
|
348
362
|
|
|
349
363
|
let (native_text, boundaries, page_contents) =
|
|
350
|
-
crate::pdf::text::extract_text_from_pdf_document(&document,
|
|
364
|
+
crate::pdf::text::extract_text_from_pdf_document(&document, config.pages.as_ref())?;
|
|
351
365
|
|
|
352
366
|
let pdf_metadata =
|
|
353
367
|
crate::pdf::metadata::extract_metadata_from_document(&document, boundaries.as_deref())?;
|
|
354
368
|
|
|
355
369
|
let tables = extract_tables_from_document(&document, &pdf_metadata)?;
|
|
356
370
|
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
)
|
|
364
|
-
.
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
371
|
+
(pdf_metadata, native_text, tables, page_contents)
|
|
372
|
+
}
|
|
373
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
374
|
+
{
|
|
375
|
+
if crate::core::batch_mode::is_batch_mode() {
|
|
376
|
+
let content_owned = content.to_vec();
|
|
377
|
+
let span = tracing::Span::current();
|
|
378
|
+
let pages_config = config.pages.clone();
|
|
379
|
+
tokio::task::spawn_blocking(move || {
|
|
380
|
+
let _guard = span.entered();
|
|
381
|
+
let bindings =
|
|
382
|
+
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
|
|
383
|
+
|
|
384
|
+
let pdfium = Pdfium::new(bindings);
|
|
385
|
+
|
|
386
|
+
let document = pdfium.load_pdf_from_byte_slice(&content_owned, None).map_err(|e| {
|
|
387
|
+
let err_msg = e.to_string();
|
|
388
|
+
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
389
|
+
PdfError::PasswordRequired
|
|
390
|
+
} else {
|
|
391
|
+
PdfError::InvalidPdf(err_msg)
|
|
392
|
+
}
|
|
393
|
+
})?;
|
|
394
|
+
|
|
395
|
+
let (native_text, boundaries, page_contents) =
|
|
396
|
+
crate::pdf::text::extract_text_from_pdf_document(&document, pages_config.as_ref())?;
|
|
397
|
+
|
|
398
|
+
let pdf_metadata =
|
|
399
|
+
crate::pdf::metadata::extract_metadata_from_document(&document, boundaries.as_deref())?;
|
|
400
|
+
|
|
401
|
+
let tables = extract_tables_from_document(&document, &pdf_metadata)?;
|
|
402
|
+
|
|
403
|
+
if let Some(ref page_cfg) = pages_config
|
|
404
|
+
&& page_cfg.extract_pages
|
|
405
|
+
&& page_contents.is_none()
|
|
406
|
+
{
|
|
407
|
+
return Err(PdfError::ExtractionFailed(
|
|
408
|
+
"Page extraction was configured but no page data was extracted in batch mode"
|
|
409
|
+
.to_string(),
|
|
410
|
+
)
|
|
411
|
+
.into());
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
Ok::<_, crate::error::KreuzbergError>((pdf_metadata, native_text, tables, page_contents))
|
|
415
|
+
})
|
|
416
|
+
.await
|
|
417
|
+
.map_err(|e| crate::error::KreuzbergError::Other(format!("PDF extraction task failed: {}", e)))??
|
|
418
|
+
} else {
|
|
419
|
+
let bindings =
|
|
420
|
+
crate::pdf::bindings::bind_pdfium(PdfError::MetadataExtractionFailed, "initialize Pdfium")?;
|
|
375
421
|
|
|
376
|
-
|
|
422
|
+
let pdfium = Pdfium::new(bindings);
|
|
377
423
|
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
424
|
+
let document = pdfium.load_pdf_from_byte_slice(content, None).map_err(|e| {
|
|
425
|
+
let err_msg = e.to_string();
|
|
426
|
+
if err_msg.contains("password") || err_msg.contains("Password") {
|
|
427
|
+
PdfError::PasswordRequired
|
|
428
|
+
} else {
|
|
429
|
+
PdfError::InvalidPdf(err_msg)
|
|
430
|
+
}
|
|
431
|
+
})?;
|
|
386
432
|
|
|
387
|
-
|
|
388
|
-
|
|
433
|
+
let (native_text, boundaries, page_contents) =
|
|
434
|
+
crate::pdf::text::extract_text_from_pdf_document(&document, config.pages.as_ref())?;
|
|
389
435
|
|
|
390
|
-
|
|
436
|
+
let pdf_metadata =
|
|
437
|
+
crate::pdf::metadata::extract_metadata_from_document(&document, boundaries.as_deref())?;
|
|
391
438
|
|
|
392
|
-
|
|
439
|
+
let tables = extract_tables_from_document(&document, &pdf_metadata)?;
|
|
393
440
|
|
|
394
|
-
|
|
441
|
+
(pdf_metadata, native_text, tables, page_contents)
|
|
442
|
+
}
|
|
443
|
+
}
|
|
395
444
|
};
|
|
396
445
|
|
|
397
446
|
#[cfg(feature = "ocr")]
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
use super::error::PdfError;
|
|
2
|
+
use pdfium_render::prelude::*;
|
|
3
|
+
|
|
4
|
+
pub(crate) fn bind_pdfium(
|
|
5
|
+
map_err: fn(String) -> PdfError,
|
|
6
|
+
context: &'static str,
|
|
7
|
+
) -> Result<Box<dyn PdfiumLibraryBindings>, PdfError> {
|
|
8
|
+
#[cfg(all(feature = "pdf", feature = "pdf-bundled"))]
|
|
9
|
+
{
|
|
10
|
+
// WASM target: use dynamic binding to WASM module
|
|
11
|
+
// SAFETY: pdfium-render handles WASM module lifecycle internally.
|
|
12
|
+
// For WASM builds, the PDFium library is linked at compile time
|
|
13
|
+
// and the WASM runtime manages initialization.
|
|
14
|
+
#[cfg(target_arch = "wasm32")]
|
|
15
|
+
{
|
|
16
|
+
Pdfium::bind_to_system_library()
|
|
17
|
+
.map_err(|e| map_err(format!("Failed to initialize Pdfium for WASM ({}): {}", context, e)))
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
// Non-WASM targets: extract and link dynamically
|
|
21
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
22
|
+
{
|
|
23
|
+
let lib_path = crate::pdf::extract_bundled_pdfium()
|
|
24
|
+
.map_err(|e| map_err(format!("Failed to extract bundled Pdfium ({}): {}", context, e)))?;
|
|
25
|
+
|
|
26
|
+
let lib_dir = lib_path.parent().ok_or_else(|| {
|
|
27
|
+
map_err(format!(
|
|
28
|
+
"Failed to determine Pdfium extraction directory for '{}' ({})",
|
|
29
|
+
lib_path.display(),
|
|
30
|
+
context
|
|
31
|
+
))
|
|
32
|
+
})?;
|
|
33
|
+
|
|
34
|
+
Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path(lib_dir))
|
|
35
|
+
.map_err(|e| map_err(format!("Failed to initialize Pdfium ({}): {}", context, e)))
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
#[cfg(all(feature = "pdf", not(feature = "pdf-bundled")))]
|
|
40
|
+
{
|
|
41
|
+
Pdfium::bind_to_system_library()
|
|
42
|
+
.map_err(|e| map_err(format!("Failed to initialize Pdfium ({}): {}", context, e)))
|
|
43
|
+
}
|
|
44
|
+
}
|
|
@@ -89,17 +89,24 @@ fn is_extracted_library_valid(lib_path: &Path, embedded_size: usize) -> bool {
|
|
|
89
89
|
/// # Behavior
|
|
90
90
|
///
|
|
91
91
|
/// - Embeds PDFium library using `include_bytes!`
|
|
92
|
-
/// - Extracts to `$TMPDIR/kreuzberg-pdfium/`
|
|
92
|
+
/// - Extracts to `$TMPDIR/kreuzberg-pdfium/` (non-WASM only)
|
|
93
93
|
/// - Reuses extracted library if size matches
|
|
94
94
|
/// - Sets permissions to 0755 on Unix
|
|
95
95
|
/// - Returns path to extracted library
|
|
96
96
|
///
|
|
97
|
+
/// # WASM Handling
|
|
98
|
+
///
|
|
99
|
+
/// On WASM targets (wasm32-*), this function returns an error with a helpful
|
|
100
|
+
/// message directing users to use WASM-specific initialization. WASM PDFium
|
|
101
|
+
/// is initialized through the runtime, not via file extraction.
|
|
102
|
+
///
|
|
97
103
|
/// # Errors
|
|
98
104
|
///
|
|
99
105
|
/// Returns `std::io::Error` if:
|
|
100
106
|
/// - Cannot create extraction directory
|
|
101
107
|
/// - Cannot write library file
|
|
102
108
|
/// - Cannot set file permissions (Unix only)
|
|
109
|
+
/// - Target is WASM (filesystem access not available)
|
|
103
110
|
///
|
|
104
111
|
/// # Platform-Specific Library Names
|
|
105
112
|
///
|
|
@@ -107,6 +114,17 @@ fn is_extracted_library_valid(lib_path: &Path, embedded_size: usize) -> bool {
|
|
|
107
114
|
/// - macOS: `libpdfium.dylib`
|
|
108
115
|
/// - Windows: `pdfium.dll`
|
|
109
116
|
pub fn extract_bundled_pdfium() -> io::Result<PathBuf> {
|
|
117
|
+
// WASM targets cannot use file extraction
|
|
118
|
+
#[cfg(target_arch = "wasm32")]
|
|
119
|
+
{
|
|
120
|
+
return Err(io::Error::new(
|
|
121
|
+
io::ErrorKind::Unsupported,
|
|
122
|
+
"File extraction is not available in WASM. \
|
|
123
|
+
PDFium for WASM must be initialized via the WebAssembly runtime. \
|
|
124
|
+
Use a WASM-compatible environment with proper module initialization.",
|
|
125
|
+
));
|
|
126
|
+
}
|
|
127
|
+
|
|
110
128
|
let (lib_name, _) = bundled_library_info();
|
|
111
129
|
let extract_dir = get_extraction_dir()?;
|
|
112
130
|
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
use super::bindings::bind_pdfium;
|
|
1
2
|
use super::error::{PdfError, Result};
|
|
2
3
|
use crate::types::{PageBoundary, PageInfo, PageStructure, PageUnitType};
|
|
3
4
|
use pdfium_render::prelude::*;
|
|
@@ -85,8 +86,7 @@ pub fn extract_metadata(pdf_bytes: &[u8]) -> Result<PdfMetadata> {
|
|
|
85
86
|
///
|
|
86
87
|
/// Returns only PDF-specific metadata (version, producer, encryption status, dimensions).
|
|
87
88
|
pub fn extract_metadata_with_password(pdf_bytes: &[u8], password: Option<&str>) -> Result<PdfMetadata> {
|
|
88
|
-
let bindings =
|
|
89
|
-
.map_err(|e| PdfError::MetadataExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
|
|
89
|
+
let bindings = bind_pdfium(PdfError::MetadataExtractionFailed, "metadata extraction")?;
|
|
90
90
|
|
|
91
91
|
let pdfium = Pdfium::new(bindings);
|
|
92
92
|
|
|
@@ -35,6 +35,8 @@
|
|
|
35
35
|
//!
|
|
36
36
|
//! This module requires the `pdf` feature. The `ocr` feature enables additional
|
|
37
37
|
//! functionality in the PDF extractor for rendering pages to images.
|
|
38
|
+
#[cfg(feature = "pdf")]
|
|
39
|
+
pub(crate) mod bindings;
|
|
38
40
|
#[cfg(all(feature = "pdf", feature = "pdf-bundled"))]
|
|
39
41
|
pub mod bundled;
|
|
40
42
|
#[cfg(feature = "pdf")]
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
use super::bindings::bind_pdfium;
|
|
1
2
|
use super::error::{PdfError, Result};
|
|
2
3
|
use image::DynamicImage;
|
|
3
4
|
use pdfium_render::prelude::*;
|
|
@@ -32,8 +33,7 @@ pub struct PdfRenderer {
|
|
|
32
33
|
|
|
33
34
|
impl PdfRenderer {
|
|
34
35
|
pub fn new() -> Result<Self> {
|
|
35
|
-
let binding =
|
|
36
|
-
.map_err(|e| PdfError::RenderingFailed(format!("Failed to initialize Pdfium: {}", e)))?;
|
|
36
|
+
let binding = bind_pdfium(PdfError::RenderingFailed, "page rendering")?;
|
|
37
37
|
|
|
38
38
|
let pdfium = Pdfium::new(binding);
|
|
39
39
|
Ok(Self { pdfium })
|
|
@@ -13,9 +13,11 @@ use pdfium_render::prelude::*;
|
|
|
13
13
|
/// Spacing threshold for word boundary detection (in PDF units).
|
|
14
14
|
///
|
|
15
15
|
/// Characters separated by more than this distance are considered separate words.
|
|
16
|
+
#[cfg(feature = "ocr")]
|
|
16
17
|
const WORD_SPACING_THRESHOLD: f32 = 3.0;
|
|
17
18
|
|
|
18
19
|
/// Minimum word length for table detection (filter out noise).
|
|
20
|
+
#[cfg(feature = "ocr")]
|
|
19
21
|
const MIN_WORD_LENGTH: usize = 1;
|
|
20
22
|
|
|
21
23
|
/// Extract words with positions from PDF page for table detection.
|
|
@@ -80,6 +82,7 @@ pub fn extract_words_from_page(_page: &PdfPage, _min_confidence: f64) -> Result<
|
|
|
80
82
|
}
|
|
81
83
|
|
|
82
84
|
/// Character with position information extracted from PDF.
|
|
85
|
+
#[cfg(feature = "ocr")]
|
|
83
86
|
#[derive(Debug, Clone)]
|
|
84
87
|
struct CharInfo {
|
|
85
88
|
text: char,
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
use super::bindings::bind_pdfium;
|
|
1
2
|
use super::error::{PdfError, Result};
|
|
2
3
|
use crate::core::config::PageConfig;
|
|
3
4
|
use crate::types::{PageBoundary, PageContent};
|
|
@@ -13,8 +14,7 @@ pub struct PdfTextExtractor {
|
|
|
13
14
|
|
|
14
15
|
impl PdfTextExtractor {
|
|
15
16
|
pub fn new() -> Result<Self> {
|
|
16
|
-
let binding =
|
|
17
|
-
.map_err(|e| PdfError::TextExtractionFailed(format!("Failed to initialize Pdfium: {}", e)))?;
|
|
17
|
+
let binding = bind_pdfium(PdfError::TextExtractionFailed, "text extraction")?;
|
|
18
18
|
|
|
19
19
|
let pdfium = Pdfium::new(binding);
|
|
20
20
|
Ok(Self { pdfium })
|
|
@@ -19,7 +19,7 @@ use async_trait::async_trait;
|
|
|
19
19
|
///
|
|
20
20
|
/// ```rust,no_run
|
|
21
21
|
/// use kreuzberg::plugins::{Plugin, PostProcessor};
|
|
22
|
-
/// use kreuzberg::text::
|
|
22
|
+
/// use kreuzberg::text::QualityProcessor;
|
|
23
23
|
///
|
|
24
24
|
/// let processor = QualityProcessor;
|
|
25
25
|
/// assert_eq!(processor.name(), "quality-processing");
|
|
@@ -244,7 +244,7 @@ async fn test_concurrent_ocr_processing() {
|
|
|
244
244
|
#[test]
|
|
245
245
|
fn test_concurrent_ocr_cache_stress() {
|
|
246
246
|
use helpers::{get_test_file_path, skip_if_missing};
|
|
247
|
-
use std::sync::atomic::Ordering;
|
|
247
|
+
use std::sync::atomic::{AtomicUsize, Ordering};
|
|
248
248
|
|
|
249
249
|
if skip_if_missing("images/ocr_image.jpg") {
|
|
250
250
|
tracing::debug!("Skipping OCR cache stress test: test file not available");
|
|
@@ -9,7 +9,10 @@
|
|
|
9
9
|
|
|
10
10
|
mod helpers;
|
|
11
11
|
|
|
12
|
-
use helpers::{assert_mime_type,
|
|
12
|
+
use helpers::{assert_mime_type, get_test_file_path, test_documents_available};
|
|
13
|
+
|
|
14
|
+
#[cfg(any(feature = "office", feature = "ocr"))]
|
|
15
|
+
use helpers::assert_non_empty_content;
|
|
13
16
|
use kreuzberg::core::config::ExtractionConfig;
|
|
14
17
|
use kreuzberg::core::extractor::extract_file;
|
|
15
18
|
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
name = "kreuzberg-ffi"
|
|
3
|
+
version = "4.0.0-rc.11"
|
|
4
|
+
edition = "2024"
|
|
5
|
+
rust-version = "1.91"
|
|
6
|
+
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
7
|
+
description = "C FFI bindings for Kreuzberg document intelligence library"
|
|
8
|
+
license = "MIT"
|
|
9
|
+
repository = "https://github.com/kreuzberg-dev/kreuzberg"
|
|
10
|
+
homepage = "https://kreuzberg.dev"
|
|
11
|
+
documentation = "https://docs.rs/kreuzberg-ffi"
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
keywords = ["ffi", "bindings", "document", "extraction", "api"]
|
|
14
|
+
categories = ["development-tools::ffi", "text-processing"]
|
|
15
|
+
|
|
16
|
+
[lib]
|
|
17
|
+
crate-type = ["cdylib", "staticlib", "rlib"]
|
|
18
|
+
|
|
19
|
+
[features]
|
|
20
|
+
# Mirror embeddings feature availability from kreuzberg dependency
|
|
21
|
+
embeddings = []
|
|
22
|
+
|
|
23
|
+
[dependencies]
|
|
24
|
+
# On Windows MinGW, disable embeddings/ort since ONNX Runtime is not available
|
|
25
|
+
# in MinGW-compatible form. Use all other features but exclude embeddings.
|
|
26
|
+
[target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
|
|
27
|
+
kreuzberg = { path = "../kreuzberg", features = [
|
|
28
|
+
"pdf",
|
|
29
|
+
"excel",
|
|
30
|
+
"office",
|
|
31
|
+
"email",
|
|
32
|
+
"html",
|
|
33
|
+
"xml",
|
|
34
|
+
"archives",
|
|
35
|
+
"ocr",
|
|
36
|
+
"language-detection",
|
|
37
|
+
"chunking",
|
|
38
|
+
"quality",
|
|
39
|
+
"keywords",
|
|
40
|
+
"api",
|
|
41
|
+
"mcp",
|
|
42
|
+
"otel",
|
|
43
|
+
"bundled-pdfium",
|
|
44
|
+
] }
|
|
45
|
+
serde_json = "1.0.145"
|
|
46
|
+
serde = { version = "1.0.228", features = ["derive"] }
|
|
47
|
+
async-trait = "0.1.89"
|
|
48
|
+
tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
49
|
+
html-to-markdown-rs = { version = "2.14.11", default-features = false }
|
|
50
|
+
|
|
51
|
+
[target.'cfg(not(all(windows, target_env = "gnu")))'.dependencies]
|
|
52
|
+
kreuzberg = { path = "../kreuzberg", features = ["full", "bundled-pdfium"] }
|
|
53
|
+
serde_json = "1.0.145"
|
|
54
|
+
serde = { version = "1.0.228", features = ["derive"] }
|
|
55
|
+
async-trait = "0.1.89"
|
|
56
|
+
tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
57
|
+
html-to-markdown-rs = { version = "2.14.11", default-features = false }
|
|
58
|
+
|
|
59
|
+
[build-dependencies]
|
|
60
|
+
cbindgen = "0.29"
|
|
61
|
+
|
|
62
|
+
[dev-dependencies]
|
|
63
|
+
tempfile = "3.23.0"
|