kreuzberg 4.0.0.pre.rc.14 → 4.0.0.pre.rc.15
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +25 -215
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -2
- data/ext/kreuzberg_rb/native/build.rs +38 -1
- data/lib/kreuzberg/result.rb +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/{libpdfium.dylib → libpdfium.so} +0 -0
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/build.rs +54 -10
- data/vendor/kreuzberg/src/api/mod.rs +8 -0
- data/vendor/kreuzberg/src/extraction/html.rs +40 -7
- data/vendor/kreuzberg/src/pdf/bundled.rs +115 -9
- data/vendor/kreuzberg/tests/format_integration.rs +1 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +11 -21
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -63
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/build.rs +0 -176
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -27
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -1087
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -3616
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -247
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
|
@@ -41,11 +41,10 @@ pub use html_to_markdown_rs::{
|
|
|
41
41
|
PreprocessingPreset, WhitespaceMode,
|
|
42
42
|
};
|
|
43
43
|
|
|
44
|
-
// WASM has a much smaller stack
|
|
45
|
-
//
|
|
46
|
-
// We set it very high to avoid the overhead of the "large stack" path which is a no-op in WASM
|
|
44
|
+
// WASM has a much smaller stack and cannot spawn threads for large documents
|
|
45
|
+
// Set a conservative limit to prevent stack overflow in WASM builds
|
|
47
46
|
#[cfg(target_arch = "wasm32")]
|
|
48
|
-
const
|
|
47
|
+
const MAX_HTML_SIZE_BYTES: usize = 2 * 1024 * 1024; // 2MB limit for WASM
|
|
49
48
|
|
|
50
49
|
#[cfg(not(target_arch = "wasm32"))]
|
|
51
50
|
const LARGE_HTML_STACK_THRESHOLD_BYTES: usize = 512 * 1024;
|
|
@@ -221,22 +220,56 @@ fn convert_inline_images_with_large_stack(
|
|
|
221
220
|
/// - `extract_metadata = true` (parse YAML frontmatter)
|
|
222
221
|
/// - `hocr_spatial_tables = false` (disable hOCR table detection)
|
|
223
222
|
/// - `preprocessing.enabled = false` (disable HTML preprocessing)
|
|
223
|
+
///
|
|
224
|
+
/// # WASM Limitations
|
|
225
|
+
///
|
|
226
|
+
/// In WASM builds, HTML files larger than 2MB will be rejected with an error
|
|
227
|
+
/// to prevent stack overflow. For larger files, use the native library.
|
|
224
228
|
pub fn convert_html_to_markdown(html: &str, options: Option<ConversionOptions>) -> Result<String> {
|
|
229
|
+
// WASM builds have strict size limits due to limited stack space
|
|
230
|
+
#[cfg(target_arch = "wasm32")]
|
|
231
|
+
if html.len() > MAX_HTML_SIZE_BYTES {
|
|
232
|
+
return Err(KreuzbergError::validation(format!(
|
|
233
|
+
"HTML file size ({} bytes) exceeds WASM limit of {} bytes (2MB). \
|
|
234
|
+
Large HTML files cannot be processed in WASM due to stack constraints. \
|
|
235
|
+
Consider using the native library for files of this size.",
|
|
236
|
+
html.len(),
|
|
237
|
+
MAX_HTML_SIZE_BYTES
|
|
238
|
+
)));
|
|
239
|
+
}
|
|
240
|
+
|
|
225
241
|
let options = resolve_conversion_options(options);
|
|
242
|
+
|
|
243
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
226
244
|
if html_requires_large_stack(html.len()) {
|
|
227
|
-
convert_html_with_options_large_stack(html.to_string(), options)
|
|
228
|
-
} else {
|
|
229
|
-
convert_html_with_options(html, options)
|
|
245
|
+
return convert_html_with_options_large_stack(html.to_string(), options);
|
|
230
246
|
}
|
|
247
|
+
|
|
248
|
+
convert_html_with_options(html, options)
|
|
231
249
|
}
|
|
232
250
|
|
|
233
251
|
/// Process HTML with optional image extraction.
|
|
252
|
+
///
|
|
253
|
+
/// # WASM Limitations
|
|
254
|
+
///
|
|
255
|
+
/// In WASM builds, HTML files larger than 2MB will be rejected to prevent stack overflow.
|
|
234
256
|
pub fn process_html(
|
|
235
257
|
html: &str,
|
|
236
258
|
options: Option<ConversionOptions>,
|
|
237
259
|
extract_images: bool,
|
|
238
260
|
max_image_size: u64,
|
|
239
261
|
) -> Result<HtmlExtractionResult> {
|
|
262
|
+
// WASM builds have strict size limits due to limited stack space
|
|
263
|
+
#[cfg(target_arch = "wasm32")]
|
|
264
|
+
if html.len() > MAX_HTML_SIZE_BYTES {
|
|
265
|
+
return Err(KreuzbergError::validation(format!(
|
|
266
|
+
"HTML file size ({} bytes) exceeds WASM limit of {} bytes (2MB). \
|
|
267
|
+
Large HTML files cannot be processed in WASM due to stack constraints.",
|
|
268
|
+
html.len(),
|
|
269
|
+
MAX_HTML_SIZE_BYTES
|
|
270
|
+
)));
|
|
271
|
+
}
|
|
272
|
+
|
|
240
273
|
if extract_images {
|
|
241
274
|
let options = resolve_conversion_options(options.clone());
|
|
242
275
|
let mut img_config = LibInlineImageConfig::new(max_image_size);
|
|
@@ -4,13 +4,25 @@
|
|
|
4
4
|
//! using `include_bytes!` during compilation. This module handles runtime extraction to a
|
|
5
5
|
//! temporary directory and provides the path for dynamic loading.
|
|
6
6
|
//!
|
|
7
|
+
//! # Thread Safety
|
|
8
|
+
//!
|
|
9
|
+
//! Extraction is protected by a `Mutex` to prevent race conditions during concurrent access.
|
|
10
|
+
//! The first thread to call `extract_bundled_pdfium()` will perform the extraction while
|
|
11
|
+
//! others wait for completion.
|
|
12
|
+
//!
|
|
13
|
+
//! To prevent the "file too short" race condition where one thread loads a partially-written
|
|
14
|
+
//! file, we use atomic file operations: write to a temporary file, then atomically rename to
|
|
15
|
+
//! the final location. This ensures other threads never observe a partial file.
|
|
16
|
+
//!
|
|
7
17
|
//! # How It Works
|
|
8
18
|
//!
|
|
9
19
|
//! 1. During build (build.rs): PDFium is copied to OUT_DIR and the build script sets
|
|
10
20
|
//! `KREUZBERG_PDFIUM_BUNDLED_PATH` environment variable
|
|
11
21
|
//! 2. At compile time: `include_bytes!` embeds the library binary in the executable
|
|
12
22
|
//! 3. At runtime: `extract_bundled_pdfium()` extracts to `$TMPDIR/kreuzberg-pdfium/`
|
|
13
|
-
//! 4. Library is reused if already present (based on
|
|
23
|
+
//! 4. Library is reused if already present (based on file size validation)
|
|
24
|
+
//! 5. Concurrent calls are serialized with a `Mutex` to prevent partial writes
|
|
25
|
+
//! 6. Atomic rename (write temp file → rename) prevents "file too short" race conditions
|
|
14
26
|
//!
|
|
15
27
|
//! # Example
|
|
16
28
|
//!
|
|
@@ -30,10 +42,17 @@
|
|
|
30
42
|
use std::fs;
|
|
31
43
|
use std::io;
|
|
32
44
|
use std::path::{Path, PathBuf};
|
|
45
|
+
use std::sync::Mutex;
|
|
33
46
|
|
|
34
47
|
#[cfg(unix)]
|
|
35
48
|
use std::os::unix::fs::PermissionsExt;
|
|
36
49
|
|
|
50
|
+
// NOTE: Global mutex protects against TOCTOU (time-of-check to time-of-use) race conditions
|
|
51
|
+
// where multiple threads simultaneously check if the file exists, both find it missing,
|
|
52
|
+
// and try to write concurrently. This mutex ensures only one thread performs extraction
|
|
53
|
+
// while others wait for completion.
|
|
54
|
+
static EXTRACTION_LOCK: Mutex<()> = Mutex::new(());
|
|
55
|
+
|
|
37
56
|
/// Runtime library name and extraction directory for the bundled PDFium library.
|
|
38
57
|
///
|
|
39
58
|
/// Returns tuple of (library_name, extraction_directory)
|
|
@@ -93,6 +112,13 @@ fn is_extracted_library_valid(lib_path: &Path, embedded_size: usize) -> bool {
|
|
|
93
112
|
/// - Reuses extracted library if size matches
|
|
94
113
|
/// - Sets permissions to 0755 on Unix
|
|
95
114
|
/// - Returns path to extracted library
|
|
115
|
+
/// - **Thread-safe**: Synchronized with a global `Mutex` to prevent concurrent writes
|
|
116
|
+
///
|
|
117
|
+
/// # Concurrency
|
|
118
|
+
///
|
|
119
|
+
/// This function is fully thread-safe. When multiple threads call it simultaneously,
|
|
120
|
+
/// only the first thread performs the actual extraction while others wait. This prevents
|
|
121
|
+
/// the "file too short" error that occurs when one thread reads a partially-written file.
|
|
96
122
|
///
|
|
97
123
|
/// # WASM Handling
|
|
98
124
|
///
|
|
@@ -150,34 +176,68 @@ pub fn extract_bundled_pdfium() -> io::Result<PathBuf> {
|
|
|
150
176
|
return Ok(lib_path);
|
|
151
177
|
}
|
|
152
178
|
|
|
153
|
-
//
|
|
154
|
-
|
|
179
|
+
// NOTE: EXTRACTION_LOCK is a static Mutex that protects against concurrent writes.
|
|
180
|
+
// This serializes extraction across threads, preventing the "file too short" error
|
|
181
|
+
// that occurs when one thread reads a partially-written file.
|
|
182
|
+
let _guard = EXTRACTION_LOCK
|
|
183
|
+
.lock()
|
|
184
|
+
.map_err(|e| io::Error::other(format!("Failed to acquire extraction lock: {}", e)))?;
|
|
185
|
+
|
|
186
|
+
// Double-check after acquiring lock: another thread may have already extracted the file
|
|
187
|
+
if is_extracted_library_valid(&lib_path, bundled_lib.len()) {
|
|
188
|
+
return Ok(lib_path);
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
// Write to a temporary file first, then atomically rename to prevent other threads
|
|
192
|
+
// from reading a partially written file. This fixes the "file too short" race condition.
|
|
193
|
+
let temp_path = lib_path.with_extension(format!("tmp.{}", std::process::id()));
|
|
194
|
+
|
|
195
|
+
// Write library to temporary file
|
|
196
|
+
fs::write(&temp_path, bundled_lib).map_err(|e| {
|
|
155
197
|
io::Error::new(
|
|
156
198
|
e.kind(),
|
|
157
199
|
format!(
|
|
158
|
-
"Failed to
|
|
159
|
-
|
|
200
|
+
"Failed to write bundled pdfium library to temp file '{}': {}",
|
|
201
|
+
temp_path.display(),
|
|
160
202
|
e
|
|
161
203
|
),
|
|
162
204
|
)
|
|
163
205
|
})?;
|
|
164
206
|
|
|
165
|
-
// Set executable permissions on Unix
|
|
207
|
+
// Set executable permissions on Unix (before rename)
|
|
166
208
|
#[cfg(unix)]
|
|
167
209
|
{
|
|
168
210
|
let perms = fs::Permissions::from_mode(0o755);
|
|
169
|
-
fs::set_permissions(&
|
|
211
|
+
fs::set_permissions(&temp_path, perms).map_err(|e| {
|
|
212
|
+
// Clean up temp file on error
|
|
213
|
+
let _ = fs::remove_file(&temp_path);
|
|
170
214
|
io::Error::new(
|
|
171
215
|
e.kind(),
|
|
172
216
|
format!(
|
|
173
|
-
"Failed to set permissions on bundled pdfium
|
|
174
|
-
|
|
217
|
+
"Failed to set permissions on bundled pdfium temp file '{}': {}",
|
|
218
|
+
temp_path.display(),
|
|
175
219
|
e
|
|
176
220
|
),
|
|
177
221
|
)
|
|
178
222
|
})?;
|
|
179
223
|
}
|
|
180
224
|
|
|
225
|
+
// Atomically rename temp file to final location
|
|
226
|
+
// This ensures other threads never see a partially written file
|
|
227
|
+
fs::rename(&temp_path, &lib_path).map_err(|e| {
|
|
228
|
+
// Clean up temp file on error
|
|
229
|
+
let _ = fs::remove_file(&temp_path);
|
|
230
|
+
io::Error::new(
|
|
231
|
+
e.kind(),
|
|
232
|
+
format!(
|
|
233
|
+
"Failed to rename bundled pdfium library from '{}' to '{}': {}",
|
|
234
|
+
temp_path.display(),
|
|
235
|
+
lib_path.display(),
|
|
236
|
+
e
|
|
237
|
+
),
|
|
238
|
+
)
|
|
239
|
+
})?;
|
|
240
|
+
|
|
181
241
|
Ok(lib_path)
|
|
182
242
|
}
|
|
183
243
|
|
|
@@ -324,6 +384,52 @@ mod tests {
|
|
|
324
384
|
assert_eq!(size1, size2, "Reused library should have same file size");
|
|
325
385
|
}
|
|
326
386
|
|
|
387
|
+
#[test]
|
|
388
|
+
#[cfg(feature = "bundled-pdfium")]
|
|
389
|
+
fn test_extract_bundled_pdfium_concurrent_access() {
|
|
390
|
+
use std::thread;
|
|
391
|
+
|
|
392
|
+
// Spawn multiple threads that all try to extract simultaneously
|
|
393
|
+
let handles: Vec<_> = (0..10)
|
|
394
|
+
.map(|_| {
|
|
395
|
+
thread::spawn(|| {
|
|
396
|
+
let result = extract_bundled_pdfium();
|
|
397
|
+
assert!(result.is_ok(), "Concurrent extraction should succeed");
|
|
398
|
+
result.unwrap()
|
|
399
|
+
})
|
|
400
|
+
})
|
|
401
|
+
.collect();
|
|
402
|
+
|
|
403
|
+
// Collect all results
|
|
404
|
+
let paths: Vec<PathBuf> = handles
|
|
405
|
+
.into_iter()
|
|
406
|
+
.map(|h| h.join().expect("Thread should complete"))
|
|
407
|
+
.collect();
|
|
408
|
+
|
|
409
|
+
// All paths should be identical
|
|
410
|
+
let first_path = &paths[0];
|
|
411
|
+
assert!(
|
|
412
|
+
paths.iter().all(|p| p == first_path),
|
|
413
|
+
"All concurrent extractions should return the same path"
|
|
414
|
+
);
|
|
415
|
+
|
|
416
|
+
// Verify file exists and is valid
|
|
417
|
+
assert!(
|
|
418
|
+
first_path.exists(),
|
|
419
|
+
"Extracted library should exist at: {}",
|
|
420
|
+
first_path.display()
|
|
421
|
+
);
|
|
422
|
+
|
|
423
|
+
// Verify file size is not truncated/partial
|
|
424
|
+
let metadata = fs::metadata(first_path).expect("Should be able to read metadata");
|
|
425
|
+
let file_size = metadata.len();
|
|
426
|
+
assert!(
|
|
427
|
+
file_size > 1_000_000,
|
|
428
|
+
"PDFium library should be at least 1MB, got {} bytes",
|
|
429
|
+
file_size
|
|
430
|
+
);
|
|
431
|
+
}
|
|
432
|
+
|
|
327
433
|
#[test]
|
|
328
434
|
#[cfg(unix)]
|
|
329
435
|
#[cfg(feature = "bundled-pdfium")]
|
|
@@ -49,6 +49,7 @@ async fn test_pdf_password_protected_async() {
|
|
|
49
49
|
|
|
50
50
|
#[cfg(feature = "office")]
|
|
51
51
|
#[tokio::test]
|
|
52
|
+
#[cfg_attr(target_os = "windows", ignore = "LibreOffice tests timeout on Windows CI")]
|
|
52
53
|
async fn test_legacy_doc_extraction_async() {
|
|
53
54
|
if !test_documents_available() {
|
|
54
55
|
return;
|
metadata
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.0.0.pre.rc.
|
|
4
|
+
version: 4.0.0.pre.rc.15
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
|
+
autorequire:
|
|
8
9
|
bindir: exe
|
|
9
10
|
cert_chain: []
|
|
10
|
-
date: 2025-12-
|
|
11
|
+
date: 2025-12-21 00:00:00.000000000 Z
|
|
11
12
|
dependencies:
|
|
12
13
|
- !ruby/object:Gem::Dependency
|
|
13
14
|
name: bundler
|
|
@@ -213,7 +214,7 @@ files:
|
|
|
213
214
|
- lib/kreuzberg/setup_lib_path.rb
|
|
214
215
|
- lib/kreuzberg/validator_protocol.rb
|
|
215
216
|
- lib/kreuzberg/version.rb
|
|
216
|
-
- lib/libpdfium.
|
|
217
|
+
- lib/libpdfium.so
|
|
217
218
|
- sig/kreuzberg.rbs
|
|
218
219
|
- sig/kreuzberg/internal.rbs
|
|
219
220
|
- spec/binding/cache_spec.rb
|
|
@@ -232,19 +233,6 @@ files:
|
|
|
232
233
|
- spec/smoke/package_spec.rb
|
|
233
234
|
- spec/spec_helper.rb
|
|
234
235
|
- vendor/Cargo.toml
|
|
235
|
-
- vendor/kreuzberg-ffi/Cargo.toml
|
|
236
|
-
- vendor/kreuzberg-ffi/README.md
|
|
237
|
-
- vendor/kreuzberg-ffi/build.rs
|
|
238
|
-
- vendor/kreuzberg-ffi/cbindgen.toml
|
|
239
|
-
- vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in
|
|
240
|
-
- vendor/kreuzberg-ffi/kreuzberg.h
|
|
241
|
-
- vendor/kreuzberg-ffi/src/lib.rs
|
|
242
|
-
- vendor/kreuzberg-ffi/src/panic_shield.rs
|
|
243
|
-
- vendor/kreuzberg-ffi/tests.disabled/README.md
|
|
244
|
-
- vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs
|
|
245
|
-
- vendor/kreuzberg-ffi/tests.disabled/config_tests.rs
|
|
246
|
-
- vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs
|
|
247
|
-
- vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs
|
|
248
236
|
- vendor/kreuzberg-tesseract/.commitlintrc.json
|
|
249
237
|
- vendor/kreuzberg-tesseract/.crate-ignore
|
|
250
238
|
- vendor/kreuzberg-tesseract/Cargo.lock
|
|
@@ -544,13 +532,14 @@ homepage: https://github.com/kreuzberg-dev/kreuzberg
|
|
|
544
532
|
licenses:
|
|
545
533
|
- MIT
|
|
546
534
|
metadata:
|
|
547
|
-
|
|
535
|
+
homepage_uri: https://github.com/kreuzberg-dev/kreuzberg
|
|
536
|
+
source_code_uri: https://github.com/kreuzberg-dev/kreuzberg
|
|
548
537
|
changelog_uri: https://github.com/kreuzberg-dev/kreuzberg/blob/main/CHANGELOG.md
|
|
549
538
|
documentation_uri: https://docs.kreuzberg.dev
|
|
550
|
-
|
|
551
|
-
keywords: document-intelligence,document-extraction,ocr,rust,bindings
|
|
539
|
+
bug_tracker_uri: https://github.com/kreuzberg-dev/kreuzberg/issues
|
|
552
540
|
rubygems_mfa_required: 'true'
|
|
553
|
-
|
|
541
|
+
keywords: document-intelligence,document-extraction,ocr,rust,bindings
|
|
542
|
+
post_install_message:
|
|
554
543
|
rdoc_options: []
|
|
555
544
|
require_paths:
|
|
556
545
|
- lib
|
|
@@ -565,7 +554,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
565
554
|
- !ruby/object:Gem::Version
|
|
566
555
|
version: '0'
|
|
567
556
|
requirements: []
|
|
568
|
-
rubygems_version:
|
|
557
|
+
rubygems_version: 3.5.22
|
|
558
|
+
signing_key:
|
|
569
559
|
specification_version: 4
|
|
570
560
|
summary: High-performance document intelligence framework
|
|
571
561
|
test_files: []
|
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
[package]
|
|
2
|
-
name = "kreuzberg-ffi"
|
|
3
|
-
version = "4.0.0-rc.14"
|
|
4
|
-
edition = "2024"
|
|
5
|
-
rust-version = "1.91"
|
|
6
|
-
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
7
|
-
description = "C FFI bindings for Kreuzberg document intelligence library"
|
|
8
|
-
license = "MIT"
|
|
9
|
-
repository = "https://github.com/kreuzberg-dev/kreuzberg"
|
|
10
|
-
homepage = "https://kreuzberg.dev"
|
|
11
|
-
documentation = "https://docs.rs/kreuzberg-ffi"
|
|
12
|
-
readme = "README.md"
|
|
13
|
-
keywords = ["ffi", "bindings", "document", "extraction", "api"]
|
|
14
|
-
categories = ["development-tools::ffi", "text-processing"]
|
|
15
|
-
|
|
16
|
-
[lib]
|
|
17
|
-
crate-type = ["cdylib", "staticlib", "rlib"]
|
|
18
|
-
|
|
19
|
-
[features]
|
|
20
|
-
# Mirror embeddings feature availability from kreuzberg dependency
|
|
21
|
-
embeddings = []
|
|
22
|
-
|
|
23
|
-
[dependencies]
|
|
24
|
-
# On Windows MinGW, disable embeddings/ort since ONNX Runtime is not available
|
|
25
|
-
# in MinGW-compatible form. Use all other features but exclude embeddings.
|
|
26
|
-
[target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
|
|
27
|
-
kreuzberg = { path = "../kreuzberg", features = [
|
|
28
|
-
"pdf",
|
|
29
|
-
"excel",
|
|
30
|
-
"office",
|
|
31
|
-
"email",
|
|
32
|
-
"html",
|
|
33
|
-
"xml",
|
|
34
|
-
"archives",
|
|
35
|
-
"ocr",
|
|
36
|
-
"language-detection",
|
|
37
|
-
"chunking",
|
|
38
|
-
"quality",
|
|
39
|
-
"keywords",
|
|
40
|
-
"api",
|
|
41
|
-
"mcp",
|
|
42
|
-
"otel",
|
|
43
|
-
"bundled-pdfium",
|
|
44
|
-
] }
|
|
45
|
-
serde_json = "1.0.145"
|
|
46
|
-
serde = { version = "1.0.228", features = ["derive"] }
|
|
47
|
-
async-trait = "0.1.89"
|
|
48
|
-
tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
49
|
-
html-to-markdown-rs = { version = "2.15.0", default-features = false }
|
|
50
|
-
|
|
51
|
-
[target.'cfg(not(all(windows, target_env = "gnu")))'.dependencies]
|
|
52
|
-
kreuzberg = { path = "../kreuzberg", features = ["full", "bundled-pdfium"] }
|
|
53
|
-
serde_json = "1.0.145"
|
|
54
|
-
serde = { version = "1.0.228", features = ["derive"] }
|
|
55
|
-
async-trait = "0.1.89"
|
|
56
|
-
tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
57
|
-
html-to-markdown-rs = { version = "2.15.0", default-features = false }
|
|
58
|
-
|
|
59
|
-
[build-dependencies]
|
|
60
|
-
cbindgen = "0.29"
|
|
61
|
-
|
|
62
|
-
[dev-dependencies]
|
|
63
|
-
tempfile = "3.23.0"
|