kreuzberg 4.0.0.pre.rc.16 → 4.0.0.pre.rc.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +65 -3
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/ext/kreuzberg_rb/native/build.rs +70 -45
- data/ext/kreuzberg_rb/native/src/lib.rs +6 -7
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/src/extraction/html.rs +1 -14
- data/vendor/kreuzberg-ffi/Cargo.toml +74 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +230 -0
- data/vendor/kreuzberg-ffi/build.rs +176 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +2959 -0
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +626 -0
- data/vendor/kreuzberg-ffi/src/config.rs +1050 -0
- data/vendor/kreuzberg-ffi/src/error.rs +950 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +4107 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
- data/vendor/kreuzberg-ffi/src/result.rs +517 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +675 -0
- data/vendor/kreuzberg-ffi/src/result_view.rs +815 -0
- data/vendor/kreuzberg-ffi/src/string_intern.rs +596 -0
- data/vendor/kreuzberg-ffi/src/validation.rs +938 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +24 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: a1c1d7ddcc45217bf5d9ea47a4d9d3ef9f41ed5a4bd87f4ff1f2ada7cfe0bca6
|
|
4
|
+
data.tar.gz: 167fb6c623c9e4368bcd2388e0ef4631d170d0e167cc0555c2dc7cd814bff9eb
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 37661576ba03012b1549c2388e0aee4c24cdb0e05cb34164fc85ca654c20236b28829678531c50fe46260433c43907c3f03cf516753f12c78ee2026a0b14a446
|
|
7
|
+
data.tar.gz: 96e1b3b10589fa7f47fc9609158da556e7d889a61963953c72e5f92337897828d68934d8dc0a66d7b2e28774204b6b1bf2c95226d5c26c84a98a01ce797e24df
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.0.0.pre.rc.
|
|
4
|
+
kreuzberg (4.0.0.pre.rc.18)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -33,7 +33,7 @@ GEM
|
|
|
33
33
|
ffi (1.17.2-arm64-darwin)
|
|
34
34
|
ffi (1.17.2-x86_64-linux-gnu)
|
|
35
35
|
fileutils (1.8.0)
|
|
36
|
-
i18n (1.14.
|
|
36
|
+
i18n (1.14.8)
|
|
37
37
|
concurrent-ruby (~> 1.0)
|
|
38
38
|
json (2.18.0)
|
|
39
39
|
language_server-protocol (3.17.0.5)
|
|
@@ -455,6 +455,18 @@ dependencies = [
|
|
|
455
455
|
"core2",
|
|
456
456
|
]
|
|
457
457
|
|
|
458
|
+
[[package]]
|
|
459
|
+
name = "bitvec"
|
|
460
|
+
version = "1.0.1"
|
|
461
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
462
|
+
checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c"
|
|
463
|
+
dependencies = [
|
|
464
|
+
"funty",
|
|
465
|
+
"radium",
|
|
466
|
+
"tap",
|
|
467
|
+
"wyz",
|
|
468
|
+
]
|
|
469
|
+
|
|
458
470
|
[[package]]
|
|
459
471
|
name = "blake3"
|
|
460
472
|
version = "1.8.2"
|
|
@@ -987,6 +999,20 @@ dependencies = [
|
|
|
987
999
|
"serde",
|
|
988
1000
|
]
|
|
989
1001
|
|
|
1002
|
+
[[package]]
|
|
1003
|
+
name = "dashmap"
|
|
1004
|
+
version = "6.1.0"
|
|
1005
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1006
|
+
checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
|
|
1007
|
+
dependencies = [
|
|
1008
|
+
"cfg-if",
|
|
1009
|
+
"crossbeam-utils",
|
|
1010
|
+
"hashbrown 0.14.5",
|
|
1011
|
+
"lock_api",
|
|
1012
|
+
"once_cell",
|
|
1013
|
+
"parking_lot_core",
|
|
1014
|
+
]
|
|
1015
|
+
|
|
990
1016
|
[[package]]
|
|
991
1017
|
name = "debug_unsafe"
|
|
992
1018
|
version = "0.1.3"
|
|
@@ -1434,6 +1460,12 @@ dependencies = [
|
|
|
1434
1460
|
"windows-sys 0.59.0",
|
|
1435
1461
|
]
|
|
1436
1462
|
|
|
1463
|
+
[[package]]
|
|
1464
|
+
name = "funty"
|
|
1465
|
+
version = "2.0.0"
|
|
1466
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1467
|
+
checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
|
|
1468
|
+
|
|
1437
1469
|
[[package]]
|
|
1438
1470
|
name = "futf"
|
|
1439
1471
|
version = "0.1.5"
|
|
@@ -1638,6 +1670,12 @@ dependencies = [
|
|
|
1638
1670
|
"zerocopy",
|
|
1639
1671
|
]
|
|
1640
1672
|
|
|
1673
|
+
[[package]]
|
|
1674
|
+
name = "hashbrown"
|
|
1675
|
+
version = "0.14.5"
|
|
1676
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1677
|
+
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
|
|
1678
|
+
|
|
1641
1679
|
[[package]]
|
|
1642
1680
|
name = "hashbrown"
|
|
1643
1681
|
version = "0.15.5"
|
|
@@ -2280,7 +2318,7 @@ dependencies = [
|
|
|
2280
2318
|
|
|
2281
2319
|
[[package]]
|
|
2282
2320
|
name = "kreuzberg"
|
|
2283
|
-
version = "4.0.0-rc.
|
|
2321
|
+
version = "4.0.0-rc.17"
|
|
2284
2322
|
dependencies = [
|
|
2285
2323
|
"ahash",
|
|
2286
2324
|
"async-trait",
|
|
@@ -2288,8 +2326,10 @@ dependencies = [
|
|
|
2288
2326
|
"base64 0.22.1",
|
|
2289
2327
|
"base64-simd",
|
|
2290
2328
|
"biblatex",
|
|
2329
|
+
"bitvec",
|
|
2291
2330
|
"calamine",
|
|
2292
2331
|
"chardetng",
|
|
2332
|
+
"dashmap",
|
|
2293
2333
|
"docx-lite",
|
|
2294
2334
|
"encoding_rs",
|
|
2295
2335
|
"fast_image_resize",
|
|
@@ -2334,6 +2374,7 @@ dependencies = [
|
|
|
2334
2374
|
"serde_json",
|
|
2335
2375
|
"serde_yaml_ng",
|
|
2336
2376
|
"sevenz-rust2",
|
|
2377
|
+
"simdutf8",
|
|
2337
2378
|
"tar",
|
|
2338
2379
|
"text-splitter",
|
|
2339
2380
|
"thiserror 2.0.17",
|
|
@@ -2354,7 +2395,7 @@ dependencies = [
|
|
|
2354
2395
|
|
|
2355
2396
|
[[package]]
|
|
2356
2397
|
name = "kreuzberg-rb"
|
|
2357
|
-
version = "4.0.0-rc.
|
|
2398
|
+
version = "4.0.0-rc.17"
|
|
2358
2399
|
dependencies = [
|
|
2359
2400
|
"async-trait",
|
|
2360
2401
|
"html-to-markdown-rs",
|
|
@@ -2368,7 +2409,7 @@ dependencies = [
|
|
|
2368
2409
|
|
|
2369
2410
|
[[package]]
|
|
2370
2411
|
name = "kreuzberg-tesseract"
|
|
2371
|
-
version = "4.0.0-rc.
|
|
2412
|
+
version = "4.0.0-rc.17"
|
|
2372
2413
|
dependencies = [
|
|
2373
2414
|
"cc",
|
|
2374
2415
|
"cmake",
|
|
@@ -4124,6 +4165,12 @@ version = "5.3.0"
|
|
|
4124
4165
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4125
4166
|
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
|
|
4126
4167
|
|
|
4168
|
+
[[package]]
|
|
4169
|
+
name = "radium"
|
|
4170
|
+
version = "0.7.0"
|
|
4171
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4172
|
+
checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09"
|
|
4173
|
+
|
|
4127
4174
|
[[package]]
|
|
4128
4175
|
name = "rake"
|
|
4129
4176
|
version = "0.3.6"
|
|
@@ -5218,6 +5265,12 @@ dependencies = [
|
|
|
5218
5265
|
"syn",
|
|
5219
5266
|
]
|
|
5220
5267
|
|
|
5268
|
+
[[package]]
|
|
5269
|
+
name = "tap"
|
|
5270
|
+
version = "1.0.1"
|
|
5271
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5272
|
+
checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
|
|
5273
|
+
|
|
5221
5274
|
[[package]]
|
|
5222
5275
|
name = "tar"
|
|
5223
5276
|
version = "0.4.44"
|
|
@@ -6439,6 +6492,15 @@ version = "0.6.2"
|
|
|
6439
6492
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6440
6493
|
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
|
|
6441
6494
|
|
|
6495
|
+
[[package]]
|
|
6496
|
+
name = "wyz"
|
|
6497
|
+
version = "0.5.1"
|
|
6498
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6499
|
+
checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed"
|
|
6500
|
+
dependencies = [
|
|
6501
|
+
"tap",
|
|
6502
|
+
]
|
|
6503
|
+
|
|
6442
6504
|
[[package]]
|
|
6443
6505
|
name = "xattr"
|
|
6444
6506
|
version = "1.6.1"
|
|
@@ -1,52 +1,77 @@
|
|
|
1
|
-
|
|
1
|
+
use std::env;
|
|
2
|
+
use std::path::PathBuf;
|
|
3
|
+
|
|
2
4
|
fn main() {
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
5
|
+
let target = env::var("TARGET").unwrap();
|
|
6
|
+
let profile = env::var("PROFILE").unwrap_or_else(|_| "release".to_string());
|
|
7
|
+
|
|
8
|
+
// Try to locate kreuzberg-ffi library built alongside this crate
|
|
9
|
+
let cargo_manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
|
|
10
|
+
let manifest_path = PathBuf::from(&cargo_manifest_dir);
|
|
11
|
+
|
|
12
|
+
// Prefer host target layout, but include target-triple layout for cross builds.
|
|
13
|
+
// IMPORTANT: Only search lib directories, NOT deps directories.
|
|
14
|
+
// The deps/ directories may contain dylibs with hardcoded install_name paths,
|
|
15
|
+
// which causes load errors on macOS when users install the gem.
|
|
16
|
+
if let Some(packages_root) = manifest_path
|
|
17
|
+
.parent()
|
|
18
|
+
.and_then(|p| p.parent())
|
|
19
|
+
.and_then(|p| p.parent())
|
|
20
|
+
.and_then(|p| p.parent())
|
|
21
|
+
.and_then(|p| p.parent())
|
|
22
|
+
{
|
|
23
|
+
let host_lib_dir = packages_root.join("target").join(&profile);
|
|
24
|
+
let target_lib_dir = packages_root.join("target").join(&target).join(&profile);
|
|
25
|
+
|
|
26
|
+
// Try to find the static library and link it directly on Unix-like systems
|
|
27
|
+
// to avoid the linker preferring dylib over static lib.
|
|
28
|
+
if !target.contains("windows") {
|
|
29
|
+
let static_lib_name = if target.contains("windows") {
|
|
30
|
+
"kreuzberg_ffi.lib"
|
|
31
|
+
} else {
|
|
32
|
+
"libkreuzberg_ffi.a"
|
|
33
|
+
};
|
|
34
|
+
|
|
35
|
+
// Check both host and target lib directories for the static library
|
|
36
|
+
for lib_dir in [&host_lib_dir, &target_lib_dir] {
|
|
37
|
+
let static_lib = lib_dir.join(static_lib_name);
|
|
38
|
+
if static_lib.exists() {
|
|
39
|
+
// Found static library, link it directly by passing the full path
|
|
40
|
+
println!("cargo:rustc-link-arg={}", static_lib.display());
|
|
41
|
+
// Don't add the library search path or -l flag
|
|
42
|
+
// Jump to platform-specific configuration
|
|
43
|
+
if target.contains("darwin") {
|
|
44
|
+
println!("cargo:rustc-link-arg=-Wl,-undefined,dynamic_lookup");
|
|
45
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
|
|
46
|
+
} else if target.contains("linux") {
|
|
47
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
|
|
48
|
+
}
|
|
49
|
+
println!("cargo:rerun-if-changed=build.rs");
|
|
50
|
+
return;
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
// Fallback: Add search paths and use standard linking
|
|
56
|
+
for dir in [host_lib_dir, target_lib_dir] {
|
|
57
|
+
println!("cargo:rustc-link-search=native={}", dir.display());
|
|
58
|
+
}
|
|
13
59
|
}
|
|
14
|
-
println!("cargo:rustc-link-arg=-Wl,-undefined,dynamic_lookup");
|
|
15
|
-
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
|
|
16
|
-
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
|
|
17
|
-
}
|
|
18
60
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
if
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
.and_then(|p| p.parent())
|
|
26
|
-
.and_then(|p| p.parent())
|
|
27
|
-
.and_then(|p| p.parent())
|
|
28
|
-
.map(|p| p.join("target/release"))
|
|
29
|
-
.expect("Failed to construct lib path");
|
|
30
|
-
println!("cargo:rustc-link-search={}", lib_path.display());
|
|
61
|
+
// Link the kreuzberg-ffi library
|
|
62
|
+
// When kreuzberg-ffi is built, its symbols become available for linking
|
|
63
|
+
if target.contains("windows") {
|
|
64
|
+
println!("cargo:rustc-link-lib=dylib=kreuzberg_ffi");
|
|
65
|
+
} else {
|
|
66
|
+
println!("cargo:rustc-link-lib=static=kreuzberg_ffi");
|
|
31
67
|
}
|
|
32
|
-
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
|
|
33
|
-
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
|
|
34
|
-
}
|
|
35
68
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
.and_then(|p| p.parent())
|
|
42
|
-
.and_then(|p| p.parent())
|
|
43
|
-
.and_then(|p| p.parent())
|
|
44
|
-
.and_then(|p| p.parent())
|
|
45
|
-
.map(|p| p.join("target/release"))
|
|
46
|
-
.expect("Failed to construct lib path");
|
|
47
|
-
println!("cargo:rustc-link-search={}", lib_path.display());
|
|
69
|
+
if target.contains("darwin") {
|
|
70
|
+
println!("cargo:rustc-link-arg=-Wl,-undefined,dynamic_lookup");
|
|
71
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
|
|
72
|
+
} else if target.contains("linux") {
|
|
73
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
|
|
48
74
|
}
|
|
49
|
-
}
|
|
50
75
|
|
|
51
|
-
|
|
52
|
-
|
|
76
|
+
println!("cargo:rerun-if-changed=build.rs");
|
|
77
|
+
}
|
|
@@ -78,8 +78,7 @@ pub struct CMetadataField {
|
|
|
78
78
|
}
|
|
79
79
|
|
|
80
80
|
// These C ABI functions are provided by the kreuzberg-ffi crate
|
|
81
|
-
//
|
|
82
|
-
#[link(name = "kreuzberg_ffi", kind = "static")]
|
|
81
|
+
// Linking is handled by build.rs to ensure static linking
|
|
83
82
|
unsafe extern "C" {
|
|
84
83
|
pub fn kreuzberg_last_error_code() -> i32;
|
|
85
84
|
pub fn kreuzberg_last_panic_context() -> *mut c_char;
|
|
@@ -3071,7 +3070,7 @@ fn validate_chunking_params(max_chars: usize, max_overlap: usize) -> Result<i32,
|
|
|
3071
3070
|
/// Gets valid binarization methods as a JSON string
|
|
3072
3071
|
///
|
|
3073
3072
|
/// @return [String] JSON array of valid binarization methods
|
|
3074
|
-
fn get_valid_binarization_methods(
|
|
3073
|
+
fn get_valid_binarization_methods(_ruby: &Ruby) -> Result<String, Error> {
|
|
3075
3074
|
let ptr = unsafe { kreuzberg_get_valid_binarization_methods() };
|
|
3076
3075
|
if ptr.is_null() {
|
|
3077
3076
|
return Err(runtime_error("Failed to get valid binarization methods"));
|
|
@@ -3095,7 +3094,7 @@ fn get_valid_binarization_methods(ruby: &Ruby) -> Result<String, Error> {
|
|
|
3095
3094
|
/// Gets valid language codes as a JSON string
|
|
3096
3095
|
///
|
|
3097
3096
|
/// @return [String] JSON array of valid language codes
|
|
3098
|
-
fn get_valid_language_codes(
|
|
3097
|
+
fn get_valid_language_codes(_ruby: &Ruby) -> Result<String, Error> {
|
|
3099
3098
|
let ptr = unsafe { kreuzberg_get_valid_language_codes() };
|
|
3100
3099
|
if ptr.is_null() {
|
|
3101
3100
|
return Err(runtime_error("Failed to get valid language codes"));
|
|
@@ -3119,7 +3118,7 @@ fn get_valid_language_codes(ruby: &Ruby) -> Result<String, Error> {
|
|
|
3119
3118
|
/// Gets valid OCR backends as a JSON string
|
|
3120
3119
|
///
|
|
3121
3120
|
/// @return [String] JSON array of valid OCR backends
|
|
3122
|
-
fn get_valid_ocr_backends(
|
|
3121
|
+
fn get_valid_ocr_backends(_ruby: &Ruby) -> Result<String, Error> {
|
|
3123
3122
|
let ptr = unsafe { kreuzberg_get_valid_ocr_backends() };
|
|
3124
3123
|
if ptr.is_null() {
|
|
3125
3124
|
return Err(runtime_error("Failed to get valid OCR backends"));
|
|
@@ -3143,7 +3142,7 @@ fn get_valid_ocr_backends(ruby: &Ruby) -> Result<String, Error> {
|
|
|
3143
3142
|
/// Gets valid token reduction levels as a JSON string
|
|
3144
3143
|
///
|
|
3145
3144
|
/// @return [String] JSON array of valid token reduction levels
|
|
3146
|
-
fn get_valid_token_reduction_levels(
|
|
3145
|
+
fn get_valid_token_reduction_levels(_ruby: &Ruby) -> Result<String, Error> {
|
|
3147
3146
|
let ptr = unsafe { kreuzberg_get_valid_token_reduction_levels() };
|
|
3148
3147
|
if ptr.is_null() {
|
|
3149
3148
|
return Err(runtime_error("Failed to get valid token reduction levels"));
|
|
@@ -3395,7 +3394,7 @@ fn get_error_details_native(ruby: &Ruby) -> Result<Value, Error> {
|
|
|
3395
3394
|
// SAFETY: FFI function is thread-safe and returns a struct with allocated C strings
|
|
3396
3395
|
let details = unsafe { kreuzberg_get_error_details() };
|
|
3397
3396
|
|
|
3398
|
-
let hash =
|
|
3397
|
+
let hash = ruby.hash_new();
|
|
3399
3398
|
|
|
3400
3399
|
// Convert C strings to Ruby strings, handling nulls safely
|
|
3401
3400
|
// SAFETY: All non-null pointers from FFI must be valid C strings
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
[workspace]
|
|
2
|
-
members = ["kreuzberg", "kreuzberg-tesseract"]
|
|
2
|
+
members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.0.0-rc.
|
|
5
|
+
version = "4.0.0-rc.18"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -163,6 +163,7 @@ fn convert_html_with_options_large_stack(html: String, options: ConversionOption
|
|
|
163
163
|
}
|
|
164
164
|
|
|
165
165
|
#[cfg(not(target_arch = "wasm32"))]
|
|
166
|
+
#[allow(dead_code)]
|
|
166
167
|
fn convert_inline_images_with_large_stack(
|
|
167
168
|
html: String,
|
|
168
169
|
options: ConversionOptions,
|
|
@@ -204,20 +205,6 @@ fn extract_panic_reason(panic: &Box<dyn Any + Send + 'static>) -> String {
|
|
|
204
205
|
}
|
|
205
206
|
|
|
206
207
|
// WASM implementations skip dedicated stack (not supported) and process inline
|
|
207
|
-
#[cfg(target_arch = "wasm32")]
|
|
208
|
-
fn convert_html_with_options_large_stack(html: String, options: ConversionOptions) -> Result<String> {
|
|
209
|
-
convert_html_with_options(&html, options)
|
|
210
|
-
}
|
|
211
|
-
|
|
212
|
-
#[cfg(target_arch = "wasm32")]
|
|
213
|
-
fn convert_inline_images_with_large_stack(
|
|
214
|
-
html: String,
|
|
215
|
-
options: ConversionOptions,
|
|
216
|
-
image_config: LibInlineImageConfig,
|
|
217
|
-
) -> Result<HtmlExtraction> {
|
|
218
|
-
convert_inline_images_with_options(&html, options, image_config)
|
|
219
|
-
}
|
|
220
|
-
|
|
221
208
|
/// Convert HTML to markdown with optional configuration.
|
|
222
209
|
///
|
|
223
210
|
/// Uses sensible defaults if no configuration is provided:
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
[package]
|
|
2
|
+
name = "kreuzberg-ffi"
|
|
3
|
+
version.workspace = true
|
|
4
|
+
edition.workspace = true
|
|
5
|
+
rust-version.workspace = true
|
|
6
|
+
authors.workspace = true
|
|
7
|
+
description = "C FFI bindings for Kreuzberg document intelligence library"
|
|
8
|
+
license.workspace = true
|
|
9
|
+
repository = "https://github.com/kreuzberg-dev/kreuzberg"
|
|
10
|
+
homepage = "https://kreuzberg.dev"
|
|
11
|
+
documentation = "https://docs.rs/kreuzberg-ffi"
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
keywords = ["ffi", "bindings", "document", "extraction", "api"]
|
|
14
|
+
categories = ["development-tools::ffi", "text-processing"]
|
|
15
|
+
|
|
16
|
+
[lib]
|
|
17
|
+
# cdylib: Required by Java (FFM API) and Go (cgo dynamic linking)
|
|
18
|
+
# staticlib: Required by Python (PyO3 static linking) to avoid dylib install_name issues on macOS
|
|
19
|
+
# rlib: Standard Rust library format for workspace dependencies
|
|
20
|
+
crate-type = ["cdylib", "staticlib", "rlib"]
|
|
21
|
+
|
|
22
|
+
[features]
|
|
23
|
+
# Mirror embeddings feature availability from kreuzberg dependency
|
|
24
|
+
embeddings = []
|
|
25
|
+
# Optional rayon for parallel batch processing
|
|
26
|
+
rayon = ["dep:rayon"]
|
|
27
|
+
# Re-export kreuzberg features for downstream conditional compilation
|
|
28
|
+
pdf = []
|
|
29
|
+
keywords-yake = []
|
|
30
|
+
keywords-rake = []
|
|
31
|
+
|
|
32
|
+
[dependencies]
|
|
33
|
+
serde_json = { workspace = true }
|
|
34
|
+
serde = { workspace = true }
|
|
35
|
+
async-trait = { workspace = true }
|
|
36
|
+
tokio = { workspace = true }
|
|
37
|
+
html-to-markdown-rs = { version = "2.16.1", default-features = false }
|
|
38
|
+
rayon = { version = "1.11", optional = true }
|
|
39
|
+
|
|
40
|
+
# On Windows MinGW, disable embeddings/ort since ONNX Runtime is not available
|
|
41
|
+
# in MinGW-compatible form. Use all other features but exclude embeddings.
|
|
42
|
+
[target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
|
|
43
|
+
kreuzberg = { path = "../kreuzberg", features = [
|
|
44
|
+
"pdf",
|
|
45
|
+
"excel",
|
|
46
|
+
"office",
|
|
47
|
+
"email",
|
|
48
|
+
"html",
|
|
49
|
+
"xml",
|
|
50
|
+
"archives",
|
|
51
|
+
"ocr",
|
|
52
|
+
"language-detection",
|
|
53
|
+
"chunking",
|
|
54
|
+
"quality",
|
|
55
|
+
"keywords",
|
|
56
|
+
"api",
|
|
57
|
+
"mcp",
|
|
58
|
+
"otel",
|
|
59
|
+
"bundled-pdfium",
|
|
60
|
+
] }
|
|
61
|
+
|
|
62
|
+
[target.'cfg(not(all(windows, target_env = "gnu")))'.dependencies]
|
|
63
|
+
kreuzberg = { path = "../kreuzberg", features = ["full", "bundled-pdfium"] }
|
|
64
|
+
|
|
65
|
+
[build-dependencies]
|
|
66
|
+
cbindgen = "0.29"
|
|
67
|
+
|
|
68
|
+
[dev-dependencies]
|
|
69
|
+
tempfile = { workspace = true }
|
|
70
|
+
criterion = { version = "0.8", features = ["html_reports"] }
|
|
71
|
+
|
|
72
|
+
[[bench]]
|
|
73
|
+
name = "result_view_benchmark"
|
|
74
|
+
harness = false
|