kreuzberg 4.0.0.pre.rc.8 → 4.0.0.pre.rc.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +12 -9
- data/README.md +22 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +397 -177
- data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
- data/ext/kreuzberg_rb/native/src/lib.rs +36 -13
- data/kreuzberg.gemspec +34 -2
- data/lib/kreuzberg/cache_api.rb +35 -0
- data/lib/kreuzberg/error_context.rb +49 -1
- data/lib/kreuzberg/extraction_api.rb +255 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +6 -0
- data/lib/libpdfium.dylib +0 -0
- data/sig/kreuzberg.rbs +9 -0
- data/vendor/Cargo.toml +44 -0
- data/vendor/kreuzberg/Cargo.toml +61 -38
- data/vendor/kreuzberg/README.md +36 -27
- data/vendor/kreuzberg/build.rs +197 -245
- data/vendor/kreuzberg/src/core/pipeline.rs +13 -0
- data/vendor/kreuzberg/src/embeddings.rs +71 -3
- data/vendor/kreuzberg/src/error.rs +1 -1
- data/vendor/kreuzberg/src/extraction/html.rs +37 -5
- data/vendor/kreuzberg/src/extractors/pdf.rs +93 -44
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +19 -1
- data/vendor/kreuzberg/src/pdf/metadata.rs +2 -2
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +2 -2
- data/vendor/kreuzberg/src/pdf/table.rs +3 -0
- data/vendor/kreuzberg/src/pdf/text.rs +2 -2
- data/vendor/kreuzberg/src/text/quality_processor.rs +1 -1
- data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -1
- data/vendor/kreuzberg/tests/format_integration.rs +4 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/build.rs +176 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
- data/vendor/kreuzberg-tesseract/LICENSE +22 -0
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1354 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- metadata +39 -3
- data/vendor/rb-sys/bin/release.sh +0 -21
data/vendor/kreuzberg/build.rs
CHANGED
|
@@ -1,3 +1,29 @@
|
|
|
1
|
+
// Kreuzberg Build Script - PDFium Linking Configuration
|
|
2
|
+
//
|
|
3
|
+
// This build script handles PDFium library downloading and linking for the kreuzberg crate.
|
|
4
|
+
// It supports multiple linking strategies via Cargo features:
|
|
5
|
+
//
|
|
6
|
+
// 1. Default (pdf, bundled-pdfium): Download dynamic library and embed in binary
|
|
7
|
+
// - Self-contained binary that extracts library at runtime
|
|
8
|
+
// - Larger binary size but no external .so dependency
|
|
9
|
+
// - No PDFIUM_*_PATH environment variables needed
|
|
10
|
+
//
|
|
11
|
+
// 2. static-pdfium: Static linking (no runtime dependency)
|
|
12
|
+
// - REQUIRES: PDFIUM_STATIC_LIB_PATH environment variable pointing to libpdfium.a directory
|
|
13
|
+
// - Reason: bblanchon/pdfium-binaries only provides dynamic libraries
|
|
14
|
+
// - Use case: Docker with musl, fully static binaries
|
|
15
|
+
// - Note: libpdfium.a must be obtained separately (e.g., paulocoutinhox/pdfium-lib)
|
|
16
|
+
//
|
|
17
|
+
// 3. system-pdfium: Use system-installed pdfium
|
|
18
|
+
// - Detected via pkg-config or KREUZBERG_PDFIUM_SYSTEM_PATH
|
|
19
|
+
//
|
|
20
|
+
// Environment Variables:
|
|
21
|
+
// - PDFIUM_STATIC_LIB_PATH: Path to directory containing libpdfium.a (for static-pdfium)
|
|
22
|
+
// - KREUZBERG_PDFIUM_PREBUILT: Path to prebuilt pdfium directory (skip download)
|
|
23
|
+
// - KREUZBERG_PDFIUM_SYSTEM_PATH: System pdfium library path (for system-pdfium)
|
|
24
|
+
// - PDFIUM_VERSION: Override version for bblanchon/pdfium-binaries
|
|
25
|
+
// - KREUZBERG_PDFIUM_DOWNLOAD_RETRIES: Number of download retries (default: 5)
|
|
26
|
+
|
|
1
27
|
use std::env;
|
|
2
28
|
use std::fs;
|
|
3
29
|
use std::io;
|
|
@@ -9,13 +35,11 @@ use std::time::Duration;
|
|
|
9
35
|
/// PDFium linking strategy
|
|
10
36
|
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
|
11
37
|
enum PdfiumLinkStrategy {
|
|
12
|
-
/// Download and link
|
|
13
|
-
DownloadDynamic,
|
|
14
|
-
/// Download and link statically (pdf-static feature)
|
|
38
|
+
/// Download and link statically (static-pdfium feature)
|
|
15
39
|
DownloadStatic,
|
|
16
|
-
/// Download, link dynamically, and embed in binary (
|
|
40
|
+
/// Download, link dynamically, and embed in binary (bundled-pdfium feature)
|
|
17
41
|
Bundled,
|
|
18
|
-
/// Use system-installed pdfium via pkg-config (
|
|
42
|
+
/// Use system-installed pdfium via pkg-config (system-pdfium feature)
|
|
19
43
|
System,
|
|
20
44
|
}
|
|
21
45
|
|
|
@@ -35,17 +59,11 @@ fn main() {
|
|
|
35
59
|
return;
|
|
36
60
|
}
|
|
37
61
|
|
|
38
|
-
validate_feature_exclusivity();
|
|
39
62
|
let strategy = determine_link_strategy(&target);
|
|
40
63
|
|
|
41
64
|
tracing::debug!("Using PDFium linking strategy: {:?}", strategy);
|
|
42
65
|
|
|
43
66
|
match strategy {
|
|
44
|
-
PdfiumLinkStrategy::DownloadDynamic => {
|
|
45
|
-
let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
|
|
46
|
-
link_dynamically(&pdfium_dir, &target);
|
|
47
|
-
copy_lib_to_package(&pdfium_dir, &target);
|
|
48
|
-
}
|
|
49
67
|
PdfiumLinkStrategy::DownloadStatic => {
|
|
50
68
|
let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
|
|
51
69
|
link_statically(&pdfium_dir, &target);
|
|
@@ -70,43 +88,49 @@ fn main() {
|
|
|
70
88
|
// FEATURE & STRATEGY VALIDATION
|
|
71
89
|
// ============================================================================
|
|
72
90
|
|
|
73
|
-
/// Validate that only one linking strategy feature is enabled at a time
|
|
74
|
-
fn validate_feature_exclusivity() {
|
|
75
|
-
let strategies = [
|
|
76
|
-
cfg!(feature = "pdf-static"),
|
|
77
|
-
cfg!(feature = "pdf-bundled"),
|
|
78
|
-
cfg!(feature = "pdf-system"),
|
|
79
|
-
];
|
|
80
|
-
let count = strategies.iter().filter(|&&x| x).count();
|
|
81
|
-
|
|
82
|
-
if count > 1 {
|
|
83
|
-
panic!(
|
|
84
|
-
"Only one of pdf-static, pdf-bundled, pdf-system can be enabled at once.\n\
|
|
85
|
-
Please choose a single PDFium linking strategy."
|
|
86
|
-
);
|
|
87
|
-
}
|
|
88
|
-
}
|
|
89
|
-
|
|
90
91
|
/// Determine which linking strategy to use based on features and target
|
|
91
92
|
fn determine_link_strategy(target: &str) -> PdfiumLinkStrategy {
|
|
92
|
-
// WASM
|
|
93
|
+
// WASM handling: check for PDFIUM_WASM_LIB environment variable
|
|
93
94
|
if target.contains("wasm") {
|
|
94
|
-
|
|
95
|
+
if let Ok(wasm_lib) = env::var("PDFIUM_WASM_LIB") {
|
|
96
|
+
println!("cargo:rustc-link-search=native={}", wasm_lib);
|
|
97
|
+
println!("cargo:rustc-link-lib=static=pdfium");
|
|
98
|
+
return PdfiumLinkStrategy::DownloadStatic;
|
|
99
|
+
}
|
|
100
|
+
// For WASM without explicit PDFIUM_WASM_LIB, use bundled strategy
|
|
101
|
+
// This downloads pdfium-lib which provides WASM-compatible builds
|
|
102
|
+
println!("cargo:warning=WASM build using bundled PDFium (set PDFIUM_WASM_LIB to link custom WASM PDFium)");
|
|
103
|
+
return PdfiumLinkStrategy::Bundled;
|
|
95
104
|
}
|
|
96
105
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
106
|
+
let system_pdfium = cfg!(feature = "system-pdfium");
|
|
107
|
+
let bundled_pdfium = cfg!(feature = "bundled-pdfium");
|
|
108
|
+
let static_pdfium = cfg!(feature = "static-pdfium");
|
|
109
|
+
|
|
110
|
+
let enabled_count = usize::from(system_pdfium) + usize::from(bundled_pdfium) + usize::from(static_pdfium);
|
|
111
|
+
if enabled_count > 1 {
|
|
112
|
+
println!(
|
|
113
|
+
"cargo:warning=Multiple PDFium linking strategies enabled (static-pdfium={}, bundled-pdfium={}, system-pdfium={}); using bundled-pdfium for this build",
|
|
114
|
+
static_pdfium, bundled_pdfium, system_pdfium
|
|
115
|
+
);
|
|
100
116
|
}
|
|
101
|
-
|
|
117
|
+
|
|
118
|
+
// Feature-based strategy selection.
|
|
119
|
+
// Prefer bundled-pdfium when multiple strategies are enabled (e.g. `--all-features`) because it
|
|
120
|
+
// does not require external PDFIUM_STATIC_LIB_PATH and does not depend on a system install.
|
|
121
|
+
if bundled_pdfium {
|
|
102
122
|
return PdfiumLinkStrategy::Bundled;
|
|
103
123
|
}
|
|
104
|
-
if
|
|
124
|
+
if system_pdfium {
|
|
125
|
+
return PdfiumLinkStrategy::System;
|
|
126
|
+
}
|
|
127
|
+
if static_pdfium {
|
|
105
128
|
return PdfiumLinkStrategy::DownloadStatic;
|
|
106
129
|
}
|
|
107
130
|
|
|
108
|
-
// Default: download and link dynamically
|
|
109
|
-
|
|
131
|
+
// Default: download and link dynamically (bundled-pdfium preferred if pdf not already selected)
|
|
132
|
+
// When only 'pdf' feature is enabled (no linking strategy), default to bundled-pdfium
|
|
133
|
+
PdfiumLinkStrategy::Bundled
|
|
110
134
|
}
|
|
111
135
|
|
|
112
136
|
// ============================================================================
|
|
@@ -136,6 +160,9 @@ fn download_or_use_prebuilt(target: &str, out_dir: &Path) -> PathBuf {
|
|
|
136
160
|
if prebuilt_path.exists() {
|
|
137
161
|
prepare_prebuilt_pdfium(&prebuilt_path, &pdfium_dir)
|
|
138
162
|
.unwrap_or_else(|err| panic!("Failed to copy Pdfium from {}: {}", prebuilt_path.display(), err));
|
|
163
|
+
if target.contains("windows") {
|
|
164
|
+
ensure_windows_import_library(&pdfium_dir);
|
|
165
|
+
}
|
|
139
166
|
return pdfium_dir;
|
|
140
167
|
} else {
|
|
141
168
|
panic!(
|
|
@@ -166,19 +193,34 @@ fn download_or_use_prebuilt(target: &str, out_dir: &Path) -> PathBuf {
|
|
|
166
193
|
|
|
167
194
|
// Windows-specific: ensure pdfium.lib exists
|
|
168
195
|
if target.contains("windows") {
|
|
169
|
-
|
|
170
|
-
let dll_lib = lib_dir.join("pdfium.dll.lib");
|
|
171
|
-
let expected_lib = lib_dir.join("pdfium.lib");
|
|
172
|
-
|
|
173
|
-
if dll_lib.exists() && !expected_lib.exists() {
|
|
174
|
-
tracing::debug!("Renaming cached {} to {}", dll_lib.display(), expected_lib.display());
|
|
175
|
-
fs::rename(&dll_lib, &expected_lib).expect("Failed to rename pdfium.dll.lib to pdfium.lib");
|
|
176
|
-
}
|
|
196
|
+
ensure_windows_import_library(&pdfium_dir);
|
|
177
197
|
}
|
|
178
198
|
|
|
179
199
|
pdfium_dir
|
|
180
200
|
}
|
|
181
201
|
|
|
202
|
+
fn ensure_windows_import_library(pdfium_dir: &Path) {
|
|
203
|
+
let lib_dir = pdfium_dir.join("lib");
|
|
204
|
+
let dll_lib = lib_dir.join("pdfium.dll.lib");
|
|
205
|
+
let expected_lib = lib_dir.join("pdfium.lib");
|
|
206
|
+
|
|
207
|
+
if dll_lib.exists() && !expected_lib.exists() {
|
|
208
|
+
tracing::debug!(
|
|
209
|
+
"Ensuring Windows import library at {} (source: {})",
|
|
210
|
+
expected_lib.display(),
|
|
211
|
+
dll_lib.display()
|
|
212
|
+
);
|
|
213
|
+
fs::copy(&dll_lib, &expected_lib).unwrap_or_else(|err| {
|
|
214
|
+
panic!(
|
|
215
|
+
"Failed to copy Windows import library from {} to {}: {}",
|
|
216
|
+
dll_lib.display(),
|
|
217
|
+
expected_lib.display(),
|
|
218
|
+
err
|
|
219
|
+
)
|
|
220
|
+
});
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
|
|
182
224
|
// ============================================================================
|
|
183
225
|
// DOWNLOAD UTILITIES
|
|
184
226
|
// ============================================================================
|
|
@@ -436,13 +478,14 @@ fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
|
|
|
436
478
|
/// Get platform-specific runtime library name and subdirectory
|
|
437
479
|
///
|
|
438
480
|
/// Returns tuple of (library_name, subdirectory) for the target platform:
|
|
439
|
-
/// - WASM: ("libpdfium.a", "lib")
|
|
481
|
+
/// - WASM: ("libpdfium.a", "release/lib")
|
|
440
482
|
/// - Windows: ("pdfium.dll", "bin")
|
|
441
483
|
/// - macOS: ("libpdfium.dylib", "lib")
|
|
442
484
|
/// - Linux: ("libpdfium.so", "lib")
|
|
443
485
|
fn runtime_library_info(target: &str) -> (String, &'static str) {
|
|
444
486
|
if target.contains("wasm") {
|
|
445
|
-
|
|
487
|
+
// pdfium-lib `wasm.tgz` extracts into `release/lib/libpdfium.a`
|
|
488
|
+
("libpdfium.a".to_string(), "release/lib")
|
|
446
489
|
} else if target.contains("windows") {
|
|
447
490
|
("pdfium.dll".to_string(), "bin")
|
|
448
491
|
} else if target.contains("darwin") {
|
|
@@ -516,69 +559,6 @@ fn find_pdfium_library(pdfium_dir: &Path, lib_name: &str, expected_subdir: &str)
|
|
|
516
559
|
Err(error_msg)
|
|
517
560
|
}
|
|
518
561
|
|
|
519
|
-
/// Fix macOS install name (rpath) for dynamic library
|
|
520
|
-
///
|
|
521
|
-
/// Uses install_name_tool to set the install name to @rpath/{lib_name}
|
|
522
|
-
/// to enable relative path loading on macOS.
|
|
523
|
-
fn fix_macos_install_name(lib_path: &Path, lib_name: &str) {
|
|
524
|
-
let new_install_name = format!("@rpath/{}", lib_name);
|
|
525
|
-
|
|
526
|
-
tracing::debug!("Fixing install_name for {} to {}", lib_path.display(), new_install_name);
|
|
527
|
-
|
|
528
|
-
let status = Command::new("install_name_tool")
|
|
529
|
-
.arg("-id")
|
|
530
|
-
.arg(&new_install_name)
|
|
531
|
-
.arg(lib_path)
|
|
532
|
-
.status();
|
|
533
|
-
|
|
534
|
-
match status {
|
|
535
|
-
Ok(s) if s.success() => {
|
|
536
|
-
tracing::debug!("Successfully updated install_name");
|
|
537
|
-
}
|
|
538
|
-
Ok(s) => {
|
|
539
|
-
tracing::debug!("install_name_tool failed with status: {}", s);
|
|
540
|
-
}
|
|
541
|
-
Err(e) => {
|
|
542
|
-
tracing::debug!("Failed to run install_name_tool: {}", e);
|
|
543
|
-
}
|
|
544
|
-
}
|
|
545
|
-
}
|
|
546
|
-
|
|
547
|
-
/// Code sign binary on macOS if needed
|
|
548
|
-
///
|
|
549
|
-
/// Uses codesign to sign the binary. Identity from KREUZBERG_CODESIGN_IDENTITY
|
|
550
|
-
/// env var (default: "-" for adhoc signing). Only runs on apple-darwin targets.
|
|
551
|
-
fn codesign_if_needed(target: &str, binary: &Path) {
|
|
552
|
-
if !target.contains("apple-darwin") || !binary.exists() {
|
|
553
|
-
return;
|
|
554
|
-
}
|
|
555
|
-
|
|
556
|
-
let identity = env::var("KREUZBERG_CODESIGN_IDENTITY").unwrap_or_else(|_| "-".to_string());
|
|
557
|
-
let status = Command::new("codesign")
|
|
558
|
-
.arg("--force")
|
|
559
|
-
.arg("--timestamp=none")
|
|
560
|
-
.arg("--sign")
|
|
561
|
-
.arg(identity)
|
|
562
|
-
.arg(binary)
|
|
563
|
-
.status();
|
|
564
|
-
|
|
565
|
-
match status {
|
|
566
|
-
Ok(result) if result.success() => {
|
|
567
|
-
tracing::debug!("Codesigned {}", binary.display());
|
|
568
|
-
}
|
|
569
|
-
Ok(result) => {
|
|
570
|
-
tracing::debug!(
|
|
571
|
-
"codesign exited with status {} while signing {}",
|
|
572
|
-
result,
|
|
573
|
-
binary.display()
|
|
574
|
-
);
|
|
575
|
-
}
|
|
576
|
-
Err(err) => {
|
|
577
|
-
tracing::debug!("Failed to run codesign for {}: {}", binary.display(), err);
|
|
578
|
-
}
|
|
579
|
-
}
|
|
580
|
-
}
|
|
581
|
-
|
|
582
562
|
// ============================================================================
|
|
583
563
|
// LINKING STRATEGIES
|
|
584
564
|
// ============================================================================
|
|
@@ -622,18 +602,100 @@ fn link_dynamically(pdfium_dir: &Path, target: &str) {
|
|
|
622
602
|
}
|
|
623
603
|
}
|
|
624
604
|
|
|
625
|
-
/// Link PDFium statically (
|
|
605
|
+
/// Link PDFium statically (static-pdfium feature)
|
|
626
606
|
///
|
|
627
607
|
/// Embeds PDFium into the binary as a static library. Adds system
|
|
628
608
|
/// dependencies required for static linking on Linux.
|
|
629
609
|
/// Supports flexible archive structures by finding library in multiple locations.
|
|
610
|
+
///
|
|
611
|
+
/// Environment Variables:
|
|
612
|
+
/// - `PDFIUM_STATIC_LIB_PATH`: Path to directory containing libpdfium.a (for Docker/musl builds)
|
|
613
|
+
///
|
|
614
|
+
/// Note: bblanchon/pdfium-binaries only provides dynamic libraries.
|
|
615
|
+
/// On macOS, this will fallback to dynamic linking with a warning.
|
|
616
|
+
/// On Linux, you must provide PDFIUM_STATIC_LIB_PATH pointing to a static build.
|
|
630
617
|
fn link_statically(pdfium_dir: &Path, target: &str) {
|
|
631
|
-
|
|
618
|
+
// For static linking, we need libpdfium.a (not .dylib or .so)
|
|
619
|
+
let static_lib_name = "libpdfium.a";
|
|
620
|
+
let lib_subdir = if target.contains("wasm") { "release/lib" } else { "lib" };
|
|
621
|
+
|
|
622
|
+
// First, check if user provided a static library path via environment variable
|
|
623
|
+
if let Ok(custom_path) = env::var("PDFIUM_STATIC_LIB_PATH") {
|
|
624
|
+
let custom_lib_dir = PathBuf::from(&custom_path);
|
|
625
|
+
|
|
626
|
+
if !custom_lib_dir.exists() {
|
|
627
|
+
panic!(
|
|
628
|
+
"PDFIUM_STATIC_LIB_PATH points to '{}' but the directory does not exist",
|
|
629
|
+
custom_path
|
|
630
|
+
);
|
|
631
|
+
}
|
|
632
|
+
|
|
633
|
+
let custom_lib = custom_lib_dir.join(static_lib_name);
|
|
634
|
+
if !custom_lib.exists() {
|
|
635
|
+
panic!(
|
|
636
|
+
"PDFIUM_STATIC_LIB_PATH points to '{}' but {} not found.\n\
|
|
637
|
+
Expected to find: {}",
|
|
638
|
+
custom_path,
|
|
639
|
+
static_lib_name,
|
|
640
|
+
custom_lib.display()
|
|
641
|
+
);
|
|
642
|
+
}
|
|
643
|
+
|
|
644
|
+
tracing::debug!("Using custom static PDFium from: {}", custom_lib.display());
|
|
645
|
+
println!("cargo:rustc-link-search=native={}", custom_lib_dir.display());
|
|
646
|
+
println!("cargo:rustc-link-lib=static=pdfium");
|
|
647
|
+
|
|
648
|
+
// Static linking requires additional system dependencies
|
|
649
|
+
if target.contains("linux") {
|
|
650
|
+
println!("cargo:rustc-link-lib=dylib=pthread");
|
|
651
|
+
println!("cargo:rustc-link-lib=dylib=dl");
|
|
652
|
+
} else if target.contains("windows") {
|
|
653
|
+
println!("cargo:rustc-link-lib=dylib=ws2_32");
|
|
654
|
+
println!("cargo:rustc-link-lib=dylib=userenv");
|
|
655
|
+
}
|
|
656
|
+
|
|
657
|
+
return;
|
|
658
|
+
}
|
|
632
659
|
|
|
633
660
|
// Find the actual library location (handles multiple possible archive structures)
|
|
634
|
-
let lib_path = match find_pdfium_library(pdfium_dir,
|
|
661
|
+
let lib_path = match find_pdfium_library(pdfium_dir, static_lib_name, lib_subdir) {
|
|
635
662
|
Ok(path) => path.parent().unwrap_or(pdfium_dir).to_path_buf(),
|
|
636
|
-
Err(
|
|
663
|
+
Err(_err) => {
|
|
664
|
+
// Static library not found - check if we're on macOS and can fallback
|
|
665
|
+
if target.contains("darwin") {
|
|
666
|
+
eprintln!("cargo:warning=Static PDFium library (libpdfium.a) not found for macOS.");
|
|
667
|
+
eprintln!("cargo:warning=bblanchon/pdfium-binaries only provides dynamic libraries.");
|
|
668
|
+
eprintln!("cargo:warning=Falling back to dynamic linking for local development.");
|
|
669
|
+
eprintln!("cargo:warning=Production Linux builds require PDFIUM_STATIC_LIB_PATH.");
|
|
670
|
+
|
|
671
|
+
// Fallback to dynamic linking on macOS
|
|
672
|
+
link_dynamically(pdfium_dir, target);
|
|
673
|
+
return;
|
|
674
|
+
} else {
|
|
675
|
+
// On Linux/Windows, provide helpful error with actionable steps
|
|
676
|
+
panic!(
|
|
677
|
+
"Static PDFium library (libpdfium.a) not found.\n\n\
|
|
678
|
+
bblanchon/pdfium-binaries only provides dynamic libraries.\n\n\
|
|
679
|
+
For static linking (required for Docker with musl), you must:\n\n\
|
|
680
|
+
1. Build static PDFium or obtain from a source that provides it\n\
|
|
681
|
+
- See: https://github.com/ajrcarey/pdfium-render/issues/53\n\
|
|
682
|
+
- Or use: https://github.com/paulocoutinhox/pdfium-lib (provides static builds)\n\n\
|
|
683
|
+
2. Set environment variable pointing to the directory containing libpdfium.a:\n\
|
|
684
|
+
export PDFIUM_STATIC_LIB_PATH=/path/to/pdfium/lib\n\n\
|
|
685
|
+
3. Or use alternative features:\n\
|
|
686
|
+
- 'pdf' (dynamic linking, requires .so at runtime)\n\
|
|
687
|
+
- 'bundled-pdfium' (embeds dynamic library in binary)\n\
|
|
688
|
+
- 'system-pdfium' (use system-installed pdfium)\n\n\
|
|
689
|
+
Example Dockerfile pattern:\n\
|
|
690
|
+
FROM alpine:latest as pdfium-builder\n\
|
|
691
|
+
# Download/build static libpdfium.a\n\
|
|
692
|
+
\n\
|
|
693
|
+
FROM rust:alpine as builder\n\
|
|
694
|
+
ENV PDFIUM_STATIC_LIB_PATH=/pdfium/lib\n\
|
|
695
|
+
COPY --from=pdfium-builder /path/to/libpdfium.a /pdfium/lib/"
|
|
696
|
+
);
|
|
697
|
+
}
|
|
698
|
+
}
|
|
637
699
|
};
|
|
638
700
|
|
|
639
701
|
println!("cargo:rustc-link-search=native={}", lib_path.display());
|
|
@@ -653,21 +715,22 @@ fn link_statically(pdfium_dir: &Path, target: &str) {
|
|
|
653
715
|
|
|
654
716
|
// Static linking requires additional system dependencies
|
|
655
717
|
if target.contains("linux") {
|
|
656
|
-
// Linux requires additional libraries for static linking
|
|
657
718
|
println!("cargo:rustc-link-lib=dylib=pthread");
|
|
658
719
|
println!("cargo:rustc-link-lib=dylib=dl");
|
|
720
|
+
} else if target.contains("windows") {
|
|
721
|
+
println!("cargo:rustc-link-lib=dylib=ws2_32");
|
|
722
|
+
println!("cargo:rustc-link-lib=dylib=userenv");
|
|
659
723
|
}
|
|
660
724
|
}
|
|
661
725
|
|
|
662
|
-
/// Link PDFium bundled (
|
|
726
|
+
/// Link PDFium bundled (bundled-pdfium feature)
|
|
663
727
|
///
|
|
664
728
|
/// Links dynamically but copies library to OUT_DIR for embedding in binary.
|
|
665
729
|
/// Each binary extracts and uses its own copy of the PDFium library.
|
|
666
730
|
/// Supports flexible archive structures by finding library in multiple locations.
|
|
731
|
+
///
|
|
732
|
+
/// For WASM targets, links statically using the bundled static library.
|
|
667
733
|
fn link_bundled(pdfium_dir: &Path, target: &str, out_dir: &Path) {
|
|
668
|
-
// Link dynamically for build
|
|
669
|
-
link_dynamically(pdfium_dir, target);
|
|
670
|
-
|
|
671
734
|
// Copy library to OUT_DIR for bundling using flexible detection
|
|
672
735
|
let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
|
|
673
736
|
let src_lib = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
|
|
@@ -685,10 +748,20 @@ fn link_bundled(pdfium_dir: &Path, target: &str, out_dir: &Path) {
|
|
|
685
748
|
.unwrap_or_else(|| panic!("Non-UTF8 path for bundled library: {}", bundled_lib.display()));
|
|
686
749
|
println!("cargo:rustc-env=KREUZBERG_PDFIUM_BUNDLED_PATH={}", bundled_path);
|
|
687
750
|
|
|
688
|
-
|
|
751
|
+
// For WASM, link statically using the bundled library
|
|
752
|
+
if target.contains("wasm") {
|
|
753
|
+
let lib_dir = bundled_lib
|
|
754
|
+
.parent()
|
|
755
|
+
.unwrap_or_else(|| panic!("Invalid bundled library path: {}", bundled_lib.display()));
|
|
756
|
+
println!("cargo:rustc-link-search=native={}", lib_dir.display());
|
|
757
|
+
println!("cargo:rustc-link-lib=static=pdfium");
|
|
758
|
+
tracing::debug!("Bundled PDFium static library linked for WASM at: {}", bundled_path);
|
|
759
|
+
} else {
|
|
760
|
+
tracing::debug!("Bundled PDFium library at: {}", bundled_path);
|
|
761
|
+
}
|
|
689
762
|
}
|
|
690
763
|
|
|
691
|
-
/// Link system-installed PDFium (
|
|
764
|
+
/// Link system-installed PDFium (system-pdfium feature)
|
|
692
765
|
///
|
|
693
766
|
/// Attempts to find PDFium via pkg-config first, then falls back to
|
|
694
767
|
/// environment variables (KREUZBERG_PDFIUM_SYSTEM_PATH, KREUZBERG_PDFIUM_SYSTEM_INCLUDE).
|
|
@@ -733,7 +806,7 @@ fn link_system(_target: &str) {
|
|
|
733
806
|
|
|
734
807
|
// No system pdfium found
|
|
735
808
|
panic!(
|
|
736
|
-
"
|
|
809
|
+
"system-pdfium feature enabled but pdfium not found.\n\
|
|
737
810
|
\n\
|
|
738
811
|
Please install pdfium system-wide or provide:\n\
|
|
739
812
|
- KREUZBERG_PDFIUM_SYSTEM_PATH: path to directory containing libpdfium\n\
|
|
@@ -741,8 +814,8 @@ fn link_system(_target: &str) {
|
|
|
741
814
|
\n\
|
|
742
815
|
Alternatively, use a different linking strategy:\n\
|
|
743
816
|
- Default (dynamic): cargo build --features pdf\n\
|
|
744
|
-
- Static linking: cargo build --features pdf,
|
|
745
|
-
- Bundled: cargo build --features pdf,
|
|
817
|
+
- Static linking: cargo build --features pdf,static-pdfium\n\
|
|
818
|
+
- Bundled: cargo build --features pdf,bundled-pdfium"
|
|
746
819
|
);
|
|
747
820
|
}
|
|
748
821
|
|
|
@@ -768,124 +841,3 @@ fn link_system_frameworks(target: &str) {
|
|
|
768
841
|
println!("cargo:rustc-link-lib=dylib=advapi32");
|
|
769
842
|
}
|
|
770
843
|
}
|
|
771
|
-
|
|
772
|
-
// ============================================================================
|
|
773
|
-
// LIBRARY DISTRIBUTION
|
|
774
|
-
// ============================================================================
|
|
775
|
-
|
|
776
|
-
/// Copy PDFium library to various package directories
|
|
777
|
-
///
|
|
778
|
-
/// Distributes the compiled/downloaded PDFium library to:
|
|
779
|
-
/// - CLI target directories (debug/release)
|
|
780
|
-
/// - Python package directory
|
|
781
|
-
/// - Node.js package directory
|
|
782
|
-
/// - Ruby gem directory
|
|
783
|
-
///
|
|
784
|
-
/// On macOS, also fixes install_name and applies code signing.
|
|
785
|
-
/// Supports flexible archive structures by finding library in multiple locations.
|
|
786
|
-
fn copy_lib_to_package(pdfium_dir: &Path, target: &str) {
|
|
787
|
-
let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
|
|
788
|
-
let src_lib = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
|
|
789
|
-
Ok(path) => path,
|
|
790
|
-
Err(err) => {
|
|
791
|
-
tracing::debug!("Failed to locate PDFium library: {}", err);
|
|
792
|
-
return;
|
|
793
|
-
}
|
|
794
|
-
};
|
|
795
|
-
|
|
796
|
-
if target.contains("darwin") {
|
|
797
|
-
fix_macos_install_name(&src_lib, &runtime_lib_name);
|
|
798
|
-
codesign_if_needed(target, &src_lib);
|
|
799
|
-
}
|
|
800
|
-
|
|
801
|
-
let crate_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
|
|
802
|
-
let workspace_root = crate_dir.parent().unwrap().parent().unwrap();
|
|
803
|
-
|
|
804
|
-
if let Ok(profile) = env::var("PROFILE") {
|
|
805
|
-
let target_dir = if let Ok(cargo_target) = env::var("TARGET") {
|
|
806
|
-
workspace_root.join("target").join(cargo_target).join(&profile)
|
|
807
|
-
} else {
|
|
808
|
-
workspace_root.join("target").join(&profile)
|
|
809
|
-
};
|
|
810
|
-
|
|
811
|
-
if target_dir.exists() {
|
|
812
|
-
copy_lib_if_needed(
|
|
813
|
-
&src_lib,
|
|
814
|
-
&target_dir.join(&runtime_lib_name),
|
|
815
|
-
"CLI target directory",
|
|
816
|
-
target,
|
|
817
|
-
);
|
|
818
|
-
}
|
|
819
|
-
|
|
820
|
-
let simple_target_dir = workspace_root.join("target").join(&profile);
|
|
821
|
-
if simple_target_dir != target_dir {
|
|
822
|
-
fs::create_dir_all(&simple_target_dir).ok();
|
|
823
|
-
copy_lib_if_needed(
|
|
824
|
-
&src_lib,
|
|
825
|
-
&simple_target_dir.join(&runtime_lib_name),
|
|
826
|
-
"Java FFI target directory",
|
|
827
|
-
target,
|
|
828
|
-
);
|
|
829
|
-
}
|
|
830
|
-
}
|
|
831
|
-
|
|
832
|
-
let python_dest_dir = workspace_root.join("packages").join("python").join("kreuzberg");
|
|
833
|
-
if python_dest_dir.exists() {
|
|
834
|
-
copy_lib_if_needed(
|
|
835
|
-
&src_lib,
|
|
836
|
-
&python_dest_dir.join(&runtime_lib_name),
|
|
837
|
-
"Python package",
|
|
838
|
-
target,
|
|
839
|
-
);
|
|
840
|
-
} else {
|
|
841
|
-
tracing::debug!("Python package directory not found, skipping Python library copy");
|
|
842
|
-
}
|
|
843
|
-
|
|
844
|
-
let node_dest_dir = workspace_root.join("crates").join("kreuzberg-node");
|
|
845
|
-
if node_dest_dir.exists() {
|
|
846
|
-
copy_lib_if_needed(
|
|
847
|
-
&src_lib,
|
|
848
|
-
&node_dest_dir.join(&runtime_lib_name),
|
|
849
|
-
"Node.js package",
|
|
850
|
-
target,
|
|
851
|
-
);
|
|
852
|
-
} else {
|
|
853
|
-
tracing::debug!("Node.js package directory not found, skipping Node library copy");
|
|
854
|
-
}
|
|
855
|
-
|
|
856
|
-
let ruby_dest_dir = workspace_root.join("packages").join("ruby").join("lib");
|
|
857
|
-
if ruby_dest_dir.exists() {
|
|
858
|
-
copy_lib_if_needed(&src_lib, &ruby_dest_dir.join(&runtime_lib_name), "Ruby package", target);
|
|
859
|
-
} else {
|
|
860
|
-
tracing::debug!("Ruby package directory not found, skipping Ruby library copy");
|
|
861
|
-
}
|
|
862
|
-
}
|
|
863
|
-
|
|
864
|
-
/// Copy library to destination if needed (based on modification time)
|
|
865
|
-
///
|
|
866
|
-
/// Only copies if destination doesn't exist or source is newer than destination.
|
|
867
|
-
/// Applies platform-specific post-processing (code signing on macOS).
|
|
868
|
-
fn copy_lib_if_needed(src: &Path, dest: &Path, package_name: &str, target: &str) {
|
|
869
|
-
use std::fs;
|
|
870
|
-
|
|
871
|
-
let should_copy = if dest.exists() {
|
|
872
|
-
let src_metadata = fs::metadata(src).ok();
|
|
873
|
-
let dest_metadata = fs::metadata(dest).ok();
|
|
874
|
-
match (src_metadata, dest_metadata) {
|
|
875
|
-
(Some(src), Some(dest)) => src.modified().ok() > dest.modified().ok(),
|
|
876
|
-
_ => true,
|
|
877
|
-
}
|
|
878
|
-
} else {
|
|
879
|
-
true
|
|
880
|
-
};
|
|
881
|
-
|
|
882
|
-
if should_copy {
|
|
883
|
-
match fs::copy(src, dest) {
|
|
884
|
-
Ok(_) => {
|
|
885
|
-
tracing::debug!("Copied {} to {} ({})", src.display(), dest.display(), package_name);
|
|
886
|
-
codesign_if_needed(target, dest);
|
|
887
|
-
}
|
|
888
|
-
Err(e) => tracing::debug!("Failed to copy library to {}: {}", package_name, e),
|
|
889
|
-
}
|
|
890
|
-
}
|
|
891
|
-
}
|
|
@@ -653,6 +653,18 @@ mod tests {
|
|
|
653
653
|
#[tokio::test]
|
|
654
654
|
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
655
655
|
async fn test_pipeline_with_keyword_extraction() {
|
|
656
|
+
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
657
|
+
crate::plugins::registry::get_validator_registry()
|
|
658
|
+
.write()
|
|
659
|
+
.unwrap()
|
|
660
|
+
.shutdown_all()
|
|
661
|
+
.unwrap();
|
|
662
|
+
crate::plugins::registry::get_post_processor_registry()
|
|
663
|
+
.write()
|
|
664
|
+
.unwrap()
|
|
665
|
+
.shutdown_all()
|
|
666
|
+
.unwrap();
|
|
667
|
+
|
|
656
668
|
let _ = crate::keywords::register_keyword_processor();
|
|
657
669
|
|
|
658
670
|
let result = ExtractionResult {
|
|
@@ -703,6 +715,7 @@ Natural language processing enables computers to understand human language.
|
|
|
703
715
|
#[tokio::test]
|
|
704
716
|
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
705
717
|
async fn test_pipeline_without_keyword_config() {
|
|
718
|
+
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
706
719
|
let result = ExtractionResult {
|
|
707
720
|
content: "Machine learning and artificial intelligence.".to_string(),
|
|
708
721
|
mime_type: "text/plain".to_string(),
|