kreuzberg 4.0.0.pre.rc.7 → 4.0.0.pre.rc.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +13 -12
  3. data/README.md +22 -0
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +1 -1
  5. data/ext/kreuzberg_rb/native/Cargo.lock +397 -183
  6. data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +36 -13
  8. data/kreuzberg.gemspec +34 -2
  9. data/lib/kreuzberg/cache_api.rb +35 -0
  10. data/lib/kreuzberg/error_context.rb +49 -1
  11. data/lib/kreuzberg/extraction_api.rb +255 -0
  12. data/lib/kreuzberg/version.rb +1 -1
  13. data/lib/kreuzberg.rb +6 -0
  14. data/lib/libpdfium.dylib +0 -0
  15. data/sig/kreuzberg.rbs +9 -0
  16. data/vendor/Cargo.toml +44 -0
  17. data/vendor/kreuzberg/Cargo.toml +65 -35
  18. data/vendor/kreuzberg/README.md +50 -0
  19. data/vendor/kreuzberg/build.rs +548 -190
  20. data/vendor/kreuzberg/src/api/mod.rs +0 -2
  21. data/vendor/kreuzberg/src/core/pipeline.rs +13 -0
  22. data/vendor/kreuzberg/src/embeddings.rs +71 -3
  23. data/vendor/kreuzberg/src/error.rs +1 -1
  24. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  25. data/vendor/kreuzberg/src/extraction/html.rs +37 -5
  26. data/vendor/kreuzberg/src/extractors/pdf.rs +99 -47
  27. data/vendor/kreuzberg/src/mcp/mod.rs +3 -2
  28. data/vendor/kreuzberg/src/mcp/server.rs +106 -0
  29. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
  30. data/vendor/kreuzberg/src/pdf/bundled.rs +346 -0
  31. data/vendor/kreuzberg/src/pdf/metadata.rs +2 -2
  32. data/vendor/kreuzberg/src/pdf/mod.rs +6 -0
  33. data/vendor/kreuzberg/src/pdf/rendering.rs +2 -2
  34. data/vendor/kreuzberg/src/pdf/table.rs +3 -0
  35. data/vendor/kreuzberg/src/pdf/text.rs +2 -2
  36. data/vendor/kreuzberg/src/text/quality_processor.rs +1 -1
  37. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -1
  38. data/vendor/kreuzberg/tests/format_integration.rs +4 -1
  39. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
  40. data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
  41. data/vendor/kreuzberg-ffi/README.md +851 -0
  42. data/vendor/kreuzberg-ffi/build.rs +176 -0
  43. data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
  44. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
  45. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  46. data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
  47. data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
  48. data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
  49. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  50. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  51. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  52. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  53. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  54. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  55. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  56. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  57. data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
  58. data/vendor/kreuzberg-tesseract/LICENSE +22 -0
  59. data/vendor/kreuzberg-tesseract/README.md +399 -0
  60. data/vendor/kreuzberg-tesseract/build.rs +1354 -0
  61. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  62. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  63. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  64. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  65. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  66. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  67. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  68. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  69. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  70. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  71. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  72. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  73. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  74. data/vendor/rb-sys/src/lib.rs +1 -0
  75. metadata +41 -3
  76. data/vendor/rb-sys/bin/release.sh +0 -22
@@ -1,3 +1,29 @@
1
+ // Kreuzberg Build Script - PDFium Linking Configuration
2
+ //
3
+ // This build script handles PDFium library downloading and linking for the kreuzberg crate.
4
+ // It supports multiple linking strategies via Cargo features:
5
+ //
6
+ // 1. Default (pdf, bundled-pdfium): Download dynamic library and embed in binary
7
+ // - Self-contained binary that extracts library at runtime
8
+ // - Larger binary size but no external .so dependency
9
+ // - No PDFIUM_*_PATH environment variables needed
10
+ //
11
+ // 2. static-pdfium: Static linking (no runtime dependency)
12
+ // - REQUIRES: PDFIUM_STATIC_LIB_PATH environment variable pointing to the directory containing libpdfium.a
13
+ // - Reason: bblanchon/pdfium-binaries only provides dynamic libraries
14
+ // - Use case: Docker with musl, fully static binaries
15
+ // - Note: libpdfium.a must be obtained separately (e.g., paulocoutinhox/pdfium-lib)
16
+ //
17
+ // 3. system-pdfium: Use system-installed pdfium
18
+ // - Detected via pkg-config or KREUZBERG_PDFIUM_SYSTEM_PATH
19
+ //
20
+ // Environment Variables:
21
+ // - PDFIUM_STATIC_LIB_PATH: Path to directory containing libpdfium.a (for static-pdfium)
22
+ // - KREUZBERG_PDFIUM_PREBUILT: Path to prebuilt pdfium directory (skip download)
23
+ // - KREUZBERG_PDFIUM_SYSTEM_PATH: System pdfium library path (for system-pdfium)
24
+ // - PDFIUM_VERSION: Override version for bblanchon/pdfium-binaries
25
+ // - KREUZBERG_PDFIUM_DOWNLOAD_RETRIES: Number of download retries (default: 5)
26
+
1
27
  use std::env;
2
28
  use std::fs;
3
29
  use std::io;
@@ -6,6 +32,21 @@ use std::process::Command;
6
32
  use std::thread;
7
33
  use std::time::Duration;
8
34
 
35
+ /// PDFium linking strategy
36
+ #[derive(Debug, Clone, Copy, PartialEq, Eq)]
37
+ enum PdfiumLinkStrategy {
38
+ /// Download and link statically (static-pdfium feature)
39
+ DownloadStatic,
40
+ /// Download, link dynamically, and embed in binary (bundled-pdfium feature)
41
+ Bundled,
42
+ /// Use system-installed pdfium via pkg-config (system-pdfium feature)
43
+ System,
44
+ }
45
+
46
+ // ============================================================================
47
+ // MAIN BUILD ORCHESTRATION
48
+ // ============================================================================
49
+
9
50
  fn main() {
10
51
  let target = env::var("TARGET").unwrap();
11
52
  let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
@@ -18,15 +59,111 @@ fn main() {
18
59
  return;
19
60
  }
20
61
 
21
- let (download_url, lib_name) = get_pdfium_url_and_lib(&target);
62
+ let strategy = determine_link_strategy(&target);
63
+
64
+ tracing::debug!("Using PDFium linking strategy: {:?}", strategy);
65
+
66
+ match strategy {
67
+ PdfiumLinkStrategy::DownloadStatic => {
68
+ let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
69
+ link_statically(&pdfium_dir, &target);
70
+ // Skip copy_lib_to_package - library embedded in binary
71
+ }
72
+ PdfiumLinkStrategy::Bundled => {
73
+ let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
74
+ link_bundled(&pdfium_dir, &target, &out_dir);
75
+ // Skip copy_lib_to_package - each binary extracts its own
76
+ }
77
+ PdfiumLinkStrategy::System => {
78
+ link_system(&target);
79
+ // No download or copy needed
80
+ }
81
+ }
82
+
83
+ link_system_frameworks(&target);
84
+ println!("cargo:rerun-if-changed=build.rs");
85
+ }
86
+
87
+ // ============================================================================
88
+ // FEATURE & STRATEGY VALIDATION
89
+ // ============================================================================
90
+
91
+ /// Determine which linking strategy to use based on features and target
92
+ fn determine_link_strategy(target: &str) -> PdfiumLinkStrategy {
93
+ // WASM handling: check for PDFIUM_WASM_LIB environment variable
94
+ if target.contains("wasm") {
95
+ if let Ok(wasm_lib) = env::var("PDFIUM_WASM_LIB") {
96
+ println!("cargo:rustc-link-search=native={}", wasm_lib);
97
+ println!("cargo:rustc-link-lib=static=pdfium");
98
+ return PdfiumLinkStrategy::DownloadStatic;
99
+ }
100
+ // For WASM without explicit PDFIUM_WASM_LIB, use bundled strategy
101
+ // This downloads pdfium-lib which provides WASM-compatible builds
102
+ println!("cargo:warning=WASM build using bundled PDFium (set PDFIUM_WASM_LIB to link custom WASM PDFium)");
103
+ return PdfiumLinkStrategy::Bundled;
104
+ }
105
+
106
+ let system_pdfium = cfg!(feature = "system-pdfium");
107
+ let bundled_pdfium = cfg!(feature = "bundled-pdfium");
108
+ let static_pdfium = cfg!(feature = "static-pdfium");
109
+
110
+ let enabled_count = usize::from(system_pdfium) + usize::from(bundled_pdfium) + usize::from(static_pdfium);
111
+ if enabled_count > 1 {
112
+ println!(
113
+ "cargo:warning=Multiple PDFium linking strategies enabled (static-pdfium={}, bundled-pdfium={}, system-pdfium={}); using bundled-pdfium for this build",
114
+ static_pdfium, bundled_pdfium, system_pdfium
115
+ );
116
+ }
117
+
118
+ // Feature-based strategy selection.
119
+ // Prefer bundled-pdfium when multiple strategies are enabled (e.g. `--all-features`) because it
120
+ // does not require external PDFIUM_STATIC_LIB_PATH and does not depend on a system install.
121
+ if bundled_pdfium {
122
+ return PdfiumLinkStrategy::Bundled;
123
+ }
124
+ if system_pdfium {
125
+ return PdfiumLinkStrategy::System;
126
+ }
127
+ if static_pdfium {
128
+ return PdfiumLinkStrategy::DownloadStatic;
129
+ }
22
130
 
131
+ // Default: download, link dynamically, and embed in the binary (same as bundled-pdfium)
132
+ // When only 'pdf' feature is enabled (no linking strategy), default to bundled-pdfium
133
+ PdfiumLinkStrategy::Bundled
134
+ }
135
+
136
+ // ============================================================================
137
+ // DOWNLOAD & PREBUILT ORCHESTRATION
138
+ // ============================================================================
139
+
140
+ /// Download PDFium or use prebuilt directory
141
+ ///
142
+ /// This is the main orchestrator function that:
143
+ /// 1. Checks for `KREUZBERG_PDFIUM_PREBUILT` environment variable
144
+ /// 2. If set and valid, uses prebuilt pdfium directory
145
+ /// 3. If not set, downloads pdfium to out_dir (with caching)
146
+ /// 4. Returns PathBuf to pdfium directory
147
+ ///
148
+ /// Reuses all existing helper functions:
149
+ /// - `get_pdfium_url_and_lib()` - determines download URL for target
150
+ /// - `download_and_extract_pdfium()` - downloads with retry logic
151
+ /// - `runtime_library_info()` - platform-specific library names
152
+ /// - `prepare_prebuilt_pdfium()` - handles prebuilt copy
153
+ fn download_or_use_prebuilt(target: &str, out_dir: &Path) -> PathBuf {
154
+ let (download_url, _lib_name) = get_pdfium_url_and_lib(target);
23
155
  let pdfium_dir = out_dir.join("pdfium");
24
156
 
157
+ // Check for prebuilt pdfium directory
25
158
  if let Some(prebuilt) = env::var_os("KREUZBERG_PDFIUM_PREBUILT") {
26
159
  let prebuilt_path = PathBuf::from(prebuilt);
27
160
  if prebuilt_path.exists() {
28
161
  prepare_prebuilt_pdfium(&prebuilt_path, &pdfium_dir)
29
162
  .unwrap_or_else(|err| panic!("Failed to copy Pdfium from {}: {}", prebuilt_path.display(), err));
163
+ if target.contains("windows") {
164
+ ensure_windows_import_library(&pdfium_dir);
165
+ }
166
+ return pdfium_dir;
30
167
  } else {
31
168
  panic!(
32
169
  "Environment variable KREUZBERG_PDFIUM_PREBUILT points to '{}' but the directory does not exist",
@@ -35,8 +172,10 @@ fn main() {
35
172
  }
36
173
  }
37
174
 
38
- let (runtime_lib_name, runtime_subdir) = runtime_library_info(&target);
39
- let runtime_lib_path = pdfium_dir.join(runtime_subdir).join(&runtime_lib_name);
175
+ // Check if library already exists (cache validation) using flexible detection
176
+ let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
177
+ let lib_found = find_pdfium_library(&pdfium_dir, &runtime_lib_name, runtime_subdir).is_ok();
178
+
40
179
  let import_lib_exists = if target.contains("windows") {
41
180
  let lib_dir = pdfium_dir.join("lib");
42
181
  lib_dir.join("pdfium.lib").exists() || lib_dir.join("pdfium.dll.lib").exists()
@@ -44,60 +183,52 @@ fn main() {
44
183
  true
45
184
  };
46
185
 
47
- if !runtime_lib_path.exists() || !import_lib_exists {
186
+ if !lib_found || !import_lib_exists {
48
187
  tracing::debug!("Pdfium library not found, downloading for target: {}", target);
49
188
  tracing::debug!("Download URL: {}", download_url);
50
189
  download_and_extract_pdfium(&download_url, &pdfium_dir);
51
190
  } else {
52
- tracing::debug!("Pdfium library already present at {}", runtime_lib_path.display());
191
+ tracing::debug!("Pdfium library already cached at {}", pdfium_dir.display());
53
192
  }
54
193
 
194
+ // Windows-specific: ensure pdfium.lib exists
55
195
  if target.contains("windows") {
56
- let lib_dir = pdfium_dir.join("lib");
57
- let dll_lib = lib_dir.join("pdfium.dll.lib");
58
- let expected_lib = lib_dir.join("pdfium.lib");
59
-
60
- if dll_lib.exists() && !expected_lib.exists() {
61
- tracing::debug!("Renaming cached {} to {}", dll_lib.display(), expected_lib.display());
62
- fs::rename(&dll_lib, &expected_lib).expect("Failed to rename pdfium.dll.lib to pdfium.lib");
63
- }
196
+ ensure_windows_import_library(&pdfium_dir);
64
197
  }
65
198
 
66
- let lib_dir = pdfium_dir.join("lib");
67
- println!("cargo:rustc-link-search=native={}", lib_dir.display());
68
-
69
- // WASM requires static linking
70
- let link_type = if target.contains("wasm") { "static" } else { "dylib" };
71
- println!("cargo:rustc-link-lib={}={}", link_type, lib_name);
72
-
73
- if target.contains("darwin") {
74
- println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
75
- println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
76
- } else if target.contains("linux") {
77
- println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
78
- println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
79
- }
199
+ pdfium_dir
200
+ }
80
201
 
81
- copy_lib_to_package(&pdfium_dir, &target);
202
+ fn ensure_windows_import_library(pdfium_dir: &Path) {
203
+ let lib_dir = pdfium_dir.join("lib");
204
+ let dll_lib = lib_dir.join("pdfium.dll.lib");
205
+ let expected_lib = lib_dir.join("pdfium.lib");
82
206
 
83
- if target.contains("darwin") {
84
- println!("cargo:rustc-link-lib=framework=CoreFoundation");
85
- println!("cargo:rustc-link-lib=framework=CoreGraphics");
86
- println!("cargo:rustc-link-lib=framework=CoreText");
87
- println!("cargo:rustc-link-lib=framework=AppKit");
88
- println!("cargo:rustc-link-lib=dylib=c++");
89
- } else if target.contains("linux") {
90
- println!("cargo:rustc-link-lib=dylib=stdc++");
91
- println!("cargo:rustc-link-lib=dylib=m");
92
- } else if target.contains("windows") {
93
- println!("cargo:rustc-link-lib=dylib=gdi32");
94
- println!("cargo:rustc-link-lib=dylib=user32");
95
- println!("cargo:rustc-link-lib=dylib=advapi32");
207
+ if dll_lib.exists() && !expected_lib.exists() {
208
+ tracing::debug!(
209
+ "Ensuring Windows import library at {} (source: {})",
210
+ expected_lib.display(),
211
+ dll_lib.display()
212
+ );
213
+ fs::copy(&dll_lib, &expected_lib).unwrap_or_else(|err| {
214
+ panic!(
215
+ "Failed to copy Windows import library from {} to {}: {}",
216
+ dll_lib.display(),
217
+ expected_lib.display(),
218
+ err
219
+ )
220
+ });
96
221
  }
97
-
98
- println!("cargo:rerun-if-changed=build.rs");
99
222
  }
100
223
 
224
+ // ============================================================================
225
+ // DOWNLOAD UTILITIES
226
+ // ============================================================================
227
+
228
+ /// Fetch the latest release version from a GitHub repository
229
+ ///
230
+ /// Uses curl to query the GitHub API and extract the tag_name from the
231
+ /// latest release JSON response. Falls back to "7529" if API call fails.
101
232
  fn get_latest_version(repo: &str) -> String {
102
233
  let api_url = format!("https://api.github.com/repos/{}/releases/latest", repo);
103
234
 
@@ -122,6 +253,12 @@ fn get_latest_version(repo: &str) -> String {
122
253
  "7529".to_string()
123
254
  }
124
255
 
256
+ /// Get the download URL and library name for the target platform
257
+ ///
258
+ /// Determines platform/architecture from target triple and constructs
259
+ /// the appropriate GitHub release download URL. Supports:
260
+ /// - WASM: paulocoutinhox/pdfium-lib
261
+ /// - Other platforms: bblanchon/pdfium-binaries
125
262
  fn get_pdfium_url_and_lib(target: &str) -> (String, String) {
126
263
  if target.contains("wasm") {
127
264
  let version = env::var("PDFIUM_WASM_VERSION")
@@ -180,6 +317,15 @@ fn get_pdfium_url_and_lib(target: &str) -> (String, String) {
180
317
  (url, "pdfium".to_string())
181
318
  }
182
319
 
320
+ /// Download and extract PDFium archive with retry logic
321
+ ///
322
+ /// Features:
323
+ /// - Exponential backoff retry (configurable via env vars)
324
+ /// - File type validation (gzip check)
325
+ /// - Windows-specific import library handling (pdfium.dll.lib -> pdfium.lib)
326
+ /// - Environment variables:
327
+ /// - KREUZBERG_PDFIUM_DOWNLOAD_RETRIES: number of retries (default: 5)
328
+ /// - KREUZBERG_PDFIUM_DOWNLOAD_BACKOFF_SECS: initial backoff in seconds (default: 2)
183
329
  fn download_and_extract_pdfium(url: &str, dest_dir: &Path) {
184
330
  fs::create_dir_all(dest_dir).expect("Failed to create pdfium directory");
185
331
 
@@ -291,195 +437,407 @@ fn download_and_extract_pdfium(url: &str, dest_dir: &Path) {
291
437
  tracing::debug!("Pdfium downloaded and extracted successfully");
292
438
  }
293
439
 
294
- fn copy_lib_to_package(pdfium_dir: &Path, target: &str) {
295
- let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
296
- let src_lib = pdfium_dir.join(runtime_subdir).join(&runtime_lib_name);
440
+ // ============================================================================
441
+ // PREBUILT HANDLING
442
+ // ============================================================================
297
443
 
298
- if !src_lib.exists() {
299
- tracing::debug!("Source library not found: {}", src_lib.display());
300
- return;
301
- }
302
-
303
- if target.contains("darwin") {
304
- fix_macos_install_name(&src_lib, &runtime_lib_name);
305
- codesign_if_needed(target, &src_lib);
444
+ /// Prepare prebuilt PDFium by copying to destination directory
445
+ ///
446
+ /// Removes existing destination if present, then recursively copies
447
+ /// all files from prebuilt source to destination.
448
+ fn prepare_prebuilt_pdfium(prebuilt_src: &Path, dest_dir: &Path) -> io::Result<()> {
449
+ if dest_dir.exists() {
450
+ fs::remove_dir_all(dest_dir)?;
306
451
  }
452
+ copy_dir_all(prebuilt_src, dest_dir)
453
+ }
307
454
 
308
- let crate_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
309
- let workspace_root = crate_dir.parent().unwrap().parent().unwrap();
310
-
311
- if let Ok(profile) = env::var("PROFILE") {
312
- let target_dir = if let Ok(cargo_target) = env::var("TARGET") {
313
- workspace_root.join("target").join(cargo_target).join(&profile)
455
+ /// Recursively copy directory tree
456
+ ///
457
+ /// Used by `prepare_prebuilt_pdfium()` to copy entire pdfium directory
458
+ /// structure, preserving all files and subdirectories.
459
+ fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
460
+ fs::create_dir_all(dst)?;
461
+ for entry in fs::read_dir(src)? {
462
+ let entry = entry?;
463
+ let file_type = entry.file_type()?;
464
+ let target_path = dst.join(entry.file_name());
465
+ if file_type.is_dir() {
466
+ copy_dir_all(&entry.path(), &target_path)?;
314
467
  } else {
315
- workspace_root.join("target").join(&profile)
316
- };
317
-
318
- if target_dir.exists() {
319
- copy_lib_if_needed(
320
- &src_lib,
321
- &target_dir.join(&runtime_lib_name),
322
- "CLI target directory",
323
- target,
324
- );
325
- }
326
-
327
- let simple_target_dir = workspace_root.join("target").join(&profile);
328
- if simple_target_dir != target_dir {
329
- fs::create_dir_all(&simple_target_dir).ok();
330
- copy_lib_if_needed(
331
- &src_lib,
332
- &simple_target_dir.join(&runtime_lib_name),
333
- "Java FFI target directory",
334
- target,
335
- );
468
+ fs::copy(entry.path(), &target_path)?;
336
469
  }
337
470
  }
471
+ Ok(())
472
+ }
338
473
 
339
- let python_dest_dir = workspace_root.join("packages").join("python").join("kreuzberg");
340
- if python_dest_dir.exists() {
341
- copy_lib_if_needed(
342
- &src_lib,
343
- &python_dest_dir.join(&runtime_lib_name),
344
- "Python package",
345
- target,
346
- );
474
+ // ============================================================================
475
+ // PLATFORM UTILITIES
476
+ // ============================================================================
477
+
478
+ /// Get platform-specific runtime library name and subdirectory
479
+ ///
480
+ /// Returns tuple of (library_name, subdirectory) for the target platform:
481
+ /// - WASM: ("libpdfium.a", "release/lib")
482
+ /// - Windows: ("pdfium.dll", "bin")
483
+ /// - macOS: ("libpdfium.dylib", "lib")
484
+ /// - Linux: ("libpdfium.so", "lib")
485
+ fn runtime_library_info(target: &str) -> (String, &'static str) {
486
+ if target.contains("wasm") {
487
+ // pdfium-lib `wasm.tgz` extracts into `release/lib/libpdfium.a`
488
+ ("libpdfium.a".to_string(), "release/lib")
489
+ } else if target.contains("windows") {
490
+ ("pdfium.dll".to_string(), "bin")
491
+ } else if target.contains("darwin") {
492
+ ("libpdfium.dylib".to_string(), "lib")
347
493
  } else {
348
- tracing::debug!("Python package directory not found, skipping Python library copy");
494
+ ("libpdfium.so".to_string(), "lib")
349
495
  }
496
+ }
350
497
 
351
- let node_dest_dir = workspace_root.join("crates").join("kreuzberg-node");
352
- if node_dest_dir.exists() {
353
- copy_lib_if_needed(
354
- &src_lib,
355
- &node_dest_dir.join(&runtime_lib_name),
356
- "Node.js package",
357
- target,
358
- );
359
- } else {
360
- tracing::debug!("Node.js package directory not found, skipping Node library copy");
498
+ /// Find PDFium library in archive with flexible directory detection
499
+ ///
500
+ /// Attempts to locate the library at multiple possible locations:
501
+ /// - {subdir}/{lib_name} (standard location)
502
+ /// - {lib_name} (root of archive)
503
+ /// - bin/{lib_name} (alternative location)
504
+ /// - lib/{lib_name} (explicit lib directory)
505
+ ///
506
+ /// This handles variations in archive structure across different platform builds,
507
+ /// particularly macOS ARM64 where the archive structure may differ.
508
+ ///
509
+ /// Returns the full path to the library if found, or an error with available files.
510
+ fn find_pdfium_library(pdfium_dir: &Path, lib_name: &str, expected_subdir: &str) -> Result<PathBuf, String> {
511
+ // Candidates in priority order
512
+ let candidates = [
513
+ pdfium_dir.join(expected_subdir).join(lib_name), // Standard: lib/libpdfium.dylib
514
+ pdfium_dir.join(lib_name), // Root: libpdfium.dylib
515
+ pdfium_dir.join("bin").join(lib_name), // Alternative: bin/libpdfium.dylib
516
+ pdfium_dir.join("lib").join(lib_name), // Explicit lib: lib/libpdfium.dylib
517
+ ];
518
+
519
+ // Try each candidate
520
+ for candidate in &candidates {
521
+ if candidate.exists() {
522
+ tracing::debug!("Found PDFium library at: {}", candidate.display());
523
+ return Ok(candidate.clone());
524
+ }
361
525
  }
362
526
 
363
- let ruby_dest_dir = workspace_root.join("packages").join("ruby").join("lib");
364
- if ruby_dest_dir.exists() {
365
- copy_lib_if_needed(&src_lib, &ruby_dest_dir.join(&runtime_lib_name), "Ruby package", target);
366
- } else {
367
- tracing::debug!("Ruby package directory not found, skipping Ruby library copy");
527
+ // Library not found - provide detailed error with directory listing
528
+ let mut error_msg = format!(
529
+ "PDFium library not found at expected location: {}/{}\n\n",
530
+ pdfium_dir.display(),
531
+ expected_subdir
532
+ );
533
+ error_msg.push_str("Attempted locations:\n");
534
+ for candidate in &candidates {
535
+ error_msg.push_str(&format!(" - {}\n", candidate.display()));
536
+ }
537
+
538
+ // List actual contents of pdfium directory for debugging
539
+ error_msg.push_str("\nActual archive contents:\n");
540
+ if let Ok(entries) = fs::read_dir(pdfium_dir) {
541
+ for entry in entries.flatten() {
542
+ let path = entry.path();
543
+ let file_type = if path.is_dir() { "dir" } else { "file" };
544
+ error_msg.push_str(&format!(" {} ({})\n", path.display(), file_type));
545
+
546
+ // Show contents of subdirectories
547
+ if path.is_dir()
548
+ && let Ok(sub_entries) = fs::read_dir(&path)
549
+ {
550
+ for sub_entry in sub_entries.flatten() {
551
+ let sub_path = sub_entry.path();
552
+ let sub_type = if sub_path.is_dir() { "dir" } else { "file" };
553
+ error_msg.push_str(&format!(" {} ({})\n", sub_path.display(), sub_type));
554
+ }
555
+ }
556
+ }
368
557
  }
558
+
559
+ Err(error_msg)
369
560
  }
370
561
 
371
- fn copy_lib_if_needed(src: &Path, dest: &Path, package_name: &str, target: &str) {
372
- use std::fs;
562
+ // ============================================================================
563
+ // LINKING STRATEGIES
564
+ // ============================================================================
373
565
 
374
- let should_copy = if dest.exists() {
375
- let src_metadata = fs::metadata(src).ok();
376
- let dest_metadata = fs::metadata(dest).ok();
377
- match (src_metadata, dest_metadata) {
378
- (Some(src), Some(dest)) => src.modified().ok() > dest.modified().ok(),
379
- _ => true,
380
- }
381
- } else {
382
- true
566
+ /// Link PDFium dynamically (default)
567
+ ///
568
+ /// Sets up linker to use PDFium as a dynamic library (.dylib/.so/.dll)
569
+ /// with platform-specific rpath configuration for runtime library discovery.
570
+ /// Supports flexible archive structures by adding multiple possible lib directories.
571
+ fn link_dynamically(pdfium_dir: &Path, target: &str) {
572
+ let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
573
+
574
+ // Find the actual library location (handles multiple possible archive structures)
575
+ let lib_path = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
576
+ Ok(path) => path.parent().unwrap_or(pdfium_dir).to_path_buf(),
577
+ Err(err) => panic!("{}", err),
383
578
  };
384
579
 
385
- if should_copy {
386
- match fs::copy(src, dest) {
387
- Ok(_) => {
388
- tracing::debug!("Copied {} to {} ({})", src.display(), dest.display(), package_name);
389
- codesign_if_needed(target, dest);
390
- }
391
- Err(e) => tracing::debug!("Failed to copy library to {}: {}", package_name, e),
392
- }
580
+ println!("cargo:rustc-link-search=native={}", lib_path.display());
581
+ println!("cargo:rustc-link-lib=dylib=pdfium");
582
+
583
+ // Also add standard lib directory for compatibility
584
+ let std_lib_dir = pdfium_dir.join("lib");
585
+ if std_lib_dir.exists() && std_lib_dir != lib_path {
586
+ println!("cargo:rustc-link-search=native={}", std_lib_dir.display());
393
587
  }
394
- }
395
588
 
396
- fn codesign_if_needed(target: &str, binary: &Path) {
397
- if !target.contains("apple-darwin") || !binary.exists() {
398
- return;
589
+ // Add bin directory for platforms where it might be needed
590
+ let bin_dir = pdfium_dir.join("bin");
591
+ if bin_dir.exists() && bin_dir != lib_path {
592
+ println!("cargo:rustc-link-search=native={}", bin_dir.display());
399
593
  }
400
594
 
401
- let identity = env::var("KREUZBERG_CODESIGN_IDENTITY").unwrap_or_else(|_| "-".to_string());
402
- let status = Command::new("codesign")
403
- .arg("--force")
404
- .arg("--timestamp=none")
405
- .arg("--sign")
406
- .arg(identity)
407
- .arg(binary)
408
- .status();
595
+ // Set rpath for dynamic linking
596
+ if target.contains("darwin") {
597
+ println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
598
+ println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
599
+ } else if target.contains("linux") {
600
+ println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
601
+ println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
602
+ }
603
+ }
409
604
 
410
- match status {
411
- Ok(result) if result.success() => {
412
- tracing::debug!("Codesigned {}", binary.display());
605
+ /// Link PDFium statically (static-pdfium feature)
606
+ ///
607
+ /// Embeds PDFium into the binary as a static library. Adds system
608
+ /// dependencies required for static linking on Linux.
609
+ /// Supports flexible archive structures by finding library in multiple locations.
610
+ ///
611
+ /// Environment Variables:
612
+ /// - `PDFIUM_STATIC_LIB_PATH`: Path to directory containing libpdfium.a (for Docker/musl builds)
613
+ ///
614
+ /// Note: bblanchon/pdfium-binaries only provides dynamic libraries.
615
+ /// On macOS, this will fall back to dynamic linking with a warning.
616
+ /// On Linux, you must provide PDFIUM_STATIC_LIB_PATH pointing to a static build.
617
+ fn link_statically(pdfium_dir: &Path, target: &str) {
618
+ // For static linking, we need libpdfium.a (not .dylib or .so)
619
+ let static_lib_name = "libpdfium.a";
620
+ let lib_subdir = if target.contains("wasm") { "release/lib" } else { "lib" };
621
+
622
+ // First, check if user provided a static library path via environment variable
623
+ if let Ok(custom_path) = env::var("PDFIUM_STATIC_LIB_PATH") {
624
+ let custom_lib_dir = PathBuf::from(&custom_path);
625
+
626
+ if !custom_lib_dir.exists() {
627
+ panic!(
628
+ "PDFIUM_STATIC_LIB_PATH points to '{}' but the directory does not exist",
629
+ custom_path
630
+ );
413
631
  }
414
- Ok(result) => {
415
- tracing::debug!(
416
- "codesign exited with status {} while signing {}",
417
- result,
418
- binary.display()
632
+
633
+ let custom_lib = custom_lib_dir.join(static_lib_name);
634
+ if !custom_lib.exists() {
635
+ panic!(
636
+ "PDFIUM_STATIC_LIB_PATH points to '{}' but {} not found.\n\
637
+ Expected to find: {}",
638
+ custom_path,
639
+ static_lib_name,
640
+ custom_lib.display()
419
641
  );
420
642
  }
421
- Err(err) => {
422
- tracing::debug!("Failed to run codesign for {}: {}", binary.display(), err);
643
+
644
+ tracing::debug!("Using custom static PDFium from: {}", custom_lib.display());
645
+ println!("cargo:rustc-link-search=native={}", custom_lib_dir.display());
646
+ println!("cargo:rustc-link-lib=static=pdfium");
647
+
648
+ // Static linking requires additional system dependencies
649
+ if target.contains("linux") {
650
+ println!("cargo:rustc-link-lib=dylib=pthread");
651
+ println!("cargo:rustc-link-lib=dylib=dl");
652
+ } else if target.contains("windows") {
653
+ println!("cargo:rustc-link-lib=dylib=ws2_32");
654
+ println!("cargo:rustc-link-lib=dylib=userenv");
423
655
  }
656
+
657
+ return;
424
658
  }
425
- }
426
659
 
427
- fn runtime_library_info(target: &str) -> (String, &'static str) {
428
- if target.contains("wasm") {
429
- ("libpdfium.a".to_string(), "lib")
430
- } else if target.contains("windows") {
431
- ("pdfium.dll".to_string(), "bin")
432
- } else if target.contains("darwin") {
433
- ("libpdfium.dylib".to_string(), "lib")
434
- } else {
435
- ("libpdfium.so".to_string(), "lib")
660
+ // Find the actual library location (handles multiple possible archive structures)
661
+ let lib_path = match find_pdfium_library(pdfium_dir, static_lib_name, lib_subdir) {
662
+ Ok(path) => path.parent().unwrap_or(pdfium_dir).to_path_buf(),
663
+ Err(_err) => {
664
+ // Static library not found - check if we're on macOS and can fallback
665
+ if target.contains("darwin") {
666
+ eprintln!("cargo:warning=Static PDFium library (libpdfium.a) not found for macOS.");
667
+ eprintln!("cargo:warning=bblanchon/pdfium-binaries only provides dynamic libraries.");
668
+ eprintln!("cargo:warning=Falling back to dynamic linking for local development.");
669
+ eprintln!("cargo:warning=Production Linux builds require PDFIUM_STATIC_LIB_PATH.");
670
+
671
+ // Fallback to dynamic linking on macOS
672
+ link_dynamically(pdfium_dir, target);
673
+ return;
674
+ } else {
675
+ // On Linux/Windows, provide helpful error with actionable steps
676
+ panic!(
677
+ "Static PDFium library (libpdfium.a) not found.\n\n\
678
+ bblanchon/pdfium-binaries only provides dynamic libraries.\n\n\
679
+ For static linking (required for Docker with musl), you must:\n\n\
680
+ 1. Build static PDFium or obtain from a source that provides it\n\
681
+ - See: https://github.com/ajrcarey/pdfium-render/issues/53\n\
682
+ - Or use: https://github.com/paulocoutinhox/pdfium-lib (provides static builds)\n\n\
683
+ 2. Set environment variable pointing to the directory containing libpdfium.a:\n\
684
+ export PDFIUM_STATIC_LIB_PATH=/path/to/pdfium/lib\n\n\
685
+ 3. Or use alternative features:\n\
686
+ - 'pdf' (dynamic linking, requires .so at runtime)\n\
687
+ - 'bundled-pdfium' (embeds dynamic library in binary)\n\
688
+ - 'system-pdfium' (use system-installed pdfium)\n\n\
689
+ Example Dockerfile pattern:\n\
690
+ FROM alpine:latest as pdfium-builder\n\
691
+ # Download/build static libpdfium.a\n\
692
+ \n\
693
+ FROM rust:alpine as builder\n\
694
+ ENV PDFIUM_STATIC_LIB_PATH=/pdfium/lib\n\
695
+ COPY --from=pdfium-builder /path/to/libpdfium.a /pdfium/lib/"
696
+ );
697
+ }
698
+ }
699
+ };
700
+
701
+ println!("cargo:rustc-link-search=native={}", lib_path.display());
702
+ println!("cargo:rustc-link-lib=static=pdfium");
703
+
704
+ // Also add standard lib directory for compatibility
705
+ let std_lib_dir = pdfium_dir.join("lib");
706
+ if std_lib_dir.exists() && std_lib_dir != lib_path {
707
+ println!("cargo:rustc-link-search=native={}", std_lib_dir.display());
436
708
  }
437
- }
438
709
 
439
- fn prepare_prebuilt_pdfium(prebuilt_src: &Path, dest_dir: &Path) -> io::Result<()> {
440
- if dest_dir.exists() {
441
- fs::remove_dir_all(dest_dir)?;
710
+ // Add bin directory for platforms where it might be needed
711
+ let bin_dir = pdfium_dir.join("bin");
712
+ if bin_dir.exists() && bin_dir != lib_path {
713
+ println!("cargo:rustc-link-search=native={}", bin_dir.display());
442
714
  }
443
- copy_dir_all(prebuilt_src, dest_dir)
444
- }
445
715
 
446
- fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
447
- fs::create_dir_all(dst)?;
448
- for entry in fs::read_dir(src)? {
449
- let entry = entry?;
450
- let file_type = entry.file_type()?;
451
- let target_path = dst.join(entry.file_name());
452
- if file_type.is_dir() {
453
- copy_dir_all(&entry.path(), &target_path)?;
454
- } else {
455
- fs::copy(entry.path(), &target_path)?;
456
- }
716
+ // Static linking requires additional system dependencies
717
+ if target.contains("linux") {
718
+ println!("cargo:rustc-link-lib=dylib=pthread");
719
+ println!("cargo:rustc-link-lib=dylib=dl");
720
+ } else if target.contains("windows") {
721
+ println!("cargo:rustc-link-lib=dylib=ws2_32");
722
+ println!("cargo:rustc-link-lib=dylib=userenv");
457
723
  }
458
- Ok(())
459
724
  }
460
725
 
461
- fn fix_macos_install_name(lib_path: &Path, lib_name: &str) {
462
- use std::process::Command;
726
+ /// Link PDFium bundled (bundled-pdfium feature)
727
+ ///
728
+ /// Links dynamically but copies library to OUT_DIR for embedding in binary.
729
+ /// Each binary extracts and uses its own copy of the PDFium library.
730
+ /// Supports flexible archive structures by finding library in multiple locations.
731
+ ///
732
+ /// For WASM targets, links statically using the bundled static library.
733
+ fn link_bundled(pdfium_dir: &Path, target: &str, out_dir: &Path) {
734
+ // Copy library to OUT_DIR for bundling using flexible detection
735
+ let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
736
+ let src_lib = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
737
+ Ok(path) => path,
738
+ Err(err) => panic!("{}", err),
739
+ };
740
+ let bundled_lib = out_dir.join(&runtime_lib_name);
463
741
 
464
- let new_install_name = format!("@rpath/{}", lib_name);
742
+ fs::copy(&src_lib, &bundled_lib)
743
+ .unwrap_or_else(|err| panic!("Failed to copy library to OUT_DIR for bundling: {}", err));
465
744
 
466
- tracing::debug!("Fixing install_name for {} to {}", lib_path.display(), new_install_name);
745
+ // Emit environment variable with bundled library path
746
+ let bundled_path = bundled_lib
747
+ .to_str()
748
+ .unwrap_or_else(|| panic!("Non-UTF8 path for bundled library: {}", bundled_lib.display()));
749
+ println!("cargo:rustc-env=KREUZBERG_PDFIUM_BUNDLED_PATH={}", bundled_path);
467
750
 
468
- let status = Command::new("install_name_tool")
469
- .arg("-id")
470
- .arg(&new_install_name)
471
- .arg(lib_path)
472
- .status();
751
+ // For WASM, link statically using the bundled library
752
+ if target.contains("wasm") {
753
+ let lib_dir = bundled_lib
754
+ .parent()
755
+ .unwrap_or_else(|| panic!("Invalid bundled library path: {}", bundled_lib.display()));
756
+ println!("cargo:rustc-link-search=native={}", lib_dir.display());
757
+ println!("cargo:rustc-link-lib=static=pdfium");
758
+ tracing::debug!("Bundled PDFium static library linked for WASM at: {}", bundled_path);
759
+ } else {
760
+ tracing::debug!("Bundled PDFium library at: {}", bundled_path);
761
+ }
762
+ }
473
763
 
474
- match status {
475
- Ok(s) if s.success() => {
476
- tracing::debug!("Successfully updated install_name");
764
+ /// Link system-installed PDFium (system-pdfium feature)
765
+ ///
766
+ /// Attempts to find PDFium via pkg-config first, then falls back to
767
+ /// environment variables (KREUZBERG_PDFIUM_SYSTEM_PATH, KREUZBERG_PDFIUM_SYSTEM_INCLUDE).
768
+ fn link_system(_target: &str) {
769
+ // Try pkg-config first
770
+ match pkg_config::Config::new().atleast_version("5.0").probe("pdfium") {
771
+ Ok(library) => {
772
+ tracing::debug!("Found system pdfium via pkg-config");
773
+ for include_path in &library.include_paths {
774
+ println!("cargo:include={}", include_path.display());
775
+ }
776
+ return;
477
777
  }
478
- Ok(s) => {
479
- tracing::debug!("install_name_tool failed with status: {}", s);
778
+ Err(err) => {
779
+ tracing::debug!("pkg-config probe failed: {}", err);
480
780
  }
481
- Err(e) => {
482
- tracing::debug!("Failed to run install_name_tool: {}", e);
781
+ }
782
+
783
+ // Fallback to environment variables
784
+ let lib_path = env::var("KREUZBERG_PDFIUM_SYSTEM_PATH").ok();
785
+ let include_path = env::var("KREUZBERG_PDFIUM_SYSTEM_INCLUDE").ok();
786
+
787
+ if let Some(lib_dir) = lib_path {
788
+ let lib_dir_path = PathBuf::from(&lib_dir);
789
+ if !lib_dir_path.exists() {
790
+ panic!(
791
+ "KREUZBERG_PDFIUM_SYSTEM_PATH points to '{}' but the directory does not exist",
792
+ lib_dir
793
+ );
483
794
  }
795
+
796
+ println!("cargo:rustc-link-search=native={}", lib_dir);
797
+ println!("cargo:rustc-link-lib=dylib=pdfium");
798
+
799
+ if let Some(inc_dir) = include_path {
800
+ println!("cargo:include={}", inc_dir);
801
+ }
802
+
803
+ tracing::debug!("Using system pdfium from: {}", lib_dir);
804
+ return;
805
+ }
806
+
807
+ // No system pdfium found
808
+ panic!(
809
+ "system-pdfium feature enabled but pdfium not found.\n\
810
+ \n\
811
+ Please install pdfium system-wide or provide:\n\
812
+ - KREUZBERG_PDFIUM_SYSTEM_PATH: path to directory containing libpdfium\n\
813
+ - KREUZBERG_PDFIUM_SYSTEM_INCLUDE: path to pdfium headers (optional)\n\
814
+ \n\
815
+ Alternatively, use a different linking strategy:\n\
816
+ - Default (dynamic): cargo build --features pdf\n\
817
+ - Static linking: cargo build --features pdf,static-pdfium\n\
818
+ - Bundled: cargo build --features pdf,bundled-pdfium"
819
+ );
820
+ }
821
+
822
/// Emit link directives for the platform libraries used alongside PDFium.
///
/// The platform is picked by substring match on the target triple, checked in this order:
///
/// - `darwin`: the CoreFoundation, CoreGraphics, CoreText and AppKit frameworks, plus `c++`
/// - `linux`: `stdc++` and `m`
/// - `windows`: `gdi32`, `user32` and `advapi32`
///
/// Any other target produces no output.
fn link_system_frameworks(target: &str) {
    const DARWIN_DIRECTIVES: &[&str] = &[
        "cargo:rustc-link-lib=framework=CoreFoundation",
        "cargo:rustc-link-lib=framework=CoreGraphics",
        "cargo:rustc-link-lib=framework=CoreText",
        "cargo:rustc-link-lib=framework=AppKit",
        "cargo:rustc-link-lib=dylib=c++",
    ];
    const LINUX_DIRECTIVES: &[&str] = &[
        "cargo:rustc-link-lib=dylib=stdc++",
        "cargo:rustc-link-lib=dylib=m",
    ];
    const WINDOWS_DIRECTIVES: &[&str] = &[
        "cargo:rustc-link-lib=dylib=gdi32",
        "cargo:rustc-link-lib=dylib=user32",
        "cargo:rustc-link-lib=dylib=advapi32",
    ];

    let directives: &[&str] = if target.contains("darwin") {
        DARWIN_DIRECTIVES
    } else if target.contains("linux") {
        LINUX_DIRECTIVES
    } else if target.contains("windows") {
        WINDOWS_DIRECTIVES
    } else {
        &[]
    };

    for directive in directives {
        println!("{}", directive);
    }
}