kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +5 -5
  3. data/README.md +15 -9
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
  5. data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
  6. data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
  8. data/kreuzberg.gemspec +38 -4
  9. data/lib/kreuzberg/config.rb +34 -1
  10. data/lib/kreuzberg/result.rb +77 -14
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +23 -6
  13. data/vendor/kreuzberg/Cargo.toml +32 -11
  14. data/vendor/kreuzberg/README.md +54 -8
  15. data/vendor/kreuzberg/build.rs +549 -132
  16. data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
  17. data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
  18. data/vendor/kreuzberg/src/core/config.rs +49 -1
  19. data/vendor/kreuzberg/src/core/extractor.rs +134 -2
  20. data/vendor/kreuzberg/src/core/mod.rs +4 -2
  21. data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
  22. data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
  23. data/vendor/kreuzberg/src/extraction/html.rs +24 -8
  24. data/vendor/kreuzberg/src/extraction/image.rs +124 -1
  25. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
  26. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
  27. data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
  28. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  29. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  31. data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
  32. data/vendor/kreuzberg/src/extractors/email.rs +29 -15
  33. data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
  34. data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
  35. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  36. data/vendor/kreuzberg/src/extractors/html.rs +29 -15
  37. data/vendor/kreuzberg/src/extractors/image.rs +25 -4
  38. data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
  39. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  40. data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  42. data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
  43. data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
  44. data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
  45. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
  47. data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
  48. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  49. data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
  50. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  51. data/vendor/kreuzberg/src/extractors/text.rs +7 -2
  52. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  53. data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
  54. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  55. data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
  56. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
  57. data/vendor/kreuzberg/src/lib.rs +10 -2
  58. data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
  59. data/vendor/kreuzberg/src/mcp/server.rs +120 -12
  60. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  61. data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
  62. data/vendor/kreuzberg/src/pdf/error.rs +8 -0
  63. data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
  64. data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
  65. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
  66. data/vendor/kreuzberg/src/pdf/table.rs +26 -2
  67. data/vendor/kreuzberg/src/pdf/text.rs +89 -7
  68. data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
  69. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  70. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
  71. data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
  72. data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
  73. data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
  74. data/vendor/kreuzberg/src/text/mod.rs +6 -0
  75. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
  76. data/vendor/kreuzberg/src/types.rs +173 -21
  77. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  78. data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
  79. data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
  80. data/vendor/kreuzberg/tests/config_features.rs +15 -1
  81. data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
  82. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
  83. data/vendor/kreuzberg/tests/email_integration.rs +2 -0
  84. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  85. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  86. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  87. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  88. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  89. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  90. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  91. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
  92. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  93. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
  95. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  96. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  97. data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
  98. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
  99. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
  100. data/vendor/kreuzberg/tests/security_validation.rs +1 -0
  101. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  102. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
  103. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
  104. data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
  105. data/vendor/rb-sys/Cargo.lock +15 -15
  106. data/vendor/rb-sys/Cargo.toml +4 -4
  107. data/vendor/rb-sys/Cargo.toml.orig +4 -4
  108. data/vendor/rb-sys/build/features.rs +5 -2
  109. data/vendor/rb-sys/build/main.rs +55 -15
  110. data/vendor/rb-sys/build/stable_api_config.rs +4 -2
  111. data/vendor/rb-sys/build/version.rs +3 -1
  112. data/vendor/rb-sys/src/lib.rs +1 -0
  113. data/vendor/rb-sys/src/macros.rs +2 -2
  114. data/vendor/rb-sys/src/special_consts.rs +1 -1
  115. data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
  116. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
  117. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
  118. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
  119. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
  120. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
  121. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
  122. data/vendor/rb-sys/src/stable_api.rs +0 -1
  123. data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
  124. metadata +13 -10
  125. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  126. data/vendor/rb-sys/.cargo-ok +0 -1
  127. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
@@ -6,21 +6,137 @@ use std::process::Command;
6
6
  use std::thread;
7
7
  use std::time::Duration;
8
8
 
9
+ /// PDFium linking strategy
10
+ #[derive(Debug, Clone, Copy, PartialEq, Eq)]
11
+ enum PdfiumLinkStrategy {
12
+ /// Download and link dynamically (default behavior)
13
+ DownloadDynamic,
14
+ /// Download and link statically (pdf-static feature)
15
+ DownloadStatic,
16
+ /// Download, link dynamically, and embed in binary (pdf-bundled feature)
17
+ Bundled,
18
+ /// Use system-installed pdfium via pkg-config (pdf-system feature)
19
+ System,
20
+ }
21
+
22
+ // ============================================================================
23
+ // MAIN BUILD ORCHESTRATION
24
+ // ============================================================================
25
+
9
26
  fn main() {
10
27
  let target = env::var("TARGET").unwrap();
11
28
  let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
12
29
 
13
30
  println!("cargo::rustc-check-cfg=cfg(coverage)");
14
31
 
15
- let (download_url, lib_name) = get_pdfium_url_and_lib(&target);
32
+ // Skip pdfium linking if the pdf feature is not enabled
33
+ if !cfg!(feature = "pdf") {
34
+ tracing::debug!("PDF feature not enabled, skipping pdfium linking");
35
+ return;
36
+ }
37
+
38
+ validate_feature_exclusivity();
39
+ let strategy = determine_link_strategy(&target);
40
+
41
+ tracing::debug!("Using PDFium linking strategy: {:?}", strategy);
42
+
43
+ match strategy {
44
+ PdfiumLinkStrategy::DownloadDynamic => {
45
+ let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
46
+ link_dynamically(&pdfium_dir, &target);
47
+ copy_lib_to_package(&pdfium_dir, &target);
48
+ }
49
+ PdfiumLinkStrategy::DownloadStatic => {
50
+ let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
51
+ link_statically(&pdfium_dir, &target);
52
+ // Skip copy_lib_to_package - library embedded in binary
53
+ }
54
+ PdfiumLinkStrategy::Bundled => {
55
+ let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
56
+ link_bundled(&pdfium_dir, &target, &out_dir);
57
+ // Skip copy_lib_to_package - each binary extracts its own
58
+ }
59
+ PdfiumLinkStrategy::System => {
60
+ link_system(&target);
61
+ // No download or copy needed
62
+ }
63
+ }
64
+
65
+ link_system_frameworks(&target);
66
+ println!("cargo:rerun-if-changed=build.rs");
67
+ }
68
+
69
+ // ============================================================================
70
+ // FEATURE & STRATEGY VALIDATION
71
+ // ============================================================================
72
+
73
+ /// Validate that only one linking strategy feature is enabled at a time
74
+ fn validate_feature_exclusivity() {
75
+ let strategies = [
76
+ cfg!(feature = "pdf-static"),
77
+ cfg!(feature = "pdf-bundled"),
78
+ cfg!(feature = "pdf-system"),
79
+ ];
80
+ let count = strategies.iter().filter(|&&x| x).count();
81
+
82
+ if count > 1 {
83
+ panic!(
84
+ "Only one of pdf-static, pdf-bundled, pdf-system can be enabled at once.\n\
85
+ Please choose a single PDFium linking strategy."
86
+ );
87
+ }
88
+ }
89
+
90
+ /// Determine which linking strategy to use based on features and target
91
+ fn determine_link_strategy(target: &str) -> PdfiumLinkStrategy {
92
+ // WASM always uses static linking
93
+ if target.contains("wasm") {
94
+ return PdfiumLinkStrategy::DownloadStatic;
95
+ }
96
+
97
+ // Feature-based strategy selection (priority order)
98
+ if cfg!(feature = "pdf-system") {
99
+ return PdfiumLinkStrategy::System;
100
+ }
101
+ if cfg!(feature = "pdf-bundled") {
102
+ return PdfiumLinkStrategy::Bundled;
103
+ }
104
+ if cfg!(feature = "pdf-static") {
105
+ return PdfiumLinkStrategy::DownloadStatic;
106
+ }
107
+
108
+ // Default: download and link dynamically
109
+ PdfiumLinkStrategy::DownloadDynamic
110
+ }
16
111
 
112
+ // ============================================================================
113
+ // DOWNLOAD & PREBUILT ORCHESTRATION
114
+ // ============================================================================
115
+
116
+ /// Download PDFium or use prebuilt directory
117
+ ///
118
+ /// This is the main orchestrator function that:
119
+ /// 1. Checks for `KREUZBERG_PDFIUM_PREBUILT` environment variable
120
+ /// 2. If set and valid, uses prebuilt pdfium directory
121
+ /// 3. If not set, downloads pdfium to out_dir (with caching)
122
+ /// 4. Returns PathBuf to pdfium directory
123
+ ///
124
+ /// Reuses all existing helper functions:
125
+ /// - `get_pdfium_url_and_lib()` - determines download URL for target
126
+ /// - `download_and_extract_pdfium()` - downloads with retry logic
127
+ /// - `runtime_library_info()` - platform-specific library names
128
+ /// - `prepare_prebuilt_pdfium()` - handles prebuilt copy
129
+ fn download_or_use_prebuilt(target: &str, out_dir: &Path) -> PathBuf {
130
+ let (download_url, _lib_name) = get_pdfium_url_and_lib(target);
17
131
  let pdfium_dir = out_dir.join("pdfium");
18
132
 
133
+ // Check for prebuilt pdfium directory
19
134
  if let Some(prebuilt) = env::var_os("KREUZBERG_PDFIUM_PREBUILT") {
20
135
  let prebuilt_path = PathBuf::from(prebuilt);
21
136
  if prebuilt_path.exists() {
22
137
  prepare_prebuilt_pdfium(&prebuilt_path, &pdfium_dir)
23
138
  .unwrap_or_else(|err| panic!("Failed to copy Pdfium from {}: {}", prebuilt_path.display(), err));
139
+ return pdfium_dir;
24
140
  } else {
25
141
  panic!(
26
142
  "Environment variable KREUZBERG_PDFIUM_PREBUILT points to '{}' but the directory does not exist",
@@ -29,8 +145,10 @@ fn main() {
29
145
  }
30
146
  }
31
147
 
32
- let (runtime_lib_name, runtime_subdir) = runtime_library_info(&target);
33
- let runtime_lib_path = pdfium_dir.join(runtime_subdir).join(&runtime_lib_name);
148
+ // Check if library already exists (cache validation) using flexible detection
149
+ let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
150
+ let lib_found = find_pdfium_library(&pdfium_dir, &runtime_lib_name, runtime_subdir).is_ok();
151
+
34
152
  let import_lib_exists = if target.contains("windows") {
35
153
  let lib_dir = pdfium_dir.join("lib");
36
154
  lib_dir.join("pdfium.lib").exists() || lib_dir.join("pdfium.dll.lib").exists()
@@ -38,14 +156,15 @@ fn main() {
38
156
  true
39
157
  };
40
158
 
41
- if !runtime_lib_path.exists() || !import_lib_exists {
159
+ if !lib_found || !import_lib_exists {
42
160
  tracing::debug!("Pdfium library not found, downloading for target: {}", target);
43
161
  tracing::debug!("Download URL: {}", download_url);
44
162
  download_and_extract_pdfium(&download_url, &pdfium_dir);
45
163
  } else {
46
- tracing::debug!("Pdfium library already present at {}", runtime_lib_path.display());
164
+ tracing::debug!("Pdfium library already cached at {}", pdfium_dir.display());
47
165
  }
48
166
 
167
+ // Windows-specific: ensure pdfium.lib exists
49
168
  if target.contains("windows") {
50
169
  let lib_dir = pdfium_dir.join("lib");
51
170
  let dll_lib = lib_dir.join("pdfium.dll.lib");
@@ -57,38 +176,17 @@ fn main() {
57
176
  }
58
177
  }
59
178
 
60
- let lib_dir = pdfium_dir.join("lib");
61
- println!("cargo:rustc-link-search=native={}", lib_dir.display());
62
- println!("cargo:rustc-link-lib=dylib={}", lib_name);
63
-
64
- if target.contains("darwin") {
65
- println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
66
- println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
67
- } else if target.contains("linux") {
68
- println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
69
- println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
70
- }
71
-
72
- copy_lib_to_package(&pdfium_dir, &target);
73
-
74
- if target.contains("darwin") {
75
- println!("cargo:rustc-link-lib=framework=CoreFoundation");
76
- println!("cargo:rustc-link-lib=framework=CoreGraphics");
77
- println!("cargo:rustc-link-lib=framework=CoreText");
78
- println!("cargo:rustc-link-lib=framework=AppKit");
79
- println!("cargo:rustc-link-lib=dylib=c++");
80
- } else if target.contains("linux") {
81
- println!("cargo:rustc-link-lib=dylib=stdc++");
82
- println!("cargo:rustc-link-lib=dylib=m");
83
- } else if target.contains("windows") {
84
- println!("cargo:rustc-link-lib=dylib=gdi32");
85
- println!("cargo:rustc-link-lib=dylib=user32");
86
- println!("cargo:rustc-link-lib=dylib=advapi32");
87
- }
88
-
89
- println!("cargo:rerun-if-changed=build.rs");
179
+ pdfium_dir
90
180
  }
91
181
 
182
+ // ============================================================================
183
+ // DOWNLOAD UTILITIES
184
+ // ============================================================================
185
+
186
+ /// Fetch the latest release version from a GitHub repository
187
+ ///
188
+ /// Uses curl to query the GitHub API and extract the tag_name from the
189
+ /// latest release JSON response. Falls back to "7529" if API call fails.
92
190
  fn get_latest_version(repo: &str) -> String {
93
191
  let api_url = format!("https://api.github.com/repos/{}/releases/latest", repo);
94
192
 
@@ -113,6 +211,12 @@ fn get_latest_version(repo: &str) -> String {
113
211
  "7529".to_string()
114
212
  }
115
213
 
214
+ /// Get the download URL and library name for the target platform
215
+ ///
216
+ /// Determines platform/architecture from target triple and constructs
217
+ /// the appropriate GitHub release download URL. Supports:
218
+ /// - WASM: paulocoutinhox/pdfium-lib
219
+ /// - Other platforms: bblanchon/pdfium-binaries
116
220
  fn get_pdfium_url_and_lib(target: &str) -> (String, String) {
117
221
  if target.contains("wasm") {
118
222
  let version = env::var("PDFIUM_WASM_VERSION")
@@ -121,11 +225,12 @@ fn get_pdfium_url_and_lib(target: &str) -> (String, String) {
121
225
  .unwrap_or_else(|| get_latest_version("paulocoutinhox/pdfium-lib"));
122
226
  tracing::debug!("Using pdfium-lib version: {}", version);
123
227
 
124
- let wasm_arch = if target.contains("wasm32") { "wasm32" } else { "wasm64" };
228
+ // WASM builds use a single 'wasm.tgz' asset regardless of architecture
229
+ // The archive contains both wasm32 and wasm64 if available
125
230
  return (
126
231
  format!(
127
- "https://github.com/paulocoutinhox/pdfium-lib/releases/download/{}/pdfium-{}.tar.gz",
128
- version, wasm_arch
232
+ "https://github.com/paulocoutinhox/pdfium-lib/releases/download/{}/wasm.tgz",
233
+ version
129
234
  ),
130
235
  "pdfium".to_string(),
131
236
  );
@@ -170,6 +275,15 @@ fn get_pdfium_url_and_lib(target: &str) -> (String, String) {
170
275
  (url, "pdfium".to_string())
171
276
  }
172
277
 
278
+ /// Download and extract PDFium archive with retry logic
279
+ ///
280
+ /// Features:
281
+ /// - Exponential backoff retry (configurable via env vars)
282
+ /// - File type validation (gzip check)
283
+ /// - Windows-specific import library handling (pdfium.dll.lib -> pdfium.lib)
284
+ /// - Environment variables:
285
+ /// - KREUZBERG_PDFIUM_DOWNLOAD_RETRIES: number of retries (default: 5)
286
+ /// - KREUZBERG_PDFIUM_DOWNLOAD_BACKOFF_SECS: initial backoff in seconds (default: 2)
173
287
  fn download_and_extract_pdfium(url: &str, dest_dir: &Path) {
174
288
  fs::create_dir_all(dest_dir).expect("Failed to create pdfium directory");
175
289
 
@@ -281,15 +395,404 @@ fn download_and_extract_pdfium(url: &str, dest_dir: &Path) {
281
395
  tracing::debug!("Pdfium downloaded and extracted successfully");
282
396
  }
283
397
 
284
- fn copy_lib_to_package(pdfium_dir: &Path, target: &str) {
398
+ // ============================================================================
399
+ // PREBUILT HANDLING
400
+ // ============================================================================
401
+
402
+ /// Prepare prebuilt PDFium by copying to destination directory
403
+ ///
404
+ /// Removes existing destination if present, then recursively copies
405
+ /// all files from prebuilt source to destination.
406
+ fn prepare_prebuilt_pdfium(prebuilt_src: &Path, dest_dir: &Path) -> io::Result<()> {
407
+ if dest_dir.exists() {
408
+ fs::remove_dir_all(dest_dir)?;
409
+ }
410
+ copy_dir_all(prebuilt_src, dest_dir)
411
+ }
412
+
413
+ /// Recursively copy directory tree
414
+ ///
415
+ /// Used by `prepare_prebuilt_pdfium()` to copy entire pdfium directory
416
+ /// structure, preserving all files and subdirectories.
417
+ fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
418
+ fs::create_dir_all(dst)?;
419
+ for entry in fs::read_dir(src)? {
420
+ let entry = entry?;
421
+ let file_type = entry.file_type()?;
422
+ let target_path = dst.join(entry.file_name());
423
+ if file_type.is_dir() {
424
+ copy_dir_all(&entry.path(), &target_path)?;
425
+ } else {
426
+ fs::copy(entry.path(), &target_path)?;
427
+ }
428
+ }
429
+ Ok(())
430
+ }
431
+
432
+ // ============================================================================
433
+ // PLATFORM UTILITIES
434
+ // ============================================================================
435
+
436
+ /// Get platform-specific runtime library name and subdirectory
437
+ ///
438
+ /// Returns tuple of (library_name, subdirectory) for the target platform:
439
+ /// - WASM: ("libpdfium.a", "lib")
440
+ /// - Windows: ("pdfium.dll", "bin")
441
+ /// - macOS: ("libpdfium.dylib", "lib")
442
+ /// - Linux: ("libpdfium.so", "lib")
443
+ fn runtime_library_info(target: &str) -> (String, &'static str) {
444
+ if target.contains("wasm") {
445
+ ("libpdfium.a".to_string(), "lib")
446
+ } else if target.contains("windows") {
447
+ ("pdfium.dll".to_string(), "bin")
448
+ } else if target.contains("darwin") {
449
+ ("libpdfium.dylib".to_string(), "lib")
450
+ } else {
451
+ ("libpdfium.so".to_string(), "lib")
452
+ }
453
+ }
454
+
455
+ /// Find PDFium library in archive with flexible directory detection
456
+ ///
457
+ /// Attempts to locate the library at multiple possible locations:
458
+ /// - {subdir}/{lib_name} (standard location)
459
+ /// - {lib_name} (root of archive)
460
+ /// - bin/{lib_name} (alternative location)
461
+ /// - lib/{lib_name} (explicit lib directory)
462
+ ///
463
+ /// This handles variations in archive structure across different platform builds,
464
+ /// particularly macOS ARM64 where the archive structure may differ.
465
+ ///
466
+ /// Returns the full path to the library if found, or an error with available files.
467
+ fn find_pdfium_library(pdfium_dir: &Path, lib_name: &str, expected_subdir: &str) -> Result<PathBuf, String> {
468
+ // Candidates in priority order
469
+ let candidates = [
470
+ pdfium_dir.join(expected_subdir).join(lib_name), // Standard: lib/libpdfium.dylib
471
+ pdfium_dir.join(lib_name), // Root: libpdfium.dylib
472
+ pdfium_dir.join("bin").join(lib_name), // Alternative: bin/libpdfium.dylib
473
+ pdfium_dir.join("lib").join(lib_name), // Explicit lib: lib/libpdfium.dylib
474
+ ];
475
+
476
+ // Try each candidate
477
+ for candidate in &candidates {
478
+ if candidate.exists() {
479
+ tracing::debug!("Found PDFium library at: {}", candidate.display());
480
+ return Ok(candidate.clone());
481
+ }
482
+ }
483
+
484
+ // Library not found - provide detailed error with directory listing
485
+ let mut error_msg = format!(
486
+ "PDFium library not found at expected location: {}/{}\n\n",
487
+ pdfium_dir.display(),
488
+ expected_subdir
489
+ );
490
+ error_msg.push_str("Attempted locations:\n");
491
+ for candidate in &candidates {
492
+ error_msg.push_str(&format!(" - {}\n", candidate.display()));
493
+ }
494
+
495
+ // List actual contents of pdfium directory for debugging
496
+ error_msg.push_str("\nActual archive contents:\n");
497
+ if let Ok(entries) = fs::read_dir(pdfium_dir) {
498
+ for entry in entries.flatten() {
499
+ let path = entry.path();
500
+ let file_type = if path.is_dir() { "dir" } else { "file" };
501
+ error_msg.push_str(&format!(" {} ({})\n", path.display(), file_type));
502
+
503
+ // Show contents of subdirectories
504
+ if path.is_dir()
505
+ && let Ok(sub_entries) = fs::read_dir(&path)
506
+ {
507
+ for sub_entry in sub_entries.flatten() {
508
+ let sub_path = sub_entry.path();
509
+ let sub_type = if sub_path.is_dir() { "dir" } else { "file" };
510
+ error_msg.push_str(&format!(" {} ({})\n", sub_path.display(), sub_type));
511
+ }
512
+ }
513
+ }
514
+ }
515
+
516
+ Err(error_msg)
517
+ }
518
+
519
+ /// Fix macOS install name (rpath) for dynamic library
520
+ ///
521
+ /// Uses install_name_tool to set the install name to @rpath/{lib_name}
522
+ /// to enable relative path loading on macOS.
523
+ fn fix_macos_install_name(lib_path: &Path, lib_name: &str) {
524
+ let new_install_name = format!("@rpath/{}", lib_name);
525
+
526
+ tracing::debug!("Fixing install_name for {} to {}", lib_path.display(), new_install_name);
527
+
528
+ let status = Command::new("install_name_tool")
529
+ .arg("-id")
530
+ .arg(&new_install_name)
531
+ .arg(lib_path)
532
+ .status();
533
+
534
+ match status {
535
+ Ok(s) if s.success() => {
536
+ tracing::debug!("Successfully updated install_name");
537
+ }
538
+ Ok(s) => {
539
+ tracing::debug!("install_name_tool failed with status: {}", s);
540
+ }
541
+ Err(e) => {
542
+ tracing::debug!("Failed to run install_name_tool: {}", e);
543
+ }
544
+ }
545
+ }
546
+
547
+ /// Code sign binary on macOS if needed
548
+ ///
549
+ /// Uses codesign to sign the binary. Identity from KREUZBERG_CODESIGN_IDENTITY
550
+ /// env var (default: "-" for adhoc signing). Only runs on apple-darwin targets.
551
+ fn codesign_if_needed(target: &str, binary: &Path) {
552
+ if !target.contains("apple-darwin") || !binary.exists() {
553
+ return;
554
+ }
555
+
556
+ let identity = env::var("KREUZBERG_CODESIGN_IDENTITY").unwrap_or_else(|_| "-".to_string());
557
+ let status = Command::new("codesign")
558
+ .arg("--force")
559
+ .arg("--timestamp=none")
560
+ .arg("--sign")
561
+ .arg(identity)
562
+ .arg(binary)
563
+ .status();
564
+
565
+ match status {
566
+ Ok(result) if result.success() => {
567
+ tracing::debug!("Codesigned {}", binary.display());
568
+ }
569
+ Ok(result) => {
570
+ tracing::debug!(
571
+ "codesign exited with status {} while signing {}",
572
+ result,
573
+ binary.display()
574
+ );
575
+ }
576
+ Err(err) => {
577
+ tracing::debug!("Failed to run codesign for {}: {}", binary.display(), err);
578
+ }
579
+ }
580
+ }
581
+
582
+ // ============================================================================
583
+ // LINKING STRATEGIES
584
+ // ============================================================================
585
+
586
+ /// Link PDFium dynamically (default)
587
+ ///
588
+ /// Sets up linker to use PDFium as a dynamic library (.dylib/.so/.dll)
589
+ /// with platform-specific rpath configuration for runtime library discovery.
590
+ /// Supports flexible archive structures by adding multiple possible lib directories.
591
+ fn link_dynamically(pdfium_dir: &Path, target: &str) {
285
592
  let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
286
- let src_lib = pdfium_dir.join(runtime_subdir).join(&runtime_lib_name);
287
593
 
288
- if !src_lib.exists() {
289
- tracing::debug!("Source library not found: {}", src_lib.display());
594
+ // Find the actual library location (handles multiple possible archive structures)
595
+ let lib_path = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
596
+ Ok(path) => path.parent().unwrap_or(pdfium_dir).to_path_buf(),
597
+ Err(err) => panic!("{}", err),
598
+ };
599
+
600
+ println!("cargo:rustc-link-search=native={}", lib_path.display());
601
+ println!("cargo:rustc-link-lib=dylib=pdfium");
602
+
603
+ // Also add standard lib directory for compatibility
604
+ let std_lib_dir = pdfium_dir.join("lib");
605
+ if std_lib_dir.exists() && std_lib_dir != lib_path {
606
+ println!("cargo:rustc-link-search=native={}", std_lib_dir.display());
607
+ }
608
+
609
+ // Add bin directory for platforms where it might be needed
610
+ let bin_dir = pdfium_dir.join("bin");
611
+ if bin_dir.exists() && bin_dir != lib_path {
612
+ println!("cargo:rustc-link-search=native={}", bin_dir.display());
613
+ }
614
+
615
+ // Set rpath for dynamic linking
616
+ if target.contains("darwin") {
617
+ println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
618
+ println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
619
+ } else if target.contains("linux") {
620
+ println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
621
+ println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
622
+ }
623
+ }
624
+
625
+ /// Link PDFium statically (pdf-static feature)
626
+ ///
627
+ /// Embeds PDFium into the binary as a static library. Adds system
628
+ /// dependencies required for static linking on Linux.
629
+ /// Supports flexible archive structures by finding library in multiple locations.
630
+ fn link_statically(pdfium_dir: &Path, target: &str) {
631
+ let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
632
+
633
+ // Find the actual library location (handles multiple possible archive structures)
634
+ let lib_path = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
635
+ Ok(path) => path.parent().unwrap_or(pdfium_dir).to_path_buf(),
636
+ Err(err) => panic!("{}", err),
637
+ };
638
+
639
+ println!("cargo:rustc-link-search=native={}", lib_path.display());
640
+ println!("cargo:rustc-link-lib=static=pdfium");
641
+
642
+ // Also add standard lib directory for compatibility
643
+ let std_lib_dir = pdfium_dir.join("lib");
644
+ if std_lib_dir.exists() && std_lib_dir != lib_path {
645
+ println!("cargo:rustc-link-search=native={}", std_lib_dir.display());
646
+ }
647
+
648
+ // Add bin directory for platforms where it might be needed
649
+ let bin_dir = pdfium_dir.join("bin");
650
+ if bin_dir.exists() && bin_dir != lib_path {
651
+ println!("cargo:rustc-link-search=native={}", bin_dir.display());
652
+ }
653
+
654
+ // Static linking requires additional system dependencies
655
+ if target.contains("linux") {
656
+ // Linux requires additional libraries for static linking
657
+ println!("cargo:rustc-link-lib=dylib=pthread");
658
+ println!("cargo:rustc-link-lib=dylib=dl");
659
+ }
660
+ }
661
+
662
+ /// Link PDFium bundled (pdf-bundled feature)
663
+ ///
664
+ /// Links dynamically but copies library to OUT_DIR for embedding in binary.
665
+ /// Each binary extracts and uses its own copy of the PDFium library.
666
+ /// Supports flexible archive structures by finding library in multiple locations.
667
+ fn link_bundled(pdfium_dir: &Path, target: &str, out_dir: &Path) {
668
+ // Link dynamically for build
669
+ link_dynamically(pdfium_dir, target);
670
+
671
+ // Copy library to OUT_DIR for bundling using flexible detection
672
+ let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
673
+ let src_lib = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
674
+ Ok(path) => path,
675
+ Err(err) => panic!("{}", err),
676
+ };
677
+ let bundled_lib = out_dir.join(&runtime_lib_name);
678
+
679
+ fs::copy(&src_lib, &bundled_lib)
680
+ .unwrap_or_else(|err| panic!("Failed to copy library to OUT_DIR for bundling: {}", err));
681
+
682
+ // Emit environment variable with bundled library path
683
+ let bundled_path = bundled_lib
684
+ .to_str()
685
+ .unwrap_or_else(|| panic!("Non-UTF8 path for bundled library: {}", bundled_lib.display()));
686
+ println!("cargo:rustc-env=KREUZBERG_PDFIUM_BUNDLED_PATH={}", bundled_path);
687
+
688
+ tracing::debug!("Bundled PDFium library at: {}", bundled_path);
689
+ }
690
+
691
+ /// Link system-installed PDFium (pdf-system feature)
692
+ ///
693
+ /// Attempts to find PDFium via pkg-config first, then falls back to
694
+ /// environment variables (KREUZBERG_PDFIUM_SYSTEM_PATH, KREUZBERG_PDFIUM_SYSTEM_INCLUDE).
695
+ fn link_system(_target: &str) {
696
+ // Try pkg-config first
697
+ match pkg_config::Config::new().atleast_version("5.0").probe("pdfium") {
698
+ Ok(library) => {
699
+ tracing::debug!("Found system pdfium via pkg-config");
700
+ for include_path in &library.include_paths {
701
+ println!("cargo:include={}", include_path.display());
702
+ }
703
+ return;
704
+ }
705
+ Err(err) => {
706
+ tracing::debug!("pkg-config probe failed: {}", err);
707
+ }
708
+ }
709
+
710
+ // Fallback to environment variables
711
+ let lib_path = env::var("KREUZBERG_PDFIUM_SYSTEM_PATH").ok();
712
+ let include_path = env::var("KREUZBERG_PDFIUM_SYSTEM_INCLUDE").ok();
713
+
714
+ if let Some(lib_dir) = lib_path {
715
+ let lib_dir_path = PathBuf::from(&lib_dir);
716
+ if !lib_dir_path.exists() {
717
+ panic!(
718
+ "KREUZBERG_PDFIUM_SYSTEM_PATH points to '{}' but the directory does not exist",
719
+ lib_dir
720
+ );
721
+ }
722
+
723
+ println!("cargo:rustc-link-search=native={}", lib_dir);
724
+ println!("cargo:rustc-link-lib=dylib=pdfium");
725
+
726
+ if let Some(inc_dir) = include_path {
727
+ println!("cargo:include={}", inc_dir);
728
+ }
729
+
730
+ tracing::debug!("Using system pdfium from: {}", lib_dir);
290
731
  return;
291
732
  }
292
733
 
734
+ // No system pdfium found
735
+ panic!(
736
+ "pdf-system feature enabled but pdfium not found.\n\
737
+ \n\
738
+ Please install pdfium system-wide or provide:\n\
739
+ - KREUZBERG_PDFIUM_SYSTEM_PATH: path to directory containing libpdfium\n\
740
+ - KREUZBERG_PDFIUM_SYSTEM_INCLUDE: path to pdfium headers (optional)\n\
741
+ \n\
742
+ Alternatively, use a different linking strategy:\n\
743
+ - Default (dynamic): cargo build --features pdf\n\
744
+ - Static linking: cargo build --features pdf,pdf-static\n\
745
+ - Bundled: cargo build --features pdf,pdf-bundled"
746
+ );
747
+ }
748
+
749
+ /// Link system frameworks and standard libraries
750
+ ///
751
+ /// Adds platform-specific system libraries required for PDFium linking:
752
+ /// - macOS: CoreFoundation, CoreGraphics, CoreText, AppKit, libc++
753
+ /// - Linux: stdc++, libm
754
+ /// - Windows: gdi32, user32, advapi32
755
+ fn link_system_frameworks(target: &str) {
756
+ if target.contains("darwin") {
757
+ println!("cargo:rustc-link-lib=framework=CoreFoundation");
758
+ println!("cargo:rustc-link-lib=framework=CoreGraphics");
759
+ println!("cargo:rustc-link-lib=framework=CoreText");
760
+ println!("cargo:rustc-link-lib=framework=AppKit");
761
+ println!("cargo:rustc-link-lib=dylib=c++");
762
+ } else if target.contains("linux") {
763
+ println!("cargo:rustc-link-lib=dylib=stdc++");
764
+ println!("cargo:rustc-link-lib=dylib=m");
765
+ } else if target.contains("windows") {
766
+ println!("cargo:rustc-link-lib=dylib=gdi32");
767
+ println!("cargo:rustc-link-lib=dylib=user32");
768
+ println!("cargo:rustc-link-lib=dylib=advapi32");
769
+ }
770
+ }
771
+
772
+ // ============================================================================
773
+ // LIBRARY DISTRIBUTION
774
+ // ============================================================================
775
+
776
+ /// Copy PDFium library to various package directories
777
+ ///
778
+ /// Distributes the compiled/downloaded PDFium library to:
779
+ /// - CLI target directories (debug/release)
780
+ /// - Python package directory
781
+ /// - Node.js package directory
782
+ /// - Ruby gem directory
783
+ ///
784
+ /// On macOS, also fixes install_name and applies code signing.
785
+ /// Supports flexible archive structures by finding library in multiple locations.
786
+ fn copy_lib_to_package(pdfium_dir: &Path, target: &str) {
787
+ let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
788
+ let src_lib = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
789
+ Ok(path) => path,
790
+ Err(err) => {
791
+ tracing::debug!("Failed to locate PDFium library: {}", err);
792
+ return;
793
+ }
794
+ };
795
+
293
796
  if target.contains("darwin") {
294
797
  fix_macos_install_name(&src_lib, &runtime_lib_name);
295
798
  codesign_if_needed(target, &src_lib);
@@ -314,7 +817,6 @@ fn copy_lib_to_package(pdfium_dir: &Path, target: &str) {
314
817
  );
315
818
  }
316
819
 
317
- // Also copy to target/{profile} for Java FFI (Maven expects it here)
318
820
  let simple_target_dir = workspace_root.join("target").join(&profile);
319
821
  if simple_target_dir != target_dir {
320
822
  fs::create_dir_all(&simple_target_dir).ok();
@@ -359,6 +861,10 @@ fn copy_lib_to_package(pdfium_dir: &Path, target: &str) {
359
861
  }
360
862
  }
361
863
 
864
+ /// Copy library to destination if needed (based on modification time)
865
+ ///
866
+ /// Only copies if destination doesn't exist or source is newer than destination.
867
+ /// Applies platform-specific post-processing (code signing on macOS).
362
868
  fn copy_lib_if_needed(src: &Path, dest: &Path, package_name: &str, target: &str) {
363
869
  use std::fs;
364
870
 
@@ -383,92 +889,3 @@ fn copy_lib_if_needed(src: &Path, dest: &Path, package_name: &str, target: &str)
383
889
  }
384
890
  }
385
891
  }
386
-
387
- fn codesign_if_needed(target: &str, binary: &Path) {
388
- if !target.contains("apple-darwin") || !binary.exists() {
389
- return;
390
- }
391
-
392
- let identity = env::var("KREUZBERG_CODESIGN_IDENTITY").unwrap_or_else(|_| "-".to_string());
393
- let status = Command::new("codesign")
394
- .arg("--force")
395
- .arg("--timestamp=none")
396
- .arg("--sign")
397
- .arg(identity)
398
- .arg(binary)
399
- .status();
400
-
401
- match status {
402
- Ok(result) if result.success() => {
403
- tracing::debug!("Codesigned {}", binary.display());
404
- }
405
- Ok(result) => {
406
- tracing::debug!(
407
- "codesign exited with status {} while signing {}",
408
- result,
409
- binary.display()
410
- );
411
- }
412
- Err(err) => {
413
- tracing::debug!("Failed to run codesign for {}: {}", binary.display(), err);
414
- }
415
- }
416
- }
417
-
418
/// Return the PDFium runtime library file name and the subdirectory it lives in
/// for the given target triple.
///
/// The triple is matched by substring: `windows` takes precedence over `darwin`,
/// and every other target falls back to the `.so` name.
fn runtime_library_info(target: &str) -> (String, &'static str) {
    let is_windows = target.contains("windows");
    let is_darwin = target.contains("darwin");

    let (file_name, subdir) = match (is_windows, is_darwin) {
        (true, _) => ("pdfium.dll", "bin"),
        (false, true) => ("libpdfium.dylib", "lib"),
        (false, false) => ("libpdfium.so", "lib"),
    };

    (file_name.to_owned(), subdir)
}
427
-
428
- fn prepare_prebuilt_pdfium(prebuilt_src: &Path, dest_dir: &Path) -> io::Result<()> {
429
- if dest_dir.exists() {
430
- fs::remove_dir_all(dest_dir)?;
431
- }
432
- copy_dir_all(prebuilt_src, dest_dir)
433
- }
434
-
435
/// Recursively copy the contents of `src` into `dst`, creating `dst` (and any
/// missing parents) first.
///
/// Entries reported as directories are copied recursively; every other entry is
/// copied with `fs::copy`.
///
/// # Errors
///
/// Stops at, and returns, the first I/O error encountered.
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
    fs::create_dir_all(dst)?;

    fs::read_dir(src)?.try_for_each(|item| -> io::Result<()> {
        let item = item?;
        let kind = item.file_type()?;
        let source_path = item.path();
        let dest_path = dst.join(item.file_name());

        if kind.is_dir() {
            copy_dir_all(&source_path, &dest_path)
        } else {
            // `fs::copy` returns the number of bytes copied, which is not needed here.
            fs::copy(&source_path, &dest_path).map(|_| ())
        }
    })
}
449
-
450
- fn fix_macos_install_name(lib_path: &Path, lib_name: &str) {
451
- use std::process::Command;
452
-
453
- let new_install_name = format!("@rpath/{}", lib_name);
454
-
455
- tracing::debug!("Fixing install_name for {} to {}", lib_path.display(), new_install_name);
456
-
457
- let status = Command::new("install_name_tool")
458
- .arg("-id")
459
- .arg(&new_install_name)
460
- .arg(lib_path)
461
- .status();
462
-
463
- match status {
464
- Ok(s) if s.success() => {
465
- tracing::debug!("Successfully updated install_name");
466
- }
467
- Ok(s) => {
468
- tracing::debug!("install_name_tool failed with status: {}", s);
469
- }
470
- Err(e) => {
471
- tracing::debug!("Failed to run install_name_tool: {}", e);
472
- }
473
- }
474
- }