kreuzberg 4.0.0.pre.rc.8 → 4.0.0.pre.rc.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +12 -9
  3. data/README.md +22 -0
  4. data/ext/kreuzberg_rb/native/Cargo.lock +397 -177
  5. data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
  6. data/ext/kreuzberg_rb/native/src/lib.rs +36 -13
  7. data/kreuzberg.gemspec +34 -2
  8. data/lib/kreuzberg/cache_api.rb +35 -0
  9. data/lib/kreuzberg/error_context.rb +49 -1
  10. data/lib/kreuzberg/extraction_api.rb +255 -0
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/lib/kreuzberg.rb +6 -0
  13. data/lib/libpdfium.dylib +0 -0
  14. data/sig/kreuzberg.rbs +9 -0
  15. data/vendor/Cargo.toml +44 -0
  16. data/vendor/kreuzberg/Cargo.toml +61 -38
  17. data/vendor/kreuzberg/README.md +36 -27
  18. data/vendor/kreuzberg/build.rs +197 -245
  19. data/vendor/kreuzberg/src/core/pipeline.rs +13 -0
  20. data/vendor/kreuzberg/src/embeddings.rs +71 -3
  21. data/vendor/kreuzberg/src/error.rs +1 -1
  22. data/vendor/kreuzberg/src/extraction/html.rs +37 -5
  23. data/vendor/kreuzberg/src/extractors/pdf.rs +93 -44
  24. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
  25. data/vendor/kreuzberg/src/pdf/bundled.rs +19 -1
  26. data/vendor/kreuzberg/src/pdf/metadata.rs +2 -2
  27. data/vendor/kreuzberg/src/pdf/mod.rs +2 -0
  28. data/vendor/kreuzberg/src/pdf/rendering.rs +2 -2
  29. data/vendor/kreuzberg/src/pdf/table.rs +3 -0
  30. data/vendor/kreuzberg/src/pdf/text.rs +2 -2
  31. data/vendor/kreuzberg/src/text/quality_processor.rs +1 -1
  32. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -1
  33. data/vendor/kreuzberg/tests/format_integration.rs +4 -1
  34. data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
  35. data/vendor/kreuzberg-ffi/README.md +851 -0
  36. data/vendor/kreuzberg-ffi/build.rs +176 -0
  37. data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
  38. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
  39. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  40. data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
  41. data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
  42. data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
  43. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  44. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  45. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  46. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  47. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  48. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  49. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  50. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  51. data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
  52. data/vendor/kreuzberg-tesseract/LICENSE +22 -0
  53. data/vendor/kreuzberg-tesseract/README.md +399 -0
  54. data/vendor/kreuzberg-tesseract/build.rs +1354 -0
  55. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  56. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  57. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  58. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  59. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  60. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  61. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  62. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  63. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  64. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  65. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  66. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  67. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  68. metadata +39 -3
  69. data/vendor/rb-sys/bin/release.sh +0 -21
@@ -1,3 +1,29 @@
1
+ // Kreuzberg Build Script - PDFium Linking Configuration
2
+ //
3
+ // This build script handles PDFium library downloading and linking for the kreuzberg crate.
4
+ // It supports multiple linking strategies via Cargo features:
5
+ //
6
+ // 1. Default (pdf, bundled-pdfium): Download dynamic library and embed in binary
7
+ // - Self-contained binary that extracts library at runtime
8
+ // - Larger binary size but no external .so dependency
9
+ // - No PDFIUM_*_PATH environment variables needed
10
+ //
11
+ // 2. static-pdfium: Static linking (no runtime dependency)
12
+ // - REQUIRES: PDFIUM_STATIC_LIB_PATH environment variable pointing to libpdfium.a directory
13
+ // - Reason: bblanchon/pdfium-binaries only provides dynamic libraries
14
+ // - Use case: Docker with musl, fully static binaries
15
+ // - Note: libpdfium.a must be obtained separately (e.g., paulocoutinhox/pdfium-lib)
16
+ //
17
+ // 3. system-pdfium: Use system-installed pdfium
18
+ // - Detected via pkg-config or KREUZBERG_PDFIUM_SYSTEM_PATH
19
+ //
20
+ // Environment Variables:
21
+ // - PDFIUM_STATIC_LIB_PATH: Path to directory containing libpdfium.a (for static-pdfium)
22
+ // - KREUZBERG_PDFIUM_PREBUILT: Path to prebuilt pdfium directory (skip download)
23
+ // - KREUZBERG_PDFIUM_SYSTEM_PATH: System pdfium library path (for system-pdfium)
24
+ // - PDFIUM_VERSION: Override version for bblanchon/pdfium-binaries
25
+ // - KREUZBERG_PDFIUM_DOWNLOAD_RETRIES: Number of download retries (default: 5)
26
+
1
27
  use std::env;
2
28
  use std::fs;
3
29
  use std::io;
@@ -9,13 +35,11 @@ use std::time::Duration;
9
35
  /// PDFium linking strategy
10
36
  #[derive(Debug, Clone, Copy, PartialEq, Eq)]
11
37
  enum PdfiumLinkStrategy {
12
- /// Download and link dynamically (default behavior)
13
- DownloadDynamic,
14
- /// Download and link statically (pdf-static feature)
38
+ /// Download and link statically (static-pdfium feature)
15
39
  DownloadStatic,
16
- /// Download, link dynamically, and embed in binary (pdf-bundled feature)
40
+ /// Download, link dynamically, and embed in binary (bundled-pdfium feature)
17
41
  Bundled,
18
- /// Use system-installed pdfium via pkg-config (pdf-system feature)
42
+ /// Use system-installed pdfium via pkg-config (system-pdfium feature)
19
43
  System,
20
44
  }
21
45
 
@@ -35,17 +59,11 @@ fn main() {
35
59
  return;
36
60
  }
37
61
 
38
- validate_feature_exclusivity();
39
62
  let strategy = determine_link_strategy(&target);
40
63
 
41
64
  tracing::debug!("Using PDFium linking strategy: {:?}", strategy);
42
65
 
43
66
  match strategy {
44
- PdfiumLinkStrategy::DownloadDynamic => {
45
- let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
46
- link_dynamically(&pdfium_dir, &target);
47
- copy_lib_to_package(&pdfium_dir, &target);
48
- }
49
67
  PdfiumLinkStrategy::DownloadStatic => {
50
68
  let pdfium_dir = download_or_use_prebuilt(&target, &out_dir);
51
69
  link_statically(&pdfium_dir, &target);
@@ -70,43 +88,49 @@ fn main() {
70
88
  // FEATURE & STRATEGY VALIDATION
71
89
  // ============================================================================
72
90
 
73
- /// Validate that only one linking strategy feature is enabled at a time
74
- fn validate_feature_exclusivity() {
75
- let strategies = [
76
- cfg!(feature = "pdf-static"),
77
- cfg!(feature = "pdf-bundled"),
78
- cfg!(feature = "pdf-system"),
79
- ];
80
- let count = strategies.iter().filter(|&&x| x).count();
81
-
82
- if count > 1 {
83
- panic!(
84
- "Only one of pdf-static, pdf-bundled, pdf-system can be enabled at once.\n\
85
- Please choose a single PDFium linking strategy."
86
- );
87
- }
88
- }
89
-
90
91
  /// Determine which linking strategy to use based on features and target
91
92
  fn determine_link_strategy(target: &str) -> PdfiumLinkStrategy {
92
- // WASM always uses static linking
93
+ // WASM handling: check for PDFIUM_WASM_LIB environment variable
93
94
  if target.contains("wasm") {
94
- return PdfiumLinkStrategy::DownloadStatic;
95
+ if let Ok(wasm_lib) = env::var("PDFIUM_WASM_LIB") {
96
+ println!("cargo:rustc-link-search=native={}", wasm_lib);
97
+ println!("cargo:rustc-link-lib=static=pdfium");
98
+ return PdfiumLinkStrategy::DownloadStatic;
99
+ }
100
+ // For WASM without explicit PDFIUM_WASM_LIB, use bundled strategy
101
+ // This downloads pdfium-lib which provides WASM-compatible builds
102
+ println!("cargo:warning=WASM build using bundled PDFium (set PDFIUM_WASM_LIB to link custom WASM PDFium)");
103
+ return PdfiumLinkStrategy::Bundled;
95
104
  }
96
105
 
97
- // Feature-based strategy selection (priority order)
98
- if cfg!(feature = "pdf-system") {
99
- return PdfiumLinkStrategy::System;
106
+ let system_pdfium = cfg!(feature = "system-pdfium");
107
+ let bundled_pdfium = cfg!(feature = "bundled-pdfium");
108
+ let static_pdfium = cfg!(feature = "static-pdfium");
109
+
110
+ let enabled_count = usize::from(system_pdfium) + usize::from(bundled_pdfium) + usize::from(static_pdfium);
111
+ if enabled_count > 1 {
112
+ println!(
113
+ "cargo:warning=Multiple PDFium linking strategies enabled (static-pdfium={}, bundled-pdfium={}, system-pdfium={}); using bundled-pdfium for this build",
114
+ static_pdfium, bundled_pdfium, system_pdfium
115
+ );
100
116
  }
101
- if cfg!(feature = "pdf-bundled") {
117
+
118
+ // Feature-based strategy selection.
119
+ // Prefer bundled-pdfium when multiple strategies are enabled (e.g. `--all-features`) because it
120
+ // does not require external PDFIUM_STATIC_LIB_PATH and does not depend on a system install.
121
+ if bundled_pdfium {
102
122
  return PdfiumLinkStrategy::Bundled;
103
123
  }
104
- if cfg!(feature = "pdf-static") {
124
+ if system_pdfium {
125
+ return PdfiumLinkStrategy::System;
126
+ }
127
+ if static_pdfium {
105
128
  return PdfiumLinkStrategy::DownloadStatic;
106
129
  }
107
130
 
108
- // Default: download and link dynamically
109
- PdfiumLinkStrategy::DownloadDynamic
131
+ // Default: download the dynamic PDFium library and embed it in the binary (bundled strategy).
132
+ // When only 'pdf' feature is enabled (no linking strategy), default to bundled-pdfium
133
+ PdfiumLinkStrategy::Bundled
110
134
  }
111
135
 
112
136
  // ============================================================================
@@ -136,6 +160,9 @@ fn download_or_use_prebuilt(target: &str, out_dir: &Path) -> PathBuf {
136
160
  if prebuilt_path.exists() {
137
161
  prepare_prebuilt_pdfium(&prebuilt_path, &pdfium_dir)
138
162
  .unwrap_or_else(|err| panic!("Failed to copy Pdfium from {}: {}", prebuilt_path.display(), err));
163
+ if target.contains("windows") {
164
+ ensure_windows_import_library(&pdfium_dir);
165
+ }
139
166
  return pdfium_dir;
140
167
  } else {
141
168
  panic!(
@@ -166,19 +193,34 @@ fn download_or_use_prebuilt(target: &str, out_dir: &Path) -> PathBuf {
166
193
 
167
194
  // Windows-specific: ensure pdfium.lib exists
168
195
  if target.contains("windows") {
169
- let lib_dir = pdfium_dir.join("lib");
170
- let dll_lib = lib_dir.join("pdfium.dll.lib");
171
- let expected_lib = lib_dir.join("pdfium.lib");
172
-
173
- if dll_lib.exists() && !expected_lib.exists() {
174
- tracing::debug!("Renaming cached {} to {}", dll_lib.display(), expected_lib.display());
175
- fs::rename(&dll_lib, &expected_lib).expect("Failed to rename pdfium.dll.lib to pdfium.lib");
176
- }
196
+ ensure_windows_import_library(&pdfium_dir);
177
197
  }
178
198
 
179
199
  pdfium_dir
180
200
  }
181
201
 
202
+ fn ensure_windows_import_library(pdfium_dir: &Path) {
203
+ let lib_dir = pdfium_dir.join("lib");
204
+ let dll_lib = lib_dir.join("pdfium.dll.lib");
205
+ let expected_lib = lib_dir.join("pdfium.lib");
206
+
207
+ if dll_lib.exists() && !expected_lib.exists() {
208
+ tracing::debug!(
209
+ "Ensuring Windows import library at {} (source: {})",
210
+ expected_lib.display(),
211
+ dll_lib.display()
212
+ );
213
+ fs::copy(&dll_lib, &expected_lib).unwrap_or_else(|err| {
214
+ panic!(
215
+ "Failed to copy Windows import library from {} to {}: {}",
216
+ dll_lib.display(),
217
+ expected_lib.display(),
218
+ err
219
+ )
220
+ });
221
+ }
222
+ }
223
+
182
224
  // ============================================================================
183
225
  // DOWNLOAD UTILITIES
184
226
  // ============================================================================
@@ -436,13 +478,14 @@ fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
436
478
  /// Get platform-specific runtime library name and subdirectory
437
479
  ///
438
480
  /// Returns tuple of (library_name, subdirectory) for the target platform:
439
- /// - WASM: ("libpdfium.a", "lib")
481
+ /// - WASM: ("libpdfium.a", "release/lib")
440
482
  /// - Windows: ("pdfium.dll", "bin")
441
483
  /// - macOS: ("libpdfium.dylib", "lib")
442
484
  /// - Linux: ("libpdfium.so", "lib")
443
485
  fn runtime_library_info(target: &str) -> (String, &'static str) {
444
486
  if target.contains("wasm") {
445
- ("libpdfium.a".to_string(), "lib")
487
+ // pdfium-lib `wasm.tgz` extracts into `release/lib/libpdfium.a`
488
+ ("libpdfium.a".to_string(), "release/lib")
446
489
  } else if target.contains("windows") {
447
490
  ("pdfium.dll".to_string(), "bin")
448
491
  } else if target.contains("darwin") {
@@ -516,69 +559,6 @@ fn find_pdfium_library(pdfium_dir: &Path, lib_name: &str, expected_subdir: &str)
516
559
  Err(error_msg)
517
560
  }
518
561
 
519
- /// Fix macOS install name (rpath) for dynamic library
520
- ///
521
- /// Uses install_name_tool to set the install name to @rpath/{lib_name}
522
- /// to enable relative path loading on macOS.
523
- fn fix_macos_install_name(lib_path: &Path, lib_name: &str) {
524
- let new_install_name = format!("@rpath/{}", lib_name);
525
-
526
- tracing::debug!("Fixing install_name for {} to {}", lib_path.display(), new_install_name);
527
-
528
- let status = Command::new("install_name_tool")
529
- .arg("-id")
530
- .arg(&new_install_name)
531
- .arg(lib_path)
532
- .status();
533
-
534
- match status {
535
- Ok(s) if s.success() => {
536
- tracing::debug!("Successfully updated install_name");
537
- }
538
- Ok(s) => {
539
- tracing::debug!("install_name_tool failed with status: {}", s);
540
- }
541
- Err(e) => {
542
- tracing::debug!("Failed to run install_name_tool: {}", e);
543
- }
544
- }
545
- }
546
-
547
- /// Code sign binary on macOS if needed
548
- ///
549
- /// Uses codesign to sign the binary. Identity from KREUZBERG_CODESIGN_IDENTITY
550
- /// env var (default: "-" for adhoc signing). Only runs on apple-darwin targets.
551
- fn codesign_if_needed(target: &str, binary: &Path) {
552
- if !target.contains("apple-darwin") || !binary.exists() {
553
- return;
554
- }
555
-
556
- let identity = env::var("KREUZBERG_CODESIGN_IDENTITY").unwrap_or_else(|_| "-".to_string());
557
- let status = Command::new("codesign")
558
- .arg("--force")
559
- .arg("--timestamp=none")
560
- .arg("--sign")
561
- .arg(identity)
562
- .arg(binary)
563
- .status();
564
-
565
- match status {
566
- Ok(result) if result.success() => {
567
- tracing::debug!("Codesigned {}", binary.display());
568
- }
569
- Ok(result) => {
570
- tracing::debug!(
571
- "codesign exited with status {} while signing {}",
572
- result,
573
- binary.display()
574
- );
575
- }
576
- Err(err) => {
577
- tracing::debug!("Failed to run codesign for {}: {}", binary.display(), err);
578
- }
579
- }
580
- }
581
-
582
562
  // ============================================================================
583
563
  // LINKING STRATEGIES
584
564
  // ============================================================================
@@ -622,18 +602,100 @@ fn link_dynamically(pdfium_dir: &Path, target: &str) {
622
602
  }
623
603
  }
624
604
 
625
- /// Link PDFium statically (pdf-static feature)
605
+ /// Link PDFium statically (static-pdfium feature)
626
606
  ///
627
607
  /// Embeds PDFium into the binary as a static library. Adds system
628
608
  /// dependencies required for static linking on Linux.
629
609
  /// Supports flexible archive structures by finding library in multiple locations.
610
+ ///
611
+ /// Environment Variables:
612
+ /// - `PDFIUM_STATIC_LIB_PATH`: Path to directory containing libpdfium.a (for Docker/musl builds)
613
+ ///
614
+ /// Note: bblanchon/pdfium-binaries only provides dynamic libraries.
615
+ /// On macOS, this will fall back to dynamic linking with a warning.
616
+ /// On Linux, you must provide PDFIUM_STATIC_LIB_PATH pointing to a static build.
630
617
  fn link_statically(pdfium_dir: &Path, target: &str) {
631
- let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
618
+ // For static linking, we need libpdfium.a (not .dylib or .so)
619
+ let static_lib_name = "libpdfium.a";
620
+ let lib_subdir = if target.contains("wasm") { "release/lib" } else { "lib" };
621
+
622
+ // First, check if user provided a static library path via environment variable
623
+ if let Ok(custom_path) = env::var("PDFIUM_STATIC_LIB_PATH") {
624
+ let custom_lib_dir = PathBuf::from(&custom_path);
625
+
626
+ if !custom_lib_dir.exists() {
627
+ panic!(
628
+ "PDFIUM_STATIC_LIB_PATH points to '{}' but the directory does not exist",
629
+ custom_path
630
+ );
631
+ }
632
+
633
+ let custom_lib = custom_lib_dir.join(static_lib_name);
634
+ if !custom_lib.exists() {
635
+ panic!(
636
+ "PDFIUM_STATIC_LIB_PATH points to '{}' but {} not found.\n\
637
+ Expected to find: {}",
638
+ custom_path,
639
+ static_lib_name,
640
+ custom_lib.display()
641
+ );
642
+ }
643
+
644
+ tracing::debug!("Using custom static PDFium from: {}", custom_lib.display());
645
+ println!("cargo:rustc-link-search=native={}", custom_lib_dir.display());
646
+ println!("cargo:rustc-link-lib=static=pdfium");
647
+
648
+ // Static linking requires additional system dependencies
649
+ if target.contains("linux") {
650
+ println!("cargo:rustc-link-lib=dylib=pthread");
651
+ println!("cargo:rustc-link-lib=dylib=dl");
652
+ } else if target.contains("windows") {
653
+ println!("cargo:rustc-link-lib=dylib=ws2_32");
654
+ println!("cargo:rustc-link-lib=dylib=userenv");
655
+ }
656
+
657
+ return;
658
+ }
632
659
 
633
660
  // Find the actual library location (handles multiple possible archive structures)
634
- let lib_path = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
661
+ let lib_path = match find_pdfium_library(pdfium_dir, static_lib_name, lib_subdir) {
635
662
  Ok(path) => path.parent().unwrap_or(pdfium_dir).to_path_buf(),
636
- Err(err) => panic!("{}", err),
663
+ Err(_err) => {
664
+ // Static library not found - check if we're on macOS and can fallback
665
+ if target.contains("darwin") {
666
+ eprintln!("cargo:warning=Static PDFium library (libpdfium.a) not found for macOS.");
667
+ eprintln!("cargo:warning=bblanchon/pdfium-binaries only provides dynamic libraries.");
668
+ eprintln!("cargo:warning=Falling back to dynamic linking for local development.");
669
+ eprintln!("cargo:warning=Production Linux builds require PDFIUM_STATIC_LIB_PATH.");
670
+
671
+ // Fallback to dynamic linking on macOS
672
+ link_dynamically(pdfium_dir, target);
673
+ return;
674
+ } else {
675
+ // On Linux/Windows, provide helpful error with actionable steps
676
+ panic!(
677
+ "Static PDFium library (libpdfium.a) not found.\n\n\
678
+ bblanchon/pdfium-binaries only provides dynamic libraries.\n\n\
679
+ For static linking (required for Docker with musl), you must:\n\n\
680
+ 1. Build static PDFium or obtain from a source that provides it\n\
681
+ - See: https://github.com/ajrcarey/pdfium-render/issues/53\n\
682
+ - Or use: https://github.com/paulocoutinhox/pdfium-lib (provides static builds)\n\n\
683
+ 2. Set environment variable pointing to the directory containing libpdfium.a:\n\
684
+ export PDFIUM_STATIC_LIB_PATH=/path/to/pdfium/lib\n\n\
685
+ 3. Or use alternative features:\n\
686
+ - 'pdf' (dynamic linking, requires .so at runtime)\n\
687
+ - 'bundled-pdfium' (embeds dynamic library in binary)\n\
688
+ - 'system-pdfium' (use system-installed pdfium)\n\n\
689
+ Example Dockerfile pattern:\n\
690
+ FROM alpine:latest as pdfium-builder\n\
691
+ # Download/build static libpdfium.a\n\
692
+ \n\
693
+ FROM rust:alpine as builder\n\
694
+ ENV PDFIUM_STATIC_LIB_PATH=/pdfium/lib\n\
695
+ COPY --from=pdfium-builder /path/to/libpdfium.a /pdfium/lib/"
696
+ );
697
+ }
698
+ }
637
699
  };
638
700
 
639
701
  println!("cargo:rustc-link-search=native={}", lib_path.display());
@@ -653,21 +715,22 @@ fn link_statically(pdfium_dir: &Path, target: &str) {
653
715
 
654
716
  // Static linking requires additional system dependencies
655
717
  if target.contains("linux") {
656
- // Linux requires additional libraries for static linking
657
718
  println!("cargo:rustc-link-lib=dylib=pthread");
658
719
  println!("cargo:rustc-link-lib=dylib=dl");
720
+ } else if target.contains("windows") {
721
+ println!("cargo:rustc-link-lib=dylib=ws2_32");
722
+ println!("cargo:rustc-link-lib=dylib=userenv");
659
723
  }
660
724
  }
661
725
 
662
- /// Link PDFium bundled (pdf-bundled feature)
726
+ /// Link PDFium bundled (bundled-pdfium feature)
663
727
  ///
664
728
  /// Links dynamically but copies library to OUT_DIR for embedding in binary.
665
729
  /// Each binary extracts and uses its own copy of the PDFium library.
666
730
  /// Supports flexible archive structures by finding library in multiple locations.
731
+ ///
732
+ /// For WASM targets, links statically using the bundled static library.
667
733
  fn link_bundled(pdfium_dir: &Path, target: &str, out_dir: &Path) {
668
- // Link dynamically for build
669
- link_dynamically(pdfium_dir, target);
670
-
671
734
  // Copy library to OUT_DIR for bundling using flexible detection
672
735
  let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
673
736
  let src_lib = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
@@ -685,10 +748,20 @@ fn link_bundled(pdfium_dir: &Path, target: &str, out_dir: &Path) {
685
748
  .unwrap_or_else(|| panic!("Non-UTF8 path for bundled library: {}", bundled_lib.display()));
686
749
  println!("cargo:rustc-env=KREUZBERG_PDFIUM_BUNDLED_PATH={}", bundled_path);
687
750
 
688
- tracing::debug!("Bundled PDFium library at: {}", bundled_path);
751
+ // For WASM, link statically using the bundled library
752
+ if target.contains("wasm") {
753
+ let lib_dir = bundled_lib
754
+ .parent()
755
+ .unwrap_or_else(|| panic!("Invalid bundled library path: {}", bundled_lib.display()));
756
+ println!("cargo:rustc-link-search=native={}", lib_dir.display());
757
+ println!("cargo:rustc-link-lib=static=pdfium");
758
+ tracing::debug!("Bundled PDFium static library linked for WASM at: {}", bundled_path);
759
+ } else {
760
+ tracing::debug!("Bundled PDFium library at: {}", bundled_path);
761
+ }
689
762
  }
690
763
 
691
- /// Link system-installed PDFium (pdf-system feature)
764
+ /// Link system-installed PDFium (system-pdfium feature)
692
765
  ///
693
766
  /// Attempts to find PDFium via pkg-config first, then falls back to
694
767
  /// environment variables (KREUZBERG_PDFIUM_SYSTEM_PATH, KREUZBERG_PDFIUM_SYSTEM_INCLUDE).
@@ -733,7 +806,7 @@ fn link_system(_target: &str) {
733
806
 
734
807
  // No system pdfium found
735
808
  panic!(
736
- "pdf-system feature enabled but pdfium not found.\n\
809
+ "system-pdfium feature enabled but pdfium not found.\n\
737
810
  \n\
738
811
  Please install pdfium system-wide or provide:\n\
739
812
  - KREUZBERG_PDFIUM_SYSTEM_PATH: path to directory containing libpdfium\n\
@@ -741,8 +814,8 @@ fn link_system(_target: &str) {
741
814
  \n\
742
815
  Alternatively, use a different linking strategy:\n\
743
816
  - Default (dynamic): cargo build --features pdf\n\
744
- - Static linking: cargo build --features pdf,pdf-static\n\
745
- - Bundled: cargo build --features pdf,pdf-bundled"
817
+ - Static linking: cargo build --features pdf,static-pdfium\n\
818
+ - Bundled: cargo build --features pdf,bundled-pdfium"
746
819
  );
747
820
  }
748
821
 
@@ -768,124 +841,3 @@ fn link_system_frameworks(target: &str) {
768
841
  println!("cargo:rustc-link-lib=dylib=advapi32");
769
842
  }
770
843
  }
771
-
772
- // ============================================================================
773
- // LIBRARY DISTRIBUTION
774
- // ============================================================================
775
-
776
- /// Copy PDFium library to various package directories
777
- ///
778
- /// Distributes the compiled/downloaded PDFium library to:
779
- /// - CLI target directories (debug/release)
780
- /// - Python package directory
781
- /// - Node.js package directory
782
- /// - Ruby gem directory
783
- ///
784
- /// On macOS, also fixes install_name and applies code signing.
785
- /// Supports flexible archive structures by finding library in multiple locations.
786
- fn copy_lib_to_package(pdfium_dir: &Path, target: &str) {
787
- let (runtime_lib_name, runtime_subdir) = runtime_library_info(target);
788
- let src_lib = match find_pdfium_library(pdfium_dir, &runtime_lib_name, runtime_subdir) {
789
- Ok(path) => path,
790
- Err(err) => {
791
- tracing::debug!("Failed to locate PDFium library: {}", err);
792
- return;
793
- }
794
- };
795
-
796
- if target.contains("darwin") {
797
- fix_macos_install_name(&src_lib, &runtime_lib_name);
798
- codesign_if_needed(target, &src_lib);
799
- }
800
-
801
- let crate_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
802
- let workspace_root = crate_dir.parent().unwrap().parent().unwrap();
803
-
804
- if let Ok(profile) = env::var("PROFILE") {
805
- let target_dir = if let Ok(cargo_target) = env::var("TARGET") {
806
- workspace_root.join("target").join(cargo_target).join(&profile)
807
- } else {
808
- workspace_root.join("target").join(&profile)
809
- };
810
-
811
- if target_dir.exists() {
812
- copy_lib_if_needed(
813
- &src_lib,
814
- &target_dir.join(&runtime_lib_name),
815
- "CLI target directory",
816
- target,
817
- );
818
- }
819
-
820
- let simple_target_dir = workspace_root.join("target").join(&profile);
821
- if simple_target_dir != target_dir {
822
- fs::create_dir_all(&simple_target_dir).ok();
823
- copy_lib_if_needed(
824
- &src_lib,
825
- &simple_target_dir.join(&runtime_lib_name),
826
- "Java FFI target directory",
827
- target,
828
- );
829
- }
830
- }
831
-
832
- let python_dest_dir = workspace_root.join("packages").join("python").join("kreuzberg");
833
- if python_dest_dir.exists() {
834
- copy_lib_if_needed(
835
- &src_lib,
836
- &python_dest_dir.join(&runtime_lib_name),
837
- "Python package",
838
- target,
839
- );
840
- } else {
841
- tracing::debug!("Python package directory not found, skipping Python library copy");
842
- }
843
-
844
- let node_dest_dir = workspace_root.join("crates").join("kreuzberg-node");
845
- if node_dest_dir.exists() {
846
- copy_lib_if_needed(
847
- &src_lib,
848
- &node_dest_dir.join(&runtime_lib_name),
849
- "Node.js package",
850
- target,
851
- );
852
- } else {
853
- tracing::debug!("Node.js package directory not found, skipping Node library copy");
854
- }
855
-
856
- let ruby_dest_dir = workspace_root.join("packages").join("ruby").join("lib");
857
- if ruby_dest_dir.exists() {
858
- copy_lib_if_needed(&src_lib, &ruby_dest_dir.join(&runtime_lib_name), "Ruby package", target);
859
- } else {
860
- tracing::debug!("Ruby package directory not found, skipping Ruby library copy");
861
- }
862
- }
863
-
864
- /// Copy library to destination if needed (based on modification time)
865
- ///
866
- /// Only copies if destination doesn't exist or source is newer than destination.
867
- /// Applies platform-specific post-processing (code signing on macOS).
868
- fn copy_lib_if_needed(src: &Path, dest: &Path, package_name: &str, target: &str) {
869
- use std::fs;
870
-
871
- let should_copy = if dest.exists() {
872
- let src_metadata = fs::metadata(src).ok();
873
- let dest_metadata = fs::metadata(dest).ok();
874
- match (src_metadata, dest_metadata) {
875
- (Some(src), Some(dest)) => src.modified().ok() > dest.modified().ok(),
876
- _ => true,
877
- }
878
- } else {
879
- true
880
- };
881
-
882
- if should_copy {
883
- match fs::copy(src, dest) {
884
- Ok(_) => {
885
- tracing::debug!("Copied {} to {} ({})", src.display(), dest.display(), package_name);
886
- codesign_if_needed(target, dest);
887
- }
888
- Err(e) => tracing::debug!("Failed to copy library to {}: {}", package_name, e),
889
- }
890
- }
891
- }
@@ -653,6 +653,18 @@ mod tests {
653
653
  #[tokio::test]
654
654
  #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
655
655
  async fn test_pipeline_with_keyword_extraction() {
656
+ let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
657
+ crate::plugins::registry::get_validator_registry()
658
+ .write()
659
+ .unwrap()
660
+ .shutdown_all()
661
+ .unwrap();
662
+ crate::plugins::registry::get_post_processor_registry()
663
+ .write()
664
+ .unwrap()
665
+ .shutdown_all()
666
+ .unwrap();
667
+
656
668
  let _ = crate::keywords::register_keyword_processor();
657
669
 
658
670
  let result = ExtractionResult {
@@ -703,6 +715,7 @@ Natural language processing enables computers to understand human language.
703
715
  #[tokio::test]
704
716
  #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
705
717
  async fn test_pipeline_without_keyword_config() {
718
+ let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
706
719
  let result = ExtractionResult {
707
720
  content: "Machine learning and artificial intelligence.".to_string(),
708
721
  mime_type: "text/plain".to_string(),