kreuzberg 4.0.0.pre.rc.7 → 4.0.0.pre.rc.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +13 -12
  3. data/README.md +22 -0
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +1 -1
  5. data/ext/kreuzberg_rb/native/Cargo.lock +397 -183
  6. data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +36 -13
  8. data/kreuzberg.gemspec +34 -2
  9. data/lib/kreuzberg/cache_api.rb +35 -0
  10. data/lib/kreuzberg/error_context.rb +49 -1
  11. data/lib/kreuzberg/extraction_api.rb +255 -0
  12. data/lib/kreuzberg/version.rb +1 -1
  13. data/lib/kreuzberg.rb +6 -0
  14. data/lib/libpdfium.dylib +0 -0
  15. data/sig/kreuzberg.rbs +9 -0
  16. data/vendor/Cargo.toml +44 -0
  17. data/vendor/kreuzberg/Cargo.toml +65 -35
  18. data/vendor/kreuzberg/README.md +50 -0
  19. data/vendor/kreuzberg/build.rs +548 -190
  20. data/vendor/kreuzberg/src/api/mod.rs +0 -2
  21. data/vendor/kreuzberg/src/core/pipeline.rs +13 -0
  22. data/vendor/kreuzberg/src/embeddings.rs +71 -3
  23. data/vendor/kreuzberg/src/error.rs +1 -1
  24. data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
  25. data/vendor/kreuzberg/src/extraction/html.rs +37 -5
  26. data/vendor/kreuzberg/src/extractors/pdf.rs +99 -47
  27. data/vendor/kreuzberg/src/mcp/mod.rs +3 -2
  28. data/vendor/kreuzberg/src/mcp/server.rs +106 -0
  29. data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
  30. data/vendor/kreuzberg/src/pdf/bundled.rs +346 -0
  31. data/vendor/kreuzberg/src/pdf/metadata.rs +2 -2
  32. data/vendor/kreuzberg/src/pdf/mod.rs +6 -0
  33. data/vendor/kreuzberg/src/pdf/rendering.rs +2 -2
  34. data/vendor/kreuzberg/src/pdf/table.rs +3 -0
  35. data/vendor/kreuzberg/src/pdf/text.rs +2 -2
  36. data/vendor/kreuzberg/src/text/quality_processor.rs +1 -1
  37. data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -1
  38. data/vendor/kreuzberg/tests/format_integration.rs +4 -1
  39. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
  40. data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
  41. data/vendor/kreuzberg-ffi/README.md +851 -0
  42. data/vendor/kreuzberg-ffi/build.rs +176 -0
  43. data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
  44. data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
  45. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  46. data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
  47. data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
  48. data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
  49. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  50. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  51. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  52. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  53. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  54. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  55. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  56. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  57. data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
  58. data/vendor/kreuzberg-tesseract/LICENSE +22 -0
  59. data/vendor/kreuzberg-tesseract/README.md +399 -0
  60. data/vendor/kreuzberg-tesseract/build.rs +1354 -0
  61. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  62. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  63. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  64. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  65. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  66. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  67. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  68. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  69. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  70. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  71. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  72. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  73. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  74. data/vendor/rb-sys/src/lib.rs +1 -0
  75. metadata +41 -3
  76. data/vendor/rb-sys/bin/release.sh +0 -22
@@ -0,0 +1,176 @@
1
+ use std::env;
2
+ use std::fs;
3
+ use std::path::{Path, PathBuf};
4
+
5
+ fn main() {
6
+ if let Err(e) = run() {
7
+ eprintln!("Build script error: {}", e);
8
+ std::process::exit(1);
9
+ }
10
+ }
11
+
12
+ fn run() -> Result<(), String> {
13
+ let crate_dir = env::var("CARGO_MANIFEST_DIR").map_err(|_| "CARGO_MANIFEST_DIR not set".to_string())?;
14
+
15
+ let config =
16
+ cbindgen::Config::from_file("cbindgen.toml").map_err(|e| format!("Failed to load cbindgen config: {}", e))?;
17
+
18
+ cbindgen::generate_with_config(&crate_dir, config)
19
+ .map_err(|e| format!("Failed to generate C bindings: {}", e))?
20
+ .write_to_file("kreuzberg.h");
21
+
22
+ // Generate pkg-config files
23
+ let pc_template = std::fs::read_to_string("kreuzberg-ffi.pc.in")
24
+ .map_err(|e| format!("Failed to read pkg-config template: {}", e))?;
25
+
26
+ let version = env::var("CARGO_PKG_VERSION").map_err(|_| "CARGO_PKG_VERSION not set".to_string())?;
27
+
28
+ let repo_root = Path::new(&crate_dir).parent().and_then(|p| p.parent()).ok_or_else(|| {
29
+ "CARGO_MANIFEST_DIR did not have expected depth (expected crates/kreuzberg-ffi/...)".to_string()
30
+ })?;
31
+
32
+ // Normalize paths to use forward slashes for pkg-config compatibility across all platforms
33
+ let dev_prefix = repo_root.to_string_lossy().replace('\\', "/");
34
+
35
+ // Platform-specific private libs - detect both OS and target environment
36
+ // Use CARGO_CFG_TARGET_OS for cross-compilation support and CARGO_CFG_TARGET_ENV for MSVC detection
37
+ let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_else(|_| "unknown".to_string());
38
+ let target_env = env::var("CARGO_CFG_TARGET_ENV").unwrap_or_else(|_| "gnu".to_string());
39
+
40
+ let libs_private = match target_os.as_str() {
41
+ "linux" => "-lpthread -ldl -lm",
42
+ "macos" => "-framework CoreFoundation -framework Security -lpthread",
43
+ "windows" => match target_env.as_str() {
44
+ "msvc" => "-lws2_32 -luserenv -lbcrypt",
45
+ // gnu targets (MinGW, etc.) support GCC-specific flags
46
+ "gnu" => "-lpthread -lws2_32 -luserenv -lbcrypt -static-libgcc -static-libstdc++",
47
+ _ => "-lws2_32 -luserenv -lbcrypt",
48
+ },
49
+ _ => "",
50
+ };
51
+
52
+ let out_dir = PathBuf::from(env::var("OUT_DIR").map_err(|_| "OUT_DIR not set".to_string())?);
53
+ let profile_dir = out_dir
54
+ .ancestors()
55
+ .nth(3)
56
+ .ok_or_else(|| "OUT_DIR did not have expected depth (expected target/{debug,release}/build/...)".to_string())?;
57
+
58
+ // Copy PDFium library from kreuzberg build output to profile_dir (target/release or target/debug)
59
+ // This is necessary for Java and other language bindings that need bundled-pdfium
60
+ copy_pdfium_to_profile_dir(profile_dir)?;
61
+
62
+ // Development version (for monorepo use) - use actual monorepo paths
63
+ // Normalize path separators for pkg-config compatibility across all platforms
64
+ let dev_libdir = profile_dir.to_string_lossy().replace('\\', "/");
65
+ let dev_includedir = format!("{}/crates/kreuzberg-ffi", dev_prefix);
66
+ let dev_pc = format!(
67
+ r#"prefix={}
68
+ exec_prefix=${{prefix}}
69
+ libdir={}
70
+ includedir={}
71
+
72
+ Name: kreuzberg-ffi
73
+ Description: C FFI bindings for Kreuzberg document intelligence library
74
+ Version: {}
75
+ URL: https://kreuzberg.dev
76
+ Libs: -L${{libdir}} -lkreuzberg_ffi
77
+ Libs.private: {}
78
+ Cflags: -I${{includedir}}
79
+ "#,
80
+ dev_prefix, dev_libdir, dev_includedir, version, libs_private
81
+ );
82
+ std::fs::write("kreuzberg-ffi.pc", dev_pc).map_err(|e| format!("Failed to write development pkg-config: {}", e))?;
83
+
84
+ // Installation version (for release artifacts)
85
+ let install_pc = pc_template
86
+ .replace("@PREFIX@", "/usr/local")
87
+ .replace("@VERSION@", &version)
88
+ .replace("@LIBS_PRIVATE@", libs_private);
89
+ std::fs::write("kreuzberg-ffi-install.pc", install_pc)
90
+ .map_err(|e| format!("Failed to write installation pkg-config: {}", e))?;
91
+
92
+ #[cfg(target_os = "macos")]
93
+ {
94
+ println!("cargo:rustc-link-arg=-rpath");
95
+ println!("cargo:rustc-link-arg=@loader_path");
96
+
97
+ println!("cargo:rustc-link-arg=-rpath");
98
+ println!("cargo:rustc-link-arg=@executable_path/../target/release");
99
+ }
100
+
101
+ println!("cargo:rerun-if-changed=cbindgen.toml");
102
+ println!("cargo:rerun-if-changed=src/lib.rs");
103
+ println!("cargo:rerun-if-changed=kreuzberg-ffi.pc.in");
104
+
105
+ Ok(())
106
+ }
107
+
108
+ /// Copy bundled PDFium library from kreuzberg build output to profile directory.
109
+ /// This enables Java/other language bindings to find libpdfium.dylib/so/dll at runtime.
110
+ fn copy_pdfium_to_profile_dir(profile_dir: &Path) -> Result<(), String> {
111
+ let build_dir = profile_dir.join("build");
112
+
113
+ // Search for PDFium in kreuzberg build output directory
114
+ // Pattern: target/{debug,release}/build/kreuzberg-{hash}/out/libpdfium.*
115
+ if let Ok(entries) = fs::read_dir(&build_dir) {
116
+ for entry in entries.flatten() {
117
+ let path = entry.path();
118
+ if path.is_dir()
119
+ && path
120
+ .file_name()
121
+ .is_some_and(|n| n.to_string_lossy().starts_with("kreuzberg-"))
122
+ {
123
+ let out_dir = path.join("out");
124
+ if out_dir.exists() {
125
+ // Try to copy PDFium from this build directory
126
+ if copy_pdfium_from_dir(&out_dir, profile_dir).is_err() {
127
+ continue; // Try next directory if this one fails
128
+ } else {
129
+ return Ok(()); // Success!
130
+ }
131
+ }
132
+ }
133
+ }
134
+ }
135
+
136
+ // If we get here, PDFium was not found - this is a warning, not an error
137
+ // because PDFium might be system-installed or the bundled-pdfium feature might not be enabled
138
+ eprintln!("Warning: bundled PDFium library not found in build output. Some features may not work.");
139
+ eprintln!("If PDFium is needed, ensure the 'bundled-pdfium' feature is enabled.");
140
+
141
+ Ok(())
142
+ }
143
+
144
/// Copy every PDFium library file found directly inside `src_dir` into `dest_dir`.
///
/// An entry matches when its file name starts with `libpdfium` or `pdfium`. Only regular
/// files are copied: a matching directory is skipped rather than aborting the search, and
/// all matching files are copied (not just the first one encountered), so companion files
/// sharing the prefix end up next to each other in `dest_dir`.
///
/// Each successful copy is logged to stderr.
///
/// # Errors
/// Returns an error message when `src_dir` cannot be read, when copying a matching file
/// fails, or when no matching file exists in `src_dir`.
fn copy_pdfium_from_dir(src_dir: &Path, dest_dir: &Path) -> Result<(), String> {
    let entries = fs::read_dir(src_dir).map_err(|e| format!("Failed to read {}: {}", src_dir.display(), e))?;

    let mut copied_any = false;

    for entry in entries.flatten() {
        let path = entry.path();
        let Some(file_name) = path.file_name() else {
            continue;
        };
        let file_name_str = file_name.to_string_lossy();

        let is_pdfium_name = file_name_str.starts_with("libpdfium") || file_name_str.starts_with("pdfium");
        // `fs::copy` cannot copy directories; skip them so they cannot mask real library files.
        if !is_pdfium_name || !path.is_file() {
            continue;
        }

        let dest_file = dest_dir.join(file_name);
        let bytes_copied = fs::copy(&path, &dest_file).map_err(|e| {
            eprintln!("Failed to copy {} to {}: {}", path.display(), dest_file.display(), e);
            format!("Failed to copy PDFium: {}", e)
        })?;

        eprintln!(
            "Copied {} ({} bytes) to {}",
            path.display(),
            bytes_copied,
            dest_file.display()
        );
        copied_any = true;
    }

    if copied_any {
        Ok(())
    } else {
        Err(format!("PDFium library not found in {}", src_dir.display()))
    }
}
@@ -0,0 +1,27 @@
1
+ language = "C"
2
+ include_guard = "KREUZBERG_FFI_H"
3
+ pragma_once = true
4
+ header = "/* Auto-generated C bindings for Kreuzberg */"
5
+ autogen_warning = "/* Warning, this file is autogenerated by cbindgen. Don't modify this manually. */"
6
+ documentation = true
7
+ line_length = 100
8
+ after_includes = """
9
+ /**
10
+ * Opaque type for extraction configuration.
11
+ * This is an opaque pointer type - callers should not access its internals.
12
+ */
13
+ typedef struct ExtractionConfig ExtractionConfig;
14
+ """
15
+
16
+ [export]
17
+ include = [
18
+ "kreuzberg_extract_file_sync",
19
+ "kreuzberg_free_string",
20
+ "kreuzberg_free_result",
21
+ "kreuzberg_last_error",
22
+ "kreuzberg_version",
23
+ ]
24
+
25
+ [parse]
26
+ parse_deps = false
27
+ include = []
@@ -0,0 +1,12 @@
1
+ prefix=/usr/local
2
+ exec_prefix=${prefix}
3
+ libdir=${exec_prefix}/lib
4
+ includedir=${prefix}/include
5
+
6
+ Name: kreuzberg-ffi
7
+ Description: C FFI bindings for Kreuzberg document intelligence library
8
+ Version: 4.0.0-rc.11
9
+ URL: https://kreuzberg.dev
10
+ Libs: -L${libdir} -lkreuzberg_ffi
11
+ Libs.private: -framework CoreFoundation -framework Security -lpthread
12
+ Cflags: -I${includedir}
@@ -0,0 +1,12 @@
1
+ prefix=@PREFIX@
2
+ exec_prefix=${prefix}
3
+ libdir=${exec_prefix}/lib
4
+ includedir=${prefix}/include
5
+
6
+ Name: kreuzberg-ffi
7
+ Description: C FFI bindings for Kreuzberg document intelligence library
8
+ Version: @VERSION@
9
+ URL: https://kreuzberg.dev
10
+ Libs: -L${libdir} -lkreuzberg_ffi
11
+ Libs.private: @LIBS_PRIVATE@
12
+ Cflags: -I${includedir}