kreuzberg 4.0.0.pre.rc.15 → 4.0.0.pre.rc.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +5 -0
  3. data/Gemfile.lock +3 -2
  4. data/ext/kreuzberg_rb/native/Cargo.lock +1 -1
  5. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  6. data/ext/kreuzberg_rb/native/build.rs +69 -46
  7. data/ext/kreuzberg_rb/native/src/lib.rs +683 -11
  8. data/lib/kreuzberg/config.rb +111 -8
  9. data/lib/kreuzberg/error_context.rb +76 -0
  10. data/lib/kreuzberg/result.rb +78 -0
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +8 -0
  13. data/spec/binding/batch_spec.rb +374 -0
  14. data/spec/binding/config_result_spec.rb +377 -0
  15. data/spec/binding/config_validation_spec.rb +98 -0
  16. data/vendor/Cargo.toml +1 -1
  17. data/vendor/kreuzberg/Cargo.toml +15 -2
  18. data/vendor/kreuzberg/benches/token_reduction.rs +135 -0
  19. data/vendor/kreuzberg/src/chunking/mod.rs +464 -28
  20. data/vendor/kreuzberg/src/core/batch_optimizations.rs +304 -0
  21. data/vendor/kreuzberg/src/core/config_validation.rs +662 -0
  22. data/vendor/kreuzberg/src/core/extractor.rs +19 -2
  23. data/vendor/kreuzberg/src/core/formats.rs +251 -0
  24. data/vendor/kreuzberg/src/core/mod.rs +12 -0
  25. data/vendor/kreuzberg/src/core/pipeline.rs +103 -32
  26. data/vendor/kreuzberg/src/extraction/archive.rs +18 -6
  27. data/vendor/kreuzberg/src/extraction/docx.rs +7 -3
  28. data/vendor/kreuzberg/src/extraction/email.rs +15 -11
  29. data/vendor/kreuzberg/src/extraction/excel.rs +24 -5
  30. data/vendor/kreuzberg/src/extraction/html.rs +10 -15
  31. data/vendor/kreuzberg/src/extraction/markdown.rs +5 -2
  32. data/vendor/kreuzberg/src/extraction/pptx.rs +8 -6
  33. data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
  34. data/vendor/kreuzberg/src/extraction/table.rs +3 -1
  35. data/vendor/kreuzberg/src/extraction/text.rs +27 -10
  36. data/vendor/kreuzberg/src/extractors/html.rs +2 -1
  37. data/vendor/kreuzberg/src/extractors/pdf.rs +74 -42
  38. data/vendor/kreuzberg/src/lib.rs +2 -2
  39. data/vendor/kreuzberg/src/ocr/language_registry.rs +526 -0
  40. data/vendor/kreuzberg/src/ocr/mod.rs +2 -0
  41. data/vendor/kreuzberg/src/pdf/bindings.rs +202 -19
  42. data/vendor/kreuzberg/src/pdf/bundled.rs +12 -3
  43. data/vendor/kreuzberg/src/pdf/metadata.rs +8 -0
  44. data/vendor/kreuzberg/src/pdf/rendering.rs +4 -0
  45. data/vendor/kreuzberg/src/pdf/text.rs +164 -30
  46. data/vendor/kreuzberg/src/text/mod.rs +2 -0
  47. data/vendor/kreuzberg/src/text/quality_processor.rs +37 -12
  48. data/vendor/kreuzberg/src/text/string_utils.rs +27 -10
  49. data/vendor/kreuzberg/src/text/token_reduction/core.rs +37 -5
  50. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +24 -10
  51. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +2 -1
  52. data/vendor/kreuzberg/src/text/utf8_validation.rs +197 -0
  53. data/vendor/kreuzberg/src/types.rs +380 -6
  54. data/vendor/kreuzberg/src/utils/mod.rs +11 -0
  55. data/vendor/kreuzberg/src/utils/pool.rs +364 -0
  56. data/vendor/kreuzberg/src/utils/quality.rs +12 -3
  57. data/vendor/kreuzberg/src/utils/string_pool.rs +424 -0
  58. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +169 -0
  59. data/vendor/kreuzberg/tests/ocr_language_registry.rs +207 -0
  60. data/vendor/kreuzberg/tests/pipeline_integration.rs +3 -1
  61. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +17 -0
  62. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  63. metadata +14 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1ac94696cb48598d98ae55f75c69c59e1d248577b965a3921e21998ee33d2352
4
- data.tar.gz: 684e9f74a5f0d5c2c52677fec3cec493707b084dc77815396b237864dfeded90
3
+ metadata.gz: f91977b1472bb6211f3ac2efad274e8cbc77dc5ed9832529eccbebeae1f74b4f
4
+ data.tar.gz: b8a32377a80cfec656e8ddd65576dc220f497fc65b7b45b307db54a0b3b4a274
5
5
  SHA512:
6
- metadata.gz: 6ed0b13217aad741e169850f155a28f921a37a41ffa95fb12a733798b49625f7a9db030eae90ddf00ee3e367b5a563a426fa301c8a16604c8ad5ca3ba78432fc
7
- data.tar.gz: 6c3acf2fb24f573a65e81fdac91f3735a6e2335c340d79a453d73fb43b63b807a8b9e93bbbba38a8c55550be72f0f503513b142238a1c3965e279d8ed522b3ae
6
+ metadata.gz: cb391d9f82848e0b19b0c8df2cce7db455d1b73ba5e5c6dd63a2cc87732d8dd0cd6596ca7f9b305061d9400db95c5890292efbd16af7e55a9434f3f29a337642
7
+ data.tar.gz: 9e41afcc217e00d9feb3f8c4adecb7152743227f2c77f4bcfd9fd5e3d4b64b01171d3bdbb2b1290e10d2efdf13cdb53feb8fee01a95b6ba4d87ea76425b56692
data/.rubocop.yml CHANGED
@@ -52,6 +52,7 @@ Metrics/AbcSize:
52
52
  Exclude:
53
53
  - 'spec/**/*'
54
54
  - 'examples/**/*'
55
+ - 'lib/kreuzberg/config.rb'
55
56
 
56
57
  Naming/FileName:
57
58
  Enabled: true
@@ -99,6 +100,10 @@ Metrics/PerceivedComplexity:
99
100
  Exclude:
100
101
  - 'lib/kreuzberg/config.rb'
101
102
 
103
+ Metrics/ClassLength:
104
+ Exclude:
105
+ - 'lib/kreuzberg/config.rb'
106
+
102
107
  RSpec/RepeatedExampleGroupBody:
103
108
  Enabled: false
104
109
 
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.0.0.pre.rc.15)
4
+ kreuzberg (4.0.0.pre.rc.17)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -33,7 +33,7 @@ GEM
33
33
  ffi (1.17.2-arm64-darwin)
34
34
  ffi (1.17.2-x86_64-linux-gnu)
35
35
  fileutils (1.8.0)
36
- i18n (1.14.7)
36
+ i18n (1.14.8)
37
37
  concurrent-ruby (~> 1.0)
38
38
  json (2.18.0)
39
39
  language_server-protocol (3.17.0.5)
@@ -137,6 +137,7 @@ GEM
137
137
 
138
138
  PLATFORMS
139
139
  arm64-darwin-24
140
+ arm64-darwin-25
140
141
  x86_64-linux
141
142
 
142
143
  DEPENDENCIES
@@ -2354,7 +2354,7 @@ dependencies = [
2354
2354
 
2355
2355
  [[package]]
2356
2356
  name = "kreuzberg-rb"
2357
- version = "4.0.0-rc.15"
2357
+ version = "4.0.0-rc.17"
2358
2358
  dependencies = [
2359
2359
  "async-trait",
2360
2360
  "html-to-markdown-rs",
@@ -7,7 +7,7 @@ rb-sys = { path = "../../../vendor/rb-sys" }
7
7
 
8
8
  [package]
9
9
  name = "kreuzberg-rb"
10
- version = "4.0.0-rc.15"
10
+ version = "4.0.0-rc.17"
11
11
  edition = "2024"
12
12
  rust-version = "1.91"
13
13
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,52 +1,75 @@
1
- #[cfg(target_os = "macos")]
2
- fn main() {
3
- if let Ok(cargo_manifest_dir) = std::env::var("CARGO_MANIFEST_DIR") {
4
- let lib_path = std::path::Path::new(&cargo_manifest_dir)
5
- .parent()
6
- .and_then(|p| p.parent())
7
- .and_then(|p| p.parent())
8
- .and_then(|p| p.parent())
9
- .and_then(|p| p.parent())
10
- .map(|p| p.join("target/release"))
11
- .expect("Failed to construct lib path");
12
- println!("cargo:rustc-link-search={}", lib_path.display());
13
- }
14
- println!("cargo:rustc-link-arg=-Wl,-undefined,dynamic_lookup");
15
- println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
16
- println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
17
- }
1
+ use std::env;
2
+ use std::path::PathBuf;
18
3
 
19
- #[cfg(target_os = "linux")]
20
4
  fn main() {
21
- if let Ok(cargo_manifest_dir) = std::env::var("CARGO_MANIFEST_DIR") {
22
- let lib_path = std::path::Path::new(&cargo_manifest_dir)
23
- .parent()
24
- .and_then(|p| p.parent())
25
- .and_then(|p| p.parent())
26
- .and_then(|p| p.parent())
27
- .and_then(|p| p.parent())
28
- .map(|p| p.join("target/release"))
29
- .expect("Failed to construct lib path");
30
- println!("cargo:rustc-link-search={}", lib_path.display());
5
+ let target = env::var("TARGET").unwrap();
6
+ let profile = env::var("PROFILE").unwrap_or_else(|_| "release".to_string());
7
+
8
+ // Try to locate kreuzberg-ffi library built alongside this crate
9
+ let cargo_manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
10
+ let manifest_path = PathBuf::from(&cargo_manifest_dir);
11
+
12
+ // Prefer host target layout, but include target-triple layout for cross builds.
13
+ // IMPORTANT: Only search lib directories, NOT deps directories.
14
+ // The deps/ directories may contain dylibs with hardcoded install_name paths,
15
+ // which causes load errors on macOS when users install the gem.
16
+ if let Some(packages_root) = manifest_path
17
+ .parent()
18
+ .and_then(|p| p.parent())
19
+ .and_then(|p| p.parent())
20
+ .and_then(|p| p.parent())
21
+ .and_then(|p| p.parent())
22
+ {
23
+ let host_lib_dir = packages_root.join("target").join(&profile);
24
+ let target_lib_dir = packages_root.join("target").join(&target).join(&profile);
25
+
26
+ // Try to find the static library and link it directly on Unix-like systems
27
+ // to avoid the linker preferring dylib over static lib.
28
+ if !target.contains("windows") {
29
+ let static_lib_name = if target.contains("windows") {
30
+ "kreuzberg_ffi.lib"
31
+ } else {
32
+ "libkreuzberg_ffi.a"
33
+ };
34
+
35
+ // Check both host and target lib directories for the static library
36
+ for lib_dir in [&host_lib_dir, &target_lib_dir] {
37
+ let static_lib = lib_dir.join(static_lib_name);
38
+ if static_lib.exists() {
39
+ // Found static library, link it directly by passing the full path
40
+ println!("cargo:rustc-link-arg={}", static_lib.display());
41
+ // Don't add the library search path or -l flag
42
+ // Jump to platform-specific configuration
43
+ if target.contains("darwin") {
44
+ println!("cargo:rustc-link-arg=-Wl,-undefined,dynamic_lookup");
45
+ println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
46
+ } else if target.contains("linux") {
47
+ println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
48
+ }
49
+ println!("cargo:rerun-if-changed=build.rs");
50
+ return;
51
+ }
52
+ }
53
+ }
54
+
55
+ // Fallback: Add search paths and use standard linking
56
+ for dir in [host_lib_dir, target_lib_dir] {
57
+ if dir.exists() {
58
+ println!("cargo:rustc-link-search=native={}", dir.display());
59
+ }
60
+ }
31
61
  }
32
- println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
33
- println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
34
- }
35
62
 
36
- #[cfg(target_os = "windows")]
37
- fn main() {
38
- if let Ok(cargo_manifest_dir) = std::env::var("CARGO_MANIFEST_DIR") {
39
- let lib_path = std::path::Path::new(&cargo_manifest_dir)
40
- .parent()
41
- .and_then(|p| p.parent())
42
- .and_then(|p| p.parent())
43
- .and_then(|p| p.parent())
44
- .and_then(|p| p.parent())
45
- .map(|p| p.join("target/release"))
46
- .expect("Failed to construct lib path");
47
- println!("cargo:rustc-link-search={}", lib_path.display());
63
+ // Link the kreuzberg-ffi library
64
+ // When kreuzberg-ffi is built, its symbols become available for linking
65
+ println!("cargo:rustc-link-lib=static=kreuzberg_ffi");
66
+
67
+ if target.contains("darwin") {
68
+ println!("cargo:rustc-link-arg=-Wl,-undefined,dynamic_lookup");
69
+ println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
70
+ } else if target.contains("linux") {
71
+ println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
48
72
  }
49
- }
50
73
 
51
- #[cfg(not(any(target_os = "macos", target_os = "linux", target_os = "windows")))]
52
- fn main() {}
74
+ println!("cargo:rerun-if-changed=build.rs");
75
+ }