kreuzberg 4.0.0.pre.rc.15 → 4.0.0.pre.rc.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +5 -0
- data/Gemfile.lock +3 -2
- data/ext/kreuzberg_rb/native/Cargo.lock +1 -1
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/ext/kreuzberg_rb/native/build.rs +69 -46
- data/ext/kreuzberg_rb/native/src/lib.rs +683 -11
- data/lib/kreuzberg/config.rb +111 -8
- data/lib/kreuzberg/error_context.rb +76 -0
- data/lib/kreuzberg/result.rb +78 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +8 -0
- data/spec/binding/batch_spec.rb +374 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_validation_spec.rb +98 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +15 -2
- data/vendor/kreuzberg/benches/token_reduction.rs +135 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +464 -28
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +304 -0
- data/vendor/kreuzberg/src/core/config_validation.rs +662 -0
- data/vendor/kreuzberg/src/core/extractor.rs +19 -2
- data/vendor/kreuzberg/src/core/formats.rs +251 -0
- data/vendor/kreuzberg/src/core/mod.rs +12 -0
- data/vendor/kreuzberg/src/core/pipeline.rs +103 -32
- data/vendor/kreuzberg/src/extraction/archive.rs +18 -6
- data/vendor/kreuzberg/src/extraction/docx.rs +7 -3
- data/vendor/kreuzberg/src/extraction/email.rs +15 -11
- data/vendor/kreuzberg/src/extraction/excel.rs +24 -5
- data/vendor/kreuzberg/src/extraction/html.rs +10 -15
- data/vendor/kreuzberg/src/extraction/markdown.rs +5 -2
- data/vendor/kreuzberg/src/extraction/pptx.rs +8 -6
- data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
- data/vendor/kreuzberg/src/extraction/table.rs +3 -1
- data/vendor/kreuzberg/src/extraction/text.rs +27 -10
- data/vendor/kreuzberg/src/extractors/html.rs +2 -1
- data/vendor/kreuzberg/src/extractors/pdf.rs +74 -42
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/ocr/language_registry.rs +526 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +2 -0
- data/vendor/kreuzberg/src/pdf/bindings.rs +202 -19
- data/vendor/kreuzberg/src/pdf/bundled.rs +12 -3
- data/vendor/kreuzberg/src/pdf/metadata.rs +8 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +4 -0
- data/vendor/kreuzberg/src/pdf/text.rs +164 -30
- data/vendor/kreuzberg/src/text/mod.rs +2 -0
- data/vendor/kreuzberg/src/text/quality_processor.rs +37 -12
- data/vendor/kreuzberg/src/text/string_utils.rs +27 -10
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +37 -5
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +24 -10
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +2 -1
- data/vendor/kreuzberg/src/text/utf8_validation.rs +197 -0
- data/vendor/kreuzberg/src/types.rs +380 -6
- data/vendor/kreuzberg/src/utils/mod.rs +11 -0
- data/vendor/kreuzberg/src/utils/pool.rs +364 -0
- data/vendor/kreuzberg/src/utils/quality.rs +12 -3
- data/vendor/kreuzberg/src/utils/string_pool.rs +424 -0
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +169 -0
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +207 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +3 -1
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +17 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +14 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: f91977b1472bb6211f3ac2efad274e8cbc77dc5ed9832529eccbebeae1f74b4f
|
|
4
|
+
data.tar.gz: b8a32377a80cfec656e8ddd65576dc220f497fc65b7b45b307db54a0b3b4a274
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: cb391d9f82848e0b19b0c8df2cce7db455d1b73ba5e5c6dd63a2cc87732d8dd0cd6596ca7f9b305061d9400db95c5890292efbd16af7e55a9434f3f29a337642
|
|
7
|
+
data.tar.gz: 9e41afcc217e00d9feb3f8c4adecb7152743227f2c77f4bcfd9fd5e3d4b64b01171d3bdbb2b1290e10d2efdf13cdb53feb8fee01a95b6ba4d87ea76425b56692
|
data/.rubocop.yml
CHANGED
|
@@ -52,6 +52,7 @@ Metrics/AbcSize:
|
|
|
52
52
|
Exclude:
|
|
53
53
|
- 'spec/**/*'
|
|
54
54
|
- 'examples/**/*'
|
|
55
|
+
- 'lib/kreuzberg/config.rb'
|
|
55
56
|
|
|
56
57
|
Naming/FileName:
|
|
57
58
|
Enabled: true
|
|
@@ -99,6 +100,10 @@ Metrics/PerceivedComplexity:
|
|
|
99
100
|
Exclude:
|
|
100
101
|
- 'lib/kreuzberg/config.rb'
|
|
101
102
|
|
|
103
|
+
Metrics/ClassLength:
|
|
104
|
+
Exclude:
|
|
105
|
+
- 'lib/kreuzberg/config.rb'
|
|
106
|
+
|
|
102
107
|
RSpec/RepeatedExampleGroupBody:
|
|
103
108
|
Enabled: false
|
|
104
109
|
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.0.0.pre.rc.
|
|
4
|
+
kreuzberg (4.0.0.pre.rc.17)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -33,7 +33,7 @@ GEM
|
|
|
33
33
|
ffi (1.17.2-arm64-darwin)
|
|
34
34
|
ffi (1.17.2-x86_64-linux-gnu)
|
|
35
35
|
fileutils (1.8.0)
|
|
36
|
-
i18n (1.14.
|
|
36
|
+
i18n (1.14.8)
|
|
37
37
|
concurrent-ruby (~> 1.0)
|
|
38
38
|
json (2.18.0)
|
|
39
39
|
language_server-protocol (3.17.0.5)
|
|
@@ -137,6 +137,7 @@ GEM
|
|
|
137
137
|
|
|
138
138
|
PLATFORMS
|
|
139
139
|
arm64-darwin-24
|
|
140
|
+
arm64-darwin-25
|
|
140
141
|
x86_64-linux
|
|
141
142
|
|
|
142
143
|
DEPENDENCIES
|
|
@@ -1,52 +1,75 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
if let Ok(cargo_manifest_dir) = std::env::var("CARGO_MANIFEST_DIR") {
|
|
4
|
-
let lib_path = std::path::Path::new(&cargo_manifest_dir)
|
|
5
|
-
.parent()
|
|
6
|
-
.and_then(|p| p.parent())
|
|
7
|
-
.and_then(|p| p.parent())
|
|
8
|
-
.and_then(|p| p.parent())
|
|
9
|
-
.and_then(|p| p.parent())
|
|
10
|
-
.map(|p| p.join("target/release"))
|
|
11
|
-
.expect("Failed to construct lib path");
|
|
12
|
-
println!("cargo:rustc-link-search={}", lib_path.display());
|
|
13
|
-
}
|
|
14
|
-
println!("cargo:rustc-link-arg=-Wl,-undefined,dynamic_lookup");
|
|
15
|
-
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
|
|
16
|
-
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path/.");
|
|
17
|
-
}
|
|
1
|
+
use std::env;
|
|
2
|
+
use std::path::PathBuf;
|
|
18
3
|
|
|
19
|
-
#[cfg(target_os = "linux")]
|
|
20
4
|
fn main() {
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
5
|
+
let target = env::var("TARGET").unwrap();
|
|
6
|
+
let profile = env::var("PROFILE").unwrap_or_else(|_| "release".to_string());
|
|
7
|
+
|
|
8
|
+
// Try to locate kreuzberg-ffi library built alongside this crate
|
|
9
|
+
let cargo_manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
|
|
10
|
+
let manifest_path = PathBuf::from(&cargo_manifest_dir);
|
|
11
|
+
|
|
12
|
+
// Prefer host target layout, but include target-triple layout for cross builds.
|
|
13
|
+
// IMPORTANT: Only search lib directories, NOT deps directories.
|
|
14
|
+
// The deps/ directories may contain dylibs with hardcoded install_name paths,
|
|
15
|
+
// which causes load errors on macOS when users install the gem.
|
|
16
|
+
if let Some(packages_root) = manifest_path
|
|
17
|
+
.parent()
|
|
18
|
+
.and_then(|p| p.parent())
|
|
19
|
+
.and_then(|p| p.parent())
|
|
20
|
+
.and_then(|p| p.parent())
|
|
21
|
+
.and_then(|p| p.parent())
|
|
22
|
+
{
|
|
23
|
+
let host_lib_dir = packages_root.join("target").join(&profile);
|
|
24
|
+
let target_lib_dir = packages_root.join("target").join(&target).join(&profile);
|
|
25
|
+
|
|
26
|
+
// Try to find the static library and link it directly on Unix-like systems
|
|
27
|
+
// to avoid the linker preferring dylib over static lib.
|
|
28
|
+
if !target.contains("windows") {
|
|
29
|
+
let static_lib_name = if target.contains("windows") {
|
|
30
|
+
"kreuzberg_ffi.lib"
|
|
31
|
+
} else {
|
|
32
|
+
"libkreuzberg_ffi.a"
|
|
33
|
+
};
|
|
34
|
+
|
|
35
|
+
// Check both host and target lib directories for the static library
|
|
36
|
+
for lib_dir in [&host_lib_dir, &target_lib_dir] {
|
|
37
|
+
let static_lib = lib_dir.join(static_lib_name);
|
|
38
|
+
if static_lib.exists() {
|
|
39
|
+
// Found static library, link it directly by passing the full path
|
|
40
|
+
println!("cargo:rustc-link-arg={}", static_lib.display());
|
|
41
|
+
// Don't add the library search path or -l flag
|
|
42
|
+
// Jump to platform-specific configuration
|
|
43
|
+
if target.contains("darwin") {
|
|
44
|
+
println!("cargo:rustc-link-arg=-Wl,-undefined,dynamic_lookup");
|
|
45
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
|
|
46
|
+
} else if target.contains("linux") {
|
|
47
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
|
|
48
|
+
}
|
|
49
|
+
println!("cargo:rerun-if-changed=build.rs");
|
|
50
|
+
return;
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
// Fallback: Add search paths and use standard linking
|
|
56
|
+
for dir in [host_lib_dir, target_lib_dir] {
|
|
57
|
+
if dir.exists() {
|
|
58
|
+
println!("cargo:rustc-link-search=native={}", dir.display());
|
|
59
|
+
}
|
|
60
|
+
}
|
|
31
61
|
}
|
|
32
|
-
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
|
|
33
|
-
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN/.");
|
|
34
|
-
}
|
|
35
62
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
.map(|p| p.join("target/release"))
|
|
46
|
-
.expect("Failed to construct lib path");
|
|
47
|
-
println!("cargo:rustc-link-search={}", lib_path.display());
|
|
63
|
+
// Link the kreuzberg-ffi library
|
|
64
|
+
// When kreuzberg-ffi is built, its symbols become available for linking
|
|
65
|
+
println!("cargo:rustc-link-lib=static=kreuzberg_ffi");
|
|
66
|
+
|
|
67
|
+
if target.contains("darwin") {
|
|
68
|
+
println!("cargo:rustc-link-arg=-Wl,-undefined,dynamic_lookup");
|
|
69
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,@loader_path");
|
|
70
|
+
} else if target.contains("linux") {
|
|
71
|
+
println!("cargo:rustc-link-arg=-Wl,-rpath,$ORIGIN");
|
|
48
72
|
}
|
|
49
|
-
}
|
|
50
73
|
|
|
51
|
-
|
|
52
|
-
|
|
74
|
+
println!("cargo:rerun-if-changed=build.rs");
|
|
75
|
+
}
|