kreuzberg 4.0.0.pre.rc.7 → 4.0.0.pre.rc.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +13 -12
- data/README.md +22 -0
- data/ext/kreuzberg_rb/native/.cargo/config.toml +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +397 -183
- data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
- data/ext/kreuzberg_rb/native/src/lib.rs +36 -13
- data/kreuzberg.gemspec +34 -2
- data/lib/kreuzberg/cache_api.rb +35 -0
- data/lib/kreuzberg/error_context.rb +49 -1
- data/lib/kreuzberg/extraction_api.rb +255 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +6 -0
- data/lib/libpdfium.dylib +0 -0
- data/sig/kreuzberg.rbs +9 -0
- data/vendor/Cargo.toml +44 -0
- data/vendor/kreuzberg/Cargo.toml +65 -35
- data/vendor/kreuzberg/README.md +50 -0
- data/vendor/kreuzberg/build.rs +548 -190
- data/vendor/kreuzberg/src/api/mod.rs +0 -2
- data/vendor/kreuzberg/src/core/pipeline.rs +13 -0
- data/vendor/kreuzberg/src/embeddings.rs +71 -3
- data/vendor/kreuzberg/src/error.rs +1 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -1
- data/vendor/kreuzberg/src/extraction/html.rs +37 -5
- data/vendor/kreuzberg/src/extractors/pdf.rs +99 -47
- data/vendor/kreuzberg/src/mcp/mod.rs +3 -2
- data/vendor/kreuzberg/src/mcp/server.rs +106 -0
- data/vendor/kreuzberg/src/pdf/bindings.rs +44 -0
- data/vendor/kreuzberg/src/pdf/bundled.rs +346 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +2 -2
- data/vendor/kreuzberg/src/pdf/mod.rs +6 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +2 -2
- data/vendor/kreuzberg/src/pdf/table.rs +3 -0
- data/vendor/kreuzberg/src/pdf/text.rs +2 -2
- data/vendor/kreuzberg/src/text/quality_processor.rs +1 -1
- data/vendor/kreuzberg/tests/concurrency_stress.rs +1 -1
- data/vendor/kreuzberg/tests/format_integration.rs +4 -1
- data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +63 -0
- data/vendor/kreuzberg-ffi/README.md +851 -0
- data/vendor/kreuzberg-ffi/build.rs +176 -0
- data/vendor/kreuzberg-ffi/cbindgen.toml +27 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi-install.pc +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +1087 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +3616 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +247 -0
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
- data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
- data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +48 -0
- data/vendor/kreuzberg-tesseract/LICENSE +22 -0
- data/vendor/kreuzberg-tesseract/README.md +399 -0
- data/vendor/kreuzberg-tesseract/build.rs +1354 -0
- data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
- data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
- data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
- data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
- data/vendor/rb-sys/src/lib.rs +1 -0
- metadata +41 -3
- data/vendor/rb-sys/bin/release.sh +0 -22
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ea76ed9d63fda80f47f1054c421989e77269b2d3bc8810ab25cd7e59b062ec7d
|
|
4
|
+
data.tar.gz: de96aea5d18ed67f34117fca308c5cc30b9e719e60c2a8fb0384f50d1fbd704f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 96fec6456241cf9a03ab60ea19ccf1ca8beda412ade51e298bc13480374a5ff7565fda57b2fb8c73edebc54ab3641d4832d740fc68a0e4bba2d610e9b340f682
|
|
7
|
+
data.tar.gz: 254f57261deda88616238abeb5faffd7ebdc767fb01cdc712193edbabd6a5329aeadc55dc0d93c1e5eda491924662b947a737e40eb49e3a9d9b764239e4c1221
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.0.0.pre.rc.7)
|
|
4
|
+
kreuzberg (4.0.0.pre.rc.11)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -21,16 +21,17 @@ GEM
|
|
|
21
21
|
uri (>= 0.13.1)
|
|
22
22
|
ast (2.4.3)
|
|
23
23
|
base64 (0.3.0)
|
|
24
|
-
bigdecimal (
|
|
24
|
+
bigdecimal (4.0.1)
|
|
25
25
|
byebug (12.0.0)
|
|
26
26
|
coderay (1.1.3)
|
|
27
|
-
concurrent-ruby (1.3.
|
|
27
|
+
concurrent-ruby (1.3.6)
|
|
28
28
|
connection_pool (3.0.2)
|
|
29
29
|
csv (3.3.5)
|
|
30
30
|
diff-lcs (1.6.2)
|
|
31
31
|
drb (2.2.3)
|
|
32
32
|
ffi (1.17.2)
|
|
33
33
|
ffi (1.17.2-arm64-darwin)
|
|
34
|
+
ffi (1.17.2-x86_64-linux-gnu)
|
|
34
35
|
fileutils (1.8.0)
|
|
35
36
|
i18n (1.14.7)
|
|
36
37
|
concurrent-ruby (~> 1.0)
|
|
@@ -42,13 +43,14 @@ GEM
|
|
|
42
43
|
rb-inotify (~> 0.9, >= 0.9.10)
|
|
43
44
|
logger (1.7.0)
|
|
44
45
|
method_source (1.1.0)
|
|
45
|
-
minitest (
|
|
46
|
+
minitest (6.0.0)
|
|
47
|
+
prism (~> 1.5)
|
|
46
48
|
mutex_m (0.3.0)
|
|
47
49
|
parallel (1.27.0)
|
|
48
50
|
parser (3.3.10.0)
|
|
49
51
|
ast (~> 2.4.1)
|
|
50
52
|
racc
|
|
51
|
-
prism (1.
|
|
53
|
+
prism (1.7.0)
|
|
52
54
|
pry (0.15.2)
|
|
53
55
|
coderay (~> 1.1)
|
|
54
56
|
method_source (~> 1.0)
|
|
@@ -58,13 +60,13 @@ GEM
|
|
|
58
60
|
racc (1.8.1)
|
|
59
61
|
rainbow (3.1.1)
|
|
60
62
|
rake (13.3.1)
|
|
61
|
-
rake-compiler (1.3.
|
|
63
|
+
rake-compiler (1.3.1)
|
|
62
64
|
rake
|
|
63
65
|
rake-compiler-dock (1.10.0)
|
|
64
66
|
rb-fsevent (0.11.2)
|
|
65
67
|
rb-inotify (0.11.1)
|
|
66
68
|
ffi (~> 1.0)
|
|
67
|
-
rb_sys (0.9.
|
|
69
|
+
rb_sys (0.9.123)
|
|
68
70
|
rake-compiler-dock (= 1.10.0)
|
|
69
71
|
rbs (3.9.5)
|
|
70
72
|
logger
|
|
@@ -82,7 +84,7 @@ GEM
|
|
|
82
84
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
83
85
|
rspec-support (~> 3.13.0)
|
|
84
86
|
rspec-support (3.13.6)
|
|
85
|
-
rubocop (1.
|
|
87
|
+
rubocop (1.82.0)
|
|
86
88
|
json (~> 2.3)
|
|
87
89
|
language_server-protocol (~> 3.17.0.2)
|
|
88
90
|
lint_roller (~> 1.1.0)
|
|
@@ -90,7 +92,7 @@ GEM
|
|
|
90
92
|
parser (>= 3.3.0.2)
|
|
91
93
|
rainbow (>= 2.2.2, < 4.0)
|
|
92
94
|
regexp_parser (>= 2.9.3, < 3.0)
|
|
93
|
-
rubocop-ast (>= 1.
|
|
95
|
+
rubocop-ast (>= 1.48.0, < 2.0)
|
|
94
96
|
ruby-progressbar (~> 1.7)
|
|
95
97
|
unicode-display_width (>= 2.4.0, < 4.0)
|
|
96
98
|
rubocop-ast (1.48.0)
|
|
@@ -122,21 +124,20 @@ GEM
|
|
|
122
124
|
strscan (>= 1.0.0)
|
|
123
125
|
terminal-table (>= 2, < 5)
|
|
124
126
|
uri (>= 0.12.0)
|
|
125
|
-
strscan (3.1.
|
|
127
|
+
strscan (3.1.6)
|
|
126
128
|
terminal-table (4.0.0)
|
|
127
129
|
unicode-display_width (>= 1.1.1, < 4)
|
|
128
130
|
tzinfo (2.0.6)
|
|
129
131
|
concurrent-ruby (~> 1.0)
|
|
130
132
|
unicode-display_width (3.2.0)
|
|
131
133
|
unicode-emoji (~> 4.1)
|
|
132
|
-
unicode-emoji (4.
|
|
134
|
+
unicode-emoji (4.2.0)
|
|
133
135
|
uri (1.1.1)
|
|
134
136
|
yard (0.9.38)
|
|
135
137
|
|
|
136
138
|
PLATFORMS
|
|
137
139
|
arm64-darwin-23
|
|
138
140
|
arm64-darwin-24
|
|
139
|
-
x64-mingw-ucrt
|
|
140
141
|
x86_64-linux
|
|
141
142
|
|
|
142
143
|
DEPENDENCIES
|
data/README.md
CHANGED
|
@@ -38,6 +38,11 @@ Extract text, tables, images, and metadata from 56 file formats including PDF, D
|
|
|
38
38
|
|
|
39
39
|
### Optional System Dependencies
|
|
40
40
|
|
|
41
|
+
- **ONNX Runtime**: For embeddings functionality
|
|
42
|
+
- macOS: `brew install onnxruntime`
|
|
43
|
+
- Ubuntu: `sudo apt-get install libonnxruntime libonnxruntime-dev`
|
|
44
|
+
- Windows: `scoop install onnxruntime` or download from [GitHub](https://github.com/microsoft/onnxruntime/releases)
|
|
45
|
+
|
|
41
46
|
- **Tesseract**: For OCR functionality
|
|
42
47
|
- macOS: `brew install tesseract`
|
|
43
48
|
- Ubuntu: `sudo apt-get install tesseract-ocr`
|
|
@@ -417,6 +422,23 @@ bundle exec rubocop
|
|
|
417
422
|
|
|
418
423
|
**Note**: The Ruby bindings use a vendored copy of the core `kreuzberg` Rust crate. For local development, create a symlink at `vendor/kreuzberg` pointing to `../../crates/kreuzberg`. In CI and gem packaging, the actual vendored files are copied to this location.
|
|
419
424
|
|
|
425
|
+
## PDFium Integration
|
|
426
|
+
|
|
427
|
+
PDF extraction is powered by PDFium, which is automatically bundled with this package. No system installation required.
|
|
428
|
+
|
|
429
|
+
### Platform Support
|
|
430
|
+
|
|
431
|
+
| Platform | Status | Notes |
|
|
432
|
+
|----------|--------|-------|
|
|
433
|
+
| Linux x86_64 | ✅ | Bundled |
|
|
434
|
+
| macOS ARM64 | ✅ | Bundled |
|
|
435
|
+
| macOS x86_64 | ✅ | Bundled |
|
|
436
|
+
| Windows x86_64 | ✅ | Bundled |
|
|
437
|
+
|
|
438
|
+
### Binary Size Impact
|
|
439
|
+
|
|
440
|
+
PDFium adds approximately 8-15 MB to the package size depending on platform. This ensures consistent PDF extraction across all environments without external dependencies.
|
|
441
|
+
|
|
420
442
|
## License
|
|
421
443
|
|
|
422
444
|
MIT License. See [LICENSE](../../LICENSE) for details.
|
data/ext/kreuzberg_rb/native/.cargo/config.toml
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
[build]
|
|
2
|
-
rustflags = ["-A", "unpredictable-function-pointer-comparisons"]
|
|
2
|
+
rustflags = ["-A", "unpredictable-function-pointer-comparisons", "-A", "fn_ptr_eq"]
|