kreuzberg 4.4.4 → 4.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +8 -8
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +31 -60
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -49
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +9 -7
- data/vendor/kreuzberg/Cargo.toml +47 -30
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/handlers.rs +9 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +15 -0
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +40 -35
- data/vendor/kreuzberg/src/mcp/params.rs +12 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +26 -3
- data/vendor/kreuzberg/src/pdf/markdown/paragraphs.rs +260 -6
- data/vendor/kreuzberg/src/utils/json_utils.rs +135 -0
- data/vendor/kreuzberg/src/utils/mod.rs +2 -1
- data/vendor/kreuzberg/tests/pdf_integration.rs +41 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +6 -6
- data/vendor/kreuzberg-ffi/kreuzberg.h +238 -90
- data/vendor/kreuzberg-ffi/src/config_builder.rs +442 -2
- data/vendor/kreuzberg-ffi/src/error.rs +6 -6
- data/vendor/kreuzberg-ffi/src/helpers.rs +29 -13
- data/vendor/kreuzberg-ffi/src/lib.rs +6 -3
- data/vendor/kreuzberg-ffi/src/memory.rs +4 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +9 -13
- data/vendor/kreuzberg-ffi/src/result_view.rs +61 -65
- data/vendor/kreuzberg-ffi/src/string_intern.rs +6 -11
- data/vendor/kreuzberg-ffi/src/types.rs +53 -48
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +3 -3
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -8
- data/vendor/kreuzberg-pdfium-render/src/bindgen/pdfium_7678.rs +14 -0
- data/vendor/kreuzberg-pdfium-render/src/bindings/dynamic_bindings.rs +23 -0
- data/vendor/kreuzberg-pdfium-render/src/bindings/static_bindings.rs +17 -0
- data/vendor/kreuzberg-pdfium-render/src/bindings/wasm_bindings.rs +109 -0
- data/vendor/kreuzberg-pdfium-render/src/bindings.rs +39 -0
- data/vendor/kreuzberg-pdfium-render/src/lib.rs +0 -6
- data/vendor/kreuzberg-pdfium-render/src/pdf/action.rs +1 -0
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/object/group.rs +2 -4
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/struct_element.rs +8 -0
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/permissions.rs +1 -0
- data/vendor/kreuzberg-pdfium-render/src/pdf/document.rs +34 -37
- data/vendor/kreuzberg-pdfium-render/src/pdf/link.rs +4 -4
- data/vendor/kreuzberg-pdfium-render/src/pdfium.rs +1 -0
- data/vendor/kreuzberg-pdfium-render/src/utils.rs +9 -10
- data/vendor/kreuzberg-tesseract/Cargo.toml +10 -6
- metadata +3 -8
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/attachment.rs +0 -184
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/attachments.rs +0 -289
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/bookmark.rs +0 -538
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/bookmarks.rs +0 -234
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/signature.rs +0 -186
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/signatures.rs +0 -109
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c376485167aade739cda1b3ac0ba9f8f19bc6c69201f29ab6a13fdb1b9615c9e
|
|
4
|
+
data.tar.gz: e89130964d89de12fb5dd2c179f695d239605eaf0dd39312afaeb8c96114d0e0
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 740a010ae02293ec8228ed99f01c62f6d139953d77cd11de7913769f083b0e349986fd1c5b930f505176dcf39c1475d15077cf67bddec40a755e154f8974cd8d
|
|
7
|
+
data.tar.gz: 8b04c3aa6a73b2284d81948ed0ab954b42f7605a3609e976f47914d5d274f70dc13b0432e7890bbc426b238627da3ed76a79f4c7c915e05859c3e4aef4e28600
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.4.
|
|
4
|
+
kreuzberg (4.4.5)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
sorbet-runtime (~> 0.5)
|
|
7
7
|
|
|
@@ -49,7 +49,7 @@ GEM
|
|
|
49
49
|
i18n (1.14.8)
|
|
50
50
|
concurrent-ruby (~> 1.0)
|
|
51
51
|
io-console (0.8.2)
|
|
52
|
-
json (2.19.
|
|
52
|
+
json (2.19.1)
|
|
53
53
|
json-schema (6.2.0)
|
|
54
54
|
addressable (~> 2.8)
|
|
55
55
|
bigdecimal (>= 3.1, < 5)
|
|
@@ -122,7 +122,7 @@ GEM
|
|
|
122
122
|
rubocop-ast (>= 1.49.0, < 2.0)
|
|
123
123
|
ruby-progressbar (~> 1.7)
|
|
124
124
|
unicode-display_width (>= 2.4.0, < 4.0)
|
|
125
|
-
rubocop-ast (1.49.
|
|
125
|
+
rubocop-ast (1.49.1)
|
|
126
126
|
parser (>= 3.3.7.2)
|
|
127
127
|
prism (~> 1.7)
|
|
128
128
|
rubocop-performance (1.26.1)
|
|
@@ -134,7 +134,7 @@ GEM
|
|
|
134
134
|
rubocop (~> 1.81)
|
|
135
135
|
ruby-progressbar (1.13.0)
|
|
136
136
|
securerandom (0.4.1)
|
|
137
|
-
sorbet-runtime (0.6.
|
|
137
|
+
sorbet-runtime (0.6.13011)
|
|
138
138
|
steep (1.10.0)
|
|
139
139
|
activesupport (>= 5.1)
|
|
140
140
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -220,9 +220,9 @@ CHECKSUMS
|
|
|
220
220
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
221
221
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
222
222
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
223
|
-
json (2.19.
|
|
223
|
+
json (2.19.1) sha256=dd94fdc59e48bff85913829a32350b3148156bc4fd2a95a2568a78b11344082d
|
|
224
224
|
json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
|
|
225
|
-
kreuzberg (4.4.
|
|
225
|
+
kreuzberg (4.4.5)
|
|
226
226
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
227
227
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
228
228
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
@@ -254,12 +254,12 @@ CHECKSUMS
|
|
|
254
254
|
rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
|
|
255
255
|
rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
|
|
256
256
|
rubocop (1.85.1) sha256=3dbcf9e961baa4c376eeeb2a03913dca5e3987033b04d38fa538aa1e7406cc77
|
|
257
|
-
rubocop-ast (1.49.
|
|
257
|
+
rubocop-ast (1.49.1) sha256=4412f3ee70f6fe4546cc489548e0f6fcf76cafcfa80fa03af67098ffed755035
|
|
258
258
|
rubocop-performance (1.26.1) sha256=cd19b936ff196df85829d264b522fd4f98b6c89ad271fa52744a8c11b8f71834
|
|
259
259
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
260
260
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
261
261
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
262
|
-
sorbet-runtime (0.6.
|
|
262
|
+
sorbet-runtime (0.6.13011) sha256=d451e380097747d64d39595fbbb6db2a198310f9eff0f810cd6e5696b402833f
|
|
263
263
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
264
264
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
265
265
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.5" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -1390,9 +1390,9 @@ dependencies = [
|
|
|
1390
1390
|
|
|
1391
1391
|
[[package]]
|
|
1392
1392
|
name = "fastembed"
|
|
1393
|
-
version = "5.12.
|
|
1393
|
+
version = "5.12.1"
|
|
1394
1394
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1395
|
-
checksum = "
|
|
1395
|
+
checksum = "b609359080bf7dfff6ac5ace3d6944355ede4c8a51406a316202ae86ff8346a8"
|
|
1396
1396
|
dependencies = [
|
|
1397
1397
|
"anyhow",
|
|
1398
1398
|
"hf-hub 0.4.3",
|
|
@@ -1901,9 +1901,9 @@ dependencies = [
|
|
|
1901
1901
|
|
|
1902
1902
|
[[package]]
|
|
1903
1903
|
name = "hayro-jpeg2000"
|
|
1904
|
-
version = "0.3.
|
|
1904
|
+
version = "0.3.4"
|
|
1905
1905
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1906
|
-
checksum = "
|
|
1906
|
+
checksum = "c1a74cfc18c0093ef8009a0d6c1ba3024df0cce228503a14c1372e1e23eed43e"
|
|
1907
1907
|
dependencies = [
|
|
1908
1908
|
"fearless_simd",
|
|
1909
1909
|
]
|
|
@@ -1977,9 +1977,9 @@ dependencies = [
|
|
|
1977
1977
|
|
|
1978
1978
|
[[package]]
|
|
1979
1979
|
name = "html-to-markdown-rs"
|
|
1980
|
-
version = "2.28.
|
|
1980
|
+
version = "2.28.2"
|
|
1981
1981
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1982
|
-
checksum = "
|
|
1982
|
+
checksum = "3f9377e16af590b764fd98fd176027cf8831c5335f8964f3f643753e38913a4e"
|
|
1983
1983
|
dependencies = [
|
|
1984
1984
|
"ahash",
|
|
1985
1985
|
"astral-tl",
|
|
@@ -2329,9 +2329,9 @@ dependencies = [
|
|
|
2329
2329
|
|
|
2330
2330
|
[[package]]
|
|
2331
2331
|
name = "image"
|
|
2332
|
-
version = "0.25.
|
|
2332
|
+
version = "0.25.10"
|
|
2333
2333
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2334
|
-
checksum = "
|
|
2334
|
+
checksum = "85ab80394333c02fe689eaf900ab500fbd0c2213da414687ebf995a65d5a6104"
|
|
2335
2335
|
dependencies = [
|
|
2336
2336
|
"bytemuck",
|
|
2337
2337
|
"byteorder-lite",
|
|
@@ -2344,9 +2344,9 @@ dependencies = [
|
|
|
2344
2344
|
"png",
|
|
2345
2345
|
"ravif",
|
|
2346
2346
|
"rayon",
|
|
2347
|
-
"tiff
|
|
2348
|
-
"zune-core
|
|
2349
|
-
"zune-jpeg
|
|
2347
|
+
"tiff",
|
|
2348
|
+
"zune-core",
|
|
2349
|
+
"zune-jpeg",
|
|
2350
2350
|
]
|
|
2351
2351
|
|
|
2352
2352
|
[[package]]
|
|
@@ -2659,7 +2659,7 @@ dependencies = [
|
|
|
2659
2659
|
"tar",
|
|
2660
2660
|
"text-splitter",
|
|
2661
2661
|
"thiserror 2.0.18",
|
|
2662
|
-
"tiff
|
|
2662
|
+
"tiff",
|
|
2663
2663
|
"tokio",
|
|
2664
2664
|
"toml 1.0.6+spec-1.1.0",
|
|
2665
2665
|
"tower",
|
|
@@ -2729,7 +2729,7 @@ dependencies = [
|
|
|
2729
2729
|
|
|
2730
2730
|
[[package]]
|
|
2731
2731
|
name = "kreuzberg-rb"
|
|
2732
|
-
version = "4.4.
|
|
2732
|
+
version = "4.4.5"
|
|
2733
2733
|
dependencies = [
|
|
2734
2734
|
"async-trait",
|
|
2735
2735
|
"html-to-markdown-rs",
|
|
@@ -2793,9 +2793,9 @@ checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7"
|
|
|
2793
2793
|
|
|
2794
2794
|
[[package]]
|
|
2795
2795
|
name = "libc"
|
|
2796
|
-
version = "0.2.
|
|
2796
|
+
version = "0.2.183"
|
|
2797
2797
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2798
|
-
checksum = "
|
|
2798
|
+
checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d"
|
|
2799
2799
|
|
|
2800
2800
|
[[package]]
|
|
2801
2801
|
name = "libfuzzer-sys"
|
|
@@ -3118,9 +3118,9 @@ dependencies = [
|
|
|
3118
3118
|
|
|
3119
3119
|
[[package]]
|
|
3120
3120
|
name = "moxcms"
|
|
3121
|
-
version = "0.
|
|
3121
|
+
version = "0.8.1"
|
|
3122
3122
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3123
|
-
checksum = "
|
|
3123
|
+
checksum = "bb85c154ba489f01b25c0d36ae69a87e4a1c73a72631fc6c0eb6dde34a73e44b"
|
|
3124
3124
|
dependencies = [
|
|
3125
3125
|
"num-traits",
|
|
3126
3126
|
"pxfm",
|
|
@@ -3821,9 +3821,9 @@ dependencies = [
|
|
|
3821
3821
|
|
|
3822
3822
|
[[package]]
|
|
3823
3823
|
name = "quinn-proto"
|
|
3824
|
-
version = "0.11.
|
|
3824
|
+
version = "0.11.14"
|
|
3825
3825
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3826
|
-
checksum = "
|
|
3826
|
+
checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098"
|
|
3827
3827
|
dependencies = [
|
|
3828
3828
|
"aws-lc-rs",
|
|
3829
3829
|
"bytes",
|
|
@@ -3992,9 +3992,9 @@ dependencies = [
|
|
|
3992
3992
|
|
|
3993
3993
|
[[package]]
|
|
3994
3994
|
name = "ravif"
|
|
3995
|
-
version = "0.
|
|
3995
|
+
version = "0.13.0"
|
|
3996
3996
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3997
|
-
checksum = "
|
|
3997
|
+
checksum = "e52310197d971b0f5be7fe6b57530dcd27beb35c1b013f29d66c1ad73fbbcc45"
|
|
3998
3998
|
dependencies = [
|
|
3999
3999
|
"avif-serialize",
|
|
4000
4000
|
"imgref",
|
|
@@ -4482,9 +4482,9 @@ dependencies = [
|
|
|
4482
4482
|
|
|
4483
4483
|
[[package]]
|
|
4484
4484
|
name = "schannel"
|
|
4485
|
-
version = "0.1.
|
|
4485
|
+
version = "0.1.29"
|
|
4486
4486
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4487
|
-
checksum = "
|
|
4487
|
+
checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939"
|
|
4488
4488
|
dependencies = [
|
|
4489
4489
|
"windows-sys 0.61.2",
|
|
4490
4490
|
]
|
|
@@ -4991,9 +4991,9 @@ dependencies = [
|
|
|
4991
4991
|
|
|
4992
4992
|
[[package]]
|
|
4993
4993
|
name = "tempfile"
|
|
4994
|
-
version = "3.
|
|
4994
|
+
version = "3.27.0"
|
|
4995
4995
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4996
|
-
checksum = "
|
|
4996
|
+
checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
|
|
4997
4997
|
dependencies = [
|
|
4998
4998
|
"fastrand",
|
|
4999
4999
|
"getrandom 0.4.2",
|
|
@@ -5079,20 +5079,6 @@ dependencies = [
|
|
|
5079
5079
|
"cfg-if",
|
|
5080
5080
|
]
|
|
5081
5081
|
|
|
5082
|
-
[[package]]
|
|
5083
|
-
name = "tiff"
|
|
5084
|
-
version = "0.10.3"
|
|
5085
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5086
|
-
checksum = "af9605de7fee8d9551863fd692cce7637f548dbd9db9180fcc07ccc6d26c336f"
|
|
5087
|
-
dependencies = [
|
|
5088
|
-
"fax",
|
|
5089
|
-
"flate2",
|
|
5090
|
-
"half",
|
|
5091
|
-
"quick-error",
|
|
5092
|
-
"weezl",
|
|
5093
|
-
"zune-jpeg 0.4.21",
|
|
5094
|
-
]
|
|
5095
|
-
|
|
5096
5082
|
[[package]]
|
|
5097
5083
|
name = "tiff"
|
|
5098
5084
|
version = "0.11.3"
|
|
@@ -5104,7 +5090,7 @@ dependencies = [
|
|
|
5104
5090
|
"half",
|
|
5105
5091
|
"quick-error",
|
|
5106
5092
|
"weezl",
|
|
5107
|
-
"zune-jpeg
|
|
5093
|
+
"zune-jpeg",
|
|
5108
5094
|
]
|
|
5109
5095
|
|
|
5110
5096
|
[[package]]
|
|
@@ -6483,18 +6469,18 @@ dependencies = [
|
|
|
6483
6469
|
|
|
6484
6470
|
[[package]]
|
|
6485
6471
|
name = "zerocopy"
|
|
6486
|
-
version = "0.8.
|
|
6472
|
+
version = "0.8.42"
|
|
6487
6473
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6488
|
-
checksum = "
|
|
6474
|
+
checksum = "f2578b716f8a7a858b7f02d5bd870c14bf4ddbbcf3a4c05414ba6503640505e3"
|
|
6489
6475
|
dependencies = [
|
|
6490
6476
|
"zerocopy-derive",
|
|
6491
6477
|
]
|
|
6492
6478
|
|
|
6493
6479
|
[[package]]
|
|
6494
6480
|
name = "zerocopy-derive"
|
|
6495
|
-
version = "0.8.
|
|
6481
|
+
version = "0.8.42"
|
|
6496
6482
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6497
|
-
checksum = "
|
|
6483
|
+
checksum = "7e6cc098ea4d3bd6246687de65af3f920c430e236bee1e3bf2e441463f08a02f"
|
|
6498
6484
|
dependencies = [
|
|
6499
6485
|
"proc-macro2",
|
|
6500
6486
|
"quote",
|
|
@@ -6613,12 +6599,6 @@ dependencies = [
|
|
|
6613
6599
|
"simd-adler32",
|
|
6614
6600
|
]
|
|
6615
6601
|
|
|
6616
|
-
[[package]]
|
|
6617
|
-
name = "zune-core"
|
|
6618
|
-
version = "0.4.12"
|
|
6619
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6620
|
-
checksum = "3f423a2c17029964870cfaabb1f13dfab7d092a62a29a89264f4d36990ca414a"
|
|
6621
|
-
|
|
6622
6602
|
[[package]]
|
|
6623
6603
|
name = "zune-core"
|
|
6624
6604
|
version = "0.5.1"
|
|
@@ -6634,20 +6614,11 @@ dependencies = [
|
|
|
6634
6614
|
"simd-adler32",
|
|
6635
6615
|
]
|
|
6636
6616
|
|
|
6637
|
-
[[package]]
|
|
6638
|
-
name = "zune-jpeg"
|
|
6639
|
-
version = "0.4.21"
|
|
6640
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6641
|
-
checksum = "29ce2c8a9384ad323cf564b67da86e21d3cfdff87908bc1223ed5c99bc792713"
|
|
6642
|
-
dependencies = [
|
|
6643
|
-
"zune-core 0.4.12",
|
|
6644
|
-
]
|
|
6645
|
-
|
|
6646
6617
|
[[package]]
|
|
6647
6618
|
name = "zune-jpeg"
|
|
6648
6619
|
version = "0.5.12"
|
|
6649
6620
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6650
6621
|
checksum = "410e9ecef634c709e3831c2cfdb8d9c32164fae1c67496d5b68fff728eec37fe"
|
|
6651
6622
|
dependencies = [
|
|
6652
|
-
"zune-core
|
|
6623
|
+
"zune-core",
|
|
6653
6624
|
]
|
|
@@ -1,51 +1,6 @@
|
|
|
1
|
-
[workspace]
|
|
2
|
-
|
|
3
|
-
[workspace.dependencies]
|
|
4
|
-
bytes = { version = "1", features = ["serde"] }
|
|
5
|
-
serde = { version = "1.0.228", features = ["derive"] }
|
|
6
|
-
serde_json = { version = "1.0.149" }
|
|
7
|
-
tokio = { version = "1.50.0", features = [
|
|
8
|
-
"rt",
|
|
9
|
-
"rt-multi-thread",
|
|
10
|
-
"macros",
|
|
11
|
-
"sync",
|
|
12
|
-
"process",
|
|
13
|
-
"fs",
|
|
14
|
-
"time",
|
|
15
|
-
"io-util",
|
|
16
|
-
] }
|
|
17
|
-
thiserror = "2.0.18"
|
|
18
|
-
anyhow = "1.0"
|
|
19
|
-
libc = "0.2.182"
|
|
20
|
-
async-trait = "0.1.89"
|
|
21
|
-
tracing = "0.1"
|
|
22
|
-
ahash = "0.8.12"
|
|
23
|
-
base64 = "0.22.1"
|
|
24
|
-
hex = "0.4.3"
|
|
25
|
-
num_cpus = "1.17.0"
|
|
26
|
-
once_cell = "1.21.3"
|
|
27
|
-
parking_lot = "0.12.5"
|
|
28
|
-
html-to-markdown-rs = { version = "2.28.1", default-features = false }
|
|
29
|
-
reqwest = { version = "0.13.2", default-features = false }
|
|
30
|
-
image = { version = "0.25.9", default-features = false }
|
|
31
|
-
toml = "1.0.6"
|
|
32
|
-
tempfile = "3.26.0"
|
|
33
|
-
lzma-rust2 = { version = "0.16.2" }
|
|
34
|
-
log = "0.4"
|
|
35
|
-
getrandom = { version = "0.4.2", features = ["wasm_js"] }
|
|
36
|
-
console_error_panic_hook = "0.1"
|
|
37
|
-
ctor = "0.6"
|
|
38
|
-
lazy_static = "1.5.0"
|
|
39
|
-
rayon = "1.11.0"
|
|
40
|
-
chrono = "0.4"
|
|
41
|
-
itertools = "0.14"
|
|
42
|
-
|
|
43
|
-
[workspace.lints.clippy]
|
|
44
|
-
collapsible_if = "allow"
|
|
45
|
-
|
|
46
1
|
[package]
|
|
47
2
|
name = "kreuzberg-rb"
|
|
48
|
-
version = "4.4.
|
|
3
|
+
version = "4.4.5"
|
|
49
4
|
edition = "2024"
|
|
50
5
|
rust-version = "1.91"
|
|
51
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -58,8 +13,8 @@ description = "Ruby bindings (Magnus) for Kreuzberg - high-performance document
|
|
|
58
13
|
keywords = ["ruby", "magnus", "document", "extraction", "bindings"]
|
|
59
14
|
categories = ["api-bindings", "text-processing"]
|
|
60
15
|
|
|
61
|
-
[lints]
|
|
62
|
-
|
|
16
|
+
[lints.clippy]
|
|
17
|
+
collapsible_if = "allow"
|
|
63
18
|
|
|
64
19
|
[lib]
|
|
65
20
|
name = "kreuzberg_rb"
|
|
@@ -111,7 +66,7 @@ tokio = { version = "1.50.0", features = [
|
|
|
111
66
|
"time",
|
|
112
67
|
"io-util",
|
|
113
68
|
] }
|
|
114
|
-
html-to-markdown-rs = { version = "2.28.
|
|
69
|
+
html-to-markdown-rs = { version = "2.28.2", default-features = false }
|
|
115
70
|
|
|
116
71
|
[dev-dependencies]
|
|
117
72
|
pretty_assertions = "1.4"
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.4.
|
|
5
|
+
version = "4.4.5"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -23,26 +23,28 @@ criterion = { version = "0.8", features = ["html_reports"] }
|
|
|
23
23
|
ctor = "0.6"
|
|
24
24
|
getrandom = { version = "0.4.2", features = ["wasm_js"] }
|
|
25
25
|
hex = "0.4.3"
|
|
26
|
-
html-to-markdown-rs = { version = "2.28.
|
|
27
|
-
image = { version = "0.25.
|
|
26
|
+
html-to-markdown-rs = { version = "2.28.2", default-features = false }
|
|
27
|
+
image = { version = "0.25.10", default-features = false }
|
|
28
28
|
itertools = "0.14"
|
|
29
29
|
js-sys = "0.3"
|
|
30
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.4.5", default-features = false }
|
|
31
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.4.5" }
|
|
30
32
|
lazy_static = "1.5.0"
|
|
31
|
-
libc = "0.2.
|
|
33
|
+
libc = "0.2.183"
|
|
32
34
|
log = "0.4"
|
|
33
35
|
lzma-rust2 = { version = "0.16.2" }
|
|
34
36
|
num_cpus = "1.17.0"
|
|
35
37
|
once_cell = "1.21.3"
|
|
36
38
|
ort = { version = "=2.0.0-rc.11", default-features = false }
|
|
37
39
|
parking_lot = "0.12.5"
|
|
38
|
-
pdfium-render = { package = "kreuzberg-pdfium-render", version = "4.3" }
|
|
40
|
+
pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
|
|
39
41
|
rayon = "1.11.0"
|
|
40
42
|
reqwest = { version = "0.13.2", default-features = false }
|
|
41
43
|
serde = { version = "1.0.228", features = ["derive"] }
|
|
42
44
|
serde_json = { version = "1.0.149" }
|
|
43
|
-
tempfile = "3.
|
|
45
|
+
tempfile = "3.27.0"
|
|
44
46
|
thiserror = "2.0.18"
|
|
45
|
-
tokio = { version = "1.50.0", features = ["
|
|
47
|
+
tokio = { version = "1.50.0", features = ["sync", "process", "fs", "io-util"] }
|
|
46
48
|
toml = "1.0.6"
|
|
47
49
|
tracing = "0.1"
|
|
48
50
|
wasm-bindgen = { version = "0.2", features = ["enable-interning"] }
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.4.
|
|
3
|
+
version = "4.4.5"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -25,13 +25,13 @@ pool-metrics = []
|
|
|
25
25
|
|
|
26
26
|
simd-utf8 = ["dep:simdutf8"]
|
|
27
27
|
|
|
28
|
-
tokio-runtime = ["dep:tokio"]
|
|
28
|
+
tokio-runtime = ["dep:tokio", "tokio/rt", "tokio/rt-multi-thread"]
|
|
29
29
|
|
|
30
30
|
pdf = ["dep:pdfium-render", "dep:lopdf", "dep:image"]
|
|
31
31
|
static-pdfium = ["pdf"]
|
|
32
32
|
bundled-pdfium = ["pdf"]
|
|
33
33
|
system-pdfium = ["pdf"]
|
|
34
|
-
excel = ["dep:calamine"
|
|
34
|
+
excel = ["dep:calamine"]
|
|
35
35
|
excel-wasm = ["dep:calamine"]
|
|
36
36
|
office = [
|
|
37
37
|
"dep:cfb",
|
|
@@ -70,6 +70,7 @@ paddle-ocr = [
|
|
|
70
70
|
"dep:sha2",
|
|
71
71
|
"dep:image",
|
|
72
72
|
"dep:hf-hub",
|
|
73
|
+
"dep:ureq",
|
|
73
74
|
"html",
|
|
74
75
|
"tokio-runtime",
|
|
75
76
|
"ocr",
|
|
@@ -105,7 +106,7 @@ wasm-target = [
|
|
|
105
106
|
]
|
|
106
107
|
wasm-threads = ["dep:wasm-bindgen-rayon"]
|
|
107
108
|
|
|
108
|
-
|
|
109
|
+
formats = [
|
|
109
110
|
"pdf",
|
|
110
111
|
"excel",
|
|
111
112
|
"office",
|
|
@@ -113,18 +114,13 @@ full = [
|
|
|
113
114
|
"html",
|
|
114
115
|
"xml",
|
|
115
116
|
"archives",
|
|
116
|
-
"ocr",
|
|
117
|
-
"paddle-ocr",
|
|
118
117
|
"language-detection",
|
|
119
118
|
"chunking",
|
|
120
|
-
"embeddings",
|
|
121
119
|
"quality",
|
|
122
120
|
"keywords",
|
|
123
121
|
"mdx",
|
|
124
|
-
"api",
|
|
125
|
-
"mcp",
|
|
126
|
-
"otel",
|
|
127
122
|
]
|
|
123
|
+
full = ["formats", "ocr", "paddle-ocr", "embeddings", "api", "mcp", "otel"]
|
|
128
124
|
server = ["pdf", "excel", "html", "ocr", "paddle-ocr", "chunking", "api", "mcp"]
|
|
129
125
|
|
|
130
126
|
[build-dependencies]
|
|
@@ -141,7 +137,7 @@ dashmap = "6.1"
|
|
|
141
137
|
simdutf8 = { version = "0.1", optional = true }
|
|
142
138
|
hex = "0.4.3"
|
|
143
139
|
lazy_static = "1.5.0"
|
|
144
|
-
libc = "0.2.
|
|
140
|
+
libc = "0.2.183"
|
|
145
141
|
memchr = "2.8.0"
|
|
146
142
|
num_cpus = "1.17.0"
|
|
147
143
|
once_cell = "1.21.3"
|
|
@@ -157,20 +153,23 @@ toml = "1.0.6"
|
|
|
157
153
|
mime_guess = "2.0"
|
|
158
154
|
rmp-serde = "1.3"
|
|
159
155
|
thiserror = "2.0.18"
|
|
160
|
-
tokio = { version = "1.50.0", features = ["
|
|
156
|
+
tokio = { version = "1.50.0", features = ["sync", "process", "fs", "io-util"], optional = true }
|
|
161
157
|
indexmap = "2.13.0"
|
|
162
158
|
tracing = "0.1"
|
|
163
|
-
pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render"
|
|
159
|
+
pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
|
|
164
160
|
lopdf = { version = "0.39.0", optional = true }
|
|
165
|
-
calamine = { version = "0.
|
|
161
|
+
calamine = { version = "0.34.0", features = ["dates"], optional = true }
|
|
166
162
|
|
|
167
163
|
roxmltree = { version = "0.21.1", optional = true }
|
|
168
164
|
zip = { version = "8.2.0", optional = true, default-features = false, features = [
|
|
169
165
|
"deflate-flate2",
|
|
170
166
|
] }
|
|
171
167
|
mail-parser = { version = "0.11.2", optional = true }
|
|
172
|
-
html-to-markdown-rs = { version = "2.28.
|
|
173
|
-
"inline-images",
|
|
168
|
+
html-to-markdown-rs = { version = "2.28.2", default-features = false, features = [
|
|
169
|
+
"inline-images",
|
|
170
|
+
"metadata",
|
|
171
|
+
"visitor",
|
|
172
|
+
], optional = true }
|
|
174
173
|
cfb = { version = "0.14.0", optional = true }
|
|
175
174
|
quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
|
|
176
175
|
tar = { version = "0.4.44", optional = true }
|
|
@@ -187,9 +186,17 @@ biblib = { version = "0.4", default-features = false, features = [
|
|
|
187
186
|
], optional = true }
|
|
188
187
|
org = { version = "0.3", optional = true }
|
|
189
188
|
|
|
190
|
-
kreuzberg-tesseract = { path = "../kreuzberg-tesseract",
|
|
191
|
-
image = { version = "0.25.
|
|
192
|
-
"png",
|
|
189
|
+
kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }
|
|
190
|
+
image = { version = "0.25.10", default-features = false, features = [
|
|
191
|
+
"png",
|
|
192
|
+
"jpeg",
|
|
193
|
+
"webp",
|
|
194
|
+
"bmp",
|
|
195
|
+
"tiff",
|
|
196
|
+
"gif",
|
|
197
|
+
"pnm",
|
|
198
|
+
"rayon",
|
|
199
|
+
], optional = true }
|
|
193
200
|
tiff = { version = "0.11", optional = true }
|
|
194
201
|
fast_image_resize = { version = "6.0.0", optional = true }
|
|
195
202
|
kamadak-exif = { version = "0.6.1", optional = true }
|
|
@@ -209,7 +216,7 @@ axum = { version = "0.8", features = ["macros", "json", "multipart"], optional =
|
|
|
209
216
|
tower = { version = "0.5", optional = true }
|
|
210
217
|
tower-http = { version = "0.6", features = ["cors", "trace", "limit"], optional = true }
|
|
211
218
|
utoipa = { version = "5.4", features = ["axum_extras"], optional = true }
|
|
212
|
-
rmcp = { version = "1.1.
|
|
219
|
+
rmcp = { version = "1.1.1", features = [
|
|
213
220
|
"server",
|
|
214
221
|
"macros",
|
|
215
222
|
"base64",
|
|
@@ -224,25 +231,29 @@ infer = "0.19.0"
|
|
|
224
231
|
sha2 = { version = "0.10", optional = true }
|
|
225
232
|
|
|
226
233
|
[dev-dependencies]
|
|
227
|
-
tempfile = "3.
|
|
234
|
+
tempfile = "3.27.0"
|
|
228
235
|
filetime = "0.2"
|
|
229
236
|
tar = "0.4.44"
|
|
230
237
|
zip = { version = "8.2.0", default-features = false, features = ["deflate-flate2"] }
|
|
231
238
|
serial_test = "3.4.0"
|
|
232
239
|
anyhow = "1.0"
|
|
240
|
+
tokio = { version = "1.50.0", features = ["macros", "time"] }
|
|
233
241
|
tokio-test = "0.4"
|
|
234
242
|
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
|
235
243
|
criterion = { version = "0.8", features = ["html_reports"] }
|
|
236
|
-
image = { version = "0.25.
|
|
244
|
+
image = { version = "0.25.10", default-features = false, features = ["png"] }
|
|
237
245
|
|
|
238
246
|
[target.'cfg(all(not(target_os = "windows"), not(target_arch = "wasm32")))'.dependencies]
|
|
239
247
|
pprof = { version = "0.15.0", features = ["flamegraph"], optional = true }
|
|
240
248
|
# PaddleOCR via ONNX Runtime - not available on WASM (vendored from paddle-ocr-rs)
|
|
241
|
-
kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr",
|
|
249
|
+
kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", optional = true }
|
|
242
250
|
# Use rustls on non-Windows platforms (Linux, macOS)
|
|
243
251
|
# Blocking feature needed for model downloads
|
|
244
|
-
reqwest = { version = "0.13.2", default-features = false
|
|
245
|
-
"json",
|
|
252
|
+
reqwest = { version = "0.13.2", default-features = false, features = [
|
|
253
|
+
"json",
|
|
254
|
+
"rustls",
|
|
255
|
+
"blocking",
|
|
256
|
+
], optional = true }
|
|
246
257
|
# Use rustls-tls for fastembed on non-Windows platforms
|
|
247
258
|
fastembed = { version = "5.12", default-features = false, features = [
|
|
248
259
|
"hf-hub-rustls-tls",
|
|
@@ -250,15 +261,18 @@ fastembed = { version = "5.12", default-features = false, features = [
|
|
|
250
261
|
], optional = true }
|
|
251
262
|
hf-hub = { version = "0.5", default-features = false, features = ["ureq"], optional = true }
|
|
252
263
|
# Force ureq (transitive dep via hf-hub) to use rustls on non-Windows
|
|
253
|
-
ureq = { version = "3.2", default-features = false, features = ["rustls", "json"] }
|
|
264
|
+
ureq = { version = "3.2", default-features = false, features = ["rustls", "json"], optional = true }
|
|
254
265
|
|
|
255
266
|
# Use native-tls on Windows to avoid aws-lc-sys CMake build issues with MinGW
|
|
256
267
|
[target.'cfg(all(target_os = "windows", not(target_arch = "wasm32")))'.dependencies]
|
|
257
268
|
# PaddleOCR via ONNX Runtime - not available on WASM (vendored from paddle-ocr-rs)
|
|
258
|
-
kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr",
|
|
269
|
+
kreuzberg-paddle-ocr = { path = "../kreuzberg-paddle-ocr", optional = true }
|
|
259
270
|
# Blocking feature needed for model downloads
|
|
260
|
-
reqwest = { version = "0.13.2", default-features = false
|
|
261
|
-
"json",
|
|
271
|
+
reqwest = { version = "0.13.2", default-features = false, features = [
|
|
272
|
+
"json",
|
|
273
|
+
"native-tls",
|
|
274
|
+
"blocking",
|
|
275
|
+
], optional = true }
|
|
262
276
|
# Use native-tls for fastembed on Windows
|
|
263
277
|
fastembed = { version = "5.12", default-features = false, features = [
|
|
264
278
|
"hf-hub-native-tls",
|
|
@@ -266,7 +280,10 @@ fastembed = { version = "5.12", default-features = false, features = [
|
|
|
266
280
|
], optional = true }
|
|
267
281
|
hf-hub = { version = "0.5", default-features = false, features = ["ureq"], optional = true }
|
|
268
282
|
# Force ureq (transitive dep via hf-hub) to use native-tls on Windows
|
|
269
|
-
ureq = { version = "3.2", default-features = false, features = [
|
|
283
|
+
ureq = { version = "3.2", default-features = false, features = [
|
|
284
|
+
"native-tls",
|
|
285
|
+
"json",
|
|
286
|
+
], optional = true }
|
|
270
287
|
|
|
271
288
|
[target.'cfg(target_arch = "wasm32")'.dependencies]
|
|
272
289
|
wasm-bindgen-rayon = { version = "1.3", optional = true }
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.4.
|
|
20
|
+
> **🚀 Version 4.4.5 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -170,6 +170,15 @@ pub async fn extract_handler(
|
|
|
170
170
|
}
|
|
171
171
|
};
|
|
172
172
|
}
|
|
173
|
+
"pdf_password" => {
|
|
174
|
+
let pwd = field
|
|
175
|
+
.text()
|
|
176
|
+
.await
|
|
177
|
+
.map_err(|e| ApiError::validation(crate::error::KreuzbergError::validation(e.to_string())))?;
|
|
178
|
+
let cfg = config.get_or_insert_with(|| (*state.default_config).clone());
|
|
179
|
+
let pdf_opts = cfg.pdf_options.get_or_insert_with(Default::default);
|
|
180
|
+
pdf_opts.passwords.get_or_insert_with(Vec::new).push(pwd);
|
|
181
|
+
}
|
|
173
182
|
_ => {}
|
|
174
183
|
}
|
|
175
184
|
}
|