kreuzberg 4.8.2 → 4.8.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +75 -62
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/ext/kreuzberg_rb/native/src/config/types.rs +1 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -5
- data/vendor/Cargo.toml +5 -5
- data/vendor/kreuzberg/Cargo.toml +5 -5
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
- data/vendor/kreuzberg/src/core/config/pdf.rs +23 -0
- data/vendor/kreuzberg/src/doc_orientation.rs +1 -1
- data/vendor/kreuzberg/src/embeddings/mod.rs +15 -1
- data/vendor/kreuzberg/src/extraction/derive.rs +15 -3
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +18 -1
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +140 -0
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +349 -0
- data/vendor/kreuzberg/src/layout/models/slanet.rs +1 -1
- data/vendor/kreuzberg/src/layout/models/table_classifier.rs +1 -1
- data/vendor/kreuzberg/src/layout/models/tatr.rs +1 -1
- data/vendor/kreuzberg/src/layout/session.rs +1 -1
- data/vendor/kreuzberg/src/lib.rs +1 -1
- data/vendor/kreuzberg/src/pdf/hierarchy/extraction.rs +4 -0
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -0
- data/vendor/kreuzberg/src/pdf/oxide/annotations.rs +258 -0
- data/vendor/kreuzberg/src/pdf/oxide/hierarchy.rs +235 -0
- data/vendor/kreuzberg/src/pdf/oxide/images.rs +53 -0
- data/vendor/kreuzberg/src/pdf/oxide/metadata.rs +381 -0
- data/vendor/kreuzberg/src/pdf/oxide/mod.rs +43 -0
- data/vendor/kreuzberg/src/pdf/oxide/table.rs +243 -0
- data/vendor/kreuzberg/src/pdf/oxide/text.rs +249 -0
- data/vendor/kreuzberg/src/pdf/oxide_text.rs +8 -6
- data/vendor/kreuzberg/src/pdf/structure/adapters.rs +1 -0
- data/vendor/kreuzberg/src/pdf/structure/assembly.rs +1 -0
- data/vendor/kreuzberg/src/pdf/structure/bridge.rs +51 -0
- data/vendor/kreuzberg/src/pdf/structure/classify.rs +3 -0
- data/vendor/kreuzberg/src/pdf/structure/content_convert.rs +3 -0
- data/vendor/kreuzberg/src/pdf/structure/layout_classify.rs +1 -0
- data/vendor/kreuzberg/src/pdf/structure/mod.rs +2 -0
- data/vendor/kreuzberg/src/pdf/structure/paragraphs.rs +2 -0
- data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +240 -1
- data/vendor/kreuzberg/src/pdf/table_reconstruct.rs +1 -0
- data/vendor/kreuzberg/src/pdf/text.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +5 -1
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +2 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +4 -10
- data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
- data/vendor/kreuzberg-ffi/src/config/merge.rs +0 -3
- data/vendor/kreuzberg-ffi/src/config_builder.rs +0 -6
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -1
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-paddle-ocr/src/base_net.rs +1 -3
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +9 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 8b636a0d3207a655747e4b288b0f66b616b460b282f49d2b4947ec23ac319cfc
|
|
4
|
+
data.tar.gz: ef5522acd061db0093d68afb96d440888f605aefaa6e0cc262f248e2e82e62cb
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 89b72104facd4e87ecc5c65011305abdcd77fd829073dcd235829d67297ad17b09136bb9c2df0dd3dcf7e79ba2f86bba6ceb69fdbf22b783725555b4efa375b3
|
|
7
|
+
data.tar.gz: c82c2c92b595613497385af930f497762d56831818cda45d230044a61fbbe5730da4d7781cb956fa6096b53e364dfff9772d3517fe2cc879785d53659a45d5d0
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.3" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -699,9 +699,9 @@ dependencies = [
|
|
|
699
699
|
|
|
700
700
|
[[package]]
|
|
701
701
|
name = "cc"
|
|
702
|
-
version = "1.2.
|
|
702
|
+
version = "1.2.60"
|
|
703
703
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
704
|
-
checksum = "
|
|
704
|
+
checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20"
|
|
705
705
|
dependencies = [
|
|
706
706
|
"find-msvc-tools",
|
|
707
707
|
"jobserver",
|
|
@@ -1162,19 +1162,20 @@ dependencies = [
|
|
|
1162
1162
|
|
|
1163
1163
|
[[package]]
|
|
1164
1164
|
name = "ctor"
|
|
1165
|
-
version = "0.
|
|
1165
|
+
version = "0.9.1"
|
|
1166
1166
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1167
|
-
checksum = "
|
|
1167
|
+
checksum = "c1c888a2a4f677017373fb6c01e13e318dd9e78758445ed5eb985e355d3f8281"
|
|
1168
1168
|
dependencies = [
|
|
1169
1169
|
"ctor-proc-macro",
|
|
1170
1170
|
"dtor",
|
|
1171
|
+
"link-section",
|
|
1171
1172
|
]
|
|
1172
1173
|
|
|
1173
1174
|
[[package]]
|
|
1174
1175
|
name = "ctor-proc-macro"
|
|
1175
|
-
version = "0.0.
|
|
1176
|
+
version = "0.0.12"
|
|
1176
1177
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1177
|
-
checksum = "
|
|
1178
|
+
checksum = "a7ab264ea985f1bd27887d7b21ea2bb046728e05d11909ca138d700c494730db"
|
|
1178
1179
|
|
|
1179
1180
|
[[package]]
|
|
1180
1181
|
name = "darling"
|
|
@@ -1416,18 +1417,18 @@ dependencies = [
|
|
|
1416
1417
|
|
|
1417
1418
|
[[package]]
|
|
1418
1419
|
name = "dtor"
|
|
1419
|
-
version = "0.
|
|
1420
|
+
version = "0.6.0"
|
|
1420
1421
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1421
|
-
checksum = "
|
|
1422
|
+
checksum = "30e4690622ab6700ced40fc370a3f07b7d111f0154bb6fb08f73b4c8834f75b6"
|
|
1422
1423
|
dependencies = [
|
|
1423
1424
|
"dtor-proc-macro",
|
|
1424
1425
|
]
|
|
1425
1426
|
|
|
1426
1427
|
[[package]]
|
|
1427
1428
|
name = "dtor-proc-macro"
|
|
1428
|
-
version = "0.0.
|
|
1429
|
+
version = "0.0.12"
|
|
1429
1430
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1430
|
-
checksum = "
|
|
1431
|
+
checksum = "8c98b077c7463d01d22dde8a24378ddf1ca7263dc687cffbed38819ea6c21131"
|
|
1431
1432
|
|
|
1432
1433
|
[[package]]
|
|
1433
1434
|
name = "dunce"
|
|
@@ -1868,9 +1869,9 @@ dependencies = [
|
|
|
1868
1869
|
|
|
1869
1870
|
[[package]]
|
|
1870
1871
|
name = "gif"
|
|
1871
|
-
version = "0.14.
|
|
1872
|
+
version = "0.14.2"
|
|
1872
1873
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1873
|
-
checksum = "
|
|
1874
|
+
checksum = "ee8cfcc411d9adbbaba82fb72661cc1bcca13e8bba98b364e62b2dba8f960159"
|
|
1874
1875
|
dependencies = [
|
|
1875
1876
|
"color_quant",
|
|
1876
1877
|
"weezl",
|
|
@@ -2029,6 +2030,12 @@ dependencies = [
|
|
|
2029
2030
|
"foldhash 0.2.0",
|
|
2030
2031
|
]
|
|
2031
2032
|
|
|
2033
|
+
[[package]]
|
|
2034
|
+
name = "hashbrown"
|
|
2035
|
+
version = "0.17.0"
|
|
2036
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2037
|
+
checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51"
|
|
2038
|
+
|
|
2032
2039
|
[[package]]
|
|
2033
2040
|
name = "hashify"
|
|
2034
2041
|
version = "0.2.9"
|
|
@@ -2094,7 +2101,7 @@ dependencies = [
|
|
|
2094
2101
|
"indicatif 0.17.11",
|
|
2095
2102
|
"libc",
|
|
2096
2103
|
"log",
|
|
2097
|
-
"rand 0.9.
|
|
2104
|
+
"rand 0.9.3",
|
|
2098
2105
|
"serde",
|
|
2099
2106
|
"serde_json",
|
|
2100
2107
|
"thiserror 2.0.18",
|
|
@@ -2113,7 +2120,7 @@ dependencies = [
|
|
|
2113
2120
|
"indicatif 0.18.4",
|
|
2114
2121
|
"libc",
|
|
2115
2122
|
"log",
|
|
2116
|
-
"rand 0.9.
|
|
2123
|
+
"rand 0.9.3",
|
|
2117
2124
|
"serde",
|
|
2118
2125
|
"serde_json",
|
|
2119
2126
|
"thiserror 2.0.18",
|
|
@@ -2526,7 +2533,7 @@ dependencies = [
|
|
|
2526
2533
|
"itertools 0.14.0",
|
|
2527
2534
|
"nalgebra",
|
|
2528
2535
|
"num",
|
|
2529
|
-
"rand 0.9.
|
|
2536
|
+
"rand 0.9.3",
|
|
2530
2537
|
"rand_distr",
|
|
2531
2538
|
]
|
|
2532
2539
|
|
|
@@ -2538,12 +2545,12 @@ checksum = "e7c5cedc30da3a610cac6b4ba17597bdf7152cf974e8aab3afb3d54455e371c8"
|
|
|
2538
2545
|
|
|
2539
2546
|
[[package]]
|
|
2540
2547
|
name = "indexmap"
|
|
2541
|
-
version = "2.
|
|
2548
|
+
version = "2.14.0"
|
|
2542
2549
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2543
|
-
checksum = "
|
|
2550
|
+
checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
|
|
2544
2551
|
dependencies = [
|
|
2545
2552
|
"equivalent",
|
|
2546
|
-
"hashbrown 0.
|
|
2553
|
+
"hashbrown 0.17.0",
|
|
2547
2554
|
"serde",
|
|
2548
2555
|
"serde_core",
|
|
2549
2556
|
]
|
|
@@ -2759,9 +2766,9 @@ checksum = "086b08ec7a274cd60cd575ed3651ba081ee72dec0d39a6210e8adcff9efe3880"
|
|
|
2759
2766
|
|
|
2760
2767
|
[[package]]
|
|
2761
2768
|
name = "js-sys"
|
|
2762
|
-
version = "0.3.
|
|
2769
|
+
version = "0.3.95"
|
|
2763
2770
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2764
|
-
checksum = "
|
|
2771
|
+
checksum = "2964e92d1d9dc3364cae4d718d93f227e3abb088e747d92e0395bfdedf1c12ca"
|
|
2765
2772
|
dependencies = [
|
|
2766
2773
|
"cfg-if",
|
|
2767
2774
|
"futures-util",
|
|
@@ -2780,7 +2787,7 @@ dependencies = [
|
|
|
2780
2787
|
|
|
2781
2788
|
[[package]]
|
|
2782
2789
|
name = "kreuzberg"
|
|
2783
|
-
version = "4.8.
|
|
2790
|
+
version = "4.8.3"
|
|
2784
2791
|
dependencies = [
|
|
2785
2792
|
"ahash",
|
|
2786
2793
|
"async-trait",
|
|
@@ -2875,7 +2882,7 @@ dependencies = [
|
|
|
2875
2882
|
|
|
2876
2883
|
[[package]]
|
|
2877
2884
|
name = "kreuzberg-ffi"
|
|
2878
|
-
version = "4.8.
|
|
2885
|
+
version = "4.8.3"
|
|
2879
2886
|
dependencies = [
|
|
2880
2887
|
"ahash",
|
|
2881
2888
|
"async-trait",
|
|
@@ -2891,7 +2898,7 @@ dependencies = [
|
|
|
2891
2898
|
|
|
2892
2899
|
[[package]]
|
|
2893
2900
|
name = "kreuzberg-paddle-ocr"
|
|
2894
|
-
version = "4.8.
|
|
2901
|
+
version = "4.8.3"
|
|
2895
2902
|
dependencies = [
|
|
2896
2903
|
"geo-clipper",
|
|
2897
2904
|
"geo-types",
|
|
@@ -2905,7 +2912,7 @@ dependencies = [
|
|
|
2905
2912
|
|
|
2906
2913
|
[[package]]
|
|
2907
2914
|
name = "kreuzberg-pdfium-render"
|
|
2908
|
-
version = "4.8.
|
|
2915
|
+
version = "4.8.3"
|
|
2909
2916
|
dependencies = [
|
|
2910
2917
|
"bitflags",
|
|
2911
2918
|
"bytemuck",
|
|
@@ -2928,7 +2935,7 @@ dependencies = [
|
|
|
2928
2935
|
|
|
2929
2936
|
[[package]]
|
|
2930
2937
|
name = "kreuzberg-rb"
|
|
2931
|
-
version = "4.8.
|
|
2938
|
+
version = "4.8.3"
|
|
2932
2939
|
dependencies = [
|
|
2933
2940
|
"async-trait",
|
|
2934
2941
|
"html-to-markdown-rs",
|
|
@@ -2945,7 +2952,7 @@ dependencies = [
|
|
|
2945
2952
|
|
|
2946
2953
|
[[package]]
|
|
2947
2954
|
name = "kreuzberg-tesseract"
|
|
2948
|
-
version = "4.8.
|
|
2955
|
+
version = "4.8.3"
|
|
2949
2956
|
dependencies = [
|
|
2950
2957
|
"cc",
|
|
2951
2958
|
"cmake",
|
|
@@ -3022,16 +3029,22 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
|
|
|
3022
3029
|
|
|
3023
3030
|
[[package]]
|
|
3024
3031
|
name = "libredox"
|
|
3025
|
-
version = "0.1.
|
|
3032
|
+
version = "0.1.16"
|
|
3026
3033
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3027
|
-
checksum = "
|
|
3034
|
+
checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c"
|
|
3028
3035
|
dependencies = [
|
|
3029
3036
|
"bitflags",
|
|
3030
3037
|
"libc",
|
|
3031
3038
|
"plain",
|
|
3032
|
-
"redox_syscall 0.7.
|
|
3039
|
+
"redox_syscall 0.7.4",
|
|
3033
3040
|
]
|
|
3034
3041
|
|
|
3042
|
+
[[package]]
|
|
3043
|
+
name = "link-section"
|
|
3044
|
+
version = "0.0.12"
|
|
3045
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3046
|
+
checksum = "f52437d47b0358721ec869cc7374b2a21f7b2237af9b439c0391341a1fbfbf1b"
|
|
3047
|
+
|
|
3035
3048
|
[[package]]
|
|
3036
3049
|
name = "linux-raw-sys"
|
|
3037
3050
|
version = "0.12.1"
|
|
@@ -3116,7 +3129,7 @@ dependencies = [
|
|
|
3116
3129
|
"md-5",
|
|
3117
3130
|
"nom 8.0.0",
|
|
3118
3131
|
"nom_locate",
|
|
3119
|
-
"rand 0.10.
|
|
3132
|
+
"rand 0.10.1",
|
|
3120
3133
|
"rangemap",
|
|
3121
3134
|
"rayon",
|
|
3122
3135
|
"sha2 0.10.9",
|
|
@@ -3698,7 +3711,7 @@ dependencies = [
|
|
|
3698
3711
|
"futures-util",
|
|
3699
3712
|
"opentelemetry",
|
|
3700
3713
|
"percent-encoding",
|
|
3701
|
-
"rand 0.9.
|
|
3714
|
+
"rand 0.9.3",
|
|
3702
3715
|
"thiserror 2.0.18",
|
|
3703
3716
|
"tokio",
|
|
3704
3717
|
"tokio-stream",
|
|
@@ -4065,7 +4078,7 @@ dependencies = [
|
|
|
4065
4078
|
"bytes",
|
|
4066
4079
|
"getrandom 0.3.4",
|
|
4067
4080
|
"lru-slab",
|
|
4068
|
-
"rand 0.9.
|
|
4081
|
+
"rand 0.9.3",
|
|
4069
4082
|
"ring",
|
|
4070
4083
|
"rustc-hash",
|
|
4071
4084
|
"rustls",
|
|
@@ -4131,9 +4144,9 @@ dependencies = [
|
|
|
4131
4144
|
|
|
4132
4145
|
[[package]]
|
|
4133
4146
|
name = "rand"
|
|
4134
|
-
version = "0.9.
|
|
4147
|
+
version = "0.9.3"
|
|
4135
4148
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4136
|
-
checksum = "
|
|
4149
|
+
checksum = "7ec095654a25171c2124e9e3393a930bddbffdc939556c914957a4c3e0a87166"
|
|
4137
4150
|
dependencies = [
|
|
4138
4151
|
"rand_chacha",
|
|
4139
4152
|
"rand_core 0.9.5",
|
|
@@ -4141,9 +4154,9 @@ dependencies = [
|
|
|
4141
4154
|
|
|
4142
4155
|
[[package]]
|
|
4143
4156
|
name = "rand"
|
|
4144
|
-
version = "0.10.
|
|
4157
|
+
version = "0.10.1"
|
|
4145
4158
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4146
|
-
checksum = "
|
|
4159
|
+
checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207"
|
|
4147
4160
|
dependencies = [
|
|
4148
4161
|
"chacha20",
|
|
4149
4162
|
"getrandom 0.4.2",
|
|
@@ -4182,7 +4195,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
4182
4195
|
checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
|
|
4183
4196
|
dependencies = [
|
|
4184
4197
|
"num-traits",
|
|
4185
|
-
"rand 0.9.
|
|
4198
|
+
"rand 0.9.3",
|
|
4186
4199
|
]
|
|
4187
4200
|
|
|
4188
4201
|
[[package]]
|
|
@@ -4218,7 +4231,7 @@ dependencies = [
|
|
|
4218
4231
|
"num-traits",
|
|
4219
4232
|
"paste",
|
|
4220
4233
|
"profiling",
|
|
4221
|
-
"rand 0.9.
|
|
4234
|
+
"rand 0.9.3",
|
|
4222
4235
|
"rand_chacha",
|
|
4223
4236
|
"simd_helpers",
|
|
4224
4237
|
"thiserror 2.0.18",
|
|
@@ -4319,9 +4332,9 @@ dependencies = [
|
|
|
4319
4332
|
|
|
4320
4333
|
[[package]]
|
|
4321
4334
|
name = "redox_syscall"
|
|
4322
|
-
version = "0.7.
|
|
4335
|
+
version = "0.7.4"
|
|
4323
4336
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4324
|
-
checksum = "
|
|
4337
|
+
checksum = "f450ad9c3b1da563fb6948a8e0fb0fb9269711c9c73d9ea1de5058c79c8d643a"
|
|
4325
4338
|
dependencies = [
|
|
4326
4339
|
"bitflags",
|
|
4327
4340
|
]
|
|
@@ -4454,9 +4467,9 @@ dependencies = [
|
|
|
4454
4467
|
|
|
4455
4468
|
[[package]]
|
|
4456
4469
|
name = "rmcp"
|
|
4457
|
-
version = "1.
|
|
4470
|
+
version = "1.4.0"
|
|
4458
4471
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4459
|
-
checksum = "
|
|
4472
|
+
checksum = "f542f74cf247da16f19bbc87e298cd201e912314f4083e88cdd671f44f5fcb53"
|
|
4460
4473
|
dependencies = [
|
|
4461
4474
|
"async-trait",
|
|
4462
4475
|
"base64 0.22.1",
|
|
@@ -4468,7 +4481,7 @@ dependencies = [
|
|
|
4468
4481
|
"http-body-util",
|
|
4469
4482
|
"pastey 0.2.1",
|
|
4470
4483
|
"pin-project-lite",
|
|
4471
|
-
"rand 0.10.
|
|
4484
|
+
"rand 0.10.1",
|
|
4472
4485
|
"rmcp-macros",
|
|
4473
4486
|
"schemars",
|
|
4474
4487
|
"serde",
|
|
@@ -4485,9 +4498,9 @@ dependencies = [
|
|
|
4485
4498
|
|
|
4486
4499
|
[[package]]
|
|
4487
4500
|
name = "rmcp-macros"
|
|
4488
|
-
version = "1.
|
|
4501
|
+
version = "1.4.0"
|
|
4489
4502
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4490
|
-
checksum = "
|
|
4503
|
+
checksum = "b2391e4ae47f314e70eaafb6c7bd82e495e770b935448864446302143019151f"
|
|
4491
4504
|
dependencies = [
|
|
4492
4505
|
"darling 0.23.0",
|
|
4493
4506
|
"proc-macro2",
|
|
@@ -4616,9 +4629,9 @@ checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f"
|
|
|
4616
4629
|
|
|
4617
4630
|
[[package]]
|
|
4618
4631
|
name = "rustls-webpki"
|
|
4619
|
-
version = "0.103.
|
|
4632
|
+
version = "0.103.11"
|
|
4620
4633
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4621
|
-
checksum = "
|
|
4634
|
+
checksum = "20a6af516fea4b20eccceaf166e8aa666ac996208e8a644ce3ef5aa783bc7cd4"
|
|
4622
4635
|
dependencies = [
|
|
4623
4636
|
"aws-lc-rs",
|
|
4624
4637
|
"ring",
|
|
@@ -5032,9 +5045,9 @@ dependencies = [
|
|
|
5032
5045
|
|
|
5033
5046
|
[[package]]
|
|
5034
5047
|
name = "sse-stream"
|
|
5035
|
-
version = "0.2.
|
|
5048
|
+
version = "0.2.2"
|
|
5036
5049
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5037
|
-
checksum = "
|
|
5050
|
+
checksum = "2c5e6deb40826033bd7b11c7ef25ef71193fabd71f680f40dd16538a2704d2f4"
|
|
5038
5051
|
dependencies = [
|
|
5039
5052
|
"bytes",
|
|
5040
5053
|
"futures-util",
|
|
@@ -5369,7 +5382,7 @@ dependencies = [
|
|
|
5369
5382
|
"monostate",
|
|
5370
5383
|
"onig",
|
|
5371
5384
|
"paste",
|
|
5372
|
-
"rand 0.9.
|
|
5385
|
+
"rand 0.9.3",
|
|
5373
5386
|
"rayon",
|
|
5374
5387
|
"rayon-cond",
|
|
5375
5388
|
"regex",
|
|
@@ -6040,9 +6053,9 @@ dependencies = [
|
|
|
6040
6053
|
|
|
6041
6054
|
[[package]]
|
|
6042
6055
|
name = "wasm-bindgen"
|
|
6043
|
-
version = "0.2.
|
|
6056
|
+
version = "0.2.118"
|
|
6044
6057
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6045
|
-
checksum = "
|
|
6058
|
+
checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89"
|
|
6046
6059
|
dependencies = [
|
|
6047
6060
|
"cfg-if",
|
|
6048
6061
|
"once_cell",
|
|
@@ -6053,9 +6066,9 @@ dependencies = [
|
|
|
6053
6066
|
|
|
6054
6067
|
[[package]]
|
|
6055
6068
|
name = "wasm-bindgen-futures"
|
|
6056
|
-
version = "0.4.
|
|
6069
|
+
version = "0.4.68"
|
|
6057
6070
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6058
|
-
checksum = "
|
|
6071
|
+
checksum = "f371d383f2fb139252e0bfac3b81b265689bf45b6874af544ffa4c975ac1ebf8"
|
|
6059
6072
|
dependencies = [
|
|
6060
6073
|
"js-sys",
|
|
6061
6074
|
"wasm-bindgen",
|
|
@@ -6063,9 +6076,9 @@ dependencies = [
|
|
|
6063
6076
|
|
|
6064
6077
|
[[package]]
|
|
6065
6078
|
name = "wasm-bindgen-macro"
|
|
6066
|
-
version = "0.2.
|
|
6079
|
+
version = "0.2.118"
|
|
6067
6080
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6068
|
-
checksum = "
|
|
6081
|
+
checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed"
|
|
6069
6082
|
dependencies = [
|
|
6070
6083
|
"quote",
|
|
6071
6084
|
"wasm-bindgen-macro-support",
|
|
@@ -6073,9 +6086,9 @@ dependencies = [
|
|
|
6073
6086
|
|
|
6074
6087
|
[[package]]
|
|
6075
6088
|
name = "wasm-bindgen-macro-support"
|
|
6076
|
-
version = "0.2.
|
|
6089
|
+
version = "0.2.118"
|
|
6077
6090
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6078
|
-
checksum = "
|
|
6091
|
+
checksum = "9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904"
|
|
6079
6092
|
dependencies = [
|
|
6080
6093
|
"bumpalo",
|
|
6081
6094
|
"proc-macro2",
|
|
@@ -6086,9 +6099,9 @@ dependencies = [
|
|
|
6086
6099
|
|
|
6087
6100
|
[[package]]
|
|
6088
6101
|
name = "wasm-bindgen-shared"
|
|
6089
|
-
version = "0.2.
|
|
6102
|
+
version = "0.2.118"
|
|
6090
6103
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6091
|
-
checksum = "
|
|
6104
|
+
checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129"
|
|
6092
6105
|
dependencies = [
|
|
6093
6106
|
"unicode-ident",
|
|
6094
6107
|
]
|
|
@@ -6142,9 +6155,9 @@ dependencies = [
|
|
|
6142
6155
|
|
|
6143
6156
|
[[package]]
|
|
6144
6157
|
name = "web-sys"
|
|
6145
|
-
version = "0.3.
|
|
6158
|
+
version = "0.3.95"
|
|
6146
6159
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6147
|
-
checksum = "
|
|
6160
|
+
checksum = "4f2dfbb17949fa2088e5d39408c48368947b86f7834484e87b73de55bc14d97d"
|
|
6148
6161
|
dependencies = [
|
|
6149
6162
|
"js-sys",
|
|
6150
6163
|
"wasm-bindgen",
|
data/lib/kreuzberg/version.rb
CHANGED
data/lib/kreuzberg.rb
CHANGED
|
@@ -37,15 +37,11 @@ module Kreuzberg
|
|
|
37
37
|
autoload :DocumentStructure, 'kreuzberg/types'
|
|
38
38
|
autoload :PdfAnnotation, 'kreuzberg/types'
|
|
39
39
|
autoload :PdfAnnotationBoundingBox, 'kreuzberg/types'
|
|
40
|
+
autoload :KeywordAlgorithm, 'kreuzberg/types'
|
|
40
41
|
|
|
41
42
|
ExtractionConfig = Config::Extraction
|
|
42
43
|
PageConfig = Config::PageConfig
|
|
43
44
|
|
|
44
|
-
module KeywordAlgorithm
|
|
45
|
-
YAKE = :yake
|
|
46
|
-
RAKE = :rake
|
|
47
|
-
end
|
|
48
|
-
|
|
49
45
|
@__cache_tracker = { entries: 0, bytes: 0 }
|
|
50
46
|
|
|
51
47
|
class << self
|
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.8.
|
|
5
|
+
version = "4.8.3"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -23,7 +23,7 @@ clap = { version = "4.6", features = ["derive", "color", "suggestions"] }
|
|
|
23
23
|
comrak = { version = "0.52", default-features = false }
|
|
24
24
|
console_error_panic_hook = "0.1"
|
|
25
25
|
criterion = { version = "0.8", features = ["html_reports"] }
|
|
26
|
-
ctor = "0.
|
|
26
|
+
ctor = "0.9"
|
|
27
27
|
dbase = "0.7"
|
|
28
28
|
futures = "0.3"
|
|
29
29
|
getrandom = { version = "0.4.2", features = ["wasm_js"] }
|
|
@@ -32,8 +32,8 @@ html-to-markdown-rs = { version = "3.1.0", default-features = false }
|
|
|
32
32
|
image = { version = "0.25.10", default-features = false }
|
|
33
33
|
itertools = "0.14"
|
|
34
34
|
js-sys = "0.3"
|
|
35
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.8.
|
|
36
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.8.
|
|
35
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.8.3", default-features = false }
|
|
36
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.8.3" }
|
|
37
37
|
lazy_static = "1.5.0"
|
|
38
38
|
libc = "0.2.184"
|
|
39
39
|
liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
|
|
@@ -45,7 +45,7 @@ num_cpus = "1.17.0"
|
|
|
45
45
|
once_cell = "1.21.4"
|
|
46
46
|
ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
|
|
47
47
|
parking_lot = "0.12.5"
|
|
48
|
-
pdf_oxide = { version = "0.3.
|
|
48
|
+
pdf_oxide = { version = "0.3.24", default-features = false }
|
|
49
49
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
|
|
50
50
|
rayon = "1.11.0"
|
|
51
51
|
reqwest = { version = "0.13.2", default-features = false }
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.8.
|
|
3
|
+
version = "4.8.3"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -285,7 +285,7 @@ image = { version = "0.25.10", default-features = false, features = [
|
|
|
285
285
|
"pnm",
|
|
286
286
|
"rayon",
|
|
287
287
|
], optional = true }
|
|
288
|
-
indexmap = "2.
|
|
288
|
+
indexmap = "2.14.0"
|
|
289
289
|
infer = "0.19.0"
|
|
290
290
|
jotdown = "0.9"
|
|
291
291
|
kamadak-exif = { version = "0.6.1", optional = true }
|
|
@@ -314,14 +314,14 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
|
|
|
314
314
|
outlook-pst = { version = "1.2.0", optional = true }
|
|
315
315
|
parking_lot = "0.12.5"
|
|
316
316
|
pastey = "0.2"
|
|
317
|
-
pdf_oxide = { version = "0.3.
|
|
317
|
+
pdf_oxide = { version = "0.3.24", default-features = false, optional = true }
|
|
318
318
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
|
|
319
319
|
pulldown-cmark = { version = "0.13" }
|
|
320
320
|
quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
|
|
321
321
|
rake = { version = "0.3.6", optional = true }
|
|
322
322
|
rayon = "1.11.0"
|
|
323
323
|
regex = "1.12.3"
|
|
324
|
-
rmcp = { version = "1.
|
|
324
|
+
rmcp = { version = "1.4.0", features = [
|
|
325
325
|
"server",
|
|
326
326
|
"macros",
|
|
327
327
|
"base64",
|
|
@@ -405,7 +405,7 @@ criterion = { version = "0.8", features = ["html_reports"] }
|
|
|
405
405
|
dotenvy = "0.15"
|
|
406
406
|
filetime = "0.2"
|
|
407
407
|
image = { version = "0.25.10", default-features = false, features = ["png"] }
|
|
408
|
-
jsonschema = "0.
|
|
408
|
+
jsonschema = "0.46"
|
|
409
409
|
serial_test = "3.4.0"
|
|
410
410
|
tar = "0.4.45"
|
|
411
411
|
tempfile = "3.27.0"
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
18
18
|
|
|
19
19
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
20
20
|
|
|
21
|
-
> **🚀 Version 4.8.
|
|
21
|
+
> **🚀 Version 4.8.3 Release**
|
|
22
22
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
23
23
|
>
|
|
24
24
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -38,7 +38,7 @@ pub use llm::{LlmConfig, StructuredExtractionConfig};
|
|
|
38
38
|
pub use ocr::{OcrConfig, OcrPipelineConfig, OcrPipelineStage, OcrQualityThresholds};
|
|
39
39
|
pub use page::PageConfig;
|
|
40
40
|
#[cfg(feature = "pdf")]
|
|
41
|
-
pub use pdf::{HierarchyConfig, PdfConfig};
|
|
41
|
+
pub use pdf::{HierarchyConfig, PdfBackend, PdfConfig};
|
|
42
42
|
pub use processing::{
|
|
43
43
|
ChunkSizing, ChunkerType, ChunkingConfig, EmbeddingConfig, EmbeddingModelType, PostProcessorConfig,
|
|
44
44
|
};
|
|
@@ -5,10 +5,31 @@
|
|
|
5
5
|
|
|
6
6
|
use serde::{Deserialize, Serialize};
|
|
7
7
|
|
|
8
|
+
/// PDF extraction backend selection.
|
|
9
|
+
///
|
|
10
|
+
/// Controls which PDF library is used for text extraction:
|
|
11
|
+
/// - `Pdfium`: pdfium-render (default, C++ based, mature)
|
|
12
|
+
/// - `PdfOxide`: pdf_oxide (pure Rust, faster, requires `pdf-oxide` feature)
|
|
13
|
+
/// - `Auto`: automatically select based on available features
|
|
14
|
+
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
|
|
15
|
+
#[serde(rename_all = "lowercase")]
|
|
16
|
+
pub enum PdfBackend {
|
|
17
|
+
/// Use pdfium-render backend (default).
|
|
18
|
+
#[default]
|
|
19
|
+
Pdfium,
|
|
20
|
+
/// Use pdf_oxide backend (pure Rust). Requires `pdf-oxide` feature.
|
|
21
|
+
PdfOxide,
|
|
22
|
+
/// Automatically select the best available backend.
|
|
23
|
+
Auto,
|
|
24
|
+
}
|
|
25
|
+
|
|
8
26
|
/// PDF-specific configuration.
|
|
9
27
|
#[cfg(feature = "pdf")]
|
|
10
28
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
11
29
|
pub struct PdfConfig {
|
|
30
|
+
/// PDF extraction backend. Default: `Pdfium`.
|
|
31
|
+
#[serde(default)]
|
|
32
|
+
pub backend: PdfBackend,
|
|
12
33
|
/// Extract images from PDF
|
|
13
34
|
#[serde(default)]
|
|
14
35
|
pub extract_images: bool,
|
|
@@ -86,6 +107,7 @@ pub struct HierarchyConfig {
|
|
|
86
107
|
impl Default for PdfConfig {
|
|
87
108
|
fn default() -> Self {
|
|
88
109
|
Self {
|
|
110
|
+
backend: PdfBackend::default(),
|
|
89
111
|
extract_images: false,
|
|
90
112
|
passwords: None,
|
|
91
113
|
extract_metadata: true,
|
|
@@ -155,6 +177,7 @@ mod tests {
|
|
|
155
177
|
fn test_pdf_config_custom_margins() {
|
|
156
178
|
use super::*;
|
|
157
179
|
let config = PdfConfig {
|
|
180
|
+
backend: PdfBackend::default(),
|
|
158
181
|
extract_images: false,
|
|
159
182
|
passwords: None,
|
|
160
183
|
extract_metadata: true,
|
|
@@ -174,7 +174,7 @@ impl DocOrientationDetector {
|
|
|
174
174
|
message: format!("Failed to create doc_ori session builder: {e}"),
|
|
175
175
|
source: None,
|
|
176
176
|
})?
|
|
177
|
-
.with_optimization_level(GraphOptimizationLevel::
|
|
177
|
+
.with_optimization_level(GraphOptimizationLevel::All)
|
|
178
178
|
.map_err(|e| KreuzbergError::Ocr {
|
|
179
179
|
message: format!("Failed to set doc_ori optimization level: {e}"),
|
|
180
180
|
source: None,
|