kreuzberg 4.3.7 → 4.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +72 -39
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +2 -2
- data/vendor/kreuzberg/Cargo.toml +17 -4
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/handlers.rs +18 -0
- data/vendor/kreuzberg/src/api/openapi.rs +2 -0
- data/vendor/kreuzberg/src/api/router.rs +3 -2
- data/vendor/kreuzberg/src/core/mime.rs +536 -208
- data/vendor/kreuzberg/src/extractors/mdx.rs +945 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +15 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +14 -1
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +4 -2
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +12 -1
- data/vendor/kreuzberg/src/lib.rs +3 -2
- data/vendor/kreuzberg/src/mcp/server.rs +20 -2
- data/vendor/kreuzberg/src/pdf/markdown/assembly.rs +81 -9
- data/vendor/kreuzberg/src/pdf/markdown/bridge.rs +269 -20
- data/vendor/kreuzberg/src/pdf/markdown/classify.rs +94 -0
- data/vendor/kreuzberg/src/pdf/markdown/constants.rs +6 -0
- data/vendor/kreuzberg/src/pdf/markdown/pipeline.rs +470 -6
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +1 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +3 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +3 -0
- data/vendor/kreuzberg/tests/page_markers.rs +106 -0
- data/vendor/kreuzberg/tests/pdf_markdown_extraction.rs +2 -2
- data/vendor/kreuzberg/tests/pdf_markdown_regression.rs +726 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +1 -1
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 32a54c81c378239aaf604785d6475aa99a97b42a59dbe3e4fed156df0109ad28
|
|
4
|
+
data.tar.gz: dd0a6158f4a5c112faeb7e1bae603d3f7f4da5c21c2102105a40864154c35448
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 073f79596f0c96b783645800f152c84ecbae090bec967019d841fc155875275a6db5c1e8007e8446091ed08d6372d222881ffd7b460b2a91b3d556481acb9a63
|
|
7
|
+
data.tar.gz: 9c0422293b1bd07c79227056159f574106fbb1f6306549fd267ff3eb892332ebdddd5762c15ee8aec75c4837095fe2d6ee392965662fc00cad573a8eb962b71e
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.3.7)
|
|
4
|
+
kreuzberg (4.3.8)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
sorbet-runtime (~> 0.5)
|
|
7
7
|
|
|
@@ -124,7 +124,7 @@ GEM
|
|
|
124
124
|
rubocop (~> 1.81)
|
|
125
125
|
ruby-progressbar (1.13.0)
|
|
126
126
|
securerandom (0.4.1)
|
|
127
|
-
sorbet-runtime (0.6.
|
|
127
|
+
sorbet-runtime (0.6.12957)
|
|
128
128
|
steep (1.10.0)
|
|
129
129
|
activesupport (>= 5.1)
|
|
130
130
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -210,7 +210,7 @@ CHECKSUMS
|
|
|
210
210
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
211
211
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
212
212
|
json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
|
|
213
|
-
kreuzberg (4.3.7)
|
|
213
|
+
kreuzberg (4.3.8)
|
|
214
214
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
215
215
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
216
216
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
@@ -245,7 +245,7 @@ CHECKSUMS
|
|
|
245
245
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
246
246
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
247
247
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
248
|
-
sorbet-runtime (0.6.
|
|
248
|
+
sorbet-runtime (0.6.12957) sha256=691ccafeae2663236777ffc8a348266907fca3985b089f013d7ccd59bef19056
|
|
249
249
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
250
250
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
251
251
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.7" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.8" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
data/ext/kreuzberg_rb/native/Cargo.lock
CHANGED
|
@@ -470,9 +470,9 @@ dependencies = [
|
|
|
470
470
|
|
|
471
471
|
[[package]]
|
|
472
472
|
name = "biblib"
|
|
473
|
-
version = "0.
|
|
473
|
+
version = "0.4.2"
|
|
474
474
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
475
|
-
checksum = "
|
|
475
|
+
checksum = "f289d61c020b03326674c9a13caf0b9a2744465510df96ff97979083a28517c6"
|
|
476
476
|
dependencies = [
|
|
477
477
|
"compact_str",
|
|
478
478
|
"either",
|
|
@@ -976,6 +976,19 @@ dependencies = [
|
|
|
976
976
|
"windows-sys 0.59.0",
|
|
977
977
|
]
|
|
978
978
|
|
|
979
|
+
[[package]]
|
|
980
|
+
name = "console"
|
|
981
|
+
version = "0.16.2"
|
|
982
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
983
|
+
checksum = "03e45a4a8926227e4197636ba97a9fc9b00477e9f4bd711395687c5f0734bec4"
|
|
984
|
+
dependencies = [
|
|
985
|
+
"encode_unicode",
|
|
986
|
+
"libc",
|
|
987
|
+
"once_cell",
|
|
988
|
+
"unicode-width",
|
|
989
|
+
"windows-sys 0.61.2",
|
|
990
|
+
]
|
|
991
|
+
|
|
979
992
|
[[package]]
|
|
980
993
|
name = "console_error_panic_hook"
|
|
981
994
|
version = "0.1.7"
|
|
@@ -1119,21 +1132,6 @@ dependencies = [
|
|
|
1119
1132
|
"libc",
|
|
1120
1133
|
]
|
|
1121
1134
|
|
|
1122
|
-
[[package]]
|
|
1123
|
-
name = "crc"
|
|
1124
|
-
version = "3.3.0"
|
|
1125
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1126
|
-
checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675"
|
|
1127
|
-
dependencies = [
|
|
1128
|
-
"crc-catalog",
|
|
1129
|
-
]
|
|
1130
|
-
|
|
1131
|
-
[[package]]
|
|
1132
|
-
name = "crc-catalog"
|
|
1133
|
-
version = "2.4.0"
|
|
1134
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1135
|
-
checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5"
|
|
1136
|
-
|
|
1137
1135
|
[[package]]
|
|
1138
1136
|
name = "crc32fast"
|
|
1139
1137
|
version = "1.5.0"
|
|
@@ -1657,7 +1655,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
1657
1655
|
checksum = "b4339d45a80579ab8305616a501eacdbf18fb0f7def7fa6e4c0b75941416d5b0"
|
|
1658
1656
|
dependencies = [
|
|
1659
1657
|
"anyhow",
|
|
1660
|
-
"hf-hub",
|
|
1658
|
+
"hf-hub 0.4.3",
|
|
1661
1659
|
"ndarray",
|
|
1662
1660
|
"ort",
|
|
1663
1661
|
"safetensors",
|
|
@@ -2237,7 +2235,7 @@ checksum = "629d8f3bbeda9d148036d6b0de0a3ab947abd08ce90626327fc3547a49d59d97"
|
|
|
2237
2235
|
dependencies = [
|
|
2238
2236
|
"dirs",
|
|
2239
2237
|
"http",
|
|
2240
|
-
"indicatif",
|
|
2238
|
+
"indicatif 0.17.11",
|
|
2241
2239
|
"libc",
|
|
2242
2240
|
"log",
|
|
2243
2241
|
"native-tls",
|
|
@@ -2250,6 +2248,25 @@ dependencies = [
|
|
|
2250
2248
|
"windows-sys 0.60.2",
|
|
2251
2249
|
]
|
|
2252
2250
|
|
|
2251
|
+
[[package]]
|
|
2252
|
+
name = "hf-hub"
|
|
2253
|
+
version = "0.5.0"
|
|
2254
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2255
|
+
checksum = "aef3982638978efa195ff11b305f51f1f22f4f0a6cabee7af79b383ebee6a213"
|
|
2256
|
+
dependencies = [
|
|
2257
|
+
"dirs",
|
|
2258
|
+
"http",
|
|
2259
|
+
"indicatif 0.18.4",
|
|
2260
|
+
"libc",
|
|
2261
|
+
"log",
|
|
2262
|
+
"rand 0.9.2",
|
|
2263
|
+
"serde",
|
|
2264
|
+
"serde_json",
|
|
2265
|
+
"thiserror 2.0.18",
|
|
2266
|
+
"ureq 3.2.0",
|
|
2267
|
+
"windows-sys 0.61.2",
|
|
2268
|
+
]
|
|
2269
|
+
|
|
2253
2270
|
[[package]]
|
|
2254
2271
|
name = "hmac"
|
|
2255
2272
|
version = "0.12.1"
|
|
@@ -2709,13 +2726,26 @@ version = "0.17.11"
|
|
|
2709
2726
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2710
2727
|
checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235"
|
|
2711
2728
|
dependencies = [
|
|
2712
|
-
"console",
|
|
2729
|
+
"console 0.15.11",
|
|
2713
2730
|
"number_prefix",
|
|
2714
2731
|
"portable-atomic",
|
|
2715
2732
|
"unicode-width",
|
|
2716
2733
|
"web-time",
|
|
2717
2734
|
]
|
|
2718
2735
|
|
|
2736
|
+
[[package]]
|
|
2737
|
+
name = "indicatif"
|
|
2738
|
+
version = "0.18.4"
|
|
2739
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2740
|
+
checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb"
|
|
2741
|
+
dependencies = [
|
|
2742
|
+
"console 0.16.2",
|
|
2743
|
+
"portable-atomic",
|
|
2744
|
+
"unicode-width",
|
|
2745
|
+
"unit-prefix",
|
|
2746
|
+
"web-time",
|
|
2747
|
+
]
|
|
2748
|
+
|
|
2719
2749
|
[[package]]
|
|
2720
2750
|
name = "infer"
|
|
2721
2751
|
version = "0.19.0"
|
|
@@ -2901,7 +2931,7 @@ dependencies = [
|
|
|
2901
2931
|
|
|
2902
2932
|
[[package]]
|
|
2903
2933
|
name = "kreuzberg"
|
|
2904
|
-
version = "4.3.7"
|
|
2934
|
+
version = "4.3.8"
|
|
2905
2935
|
dependencies = [
|
|
2906
2936
|
"ahash",
|
|
2907
2937
|
"async-trait",
|
|
@@ -2926,7 +2956,7 @@ dependencies = [
|
|
|
2926
2956
|
"hayro-jbig2",
|
|
2927
2957
|
"hayro-jpeg2000",
|
|
2928
2958
|
"hex",
|
|
2929
|
-
"hf-hub",
|
|
2959
|
+
"hf-hub 0.5.0",
|
|
2930
2960
|
"html-to-markdown-rs",
|
|
2931
2961
|
"image",
|
|
2932
2962
|
"indexmap",
|
|
@@ -2939,7 +2969,7 @@ dependencies = [
|
|
|
2939
2969
|
"lazy_static",
|
|
2940
2970
|
"libc",
|
|
2941
2971
|
"lopdf",
|
|
2942
|
-
"lzma-rust2 0.
|
|
2972
|
+
"lzma-rust2 0.16.2",
|
|
2943
2973
|
"mail-parser",
|
|
2944
2974
|
"memchr",
|
|
2945
2975
|
"mime_guess",
|
|
@@ -2954,7 +2984,7 @@ dependencies = [
|
|
|
2954
2984
|
"pkg-config",
|
|
2955
2985
|
"polars",
|
|
2956
2986
|
"pulldown-cmark",
|
|
2957
|
-
"quick-xml 0.39.
|
|
2987
|
+
"quick-xml 0.39.2",
|
|
2958
2988
|
"rake",
|
|
2959
2989
|
"rayon",
|
|
2960
2990
|
"regex",
|
|
@@ -2993,7 +3023,7 @@ dependencies = [
|
|
|
2993
3023
|
|
|
2994
3024
|
[[package]]
|
|
2995
3025
|
name = "kreuzberg-ffi"
|
|
2996
|
-
version = "4.3.7"
|
|
3026
|
+
version = "4.3.8"
|
|
2997
3027
|
dependencies = [
|
|
2998
3028
|
"ahash",
|
|
2999
3029
|
"async-trait",
|
|
@@ -3009,7 +3039,7 @@ dependencies = [
|
|
|
3009
3039
|
|
|
3010
3040
|
[[package]]
|
|
3011
3041
|
name = "kreuzberg-paddle-ocr"
|
|
3012
|
-
version = "4.3.7"
|
|
3042
|
+
version = "4.3.8"
|
|
3013
3043
|
dependencies = [
|
|
3014
3044
|
"geo-clipper",
|
|
3015
3045
|
"geo-types",
|
|
@@ -3024,7 +3054,7 @@ dependencies = [
|
|
|
3024
3054
|
|
|
3025
3055
|
[[package]]
|
|
3026
3056
|
name = "kreuzberg-pdfium-render"
|
|
3027
|
-
version = "4.3.7"
|
|
3057
|
+
version = "4.3.8"
|
|
3028
3058
|
dependencies = [
|
|
3029
3059
|
"bitflags",
|
|
3030
3060
|
"bytemuck",
|
|
@@ -3048,7 +3078,7 @@ dependencies = [
|
|
|
3048
3078
|
|
|
3049
3079
|
[[package]]
|
|
3050
3080
|
name = "kreuzberg-rb"
|
|
3051
|
-
version = "4.3.7"
|
|
3081
|
+
version = "4.3.8"
|
|
3052
3082
|
dependencies = [
|
|
3053
3083
|
"async-trait",
|
|
3054
3084
|
"html-to-markdown-rs",
|
|
@@ -3065,7 +3095,7 @@ dependencies = [
|
|
|
3065
3095
|
|
|
3066
3096
|
[[package]]
|
|
3067
3097
|
name = "kreuzberg-tesseract"
|
|
3068
|
-
version = "4.3.7"
|
|
3098
|
+
version = "4.3.8"
|
|
3069
3099
|
dependencies = [
|
|
3070
3100
|
"cc",
|
|
3071
3101
|
"cmake",
|
|
@@ -3295,18 +3325,13 @@ name = "lzma-rust2"
|
|
|
3295
3325
|
version = "0.15.7"
|
|
3296
3326
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3297
3327
|
checksum = "1670343e58806300d87950e3401e820b519b9384281bbabfb15e3636689ffd69"
|
|
3298
|
-
dependencies = [
|
|
3299
|
-
"crc",
|
|
3300
|
-
"sha2",
|
|
3301
|
-
]
|
|
3302
3328
|
|
|
3303
3329
|
[[package]]
|
|
3304
3330
|
name = "lzma-rust2"
|
|
3305
|
-
version = "0.16.
|
|
3331
|
+
version = "0.16.2"
|
|
3306
3332
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
3307
|
-
checksum = "
|
|
3333
|
+
checksum = "47bb1e988e6fb779cf720ad431242d3f03167c1b3f2b1aae7f1a94b2495b36ae"
|
|
3308
3334
|
dependencies = [
|
|
3309
|
-
"crc",
|
|
3310
3335
|
"sha2",
|
|
3311
3336
|
]
|
|
3312
3337
|
|
|
@@ -4891,9 +4916,9 @@ dependencies = [
|
|
|
4891
4916
|
|
|
4892
4917
|
[[package]]
|
|
4893
4918
|
name = "quick-xml"
|
|
4894
|
-
version = "0.39.
|
|
4919
|
+
version = "0.39.2"
|
|
4895
4920
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
4896
|
-
checksum = "
|
|
4921
|
+
checksum = "958f21e8e7ceb5a1aa7fa87fab28e7c75976e0bfe7e23ff069e0a260f894067d"
|
|
4897
4922
|
dependencies = [
|
|
4898
4923
|
"memchr",
|
|
4899
4924
|
"serde",
|
|
@@ -7035,6 +7060,12 @@ version = "0.1.1"
|
|
|
7035
7060
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
7036
7061
|
checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
|
|
7037
7062
|
|
|
7063
|
+
[[package]]
|
|
7064
|
+
name = "unit-prefix"
|
|
7065
|
+
version = "0.5.2"
|
|
7066
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
7067
|
+
checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3"
|
|
7068
|
+
|
|
7038
7069
|
[[package]]
|
|
7039
7070
|
name = "unsafe-libyaml"
|
|
7040
7071
|
version = "0.2.11"
|
|
@@ -7088,6 +7119,7 @@ dependencies = [
|
|
|
7088
7119
|
"base64 0.22.1",
|
|
7089
7120
|
"cookie_store",
|
|
7090
7121
|
"der",
|
|
7122
|
+
"flate2",
|
|
7091
7123
|
"log",
|
|
7092
7124
|
"native-tls",
|
|
7093
7125
|
"percent-encoding",
|
|
@@ -7095,6 +7127,7 @@ dependencies = [
|
|
|
7095
7127
|
"rustls-pki-types",
|
|
7096
7128
|
"serde",
|
|
7097
7129
|
"serde_json",
|
|
7130
|
+
"socks",
|
|
7098
7131
|
"ureq-proto",
|
|
7099
7132
|
"utf-8",
|
|
7100
7133
|
"webpki-root-certs",
|
|
@@ -8122,7 +8155,7 @@ dependencies = [
|
|
|
8122
8155
|
"getrandom 0.4.1",
|
|
8123
8156
|
"hmac",
|
|
8124
8157
|
"indexmap",
|
|
8125
|
-
"lzma-rust2 0.16.
|
|
8158
|
+
"lzma-rust2 0.16.2",
|
|
8126
8159
|
"memchr",
|
|
8127
8160
|
"pbkdf2",
|
|
8128
8161
|
"ppmd-rust",
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.3.7"
|
|
5
|
+
version = "4.3.8"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -26,7 +26,7 @@ image = { version = "0.25.9", default-features = false }
|
|
|
26
26
|
js-sys = "0.3"
|
|
27
27
|
libc = "0.2.182"
|
|
28
28
|
log = "0.4"
|
|
29
|
-
lzma-rust2 = { version = "0.
|
|
29
|
+
lzma-rust2 = { version = "0.16.2" }
|
|
30
30
|
num_cpus = "1.17.0"
|
|
31
31
|
once_cell = "1.21.3"
|
|
32
32
|
parking_lot = "0.12.5"
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.3.7"
|
|
3
|
+
version = "4.3.8"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -51,6 +51,7 @@ email = ["dep:mail-parser", "dep:cfb"]
|
|
|
51
51
|
html = ["dep:html-to-markdown-rs"]
|
|
52
52
|
xml = ["dep:quick-xml", "dep:roxmltree"]
|
|
53
53
|
archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:lzma-rust2", "dep:flate2"]
|
|
54
|
+
mdx = ["dep:pulldown-cmark"]
|
|
54
55
|
|
|
55
56
|
ocr = [
|
|
56
57
|
"dep:kreuzberg-tesseract",
|
|
@@ -90,7 +91,17 @@ mcp-http = ["mcp", "api"]
|
|
|
90
91
|
|
|
91
92
|
otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:tracing-opentelemetry"]
|
|
92
93
|
|
|
93
|
-
wasm-target = [
|
|
94
|
+
wasm-target = [
|
|
95
|
+
"pdf",
|
|
96
|
+
"html",
|
|
97
|
+
"xml",
|
|
98
|
+
"email",
|
|
99
|
+
"language-detection",
|
|
100
|
+
"chunking",
|
|
101
|
+
"quality",
|
|
102
|
+
"office",
|
|
103
|
+
"mdx",
|
|
104
|
+
]
|
|
94
105
|
wasm-threads = ["dep:wasm-bindgen-rayon"]
|
|
95
106
|
|
|
96
107
|
full = [
|
|
@@ -108,6 +119,7 @@ full = [
|
|
|
108
119
|
"embeddings",
|
|
109
120
|
"quality",
|
|
110
121
|
"keywords",
|
|
122
|
+
"mdx",
|
|
111
123
|
"api",
|
|
112
124
|
"mcp",
|
|
113
125
|
"otel",
|
|
@@ -124,6 +136,7 @@ cli = [
|
|
|
124
136
|
"language-detection",
|
|
125
137
|
"chunking",
|
|
126
138
|
"quality",
|
|
139
|
+
"mdx",
|
|
127
140
|
]
|
|
128
141
|
|
|
129
142
|
[build-dependencies]
|
|
@@ -174,10 +187,10 @@ mail-parser = { version = "0.11.2", optional = true }
|
|
|
174
187
|
html-to-markdown-rs = { version = "2.25.1", default-features = false , features = [
|
|
175
188
|
"inline-images", "metadata", ], optional = true }
|
|
176
189
|
cfb = { version = "0.14.0", optional = true }
|
|
177
|
-
quick-xml = { version = "0.39.
|
|
190
|
+
quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
|
|
178
191
|
tar = { version = "0.4.44", optional = true }
|
|
179
192
|
sevenz-rust2 = { version = "0.20.1", optional = true }
|
|
180
|
-
lzma-rust2 = { version = "0.
|
|
193
|
+
lzma-rust2 = { version = "0.16.2" , optional = true }
|
|
181
194
|
flate2 = { version = "1.1", optional = true }
|
|
182
195
|
|
|
183
196
|
pulldown-cmark = { version = "0.13", optional = true }
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.3.7 Release**
|
|
20
|
+
> **🚀 Version 4.3.8 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
data/vendor/kreuzberg/src/api/handlers.rs
CHANGED
|
@@ -193,6 +193,24 @@ pub async fn extract_handler(
|
|
|
193
193
|
Ok(Json(results))
|
|
194
194
|
}
|
|
195
195
|
|
|
196
|
+
/// Formats endpoint handler.
|
|
197
|
+
///
|
|
198
|
+
/// GET /formats
|
|
199
|
+
///
|
|
200
|
+
/// Returns all supported file extensions and their corresponding MIME types.
|
|
201
|
+
#[utoipa::path(
|
|
202
|
+
get,
|
|
203
|
+
path = "/formats",
|
|
204
|
+
tag = "health",
|
|
205
|
+
responses(
|
|
206
|
+
(status = 200, description = "Supported formats", body = Vec<crate::SupportedFormat>),
|
|
207
|
+
)
|
|
208
|
+
)]
|
|
209
|
+
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.formats"))]
|
|
210
|
+
pub async fn formats_handler() -> Json<Vec<crate::SupportedFormat>> {
|
|
211
|
+
Json(crate::list_supported_formats())
|
|
212
|
+
}
|
|
213
|
+
|
|
196
214
|
/// Cache stats endpoint handler.
|
|
197
215
|
///
|
|
198
216
|
/// GET /cache/stats
|
|
data/vendor/kreuzberg/src/api/openapi.rs
CHANGED
|
@@ -33,6 +33,7 @@ use utoipa::OpenApi;
|
|
|
33
33
|
crate::api::handlers::health_handler,
|
|
34
34
|
crate::api::handlers::info_handler,
|
|
35
35
|
crate::api::handlers::extract_handler,
|
|
36
|
+
crate::api::handlers::formats_handler,
|
|
36
37
|
crate::api::handlers::cache_stats_handler,
|
|
37
38
|
crate::api::handlers::cache_clear_handler,
|
|
38
39
|
crate::api::handlers::embed_handler,
|
|
@@ -53,6 +54,7 @@ use utoipa::OpenApi;
|
|
|
53
54
|
crate::api::types::ChunkItem,
|
|
54
55
|
crate::api::types::ChunkingConfigRequest,
|
|
55
56
|
crate::api::types::ChunkingConfigResponse,
|
|
57
|
+
crate::core::mime::SupportedFormat,
|
|
56
58
|
crate::types::extraction::ExtractionResult,
|
|
57
59
|
crate::types::extraction::Chunk,
|
|
58
60
|
crate::types::extraction::ChunkMetadata,
|
|
data/vendor/kreuzberg/src/api/router.rs
CHANGED
|
@@ -17,8 +17,8 @@ use crate::{ExtractionConfig, core::ServerConfig};
|
|
|
17
17
|
|
|
18
18
|
use super::{
|
|
19
19
|
handlers::{
|
|
20
|
-
cache_clear_handler, cache_stats_handler, chunk_handler, embed_handler, extract_handler,
|
|
21
|
-
info_handler,
|
|
20
|
+
cache_clear_handler, cache_stats_handler, chunk_handler, embed_handler, extract_handler, formats_handler,
|
|
21
|
+
health_handler, info_handler,
|
|
22
22
|
},
|
|
23
23
|
types::{ApiSizeLimits, ApiState},
|
|
24
24
|
};
|
|
@@ -157,6 +157,7 @@ pub fn create_router_with_limits_and_server_config(
|
|
|
157
157
|
.route("/extract", post(extract_handler))
|
|
158
158
|
.route("/embed", post(embed_handler))
|
|
159
159
|
.route("/chunk", post(chunk_handler))
|
|
160
|
+
.route("/formats", get(formats_handler))
|
|
160
161
|
.route("/health", get(health_handler))
|
|
161
162
|
.route("/info", get(info_handler))
|
|
162
163
|
.route("/cache/stats", get(cache_stats_handler))
|