kreuzberg 4.3.7 → 4.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +4 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +72 -39
  5. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  6. data/lib/kreuzberg/version.rb +1 -1
  7. data/vendor/Cargo.toml +2 -2
  8. data/vendor/kreuzberg/Cargo.toml +17 -4
  9. data/vendor/kreuzberg/README.md +1 -1
  10. data/vendor/kreuzberg/src/api/handlers.rs +18 -0
  11. data/vendor/kreuzberg/src/api/openapi.rs +2 -0
  12. data/vendor/kreuzberg/src/api/router.rs +3 -2
  13. data/vendor/kreuzberg/src/core/mime.rs +536 -208
  14. data/vendor/kreuzberg/src/extractors/mdx.rs +945 -0
  15. data/vendor/kreuzberg/src/extractors/mod.rs +15 -0
  16. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +14 -1
  17. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +4 -2
  18. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +12 -1
  19. data/vendor/kreuzberg/src/lib.rs +3 -2
  20. data/vendor/kreuzberg/src/mcp/server.rs +20 -2
  21. data/vendor/kreuzberg/src/pdf/markdown/assembly.rs +81 -9
  22. data/vendor/kreuzberg/src/pdf/markdown/bridge.rs +269 -20
  23. data/vendor/kreuzberg/src/pdf/markdown/classify.rs +94 -0
  24. data/vendor/kreuzberg/src/pdf/markdown/constants.rs +6 -0
  25. data/vendor/kreuzberg/src/pdf/markdown/pipeline.rs +470 -6
  26. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +1 -0
  27. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +3 -0
  28. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  29. data/vendor/kreuzberg/src/plugins/ocr.rs +3 -0
  30. data/vendor/kreuzberg/tests/page_markers.rs +106 -0
  31. data/vendor/kreuzberg/tests/pdf_markdown_extraction.rs +2 -2
  32. data/vendor/kreuzberg/tests/pdf_markdown_regression.rs +726 -0
  33. data/vendor/kreuzberg-ffi/Cargo.toml +1 -1
  34. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  35. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  36. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  37. metadata +4 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1c9f9cd70dd541fd9c193c1ab60782eb04e287019f707eeb0d6eb853f64fc039
4
- data.tar.gz: ce8345437d8e47062a21799b0352ceb2d5917a0546421b39a928fe1b659dc28d
3
+ metadata.gz: 32a54c81c378239aaf604785d6475aa99a97b42a59dbe3e4fed156df0109ad28
4
+ data.tar.gz: dd0a6158f4a5c112faeb7e1bae603d3f7f4da5c21c2102105a40864154c35448
5
5
  SHA512:
6
- metadata.gz: 012fac8575af2561d8b114649ef370ee281e90629a3ac95abad398dc3371853109e76e19b4aca9aeeba315853cae72f2ff9788681bd7b1bcfa3cb8245dbaa939
7
- data.tar.gz: fce6aa4f32a4651821d20ba61c77d6b61fc1038492e4b7d960c585ce2c131dd228430835fcb35f945998c64ae3faa71655ab5748b6243e2c70453e9505a442a0
6
+ metadata.gz: 073f79596f0c96b783645800f152c84ecbae090bec967019d841fc155875275a6db5c1e8007e8446091ed08d6372d222881ffd7b460b2a91b3d556481acb9a63
7
+ data.tar.gz: 9c0422293b1bd07c79227056159f574106fbb1f6306549fd267ff3eb892332ebdddd5762c15ee8aec75c4837095fe2d6ee392965662fc00cad573a8eb962b71e
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.3.7)
4
+ kreuzberg (4.3.8)
5
5
  rb_sys (~> 0.9.119)
6
6
  sorbet-runtime (~> 0.5)
7
7
 
@@ -124,7 +124,7 @@ GEM
124
124
  rubocop (~> 1.81)
125
125
  ruby-progressbar (1.13.0)
126
126
  securerandom (0.4.1)
127
- sorbet-runtime (0.6.12956)
127
+ sorbet-runtime (0.6.12957)
128
128
  steep (1.10.0)
129
129
  activesupport (>= 5.1)
130
130
  concurrent-ruby (>= 1.1.10)
@@ -210,7 +210,7 @@ CHECKSUMS
210
210
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
211
211
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
212
212
  json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
213
- kreuzberg (4.3.7)
213
+ kreuzberg (4.3.8)
214
214
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
215
215
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
216
216
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
@@ -245,7 +245,7 @@ CHECKSUMS
245
245
  rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
246
246
  ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
247
247
  securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
248
- sorbet-runtime (0.6.12956) sha256=fee716a62d0b1d94ebc8e6ba23e76a7654eeac66c1f5cc1e1bef78b8e9ff87c7
248
+ sorbet-runtime (0.6.12957) sha256=691ccafeae2663236777ffc8a348266907fca3985b089f013d7ccd59bef19056
249
249
  steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
250
250
  strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
251
251
  terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.7" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.8" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -470,9 +470,9 @@ dependencies = [
470
470
 
471
471
  [[package]]
472
472
  name = "biblib"
473
- version = "0.3.2"
473
+ version = "0.4.2"
474
474
  source = "registry+https://github.com/rust-lang/crates.io-index"
475
- checksum = "877d134cd77e9e3c8b90e848d5e28e3126aa8db8d66a6f97e7af89c468f631a9"
475
+ checksum = "f289d61c020b03326674c9a13caf0b9a2744465510df96ff97979083a28517c6"
476
476
  dependencies = [
477
477
  "compact_str",
478
478
  "either",
@@ -976,6 +976,19 @@ dependencies = [
976
976
  "windows-sys 0.59.0",
977
977
  ]
978
978
 
979
+ [[package]]
980
+ name = "console"
981
+ version = "0.16.2"
982
+ source = "registry+https://github.com/rust-lang/crates.io-index"
983
+ checksum = "03e45a4a8926227e4197636ba97a9fc9b00477e9f4bd711395687c5f0734bec4"
984
+ dependencies = [
985
+ "encode_unicode",
986
+ "libc",
987
+ "once_cell",
988
+ "unicode-width",
989
+ "windows-sys 0.61.2",
990
+ ]
991
+
979
992
  [[package]]
980
993
  name = "console_error_panic_hook"
981
994
  version = "0.1.7"
@@ -1119,21 +1132,6 @@ dependencies = [
1119
1132
  "libc",
1120
1133
  ]
1121
1134
 
1122
- [[package]]
1123
- name = "crc"
1124
- version = "3.3.0"
1125
- source = "registry+https://github.com/rust-lang/crates.io-index"
1126
- checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675"
1127
- dependencies = [
1128
- "crc-catalog",
1129
- ]
1130
-
1131
- [[package]]
1132
- name = "crc-catalog"
1133
- version = "2.4.0"
1134
- source = "registry+https://github.com/rust-lang/crates.io-index"
1135
- checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5"
1136
-
1137
1135
  [[package]]
1138
1136
  name = "crc32fast"
1139
1137
  version = "1.5.0"
@@ -1657,7 +1655,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
1657
1655
  checksum = "b4339d45a80579ab8305616a501eacdbf18fb0f7def7fa6e4c0b75941416d5b0"
1658
1656
  dependencies = [
1659
1657
  "anyhow",
1660
- "hf-hub",
1658
+ "hf-hub 0.4.3",
1661
1659
  "ndarray",
1662
1660
  "ort",
1663
1661
  "safetensors",
@@ -2237,7 +2235,7 @@ checksum = "629d8f3bbeda9d148036d6b0de0a3ab947abd08ce90626327fc3547a49d59d97"
2237
2235
  dependencies = [
2238
2236
  "dirs",
2239
2237
  "http",
2240
- "indicatif",
2238
+ "indicatif 0.17.11",
2241
2239
  "libc",
2242
2240
  "log",
2243
2241
  "native-tls",
@@ -2250,6 +2248,25 @@ dependencies = [
2250
2248
  "windows-sys 0.60.2",
2251
2249
  ]
2252
2250
 
2251
+ [[package]]
2252
+ name = "hf-hub"
2253
+ version = "0.5.0"
2254
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2255
+ checksum = "aef3982638978efa195ff11b305f51f1f22f4f0a6cabee7af79b383ebee6a213"
2256
+ dependencies = [
2257
+ "dirs",
2258
+ "http",
2259
+ "indicatif 0.18.4",
2260
+ "libc",
2261
+ "log",
2262
+ "rand 0.9.2",
2263
+ "serde",
2264
+ "serde_json",
2265
+ "thiserror 2.0.18",
2266
+ "ureq 3.2.0",
2267
+ "windows-sys 0.61.2",
2268
+ ]
2269
+
2253
2270
  [[package]]
2254
2271
  name = "hmac"
2255
2272
  version = "0.12.1"
@@ -2709,13 +2726,26 @@ version = "0.17.11"
2709
2726
  source = "registry+https://github.com/rust-lang/crates.io-index"
2710
2727
  checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235"
2711
2728
  dependencies = [
2712
- "console",
2729
+ "console 0.15.11",
2713
2730
  "number_prefix",
2714
2731
  "portable-atomic",
2715
2732
  "unicode-width",
2716
2733
  "web-time",
2717
2734
  ]
2718
2735
 
2736
+ [[package]]
2737
+ name = "indicatif"
2738
+ version = "0.18.4"
2739
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2740
+ checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb"
2741
+ dependencies = [
2742
+ "console 0.16.2",
2743
+ "portable-atomic",
2744
+ "unicode-width",
2745
+ "unit-prefix",
2746
+ "web-time",
2747
+ ]
2748
+
2719
2749
  [[package]]
2720
2750
  name = "infer"
2721
2751
  version = "0.19.0"
@@ -2901,7 +2931,7 @@ dependencies = [
2901
2931
 
2902
2932
  [[package]]
2903
2933
  name = "kreuzberg"
2904
- version = "4.3.6"
2934
+ version = "4.3.8"
2905
2935
  dependencies = [
2906
2936
  "ahash",
2907
2937
  "async-trait",
@@ -2926,7 +2956,7 @@ dependencies = [
2926
2956
  "hayro-jbig2",
2927
2957
  "hayro-jpeg2000",
2928
2958
  "hex",
2929
- "hf-hub",
2959
+ "hf-hub 0.5.0",
2930
2960
  "html-to-markdown-rs",
2931
2961
  "image",
2932
2962
  "indexmap",
@@ -2939,7 +2969,7 @@ dependencies = [
2939
2969
  "lazy_static",
2940
2970
  "libc",
2941
2971
  "lopdf",
2942
- "lzma-rust2 0.15.7",
2972
+ "lzma-rust2 0.16.2",
2943
2973
  "mail-parser",
2944
2974
  "memchr",
2945
2975
  "mime_guess",
@@ -2954,7 +2984,7 @@ dependencies = [
2954
2984
  "pkg-config",
2955
2985
  "polars",
2956
2986
  "pulldown-cmark",
2957
- "quick-xml 0.39.1",
2987
+ "quick-xml 0.39.2",
2958
2988
  "rake",
2959
2989
  "rayon",
2960
2990
  "regex",
@@ -2993,7 +3023,7 @@ dependencies = [
2993
3023
 
2994
3024
  [[package]]
2995
3025
  name = "kreuzberg-ffi"
2996
- version = "4.3.6"
3026
+ version = "4.3.8"
2997
3027
  dependencies = [
2998
3028
  "ahash",
2999
3029
  "async-trait",
@@ -3009,7 +3039,7 @@ dependencies = [
3009
3039
 
3010
3040
  [[package]]
3011
3041
  name = "kreuzberg-paddle-ocr"
3012
- version = "4.3.6"
3042
+ version = "4.3.8"
3013
3043
  dependencies = [
3014
3044
  "geo-clipper",
3015
3045
  "geo-types",
@@ -3024,7 +3054,7 @@ dependencies = [
3024
3054
 
3025
3055
  [[package]]
3026
3056
  name = "kreuzberg-pdfium-render"
3027
- version = "4.3.6"
3057
+ version = "4.3.8"
3028
3058
  dependencies = [
3029
3059
  "bitflags",
3030
3060
  "bytemuck",
@@ -3048,7 +3078,7 @@ dependencies = [
3048
3078
 
3049
3079
  [[package]]
3050
3080
  name = "kreuzberg-rb"
3051
- version = "4.3.6"
3081
+ version = "4.3.8"
3052
3082
  dependencies = [
3053
3083
  "async-trait",
3054
3084
  "html-to-markdown-rs",
@@ -3065,7 +3095,7 @@ dependencies = [
3065
3095
 
3066
3096
  [[package]]
3067
3097
  name = "kreuzberg-tesseract"
3068
- version = "4.3.6"
3098
+ version = "4.3.8"
3069
3099
  dependencies = [
3070
3100
  "cc",
3071
3101
  "cmake",
@@ -3295,18 +3325,13 @@ name = "lzma-rust2"
3295
3325
  version = "0.15.7"
3296
3326
  source = "registry+https://github.com/rust-lang/crates.io-index"
3297
3327
  checksum = "1670343e58806300d87950e3401e820b519b9384281bbabfb15e3636689ffd69"
3298
- dependencies = [
3299
- "crc",
3300
- "sha2",
3301
- ]
3302
3328
 
3303
3329
  [[package]]
3304
3330
  name = "lzma-rust2"
3305
- version = "0.16.1"
3331
+ version = "0.16.2"
3306
3332
  source = "registry+https://github.com/rust-lang/crates.io-index"
3307
- checksum = "d673a11333485e7d8b93d62a9a5b07b22daf5e8a8655a44c1bb18aa4bf3d1524"
3333
+ checksum = "47bb1e988e6fb779cf720ad431242d3f03167c1b3f2b1aae7f1a94b2495b36ae"
3308
3334
  dependencies = [
3309
- "crc",
3310
3335
  "sha2",
3311
3336
  ]
3312
3337
 
@@ -4891,9 +4916,9 @@ dependencies = [
4891
4916
 
4892
4917
  [[package]]
4893
4918
  name = "quick-xml"
4894
- version = "0.39.1"
4919
+ version = "0.39.2"
4895
4920
  source = "registry+https://github.com/rust-lang/crates.io-index"
4896
- checksum = "bd58c6a1fc307e1092aa0bb23d204ca4d1f021764142cd0424dccc84d2d5d106"
4921
+ checksum = "958f21e8e7ceb5a1aa7fa87fab28e7c75976e0bfe7e23ff069e0a260f894067d"
4897
4922
  dependencies = [
4898
4923
  "memchr",
4899
4924
  "serde",
@@ -7035,6 +7060,12 @@ version = "0.1.1"
7035
7060
  source = "registry+https://github.com/rust-lang/crates.io-index"
7036
7061
  checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
7037
7062
 
7063
+ [[package]]
7064
+ name = "unit-prefix"
7065
+ version = "0.5.2"
7066
+ source = "registry+https://github.com/rust-lang/crates.io-index"
7067
+ checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3"
7068
+
7038
7069
  [[package]]
7039
7070
  name = "unsafe-libyaml"
7040
7071
  version = "0.2.11"
@@ -7088,6 +7119,7 @@ dependencies = [
7088
7119
  "base64 0.22.1",
7089
7120
  "cookie_store",
7090
7121
  "der",
7122
+ "flate2",
7091
7123
  "log",
7092
7124
  "native-tls",
7093
7125
  "percent-encoding",
@@ -7095,6 +7127,7 @@ dependencies = [
7095
7127
  "rustls-pki-types",
7096
7128
  "serde",
7097
7129
  "serde_json",
7130
+ "socks",
7098
7131
  "ureq-proto",
7099
7132
  "utf-8",
7100
7133
  "webpki-root-certs",
@@ -8122,7 +8155,7 @@ dependencies = [
8122
8155
  "getrandom 0.4.1",
8123
8156
  "hmac",
8124
8157
  "indexmap",
8125
- "lzma-rust2 0.16.1",
8158
+ "lzma-rust2 0.16.2",
8126
8159
  "memchr",
8127
8160
  "pbkdf2",
8128
8161
  "ppmd-rust",
@@ -37,7 +37,7 @@ collapsible_if = "allow"
37
37
 
38
38
  [package]
39
39
  name = "kreuzberg-rb"
40
- version = "4.3.7"
40
+ version = "4.3.8"
41
41
  edition = "2024"
42
42
  rust-version = "1.91"
43
43
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.3.7'
4
+ VERSION = '4.3.8'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.3.7"
5
+ version = "4.3.8"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -26,7 +26,7 @@ image = { version = "0.25.9", default-features = false }
26
26
  js-sys = "0.3"
27
27
  libc = "0.2.182"
28
28
  log = "0.4"
29
- lzma-rust2 = { version = "0.15.7" }
29
+ lzma-rust2 = { version = "0.16.2" }
30
30
  num_cpus = "1.17.0"
31
31
  once_cell = "1.21.3"
32
32
  parking_lot = "0.12.5"
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.3.7"
3
+ version = "4.3.8"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -51,6 +51,7 @@ email = ["dep:mail-parser", "dep:cfb"]
51
51
  html = ["dep:html-to-markdown-rs"]
52
52
  xml = ["dep:quick-xml", "dep:roxmltree"]
53
53
  archives = ["dep:zip", "dep:tar", "dep:sevenz-rust2", "dep:lzma-rust2", "dep:flate2"]
54
+ mdx = ["dep:pulldown-cmark"]
54
55
 
55
56
  ocr = [
56
57
  "dep:kreuzberg-tesseract",
@@ -90,7 +91,17 @@ mcp-http = ["mcp", "api"]
90
91
 
91
92
  otel = ["dep:opentelemetry", "dep:opentelemetry_sdk", "dep:tracing-opentelemetry"]
92
93
 
93
- wasm-target = ["pdf", "html", "xml", "email", "language-detection", "chunking", "quality", "office"]
94
+ wasm-target = [
95
+ "pdf",
96
+ "html",
97
+ "xml",
98
+ "email",
99
+ "language-detection",
100
+ "chunking",
101
+ "quality",
102
+ "office",
103
+ "mdx",
104
+ ]
94
105
  wasm-threads = ["dep:wasm-bindgen-rayon"]
95
106
 
96
107
  full = [
@@ -108,6 +119,7 @@ full = [
108
119
  "embeddings",
109
120
  "quality",
110
121
  "keywords",
122
+ "mdx",
111
123
  "api",
112
124
  "mcp",
113
125
  "otel",
@@ -124,6 +136,7 @@ cli = [
124
136
  "language-detection",
125
137
  "chunking",
126
138
  "quality",
139
+ "mdx",
127
140
  ]
128
141
 
129
142
  [build-dependencies]
@@ -174,10 +187,10 @@ mail-parser = { version = "0.11.2", optional = true }
174
187
  html-to-markdown-rs = { version = "2.25.1", default-features = false , features = [
175
188
  "inline-images", "metadata", ], optional = true }
176
189
  cfb = { version = "0.14.0", optional = true }
177
- quick-xml = { version = "0.39.1", features = ["serialize"], optional = true }
190
+ quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
178
191
  tar = { version = "0.4.44", optional = true }
179
192
  sevenz-rust2 = { version = "0.20.1", optional = true }
180
- lzma-rust2 = { version = "0.15.7" , optional = true }
193
+ lzma-rust2 = { version = "0.16.2" , optional = true }
181
194
  flate2 = { version = "1.1", optional = true }
182
195
 
183
196
  pulldown-cmark = { version = "0.13", optional = true }
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
17
17
 
18
18
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
19
19
 
20
- > **🚀 Version 4.3.7 Release**
20
+ > **🚀 Version 4.3.8 Release**
21
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
22
22
  >
23
23
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -193,6 +193,24 @@ pub async fn extract_handler(
193
193
  Ok(Json(results))
194
194
  }
195
195
 
196
+ /// Formats endpoint handler.
197
+ ///
198
+ /// GET /formats
199
+ ///
200
+ /// Returns all supported file extensions and their corresponding MIME types.
201
+ #[utoipa::path(
202
+ get,
203
+ path = "/formats",
204
+ tag = "health",
205
+ responses(
206
+ (status = 200, description = "Supported formats", body = Vec<crate::SupportedFormat>),
207
+ )
208
+ )]
209
+ #[cfg_attr(feature = "otel", tracing::instrument(name = "api.formats"))]
210
+ pub async fn formats_handler() -> Json<Vec<crate::SupportedFormat>> {
211
+ Json(crate::list_supported_formats())
212
+ }
213
+
196
214
  /// Cache stats endpoint handler.
197
215
  ///
198
216
  /// GET /cache/stats
@@ -33,6 +33,7 @@ use utoipa::OpenApi;
33
33
  crate::api::handlers::health_handler,
34
34
  crate::api::handlers::info_handler,
35
35
  crate::api::handlers::extract_handler,
36
+ crate::api::handlers::formats_handler,
36
37
  crate::api::handlers::cache_stats_handler,
37
38
  crate::api::handlers::cache_clear_handler,
38
39
  crate::api::handlers::embed_handler,
@@ -53,6 +54,7 @@ use utoipa::OpenApi;
53
54
  crate::api::types::ChunkItem,
54
55
  crate::api::types::ChunkingConfigRequest,
55
56
  crate::api::types::ChunkingConfigResponse,
57
+ crate::core::mime::SupportedFormat,
56
58
  crate::types::extraction::ExtractionResult,
57
59
  crate::types::extraction::Chunk,
58
60
  crate::types::extraction::ChunkMetadata,
@@ -17,8 +17,8 @@ use crate::{ExtractionConfig, core::ServerConfig};
17
17
 
18
18
  use super::{
19
19
  handlers::{
20
- cache_clear_handler, cache_stats_handler, chunk_handler, embed_handler, extract_handler, health_handler,
21
- info_handler,
20
+ cache_clear_handler, cache_stats_handler, chunk_handler, embed_handler, extract_handler, formats_handler,
21
+ health_handler, info_handler,
22
22
  },
23
23
  types::{ApiSizeLimits, ApiState},
24
24
  };
@@ -157,6 +157,7 @@ pub fn create_router_with_limits_and_server_config(
157
157
  .route("/extract", post(extract_handler))
158
158
  .route("/embed", post(embed_handler))
159
159
  .route("/chunk", post(chunk_handler))
160
+ .route("/formats", get(formats_handler))
160
161
  .route("/health", get(health_handler))
161
162
  .route("/info", get(info_handler))
162
163
  .route("/cache/stats", get(cache_stats_handler))