kreuzberg 4.8.2 → 4.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/ext/kreuzberg_rb/native/Cargo.lock +75 -62
  4. data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
  5. data/ext/kreuzberg_rb/native/src/config/types.rs +1 -0
  6. data/lib/kreuzberg/version.rb +1 -1
  7. data/lib/kreuzberg.rb +1 -5
  8. data/vendor/Cargo.toml +5 -5
  9. data/vendor/kreuzberg/Cargo.toml +5 -5
  10. data/vendor/kreuzberg/README.md +1 -1
  11. data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
  12. data/vendor/kreuzberg/src/core/config/pdf.rs +23 -0
  13. data/vendor/kreuzberg/src/doc_orientation.rs +1 -1
  14. data/vendor/kreuzberg/src/embeddings/mod.rs +15 -1
  15. data/vendor/kreuzberg/src/extraction/derive.rs +15 -3
  16. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +18 -1
  17. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +140 -0
  18. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +349 -0
  19. data/vendor/kreuzberg/src/layout/models/slanet.rs +1 -1
  20. data/vendor/kreuzberg/src/layout/models/table_classifier.rs +1 -1
  21. data/vendor/kreuzberg/src/layout/models/tatr.rs +1 -1
  22. data/vendor/kreuzberg/src/layout/session.rs +1 -1
  23. data/vendor/kreuzberg/src/lib.rs +1 -1
  24. data/vendor/kreuzberg/src/pdf/hierarchy/extraction.rs +4 -0
  25. data/vendor/kreuzberg/src/pdf/mod.rs +2 -0
  26. data/vendor/kreuzberg/src/pdf/oxide/annotations.rs +258 -0
  27. data/vendor/kreuzberg/src/pdf/oxide/hierarchy.rs +235 -0
  28. data/vendor/kreuzberg/src/pdf/oxide/images.rs +53 -0
  29. data/vendor/kreuzberg/src/pdf/oxide/metadata.rs +381 -0
  30. data/vendor/kreuzberg/src/pdf/oxide/mod.rs +43 -0
  31. data/vendor/kreuzberg/src/pdf/oxide/table.rs +243 -0
  32. data/vendor/kreuzberg/src/pdf/oxide/text.rs +249 -0
  33. data/vendor/kreuzberg/src/pdf/oxide_text.rs +8 -6
  34. data/vendor/kreuzberg/src/pdf/structure/adapters.rs +1 -0
  35. data/vendor/kreuzberg/src/pdf/structure/assembly.rs +1 -0
  36. data/vendor/kreuzberg/src/pdf/structure/bridge.rs +51 -0
  37. data/vendor/kreuzberg/src/pdf/structure/classify.rs +3 -0
  38. data/vendor/kreuzberg/src/pdf/structure/content_convert.rs +3 -0
  39. data/vendor/kreuzberg/src/pdf/structure/layout_classify.rs +1 -0
  40. data/vendor/kreuzberg/src/pdf/structure/mod.rs +2 -0
  41. data/vendor/kreuzberg/src/pdf/structure/paragraphs.rs +2 -0
  42. data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +240 -1
  43. data/vendor/kreuzberg/src/pdf/table_reconstruct.rs +1 -0
  44. data/vendor/kreuzberg/src/pdf/text.rs +1 -1
  45. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +5 -1
  46. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +2 -1
  47. data/vendor/kreuzberg-ffi/Cargo.toml +4 -10
  48. data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
  49. data/vendor/kreuzberg-ffi/src/config/merge.rs +0 -3
  50. data/vendor/kreuzberg-ffi/src/config_builder.rs +0 -6
  51. data/vendor/kreuzberg-ffi/src/lib.rs +0 -1
  52. data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
  53. data/vendor/kreuzberg-paddle-ocr/src/base_net.rs +1 -3
  54. data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
  55. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  56. metadata +9 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4ec647109dee8229fd50ce9fe4d87f13bbb769779b22b4862cac321052610d6c
4
- data.tar.gz: 18c3cf6df8e339f4286da7fe9b8da84185fa93abf89fcbf339b259c0095a9a5c
3
+ metadata.gz: 8b636a0d3207a655747e4b288b0f66b616b460b282f49d2b4947ec23ac319cfc
4
+ data.tar.gz: ef5522acd061db0093d68afb96d440888f605aefaa6e0cc262f248e2e82e62cb
5
5
  SHA512:
6
- metadata.gz: f8c0ab16048bdb9026b55ff15f3ae342af9178a2ab6b1fd85777c3271da35e46474bee025af429745d20584604d520d85a4c8e4ea96bffc49ef5dfca55471b6f
7
- data.tar.gz: 46ec0ff10138bd48d7b9ffada23956cd557616d8fdf09090f16562d4daab04381176b2881e08bfdfc1f726a0061e1e25feefa695620fef530444662abb866605
6
+ metadata.gz: 89b72104facd4e87ecc5c65011305abdcd77fd829073dcd235829d67297ad17b09136bb9c2df0dd3dcf7e79ba2f86bba6ceb69fdbf22b783725555b4efa375b3
7
+ data.tar.gz: c82c2c92b595613497385af930f497762d56831818cda45d230044a61fbbe5730da4d7781cb956fa6096b53e364dfff9772d3517fe2cc879785d53659a45d5d0
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.2" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.8.3" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -699,9 +699,9 @@ dependencies = [
699
699
 
700
700
  [[package]]
701
701
  name = "cc"
702
- version = "1.2.59"
702
+ version = "1.2.60"
703
703
  source = "registry+https://github.com/rust-lang/crates.io-index"
704
- checksum = "b7a4d3ec6524d28a329fc53654bbadc9bdd7b0431f5d65f1a56ffb28a1ee5283"
704
+ checksum = "43c5703da9466b66a946814e1adf53ea2c90f10063b86290cc9eb67ce3478a20"
705
705
  dependencies = [
706
706
  "find-msvc-tools",
707
707
  "jobserver",
@@ -1162,19 +1162,20 @@ dependencies = [
1162
1162
 
1163
1163
  [[package]]
1164
1164
  name = "ctor"
1165
- version = "0.8.0"
1165
+ version = "0.9.1"
1166
1166
  source = "registry+https://github.com/rust-lang/crates.io-index"
1167
- checksum = "352d39c2f7bef1d6ad73db6f5160efcaed66d94ef8c6c573a8410c00bf909a98"
1167
+ checksum = "c1c888a2a4f677017373fb6c01e13e318dd9e78758445ed5eb985e355d3f8281"
1168
1168
  dependencies = [
1169
1169
  "ctor-proc-macro",
1170
1170
  "dtor",
1171
+ "link-section",
1171
1172
  ]
1172
1173
 
1173
1174
  [[package]]
1174
1175
  name = "ctor-proc-macro"
1175
- version = "0.0.7"
1176
+ version = "0.0.12"
1176
1177
  source = "registry+https://github.com/rust-lang/crates.io-index"
1177
- checksum = "52560adf09603e58c9a7ee1fe1dcb95a16927b17c127f0ac02d6e768a0e25bc1"
1178
+ checksum = "a7ab264ea985f1bd27887d7b21ea2bb046728e05d11909ca138d700c494730db"
1178
1179
 
1179
1180
  [[package]]
1180
1181
  name = "darling"
@@ -1416,18 +1417,18 @@ dependencies = [
1416
1417
 
1417
1418
  [[package]]
1418
1419
  name = "dtor"
1419
- version = "0.3.0"
1420
+ version = "0.6.0"
1420
1421
  source = "registry+https://github.com/rust-lang/crates.io-index"
1421
- checksum = "f1057d6c64987086ff8ed0fd3fbf377a6b7d205cc7715868cd401705f715cbe4"
1422
+ checksum = "30e4690622ab6700ced40fc370a3f07b7d111f0154bb6fb08f73b4c8834f75b6"
1422
1423
  dependencies = [
1423
1424
  "dtor-proc-macro",
1424
1425
  ]
1425
1426
 
1426
1427
  [[package]]
1427
1428
  name = "dtor-proc-macro"
1428
- version = "0.0.6"
1429
+ version = "0.0.12"
1429
1430
  source = "registry+https://github.com/rust-lang/crates.io-index"
1430
- checksum = "f678cf4a922c215c63e0de95eb1ff08a958a81d47e485cf9da1e27bf6305cfa5"
1431
+ checksum = "8c98b077c7463d01d22dde8a24378ddf1ca7263dc687cffbed38819ea6c21131"
1431
1432
 
1432
1433
  [[package]]
1433
1434
  name = "dunce"
@@ -1868,9 +1869,9 @@ dependencies = [
1868
1869
 
1869
1870
  [[package]]
1870
1871
  name = "gif"
1871
- version = "0.14.1"
1872
+ version = "0.14.2"
1872
1873
  source = "registry+https://github.com/rust-lang/crates.io-index"
1873
- checksum = "f5df2ba84018d80c213569363bdcd0c64e6933c67fe4c1d60ecf822971a3c35e"
1874
+ checksum = "ee8cfcc411d9adbbaba82fb72661cc1bcca13e8bba98b364e62b2dba8f960159"
1874
1875
  dependencies = [
1875
1876
  "color_quant",
1876
1877
  "weezl",
@@ -2029,6 +2030,12 @@ dependencies = [
2029
2030
  "foldhash 0.2.0",
2030
2031
  ]
2031
2032
 
2033
+ [[package]]
2034
+ name = "hashbrown"
2035
+ version = "0.17.0"
2036
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2037
+ checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51"
2038
+
2032
2039
  [[package]]
2033
2040
  name = "hashify"
2034
2041
  version = "0.2.9"
@@ -2094,7 +2101,7 @@ dependencies = [
2094
2101
  "indicatif 0.17.11",
2095
2102
  "libc",
2096
2103
  "log",
2097
- "rand 0.9.2",
2104
+ "rand 0.9.3",
2098
2105
  "serde",
2099
2106
  "serde_json",
2100
2107
  "thiserror 2.0.18",
@@ -2113,7 +2120,7 @@ dependencies = [
2113
2120
  "indicatif 0.18.4",
2114
2121
  "libc",
2115
2122
  "log",
2116
- "rand 0.9.2",
2123
+ "rand 0.9.3",
2117
2124
  "serde",
2118
2125
  "serde_json",
2119
2126
  "thiserror 2.0.18",
@@ -2526,7 +2533,7 @@ dependencies = [
2526
2533
  "itertools 0.14.0",
2527
2534
  "nalgebra",
2528
2535
  "num",
2529
- "rand 0.9.2",
2536
+ "rand 0.9.3",
2530
2537
  "rand_distr",
2531
2538
  ]
2532
2539
 
@@ -2538,12 +2545,12 @@ checksum = "e7c5cedc30da3a610cac6b4ba17597bdf7152cf974e8aab3afb3d54455e371c8"
2538
2545
 
2539
2546
  [[package]]
2540
2547
  name = "indexmap"
2541
- version = "2.13.1"
2548
+ version = "2.14.0"
2542
2549
  source = "registry+https://github.com/rust-lang/crates.io-index"
2543
- checksum = "45a8a2b9cb3e0b0c1803dbb0758ffac5de2f425b23c28f518faabd9d805342ff"
2550
+ checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
2544
2551
  dependencies = [
2545
2552
  "equivalent",
2546
- "hashbrown 0.16.1",
2553
+ "hashbrown 0.17.0",
2547
2554
  "serde",
2548
2555
  "serde_core",
2549
2556
  ]
@@ -2759,9 +2766,9 @@ checksum = "086b08ec7a274cd60cd575ed3651ba081ee72dec0d39a6210e8adcff9efe3880"
2759
2766
 
2760
2767
  [[package]]
2761
2768
  name = "js-sys"
2762
- version = "0.3.94"
2769
+ version = "0.3.95"
2763
2770
  source = "registry+https://github.com/rust-lang/crates.io-index"
2764
- checksum = "2e04e2ef80ce82e13552136fabeef8a5ed1f985a96805761cbb9a2c34e7664d9"
2771
+ checksum = "2964e92d1d9dc3364cae4d718d93f227e3abb088e747d92e0395bfdedf1c12ca"
2765
2772
  dependencies = [
2766
2773
  "cfg-if",
2767
2774
  "futures-util",
@@ -2780,7 +2787,7 @@ dependencies = [
2780
2787
 
2781
2788
  [[package]]
2782
2789
  name = "kreuzberg"
2783
- version = "4.8.1"
2790
+ version = "4.8.3"
2784
2791
  dependencies = [
2785
2792
  "ahash",
2786
2793
  "async-trait",
@@ -2875,7 +2882,7 @@ dependencies = [
2875
2882
 
2876
2883
  [[package]]
2877
2884
  name = "kreuzberg-ffi"
2878
- version = "4.8.1"
2885
+ version = "4.8.3"
2879
2886
  dependencies = [
2880
2887
  "ahash",
2881
2888
  "async-trait",
@@ -2891,7 +2898,7 @@ dependencies = [
2891
2898
 
2892
2899
  [[package]]
2893
2900
  name = "kreuzberg-paddle-ocr"
2894
- version = "4.8.1"
2901
+ version = "4.8.3"
2895
2902
  dependencies = [
2896
2903
  "geo-clipper",
2897
2904
  "geo-types",
@@ -2905,7 +2912,7 @@ dependencies = [
2905
2912
 
2906
2913
  [[package]]
2907
2914
  name = "kreuzberg-pdfium-render"
2908
- version = "4.8.1"
2915
+ version = "4.8.3"
2909
2916
  dependencies = [
2910
2917
  "bitflags",
2911
2918
  "bytemuck",
@@ -2928,7 +2935,7 @@ dependencies = [
2928
2935
 
2929
2936
  [[package]]
2930
2937
  name = "kreuzberg-rb"
2931
- version = "4.8.1"
2938
+ version = "4.8.3"
2932
2939
  dependencies = [
2933
2940
  "async-trait",
2934
2941
  "html-to-markdown-rs",
@@ -2945,7 +2952,7 @@ dependencies = [
2945
2952
 
2946
2953
  [[package]]
2947
2954
  name = "kreuzberg-tesseract"
2948
- version = "4.8.1"
2955
+ version = "4.8.3"
2949
2956
  dependencies = [
2950
2957
  "cc",
2951
2958
  "cmake",
@@ -3022,16 +3029,22 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
3022
3029
 
3023
3030
  [[package]]
3024
3031
  name = "libredox"
3025
- version = "0.1.15"
3032
+ version = "0.1.16"
3026
3033
  source = "registry+https://github.com/rust-lang/crates.io-index"
3027
- checksum = "7ddbf48fd451246b1f8c2610bd3b4ac0cc6e149d89832867093ab69a17194f08"
3034
+ checksum = "e02f3bb43d335493c96bf3fd3a321600bf6bd07ed34bc64118e9293bdffea46c"
3028
3035
  dependencies = [
3029
3036
  "bitflags",
3030
3037
  "libc",
3031
3038
  "plain",
3032
- "redox_syscall 0.7.3",
3039
+ "redox_syscall 0.7.4",
3033
3040
  ]
3034
3041
 
3042
+ [[package]]
3043
+ name = "link-section"
3044
+ version = "0.0.12"
3045
+ source = "registry+https://github.com/rust-lang/crates.io-index"
3046
+ checksum = "f52437d47b0358721ec869cc7374b2a21f7b2237af9b439c0391341a1fbfbf1b"
3047
+
3035
3048
  [[package]]
3036
3049
  name = "linux-raw-sys"
3037
3050
  version = "0.12.1"
@@ -3116,7 +3129,7 @@ dependencies = [
3116
3129
  "md-5",
3117
3130
  "nom 8.0.0",
3118
3131
  "nom_locate",
3119
- "rand 0.10.0",
3132
+ "rand 0.10.1",
3120
3133
  "rangemap",
3121
3134
  "rayon",
3122
3135
  "sha2 0.10.9",
@@ -3698,7 +3711,7 @@ dependencies = [
3698
3711
  "futures-util",
3699
3712
  "opentelemetry",
3700
3713
  "percent-encoding",
3701
- "rand 0.9.2",
3714
+ "rand 0.9.3",
3702
3715
  "thiserror 2.0.18",
3703
3716
  "tokio",
3704
3717
  "tokio-stream",
@@ -4065,7 +4078,7 @@ dependencies = [
4065
4078
  "bytes",
4066
4079
  "getrandom 0.3.4",
4067
4080
  "lru-slab",
4068
- "rand 0.9.2",
4081
+ "rand 0.9.3",
4069
4082
  "ring",
4070
4083
  "rustc-hash",
4071
4084
  "rustls",
@@ -4131,9 +4144,9 @@ dependencies = [
4131
4144
 
4132
4145
  [[package]]
4133
4146
  name = "rand"
4134
- version = "0.9.2"
4147
+ version = "0.9.3"
4135
4148
  source = "registry+https://github.com/rust-lang/crates.io-index"
4136
- checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
4149
+ checksum = "7ec095654a25171c2124e9e3393a930bddbffdc939556c914957a4c3e0a87166"
4137
4150
  dependencies = [
4138
4151
  "rand_chacha",
4139
4152
  "rand_core 0.9.5",
@@ -4141,9 +4154,9 @@ dependencies = [
4141
4154
 
4142
4155
  [[package]]
4143
4156
  name = "rand"
4144
- version = "0.10.0"
4157
+ version = "0.10.1"
4145
4158
  source = "registry+https://github.com/rust-lang/crates.io-index"
4146
- checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8"
4159
+ checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207"
4147
4160
  dependencies = [
4148
4161
  "chacha20",
4149
4162
  "getrandom 0.4.2",
@@ -4182,7 +4195,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
4182
4195
  checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
4183
4196
  dependencies = [
4184
4197
  "num-traits",
4185
- "rand 0.9.2",
4198
+ "rand 0.9.3",
4186
4199
  ]
4187
4200
 
4188
4201
  [[package]]
@@ -4218,7 +4231,7 @@ dependencies = [
4218
4231
  "num-traits",
4219
4232
  "paste",
4220
4233
  "profiling",
4221
- "rand 0.9.2",
4234
+ "rand 0.9.3",
4222
4235
  "rand_chacha",
4223
4236
  "simd_helpers",
4224
4237
  "thiserror 2.0.18",
@@ -4319,9 +4332,9 @@ dependencies = [
4319
4332
 
4320
4333
  [[package]]
4321
4334
  name = "redox_syscall"
4322
- version = "0.7.3"
4335
+ version = "0.7.4"
4323
4336
  source = "registry+https://github.com/rust-lang/crates.io-index"
4324
- checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16"
4337
+ checksum = "f450ad9c3b1da563fb6948a8e0fb0fb9269711c9c73d9ea1de5058c79c8d643a"
4325
4338
  dependencies = [
4326
4339
  "bitflags",
4327
4340
  ]
@@ -4454,9 +4467,9 @@ dependencies = [
4454
4467
 
4455
4468
  [[package]]
4456
4469
  name = "rmcp"
4457
- version = "1.3.0"
4470
+ version = "1.4.0"
4458
4471
  source = "registry+https://github.com/rust-lang/crates.io-index"
4459
- checksum = "2231b2c085b371c01bc90c0e6c1cab8834711b6394533375bdbf870b0166d419"
4472
+ checksum = "f542f74cf247da16f19bbc87e298cd201e912314f4083e88cdd671f44f5fcb53"
4460
4473
  dependencies = [
4461
4474
  "async-trait",
4462
4475
  "base64 0.22.1",
@@ -4468,7 +4481,7 @@ dependencies = [
4468
4481
  "http-body-util",
4469
4482
  "pastey 0.2.1",
4470
4483
  "pin-project-lite",
4471
- "rand 0.10.0",
4484
+ "rand 0.10.1",
4472
4485
  "rmcp-macros",
4473
4486
  "schemars",
4474
4487
  "serde",
@@ -4485,9 +4498,9 @@ dependencies = [
4485
4498
 
4486
4499
  [[package]]
4487
4500
  name = "rmcp-macros"
4488
- version = "1.3.0"
4501
+ version = "1.4.0"
4489
4502
  source = "registry+https://github.com/rust-lang/crates.io-index"
4490
- checksum = "36ea0e100fadf81be85d7ff70f86cd805c7572601d4ab2946207f36540854b43"
4503
+ checksum = "b2391e4ae47f314e70eaafb6c7bd82e495e770b935448864446302143019151f"
4491
4504
  dependencies = [
4492
4505
  "darling 0.23.0",
4493
4506
  "proc-macro2",
@@ -4616,9 +4629,9 @@ checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f"
4616
4629
 
4617
4630
  [[package]]
4618
4631
  name = "rustls-webpki"
4619
- version = "0.103.10"
4632
+ version = "0.103.11"
4620
4633
  source = "registry+https://github.com/rust-lang/crates.io-index"
4621
- checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef"
4634
+ checksum = "20a6af516fea4b20eccceaf166e8aa666ac996208e8a644ce3ef5aa783bc7cd4"
4622
4635
  dependencies = [
4623
4636
  "aws-lc-rs",
4624
4637
  "ring",
@@ -5032,9 +5045,9 @@ dependencies = [
5032
5045
 
5033
5046
  [[package]]
5034
5047
  name = "sse-stream"
5035
- version = "0.2.1"
5048
+ version = "0.2.2"
5036
5049
  source = "registry+https://github.com/rust-lang/crates.io-index"
5037
- checksum = "eb4dc4d33c68ec1f27d386b5610a351922656e1fdf5c05bbaad930cd1519479a"
5050
+ checksum = "2c5e6deb40826033bd7b11c7ef25ef71193fabd71f680f40dd16538a2704d2f4"
5038
5051
  dependencies = [
5039
5052
  "bytes",
5040
5053
  "futures-util",
@@ -5369,7 +5382,7 @@ dependencies = [
5369
5382
  "monostate",
5370
5383
  "onig",
5371
5384
  "paste",
5372
- "rand 0.9.2",
5385
+ "rand 0.9.3",
5373
5386
  "rayon",
5374
5387
  "rayon-cond",
5375
5388
  "regex",
@@ -6040,9 +6053,9 @@ dependencies = [
6040
6053
 
6041
6054
  [[package]]
6042
6055
  name = "wasm-bindgen"
6043
- version = "0.2.117"
6056
+ version = "0.2.118"
6044
6057
  source = "registry+https://github.com/rust-lang/crates.io-index"
6045
- checksum = "0551fc1bb415591e3372d0bc4780db7e587d84e2a7e79da121051c5c4b89d0b0"
6058
+ checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89"
6046
6059
  dependencies = [
6047
6060
  "cfg-if",
6048
6061
  "once_cell",
@@ -6053,9 +6066,9 @@ dependencies = [
6053
6066
 
6054
6067
  [[package]]
6055
6068
  name = "wasm-bindgen-futures"
6056
- version = "0.4.67"
6069
+ version = "0.4.68"
6057
6070
  source = "registry+https://github.com/rust-lang/crates.io-index"
6058
- checksum = "03623de6905b7206edd0a75f69f747f134b7f0a2323392d664448bf2d3c5d87e"
6071
+ checksum = "f371d383f2fb139252e0bfac3b81b265689bf45b6874af544ffa4c975ac1ebf8"
6059
6072
  dependencies = [
6060
6073
  "js-sys",
6061
6074
  "wasm-bindgen",
@@ -6063,9 +6076,9 @@ dependencies = [
6063
6076
 
6064
6077
  [[package]]
6065
6078
  name = "wasm-bindgen-macro"
6066
- version = "0.2.117"
6079
+ version = "0.2.118"
6067
6080
  source = "registry+https://github.com/rust-lang/crates.io-index"
6068
- checksum = "7fbdf9a35adf44786aecd5ff89b4563a90325f9da0923236f6104e603c7e86be"
6081
+ checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed"
6069
6082
  dependencies = [
6070
6083
  "quote",
6071
6084
  "wasm-bindgen-macro-support",
@@ -6073,9 +6086,9 @@ dependencies = [
6073
6086
 
6074
6087
  [[package]]
6075
6088
  name = "wasm-bindgen-macro-support"
6076
- version = "0.2.117"
6089
+ version = "0.2.118"
6077
6090
  source = "registry+https://github.com/rust-lang/crates.io-index"
6078
- checksum = "dca9693ef2bab6d4e6707234500350d8dad079eb508dca05530c85dc3a529ff2"
6091
+ checksum = "9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904"
6079
6092
  dependencies = [
6080
6093
  "bumpalo",
6081
6094
  "proc-macro2",
@@ -6086,9 +6099,9 @@ dependencies = [
6086
6099
 
6087
6100
  [[package]]
6088
6101
  name = "wasm-bindgen-shared"
6089
- version = "0.2.117"
6102
+ version = "0.2.118"
6090
6103
  source = "registry+https://github.com/rust-lang/crates.io-index"
6091
- checksum = "39129a682a6d2d841b6c429d0c51e5cb0ed1a03829d8b3d1e69a011e62cb3d3b"
6104
+ checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129"
6092
6105
  dependencies = [
6093
6106
  "unicode-ident",
6094
6107
  ]
@@ -6142,9 +6155,9 @@ dependencies = [
6142
6155
 
6143
6156
  [[package]]
6144
6157
  name = "web-sys"
6145
- version = "0.3.94"
6158
+ version = "0.3.95"
6146
6159
  source = "registry+https://github.com/rust-lang/crates.io-index"
6147
- checksum = "cd70027e39b12f0849461e08ffc50b9cd7688d942c1c8e3c7b22273236b4dd0a"
6160
+ checksum = "4f2dfbb17949fa2088e5d39408c48368947b86f7834484e87b73de55bc14d97d"
6148
6161
  dependencies = [
6149
6162
  "js-sys",
6150
6163
  "wasm-bindgen",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.8.2"
3
+ version = "4.8.3"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -334,6 +334,7 @@ pub fn parse_pdf_config(ruby: &Ruby, hash: RHash) -> Result<PdfConfig, Error> {
334
334
  };
335
335
 
336
336
  let config = PdfConfig {
337
+ backend: kreuzberg::PdfBackend::default(),
337
338
  extract_images,
338
339
  passwords,
339
340
  extract_metadata,
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.8.2'
4
+ VERSION = '4.8.3'
5
5
  end
data/lib/kreuzberg.rb CHANGED
@@ -37,15 +37,11 @@ module Kreuzberg
37
37
  autoload :DocumentStructure, 'kreuzberg/types'
38
38
  autoload :PdfAnnotation, 'kreuzberg/types'
39
39
  autoload :PdfAnnotationBoundingBox, 'kreuzberg/types'
40
+ autoload :KeywordAlgorithm, 'kreuzberg/types'
40
41
 
41
42
  ExtractionConfig = Config::Extraction
42
43
  PageConfig = Config::PageConfig
43
44
 
44
- module KeywordAlgorithm
45
- YAKE = :yake
46
- RAKE = :rake
47
- end
48
-
49
45
  @__cache_tracker = { entries: 0, bytes: 0 }
50
46
 
51
47
  class << self
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.8.2"
5
+ version = "4.8.3"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -23,7 +23,7 @@ clap = { version = "4.6", features = ["derive", "color", "suggestions"] }
23
23
  comrak = { version = "0.52", default-features = false }
24
24
  console_error_panic_hook = "0.1"
25
25
  criterion = { version = "0.8", features = ["html_reports"] }
26
- ctor = "0.8"
26
+ ctor = "0.9"
27
27
  dbase = "0.7"
28
28
  futures = "0.3"
29
29
  getrandom = { version = "0.4.2", features = ["wasm_js"] }
@@ -32,8 +32,8 @@ html-to-markdown-rs = { version = "3.1.0", default-features = false }
32
32
  image = { version = "0.25.10", default-features = false }
33
33
  itertools = "0.14"
34
34
  js-sys = "0.3"
35
- kreuzberg = { path = "./crates/kreuzberg", version = "4.8.2", default-features = false }
36
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.8.2" }
35
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.8.3", default-features = false }
36
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.8.3" }
37
37
  lazy_static = "1.5.0"
38
38
  libc = "0.2.184"
39
39
  liter-llm = { version = "1.2", features = ["native-http", "tracing"], default-features = false }
@@ -45,7 +45,7 @@ num_cpus = "1.17.0"
45
45
  once_cell = "1.21.4"
46
46
  ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
47
47
  parking_lot = "0.12.5"
48
- pdf_oxide = { version = "0.3.22", default-features = false }
48
+ pdf_oxide = { version = "0.3.24", default-features = false }
49
49
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
50
50
  rayon = "1.11.0"
51
51
  reqwest = { version = "0.13.2", default-features = false }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.8.2"
3
+ version = "4.8.3"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -285,7 +285,7 @@ image = { version = "0.25.10", default-features = false, features = [
285
285
  "pnm",
286
286
  "rayon",
287
287
  ], optional = true }
288
- indexmap = "2.13.1"
288
+ indexmap = "2.14.0"
289
289
  infer = "0.19.0"
290
290
  jotdown = "0.9"
291
291
  kamadak-exif = { version = "0.6.1", optional = true }
@@ -314,14 +314,14 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
314
314
  outlook-pst = { version = "1.2.0", optional = true }
315
315
  parking_lot = "0.12.5"
316
316
  pastey = "0.2"
317
- pdf_oxide = { version = "0.3.22", default-features = false, optional = true }
317
+ pdf_oxide = { version = "0.3.24", default-features = false, optional = true }
318
318
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
319
319
  pulldown-cmark = { version = "0.13" }
320
320
  quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
321
321
  rake = { version = "0.3.6", optional = true }
322
322
  rayon = "1.11.0"
323
323
  regex = "1.12.3"
324
- rmcp = { version = "1.3.0", features = [
324
+ rmcp = { version = "1.4.0", features = [
325
325
  "server",
326
326
  "macros",
327
327
  "base64",
@@ -405,7 +405,7 @@ criterion = { version = "0.8", features = ["html_reports"] }
405
405
  dotenvy = "0.15"
406
406
  filetime = "0.2"
407
407
  image = { version = "0.25.10", default-features = false, features = ["png"] }
408
- jsonschema = "0.45"
408
+ jsonschema = "0.46"
409
409
  serial_test = "3.4.0"
410
410
  tar = "0.4.45"
411
411
  tempfile = "3.27.0"
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.8.2 Release**
21
+ > **🚀 Version 4.8.3 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -38,7 +38,7 @@ pub use llm::{LlmConfig, StructuredExtractionConfig};
38
38
  pub use ocr::{OcrConfig, OcrPipelineConfig, OcrPipelineStage, OcrQualityThresholds};
39
39
  pub use page::PageConfig;
40
40
  #[cfg(feature = "pdf")]
41
- pub use pdf::{HierarchyConfig, PdfConfig};
41
+ pub use pdf::{HierarchyConfig, PdfBackend, PdfConfig};
42
42
  pub use processing::{
43
43
  ChunkSizing, ChunkerType, ChunkingConfig, EmbeddingConfig, EmbeddingModelType, PostProcessorConfig,
44
44
  };
@@ -5,10 +5,31 @@
5
5
 
6
6
  use serde::{Deserialize, Serialize};
7
7
 
8
+ /// PDF extraction backend selection.
9
+ ///
10
+ /// Controls which PDF library is used for text extraction:
11
+ /// - `Pdfium`: pdfium-render (default, C++ based, mature)
12
+ /// - `PdfOxide`: pdf_oxide (pure Rust, faster, requires `pdf-oxide` feature)
13
+ /// - `Auto`: automatically select based on available features
14
+ #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
15
+ #[serde(rename_all = "lowercase")]
16
+ pub enum PdfBackend {
17
+ /// Use pdfium-render backend (default).
18
+ #[default]
19
+ Pdfium,
20
+ /// Use pdf_oxide backend (pure Rust). Requires `pdf-oxide` feature.
21
+ PdfOxide,
22
+ /// Automatically select the best available backend.
23
+ Auto,
24
+ }
25
+
8
26
  /// PDF-specific configuration.
9
27
  #[cfg(feature = "pdf")]
10
28
  #[derive(Debug, Clone, Serialize, Deserialize)]
11
29
  pub struct PdfConfig {
30
+ /// PDF extraction backend. Default: `Pdfium`.
31
+ #[serde(default)]
32
+ pub backend: PdfBackend,
12
33
  /// Extract images from PDF
13
34
  #[serde(default)]
14
35
  pub extract_images: bool,
@@ -86,6 +107,7 @@ pub struct HierarchyConfig {
86
107
  impl Default for PdfConfig {
87
108
  fn default() -> Self {
88
109
  Self {
110
+ backend: PdfBackend::default(),
89
111
  extract_images: false,
90
112
  passwords: None,
91
113
  extract_metadata: true,
@@ -155,6 +177,7 @@ mod tests {
155
177
  fn test_pdf_config_custom_margins() {
156
178
  use super::*;
157
179
  let config = PdfConfig {
180
+ backend: PdfBackend::default(),
158
181
  extract_images: false,
159
182
  passwords: None,
160
183
  extract_metadata: true,
@@ -174,7 +174,7 @@ impl DocOrientationDetector {
174
174
  message: format!("Failed to create doc_ori session builder: {e}"),
175
175
  source: None,
176
176
  })?
177
- .with_optimization_level(GraphOptimizationLevel::Level3)
177
+ .with_optimization_level(GraphOptimizationLevel::All)
178
178
  .map_err(|e| KreuzbergError::Ocr {
179
179
  message: format!("Failed to set doc_ori optimization level: {e}"),
180
180
  source: None,