kreuzberg 4.0.0.pre.rc.17 → 4.0.0.pre.rc.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +3 -3
  3. data/ext/kreuzberg_rb/native/Cargo.lock +336 -13
  4. data/ext/kreuzberg_rb/native/Cargo.toml +2 -5
  5. data/ext/kreuzberg_rb/native/build.rs +3 -64
  6. data/lib/kreuzberg/config.rb +41 -6
  7. data/lib/kreuzberg/version.rb +1 -1
  8. data/sig/kreuzberg.rbs +10 -3
  9. data/spec/binding/config_spec.rb +74 -0
  10. data/spec/binding/font_config_spec.rb +220 -0
  11. data/vendor/Cargo.toml +3 -2
  12. data/vendor/kreuzberg/Cargo.toml +10 -9
  13. data/vendor/kreuzberg/src/api/handlers.rs +3 -2
  14. data/vendor/kreuzberg/src/api/server.rs +211 -36
  15. data/vendor/kreuzberg/src/api/types.rs +20 -7
  16. data/vendor/kreuzberg/src/core/batch_optimizations.rs +106 -26
  17. data/vendor/kreuzberg/src/core/config.rs +253 -22
  18. data/vendor/kreuzberg/src/core/extractor.rs +41 -3
  19. data/vendor/kreuzberg/src/core/pipeline.rs +15 -59
  20. data/vendor/kreuzberg/src/extraction/capacity.rs +270 -0
  21. data/vendor/kreuzberg/src/extraction/docx.rs +26 -17
  22. data/vendor/kreuzberg/src/extraction/excel.rs +132 -128
  23. data/vendor/kreuzberg/src/extraction/html.rs +13 -14
  24. data/vendor/kreuzberg/src/extraction/markdown.rs +4 -3
  25. data/vendor/kreuzberg/src/extraction/mod.rs +12 -0
  26. data/vendor/kreuzberg/src/extraction/pptx.rs +4 -1
  27. data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
  28. data/vendor/kreuzberg/src/extractors/email.rs +1 -1
  29. data/vendor/kreuzberg/src/extractors/excel.rs +62 -21
  30. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
  31. data/vendor/kreuzberg/src/extractors/html.rs +30 -18
  32. data/vendor/kreuzberg/src/extractors/jats.rs +1 -1
  33. data/vendor/kreuzberg/src/extractors/markdown.rs +3 -3
  34. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -2
  35. data/vendor/kreuzberg/src/extractors/pdf.rs +33 -25
  36. data/vendor/kreuzberg/src/extractors/typst.rs +2 -2
  37. data/vendor/kreuzberg/src/pdf/bindings.rs +0 -37
  38. data/vendor/kreuzberg/src/pdf/metadata.rs +37 -27
  39. data/vendor/kreuzberg/src/pdf/table.rs +20 -20
  40. data/vendor/kreuzberg/src/text/quality.rs +30 -15
  41. data/vendor/kreuzberg/src/text/quality_processor.rs +3 -11
  42. data/vendor/kreuzberg/src/text/token_reduction/core.rs +75 -60
  43. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +20 -8
  44. data/vendor/kreuzberg/src/types.rs +5 -4
  45. data/vendor/kreuzberg/src/utils/mod.rs +3 -0
  46. data/vendor/kreuzberg/src/utils/pool.rs +172 -15
  47. data/vendor/kreuzberg/src/utils/pool_sizing.rs +393 -0
  48. data/vendor/kreuzberg/src/utils/string_pool.rs +373 -10
  49. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +504 -0
  50. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +302 -0
  51. data/vendor/kreuzberg/tests/api_tests.rs +514 -0
  52. data/vendor/kreuzberg/tests/concurrency_stress.rs +4 -0
  53. data/vendor/kreuzberg/tests/email_integration.rs +1 -1
  54. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +1 -1
  55. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +2 -2
  56. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -0
  57. data/vendor/kreuzberg/tests/pdfium_linking.rs +4 -4
  58. data/vendor/kreuzberg/tests/pipeline_integration.rs +8 -0
  59. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -1
  60. data/vendor/kreuzberg-ffi/Cargo.toml +74 -0
  61. data/vendor/kreuzberg-ffi/README.md +851 -0
  62. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +230 -0
  63. data/vendor/kreuzberg-ffi/build.rs +176 -0
  64. data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
  65. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  66. data/vendor/kreuzberg-ffi/kreuzberg.h +2959 -0
  67. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +624 -0
  68. data/vendor/kreuzberg-ffi/src/config.rs +1050 -0
  69. data/vendor/kreuzberg-ffi/src/error.rs +950 -0
  70. data/vendor/kreuzberg-ffi/src/lib.rs +4109 -0
  71. data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
  72. data/vendor/kreuzberg-ffi/src/result.rs +517 -0
  73. data/vendor/kreuzberg-ffi/src/result_pool.rs +675 -0
  74. data/vendor/kreuzberg-ffi/src/result_view.rs +815 -0
  75. data/vendor/kreuzberg-ffi/src/string_intern.rs +596 -0
  76. data/vendor/kreuzberg-ffi/src/validation.rs +938 -0
  77. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  78. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  79. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  80. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  81. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  82. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  83. data/vendor/kreuzberg-tesseract/build.rs +0 -227
  84. metadata +29 -33
  85. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  86. data/vendor/rb-sys/Cargo.lock +0 -393
  87. data/vendor/rb-sys/Cargo.toml +0 -70
  88. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  89. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  90. data/vendor/rb-sys/LICENSE-MIT +0 -21
  91. data/vendor/rb-sys/build/features.rs +0 -111
  92. data/vendor/rb-sys/build/main.rs +0 -286
  93. data/vendor/rb-sys/build/stable_api_config.rs +0 -155
  94. data/vendor/rb-sys/build/version.rs +0 -50
  95. data/vendor/rb-sys/readme.md +0 -36
  96. data/vendor/rb-sys/src/bindings.rs +0 -21
  97. data/vendor/rb-sys/src/hidden.rs +0 -11
  98. data/vendor/rb-sys/src/lib.rs +0 -35
  99. data/vendor/rb-sys/src/macros.rs +0 -371
  100. data/vendor/rb-sys/src/memory.rs +0 -53
  101. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  102. data/vendor/rb-sys/src/special_consts.rs +0 -31
  103. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  104. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  105. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -324
  106. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -332
  107. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -325
  108. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -323
  109. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -339
  110. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -339
  111. data/vendor/rb-sys/src/stable_api.rs +0 -260
  112. data/vendor/rb-sys/src/symbol.rs +0 -31
  113. data/vendor/rb-sys/src/tracking_allocator.rs +0 -330
  114. data/vendor/rb-sys/src/utils.rs +0 -89
  115. data/vendor/rb-sys/src/value_type.rs +0 -7
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f91977b1472bb6211f3ac2efad274e8cbc77dc5ed9832529eccbebeae1f74b4f
4
- data.tar.gz: b8a32377a80cfec656e8ddd65576dc220f497fc65b7b45b307db54a0b3b4a274
3
+ metadata.gz: 0f5df1c1138122d449d77193b97ee6c4f40de044077765f1d68ce4f0bc6aba2a
4
+ data.tar.gz: c48abedda657f892a912cd9cca7f40167fa3257d75f98527e0bc95da4580e630
5
5
  SHA512:
6
- metadata.gz: cb391d9f82848e0b19b0c8df2cce7db455d1b73ba5e5c6dd63a2cc87732d8dd0cd6596ca7f9b305061d9400db95c5890292efbd16af7e55a9434f3f29a337642
7
- data.tar.gz: 9e41afcc217e00d9feb3f8c4adecb7152743227f2c77f4bcfd9fd5e3d4b64b01171d3bdbb2b1290e10d2efdf13cdb53feb8fee01a95b6ba4d87ea76425b56692
6
+ metadata.gz: a2a0a7854003f48d69eb89cf79a3252aadba11f001edfe7ba4d03f16198b3d68394bd84589c5b379c7a4dcd4784391a2fd3b1c5ce636d8a490382a77d62fd671
7
+ data.tar.gz: f3d571515eb5598e34fdc8dd18296cd069a6fa25e7cf9017a9f3f1980a82fcebca977e9fc18e361d0f00386f72109ffe8f3e1afcf15dcbc35b5e6472b3f83853
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.0.0.pre.rc.17)
4
+ kreuzberg (4.0.0.pre.rc.19)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -68,7 +68,7 @@ GEM
68
68
  ffi (~> 1.0)
69
69
  rb_sys (0.9.123)
70
70
  rake-compiler-dock (= 1.10.0)
71
- rbs (3.9.5)
71
+ rbs (3.10.0)
72
72
  logger
73
73
  regexp_parser (2.11.3)
74
74
  rspec (3.13.2)
@@ -84,7 +84,7 @@ GEM
84
84
  diff-lcs (>= 1.2.0, < 2.0)
85
85
  rspec-support (~> 3.13.0)
86
86
  rspec-support (3.13.6)
87
- rubocop (1.82.0)
87
+ rubocop (1.82.1)
88
88
  json (~> 2.3)
89
89
  language_server-protocol (~> 3.17.0.2)
90
90
  lint_roller (~> 1.1.0)
@@ -75,6 +75,56 @@ dependencies = [
75
75
  "libc",
76
76
  ]
77
77
 
78
+ [[package]]
79
+ name = "anstream"
80
+ version = "0.6.21"
81
+ source = "registry+https://github.com/rust-lang/crates.io-index"
82
+ checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a"
83
+ dependencies = [
84
+ "anstyle",
85
+ "anstyle-parse",
86
+ "anstyle-query",
87
+ "anstyle-wincon",
88
+ "colorchoice",
89
+ "is_terminal_polyfill",
90
+ "utf8parse",
91
+ ]
92
+
93
+ [[package]]
94
+ name = "anstyle"
95
+ version = "1.0.13"
96
+ source = "registry+https://github.com/rust-lang/crates.io-index"
97
+ checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78"
98
+
99
+ [[package]]
100
+ name = "anstyle-parse"
101
+ version = "0.2.7"
102
+ source = "registry+https://github.com/rust-lang/crates.io-index"
103
+ checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2"
104
+ dependencies = [
105
+ "utf8parse",
106
+ ]
107
+
108
+ [[package]]
109
+ name = "anstyle-query"
110
+ version = "1.1.5"
111
+ source = "registry+https://github.com/rust-lang/crates.io-index"
112
+ checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
113
+ dependencies = [
114
+ "windows-sys 0.61.2",
115
+ ]
116
+
117
+ [[package]]
118
+ name = "anstyle-wincon"
119
+ version = "3.0.11"
120
+ source = "registry+https://github.com/rust-lang/crates.io-index"
121
+ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
122
+ dependencies = [
123
+ "anstyle",
124
+ "once_cell_polyfill",
125
+ "windows-sys 0.61.2",
126
+ ]
127
+
78
128
  [[package]]
79
129
  name = "anyhow"
80
130
  version = "1.0.100"
@@ -455,6 +505,18 @@ dependencies = [
455
505
  "core2",
456
506
  ]
457
507
 
508
+ [[package]]
509
+ name = "bitvec"
510
+ version = "1.0.1"
511
+ source = "registry+https://github.com/rust-lang/crates.io-index"
512
+ checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c"
513
+ dependencies = [
514
+ "funty",
515
+ "radium",
516
+ "tap",
517
+ "wyz",
518
+ ]
519
+
458
520
  [[package]]
459
521
  name = "blake3"
460
522
  version = "1.8.2"
@@ -596,6 +658,25 @@ dependencies = [
596
658
  "cipher",
597
659
  ]
598
660
 
661
+ [[package]]
662
+ name = "cbindgen"
663
+ version = "0.29.2"
664
+ source = "registry+https://github.com/rust-lang/crates.io-index"
665
+ checksum = "befbfd072a8e81c02f8c507aefce431fe5e7d051f83d48a23ffc9b9fe5a11799"
666
+ dependencies = [
667
+ "clap",
668
+ "heck",
669
+ "indexmap",
670
+ "log",
671
+ "proc-macro2",
672
+ "quote",
673
+ "serde",
674
+ "serde_json",
675
+ "syn",
676
+ "tempfile",
677
+ "toml 0.9.10+spec-1.1.0",
678
+ ]
679
+
599
680
  [[package]]
600
681
  name = "cc"
601
682
  version = "1.2.50"
@@ -696,6 +777,33 @@ dependencies = [
696
777
  "libloading 0.8.9",
697
778
  ]
698
779
 
780
+ [[package]]
781
+ name = "clap"
782
+ version = "4.5.53"
783
+ source = "registry+https://github.com/rust-lang/crates.io-index"
784
+ checksum = "c9e340e012a1bf4935f5282ed1436d1489548e8f72308207ea5df0e23d2d03f8"
785
+ dependencies = [
786
+ "clap_builder",
787
+ ]
788
+
789
+ [[package]]
790
+ name = "clap_builder"
791
+ version = "4.5.53"
792
+ source = "registry+https://github.com/rust-lang/crates.io-index"
793
+ checksum = "d76b5d13eaa18c901fd2f7fca939fefe3a0727a953561fefdf3b2922b8569d00"
794
+ dependencies = [
795
+ "anstream",
796
+ "anstyle",
797
+ "clap_lex",
798
+ "strsim",
799
+ ]
800
+
801
+ [[package]]
802
+ name = "clap_lex"
803
+ version = "0.7.6"
804
+ source = "registry+https://github.com/rust-lang/crates.io-index"
805
+ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
806
+
699
807
  [[package]]
700
808
  name = "cmake"
701
809
  version = "0.1.57"
@@ -720,6 +828,12 @@ version = "1.1.0"
720
828
  source = "registry+https://github.com/rust-lang/crates.io-index"
721
829
  checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b"
722
830
 
831
+ [[package]]
832
+ name = "colorchoice"
833
+ version = "1.0.4"
834
+ source = "registry+https://github.com/rust-lang/crates.io-index"
835
+ checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75"
836
+
723
837
  [[package]]
724
838
  name = "compact_str"
725
839
  version = "0.9.0"
@@ -987,6 +1101,20 @@ dependencies = [
987
1101
  "serde",
988
1102
  ]
989
1103
 
1104
+ [[package]]
1105
+ name = "dashmap"
1106
+ version = "6.1.0"
1107
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1108
+ checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
1109
+ dependencies = [
1110
+ "cfg-if",
1111
+ "crossbeam-utils",
1112
+ "hashbrown 0.14.5",
1113
+ "lock_api",
1114
+ "once_cell",
1115
+ "parking_lot_core",
1116
+ ]
1117
+
990
1118
  [[package]]
991
1119
  name = "debug_unsafe"
992
1120
  version = "0.1.3"
@@ -1078,13 +1206,34 @@ dependencies = [
1078
1206
  "subtle",
1079
1207
  ]
1080
1208
 
1209
+ [[package]]
1210
+ name = "dirs"
1211
+ version = "5.0.1"
1212
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1213
+ checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225"
1214
+ dependencies = [
1215
+ "dirs-sys 0.4.1",
1216
+ ]
1217
+
1081
1218
  [[package]]
1082
1219
  name = "dirs"
1083
1220
  version = "6.0.0"
1084
1221
  source = "registry+https://github.com/rust-lang/crates.io-index"
1085
1222
  checksum = "c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e"
1086
1223
  dependencies = [
1087
- "dirs-sys",
1224
+ "dirs-sys 0.5.0",
1225
+ ]
1226
+
1227
+ [[package]]
1228
+ name = "dirs-sys"
1229
+ version = "0.4.1"
1230
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1231
+ checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c"
1232
+ dependencies = [
1233
+ "libc",
1234
+ "option-ext",
1235
+ "redox_users 0.4.6",
1236
+ "windows-sys 0.48.0",
1088
1237
  ]
1089
1238
 
1090
1239
  [[package]]
@@ -1095,7 +1244,7 @@ checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab"
1095
1244
  dependencies = [
1096
1245
  "libc",
1097
1246
  "option-ext",
1098
- "redox_users",
1247
+ "redox_users 0.5.2",
1099
1248
  "windows-sys 0.61.2",
1100
1249
  ]
1101
1250
 
@@ -1434,6 +1583,12 @@ dependencies = [
1434
1583
  "windows-sys 0.59.0",
1435
1584
  ]
1436
1585
 
1586
+ [[package]]
1587
+ name = "funty"
1588
+ version = "2.0.0"
1589
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1590
+ checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
1591
+
1437
1592
  [[package]]
1438
1593
  name = "futf"
1439
1594
  version = "0.1.5"
@@ -1638,6 +1793,12 @@ dependencies = [
1638
1793
  "zerocopy",
1639
1794
  ]
1640
1795
 
1796
+ [[package]]
1797
+ name = "hashbrown"
1798
+ version = "0.14.5"
1799
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1800
+ checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
1801
+
1641
1802
  [[package]]
1642
1803
  name = "hashbrown"
1643
1804
  version = "0.15.5"
@@ -1701,7 +1862,7 @@ version = "0.4.3"
1701
1862
  source = "registry+https://github.com/rust-lang/crates.io-index"
1702
1863
  checksum = "629d8f3bbeda9d148036d6b0de0a3ab947abd08ce90626327fc3547a49d59d97"
1703
1864
  dependencies = [
1704
- "dirs",
1865
+ "dirs 6.0.0",
1705
1866
  "http",
1706
1867
  "indicatif",
1707
1868
  "libc",
@@ -1744,15 +1905,16 @@ dependencies = [
1744
1905
 
1745
1906
  [[package]]
1746
1907
  name = "html-to-markdown-rs"
1747
- version = "2.15.0"
1908
+ version = "2.16.1"
1748
1909
  source = "registry+https://github.com/rust-lang/crates.io-index"
1749
- checksum = "7741e7928e84f3f3497c84b8dd27e9fcc3368bd133e44ca800715eb34a1d58c8"
1910
+ checksum = "eda029e154a976514850a89a56a1f07f03fb0611e0e8fc2357fd4ec739d63acc"
1750
1911
  dependencies = [
1751
1912
  "astral-tl",
1752
1913
  "base64 0.22.1",
1753
1914
  "html-escape",
1754
1915
  "html5ever",
1755
1916
  "image",
1917
+ "lru",
1756
1918
  "markup5ever_rcdom",
1757
1919
  "once_cell",
1758
1920
  "regex",
@@ -2184,6 +2346,12 @@ dependencies = [
2184
2346
  "serde",
2185
2347
  ]
2186
2348
 
2349
+ [[package]]
2350
+ name = "is_terminal_polyfill"
2351
+ version = "1.70.2"
2352
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2353
+ checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
2354
+
2187
2355
  [[package]]
2188
2356
  name = "itertools"
2189
2357
  version = "0.12.1"
@@ -2280,7 +2448,7 @@ dependencies = [
2280
2448
 
2281
2449
  [[package]]
2282
2450
  name = "kreuzberg"
2283
- version = "4.0.0-rc.14"
2451
+ version = "4.0.0-rc.18"
2284
2452
  dependencies = [
2285
2453
  "ahash",
2286
2454
  "async-trait",
@@ -2288,8 +2456,11 @@ dependencies = [
2288
2456
  "base64 0.22.1",
2289
2457
  "base64-simd",
2290
2458
  "biblatex",
2459
+ "bitvec",
2291
2460
  "calamine",
2292
2461
  "chardetng",
2462
+ "dashmap",
2463
+ "dirs 5.0.1",
2293
2464
  "docx-lite",
2294
2465
  "encoding_rs",
2295
2466
  "fast_image_resize",
@@ -2315,6 +2486,7 @@ dependencies = [
2315
2486
  "opentelemetry",
2316
2487
  "opentelemetry_sdk",
2317
2488
  "org",
2489
+ "parking_lot",
2318
2490
  "pastey 0.2.1",
2319
2491
  "pdfium-render",
2320
2492
  "pkg-config",
@@ -2334,6 +2506,7 @@ dependencies = [
2334
2506
  "serde_json",
2335
2507
  "serde_yaml_ng",
2336
2508
  "sevenz-rust2",
2509
+ "simdutf8",
2337
2510
  "tar",
2338
2511
  "text-splitter",
2339
2512
  "thiserror 2.0.17",
@@ -2352,13 +2525,27 @@ dependencies = [
2352
2525
  "zip 7.0.0",
2353
2526
  ]
2354
2527
 
2528
+ [[package]]
2529
+ name = "kreuzberg-ffi"
2530
+ version = "4.0.0-rc.18"
2531
+ dependencies = [
2532
+ "async-trait",
2533
+ "cbindgen",
2534
+ "html-to-markdown-rs",
2535
+ "kreuzberg",
2536
+ "serde",
2537
+ "serde_json",
2538
+ "tokio",
2539
+ ]
2540
+
2355
2541
  [[package]]
2356
2542
  name = "kreuzberg-rb"
2357
- version = "4.0.0-rc.17"
2543
+ version = "4.0.0-rc.19"
2358
2544
  dependencies = [
2359
2545
  "async-trait",
2360
2546
  "html-to-markdown-rs",
2361
2547
  "kreuzberg",
2548
+ "kreuzberg-ffi",
2362
2549
  "magnus",
2363
2550
  "pretty_assertions",
2364
2551
  "rb-sys",
@@ -2368,7 +2555,9 @@ dependencies = [
2368
2555
 
2369
2556
  [[package]]
2370
2557
  name = "kreuzberg-tesseract"
2371
- version = "4.0.0-rc.14"
2558
+ version = "4.0.0-rc.18"
2559
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2560
+ checksum = "477791cd4bba39222e187ae6e235643e034a87c96f2c6fb5796667020560adba"
2372
2561
  dependencies = [
2373
2562
  "cc",
2374
2563
  "cmake",
@@ -2553,6 +2742,15 @@ dependencies = [
2553
2742
  "weezl",
2554
2743
  ]
2555
2744
 
2745
+ [[package]]
2746
+ name = "lru"
2747
+ version = "0.16.2"
2748
+ source = "registry+https://github.com/rust-lang/crates.io-index"
2749
+ checksum = "96051b46fc183dc9cd4a223960ef37b9af631b55191852a8274bfef064cda20f"
2750
+ dependencies = [
2751
+ "hashbrown 0.16.1",
2752
+ ]
2753
+
2556
2754
  [[package]]
2557
2755
  name = "lru-slab"
2558
2756
  version = "0.1.2"
@@ -3051,6 +3249,12 @@ version = "1.21.3"
3051
3249
  source = "registry+https://github.com/rust-lang/crates.io-index"
3052
3250
  checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
3053
3251
 
3252
+ [[package]]
3253
+ name = "once_cell_polyfill"
3254
+ version = "1.70.2"
3255
+ source = "registry+https://github.com/rust-lang/crates.io-index"
3256
+ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
3257
+
3054
3258
  [[package]]
3055
3259
  name = "onig"
3056
3260
  version = "6.5.1"
@@ -4124,6 +4328,12 @@ version = "5.3.0"
4124
4328
  source = "registry+https://github.com/rust-lang/crates.io-index"
4125
4329
  checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
4126
4330
 
4331
+ [[package]]
4332
+ name = "radium"
4333
+ version = "0.7.0"
4334
+ source = "registry+https://github.com/rust-lang/crates.io-index"
4335
+ checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09"
4336
+
4127
4337
  [[package]]
4128
4338
  name = "rake"
4129
4339
  version = "0.3.6"
@@ -4278,16 +4488,18 @@ dependencies = [
4278
4488
 
4279
4489
  [[package]]
4280
4490
  name = "rb-sys"
4281
- version = "0.9.119"
4491
+ version = "0.9.123"
4492
+ source = "registry+https://github.com/rust-lang/crates.io-index"
4493
+ checksum = "45fb1a185af97ee456f1c9e56dbe6e2e662bec4fdeaf83c4c28e0e6adfb18816"
4282
4494
  dependencies = [
4283
4495
  "rb-sys-build",
4284
4496
  ]
4285
4497
 
4286
4498
  [[package]]
4287
4499
  name = "rb-sys-build"
4288
- version = "0.9.119"
4500
+ version = "0.9.123"
4289
4501
  source = "registry+https://github.com/rust-lang/crates.io-index"
4290
- checksum = "2e0109499e06c85f56df4abad7d9c642ea8a2dd821d1d7132b4d1b69534677f3"
4502
+ checksum = "a58ebd02d7a6033e6a5f6f8d150c1e9f16506039092b84a73e6bedce6d3adf41"
4291
4503
  dependencies = [
4292
4504
  "bindgen",
4293
4505
  "lazy_static",
@@ -4333,6 +4545,17 @@ dependencies = [
4333
4545
  "bitflags",
4334
4546
  ]
4335
4547
 
4548
+ [[package]]
4549
+ name = "redox_users"
4550
+ version = "0.4.6"
4551
+ source = "registry+https://github.com/rust-lang/crates.io-index"
4552
+ checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43"
4553
+ dependencies = [
4554
+ "getrandom 0.2.16",
4555
+ "libredox",
4556
+ "thiserror 1.0.69",
4557
+ ]
4558
+
4336
4559
  [[package]]
4337
4560
  name = "redox_users"
4338
4561
  version = "0.5.2"
@@ -4395,9 +4618,9 @@ checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
4395
4618
 
4396
4619
  [[package]]
4397
4620
  name = "reqwest"
4398
- version = "0.12.26"
4621
+ version = "0.12.28"
4399
4622
  source = "registry+https://github.com/rust-lang/crates.io-index"
4400
- checksum = "3b4c14b2d9afca6a60277086b0cc6a6ae0b568f6f7916c943a8cdc79f8be240f"
4623
+ checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147"
4401
4624
  dependencies = [
4402
4625
  "base64 0.22.1",
4403
4626
  "bytes",
@@ -5218,6 +5441,12 @@ dependencies = [
5218
5441
  "syn",
5219
5442
  ]
5220
5443
 
5444
+ [[package]]
5445
+ name = "tap"
5446
+ version = "1.0.1"
5447
+ source = "registry+https://github.com/rust-lang/crates.io-index"
5448
+ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
5449
+
5221
5450
  [[package]]
5222
5451
  name = "tar"
5223
5452
  version = "0.4.44"
@@ -5229,6 +5458,19 @@ dependencies = [
5229
5458
  "xattr",
5230
5459
  ]
5231
5460
 
5461
+ [[package]]
5462
+ name = "tempfile"
5463
+ version = "3.23.0"
5464
+ source = "registry+https://github.com/rust-lang/crates.io-index"
5465
+ checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16"
5466
+ dependencies = [
5467
+ "fastrand",
5468
+ "getrandom 0.3.4",
5469
+ "once_cell",
5470
+ "rustix",
5471
+ "windows-sys 0.61.2",
5472
+ ]
5473
+
5232
5474
  [[package]]
5233
5475
  name = "tendril"
5234
5476
  version = "0.4.3"
@@ -5937,6 +6179,12 @@ version = "1.0.4"
5937
6179
  source = "registry+https://github.com/rust-lang/crates.io-index"
5938
6180
  checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
5939
6181
 
6182
+ [[package]]
6183
+ name = "utf8parse"
6184
+ version = "0.2.2"
6185
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6186
+ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
6187
+
5940
6188
  [[package]]
5941
6189
  name = "uuid"
5942
6190
  version = "1.19.0"
@@ -6253,6 +6501,15 @@ dependencies = [
6253
6501
  "windows-link",
6254
6502
  ]
6255
6503
 
6504
+ [[package]]
6505
+ name = "windows-sys"
6506
+ version = "0.48.0"
6507
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6508
+ checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
6509
+ dependencies = [
6510
+ "windows-targets 0.48.5",
6511
+ ]
6512
+
6256
6513
  [[package]]
6257
6514
  name = "windows-sys"
6258
6515
  version = "0.52.0"
@@ -6289,6 +6546,21 @@ dependencies = [
6289
6546
  "windows-link",
6290
6547
  ]
6291
6548
 
6549
+ [[package]]
6550
+ name = "windows-targets"
6551
+ version = "0.48.5"
6552
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6553
+ checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
6554
+ dependencies = [
6555
+ "windows_aarch64_gnullvm 0.48.5",
6556
+ "windows_aarch64_msvc 0.48.5",
6557
+ "windows_i686_gnu 0.48.5",
6558
+ "windows_i686_msvc 0.48.5",
6559
+ "windows_x86_64_gnu 0.48.5",
6560
+ "windows_x86_64_gnullvm 0.48.5",
6561
+ "windows_x86_64_msvc 0.48.5",
6562
+ ]
6563
+
6292
6564
  [[package]]
6293
6565
  name = "windows-targets"
6294
6566
  version = "0.52.6"
@@ -6322,6 +6594,12 @@ dependencies = [
6322
6594
  "windows_x86_64_msvc 0.53.1",
6323
6595
  ]
6324
6596
 
6597
+ [[package]]
6598
+ name = "windows_aarch64_gnullvm"
6599
+ version = "0.48.5"
6600
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6601
+ checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
6602
+
6325
6603
  [[package]]
6326
6604
  name = "windows_aarch64_gnullvm"
6327
6605
  version = "0.52.6"
@@ -6334,6 +6612,12 @@ version = "0.53.1"
6334
6612
  source = "registry+https://github.com/rust-lang/crates.io-index"
6335
6613
  checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53"
6336
6614
 
6615
+ [[package]]
6616
+ name = "windows_aarch64_msvc"
6617
+ version = "0.48.5"
6618
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6619
+ checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
6620
+
6337
6621
  [[package]]
6338
6622
  name = "windows_aarch64_msvc"
6339
6623
  version = "0.52.6"
@@ -6346,6 +6630,12 @@ version = "0.53.1"
6346
6630
  source = "registry+https://github.com/rust-lang/crates.io-index"
6347
6631
  checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006"
6348
6632
 
6633
+ [[package]]
6634
+ name = "windows_i686_gnu"
6635
+ version = "0.48.5"
6636
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6637
+ checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
6638
+
6349
6639
  [[package]]
6350
6640
  name = "windows_i686_gnu"
6351
6641
  version = "0.52.6"
@@ -6370,6 +6660,12 @@ version = "0.53.1"
6370
6660
  source = "registry+https://github.com/rust-lang/crates.io-index"
6371
6661
  checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c"
6372
6662
 
6663
+ [[package]]
6664
+ name = "windows_i686_msvc"
6665
+ version = "0.48.5"
6666
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6667
+ checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
6668
+
6373
6669
  [[package]]
6374
6670
  name = "windows_i686_msvc"
6375
6671
  version = "0.52.6"
@@ -6382,6 +6678,12 @@ version = "0.53.1"
6382
6678
  source = "registry+https://github.com/rust-lang/crates.io-index"
6383
6679
  checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2"
6384
6680
 
6681
+ [[package]]
6682
+ name = "windows_x86_64_gnu"
6683
+ version = "0.48.5"
6684
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6685
+ checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
6686
+
6385
6687
  [[package]]
6386
6688
  name = "windows_x86_64_gnu"
6387
6689
  version = "0.52.6"
@@ -6394,6 +6696,12 @@ version = "0.53.1"
6394
6696
  source = "registry+https://github.com/rust-lang/crates.io-index"
6395
6697
  checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499"
6396
6698
 
6699
+ [[package]]
6700
+ name = "windows_x86_64_gnullvm"
6701
+ version = "0.48.5"
6702
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6703
+ checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
6704
+
6397
6705
  [[package]]
6398
6706
  name = "windows_x86_64_gnullvm"
6399
6707
  version = "0.52.6"
@@ -6406,6 +6714,12 @@ version = "0.53.1"
6406
6714
  source = "registry+https://github.com/rust-lang/crates.io-index"
6407
6715
  checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1"
6408
6716
 
6717
+ [[package]]
6718
+ name = "windows_x86_64_msvc"
6719
+ version = "0.48.5"
6720
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6721
+ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
6722
+
6409
6723
  [[package]]
6410
6724
  name = "windows_x86_64_msvc"
6411
6725
  version = "0.52.6"
@@ -6439,6 +6753,15 @@ version = "0.6.2"
6439
6753
  source = "registry+https://github.com/rust-lang/crates.io-index"
6440
6754
  checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
6441
6755
 
6756
+ [[package]]
6757
+ name = "wyz"
6758
+ version = "0.5.1"
6759
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6760
+ checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed"
6761
+ dependencies = [
6762
+ "tap",
6763
+ ]
6764
+
6442
6765
  [[package]]
6443
6766
  name = "xattr"
6444
6767
  version = "1.6.1"
@@ -1,13 +1,9 @@
1
1
  # This crate is excluded from the workspace to use a vendored kreuzberg crate for gem packaging
2
2
  [workspace]
3
3
 
4
- [patch.crates-io]
5
- # Patch rb-sys to fix Windows i32/i64 type mismatch in tracking_allocator.rs
6
- rb-sys = { path = "../../../vendor/rb-sys" }
7
-
8
4
  [package]
9
5
  name = "kreuzberg-rb"
10
- version = "4.0.0-rc.17"
6
+ version = "4.0.0-rc.19"
11
7
  edition = "2024"
12
8
  rust-version = "1.91"
13
9
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -30,6 +26,7 @@ default = []
30
26
  [dependencies]
31
27
  async-trait = "0.1.89"
32
28
  kreuzberg = { path = "../../../vendor/kreuzberg", features = ["full"] }
29
+ kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi" }
33
30
  magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
34
31
  "rb-sys",
35
32
  ] }