kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
data/Gemfile.lock ADDED
@@ -0,0 +1,260 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ kreuzberg (4.3.5)
5
+ rb_sys (~> 0.9.119)
6
+
7
+ GEM
8
+ remote: https://rubygems.org/
9
+ specs:
10
+ activesupport (8.1.2)
11
+ base64
12
+ bigdecimal
13
+ concurrent-ruby (~> 1.0, >= 1.3.1)
14
+ connection_pool (>= 2.2.5)
15
+ drb
16
+ i18n (>= 1.6, < 2)
17
+ json
18
+ logger (>= 1.4.2)
19
+ minitest (>= 5.1)
20
+ securerandom (>= 0.3)
21
+ tzinfo (~> 2.0, >= 2.0.5)
22
+ uri (>= 0.13.1)
23
+ ast (2.4.3)
24
+ base64 (0.3.0)
25
+ bigdecimal (4.0.1)
26
+ byebug (13.0.0)
27
+ reline (>= 0.6.0)
28
+ coderay (1.1.3)
29
+ concurrent-ruby (1.3.6)
30
+ connection_pool (3.0.2)
31
+ csv (3.3.5)
32
+ diff-lcs (1.6.2)
33
+ drb (2.2.3)
34
+ ffi (1.17.3)
35
+ ffi (1.17.3-aarch64-linux-gnu)
36
+ ffi (1.17.3-aarch64-linux-musl)
37
+ ffi (1.17.3-arm-linux-gnu)
38
+ ffi (1.17.3-arm-linux-musl)
39
+ ffi (1.17.3-arm64-darwin)
40
+ ffi (1.17.3-x86-linux-gnu)
41
+ ffi (1.17.3-x86-linux-musl)
42
+ ffi (1.17.3-x86_64-darwin)
43
+ ffi (1.17.3-x86_64-linux-gnu)
44
+ ffi (1.17.3-x86_64-linux-musl)
45
+ fileutils (1.8.0)
46
+ i18n (1.14.8)
47
+ concurrent-ruby (~> 1.0)
48
+ io-console (0.8.2)
49
+ json (2.18.1)
50
+ language_server-protocol (3.17.0.5)
51
+ lint_roller (1.1.0)
52
+ listen (3.10.0)
53
+ logger
54
+ rb-fsevent (~> 0.10, >= 0.10.3)
55
+ rb-inotify (~> 0.9, >= 0.9.10)
56
+ logger (1.7.0)
57
+ method_source (1.1.0)
58
+ minitest (6.0.1)
59
+ prism (~> 1.5)
60
+ mutex_m (0.3.0)
61
+ parallel (1.27.0)
62
+ parser (3.3.10.2)
63
+ ast (~> 2.4.1)
64
+ racc
65
+ prism (1.9.0)
66
+ pry (0.16.0)
67
+ coderay (~> 1.1)
68
+ method_source (~> 1.0)
69
+ reline (>= 0.6.0)
70
+ pry-byebug (3.12.0)
71
+ byebug (~> 13.0)
72
+ pry (>= 0.13, < 0.17)
73
+ racc (1.8.1)
74
+ rainbow (3.1.1)
75
+ rake (13.3.1)
76
+ rake-compiler (1.3.1)
77
+ rake
78
+ rake-compiler-dock (1.11.0)
79
+ rb-fsevent (0.11.2)
80
+ rb-inotify (0.11.1)
81
+ ffi (~> 1.0)
82
+ rb_sys (0.9.124)
83
+ rake-compiler-dock (= 1.11.0)
84
+ rbs (3.10.3)
85
+ logger
86
+ tsort
87
+ regexp_parser (2.11.3)
88
+ reline (0.6.3)
89
+ io-console (~> 0.5)
90
+ rspec (3.13.2)
91
+ rspec-core (~> 3.13.0)
92
+ rspec-expectations (~> 3.13.0)
93
+ rspec-mocks (~> 3.13.0)
94
+ rspec-core (3.13.6)
95
+ rspec-support (~> 3.13.0)
96
+ rspec-expectations (3.13.5)
97
+ diff-lcs (>= 1.2.0, < 2.0)
98
+ rspec-support (~> 3.13.0)
99
+ rspec-mocks (3.13.7)
100
+ diff-lcs (>= 1.2.0, < 2.0)
101
+ rspec-support (~> 3.13.0)
102
+ rspec-support (3.13.7)
103
+ rubocop (1.84.2)
104
+ json (~> 2.3)
105
+ language_server-protocol (~> 3.17.0.2)
106
+ lint_roller (~> 1.1.0)
107
+ parallel (~> 1.10)
108
+ parser (>= 3.3.0.2)
109
+ rainbow (>= 2.2.2, < 4.0)
110
+ regexp_parser (>= 2.9.3, < 3.0)
111
+ rubocop-ast (>= 1.49.0, < 2.0)
112
+ ruby-progressbar (~> 1.7)
113
+ unicode-display_width (>= 2.4.0, < 4.0)
114
+ rubocop-ast (1.49.0)
115
+ parser (>= 3.3.7.2)
116
+ prism (~> 1.7)
117
+ rubocop-performance (1.26.1)
118
+ lint_roller (~> 1.1)
119
+ rubocop (>= 1.75.0, < 2.0)
120
+ rubocop-ast (>= 1.47.1, < 2.0)
121
+ rubocop-rspec (3.9.0)
122
+ lint_roller (~> 1.1)
123
+ rubocop (~> 1.81)
124
+ ruby-progressbar (1.13.0)
125
+ securerandom (0.4.1)
126
+ sorbet-runtime (0.6.12942)
127
+ steep (1.10.0)
128
+ activesupport (>= 5.1)
129
+ concurrent-ruby (>= 1.1.10)
130
+ csv (>= 3.0.9)
131
+ fileutils (>= 1.1.0)
132
+ json (>= 2.1.0)
133
+ language_server-protocol (>= 3.17.0.4, < 4.0)
134
+ listen (~> 3.0)
135
+ logger (>= 1.3.0)
136
+ mutex_m (>= 0.3.0)
137
+ parser (>= 3.1)
138
+ rainbow (>= 2.2.2, < 4.0)
139
+ rbs (~> 3.9)
140
+ securerandom (>= 0.1)
141
+ strscan (>= 1.0.0)
142
+ terminal-table (>= 2, < 5)
143
+ uri (>= 0.12.0)
144
+ strscan (3.1.7)
145
+ terminal-table (4.0.0)
146
+ unicode-display_width (>= 1.1.1, < 4)
147
+ tsort (0.2.0)
148
+ tzinfo (2.0.6)
149
+ concurrent-ruby (~> 1.0)
150
+ unicode-display_width (3.2.0)
151
+ unicode-emoji (~> 4.1)
152
+ unicode-emoji (4.2.0)
153
+ uri (1.1.1)
154
+ yard (0.9.38)
155
+
156
+ PLATFORMS
157
+ aarch64-linux-gnu
158
+ aarch64-linux-musl
159
+ arm-linux-gnu
160
+ arm-linux-musl
161
+ arm64-darwin
162
+ ruby
163
+ x86-linux-gnu
164
+ x86-linux-musl
165
+ x86_64-darwin
166
+ x86_64-linux
167
+ x86_64-linux-gnu
168
+ x86_64-linux-musl
169
+
170
+ DEPENDENCIES
171
+ bundler (~> 4.0)
172
+ kreuzberg!
173
+ pry (~> 0.14)
174
+ pry-byebug (~> 3.10)
175
+ rake (~> 13.0)
176
+ rake-compiler (~> 1.2)
177
+ rbs (~> 3.0)
178
+ rspec (~> 3.12)
179
+ rubocop (~> 1.66)
180
+ rubocop-performance (~> 1.21)
181
+ rubocop-rspec (~> 3.0)
182
+ sorbet-runtime (~> 0.5)
183
+ steep (~> 1.8)
184
+ yard (~> 0.9)
185
+
186
+ CHECKSUMS
187
+ activesupport (8.1.2) sha256=88842578ccd0d40f658289b0e8c842acfe9af751afee2e0744a7873f50b6fdae
188
+ ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
189
+ base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
190
+ bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
191
+ byebug (13.0.0) sha256=d2263efe751941ca520fa29744b71972d39cbc41839496706f5d9b22e92ae05d
192
+ coderay (1.1.3) sha256=dc530018a4684512f8f38143cd2a096c9f02a1fc2459edcfe534787a7fc77d4b
193
+ concurrent-ruby (1.3.6) sha256=6b56837e1e7e5292f9864f34b69c5a2cbc75c0cf5338f1ce9903d10fa762d5ab
194
+ connection_pool (3.0.2) sha256=33fff5ba71a12d2aa26cb72b1db8bba2a1a01823559fb01d29eb74c286e62e0a
195
+ csv (3.3.5) sha256=6e5134ac3383ef728b7f02725d9872934f523cb40b961479f69cf3afa6c8e73f
196
+ diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
197
+ drb (2.2.3) sha256=0b00d6fdb50995fe4a45dea13663493c841112e4068656854646f418fda13373
198
+ ffi (1.17.3) sha256=0e9f39f7bb3934f77ad6feab49662be77e87eedcdeb2a3f5c0234c2938563d4c
199
+ ffi (1.17.3-aarch64-linux-gnu) sha256=28ad573df26560f0aedd8a90c3371279a0b2bd0b4e834b16a2baa10bd7a97068
200
+ ffi (1.17.3-aarch64-linux-musl) sha256=020b33b76775b1abacc3b7d86b287cef3251f66d747092deec592c7f5df764b2
201
+ ffi (1.17.3-arm-linux-gnu) sha256=5bd4cea83b68b5ec0037f99c57d5ce2dd5aa438f35decc5ef68a7d085c785668
202
+ ffi (1.17.3-arm-linux-musl) sha256=0d7626bb96265f9af78afa33e267d71cfef9d9a8eb8f5525344f8da6c7d76053
203
+ ffi (1.17.3-arm64-darwin) sha256=0c690555d4cee17a7f07c04d59df39b2fba74ec440b19da1f685c6579bb0717f
204
+ ffi (1.17.3-x86-linux-gnu) sha256=868a88fcaf5186c3a46b7c7c2b2c34550e1e61a405670ab23f5b6c9971529089
205
+ ffi (1.17.3-x86-linux-musl) sha256=f0286aa6ef40605cf586e61406c446de34397b85dbb08cc99fdaddaef8343945
206
+ ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
207
+ ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
208
+ ffi (1.17.3-x86_64-linux-musl) sha256=086b221c3a68320b7564066f46fed23449a44f7a1935f1fe5a245bd89d9aea56
209
+ fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
210
+ i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
211
+ io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
212
+ json (2.18.1) sha256=fe112755501b8d0466b5ada6cf50c8c3f41e897fa128ac5d263ec09eedc9f986
213
+ kreuzberg (4.3.5)
214
+ language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
215
+ lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
216
+ listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
217
+ logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
218
+ method_source (1.1.0) sha256=181301c9c45b731b4769bc81e8860e72f9161ad7d66dd99103c9ab84f560f5c5
219
+ minitest (6.0.1) sha256=7854c74f48e2e975969062833adc4013f249a4b212f5e7b9d5c040bf838d54bb
220
+ mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
221
+ parallel (1.27.0) sha256=4ac151e1806b755fb4e2dc2332cbf0e54f2e24ba821ff2d3dcf86bf6dc4ae130
222
+ parser (3.3.10.2) sha256=6f60c84aa4bdcedb6d1a2434b738fe8a8136807b6adc8f7f53b97da9bc4e9357
223
+ prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
224
+ pry (0.16.0) sha256=d76c69065698ed1f85e717bd33d7942c38a50868f6b0673c636192b3d1b6054e
225
+ pry-byebug (3.12.0) sha256=594e094ae8a8390a7ad4c7b36ae36e13304ed02664c67417d108dc5f7213d1b7
226
+ racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
227
+ rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
228
+ rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
229
+ rake-compiler (1.3.1) sha256=6b351612b6e2d73ddd5563ee799bb58685176e05363db6758504bd11573d670a
230
+ rake-compiler-dock (1.11.0) sha256=eab51f2cd533eb35cea6b624a75281f047123e70a64c58b607471bb49428f8c2
231
+ rb-fsevent (0.11.2) sha256=43900b972e7301d6570f64b850a5aa67833ee7d87b458ee92805d56b7318aefe
232
+ rb-inotify (0.11.1) sha256=a0a700441239b0ff18eb65e3866236cd78613d6b9f78fea1f9ac47a85e47be6e
233
+ rb_sys (0.9.124) sha256=513476557b12eaf73764b3da9f8746024558fe8699bda785fb548c9aa3877ae7
234
+ rbs (3.10.3) sha256=70627f3919016134d554e6c99195552ae3ef6020fe034c8e983facc9c192daa6
235
+ regexp_parser (2.11.3) sha256=ca13f381a173b7a93450e53459075c9b76a10433caadcb2f1180f2c741fc55a4
236
+ reline (0.6.3) sha256=1198b04973565b36ec0f11542ab3f5cfeeec34823f4e54cebde90968092b1835
237
+ rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
238
+ rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
239
+ rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
240
+ rspec-mocks (3.13.7) sha256=0979034e64b1d7a838aaaddf12bf065ea4dc40ef3d4c39f01f93ae2c66c62b1c
241
+ rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
242
+ rubocop (1.84.2) sha256=5692cea54168f3dc8cb79a6fe95c5424b7ea893c707ad7a4307b0585e88dbf5f
243
+ rubocop-ast (1.49.0) sha256=49c3676d3123a0923d333e20c6c2dbaaae2d2287b475273fddee0c61da9f71fd
244
+ rubocop-performance (1.26.1) sha256=cd19b936ff196df85829d264b522fd4f98b6c89ad271fa52744a8c11b8f71834
245
+ rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
246
+ ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
247
+ securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
248
+ sorbet-runtime (0.6.12942) sha256=967bda04814d234e4239c4f883c1d0ee6de3e47bf8bafd2c0cc30d18df2ddd3a
249
+ steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
250
+ strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
251
+ terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
252
+ tsort (0.2.0) sha256=9650a793f6859a43b6641671278f79cfead60ac714148aabe4e3f0060480089f
253
+ tzinfo (2.0.6) sha256=8daf828cc77bcf7d63b0e3bdb6caa47e2272dcfaf4fbfe46f8c3a9df087a829b
254
+ unicode-display_width (3.2.0) sha256=0cdd96b5681a5949cdbc2c55e7b420facae74c4aaf9a9815eee1087cb1853c42
255
+ unicode-emoji (4.2.0) sha256=519e69150f75652e40bf736106cfbc8f0f73aa3fb6a65afe62fefa7f80b0f80f
256
+ uri (1.1.1) sha256=379fa58d27ffb1387eaada68c749d1426738bd0f654d812fcc07e7568f5c57c6
257
+ yard (0.9.38) sha256=721fb82afb10532aa49860655f6cc2eaa7130889df291b052e1e6b268283010f
258
+
259
+ BUNDLED WITH
260
+ 4.0.5
data/README.md ADDED
@@ -0,0 +1,399 @@
1
+ # Ruby
2
+
3
+ <div align="center" style="display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; margin: 20px 0;">
4
+ <!-- Language Bindings -->
5
+ <a href="https://crates.io/crates/kreuzberg">
6
+ <img src="https://img.shields.io/crates/v/kreuzberg?label=Rust&color=007ec6" alt="Rust">
7
+ </a>
8
+ <a href="https://hex.pm/packages/kreuzberg">
9
+ <img src="https://img.shields.io/hexpm/v/kreuzberg?label=Elixir&color=007ec6" alt="Elixir">
10
+ </a>
11
+ <a href="https://pypi.org/project/kreuzberg/">
12
+ <img src="https://img.shields.io/pypi/v/kreuzberg?label=Python&color=007ec6" alt="Python">
13
+ </a>
14
+ <a href="https://www.npmjs.com/package/@kreuzberg/node">
15
+ <img src="https://img.shields.io/npm/v/@kreuzberg/node?label=Node.js&color=007ec6" alt="Node.js">
16
+ </a>
17
+ <a href="https://www.npmjs.com/package/@kreuzberg/wasm">
18
+ <img src="https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM&color=007ec6" alt="WASM">
19
+ </a>
20
+
21
+ <a href="https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg">
22
+ <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
+ </a>
24
+ <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.3.5" alt="Go">
26
+ </a>
27
+ <a href="https://www.nuget.org/packages/Kreuzberg/">
28
+ <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
29
+ </a>
30
+ <a href="https://packagist.org/packages/kreuzberg/kreuzberg">
31
+ <img src="https://img.shields.io/packagist/v/kreuzberg/kreuzberg?label=PHP&color=007ec6" alt="PHP">
32
+ </a>
33
+ <a href="https://rubygems.org/gems/kreuzberg">
34
+ <img src="https://img.shields.io/gem/v/kreuzberg?label=Ruby&color=007ec6" alt="Ruby">
35
+ </a>
36
+ <a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
37
+ <img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
38
+ </a>
39
+
40
+ <!-- Project Info -->
41
+ <a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
42
+ <img src="https://img.shields.io/badge/License-MIT-blue.svg" alt="License">
43
+ </a>
44
+ <a href="https://docs.kreuzberg.dev">
45
+ <img src="https://img.shields.io/badge/docs-kreuzberg.dev-blue" alt="Documentation">
46
+ </a>
47
+ </div>
48
+
49
+ <img width="1128" height="191" alt="Banner2" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
50
+
51
+ <div align="center" style="margin-top: 20px;">
52
+ <a href="https://discord.gg/xt9WY3GnKR">
53
+ <img height="22" src="https://img.shields.io/badge/Discord-Join%20our%20community-7289da?logo=discord&logoColor=white" alt="Discord">
54
+ </a>
55
+ </div>
56
+
57
+
58
+ Extract text, tables, images, and metadata from 75+ file formats including PDF, Office documents, and images. Ruby bindings with idiomatic Ruby API and native performance.
59
+
60
+
61
+ ## Installation
62
+
63
+ ### Package Installation
64
+
65
+
66
+ Install via one of the supported package managers:
67
+
68
+
69
+
70
+ **gem:**
71
+ ```bash
72
+ gem install kreuzberg
73
+ ```
74
+
75
+
76
+
77
+
78
+ **Bundler:**
79
+ ```ruby
80
+ gem 'kreuzberg'
81
+ ```
82
+
83
+
84
+
85
+
86
+
87
+ ### System Requirements
88
+
89
+ - **Ruby 3.2.0 or higher** required (including Ruby 4.x)
90
+ - Ruby 4.0+ is fully supported with no code changes required
91
+ - Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.24+ for embeddings support
92
+ - Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
93
+
94
+ **Ruby 4.0 Compatibility:** Kreuzberg is fully compatible with Ruby 4.0 (released December 25, 2025) and all Ruby 4.x versions. The full test suite passes on Ruby 4.x, and the gem compiles without requiring any code changes. Key Ruby 4.0 features such as Ruby Box, the ZJIT compiler, and the Ractor improvements work seamlessly with Kreuzberg.
95
+
96
+
97
+
98
+ ## Quick Start
99
+
100
+ ### Basic Extraction
101
+
102
+ Extract text, metadata, and structure from any supported document format:
103
+
104
+ ```ruby
105
+ require 'kreuzberg'
106
+
107
+ result = Kreuzberg.extract_file_sync('document.pdf')
108
+
109
+ puts "Content:"
110
+ puts result.content
111
+
112
+ puts "\nMetadata:"
113
+ puts "Title: #{result.metadata&.dig('title')}"
114
+ puts "Author: #{result.metadata&.dig('author')}"
115
+
116
+ puts "\nTables found: #{result.tables.length}"
117
+ puts "Images found: #{result.images.length}"
118
+ ```
119
+
120
+
121
+ ### Common Use Cases
122
+
123
+ #### Extract with Custom Configuration
124
+
125
+ Most use cases benefit from configuration to control extraction behavior:
126
+
127
+
128
+ **With OCR (for scanned documents):**
129
+
130
+ ```ruby
131
+ require 'kreuzberg'
132
+
133
+ ocr_config = Kreuzberg::Config::OCR.new(
134
+ backend: 'tesseract',
135
+ language: 'eng'
136
+ )
137
+
138
+ config = Kreuzberg::Config::Extraction.new(ocr: ocr_config)
139
+ result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
140
+
141
+ puts "Extracted text from scanned document:"
142
+ puts result.content
143
+ puts "Used OCR backend: tesseract"
144
+ ```
145
+
146
+
147
+
148
+
149
+ #### Table Extraction
150
+
151
+
152
+ See [Table Extraction Guide](https://kreuzberg.dev/features/table-extraction/) for detailed examples.
153
+
154
+
155
+
156
+ #### Processing Multiple Files
157
+
158
+
159
+ ```ruby
160
+ require 'kreuzberg'
161
+
162
+ puts "Kreuzberg version: #{Kreuzberg::VERSION}"
163
+ puts "FFI bindings loaded successfully"
164
+
165
+ result = Kreuzberg.extract_file_sync('sample.pdf')
166
+ puts "Installation verified! Extracted #{result.content.length} characters"
167
+ ```
168
+
169
+
170
+
171
+
172
+
173
+ #### Async Processing
174
+
175
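+ Kreuzberg also supports non-blocking document processing. The example below shows the synchronous call with caching and quality processing enabled; see the API documentation for the asynchronous variants: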
176
+
177
+ ```ruby
178
+ require 'kreuzberg'
179
+
180
+ config = Kreuzberg::Config::Extraction.new(
181
+ use_cache: true,
182
+ enable_quality_processing: true
183
+ )
184
+
185
+ result = Kreuzberg.extract_file_sync('contract.pdf', config: config)
186
+
187
+ puts "Extracted #{result.content.length} characters"
188
+ puts "Quality score: #{result.metadata&.dig('quality_score')}"
189
+ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
190
+ ```
191
+
192
+
193
+
194
+
195
+
196
+
197
+ ### Next Steps
198
+
199
+ - **[Installation Guide](https://kreuzberg.dev/getting-started/installation/)** - Platform-specific setup
200
+ - **[API Documentation](https://kreuzberg.dev/api/)** - Complete API reference
201
+ - **[Examples & Guides](https://kreuzberg.dev/guides/)** - Full code examples and usage guides
202
+ - **[Configuration Guide](https://kreuzberg.dev/guides/configuration/)** - Advanced configuration options
203
+
204
+
205
+
206
+ ## Features
207
+
208
+ ### Supported File Formats (75+)
209
+
210
+ 75+ file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
211
+
212
+ #### Office Documents
213
+
214
+ | Category | Formats | Capabilities |
215
+ |----------|---------|--------------|
216
+ | **Word Processing** | `.docx`, `.odt` | Full text, tables, images, metadata, styles |
217
+ | **Spreadsheets** | `.xlsx`, `.xlsm`, `.xlsb`, `.xls`, `.xla`, `.xlam`, `.xltm`, `.ods` | Sheet data, formulas, cell metadata, charts |
218
+ | **Presentations** | `.pptx`, `.ppt`, `.ppsx` | Slides, speaker notes, images, metadata |
219
+ | **PDF** | `.pdf` | Text, tables, images, metadata, OCR support |
220
+ | **eBooks** | `.epub`, `.fb2` | Chapters, metadata, embedded resources |
221
+
222
+ #### Images (OCR-Enabled)
223
+
224
+ | Category | Formats | Features |
225
+ |----------|---------|----------|
226
+ | **Raster** | `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.tif` | OCR, table detection, EXIF metadata, dimensions, color space |
227
+ | **Advanced** | `.jp2`, `.jpx`, `.jpm`, `.mj2`, `.jbig2`, `.jb2`, `.pnm`, `.pbm`, `.pgm`, `.ppm` | OCR via hayro-jpeg2000 (pure Rust decoder), JBIG2 support, table detection, format-specific metadata |
228
+ | **Vector** | `.svg` | DOM parsing, embedded text, graphics metadata |
229
+
230
+ #### Web & Data
231
+
232
+ | Category | Formats | Features |
233
+ |----------|---------|----------|
234
+ | **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
235
+ | **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
236
+ | **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.djot`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, Djot, reStructuredText, Org Mode |
237
+
238
+ #### Email & Archives
239
+
240
+ | Category | Formats | Features |
241
+ |----------|---------|----------|
242
+ | **Email** | `.eml`, `.msg` | Headers, body (HTML/plain), attachments, threading |
243
+ | **Archives** | `.zip`, `.tar`, `.tgz`, `.gz`, `.7z` | File listing, nested archives, metadata |
244
+
245
+ #### Academic & Scientific
246
+
247
+ | Category | Formats | Features |
248
+ |----------|---------|----------|
249
+ | **Citations** | `.bib`, `.biblatex`, `.ris`, `.nbib`, `.enw`, `.csl` | Structured parsing of RIS, PubMed/MEDLINE, EndNote XML, BibTeX, and CSL JSON |
250
+ | **Scientific** | `.tex`, `.latex`, `.typst`, `.jats`, `.ipynb`, `.docbook` | LaTeX, Jupyter notebooks, PubMed JATS |
251
+ | **Documentation** | `.opml`, `.pod`, `.mdoc`, `.troff` | Technical documentation formats |
252
+
253
+ **[Complete Format Reference](https://kreuzberg.dev/reference/formats/)**
254
+
255
+ ### Key Capabilities
256
+
257
+ - **Text Extraction** - Extract all text content with position and formatting information
258
+ - **Metadata Extraction** - Retrieve document properties, creation date, author, etc.
259
+ - **Table Extraction** - Parse tables with structure and cell content preservation
260
+ - **Image Extraction** - Extract embedded images and render page previews
261
+ - **OCR Support** - Integrate multiple OCR backends for scanned documents
262
+
263
+ - **Async/Await** - Non-blocking document processing with concurrent operations
264
+
265
+
266
+ - **Plugin System** - Extensible post-processing for custom text transformation
267
+
268
+
269
+ - **Embeddings** - Generate vector embeddings using ONNX Runtime models
270
+
271
+ - **Batch Processing** - Efficiently process multiple documents in parallel
272
+ - **Memory Efficient** - Stream large files without loading entirely into memory
273
+ - **Language Detection** - Detect and support multiple languages in documents
274
+ - **Configuration** - Fine-grained control over extraction behavior
275
+
276
+ ### Performance Characteristics
277
+
278
+ | Format | Speed | Memory | Notes |
279
+ |--------|-------|--------|-------|
280
+ | **PDF (text)** | 10-100 MB/s | ~50MB per doc | Fastest extraction |
281
+ | **Office docs** | 20-200 MB/s | ~100MB per doc | DOCX, XLSX, PPTX |
282
+ | **Images (OCR)** | 1-5 MB/s | Variable | Depends on OCR backend |
283
+ | **Archives** | 5-50 MB/s | ~200MB per doc | ZIP, TAR, etc. |
284
+ | **Web formats** | 50-200 MB/s | Streaming | HTML, XML, JSON |
285
+
286
+
287
+
288
+ ## OCR Support
289
+
290
+ Kreuzberg supports multiple OCR backends for extracting text from scanned documents and images:
291
+
292
+
293
+ - **Tesseract**
294
+
295
+ - **PaddleOCR**
296
+
297
+
298
+ ### OCR Configuration Example
299
+
300
+ ```ruby
301
+ require 'kreuzberg'
302
+
303
+ ocr_config = Kreuzberg::Config::OCR.new(
304
+ backend: 'tesseract',
305
+ language: 'eng'
306
+ )
307
+
308
+ config = Kreuzberg::Config::Extraction.new(ocr: ocr_config)
309
+ result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
310
+
311
+ puts "Extracted text from scanned document:"
312
+ puts result.content
313
+ puts "Used OCR backend: tesseract"
314
+ ```
315
+
316
+
317
+
318
+
319
+ ## Async Support
320
+
321
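+ This binding supports non-blocking (asynchronous) document processing. The example below shows the synchronous call with caching and quality processing enabled; see the API documentation for the asynchronous variants: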
322
+
323
+ ```ruby
324
+ require 'kreuzberg'
325
+
326
+ config = Kreuzberg::Config::Extraction.new(
327
+ use_cache: true,
328
+ enable_quality_processing: true
329
+ )
330
+
331
+ result = Kreuzberg.extract_file_sync('contract.pdf', config: config)
332
+
333
+ puts "Extracted #{result.content.length} characters"
334
+ puts "Quality score: #{result.metadata&.dig('quality_score')}"
335
+ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
336
+ ```
337
+
338
+
339
+
340
+
341
+ ## Plugin System
342
+
343
+ Kreuzberg supports extensible post-processing plugins for custom text transformation and filtering.
344
+
345
+ For detailed plugin documentation, visit [Plugin System Guide](https://kreuzberg.dev/guides/plugins/).
346
+
347
+
348
+
349
+
350
+ ## Embeddings Support
351
+
352
+ Generate vector embeddings for extracted text using the built-in ONNX Runtime support. Requires ONNX Runtime installation.
353
+
354
+ **[Embeddings Guide](https://kreuzberg.dev/features/#embeddings)**
355
+
356
+
357
+
358
+ ## Batch Processing
359
+
360
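+ Kreuzberg can process multiple documents efficiently in a single batch; see the [Examples & Guides](https://kreuzberg.dev/guides/) for batch usage. The snippet below verifies that the gem is installed and can extract a single file: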
361
+
362
+ ```ruby
363
+ require 'kreuzberg'
364
+
365
+ puts "Kreuzberg version: #{Kreuzberg::VERSION}"
366
+ puts "FFI bindings loaded successfully"
367
+
368
+ result = Kreuzberg.extract_file_sync('sample.pdf')
369
+ puts "Installation verified! Extracted #{result.content.length} characters"
370
+ ```
371
+
372
+
373
+
374
+
375
+ ## Configuration
376
+
377
+ For advanced configuration options including language detection, table extraction, OCR settings, and more:
378
+
379
+ **[Configuration Guide](https://kreuzberg.dev/guides/configuration/)**
380
+
381
+ ## Documentation
382
+
383
+ - **[Official Documentation](https://kreuzberg.dev/)**
384
+ - **[API Reference](https://kreuzberg.dev/reference/api-ruby/)**
385
+ - **[Examples & Guides](https://kreuzberg.dev/guides/)**
386
+
387
+ ## Contributing
388
+
389
+ Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg-dev/kreuzberg/blob/main/CONTRIBUTING.md).
390
+
391
+ ## License
392
+
393
+ MIT License - see LICENSE file for details.
394
+
395
+ ## Support
396
+
397
+ - **Discord Community**: [Join our Discord](https://discord.gg/xt9WY3GnKR)
398
+ - **GitHub Issues**: [Report bugs](https://github.com/kreuzberg-dev/kreuzberg/issues)
399
+ - **Discussions**: [Ask questions](https://github.com/kreuzberg-dev/kreuzberg/discussions)
data/Rakefile ADDED
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rake/extensiontask'
5
+ require 'rspec/core/rake_task'
6
+
7
+ GEMSPEC = Gem::Specification.load(File.expand_path('kreuzberg.gemspec', __dir__))
8
+
9
+ # Vendor kreuzberg core crates before compilation
10
+ task :vendor do
11
+ vendor_script = File.expand_path('../../scripts/ci/ruby/vendor-kreuzberg-core.py', __dir__)
12
+ puts 'Vendoring kreuzberg core crates...'
13
+ sh "python3 #{vendor_script}"
14
+ end
15
+
16
+ Rake::ExtensionTask.new('kreuzberg_rb', GEMSPEC) do |ext|
17
+ ext.lib_dir = 'lib'
18
+ ext.ext_dir = 'ext/kreuzberg_rb'
19
+ ext.cross_compile = true
20
+ ext.cross_platform = %w[
21
+ x86_64-linux
22
+ aarch64-linux
23
+ x86_64-darwin
24
+ arm64-darwin
25
+ x64-mingw32
26
+ x64-mingw-ucrt
27
+ ]
28
+ end
29
+
30
+ RSpec::Core::RakeTask.new(:spec)
31
+
32
+ task compile: :vendor
33
+ task spec: :compile
34
+ task default: :spec