kreuzberg 4.0.0.rc2 → 4.0.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (446)
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +543 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +194 -6
  8. data/README.md +396 -426
  9. data/Rakefile +34 -25
  10. data/Steepfile +51 -47
  11. data/examples/async_patterns.rb +283 -341
  12. data/ext/kreuzberg_rb/extconf.rb +65 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
  14. data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
  15. data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
  23. data/extconf.rb +60 -28
  24. data/kreuzberg.gemspec +199 -148
  25. data/lib/kreuzberg/api_proxy.rb +126 -142
  26. data/lib/kreuzberg/cache_api.rb +67 -46
  27. data/lib/kreuzberg/cli.rb +47 -55
  28. data/lib/kreuzberg/cli_proxy.rb +117 -127
  29. data/lib/kreuzberg/config.rb +936 -691
  30. data/lib/kreuzberg/error_context.rb +136 -32
  31. data/lib/kreuzberg/errors.rb +116 -118
  32. data/lib/kreuzberg/extraction_api.rb +313 -85
  33. data/lib/kreuzberg/mcp_proxy.rb +177 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +15 -86
  36. data/lib/kreuzberg/result.rb +334 -216
  37. data/lib/kreuzberg/setup_lib_path.rb +99 -80
  38. data/lib/kreuzberg/types.rb +170 -0
  39. data/lib/kreuzberg/validator_protocol.rb +16 -89
  40. data/lib/kreuzberg/version.rb +5 -5
  41. data/lib/kreuzberg.rb +96 -103
  42. data/lib/libpdfium.so +0 -0
  43. data/sig/kreuzberg/internal.rbs +184 -184
  44. data/sig/kreuzberg.rbs +561 -520
  45. data/spec/binding/async_operations_spec.rb +473 -0
  46. data/spec/binding/batch_operations_spec.rb +595 -0
  47. data/spec/binding/batch_spec.rb +359 -0
  48. data/spec/binding/cache_spec.rb +227 -227
  49. data/spec/binding/cli_proxy_spec.rb +85 -85
  50. data/spec/binding/cli_spec.rb +55 -55
  51. data/spec/binding/config_result_spec.rb +377 -0
  52. data/spec/binding/config_spec.rb +419 -345
  53. data/spec/binding/config_validation_spec.rb +377 -283
  54. data/spec/binding/embeddings_spec.rb +816 -0
  55. data/spec/binding/error_handling_spec.rb +399 -213
  56. data/spec/binding/error_recovery_spec.rb +488 -0
  57. data/spec/binding/errors_spec.rb +66 -66
  58. data/spec/binding/font_config_spec.rb +220 -0
  59. data/spec/binding/images_spec.rb +738 -0
  60. data/spec/binding/keywords_extraction_spec.rb +600 -0
  61. data/spec/binding/metadata_types_spec.rb +1228 -0
  62. data/spec/binding/pages_extraction_spec.rb +471 -0
  63. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  64. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  65. data/spec/binding/plugins/validator_spec.rb +273 -274
  66. data/spec/binding/tables_spec.rb +641 -0
  67. data/spec/fixtures/config.toml +38 -39
  68. data/spec/fixtures/config.yaml +41 -41
  69. data/spec/fixtures/invalid_config.toml +3 -4
  70. data/spec/smoke/package_spec.rb +177 -178
  71. data/spec/spec_helper.rb +40 -42
  72. data/spec/unit/config/chunking_config_spec.rb +213 -0
  73. data/spec/unit/config/embedding_config_spec.rb +343 -0
  74. data/spec/unit/config/extraction_config_spec.rb +438 -0
  75. data/spec/unit/config/font_config_spec.rb +285 -0
  76. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  77. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  78. data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
  79. data/spec/unit/config/keyword_config_spec.rb +229 -0
  80. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  81. data/spec/unit/config/ocr_config_spec.rb +171 -0
  82. data/spec/unit/config/page_config_spec.rb +221 -0
  83. data/spec/unit/config/pdf_config_spec.rb +267 -0
  84. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  85. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  86. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  87. data/test/metadata_types_test.rb +959 -0
  88. data/vendor/Cargo.toml +61 -0
  89. data/vendor/kreuzberg/Cargo.toml +259 -204
  90. data/vendor/kreuzberg/README.md +263 -175
  91. data/vendor/kreuzberg/build.rs +782 -474
  92. data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
  93. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
  94. data/vendor/kreuzberg/src/api/error.rs +81 -81
  95. data/vendor/kreuzberg/src/api/handlers.rs +320 -199
  96. data/vendor/kreuzberg/src/api/mod.rs +94 -79
  97. data/vendor/kreuzberg/src/api/server.rs +518 -353
  98. data/vendor/kreuzberg/src/api/types.rs +206 -170
  99. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  100. data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
  101. data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
  102. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  103. data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
  104. data/vendor/kreuzberg/src/core/config.rs +1914 -1032
  105. data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
  106. data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
  107. data/vendor/kreuzberg/src/core/formats.rs +235 -0
  108. data/vendor/kreuzberg/src/core/io.rs +329 -329
  109. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  110. data/vendor/kreuzberg/src/core/mod.rs +61 -45
  111. data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
  112. data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
  113. data/vendor/kreuzberg/src/embeddings.rs +471 -432
  114. data/vendor/kreuzberg/src/error.rs +431 -431
  115. data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
  116. data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
  117. data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
  118. data/vendor/kreuzberg/src/extraction/email.rs +855 -854
  119. data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
  120. data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
  121. data/vendor/kreuzberg/src/extraction/image.rs +492 -368
  122. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
  123. data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
  124. data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
  125. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  126. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  127. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  128. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  129. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
  130. data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
  131. data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
  132. data/vendor/kreuzberg/src/extraction/table.rs +329 -328
  133. data/vendor/kreuzberg/src/extraction/text.rs +277 -269
  134. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  135. data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
  136. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
  137. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
  138. data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
  139. data/vendor/kreuzberg/src/extractors/email.rs +157 -143
  140. data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
  141. data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
  142. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
  143. data/vendor/kreuzberg/src/extractors/html.rs +419 -393
  144. data/vendor/kreuzberg/src/extractors/image.rs +219 -198
  145. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
  146. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
  147. data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
  149. data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
  150. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  151. data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
  152. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
  153. data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
  154. data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
  155. data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
  156. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
  157. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  158. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  159. data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
  160. data/vendor/kreuzberg/src/extractors/text.rs +265 -260
  161. data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
  162. data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
  163. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  164. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  165. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  166. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  167. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  168. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  169. data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
  170. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  171. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  172. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  173. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
  174. data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
  175. data/vendor/kreuzberg/src/lib.rs +114 -105
  176. data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
  177. data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
  178. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  179. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  180. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  181. data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
  182. data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
  183. data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
  184. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  185. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  186. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
  187. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  188. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  189. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  190. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  191. data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
  192. data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
  193. data/vendor/kreuzberg/src/pdf/error.rs +214 -122
  194. data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
  196. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  197. data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
  198. data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
  199. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
  200. data/vendor/kreuzberg/src/pdf/table.rs +417 -393
  201. data/vendor/kreuzberg/src/pdf/text.rs +553 -158
  202. data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
  203. data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
  204. data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
  205. data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
  206. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
  207. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  208. data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
  209. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  210. data/vendor/kreuzberg/src/text/mod.rs +27 -19
  211. data/vendor/kreuzberg/src/text/quality.rs +710 -697
  212. data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
  213. data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
  214. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  215. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  216. data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
  217. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
  218. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  219. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  220. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
  221. data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
  222. data/vendor/kreuzberg/src/types.rs +1713 -903
  223. data/vendor/kreuzberg/src/utils/mod.rs +31 -17
  224. data/vendor/kreuzberg/src/utils/pool.rs +503 -0
  225. data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
  226. data/vendor/kreuzberg/src/utils/quality.rs +968 -959
  227. data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
  228. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  229. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  230. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  231. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  232. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  233. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  234. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  235. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  236. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  237. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  238. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  239. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  240. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  241. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  242. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  243. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  244. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  245. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  246. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  247. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  248. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  249. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  250. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  251. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  252. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  253. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  254. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  255. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  256. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  257. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  258. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  259. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  260. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  261. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  262. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  263. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  264. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  265. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  266. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  267. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  268. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  269. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  270. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  271. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  272. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  273. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  274. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  275. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  276. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  277. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  278. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  279. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  280. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  281. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  282. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  283. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  284. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  285. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  286. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  287. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  288. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  289. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  290. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  291. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  292. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  293. data/vendor/kreuzberg/tests/api_embed.rs +360 -0
  294. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  295. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
  296. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
  297. data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
  298. data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
  299. data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
  300. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
  301. data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
  302. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  303. data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
  304. data/vendor/kreuzberg/tests/config_features.rs +612 -598
  305. data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
  306. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
  307. data/vendor/kreuzberg/tests/core_integration.rs +519 -510
  308. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  309. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
  310. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
  311. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  312. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  313. data/vendor/kreuzberg/tests/email_integration.rs +327 -325
  314. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  315. data/vendor/kreuzberg/tests/error_handling.rs +402 -393
  316. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  317. data/vendor/kreuzberg/tests/format_integration.rs +165 -159
  318. data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
  319. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  320. data/vendor/kreuzberg/tests/image_integration.rs +255 -253
  321. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  322. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  323. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  324. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  325. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  326. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  327. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  328. data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
  329. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
  330. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
  331. data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
  332. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
  333. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  334. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
  335. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  336. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  337. data/vendor/kreuzberg/tests/page_markers.rs +297 -0
  338. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
  339. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
  340. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
  341. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
  342. data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
  343. data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
  344. data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
  345. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
  346. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
  347. data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
  348. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  349. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
  350. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
  351. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
  352. data/vendor/kreuzberg/tests/security_validation.rs +416 -415
  353. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  354. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
  355. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
  356. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
  357. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  358. data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
  359. data/vendor/kreuzberg-ffi/README.md +851 -0
  360. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
  361. data/vendor/kreuzberg-ffi/build.rs +168 -0
  362. data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
  363. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  364. data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
  365. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
  366. data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
  367. data/vendor/kreuzberg-ffi/src/error.rs +901 -0
  368. data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
  369. data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
  370. data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
  371. data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
  372. data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
  373. data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
  374. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
  375. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
  376. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
  377. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
  378. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
  379. data/vendor/kreuzberg-ffi/src/result.rs +510 -0
  380. data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
  381. data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
  382. data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
  383. data/vendor/kreuzberg-ffi/src/types.rs +363 -0
  384. data/vendor/kreuzberg-ffi/src/util.rs +210 -0
  385. data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
  386. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  387. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  388. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  389. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  390. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  391. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  392. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  393. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  394. data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
  395. data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
  396. data/vendor/kreuzberg-tesseract/README.md +399 -0
  397. data/vendor/kreuzberg-tesseract/build.rs +1127 -0
  398. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  399. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  400. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  401. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  402. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  403. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  404. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  405. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  406. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  407. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  408. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  409. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  410. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  411. metadata +196 -45
  412. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  413. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  414. data/vendor/rb-sys/.cargo-ok +0 -1
  415. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  416. data/vendor/rb-sys/Cargo.lock +0 -393
  417. data/vendor/rb-sys/Cargo.toml +0 -70
  418. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  419. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  420. data/vendor/rb-sys/bin/release.sh +0 -21
  421. data/vendor/rb-sys/build/features.rs +0 -108
  422. data/vendor/rb-sys/build/main.rs +0 -246
  423. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  424. data/vendor/rb-sys/build/version.rs +0 -48
  425. data/vendor/rb-sys/readme.md +0 -36
  426. data/vendor/rb-sys/src/bindings.rs +0 -21
  427. data/vendor/rb-sys/src/hidden.rs +0 -11
  428. data/vendor/rb-sys/src/lib.rs +0 -34
  429. data/vendor/rb-sys/src/macros.rs +0 -371
  430. data/vendor/rb-sys/src/memory.rs +0 -53
  431. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  432. data/vendor/rb-sys/src/special_consts.rs +0 -31
  433. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  434. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  435. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  436. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  437. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  438. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  439. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  440. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  441. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  442. data/vendor/rb-sys/src/stable_api.rs +0 -261
  443. data/vendor/rb-sys/src/symbol.rs +0 -31
  444. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  445. data/vendor/rb-sys/src/utils.rs +0 -89
  446. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -1,85 +1,313 @@
1
- # frozen_string_literal: true
2
-
3
- module Kreuzberg
4
- # Provides extraction methods for documents and text.
5
- module ExtractionAPI
6
- def extract_file_sync(path, mime_type: nil, config: nil)
7
- opts = normalize_config(config)
8
- hash = if mime_type
9
- native_extract_file_sync(path.to_s, mime_type.to_s, **opts)
10
- else
11
- native_extract_file_sync(path.to_s, **opts)
12
- end
13
- result = Result.new(hash)
14
- record_cache_entry!(result, opts)
15
- result
16
- end
17
-
18
- def extract_bytes_sync(data, mime_type, config: nil)
19
- opts = normalize_config(config)
20
- hash = native_extract_bytes_sync(data.to_s, mime_type.to_s, **opts)
21
- result = Result.new(hash)
22
- record_cache_entry!(result, opts)
23
- result
24
- end
25
-
26
- def batch_extract_files_sync(paths, config: nil)
27
- opts = normalize_config(config)
28
- hashes = native_batch_extract_files_sync(paths.map(&:to_s), **opts)
29
- results = hashes.map { |hash| Result.new(hash) }
30
- record_cache_entry!(results, opts)
31
- results
32
- end
33
-
34
- def extract_file(path, mime_type: nil, config: nil)
35
- opts = normalize_config(config)
36
- hash = if mime_type
37
- native_extract_file(path.to_s, mime_type.to_s, **opts)
38
- else
39
- native_extract_file(path.to_s, **opts)
40
- end
41
- result = Result.new(hash)
42
- record_cache_entry!(result, opts)
43
- result
44
- end
45
-
46
- def extract_bytes(data, mime_type, config: nil)
47
- opts = normalize_config(config)
48
- hash = native_extract_bytes(data.to_s, mime_type.to_s, **opts)
49
- result = Result.new(hash)
50
- record_cache_entry!(result, opts)
51
- result
52
- end
53
-
54
- def batch_extract_files(paths, config: nil)
55
- opts = normalize_config(config)
56
- hashes = native_batch_extract_files(paths.map(&:to_s), **opts)
57
- results = hashes.map { |hash| Result.new(hash) }
58
- record_cache_entry!(results, opts)
59
- results
60
- end
61
-
62
- def batch_extract_bytes_sync(data_array, mime_types, config: nil)
63
- opts = normalize_config(config)
64
- hashes = native_batch_extract_bytes_sync(data_array.map(&:to_s), mime_types.map(&:to_s), **opts)
65
- results = hashes.map { |hash| Result.new(hash) }
66
- record_cache_entry!(results, opts)
67
- results
68
- end
69
-
70
- def batch_extract_bytes(data_array, mime_types, config: nil)
71
- opts = normalize_config(config)
72
- hashes = native_batch_extract_bytes(data_array.map(&:to_s), mime_types.map(&:to_s), **opts)
73
- results = hashes.map { |hash| Result.new(hash) }
74
- record_cache_entry!(results, opts)
75
- results
76
- end
77
-
78
- def normalize_config(config)
79
- return {} if config.nil?
80
- return config if config.is_a?(Hash)
81
-
82
- config.to_h
83
- end
84
- end
85
- end
1
# frozen_string_literal: true

module Kreuzberg
  # Ruby-facing extraction entry points.
  #
  # Every public extraction method follows the same steps: normalize the caller's
  # configuration into an options hash, forward the request to the matching
  # +native_*+ method, wrap each returned hash in a {Result}, and hand the
  # result(s) to +record_cache_entry!+. The +native_*+ methods and
  # +record_cache_entry!+ are not defined in this module.
  #
  # All extraction methods take keyword arguments only.
  module ExtractionAPI
    # Synchronously extract content from a file.
    #
    # @param path [String, Pathname] Path to the document file to extract
    # @param mime_type [String, nil] Optional MIME type for the file (e.g., 'application/pdf').
    #   When omitted, only the path is forwarded to the native extractor.
    # @param config [Config::Extraction, Hash, nil] Extraction configuration. Accepts
    #   either a {Config::Extraction} object or a configuration hash.
    #
    # @return [Result] Extraction result containing content, metadata, tables, and images
    #
    # @raise [Errors::IOError] If the file cannot be read or access is denied
    # @raise [Errors::ParsingError] If document parsing fails
    # @raise [Errors::UnsupportedFormatError] If the file format is not supported
    # @raise [Errors::OCRError] If OCR is enabled and fails
    # @raise [Errors::MissingDependencyError] If a required dependency is missing
    #
    # @example Extract a PDF file
    #   result = Kreuzberg.extract_file_sync(path: "document.pdf")
    #   puts result.content
    #
    # @example Extract with explicit MIME type
    #   result = Kreuzberg.extract_file_sync(path: "document", mime_type: "application/pdf")
    #
    # @example Extract with OCR enabled
    #   config = Kreuzberg::Config::Extraction.new(force_ocr: true)
    #   result = Kreuzberg.extract_file_sync(path: "scan.pdf", config: config)
    def extract_file_sync(path:, mime_type: nil, config: nil)
      opts = normalize_config(config)
      # The MIME type is only passed positionally when the caller supplied one.
      native_args = [path.to_s]
      native_args << mime_type.to_s if mime_type
      result = Result.new(native_extract_file_sync(*native_args, **opts))
      record_cache_entry!(result, opts)
      result
    end

    # Synchronously extract content from byte data.
    #
    # Performs document extraction directly from binary data in memory. Useful for
    # extracting content from files already loaded into memory or from network streams.
    #
    # @param data [String] Binary document data (can contain any byte values)
    # @param mime_type [String] MIME type of the data (required, e.g., 'application/pdf').
    #   This parameter is mandatory to guide the extraction engine.
    # @param config [Config::Extraction, Hash, nil] Extraction configuration. Accepts
    #   either a {Config::Extraction} object or a configuration hash.
    #
    # @return [Result] Extraction result containing content, metadata, tables, and images
    #
    # @raise [Errors::ParsingError] If document parsing fails
    # @raise [Errors::UnsupportedFormatError] If the MIME type is not supported
    # @raise [Errors::OCRError] If OCR is enabled and fails
    # @raise [Errors::MissingDependencyError] If a required dependency is missing
    #
    # @example Extract PDF from memory
    #   pdf_data = File.read("document.pdf", binmode: true)
    #   result = Kreuzberg.extract_bytes_sync(data: pdf_data, mime_type: "application/pdf")
    #   puts result.content
    #
    # @example Extract from a network stream
    #   response = HTTParty.get("https://example.com/document.docx")
    #   result = Kreuzberg.extract_bytes_sync(
    #     data: response.body,
    #     mime_type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    #   )
    def extract_bytes_sync(data:, mime_type:, config: nil)
      opts = normalize_config(config)
      result = Result.new(native_extract_bytes_sync(data.to_s, mime_type.to_s, **opts))
      record_cache_entry!(result, opts)
      result
    end

    # Synchronously extract content from multiple files.
    #
    # Processes multiple files in a single batch operation. Files are extracted sequentially,
    # and results maintain the same order as the input paths. This is useful for bulk
    # processing multiple documents with consistent configuration.
    #
    # @param paths [Array<String, Pathname>] Array of file paths to extract. Each path
    #   is converted to a string and MIME type is auto-detected from extension.
    # @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all files.
    #   Accepts either a {Config::Extraction} object or a configuration hash.
    #
    # @return [Array<Result>] Array of extraction results in the same order as input paths.
    #   Array length matches the input paths length.
    #
    # @raise [Errors::IOError] If any file cannot be read
    # @raise [Errors::ParsingError] If any document parsing fails
    # @raise [Errors::UnsupportedFormatError] If any file format is not supported
    # @raise [Errors::OCRError] If OCR is enabled and fails on any document
    # @raise [Errors::MissingDependencyError] If a required dependency is missing
    #
    # @example Batch extract multiple PDFs
    #   paths = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
    #   results = Kreuzberg.batch_extract_files_sync(paths: paths)
    #   results.each_with_index do |result, idx|
    #     puts "File #{idx}: #{result.content.length} characters"
    #   end
    #
    # @example Batch extract with consistent configuration
    #   paths = Dir.glob("documents/*.pdf")
    #   config = Kreuzberg::Config::Extraction.new(force_ocr: true)
    #   results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
    def batch_extract_files_sync(paths:, config: nil)
      opts = normalize_config(config)
      hashes = native_batch_extract_files_sync(paths.map(&:to_s), **opts)
      results = hashes.map { |hash| Result.new(hash) }
      # The whole result array is handed to the cache hook in one call.
      record_cache_entry!(results, opts)
      results
    end

    # Asynchronously extract content from a file.
    #
    # Forwards to the native +native_extract_file+ entry point (the non-+_sync+
    # variant) and wraps its return value in a {Result}. The Ruby method returns a
    # fully built {Result}, not a promise or future object.
    #
    # NOTE(review): earlier documentation described the native side as running the
    # extraction in the background on native threads or the Tokio runtime; that
    # scheduling is not visible from this file -- confirm against the native extension.
    #
    # @param path [String, Pathname] Path to the document file to extract
    # @param mime_type [String, nil] Optional MIME type for the file (e.g., 'application/pdf').
    #   If omitted, type is detected from file extension.
    # @param config [Config::Extraction, Hash, nil] Extraction configuration. Accepts
    #   either a {Config::Extraction} object or a configuration hash.
    #
    # @return [Result] Extraction result containing content, metadata, tables, and images.
    #   The result is available upon method return.
    #
    # @raise [Errors::IOError] If the file cannot be read or access is denied
    # @raise [Errors::ParsingError] If document parsing fails
    # @raise [Errors::UnsupportedFormatError] If the file format is not supported
    # @raise [Errors::OCRError] If OCR is enabled and fails
    # @raise [Errors::MissingDependencyError] If a required dependency is missing
    #
    # @example Extract a PDF file asynchronously
    #   result = Kreuzberg.extract_file(path: "large_document.pdf")
    #   puts result.content
    #
    # @example Extract with custom OCR configuration
    #   config = Kreuzberg::Config::Extraction.new(
    #     ocr: Kreuzberg::Config::OCR.new(language: "deu")
    #   )
    #   result = Kreuzberg.extract_file(path: "document.pdf", config: config)
    def extract_file(path:, mime_type: nil, config: nil)
      opts = normalize_config(config)
      # The MIME type is only passed positionally when the caller supplied one.
      native_args = [path.to_s]
      native_args << mime_type.to_s if mime_type
      result = Result.new(native_extract_file(*native_args, **opts))
      record_cache_entry!(result, opts)
      result
    end

    # Asynchronously extract content from byte data.
    #
    # Extraction from in-memory binary data through the native +native_extract_bytes+
    # entry point (the non-+_sync+ variant). Like {#extract_file}, the Ruby method
    # returns a fully built {Result}.
    #
    # NOTE(review): whether the native call avoids blocking the calling thread is not
    # visible from this file -- confirm against the native extension.
    #
    # @param data [String] Binary document data (can contain any byte values)
    # @param mime_type [String] MIME type of the data (required, e.g., 'application/pdf').
    #   This parameter is mandatory to guide the extraction engine.
    # @param config [Config::Extraction, Hash, nil] Extraction configuration. Accepts
    #   either a {Config::Extraction} object or a configuration hash.
    #
    # @return [Result] Extraction result containing content, metadata, tables, and images
    #
    # @raise [Errors::ParsingError] If document parsing fails
    # @raise [Errors::UnsupportedFormatError] If the MIME type is not supported
    # @raise [Errors::OCRError] If OCR is enabled and fails
    # @raise [Errors::MissingDependencyError] If a required dependency is missing
    #
    # @example Extract PDF from memory asynchronously
    #   pdf_data = File.read("document.pdf", binmode: true)
    #   result = Kreuzberg.extract_bytes(data: pdf_data, mime_type: "application/pdf")
    #   puts result.content
    #
    # @example Extract with image extraction
    #   data = File.read("file.docx", binmode: true)
    #   config = Kreuzberg::Config::Extraction.new(
    #     image_extraction: Kreuzberg::Config::ImageExtraction.new(extract_images: true)
    #   )
    #   result = Kreuzberg.extract_bytes(
    #     data: data,
    #     mime_type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    #     config: config
    #   )
    def extract_bytes(data:, mime_type:, config: nil)
      opts = normalize_config(config)
      result = Result.new(native_extract_bytes(data.to_s, mime_type.to_s, **opts))
      record_cache_entry!(result, opts)
      result
    end

    # Asynchronously extract content from multiple files.
    #
    # Batch extraction from multiple files through the native
    # +native_batch_extract_files+ entry point (the non-+_sync+ variant). Results
    # maintain the same order as input paths.
    #
    # NOTE(review): whether the native call avoids blocking the calling thread is not
    # visible from this file -- confirm against the native extension.
    #
    # @param paths [Array<String, Pathname>] Array of file paths to extract. Each path
    #   is converted to a string and MIME type is auto-detected from extension.
    # @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all files.
    #   Accepts either a {Config::Extraction} object or a configuration hash.
    #
    # @return [Array<Result>] Array of extraction results in the same order as input paths.
    #   Array length matches the input paths length.
    #
    # @raise [Errors::IOError] If any file cannot be read
    # @raise [Errors::ParsingError] If any document parsing fails
    # @raise [Errors::UnsupportedFormatError] If any file format is not supported
    # @raise [Errors::OCRError] If OCR is enabled and fails on any document
    # @raise [Errors::MissingDependencyError] If a required dependency is missing
    #
    # @example Batch extract multiple files asynchronously
    #   paths = ["invoice_1.pdf", "invoice_2.pdf", "invoice_3.pdf"]
    #   results = Kreuzberg.batch_extract_files(paths: paths)
    #   results.each_with_index do |result, idx|
    #     puts "Invoice #{idx}: #{result.detected_languages}"
    #   end
    #
    # @example Batch extract with chunking
    #   paths = Dir.glob("reports/*.docx")
    #   config = Kreuzberg::Config::Extraction.new(
    #     chunking: Kreuzberg::Config::Chunking.new(max_chars: 1000, max_overlap: 200)
    #   )
    #   results = Kreuzberg.batch_extract_files(paths: paths, config: config)
    def batch_extract_files(paths:, config: nil)
      opts = normalize_config(config)
      hashes = native_batch_extract_files(paths.map(&:to_s), **opts)
      results = hashes.map { |hash| Result.new(hash) }
      # The whole result array is handed to the cache hook in one call.
      record_cache_entry!(results, opts)
      results
    end

    # Synchronously extract content from multiple byte data sources.
    #
    # Processes multiple in-memory binary documents in a single batch operation. Results
    # maintain the same order as the input data array. The mime_types array must have
    # the same length as the data_array; this is checked before the native call.
    #
    # @param data_array [Array<String>] Array of binary document data. Each element can
    #   contain any byte values (e.g., PDF binary data).
    # @param mime_types [Array<String>] Array of MIME types corresponding to each data item.
    #   Must be the same length as data_array (e.g., ["application/pdf", "application/msword"]).
    # @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all items.
    #   Accepts either a {Config::Extraction} object or a configuration hash.
    #
    # @return [Array<Result>] Array of extraction results in the same order as input data.
    #   Array length matches the data_array length.
    #
    # @raise [ArgumentError] If data_array and mime_types have different lengths
    # @raise [Errors::ParsingError] If any document parsing fails
    # @raise [Errors::UnsupportedFormatError] If any MIME type is not supported
    # @raise [Errors::OCRError] If OCR is enabled and fails on any document
    # @raise [Errors::MissingDependencyError] If a required dependency is missing
    #
    # @example Batch extract binary documents
    #   pdf_data_1 = File.read("doc1.pdf", binmode: true)
    #   pdf_data_2 = File.read("doc2.pdf", binmode: true)
    #   docx_data = File.read("report.docx", binmode: true)
    #
    #   data = [pdf_data_1, pdf_data_2, docx_data]
    #   types = [
    #     "application/pdf",
    #     "application/pdf",
    #     "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    #   ]
    #   results = Kreuzberg.batch_extract_bytes_sync(data_array: data, mime_types: types)
    #   results.each { |r| puts r.content }
    def batch_extract_bytes_sync(data_array:, mime_types:, config: nil)
      payloads = data_array.map(&:to_s)
      types = mime_types.map(&:to_s)
      # Enforce the documented contract here so a mismatch fails with a clear message
      # before any native work starts.
      if payloads.length != types.length
        raise ArgumentError,
              'data_array and mime_types must have the same length ' \
              "(got #{payloads.length} and #{types.length})"
      end

      opts = normalize_config(config)
      hashes = native_batch_extract_bytes_sync(payloads, types, **opts)
      results = hashes.map { |hash| Result.new(hash) }
      record_cache_entry!(results, opts)
      results
    end

    # Asynchronously extract content from multiple byte data sources.
    #
    # Batch extraction from multiple in-memory binary documents through the native
    # +native_batch_extract_bytes+ entry point (the non-+_sync+ variant). Results
    # maintain the same order as the input data array. The mime_types array must have
    # the same length as the data_array; this is checked before the native call.
    #
    # NOTE(review): whether the native call avoids blocking the calling thread is not
    # visible from this file -- confirm against the native extension.
    #
    # @param data_array [Array<String>] Array of binary document data. Each element can
    #   contain any byte values (e.g., PDF binary data).
    # @param mime_types [Array<String>] Array of MIME types corresponding to each data item.
    #   Must be the same length as data_array (e.g., ["application/pdf", "application/msword"]).
    # @param config [Config::Extraction, Hash, nil] Extraction configuration applied to all items.
    #   Accepts either a {Config::Extraction} object or a configuration hash.
    #
    # @return [Array<Result>] Array of extraction results in the same order as input data.
    #   Array length matches the data_array length.
    #
    # @raise [ArgumentError] If data_array and mime_types have different lengths
    # @raise [Errors::ParsingError] If any document parsing fails
    # @raise [Errors::UnsupportedFormatError] If any MIME type is not supported
    # @raise [Errors::OCRError] If OCR is enabled and fails on any document
    # @raise [Errors::MissingDependencyError] If a required dependency is missing
    #
    # @example Batch extract uploaded documents asynchronously
    #   # From a web request with multiple file uploads
    #   uploaded_files = params[:files] # Array of uploaded file objects
    #   data = uploaded_files.map(&:read)
    #   types = uploaded_files.map(&:content_type)
    #
    #   results = Kreuzberg.batch_extract_bytes(data_array: data, mime_types: types)
    #   results.each { |r| puts r.content }
    #
    # @example Batch extract with OCR
    #   data = [scan_1_bytes, scan_2_bytes, scan_3_bytes]
    #   types = ["image/png", "image/png", "image/png"]
    #   config = Kreuzberg::Config::Extraction.new(force_ocr: true)
    #   results = Kreuzberg.batch_extract_bytes(data_array: data, mime_types: types, config: config)
    def batch_extract_bytes(data_array:, mime_types:, config: nil)
      payloads = data_array.map(&:to_s)
      types = mime_types.map(&:to_s)
      # Enforce the documented contract here so a mismatch fails with a clear message
      # before any native work starts.
      if payloads.length != types.length
        raise ArgumentError,
              'data_array and mime_types must have the same length ' \
              "(got #{payloads.length} and #{types.length})"
      end

      opts = normalize_config(config)
      hashes = native_batch_extract_bytes(payloads, types, **opts)
      results = hashes.map { |hash| Result.new(hash) }
      record_cache_entry!(results, opts)
      results
    end

    # Convert a caller-supplied configuration into the options hash that is splatted
    # into the native calls.
    #
    # @param config [Config::Extraction, Hash, nil] Configuration object, hash, or nil
    # @return [Hash] An empty hash for nil, the given hash itself (not a copy) for a
    #   Hash, otherwise the value of +config.to_h+
    def normalize_config(config)
      return {} if config.nil?
      return config if config.is_a?(Hash)

      config.to_h
    end
  end
end