kreuzberg 4.0.0.rc2 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (446) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +543 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +194 -6
  8. data/README.md +396 -426
  9. data/Rakefile +34 -25
  10. data/Steepfile +51 -47
  11. data/examples/async_patterns.rb +283 -341
  12. data/ext/kreuzberg_rb/extconf.rb +65 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
  14. data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
  15. data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
  23. data/extconf.rb +60 -28
  24. data/kreuzberg.gemspec +199 -148
  25. data/lib/kreuzberg/api_proxy.rb +126 -142
  26. data/lib/kreuzberg/cache_api.rb +67 -46
  27. data/lib/kreuzberg/cli.rb +47 -55
  28. data/lib/kreuzberg/cli_proxy.rb +117 -127
  29. data/lib/kreuzberg/config.rb +936 -691
  30. data/lib/kreuzberg/error_context.rb +136 -32
  31. data/lib/kreuzberg/errors.rb +116 -118
  32. data/lib/kreuzberg/extraction_api.rb +313 -85
  33. data/lib/kreuzberg/mcp_proxy.rb +177 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +15 -86
  36. data/lib/kreuzberg/result.rb +334 -216
  37. data/lib/kreuzberg/setup_lib_path.rb +99 -80
  38. data/lib/kreuzberg/types.rb +170 -0
  39. data/lib/kreuzberg/validator_protocol.rb +16 -89
  40. data/lib/kreuzberg/version.rb +5 -5
  41. data/lib/kreuzberg.rb +96 -103
  42. data/lib/libpdfium.so +0 -0
  43. data/sig/kreuzberg/internal.rbs +184 -184
  44. data/sig/kreuzberg.rbs +561 -520
  45. data/spec/binding/async_operations_spec.rb +473 -0
  46. data/spec/binding/batch_operations_spec.rb +595 -0
  47. data/spec/binding/batch_spec.rb +359 -0
  48. data/spec/binding/cache_spec.rb +227 -227
  49. data/spec/binding/cli_proxy_spec.rb +85 -85
  50. data/spec/binding/cli_spec.rb +55 -55
  51. data/spec/binding/config_result_spec.rb +377 -0
  52. data/spec/binding/config_spec.rb +419 -345
  53. data/spec/binding/config_validation_spec.rb +377 -283
  54. data/spec/binding/embeddings_spec.rb +816 -0
  55. data/spec/binding/error_handling_spec.rb +399 -213
  56. data/spec/binding/error_recovery_spec.rb +488 -0
  57. data/spec/binding/errors_spec.rb +66 -66
  58. data/spec/binding/font_config_spec.rb +220 -0
  59. data/spec/binding/images_spec.rb +738 -0
  60. data/spec/binding/keywords_extraction_spec.rb +600 -0
  61. data/spec/binding/metadata_types_spec.rb +1228 -0
  62. data/spec/binding/pages_extraction_spec.rb +471 -0
  63. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  64. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  65. data/spec/binding/plugins/validator_spec.rb +273 -274
  66. data/spec/binding/tables_spec.rb +641 -0
  67. data/spec/fixtures/config.toml +38 -39
  68. data/spec/fixtures/config.yaml +41 -41
  69. data/spec/fixtures/invalid_config.toml +3 -4
  70. data/spec/smoke/package_spec.rb +177 -178
  71. data/spec/spec_helper.rb +40 -42
  72. data/spec/unit/config/chunking_config_spec.rb +213 -0
  73. data/spec/unit/config/embedding_config_spec.rb +343 -0
  74. data/spec/unit/config/extraction_config_spec.rb +438 -0
  75. data/spec/unit/config/font_config_spec.rb +285 -0
  76. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  77. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  78. data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
  79. data/spec/unit/config/keyword_config_spec.rb +229 -0
  80. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  81. data/spec/unit/config/ocr_config_spec.rb +171 -0
  82. data/spec/unit/config/page_config_spec.rb +221 -0
  83. data/spec/unit/config/pdf_config_spec.rb +267 -0
  84. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  85. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  86. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  87. data/test/metadata_types_test.rb +959 -0
  88. data/vendor/Cargo.toml +61 -0
  89. data/vendor/kreuzberg/Cargo.toml +259 -204
  90. data/vendor/kreuzberg/README.md +263 -175
  91. data/vendor/kreuzberg/build.rs +782 -474
  92. data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
  93. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
  94. data/vendor/kreuzberg/src/api/error.rs +81 -81
  95. data/vendor/kreuzberg/src/api/handlers.rs +320 -199
  96. data/vendor/kreuzberg/src/api/mod.rs +94 -79
  97. data/vendor/kreuzberg/src/api/server.rs +518 -353
  98. data/vendor/kreuzberg/src/api/types.rs +206 -170
  99. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  100. data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
  101. data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
  102. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  103. data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
  104. data/vendor/kreuzberg/src/core/config.rs +1914 -1032
  105. data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
  106. data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
  107. data/vendor/kreuzberg/src/core/formats.rs +235 -0
  108. data/vendor/kreuzberg/src/core/io.rs +329 -329
  109. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  110. data/vendor/kreuzberg/src/core/mod.rs +61 -45
  111. data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
  112. data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
  113. data/vendor/kreuzberg/src/embeddings.rs +471 -432
  114. data/vendor/kreuzberg/src/error.rs +431 -431
  115. data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
  116. data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
  117. data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
  118. data/vendor/kreuzberg/src/extraction/email.rs +855 -854
  119. data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
  120. data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
  121. data/vendor/kreuzberg/src/extraction/image.rs +492 -368
  122. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
  123. data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
  124. data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
  125. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  126. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  127. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  128. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  129. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
  130. data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
  131. data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
  132. data/vendor/kreuzberg/src/extraction/table.rs +329 -328
  133. data/vendor/kreuzberg/src/extraction/text.rs +277 -269
  134. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  135. data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
  136. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
  137. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
  138. data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
  139. data/vendor/kreuzberg/src/extractors/email.rs +157 -143
  140. data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
  141. data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
  142. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
  143. data/vendor/kreuzberg/src/extractors/html.rs +419 -393
  144. data/vendor/kreuzberg/src/extractors/image.rs +219 -198
  145. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
  146. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
  147. data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
  149. data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
  150. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  151. data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
  152. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
  153. data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
  154. data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
  155. data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
  156. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
  157. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  158. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  159. data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
  160. data/vendor/kreuzberg/src/extractors/text.rs +265 -260
  161. data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
  162. data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
  163. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  164. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  165. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  166. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  167. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  168. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  169. data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
  170. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  171. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  172. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  173. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
  174. data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
  175. data/vendor/kreuzberg/src/lib.rs +114 -105
  176. data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
  177. data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
  178. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  179. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  180. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  181. data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
  182. data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
  183. data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
  184. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  185. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  186. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
  187. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  188. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  189. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  190. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  191. data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
  192. data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
  193. data/vendor/kreuzberg/src/pdf/error.rs +214 -122
  194. data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
  196. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  197. data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
  198. data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
  199. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
  200. data/vendor/kreuzberg/src/pdf/table.rs +417 -393
  201. data/vendor/kreuzberg/src/pdf/text.rs +553 -158
  202. data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
  203. data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
  204. data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
  205. data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
  206. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
  207. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  208. data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
  209. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  210. data/vendor/kreuzberg/src/text/mod.rs +27 -19
  211. data/vendor/kreuzberg/src/text/quality.rs +710 -697
  212. data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
  213. data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
  214. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  215. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  216. data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
  217. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
  218. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  219. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  220. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
  221. data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
  222. data/vendor/kreuzberg/src/types.rs +1713 -903
  223. data/vendor/kreuzberg/src/utils/mod.rs +31 -17
  224. data/vendor/kreuzberg/src/utils/pool.rs +503 -0
  225. data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
  226. data/vendor/kreuzberg/src/utils/quality.rs +968 -959
  227. data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
  228. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  229. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  230. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  231. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  232. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  233. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  234. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  235. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  236. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  237. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  238. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  239. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  240. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  241. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  242. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  243. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  244. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  245. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  246. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  247. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  248. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  249. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  250. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  251. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  252. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  253. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  254. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  255. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  256. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  257. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  258. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  259. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  260. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  261. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  262. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  263. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  264. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  265. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  266. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  267. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  268. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  269. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  270. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  271. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  272. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  273. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  274. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  275. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  276. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  277. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  278. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  279. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  280. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  281. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  282. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  283. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  284. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  285. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  286. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  287. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  288. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  289. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  290. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  291. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  292. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  293. data/vendor/kreuzberg/tests/api_embed.rs +360 -0
  294. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  295. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
  296. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
  297. data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
  298. data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
  299. data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
  300. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
  301. data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
  302. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  303. data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
  304. data/vendor/kreuzberg/tests/config_features.rs +612 -598
  305. data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
  306. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
  307. data/vendor/kreuzberg/tests/core_integration.rs +519 -510
  308. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  309. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
  310. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
  311. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  312. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  313. data/vendor/kreuzberg/tests/email_integration.rs +327 -325
  314. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  315. data/vendor/kreuzberg/tests/error_handling.rs +402 -393
  316. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  317. data/vendor/kreuzberg/tests/format_integration.rs +165 -159
  318. data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
  319. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  320. data/vendor/kreuzberg/tests/image_integration.rs +255 -253
  321. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  322. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  323. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  324. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  325. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  326. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  327. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  328. data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
  329. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
  330. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
  331. data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
  332. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
  333. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  334. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
  335. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  336. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  337. data/vendor/kreuzberg/tests/page_markers.rs +297 -0
  338. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
  339. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
  340. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
  341. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
  342. data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
  343. data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
  344. data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
  345. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
  346. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
  347. data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
  348. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  349. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
  350. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
  351. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
  352. data/vendor/kreuzberg/tests/security_validation.rs +416 -415
  353. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  354. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
  355. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
  356. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
  357. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  358. data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
  359. data/vendor/kreuzberg-ffi/README.md +851 -0
  360. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
  361. data/vendor/kreuzberg-ffi/build.rs +168 -0
  362. data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
  363. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  364. data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
  365. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
  366. data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
  367. data/vendor/kreuzberg-ffi/src/error.rs +901 -0
  368. data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
  369. data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
  370. data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
  371. data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
  372. data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
  373. data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
  374. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
  375. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
  376. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
  377. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
  378. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
  379. data/vendor/kreuzberg-ffi/src/result.rs +510 -0
  380. data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
  381. data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
  382. data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
  383. data/vendor/kreuzberg-ffi/src/types.rs +363 -0
  384. data/vendor/kreuzberg-ffi/src/util.rs +210 -0
  385. data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
  386. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  387. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  388. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  389. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  390. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  391. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  392. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  393. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  394. data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
  395. data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
  396. data/vendor/kreuzberg-tesseract/README.md +399 -0
  397. data/vendor/kreuzberg-tesseract/build.rs +1127 -0
  398. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  399. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  400. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  401. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  402. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  403. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  404. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  405. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  406. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  407. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  408. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  409. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  410. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  411. metadata +196 -45
  412. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  413. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  414. data/vendor/rb-sys/.cargo-ok +0 -1
  415. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  416. data/vendor/rb-sys/Cargo.lock +0 -393
  417. data/vendor/rb-sys/Cargo.toml +0 -70
  418. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  419. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  420. data/vendor/rb-sys/bin/release.sh +0 -21
  421. data/vendor/rb-sys/build/features.rs +0 -108
  422. data/vendor/rb-sys/build/main.rs +0 -246
  423. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  424. data/vendor/rb-sys/build/version.rs +0 -48
  425. data/vendor/rb-sys/readme.md +0 -36
  426. data/vendor/rb-sys/src/bindings.rs +0 -21
  427. data/vendor/rb-sys/src/hidden.rs +0 -11
  428. data/vendor/rb-sys/src/lib.rs +0 -34
  429. data/vendor/rb-sys/src/macros.rs +0 -371
  430. data/vendor/rb-sys/src/memory.rs +0 -53
  431. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  432. data/vendor/rb-sys/src/special_consts.rs +0 -31
  433. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  434. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  435. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  436. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  437. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  438. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  439. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  440. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  441. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  442. data/vendor/rb-sys/src/stable_api.rs +0 -261
  443. data/vendor/rb-sys/src/symbol.rs +0 -31
  444. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  445. data/vendor/rb-sys/src/utils.rs +0 -89
  446. data/vendor/rb-sys/src/value_type.rs +0 -7
data/sig/kreuzberg.rbs CHANGED
@@ -1,520 +1,561 @@
1
- # Type signatures for Kreuzberg document intelligence framework
2
-
3
- module Kreuzberg
4
- VERSION: String
5
-
6
- # Error code constants
7
- ERROR_CODE_SUCCESS: Integer
8
- ERROR_CODE_GENERIC: Integer
9
- ERROR_CODE_PANIC: Integer
10
- ERROR_CODE_INVALID_ARGUMENT: Integer
11
- ERROR_CODE_IO: Integer
12
- ERROR_CODE_PARSING: Integer
13
- ERROR_CODE_OCR: Integer
14
- ERROR_CODE_MISSING_DEPENDENCY: Integer
15
-
16
- # Config namespace (defined in lib/kreuzberg/config.rb)
17
- module Config
18
- class OCR
19
- attr_reader backend: String
20
- attr_reader language: String
21
- attr_reader tesseract_config: Tesseract?
22
-
23
- def initialize: (?backend: String, ?language: String, ?tesseract_config: (Tesseract | Hash[Symbol, untyped])?) -> void
24
- def to_h: () -> Hash[Symbol, untyped]
25
- end
26
-
27
- class Tesseract
28
- def initialize: (**untyped options) -> void
29
- def to_h: () -> Hash[Symbol, untyped]
30
- end
31
-
32
- class Chunking
33
- attr_reader max_chars: Integer
34
- attr_reader max_overlap: Integer
35
- attr_reader preset: String?
36
- attr_reader embedding: Embedding?
37
- attr_reader enabled: bool?
38
-
39
- def initialize: (
40
- ?max_chars: Integer?,
41
- ?max_overlap: Integer?,
42
- ?preset: String?,
43
- ?embedding: (Embedding | Hash[Symbol, untyped])?,
44
- ?chunk_size: Integer?,
45
- ?chunk_overlap: Integer?,
46
- ?enabled: bool
47
- ) -> void
48
- def to_h: () -> Hash[Symbol, untyped]
49
- end
50
-
51
- class Embedding
52
- attr_reader model: Hash[Symbol, untyped]
53
- attr_reader normalize: bool?
54
- attr_reader batch_size: Integer?
55
- attr_reader show_download_progress: bool?
56
- attr_reader cache_dir: String?
57
-
58
- def initialize: (
59
- ?model: Hash[Symbol, untyped],
60
- ?normalize: bool?,
61
- ?batch_size: Integer?,
62
- ?show_download_progress: bool?,
63
- ?cache_dir: String?
64
- ) -> void
65
- def to_h: () -> Hash[Symbol, untyped]
66
- end
67
-
68
- class LanguageDetection
69
- attr_reader enabled: bool
70
- attr_reader min_confidence: Float
71
- attr_reader detect_multiple: bool
72
-
73
- def initialize: (?enabled: bool, ?min_confidence: Float, ?detect_multiple: bool) -> void
74
- def to_h: () -> Hash[Symbol, untyped]
75
- end
76
-
77
- class PDF
78
- attr_reader extract_images: bool
79
- attr_reader passwords: Array[String]?
80
- attr_reader extract_metadata: bool
81
-
82
- def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool) -> void
83
- def to_h: () -> Hash[Symbol, untyped]
84
- end
85
-
86
- class ImageExtraction
87
- attr_reader extract_images: bool
88
- attr_reader target_dpi: Integer
89
- attr_reader max_image_dimension: Integer
90
- attr_reader auto_adjust_dpi: bool
91
- attr_reader min_dpi: Integer
92
- attr_reader max_dpi: Integer
93
-
94
- def initialize: (
95
- ?extract_images: bool,
96
- ?target_dpi: Integer,
97
- ?max_image_dimension: Integer,
98
- ?auto_adjust_dpi: bool,
99
- ?min_dpi: Integer,
100
- ?max_dpi: Integer
101
- ) -> void
102
- def to_h: () -> Hash[Symbol, untyped]
103
- end
104
-
105
- class ImagePreprocessing
106
- attr_reader target_dpi: Integer
107
- attr_reader auto_rotate: bool
108
- attr_reader deskew: bool
109
- attr_reader denoise: bool
110
- attr_reader contrast_enhance: bool
111
- attr_reader binarization_method: String
112
- attr_reader invert_colors: bool
113
-
114
- def initialize: (
115
- ?target_dpi: Integer,
116
- ?auto_rotate: bool,
117
- ?deskew: bool,
118
- ?denoise: bool,
119
- ?contrast_enhance: bool,
120
- ?binarization_method: String,
121
- ?invert_colors: bool
122
- ) -> void
123
- def to_h: () -> Hash[Symbol, untyped]
124
- end
125
-
126
- class TokenReduction
127
- attr_reader mode: String
128
- attr_reader preserve_important_words: bool
129
-
130
- def initialize: (?mode: String, ?preserve_important_words: bool) -> void
131
- def to_h: () -> Hash[Symbol, untyped]
132
- end
133
-
134
- class PostProcessor
135
- attr_reader enabled: bool
136
- attr_reader enabled_processors: Array[String]?
137
- attr_reader disabled_processors: Array[String]?
138
-
139
- def initialize: (?enabled: bool, ?enabled_processors: Array[String]?, ?disabled_processors: Array[String]?) -> void
140
- def to_h: () -> Hash[Symbol, untyped]
141
- end
142
-
143
- class HtmlPreprocessing
144
- attr_reader enabled: bool?
145
- attr_reader preset: Symbol?
146
- attr_reader remove_navigation: bool?
147
- attr_reader remove_forms: bool?
148
-
149
- def initialize: (?enabled: bool?, ?preset: Symbol?, ?remove_navigation: bool?, ?remove_forms: bool?) -> void
150
- def to_h: () -> Hash[Symbol, untyped]
151
- end
152
-
153
- class HtmlOptions
154
- def initialize: (**untyped options) -> void
155
- def to_h: () -> Hash[Symbol, untyped]
156
- end
157
-
158
- class Keywords
159
- def initialize: (
160
- ?algorithm: Symbol?,
161
- ?max_keywords: Integer?,
162
- ?min_score: Float?,
163
- ?ngram_range: Array[Integer]?,
164
- ?language: Symbol?,
165
- ?yake_params: Hash[Symbol, untyped]?,
166
- ?rake_params: Hash[Symbol, untyped]?
167
- ) -> void
168
- def to_h: () -> Hash[Symbol, untyped]
169
- end
170
-
171
- class Extraction
172
- attr_reader use_cache: bool
173
- attr_reader enable_quality_processing: bool
174
- attr_reader force_ocr: bool
175
- attr_reader ocr: OCR?
176
- attr_reader chunking: Chunking?
177
- attr_reader language_detection: LanguageDetection?
178
- attr_reader pdf_options: PDF?
179
- attr_reader image_extraction: ImageExtraction?
180
- attr_reader image_preprocessing: ImagePreprocessing?
181
- attr_reader postprocessor: PostProcessor?
182
- attr_reader token_reduction: TokenReduction?
183
- attr_reader keywords: Keywords?
184
- attr_reader html_options: HtmlOptions?
185
- attr_reader max_concurrent_extractions: Integer?
186
-
187
- def self.from_file: (String path) -> Extraction
188
- def initialize: (
189
- ?use_cache: bool,
190
- ?enable_quality_processing: bool,
191
- ?force_ocr: bool,
192
- ?ocr: (OCR | Hash[Symbol, untyped])?,
193
- ?chunking: (Chunking | Hash[Symbol, untyped])?,
194
- ?language_detection: (LanguageDetection | Hash[Symbol, untyped])?,
195
- ?pdf_options: (PDF | Hash[Symbol, untyped])?,
196
- ?image_extraction: (ImageExtraction | Hash[Symbol, untyped])?,
197
- ?image_preprocessing: (ImagePreprocessing | Hash[Symbol, untyped])?,
198
- ?postprocessor: (PostProcessor | Hash[Symbol, untyped])?,
199
- ?token_reduction: (TokenReduction | Hash[Symbol, untyped])?,
200
- ?keywords: (Keywords | Hash[Symbol, untyped])?,
201
- ?html_options: (HtmlOptions | Hash[Symbol, untyped])?,
202
- ?max_concurrent_extractions: Integer?
203
- ) -> void
204
- def to_h: () -> Hash[Symbol, untyped]
205
-
206
- private
207
-
208
- def normalize_config: [T] (T | Hash[Symbol, untyped] | nil value, Class klass) -> T?
209
- end
210
-
211
- # Backwards compatibility alias
212
- Ocr: singleton(OCR)
213
- end
214
-
215
- # Alias for Config::Extraction (for API consistency with other language bindings)
216
- ExtractionConfig: singleton(Config::Extraction)
217
-
218
- # Extraction result type
219
- type extraction_result_hash = {
220
- content: String,
221
- mime_type: String,
222
- metadata_json: String,
223
- tables: Array[table_hash]?,
224
- detected_languages: Array[String]?,
225
- chunks: Array[chunk_hash]?,
226
- images: Array[image_hash]?
227
- }
228
-
229
- type table_hash = {
230
- cells: Array[Array[String]],
231
- markdown: String,
232
- page_number: Integer
233
- }
234
-
235
- type chunk_hash = {
236
- content: String,
237
- char_start: Integer,
238
- char_end: Integer,
239
- token_count: Integer?,
240
- chunk_index: Integer?,
241
- total_chunks: Integer?,
242
- embedding: Array[Float]?
243
- }
244
-
245
- type image_hash = {
246
- data: String,
247
- format: String,
248
- image_index: Integer,
249
- page_number: Integer?,
250
- width: Integer?,
251
- height: Integer?,
252
- colorspace: String?,
253
- bits_per_component: Integer?,
254
- is_mask: bool?,
255
- description: String?,
256
- ocr_result: extraction_result_hash?
257
- }
258
-
259
- type config_hash = Hash[Symbol, untyped]
260
- type config_input = config_hash | _ToH
261
-
262
- interface _ToH
263
- def to_h: () -> config_hash
264
- end
265
-
266
- # Extraction result wrapper
267
- class Result
268
- # Table structure
269
- class Table
270
- attr_reader cells: Array[Array[String]]
271
- attr_reader markdown: String
272
- attr_reader page_number: Integer
273
-
274
- def initialize: (cells: Array[Array[String]], markdown: String, page_number: Integer) -> void
275
- def to_h: () -> table_hash
276
- end
277
-
278
- # Text chunk
279
- class Chunk
280
- attr_reader content: String
281
- attr_reader char_start: Integer
282
- attr_reader char_end: Integer
283
- attr_reader token_count: Integer?
284
- attr_reader chunk_index: Integer?
285
- attr_reader total_chunks: Integer?
286
- attr_reader embedding: Array[Float]?
287
-
288
- def initialize: (
289
- content: String,
290
- char_start: Integer,
291
- char_end: Integer,
292
- token_count: Integer?,
293
- chunk_index: Integer?,
294
- total_chunks: Integer?,
295
- embedding: Array[Float]?
296
- ) -> void
297
- def to_h: () -> chunk_hash
298
- end
299
-
300
- class Image
301
- attr_reader data: String
302
- attr_reader format: String
303
- attr_reader image_index: Integer
304
- attr_reader page_number: Integer?
305
- attr_reader width: Integer?
306
- attr_reader height: Integer?
307
- attr_reader colorspace: String?
308
- attr_reader bits_per_component: Integer?
309
- attr_reader is_mask: bool?
310
- attr_reader description: String?
311
- attr_reader ocr_result: Result?
312
-
313
- def initialize: (
314
- data: String,
315
- format: String,
316
- image_index: Integer,
317
- page_number: Integer?,
318
- width: Integer?,
319
- height: Integer?,
320
- colorspace: String?,
321
- bits_per_component: Integer?,
322
- is_mask: bool?,
323
- description: String?,
324
- ocr_result: Result?
325
- ) -> void
326
- def to_h: () -> image_hash
327
- end
328
-
329
- attr_reader content: String
330
- attr_reader mime_type: String
331
- attr_reader metadata: Hash[untyped, untyped]
332
- attr_reader metadata_json: String
333
- attr_reader tables: Array[Table]
334
- attr_reader detected_languages: Array[String]?
335
- attr_reader chunks: Array[Chunk]?
336
- attr_reader images: Array[Image]?
337
-
338
- def initialize: (extraction_result_hash hash) -> void
339
- def to_h: () -> Hash[Symbol, untyped]
340
- def to_json: (*untyped) -> String
341
-
342
- private
343
-
344
- def parse_metadata: (String metadata_json) -> Hash[untyped, untyped]
345
- def parse_tables: (Array[table_hash]? tables_data) -> Array[Table]
346
- def parse_detected_languages: (Array[String]? langs_data) -> Array[String]?
347
- def parse_chunks: (Array[chunk_hash]? chunks_data) -> Array[Chunk]?
348
- end
349
-
350
- # Module methods (extraction API)
351
- def self.extract_file_sync: (
352
- String | Pathname path,
353
- ?mime_type: String?,
354
- ?config: config_input?
355
- ) -> Result
356
-
357
- def self.extract_bytes_sync: (
358
- String data,
359
- String mime_type,
360
- ?config: config_input?
361
- ) -> Result
362
-
363
- def self.batch_extract_files_sync: (
364
- Array[String | Pathname] paths,
365
- ?config: config_input?
366
- ) -> Array[Result]
367
-
368
- def self.batch_extract_bytes_sync: (
369
- Array[String] data_array,
370
- Array[String] mime_types,
371
- ?config: config_input?
372
- ) -> Array[Result]
373
-
374
- def self.extract_file: (
375
- String | Pathname path,
376
- ?mime_type: String?,
377
- ?config: config_input?
378
- ) -> Result
379
-
380
- def self.extract_bytes: (
381
- String data,
382
- String mime_type,
383
- ?config: config_input?
384
- ) -> Result
385
-
386
- def self.batch_extract_files: (
387
- Array[String | Pathname] paths,
388
- ?config: config_input?
389
- ) -> Array[Result]
390
-
391
- def self.batch_extract_bytes: (
392
- Array[String] data_array,
393
- Array[String] mime_types,
394
- ?config: config_input?
395
- ) -> Array[Result]
396
-
397
- # Cache API
398
- def self.clear_cache: () -> void
399
- def self.cache_stats: () -> Hash[Symbol | String, Integer]
400
-
401
- # Config loading (native method)
402
- def self._config_from_file_native: (String path) -> Hash[Symbol, untyped]
403
-
404
- # Error introspection (native methods)
405
- def self._last_error_code_native: () -> Integer
406
- def self._last_panic_context_json_native: () -> String?
407
-
408
- # Plugin registration
409
- def self.register_post_processor: (String name, _PostProcessor processor, ?stage: Symbol?) -> void
410
- def self.unregister_post_processor: (String name) -> void
411
- def self.clear_post_processors: () -> void
412
- def self.register_validator: (String name, _Validator validator, ?priority: Integer?) -> void
413
- def self.unregister_validator: (String name) -> void
414
- def self.clear_validators: () -> void
415
- def self.register_ocr_backend: (_OcrBackend backend) -> void
416
-
417
- interface _PostProcessor
418
- def call: (extraction_result_hash result) -> extraction_result_hash
419
- end
420
-
421
- interface _Validator
422
- def call: (extraction_result_hash result) -> void
423
- end
424
-
425
- interface _OcrBackend
426
- def name: () -> String
427
- def extract_text: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String
428
- end
429
-
430
- module ErrorContext
431
- def self.last_error_code: () -> Integer
432
- def self.last_panic_context: () -> Errors::PanicContext?
433
- def self.last_panic_context_json: () -> String?
434
- end
435
-
436
- module Errors
437
- # Panic context information from FFI error introspection
438
- class PanicContext
439
- attr_reader file: String
440
- attr_reader line: Integer
441
- attr_reader function: String
442
- attr_reader message: String
443
- attr_reader timestamp_secs: Integer
444
-
445
- def initialize: (
446
- file: String,
447
- line: Integer,
448
- function: String,
449
- message: String,
450
- timestamp_secs: Integer
451
- ) -> void
452
- def to_s: () -> String
453
- def to_h: () -> Hash[Symbol, String | Integer]
454
- def self.from_json: (String) -> PanicContext?
455
-
456
- private
457
-
458
- def self.with_defaults: (Hash[Symbol, untyped] sliced) -> {file: String, line: Integer, function: String, message: String, timestamp_secs: Integer}
459
- end
460
-
461
- class Error < StandardError
462
- attr_reader panic_context: PanicContext?
463
- attr_reader error_code: Integer?
464
-
465
- def initialize: (String message, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
466
- end
467
-
468
- class ValidationError < Error
469
- end
470
-
471
- class ParsingError < Error
472
- attr_reader context: Hash[untyped, untyped]?
473
-
474
- def initialize: (String message, ?context: Hash[untyped, untyped]?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
475
- end
476
-
477
- class OCRError < Error
478
- attr_reader context: Hash[untyped, untyped]?
479
-
480
- def initialize: (String message, ?context: Hash[untyped, untyped]?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
481
- end
482
-
483
- class MissingDependencyError < Error
484
- attr_reader dependency: String?
485
-
486
- def initialize: (String message, ?dependency: String?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
487
- end
488
-
489
- class IOError < Error
490
- end
491
-
492
- class PluginError < Error
493
- end
494
-
495
- class UnsupportedFormatError < Error
496
- end
497
- end
498
-
499
- # Internal modules (prepended to Kreuzberg singleton)
500
- # These are not checked by steep - see Steepfile
501
- module CacheAPI : Object
502
- end
503
-
504
- module ExtractionAPI : Object
505
- end
506
-
507
- module PostProcessorProtocol
508
- def call: (extraction_result_hash result) -> extraction_result_hash
509
- end
510
-
511
- module ValidatorProtocol
512
- def call: (extraction_result_hash result) -> void
513
- end
514
-
515
- module OcrBackendProtocol
516
- def name: () -> String
517
- def extract_text: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String
518
- def process_image: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String
519
- end
520
- end
1
+ # Type signatures for Kreuzberg document intelligence framework
2
+
3
+ module Kreuzberg
4
+ VERSION: String
5
+
6
+ # Error code constants
7
+ ERROR_CODE_SUCCESS: Integer
8
+ ERROR_CODE_GENERIC: Integer
9
+ ERROR_CODE_PANIC: Integer
10
+ ERROR_CODE_INVALID_ARGUMENT: Integer
11
+ ERROR_CODE_IO: Integer
12
+ ERROR_CODE_PARSING: Integer
13
+ ERROR_CODE_OCR: Integer
14
+ ERROR_CODE_MISSING_DEPENDENCY: Integer
15
+
16
+ # Config namespace (defined in lib/kreuzberg/config.rb)
17
+ module Config
18
+ class OCR
19
+ attr_reader backend: String
20
+ attr_reader language: String
21
+ attr_reader tesseract_config: Tesseract?
22
+
23
+ def initialize: (?backend: String, ?language: String, ?tesseract_config: (Tesseract | Hash[Symbol, untyped])?) -> void
24
+ def to_h: () -> Hash[Symbol, untyped]
25
+ end
26
+
27
+ class Tesseract
28
+ def initialize: (**untyped options) -> void
29
+ def to_h: () -> Hash[Symbol, untyped]
30
+ end
31
+
32
+ class Chunking
33
+ attr_reader max_chars: Integer
34
+ attr_reader max_overlap: Integer
35
+ attr_reader preset: String?
36
+ attr_reader embedding: Embedding?
37
+ attr_reader enabled: bool?
38
+
39
+ def initialize: (
40
+ ?max_chars: Integer?,
41
+ ?max_overlap: Integer?,
42
+ ?preset: String?,
43
+ ?embedding: (Embedding | Hash[Symbol, untyped])?,
44
+ ?chunk_size: Integer?,
45
+ ?chunk_overlap: Integer?,
46
+ ?enabled: bool
47
+ ) -> void
48
+ def to_h: () -> Hash[Symbol, untyped]
49
+ end
50
+
51
+ class Embedding
52
+ attr_reader model: Hash[Symbol, untyped]
53
+ attr_reader normalize: bool?
54
+ attr_reader batch_size: Integer?
55
+ attr_reader show_download_progress: bool?
56
+ attr_reader cache_dir: String?
57
+
58
+ def initialize: (
59
+ ?model: Hash[Symbol, untyped],
60
+ ?normalize: bool?,
61
+ ?batch_size: Integer?,
62
+ ?show_download_progress: bool?,
63
+ ?cache_dir: String?
64
+ ) -> void
65
+ def to_h: () -> Hash[Symbol, untyped]
66
+ end
67
+
68
+ class LanguageDetection
69
+ attr_reader enabled: bool
70
+ attr_reader min_confidence: Float
71
+ attr_reader detect_multiple: bool
72
+
73
+ def initialize: (?enabled: bool, ?min_confidence: Float, ?detect_multiple: bool) -> void
74
+ def to_h: () -> Hash[Symbol, untyped]
75
+ end
76
+
77
+ class FontConfig
78
+ attr_accessor enabled: bool
79
+ attr_accessor custom_font_dirs: Array[String]?
80
+
81
+ def initialize: (?enabled: bool, ?custom_font_dirs: Array[String]?) -> void
82
+ def to_h: () -> Hash[Symbol, untyped]
83
+ end
84
+
85
+ class PDF
86
+ attr_reader extract_images: bool
87
+ attr_reader passwords: Array[String]?
88
+ attr_reader extract_metadata: bool
89
+ attr_reader font_config: FontConfig?
90
+
91
+ def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool, ?font_config: (FontConfig | Hash[Symbol, untyped])?) -> void
92
+ def to_h: () -> Hash[Symbol, untyped]
93
+ end
94
+
95
+ class ImageExtraction
96
+ attr_reader extract_images: bool
97
+ attr_reader target_dpi: Integer
98
+ attr_reader max_image_dimension: Integer
99
+ attr_reader auto_adjust_dpi: bool
100
+ attr_reader min_dpi: Integer
101
+ attr_reader max_dpi: Integer
102
+
103
+ def initialize: (
104
+ ?extract_images: bool,
105
+ ?target_dpi: Integer,
106
+ ?max_image_dimension: Integer,
107
+ ?auto_adjust_dpi: bool,
108
+ ?min_dpi: Integer,
109
+ ?max_dpi: Integer
110
+ ) -> void
111
+ def to_h: () -> Hash[Symbol, untyped]
112
+ end
113
+
114
+ class ImagePreprocessing
115
+ attr_reader target_dpi: Integer
116
+ attr_reader auto_rotate: bool
117
+ attr_reader deskew: bool
118
+ attr_reader denoise: bool
119
+ attr_reader contrast_enhance: bool
120
+ attr_reader binarization_method: String
121
+ attr_reader invert_colors: bool
122
+
123
+ def initialize: (
124
+ ?target_dpi: Integer,
125
+ ?auto_rotate: bool,
126
+ ?deskew: bool,
127
+ ?denoise: bool,
128
+ ?contrast_enhance: bool,
129
+ ?binarization_method: String,
130
+ ?invert_colors: bool
131
+ ) -> void
132
+ def to_h: () -> Hash[Symbol, untyped]
133
+ end
134
+
135
+ class TokenReduction
136
+ attr_reader mode: String
137
+ attr_reader preserve_important_words: bool
138
+
139
+ def initialize: (?mode: String, ?preserve_important_words: bool) -> void
140
+ def to_h: () -> Hash[Symbol, untyped]
141
+ end
142
+
143
+ class PostProcessor
144
+ attr_reader enabled: bool
145
+ attr_reader enabled_processors: Array[String]?
146
+ attr_reader disabled_processors: Array[String]?
147
+
148
+ def initialize: (?enabled: bool, ?enabled_processors: Array[String]?, ?disabled_processors: Array[String]?) -> void
149
+ def to_h: () -> Hash[Symbol, untyped]
150
+ end
151
+
152
+ class HtmlPreprocessing
153
+ attr_reader enabled: bool?
154
+ attr_reader preset: Symbol?
155
+ attr_reader remove_navigation: bool?
156
+ attr_reader remove_forms: bool?
157
+
158
+ def initialize: (?enabled: bool?, ?preset: Symbol?, ?remove_navigation: bool?, ?remove_forms: bool?) -> void
159
+ def to_h: () -> Hash[Symbol, untyped]
160
+ end
161
+
162
+ class HtmlOptions
163
+ def initialize: (**untyped options) -> void
164
+ def to_h: () -> Hash[Symbol, untyped]
165
+ end
166
+
167
+ class Keywords
168
+ def initialize: (
169
+ ?algorithm: Symbol?,
170
+ ?max_keywords: Integer?,
171
+ ?min_score: Float?,
172
+ ?ngram_range: Array[Integer]?,
173
+ ?language: Symbol?,
174
+ ?yake_params: Hash[Symbol, untyped]?,
175
+ ?rake_params: Hash[Symbol, untyped]?
176
+ ) -> void
177
+ def to_h: () -> Hash[Symbol, untyped]
178
+ end
179
+
180
+ class PageConfig
181
+ attr_reader extract_pages: bool
182
+ attr_reader insert_page_markers: bool
183
+ attr_reader marker_format: String
184
+
185
+ def initialize: (?extract_pages: bool, ?insert_page_markers: bool, ?marker_format: String) -> void
186
+ def to_h: () -> Hash[Symbol, untyped]
187
+ end
188
+
189
+ class Extraction
190
+ attr_reader use_cache: bool
191
+ attr_reader enable_quality_processing: bool
192
+ attr_reader force_ocr: bool
193
+ attr_reader ocr: OCR?
194
+ attr_reader chunking: Chunking?
195
+ attr_reader language_detection: LanguageDetection?
196
+ attr_reader pdf_options: PDF?
197
+ attr_reader image_extraction: ImageExtraction?
198
+ attr_reader image_preprocessing: ImagePreprocessing?
199
+ attr_reader postprocessor: PostProcessor?
200
+ attr_reader token_reduction: TokenReduction?
201
+ attr_reader keywords: Keywords?
202
+ attr_reader html_options: HtmlOptions?
203
+ attr_reader pages: PageConfig?
204
+ attr_reader max_concurrent_extractions: Integer?
205
+
206
+ def self.from_file: (String path) -> Extraction
207
+ def initialize: (
208
+ ?use_cache: bool,
209
+ ?enable_quality_processing: bool,
210
+ ?force_ocr: bool,
211
+ ?ocr: (OCR | Hash[Symbol, untyped])?,
212
+ ?chunking: (Chunking | Hash[Symbol, untyped])?,
213
+ ?language_detection: (LanguageDetection | Hash[Symbol, untyped])?,
214
+ ?pdf_options: (PDF | Hash[Symbol, untyped])?,
215
+ ?image_extraction: (ImageExtraction | Hash[Symbol, untyped])?,
216
+ ?image_preprocessing: (ImagePreprocessing | Hash[Symbol, untyped])?,
217
+ ?postprocessor: (PostProcessor | Hash[Symbol, untyped])?,
218
+ ?token_reduction: (TokenReduction | Hash[Symbol, untyped])?,
219
+ ?keywords: (Keywords | Hash[Symbol, untyped])?,
220
+ ?html_options: (HtmlOptions | Hash[Symbol, untyped])?,
221
+ ?pages: (PageConfig | Hash[Symbol, untyped])?,
222
+ ?max_concurrent_extractions: Integer?
223
+ ) -> void
224
+ def to_h: () -> Hash[Symbol, untyped]
225
+
226
+ private
227
+
228
+ def normalize_config: [T] (T | Hash[Symbol, untyped] | nil value, Class klass) -> T?
229
+ end
230
+
231
+ end
232
+
233
+ # Alias for Config::Extraction (for API consistency with other language bindings)
234
+ ExtractionConfig: singleton(Config::Extraction)
235
+
236
+ # Alias for Config::PageConfig (for API consistency with other language bindings)
237
+ PageConfig: singleton(Config::PageConfig)
238
+
239
+ # Keyword algorithm constants
240
+ module KeywordAlgorithm
241
+ YAKE: Symbol
242
+ RAKE: Symbol
243
+ end
244
+
245
+ # Extraction result type
246
+ type extraction_result_hash = {
247
+ content: String,
248
+ mime_type: String,
249
+ metadata_json: String,
250
+ tables: Array[table_hash]?,
251
+ detected_languages: Array[String]?,
252
+ chunks: Array[chunk_hash]?,
253
+ images: Array[image_hash]?
254
+ }
255
+
256
+ type table_hash = {
257
+ cells: Array[Array[String]],
258
+ markdown: String,
259
+ page_number: Integer
260
+ }
261
+
262
+ type chunk_hash = {
263
+ content: String,
264
+ byte_start: Integer,
265
+ byte_end: Integer,
266
+ token_count: Integer?,
267
+ chunk_index: Integer?,
268
+ total_chunks: Integer?,
269
+ first_page: Integer?,
270
+ last_page: Integer?,
271
+ embedding: Array[Float]?
272
+ }
273
+
274
+ type image_hash = {
275
+ data: String,
276
+ format: String,
277
+ image_index: Integer,
278
+ page_number: Integer?,
279
+ width: Integer?,
280
+ height: Integer?,
281
+ colorspace: String?,
282
+ bits_per_component: Integer?,
283
+ is_mask: bool?,
284
+ description: String?,
285
+ ocr_result: extraction_result_hash?
286
+ }
287
+
288
+ type config_hash = Hash[Symbol, untyped]
289
+ type config_input = config_hash | _ToH
290
+
291
+ interface _ToH
292
+ def to_h: () -> config_hash
293
+ end
294
+
295
+ # Extraction result wrapper
296
+ class Result
297
+ # Table structure
298
+ class Table
299
+ attr_reader cells: Array[Array[String]]
300
+ attr_reader markdown: String
301
+ attr_reader page_number: Integer
302
+
303
+ def initialize: (cells: Array[Array[String]], markdown: String, page_number: Integer) -> void
304
+ def to_h: () -> table_hash
305
+ end
306
+
307
+ # Text chunk
308
+ class Chunk
309
+ attr_reader content: String
310
+ attr_reader byte_start: Integer
311
+ attr_reader byte_end: Integer
312
+ attr_reader token_count: Integer?
313
+ attr_reader chunk_index: Integer?
314
+ attr_reader total_chunks: Integer?
315
+ attr_reader first_page: Integer?
316
+ attr_reader last_page: Integer?
317
+ attr_reader embedding: Array[Float]?
318
+
319
+ def initialize: (
320
+ content: String,
321
+ byte_start: Integer,
322
+ byte_end: Integer,
323
+ token_count: Integer?,
324
+ chunk_index: Integer?,
325
+ total_chunks: Integer?,
326
+ first_page: Integer?,
327
+ last_page: Integer?,
328
+ embedding: Array[Float]?
329
+ ) -> void
330
+ def to_h: () -> chunk_hash
331
+ end
332
+
333
+ class Image
334
+ attr_reader data: String
335
+ attr_reader format: String
336
+ attr_reader image_index: Integer
337
+ attr_reader page_number: Integer?
338
+ attr_reader width: Integer?
339
+ attr_reader height: Integer?
340
+ attr_reader colorspace: String?
341
+ attr_reader bits_per_component: Integer?
342
+ attr_reader is_mask: bool?
343
+ attr_reader description: String?
344
+ attr_reader ocr_result: Result?
345
+
346
+ def initialize: (
347
+ data: String,
348
+ format: String,
349
+ image_index: Integer,
350
+ page_number: Integer?,
351
+ width: Integer?,
352
+ height: Integer?,
353
+ colorspace: String?,
354
+ bits_per_component: Integer?,
355
+ is_mask: bool?,
356
+ description: String?,
357
+ ocr_result: Result?
358
+ ) -> void
359
+ def to_h: () -> image_hash
360
+ end
361
+
362
+ attr_reader content: String
363
+ attr_reader mime_type: String
364
+ attr_reader metadata: Hash[untyped, untyped]
365
+ attr_reader metadata_json: String
366
+ attr_reader tables: Array[Table]
367
+ attr_reader detected_languages: Array[String]?
368
+ attr_reader chunks: Array[Chunk]?
369
+ attr_reader images: Array[Image]?
370
+
371
+ def initialize: (extraction_result_hash hash) -> void
372
+ def to_h: () -> Hash[Symbol, untyped]
373
+ def to_json: (*untyped) -> String
374
+
375
+ private
376
+
377
+ def parse_metadata: (String metadata_json) -> Hash[untyped, untyped]
378
+ def parse_tables: (Array[table_hash]? tables_data) -> Array[Table]
379
+ def parse_detected_languages: (Array[String]? langs_data) -> Array[String]?
380
+ def parse_chunks: (Array[chunk_hash]? chunks_data) -> Array[Chunk]?
381
+ end
382
+
383
+ # Module methods (extraction API)
384
+ def self.extract_file_sync: (
385
+ String | Pathname path,
386
+ ?mime_type: String?,
387
+ ?config: config_input?
388
+ ) -> Result
389
+
390
+ def self.extract_bytes_sync: (
391
+ String data,
392
+ String mime_type,
393
+ ?config: config_input?
394
+ ) -> Result
395
+
396
+ def self.batch_extract_files_sync: (
397
+ Array[String | Pathname] paths,
398
+ ?config: config_input?
399
+ ) -> Array[Result]
400
+
401
+ def self.batch_extract_bytes_sync: (
402
+ Array[String] data_array,
403
+ Array[String] mime_types,
404
+ ?config: config_input?
405
+ ) -> Array[Result]
406
+
407
+ def self.extract_file: (
408
+ String | Pathname path,
409
+ ?mime_type: String?,
410
+ ?config: config_input?
411
+ ) -> Result
412
+
413
+ def self.extract_bytes: (
414
+ String data,
415
+ String mime_type,
416
+ ?config: config_input?
417
+ ) -> Result
418
+
419
+ def self.batch_extract_files: (
420
+ Array[String | Pathname] paths,
421
+ ?config: config_input?
422
+ ) -> Array[Result]
423
+
424
+ def self.batch_extract_bytes: (
425
+ Array[String] data_array,
426
+ Array[String] mime_types,
427
+ ?config: config_input?
428
+ ) -> Array[Result]
429
+
430
+ # Cache API
431
+ def self.clear_cache: () -> void
432
+ def self.cache_stats: () -> Hash[Symbol | String, Integer]
433
+
434
+ # Config loading (native method)
435
+ def self._config_from_file_native: (String path) -> Hash[Symbol, untyped]
436
+
437
+ # Error introspection (native methods)
438
+ def self._last_error_code_native: () -> Integer
439
+ def self._last_panic_context_json_native: () -> String?
440
+ def self._get_error_details_native: () -> Hash[Symbol, untyped]
441
+ def self._classify_error_native: (String message) -> Integer
442
+ def self._error_code_name_native: (Integer code) -> String
443
+ def self._error_code_description_native: (Integer code) -> String
444
+
445
+ # Plugin registration
446
+ def self.register_post_processor: (String name, _PostProcessor processor, ?stage: Symbol?) -> void
447
+ def self.unregister_post_processor: (String name) -> void
448
+ def self.clear_post_processors: () -> void
449
+ def self.register_validator: (String name, _Validator validator, ?priority: Integer?) -> void
450
+ def self.unregister_validator: (String name) -> void
451
+ def self.clear_validators: () -> void
452
+ def self.register_ocr_backend: (_OcrBackend backend) -> void
453
+
454
+ interface _PostProcessor
455
+ def call: (extraction_result_hash result) -> extraction_result_hash
456
+ end
457
+
458
+ interface _Validator
459
+ def call: (extraction_result_hash result) -> void
460
+ end
461
+
462
+ interface _OcrBackend
463
+ def name: () -> String
464
+ def extract_text: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String
465
+ end
466
+
467
+ module ErrorContext
468
+ def self.last_error_code: () -> Integer
469
+ def self.last_panic_context: () -> Errors::PanicContext?
470
+ def self.last_panic_context_json: () -> String?
471
+ def self.error_details: () -> Hash[Symbol, untyped]
472
+ def self.classify_error: (String message) -> Integer
473
+ def self.error_code_name: (Integer code) -> String
474
+ def self.error_code_description: (Integer code) -> String
475
+ end
476
+
477
+ module Errors
478
+ # Panic context information from FFI error introspection
479
+ class PanicContext
480
+ attr_reader file: String
481
+ attr_reader line: Integer
482
+ attr_reader function: String
483
+ attr_reader message: String
484
+ attr_reader timestamp_secs: Integer
485
+
486
+ def initialize: (
487
+ file: String,
488
+ line: Integer,
489
+ function: String,
490
+ message: String,
491
+ timestamp_secs: Integer
492
+ ) -> void
493
+ def to_s: () -> String
494
+ def to_h: () -> Hash[Symbol, String | Integer]
495
+ def self.from_json: (String) -> PanicContext?
496
+
497
+ private
498
+
499
+ def self.with_defaults: (Hash[Symbol, untyped] sliced) -> {file: String, line: Integer, function: String, message: String, timestamp_secs: Integer}
500
+ end
501
+
502
+ class Error < StandardError
503
+ attr_reader panic_context: PanicContext?
504
+ attr_reader error_code: Integer?
505
+
506
+ def initialize: (String message, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
507
+ end
508
+
509
+ class ValidationError < Error
510
+ end
511
+
512
+ class ParsingError < Error
513
+ attr_reader context: Hash[untyped, untyped]?
514
+
515
+ def initialize: (String message, ?context: Hash[untyped, untyped]?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
516
+ end
517
+
518
+ class OCRError < Error
519
+ attr_reader context: Hash[untyped, untyped]?
520
+
521
+ def initialize: (String message, ?context: Hash[untyped, untyped]?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
522
+ end
523
+
524
+ class MissingDependencyError < Error
525
+ attr_reader dependency: String?
526
+
527
+ def initialize: (String message, ?dependency: String?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
528
+ end
529
+
530
+ class IOError < Error
531
+ end
532
+
533
+ class PluginError < Error
534
+ end
535
+
536
+ class UnsupportedFormatError < Error
537
+ end
538
+ end
539
+
540
+ # Internal modules (prepended to Kreuzberg singleton)
541
+ # These are not checked by steep - see Steepfile
542
+ module CacheAPI : Object
543
+ end
544
+
545
+ module ExtractionAPI : Object
546
+ end
547
+
548
+ module PostProcessorProtocol
549
+ def call: (extraction_result_hash result) -> extraction_result_hash
550
+ end
551
+
552
+ module ValidatorProtocol
553
+ def call: (extraction_result_hash result) -> void
554
+ end
555
+
556
+ module OcrBackendProtocol
557
+ def name: () -> String
558
+ def extract_text: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String
559
+ def process_image: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String
560
+ end
561
+ end