kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
data/sig/kreuzberg.rbs ADDED
@@ -0,0 +1,520 @@
1
+ # Type signatures for Kreuzberg document intelligence framework
2
+
3
+ module Kreuzberg
4
+ VERSION: String
5
+
6
+ # Error code constants
7
+ ERROR_CODE_SUCCESS: Integer # every ERROR_CODE_* constant is typed Integer; concrete values are not part of this signature
8
+ ERROR_CODE_GENERIC: Integer
9
+ ERROR_CODE_PANIC: Integer
10
+ ERROR_CODE_INVALID_ARGUMENT: Integer
11
+ ERROR_CODE_IO: Integer
12
+ ERROR_CODE_PARSING: Integer
13
+ ERROR_CODE_OCR: Integer
14
+ ERROR_CODE_MISSING_DEPENDENCY: Integer
15
+
16
+ # Config namespace (defined in lib/kreuzberg/config.rb)
17
+ module Config
18
+ class OCR # OCR settings: backend and language strings plus an optional Tesseract config
19
+ attr_reader backend: String
20
+ attr_reader language: String
21
+ attr_reader tesseract_config: Tesseract?
22
+
23
+ def initialize: (?backend: String, ?language: String, ?tesseract_config: (Tesseract | Hash[Symbol, untyped])?) -> void # every keyword is optional; tesseract_config accepts a Tesseract, a Hash, or nil
24
+ def to_h: () -> Hash[Symbol, untyped]
25
+ end
26
+
27
+ class Tesseract # Tesseract options holder: declares no readers, only initialize and to_h
28
+ def initialize: (**untyped options) -> void # accepts arbitrary untyped keyword options
29
+ def to_h: () -> Hash[Symbol, untyped]
30
+ end
31
+
32
+ class Chunking # Chunking settings: max_chars/max_overlap, optional preset and embedding config, and an enabled flag
33
+ attr_reader max_chars: Integer
34
+ attr_reader max_overlap: Integer
35
+ attr_reader preset: String?
36
+ attr_reader embedding: Embedding?
37
+ attr_reader enabled: bool?
38
+
39
+ def initialize: (
40
+ ?max_chars: Integer?,
41
+ ?max_overlap: Integer?,
42
+ ?preset: String?,
43
+ ?embedding: (Embedding | Hash[Symbol, untyped])?,
44
+ ?chunk_size: Integer?, # NOTE(review): chunk_size/chunk_overlap have no matching reader - presumably aliases for max_chars/max_overlap; confirm in lib/kreuzberg/config.rb
45
+ ?chunk_overlap: Integer?,
46
+ ?enabled: bool # NOTE(review): the enabled reader is typed bool? while this keyword does not accept nil - confirm which is intended
47
+ ) -> void
48
+ def to_h: () -> Hash[Symbol, untyped]
49
+ end
50
+
51
+ class Embedding # Embedding settings: a model Hash plus optional normalize, batch_size, show_download_progress and cache_dir
52
+ attr_reader model: Hash[Symbol, untyped]
53
+ attr_reader normalize: bool?
54
+ attr_reader batch_size: Integer?
55
+ attr_reader show_download_progress: bool?
56
+ attr_reader cache_dir: String?
57
+
58
+ def initialize: ( # every keyword is optional; all but model also accept nil
59
+ ?model: Hash[Symbol, untyped],
60
+ ?normalize: bool?,
61
+ ?batch_size: Integer?,
62
+ ?show_download_progress: bool?,
63
+ ?cache_dir: String?
64
+ ) -> void
65
+ def to_h: () -> Hash[Symbol, untyped]
66
+ end
67
+
68
+ class LanguageDetection # Language detection settings: enabled flag, min_confidence (Float) and detect_multiple flag
69
+ attr_reader enabled: bool
70
+ attr_reader min_confidence: Float
71
+ attr_reader detect_multiple: bool
72
+
73
+ def initialize: (?enabled: bool, ?min_confidence: Float, ?detect_multiple: bool) -> void # every keyword is optional; none accepts nil
74
+ def to_h: () -> Hash[Symbol, untyped]
75
+ end
76
+
77
+ class PDF # PDF settings: extract_images and extract_metadata flags plus optional passwords
78
+ attr_reader extract_images: bool
79
+ attr_reader passwords: Array[String]?
80
+ attr_reader extract_metadata: bool
81
+
82
+ def initialize: (?extract_images: bool, ?passwords: (Array[String] | String)?, ?extract_metadata: bool) -> void # passwords accepts one String or an Array, but the reader is Array[String]? - presumably a lone String is wrapped; confirm in config.rb
83
+ def to_h: () -> Hash[Symbol, untyped]
84
+ end
85
+
86
+ class ImageExtraction # Image extraction settings: extract_images flag, target/min/max DPI, max_image_dimension and auto_adjust_dpi flag
87
+ attr_reader extract_images: bool
88
+ attr_reader target_dpi: Integer
89
+ attr_reader max_image_dimension: Integer
90
+ attr_reader auto_adjust_dpi: bool
91
+ attr_reader min_dpi: Integer
92
+ attr_reader max_dpi: Integer
93
+
94
+ def initialize: ( # every keyword is optional; none accepts nil
95
+ ?extract_images: bool,
96
+ ?target_dpi: Integer,
97
+ ?max_image_dimension: Integer,
98
+ ?auto_adjust_dpi: bool,
99
+ ?min_dpi: Integer,
100
+ ?max_dpi: Integer
101
+ ) -> void
102
+ def to_h: () -> Hash[Symbol, untyped]
103
+ end
104
+
105
+ class ImagePreprocessing # Image preprocessing settings: target_dpi, five boolean toggles and a binarization_method string
106
+ attr_reader target_dpi: Integer
107
+ attr_reader auto_rotate: bool
108
+ attr_reader deskew: bool
109
+ attr_reader denoise: bool
110
+ attr_reader contrast_enhance: bool
111
+ attr_reader binarization_method: String
112
+ attr_reader invert_colors: bool
113
+
114
+ def initialize: ( # every keyword is optional; none accepts nil
115
+ ?target_dpi: Integer,
116
+ ?auto_rotate: bool,
117
+ ?deskew: bool,
118
+ ?denoise: bool,
119
+ ?contrast_enhance: bool,
120
+ ?binarization_method: String,
121
+ ?invert_colors: bool
122
+ ) -> void
123
+ def to_h: () -> Hash[Symbol, untyped]
124
+ end
125
+
126
+ class TokenReduction # Token reduction settings: mode string and preserve_important_words flag
127
+ attr_reader mode: String
128
+ attr_reader preserve_important_words: bool
129
+
130
+ def initialize: (?mode: String, ?preserve_important_words: bool) -> void # both keywords are optional
131
+ def to_h: () -> Hash[Symbol, untyped]
132
+ end
133
+
134
+ class PostProcessor # Post-processing settings: enabled flag plus optional enabled_processors/disabled_processors string lists
135
+ attr_reader enabled: bool
136
+ attr_reader enabled_processors: Array[String]?
137
+ attr_reader disabled_processors: Array[String]?
138
+
139
+ def initialize: (?enabled: bool, ?enabled_processors: Array[String]?, ?disabled_processors: Array[String]?) -> void # every keyword is optional; both lists may be nil
140
+ def to_h: () -> Hash[Symbol, untyped]
141
+ end
142
+
143
+ class HtmlPreprocessing # HTML preprocessing settings: all four attributes may be nil; preset is a Symbol
144
+ attr_reader enabled: bool?
145
+ attr_reader preset: Symbol?
146
+ attr_reader remove_navigation: bool?
147
+ attr_reader remove_forms: bool?
148
+
149
+ def initialize: (?enabled: bool?, ?preset: Symbol?, ?remove_navigation: bool?, ?remove_forms: bool?) -> void # every keyword is optional and accepts nil
150
+ def to_h: () -> Hash[Symbol, untyped]
151
+ end
152
+
153
+ class HtmlOptions # HTML options holder: declares no readers, only initialize and to_h
154
+ def initialize: (**untyped options) -> void # accepts arbitrary untyped keyword options
155
+ def to_h: () -> Hash[Symbol, untyped]
156
+ end
157
+
158
+ class Keywords # Keyword extraction settings: no readers are declared, only initialize and to_h
159
+ def initialize: ( # every keyword is optional and accepts nil
160
+ ?algorithm: Symbol?,
161
+ ?max_keywords: Integer?,
162
+ ?min_score: Float?,
163
+ ?ngram_range: Array[Integer]?,
164
+ ?language: Symbol?,
165
+ ?yake_params: Hash[Symbol, untyped]?,
166
+ ?rake_params: Hash[Symbol, untyped]?
167
+ ) -> void
168
+ def to_h: () -> Hash[Symbol, untyped]
169
+ end
170
+
171
+ class Extraction # Top-level extraction config: three boolean flags plus optional nested section configs and a concurrency limit
172
+ attr_reader use_cache: bool
173
+ attr_reader enable_quality_processing: bool
174
+ attr_reader force_ocr: bool
175
+ attr_reader ocr: OCR?
176
+ attr_reader chunking: Chunking?
177
+ attr_reader language_detection: LanguageDetection?
178
+ attr_reader pdf_options: PDF?
179
+ attr_reader image_extraction: ImageExtraction?
180
+ attr_reader image_preprocessing: ImagePreprocessing?
181
+ attr_reader postprocessor: PostProcessor?
182
+ attr_reader token_reduction: TokenReduction?
183
+ attr_reader keywords: Keywords?
184
+ attr_reader html_options: HtmlOptions?
185
+ attr_reader max_concurrent_extractions: Integer?
186
+
187
+ def self.from_file: (String path) -> Extraction # returns an Extraction for the given path
188
+ def initialize: ( # every keyword is optional; each nested section accepts its config class, a Hash, or nil
189
+ ?use_cache: bool,
190
+ ?enable_quality_processing: bool,
191
+ ?force_ocr: bool,
192
+ ?ocr: (OCR | Hash[Symbol, untyped])?,
193
+ ?chunking: (Chunking | Hash[Symbol, untyped])?,
194
+ ?language_detection: (LanguageDetection | Hash[Symbol, untyped])?,
195
+ ?pdf_options: (PDF | Hash[Symbol, untyped])?,
196
+ ?image_extraction: (ImageExtraction | Hash[Symbol, untyped])?,
197
+ ?image_preprocessing: (ImagePreprocessing | Hash[Symbol, untyped])?,
198
+ ?postprocessor: (PostProcessor | Hash[Symbol, untyped])?,
199
+ ?token_reduction: (TokenReduction | Hash[Symbol, untyped])?,
200
+ ?keywords: (Keywords | Hash[Symbol, untyped])?,
201
+ ?html_options: (HtmlOptions | Hash[Symbol, untyped])?,
202
+ ?max_concurrent_extractions: Integer?
203
+ ) -> void
204
+ def to_h: () -> Hash[Symbol, untyped]
205
+
206
+ private
207
+
208
+ def normalize_config: [T] (T | Hash[Symbol, untyped] | nil value, Class klass) -> T? # generic over the section type T; presumably coerces a Hash into klass - confirm in config.rb
209
+ end
210
+
211
+ # Backwards compatibility alias
212
+ Ocr: singleton(OCR)
213
+ end
214
+
215
+ # Alias for Config::Extraction (for API consistency with other language bindings)
216
+ ExtractionConfig: singleton(Config::Extraction)
217
+
218
+ # Extraction result type
219
+ type extraction_result_hash = { # raw result record accepted by Result#initialize; only content, mime_type and metadata_json are non-nil
220
+ content: String,
221
+ mime_type: String,
222
+ metadata_json: String,
223
+ tables: Array[table_hash]?,
224
+ detected_languages: Array[String]?,
225
+ chunks: Array[chunk_hash]?,
226
+ images: Array[image_hash]?
227
+ }
228
+
229
+ type table_hash = { # record form of Result::Table (same three fields); the return type of Result::Table#to_h
230
+ cells: Array[Array[String]],
231
+ markdown: String,
232
+ page_number: Integer
233
+ }
234
+
235
+ type chunk_hash = { # record form of Result::Chunk (same seven fields); the return type of Result::Chunk#to_h
236
+ content: String,
237
+ char_start: Integer,
238
+ char_end: Integer,
239
+ token_count: Integer?,
240
+ chunk_index: Integer?,
241
+ total_chunks: Integer?,
242
+ embedding: Array[Float]?
243
+ }
244
+
245
+ type image_hash = { # record form of Result::Image; the return type of Result::Image#to_h
246
+ data: String,
247
+ format: String,
248
+ image_index: Integer,
249
+ page_number: Integer?,
250
+ width: Integer?,
251
+ height: Integer?,
252
+ colorspace: String?,
253
+ bits_per_component: Integer?,
254
+ is_mask: bool?,
255
+ description: String?,
256
+ ocr_result: extraction_result_hash? # nested raw record here, whereas Result::Image#ocr_result is typed Result?
257
+ }
258
+
259
+ type config_hash = Hash[Symbol, untyped] # symbol-keyed config Hash with untyped values
260
+ type config_input = config_hash | _ToH # type of the config: keyword on the extraction methods: a config_hash or any object matching _ToH
261
+
262
+ interface _ToH # structural type: any object whose to_h returns a config_hash
263
+ def to_h: () -> config_hash
264
+ end
265
+
266
+ # Extraction result wrapper
267
+ class Result
268
+ # Table structure: cells grid (rows of strings), markdown string and page_number
269
+ class Table
270
+ attr_reader cells: Array[Array[String]]
271
+ attr_reader markdown: String
272
+ attr_reader page_number: Integer
273
+
274
+ def initialize: (cells: Array[Array[String]], markdown: String, page_number: Integer) -> void # all three keywords are required
275
+ def to_h: () -> table_hash
276
+ end
277
+
278
+ # Text chunk: content plus char_start/char_end Integers and optional token_count, chunk_index, total_chunks and embedding
279
+ class Chunk
280
+ attr_reader content: String
281
+ attr_reader char_start: Integer
282
+ attr_reader char_end: Integer
283
+ attr_reader token_count: Integer?
284
+ attr_reader chunk_index: Integer?
285
+ attr_reader total_chunks: Integer?
286
+ attr_reader embedding: Array[Float]?
287
+
288
+ def initialize: ( # every keyword is required; the last four accept nil
289
+ content: String,
290
+ char_start: Integer,
291
+ char_end: Integer,
292
+ token_count: Integer?,
293
+ chunk_index: Integer?,
294
+ total_chunks: Integer?,
295
+ embedding: Array[Float]?
296
+ ) -> void
297
+ def to_h: () -> chunk_hash
298
+ end
299
+
300
+ class Image # Extracted image: data, format and image_index are non-nil; every other field may be nil
301
+ attr_reader data: String
302
+ attr_reader format: String
303
+ attr_reader image_index: Integer
304
+ attr_reader page_number: Integer?
305
+ attr_reader width: Integer?
306
+ attr_reader height: Integer?
307
+ attr_reader colorspace: String?
308
+ attr_reader bits_per_component: Integer?
309
+ attr_reader is_mask: bool?
310
+ attr_reader description: String?
311
+ attr_reader ocr_result: Result? # a nested Result, not a raw hash
312
+
313
+ def initialize: ( # every keyword is required; all but data, format and image_index accept nil
314
+ data: String,
315
+ format: String,
316
+ image_index: Integer,
317
+ page_number: Integer?,
318
+ width: Integer?,
319
+ height: Integer?,
320
+ colorspace: String?,
321
+ bits_per_component: Integer?,
322
+ is_mask: bool?,
323
+ description: String?,
324
+ ocr_result: Result?
325
+ ) -> void
326
+ def to_h: () -> image_hash
327
+ end
328
+
329
+ attr_reader content: String
330
+ attr_reader mime_type: String
331
+ attr_reader metadata: Hash[untyped, untyped]
332
+ attr_reader metadata_json: String
333
+ attr_reader tables: Array[Table] # never nil, unlike detected_languages, chunks and images below
334
+ attr_reader detected_languages: Array[String]?
335
+ attr_reader chunks: Array[Chunk]?
336
+ attr_reader images: Array[Image]?
337
+
338
+ def initialize: (extraction_result_hash hash) -> void # built from a raw extraction_result_hash record
339
+ def to_h: () -> Hash[Symbol, untyped]
340
+ def to_json: (*untyped) -> String
341
+
342
+ private
343
+
344
+ def parse_metadata: (String metadata_json) -> Hash[untyped, untyped]
345
+ def parse_tables: (Array[table_hash]? tables_data) -> Array[Table] # accepts nil but always returns an Array
346
+ def parse_detected_languages: (Array[String]? langs_data) -> Array[String]?
347
+ def parse_chunks: (Array[chunk_hash]? chunks_data) -> Array[Chunk]? # NOTE(review): no parse_images signature is declared although images is exposed - confirm against lib/kreuzberg/result.rb
348
+ end
349
+
350
+ # Module methods (extraction API)
351
+ def self.extract_file_sync: ( # path may be a String or Pathname; mime_type and config are optional keywords
351
+ String | Pathname path,
352
+ ?mime_type: String?,
353
+ ?config: config_input?
354
+ ) -> Result
355
+
356
+ def self.extract_bytes_sync: ( # data and mime_type are required positionals; config is an optional keyword
357
+ String data,
358
+ String mime_type,
359
+ ?config: config_input?
360
+ ) -> Result
361
+
362
+ def self.batch_extract_files_sync: ( # takes an Array of paths and returns an Array of Result
363
+ Array[String | Pathname] paths,
364
+ ?config: config_input?
365
+ ) -> Array[Result]
366
+
367
+ def self.batch_extract_bytes_sync: ( # takes two Arrays (data_array, mime_types) - presumably matched by index; confirm in extraction_api.rb
368
+ Array[String] data_array,
369
+ Array[String] mime_types,
370
+ ?config: config_input?
371
+ ) -> Array[Result]
372
+
373
+ def self.extract_file: ( # same signature as extract_file_sync; NOTE(review): whether the non-sync variants run asynchronously is not visible in this signature
374
+ String | Pathname path,
375
+ ?mime_type: String?,
376
+ ?config: config_input?
377
+ ) -> Result
378
+
379
+ def self.extract_bytes: ( # same signature as extract_bytes_sync
380
+ String data,
381
+ String mime_type,
382
+ ?config: config_input?
383
+ ) -> Result
384
+
385
+ def self.batch_extract_files: ( # same signature as batch_extract_files_sync
386
+ Array[String | Pathname] paths,
387
+ ?config: config_input?
388
+ ) -> Array[Result]
389
+
390
+ def self.batch_extract_bytes: ( # same signature as batch_extract_bytes_sync
391
+ Array[String] data_array,
392
+ Array[String] mime_types,
393
+ ?config: config_input?
394
+ ) -> Array[Result]
396
+
397
+ # Cache API
398
+ def self.clear_cache: () -> void # takes no arguments; no meaningful return value
399
+ def self.cache_stats: () -> Hash[Symbol | String, Integer] # keys may be Symbols or Strings; every value is an Integer
400
+
401
+ # Config loading (native method)
402
+ def self._config_from_file_native: (String path) -> Hash[Symbol, untyped] # returns a symbol-keyed Hash for the given path
403
+
404
+ # Error introspection (native methods); ErrorContext declares accessors with matching return types
405
+ def self._last_error_code_native: () -> Integer
406
+ def self._last_panic_context_json_native: () -> String? # may return nil
407
+
408
+ # Plugin registration
409
+ def self.register_post_processor: (String name, _PostProcessor processor, ?stage: Symbol?) -> void # stage is an optional, nil-able Symbol
410
+ def self.unregister_post_processor: (String name) -> void
411
+ def self.clear_post_processors: () -> void
412
+ def self.register_validator: (String name, _Validator validator, ?priority: Integer?) -> void # priority is an optional, nil-able Integer
413
+ def self.unregister_validator: (String name) -> void
414
+ def self.clear_validators: () -> void
415
+ def self.register_ocr_backend: (_OcrBackend backend) -> void # no name argument: _OcrBackend declares its own name method; no unregister/clear counterpart is declared for OCR backends
416
+
417
+ interface _PostProcessor # structural type for register_post_processor: call takes a result record and returns one
418
+ def call: (extraction_result_hash result) -> extraction_result_hash
419
+ end
420
+
421
+ interface _Validator # structural type for register_validator: call takes a result record; its return value is unused (void)
422
+ def call: (extraction_result_hash result) -> void
423
+ end
424
+
425
+ interface _OcrBackend # structural type for register_ocr_backend; NOTE(review): OcrBackendProtocol additionally declares process_image - confirm which methods a backend must implement
426
+ def name: () -> String
427
+ def extract_text: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String
428
+ end
429
+
430
+ module ErrorContext # singleton accessors for the last error code and panic context
431
+ def self.last_error_code: () -> Integer
432
+ def self.last_panic_context: () -> Errors::PanicContext? # may return nil
433
+ def self.last_panic_context_json: () -> String? # may return nil
434
+ end
435
+
436
+ module Errors
437
+ # Panic context information from FFI error introspection: file, line, function, message and timestamp_secs
438
+ class PanicContext
439
+ attr_reader file: String
440
+ attr_reader line: Integer
441
+ attr_reader function: String
442
+ attr_reader message: String
443
+ attr_reader timestamp_secs: Integer
444
+
445
+ def initialize: ( # all five keywords are required and non-nil
446
+ file: String,
447
+ line: Integer,
448
+ function: String,
449
+ message: String,
450
+ timestamp_secs: Integer
451
+ ) -> void
452
+ def to_s: () -> String
453
+ def to_h: () -> Hash[Symbol, String | Integer]
454
+ def self.from_json: (String) -> PanicContext? # takes one unnamed String; may return nil
455
+
456
+ private
457
+
458
+ def self.with_defaults: (Hash[Symbol, untyped] sliced) -> {file: String, line: Integer, function: String, message: String, timestamp_secs: Integer} # NOTE(review): `private` precedes a singleton method here - confirm the modifier applies to `def self.` as intended
459
+ end
460
+
461
+ class Error < StandardError # base class of every error class declared in this module; carries optional panic_context and error_code
462
+ attr_reader panic_context: PanicContext?
463
+ attr_reader error_code: Integer?
464
+
465
+ def initialize: (String message, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
466
+ end
467
+
468
+ class ValidationError < Error
469
+ end
470
+
471
+ class ParsingError < Error # adds an optional context Hash
472
+ attr_reader context: Hash[untyped, untyped]?
473
+
474
+ def initialize: (String message, ?context: Hash[untyped, untyped]?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
475
+ end
476
+
477
+ class OCRError < Error # adds an optional context Hash
478
+ attr_reader context: Hash[untyped, untyped]?
479
+
480
+ def initialize: (String message, ?context: Hash[untyped, untyped]?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
481
+ end
482
+
483
+ class MissingDependencyError < Error # adds an optional dependency String
484
+ attr_reader dependency: String?
485
+
486
+ def initialize: (String message, ?dependency: String?, ?panic_context: PanicContext?, ?error_code: Integer?) -> void
487
+ end
488
+
489
+ class IOError < Error # Kreuzberg::Errors::IOError, which inherits from Error above - distinct from Ruby's top-level ::IOError
490
+ end
491
+
492
+ class PluginError < Error
493
+ end
494
+
495
+ class UnsupportedFormatError < Error
496
+ end
497
+ end
498
+
499
+ # Internal modules (prepended to Kreuzberg singleton)
500
+ # These are not checked by Steep - see Steepfile
501
+ module CacheAPI : Object # declared with an Object self-type constraint; no members are declared in this signature
502
+ end
503
+
504
+ module ExtractionAPI : Object # declared with an Object self-type constraint; no members are declared in this signature
505
+ end
506
+
507
+ module PostProcessorProtocol # declares the same call signature as the _PostProcessor interface
508
+ def call: (extraction_result_hash result) -> extraction_result_hash
509
+ end
510
+
511
+ module ValidatorProtocol # declares the same call signature as the _Validator interface
512
+ def call: (extraction_result_hash result) -> void
513
+ end
514
+
515
+ module OcrBackendProtocol # declares name and extract_text like the _OcrBackend interface, plus process_image
516
+ def name: () -> String
517
+ def extract_text: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String
518
+ def process_image: (String file_path_or_bytes, Hash[Symbol, untyped] config) -> String # NOTE(review): not part of the _OcrBackend interface - confirm whether registered backends must provide it
519
+ end
520
+ end