kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,691 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kreuzberg
4
+ module Config
5
# OCR configuration
#
# Holds the OCR backend name, the language code and an optional nested
# Tesseract configuration.
#
# @example
#   ocr = OCR.new(backend: "tesseract", language: "eng")
#
class OCR
  attr_reader :backend, :language, :tesseract_config

  # @param backend [#to_s] backend name, stored as a String
  # @param language [#to_s] language code, stored as a String
  # @param tesseract_config [Tesseract, Hash, nil] nested Tesseract settings;
  #   a Hash is turned into a Tesseract instance
  # @raise [ArgumentError] when tesseract_config is of any other type
  def initialize(backend: 'tesseract', language: 'eng', tesseract_config: nil)
    @backend = backend.to_s
    @language = language.to_s
    @tesseract_config = normalize_tesseract_config(tesseract_config)
  end

  # @return [Hash] the settings keyed by Symbol; nil entries are dropped
  def to_h
    serialized = { backend: @backend, language: @language, tesseract_config: @tesseract_config&.to_h }
    serialized.reject { |_key, value| value.nil? }
  end

  private

  # Accepts nil or a Tesseract unchanged and builds a Tesseract from a Hash
  # (keys are symbolised first); anything else is rejected.
  def normalize_tesseract_config(value)
    case value
    when nil, Tesseract
      value
    when Hash
      Tesseract.new(**value.transform_keys(&:to_sym))
    else
      raise ArgumentError, "Expected #{Tesseract}, Hash, or nil, got #{value.class}"
    end
  end
end
41
+
42
# Tesseract OCR engine configuration
#
# Stores arbitrary Tesseract options under Symbol keys. A :preprocessing
# option supplied as a Hash is converted into an ImagePreprocessing instance.
class Tesseract
  attr_reader :options

  # @param options [Hash] Tesseract options; keys are converted to Symbols
  # @raise [ArgumentError] when :preprocessing is neither nil, a Hash nor an
  #   ImagePreprocessing
  def initialize(**options)
    @options = options.transform_keys(&:to_sym)
    normalize_nested_preprocessing!
  end

  # Serialises the options into plain data.
  #
  # The nested ImagePreprocessing object (if any) is converted with its own
  # #to_h so the result contains only Hashes and scalars, matching how the
  # other configuration classes serialise their nested objects. The stored
  # options are not modified.
  #
  # @return [Hash] a copy of the options
  def to_h
    serialized = @options.dup
    preprocessing = serialized[:preprocessing]
    serialized[:preprocessing] = preprocessing.to_h if preprocessing.is_a?(ImagePreprocessing)
    serialized
  end

  private

  # Replaces a Hash under :preprocessing with an ImagePreprocessing built
  # from it; leaves nil and existing ImagePreprocessing values untouched.
  def normalize_nested_preprocessing!
    preprocessing = @options[:preprocessing]

    case preprocessing
    when nil, ImagePreprocessing
      nil
    when Hash
      @options[:preprocessing] = ImagePreprocessing.new(**preprocessing.transform_keys(&:to_sym))
    else
      raise ArgumentError, "preprocessing must be #{ImagePreprocessing} or Hash"
    end
  end
end
67
+
68
# Chunking configuration
#
# chunk_size / chunk_overlap are alternative names for max_chars /
# max_overlap and take precedence when both are given.
#
# @example
#   chunking = Chunking.new(max_chars: 1000, max_overlap: 200)
#
class Chunking
  attr_reader :max_chars, :max_overlap, :preset, :embedding, :enabled

  # @param max_chars [#to_i, nil] chunk size; falls back to 1000
  # @param max_overlap [#to_i, nil] chunk overlap; falls back to 200
  # @param preset [#to_s, nil] preset name, stored as a String
  # @param embedding [Embedding, Hash, nil] nested embedding settings
  # @param chunk_size [#to_i, nil] overrides max_chars when given
  # @param chunk_overlap [#to_i, nil] overrides max_overlap when given
  # @param enabled [Object, nil] coerced to true/false; nil is kept as nil
  # @raise [ArgumentError] when embedding is of an unsupported type
  def initialize(max_chars: nil, max_overlap: nil, preset: nil, embedding: nil,
                 chunk_size: nil, chunk_overlap: nil, enabled: true)
    @max_chars = (chunk_size || max_chars || 1000).to_i
    @max_overlap = (chunk_overlap || max_overlap || 200).to_i
    @preset = preset&.to_s
    @embedding = normalize_embedding(embedding)
    @enabled = boolean_or_nil(enabled)
  end

  # @return [Hash] the settings keyed by Symbol; nil entries are omitted
  def to_h
    serialized = {
      max_chars: @max_chars,
      max_overlap: @max_overlap,
      preset: @preset,
      embedding: @embedding&.to_h,
      enabled: @enabled
    }
    # @type var serialized: Hash[Symbol, untyped]
    serialized.reject { |_key, value| value.nil? }
  end

  private

  # Accepts nil or an Embedding unchanged and builds an Embedding from a Hash.
  def normalize_embedding(value)
    case value
    when nil, Embedding
      value
    when Hash
      Embedding.new(**value.transform_keys(&:to_sym))
    else
      raise ArgumentError, "Expected #{Embedding}, Hash, or nil, got #{value.class}"
    end
  end

  # Maps nil to nil, false to false and every other value to true.
  def boolean_or_nil(value)
    case value
    when nil then nil
    when false then false
    else true
    end
  end
end
123
+
124
# Embedding model configuration for document chunking
class Embedding
  attr_reader :model, :normalize, :batch_size, :show_download_progress, :cache_dir

  # @param model [#to_h] description of the embedding model; its keys are
  #   converted to Symbols
  # @param normalize [Object, nil] coerced to true/false; nil is kept
  # @param batch_size [#to_i, nil] stored as an Integer when given
  # @param show_download_progress [Object, nil] coerced to true/false; nil is kept
  # @param cache_dir [#to_s, nil] stored as a String when given
  # @raise [ArgumentError] when model cannot be turned into a Hash
  def initialize(model: { type: :preset, name: 'balanced' }, normalize: true, batch_size: 32,
                 show_download_progress: false, cache_dir: nil)
    @model = normalize_model(model)
    @normalize = boolean_or_nil(normalize)
    @batch_size = batch_size&.to_i
    @show_download_progress = boolean_or_nil(show_download_progress)
    @cache_dir = cache_dir&.to_s
  end

  # @return [Hash] the settings keyed by Symbol; nil entries are omitted
  def to_h
    serialized = {
      model: @model,
      normalize: @normalize,
      batch_size: @batch_size,
      show_download_progress: @show_download_progress,
      cache_dir: @cache_dir
    }
    serialized.reject { |_key, value| value.nil? }
  end

  private

  # Converts the model via #to_h when available and symbolises its keys.
  def normalize_model(model)
    candidate = model.respond_to?(:to_h) ? model.to_h : model
    unless candidate.is_a?(Hash)
      raise ArgumentError, 'model must be a Hash describing the embedding model'
    end

    candidate.transform_keys(&:to_sym)
  end

  # Maps nil to nil, false to false and every other value to true.
  def boolean_or_nil(value)
    case value
    when nil then nil
    when false then false
    else true
    end
  end
end
171
+
172
# Language detection configuration
#
# @example
#   lang = LanguageDetection.new(enabled: true, min_confidence: 0.8)
#
class LanguageDetection
  attr_reader :enabled, :min_confidence, :detect_multiple

  # @param enabled [Object] coerced to true/false
  # @param min_confidence [#to_f] stored as a Float
  # @param detect_multiple [Object] coerced to true/false
  def initialize(enabled: false, min_confidence: 0.5, detect_multiple: false)
    @enabled, @detect_multiple = [enabled, detect_multiple].map { |flag| flag ? true : false }
    @min_confidence = min_confidence.to_f
  end

  # @return [Hash] all three settings keyed by Symbol
  def to_h
    { enabled: @enabled, min_confidence: @min_confidence, detect_multiple: @detect_multiple }
  end
end
194
+
195
# PDF-specific options
#
# @example
#   pdf = PDF.new(extract_images: true, passwords: ["secret", "backup"])
#
class PDF
  attr_reader :extract_images, :passwords, :extract_metadata

  # @param extract_images [Object] coerced to true/false
  # @param passwords [Array, Object, nil] an Array is mapped to Strings, a
  #   single truthy value becomes a one-element Array, nil/false become nil
  # @param extract_metadata [Object] coerced to true/false
  def initialize(extract_images: false, passwords: nil, extract_metadata: true)
    @extract_images = extract_images ? true : false
    @passwords =
      case passwords
      when Array then passwords.map(&:to_s)
      when nil, false then nil
      else [passwords.to_s]
      end
    @extract_metadata = extract_metadata ? true : false
  end

  # @return [Hash] the settings keyed by Symbol; :passwords is omitted when nil
  def to_h
    serialized = {
      extract_images: @extract_images,
      passwords: @passwords,
      extract_metadata: @extract_metadata
    }
    serialized.reject { |_key, value| value.nil? }
  end
end
225
+
226
# Image extraction configuration
#
# @example
#   image = ImageExtraction.new(extract_images: true, target_dpi: 300)
#
# @example With auto-adjust DPI
#   image = ImageExtraction.new(
#     extract_images: true,
#     auto_adjust_dpi: true,
#     min_dpi: 150,
#     max_dpi: 600
#   )
#
class ImageExtraction
  attr_reader :extract_images, :target_dpi, :max_image_dimension,
              :auto_adjust_dpi, :min_dpi, :max_dpi

  # Boolean settings are coerced to true/false; numeric settings go through #to_i.
  #
  # @param extract_images [Object]
  # @param target_dpi [#to_i]
  # @param max_image_dimension [#to_i]
  # @param auto_adjust_dpi [Object]
  # @param min_dpi [#to_i]
  # @param max_dpi [#to_i]
  def initialize(extract_images: true, target_dpi: 300, max_image_dimension: 2000,
                 auto_adjust_dpi: true, min_dpi: 150, max_dpi: 600)
    @extract_images = extract_images ? true : false
    @auto_adjust_dpi = auto_adjust_dpi ? true : false

    @target_dpi = target_dpi.to_i
    @max_image_dimension = max_image_dimension.to_i
    @min_dpi = min_dpi.to_i
    @max_dpi = max_dpi.to_i
  end

  # @return [Hash] all six settings keyed by Symbol
  def to_h
    {
      extract_images: @extract_images, target_dpi: @target_dpi,
      max_image_dimension: @max_image_dimension, auto_adjust_dpi: @auto_adjust_dpi,
      min_dpi: @min_dpi, max_dpi: @max_dpi
    }
  end
end
270
+
271
# Image preprocessing configuration for OCR
#
# @example Basic preprocessing
#   preprocessing = ImagePreprocessing.new(
#     binarization_method: "otsu",
#     denoise: true
#   )
#
# @example Advanced preprocessing
#   preprocessing = ImagePreprocessing.new(
#     target_dpi: 600,
#     auto_rotate: true,
#     deskew: true,
#     denoise: true,
#     contrast_enhance: true,
#     binarization_method: "sauvola",
#     invert_colors: false
#   )
#
class ImagePreprocessing
  attr_reader :target_dpi, :auto_rotate, :deskew, :denoise,
              :contrast_enhance, :binarization_method, :invert_colors

  # Flags are coerced to true/false, target_dpi via #to_i and
  # binarization_method via #to_s.
  #
  # @raise [ArgumentError] when binarization_method is not one of
  #   "otsu", "sauvola" or "adaptive"
  def initialize(target_dpi: 300, auto_rotate: true, deskew: true, denoise: false,
                 contrast_enhance: true, binarization_method: 'otsu', invert_colors: false)
    @target_dpi = target_dpi.to_i
    @auto_rotate = auto_rotate ? true : false
    @deskew = deskew ? true : false
    @denoise = denoise ? true : false
    @contrast_enhance = contrast_enhance ? true : false
    @binarization_method = binarization_method.to_s
    @invert_colors = invert_colors ? true : false

    # Validation runs last, after every attribute has been assigned.
    allowed = %w[otsu sauvola adaptive]
    unless allowed.include?(@binarization_method)
      raise ArgumentError, "binarization_method must be one of: #{allowed.join(', ')}"
    end
  end

  # @return [Hash] all seven settings keyed by Symbol
  def to_h
    {
      target_dpi: @target_dpi, auto_rotate: @auto_rotate, deskew: @deskew, denoise: @denoise,
      contrast_enhance: @contrast_enhance, binarization_method: @binarization_method,
      invert_colors: @invert_colors
    }
  end
end
329
+
330
# Token reduction configuration
#
# @example Disable token reduction
#   token = TokenReduction.new(mode: "off")
#
# @example Light reduction
#   token = TokenReduction.new(mode: "light", preserve_important_words: true)
#
# @example Aggressive reduction
#   token = TokenReduction.new(mode: "aggressive", preserve_important_words: false)
#
class TokenReduction
  attr_reader :mode, :preserve_important_words

  # @param mode [#to_s] one of "off", "light", "moderate", "aggressive", "maximum"
  # @param preserve_important_words [Object] coerced to true/false
  # @raise [ArgumentError] when mode is not one of the accepted values
  def initialize(mode: 'off', preserve_important_words: true)
    @mode = mode.to_s
    @preserve_important_words = preserve_important_words ? true : false

    allowed = %w[off light moderate aggressive maximum]
    raise ArgumentError, "mode must be one of: #{allowed.join(', ')}" unless allowed.include?(@mode)
  end

  # @return [Hash] both settings keyed by Symbol
  def to_h
    { mode: @mode, preserve_important_words: @preserve_important_words }
  end
end
361
+
362
# HTML preprocessing configuration for content extraction
#
# Every setting is optional; settings left as nil are omitted from #to_h.
class HtmlPreprocessing
  attr_reader :enabled, :preset, :remove_navigation, :remove_forms

  # @param enabled [Object, nil] coerced to true/false; nil is kept
  # @param preset [#to_sym, nil] stored as a Symbol when given
  # @param remove_navigation [Object, nil] coerced to true/false; nil is kept
  # @param remove_forms [Object, nil] coerced to true/false; nil is kept
  def initialize(enabled: nil, preset: nil, remove_navigation: nil, remove_forms: nil)
    @enabled = boolean_or_nil(enabled)
    @preset = preset.nil? ? nil : preset.to_sym
    @remove_navigation = boolean_or_nil(remove_navigation)
    @remove_forms = boolean_or_nil(remove_forms)
  end

  # @return [Hash] the non-nil settings keyed by Symbol
  def to_h
    serialized = {
      enabled: @enabled,
      preset: @preset,
      remove_navigation: @remove_navigation,
      remove_forms: @remove_forms
    }
    serialized.reject { |_key, value| value.nil? }
  end

  private

  # Maps nil to nil, false to false and every other value to true.
  def boolean_or_nil(value)
    case value
    when nil then nil
    when false then false
    else true
    end
  end
end
390
+
391
# HTML rendering options for document conversion
#
# Stores arbitrary options under Symbol keys. A fixed set of style options is
# coerced to Symbols, and a :preprocessing Hash is converted into an
# HtmlPreprocessing instance.
class HtmlOptions
  # Options whose values are converted with #to_sym when present.
  SYMBOL_OPTION_KEYS = %i[
    heading_style
    code_block_style
    highlight_style
    list_indent_type
    newline_style
    whitespace_mode
  ].freeze
  private_constant :SYMBOL_OPTION_KEYS

  attr_reader :options

  # @param options [Hash] rendering options; keys are converted to Symbols
  def initialize(**options)
    normalized = options.transform_keys(&:to_sym)

    SYMBOL_OPTION_KEYS.each do |key|
      normalized[key] = normalized[key]&.to_sym if normalized.key?(key)
    end

    preprocessing = normalized[:preprocessing]
    if preprocessing.is_a?(Hash)
      # Symbolise nested keys so string-keyed hashes (e.g. from parsed
      # JSON/YAML) are accepted like everywhere else in this file.
      normalized[:preprocessing] = HtmlPreprocessing.new(**preprocessing.transform_keys(&:to_sym))
    end

    @options = normalized
  end

  # Serialises the options into plain data.
  #
  # Nested configuration objects are converted with their own #to_h. nil,
  # Array and Hash values are passed through untouched: calling #to_h on them
  # would turn nil into {} and raise TypeError for a list-valued option such
  # as ["script"].
  #
  # @return [Hash] a new Hash keyed by Symbol
  def to_h
    @options.transform_values { |value| serialize_option(value) }
  end

  private

  # Converts a single option value for #to_h.
  def serialize_option(value)
    case value
    when nil, Array, Hash
      value
    else
      value.respond_to?(:to_h) ? value.to_h : value
    end
  end
end
418
+
419
# YAKE keyword extraction parameters
class KeywordYakeParams
  attr_reader :window_size

  # @param window_size [#to_i] stored as an Integer
  def initialize(window_size: 2)
    @window_size = window_size.to_i
  end

  # @return [Hash] the single :window_size setting
  def to_h
    serialized = {}
    serialized[:window_size] = @window_size
    serialized
  end
end
431
+
432
# RAKE keyword extraction parameters
class KeywordRakeParams
  attr_reader :min_word_length, :max_words_per_phrase

  # @param min_word_length [#to_i] stored as an Integer
  # @param max_words_per_phrase [#to_i] stored as an Integer
  def initialize(min_word_length: 1, max_words_per_phrase: 3)
    @min_word_length, @max_words_per_phrase = [min_word_length, max_words_per_phrase].map(&:to_i)
  end

  # @return [Hash] both settings keyed by Symbol
  def to_h
    { min_word_length: @min_word_length, max_words_per_phrase: @max_words_per_phrase }
  end
end
448
+
449
# Keyword extraction configuration for document analysis
#
# Every setting is optional; settings left as nil are omitted from #to_h.
class Keywords
  attr_reader :algorithm, :max_keywords, :min_score, :ngram_range,
              :language, :yake_params, :rake_params

  # @param algorithm [#to_s, nil] stored as a String when given
  # @param max_keywords [#to_i, nil] stored as an Integer when given
  # @param min_score [#to_f, nil] stored as a Float when given
  # @param ngram_range [#map, nil] each element is converted with #to_i
  # @param language [#to_s, nil] stored as a String when given
  # @param yake_params [KeywordYakeParams, Hash, nil] nested YAKE settings
  # @param rake_params [KeywordRakeParams, Hash, nil] nested RAKE settings
  # @raise [ArgumentError] when a nested parameter is of an unsupported type
  def initialize(algorithm: nil, max_keywords: nil, min_score: nil, ngram_range: nil,
                 language: nil, yake_params: nil, rake_params: nil)
    @algorithm = algorithm&.to_s
    @max_keywords = max_keywords&.to_i
    @min_score = min_score&.to_f
    @ngram_range = ngram_range&.map(&:to_i)
    @language = language&.to_s
    @yake_params = normalize_nested(yake_params, KeywordYakeParams)
    @rake_params = normalize_nested(rake_params, KeywordRakeParams)
  end

  # @return [Hash] the non-nil settings keyed by Symbol, nested parameter
  #   objects serialised via their own #to_h
  def to_h
    serialized = {
      algorithm: @algorithm,
      max_keywords: @max_keywords,
      min_score: @min_score,
      ngram_range: @ngram_range,
      language: @language,
      yake_params: @yake_params&.to_h,
      rake_params: @rake_params&.to_h
    }
    serialized.reject { |_key, value| value.nil? }
  end

  private

  # Accepts nil or an instance of klass unchanged and builds a klass instance
  # from a Hash (keys are symbolised first); anything else is rejected.
  def normalize_nested(value, klass)
    case value
    when nil, klass
      value
    when Hash
      klass.new(**value.transform_keys(&:to_sym))
    else
      raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
    end
  end
end
494
+
495
# Post-processor configuration
#
# +enabled+ is always stored as a strict boolean; the two processor lists are
# optional and, when given, have each entry converted to a String.
#
# @example Enable all post-processors
#   postprocessor = PostProcessor.new(enabled: true)
#
# @example Enable specific processors
#   postprocessor = PostProcessor.new(
#     enabled: true,
#     enabled_processors: ["quality", "formatting"]
#   )
#
# @example Disable specific processors
#   postprocessor = PostProcessor.new(
#     enabled: true,
#     disabled_processors: ["token_reduction"]
#   )
#
class PostProcessor
  attr_reader :enabled, :enabled_processors, :disabled_processors

  # @param enabled [Object] any truthy value enables post-processing
  # @param enabled_processors [Array<#to_s>, nil]
  # @param disabled_processors [Array<#to_s>, nil]
  def initialize(
    enabled: true,
    enabled_processors: nil,
    disabled_processors: nil
  )
    @enabled = enabled ? true : false
    @enabled_processors = stringify_names(enabled_processors)
    @disabled_processors = stringify_names(disabled_processors)
  end

  # Serialize to a Hash; processor lists that were not given are left out.
  #
  # @return [Hash{Symbol => Object}]
  def to_h
    serialized = { enabled: @enabled }
    serialized[:enabled_processors] = @enabled_processors unless @enabled_processors.nil?
    serialized[:disabled_processors] = @disabled_processors unless @disabled_processors.nil?
    serialized
  end

  private

  # Map every entry of +names+ to a String, keeping nil as nil.
  def stringify_names(names)
    names&.map(&:to_s)
  end
end
533
+
534
+ # Main extraction configuration
535
+ #
536
+ # @example Basic usage
537
+ # config = Extraction.new(use_cache: true, force_ocr: true)
538
+ #
539
+ # @example With OCR
540
+ # ocr = Config::OCR.new(backend: "tesseract", language: "eng")
541
+ # config = Extraction.new(ocr: ocr)
542
+ #
543
+ # @example With image extraction
544
+ # image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
545
+ # config = Extraction.new(image_extraction: image)
546
+ #
547
+ # @example With preprocessing
548
+ # preprocessing = Config::ImagePreprocessing.new(
549
+ # binarization_method: "sauvola",
550
+ # denoise: true
551
+ # )
552
+ # config = Extraction.new(image_preprocessing: preprocessing)
553
+ #
554
+ # @example With post-processing
555
+ # postprocessor = Config::PostProcessor.new(
556
+ # enabled: true,
557
+ # enabled_processors: ["quality"]
558
+ # )
559
+ # config = Extraction.new(postprocessor: postprocessor)
560
+ #
561
+ # @example With all options
562
+ # config = Extraction.new(
563
+ # use_cache: true,
564
+ # enable_quality_processing: true,
565
+ # force_ocr: false,
566
+ # ocr: Config::OCR.new(language: "deu"),
567
+ # chunking: Config::Chunking.new(max_chars: 500),
568
+ # language_detection: Config::LanguageDetection.new(enabled: true),
569
+ # pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
570
+ # image_extraction: Config::ImageExtraction.new(target_dpi: 600),
571
+ # image_preprocessing: Config::ImagePreprocessing.new(denoise: true),
572
+ # postprocessor: Config::PostProcessor.new(enabled: true)
573
+ # )
574
+ #
575
class Extraction
  attr_reader :use_cache, :enable_quality_processing, :force_ocr,
              :ocr, :chunking, :language_detection, :pdf_options,
              :image_extraction, :image_preprocessing, :postprocessor,
              :token_reduction, :keywords, :html_options,
              :max_concurrent_extractions

  # Load configuration from a file.
  #
  # The file is read by the native loader, which detects the format from the
  # extension (.toml, .yaml, .json). The resulting hash is passed to the
  # constructor with its top-level keys symbolized.
  #
  # @param path [String] Path to the configuration file
  # @return [Kreuzberg::Config::Extraction] Loaded configuration object
  #
  # @example Load from TOML
  #   config = Kreuzberg::Config::Extraction.from_file("config.toml")
  #
  # @example Load from YAML
  #   config = Kreuzberg::Config::Extraction.from_file("config.yaml")
  #
  def self.from_file(path)
    loaded = Kreuzberg._config_from_file_native(path)
    new(**loaded.transform_keys(&:to_sym))
  end

  # Discover configuration file in current or parent directories.
  #
  # Searches for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json in the
  # current directory and parent directories.
  #
  # @return [Kreuzberg::Config::Extraction, nil] Loaded configuration object
  #   or nil if not found
  #
  # @example
  #   config = Kreuzberg::Config::Extraction.discover
  #   if config
  #     # Use discovered config
  #   end
  #
  def self.discover
    found = Kreuzberg._config_discover_native
    # Keyword arguments need symbol keys; a nil result means "nothing found".
    new(**found.transform_keys(&:to_sym)) unless found.nil?
  end

  # Build a configuration. The three flags are stored as strict booleans and
  # each nested section may be given as its config object, a Hash, or nil.
  #
  # @raise [ArgumentError] if a nested section is neither the expected class,
  #   a Hash, nor nil
  def initialize(
    use_cache: true,
    enable_quality_processing: false,
    force_ocr: false,
    ocr: nil,
    chunking: nil,
    language_detection: nil,
    pdf_options: nil,
    image_extraction: nil,
    image_preprocessing: nil,
    postprocessor: nil,
    token_reduction: nil,
    keywords: nil,
    html_options: nil,
    max_concurrent_extractions: nil
  )
    @use_cache = to_bool(use_cache)
    @enable_quality_processing = to_bool(enable_quality_processing)
    @force_ocr = to_bool(force_ocr)

    @ocr = normalize_config(ocr, OCR)
    @chunking = normalize_config(chunking, Chunking)
    @language_detection = normalize_config(language_detection, LanguageDetection)
    @pdf_options = normalize_config(pdf_options, PDF)
    @image_extraction = normalize_config(image_extraction, ImageExtraction)
    @image_preprocessing = normalize_config(image_preprocessing, ImagePreprocessing)
    @postprocessor = normalize_config(postprocessor, PostProcessor)
    @token_reduction = normalize_config(token_reduction, TokenReduction)
    @keywords = normalize_config(keywords, Keywords)
    @html_options = normalize_config(html_options, HtmlOptions)

    @max_concurrent_extractions = max_concurrent_extractions&.to_i
  end

  # Serialize to a Hash. Nested sections are converted with their own #to_h
  # and every entry whose value is nil is left out.
  #
  # @return [Hash{Symbol => Object}]
  def to_h
    flags = {
      use_cache: @use_cache,
      enable_quality_processing: @enable_quality_processing,
      force_ocr: @force_ocr
    }
    sections = {
      ocr: @ocr,
      chunking: @chunking,
      language_detection: @language_detection,
      pdf_options: @pdf_options,
      image_extraction: @image_extraction,
      image_preprocessing: @image_preprocessing,
      postprocessor: @postprocessor,
      token_reduction: @token_reduction,
      keywords: @keywords,
      html_options: @html_options
    }.transform_values { |section| section&.to_h }

    flags
      .merge(sections)
      .merge(max_concurrent_extractions: @max_concurrent_extractions)
      .compact
  end

  private

  # Collapse any truthy/falsy value to a strict true/false.
  def to_bool(value)
    value ? true : false
  end

  # Coerce +value+ into an instance of +klass+: nil and existing instances
  # are returned as-is, a Hash is splatted into +klass.new+ with symbol keys.
  def normalize_config(value, klass)
    if value.nil? || value.is_a?(klass)
      value
    elsif value.is_a?(Hash)
      klass.new(**value.transform_keys(&:to_sym))
    else
      raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
    end
  end
end
687
+
688
+ # Backwards compatibility aliases
689
+ Ocr = OCR
690
+ end
691
+ end
@@ -0,0 +1,32 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
module Kreuzberg
  # ErrorContext exposes the FFI error introspection functions: the last
  # error code and the panic context recorded for the last error.
  #
  # Every accessor is best-effort: any StandardError raised while querying
  # the native layer is swallowed and a neutral fallback is returned.
  module ErrorContext
    # @return [Object] result of Kreuzberg._last_error_code_native, or 0 if
    #   that call raises a StandardError
    def self.last_error_code
      Kreuzberg._last_error_code_native
    rescue StandardError
      0
    end

    # @return [Object, nil] the value built by Errors::PanicContext.from_json
    #   from the native JSON, or nil when there is no JSON or anything raises
    def self.last_panic_context
      raw_json = Kreuzberg._last_panic_context_json_native
      raw_json ? Errors::PanicContext.from_json(raw_json) : nil
    rescue StandardError
      nil
    end

    # @return [Object, nil] result of
    #   Kreuzberg._last_panic_context_json_native, or nil if that call raises
    def self.last_panic_context_json
      Kreuzberg._last_panic_context_json_native
    rescue StandardError
      nil
    end
  end
end