kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,2998 @@
1
+ //! Kreuzberg Ruby Bindings (Magnus)
2
+ //!
3
+ //! High-performance document intelligence framework bindings for Ruby.
4
+ //! Provides extraction, OCR, chunking, and language detection for 30+ file formats.
5
+
6
+ use html_to_markdown_rs::options::{
7
+ CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, PreprocessingPreset,
8
+ WhitespaceMode,
9
+ };
10
+ use kreuzberg::keywords::{
11
+ KeywordAlgorithm as RustKeywordAlgorithm, KeywordConfig as RustKeywordConfig, RakeParams as RustRakeParams,
12
+ YakeParams as RustYakeParams,
13
+ };
14
+ use kreuzberg::types::TesseractConfig as RustTesseractConfig;
15
+ use kreuzberg::{
16
+ ChunkingConfig, EmbeddingConfig, ExtractionConfig, ExtractionResult as RustExtractionResult, ImageExtractionConfig,
17
+ ImagePreprocessingConfig, KreuzbergError, LanguageDetectionConfig, OcrConfig, PdfConfig, PostProcessorConfig,
18
+ TokenReductionConfig,
19
+ };
20
+ use magnus::exception::ExceptionClass;
21
+ use magnus::r_hash::ForEach;
22
+ use magnus::value::ReprValue;
23
+ use magnus::{Error, IntoValue, RArray, RHash, Ruby, Symbol, TryConvert, Value, function, scan_args::scan_args};
24
+ use std::fs;
25
+ use std::path::{Path, PathBuf};
26
+
27
+ /// Keeps Ruby values alive across plugin registrations by informing the GC.
28
+ struct GcGuardedValue {
29
+ value: Value,
30
+ }
31
+
32
+ impl GcGuardedValue {
33
+ fn new(value: Value) -> Self {
34
+ let ruby = Ruby::get().expect("Ruby not initialized");
35
+ ruby.gc_register_address(&value);
36
+ Self { value }
37
+ }
38
+
39
+ fn value(&self) -> Value {
40
+ self.value
41
+ }
42
+ }
43
+
44
+ impl Drop for GcGuardedValue {
45
+ fn drop(&mut self) {
46
+ if let Ok(ruby) = Ruby::get() {
47
+ ruby.gc_unregister_address(&self.value);
48
+ }
49
+ }
50
+ }
51
+
52
// Foreign functions supplied by the Kreuzberg FFI layer.
unsafe extern "C" {
    /// NOTE(review): presumably the code of the most recent FFI error — confirm
    /// against the FFI crate.
    fn kreuzberg_last_error_code() -> i32;
    /// May return a null pointer; `get_panic_context` treats null as "no context".
    fn kreuzberg_last_panic_context() -> *const std::ffi::c_char;
    /// NOTE(review): presumably releases a string allocated by the FFI layer —
    /// confirm ownership rules against the FFI crate.
    fn kreuzberg_free_string(ptr: *mut std::ffi::c_char);
}
57
+
58
+ /// Retrieve panic context from FFI if available
59
+ fn get_panic_context() -> Option<String> {
60
+ unsafe {
61
+ let ctx_ptr = kreuzberg_last_panic_context();
62
+ if ctx_ptr.is_null() {
63
+ return None;
64
+ }
65
+
66
+ let c_str = std::ffi::CStr::from_ptr(ctx_ptr);
67
+ let context = c_str.to_string_lossy().to_string();
68
+ kreuzberg_free_string(ctx_ptr as *mut std::ffi::c_char);
69
+
70
+ if context.is_empty() { None } else { Some(context) }
71
+ }
72
+ }
73
+
74
+ /// Retrieve error code from FFI
75
+ fn get_error_code() -> i32 {
76
+ unsafe { kreuzberg_last_error_code() }
77
+ }
78
+
79
+ /// Convert Kreuzberg errors to Ruby exceptions
80
+ fn kreuzberg_error(err: KreuzbergError) -> Error {
81
+ let ruby = Ruby::get().expect("Ruby not initialized");
82
+
83
+ let fetch_error_class = |name: &str| -> Option<ExceptionClass> {
84
+ ruby.eval::<ExceptionClass>(&format!("Kreuzberg::Errors::{}", name))
85
+ .ok()
86
+ };
87
+
88
+ match err {
89
+ KreuzbergError::Validation { message, .. } => {
90
+ if let Some(class) = fetch_error_class("ValidationError") {
91
+ Error::new(class, message)
92
+ } else {
93
+ Error::new(ruby.exception_arg_error(), message)
94
+ }
95
+ }
96
+ KreuzbergError::Parsing { message, .. } => {
97
+ if let Some(class) = fetch_error_class("ParsingError") {
98
+ Error::new(class, message)
99
+ } else {
100
+ Error::new(ruby.exception_runtime_error(), format!("ParsingError: {}", message))
101
+ }
102
+ }
103
+ KreuzbergError::Ocr { message, .. } => {
104
+ if let Some(class) = fetch_error_class("OCRError") {
105
+ Error::new(class, message)
106
+ } else {
107
+ Error::new(ruby.exception_runtime_error(), format!("OCRError: {}", message))
108
+ }
109
+ }
110
+ KreuzbergError::MissingDependency(message) => {
111
+ if let Some(class) = fetch_error_class("MissingDependencyError") {
112
+ Error::new(class, message)
113
+ } else {
114
+ Error::new(
115
+ ruby.exception_runtime_error(),
116
+ format!("MissingDependencyError: {}", message),
117
+ )
118
+ }
119
+ }
120
+ KreuzbergError::Plugin { message, plugin_name } => {
121
+ if let Some(class) = fetch_error_class("PluginError") {
122
+ Error::new(class, format!("{}: {}", plugin_name, message))
123
+ } else {
124
+ Error::new(
125
+ ruby.exception_runtime_error(),
126
+ format!("Plugin error in '{}': {}", plugin_name, message),
127
+ )
128
+ }
129
+ }
130
+ KreuzbergError::Io(err) => {
131
+ if let Some(class) = fetch_error_class("IOError") {
132
+ Error::new(class, err.to_string())
133
+ } else {
134
+ Error::new(ruby.exception_runtime_error(), format!("IO error: {}", err))
135
+ }
136
+ }
137
+ KreuzbergError::UnsupportedFormat(message) => {
138
+ if let Some(class) = fetch_error_class("UnsupportedFormatError") {
139
+ Error::new(class, message)
140
+ } else {
141
+ Error::new(
142
+ ruby.exception_runtime_error(),
143
+ format!("UnsupportedFormatError: {}", message),
144
+ )
145
+ }
146
+ }
147
+ other => Error::new(ruby.exception_runtime_error(), other.to_string()),
148
+ }
149
+ }
150
+
151
+ fn runtime_error(message: impl Into<String>) -> Error {
152
+ let ruby = Ruby::get().expect("Ruby not initialized");
153
+ Error::new(ruby.exception_runtime_error(), message.into())
154
+ }
155
+
156
+ /// Convert Ruby Symbol or String to Rust String
157
+ fn symbol_to_string(value: Value) -> Result<String, Error> {
158
+ if let Some(symbol) = Symbol::from_value(value) {
159
+ Ok(symbol.name()?.to_string())
160
+ } else {
161
+ String::try_convert(value)
162
+ }
163
+ }
164
+
165
+ /// Get keyword argument from hash (supports both symbol and string keys)
166
+ fn get_kw(ruby: &Ruby, hash: RHash, name: &str) -> Option<Value> {
167
+ hash.get(name).or_else(|| {
168
+ let sym = ruby.intern(name);
169
+ hash.get(sym)
170
+ })
171
+ }
172
+
173
+ fn set_hash_entry(_ruby: &Ruby, hash: &RHash, key: &str, value: Value) -> Result<(), Error> {
174
+ hash.aset(key, value)?;
175
+ Ok(())
176
+ }
177
+
178
+ fn ocr_config_to_ruby_hash(ruby: &Ruby, config: &kreuzberg::OcrConfig) -> Result<RHash, Error> {
179
+ let value =
180
+ serde_json::to_value(config).map_err(|e| runtime_error(format!("Failed to serialize OCR config: {}", e)))?;
181
+ let ruby_value = json_value_to_ruby(ruby, &value)?;
182
+ RHash::try_convert(ruby_value).map_err(|_| runtime_error("OCR config must return a Hash"))
183
+ }
184
+
185
+ fn cache_root_dir() -> Result<PathBuf, Error> {
186
+ std::env::current_dir()
187
+ .map(|dir| dir.join(".kreuzberg"))
188
+ .map_err(|e| runtime_error(format!("Failed to get current directory: {}", e)))
189
+ }
190
+
191
+ fn cache_directories(root: &Path) -> Result<Vec<PathBuf>, Error> {
192
+ if !root.exists() {
193
+ return Ok(vec![]);
194
+ }
195
+
196
+ let mut dirs = vec![root.to_path_buf()];
197
+ let entries = fs::read_dir(root).map_err(|e| runtime_error(format!("Failed to read cache root: {}", e)))?;
198
+
199
+ for entry in entries {
200
+ let entry = entry.map_err(|e| runtime_error(format!("Failed to read cache directory entry: {}", e)))?;
201
+ if entry
202
+ .file_type()
203
+ .map_err(|e| runtime_error(format!("Failed to determine cache entry type: {}", e)))?
204
+ .is_dir()
205
+ {
206
+ dirs.push(entry.path());
207
+ }
208
+ }
209
+
210
+ Ok(dirs)
211
+ }
212
+
213
+ fn json_value_to_ruby(ruby: &Ruby, value: &serde_json::Value) -> Result<Value, Error> {
214
+ Ok(match value {
215
+ serde_json::Value::Null => ruby.qnil().as_value(),
216
+ serde_json::Value::Bool(b) => {
217
+ if *b {
218
+ ruby.qtrue().as_value()
219
+ } else {
220
+ ruby.qfalse().as_value()
221
+ }
222
+ }
223
+ serde_json::Value::Number(num) => {
224
+ if let Some(i) = num.as_i64() {
225
+ ruby.integer_from_i64(i).into_value_with(ruby)
226
+ } else if let Some(u) = num.as_u64() {
227
+ ruby.integer_from_u64(u).into_value_with(ruby)
228
+ } else if let Some(f) = num.as_f64() {
229
+ ruby.float_from_f64(f).into_value_with(ruby)
230
+ } else {
231
+ ruby.qnil().as_value()
232
+ }
233
+ }
234
+ serde_json::Value::String(s) => ruby.str_new(s).into_value_with(ruby),
235
+ serde_json::Value::Array(items) => {
236
+ let ary = ruby.ary_new();
237
+ for item in items {
238
+ ary.push(json_value_to_ruby(ruby, item)?)?;
239
+ }
240
+ ary.into_value_with(ruby)
241
+ }
242
+ serde_json::Value::Object(map) => {
243
+ let hash = ruby.hash_new();
244
+ for (key, val) in map {
245
+ let key_value = ruby.str_new(key).into_value_with(ruby);
246
+ let val_value = json_value_to_ruby(ruby, val)?;
247
+ hash.aset(key_value, val_value)?;
248
+ }
249
+ hash.into_value_with(ruby)
250
+ }
251
+ })
252
+ }
253
+
254
+ fn ruby_key_to_string(value: Value) -> Result<String, Error> {
255
+ if let Ok(sym) = Symbol::try_convert(value) {
256
+ Ok(sym.name()?.to_string())
257
+ } else {
258
+ String::try_convert(value)
259
+ }
260
+ }
261
+
262
+ fn ruby_value_to_json(value: Value) -> Result<serde_json::Value, Error> {
263
+ let ruby = Ruby::get().expect("Ruby not initialized");
264
+
265
+ if value.is_nil() {
266
+ return Ok(serde_json::Value::Null);
267
+ }
268
+
269
+ if value.equal(ruby.qtrue())? {
270
+ return Ok(serde_json::Value::Bool(true));
271
+ }
272
+
273
+ if value.equal(ruby.qfalse())? {
274
+ return Ok(serde_json::Value::Bool(false));
275
+ }
276
+
277
+ if let Ok(integer) = i64::try_convert(value) {
278
+ return Ok(serde_json::Value::Number(integer.into()));
279
+ }
280
+
281
+ if let Ok(unsigned) = u64::try_convert(value) {
282
+ return Ok(serde_json::Value::Number(serde_json::Number::from(unsigned)));
283
+ }
284
+
285
+ if let Ok(float) = f64::try_convert(value)
286
+ && let Some(num) = serde_json::Number::from_f64(float)
287
+ {
288
+ return Ok(serde_json::Value::Number(num));
289
+ }
290
+
291
+ if let Ok(sym) = Symbol::try_convert(value) {
292
+ return Ok(serde_json::Value::String(sym.name()?.to_string()));
293
+ }
294
+
295
+ if let Ok(string) = String::try_convert(value) {
296
+ return Ok(serde_json::Value::String(string));
297
+ }
298
+
299
+ if let Ok(array) = RArray::try_convert(value) {
300
+ let mut values = Vec::with_capacity(array.len());
301
+ for item in array.into_iter() {
302
+ values.push(ruby_value_to_json(item)?);
303
+ }
304
+ return Ok(serde_json::Value::Array(values));
305
+ }
306
+
307
+ if let Ok(hash) = RHash::try_convert(value) {
308
+ let mut map = serde_json::Map::new();
309
+ hash.foreach(|key: Value, val: Value| {
310
+ let key_string = ruby_key_to_string(key)?;
311
+ let json_value = ruby_value_to_json(val)?;
312
+ map.insert(key_string, json_value);
313
+ Ok(ForEach::Continue)
314
+ })?;
315
+
316
+ return Ok(serde_json::Value::Object(map));
317
+ }
318
+
319
+ Err(runtime_error("Unsupported Ruby value for JSON conversion"))
320
+ }
321
+
322
+ /// Parse OcrConfig from Ruby Hash
323
+ fn parse_ocr_config(ruby: &Ruby, hash: RHash) -> Result<OcrConfig, Error> {
324
+ let backend = if let Some(val) = get_kw(ruby, hash, "backend") {
325
+ symbol_to_string(val)?
326
+ } else {
327
+ "tesseract".to_string()
328
+ };
329
+
330
+ let language = if let Some(val) = get_kw(ruby, hash, "language") {
331
+ symbol_to_string(val)?
332
+ } else {
333
+ "eng".to_string()
334
+ };
335
+
336
+ let mut config = OcrConfig {
337
+ backend,
338
+ language,
339
+ tesseract_config: None,
340
+ };
341
+
342
+ if let Some(val) = get_kw(ruby, hash, "tesseract_config")
343
+ && !val.is_nil()
344
+ {
345
+ let tc_json = ruby_value_to_json(val)?;
346
+ let parsed: RustTesseractConfig =
347
+ serde_json::from_value(tc_json).map_err(|e| runtime_error(format!("Invalid tesseract_config: {}", e)))?;
348
+ config.tesseract_config = Some(parsed);
349
+ }
350
+
351
+ Ok(config)
352
+ }
353
+
354
+ /// Parse ChunkingConfig from Ruby Hash
355
+ fn parse_chunking_config(ruby: &Ruby, hash: RHash) -> Result<ChunkingConfig, Error> {
356
+ let max_chars = if let Some(val) = get_kw(ruby, hash, "max_chars") {
357
+ usize::try_convert(val)?
358
+ } else {
359
+ 1000
360
+ };
361
+
362
+ let max_overlap = if let Some(val) = get_kw(ruby, hash, "max_overlap") {
363
+ usize::try_convert(val)?
364
+ } else {
365
+ 200
366
+ };
367
+
368
+ let preset = if let Some(val) = get_kw(ruby, hash, "preset")
369
+ && !val.is_nil()
370
+ {
371
+ Some(symbol_to_string(val)?)
372
+ } else {
373
+ None
374
+ };
375
+
376
+ let embedding = if let Some(val) = get_kw(ruby, hash, "embedding")
377
+ && !val.is_nil()
378
+ {
379
+ let json_value = ruby_value_to_json(val)?;
380
+ let parsed: EmbeddingConfig = serde_json::from_value(json_value)
381
+ .map_err(|e| runtime_error(format!("Invalid chunking.embedding: {}", e)))?;
382
+ Some(parsed)
383
+ } else {
384
+ None
385
+ };
386
+
387
+ let config = ChunkingConfig {
388
+ max_chars,
389
+ max_overlap,
390
+ embedding,
391
+ preset,
392
+ };
393
+
394
+ Ok(config)
395
+ }
396
+
397
+ /// Parse LanguageDetectionConfig from Ruby Hash
398
+ fn parse_language_detection_config(ruby: &Ruby, hash: RHash) -> Result<LanguageDetectionConfig, Error> {
399
+ let enabled = if let Some(val) = get_kw(ruby, hash, "enabled") {
400
+ bool::try_convert(val)?
401
+ } else {
402
+ true
403
+ };
404
+
405
+ let min_confidence = if let Some(val) = get_kw(ruby, hash, "min_confidence") {
406
+ f64::try_convert(val)?
407
+ } else {
408
+ 0.8
409
+ };
410
+
411
+ let detect_multiple = if let Some(val) = get_kw(ruby, hash, "detect_multiple") {
412
+ bool::try_convert(val)?
413
+ } else {
414
+ false
415
+ };
416
+
417
+ let config = LanguageDetectionConfig {
418
+ enabled,
419
+ min_confidence,
420
+ detect_multiple,
421
+ };
422
+
423
+ Ok(config)
424
+ }
425
+
426
+ /// Parse PdfConfig from Ruby Hash
427
+ fn parse_pdf_config(ruby: &Ruby, hash: RHash) -> Result<PdfConfig, Error> {
428
+ let extract_images = if let Some(val) = get_kw(ruby, hash, "extract_images") {
429
+ bool::try_convert(val)?
430
+ } else {
431
+ false
432
+ };
433
+
434
+ let passwords = if let Some(val) = get_kw(ruby, hash, "passwords") {
435
+ if !val.is_nil() {
436
+ let arr = RArray::try_convert(val)?;
437
+ Some(arr.to_vec::<String>()?)
438
+ } else {
439
+ None
440
+ }
441
+ } else {
442
+ None
443
+ };
444
+
445
+ let extract_metadata = if let Some(val) = get_kw(ruby, hash, "extract_metadata") {
446
+ bool::try_convert(val)?
447
+ } else {
448
+ true
449
+ };
450
+
451
+ let config = PdfConfig {
452
+ extract_images,
453
+ passwords,
454
+ extract_metadata,
455
+ };
456
+
457
+ Ok(config)
458
+ }
459
+
460
+ /// Parse ImageExtractionConfig from Ruby Hash
461
+ fn parse_image_extraction_config(ruby: &Ruby, hash: RHash) -> Result<ImageExtractionConfig, Error> {
462
+ let extract_images = if let Some(val) = get_kw(ruby, hash, "extract_images") {
463
+ bool::try_convert(val)?
464
+ } else {
465
+ true
466
+ };
467
+
468
+ let target_dpi = if let Some(val) = get_kw(ruby, hash, "target_dpi") {
469
+ i32::try_convert(val)?
470
+ } else {
471
+ 300
472
+ };
473
+
474
+ let max_image_dimension = if let Some(val) = get_kw(ruby, hash, "max_image_dimension") {
475
+ i32::try_convert(val)?
476
+ } else {
477
+ 4096
478
+ };
479
+
480
+ let auto_adjust_dpi = if let Some(val) = get_kw(ruby, hash, "auto_adjust_dpi") {
481
+ bool::try_convert(val)?
482
+ } else {
483
+ true
484
+ };
485
+
486
+ let min_dpi = if let Some(val) = get_kw(ruby, hash, "min_dpi") {
487
+ i32::try_convert(val)?
488
+ } else {
489
+ 72
490
+ };
491
+
492
+ let max_dpi = if let Some(val) = get_kw(ruby, hash, "max_dpi") {
493
+ i32::try_convert(val)?
494
+ } else {
495
+ 600
496
+ };
497
+
498
+ let config = ImageExtractionConfig {
499
+ extract_images,
500
+ target_dpi,
501
+ max_image_dimension,
502
+ auto_adjust_dpi,
503
+ min_dpi,
504
+ max_dpi,
505
+ };
506
+
507
+ Ok(config)
508
+ }
509
+
510
+ /// Parse ImagePreprocessingConfig from Ruby Hash
511
+ ///
512
+ /// Note: Currently not used in ExtractionConfig but provided for completeness.
513
+ /// ImagePreprocessingConfig is typically used in OCR operations.
514
+ #[allow(dead_code)]
515
+ fn parse_image_preprocessing_config(ruby: &Ruby, hash: RHash) -> Result<ImagePreprocessingConfig, Error> {
516
+ let target_dpi = if let Some(val) = get_kw(ruby, hash, "target_dpi") {
517
+ i32::try_convert(val)?
518
+ } else {
519
+ 300
520
+ };
521
+
522
+ let auto_rotate = if let Some(val) = get_kw(ruby, hash, "auto_rotate") {
523
+ bool::try_convert(val)?
524
+ } else {
525
+ true
526
+ };
527
+
528
+ let deskew = if let Some(val) = get_kw(ruby, hash, "deskew") {
529
+ bool::try_convert(val)?
530
+ } else {
531
+ true
532
+ };
533
+
534
+ let denoise = if let Some(val) = get_kw(ruby, hash, "denoise") {
535
+ bool::try_convert(val)?
536
+ } else {
537
+ false
538
+ };
539
+
540
+ let contrast_enhance = if let Some(val) = get_kw(ruby, hash, "contrast_enhance") {
541
+ bool::try_convert(val)?
542
+ } else {
543
+ false
544
+ };
545
+
546
+ let binarization_method = if let Some(val) = get_kw(ruby, hash, "binarization_method") {
547
+ symbol_to_string(val)?
548
+ } else {
549
+ "otsu".to_string()
550
+ };
551
+
552
+ let invert_colors = if let Some(val) = get_kw(ruby, hash, "invert_colors") {
553
+ bool::try_convert(val)?
554
+ } else {
555
+ false
556
+ };
557
+
558
+ let config = ImagePreprocessingConfig {
559
+ target_dpi,
560
+ auto_rotate,
561
+ deskew,
562
+ denoise,
563
+ contrast_enhance,
564
+ binarization_method,
565
+ invert_colors,
566
+ };
567
+
568
+ Ok(config)
569
+ }
570
+
571
+ /// Parse PostProcessorConfig from Ruby Hash
572
+ fn parse_postprocessor_config(ruby: &Ruby, hash: RHash) -> Result<PostProcessorConfig, Error> {
573
+ let enabled = if let Some(val) = get_kw(ruby, hash, "enabled") {
574
+ bool::try_convert(val)?
575
+ } else {
576
+ true
577
+ };
578
+
579
+ let enabled_processors = if let Some(val) = get_kw(ruby, hash, "enabled_processors")
580
+ && !val.is_nil()
581
+ {
582
+ let arr = RArray::try_convert(val)?;
583
+ Some(arr.to_vec::<String>()?)
584
+ } else {
585
+ None
586
+ };
587
+
588
+ let disabled_processors = if let Some(val) = get_kw(ruby, hash, "disabled_processors")
589
+ && !val.is_nil()
590
+ {
591
+ let arr = RArray::try_convert(val)?;
592
+ Some(arr.to_vec::<String>()?)
593
+ } else {
594
+ None
595
+ };
596
+
597
+ let config = PostProcessorConfig {
598
+ enabled,
599
+ enabled_processors,
600
+ disabled_processors,
601
+ };
602
+
603
+ Ok(config)
604
+ }
605
+
606
+ /// Parse TokenReductionConfig from Ruby Hash
607
+ fn parse_token_reduction_config(ruby: &Ruby, hash: RHash) -> Result<TokenReductionConfig, Error> {
608
+ let mode = if let Some(val) = get_kw(ruby, hash, "mode") {
609
+ symbol_to_string(val)?
610
+ } else {
611
+ "off".to_string()
612
+ };
613
+
614
+ let preserve_important_words = if let Some(val) = get_kw(ruby, hash, "preserve_important_words") {
615
+ bool::try_convert(val)?
616
+ } else {
617
+ true
618
+ };
619
+
620
+ let config = TokenReductionConfig {
621
+ mode,
622
+ preserve_important_words,
623
+ };
624
+
625
+ Ok(config)
626
+ }
627
+
628
+ fn parse_keyword_config(ruby: &Ruby, hash: RHash) -> Result<RustKeywordConfig, Error> {
629
+ let mut config = RustKeywordConfig::default();
630
+
631
+ if let Some(val) = get_kw(ruby, hash, "algorithm") {
632
+ let algo = symbol_to_string(val)?;
633
+ config.algorithm = match algo.to_lowercase().as_str() {
634
+ "yake" => RustKeywordAlgorithm::Yake,
635
+ "rake" => RustKeywordAlgorithm::Rake,
636
+ other => {
637
+ return Err(runtime_error(format!(
638
+ "Invalid keywords.algorithm '{}', expected 'yake' or 'rake'",
639
+ other
640
+ )));
641
+ }
642
+ };
643
+ }
644
+
645
+ if let Some(val) = get_kw(ruby, hash, "max_keywords") {
646
+ config.max_keywords = usize::try_convert(val)?;
647
+ }
648
+
649
+ if let Some(val) = get_kw(ruby, hash, "min_score") {
650
+ config.min_score = f64::try_convert(val)? as f32;
651
+ }
652
+
653
+ if let Some(val) = get_kw(ruby, hash, "ngram_range") {
654
+ let ary = RArray::try_convert(val)?;
655
+ if ary.len() == 2 {
656
+ let values = ary.to_vec::<i64>()?;
657
+ config.ngram_range = (values[0] as usize, values[1] as usize);
658
+ } else {
659
+ return Err(runtime_error("keywords.ngram_range must have exactly two values"));
660
+ }
661
+ }
662
+
663
+ if let Some(val) = get_kw(ruby, hash, "language")
664
+ && !val.is_nil()
665
+ {
666
+ config.language = Some(symbol_to_string(val)?);
667
+ }
668
+
669
+ if let Some(val) = get_kw(ruby, hash, "yake_params")
670
+ && !val.is_nil()
671
+ {
672
+ let yake_hash = RHash::try_convert(val)?;
673
+ let window = if let Some(window_val) = get_kw(ruby, yake_hash, "window_size") {
674
+ usize::try_convert(window_val)?
675
+ } else {
676
+ 2
677
+ };
678
+ config.yake_params = Some(RustYakeParams { window_size: window });
679
+ }
680
+
681
+ if let Some(val) = get_kw(ruby, hash, "rake_params")
682
+ && !val.is_nil()
683
+ {
684
+ let rake_hash = RHash::try_convert(val)?;
685
+ let mut params = RustRakeParams::default();
686
+ if let Some(val) = get_kw(ruby, rake_hash, "min_word_length") {
687
+ params.min_word_length = usize::try_convert(val)?;
688
+ }
689
+ if let Some(val) = get_kw(ruby, rake_hash, "max_words_per_phrase") {
690
+ params.max_words_per_phrase = usize::try_convert(val)?;
691
+ }
692
+ config.rake_params = Some(params);
693
+ }
694
+
695
+ Ok(config)
696
+ }
697
+
698
+ fn parse_html_options(ruby: &Ruby, hash: RHash) -> Result<ConversionOptions, Error> {
699
+ let mut options = ConversionOptions::default();
700
+
701
+ if let Some(val) = get_kw(ruby, hash, "heading_style") {
702
+ let style = symbol_to_string(val)?;
703
+ options.heading_style = match style.to_lowercase().as_str() {
704
+ "atx" => HeadingStyle::Atx,
705
+ "underlined" => HeadingStyle::Underlined,
706
+ "atx_closed" | "atx-closed" => HeadingStyle::AtxClosed,
707
+ other => return Err(runtime_error(format!("Invalid html_options.heading_style '{}'", other))),
708
+ };
709
+ }
710
+
711
+ if let Some(val) = get_kw(ruby, hash, "list_indent_type") {
712
+ let val_str = symbol_to_string(val)?;
713
+ options.list_indent_type = match val_str.to_lowercase().as_str() {
714
+ "spaces" => ListIndentType::Spaces,
715
+ "tabs" => ListIndentType::Tabs,
716
+ other => {
717
+ return Err(runtime_error(format!(
718
+ "Invalid html_options.list_indent_type '{}'",
719
+ other
720
+ )));
721
+ }
722
+ };
723
+ }
724
+
725
+ if let Some(val) = get_kw(ruby, hash, "list_indent_width") {
726
+ options.list_indent_width = usize::try_convert(val)?;
727
+ }
728
+
729
+ if let Some(val) = get_kw(ruby, hash, "bullets") {
730
+ options.bullets = String::try_convert(val)?;
731
+ }
732
+
733
+ if let Some(val) = get_kw(ruby, hash, "strong_em_symbol") {
734
+ let symbol = String::try_convert(val)?;
735
+ let mut chars = symbol.chars();
736
+ options.strong_em_symbol = chars
737
+ .next()
738
+ .ok_or_else(|| runtime_error("html_options.strong_em_symbol must not be empty"))?;
739
+ }
740
+
741
+ if let Some(val) = get_kw(ruby, hash, "escape_asterisks") {
742
+ options.escape_asterisks = bool::try_convert(val)?;
743
+ }
744
+ if let Some(val) = get_kw(ruby, hash, "escape_underscores") {
745
+ options.escape_underscores = bool::try_convert(val)?;
746
+ }
747
+ if let Some(val) = get_kw(ruby, hash, "escape_misc") {
748
+ options.escape_misc = bool::try_convert(val)?;
749
+ }
750
+ if let Some(val) = get_kw(ruby, hash, "escape_ascii") {
751
+ options.escape_ascii = bool::try_convert(val)?;
752
+ }
753
+
754
+ if let Some(val) = get_kw(ruby, hash, "code_language") {
755
+ options.code_language = String::try_convert(val)?;
756
+ }
757
+
758
+ if let Some(val) = get_kw(ruby, hash, "autolinks") {
759
+ options.autolinks = bool::try_convert(val)?;
760
+ }
761
+
762
+ if let Some(val) = get_kw(ruby, hash, "default_title") {
763
+ options.default_title = bool::try_convert(val)?;
764
+ }
765
+
766
+ if let Some(val) = get_kw(ruby, hash, "br_in_tables") {
767
+ options.br_in_tables = bool::try_convert(val)?;
768
+ }
769
+
770
+ if let Some(val) = get_kw(ruby, hash, "hocr_spatial_tables") {
771
+ options.hocr_spatial_tables = bool::try_convert(val)?;
772
+ }
773
+
774
+ if let Some(val) = get_kw(ruby, hash, "highlight_style") {
775
+ let style = symbol_to_string(val)?;
776
+ options.highlight_style = match style.to_lowercase().as_str() {
777
+ "double_equal" | "double-equal" => HighlightStyle::DoubleEqual,
778
+ "html" => HighlightStyle::Html,
779
+ "bold" => HighlightStyle::Bold,
780
+ "none" => HighlightStyle::None,
781
+ other => {
782
+ return Err(runtime_error(format!(
783
+ "Invalid html_options.highlight_style '{}'",
784
+ other
785
+ )));
786
+ }
787
+ };
788
+ }
789
+
790
+ if let Some(val) = get_kw(ruby, hash, "extract_metadata") {
791
+ options.extract_metadata = bool::try_convert(val)?;
792
+ }
793
+
794
+ if let Some(val) = get_kw(ruby, hash, "whitespace_mode") {
795
+ let mode = symbol_to_string(val)?;
796
+ options.whitespace_mode = match mode.to_lowercase().as_str() {
797
+ "normalized" => WhitespaceMode::Normalized,
798
+ "strict" => WhitespaceMode::Strict,
799
+ other => {
800
+ return Err(runtime_error(format!(
801
+ "Invalid html_options.whitespace_mode '{}'",
802
+ other
803
+ )));
804
+ }
805
+ };
806
+ }
807
+
808
+ if let Some(val) = get_kw(ruby, hash, "strip_newlines") {
809
+ options.strip_newlines = bool::try_convert(val)?;
810
+ }
811
+
812
+ if let Some(val) = get_kw(ruby, hash, "wrap") {
813
+ options.wrap = bool::try_convert(val)?;
814
+ }
815
+
816
+ if let Some(val) = get_kw(ruby, hash, "wrap_width") {
817
+ options.wrap_width = usize::try_convert(val)?;
818
+ }
819
+
820
+ if let Some(val) = get_kw(ruby, hash, "convert_as_inline") {
821
+ options.convert_as_inline = bool::try_convert(val)?;
822
+ }
823
+
824
+ if let Some(val) = get_kw(ruby, hash, "sub_symbol") {
825
+ options.sub_symbol = String::try_convert(val)?;
826
+ }
827
+
828
+ if let Some(val) = get_kw(ruby, hash, "sup_symbol") {
829
+ options.sup_symbol = String::try_convert(val)?;
830
+ }
831
+
832
+ if let Some(val) = get_kw(ruby, hash, "newline_style") {
833
+ let style = symbol_to_string(val)?;
834
+ options.newline_style = match style.to_lowercase().as_str() {
835
+ "spaces" => NewlineStyle::Spaces,
836
+ "backslash" => NewlineStyle::Backslash,
837
+ other => return Err(runtime_error(format!("Invalid html_options.newline_style '{}'", other))),
838
+ };
839
+ }
840
+
841
+ if let Some(val) = get_kw(ruby, hash, "code_block_style") {
842
+ let style = symbol_to_string(val)?;
843
+ options.code_block_style = match style.to_lowercase().as_str() {
844
+ "indented" => CodeBlockStyle::Indented,
845
+ "backticks" => CodeBlockStyle::Backticks,
846
+ "tildes" => CodeBlockStyle::Tildes,
847
+ other => {
848
+ return Err(runtime_error(format!(
849
+ "Invalid html_options.code_block_style '{}'",
850
+ other
851
+ )));
852
+ }
853
+ };
854
+ }
855
+
856
+ if let Some(val) = get_kw(ruby, hash, "keep_inline_images_in") {
857
+ let arr = RArray::try_convert(val)?;
858
+ options.keep_inline_images_in = arr.to_vec::<String>()?;
859
+ }
860
+
861
+ if let Some(val) = get_kw(ruby, hash, "encoding") {
862
+ options.encoding = String::try_convert(val)?;
863
+ }
864
+
865
+ if let Some(val) = get_kw(ruby, hash, "debug") {
866
+ options.debug = bool::try_convert(val)?;
867
+ }
868
+
869
+ if let Some(val) = get_kw(ruby, hash, "strip_tags") {
870
+ let arr = RArray::try_convert(val)?;
871
+ options.strip_tags = arr.to_vec::<String>()?;
872
+ }
873
+
874
+ if let Some(val) = get_kw(ruby, hash, "preserve_tags") {
875
+ let arr = RArray::try_convert(val)?;
876
+ options.preserve_tags = arr.to_vec::<String>()?;
877
+ }
878
+
879
+ if let Some(val) = get_kw(ruby, hash, "preprocessing")
880
+ && !val.is_nil()
881
+ {
882
+ let pre_hash = RHash::try_convert(val)?;
883
+ let mut preprocessing = options.preprocessing.clone();
884
+ if let Some(v) = get_kw(ruby, pre_hash, "enabled") {
885
+ preprocessing.enabled = bool::try_convert(v)?;
886
+ }
887
+ if let Some(v) = get_kw(ruby, pre_hash, "preset") {
888
+ let preset = symbol_to_string(v)?;
889
+ preprocessing.preset = match preset.to_lowercase().as_str() {
890
+ "minimal" => PreprocessingPreset::Minimal,
891
+ "standard" => PreprocessingPreset::Standard,
892
+ "aggressive" => PreprocessingPreset::Aggressive,
893
+ other => {
894
+ return Err(runtime_error(format!(
895
+ "Invalid html_options.preprocessing.preset '{}'",
896
+ other
897
+ )));
898
+ }
899
+ };
900
+ }
901
+ if let Some(v) = get_kw(ruby, pre_hash, "remove_navigation") {
902
+ preprocessing.remove_navigation = bool::try_convert(v)?;
903
+ }
904
+ if let Some(v) = get_kw(ruby, pre_hash, "remove_forms") {
905
+ preprocessing.remove_forms = bool::try_convert(v)?;
906
+ }
907
+ options.preprocessing = preprocessing;
908
+ }
909
+
910
+ Ok(options)
911
+ }
912
+
913
+ fn keyword_algorithm_to_str(algo: RustKeywordAlgorithm) -> &'static str {
914
+ match algo {
915
+ RustKeywordAlgorithm::Yake => "yake",
916
+ RustKeywordAlgorithm::Rake => "rake",
917
+ }
918
+ }
919
+
920
+ fn keyword_config_to_ruby_hash(ruby: &Ruby, config: &RustKeywordConfig) -> Result<RHash, Error> {
921
+ let hash = ruby.hash_new();
922
+ hash.aset("algorithm", keyword_algorithm_to_str(config.algorithm))?;
923
+ hash.aset("max_keywords", config.max_keywords as i64)?;
924
+ hash.aset("min_score", config.min_score)?;
925
+ hash.aset("language", config.language.clone().unwrap_or_default())?;
926
+
927
+ let range_array = ruby.ary_new();
928
+ range_array.push(config.ngram_range.0 as i64)?;
929
+ range_array.push(config.ngram_range.1 as i64)?;
930
+ hash.aset("ngram_range", range_array)?;
931
+
932
+ if let Some(yake) = &config.yake_params {
933
+ let yake_hash = ruby.hash_new();
934
+ yake_hash.aset("window_size", yake.window_size as i64)?;
935
+ hash.aset("yake_params", yake_hash)?;
936
+ }
937
+
938
+ if let Some(rake) = &config.rake_params {
939
+ let rake_hash = ruby.hash_new();
940
+ rake_hash.aset("min_word_length", rake.min_word_length as i64)?;
941
+ rake_hash.aset("max_words_per_phrase", rake.max_words_per_phrase as i64)?;
942
+ hash.aset("rake_params", rake_hash)?;
943
+ }
944
+
945
+ Ok(hash)
946
+ }
947
+
948
+ fn html_options_to_ruby_hash(ruby: &Ruby, options: &ConversionOptions) -> Result<RHash, Error> {
949
+ let hash = ruby.hash_new();
950
+ hash.aset(
951
+ "heading_style",
952
+ match options.heading_style {
953
+ HeadingStyle::Atx => "atx",
954
+ HeadingStyle::Underlined => "underlined",
955
+ HeadingStyle::AtxClosed => "atx_closed",
956
+ },
957
+ )?;
958
+ hash.aset(
959
+ "list_indent_type",
960
+ match options.list_indent_type {
961
+ ListIndentType::Spaces => "spaces",
962
+ ListIndentType::Tabs => "tabs",
963
+ },
964
+ )?;
965
+ hash.aset("list_indent_width", options.list_indent_width as i64)?;
966
+ hash.aset("bullets", options.bullets.clone())?;
967
+ hash.aset("strong_em_symbol", options.strong_em_symbol.to_string())?;
968
+ hash.aset("escape_asterisks", options.escape_asterisks)?;
969
+ hash.aset("escape_underscores", options.escape_underscores)?;
970
+ hash.aset("escape_misc", options.escape_misc)?;
971
+ hash.aset("escape_ascii", options.escape_ascii)?;
972
+ hash.aset("code_language", options.code_language.clone())?;
973
+ hash.aset("autolinks", options.autolinks)?;
974
+ hash.aset("default_title", options.default_title)?;
975
+ hash.aset("br_in_tables", options.br_in_tables)?;
976
+ hash.aset("hocr_spatial_tables", options.hocr_spatial_tables)?;
977
+ hash.aset(
978
+ "highlight_style",
979
+ match options.highlight_style {
980
+ HighlightStyle::DoubleEqual => "double_equal",
981
+ HighlightStyle::Html => "html",
982
+ HighlightStyle::Bold => "bold",
983
+ HighlightStyle::None => "none",
984
+ },
985
+ )?;
986
+ hash.aset("extract_metadata", options.extract_metadata)?;
987
+ hash.aset(
988
+ "whitespace_mode",
989
+ match options.whitespace_mode {
990
+ WhitespaceMode::Normalized => "normalized",
991
+ WhitespaceMode::Strict => "strict",
992
+ },
993
+ )?;
994
+ hash.aset("strip_newlines", options.strip_newlines)?;
995
+ hash.aset("wrap", options.wrap)?;
996
+ hash.aset("wrap_width", options.wrap_width as i64)?;
997
+ hash.aset("convert_as_inline", options.convert_as_inline)?;
998
+ hash.aset("sub_symbol", options.sub_symbol.clone())?;
999
+ hash.aset("sup_symbol", options.sup_symbol.clone())?;
1000
+ hash.aset(
1001
+ "newline_style",
1002
+ match options.newline_style {
1003
+ NewlineStyle::Spaces => "spaces",
1004
+ NewlineStyle::Backslash => "backslash",
1005
+ },
1006
+ )?;
1007
+ hash.aset(
1008
+ "code_block_style",
1009
+ match options.code_block_style {
1010
+ CodeBlockStyle::Indented => "indented",
1011
+ CodeBlockStyle::Backticks => "backticks",
1012
+ CodeBlockStyle::Tildes => "tildes",
1013
+ },
1014
+ )?;
1015
+
1016
+ let keep_inline = ruby.ary_new();
1017
+ for tag in &options.keep_inline_images_in {
1018
+ keep_inline.push(tag.as_str())?;
1019
+ }
1020
+ hash.aset("keep_inline_images_in", keep_inline)?;
1021
+
1022
+ hash.aset("encoding", options.encoding.clone())?;
1023
+ hash.aset("debug", options.debug)?;
1024
+
1025
+ let strip_tags = ruby.ary_new();
1026
+ for tag in &options.strip_tags {
1027
+ strip_tags.push(tag.as_str())?;
1028
+ }
1029
+ hash.aset("strip_tags", strip_tags)?;
1030
+
1031
+ let preserve_tags = ruby.ary_new();
1032
+ for tag in &options.preserve_tags {
1033
+ preserve_tags.push(tag.as_str())?;
1034
+ }
1035
+ hash.aset("preserve_tags", preserve_tags)?;
1036
+
1037
+ let pre_hash = ruby.hash_new();
1038
+ pre_hash.aset("enabled", options.preprocessing.enabled)?;
1039
+ pre_hash.aset(
1040
+ "preset",
1041
+ match options.preprocessing.preset {
1042
+ PreprocessingPreset::Minimal => "minimal",
1043
+ PreprocessingPreset::Standard => "standard",
1044
+ PreprocessingPreset::Aggressive => "aggressive",
1045
+ },
1046
+ )?;
1047
+ pre_hash.aset("remove_navigation", options.preprocessing.remove_navigation)?;
1048
+ pre_hash.aset("remove_forms", options.preprocessing.remove_forms)?;
1049
+ hash.aset("preprocessing", pre_hash)?;
1050
+
1051
+ Ok(hash)
1052
+ }
1053
+ /// Parse ExtractionConfig from Ruby Hash
1054
+ fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<ExtractionConfig, Error> {
1055
+ let mut config = ExtractionConfig::default();
1056
+
1057
+ if let Some(hash) = opts {
1058
+ if let Some(val) = get_kw(ruby, hash, "use_cache") {
1059
+ config.use_cache = bool::try_convert(val)?;
1060
+ }
1061
+
1062
+ if let Some(val) = get_kw(ruby, hash, "enable_quality_processing") {
1063
+ config.enable_quality_processing = bool::try_convert(val)?;
1064
+ }
1065
+
1066
+ if let Some(val) = get_kw(ruby, hash, "force_ocr") {
1067
+ config.force_ocr = bool::try_convert(val)?;
1068
+ }
1069
+
1070
+ if let Some(val) = get_kw(ruby, hash, "ocr")
1071
+ && !val.is_nil()
1072
+ {
1073
+ let ocr_hash = RHash::try_convert(val)?;
1074
+ config.ocr = Some(parse_ocr_config(ruby, ocr_hash)?);
1075
+ }
1076
+
1077
+ if let Some(val) = get_kw(ruby, hash, "chunking")
1078
+ && !val.is_nil()
1079
+ {
1080
+ let chunking_hash = RHash::try_convert(val)?;
1081
+ config.chunking = Some(parse_chunking_config(ruby, chunking_hash)?);
1082
+ }
1083
+
1084
+ if let Some(val) = get_kw(ruby, hash, "language_detection")
1085
+ && !val.is_nil()
1086
+ {
1087
+ let lang_hash = RHash::try_convert(val)?;
1088
+ config.language_detection = Some(parse_language_detection_config(ruby, lang_hash)?);
1089
+ }
1090
+
1091
+ if let Some(val) = get_kw(ruby, hash, "pdf_options")
1092
+ && !val.is_nil()
1093
+ {
1094
+ let pdf_hash = RHash::try_convert(val)?;
1095
+ config.pdf_options = Some(parse_pdf_config(ruby, pdf_hash)?);
1096
+ }
1097
+
1098
+ if let Some(val) = get_kw(ruby, hash, "images")
1099
+ && !val.is_nil()
1100
+ {
1101
+ let images_hash = RHash::try_convert(val)?;
1102
+ config.images = Some(parse_image_extraction_config(ruby, images_hash)?);
1103
+ }
1104
+
1105
+ if let Some(val) = get_kw(ruby, hash, "postprocessor")
1106
+ && !val.is_nil()
1107
+ {
1108
+ let postprocessor_hash = RHash::try_convert(val)?;
1109
+ config.postprocessor = Some(parse_postprocessor_config(ruby, postprocessor_hash)?);
1110
+ }
1111
+
1112
+ if let Some(val) = get_kw(ruby, hash, "token_reduction")
1113
+ && !val.is_nil()
1114
+ {
1115
+ let token_reduction_hash = RHash::try_convert(val)?;
1116
+ config.token_reduction = Some(parse_token_reduction_config(ruby, token_reduction_hash)?);
1117
+ }
1118
+
1119
+ if let Some(val) = get_kw(ruby, hash, "keywords")
1120
+ && !val.is_nil()
1121
+ {
1122
+ let keywords_hash = RHash::try_convert(val)?;
1123
+ config.keywords = Some(parse_keyword_config(ruby, keywords_hash)?);
1124
+ }
1125
+
1126
+ if let Some(val) = get_kw(ruby, hash, "html_options")
1127
+ && !val.is_nil()
1128
+ {
1129
+ let html_hash = RHash::try_convert(val)?;
1130
+ config.html_options = Some(parse_html_options(ruby, html_hash)?);
1131
+ }
1132
+
1133
+ if let Some(val) = get_kw(ruby, hash, "max_concurrent_extractions") {
1134
+ let value = usize::try_convert(val)?;
1135
+ config.max_concurrent_extractions = Some(value);
1136
+ }
1137
+ }
1138
+
1139
+ Ok(config)
1140
+ }
1141
+
1142
+ /// Convert ExtractionConfig to Ruby Hash for Config::Extraction.
1143
+ ///
1144
+ /// This function converts a Rust ExtractionConfig into a Ruby hash that can be passed
1145
+ /// to Kreuzberg::Config::Extraction.new(**hash).
1146
+ fn extraction_config_to_ruby_hash(ruby: &Ruby, config: ExtractionConfig) -> Result<RHash, Error> {
1147
+ let hash = ruby.hash_new();
1148
+
1149
+ set_hash_entry(
1150
+ ruby,
1151
+ &hash,
1152
+ "use_cache",
1153
+ if config.use_cache {
1154
+ ruby.qtrue().as_value()
1155
+ } else {
1156
+ ruby.qfalse().as_value()
1157
+ },
1158
+ )?;
1159
+ set_hash_entry(
1160
+ ruby,
1161
+ &hash,
1162
+ "enable_quality_processing",
1163
+ if config.enable_quality_processing {
1164
+ ruby.qtrue().as_value()
1165
+ } else {
1166
+ ruby.qfalse().as_value()
1167
+ },
1168
+ )?;
1169
+ set_hash_entry(
1170
+ ruby,
1171
+ &hash,
1172
+ "force_ocr",
1173
+ if config.force_ocr {
1174
+ ruby.qtrue().as_value()
1175
+ } else {
1176
+ ruby.qfalse().as_value()
1177
+ },
1178
+ )?;
1179
+
1180
+ if let Some(ocr) = config.ocr {
1181
+ let ocr_hash = ruby.hash_new();
1182
+ set_hash_entry(
1183
+ ruby,
1184
+ &ocr_hash,
1185
+ "backend",
1186
+ ruby.str_new(&ocr.backend).into_value_with(ruby),
1187
+ )?;
1188
+ set_hash_entry(
1189
+ ruby,
1190
+ &ocr_hash,
1191
+ "language",
1192
+ ruby.str_new(&ocr.language).into_value_with(ruby),
1193
+ )?;
1194
+ if let Some(tesseract_config) = ocr.tesseract_config {
1195
+ let tc_json = serde_json::to_value(&tesseract_config)
1196
+ .map_err(|e| runtime_error(format!("Failed to serialize tesseract_config: {}", e)))?;
1197
+ let tc_ruby = json_value_to_ruby(ruby, &tc_json)?;
1198
+ set_hash_entry(ruby, &ocr_hash, "tesseract_config", tc_ruby)?;
1199
+ }
1200
+ set_hash_entry(ruby, &hash, "ocr", ocr_hash.into_value_with(ruby))?;
1201
+ }
1202
+
1203
+ if let Some(chunking) = config.chunking {
1204
+ let chunking_hash = ruby.hash_new();
1205
+ set_hash_entry(
1206
+ ruby,
1207
+ &chunking_hash,
1208
+ "max_chars",
1209
+ ruby.integer_from_i64(chunking.max_chars as i64).into_value_with(ruby),
1210
+ )?;
1211
+ set_hash_entry(
1212
+ ruby,
1213
+ &chunking_hash,
1214
+ "max_overlap",
1215
+ ruby.integer_from_i64(chunking.max_overlap as i64).into_value_with(ruby),
1216
+ )?;
1217
+ if let Some(preset) = chunking.preset {
1218
+ set_hash_entry(
1219
+ ruby,
1220
+ &chunking_hash,
1221
+ "preset",
1222
+ ruby.str_new(&preset).into_value_with(ruby),
1223
+ )?;
1224
+ }
1225
+ if let Some(embedding) = chunking.embedding {
1226
+ let embedding_json = serde_json::to_value(&embedding)
1227
+ .map_err(|e| runtime_error(format!("Failed to serialize embedding config: {}", e)))?;
1228
+ let embedding_value = json_value_to_ruby(ruby, &embedding_json)?;
1229
+ set_hash_entry(ruby, &chunking_hash, "embedding", embedding_value)?;
1230
+ }
1231
+ set_hash_entry(ruby, &hash, "chunking", chunking_hash.into_value_with(ruby))?;
1232
+ }
1233
+
1234
+ if let Some(lang_detection) = config.language_detection {
1235
+ let lang_hash = ruby.hash_new();
1236
+ set_hash_entry(
1237
+ ruby,
1238
+ &lang_hash,
1239
+ "enabled",
1240
+ if lang_detection.enabled {
1241
+ ruby.qtrue().as_value()
1242
+ } else {
1243
+ ruby.qfalse().as_value()
1244
+ },
1245
+ )?;
1246
+ set_hash_entry(
1247
+ ruby,
1248
+ &lang_hash,
1249
+ "min_confidence",
1250
+ ruby.float_from_f64(lang_detection.min_confidence).into_value_with(ruby),
1251
+ )?;
1252
+ set_hash_entry(
1253
+ ruby,
1254
+ &lang_hash,
1255
+ "detect_multiple",
1256
+ if lang_detection.detect_multiple {
1257
+ ruby.qtrue().as_value()
1258
+ } else {
1259
+ ruby.qfalse().as_value()
1260
+ },
1261
+ )?;
1262
+ set_hash_entry(ruby, &hash, "language_detection", lang_hash.into_value_with(ruby))?;
1263
+ }
1264
+
1265
+ if let Some(pdf_options) = config.pdf_options {
1266
+ let pdf_hash = ruby.hash_new();
1267
+ set_hash_entry(
1268
+ ruby,
1269
+ &pdf_hash,
1270
+ "extract_images",
1271
+ if pdf_options.extract_images {
1272
+ ruby.qtrue().as_value()
1273
+ } else {
1274
+ ruby.qfalse().as_value()
1275
+ },
1276
+ )?;
1277
+ if let Some(passwords) = pdf_options.passwords {
1278
+ let passwords_array = ruby.ary_from_vec(passwords);
1279
+ set_hash_entry(ruby, &pdf_hash, "passwords", passwords_array.into_value_with(ruby))?;
1280
+ }
1281
+ set_hash_entry(
1282
+ ruby,
1283
+ &pdf_hash,
1284
+ "extract_metadata",
1285
+ if pdf_options.extract_metadata {
1286
+ ruby.qtrue().as_value()
1287
+ } else {
1288
+ ruby.qfalse().as_value()
1289
+ },
1290
+ )?;
1291
+ set_hash_entry(ruby, &hash, "pdf_options", pdf_hash.into_value_with(ruby))?;
1292
+ }
1293
+
1294
+ if let Some(images) = config.images {
1295
+ let images_hash = ruby.hash_new();
1296
+ set_hash_entry(
1297
+ ruby,
1298
+ &images_hash,
1299
+ "extract_images",
1300
+ if images.extract_images {
1301
+ ruby.qtrue().as_value()
1302
+ } else {
1303
+ ruby.qfalse().as_value()
1304
+ },
1305
+ )?;
1306
+ set_hash_entry(
1307
+ ruby,
1308
+ &images_hash,
1309
+ "target_dpi",
1310
+ ruby.integer_from_i64(images.target_dpi as i64).into_value_with(ruby),
1311
+ )?;
1312
+ set_hash_entry(
1313
+ ruby,
1314
+ &images_hash,
1315
+ "max_image_dimension",
1316
+ ruby.integer_from_i64(images.max_image_dimension as i64)
1317
+ .into_value_with(ruby),
1318
+ )?;
1319
+ set_hash_entry(
1320
+ ruby,
1321
+ &images_hash,
1322
+ "auto_adjust_dpi",
1323
+ if images.auto_adjust_dpi {
1324
+ ruby.qtrue().as_value()
1325
+ } else {
1326
+ ruby.qfalse().as_value()
1327
+ },
1328
+ )?;
1329
+ set_hash_entry(
1330
+ ruby,
1331
+ &images_hash,
1332
+ "min_dpi",
1333
+ ruby.integer_from_i64(images.min_dpi as i64).into_value_with(ruby),
1334
+ )?;
1335
+ set_hash_entry(
1336
+ ruby,
1337
+ &images_hash,
1338
+ "max_dpi",
1339
+ ruby.integer_from_i64(images.max_dpi as i64).into_value_with(ruby),
1340
+ )?;
1341
+ set_hash_entry(ruby, &hash, "image_extraction", images_hash.into_value_with(ruby))?;
1342
+ }
1343
+
1344
+ if let Some(postprocessor) = config.postprocessor {
1345
+ let pp_hash = ruby.hash_new();
1346
+ set_hash_entry(
1347
+ ruby,
1348
+ &pp_hash,
1349
+ "enabled",
1350
+ if postprocessor.enabled {
1351
+ ruby.qtrue().as_value()
1352
+ } else {
1353
+ ruby.qfalse().as_value()
1354
+ },
1355
+ )?;
1356
+ if let Some(enabled_processors) = postprocessor.enabled_processors {
1357
+ let enabled_array = ruby.ary_from_vec(enabled_processors);
1358
+ set_hash_entry(
1359
+ ruby,
1360
+ &pp_hash,
1361
+ "enabled_processors",
1362
+ enabled_array.into_value_with(ruby),
1363
+ )?;
1364
+ }
1365
+ if let Some(disabled_processors) = postprocessor.disabled_processors {
1366
+ let disabled_array = ruby.ary_from_vec(disabled_processors);
1367
+ set_hash_entry(
1368
+ ruby,
1369
+ &pp_hash,
1370
+ "disabled_processors",
1371
+ disabled_array.into_value_with(ruby),
1372
+ )?;
1373
+ }
1374
+ set_hash_entry(ruby, &hash, "postprocessor", pp_hash.into_value_with(ruby))?;
1375
+ }
1376
+
1377
+ if let Some(token_reduction) = config.token_reduction {
1378
+ let tr_hash = ruby.hash_new();
1379
+ set_hash_entry(
1380
+ ruby,
1381
+ &tr_hash,
1382
+ "mode",
1383
+ ruby.str_new(&token_reduction.mode).into_value_with(ruby),
1384
+ )?;
1385
+ set_hash_entry(
1386
+ ruby,
1387
+ &tr_hash,
1388
+ "preserve_important_words",
1389
+ if token_reduction.preserve_important_words {
1390
+ ruby.qtrue().as_value()
1391
+ } else {
1392
+ ruby.qfalse().as_value()
1393
+ },
1394
+ )?;
1395
+ set_hash_entry(ruby, &hash, "token_reduction", tr_hash.into_value_with(ruby))?;
1396
+ }
1397
+
1398
+ if let Some(keywords) = config.keywords {
1399
+ let keywords_hash = keyword_config_to_ruby_hash(ruby, &keywords)?;
1400
+ set_hash_entry(ruby, &hash, "keywords", keywords_hash.into_value_with(ruby))?;
1401
+ }
1402
+
1403
+ if let Some(html_options) = config.html_options {
1404
+ let html_hash = html_options_to_ruby_hash(ruby, &html_options)?;
1405
+ set_hash_entry(ruby, &hash, "html_options", html_hash.into_value_with(ruby))?;
1406
+ }
1407
+
1408
+ if let Some(max_concurrent) = config.max_concurrent_extractions {
1409
+ set_hash_entry(
1410
+ ruby,
1411
+ &hash,
1412
+ "max_concurrent_extractions",
1413
+ ruby.integer_from_u64(max_concurrent as u64).into_value_with(ruby),
1414
+ )?;
1415
+ }
1416
+
1417
+ Ok(hash)
1418
+ }
1419
+
1420
+ /// Load extraction configuration from a file.
1421
+ ///
1422
+ /// Detects the file format from the extension (.toml, .yaml, .json)
1423
+ /// and loads the configuration accordingly. Returns a hash to be used by Ruby.
1424
+ ///
1425
+ /// @param path [String] Path to the configuration file
1426
+ /// @return [Hash] Configuration hash
1427
+ ///
1428
+ /// @example Load from TOML
1429
+ /// hash = Kreuzberg._config_from_file_native("config.toml")
1430
+ ///
1431
+ /// @example Load from YAML
1432
+ /// hash = Kreuzberg._config_from_file_native("config.yaml")
1433
+ ///
1434
+ fn config_from_file(path: String) -> Result<RHash, Error> {
1435
+ let ruby = Ruby::get().expect("Ruby not initialized");
1436
+ let file_path = Path::new(&path);
1437
+
1438
+ let extension = file_path
1439
+ .extension()
1440
+ .and_then(|ext| ext.to_str())
1441
+ .ok_or_else(|| runtime_error("File path must have an extension (.toml, .yaml, or .json)"))?;
1442
+
1443
+ let config = match extension {
1444
+ "toml" => ExtractionConfig::from_toml_file(file_path).map_err(kreuzberg_error)?,
1445
+ "yaml" => ExtractionConfig::from_yaml_file(file_path).map_err(kreuzberg_error)?,
1446
+ "json" => ExtractionConfig::from_json_file(file_path).map_err(kreuzberg_error)?,
1447
+ _ => {
1448
+ return Err(runtime_error(format!(
1449
+ "Unsupported file extension '{}'. Supported: .toml, .yaml, .json",
1450
+ extension
1451
+ )));
1452
+ }
1453
+ };
1454
+
1455
+ extraction_config_to_ruby_hash(&ruby, config)
1456
+ }
1457
+
1458
+ /// Discover configuration file in current or parent directories.
1459
+ ///
1460
+ /// Searches for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json in the current
1461
+ /// directory and parent directories. Returns nil if no config file is found.
1462
+ ///
1463
+ /// @return [Hash, nil] Configuration hash or nil if not found
1464
+ ///
1465
+ /// @example
1466
+ /// hash = Kreuzberg._config_discover_native
1467
+ /// # => {...config hash...} or nil
1468
+ ///
1469
+ fn config_discover() -> Result<Value, Error> {
1470
+ let ruby = Ruby::get().expect("Ruby not initialized");
1471
+
1472
+ let maybe_config = ExtractionConfig::discover().map_err(kreuzberg_error)?;
1473
+
1474
+ match maybe_config {
1475
+ Some(config) => {
1476
+ let hash = extraction_config_to_ruby_hash(&ruby, config)?;
1477
+ Ok(hash.as_value())
1478
+ }
1479
+ None => Ok(ruby.qnil().as_value()),
1480
+ }
1481
+ }
1482
+
1483
+ /// Convert Rust ExtractionResult to Ruby Hash
1484
+ fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> Result<RHash, Error> {
1485
+ let hash = ruby.hash_new();
1486
+
1487
+ let content_value = ruby.str_new(result.content.as_str()).into_value_with(ruby);
1488
+ set_hash_entry(ruby, &hash, "content", content_value)?;
1489
+
1490
+ let mime_value = ruby.str_new(result.mime_type.as_str()).into_value_with(ruby);
1491
+ set_hash_entry(ruby, &hash, "mime_type", mime_value)?;
1492
+
1493
+ let metadata_json = serde_json::to_string(&result.metadata)
1494
+ .map_err(|e| runtime_error(format!("Failed to serialize metadata: {}", e)))?;
1495
+ let metadata_json_value = ruby.str_new(&metadata_json).into_value_with(ruby);
1496
+ set_hash_entry(ruby, &hash, "metadata_json", metadata_json_value)?;
1497
+ let metadata_value = serde_json::to_value(&result.metadata)
1498
+ .map_err(|e| runtime_error(format!("Failed to serialize metadata: {}", e)))?;
1499
+ let metadata_hash = json_value_to_ruby(ruby, &metadata_value)?;
1500
+ set_hash_entry(ruby, &hash, "metadata", metadata_hash)?;
1501
+
1502
+ let tables_array = ruby.ary_new();
1503
+ for table in result.tables {
1504
+ let table_hash = ruby.hash_new();
1505
+
1506
+ let cells_array = ruby.ary_new();
1507
+ for row in table.cells {
1508
+ let row_array = ruby.ary_from_vec(row);
1509
+ cells_array.push(row_array)?;
1510
+ }
1511
+ table_hash.aset("cells", cells_array)?;
1512
+
1513
+ table_hash.aset("markdown", table.markdown)?;
1514
+
1515
+ table_hash.aset("page_number", table.page_number)?;
1516
+
1517
+ tables_array.push(table_hash)?;
1518
+ }
1519
+ let tables_value = tables_array.into_value_with(ruby);
1520
+ set_hash_entry(ruby, &hash, "tables", tables_value)?;
1521
+
1522
+ if let Some(langs) = result.detected_languages {
1523
+ let langs_array = ruby.ary_from_vec(langs);
1524
+ let langs_value = langs_array.into_value_with(ruby);
1525
+ set_hash_entry(ruby, &hash, "detected_languages", langs_value)?;
1526
+ } else {
1527
+ set_hash_entry(ruby, &hash, "detected_languages", ruby.qnil().as_value())?;
1528
+ }
1529
+
1530
+ if let Some(chunks) = result.chunks {
1531
+ let chunks_array = ruby.ary_new();
1532
+ for chunk in chunks {
1533
+ let chunk_hash = ruby.hash_new();
1534
+ chunk_hash.aset("content", chunk.content)?;
1535
+ chunk_hash.aset("char_start", chunk.metadata.char_start)?;
1536
+ chunk_hash.aset("char_end", chunk.metadata.char_end)?;
1537
+ if let Some(token_count) = chunk.metadata.token_count {
1538
+ chunk_hash.aset("token_count", token_count)?;
1539
+ } else {
1540
+ chunk_hash.aset("token_count", ruby.qnil().as_value())?;
1541
+ }
1542
+ chunk_hash.aset("chunk_index", chunk.metadata.chunk_index)?;
1543
+ chunk_hash.aset("total_chunks", chunk.metadata.total_chunks)?;
1544
+ if let Some(embedding) = chunk.embedding {
1545
+ let embedding_array = ruby.ary_new();
1546
+ for value in embedding {
1547
+ embedding_array.push(ruby.float_from_f64(value as f64).into_value_with(ruby))?;
1548
+ }
1549
+ chunk_hash.aset("embedding", embedding_array)?;
1550
+ } else {
1551
+ chunk_hash.aset("embedding", ruby.qnil().as_value())?;
1552
+ }
1553
+ chunks_array.push(chunk_hash)?;
1554
+ }
1555
+ let chunks_value = chunks_array.into_value_with(ruby);
1556
+ set_hash_entry(ruby, &hash, "chunks", chunks_value)?;
1557
+ } else {
1558
+ set_hash_entry(ruby, &hash, "chunks", ruby.qnil().as_value())?;
1559
+ }
1560
+
1561
+ if let Some(images) = result.images {
1562
+ let images_array = ruby.ary_new();
1563
+ for image in images {
1564
+ let image_hash = ruby.hash_new();
1565
+ let data_value = ruby.str_from_slice(&image.data).into_value_with(ruby);
1566
+ image_hash.aset("data", data_value)?;
1567
+ image_hash.aset("format", image.format)?;
1568
+ image_hash.aset("image_index", image.image_index as i64)?;
1569
+ if let Some(page) = image.page_number {
1570
+ image_hash.aset("page_number", page as i64)?;
1571
+ } else {
1572
+ image_hash.aset("page_number", ruby.qnil().as_value())?;
1573
+ }
1574
+ if let Some(width) = image.width {
1575
+ image_hash.aset("width", width as i64)?;
1576
+ } else {
1577
+ image_hash.aset("width", ruby.qnil().as_value())?;
1578
+ }
1579
+ if let Some(height) = image.height {
1580
+ image_hash.aset("height", height as i64)?;
1581
+ } else {
1582
+ image_hash.aset("height", ruby.qnil().as_value())?;
1583
+ }
1584
+ if let Some(colorspace) = image.colorspace {
1585
+ image_hash.aset("colorspace", colorspace)?;
1586
+ } else {
1587
+ image_hash.aset("colorspace", ruby.qnil().as_value())?;
1588
+ }
1589
+ if let Some(bits) = image.bits_per_component {
1590
+ image_hash.aset("bits_per_component", bits as i64)?;
1591
+ } else {
1592
+ image_hash.aset("bits_per_component", ruby.qnil().as_value())?;
1593
+ }
1594
+ image_hash.aset(
1595
+ "is_mask",
1596
+ if image.is_mask {
1597
+ ruby.qtrue().as_value()
1598
+ } else {
1599
+ ruby.qfalse().as_value()
1600
+ },
1601
+ )?;
1602
+ if let Some(description) = image.description {
1603
+ image_hash.aset("description", description)?;
1604
+ } else {
1605
+ image_hash.aset("description", ruby.qnil().as_value())?;
1606
+ }
1607
+ if let Some(ocr_result) = image.ocr_result {
1608
+ let nested = extraction_result_to_ruby(ruby, *ocr_result)?;
1609
+ image_hash.aset("ocr_result", nested.into_value_with(ruby))?;
1610
+ } else {
1611
+ image_hash.aset("ocr_result", ruby.qnil().as_value())?;
1612
+ }
1613
+ images_array.push(image_hash)?;
1614
+ }
1615
+ set_hash_entry(ruby, &hash, "images", images_array.into_value_with(ruby))?;
1616
+ } else {
1617
+ set_hash_entry(ruby, &hash, "images", ruby.qnil().as_value())?;
1618
+ }
1619
+
1620
+ Ok(hash)
1621
+ }
1622
+
1623
+ /// Extract content from a file (synchronous).
1624
+ ///
1625
+ /// @param path [String] Path to the file
1626
+ /// @param mime_type [String, nil] Optional MIME type hint
1627
+ /// @param options [Hash] Extraction configuration
1628
+ /// @return [Hash] Extraction result with :content, :mime_type, :metadata, :tables, etc.
1629
+ ///
1630
+ /// @example Basic usage
1631
+ /// result = Kreuzberg.extract_file_sync("document.pdf")
1632
+ /// puts result[:content]
1633
+ ///
1634
+ /// @example With OCR
1635
+ /// result = Kreuzberg.extract_file_sync("scanned.pdf", nil, force_ocr: true)
1636
+ ///
1637
+ fn extract_file_sync(args: &[Value]) -> Result<RHash, Error> {
1638
+ let ruby = Ruby::get().expect("Ruby not initialized");
1639
+ let args = scan_args::<(String,), (Option<String>,), (), (), RHash, ()>(args)?;
1640
+ let (path,) = args.required;
1641
+ let (mime_type,) = args.optional;
1642
+ let opts = Some(args.keywords);
1643
+
1644
+ let config = parse_extraction_config(&ruby, opts)?;
1645
+
1646
+ let result = kreuzberg::extract_file_sync(&path, mime_type.as_deref(), &config).map_err(kreuzberg_error)?;
1647
+
1648
+ extraction_result_to_ruby(&ruby, result)
1649
+ }
1650
+
1651
+ /// Extract content from bytes (synchronous).
1652
+ ///
1653
+ /// @param data [String] Binary data to extract
1654
+ /// @param mime_type [String] MIME type of the data
1655
+ /// @param options [Hash] Extraction configuration
1656
+ /// @return [Hash] Extraction result
1657
+ ///
1658
+ /// @example
1659
+ /// data = File.binread("document.pdf")
1660
+ /// result = Kreuzberg.extract_bytes_sync(data, "application/pdf")
1661
+ ///
1662
+ fn extract_bytes_sync(args: &[Value]) -> Result<RHash, Error> {
1663
+ let ruby = Ruby::get().expect("Ruby not initialized");
1664
+ let args = scan_args::<(String, String), (), (), (), RHash, ()>(args)?;
1665
+ let (data, mime_type) = args.required;
1666
+ let opts = Some(args.keywords);
1667
+
1668
+ let config = parse_extraction_config(&ruby, opts)?;
1669
+
1670
+ let result = kreuzberg::extract_bytes_sync(data.as_bytes(), &mime_type, &config).map_err(kreuzberg_error)?;
1671
+
1672
+ extraction_result_to_ruby(&ruby, result)
1673
+ }
1674
+
1675
+ /// Batch extract content from multiple files (synchronous).
1676
+ ///
1677
+ /// @param paths [Array<String>] List of file paths
1678
+ /// @param options [Hash] Extraction configuration
1679
+ /// @return [Array<Hash>] Array of extraction results
1680
+ ///
1681
+ /// @example
1682
+ /// paths = ["doc1.pdf", "doc2.docx", "doc3.xlsx"]
1683
+ /// results = Kreuzberg.batch_extract_files_sync(paths)
1684
+ /// results.each { |r| puts r[:content] }
1685
+ ///
1686
+ fn batch_extract_files_sync(args: &[Value]) -> Result<RArray, Error> {
1687
+ let ruby = Ruby::get().expect("Ruby not initialized");
1688
+ let args = scan_args::<(RArray,), (), (), (), RHash, ()>(args)?;
1689
+ let (paths_array,) = args.required;
1690
+ let opts = Some(args.keywords);
1691
+
1692
+ let config = parse_extraction_config(&ruby, opts)?;
1693
+
1694
+ let paths: Vec<String> = paths_array.to_vec::<String>()?;
1695
+
1696
+ let results = kreuzberg::batch_extract_file_sync(paths, &config).map_err(kreuzberg_error)?;
1697
+
1698
+ let results_array = ruby.ary_new();
1699
+ for result in results {
1700
+ results_array.push(extraction_result_to_ruby(&ruby, result)?)?;
1701
+ }
1702
+
1703
+ Ok(results_array)
1704
+ }
1705
+
1706
+ /// Extract content from a file (asynchronous).
1707
+ ///
1708
+ /// Note: Ruby doesn't have native async/await, so this uses a blocking Tokio runtime.
1709
+ /// For true async behavior, use the synchronous version in a background thread.
1710
+ ///
1711
+ /// @param path [String] Path to the file
1712
+ /// @param mime_type [String, nil] Optional MIME type hint
1713
+ /// @param options [Hash] Extraction configuration
1714
+ /// @return [Hash] Extraction result
1715
+ ///
1716
+ fn extract_file(args: &[Value]) -> Result<RHash, Error> {
1717
+ let ruby = Ruby::get().expect("Ruby not initialized");
1718
+ let args = scan_args::<(String,), (Option<String>,), (), (), RHash, ()>(args)?;
1719
+ let (path,) = args.required;
1720
+ let (mime_type,) = args.optional;
1721
+ let opts = Some(args.keywords);
1722
+
1723
+ let config = parse_extraction_config(&ruby, opts)?;
1724
+
1725
+ let runtime =
1726
+ tokio::runtime::Runtime::new().map_err(|e| runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
1727
+
1728
+ let result = runtime
1729
+ .block_on(async { kreuzberg::extract_file(&path, mime_type.as_deref(), &config).await })
1730
+ .map_err(kreuzberg_error)?;
1731
+
1732
+ extraction_result_to_ruby(&ruby, result)
1733
+ }
1734
+
1735
+ /// Extract content from bytes (asynchronous).
1736
+ ///
1737
+ /// @param data [String] Binary data
1738
+ /// @param mime_type [String] MIME type
1739
+ /// @param options [Hash] Extraction configuration
1740
+ /// @return [Hash] Extraction result
1741
+ ///
1742
+ fn extract_bytes(args: &[Value]) -> Result<RHash, Error> {
1743
+ let ruby = Ruby::get().expect("Ruby not initialized");
1744
+ let args = scan_args::<(String, String), (), (), (), RHash, ()>(args)?;
1745
+ let (data, mime_type) = args.required;
1746
+ let opts = Some(args.keywords);
1747
+
1748
+ let config = parse_extraction_config(&ruby, opts)?;
1749
+
1750
+ let runtime =
1751
+ tokio::runtime::Runtime::new().map_err(|e| runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
1752
+
1753
+ let result = runtime
1754
+ .block_on(async { kreuzberg::extract_bytes(data.as_bytes(), &mime_type, &config).await })
1755
+ .map_err(kreuzberg_error)?;
1756
+
1757
+ extraction_result_to_ruby(&ruby, result)
1758
+ }
1759
+
1760
+ /// Batch extract content from multiple files (asynchronous).
1761
+ ///
1762
+ /// @param paths [Array<String>] List of file paths
1763
+ /// @param options [Hash] Extraction configuration
1764
+ /// @return [Array<Hash>] Array of extraction results
1765
+ ///
1766
+ fn batch_extract_files(args: &[Value]) -> Result<RArray, Error> {
1767
+ let ruby = Ruby::get().expect("Ruby not initialized");
1768
+ let args = scan_args::<(RArray,), (), (), (), RHash, ()>(args)?;
1769
+ let (paths_array,) = args.required;
1770
+ let opts = Some(args.keywords);
1771
+
1772
+ let config = parse_extraction_config(&ruby, opts)?;
1773
+
1774
+ let paths: Vec<String> = paths_array.to_vec::<String>()?;
1775
+
1776
+ let runtime =
1777
+ tokio::runtime::Runtime::new().map_err(|e| runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
1778
+
1779
+ let results = runtime
1780
+ .block_on(async { kreuzberg::batch_extract_file(paths, &config).await })
1781
+ .map_err(kreuzberg_error)?;
1782
+
1783
+ let results_array = ruby.ary_new();
1784
+ for result in results {
1785
+ results_array.push(extraction_result_to_ruby(&ruby, result)?)?;
1786
+ }
1787
+
1788
+ Ok(results_array)
1789
+ }
1790
+
1791
+ /// Batch extract content from multiple byte arrays (synchronous).
1792
+ ///
1793
+ /// @param bytes_array [Array<String>] List of binary data strings
1794
+ /// @param mime_types [Array<String>] List of MIME types corresponding to each byte array
1795
+ /// @param options [Hash] Extraction configuration
1796
+ /// @return [Array<Hash>] Array of extraction results
1797
+ ///
1798
+ /// @example
1799
+ /// data1 = File.binread("document.pdf")
1800
+ /// data2 = File.binread("invoice.docx")
1801
+ /// results = Kreuzberg.batch_extract_bytes_sync([data1, data2], ["application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"])
1802
+ ///
1803
+ fn batch_extract_bytes_sync(args: &[Value]) -> Result<RArray, Error> {
1804
+ let ruby = Ruby::get().expect("Ruby not initialized");
1805
+ let args = scan_args::<(RArray, RArray), (), (), (), RHash, ()>(args)?;
1806
+ let (bytes_array, mime_types_array) = args.required;
1807
+ let opts = Some(args.keywords);
1808
+
1809
+ let config = parse_extraction_config(&ruby, opts)?;
1810
+
1811
+ let bytes_vec: Vec<String> = bytes_array.to_vec::<String>()?;
1812
+ let mime_types: Vec<String> = mime_types_array.to_vec::<String>()?;
1813
+
1814
+ if bytes_vec.len() != mime_types.len() {
1815
+ return Err(runtime_error(format!(
1816
+ "bytes_array and mime_types must have the same length: {} vs {}",
1817
+ bytes_vec.len(),
1818
+ mime_types.len()
1819
+ )));
1820
+ }
1821
+
1822
+ let contents: Vec<(&[u8], &str)> = bytes_vec
1823
+ .iter()
1824
+ .zip(mime_types.iter())
1825
+ .map(|(bytes, mime)| (bytes.as_bytes(), mime.as_str()))
1826
+ .collect();
1827
+
1828
+ let results = kreuzberg::batch_extract_bytes_sync(contents, &config).map_err(kreuzberg_error)?;
1829
+
1830
+ let results_array = ruby.ary_new();
1831
+ for result in results {
1832
+ results_array.push(extraction_result_to_ruby(&ruby, result)?)?;
1833
+ }
1834
+
1835
+ Ok(results_array)
1836
+ }
1837
+
1838
+ /// Batch extract content from multiple byte arrays (asynchronous).
1839
+ ///
1840
+ /// @param bytes_array [Array<String>] List of binary data strings
1841
+ /// @param mime_types [Array<String>] List of MIME types corresponding to each byte array
1842
+ /// @param options [Hash] Extraction configuration
1843
+ /// @return [Array<Hash>] Array of extraction results
1844
+ ///
1845
+ fn batch_extract_bytes(args: &[Value]) -> Result<RArray, Error> {
1846
+ let ruby = Ruby::get().expect("Ruby not initialized");
1847
+ let args = scan_args::<(RArray, RArray), (), (), (), RHash, ()>(args)?;
1848
+ let (bytes_array, mime_types_array) = args.required;
1849
+ let opts = Some(args.keywords);
1850
+
1851
+ let config = parse_extraction_config(&ruby, opts)?;
1852
+
1853
+ let bytes_vec: Vec<String> = bytes_array.to_vec::<String>()?;
1854
+ let mime_types: Vec<String> = mime_types_array.to_vec::<String>()?;
1855
+
1856
+ if bytes_vec.len() != mime_types.len() {
1857
+ return Err(runtime_error(format!(
1858
+ "bytes_array and mime_types must have the same length: {} vs {}",
1859
+ bytes_vec.len(),
1860
+ mime_types.len()
1861
+ )));
1862
+ }
1863
+
1864
+ let contents: Vec<(&[u8], &str)> = bytes_vec
1865
+ .iter()
1866
+ .zip(mime_types.iter())
1867
+ .map(|(bytes, mime)| (bytes.as_bytes(), mime.as_str()))
1868
+ .collect();
1869
+
1870
+ let runtime =
1871
+ tokio::runtime::Runtime::new().map_err(|e| runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
1872
+
1873
+ let results = runtime
1874
+ .block_on(async { kreuzberg::batch_extract_bytes(contents, &config).await })
1875
+ .map_err(kreuzberg_error)?;
1876
+
1877
+ let results_array = ruby.ary_new();
1878
+ for result in results {
1879
+ results_array.push(extraction_result_to_ruby(&ruby, result)?)?;
1880
+ }
1881
+
1882
+ Ok(results_array)
1883
+ }
1884
+
1885
+ /// Clear all cache entries.
1886
+ ///
1887
+ /// @return [void]
1888
+ ///
1889
+ /// @example
1890
+ /// Kreuzberg.clear_cache
1891
+ ///
1892
+ fn ruby_clear_cache() -> Result<(), Error> {
1893
+ let cache_root = cache_root_dir()?;
1894
+ if !cache_root.exists() {
1895
+ return Ok(());
1896
+ }
1897
+
1898
+ for dir in cache_directories(&cache_root)? {
1899
+ let Some(dir_str) = dir.to_str() else {
1900
+ return Err(runtime_error("Cache directory path contains non-UTF8 characters"));
1901
+ };
1902
+
1903
+ // OSError/RuntimeError must bubble up - system errors need user reports ~keep
1904
+ kreuzberg::cache::clear_cache_directory(dir_str).map_err(kreuzberg_error)?;
1905
+ }
1906
+
1907
+ Ok(())
1908
+ }
1909
+
1910
+ /// Get cache statistics.
1911
+ ///
1912
+ /// @return [Hash] Cache statistics with :total_entries and :total_size_bytes
1913
+ ///
1914
+ /// @example
1915
+ /// stats = Kreuzberg.cache_stats
1916
+ /// puts "Cache entries: #{stats[:total_entries]}"
1917
+ /// puts "Cache size: #{stats[:total_size_bytes]} bytes"
1918
+ ///
1919
+ fn ruby_cache_stats() -> Result<RHash, Error> {
1920
+ let ruby = Ruby::get().expect("Ruby not initialized");
1921
+
1922
+ let hash = ruby.hash_new();
1923
+ let cache_root = cache_root_dir()?;
1924
+
1925
+ if !cache_root.exists() {
1926
+ hash.aset("total_entries", 0)?;
1927
+ hash.aset("total_size_bytes", 0)?;
1928
+ return Ok(hash);
1929
+ }
1930
+
1931
+ let mut total_entries: usize = 0;
1932
+ let mut total_bytes: f64 = 0.0;
1933
+
1934
+ for dir in cache_directories(&cache_root)? {
1935
+ let Some(dir_str) = dir.to_str() else {
1936
+ return Err(runtime_error("Cache directory path contains non-UTF8 characters"));
1937
+ };
1938
+
1939
+ // OSError/RuntimeError must bubble up - system errors need user reports ~keep
1940
+ let stats = kreuzberg::cache::get_cache_metadata(dir_str).map_err(kreuzberg_error)?;
1941
+ total_entries += stats.total_files;
1942
+ total_bytes += stats.total_size_mb * 1024.0 * 1024.0;
1943
+ }
1944
+
1945
+ set_hash_entry(
1946
+ &ruby,
1947
+ &hash,
1948
+ "total_entries",
1949
+ ruby.integer_from_u64(total_entries as u64).into_value_with(&ruby),
1950
+ )?;
1951
+ set_hash_entry(
1952
+ &ruby,
1953
+ &hash,
1954
+ "total_size_bytes",
1955
+ ruby.integer_from_u64(total_bytes.round() as u64).into_value_with(&ruby),
1956
+ )?;
1957
+
1958
+ Ok(hash)
1959
+ }
1960
+
1961
+ /// Register a post-processor plugin.
1962
+ ///
1963
+ /// @param name [String] Unique identifier for the post-processor
1964
+ /// @param processor [Proc] Ruby Proc/lambda that processes extraction results
1965
+ /// @param priority [Integer] Execution priority (default: 50, higher = runs first)
1966
+ /// @return [nil]
1967
+ ///
1968
+ /// # Example
1969
+ /// ```text
1970
+ /// Kreuzberg.register_post_processor("uppercase", ->(result) {
1971
+ /// result[:content] = result[:content].upcase
1972
+ /// result
1973
+ /// }, 100)
1974
+ /// ```
1975
+ fn register_post_processor(args: &[Value]) -> Result<(), Error> {
1976
+ let _ruby = Ruby::get().expect("Ruby not initialized");
1977
+ let args = scan_args::<(String, Value), (Option<i32>,), (), (), (), ()>(args)?;
1978
+ let (name, processor) = args.required;
1979
+ let (priority,) = args.optional;
1980
+ let priority = priority.unwrap_or(50);
1981
+
1982
+ if !processor.respond_to("call", true)? {
1983
+ return Err(runtime_error("Post-processor must be a Proc or respond to 'call'"));
1984
+ }
1985
+
1986
+ use async_trait::async_trait;
1987
+ use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
1988
+ use std::sync::Arc;
1989
+
1990
+ struct RubyPostProcessor {
1991
+ name: String,
1992
+ processor: GcGuardedValue,
1993
+ }
1994
+
1995
+ unsafe impl Send for RubyPostProcessor {}
1996
+ unsafe impl Sync for RubyPostProcessor {}
1997
+
1998
+ impl Plugin for RubyPostProcessor {
1999
+ fn name(&self) -> &str {
2000
+ &self.name
2001
+ }
2002
+
2003
+ fn version(&self) -> String {
2004
+ "1.0.0".to_string()
2005
+ }
2006
+
2007
+ fn initialize(&self) -> kreuzberg::Result<()> {
2008
+ Ok(())
2009
+ }
2010
+
2011
+ fn shutdown(&self) -> kreuzberg::Result<()> {
2012
+ Ok(())
2013
+ }
2014
+ }
2015
+
2016
+ #[async_trait]
2017
+ impl PostProcessor for RubyPostProcessor {
2018
+ async fn process(
2019
+ &self,
2020
+ result: &mut kreuzberg::ExtractionResult,
2021
+ _config: &kreuzberg::ExtractionConfig,
2022
+ ) -> kreuzberg::Result<()> {
2023
+ let processor_name = self.name.clone();
2024
+ let processor = self.processor.value();
2025
+ let result_clone = result.clone();
2026
+
2027
+ let updated_result = tokio::task::block_in_place(|| {
2028
+ let ruby = Ruby::get().expect("Ruby not initialized");
2029
+ let result_hash = extraction_result_to_ruby(&ruby, result_clone.clone()).map_err(|e| {
2030
+ kreuzberg::KreuzbergError::Plugin {
2031
+ message: format!("Failed to convert result to Ruby: {}", e),
2032
+ plugin_name: processor_name.clone(),
2033
+ }
2034
+ })?;
2035
+
2036
+ let modified = processor
2037
+ .funcall::<_, _, magnus::Value>("call", (result_hash,))
2038
+ .map_err(|e| kreuzberg::KreuzbergError::Plugin {
2039
+ message: format!("Ruby post-processor failed: {}", e),
2040
+ plugin_name: processor_name.clone(),
2041
+ })?;
2042
+
2043
+ let modified_hash =
2044
+ magnus::RHash::try_convert(modified).map_err(|e| kreuzberg::KreuzbergError::Plugin {
2045
+ message: format!("Post-processor must return a Hash: {}", e),
2046
+ plugin_name: processor_name.clone(),
2047
+ })?;
2048
+
2049
+ let mut updated_result = result_clone;
2050
+
2051
+ if let Some(content_val) = get_kw(&ruby, modified_hash, "content") {
2052
+ let new_content =
2053
+ String::try_convert(content_val).map_err(|e| kreuzberg::KreuzbergError::Plugin {
2054
+ message: format!("Failed to convert content: {}", e),
2055
+ plugin_name: processor_name.clone(),
2056
+ })?;
2057
+ updated_result.content = new_content;
2058
+ }
2059
+
2060
+ if let Some(mime_val) = get_kw(&ruby, modified_hash, "mime_type") {
2061
+ let new_mime = String::try_convert(mime_val).map_err(|e| kreuzberg::KreuzbergError::Plugin {
2062
+ message: format!("Failed to convert mime_type: {}", e),
2063
+ plugin_name: processor_name.clone(),
2064
+ })?;
2065
+ updated_result.mime_type = new_mime;
2066
+ }
2067
+
2068
+ if let Some(metadata_val) = get_kw(&ruby, modified_hash, "metadata") {
2069
+ if metadata_val.is_nil() {
2070
+ updated_result.metadata = kreuzberg::types::Metadata::default();
2071
+ } else {
2072
+ let metadata_json =
2073
+ ruby_value_to_json(metadata_val).map_err(|e| kreuzberg::KreuzbergError::Plugin {
2074
+ message: format!("Metadata must be JSON-serializable: {}", e),
2075
+ plugin_name: processor_name.clone(),
2076
+ })?;
2077
+ let metadata: kreuzberg::types::Metadata =
2078
+ serde_json::from_value(metadata_json).map_err(|e| kreuzberg::KreuzbergError::Plugin {
2079
+ message: format!("Failed to deserialize metadata: {}", e),
2080
+ plugin_name: processor_name.clone(),
2081
+ })?;
2082
+ updated_result.metadata = metadata;
2083
+ }
2084
+ }
2085
+
2086
+ if let Some(tables_val) = get_kw(&ruby, modified_hash, "tables") {
2087
+ let tables_json =
2088
+ ruby_value_to_json(tables_val).map_err(|e| kreuzberg::KreuzbergError::Plugin {
2089
+ message: format!("Tables must be JSON-serializable: {}", e),
2090
+ plugin_name: processor_name.clone(),
2091
+ })?;
2092
+ if tables_json.is_null() {
2093
+ updated_result.tables.clear();
2094
+ } else {
2095
+ let tables: Vec<kreuzberg::types::Table> =
2096
+ serde_json::from_value(tables_json).map_err(|e| kreuzberg::KreuzbergError::Plugin {
2097
+ message: format!("Failed to deserialize tables: {}", e),
2098
+ plugin_name: processor_name.clone(),
2099
+ })?;
2100
+ updated_result.tables = tables;
2101
+ }
2102
+ }
2103
+
2104
+ if let Some(languages_val) = get_kw(&ruby, modified_hash, "detected_languages") {
2105
+ if languages_val.is_nil() {
2106
+ updated_result.detected_languages = None;
2107
+ } else {
2108
+ let langs_json =
2109
+ ruby_value_to_json(languages_val).map_err(|e| kreuzberg::KreuzbergError::Plugin {
2110
+ message: format!("detected_languages must be JSON-serializable: {}", e),
2111
+ plugin_name: processor_name.clone(),
2112
+ })?;
2113
+ let languages: Vec<String> =
2114
+ serde_json::from_value(langs_json).map_err(|e| kreuzberg::KreuzbergError::Plugin {
2115
+ message: format!("Failed to deserialize detected_languages: {}", e),
2116
+ plugin_name: processor_name.clone(),
2117
+ })?;
2118
+ updated_result.detected_languages = Some(languages);
2119
+ }
2120
+ }
2121
+
2122
+ if let Some(chunks_val) = get_kw(&ruby, modified_hash, "chunks") {
2123
+ if chunks_val.is_nil() {
2124
+ updated_result.chunks = None;
2125
+ } else {
2126
+ let chunks_json =
2127
+ ruby_value_to_json(chunks_val).map_err(|e| kreuzberg::KreuzbergError::Plugin {
2128
+ message: format!("Chunks must be JSON-serializable: {}", e),
2129
+ plugin_name: processor_name.clone(),
2130
+ })?;
2131
+ let chunks: Vec<kreuzberg::types::Chunk> =
2132
+ serde_json::from_value(chunks_json).map_err(|e| kreuzberg::KreuzbergError::Plugin {
2133
+ message: format!("Failed to deserialize chunks: {}", e),
2134
+ plugin_name: processor_name.clone(),
2135
+ })?;
2136
+ updated_result.chunks = Some(chunks);
2137
+ }
2138
+ }
2139
+
2140
+ Ok::<kreuzberg::ExtractionResult, kreuzberg::KreuzbergError>(updated_result)
2141
+ })?;
2142
+
2143
+ *result = updated_result;
2144
+ Ok(())
2145
+ }
2146
+
2147
+ fn processing_stage(&self) -> ProcessingStage {
2148
+ ProcessingStage::Late
2149
+ }
2150
+ }
2151
+
2152
+ let processor_impl = Arc::new(RubyPostProcessor {
2153
+ name: name.clone(),
2154
+ processor: GcGuardedValue::new(processor),
2155
+ });
2156
+
2157
+ let registry = kreuzberg::get_post_processor_registry();
2158
+ registry
2159
+ .write()
2160
+ .map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
2161
+ .register(processor_impl, priority)
2162
+ .map_err(kreuzberg_error)?;
2163
+
2164
+ Ok(())
2165
+ }
2166
+
2167
+ /// Register a validator plugin.
2168
+ ///
2169
+ /// @param name [String] Unique identifier for the validator
2170
+ /// @param validator [Proc] Ruby Proc/lambda that validates extraction results
2171
+ /// @param priority [Integer] Execution priority (default: 50, higher = runs first)
2172
+ /// @return [nil]
2173
+ ///
2174
+ /// # Example
2175
+ /// ```text
2176
+ /// Kreuzberg.register_validator("min_length", ->(result) {
2177
+ /// raise "Content too short" if result[:content].length < 100
2178
+ /// }, 100)
2179
+ /// ```
2180
+ fn register_validator(args: &[Value]) -> Result<(), Error> {
2181
+ let _ruby = Ruby::get().expect("Ruby not initialized");
2182
+ let args = scan_args::<(String, Value), (Option<i32>,), (), (), (), ()>(args)?;
2183
+ let (name, validator) = args.required;
2184
+ let (priority,) = args.optional;
2185
+ let priority = priority.unwrap_or(50);
2186
+
2187
+ if !validator.respond_to("call", true)? {
2188
+ return Err(runtime_error("Validator must be a Proc or respond to 'call'"));
2189
+ }
2190
+
2191
+ use async_trait::async_trait;
2192
+ use kreuzberg::plugins::{Plugin, Validator};
2193
+ use std::sync::Arc;
2194
+
2195
+ struct RubyValidator {
2196
+ name: String,
2197
+ validator: GcGuardedValue,
2198
+ priority: i32,
2199
+ }
2200
+
2201
+ unsafe impl Send for RubyValidator {}
2202
+ unsafe impl Sync for RubyValidator {}
2203
+
2204
+ impl Plugin for RubyValidator {
2205
+ fn name(&self) -> &str {
2206
+ &self.name
2207
+ }
2208
+
2209
+ fn version(&self) -> String {
2210
+ "1.0.0".to_string()
2211
+ }
2212
+
2213
+ fn initialize(&self) -> kreuzberg::Result<()> {
2214
+ Ok(())
2215
+ }
2216
+
2217
+ fn shutdown(&self) -> kreuzberg::Result<()> {
2218
+ Ok(())
2219
+ }
2220
+ }
2221
+
2222
+ #[async_trait]
2223
+ impl Validator for RubyValidator {
2224
+ async fn validate(
2225
+ &self,
2226
+ result: &kreuzberg::ExtractionResult,
2227
+ _config: &kreuzberg::ExtractionConfig,
2228
+ ) -> kreuzberg::Result<()> {
2229
+ let validator_name = self.name.clone();
2230
+ let validator = self.validator.value();
2231
+ let result_clone = result.clone();
2232
+
2233
+ tokio::task::block_in_place(|| {
2234
+ let ruby = Ruby::get().expect("Ruby not initialized");
2235
+ let result_hash =
2236
+ extraction_result_to_ruby(&ruby, result_clone).map_err(|e| kreuzberg::KreuzbergError::Plugin {
2237
+ message: format!("Failed to convert result to Ruby: {}", e),
2238
+ plugin_name: validator_name.clone(),
2239
+ })?;
2240
+
2241
+ validator
2242
+ .funcall::<_, _, magnus::Value>("call", (result_hash,))
2243
+ .map_err(|e| kreuzberg::KreuzbergError::Validation {
2244
+ message: format!("Validation failed: {}", e),
2245
+ source: None,
2246
+ })?;
2247
+
2248
+ Ok(())
2249
+ })
2250
+ }
2251
+
2252
+ fn priority(&self) -> i32 {
2253
+ self.priority
2254
+ }
2255
+ }
2256
+
2257
+ let validator_impl = Arc::new(RubyValidator {
2258
+ name: name.clone(),
2259
+ validator: GcGuardedValue::new(validator),
2260
+ priority,
2261
+ });
2262
+
2263
+ let registry = kreuzberg::get_validator_registry();
2264
+ registry
2265
+ .write()
2266
+ .map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
2267
+ .register(validator_impl)
2268
+ .map_err(kreuzberg_error)?;
2269
+
2270
+ Ok(())
2271
+ }
2272
+
2273
+ /// Register an OCR backend plugin.
2274
+ ///
2275
+ /// @param name [String] Unique identifier for the OCR backend
2276
+ /// @param backend [Object] Ruby object implementing OCR backend interface
2277
+ /// @return [nil]
2278
+ ///
2279
+ /// # Example
2280
+ /// ```text
2281
+ /// class CustomOcr
2282
+ /// def process_image(image_bytes, language)
2283
+ /// # Return extracted text
2284
+ /// "Extracted text"
2285
+ /// end
2286
+ ///
2287
+ /// def supports_language?(lang)
2288
+ /// %w[eng deu fra].include?(lang)
2289
+ /// end
2290
+ /// end
2291
+ ///
2292
+ /// Kreuzberg.register_ocr_backend("custom", CustomOcr.new)
2293
+ /// ```
2294
+ fn register_ocr_backend(name: String, backend: Value) -> Result<(), Error> {
2295
+ if !backend.respond_to("name", true)? {
2296
+ return Err(runtime_error("OCR backend must respond to 'name'"));
2297
+ }
2298
+ if !backend.respond_to("process_image", true)? {
2299
+ return Err(runtime_error("OCR backend must respond to 'process_image'"));
2300
+ }
2301
+
2302
+ use async_trait::async_trait;
2303
+ use kreuzberg::plugins::{OcrBackend, OcrBackendType, Plugin};
2304
+ use std::sync::Arc;
2305
+
2306
+ struct RubyOcrBackend {
2307
+ name: String,
2308
+ backend: GcGuardedValue,
2309
+ }
2310
+
2311
+ unsafe impl Send for RubyOcrBackend {}
2312
+ unsafe impl Sync for RubyOcrBackend {}
2313
+
2314
+ impl Plugin for RubyOcrBackend {
2315
+ fn name(&self) -> &str {
2316
+ &self.name
2317
+ }
2318
+
2319
+ fn version(&self) -> String {
2320
+ "1.0.0".to_string()
2321
+ }
2322
+
2323
+ fn initialize(&self) -> kreuzberg::Result<()> {
2324
+ Ok(())
2325
+ }
2326
+
2327
+ fn shutdown(&self) -> kreuzberg::Result<()> {
2328
+ Ok(())
2329
+ }
2330
+ }
2331
+
2332
+ #[async_trait]
2333
+ impl OcrBackend for RubyOcrBackend {
2334
+ async fn process_image(
2335
+ &self,
2336
+ image_bytes: &[u8],
2337
+ config: &kreuzberg::OcrConfig,
2338
+ ) -> kreuzberg::Result<kreuzberg::ExtractionResult> {
2339
+ let ruby = Ruby::get().expect("Ruby not initialized");
2340
+ let image_str = ruby.str_from_slice(image_bytes);
2341
+
2342
+ let config_hash = ocr_config_to_ruby_hash(&ruby, config).map_err(|e| kreuzberg::KreuzbergError::Ocr {
2343
+ message: format!("Failed to convert OCR config: {}", e),
2344
+ source: None,
2345
+ })?;
2346
+
2347
+ let response = self
2348
+ .backend
2349
+ .value()
2350
+ .funcall::<_, _, Value>("process_image", (image_str, config_hash.into_value_with(&ruby)))
2351
+ .map_err(|e| kreuzberg::KreuzbergError::Ocr {
2352
+ message: format!("Ruby OCR backend failed: {}", e),
2353
+ source: None,
2354
+ })?;
2355
+
2356
+ let text = String::try_convert(response).map_err(|e| kreuzberg::KreuzbergError::Ocr {
2357
+ message: format!("OCR backend must return a String: {}", e),
2358
+ source: None,
2359
+ })?;
2360
+
2361
+ Ok(kreuzberg::ExtractionResult {
2362
+ content: text,
2363
+ mime_type: "text/plain".to_string(),
2364
+ metadata: kreuzberg::types::Metadata::default(),
2365
+ tables: vec![],
2366
+ detected_languages: None,
2367
+ chunks: None,
2368
+ images: None,
2369
+ })
2370
+ }
2371
+
2372
+ fn supports_language(&self, lang: &str) -> bool {
2373
+ match self.backend.value().respond_to("supports_language?", true) {
2374
+ Ok(true) => self
2375
+ .backend
2376
+ .value()
2377
+ .funcall::<_, _, bool>("supports_language?", (lang,))
2378
+ .unwrap_or(true),
2379
+ _ => true,
2380
+ }
2381
+ }
2382
+
2383
+ fn backend_type(&self) -> OcrBackendType {
2384
+ OcrBackendType::Custom
2385
+ }
2386
+ }
2387
+
2388
+ let backend_impl = Arc::new(RubyOcrBackend {
2389
+ name: name.clone(),
2390
+ backend: GcGuardedValue::new(backend),
2391
+ });
2392
+
2393
+ let registry = kreuzberg::get_ocr_backend_registry();
2394
+ registry
2395
+ .write()
2396
+ .map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
2397
+ .register(backend_impl)
2398
+ .map_err(kreuzberg_error)?;
2399
+
2400
+ Ok(())
2401
+ }
2402
+
2403
+ /// Unregister a post-processor plugin.
2404
+ ///
2405
+ /// @param name [String] Name of the post-processor to remove
2406
+ /// @return [nil]
2407
+ ///
2408
+ fn unregister_post_processor(name: String) -> Result<(), Error> {
2409
+ let registry = kreuzberg::get_post_processor_registry();
2410
+ registry
2411
+ .write()
2412
+ .map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
2413
+ .remove(&name)
2414
+ .map_err(kreuzberg_error)?;
2415
+ Ok(())
2416
+ }
2417
+
2418
+ /// Unregister a validator plugin.
2419
+ ///
2420
+ /// @param name [String] Name of the validator to remove
2421
+ /// @return [nil]
2422
+ ///
2423
+ fn unregister_validator(name: String) -> Result<(), Error> {
2424
+ let registry = kreuzberg::get_validator_registry();
2425
+ registry
2426
+ .write()
2427
+ .map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
2428
+ .remove(&name)
2429
+ .map_err(kreuzberg_error)?;
2430
+ Ok(())
2431
+ }
2432
+
2433
+ /// Clear all registered post-processors.
2434
+ ///
2435
+ /// @return [nil]
2436
+ ///
2437
+ fn clear_post_processors() -> Result<(), Error> {
2438
+ let registry = kreuzberg::get_post_processor_registry();
2439
+ registry
2440
+ .write()
2441
+ .map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
2442
+ .shutdown_all()
2443
+ .map_err(kreuzberg_error)?;
2444
+ Ok(())
2445
+ }
2446
+
2447
+ /// Clear all registered validators.
2448
+ ///
2449
+ /// @return [nil]
2450
+ ///
2451
+ fn clear_validators() -> Result<(), Error> {
2452
+ let registry = kreuzberg::get_validator_registry();
2453
+ registry
2454
+ .write()
2455
+ .map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
2456
+ .shutdown_all()
2457
+ .map_err(kreuzberg_error)?;
2458
+ Ok(())
2459
+ }
2460
+
2461
+ /// List all registered validators.
2462
+ ///
2463
+ /// @return [Array<String>] Array of validator names
2464
+ ///
2465
+ fn list_validators() -> Result<Vec<String>, Error> {
2466
+ let registry = kreuzberg::get_validator_registry();
2467
+ let validators = registry
2468
+ .read()
2469
+ .map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
2470
+ .list();
2471
+ Ok(validators)
2472
+ }
2473
+
2474
+ /// List all registered post-processors.
2475
+ ///
2476
+ /// @return [Array<String>] Array of post-processor names
2477
+ ///
2478
+ fn list_post_processors() -> Result<Vec<String>, Error> {
2479
+ let registry = kreuzberg::get_post_processor_registry();
2480
+ let processors = registry
2481
+ .read()
2482
+ .map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
2483
+ .list();
2484
+ Ok(processors)
2485
+ }
2486
+
2487
+ /// Unregister an OCR backend by name.
2488
+ ///
2489
+ /// Removes a previously registered OCR backend from the global registry.
2490
+ ///
2491
+ /// @param name [String] Backend name to unregister
2492
+ /// @return [void]
2493
+ ///
2494
+ /// @example
2495
+ /// Kreuzberg.unregister_ocr_backend("my_ocr")
2496
+ ///
2497
+ fn unregister_ocr_backend(name: String) -> Result<(), Error> {
2498
+ kreuzberg::plugins::unregister_ocr_backend(&name).map_err(|e| runtime_error(e.to_string()))
2499
+ }
2500
+
2501
+ /// List all registered OCR backend names.
2502
+ ///
2503
+ /// Returns an array of all OCR backend names currently registered in the global registry.
2504
+ ///
2505
+ /// @return [Array<String>] Array of OCR backend names
2506
+ ///
2507
+ /// @example
2508
+ /// backends = Kreuzberg.list_ocr_backends
2509
+ /// #=> ["tesseract", "my_custom_ocr"]
2510
+ ///
2511
+ fn list_ocr_backends() -> Result<Vec<String>, Error> {
2512
+ kreuzberg::plugins::list_ocr_backends().map_err(|e| runtime_error(e.to_string()))
2513
+ }
2514
+
2515
+ /// Clear all registered OCR backends.
2516
+ ///
2517
+ /// Removes all OCR backends from the global registry and calls their shutdown methods.
2518
+ ///
2519
+ /// @return [void]
2520
+ ///
2521
+ /// @example
2522
+ /// Kreuzberg.clear_ocr_backends
2523
+ ///
2524
+ fn clear_ocr_backends() -> Result<(), Error> {
2525
+ kreuzberg::plugins::clear_ocr_backends().map_err(|e| runtime_error(e.to_string()))
2526
+ }
2527
+
2528
+ /// List all registered document extractor names.
2529
+ ///
2530
+ /// Returns an array of all document extractor names currently registered in the global registry.
2531
+ ///
2532
+ /// @return [Array<String>] Array of document extractor names
2533
+ ///
2534
+ /// @example
2535
+ /// extractors = Kreuzberg.list_document_extractors
2536
+ /// #=> ["pdf", "docx", "txt"]
2537
+ ///
2538
+ fn list_document_extractors() -> Result<Vec<String>, Error> {
2539
+ kreuzberg::plugins::list_extractors().map_err(|e| runtime_error(e.to_string()))
2540
+ }
2541
+
2542
+ /// Unregister a document extractor by name.
2543
+ ///
2544
+ /// Removes a previously registered document extractor from the global registry.
2545
+ ///
2546
+ /// @param name [String] Extractor name to unregister
2547
+ /// @return [void]
2548
+ ///
2549
+ /// @example
2550
+ /// Kreuzberg.unregister_document_extractor("my_extractor")
2551
+ ///
2552
+ fn unregister_document_extractor(name: String) -> Result<(), Error> {
2553
+ kreuzberg::plugins::unregister_extractor(&name).map_err(|e| runtime_error(e.to_string()))
2554
+ }
2555
+
2556
+ /// Clear all registered document extractors.
2557
+ ///
2558
+ /// Removes all document extractors from the global registry and calls their shutdown methods.
2559
+ ///
2560
+ /// @return [void]
2561
+ ///
2562
+ /// @example
2563
+ /// Kreuzberg.clear_document_extractors
2564
+ ///
2565
+ fn clear_document_extractors() -> Result<(), Error> {
2566
+ kreuzberg::plugins::clear_extractors().map_err(|e| runtime_error(e.to_string()))
2567
+ }
2568
+
2569
+ /// Validate that a MIME type is supported.
2570
+ ///
2571
+ /// @param mime_type [String] The MIME type to validate
2572
+ /// @return [String] The validated MIME type (may be normalized)
2573
+ ///
2574
+ /// @example
2575
+ /// validated = Kreuzberg.validate_mime_type("application/pdf")
2576
+ /// #=> "application/pdf"
2577
+ ///
2578
+ /// @example Validate image MIME type
2579
+ /// validated = Kreuzberg.validate_mime_type("image/jpeg")
2580
+ /// #=> "image/jpeg"
2581
+ ///
2582
+ fn validate_mime_type_native(mime_type: String) -> Result<String, Error> {
2583
+ kreuzberg::validate_mime_type(&mime_type).map_err(kreuzberg_error)
2584
+ }
2585
+
2586
+ /// Detect MIME type from byte content.
2587
+ ///
2588
+ /// Uses magic byte detection to determine the MIME type of content.
2589
+ ///
2590
+ /// @param bytes [String] The byte content to analyze
2591
+ /// @return [String] Detected MIME type
2592
+ ///
2593
+ /// @example
2594
+ /// pdf_bytes = "%PDF-1.4\n"
2595
+ /// mime = Kreuzberg.detect_mime_type(pdf_bytes)
2596
+ /// #=> "application/pdf"
2597
+ ///
2598
+ fn detect_mime_type_from_bytes(bytes: String) -> Result<String, Error> {
2599
+ let mime_type = kreuzberg::detect_mime_type_from_bytes(bytes.as_bytes()).map_err(kreuzberg_error)?;
2600
+ Ok(mime_type)
2601
+ }
2602
+
2603
+ /// Detect MIME type from a file path.
2604
+ ///
2605
+ /// Detects MIME type by reading the file's magic bytes.
2606
+ ///
2607
+ /// @param path [String] Path to the file
2608
+ /// @return [String] Detected MIME type
2609
+ ///
2610
+ /// @example
2611
+ /// mime = Kreuzberg.detect_mime_type_from_path("document.pdf")
2612
+ /// #=> "application/pdf"
2613
+ ///
2614
+ fn detect_mime_type_from_path_native(path: String) -> Result<String, Error> {
2615
+ let content = fs::read(&path).map_err(KreuzbergError::Io).map_err(kreuzberg_error)?;
2616
+ let mime_type = kreuzberg::detect_mime_type_from_bytes(&content).map_err(kreuzberg_error)?;
2617
+ Ok(mime_type)
2618
+ }
2619
+
2620
+ /// Get file extensions for a given MIME type.
2621
+ ///
2622
+ /// Returns an array of file extensions commonly associated with the MIME type.
2623
+ ///
2624
+ /// @param mime_type [String] The MIME type
2625
+ /// @return [Array<String>] Array of file extensions (without dots)
2626
+ ///
2627
+ /// @example
2628
+ /// exts = Kreuzberg.get_extensions_for_mime("application/pdf")
2629
+ /// #=> ["pdf"]
2630
+ ///
2631
+ /// @example
2632
+ /// exts = Kreuzberg.get_extensions_for_mime("image/jpeg")
2633
+ /// #=> ["jpg", "jpeg"]
2634
+ ///
2635
+ fn get_extensions_for_mime_native(mime_type: String) -> Result<Vec<String>, Error> {
2636
+ kreuzberg::get_extensions_for_mime(&mime_type).map_err(kreuzberg_error)
2637
+ }
2638
+
2639
+ /// List all available embedding preset names.
2640
+ ///
2641
+ /// Returns an array of preset names that can be used with get_embedding_preset.
2642
+ ///
2643
+ /// # Returns
2644
+ ///
2645
+ /// Array of 4 preset names: ["fast", "balanced", "quality", "multilingual"]
2646
+ ///
2647
+ /// # Example
2648
+ ///
2649
+ /// ```ruby
2650
+ /// require 'kreuzberg'
2651
+ ///
2652
+ /// presets = Kreuzberg.list_embedding_presets
2653
+ /// puts presets # => ["fast", "balanced", "quality", "multilingual"]
2654
+ /// ```
2655
+ fn list_embedding_presets(ruby: &Ruby) -> Result<RArray, Error> {
2656
+ let presets = kreuzberg::embeddings::list_presets();
2657
+ let array = ruby.ary_new();
2658
+ for name in presets {
2659
+ array.push(name)?;
2660
+ }
2661
+ Ok(array)
2662
+ }
2663
+
2664
+ /// Get a specific embedding preset by name.
2665
+ ///
2666
+ /// Returns a preset configuration hash, or nil if the preset name is not found.
2667
+ ///
2668
+ /// # Arguments
2669
+ ///
2670
+ /// * `name` - The preset name (case-sensitive)
2671
+ ///
2672
+ /// # Returns
2673
+ ///
2674
+ /// Hash with preset configuration or nil if not found
2675
+ ///
2676
+ /// Available presets:
2677
+ /// - "fast": AllMiniLML6V2Q (384 dimensions) - Quick prototyping, low-latency
2678
+ /// - "balanced": BGEBaseENV15 (768 dimensions) - General-purpose RAG
2679
+ /// - "quality": BGELargeENV15 (1024 dimensions) - High-quality embeddings
2680
+ /// - "multilingual": MultilingualE5Base (768 dimensions) - Multi-language support
2681
+ ///
2682
+ /// # Example
2683
+ ///
2684
+ /// ```ruby
2685
+ /// require 'kreuzberg'
2686
+ ///
2687
+ /// preset = Kreuzberg.get_embedding_preset("balanced")
2688
+ /// if preset
2689
+ /// puts "Model: #{preset[:model_name]}, Dims: #{preset[:dimensions]}"
2690
+ /// # => Model: BGEBaseENV15, Dims: 768
2691
+ /// end
2692
+ /// ```
2693
+ fn get_embedding_preset(ruby: &Ruby, name: String) -> Result<Value, Error> {
2694
+ let preset = kreuzberg::embeddings::get_preset(&name);
2695
+
2696
+ match preset {
2697
+ Some(preset) => {
2698
+ let hash = ruby.hash_new();
2699
+
2700
+ set_hash_entry(ruby, &hash, "name", ruby.str_new(preset.name).as_value())?;
2701
+ set_hash_entry(ruby, &hash, "chunk_size", preset.chunk_size.into_value_with(ruby))?;
2702
+ set_hash_entry(ruby, &hash, "overlap", preset.overlap.into_value_with(ruby))?;
2703
+
2704
+ let model_name = format!("{:?}", preset.model);
2705
+
2706
+ set_hash_entry(ruby, &hash, "model_name", ruby.str_new(&model_name).as_value())?;
2707
+ set_hash_entry(ruby, &hash, "dimensions", preset.dimensions.into_value_with(ruby))?;
2708
+ set_hash_entry(ruby, &hash, "description", ruby.str_new(preset.description).as_value())?;
2709
+
2710
+ Ok(hash.as_value())
2711
+ }
2712
+ None => Ok(ruby.qnil().as_value()),
2713
+ }
2714
+ }
2715
+
2716
+ /// Get the last error code from FFI
2717
+ ///
2718
+ /// Returns an i32 error code indicating the type of error that occurred:
2719
+ /// - 0: Success (no error)
2720
+ /// - 1: GenericError
2721
+ /// - 2: Panic
2722
+ /// - 3: InvalidArgument
2723
+ /// - 4: IoError
2724
+ /// - 5: ParsingError
2725
+ /// - 6: OcrError
2726
+ /// - 7: MissingDependency
2727
+ ///
2728
+ /// @return [Integer] The error code
2729
+ fn last_error_code() -> i32 {
2730
+ get_error_code()
2731
+ }
2732
+
2733
+ /// Get the last panic context from FFI as a JSON string
2734
+ ///
2735
+ /// Returns a JSON string containing panic context if the last error was a panic,
2736
+ /// or nil if no panic context is available.
2737
+ ///
2738
+ /// The JSON structure contains:
2739
+ /// - file: Source file where panic occurred
2740
+ /// - line: Line number
2741
+ /// - function: Function name
2742
+ /// - message: Panic message
2743
+ /// - timestamp_secs: Unix timestamp
2744
+ ///
2745
+ /// @return [String, nil] JSON string with panic context or nil
2746
+ fn last_panic_context_json(ruby: &Ruby) -> Value {
2747
+ match get_panic_context() {
2748
+ Some(json) => ruby.str_new(&json).as_value(),
2749
+ None => ruby.qnil().as_value(),
2750
+ }
2751
+ }
2752
+
2753
+ /// Initialize the Kreuzberg Ruby module
2754
+ #[magnus::init]
2755
+ fn init(ruby: &Ruby) -> Result<(), Error> {
2756
+ let module = ruby.define_module("Kreuzberg")?;
2757
+
2758
+ module.define_module_function("extract_file_sync", function!(extract_file_sync, -1))?;
2759
+ module.define_module_function("extract_bytes_sync", function!(extract_bytes_sync, -1))?;
2760
+ module.define_module_function("batch_extract_files_sync", function!(batch_extract_files_sync, -1))?;
2761
+ module.define_module_function("batch_extract_bytes_sync", function!(batch_extract_bytes_sync, -1))?;
2762
+
2763
+ module.define_module_function("extract_file", function!(extract_file, -1))?;
2764
+ module.define_module_function("extract_bytes", function!(extract_bytes, -1))?;
2765
+ module.define_module_function("batch_extract_files", function!(batch_extract_files, -1))?;
2766
+ module.define_module_function("batch_extract_bytes", function!(batch_extract_bytes, -1))?;
2767
+
2768
+ module.define_module_function("clear_cache", function!(ruby_clear_cache, 0))?;
2769
+ module.define_module_function("cache_stats", function!(ruby_cache_stats, 0))?;
2770
+
2771
+ module.define_module_function("register_post_processor", function!(register_post_processor, -1))?;
2772
+ module.define_module_function("register_validator", function!(register_validator, -1))?;
2773
+ module.define_module_function("register_ocr_backend", function!(register_ocr_backend, 2))?;
2774
+ module.define_module_function("unregister_post_processor", function!(unregister_post_processor, 1))?;
2775
+ module.define_module_function("unregister_validator", function!(unregister_validator, 1))?;
2776
+ module.define_module_function("clear_post_processors", function!(clear_post_processors, 0))?;
2777
+ module.define_module_function("clear_validators", function!(clear_validators, 0))?;
2778
+ module.define_module_function("list_post_processors", function!(list_post_processors, 0))?;
2779
+ module.define_module_function("list_validators", function!(list_validators, 0))?;
2780
+ module.define_module_function("unregister_ocr_backend", function!(unregister_ocr_backend, 1))?;
2781
+ module.define_module_function("list_ocr_backends", function!(list_ocr_backends, 0))?;
2782
+ module.define_module_function("clear_ocr_backends", function!(clear_ocr_backends, 0))?;
2783
+ module.define_module_function("list_document_extractors", function!(list_document_extractors, 0))?;
2784
+ module.define_module_function(
2785
+ "unregister_document_extractor",
2786
+ function!(unregister_document_extractor, 1),
2787
+ )?;
2788
+ module.define_module_function("clear_document_extractors", function!(clear_document_extractors, 0))?;
2789
+
2790
+ module.define_module_function("_config_from_file_native", function!(config_from_file, 1))?;
2791
+ module.define_module_function("_config_discover_native", function!(config_discover, 0))?;
2792
+
2793
+ module.define_module_function("detect_mime_type", function!(detect_mime_type_from_bytes, 1))?;
2794
+ module.define_module_function(
2795
+ "detect_mime_type_from_path",
2796
+ function!(detect_mime_type_from_path_native, 1),
2797
+ )?;
2798
+ module.define_module_function("get_extensions_for_mime", function!(get_extensions_for_mime_native, 1))?;
2799
+ module.define_module_function("validate_mime_type", function!(validate_mime_type_native, 1))?;
2800
+
2801
+ module.define_module_function("list_embedding_presets", function!(list_embedding_presets, 0))?;
2802
+ module.define_module_function("get_embedding_preset", function!(get_embedding_preset, 1))?;
2803
+
2804
+ module.define_module_function("_last_error_code_native", function!(last_error_code, 0))?;
2805
+ module.define_module_function("_last_panic_context_json_native", function!(last_panic_context_json, 0))?;
2806
+
2807
+ Ok(())
2808
+ }
2809
+
2810
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::path::PathBuf;

    /// Build a per-process, per-thread scratch directory path under the platform's
    /// temporary directory, so the tests do not depend on a literal `/tmp`.
    fn scratch_dir(tag: &str) -> PathBuf {
        let thread_id = std::thread::current().id();
        std::env::temp_dir().join(format!(
            "kreuzberg_test_{}_{}_{:?}",
            tag,
            std::process::id(),
            thread_id
        ))
    }

    /// Create `dir` from scratch, discarding leftovers from an earlier run.
    fn recreate_dir(dir: &PathBuf) {
        let _ = fs::remove_dir_all(dir);
        fs::create_dir_all(dir).expect("Failed to create cache directory");
    }

    #[test]
    fn test_ruby_clear_cache_clears_directory() {
        let cache_dir = scratch_dir("clear");
        recreate_dir(&cache_dir);

        let test_file = cache_dir.join("test_cache.msgpack");
        fs::write(&test_file, b"test data").expect("Failed to write test file");

        assert!(test_file.exists(), "Test file should exist before clear");

        let cache_dir_str = cache_dir.to_str().expect("Cache dir must be valid UTF-8");
        let result = kreuzberg::cache::clear_cache_directory(cache_dir_str);

        assert!(result.is_ok(), "Cache clear should succeed");
        let (removed, _) = result.unwrap();
        assert_eq!(removed, 1, "Should remove one file");

        assert!(!test_file.exists(), "Test file should be removed after clear");

        let _ = fs::remove_dir_all(&cache_dir);
    }

    #[test]
    fn test_ruby_cache_stats_returns_correct_structure() {
        let cache_dir = scratch_dir("stats");
        recreate_dir(&cache_dir);

        let test_file1 = cache_dir.join("test1.msgpack");
        let test_file2 = cache_dir.join("test2.msgpack");
        fs::write(&test_file1, b"test data 1").expect("Failed to write test file 1");
        fs::write(&test_file2, b"test data 2").expect("Failed to write test file 2");

        let cache_dir_str = cache_dir.to_str().expect("Cache dir must be valid UTF-8");
        let stats = kreuzberg::cache::get_cache_metadata(cache_dir_str);

        assert!(stats.is_ok(), "Cache stats should succeed");
        let stats = stats.unwrap();

        assert_eq!(stats.total_files, 2, "Should report 2 files");
        assert!(stats.total_size_mb > 0.0, "Total size should be greater than 0");
        assert!(
            stats.available_space_mb > 0.0,
            "Available space should be greater than 0"
        );

        let _ = fs::remove_dir_all(&cache_dir);
    }

    #[test]
    fn test_ruby_cache_stats_converts_mb_to_bytes() {
        let size_mb = 1.5;
        let size_bytes = (size_mb * 1024.0 * 1024.0) as u64;
        assert_eq!(size_bytes, 1_572_864, "Should convert MB to bytes correctly");
    }

    #[test]
    fn test_ruby_clear_cache_handles_empty_directory() {
        let cache_dir = scratch_dir("empty");
        recreate_dir(&cache_dir);

        let cache_dir_str = cache_dir.to_str().expect("Cache dir must be valid UTF-8");
        let result = kreuzberg::cache::clear_cache_directory(cache_dir_str);

        assert!(result.is_ok(), "Should handle empty directory");
        let (removed, freed) = result.unwrap();
        assert_eq!(removed, 0, "Should remove 0 files from empty directory");
        assert_eq!(freed, 0.0, "Should free 0 MB from empty directory");

        let _ = fs::remove_dir_all(&cache_dir);
    }

    #[test]
    fn test_image_extraction_config_conversion() {
        let config = ImageExtractionConfig {
            extract_images: true,
            target_dpi: 300,
            max_image_dimension: 4096,
            auto_adjust_dpi: true,
            min_dpi: 72,
            max_dpi: 600,
        };

        assert!(config.extract_images);
        assert_eq!(config.target_dpi, 300);
        assert_eq!(config.max_image_dimension, 4096);
        assert!(config.auto_adjust_dpi);
        assert_eq!(config.min_dpi, 72);
        assert_eq!(config.max_dpi, 600);
    }

    #[test]
    fn test_image_preprocessing_config_conversion() {
        let config = ImagePreprocessingConfig {
            target_dpi: 300,
            auto_rotate: true,
            deskew: true,
            denoise: false,
            contrast_enhance: false,
            binarization_method: "otsu".to_string(),
            invert_colors: false,
        };

        assert_eq!(config.target_dpi, 300);
        assert!(config.auto_rotate);
        assert!(config.deskew);
        assert!(!config.denoise);
        assert!(!config.contrast_enhance);
        assert_eq!(config.binarization_method, "otsu");
        assert!(!config.invert_colors);
    }

    #[test]
    fn test_postprocessor_config_conversion() {
        let config = PostProcessorConfig {
            enabled: true,
            enabled_processors: Some(vec!["processor1".to_string(), "processor2".to_string()]),
            disabled_processors: None,
        };

        assert!(config.enabled);
        // Borrow instead of moving the field out so `config` stays fully usable.
        assert_eq!(config.enabled_processors.as_ref().map(Vec::len), Some(2));
        assert!(config.disabled_processors.is_none());
    }

    #[test]
    fn test_token_reduction_config_conversion() {
        let config = TokenReductionConfig {
            mode: "moderate".to_string(),
            preserve_important_words: true,
        };

        assert_eq!(config.mode, "moderate");
        assert!(config.preserve_important_words);
    }

    #[test]
    fn test_extraction_config_with_new_fields() {
        let config = ExtractionConfig {
            images: Some(ImageExtractionConfig {
                extract_images: true,
                target_dpi: 300,
                max_image_dimension: 4096,
                auto_adjust_dpi: true,
                min_dpi: 72,
                max_dpi: 600,
            }),
            postprocessor: Some(PostProcessorConfig {
                enabled: true,
                enabled_processors: None,
                disabled_processors: None,
            }),
            token_reduction: Some(TokenReductionConfig {
                mode: "light".to_string(),
                preserve_important_words: true,
            }),
            ..Default::default()
        };

        assert!(config.images.is_some());
        assert!(config.postprocessor.is_some());
        assert!(config.token_reduction.is_some());
    }
}