kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,4 @@
1
+ pub mod tsv_parser;
2
+
3
+ pub use html_to_markdown_rs::hocr::{HocrWord, reconstruct_table, table_to_markdown};
4
+ pub use tsv_parser::extract_words_from_tsv;
@@ -0,0 +1,144 @@
1
+ use super::super::error::OcrError;
2
+ use super::super::utils::{TSV_MIN_FIELDS, TSV_WORD_LEVEL};
3
+ use html_to_markdown_rs::hocr::HocrWord;
4
+
5
+ /// Extract words from Tesseract TSV output and convert to HocrWord format
6
+ ///
7
+ /// This parses Tesseract's TSV format (level, page_num, block_num, ...) and
8
+ /// converts it to the HocrWord format used by html-to-markdown-rs for table reconstruction.
9
+ pub fn extract_words_from_tsv(tsv_data: &str, min_confidence: f64) -> Result<Vec<HocrWord>, OcrError> {
10
+ let mut words = Vec::new();
11
+
12
+ for (line_num, line) in tsv_data.lines().enumerate() {
13
+ if line_num == 0 {
14
+ continue;
15
+ }
16
+
17
+ let line = line.trim();
18
+ if line.is_empty() {
19
+ continue;
20
+ }
21
+
22
+ let fields: Vec<&str> = line.split('\t').collect();
23
+ if fields.len() < TSV_MIN_FIELDS {
24
+ continue;
25
+ }
26
+
27
+ let level = fields[0].parse::<u32>().unwrap_or(0);
28
+ if level != TSV_WORD_LEVEL {
29
+ continue;
30
+ }
31
+
32
+ let conf = fields[10].parse::<f64>().unwrap_or(-1.0);
33
+ if conf < min_confidence {
34
+ continue;
35
+ }
36
+
37
+ let text = fields[11].trim();
38
+ if text.is_empty() {
39
+ continue;
40
+ }
41
+
42
+ let word = HocrWord {
43
+ text: text.to_string(),
44
+ left: fields[6].parse().unwrap_or(0),
45
+ top: fields[7].parse().unwrap_or(0),
46
+ width: fields[8].parse().unwrap_or(0),
47
+ height: fields[9].parse().unwrap_or(0),
48
+ confidence: conf,
49
+ };
50
+
51
+ words.push(word);
52
+ }
53
+
54
+ Ok(words)
55
+ }
56
+
57
#[cfg(test)]
mod tests {
    use super::*;

    /// Header row used by the fixtures. Columns are separated with explicit
    /// `\t` escapes so the fixtures do not depend on an editor or formatter
    /// preserving literal tab characters.
    const HEADER: &str = "level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext";

    /// Join the given lines with newlines into one TSV document.
    fn tsv(lines: &[&str]) -> String {
        lines.join("\n")
    }

    #[test]
    fn test_extract_words_basic() {
        let tsv = tsv(&[
            HEADER,
            "5\t1\t0\t0\t0\t0\t100\t50\t80\t30\t95.5\tHello",
            "5\t1\t0\t0\t0\t1\t190\t50\t70\t30\t92.3\tWorld",
        ]);

        let words = extract_words_from_tsv(&tsv, 0.0).unwrap();
        assert_eq!(words.len(), 2);

        assert_eq!(words[0].text, "Hello");
        assert_eq!(words[0].left, 100);
        assert_eq!(words[0].top, 50);
        assert_eq!(words[0].confidence, 95.5);

        assert_eq!(words[1].text, "World");
        assert_eq!(words[1].left, 190);
    }

    #[test]
    fn test_extract_words_confidence_filter() {
        let tsv = tsv(&[
            HEADER,
            "5\t1\t0\t0\t0\t0\t100\t50\t80\t30\t95.5\tHello",
            "5\t1\t0\t0\t0\t1\t190\t50\t70\t30\t50.0\tWorld",
            "5\t1\t0\t0\t0\t2\t270\t50\t60\t30\t92.3\tTest",
        ]);

        // "World" (50.0) is below the 90.0 threshold and must be dropped.
        let words = extract_words_from_tsv(&tsv, 90.0).unwrap();
        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "Hello");
        assert_eq!(words[1].text, "Test");
    }

    #[test]
    fn test_extract_words_level_filter() {
        let tsv = tsv(&[
            HEADER,
            "3\t1\t0\t0\t0\t0\t100\t50\t80\t30\t95.5\tParagraph",
            "5\t1\t0\t0\t0\t0\t100\t50\t80\t30\t95.5\tHello",
            "4\t1\t0\t0\t0\t1\t190\t50\t70\t30\t92.3\tLine",
        ]);

        // Only the level-5 row is expected to survive.
        let words = extract_words_from_tsv(&tsv, 0.0).unwrap();
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "Hello");
    }

    #[test]
    fn test_hocr_word_methods() {
        let word = HocrWord {
            text: "Hello".to_string(),
            left: 100,
            top: 50,
            width: 80,
            height: 30,
            confidence: 95.5,
        };

        assert_eq!(word.right(), 180);
        assert_eq!(word.bottom(), 80);
        assert_eq!(word.y_center(), 65.0);
        assert_eq!(word.x_center(), 140.0);
    }

    #[test]
    fn test_extract_words_empty_text() {
        let tsv = tsv(&[
            HEADER,
            // Row whose text column is empty (trailing tab, nothing after it).
            "5\t1\t0\t0\t0\t0\t100\t50\t80\t30\t95.5\t",
            "5\t1\t0\t0\t0\t1\t190\t50\t70\t30\t92.3\tWorld",
        ]);

        let words = extract_words_from_tsv(&tsv, 0.0).unwrap();
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "World");
    }

    #[test]
    fn test_extract_words_malformed() {
        let tsv = tsv(&[
            // Truncated header: still skipped because it is the first line.
            "level\tpage_num\tblock_num",
            "5\t1\t0\t0\t0\t0\t100\t50\t80\t30\t95.5\tHello",
            "invalid line",
            "5\t1\t0\t0\t0\t1\t190\t50\t70\t30\t92.3\tWorld",
        ]);

        let words = extract_words_from_tsv(&tsv, 0.0).unwrap();
        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "Hello");
        assert_eq!(words[1].text, "World");
    }
}
@@ -0,0 +1,450 @@
1
+ //! Native Tesseract OCR backend.
2
+ //!
3
+ //! This module provides the native Tesseract backend that implements the OcrBackend
4
+ //! trait, bridging the plugin system with the low-level OcrProcessor.
5
+
6
+ use crate::Result;
7
+ use crate::core::config::OcrConfig;
8
+ use crate::ocr::processor::OcrProcessor;
9
+ use crate::plugins::{OcrBackend, OcrBackendType, Plugin};
10
+ use crate::types::ExtractionResult;
11
+ use async_trait::async_trait;
12
+ use std::path::Path;
13
+ use std::sync::Arc;
14
+
15
+ use crate::ocr::types::TesseractConfig as InternalTesseractConfig;
16
+
17
+ /// Native Tesseract OCR backend.
18
+ ///
19
+ /// This backend wraps the OcrProcessor and implements the OcrBackend trait,
20
+ /// allowing it to be used through the plugin system.
21
+ ///
22
+ /// # Thread Safety
23
+ ///
24
+ /// Uses Arc for shared ownership and is thread-safe (Send + Sync).
25
+ pub struct TesseractBackend {
26
+ processor: Arc<OcrProcessor>,
27
+ }
28
+
29
+ impl TesseractBackend {
30
+ /// Create a new Tesseract backend with default cache directory.
31
+ pub fn new() -> Result<Self> {
32
+ let processor = OcrProcessor::new(None).map_err(|e| crate::KreuzbergError::Ocr {
33
+ message: format!("Failed to create Tesseract processor: {}", e),
34
+ source: Some(Box::new(e)),
35
+ })?;
36
+ Ok(Self {
37
+ processor: Arc::new(processor),
38
+ })
39
+ }
40
+
41
+ /// Create a new Tesseract backend with custom cache directory.
42
+ pub fn with_cache_dir(cache_dir: std::path::PathBuf) -> Result<Self> {
43
+ let processor = OcrProcessor::new(Some(cache_dir)).map_err(|e| crate::KreuzbergError::Ocr {
44
+ message: format!("Failed to create Tesseract processor: {}", e),
45
+ source: Some(Box::new(e)),
46
+ })?;
47
+ Ok(Self {
48
+ processor: Arc::new(processor),
49
+ })
50
+ }
51
+
52
+ /// Convert public API TesseractConfig to internal TesseractConfig.
53
+ ///
54
+ /// The public API types (crate::types) use i32 for compatibility with PyO3,
55
+ /// while internal types (crate::ocr::types) use u8/u32 for efficiency.
56
+ fn convert_config(public_config: &crate::types::TesseractConfig) -> InternalTesseractConfig {
57
+ InternalTesseractConfig {
58
+ language: public_config.language.clone(),
59
+ psm: public_config.psm as u8,
60
+ output_format: public_config.output_format.clone(),
61
+ oem: public_config.oem as u8,
62
+ min_confidence: public_config.min_confidence,
63
+ preprocessing: public_config.preprocessing.clone(),
64
+ enable_table_detection: public_config.enable_table_detection,
65
+ table_min_confidence: public_config.table_min_confidence,
66
+ table_column_threshold: public_config.table_column_threshold as u32,
67
+ table_row_threshold_ratio: public_config.table_row_threshold_ratio,
68
+ use_cache: public_config.use_cache,
69
+ classify_use_pre_adapted_templates: public_config.classify_use_pre_adapted_templates,
70
+ language_model_ngram_on: public_config.language_model_ngram_on,
71
+ tessedit_dont_blkrej_good_wds: public_config.tessedit_dont_blkrej_good_wds,
72
+ tessedit_dont_rowrej_good_wds: public_config.tessedit_dont_rowrej_good_wds,
73
+ tessedit_enable_dict_correction: public_config.tessedit_enable_dict_correction,
74
+ tessedit_char_whitelist: public_config.tessedit_char_whitelist.clone(),
75
+ tessedit_char_blacklist: public_config.tessedit_char_blacklist.clone(),
76
+ tessedit_use_primary_params_model: public_config.tessedit_use_primary_params_model,
77
+ textord_space_size_is_variable: public_config.textord_space_size_is_variable,
78
+ thresholding_method: public_config.thresholding_method,
79
+ }
80
+ }
81
+
82
+ /// Convert OcrConfig to internal TesseractConfig.
83
+ ///
84
+ /// Uses tesseract_config from OcrConfig if provided, otherwise uses defaults
85
+ /// with the language from OcrConfig.
86
+ fn config_to_tesseract(&self, config: &OcrConfig) -> InternalTesseractConfig {
87
+ match &config.tesseract_config {
88
+ Some(tess_config) => Self::convert_config(tess_config),
89
+ None => InternalTesseractConfig {
90
+ language: config.language.clone(),
91
+ ..Default::default()
92
+ },
93
+ }
94
+ }
95
+ }
96
+
97
+ impl Default for TesseractBackend {
98
+ fn default() -> Self {
99
+ Self::new().unwrap()
100
+ }
101
+ }
102
+
103
+ impl Plugin for TesseractBackend {
104
+ fn name(&self) -> &str {
105
+ "tesseract"
106
+ }
107
+
108
+ fn version(&self) -> String {
109
+ kreuzberg_tesseract::TesseractAPI::version()
110
+ }
111
+
112
+ fn initialize(&self) -> Result<()> {
113
+ Ok(())
114
+ }
115
+
116
+ fn shutdown(&self) -> Result<()> {
117
+ self.processor.clear_cache().map_err(|e| crate::KreuzbergError::Plugin {
118
+ message: format!("Failed to clear Tesseract cache: {}", e),
119
+ plugin_name: "tesseract".to_string(),
120
+ })
121
+ }
122
+ }
123
+
124
+ #[async_trait]
125
+ impl OcrBackend for TesseractBackend {
126
+ async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
127
+ let tess_config = self.config_to_tesseract(config);
128
+ let tess_config_clone = tess_config.clone();
129
+
130
+ let processor = Arc::clone(&self.processor);
131
+ let image_bytes = image_bytes.to_vec();
132
+
133
+ let ocr_result = tokio::task::spawn_blocking(move || processor.process_image(&image_bytes, &tess_config_clone))
134
+ .await
135
+ .map_err(|e| crate::KreuzbergError::Plugin {
136
+ message: format!("Tesseract task panicked: {}", e),
137
+ plugin_name: "tesseract".to_string(),
138
+ })?
139
+ .map_err(|e| crate::KreuzbergError::Ocr {
140
+ message: format!("Tesseract OCR failed: {}", e),
141
+ source: Some(Box::new(e)),
142
+ })?;
143
+
144
+ let metadata = crate::types::Metadata {
145
+ format: Some(crate::types::FormatMetadata::Ocr(crate::types::OcrMetadata {
146
+ language: tess_config.language.clone(),
147
+ psm: tess_config.psm as i32,
148
+ output_format: tess_config.output_format.clone(),
149
+ table_count: ocr_result.tables.len(),
150
+ table_rows: ocr_result.tables.first().map(|t| t.cells.len()),
151
+ table_cols: ocr_result
152
+ .tables
153
+ .first()
154
+ .and_then(|t| t.cells.first().map(|row| row.len())),
155
+ })),
156
+ additional: ocr_result.metadata,
157
+ ..Default::default()
158
+ };
159
+
160
+ Ok(ExtractionResult {
161
+ content: ocr_result.content,
162
+ mime_type: ocr_result.mime_type,
163
+ metadata,
164
+ tables: ocr_result
165
+ .tables
166
+ .into_iter()
167
+ .map(|t| crate::types::Table {
168
+ cells: t.cells,
169
+ markdown: t.markdown,
170
+ page_number: t.page_number,
171
+ })
172
+ .collect(),
173
+ detected_languages: None,
174
+ chunks: None,
175
+ images: None,
176
+ })
177
+ }
178
+
179
+ async fn process_file(&self, path: &Path, config: &OcrConfig) -> Result<ExtractionResult> {
180
+ let tess_config = self.config_to_tesseract(config);
181
+ let tess_config_clone = tess_config.clone();
182
+
183
+ let processor = Arc::clone(&self.processor);
184
+ let path_str = path.to_string_lossy().to_string();
185
+
186
+ let ocr_result = tokio::task::spawn_blocking(move || processor.process_file(&path_str, &tess_config_clone))
187
+ .await
188
+ .map_err(|e| crate::KreuzbergError::Plugin {
189
+ message: format!("Tesseract task panicked: {}", e),
190
+ plugin_name: "tesseract".to_string(),
191
+ })?
192
+ .map_err(|e| crate::KreuzbergError::Ocr {
193
+ message: format!("Tesseract OCR failed: {}", e),
194
+ source: Some(Box::new(e)),
195
+ })?;
196
+
197
+ let metadata = crate::types::Metadata {
198
+ format: Some(crate::types::FormatMetadata::Ocr(crate::types::OcrMetadata {
199
+ language: tess_config.language.clone(),
200
+ psm: tess_config.psm as i32,
201
+ output_format: tess_config.output_format.clone(),
202
+ table_count: ocr_result.tables.len(),
203
+ table_rows: ocr_result.tables.first().map(|t| t.cells.len()),
204
+ table_cols: ocr_result
205
+ .tables
206
+ .first()
207
+ .and_then(|t| t.cells.first().map(|row| row.len())),
208
+ })),
209
+ additional: ocr_result.metadata,
210
+ ..Default::default()
211
+ };
212
+
213
+ Ok(ExtractionResult {
214
+ content: ocr_result.content,
215
+ mime_type: ocr_result.mime_type,
216
+ metadata,
217
+ tables: ocr_result
218
+ .tables
219
+ .into_iter()
220
+ .map(|t| crate::types::Table {
221
+ cells: t.cells,
222
+ markdown: t.markdown,
223
+ page_number: t.page_number,
224
+ })
225
+ .collect(),
226
+ detected_languages: None,
227
+ chunks: None,
228
+ images: None,
229
+ })
230
+ }
231
+
232
+ fn supports_language(&self, lang: &str) -> bool {
233
+ // TODO: Query Tesseract for available languages
234
+ matches!(
235
+ lang,
236
+ "eng"
237
+ | "deu"
238
+ | "fra"
239
+ | "spa"
240
+ | "ita"
241
+ | "por"
242
+ | "rus"
243
+ | "chi_sim"
244
+ | "chi_tra"
245
+ | "jpn"
246
+ | "kor"
247
+ | "ara"
248
+ | "hin"
249
+ | "ben"
250
+ | "tha"
251
+ | "vie"
252
+ | "heb"
253
+ | "tur"
254
+ | "pol"
255
+ | "nld"
256
+ | "swe"
257
+ | "dan"
258
+ | "fin"
259
+ | "nor"
260
+ | "ces"
261
+ | "hun"
262
+ | "ron"
263
+ | "ukr"
264
+ | "bul"
265
+ | "hrv"
266
+ | "srp"
267
+ | "slk"
268
+ | "slv"
269
+ | "lit"
270
+ | "lav"
271
+ | "est"
272
+ )
273
+ }
274
+
275
+ fn backend_type(&self) -> OcrBackendType {
276
+ OcrBackendType::Tesseract
277
+ }
278
+
279
+ fn supported_languages(&self) -> Vec<String> {
280
+ // TODO: Query Tesseract API for available languages dynamically
281
+ vec![
282
+ "eng", "deu", "fra", "spa", "ita", "por", "rus", "chi_sim", "chi_tra", "jpn", "kor", "ara", "hin", "ben",
283
+ "tha", "vie", "heb", "tur", "pol", "nld", "swe", "dan", "fin", "nor", "ces", "hun", "ron", "ukr", "bul",
284
+ "hrv", "srp", "slk", "slv", "lit", "lav", "est",
285
+ ]
286
+ .into_iter()
287
+ .map(String::from)
288
+ .collect()
289
+ }
290
+
291
+ fn supports_table_detection(&self) -> bool {
292
+ true
293
+ }
294
+ }
295
+
296
#[cfg(test)]
mod tests {
    use super::*;

    /// Constructs a backend, failing the calling test with a descriptive
    /// message if construction does not succeed.
    fn new_backend() -> TesseractBackend {
        TesseractBackend::new().expect("TesseractBackend::new() should succeed")
    }

    /// Builds an `OcrConfig` that selects the "tesseract" backend with the
    /// given top-level language and optional Tesseract-specific settings.
    fn ocr_config(language: &str, tesseract_config: Option<crate::types::TesseractConfig>) -> OcrConfig {
        OcrConfig {
            backend: "tesseract".to_string(),
            language: language.to_string(),
            tesseract_config,
        }
    }

    #[test]
    fn test_tesseract_backend_creation() {
        assert!(TesseractBackend::new().is_ok());
    }

    #[test]
    fn test_tesseract_backend_plugin_interface() {
        let backend = new_backend();
        assert_eq!(backend.name(), "tesseract");
        assert!(!backend.version().is_empty());
        assert!(backend.initialize().is_ok());
    }

    #[test]
    fn test_tesseract_backend_type() {
        let backend = new_backend();
        assert_eq!(backend.backend_type(), OcrBackendType::Tesseract);
    }

    #[test]
    fn test_tesseract_backend_supports_language() {
        let backend = new_backend();
        assert!(backend.supports_language("eng"));
        assert!(backend.supports_language("deu"));
        assert!(backend.supports_language("fra"));
        assert!(!backend.supports_language("xyz"));
    }

    #[test]
    fn test_tesseract_backend_supports_table_detection() {
        let backend = new_backend();
        assert!(backend.supports_table_detection());
    }

    #[test]
    fn test_tesseract_backend_supported_languages() {
        let backend = new_backend();
        let languages = backend.supported_languages();
        assert!(languages.contains(&"eng".to_string()));
        assert!(languages.contains(&"deu".to_string()));
        assert!(languages.len() > 30);
    }

    /// With no Tesseract-specific settings, the top-level language is used
    /// and the page segmentation mode falls back to the internal default.
    #[test]
    fn test_config_to_tesseract_with_none() {
        let backend = new_backend();
        let config = ocr_config("deu", None);

        let tess_config = backend.config_to_tesseract(&config);
        assert_eq!(tess_config.language, "deu");
        assert_eq!(tess_config.psm, InternalTesseractConfig::default().psm);
    }

    /// Explicit Tesseract settings take precedence over the top-level language.
    #[test]
    fn test_config_to_tesseract_with_some() {
        let backend = new_backend();
        let custom_tess_config = crate::types::TesseractConfig {
            language: "fra".to_string(),
            psm: 6,
            enable_table_detection: true,
            ..Default::default()
        };
        let config = ocr_config("eng", Some(custom_tess_config));

        let tess_config = backend.config_to_tesseract(&config);
        assert_eq!(tess_config.language, "fra");
        assert_eq!(tess_config.psm, 6);
        assert!(tess_config.enable_table_detection);
    }

    #[test]
    fn test_tesseract_backend_default() {
        let backend = TesseractBackend::default();
        assert_eq!(backend.name(), "tesseract");
    }

    /// OEM, confidence threshold, character blacklist and the nested
    /// preprocessing settings must all survive the config conversion.
    #[test]
    fn test_config_conversion_with_new_fields() {
        let backend = new_backend();

        let preprocessing = crate::types::ImagePreprocessingConfig {
            target_dpi: 600,
            auto_rotate: false,
            deskew: true,
            denoise: true,
            contrast_enhance: true,
            binarization_method: "adaptive".to_string(),
            invert_colors: false,
        };

        let custom_tess_config = crate::types::TesseractConfig {
            language: "eng".to_string(),
            psm: 6,
            output_format: "markdown".to_string(),
            oem: 1,
            min_confidence: 80.0,
            // Moved rather than cloned: `preprocessing` is not needed afterwards.
            preprocessing: Some(preprocessing),
            tessedit_char_blacklist: "!@#$".to_string(),
            ..Default::default()
        };
        let config = ocr_config("eng", Some(custom_tess_config));

        let tess_config = backend.config_to_tesseract(&config);

        assert_eq!(tess_config.oem, 1);
        assert_eq!(tess_config.min_confidence, 80.0);
        assert_eq!(tess_config.tessedit_char_blacklist, "!@#$");

        let preproc = tess_config
            .preprocessing
            .expect("preprocessing settings should be carried over by the conversion");
        assert_eq!(preproc.target_dpi, 600);
        assert!(!preproc.auto_rotate);
        assert!(preproc.deskew);
        assert!(preproc.denoise);
        assert!(preproc.contrast_enhance);
        assert_eq!(preproc.binarization_method, "adaptive");
        assert!(!preproc.invert_colors);
    }

    /// The internal config is compared against `u8`/`u32` literals, pinning
    /// the integer types produced by `convert_config`.
    #[test]
    fn test_convert_config_type_conversions() {
        let public_config = crate::types::TesseractConfig {
            language: "eng".to_string(),
            psm: 6,
            oem: 3,
            table_column_threshold: 100,
            ..Default::default()
        };

        let internal_config = TesseractBackend::convert_config(&public_config);

        assert_eq!(internal_config.psm, 6u8);
        assert_eq!(internal_config.oem, 3u8);
        assert_eq!(internal_config.table_column_threshold, 100u32);
    }
}