kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,771 @@
1
+ //! Comprehensive OCR backend plugin system tests.
2
+ //!
3
+ //! Tests custom OCR backend registration, execution, parameter passing,
4
+ //! error handling, and backend switching with real image extraction.
5
+
6
+ use async_trait::async_trait;
7
+ use kreuzberg::core::config::{ExtractionConfig, OcrConfig};
8
+ use kreuzberg::plugins::registry::get_ocr_backend_registry;
9
+ use kreuzberg::plugins::{OcrBackend, OcrBackendType, Plugin};
10
+ use kreuzberg::types::{ExtractionResult, Metadata};
11
+ use kreuzberg::{KreuzbergError, Result, extract_file_sync};
12
+ use serial_test::serial;
13
+ use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
14
+ use std::sync::{Arc, Mutex};
15
+
16
/// Test double for an OCR backend that records how it was used.
///
/// The counters and flags use interior mutability because the plugin traits
/// only hand out `&self`.
struct MockOcrBackend {
    /// Name reported through `Plugin::name`; also the key used to select the
    /// backend via `OcrConfig::backend` in the tests.
    name: String,
    /// Text embedded in the content of every successful result.
    return_text: String,
    /// Number of times `process_image` has been invoked.
    call_count: AtomicUsize,
    /// Language passed to the most recent `process_image` call.
    last_language: Mutex<String>,
    /// Set to `true` by `initialize` and back to `false` by `shutdown`.
    initialized: AtomicBool,
}
23
+
24
+ impl Plugin for MockOcrBackend {
25
+ fn name(&self) -> &str {
26
+ &self.name
27
+ }
28
+
29
+ fn version(&self) -> String {
30
+ "1.0.0".to_string()
31
+ }
32
+
33
+ fn initialize(&self) -> Result<()> {
34
+ self.initialized.store(true, Ordering::Release);
35
+ Ok(())
36
+ }
37
+
38
+ fn shutdown(&self) -> Result<()> {
39
+ self.initialized.store(false, Ordering::Release);
40
+ Ok(())
41
+ }
42
+ }
43
+
44
+ #[async_trait]
45
+ impl OcrBackend for MockOcrBackend {
46
+ async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
47
+ self.call_count.fetch_add(1, Ordering::SeqCst);
48
+
49
+ *self.last_language.lock().unwrap() = config.language.clone();
50
+
51
+ if image_bytes.is_empty() {
52
+ return Err(KreuzbergError::validation("Empty image data".to_string()));
53
+ }
54
+
55
+ Ok(ExtractionResult {
56
+ content: format!("{} (lang: {})", self.return_text, config.language),
57
+ mime_type: "text/plain".to_string(),
58
+ metadata: Metadata::default(),
59
+ tables: vec![],
60
+ detected_languages: None,
61
+ chunks: None,
62
+ images: None,
63
+ })
64
+ }
65
+
66
+ fn supports_language(&self, lang: &str) -> bool {
67
+ matches!(lang, "eng" | "deu" | "fra")
68
+ }
69
+
70
+ fn backend_type(&self) -> OcrBackendType {
71
+ OcrBackendType::Custom
72
+ }
73
+
74
+ fn supported_languages(&self) -> Vec<String> {
75
+ vec!["eng".to_string(), "deu".to_string(), "fra".to_string()]
76
+ }
77
+ }
78
+
79
/// OCR backend whose `process_image` always returns an OCR error.
struct FailingOcrBackend {
    /// Name reported through `Plugin::name`.
    name: String,
}
82
+
83
+ impl Plugin for FailingOcrBackend {
84
+ fn name(&self) -> &str {
85
+ &self.name
86
+ }
87
+
88
+ fn version(&self) -> String {
89
+ "1.0.0".to_string()
90
+ }
91
+
92
+ fn initialize(&self) -> Result<()> {
93
+ Ok(())
94
+ }
95
+
96
+ fn shutdown(&self) -> Result<()> {
97
+ Ok(())
98
+ }
99
+ }
100
+
101
+ #[async_trait]
102
+ impl OcrBackend for FailingOcrBackend {
103
+ async fn process_image(&self, _image_bytes: &[u8], _config: &OcrConfig) -> Result<ExtractionResult> {
104
+ Err(KreuzbergError::ocr("OCR processing intentionally failed".to_string()))
105
+ }
106
+
107
+ fn supports_language(&self, _lang: &str) -> bool {
108
+ true
109
+ }
110
+
111
+ fn backend_type(&self) -> OcrBackendType {
112
+ OcrBackendType::Custom
113
+ }
114
+ }
115
+
116
/// OCR backend that rejects images smaller than a configured byte size.
struct ValidatingOcrBackend {
    /// Name reported through `Plugin::name`.
    name: String,
    /// Minimum accepted image length in bytes; shorter inputs are rejected by
    /// `process_image`.
    min_size: usize,
}
120
+
121
+ impl Plugin for ValidatingOcrBackend {
122
+ fn name(&self) -> &str {
123
+ &self.name
124
+ }
125
+
126
+ fn version(&self) -> String {
127
+ "1.0.0".to_string()
128
+ }
129
+
130
+ fn initialize(&self) -> Result<()> {
131
+ Ok(())
132
+ }
133
+
134
+ fn shutdown(&self) -> Result<()> {
135
+ Ok(())
136
+ }
137
+ }
138
+
139
+ #[async_trait]
140
+ impl OcrBackend for ValidatingOcrBackend {
141
+ async fn process_image(&self, image_bytes: &[u8], _config: &OcrConfig) -> Result<ExtractionResult> {
142
+ if image_bytes.len() < self.min_size {
143
+ return Err(KreuzbergError::validation(format!(
144
+ "Image too small: {} < {} bytes",
145
+ image_bytes.len(),
146
+ self.min_size
147
+ )));
148
+ }
149
+
150
+ Ok(ExtractionResult {
151
+ content: format!("Processed {} bytes", image_bytes.len()),
152
+ mime_type: "text/plain".to_string(),
153
+ metadata: Metadata::default(),
154
+ tables: vec![],
155
+ detected_languages: None,
156
+ chunks: None,
157
+ images: None,
158
+ })
159
+ }
160
+
161
+ fn supports_language(&self, _lang: &str) -> bool {
162
+ true
163
+ }
164
+
165
+ fn backend_type(&self) -> OcrBackendType {
166
+ OcrBackendType::Custom
167
+ }
168
+ }
169
+
170
/// OCR backend that attaches details about the call to the result metadata.
struct MetadataOcrBackend {
    /// Name reported through `Plugin::name`; also written to the
    /// `ocr_backend` metadata entry by `process_image`.
    name: String,
}
173
+
174
+ impl Plugin for MetadataOcrBackend {
175
+ fn name(&self) -> &str {
176
+ &self.name
177
+ }
178
+
179
+ fn version(&self) -> String {
180
+ "1.0.0".to_string()
181
+ }
182
+
183
+ fn initialize(&self) -> Result<()> {
184
+ Ok(())
185
+ }
186
+
187
+ fn shutdown(&self) -> Result<()> {
188
+ Ok(())
189
+ }
190
+ }
191
+
192
+ #[async_trait]
193
+ impl OcrBackend for MetadataOcrBackend {
194
+ async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
195
+ let mut metadata = Metadata::default();
196
+ metadata
197
+ .additional
198
+ .insert("ocr_backend".to_string(), serde_json::json!(self.name()));
199
+ metadata
200
+ .additional
201
+ .insert("image_size".to_string(), serde_json::json!(image_bytes.len()));
202
+ metadata
203
+ .additional
204
+ .insert("ocr_language".to_string(), serde_json::json!(config.language));
205
+
206
+ Ok(ExtractionResult {
207
+ content: "OCR processed text".to_string(),
208
+ mime_type: "text/plain".to_string(),
209
+ metadata,
210
+ tables: vec![],
211
+ detected_languages: None,
212
+ chunks: None,
213
+ images: None,
214
+ })
215
+ }
216
+
217
+ fn supports_language(&self, _lang: &str) -> bool {
218
+ true
219
+ }
220
+
221
+ fn backend_type(&self) -> OcrBackendType {
222
+ OcrBackendType::Custom
223
+ }
224
+ }
225
+
226
+ #[serial]
227
+ #[test]
228
+ fn test_register_custom_ocr_backend() {
229
+ let registry = get_ocr_backend_registry();
230
+
231
+ {
232
+ let mut reg = registry.write().unwrap();
233
+ reg.shutdown_all().unwrap();
234
+ }
235
+
236
+ let backend = Arc::new(MockOcrBackend {
237
+ name: "test-ocr".to_string(),
238
+ return_text: "Mocked OCR Result".to_string(),
239
+ call_count: AtomicUsize::new(0),
240
+ last_language: Mutex::new(String::new()),
241
+ initialized: AtomicBool::new(false),
242
+ });
243
+
244
+ {
245
+ let mut reg = registry.write().unwrap();
246
+ let result = reg.register(Arc::clone(&backend) as Arc<dyn OcrBackend>);
247
+ assert!(result.is_ok(), "Failed to register OCR backend: {:?}", result.err());
248
+ }
249
+
250
+ assert!(
251
+ backend.initialized.load(Ordering::Acquire),
252
+ "OCR backend was not initialized"
253
+ );
254
+
255
+ let list = {
256
+ let reg = registry.read().unwrap();
257
+ reg.list()
258
+ };
259
+
260
+ assert!(list.contains(&"test-ocr".to_string()));
261
+
262
+ {
263
+ let mut reg = registry.write().unwrap();
264
+ reg.shutdown_all().unwrap();
265
+ }
266
+ }
267
+
268
+ #[serial]
269
+ #[test]
270
+ fn test_ocr_backend_used_for_image_extraction() {
271
+ let test_image = "../../test_documents/images/test_hello_world.png";
272
+ let registry = get_ocr_backend_registry();
273
+
274
+ {
275
+ let mut reg = registry.write().unwrap();
276
+ reg.shutdown_all().unwrap();
277
+ }
278
+
279
+ let backend = Arc::new(MockOcrBackend {
280
+ name: "extraction-test-ocr".to_string(),
281
+ return_text: "CUSTOM OCR TEXT".to_string(),
282
+ call_count: AtomicUsize::new(0),
283
+ last_language: Mutex::new(String::new()),
284
+ initialized: AtomicBool::new(false),
285
+ });
286
+
287
+ {
288
+ let mut reg = registry.write().unwrap();
289
+ reg.register(Arc::clone(&backend) as Arc<dyn OcrBackend>).unwrap();
290
+ }
291
+
292
+ let ocr_config = OcrConfig {
293
+ backend: "extraction-test-ocr".to_string(),
294
+ language: "eng".to_string(),
295
+ tesseract_config: None,
296
+ };
297
+
298
+ let config = ExtractionConfig {
299
+ ocr: Some(ocr_config),
300
+ force_ocr: true,
301
+ ..Default::default()
302
+ };
303
+
304
+ let result = extract_file_sync(test_image, None, &config);
305
+
306
+ assert!(result.is_ok(), "Extraction failed: {:?}", result.err());
307
+
308
+ let extraction_result = result.unwrap();
309
+ assert!(
310
+ extraction_result.content.contains("CUSTOM OCR TEXT"),
311
+ "Custom OCR backend was not used. Content: {}",
312
+ extraction_result.content
313
+ );
314
+
315
+ assert_eq!(
316
+ backend.call_count.load(Ordering::SeqCst),
317
+ 1,
318
+ "OCR backend was not called exactly once"
319
+ );
320
+
321
+ {
322
+ let mut reg = registry.write().unwrap();
323
+ reg.shutdown_all().unwrap();
324
+ }
325
+ }
326
+
327
+ #[serial]
328
+ #[test]
329
+ fn test_ocr_backend_receives_correct_parameters() {
330
+ let test_image = "../../test_documents/images/test_hello_world.png";
331
+ let registry = get_ocr_backend_registry();
332
+
333
+ {
334
+ let mut reg = registry.write().unwrap();
335
+ reg.shutdown_all().unwrap();
336
+ }
337
+
338
+ let backend = Arc::new(MockOcrBackend {
339
+ name: "param-test-ocr".to_string(),
340
+ return_text: "Test".to_string(),
341
+ call_count: AtomicUsize::new(0),
342
+ last_language: Mutex::new(String::new()),
343
+ initialized: AtomicBool::new(false),
344
+ });
345
+
346
+ {
347
+ let mut reg = registry.write().unwrap();
348
+ reg.register(Arc::clone(&backend) as Arc<dyn OcrBackend>).unwrap();
349
+ }
350
+
351
+ let ocr_config = OcrConfig {
352
+ backend: "param-test-ocr".to_string(),
353
+ language: "deu".to_string(),
354
+ tesseract_config: None,
355
+ };
356
+
357
+ let config = ExtractionConfig {
358
+ ocr: Some(ocr_config),
359
+ force_ocr: true,
360
+ ..Default::default()
361
+ };
362
+
363
+ let result = extract_file_sync(test_image, None, &config);
364
+
365
+ assert!(result.is_ok());
366
+
367
+ let last_lang = backend.last_language.lock().unwrap();
368
+ assert_eq!(*last_lang, "deu", "Language parameter not passed correctly");
369
+
370
+ let extraction_result = result.unwrap();
371
+ assert!(extraction_result.content.contains("(lang: deu)"));
372
+
373
+ {
374
+ let mut reg = registry.write().unwrap();
375
+ reg.shutdown_all().unwrap();
376
+ }
377
+ }
378
+
379
+ #[serial]
380
+ #[test]
381
+ fn test_ocr_backend_returns_correct_format() {
382
+ let test_image = "../../test_documents/images/test_hello_world.png";
383
+ let registry = get_ocr_backend_registry();
384
+
385
+ {
386
+ let mut reg = registry.write().unwrap();
387
+ reg.shutdown_all().unwrap();
388
+ }
389
+
390
+ let backend = Arc::new(MetadataOcrBackend {
391
+ name: "format-test-ocr".to_string(),
392
+ });
393
+
394
+ {
395
+ let mut reg = registry.write().unwrap();
396
+ reg.register(backend as Arc<dyn OcrBackend>).unwrap();
397
+ }
398
+
399
+ let ocr_config = OcrConfig {
400
+ backend: "format-test-ocr".to_string(),
401
+ language: "eng".to_string(),
402
+ tesseract_config: None,
403
+ };
404
+
405
+ let config = ExtractionConfig {
406
+ ocr: Some(ocr_config),
407
+ force_ocr: true,
408
+ ..Default::default()
409
+ };
410
+
411
+ let result = extract_file_sync(test_image, None, &config);
412
+
413
+ assert!(result.is_ok());
414
+
415
+ let extraction_result = result.unwrap();
416
+
417
+ assert!(!extraction_result.content.is_empty());
418
+ assert_eq!(extraction_result.mime_type, "image/png");
419
+ assert!(extraction_result.metadata.additional.contains_key("ocr_backend"));
420
+ assert!(extraction_result.metadata.additional.contains_key("image_size"));
421
+ assert!(extraction_result.metadata.additional.contains_key("ocr_language"));
422
+
423
+ {
424
+ let mut reg = registry.write().unwrap();
425
+ reg.shutdown_all().unwrap();
426
+ }
427
+ }
428
+
429
+ #[serial]
430
+ #[test]
431
+ fn test_ocr_backend_error_handling() {
432
+ let test_image = "../../test_documents/images/test_hello_world.png";
433
+ let registry = get_ocr_backend_registry();
434
+
435
+ {
436
+ let mut reg = registry.write().unwrap();
437
+ reg.shutdown_all().unwrap();
438
+ }
439
+
440
+ let backend = Arc::new(FailingOcrBackend {
441
+ name: "failing-ocr".to_string(),
442
+ });
443
+
444
+ {
445
+ let mut reg = registry.write().unwrap();
446
+ reg.register(backend as Arc<dyn OcrBackend>).unwrap();
447
+ }
448
+
449
+ let ocr_config = OcrConfig {
450
+ backend: "failing-ocr".to_string(),
451
+ language: "eng".to_string(),
452
+ tesseract_config: None,
453
+ };
454
+
455
+ let config = ExtractionConfig {
456
+ ocr: Some(ocr_config),
457
+ force_ocr: true,
458
+ ..Default::default()
459
+ };
460
+
461
+ let result = extract_file_sync(test_image, None, &config);
462
+
463
+ assert!(result.is_err(), "Expected OCR to fail");
464
+
465
+ match result.err().unwrap() {
466
+ KreuzbergError::Ocr { message, .. } => {
467
+ assert!(message.contains("intentionally failed"));
468
+ }
469
+ other => panic!("Expected Ocr error, got: {:?}", other),
470
+ }
471
+
472
+ {
473
+ let mut reg = registry.write().unwrap();
474
+ reg.shutdown_all().unwrap();
475
+ }
476
+ }
477
+
478
+ #[serial]
479
+ #[test]
480
+ fn test_ocr_backend_validation_error() {
481
+ let test_image = "../../test_documents/images/test_hello_world.png";
482
+ let registry = get_ocr_backend_registry();
483
+
484
+ {
485
+ let mut reg = registry.write().unwrap();
486
+ reg.shutdown_all().unwrap();
487
+ }
488
+
489
+ let backend = Arc::new(ValidatingOcrBackend {
490
+ name: "validating-ocr".to_string(),
491
+ min_size: 1_000_000,
492
+ });
493
+
494
+ {
495
+ let mut reg = registry.write().unwrap();
496
+ reg.register(backend as Arc<dyn OcrBackend>).unwrap();
497
+ }
498
+
499
+ let ocr_config = OcrConfig {
500
+ backend: "validating-ocr".to_string(),
501
+ language: "eng".to_string(),
502
+ tesseract_config: None,
503
+ };
504
+
505
+ let config = ExtractionConfig {
506
+ ocr: Some(ocr_config),
507
+ force_ocr: true,
508
+ ..Default::default()
509
+ };
510
+
511
+ let result = extract_file_sync(test_image, None, &config);
512
+
513
+ assert!(result.is_err(), "Expected validation to fail");
514
+
515
+ match result.err().unwrap() {
516
+ KreuzbergError::Validation { message, .. } => {
517
+ assert!(message.contains("Image too small"));
518
+ }
519
+ other => panic!("Expected Validation error, got: {:?}", other),
520
+ }
521
+
522
+ {
523
+ let mut reg = registry.write().unwrap();
524
+ reg.shutdown_all().unwrap();
525
+ }
526
+ }
527
+
528
+ #[serial]
529
+ #[test]
530
+ fn test_switching_between_ocr_backends() {
531
+ let test_image = "../../test_documents/images/test_hello_world.png";
532
+ let registry = get_ocr_backend_registry();
533
+
534
+ {
535
+ let mut reg = registry.write().unwrap();
536
+ reg.shutdown_all().unwrap();
537
+ }
538
+
539
+ let backend1 = Arc::new(MockOcrBackend {
540
+ name: "backend-1".to_string(),
541
+ return_text: "BACKEND ONE OUTPUT".to_string(),
542
+ call_count: AtomicUsize::new(0),
543
+ last_language: Mutex::new(String::new()),
544
+ initialized: AtomicBool::new(false),
545
+ });
546
+
547
+ let backend2 = Arc::new(MockOcrBackend {
548
+ name: "backend-2".to_string(),
549
+ return_text: "BACKEND TWO OUTPUT".to_string(),
550
+ call_count: AtomicUsize::new(0),
551
+ last_language: Mutex::new(String::new()),
552
+ initialized: AtomicBool::new(false),
553
+ });
554
+
555
+ {
556
+ let mut reg = registry.write().unwrap();
557
+ reg.register(Arc::clone(&backend1) as Arc<dyn OcrBackend>).unwrap();
558
+ reg.register(Arc::clone(&backend2) as Arc<dyn OcrBackend>).unwrap();
559
+ }
560
+
561
+ let ocr_config1 = OcrConfig {
562
+ backend: "backend-1".to_string(),
563
+ language: "eng".to_string(),
564
+ tesseract_config: None,
565
+ };
566
+
567
+ let config1 = ExtractionConfig {
568
+ ocr: Some(ocr_config1),
569
+ force_ocr: false,
570
+ ..Default::default()
571
+ };
572
+
573
+ let result1 = extract_file_sync(test_image, None, &config1);
574
+ assert!(result1.is_ok());
575
+ assert!(result1.unwrap().content.contains("BACKEND ONE OUTPUT"));
576
+ assert_eq!(backend1.call_count.load(Ordering::SeqCst), 1);
577
+ assert_eq!(backend2.call_count.load(Ordering::SeqCst), 0);
578
+
579
+ let ocr_config2 = OcrConfig {
580
+ backend: "backend-2".to_string(),
581
+ language: "eng".to_string(),
582
+ tesseract_config: None,
583
+ };
584
+
585
+ let config2 = ExtractionConfig {
586
+ ocr: Some(ocr_config2),
587
+ force_ocr: false,
588
+ ..Default::default()
589
+ };
590
+
591
+ let result2 = extract_file_sync(test_image, None, &config2);
592
+ assert!(result2.is_ok());
593
+ assert!(result2.unwrap().content.contains("BACKEND TWO OUTPUT"));
594
+ assert_eq!(backend1.call_count.load(Ordering::SeqCst), 1);
595
+ assert_eq!(backend2.call_count.load(Ordering::SeqCst), 1);
596
+
597
+ {
598
+ let mut reg = registry.write().unwrap();
599
+ reg.shutdown_all().unwrap();
600
+ }
601
+ }
602
+
603
+ #[serial]
604
+ #[test]
605
+ fn test_ocr_backend_language_support() {
606
+ let registry = get_ocr_backend_registry();
607
+
608
+ {
609
+ let mut reg = registry.write().unwrap();
610
+ reg.shutdown_all().unwrap();
611
+ }
612
+
613
+ let backend = Arc::new(MockOcrBackend {
614
+ name: "lang-test-ocr".to_string(),
615
+ return_text: "Test".to_string(),
616
+ call_count: AtomicUsize::new(0),
617
+ last_language: Mutex::new(String::new()),
618
+ initialized: AtomicBool::new(false),
619
+ });
620
+
621
+ {
622
+ let mut reg = registry.write().unwrap();
623
+ reg.register(Arc::clone(&backend) as Arc<dyn OcrBackend>).unwrap();
624
+ }
625
+
626
+ assert!(backend.supports_language("eng"));
627
+ assert!(backend.supports_language("deu"));
628
+ assert!(backend.supports_language("fra"));
629
+ assert!(!backend.supports_language("jpn"));
630
+
631
+ let supported = backend.supported_languages();
632
+ assert_eq!(supported.len(), 3);
633
+ assert!(supported.contains(&"eng".to_string()));
634
+ assert!(supported.contains(&"deu".to_string()));
635
+ assert!(supported.contains(&"fra".to_string()));
636
+
637
+ {
638
+ let mut reg = registry.write().unwrap();
639
+ reg.shutdown_all().unwrap();
640
+ }
641
+ }
642
+
643
+ #[serial]
644
+ #[test]
645
+ fn test_ocr_backend_type() {
646
+ let backend = MockOcrBackend {
647
+ name: "type-test".to_string(),
648
+ return_text: "Test".to_string(),
649
+ call_count: AtomicUsize::new(0),
650
+ last_language: Mutex::new(String::new()),
651
+ initialized: AtomicBool::new(false),
652
+ };
653
+
654
+ assert_eq!(backend.backend_type(), OcrBackendType::Custom);
655
+ }
656
+
657
+ #[serial]
658
+ #[test]
659
+ fn test_ocr_backend_invalid_name() {
660
+ let registry = get_ocr_backend_registry();
661
+
662
+ {
663
+ let mut reg = registry.write().unwrap();
664
+ reg.shutdown_all().unwrap();
665
+ }
666
+
667
+ let backend = Arc::new(MockOcrBackend {
668
+ name: "invalid name".to_string(),
669
+ return_text: "Test".to_string(),
670
+ call_count: AtomicUsize::new(0),
671
+ last_language: Mutex::new(String::new()),
672
+ initialized: AtomicBool::new(false),
673
+ });
674
+
675
+ {
676
+ let mut reg = registry.write().unwrap();
677
+ let result = reg.register(backend);
678
+
679
+ assert!(result.is_err());
680
+ assert!(matches!(result.err().unwrap(), KreuzbergError::Validation { .. }));
681
+ }
682
+
683
+ {
684
+ let mut reg = registry.write().unwrap();
685
+ reg.shutdown_all().unwrap();
686
+ }
687
+ }
688
+
689
+ #[serial]
690
+ #[test]
691
+ fn test_ocr_backend_initialization_lifecycle() {
692
+ let registry = get_ocr_backend_registry();
693
+
694
+ {
695
+ let mut reg = registry.write().unwrap();
696
+ reg.shutdown_all().unwrap();
697
+ }
698
+
699
+ let backend = Arc::new(MockOcrBackend {
700
+ name: "lifecycle-ocr".to_string(),
701
+ return_text: "Test".to_string(),
702
+ call_count: AtomicUsize::new(0),
703
+ last_language: Mutex::new(String::new()),
704
+ initialized: AtomicBool::new(false),
705
+ });
706
+
707
+ assert!(
708
+ !backend.initialized.load(Ordering::Acquire),
709
+ "Backend should not be initialized yet"
710
+ );
711
+
712
+ {
713
+ let mut reg = registry.write().unwrap();
714
+ reg.register(Arc::clone(&backend) as Arc<dyn OcrBackend>).unwrap();
715
+ }
716
+
717
+ assert!(
718
+ backend.initialized.load(Ordering::Acquire),
719
+ "Backend should be initialized after registration"
720
+ );
721
+
722
+ {
723
+ let mut reg = registry.write().unwrap();
724
+ reg.shutdown_all().unwrap();
725
+ }
726
+
727
+ assert!(
728
+ !backend.initialized.load(Ordering::Acquire),
729
+ "Backend should be shutdown"
730
+ );
731
+ }
732
+
733
+ #[serial]
734
+ #[test]
735
+ fn test_unregister_ocr_backend() {
736
+ let registry = get_ocr_backend_registry();
737
+
738
+ {
739
+ let mut reg = registry.write().unwrap();
740
+ reg.shutdown_all().unwrap();
741
+ }
742
+
743
+ let backend = Arc::new(MockOcrBackend {
744
+ name: "unregister-ocr".to_string(),
745
+ return_text: "Test".to_string(),
746
+ call_count: AtomicUsize::new(0),
747
+ last_language: Mutex::new(String::new()),
748
+ initialized: AtomicBool::new(false),
749
+ });
750
+
751
+ {
752
+ let mut reg = registry.write().unwrap();
753
+ reg.register(Arc::clone(&backend) as Arc<dyn OcrBackend>).unwrap();
754
+ }
755
+
756
+ {
757
+ let mut reg = registry.write().unwrap();
758
+ reg.remove("unregister-ocr").unwrap();
759
+ }
760
+
761
+ let list = {
762
+ let reg = registry.read().unwrap();
763
+ reg.list()
764
+ };
765
+
766
+ assert!(!list.contains(&"unregister-ocr".to_string()));
767
+ assert!(
768
+ !backend.initialized.load(Ordering::Acquire),
769
+ "Backend should be shutdown after unregistration"
770
+ );
771
+ }