kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,921 @@
1
+ //! Comprehensive plugin system integration tests.
2
+ //!
3
+ //! Tests plugin registration, discovery, error handling, concurrent access,
4
+ //! and cross-registry interactions for all 4 plugin types.
5
+
6
+ use async_trait::async_trait;
7
+ use kreuzberg::core::config::ExtractionConfig;
8
+ use kreuzberg::plugins::registry::{
9
+ DocumentExtractorRegistry, OcrBackendRegistry, PostProcessorRegistry, ValidatorRegistry,
10
+ };
11
+ use kreuzberg::plugins::{DocumentExtractor, Plugin, PostProcessor, ProcessingStage, Validator};
12
+ use kreuzberg::types::{ExtractionResult, Metadata};
13
+ use kreuzberg::{KreuzbergError, Result};
14
+ use std::sync::Arc;
15
+
16
+ struct FailingExtractor {
17
+ name: String,
18
+ should_fail_init: bool,
19
+ should_fail_extract: bool,
20
+ }
21
+
22
+ impl Plugin for FailingExtractor {
23
+ fn name(&self) -> &str {
24
+ &self.name
25
+ }
26
+ fn version(&self) -> String {
27
+ "1.0.0".to_string()
28
+ }
29
+ fn initialize(&self) -> Result<()> {
30
+ if self.should_fail_init {
31
+ Err(KreuzbergError::Plugin {
32
+ message: "Initialization failed".to_string(),
33
+ plugin_name: self.name.clone(),
34
+ })
35
+ } else {
36
+ Ok(())
37
+ }
38
+ }
39
+ fn shutdown(&self) -> Result<()> {
40
+ Ok(())
41
+ }
42
+ }
43
+
44
+ #[async_trait]
45
+ impl DocumentExtractor for FailingExtractor {
46
+ async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> {
47
+ if self.should_fail_extract {
48
+ Err(KreuzbergError::Parsing {
49
+ message: "Extraction failed".to_string(),
50
+ source: None,
51
+ })
52
+ } else {
53
+ Ok(ExtractionResult {
54
+ content: "success".to_string(),
55
+ mime_type: "text/plain".to_string(),
56
+ metadata: Metadata::default(),
57
+ tables: vec![],
58
+ detected_languages: None,
59
+ chunks: None,
60
+ images: None,
61
+ })
62
+ }
63
+ }
64
+
65
+ fn supported_mime_types(&self) -> &[&str] {
66
+ &["text/plain"]
67
+ }
68
+
69
+ fn priority(&self) -> i32 {
70
+ 50
71
+ }
72
+ }
73
+
74
+ struct MetadataModifyingProcessor {
75
+ name: String,
76
+ stage: ProcessingStage,
77
+ }
78
+
79
+ impl Plugin for MetadataModifyingProcessor {
80
+ fn name(&self) -> &str {
81
+ &self.name
82
+ }
83
+ fn version(&self) -> String {
84
+ "1.0.0".to_string()
85
+ }
86
+ fn initialize(&self) -> Result<()> {
87
+ Ok(())
88
+ }
89
+ fn shutdown(&self) -> Result<()> {
90
+ Ok(())
91
+ }
92
+ }
93
+
94
+ #[async_trait]
95
+ impl PostProcessor for MetadataModifyingProcessor {
96
+ async fn process(&self, result: &mut ExtractionResult, _: &ExtractionConfig) -> Result<()> {
97
+ result.content.push_str(&format!(" [{}]", self.name));
98
+ Ok(())
99
+ }
100
+
101
+ fn processing_stage(&self) -> ProcessingStage {
102
+ self.stage
103
+ }
104
+ }
105
+
106
+ struct FailingProcessor {
107
+ name: String,
108
+ }
109
+
110
+ impl Plugin for FailingProcessor {
111
+ fn name(&self) -> &str {
112
+ &self.name
113
+ }
114
+ fn version(&self) -> String {
115
+ "1.0.0".to_string()
116
+ }
117
+ fn initialize(&self) -> Result<()> {
118
+ Ok(())
119
+ }
120
+ fn shutdown(&self) -> Result<()> {
121
+ Ok(())
122
+ }
123
+ }
124
+
125
+ #[async_trait]
126
+ impl PostProcessor for FailingProcessor {
127
+ async fn process(&self, _: &mut ExtractionResult, _: &ExtractionConfig) -> Result<()> {
128
+ Err(KreuzbergError::Plugin {
129
+ message: "Processing failed".to_string(),
130
+ plugin_name: self.name.clone(),
131
+ })
132
+ }
133
+
134
+ fn processing_stage(&self) -> ProcessingStage {
135
+ ProcessingStage::Early
136
+ }
137
+ }
138
+
139
+ struct StrictValidator {
140
+ name: String,
141
+ min_length: usize,
142
+ }
143
+
144
+ impl Plugin for StrictValidator {
145
+ fn name(&self) -> &str {
146
+ &self.name
147
+ }
148
+ fn version(&self) -> String {
149
+ "1.0.0".to_string()
150
+ }
151
+ fn initialize(&self) -> Result<()> {
152
+ Ok(())
153
+ }
154
+ fn shutdown(&self) -> Result<()> {
155
+ Ok(())
156
+ }
157
+ }
158
+
159
+ #[async_trait]
160
+ impl Validator for StrictValidator {
161
+ async fn validate(&self, result: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
162
+ if result.content.len() < self.min_length {
163
+ Err(KreuzbergError::validation(format!(
164
+ "Content too short: {} < {}",
165
+ result.content.len(),
166
+ self.min_length
167
+ )))
168
+ } else {
169
+ Ok(())
170
+ }
171
+ }
172
+
173
+ fn priority(&self) -> i32 {
174
+ 50
175
+ }
176
+ }
177
+
178
/// Registering an extractor whose `initialize` fails must surface a `Plugin` error.
#[test]
fn test_extractor_registration_failure() {
    let mut registry = DocumentExtractorRegistry::new();

    let outcome = registry.register(Arc::new(FailingExtractor {
        name: String::from("failing-extractor"),
        should_fail_init: true,
        should_fail_extract: false,
    }));

    assert!(matches!(outcome, Err(KreuzbergError::Plugin { .. })));
}
191
+
192
+ #[tokio::test]
193
+ async fn test_extractor_extraction_failure() {
194
+ let mut registry = DocumentExtractorRegistry::new();
195
+
196
+ let failing_extractor = Arc::new(FailingExtractor {
197
+ name: "failing-extractor".to_string(),
198
+ should_fail_init: false,
199
+ should_fail_extract: true,
200
+ });
201
+
202
+ registry.register(failing_extractor).unwrap();
203
+
204
+ let extractor = registry.get("text/plain").unwrap();
205
+ let config = ExtractionConfig::default();
206
+ let result = extractor.extract_bytes(b"test", "text/plain", &config).await;
207
+
208
+ assert!(matches!(result, Err(KreuzbergError::Parsing { .. })));
209
+ }
210
+
211
/// Registering two extractors under the same name must succeed both times and
/// leave exactly one entry with that name in the registry listing.
#[test]
fn test_extractor_duplicate_registration() {
    // Builds a non-failing extractor that always uses the shared name.
    let same_name_extractor = || {
        Arc::new(FailingExtractor {
            name: String::from("same-name"),
            should_fail_init: false,
            should_fail_extract: false,
        })
    };

    let mut registry = DocumentExtractorRegistry::new();
    registry.register(same_name_extractor()).unwrap();
    registry.register(same_name_extractor()).unwrap();

    let registered = registry.list();
    assert_eq!(registered.len(), 1);
    assert!(registered.contains(&"same-name".to_string()));
}
234
+
235
/// Ten threads each register a uniquely named extractor through a shared
/// `RwLock`-guarded registry; all ten must be listed afterwards.
#[test]
fn test_extractor_concurrent_registration() {
    use std::sync::RwLock;
    use std::thread;

    let shared_registry = Arc::new(RwLock::new(DocumentExtractorRegistry::new()));

    // Collect eagerly so every thread is spawned before any of them is joined.
    let workers: Vec<_> = (0..10)
        .map(|i| {
            let shared_registry = Arc::clone(&shared_registry);
            thread::spawn(move || {
                let extractor = Arc::new(FailingExtractor {
                    name: format!("extractor-{}", i),
                    should_fail_init: false,
                    should_fail_extract: false,
                });

                let mut guard = shared_registry
                    .write()
                    .expect("Failed to acquire write lock on registry in test");
                guard.register(extractor).unwrap();
            })
        })
        .collect();

    for worker in workers {
        worker.join().unwrap();
    }

    let guard = shared_registry
        .read()
        .expect("Failed to acquire read lock on registry in test");
    assert_eq!(guard.list().len(), 10);
}
269
+
270
/// With several extractors registered for the same MIME type in unsorted
/// priority order, lookup must return the one with the highest priority.
#[test]
fn test_extractor_priority_ordering_complex() {
    /// Extractor whose only distinguishing feature is its configurable priority.
    struct PriorityExtractor {
        name: String,
        priority: i32,
    }

    impl Plugin for PriorityExtractor {
        fn name(&self) -> &str {
            self.name.as_str()
        }

        fn version(&self) -> String {
            String::from("1.0.0")
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl DocumentExtractor for PriorityExtractor {
        async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> {
            let canned = ExtractionResult {
                content: String::from("test"),
                mime_type: String::from("text/plain"),
                metadata: Metadata::default(),
                tables: Vec::new(),
                detected_languages: None,
                chunks: None,
                images: None,
            };
            Ok(canned)
        }

        fn supported_mime_types(&self) -> &[&str] {
            &["text/plain"]
        }

        fn priority(&self) -> i32 {
            self.priority
        }
    }

    // Deliberately unsorted so the result cannot depend on insertion order.
    const PRIORITIES: [i32; 5] = [10, 50, 100, 25, 75];

    let mut registry = DocumentExtractorRegistry::new();
    for level in PRIORITIES {
        registry
            .register(Arc::new(PriorityExtractor {
                name: format!("priority-{}", level),
                priority: level,
            }))
            .unwrap();
    }

    let winner = registry.get("text/plain").unwrap();
    assert_eq!(winner.name(), "priority-100");
    assert_eq!(winner.priority(), 100);
}
327
+
328
/// A low-priority extractor registered for the exact MIME type `text/plain`
/// must be selected over a high-priority extractor registered for the
/// wildcard `text/*`.
#[test]
fn test_extractor_wildcard_vs_exact_priority() {
    let mut registry = DocumentExtractorRegistry::new();

    /// Wraps a `FailingExtractor`, but advertises the `text/*` wildcard with
    /// priority 100 instead of the inner extractor's exact type and priority.
    struct WildcardExtractor(FailingExtractor);

    impl Plugin for WildcardExtractor {
        fn name(&self) -> &str {
            self.0.name()
        }
        fn version(&self) -> String {
            self.0.version()
        }
        fn initialize(&self) -> Result<()> {
            Ok(())
        }
        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl DocumentExtractor for WildcardExtractor {
        async fn extract_bytes(&self, c: &[u8], m: &str, cfg: &ExtractionConfig) -> Result<ExtractionResult> {
            // Extraction itself is delegated unchanged to the wrapped extractor.
            self.0.extract_bytes(c, m, cfg).await
        }
        fn supported_mime_types(&self) -> &[&str] {
            &["text/*"]
        }
        fn priority(&self) -> i32 {
            100
        }
    }

    let wildcard_arc = Arc::new(WildcardExtractor(FailingExtractor {
        name: "wildcard-high".to_string(),
        should_fail_init: false,
        should_fail_extract: false,
    }));

    // Uses FailingExtractor's own exact `text/plain` registration and priority 50.
    let exact = Arc::new(FailingExtractor {
        name: "exact-low".to_string(),
        should_fail_init: false,
        should_fail_extract: false,
    });

    registry.register(wildcard_arc).unwrap();
    registry.register(exact).unwrap();

    let selected = registry.get("text/plain").unwrap();
    assert_eq!(selected.name(), "exact-low");
}
385
+
386
/// Looking up an empty MIME type in an empty registry must report
/// `UnsupportedFormat`.
#[test]
fn test_extractor_empty_mime_type() {
    let empty_registry = DocumentExtractorRegistry::new();

    assert!(matches!(
        empty_registry.get(""),
        Err(KreuzbergError::UnsupportedFormat(_))
    ));
}
392
+
393
/// Looking up a long MIME type containing dots and dashes in an empty
/// registry must report `UnsupportedFormat`.
#[test]
fn test_extractor_special_characters_mime() {
    const DOCX_MIME: &str = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let empty_registry = DocumentExtractorRegistry::new();

    assert!(matches!(
        empty_registry.get(DOCX_MIME),
        Err(KreuzbergError::UnsupportedFormat(_))
    ));
}
399
+
400
/// Removing a name that was never registered must return `Ok`.
#[test]
fn test_extractor_remove_nonexistent() {
    let mut empty_registry = DocumentExtractorRegistry::new();

    assert!(empty_registry.remove("nonexistent").is_ok());
}
406
+
407
/// After registering five extractors and removing two of them, the listing
/// must contain exactly the three that remain.
#[test]
fn test_extractor_list_after_partial_removal() {
    let mut registry = DocumentExtractorRegistry::new();

    for index in 0..5 {
        registry
            .register(Arc::new(FailingExtractor {
                name: format!("extractor-{}", index),
                should_fail_init: false,
                should_fail_extract: false,
            }))
            .unwrap();
    }

    for removed in ["extractor-2", "extractor-3"] {
        registry.remove(removed).unwrap();
    }

    let remaining = registry.list();
    assert_eq!(remaining.len(), 3);
    for survivor in ["extractor-0", "extractor-1", "extractor-4"] {
        assert!(remaining.contains(&survivor.to_string()));
    }
}
429
+
430
+ #[tokio::test]
431
+ async fn test_processor_execution_order_within_stage() {
432
+ let mut registry = PostProcessorRegistry::new();
433
+
434
+ let high = Arc::new(MetadataModifyingProcessor {
435
+ name: "high".to_string(),
436
+ stage: ProcessingStage::Early,
437
+ });
438
+
439
+ let medium = Arc::new(MetadataModifyingProcessor {
440
+ name: "medium".to_string(),
441
+ stage: ProcessingStage::Early,
442
+ });
443
+
444
+ let low = Arc::new(MetadataModifyingProcessor {
445
+ name: "low".to_string(),
446
+ stage: ProcessingStage::Early,
447
+ });
448
+
449
+ registry.register(low, 10).unwrap();
450
+ registry.register(high, 100).unwrap();
451
+ registry.register(medium, 50).unwrap();
452
+
453
+ let processors = registry.get_for_stage(ProcessingStage::Early);
454
+ assert_eq!(processors.len(), 3);
455
+
456
+ let mut result = ExtractionResult {
457
+ content: "start".to_string(),
458
+ mime_type: "text/plain".to_string(),
459
+ metadata: Metadata::default(),
460
+ tables: vec![],
461
+ detected_languages: None,
462
+ chunks: None,
463
+ images: None,
464
+ };
465
+
466
+ let config = ExtractionConfig::default();
467
+ for processor in processors {
468
+ processor.process(&mut result, &config).await.unwrap();
469
+ }
470
+
471
+ assert_eq!(result.content, "start [high] [medium] [low]");
472
+ }
473
+
474
+ #[tokio::test]
475
+ async fn test_processor_error_propagation() {
476
+ let mut registry = PostProcessorRegistry::new();
477
+
478
+ let failing = Arc::new(FailingProcessor {
479
+ name: "failing".to_string(),
480
+ });
481
+
482
+ registry.register(failing, 50).unwrap();
483
+
484
+ let processors = registry.get_for_stage(ProcessingStage::Early);
485
+ assert_eq!(processors.len(), 1);
486
+
487
+ let mut result = ExtractionResult {
488
+ content: "test".to_string(),
489
+ mime_type: "text/plain".to_string(),
490
+ metadata: Metadata::default(),
491
+ tables: vec![],
492
+ detected_languages: None,
493
+ chunks: None,
494
+ images: None,
495
+ };
496
+
497
+ let config = ExtractionConfig::default();
498
+ let process_result = processors[0].process(&mut result, &config).await;
499
+
500
+ assert!(matches!(process_result, Err(KreuzbergError::Plugin { .. })));
501
+ }
502
+
503
/// Registers one processor per stage (all with priority 50) and checks that
/// each stage lookup returns exactly one processor.
#[test]
fn test_processor_multiple_stages() {
    let mut registry = PostProcessorRegistry::new();

    let per_stage = [
        ("early", ProcessingStage::Early),
        ("middle", ProcessingStage::Middle),
        ("late", ProcessingStage::Late),
    ];
    for (label, stage) in per_stage {
        let processor = Arc::new(MetadataModifyingProcessor {
            name: label.to_string(),
            stage,
        });
        registry.register(processor, 50).unwrap();
    }

    for stage in [ProcessingStage::Early, ProcessingStage::Middle, ProcessingStage::Late] {
        assert_eq!(registry.get_for_stage(stage).len(), 1);
    }
}
530
+
531
/// Registering a post-processor whose `initialize()` returns an error must
/// make `PostProcessorRegistry::register` return `KreuzbergError::Plugin`.
///
/// NOTE(review): presumably `register` calls `initialize()` and forwards its
/// error; the registry implementation is outside this chunk.
#[test]
fn test_processor_registration_failure() {
    /// Post-processor stub: `initialize` always fails, everything else succeeds.
    struct FailingInitProcessor;

    impl Plugin for FailingInitProcessor {
        fn name(&self) -> &str {
            "failing-init"
        }

        fn version(&self) -> String {
            String::from("1.0.0")
        }

        fn initialize(&self) -> Result<()> {
            let init_error = KreuzbergError::Plugin {
                message: "Init failed".to_string(),
                plugin_name: "failing-init".to_string(),
            };
            Err(init_error)
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl PostProcessor for FailingInitProcessor {
        async fn process(&self, _: &mut ExtractionResult, _: &ExtractionConfig) -> Result<()> {
            Ok(())
        }

        fn processing_stage(&self) -> ProcessingStage {
            ProcessingStage::Early
        }
    }

    let mut registry = PostProcessorRegistry::new();

    let outcome = registry.register(Arc::new(FailingInitProcessor), 50);

    assert!(matches!(outcome, Err(KreuzbergError::Plugin { .. })));
}
569
+
570
/// Two processors with the same stage (`Early`) and the same priority (50)
/// must both be kept: the stage lookup returns two entries.
#[test]
fn test_processor_same_priority_same_stage() {
    let mut registry = PostProcessorRegistry::new();

    for label in ["processor1", "processor2"] {
        let tied = Arc::new(MetadataModifyingProcessor {
            name: label.to_string(),
            stage: ProcessingStage::Early,
        });
        registry.register(tied, 50).unwrap();
    }

    let early_stage = registry.get_for_stage(ProcessingStage::Early);
    assert_eq!(early_stage.len(), 2);
}
590
+
591
/// After removing the only `Early`-stage processor by name, the `Early`
/// stage lookup must return no processors.
#[test]
fn test_processor_remove_from_specific_stage() {
    let mut registry = PostProcessorRegistry::new();
    let early_count =
        |registry: &PostProcessorRegistry| registry.get_for_stage(ProcessingStage::Early).len();

    registry
        .register(
            Arc::new(MetadataModifyingProcessor {
                name: "processor".to_string(),
                stage: ProcessingStage::Early,
            }),
            50,
        )
        .unwrap();
    assert_eq!(early_count(&registry), 1);

    registry.remove("processor").unwrap();
    assert_eq!(early_count(&registry), 0);
}
606
+
607
/// Registers one processor in each of the three stages and checks that
/// `list()` reports all three names.
#[test]
fn test_processor_list_across_stages() {
    let mut registry = PostProcessorRegistry::new();

    let all_stages = [ProcessingStage::Early, ProcessingStage::Middle, ProcessingStage::Late];
    for stage in all_stages {
        // The name is derived from the stage's Debug form, so each is unique.
        let label = format!("{:?}-processor", stage);
        let processor = Arc::new(MetadataModifyingProcessor { name: label, stage });
        registry.register(processor, 50).unwrap();
    }

    let registered = registry.list();
    assert_eq!(registered.len(), 3);
}
622
+
623
/// After `shutdown_all()`, every stage lookup must return no processors,
/// even though one processor had been registered per stage.
#[test]
fn test_processor_shutdown_clears_all_stages() {
    let mut registry = PostProcessorRegistry::new();

    for stage in [ProcessingStage::Early, ProcessingStage::Middle, ProcessingStage::Late] {
        let label = format!("{:?}-processor", stage);
        let processor = Arc::new(MetadataModifyingProcessor { name: label, stage });
        registry.register(processor, 50).unwrap();
    }

    registry.shutdown_all().unwrap();

    for stage in [ProcessingStage::Early, ProcessingStage::Middle, ProcessingStage::Late] {
        assert_eq!(registry.get_for_stage(stage).len(), 0);
    }
}
641
+
642
+ #[tokio::test]
643
+ async fn test_validator_content_validation() {
644
+ let mut registry = ValidatorRegistry::new();
645
+
646
+ let strict = Arc::new(StrictValidator {
647
+ name: "strict".to_string(),
648
+ min_length: 10,
649
+ });
650
+
651
+ registry.register(strict).unwrap();
652
+
653
+ let validators = registry.get_all();
654
+ assert_eq!(validators.len(), 1);
655
+
656
+ let config = ExtractionConfig::default();
657
+
658
+ let short_result = ExtractionResult {
659
+ content: "short".to_string(),
660
+ mime_type: "text/plain".to_string(),
661
+ metadata: Metadata::default(),
662
+ tables: vec![],
663
+ detected_languages: None,
664
+ chunks: None,
665
+ images: None,
666
+ };
667
+
668
+ let validation = validators[0].validate(&short_result, &config).await;
669
+ assert!(matches!(validation, Err(KreuzbergError::Validation { .. })));
670
+
671
+ let long_result = ExtractionResult {
672
+ content: "this is long enough content".to_string(),
673
+ mime_type: "text/plain".to_string(),
674
+ metadata: Metadata::default(),
675
+ tables: vec![],
676
+ detected_languages: None,
677
+ chunks: None,
678
+ images: None,
679
+ };
680
+
681
+ let validation = validators[0].validate(&long_result, &config).await;
682
+ assert!(validation.is_ok());
683
+ }
684
+
685
/// Registers three validators with priorities 50, 10 and 100 (in that
/// registration order) and checks that `get_all()` returns them highest
/// priority first: `high-priority`, `medium-priority`, `low-priority`.
///
/// The three local validator types differ only in their reported name and
/// priority; their `validate` always succeeds.
#[test]
fn test_validator_priority_ordering() {
    let mut registry = ValidatorRegistry::new();

    /// Always-passing validator reporting priority 50.
    struct MediumPriorityValidator;
    impl Plugin for MediumPriorityValidator {
        fn name(&self) -> &str {
            "medium-priority"
        }
        fn version(&self) -> String {
            "1.0.0".to_string()
        }
        fn initialize(&self) -> Result<()> {
            Ok(())
        }
        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl Validator for MediumPriorityValidator {
        async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
            Ok(())
        }
        fn priority(&self) -> i32 {
            50
        }
    }

    /// Always-passing validator reporting priority 10.
    struct LowPriorityValidator;
    impl Plugin for LowPriorityValidator {
        fn name(&self) -> &str {
            "low-priority"
        }
        fn version(&self) -> String {
            "1.0.0".to_string()
        }
        fn initialize(&self) -> Result<()> {
            Ok(())
        }
        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl Validator for LowPriorityValidator {
        async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
            Ok(())
        }
        fn priority(&self) -> i32 {
            10
        }
    }

    /// Always-passing validator reporting priority 100.
    struct HighPriorityValidator;
    impl Plugin for HighPriorityValidator {
        fn name(&self) -> &str {
            "high-priority"
        }
        fn version(&self) -> String {
            "1.0.0".to_string()
        }
        fn initialize(&self) -> Result<()> {
            Ok(())
        }
        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl Validator for HighPriorityValidator {
        async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
            Ok(())
        }
        fn priority(&self) -> i32 {
            100
        }
    }

    let medium = Arc::new(MediumPriorityValidator);
    let low = Arc::new(LowPriorityValidator);
    let high_priority = Arc::new(HighPriorityValidator);

    // Registered out of priority order so the assertion below exercises the
    // registry's ordering rather than insertion order.
    registry.register(medium).unwrap();
    registry.register(low).unwrap();
    registry.register(high_priority).unwrap();

    let validators = registry.get_all();
    assert_eq!(validators.len(), 3);
    assert_eq!(validators[0].name(), "high-priority");
    assert_eq!(validators[1].name(), "medium-priority");
    assert_eq!(validators[2].name(), "low-priority");
}
786
+
787
/// Registering a validator whose `initialize()` returns an error must make
/// `ValidatorRegistry::register` return `KreuzbergError::Plugin`.
///
/// NOTE(review): presumably `register` calls `initialize()` and forwards its
/// error; the registry implementation is outside this chunk.
#[test]
fn test_validator_registration_failure() {
    /// Validator stub: `initialize` always fails, everything else succeeds.
    struct FailingInitValidator;

    impl Plugin for FailingInitValidator {
        fn name(&self) -> &str {
            "failing"
        }

        fn version(&self) -> String {
            String::from("1.0.0")
        }

        fn initialize(&self) -> Result<()> {
            let init_error = KreuzbergError::Plugin {
                message: "Init failed".to_string(),
                plugin_name: "failing".to_string(),
            };
            Err(init_error)
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl Validator for FailingInitValidator {
        async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
            Ok(())
        }

        fn priority(&self) -> i32 {
            50
        }
    }

    let mut registry = ValidatorRegistry::new();

    let outcome = registry.register(Arc::new(FailingInitValidator));

    assert!(matches!(outcome, Err(KreuzbergError::Plugin { .. })));
}
825
+
826
/// A `ValidatorRegistry` created with `new()` must report no validators.
#[test]
fn test_validator_empty_registry() {
    let fresh = ValidatorRegistry::new();

    assert_eq!(fresh.get_all().len(), 0);
}
832
+
833
/// The same validator instance can be registered, removed by name, and then
/// registered again; the registry's validator count follows 1 -> 0 -> 1.
#[test]
fn test_validator_remove_and_reregister() {
    let mut registry = ValidatorRegistry::new();

    // Typed as a trait object up front so the handle can be cloned and
    // handed to the registry twice.
    let shared: Arc<dyn Validator> = Arc::new(StrictValidator {
        name: String::from("validator"),
        min_length: 5,
    });

    let first_handle = Arc::clone(&shared);
    registry.register(first_handle).unwrap();
    assert_eq!(registry.get_all().len(), 1);

    registry.remove("validator").unwrap();
    assert_eq!(registry.get_all().len(), 0);

    registry.register(shared).unwrap();
    assert_eq!(registry.get_all().len(), 1);
}
851
+
852
/// Registers one plugin in each of the extractor, post-processor and
/// validator registries and checks that each registry reports only its own
/// entry, while the OCR registry created with `new_empty()` stays empty.
#[test]
fn test_multiple_registries_independence() {
    let ocr = OcrBackendRegistry::new_empty();
    let mut extractors = DocumentExtractorRegistry::new();
    let mut processors = PostProcessorRegistry::new();
    let mut validators = ValidatorRegistry::new();

    extractors
        .register(Arc::new(FailingExtractor {
            name: "test-extractor".to_string(),
            should_fail_init: false,
            should_fail_extract: false,
        }))
        .unwrap();
    processors
        .register(
            Arc::new(MetadataModifyingProcessor {
                name: "test-processor".to_string(),
                stage: ProcessingStage::Early,
            }),
            50,
        )
        .unwrap();
    validators
        .register(Arc::new(StrictValidator {
            name: "test-validator".to_string(),
            min_length: 5,
        }))
        .unwrap();

    assert_eq!(ocr.list().len(), 0);
    assert_eq!(extractors.list().len(), 1);
    assert_eq!(processors.list().len(), 1);
    assert_eq!(validators.get_all().len(), 1);
}
884
+
885
/// Calls `shutdown_all()` on all four registries (three of them holding one
/// plugin each, the OCR registry created empty) and checks that every
/// registry reports zero entries afterwards.
#[test]
fn test_shutdown_all_registries() {
    let mut ocr = OcrBackendRegistry::new_empty();
    let mut extractors = DocumentExtractorRegistry::new();
    let mut processors = PostProcessorRegistry::new();
    let mut validators = ValidatorRegistry::new();

    extractors
        .register(Arc::new(FailingExtractor {
            name: "test-extractor".to_string(),
            should_fail_init: false,
            should_fail_extract: false,
        }))
        .unwrap();
    processors
        .register(
            Arc::new(MetadataModifyingProcessor {
                name: "test-processor".to_string(),
                stage: ProcessingStage::Early,
            }),
            50,
        )
        .unwrap();
    validators
        .register(Arc::new(StrictValidator {
            name: "test-validator".to_string(),
            min_length: 5,
        }))
        .unwrap();

    // Same shutdown order as the registries were declared.
    ocr.shutdown_all().unwrap();
    extractors.shutdown_all().unwrap();
    processors.shutdown_all().unwrap();
    validators.shutdown_all().unwrap();

    assert_eq!(ocr.list().len(), 0);
    assert_eq!(extractors.list().len(), 0);
    assert_eq!(processors.list().len(), 0);
    assert_eq!(validators.get_all().len(), 0);
}