kreuzberg 4.0.0.pre.rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (330) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +538 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +157 -0
  8. data/README.md +426 -0
  9. data/Rakefile +25 -0
  10. data/Steepfile +47 -0
  11. data/examples/async_patterns.rb +341 -0
  12. data/ext/kreuzberg_rb/extconf.rb +45 -0
  13. data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
  14. data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
  15. data/ext/kreuzberg_rb/native/README.md +425 -0
  16. data/ext/kreuzberg_rb/native/build.rs +15 -0
  17. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  18. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  19. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  20. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  21. data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
  22. data/extconf.rb +28 -0
  23. data/kreuzberg.gemspec +148 -0
  24. data/lib/kreuzberg/api_proxy.rb +142 -0
  25. data/lib/kreuzberg/cache_api.rb +46 -0
  26. data/lib/kreuzberg/cli.rb +55 -0
  27. data/lib/kreuzberg/cli_proxy.rb +127 -0
  28. data/lib/kreuzberg/config.rb +691 -0
  29. data/lib/kreuzberg/error_context.rb +32 -0
  30. data/lib/kreuzberg/errors.rb +118 -0
  31. data/lib/kreuzberg/extraction_api.rb +85 -0
  32. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  33. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  34. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  35. data/lib/kreuzberg/result.rb +216 -0
  36. data/lib/kreuzberg/setup_lib_path.rb +80 -0
  37. data/lib/kreuzberg/validator_protocol.rb +89 -0
  38. data/lib/kreuzberg/version.rb +5 -0
  39. data/lib/kreuzberg.rb +103 -0
  40. data/sig/kreuzberg/internal.rbs +184 -0
  41. data/sig/kreuzberg.rbs +520 -0
  42. data/spec/binding/cache_spec.rb +227 -0
  43. data/spec/binding/cli_proxy_spec.rb +85 -0
  44. data/spec/binding/cli_spec.rb +55 -0
  45. data/spec/binding/config_spec.rb +345 -0
  46. data/spec/binding/config_validation_spec.rb +283 -0
  47. data/spec/binding/error_handling_spec.rb +213 -0
  48. data/spec/binding/errors_spec.rb +66 -0
  49. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  50. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  51. data/spec/binding/plugins/validator_spec.rb +274 -0
  52. data/spec/fixtures/config.toml +39 -0
  53. data/spec/fixtures/config.yaml +41 -0
  54. data/spec/fixtures/invalid_config.toml +4 -0
  55. data/spec/smoke/package_spec.rb +178 -0
  56. data/spec/spec_helper.rb +42 -0
  57. data/vendor/kreuzberg/Cargo.toml +204 -0
  58. data/vendor/kreuzberg/README.md +175 -0
  59. data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
  60. data/vendor/kreuzberg/build.rs +474 -0
  61. data/vendor/kreuzberg/src/api/error.rs +81 -0
  62. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  63. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  64. data/vendor/kreuzberg/src/api/server.rs +353 -0
  65. data/vendor/kreuzberg/src/api/types.rs +170 -0
  66. data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
  67. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  68. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
  69. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  70. data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
  71. data/vendor/kreuzberg/src/core/io.rs +329 -0
  72. data/vendor/kreuzberg/src/core/mime.rs +605 -0
  73. data/vendor/kreuzberg/src/core/mod.rs +45 -0
  74. data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
  75. data/vendor/kreuzberg/src/embeddings.rs +432 -0
  76. data/vendor/kreuzberg/src/error.rs +431 -0
  77. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  78. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  79. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  80. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  81. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  82. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  83. data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
  84. data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
  85. data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  88. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  89. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
  90. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
  91. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  92. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  93. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  94. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  95. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  96. data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
  97. data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
  98. data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
  99. data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
  100. data/vendor/kreuzberg/src/extractors/email.rs +143 -0
  101. data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
  103. data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
  104. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
  105. data/vendor/kreuzberg/src/extractors/html.rs +393 -0
  106. data/vendor/kreuzberg/src/extractors/image.rs +198 -0
  107. data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
  108. data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
  109. data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
  110. data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
  111. data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
  112. data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
  113. data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
  114. data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
  115. data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
  116. data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
  117. data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
  118. data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
  119. data/vendor/kreuzberg/src/extractors/security.rs +484 -0
  120. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
  121. data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
  122. data/vendor/kreuzberg/src/extractors/text.rs +260 -0
  123. data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
  124. data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
  125. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  126. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  127. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  128. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  129. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  130. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  131. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  132. data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
  133. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  134. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  135. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  136. data/vendor/kreuzberg/src/lib.rs +105 -0
  137. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  138. data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
  139. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  140. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  141. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  142. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  143. data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
  144. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  145. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  146. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  147. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  148. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  149. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  150. data/vendor/kreuzberg/src/panic_context.rs +154 -0
  151. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  152. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  153. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  154. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  155. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  156. data/vendor/kreuzberg/src/pdf/table.rs +393 -0
  157. data/vendor/kreuzberg/src/pdf/text.rs +158 -0
  158. data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
  159. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  160. data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
  161. data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
  162. data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
  163. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  164. data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
  165. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  166. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  167. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  168. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  169. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  170. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  171. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  172. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  173. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  174. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  175. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  176. data/vendor/kreuzberg/src/types.rs +903 -0
  177. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  178. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  179. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  180. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  181. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  182. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  183. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  184. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  185. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  186. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  187. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  188. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  189. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  190. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  191. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  192. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  193. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  194. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  195. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  196. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  197. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  198. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  199. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  200. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  201. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  202. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  203. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  204. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  205. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  206. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  207. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  208. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  209. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  210. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  211. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  212. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  213. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  214. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  215. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  216. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  217. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  218. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  219. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  220. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  221. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  222. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  223. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  224. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  225. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  226. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  227. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  228. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  229. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  230. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  231. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  232. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  233. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  234. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  235. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  236. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  237. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  238. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  239. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  240. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  241. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  242. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  243. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  244. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
  245. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  246. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  247. data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
  248. data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
  249. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
  250. data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
  251. data/vendor/kreuzberg/tests/config_features.rs +598 -0
  252. data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
  253. data/vendor/kreuzberg/tests/core_integration.rs +510 -0
  254. data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
  255. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
  256. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
  257. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
  258. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  259. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
  260. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  261. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
  262. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  263. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  264. data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
  265. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  266. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
  267. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
  268. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
  269. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  270. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  271. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
  272. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
  273. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  274. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  275. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  276. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  277. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  278. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
  279. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
  280. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
  281. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  282. data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
  283. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  284. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
  285. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  286. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  287. data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
  288. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
  289. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
  290. data/vendor/kreuzberg/tests/security_validation.rs +415 -0
  291. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  292. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  293. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
  294. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
  295. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  296. data/vendor/rb-sys/.cargo-ok +1 -0
  297. data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
  298. data/vendor/rb-sys/Cargo.lock +393 -0
  299. data/vendor/rb-sys/Cargo.toml +70 -0
  300. data/vendor/rb-sys/Cargo.toml.orig +57 -0
  301. data/vendor/rb-sys/LICENSE-APACHE +190 -0
  302. data/vendor/rb-sys/LICENSE-MIT +21 -0
  303. data/vendor/rb-sys/bin/release.sh +21 -0
  304. data/vendor/rb-sys/build/features.rs +108 -0
  305. data/vendor/rb-sys/build/main.rs +246 -0
  306. data/vendor/rb-sys/build/stable_api_config.rs +153 -0
  307. data/vendor/rb-sys/build/version.rs +48 -0
  308. data/vendor/rb-sys/readme.md +36 -0
  309. data/vendor/rb-sys/src/bindings.rs +21 -0
  310. data/vendor/rb-sys/src/hidden.rs +11 -0
  311. data/vendor/rb-sys/src/lib.rs +34 -0
  312. data/vendor/rb-sys/src/macros.rs +371 -0
  313. data/vendor/rb-sys/src/memory.rs +53 -0
  314. data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
  315. data/vendor/rb-sys/src/special_consts.rs +31 -0
  316. data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
  317. data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
  318. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
  319. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
  320. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
  321. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
  322. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
  323. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
  324. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
  325. data/vendor/rb-sys/src/stable_api.rs +261 -0
  326. data/vendor/rb-sys/src/symbol.rs +31 -0
  327. data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
  328. data/vendor/rb-sys/src/utils.rs +89 -0
  329. data/vendor/rb-sys/src/value_type.rs +7 -0
  330. metadata +536 -0
@@ -0,0 +1,783 @@
1
+ //! Comprehensive validator plugin system tests.
2
+ //!
3
+ //! Tests custom validator registration, execution, validation logic,
4
+ //! error handling, and cleanup with real file extraction.
5
+
6
+ use async_trait::async_trait;
7
+ use kreuzberg::core::config::ExtractionConfig;
8
+ use kreuzberg::plugins::registry::get_validator_registry;
9
+ use kreuzberg::plugins::{Plugin, Validator};
10
+ use kreuzberg::types::ExtractionResult;
11
+ use kreuzberg::{KreuzbergError, Result, extract_file_sync};
12
+ use serial_test::serial;
13
+ use std::sync::Arc;
14
+ use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
15
+
16
+ struct MinLengthValidator {
17
+ name: String,
18
+ min_length: usize,
19
+ call_count: AtomicUsize,
20
+ }
21
+
22
+ impl Plugin for MinLengthValidator {
23
+ fn name(&self) -> &str {
24
+ &self.name
25
+ }
26
+
27
+ fn version(&self) -> String {
28
+ "1.0.0".to_string()
29
+ }
30
+
31
+ fn initialize(&self) -> Result<()> {
32
+ Ok(())
33
+ }
34
+
35
+ fn shutdown(&self) -> Result<()> {
36
+ Ok(())
37
+ }
38
+ }
39
+
40
+ #[async_trait]
41
+ impl Validator for MinLengthValidator {
42
+ async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
43
+ self.call_count.fetch_add(1, Ordering::SeqCst);
44
+
45
+ if result.content.len() < self.min_length {
46
+ Err(KreuzbergError::validation(format!(
47
+ "Content too short: {} < {} characters",
48
+ result.content.len(),
49
+ self.min_length
50
+ )))
51
+ } else {
52
+ Ok(())
53
+ }
54
+ }
55
+
56
+ fn priority(&self) -> i32 {
57
+ 50
58
+ }
59
+ }
60
+
61
+ struct PassingValidator {
62
+ name: String,
63
+ initialized: AtomicBool,
64
+ }
65
+
66
+ impl Plugin for PassingValidator {
67
+ fn name(&self) -> &str {
68
+ &self.name
69
+ }
70
+
71
+ fn version(&self) -> String {
72
+ "1.0.0".to_string()
73
+ }
74
+
75
+ fn initialize(&self) -> Result<()> {
76
+ self.initialized.store(true, Ordering::Release);
77
+ Ok(())
78
+ }
79
+
80
+ fn shutdown(&self) -> Result<()> {
81
+ self.initialized.store(false, Ordering::Release);
82
+ Ok(())
83
+ }
84
+ }
85
+
86
+ #[async_trait]
87
+ impl Validator for PassingValidator {
88
+ async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
89
+ Ok(())
90
+ }
91
+ }
92
+
93
+ struct MimeTypeValidator {
94
+ name: String,
95
+ allowed_mime: String,
96
+ }
97
+
98
+ impl Plugin for MimeTypeValidator {
99
+ fn name(&self) -> &str {
100
+ &self.name
101
+ }
102
+
103
+ fn version(&self) -> String {
104
+ "1.0.0".to_string()
105
+ }
106
+
107
+ fn initialize(&self) -> Result<()> {
108
+ Ok(())
109
+ }
110
+
111
+ fn shutdown(&self) -> Result<()> {
112
+ Ok(())
113
+ }
114
+ }
115
+
116
+ #[async_trait]
117
+ impl Validator for MimeTypeValidator {
118
+ async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
119
+ if result.mime_type != self.allowed_mime {
120
+ Err(KreuzbergError::validation(format!(
121
+ "MIME type '{}' not allowed, expected '{}'",
122
+ result.mime_type, self.allowed_mime
123
+ )))
124
+ } else {
125
+ Ok(())
126
+ }
127
+ }
128
+
129
+ fn should_validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> bool {
130
+ !result.mime_type.is_empty()
131
+ }
132
+ }
133
+
134
+ struct MetadataValidator {
135
+ name: String,
136
+ required_key: String,
137
+ }
138
+
139
+ impl Plugin for MetadataValidator {
140
+ fn name(&self) -> &str {
141
+ &self.name
142
+ }
143
+
144
+ fn version(&self) -> String {
145
+ "1.0.0".to_string()
146
+ }
147
+
148
+ fn initialize(&self) -> Result<()> {
149
+ Ok(())
150
+ }
151
+
152
+ fn shutdown(&self) -> Result<()> {
153
+ Ok(())
154
+ }
155
+ }
156
+
157
+ #[async_trait]
158
+ impl Validator for MetadataValidator {
159
+ async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
160
+ if !result.metadata.additional.contains_key(&self.required_key) {
161
+ Err(KreuzbergError::validation(format!(
162
+ "Required metadata key '{}' missing",
163
+ self.required_key
164
+ )))
165
+ } else {
166
+ Ok(())
167
+ }
168
+ }
169
+
170
+ fn priority(&self) -> i32 {
171
+ 100
172
+ }
173
+ }
174
+
175
+ struct FailingValidator {
176
+ name: String,
177
+ }
178
+
179
+ impl Plugin for FailingValidator {
180
+ fn name(&self) -> &str {
181
+ &self.name
182
+ }
183
+
184
+ fn version(&self) -> String {
185
+ "1.0.0".to_string()
186
+ }
187
+
188
+ fn initialize(&self) -> Result<()> {
189
+ Ok(())
190
+ }
191
+
192
+ fn shutdown(&self) -> Result<()> {
193
+ Ok(())
194
+ }
195
+ }
196
+
197
+ #[async_trait]
198
+ impl Validator for FailingValidator {
199
+ async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
200
+ Err(KreuzbergError::validation(
201
+ "Validation intentionally failed".to_string(),
202
+ ))
203
+ }
204
+ }
205
+
206
+ struct TrackingValidator {
207
+ name: String,
208
+ called: AtomicBool,
209
+ }
210
+
211
+ impl Plugin for TrackingValidator {
212
+ fn name(&self) -> &str {
213
+ &self.name
214
+ }
215
+
216
+ fn version(&self) -> String {
217
+ "1.0.0".to_string()
218
+ }
219
+
220
+ fn initialize(&self) -> Result<()> {
221
+ Ok(())
222
+ }
223
+
224
+ fn shutdown(&self) -> Result<()> {
225
+ Ok(())
226
+ }
227
+ }
228
+
229
+ #[async_trait]
230
+ impl Validator for TrackingValidator {
231
+ async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
232
+ self.called.store(true, Ordering::Release);
233
+ Ok(())
234
+ }
235
+ }
236
+
237
/// Registering a validator must run its `initialize` hook and make its name
/// appear in the registry listing.
#[test]
#[serial]
fn test_register_custom_validator() {
    let registry = get_validator_registry();

    // Shut down whatever earlier tests may have left in the shared registry.
    registry.write().unwrap().shutdown_all().unwrap();

    let validator = Arc::new(PassingValidator {
        name: String::from("test-validator"),
        initialized: AtomicBool::new(false),
    });

    {
        // Keep the concrete handle so the `initialized` flag can be inspected
        // after the registry has taken its own reference.
        let mut guard = registry.write().unwrap();
        let outcome = guard.register(Arc::clone(&validator) as Arc<dyn Validator>);
        assert!(outcome.is_ok(), "Failed to register validator: {:?}", outcome.err());
    }

    let is_initialized = validator.initialized.load(Ordering::Acquire);
    assert!(is_initialized, "Validator was not initialized");

    let registered_names = registry.read().unwrap().list();
    assert!(registered_names.contains(&"test-validator".to_string()));

    // Leave the registry clean for the next test.
    registry.write().unwrap().shutdown_all().unwrap();
}
275
+
276
/// A registered validator must be invoked exactly once for a single
/// file extraction.
#[test]
#[serial]
fn test_validator_called_during_extraction() {
    let fixture_path = "../../test_documents/text/fake_text.txt";
    let registry = get_validator_registry();

    // Shut down whatever earlier tests may have left in the shared registry.
    registry.write().unwrap().shutdown_all().unwrap();

    let validator = Arc::new(MinLengthValidator {
        name: String::from("call-test-validator"),
        min_length: 1,
        call_count: AtomicUsize::new(0),
    });

    // Register a second handle; the original is kept to read the counter.
    let registered = Arc::clone(&validator) as Arc<dyn Validator>;
    registry.write().unwrap().register(registered).unwrap();

    let default_config = ExtractionConfig::default();
    let extraction = extract_file_sync(fixture_path, None, &default_config);

    assert!(extraction.is_ok(), "Extraction failed: {:?}", extraction.err());

    let observed_calls = validator.call_count.load(Ordering::SeqCst);
    assert_eq!(observed_calls, 1, "Validator was not called exactly once");

    // Leave the registry clean for the next test.
    registry.write().unwrap().shutdown_all().unwrap();
}
314
+
315
/// A validator demanding far more content than the fixture provides must
/// turn the extraction into a `Validation` error.
#[test]
#[serial]
fn test_validator_can_reject_invalid_input() {
    let fixture_path = "../../test_documents/text/fake_text.txt";
    let registry = get_validator_registry();

    // Shut down whatever earlier tests may have left in the shared registry.
    registry.write().unwrap().shutdown_all().unwrap();

    // One million is used as a threshold the fixture is expected not to reach.
    let validator: Arc<dyn Validator> = Arc::new(MinLengthValidator {
        name: String::from("reject-validator"),
        min_length: 1_000_000,
        call_count: AtomicUsize::new(0),
    });
    registry.write().unwrap().register(validator).unwrap();

    let default_config = ExtractionConfig::default();
    let extraction = extract_file_sync(fixture_path, None, &default_config);

    assert!(extraction.is_err(), "Expected validation to fail");

    let failure = extraction.err().unwrap();
    if let KreuzbergError::Validation { message, .. } = &failure {
        assert!(message.contains("Content too short"));
    } else {
        panic!("Expected Validation error, got: {:?}", failure);
    }

    // Leave the registry clean for the next test.
    registry.write().unwrap().shutdown_all().unwrap();
}
354
+
355
/// A validator with a small minimum length must let the fixture extraction
/// succeed.
#[test]
#[serial]
fn test_validator_can_pass_valid_input() {
    let fixture_path = "../../test_documents/text/fake_text.txt";
    let registry = get_validator_registry();

    // Shut down whatever earlier tests may have left in the shared registry.
    registry.write().unwrap().shutdown_all().unwrap();

    let validator: Arc<dyn Validator> = Arc::new(MinLengthValidator {
        name: String::from("pass-validator"),
        min_length: 10,
        call_count: AtomicUsize::new(0),
    });
    registry.write().unwrap().register(validator).unwrap();

    let default_config = ExtractionConfig::default();
    let extraction = extract_file_sync(fixture_path, None, &default_config);

    assert!(
        extraction.is_ok(),
        "Validation should have passed: {:?}",
        extraction.err()
    );

    // Leave the registry clean for the next test.
    registry.write().unwrap().shutdown_all().unwrap();
}
387
+
388
/// The validator must see the real extraction result: a `text/plain`-only
/// validator accepts the text fixture, and the returned result carries that
/// MIME type.
#[test]
#[serial]
fn test_validator_receives_correct_parameters() {
    let fixture_path = "../../test_documents/text/fake_text.txt";
    let registry = get_validator_registry();

    // Shut down whatever earlier tests may have left in the shared registry.
    registry.write().unwrap().shutdown_all().unwrap();

    let validator: Arc<dyn Validator> = Arc::new(MimeTypeValidator {
        name: String::from("mime-validator"),
        allowed_mime: String::from("text/plain"),
    });
    registry.write().unwrap().register(validator).unwrap();

    let default_config = ExtractionConfig::default();
    let extraction = extract_file_sync(fixture_path, None, &default_config);

    assert!(extraction.is_ok(), "Validation failed: {:?}", extraction.err());

    let extracted = extraction.unwrap();
    assert_eq!(extracted.mime_type, "text/plain");

    // Leave the registry clean for the next test.
    registry.write().unwrap().shutdown_all().unwrap();
}
422
+
423
/// A validator that only allows `application/pdf` must reject the text
/// fixture with a `Validation` error mentioning the MIME type.
#[test]
#[serial]
fn test_validator_rejects_wrong_mime_type() {
    let fixture_path = "../../test_documents/text/fake_text.txt";
    let registry = get_validator_registry();

    // Shut down whatever earlier tests may have left in the shared registry.
    registry.write().unwrap().shutdown_all().unwrap();

    let validator: Arc<dyn Validator> = Arc::new(MimeTypeValidator {
        name: String::from("strict-mime-validator"),
        allowed_mime: String::from("application/pdf"),
    });
    registry.write().unwrap().register(validator).unwrap();

    let default_config = ExtractionConfig::default();
    let extraction = extract_file_sync(fixture_path, None, &default_config);

    assert!(extraction.is_err(), "Expected MIME type validation to fail");

    let failure = extraction.err().unwrap();
    if let KreuzbergError::Validation { message, .. } = &failure {
        assert!(message.contains("MIME type"));
        assert!(message.contains("not allowed"));
    } else {
        panic!("Expected Validation error, got: {:?}", failure);
    }

    // Leave the registry clean for the next test.
    registry.write().unwrap().shutdown_all().unwrap();
}
462
+
463
/// Removing a validator by name must drop it from the listing and stop it
/// from affecting later extractions.
#[test]
#[serial]
fn test_unregister_validator() {
    let registry = get_validator_registry();

    // Shut down whatever earlier tests may have left in the shared registry.
    registry.write().unwrap().shutdown_all().unwrap();

    // An always-failing validator makes it obvious if removal did not work:
    // the extraction below would then fail.
    let validator: Arc<dyn Validator> = Arc::new(FailingValidator {
        name: String::from("unregister-test"),
    });
    registry.write().unwrap().register(validator).unwrap();

    registry.write().unwrap().remove("unregister-test").unwrap();

    let registered_names = registry.read().unwrap().list();
    assert!(!registered_names.contains(&"unregister-test".to_string()));

    let fixture_path = "../../test_documents/text/fake_text.txt";
    let default_config = ExtractionConfig::default();
    let extraction = extract_file_sync(fixture_path, None, &default_config);

    assert!(
        extraction.is_ok(),
        "Extraction should succeed after unregistering validator"
    );

    // Leave the registry clean for the next test.
    registry.write().unwrap().shutdown_all().unwrap();
}
508
+
509
/// Registers two `FailingValidator`s, calls `shutdown_all`, and checks that
/// the registry listing is empty and that extraction of the text fixture
/// succeeds afterwards.
#[test]
#[serial]
fn test_clear_all_validators() {
    let registry = get_validator_registry();

    // Reset step before the scenario starts.
    registry.write().unwrap().shutdown_all().unwrap();

    let first: Arc<dyn Validator> = Arc::new(FailingValidator {
        name: "clear-test-1".to_string(),
    });
    let second: Arc<dyn Validator> = Arc::new(FailingValidator {
        name: "clear-test-2".to_string(),
    });

    // Both registrations happen under a single write guard.
    {
        let mut guard = registry.write().unwrap();
        guard.register(first).unwrap();
        guard.register(second).unwrap();
    }

    // The operation under test.
    registry.write().unwrap().shutdown_all().unwrap();

    let remaining = registry.read().unwrap().list();
    assert!(remaining.is_empty(), "Registry was not cleared");

    let fixture = "../../test_documents/text/fake_text.txt";
    let cfg = ExtractionConfig::default();
    let outcome = extract_file_sync(fixture, None, &cfg);

    assert!(outcome.is_ok(), "Extraction should succeed after clearing validators");
}
551
+
552
/// Attempts to register a validator named `"invalid name"` and checks that
/// the registry rejects it with `KreuzbergError::Validation`.
///
/// NOTE(review): presumably the embedded space is what makes the name
/// invalid; the naming rules live in the registry, not in this test.
#[test]
#[serial]
fn test_validator_invalid_name() {
    let registry = get_validator_registry();

    // Reset step before the scenario starts.
    {
        let mut reg = registry.write().unwrap();
        reg.shutdown_all().unwrap();
    }

    let validator = Arc::new(PassingValidator {
        name: "invalid name".to_string(),
        initialized: AtomicBool::new(false),
    });

    // Capture the outcome and release the write guard *before* asserting.
    // A failed assertion while the guard is held would panic under the lock
    // and poison the registry's `RwLock`, making every later
    // `registry.write().unwrap()` in this test binary fail for an unrelated
    // reason.
    let result = {
        let mut reg = registry.write().unwrap();
        reg.register(validator)
    };

    assert!(
        result.is_err(),
        "Registering a validator with an invalid name should fail"
    );
    let error = result.err().unwrap();
    assert!(
        matches!(error, KreuzbergError::Validation { .. }),
        "Expected Validation error, got: {:?}",
        error
    );

    {
        let mut reg = registry.write().unwrap();
        reg.shutdown_all().unwrap();
    }
}
580
+
581
/// Follows a `PassingValidator`'s `initialized` flag through its lifecycle:
/// false when constructed, true after `register`, false again after
/// `shutdown_all`.
#[test]
#[serial]
fn test_validator_initialization_lifecycle() {
    let registry = get_validator_registry();

    // Reset step before the scenario starts.
    registry.write().unwrap().shutdown_all().unwrap();

    // Keep the concrete handle so the flag can be inspected after the
    // registry has taken its own reference.
    let validator = Arc::new(PassingValidator {
        name: "lifecycle-test".to_string(),
        initialized: AtomicBool::new(false),
    });

    let ready_before = validator.initialized.load(Ordering::Acquire);
    assert!(!ready_before, "Validator should not be initialized yet");

    let registered: Arc<dyn Validator> = validator.clone();
    registry.write().unwrap().register(registered).unwrap();

    let ready_after_register = validator.initialized.load(Ordering::Acquire);
    assert!(
        ready_after_register,
        "Validator should be initialized after registration"
    );

    registry.write().unwrap().shutdown_all().unwrap();

    let ready_after_shutdown = validator.initialized.load(Ordering::Acquire);
    assert!(!ready_after_shutdown, "Validator should be shutdown");
}
621
+
622
/// Registers a `MinLengthValidator` and a `MimeTypeValidator` together and
/// checks that extraction of the text fixture succeeds and that the
/// length validator's `call_count` is exactly 1 afterwards.
#[test]
#[serial]
fn test_multiple_validators_execution() {
    let registry = get_validator_registry();
    let fixture = "../../test_documents/text/fake_text.txt";

    // Reset step before the scenario starts.
    registry.write().unwrap().shutdown_all().unwrap();

    // The concrete handle is retained so the call counter can be read back.
    let length_check = Arc::new(MinLengthValidator {
        name: "multi-validator-1".to_string(),
        min_length: 10,
        call_count: AtomicUsize::new(0),
    });
    let mime_check: Arc<dyn Validator> = Arc::new(MimeTypeValidator {
        name: "multi-validator-2".to_string(),
        allowed_mime: "text/plain".to_string(),
    });

    // Both registrations happen under a single write guard.
    {
        let mut guard = registry.write().unwrap();
        let length_check_dyn: Arc<dyn Validator> = length_check.clone();
        guard.register(length_check_dyn).unwrap();
        guard.register(mime_check).unwrap();
    }

    let cfg = ExtractionConfig::default();
    let outcome = extract_file_sync(fixture, None, &cfg);

    assert!(outcome.is_ok(), "Both validators should pass");
    assert_eq!(length_check.call_count.load(Ordering::SeqCst), 1);

    registry.write().unwrap().shutdown_all().unwrap();
}
661
+
662
/// Registers a `MetadataValidator` requiring the key `nonexistent_key`
/// alongside a `PassingValidator`, and checks that extraction fails with a
/// `KreuzbergError::Validation` whose message contains
/// "Required metadata key".
///
/// NOTE(review): the validators' priorities are defined outside this test;
/// confirm that "high" and "low" actually correspond to their `priority`
/// values.
#[test]
#[serial]
fn test_validator_priority_execution_order() {
    let registry = get_validator_registry();
    let fixture = "../../test_documents/text/fake_text.txt";

    // Reset step before the scenario starts.
    registry.write().unwrap().shutdown_all().unwrap();

    let high_priority: Arc<dyn Validator> = Arc::new(MetadataValidator {
        name: "high-priority-validator".to_string(),
        required_key: "nonexistent_key".to_string(),
    });
    let low_priority: Arc<dyn Validator> = Arc::new(PassingValidator {
        name: "low-priority-validator".to_string(),
        initialized: AtomicBool::new(false),
    });

    // Both registrations happen under a single write guard.
    {
        let mut guard = registry.write().unwrap();
        guard.register(high_priority).unwrap();
        guard.register(low_priority).unwrap();
    }

    let cfg = ExtractionConfig::default();
    let outcome = extract_file_sync(fixture, None, &cfg);

    assert!(outcome.is_err(), "Expected high-priority validator to fail");

    match outcome {
        Err(KreuzbergError::Validation { message, .. }) => {
            assert!(message.contains("Required metadata key"));
        }
        Err(other) => panic!("Expected Validation error, got: {:?}", other),
        // Ruled out by the `is_err` assertion above.
        Ok(_) => unreachable!(),
    }

    registry.write().unwrap().shutdown_all().unwrap();
}
706
+
707
/// Registers a single `FailingValidator` and checks that extraction of the
/// text fixture fails with a `KreuzbergError::Validation` whose message
/// contains "intentionally failed".
#[test]
#[serial]
fn test_validator_always_fails() {
    let registry = get_validator_registry();
    let fixture = "../../test_documents/text/fake_text.txt";

    // Reset step before the scenario starts.
    registry.write().unwrap().shutdown_all().unwrap();

    let rejecting: Arc<dyn Validator> = Arc::new(FailingValidator {
        name: "always-fails".to_string(),
    });
    registry.write().unwrap().register(rejecting).unwrap();

    let cfg = ExtractionConfig::default();
    let outcome = extract_file_sync(fixture, None, &cfg);

    assert!(outcome.is_err(), "Validator should always fail");

    match outcome {
        Err(KreuzbergError::Validation { message, .. }) => {
            assert!(message.contains("intentionally failed"));
        }
        Err(other) => panic!("Expected Validation error, got: {:?}", other),
        // Ruled out by the `is_err` assertion above.
        Ok(_) => unreachable!(),
    }

    registry.write().unwrap().shutdown_all().unwrap();
}
744
+
745
/// Registers a `FailingValidator` ("order-first") followed by a
/// `TrackingValidator` ("order-second") and checks that extraction fails
/// while the tracker's `called` flag stays false, i.e. the second validator
/// was never invoked.
#[test]
#[serial]
fn test_validator_registration_order_preserved_for_same_priority() {
    let registry = get_validator_registry();
    let fixture = "../../test_documents/text/fake_text.txt";

    // Reset step before the scenario starts.
    registry.write().unwrap().shutdown_all().unwrap();

    // The concrete handle is retained so the `called` flag can be read back.
    let tracker = Arc::new(TrackingValidator {
        name: "order-second".to_string(),
        called: AtomicBool::new(false),
    });

    // Registration order matters here: the failing validator goes in first.
    {
        let mut guard = registry.write().unwrap();
        let first: Arc<dyn Validator> = Arc::new(FailingValidator {
            name: "order-first".to_string(),
        });
        guard.register(first).unwrap();
        let second: Arc<dyn Validator> = tracker.clone();
        guard.register(second).unwrap();
    }

    let cfg = ExtractionConfig::default();
    let outcome = extract_file_sync(fixture, None, &cfg);

    assert!(outcome.is_err(), "Expected first validator to fail");

    let second_ran = tracker.called.load(Ordering::Acquire);
    assert!(
        !second_ran,
        "Second validator should not run once the first validator fails"
    );

    registry.write().unwrap().shutdown_all().unwrap();
}