kreuzberg 4.0.0.rc2 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (446) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +14 -14
  3. data/.rspec +3 -3
  4. data/.rubocop.yaml +1 -1
  5. data/.rubocop.yml +543 -538
  6. data/Gemfile +8 -8
  7. data/Gemfile.lock +194 -6
  8. data/README.md +396 -426
  9. data/Rakefile +34 -25
  10. data/Steepfile +51 -47
  11. data/examples/async_patterns.rb +283 -341
  12. data/ext/kreuzberg_rb/extconf.rb +65 -45
  13. data/ext/kreuzberg_rb/native/.cargo/config.toml +23 -0
  14. data/ext/kreuzberg_rb/native/Cargo.lock +7619 -6535
  15. data/ext/kreuzberg_rb/native/Cargo.toml +75 -44
  16. data/ext/kreuzberg_rb/native/README.md +425 -425
  17. data/ext/kreuzberg_rb/native/build.rs +15 -15
  18. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -11
  19. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -14
  20. data/ext/kreuzberg_rb/native/include/strings.h +20 -20
  21. data/ext/kreuzberg_rb/native/include/unistd.h +47 -47
  22. data/ext/kreuzberg_rb/native/src/lib.rs +3802 -2998
  23. data/extconf.rb +60 -28
  24. data/kreuzberg.gemspec +199 -148
  25. data/lib/kreuzberg/api_proxy.rb +126 -142
  26. data/lib/kreuzberg/cache_api.rb +67 -46
  27. data/lib/kreuzberg/cli.rb +47 -55
  28. data/lib/kreuzberg/cli_proxy.rb +117 -127
  29. data/lib/kreuzberg/config.rb +936 -691
  30. data/lib/kreuzberg/error_context.rb +136 -32
  31. data/lib/kreuzberg/errors.rb +116 -118
  32. data/lib/kreuzberg/extraction_api.rb +313 -85
  33. data/lib/kreuzberg/mcp_proxy.rb +177 -186
  34. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -113
  35. data/lib/kreuzberg/post_processor_protocol.rb +15 -86
  36. data/lib/kreuzberg/result.rb +334 -216
  37. data/lib/kreuzberg/setup_lib_path.rb +99 -80
  38. data/lib/kreuzberg/types.rb +170 -0
  39. data/lib/kreuzberg/validator_protocol.rb +16 -89
  40. data/lib/kreuzberg/version.rb +5 -5
  41. data/lib/kreuzberg.rb +96 -103
  42. data/lib/libpdfium.so +0 -0
  43. data/sig/kreuzberg/internal.rbs +184 -184
  44. data/sig/kreuzberg.rbs +561 -520
  45. data/spec/binding/async_operations_spec.rb +473 -0
  46. data/spec/binding/batch_operations_spec.rb +595 -0
  47. data/spec/binding/batch_spec.rb +359 -0
  48. data/spec/binding/cache_spec.rb +227 -227
  49. data/spec/binding/cli_proxy_spec.rb +85 -85
  50. data/spec/binding/cli_spec.rb +55 -55
  51. data/spec/binding/config_result_spec.rb +377 -0
  52. data/spec/binding/config_spec.rb +419 -345
  53. data/spec/binding/config_validation_spec.rb +377 -283
  54. data/spec/binding/embeddings_spec.rb +816 -0
  55. data/spec/binding/error_handling_spec.rb +399 -213
  56. data/spec/binding/error_recovery_spec.rb +488 -0
  57. data/spec/binding/errors_spec.rb +66 -66
  58. data/spec/binding/font_config_spec.rb +220 -0
  59. data/spec/binding/images_spec.rb +738 -0
  60. data/spec/binding/keywords_extraction_spec.rb +600 -0
  61. data/spec/binding/metadata_types_spec.rb +1228 -0
  62. data/spec/binding/pages_extraction_spec.rb +471 -0
  63. data/spec/binding/plugins/ocr_backend_spec.rb +307 -307
  64. data/spec/binding/plugins/postprocessor_spec.rb +269 -269
  65. data/spec/binding/plugins/validator_spec.rb +273 -274
  66. data/spec/binding/tables_spec.rb +641 -0
  67. data/spec/fixtures/config.toml +38 -39
  68. data/spec/fixtures/config.yaml +41 -41
  69. data/spec/fixtures/invalid_config.toml +3 -4
  70. data/spec/smoke/package_spec.rb +177 -178
  71. data/spec/spec_helper.rb +40 -42
  72. data/spec/unit/config/chunking_config_spec.rb +213 -0
  73. data/spec/unit/config/embedding_config_spec.rb +343 -0
  74. data/spec/unit/config/extraction_config_spec.rb +438 -0
  75. data/spec/unit/config/font_config_spec.rb +285 -0
  76. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  77. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  78. data/spec/unit/config/image_preprocessing_config_spec.rb +249 -0
  79. data/spec/unit/config/keyword_config_spec.rb +229 -0
  80. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  81. data/spec/unit/config/ocr_config_spec.rb +171 -0
  82. data/spec/unit/config/page_config_spec.rb +221 -0
  83. data/spec/unit/config/pdf_config_spec.rb +267 -0
  84. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  85. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  86. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  87. data/test/metadata_types_test.rb +959 -0
  88. data/vendor/Cargo.toml +61 -0
  89. data/vendor/kreuzberg/Cargo.toml +259 -204
  90. data/vendor/kreuzberg/README.md +263 -175
  91. data/vendor/kreuzberg/build.rs +782 -474
  92. data/vendor/kreuzberg/examples/bench_fixes.rs +71 -0
  93. data/vendor/kreuzberg/examples/test_pdfium_fork.rs +62 -0
  94. data/vendor/kreuzberg/src/api/error.rs +81 -81
  95. data/vendor/kreuzberg/src/api/handlers.rs +320 -199
  96. data/vendor/kreuzberg/src/api/mod.rs +94 -79
  97. data/vendor/kreuzberg/src/api/server.rs +518 -353
  98. data/vendor/kreuzberg/src/api/types.rs +206 -170
  99. data/vendor/kreuzberg/src/cache/mod.rs +1167 -1167
  100. data/vendor/kreuzberg/src/chunking/mod.rs +2303 -677
  101. data/vendor/kreuzberg/src/chunking/processor.rs +219 -0
  102. data/vendor/kreuzberg/src/core/batch_mode.rs +95 -95
  103. data/vendor/kreuzberg/src/core/batch_optimizations.rs +385 -0
  104. data/vendor/kreuzberg/src/core/config.rs +1914 -1032
  105. data/vendor/kreuzberg/src/core/config_validation.rs +949 -0
  106. data/vendor/kreuzberg/src/core/extractor.rs +1200 -1024
  107. data/vendor/kreuzberg/src/core/formats.rs +235 -0
  108. data/vendor/kreuzberg/src/core/io.rs +329 -329
  109. data/vendor/kreuzberg/src/core/mime.rs +605 -605
  110. data/vendor/kreuzberg/src/core/mod.rs +61 -45
  111. data/vendor/kreuzberg/src/core/pipeline.rs +1223 -984
  112. data/vendor/kreuzberg/src/core/server_config.rs +1220 -0
  113. data/vendor/kreuzberg/src/embeddings.rs +471 -432
  114. data/vendor/kreuzberg/src/error.rs +431 -431
  115. data/vendor/kreuzberg/src/extraction/archive.rs +959 -954
  116. data/vendor/kreuzberg/src/extraction/capacity.rs +263 -0
  117. data/vendor/kreuzberg/src/extraction/docx.rs +404 -40
  118. data/vendor/kreuzberg/src/extraction/email.rs +855 -854
  119. data/vendor/kreuzberg/src/extraction/excel.rs +697 -688
  120. data/vendor/kreuzberg/src/extraction/html.rs +1830 -553
  121. data/vendor/kreuzberg/src/extraction/image.rs +492 -368
  122. data/vendor/kreuzberg/src/extraction/libreoffice.rs +574 -563
  123. data/vendor/kreuzberg/src/extraction/markdown.rs +216 -213
  124. data/vendor/kreuzberg/src/extraction/mod.rs +93 -81
  125. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -398
  126. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -247
  127. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -240
  128. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -130
  129. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +284 -287
  130. data/vendor/kreuzberg/src/extraction/pptx.rs +3102 -3000
  131. data/vendor/kreuzberg/src/extraction/structured.rs +491 -490
  132. data/vendor/kreuzberg/src/extraction/table.rs +329 -328
  133. data/vendor/kreuzberg/src/extraction/text.rs +277 -269
  134. data/vendor/kreuzberg/src/extraction/xml.rs +333 -333
  135. data/vendor/kreuzberg/src/extractors/archive.rs +447 -446
  136. data/vendor/kreuzberg/src/extractors/bibtex.rs +470 -469
  137. data/vendor/kreuzberg/src/extractors/docbook.rs +504 -502
  138. data/vendor/kreuzberg/src/extractors/docx.rs +400 -367
  139. data/vendor/kreuzberg/src/extractors/email.rs +157 -143
  140. data/vendor/kreuzberg/src/extractors/epub.rs +696 -707
  141. data/vendor/kreuzberg/src/extractors/excel.rs +385 -343
  142. data/vendor/kreuzberg/src/extractors/fictionbook.rs +492 -491
  143. data/vendor/kreuzberg/src/extractors/html.rs +419 -393
  144. data/vendor/kreuzberg/src/extractors/image.rs +219 -198
  145. data/vendor/kreuzberg/src/extractors/jats.rs +1054 -1051
  146. data/vendor/kreuzberg/src/extractors/jupyter.rs +368 -367
  147. data/vendor/kreuzberg/src/extractors/latex.rs +653 -652
  148. data/vendor/kreuzberg/src/extractors/markdown.rs +701 -700
  149. data/vendor/kreuzberg/src/extractors/mod.rs +429 -365
  150. data/vendor/kreuzberg/src/extractors/odt.rs +628 -628
  151. data/vendor/kreuzberg/src/extractors/opml.rs +635 -634
  152. data/vendor/kreuzberg/src/extractors/orgmode.rs +529 -528
  153. data/vendor/kreuzberg/src/extractors/pdf.rs +761 -493
  154. data/vendor/kreuzberg/src/extractors/pptx.rs +279 -248
  155. data/vendor/kreuzberg/src/extractors/rst.rs +577 -576
  156. data/vendor/kreuzberg/src/extractors/rtf.rs +809 -810
  157. data/vendor/kreuzberg/src/extractors/security.rs +484 -484
  158. data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -367
  159. data/vendor/kreuzberg/src/extractors/structured.rs +142 -140
  160. data/vendor/kreuzberg/src/extractors/text.rs +265 -260
  161. data/vendor/kreuzberg/src/extractors/typst.rs +651 -650
  162. data/vendor/kreuzberg/src/extractors/xml.rs +147 -135
  163. data/vendor/kreuzberg/src/image/dpi.rs +164 -164
  164. data/vendor/kreuzberg/src/image/mod.rs +6 -6
  165. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -417
  166. data/vendor/kreuzberg/src/image/resize.rs +89 -89
  167. data/vendor/kreuzberg/src/keywords/config.rs +154 -154
  168. data/vendor/kreuzberg/src/keywords/mod.rs +237 -237
  169. data/vendor/kreuzberg/src/keywords/processor.rs +275 -267
  170. data/vendor/kreuzberg/src/keywords/rake.rs +293 -293
  171. data/vendor/kreuzberg/src/keywords/types.rs +68 -68
  172. data/vendor/kreuzberg/src/keywords/yake.rs +163 -163
  173. data/vendor/kreuzberg/src/language_detection/mod.rs +985 -942
  174. data/vendor/kreuzberg/src/language_detection/processor.rs +218 -0
  175. data/vendor/kreuzberg/src/lib.rs +114 -105
  176. data/vendor/kreuzberg/src/mcp/mod.rs +35 -32
  177. data/vendor/kreuzberg/src/mcp/server.rs +2090 -1968
  178. data/vendor/kreuzberg/src/ocr/cache.rs +469 -469
  179. data/vendor/kreuzberg/src/ocr/error.rs +37 -37
  180. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -216
  181. data/vendor/kreuzberg/src/ocr/language_registry.rs +520 -0
  182. data/vendor/kreuzberg/src/ocr/mod.rs +60 -58
  183. data/vendor/kreuzberg/src/ocr/processor.rs +858 -863
  184. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -4
  185. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -144
  186. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +456 -450
  187. data/vendor/kreuzberg/src/ocr/types.rs +393 -393
  188. data/vendor/kreuzberg/src/ocr/utils.rs +47 -47
  189. data/vendor/kreuzberg/src/ocr/validation.rs +206 -206
  190. data/vendor/kreuzberg/src/panic_context.rs +154 -154
  191. data/vendor/kreuzberg/src/pdf/bindings.rs +306 -0
  192. data/vendor/kreuzberg/src/pdf/bundled.rs +408 -0
  193. data/vendor/kreuzberg/src/pdf/error.rs +214 -122
  194. data/vendor/kreuzberg/src/pdf/fonts.rs +358 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy.rs +903 -0
  196. data/vendor/kreuzberg/src/pdf/images.rs +139 -139
  197. data/vendor/kreuzberg/src/pdf/metadata.rs +509 -346
  198. data/vendor/kreuzberg/src/pdf/mod.rs +81 -50
  199. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -369
  200. data/vendor/kreuzberg/src/pdf/table.rs +417 -393
  201. data/vendor/kreuzberg/src/pdf/text.rs +553 -158
  202. data/vendor/kreuzberg/src/plugins/extractor.rs +1042 -1013
  203. data/vendor/kreuzberg/src/plugins/mod.rs +212 -209
  204. data/vendor/kreuzberg/src/plugins/ocr.rs +637 -620
  205. data/vendor/kreuzberg/src/plugins/processor.rs +650 -642
  206. data/vendor/kreuzberg/src/plugins/registry.rs +1339 -1337
  207. data/vendor/kreuzberg/src/plugins/traits.rs +258 -258
  208. data/vendor/kreuzberg/src/plugins/validator.rs +967 -956
  209. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -1470
  210. data/vendor/kreuzberg/src/text/mod.rs +27 -19
  211. data/vendor/kreuzberg/src/text/quality.rs +710 -697
  212. data/vendor/kreuzberg/src/text/quality_processor.rs +231 -0
  213. data/vendor/kreuzberg/src/text/string_utils.rs +229 -217
  214. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -164
  215. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -100
  216. data/vendor/kreuzberg/src/text/token_reduction/core.rs +832 -796
  217. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +923 -902
  218. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -160
  219. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -619
  220. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +148 -147
  221. data/vendor/kreuzberg/src/text/utf8_validation.rs +193 -0
  222. data/vendor/kreuzberg/src/types.rs +1713 -903
  223. data/vendor/kreuzberg/src/utils/mod.rs +31 -17
  224. data/vendor/kreuzberg/src/utils/pool.rs +503 -0
  225. data/vendor/kreuzberg/src/utils/pool_sizing.rs +364 -0
  226. data/vendor/kreuzberg/src/utils/quality.rs +968 -959
  227. data/vendor/kreuzberg/src/utils/string_pool.rs +761 -0
  228. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -381
  229. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -53
  230. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -482
  231. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -261
  232. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -400
  233. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -1205
  234. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -280
  235. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -425
  236. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -172
  237. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -622
  238. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -849
  239. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -1300
  240. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -175
  241. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -734
  242. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -37
  243. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -100
  244. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -801
  245. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -849
  246. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -693
  247. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -111
  248. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -162
  249. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -226
  250. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -41
  251. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -196
  252. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -227
  253. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -181
  254. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -791
  255. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -47
  256. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -760
  257. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -634
  258. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -136
  259. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -84
  260. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -681
  261. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -64
  262. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -51
  263. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -476
  264. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -163
  265. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -1
  266. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -101
  267. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -477
  268. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -490
  269. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -415
  270. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -223
  271. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -331
  272. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -562
  273. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -436
  274. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -561
  275. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -193
  276. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -420
  277. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -448
  278. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -32
  279. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -33
  280. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -420
  281. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -76
  282. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -129
  283. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -54
  284. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -118
  285. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -149
  286. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -506
  287. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -75
  288. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -519
  289. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -647
  290. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -62
  291. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -796
  292. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -31
  293. data/vendor/kreuzberg/tests/api_embed.rs +360 -0
  294. data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -52
  295. data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +471 -0
  296. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +289 -0
  297. data/vendor/kreuzberg/tests/api_tests.rs +1472 -966
  298. data/vendor/kreuzberg/tests/archive_integration.rs +545 -543
  299. data/vendor/kreuzberg/tests/batch_orchestration.rs +587 -556
  300. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +154 -0
  301. data/vendor/kreuzberg/tests/batch_processing.rs +328 -316
  302. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -421
  303. data/vendor/kreuzberg/tests/concurrency_stress.rs +541 -525
  304. data/vendor/kreuzberg/tests/config_features.rs +612 -598
  305. data/vendor/kreuzberg/tests/config_integration_test.rs +753 -0
  306. data/vendor/kreuzberg/tests/config_loading_tests.rs +416 -415
  307. data/vendor/kreuzberg/tests/core_integration.rs +519 -510
  308. data/vendor/kreuzberg/tests/csv_integration.rs +414 -414
  309. data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +294 -0
  310. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +500 -498
  311. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -122
  312. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -370
  313. data/vendor/kreuzberg/tests/email_integration.rs +327 -325
  314. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -275
  315. data/vendor/kreuzberg/tests/error_handling.rs +402 -393
  316. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -228
  317. data/vendor/kreuzberg/tests/format_integration.rs +165 -159
  318. data/vendor/kreuzberg/tests/helpers/mod.rs +202 -142
  319. data/vendor/kreuzberg/tests/html_table_test.rs +551 -551
  320. data/vendor/kreuzberg/tests/image_integration.rs +255 -253
  321. data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -139
  322. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -639
  323. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -704
  324. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -479
  325. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -509
  326. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -496
  327. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -490
  328. data/vendor/kreuzberg/tests/mime_detection.rs +429 -428
  329. data/vendor/kreuzberg/tests/ocr_configuration.rs +514 -510
  330. data/vendor/kreuzberg/tests/ocr_errors.rs +698 -676
  331. data/vendor/kreuzberg/tests/ocr_language_registry.rs +191 -0
  332. data/vendor/kreuzberg/tests/ocr_quality.rs +629 -627
  333. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -469
  334. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +674 -695
  335. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -616
  336. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -822
  337. data/vendor/kreuzberg/tests/page_markers.rs +297 -0
  338. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +301 -0
  339. data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +589 -0
  340. data/vendor/kreuzberg/tests/pdf_integration.rs +45 -43
  341. data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +301 -0
  342. data/vendor/kreuzberg/tests/pdf_text_merging.rs +475 -0
  343. data/vendor/kreuzberg/tests/pdfium_linking.rs +340 -0
  344. data/vendor/kreuzberg/tests/pipeline_integration.rs +1446 -1411
  345. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +776 -771
  346. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +577 -560
  347. data/vendor/kreuzberg/tests/plugin_system.rs +927 -921
  348. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -783
  349. data/vendor/kreuzberg/tests/registry_integration_tests.rs +587 -586
  350. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +694 -692
  351. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +775 -776
  352. data/vendor/kreuzberg/tests/security_validation.rs +416 -415
  353. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -888
  354. data/vendor/kreuzberg/tests/test_fastembed.rs +631 -609
  355. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1260 -1259
  356. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +648 -647
  357. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -87
  358. data/vendor/kreuzberg-ffi/Cargo.toml +67 -0
  359. data/vendor/kreuzberg-ffi/README.md +851 -0
  360. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +227 -0
  361. data/vendor/kreuzberg-ffi/build.rs +168 -0
  362. data/vendor/kreuzberg-ffi/cbindgen.toml +37 -0
  363. data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +12 -0
  364. data/vendor/kreuzberg-ffi/kreuzberg.h +3012 -0
  365. data/vendor/kreuzberg-ffi/src/batch_streaming.rs +588 -0
  366. data/vendor/kreuzberg-ffi/src/config.rs +1341 -0
  367. data/vendor/kreuzberg-ffi/src/error.rs +901 -0
  368. data/vendor/kreuzberg-ffi/src/extraction.rs +555 -0
  369. data/vendor/kreuzberg-ffi/src/helpers.rs +879 -0
  370. data/vendor/kreuzberg-ffi/src/lib.rs +977 -0
  371. data/vendor/kreuzberg-ffi/src/memory.rs +493 -0
  372. data/vendor/kreuzberg-ffi/src/mime.rs +329 -0
  373. data/vendor/kreuzberg-ffi/src/panic_shield.rs +265 -0
  374. data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +442 -0
  375. data/vendor/kreuzberg-ffi/src/plugins/mod.rs +14 -0
  376. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +628 -0
  377. data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +438 -0
  378. data/vendor/kreuzberg-ffi/src/plugins/validator.rs +329 -0
  379. data/vendor/kreuzberg-ffi/src/result.rs +510 -0
  380. data/vendor/kreuzberg-ffi/src/result_pool.rs +639 -0
  381. data/vendor/kreuzberg-ffi/src/result_view.rs +773 -0
  382. data/vendor/kreuzberg-ffi/src/string_intern.rs +568 -0
  383. data/vendor/kreuzberg-ffi/src/types.rs +363 -0
  384. data/vendor/kreuzberg-ffi/src/util.rs +210 -0
  385. data/vendor/kreuzberg-ffi/src/validation.rs +848 -0
  386. data/vendor/kreuzberg-ffi/tests.disabled/README.md +48 -0
  387. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +299 -0
  388. data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +346 -0
  389. data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +232 -0
  390. data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +470 -0
  391. data/vendor/kreuzberg-tesseract/.commitlintrc.json +13 -0
  392. data/vendor/kreuzberg-tesseract/.crate-ignore +2 -0
  393. data/vendor/kreuzberg-tesseract/Cargo.lock +2933 -0
  394. data/vendor/kreuzberg-tesseract/Cargo.toml +57 -0
  395. data/vendor/{rb-sys/LICENSE-MIT → kreuzberg-tesseract/LICENSE} +22 -21
  396. data/vendor/kreuzberg-tesseract/README.md +399 -0
  397. data/vendor/kreuzberg-tesseract/build.rs +1127 -0
  398. data/vendor/kreuzberg-tesseract/patches/README.md +71 -0
  399. data/vendor/kreuzberg-tesseract/patches/tesseract.diff +199 -0
  400. data/vendor/kreuzberg-tesseract/src/api.rs +1371 -0
  401. data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +77 -0
  402. data/vendor/kreuzberg-tesseract/src/enums.rs +297 -0
  403. data/vendor/kreuzberg-tesseract/src/error.rs +81 -0
  404. data/vendor/kreuzberg-tesseract/src/lib.rs +145 -0
  405. data/vendor/kreuzberg-tesseract/src/monitor.rs +57 -0
  406. data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +197 -0
  407. data/vendor/kreuzberg-tesseract/src/page_iterator.rs +253 -0
  408. data/vendor/kreuzberg-tesseract/src/result_iterator.rs +286 -0
  409. data/vendor/kreuzberg-tesseract/src/result_renderer.rs +183 -0
  410. data/vendor/kreuzberg-tesseract/tests/integration_test.rs +211 -0
  411. metadata +196 -45
  412. data/vendor/kreuzberg/benches/otel_overhead.rs +0 -48
  413. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  414. data/vendor/rb-sys/.cargo-ok +0 -1
  415. data/vendor/rb-sys/.cargo_vcs_info.json +0 -6
  416. data/vendor/rb-sys/Cargo.lock +0 -393
  417. data/vendor/rb-sys/Cargo.toml +0 -70
  418. data/vendor/rb-sys/Cargo.toml.orig +0 -57
  419. data/vendor/rb-sys/LICENSE-APACHE +0 -190
  420. data/vendor/rb-sys/bin/release.sh +0 -21
  421. data/vendor/rb-sys/build/features.rs +0 -108
  422. data/vendor/rb-sys/build/main.rs +0 -246
  423. data/vendor/rb-sys/build/stable_api_config.rs +0 -153
  424. data/vendor/rb-sys/build/version.rs +0 -48
  425. data/vendor/rb-sys/readme.md +0 -36
  426. data/vendor/rb-sys/src/bindings.rs +0 -21
  427. data/vendor/rb-sys/src/hidden.rs +0 -11
  428. data/vendor/rb-sys/src/lib.rs +0 -34
  429. data/vendor/rb-sys/src/macros.rs +0 -371
  430. data/vendor/rb-sys/src/memory.rs +0 -53
  431. data/vendor/rb-sys/src/ruby_abi_version.rs +0 -38
  432. data/vendor/rb-sys/src/special_consts.rs +0 -31
  433. data/vendor/rb-sys/src/stable_api/compiled.c +0 -179
  434. data/vendor/rb-sys/src/stable_api/compiled.rs +0 -257
  435. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
  436. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +0 -316
  437. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +0 -324
  438. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +0 -317
  439. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +0 -315
  440. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +0 -326
  441. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +0 -327
  442. data/vendor/rb-sys/src/stable_api.rs +0 -261
  443. data/vendor/rb-sys/src/symbol.rs +0 -31
  444. data/vendor/rb-sys/src/tracking_allocator.rs +0 -332
  445. data/vendor/rb-sys/src/utils.rs +0 -89
  446. data/vendor/rb-sys/src/value_type.rs +0 -7
@@ -1,956 +1,967 @@
1
- //! Validator plugin trait.
2
- //!
3
- //! This module defines the trait for implementing custom validation logic.
4
-
5
- use crate::Result;
6
- use crate::core::config::ExtractionConfig;
7
- use crate::plugins::Plugin;
8
- use crate::types::ExtractionResult;
9
- use async_trait::async_trait;
10
- use std::sync::Arc;
11
-
12
- /// Trait for validator plugins.
13
- ///
14
- /// Validators check extraction results for quality, completeness, or correctness.
15
- /// Unlike post-processors, validator errors **fail fast** - if a validator returns
16
- /// an error, the extraction fails immediately.
17
- ///
18
- /// # Use Cases
19
- ///
20
- /// - **Quality Gates**: Ensure extracted content meets minimum quality standards
21
- /// - **Compliance**: Verify content meets regulatory requirements
22
- /// - **Content Filtering**: Reject documents containing unwanted content
23
- /// - **Format Validation**: Verify extracted content structure
24
- /// - **Security Checks**: Scan for malicious content
25
- ///
26
- /// # Error Handling
27
- ///
28
- /// Validator errors are **fatal** - they cause the extraction to fail and bubble up
29
- /// to the caller. Use validators for hard requirements that must be met.
30
- ///
31
- /// For non-fatal checks, use post-processors instead.
32
- ///
33
- /// # Thread Safety
34
- ///
35
- /// Validators must be thread-safe (`Send + Sync`).
36
- ///
37
- /// # Example
38
- ///
39
- /// ```rust
40
- /// use kreuzberg::plugins::{Plugin, Validator};
41
- /// use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
42
- /// use async_trait::async_trait;
43
- ///
44
- /// /// Validate that extracted content has minimum length
45
- /// struct MinimumLengthValidator {
46
- /// min_length: usize,
47
- /// }
48
- ///
49
- /// impl Plugin for MinimumLengthValidator {
50
- /// fn name(&self) -> &str { "min-length-validator" }
51
- /// fn version(&self) -> String { "1.0.0".to_string() }
52
- /// fn initialize(&self) -> Result<()> { Ok(()) }
53
- /// fn shutdown(&self) -> Result<()> { Ok(()) }
54
- /// }
55
- ///
56
- /// #[async_trait]
57
- /// impl Validator for MinimumLengthValidator {
58
- /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
59
- /// -> Result<()> {
60
- /// if result.content.len() < self.min_length {
61
- /// return Err(KreuzbergError::validation(format!(
62
- /// "Content too short: {} < {} characters",
63
- /// result.content.len(),
64
- /// self.min_length
65
- /// )));
66
- /// }
67
- /// Ok(())
68
- /// }
69
- /// }
70
- /// ```
71
- #[cfg_attr(not(target_arch = "wasm32"), async_trait)]
72
- #[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
73
- pub trait Validator: Plugin {
74
- /// Validate an extraction result.
75
- ///
76
- /// Check the extraction result and return `Ok(())` if valid, or an error
77
- /// if validation fails.
78
- ///
79
- /// # Arguments
80
- ///
81
- /// * `result` - The extraction result to validate
82
- /// * `config` - Extraction configuration
83
- ///
84
- /// # Returns
85
- ///
86
- /// - `Ok(())` if validation passes
87
- /// - `Err(...)` if validation fails (extraction will fail)
88
- ///
89
- /// # Errors
90
- ///
91
- /// - `KreuzbergError::Validation` - Validation failed
92
- /// - Any other error type appropriate for the failure
93
- ///
94
- /// # Example - Content Length Validation
95
- ///
96
- /// ```rust
97
- /// # use kreuzberg::plugins::{Plugin, Validator};
98
- /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
99
- /// # use async_trait::async_trait;
100
- /// # struct ContentLengthValidator { min: usize, max: usize }
101
- /// # impl Plugin for ContentLengthValidator {
102
- /// # fn name(&self) -> &str { "length-validator" }
103
- /// # fn version(&self) -> String { "1.0.0".to_string() }
104
- /// # fn initialize(&self) -> Result<()> { Ok(()) }
105
- /// # fn shutdown(&self) -> Result<()> { Ok(()) }
106
- /// # }
107
- /// # #[async_trait]
108
- /// # impl Validator for ContentLengthValidator {
109
- /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
110
- /// -> Result<()> {
111
- /// let length = result.content.len();
112
- ///
113
- /// if length < self.min {
114
- /// return Err(KreuzbergError::validation(format!(
115
- /// "Content too short: {} < {} characters",
116
- /// length, self.min
117
- /// )));
118
- /// }
119
- ///
120
- /// if length > self.max {
121
- /// return Err(KreuzbergError::validation(format!(
122
- /// "Content too long: {} > {} characters",
123
- /// length, self.max
124
- /// )));
125
- /// }
126
- ///
127
- /// Ok(())
128
- /// }
129
- /// # }
130
- /// ```
131
- ///
132
- /// # Example - Quality Score Validation
133
- ///
134
- /// ```rust
135
- /// # use kreuzberg::plugins::{Plugin, Validator};
136
- /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
137
- /// # use async_trait::async_trait;
138
- /// # struct QualityValidator { min_score: f64 }
139
- /// # impl Plugin for QualityValidator {
140
- /// # fn name(&self) -> &str { "quality-validator" }
141
- /// # fn version(&self) -> String { "1.0.0".to_string() }
142
- /// # fn initialize(&self) -> Result<()> { Ok(()) }
143
- /// # fn shutdown(&self) -> Result<()> { Ok(()) }
144
- /// # }
145
- /// # #[async_trait]
146
- /// # impl Validator for QualityValidator {
147
- /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
148
- /// -> Result<()> {
149
- /// // Check if quality_score exists in metadata
150
- /// let score = result.metadata
151
- /// .additional
152
- /// .get("quality_score")
153
- /// .and_then(|v| v.as_f64())
154
- /// .unwrap_or(0.0);
155
- ///
156
- /// if score < self.min_score {
157
- /// return Err(KreuzbergError::validation(format!(
158
- /// "Quality score too low: {} < {}",
159
- /// score, self.min_score
160
- /// )));
161
- /// }
162
- ///
163
- /// Ok(())
164
- /// }
165
- /// # }
166
- /// ```
167
- ///
168
- /// # Example - Security Validation
169
- ///
170
- /// ```rust
171
- /// # use kreuzberg::plugins::{Plugin, Validator};
172
- /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
173
- /// # use async_trait::async_trait;
174
- /// # struct SecurityValidator { blocked_patterns: Vec<String> }
175
- /// # impl Plugin for SecurityValidator {
176
- /// # fn name(&self) -> &str { "security-validator" }
177
- /// # fn version(&self) -> String { "1.0.0".to_string() }
178
- /// # fn initialize(&self) -> Result<()> { Ok(()) }
179
- /// # fn shutdown(&self) -> Result<()> { Ok(()) }
180
- /// # }
181
- /// # #[async_trait]
182
- /// # impl Validator for SecurityValidator {
183
- /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
184
- /// -> Result<()> {
185
- /// // Check for blocked patterns
186
- /// for pattern in &self.blocked_patterns {
187
- /// if result.content.contains(pattern) {
188
- /// return Err(KreuzbergError::validation(format!(
189
- /// "Content contains blocked pattern: {}",
190
- /// pattern
191
- /// )));
192
- /// }
193
- /// }
194
- ///
195
- /// Ok(())
196
- /// }
197
- /// # }
198
- /// ```
199
- async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig) -> Result<()>;
200
-
201
- /// Optional: Check if this validator should run for a given result.
202
- ///
203
- /// Allows conditional validation based on MIME type, metadata, or content.
204
- /// Defaults to `true` (always run).
205
- ///
206
- /// # Arguments
207
- ///
208
- /// * `result` - The extraction result to check
209
- /// * `config` - Extraction configuration
210
- ///
211
- /// # Returns
212
- ///
213
- /// `true` if the validator should run, `false` to skip.
214
- ///
215
- /// # Example
216
- ///
217
- /// ```rust
218
- /// # use kreuzberg::plugins::{Plugin, Validator};
219
- /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
220
- /// # use async_trait::async_trait;
221
- /// # struct PdfValidator;
222
- /// # impl Plugin for PdfValidator {
223
- /// # fn name(&self) -> &str { "pdf-validator" }
224
- /// # fn version(&self) -> String { "1.0.0".to_string() }
225
- /// # fn initialize(&self) -> Result<()> { Ok(()) }
226
- /// # fn shutdown(&self) -> Result<()> { Ok(()) }
227
- /// # }
228
- /// # #[async_trait]
229
- /// # impl Validator for PdfValidator {
230
- /// # async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> { Ok(()) }
231
- /// /// Only validate PDF documents
232
- /// fn should_validate(&self, result: &ExtractionResult, config: &ExtractionConfig) -> bool {
233
- /// result.mime_type == "application/pdf"
234
- /// }
235
- /// # }
236
- /// ```
237
- fn should_validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> bool {
238
- true
239
- }
240
-
241
- /// Optional: Get the validation priority.
242
- ///
243
- /// Higher priority validators run first. Useful for ordering validation checks
244
- /// (e.g., run cheap validations before expensive ones).
245
- ///
246
- /// Default priority is 50.
247
- ///
248
- /// # Returns
249
- ///
250
- /// Priority value (higher = runs earlier).
251
- ///
252
- /// # Example
253
- ///
254
- /// ```rust
255
- /// # use kreuzberg::plugins::{Plugin, Validator};
256
- /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
257
- /// # use async_trait::async_trait;
258
- /// # struct FastValidator;
259
- /// # impl Plugin for FastValidator {
260
- /// # fn name(&self) -> &str { "fast-validator" }
261
- /// # fn version(&self) -> String { "1.0.0".to_string() }
262
- /// # fn initialize(&self) -> Result<()> { Ok(()) }
263
- /// # fn shutdown(&self) -> Result<()> { Ok(()) }
264
- /// # }
265
- /// # #[async_trait]
266
- /// # impl Validator for FastValidator {
267
- /// # async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> { Ok(()) }
268
- /// /// Run this validator first (it's fast)
269
- /// fn priority(&self) -> i32 {
270
- /// 100
271
- /// }
272
- /// # }
273
- /// ```
274
- fn priority(&self) -> i32 {
275
- 50
276
- }
277
- }
278
-
279
- /// Register a validator with the global registry.
280
- ///
281
- /// The validator will be registered with its default priority and will be called
282
- /// during extraction validation. The validator's `name()` method is used as the
283
- /// registration name.
284
- ///
285
- /// # Arguments
286
- ///
287
- /// * `validator` - The validator implementation wrapped in Arc
288
- ///
289
- /// # Returns
290
- ///
291
- /// - `Ok(())` if registration succeeded
292
- /// - `Err(...)` if validation failed or initialization failed
293
- ///
294
- /// # Errors
295
- ///
296
- /// - `KreuzbergError::Validation` - Invalid validator name (empty or contains whitespace)
297
- /// - Any error from the validator's `initialize()` method
298
- ///
299
- /// # Example
300
- ///
301
- /// ```rust
302
- /// use kreuzberg::plugins::{Plugin, Validator, register_validator};
303
- /// use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
304
- /// use async_trait::async_trait;
305
- /// use std::sync::Arc;
306
- ///
307
- /// struct MinLengthValidator { min_length: usize }
308
- ///
309
- /// impl Plugin for MinLengthValidator {
310
- /// fn name(&self) -> &str { "min-length" }
311
- /// fn version(&self) -> String { "1.0.0".to_string() }
312
- /// fn initialize(&self) -> Result<()> { Ok(()) }
313
- /// fn shutdown(&self) -> Result<()> { Ok(()) }
314
- /// }
315
- ///
316
- /// #[async_trait]
317
- /// impl Validator for MinLengthValidator {
318
- /// async fn validate(&self, result: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
319
- /// if result.content.len() < self.min_length {
320
- /// return Err(KreuzbergError::validation(
321
- /// format!("Content too short: {} < {}", result.content.len(), self.min_length)
322
- /// ));
323
- /// }
324
- /// Ok(())
325
- /// }
326
- /// }
327
- ///
328
- /// # tokio_test::block_on(async {
329
- /// let validator = Arc::new(MinLengthValidator { min_length: 10 });
330
- /// register_validator(validator)?;
331
- /// # Ok::<(), KreuzbergError>(())
332
- /// # });
333
- /// ```
334
- pub fn register_validator(validator: Arc<dyn Validator>) -> crate::Result<()> {
335
- use crate::plugins::registry::get_validator_registry;
336
-
337
- let registry = get_validator_registry();
338
- let mut registry = registry
339
- .write()
340
- .expect("~keep Failed to acquire write lock on validator registry"); // ~keep
341
-
342
- registry.register(validator)
343
- }
344
-
345
- /// Unregister a validator by name.
346
- ///
347
- /// Removes the validator from the global registry and calls its `shutdown()` method.
348
- ///
349
- /// # Arguments
350
- ///
351
- /// * `name` - Name of the validator to unregister
352
- ///
353
- /// # Returns
354
- ///
355
- /// - `Ok(())` if the validator was unregistered or didn't exist
356
- /// - `Err(...)` if the shutdown method failed
357
- ///
358
- /// # Example
359
- ///
360
- /// ```rust
361
- /// use kreuzberg::plugins::unregister_validator;
362
- ///
363
- /// # tokio_test::block_on(async {
364
- /// unregister_validator("min-length")?;
365
- /// # Ok::<(), kreuzberg::KreuzbergError>(())
366
- /// # });
367
- /// ```
368
- pub fn unregister_validator(name: &str) -> crate::Result<()> {
369
- use crate::plugins::registry::get_validator_registry;
370
-
371
- let registry = get_validator_registry();
372
- let mut registry = registry
373
- .write()
374
- .expect("~keep Failed to acquire write lock on validator registry"); // ~keep
375
-
376
- registry.remove(name)
377
- }
378
-
379
- /// List all registered validators.
380
- ///
381
- /// Returns the names of all validators currently registered in the global registry.
382
- ///
383
- /// # Returns
384
- ///
385
- /// A vector of validator names.
386
- ///
387
- /// # Example
388
- ///
389
- /// ```rust
390
- /// use kreuzberg::plugins::list_validators;
391
- ///
392
- /// # tokio_test::block_on(async {
393
- /// let validators = list_validators()?;
394
- /// for name in validators {
395
- /// println!("Registered validator: {}", name);
396
- /// }
397
- /// # Ok::<(), kreuzberg::KreuzbergError>(())
398
- /// # });
399
- /// ```
400
- pub fn list_validators() -> crate::Result<Vec<String>> {
401
- use crate::plugins::registry::get_validator_registry;
402
-
403
- let registry = get_validator_registry();
404
- let registry = registry
405
- .read()
406
- .expect("~keep Failed to acquire read lock on validator registry"); // ~keep
407
-
408
- Ok(registry.list())
409
- }
410
-
411
- /// Clear all validators from the global registry.
412
- ///
413
- /// Removes all validators and calls their `shutdown()` methods.
414
- ///
415
- /// # Returns
416
- ///
417
- /// - `Ok(())` if all validators were cleared successfully
418
- /// - `Err(...)` if any shutdown method failed
419
- ///
420
- /// # Example
421
- ///
422
- /// ```rust
423
- /// use kreuzberg::plugins::clear_validators;
424
- ///
425
- /// # tokio_test::block_on(async {
426
- /// clear_validators()?;
427
- /// # Ok::<(), kreuzberg::KreuzbergError>(())
428
- /// # });
429
- /// ```
430
- pub fn clear_validators() -> crate::Result<()> {
431
- use crate::plugins::registry::get_validator_registry;
432
-
433
- let registry = get_validator_registry();
434
- let mut registry = registry
435
- .write()
436
- .expect("~keep Failed to acquire write lock on validator registry"); // ~keep
437
-
438
- registry.shutdown_all()
439
- }
440
-
441
- #[cfg(test)]
442
- mod tests {
443
- use super::*;
444
- use crate::KreuzbergError;
445
- use std::collections::HashMap;
446
-
447
- struct MockValidator {
448
- should_fail: bool,
449
- }
450
-
451
- impl Plugin for MockValidator {
452
- fn name(&self) -> &str {
453
- "mock-validator"
454
- }
455
-
456
- fn version(&self) -> String {
457
- "1.0.0".to_string()
458
- }
459
-
460
- fn initialize(&self) -> Result<()> {
461
- Ok(())
462
- }
463
-
464
- fn shutdown(&self) -> Result<()> {
465
- Ok(())
466
- }
467
- }
468
-
469
- #[async_trait]
470
- impl Validator for MockValidator {
471
- async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
472
- if self.should_fail {
473
- Err(KreuzbergError::validation("Validation failed".to_string()))
474
- } else {
475
- Ok(())
476
- }
477
- }
478
- }
479
-
480
- #[tokio::test]
481
- async fn test_validator_success() {
482
- let validator = MockValidator { should_fail: false };
483
-
484
- let result = ExtractionResult {
485
- content: "test content".to_string(),
486
- mime_type: "text/plain".to_string(),
487
- metadata: crate::types::Metadata::default(),
488
- tables: vec![],
489
- detected_languages: None,
490
- chunks: None,
491
- images: None,
492
- };
493
-
494
- let config = ExtractionConfig::default();
495
- assert!(validator.validate(&result, &config).await.is_ok());
496
- }
497
-
498
- #[tokio::test]
499
- async fn test_validator_failure() {
500
- let validator = MockValidator { should_fail: true };
501
-
502
- let result = ExtractionResult {
503
- content: "test content".to_string(),
504
- mime_type: "text/plain".to_string(),
505
- metadata: crate::types::Metadata::default(),
506
- tables: vec![],
507
- detected_languages: None,
508
- chunks: None,
509
- images: None,
510
- };
511
-
512
- let config = ExtractionConfig::default();
513
- let validation_result = validator.validate(&result, &config).await;
514
-
515
- assert!(matches!(validation_result, Err(KreuzbergError::Validation { .. })));
516
- }
517
-
518
- #[test]
519
- fn test_validator_should_validate_default() {
520
- let validator = MockValidator { should_fail: false };
521
-
522
- let result = ExtractionResult {
523
- content: "test".to_string(),
524
- mime_type: "text/plain".to_string(),
525
- metadata: crate::types::Metadata::default(),
526
- tables: vec![],
527
- detected_languages: None,
528
- chunks: None,
529
- images: None,
530
- };
531
-
532
- let config = ExtractionConfig::default();
533
-
534
- assert!(validator.should_validate(&result, &config));
535
- }
536
-
537
- #[test]
538
- fn test_validator_priority_default() {
539
- let validator = MockValidator { should_fail: false };
540
- assert_eq!(validator.priority(), 50);
541
- }
542
-
543
- #[tokio::test]
544
- async fn test_validator_plugin_interface() {
545
- let validator = MockValidator { should_fail: false };
546
-
547
- assert_eq!(validator.name(), "mock-validator");
548
- assert_eq!(validator.version(), "1.0.0");
549
- assert!(validator.initialize().is_ok());
550
- assert!(validator.shutdown().is_ok());
551
- }
552
-
553
- #[tokio::test]
554
- async fn test_validator_empty_content() {
555
- let validator = MockValidator { should_fail: false };
556
-
557
- let result = ExtractionResult {
558
- content: String::new(),
559
- mime_type: "text/plain".to_string(),
560
- metadata: crate::types::Metadata::default(),
561
- tables: vec![],
562
- detected_languages: None,
563
- chunks: None,
564
- images: None,
565
- };
566
-
567
- let config = ExtractionConfig::default();
568
- assert!(validator.validate(&result, &config).await.is_ok());
569
- }
570
-
571
- #[test]
572
- fn test_validator_should_validate_conditional() {
573
- struct PdfOnlyValidator;
574
-
575
- impl Plugin for PdfOnlyValidator {
576
- fn name(&self) -> &str {
577
- "pdf-only"
578
- }
579
- fn version(&self) -> String {
580
- "1.0.0".to_string()
581
- }
582
- fn initialize(&self) -> Result<()> {
583
- Ok(())
584
- }
585
- fn shutdown(&self) -> Result<()> {
586
- Ok(())
587
- }
588
- }
589
-
590
- #[async_trait]
591
- impl Validator for PdfOnlyValidator {
592
- async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
593
- Ok(())
594
- }
595
-
596
- fn should_validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> bool {
597
- result.mime_type == "application/pdf"
598
- }
599
- }
600
-
601
- let validator = PdfOnlyValidator;
602
- let config = ExtractionConfig::default();
603
-
604
- let pdf_result = ExtractionResult {
605
- content: "test".to_string(),
606
- mime_type: "application/pdf".to_string(),
607
- metadata: crate::types::Metadata::default(),
608
- tables: vec![],
609
- detected_languages: None,
610
- chunks: None,
611
- images: None,
612
- };
613
-
614
- let txt_result = ExtractionResult {
615
- content: "test".to_string(),
616
- mime_type: "text/plain".to_string(),
617
- metadata: crate::types::Metadata::default(),
618
- tables: vec![],
619
- detected_languages: None,
620
- chunks: None,
621
- images: None,
622
- };
623
-
624
- assert!(validator.should_validate(&pdf_result, &config));
625
- assert!(!validator.should_validate(&txt_result, &config));
626
- }
627
-
628
- #[test]
629
- fn test_validator_priority_ranges() {
630
- struct HighPriorityValidator;
631
- struct LowPriorityValidator;
632
-
633
- impl Plugin for HighPriorityValidator {
634
- fn name(&self) -> &str {
635
- "high-priority"
636
- }
637
- fn version(&self) -> String {
638
- "1.0.0".to_string()
639
- }
640
- fn initialize(&self) -> Result<()> {
641
- Ok(())
642
- }
643
- fn shutdown(&self) -> Result<()> {
644
- Ok(())
645
- }
646
- }
647
-
648
- impl Plugin for LowPriorityValidator {
649
- fn name(&self) -> &str {
650
- "low-priority"
651
- }
652
- fn version(&self) -> String {
653
- "1.0.0".to_string()
654
- }
655
- fn initialize(&self) -> Result<()> {
656
- Ok(())
657
- }
658
- fn shutdown(&self) -> Result<()> {
659
- Ok(())
660
- }
661
- }
662
-
663
- #[async_trait]
664
- impl Validator for HighPriorityValidator {
665
- async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
666
- Ok(())
667
- }
668
-
669
- fn priority(&self) -> i32 {
670
- 100
671
- }
672
- }
673
-
674
- #[async_trait]
675
- impl Validator for LowPriorityValidator {
676
- async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
677
- Ok(())
678
- }
679
-
680
- fn priority(&self) -> i32 {
681
- 10
682
- }
683
- }
684
-
685
- let high = HighPriorityValidator;
686
- let low = LowPriorityValidator;
687
-
688
- assert_eq!(high.priority(), 100);
689
- assert_eq!(low.priority(), 10);
690
- assert!(high.priority() > low.priority());
691
- }
692
-
693
- #[tokio::test]
694
- async fn test_validator_error_message() {
695
- let validator = MockValidator { should_fail: true };
696
-
697
- let result = ExtractionResult {
698
- content: "test".to_string(),
699
- mime_type: "text/plain".to_string(),
700
- metadata: crate::types::Metadata::default(),
701
- tables: vec![],
702
- detected_languages: None,
703
- chunks: None,
704
- images: None,
705
- };
706
-
707
- let config = ExtractionConfig::default();
708
- let err = validator.validate(&result, &config).await.unwrap_err();
709
-
710
- match err {
711
- KreuzbergError::Validation { message: msg, .. } => {
712
- assert_eq!(msg, "Validation failed");
713
- }
714
- _ => panic!("Expected Validation error"),
715
- }
716
- }
717
-
718
- #[tokio::test]
719
- async fn test_validator_with_metadata() {
720
- let validator = MockValidator { should_fail: false };
721
-
722
- let mut additional = HashMap::new();
723
- additional.insert("quality_score".to_string(), serde_json::json!(0.95));
724
-
725
- let result = ExtractionResult {
726
- content: "test".to_string(),
727
- mime_type: "text/plain".to_string(),
728
- metadata: crate::types::Metadata {
729
- additional,
730
- ..Default::default()
731
- },
732
- tables: vec![],
733
- detected_languages: None,
734
- chunks: None,
735
- images: None,
736
- };
737
-
738
- let config = ExtractionConfig::default();
739
- assert!(validator.validate(&result, &config).await.is_ok());
740
- }
741
-
742
- #[tokio::test]
743
- async fn test_validator_with_tables() {
744
- use crate::types::Table;
745
-
746
- let validator = MockValidator { should_fail: false };
747
-
748
- let table = Table {
749
- cells: vec![vec!["A".to_string(), "B".to_string()]],
750
- markdown: "| A | B |".to_string(),
751
- page_number: 0,
752
- };
753
-
754
- let result = ExtractionResult {
755
- content: "test".to_string(),
756
- mime_type: "text/plain".to_string(),
757
- metadata: crate::types::Metadata::default(),
758
- tables: vec![table],
759
- detected_languages: None,
760
- chunks: None,
761
- images: None,
762
- };
763
-
764
- let config = ExtractionConfig::default();
765
- assert!(validator.validate(&result, &config).await.is_ok());
766
- }
767
-
768
- #[tokio::test]
769
- async fn test_validator_different_mime_types() {
770
- let validator = MockValidator { should_fail: false };
771
- let config = ExtractionConfig::default();
772
-
773
- let mime_types = vec![
774
- "text/plain",
775
- "application/pdf",
776
- "application/json",
777
- "text/html",
778
- "image/png",
779
- ];
780
-
781
- for mime_type in mime_types {
782
- let result = ExtractionResult {
783
- content: "test".to_string(),
784
- mime_type: mime_type.to_string(),
785
- metadata: crate::types::Metadata::default(),
786
- tables: vec![],
787
- detected_languages: None,
788
- chunks: None,
789
- images: None,
790
- };
791
-
792
- assert!(validator.validate(&result, &config).await.is_ok());
793
- }
794
- }
795
-
796
- #[tokio::test]
797
- async fn test_validator_long_content() {
798
- let validator = MockValidator { should_fail: false };
799
-
800
- let result = ExtractionResult {
801
- content: "test content ".repeat(10000),
802
- mime_type: "text/plain".to_string(),
803
- metadata: crate::types::Metadata::default(),
804
- tables: vec![],
805
- detected_languages: None,
806
- chunks: None,
807
- images: None,
808
- };
809
-
810
- let config = ExtractionConfig::default();
811
- assert!(validator.validate(&result, &config).await.is_ok());
812
- }
813
-
814
- #[test]
815
- #[serial_test::serial]
816
- fn test_register_validator() {
817
- use std::sync::Arc;
818
-
819
- let validator = Arc::new(MockValidator { should_fail: false });
820
- let result = super::register_validator(validator);
821
- assert!(result.is_ok());
822
-
823
- let _ = super::unregister_validator("mock-validator");
824
- }
825
-
826
- #[test]
827
- #[serial_test::serial]
828
- fn test_unregister_validator() {
829
- use std::sync::Arc;
830
-
831
- let validator = Arc::new(MockValidator { should_fail: false });
832
- super::register_validator(validator).unwrap();
833
-
834
- let result = super::unregister_validator("mock-validator");
835
- assert!(result.is_ok());
836
- }
837
-
838
- #[test]
839
- #[serial_test::serial]
840
- fn test_unregister_nonexistent_validator() {
841
- let result = super::unregister_validator("nonexistent-validator-xyz");
842
- assert!(result.is_ok());
843
- }
844
-
845
- #[test]
846
- #[serial_test::serial]
847
- fn test_list_validators() {
848
- use std::sync::Arc;
849
-
850
- super::clear_validators().unwrap();
851
-
852
- let validator1 = Arc::new(MockValidator { should_fail: false });
853
- let validator2 = Arc::new(MockValidator { should_fail: false });
854
-
855
- let list_before = super::list_validators().unwrap();
856
- assert_eq!(list_before.len(), 0);
857
-
858
- super::register_validator(validator1).unwrap();
859
- super::register_validator(validator2).unwrap();
860
-
861
- let list = super::list_validators().unwrap();
862
- assert_eq!(list.len(), 1);
863
- assert!(list.contains(&"mock-validator".to_string()));
864
-
865
- super::unregister_validator("mock-validator").unwrap();
866
- }
867
-
868
- #[test]
869
- #[serial_test::serial]
870
- fn test_clear_validators() {
871
- use std::sync::Arc;
872
-
873
- super::clear_validators().unwrap();
874
-
875
- let validator1 = Arc::new(MockValidator { should_fail: false });
876
- let validator2 = Arc::new(MockValidator { should_fail: false });
877
-
878
- super::register_validator(validator1).unwrap();
879
- super::register_validator(validator2).unwrap();
880
-
881
- let list_before = super::list_validators().unwrap();
882
- assert!(!list_before.is_empty());
883
-
884
- let result = super::clear_validators();
885
- assert!(result.is_ok());
886
-
887
- let list = super::list_validators().unwrap();
888
- assert_eq!(list.len(), 0);
889
- }
890
-
891
- #[test]
892
- #[serial_test::serial]
893
- fn test_register_validator_with_invalid_name() {
894
- use std::sync::Arc;
895
-
896
- struct InvalidNameValidator;
897
- impl Plugin for InvalidNameValidator {
898
- fn name(&self) -> &str {
899
- "invalid name with spaces"
900
- }
901
- fn version(&self) -> String {
902
- "1.0.0".to_string()
903
- }
904
- fn initialize(&self) -> Result<()> {
905
- Ok(())
906
- }
907
- fn shutdown(&self) -> Result<()> {
908
- Ok(())
909
- }
910
- }
911
-
912
- #[async_trait]
913
- impl Validator for InvalidNameValidator {
914
- async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
915
- Ok(())
916
- }
917
- }
918
-
919
- let validator = Arc::new(InvalidNameValidator);
920
- let result = super::register_validator(validator);
921
- assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
922
- }
923
-
924
- #[test]
925
- #[serial_test::serial]
926
- fn test_register_validator_with_empty_name() {
927
- use std::sync::Arc;
928
-
929
- struct EmptyNameValidator;
930
- impl Plugin for EmptyNameValidator {
931
- fn name(&self) -> &str {
932
- ""
933
- }
934
- fn version(&self) -> String {
935
- "1.0.0".to_string()
936
- }
937
- fn initialize(&self) -> Result<()> {
938
- Ok(())
939
- }
940
- fn shutdown(&self) -> Result<()> {
941
- Ok(())
942
- }
943
- }
944
-
945
- #[async_trait]
946
- impl Validator for EmptyNameValidator {
947
- async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
948
- Ok(())
949
- }
950
- }
951
-
952
- let validator = Arc::new(EmptyNameValidator);
953
- let result = super::register_validator(validator);
954
- assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
955
- }
956
- }
1
+ //! Validator plugin trait.
2
+ //!
3
+ //! This module defines the trait for implementing custom validation logic.
4
+
5
+ use crate::Result;
6
+ use crate::core::config::ExtractionConfig;
7
+ use crate::plugins::Plugin;
8
+ use crate::types::ExtractionResult;
9
+ use async_trait::async_trait;
10
+ use std::sync::Arc;
11
+
12
/// Trait for validator plugins.
///
/// Validators check extraction results for quality, completeness, or correctness.
/// Unlike post-processors, validator errors **fail fast**: if a validator returns
/// an error, the extraction fails immediately.
///
/// # Use Cases
///
/// - **Quality Gates**: Ensure extracted content meets minimum quality standards
/// - **Compliance**: Verify content meets regulatory requirements
/// - **Content Filtering**: Reject documents containing unwanted content
/// - **Format Validation**: Verify extracted content structure
/// - **Security Checks**: Scan for malicious content
///
/// # Error Handling
///
/// Validator errors are **fatal** - they cause the extraction to fail and bubble up
/// to the caller. Use validators for hard requirements that must be met.
///
/// For non-fatal checks, use post-processors instead.
///
/// # Thread Safety
///
/// Validators must be thread-safe (`Send + Sync`). On `wasm32` targets the trait is
/// expanded with `async_trait(?Send)`, so the returned futures are not required to
/// be `Send` there.
///
/// # Example
///
/// ```rust
/// use kreuzberg::plugins::{Plugin, Validator};
/// use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
/// use async_trait::async_trait;
///
/// /// Validate that extracted content has minimum length
/// struct MinimumLengthValidator {
///     min_length: usize,
/// }
///
/// impl Plugin for MinimumLengthValidator {
///     fn name(&self) -> &str { "min-length-validator" }
///     fn version(&self) -> String { "1.0.0".to_string() }
///     fn initialize(&self) -> Result<()> { Ok(()) }
///     fn shutdown(&self) -> Result<()> { Ok(()) }
/// }
///
/// #[async_trait]
/// impl Validator for MinimumLengthValidator {
///     async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
///         -> Result<()> {
///         // `len()` counts UTF-8 bytes, not characters.
///         if result.content.len() < self.min_length {
///             return Err(KreuzbergError::validation(format!(
///                 "Content too short: {} < {} bytes",
///                 result.content.len(),
///                 self.min_length
///             )));
///         }
///         Ok(())
///     }
/// }
/// ```
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
pub trait Validator: Plugin {
    /// Validate an extraction result.
    ///
    /// Check the extraction result and return `Ok(())` if valid, or an error
    /// if validation fails.
    ///
    /// # Arguments
    ///
    /// * `result` - The extraction result to validate
    /// * `config` - Extraction configuration
    ///
    /// # Returns
    ///
    /// - `Ok(())` if validation passes
    /// - `Err(...)` if validation fails (extraction will fail)
    ///
    /// # Errors
    ///
    /// - `KreuzbergError::Validation` - Validation failed
    /// - Any other error type appropriate for the failure
    ///
    /// # Example - Content Length Validation
    ///
    /// ```rust
    /// # use kreuzberg::plugins::{Plugin, Validator};
    /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
    /// # use async_trait::async_trait;
    /// # struct ContentLengthValidator { min: usize, max: usize }
    /// # impl Plugin for ContentLengthValidator {
    /// #     fn name(&self) -> &str { "length-validator" }
    /// #     fn version(&self) -> String { "1.0.0".to_string() }
    /// #     fn initialize(&self) -> Result<()> { Ok(()) }
    /// #     fn shutdown(&self) -> Result<()> { Ok(()) }
    /// # }
    /// # #[async_trait]
    /// # impl Validator for ContentLengthValidator {
    /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
    ///     -> Result<()> {
    ///     // `len()` counts UTF-8 bytes, not characters.
    ///     let length = result.content.len();
    ///
    ///     if length < self.min {
    ///         return Err(KreuzbergError::validation(format!(
    ///             "Content too short: {} < {} bytes",
    ///             length, self.min
    ///         )));
    ///     }
    ///
    ///     if length > self.max {
    ///         return Err(KreuzbergError::validation(format!(
    ///             "Content too long: {} > {} bytes",
    ///             length, self.max
    ///         )));
    ///     }
    ///
    ///     Ok(())
    /// }
    /// # }
    /// ```
    ///
    /// # Example - Quality Score Validation
    ///
    /// ```rust
    /// # use kreuzberg::plugins::{Plugin, Validator};
    /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
    /// # use async_trait::async_trait;
    /// # struct QualityValidator { min_score: f64 }
    /// # impl Plugin for QualityValidator {
    /// #     fn name(&self) -> &str { "quality-validator" }
    /// #     fn version(&self) -> String { "1.0.0".to_string() }
    /// #     fn initialize(&self) -> Result<()> { Ok(()) }
    /// #     fn shutdown(&self) -> Result<()> { Ok(()) }
    /// # }
    /// # #[async_trait]
    /// # impl Validator for QualityValidator {
    /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
    ///     -> Result<()> {
    ///     // A missing or non-numeric `quality_score` is treated as 0.0
    ///     let score = result.metadata
    ///         .additional
    ///         .get("quality_score")
    ///         .and_then(|v| v.as_f64())
    ///         .unwrap_or(0.0);
    ///
    ///     if score < self.min_score {
    ///         return Err(KreuzbergError::validation(format!(
    ///             "Quality score too low: {} < {}",
    ///             score, self.min_score
    ///         )));
    ///     }
    ///
    ///     Ok(())
    /// }
    /// # }
    /// ```
    ///
    /// # Example - Security Validation
    ///
    /// ```rust
    /// # use kreuzberg::plugins::{Plugin, Validator};
    /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
    /// # use async_trait::async_trait;
    /// # struct SecurityValidator { blocked_patterns: Vec<String> }
    /// # impl Plugin for SecurityValidator {
    /// #     fn name(&self) -> &str { "security-validator" }
    /// #     fn version(&self) -> String { "1.0.0".to_string() }
    /// #     fn initialize(&self) -> Result<()> { Ok(()) }
    /// #     fn shutdown(&self) -> Result<()> { Ok(()) }
    /// # }
    /// # #[async_trait]
    /// # impl Validator for SecurityValidator {
    /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
    ///     -> Result<()> {
    ///     // Reject on the first blocked pattern found (plain substring match)
    ///     for pattern in &self.blocked_patterns {
    ///         if result.content.contains(pattern) {
    ///             return Err(KreuzbergError::validation(format!(
    ///                 "Content contains blocked pattern: {}",
    ///                 pattern
    ///             )));
    ///         }
    ///     }
    ///
    ///     Ok(())
    /// }
    /// # }
    /// ```
    async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig) -> Result<()>;

    /// Optional: Check if this validator should run for a given result.
    ///
    /// Allows conditional validation based on MIME type, metadata, or content.
    /// Defaults to `true` (always run).
    ///
    /// # Arguments
    ///
    /// * `result` - The extraction result to check
    /// * `config` - Extraction configuration
    ///
    /// # Returns
    ///
    /// `true` if the validator should run, `false` to skip.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use kreuzberg::plugins::{Plugin, Validator};
    /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
    /// # use async_trait::async_trait;
    /// # struct PdfValidator;
    /// # impl Plugin for PdfValidator {
    /// #     fn name(&self) -> &str { "pdf-validator" }
    /// #     fn version(&self) -> String { "1.0.0".to_string() }
    /// #     fn initialize(&self) -> Result<()> { Ok(()) }
    /// #     fn shutdown(&self) -> Result<()> { Ok(()) }
    /// # }
    /// # #[async_trait]
    /// # impl Validator for PdfValidator {
    /// #     async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> { Ok(()) }
    /// /// Only validate PDF documents
    /// fn should_validate(&self, result: &ExtractionResult, config: &ExtractionConfig) -> bool {
    ///     result.mime_type == "application/pdf"
    /// }
    /// # }
    /// ```
    fn should_validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> bool {
        // Default: no filtering, the validator applies to every result.
        true
    }

    /// Optional: Get the validation priority.
    ///
    /// Higher priority validators run first. Useful for ordering validation checks
    /// (e.g., run cheap validations before expensive ones).
    ///
    /// Default priority is 50.
    ///
    /// # Returns
    ///
    /// Priority value (higher = runs earlier).
    ///
    /// # Example
    ///
    /// ```rust
    /// # use kreuzberg::plugins::{Plugin, Validator};
    /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
    /// # use async_trait::async_trait;
    /// # struct FastValidator;
    /// # impl Plugin for FastValidator {
    /// #     fn name(&self) -> &str { "fast-validator" }
    /// #     fn version(&self) -> String { "1.0.0".to_string() }
    /// #     fn initialize(&self) -> Result<()> { Ok(()) }
    /// #     fn shutdown(&self) -> Result<()> { Ok(()) }
    /// # }
    /// # #[async_trait]
    /// # impl Validator for FastValidator {
    /// #     async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> { Ok(()) }
    /// /// Run this validator first (it's fast)
    /// fn priority(&self) -> i32 {
    ///     100
    /// }
    /// # }
    /// ```
    fn priority(&self) -> i32 {
        // Default priority, per the documentation above.
        50
    }
}
278
+
279
+ /// Register a validator with the global registry.
280
+ ///
281
+ /// The validator will be registered with its default priority and will be called
282
+ /// during extraction validation. The validator's `name()` method is used as the
283
+ /// registration name.
284
+ ///
285
+ /// # Arguments
286
+ ///
287
+ /// * `validator` - The validator implementation wrapped in Arc
288
+ ///
289
+ /// # Returns
290
+ ///
291
+ /// - `Ok(())` if registration succeeded
292
+ /// - `Err(...)` if validation failed or initialization failed
293
+ ///
294
+ /// # Errors
295
+ ///
296
+ /// - `KreuzbergError::Validation` - Invalid validator name (empty or contains whitespace)
297
+ /// - Any error from the validator's `initialize()` method
298
+ ///
299
+ /// # Example
300
+ ///
301
+ /// ```rust
302
+ /// use kreuzberg::plugins::{Plugin, Validator, register_validator};
303
+ /// use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
304
+ /// use async_trait::async_trait;
305
+ /// use std::sync::Arc;
306
+ ///
307
+ /// struct MinLengthValidator { min_length: usize }
308
+ ///
309
+ /// impl Plugin for MinLengthValidator {
310
+ /// fn name(&self) -> &str { "min-length" }
311
+ /// fn version(&self) -> String { "1.0.0".to_string() }
312
+ /// fn initialize(&self) -> Result<()> { Ok(()) }
313
+ /// fn shutdown(&self) -> Result<()> { Ok(()) }
314
+ /// }
315
+ ///
316
+ /// #[async_trait]
317
+ /// impl Validator for MinLengthValidator {
318
+ /// async fn validate(&self, result: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
319
+ /// if result.content.len() < self.min_length {
320
+ /// return Err(KreuzbergError::validation(
321
+ /// format!("Content too short: {} < {}", result.content.len(), self.min_length)
322
+ /// ));
323
+ /// }
324
+ /// Ok(())
325
+ /// }
326
+ /// }
327
+ ///
328
+ /// # tokio_test::block_on(async {
329
+ /// let validator = Arc::new(MinLengthValidator { min_length: 10 });
330
+ /// register_validator(validator)?;
331
+ /// # Ok::<(), KreuzbergError>(())
332
+ /// # });
333
+ /// ```
334
+ pub fn register_validator(validator: Arc<dyn Validator>) -> crate::Result<()> {
335
+ use crate::plugins::registry::get_validator_registry;
336
+
337
+ let registry = get_validator_registry();
338
+ let mut registry = registry
339
+ .write()
340
+ .expect("~keep Failed to acquire write lock on validator registry"); // ~keep
341
+
342
+ registry.register(validator)
343
+ }
344
+
345
+ /// Unregister a validator by name.
346
+ ///
347
+ /// Removes the validator from the global registry and calls its `shutdown()` method.
348
+ ///
349
+ /// # Arguments
350
+ ///
351
+ /// * `name` - Name of the validator to unregister
352
+ ///
353
+ /// # Returns
354
+ ///
355
+ /// - `Ok(())` if the validator was unregistered or didn't exist
356
+ /// - `Err(...)` if the shutdown method failed
357
+ ///
358
+ /// # Example
359
+ ///
360
+ /// ```rust
361
+ /// use kreuzberg::plugins::unregister_validator;
362
+ ///
363
+ /// # tokio_test::block_on(async {
364
+ /// unregister_validator("min-length")?;
365
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
366
+ /// # });
367
+ /// ```
368
+ pub fn unregister_validator(name: &str) -> crate::Result<()> {
369
+ use crate::plugins::registry::get_validator_registry;
370
+
371
+ let registry = get_validator_registry();
372
+ let mut registry = registry
373
+ .write()
374
+ .expect("~keep Failed to acquire write lock on validator registry"); // ~keep
375
+
376
+ registry.remove(name)
377
+ }
378
+
379
+ /// List all registered validators.
380
+ ///
381
+ /// Returns the names of all validators currently registered in the global registry.
382
+ ///
383
+ /// # Returns
384
+ ///
385
+ /// A vector of validator names.
386
+ ///
387
+ /// # Example
388
+ ///
389
+ /// ```rust
390
+ /// use kreuzberg::plugins::list_validators;
391
+ ///
392
+ /// # tokio_test::block_on(async {
393
+ /// let validators = list_validators()?;
394
+ /// for name in validators {
395
+ /// println!("Registered validator: {}", name);
396
+ /// }
397
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
398
+ /// # });
399
+ /// ```
400
+ pub fn list_validators() -> crate::Result<Vec<String>> {
401
+ use crate::plugins::registry::get_validator_registry;
402
+
403
+ let registry = get_validator_registry();
404
+ let registry = registry
405
+ .read()
406
+ .expect("~keep Failed to acquire read lock on validator registry"); // ~keep
407
+
408
+ Ok(registry.list())
409
+ }
410
+
411
+ /// Clear all validators from the global registry.
412
+ ///
413
+ /// Removes all validators and calls their `shutdown()` methods.
414
+ ///
415
+ /// # Returns
416
+ ///
417
+ /// - `Ok(())` if all validators were cleared successfully
418
+ /// - `Err(...)` if any shutdown method failed
419
+ ///
420
+ /// # Example
421
+ ///
422
+ /// ```rust
423
+ /// use kreuzberg::plugins::clear_validators;
424
+ ///
425
+ /// # tokio_test::block_on(async {
426
+ /// clear_validators()?;
427
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
428
+ /// # });
429
+ /// ```
430
+ pub fn clear_validators() -> crate::Result<()> {
431
+ use crate::plugins::registry::get_validator_registry;
432
+
433
+ let registry = get_validator_registry();
434
+ let mut registry = registry
435
+ .write()
436
+ .expect("~keep Failed to acquire write lock on validator registry"); // ~keep
437
+
438
+ registry.shutdown_all()
439
+ }
440
+
441
#[cfg(test)]
mod tests {
    use super::*;
    use crate::KreuzbergError;
    use std::collections::HashMap;
    use std::sync::Arc;

    /// Build a minimal `ExtractionResult` fixture with the given content and MIME
    /// type; every other field is empty or default. Tests that need a non-default
    /// field override it with struct-update syntax.
    fn make_result(content: &str, mime_type: &str) -> ExtractionResult {
        ExtractionResult {
            content: content.to_string(),
            mime_type: mime_type.to_string(),
            metadata: crate::types::Metadata::default(),
            tables: vec![],
            detected_languages: None,
            chunks: None,
            images: None,
            pages: None,
        }
    }

    /// Validator whose `validate` outcome is fixed by `should_fail`.
    struct MockValidator {
        should_fail: bool,
    }

    impl Plugin for MockValidator {
        fn name(&self) -> &str {
            "mock-validator"
        }

        fn version(&self) -> String {
            "1.0.0".to_string()
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl Validator for MockValidator {
        async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
            if self.should_fail {
                Err(KreuzbergError::validation("Validation failed".to_string()))
            } else {
                Ok(())
            }
        }
    }

    #[tokio::test]
    async fn test_validator_success() {
        let validator = MockValidator { should_fail: false };
        let result = make_result("test content", "text/plain");

        let config = ExtractionConfig::default();
        assert!(validator.validate(&result, &config).await.is_ok());
    }

    #[tokio::test]
    async fn test_validator_failure() {
        let validator = MockValidator { should_fail: true };
        let result = make_result("test content", "text/plain");

        let config = ExtractionConfig::default();
        let validation_result = validator.validate(&result, &config).await;

        assert!(matches!(validation_result, Err(KreuzbergError::Validation { .. })));
    }

    #[test]
    fn test_validator_should_validate_default() {
        let validator = MockValidator { should_fail: false };
        let result = make_result("test", "text/plain");

        let config = ExtractionConfig::default();

        assert!(validator.should_validate(&result, &config));
    }

    #[test]
    fn test_validator_priority_default() {
        let validator = MockValidator { should_fail: false };
        assert_eq!(validator.priority(), 50);
    }

    // Only synchronous `Plugin` methods are exercised, so no async runtime is needed.
    #[test]
    fn test_validator_plugin_interface() {
        let validator = MockValidator { should_fail: false };

        assert_eq!(validator.name(), "mock-validator");
        assert_eq!(validator.version(), "1.0.0");
        assert!(validator.initialize().is_ok());
        assert!(validator.shutdown().is_ok());
    }

    #[tokio::test]
    async fn test_validator_empty_content() {
        let validator = MockValidator { should_fail: false };
        let result = make_result("", "text/plain");

        let config = ExtractionConfig::default();
        assert!(validator.validate(&result, &config).await.is_ok());
    }

    #[test]
    fn test_validator_should_validate_conditional() {
        struct PdfOnlyValidator;

        impl Plugin for PdfOnlyValidator {
            fn name(&self) -> &str {
                "pdf-only"
            }
            fn version(&self) -> String {
                "1.0.0".to_string()
            }
            fn initialize(&self) -> Result<()> {
                Ok(())
            }
            fn shutdown(&self) -> Result<()> {
                Ok(())
            }
        }

        #[async_trait]
        impl Validator for PdfOnlyValidator {
            async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
                Ok(())
            }

            fn should_validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> bool {
                result.mime_type == "application/pdf"
            }
        }

        let validator = PdfOnlyValidator;
        let config = ExtractionConfig::default();

        let pdf_result = make_result("test", "application/pdf");
        let txt_result = make_result("test", "text/plain");

        assert!(validator.should_validate(&pdf_result, &config));
        assert!(!validator.should_validate(&txt_result, &config));
    }

    #[test]
    fn test_validator_priority_ranges() {
        struct HighPriorityValidator;
        struct LowPriorityValidator;

        impl Plugin for HighPriorityValidator {
            fn name(&self) -> &str {
                "high-priority"
            }
            fn version(&self) -> String {
                "1.0.0".to_string()
            }
            fn initialize(&self) -> Result<()> {
                Ok(())
            }
            fn shutdown(&self) -> Result<()> {
                Ok(())
            }
        }

        impl Plugin for LowPriorityValidator {
            fn name(&self) -> &str {
                "low-priority"
            }
            fn version(&self) -> String {
                "1.0.0".to_string()
            }
            fn initialize(&self) -> Result<()> {
                Ok(())
            }
            fn shutdown(&self) -> Result<()> {
                Ok(())
            }
        }

        #[async_trait]
        impl Validator for HighPriorityValidator {
            async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
                Ok(())
            }

            fn priority(&self) -> i32 {
                100
            }
        }

        #[async_trait]
        impl Validator for LowPriorityValidator {
            async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
                Ok(())
            }

            fn priority(&self) -> i32 {
                10
            }
        }

        let high = HighPriorityValidator;
        let low = LowPriorityValidator;

        assert_eq!(high.priority(), 100);
        assert_eq!(low.priority(), 10);
        assert!(high.priority() > low.priority());
    }

    #[tokio::test]
    async fn test_validator_error_message() {
        let validator = MockValidator { should_fail: true };
        let result = make_result("test", "text/plain");

        let config = ExtractionConfig::default();
        let err = validator.validate(&result, &config).await.unwrap_err();

        match err {
            KreuzbergError::Validation { message: msg, .. } => {
                assert_eq!(msg, "Validation failed");
            }
            _ => panic!("Expected Validation error"),
        }
    }

    #[tokio::test]
    async fn test_validator_with_metadata() {
        let validator = MockValidator { should_fail: false };

        let mut additional = HashMap::new();
        additional.insert("quality_score".to_string(), serde_json::json!(0.95));

        let result = ExtractionResult {
            metadata: crate::types::Metadata {
                additional,
                ..Default::default()
            },
            ..make_result("test", "text/plain")
        };

        let config = ExtractionConfig::default();
        assert!(validator.validate(&result, &config).await.is_ok());
    }

    #[tokio::test]
    async fn test_validator_with_tables() {
        use crate::types::Table;

        let validator = MockValidator { should_fail: false };

        let table = Table {
            cells: vec![vec!["A".to_string(), "B".to_string()]],
            markdown: "| A | B |".to_string(),
            page_number: 0,
        };

        let result = ExtractionResult {
            tables: vec![table],
            ..make_result("test", "text/plain")
        };

        let config = ExtractionConfig::default();
        assert!(validator.validate(&result, &config).await.is_ok());
    }

    #[tokio::test]
    async fn test_validator_different_mime_types() {
        let validator = MockValidator { should_fail: false };
        let config = ExtractionConfig::default();

        let mime_types = vec![
            "text/plain",
            "application/pdf",
            "application/json",
            "text/html",
            "image/png",
        ];

        for mime_type in mime_types {
            let result = make_result("test", mime_type);

            assert!(validator.validate(&result, &config).await.is_ok());
        }
    }

    #[tokio::test]
    async fn test_validator_long_content() {
        let validator = MockValidator { should_fail: false };
        let result = make_result(&"test content ".repeat(10000), "text/plain");

        let config = ExtractionConfig::default();
        assert!(validator.validate(&result, &config).await.is_ok());
    }

    // The tests below touch the shared global registry, so they run serially.

    #[test]
    #[serial_test::serial]
    fn test_register_validator() {
        let validator = Arc::new(MockValidator { should_fail: false });
        let result = super::register_validator(validator);
        assert!(result.is_ok());

        // Best-effort cleanup so later tests start from a known state.
        let _ = super::unregister_validator("mock-validator");
    }

    #[test]
    #[serial_test::serial]
    fn test_unregister_validator() {
        let validator = Arc::new(MockValidator { should_fail: false });
        super::register_validator(validator).unwrap();

        let result = super::unregister_validator("mock-validator");
        assert!(result.is_ok());
    }

    #[test]
    #[serial_test::serial]
    fn test_unregister_nonexistent_validator() {
        let result = super::unregister_validator("nonexistent-validator-xyz");
        assert!(result.is_ok());
    }

    #[test]
    #[serial_test::serial]
    fn test_list_validators() {
        super::clear_validators().unwrap();

        let validator1 = Arc::new(MockValidator { should_fail: false });
        let validator2 = Arc::new(MockValidator { should_fail: false });

        let list_before = super::list_validators().unwrap();
        assert_eq!(list_before.len(), 0);

        super::register_validator(validator1).unwrap();
        super::register_validator(validator2).unwrap();

        // Both mocks report the same name, and the test expects a single entry.
        let list = super::list_validators().unwrap();
        assert_eq!(list.len(), 1);
        assert!(list.contains(&"mock-validator".to_string()));

        super::unregister_validator("mock-validator").unwrap();
    }

    #[test]
    #[serial_test::serial]
    fn test_clear_validators() {
        super::clear_validators().unwrap();

        let validator1 = Arc::new(MockValidator { should_fail: false });
        let validator2 = Arc::new(MockValidator { should_fail: false });

        super::register_validator(validator1).unwrap();
        super::register_validator(validator2).unwrap();

        let list_before = super::list_validators().unwrap();
        assert!(!list_before.is_empty());

        let result = super::clear_validators();
        assert!(result.is_ok());

        let list = super::list_validators().unwrap();
        assert_eq!(list.len(), 0);
    }

    #[test]
    #[serial_test::serial]
    fn test_register_validator_with_invalid_name() {
        struct InvalidNameValidator;
        impl Plugin for InvalidNameValidator {
            fn name(&self) -> &str {
                "invalid name with spaces"
            }
            fn version(&self) -> String {
                "1.0.0".to_string()
            }
            fn initialize(&self) -> Result<()> {
                Ok(())
            }
            fn shutdown(&self) -> Result<()> {
                Ok(())
            }
        }

        #[async_trait]
        impl Validator for InvalidNameValidator {
            async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
                Ok(())
            }
        }

        let validator = Arc::new(InvalidNameValidator);
        let result = super::register_validator(validator);
        assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
    }

    #[test]
    #[serial_test::serial]
    fn test_register_validator_with_empty_name() {
        struct EmptyNameValidator;
        impl Plugin for EmptyNameValidator {
            fn name(&self) -> &str {
                ""
            }
            fn version(&self) -> String {
                "1.0.0".to_string()
            }
            fn initialize(&self) -> Result<()> {
                Ok(())
            }
            fn shutdown(&self) -> Result<()> {
                Ok(())
            }
        }

        #[async_trait]
        impl Validator for EmptyNameValidator {
            async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
                Ok(())
            }
        }

        let validator = Arc::new(EmptyNameValidator);
        let result = super::register_validator(validator);
        assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
    }
}