kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,955 @@
1
+ //! Validator plugin trait.
2
+ //!
3
+ //! This module defines the trait for implementing custom validation logic.
4
+
5
+ use crate::Result;
6
+ use crate::core::config::ExtractionConfig;
7
+ use crate::plugins::Plugin;
8
+ use crate::types::ExtractionResult;
9
+ use async_trait::async_trait;
10
+ use std::sync::Arc;
11
+
12
+ /// Trait for validator plugins.
13
+ ///
14
+ /// Validators check extraction results for quality, completeness, or correctness.
15
+ /// Unlike post-processors, validator errors **fail fast** - if a validator returns
16
+ /// an error, the extraction fails immediately.
17
+ ///
18
+ /// # Use Cases
19
+ ///
20
+ /// - **Quality Gates**: Ensure extracted content meets minimum quality standards
21
+ /// - **Compliance**: Verify content meets regulatory requirements
22
+ /// - **Content Filtering**: Reject documents containing unwanted content
23
+ /// - **Format Validation**: Verify extracted content structure
24
+ /// - **Security Checks**: Scan for malicious content
25
+ ///
26
+ /// # Error Handling
27
+ ///
28
+ /// Validator errors are **fatal** - they cause the extraction to fail and bubble up
29
+ /// to the caller. Use validators for hard requirements that must be met.
30
+ ///
31
+ /// For non-fatal checks, use post-processors instead.
32
+ ///
33
+ /// # Thread Safety
34
+ ///
35
+ /// Validators must be thread-safe (`Send + Sync`).
36
+ ///
37
+ /// # Example
38
+ ///
39
+ /// ```rust
40
+ /// use kreuzberg::plugins::{Plugin, Validator};
41
+ /// use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
42
+ /// use async_trait::async_trait;
43
+ ///
44
+ /// /// Validate that extracted content has minimum length
45
+ /// struct MinimumLengthValidator {
46
+ /// min_length: usize,
47
+ /// }
48
+ ///
49
+ /// impl Plugin for MinimumLengthValidator {
50
+ /// fn name(&self) -> &str { "min-length-validator" }
51
+ /// fn version(&self) -> String { "1.0.0".to_string() }
52
+ /// fn initialize(&self) -> Result<()> { Ok(()) }
53
+ /// fn shutdown(&self) -> Result<()> { Ok(()) }
54
+ /// }
55
+ ///
56
+ /// #[async_trait]
57
+ /// impl Validator for MinimumLengthValidator {
58
+ /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
59
+ /// -> Result<()> {
60
+ /// if result.content.len() < self.min_length {
61
+ /// return Err(KreuzbergError::validation(format!(
62
+ /// "Content too short: {} < {} characters",
63
+ /// result.content.len(),
64
+ /// self.min_length
65
+ /// )));
66
+ /// }
67
+ /// Ok(())
68
+ /// }
69
+ /// }
70
+ /// ```
71
+ #[async_trait]
72
+ pub trait Validator: Plugin {
73
+ /// Validate an extraction result.
74
+ ///
75
+ /// Check the extraction result and return `Ok(())` if valid, or an error
76
+ /// if validation fails.
77
+ ///
78
+ /// # Arguments
79
+ ///
80
+ /// * `result` - The extraction result to validate
81
+ /// * `config` - Extraction configuration
82
+ ///
83
+ /// # Returns
84
+ ///
85
+ /// - `Ok(())` if validation passes
86
+ /// - `Err(...)` if validation fails (extraction will fail)
87
+ ///
88
+ /// # Errors
89
+ ///
90
+ /// - `KreuzbergError::Validation` - Validation failed
91
+ /// - Any other error type appropriate for the failure
92
+ ///
93
+ /// # Example - Content Length Validation
94
+ ///
95
+ /// ```rust
96
+ /// # use kreuzberg::plugins::{Plugin, Validator};
97
+ /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
98
+ /// # use async_trait::async_trait;
99
+ /// # struct ContentLengthValidator { min: usize, max: usize }
100
+ /// # impl Plugin for ContentLengthValidator {
101
+ /// # fn name(&self) -> &str { "length-validator" }
102
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
103
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
104
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
105
+ /// # }
106
+ /// # #[async_trait]
107
+ /// # impl Validator for ContentLengthValidator {
108
+ /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
109
+ /// -> Result<()> {
110
+ /// let length = result.content.len();
111
+ ///
112
+ /// if length < self.min {
113
+ /// return Err(KreuzbergError::validation(format!(
114
+ /// "Content too short: {} < {} characters",
115
+ /// length, self.min
116
+ /// )));
117
+ /// }
118
+ ///
119
+ /// if length > self.max {
120
+ /// return Err(KreuzbergError::validation(format!(
121
+ /// "Content too long: {} > {} characters",
122
+ /// length, self.max
123
+ /// )));
124
+ /// }
125
+ ///
126
+ /// Ok(())
127
+ /// }
128
+ /// # }
129
+ /// ```
130
+ ///
131
+ /// # Example - Quality Score Validation
132
+ ///
133
+ /// ```rust
134
+ /// # use kreuzberg::plugins::{Plugin, Validator};
135
+ /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
136
+ /// # use async_trait::async_trait;
137
+ /// # struct QualityValidator { min_score: f64 }
138
+ /// # impl Plugin for QualityValidator {
139
+ /// # fn name(&self) -> &str { "quality-validator" }
140
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
141
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
142
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
143
+ /// # }
144
+ /// # #[async_trait]
145
+ /// # impl Validator for QualityValidator {
146
+ /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
147
+ /// -> Result<()> {
148
+ /// // Check if quality_score exists in metadata
149
+ /// let score = result.metadata
150
+ /// .additional
151
+ /// .get("quality_score")
152
+ /// .and_then(|v| v.as_f64())
153
+ /// .unwrap_or(0.0);
154
+ ///
155
+ /// if score < self.min_score {
156
+ /// return Err(KreuzbergError::validation(format!(
157
+ /// "Quality score too low: {} < {}",
158
+ /// score, self.min_score
159
+ /// )));
160
+ /// }
161
+ ///
162
+ /// Ok(())
163
+ /// }
164
+ /// # }
165
+ /// ```
166
+ ///
167
+ /// # Example - Security Validation
168
+ ///
169
+ /// ```rust
170
+ /// # use kreuzberg::plugins::{Plugin, Validator};
171
+ /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
172
+ /// # use async_trait::async_trait;
173
+ /// # struct SecurityValidator { blocked_patterns: Vec<String> }
174
+ /// # impl Plugin for SecurityValidator {
175
+ /// # fn name(&self) -> &str { "security-validator" }
176
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
177
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
178
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
179
+ /// # }
180
+ /// # #[async_trait]
181
+ /// # impl Validator for SecurityValidator {
182
+ /// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
183
+ /// -> Result<()> {
184
+ /// // Check for blocked patterns
185
+ /// for pattern in &self.blocked_patterns {
186
+ /// if result.content.contains(pattern) {
187
+ /// return Err(KreuzbergError::validation(format!(
188
+ /// "Content contains blocked pattern: {}",
189
+ /// pattern
190
+ /// )));
191
+ /// }
192
+ /// }
193
+ ///
194
+ /// Ok(())
195
+ /// }
196
+ /// # }
197
+ /// ```
198
+ async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig) -> Result<()>;
199
+
200
+ /// Optional: Check if this validator should run for a given result.
201
+ ///
202
+ /// Allows conditional validation based on MIME type, metadata, or content.
203
+ /// Defaults to `true` (always run).
204
+ ///
205
+ /// # Arguments
206
+ ///
207
+ /// * `result` - The extraction result to check
208
+ /// * `config` - Extraction configuration
209
+ ///
210
+ /// # Returns
211
+ ///
212
+ /// `true` if the validator should run, `false` to skip.
213
+ ///
214
+ /// # Example
215
+ ///
216
+ /// ```rust
217
+ /// # use kreuzberg::plugins::{Plugin, Validator};
218
+ /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
219
+ /// # use async_trait::async_trait;
220
+ /// # struct PdfValidator;
221
+ /// # impl Plugin for PdfValidator {
222
+ /// # fn name(&self) -> &str { "pdf-validator" }
223
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
224
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
225
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
226
+ /// # }
227
+ /// # #[async_trait]
228
+ /// # impl Validator for PdfValidator {
229
+ /// # async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> { Ok(()) }
230
+ /// /// Only validate PDF documents
231
+ /// fn should_validate(&self, result: &ExtractionResult, config: &ExtractionConfig) -> bool {
232
+ /// result.mime_type == "application/pdf"
233
+ /// }
234
+ /// # }
235
+ /// ```
236
+ fn should_validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> bool {
237
+ true
238
+ }
239
+
240
+ /// Optional: Get the validation priority.
241
+ ///
242
+ /// Higher priority validators run first. Useful for ordering validation checks
243
+ /// (e.g., run cheap validations before expensive ones).
244
+ ///
245
+ /// Default priority is 50.
246
+ ///
247
+ /// # Returns
248
+ ///
249
+ /// Priority value (higher = runs earlier).
250
+ ///
251
+ /// # Example
252
+ ///
253
+ /// ```rust
254
+ /// # use kreuzberg::plugins::{Plugin, Validator};
255
+ /// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
256
+ /// # use async_trait::async_trait;
257
+ /// # struct FastValidator;
258
+ /// # impl Plugin for FastValidator {
259
+ /// # fn name(&self) -> &str { "fast-validator" }
260
+ /// # fn version(&self) -> String { "1.0.0".to_string() }
261
+ /// # fn initialize(&self) -> Result<()> { Ok(()) }
262
+ /// # fn shutdown(&self) -> Result<()> { Ok(()) }
263
+ /// # }
264
+ /// # #[async_trait]
265
+ /// # impl Validator for FastValidator {
266
+ /// # async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> { Ok(()) }
267
+ /// /// Run this validator first (it's fast)
268
+ /// fn priority(&self) -> i32 {
269
+ /// 100
270
+ /// }
271
+ /// # }
272
+ /// ```
273
+ fn priority(&self) -> i32 {
274
+ 50
275
+ }
276
+ }
277
+
278
+ // Public registration APIs
279
+
280
+ /// Register a validator with the global registry.
281
+ ///
282
+ /// The validator will be registered with its default priority and will be called
283
+ /// during extraction validation. The validator's `name()` method is used as the
284
+ /// registration name.
285
+ ///
286
+ /// # Arguments
287
+ ///
288
+ /// * `validator` - The validator implementation wrapped in Arc
289
+ ///
290
+ /// # Returns
291
+ ///
292
+ /// - `Ok(())` if registration succeeded
293
+ /// - `Err(...)` if validation failed or initialization failed
294
+ ///
295
+ /// # Errors
296
+ ///
297
+ /// - `KreuzbergError::Validation` - Invalid validator name (empty or contains whitespace)
298
+ /// - Any error from the validator's `initialize()` method
299
+ ///
300
+ /// # Example
301
+ ///
302
+ /// ```rust
303
+ /// use kreuzberg::plugins::{Plugin, Validator, register_validator};
304
+ /// use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
305
+ /// use async_trait::async_trait;
306
+ /// use std::sync::Arc;
307
+ ///
308
+ /// struct MinLengthValidator { min_length: usize }
309
+ ///
310
+ /// impl Plugin for MinLengthValidator {
311
+ /// fn name(&self) -> &str { "min-length" }
312
+ /// fn version(&self) -> String { "1.0.0".to_string() }
313
+ /// fn initialize(&self) -> Result<()> { Ok(()) }
314
+ /// fn shutdown(&self) -> Result<()> { Ok(()) }
315
+ /// }
316
+ ///
317
+ /// #[async_trait]
318
+ /// impl Validator for MinLengthValidator {
319
+ /// async fn validate(&self, result: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
320
+ /// if result.content.len() < self.min_length {
321
+ /// return Err(KreuzbergError::validation(
322
+ /// format!("Content too short: {} < {}", result.content.len(), self.min_length)
323
+ /// ));
324
+ /// }
325
+ /// Ok(())
326
+ /// }
327
+ /// }
328
+ ///
329
+ /// # tokio_test::block_on(async {
330
+ /// let validator = Arc::new(MinLengthValidator { min_length: 10 });
331
+ /// register_validator(validator)?;
332
+ /// # Ok::<(), KreuzbergError>(())
333
+ /// # });
334
+ /// ```
335
+ pub fn register_validator(validator: Arc<dyn Validator>) -> crate::Result<()> {
336
+ use crate::plugins::registry::get_validator_registry;
337
+
338
+ let registry = get_validator_registry();
339
+ let mut registry = registry
340
+ .write()
341
+ .expect("~keep Failed to acquire write lock on validator registry"); // ~keep
342
+
343
+ registry.register(validator)
344
+ }
345
+
346
+ /// Unregister a validator by name.
347
+ ///
348
+ /// Removes the validator from the global registry and calls its `shutdown()` method.
349
+ ///
350
+ /// # Arguments
351
+ ///
352
+ /// * `name` - Name of the validator to unregister
353
+ ///
354
+ /// # Returns
355
+ ///
356
+ /// - `Ok(())` if the validator was unregistered or didn't exist
357
+ /// - `Err(...)` if the shutdown method failed
358
+ ///
359
+ /// # Example
360
+ ///
361
+ /// ```rust
362
+ /// use kreuzberg::plugins::unregister_validator;
363
+ ///
364
+ /// # tokio_test::block_on(async {
365
+ /// unregister_validator("min-length")?;
366
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
367
+ /// # });
368
+ /// ```
369
+ pub fn unregister_validator(name: &str) -> crate::Result<()> {
370
+ use crate::plugins::registry::get_validator_registry;
371
+
372
+ let registry = get_validator_registry();
373
+ let mut registry = registry
374
+ .write()
375
+ .expect("~keep Failed to acquire write lock on validator registry"); // ~keep
376
+
377
+ registry.remove(name)
378
+ }
379
+
380
+ /// List all registered validators.
381
+ ///
382
+ /// Returns the names of all validators currently registered in the global registry.
383
+ ///
384
+ /// # Returns
385
+ ///
386
+ /// A vector of validator names.
387
+ ///
388
+ /// # Example
389
+ ///
390
+ /// ```rust
391
+ /// use kreuzberg::plugins::list_validators;
392
+ ///
393
+ /// # tokio_test::block_on(async {
394
+ /// let validators = list_validators()?;
395
+ /// for name in validators {
396
+ /// println!("Registered validator: {}", name);
397
+ /// }
398
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
399
+ /// # });
400
+ /// ```
401
+ pub fn list_validators() -> crate::Result<Vec<String>> {
402
+ use crate::plugins::registry::get_validator_registry;
403
+
404
+ let registry = get_validator_registry();
405
+ let registry = registry
406
+ .read()
407
+ .expect("~keep Failed to acquire read lock on validator registry"); // ~keep
408
+
409
+ Ok(registry.list())
410
+ }
411
+
412
+ /// Clear all validators from the global registry.
413
+ ///
414
+ /// Removes all validators and calls their `shutdown()` methods.
415
+ ///
416
+ /// # Returns
417
+ ///
418
+ /// - `Ok(())` if all validators were cleared successfully
419
+ /// - `Err(...)` if any shutdown method failed
420
+ ///
421
+ /// # Example
422
+ ///
423
+ /// ```rust
424
+ /// use kreuzberg::plugins::clear_validators;
425
+ ///
426
+ /// # tokio_test::block_on(async {
427
+ /// clear_validators()?;
428
+ /// # Ok::<(), kreuzberg::KreuzbergError>(())
429
+ /// # });
430
+ /// ```
431
+ pub fn clear_validators() -> crate::Result<()> {
432
+ use crate::plugins::registry::get_validator_registry;
433
+
434
+ let registry = get_validator_registry();
435
+ let mut registry = registry
436
+ .write()
437
+ .expect("~keep Failed to acquire write lock on validator registry"); // ~keep
438
+
439
+ registry.shutdown_all()
440
+ }
441
+
442
#[cfg(test)]
mod tests {
    use super::*;
    use crate::KreuzbergError;
    use std::collections::HashMap;
    use std::sync::{Arc, Mutex, MutexGuard};

    /// Serializes the tests that mutate the process-wide validator registry.
    ///
    /// The test harness runs tests on several threads by default, and the
    /// registration tests below all register/clear the same `"mock-validator"`
    /// entry and then assert on the registry's exact contents. Without this lock
    /// they can interleave and fail nondeterministically.
    static REGISTRY_TEST_LOCK: Mutex<()> = Mutex::new(());

    /// Acquires [`REGISTRY_TEST_LOCK`] for the duration of a registry test.
    ///
    /// A poisoned lock is recovered rather than propagated, so one failing
    /// registry test does not cascade into spurious failures in the others.
    fn lock_registry() -> MutexGuard<'static, ()> {
        REGISTRY_TEST_LOCK
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    /// Builds an `ExtractionResult` with the given content and MIME type,
    /// default metadata, no tables, and every optional field unset.
    fn sample_result(content: impl Into<String>, mime_type: &str) -> ExtractionResult {
        ExtractionResult {
            content: content.into(),
            mime_type: mime_type.to_string(),
            metadata: crate::types::Metadata::default(),
            tables: vec![],
            detected_languages: None,
            chunks: None,
            images: None,
        }
    }

    /// Implements `Plugin` for a test type: the given name, version `"1.0.0"`,
    /// and `initialize`/`shutdown` that always succeed.
    macro_rules! impl_noop_plugin {
        ($ty:ty, $name:literal) => {
            impl Plugin for $ty {
                fn name(&self) -> &str {
                    $name
                }

                fn version(&self) -> String {
                    "1.0.0".to_string()
                }

                fn initialize(&self) -> Result<()> {
                    Ok(())
                }

                fn shutdown(&self) -> Result<()> {
                    Ok(())
                }
            }
        };
    }

    /// Validator whose outcome is fixed by `should_fail`.
    struct MockValidator {
        should_fail: bool,
    }

    impl_noop_plugin!(MockValidator, "mock-validator");

    #[async_trait]
    impl Validator for MockValidator {
        async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
            if self.should_fail {
                Err(KreuzbergError::validation("Validation failed".to_string()))
            } else {
                Ok(())
            }
        }
    }

    #[tokio::test]
    async fn test_validator_success() {
        let validator = MockValidator { should_fail: false };
        let result = sample_result("test content", "text/plain");
        let config = ExtractionConfig::default();

        assert!(validator.validate(&result, &config).await.is_ok());
    }

    #[tokio::test]
    async fn test_validator_failure() {
        let validator = MockValidator { should_fail: true };
        let result = sample_result("test content", "text/plain");
        let config = ExtractionConfig::default();

        let validation_result = validator.validate(&result, &config).await;

        assert!(matches!(validation_result, Err(KreuzbergError::Validation { .. })));
    }

    #[test]
    fn test_validator_should_validate_default() {
        let validator = MockValidator { should_fail: false };
        let result = sample_result("test", "text/plain");
        let config = ExtractionConfig::default();

        assert!(validator.should_validate(&result, &config));
    }

    #[test]
    fn test_validator_priority_default() {
        let validator = MockValidator { should_fail: false };
        assert_eq!(validator.priority(), 50);
    }

    #[tokio::test]
    async fn test_validator_plugin_interface() {
        let validator = MockValidator { should_fail: false };

        assert_eq!(validator.name(), "mock-validator");
        assert_eq!(validator.version(), "1.0.0");
        assert!(validator.initialize().is_ok());
        assert!(validator.shutdown().is_ok());
    }

    #[tokio::test]
    async fn test_validator_empty_content() {
        let validator = MockValidator { should_fail: false };
        let result = sample_result(String::new(), "text/plain");
        let config = ExtractionConfig::default();

        assert!(validator.validate(&result, &config).await.is_ok());
    }

    #[test]
    fn test_validator_should_validate_conditional() {
        struct PdfOnlyValidator;

        impl_noop_plugin!(PdfOnlyValidator, "pdf-only");

        #[async_trait]
        impl Validator for PdfOnlyValidator {
            async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
                Ok(())
            }

            fn should_validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> bool {
                result.mime_type == "application/pdf"
            }
        }

        let validator = PdfOnlyValidator;
        let config = ExtractionConfig::default();

        let pdf_result = sample_result("test", "application/pdf");
        let txt_result = sample_result("test", "text/plain");

        assert!(validator.should_validate(&pdf_result, &config));
        assert!(!validator.should_validate(&txt_result, &config));
    }

    #[test]
    fn test_validator_priority_ranges() {
        struct HighPriorityValidator;
        struct LowPriorityValidator;

        impl_noop_plugin!(HighPriorityValidator, "high-priority");
        impl_noop_plugin!(LowPriorityValidator, "low-priority");

        #[async_trait]
        impl Validator for HighPriorityValidator {
            async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
                Ok(())
            }

            fn priority(&self) -> i32 {
                100
            }
        }

        #[async_trait]
        impl Validator for LowPriorityValidator {
            async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
                Ok(())
            }

            fn priority(&self) -> i32 {
                10
            }
        }

        let high = HighPriorityValidator;
        let low = LowPriorityValidator;

        assert_eq!(high.priority(), 100);
        assert_eq!(low.priority(), 10);
        assert!(high.priority() > low.priority());
    }

    #[tokio::test]
    async fn test_validator_error_message() {
        let validator = MockValidator { should_fail: true };
        let result = sample_result("test", "text/plain");
        let config = ExtractionConfig::default();

        let err = validator.validate(&result, &config).await.unwrap_err();

        match err {
            KreuzbergError::Validation { message: msg, .. } => {
                assert_eq!(msg, "Validation failed");
            }
            _ => panic!("Expected Validation error"),
        }
    }

    #[tokio::test]
    async fn test_validator_with_metadata() {
        let validator = MockValidator { should_fail: false };

        let mut additional = HashMap::new();
        additional.insert("quality_score".to_string(), serde_json::json!(0.95));

        // Only the metadata differs from the baseline result.
        let result = ExtractionResult {
            metadata: crate::types::Metadata {
                additional,
                ..Default::default()
            },
            ..sample_result("test", "text/plain")
        };

        let config = ExtractionConfig::default();
        assert!(validator.validate(&result, &config).await.is_ok());
    }

    #[tokio::test]
    async fn test_validator_with_tables() {
        use crate::types::Table;

        let validator = MockValidator { should_fail: false };

        let table = Table {
            cells: vec![vec!["A".to_string(), "B".to_string()]],
            markdown: "| A | B |".to_string(),
            page_number: 0,
        };

        // Only the table list differs from the baseline result.
        let result = ExtractionResult {
            tables: vec![table],
            ..sample_result("test", "text/plain")
        };

        let config = ExtractionConfig::default();
        assert!(validator.validate(&result, &config).await.is_ok());
    }

    #[tokio::test]
    async fn test_validator_different_mime_types() {
        let validator = MockValidator { should_fail: false };
        let config = ExtractionConfig::default();

        let mime_types = [
            "text/plain",
            "application/pdf",
            "application/json",
            "text/html",
            "image/png",
        ];

        for mime_type in mime_types {
            let result = sample_result("test", mime_type);
            assert!(validator.validate(&result, &config).await.is_ok());
        }
    }

    #[tokio::test]
    async fn test_validator_long_content() {
        let validator = MockValidator { should_fail: false };
        let result = sample_result("test content ".repeat(10000), "text/plain");
        let config = ExtractionConfig::default();

        assert!(validator.validate(&result, &config).await.is_ok());
    }

    // Tests for public registration APIs.
    //
    // Every test that changes the contents of the global registry holds
    // `lock_registry()` for its whole body so that the exact-count assertions
    // cannot observe another test's registrations.

    #[test]
    fn test_register_validator() {
        let _guard = lock_registry();

        let validator = Arc::new(MockValidator { should_fail: false });
        let result = super::register_validator(validator);
        assert!(result.is_ok());

        let _ = super::unregister_validator("mock-validator");
    }

    #[test]
    fn test_unregister_validator() {
        let _guard = lock_registry();

        let validator = Arc::new(MockValidator { should_fail: false });
        super::register_validator(validator).unwrap();

        let result = super::unregister_validator("mock-validator");
        assert!(result.is_ok());
    }

    #[test]
    fn test_unregister_nonexistent_validator() {
        let result = super::unregister_validator("nonexistent-validator-xyz");
        assert!(result.is_ok());
    }

    #[test]
    fn test_list_validators() {
        let _guard = lock_registry();

        super::clear_validators().unwrap();

        let validator1 = Arc::new(MockValidator { should_fail: false });
        // Both validators have the same name, so only one will be registered
        let validator2 = Arc::new(MockValidator { should_fail: false });

        let list_before = super::list_validators().unwrap();
        assert_eq!(list_before.len(), 0);

        super::register_validator(validator1).unwrap();
        super::register_validator(validator2).unwrap();

        let list = super::list_validators().unwrap();
        // Only 1 validator registered since they have the same name
        assert_eq!(list.len(), 1);
        assert!(list.contains(&"mock-validator".to_string()));

        super::unregister_validator("mock-validator").unwrap();
    }

    #[test]
    fn test_clear_validators() {
        let _guard = lock_registry();

        super::clear_validators().unwrap();

        let validator1 = Arc::new(MockValidator { should_fail: false });
        let validator2 = Arc::new(MockValidator { should_fail: false });

        super::register_validator(validator1).unwrap();
        super::register_validator(validator2).unwrap();

        // Verify at least one validator is registered
        let list_before = super::list_validators().unwrap();
        assert!(!list_before.is_empty());

        let result = super::clear_validators();
        assert!(result.is_ok());

        let list = super::list_validators().unwrap();
        assert_eq!(list.len(), 0);
    }

    #[test]
    fn test_register_validator_with_invalid_name() {
        struct InvalidNameValidator;

        impl_noop_plugin!(InvalidNameValidator, "invalid name with spaces");

        #[async_trait]
        impl Validator for InvalidNameValidator {
            async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
                Ok(())
            }
        }

        let validator = Arc::new(InvalidNameValidator);
        let result = super::register_validator(validator);
        assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
    }

    #[test]
    fn test_register_validator_with_empty_name() {
        struct EmptyNameValidator;

        impl_noop_plugin!(EmptyNameValidator, "");

        #[async_trait]
        impl Validator for EmptyNameValidator {
            async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
                Ok(())
            }
        }

        let validator = Arc::new(EmptyNameValidator);
        let result = super::register_validator(validator);
        assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
    }
}