kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,607 @@
1
+ //! Plugin registry integration tests.
2
+ //!
3
+ //! Tests the core registry APIs for the validator and document extractor plugin types:
4
+ //! - Validator registration/unregistration
5
+ //! - Extractor registration/unregistration
6
+ //! - Registry clearing and listing
7
+ //! - Error handling and edge cases
8
+
9
+ use async_trait::async_trait;
10
+ use kreuzberg::core::config::ExtractionConfig;
11
+ use kreuzberg::plugins::registry::{DocumentExtractorRegistry, ValidatorRegistry};
12
+ use kreuzberg::plugins::{DocumentExtractor, Plugin, Validator};
13
+ use kreuzberg::types::{ExtractionResult, Metadata};
14
+ use kreuzberg::{KreuzbergError, Result};
15
+ use std::path::Path;
16
+ use std::sync::Arc;
17
+
18
+ // ===== Mock Validators =====
19
+
20
+ struct MockValidator {
21
+ name: String,
22
+ should_fail: bool,
23
+ }
24
+
25
+ impl Plugin for MockValidator {
26
+ fn name(&self) -> &str {
27
+ &self.name
28
+ }
29
+
30
+ fn version(&self) -> String {
31
+ "1.0.0".to_string()
32
+ }
33
+
34
+ fn initialize(&self) -> Result<()> {
35
+ Ok(())
36
+ }
37
+
38
+ fn shutdown(&self) -> Result<()> {
39
+ Ok(())
40
+ }
41
+ }
42
+
43
+ #[async_trait]
44
+ impl Validator for MockValidator {
45
+ async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
46
+ if self.should_fail {
47
+ Err(KreuzbergError::validation("Mock validation failed"))
48
+ } else {
49
+ Ok(())
50
+ }
51
+ }
52
+
53
+ fn priority(&self) -> i32 {
54
+ 50
55
+ }
56
+ }
57
+
58
+ struct FailingInitValidator {
59
+ name: String,
60
+ }
61
+
62
+ impl Plugin for FailingInitValidator {
63
+ fn name(&self) -> &str {
64
+ &self.name
65
+ }
66
+
67
+ fn version(&self) -> String {
68
+ "1.0.0".to_string()
69
+ }
70
+
71
+ fn initialize(&self) -> Result<()> {
72
+ Err(KreuzbergError::Plugin {
73
+ message: "Initialization failed".to_string(),
74
+ plugin_name: self.name.clone(),
75
+ })
76
+ }
77
+
78
+ fn shutdown(&self) -> Result<()> {
79
+ Ok(())
80
+ }
81
+ }
82
+
83
+ #[async_trait]
84
+ impl Validator for FailingInitValidator {
85
+ async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
86
+ Ok(())
87
+ }
88
+ }
89
+
90
+ // ===== Mock Extractors =====
91
+
92
+ struct MockExtractor {
93
+ name: String,
94
+ mime_types: Vec<&'static str>,
95
+ priority: i32,
96
+ }
97
+
98
+ impl Plugin for MockExtractor {
99
+ fn name(&self) -> &str {
100
+ &self.name
101
+ }
102
+
103
+ fn version(&self) -> String {
104
+ "1.0.0".to_string()
105
+ }
106
+
107
+ fn initialize(&self) -> Result<()> {
108
+ Ok(())
109
+ }
110
+
111
+ fn shutdown(&self) -> Result<()> {
112
+ Ok(())
113
+ }
114
+ }
115
+
116
+ #[async_trait]
117
+ impl DocumentExtractor for MockExtractor {
118
+ async fn extract_bytes(
119
+ &self,
120
+ content: &[u8],
121
+ mime_type: &str,
122
+ _config: &ExtractionConfig,
123
+ ) -> Result<ExtractionResult> {
124
+ Ok(ExtractionResult {
125
+ content: format!("Extracted by {}: {}", self.name, String::from_utf8_lossy(content)),
126
+ mime_type: mime_type.to_string(),
127
+ metadata: Metadata::default(),
128
+ tables: vec![],
129
+ detected_languages: None,
130
+ chunks: None,
131
+ images: None,
132
+ })
133
+ }
134
+
135
+ async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
136
+ let content = std::fs::read(path)?;
137
+ self.extract_bytes(&content, mime_type, config).await
138
+ }
139
+
140
+ fn supported_mime_types(&self) -> &[&str] {
141
+ &self.mime_types
142
+ }
143
+
144
+ fn priority(&self) -> i32 {
145
+ self.priority
146
+ }
147
+ }
148
+
149
+ // ===== Validator Registry Tests =====
150
+
151
/// Registers one validator and checks that the registry lists exactly that name.
#[test]
fn test_validator_registration_succeeds() {
    let mock = MockValidator {
        name: String::from("test-validator"),
        should_fail: false,
    };
    let mut registry = ValidatorRegistry::new();

    let outcome = registry.register(Arc::new(mock));
    assert!(outcome.is_ok(), "Validator registration should succeed");

    let names = registry.list();
    assert_eq!(names.len(), 1, "Should have one validator");
    assert!(
        names.iter().any(|name| name.as_str() == "test-validator"),
        "Should contain registered validator"
    );
}
171
+
172
/// Registers three validators (the last one configured to fail validation)
/// and checks that all three names are listed.
#[test]
fn test_register_multiple_validators_succeeds() {
    let mut registry = ValidatorRegistry::new();

    // (name, should_fail) pairs, registered in this order.
    let specs = [("validator-1", false), ("validator-2", false), ("validator-3", true)];
    for (name, should_fail) in specs {
        let validator = Arc::new(MockValidator {
            name: name.to_string(),
            should_fail,
        });
        registry.register(validator).unwrap();
    }

    let registered = registry.list();
    assert_eq!(registered.len(), 3, "Should have three validators");
    for (name, _) in specs {
        assert!(registered.contains(&name.to_string()));
    }
}
200
+
201
/// Registers a validator, removes it by name, and checks the registry is empty.
#[test]
fn test_validator_unregistration_succeeds() {
    let mut registry = ValidatorRegistry::new();
    registry
        .register(Arc::new(MockValidator {
            name: String::from("temp-validator"),
            should_fail: false,
        }))
        .unwrap();
    assert_eq!(registry.list().len(), 1);

    let removal = registry.remove("temp-validator");
    assert!(removal.is_ok(), "Unregistration should succeed");

    let remaining = registry.list().len();
    assert_eq!(remaining, 0, "Registry should be empty after removal");
}
218
+
219
/// Removing a name that was never registered is expected to return `Ok`.
#[test]
fn test_unregister_nonexistent_validator_succeeds() {
    let mut empty_registry = ValidatorRegistry::new();

    let removal = empty_registry.remove("nonexistent-validator");

    assert!(removal.is_ok(), "Removing non-existent validator should succeed (no-op)");
}
227
+
228
/// Registering a validator with an empty name must fail with a `Validation`
/// error whose message mentions "empty".
#[test]
fn test_validator_registration_with_empty_name_fails() {
    let nameless = Arc::new(MockValidator {
        name: String::new(),
        should_fail: false,
    });
    let mut registry = ValidatorRegistry::new();

    let outcome = registry.register(nameless);
    assert!(outcome.is_err(), "Registration with empty name should fail");

    let Err(KreuzbergError::Validation { message, .. }) = outcome else {
        panic!("Expected Validation error");
    };
    assert!(message.contains("empty"), "Error should mention empty name");
}
248
+
249
/// Registering a validator whose name contains spaces must fail with a
/// `Validation` error whose message mentions "whitespace".
#[test]
fn test_validator_registration_with_whitespace_fails() {
    let spaced = Arc::new(MockValidator {
        name: String::from("validator with spaces"),
        should_fail: false,
    });
    let mut registry = ValidatorRegistry::new();

    let outcome = registry.register(spaced);
    assert!(outcome.is_err(), "Registration with whitespace should fail");

    let Err(KreuzbergError::Validation { message, .. }) = outcome else {
        panic!("Expected Validation error");
    };
    assert!(message.contains("whitespace"), "Error should mention whitespace");
}
269
+
270
/// Registering a validator whose `initialize` fails must return a `Plugin`
/// error and leave the registry empty.
#[test]
fn test_validator_registration_with_failed_init_fails() {
    let broken = Arc::new(FailingInitValidator {
        name: String::from("failing-validator"),
    });
    let mut registry = ValidatorRegistry::new();

    let outcome = registry.register(broken);
    assert!(outcome.is_err(), "Registration with failed init should fail");

    // Only the error variant matters here, not its payload.
    assert!(
        matches!(outcome, Err(KreuzbergError::Plugin { .. })),
        "Expected Plugin error"
    );

    // The rejected validator must not show up in the listing.
    let registered = registry.list().len();
    assert_eq!(registered, 0, "Failed validator should not be registered");
}
292
+
293
/// Registers two validators, calls `shutdown_all`, and checks that the call
/// succeeds and the registry lists nothing afterwards.
#[test]
fn test_clear_validators_succeeds() {
    let mut registry = ValidatorRegistry::new();

    // Populate the registry with two passing validators.
    for name in ["validator-1", "validator-2"] {
        let validator = Arc::new(MockValidator {
            name: name.to_string(),
            should_fail: false,
        });
        registry.register(validator).unwrap();
    }
    assert_eq!(registry.list().len(), 2);

    // Shut everything down and confirm nothing is left.
    let outcome = registry.shutdown_all();
    assert!(outcome.is_ok(), "Clear should succeed");

    let remaining = registry.list().len();
    assert_eq!(remaining, 0, "Registry should be empty after clear");
}
317
+
318
/// Checks that `get_all` returns validators from highest to lowest priority,
/// independent of the order in which they were registered.
#[test]
fn test_get_all_validators_respects_priority() {
    /// Validator whose reported priority is chosen per instance.
    struct PriorityValidator {
        name: String,
        priority: i32,
    }

    impl Plugin for PriorityValidator {
        fn name(&self) -> &str {
            self.name.as_str()
        }

        fn version(&self) -> String {
            String::from("1.0.0")
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl Validator for PriorityValidator {
        async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
            Ok(())
        }

        fn priority(&self) -> i32 {
            self.priority
        }
    }

    let make = |name: &str, priority: i32| {
        Arc::new(PriorityValidator {
            name: name.to_string(),
            priority,
        })
    };

    let mut registry = ValidatorRegistry::new();

    // Deliberately registered out of priority order.
    registry.register(make("medium-priority", 50)).unwrap();
    registry.register(make("low-priority", 10)).unwrap();
    registry.register(make("high-priority", 100)).unwrap();

    let ordered = registry.get_all();
    assert_eq!(ordered.len(), 3, "Should have three validators");

    // Expected order is descending by priority.
    let expected_order = ["high-priority", "medium-priority", "low-priority"];
    for (index, expected) in expected_order.iter().enumerate() {
        assert_eq!(ordered[index].name(), *expected);
    }
}
378
+
379
+ // ===== Extractor Registry Tests =====
380
+
381
/// Registers one extractor and checks that the registry lists exactly that name.
#[test]
fn test_extractor_registration_succeeds() {
    let mock = MockExtractor {
        name: String::from("test-extractor"),
        mime_types: vec!["text/plain"],
        priority: 50,
    };
    let mut registry = DocumentExtractorRegistry::new();

    let outcome = registry.register(Arc::new(mock));
    assert!(outcome.is_ok(), "Extractor registration should succeed");

    let names = registry.list();
    assert_eq!(names.len(), 1, "Should have one extractor");
    assert!(names.iter().any(|name| name.as_str() == "test-extractor"));
}
399
+
400
/// Registers an extractor for `application/pdf` and checks that a lookup by
/// that MIME type returns it.
#[test]
fn test_get_extractor_by_mime_type_succeeds() {
    let mut registry = DocumentExtractorRegistry::new();
    registry
        .register(Arc::new(MockExtractor {
            name: String::from("pdf-extractor"),
            mime_types: vec!["application/pdf"],
            priority: 50,
        }))
        .unwrap();

    let lookup = registry.get("application/pdf");
    assert!(lookup.is_ok(), "Should find extractor for PDF");

    let found = lookup.unwrap();
    assert_eq!(found.name(), "pdf-extractor");
}
417
+
418
/// A lookup in an empty registry must fail with `UnsupportedFormat` carrying
/// the requested MIME type.
#[test]
fn test_get_extractor_for_unsupported_mime_fails() {
    let empty_registry = DocumentExtractorRegistry::new();

    let lookup = empty_registry.get("application/nonexistent");
    assert!(lookup.is_err(), "Should not find extractor for unsupported MIME type");

    let Err(KreuzbergError::UnsupportedFormat(mime)) = lookup else {
        panic!("Expected UnsupportedFormat error");
    };
    assert_eq!(mime, "application/nonexistent");
}
433
+
434
/// When two extractors claim the same MIME type, the lookup is expected to
/// return the one with the higher priority.
#[test]
fn test_extractor_priority_selection() {
    let mut registry = DocumentExtractorRegistry::new();

    // Both extractors handle text/plain; the lower priority one goes in first.
    for (name, priority) in [("low-priority-extractor", 10), ("high-priority-extractor", 100)] {
        let extractor = Arc::new(MockExtractor {
            name: name.to_string(),
            mime_types: vec!["text/plain"],
            priority,
        });
        registry.register(extractor).unwrap();
    }

    // The lookup should resolve to the priority-100 extractor.
    let selected = registry.get("text/plain").unwrap();
    assert_eq!(
        selected.name(),
        "high-priority-extractor",
        "Should select highest priority extractor"
    );
}
462
+
463
/// An extractor registered for `text/*` should be returned for `text/plain`
/// and `text/html`, but not for `application/pdf`.
#[test]
fn test_extractor_wildcard_mime_matching() {
    let mut registry = DocumentExtractorRegistry::new();
    registry
        .register(Arc::new(MockExtractor {
            name: String::from("text-extractor"),
            mime_types: vec!["text/*"],
            priority: 50,
        }))
        .unwrap();

    // text/plain falls under the wildcard.
    let plain = registry.get("text/plain");
    assert!(plain.is_ok(), "Should match text/plain with text/*");
    assert_eq!(plain.unwrap().name(), "text-extractor");

    // So does text/html.
    let html = registry.get("text/html");
    assert!(html.is_ok(), "Should match text/html with text/*");
    assert_eq!(html.unwrap().name(), "text-extractor");

    // A different top-level type must not match.
    let pdf = registry.get("application/pdf");
    assert!(pdf.is_err(), "Should not match application/pdf with text/*");
}
490
+
491
/// Registers an extractor, removes it by name, and checks that it is neither
/// listed nor returned for its MIME type afterwards.
#[test]
fn test_extractor_unregistration_succeeds() {
    let mut registry = DocumentExtractorRegistry::new();
    registry
        .register(Arc::new(MockExtractor {
            name: String::from("temp-extractor"),
            mime_types: vec!["text/plain"],
            priority: 50,
        }))
        .unwrap();
    assert_eq!(registry.list().len(), 1);

    let removal = registry.remove("temp-extractor");
    assert!(removal.is_ok(), "Unregistration should succeed");

    let remaining = registry.list().len();
    assert_eq!(remaining, 0, "Registry should be empty after removal");

    // The MIME type must no longer resolve to an extractor.
    let lookup = registry.get("text/plain");
    assert!(lookup.is_err(), "Should not find extractor after removal");
}
513
+
514
/// Registers one extractor for several MIME types and checks that every one
/// of them resolves to that same extractor.
///
/// Each registered MIME type is checked both for a successful lookup and for
/// the returned extractor's name, so no type is only partially verified.
#[test]
fn test_extractor_multiple_mime_types() {
    let supported = ["application/pdf", "application/vnd.ms-excel", "text/csv"];

    let mut registry = DocumentExtractorRegistry::new();
    registry
        .register(Arc::new(MockExtractor {
            name: "multi-format-extractor".to_string(),
            mime_types: supported.to_vec(),
            priority: 50,
        }))
        .unwrap();

    for mime in supported {
        let lookup = registry.get(mime);
        assert!(lookup.is_ok(), "Should find extractor for {mime}");
        assert_eq!(
            lookup.unwrap().name(),
            "multi-format-extractor",
            "Lookup for {mime} should return the registered extractor"
        );
    }
}
539
+
540
/// Registers two extractors, calls `shutdown_all`, and checks that the call
/// succeeds and the registry lists nothing afterwards.
#[test]
fn test_clear_extractors_succeeds() {
    let mut registry = DocumentExtractorRegistry::new();

    // (name, MIME type) pairs, registered in this order.
    for (name, mime) in [("extractor-1", "text/plain"), ("extractor-2", "application/pdf")] {
        let extractor = Arc::new(MockExtractor {
            name: name.to_string(),
            mime_types: vec![mime],
            priority: 50,
        });
        registry.register(extractor).unwrap();
    }
    assert_eq!(registry.list().len(), 2);

    let outcome = registry.shutdown_all();
    assert!(outcome.is_ok(), "Clear should succeed");

    let remaining = registry.list().len();
    assert_eq!(remaining, 0, "Registry should be empty after clear");
}
564
+
565
/// Registering an extractor with an empty name must fail with a `Validation`
/// error whose message mentions "empty".
#[test]
fn test_extractor_registration_with_empty_name_fails() {
    let nameless = Arc::new(MockExtractor {
        name: String::new(),
        mime_types: vec!["text/plain"],
        priority: 50,
    });
    let mut registry = DocumentExtractorRegistry::new();

    let outcome = registry.register(nameless);
    assert!(outcome.is_err(), "Registration with empty name should fail");

    let Err(KreuzbergError::Validation { message, .. }) = outcome else {
        panic!("Expected Validation error");
    };
    assert!(message.contains("empty"), "Error should mention empty name");
}
586
+
587
/// Registering an extractor whose name contains spaces must fail with a
/// `Validation` error whose message mentions "whitespace".
#[test]
fn test_extractor_registration_with_whitespace_fails() {
    let spaced = Arc::new(MockExtractor {
        name: String::from("extractor with spaces"),
        mime_types: vec!["text/plain"],
        priority: 50,
    });
    let mut registry = DocumentExtractorRegistry::new();

    let outcome = registry.register(spaced);
    assert!(outcome.is_err(), "Registration with whitespace should fail");

    let Err(KreuzbergError::Validation { message, .. }) = outcome else {
        panic!("Expected Validation error");
    };
    assert!(message.contains("whitespace"), "Error should mention whitespace");
}