kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265)
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,209 @@
1
+ //! Plugin system for extending Kreuzberg functionality.
2
+ //!
3
+ //! The plugin system provides a trait-based architecture that allows extending
4
+ //! Kreuzberg with custom extractors, OCR backends, post-processors, and validators.
5
+ //!
6
+ //! # Plugin Types
7
+ //!
8
+ //! - [`Plugin`] - Base trait that all plugins must implement
9
+ //! - [`OcrBackend`] - OCR processing plugins
10
+ //! - [`DocumentExtractor`] - Document format extraction plugins
11
+ //! - [`PostProcessor`] - Content post-processing plugins
12
+ //! - [`Validator`] - Validation plugins
13
+ //!
14
+ //! # Language Support
15
+ //!
16
+ //! Plugins can be implemented in:
17
+ //! - **Rust** (native, highest performance)
18
+ //! - **Python** (via PyO3 FFI bridge)
19
+ //! - **Node.js** (future - via napi-rs FFI bridge)
20
+ //!
21
+ //! # Lifecycle Pattern
22
+ //!
23
+ //! Plugins are stored in `Arc<dyn Trait>` for thread-safe shared access:
24
+ //!
25
+ //! ```rust
26
+ //! use kreuzberg::plugins::{Plugin, DocumentExtractor};
27
+ //! use kreuzberg::plugins::registry::get_document_extractor_registry;
28
+ //! use std::sync::Arc;
29
+ //!
30
+ //! # struct MyExtractor;
31
+ //! # use kreuzberg::types::{ExtractionResult, Metadata};
32
+ //! # impl kreuzberg::plugins::Plugin for MyExtractor {
33
+ //! # fn name(&self) -> &str { "my" }
34
+ //! # fn version(&self) -> String { "1.0.0".to_string() }
35
+ //! # fn initialize(&self) -> kreuzberg::Result<()> { Ok(()) }
36
+ //! # fn shutdown(&self) -> kreuzberg::Result<()> { Ok(()) }
37
+ //! # }
38
+ //! # #[async_trait::async_trait]
39
+ //! # impl DocumentExtractor for MyExtractor {
40
+ //! # async fn extract_bytes(&self, _: &[u8], _: &str, _: &kreuzberg::ExtractionConfig)
41
+ //! # -> kreuzberg::Result<ExtractionResult> {
42
+ //! # Ok(ExtractionResult {
43
+ //! # content: String::new(),
44
+ //! # mime_type: String::new(),
45
+ //! # metadata: Metadata::default(),
46
+ //! # tables: vec![],
47
+ //! # detected_languages: None,
48
+ //! # chunks: None,
49
+ //! # images: None,
50
+ //! # })
51
+ //! # }
52
+ //! # async fn extract_file(&self, _: &std::path::Path, _: &str, _: &kreuzberg::ExtractionConfig)
53
+ //! # -> kreuzberg::Result<ExtractionResult> {
54
+ //! # Ok(ExtractionResult {
55
+ //! # content: String::new(),
56
+ //! # mime_type: String::new(),
57
+ //! # metadata: Metadata::default(),
58
+ //! # tables: vec![],
59
+ //! # detected_languages: None,
60
+ //! # chunks: None,
61
+ //! # images: None,
62
+ //! # })
63
+ //! # }
64
+ //! # fn supported_mime_types(&self) -> &[&str] { &[] }
65
+ //! # fn priority(&self) -> i32 { 50 }
66
+ //! # }
67
+ //! // 1. Create plugin instance
68
+ //! let plugin = MyExtractor;
69
+ //!
70
+ //! // 2. Wrap in Arc for registration
71
+ //! let plugin = Arc::new(plugin);
72
+ //!
73
+ //! // 3. Register with registry (calls initialize internally)
74
+ //! let registry = get_document_extractor_registry();
75
+ //! let mut registry = registry.write().unwrap();
76
+ //! registry.register(plugin)?;
77
+ //! # Ok::<(), kreuzberg::KreuzbergError>(())
78
+ //! ```
79
+ //!
80
+ //! # Example: Custom Document Extractor
81
+ //!
82
+ //! ```rust
83
+ //! use kreuzberg::plugins::{Plugin, DocumentExtractor};
84
+ //! use kreuzberg::{Result, ExtractionConfig};
85
+ //! use kreuzberg::types::{ExtractionResult, Metadata};
86
+ //! use async_trait::async_trait;
87
+ //! use std::path::Path;
88
+ //!
89
+ //! struct CustomJsonExtractor;
90
+ //!
91
+ //! impl Plugin for CustomJsonExtractor {
92
+ //! fn name(&self) -> &str { "custom-json-extractor" }
93
+ //! fn version(&self) -> String { "1.0.0".to_string() }
94
+ //! fn initialize(&self) -> Result<()> {
95
+ //! println!("JSON extractor initialized");
96
+ //! Ok(())
97
+ //! }
98
+ //! fn shutdown(&self) -> Result<()> {
99
+ //! println!("JSON extractor shutdown");
100
+ //! Ok(())
101
+ //! }
102
+ //! }
103
+ //!
104
+ //! #[async_trait]
105
+ //! impl DocumentExtractor for CustomJsonExtractor {
106
+ //! async fn extract_bytes(&self, content: &[u8], _mime_type: &str, _config: &ExtractionConfig)
107
+ //! -> Result<ExtractionResult> {
108
+ //! // Parse JSON and extract all string values
109
+ //! let json: serde_json::Value = serde_json::from_slice(content)?;
110
+ //! let extracted_text = extract_strings_from_json(&json);
111
+ //!
112
+ //! let mut metadata = Metadata::default();
113
+ //! metadata.additional.insert("extracted_fields".to_string(), serde_json::json!(true));
114
+ //!
115
+ //! Ok(ExtractionResult {
116
+ //! content: extracted_text,
117
+ //! mime_type: "application/json".to_string(),
118
+ //! metadata,
119
+ //! tables: vec![],
120
+ //! detected_languages: None,
121
+ //! chunks: None,
122
+ //! images: None,
123
+ //! })
124
+ //! }
125
+ //!
126
+ //! async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig)
127
+ //! -> Result<ExtractionResult> {
128
+ //! // Read file and delegate to extract_bytes
129
+ //! let content = tokio::fs::read(path).await?;
130
+ //! self.extract_bytes(&content, mime_type, config).await
131
+ //! }
132
+ //!
133
+ //! fn supported_mime_types(&self) -> &[&str] {
134
+ //! &["application/json", "text/json"]
135
+ //! }
136
+ //!
137
+ //! fn priority(&self) -> i32 { 50 } // Default priority
138
+ //! }
139
+ //!
140
+ //! fn extract_strings_from_json(value: &serde_json::Value) -> String {
141
+ //! match value {
142
+ //! serde_json::Value::String(s) => format!("{}\n", s),
143
+ //! serde_json::Value::Array(arr) => {
144
+ //! arr.iter().map(extract_strings_from_json).collect()
145
+ //! }
146
+ //! serde_json::Value::Object(obj) => {
147
+ //! obj.values().map(extract_strings_from_json).collect()
148
+ //! }
149
+ //! _ => String::new(),
150
+ //! }
151
+ //! }
152
+ //! ```
153
+ //!
154
+ //! # Safety and Threading
155
+ //!
156
+ //! **CRITICAL**: All plugins must be `Send + Sync` because they are:
157
+ //! - Stored in `Arc<dyn Trait>` for shared ownership
158
+ //! - Accessed concurrently from multiple threads
159
+ //! - Called with `&self` (shared references)
160
+ //!
161
+ //! **Interior Mutability Pattern**:
162
+ //! Since plugins receive `&self` (not `&mut self`), use these for mutable state:
163
+ //! - `Mutex<T>` - Exclusive access, blocking
164
+ //! - `RwLock<T>` - Shared read, exclusive write
165
+ //! - `AtomicBool` / `AtomicU64` - Lock-free primitives
166
+ //! - `OnceLock<T>` / `once_cell::sync::OnceCell<T>` - One-time initialization (note: `std::cell::OnceCell<T>` is not `Sync`)
167
+ //!
168
+ //! ```rust
169
+ //! use kreuzberg::plugins::Plugin;
170
+ //! use std::sync::Mutex;
171
+ //!
172
+ //! struct StatefulPlugin {
173
+ //! // Use interior mutability for state
174
+ //! call_count: std::sync::atomic::AtomicU64,
175
+ //! cache: Mutex<Option<Vec<String>>>,
176
+ //! }
177
+ //!
178
+ //! impl Plugin for StatefulPlugin {
179
+ //! fn name(&self) -> &str { "stateful-plugin" }
180
+ //! fn version(&self) -> String { "1.0.0".to_string() }
181
+ //!
182
+ //! fn initialize(&self) -> kreuzberg::Result<()> {
183
+ //! // Modify through interior mutability
184
+ //! let mut cache = self.cache.lock().unwrap();
185
+ //! *cache = Some(vec!["initialized".to_string()]);
186
+ //! Ok(())
187
+ //! }
188
+ //!
189
+ //! fn shutdown(&self) -> kreuzberg::Result<()> {
190
+ //! self.call_count.store(0, std::sync::atomic::Ordering::Release);
191
+ //! Ok(())
192
+ //! }
193
+ //! }
194
+ //! ```
195
+
196
+ mod extractor;
197
+ mod ocr;
198
+ mod processor;
199
+ pub mod registry;
200
+ mod traits;
201
+ mod validator;
202
+
203
+ pub use extractor::{DocumentExtractor, clear_extractors, list_extractors, register_extractor, unregister_extractor};
204
+ pub use ocr::{
205
+ OcrBackend, OcrBackendType, clear_ocr_backends, list_ocr_backends, register_ocr_backend, unregister_ocr_backend,
206
+ };
207
+ pub use processor::{PostProcessor, ProcessingStage, list_post_processors};
208
+ pub use traits::Plugin;
209
+ pub use validator::{Validator, clear_validators, list_validators, register_validator, unregister_validator};