kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,268 @@
1
+ //! Built-in document extractors.
2
+ //!
3
+ //! This module contains the default extractors that ship with Kreuzberg.
4
+ //! All extractors implement the `DocumentExtractor` plugin trait.
5
+
6
+ use crate::Result;
7
+ use crate::plugins::registry::get_document_extractor_registry;
8
+ use once_cell::sync::Lazy;
9
+ use std::sync::Arc;
10
+
11
+ pub mod structured;
12
+ pub mod text;
13
+
14
+ #[cfg(feature = "ocr")]
15
+ pub mod image;
16
+
17
+ #[cfg(feature = "archives")]
18
+ pub mod archive;
19
+
20
+ #[cfg(feature = "email")]
21
+ pub mod email;
22
+
23
+ #[cfg(feature = "excel")]
24
+ pub mod excel;
25
+
26
+ #[cfg(feature = "html")]
27
+ pub mod html;
28
+
29
+ #[cfg(feature = "office")]
30
+ pub mod docx;
31
+
32
+ #[cfg(feature = "office")]
33
+ pub mod pandoc;
34
+
35
+ #[cfg(feature = "pdf")]
36
+ pub mod pdf;
37
+
38
+ #[cfg(feature = "office")]
39
+ pub mod pptx;
40
+
41
+ #[cfg(feature = "xml")]
42
+ pub mod xml;
43
+
44
+ pub use structured::StructuredExtractor;
45
+ pub use text::{MarkdownExtractor, PlainTextExtractor};
46
+
47
+ #[cfg(feature = "ocr")]
48
+ pub use image::ImageExtractor;
49
+
50
+ #[cfg(feature = "archives")]
51
+ pub use archive::{SevenZExtractor, TarExtractor, ZipExtractor};
52
+
53
+ #[cfg(feature = "email")]
54
+ pub use email::EmailExtractor;
55
+
56
+ #[cfg(feature = "excel")]
57
+ pub use excel::ExcelExtractor;
58
+
59
+ #[cfg(feature = "html")]
60
+ pub use html::HtmlExtractor;
61
+
62
+ #[cfg(feature = "office")]
63
+ pub use docx::DocxExtractor;
64
+
65
+ #[cfg(feature = "office")]
66
+ pub use pandoc::PandocExtractor;
67
+
68
+ #[cfg(feature = "pdf")]
69
+ pub use pdf::PdfExtractor;
70
+
71
+ #[cfg(feature = "office")]
72
+ pub use pptx::PptxExtractor;
73
+
74
+ #[cfg(feature = "xml")]
75
+ pub use xml::XmlExtractor;
76
+
77
+ /// Lazy-initialized flag that ensures extractors are registered exactly once.
78
+ ///
79
+ /// This static is accessed on first extraction operation to automatically
80
+ /// register all built-in extractors with the plugin registry.
81
+ static EXTRACTORS_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_default_extractors);
82
+
83
+ /// Ensure built-in extractors are registered.
84
+ ///
85
+ /// This function is called automatically on first extraction operation.
86
+ /// It's safe to call multiple times - registration only happens once,
87
+ /// unless the registry was cleared, in which case extractors are re-registered.
88
+ pub fn ensure_initialized() -> Result<()> {
89
+ // First, try the lazy initialization
90
+ EXTRACTORS_INITIALIZED
91
+ .as_ref()
92
+ .map(|_| ())
93
+ .map_err(|e| crate::KreuzbergError::Plugin {
94
+ message: format!("Failed to register default extractors: {}", e),
95
+ plugin_name: "built-in-extractors".to_string(),
96
+ })?;
97
+
98
+ // Check if registry is empty (e.g., after clear_document_extractors)
99
+ // If so, re-register the default extractors
100
+ let registry = get_document_extractor_registry();
101
+ let registry_guard = registry
102
+ .read()
103
+ .map_err(|e| crate::KreuzbergError::Other(format!("Document extractor registry lock poisoned: {}", e)))?;
104
+
105
+ if registry_guard.list().is_empty() {
106
+ // Drop read lock before acquiring write lock
107
+ drop(registry_guard);
108
+ register_default_extractors()?;
109
+ }
110
+
111
+ Ok(())
112
+ }
113
+
114
+ /// Register all built-in extractors with the global registry.
115
+ ///
116
+ /// This function should be called once at application startup to register
117
+ /// the default extractors (PlainText, Markdown, XML, etc.).
118
+ ///
119
+ /// **Note:** This is called automatically on first extraction operation.
120
+ /// Explicit calling is optional.
121
+ ///
122
+ /// # Example
123
+ ///
124
+ /// ```rust
125
+ /// use kreuzberg::extractors::register_default_extractors;
126
+ ///
127
+ /// # fn main() -> kreuzberg::Result<()> {
128
+ /// register_default_extractors()?;
129
+ /// # Ok(())
130
+ /// # }
131
+ /// ```
132
+ pub fn register_default_extractors() -> Result<()> {
133
+ let registry = get_document_extractor_registry();
134
+ let mut registry = registry
135
+ .write()
136
+ .map_err(|e| crate::KreuzbergError::Other(format!("Document extractor registry lock poisoned: {}", e)))?;
137
+
138
+ registry.register(Arc::new(PlainTextExtractor::new()))?;
139
+ registry.register(Arc::new(MarkdownExtractor::new()))?;
140
+ registry.register(Arc::new(StructuredExtractor::new()))?;
141
+
142
+ #[cfg(feature = "ocr")]
143
+ registry.register(Arc::new(ImageExtractor::new()))?;
144
+
145
+ #[cfg(feature = "xml")]
146
+ registry.register(Arc::new(XmlExtractor::new()))?;
147
+
148
+ #[cfg(feature = "pdf")]
149
+ registry.register(Arc::new(PdfExtractor::new()))?;
150
+
151
+ #[cfg(feature = "excel")]
152
+ registry.register(Arc::new(ExcelExtractor::new()))?;
153
+
154
+ #[cfg(feature = "office")]
155
+ {
156
+ registry.register(Arc::new(DocxExtractor::new()))?;
157
+ registry.register(Arc::new(PptxExtractor::new()))?;
158
+ registry.register(Arc::new(PandocExtractor::new()))?;
159
+ }
160
+
161
+ #[cfg(feature = "email")]
162
+ registry.register(Arc::new(EmailExtractor::new()))?;
163
+
164
+ #[cfg(feature = "html")]
165
+ registry.register(Arc::new(HtmlExtractor::new()))?;
166
+
167
+ #[cfg(feature = "archives")]
168
+ {
169
+ registry.register(Arc::new(ZipExtractor::new()))?;
170
+ registry.register(Arc::new(TarExtractor::new()))?;
171
+ registry.register(Arc::new(SevenZExtractor::new()))?;
172
+ }
173
+
174
+ Ok(())
175
+ }
176
+
#[cfg(test)]
mod tests {
    use super::*;

    /// After resetting the registry, registering the defaults must yield
    /// exactly the extractors that the enabled Cargo features provide.
    #[test]
    fn test_register_default_extractors() {
        let registry = get_document_extractor_registry();

        // Start from a fresh registry so the final count is deterministic.
        {
            let mut fresh = registry
                .write()
                .expect("Failed to acquire write lock on registry in test");
            *fresh = crate::plugins::registry::DocumentExtractorRegistry::new();
        }

        register_default_extractors().expect("Failed to register extractors");

        let guard = registry
            .read()
            .expect("Failed to acquire read lock on registry in test");
        let extractor_names = guard.list();

        // Stays unmodified when no optional feature is enabled, hence the allow.
        #[allow(unused_mut)]
        let mut expected_names = vec!["plain-text-extractor", "markdown-extractor", "structured-extractor"];

        #[cfg(feature = "ocr")]
        expected_names.push("image-extractor");

        #[cfg(feature = "xml")]
        expected_names.push("xml-extractor");

        #[cfg(feature = "pdf")]
        expected_names.push("pdf-extractor");

        #[cfg(feature = "excel")]
        expected_names.push("excel-extractor");

        #[cfg(feature = "office")]
        expected_names.extend(["docx-extractor", "pptx-extractor", "pandoc-extractor"]);

        #[cfg(feature = "email")]
        expected_names.push("email-extractor");

        #[cfg(feature = "html")]
        expected_names.push("html-extractor");

        #[cfg(feature = "archives")]
        expected_names.extend(["zip-extractor", "tar-extractor", "7z-extractor"]);

        for name in &expected_names {
            assert!(
                extractor_names.contains(&name.to_string()),
                "missing built-in extractor: {}",
                name
            );
        }

        let expected_count = expected_names.len();
        assert_eq!(
            extractor_names.len(),
            expected_count,
            "Expected {} extractors based on enabled features",
            expected_count
        );
    }

    /// `ensure_initialized` must succeed when called directly.
    #[test]
    fn test_ensure_initialized() {
        ensure_initialized().expect("Failed to ensure extractors initialized");
    }
}
@@ -0,0 +1,201 @@
1
+ //! Pandoc-based extractors for various document formats.
2
+ //!
3
+ //! Supports: DOCX, ODT, EPUB, LaTeX, RST, RTF, and many more formats via Pandoc.
4
+
5
+ use crate::Result;
6
+ use crate::core::config::ExtractionConfig;
7
+ use crate::extraction::pandoc::extract_bytes_from_mime;
8
+ use crate::plugins::{DocumentExtractor, Plugin};
9
+ use crate::types::{ExtractionResult, Metadata};
10
+ use async_trait::async_trait;
11
+
/// Generic Pandoc extractor for all Pandoc-supported formats.
///
/// This extractor handles all document formats supported by Pandoc, including:
/// - Microsoft Word (DOCX)
/// - OpenDocument Text (ODT)
/// - EPUB
/// - LaTeX
/// - reStructuredText (RST)
/// - RTF
/// - And many more
///
/// The type is a stateless unit struct, so it is cheap to copy and can be
/// printed for diagnostics.
#[derive(Debug, Clone, Copy)]
pub struct PandocExtractor;
23
+
24
+ impl PandocExtractor {
25
+ /// Create a new Pandoc extractor.
26
+ pub fn new() -> Self {
27
+ Self
28
+ }
29
+ }
30
+
31
+ impl Default for PandocExtractor {
32
+ fn default() -> Self {
33
+ Self::new()
34
+ }
35
+ }
36
+
37
+ impl Plugin for PandocExtractor {
38
+ fn name(&self) -> &str {
39
+ "pandoc-extractor"
40
+ }
41
+
42
+ fn version(&self) -> String {
43
+ env!("CARGO_PKG_VERSION").to_string()
44
+ }
45
+
46
+ fn initialize(&self) -> Result<()> {
47
+ Ok(())
48
+ }
49
+
50
+ fn shutdown(&self) -> Result<()> {
51
+ Ok(())
52
+ }
53
+
54
+ fn description(&self) -> &str {
55
+ "Extracts content from Pandoc-supported formats (DOCX, ODT, EPUB, LaTeX, RST, RTF, etc.)"
56
+ }
57
+
58
+ fn author(&self) -> &str {
59
+ "Kreuzberg Team"
60
+ }
61
+ }
62
+
63
+ #[async_trait]
64
+ impl DocumentExtractor for PandocExtractor {
65
+ async fn extract_bytes(
66
+ &self,
67
+ content: &[u8],
68
+ mime_type: &str,
69
+ _config: &ExtractionConfig,
70
+ ) -> Result<ExtractionResult> {
71
+ let pandoc_result = extract_bytes_from_mime(content, mime_type).await?;
72
+
73
+ let mut additional = std::collections::HashMap::new();
74
+ for (key, value) in pandoc_result.metadata {
75
+ additional.insert(key, value);
76
+ }
77
+
78
+ Ok(ExtractionResult {
79
+ content: pandoc_result.content,
80
+ mime_type: mime_type.to_string(),
81
+ metadata: Metadata {
82
+ additional,
83
+ ..Default::default()
84
+ },
85
+ tables: vec![],
86
+ detected_languages: None,
87
+ chunks: None,
88
+ images: None,
89
+ })
90
+ }
91
+
92
+ fn supported_mime_types(&self) -> &[&str] {
93
+ &[
94
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
95
+ "application/vnd.oasis.opendocument.text",
96
+ "application/epub+zip",
97
+ "application/x-latex",
98
+ "text/x-tex",
99
+ "text/x-rst",
100
+ "text/prs.fallenstein.rst",
101
+ "application/rtf",
102
+ "text/rtf",
103
+ "application/x-typst",
104
+ "application/x-ipynb+json",
105
+ "application/x-fictionbook+xml",
106
+ "text/x-org",
107
+ "text/x-commonmark",
108
+ "text/x-gfm",
109
+ "text/x-multimarkdown",
110
+ "text/x-markdown-extra",
111
+ "application/docbook+xml",
112
+ "application/x-jats+xml",
113
+ "application/x-opml+xml",
114
+ ]
115
+ }
116
+
117
+ fn priority(&self) -> i32 {
118
+ 40
119
+ }
120
+ }
121
+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::extraction::pandoc::validate_pandoc_version;

    /// MIME type constant shared by the DOCX support test.
    const DOCX_MIME: &str = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    // The tests below only call synchronous methods, so they run as plain
    // `#[test]`s and do not need a Tokio runtime.

    #[test]
    fn test_pandoc_extractor_plugin_interface() {
        let extractor = PandocExtractor::new();
        assert_eq!(extractor.name(), "pandoc-extractor");
        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
        assert_eq!(extractor.priority(), 40);
        assert!(!extractor.supported_mime_types().is_empty());
    }

    #[test]
    fn test_pandoc_extractor_supports_docx() {
        let extractor = PandocExtractor::new();
        assert!(extractor.supported_mime_types().contains(&DOCX_MIME));
    }

    #[test]
    fn test_pandoc_extractor_supports_odt() {
        let extractor = PandocExtractor::new();
        assert!(
            extractor
                .supported_mime_types()
                .contains(&"application/vnd.oasis.opendocument.text")
        );
    }

    #[test]
    fn test_pandoc_extractor_supports_epub() {
        let extractor = PandocExtractor::new();
        assert!(extractor.supported_mime_types().contains(&"application/epub+zip"));
    }

    #[test]
    fn test_pandoc_extractor_supports_latex() {
        let extractor = PandocExtractor::new();
        assert!(extractor.supported_mime_types().contains(&"application/x-latex"));
    }

    #[test]
    fn test_pandoc_extractor_supports_rst() {
        let extractor = PandocExtractor::new();
        assert!(extractor.supported_mime_types().contains(&"text/x-rst"));
    }

    /// Runs a real conversion of Markdown input when Pandoc is installed.
    #[tokio::test]
    async fn test_pandoc_extractor_markdown() {
        // Skip on machines without a usable Pandoc installation.
        if validate_pandoc_version().await.is_err() {
            return;
        }

        let extractor = PandocExtractor::new();
        let markdown = b"# Hello World\n\nThis is a test.";
        let config = ExtractionConfig::default();
        // Use a Markdown MIME type so the declared format matches the input.
        let mime_type = "text/x-commonmark";

        let result = extractor.extract_bytes(markdown, mime_type, &config).await;

        // Conversion failures stay tolerated because they depend on the local
        // Pandoc setup, but a successful extraction must echo the MIME type.
        if let Ok(extraction) = result {
            assert_eq!(extraction.mime_type, mime_type);
        }
    }

    #[test]
    fn test_pandoc_extractor_default() {
        // Go through the `Default` impl rather than the unit-struct literal.
        // The trait-qualified form avoids clippy's
        // `default_constructed_unit_structs` lint.
        let extractor: PandocExtractor = Default::default();
        assert_eq!(extractor.name(), "pandoc-extractor");
    }

    #[test]
    fn test_pandoc_extractor_initialize_shutdown() {
        let extractor = PandocExtractor::new();
        assert!(extractor.initialize().is_ok());
        assert!(extractor.shutdown().is_ok());
    }
}