kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,234 @@
1
+ //! PowerPoint presentation extractor.
2
+
3
+ use crate::Result;
4
+ use crate::core::config::ExtractionConfig;
5
+ use crate::plugins::{DocumentExtractor, Plugin};
6
+ use crate::types::{ExtractionResult, Metadata};
7
+ use async_trait::async_trait;
8
+ use std::path::Path;
9
+
10
+ #[cfg(feature = "ocr")]
11
+ use crate::ocr::OcrProcessor;
12
+
13
/// PowerPoint presentation extractor.
///
/// Supports: .pptx, .pptm, .ppsx
pub struct PptxExtractor;

impl Default for PptxExtractor {
    /// Equivalent to [`PptxExtractor::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl PptxExtractor {
    /// Create a new PowerPoint extractor. The type carries no state.
    pub fn new() -> Self {
        Self
    }

    /// Process extracted images with OCR if configured.
    ///
    /// When `config.ocr` is `None` the images are returned untouched. Otherwise
    /// every image is handed to an [`OcrProcessor`] inside
    /// `tokio::task::spawn_blocking`, and on success its `ocr_result` is set to
    /// an [`ExtractionResult`] holding the recognised text, with the image's
    /// own `format` used as the MIME type.
    ///
    /// # Errors
    ///
    /// Returns `KreuzbergError::Ocr` only when the blocking task itself cannot
    /// be joined. An error reported by the OCR processor for an individual
    /// image is deliberately not propagated: that image's `ocr_result` is set
    /// to `None` and the remaining images are still processed.
    #[cfg(feature = "ocr")]
    async fn process_images_with_ocr(
        &self,
        mut images: Vec<crate::types::ExtractedImage>,
        config: &ExtractionConfig,
    ) -> Result<Vec<crate::types::ExtractedImage>> {
        // Guard clause instead of `is_none()` + `unwrap()`: no panic path.
        let Some(ocr_config) = config.ocr.as_ref() else {
            return Ok(images);
        };

        let tess_config = ocr_config.tesseract_config.clone().unwrap_or_default();

        // Loop-invariant: resolve the cache directory once rather than once per image.
        let cache_dir = std::env::var("KREUZBERG_CACHE_DIR").ok().map(std::path::PathBuf::from);

        for image in &mut images {
            // The closure passed to `spawn_blocking` must own everything it
            // touches, so each iteration gets its own copies.
            let image_data = image.data.clone();
            let tess_config = tess_config.clone();
            let cache_dir = cache_dir.clone();

            let ocr_result = tokio::task::spawn_blocking(move || {
                let proc = OcrProcessor::new(cache_dir)?;
                let ocr_tess_config: crate::ocr::types::TesseractConfig = (&tess_config).into();
                proc.process_image(&image_data, &ocr_tess_config)
            })
            .await
            .map_err(|e| crate::KreuzbergError::Ocr {
                message: format!("OCR task failed: {}", e),
                source: None,
            })?;

            // Best effort per image: a failed OCR run leaves `ocr_result` empty.
            image.ocr_result = ocr_result.ok().map(|ocr_extraction| {
                Box::new(ExtractionResult {
                    content: ocr_extraction.content,
                    mime_type: image.format.clone(),
                    metadata: Metadata::default(),
                    tables: vec![],
                    detected_languages: None,
                    chunks: None,
                    images: None,
                })
            });
        }

        Ok(images)
    }
}
82
+
83
+ impl Plugin for PptxExtractor {
84
+ fn name(&self) -> &str {
85
+ "pptx-extractor"
86
+ }
87
+
88
+ fn version(&self) -> String {
89
+ env!("CARGO_PKG_VERSION").to_string()
90
+ }
91
+
92
+ fn initialize(&self) -> Result<()> {
93
+ Ok(())
94
+ }
95
+
96
+ fn shutdown(&self) -> Result<()> {
97
+ Ok(())
98
+ }
99
+ }
100
+
101
+ #[async_trait]
102
+ impl DocumentExtractor for PptxExtractor {
103
+ async fn extract_bytes(
104
+ &self,
105
+ content: &[u8],
106
+ mime_type: &str,
107
+ config: &ExtractionConfig,
108
+ ) -> Result<ExtractionResult> {
109
+ let extract_images = config.images.as_ref().is_some_and(|img| img.extract_images);
110
+
111
+ // Extract PPTX content
112
+ let pptx_result = if crate::core::batch_mode::is_batch_mode() {
113
+ // Batch mode: Use spawn_blocking for parallelism
114
+ let content_owned = content.to_vec();
115
+ tokio::task::spawn_blocking(move || {
116
+ crate::extraction::pptx::extract_pptx_from_bytes(&content_owned, extract_images)
117
+ })
118
+ .await
119
+ .map_err(|e| crate::error::KreuzbergError::parsing(format!("PPTX extraction task failed: {}", e)))??
120
+ } else {
121
+ // Single-file mode: Direct extraction (no spawn overhead)
122
+ crate::extraction::pptx::extract_pptx_from_bytes(content, extract_images)?
123
+ };
124
+
125
+ let mut additional = std::collections::HashMap::new();
126
+ additional.insert("slide_count".to_string(), serde_json::json!(pptx_result.slide_count));
127
+ additional.insert("image_count".to_string(), serde_json::json!(pptx_result.image_count));
128
+ additional.insert("table_count".to_string(), serde_json::json!(pptx_result.table_count));
129
+
130
+ let images = if !pptx_result.images.is_empty() {
131
+ #[cfg(feature = "ocr")]
132
+ {
133
+ let processed_images = self.process_images_with_ocr(pptx_result.images, config).await?;
134
+ Some(processed_images)
135
+ }
136
+ #[cfg(not(feature = "ocr"))]
137
+ {
138
+ Some(pptx_result.images)
139
+ }
140
+ } else {
141
+ None
142
+ };
143
+
144
+ Ok(ExtractionResult {
145
+ content: pptx_result.content,
146
+ mime_type: mime_type.to_string(),
147
+ metadata: Metadata {
148
+ format: Some(crate::types::FormatMetadata::Pptx(pptx_result.metadata)),
149
+ additional,
150
+ ..Default::default()
151
+ },
152
+ tables: vec![],
153
+ detected_languages: None,
154
+ chunks: None,
155
+ images,
156
+ })
157
+ }
158
+
159
+ async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
160
+ let path_str = path
161
+ .to_str()
162
+ .ok_or_else(|| crate::KreuzbergError::validation("Invalid file path".to_string()))?;
163
+
164
+ let extract_images = config.images.as_ref().is_some_and(|img| img.extract_images);
165
+
166
+ let pptx_result = crate::extraction::pptx::extract_pptx_from_path(path_str, extract_images)?;
167
+
168
+ let mut additional = std::collections::HashMap::new();
169
+ additional.insert("slide_count".to_string(), serde_json::json!(pptx_result.slide_count));
170
+ additional.insert("image_count".to_string(), serde_json::json!(pptx_result.image_count));
171
+ additional.insert("table_count".to_string(), serde_json::json!(pptx_result.table_count));
172
+
173
+ let images = if !pptx_result.images.is_empty() {
174
+ #[cfg(feature = "ocr")]
175
+ {
176
+ let processed_images = self.process_images_with_ocr(pptx_result.images, config).await?;
177
+ Some(processed_images)
178
+ }
179
+ #[cfg(not(feature = "ocr"))]
180
+ {
181
+ Some(pptx_result.images)
182
+ }
183
+ } else {
184
+ None
185
+ };
186
+
187
+ Ok(ExtractionResult {
188
+ content: pptx_result.content,
189
+ mime_type: mime_type.to_string(),
190
+ metadata: Metadata {
191
+ format: Some(crate::types::FormatMetadata::Pptx(pptx_result.metadata)),
192
+ additional,
193
+ ..Default::default()
194
+ },
195
+ tables: vec![],
196
+ detected_languages: None,
197
+ chunks: None,
198
+ images,
199
+ })
200
+ }
201
+
202
+ fn supported_mime_types(&self) -> &[&str] {
203
+ &[
204
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
205
+ "application/vnd.ms-powerpoint.presentation.macroEnabled.12",
206
+ "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
207
+ ]
208
+ }
209
+
210
+ fn priority(&self) -> i32 {
211
+ 50
212
+ }
213
+ }
214
+
215
#[cfg(test)]
mod tests {
    use super::*;

    /// The `Plugin` surface: fixed name, and lifecycle hooks that succeed.
    #[test]
    fn test_pptx_extractor_plugin_interface() {
        let pptx = PptxExtractor::new();

        assert_eq!(pptx.name(), "pptx-extractor");
        assert!(pptx.initialize().is_ok());
        assert!(pptx.shutdown().is_ok());
    }

    /// Exactly three MIME types are advertised, including the standard PPTX one.
    #[test]
    fn test_pptx_extractor_supported_mime_types() {
        let pptx = PptxExtractor::new();
        let advertised = pptx.supported_mime_types();

        assert_eq!(advertised.len(), 3);
        assert!(advertised.contains(&"application/vnd.openxmlformats-officedocument.presentationml.presentation"));
    }
}
@@ -0,0 +1,126 @@
1
+ //! Structured data extractor (JSON, YAML, TOML).
2
+
3
+ use crate::Result;
4
+ use crate::core::config::ExtractionConfig;
5
+ use crate::plugins::{DocumentExtractor, Plugin};
6
+ use crate::types::{ExtractionResult, Metadata};
7
+ use async_trait::async_trait;
8
+ use std::path::Path;
9
+
10
/// Extractor for structured data documents: JSON, YAML, and TOML.
pub struct StructuredExtractor;

impl StructuredExtractor {
    /// Create a new structured data extractor. The type holds no state.
    pub fn new() -> Self {
        StructuredExtractor
    }
}

impl Default for StructuredExtractor {
    /// Same as [`StructuredExtractor::new`].
    fn default() -> Self {
        StructuredExtractor::new()
    }
}
24
+
25
+ impl Plugin for StructuredExtractor {
26
+ fn name(&self) -> &str {
27
+ "structured-extractor"
28
+ }
29
+
30
+ fn version(&self) -> String {
31
+ env!("CARGO_PKG_VERSION").to_string()
32
+ }
33
+
34
+ fn initialize(&self) -> Result<()> {
35
+ Ok(())
36
+ }
37
+
38
+ fn shutdown(&self) -> Result<()> {
39
+ Ok(())
40
+ }
41
+ }
42
+
43
+ #[async_trait]
44
+ impl DocumentExtractor for StructuredExtractor {
45
+ async fn extract_bytes(
46
+ &self,
47
+ content: &[u8],
48
+ mime_type: &str,
49
+ _config: &ExtractionConfig,
50
+ ) -> Result<ExtractionResult> {
51
+ let structured_result = match mime_type {
52
+ "application/json" | "text/json" => crate::extraction::structured::parse_json(content, None)?,
53
+ "application/x-yaml" | "text/yaml" | "text/x-yaml" => crate::extraction::structured::parse_yaml(content)?,
54
+ "application/toml" | "text/toml" => crate::extraction::structured::parse_toml(content)?,
55
+ _ => return Err(crate::KreuzbergError::UnsupportedFormat(mime_type.to_string())),
56
+ };
57
+
58
+ let mut additional = std::collections::HashMap::new();
59
+ additional.insert(
60
+ "field_count".to_string(),
61
+ serde_json::json!(structured_result.text_fields.len()),
62
+ );
63
+ additional.insert("data_format".to_string(), serde_json::json!(structured_result.format));
64
+
65
+ for (key, value) in structured_result.metadata {
66
+ additional.insert(key, serde_json::json!(value));
67
+ }
68
+
69
+ Ok(ExtractionResult {
70
+ content: structured_result.content,
71
+ mime_type: mime_type.to_string(),
72
+ metadata: Metadata {
73
+ additional,
74
+ ..Default::default()
75
+ },
76
+ tables: vec![],
77
+ detected_languages: None,
78
+ chunks: None,
79
+ images: None,
80
+ })
81
+ }
82
+
83
+ async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
84
+ let bytes = tokio::fs::read(path).await?;
85
+ self.extract_bytes(&bytes, mime_type, config).await
86
+ }
87
+
88
+ fn supported_mime_types(&self) -> &[&str] {
89
+ &[
90
+ "application/json",
91
+ "text/json",
92
+ "application/x-yaml",
93
+ "text/yaml",
94
+ "text/x-yaml",
95
+ "application/toml",
96
+ "text/toml",
97
+ ]
98
+ }
99
+
100
+ fn priority(&self) -> i32 {
101
+ 50
102
+ }
103
+ }
104
+
105
#[cfg(test)]
mod tests {
    use super::*;

    /// The `Plugin` surface: fixed name, and lifecycle hooks that succeed.
    #[test]
    fn test_structured_extractor_plugin_interface() {
        let structured = StructuredExtractor::new();

        assert_eq!(structured.name(), "structured-extractor");
        assert!(structured.initialize().is_ok());
        assert!(structured.shutdown().is_ok());
    }

    /// Seven MIME types are advertised, covering JSON, YAML and TOML.
    #[test]
    fn test_structured_extractor_supported_mime_types() {
        let structured = StructuredExtractor::new();
        let advertised = structured.supported_mime_types();

        assert_eq!(advertised.len(), 7);
        for expected in ["application/json", "application/x-yaml", "application/toml"] {
            assert!(advertised.contains(&expected));
        }
    }
}
@@ -0,0 +1,242 @@
1
+ //! Plain text and Markdown extractors.
2
+
3
+ use crate::Result;
4
+ use crate::core::config::ExtractionConfig;
5
+ use crate::extraction::text::parse_text;
6
+ use crate::plugins::{DocumentExtractor, Plugin};
7
+ use crate::types::ExtractionResult;
8
+ use async_trait::async_trait;
9
+
10
/// Plain text extractor.
///
/// Handles content from plain text files (.txt).
pub struct PlainTextExtractor;

impl Default for PlainTextExtractor {
    /// Same as [`PlainTextExtractor::new`].
    fn default() -> Self {
        PlainTextExtractor::new()
    }
}

impl PlainTextExtractor {
    /// Build a plain text extractor. The type holds no state.
    pub fn new() -> Self {
        PlainTextExtractor
    }
}
27
+
28
+ impl Plugin for PlainTextExtractor {
29
+ fn name(&self) -> &str {
30
+ "plain-text-extractor"
31
+ }
32
+
33
+ fn version(&self) -> String {
34
+ env!("CARGO_PKG_VERSION").to_string()
35
+ }
36
+
37
+ fn initialize(&self) -> Result<()> {
38
+ Ok(())
39
+ }
40
+
41
+ fn shutdown(&self) -> Result<()> {
42
+ Ok(())
43
+ }
44
+
45
+ fn description(&self) -> &str {
46
+ "Extracts content from plain text files"
47
+ }
48
+
49
+ fn author(&self) -> &str {
50
+ "Kreuzberg Team"
51
+ }
52
+ }
53
+
54
+ #[async_trait]
55
+ impl DocumentExtractor for PlainTextExtractor {
56
+ async fn extract_bytes(
57
+ &self,
58
+ content: &[u8],
59
+ mime_type: &str,
60
+ _config: &ExtractionConfig,
61
+ ) -> Result<ExtractionResult> {
62
+ let text_result = parse_text(content, false)?;
63
+
64
+ Ok(ExtractionResult {
65
+ content: text_result.content,
66
+ mime_type: mime_type.to_string(),
67
+ metadata: crate::types::Metadata {
68
+ format: Some(crate::types::FormatMetadata::Text(crate::types::TextMetadata {
69
+ line_count: text_result.line_count,
70
+ word_count: text_result.word_count,
71
+ character_count: text_result.character_count,
72
+ headers: None,
73
+ links: None,
74
+ code_blocks: None,
75
+ })),
76
+ ..Default::default()
77
+ },
78
+ tables: vec![],
79
+ detected_languages: None,
80
+ chunks: None,
81
+ images: None,
82
+ })
83
+ }
84
+
85
+ fn supported_mime_types(&self) -> &[&str] {
86
+ &["text/plain"]
87
+ }
88
+
89
+ fn priority(&self) -> i32 {
90
+ 50
91
+ }
92
+ }
93
+
94
/// Markdown extractor.
///
/// Handles content from Markdown files (.md, .markdown).
/// Markdown syntax is preserved, and metadata such as headers, links, and
/// code blocks is extracted.
pub struct MarkdownExtractor;

impl Default for MarkdownExtractor {
    /// Same as [`MarkdownExtractor::new`].
    fn default() -> Self {
        MarkdownExtractor::new()
    }
}

impl MarkdownExtractor {
    /// Build a Markdown extractor. The type holds no state.
    pub fn new() -> Self {
        MarkdownExtractor
    }
}
112
+
113
+ impl Plugin for MarkdownExtractor {
114
+ fn name(&self) -> &str {
115
+ "markdown-extractor"
116
+ }
117
+
118
+ fn version(&self) -> String {
119
+ env!("CARGO_PKG_VERSION").to_string()
120
+ }
121
+
122
+ fn initialize(&self) -> Result<()> {
123
+ Ok(())
124
+ }
125
+
126
+ fn shutdown(&self) -> Result<()> {
127
+ Ok(())
128
+ }
129
+
130
+ fn description(&self) -> &str {
131
+ "Extracts content from Markdown files with metadata parsing"
132
+ }
133
+
134
+ fn author(&self) -> &str {
135
+ "Kreuzberg Team"
136
+ }
137
+ }
138
+
139
+ #[async_trait]
140
+ impl DocumentExtractor for MarkdownExtractor {
141
+ async fn extract_bytes(
142
+ &self,
143
+ content: &[u8],
144
+ mime_type: &str,
145
+ _config: &ExtractionConfig,
146
+ ) -> Result<ExtractionResult> {
147
+ let text_result = parse_text(content, true)?;
148
+
149
+ Ok(ExtractionResult {
150
+ content: text_result.content,
151
+ mime_type: mime_type.to_string(),
152
+ metadata: crate::types::Metadata {
153
+ format: Some(crate::types::FormatMetadata::Text(crate::types::TextMetadata {
154
+ line_count: text_result.line_count,
155
+ word_count: text_result.word_count,
156
+ character_count: text_result.character_count,
157
+ headers: text_result.headers,
158
+ links: text_result.links,
159
+ code_blocks: text_result.code_blocks,
160
+ })),
161
+ ..Default::default()
162
+ },
163
+ tables: vec![],
164
+ detected_languages: None,
165
+ chunks: None,
166
+ images: None,
167
+ })
168
+ }
169
+
170
+ fn supported_mime_types(&self) -> &[&str] {
171
+ &["text/markdown", "text/x-markdown"]
172
+ }
173
+
174
+ fn priority(&self) -> i32 {
175
+ 50
176
+ }
177
+ }
178
+
179
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain text: content is passed through and line/word counts are reported.
    #[tokio::test]
    async fn test_plain_text_extractor() {
        let config = ExtractionConfig::default();
        let input = b"Hello, World!\nThis is a test.";

        let result = PlainTextExtractor::new()
            .extract_bytes(input, "text/plain", &config)
            .await
            .unwrap();

        assert_eq!(result.mime_type, "text/plain");
        assert!(result.content.contains("Hello, World!"));
        assert!(result.metadata.format.is_some());
        let Some(crate::types::FormatMetadata::Text(text_meta)) = result.metadata.format.as_ref() else {
            panic!("Expected Text metadata");
        };
        assert_eq!(text_meta.line_count, 2);
        assert_eq!(text_meta.word_count, 6);
    }

    /// Markdown: syntax is preserved and headers, links and code blocks are populated.
    #[tokio::test]
    async fn test_markdown_extractor() {
        let config = ExtractionConfig::default();
        let input = b"# Header\n\nThis is [a link](https://example.com).\n\n```python\nprint(\"hello\")\n```";

        let result = MarkdownExtractor::new()
            .extract_bytes(input, "text/markdown", &config)
            .await
            .unwrap();

        assert_eq!(result.mime_type, "text/markdown");
        assert!(result.content.contains("# Header"));
        assert!(result.metadata.format.is_some());
        let Some(crate::types::FormatMetadata::Text(text_meta)) = result.metadata.format.as_ref() else {
            panic!("Expected Text metadata");
        };
        assert!(text_meta.headers.is_some());
        assert!(text_meta.links.is_some());
        assert!(text_meta.code_blocks.is_some());
    }

    /// Name, version, MIME types and priority of the plain text plugin.
    #[test]
    fn test_plain_text_plugin_interface() {
        let plain = PlainTextExtractor::new();

        assert_eq!(plain.name(), "plain-text-extractor");
        assert_eq!(plain.version(), env!("CARGO_PKG_VERSION"));
        assert_eq!(plain.supported_mime_types(), &["text/plain"]);
        assert_eq!(plain.priority(), 50);
    }

    /// Name, version, MIME types and priority of the Markdown plugin.
    #[test]
    fn test_markdown_plugin_interface() {
        let markdown = MarkdownExtractor::new();

        assert_eq!(markdown.name(), "markdown-extractor");
        assert_eq!(markdown.version(), env!("CARGO_PKG_VERSION"));
        assert_eq!(markdown.supported_mime_types(), &["text/markdown", "text/x-markdown"]);
        assert_eq!(markdown.priority(), 50);
    }
}