kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,128 @@
1
+ //! XML extractor.
2
+
3
+ use crate::Result;
4
+ use crate::core::config::ExtractionConfig;
5
+ use crate::extraction::xml::parse_xml;
6
+ use crate::plugins::{DocumentExtractor, Plugin};
7
+ use crate::types::ExtractionResult;
8
+ use async_trait::async_trait;
9
+
10
/// Document extractor plugin for XML input.
///
/// The type is a field-less unit struct, so every instance is interchangeable;
/// text and element statistics are produced by the trait implementations below.
#[derive(Default)]
pub struct XmlExtractor;

impl XmlExtractor {
    /// Build an extractor. There is no state to configure, so this is
    /// equivalent to [`XmlExtractor::default`].
    pub fn new() -> Self {
        XmlExtractor
    }
}
27
+
28
+ impl Plugin for XmlExtractor {
29
+ fn name(&self) -> &str {
30
+ "xml-extractor"
31
+ }
32
+
33
+ fn version(&self) -> String {
34
+ env!("CARGO_PKG_VERSION").to_string()
35
+ }
36
+
37
+ fn initialize(&self) -> Result<()> {
38
+ Ok(())
39
+ }
40
+
41
+ fn shutdown(&self) -> Result<()> {
42
+ Ok(())
43
+ }
44
+
45
+ fn description(&self) -> &str {
46
+ "Extracts text content from XML files with element metadata"
47
+ }
48
+
49
+ fn author(&self) -> &str {
50
+ "Kreuzberg Team"
51
+ }
52
+ }
53
+
54
+ #[async_trait]
55
+ impl DocumentExtractor for XmlExtractor {
56
+ async fn extract_bytes(
57
+ &self,
58
+ content: &[u8],
59
+ mime_type: &str,
60
+ _config: &ExtractionConfig,
61
+ ) -> Result<ExtractionResult> {
62
+ let xml_result = parse_xml(content, false)?;
63
+
64
+ Ok(ExtractionResult {
65
+ content: xml_result.content,
66
+ mime_type: mime_type.to_string(),
67
+ metadata: crate::types::Metadata {
68
+ format: Some(crate::types::FormatMetadata::Xml(crate::types::XmlMetadata {
69
+ element_count: xml_result.element_count,
70
+ unique_elements: xml_result.unique_elements,
71
+ })),
72
+ ..Default::default()
73
+ },
74
+ tables: vec![],
75
+ detected_languages: None,
76
+ chunks: None,
77
+ images: None,
78
+ })
79
+ }
80
+
81
+ fn supported_mime_types(&self) -> &[&str] {
82
+ &["application/xml", "text/xml", "image/svg+xml"]
83
+ }
84
+
85
+ fn priority(&self) -> i32 {
86
+ 50
87
+ }
88
+ }
89
+
90
#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end check: a tiny document yields its text, the echoed MIME type,
    /// and XML metadata listing both element names.
    #[tokio::test]
    async fn test_xml_extractor() {
        let config = ExtractionConfig::default();
        let content = b"<root><item>Hello</item><item>World</item></root>";

        let result = XmlExtractor::new()
            .extract_bytes(content, "application/xml", &config)
            .await
            .unwrap();

        assert_eq!(result.mime_type, "application/xml");
        assert_eq!(result.content, "Hello World");
        assert!(result.metadata.format.is_some());

        let xml_meta = match result.metadata.format.as_ref() {
            Some(crate::types::FormatMetadata::Xml(meta)) => meta,
            _ => panic!("Expected Xml metadata"),
        };
        assert_eq!(xml_meta.element_count, 3);
        for expected in ["root", "item"] {
            assert!(xml_meta.unique_elements.contains(&expected.to_string()));
        }
    }

    /// The plugin-facing accessors return their fixed values.
    #[test]
    fn test_xml_plugin_interface() {
        let extractor = XmlExtractor::new();

        assert_eq!(extractor.name(), "xml-extractor");
        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
        assert_eq!(extractor.priority(), 50);
        assert_eq!(
            extractor.supported_mime_types(),
            &["application/xml", "text/xml", "image/svg+xml"]
        );
    }
}
@@ -0,0 +1,164 @@
1
+ /// PDF points per inch constant
2
+ const PDF_POINTS_PER_INCH: f64 = 72.0;
3
+
4
+ /// Calculate smart DPI based on page dimensions, memory constraints, and target DPI
5
+ #[allow(clippy::cast_possible_truncation)]
6
+ pub fn calculate_smart_dpi(
7
+ page_width: f64,
8
+ page_height: f64,
9
+ target_dpi: i32,
10
+ max_dimension: i32,
11
+ max_memory_mb: f64,
12
+ ) -> i32 {
13
+ let width_inches = page_width / PDF_POINTS_PER_INCH;
14
+ let height_inches = page_height / PDF_POINTS_PER_INCH;
15
+
16
+ let max_pixels = (max_memory_mb * 1024.0 * 1024.0 / 3.0).sqrt().round() as i32;
17
+
18
+ let max_dpi_for_memory_width = if width_inches > 0.0 {
19
+ (f64::from(max_pixels) / width_inches).round() as i32
20
+ } else {
21
+ target_dpi
22
+ };
23
+
24
+ let max_dpi_for_memory_height = if height_inches > 0.0 {
25
+ (f64::from(max_pixels) / height_inches).round() as i32
26
+ } else {
27
+ target_dpi
28
+ };
29
+
30
+ let memory_constrained_dpi = max_dpi_for_memory_width.min(max_dpi_for_memory_height);
31
+
32
+ let dimension_constrained_dpi =
33
+ calculate_dimension_constrained_dpi(width_inches, height_inches, target_dpi, max_dimension);
34
+
35
+ let final_dpi = target_dpi.min(memory_constrained_dpi).min(dimension_constrained_dpi);
36
+
37
+ final_dpi.max(72)
38
+ }
39
+
40
/// Limit `target_dpi` so that neither page side exceeds `max_dimension` pixels.
///
/// Returns `target_dpi` unchanged when the page rendered at that DPI already
/// fits. Otherwise returns the lower of the per-side DPI caps; a side whose
/// length in inches is not positive contributes `target_dpi` as its cap.
#[allow(clippy::cast_possible_truncation)]
fn calculate_dimension_constrained_dpi(
    width_inches: f64,
    height_inches: f64,
    target_dpi: i32,
    max_dimension: i32,
) -> i32 {
    // Pixel length of a side when rendered at the requested DPI.
    let pixels_at_target = |inches: f64| (inches * f64::from(target_dpi)).round() as i32;

    let longest_side = pixels_at_target(width_inches).max(pixels_at_target(height_inches));
    if longest_side <= max_dimension {
        return target_dpi;
    }

    // Highest DPI at which a side of the given length still fits in `max_dimension`.
    let dpi_cap = |inches: f64| {
        if inches > 0.0 {
            (f64::from(max_dimension) / inches).round() as i32
        } else {
            target_dpi
        }
    };

    dpi_cap(width_inches).min(dpi_cap(height_inches))
}
70
+
71
+ /// Calculate optimal DPI with min/max constraints
72
+ pub fn calculate_optimal_dpi(
73
+ page_width: f64,
74
+ page_height: f64,
75
+ target_dpi: i32,
76
+ max_dimension: i32,
77
+ min_dpi: i32,
78
+ max_dpi: i32,
79
+ ) -> i32 {
80
+ let smart_dpi = calculate_smart_dpi(page_width, page_height, target_dpi, max_dimension, 2048.0);
81
+
82
+ min_dpi.max(smart_dpi.min(max_dpi))
83
+ }
84
+
85
#[cfg(test)]
mod tests {
    use super::*;

    /// A Letter-sized page with generous limits stays within [72, 300].
    #[test]
    fn test_calculate_smart_dpi_basic() {
        let dpi = calculate_smart_dpi(612.0, 792.0, 300, 4096, 2048.0);
        assert!((72..=300).contains(&dpi));
    }

    /// A 10 MiB budget forces the DPI below the target but not below the floor.
    #[test]
    fn test_calculate_smart_dpi_memory_constrained() {
        let dpi = calculate_smart_dpi(1224.0, 1584.0, 300, 8192, 10.0);
        assert!((72..300).contains(&dpi));
    }

    /// A 1000-pixel side cap forces the DPI below the target.
    #[test]
    fn test_calculate_smart_dpi_dimension_constrained() {
        let dpi = calculate_smart_dpi(612.0, 792.0, 300, 1000, 2048.0);
        assert!(dpi < 300);
    }

    /// Extreme constraints bottom out at the 72 DPI floor.
    #[test]
    fn test_calculate_smart_dpi_minimum_dpi() {
        let dpi = calculate_smart_dpi(10000.0, 10000.0, 300, 100, 1.0);
        assert_eq!(dpi, 72);
    }

    /// Zero-length sides must not break the calculation.
    #[test]
    fn test_calculate_smart_dpi_zero_dimensions() {
        for (page_width, page_height) in [(0.0, 792.0), (612.0, 0.0)] {
            let dpi = calculate_smart_dpi(page_width, page_height, 300, 4096, 2048.0);
            assert!(dpi >= 72);
        }

        let degenerate = calculate_smart_dpi(0.0, 0.0, 300, 4096, 2048.0);
        assert_eq!(degenerate, 300);
    }

    /// The dimension helper never exceeds the target and reduces it when needed.
    #[test]
    fn test_calculate_dimension_constrained_dpi() {
        let fits = calculate_dimension_constrained_dpi(8.5, 11.0, 300, 4096);
        assert!(fits <= 300);

        let reduced = calculate_dimension_constrained_dpi(8.5, 11.0, 600, 2000);
        assert!(reduced < 600);
    }

    /// The min/max bounds are applied on top of the smart DPI.
    #[test]
    fn test_calculate_optimal_dpi() {
        let within = calculate_optimal_dpi(612.0, 792.0, 300, 4096, 72, 600);
        assert!((72..=600).contains(&within));

        let raised_to_min = calculate_optimal_dpi(10000.0, 10000.0, 300, 100, 100, 600);
        assert_eq!(raised_to_min, 100);

        let capped_at_max = calculate_optimal_dpi(72.0, 72.0, 1000, 10000, 72, 600);
        assert_eq!(capped_at_max, 600);
    }

    /// With a huge target and dimension cap, the memory budget is what limits the DPI.
    #[test]
    fn test_memory_calculation() {
        let dpi = calculate_smart_dpi(612.0, 792.0, 10000, 100000, 2048.0);
        assert!((72..10000).contains(&dpi));
    }

    /// Wide and tall pages of the same area both respect the floor.
    #[test]
    fn test_aspect_ratio_preservation() {
        let wide_dpi = calculate_smart_dpi(1224.0, 396.0, 300, 4096, 2048.0);
        let tall_dpi = calculate_smart_dpi(396.0, 1224.0, 300, 4096, 2048.0);

        assert!(wide_dpi >= 72);
        assert!(tall_dpi >= 72);
    }
}
@@ -0,0 +1,6 @@
1
+ pub mod dpi;
2
+ pub mod preprocessing;
3
+ pub mod resize;
4
+
5
+ pub use dpi::calculate_optimal_dpi;
6
+ pub use preprocessing::{NormalizeResult, normalize_image_dpi};
@@ -0,0 +1,417 @@
1
+ use crate::error::{KreuzbergError, Result};
2
+ use crate::types::{ExtractionConfig, ImagePreprocessingMetadata};
3
+ use image::{DynamicImage, ImageBuffer, Rgb};
4
+
5
+ use super::dpi::calculate_smart_dpi;
6
+ use super::resize::resize_image;
7
+
8
/// PDF points per inch. Also the DPI assumed for an image when the caller supplies none.
const PDF_POINTS_PER_INCH: f64 = 72.0;
9
+
10
+ /// Result of image normalization
11
+ pub struct NormalizeResult {
12
+ /// Processed RGB image data (height * width * 3 bytes)
13
+ pub rgb_data: Vec<u8>,
14
+ /// Image dimensions (width, height)
15
+ pub dimensions: (usize, usize),
16
+ /// Preprocessing metadata
17
+ pub metadata: ImagePreprocessingMetadata,
18
+ }
19
+
20
+ /// Normalize image DPI based on extraction configuration
21
+ ///
22
+ /// # Arguments
23
+ /// * `rgb_data` - RGB image data as a flat `Vec<u8>` (height * width * 3 bytes, row-major)
24
+ /// * `width` - Image width in pixels
25
+ /// * `height` - Image height in pixels
26
+ /// * `config` - Extraction configuration containing DPI settings
27
+ /// * `current_dpi` - Optional current DPI of the image (defaults to 72 if None)
28
+ ///
29
+ /// # Returns
30
+ /// * `NormalizeResult` containing processed image data and metadata
31
+ pub fn normalize_image_dpi(
32
+ rgb_data: &[u8],
33
+ width: usize,
34
+ height: usize,
35
+ config: &ExtractionConfig,
36
+ current_dpi: Option<f64>,
37
+ ) -> Result<NormalizeResult> {
38
+ if width > 65536 || height > 65536 {
39
+ return Err(KreuzbergError::validation(format!(
40
+ "Image dimensions {}x{} exceed maximum 65536x65536",
41
+ width, height
42
+ )));
43
+ }
44
+
45
+ let expected_size = height * width * 3;
46
+ if rgb_data.len() != expected_size {
47
+ return Err(KreuzbergError::validation(format!(
48
+ "RGB data size {} does not match expected size {} for {}x{} image",
49
+ rgb_data.len(),
50
+ expected_size,
51
+ width,
52
+ height
53
+ )));
54
+ }
55
+
56
+ let current_dpi = current_dpi.unwrap_or(PDF_POINTS_PER_INCH);
57
+ let original_dpi = (current_dpi, current_dpi);
58
+ let max_memory_mb = 2048.0;
59
+
60
+ let (target_dpi, auto_adjusted, calculated_dpi) =
61
+ calculate_target_dpi(width as u32, height as u32, current_dpi, config, max_memory_mb);
62
+
63
+ let scale_factor = f64::from(target_dpi) / current_dpi;
64
+
65
+ if !needs_resize(width as u32, height as u32, scale_factor, config) {
66
+ return Ok(create_skip_result(
67
+ rgb_data.to_vec(),
68
+ width,
69
+ height,
70
+ original_dpi,
71
+ config,
72
+ target_dpi,
73
+ scale_factor,
74
+ auto_adjusted,
75
+ calculated_dpi,
76
+ ));
77
+ }
78
+
79
+ let (new_width, new_height, final_scale, dimension_clamped) =
80
+ calculate_new_dimensions(width as u32, height as u32, scale_factor, config);
81
+
82
+ perform_resize(
83
+ rgb_data,
84
+ width as u32,
85
+ height as u32,
86
+ new_width,
87
+ new_height,
88
+ final_scale,
89
+ original_dpi,
90
+ target_dpi,
91
+ auto_adjusted,
92
+ dimension_clamped,
93
+ calculated_dpi,
94
+ config,
95
+ )
96
+ }
97
+
98
+ /// Calculate target DPI based on configuration
99
+ fn calculate_target_dpi(
100
+ width: u32,
101
+ height: u32,
102
+ current_dpi: f64,
103
+ config: &ExtractionConfig,
104
+ max_memory_mb: f64,
105
+ ) -> (i32, bool, Option<i32>) {
106
+ if config.auto_adjust_dpi {
107
+ let approx_width_points = f64::from(width) * PDF_POINTS_PER_INCH / current_dpi;
108
+ let approx_height_points = f64::from(height) * PDF_POINTS_PER_INCH / current_dpi;
109
+
110
+ let optimal_dpi = calculate_smart_dpi(
111
+ approx_width_points,
112
+ approx_height_points,
113
+ config.target_dpi,
114
+ config.max_image_dimension,
115
+ max_memory_mb,
116
+ );
117
+
118
+ (optimal_dpi, optimal_dpi != config.target_dpi, Some(optimal_dpi))
119
+ } else {
120
+ (config.target_dpi, false, None)
121
+ }
122
+ }
123
+
124
+ /// Check if resize is needed
125
+ fn needs_resize(width: u32, height: u32, scale_factor: f64, config: &ExtractionConfig) -> bool {
126
+ let max_dimension = width.max(height);
127
+ let exceeds_max = i32::try_from(max_dimension).map_or(true, |dim| dim > config.max_image_dimension);
128
+
129
+ (scale_factor - 1.0).abs() >= 0.05 || exceeds_max
130
+ }
131
+
132
+ /// Calculate new dimensions after scaling
133
+ #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
134
+ fn calculate_new_dimensions(
135
+ original_width: u32,
136
+ original_height: u32,
137
+ scale_factor: f64,
138
+ config: &ExtractionConfig,
139
+ ) -> (u32, u32, f64, bool) {
140
+ let mut new_width = (f64::from(original_width) * scale_factor).round() as u32;
141
+ let mut new_height = (f64::from(original_height) * scale_factor).round() as u32;
142
+ let mut final_scale = scale_factor;
143
+ let mut dimension_clamped = false;
144
+
145
+ let max_new_dimension = new_width.max(new_height);
146
+ if let Ok(max_dim_i32) = i32::try_from(max_new_dimension)
147
+ && max_dim_i32 > config.max_image_dimension
148
+ {
149
+ let dimension_scale = f64::from(config.max_image_dimension) / f64::from(max_new_dimension);
150
+ new_width = (f64::from(new_width) * dimension_scale).round() as u32;
151
+ new_height = (f64::from(new_height) * dimension_scale).round() as u32;
152
+ final_scale *= dimension_scale;
153
+ dimension_clamped = true;
154
+ }
155
+
156
+ (new_width, new_height, final_scale, dimension_clamped)
157
+ }
158
+
159
+ /// Create result when resize is skipped
160
+ #[allow(clippy::too_many_arguments)]
161
+ fn create_skip_result(
162
+ rgb_data: Vec<u8>,
163
+ width: usize,
164
+ height: usize,
165
+ original_dpi: (f64, f64),
166
+ config: &ExtractionConfig,
167
+ target_dpi: i32,
168
+ scale_factor: f64,
169
+ auto_adjusted: bool,
170
+ calculated_dpi: Option<i32>,
171
+ ) -> NormalizeResult {
172
+ NormalizeResult {
173
+ rgb_data,
174
+ dimensions: (width, height),
175
+ metadata: ImagePreprocessingMetadata {
176
+ original_dimensions: (width, height),
177
+ original_dpi,
178
+ target_dpi: config.target_dpi,
179
+ scale_factor,
180
+ auto_adjusted,
181
+ final_dpi: target_dpi,
182
+ new_dimensions: None,
183
+ resample_method: "NONE".to_string(),
184
+ dimension_clamped: false,
185
+ calculated_dpi,
186
+ skipped_resize: true,
187
+ resize_error: None,
188
+ },
189
+ }
190
+ }
191
+
192
+ /// Perform the actual resize operation
193
+ #[allow(clippy::too_many_arguments)]
194
+ fn perform_resize(
195
+ rgb_data: &[u8],
196
+ original_width: u32,
197
+ original_height: u32,
198
+ new_width: u32,
199
+ new_height: u32,
200
+ final_scale: f64,
201
+ original_dpi: (f64, f64),
202
+ target_dpi: i32,
203
+ auto_adjusted: bool,
204
+ dimension_clamped: bool,
205
+ calculated_dpi: Option<i32>,
206
+ config: &ExtractionConfig,
207
+ ) -> Result<NormalizeResult> {
208
+ let img_buffer = ImageBuffer::<Rgb<u8>, Vec<u8>>::from_raw(original_width, original_height, rgb_data.to_vec())
209
+ .ok_or_else(|| {
210
+ KreuzbergError::parsing(format!(
211
+ "Failed to create image buffer from {}x{} RGB data",
212
+ original_width, original_height
213
+ ))
214
+ })?;
215
+
216
+ let image = DynamicImage::ImageRgb8(img_buffer);
217
+
218
+ let resized = resize_image(&image, new_width, new_height, final_scale)?;
219
+
220
+ let rgb_image = resized.to_rgb8();
221
+ let result_rgb_data = rgb_image.into_raw();
222
+
223
+ let metadata = ImagePreprocessingMetadata {
224
+ original_dimensions: (original_width as usize, original_height as usize),
225
+ original_dpi,
226
+ target_dpi: config.target_dpi,
227
+ scale_factor: final_scale,
228
+ auto_adjusted,
229
+ final_dpi: target_dpi,
230
+ new_dimensions: Some((new_width as usize, new_height as usize)),
231
+ resample_method: if final_scale < 1.0 { "LANCZOS3" } else { "CATMULLROM" }.to_string(),
232
+ dimension_clamped,
233
+ calculated_dpi,
234
+ skipped_resize: false,
235
+ resize_error: None,
236
+ };
237
+
238
+ Ok(NormalizeResult {
239
+ rgb_data: result_rgb_data,
240
+ dimensions: (new_width as usize, new_height as usize),
241
+ metadata,
242
+ })
243
+ }
244
+
245
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `width` x `height` buffer of solid red RGB pixels (3 bytes per pixel).
    fn create_test_rgb_data(width: usize, height: usize) -> Vec<u8> {
        [255u8, 0, 0].repeat(width * height)
    }

    /// Configuration shared by the tests below. Individual tests override single
    /// fields with struct-update syntax so only the relevant difference is visible.
    fn baseline_config() -> ExtractionConfig {
        ExtractionConfig {
            target_dpi: 300,
            max_image_dimension: 4096,
            auto_adjust_dpi: false,
            min_dpi: 72,
            max_dpi: 600,
        }
    }

    /// With a 72 target and `Some(72.0)` supplied, the image keeps its size and the
    /// resize is reported as skipped.
    #[test]
    fn test_normalize_image_dpi_skip_resize() {
        let config = ExtractionConfig {
            target_dpi: 72,
            ..baseline_config()
        };
        let rgb_data = create_test_rgb_data(100, 100);

        // `expect` surfaces the underlying error instead of a bare `is_ok()` failure.
        let normalized = normalize_image_dpi(&rgb_data, 100, 100, &config, Some(72.0))
            .expect("normalization should succeed when no resize is needed");

        assert_eq!(normalized.dimensions, (100, 100));
        assert!(normalized.metadata.skipped_resize);
    }

    /// With a 300 target and `Some(72.0)` supplied, both dimensions grow.
    #[test]
    fn test_normalize_image_dpi_upscale() {
        let config = baseline_config();
        let rgb_data = create_test_rgb_data(100, 100);

        let normalized = normalize_image_dpi(&rgb_data, 100, 100, &config, Some(72.0))
            .expect("upscaling a valid image should succeed");

        assert!(!normalized.metadata.skipped_resize);
        assert!(normalized.dimensions.0 > 100);
        assert!(normalized.dimensions.1 > 100);
    }

    /// With a 72 target and `Some(300.0)` supplied, both dimensions shrink.
    #[test]
    fn test_normalize_image_dpi_downscale() {
        let config = ExtractionConfig {
            target_dpi: 72,
            ..baseline_config()
        };
        let rgb_data = create_test_rgb_data(1000, 1000);

        let normalized = normalize_image_dpi(&rgb_data, 1000, 1000, &config, Some(300.0))
            .expect("downscaling a valid image should succeed");

        assert!(!normalized.metadata.skipped_resize);
        assert!(normalized.dimensions.0 < 1000);
        assert!(normalized.dimensions.1 < 1000);
    }

    /// A 1000x1000 image with `max_image_dimension` of 500 is clamped to fit.
    #[test]
    fn test_normalize_image_dpi_dimension_clamp() {
        let config = ExtractionConfig {
            max_image_dimension: 500,
            ..baseline_config()
        };
        let rgb_data = create_test_rgb_data(1000, 1000);

        let normalized = normalize_image_dpi(&rgb_data, 1000, 1000, &config, Some(300.0))
            .expect("normalization with dimension clamping should succeed");

        assert!(normalized.metadata.dimension_clamped);
        assert!(normalized.dimensions.0 <= 500);
        assert!(normalized.dimensions.1 <= 500);
    }

    /// With `auto_adjust_dpi` enabled, the metadata carries a calculated DPI.
    #[test]
    fn test_normalize_image_dpi_auto_adjust() {
        let config = ExtractionConfig {
            auto_adjust_dpi: true,
            ..baseline_config()
        };
        let rgb_data = create_test_rgb_data(100, 100);

        let normalized = normalize_image_dpi(&rgb_data, 100, 100, &config, Some(72.0))
            .expect("normalization with auto-adjusted DPI should succeed");

        assert!(normalized.metadata.calculated_dpi.is_some());
    }

    /// Declaring 100000x100000 while supplying a 100x100 buffer is rejected.
    #[test]
    fn test_normalize_image_dpi_invalid_dimensions() {
        let config = ExtractionConfig::default();
        let rgb_data = create_test_rgb_data(100, 100);

        let result = normalize_image_dpi(&rgb_data, 100000, 100000, &config, None);

        assert!(result.is_err());
    }

    /// A 100-byte buffer declared as a 100x100 image is rejected.
    #[test]
    fn test_normalize_image_dpi_invalid_data_size() {
        let config = ExtractionConfig::default();
        let rgb_data = vec![0u8; 100];

        let result = normalize_image_dpi(&rgb_data, 100, 100, &config, None);

        assert!(result.is_err());
    }

    /// A scale of 1.02 does not trigger a resize, while 1.10 does.
    #[test]
    fn test_needs_resize_threshold() {
        let config = baseline_config();

        assert!(!needs_resize(100, 100, 1.02, &config));
        assert!(needs_resize(100, 100, 1.10, &config));
    }

    /// Doubling a 100x100 image under the default config yields 200x200, unclamped.
    #[test]
    fn test_calculate_new_dimensions_no_clamp() {
        let config = ExtractionConfig::default();

        let (new_w, new_h, scale, clamped) = calculate_new_dimensions(100, 100, 2.0, &config);

        assert_eq!(new_w, 200);
        assert_eq!(new_h, 200);
        assert!((scale - 2.0).abs() < 0.01);
        assert!(!clamped);
    }

    /// Doubling a 100x100 image with `max_image_dimension` of 100 is clamped.
    #[test]
    fn test_calculate_new_dimensions_with_clamp() {
        let config = ExtractionConfig {
            max_image_dimension: 100,
            ..baseline_config()
        };

        let (new_w, new_h, _scale, clamped) = calculate_new_dimensions(100, 100, 2.0, &config);

        assert!(new_w <= 100);
        assert!(new_h <= 100);
        assert!(clamped);
    }
}