kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,3000 @@
1
+ //! PowerPoint presentation extraction functions.
2
+ //!
3
+ //! This module provides PowerPoint (PPTX) file parsing by directly reading the Office Open XML
4
+ //! format. It extracts text content, slide structure, images, and presentation metadata.
5
+ //!
6
+ //! # Attribution
7
+ //!
8
+ //! This code is based on the [pptx-to-md](https://github.com/nilskruthoff/pptx-parser) library
9
+ //! by Nils Kruthoff, licensed under MIT OR Apache-2.0. The original code has been vendored and
10
+ //! adapted to integrate with Kreuzberg's architecture. See ATTRIBUTIONS.md for full license text.
11
+ //!
12
+ //! # Features
13
+ //!
14
+ //! - **Slide extraction**: Reads all slides from presentation
15
+ //! - **Text formatting**: Preserves bold, italic, underline formatting as Markdown
16
+ //! - **Image extraction**: Optionally extracts embedded images with metadata
17
+ //! - **Office metadata**: Extracts core properties, custom properties (when `office` feature enabled)
18
+ //! - **Structure preservation**: Maintains heading hierarchy and list structure
19
+ //!
20
+ //! # Supported Formats
21
+ //!
22
+ //! - `.pptx` - PowerPoint Presentation
23
+ //! - `.pptm` - PowerPoint Macro-Enabled Presentation
24
+ //! - `.ppsx` - PowerPoint Slide Show
25
+ //!
26
+ //! # Example
27
+ //!
28
+ //! ```rust
29
+ //! use kreuzberg::extraction::pptx::extract_pptx_from_path;
30
+ //!
31
+ //! # fn example() -> kreuzberg::Result<()> {
32
+ //! let result = extract_pptx_from_path("presentation.pptx", true)?;
33
+ //!
34
+ //! println!("Slide count: {}", result.slide_count);
35
+ //! println!("Image count: {}", result.image_count);
36
+ //! println!("Content:\n{}", result.content);
37
+ //! # Ok(())
38
+ //! # }
39
+ //! ```
40
+ use crate::error::{KreuzbergError, Result};
41
+ use crate::types::{ExtractedImage, PptxExtractionResult, PptxMetadata};
42
+ use std::collections::HashMap;
43
+ use std::fs::File;
44
+ use std::io::Read;
45
+ use std::path::Path;
46
+ use zip::ZipArchive;
47
+
48
+ #[cfg(feature = "office")]
49
+ use crate::extraction::office_metadata::{
50
+ extract_core_properties, extract_custom_properties, extract_pptx_app_properties,
51
+ };
52
+ #[cfg(feature = "office")]
53
+ use serde_json::Value;
54
+
55
+ const P_NAMESPACE: &str = "http://schemas.openxmlformats.org/presentationml/2006/main";
56
+ const A_NAMESPACE: &str = "http://schemas.openxmlformats.org/drawingml/2006/main";
57
+ const RELS_NAMESPACE: &str = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
58
+
59
/// Offset of an element on a slide, used to order elements for rendering.
///
/// The derived ordering follows field declaration order (`x`, then `y`), so the
/// fields must not be reordered without reviewing every comparison site.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord)]
struct ElementPosition {
    /// Horizontal offset.
    x: i64,
    /// Vertical offset.
    y: i64,
}
64
+
65
/// Character-level formatting attached to a single text run.
///
/// `Default` yields no emphasis and an empty language tag.
#[derive(Debug, Clone, Default)]
struct Formatting {
    /// Render the run in bold.
    bold: bool,
    /// Render the run in italics.
    italic: bool,
    /// Render the run underlined.
    underlined: bool,
    /// Language tag of the run; empty when none was recorded.
    lang: String,
}
72
+
73
+ #[derive(Debug, Clone)]
74
+ struct Run {
75
+ text: String,
76
+ formatting: Formatting,
77
+ }
78
+
79
+ impl Run {
80
+ fn extract(&self) -> String {
81
+ self.text.clone()
82
+ }
83
+
84
+ fn render_as_md(&self) -> String {
85
+ let mut result = self.text.clone();
86
+
87
+ if self.formatting.bold {
88
+ result = format!("**{}**", result);
89
+ }
90
+ if self.formatting.italic {
91
+ result = format!("*{}*", result);
92
+ }
93
+ if self.formatting.underlined {
94
+ result = format!("<u>{}</u>", result);
95
+ }
96
+
97
+ result
98
+ }
99
+ }
100
+
101
+ #[derive(Debug, Clone)]
102
+ struct TextElement {
103
+ runs: Vec<Run>,
104
+ }
105
+
106
+ #[derive(Debug, Clone)]
107
+ struct ListItem {
108
+ level: u32,
109
+ is_ordered: bool,
110
+ runs: Vec<Run>,
111
+ }
112
+
113
+ #[derive(Debug, Clone)]
114
+ struct ListElement {
115
+ items: Vec<ListItem>,
116
+ }
117
+
118
+ #[derive(Debug, Clone)]
119
+ struct TableCell {
120
+ runs: Vec<Run>,
121
+ }
122
+
123
+ #[derive(Debug, Clone)]
124
+ struct TableRow {
125
+ cells: Vec<TableCell>,
126
+ }
127
+
128
+ #[derive(Debug, Clone)]
129
+ struct TableElement {
130
+ rows: Vec<TableRow>,
131
+ }
132
+
133
/// Link between an image used on a slide and the file that stores it.
#[derive(Debug, Clone)]
struct ImageReference {
    /// Identifier of the image; used as the key for extracted image data and in
    /// rendered image links. Presumably the relationship id — confirm against
    /// the rels parser.
    id: String,
    /// Target path of the image, resolved relative to the slide when read.
    target: String,
}
138
+
139
+ #[derive(Debug, Clone)]
140
+ enum SlideElement {
141
+ Text(TextElement, ElementPosition),
142
+ Table(TableElement, ElementPosition),
143
+ Image(ImageReference, ElementPosition),
144
+ List(ListElement, ElementPosition),
145
+ Unknown,
146
+ }
147
+
148
+ impl SlideElement {
149
+ fn position(&self) -> ElementPosition {
150
+ match self {
151
+ SlideElement::Text(_, pos)
152
+ | SlideElement::Table(_, pos)
153
+ | SlideElement::Image(_, pos)
154
+ | SlideElement::List(_, pos) => *pos,
155
+ SlideElement::Unknown => ElementPosition::default(),
156
+ }
157
+ }
158
+ }
159
+
160
+ #[derive(Debug)]
161
+ struct Slide {
162
+ slide_number: u32,
163
+ elements: Vec<SlideElement>,
164
+ images: Vec<ImageReference>,
165
+ }
166
+
167
/// Switches controlling how a presentation is parsed and rendered.
#[derive(Debug, Clone)]
struct ParserConfig {
    /// Whether embedded images should be extracted.
    extract_images: bool,
    /// Whether each slide's Markdown starts with a `<!-- Slide number: N -->` comment.
    include_slide_comment: bool,
}
172
+
173
+ impl Default for ParserConfig {
174
+ fn default() -> Self {
175
+ Self {
176
+ extract_images: true,
177
+ include_slide_comment: false,
178
+ }
179
+ }
180
+ }
181
+
182
/// Accumulates the Markdown/HTML output of a slide in a single growing buffer.
struct ContentBuilder {
    /// Output written so far.
    content: String,
}
185
+
186
+ impl ContentBuilder {
187
+ fn new() -> Self {
188
+ Self {
189
+ content: String::with_capacity(8192),
190
+ }
191
+ }
192
+
193
+ fn with_capacity(capacity: usize) -> Self {
194
+ Self {
195
+ content: String::with_capacity(capacity),
196
+ }
197
+ }
198
+
199
+ fn add_slide_header(&mut self, slide_number: u32) {
200
+ self.content.reserve(50);
201
+ self.content.push_str("\n\n<!-- Slide number: ");
202
+ self.content.push_str(&slide_number.to_string());
203
+ self.content.push_str(" -->\n");
204
+ }
205
+
206
+ fn add_text(&mut self, text: &str) {
207
+ if !text.trim().is_empty() {
208
+ self.content.push_str(text);
209
+ }
210
+ }
211
+
212
+ fn add_title(&mut self, title: &str) {
213
+ if !title.trim().is_empty() {
214
+ self.content.push_str("# ");
215
+ self.content.push_str(title.trim());
216
+ self.content.push('\n');
217
+ }
218
+ }
219
+
220
+ fn add_table(&mut self, rows: &[Vec<String>]) {
221
+ if rows.is_empty() {
222
+ return;
223
+ }
224
+
225
+ self.content.push_str("\n<table>");
226
+ for (i, row) in rows.iter().enumerate() {
227
+ self.content.push_str("<tr>");
228
+ let tag = if i == 0 { "th" } else { "td" };
229
+
230
+ for cell in row {
231
+ self.content.push('<');
232
+ self.content.push_str(tag);
233
+ self.content.push('>');
234
+ self.content.push_str(&html_escape(cell));
235
+ self.content.push_str("</");
236
+ self.content.push_str(tag);
237
+ self.content.push('>');
238
+ }
239
+ self.content.push_str("</tr>");
240
+ }
241
+ self.content.push_str("</table>\n");
242
+ }
243
+
244
+ fn add_list_item(&mut self, level: u32, is_ordered: bool, text: &str) {
245
+ let indent_count = level.saturating_sub(1) as usize;
246
+ for _ in 0..indent_count {
247
+ self.content.push_str(" ");
248
+ }
249
+
250
+ let marker = if is_ordered { "1." } else { "-" };
251
+ self.content.push_str(marker);
252
+ self.content.push(' ');
253
+ self.content.push_str(text.trim());
254
+ self.content.push('\n');
255
+ }
256
+
257
+ fn add_image(&mut self, image_id: &str, slide_number: u32) {
258
+ let filename = format!("slide_{}_image_{}.jpg", slide_number, image_id);
259
+ self.content.push_str("![");
260
+ self.content.push_str(image_id);
261
+ self.content.push_str("](");
262
+ self.content.push_str(&filename);
263
+ self.content.push_str(")\n");
264
+ }
265
+
266
+ fn add_notes(&mut self, notes: &str) {
267
+ if !notes.trim().is_empty() {
268
+ self.content.push_str("\n\n### Notes:\n");
269
+ self.content.push_str(notes);
270
+ self.content.push('\n');
271
+ }
272
+ }
273
+
274
+ fn build(self) -> String {
275
+ self.content.trim().to_string()
276
+ }
277
+ }
278
+
279
/// Escapes the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) in `text`.
///
/// All other characters are copied through unchanged. The input is scanned once,
/// so an `&` that is already part of an entity is escaped again (`&amp;` becomes
/// `&amp;amp;`).
fn html_escape(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#x27;"),
            other => escaped.push(other),
        }
    }
    escaped
}
286
+
287
+ struct PptxContainer {
288
+ archive: ZipArchive<File>,
289
+ slide_paths: Vec<String>,
290
+ }
291
+
292
+ impl PptxContainer {
293
+ fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
294
+ // IO errors must bubble up unchanged - file access issues need user reports ~keep
295
+ let file = File::open(path)?;
296
+
297
+ let mut archive = match ZipArchive::new(file) {
298
+ Ok(arc) => arc,
299
+ Err(zip::result::ZipError::Io(io_err)) => return Err(io_err.into()), // Bubble up IO errors ~keep
300
+ Err(e) => {
301
+ return Err(KreuzbergError::parsing(format!(
302
+ "Failed to read PPTX archive (invalid format): {}",
303
+ e
304
+ )));
305
+ }
306
+ };
307
+
308
+ let slide_paths = Self::find_slide_paths(&mut archive)?;
309
+
310
+ Ok(Self { archive, slide_paths })
311
+ }
312
+
313
+ fn slide_paths(&self) -> &[String] {
314
+ &self.slide_paths
315
+ }
316
+
317
+ fn read_file(&mut self, path: &str) -> Result<Vec<u8>> {
318
+ match self.archive.by_name(path) {
319
+ Ok(mut file) => {
320
+ let mut contents = Vec::new();
321
+ // IO errors must bubble up - file read issues need user reports ~keep
322
+ file.read_to_end(&mut contents)?;
323
+ Ok(contents)
324
+ }
325
+ Err(zip::result::ZipError::FileNotFound) => {
326
+ Err(KreuzbergError::parsing("File not found in archive".to_string()))
327
+ }
328
+ Err(zip::result::ZipError::Io(io_err)) => Err(io_err.into()), // Bubble up IO errors ~keep
329
+ Err(e) => Err(KreuzbergError::parsing(format!("Zip error: {}", e))),
330
+ }
331
+ }
332
+
333
+ fn get_slide_rels_path(&self, slide_path: &str) -> String {
334
+ get_slide_rels_path(slide_path)
335
+ }
336
+
337
+ fn find_slide_paths(archive: &mut ZipArchive<File>) -> Result<Vec<String>> {
338
+ if let Ok(rels_data) = Self::read_file_from_archive(archive, "ppt/_rels/presentation.xml.rels")
339
+ && let Ok(paths) = parse_presentation_rels(&rels_data)
340
+ {
341
+ return Ok(paths);
342
+ }
343
+
344
+ let mut slide_paths = Vec::new();
345
+ for i in 0..archive.len() {
346
+ if let Ok(file) = archive.by_index(i) {
347
+ let name = file.name();
348
+ if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") {
349
+ slide_paths.push(name.to_string());
350
+ }
351
+ }
352
+ }
353
+
354
+ slide_paths.sort();
355
+ Ok(slide_paths)
356
+ }
357
+
358
+ fn read_file_from_archive(archive: &mut ZipArchive<File>, path: &str) -> Result<Vec<u8>> {
359
+ let mut file = match archive.by_name(path) {
360
+ Ok(f) => f,
361
+ Err(zip::result::ZipError::Io(io_err)) => return Err(io_err.into()), // Bubble up IO errors ~keep
362
+ Err(e) => {
363
+ return Err(KreuzbergError::parsing(format!(
364
+ "Failed to read file from archive: {}",
365
+ e
366
+ )));
367
+ }
368
+ };
369
+ let mut contents = Vec::new();
370
+ // IO errors must bubble up - file read issues need user reports ~keep
371
+ file.read_to_end(&mut contents)?;
372
+ Ok(contents)
373
+ }
374
+ }
375
+
376
+ impl Slide {
377
+ fn from_xml(slide_number: u32, xml_data: &[u8], rels_data: Option<&[u8]>) -> Result<Self> {
378
+ let elements = parse_slide_xml(xml_data)?;
379
+
380
+ let images = if let Some(rels) = rels_data {
381
+ parse_slide_rels(rels)?
382
+ } else {
383
+ Vec::new()
384
+ };
385
+
386
+ Ok(Self {
387
+ slide_number,
388
+ elements,
389
+ images,
390
+ })
391
+ }
392
+
393
+ fn to_markdown(&self, config: &ParserConfig) -> String {
394
+ let mut builder = ContentBuilder::new();
395
+
396
+ if config.include_slide_comment {
397
+ builder.add_slide_header(self.slide_number);
398
+ }
399
+
400
+ let mut element_indices: Vec<usize> = (0..self.elements.len()).collect();
401
+ element_indices.sort_by_key(|&i| {
402
+ let pos = self.elements[i].position();
403
+ (pos.y, pos.x)
404
+ });
405
+
406
+ for &idx in &element_indices {
407
+ match &self.elements[idx] {
408
+ SlideElement::Text(text, _) => {
409
+ let text_content: String = text.runs.iter().map(|run| run.render_as_md()).collect();
410
+
411
+ let normalized = text_content.replace('\n', " ");
412
+ let is_title = normalized.len() < 100 && !normalized.trim().is_empty();
413
+
414
+ if is_title {
415
+ builder.add_title(normalized.trim());
416
+ } else {
417
+ builder.add_text(&text_content);
418
+ }
419
+ }
420
+ SlideElement::Table(table, _) => {
421
+ let table_rows: Vec<Vec<String>> = table
422
+ .rows
423
+ .iter()
424
+ .map(|row| {
425
+ row.cells
426
+ .iter()
427
+ .map(|cell| cell.runs.iter().map(|run| run.extract()).collect::<String>())
428
+ .collect()
429
+ })
430
+ .collect();
431
+ builder.add_table(&table_rows);
432
+ }
433
+ SlideElement::List(list, _) => {
434
+ for item in &list.items {
435
+ let item_text: String = item.runs.iter().map(|run| run.extract()).collect();
436
+ builder.add_list_item(item.level, item.is_ordered, &item_text);
437
+ }
438
+ }
439
+ SlideElement::Image(img_ref, _) => {
440
+ builder.add_image(&img_ref.id, self.slide_number);
441
+ }
442
+ SlideElement::Unknown => {}
443
+ }
444
+ }
445
+
446
+ builder.build()
447
+ }
448
+
449
+ fn image_count(&self) -> usize {
450
+ self.elements
451
+ .iter()
452
+ .filter(|e| matches!(e, SlideElement::Image(_, _)))
453
+ .count()
454
+ }
455
+
456
+ fn table_count(&self) -> usize {
457
+ self.elements
458
+ .iter()
459
+ .filter(|e| matches!(e, SlideElement::Table(_, _)))
460
+ .count()
461
+ }
462
+ }
463
+
464
+ struct SlideIterator {
465
+ container: PptxContainer,
466
+ current_index: usize,
467
+ total_slides: usize,
468
+ }
469
+
470
+ impl SlideIterator {
471
+ fn new(container: PptxContainer) -> Self {
472
+ let total_slides = container.slide_paths().len();
473
+ Self {
474
+ container,
475
+ current_index: 0,
476
+ total_slides,
477
+ }
478
+ }
479
+
480
+ fn slide_count(&self) -> usize {
481
+ self.total_slides
482
+ }
483
+
484
+ fn next_slide(&mut self) -> Result<Option<Slide>> {
485
+ if self.current_index >= self.total_slides {
486
+ return Ok(None);
487
+ }
488
+
489
+ let slide_path = &self.container.slide_paths()[self.current_index].clone();
490
+ let slide_number = (self.current_index + 1) as u32;
491
+
492
+ let xml_data = self.container.read_file(slide_path)?;
493
+
494
+ let rels_path = self.container.get_slide_rels_path(slide_path);
495
+ let rels_data = self.container.read_file(&rels_path).ok();
496
+
497
+ let slide = Slide::from_xml(slide_number, &xml_data, rels_data.as_deref())?;
498
+
499
+ self.current_index += 1;
500
+
501
+ Ok(Some(slide))
502
+ }
503
+
504
+ fn get_slide_images(&mut self, slide: &Slide) -> Result<HashMap<String, Vec<u8>>> {
505
+ let mut image_data = HashMap::new();
506
+
507
+ for img_ref in &slide.images {
508
+ let slide_path = &self.container.slide_paths()[slide.slide_number as usize - 1];
509
+ let full_path = get_full_image_path(slide_path, &img_ref.target);
510
+
511
+ if let Ok(data) = self.container.read_file(&full_path) {
512
+ image_data.insert(img_ref.id.clone(), data);
513
+ }
514
+ }
515
+
516
+ Ok(image_data)
517
+ }
518
+ }
519
+
520
+ use roxmltree::{Document, Node};
521
+
522
+ enum ParsedContent {
523
+ Text(TextElement),
524
+ List(ListElement),
525
+ }
526
+
527
+ fn parse_slide_xml(xml_data: &[u8]) -> Result<Vec<SlideElement>> {
528
+ let xml_str =
529
+ std::str::from_utf8(xml_data).map_err(|_| KreuzbergError::parsing("Invalid UTF-8 in slide XML".to_string()))?;
530
+
531
+ let doc =
532
+ Document::parse(xml_str).map_err(|e| KreuzbergError::parsing(format!("Failed to parse slide XML: {}", e)))?;
533
+
534
+ let root = doc.root_element();
535
+ let ns = root.tag_name().namespace();
536
+
537
+ let c_sld = root
538
+ .descendants()
539
+ .find(|n| n.tag_name().name() == "cSld" && n.tag_name().namespace() == ns)
540
+ .ok_or_else(|| KreuzbergError::parsing("No <p:cSld> tag found".to_string()))?;
541
+
542
+ let sp_tree = c_sld
543
+ .children()
544
+ .find(|n| n.tag_name().name() == "spTree" && n.tag_name().namespace() == ns)
545
+ .ok_or_else(|| KreuzbergError::parsing("No <p:spTree> tag found".to_string()))?;
546
+
547
+ let mut elements = Vec::new();
548
+ for child_node in sp_tree.children().filter(|n| n.is_element()) {
549
+ elements.extend(parse_group(&child_node)?);
550
+ }
551
+
552
+ Ok(elements)
553
+ }
554
+
555
+ fn parse_group(node: &Node) -> Result<Vec<SlideElement>> {
556
+ let mut elements = Vec::new();
557
+
558
+ let tag_name = node.tag_name().name();
559
+ let namespace = node.tag_name().namespace().unwrap_or("");
560
+
561
+ if namespace != P_NAMESPACE {
562
+ return Ok(elements);
563
+ }
564
+
565
+ let position = extract_position(node);
566
+
567
+ match tag_name {
568
+ "sp" => {
569
+ let position = extract_position(node);
570
+ match parse_sp(node)? {
571
+ ParsedContent::Text(text) => elements.push(SlideElement::Text(text, position)),
572
+ ParsedContent::List(list) => elements.push(SlideElement::List(list, position)),
573
+ }
574
+ }
575
+ "graphicFrame" => {
576
+ if let Some(graphic_element) = parse_graphic_frame(node)? {
577
+ elements.push(SlideElement::Table(graphic_element, position));
578
+ }
579
+ }
580
+ "pic" => {
581
+ let image_reference = parse_pic(node)?;
582
+ elements.push(SlideElement::Image(image_reference, position));
583
+ }
584
+ "grpSp" => {
585
+ for child in node.children().filter(|n| n.is_element()) {
586
+ elements.extend(parse_group(&child)?);
587
+ }
588
+ }
589
+ _ => elements.push(SlideElement::Unknown),
590
+ }
591
+
592
+ Ok(elements)
593
+ }
594
+
595
+ fn parse_sp(sp_node: &Node) -> Result<ParsedContent> {
596
+ let tx_body_node = sp_node
597
+ .children()
598
+ .find(|n| n.tag_name().name() == "txBody" && n.tag_name().namespace() == Some(P_NAMESPACE))
599
+ .ok_or_else(|| KreuzbergError::parsing("No txBody found".to_string()))?;
600
+
601
+ let is_list = tx_body_node.descendants().any(|n| {
602
+ n.is_element()
603
+ && n.tag_name().name() == "pPr"
604
+ && n.tag_name().namespace() == Some(A_NAMESPACE)
605
+ && (n.attribute("lvl").is_some()
606
+ || n.children().any(|child| {
607
+ child.is_element()
608
+ && (child.tag_name().name() == "buAutoNum" || child.tag_name().name() == "buChar")
609
+ }))
610
+ });
611
+
612
+ if is_list {
613
+ Ok(ParsedContent::List(parse_list(&tx_body_node)?))
614
+ } else {
615
+ Ok(ParsedContent::Text(parse_text(&tx_body_node)?))
616
+ }
617
+ }
618
+
619
+ fn parse_text(tx_body_node: &Node) -> Result<TextElement> {
620
+ let mut runs = Vec::new();
621
+
622
+ for p_node in tx_body_node
623
+ .children()
624
+ .filter(|n| n.is_element() && n.tag_name().name() == "p" && n.tag_name().namespace() == Some(A_NAMESPACE))
625
+ {
626
+ let mut paragraph_runs = parse_paragraph(&p_node, true)?;
627
+ runs.append(&mut paragraph_runs);
628
+ }
629
+
630
+ Ok(TextElement { runs })
631
+ }
632
+
633
+ fn parse_graphic_frame(node: &Node) -> Result<Option<TableElement>> {
634
+ let graphic_data_node = node.descendants().find(|n| {
635
+ n.is_element()
636
+ && n.tag_name().name() == "graphicData"
637
+ && n.tag_name().namespace() == Some(A_NAMESPACE)
638
+ && n.attribute("uri") == Some("http://schemas.openxmlformats.org/drawingml/2006/table")
639
+ });
640
+
641
+ if let Some(graphic_data) = graphic_data_node
642
+ && let Some(tbl_node) = graphic_data
643
+ .children()
644
+ .find(|n| n.is_element() && n.tag_name().name() == "tbl" && n.tag_name().namespace() == Some(A_NAMESPACE))
645
+ {
646
+ let table = parse_table(&tbl_node)?;
647
+ return Ok(Some(table));
648
+ }
649
+
650
+ Ok(None)
651
+ }
652
+
653
+ fn parse_table(tbl_node: &Node) -> Result<TableElement> {
654
+ let mut rows = Vec::new();
655
+
656
+ for tr_node in tbl_node
657
+ .children()
658
+ .filter(|n| n.is_element() && n.tag_name().name() == "tr" && n.tag_name().namespace() == Some(A_NAMESPACE))
659
+ {
660
+ let row = parse_table_row(&tr_node)?;
661
+ rows.push(row);
662
+ }
663
+
664
+ Ok(TableElement { rows })
665
+ }
666
+
667
+ fn parse_table_row(tr_node: &Node) -> Result<TableRow> {
668
+ let mut cells = Vec::new();
669
+
670
+ for tc_node in tr_node
671
+ .children()
672
+ .filter(|n| n.is_element() && n.tag_name().name() == "tc" && n.tag_name().namespace() == Some(A_NAMESPACE))
673
+ {
674
+ let cell = parse_table_cell(&tc_node)?;
675
+ cells.push(cell);
676
+ }
677
+
678
+ Ok(TableRow { cells })
679
+ }
680
+
681
+ fn parse_table_cell(tc_node: &Node) -> Result<TableCell> {
682
+ let mut runs = Vec::new();
683
+
684
+ if let Some(tx_body_node) = tc_node
685
+ .children()
686
+ .find(|n| n.is_element() && n.tag_name().name() == "txBody" && n.tag_name().namespace() == Some(A_NAMESPACE))
687
+ {
688
+ for p_node in tx_body_node
689
+ .children()
690
+ .filter(|n| n.is_element() && n.tag_name().name() == "p" && n.tag_name().namespace() == Some(A_NAMESPACE))
691
+ {
692
+ let mut paragraph_runs = parse_paragraph(&p_node, false)?;
693
+ runs.append(&mut paragraph_runs);
694
+ }
695
+ }
696
+
697
+ Ok(TableCell { runs })
698
+ }
699
+
700
+ fn parse_pic(pic_node: &Node) -> Result<ImageReference> {
701
+ let blip_node = pic_node
702
+ .descendants()
703
+ .find(|n| n.is_element() && n.tag_name().name() == "blip" && n.tag_name().namespace() == Some(A_NAMESPACE))
704
+ .ok_or_else(|| KreuzbergError::parsing("Image blip not found".to_string()))?;
705
+
706
+ let embed_attr = blip_node
707
+ .attribute((RELS_NAMESPACE, "embed"))
708
+ .or_else(|| blip_node.attribute("r:embed"))
709
+ .ok_or_else(|| KreuzbergError::parsing("Image embed attribute not found".to_string()))?;
710
+
711
+ let image_ref = ImageReference {
712
+ id: embed_attr.to_string(),
713
+ target: String::new(),
714
+ };
715
+
716
+ Ok(image_ref)
717
+ }
718
+
719
+ fn parse_list(tx_body_node: &Node) -> Result<ListElement> {
720
+ let mut items = Vec::new();
721
+
722
+ for p_node in tx_body_node
723
+ .children()
724
+ .filter(|n| n.is_element() && n.tag_name().name() == "p" && n.tag_name().namespace() == Some(A_NAMESPACE))
725
+ {
726
+ let (level, is_ordered) = parse_list_properties(&p_node)?;
727
+
728
+ let runs = parse_paragraph(&p_node, true)?;
729
+
730
+ items.push(ListItem {
731
+ level,
732
+ is_ordered,
733
+ runs,
734
+ });
735
+ }
736
+
737
+ Ok(ListElement { items })
738
+ }
739
+
740
+ fn parse_list_properties(p_node: &Node) -> Result<(u32, bool)> {
741
+ let mut level = 1;
742
+ let mut is_ordered = false;
743
+
744
+ if let Some(p_pr_node) = p_node
745
+ .children()
746
+ .find(|n| n.is_element() && n.tag_name().name() == "pPr" && n.tag_name().namespace() == Some(A_NAMESPACE))
747
+ {
748
+ if let Some(lvl_attr) = p_pr_node.attribute("lvl") {
749
+ level = lvl_attr.parse::<u32>().unwrap_or(0) + 1;
750
+ }
751
+
752
+ is_ordered = p_pr_node.children().any(|n| {
753
+ n.is_element() && n.tag_name().namespace() == Some(A_NAMESPACE) && n.tag_name().name() == "buAutoNum"
754
+ });
755
+ }
756
+
757
+ Ok((level, is_ordered))
758
+ }
759
+
760
+ fn parse_paragraph(p_node: &Node, add_new_line: bool) -> Result<Vec<Run>> {
761
+ let run_nodes: Vec<_> = p_node
762
+ .children()
763
+ .filter(|n| n.is_element() && n.tag_name().name() == "r" && n.tag_name().namespace() == Some(A_NAMESPACE))
764
+ .collect();
765
+
766
+ let count = run_nodes.len();
767
+ let mut runs: Vec<Run> = Vec::new();
768
+
769
+ for (idx, r_node) in run_nodes.iter().enumerate() {
770
+ let mut run = parse_run(r_node)?;
771
+
772
+ if add_new_line && idx == count - 1 {
773
+ run.text.push('\n');
774
+ }
775
+
776
+ runs.push(run);
777
+ }
778
+ Ok(runs)
779
+ }
780
+
781
+ fn parse_run(r_node: &Node) -> Result<Run> {
782
+ let mut text = String::new();
783
+ let mut formatting = Formatting::default();
784
+
785
+ if let Some(r_pr_node) = r_node
786
+ .children()
787
+ .find(|n| n.is_element() && n.tag_name().name() == "rPr" && n.tag_name().namespace() == Some(A_NAMESPACE))
788
+ {
789
+ if let Some(b_attr) = r_pr_node.attribute("b") {
790
+ formatting.bold = b_attr == "1" || b_attr.eq_ignore_ascii_case("true");
791
+ }
792
+ if let Some(i_attr) = r_pr_node.attribute("i") {
793
+ formatting.italic = i_attr == "1" || i_attr.eq_ignore_ascii_case("true");
794
+ }
795
+ if let Some(u_attr) = r_pr_node.attribute("u") {
796
+ formatting.underlined = u_attr != "none";
797
+ }
798
+ if let Some(lang_attr) = r_pr_node.attribute("lang") {
799
+ formatting.lang = lang_attr.to_string();
800
+ }
801
+ }
802
+
803
+ if let Some(t_node) = r_node
804
+ .children()
805
+ .find(|n| n.is_element() && n.tag_name().name() == "t" && n.tag_name().namespace() == Some(A_NAMESPACE))
806
+ && let Some(t) = t_node.text()
807
+ {
808
+ text.push_str(t);
809
+ }
810
+ Ok(Run { text, formatting })
811
+ }
812
+
813
+ fn extract_position(node: &Node) -> ElementPosition {
814
+ let default = ElementPosition::default();
815
+
816
+ node.descendants()
817
+ .find(|n| n.tag_name().namespace() == Some(A_NAMESPACE) && n.tag_name().name() == "xfrm")
818
+ .and_then(|xfrm| {
819
+ let x = xfrm
820
+ .children()
821
+ .find(|n| n.tag_name().name() == "off" && n.tag_name().namespace() == Some(A_NAMESPACE))
822
+ .and_then(|off| off.attribute("x")?.parse::<i64>().ok())?;
823
+
824
+ let y = xfrm
825
+ .children()
826
+ .find(|n| n.tag_name().name() == "off" && n.tag_name().namespace() == Some(A_NAMESPACE))
827
+ .and_then(|off| off.attribute("y")?.parse::<i64>().ok())?;
828
+
829
+ Some(ElementPosition { x, y })
830
+ })
831
+ .unwrap_or(default)
832
+ }
833
+
834
+ fn parse_slide_rels(rels_data: &[u8]) -> Result<Vec<ImageReference>> {
835
+ let xml_str = std::str::from_utf8(rels_data)
836
+ .map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in rels XML: {}", e)))?;
837
+
838
+ let doc =
839
+ Document::parse(xml_str).map_err(|e| KreuzbergError::parsing(format!("Failed to parse rels XML: {}", e)))?;
840
+
841
+ let mut images = Vec::new();
842
+
843
+ for node in doc.descendants() {
844
+ if node.has_tag_name("Relationship")
845
+ && let Some(rel_type) = node.attribute("Type")
846
+ && rel_type.contains("image")
847
+ && let (Some(id), Some(target)) = (node.attribute("Id"), node.attribute("Target"))
848
+ {
849
+ images.push(ImageReference {
850
+ id: id.to_string(),
851
+ target: target.to_string(),
852
+ });
853
+ }
854
+ }
855
+
856
+ Ok(images)
857
+ }
858
+
859
+ fn parse_presentation_rels(rels_data: &[u8]) -> Result<Vec<String>> {
860
+ let xml_str = std::str::from_utf8(rels_data)
861
+ .map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in presentation rels: {}", e)))?;
862
+
863
+ let doc = Document::parse(xml_str)
864
+ .map_err(|e| KreuzbergError::parsing(format!("Failed to parse presentation rels: {}", e)))?;
865
+
866
+ let mut slide_paths = Vec::new();
867
+
868
+ for node in doc.descendants() {
869
+ if node.has_tag_name("Relationship")
870
+ && let Some(rel_type) = node.attribute("Type")
871
+ && rel_type.contains("slide")
872
+ && !rel_type.contains("slideMaster")
873
+ && let Some(target) = node.attribute("Target")
874
+ {
875
+ let normalized_target = target.strip_prefix('/').unwrap_or(target);
876
+ let final_path = if normalized_target.starts_with("ppt/") {
877
+ normalized_target.to_string()
878
+ } else {
879
+ format!("ppt/{}", normalized_target)
880
+ };
881
+ slide_paths.push(final_path);
882
+ }
883
+ }
884
+
885
+ Ok(slide_paths)
886
+ }
887
+
888
/// Extract comprehensive metadata from PPTX using office_metadata module
///
/// With the `office` feature enabled, core, app and custom document properties are read from
/// `archive` and gathered into a string map. A failure of any of the three extractors is
/// ignored and simply contributes no entries. Only the `title`, `author`, `description` and
/// `summary` entries of that map end up in the returned `PptxMetadata`; `fonts` is always
/// empty.
///
/// Without the `office` feature every optional field is `None` and `fonts` is empty.
fn extract_metadata(archive: &mut ZipArchive<File>) -> PptxMetadata {
    #[cfg(feature = "office")]
    {
        // Scratch map of every property found; dropped when this block ends.
        let mut metadata_map = HashMap::new();

        if let Ok(core) = extract_core_properties(archive) {
            if let Some(title) = core.title {
                metadata_map.insert("title".to_string(), title);
            }
            // The creator is stored under two keys.
            if let Some(creator) = core.creator {
                metadata_map.insert("author".to_string(), creator.clone());
                metadata_map.insert("created_by".to_string(), creator);
            }
            // The subject doubles as the summary.
            if let Some(subject) = core.subject {
                metadata_map.insert("subject".to_string(), subject.clone());
                metadata_map.insert("summary".to_string(), subject);
            }
            if let Some(keywords) = core.keywords {
                metadata_map.insert("keywords".to_string(), keywords);
            }
            if let Some(description) = core.description {
                metadata_map.insert("description".to_string(), description);
            }
            if let Some(modified_by) = core.last_modified_by {
                metadata_map.insert("modified_by".to_string(), modified_by);
            }
            if let Some(created) = core.created {
                metadata_map.insert("created_at".to_string(), created);
            }
            if let Some(modified) = core.modified {
                metadata_map.insert("modified_at".to_string(), modified);
            }
            if let Some(revision) = core.revision {
                metadata_map.insert("revision".to_string(), revision);
            }
            if let Some(category) = core.category {
                metadata_map.insert("category".to_string(), category);
            }
        }

        if let Ok(app) = extract_pptx_app_properties(archive) {
            if let Some(slides) = app.slides {
                metadata_map.insert("slide_count".to_string(), slides.to_string());
            }
            if let Some(notes) = app.notes {
                metadata_map.insert("notes_count".to_string(), notes.to_string());
            }
            if let Some(hidden_slides) = app.hidden_slides {
                metadata_map.insert("hidden_slides".to_string(), hidden_slides.to_string());
            }
            if !app.slide_titles.is_empty() {
                metadata_map.insert("slide_titles".to_string(), app.slide_titles.join(", "));
            }
            if let Some(presentation_format) = app.presentation_format {
                metadata_map.insert("presentation_format".to_string(), presentation_format);
            }
            // The company is stored under the key "organization".
            if let Some(company) = app.company {
                metadata_map.insert("organization".to_string(), company);
            }
            if let Some(application) = app.application {
                metadata_map.insert("application".to_string(), application);
            }
            if let Some(app_version) = app.app_version {
                metadata_map.insert("application_version".to_string(), app_version);
            }
        }

        if let Ok(custom) = extract_custom_properties(archive) {
            for (key, value) in custom {
                // Flatten each custom value to a string; arrays and objects use the value's
                // own `to_string()` rendering.
                let value_str = match value {
                    Value::String(s) => s,
                    Value::Number(n) => n.to_string(),
                    Value::Bool(b) => b.to_string(),
                    Value::Null => "null".to_string(),
                    Value::Array(_) | Value::Object(_) => value.to_string(),
                };
                // Custom properties are namespaced with a "custom_" prefix.
                metadata_map.insert(format!("custom_{}", key), value_str);
            }
        }

        // NOTE(review): only these four keys are read back; every other entry collected above
        // is discarded when the map goes out of scope — confirm whether that is intended.
        PptxMetadata {
            title: metadata_map.get("title").cloned(),
            author: metadata_map.get("author").cloned(),
            description: metadata_map.get("description").cloned(),
            summary: metadata_map.get("summary").cloned(),
            fonts: Vec::new(),
        }
    }

    #[cfg(not(feature = "office"))]
    {
        PptxMetadata {
            title: None,
            author: None,
            description: None,
            summary: None,
            fonts: Vec::new(),
        }
    }
}
989
+
990
+ fn extract_all_notes(container: &mut PptxContainer) -> Result<HashMap<u32, String>> {
991
+ let mut notes = HashMap::new();
992
+
993
+ let slide_paths: Vec<String> = container.slide_paths().to_vec();
994
+
995
+ for (i, slide_path) in slide_paths.iter().enumerate() {
996
+ let notes_path = slide_path.replace("slides/slide", "notesSlides/notesSlide");
997
+ if let Ok(notes_xml) = container.read_file(&notes_path)
998
+ && let Ok(note_text) = extract_notes_text(&notes_xml)
999
+ {
1000
+ notes.insert((i + 1) as u32, note_text);
1001
+ }
1002
+ }
1003
+
1004
+ Ok(notes)
1005
+ }
1006
+
1007
+ fn extract_notes_text(notes_xml: &[u8]) -> Result<String> {
1008
+ let xml_str = std::str::from_utf8(notes_xml)
1009
+ .map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in notes XML: {}", e)))?;
1010
+
1011
+ let doc =
1012
+ Document::parse(xml_str).map_err(|e| KreuzbergError::parsing(format!("Failed to parse notes XML: {}", e)))?;
1013
+
1014
+ let mut text_parts = Vec::new();
1015
+ const DRAWINGML_NS: &str = "http://schemas.openxmlformats.org/drawingml/2006/main";
1016
+
1017
+ for node in doc.descendants() {
1018
+ if node.has_tag_name((DRAWINGML_NS, "t"))
1019
+ && let Some(text) = node.text()
1020
+ {
1021
+ text_parts.push(text);
1022
+ }
1023
+ }
1024
+
1025
+ Ok(text_parts.join(" "))
1026
+ }
1027
+
1028
/// Builds the path of the relationships part that belongs to `slide_path`.
///
/// The `.rels` file sits in a `_rels` directory next to the slide, e.g.
/// `ppt/slides/slide1.xml` becomes `ppt/slides/_rels/slide1.xml.rels`. A path without any `/`
/// is placed under a top-level `_rels/` directory.
fn get_slide_rels_path(slide_path: &str) -> String {
    match slide_path.rsplit_once('/') {
        Some((directory, file_name)) => format!("{}/_rels/{}.rels", directory, file_name),
        None => format!("_rels/{}.rels", slide_path),
    }
}
1036
+
1037
/// Resolves a relationship `Target` of a slide to a path inside the archive.
///
/// - A target starting with `/` is package-absolute: the leading slash is dropped and the
///   rest is used as is (previously it was appended to the slide directory).
/// - A target starting with `../` is resolved against the parent of the slide's directory,
///   e.g. `ppt/slides/slide1.xml` + `../media/a.png` gives `ppt/media/a.png`. When the slide
///   path has fewer than two directory levels, `ppt/` is used as the base.
/// - Any other target is resolved against the slide's own directory, or `ppt/slides/` when
///   the slide path has no directory.
///
/// Only a single leading `../` is interpreted. The prefix is removed with `strip_prefix`
/// rather than a fixed byte slice, so targets such as `..` or `..é` no longer panic and a
/// name that merely begins with two dots (`..foo/a.png`) is not truncated.
fn get_full_image_path(slide_path: &str, image_target: &str) -> String {
    if let Some(absolute) = image_target.strip_prefix('/') {
        return absolute.to_string();
    }

    match image_target.strip_prefix("../") {
        Some(rest) => {
            // Splitting from the right yields: file name, slide directory, everything above.
            let mut parts = slide_path.rsplitn(3, '/');
            let _file_name = parts.next();
            let _slide_dir = parts.next();
            match parts.next() {
                Some(base) => format!("{}/{}", base, rest),
                None => format!("ppt/{}", rest),
            }
        }
        None => match slide_path.rsplit_once('/') {
            Some((slide_dir, _file_name)) => format!("{}/{}", slide_dir, image_target),
            None => format!("ppt/slides/{}", image_target),
        },
    }
}
1054
+
1055
/// Guesses an image format from the leading bytes of `data`.
///
/// Returns one of `"jpeg"`, `"png"`, `"gif"`, `"bmp"`, `"svg"`, `"tiff"`, or `"unknown"` when
/// no known signature matches. Note that any data starting with `<?xml` is reported as `"svg"`.
fn detect_image_format(data: &[u8]) -> String {
    // Signature table, checked in order; the first matching prefix wins.
    const SIGNATURES: &[(&[u8], &str)] = &[
        (&[0xFF, 0xD8, 0xFF], "jpeg"),
        (&[0x89, 0x50, 0x4E, 0x47], "png"),
        (b"GIF", "gif"),
        (b"BM", "bmp"),
        (b"<svg", "svg"),
        (b"<?xml", "svg"),
        (b"II\x2A\x00", "tiff"),
        (b"MM\x00\x2A", "tiff"),
    ];

    for &(magic, format_name) in SIGNATURES.iter() {
        if data.starts_with(magic) {
            return format_name.to_string();
        }
    }

    "unknown".to_string()
}
1072
+
1073
+ pub fn extract_pptx_from_path(path: &str, extract_images: bool) -> Result<PptxExtractionResult> {
1074
+ let config = ParserConfig {
1075
+ extract_images,
1076
+ ..Default::default()
1077
+ };
1078
+
1079
+ let mut container = PptxContainer::open(path)?;
1080
+
1081
+ let metadata = extract_metadata(&mut container.archive);
1082
+
1083
+ let notes = extract_all_notes(&mut container)?;
1084
+
1085
+ let mut iterator = SlideIterator::new(container);
1086
+ let slide_count = iterator.slide_count();
1087
+
1088
+ let estimated_capacity = slide_count * 1024;
1089
+ let mut content_builder = ContentBuilder::with_capacity(estimated_capacity);
1090
+
1091
+ let mut total_image_count = 0;
1092
+ let mut total_table_count = 0;
1093
+ let mut extracted_images = Vec::new();
1094
+
1095
+ while let Some(slide) = iterator.next_slide()? {
1096
+ content_builder.add_slide_header(slide.slide_number);
1097
+
1098
+ let slide_content = slide.to_markdown(&config);
1099
+ content_builder.add_text(&slide_content);
1100
+
1101
+ if let Some(slide_notes) = notes.get(&slide.slide_number) {
1102
+ content_builder.add_notes(slide_notes);
1103
+ }
1104
+
1105
+ if config.extract_images
1106
+ && let Ok(image_data) = iterator.get_slide_images(&slide)
1107
+ {
1108
+ for (_, data) in image_data {
1109
+ let format = detect_image_format(&data);
1110
+ let image_index = extracted_images.len();
1111
+
1112
+ extracted_images.push(ExtractedImage {
1113
+ data,
1114
+ format,
1115
+ image_index,
1116
+ page_number: Some(slide.slide_number as usize),
1117
+ width: None,
1118
+ height: None,
1119
+ colorspace: None,
1120
+ bits_per_component: None,
1121
+ is_mask: false,
1122
+ description: None,
1123
+ ocr_result: None,
1124
+ });
1125
+ }
1126
+ }
1127
+
1128
+ total_image_count += slide.image_count();
1129
+ total_table_count += slide.table_count();
1130
+ }
1131
+
1132
+ Ok(PptxExtractionResult {
1133
+ content: content_builder.build(),
1134
+ metadata,
1135
+ slide_count,
1136
+ image_count: total_image_count,
1137
+ table_count: total_table_count,
1138
+ images: extracted_images,
1139
+ })
1140
+ }
1141
+
1142
+ pub fn extract_pptx_from_bytes(data: &[u8], extract_images: bool) -> Result<PptxExtractionResult> {
1143
+ use std::sync::atomic::{AtomicU64, Ordering};
1144
+ static COUNTER: AtomicU64 = AtomicU64::new(0);
1145
+ let unique_id = COUNTER.fetch_add(1, Ordering::SeqCst);
1146
+ let temp_path = std::env::temp_dir().join(format!("temp_pptx_{}_{}.pptx", std::process::id(), unique_id));
1147
+
1148
+ // IO errors must bubble up - temp file write issues need user reports ~keep
1149
+ std::fs::write(&temp_path, data)?;
1150
+
1151
+ let result = extract_pptx_from_path(temp_path.to_str().unwrap(), extract_images);
1152
+
1153
+ let _ = std::fs::remove_file(&temp_path);
1154
+
1155
+ result
1156
+ }
1157
+
1158
+ #[cfg(test)]
1159
+ mod tests {
1160
+ use super::*;
1161
+
1162
+ fn create_test_pptx_bytes(slides: Vec<&str>) -> Vec<u8> {
1163
+ use std::io::Write;
1164
+ use zip::write::{SimpleFileOptions, ZipWriter};
1165
+
1166
+ let mut buffer = Vec::new();
1167
+ {
1168
+ let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
1169
+ let options = SimpleFileOptions::default();
1170
+
1171
+ zip.start_file("[Content_Types].xml", options).unwrap();
1172
+ zip.write_all(
1173
+ br#"<?xml version="1.0" encoding="UTF-8"?>
1174
+ <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
1175
+ <Default Extension="xml" ContentType="application/xml"/>
1176
+ <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
1177
+ </Types>"#,
1178
+ )
1179
+ .unwrap();
1180
+
1181
+ zip.start_file("ppt/presentation.xml", options).unwrap();
1182
+ zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
1183
+
1184
+ zip.start_file("_rels/.rels", options).unwrap();
1185
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
1186
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1187
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
1188
+ </Relationships>"#).unwrap();
1189
+
1190
+ let mut rels_xml = String::from(
1191
+ r#"<?xml version="1.0" encoding="UTF-8"?>
1192
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">"#,
1193
+ );
1194
+ for (i, _) in slides.iter().enumerate() {
1195
+ rels_xml.push_str(&format!(
1196
+ r#"<Relationship Id="rId{}" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide{}.xml"/>"#,
1197
+ i + 1,
1198
+ i + 1
1199
+ ));
1200
+ }
1201
+ rels_xml.push_str("</Relationships>");
1202
+ zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
1203
+ zip.write_all(rels_xml.as_bytes()).unwrap();
1204
+
1205
+ for (i, text) in slides.iter().enumerate() {
1206
+ let slide_xml = format!(
1207
+ r#"<?xml version="1.0" encoding="UTF-8"?>
1208
+ <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
1209
+ xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
1210
+ <p:cSld>
1211
+ <p:spTree>
1212
+ <p:sp>
1213
+ <p:txBody>
1214
+ <a:p>
1215
+ <a:r>
1216
+ <a:t>{}</a:t>
1217
+ </a:r>
1218
+ </a:p>
1219
+ </p:txBody>
1220
+ </p:sp>
1221
+ </p:spTree>
1222
+ </p:cSld>
1223
+ </p:sld>"#,
1224
+ text
1225
+ );
1226
+ zip.start_file(format!("ppt/slides/slide{}.xml", i + 1), options)
1227
+ .unwrap();
1228
+ zip.write_all(slide_xml.as_bytes()).unwrap();
1229
+ }
1230
+
1231
+ zip.start_file("docProps/core.xml", options).unwrap();
1232
+ zip.write_all(
1233
+ br#"<?xml version="1.0" encoding="UTF-8"?>
1234
+ <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
1235
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
1236
+ xmlns:dcterms="http://purl.org/dc/terms/">
1237
+ <dc:title>Test Presentation</dc:title>
1238
+ <dc:creator>Test Author</dc:creator>
1239
+ <dc:description>Test Description</dc:description>
1240
+ <dc:subject>Test Subject</dc:subject>
1241
+ </cp:coreProperties>"#,
1242
+ )
1243
+ .unwrap();
1244
+
1245
+ let _ = zip.finish().unwrap();
1246
+ }
1247
+ buffer
1248
+ }
1249
+
1250
+ #[test]
1251
+ fn test_extract_pptx_from_bytes_single_slide() {
1252
+ let pptx_bytes = create_test_pptx_bytes(vec!["Hello World"]);
1253
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
1254
+
1255
+ assert_eq!(result.slide_count, 1);
1256
+ assert!(
1257
+ result.content.contains("Hello World"),
1258
+ "Content was: {}",
1259
+ result.content
1260
+ );
1261
+ assert_eq!(result.image_count, 0);
1262
+ assert_eq!(result.table_count, 0);
1263
+ }
1264
+
1265
+ #[test]
1266
+ fn test_extract_pptx_from_bytes_multiple_slides() {
1267
+ let pptx_bytes = create_test_pptx_bytes(vec!["Slide 1", "Slide 2", "Slide 3"]);
1268
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
1269
+
1270
+ assert_eq!(result.slide_count, 3);
1271
+ assert!(result.content.contains("Slide 1"));
1272
+ assert!(result.content.contains("Slide 2"));
1273
+ assert!(result.content.contains("Slide 3"));
1274
+ }
1275
+
1276
+ #[test]
1277
+ fn test_extract_pptx_metadata() {
1278
+ let pptx_bytes = create_test_pptx_bytes(vec!["Content"]);
1279
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
1280
+
1281
+ assert_eq!(result.metadata.title, Some("Test Presentation".to_string()));
1282
+ assert_eq!(result.metadata.author, Some("Test Author".to_string()));
1283
+ assert_eq!(result.metadata.description, Some("Test Description".to_string()));
1284
+ assert_eq!(result.metadata.summary, Some("Test Subject".to_string()));
1285
+ }
1286
+
1287
+ #[test]
1288
+ fn test_extract_pptx_empty_slides() {
1289
+ let pptx_bytes = create_test_pptx_bytes(vec!["", "", ""]);
1290
+ let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
1291
+
1292
+ assert_eq!(result.slide_count, 3);
1293
+ }
1294
+
1295
+ #[test]
1296
+ fn test_extract_pptx_from_bytes_invalid_data() {
1297
+ let invalid_bytes = b"not a valid pptx file";
1298
+ let result = extract_pptx_from_bytes(invalid_bytes, false);
1299
+
1300
+ assert!(result.is_err());
1301
+ if let Err(KreuzbergError::Parsing { message: msg, .. }) = result {
1302
+ assert!(msg.contains("Failed to read PPTX archive") || msg.contains("Failed to write temp PPTX file"));
1303
+ } else {
1304
+ panic!("Expected ParsingError");
1305
+ }
1306
+ }
1307
+
1308
+ #[test]
1309
+ fn test_extract_pptx_from_bytes_empty_data() {
1310
+ let empty_bytes: &[u8] = &[];
1311
+ let result = extract_pptx_from_bytes(empty_bytes, false);
1312
+
1313
+ assert!(result.is_err());
1314
+ }
1315
+
1316
+ #[test]
1317
+ fn test_detect_image_format_jpeg() {
1318
+ let jpeg_header = vec![0xFF, 0xD8, 0xFF, 0xE0];
1319
+ assert_eq!(detect_image_format(&jpeg_header), "jpeg");
1320
+ }
1321
+
1322
+ #[test]
1323
+ fn test_detect_image_format_png() {
1324
+ let png_header = vec![0x89, 0x50, 0x4E, 0x47];
1325
+ assert_eq!(detect_image_format(&png_header), "png");
1326
+ }
1327
+
1328
+ #[test]
1329
+ fn test_detect_image_format_gif() {
1330
+ let gif_header = b"GIF89a";
1331
+ assert_eq!(detect_image_format(gif_header), "gif");
1332
+ }
1333
+
1334
+ #[test]
1335
+ fn test_detect_image_format_bmp() {
1336
+ let bmp_header = b"BM";
1337
+ assert_eq!(detect_image_format(bmp_header), "bmp");
1338
+ }
1339
+
1340
+ #[test]
1341
+ fn test_detect_image_format_svg() {
1342
+ let svg_header = b"<svg xmlns=\"http://www.w3.org/2000/svg\">";
1343
+ assert_eq!(detect_image_format(svg_header), "svg");
1344
+ }
1345
+
1346
+ #[test]
1347
+ fn test_detect_image_format_tiff_little_endian() {
1348
+ let tiff_header = vec![0x49, 0x49, 0x2A, 0x00];
1349
+ assert_eq!(detect_image_format(&tiff_header), "tiff");
1350
+ }
1351
+
1352
+ #[test]
1353
+ fn test_detect_image_format_tiff_big_endian() {
1354
+ let tiff_header = vec![0x4D, 0x4D, 0x00, 0x2A];
1355
+ assert_eq!(detect_image_format(&tiff_header), "tiff");
1356
+ }
1357
+
1358
+ #[test]
1359
+ fn test_detect_image_format_unknown() {
1360
+ let unknown_data = b"unknown format";
1361
+ assert_eq!(detect_image_format(unknown_data), "unknown");
1362
+ }
1363
+
1364
+ #[test]
1365
+ fn test_html_escape() {
1366
+ assert_eq!(html_escape("plain text"), "plain text");
1367
+ assert_eq!(html_escape("a & b"), "a &amp; b");
1368
+ assert_eq!(html_escape("<tag>"), "&lt;tag&gt;");
1369
+ assert_eq!(html_escape("\"quoted\""), "&quot;quoted&quot;");
1370
+ assert_eq!(html_escape("'apostrophe'"), "&#x27;apostrophe&#x27;");
1371
+ assert_eq!(
1372
+ html_escape("<a href=\"url\" title='test'>text & more</a>"),
1373
+ "&lt;a href=&quot;url&quot; title=&#x27;test&#x27;&gt;text &amp; more&lt;/a&gt;"
1374
+ );
1375
+ }
1376
+
1377
+ #[test]
1378
+ fn test_get_slide_rels_path() {
1379
+ assert_eq!(
1380
+ get_slide_rels_path("ppt/slides/slide1.xml"),
1381
+ "ppt/slides/_rels/slide1.xml.rels"
1382
+ );
1383
+ assert_eq!(
1384
+ get_slide_rels_path("ppt/slides/slide10.xml"),
1385
+ "ppt/slides/_rels/slide10.xml.rels"
1386
+ );
1387
+ }
1388
+
1389
+ #[test]
1390
+ fn test_get_full_image_path_relative() {
1391
+ assert_eq!(
1392
+ get_full_image_path("ppt/slides/slide1.xml", "../media/image1.png"),
1393
+ "ppt/media/image1.png"
1394
+ );
1395
+ }
1396
+
1397
+ #[test]
1398
+ fn test_get_full_image_path_direct() {
1399
+ assert_eq!(
1400
+ get_full_image_path("ppt/slides/slide1.xml", "image1.png"),
1401
+ "ppt/slides/image1.png"
1402
+ );
1403
+ }
1404
+
1405
+ #[test]
1406
+ fn test_content_builder_add_text() {
1407
+ let mut builder = ContentBuilder::new();
1408
+ builder.add_text("Hello");
1409
+ builder.add_text(" ");
1410
+ builder.add_text("World");
1411
+ assert_eq!(builder.build(), "HelloWorld");
1412
+ }
1413
+
1414
+ #[test]
1415
+ fn test_content_builder_add_text_empty() {
1416
+ let mut builder = ContentBuilder::new();
1417
+ builder.add_text(" ");
1418
+ builder.add_text("");
1419
+ assert_eq!(builder.build(), "");
1420
+ }
1421
+
1422
+ #[test]
1423
+ fn test_content_builder_add_title() {
1424
+ let mut builder = ContentBuilder::new();
1425
+ builder.add_title("Title");
1426
+ assert_eq!(builder.build(), "# Title");
1427
+ }
1428
+
1429
+ #[test]
1430
+ fn test_content_builder_add_title_with_whitespace() {
1431
+ let mut builder = ContentBuilder::new();
1432
+ builder.add_title(" Title ");
1433
+ assert_eq!(builder.build(), "# Title");
1434
+ }
1435
+
1436
+ #[test]
1437
+ fn test_content_builder_add_table_empty() {
1438
+ let mut builder = ContentBuilder::new();
1439
+ builder.add_table(&[]);
1440
+ assert_eq!(builder.build(), "");
1441
+ }
1442
+
1443
+ #[test]
1444
+ fn test_content_builder_add_table_single_row() {
1445
+ let mut builder = ContentBuilder::new();
1446
+ let rows = vec![vec!["Header1".to_string(), "Header2".to_string()]];
1447
+ builder.add_table(&rows);
1448
+ let result = builder.build();
1449
+ assert!(result.contains("<table>"));
1450
+ assert!(result.contains("<th>Header1</th>"));
1451
+ assert!(result.contains("<th>Header2</th>"));
1452
+ }
1453
+
1454
+ #[test]
1455
+ fn test_content_builder_add_table_multiple_rows() {
1456
+ let mut builder = ContentBuilder::new();
1457
+ let rows = vec![
1458
+ vec!["H1".to_string(), "H2".to_string()],
1459
+ vec!["D1".to_string(), "D2".to_string()],
1460
+ ];
1461
+ builder.add_table(&rows);
1462
+ let result = builder.build();
1463
+ assert!(result.contains("<th>H1</th>"));
1464
+ assert!(result.contains("<td>D1</td>"));
1465
+ }
1466
+
1467
+ #[test]
1468
+ fn test_content_builder_add_table_with_special_chars() {
1469
+ let mut builder = ContentBuilder::new();
1470
+ let rows = vec![vec!["<tag>".to_string(), "a & b".to_string()]];
1471
+ builder.add_table(&rows);
1472
+ let result = builder.build();
1473
+ assert!(result.contains("&lt;tag&gt;"));
1474
+ assert!(result.contains("a &amp; b"));
1475
+ }
1476
+
1477
+ #[test]
1478
+ fn test_content_builder_add_list_item_unordered() {
1479
+ let mut builder = ContentBuilder::new();
1480
+ builder.add_list_item(1, false, "Item 1");
1481
+ builder.add_list_item(1, false, "Item 2");
1482
+ let result = builder.build();
1483
+ assert!(result.contains("- Item 1"));
1484
+ assert!(result.contains("- Item 2"));
1485
+ }
1486
+
1487
/// Ordered level-1 items are expected to be emitted with a literal `1. ` prefix each.
#[test]
fn test_content_builder_add_list_item_ordered() {
    let mut content = ContentBuilder::new();
    for label in ["First", "Second"] {
        content.add_list_item(1, true, label);
    }

    let rendered = content.build();
    assert!(rendered.contains("1. First"));
    assert!(rendered.contains("1. Second"));
}
1496
+
1497
/// Items added at levels 1 to 3 are expected to appear as bullet entries.
#[test]
fn test_content_builder_add_list_item_nested() {
    let mut content = ContentBuilder::new();
    for (depth, label) in [(1, "Level 1"), (2, "Level 2"), (3, "Level 3")] {
        content.add_list_item(depth, false, label);
    }

    let rendered = content.build();
    // NOTE(review): the needles below carry a single leading space exactly as shown in this
    // view; the view may have collapsed a wider indent — confirm against the original file.
    assert!(rendered.contains("- Level 1"));
    assert!(rendered.contains(" - Level 2"));
    assert!(rendered.contains(" - Level 3"));
}
1508
+
1509
/// An image reference is expected as a Markdown image whose path embeds the slide number and id.
#[test]
fn test_content_builder_add_image() {
    let mut content = ContentBuilder::new();
    content.add_image("img123", 5);

    let rendered = content.build();
    let expected_link = "![img123](slide_5_image_img123.jpg)";
    assert!(rendered.contains(expected_link));
}
1516
+
1517
/// Non-empty notes are expected under a `### Notes:` heading together with the note text.
#[test]
fn test_content_builder_add_notes() {
    let mut content = ContentBuilder::new();
    content.add_notes("This is a note");

    let rendered = content.build();
    for fragment in ["### Notes:", "This is a note"] {
        assert!(rendered.contains(fragment));
    }
}
1525
+
1526
/// Whitespace-only notes are expected to contribute nothing to the built output.
#[test]
fn test_content_builder_add_notes_empty() {
    let mut content = ContentBuilder::new();
    content.add_notes(" ");

    let rendered = content.build();
    assert_eq!(rendered, "");
}
1532
+
1533
/// A slide header is expected as an HTML comment carrying the slide number.
#[test]
fn test_content_builder_add_slide_header() {
    let mut content = ContentBuilder::new();
    content.add_slide_header(3);

    let rendered = content.build();
    let expected_marker = "<!-- Slide number: 3 -->";
    assert!(rendered.contains(expected_marker));
}
1540
+
1541
/// `Run::extract` is expected to return the run's text unchanged.
#[test]
fn test_run_extract() {
    let formatting = Formatting::default();
    let text = String::from("Hello");
    let run = Run { text, formatting };

    assert_eq!(run.extract(), "Hello");
}
1549
+
1550
/// With default formatting, Markdown rendering is expected to return the bare text.
#[test]
fn test_run_render_as_md_plain() {
    let formatting = Formatting::default();
    let text = String::from("plain");
    let run = Run { text, formatting };

    assert_eq!(run.render_as_md(), "plain");
}
1558
+
1559
/// A bold run is expected to be wrapped in `**` markers.
#[test]
fn test_run_render_as_md_bold() {
    let formatting = Formatting {
        bold: true,
        ..Formatting::default()
    };
    let run = Run {
        text: String::from("bold"),
        formatting,
    };

    assert_eq!(run.render_as_md(), "**bold**");
}
1570
+
1571
/// An italic run is expected to be wrapped in single `*` markers.
#[test]
fn test_run_render_as_md_italic() {
    let formatting = Formatting {
        italic: true,
        ..Formatting::default()
    };
    let run = Run {
        text: String::from("italic"),
        formatting,
    };

    assert_eq!(run.render_as_md(), "*italic*");
}
1582
+
1583
/// A run that is both bold and italic is expected to be wrapped in `***` markers.
#[test]
fn test_run_render_as_md_bold_italic() {
    let formatting = Formatting {
        bold: true,
        italic: true,
        ..Formatting::default()
    };
    let run = Run {
        text: String::from("both"),
        formatting,
    };

    assert_eq!(run.render_as_md(), "***both***");
}
1595
+
1596
/// Parses a minimal slide with one text run and checks the first parsed element, if any.
#[test]
fn test_parse_slide_xml_simple_text() {
    let xml = br#"<?xml version="1.0"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
       xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
    <p:cSld>
        <p:spTree>
            <p:sp>
                <p:txBody>
                    <a:p>
                        <a:r>
                            <a:t>Test Text</a:t>
                        </a:r>
                    </a:p>
                </p:txBody>
            </p:sp>
        </p:spTree>
    </p:cSld>
</p:sld>"#;

    let parsed = parse_slide_xml(xml).unwrap();

    // NOTE(review): an empty element list passes silently here; consider asserting
    // non-emptiness once the parser's behaviour for this input is confirmed.
    match parsed.first() {
        None => {}
        Some(SlideElement::Text(text, _)) => assert_eq!(text.runs[0].text, "Test Text\n"),
        Some(_) => panic!("Expected Text element"),
    }
}
1625
+
1626
/// Bytes that are not valid UTF-8 are expected to be rejected; when the error is the
/// `Parsing` variant its message must mention "Invalid UTF-8".
#[test]
fn test_parse_slide_xml_invalid_utf8() {
    let garbage: Vec<u8> = vec![0xFF, 0xFE, 0xFF];
    let outcome = parse_slide_xml(&garbage);

    assert!(outcome.is_err());
    // Other error variants are accepted without inspecting their message.
    if let Err(KreuzbergError::Parsing { message, .. }) = outcome {
        assert!(message.contains("Invalid UTF-8"));
    }
}
1635
+
1636
/// An unclosed tag is expected to make slide parsing fail.
#[test]
fn test_parse_slide_xml_malformed() {
    let broken_markup = b"<not valid xml>";

    let outcome = parse_slide_xml(broken_markup);

    assert!(outcome.is_err());
}
1642
+
1643
/// Two image relationships are expected to be returned in document order with id and target.
#[test]
fn test_parse_slide_rels_with_images() {
    let rels_xml = br#"<?xml version="1.0"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
    <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image1.png"/>
    <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image2.jpg"/>
</Relationships>"#;

    let found = parse_slide_rels(rels_xml).unwrap();
    assert_eq!(found.len(), 2);

    let expected = [("rId1", "../media/image1.png"), ("rId2", "../media/image2.jpg")];
    for (image, (id, target)) in found.iter().zip(expected) {
        assert_eq!(image.id, id);
        assert_eq!(image.target, target);
    }
}
1658
+
1659
/// A relationships part that only links a notes slide is expected to yield no images.
#[test]
fn test_parse_slide_rels_no_images() {
    let rels_xml = br#"<?xml version="1.0"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
    <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide" Target="../notesSlides/notesSlide1.xml"/>
</Relationships>"#;

    let found = parse_slide_rels(rels_xml).unwrap();

    assert_eq!(found.len(), 0);
}
1669
+
1670
/// Only `slide` relationships are expected back, as paths prefixed with `ppt/`; the
/// slide-master relationship must not appear.
#[test]
fn test_parse_presentation_rels() {
    let rels_xml = br#"<?xml version="1.0"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
    <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
    <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide2.xml"/>
    <Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slideMaster" Target="slideMasters/slideMaster1.xml"/>
</Relationships>"#;

    let slide_paths = parse_presentation_rels(rels_xml).unwrap();
    assert_eq!(slide_paths.len(), 2);

    let expected = ["ppt/slides/slide1.xml", "ppt/slides/slide2.xml"];
    for (actual, wanted) in slide_paths.iter().zip(expected) {
        assert_eq!(*actual, wanted);
    }
}
1684
+
1685
/// Text from both paragraphs of a notes slide is expected in the extracted notes.
#[test]
fn test_extract_notes_text() {
    let notes_xml = br#"<?xml version="1.0"?>
<p:notes xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
         xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
    <p:cSld>
        <p:spTree>
            <p:sp>
                <p:txBody>
                    <a:p>
                        <a:r>
                            <a:t>First note</a:t>
                        </a:r>
                    </a:p>
                    <a:p>
                        <a:r>
                            <a:t>Second note</a:t>
                        </a:r>
                    </a:p>
                </p:txBody>
            </p:sp>
        </p:spTree>
    </p:cSld>
</p:notes>"#;

    let extracted = extract_notes_text(notes_xml).unwrap();

    for paragraph in ["First note", "Second note"] {
        assert!(extracted.contains(paragraph));
    }
}
1714
+
1715
/// The default parser configuration is expected to extract images and omit slide comments.
#[test]
fn test_parser_config_default() {
    let defaults = ParserConfig::default();

    assert!(defaults.extract_images);
    assert!(!defaults.include_slide_comment);
}
1721
+
1722
+ fn create_pptx_with_table(rows: Vec<Vec<&str>>) -> Vec<u8> {
1723
+ use std::io::Write;
1724
+ use zip::write::{SimpleFileOptions, ZipWriter};
1725
+
1726
+ let mut buffer = Vec::new();
1727
+ {
1728
+ let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
1729
+ let options = SimpleFileOptions::default();
1730
+
1731
+ zip.start_file("[Content_Types].xml", options).unwrap();
1732
+ zip.write_all(
1733
+ br#"<?xml version="1.0" encoding="UTF-8"?>
1734
+ <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
1735
+ <Default Extension="xml" ContentType="application/xml"/>
1736
+ <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
1737
+ </Types>"#,
1738
+ )
1739
+ .unwrap();
1740
+
1741
+ zip.start_file("ppt/presentation.xml", options).unwrap();
1742
+ zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
1743
+
1744
+ zip.start_file("_rels/.rels", options).unwrap();
1745
+ zip.write_all(
1746
+ br#"<?xml version="1.0" encoding="UTF-8"?>
1747
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1748
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
1749
+ </Relationships>"#,
1750
+ )
1751
+ .unwrap();
1752
+
1753
+ zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
1754
+ zip.write_all(
1755
+ br#"<?xml version="1.0" encoding="UTF-8"?>
1756
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1757
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
1758
+ </Relationships>"#,
1759
+ )
1760
+ .unwrap();
1761
+
1762
+ let mut table_xml = String::from(
1763
+ r#"<a:tbl>
1764
+ <a:tblGrid>"#,
1765
+ );
1766
+ if !rows.is_empty() {
1767
+ for _ in 0..rows[0].len() {
1768
+ table_xml.push_str(r#"<a:gridCol w="2000000"/>"#);
1769
+ }
1770
+ }
1771
+ table_xml.push_str("</a:tblGrid>");
1772
+
1773
+ for row in rows {
1774
+ table_xml.push_str(r#"<a:tr h="370840">"#);
1775
+ for cell in row {
1776
+ table_xml.push_str(&format!(
1777
+ r#"<a:tc>
1778
+ <a:txBody>
1779
+ <a:p>
1780
+ <a:r>
1781
+ <a:t>{}</a:t>
1782
+ </a:r>
1783
+ </a:p>
1784
+ </a:txBody>
1785
+ </a:tc>"#,
1786
+ cell
1787
+ ));
1788
+ }
1789
+ table_xml.push_str("</a:tr>");
1790
+ }
1791
+ table_xml.push_str("</a:tbl>");
1792
+
1793
+ let slide_xml = format!(
1794
+ r#"<?xml version="1.0" encoding="UTF-8"?>
1795
+ <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
1796
+ xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
1797
+ <p:cSld>
1798
+ <p:spTree>
1799
+ <p:graphicFrame>
1800
+ <p:xfrm>
1801
+ <a:off x="1000000" y="2000000"/>
1802
+ <a:ext cx="8000000" cy="4000000"/>
1803
+ </p:xfrm>
1804
+ <a:graphic>
1805
+ <a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/table">
1806
+ {}
1807
+ </a:graphicData>
1808
+ </a:graphic>
1809
+ </p:graphicFrame>
1810
+ </p:spTree>
1811
+ </p:cSld>
1812
+ </p:sld>"#,
1813
+ table_xml
1814
+ );
1815
+
1816
+ zip.start_file("ppt/slides/slide1.xml", options).unwrap();
1817
+ zip.write_all(slide_xml.as_bytes()).unwrap();
1818
+
1819
+ zip.start_file("docProps/core.xml", options).unwrap();
1820
+ zip.write_all(
1821
+ br#"<?xml version="1.0" encoding="UTF-8"?>
1822
+ <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
1823
+ xmlns:dc="http://purl.org/dc/elements/1.1/">
1824
+ <dc:title>Test Table</dc:title>
1825
+ </cp:coreProperties>"#,
1826
+ )
1827
+ .unwrap();
1828
+
1829
+ let _ = zip.finish().unwrap();
1830
+ }
1831
+ buffer
1832
+ }
1833
+
1834
+ fn create_pptx_with_lists(list_items: Vec<(usize, bool, &str)>) -> Vec<u8> {
1835
+ use std::io::Write;
1836
+ use zip::write::{SimpleFileOptions, ZipWriter};
1837
+
1838
+ let mut buffer = Vec::new();
1839
+ {
1840
+ let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
1841
+ let options = SimpleFileOptions::default();
1842
+
1843
+ zip.start_file("[Content_Types].xml", options).unwrap();
1844
+ zip.write_all(
1845
+ br#"<?xml version="1.0" encoding="UTF-8"?>
1846
+ <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
1847
+ <Default Extension="xml" ContentType="application/xml"/>
1848
+ <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
1849
+ </Types>"#,
1850
+ )
1851
+ .unwrap();
1852
+
1853
+ zip.start_file("ppt/presentation.xml", options).unwrap();
1854
+ zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
1855
+
1856
+ zip.start_file("_rels/.rels", options).unwrap();
1857
+ zip.write_all(
1858
+ br#"<?xml version="1.0" encoding="UTF-8"?>
1859
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1860
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
1861
+ </Relationships>"#,
1862
+ )
1863
+ .unwrap();
1864
+
1865
+ zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
1866
+ zip.write_all(
1867
+ br#"<?xml version="1.0" encoding="UTF-8"?>
1868
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1869
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
1870
+ </Relationships>"#,
1871
+ )
1872
+ .unwrap();
1873
+
1874
+ let mut list_xml = String::new();
1875
+ for (level, is_ordered, text) in list_items {
1876
+ let indent = (level - 1) * 457200;
1877
+ let lvl_attr = level - 1;
1878
+ let bullet_section = if is_ordered {
1879
+ format!(
1880
+ r#"<a:pPr lvl="{}"><a:buAutoNum type="arabicPeriod"/></a:pPr>"#,
1881
+ lvl_attr
1882
+ )
1883
+ } else {
1884
+ format!(
1885
+ r#"<a:pPr lvl="{}" marL="{}"><a:buFont typeface="Arial"/><a:buChar char="•"/></a:pPr>"#,
1886
+ lvl_attr, indent
1887
+ )
1888
+ };
1889
+
1890
+ list_xml.push_str(&format!(
1891
+ r#"<p:sp>
1892
+ <p:spPr>
1893
+ <a:xfrm>
1894
+ <a:off x="1000000" y="1000000"/>
1895
+ <a:ext cx="6000000" cy="1000000"/>
1896
+ </a:xfrm>
1897
+ </p:spPr>
1898
+ <p:txBody>
1899
+ <a:bodyPr/>
1900
+ <a:lstStyle/>
1901
+ <a:p>
1902
+ {}
1903
+ <a:r>
1904
+ <a:t>{}</a:t>
1905
+ </a:r>
1906
+ </a:p>
1907
+ </p:txBody>
1908
+ </p:sp>"#,
1909
+ bullet_section, text
1910
+ ));
1911
+ }
1912
+
1913
+ let slide_xml = format!(
1914
+ r#"<?xml version="1.0" encoding="UTF-8"?>
1915
+ <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
1916
+ xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
1917
+ <p:cSld>
1918
+ <p:spTree>
1919
+ {}
1920
+ </p:spTree>
1921
+ </p:cSld>
1922
+ </p:sld>"#,
1923
+ list_xml
1924
+ );
1925
+
1926
+ zip.start_file("ppt/slides/slide1.xml", options).unwrap();
1927
+ zip.write_all(slide_xml.as_bytes()).unwrap();
1928
+
1929
+ zip.start_file("docProps/core.xml", options).unwrap();
1930
+ zip.write_all(
1931
+ br#"<?xml version="1.0" encoding="UTF-8"?>
1932
+ <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
1933
+ xmlns:dc="http://purl.org/dc/elements/1.1/">
1934
+ <dc:title>Test Lists</dc:title>
1935
+ </cp:coreProperties>"#,
1936
+ )
1937
+ .unwrap();
1938
+
1939
+ let _ = zip.finish().unwrap();
1940
+ }
1941
+ buffer
1942
+ }
1943
+
1944
+ fn create_pptx_with_images() -> Vec<u8> {
1945
+ use std::io::Write;
1946
+ use zip::write::{SimpleFileOptions, ZipWriter};
1947
+
1948
+ let mut buffer = Vec::new();
1949
+ {
1950
+ let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
1951
+ let options = SimpleFileOptions::default();
1952
+
1953
+ zip.start_file("[Content_Types].xml", options).unwrap();
1954
+ zip.write_all(
1955
+ br#"<?xml version="1.0" encoding="UTF-8"?>
1956
+ <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
1957
+ <Default Extension="xml" ContentType="application/xml"/>
1958
+ <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
1959
+ <Default Extension="png" ContentType="image/png"/>
1960
+ <Default Extension="jpeg" ContentType="image/jpeg"/>
1961
+ </Types>"#,
1962
+ )
1963
+ .unwrap();
1964
+
1965
+ zip.start_file("ppt/presentation.xml", options).unwrap();
1966
+ zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
1967
+
1968
+ zip.start_file("_rels/.rels", options).unwrap();
1969
+ zip.write_all(
1970
+ br#"<?xml version="1.0" encoding="UTF-8"?>
1971
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1972
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
1973
+ </Relationships>"#,
1974
+ )
1975
+ .unwrap();
1976
+
1977
+ zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
1978
+ zip.write_all(
1979
+ br#"<?xml version="1.0" encoding="UTF-8"?>
1980
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1981
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
1982
+ </Relationships>"#,
1983
+ )
1984
+ .unwrap();
1985
+
1986
+ zip.start_file("ppt/slides/_rels/slide1.xml.rels", options).unwrap();
1987
+ zip.write_all(
1988
+ br#"<?xml version="1.0" encoding="UTF-8"?>
1989
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1990
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image1.png"/>
1991
+ <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image2.jpeg"/>
1992
+ </Relationships>"#,
1993
+ )
1994
+ .unwrap();
1995
+
1996
+ let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
1997
+ <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
1998
+ xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
1999
+ xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
2000
+ <p:cSld>
2001
+ <p:spTree>
2002
+ <p:pic>
2003
+ <p:nvPicPr>
2004
+ <p:cNvPr id="1" name="Image1"/>
2005
+ </p:nvPicPr>
2006
+ <p:blipFill>
2007
+ <a:blip r:embed="rId1"/>
2008
+ </p:blipFill>
2009
+ <p:spPr>
2010
+ <a:xfrm>
2011
+ <a:off x="1000000" y="1000000"/>
2012
+ <a:ext cx="2000000" cy="2000000"/>
2013
+ </a:xfrm>
2014
+ </p:spPr>
2015
+ </p:pic>
2016
+ <p:pic>
2017
+ <p:nvPicPr>
2018
+ <p:cNvPr id="2" name="Image2"/>
2019
+ </p:nvPicPr>
2020
+ <p:blipFill>
2021
+ <a:blip r:embed="rId2"/>
2022
+ </p:blipFill>
2023
+ <p:spPr>
2024
+ <a:xfrm>
2025
+ <a:off x="4000000" y="1000000"/>
2026
+ <a:ext cx="2000000" cy="2000000"/>
2027
+ </a:xfrm>
2028
+ </p:spPr>
2029
+ </p:pic>
2030
+ </p:spTree>
2031
+ </p:cSld>
2032
+ </p:sld>"#;
2033
+
2034
+ zip.start_file("ppt/slides/slide1.xml", options).unwrap();
2035
+ zip.write_all(slide_xml.as_bytes()).unwrap();
2036
+
2037
+ let png_bytes: Vec<u8> = vec![
2038
+ 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, 0x00,
2039
+ 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, 0x00, 0x00, 0x00, 0x90, 0x77, 0x53, 0xDE, 0x00,
2040
+ 0x00, 0x00, 0x00, 0x49, 0x45, 0x4E, 0x44, 0xAE, 0x42, 0x60, 0x82,
2041
+ ];
2042
+ zip.start_file("ppt/media/image1.png", options).unwrap();
2043
+ zip.write_all(&png_bytes).unwrap();
2044
+
2045
+ let jpeg_bytes: Vec<u8> = vec![
2046
+ 0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x00,
2047
+ 0x01, 0x00, 0x00, 0xFF, 0xD9,
2048
+ ];
2049
+ zip.start_file("ppt/media/image2.jpeg", options).unwrap();
2050
+ zip.write_all(&jpeg_bytes).unwrap();
2051
+
2052
+ zip.start_file("docProps/core.xml", options).unwrap();
2053
+ zip.write_all(
2054
+ br#"<?xml version="1.0" encoding="UTF-8"?>
2055
+ <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
2056
+ xmlns:dc="http://purl.org/dc/elements/1.1/">
2057
+ <dc:title>Test Images</dc:title>
2058
+ </cp:coreProperties>"#,
2059
+ )
2060
+ .unwrap();
2061
+
2062
+ let _ = zip.finish().unwrap();
2063
+ }
2064
+ buffer
2065
+ }
2066
+
2067
+ fn create_pptx_with_formatting() -> Vec<u8> {
2068
+ use std::io::Write;
2069
+ use zip::write::{SimpleFileOptions, ZipWriter};
2070
+
2071
+ let mut buffer = Vec::new();
2072
+ {
2073
+ let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
2074
+ let options = SimpleFileOptions::default();
2075
+
2076
+ zip.start_file("[Content_Types].xml", options).unwrap();
2077
+ zip.write_all(
2078
+ br#"<?xml version="1.0" encoding="UTF-8"?>
2079
+ <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
2080
+ <Default Extension="xml" ContentType="application/xml"/>
2081
+ <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
2082
+ </Types>"#,
2083
+ )
2084
+ .unwrap();
2085
+
2086
+ zip.start_file("ppt/presentation.xml", options).unwrap();
2087
+ zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
2088
+
2089
+ zip.start_file("_rels/.rels", options).unwrap();
2090
+ zip.write_all(
2091
+ br#"<?xml version="1.0" encoding="UTF-8"?>
2092
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
2093
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
2094
+ </Relationships>"#,
2095
+ )
2096
+ .unwrap();
2097
+
2098
+ zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
2099
+ zip.write_all(
2100
+ br#"<?xml version="1.0" encoding="UTF-8"?>
2101
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
2102
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
2103
+ </Relationships>"#,
2104
+ )
2105
+ .unwrap();
2106
+
2107
+ let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
2108
+ <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
2109
+ xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
2110
+ <p:cSld>
2111
+ <p:spTree>
2112
+ <p:sp>
2113
+ <p:spPr>
2114
+ <a:xfrm>
2115
+ <a:off x="1000000" y="1000000"/>
2116
+ <a:ext cx="6000000" cy="1000000"/>
2117
+ </a:xfrm>
2118
+ </p:spPr>
2119
+ <p:txBody>
2120
+ <a:p>
2121
+ <a:r>
2122
+ <a:rPr b="1"/>
2123
+ <a:t>Bold text</a:t>
2124
+ </a:r>
2125
+ </a:p>
2126
+ </p:txBody>
2127
+ </p:sp>
2128
+ <p:sp>
2129
+ <p:spPr>
2130
+ <a:xfrm>
2131
+ <a:off x="1000000" y="2000000"/>
2132
+ <a:ext cx="6000000" cy="1000000"/>
2133
+ </a:xfrm>
2134
+ </p:spPr>
2135
+ <p:txBody>
2136
+ <a:p>
2137
+ <a:r>
2138
+ <a:rPr i="1"/>
2139
+ <a:t>Italic text</a:t>
2140
+ </a:r>
2141
+ </a:p>
2142
+ </p:txBody>
2143
+ </p:sp>
2144
+ <p:sp>
2145
+ <p:spPr>
2146
+ <a:xfrm>
2147
+ <a:off x="1000000" y="3000000"/>
2148
+ <a:ext cx="6000000" cy="1000000"/>
2149
+ </a:xfrm>
2150
+ </p:spPr>
2151
+ <p:txBody>
2152
+ <a:p>
2153
+ <a:r>
2154
+ <a:rPr u="sng"/>
2155
+ <a:t>Underline text</a:t>
2156
+ </a:r>
2157
+ </a:p>
2158
+ </p:txBody>
2159
+ </p:sp>
2160
+ <p:sp>
2161
+ <p:spPr>
2162
+ <a:xfrm>
2163
+ <a:off x="1000000" y="4000000"/>
2164
+ <a:ext cx="6000000" cy="1000000"/>
2165
+ </a:xfrm>
2166
+ </p:spPr>
2167
+ <p:txBody>
2168
+ <a:p>
2169
+ <a:r>
2170
+ <a:rPr b="1" i="1"/>
2171
+ <a:t>Bold italic text</a:t>
2172
+ </a:r>
2173
+ </a:p>
2174
+ </p:txBody>
2175
+ </p:sp>
2176
+ </p:spTree>
2177
+ </p:cSld>
2178
+ </p:sld>"#;
2179
+
2180
+ zip.start_file("ppt/slides/slide1.xml", options).unwrap();
2181
+ zip.write_all(slide_xml.as_bytes()).unwrap();
2182
+
2183
+ zip.start_file("docProps/core.xml", options).unwrap();
2184
+ zip.write_all(
2185
+ br#"<?xml version="1.0" encoding="UTF-8"?>
2186
+ <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
2187
+ xmlns:dc="http://purl.org/dc/elements/1.1/">
2188
+ <dc:title>Test Formatting</dc:title>
2189
+ </cp:coreProperties>"#,
2190
+ )
2191
+ .unwrap();
2192
+
2193
+ let _ = zip.finish().unwrap();
2194
+ }
2195
+ buffer
2196
+ }
2197
+
2198
/// A 3x3 table is expected to be counted once, with its first row rendered as `<th>`
/// cells and later rows as `<td>` cells.
#[test]
fn test_table_extraction_with_headers_succeeds() {
    let archive = create_pptx_with_table(vec![
        vec!["Header 1", "Header 2", "Header 3"],
        vec!["Data 1", "Data 2", "Data 3"],
        vec!["Row 2 Col 1", "Row 2 Col 2", "Row 2 Col 3"],
    ]);

    let extracted = extract_pptx_from_bytes(&archive, false).unwrap();

    assert_eq!(extracted.table_count, 1, "Should detect one table");

    let expectations = [
        ("<table>", "Should contain table tag"),
        ("<th>Header 1</th>", "Should render first header"),
        ("<th>Header 2</th>", "Should render second header"),
        ("<th>Header 3</th>", "Should render third header"),
        ("<td>Data 1</td>", "Should render data cell"),
        ("<td>Row 2 Col 2</td>", "Should render second row data"),
    ];
    for (needle, failure_note) in expectations {
        assert!(extracted.content.contains(needle), "{}", failure_note);
    }
}
2228
+
2229
/// A 4x4 table is expected to be counted once and rendered with exactly four `<tr>` rows.
#[test]
fn test_table_extraction_multirow_multicolumn_succeeds() {
    let archive = create_pptx_with_table(vec![
        vec!["A1", "B1", "C1", "D1"],
        vec!["A2", "B2", "C2", "D2"],
        vec!["A3", "B3", "C3", "D3"],
        vec!["A4", "B4", "C4", "D4"],
    ]);

    let extracted = extract_pptx_from_bytes(&archive, false).unwrap();
    let markup = &extracted.content;

    assert_eq!(extracted.table_count, 1, "Should detect one table");
    for (needle, failure_note) in [
        ("<tr>", "Should contain table rows"),
        ("A1", "Should contain first row data"),
        ("D4", "Should contain last row data"),
    ] {
        assert!(markup.contains(needle), "{}", failure_note);
    }

    assert_eq!(markup.matches("<tr>").count(), 4, "Should have 4 table rows");
}
2248
+
2249
/// A single 2x2 table is expected to be reflected as `table_count == 1`.
#[test]
fn test_table_counting_via_slide_metadata_succeeds() {
    let grid = vec![vec!["Col1", "Col2"], vec!["Val1", "Val2"]];
    let archive = create_pptx_with_table(grid);

    let extracted = extract_pptx_from_bytes(&archive, false).unwrap();

    assert_eq!(extracted.table_count, 1, "table_count should be 1");
}
2257
+
2258
/// Checks that header and data cell text survive extraction into the rendered table.
///
/// NOTE(review): despite the name, the fixture text contains no markup-significant
/// characters (the header literally says "ampersand") — confirm whether that is intended.
#[test]
fn test_table_markdown_rendering_with_special_chars() {
    let archive = create_pptx_with_table(vec![
        vec!["Header with ampersand", "Header 2"],
        vec!["Cell data 1", "Cell data 2"],
    ]);

    let extracted = extract_pptx_from_bytes(&archive, false).unwrap();

    for (needle, failure_note) in [
        ("<table>", "Should contain table tag"),
        ("<th>Header with ampersand</th>", "Should contain header text"),
        ("<td>Cell data 1</td>", "Should contain cell data"),
    ] {
        assert!(extracted.content.contains(needle), "{}", failure_note);
    }
}
2277
+
2278
/// A table with no rows is still expected to be counted, but to render no `<td>` cells.
#[test]
fn test_table_extraction_empty_table_returns_one_count() {
    let archive = create_pptx_with_table(Vec::new());

    let extracted = extract_pptx_from_bytes(&archive, false).unwrap();

    assert_eq!(extracted.table_count, 1, "Empty table structure should be detected");
    let has_cells = extracted.content.contains("<td>");
    assert!(!has_cells, "Empty table should have no cells");
}
2286
+
2287
/// Three ordered level-1 paragraphs are each expected to render with a `1. ` prefix.
#[test]
fn test_list_extraction_ordered_list_succeeds() {
    let items = vec![
        (1, true, "First item"),
        (1, true, "Second item"),
        (1, true, "Third item"),
    ];
    let archive = create_pptx_with_lists(items);

    let extracted = extract_pptx_from_bytes(&archive, false).unwrap();

    for (needle, failure_note) in [
        ("1. First item", "Should contain ordered list item 1"),
        ("1. Second item", "Should contain ordered list item 2"),
        ("1. Third item", "Should contain ordered list item 3"),
    ] {
        assert!(extracted.content.contains(needle), "{}", failure_note);
    }
}
2310
+
2311
/// Three unordered level-1 paragraphs are each expected to render with a `- ` prefix.
#[test]
fn test_list_extraction_unordered_list_succeeds() {
    let items = vec![
        (1, false, "Bullet one"),
        (1, false, "Bullet two"),
        (1, false, "Bullet three"),
    ];
    let archive = create_pptx_with_lists(items);

    let extracted = extract_pptx_from_bytes(&archive, false).unwrap();

    for (needle, failure_note) in [
        ("- Bullet one", "Should contain bullet point 1"),
        ("- Bullet two", "Should contain bullet point 2"),
        ("- Bullet three", "Should contain bullet point 3"),
    ] {
        assert!(extracted.content.contains(needle), "{}", failure_note);
    }
}
2328
+
2329
/// Bullets that step down to level 3 and back up to level 1 are all expected to be rendered.
#[test]
fn test_list_extraction_nested_lists_with_indentation_succeeds() {
    let items = vec![
        (1, false, "Level 1 Item"),
        (2, false, "Level 2 Item"),
        (3, false, "Level 3 Item"),
        (2, false, "Back to Level 2"),
        (1, false, "Back to Level 1"),
    ];
    let archive = create_pptx_with_lists(items);

    let extracted = extract_pptx_from_bytes(&archive, false).unwrap();

    // NOTE(review): the indented needles carry a single leading space exactly as shown in
    // this view, while the messages speak of 2- and 4-space indents — confirm against the
    // original file whether wider indents were collapsed.
    let expectations = [
        ("- Level 1 Item", "Should have level 1 with no indent"),
        (" - Level 2 Item", "Should have level 2 with 2-space indent"),
        (" - Level 3 Item", "Should have level 3 with 4-space indent"),
        (" - Back to Level 2", "Should return to level 2 indent"),
        ("- Back to Level 1", "Should return to level 1"),
    ];
    for (needle, failure_note) in expectations {
        assert!(extracted.content.contains(needle), "{}", failure_note);
    }
}
2359
+
2360
/// Alternating ordered and unordered paragraphs are each expected to keep their own prefix.
#[test]
fn test_list_extraction_mixed_ordered_unordered_succeeds() {
    let items = vec![
        (1, true, "Ordered item 1"),
        (1, false, "Unordered item 1"),
        (1, true, "Ordered item 2"),
    ];
    let archive = create_pptx_with_lists(items);

    let extracted = extract_pptx_from_bytes(&archive, false).unwrap();

    for (needle, failure_note) in [
        ("1. Ordered item 1", "Should render ordered list"),
        ("- Unordered item 1", "Should render unordered list"),
        ("1. Ordered item 2", "Should render ordered list again"),
    ] {
        assert!(extracted.content.contains(needle), "{}", failure_note);
    }
}
2383
+
2384
/// With image extraction enabled, both pictures are expected to be counted and image data returned.
#[test]
fn test_image_extraction_from_slide_xml_succeeds() {
    let archive = create_pptx_with_images();

    let extracted = extract_pptx_from_bytes(&archive, true).unwrap();

    assert_eq!(extracted.image_count, 2, "Should detect 2 images");
    let nothing_extracted = extracted.images.is_empty();
    assert!(!nothing_extracted, "Should extract image data");
}
2392
+
2393
/// Each of the two extracted images is expected to carry non-empty data.
#[test]
fn test_image_data_loading_from_zip_archive_succeeds() {
    let archive = create_pptx_with_images();

    let extracted = extract_pptx_from_bytes(&archive, true).unwrap();
    let loaded = &extracted.images;

    assert_eq!(loaded.len(), 2, "Should load 2 images");

    let mut position = 0;
    for image in loaded.iter() {
        assert!(!image.data.is_empty(), "Image {} should have non-empty data", position);
        position += 1;
    }
}
2404
+
2405
/// The two extracted images are expected to report the formats `png` and `jpeg`.
#[test]
fn test_image_format_detection_succeeds() {
    let archive = create_pptx_with_images();

    let extracted = extract_pptx_from_bytes(&archive, true).unwrap();

    assert_eq!(extracted.images.len(), 2, "Should have 2 images");

    let has_format = |wanted: &str| extracted.images.iter().any(|img| img.format.as_str() == wanted);
    assert!(has_format("png"), "Should detect PNG format");
    assert!(has_format("jpeg"), "Should detect JPEG format");
}
2417
+
2418
/// `image_count` and the length of `images` are both expected to be 2.
#[test]
fn test_image_counting_via_result_metadata_succeeds() {
    let archive = create_pptx_with_images();

    let extracted = extract_pptx_from_bytes(&archive, true).unwrap();

    assert_eq!(extracted.image_count, 2, "image_count should match actual images");
    let extracted_len = extracted.images.len();
    assert_eq!(extracted_len, 2, "images vector should have 2 elements");
}
2426
+
2427
/// With the second argument set to `false`, the images are still counted
/// (`image_count == 2`) but the `images` vector stays empty.
#[test]
fn test_image_extraction_disabled_returns_zero_images() {
    let deck = create_pptx_with_images();
    let extracted = extract_pptx_from_bytes(&deck, false).unwrap();

    // Counting is independent of whether image data is extracted.
    assert_eq!(
        extracted.image_count, 2,
        "Should still count images even when not extracted"
    );

    let extracted_images = extracted.images.len();
    assert_eq!(extracted_images, 0, "Should not extract image data when disabled");
}
2438
+
2439
/// The fixture has one slide holding two images; the extracted images must
/// carry the indices 0 and 1, in that order.
#[test]
fn test_multiple_images_per_slide_extraction_succeeds() {
    let deck = create_pptx_with_images();
    let extracted = extract_pptx_from_bytes(&deck, true).unwrap();

    assert_eq!(extracted.slide_count, 1, "Should have 1 slide");
    assert_eq!(extracted.image_count, 2, "Single slide should contain 2 images");

    let mut indices: Vec<usize> = Vec::new();
    for image in extracted.images.iter() {
        indices.push(image.image_index);
    }

    assert_eq!(indices.len(), 2, "Should have 2 images with indices");
    assert_eq!(indices, vec![0, 1], "Should have sequential image indices");
}
2451
+
2452
/// The formatting fixture's bold run must appear in the output prefixed by
/// the `**` marker.
#[test]
fn test_formatting_bold_text_renders_as_markdown_bold() {
    let deck = create_pptx_with_formatting();
    let markdown = extract_pptx_from_bytes(&deck, false).unwrap().content;

    let has_bold_marker = markdown.contains("**Bold text");
    assert!(has_bold_marker, "Should render bold text with ** markers");
}
2462
+
2463
/// The formatting fixture's italic run must appear in the output prefixed by
/// the `*` marker.
#[test]
fn test_formatting_italic_text_renders_as_markdown_italic() {
    let deck = create_pptx_with_formatting();
    let markdown = extract_pptx_from_bytes(&deck, false).unwrap().content;

    let has_italic_marker = markdown.contains("*Italic text");
    assert!(has_italic_marker, "Should render italic text with * markers");
}
2473
+
2474
/// The formatting fixture's underlined run must appear in the output
/// prefixed by an opening `<u>` tag.
#[test]
fn test_formatting_underline_text_renders_as_html_underline() {
    let deck = create_pptx_with_formatting();
    let markdown = extract_pptx_from_bytes(&deck, false).unwrap().content;

    let has_underline_tag = markdown.contains("<u>Underline text");
    assert!(has_underline_tag, "Should render underline with HTML tags");
}
2484
+
2485
/// The formatting fixture's bold-and-italic run must appear in the output
/// prefixed by the `***` marker.
#[test]
fn test_formatting_combined_bold_italic_renders_correctly() {
    let deck = create_pptx_with_formatting();
    let markdown = extract_pptx_from_bytes(&deck, false).unwrap().content;

    let has_combined_marker = markdown.contains("***Bold italic text");
    assert!(has_combined_marker, "Should render bold+italic with *** markers");
}
2495
+
2496
/// A run whose only active formatting flag is `underlined` must render as the
/// text wrapped in `<u>…</u>`.
#[test]
fn test_run_render_underline_formatting() {
    // Only `underlined` is set; every other flag keeps its default.
    let underline_only = Formatting {
        underlined: true,
        ..Default::default()
    };
    let run = Run {
        text: String::from("underlined"),
        formatting: underline_only,
    };

    assert_eq!(
        run.render_as_md(),
        "<u>underlined</u>",
        "Should wrap underlined text in <u> tags"
    );
}
2511
+
2512
/// A run with bold, italic and underline all enabled must render with the
/// `***` marker, a `<u>` tag and the original text.
#[test]
fn test_run_render_all_formatting_combined() {
    let everything = Formatting {
        bold: true,
        italic: true,
        underlined: true,
        ..Default::default()
    };
    let rendered = Run {
        text: String::from("all formats"),
        formatting: everything,
    }
    .render_as_md();

    // Only the presence of each piece is checked, not the nesting order.
    assert!(rendered.contains("***"), "Should have bold+italic markers");
    assert!(rendered.contains("<u>"), "Should have underline tags");
    assert!(rendered.contains("all formats"), "Should contain original text");
}
2528
+
2529
/// End-to-end check on a hand-assembled, single-slide archive.
///
/// The slide contains a bold run, a paragraph with a bullet character, a
/// two-row/two-column table and one picture that points (via `rId1`) at an
/// embedded PNG. The test asserts on the rendered content as well as on the
/// slide, table and image counters.
#[test]
fn test_integration_complete_pptx_with_mixed_content_succeeds() {
    use std::io::Write;
    use zip::write::{SimpleFileOptions, ZipWriter};

    // Archive parts, declared in the order they are written below. The text
    // inside the literals is written to the archive verbatim.
    const CONTENT_TYPES_XML: &[u8] = br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="xml" ContentType="application/xml"/>
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="png" ContentType="image/png"/>
</Types>"#;

    const PRESENTATION_XML: &[u8] = b"<?xml version=\"1.0\"?><presentation/>";

    const ROOT_RELS_XML: &[u8] = br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#;

    const PRESENTATION_RELS_XML: &[u8] = br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#;

    // Maps the picture's `r:embed="rId1"` to the PNG written further down.
    const SLIDE_RELS_XML: &[u8] = br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image1.png"/>
</Relationships>"#;

    const SLIDE_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<p:cSld>
<p:spTree>
<p:sp>
<p:txBody>
<a:p>
<a:r>
<a:rPr b="1"/>
<a:t>Title with Bold</a:t>
</a:r>
</a:p>
</p:txBody>
<p:spPr>
<a:xfrm>
<a:off x="1000000" y="500000"/>
</a:xfrm>
</p:spPr>
</p:sp>
<p:sp>
<p:txBody>
<a:p>
<a:pPr lvl="0"><a:buChar char="•"/></a:pPr>
<a:r>
<a:t>List item one</a:t>
</a:r>
</a:p>
</p:txBody>
<p:spPr>
<a:xfrm>
<a:off x="1000000" y="1500000"/>
</a:xfrm>
</p:spPr>
</p:sp>
<p:graphicFrame>
<p:xfrm>
<a:off x="1000000" y="2500000"/>
<a:ext cx="4000000" cy="2000000"/>
</p:xfrm>
<a:graphic>
<a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/table">
<a:tbl>
<a:tblGrid>
<a:gridCol w="2000000"/>
<a:gridCol w="2000000"/>
</a:tblGrid>
<a:tr h="370840">
<a:tc>
<a:txBody>
<a:p>
<a:r>
<a:t>Header A</a:t>
</a:r>
</a:p>
</a:txBody>
</a:tc>
<a:tc>
<a:txBody>
<a:p>
<a:r>
<a:t>Header B</a:t>
</a:r>
</a:p>
</a:txBody>
</a:tc>
</a:tr>
<a:tr h="370840">
<a:tc>
<a:txBody>
<a:p>
<a:r>
<a:t>Data 1</a:t>
</a:r>
</a:p>
</a:txBody>
</a:tc>
<a:tc>
<a:txBody>
<a:p>
<a:r>
<a:t>Data 2</a:t>
</a:r>
</a:p>
</a:txBody>
</a:tc>
</a:tr>
</a:tbl>
</a:graphicData>
</a:graphic>
</p:graphicFrame>
<p:pic>
<p:nvPicPr>
<p:cNvPr id="1" name="TestImage"/>
</p:nvPicPr>
<p:blipFill>
<a:blip r:embed="rId1"/>
</p:blipFill>
<p:spPr>
<a:xfrm>
<a:off x="6000000" y="1000000"/>
<a:ext cx="2000000" cy="2000000"/>
</a:xfrm>
</p:spPr>
</p:pic>
</p:spTree>
</p:cSld>
</p:sld>"#;

    const CORE_PROPS_XML: &[u8] = br#"<?xml version="1.0" encoding="UTF-8"?>
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title>Integration Test</dc:title>
</cp:coreProperties>"#;

    // Bytes stored as `ppt/media/image1.png`; they begin with the PNG signature.
    let tiny_png: &[u8] = &[
        0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, 0x00,
        0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, 0x00, 0x00, 0x00, 0x90, 0x77, 0x53, 0xDE, 0x00,
        0x00, 0x00, 0x00, 0x49, 0x45, 0x4E, 0x44, 0xAE, 0x42, 0x60, 0x82,
    ];

    let mut archive = Vec::new();
    {
        // The writer borrows `archive` mutably; the inner scope ends that
        // borrow before the bytes are handed to the extractor.
        let mut writer = ZipWriter::new(std::io::Cursor::new(&mut archive));
        let opts = SimpleFileOptions::default();

        writer.start_file("[Content_Types].xml", opts).unwrap();
        writer.write_all(CONTENT_TYPES_XML).unwrap();

        writer.start_file("ppt/presentation.xml", opts).unwrap();
        writer.write_all(PRESENTATION_XML).unwrap();

        writer.start_file("_rels/.rels", opts).unwrap();
        writer.write_all(ROOT_RELS_XML).unwrap();

        writer.start_file("ppt/_rels/presentation.xml.rels", opts).unwrap();
        writer.write_all(PRESENTATION_RELS_XML).unwrap();

        writer.start_file("ppt/slides/_rels/slide1.xml.rels", opts).unwrap();
        writer.write_all(SLIDE_RELS_XML).unwrap();

        writer.start_file("ppt/slides/slide1.xml", opts).unwrap();
        writer.write_all(SLIDE_XML.as_bytes()).unwrap();

        writer.start_file("ppt/media/image1.png", opts).unwrap();
        writer.write_all(tiny_png).unwrap();

        writer.start_file("docProps/core.xml", opts).unwrap();
        writer.write_all(CORE_PROPS_XML).unwrap();

        let _ = writer.finish().unwrap();
    }

    let extracted = extract_pptx_from_bytes(&archive, true).unwrap();
    let markdown = &extracted.content;

    // Rendered content: formatted title, list item and table.
    assert!(
        markdown.contains("**Title with Bold"),
        "Should contain formatted title"
    );
    assert!(markdown.contains("- List item one"), "Should contain list item");
    assert!(markdown.contains("<table>"), "Should contain table");
    assert!(markdown.contains("Header A"), "Should contain table header");
    assert!(markdown.contains("Data 1"), "Should contain table data");

    // Counters reported alongside the content.
    assert_eq!(extracted.slide_count, 1, "Should have 1 slide");
    assert_eq!(extracted.table_count, 1, "Should detect 1 table");
    assert_eq!(extracted.image_count, 1, "Should detect 1 image");
    assert_eq!(extracted.images.len(), 1, "Should extract 1 image");
}
2730
+
2731
/// Shapes must be emitted in position order, not in document order.
///
/// The slide declares four text shapes in the order Bottom Right, Top Left,
/// Top Right, Bottom Left. Their `a:off` offsets place them on a 2x2 grid
/// (y = 1000000 / 3000000, x = 1000000 / 5000000). The extracted content must
/// list the top row before the bottom row and, within a row, the lower x
/// first.
#[test]
fn test_integration_position_based_sorting_orders_elements_correctly() {
    use std::io::Write;
    use zip::write::{SimpleFileOptions, ZipWriter};

    // Archive parts, declared in the order they are written below. The text
    // inside the literals is written to the archive verbatim.
    const CONTENT_TYPES_XML: &[u8] = br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="xml" ContentType="application/xml"/>
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
</Types>"#;

    const PRESENTATION_XML: &[u8] = b"<?xml version=\"1.0\"?><presentation/>";

    const ROOT_RELS_XML: &[u8] = br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#;

    const PRESENTATION_RELS_XML: &[u8] = br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#;

    const SLIDE_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld>
<p:spTree>
<p:sp>
<p:txBody>
<a:p>
<a:r>
<a:t>Bottom Right</a:t>
</a:r>
</a:p>
</p:txBody>
<p:spPr>
<a:xfrm>
<a:off x="5000000" y="3000000"/>
</a:xfrm>
</p:spPr>
</p:sp>
<p:sp>
<p:txBody>
<a:p>
<a:r>
<a:t>Top Left</a:t>
</a:r>
</a:p>
</p:txBody>
<p:spPr>
<a:xfrm>
<a:off x="1000000" y="1000000"/>
</a:xfrm>
</p:spPr>
</p:sp>
<p:sp>
<p:txBody>
<a:p>
<a:r>
<a:t>Top Right</a:t>
</a:r>
</a:p>
</p:txBody>
<p:spPr>
<a:xfrm>
<a:off x="5000000" y="1000000"/>
</a:xfrm>
</p:spPr>
</p:sp>
<p:sp>
<p:txBody>
<a:p>
<a:r>
<a:t>Bottom Left</a:t>
</a:r>
</a:p>
</p:txBody>
<p:spPr>
<a:xfrm>
<a:off x="1000000" y="3000000"/>
</a:xfrm>
</p:spPr>
</p:sp>
</p:spTree>
</p:cSld>
</p:sld>"#;

    const CORE_PROPS_XML: &[u8] = br#"<?xml version="1.0" encoding="UTF-8"?>
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title>Position Test</dc:title>
</cp:coreProperties>"#;

    let mut archive = Vec::new();
    {
        // The writer borrows `archive` mutably; the inner scope ends that
        // borrow before the bytes are handed to the extractor.
        let mut writer = ZipWriter::new(std::io::Cursor::new(&mut archive));
        let opts = SimpleFileOptions::default();

        writer.start_file("[Content_Types].xml", opts).unwrap();
        writer.write_all(CONTENT_TYPES_XML).unwrap();

        writer.start_file("ppt/presentation.xml", opts).unwrap();
        writer.write_all(PRESENTATION_XML).unwrap();

        writer.start_file("_rels/.rels", opts).unwrap();
        writer.write_all(ROOT_RELS_XML).unwrap();

        writer.start_file("ppt/_rels/presentation.xml.rels", opts).unwrap();
        writer.write_all(PRESENTATION_RELS_XML).unwrap();

        writer.start_file("ppt/slides/slide1.xml", opts).unwrap();
        writer.write_all(SLIDE_XML.as_bytes()).unwrap();

        writer.start_file("docProps/core.xml", opts).unwrap();
        writer.write_all(CORE_PROPS_XML).unwrap();

        let _ = writer.finish().unwrap();
    }

    let rendered = extract_pptx_from_bytes(&archive, false).unwrap().content;

    // Byte offset of a label in the output; panics if the label is missing.
    let offset_of = |label: &str| rendered.find(label).unwrap();

    let top_left = offset_of("Top Left");
    let top_right = offset_of("Top Right");
    let bottom_left = offset_of("Bottom Left");
    let bottom_right = offset_of("Bottom Right");

    assert!(
        top_left < top_right,
        "Top Left should appear before Top Right (same Y, lower X)"
    );
    assert!(top_right < bottom_left, "Top row should appear before bottom row");
    assert!(
        bottom_left < bottom_right,
        "Bottom Left should appear before Bottom Right (same Y, lower X)"
    );
}
2874
+
2875
/// Speaker notes linked from a slide must show up in the extracted content.
///
/// The archive holds one slide whose relationships file points at
/// `../notesSlides/notesSlide1.xml`. The output must contain the slide text,
/// a `### Notes:` header and the note text itself.
#[test]
fn test_integration_slide_notes_extraction_succeeds() {
    use std::io::Write;
    use zip::write::{SimpleFileOptions, ZipWriter};

    // Archive parts, declared in the order they are written below. The text
    // inside the literals is written to the archive verbatim.
    const CONTENT_TYPES_XML: &[u8] = br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="xml" ContentType="application/xml"/>
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
</Types>"#;

    const PRESENTATION_XML: &[u8] = b"<?xml version=\"1.0\"?><presentation/>";

    const ROOT_RELS_XML: &[u8] = br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#;

    const PRESENTATION_RELS_XML: &[u8] = br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#;

    // Links slide 1 to its notes part.
    const SLIDE_RELS_XML: &[u8] = br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide" Target="../notesSlides/notesSlide1.xml"/>
</Relationships>"#;

    const SLIDE_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld>
<p:spTree>
<p:sp>
<p:txBody>
<a:p>
<a:r>
<a:t>Slide Content</a:t>
</a:r>
</a:p>
</p:txBody>
</p:sp>
</p:spTree>
</p:cSld>
</p:sld>"#;

    const NOTES_XML: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:notes xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld>
<p:spTree>
<p:sp>
<p:txBody>
<a:p>
<a:r>
<a:t>This is a speaker note for testing</a:t>
</a:r>
</a:p>
</p:txBody>
</p:sp>
</p:spTree>
</p:cSld>
</p:notes>"#;

    const CORE_PROPS_XML: &[u8] = br#"<?xml version="1.0" encoding="UTF-8"?>
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title>Notes Test</dc:title>
</cp:coreProperties>"#;

    let mut archive = Vec::new();
    {
        // The writer borrows `archive` mutably; the inner scope ends that
        // borrow before the bytes are handed to the extractor.
        let mut writer = ZipWriter::new(std::io::Cursor::new(&mut archive));
        let opts = SimpleFileOptions::default();

        writer.start_file("[Content_Types].xml", opts).unwrap();
        writer.write_all(CONTENT_TYPES_XML).unwrap();

        writer.start_file("ppt/presentation.xml", opts).unwrap();
        writer.write_all(PRESENTATION_XML).unwrap();

        writer.start_file("_rels/.rels", opts).unwrap();
        writer.write_all(ROOT_RELS_XML).unwrap();

        writer.start_file("ppt/_rels/presentation.xml.rels", opts).unwrap();
        writer.write_all(PRESENTATION_RELS_XML).unwrap();

        writer.start_file("ppt/slides/_rels/slide1.xml.rels", opts).unwrap();
        writer.write_all(SLIDE_RELS_XML).unwrap();

        writer.start_file("ppt/slides/slide1.xml", opts).unwrap();
        writer.write_all(SLIDE_XML.as_bytes()).unwrap();

        writer.start_file("ppt/notesSlides/notesSlide1.xml", opts).unwrap();
        writer.write_all(NOTES_XML.as_bytes()).unwrap();

        writer.start_file("docProps/core.xml", opts).unwrap();
        writer.write_all(CORE_PROPS_XML).unwrap();

        let _ = writer.finish().unwrap();
    }

    let extracted = extract_pptx_from_bytes(&archive, false).unwrap();
    let markdown = &extracted.content;

    assert!(markdown.contains("Slide Content"), "Should contain slide content");
    assert!(markdown.contains("### Notes:"), "Should contain notes header");
    assert!(
        markdown.contains("This is a speaker note for testing"),
        "Should extract speaker notes"
    );
}
2989
+
2990
/// The archive produced by `create_test_pptx_bytes` must expose the expected
/// title, author, description and summary through `result.metadata`.
#[test]
fn test_integration_metadata_extraction_complete() {
    let deck = create_test_pptx_bytes(vec!["Content"]);
    let extracted = extract_pptx_from_bytes(&deck, false).unwrap();
    let meta = &extracted.metadata;

    assert_eq!(meta.title, Some(String::from("Test Presentation")));
    assert_eq!(meta.author, Some(String::from("Test Author")));
    assert_eq!(meta.description, Some(String::from("Test Description")));
    // The expected `summary` value is "Test Subject".
    assert_eq!(meta.summary, Some(String::from("Test Subject")));
}
3000
+ }