kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -1,3102 +0,0 @@
1
- //! PowerPoint presentation extraction functions.
2
- //!
3
- //! This module provides PowerPoint (PPTX) file parsing by directly reading the Office Open XML
4
- //! format. It extracts text content, slide structure, images, and presentation metadata.
5
- //!
6
- //! # Attribution
7
- //!
8
- //! This code is based on the [pptx-to-md](https://github.com/nilskruthoff/pptx-parser) library
9
- //! by Nils Kruthoff, licensed under MIT OR Apache-2.0. The original code has been vendored and
10
- //! adapted to integrate with Kreuzberg's architecture. See ATTRIBUTIONS.md for full license text.
11
- //!
12
- //! # Features
13
- //!
14
- //! - **Slide extraction**: Reads all slides from presentation
15
- //! - **Text formatting**: Preserves bold, italic, underline formatting as Markdown
16
- //! - **Image extraction**: Optionally extracts embedded images with metadata
17
- //! - **Office metadata**: Extracts core properties, custom properties (when `office` feature enabled)
18
- //! - **Structure preservation**: Maintains heading hierarchy and list structure
19
- //!
20
- //! # Supported Formats
21
- //!
22
- //! - `.pptx` - PowerPoint Presentation
23
- //! - `.pptm` - PowerPoint Macro-Enabled Presentation
24
- //! - `.ppsx` - PowerPoint Slide Show
25
- //!
26
- //! # Example
27
- //!
28
- //! ```rust
29
- //! use kreuzberg::extraction::pptx::extract_pptx_from_path;
30
- //!
31
- //! # fn example() -> kreuzberg::Result<()> {
32
- //! let result = extract_pptx_from_path("presentation.pptx", true, None)?;
33
- //!
34
- //! println!("Slide count: {}", result.slide_count);
35
- //! println!("Image count: {}", result.image_count);
36
- //! println!("Content:\n{}", result.content);
37
- //! # Ok(())
38
- //! # }
39
- //! ```
40
- use crate::error::{KreuzbergError, Result};
41
- use crate::text::utf8_validation;
42
- use crate::types::{ExtractedImage, PptxExtractionResult, PptxMetadata};
43
- use std::collections::HashMap;
44
- use std::fs::File;
45
- use std::io::Read;
46
- use std::path::Path;
47
- use zip::ZipArchive;
48
-
49
- #[cfg(feature = "office")]
50
- use crate::extraction::office_metadata::{
51
- extract_core_properties, extract_custom_properties, extract_pptx_app_properties,
52
- };
53
- #[cfg(feature = "office")]
54
- use serde_json::Value;
55
-
56
- const P_NAMESPACE: &str = "http://schemas.openxmlformats.org/presentationml/2006/main";
57
- const A_NAMESPACE: &str = "http://schemas.openxmlformats.org/drawingml/2006/main";
58
- const RELS_NAMESPACE: &str = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
59
-
60
- #[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord)]
61
- struct ElementPosition {
62
- x: i64,
63
- y: i64,
64
- }
65
-
66
- #[derive(Debug, Clone, Default)]
67
- struct Formatting {
68
- bold: bool,
69
- italic: bool,
70
- underlined: bool,
71
- lang: String,
72
- }
73
-
74
- #[derive(Debug, Clone)]
75
- struct Run {
76
- text: String,
77
- formatting: Formatting,
78
- }
79
-
80
- impl Run {
81
- fn extract(&self) -> String {
82
- self.text.clone()
83
- }
84
-
85
- fn render_as_md(&self) -> String {
86
- let mut result = self.text.clone();
87
-
88
- if self.formatting.bold {
89
- result = format!("**{}**", result);
90
- }
91
- if self.formatting.italic {
92
- result = format!("*{}*", result);
93
- }
94
- if self.formatting.underlined {
95
- result = format!("<u>{}</u>", result);
96
- }
97
-
98
- result
99
- }
100
- }
101
-
102
- #[derive(Debug, Clone)]
103
- struct TextElement {
104
- runs: Vec<Run>,
105
- }
106
-
107
- #[derive(Debug, Clone)]
108
- struct ListItem {
109
- level: u32,
110
- is_ordered: bool,
111
- runs: Vec<Run>,
112
- }
113
-
114
- #[derive(Debug, Clone)]
115
- struct ListElement {
116
- items: Vec<ListItem>,
117
- }
118
-
119
- #[derive(Debug, Clone)]
120
- struct TableCell {
121
- runs: Vec<Run>,
122
- }
123
-
124
- #[derive(Debug, Clone)]
125
- struct TableRow {
126
- cells: Vec<TableCell>,
127
- }
128
-
129
- #[derive(Debug, Clone)]
130
- struct TableElement {
131
- rows: Vec<TableRow>,
132
- }
133
-
134
- #[derive(Debug, Clone)]
135
- struct ImageReference {
136
- id: String,
137
- target: String,
138
- }
139
-
140
- #[derive(Debug, Clone)]
141
- enum SlideElement {
142
- Text(TextElement, ElementPosition),
143
- Table(TableElement, ElementPosition),
144
- Image(ImageReference, ElementPosition),
145
- List(ListElement, ElementPosition),
146
- Unknown,
147
- }
148
-
149
- impl SlideElement {
150
- fn position(&self) -> ElementPosition {
151
- match self {
152
- SlideElement::Text(_, pos)
153
- | SlideElement::Table(_, pos)
154
- | SlideElement::Image(_, pos)
155
- | SlideElement::List(_, pos) => *pos,
156
- SlideElement::Unknown => ElementPosition::default(),
157
- }
158
- }
159
- }
160
-
161
- #[derive(Debug)]
162
- struct Slide {
163
- slide_number: u32,
164
- elements: Vec<SlideElement>,
165
- images: Vec<ImageReference>,
166
- }
167
-
168
- #[derive(Debug, Clone)]
169
- struct ParserConfig {
170
- extract_images: bool,
171
- include_slide_comment: bool,
172
- }
173
-
174
- impl Default for ParserConfig {
175
- fn default() -> Self {
176
- Self {
177
- extract_images: true,
178
- include_slide_comment: false,
179
- }
180
- }
181
- }
182
-
183
- struct ContentBuilder {
184
- content: String,
185
- boundaries: Vec<crate::types::PageBoundary>,
186
- page_contents: Vec<crate::types::PageContent>,
187
- config: Option<crate::core::config::PageConfig>,
188
- }
189
-
190
- impl ContentBuilder {
191
- fn new() -> Self {
192
- Self {
193
- content: String::with_capacity(8192),
194
- boundaries: Vec::new(),
195
- page_contents: Vec::new(),
196
- config: None,
197
- }
198
- }
199
-
200
- fn with_page_config(capacity: usize, config: Option<crate::core::config::PageConfig>) -> Self {
201
- Self {
202
- content: String::with_capacity(capacity),
203
- boundaries: if config.is_some() {
204
- Vec::new()
205
- } else {
206
- Vec::with_capacity(0)
207
- },
208
- page_contents: if config.is_some() {
209
- Vec::new()
210
- } else {
211
- Vec::with_capacity(0)
212
- },
213
- config,
214
- }
215
- }
216
-
217
- fn start_slide(&mut self, slide_number: u32) -> usize {
218
- let byte_start = self.content.len();
219
-
220
- if let Some(ref cfg) = self.config
221
- && cfg.insert_page_markers
222
- {
223
- let marker = cfg.marker_format.replace("{page_num}", &slide_number.to_string());
224
- self.content.push_str(&marker);
225
- }
226
-
227
- byte_start
228
- }
229
-
230
- fn end_slide(&mut self, slide_number: u32, byte_start: usize, slide_content: String) {
231
- let byte_end = self.content.len();
232
-
233
- if self.config.is_some() {
234
- self.boundaries.push(crate::types::PageBoundary {
235
- byte_start,
236
- byte_end,
237
- page_number: slide_number as usize,
238
- });
239
-
240
- self.page_contents.push(crate::types::PageContent {
241
- page_number: slide_number as usize,
242
- content: slide_content,
243
- tables: Vec::new(),
244
- images: Vec::new(),
245
- hierarchy: None,
246
- });
247
- }
248
- }
249
-
250
- fn add_slide_header(&mut self, slide_number: u32) {
251
- self.content.reserve(50);
252
- self.content.push_str("\n\n<!-- Slide number: ");
253
- self.content.push_str(&slide_number.to_string());
254
- self.content.push_str(" -->\n");
255
- }
256
-
257
- fn add_text(&mut self, text: &str) {
258
- if !text.trim().is_empty() {
259
- self.content.push_str(text);
260
- }
261
- }
262
-
263
- fn add_title(&mut self, title: &str) {
264
- if !title.trim().is_empty() {
265
- self.content.push_str("# ");
266
- self.content.push_str(title.trim());
267
- self.content.push('\n');
268
- }
269
- }
270
-
271
- fn add_table(&mut self, rows: &[Vec<String>]) {
272
- if rows.is_empty() {
273
- return;
274
- }
275
-
276
- self.content.push_str("\n<table>");
277
- for (i, row) in rows.iter().enumerate() {
278
- self.content.push_str("<tr>");
279
- let tag = if i == 0 { "th" } else { "td" };
280
-
281
- for cell in row {
282
- self.content.push('<');
283
- self.content.push_str(tag);
284
- self.content.push('>');
285
- self.content.push_str(&html_escape(cell));
286
- self.content.push_str("</");
287
- self.content.push_str(tag);
288
- self.content.push('>');
289
- }
290
- self.content.push_str("</tr>");
291
- }
292
- self.content.push_str("</table>\n");
293
- }
294
-
295
- fn add_list_item(&mut self, level: u32, is_ordered: bool, text: &str) {
296
- let indent_count = level.saturating_sub(1) as usize;
297
- for _ in 0..indent_count {
298
- self.content.push_str(" ");
299
- }
300
-
301
- let marker = if is_ordered { "1." } else { "-" };
302
- self.content.push_str(marker);
303
- self.content.push(' ');
304
- self.content.push_str(text.trim());
305
- self.content.push('\n');
306
- }
307
-
308
- fn add_image(&mut self, image_id: &str, slide_number: u32) {
309
- let filename = format!("slide_{}_image_{}.jpg", slide_number, image_id);
310
- self.content.push_str("![");
311
- self.content.push_str(image_id);
312
- self.content.push_str("](");
313
- self.content.push_str(&filename);
314
- self.content.push_str(")\n");
315
- }
316
-
317
- fn add_notes(&mut self, notes: &str) {
318
- if !notes.trim().is_empty() {
319
- self.content.push_str("\n\n### Notes:\n");
320
- self.content.push_str(notes);
321
- self.content.push('\n');
322
- }
323
- }
324
-
325
- fn build(
326
- self,
327
- ) -> (
328
- String,
329
- Option<Vec<crate::types::PageBoundary>>,
330
- Option<Vec<crate::types::PageContent>>,
331
- ) {
332
- let content = self.content.trim().to_string();
333
- let boundaries = if self.config.is_some() && !self.boundaries.is_empty() {
334
- Some(self.boundaries)
335
- } else {
336
- None
337
- };
338
- let pages = if self.config.is_some() && !self.page_contents.is_empty() {
339
- Some(self.page_contents)
340
- } else {
341
- None
342
- };
343
- (content, boundaries, pages)
344
- }
345
- }
346
-
347
- fn html_escape(text: &str) -> String {
348
- text.replace('&', "&amp;")
349
- .replace('<', "&lt;")
350
- .replace('>', "&gt;")
351
- .replace('"', "&quot;")
352
- .replace('\'', "&#x27;")
353
- }
354
-
355
- struct PptxContainer {
356
- archive: ZipArchive<File>,
357
- slide_paths: Vec<String>,
358
- }
359
-
360
- impl PptxContainer {
361
- fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
362
- // IO errors must bubble up unchanged - file access issues need user reports ~keep
363
- let file = File::open(path)?;
364
-
365
- let mut archive = match ZipArchive::new(file) {
366
- Ok(arc) => arc,
367
- Err(zip::result::ZipError::Io(io_err)) => return Err(io_err.into()), // Bubble up IO errors ~keep
368
- Err(e) => {
369
- return Err(KreuzbergError::parsing(format!(
370
- "Failed to read PPTX archive (invalid format): {}",
371
- e
372
- )));
373
- }
374
- };
375
-
376
- let slide_paths = Self::find_slide_paths(&mut archive)?;
377
-
378
- Ok(Self { archive, slide_paths })
379
- }
380
-
381
- fn slide_paths(&self) -> &[String] {
382
- &self.slide_paths
383
- }
384
-
385
- fn read_file(&mut self, path: &str) -> Result<Vec<u8>> {
386
- match self.archive.by_name(path) {
387
- Ok(mut file) => {
388
- let mut contents = Vec::new();
389
- // IO errors must bubble up - file read issues need user reports ~keep
390
- file.read_to_end(&mut contents)?;
391
- Ok(contents)
392
- }
393
- Err(zip::result::ZipError::FileNotFound) => {
394
- Err(KreuzbergError::parsing("File not found in archive".to_string()))
395
- }
396
- Err(zip::result::ZipError::Io(io_err)) => Err(io_err.into()), // Bubble up IO errors ~keep
397
- Err(e) => Err(KreuzbergError::parsing(format!("Zip error: {}", e))),
398
- }
399
- }
400
-
401
- fn get_slide_rels_path(&self, slide_path: &str) -> String {
402
- get_slide_rels_path(slide_path)
403
- }
404
-
405
- fn find_slide_paths(archive: &mut ZipArchive<File>) -> Result<Vec<String>> {
406
- if let Ok(rels_data) = Self::read_file_from_archive(archive, "ppt/_rels/presentation.xml.rels")
407
- && let Ok(paths) = parse_presentation_rels(&rels_data)
408
- {
409
- return Ok(paths);
410
- }
411
-
412
- let mut slide_paths = Vec::new();
413
- for i in 0..archive.len() {
414
- if let Ok(file) = archive.by_index(i) {
415
- let name = file.name();
416
- if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") {
417
- slide_paths.push(name.to_string());
418
- }
419
- }
420
- }
421
-
422
- slide_paths.sort();
423
- Ok(slide_paths)
424
- }
425
-
426
- fn read_file_from_archive(archive: &mut ZipArchive<File>, path: &str) -> Result<Vec<u8>> {
427
- let mut file = match archive.by_name(path) {
428
- Ok(f) => f,
429
- Err(zip::result::ZipError::Io(io_err)) => return Err(io_err.into()), // Bubble up IO errors ~keep
430
- Err(e) => {
431
- return Err(KreuzbergError::parsing(format!(
432
- "Failed to read file from archive: {}",
433
- e
434
- )));
435
- }
436
- };
437
- let mut contents = Vec::new();
438
- // IO errors must bubble up - file read issues need user reports ~keep
439
- file.read_to_end(&mut contents)?;
440
- Ok(contents)
441
- }
442
- }
443
-
444
- impl Slide {
445
- fn from_xml(slide_number: u32, xml_data: &[u8], rels_data: Option<&[u8]>) -> Result<Self> {
446
- let elements = parse_slide_xml(xml_data)?;
447
-
448
- let images = if let Some(rels) = rels_data {
449
- parse_slide_rels(rels)?
450
- } else {
451
- Vec::new()
452
- };
453
-
454
- Ok(Self {
455
- slide_number,
456
- elements,
457
- images,
458
- })
459
- }
460
-
461
- fn to_markdown(&self, config: &ParserConfig) -> String {
462
- let mut builder = ContentBuilder::new();
463
-
464
- if config.include_slide_comment {
465
- builder.add_slide_header(self.slide_number);
466
- }
467
-
468
- let mut element_indices: Vec<usize> = (0..self.elements.len()).collect();
469
- element_indices.sort_by_key(|&i| {
470
- let pos = self.elements[i].position();
471
- (pos.y, pos.x)
472
- });
473
-
474
- for &idx in &element_indices {
475
- match &self.elements[idx] {
476
- SlideElement::Text(text, _) => {
477
- let text_content: String = text.runs.iter().map(|run| run.render_as_md()).collect();
478
-
479
- let normalized = text_content.replace('\n', " ");
480
- let is_title = normalized.len() < 100 && !normalized.trim().is_empty();
481
-
482
- if is_title {
483
- builder.add_title(normalized.trim());
484
- } else {
485
- builder.add_text(&text_content);
486
- }
487
- }
488
- SlideElement::Table(table, _) => {
489
- let table_rows: Vec<Vec<String>> = table
490
- .rows
491
- .iter()
492
- .map(|row| {
493
- row.cells
494
- .iter()
495
- .map(|cell| cell.runs.iter().map(|run| run.extract()).collect::<String>())
496
- .collect()
497
- })
498
- .collect();
499
- builder.add_table(&table_rows);
500
- }
501
- SlideElement::List(list, _) => {
502
- for item in &list.items {
503
- let item_text: String = item.runs.iter().map(|run| run.extract()).collect();
504
- builder.add_list_item(item.level, item.is_ordered, &item_text);
505
- }
506
- }
507
- SlideElement::Image(img_ref, _) => {
508
- builder.add_image(&img_ref.id, self.slide_number);
509
- }
510
- SlideElement::Unknown => {}
511
- }
512
- }
513
-
514
- builder.build().0
515
- }
516
-
517
- fn image_count(&self) -> usize {
518
- self.elements
519
- .iter()
520
- .filter(|e| matches!(e, SlideElement::Image(_, _)))
521
- .count()
522
- }
523
-
524
- fn table_count(&self) -> usize {
525
- self.elements
526
- .iter()
527
- .filter(|e| matches!(e, SlideElement::Table(_, _)))
528
- .count()
529
- }
530
- }
531
-
532
- struct SlideIterator {
533
- container: PptxContainer,
534
- current_index: usize,
535
- total_slides: usize,
536
- }
537
-
538
- impl SlideIterator {
539
- fn new(container: PptxContainer) -> Self {
540
- let total_slides = container.slide_paths().len();
541
- Self {
542
- container,
543
- current_index: 0,
544
- total_slides,
545
- }
546
- }
547
-
548
- fn slide_count(&self) -> usize {
549
- self.total_slides
550
- }
551
-
552
- fn next_slide(&mut self) -> Result<Option<Slide>> {
553
- if self.current_index >= self.total_slides {
554
- return Ok(None);
555
- }
556
-
557
- let slide_path = &self.container.slide_paths()[self.current_index].clone();
558
- let slide_number = (self.current_index + 1) as u32;
559
-
560
- let xml_data = self.container.read_file(slide_path)?;
561
-
562
- let rels_path = self.container.get_slide_rels_path(slide_path);
563
- let rels_data = self.container.read_file(&rels_path).ok();
564
-
565
- let slide = Slide::from_xml(slide_number, &xml_data, rels_data.as_deref())?;
566
-
567
- self.current_index += 1;
568
-
569
- Ok(Some(slide))
570
- }
571
-
572
- fn get_slide_images(&mut self, slide: &Slide) -> Result<HashMap<String, Vec<u8>>> {
573
- let mut image_data = HashMap::new();
574
-
575
- for img_ref in &slide.images {
576
- let slide_path = &self.container.slide_paths()[slide.slide_number as usize - 1];
577
- let full_path = get_full_image_path(slide_path, &img_ref.target);
578
-
579
- if let Ok(data) = self.container.read_file(&full_path) {
580
- image_data.insert(img_ref.id.clone(), data);
581
- }
582
- }
583
-
584
- Ok(image_data)
585
- }
586
- }
587
-
588
- use roxmltree::{Document, Node};
589
-
590
- enum ParsedContent {
591
- Text(TextElement),
592
- List(ListElement),
593
- }
594
-
595
- fn parse_slide_xml(xml_data: &[u8]) -> Result<Vec<SlideElement>> {
596
- let xml_str = utf8_validation::from_utf8(xml_data)
597
- .map_err(|_| KreuzbergError::parsing("Invalid UTF-8 in slide XML".to_string()))?;
598
-
599
- let doc =
600
- Document::parse(xml_str).map_err(|e| KreuzbergError::parsing(format!("Failed to parse slide XML: {}", e)))?;
601
-
602
- let root = doc.root_element();
603
- let ns = root.tag_name().namespace();
604
-
605
- let c_sld = root
606
- .descendants()
607
- .find(|n| n.tag_name().name() == "cSld" && n.tag_name().namespace() == ns)
608
- .ok_or_else(|| KreuzbergError::parsing("No <p:cSld> tag found".to_string()))?;
609
-
610
- let sp_tree = c_sld
611
- .children()
612
- .find(|n| n.tag_name().name() == "spTree" && n.tag_name().namespace() == ns)
613
- .ok_or_else(|| KreuzbergError::parsing("No <p:spTree> tag found".to_string()))?;
614
-
615
- let mut elements = Vec::new();
616
- for child_node in sp_tree.children().filter(|n| n.is_element()) {
617
- elements.extend(parse_group(&child_node)?);
618
- }
619
-
620
- Ok(elements)
621
- }
622
-
623
- fn parse_group(node: &Node) -> Result<Vec<SlideElement>> {
624
- let mut elements = Vec::new();
625
-
626
- let tag_name = node.tag_name().name();
627
- let namespace = node.tag_name().namespace().unwrap_or("");
628
-
629
- if namespace != P_NAMESPACE {
630
- return Ok(elements);
631
- }
632
-
633
- let position = extract_position(node);
634
-
635
- match tag_name {
636
- "sp" => {
637
- let position = extract_position(node);
638
- match parse_sp(node)? {
639
- ParsedContent::Text(text) => elements.push(SlideElement::Text(text, position)),
640
- ParsedContent::List(list) => elements.push(SlideElement::List(list, position)),
641
- }
642
- }
643
- "graphicFrame" => {
644
- if let Some(graphic_element) = parse_graphic_frame(node)? {
645
- elements.push(SlideElement::Table(graphic_element, position));
646
- }
647
- }
648
- "pic" => {
649
- let image_reference = parse_pic(node)?;
650
- elements.push(SlideElement::Image(image_reference, position));
651
- }
652
- "grpSp" => {
653
- for child in node.children().filter(|n| n.is_element()) {
654
- elements.extend(parse_group(&child)?);
655
- }
656
- }
657
- _ => elements.push(SlideElement::Unknown),
658
- }
659
-
660
- Ok(elements)
661
- }
662
-
663
- fn parse_sp(sp_node: &Node) -> Result<ParsedContent> {
664
- let tx_body_node = sp_node
665
- .children()
666
- .find(|n| n.tag_name().name() == "txBody" && n.tag_name().namespace() == Some(P_NAMESPACE))
667
- .ok_or_else(|| KreuzbergError::parsing("No txBody found".to_string()))?;
668
-
669
- let is_list = tx_body_node.descendants().any(|n| {
670
- n.is_element()
671
- && n.tag_name().name() == "pPr"
672
- && n.tag_name().namespace() == Some(A_NAMESPACE)
673
- && (n.attribute("lvl").is_some()
674
- || n.children().any(|child| {
675
- child.is_element()
676
- && (child.tag_name().name() == "buAutoNum" || child.tag_name().name() == "buChar")
677
- }))
678
- });
679
-
680
- if is_list {
681
- Ok(ParsedContent::List(parse_list(&tx_body_node)?))
682
- } else {
683
- Ok(ParsedContent::Text(parse_text(&tx_body_node)?))
684
- }
685
- }
686
-
687
- fn parse_text(tx_body_node: &Node) -> Result<TextElement> {
688
- let mut runs = Vec::new();
689
-
690
- for p_node in tx_body_node
691
- .children()
692
- .filter(|n| n.is_element() && n.tag_name().name() == "p" && n.tag_name().namespace() == Some(A_NAMESPACE))
693
- {
694
- let mut paragraph_runs = parse_paragraph(&p_node, true)?;
695
- runs.append(&mut paragraph_runs);
696
- }
697
-
698
- Ok(TextElement { runs })
699
- }
700
-
701
- fn parse_graphic_frame(node: &Node) -> Result<Option<TableElement>> {
702
- let graphic_data_node = node.descendants().find(|n| {
703
- n.is_element()
704
- && n.tag_name().name() == "graphicData"
705
- && n.tag_name().namespace() == Some(A_NAMESPACE)
706
- && n.attribute("uri") == Some("http://schemas.openxmlformats.org/drawingml/2006/table")
707
- });
708
-
709
- if let Some(graphic_data) = graphic_data_node
710
- && let Some(tbl_node) = graphic_data
711
- .children()
712
- .find(|n| n.is_element() && n.tag_name().name() == "tbl" && n.tag_name().namespace() == Some(A_NAMESPACE))
713
- {
714
- let table = parse_table(&tbl_node)?;
715
- return Ok(Some(table));
716
- }
717
-
718
- Ok(None)
719
- }
720
-
721
- fn parse_table(tbl_node: &Node) -> Result<TableElement> {
722
- let mut rows = Vec::new();
723
-
724
- for tr_node in tbl_node
725
- .children()
726
- .filter(|n| n.is_element() && n.tag_name().name() == "tr" && n.tag_name().namespace() == Some(A_NAMESPACE))
727
- {
728
- let row = parse_table_row(&tr_node)?;
729
- rows.push(row);
730
- }
731
-
732
- Ok(TableElement { rows })
733
- }
734
-
735
- fn parse_table_row(tr_node: &Node) -> Result<TableRow> {
736
- let mut cells = Vec::new();
737
-
738
- for tc_node in tr_node
739
- .children()
740
- .filter(|n| n.is_element() && n.tag_name().name() == "tc" && n.tag_name().namespace() == Some(A_NAMESPACE))
741
- {
742
- let cell = parse_table_cell(&tc_node)?;
743
- cells.push(cell);
744
- }
745
-
746
- Ok(TableRow { cells })
747
- }
748
-
749
- fn parse_table_cell(tc_node: &Node) -> Result<TableCell> {
750
- let mut runs = Vec::new();
751
-
752
- if let Some(tx_body_node) = tc_node
753
- .children()
754
- .find(|n| n.is_element() && n.tag_name().name() == "txBody" && n.tag_name().namespace() == Some(A_NAMESPACE))
755
- {
756
- for p_node in tx_body_node
757
- .children()
758
- .filter(|n| n.is_element() && n.tag_name().name() == "p" && n.tag_name().namespace() == Some(A_NAMESPACE))
759
- {
760
- let mut paragraph_runs = parse_paragraph(&p_node, false)?;
761
- runs.append(&mut paragraph_runs);
762
- }
763
- }
764
-
765
- Ok(TableCell { runs })
766
- }
767
-
768
- fn parse_pic(pic_node: &Node) -> Result<ImageReference> {
769
- let blip_node = pic_node
770
- .descendants()
771
- .find(|n| n.is_element() && n.tag_name().name() == "blip" && n.tag_name().namespace() == Some(A_NAMESPACE))
772
- .ok_or_else(|| KreuzbergError::parsing("Image blip not found".to_string()))?;
773
-
774
- let embed_attr = blip_node
775
- .attribute((RELS_NAMESPACE, "embed"))
776
- .or_else(|| blip_node.attribute("r:embed"))
777
- .ok_or_else(|| KreuzbergError::parsing("Image embed attribute not found".to_string()))?;
778
-
779
- let image_ref = ImageReference {
780
- id: embed_attr.to_string(),
781
- target: String::new(),
782
- };
783
-
784
- Ok(image_ref)
785
- }
786
-
787
- fn parse_list(tx_body_node: &Node) -> Result<ListElement> {
788
- let mut items = Vec::new();
789
-
790
- for p_node in tx_body_node
791
- .children()
792
- .filter(|n| n.is_element() && n.tag_name().name() == "p" && n.tag_name().namespace() == Some(A_NAMESPACE))
793
- {
794
- let (level, is_ordered) = parse_list_properties(&p_node)?;
795
-
796
- let runs = parse_paragraph(&p_node, true)?;
797
-
798
- items.push(ListItem {
799
- level,
800
- is_ordered,
801
- runs,
802
- });
803
- }
804
-
805
- Ok(ListElement { items })
806
- }
807
-
808
- fn parse_list_properties(p_node: &Node) -> Result<(u32, bool)> {
809
- let mut level = 1;
810
- let mut is_ordered = false;
811
-
812
- if let Some(p_pr_node) = p_node
813
- .children()
814
- .find(|n| n.is_element() && n.tag_name().name() == "pPr" && n.tag_name().namespace() == Some(A_NAMESPACE))
815
- {
816
- if let Some(lvl_attr) = p_pr_node.attribute("lvl") {
817
- level = lvl_attr.parse::<u32>().unwrap_or(0) + 1;
818
- }
819
-
820
- is_ordered = p_pr_node.children().any(|n| {
821
- n.is_element() && n.tag_name().namespace() == Some(A_NAMESPACE) && n.tag_name().name() == "buAutoNum"
822
- });
823
- }
824
-
825
- Ok((level, is_ordered))
826
- }
827
-
828
- fn parse_paragraph(p_node: &Node, add_new_line: bool) -> Result<Vec<Run>> {
829
- let run_nodes: Vec<_> = p_node
830
- .children()
831
- .filter(|n| n.is_element() && n.tag_name().name() == "r" && n.tag_name().namespace() == Some(A_NAMESPACE))
832
- .collect();
833
-
834
- let count = run_nodes.len();
835
- let mut runs: Vec<Run> = Vec::new();
836
-
837
- for (idx, r_node) in run_nodes.iter().enumerate() {
838
- let mut run = parse_run(r_node)?;
839
-
840
- if add_new_line && idx == count - 1 {
841
- run.text.push('\n');
842
- }
843
-
844
- runs.push(run);
845
- }
846
- Ok(runs)
847
- }
848
-
849
- fn parse_run(r_node: &Node) -> Result<Run> {
850
- let mut text = String::new();
851
- let mut formatting = Formatting::default();
852
-
853
- if let Some(r_pr_node) = r_node
854
- .children()
855
- .find(|n| n.is_element() && n.tag_name().name() == "rPr" && n.tag_name().namespace() == Some(A_NAMESPACE))
856
- {
857
- if let Some(b_attr) = r_pr_node.attribute("b") {
858
- formatting.bold = b_attr == "1" || b_attr.eq_ignore_ascii_case("true");
859
- }
860
- if let Some(i_attr) = r_pr_node.attribute("i") {
861
- formatting.italic = i_attr == "1" || i_attr.eq_ignore_ascii_case("true");
862
- }
863
- if let Some(u_attr) = r_pr_node.attribute("u") {
864
- formatting.underlined = u_attr != "none";
865
- }
866
- if let Some(lang_attr) = r_pr_node.attribute("lang") {
867
- formatting.lang = lang_attr.to_string();
868
- }
869
- }
870
-
871
- if let Some(t_node) = r_node
872
- .children()
873
- .find(|n| n.is_element() && n.tag_name().name() == "t" && n.tag_name().namespace() == Some(A_NAMESPACE))
874
- && let Some(t) = t_node.text()
875
- {
876
- text.push_str(t);
877
- }
878
- Ok(Run { text, formatting })
879
- }
880
-
881
- fn extract_position(node: &Node) -> ElementPosition {
882
- let default = ElementPosition::default();
883
-
884
- node.descendants()
885
- .find(|n| n.tag_name().namespace() == Some(A_NAMESPACE) && n.tag_name().name() == "xfrm")
886
- .and_then(|xfrm| {
887
- let x = xfrm
888
- .children()
889
- .find(|n| n.tag_name().name() == "off" && n.tag_name().namespace() == Some(A_NAMESPACE))
890
- .and_then(|off| off.attribute("x")?.parse::<i64>().ok())?;
891
-
892
- let y = xfrm
893
- .children()
894
- .find(|n| n.tag_name().name() == "off" && n.tag_name().namespace() == Some(A_NAMESPACE))
895
- .and_then(|off| off.attribute("y")?.parse::<i64>().ok())?;
896
-
897
- Some(ElementPosition { x, y })
898
- })
899
- .unwrap_or(default)
900
- }
901
-
902
- fn parse_slide_rels(rels_data: &[u8]) -> Result<Vec<ImageReference>> {
903
- let xml_str = utf8_validation::from_utf8(rels_data)
904
- .map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in rels XML: {}", e)))?;
905
-
906
- let doc =
907
- Document::parse(xml_str).map_err(|e| KreuzbergError::parsing(format!("Failed to parse rels XML: {}", e)))?;
908
-
909
- let mut images = Vec::new();
910
-
911
- for node in doc.descendants() {
912
- if node.has_tag_name("Relationship")
913
- && let Some(rel_type) = node.attribute("Type")
914
- && rel_type.contains("image")
915
- && let (Some(id), Some(target)) = (node.attribute("Id"), node.attribute("Target"))
916
- {
917
- images.push(ImageReference {
918
- id: id.to_string(),
919
- target: target.to_string(),
920
- });
921
- }
922
- }
923
-
924
- Ok(images)
925
- }
926
-
927
- fn parse_presentation_rels(rels_data: &[u8]) -> Result<Vec<String>> {
928
- let xml_str = utf8_validation::from_utf8(rels_data)
929
- .map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in presentation rels: {}", e)))?;
930
-
931
- let doc = Document::parse(xml_str)
932
- .map_err(|e| KreuzbergError::parsing(format!("Failed to parse presentation rels: {}", e)))?;
933
-
934
- let mut slide_paths = Vec::new();
935
-
936
- for node in doc.descendants() {
937
- if node.has_tag_name("Relationship")
938
- && let Some(rel_type) = node.attribute("Type")
939
- && rel_type.contains("slide")
940
- && !rel_type.contains("slideMaster")
941
- && let Some(target) = node.attribute("Target")
942
- {
943
- let normalized_target = target.strip_prefix('/').unwrap_or(target);
944
- let final_path = if normalized_target.starts_with("ppt/") {
945
- normalized_target.to_string()
946
- } else {
947
- format!("ppt/{}", normalized_target)
948
- };
949
- slide_paths.push(final_path);
950
- }
951
- }
952
-
953
- Ok(slide_paths)
954
- }
955
-
956
- /// Extract comprehensive metadata from PPTX using office_metadata module
957
- fn extract_metadata(archive: &mut ZipArchive<File>) -> PptxMetadata {
958
- #[cfg(feature = "office")]
959
- {
960
- let mut metadata_map = HashMap::new();
961
-
962
- if let Ok(core) = extract_core_properties(archive) {
963
- if let Some(title) = core.title {
964
- metadata_map.insert("title".to_string(), title);
965
- }
966
- if let Some(creator) = core.creator {
967
- metadata_map.insert("author".to_string(), creator.clone());
968
- metadata_map.insert("created_by".to_string(), creator);
969
- }
970
- if let Some(subject) = core.subject {
971
- metadata_map.insert("subject".to_string(), subject.clone());
972
- metadata_map.insert("summary".to_string(), subject);
973
- }
974
- if let Some(keywords) = core.keywords {
975
- metadata_map.insert("keywords".to_string(), keywords);
976
- }
977
- if let Some(description) = core.description {
978
- metadata_map.insert("description".to_string(), description);
979
- }
980
- if let Some(modified_by) = core.last_modified_by {
981
- metadata_map.insert("modified_by".to_string(), modified_by);
982
- }
983
- if let Some(created) = core.created {
984
- metadata_map.insert("created_at".to_string(), created);
985
- }
986
- if let Some(modified) = core.modified {
987
- metadata_map.insert("modified_at".to_string(), modified);
988
- }
989
- if let Some(revision) = core.revision {
990
- metadata_map.insert("revision".to_string(), revision);
991
- }
992
- if let Some(category) = core.category {
993
- metadata_map.insert("category".to_string(), category);
994
- }
995
- }
996
-
997
- if let Ok(app) = extract_pptx_app_properties(archive) {
998
- if let Some(slides) = app.slides {
999
- metadata_map.insert("slide_count".to_string(), slides.to_string());
1000
- }
1001
- if let Some(notes) = app.notes {
1002
- metadata_map.insert("notes_count".to_string(), notes.to_string());
1003
- }
1004
- if let Some(hidden_slides) = app.hidden_slides {
1005
- metadata_map.insert("hidden_slides".to_string(), hidden_slides.to_string());
1006
- }
1007
- if !app.slide_titles.is_empty() {
1008
- metadata_map.insert("slide_titles".to_string(), app.slide_titles.join(", "));
1009
- }
1010
- if let Some(presentation_format) = app.presentation_format {
1011
- metadata_map.insert("presentation_format".to_string(), presentation_format);
1012
- }
1013
- if let Some(company) = app.company {
1014
- metadata_map.insert("organization".to_string(), company);
1015
- }
1016
- if let Some(application) = app.application {
1017
- metadata_map.insert("application".to_string(), application);
1018
- }
1019
- if let Some(app_version) = app.app_version {
1020
- metadata_map.insert("application_version".to_string(), app_version);
1021
- }
1022
- }
1023
-
1024
- if let Ok(custom) = extract_custom_properties(archive) {
1025
- for (key, value) in custom {
1026
- let value_str = match value {
1027
- Value::String(s) => s,
1028
- Value::Number(n) => n.to_string(),
1029
- Value::Bool(b) => b.to_string(),
1030
- Value::Null => "null".to_string(),
1031
- Value::Array(_) | Value::Object(_) => value.to_string(),
1032
- };
1033
- metadata_map.insert(format!("custom_{}", key), value_str);
1034
- }
1035
- }
1036
-
1037
- PptxMetadata { fonts: Vec::new() }
1038
- }
1039
-
1040
- #[cfg(not(feature = "office"))]
1041
- {
1042
- PptxMetadata { fonts: Vec::new() }
1043
- }
1044
- }
1045
-
1046
- fn extract_all_notes(container: &mut PptxContainer) -> Result<HashMap<u32, String>> {
1047
- let mut notes = HashMap::new();
1048
-
1049
- let slide_paths: Vec<String> = container.slide_paths().to_vec();
1050
-
1051
- for (i, slide_path) in slide_paths.iter().enumerate() {
1052
- let notes_path = slide_path.replace("slides/slide", "notesSlides/notesSlide");
1053
- if let Ok(notes_xml) = container.read_file(&notes_path)
1054
- && let Ok(note_text) = extract_notes_text(&notes_xml)
1055
- {
1056
- notes.insert((i + 1) as u32, note_text);
1057
- }
1058
- }
1059
-
1060
- Ok(notes)
1061
- }
1062
-
1063
- fn extract_notes_text(notes_xml: &[u8]) -> Result<String> {
1064
- let xml_str = utf8_validation::from_utf8(notes_xml)
1065
- .map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in notes XML: {}", e)))?;
1066
-
1067
- let doc =
1068
- Document::parse(xml_str).map_err(|e| KreuzbergError::parsing(format!("Failed to parse notes XML: {}", e)))?;
1069
-
1070
- let mut text_parts = Vec::with_capacity(16);
1071
- const DRAWINGML_NS: &str = "http://schemas.openxmlformats.org/drawingml/2006/main";
1072
-
1073
- for node in doc.descendants() {
1074
- if node.has_tag_name((DRAWINGML_NS, "t"))
1075
- && let Some(text) = node.text()
1076
- {
1077
- text_parts.push(text);
1078
- }
1079
- }
1080
-
1081
- Ok(text_parts.join(" "))
1082
- }
1083
-
1084
- fn get_slide_rels_path(slide_path: &str) -> String {
1085
- let parts: Vec<&str> = slide_path.rsplitn(2, '/').collect();
1086
- if parts.len() == 2 {
1087
- format!("{}/_rels/{}.rels", parts[1], parts[0])
1088
- } else {
1089
- format!("_rels/{}.rels", slide_path)
1090
- }
1091
- }
1092
-
1093
- fn get_full_image_path(slide_path: &str, image_target: &str) -> String {
1094
- if image_target.starts_with("..") {
1095
- let parts: Vec<&str> = slide_path.rsplitn(3, '/').collect();
1096
- if parts.len() >= 3 {
1097
- format!("{}/{}", parts[2], &image_target[3..])
1098
- } else {
1099
- format!("ppt/{}", &image_target[3..])
1100
- }
1101
- } else {
1102
- let parts: Vec<&str> = slide_path.rsplitn(2, '/').collect();
1103
- if parts.len() == 2 {
1104
- format!("{}/{}", parts[1], image_target)
1105
- } else {
1106
- format!("ppt/slides/{}", image_target)
1107
- }
1108
- }
1109
- }
1110
-
1111
- fn detect_image_format(data: &[u8]) -> String {
1112
- if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
1113
- "jpeg".to_string()
1114
- } else if data.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
1115
- "png".to_string()
1116
- } else if data.starts_with(b"GIF") {
1117
- "gif".to_string()
1118
- } else if data.starts_with(b"BM") {
1119
- "bmp".to_string()
1120
- } else if data.starts_with(b"<svg") || data.starts_with(b"<?xml") {
1121
- "svg".to_string()
1122
- } else if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
1123
- "tiff".to_string()
1124
- } else {
1125
- "unknown".to_string()
1126
- }
1127
- }
1128
-
1129
- pub fn extract_pptx_from_path(
1130
- path: &str,
1131
- extract_images: bool,
1132
- page_config: Option<&crate::core::config::PageConfig>,
1133
- ) -> Result<PptxExtractionResult> {
1134
- let config = ParserConfig {
1135
- extract_images,
1136
- ..Default::default()
1137
- };
1138
-
1139
- let mut container = PptxContainer::open(path)?;
1140
-
1141
- let metadata = extract_metadata(&mut container.archive);
1142
-
1143
- let notes = extract_all_notes(&mut container)?;
1144
-
1145
- let mut iterator = SlideIterator::new(container);
1146
- let slide_count = iterator.slide_count();
1147
-
1148
- let estimated_capacity = slide_count.saturating_mul(1000).max(8192);
1149
- let mut content_builder = ContentBuilder::with_page_config(estimated_capacity, page_config.cloned());
1150
-
1151
- let mut total_image_count = 0;
1152
- let mut total_table_count = 0;
1153
- let mut extracted_images = Vec::new();
1154
-
1155
- while let Some(slide) = iterator.next_slide()? {
1156
- let byte_start = if page_config.is_some() {
1157
- content_builder.start_slide(slide.slide_number)
1158
- } else {
1159
- 0
1160
- };
1161
-
1162
- let slide_content = slide.to_markdown(&config);
1163
- content_builder.add_text(&slide_content);
1164
-
1165
- if let Some(slide_notes) = notes.get(&slide.slide_number) {
1166
- content_builder.add_notes(slide_notes);
1167
- }
1168
-
1169
- if page_config.is_some() {
1170
- content_builder.end_slide(slide.slide_number, byte_start, slide_content.clone());
1171
- }
1172
-
1173
- if config.extract_images
1174
- && let Ok(image_data) = iterator.get_slide_images(&slide)
1175
- {
1176
- for (_, data) in image_data {
1177
- let format = detect_image_format(&data);
1178
- let image_index = extracted_images.len();
1179
-
1180
- extracted_images.push(ExtractedImage {
1181
- data,
1182
- format,
1183
- image_index,
1184
- page_number: Some(slide.slide_number as usize),
1185
- width: None,
1186
- height: None,
1187
- colorspace: None,
1188
- bits_per_component: None,
1189
- is_mask: false,
1190
- description: None,
1191
- ocr_result: None,
1192
- });
1193
- }
1194
- }
1195
-
1196
- total_image_count += slide.image_count();
1197
- total_table_count += slide.table_count();
1198
- }
1199
-
1200
- let (content, boundaries, page_contents) = content_builder.build();
1201
-
1202
- let page_structure = boundaries.as_ref().map(|bounds| crate::types::PageStructure {
1203
- total_count: slide_count,
1204
- unit_type: crate::types::PageUnitType::Slide,
1205
- boundaries: Some(bounds.clone()),
1206
- pages: page_contents.as_ref().map(|pcs| {
1207
- pcs.iter()
1208
- .map(|pc| crate::types::PageInfo {
1209
- number: pc.page_number,
1210
- title: None,
1211
- dimensions: None,
1212
- image_count: None,
1213
- table_count: None,
1214
- hidden: None,
1215
- })
1216
- .collect()
1217
- }),
1218
- });
1219
-
1220
- Ok(PptxExtractionResult {
1221
- content,
1222
- metadata,
1223
- slide_count,
1224
- image_count: total_image_count,
1225
- table_count: total_table_count,
1226
- images: extracted_images,
1227
- page_structure,
1228
- page_contents,
1229
- })
1230
- }
1231
-
1232
- pub fn extract_pptx_from_bytes(
1233
- data: &[u8],
1234
- extract_images: bool,
1235
- page_config: Option<&crate::core::config::PageConfig>,
1236
- ) -> Result<PptxExtractionResult> {
1237
- use std::sync::atomic::{AtomicU64, Ordering};
1238
- static COUNTER: AtomicU64 = AtomicU64::new(0);
1239
- let unique_id = COUNTER.fetch_add(1, Ordering::SeqCst);
1240
- let temp_path = std::env::temp_dir().join(format!("temp_pptx_{}_{}.pptx", std::process::id(), unique_id));
1241
-
1242
- // IO errors must bubble up - temp file write issues need user reports ~keep
1243
- std::fs::write(&temp_path, data)?;
1244
-
1245
- let result = extract_pptx_from_path(
1246
- temp_path.to_str().ok_or_else(|| {
1247
- crate::KreuzbergError::validation("Invalid temp path - contains invalid UTF-8".to_string())
1248
- })?,
1249
- extract_images,
1250
- page_config,
1251
- );
1252
-
1253
- if let Err(e) = std::fs::remove_file(&temp_path) {
1254
- tracing::warn!("Failed to remove temp PPTX file: {}", e);
1255
- }
1256
-
1257
- result
1258
- }
1259
-
1260
- #[cfg(test)]
1261
- mod tests {
1262
- use super::*;
1263
-
1264
- fn create_test_pptx_bytes(slides: Vec<&str>) -> Vec<u8> {
1265
- use std::io::Write;
1266
- use zip::write::{SimpleFileOptions, ZipWriter};
1267
-
1268
- let mut buffer = Vec::new();
1269
- {
1270
- let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
1271
- let options = SimpleFileOptions::default();
1272
-
1273
- zip.start_file("[Content_Types].xml", options).unwrap();
1274
- zip.write_all(
1275
- br#"<?xml version="1.0" encoding="UTF-8"?>
1276
- <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
1277
- <Default Extension="xml" ContentType="application/xml"/>
1278
- <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
1279
- </Types>"#,
1280
- )
1281
- .unwrap();
1282
-
1283
- zip.start_file("ppt/presentation.xml", options).unwrap();
1284
- zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
1285
-
1286
- zip.start_file("_rels/.rels", options).unwrap();
1287
- zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
1288
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1289
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
1290
- </Relationships>"#).unwrap();
1291
-
1292
- let mut rels_xml = String::from(
1293
- r#"<?xml version="1.0" encoding="UTF-8"?>
1294
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">"#,
1295
- );
1296
- for (i, _) in slides.iter().enumerate() {
1297
- rels_xml.push_str(&format!(
1298
- r#"<Relationship Id="rId{}" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide{}.xml"/>"#,
1299
- i + 1,
1300
- i + 1
1301
- ));
1302
- }
1303
- rels_xml.push_str("</Relationships>");
1304
- zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
1305
- zip.write_all(rels_xml.as_bytes()).unwrap();
1306
-
1307
- for (i, text) in slides.iter().enumerate() {
1308
- let slide_xml = format!(
1309
- r#"<?xml version="1.0" encoding="UTF-8"?>
1310
- <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
1311
- xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
1312
- <p:cSld>
1313
- <p:spTree>
1314
- <p:sp>
1315
- <p:txBody>
1316
- <a:p>
1317
- <a:r>
1318
- <a:t>{}</a:t>
1319
- </a:r>
1320
- </a:p>
1321
- </p:txBody>
1322
- </p:sp>
1323
- </p:spTree>
1324
- </p:cSld>
1325
- </p:sld>"#,
1326
- text
1327
- );
1328
- zip.start_file(format!("ppt/slides/slide{}.xml", i + 1), options)
1329
- .unwrap();
1330
- zip.write_all(slide_xml.as_bytes()).unwrap();
1331
- }
1332
-
1333
- zip.start_file("docProps/core.xml", options).unwrap();
1334
- zip.write_all(
1335
- br#"<?xml version="1.0" encoding="UTF-8"?>
1336
- <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
1337
- xmlns:dc="http://purl.org/dc/elements/1.1/"
1338
- xmlns:dcterms="http://purl.org/dc/terms/">
1339
- <dc:title>Test Presentation</dc:title>
1340
- <dc:creator>Test Author</dc:creator>
1341
- <dc:description>Test Description</dc:description>
1342
- <dc:subject>Test Subject</dc:subject>
1343
- </cp:coreProperties>"#,
1344
- )
1345
- .unwrap();
1346
-
1347
- let _ = zip.finish().unwrap();
1348
- }
1349
- buffer
1350
- }
1351
-
1352
- #[test]
1353
- fn test_extract_pptx_from_bytes_single_slide() {
1354
- let pptx_bytes = create_test_pptx_bytes(vec!["Hello World"]);
1355
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
1356
-
1357
- assert_eq!(result.slide_count, 1);
1358
- assert!(
1359
- result.content.contains("Hello World"),
1360
- "Content was: {}",
1361
- result.content
1362
- );
1363
- assert_eq!(result.image_count, 0);
1364
- assert_eq!(result.table_count, 0);
1365
- }
1366
-
1367
- #[test]
1368
- fn test_extract_pptx_from_bytes_multiple_slides() {
1369
- let pptx_bytes = create_test_pptx_bytes(vec!["Slide 1", "Slide 2", "Slide 3"]);
1370
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
1371
-
1372
- assert_eq!(result.slide_count, 3);
1373
- assert!(result.content.contains("Slide 1"));
1374
- assert!(result.content.contains("Slide 2"));
1375
- assert!(result.content.contains("Slide 3"));
1376
- }
1377
-
1378
- #[test]
1379
- fn test_extract_pptx_metadata() {
1380
- let pptx_bytes = create_test_pptx_bytes(vec!["Content"]);
1381
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
1382
-
1383
- assert!(result.metadata.fonts.is_empty() || !result.metadata.fonts.is_empty());
1384
- }
1385
-
1386
- #[test]
1387
- fn test_extract_pptx_empty_slides() {
1388
- let pptx_bytes = create_test_pptx_bytes(vec!["", "", ""]);
1389
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
1390
-
1391
- assert_eq!(result.slide_count, 3);
1392
- }
1393
-
1394
- #[test]
1395
- fn test_extract_pptx_from_bytes_invalid_data() {
1396
- let invalid_bytes = b"not a valid pptx file";
1397
- let result = extract_pptx_from_bytes(invalid_bytes, false, None);
1398
-
1399
- assert!(result.is_err());
1400
- if let Err(KreuzbergError::Parsing { message: msg, .. }) = result {
1401
- assert!(msg.contains("Failed to read PPTX archive") || msg.contains("Failed to write temp PPTX file"));
1402
- } else {
1403
- panic!("Expected ParsingError");
1404
- }
1405
- }
1406
-
1407
- #[test]
1408
- fn test_extract_pptx_from_bytes_empty_data() {
1409
- let empty_bytes: &[u8] = &[];
1410
- let result = extract_pptx_from_bytes(empty_bytes, false, None);
1411
-
1412
- assert!(result.is_err());
1413
- }
1414
-
1415
- #[test]
1416
- fn test_detect_image_format_jpeg() {
1417
- let jpeg_header = vec![0xFF, 0xD8, 0xFF, 0xE0];
1418
- assert_eq!(detect_image_format(&jpeg_header), "jpeg");
1419
- }
1420
-
1421
- #[test]
1422
- fn test_detect_image_format_png() {
1423
- let png_header = vec![0x89, 0x50, 0x4E, 0x47];
1424
- assert_eq!(detect_image_format(&png_header), "png");
1425
- }
1426
-
1427
- #[test]
1428
- fn test_detect_image_format_gif() {
1429
- let gif_header = b"GIF89a";
1430
- assert_eq!(detect_image_format(gif_header), "gif");
1431
- }
1432
-
1433
- #[test]
1434
- fn test_detect_image_format_bmp() {
1435
- let bmp_header = b"BM";
1436
- assert_eq!(detect_image_format(bmp_header), "bmp");
1437
- }
1438
-
1439
- #[test]
1440
- fn test_detect_image_format_svg() {
1441
- let svg_header = b"<svg xmlns=\"http://www.w3.org/2000/svg\">";
1442
- assert_eq!(detect_image_format(svg_header), "svg");
1443
- }
1444
-
1445
- #[test]
1446
- fn test_detect_image_format_tiff_little_endian() {
1447
- let tiff_header = vec![0x49, 0x49, 0x2A, 0x00];
1448
- assert_eq!(detect_image_format(&tiff_header), "tiff");
1449
- }
1450
-
1451
- #[test]
1452
- fn test_detect_image_format_tiff_big_endian() {
1453
- let tiff_header = vec![0x4D, 0x4D, 0x00, 0x2A];
1454
- assert_eq!(detect_image_format(&tiff_header), "tiff");
1455
- }
1456
-
1457
- #[test]
1458
- fn test_detect_image_format_unknown() {
1459
- let unknown_data = b"unknown format";
1460
- assert_eq!(detect_image_format(unknown_data), "unknown");
1461
- }
1462
-
1463
- #[test]
1464
- fn test_html_escape() {
1465
- assert_eq!(html_escape("plain text"), "plain text");
1466
- assert_eq!(html_escape("a & b"), "a &amp; b");
1467
- assert_eq!(html_escape("<tag>"), "&lt;tag&gt;");
1468
- assert_eq!(html_escape("\"quoted\""), "&quot;quoted&quot;");
1469
- assert_eq!(html_escape("'apostrophe'"), "&#x27;apostrophe&#x27;");
1470
- assert_eq!(
1471
- html_escape("<a href=\"url\" title='test'>text & more</a>"),
1472
- "&lt;a href=&quot;url&quot; title=&#x27;test&#x27;&gt;text &amp; more&lt;/a&gt;"
1473
- );
1474
- }
1475
-
1476
- #[test]
1477
- fn test_get_slide_rels_path() {
1478
- assert_eq!(
1479
- get_slide_rels_path("ppt/slides/slide1.xml"),
1480
- "ppt/slides/_rels/slide1.xml.rels"
1481
- );
1482
- assert_eq!(
1483
- get_slide_rels_path("ppt/slides/slide10.xml"),
1484
- "ppt/slides/_rels/slide10.xml.rels"
1485
- );
1486
- }
1487
-
1488
- #[test]
1489
- fn test_get_full_image_path_relative() {
1490
- assert_eq!(
1491
- get_full_image_path("ppt/slides/slide1.xml", "../media/image1.png"),
1492
- "ppt/media/image1.png"
1493
- );
1494
- }
1495
-
1496
- #[test]
1497
- fn test_get_full_image_path_direct() {
1498
- assert_eq!(
1499
- get_full_image_path("ppt/slides/slide1.xml", "image1.png"),
1500
- "ppt/slides/image1.png"
1501
- );
1502
- }
1503
-
1504
- #[test]
1505
- fn test_content_builder_add_text() {
1506
- let mut builder = ContentBuilder::new();
1507
- builder.add_text("Hello");
1508
- builder.add_text(" ");
1509
- builder.add_text("World");
1510
- let (content, _, _) = builder.build();
1511
- assert_eq!(content, "HelloWorld");
1512
- }
1513
-
1514
- #[test]
1515
- fn test_content_builder_add_text_empty() {
1516
- let mut builder = ContentBuilder::new();
1517
- builder.add_text(" ");
1518
- builder.add_text("");
1519
- let (content, _, _) = builder.build();
1520
- assert_eq!(content, "");
1521
- }
1522
-
1523
- #[test]
1524
- fn test_content_builder_add_title() {
1525
- let mut builder = ContentBuilder::new();
1526
- builder.add_title("Title");
1527
- let (content, _, _) = builder.build();
1528
- assert_eq!(content, "# Title");
1529
- }
1530
-
1531
- #[test]
1532
- fn test_content_builder_add_title_with_whitespace() {
1533
- let mut builder = ContentBuilder::new();
1534
- builder.add_title(" Title ");
1535
- let (content, _, _) = builder.build();
1536
- assert_eq!(content, "# Title");
1537
- }
1538
-
1539
- #[test]
1540
- fn test_content_builder_add_table_empty() {
1541
- let mut builder = ContentBuilder::new();
1542
- builder.add_table(&[]);
1543
- let (content, _, _) = builder.build();
1544
- assert_eq!(content, "");
1545
- }
1546
-
1547
- #[test]
1548
- fn test_content_builder_add_table_single_row() {
1549
- let mut builder = ContentBuilder::new();
1550
- let rows = vec![vec!["Header1".to_string(), "Header2".to_string()]];
1551
- builder.add_table(&rows);
1552
- let result = builder.build();
1553
- assert!(result.0.contains("<table>"));
1554
- assert!(result.0.contains("<th>Header1</th>"));
1555
- assert!(result.0.contains("<th>Header2</th>"));
1556
- }
1557
-
1558
- #[test]
1559
- fn test_content_builder_add_table_multiple_rows() {
1560
- let mut builder = ContentBuilder::new();
1561
- let rows = vec![
1562
- vec!["H1".to_string(), "H2".to_string()],
1563
- vec!["D1".to_string(), "D2".to_string()],
1564
- ];
1565
- builder.add_table(&rows);
1566
- let result = builder.build();
1567
- assert!(result.0.contains("<th>H1</th>"));
1568
- assert!(result.0.contains("<td>D1</td>"));
1569
- }
1570
-
1571
- #[test]
1572
- fn test_content_builder_add_table_with_special_chars() {
1573
- let mut builder = ContentBuilder::new();
1574
- let rows = vec![vec!["<tag>".to_string(), "a & b".to_string()]];
1575
- builder.add_table(&rows);
1576
- let result = builder.build();
1577
- assert!(result.0.contains("&lt;tag&gt;"));
1578
- assert!(result.0.contains("a &amp; b"));
1579
- }
1580
-
1581
- #[test]
1582
- fn test_content_builder_add_list_item_unordered() {
1583
- let mut builder = ContentBuilder::new();
1584
- builder.add_list_item(1, false, "Item 1");
1585
- builder.add_list_item(1, false, "Item 2");
1586
- let result = builder.build();
1587
- assert!(result.0.contains("- Item 1"));
1588
- assert!(result.0.contains("- Item 2"));
1589
- }
1590
-
1591
- #[test]
1592
- fn test_content_builder_add_list_item_ordered() {
1593
- let mut builder = ContentBuilder::new();
1594
- builder.add_list_item(1, true, "First");
1595
- builder.add_list_item(1, true, "Second");
1596
- let result = builder.build();
1597
- assert!(result.0.contains("1. First"));
1598
- assert!(result.0.contains("1. Second"));
1599
- }
1600
-
1601
- #[test]
1602
- fn test_content_builder_add_list_item_nested() {
1603
- let mut builder = ContentBuilder::new();
1604
- builder.add_list_item(1, false, "Level 1");
1605
- builder.add_list_item(2, false, "Level 2");
1606
- builder.add_list_item(3, false, "Level 3");
1607
- let result = builder.build();
1608
- assert!(result.0.contains("- Level 1"));
1609
- assert!(result.0.contains(" - Level 2"));
1610
- assert!(result.0.contains(" - Level 3"));
1611
- }
1612
-
1613
- #[test]
1614
- fn test_content_builder_add_image() {
1615
- let mut builder = ContentBuilder::new();
1616
- builder.add_image("img123", 5);
1617
- let result = builder.build();
1618
- assert!(result.0.contains("![img123](slide_5_image_img123.jpg)"));
1619
- }
1620
-
1621
- #[test]
1622
- fn test_content_builder_add_notes() {
1623
- let mut builder = ContentBuilder::new();
1624
- builder.add_notes("This is a note");
1625
- let result = builder.build();
1626
- assert!(result.0.contains("### Notes:"));
1627
- assert!(result.0.contains("This is a note"));
1628
- }
1629
-
1630
- #[test]
1631
- fn test_content_builder_add_notes_empty() {
1632
- let mut builder = ContentBuilder::new();
1633
- builder.add_notes(" ");
1634
- let (content, _, _) = builder.build();
1635
- assert_eq!(content, "");
1636
- }
1637
-
1638
- #[test]
1639
- fn test_content_builder_add_slide_header() {
1640
- let mut builder = ContentBuilder::new();
1641
- builder.add_slide_header(3);
1642
- let result = builder.build();
1643
- assert!(result.0.contains("<!-- Slide number: 3 -->"));
1644
- }
1645
-
1646
- #[test]
1647
- fn test_run_extract() {
1648
- let run = Run {
1649
- text: "Hello".to_string(),
1650
- formatting: Formatting::default(),
1651
- };
1652
- assert_eq!(run.extract(), "Hello");
1653
- }
1654
-
1655
- #[test]
1656
- fn test_run_render_as_md_plain() {
1657
- let run = Run {
1658
- text: "plain".to_string(),
1659
- formatting: Formatting::default(),
1660
- };
1661
- assert_eq!(run.render_as_md(), "plain");
1662
- }
1663
-
1664
- #[test]
1665
- fn test_run_render_as_md_bold() {
1666
- let run = Run {
1667
- text: "bold".to_string(),
1668
- formatting: Formatting {
1669
- bold: true,
1670
- ..Default::default()
1671
- },
1672
- };
1673
- assert_eq!(run.render_as_md(), "**bold**");
1674
- }
1675
-
1676
- #[test]
1677
- fn test_run_render_as_md_italic() {
1678
- let run = Run {
1679
- text: "italic".to_string(),
1680
- formatting: Formatting {
1681
- italic: true,
1682
- ..Default::default()
1683
- },
1684
- };
1685
- assert_eq!(run.render_as_md(), "*italic*");
1686
- }
1687
-
1688
- #[test]
1689
- fn test_run_render_as_md_bold_italic() {
1690
- let run = Run {
1691
- text: "both".to_string(),
1692
- formatting: Formatting {
1693
- bold: true,
1694
- italic: true,
1695
- ..Default::default()
1696
- },
1697
- };
1698
- assert_eq!(run.render_as_md(), "***both***");
1699
- }
1700
-
1701
- #[test]
1702
- fn test_parse_slide_xml_simple_text() {
1703
- let xml = br#"<?xml version="1.0"?>
1704
- <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
1705
- xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
1706
- <p:cSld>
1707
- <p:spTree>
1708
- <p:sp>
1709
- <p:txBody>
1710
- <a:p>
1711
- <a:r>
1712
- <a:t>Test Text</a:t>
1713
- </a:r>
1714
- </a:p>
1715
- </p:txBody>
1716
- </p:sp>
1717
- </p:spTree>
1718
- </p:cSld>
1719
- </p:sld>"#;
1720
-
1721
- let elements = parse_slide_xml(xml).unwrap();
1722
- if !elements.is_empty() {
1723
- if let SlideElement::Text(text, _) = &elements[0] {
1724
- assert_eq!(text.runs[0].text, "Test Text\n");
1725
- } else {
1726
- panic!("Expected Text element");
1727
- }
1728
- }
1729
- }
1730
-
1731
- #[test]
1732
- fn test_parse_slide_xml_invalid_utf8() {
1733
- let invalid_utf8 = vec![0xFF, 0xFE, 0xFF];
1734
- let result = parse_slide_xml(&invalid_utf8);
1735
- assert!(result.is_err());
1736
- if let Err(KreuzbergError::Parsing { message: msg, .. }) = result {
1737
- assert!(msg.contains("Invalid UTF-8"));
1738
- }
1739
- }
1740
-
1741
- #[test]
1742
- fn test_parse_slide_xml_malformed() {
1743
- let malformed = b"<not valid xml>";
1744
- let result = parse_slide_xml(malformed);
1745
- assert!(result.is_err());
1746
- }
1747
-
1748
- #[test]
1749
- fn test_parse_slide_rels_with_images() {
1750
- let rels_xml = br#"<?xml version="1.0"?>
1751
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1752
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image1.png"/>
1753
- <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image2.jpg"/>
1754
- </Relationships>"#;
1755
-
1756
- let images = parse_slide_rels(rels_xml).unwrap();
1757
- assert_eq!(images.len(), 2);
1758
- assert_eq!(images[0].id, "rId1");
1759
- assert_eq!(images[0].target, "../media/image1.png");
1760
- assert_eq!(images[1].id, "rId2");
1761
- assert_eq!(images[1].target, "../media/image2.jpg");
1762
- }
1763
-
1764
- #[test]
1765
- fn test_parse_slide_rels_no_images() {
1766
- let rels_xml = br#"<?xml version="1.0"?>
1767
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1768
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide" Target="../notesSlides/notesSlide1.xml"/>
1769
- </Relationships>"#;
1770
-
1771
- let images = parse_slide_rels(rels_xml).unwrap();
1772
- assert_eq!(images.len(), 0);
1773
- }
1774
-
1775
- #[test]
1776
- fn test_parse_presentation_rels() {
1777
- let rels_xml = br#"<?xml version="1.0"?>
1778
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1779
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
1780
- <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide2.xml"/>
1781
- <Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slideMaster" Target="slideMasters/slideMaster1.xml"/>
1782
- </Relationships>"#;
1783
-
1784
- let slides = parse_presentation_rels(rels_xml).unwrap();
1785
- assert_eq!(slides.len(), 2);
1786
- assert_eq!(slides[0], "ppt/slides/slide1.xml");
1787
- assert_eq!(slides[1], "ppt/slides/slide2.xml");
1788
- }
1789
-
1790
- #[test]
1791
- fn test_extract_notes_text() {
1792
- let notes_xml = br#"<?xml version="1.0"?>
1793
- <p:notes xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
1794
- xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
1795
- <p:cSld>
1796
- <p:spTree>
1797
- <p:sp>
1798
- <p:txBody>
1799
- <a:p>
1800
- <a:r>
1801
- <a:t>First note</a:t>
1802
- </a:r>
1803
- </a:p>
1804
- <a:p>
1805
- <a:r>
1806
- <a:t>Second note</a:t>
1807
- </a:r>
1808
- </a:p>
1809
- </p:txBody>
1810
- </p:sp>
1811
- </p:spTree>
1812
- </p:cSld>
1813
- </p:notes>"#;
1814
-
1815
- let notes = extract_notes_text(notes_xml).unwrap();
1816
- assert!(notes.contains("First note"));
1817
- assert!(notes.contains("Second note"));
1818
- }
1819
-
1820
- #[test]
1821
- fn test_parser_config_default() {
1822
- let config = ParserConfig::default();
1823
- assert!(config.extract_images);
1824
- assert!(!config.include_slide_comment);
1825
- }
1826
-
1827
- fn create_pptx_with_table(rows: Vec<Vec<&str>>) -> Vec<u8> {
1828
- use std::io::Write;
1829
- use zip::write::{SimpleFileOptions, ZipWriter};
1830
-
1831
- let mut buffer = Vec::new();
1832
- {
1833
- let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
1834
- let options = SimpleFileOptions::default();
1835
-
1836
- zip.start_file("[Content_Types].xml", options).unwrap();
1837
- zip.write_all(
1838
- br#"<?xml version="1.0" encoding="UTF-8"?>
1839
- <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
1840
- <Default Extension="xml" ContentType="application/xml"/>
1841
- <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
1842
- </Types>"#,
1843
- )
1844
- .unwrap();
1845
-
1846
- zip.start_file("ppt/presentation.xml", options).unwrap();
1847
- zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
1848
-
1849
- zip.start_file("_rels/.rels", options).unwrap();
1850
- zip.write_all(
1851
- br#"<?xml version="1.0" encoding="UTF-8"?>
1852
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1853
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
1854
- </Relationships>"#,
1855
- )
1856
- .unwrap();
1857
-
1858
- zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
1859
- zip.write_all(
1860
- br#"<?xml version="1.0" encoding="UTF-8"?>
1861
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1862
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
1863
- </Relationships>"#,
1864
- )
1865
- .unwrap();
1866
-
1867
- let mut table_xml = String::from(
1868
- r#"<a:tbl>
1869
- <a:tblGrid>"#,
1870
- );
1871
- if !rows.is_empty() {
1872
- for _ in 0..rows[0].len() {
1873
- table_xml.push_str(r#"<a:gridCol w="2000000"/>"#);
1874
- }
1875
- }
1876
- table_xml.push_str("</a:tblGrid>");
1877
-
1878
- for row in rows {
1879
- table_xml.push_str(r#"<a:tr h="370840">"#);
1880
- for cell in row {
1881
- table_xml.push_str(&format!(
1882
- r#"<a:tc>
1883
- <a:txBody>
1884
- <a:p>
1885
- <a:r>
1886
- <a:t>{}</a:t>
1887
- </a:r>
1888
- </a:p>
1889
- </a:txBody>
1890
- </a:tc>"#,
1891
- cell
1892
- ));
1893
- }
1894
- table_xml.push_str("</a:tr>");
1895
- }
1896
- table_xml.push_str("</a:tbl>");
1897
-
1898
- let slide_xml = format!(
1899
- r#"<?xml version="1.0" encoding="UTF-8"?>
1900
- <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
1901
- xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
1902
- <p:cSld>
1903
- <p:spTree>
1904
- <p:graphicFrame>
1905
- <p:xfrm>
1906
- <a:off x="1000000" y="2000000"/>
1907
- <a:ext cx="8000000" cy="4000000"/>
1908
- </p:xfrm>
1909
- <a:graphic>
1910
- <a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/table">
1911
- {}
1912
- </a:graphicData>
1913
- </a:graphic>
1914
- </p:graphicFrame>
1915
- </p:spTree>
1916
- </p:cSld>
1917
- </p:sld>"#,
1918
- table_xml
1919
- );
1920
-
1921
- zip.start_file("ppt/slides/slide1.xml", options).unwrap();
1922
- zip.write_all(slide_xml.as_bytes()).unwrap();
1923
-
1924
- zip.start_file("docProps/core.xml", options).unwrap();
1925
- zip.write_all(
1926
- br#"<?xml version="1.0" encoding="UTF-8"?>
1927
- <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
1928
- xmlns:dc="http://purl.org/dc/elements/1.1/">
1929
- <dc:title>Test Table</dc:title>
1930
- </cp:coreProperties>"#,
1931
- )
1932
- .unwrap();
1933
-
1934
- let _ = zip.finish().unwrap();
1935
- }
1936
- buffer
1937
- }
1938
-
1939
- fn create_pptx_with_lists(list_items: Vec<(usize, bool, &str)>) -> Vec<u8> {
1940
- use std::io::Write;
1941
- use zip::write::{SimpleFileOptions, ZipWriter};
1942
-
1943
- let mut buffer = Vec::new();
1944
- {
1945
- let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
1946
- let options = SimpleFileOptions::default();
1947
-
1948
- zip.start_file("[Content_Types].xml", options).unwrap();
1949
- zip.write_all(
1950
- br#"<?xml version="1.0" encoding="UTF-8"?>
1951
- <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
1952
- <Default Extension="xml" ContentType="application/xml"/>
1953
- <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
1954
- </Types>"#,
1955
- )
1956
- .unwrap();
1957
-
1958
- zip.start_file("ppt/presentation.xml", options).unwrap();
1959
- zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
1960
-
1961
- zip.start_file("_rels/.rels", options).unwrap();
1962
- zip.write_all(
1963
- br#"<?xml version="1.0" encoding="UTF-8"?>
1964
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1965
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
1966
- </Relationships>"#,
1967
- )
1968
- .unwrap();
1969
-
1970
- zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
1971
- zip.write_all(
1972
- br#"<?xml version="1.0" encoding="UTF-8"?>
1973
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1974
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
1975
- </Relationships>"#,
1976
- )
1977
- .unwrap();
1978
-
1979
- let mut list_xml = String::new();
1980
- for (level, is_ordered, text) in list_items {
1981
- let indent = (level - 1) * 457200;
1982
- let lvl_attr = level - 1;
1983
- let bullet_section = if is_ordered {
1984
- format!(
1985
- r#"<a:pPr lvl="{}"><a:buAutoNum type="arabicPeriod"/></a:pPr>"#,
1986
- lvl_attr
1987
- )
1988
- } else {
1989
- format!(
1990
- r#"<a:pPr lvl="{}" marL="{}"><a:buFont typeface="Arial"/><a:buChar char="•"/></a:pPr>"#,
1991
- lvl_attr, indent
1992
- )
1993
- };
1994
-
1995
- list_xml.push_str(&format!(
1996
- r#"<p:sp>
1997
- <p:spPr>
1998
- <a:xfrm>
1999
- <a:off x="1000000" y="1000000"/>
2000
- <a:ext cx="6000000" cy="1000000"/>
2001
- </a:xfrm>
2002
- </p:spPr>
2003
- <p:txBody>
2004
- <a:bodyPr/>
2005
- <a:lstStyle/>
2006
- <a:p>
2007
- {}
2008
- <a:r>
2009
- <a:t>{}</a:t>
2010
- </a:r>
2011
- </a:p>
2012
- </p:txBody>
2013
- </p:sp>"#,
2014
- bullet_section, text
2015
- ));
2016
- }
2017
-
2018
- let slide_xml = format!(
2019
- r#"<?xml version="1.0" encoding="UTF-8"?>
2020
- <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
2021
- xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
2022
- <p:cSld>
2023
- <p:spTree>
2024
- {}
2025
- </p:spTree>
2026
- </p:cSld>
2027
- </p:sld>"#,
2028
- list_xml
2029
- );
2030
-
2031
- zip.start_file("ppt/slides/slide1.xml", options).unwrap();
2032
- zip.write_all(slide_xml.as_bytes()).unwrap();
2033
-
2034
- zip.start_file("docProps/core.xml", options).unwrap();
2035
- zip.write_all(
2036
- br#"<?xml version="1.0" encoding="UTF-8"?>
2037
- <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
2038
- xmlns:dc="http://purl.org/dc/elements/1.1/">
2039
- <dc:title>Test Lists</dc:title>
2040
- </cp:coreProperties>"#,
2041
- )
2042
- .unwrap();
2043
-
2044
- let _ = zip.finish().unwrap();
2045
- }
2046
- buffer
2047
- }
2048
-
2049
- fn create_pptx_with_images() -> Vec<u8> {
2050
- use std::io::Write;
2051
- use zip::write::{SimpleFileOptions, ZipWriter};
2052
-
2053
- let mut buffer = Vec::new();
2054
- {
2055
- let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
2056
- let options = SimpleFileOptions::default();
2057
-
2058
- zip.start_file("[Content_Types].xml", options).unwrap();
2059
- zip.write_all(
2060
- br#"<?xml version="1.0" encoding="UTF-8"?>
2061
- <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
2062
- <Default Extension="xml" ContentType="application/xml"/>
2063
- <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
2064
- <Default Extension="png" ContentType="image/png"/>
2065
- <Default Extension="jpeg" ContentType="image/jpeg"/>
2066
- </Types>"#,
2067
- )
2068
- .unwrap();
2069
-
2070
- zip.start_file("ppt/presentation.xml", options).unwrap();
2071
- zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
2072
-
2073
- zip.start_file("_rels/.rels", options).unwrap();
2074
- zip.write_all(
2075
- br#"<?xml version="1.0" encoding="UTF-8"?>
2076
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
2077
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
2078
- </Relationships>"#,
2079
- )
2080
- .unwrap();
2081
-
2082
- zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
2083
- zip.write_all(
2084
- br#"<?xml version="1.0" encoding="UTF-8"?>
2085
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
2086
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
2087
- </Relationships>"#,
2088
- )
2089
- .unwrap();
2090
-
2091
- zip.start_file("ppt/slides/_rels/slide1.xml.rels", options).unwrap();
2092
- zip.write_all(
2093
- br#"<?xml version="1.0" encoding="UTF-8"?>
2094
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
2095
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image1.png"/>
2096
- <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image2.jpeg"/>
2097
- </Relationships>"#,
2098
- )
2099
- .unwrap();
2100
-
2101
- let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
2102
- <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
2103
- xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
2104
- xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
2105
- <p:cSld>
2106
- <p:spTree>
2107
- <p:pic>
2108
- <p:nvPicPr>
2109
- <p:cNvPr id="1" name="Image1"/>
2110
- </p:nvPicPr>
2111
- <p:blipFill>
2112
- <a:blip r:embed="rId1"/>
2113
- </p:blipFill>
2114
- <p:spPr>
2115
- <a:xfrm>
2116
- <a:off x="1000000" y="1000000"/>
2117
- <a:ext cx="2000000" cy="2000000"/>
2118
- </a:xfrm>
2119
- </p:spPr>
2120
- </p:pic>
2121
- <p:pic>
2122
- <p:nvPicPr>
2123
- <p:cNvPr id="2" name="Image2"/>
2124
- </p:nvPicPr>
2125
- <p:blipFill>
2126
- <a:blip r:embed="rId2"/>
2127
- </p:blipFill>
2128
- <p:spPr>
2129
- <a:xfrm>
2130
- <a:off x="4000000" y="1000000"/>
2131
- <a:ext cx="2000000" cy="2000000"/>
2132
- </a:xfrm>
2133
- </p:spPr>
2134
- </p:pic>
2135
- </p:spTree>
2136
- </p:cSld>
2137
- </p:sld>"#;
2138
-
2139
- zip.start_file("ppt/slides/slide1.xml", options).unwrap();
2140
- zip.write_all(slide_xml.as_bytes()).unwrap();
2141
-
2142
- let png_bytes: Vec<u8> = vec![
2143
- 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, 0x00,
2144
- 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, 0x00, 0x00, 0x00, 0x90, 0x77, 0x53, 0xDE, 0x00,
2145
- 0x00, 0x00, 0x00, 0x49, 0x45, 0x4E, 0x44, 0xAE, 0x42, 0x60, 0x82,
2146
- ];
2147
- zip.start_file("ppt/media/image1.png", options).unwrap();
2148
- zip.write_all(&png_bytes).unwrap();
2149
-
2150
- let jpeg_bytes: Vec<u8> = vec![
2151
- 0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x00,
2152
- 0x01, 0x00, 0x00, 0xFF, 0xD9,
2153
- ];
2154
- zip.start_file("ppt/media/image2.jpeg", options).unwrap();
2155
- zip.write_all(&jpeg_bytes).unwrap();
2156
-
2157
- zip.start_file("docProps/core.xml", options).unwrap();
2158
- zip.write_all(
2159
- br#"<?xml version="1.0" encoding="UTF-8"?>
2160
- <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
2161
- xmlns:dc="http://purl.org/dc/elements/1.1/">
2162
- <dc:title>Test Images</dc:title>
2163
- </cp:coreProperties>"#,
2164
- )
2165
- .unwrap();
2166
-
2167
- let _ = zip.finish().unwrap();
2168
- }
2169
- buffer
2170
- }
2171
-
2172
- fn create_pptx_with_formatting() -> Vec<u8> {
2173
- use std::io::Write;
2174
- use zip::write::{SimpleFileOptions, ZipWriter};
2175
-
2176
- let mut buffer = Vec::new();
2177
- {
2178
- let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
2179
- let options = SimpleFileOptions::default();
2180
-
2181
- zip.start_file("[Content_Types].xml", options).unwrap();
2182
- zip.write_all(
2183
- br#"<?xml version="1.0" encoding="UTF-8"?>
2184
- <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
2185
- <Default Extension="xml" ContentType="application/xml"/>
2186
- <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
2187
- </Types>"#,
2188
- )
2189
- .unwrap();
2190
-
2191
- zip.start_file("ppt/presentation.xml", options).unwrap();
2192
- zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
2193
-
2194
- zip.start_file("_rels/.rels", options).unwrap();
2195
- zip.write_all(
2196
- br#"<?xml version="1.0" encoding="UTF-8"?>
2197
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
2198
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
2199
- </Relationships>"#,
2200
- )
2201
- .unwrap();
2202
-
2203
- zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
2204
- zip.write_all(
2205
- br#"<?xml version="1.0" encoding="UTF-8"?>
2206
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
2207
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
2208
- </Relationships>"#,
2209
- )
2210
- .unwrap();
2211
-
2212
- let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
2213
- <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
2214
- xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
2215
- <p:cSld>
2216
- <p:spTree>
2217
- <p:sp>
2218
- <p:spPr>
2219
- <a:xfrm>
2220
- <a:off x="1000000" y="1000000"/>
2221
- <a:ext cx="6000000" cy="1000000"/>
2222
- </a:xfrm>
2223
- </p:spPr>
2224
- <p:txBody>
2225
- <a:p>
2226
- <a:r>
2227
- <a:rPr b="1"/>
2228
- <a:t>Bold text</a:t>
2229
- </a:r>
2230
- </a:p>
2231
- </p:txBody>
2232
- </p:sp>
2233
- <p:sp>
2234
- <p:spPr>
2235
- <a:xfrm>
2236
- <a:off x="1000000" y="2000000"/>
2237
- <a:ext cx="6000000" cy="1000000"/>
2238
- </a:xfrm>
2239
- </p:spPr>
2240
- <p:txBody>
2241
- <a:p>
2242
- <a:r>
2243
- <a:rPr i="1"/>
2244
- <a:t>Italic text</a:t>
2245
- </a:r>
2246
- </a:p>
2247
- </p:txBody>
2248
- </p:sp>
2249
- <p:sp>
2250
- <p:spPr>
2251
- <a:xfrm>
2252
- <a:off x="1000000" y="3000000"/>
2253
- <a:ext cx="6000000" cy="1000000"/>
2254
- </a:xfrm>
2255
- </p:spPr>
2256
- <p:txBody>
2257
- <a:p>
2258
- <a:r>
2259
- <a:rPr u="sng"/>
2260
- <a:t>Underline text</a:t>
2261
- </a:r>
2262
- </a:p>
2263
- </p:txBody>
2264
- </p:sp>
2265
- <p:sp>
2266
- <p:spPr>
2267
- <a:xfrm>
2268
- <a:off x="1000000" y="4000000"/>
2269
- <a:ext cx="6000000" cy="1000000"/>
2270
- </a:xfrm>
2271
- </p:spPr>
2272
- <p:txBody>
2273
- <a:p>
2274
- <a:r>
2275
- <a:rPr b="1" i="1"/>
2276
- <a:t>Bold italic text</a:t>
2277
- </a:r>
2278
- </a:p>
2279
- </p:txBody>
2280
- </p:sp>
2281
- </p:spTree>
2282
- </p:cSld>
2283
- </p:sld>"#;
2284
-
2285
- zip.start_file("ppt/slides/slide1.xml", options).unwrap();
2286
- zip.write_all(slide_xml.as_bytes()).unwrap();
2287
-
2288
- zip.start_file("docProps/core.xml", options).unwrap();
2289
- zip.write_all(
2290
- br#"<?xml version="1.0" encoding="UTF-8"?>
2291
- <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
2292
- xmlns:dc="http://purl.org/dc/elements/1.1/">
2293
- <dc:title>Test Formatting</dc:title>
2294
- </cp:coreProperties>"#,
2295
- )
2296
- .unwrap();
2297
-
2298
- let _ = zip.finish().unwrap();
2299
- }
2300
- buffer
2301
- }
2302
-
2303
- #[test]
2304
- fn test_table_extraction_with_headers_succeeds() {
2305
- let pptx_bytes = create_pptx_with_table(vec![
2306
- vec!["Header 1", "Header 2", "Header 3"],
2307
- vec!["Data 1", "Data 2", "Data 3"],
2308
- vec!["Row 2 Col 1", "Row 2 Col 2", "Row 2 Col 3"],
2309
- ]);
2310
-
2311
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2312
-
2313
- assert_eq!(result.table_count, 1, "Should detect one table");
2314
- assert!(result.content.contains("<table>"), "Should contain table tag");
2315
- assert!(
2316
- result.content.contains("<th>Header 1</th>"),
2317
- "Should render first header"
2318
- );
2319
- assert!(
2320
- result.content.contains("<th>Header 2</th>"),
2321
- "Should render second header"
2322
- );
2323
- assert!(
2324
- result.content.contains("<th>Header 3</th>"),
2325
- "Should render third header"
2326
- );
2327
- assert!(result.content.contains("<td>Data 1</td>"), "Should render data cell");
2328
- assert!(
2329
- result.content.contains("<td>Row 2 Col 2</td>"),
2330
- "Should render second row data"
2331
- );
2332
- }
2333
-
2334
- #[test]
2335
- fn test_table_extraction_multirow_multicolumn_succeeds() {
2336
- let pptx_bytes = create_pptx_with_table(vec![
2337
- vec!["A1", "B1", "C1", "D1"],
2338
- vec!["A2", "B2", "C2", "D2"],
2339
- vec!["A3", "B3", "C3", "D3"],
2340
- vec!["A4", "B4", "C4", "D4"],
2341
- ]);
2342
-
2343
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2344
-
2345
- assert_eq!(result.table_count, 1, "Should detect one table");
2346
- assert!(result.content.contains("<tr>"), "Should contain table rows");
2347
- assert!(result.content.contains("A1"), "Should contain first row data");
2348
- assert!(result.content.contains("D4"), "Should contain last row data");
2349
-
2350
- let tr_count = result.content.matches("<tr>").count();
2351
- assert_eq!(tr_count, 4, "Should have 4 table rows");
2352
- }
2353
-
2354
- #[test]
2355
- fn test_table_counting_via_slide_metadata_succeeds() {
2356
- let pptx_bytes = create_pptx_with_table(vec![vec!["Col1", "Col2"], vec!["Val1", "Val2"]]);
2357
-
2358
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2359
-
2360
- assert_eq!(result.table_count, 1, "table_count should be 1");
2361
- }
2362
-
2363
- #[test]
2364
- fn test_table_markdown_rendering_with_special_chars() {
2365
- let pptx_bytes = create_pptx_with_table(vec![
2366
- vec!["Header with ampersand", "Header 2"],
2367
- vec!["Cell data 1", "Cell data 2"],
2368
- ]);
2369
-
2370
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2371
-
2372
- assert!(result.content.contains("<table>"), "Should contain table tag");
2373
- assert!(
2374
- result.content.contains("<th>Header with ampersand</th>"),
2375
- "Should contain header text"
2376
- );
2377
- assert!(
2378
- result.content.contains("<td>Cell data 1</td>"),
2379
- "Should contain cell data"
2380
- );
2381
- }
2382
-
2383
- #[test]
2384
- fn test_table_extraction_empty_table_returns_one_count() {
2385
- let pptx_bytes = create_pptx_with_table(vec![]);
2386
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2387
-
2388
- assert_eq!(result.table_count, 1, "Empty table structure should be detected");
2389
- assert!(!result.content.contains("<td>"), "Empty table should have no cells");
2390
- }
2391
-
2392
- #[test]
2393
- fn test_list_extraction_ordered_list_succeeds() {
2394
- let pptx_bytes = create_pptx_with_lists(vec![
2395
- (1, true, "First item"),
2396
- (1, true, "Second item"),
2397
- (1, true, "Third item"),
2398
- ]);
2399
-
2400
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2401
-
2402
- assert!(
2403
- result.content.contains("1. First item"),
2404
- "Should contain ordered list item 1"
2405
- );
2406
- assert!(
2407
- result.content.contains("1. Second item"),
2408
- "Should contain ordered list item 2"
2409
- );
2410
- assert!(
2411
- result.content.contains("1. Third item"),
2412
- "Should contain ordered list item 3"
2413
- );
2414
- }
2415
-
2416
- #[test]
2417
- fn test_list_extraction_unordered_list_succeeds() {
2418
- let pptx_bytes = create_pptx_with_lists(vec![
2419
- (1, false, "Bullet one"),
2420
- (1, false, "Bullet two"),
2421
- (1, false, "Bullet three"),
2422
- ]);
2423
-
2424
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2425
-
2426
- assert!(result.content.contains("- Bullet one"), "Should contain bullet point 1");
2427
- assert!(result.content.contains("- Bullet two"), "Should contain bullet point 2");
2428
- assert!(
2429
- result.content.contains("- Bullet three"),
2430
- "Should contain bullet point 3"
2431
- );
2432
- }
2433
-
2434
- #[test]
2435
- fn test_list_extraction_nested_lists_with_indentation_succeeds() {
2436
- let pptx_bytes = create_pptx_with_lists(vec![
2437
- (1, false, "Level 1 Item"),
2438
- (2, false, "Level 2 Item"),
2439
- (3, false, "Level 3 Item"),
2440
- (2, false, "Back to Level 2"),
2441
- (1, false, "Back to Level 1"),
2442
- ]);
2443
-
2444
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2445
-
2446
- assert!(
2447
- result.content.contains("- Level 1 Item"),
2448
- "Should have level 1 with no indent"
2449
- );
2450
- assert!(
2451
- result.content.contains(" - Level 2 Item"),
2452
- "Should have level 2 with 2-space indent"
2453
- );
2454
- assert!(
2455
- result.content.contains(" - Level 3 Item"),
2456
- "Should have level 3 with 4-space indent"
2457
- );
2458
- assert!(
2459
- result.content.contains(" - Back to Level 2"),
2460
- "Should return to level 2 indent"
2461
- );
2462
- assert!(result.content.contains("- Back to Level 1"), "Should return to level 1");
2463
- }
2464
-
2465
- #[test]
2466
- fn test_list_extraction_mixed_ordered_unordered_succeeds() {
2467
- let pptx_bytes = create_pptx_with_lists(vec![
2468
- (1, true, "Ordered item 1"),
2469
- (1, false, "Unordered item 1"),
2470
- (1, true, "Ordered item 2"),
2471
- ]);
2472
-
2473
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2474
-
2475
- assert!(
2476
- result.content.contains("1. Ordered item 1"),
2477
- "Should render ordered list"
2478
- );
2479
- assert!(
2480
- result.content.contains("- Unordered item 1"),
2481
- "Should render unordered list"
2482
- );
2483
- assert!(
2484
- result.content.contains("1. Ordered item 2"),
2485
- "Should render ordered list again"
2486
- );
2487
- }
2488
-
2489
- #[test]
2490
- fn test_image_extraction_from_slide_xml_succeeds() {
2491
- let pptx_bytes = create_pptx_with_images();
2492
- let result = extract_pptx_from_bytes(&pptx_bytes, true, None).unwrap();
2493
-
2494
- assert_eq!(result.image_count, 2, "Should detect 2 images");
2495
- assert!(!result.images.is_empty(), "Should extract image data");
2496
- }
2497
-
2498
- #[test]
2499
- fn test_image_data_loading_from_zip_archive_succeeds() {
2500
- let pptx_bytes = create_pptx_with_images();
2501
- let result = extract_pptx_from_bytes(&pptx_bytes, true, None).unwrap();
2502
-
2503
- assert_eq!(result.images.len(), 2, "Should load 2 images");
2504
-
2505
- for (i, img) in result.images.iter().enumerate() {
2506
- assert!(!img.data.is_empty(), "Image {} should have non-empty data", i);
2507
- }
2508
- }
2509
-
2510
- #[test]
2511
- fn test_image_format_detection_succeeds() {
2512
- let pptx_bytes = create_pptx_with_images();
2513
- let result = extract_pptx_from_bytes(&pptx_bytes, true, None).unwrap();
2514
-
2515
- assert_eq!(result.images.len(), 2, "Should have 2 images");
2516
-
2517
- let formats: Vec<&str> = result.images.iter().map(|img| img.format.as_str()).collect();
2518
-
2519
- assert!(formats.contains(&"png"), "Should detect PNG format");
2520
- assert!(formats.contains(&"jpeg"), "Should detect JPEG format");
2521
- }
2522
-
2523
- #[test]
2524
- fn test_image_counting_via_result_metadata_succeeds() {
2525
- let pptx_bytes = create_pptx_with_images();
2526
- let result = extract_pptx_from_bytes(&pptx_bytes, true, None).unwrap();
2527
-
2528
- assert_eq!(result.image_count, 2, "image_count should match actual images");
2529
- assert_eq!(result.images.len(), 2, "images vector should have 2 elements");
2530
- }
2531
-
2532
- #[test]
2533
- fn test_image_extraction_disabled_returns_zero_images() {
2534
- let pptx_bytes = create_pptx_with_images();
2535
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2536
-
2537
- assert_eq!(
2538
- result.image_count, 2,
2539
- "Should still count images even when not extracted"
2540
- );
2541
- assert_eq!(result.images.len(), 0, "Should not extract image data when disabled");
2542
- }
2543
-
2544
- #[test]
2545
- fn test_multiple_images_per_slide_extraction_succeeds() {
2546
- let pptx_bytes = create_pptx_with_images();
2547
- let result = extract_pptx_from_bytes(&pptx_bytes, true, None).unwrap();
2548
-
2549
- assert_eq!(result.slide_count, 1, "Should have 1 slide");
2550
- assert_eq!(result.image_count, 2, "Single slide should contain 2 images");
2551
-
2552
- let indices: Vec<usize> = result.images.iter().map(|img| img.image_index).collect();
2553
- assert_eq!(indices.len(), 2, "Should have 2 images with indices");
2554
- assert_eq!(indices, vec![0, 1], "Should have sequential image indices");
2555
- }
2556
-
2557
- #[test]
2558
- fn test_formatting_bold_text_renders_as_markdown_bold() {
2559
- let pptx_bytes = create_pptx_with_formatting();
2560
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2561
-
2562
- assert!(
2563
- result.content.contains("**Bold text"),
2564
- "Should render bold text with ** markers"
2565
- );
2566
- }
2567
-
2568
- #[test]
2569
- fn test_formatting_italic_text_renders_as_markdown_italic() {
2570
- let pptx_bytes = create_pptx_with_formatting();
2571
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2572
-
2573
- assert!(
2574
- result.content.contains("*Italic text"),
2575
- "Should render italic text with * markers"
2576
- );
2577
- }
2578
-
2579
- #[test]
2580
- fn test_formatting_underline_text_renders_as_html_underline() {
2581
- let pptx_bytes = create_pptx_with_formatting();
2582
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2583
-
2584
- assert!(
2585
- result.content.contains("<u>Underline text"),
2586
- "Should render underline with HTML tags"
2587
- );
2588
- }
2589
-
2590
- #[test]
2591
- fn test_formatting_combined_bold_italic_renders_correctly() {
2592
- let pptx_bytes = create_pptx_with_formatting();
2593
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
2594
-
2595
- assert!(
2596
- result.content.contains("***Bold italic text"),
2597
- "Should render bold+italic with *** markers"
2598
- );
2599
- }
2600
-
2601
- #[test]
2602
- fn test_run_render_underline_formatting() {
2603
- let run = Run {
2604
- text: "underlined".to_string(),
2605
- formatting: Formatting {
2606
- underlined: true,
2607
- ..Default::default()
2608
- },
2609
- };
2610
- assert_eq!(
2611
- run.render_as_md(),
2612
- "<u>underlined</u>",
2613
- "Should wrap underlined text in <u> tags"
2614
- );
2615
- }
2616
-
2617
- #[test]
2618
- fn test_run_render_all_formatting_combined() {
2619
- let run = Run {
2620
- text: "all formats".to_string(),
2621
- formatting: Formatting {
2622
- bold: true,
2623
- italic: true,
2624
- underlined: true,
2625
- ..Default::default()
2626
- },
2627
- };
2628
- let rendered = run.render_as_md();
2629
- assert!(rendered.contains("***"), "Should have bold+italic markers");
2630
- assert!(rendered.contains("<u>"), "Should have underline tags");
2631
- assert!(rendered.contains("all formats"), "Should contain original text");
2632
- }
2633
-
2634
- #[test]
2635
- fn test_integration_complete_pptx_with_mixed_content_succeeds() {
2636
- use std::io::Write;
2637
- use zip::write::{SimpleFileOptions, ZipWriter};
2638
-
2639
- let mut buffer = Vec::new();
2640
- {
2641
- let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
2642
- let options = SimpleFileOptions::default();
2643
-
2644
- zip.start_file("[Content_Types].xml", options).unwrap();
2645
- zip.write_all(
2646
- br#"<?xml version="1.0" encoding="UTF-8"?>
2647
- <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
2648
- <Default Extension="xml" ContentType="application/xml"/>
2649
- <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
2650
- <Default Extension="png" ContentType="image/png"/>
2651
- </Types>"#,
2652
- )
2653
- .unwrap();
2654
-
2655
- zip.start_file("ppt/presentation.xml", options).unwrap();
2656
- zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
2657
-
2658
- zip.start_file("_rels/.rels", options).unwrap();
2659
- zip.write_all(
2660
- br#"<?xml version="1.0" encoding="UTF-8"?>
2661
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
2662
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
2663
- </Relationships>"#,
2664
- )
2665
- .unwrap();
2666
-
2667
- zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
2668
- zip.write_all(
2669
- br#"<?xml version="1.0" encoding="UTF-8"?>
2670
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
2671
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
2672
- </Relationships>"#,
2673
- )
2674
- .unwrap();
2675
-
2676
- zip.start_file("ppt/slides/_rels/slide1.xml.rels", options).unwrap();
2677
- zip.write_all(
2678
- br#"<?xml version="1.0" encoding="UTF-8"?>
2679
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
2680
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image1.png"/>
2681
- </Relationships>"#,
2682
- )
2683
- .unwrap();
2684
-
2685
- let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
2686
- <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
2687
- xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
2688
- xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
2689
- <p:cSld>
2690
- <p:spTree>
2691
- <p:sp>
2692
- <p:txBody>
2693
- <a:p>
2694
- <a:r>
2695
- <a:rPr b="1"/>
2696
- <a:t>Title with Bold</a:t>
2697
- </a:r>
2698
- </a:p>
2699
- </p:txBody>
2700
- <p:spPr>
2701
- <a:xfrm>
2702
- <a:off x="1000000" y="500000"/>
2703
- </a:xfrm>
2704
- </p:spPr>
2705
- </p:sp>
2706
- <p:sp>
2707
- <p:txBody>
2708
- <a:p>
2709
- <a:pPr lvl="0"><a:buChar char="•"/></a:pPr>
2710
- <a:r>
2711
- <a:t>List item one</a:t>
2712
- </a:r>
2713
- </a:p>
2714
- </p:txBody>
2715
- <p:spPr>
2716
- <a:xfrm>
2717
- <a:off x="1000000" y="1500000"/>
2718
- </a:xfrm>
2719
- </p:spPr>
2720
- </p:sp>
2721
- <p:graphicFrame>
2722
- <p:xfrm>
2723
- <a:off x="1000000" y="2500000"/>
2724
- <a:ext cx="4000000" cy="2000000"/>
2725
- </p:xfrm>
2726
- <a:graphic>
2727
- <a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/table">
2728
- <a:tbl>
2729
- <a:tblGrid>
2730
- <a:gridCol w="2000000"/>
2731
- <a:gridCol w="2000000"/>
2732
- </a:tblGrid>
2733
- <a:tr h="370840">
2734
- <a:tc>
2735
- <a:txBody>
2736
- <a:p>
2737
- <a:r>
2738
- <a:t>Header A</a:t>
2739
- </a:r>
2740
- </a:p>
2741
- </a:txBody>
2742
- </a:tc>
2743
- <a:tc>
2744
- <a:txBody>
2745
- <a:p>
2746
- <a:r>
2747
- <a:t>Header B</a:t>
2748
- </a:r>
2749
- </a:p>
2750
- </a:txBody>
2751
- </a:tc>
2752
- </a:tr>
2753
- <a:tr h="370840">
2754
- <a:tc>
2755
- <a:txBody>
2756
- <a:p>
2757
- <a:r>
2758
- <a:t>Data 1</a:t>
2759
- </a:r>
2760
- </a:p>
2761
- </a:txBody>
2762
- </a:tc>
2763
- <a:tc>
2764
- <a:txBody>
2765
- <a:p>
2766
- <a:r>
2767
- <a:t>Data 2</a:t>
2768
- </a:r>
2769
- </a:p>
2770
- </a:txBody>
2771
- </a:tc>
2772
- </a:tr>
2773
- </a:tbl>
2774
- </a:graphicData>
2775
- </a:graphic>
2776
- </p:graphicFrame>
2777
- <p:pic>
2778
- <p:nvPicPr>
2779
- <p:cNvPr id="1" name="TestImage"/>
2780
- </p:nvPicPr>
2781
- <p:blipFill>
2782
- <a:blip r:embed="rId1"/>
2783
- </p:blipFill>
2784
- <p:spPr>
2785
- <a:xfrm>
2786
- <a:off x="6000000" y="1000000"/>
2787
- <a:ext cx="2000000" cy="2000000"/>
2788
- </a:xfrm>
2789
- </p:spPr>
2790
- </p:pic>
2791
- </p:spTree>
2792
- </p:cSld>
2793
- </p:sld>"#;
2794
-
2795
- zip.start_file("ppt/slides/slide1.xml", options).unwrap();
2796
- zip.write_all(slide_xml.as_bytes()).unwrap();
2797
-
2798
- let png_bytes: Vec<u8> = vec![
2799
- 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, 0x00,
2800
- 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, 0x00, 0x00, 0x00, 0x90, 0x77, 0x53, 0xDE, 0x00,
2801
- 0x00, 0x00, 0x00, 0x49, 0x45, 0x4E, 0x44, 0xAE, 0x42, 0x60, 0x82,
2802
- ];
2803
- zip.start_file("ppt/media/image1.png", options).unwrap();
2804
- zip.write_all(&png_bytes).unwrap();
2805
-
2806
- zip.start_file("docProps/core.xml", options).unwrap();
2807
- zip.write_all(
2808
- br#"<?xml version="1.0" encoding="UTF-8"?>
2809
- <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
2810
- xmlns:dc="http://purl.org/dc/elements/1.1/">
2811
- <dc:title>Integration Test</dc:title>
2812
- </cp:coreProperties>"#,
2813
- )
2814
- .unwrap();
2815
-
2816
- let _ = zip.finish().unwrap();
2817
- }
2818
-
2819
- let result = extract_pptx_from_bytes(&buffer, true, None).unwrap();
2820
-
2821
- assert!(
2822
- result.content.contains("**Title with Bold"),
2823
- "Should contain formatted title"
2824
- );
2825
- assert!(result.content.contains("- List item one"), "Should contain list item");
2826
- assert!(result.content.contains("<table>"), "Should contain table");
2827
- assert!(result.content.contains("Header A"), "Should contain table header");
2828
- assert!(result.content.contains("Data 1"), "Should contain table data");
2829
-
2830
- assert_eq!(result.slide_count, 1, "Should have 1 slide");
2831
- assert_eq!(result.table_count, 1, "Should detect 1 table");
2832
- assert_eq!(result.image_count, 1, "Should detect 1 image");
2833
- assert_eq!(result.images.len(), 1, "Should extract 1 image");
2834
- }
2835
-
2836
- #[test]
2837
- fn test_integration_position_based_sorting_orders_elements_correctly() {
2838
- use std::io::Write;
2839
- use zip::write::{SimpleFileOptions, ZipWriter};
2840
-
2841
- let mut buffer = Vec::new();
2842
- {
2843
- let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
2844
- let options = SimpleFileOptions::default();
2845
-
2846
- zip.start_file("[Content_Types].xml", options).unwrap();
2847
- zip.write_all(
2848
- br#"<?xml version="1.0" encoding="UTF-8"?>
2849
- <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
2850
- <Default Extension="xml" ContentType="application/xml"/>
2851
- <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
2852
- </Types>"#,
2853
- )
2854
- .unwrap();
2855
-
2856
- zip.start_file("ppt/presentation.xml", options).unwrap();
2857
- zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
2858
-
2859
- zip.start_file("_rels/.rels", options).unwrap();
2860
- zip.write_all(
2861
- br#"<?xml version="1.0" encoding="UTF-8"?>
2862
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
2863
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
2864
- </Relationships>"#,
2865
- )
2866
- .unwrap();
2867
-
2868
- zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
2869
- zip.write_all(
2870
- br#"<?xml version="1.0" encoding="UTF-8"?>
2871
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
2872
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
2873
- </Relationships>"#,
2874
- )
2875
- .unwrap();
2876
-
2877
- let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
2878
- <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
2879
- xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
2880
- <p:cSld>
2881
- <p:spTree>
2882
- <p:sp>
2883
- <p:txBody>
2884
- <a:p>
2885
- <a:r>
2886
- <a:t>Bottom Right</a:t>
2887
- </a:r>
2888
- </a:p>
2889
- </p:txBody>
2890
- <p:spPr>
2891
- <a:xfrm>
2892
- <a:off x="5000000" y="3000000"/>
2893
- </a:xfrm>
2894
- </p:spPr>
2895
- </p:sp>
2896
- <p:sp>
2897
- <p:txBody>
2898
- <a:p>
2899
- <a:r>
2900
- <a:t>Top Left</a:t>
2901
- </a:r>
2902
- </a:p>
2903
- </p:txBody>
2904
- <p:spPr>
2905
- <a:xfrm>
2906
- <a:off x="1000000" y="1000000"/>
2907
- </a:xfrm>
2908
- </p:spPr>
2909
- </p:sp>
2910
- <p:sp>
2911
- <p:txBody>
2912
- <a:p>
2913
- <a:r>
2914
- <a:t>Top Right</a:t>
2915
- </a:r>
2916
- </a:p>
2917
- </p:txBody>
2918
- <p:spPr>
2919
- <a:xfrm>
2920
- <a:off x="5000000" y="1000000"/>
2921
- </a:xfrm>
2922
- </p:spPr>
2923
- </p:sp>
2924
- <p:sp>
2925
- <p:txBody>
2926
- <a:p>
2927
- <a:r>
2928
- <a:t>Bottom Left</a:t>
2929
- </a:r>
2930
- </a:p>
2931
- </p:txBody>
2932
- <p:spPr>
2933
- <a:xfrm>
2934
- <a:off x="1000000" y="3000000"/>
2935
- </a:xfrm>
2936
- </p:spPr>
2937
- </p:sp>
2938
- </p:spTree>
2939
- </p:cSld>
2940
- </p:sld>"#;
2941
-
2942
- zip.start_file("ppt/slides/slide1.xml", options).unwrap();
2943
- zip.write_all(slide_xml.as_bytes()).unwrap();
2944
-
2945
- zip.start_file("docProps/core.xml", options).unwrap();
2946
- zip.write_all(
2947
- br#"<?xml version="1.0" encoding="UTF-8"?>
2948
- <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
2949
- xmlns:dc="http://purl.org/dc/elements/1.1/">
2950
- <dc:title>Position Test</dc:title>
2951
- </cp:coreProperties>"#,
2952
- )
2953
- .unwrap();
2954
-
2955
- let _ = zip.finish().unwrap();
2956
- }
2957
-
2958
- let result = extract_pptx_from_bytes(&buffer, false, None).unwrap();
2959
-
2960
- let content = result.content;
2961
- let top_left_pos = content.find("Top Left").unwrap();
2962
- let top_right_pos = content.find("Top Right").unwrap();
2963
- let bottom_left_pos = content.find("Bottom Left").unwrap();
2964
- let bottom_right_pos = content.find("Bottom Right").unwrap();
2965
-
2966
- assert!(
2967
- top_left_pos < top_right_pos,
2968
- "Top Left should appear before Top Right (same Y, lower X)"
2969
- );
2970
- assert!(
2971
- top_right_pos < bottom_left_pos,
2972
- "Top row should appear before bottom row"
2973
- );
2974
- assert!(
2975
- bottom_left_pos < bottom_right_pos,
2976
- "Bottom Left should appear before Bottom Right (same Y, lower X)"
2977
- );
2978
- }
2979
-
2980
- #[test]
2981
- fn test_integration_slide_notes_extraction_succeeds() {
2982
- use std::io::Write;
2983
- use zip::write::{SimpleFileOptions, ZipWriter};
2984
-
2985
- let mut buffer = Vec::new();
2986
- {
2987
- let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
2988
- let options = SimpleFileOptions::default();
2989
-
2990
- zip.start_file("[Content_Types].xml", options).unwrap();
2991
- zip.write_all(
2992
- br#"<?xml version="1.0" encoding="UTF-8"?>
2993
- <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
2994
- <Default Extension="xml" ContentType="application/xml"/>
2995
- <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
2996
- </Types>"#,
2997
- )
2998
- .unwrap();
2999
-
3000
- zip.start_file("ppt/presentation.xml", options).unwrap();
3001
- zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
3002
-
3003
- zip.start_file("_rels/.rels", options).unwrap();
3004
- zip.write_all(
3005
- br#"<?xml version="1.0" encoding="UTF-8"?>
3006
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
3007
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
3008
- </Relationships>"#,
3009
- )
3010
- .unwrap();
3011
-
3012
- zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
3013
- zip.write_all(
3014
- br#"<?xml version="1.0" encoding="UTF-8"?>
3015
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
3016
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
3017
- </Relationships>"#,
3018
- )
3019
- .unwrap();
3020
-
3021
- zip.start_file("ppt/slides/_rels/slide1.xml.rels", options).unwrap();
3022
- zip.write_all(
3023
- br#"<?xml version="1.0" encoding="UTF-8"?>
3024
- <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
3025
- <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide" Target="../notesSlides/notesSlide1.xml"/>
3026
- </Relationships>"#,
3027
- )
3028
- .unwrap();
3029
-
3030
- let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
3031
- <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
3032
- xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
3033
- <p:cSld>
3034
- <p:spTree>
3035
- <p:sp>
3036
- <p:txBody>
3037
- <a:p>
3038
- <a:r>
3039
- <a:t>Slide Content</a:t>
3040
- </a:r>
3041
- </a:p>
3042
- </p:txBody>
3043
- </p:sp>
3044
- </p:spTree>
3045
- </p:cSld>
3046
- </p:sld>"#;
3047
-
3048
- zip.start_file("ppt/slides/slide1.xml", options).unwrap();
3049
- zip.write_all(slide_xml.as_bytes()).unwrap();
3050
-
3051
- let notes_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
3052
- <p:notes xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
3053
- xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
3054
- <p:cSld>
3055
- <p:spTree>
3056
- <p:sp>
3057
- <p:txBody>
3058
- <a:p>
3059
- <a:r>
3060
- <a:t>This is a speaker note for testing</a:t>
3061
- </a:r>
3062
- </a:p>
3063
- </p:txBody>
3064
- </p:sp>
3065
- </p:spTree>
3066
- </p:cSld>
3067
- </p:notes>"#;
3068
-
3069
- zip.start_file("ppt/notesSlides/notesSlide1.xml", options).unwrap();
3070
- zip.write_all(notes_xml.as_bytes()).unwrap();
3071
-
3072
- zip.start_file("docProps/core.xml", options).unwrap();
3073
- zip.write_all(
3074
- br#"<?xml version="1.0" encoding="UTF-8"?>
3075
- <cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
3076
- xmlns:dc="http://purl.org/dc/elements/1.1/">
3077
- <dc:title>Notes Test</dc:title>
3078
- </cp:coreProperties>"#,
3079
- )
3080
- .unwrap();
3081
-
3082
- let _ = zip.finish().unwrap();
3083
- }
3084
-
3085
- let result = extract_pptx_from_bytes(&buffer, false, None).unwrap();
3086
-
3087
- assert!(result.content.contains("Slide Content"), "Should contain slide content");
3088
- assert!(result.content.contains("### Notes:"), "Should contain notes header");
3089
- assert!(
3090
- result.content.contains("This is a speaker note for testing"),
3091
- "Should extract speaker notes"
3092
- );
3093
- }
3094
-
3095
- #[test]
3096
- fn test_integration_metadata_extraction_complete() {
3097
- let pptx_bytes = create_test_pptx_bytes(vec!["Content"]);
3098
- let result = extract_pptx_from_bytes(&pptx_bytes, false, None).unwrap();
3099
-
3100
- let _ = &result.metadata.fonts;
3101
- }
3102
- }