kreuzberg 4.0.0.pre.rc.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +538 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +157 -0
- data/README.md +426 -0
- data/Rakefile +25 -0
- data/Steepfile +47 -0
- data/examples/async_patterns.rb +341 -0
- data/ext/kreuzberg_rb/extconf.rb +45 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +6535 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +44 -0
- data/ext/kreuzberg_rb/native/README.md +425 -0
- data/ext/kreuzberg_rb/native/build.rs +15 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
- data/ext/kreuzberg_rb/native/include/strings.h +20 -0
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +2998 -0
- data/extconf.rb +28 -0
- data/kreuzberg.gemspec +148 -0
- data/lib/kreuzberg/api_proxy.rb +142 -0
- data/lib/kreuzberg/cache_api.rb +46 -0
- data/lib/kreuzberg/cli.rb +55 -0
- data/lib/kreuzberg/cli_proxy.rb +127 -0
- data/lib/kreuzberg/config.rb +691 -0
- data/lib/kreuzberg/error_context.rb +32 -0
- data/lib/kreuzberg/errors.rb +118 -0
- data/lib/kreuzberg/extraction_api.rb +85 -0
- data/lib/kreuzberg/mcp_proxy.rb +186 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
- data/lib/kreuzberg/post_processor_protocol.rb +86 -0
- data/lib/kreuzberg/result.rb +216 -0
- data/lib/kreuzberg/setup_lib_path.rb +80 -0
- data/lib/kreuzberg/validator_protocol.rb +89 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +103 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +520 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_spec.rb +345 -0
- data/spec/binding/config_validation_spec.rb +283 -0
- data/spec/binding/error_handling_spec.rb +213 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +274 -0
- data/spec/fixtures/config.toml +39 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +4 -0
- data/spec/smoke/package_spec.rb +178 -0
- data/spec/spec_helper.rb +42 -0
- data/vendor/kreuzberg/Cargo.toml +204 -0
- data/vendor/kreuzberg/README.md +175 -0
- data/vendor/kreuzberg/benches/otel_overhead.rs +48 -0
- data/vendor/kreuzberg/build.rs +474 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -0
- data/vendor/kreuzberg/src/api/handlers.rs +199 -0
- data/vendor/kreuzberg/src/api/mod.rs +79 -0
- data/vendor/kreuzberg/src/api/server.rs +353 -0
- data/vendor/kreuzberg/src/api/types.rs +170 -0
- data/vendor/kreuzberg/src/cache/mod.rs +1167 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +95 -0
- data/vendor/kreuzberg/src/core/config.rs +1032 -0
- data/vendor/kreuzberg/src/core/extractor.rs +1024 -0
- data/vendor/kreuzberg/src/core/io.rs +329 -0
- data/vendor/kreuzberg/src/core/mime.rs +605 -0
- data/vendor/kreuzberg/src/core/mod.rs +45 -0
- data/vendor/kreuzberg/src/core/pipeline.rs +984 -0
- data/vendor/kreuzberg/src/embeddings.rs +432 -0
- data/vendor/kreuzberg/src/error.rs +431 -0
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
- data/vendor/kreuzberg/src/extraction/email.rs +854 -0
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
- data/vendor/kreuzberg/src/extraction/html.rs +553 -0
- data/vendor/kreuzberg/src/extraction/image.rs +368 -0
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +563 -0
- data/vendor/kreuzberg/src/extraction/markdown.rs +213 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +81 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +130 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +287 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
- data/vendor/kreuzberg/src/extraction/table.rs +328 -0
- data/vendor/kreuzberg/src/extraction/text.rs +269 -0
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +446 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +469 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +502 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +367 -0
- data/vendor/kreuzberg/src/extractors/email.rs +143 -0
- data/vendor/kreuzberg/src/extractors/epub.rs +707 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +343 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +491 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +738 -0
- data/vendor/kreuzberg/src/extractors/html.rs +393 -0
- data/vendor/kreuzberg/src/extractors/image.rs +198 -0
- data/vendor/kreuzberg/src/extractors/jats.rs +1051 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +367 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +652 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +700 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +365 -0
- data/vendor/kreuzberg/src/extractors/odt.rs +628 -0
- data/vendor/kreuzberg/src/extractors/opml.rs +634 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +528 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +493 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +248 -0
- data/vendor/kreuzberg/src/extractors/rst.rs +576 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +810 -0
- data/vendor/kreuzberg/src/extractors/security.rs +484 -0
- data/vendor/kreuzberg/src/extractors/security_tests.rs +367 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +140 -0
- data/vendor/kreuzberg/src/extractors/text.rs +260 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +650 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +135 -0
- data/vendor/kreuzberg/src/image/dpi.rs +164 -0
- data/vendor/kreuzberg/src/image/mod.rs +6 -0
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
- data/vendor/kreuzberg/src/image/resize.rs +89 -0
- data/vendor/kreuzberg/src/keywords/config.rs +154 -0
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
- data/vendor/kreuzberg/src/keywords/rake.rs +293 -0
- data/vendor/kreuzberg/src/keywords/types.rs +68 -0
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
- data/vendor/kreuzberg/src/lib.rs +105 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
- data/vendor/kreuzberg/src/mcp/server.rs +1968 -0
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
- data/vendor/kreuzberg/src/ocr/error.rs +37 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
- data/vendor/kreuzberg/src/ocr/processor.rs +863 -0
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
- data/vendor/kreuzberg/src/ocr/types.rs +393 -0
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
- data/vendor/kreuzberg/src/panic_context.rs +154 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
- data/vendor/kreuzberg/src/pdf/table.rs +393 -0
- data/vendor/kreuzberg/src/pdf/text.rs +158 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +1013 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +620 -0
- data/vendor/kreuzberg/src/plugins/processor.rs +642 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +1337 -0
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +956 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
- data/vendor/kreuzberg/src/text/mod.rs +19 -0
- data/vendor/kreuzberg/src/text/quality.rs +697 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
- data/vendor/kreuzberg/src/types.rs +903 -0
- data/vendor/kreuzberg/src/utils/mod.rs +17 -0
- data/vendor/kreuzberg/src/utils/quality.rs +959 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +52 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -0
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
- data/vendor/kreuzberg/tests/batch_orchestration.rs +556 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +316 -0
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +421 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +525 -0
- data/vendor/kreuzberg/tests/config_features.rs +598 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +415 -0
- data/vendor/kreuzberg/tests/core_integration.rs +510 -0
- data/vendor/kreuzberg/tests/csv_integration.rs +414 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +498 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +122 -0
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +370 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +275 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -0
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +228 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +551 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -0
- data/vendor/kreuzberg/tests/instrumentation_test.rs +139 -0
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +639 -0
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +704 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +496 -0
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +490 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +695 -0
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +616 -0
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +822 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1411 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +560 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +586 -0
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +692 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +776 -0
- data/vendor/kreuzberg/tests/security_validation.rs +415 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1259 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +647 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
- data/vendor/rb-sys/.cargo-ok +1 -0
- data/vendor/rb-sys/.cargo_vcs_info.json +6 -0
- data/vendor/rb-sys/Cargo.lock +393 -0
- data/vendor/rb-sys/Cargo.toml +70 -0
- data/vendor/rb-sys/Cargo.toml.orig +57 -0
- data/vendor/rb-sys/LICENSE-APACHE +190 -0
- data/vendor/rb-sys/LICENSE-MIT +21 -0
- data/vendor/rb-sys/bin/release.sh +21 -0
- data/vendor/rb-sys/build/features.rs +108 -0
- data/vendor/rb-sys/build/main.rs +246 -0
- data/vendor/rb-sys/build/stable_api_config.rs +153 -0
- data/vendor/rb-sys/build/version.rs +48 -0
- data/vendor/rb-sys/readme.md +36 -0
- data/vendor/rb-sys/src/bindings.rs +21 -0
- data/vendor/rb-sys/src/hidden.rs +11 -0
- data/vendor/rb-sys/src/lib.rs +34 -0
- data/vendor/rb-sys/src/macros.rs +371 -0
- data/vendor/rb-sys/src/memory.rs +53 -0
- data/vendor/rb-sys/src/ruby_abi_version.rs +38 -0
- data/vendor/rb-sys/src/special_consts.rs +31 -0
- data/vendor/rb-sys/src/stable_api/compiled.c +179 -0
- data/vendor/rb-sys/src/stable_api/compiled.rs +257 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +316 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +324 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +317 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +315 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +326 -0
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +327 -0
- data/vendor/rb-sys/src/stable_api.rs +261 -0
- data/vendor/rb-sys/src/symbol.rs +31 -0
- data/vendor/rb-sys/src/tracking_allocator.rs +332 -0
- data/vendor/rb-sys/src/utils.rs +89 -0
- data/vendor/rb-sys/src/value_type.rs +7 -0
- metadata +536 -0
|
@@ -0,0 +1,3000 @@
|
|
|
1
|
+
//! PowerPoint presentation extraction functions.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides PowerPoint (PPTX) file parsing by directly reading the Office Open XML
|
|
4
|
+
//! format. It extracts text content, slide structure, images, and presentation metadata.
|
|
5
|
+
//!
|
|
6
|
+
//! # Attribution
|
|
7
|
+
//!
|
|
8
|
+
//! This code is based on the [pptx-to-md](https://github.com/nilskruthoff/pptx-parser) library
|
|
9
|
+
//! by Nils Kruthoff, licensed under MIT OR Apache-2.0. The original code has been vendored and
|
|
10
|
+
//! adapted to integrate with Kreuzberg's architecture. See ATTRIBUTIONS.md for full license text.
|
|
11
|
+
//!
|
|
12
|
+
//! # Features
|
|
13
|
+
//!
|
|
14
|
+
//! - **Slide extraction**: Reads all slides from presentation
|
|
15
|
+
//! - **Text formatting**: Preserves bold, italic, underline formatting as Markdown
|
|
16
|
+
//! - **Image extraction**: Optionally extracts embedded images with metadata
|
|
17
|
+
//! - **Office metadata**: Extracts core properties, custom properties (when `office` feature enabled)
|
|
18
|
+
//! - **Structure preservation**: Maintains heading hierarchy and list structure
|
|
19
|
+
//!
|
|
20
|
+
//! # Supported Formats
|
|
21
|
+
//!
|
|
22
|
+
//! - `.pptx` - PowerPoint Presentation
|
|
23
|
+
//! - `.pptm` - PowerPoint Macro-Enabled Presentation
|
|
24
|
+
//! - `.ppsx` - PowerPoint Slide Show
|
|
25
|
+
//!
|
|
26
|
+
//! # Example
|
|
27
|
+
//!
|
|
28
|
+
//! ```rust
|
|
29
|
+
//! use kreuzberg::extraction::pptx::extract_pptx_from_path;
|
|
30
|
+
//!
|
|
31
|
+
//! # fn example() -> kreuzberg::Result<()> {
|
|
32
|
+
//! let result = extract_pptx_from_path("presentation.pptx", true)?;
|
|
33
|
+
//!
|
|
34
|
+
//! println!("Slide count: {}", result.slide_count);
|
|
35
|
+
//! println!("Image count: {}", result.image_count);
|
|
36
|
+
//! println!("Content:\n{}", result.content);
|
|
37
|
+
//! # Ok(())
|
|
38
|
+
//! # }
|
|
39
|
+
//! ```
|
|
40
|
+
use crate::error::{KreuzbergError, Result};
|
|
41
|
+
use crate::types::{ExtractedImage, PptxExtractionResult, PptxMetadata};
|
|
42
|
+
use std::collections::HashMap;
|
|
43
|
+
use std::fs::File;
|
|
44
|
+
use std::io::Read;
|
|
45
|
+
use std::path::Path;
|
|
46
|
+
use zip::ZipArchive;
|
|
47
|
+
|
|
48
|
+
#[cfg(feature = "office")]
|
|
49
|
+
use crate::extraction::office_metadata::{
|
|
50
|
+
extract_core_properties, extract_custom_properties, extract_pptx_app_properties,
|
|
51
|
+
};
|
|
52
|
+
#[cfg(feature = "office")]
|
|
53
|
+
use serde_json::Value;
|
|
54
|
+
|
|
55
|
+
const P_NAMESPACE: &str = "http://schemas.openxmlformats.org/presentationml/2006/main";
|
|
56
|
+
const A_NAMESPACE: &str = "http://schemas.openxmlformats.org/drawingml/2006/main";
|
|
57
|
+
const RELS_NAMESPACE: &str = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
|
|
58
|
+
|
|
59
|
+
/// Coordinates attached to a slide element.
///
/// `Slide::to_markdown` sorts elements by `(y, x)` to decide output order.
/// Note that the derived `Ord` compares fields in declaration order, i.e. `x`
/// first and then `y`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord)]
struct ElementPosition {
    /// Horizontal coordinate (unit not visible here; presumably EMUs — confirm against the parser).
    x: i64,
    /// Vertical coordinate (same unit as `x`).
    y: i64,
}
|
|
64
|
+
|
|
65
|
+
/// Character-level formatting flags carried by a [`Run`].
///
/// The default value has every flag cleared and an empty `lang`.
#[derive(Debug, Clone, Default)]
struct Formatting {
    /// Rendered as `**…**` by `Run::render_as_md`.
    bold: bool,
    /// Rendered as `*…*` by `Run::render_as_md`.
    italic: bool,
    /// Rendered as `<u>…</u>` by `Run::render_as_md`.
    underlined: bool,
    /// Language tag of the run (presumably the DrawingML `lang` attribute — confirm against the parser).
    lang: String,
}
|
|
72
|
+
|
|
73
|
+
#[derive(Debug, Clone)]
|
|
74
|
+
struct Run {
|
|
75
|
+
text: String,
|
|
76
|
+
formatting: Formatting,
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
impl Run {
|
|
80
|
+
fn extract(&self) -> String {
|
|
81
|
+
self.text.clone()
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
fn render_as_md(&self) -> String {
|
|
85
|
+
let mut result = self.text.clone();
|
|
86
|
+
|
|
87
|
+
if self.formatting.bold {
|
|
88
|
+
result = format!("**{}**", result);
|
|
89
|
+
}
|
|
90
|
+
if self.formatting.italic {
|
|
91
|
+
result = format!("*{}*", result);
|
|
92
|
+
}
|
|
93
|
+
if self.formatting.underlined {
|
|
94
|
+
result = format!("<u>{}</u>", result);
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
result
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
#[derive(Debug, Clone)]
|
|
102
|
+
struct TextElement {
|
|
103
|
+
runs: Vec<Run>,
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
#[derive(Debug, Clone)]
|
|
107
|
+
struct ListItem {
|
|
108
|
+
level: u32,
|
|
109
|
+
is_ordered: bool,
|
|
110
|
+
runs: Vec<Run>,
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
#[derive(Debug, Clone)]
|
|
114
|
+
struct ListElement {
|
|
115
|
+
items: Vec<ListItem>,
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
#[derive(Debug, Clone)]
|
|
119
|
+
struct TableCell {
|
|
120
|
+
runs: Vec<Run>,
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
#[derive(Debug, Clone)]
|
|
124
|
+
struct TableRow {
|
|
125
|
+
cells: Vec<TableCell>,
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
#[derive(Debug, Clone)]
|
|
129
|
+
struct TableElement {
|
|
130
|
+
rows: Vec<TableRow>,
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
/// Reference to an image used by a slide.
#[derive(Debug, Clone)]
struct ImageReference {
    /// Identifier of the image; used in the generated Markdown image link
    /// (presumably the relationship id, e.g. `rId2` — confirm against the parser).
    id: String,
    /// Target of the reference (presumably the media path from the slide's `.rels` file — confirm).
    target: String,
}
|
|
138
|
+
|
|
139
|
+
#[derive(Debug, Clone)]
|
|
140
|
+
enum SlideElement {
|
|
141
|
+
Text(TextElement, ElementPosition),
|
|
142
|
+
Table(TableElement, ElementPosition),
|
|
143
|
+
Image(ImageReference, ElementPosition),
|
|
144
|
+
List(ListElement, ElementPosition),
|
|
145
|
+
Unknown,
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
impl SlideElement {
|
|
149
|
+
fn position(&self) -> ElementPosition {
|
|
150
|
+
match self {
|
|
151
|
+
SlideElement::Text(_, pos)
|
|
152
|
+
| SlideElement::Table(_, pos)
|
|
153
|
+
| SlideElement::Image(_, pos)
|
|
154
|
+
| SlideElement::List(_, pos) => *pos,
|
|
155
|
+
SlideElement::Unknown => ElementPosition::default(),
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
#[derive(Debug)]
|
|
161
|
+
struct Slide {
|
|
162
|
+
slide_number: u32,
|
|
163
|
+
elements: Vec<SlideElement>,
|
|
164
|
+
images: Vec<ImageReference>,
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
/// Options controlling how a presentation is parsed and rendered.
#[derive(Debug, Clone)]
struct ParserConfig {
    /// Whether embedded images should be extracted (consumer not visible in this part of the file).
    extract_images: bool,
    /// Whether `Slide::to_markdown` prefixes each slide with a
    /// `<!-- Slide number: N -->` comment.
    include_slide_comment: bool,
}
|
|
172
|
+
|
|
173
|
+
impl Default for ParserConfig {
|
|
174
|
+
fn default() -> Self {
|
|
175
|
+
Self {
|
|
176
|
+
extract_images: true,
|
|
177
|
+
include_slide_comment: false,
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
/// Accumulates the Markdown/HTML output for a slide in a single string buffer.
struct ContentBuilder {
    /// Output gathered so far; surrounding whitespace is trimmed by `build`.
    content: String,
}
|
|
185
|
+
|
|
186
|
+
impl ContentBuilder {
|
|
187
|
+
fn new() -> Self {
|
|
188
|
+
Self {
|
|
189
|
+
content: String::with_capacity(8192),
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
fn with_capacity(capacity: usize) -> Self {
|
|
194
|
+
Self {
|
|
195
|
+
content: String::with_capacity(capacity),
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
fn add_slide_header(&mut self, slide_number: u32) {
|
|
200
|
+
self.content.reserve(50);
|
|
201
|
+
self.content.push_str("\n\n<!-- Slide number: ");
|
|
202
|
+
self.content.push_str(&slide_number.to_string());
|
|
203
|
+
self.content.push_str(" -->\n");
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
fn add_text(&mut self, text: &str) {
|
|
207
|
+
if !text.trim().is_empty() {
|
|
208
|
+
self.content.push_str(text);
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
fn add_title(&mut self, title: &str) {
|
|
213
|
+
if !title.trim().is_empty() {
|
|
214
|
+
self.content.push_str("# ");
|
|
215
|
+
self.content.push_str(title.trim());
|
|
216
|
+
self.content.push('\n');
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
fn add_table(&mut self, rows: &[Vec<String>]) {
|
|
221
|
+
if rows.is_empty() {
|
|
222
|
+
return;
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
self.content.push_str("\n<table>");
|
|
226
|
+
for (i, row) in rows.iter().enumerate() {
|
|
227
|
+
self.content.push_str("<tr>");
|
|
228
|
+
let tag = if i == 0 { "th" } else { "td" };
|
|
229
|
+
|
|
230
|
+
for cell in row {
|
|
231
|
+
self.content.push('<');
|
|
232
|
+
self.content.push_str(tag);
|
|
233
|
+
self.content.push('>');
|
|
234
|
+
self.content.push_str(&html_escape(cell));
|
|
235
|
+
self.content.push_str("</");
|
|
236
|
+
self.content.push_str(tag);
|
|
237
|
+
self.content.push('>');
|
|
238
|
+
}
|
|
239
|
+
self.content.push_str("</tr>");
|
|
240
|
+
}
|
|
241
|
+
self.content.push_str("</table>\n");
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
fn add_list_item(&mut self, level: u32, is_ordered: bool, text: &str) {
|
|
245
|
+
let indent_count = level.saturating_sub(1) as usize;
|
|
246
|
+
for _ in 0..indent_count {
|
|
247
|
+
self.content.push_str(" ");
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
let marker = if is_ordered { "1." } else { "-" };
|
|
251
|
+
self.content.push_str(marker);
|
|
252
|
+
self.content.push(' ');
|
|
253
|
+
self.content.push_str(text.trim());
|
|
254
|
+
self.content.push('\n');
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
fn add_image(&mut self, image_id: &str, slide_number: u32) {
|
|
258
|
+
let filename = format!("slide_{}_image_{}.jpg", slide_number, image_id);
|
|
259
|
+
self.content.push_str(";
|
|
262
|
+
self.content.push_str(&filename);
|
|
263
|
+
self.content.push_str(")\n");
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
fn add_notes(&mut self, notes: &str) {
|
|
267
|
+
if !notes.trim().is_empty() {
|
|
268
|
+
self.content.push_str("\n\n### Notes:\n");
|
|
269
|
+
self.content.push_str(notes);
|
|
270
|
+
self.content.push('\n');
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
fn build(self) -> String {
|
|
275
|
+
self.content.trim().to_string()
|
|
276
|
+
}
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
/// Escapes the characters that are significant in HTML text and attribute
/// values so that `text` can be embedded in generated markup.
///
/// Replacements: `&` → `&amp;`, `<` → `&lt;`, `>` → `&gt;`, `"` → `&quot;`,
/// `'` → `&#x27;`. All other characters are copied unchanged.
///
/// The input is processed in a single pass, so an ampersand that is already
/// part of an entity in the input is escaped again (`&amp;` becomes
/// `&amp;amp;`); the function treats its input as plain text, not markup.
fn html_escape(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());

    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#x27;"),
            other => escaped.push(other),
        }
    }

    escaped
}
|
|
286
|
+
|
|
287
|
+
struct PptxContainer {
|
|
288
|
+
archive: ZipArchive<File>,
|
|
289
|
+
slide_paths: Vec<String>,
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
impl PptxContainer {
|
|
293
|
+
fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
|
|
294
|
+
// IO errors must bubble up unchanged - file access issues need user reports ~keep
|
|
295
|
+
let file = File::open(path)?;
|
|
296
|
+
|
|
297
|
+
let mut archive = match ZipArchive::new(file) {
|
|
298
|
+
Ok(arc) => arc,
|
|
299
|
+
Err(zip::result::ZipError::Io(io_err)) => return Err(io_err.into()), // Bubble up IO errors ~keep
|
|
300
|
+
Err(e) => {
|
|
301
|
+
return Err(KreuzbergError::parsing(format!(
|
|
302
|
+
"Failed to read PPTX archive (invalid format): {}",
|
|
303
|
+
e
|
|
304
|
+
)));
|
|
305
|
+
}
|
|
306
|
+
};
|
|
307
|
+
|
|
308
|
+
let slide_paths = Self::find_slide_paths(&mut archive)?;
|
|
309
|
+
|
|
310
|
+
Ok(Self { archive, slide_paths })
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
fn slide_paths(&self) -> &[String] {
|
|
314
|
+
&self.slide_paths
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
fn read_file(&mut self, path: &str) -> Result<Vec<u8>> {
|
|
318
|
+
match self.archive.by_name(path) {
|
|
319
|
+
Ok(mut file) => {
|
|
320
|
+
let mut contents = Vec::new();
|
|
321
|
+
// IO errors must bubble up - file read issues need user reports ~keep
|
|
322
|
+
file.read_to_end(&mut contents)?;
|
|
323
|
+
Ok(contents)
|
|
324
|
+
}
|
|
325
|
+
Err(zip::result::ZipError::FileNotFound) => {
|
|
326
|
+
Err(KreuzbergError::parsing("File not found in archive".to_string()))
|
|
327
|
+
}
|
|
328
|
+
Err(zip::result::ZipError::Io(io_err)) => Err(io_err.into()), // Bubble up IO errors ~keep
|
|
329
|
+
Err(e) => Err(KreuzbergError::parsing(format!("Zip error: {}", e))),
|
|
330
|
+
}
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
fn get_slide_rels_path(&self, slide_path: &str) -> String {
|
|
334
|
+
get_slide_rels_path(slide_path)
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
fn find_slide_paths(archive: &mut ZipArchive<File>) -> Result<Vec<String>> {
|
|
338
|
+
if let Ok(rels_data) = Self::read_file_from_archive(archive, "ppt/_rels/presentation.xml.rels")
|
|
339
|
+
&& let Ok(paths) = parse_presentation_rels(&rels_data)
|
|
340
|
+
{
|
|
341
|
+
return Ok(paths);
|
|
342
|
+
}
|
|
343
|
+
|
|
344
|
+
let mut slide_paths = Vec::new();
|
|
345
|
+
for i in 0..archive.len() {
|
|
346
|
+
if let Ok(file) = archive.by_index(i) {
|
|
347
|
+
let name = file.name();
|
|
348
|
+
if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") {
|
|
349
|
+
slide_paths.push(name.to_string());
|
|
350
|
+
}
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
slide_paths.sort();
|
|
355
|
+
Ok(slide_paths)
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
fn read_file_from_archive(archive: &mut ZipArchive<File>, path: &str) -> Result<Vec<u8>> {
|
|
359
|
+
let mut file = match archive.by_name(path) {
|
|
360
|
+
Ok(f) => f,
|
|
361
|
+
Err(zip::result::ZipError::Io(io_err)) => return Err(io_err.into()), // Bubble up IO errors ~keep
|
|
362
|
+
Err(e) => {
|
|
363
|
+
return Err(KreuzbergError::parsing(format!(
|
|
364
|
+
"Failed to read file from archive: {}",
|
|
365
|
+
e
|
|
366
|
+
)));
|
|
367
|
+
}
|
|
368
|
+
};
|
|
369
|
+
let mut contents = Vec::new();
|
|
370
|
+
// IO errors must bubble up - file read issues need user reports ~keep
|
|
371
|
+
file.read_to_end(&mut contents)?;
|
|
372
|
+
Ok(contents)
|
|
373
|
+
}
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
impl Slide {
|
|
377
|
+
fn from_xml(slide_number: u32, xml_data: &[u8], rels_data: Option<&[u8]>) -> Result<Self> {
|
|
378
|
+
let elements = parse_slide_xml(xml_data)?;
|
|
379
|
+
|
|
380
|
+
let images = if let Some(rels) = rels_data {
|
|
381
|
+
parse_slide_rels(rels)?
|
|
382
|
+
} else {
|
|
383
|
+
Vec::new()
|
|
384
|
+
};
|
|
385
|
+
|
|
386
|
+
Ok(Self {
|
|
387
|
+
slide_number,
|
|
388
|
+
elements,
|
|
389
|
+
images,
|
|
390
|
+
})
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
fn to_markdown(&self, config: &ParserConfig) -> String {
|
|
394
|
+
let mut builder = ContentBuilder::new();
|
|
395
|
+
|
|
396
|
+
if config.include_slide_comment {
|
|
397
|
+
builder.add_slide_header(self.slide_number);
|
|
398
|
+
}
|
|
399
|
+
|
|
400
|
+
let mut element_indices: Vec<usize> = (0..self.elements.len()).collect();
|
|
401
|
+
element_indices.sort_by_key(|&i| {
|
|
402
|
+
let pos = self.elements[i].position();
|
|
403
|
+
(pos.y, pos.x)
|
|
404
|
+
});
|
|
405
|
+
|
|
406
|
+
for &idx in &element_indices {
|
|
407
|
+
match &self.elements[idx] {
|
|
408
|
+
SlideElement::Text(text, _) => {
|
|
409
|
+
let text_content: String = text.runs.iter().map(|run| run.render_as_md()).collect();
|
|
410
|
+
|
|
411
|
+
let normalized = text_content.replace('\n', " ");
|
|
412
|
+
let is_title = normalized.len() < 100 && !normalized.trim().is_empty();
|
|
413
|
+
|
|
414
|
+
if is_title {
|
|
415
|
+
builder.add_title(normalized.trim());
|
|
416
|
+
} else {
|
|
417
|
+
builder.add_text(&text_content);
|
|
418
|
+
}
|
|
419
|
+
}
|
|
420
|
+
SlideElement::Table(table, _) => {
|
|
421
|
+
let table_rows: Vec<Vec<String>> = table
|
|
422
|
+
.rows
|
|
423
|
+
.iter()
|
|
424
|
+
.map(|row| {
|
|
425
|
+
row.cells
|
|
426
|
+
.iter()
|
|
427
|
+
.map(|cell| cell.runs.iter().map(|run| run.extract()).collect::<String>())
|
|
428
|
+
.collect()
|
|
429
|
+
})
|
|
430
|
+
.collect();
|
|
431
|
+
builder.add_table(&table_rows);
|
|
432
|
+
}
|
|
433
|
+
SlideElement::List(list, _) => {
|
|
434
|
+
for item in &list.items {
|
|
435
|
+
let item_text: String = item.runs.iter().map(|run| run.extract()).collect();
|
|
436
|
+
builder.add_list_item(item.level, item.is_ordered, &item_text);
|
|
437
|
+
}
|
|
438
|
+
}
|
|
439
|
+
SlideElement::Image(img_ref, _) => {
|
|
440
|
+
builder.add_image(&img_ref.id, self.slide_number);
|
|
441
|
+
}
|
|
442
|
+
SlideElement::Unknown => {}
|
|
443
|
+
}
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
builder.build()
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
fn image_count(&self) -> usize {
|
|
450
|
+
self.elements
|
|
451
|
+
.iter()
|
|
452
|
+
.filter(|e| matches!(e, SlideElement::Image(_, _)))
|
|
453
|
+
.count()
|
|
454
|
+
}
|
|
455
|
+
|
|
456
|
+
fn table_count(&self) -> usize {
|
|
457
|
+
self.elements
|
|
458
|
+
.iter()
|
|
459
|
+
.filter(|e| matches!(e, SlideElement::Table(_, _)))
|
|
460
|
+
.count()
|
|
461
|
+
}
|
|
462
|
+
}
|
|
463
|
+
|
|
464
|
+
/// Sequential reader over the slides of an opened PPTX container.
struct SlideIterator {
    /// Container the slide parts (and their relationships/images) are read from.
    container: PptxContainer,
    /// Zero-based index of the next slide to read.
    current_index: usize,
    /// Number of slide paths the container reported at construction time.
    total_slides: usize,
}
|
|
469
|
+
|
|
470
|
+
impl SlideIterator {
|
|
471
|
+
fn new(container: PptxContainer) -> Self {
|
|
472
|
+
let total_slides = container.slide_paths().len();
|
|
473
|
+
Self {
|
|
474
|
+
container,
|
|
475
|
+
current_index: 0,
|
|
476
|
+
total_slides,
|
|
477
|
+
}
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
fn slide_count(&self) -> usize {
|
|
481
|
+
self.total_slides
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
fn next_slide(&mut self) -> Result<Option<Slide>> {
|
|
485
|
+
if self.current_index >= self.total_slides {
|
|
486
|
+
return Ok(None);
|
|
487
|
+
}
|
|
488
|
+
|
|
489
|
+
let slide_path = &self.container.slide_paths()[self.current_index].clone();
|
|
490
|
+
let slide_number = (self.current_index + 1) as u32;
|
|
491
|
+
|
|
492
|
+
let xml_data = self.container.read_file(slide_path)?;
|
|
493
|
+
|
|
494
|
+
let rels_path = self.container.get_slide_rels_path(slide_path);
|
|
495
|
+
let rels_data = self.container.read_file(&rels_path).ok();
|
|
496
|
+
|
|
497
|
+
let slide = Slide::from_xml(slide_number, &xml_data, rels_data.as_deref())?;
|
|
498
|
+
|
|
499
|
+
self.current_index += 1;
|
|
500
|
+
|
|
501
|
+
Ok(Some(slide))
|
|
502
|
+
}
|
|
503
|
+
|
|
504
|
+
fn get_slide_images(&mut self, slide: &Slide) -> Result<HashMap<String, Vec<u8>>> {
|
|
505
|
+
let mut image_data = HashMap::new();
|
|
506
|
+
|
|
507
|
+
for img_ref in &slide.images {
|
|
508
|
+
let slide_path = &self.container.slide_paths()[slide.slide_number as usize - 1];
|
|
509
|
+
let full_path = get_full_image_path(slide_path, &img_ref.target);
|
|
510
|
+
|
|
511
|
+
if let Ok(data) = self.container.read_file(&full_path) {
|
|
512
|
+
image_data.insert(img_ref.id.clone(), data);
|
|
513
|
+
}
|
|
514
|
+
}
|
|
515
|
+
|
|
516
|
+
Ok(image_data)
|
|
517
|
+
}
|
|
518
|
+
}
|
|
519
|
+
|
|
520
|
+
use roxmltree::{Document, Node};
|
|
521
|
+
|
|
522
|
+
/// Outcome of parsing a shape's text body: either plain text or a list.
enum ParsedContent {
    /// Text body without list markers.
    Text(TextElement),
    /// Text body whose paragraphs carry list properties.
    List(ListElement),
}
|
|
526
|
+
|
|
527
|
+
fn parse_slide_xml(xml_data: &[u8]) -> Result<Vec<SlideElement>> {
|
|
528
|
+
let xml_str =
|
|
529
|
+
std::str::from_utf8(xml_data).map_err(|_| KreuzbergError::parsing("Invalid UTF-8 in slide XML".to_string()))?;
|
|
530
|
+
|
|
531
|
+
let doc =
|
|
532
|
+
Document::parse(xml_str).map_err(|e| KreuzbergError::parsing(format!("Failed to parse slide XML: {}", e)))?;
|
|
533
|
+
|
|
534
|
+
let root = doc.root_element();
|
|
535
|
+
let ns = root.tag_name().namespace();
|
|
536
|
+
|
|
537
|
+
let c_sld = root
|
|
538
|
+
.descendants()
|
|
539
|
+
.find(|n| n.tag_name().name() == "cSld" && n.tag_name().namespace() == ns)
|
|
540
|
+
.ok_or_else(|| KreuzbergError::parsing("No <p:cSld> tag found".to_string()))?;
|
|
541
|
+
|
|
542
|
+
let sp_tree = c_sld
|
|
543
|
+
.children()
|
|
544
|
+
.find(|n| n.tag_name().name() == "spTree" && n.tag_name().namespace() == ns)
|
|
545
|
+
.ok_or_else(|| KreuzbergError::parsing("No <p:spTree> tag found".to_string()))?;
|
|
546
|
+
|
|
547
|
+
let mut elements = Vec::new();
|
|
548
|
+
for child_node in sp_tree.children().filter(|n| n.is_element()) {
|
|
549
|
+
elements.extend(parse_group(&child_node)?);
|
|
550
|
+
}
|
|
551
|
+
|
|
552
|
+
Ok(elements)
|
|
553
|
+
}
|
|
554
|
+
|
|
555
|
+
fn parse_group(node: &Node) -> Result<Vec<SlideElement>> {
|
|
556
|
+
let mut elements = Vec::new();
|
|
557
|
+
|
|
558
|
+
let tag_name = node.tag_name().name();
|
|
559
|
+
let namespace = node.tag_name().namespace().unwrap_or("");
|
|
560
|
+
|
|
561
|
+
if namespace != P_NAMESPACE {
|
|
562
|
+
return Ok(elements);
|
|
563
|
+
}
|
|
564
|
+
|
|
565
|
+
let position = extract_position(node);
|
|
566
|
+
|
|
567
|
+
match tag_name {
|
|
568
|
+
"sp" => {
|
|
569
|
+
let position = extract_position(node);
|
|
570
|
+
match parse_sp(node)? {
|
|
571
|
+
ParsedContent::Text(text) => elements.push(SlideElement::Text(text, position)),
|
|
572
|
+
ParsedContent::List(list) => elements.push(SlideElement::List(list, position)),
|
|
573
|
+
}
|
|
574
|
+
}
|
|
575
|
+
"graphicFrame" => {
|
|
576
|
+
if let Some(graphic_element) = parse_graphic_frame(node)? {
|
|
577
|
+
elements.push(SlideElement::Table(graphic_element, position));
|
|
578
|
+
}
|
|
579
|
+
}
|
|
580
|
+
"pic" => {
|
|
581
|
+
let image_reference = parse_pic(node)?;
|
|
582
|
+
elements.push(SlideElement::Image(image_reference, position));
|
|
583
|
+
}
|
|
584
|
+
"grpSp" => {
|
|
585
|
+
for child in node.children().filter(|n| n.is_element()) {
|
|
586
|
+
elements.extend(parse_group(&child)?);
|
|
587
|
+
}
|
|
588
|
+
}
|
|
589
|
+
_ => elements.push(SlideElement::Unknown),
|
|
590
|
+
}
|
|
591
|
+
|
|
592
|
+
Ok(elements)
|
|
593
|
+
}
|
|
594
|
+
|
|
595
|
+
fn parse_sp(sp_node: &Node) -> Result<ParsedContent> {
|
|
596
|
+
let tx_body_node = sp_node
|
|
597
|
+
.children()
|
|
598
|
+
.find(|n| n.tag_name().name() == "txBody" && n.tag_name().namespace() == Some(P_NAMESPACE))
|
|
599
|
+
.ok_or_else(|| KreuzbergError::parsing("No txBody found".to_string()))?;
|
|
600
|
+
|
|
601
|
+
let is_list = tx_body_node.descendants().any(|n| {
|
|
602
|
+
n.is_element()
|
|
603
|
+
&& n.tag_name().name() == "pPr"
|
|
604
|
+
&& n.tag_name().namespace() == Some(A_NAMESPACE)
|
|
605
|
+
&& (n.attribute("lvl").is_some()
|
|
606
|
+
|| n.children().any(|child| {
|
|
607
|
+
child.is_element()
|
|
608
|
+
&& (child.tag_name().name() == "buAutoNum" || child.tag_name().name() == "buChar")
|
|
609
|
+
}))
|
|
610
|
+
});
|
|
611
|
+
|
|
612
|
+
if is_list {
|
|
613
|
+
Ok(ParsedContent::List(parse_list(&tx_body_node)?))
|
|
614
|
+
} else {
|
|
615
|
+
Ok(ParsedContent::Text(parse_text(&tx_body_node)?))
|
|
616
|
+
}
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
fn parse_text(tx_body_node: &Node) -> Result<TextElement> {
|
|
620
|
+
let mut runs = Vec::new();
|
|
621
|
+
|
|
622
|
+
for p_node in tx_body_node
|
|
623
|
+
.children()
|
|
624
|
+
.filter(|n| n.is_element() && n.tag_name().name() == "p" && n.tag_name().namespace() == Some(A_NAMESPACE))
|
|
625
|
+
{
|
|
626
|
+
let mut paragraph_runs = parse_paragraph(&p_node, true)?;
|
|
627
|
+
runs.append(&mut paragraph_runs);
|
|
628
|
+
}
|
|
629
|
+
|
|
630
|
+
Ok(TextElement { runs })
|
|
631
|
+
}
|
|
632
|
+
|
|
633
|
+
fn parse_graphic_frame(node: &Node) -> Result<Option<TableElement>> {
|
|
634
|
+
let graphic_data_node = node.descendants().find(|n| {
|
|
635
|
+
n.is_element()
|
|
636
|
+
&& n.tag_name().name() == "graphicData"
|
|
637
|
+
&& n.tag_name().namespace() == Some(A_NAMESPACE)
|
|
638
|
+
&& n.attribute("uri") == Some("http://schemas.openxmlformats.org/drawingml/2006/table")
|
|
639
|
+
});
|
|
640
|
+
|
|
641
|
+
if let Some(graphic_data) = graphic_data_node
|
|
642
|
+
&& let Some(tbl_node) = graphic_data
|
|
643
|
+
.children()
|
|
644
|
+
.find(|n| n.is_element() && n.tag_name().name() == "tbl" && n.tag_name().namespace() == Some(A_NAMESPACE))
|
|
645
|
+
{
|
|
646
|
+
let table = parse_table(&tbl_node)?;
|
|
647
|
+
return Ok(Some(table));
|
|
648
|
+
}
|
|
649
|
+
|
|
650
|
+
Ok(None)
|
|
651
|
+
}
|
|
652
|
+
|
|
653
|
+
fn parse_table(tbl_node: &Node) -> Result<TableElement> {
|
|
654
|
+
let mut rows = Vec::new();
|
|
655
|
+
|
|
656
|
+
for tr_node in tbl_node
|
|
657
|
+
.children()
|
|
658
|
+
.filter(|n| n.is_element() && n.tag_name().name() == "tr" && n.tag_name().namespace() == Some(A_NAMESPACE))
|
|
659
|
+
{
|
|
660
|
+
let row = parse_table_row(&tr_node)?;
|
|
661
|
+
rows.push(row);
|
|
662
|
+
}
|
|
663
|
+
|
|
664
|
+
Ok(TableElement { rows })
|
|
665
|
+
}
|
|
666
|
+
|
|
667
|
+
fn parse_table_row(tr_node: &Node) -> Result<TableRow> {
|
|
668
|
+
let mut cells = Vec::new();
|
|
669
|
+
|
|
670
|
+
for tc_node in tr_node
|
|
671
|
+
.children()
|
|
672
|
+
.filter(|n| n.is_element() && n.tag_name().name() == "tc" && n.tag_name().namespace() == Some(A_NAMESPACE))
|
|
673
|
+
{
|
|
674
|
+
let cell = parse_table_cell(&tc_node)?;
|
|
675
|
+
cells.push(cell);
|
|
676
|
+
}
|
|
677
|
+
|
|
678
|
+
Ok(TableRow { cells })
|
|
679
|
+
}
|
|
680
|
+
|
|
681
|
+
fn parse_table_cell(tc_node: &Node) -> Result<TableCell> {
|
|
682
|
+
let mut runs = Vec::new();
|
|
683
|
+
|
|
684
|
+
if let Some(tx_body_node) = tc_node
|
|
685
|
+
.children()
|
|
686
|
+
.find(|n| n.is_element() && n.tag_name().name() == "txBody" && n.tag_name().namespace() == Some(A_NAMESPACE))
|
|
687
|
+
{
|
|
688
|
+
for p_node in tx_body_node
|
|
689
|
+
.children()
|
|
690
|
+
.filter(|n| n.is_element() && n.tag_name().name() == "p" && n.tag_name().namespace() == Some(A_NAMESPACE))
|
|
691
|
+
{
|
|
692
|
+
let mut paragraph_runs = parse_paragraph(&p_node, false)?;
|
|
693
|
+
runs.append(&mut paragraph_runs);
|
|
694
|
+
}
|
|
695
|
+
}
|
|
696
|
+
|
|
697
|
+
Ok(TableCell { runs })
|
|
698
|
+
}
|
|
699
|
+
|
|
700
|
+
fn parse_pic(pic_node: &Node) -> Result<ImageReference> {
|
|
701
|
+
let blip_node = pic_node
|
|
702
|
+
.descendants()
|
|
703
|
+
.find(|n| n.is_element() && n.tag_name().name() == "blip" && n.tag_name().namespace() == Some(A_NAMESPACE))
|
|
704
|
+
.ok_or_else(|| KreuzbergError::parsing("Image blip not found".to_string()))?;
|
|
705
|
+
|
|
706
|
+
let embed_attr = blip_node
|
|
707
|
+
.attribute((RELS_NAMESPACE, "embed"))
|
|
708
|
+
.or_else(|| blip_node.attribute("r:embed"))
|
|
709
|
+
.ok_or_else(|| KreuzbergError::parsing("Image embed attribute not found".to_string()))?;
|
|
710
|
+
|
|
711
|
+
let image_ref = ImageReference {
|
|
712
|
+
id: embed_attr.to_string(),
|
|
713
|
+
target: String::new(),
|
|
714
|
+
};
|
|
715
|
+
|
|
716
|
+
Ok(image_ref)
|
|
717
|
+
}
|
|
718
|
+
|
|
719
|
+
fn parse_list(tx_body_node: &Node) -> Result<ListElement> {
|
|
720
|
+
let mut items = Vec::new();
|
|
721
|
+
|
|
722
|
+
for p_node in tx_body_node
|
|
723
|
+
.children()
|
|
724
|
+
.filter(|n| n.is_element() && n.tag_name().name() == "p" && n.tag_name().namespace() == Some(A_NAMESPACE))
|
|
725
|
+
{
|
|
726
|
+
let (level, is_ordered) = parse_list_properties(&p_node)?;
|
|
727
|
+
|
|
728
|
+
let runs = parse_paragraph(&p_node, true)?;
|
|
729
|
+
|
|
730
|
+
items.push(ListItem {
|
|
731
|
+
level,
|
|
732
|
+
is_ordered,
|
|
733
|
+
runs,
|
|
734
|
+
});
|
|
735
|
+
}
|
|
736
|
+
|
|
737
|
+
Ok(ListElement { items })
|
|
738
|
+
}
|
|
739
|
+
|
|
740
|
+
fn parse_list_properties(p_node: &Node) -> Result<(u32, bool)> {
|
|
741
|
+
let mut level = 1;
|
|
742
|
+
let mut is_ordered = false;
|
|
743
|
+
|
|
744
|
+
if let Some(p_pr_node) = p_node
|
|
745
|
+
.children()
|
|
746
|
+
.find(|n| n.is_element() && n.tag_name().name() == "pPr" && n.tag_name().namespace() == Some(A_NAMESPACE))
|
|
747
|
+
{
|
|
748
|
+
if let Some(lvl_attr) = p_pr_node.attribute("lvl") {
|
|
749
|
+
level = lvl_attr.parse::<u32>().unwrap_or(0) + 1;
|
|
750
|
+
}
|
|
751
|
+
|
|
752
|
+
is_ordered = p_pr_node.children().any(|n| {
|
|
753
|
+
n.is_element() && n.tag_name().namespace() == Some(A_NAMESPACE) && n.tag_name().name() == "buAutoNum"
|
|
754
|
+
});
|
|
755
|
+
}
|
|
756
|
+
|
|
757
|
+
Ok((level, is_ordered))
|
|
758
|
+
}
|
|
759
|
+
|
|
760
|
+
fn parse_paragraph(p_node: &Node, add_new_line: bool) -> Result<Vec<Run>> {
|
|
761
|
+
let run_nodes: Vec<_> = p_node
|
|
762
|
+
.children()
|
|
763
|
+
.filter(|n| n.is_element() && n.tag_name().name() == "r" && n.tag_name().namespace() == Some(A_NAMESPACE))
|
|
764
|
+
.collect();
|
|
765
|
+
|
|
766
|
+
let count = run_nodes.len();
|
|
767
|
+
let mut runs: Vec<Run> = Vec::new();
|
|
768
|
+
|
|
769
|
+
for (idx, r_node) in run_nodes.iter().enumerate() {
|
|
770
|
+
let mut run = parse_run(r_node)?;
|
|
771
|
+
|
|
772
|
+
if add_new_line && idx == count - 1 {
|
|
773
|
+
run.text.push('\n');
|
|
774
|
+
}
|
|
775
|
+
|
|
776
|
+
runs.push(run);
|
|
777
|
+
}
|
|
778
|
+
Ok(runs)
|
|
779
|
+
}
|
|
780
|
+
|
|
781
|
+
fn parse_run(r_node: &Node) -> Result<Run> {
|
|
782
|
+
let mut text = String::new();
|
|
783
|
+
let mut formatting = Formatting::default();
|
|
784
|
+
|
|
785
|
+
if let Some(r_pr_node) = r_node
|
|
786
|
+
.children()
|
|
787
|
+
.find(|n| n.is_element() && n.tag_name().name() == "rPr" && n.tag_name().namespace() == Some(A_NAMESPACE))
|
|
788
|
+
{
|
|
789
|
+
if let Some(b_attr) = r_pr_node.attribute("b") {
|
|
790
|
+
formatting.bold = b_attr == "1" || b_attr.eq_ignore_ascii_case("true");
|
|
791
|
+
}
|
|
792
|
+
if let Some(i_attr) = r_pr_node.attribute("i") {
|
|
793
|
+
formatting.italic = i_attr == "1" || i_attr.eq_ignore_ascii_case("true");
|
|
794
|
+
}
|
|
795
|
+
if let Some(u_attr) = r_pr_node.attribute("u") {
|
|
796
|
+
formatting.underlined = u_attr != "none";
|
|
797
|
+
}
|
|
798
|
+
if let Some(lang_attr) = r_pr_node.attribute("lang") {
|
|
799
|
+
formatting.lang = lang_attr.to_string();
|
|
800
|
+
}
|
|
801
|
+
}
|
|
802
|
+
|
|
803
|
+
if let Some(t_node) = r_node
|
|
804
|
+
.children()
|
|
805
|
+
.find(|n| n.is_element() && n.tag_name().name() == "t" && n.tag_name().namespace() == Some(A_NAMESPACE))
|
|
806
|
+
&& let Some(t) = t_node.text()
|
|
807
|
+
{
|
|
808
|
+
text.push_str(t);
|
|
809
|
+
}
|
|
810
|
+
Ok(Run { text, formatting })
|
|
811
|
+
}
|
|
812
|
+
|
|
813
|
+
fn extract_position(node: &Node) -> ElementPosition {
|
|
814
|
+
let default = ElementPosition::default();
|
|
815
|
+
|
|
816
|
+
node.descendants()
|
|
817
|
+
.find(|n| n.tag_name().namespace() == Some(A_NAMESPACE) && n.tag_name().name() == "xfrm")
|
|
818
|
+
.and_then(|xfrm| {
|
|
819
|
+
let x = xfrm
|
|
820
|
+
.children()
|
|
821
|
+
.find(|n| n.tag_name().name() == "off" && n.tag_name().namespace() == Some(A_NAMESPACE))
|
|
822
|
+
.and_then(|off| off.attribute("x")?.parse::<i64>().ok())?;
|
|
823
|
+
|
|
824
|
+
let y = xfrm
|
|
825
|
+
.children()
|
|
826
|
+
.find(|n| n.tag_name().name() == "off" && n.tag_name().namespace() == Some(A_NAMESPACE))
|
|
827
|
+
.and_then(|off| off.attribute("y")?.parse::<i64>().ok())?;
|
|
828
|
+
|
|
829
|
+
Some(ElementPosition { x, y })
|
|
830
|
+
})
|
|
831
|
+
.unwrap_or(default)
|
|
832
|
+
}
|
|
833
|
+
|
|
834
|
+
fn parse_slide_rels(rels_data: &[u8]) -> Result<Vec<ImageReference>> {
|
|
835
|
+
let xml_str = std::str::from_utf8(rels_data)
|
|
836
|
+
.map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in rels XML: {}", e)))?;
|
|
837
|
+
|
|
838
|
+
let doc =
|
|
839
|
+
Document::parse(xml_str).map_err(|e| KreuzbergError::parsing(format!("Failed to parse rels XML: {}", e)))?;
|
|
840
|
+
|
|
841
|
+
let mut images = Vec::new();
|
|
842
|
+
|
|
843
|
+
for node in doc.descendants() {
|
|
844
|
+
if node.has_tag_name("Relationship")
|
|
845
|
+
&& let Some(rel_type) = node.attribute("Type")
|
|
846
|
+
&& rel_type.contains("image")
|
|
847
|
+
&& let (Some(id), Some(target)) = (node.attribute("Id"), node.attribute("Target"))
|
|
848
|
+
{
|
|
849
|
+
images.push(ImageReference {
|
|
850
|
+
id: id.to_string(),
|
|
851
|
+
target: target.to_string(),
|
|
852
|
+
});
|
|
853
|
+
}
|
|
854
|
+
}
|
|
855
|
+
|
|
856
|
+
Ok(images)
|
|
857
|
+
}
|
|
858
|
+
|
|
859
|
+
fn parse_presentation_rels(rels_data: &[u8]) -> Result<Vec<String>> {
|
|
860
|
+
let xml_str = std::str::from_utf8(rels_data)
|
|
861
|
+
.map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in presentation rels: {}", e)))?;
|
|
862
|
+
|
|
863
|
+
let doc = Document::parse(xml_str)
|
|
864
|
+
.map_err(|e| KreuzbergError::parsing(format!("Failed to parse presentation rels: {}", e)))?;
|
|
865
|
+
|
|
866
|
+
let mut slide_paths = Vec::new();
|
|
867
|
+
|
|
868
|
+
for node in doc.descendants() {
|
|
869
|
+
if node.has_tag_name("Relationship")
|
|
870
|
+
&& let Some(rel_type) = node.attribute("Type")
|
|
871
|
+
&& rel_type.contains("slide")
|
|
872
|
+
&& !rel_type.contains("slideMaster")
|
|
873
|
+
&& let Some(target) = node.attribute("Target")
|
|
874
|
+
{
|
|
875
|
+
let normalized_target = target.strip_prefix('/').unwrap_or(target);
|
|
876
|
+
let final_path = if normalized_target.starts_with("ppt/") {
|
|
877
|
+
normalized_target.to_string()
|
|
878
|
+
} else {
|
|
879
|
+
format!("ppt/{}", normalized_target)
|
|
880
|
+
};
|
|
881
|
+
slide_paths.push(final_path);
|
|
882
|
+
}
|
|
883
|
+
}
|
|
884
|
+
|
|
885
|
+
Ok(slide_paths)
|
|
886
|
+
}
|
|
887
|
+
|
|
888
|
+
/// Extract comprehensive metadata from PPTX using office_metadata module
|
|
889
|
+
fn extract_metadata(archive: &mut ZipArchive<File>) -> PptxMetadata {
|
|
890
|
+
#[cfg(feature = "office")]
|
|
891
|
+
{
|
|
892
|
+
let mut metadata_map = HashMap::new();
|
|
893
|
+
|
|
894
|
+
if let Ok(core) = extract_core_properties(archive) {
|
|
895
|
+
if let Some(title) = core.title {
|
|
896
|
+
metadata_map.insert("title".to_string(), title);
|
|
897
|
+
}
|
|
898
|
+
if let Some(creator) = core.creator {
|
|
899
|
+
metadata_map.insert("author".to_string(), creator.clone());
|
|
900
|
+
metadata_map.insert("created_by".to_string(), creator);
|
|
901
|
+
}
|
|
902
|
+
if let Some(subject) = core.subject {
|
|
903
|
+
metadata_map.insert("subject".to_string(), subject.clone());
|
|
904
|
+
metadata_map.insert("summary".to_string(), subject);
|
|
905
|
+
}
|
|
906
|
+
if let Some(keywords) = core.keywords {
|
|
907
|
+
metadata_map.insert("keywords".to_string(), keywords);
|
|
908
|
+
}
|
|
909
|
+
if let Some(description) = core.description {
|
|
910
|
+
metadata_map.insert("description".to_string(), description);
|
|
911
|
+
}
|
|
912
|
+
if let Some(modified_by) = core.last_modified_by {
|
|
913
|
+
metadata_map.insert("modified_by".to_string(), modified_by);
|
|
914
|
+
}
|
|
915
|
+
if let Some(created) = core.created {
|
|
916
|
+
metadata_map.insert("created_at".to_string(), created);
|
|
917
|
+
}
|
|
918
|
+
if let Some(modified) = core.modified {
|
|
919
|
+
metadata_map.insert("modified_at".to_string(), modified);
|
|
920
|
+
}
|
|
921
|
+
if let Some(revision) = core.revision {
|
|
922
|
+
metadata_map.insert("revision".to_string(), revision);
|
|
923
|
+
}
|
|
924
|
+
if let Some(category) = core.category {
|
|
925
|
+
metadata_map.insert("category".to_string(), category);
|
|
926
|
+
}
|
|
927
|
+
}
|
|
928
|
+
|
|
929
|
+
if let Ok(app) = extract_pptx_app_properties(archive) {
|
|
930
|
+
if let Some(slides) = app.slides {
|
|
931
|
+
metadata_map.insert("slide_count".to_string(), slides.to_string());
|
|
932
|
+
}
|
|
933
|
+
if let Some(notes) = app.notes {
|
|
934
|
+
metadata_map.insert("notes_count".to_string(), notes.to_string());
|
|
935
|
+
}
|
|
936
|
+
if let Some(hidden_slides) = app.hidden_slides {
|
|
937
|
+
metadata_map.insert("hidden_slides".to_string(), hidden_slides.to_string());
|
|
938
|
+
}
|
|
939
|
+
if !app.slide_titles.is_empty() {
|
|
940
|
+
metadata_map.insert("slide_titles".to_string(), app.slide_titles.join(", "));
|
|
941
|
+
}
|
|
942
|
+
if let Some(presentation_format) = app.presentation_format {
|
|
943
|
+
metadata_map.insert("presentation_format".to_string(), presentation_format);
|
|
944
|
+
}
|
|
945
|
+
if let Some(company) = app.company {
|
|
946
|
+
metadata_map.insert("organization".to_string(), company);
|
|
947
|
+
}
|
|
948
|
+
if let Some(application) = app.application {
|
|
949
|
+
metadata_map.insert("application".to_string(), application);
|
|
950
|
+
}
|
|
951
|
+
if let Some(app_version) = app.app_version {
|
|
952
|
+
metadata_map.insert("application_version".to_string(), app_version);
|
|
953
|
+
}
|
|
954
|
+
}
|
|
955
|
+
|
|
956
|
+
if let Ok(custom) = extract_custom_properties(archive) {
|
|
957
|
+
for (key, value) in custom {
|
|
958
|
+
let value_str = match value {
|
|
959
|
+
Value::String(s) => s,
|
|
960
|
+
Value::Number(n) => n.to_string(),
|
|
961
|
+
Value::Bool(b) => b.to_string(),
|
|
962
|
+
Value::Null => "null".to_string(),
|
|
963
|
+
Value::Array(_) | Value::Object(_) => value.to_string(),
|
|
964
|
+
};
|
|
965
|
+
metadata_map.insert(format!("custom_{}", key), value_str);
|
|
966
|
+
}
|
|
967
|
+
}
|
|
968
|
+
|
|
969
|
+
PptxMetadata {
|
|
970
|
+
title: metadata_map.get("title").cloned(),
|
|
971
|
+
author: metadata_map.get("author").cloned(),
|
|
972
|
+
description: metadata_map.get("description").cloned(),
|
|
973
|
+
summary: metadata_map.get("summary").cloned(),
|
|
974
|
+
fonts: Vec::new(),
|
|
975
|
+
}
|
|
976
|
+
}
|
|
977
|
+
|
|
978
|
+
#[cfg(not(feature = "office"))]
|
|
979
|
+
{
|
|
980
|
+
PptxMetadata {
|
|
981
|
+
title: None,
|
|
982
|
+
author: None,
|
|
983
|
+
description: None,
|
|
984
|
+
summary: None,
|
|
985
|
+
fonts: Vec::new(),
|
|
986
|
+
}
|
|
987
|
+
}
|
|
988
|
+
}
|
|
989
|
+
|
|
990
|
+
fn extract_all_notes(container: &mut PptxContainer) -> Result<HashMap<u32, String>> {
|
|
991
|
+
let mut notes = HashMap::new();
|
|
992
|
+
|
|
993
|
+
let slide_paths: Vec<String> = container.slide_paths().to_vec();
|
|
994
|
+
|
|
995
|
+
for (i, slide_path) in slide_paths.iter().enumerate() {
|
|
996
|
+
let notes_path = slide_path.replace("slides/slide", "notesSlides/notesSlide");
|
|
997
|
+
if let Ok(notes_xml) = container.read_file(¬es_path)
|
|
998
|
+
&& let Ok(note_text) = extract_notes_text(¬es_xml)
|
|
999
|
+
{
|
|
1000
|
+
notes.insert((i + 1) as u32, note_text);
|
|
1001
|
+
}
|
|
1002
|
+
}
|
|
1003
|
+
|
|
1004
|
+
Ok(notes)
|
|
1005
|
+
}
|
|
1006
|
+
|
|
1007
|
+
fn extract_notes_text(notes_xml: &[u8]) -> Result<String> {
|
|
1008
|
+
let xml_str = std::str::from_utf8(notes_xml)
|
|
1009
|
+
.map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in notes XML: {}", e)))?;
|
|
1010
|
+
|
|
1011
|
+
let doc =
|
|
1012
|
+
Document::parse(xml_str).map_err(|e| KreuzbergError::parsing(format!("Failed to parse notes XML: {}", e)))?;
|
|
1013
|
+
|
|
1014
|
+
let mut text_parts = Vec::new();
|
|
1015
|
+
const DRAWINGML_NS: &str = "http://schemas.openxmlformats.org/drawingml/2006/main";
|
|
1016
|
+
|
|
1017
|
+
for node in doc.descendants() {
|
|
1018
|
+
if node.has_tag_name((DRAWINGML_NS, "t"))
|
|
1019
|
+
&& let Some(text) = node.text()
|
|
1020
|
+
{
|
|
1021
|
+
text_parts.push(text);
|
|
1022
|
+
}
|
|
1023
|
+
}
|
|
1024
|
+
|
|
1025
|
+
Ok(text_parts.join(" "))
|
|
1026
|
+
}
|
|
1027
|
+
|
|
1028
|
+
/// Returns the path of the relationships part that belongs to `slide_path`.
///
/// `dir/name` maps to `dir/_rels/name.rels`; a path without any `/` maps to
/// `_rels/<path>.rels`.
fn get_slide_rels_path(slide_path: &str) -> String {
    match slide_path.rsplit_once('/') {
        Some((directory, file_name)) => format!("{}/_rels/{}.rels", directory, file_name),
        None => format!("_rels/{}.rels", slide_path),
    }
}
|
|
1036
|
+
|
|
1037
|
+
/// Resolves an image relationship target against the slide that references it.
///
/// * A target starting with `/` is package-absolute and is returned without
///   the leading slash.
/// * A target starting with `../` is resolved against the parent of the
///   slide's directory (falling back to `ppt/` when the slide path has fewer
///   than three segments). Only one leading `../` is consumed.
/// * Any other target is resolved against the slide's own directory
///   (falling back to `ppt/slides/`).
///
/// Unlike slicing the target at a fixed byte offset, this never panics on
/// short or unusual targets such as `".."`.
fn get_full_image_path(slide_path: &str, image_target: &str) -> String {
    if let Some(absolute) = image_target.strip_prefix('/') {
        return absolute.to_string();
    }

    if let Some(rest) = image_target.strip_prefix("../") {
        // rsplitn(3) yields [file, slide dir, everything above]; the third
        // piece is the base that "../" refers to.
        return match slide_path.rsplitn(3, '/').nth(2) {
            Some(base) => format!("{}/{}", base, rest),
            None => format!("ppt/{}", rest),
        };
    }

    match slide_path.rsplit_once('/') {
        Some((directory, _)) => format!("{}/{}", directory, image_target),
        None => format!("ppt/slides/{}", image_target),
    }
}
|
|
1054
|
+
|
|
1055
|
+
/// Guesses an image format name from the leading bytes of `data`.
///
/// Returns one of `"jpeg"`, `"png"`, `"gif"`, `"bmp"`, `"svg"`, `"tiff"`,
/// or `"unknown"` when no signature matches. Data starting with `<?xml` is
/// reported as `"svg"`.
fn detect_image_format(data: &[u8]) -> String {
    // Checked in order; the first matching prefix wins.
    const SIGNATURES: &[(&[u8], &str)] = &[
        (&[0xFF, 0xD8, 0xFF], "jpeg"),
        (&[0x89, 0x50, 0x4E, 0x47], "png"),
        (b"GIF", "gif"),
        (b"BM", "bmp"),
        (b"<svg", "svg"),
        (b"<?xml", "svg"),
        (b"II\x2A\x00", "tiff"),
        (b"MM\x00\x2A", "tiff"),
    ];

    SIGNATURES
        .iter()
        .find(|(magic, _)| data.starts_with(magic))
        .map_or("unknown", |&(_, format)| format)
        .to_string()
}
|
|
1072
|
+
|
|
1073
|
+
pub fn extract_pptx_from_path(path: &str, extract_images: bool) -> Result<PptxExtractionResult> {
|
|
1074
|
+
let config = ParserConfig {
|
|
1075
|
+
extract_images,
|
|
1076
|
+
..Default::default()
|
|
1077
|
+
};
|
|
1078
|
+
|
|
1079
|
+
let mut container = PptxContainer::open(path)?;
|
|
1080
|
+
|
|
1081
|
+
let metadata = extract_metadata(&mut container.archive);
|
|
1082
|
+
|
|
1083
|
+
let notes = extract_all_notes(&mut container)?;
|
|
1084
|
+
|
|
1085
|
+
let mut iterator = SlideIterator::new(container);
|
|
1086
|
+
let slide_count = iterator.slide_count();
|
|
1087
|
+
|
|
1088
|
+
let estimated_capacity = slide_count * 1024;
|
|
1089
|
+
let mut content_builder = ContentBuilder::with_capacity(estimated_capacity);
|
|
1090
|
+
|
|
1091
|
+
let mut total_image_count = 0;
|
|
1092
|
+
let mut total_table_count = 0;
|
|
1093
|
+
let mut extracted_images = Vec::new();
|
|
1094
|
+
|
|
1095
|
+
while let Some(slide) = iterator.next_slide()? {
|
|
1096
|
+
content_builder.add_slide_header(slide.slide_number);
|
|
1097
|
+
|
|
1098
|
+
let slide_content = slide.to_markdown(&config);
|
|
1099
|
+
content_builder.add_text(&slide_content);
|
|
1100
|
+
|
|
1101
|
+
if let Some(slide_notes) = notes.get(&slide.slide_number) {
|
|
1102
|
+
content_builder.add_notes(slide_notes);
|
|
1103
|
+
}
|
|
1104
|
+
|
|
1105
|
+
if config.extract_images
|
|
1106
|
+
&& let Ok(image_data) = iterator.get_slide_images(&slide)
|
|
1107
|
+
{
|
|
1108
|
+
for (_, data) in image_data {
|
|
1109
|
+
let format = detect_image_format(&data);
|
|
1110
|
+
let image_index = extracted_images.len();
|
|
1111
|
+
|
|
1112
|
+
extracted_images.push(ExtractedImage {
|
|
1113
|
+
data,
|
|
1114
|
+
format,
|
|
1115
|
+
image_index,
|
|
1116
|
+
page_number: Some(slide.slide_number as usize),
|
|
1117
|
+
width: None,
|
|
1118
|
+
height: None,
|
|
1119
|
+
colorspace: None,
|
|
1120
|
+
bits_per_component: None,
|
|
1121
|
+
is_mask: false,
|
|
1122
|
+
description: None,
|
|
1123
|
+
ocr_result: None,
|
|
1124
|
+
});
|
|
1125
|
+
}
|
|
1126
|
+
}
|
|
1127
|
+
|
|
1128
|
+
total_image_count += slide.image_count();
|
|
1129
|
+
total_table_count += slide.table_count();
|
|
1130
|
+
}
|
|
1131
|
+
|
|
1132
|
+
Ok(PptxExtractionResult {
|
|
1133
|
+
content: content_builder.build(),
|
|
1134
|
+
metadata,
|
|
1135
|
+
slide_count,
|
|
1136
|
+
image_count: total_image_count,
|
|
1137
|
+
table_count: total_table_count,
|
|
1138
|
+
images: extracted_images,
|
|
1139
|
+
})
|
|
1140
|
+
}
|
|
1141
|
+
|
|
1142
|
+
pub fn extract_pptx_from_bytes(data: &[u8], extract_images: bool) -> Result<PptxExtractionResult> {
|
|
1143
|
+
use std::sync::atomic::{AtomicU64, Ordering};
|
|
1144
|
+
static COUNTER: AtomicU64 = AtomicU64::new(0);
|
|
1145
|
+
let unique_id = COUNTER.fetch_add(1, Ordering::SeqCst);
|
|
1146
|
+
let temp_path = std::env::temp_dir().join(format!("temp_pptx_{}_{}.pptx", std::process::id(), unique_id));
|
|
1147
|
+
|
|
1148
|
+
// IO errors must bubble up - temp file write issues need user reports ~keep
|
|
1149
|
+
std::fs::write(&temp_path, data)?;
|
|
1150
|
+
|
|
1151
|
+
let result = extract_pptx_from_path(temp_path.to_str().unwrap(), extract_images);
|
|
1152
|
+
|
|
1153
|
+
let _ = std::fs::remove_file(&temp_path);
|
|
1154
|
+
|
|
1155
|
+
result
|
|
1156
|
+
}
|
|
1157
|
+
|
|
1158
|
+
#[cfg(test)]
|
|
1159
|
+
mod tests {
|
|
1160
|
+
use super::*;
|
|
1161
|
+
|
|
1162
|
+
/// Builds a minimal in-memory PPTX archive for tests and returns its bytes.
///
/// One slide part (`ppt/slides/slide{n}.xml`, numbered from 1) is written
/// per entry of `slides`, with the entry's text placed in a single run.
/// The archive also contains `[Content_Types].xml`, `ppt/presentation.xml`,
/// the package and presentation relationship parts, and `docProps/core.xml`
/// with fixed title/creator/description/subject values.
///
/// The slide text is interpolated into the XML as-is (no escaping).
/// Panics if any zip write fails.
fn create_test_pptx_bytes(slides: Vec<&str>) -> Vec<u8> {
    use std::io::Write;
    use zip::write::{SimpleFileOptions, ZipWriter};

    let mut buffer = Vec::new();
    {
        // Scoped so the writer's borrow of `buffer` ends before it is returned.
        let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
        let options = SimpleFileOptions::default();

        zip.start_file("[Content_Types].xml", options).unwrap();
        zip.write_all(
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="xml" ContentType="application/xml"/>
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
</Types>"#,
        )
        .unwrap();

        zip.start_file("ppt/presentation.xml", options).unwrap();
        zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();

        zip.start_file("_rels/.rels", options).unwrap();
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#).unwrap();

        // Presentation relationships: one slide relationship per requested slide.
        let mut rels_xml = String::from(
            r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">"#,
        );
        for (i, _) in slides.iter().enumerate() {
            rels_xml.push_str(&format!(
                r#"<Relationship Id="rId{}" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide{}.xml"/>"#,
                i + 1,
                i + 1
            ));
        }
        rels_xml.push_str("</Relationships>");
        zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
        zip.write_all(rels_xml.as_bytes()).unwrap();

        // Slide parts: a single shape with one paragraph and one run each.
        for (i, text) in slides.iter().enumerate() {
            let slide_xml = format!(
                r#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld>
<p:spTree>
<p:sp>
<p:txBody>
<a:p>
<a:r>
<a:t>{}</a:t>
</a:r>
</a:p>
</p:txBody>
</p:sp>
</p:spTree>
</p:cSld>
</p:sld>"#,
                text
            );
            zip.start_file(format!("ppt/slides/slide{}.xml", i + 1), options)
                .unwrap();
            zip.write_all(slide_xml.as_bytes()).unwrap();
        }

        zip.start_file("docProps/core.xml", options).unwrap();
        zip.write_all(
            br#"<?xml version="1.0" encoding="UTF-8"?>
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcterms="http://purl.org/dc/terms/">
<dc:title>Test Presentation</dc:title>
<dc:creator>Test Author</dc:creator>
<dc:description>Test Description</dc:description>
<dc:subject>Test Subject</dc:subject>
</cp:coreProperties>"#,
        )
        .unwrap();

        let _ = zip.finish().unwrap();
    }
    buffer
}
|
|
1249
|
+
|
|
1250
|
+
/// A deck built from one text slide must report a single slide, expose the
/// slide text in `content`, and count no images or tables.
#[test]
fn test_extract_pptx_from_bytes_single_slide() {
    let deck = create_test_pptx_bytes(vec!["Hello World"]);
    let extracted = extract_pptx_from_bytes(&deck, false).unwrap();

    assert_eq!(extracted.slide_count, 1);

    // Echo the extracted content on failure so the mismatch is visible.
    let content = &extracted.content;
    assert!(content.contains("Hello World"), "Content was: {}", content);

    assert_eq!(extracted.image_count, 0);
    assert_eq!(extracted.table_count, 0);
}
|
|
1264
|
+
|
|
1265
|
+
/// A three-slide deck must report three slides and include the text of each.
#[test]
fn test_extract_pptx_from_bytes_multiple_slides() {
    let deck = create_test_pptx_bytes(vec!["Slide 1", "Slide 2", "Slide 3"]);
    let extracted = extract_pptx_from_bytes(&deck, false).unwrap();

    assert_eq!(extracted.slide_count, 3);
    for &slide_text in &["Slide 1", "Slide 2", "Slide 3"] {
        assert!(extracted.content.contains(slide_text));
    }
}
|
|
1275
|
+
|
|
1276
|
+
/// The extracted metadata must carry the expected title, author, description
/// and summary ("Test Subject") for a deck produced by the test helper.
#[test]
fn test_extract_pptx_metadata() {
    let deck = create_test_pptx_bytes(vec!["Content"]);
    let extracted = extract_pptx_from_bytes(&deck, false).unwrap();
    let metadata = &extracted.metadata;

    assert_eq!(metadata.title.as_deref(), Some("Test Presentation"));
    assert_eq!(metadata.author.as_deref(), Some("Test Author"));
    assert_eq!(metadata.description.as_deref(), Some("Test Description"));
    assert_eq!(metadata.summary.as_deref(), Some("Test Subject"));
}
|
|
1286
|
+
|
|
1287
|
+
/// Slides with empty text must still be counted.
#[test]
fn test_extract_pptx_empty_slides() {
    let blank_slides = vec![""; 3];
    let deck = create_test_pptx_bytes(blank_slides);
    let extracted = extract_pptx_from_bytes(&deck, false).unwrap();

    assert_eq!(extracted.slide_count, 3);
}
|
|
1294
|
+
|
|
1295
|
+
/// Arbitrary non-PPTX bytes must be rejected with a `Parsing` error whose
/// message names either the archive-read or the temp-file-write failure.
#[test]
fn test_extract_pptx_from_bytes_invalid_data() {
    let outcome = extract_pptx_from_bytes(b"not a valid pptx file", false);

    assert!(outcome.is_err());
    match outcome {
        Err(KreuzbergError::Parsing { message, .. }) => {
            assert!(
                message.contains("Failed to read PPTX archive")
                    || message.contains("Failed to write temp PPTX file")
            );
        }
        _ => panic!("Expected ParsingError"),
    }
}
|
|
1307
|
+
|
|
1308
|
+
/// An empty byte slice must produce an error rather than an extraction result.
#[test]
fn test_extract_pptx_from_bytes_empty_data() {
    let no_bytes: &[u8] = b"";
    let outcome = extract_pptx_from_bytes(no_bytes, false);

    assert!(outcome.is_err());
}
|
|
1315
|
+
|
|
1316
|
+
/// Bytes `FF D8 FF E0` must be detected as "jpeg".
#[test]
fn test_detect_image_format_jpeg() {
    let header: [u8; 4] = [0xFF, 0xD8, 0xFF, 0xE0];
    let detected = detect_image_format(&header);
    assert_eq!(detected, "jpeg");
}
|
|
1321
|
+
|
|
1322
|
+
/// Bytes `89 50 4E 47` must be detected as "png".
#[test]
fn test_detect_image_format_png() {
    let header: [u8; 4] = [0x89, 0x50, 0x4E, 0x47];
    let detected = detect_image_format(&header);
    assert_eq!(detected, "png");
}
|
|
1327
|
+
|
|
1328
|
+
/// Data starting with `GIF89a` must be detected as "gif".
#[test]
fn test_detect_image_format_gif() {
    let detected = detect_image_format(b"GIF89a");
    assert_eq!(detected, "gif");
}
|
|
1333
|
+
|
|
1334
|
+
/// The two bytes `BM` must be detected as "bmp".
#[test]
fn test_detect_image_format_bmp() {
    let detected = detect_image_format(b"BM");
    assert_eq!(detected, "bmp");
}
|
|
1339
|
+
|
|
1340
|
+
/// Data beginning with an `<svg ...>` tag must be detected as "svg".
#[test]
fn test_detect_image_format_svg() {
    let detected = detect_image_format(b"<svg xmlns=\"http://www.w3.org/2000/svg\">");
    assert_eq!(detected, "svg");
}
|
|
1345
|
+
|
|
1346
|
+
/// Bytes `49 49 2A 00` must be detected as "tiff".
#[test]
fn test_detect_image_format_tiff_little_endian() {
    let header: [u8; 4] = [0x49, 0x49, 0x2A, 0x00];
    let detected = detect_image_format(&header);
    assert_eq!(detected, "tiff");
}
|
|
1351
|
+
|
|
1352
|
+
/// Bytes `4D 4D 00 2A` must be detected as "tiff".
#[test]
fn test_detect_image_format_tiff_big_endian() {
    let header: [u8; 4] = [0x4D, 0x4D, 0x00, 0x2A];
    let detected = detect_image_format(&header);
    assert_eq!(detected, "tiff");
}
|
|
1357
|
+
|
|
1358
|
+
/// Data that matches none of the tested signatures must yield "unknown".
#[test]
fn test_detect_image_format_unknown() {
    let detected = detect_image_format(b"unknown format");
    assert_eq!(detected, "unknown");
}
|
|
1363
|
+
|
|
1364
|
+
/// Checks `html_escape` on plain text, on each special character in
/// isolation, and on a string combining all of them.
///
/// NOTE(review): the expected literals had lost their HTML entities (leaving
/// e.g. `""quoted""`, which does not compile, and expectations identical to
/// their inputs). `&amp;`, `&lt;`, `&gt;` and `&quot;` are restored here. The
/// apostrophe is assumed to be emitted as `&#x27;` — confirm against the
/// `html_escape` implementation.
#[test]
fn test_html_escape() {
    assert_eq!(html_escape("plain text"), "plain text");
    assert_eq!(html_escape("a & b"), "a &amp; b");
    assert_eq!(html_escape("<tag>"), "&lt;tag&gt;");
    assert_eq!(html_escape("\"quoted\""), "&quot;quoted&quot;");
    assert_eq!(html_escape("'apostrophe'"), "&#x27;apostrophe&#x27;");
    assert_eq!(
        html_escape("<a href=\"url\" title='test'>text & more</a>"),
        "&lt;a href=&quot;url&quot; title=&#x27;test&#x27;&gt;text &amp; more&lt;/a&gt;"
    );
}
|
|
1376
|
+
|
|
1377
|
+
/// A slide path must map to the `_rels/<file>.rels` path beside it, for both
/// single- and double-digit slide numbers.
#[test]
fn test_get_slide_rels_path() {
    let cases = [
        ("ppt/slides/slide1.xml", "ppt/slides/_rels/slide1.xml.rels"),
        ("ppt/slides/slide10.xml", "ppt/slides/_rels/slide10.xml.rels"),
    ];

    for &(slide_path, expected_rels) in &cases {
        assert_eq!(get_slide_rels_path(slide_path), expected_rels);
    }
}
|
|
1388
|
+
|
|
1389
|
+
/// A `../media/...` target must resolve against the slide's directory to a
/// path under `ppt/media`.
#[test]
fn test_get_full_image_path_relative() {
    let resolved = get_full_image_path("ppt/slides/slide1.xml", "../media/image1.png");
    assert_eq!(resolved, "ppt/media/image1.png");
}
|
|
1396
|
+
|
|
1397
|
+
/// A bare file name target must resolve into the slide's own directory.
#[test]
fn test_get_full_image_path_direct() {
    let resolved = get_full_image_path("ppt/slides/slide1.xml", "image1.png");
    assert_eq!(resolved, "ppt/slides/image1.png");
}
|
|
1404
|
+
|
|
1405
|
+
/// Adding "Hello", a single space, then "World" must build "HelloWorld":
/// the whitespace-only fragment contributes nothing.
#[test]
fn test_content_builder_add_text() {
    let mut builder = ContentBuilder::new();
    for &fragment in &["Hello", " ", "World"] {
        builder.add_text(fragment);
    }
    assert_eq!(builder.build(), "HelloWorld");
}
|
|
1413
|
+
|
|
1414
|
+
/// Whitespace-only and empty text must leave the built content empty.
#[test]
fn test_content_builder_add_text_empty() {
    let mut builder = ContentBuilder::new();
    for &fragment in &[" ", ""] {
        builder.add_text(fragment);
    }
    assert_eq!(builder.build(), "");
}
|
|
1421
|
+
|
|
1422
|
+
/// A title must be rendered as a level-one Markdown heading.
#[test]
fn test_content_builder_add_title() {
    let mut builder = ContentBuilder::new();
    builder.add_title("Title");

    let rendered = builder.build();
    assert_eq!(rendered, "# Title");
}
|
|
1428
|
+
|
|
1429
|
+
/// Whitespace surrounding a title must not appear in the rendered heading.
#[test]
fn test_content_builder_add_title_with_whitespace() {
    let mut builder = ContentBuilder::new();
    builder.add_title(" Title ");

    let rendered = builder.build();
    assert_eq!(rendered, "# Title");
}
|
|
1435
|
+
|
|
1436
|
+
/// Adding a table with no rows must leave the built content empty.
#[test]
fn test_content_builder_add_table_empty() {
    let mut builder = ContentBuilder::new();
    builder.add_table(&[]);

    let rendered = builder.build();
    assert_eq!(rendered, "");
}
|
|
1442
|
+
|
|
1443
|
+
/// A one-row table must be emitted as a `<table>` whose cells are `<th>`
/// header cells.
#[test]
fn test_content_builder_add_table_single_row() {
    let header_row: Vec<String> = ["Header1", "Header2"]
        .iter()
        .map(|cell| cell.to_string())
        .collect();
    let rows = vec![header_row];

    let mut builder = ContentBuilder::new();
    builder.add_table(&rows);
    let rendered = builder.build();

    for &fragment in &["<table>", "<th>Header1</th>", "<th>Header2</th>"] {
        assert!(rendered.contains(fragment));
    }
}
|
|
1453
|
+
|
|
1454
|
+
/// In a two-row table the first row must use `<th>` cells and the second
/// row `<td>` cells.
#[test]
fn test_content_builder_add_table_multiple_rows() {
    let rows: Vec<Vec<String>> = [["H1", "H2"], ["D1", "D2"]]
        .iter()
        .map(|row| row.iter().map(|cell| cell.to_string()).collect())
        .collect();

    let mut builder = ContentBuilder::new();
    builder.add_table(&rows);
    let rendered = builder.build();

    assert!(rendered.contains("<th>H1</th>"));
    assert!(rendered.contains("<td>D1</td>"));
}
|
|
1466
|
+
|
|
1467
|
+
/// Table cells containing HTML-special characters must appear escaped in the
/// rendered table.
///
/// NOTE(review): the expected substrings had lost their HTML entities and
/// read `<tag>` / `a & b`, which would only pass if cells were emitted
/// unescaped. The escaped forms are restored here on the assumption that
/// `add_table` escapes cell text the same way `html_escape` does — confirm
/// against `ContentBuilder::add_table`.
#[test]
fn test_content_builder_add_table_with_special_chars() {
    let mut builder = ContentBuilder::new();
    let rows = vec![vec!["<tag>".to_string(), "a & b".to_string()]];
    builder.add_table(&rows);

    let result = builder.build();
    assert!(result.contains("&lt;tag&gt;"));
    assert!(result.contains("a &amp; b"));
}
|
|
1476
|
+
|
|
1477
|
+
/// Unordered level-1 items must be rendered with a `- ` marker.
#[test]
fn test_content_builder_add_list_item_unordered() {
    let mut builder = ContentBuilder::new();
    for &item in &["Item 1", "Item 2"] {
        builder.add_list_item(1, false, item);
    }

    let rendered = builder.build();
    for &line in &["- Item 1", "- Item 2"] {
        assert!(rendered.contains(line));
    }
}
|
|
1486
|
+
|
|
1487
|
+
/// Ordered level-1 items are each expected to carry the literal marker `1.`.
#[test]
fn test_content_builder_add_list_item_ordered() {
    let mut builder = ContentBuilder::new();
    for &item in &["First", "Second"] {
        builder.add_list_item(1, true, item);
    }

    let rendered = builder.build();
    for &line in &["1. First", "1. Second"] {
        assert!(rendered.contains(line));
    }
}
|
|
1496
|
+
|
|
1497
|
+
/// Items at levels 1 to 3 must all render with a `- ` marker; for levels 2
/// and 3 the marker is expected to be preceded by whitespace.
#[test]
fn test_content_builder_add_list_item_nested() {
    let mut builder = ContentBuilder::new();
    for &(level, label) in &[(1, "Level 1"), (2, "Level 2"), (3, "Level 3")] {
        builder.add_list_item(level, false, label);
    }

    let rendered = builder.build();
    for &line in &["- Level 1", " - Level 2", " - Level 3"] {
        assert!(rendered.contains(line));
    }
}
|
|
1508
|
+
|
|
1509
|
+
/// Adding an image reference must leave a trace of the image id in the
/// built content.
///
/// NOTE(review): the assertion previously checked `contains("")`, which is
/// true for every string, so the test could never fail; the expected literal
/// appears to have been lost. Until the exact expected markup is confirmed
/// against `ContentBuilder::add_image`, this checks only that the image id
/// shows up in the output.
#[test]
fn test_content_builder_add_image() {
    let mut builder = ContentBuilder::new();
    builder.add_image("img123", 5);

    let result = builder.build();
    assert!(result.contains("img123"));
}
|
|
1516
|
+
|
|
1517
|
+
/// Notes must be emitted under a `### Notes:` heading together with the
/// note text.
#[test]
fn test_content_builder_add_notes() {
    let mut builder = ContentBuilder::new();
    builder.add_notes("This is a note");

    let rendered = builder.build();
    for &fragment in &["### Notes:", "This is a note"] {
        assert!(rendered.contains(fragment));
    }
}
|
|
1525
|
+
|
|
1526
|
+
/// Whitespace-only notes must leave the built content empty.
#[test]
fn test_content_builder_add_notes_empty() {
    let mut builder = ContentBuilder::new();
    builder.add_notes(" ");

    let rendered = builder.build();
    assert_eq!(rendered, "");
}
|
|
1532
|
+
|
|
1533
|
+
/// A slide header must be emitted as an HTML comment carrying the slide
/// number.
#[test]
fn test_content_builder_add_slide_header() {
    let mut builder = ContentBuilder::new();
    builder.add_slide_header(3);

    let rendered = builder.build();
    assert!(rendered.contains("<!-- Slide number: 3 -->"));
}
|
|
1540
|
+
|
|
1541
|
+
/// `Run::extract` must return the run's text for default formatting.
#[test]
fn test_run_extract() {
    let formatting = Formatting::default();
    let text = "Hello".to_string();
    let run = Run { text, formatting };

    assert_eq!(run.extract(), "Hello");
}
|
|
1549
|
+
|
|
1550
|
+
/// With default formatting, the Markdown rendering is the text unchanged.
#[test]
fn test_run_render_as_md_plain() {
    let formatting = Formatting::default();
    let text = "plain".to_string();
    let run = Run { text, formatting };

    assert_eq!(run.render_as_md(), "plain");
}
|
|
1558
|
+
|
|
1559
|
+
/// A bold run must be wrapped in `**` when rendered as Markdown.
#[test]
fn test_run_render_as_md_bold() {
    let formatting = Formatting {
        bold: true,
        ..Default::default()
    };
    let text = "bold".to_string();
    let run = Run { text, formatting };

    assert_eq!(run.render_as_md(), "**bold**");
}
|
|
1570
|
+
|
|
1571
|
+
/// An italic run must be wrapped in `*` when rendered as Markdown.
#[test]
fn test_run_render_as_md_italic() {
    let formatting = Formatting {
        italic: true,
        ..Default::default()
    };
    let text = "italic".to_string();
    let run = Run { text, formatting };

    assert_eq!(run.render_as_md(), "*italic*");
}
|
|
1582
|
+
|
|
1583
|
+
/// A run that is both bold and italic must be wrapped in `***`.
#[test]
fn test_run_render_as_md_bold_italic() {
    let formatting = Formatting {
        bold: true,
        italic: true,
        ..Default::default()
    };
    let text = "both".to_string();
    let run = Run { text, formatting };

    assert_eq!(run.render_as_md(), "***both***");
}
|
|
1595
|
+
|
|
1596
|
+
/// Parses a slide containing one text run and, when at least one element is
/// returned, requires the first to be a text element whose first run reads
/// "Test Text" followed by a newline.
#[test]
fn test_parse_slide_xml_simple_text() {
    let slide_xml = br#"<?xml version="1.0"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld>
<p:spTree>
<p:sp>
<p:txBody>
<a:p>
<a:r>
<a:t>Test Text</a:t>
</a:r>
</a:p>
</p:txBody>
</p:sp>
</p:spTree>
</p:cSld>
</p:sld>"#;

    let parsed = parse_slide_xml(slide_xml).unwrap();

    // An empty result is tolerated; only a present first element is checked.
    if let Some(first) = parsed.first() {
        match first {
            SlideElement::Text(text, _) => assert_eq!(text.runs[0].text, "Test Text\n"),
            _ => panic!("Expected Text element"),
        }
    }
}
|
|
1625
|
+
|
|
1626
|
+
/// Bytes that are not valid UTF-8 must be rejected; when the error is a
/// `Parsing` error its message must mention "Invalid UTF-8".
#[test]
fn test_parse_slide_xml_invalid_utf8() {
    let bad_bytes: [u8; 3] = [0xFF, 0xFE, 0xFF];
    let outcome = parse_slide_xml(&bad_bytes);

    assert!(outcome.is_err());
    // Other error variants are accepted without inspecting their message.
    if let Err(KreuzbergError::Parsing { message, .. }) = outcome {
        assert!(message.contains("Invalid UTF-8"));
    }
}
|
|
1635
|
+
|
|
1636
|
+
/// Input that is not a well-formed slide document must produce an error.
#[test]
fn test_parse_slide_xml_malformed() {
    let outcome = parse_slide_xml(b"<not valid xml>");
    assert!(outcome.is_err());
}
|
|
1642
|
+
|
|
1643
|
+
/// Two image relationships must be returned in document order with their
/// ids and targets intact.
#[test]
fn test_parse_slide_rels_with_images() {
    let rels_xml = br#"<?xml version="1.0"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image1.png"/>
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image2.jpg"/>
</Relationships>"#;
    let expected = [
        ("rId1", "../media/image1.png"),
        ("rId2", "../media/image2.jpg"),
    ];

    let images = parse_slide_rels(rels_xml).unwrap();

    assert_eq!(images.len(), 2);
    for (image, &(id, target)) in images.iter().zip(expected.iter()) {
        assert_eq!(image.id, id);
        assert_eq!(image.target, target);
    }
}
|
|
1658
|
+
|
|
1659
|
+
/// A relationships file holding only a notes-slide relationship must yield
/// no images.
#[test]
fn test_parse_slide_rels_no_images() {
    let rels_xml = br#"<?xml version="1.0"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide" Target="../notesSlides/notesSlide1.xml"/>
</Relationships>"#;

    let found = parse_slide_rels(rels_xml).unwrap();
    assert_eq!(found.len(), 0);
}
|
|
1669
|
+
|
|
1670
|
+
/// Only slide relationships must be returned (the slide-master entry is
/// dropped), each target prefixed with `ppt/`.
#[test]
fn test_parse_presentation_rels() {
    let rels_xml = br#"<?xml version="1.0"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide2.xml"/>
<Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slideMaster" Target="slideMasters/slideMaster1.xml"/>
</Relationships>"#;
    let expected_paths = ["ppt/slides/slide1.xml", "ppt/slides/slide2.xml"];

    let slide_paths = parse_presentation_rels(rels_xml).unwrap();

    assert_eq!(slide_paths.len(), 2);
    for (index, &expected) in expected_paths.iter().enumerate() {
        assert_eq!(slide_paths[index], expected);
    }
}
|
|
1684
|
+
|
|
1685
|
+
/// Text from both paragraphs of a notes slide must appear in the extracted
/// notes.
#[test]
fn test_extract_notes_text() {
    let notes_xml = br#"<?xml version="1.0"?>
<p:notes xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld>
<p:spTree>
<p:sp>
<p:txBody>
<a:p>
<a:r>
<a:t>First note</a:t>
</a:r>
</a:p>
<a:p>
<a:r>
<a:t>Second note</a:t>
</a:r>
</a:p>
</p:txBody>
</p:sp>
</p:spTree>
</p:cSld>
</p:notes>"#;

    let extracted = extract_notes_text(notes_xml).unwrap();
    for &paragraph in &["First note", "Second note"] {
        assert!(extracted.contains(paragraph));
    }
}
|
|
1714
|
+
|
|
1715
|
+
/// The default parser configuration must enable image extraction and
/// disable the per-slide comment.
#[test]
fn test_parser_config_default() {
    let defaults = ParserConfig::default();

    assert!(defaults.extract_images);
    assert!(!defaults.include_slide_comment);
}
|
|
1721
|
+
|
|
1722
|
+
fn create_pptx_with_table(rows: Vec<Vec<&str>>) -> Vec<u8> {
|
|
1723
|
+
use std::io::Write;
|
|
1724
|
+
use zip::write::{SimpleFileOptions, ZipWriter};
|
|
1725
|
+
|
|
1726
|
+
let mut buffer = Vec::new();
|
|
1727
|
+
{
|
|
1728
|
+
let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
|
|
1729
|
+
let options = SimpleFileOptions::default();
|
|
1730
|
+
|
|
1731
|
+
zip.start_file("[Content_Types].xml", options).unwrap();
|
|
1732
|
+
zip.write_all(
|
|
1733
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
1734
|
+
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
|
1735
|
+
<Default Extension="xml" ContentType="application/xml"/>
|
|
1736
|
+
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
|
1737
|
+
</Types>"#,
|
|
1738
|
+
)
|
|
1739
|
+
.unwrap();
|
|
1740
|
+
|
|
1741
|
+
zip.start_file("ppt/presentation.xml", options).unwrap();
|
|
1742
|
+
zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
|
|
1743
|
+
|
|
1744
|
+
zip.start_file("_rels/.rels", options).unwrap();
|
|
1745
|
+
zip.write_all(
|
|
1746
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
1747
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
1748
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
|
|
1749
|
+
</Relationships>"#,
|
|
1750
|
+
)
|
|
1751
|
+
.unwrap();
|
|
1752
|
+
|
|
1753
|
+
zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
|
|
1754
|
+
zip.write_all(
|
|
1755
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
1756
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
1757
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
|
|
1758
|
+
</Relationships>"#,
|
|
1759
|
+
)
|
|
1760
|
+
.unwrap();
|
|
1761
|
+
|
|
1762
|
+
let mut table_xml = String::from(
|
|
1763
|
+
r#"<a:tbl>
|
|
1764
|
+
<a:tblGrid>"#,
|
|
1765
|
+
);
|
|
1766
|
+
if !rows.is_empty() {
|
|
1767
|
+
for _ in 0..rows[0].len() {
|
|
1768
|
+
table_xml.push_str(r#"<a:gridCol w="2000000"/>"#);
|
|
1769
|
+
}
|
|
1770
|
+
}
|
|
1771
|
+
table_xml.push_str("</a:tblGrid>");
|
|
1772
|
+
|
|
1773
|
+
for row in rows {
|
|
1774
|
+
table_xml.push_str(r#"<a:tr h="370840">"#);
|
|
1775
|
+
for cell in row {
|
|
1776
|
+
table_xml.push_str(&format!(
|
|
1777
|
+
r#"<a:tc>
|
|
1778
|
+
<a:txBody>
|
|
1779
|
+
<a:p>
|
|
1780
|
+
<a:r>
|
|
1781
|
+
<a:t>{}</a:t>
|
|
1782
|
+
</a:r>
|
|
1783
|
+
</a:p>
|
|
1784
|
+
</a:txBody>
|
|
1785
|
+
</a:tc>"#,
|
|
1786
|
+
cell
|
|
1787
|
+
));
|
|
1788
|
+
}
|
|
1789
|
+
table_xml.push_str("</a:tr>");
|
|
1790
|
+
}
|
|
1791
|
+
table_xml.push_str("</a:tbl>");
|
|
1792
|
+
|
|
1793
|
+
let slide_xml = format!(
|
|
1794
|
+
r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
1795
|
+
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
|
|
1796
|
+
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
|
|
1797
|
+
<p:cSld>
|
|
1798
|
+
<p:spTree>
|
|
1799
|
+
<p:graphicFrame>
|
|
1800
|
+
<p:xfrm>
|
|
1801
|
+
<a:off x="1000000" y="2000000"/>
|
|
1802
|
+
<a:ext cx="8000000" cy="4000000"/>
|
|
1803
|
+
</p:xfrm>
|
|
1804
|
+
<a:graphic>
|
|
1805
|
+
<a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/table">
|
|
1806
|
+
{}
|
|
1807
|
+
</a:graphicData>
|
|
1808
|
+
</a:graphic>
|
|
1809
|
+
</p:graphicFrame>
|
|
1810
|
+
</p:spTree>
|
|
1811
|
+
</p:cSld>
|
|
1812
|
+
</p:sld>"#,
|
|
1813
|
+
table_xml
|
|
1814
|
+
);
|
|
1815
|
+
|
|
1816
|
+
zip.start_file("ppt/slides/slide1.xml", options).unwrap();
|
|
1817
|
+
zip.write_all(slide_xml.as_bytes()).unwrap();
|
|
1818
|
+
|
|
1819
|
+
zip.start_file("docProps/core.xml", options).unwrap();
|
|
1820
|
+
zip.write_all(
|
|
1821
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
1822
|
+
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
|
|
1823
|
+
xmlns:dc="http://purl.org/dc/elements/1.1/">
|
|
1824
|
+
<dc:title>Test Table</dc:title>
|
|
1825
|
+
</cp:coreProperties>"#,
|
|
1826
|
+
)
|
|
1827
|
+
.unwrap();
|
|
1828
|
+
|
|
1829
|
+
let _ = zip.finish().unwrap();
|
|
1830
|
+
}
|
|
1831
|
+
buffer
|
|
1832
|
+
}
|
|
1833
|
+
|
|
1834
|
+
fn create_pptx_with_lists(list_items: Vec<(usize, bool, &str)>) -> Vec<u8> {
|
|
1835
|
+
use std::io::Write;
|
|
1836
|
+
use zip::write::{SimpleFileOptions, ZipWriter};
|
|
1837
|
+
|
|
1838
|
+
let mut buffer = Vec::new();
|
|
1839
|
+
{
|
|
1840
|
+
let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
|
|
1841
|
+
let options = SimpleFileOptions::default();
|
|
1842
|
+
|
|
1843
|
+
zip.start_file("[Content_Types].xml", options).unwrap();
|
|
1844
|
+
zip.write_all(
|
|
1845
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
1846
|
+
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
|
1847
|
+
<Default Extension="xml" ContentType="application/xml"/>
|
|
1848
|
+
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
|
1849
|
+
</Types>"#,
|
|
1850
|
+
)
|
|
1851
|
+
.unwrap();
|
|
1852
|
+
|
|
1853
|
+
zip.start_file("ppt/presentation.xml", options).unwrap();
|
|
1854
|
+
zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
|
|
1855
|
+
|
|
1856
|
+
zip.start_file("_rels/.rels", options).unwrap();
|
|
1857
|
+
zip.write_all(
|
|
1858
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
1859
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
1860
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
|
|
1861
|
+
</Relationships>"#,
|
|
1862
|
+
)
|
|
1863
|
+
.unwrap();
|
|
1864
|
+
|
|
1865
|
+
zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
|
|
1866
|
+
zip.write_all(
|
|
1867
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
1868
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
1869
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
|
|
1870
|
+
</Relationships>"#,
|
|
1871
|
+
)
|
|
1872
|
+
.unwrap();
|
|
1873
|
+
|
|
1874
|
+
let mut list_xml = String::new();
|
|
1875
|
+
for (level, is_ordered, text) in list_items {
|
|
1876
|
+
let indent = (level - 1) * 457200;
|
|
1877
|
+
let lvl_attr = level - 1;
|
|
1878
|
+
let bullet_section = if is_ordered {
|
|
1879
|
+
format!(
|
|
1880
|
+
r#"<a:pPr lvl="{}"><a:buAutoNum type="arabicPeriod"/></a:pPr>"#,
|
|
1881
|
+
lvl_attr
|
|
1882
|
+
)
|
|
1883
|
+
} else {
|
|
1884
|
+
format!(
|
|
1885
|
+
r#"<a:pPr lvl="{}" marL="{}"><a:buFont typeface="Arial"/><a:buChar char="•"/></a:pPr>"#,
|
|
1886
|
+
lvl_attr, indent
|
|
1887
|
+
)
|
|
1888
|
+
};
|
|
1889
|
+
|
|
1890
|
+
list_xml.push_str(&format!(
|
|
1891
|
+
r#"<p:sp>
|
|
1892
|
+
<p:spPr>
|
|
1893
|
+
<a:xfrm>
|
|
1894
|
+
<a:off x="1000000" y="1000000"/>
|
|
1895
|
+
<a:ext cx="6000000" cy="1000000"/>
|
|
1896
|
+
</a:xfrm>
|
|
1897
|
+
</p:spPr>
|
|
1898
|
+
<p:txBody>
|
|
1899
|
+
<a:bodyPr/>
|
|
1900
|
+
<a:lstStyle/>
|
|
1901
|
+
<a:p>
|
|
1902
|
+
{}
|
|
1903
|
+
<a:r>
|
|
1904
|
+
<a:t>{}</a:t>
|
|
1905
|
+
</a:r>
|
|
1906
|
+
</a:p>
|
|
1907
|
+
</p:txBody>
|
|
1908
|
+
</p:sp>"#,
|
|
1909
|
+
bullet_section, text
|
|
1910
|
+
));
|
|
1911
|
+
}
|
|
1912
|
+
|
|
1913
|
+
let slide_xml = format!(
|
|
1914
|
+
r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
1915
|
+
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
|
|
1916
|
+
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
|
|
1917
|
+
<p:cSld>
|
|
1918
|
+
<p:spTree>
|
|
1919
|
+
{}
|
|
1920
|
+
</p:spTree>
|
|
1921
|
+
</p:cSld>
|
|
1922
|
+
</p:sld>"#,
|
|
1923
|
+
list_xml
|
|
1924
|
+
);
|
|
1925
|
+
|
|
1926
|
+
zip.start_file("ppt/slides/slide1.xml", options).unwrap();
|
|
1927
|
+
zip.write_all(slide_xml.as_bytes()).unwrap();
|
|
1928
|
+
|
|
1929
|
+
zip.start_file("docProps/core.xml", options).unwrap();
|
|
1930
|
+
zip.write_all(
|
|
1931
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
1932
|
+
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
|
|
1933
|
+
xmlns:dc="http://purl.org/dc/elements/1.1/">
|
|
1934
|
+
<dc:title>Test Lists</dc:title>
|
|
1935
|
+
</cp:coreProperties>"#,
|
|
1936
|
+
)
|
|
1937
|
+
.unwrap();
|
|
1938
|
+
|
|
1939
|
+
let _ = zip.finish().unwrap();
|
|
1940
|
+
}
|
|
1941
|
+
buffer
|
|
1942
|
+
}
|
|
1943
|
+
|
|
1944
|
+
fn create_pptx_with_images() -> Vec<u8> {
|
|
1945
|
+
use std::io::Write;
|
|
1946
|
+
use zip::write::{SimpleFileOptions, ZipWriter};
|
|
1947
|
+
|
|
1948
|
+
let mut buffer = Vec::new();
|
|
1949
|
+
{
|
|
1950
|
+
let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
|
|
1951
|
+
let options = SimpleFileOptions::default();
|
|
1952
|
+
|
|
1953
|
+
zip.start_file("[Content_Types].xml", options).unwrap();
|
|
1954
|
+
zip.write_all(
|
|
1955
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
1956
|
+
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
|
1957
|
+
<Default Extension="xml" ContentType="application/xml"/>
|
|
1958
|
+
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
|
1959
|
+
<Default Extension="png" ContentType="image/png"/>
|
|
1960
|
+
<Default Extension="jpeg" ContentType="image/jpeg"/>
|
|
1961
|
+
</Types>"#,
|
|
1962
|
+
)
|
|
1963
|
+
.unwrap();
|
|
1964
|
+
|
|
1965
|
+
zip.start_file("ppt/presentation.xml", options).unwrap();
|
|
1966
|
+
zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
|
|
1967
|
+
|
|
1968
|
+
zip.start_file("_rels/.rels", options).unwrap();
|
|
1969
|
+
zip.write_all(
|
|
1970
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
1971
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
1972
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
|
|
1973
|
+
</Relationships>"#,
|
|
1974
|
+
)
|
|
1975
|
+
.unwrap();
|
|
1976
|
+
|
|
1977
|
+
zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
|
|
1978
|
+
zip.write_all(
|
|
1979
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
1980
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
1981
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
|
|
1982
|
+
</Relationships>"#,
|
|
1983
|
+
)
|
|
1984
|
+
.unwrap();
|
|
1985
|
+
|
|
1986
|
+
zip.start_file("ppt/slides/_rels/slide1.xml.rels", options).unwrap();
|
|
1987
|
+
zip.write_all(
|
|
1988
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
1989
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
1990
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image1.png"/>
|
|
1991
|
+
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image2.jpeg"/>
|
|
1992
|
+
</Relationships>"#,
|
|
1993
|
+
)
|
|
1994
|
+
.unwrap();
|
|
1995
|
+
|
|
1996
|
+
let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
1997
|
+
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
|
|
1998
|
+
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
1999
|
+
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
|
|
2000
|
+
<p:cSld>
|
|
2001
|
+
<p:spTree>
|
|
2002
|
+
<p:pic>
|
|
2003
|
+
<p:nvPicPr>
|
|
2004
|
+
<p:cNvPr id="1" name="Image1"/>
|
|
2005
|
+
</p:nvPicPr>
|
|
2006
|
+
<p:blipFill>
|
|
2007
|
+
<a:blip r:embed="rId1"/>
|
|
2008
|
+
</p:blipFill>
|
|
2009
|
+
<p:spPr>
|
|
2010
|
+
<a:xfrm>
|
|
2011
|
+
<a:off x="1000000" y="1000000"/>
|
|
2012
|
+
<a:ext cx="2000000" cy="2000000"/>
|
|
2013
|
+
</a:xfrm>
|
|
2014
|
+
</p:spPr>
|
|
2015
|
+
</p:pic>
|
|
2016
|
+
<p:pic>
|
|
2017
|
+
<p:nvPicPr>
|
|
2018
|
+
<p:cNvPr id="2" name="Image2"/>
|
|
2019
|
+
</p:nvPicPr>
|
|
2020
|
+
<p:blipFill>
|
|
2021
|
+
<a:blip r:embed="rId2"/>
|
|
2022
|
+
</p:blipFill>
|
|
2023
|
+
<p:spPr>
|
|
2024
|
+
<a:xfrm>
|
|
2025
|
+
<a:off x="4000000" y="1000000"/>
|
|
2026
|
+
<a:ext cx="2000000" cy="2000000"/>
|
|
2027
|
+
</a:xfrm>
|
|
2028
|
+
</p:spPr>
|
|
2029
|
+
</p:pic>
|
|
2030
|
+
</p:spTree>
|
|
2031
|
+
</p:cSld>
|
|
2032
|
+
</p:sld>"#;
|
|
2033
|
+
|
|
2034
|
+
zip.start_file("ppt/slides/slide1.xml", options).unwrap();
|
|
2035
|
+
zip.write_all(slide_xml.as_bytes()).unwrap();
|
|
2036
|
+
|
|
2037
|
+
let png_bytes: Vec<u8> = vec![
|
|
2038
|
+
0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, 0x00,
|
|
2039
|
+
0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, 0x00, 0x00, 0x00, 0x90, 0x77, 0x53, 0xDE, 0x00,
|
|
2040
|
+
0x00, 0x00, 0x00, 0x49, 0x45, 0x4E, 0x44, 0xAE, 0x42, 0x60, 0x82,
|
|
2041
|
+
];
|
|
2042
|
+
zip.start_file("ppt/media/image1.png", options).unwrap();
|
|
2043
|
+
zip.write_all(&png_bytes).unwrap();
|
|
2044
|
+
|
|
2045
|
+
let jpeg_bytes: Vec<u8> = vec![
|
|
2046
|
+
0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x00,
|
|
2047
|
+
0x01, 0x00, 0x00, 0xFF, 0xD9,
|
|
2048
|
+
];
|
|
2049
|
+
zip.start_file("ppt/media/image2.jpeg", options).unwrap();
|
|
2050
|
+
zip.write_all(&jpeg_bytes).unwrap();
|
|
2051
|
+
|
|
2052
|
+
zip.start_file("docProps/core.xml", options).unwrap();
|
|
2053
|
+
zip.write_all(
|
|
2054
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
2055
|
+
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
|
|
2056
|
+
xmlns:dc="http://purl.org/dc/elements/1.1/">
|
|
2057
|
+
<dc:title>Test Images</dc:title>
|
|
2058
|
+
</cp:coreProperties>"#,
|
|
2059
|
+
)
|
|
2060
|
+
.unwrap();
|
|
2061
|
+
|
|
2062
|
+
let _ = zip.finish().unwrap();
|
|
2063
|
+
}
|
|
2064
|
+
buffer
|
|
2065
|
+
}
|
|
2066
|
+
|
|
2067
|
+
fn create_pptx_with_formatting() -> Vec<u8> {
|
|
2068
|
+
use std::io::Write;
|
|
2069
|
+
use zip::write::{SimpleFileOptions, ZipWriter};
|
|
2070
|
+
|
|
2071
|
+
let mut buffer = Vec::new();
|
|
2072
|
+
{
|
|
2073
|
+
let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
|
|
2074
|
+
let options = SimpleFileOptions::default();
|
|
2075
|
+
|
|
2076
|
+
zip.start_file("[Content_Types].xml", options).unwrap();
|
|
2077
|
+
zip.write_all(
|
|
2078
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
2079
|
+
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
|
2080
|
+
<Default Extension="xml" ContentType="application/xml"/>
|
|
2081
|
+
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
|
2082
|
+
</Types>"#,
|
|
2083
|
+
)
|
|
2084
|
+
.unwrap();
|
|
2085
|
+
|
|
2086
|
+
zip.start_file("ppt/presentation.xml", options).unwrap();
|
|
2087
|
+
zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
|
|
2088
|
+
|
|
2089
|
+
zip.start_file("_rels/.rels", options).unwrap();
|
|
2090
|
+
zip.write_all(
|
|
2091
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
2092
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
2093
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
|
|
2094
|
+
</Relationships>"#,
|
|
2095
|
+
)
|
|
2096
|
+
.unwrap();
|
|
2097
|
+
|
|
2098
|
+
zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
|
|
2099
|
+
zip.write_all(
|
|
2100
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
2101
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
2102
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
|
|
2103
|
+
</Relationships>"#,
|
|
2104
|
+
)
|
|
2105
|
+
.unwrap();
|
|
2106
|
+
|
|
2107
|
+
let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
2108
|
+
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
|
|
2109
|
+
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
|
|
2110
|
+
<p:cSld>
|
|
2111
|
+
<p:spTree>
|
|
2112
|
+
<p:sp>
|
|
2113
|
+
<p:spPr>
|
|
2114
|
+
<a:xfrm>
|
|
2115
|
+
<a:off x="1000000" y="1000000"/>
|
|
2116
|
+
<a:ext cx="6000000" cy="1000000"/>
|
|
2117
|
+
</a:xfrm>
|
|
2118
|
+
</p:spPr>
|
|
2119
|
+
<p:txBody>
|
|
2120
|
+
<a:p>
|
|
2121
|
+
<a:r>
|
|
2122
|
+
<a:rPr b="1"/>
|
|
2123
|
+
<a:t>Bold text</a:t>
|
|
2124
|
+
</a:r>
|
|
2125
|
+
</a:p>
|
|
2126
|
+
</p:txBody>
|
|
2127
|
+
</p:sp>
|
|
2128
|
+
<p:sp>
|
|
2129
|
+
<p:spPr>
|
|
2130
|
+
<a:xfrm>
|
|
2131
|
+
<a:off x="1000000" y="2000000"/>
|
|
2132
|
+
<a:ext cx="6000000" cy="1000000"/>
|
|
2133
|
+
</a:xfrm>
|
|
2134
|
+
</p:spPr>
|
|
2135
|
+
<p:txBody>
|
|
2136
|
+
<a:p>
|
|
2137
|
+
<a:r>
|
|
2138
|
+
<a:rPr i="1"/>
|
|
2139
|
+
<a:t>Italic text</a:t>
|
|
2140
|
+
</a:r>
|
|
2141
|
+
</a:p>
|
|
2142
|
+
</p:txBody>
|
|
2143
|
+
</p:sp>
|
|
2144
|
+
<p:sp>
|
|
2145
|
+
<p:spPr>
|
|
2146
|
+
<a:xfrm>
|
|
2147
|
+
<a:off x="1000000" y="3000000"/>
|
|
2148
|
+
<a:ext cx="6000000" cy="1000000"/>
|
|
2149
|
+
</a:xfrm>
|
|
2150
|
+
</p:spPr>
|
|
2151
|
+
<p:txBody>
|
|
2152
|
+
<a:p>
|
|
2153
|
+
<a:r>
|
|
2154
|
+
<a:rPr u="sng"/>
|
|
2155
|
+
<a:t>Underline text</a:t>
|
|
2156
|
+
</a:r>
|
|
2157
|
+
</a:p>
|
|
2158
|
+
</p:txBody>
|
|
2159
|
+
</p:sp>
|
|
2160
|
+
<p:sp>
|
|
2161
|
+
<p:spPr>
|
|
2162
|
+
<a:xfrm>
|
|
2163
|
+
<a:off x="1000000" y="4000000"/>
|
|
2164
|
+
<a:ext cx="6000000" cy="1000000"/>
|
|
2165
|
+
</a:xfrm>
|
|
2166
|
+
</p:spPr>
|
|
2167
|
+
<p:txBody>
|
|
2168
|
+
<a:p>
|
|
2169
|
+
<a:r>
|
|
2170
|
+
<a:rPr b="1" i="1"/>
|
|
2171
|
+
<a:t>Bold italic text</a:t>
|
|
2172
|
+
</a:r>
|
|
2173
|
+
</a:p>
|
|
2174
|
+
</p:txBody>
|
|
2175
|
+
</p:sp>
|
|
2176
|
+
</p:spTree>
|
|
2177
|
+
</p:cSld>
|
|
2178
|
+
</p:sld>"#;
|
|
2179
|
+
|
|
2180
|
+
zip.start_file("ppt/slides/slide1.xml", options).unwrap();
|
|
2181
|
+
zip.write_all(slide_xml.as_bytes()).unwrap();
|
|
2182
|
+
|
|
2183
|
+
zip.start_file("docProps/core.xml", options).unwrap();
|
|
2184
|
+
zip.write_all(
|
|
2185
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
2186
|
+
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
|
|
2187
|
+
xmlns:dc="http://purl.org/dc/elements/1.1/">
|
|
2188
|
+
<dc:title>Test Formatting</dc:title>
|
|
2189
|
+
</cp:coreProperties>"#,
|
|
2190
|
+
)
|
|
2191
|
+
.unwrap();
|
|
2192
|
+
|
|
2193
|
+
let _ = zip.finish().unwrap();
|
|
2194
|
+
}
|
|
2195
|
+
buffer
|
|
2196
|
+
}
|
|
2197
|
+
|
|
2198
|
+
/// Checks that a three-row table is reported once and rendered as an HTML
/// table: first-row cells as `<th>`, later-row cells as `<td>`.
#[test]
fn test_table_extraction_with_headers_succeeds() {
    let rows = vec![
        vec!["Header 1", "Header 2", "Header 3"],
        vec!["Data 1", "Data 2", "Data 3"],
        vec!["Row 2 Col 1", "Row 2 Col 2", "Row 2 Col 3"],
    ];
    let archive = create_pptx_with_table(rows);

    let extraction = extract_pptx_from_bytes(&archive, false).unwrap();

    assert_eq!(extraction.table_count, 1, "Should detect one table");

    // Each fragment must appear in the rendered content; checked in order.
    let expected_fragments = [
        ("<table>", "Should contain table tag"),
        ("<th>Header 1</th>", "Should render first header"),
        ("<th>Header 2</th>", "Should render second header"),
        ("<th>Header 3</th>", "Should render third header"),
        ("<td>Data 1</td>", "Should render data cell"),
        ("<td>Row 2 Col 2</td>", "Should render second row data"),
    ];
    for (fragment, why) in expected_fragments {
        assert!(extraction.content.contains(fragment), "{}", why);
    }
}
|
|
2228
|
+
|
|
2229
|
+
/// Checks that a 4x4 table is reported once and yields exactly four `<tr>`
/// rows containing the first and last cell values.
#[test]
fn test_table_extraction_multirow_multicolumn_succeeds() {
    let grid = vec![
        vec!["A1", "B1", "C1", "D1"],
        vec!["A2", "B2", "C2", "D2"],
        vec!["A3", "B3", "C3", "D3"],
        vec!["A4", "B4", "C4", "D4"],
    ];
    let archive = create_pptx_with_table(grid);

    let extraction = extract_pptx_from_bytes(&archive, false).unwrap();

    assert_eq!(extraction.table_count, 1, "Should detect one table");

    let expected_fragments = [
        ("<tr>", "Should contain table rows"),
        ("A1", "Should contain first row data"),
        ("D4", "Should contain last row data"),
    ];
    for (fragment, why) in expected_fragments {
        assert!(extraction.content.contains(fragment), "{}", why);
    }

    // One opening row tag per input row.
    let row_tags = extraction.content.matches("<tr>").count();
    assert_eq!(row_tags, 4, "Should have 4 table rows");
}
|
|
2248
|
+
|
|
2249
|
+
/// Checks that a slide with a single 2x2 table reports `table_count == 1`.
#[test]
fn test_table_counting_via_slide_metadata_succeeds() {
    let rows = vec![vec!["Col1", "Col2"], vec!["Val1", "Val2"]];
    let archive = create_pptx_with_table(rows);

    let extraction = extract_pptx_from_bytes(&archive, false).unwrap();

    assert_eq!(extraction.table_count, 1, "table_count should be 1");
}
|
|
2257
|
+
|
|
2258
|
+
/// Checks that header and data cell text of a small table appear verbatim
/// inside `<th>` / `<td>` tags in the rendered content.
#[test]
fn test_table_markdown_rendering_with_special_chars() {
    let rows = vec![
        vec!["Header with ampersand", "Header 2"],
        vec!["Cell data 1", "Cell data 2"],
    ];
    let archive = create_pptx_with_table(rows);

    let extraction = extract_pptx_from_bytes(&archive, false).unwrap();

    let expected_fragments = [
        ("<table>", "Should contain table tag"),
        ("<th>Header with ampersand</th>", "Should contain header text"),
        ("<td>Cell data 1</td>", "Should contain cell data"),
    ];
    for (fragment, why) in expected_fragments {
        assert!(extraction.content.contains(fragment), "{}", why);
    }
}
|
|
2277
|
+
|
|
2278
|
+
/// Checks that a table built from zero rows is still counted as one table
/// while producing no `<td>` cells in the content.
#[test]
fn test_table_extraction_empty_table_returns_one_count() {
    let archive = create_pptx_with_table(vec![]);
    let extraction = extract_pptx_from_bytes(&archive, false).unwrap();

    assert_eq!(extraction.table_count, 1, "Empty table structure should be detected");

    let has_cells = extraction.content.contains("<td>");
    assert!(!has_cells, "Empty table should have no cells");
}
|
|
2286
|
+
|
|
2287
|
+
/// Checks that three level-1 ordered items are each rendered with a `1.`
/// prefix in the extracted content.
#[test]
fn test_list_extraction_ordered_list_succeeds() {
    let items = vec![
        (1, true, "First item"),
        (1, true, "Second item"),
        (1, true, "Third item"),
    ];
    let archive = create_pptx_with_lists(items);

    let extraction = extract_pptx_from_bytes(&archive, false).unwrap();

    let expected_lines = [
        ("1. First item", "Should contain ordered list item 1"),
        ("1. Second item", "Should contain ordered list item 2"),
        ("1. Third item", "Should contain ordered list item 3"),
    ];
    for (line, why) in expected_lines {
        assert!(extraction.content.contains(line), "{}", why);
    }
}
|
|
2310
|
+
|
|
2311
|
+
/// Checks that three level-1 bullet items are each rendered with a `- `
/// prefix in the extracted content.
#[test]
fn test_list_extraction_unordered_list_succeeds() {
    let items = vec![
        (1, false, "Bullet one"),
        (1, false, "Bullet two"),
        (1, false, "Bullet three"),
    ];
    let archive = create_pptx_with_lists(items);

    let extraction = extract_pptx_from_bytes(&archive, false).unwrap();

    let expected_lines = [
        ("- Bullet one", "Should contain bullet point 1"),
        ("- Bullet two", "Should contain bullet point 2"),
        ("- Bullet three", "Should contain bullet point 3"),
    ];
    for (line, why) in expected_lines {
        assert!(extraction.content.contains(line), "{}", why);
    }
}
|
|
2328
|
+
|
|
2329
|
+
/// Checks that nested bullet items are indented according to their level.
///
/// The assertion messages state the expected layout: level 1 has no indent,
/// level 2 a 2-space indent and level 3 a 4-space indent. The needles below
/// carry exactly that many leading spaces so each nesting depth is actually
/// distinguished; a single-space needle would match level 2 and level 3 alike.
#[test]
fn test_list_extraction_nested_lists_with_indentation_succeeds() {
    let pptx_bytes = create_pptx_with_lists(vec![
        (1, false, "Level 1 Item"),
        (2, false, "Level 2 Item"),
        (3, false, "Level 3 Item"),
        (2, false, "Back to Level 2"),
        (1, false, "Back to Level 1"),
    ]);

    let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();

    assert!(
        result.content.contains("- Level 1 Item"),
        "Should have level 1 with no indent"
    );
    assert!(
        result.content.contains("  - Level 2 Item"),
        "Should have level 2 with 2-space indent"
    );
    assert!(
        result.content.contains("    - Level 3 Item"),
        "Should have level 3 with 4-space indent"
    );
    assert!(
        result.content.contains("  - Back to Level 2"),
        "Should return to level 2 indent"
    );
    assert!(result.content.contains("- Back to Level 1"), "Should return to level 1");
}
|
|
2359
|
+
|
|
2360
|
+
/// Checks that ordered and unordered items interleaved at level 1 keep their
/// own prefixes (`1.` for ordered, `-` for unordered).
#[test]
fn test_list_extraction_mixed_ordered_unordered_succeeds() {
    let items = vec![
        (1, true, "Ordered item 1"),
        (1, false, "Unordered item 1"),
        (1, true, "Ordered item 2"),
    ];
    let archive = create_pptx_with_lists(items);

    let extraction = extract_pptx_from_bytes(&archive, false).unwrap();

    let expected_lines = [
        ("1. Ordered item 1", "Should render ordered list"),
        ("- Unordered item 1", "Should render unordered list"),
        ("1. Ordered item 2", "Should render ordered list again"),
    ];
    for (line, why) in expected_lines {
        assert!(extraction.content.contains(line), "{}", why);
    }
}
|
|
2383
|
+
|
|
2384
|
+
/// Checks that, with the second argument set to `true`, the two pictures on
/// the fixture slide are counted and image data is returned.
#[test]
fn test_image_extraction_from_slide_xml_succeeds() {
    let archive = create_pptx_with_images();
    let extraction = extract_pptx_from_bytes(&archive, true).unwrap();

    assert_eq!(extraction.image_count, 2, "Should detect 2 images");

    let nothing_extracted = extraction.images.is_empty();
    assert!(!nothing_extracted, "Should extract image data");
}
|
|
2392
|
+
|
|
2393
|
+
/// Checks that both fixture images are loaded and that each one carries a
/// non-empty data payload.
#[test]
fn test_image_data_loading_from_zip_archive_succeeds() {
    let archive = create_pptx_with_images();
    let extraction = extract_pptx_from_bytes(&archive, true).unwrap();

    assert_eq!(extraction.images.len(), 2, "Should load 2 images");

    let mut position = 0;
    for image in extraction.images.iter() {
        assert!(
            !image.data.is_empty(),
            "Image {} should have non-empty data",
            position
        );
        position += 1;
    }
}
|
|
2404
|
+
|
|
2405
|
+
/// Checks that the two extracted images report the formats `png` and `jpeg`.
#[test]
fn test_image_format_detection_succeeds() {
    let archive = create_pptx_with_images();
    let extraction = extract_pptx_from_bytes(&archive, true).unwrap();

    assert_eq!(extraction.images.len(), 2, "Should have 2 images");

    // True when at least one extracted image reports the given format name.
    let has_format = |wanted: &str| {
        extraction
            .images
            .iter()
            .any(|image| image.format.as_str() == wanted)
    };

    assert!(has_format("png"), "Should detect PNG format");
    assert!(has_format("jpeg"), "Should detect JPEG format");
}
|
|
2417
|
+
|
|
2418
|
+
/// Checks that `image_count` and the length of `images` both equal two for
/// the two-picture fixture.
#[test]
fn test_image_counting_via_result_metadata_succeeds() {
    let archive = create_pptx_with_images();
    let extraction = extract_pptx_from_bytes(&archive, true).unwrap();

    let reported = extraction.image_count;
    let extracted = extraction.images.len();

    assert_eq!(reported, 2, "image_count should match actual images");
    assert_eq!(extracted, 2, "images vector should have 2 elements");
}
|
|
2426
|
+
|
|
2427
|
+
/// Checks that passing `false` as the second argument still counts the two
/// pictures but returns no image data.
#[test]
fn test_image_extraction_disabled_returns_zero_images() {
    let archive = create_pptx_with_images();
    let extraction = extract_pptx_from_bytes(&archive, false).unwrap();

    let reported = extraction.image_count;
    assert_eq!(reported, 2, "Should still count images even when not extracted");

    let extracted = extraction.images.len();
    assert_eq!(extracted, 0, "Should not extract image data when disabled");
}
|
|
2438
|
+
|
|
2439
|
+
/// Checks that one slide holding two pictures yields two images whose
/// `image_index` values are `0` and `1`, in that order.
#[test]
fn test_multiple_images_per_slide_extraction_succeeds() {
    let archive = create_pptx_with_images();
    let extraction = extract_pptx_from_bytes(&archive, true).unwrap();

    assert_eq!(extraction.slide_count, 1, "Should have 1 slide");
    assert_eq!(extraction.image_count, 2, "Single slide should contain 2 images");

    let mut positions: Vec<usize> = Vec::new();
    for image in extraction.images.iter() {
        positions.push(image.image_index);
    }

    assert_eq!(positions.len(), 2, "Should have 2 images with indices");
    assert_eq!(positions, vec![0, 1], "Should have sequential image indices");
}
|
|
2451
|
+
|
|
2452
|
+
/// Checks that the bold run of the formatting fixture is emitted with a
/// leading `**` marker.
#[test]
fn test_formatting_bold_text_renders_as_markdown_bold() {
    let archive = create_pptx_with_formatting();
    let extraction = extract_pptx_from_bytes(&archive, false).unwrap();

    let has_bold_marker = extraction.content.contains("**Bold text");
    assert!(has_bold_marker, "Should render bold text with ** markers");
}
|
|
2462
|
+
|
|
2463
|
+
/// Checks that the italic run of the formatting fixture is emitted with a
/// leading `*` marker.
#[test]
fn test_formatting_italic_text_renders_as_markdown_italic() {
    let archive = create_pptx_with_formatting();
    let extraction = extract_pptx_from_bytes(&archive, false).unwrap();

    let has_italic_marker = extraction.content.contains("*Italic text");
    assert!(has_italic_marker, "Should render italic text with * markers");
}
|
|
2473
|
+
|
|
2474
|
+
/// Checks that the underlined run of the formatting fixture is emitted with
/// an opening `<u>` tag.
#[test]
fn test_formatting_underline_text_renders_as_html_underline() {
    let archive = create_pptx_with_formatting();
    let extraction = extract_pptx_from_bytes(&archive, false).unwrap();

    let has_underline_tag = extraction.content.contains("<u>Underline text");
    assert!(has_underline_tag, "Should render underline with HTML tags");
}
|
|
2484
|
+
|
|
2485
|
+
/// Checks that the bold+italic run of the formatting fixture is emitted with
/// a leading `***` marker.
#[test]
fn test_formatting_combined_bold_italic_renders_correctly() {
    let archive = create_pptx_with_formatting();
    let extraction = extract_pptx_from_bytes(&archive, false).unwrap();

    let has_combined_marker = extraction.content.contains("***Bold italic text");
    assert!(has_combined_marker, "Should render bold+italic with *** markers");
}
|
|
2495
|
+
|
|
2496
|
+
/// Checks that a run with only `underlined` set renders as its text wrapped
/// in `<u>…</u>`.
#[test]
fn test_run_render_underline_formatting() {
    let underline_only = Formatting {
        underlined: true,
        ..Default::default()
    };
    let run = Run {
        text: String::from("underlined"),
        formatting: underline_only,
    };

    let rendered = run.render_as_md();

    assert_eq!(
        rendered, "<u>underlined</u>",
        "Should wrap underlined text in <u> tags"
    );
}
|
|
2511
|
+
|
|
2512
|
+
/// Checks that a run with bold, italic and underline all set renders with
/// `***` markers, a `<u>` tag and the original text.
#[test]
fn test_run_render_all_formatting_combined() {
    let everything = Formatting {
        bold: true,
        italic: true,
        underlined: true,
        ..Default::default()
    };
    let run = Run {
        text: String::from("all formats"),
        formatting: everything,
    };

    let rendered = run.render_as_md();

    let expected_fragments = [
        ("***", "Should have bold+italic markers"),
        ("<u>", "Should have underline tags"),
        ("all formats", "Should contain original text"),
    ];
    for (fragment, why) in expected_fragments {
        assert!(rendered.contains(fragment), "{}", why);
    }
}
|
|
2528
|
+
|
|
2529
|
+
/// End-to-end check on a single slide mixing a bold title, a bullet item, a
/// 2x2 table and one PNG picture.
///
/// The archive is assembled in memory, extracted with the second argument set
/// to `true`, and the test then verifies the rendered content fragments plus
/// the slide, table and image counters.
#[test]
fn test_integration_complete_pptx_with_mixed_content_succeeds() {
    use std::io::Write;
    use zip::write::{SimpleFileOptions, ZipWriter};

    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
<p:cSld>
<p:spTree>
<p:sp>
<p:txBody>
<a:p>
<a:r>
<a:rPr b="1"/>
<a:t>Title with Bold</a:t>
</a:r>
</a:p>
</p:txBody>
<p:spPr>
<a:xfrm>
<a:off x="1000000" y="500000"/>
</a:xfrm>
</p:spPr>
</p:sp>
<p:sp>
<p:txBody>
<a:p>
<a:pPr lvl="0"><a:buChar char="•"/></a:pPr>
<a:r>
<a:t>List item one</a:t>
</a:r>
</a:p>
</p:txBody>
<p:spPr>
<a:xfrm>
<a:off x="1000000" y="1500000"/>
</a:xfrm>
</p:spPr>
</p:sp>
<p:graphicFrame>
<p:xfrm>
<a:off x="1000000" y="2500000"/>
<a:ext cx="4000000" cy="2000000"/>
</p:xfrm>
<a:graphic>
<a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/table">
<a:tbl>
<a:tblGrid>
<a:gridCol w="2000000"/>
<a:gridCol w="2000000"/>
</a:tblGrid>
<a:tr h="370840">
<a:tc>
<a:txBody>
<a:p>
<a:r>
<a:t>Header A</a:t>
</a:r>
</a:p>
</a:txBody>
</a:tc>
<a:tc>
<a:txBody>
<a:p>
<a:r>
<a:t>Header B</a:t>
</a:r>
</a:p>
</a:txBody>
</a:tc>
</a:tr>
<a:tr h="370840">
<a:tc>
<a:txBody>
<a:p>
<a:r>
<a:t>Data 1</a:t>
</a:r>
</a:p>
</a:txBody>
</a:tc>
<a:tc>
<a:txBody>
<a:p>
<a:r>
<a:t>Data 2</a:t>
</a:r>
</a:p>
</a:txBody>
</a:tc>
</a:tr>
</a:tbl>
</a:graphicData>
</a:graphic>
</p:graphicFrame>
<p:pic>
<p:nvPicPr>
<p:cNvPr id="1" name="TestImage"/>
</p:nvPicPr>
<p:blipFill>
<a:blip r:embed="rId1"/>
</p:blipFill>
<p:spPr>
<a:xfrm>
<a:off x="6000000" y="1000000"/>
<a:ext cx="2000000" cy="2000000"/>
</a:xfrm>
</p:spPr>
</p:pic>
</p:spTree>
</p:cSld>
</p:sld>"#;

    // Payload stored as ppt/media/image1.png.
    let png_bytes: &[u8] = &[
        0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, 0x00, 0x00, 0x00, 0x0D, 0x49, 0x48, 0x44, 0x52, 0x00,
        0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x02, 0x00, 0x00, 0x00, 0x90, 0x77, 0x53, 0xDE, 0x00,
        0x00, 0x00, 0x00, 0x49, 0x45, 0x4E, 0x44, 0xAE, 0x42, 0x60, 0x82,
    ];

    let mut archive_bytes = Vec::new();
    {
        let mut writer = ZipWriter::new(std::io::Cursor::new(&mut archive_bytes));
        let entry_options = SimpleFileOptions::default();

        // Starts a new archive entry and writes its full payload in one go.
        let mut add_part = |path: &str, payload: &[u8]| {
            writer.start_file(path, entry_options).unwrap();
            writer.write_all(payload).unwrap();
        };

        add_part(
            "[Content_Types].xml",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="xml" ContentType="application/xml"/>
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="png" ContentType="image/png"/>
</Types>"#,
        );

        add_part("ppt/presentation.xml", b"<?xml version=\"1.0\"?><presentation/>");

        add_part(
            "_rels/.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#,
        );

        add_part(
            "ppt/_rels/presentation.xml.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#,
        );

        add_part(
            "ppt/slides/_rels/slide1.xml.rels",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image1.png"/>
</Relationships>"#,
        );

        add_part("ppt/slides/slide1.xml", slide_xml.as_bytes());

        add_part("ppt/media/image1.png", png_bytes);

        add_part(
            "docProps/core.xml",
            br#"<?xml version="1.0" encoding="UTF-8"?>
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title>Integration Test</dc:title>
</cp:coreProperties>"#,
        );

        let _ = writer.finish().unwrap();
    }

    let extraction = extract_pptx_from_bytes(&archive_bytes, true).unwrap();

    // Rendered content must include every fragment; checked in order.
    let expected_fragments = [
        ("**Title with Bold", "Should contain formatted title"),
        ("- List item one", "Should contain list item"),
        ("<table>", "Should contain table"),
        ("Header A", "Should contain table header"),
        ("Data 1", "Should contain table data"),
    ];
    for (fragment, why) in expected_fragments {
        assert!(extraction.content.contains(fragment), "{}", why);
    }

    assert_eq!(extraction.slide_count, 1, "Should have 1 slide");
    assert_eq!(extraction.table_count, 1, "Should detect 1 table");
    assert_eq!(extraction.image_count, 1, "Should detect 1 image");
    assert_eq!(extraction.images.len(), 1, "Should extract 1 image");
}
|
|
2730
|
+
|
|
2731
|
+
#[test]
|
|
2732
|
+
fn test_integration_position_based_sorting_orders_elements_correctly() {
|
|
2733
|
+
use std::io::Write;
|
|
2734
|
+
use zip::write::{SimpleFileOptions, ZipWriter};
|
|
2735
|
+
|
|
2736
|
+
let mut buffer = Vec::new();
|
|
2737
|
+
{
|
|
2738
|
+
let mut zip = ZipWriter::new(std::io::Cursor::new(&mut buffer));
|
|
2739
|
+
let options = SimpleFileOptions::default();
|
|
2740
|
+
|
|
2741
|
+
zip.start_file("[Content_Types].xml", options).unwrap();
|
|
2742
|
+
zip.write_all(
|
|
2743
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
2744
|
+
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
|
2745
|
+
<Default Extension="xml" ContentType="application/xml"/>
|
|
2746
|
+
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
|
2747
|
+
</Types>"#,
|
|
2748
|
+
)
|
|
2749
|
+
.unwrap();
|
|
2750
|
+
|
|
2751
|
+
zip.start_file("ppt/presentation.xml", options).unwrap();
|
|
2752
|
+
zip.write_all(b"<?xml version=\"1.0\"?><presentation/>").unwrap();
|
|
2753
|
+
|
|
2754
|
+
zip.start_file("_rels/.rels", options).unwrap();
|
|
2755
|
+
zip.write_all(
|
|
2756
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
2757
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
2758
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
|
|
2759
|
+
</Relationships>"#,
|
|
2760
|
+
)
|
|
2761
|
+
.unwrap();
|
|
2762
|
+
|
|
2763
|
+
zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
|
|
2764
|
+
zip.write_all(
|
|
2765
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
2766
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
2767
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
|
|
2768
|
+
</Relationships>"#,
|
|
2769
|
+
)
|
|
2770
|
+
.unwrap();
|
|
2771
|
+
|
|
2772
|
+
let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
|
|
2773
|
+
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
|
|
2774
|
+
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
|
|
2775
|
+
<p:cSld>
|
|
2776
|
+
<p:spTree>
|
|
2777
|
+
<p:sp>
|
|
2778
|
+
<p:txBody>
|
|
2779
|
+
<a:p>
|
|
2780
|
+
<a:r>
|
|
2781
|
+
<a:t>Bottom Right</a:t>
|
|
2782
|
+
</a:r>
|
|
2783
|
+
</a:p>
|
|
2784
|
+
</p:txBody>
|
|
2785
|
+
<p:spPr>
|
|
2786
|
+
<a:xfrm>
|
|
2787
|
+
<a:off x="5000000" y="3000000"/>
|
|
2788
|
+
</a:xfrm>
|
|
2789
|
+
</p:spPr>
|
|
2790
|
+
</p:sp>
|
|
2791
|
+
<p:sp>
|
|
2792
|
+
<p:txBody>
|
|
2793
|
+
<a:p>
|
|
2794
|
+
<a:r>
|
|
2795
|
+
<a:t>Top Left</a:t>
|
|
2796
|
+
</a:r>
|
|
2797
|
+
</a:p>
|
|
2798
|
+
</p:txBody>
|
|
2799
|
+
<p:spPr>
|
|
2800
|
+
<a:xfrm>
|
|
2801
|
+
<a:off x="1000000" y="1000000"/>
|
|
2802
|
+
</a:xfrm>
|
|
2803
|
+
</p:spPr>
|
|
2804
|
+
</p:sp>
|
|
2805
|
+
<p:sp>
|
|
2806
|
+
<p:txBody>
|
|
2807
|
+
<a:p>
|
|
2808
|
+
<a:r>
|
|
2809
|
+
<a:t>Top Right</a:t>
|
|
2810
|
+
</a:r>
|
|
2811
|
+
</a:p>
|
|
2812
|
+
</p:txBody>
|
|
2813
|
+
<p:spPr>
|
|
2814
|
+
<a:xfrm>
|
|
2815
|
+
<a:off x="5000000" y="1000000"/>
|
|
2816
|
+
</a:xfrm>
|
|
2817
|
+
</p:spPr>
|
|
2818
|
+
</p:sp>
|
|
2819
|
+
<p:sp>
|
|
2820
|
+
<p:txBody>
|
|
2821
|
+
<a:p>
|
|
2822
|
+
<a:r>
|
|
2823
|
+
<a:t>Bottom Left</a:t>
|
|
2824
|
+
</a:r>
|
|
2825
|
+
</a:p>
|
|
2826
|
+
</p:txBody>
|
|
2827
|
+
<p:spPr>
|
|
2828
|
+
<a:xfrm>
|
|
2829
|
+
<a:off x="1000000" y="3000000"/>
|
|
2830
|
+
</a:xfrm>
|
|
2831
|
+
</p:spPr>
|
|
2832
|
+
</p:sp>
|
|
2833
|
+
</p:spTree>
|
|
2834
|
+
</p:cSld>
|
|
2835
|
+
</p:sld>"#;
|
|
2836
|
+
|
|
2837
|
+
zip.start_file("ppt/slides/slide1.xml", options).unwrap();
|
|
2838
|
+
zip.write_all(slide_xml.as_bytes()).unwrap();
|
|
2839
|
+
|
|
2840
|
+
zip.start_file("docProps/core.xml", options).unwrap();
|
|
2841
|
+
zip.write_all(
|
|
2842
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
2843
|
+
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
|
|
2844
|
+
xmlns:dc="http://purl.org/dc/elements/1.1/">
|
|
2845
|
+
<dc:title>Position Test</dc:title>
|
|
2846
|
+
</cp:coreProperties>"#,
|
|
2847
|
+
)
|
|
2848
|
+
.unwrap();
|
|
2849
|
+
|
|
2850
|
+
let _ = zip.finish().unwrap();
|
|
2851
|
+
}
|
|
2852
|
+
|
|
2853
|
+
let result = extract_pptx_from_bytes(&buffer, false).unwrap();
|
|
2854
|
+
|
|
2855
|
+
let content = result.content;
|
|
2856
|
+
let top_left_pos = content.find("Top Left").unwrap();
|
|
2857
|
+
let top_right_pos = content.find("Top Right").unwrap();
|
|
2858
|
+
let bottom_left_pos = content.find("Bottom Left").unwrap();
|
|
2859
|
+
let bottom_right_pos = content.find("Bottom Right").unwrap();
|
|
2860
|
+
|
|
2861
|
+
assert!(
|
|
2862
|
+
top_left_pos < top_right_pos,
|
|
2863
|
+
"Top Left should appear before Top Right (same Y, lower X)"
|
|
2864
|
+
);
|
|
2865
|
+
assert!(
|
|
2866
|
+
top_right_pos < bottom_left_pos,
|
|
2867
|
+
"Top row should appear before bottom row"
|
|
2868
|
+
);
|
|
2869
|
+
assert!(
|
|
2870
|
+
bottom_left_pos < bottom_right_pos,
|
|
2871
|
+
"Bottom Left should appear before Bottom Right (same Y, lower X)"
|
|
2872
|
+
);
|
|
2873
|
+
}
|
|
2874
|
+
|
|
2875
|
+
/// Checks that speaker notes attached to a slide show up in the extraction.
///
/// An in-memory PPTX archive is assembled with one slide ("Slide Content"),
/// a slide relationship file pointing at `../notesSlides/notesSlide1.xml`,
/// and that notes part carrying a single text run. The extracted content
/// must contain the slide text, a `### Notes:` header and the note text.
#[test]
fn test_integration_slide_notes_extraction_succeeds() {
    use std::io::Write;
    use zip::write::{SimpleFileOptions, ZipWriter};

    // Raw-string lines are kept flush-left so that no source indentation
    // ends up inside the XML payloads.
    let content_types_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="xml" ContentType="application/xml"/>
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
</Types>"#;

    let presentation_xml = r#"<?xml version="1.0"?><presentation/>"#;

    let root_rels_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
</Relationships>"#;

    let presentation_rels_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
</Relationships>"#;

    // Links slide1 to its notes part.
    let slide_rels_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide" Target="../notesSlides/notesSlide1.xml"/>
</Relationships>"#;

    let slide_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld>
<p:spTree>
<p:sp>
<p:txBody>
<a:p>
<a:r>
<a:t>Slide Content</a:t>
</a:r>
</a:p>
</p:txBody>
</p:sp>
</p:spTree>
</p:cSld>
</p:sld>"#;

    let notes_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<p:notes xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
<p:cSld>
<p:spTree>
<p:sp>
<p:txBody>
<a:p>
<a:r>
<a:t>This is a speaker note for testing</a:t>
</a:r>
</a:p>
</p:txBody>
</p:sp>
</p:spTree>
</p:cSld>
</p:notes>"#;

    let core_props_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title>Notes Test</dc:title>
</cp:coreProperties>"#;

    // Archive members as (path, body), listed in the order they are written.
    let parts: [(&str, &str); 8] = [
        ("[Content_Types].xml", content_types_xml),
        ("ppt/presentation.xml", presentation_xml),
        ("_rels/.rels", root_rels_xml),
        ("ppt/_rels/presentation.xml.rels", presentation_rels_xml),
        ("ppt/slides/_rels/slide1.xml.rels", slide_rels_xml),
        ("ppt/slides/slide1.xml", slide_xml),
        ("ppt/notesSlides/notesSlide1.xml", notes_xml),
        ("docProps/core.xml", core_props_xml),
    ];

    let file_options = SimpleFileOptions::default();
    let mut writer = ZipWriter::new(std::io::Cursor::new(Vec::new()));
    for &(path, body) in &parts {
        writer.start_file(path, file_options).unwrap();
        writer.write_all(body.as_bytes()).unwrap();
    }
    // Finishing hands back the cursor; unwrap it to get the archive bytes.
    let archive = writer.finish().unwrap().into_inner();

    let extracted = extract_pptx_from_bytes(&archive, false).unwrap();

    // Each pair is (substring that must be present, failure message).
    let expectations = [
        ("Slide Content", "Should contain slide content"),
        ("### Notes:", "Should contain notes header"),
        ("This is a speaker note for testing", "Should extract speaker notes"),
    ];
    for &(needle, failure) in &expectations {
        assert!(extracted.content.contains(needle), "{}", failure);
    }
}
|
|
2989
|
+
|
|
2990
|
+
/// Checks the metadata fields reported for a generated test presentation.
///
/// The archive comes from `create_test_pptx_bytes`; the extracted `title`,
/// `author`, `description` and `summary` must equal the fixed strings
/// asserted below.
#[test]
fn test_integration_metadata_extraction_complete() {
    let archive = create_test_pptx_bytes(vec!["Content"]);
    let metadata = extract_pptx_from_bytes(&archive, false).unwrap().metadata;

    // Builds the owned `Some(..)` value each field is compared against.
    let expected = |text: &str| Some(text.to_string());

    assert_eq!(metadata.title, expected("Test Presentation"));
    assert_eq!(metadata.author, expected("Test Author"));
    assert_eq!(metadata.description, expected("Test Description"));
    assert_eq!(metadata.summary, expected("Test Subject"));
}
|
|
3000
|
+
}
|