kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -29,7 +29,7 @@
|
|
|
29
29
|
//! use kreuzberg::extraction::pptx::extract_pptx_from_path;
|
|
30
30
|
//!
|
|
31
31
|
//! # fn example() -> kreuzberg::Result<()> {
|
|
32
|
-
//! let result = extract_pptx_from_path("presentation.pptx", true
|
|
32
|
+
//! let result = extract_pptx_from_path("presentation.pptx", true)?;
|
|
33
33
|
//!
|
|
34
34
|
//! println!("Slide count: {}", result.slide_count);
|
|
35
35
|
//! println!("Image count: {}", result.image_count);
|
|
@@ -38,7 +38,6 @@
|
|
|
38
38
|
//! # }
|
|
39
39
|
//! ```
|
|
40
40
|
use crate::error::{KreuzbergError, Result};
|
|
41
|
-
use crate::text::utf8_validation;
|
|
42
41
|
use crate::types::{ExtractedImage, PptxExtractionResult, PptxMetadata};
|
|
43
42
|
use std::collections::HashMap;
|
|
44
43
|
use std::fs::File;
|
|
@@ -182,68 +181,18 @@ impl Default for ParserConfig {
|
|
|
182
181
|
|
|
183
182
|
struct ContentBuilder {
|
|
184
183
|
content: String,
|
|
185
|
-
boundaries: Vec<crate::types::PageBoundary>,
|
|
186
|
-
page_contents: Vec<crate::types::PageContent>,
|
|
187
|
-
config: Option<crate::core::config::PageConfig>,
|
|
188
184
|
}
|
|
189
185
|
|
|
190
186
|
impl ContentBuilder {
|
|
191
187
|
fn new() -> Self {
|
|
192
188
|
Self {
|
|
193
189
|
content: String::with_capacity(8192),
|
|
194
|
-
boundaries: Vec::new(),
|
|
195
|
-
page_contents: Vec::new(),
|
|
196
|
-
config: None,
|
|
197
190
|
}
|
|
198
191
|
}
|
|
199
192
|
|
|
200
|
-
fn
|
|
193
|
+
fn with_capacity(capacity: usize) -> Self {
|
|
201
194
|
Self {
|
|
202
195
|
content: String::with_capacity(capacity),
|
|
203
|
-
boundaries: if config.is_some() {
|
|
204
|
-
Vec::new()
|
|
205
|
-
} else {
|
|
206
|
-
Vec::with_capacity(0)
|
|
207
|
-
},
|
|
208
|
-
page_contents: if config.is_some() {
|
|
209
|
-
Vec::new()
|
|
210
|
-
} else {
|
|
211
|
-
Vec::with_capacity(0)
|
|
212
|
-
},
|
|
213
|
-
config,
|
|
214
|
-
}
|
|
215
|
-
}
|
|
216
|
-
|
|
217
|
-
fn start_slide(&mut self, slide_number: u32) -> usize {
|
|
218
|
-
let byte_start = self.content.len();
|
|
219
|
-
|
|
220
|
-
if let Some(ref cfg) = self.config
|
|
221
|
-
&& cfg.insert_page_markers
|
|
222
|
-
{
|
|
223
|
-
let marker = cfg.marker_format.replace("{page_num}", &slide_number.to_string());
|
|
224
|
-
self.content.push_str(&marker);
|
|
225
|
-
}
|
|
226
|
-
|
|
227
|
-
byte_start
|
|
228
|
-
}
|
|
229
|
-
|
|
230
|
-
fn end_slide(&mut self, slide_number: u32, byte_start: usize, slide_content: String) {
|
|
231
|
-
let byte_end = self.content.len();
|
|
232
|
-
|
|
233
|
-
if self.config.is_some() {
|
|
234
|
-
self.boundaries.push(crate::types::PageBoundary {
|
|
235
|
-
byte_start,
|
|
236
|
-
byte_end,
|
|
237
|
-
page_number: slide_number as usize,
|
|
238
|
-
});
|
|
239
|
-
|
|
240
|
-
self.page_contents.push(crate::types::PageContent {
|
|
241
|
-
page_number: slide_number as usize,
|
|
242
|
-
content: slide_content,
|
|
243
|
-
tables: Vec::new(),
|
|
244
|
-
images: Vec::new(),
|
|
245
|
-
hierarchy: None,
|
|
246
|
-
});
|
|
247
196
|
}
|
|
248
197
|
}
|
|
249
198
|
|
|
@@ -322,25 +271,8 @@ impl ContentBuilder {
|
|
|
322
271
|
}
|
|
323
272
|
}
|
|
324
273
|
|
|
325
|
-
fn build(
|
|
326
|
-
self
|
|
327
|
-
) -> (
|
|
328
|
-
String,
|
|
329
|
-
Option<Vec<crate::types::PageBoundary>>,
|
|
330
|
-
Option<Vec<crate::types::PageContent>>,
|
|
331
|
-
) {
|
|
332
|
-
let content = self.content.trim().to_string();
|
|
333
|
-
let boundaries = if self.config.is_some() && !self.boundaries.is_empty() {
|
|
334
|
-
Some(self.boundaries)
|
|
335
|
-
} else {
|
|
336
|
-
None
|
|
337
|
-
};
|
|
338
|
-
let pages = if self.config.is_some() && !self.page_contents.is_empty() {
|
|
339
|
-
Some(self.page_contents)
|
|
340
|
-
} else {
|
|
341
|
-
None
|
|
342
|
-
};
|
|
343
|
-
(content, boundaries, pages)
|
|
274
|
+
fn build(self) -> String {
|
|
275
|
+
self.content.trim().to_string()
|
|
344
276
|
}
|
|
345
277
|
}
|
|
346
278
|
|
|
@@ -511,7 +443,7 @@ impl Slide {
|
|
|
511
443
|
}
|
|
512
444
|
}
|
|
513
445
|
|
|
514
|
-
builder.build()
|
|
446
|
+
builder.build()
|
|
515
447
|
}
|
|
516
448
|
|
|
517
449
|
fn image_count(&self) -> usize {
|
|
@@ -593,8 +525,8 @@ enum ParsedContent {
|
|
|
593
525
|
}
|
|
594
526
|
|
|
595
527
|
fn parse_slide_xml(xml_data: &[u8]) -> Result<Vec<SlideElement>> {
|
|
596
|
-
let xml_str =
|
|
597
|
-
.map_err(|_| KreuzbergError::parsing("Invalid UTF-8 in slide XML".to_string()))?;
|
|
528
|
+
let xml_str =
|
|
529
|
+
std::str::from_utf8(xml_data).map_err(|_| KreuzbergError::parsing("Invalid UTF-8 in slide XML".to_string()))?;
|
|
598
530
|
|
|
599
531
|
let doc =
|
|
600
532
|
Document::parse(xml_str).map_err(|e| KreuzbergError::parsing(format!("Failed to parse slide XML: {}", e)))?;
|
|
@@ -900,7 +832,7 @@ fn extract_position(node: &Node) -> ElementPosition {
|
|
|
900
832
|
}
|
|
901
833
|
|
|
902
834
|
fn parse_slide_rels(rels_data: &[u8]) -> Result<Vec<ImageReference>> {
|
|
903
|
-
let xml_str =
|
|
835
|
+
let xml_str = std::str::from_utf8(rels_data)
|
|
904
836
|
.map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in rels XML: {}", e)))?;
|
|
905
837
|
|
|
906
838
|
let doc =
|
|
@@ -925,7 +857,7 @@ fn parse_slide_rels(rels_data: &[u8]) -> Result<Vec<ImageReference>> {
|
|
|
925
857
|
}
|
|
926
858
|
|
|
927
859
|
fn parse_presentation_rels(rels_data: &[u8]) -> Result<Vec<String>> {
|
|
928
|
-
let xml_str =
|
|
860
|
+
let xml_str = std::str::from_utf8(rels_data)
|
|
929
861
|
.map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in presentation rels: {}", e)))?;
|
|
930
862
|
|
|
931
863
|
let doc = Document::parse(xml_str)
|
|
@@ -1034,12 +966,24 @@ fn extract_metadata(archive: &mut ZipArchive<File>) -> PptxMetadata {
|
|
|
1034
966
|
}
|
|
1035
967
|
}
|
|
1036
968
|
|
|
1037
|
-
PptxMetadata {
|
|
969
|
+
PptxMetadata {
|
|
970
|
+
title: metadata_map.get("title").cloned(),
|
|
971
|
+
author: metadata_map.get("author").cloned(),
|
|
972
|
+
description: metadata_map.get("description").cloned(),
|
|
973
|
+
summary: metadata_map.get("summary").cloned(),
|
|
974
|
+
fonts: Vec::new(),
|
|
975
|
+
}
|
|
1038
976
|
}
|
|
1039
977
|
|
|
1040
978
|
#[cfg(not(feature = "office"))]
|
|
1041
979
|
{
|
|
1042
|
-
PptxMetadata {
|
|
980
|
+
PptxMetadata {
|
|
981
|
+
title: None,
|
|
982
|
+
author: None,
|
|
983
|
+
description: None,
|
|
984
|
+
summary: None,
|
|
985
|
+
fonts: Vec::new(),
|
|
986
|
+
}
|
|
1043
987
|
}
|
|
1044
988
|
}
|
|
1045
989
|
|
|
@@ -1061,13 +1005,13 @@ fn extract_all_notes(container: &mut PptxContainer) -> Result<HashMap<u32, Strin
|
|
|
1061
1005
|
}
|
|
1062
1006
|
|
|
1063
1007
|
fn extract_notes_text(notes_xml: &[u8]) -> Result<String> {
|
|
1064
|
-
let xml_str =
|
|
1008
|
+
let xml_str = std::str::from_utf8(notes_xml)
|
|
1065
1009
|
.map_err(|e| KreuzbergError::parsing(format!("Invalid UTF-8 in notes XML: {}", e)))?;
|
|
1066
1010
|
|
|
1067
1011
|
let doc =
|
|
1068
1012
|
Document::parse(xml_str).map_err(|e| KreuzbergError::parsing(format!("Failed to parse notes XML: {}", e)))?;
|
|
1069
1013
|
|
|
1070
|
-
let mut text_parts = Vec::
|
|
1014
|
+
let mut text_parts = Vec::new();
|
|
1071
1015
|
const DRAWINGML_NS: &str = "http://schemas.openxmlformats.org/drawingml/2006/main";
|
|
1072
1016
|
|
|
1073
1017
|
for node in doc.descendants() {
|
|
@@ -1126,11 +1070,7 @@ fn detect_image_format(data: &[u8]) -> String {
|
|
|
1126
1070
|
}
|
|
1127
1071
|
}
|
|
1128
1072
|
|
|
1129
|
-
pub fn extract_pptx_from_path(
|
|
1130
|
-
path: &str,
|
|
1131
|
-
extract_images: bool,
|
|
1132
|
-
page_config: Option<&crate::core::config::PageConfig>,
|
|
1133
|
-
) -> Result<PptxExtractionResult> {
|
|
1073
|
+
pub fn extract_pptx_from_path(path: &str, extract_images: bool) -> Result<PptxExtractionResult> {
|
|
1134
1074
|
let config = ParserConfig {
|
|
1135
1075
|
extract_images,
|
|
1136
1076
|
..Default::default()
|
|
@@ -1145,19 +1085,15 @@ pub fn extract_pptx_from_path(
|
|
|
1145
1085
|
let mut iterator = SlideIterator::new(container);
|
|
1146
1086
|
let slide_count = iterator.slide_count();
|
|
1147
1087
|
|
|
1148
|
-
let estimated_capacity = slide_count
|
|
1149
|
-
let mut content_builder = ContentBuilder::
|
|
1088
|
+
let estimated_capacity = slide_count * 1024;
|
|
1089
|
+
let mut content_builder = ContentBuilder::with_capacity(estimated_capacity);
|
|
1150
1090
|
|
|
1151
1091
|
let mut total_image_count = 0;
|
|
1152
1092
|
let mut total_table_count = 0;
|
|
1153
1093
|
let mut extracted_images = Vec::new();
|
|
1154
1094
|
|
|
1155
1095
|
while let Some(slide) = iterator.next_slide()? {
|
|
1156
|
-
|
|
1157
|
-
content_builder.start_slide(slide.slide_number)
|
|
1158
|
-
} else {
|
|
1159
|
-
0
|
|
1160
|
-
};
|
|
1096
|
+
content_builder.add_slide_header(slide.slide_number);
|
|
1161
1097
|
|
|
1162
1098
|
let slide_content = slide.to_markdown(&config);
|
|
1163
1099
|
content_builder.add_text(&slide_content);
|
|
@@ -1166,10 +1102,6 @@ pub fn extract_pptx_from_path(
|
|
|
1166
1102
|
content_builder.add_notes(slide_notes);
|
|
1167
1103
|
}
|
|
1168
1104
|
|
|
1169
|
-
if page_config.is_some() {
|
|
1170
|
-
content_builder.end_slide(slide.slide_number, byte_start, slide_content.clone());
|
|
1171
|
-
}
|
|
1172
|
-
|
|
1173
1105
|
if config.extract_images
|
|
1174
1106
|
&& let Ok(image_data) = iterator.get_slide_images(&slide)
|
|
1175
1107
|
{
|
|
@@ -1197,43 +1129,17 @@ pub fn extract_pptx_from_path(
|
|
|
1197
1129
|
total_table_count += slide.table_count();
|
|
1198
1130
|
}
|
|
1199
1131
|
|
|
1200
|
-
let (content, boundaries, page_contents) = content_builder.build();
|
|
1201
|
-
|
|
1202
|
-
let page_structure = boundaries.as_ref().map(|bounds| crate::types::PageStructure {
|
|
1203
|
-
total_count: slide_count,
|
|
1204
|
-
unit_type: crate::types::PageUnitType::Slide,
|
|
1205
|
-
boundaries: Some(bounds.clone()),
|
|
1206
|
-
pages: page_contents.as_ref().map(|pcs| {
|
|
1207
|
-
pcs.iter()
|
|
1208
|
-
.map(|pc| crate::types::PageInfo {
|
|
1209
|
-
number: pc.page_number,
|
|
1210
|
-
title: None,
|
|
1211
|
-
dimensions: None,
|
|
1212
|
-
image_count: None,
|
|
1213
|
-
table_count: None,
|
|
1214
|
-
hidden: None,
|
|
1215
|
-
})
|
|
1216
|
-
.collect()
|
|
1217
|
-
}),
|
|
1218
|
-
});
|
|
1219
|
-
|
|
1220
1132
|
Ok(PptxExtractionResult {
|
|
1221
|
-
content,
|
|
1133
|
+
content: content_builder.build(),
|
|
1222
1134
|
metadata,
|
|
1223
1135
|
slide_count,
|
|
1224
1136
|
image_count: total_image_count,
|
|
1225
1137
|
table_count: total_table_count,
|
|
1226
1138
|
images: extracted_images,
|
|
1227
|
-
page_structure,
|
|
1228
|
-
page_contents,
|
|
1229
1139
|
})
|
|
1230
1140
|
}
|
|
1231
1141
|
|
|
1232
|
-
pub fn extract_pptx_from_bytes(
|
|
1233
|
-
data: &[u8],
|
|
1234
|
-
extract_images: bool,
|
|
1235
|
-
page_config: Option<&crate::core::config::PageConfig>,
|
|
1236
|
-
) -> Result<PptxExtractionResult> {
|
|
1142
|
+
pub fn extract_pptx_from_bytes(data: &[u8], extract_images: bool) -> Result<PptxExtractionResult> {
|
|
1237
1143
|
use std::sync::atomic::{AtomicU64, Ordering};
|
|
1238
1144
|
static COUNTER: AtomicU64 = AtomicU64::new(0);
|
|
1239
1145
|
let unique_id = COUNTER.fetch_add(1, Ordering::SeqCst);
|
|
@@ -1242,17 +1148,9 @@ pub fn extract_pptx_from_bytes(
|
|
|
1242
1148
|
// IO errors must bubble up - temp file write issues need user reports ~keep
|
|
1243
1149
|
std::fs::write(&temp_path, data)?;
|
|
1244
1150
|
|
|
1245
|
-
let result = extract_pptx_from_path(
|
|
1246
|
-
temp_path.to_str().ok_or_else(|| {
|
|
1247
|
-
crate::KreuzbergError::validation("Invalid temp path - contains invalid UTF-8".to_string())
|
|
1248
|
-
})?,
|
|
1249
|
-
extract_images,
|
|
1250
|
-
page_config,
|
|
1251
|
-
);
|
|
1151
|
+
let result = extract_pptx_from_path(temp_path.to_str().unwrap(), extract_images);
|
|
1252
1152
|
|
|
1253
|
-
|
|
1254
|
-
tracing::warn!("Failed to remove temp PPTX file: {}", e);
|
|
1255
|
-
}
|
|
1153
|
+
let _ = std::fs::remove_file(&temp_path);
|
|
1256
1154
|
|
|
1257
1155
|
result
|
|
1258
1156
|
}
|
|
@@ -1352,7 +1250,7 @@ mod tests {
|
|
|
1352
1250
|
#[test]
|
|
1353
1251
|
fn test_extract_pptx_from_bytes_single_slide() {
|
|
1354
1252
|
let pptx_bytes = create_test_pptx_bytes(vec!["Hello World"]);
|
|
1355
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
1253
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
1356
1254
|
|
|
1357
1255
|
assert_eq!(result.slide_count, 1);
|
|
1358
1256
|
assert!(
|
|
@@ -1367,7 +1265,7 @@ mod tests {
|
|
|
1367
1265
|
#[test]
|
|
1368
1266
|
fn test_extract_pptx_from_bytes_multiple_slides() {
|
|
1369
1267
|
let pptx_bytes = create_test_pptx_bytes(vec!["Slide 1", "Slide 2", "Slide 3"]);
|
|
1370
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
1268
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
1371
1269
|
|
|
1372
1270
|
assert_eq!(result.slide_count, 3);
|
|
1373
1271
|
assert!(result.content.contains("Slide 1"));
|
|
@@ -1378,15 +1276,18 @@ mod tests {
|
|
|
1378
1276
|
#[test]
|
|
1379
1277
|
fn test_extract_pptx_metadata() {
|
|
1380
1278
|
let pptx_bytes = create_test_pptx_bytes(vec!["Content"]);
|
|
1381
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
1279
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
1382
1280
|
|
|
1383
|
-
|
|
1281
|
+
assert_eq!(result.metadata.title, Some("Test Presentation".to_string()));
|
|
1282
|
+
assert_eq!(result.metadata.author, Some("Test Author".to_string()));
|
|
1283
|
+
assert_eq!(result.metadata.description, Some("Test Description".to_string()));
|
|
1284
|
+
assert_eq!(result.metadata.summary, Some("Test Subject".to_string()));
|
|
1384
1285
|
}
|
|
1385
1286
|
|
|
1386
1287
|
#[test]
|
|
1387
1288
|
fn test_extract_pptx_empty_slides() {
|
|
1388
1289
|
let pptx_bytes = create_test_pptx_bytes(vec!["", "", ""]);
|
|
1389
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
1290
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
1390
1291
|
|
|
1391
1292
|
assert_eq!(result.slide_count, 3);
|
|
1392
1293
|
}
|
|
@@ -1394,7 +1295,7 @@ mod tests {
|
|
|
1394
1295
|
#[test]
|
|
1395
1296
|
fn test_extract_pptx_from_bytes_invalid_data() {
|
|
1396
1297
|
let invalid_bytes = b"not a valid pptx file";
|
|
1397
|
-
let result = extract_pptx_from_bytes(invalid_bytes, false
|
|
1298
|
+
let result = extract_pptx_from_bytes(invalid_bytes, false);
|
|
1398
1299
|
|
|
1399
1300
|
assert!(result.is_err());
|
|
1400
1301
|
if let Err(KreuzbergError::Parsing { message: msg, .. }) = result {
|
|
@@ -1407,7 +1308,7 @@ mod tests {
|
|
|
1407
1308
|
#[test]
|
|
1408
1309
|
fn test_extract_pptx_from_bytes_empty_data() {
|
|
1409
1310
|
let empty_bytes: &[u8] = &[];
|
|
1410
|
-
let result = extract_pptx_from_bytes(empty_bytes, false
|
|
1311
|
+
let result = extract_pptx_from_bytes(empty_bytes, false);
|
|
1411
1312
|
|
|
1412
1313
|
assert!(result.is_err());
|
|
1413
1314
|
}
|
|
@@ -1507,8 +1408,7 @@ mod tests {
|
|
|
1507
1408
|
builder.add_text("Hello");
|
|
1508
1409
|
builder.add_text(" ");
|
|
1509
1410
|
builder.add_text("World");
|
|
1510
|
-
|
|
1511
|
-
assert_eq!(content, "HelloWorld");
|
|
1411
|
+
assert_eq!(builder.build(), "HelloWorld");
|
|
1512
1412
|
}
|
|
1513
1413
|
|
|
1514
1414
|
#[test]
|
|
@@ -1516,32 +1416,28 @@ mod tests {
|
|
|
1516
1416
|
let mut builder = ContentBuilder::new();
|
|
1517
1417
|
builder.add_text(" ");
|
|
1518
1418
|
builder.add_text("");
|
|
1519
|
-
|
|
1520
|
-
assert_eq!(content, "");
|
|
1419
|
+
assert_eq!(builder.build(), "");
|
|
1521
1420
|
}
|
|
1522
1421
|
|
|
1523
1422
|
#[test]
|
|
1524
1423
|
fn test_content_builder_add_title() {
|
|
1525
1424
|
let mut builder = ContentBuilder::new();
|
|
1526
1425
|
builder.add_title("Title");
|
|
1527
|
-
|
|
1528
|
-
assert_eq!(content, "# Title");
|
|
1426
|
+
assert_eq!(builder.build(), "# Title");
|
|
1529
1427
|
}
|
|
1530
1428
|
|
|
1531
1429
|
#[test]
|
|
1532
1430
|
fn test_content_builder_add_title_with_whitespace() {
|
|
1533
1431
|
let mut builder = ContentBuilder::new();
|
|
1534
1432
|
builder.add_title(" Title ");
|
|
1535
|
-
|
|
1536
|
-
assert_eq!(content, "# Title");
|
|
1433
|
+
assert_eq!(builder.build(), "# Title");
|
|
1537
1434
|
}
|
|
1538
1435
|
|
|
1539
1436
|
#[test]
|
|
1540
1437
|
fn test_content_builder_add_table_empty() {
|
|
1541
1438
|
let mut builder = ContentBuilder::new();
|
|
1542
1439
|
builder.add_table(&[]);
|
|
1543
|
-
|
|
1544
|
-
assert_eq!(content, "");
|
|
1440
|
+
assert_eq!(builder.build(), "");
|
|
1545
1441
|
}
|
|
1546
1442
|
|
|
1547
1443
|
#[test]
|
|
@@ -1550,9 +1446,9 @@ mod tests {
|
|
|
1550
1446
|
let rows = vec![vec!["Header1".to_string(), "Header2".to_string()]];
|
|
1551
1447
|
builder.add_table(&rows);
|
|
1552
1448
|
let result = builder.build();
|
|
1553
|
-
assert!(result.
|
|
1554
|
-
assert!(result.
|
|
1555
|
-
assert!(result.
|
|
1449
|
+
assert!(result.contains("<table>"));
|
|
1450
|
+
assert!(result.contains("<th>Header1</th>"));
|
|
1451
|
+
assert!(result.contains("<th>Header2</th>"));
|
|
1556
1452
|
}
|
|
1557
1453
|
|
|
1558
1454
|
#[test]
|
|
@@ -1564,8 +1460,8 @@ mod tests {
|
|
|
1564
1460
|
];
|
|
1565
1461
|
builder.add_table(&rows);
|
|
1566
1462
|
let result = builder.build();
|
|
1567
|
-
assert!(result.
|
|
1568
|
-
assert!(result.
|
|
1463
|
+
assert!(result.contains("<th>H1</th>"));
|
|
1464
|
+
assert!(result.contains("<td>D1</td>"));
|
|
1569
1465
|
}
|
|
1570
1466
|
|
|
1571
1467
|
#[test]
|
|
@@ -1574,8 +1470,8 @@ mod tests {
|
|
|
1574
1470
|
let rows = vec![vec!["<tag>".to_string(), "a & b".to_string()]];
|
|
1575
1471
|
builder.add_table(&rows);
|
|
1576
1472
|
let result = builder.build();
|
|
1577
|
-
assert!(result.
|
|
1578
|
-
assert!(result.
|
|
1473
|
+
assert!(result.contains("<tag>"));
|
|
1474
|
+
assert!(result.contains("a & b"));
|
|
1579
1475
|
}
|
|
1580
1476
|
|
|
1581
1477
|
#[test]
|
|
@@ -1584,8 +1480,8 @@ mod tests {
|
|
|
1584
1480
|
builder.add_list_item(1, false, "Item 1");
|
|
1585
1481
|
builder.add_list_item(1, false, "Item 2");
|
|
1586
1482
|
let result = builder.build();
|
|
1587
|
-
assert!(result.
|
|
1588
|
-
assert!(result.
|
|
1483
|
+
assert!(result.contains("- Item 1"));
|
|
1484
|
+
assert!(result.contains("- Item 2"));
|
|
1589
1485
|
}
|
|
1590
1486
|
|
|
1591
1487
|
#[test]
|
|
@@ -1594,8 +1490,8 @@ mod tests {
|
|
|
1594
1490
|
builder.add_list_item(1, true, "First");
|
|
1595
1491
|
builder.add_list_item(1, true, "Second");
|
|
1596
1492
|
let result = builder.build();
|
|
1597
|
-
assert!(result.
|
|
1598
|
-
assert!(result.
|
|
1493
|
+
assert!(result.contains("1. First"));
|
|
1494
|
+
assert!(result.contains("1. Second"));
|
|
1599
1495
|
}
|
|
1600
1496
|
|
|
1601
1497
|
#[test]
|
|
@@ -1605,9 +1501,9 @@ mod tests {
|
|
|
1605
1501
|
builder.add_list_item(2, false, "Level 2");
|
|
1606
1502
|
builder.add_list_item(3, false, "Level 3");
|
|
1607
1503
|
let result = builder.build();
|
|
1608
|
-
assert!(result.
|
|
1609
|
-
assert!(result.
|
|
1610
|
-
assert!(result.
|
|
1504
|
+
assert!(result.contains("- Level 1"));
|
|
1505
|
+
assert!(result.contains(" - Level 2"));
|
|
1506
|
+
assert!(result.contains(" - Level 3"));
|
|
1611
1507
|
}
|
|
1612
1508
|
|
|
1613
1509
|
#[test]
|
|
@@ -1615,7 +1511,7 @@ mod tests {
|
|
|
1615
1511
|
let mut builder = ContentBuilder::new();
|
|
1616
1512
|
builder.add_image("img123", 5);
|
|
1617
1513
|
let result = builder.build();
|
|
1618
|
-
assert!(result.
|
|
1514
|
+
assert!(result.contains(""));
|
|
1619
1515
|
}
|
|
1620
1516
|
|
|
1621
1517
|
#[test]
|
|
@@ -1623,16 +1519,15 @@ mod tests {
|
|
|
1623
1519
|
let mut builder = ContentBuilder::new();
|
|
1624
1520
|
builder.add_notes("This is a note");
|
|
1625
1521
|
let result = builder.build();
|
|
1626
|
-
assert!(result.
|
|
1627
|
-
assert!(result.
|
|
1522
|
+
assert!(result.contains("### Notes:"));
|
|
1523
|
+
assert!(result.contains("This is a note"));
|
|
1628
1524
|
}
|
|
1629
1525
|
|
|
1630
1526
|
#[test]
|
|
1631
1527
|
fn test_content_builder_add_notes_empty() {
|
|
1632
1528
|
let mut builder = ContentBuilder::new();
|
|
1633
1529
|
builder.add_notes(" ");
|
|
1634
|
-
|
|
1635
|
-
assert_eq!(content, "");
|
|
1530
|
+
assert_eq!(builder.build(), "");
|
|
1636
1531
|
}
|
|
1637
1532
|
|
|
1638
1533
|
#[test]
|
|
@@ -1640,7 +1535,7 @@ mod tests {
|
|
|
1640
1535
|
let mut builder = ContentBuilder::new();
|
|
1641
1536
|
builder.add_slide_header(3);
|
|
1642
1537
|
let result = builder.build();
|
|
1643
|
-
assert!(result.
|
|
1538
|
+
assert!(result.contains("<!-- Slide number: 3 -->"));
|
|
1644
1539
|
}
|
|
1645
1540
|
|
|
1646
1541
|
#[test]
|
|
@@ -2308,7 +2203,7 @@ mod tests {
|
|
|
2308
2203
|
vec!["Row 2 Col 1", "Row 2 Col 2", "Row 2 Col 3"],
|
|
2309
2204
|
]);
|
|
2310
2205
|
|
|
2311
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
2206
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
2312
2207
|
|
|
2313
2208
|
assert_eq!(result.table_count, 1, "Should detect one table");
|
|
2314
2209
|
assert!(result.content.contains("<table>"), "Should contain table tag");
|
|
@@ -2340,7 +2235,7 @@ mod tests {
|
|
|
2340
2235
|
vec!["A4", "B4", "C4", "D4"],
|
|
2341
2236
|
]);
|
|
2342
2237
|
|
|
2343
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
2238
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
2344
2239
|
|
|
2345
2240
|
assert_eq!(result.table_count, 1, "Should detect one table");
|
|
2346
2241
|
assert!(result.content.contains("<tr>"), "Should contain table rows");
|
|
@@ -2355,7 +2250,7 @@ mod tests {
|
|
|
2355
2250
|
fn test_table_counting_via_slide_metadata_succeeds() {
|
|
2356
2251
|
let pptx_bytes = create_pptx_with_table(vec![vec!["Col1", "Col2"], vec!["Val1", "Val2"]]);
|
|
2357
2252
|
|
|
2358
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
2253
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
2359
2254
|
|
|
2360
2255
|
assert_eq!(result.table_count, 1, "table_count should be 1");
|
|
2361
2256
|
}
|
|
@@ -2367,7 +2262,7 @@ mod tests {
|
|
|
2367
2262
|
vec!["Cell data 1", "Cell data 2"],
|
|
2368
2263
|
]);
|
|
2369
2264
|
|
|
2370
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
2265
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
2371
2266
|
|
|
2372
2267
|
assert!(result.content.contains("<table>"), "Should contain table tag");
|
|
2373
2268
|
assert!(
|
|
@@ -2383,7 +2278,7 @@ mod tests {
|
|
|
2383
2278
|
#[test]
|
|
2384
2279
|
fn test_table_extraction_empty_table_returns_one_count() {
|
|
2385
2280
|
let pptx_bytes = create_pptx_with_table(vec![]);
|
|
2386
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
2281
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
2387
2282
|
|
|
2388
2283
|
assert_eq!(result.table_count, 1, "Empty table structure should be detected");
|
|
2389
2284
|
assert!(!result.content.contains("<td>"), "Empty table should have no cells");
|
|
@@ -2397,7 +2292,7 @@ mod tests {
|
|
|
2397
2292
|
(1, true, "Third item"),
|
|
2398
2293
|
]);
|
|
2399
2294
|
|
|
2400
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
2295
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
2401
2296
|
|
|
2402
2297
|
assert!(
|
|
2403
2298
|
result.content.contains("1. First item"),
|
|
@@ -2421,7 +2316,7 @@ mod tests {
|
|
|
2421
2316
|
(1, false, "Bullet three"),
|
|
2422
2317
|
]);
|
|
2423
2318
|
|
|
2424
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
2319
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
2425
2320
|
|
|
2426
2321
|
assert!(result.content.contains("- Bullet one"), "Should contain bullet point 1");
|
|
2427
2322
|
assert!(result.content.contains("- Bullet two"), "Should contain bullet point 2");
|
|
@@ -2441,7 +2336,7 @@ mod tests {
|
|
|
2441
2336
|
(1, false, "Back to Level 1"),
|
|
2442
2337
|
]);
|
|
2443
2338
|
|
|
2444
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
2339
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
2445
2340
|
|
|
2446
2341
|
assert!(
|
|
2447
2342
|
result.content.contains("- Level 1 Item"),
|
|
@@ -2470,7 +2365,7 @@ mod tests {
|
|
|
2470
2365
|
(1, true, "Ordered item 2"),
|
|
2471
2366
|
]);
|
|
2472
2367
|
|
|
2473
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
2368
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
2474
2369
|
|
|
2475
2370
|
assert!(
|
|
2476
2371
|
result.content.contains("1. Ordered item 1"),
|
|
@@ -2489,7 +2384,7 @@ mod tests {
|
|
|
2489
2384
|
#[test]
|
|
2490
2385
|
fn test_image_extraction_from_slide_xml_succeeds() {
|
|
2491
2386
|
let pptx_bytes = create_pptx_with_images();
|
|
2492
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, true
|
|
2387
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, true).unwrap();
|
|
2493
2388
|
|
|
2494
2389
|
assert_eq!(result.image_count, 2, "Should detect 2 images");
|
|
2495
2390
|
assert!(!result.images.is_empty(), "Should extract image data");
|
|
@@ -2498,7 +2393,7 @@ mod tests {
|
|
|
2498
2393
|
#[test]
|
|
2499
2394
|
fn test_image_data_loading_from_zip_archive_succeeds() {
|
|
2500
2395
|
let pptx_bytes = create_pptx_with_images();
|
|
2501
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, true
|
|
2396
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, true).unwrap();
|
|
2502
2397
|
|
|
2503
2398
|
assert_eq!(result.images.len(), 2, "Should load 2 images");
|
|
2504
2399
|
|
|
@@ -2510,7 +2405,7 @@ mod tests {
|
|
|
2510
2405
|
#[test]
|
|
2511
2406
|
fn test_image_format_detection_succeeds() {
|
|
2512
2407
|
let pptx_bytes = create_pptx_with_images();
|
|
2513
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, true
|
|
2408
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, true).unwrap();
|
|
2514
2409
|
|
|
2515
2410
|
assert_eq!(result.images.len(), 2, "Should have 2 images");
|
|
2516
2411
|
|
|
@@ -2523,7 +2418,7 @@ mod tests {
|
|
|
2523
2418
|
#[test]
|
|
2524
2419
|
fn test_image_counting_via_result_metadata_succeeds() {
|
|
2525
2420
|
let pptx_bytes = create_pptx_with_images();
|
|
2526
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, true
|
|
2421
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, true).unwrap();
|
|
2527
2422
|
|
|
2528
2423
|
assert_eq!(result.image_count, 2, "image_count should match actual images");
|
|
2529
2424
|
assert_eq!(result.images.len(), 2, "images vector should have 2 elements");
|
|
@@ -2532,7 +2427,7 @@ mod tests {
|
|
|
2532
2427
|
#[test]
|
|
2533
2428
|
fn test_image_extraction_disabled_returns_zero_images() {
|
|
2534
2429
|
let pptx_bytes = create_pptx_with_images();
|
|
2535
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
2430
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
2536
2431
|
|
|
2537
2432
|
assert_eq!(
|
|
2538
2433
|
result.image_count, 2,
|
|
@@ -2544,7 +2439,7 @@ mod tests {
|
|
|
2544
2439
|
#[test]
|
|
2545
2440
|
fn test_multiple_images_per_slide_extraction_succeeds() {
|
|
2546
2441
|
let pptx_bytes = create_pptx_with_images();
|
|
2547
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, true
|
|
2442
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, true).unwrap();
|
|
2548
2443
|
|
|
2549
2444
|
assert_eq!(result.slide_count, 1, "Should have 1 slide");
|
|
2550
2445
|
assert_eq!(result.image_count, 2, "Single slide should contain 2 images");
|
|
@@ -2557,7 +2452,7 @@ mod tests {
|
|
|
2557
2452
|
#[test]
|
|
2558
2453
|
fn test_formatting_bold_text_renders_as_markdown_bold() {
|
|
2559
2454
|
let pptx_bytes = create_pptx_with_formatting();
|
|
2560
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
2455
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
2561
2456
|
|
|
2562
2457
|
assert!(
|
|
2563
2458
|
result.content.contains("**Bold text"),
|
|
@@ -2568,7 +2463,7 @@ mod tests {
|
|
|
2568
2463
|
#[test]
|
|
2569
2464
|
fn test_formatting_italic_text_renders_as_markdown_italic() {
|
|
2570
2465
|
let pptx_bytes = create_pptx_with_formatting();
|
|
2571
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
2466
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
2572
2467
|
|
|
2573
2468
|
assert!(
|
|
2574
2469
|
result.content.contains("*Italic text"),
|
|
@@ -2579,7 +2474,7 @@ mod tests {
|
|
|
2579
2474
|
#[test]
|
|
2580
2475
|
fn test_formatting_underline_text_renders_as_html_underline() {
|
|
2581
2476
|
let pptx_bytes = create_pptx_with_formatting();
|
|
2582
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
2477
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
2583
2478
|
|
|
2584
2479
|
assert!(
|
|
2585
2480
|
result.content.contains("<u>Underline text"),
|
|
@@ -2590,7 +2485,7 @@ mod tests {
|
|
|
2590
2485
|
#[test]
|
|
2591
2486
|
fn test_formatting_combined_bold_italic_renders_correctly() {
|
|
2592
2487
|
let pptx_bytes = create_pptx_with_formatting();
|
|
2593
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
2488
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
2594
2489
|
|
|
2595
2490
|
assert!(
|
|
2596
2491
|
result.content.contains("***Bold italic text"),
|
|
@@ -2816,7 +2711,7 @@ mod tests {
|
|
|
2816
2711
|
let _ = zip.finish().unwrap();
|
|
2817
2712
|
}
|
|
2818
2713
|
|
|
2819
|
-
let result = extract_pptx_from_bytes(&buffer, true
|
|
2714
|
+
let result = extract_pptx_from_bytes(&buffer, true).unwrap();
|
|
2820
2715
|
|
|
2821
2716
|
assert!(
|
|
2822
2717
|
result.content.contains("**Title with Bold"),
|
|
@@ -2955,7 +2850,7 @@ mod tests {
|
|
|
2955
2850
|
let _ = zip.finish().unwrap();
|
|
2956
2851
|
}
|
|
2957
2852
|
|
|
2958
|
-
let result = extract_pptx_from_bytes(&buffer, false
|
|
2853
|
+
let result = extract_pptx_from_bytes(&buffer, false).unwrap();
|
|
2959
2854
|
|
|
2960
2855
|
let content = result.content;
|
|
2961
2856
|
let top_left_pos = content.find("Top Left").unwrap();
|
|
@@ -3082,7 +2977,7 @@ mod tests {
|
|
|
3082
2977
|
let _ = zip.finish().unwrap();
|
|
3083
2978
|
}
|
|
3084
2979
|
|
|
3085
|
-
let result = extract_pptx_from_bytes(&buffer, false
|
|
2980
|
+
let result = extract_pptx_from_bytes(&buffer, false).unwrap();
|
|
3086
2981
|
|
|
3087
2982
|
assert!(result.content.contains("Slide Content"), "Should contain slide content");
|
|
3088
2983
|
assert!(result.content.contains("### Notes:"), "Should contain notes header");
|
|
@@ -3095,8 +2990,11 @@ mod tests {
|
|
|
3095
2990
|
#[test]
|
|
3096
2991
|
fn test_integration_metadata_extraction_complete() {
|
|
3097
2992
|
let pptx_bytes = create_test_pptx_bytes(vec!["Content"]);
|
|
3098
|
-
let result = extract_pptx_from_bytes(&pptx_bytes, false
|
|
2993
|
+
let result = extract_pptx_from_bytes(&pptx_bytes, false).unwrap();
|
|
3099
2994
|
|
|
3100
|
-
|
|
2995
|
+
assert_eq!(result.metadata.title, Some("Test Presentation".to_string()));
|
|
2996
|
+
assert_eq!(result.metadata.author, Some("Test Author".to_string()));
|
|
2997
|
+
assert_eq!(result.metadata.description, Some("Test Description".to_string()));
|
|
2998
|
+
assert_eq!(result.metadata.summary, Some("Test Subject".to_string()));
|
|
3101
2999
|
}
|
|
3102
3000
|
}
|