kreuzberg 4.1.2 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/kreuzberg.gemspec +13 -1
- data/lib/kreuzberg/cli.rb +16 -6
- data/lib/kreuzberg/cli_proxy.rb +3 -1
- data/lib/kreuzberg/config.rb +121 -39
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/extraction_api.rb +20 -4
- data/lib/kreuzberg/result.rb +12 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -0
- data/sig/kreuzberg.rbs +28 -12
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/batch_spec.rb +6 -5
- data/spec/binding/error_recovery_spec.rb +3 -3
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/binding/tables_spec.rb +11 -2
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/startup.rs +15 -1
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
- data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
- data/vendor/kreuzberg/src/core/io.rs +7 -7
- data/vendor/kreuzberg/src/core/mime.rs +4 -4
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +57 -57
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +12 -2
|
@@ -1,13 +1,15 @@
|
|
|
1
1
|
//! Regression tests for PPTX/PPSX extraction bugs
|
|
2
2
|
//!
|
|
3
3
|
//! GitHub Issue #321: PPTX extraction fails on shapes without txBody (image placeholders) + PPSX not supported
|
|
4
|
+
//! GitHub Issue #329: Extracting images from PPTX results in reversed page numbers
|
|
4
5
|
//!
|
|
5
6
|
//! Bug 1: "No txBody found" - PPTX extraction fails when any shape lacks a text body
|
|
6
7
|
//! Bug 2: PPSX not supported - PowerPoint Show files rejected entirely
|
|
8
|
+
//! Bug 3: Image page numbers reversed - image on slide 1 reports page_number=2
|
|
7
9
|
|
|
8
10
|
#![cfg(feature = "office")]
|
|
9
11
|
|
|
10
|
-
use kreuzberg::{ExtractionConfig, extract_file};
|
|
12
|
+
use kreuzberg::{ExtractionConfig, ImageExtractionConfig, extract_file};
|
|
11
13
|
use std::io::Write;
|
|
12
14
|
use tempfile::NamedTempFile;
|
|
13
15
|
use zip::CompressionMethod;
|
|
@@ -25,9 +27,9 @@ use zip::write::{FileOptions, ZipWriter};
|
|
|
25
27
|
async fn test_ppsx_slideshow_extraction() {
|
|
26
28
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
27
29
|
.parent()
|
|
28
|
-
.
|
|
30
|
+
.expect("Operation failed")
|
|
29
31
|
.parent()
|
|
30
|
-
.
|
|
32
|
+
.expect("Operation failed");
|
|
31
33
|
let test_file = workspace_root.join("test_documents/presentations/sample.ppsx");
|
|
32
34
|
|
|
33
35
|
if !test_file.exists() {
|
|
@@ -69,9 +71,9 @@ async fn test_ppsx_slideshow_extraction() {
|
|
|
69
71
|
async fn test_ppsx_with_explicit_mime_type() {
|
|
70
72
|
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
71
73
|
.parent()
|
|
72
|
-
.
|
|
74
|
+
.expect("Operation failed")
|
|
73
75
|
.parent()
|
|
74
|
-
.
|
|
76
|
+
.expect("Operation failed");
|
|
75
77
|
let test_file = workspace_root.join("test_documents/presentations/sample.ppsx");
|
|
76
78
|
|
|
77
79
|
if !test_file.exists() {
|
|
@@ -120,24 +122,26 @@ async fn test_pptx_with_image_placeholder_no_txbody() {
|
|
|
120
122
|
let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
|
|
121
123
|
|
|
122
124
|
// Add [Content_Types].xml
|
|
123
|
-
zip.start_file("[Content_Types].xml", options)
|
|
125
|
+
zip.start_file("[Content_Types].xml", options)
|
|
126
|
+
.expect("Operation failed");
|
|
124
127
|
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
125
128
|
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
|
126
129
|
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
|
127
130
|
<Default Extension="xml" ContentType="application/xml"/>
|
|
128
131
|
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
|
|
129
132
|
<Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
|
|
130
|
-
</Types>"#).
|
|
133
|
+
</Types>"#).expect("Operation failed");
|
|
131
134
|
|
|
132
135
|
// Add _rels/.rels
|
|
133
|
-
zip.start_file("_rels/.rels", options).
|
|
136
|
+
zip.start_file("_rels/.rels", options).expect("Operation failed");
|
|
134
137
|
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
135
138
|
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
136
139
|
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
|
|
137
|
-
</Relationships>"#).
|
|
140
|
+
</Relationships>"#).expect("Operation failed");
|
|
138
141
|
|
|
139
142
|
// Add ppt/presentation.xml
|
|
140
|
-
zip.start_file("ppt/presentation.xml", options)
|
|
143
|
+
zip.start_file("ppt/presentation.xml", options)
|
|
144
|
+
.expect("Operation failed");
|
|
141
145
|
zip.write_all(
|
|
142
146
|
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
143
147
|
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
@@ -148,18 +152,20 @@ async fn test_pptx_with_image_placeholder_no_txbody() {
|
|
|
148
152
|
</p:sldIdLst>
|
|
149
153
|
</p:presentation>"#,
|
|
150
154
|
)
|
|
151
|
-
.
|
|
155
|
+
.expect("Operation failed");
|
|
152
156
|
|
|
153
157
|
// Add ppt/_rels/presentation.xml.rels
|
|
154
|
-
zip.start_file("ppt/_rels/presentation.xml.rels", options)
|
|
158
|
+
zip.start_file("ppt/_rels/presentation.xml.rels", options)
|
|
159
|
+
.expect("Operation failed");
|
|
155
160
|
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
156
161
|
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
157
162
|
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
|
|
158
|
-
</Relationships>"#).
|
|
163
|
+
</Relationships>"#).expect("Operation failed");
|
|
159
164
|
|
|
160
165
|
// Add ppt/slides/slide1.xml with a shape WITHOUT txBody (image placeholder)
|
|
161
166
|
// This is the critical test case - a <p:sp> element with no <p:txBody>
|
|
162
|
-
zip.start_file("ppt/slides/slide1.xml", options)
|
|
167
|
+
zip.start_file("ppt/slides/slide1.xml", options)
|
|
168
|
+
.expect("Operation failed");
|
|
163
169
|
zip.write_all(
|
|
164
170
|
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
165
171
|
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
@@ -259,18 +265,19 @@ async fn test_pptx_with_image_placeholder_no_txbody() {
|
|
|
259
265
|
</p:cSld>
|
|
260
266
|
</p:sld>"#,
|
|
261
267
|
)
|
|
262
|
-
.
|
|
268
|
+
.expect("Operation failed");
|
|
263
269
|
|
|
264
270
|
// Add ppt/slides/_rels/slide1.xml.rels (empty)
|
|
265
|
-
zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
|
|
271
|
+
zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
|
|
272
|
+
.expect("Operation failed");
|
|
266
273
|
zip.write_all(
|
|
267
274
|
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
268
275
|
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
269
276
|
</Relationships>"#,
|
|
270
277
|
)
|
|
271
|
-
.
|
|
278
|
+
.expect("Operation failed");
|
|
272
279
|
|
|
273
|
-
zip.finish().
|
|
280
|
+
zip.finish().expect("Operation failed");
|
|
274
281
|
}
|
|
275
282
|
|
|
276
283
|
// Extract the PPTX file
|
|
@@ -336,24 +343,26 @@ async fn test_pptx_mixed_shapes_extraction() {
|
|
|
336
343
|
let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
|
|
337
344
|
|
|
338
345
|
// Add [Content_Types].xml
|
|
339
|
-
zip.start_file("[Content_Types].xml", options)
|
|
346
|
+
zip.start_file("[Content_Types].xml", options)
|
|
347
|
+
.expect("Operation failed");
|
|
340
348
|
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
341
349
|
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
|
342
350
|
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
|
343
351
|
<Default Extension="xml" ContentType="application/xml"/>
|
|
344
352
|
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
|
|
345
353
|
<Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
|
|
346
|
-
</Types>"#).
|
|
354
|
+
</Types>"#).expect("Operation failed");
|
|
347
355
|
|
|
348
356
|
// Add _rels/.rels
|
|
349
|
-
zip.start_file("_rels/.rels", options).
|
|
357
|
+
zip.start_file("_rels/.rels", options).expect("Operation failed");
|
|
350
358
|
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
351
359
|
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
352
360
|
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
|
|
353
|
-
</Relationships>"#).
|
|
361
|
+
</Relationships>"#).expect("Operation failed");
|
|
354
362
|
|
|
355
363
|
// Add ppt/presentation.xml
|
|
356
|
-
zip.start_file("ppt/presentation.xml", options)
|
|
364
|
+
zip.start_file("ppt/presentation.xml", options)
|
|
365
|
+
.expect("Operation failed");
|
|
357
366
|
zip.write_all(
|
|
358
367
|
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
359
368
|
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
@@ -364,17 +373,19 @@ async fn test_pptx_mixed_shapes_extraction() {
|
|
|
364
373
|
</p:sldIdLst>
|
|
365
374
|
</p:presentation>"#,
|
|
366
375
|
)
|
|
367
|
-
.
|
|
376
|
+
.expect("Operation failed");
|
|
368
377
|
|
|
369
378
|
// Add ppt/_rels/presentation.xml.rels
|
|
370
|
-
zip.start_file("ppt/_rels/presentation.xml.rels", options)
|
|
379
|
+
zip.start_file("ppt/_rels/presentation.xml.rels", options)
|
|
380
|
+
.expect("Operation failed");
|
|
371
381
|
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
372
382
|
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
373
383
|
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
|
|
374
|
-
</Relationships>"#).
|
|
384
|
+
</Relationships>"#).expect("Operation failed");
|
|
375
385
|
|
|
376
386
|
// Add slide with various shapes - some with txBody, some without
|
|
377
|
-
zip.start_file("ppt/slides/slide1.xml", options)
|
|
387
|
+
zip.start_file("ppt/slides/slide1.xml", options)
|
|
388
|
+
.expect("Operation failed");
|
|
378
389
|
zip.write_all(
|
|
379
390
|
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
380
391
|
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
@@ -453,18 +464,19 @@ async fn test_pptx_mixed_shapes_extraction() {
|
|
|
453
464
|
</p:cSld>
|
|
454
465
|
</p:sld>"#,
|
|
455
466
|
)
|
|
456
|
-
.
|
|
467
|
+
.expect("Operation failed");
|
|
457
468
|
|
|
458
469
|
// Add empty rels
|
|
459
|
-
zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
|
|
470
|
+
zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
|
|
471
|
+
.expect("Operation failed");
|
|
460
472
|
zip.write_all(
|
|
461
473
|
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
462
474
|
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
463
475
|
</Relationships>"#,
|
|
464
476
|
)
|
|
465
|
-
.
|
|
477
|
+
.expect("Operation failed");
|
|
466
478
|
|
|
467
|
-
zip.finish().
|
|
479
|
+
zip.finish().expect("Operation failed");
|
|
468
480
|
}
|
|
469
481
|
|
|
470
482
|
let result = extract_file(
|
|
@@ -502,3 +514,284 @@ async fn test_pptx_mixed_shapes_extraction() {
|
|
|
502
514
|
}
|
|
503
515
|
}
|
|
504
516
|
}
|
|
517
|
+
|
|
518
|
+
/// Test that images extracted from PPTX have correct page numbers.
|
|
519
|
+
///
|
|
520
|
+
/// When a PPTX has multiple slides and an image on slide 1, the extracted image
|
|
521
|
+
/// should have page_number=1 (not reversed).
|
|
522
|
+
///
|
|
523
|
+
/// GitHub Issue #329: Image on slide 1 of 2-slide PPTX reports page_number=2
|
|
524
|
+
#[tokio::test]
|
|
525
|
+
async fn test_pptx_image_page_numbers_not_reversed() {
|
|
526
|
+
// Create a PPTX with 2 slides, image on slide 1
|
|
527
|
+
let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
|
|
528
|
+
|
|
529
|
+
// A minimal 1x1 red PNG image (valid PNG format)
|
|
530
|
+
let png_image: &[u8] = &[
|
|
531
|
+
0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG signature
|
|
532
|
+
0x00, 0x00, 0x00, 0x0D, // IHDR chunk length
|
|
533
|
+
0x49, 0x48, 0x44, 0x52, // "IHDR"
|
|
534
|
+
0x00, 0x00, 0x00, 0x01, // width: 1
|
|
535
|
+
0x00, 0x00, 0x00, 0x01, // height: 1
|
|
536
|
+
0x08, 0x02, // bit depth: 8, color type: RGB
|
|
537
|
+
0x00, 0x00, 0x00, // compression, filter, interlace
|
|
538
|
+
0x90, 0x77, 0x53, 0xDE, // IHDR CRC
|
|
539
|
+
0x00, 0x00, 0x00, 0x0C, // IDAT chunk length
|
|
540
|
+
0x49, 0x44, 0x41, 0x54, // "IDAT"
|
|
541
|
+
0x08, 0xD7, 0x63, 0xF8, 0xCF, 0xC0, 0x00, 0x00, // compressed data
|
|
542
|
+
0x01, 0x01, 0x01, 0x00, // checksum
|
|
543
|
+
0x18, 0xDD, 0x8D, 0xB4, // IDAT CRC
|
|
544
|
+
0x00, 0x00, 0x00, 0x00, // IEND chunk length
|
|
545
|
+
0x49, 0x45, 0x4E, 0x44, // "IEND"
|
|
546
|
+
0xAE, 0x42, 0x60, 0x82, // IEND CRC
|
|
547
|
+
];
|
|
548
|
+
|
|
549
|
+
{
|
|
550
|
+
let mut zip = ZipWriter::new(&mut temp_file);
|
|
551
|
+
let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
|
|
552
|
+
|
|
553
|
+
// Add [Content_Types].xml
|
|
554
|
+
zip.start_file("[Content_Types].xml", options)
|
|
555
|
+
.expect("Operation failed");
|
|
556
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
557
|
+
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
|
|
558
|
+
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
|
|
559
|
+
<Default Extension="xml" ContentType="application/xml"/>
|
|
560
|
+
<Default Extension="png" ContentType="image/png"/>
|
|
561
|
+
<Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
|
|
562
|
+
<Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
|
|
563
|
+
<Override PartName="/ppt/slides/slide2.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
|
|
564
|
+
</Types>"#).expect("Operation failed");
|
|
565
|
+
|
|
566
|
+
// Add _rels/.rels
|
|
567
|
+
zip.start_file("_rels/.rels", options).expect("Operation failed");
|
|
568
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
569
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
570
|
+
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
|
|
571
|
+
</Relationships>"#).expect("Operation failed");
|
|
572
|
+
|
|
573
|
+
// Add ppt/presentation.xml
|
|
574
|
+
zip.start_file("ppt/presentation.xml", options)
|
|
575
|
+
.expect("Operation failed");
|
|
576
|
+
zip.write_all(
|
|
577
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
578
|
+
<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
579
|
+
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
|
|
580
|
+
<p:sldIdLst>
|
|
581
|
+
<p:sldId id="256" r:id="rId2"/>
|
|
582
|
+
<p:sldId id="257" r:id="rId3"/>
|
|
583
|
+
</p:sldIdLst>
|
|
584
|
+
</p:presentation>"#,
|
|
585
|
+
)
|
|
586
|
+
.expect("Operation failed");
|
|
587
|
+
|
|
588
|
+
// Add ppt/_rels/presentation.xml.rels
|
|
589
|
+
// BUG REPRODUCTION: Slides listed in REVERSE order in XML (slide2 before slide1)
|
|
590
|
+
// This is valid XML - PowerPoint doesn't guarantee order in rels files
|
|
591
|
+
// GitHub Issue #329: This causes page numbers to be reversed
|
|
592
|
+
zip.start_file("ppt/_rels/presentation.xml.rels", options)
|
|
593
|
+
.expect("Operation failed");
|
|
594
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
595
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
596
|
+
<Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide2.xml"/>
|
|
597
|
+
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
|
|
598
|
+
</Relationships>"#).expect("Operation failed");
|
|
599
|
+
|
|
600
|
+
// Add the image file
|
|
601
|
+
zip.start_file("ppt/media/image1.png", options)
|
|
602
|
+
.expect("Operation failed");
|
|
603
|
+
zip.write_all(png_image).expect("Operation failed");
|
|
604
|
+
|
|
605
|
+
// Add slide 1 WITH an image
|
|
606
|
+
zip.start_file("ppt/slides/slide1.xml", options)
|
|
607
|
+
.expect("Operation failed");
|
|
608
|
+
zip.write_all(
|
|
609
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
610
|
+
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
611
|
+
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
|
|
612
|
+
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
|
|
613
|
+
<p:cSld>
|
|
614
|
+
<p:spTree>
|
|
615
|
+
<p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
|
|
616
|
+
<p:grpSpPr/>
|
|
617
|
+
<p:sp>
|
|
618
|
+
<p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
|
|
619
|
+
<p:spPr/>
|
|
620
|
+
<p:txBody><a:bodyPr/><a:lstStyle/><a:p><a:r><a:t>Slide 1 - Has Image</a:t></a:r></a:p></p:txBody>
|
|
621
|
+
</p:sp>
|
|
622
|
+
<p:pic>
|
|
623
|
+
<p:nvPicPr>
|
|
624
|
+
<p:cNvPr id="3" name="Picture 1"/>
|
|
625
|
+
<p:cNvPicPr><a:picLocks noChangeAspect="1"/></p:cNvPicPr>
|
|
626
|
+
<p:nvPr/>
|
|
627
|
+
</p:nvPicPr>
|
|
628
|
+
<p:blipFill>
|
|
629
|
+
<a:blip r:embed="rId2"/>
|
|
630
|
+
<a:stretch><a:fillRect/></a:stretch>
|
|
631
|
+
</p:blipFill>
|
|
632
|
+
<p:spPr>
|
|
633
|
+
<a:xfrm><a:off x="0" y="0"/><a:ext cx="100000" cy="100000"/></a:xfrm>
|
|
634
|
+
<a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
|
|
635
|
+
</p:spPr>
|
|
636
|
+
</p:pic>
|
|
637
|
+
</p:spTree>
|
|
638
|
+
</p:cSld>
|
|
639
|
+
</p:sld>"#,
|
|
640
|
+
)
|
|
641
|
+
.expect("Operation failed");
|
|
642
|
+
|
|
643
|
+
// Add slide 1 relationships (points to the image)
|
|
644
|
+
zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
|
|
645
|
+
.expect("Operation failed");
|
|
646
|
+
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
647
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
648
|
+
<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image1.png"/>
|
|
649
|
+
</Relationships>"#).expect("Operation failed");
|
|
650
|
+
|
|
651
|
+
// Add slide 2 WITHOUT an image
|
|
652
|
+
zip.start_file("ppt/slides/slide2.xml", options)
|
|
653
|
+
.expect("Operation failed");
|
|
654
|
+
zip.write_all(
|
|
655
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
656
|
+
<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
|
|
657
|
+
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
|
|
658
|
+
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
|
|
659
|
+
<p:cSld>
|
|
660
|
+
<p:spTree>
|
|
661
|
+
<p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
|
|
662
|
+
<p:grpSpPr/>
|
|
663
|
+
<p:sp>
|
|
664
|
+
<p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
|
|
665
|
+
<p:spPr/>
|
|
666
|
+
<p:txBody><a:bodyPr/><a:lstStyle/><a:p><a:r><a:t>Slide 2 - No Image</a:t></a:r></a:p></p:txBody>
|
|
667
|
+
</p:sp>
|
|
668
|
+
</p:spTree>
|
|
669
|
+
</p:cSld>
|
|
670
|
+
</p:sld>"#,
|
|
671
|
+
)
|
|
672
|
+
.expect("Operation failed");
|
|
673
|
+
|
|
674
|
+
// Add empty slide 2 relationships
|
|
675
|
+
zip.start_file("ppt/slides/_rels/slide2.xml.rels", options)
|
|
676
|
+
.expect("Operation failed");
|
|
677
|
+
zip.write_all(
|
|
678
|
+
br#"<?xml version="1.0" encoding="UTF-8"?>
|
|
679
|
+
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
|
|
680
|
+
</Relationships>"#,
|
|
681
|
+
)
|
|
682
|
+
.expect("Operation failed");
|
|
683
|
+
|
|
684
|
+
zip.finish().expect("Operation failed");
|
|
685
|
+
}
|
|
686
|
+
|
|
687
|
+
// Extract with images enabled
|
|
688
|
+
let config = ExtractionConfig {
|
|
689
|
+
images: Some(ImageExtractionConfig {
|
|
690
|
+
extract_images: true,
|
|
691
|
+
target_dpi: 300,
|
|
692
|
+
max_image_dimension: 4096,
|
|
693
|
+
auto_adjust_dpi: true,
|
|
694
|
+
min_dpi: 72,
|
|
695
|
+
max_dpi: 600,
|
|
696
|
+
}),
|
|
697
|
+
..Default::default()
|
|
698
|
+
};
|
|
699
|
+
|
|
700
|
+
let result = extract_file(
|
|
701
|
+
temp_file.path(),
|
|
702
|
+
Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
|
|
703
|
+
&config,
|
|
704
|
+
)
|
|
705
|
+
.await;
|
|
706
|
+
|
|
707
|
+
match result {
|
|
708
|
+
Ok(extraction) => {
|
|
709
|
+
// Verify text extraction works
|
|
710
|
+
assert!(extraction.content.contains("Slide 1"), "Should extract slide 1 text");
|
|
711
|
+
assert!(extraction.content.contains("Slide 2"), "Should extract slide 2 text");
|
|
712
|
+
|
|
713
|
+
// Verify we got an image
|
|
714
|
+
let images = extraction.images.as_ref().expect("Images should be present");
|
|
715
|
+
assert!(!images.is_empty(), "Should extract at least one image");
|
|
716
|
+
|
|
717
|
+
// THE CRITICAL TEST: Image on slide 1 should have page_number=1, NOT 2
|
|
718
|
+
let image = &images[0];
|
|
719
|
+
assert_eq!(
|
|
720
|
+
image.page_number,
|
|
721
|
+
Some(1),
|
|
722
|
+
"GitHub Issue #329: Image on slide 1 should have page_number=1, but got {:?}. \
|
|
723
|
+
The page numbers are reversed!",
|
|
724
|
+
image.page_number
|
|
725
|
+
);
|
|
726
|
+
|
|
727
|
+
println!("✅ PPTX image page numbers are correct!");
|
|
728
|
+
println!(" Image on slide 1 has page_number={:?}", image.page_number);
|
|
729
|
+
}
|
|
730
|
+
Err(e) => {
|
|
731
|
+
panic!("PPTX extraction failed: {:?}", e);
|
|
732
|
+
}
|
|
733
|
+
}
|
|
734
|
+
}
|
|
735
|
+
|
|
736
|
+
/// Test with actual user-provided PPTX file from GitHub Issue #329.
|
|
737
|
+
///
|
|
738
|
+
/// The user's file has slides listed in reverse order in presentation.xml.rels,
|
|
739
|
+
/// which caused images to have incorrect page numbers.
|
|
740
|
+
#[tokio::test]
|
|
741
|
+
async fn test_pptx_image_page_numbers_issue329_user_file() {
|
|
742
|
+
let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
|
|
743
|
+
.parent()
|
|
744
|
+
.expect("Operation failed")
|
|
745
|
+
.parent()
|
|
746
|
+
.expect("Operation failed");
|
|
747
|
+
let test_file = workspace_root.join("test_documents/presentations/pptx_reversed_slide_order_issue329.pptx");
|
|
748
|
+
|
|
749
|
+
if !test_file.exists() {
|
|
750
|
+
println!("Skipping test: User file not found at {:?}", test_file);
|
|
751
|
+
return;
|
|
752
|
+
}
|
|
753
|
+
|
|
754
|
+
// Extract with images enabled
|
|
755
|
+
let config = ExtractionConfig {
|
|
756
|
+
images: Some(ImageExtractionConfig {
|
|
757
|
+
extract_images: true,
|
|
758
|
+
target_dpi: 300,
|
|
759
|
+
max_image_dimension: 4096,
|
|
760
|
+
auto_adjust_dpi: true,
|
|
761
|
+
min_dpi: 72,
|
|
762
|
+
max_dpi: 600,
|
|
763
|
+
}),
|
|
764
|
+
..Default::default()
|
|
765
|
+
};
|
|
766
|
+
|
|
767
|
+
let result = extract_file(&test_file, None, &config).await;
|
|
768
|
+
|
|
769
|
+
match result {
|
|
770
|
+
Ok(extraction) => {
|
|
771
|
+
// The user's file has an image on slide 1
|
|
772
|
+
let images = extraction.images.as_ref().expect("Images should be extracted");
|
|
773
|
+
|
|
774
|
+
if images.is_empty() {
|
|
775
|
+
println!("No images extracted from user file (may not have embedded images)");
|
|
776
|
+
return;
|
|
777
|
+
}
|
|
778
|
+
|
|
779
|
+
// All images should have page_number = 1 since they're on the first slide
|
|
780
|
+
for (idx, image) in images.iter().enumerate() {
|
|
781
|
+
assert_eq!(
|
|
782
|
+
image.page_number,
|
|
783
|
+
Some(1),
|
|
784
|
+
"GitHub Issue #329: Image {} should have page_number=1, but got {:?}",
|
|
785
|
+
idx,
|
|
786
|
+
image.page_number
|
|
787
|
+
);
|
|
788
|
+
}
|
|
789
|
+
|
|
790
|
+
println!("✅ User file from Issue #329 - image page numbers correct!");
|
|
791
|
+
println!(" Found {} images, all with page_number=1", images.len());
|
|
792
|
+
}
|
|
793
|
+
Err(e) => {
|
|
794
|
+
panic!("Failed to extract user file: {:?}", e);
|
|
795
|
+
}
|
|
796
|
+
}
|
|
797
|
+
}
|
|
@@ -184,9 +184,9 @@ fn test_register_multiple_validators_succeeds() {
|
|
|
184
184
|
should_fail: true,
|
|
185
185
|
});
|
|
186
186
|
|
|
187
|
-
registry.register(v1).
|
|
188
|
-
registry.register(v2).
|
|
189
|
-
registry.register(v3).
|
|
187
|
+
registry.register(v1).expect("Operation failed");
|
|
188
|
+
registry.register(v2).expect("Operation failed");
|
|
189
|
+
registry.register(v3).expect("Operation failed");
|
|
190
190
|
|
|
191
191
|
let list = registry.list();
|
|
192
192
|
assert_eq!(list.len(), 3, "Should have three validators");
|
|
@@ -205,7 +205,7 @@ fn test_validator_unregistration_succeeds() {
|
|
|
205
205
|
should_fail: false,
|
|
206
206
|
});
|
|
207
207
|
|
|
208
|
-
registry.register(validator).
|
|
208
|
+
registry.register(validator).expect("Operation failed");
|
|
209
209
|
assert_eq!(registry.list().len(), 1);
|
|
210
210
|
|
|
211
211
|
let result = registry.remove("temp-validator");
|
|
@@ -298,8 +298,8 @@ fn test_clear_validators_succeeds() {
|
|
|
298
298
|
should_fail: false,
|
|
299
299
|
});
|
|
300
300
|
|
|
301
|
-
registry.register(v1).
|
|
302
|
-
registry.register(v2).
|
|
301
|
+
registry.register(v1).expect("Operation failed");
|
|
302
|
+
registry.register(v2).expect("Operation failed");
|
|
303
303
|
assert_eq!(registry.list().len(), 2);
|
|
304
304
|
|
|
305
305
|
let result = registry.shutdown_all();
|
|
@@ -355,9 +355,9 @@ fn test_get_all_validators_respects_priority() {
|
|
|
355
355
|
priority: 100,
|
|
356
356
|
});
|
|
357
357
|
|
|
358
|
-
registry.register(medium).
|
|
359
|
-
registry.register(low).
|
|
360
|
-
registry.register(high).
|
|
358
|
+
registry.register(medium).expect("Operation failed");
|
|
359
|
+
registry.register(low).expect("Operation failed");
|
|
360
|
+
registry.register(high).expect("Operation failed");
|
|
361
361
|
|
|
362
362
|
let all = registry.get_all();
|
|
363
363
|
assert_eq!(all.len(), 3, "Should have three validators");
|
|
@@ -397,11 +397,11 @@ fn test_get_extractor_by_mime_type_succeeds() {
|
|
|
397
397
|
priority: 50,
|
|
398
398
|
});
|
|
399
399
|
|
|
400
|
-
registry.register(extractor).
|
|
400
|
+
registry.register(extractor).expect("Operation failed");
|
|
401
401
|
|
|
402
402
|
let result = registry.get("application/pdf");
|
|
403
403
|
assert!(result.is_ok(), "Should find extractor for PDF");
|
|
404
|
-
assert_eq!(result.
|
|
404
|
+
assert_eq!(result.expect("Operation failed").name(), "pdf-extractor");
|
|
405
405
|
}
|
|
406
406
|
|
|
407
407
|
/// Test extractor not found for unsupported MIME type.
|
|
@@ -437,10 +437,10 @@ fn test_extractor_priority_selection() {
|
|
|
437
437
|
priority: 100,
|
|
438
438
|
});
|
|
439
439
|
|
|
440
|
-
registry.register(low_priority).
|
|
441
|
-
registry.register(high_priority).
|
|
440
|
+
registry.register(low_priority).expect("Operation failed");
|
|
441
|
+
registry.register(high_priority).expect("Operation failed");
|
|
442
442
|
|
|
443
|
-
let result = registry.get("text/plain").
|
|
443
|
+
let result = registry.get("text/plain").expect("Value not found");
|
|
444
444
|
assert_eq!(
|
|
445
445
|
result.name(),
|
|
446
446
|
"high-priority-extractor",
|
|
@@ -459,15 +459,15 @@ fn test_extractor_wildcard_mime_matching() {
|
|
|
459
459
|
priority: 50,
|
|
460
460
|
});
|
|
461
461
|
|
|
462
|
-
registry.register(extractor).
|
|
462
|
+
registry.register(extractor).expect("Operation failed");
|
|
463
463
|
|
|
464
464
|
let result = registry.get("text/plain");
|
|
465
465
|
assert!(result.is_ok(), "Should match text/plain with text/*");
|
|
466
|
-
assert_eq!(result.
|
|
466
|
+
assert_eq!(result.expect("Operation failed").name(), "text-extractor");
|
|
467
467
|
|
|
468
468
|
let result = registry.get("text/html");
|
|
469
469
|
assert!(result.is_ok(), "Should match text/html with text/*");
|
|
470
|
-
assert_eq!(result.
|
|
470
|
+
assert_eq!(result.expect("Operation failed").name(), "text-extractor");
|
|
471
471
|
|
|
472
472
|
let result = registry.get("application/pdf");
|
|
473
473
|
assert!(result.is_err(), "Should not match application/pdf with text/*");
|
|
@@ -484,7 +484,7 @@ fn test_extractor_unregistration_succeeds() {
|
|
|
484
484
|
priority: 50,
|
|
485
485
|
});
|
|
486
486
|
|
|
487
|
-
registry.register(extractor).
|
|
487
|
+
registry.register(extractor).expect("Operation failed");
|
|
488
488
|
assert_eq!(registry.list().len(), 1);
|
|
489
489
|
|
|
490
490
|
let result = registry.remove("temp-extractor");
|
|
@@ -506,17 +506,20 @@ fn test_extractor_multiple_mime_types() {
|
|
|
506
506
|
priority: 50,
|
|
507
507
|
});
|
|
508
508
|
|
|
509
|
-
registry.register(extractor).
|
|
509
|
+
registry.register(extractor).expect("Operation failed");
|
|
510
510
|
|
|
511
511
|
assert!(registry.get("application/pdf").is_ok());
|
|
512
512
|
assert!(registry.get("application/vnd.ms-excel").is_ok());
|
|
513
513
|
assert!(registry.get("text/csv").is_ok());
|
|
514
514
|
|
|
515
515
|
assert_eq!(
|
|
516
|
-
registry.get("application/pdf").
|
|
516
|
+
registry.get("application/pdf").expect("Value not found").name(),
|
|
517
|
+
"multi-format-extractor"
|
|
518
|
+
);
|
|
519
|
+
assert_eq!(
|
|
520
|
+
registry.get("text/csv").expect("Value not found").name(),
|
|
517
521
|
"multi-format-extractor"
|
|
518
522
|
);
|
|
519
|
-
assert_eq!(registry.get("text/csv").unwrap().name(), "multi-format-extractor");
|
|
520
523
|
}
|
|
521
524
|
|
|
522
525
|
/// Test clearing all extractors.
|
|
@@ -535,8 +538,8 @@ fn test_clear_extractors_succeeds() {
|
|
|
535
538
|
priority: 50,
|
|
536
539
|
});
|
|
537
540
|
|
|
538
|
-
registry.register(e1).
|
|
539
|
-
registry.register(e2).
|
|
541
|
+
registry.register(e1).expect("Operation failed");
|
|
542
|
+
registry.register(e2).expect("Operation failed");
|
|
540
543
|
assert_eq!(registry.list().len(), 2);
|
|
541
544
|
|
|
542
545
|
let result = registry.shutdown_all();
|
|
@@ -686,7 +686,7 @@ async fn test_rst_extraction_no_errors() {
|
|
|
686
686
|
result.err()
|
|
687
687
|
);
|
|
688
688
|
|
|
689
|
-
let extraction = result.
|
|
689
|
+
let extraction = result.expect("Operation failed");
|
|
690
690
|
|
|
691
691
|
assert!(!extraction.content.is_empty(), "Extracted content should not be empty");
|
|
692
692
|
|