kreuzberg 4.1.2 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
  5. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  6. data/kreuzberg.gemspec +13 -1
  7. data/lib/kreuzberg/cli.rb +16 -6
  8. data/lib/kreuzberg/cli_proxy.rb +3 -1
  9. data/lib/kreuzberg/config.rb +121 -39
  10. data/lib/kreuzberg/djot_content.rb +225 -0
  11. data/lib/kreuzberg/extraction_api.rb +20 -4
  12. data/lib/kreuzberg/result.rb +12 -2
  13. data/lib/kreuzberg/version.rb +1 -1
  14. data/lib/kreuzberg.rb +1 -0
  15. data/sig/kreuzberg.rbs +28 -12
  16. data/spec/binding/batch_operations_spec.rb +80 -0
  17. data/spec/binding/batch_spec.rb +6 -5
  18. data/spec/binding/error_recovery_spec.rb +3 -3
  19. data/spec/binding/metadata_types_spec.rb +77 -57
  20. data/spec/binding/tables_spec.rb +11 -2
  21. data/spec/serialization_spec.rb +134 -0
  22. data/spec/unit/config/output_format_spec.rb +380 -0
  23. data/vendor/Cargo.toml +1 -1
  24. data/vendor/kreuzberg/Cargo.toml +1 -1
  25. data/vendor/kreuzberg/README.md +1 -1
  26. data/vendor/kreuzberg/src/api/startup.rs +15 -1
  27. data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
  28. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
  29. data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
  30. data/vendor/kreuzberg/src/core/io.rs +7 -7
  31. data/vendor/kreuzberg/src/core/mime.rs +4 -4
  32. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  33. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
  34. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  35. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  36. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  37. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  38. data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
  39. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
  40. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
  41. data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
  42. data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
  43. data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
  44. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  45. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  46. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  47. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  48. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  49. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  50. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  51. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  52. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  53. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  54. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  55. data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
  56. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  57. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  58. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  59. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  60. data/vendor/kreuzberg/tests/core_integration.rs +57 -57
  61. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  62. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  63. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  64. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  65. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  66. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  67. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  68. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  69. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  70. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  71. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  72. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  73. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  74. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  75. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  76. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  77. data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
  78. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  79. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  80. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  81. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  82. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  83. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  84. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  85. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  86. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  87. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  88. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  89. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  90. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  91. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  92. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
  93. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  94. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  95. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  96. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  97. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  98. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  99. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  100. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  101. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  102. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  103. metadata +12 -2
@@ -1,13 +1,15 @@
1
1
  //! Regression tests for PPTX/PPSX extraction bugs
2
2
  //!
3
3
  //! GitHub Issue #321: PPTX extraction fails on shapes without txBody (image placeholders) + PPSX not supported
4
+ //! GitHub Issue #329: Extracting images from PPTX results in reversed page numbers
4
5
  //!
5
6
  //! Bug 1: "No txBody found" - PPTX extraction fails when any shape lacks a text body
6
7
  //! Bug 2: PPSX not supported - PowerPoint Show files rejected entirely
8
+ //! Bug 3: Image page numbers reversed - image on slide 1 reports page_number=2
7
9
 
8
10
  #![cfg(feature = "office")]
9
11
 
10
- use kreuzberg::{ExtractionConfig, extract_file};
12
+ use kreuzberg::{ExtractionConfig, ImageExtractionConfig, extract_file};
11
13
  use std::io::Write;
12
14
  use tempfile::NamedTempFile;
13
15
  use zip::CompressionMethod;
@@ -25,9 +27,9 @@ use zip::write::{FileOptions, ZipWriter};
25
27
  async fn test_ppsx_slideshow_extraction() {
26
28
  let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
27
29
  .parent()
28
- .unwrap()
30
+ .expect("Operation failed")
29
31
  .parent()
30
- .unwrap();
32
+ .expect("Operation failed");
31
33
  let test_file = workspace_root.join("test_documents/presentations/sample.ppsx");
32
34
 
33
35
  if !test_file.exists() {
@@ -69,9 +71,9 @@ async fn test_ppsx_slideshow_extraction() {
69
71
  async fn test_ppsx_with_explicit_mime_type() {
70
72
  let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
71
73
  .parent()
72
- .unwrap()
74
+ .expect("Operation failed")
73
75
  .parent()
74
- .unwrap();
76
+ .expect("Operation failed");
75
77
  let test_file = workspace_root.join("test_documents/presentations/sample.ppsx");
76
78
 
77
79
  if !test_file.exists() {
@@ -120,24 +122,26 @@ async fn test_pptx_with_image_placeholder_no_txbody() {
120
122
  let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
121
123
 
122
124
  // Add [Content_Types].xml
123
- zip.start_file("[Content_Types].xml", options).unwrap();
125
+ zip.start_file("[Content_Types].xml", options)
126
+ .expect("Operation failed");
124
127
  zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
125
128
  <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
126
129
  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
127
130
  <Default Extension="xml" ContentType="application/xml"/>
128
131
  <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
129
132
  <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
130
- </Types>"#).unwrap();
133
+ </Types>"#).expect("Operation failed");
131
134
 
132
135
  // Add _rels/.rels
133
- zip.start_file("_rels/.rels", options).unwrap();
136
+ zip.start_file("_rels/.rels", options).expect("Operation failed");
134
137
  zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
135
138
  <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
136
139
  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
137
- </Relationships>"#).unwrap();
140
+ </Relationships>"#).expect("Operation failed");
138
141
 
139
142
  // Add ppt/presentation.xml
140
- zip.start_file("ppt/presentation.xml", options).unwrap();
143
+ zip.start_file("ppt/presentation.xml", options)
144
+ .expect("Operation failed");
141
145
  zip.write_all(
142
146
  br#"<?xml version="1.0" encoding="UTF-8"?>
143
147
  <p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
@@ -148,18 +152,20 @@ async fn test_pptx_with_image_placeholder_no_txbody() {
148
152
  </p:sldIdLst>
149
153
  </p:presentation>"#,
150
154
  )
151
- .unwrap();
155
+ .expect("Operation failed");
152
156
 
153
157
  // Add ppt/_rels/presentation.xml.rels
154
- zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
158
+ zip.start_file("ppt/_rels/presentation.xml.rels", options)
159
+ .expect("Operation failed");
155
160
  zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
156
161
  <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
157
162
  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
158
- </Relationships>"#).unwrap();
163
+ </Relationships>"#).expect("Operation failed");
159
164
 
160
165
  // Add ppt/slides/slide1.xml with a shape WITHOUT txBody (image placeholder)
161
166
  // This is the critical test case - a <p:sp> element with no <p:txBody>
162
- zip.start_file("ppt/slides/slide1.xml", options).unwrap();
167
+ zip.start_file("ppt/slides/slide1.xml", options)
168
+ .expect("Operation failed");
163
169
  zip.write_all(
164
170
  br#"<?xml version="1.0" encoding="UTF-8"?>
165
171
  <p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
@@ -259,18 +265,19 @@ async fn test_pptx_with_image_placeholder_no_txbody() {
259
265
  </p:cSld>
260
266
  </p:sld>"#,
261
267
  )
262
- .unwrap();
268
+ .expect("Operation failed");
263
269
 
264
270
  // Add ppt/slides/_rels/slide1.xml.rels (empty)
265
- zip.start_file("ppt/slides/_rels/slide1.xml.rels", options).unwrap();
271
+ zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
272
+ .expect("Operation failed");
266
273
  zip.write_all(
267
274
  br#"<?xml version="1.0" encoding="UTF-8"?>
268
275
  <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
269
276
  </Relationships>"#,
270
277
  )
271
- .unwrap();
278
+ .expect("Operation failed");
272
279
 
273
- zip.finish().unwrap();
280
+ zip.finish().expect("Operation failed");
274
281
  }
275
282
 
276
283
  // Extract the PPTX file
@@ -336,24 +343,26 @@ async fn test_pptx_mixed_shapes_extraction() {
336
343
  let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
337
344
 
338
345
  // Add [Content_Types].xml
339
- zip.start_file("[Content_Types].xml", options).unwrap();
346
+ zip.start_file("[Content_Types].xml", options)
347
+ .expect("Operation failed");
340
348
  zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
341
349
  <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
342
350
  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
343
351
  <Default Extension="xml" ContentType="application/xml"/>
344
352
  <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
345
353
  <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
346
- </Types>"#).unwrap();
354
+ </Types>"#).expect("Operation failed");
347
355
 
348
356
  // Add _rels/.rels
349
- zip.start_file("_rels/.rels", options).unwrap();
357
+ zip.start_file("_rels/.rels", options).expect("Operation failed");
350
358
  zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
351
359
  <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
352
360
  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
353
- </Relationships>"#).unwrap();
361
+ </Relationships>"#).expect("Operation failed");
354
362
 
355
363
  // Add ppt/presentation.xml
356
- zip.start_file("ppt/presentation.xml", options).unwrap();
364
+ zip.start_file("ppt/presentation.xml", options)
365
+ .expect("Operation failed");
357
366
  zip.write_all(
358
367
  br#"<?xml version="1.0" encoding="UTF-8"?>
359
368
  <p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
@@ -364,17 +373,19 @@ async fn test_pptx_mixed_shapes_extraction() {
364
373
  </p:sldIdLst>
365
374
  </p:presentation>"#,
366
375
  )
367
- .unwrap();
376
+ .expect("Operation failed");
368
377
 
369
378
  // Add ppt/_rels/presentation.xml.rels
370
- zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
379
+ zip.start_file("ppt/_rels/presentation.xml.rels", options)
380
+ .expect("Operation failed");
371
381
  zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
372
382
  <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
373
383
  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
374
- </Relationships>"#).unwrap();
384
+ </Relationships>"#).expect("Operation failed");
375
385
 
376
386
  // Add slide with various shapes - some with txBody, some without
377
- zip.start_file("ppt/slides/slide1.xml", options).unwrap();
387
+ zip.start_file("ppt/slides/slide1.xml", options)
388
+ .expect("Operation failed");
378
389
  zip.write_all(
379
390
  br#"<?xml version="1.0" encoding="UTF-8"?>
380
391
  <p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
@@ -453,18 +464,19 @@ async fn test_pptx_mixed_shapes_extraction() {
453
464
  </p:cSld>
454
465
  </p:sld>"#,
455
466
  )
456
- .unwrap();
467
+ .expect("Operation failed");
457
468
 
458
469
  // Add empty rels
459
- zip.start_file("ppt/slides/_rels/slide1.xml.rels", options).unwrap();
470
+ zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
471
+ .expect("Operation failed");
460
472
  zip.write_all(
461
473
  br#"<?xml version="1.0" encoding="UTF-8"?>
462
474
  <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
463
475
  </Relationships>"#,
464
476
  )
465
- .unwrap();
477
+ .expect("Operation failed");
466
478
 
467
- zip.finish().unwrap();
479
+ zip.finish().expect("Operation failed");
468
480
  }
469
481
 
470
482
  let result = extract_file(
@@ -502,3 +514,284 @@ async fn test_pptx_mixed_shapes_extraction() {
502
514
  }
503
515
  }
504
516
  }
517
+
518
+ /// Test that images extracted from PPTX have correct page numbers.
519
+ ///
520
+ /// When a PPTX has multiple slides and an image on slide 1, the extracted image
521
+ /// should have page_number=1 (not reversed).
522
+ ///
523
+ /// GitHub Issue #329: Image on slide 1 of 2-slide PPTX reports page_number=2
524
+ #[tokio::test]
525
+ async fn test_pptx_image_page_numbers_not_reversed() {
526
+ // Create a PPTX with 2 slides, image on slide 1
527
+ let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
528
+
529
+ // A minimal 1x1 red PNG image (valid PNG format)
530
+ let png_image: &[u8] = &[
531
+ 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG signature
532
+ 0x00, 0x00, 0x00, 0x0D, // IHDR chunk length
533
+ 0x49, 0x48, 0x44, 0x52, // "IHDR"
534
+ 0x00, 0x00, 0x00, 0x01, // width: 1
535
+ 0x00, 0x00, 0x00, 0x01, // height: 1
536
+ 0x08, 0x02, // bit depth: 8, color type: RGB
537
+ 0x00, 0x00, 0x00, // compression, filter, interlace
538
+ 0x90, 0x77, 0x53, 0xDE, // IHDR CRC
539
+ 0x00, 0x00, 0x00, 0x0C, // IDAT chunk length
540
+ 0x49, 0x44, 0x41, 0x54, // "IDAT"
541
+ 0x08, 0xD7, 0x63, 0xF8, 0xCF, 0xC0, 0x00, 0x00, // compressed data
542
+ 0x01, 0x01, 0x01, 0x00, // checksum
543
+ 0x18, 0xDD, 0x8D, 0xB4, // IDAT CRC
544
+ 0x00, 0x00, 0x00, 0x00, // IEND chunk length
545
+ 0x49, 0x45, 0x4E, 0x44, // "IEND"
546
+ 0xAE, 0x42, 0x60, 0x82, // IEND CRC
547
+ ];
548
+
549
+ {
550
+ let mut zip = ZipWriter::new(&mut temp_file);
551
+ let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
552
+
553
+ // Add [Content_Types].xml
554
+ zip.start_file("[Content_Types].xml", options)
555
+ .expect("Operation failed");
556
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
557
+ <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
558
+ <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
559
+ <Default Extension="xml" ContentType="application/xml"/>
560
+ <Default Extension="png" ContentType="image/png"/>
561
+ <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
562
+ <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
563
+ <Override PartName="/ppt/slides/slide2.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
564
+ </Types>"#).expect("Operation failed");
565
+
566
+ // Add _rels/.rels
567
+ zip.start_file("_rels/.rels", options).expect("Operation failed");
568
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
569
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
570
+ <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
571
+ </Relationships>"#).expect("Operation failed");
572
+
573
+ // Add ppt/presentation.xml
574
+ zip.start_file("ppt/presentation.xml", options)
575
+ .expect("Operation failed");
576
+ zip.write_all(
577
+ br#"<?xml version="1.0" encoding="UTF-8"?>
578
+ <p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
579
+ xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
580
+ <p:sldIdLst>
581
+ <p:sldId id="256" r:id="rId2"/>
582
+ <p:sldId id="257" r:id="rId3"/>
583
+ </p:sldIdLst>
584
+ </p:presentation>"#,
585
+ )
586
+ .expect("Operation failed");
587
+
588
+ // Add ppt/_rels/presentation.xml.rels
589
+ // BUG REPRODUCTION: Slides listed in REVERSE order in XML (slide2 before slide1)
590
+ // This is valid XML - PowerPoint doesn't guarantee order in rels files
591
+ // GitHub Issue #329: This causes page numbers to be reversed
592
+ zip.start_file("ppt/_rels/presentation.xml.rels", options)
593
+ .expect("Operation failed");
594
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
595
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
596
+ <Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide2.xml"/>
597
+ <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
598
+ </Relationships>"#).expect("Operation failed");
599
+
600
+ // Add the image file
601
+ zip.start_file("ppt/media/image1.png", options)
602
+ .expect("Operation failed");
603
+ zip.write_all(png_image).expect("Operation failed");
604
+
605
+ // Add slide 1 WITH an image
606
+ zip.start_file("ppt/slides/slide1.xml", options)
607
+ .expect("Operation failed");
608
+ zip.write_all(
609
+ br#"<?xml version="1.0" encoding="UTF-8"?>
610
+ <p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
611
+ xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
612
+ xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
613
+ <p:cSld>
614
+ <p:spTree>
615
+ <p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
616
+ <p:grpSpPr/>
617
+ <p:sp>
618
+ <p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
619
+ <p:spPr/>
620
+ <p:txBody><a:bodyPr/><a:lstStyle/><a:p><a:r><a:t>Slide 1 - Has Image</a:t></a:r></a:p></p:txBody>
621
+ </p:sp>
622
+ <p:pic>
623
+ <p:nvPicPr>
624
+ <p:cNvPr id="3" name="Picture 1"/>
625
+ <p:cNvPicPr><a:picLocks noChangeAspect="1"/></p:cNvPicPr>
626
+ <p:nvPr/>
627
+ </p:nvPicPr>
628
+ <p:blipFill>
629
+ <a:blip r:embed="rId2"/>
630
+ <a:stretch><a:fillRect/></a:stretch>
631
+ </p:blipFill>
632
+ <p:spPr>
633
+ <a:xfrm><a:off x="0" y="0"/><a:ext cx="100000" cy="100000"/></a:xfrm>
634
+ <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
635
+ </p:spPr>
636
+ </p:pic>
637
+ </p:spTree>
638
+ </p:cSld>
639
+ </p:sld>"#,
640
+ )
641
+ .expect("Operation failed");
642
+
643
+ // Add slide 1 relationships (points to the image)
644
+ zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
645
+ .expect("Operation failed");
646
+ zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
647
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
648
+ <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image1.png"/>
649
+ </Relationships>"#).expect("Operation failed");
650
+
651
+ // Add slide 2 WITHOUT an image
652
+ zip.start_file("ppt/slides/slide2.xml", options)
653
+ .expect("Operation failed");
654
+ zip.write_all(
655
+ br#"<?xml version="1.0" encoding="UTF-8"?>
656
+ <p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
657
+ xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
658
+ xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
659
+ <p:cSld>
660
+ <p:spTree>
661
+ <p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
662
+ <p:grpSpPr/>
663
+ <p:sp>
664
+ <p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
665
+ <p:spPr/>
666
+ <p:txBody><a:bodyPr/><a:lstStyle/><a:p><a:r><a:t>Slide 2 - No Image</a:t></a:r></a:p></p:txBody>
667
+ </p:sp>
668
+ </p:spTree>
669
+ </p:cSld>
670
+ </p:sld>"#,
671
+ )
672
+ .expect("Operation failed");
673
+
674
+ // Add empty slide 2 relationships
675
+ zip.start_file("ppt/slides/_rels/slide2.xml.rels", options)
676
+ .expect("Operation failed");
677
+ zip.write_all(
678
+ br#"<?xml version="1.0" encoding="UTF-8"?>
679
+ <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
680
+ </Relationships>"#,
681
+ )
682
+ .expect("Operation failed");
683
+
684
+ zip.finish().expect("Operation failed");
685
+ }
686
+
687
+ // Extract with images enabled
688
+ let config = ExtractionConfig {
689
+ images: Some(ImageExtractionConfig {
690
+ extract_images: true,
691
+ target_dpi: 300,
692
+ max_image_dimension: 4096,
693
+ auto_adjust_dpi: true,
694
+ min_dpi: 72,
695
+ max_dpi: 600,
696
+ }),
697
+ ..Default::default()
698
+ };
699
+
700
+ let result = extract_file(
701
+ temp_file.path(),
702
+ Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
703
+ &config,
704
+ )
705
+ .await;
706
+
707
+ match result {
708
+ Ok(extraction) => {
709
+ // Verify text extraction works
710
+ assert!(extraction.content.contains("Slide 1"), "Should extract slide 1 text");
711
+ assert!(extraction.content.contains("Slide 2"), "Should extract slide 2 text");
712
+
713
+ // Verify we got an image
714
+ let images = extraction.images.as_ref().expect("Images should be present");
715
+ assert!(!images.is_empty(), "Should extract at least one image");
716
+
717
+ // THE CRITICAL TEST: Image on slide 1 should have page_number=1, NOT 2
718
+ let image = &images[0];
719
+ assert_eq!(
720
+ image.page_number,
721
+ Some(1),
722
+ "GitHub Issue #329: Image on slide 1 should have page_number=1, but got {:?}. \
723
+ The page numbers are reversed!",
724
+ image.page_number
725
+ );
726
+
727
+ println!("✅ PPTX image page numbers are correct!");
728
+ println!(" Image on slide 1 has page_number={:?}", image.page_number);
729
+ }
730
+ Err(e) => {
731
+ panic!("PPTX extraction failed: {:?}", e);
732
+ }
733
+ }
734
+ }
735
+
736
+ /// Test with actual user-provided PPTX file from GitHub Issue #329.
737
+ ///
738
+ /// The user's file has slides listed in reverse order in presentation.xml.rels,
739
+ /// which caused images to have incorrect page numbers.
740
+ #[tokio::test]
741
+ async fn test_pptx_image_page_numbers_issue329_user_file() {
742
+ let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
743
+ .parent()
744
+ .expect("Operation failed")
745
+ .parent()
746
+ .expect("Operation failed");
747
+ let test_file = workspace_root.join("test_documents/presentations/pptx_reversed_slide_order_issue329.pptx");
748
+
749
+ if !test_file.exists() {
750
+ println!("Skipping test: User file not found at {:?}", test_file);
751
+ return;
752
+ }
753
+
754
+ // Extract with images enabled
755
+ let config = ExtractionConfig {
756
+ images: Some(ImageExtractionConfig {
757
+ extract_images: true,
758
+ target_dpi: 300,
759
+ max_image_dimension: 4096,
760
+ auto_adjust_dpi: true,
761
+ min_dpi: 72,
762
+ max_dpi: 600,
763
+ }),
764
+ ..Default::default()
765
+ };
766
+
767
+ let result = extract_file(&test_file, None, &config).await;
768
+
769
+ match result {
770
+ Ok(extraction) => {
771
+ // The user's file has an image on slide 1
772
+ let images = extraction.images.as_ref().expect("Images should be extracted");
773
+
774
+ if images.is_empty() {
775
+ println!("No images extracted from user file (may not have embedded images)");
776
+ return;
777
+ }
778
+
779
+ // All images should have page_number = 1 since they're on the first slide
780
+ for (idx, image) in images.iter().enumerate() {
781
+ assert_eq!(
782
+ image.page_number,
783
+ Some(1),
784
+ "GitHub Issue #329: Image {} should have page_number=1, but got {:?}",
785
+ idx,
786
+ image.page_number
787
+ );
788
+ }
789
+
790
+ println!("✅ User file from Issue #329 - image page numbers correct!");
791
+ println!(" Found {} images, all with page_number=1", images.len());
792
+ }
793
+ Err(e) => {
794
+ panic!("Failed to extract user file: {:?}", e);
795
+ }
796
+ }
797
+ }
@@ -184,9 +184,9 @@ fn test_register_multiple_validators_succeeds() {
184
184
  should_fail: true,
185
185
  });
186
186
 
187
- registry.register(v1).unwrap();
188
- registry.register(v2).unwrap();
189
- registry.register(v3).unwrap();
187
+ registry.register(v1).expect("Operation failed");
188
+ registry.register(v2).expect("Operation failed");
189
+ registry.register(v3).expect("Operation failed");
190
190
 
191
191
  let list = registry.list();
192
192
  assert_eq!(list.len(), 3, "Should have three validators");
@@ -205,7 +205,7 @@ fn test_validator_unregistration_succeeds() {
205
205
  should_fail: false,
206
206
  });
207
207
 
208
- registry.register(validator).unwrap();
208
+ registry.register(validator).expect("Operation failed");
209
209
  assert_eq!(registry.list().len(), 1);
210
210
 
211
211
  let result = registry.remove("temp-validator");
@@ -298,8 +298,8 @@ fn test_clear_validators_succeeds() {
298
298
  should_fail: false,
299
299
  });
300
300
 
301
- registry.register(v1).unwrap();
302
- registry.register(v2).unwrap();
301
+ registry.register(v1).expect("Operation failed");
302
+ registry.register(v2).expect("Operation failed");
303
303
  assert_eq!(registry.list().len(), 2);
304
304
 
305
305
  let result = registry.shutdown_all();
@@ -355,9 +355,9 @@ fn test_get_all_validators_respects_priority() {
355
355
  priority: 100,
356
356
  });
357
357
 
358
- registry.register(medium).unwrap();
359
- registry.register(low).unwrap();
360
- registry.register(high).unwrap();
358
+ registry.register(medium).expect("Operation failed");
359
+ registry.register(low).expect("Operation failed");
360
+ registry.register(high).expect("Operation failed");
361
361
 
362
362
  let all = registry.get_all();
363
363
  assert_eq!(all.len(), 3, "Should have three validators");
@@ -397,11 +397,11 @@ fn test_get_extractor_by_mime_type_succeeds() {
397
397
  priority: 50,
398
398
  });
399
399
 
400
- registry.register(extractor).unwrap();
400
+ registry.register(extractor).expect("Operation failed");
401
401
 
402
402
  let result = registry.get("application/pdf");
403
403
  assert!(result.is_ok(), "Should find extractor for PDF");
404
- assert_eq!(result.unwrap().name(), "pdf-extractor");
404
+ assert_eq!(result.expect("Operation failed").name(), "pdf-extractor");
405
405
  }
406
406
 
407
407
  /// Test extractor not found for unsupported MIME type.
@@ -437,10 +437,10 @@ fn test_extractor_priority_selection() {
437
437
  priority: 100,
438
438
  });
439
439
 
440
- registry.register(low_priority).unwrap();
441
- registry.register(high_priority).unwrap();
440
+ registry.register(low_priority).expect("Operation failed");
441
+ registry.register(high_priority).expect("Operation failed");
442
442
 
443
- let result = registry.get("text/plain").unwrap();
443
+ let result = registry.get("text/plain").expect("Value not found");
444
444
  assert_eq!(
445
445
  result.name(),
446
446
  "high-priority-extractor",
@@ -459,15 +459,15 @@ fn test_extractor_wildcard_mime_matching() {
459
459
  priority: 50,
460
460
  });
461
461
 
462
- registry.register(extractor).unwrap();
462
+ registry.register(extractor).expect("Operation failed");
463
463
 
464
464
  let result = registry.get("text/plain");
465
465
  assert!(result.is_ok(), "Should match text/plain with text/*");
466
- assert_eq!(result.unwrap().name(), "text-extractor");
466
+ assert_eq!(result.expect("Operation failed").name(), "text-extractor");
467
467
 
468
468
  let result = registry.get("text/html");
469
469
  assert!(result.is_ok(), "Should match text/html with text/*");
470
- assert_eq!(result.unwrap().name(), "text-extractor");
470
+ assert_eq!(result.expect("Operation failed").name(), "text-extractor");
471
471
 
472
472
  let result = registry.get("application/pdf");
473
473
  assert!(result.is_err(), "Should not match application/pdf with text/*");
@@ -484,7 +484,7 @@ fn test_extractor_unregistration_succeeds() {
484
484
  priority: 50,
485
485
  });
486
486
 
487
- registry.register(extractor).unwrap();
487
+ registry.register(extractor).expect("Operation failed");
488
488
  assert_eq!(registry.list().len(), 1);
489
489
 
490
490
  let result = registry.remove("temp-extractor");
@@ -506,17 +506,20 @@ fn test_extractor_multiple_mime_types() {
506
506
  priority: 50,
507
507
  });
508
508
 
509
- registry.register(extractor).unwrap();
509
+ registry.register(extractor).expect("Operation failed");
510
510
 
511
511
  assert!(registry.get("application/pdf").is_ok());
512
512
  assert!(registry.get("application/vnd.ms-excel").is_ok());
513
513
  assert!(registry.get("text/csv").is_ok());
514
514
 
515
515
  assert_eq!(
516
- registry.get("application/pdf").unwrap().name(),
516
+ registry.get("application/pdf").expect("Value not found").name(),
517
+ "multi-format-extractor"
518
+ );
519
+ assert_eq!(
520
+ registry.get("text/csv").expect("Value not found").name(),
517
521
  "multi-format-extractor"
518
522
  );
519
- assert_eq!(registry.get("text/csv").unwrap().name(), "multi-format-extractor");
520
523
  }
521
524
 
522
525
  /// Test clearing all extractors.
@@ -535,8 +538,8 @@ fn test_clear_extractors_succeeds() {
535
538
  priority: 50,
536
539
  });
537
540
 
538
- registry.register(e1).unwrap();
539
- registry.register(e2).unwrap();
541
+ registry.register(e1).expect("Operation failed");
542
+ registry.register(e2).expect("Operation failed");
540
543
  assert_eq!(registry.list().len(), 2);
541
544
 
542
545
  let result = registry.shutdown_all();
@@ -686,7 +686,7 @@ async fn test_rst_extraction_no_errors() {
686
686
  result.err()
687
687
  );
688
688
 
689
- let extraction = result.unwrap();
689
+ let extraction = result.expect("Operation failed");
690
690
 
691
691
  assert!(!extraction.content.is_empty(), "Extracted content should not be empty");
692
692