RubyGems - kreuzberg - Versions diffs - 4.1.2 → 4.2.1 - Mend

kreuzberg 4.1.2 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (103)

checksums.yaml +4 -4
data/Gemfile.lock +2 -2
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
data/kreuzberg.gemspec +13 -1
data/lib/kreuzberg/cli.rb +16 -6
data/lib/kreuzberg/cli_proxy.rb +3 -1
data/lib/kreuzberg/config.rb +121 -39
data/lib/kreuzberg/djot_content.rb +225 -0
data/lib/kreuzberg/extraction_api.rb +20 -4
data/lib/kreuzberg/result.rb +12 -2
data/lib/kreuzberg/version.rb +1 -1
data/lib/kreuzberg.rb +1 -0
data/sig/kreuzberg.rbs +28 -12
data/spec/binding/batch_operations_spec.rb +80 -0
data/spec/binding/batch_spec.rb +6 -5
data/spec/binding/error_recovery_spec.rb +3 -3
data/spec/binding/metadata_types_spec.rb +77 -57
data/spec/binding/tables_spec.rb +11 -2
data/spec/serialization_spec.rb +134 -0
data/spec/unit/config/output_format_spec.rb +380 -0
data/vendor/Cargo.toml +1 -1
data/vendor/kreuzberg/Cargo.toml +1 -1
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/api/startup.rs +15 -1
data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
data/vendor/kreuzberg/src/core/io.rs +7 -7
data/vendor/kreuzberg/src/core/mime.rs +4 -4
data/vendor/kreuzberg/src/embeddings.rs +4 -4
data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
data/vendor/kreuzberg/src/mcp/format.rs +237 -39
data/vendor/kreuzberg/src/mcp/params.rs +26 -33
data/vendor/kreuzberg/src/mcp/server.rs +6 -3
data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
data/vendor/kreuzberg/tests/api_embed.rs +84 -50
data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
data/vendor/kreuzberg/tests/api_tests.rs +298 -139
data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
data/vendor/kreuzberg/tests/config_features.rs +19 -15
data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
data/vendor/kreuzberg/tests/core_integration.rs +57 -57
data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
data/vendor/kreuzberg/tests/email_integration.rs +7 -7
data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/error_handling.rs +13 -11
data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
data/vendor/kreuzberg/tests/page_markers.rs +1 -1
data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
data/vendor/kreuzberg/tests/security_validation.rs +20 -19
data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +12 -2

data/vendor/kreuzberg/tests/pptx_regression_tests.rs CHANGED

@@ -1,13 +1,15 @@
 //! Regression tests for PPTX/PPSX extraction bugs
 //!
 //! GitHub Issue #321: PPTX extraction fails on shapes without txBody (image placeholders) + PPSX not supported
+//! GitHub Issue #329: Extracting images from PPTX results in reversed page numbers
 //!
 //! Bug 1: "No txBody found" - PPTX extraction fails when any shape lacks a text body
 //! Bug 2: PPSX not supported - PowerPoint Show files rejected entirely
+//! Bug 3: Image page numbers reversed - image on slide 1 reports page_number=2
 #![cfg(feature = "office")]
-use kreuzberg::{ExtractionConfig, extract_file};
+use kreuzberg::{ExtractionConfig, ImageExtractionConfig, extract_file};
 use std::io::Write;
 use tempfile::NamedTempFile;
 use zip::CompressionMethod;
@@ -25,9 +27,9 @@ use zip::write::{FileOptions, ZipWriter};
 async fn test_ppsx_slideshow_extraction() {
     let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
         .parent()
-        .unwrap()
+        .expect("Operation failed")
         .parent()
-        .unwrap();
+        .expect("Operation failed");
     let test_file = workspace_root.join("test_documents/presentations/sample.ppsx");
     if !test_file.exists() {
@@ -69,9 +71,9 @@ async fn test_ppsx_slideshow_extraction() {
 async fn test_ppsx_with_explicit_mime_type() {
     let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
         .parent()
-        .unwrap()
+        .expect("Operation failed")
         .parent()
-        .unwrap();
+        .expect("Operation failed");
     let test_file = workspace_root.join("test_documents/presentations/sample.ppsx");
     if !test_file.exists() {
@@ -120,24 +122,26 @@ async fn test_pptx_with_image_placeholder_no_txbody() {
         let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
         // Add [Content_Types].xml
-        zip.start_file("[Content_Types].xml", options).unwrap();
+        zip.start_file("[Content_Types].xml", options)
+            .expect("Operation failed");
         zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
 <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
   <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
   <Default Extension="xml" ContentType="application/xml"/>
   <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
   <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
-</Types>"#).unwrap();
+</Types>"#).expect("Operation failed");
         // Add _rels/.rels
-        zip.start_file("_rels/.rels", options).unwrap();
+        zip.start_file("_rels/.rels", options).expect("Operation failed");
         zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
 <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
   <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
-</Relationships>"#).unwrap();
+</Relationships>"#).expect("Operation failed");
         // Add ppt/presentation.xml
-        zip.start_file("ppt/presentation.xml", options).unwrap();
+        zip.start_file("ppt/presentation.xml", options)
+            .expect("Operation failed");
         zip.write_all(
             br#"<?xml version="1.0" encoding="UTF-8"?>
 <p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
@@ -148,18 +152,20 @@ async fn test_pptx_with_image_placeholder_no_txbody() {
   </p:sldIdLst>
 </p:presentation>"#,
         )
-        .unwrap();
+        .expect("Operation failed");
         // Add ppt/_rels/presentation.xml.rels
-        zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
+        zip.start_file("ppt/_rels/presentation.xml.rels", options)
+            .expect("Operation failed");
         zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
 <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
   <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
-</Relationships>"#).unwrap();
+</Relationships>"#).expect("Operation failed");
         // Add ppt/slides/slide1.xml with a shape WITHOUT txBody (image placeholder)
         // This is the critical test case - a <p:sp> element with no <p:txBody>
-        zip.start_file("ppt/slides/slide1.xml", options).unwrap();
+        zip.start_file("ppt/slides/slide1.xml", options)
+            .expect("Operation failed");
         zip.write_all(
             br#"<?xml version="1.0" encoding="UTF-8"?>
 <p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
@@ -259,18 +265,19 @@ async fn test_pptx_with_image_placeholder_no_txbody() {
   </p:cSld>
 </p:sld>"#,
         )
-        .unwrap();
+        .expect("Operation failed");
         // Add ppt/slides/_rels/slide1.xml.rels (empty)
-        zip.start_file("ppt/slides/_rels/slide1.xml.rels", options).unwrap();
+        zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
+            .expect("Operation failed");
         zip.write_all(
             br#"<?xml version="1.0" encoding="UTF-8"?>
 <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
 </Relationships>"#,
         )
-        .unwrap();
+        .expect("Operation failed");
-        zip.finish().unwrap();
+        zip.finish().expect("Operation failed");
     }
     // Extract the PPTX file
@@ -336,24 +343,26 @@ async fn test_pptx_mixed_shapes_extraction() {
         let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
         // Add [Content_Types].xml
-        zip.start_file("[Content_Types].xml", options).unwrap();
+        zip.start_file("[Content_Types].xml", options)
+            .expect("Operation failed");
         zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
 <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
   <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
   <Default Extension="xml" ContentType="application/xml"/>
   <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
   <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
-</Types>"#).unwrap();
+</Types>"#).expect("Operation failed");
         // Add _rels/.rels
-        zip.start_file("_rels/.rels", options).unwrap();
+        zip.start_file("_rels/.rels", options).expect("Operation failed");
         zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
 <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
   <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
-</Relationships>"#).unwrap();
+</Relationships>"#).expect("Operation failed");
         // Add ppt/presentation.xml
-        zip.start_file("ppt/presentation.xml", options).unwrap();
+        zip.start_file("ppt/presentation.xml", options)
+            .expect("Operation failed");
         zip.write_all(
             br#"<?xml version="1.0" encoding="UTF-8"?>
 <p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
@@ -364,17 +373,19 @@ async fn test_pptx_mixed_shapes_extraction() {
   </p:sldIdLst>
 </p:presentation>"#,
         )
-        .unwrap();
+        .expect("Operation failed");
         // Add ppt/_rels/presentation.xml.rels
-        zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
+        zip.start_file("ppt/_rels/presentation.xml.rels", options)
+            .expect("Operation failed");
         zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
 <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
   <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
-</Relationships>"#).unwrap();
+</Relationships>"#).expect("Operation failed");
         // Add slide with various shapes - some with txBody, some without
-        zip.start_file("ppt/slides/slide1.xml", options).unwrap();
+        zip.start_file("ppt/slides/slide1.xml", options)
+            .expect("Operation failed");
         zip.write_all(
             br#"<?xml version="1.0" encoding="UTF-8"?>
 <p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
@@ -453,18 +464,19 @@ async fn test_pptx_mixed_shapes_extraction() {
   </p:cSld>
 </p:sld>"#,
         )
-        .unwrap();
+        .expect("Operation failed");
         // Add empty rels
-        zip.start_file("ppt/slides/_rels/slide1.xml.rels", options).unwrap();
+        zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
+            .expect("Operation failed");
         zip.write_all(
             br#"<?xml version="1.0" encoding="UTF-8"?>
 <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
 </Relationships>"#,
         )
-        .unwrap();
+        .expect("Operation failed");
-        zip.finish().unwrap();
+        zip.finish().expect("Operation failed");
     }
     let result = extract_file(
@@ -502,3 +514,284 @@ async fn test_pptx_mixed_shapes_extraction() {
         }
     }
 }
+/// Test that images extracted from PPTX have correct page numbers.
+///
+/// When a PPTX has multiple slides and an image on slide 1, the extracted image
+/// should have page_number=1 (not reversed).
+///
+/// GitHub Issue #329: Image on slide 1 of 2-slide PPTX reports page_number=2
+#[tokio::test]
+async fn test_pptx_image_page_numbers_not_reversed() {
+    // Create a PPTX with 2 slides, image on slide 1
+    let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
+    // A minimal 1x1 red PNG image (valid PNG format)
+    let png_image: &[u8] = &[
+        0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A, // PNG signature
+        0x00, 0x00, 0x00, 0x0D, // IHDR chunk length
+        0x49, 0x48, 0x44, 0x52, // "IHDR"
+        0x00, 0x00, 0x00, 0x01, // width: 1
+        0x00, 0x00, 0x00, 0x01, // height: 1
+        0x08, 0x02, // bit depth: 8, color type: RGB
+        0x00, 0x00, 0x00, // compression, filter, interlace
+        0x90, 0x77, 0x53, 0xDE, // IHDR CRC
+        0x00, 0x00, 0x00, 0x0C, // IDAT chunk length
+        0x49, 0x44, 0x41, 0x54, // "IDAT"
+        0x08, 0xD7, 0x63, 0xF8, 0xCF, 0xC0, 0x00, 0x00, // compressed data
+        0x01, 0x01, 0x01, 0x00, // checksum
+        0x18, 0xDD, 0x8D, 0xB4, // IDAT CRC
+        0x00, 0x00, 0x00, 0x00, // IEND chunk length
+        0x49, 0x45, 0x4E, 0x44, // "IEND"
+        0xAE, 0x42, 0x60, 0x82, // IEND CRC
+    ];
+    {
+        let mut zip = ZipWriter::new(&mut temp_file);
+        let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
+        // Add [Content_Types].xml
+        zip.start_file("[Content_Types].xml", options)
+            .expect("Operation failed");
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+  <Default Extension="xml" ContentType="application/xml"/>
+  <Default Extension="png" ContentType="image/png"/>
+  <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
+  <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
+  <Override PartName="/ppt/slides/slide2.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
+</Types>"#).expect("Operation failed");
+        // Add _rels/.rels
+        zip.start_file("_rels/.rels", options).expect("Operation failed");
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
+</Relationships>"#).expect("Operation failed");
+        // Add ppt/presentation.xml
+        zip.start_file("ppt/presentation.xml", options)
+            .expect("Operation failed");
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
+                xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
+  <p:sldIdLst>
+    <p:sldId id="256" r:id="rId2"/>
+    <p:sldId id="257" r:id="rId3"/>
+  </p:sldIdLst>
+</p:presentation>"#,
+        )
+        .expect("Operation failed");
+        // Add ppt/_rels/presentation.xml.rels
+        // BUG REPRODUCTION: Slides listed in REVERSE order in XML (slide2 before slide1)
+        // This is valid XML - PowerPoint doesn't guarantee order in rels files
+        // GitHub Issue #329: This causes page numbers to be reversed
+        zip.start_file("ppt/_rels/presentation.xml.rels", options)
+            .expect("Operation failed");
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide2.xml"/>
+  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
+</Relationships>"#).expect("Operation failed");
+        // Add the image file
+        zip.start_file("ppt/media/image1.png", options)
+            .expect("Operation failed");
+        zip.write_all(png_image).expect("Operation failed");
+        // Add slide 1 WITH an image
+        zip.start_file("ppt/slides/slide1.xml", options)
+            .expect("Operation failed");
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
+       xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
+       xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
+  <p:cSld>
+    <p:spTree>
+      <p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
+      <p:grpSpPr/>
+      <p:sp>
+        <p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
+        <p:spPr/>
+        <p:txBody><a:bodyPr/><a:lstStyle/><a:p><a:r><a:t>Slide 1 - Has Image</a:t></a:r></a:p></p:txBody>
+      </p:sp>
+      <p:pic>
+        <p:nvPicPr>
+          <p:cNvPr id="3" name="Picture 1"/>
+          <p:cNvPicPr><a:picLocks noChangeAspect="1"/></p:cNvPicPr>
+          <p:nvPr/>
+        </p:nvPicPr>
+        <p:blipFill>
+          <a:blip r:embed="rId2"/>
+          <a:stretch><a:fillRect/></a:stretch>
+        </p:blipFill>
+        <p:spPr>
+          <a:xfrm><a:off x="0" y="0"/><a:ext cx="100000" cy="100000"/></a:xfrm>
+          <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
+        </p:spPr>
+      </p:pic>
+    </p:spTree>
+  </p:cSld>
+</p:sld>"#,
+        )
+        .expect("Operation failed");
+        // Add slide 1 relationships (points to the image)
+        zip.start_file("ppt/slides/_rels/slide1.xml.rels", options)
+            .expect("Operation failed");
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="../media/image1.png"/>
+</Relationships>"#).expect("Operation failed");
+        // Add slide 2 WITHOUT an image
+        zip.start_file("ppt/slides/slide2.xml", options)
+            .expect("Operation failed");
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
+       xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
+       xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
+  <p:cSld>
+    <p:spTree>
+      <p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
+      <p:grpSpPr/>
+      <p:sp>
+        <p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
+        <p:spPr/>
+        <p:txBody><a:bodyPr/><a:lstStyle/><a:p><a:r><a:t>Slide 2 - No Image</a:t></a:r></a:p></p:txBody>
+      </p:sp>
+    </p:spTree>
+  </p:cSld>
+</p:sld>"#,
+        )
+        .expect("Operation failed");
+        // Add empty slide 2 relationships
+        zip.start_file("ppt/slides/_rels/slide2.xml.rels", options)
+            .expect("Operation failed");
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+</Relationships>"#,
+        )
+        .expect("Operation failed");
+        zip.finish().expect("Operation failed");
+    }
+    // Extract with images enabled
+    let config = ExtractionConfig {
+        images: Some(ImageExtractionConfig {
+            extract_images: true,
+            target_dpi: 300,
+            max_image_dimension: 4096,
+            auto_adjust_dpi: true,
+            min_dpi: 72,
+            max_dpi: 600,
+        }),
+        ..Default::default()
+    };
+    let result = extract_file(
+        temp_file.path(),
+        Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
+        &config,
+    )
+    .await;
+    match result {
+        Ok(extraction) => {
+            // Verify text extraction works
+            assert!(extraction.content.contains("Slide 1"), "Should extract slide 1 text");
+            assert!(extraction.content.contains("Slide 2"), "Should extract slide 2 text");
+            // Verify we got an image
+            let images = extraction.images.as_ref().expect("Images should be present");
+            assert!(!images.is_empty(), "Should extract at least one image");
+            // THE CRITICAL TEST: Image on slide 1 should have page_number=1, NOT 2
+            let image = &images[0];
+            assert_eq!(
+                image.page_number,
+                Some(1),
+                "GitHub Issue #329: Image on slide 1 should have page_number=1, but got {:?}. \
+                 The page numbers are reversed!",
+                image.page_number
+            );
+            println!("✅ PPTX image page numbers are correct!");
+            println!("   Image on slide 1 has page_number={:?}", image.page_number);
+        }
+        Err(e) => {
+            panic!("PPTX extraction failed: {:?}", e);
+        }
+    }
+}
+/// Test with actual user-provided PPTX file from GitHub Issue #329.
+///
+/// The user's file has slides listed in reverse order in presentation.xml.rels,
+/// which caused images to have incorrect page numbers.
+#[tokio::test]
+async fn test_pptx_image_page_numbers_issue329_user_file() {
+    let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+        .parent()
+        .expect("Operation failed")
+        .parent()
+        .expect("Operation failed");
+    let test_file = workspace_root.join("test_documents/presentations/pptx_reversed_slide_order_issue329.pptx");
+    if !test_file.exists() {
+        println!("Skipping test: User file not found at {:?}", test_file);
+        return;
+    }
+    // Extract with images enabled
+    let config = ExtractionConfig {
+        images: Some(ImageExtractionConfig {
+            extract_images: true,
+            target_dpi: 300,
+            max_image_dimension: 4096,
+            auto_adjust_dpi: true,
+            min_dpi: 72,
+            max_dpi: 600,
+        }),
+        ..Default::default()
+    };
+    let result = extract_file(&test_file, None, &config).await;
+    match result {
+        Ok(extraction) => {
+            // The user's file has an image on slide 1
+            let images = extraction.images.as_ref().expect("Images should be extracted");
+            if images.is_empty() {
+                println!("No images extracted from user file (may not have embedded images)");
+                return;
+            }
+            // All images should have page_number = 1 since they're on the first slide
+            for (idx, image) in images.iter().enumerate() {
+                assert_eq!(
+                    image.page_number,
+                    Some(1),
+                    "GitHub Issue #329: Image {} should have page_number=1, but got {:?}",
+                    idx,
+                    image.page_number
+                );
+            }
+            println!("✅ User file from Issue #329 - image page numbers correct!");
+            println!("   Found {} images, all with page_number=1", images.len());
+        }
+        Err(e) => {
+            panic!("Failed to extract user file: {:?}", e);
+        }
+    }
+}

data/vendor/kreuzberg/tests/registry_integration_tests.rs CHANGED

@@ -184,9 +184,9 @@ fn test_register_multiple_validators_succeeds() {
         should_fail: true,
     });
-    registry.register(v1).unwrap();
-    registry.register(v2).unwrap();
-    registry.register(v3).unwrap();
+    registry.register(v1).expect("Operation failed");
+    registry.register(v2).expect("Operation failed");
+    registry.register(v3).expect("Operation failed");
     let list = registry.list();
     assert_eq!(list.len(), 3, "Should have three validators");
@@ -205,7 +205,7 @@ fn test_validator_unregistration_succeeds() {
         should_fail: false,
     });
-    registry.register(validator).unwrap();
+    registry.register(validator).expect("Operation failed");
     assert_eq!(registry.list().len(), 1);
     let result = registry.remove("temp-validator");
@@ -298,8 +298,8 @@ fn test_clear_validators_succeeds() {
         should_fail: false,
     });
-    registry.register(v1).unwrap();
-    registry.register(v2).unwrap();
+    registry.register(v1).expect("Operation failed");
+    registry.register(v2).expect("Operation failed");
     assert_eq!(registry.list().len(), 2);
     let result = registry.shutdown_all();
@@ -355,9 +355,9 @@ fn test_get_all_validators_respects_priority() {
         priority: 100,
     });
-    registry.register(medium).unwrap();
-    registry.register(low).unwrap();
-    registry.register(high).unwrap();
+    registry.register(medium).expect("Operation failed");
+    registry.register(low).expect("Operation failed");
+    registry.register(high).expect("Operation failed");
     let all = registry.get_all();
     assert_eq!(all.len(), 3, "Should have three validators");
@@ -397,11 +397,11 @@ fn test_get_extractor_by_mime_type_succeeds() {
         priority: 50,
     });
-    registry.register(extractor).unwrap();
+    registry.register(extractor).expect("Operation failed");
     let result = registry.get("application/pdf");
     assert!(result.is_ok(), "Should find extractor for PDF");
-    assert_eq!(result.unwrap().name(), "pdf-extractor");
+    assert_eq!(result.expect("Operation failed").name(), "pdf-extractor");
 }
 /// Test extractor not found for unsupported MIME type.
@@ -437,10 +437,10 @@ fn test_extractor_priority_selection() {
         priority: 100,
     });
-    registry.register(low_priority).unwrap();
-    registry.register(high_priority).unwrap();
+    registry.register(low_priority).expect("Operation failed");
+    registry.register(high_priority).expect("Operation failed");
-    let result = registry.get("text/plain").unwrap();
+    let result = registry.get("text/plain").expect("Value not found");
     assert_eq!(
         result.name(),
         "high-priority-extractor",
@@ -459,15 +459,15 @@ fn test_extractor_wildcard_mime_matching() {
         priority: 50,
     });
-    registry.register(extractor).unwrap();
+    registry.register(extractor).expect("Operation failed");
     let result = registry.get("text/plain");
     assert!(result.is_ok(), "Should match text/plain with text/*");
-    assert_eq!(result.unwrap().name(), "text-extractor");
+    assert_eq!(result.expect("Operation failed").name(), "text-extractor");
     let result = registry.get("text/html");
     assert!(result.is_ok(), "Should match text/html with text/*");
-    assert_eq!(result.unwrap().name(), "text-extractor");
+    assert_eq!(result.expect("Operation failed").name(), "text-extractor");
     let result = registry.get("application/pdf");
     assert!(result.is_err(), "Should not match application/pdf with text/*");
@@ -484,7 +484,7 @@ fn test_extractor_unregistration_succeeds() {
         priority: 50,
     });
-    registry.register(extractor).unwrap();
+    registry.register(extractor).expect("Operation failed");
     assert_eq!(registry.list().len(), 1);
     let result = registry.remove("temp-extractor");
@@ -506,17 +506,20 @@ fn test_extractor_multiple_mime_types() {
         priority: 50,
     });
-    registry.register(extractor).unwrap();
+    registry.register(extractor).expect("Operation failed");
     assert!(registry.get("application/pdf").is_ok());
     assert!(registry.get("application/vnd.ms-excel").is_ok());
     assert!(registry.get("text/csv").is_ok());
     assert_eq!(
-        registry.get("application/pdf").unwrap().name(),
+        registry.get("application/pdf").expect("Value not found").name(),
+        "multi-format-extractor"
+    );
+    assert_eq!(
+        registry.get("text/csv").expect("Value not found").name(),
         "multi-format-extractor"
     );
-    assert_eq!(registry.get("text/csv").unwrap().name(), "multi-format-extractor");
 }
 /// Test clearing all extractors.
@@ -535,8 +538,8 @@ fn test_clear_extractors_succeeds() {
         priority: 50,
     });
-    registry.register(e1).unwrap();
-    registry.register(e2).unwrap();
+    registry.register(e1).expect("Operation failed");
+    registry.register(e2).expect("Operation failed");
     assert_eq!(registry.list().len(), 2);
     let result = registry.shutdown_all();

data/vendor/kreuzberg/tests/rst_extractor_tests.rs CHANGED

@@ -686,7 +686,7 @@ async fn test_rst_extraction_no_errors() {
         result.err()
     );
-    let extraction = result.unwrap();
+    let extraction = result.expect("Operation failed");
     assert!(!extraction.content.is_empty(), "Extracted content should not be empty");