RubyGems - kreuzberg - Versions diffs - 4.1.0 → 4.1.1 - Mend

kreuzberg 4.1.0 → 4.1.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

checksums.yaml +4 -4
data/Gemfile.lock +2 -2
data/README.md +1 -1
data/lib/kreuzberg/config.rb +33 -2
data/lib/kreuzberg/version.rb +1 -1
data/spec/fixtures/config.toml +1 -1
data/spec/fixtures/config.yaml +1 -1
data/vendor/Cargo.toml +1 -1
data/vendor/kreuzberg/Cargo.toml +2 -2
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/core/mime.rs +15 -0
data/vendor/kreuzberg/src/extraction/pptx/parser.rs +17 -8
data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: df75a873a0c3547b2e6e44fa1c04e939372f5c01839c1d81ea671414e5c98016
-  data.tar.gz: e4f60f88486c2807c6bf45959e6e6160d509ed096236a5d5bf0c70c26cf8f2f8
+  metadata.gz: 0c1c0519fb3a58c45ec553994bd982b4f284835bd35ea0758461f6f381accfd6
+  data.tar.gz: 161c18cfabdd20bdaa520abda521cb16072dcc00f5fd2e41152d9da4acdb9d08
 SHA512:
-  metadata.gz: fac3f1dc6ca132f71f8034536f3ca7b7df53542a760061932e447e5cde3906d61903bb9ade16e96b782de1b879be68c50ee59218c3b0ba908d26d515d90d4966
-  data.tar.gz: 79b7ab92c373b7a06d06fe324b3d06ad2a588c9e993a2c8cb23c474e3452d82914d2ac48eb9edc1b9475cbf337b4e09a160c242d3510b33364d7f511d264e36f
+  metadata.gz: 7c6e1768022dcfdef5eaaaa3557a8388e8ad45158a69ed022d852b07f7658cbb885ca7860fa32eda8f29b5f7ea4216f93033aad77614afb82578b9157ed92710
+  data.tar.gz: 96ca3f1f3c6d6f9ea6dc826f7704c1d741ffaab24f524f099c4eb294652211ac6b9f324874974bb24d417193e76b1bce951315330d6f1ccd8eb5b014fa7fc71f

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    kreuzberg (4.1.0)
+    kreuzberg (4.1.1)
 GEM
   remote: https://rubygems.org/
@@ -207,7 +207,7 @@ CHECKSUMS
   i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
   io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
   json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
-  kreuzberg (4.1.0)
+  kreuzberg (4.1.1)
   language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
   lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
   listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2

data/README.md CHANGED Viewed

@@ -22,7 +22,7 @@
     <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
   </a>
   <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
-    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.1.0" alt="Go">
+    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.1.1" alt="Go">
   </a>
   <a href="https://www.nuget.org/packages/Kreuzberg/">
     <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">

data/lib/kreuzberg/config.rb CHANGED Viewed

@@ -733,11 +733,42 @@ module Kreuzberg
       # @example Load from YAML
       #   config = Kreuzberg::Config::Extraction.from_file("config.yaml")
       #
+      # Keys that are allowed in the Extraction config
+      ALLOWED_KEYS = %i[
+        use_cache enable_quality_processing force_ocr ocr chunking
+        language_detection pdf_options image_extraction image_preprocessing
+        postprocessor token_reduction keywords html_options pages
+        max_concurrent_extractions
+      ].freeze
+      # Aliases for backward compatibility
+      KEY_ALIASES = {
+        images: :image_extraction
+      }.freeze
       def self.from_file(path)
         hash = Kreuzberg._config_from_file_native(path)
-        new(**hash.transform_keys(&:to_sym))
+        new(**normalize_hash_keys(hash))
       end
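+      # Normalize the keys of a hash returned by the native config functions.
+      # - Converts keys to symbols
+      # - Renames aliased keys (see KEY_ALIASES) to their canonical names,
+      #   unless the canonical key is already present
+      # - Drops every key that is not listed in ALLOWED_KEYS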
+      def self.normalize_hash_keys(hash)
+        symbolized = hash.transform_keys(&:to_sym)
+        # Apply key aliases
+        KEY_ALIASES.each do |from, to|
+          symbolized[to] = symbolized.delete(from) if symbolized.key?(from) && !symbolized.key?(to)
+        end
+        # Filter to only allowed keys
+        symbolized.slice(*ALLOWED_KEYS)
+      end
+      private_class_method :normalize_hash_keys
       # Discover configuration file in current or parent directories.
       #
       # Searches for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json in the current
@@ -755,7 +786,7 @@ module Kreuzberg
         hash = Kreuzberg._config_discover_native
         return nil if hash.nil?
-        new(**hash.transform_keys(&:to_sym))
+        new(**normalize_hash_keys(hash))
       end
       def initialize(

data/lib/kreuzberg/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Kreuzberg
-  VERSION = '4.1.0'
+  VERSION = '4.1.1'
 end

data/spec/fixtures/config.toml CHANGED Viewed

@@ -21,7 +21,7 @@ extract_images = true
 passwords = ["secret", "backup"]
 extract_metadata = true
-[images]
+[image_extraction]
 extract_images = true
 target_dpi = 600
 max_image_dimension = 2000

data/spec/fixtures/config.yaml CHANGED Viewed

@@ -23,7 +23,7 @@ pdf_options:
     - password2
   extract_metadata: true
-images:
+image_extraction:
   extract_images: true
   target_dpi: 300
   max_image_dimension: 4096

data/vendor/Cargo.toml CHANGED Viewed

@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
 resolver = "2"
 [workspace.package]
-version = "4.1.0"
+version = "4.1.1"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

data/vendor/kreuzberg/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg"
-version = "4.1.0"
+version = "4.1.1"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
@@ -174,7 +174,7 @@ rst_parser = { version = "0.4", optional = true }
 fb2 = { version = "0.4", optional = true }
 typst-syntax = { version = "0.14", optional = true }
-kreuzberg-tesseract = { path = "../kreuzberg-tesseract", optional = true }
+kreuzberg-tesseract = { path = "../kreuzberg-tesseract", version = "4.1", optional = true }
 image = { workspace = true, default-features = false, features = [
     "png",
     "jpeg",

data/vendor/kreuzberg/README.md CHANGED Viewed

@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
 This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
-> **🚀 Version 4.1.0 Release**
+> **🚀 Version 4.1.1 Release**
 > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
 >
 > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.

data/vendor/kreuzberg/src/core/mime.rs CHANGED Viewed

@@ -57,6 +57,11 @@ static EXT_TO_MIME: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
     m.insert("ods", OPENDOC_SPREADSHEET_MIME_TYPE);
     m.insert("pptx", POWER_POINT_MIME_TYPE);
+    m.insert(
+        "ppsx",
+        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
+    );
+    m.insert("pptm", "application/vnd.ms-powerpoint.presentation.macroEnabled.12");
     m.insert("ppt", LEGACY_POWERPOINT_MIME_TYPE);
     m.insert("docx", DOCX_MIME_TYPE);
@@ -180,6 +185,8 @@ static SUPPORTED_MIME_TYPES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
     set.insert(PDF_MIME_TYPE);
     set.insert(POWER_POINT_MIME_TYPE);
+    set.insert("application/vnd.openxmlformats-officedocument.presentationml.slideshow"); // PPSX
+    set.insert("application/vnd.ms-powerpoint.presentation.macroEnabled.12"); // PPTM
     set.insert(LEGACY_WORD_MIME_TYPE);
     set.insert(LEGACY_POWERPOINT_MIME_TYPE);
     set.insert(HTML_MIME_TYPE);
@@ -459,6 +466,14 @@ mod tests {
             ("test.xlsx", EXCEL_MIME_TYPE),
             ("test.xls", EXCEL_BINARY_MIME_TYPE),
             ("test.pptx", POWER_POINT_MIME_TYPE),
+            (
+                "test.ppsx",
+                "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
+            ),
+            (
+                "test.pptm",
+                "application/vnd.ms-powerpoint.presentation.macroEnabled.12",
+            ),
             ("test.ppt", LEGACY_POWERPOINT_MIME_TYPE),
             ("test.docx", DOCX_MIME_TYPE),
             ("test.doc", LEGACY_WORD_MIME_TYPE),

data/vendor/kreuzberg/src/extraction/pptx/parser.rs CHANGED Viewed

@@ -60,9 +60,12 @@ fn parse_group(node: &Node) -> Result<Vec<SlideElement>> {
     match tag_name {
         "sp" => {
             let position = extract_position(node);
-            match parse_sp(node)? {
-                ParsedContent::Text(text) => elements.push(SlideElement::Text(text, position)),
-                ParsedContent::List(list) => elements.push(SlideElement::List(list, position)),
+            // parse_sp returns None for shapes without txBody (e.g., image placeholders)
+            if let Some(content) = parse_sp(node)? {
+                match content {
+                    ParsedContent::Text(text) => elements.push(SlideElement::Text(text, position)),
+                    ParsedContent::List(list) => elements.push(SlideElement::List(list, position)),
+                }
             }
         }
         "graphicFrame" => {
@@ -85,11 +88,17 @@ fn parse_group(node: &Node) -> Result<Vec<SlideElement>> {
     Ok(elements)
 }
-fn parse_sp(sp_node: &Node) -> Result<ParsedContent> {
-    let tx_body_node = sp_node
+fn parse_sp(sp_node: &Node) -> Result<Option<ParsedContent>> {
+    // Some shapes like image placeholders (<p:ph type="pic"/>) don't have txBody.
+    // These should be skipped gracefully - they contain no text to extract.
+    // GitHub Issue #321 Bug 1
+    let tx_body_node = match sp_node
         .children()
         .find(|n| n.tag_name().name() == "txBody" && n.tag_name().namespace() == Some(P_NAMESPACE))
-        .ok_or_else(|| KreuzbergError::parsing("No txBody found".to_string()))?;
+    {
+        Some(node) => node,
+        None => return Ok(None), // Skip shapes without txBody
+    };
     let is_list = tx_body_node.descendants().any(|n| {
         n.is_element()
@@ -103,9 +112,9 @@ fn parse_sp(sp_node: &Node) -> Result<ParsedContent> {
     });
     if is_list {
-        Ok(ParsedContent::List(parse_list(&tx_body_node)?))
+        Ok(Some(ParsedContent::List(parse_list(&tx_body_node)?)))
     } else {
-        Ok(ParsedContent::Text(parse_text(&tx_body_node)?))
+        Ok(Some(ParsedContent::Text(parse_text(&tx_body_node)?)))
     }
 }

data/vendor/kreuzberg/tests/pptx_regression_tests.rs ADDED Viewed

@@ -0,0 +1,504 @@
+//! Regression tests for PPTX/PPSX extraction bugs
+//!
+//! GitHub Issue #321: PPTX extraction fails on shapes without txBody (image placeholders) + PPSX not supported
+//!
+//! Bug 1: "No txBody found" - PPTX extraction fails when any shape lacks a text body
+//! Bug 2: PPSX not supported - PowerPoint Show files rejected entirely
+#![cfg(feature = "office")]
+use kreuzberg::{ExtractionConfig, extract_file};
+use std::io::Write;
+use tempfile::NamedTempFile;
+use zip::CompressionMethod;
+use zip::write::{FileOptions, ZipWriter};
+/// Test that PPSX (PowerPoint Show) files are extracted correctly.
+///
+/// PPSX files use MIME type `application/vnd.openxmlformats-officedocument.presentationml.slideshow`
+/// instead of PPTX's `application/vnd.openxmlformats-officedocument.presentationml.presentation`.
+///
+/// The internal structure is identical to PPTX - same slide XML format.
+///
+/// GitHub Issue #321 Bug 2
+#[tokio::test]
+async fn test_ppsx_slideshow_extraction() {
+    let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+        .parent()
+        .unwrap()
+        .parent()
+        .unwrap();
+    let test_file = workspace_root.join("test_documents/presentations/sample.ppsx");
+    if !test_file.exists() {
+        println!("Skipping test: PPSX test file not found at {:?}", test_file);
+        return;
+    }
+    let result = extract_file(&test_file, None, &ExtractionConfig::default()).await;
+    match result {
+        Ok(extraction) => {
+            assert!(!extraction.content.is_empty(), "PPSX content should not be empty");
+            println!("✅ PPSX extraction succeeded!");
+            println!("   Content length: {} chars", extraction.content.len());
+            println!(
+                "   Content preview: {}",
+                &extraction.content[..extraction.content.len().min(200)]
+            );
+        }
+        Err(e) => {
+            panic!(
+                "PPSX extraction failed with error: {:?}\n\
+                 This is GitHub Issue #321 Bug 2: PPSX files should be supported.\n\
+                 PPSX MIME type (application/vnd.openxmlformats-officedocument.presentationml.slideshow) \
+                 needs to be added to extension-to-MIME mapping.",
+                e
+            );
+        }
+    }
+}
+/// Test that PPSX files can be extracted when MIME type is explicitly provided.
+///
+/// This validates that the PPTX extractor can handle PPSX content correctly
+/// (the XML structure is identical), even if MIME detection fails.
+///
+/// GitHub Issue #321 Bug 2
+#[tokio::test]
+async fn test_ppsx_with_explicit_mime_type() {
+    let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+        .parent()
+        .unwrap()
+        .parent()
+        .unwrap();
+    let test_file = workspace_root.join("test_documents/presentations/sample.ppsx");
+    if !test_file.exists() {
+        println!("Skipping test: PPSX test file not found at {:?}", test_file);
+        return;
+    }
+    // Explicitly provide the PPSX MIME type
+    let result = extract_file(
+        &test_file,
+        Some("application/vnd.openxmlformats-officedocument.presentationml.slideshow"),
+        &ExtractionConfig::default(),
+    )
+    .await;
+    match result {
+        Ok(extraction) => {
+            assert!(!extraction.content.is_empty(), "PPSX content should not be empty");
+            println!("✅ PPSX extraction with explicit MIME type succeeded!");
+        }
+        Err(e) => {
+            panic!(
+                "PPSX extraction with explicit MIME type failed: {:?}\n\
+                 The PPTX extractor should handle PPSX content (identical XML structure).",
+                e
+            );
+        }
+    }
+}
+/// Test that PPTX files with image placeholder shapes (no txBody) are extracted correctly.
+///
+/// Some shapes in PPTX files, like image placeholders (`<p:ph type="pic"/>`), don't have
+/// `<p:txBody>` children because they're designed to hold images, not text.
+///
+/// The parser should skip shapes without txBody gracefully instead of failing.
+///
+/// GitHub Issue #321 Bug 1
+#[tokio::test]
+async fn test_pptx_with_image_placeholder_no_txbody() {
+    // Create a minimal PPTX with a shape that has no txBody (image placeholder)
+    let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
+    {
+        let mut zip = ZipWriter::new(&mut temp_file);
+        let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
+        // Add [Content_Types].xml
+        zip.start_file("[Content_Types].xml", options).unwrap();
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+  <Default Extension="xml" ContentType="application/xml"/>
+  <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
+  <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
+</Types>"#).unwrap();
+        // Add _rels/.rels
+        zip.start_file("_rels/.rels", options).unwrap();
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
+</Relationships>"#).unwrap();
+        // Add ppt/presentation.xml
+        zip.start_file("ppt/presentation.xml", options).unwrap();
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
+                xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
+                xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
+  <p:sldIdLst>
+    <p:sldId id="256" r:id="rId2"/>
+  </p:sldIdLst>
+</p:presentation>"#,
+        )
+        .unwrap();
+        // Add ppt/_rels/presentation.xml.rels
+        zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
+</Relationships>"#).unwrap();
+        // Add ppt/slides/slide1.xml with a shape WITHOUT txBody (image placeholder)
+        // This is the critical test case - a <p:sp> element with no <p:txBody>
+        zip.start_file("ppt/slides/slide1.xml", options).unwrap();
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
+       xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
+       xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
+  <p:cSld>
+    <p:spTree>
+      <p:nvGrpSpPr>
+        <p:cNvPr id="1" name=""/>
+        <p:cNvGrpSpPr/>
+        <p:nvPr/>
+      </p:nvGrpSpPr>
+      <p:grpSpPr>
+        <a:xfrm>
+          <a:off x="0" y="0"/>
+          <a:ext cx="0" cy="0"/>
+          <a:chOff x="0" y="0"/>
+          <a:chExt cx="0" cy="0"/>
+        </a:xfrm>
+      </p:grpSpPr>
+      <!-- Normal text shape WITH txBody - this should be extracted -->
+      <p:sp>
+        <p:nvSpPr>
+          <p:cNvPr id="2" name="Title"/>
+          <p:cNvSpPr/>
+          <p:nvPr/>
+        </p:nvSpPr>
+        <p:spPr>
+          <a:xfrm>
+            <a:off x="0" y="0"/>
+            <a:ext cx="100000" cy="100000"/>
+          </a:xfrm>
+          <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
+        </p:spPr>
+        <p:txBody>
+          <a:bodyPr/>
+          <a:lstStyle/>
+          <a:p>
+            <a:r>
+              <a:rPr lang="en-US"/>
+              <a:t>This is the title text</a:t>
+            </a:r>
+          </a:p>
+        </p:txBody>
+      </p:sp>
+      <!-- IMAGE PLACEHOLDER shape WITHOUT txBody - this caused the "No txBody found" error -->
+      <!-- This is a valid PPTX structure - image placeholders don't contain text -->
+      <p:sp>
+        <p:nvSpPr>
+          <p:cNvPr id="99" name="Image Placeholder"/>
+          <p:cNvSpPr>
+            <a:spLocks noGrp="1"/>
+          </p:cNvSpPr>
+          <p:nvPr>
+            <p:ph type="pic" idx="1"/>
+          </p:nvPr>
+        </p:nvSpPr>
+        <p:spPr>
+          <a:xfrm>
+            <a:off x="0" y="0"/>
+            <a:ext cx="100000" cy="100000"/>
+          </a:xfrm>
+          <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
+        </p:spPr>
+        <!-- NOTE: No <p:txBody> here - this is valid for image placeholders -->
+      </p:sp>
+      <!-- Another normal text shape - should also be extracted -->
+      <p:sp>
+        <p:nvSpPr>
+          <p:cNvPr id="3" name="Content"/>
+          <p:cNvSpPr/>
+          <p:nvPr/>
+        </p:nvSpPr>
+        <p:spPr>
+          <a:xfrm>
+            <a:off x="0" y="200000"/>
+            <a:ext cx="100000" cy="100000"/>
+          </a:xfrm>
+          <a:prstGeom prst="rect"><a:avLst/></a:prstGeom>
+        </p:spPr>
+        <p:txBody>
+          <a:bodyPr/>
+          <a:lstStyle/>
+          <a:p>
+            <a:r>
+              <a:rPr lang="en-US"/>
+              <a:t>Content after image placeholder</a:t>
+            </a:r>
+          </a:p>
+        </p:txBody>
+      </p:sp>
+    </p:spTree>
+  </p:cSld>
+</p:sld>"#,
+        )
+        .unwrap();
+        // Add ppt/slides/_rels/slide1.xml.rels (empty)
+        zip.start_file("ppt/slides/_rels/slide1.xml.rels", options).unwrap();
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+</Relationships>"#,
+        )
+        .unwrap();
+        zip.finish().unwrap();
+    }
+    // Extract the PPTX file
+    let result = extract_file(
+        temp_file.path(),
+        Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
+        &ExtractionConfig::default(),
+    )
+    .await;
+    match result {
+        Ok(extraction) => {
+            assert!(!extraction.content.is_empty(), "Content should not be empty");
+            // Verify we extracted text from shapes that DO have txBody
+            assert!(
+                extraction.content.contains("title text"),
+                "Should extract text from first shape with txBody. Got: {}",
+                extraction.content
+            );
+            assert!(
+                extraction.content.contains("Content after"),
+                "Should extract text from shape after image placeholder. Got: {}",
+                extraction.content
+            );
+            println!("✅ PPTX with image placeholder (no txBody) extraction succeeded!");
+            println!("   Content: {}", extraction.content);
+        }
+        Err(e) => {
+            let error_msg = format!("{:?}", e);
+            if error_msg.contains("No txBody found") {
+                panic!(
+                    "PPTX extraction failed with 'No txBody found' error!\n\
+                     This is GitHub Issue #321 Bug 1.\n\
+                     The parser should skip shapes without txBody (image placeholders) \
+                     instead of failing.\n\
+                     Error: {:?}",
+                    e
+                );
+            } else {
+                panic!("PPTX extraction failed with unexpected error: {:?}", e);
+            }
+        }
+    }
+}
+/// Test extraction of PPTX with multiple shapes, some with txBody, some without.
+///
+/// This test verifies that:
+/// 1. Shapes WITH txBody are extracted
+/// 2. Shapes WITHOUT txBody (image placeholders, etc.) are skipped gracefully
+/// 3. The extraction continues and doesn't fail on the first shape without txBody
+///
+/// GitHub Issue #321 Bug 1
+#[tokio::test]
+async fn test_pptx_mixed_shapes_extraction() {
+    // Create a PPTX with a single slide containing a mix of shapes with and without txBody
+    let mut temp_file = NamedTempFile::with_suffix(".pptx").expect("Failed to create temp file");
+    {
+        let mut zip = ZipWriter::new(&mut temp_file);
+        let options: FileOptions<()> = FileOptions::default().compression_method(CompressionMethod::Stored);
+        // Add [Content_Types].xml
+        zip.start_file("[Content_Types].xml", options).unwrap();
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+  <Default Extension="xml" ContentType="application/xml"/>
+  <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
+  <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
+</Types>"#).unwrap();
+        // Add _rels/.rels
+        zip.start_file("_rels/.rels", options).unwrap();
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
+</Relationships>"#).unwrap();
+        // Add ppt/presentation.xml
+        zip.start_file("ppt/presentation.xml", options).unwrap();
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<p:presentation xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
+                xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
+                xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
+  <p:sldIdLst>
+    <p:sldId id="256" r:id="rId2"/>
+  </p:sldIdLst>
+</p:presentation>"#,
+        )
+        .unwrap();
+        // Add ppt/_rels/presentation.xml.rels
+        zip.start_file("ppt/_rels/presentation.xml.rels", options).unwrap();
+        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
+</Relationships>"#).unwrap();
+        // Add slide with various shapes - some with txBody, some without
+        zip.start_file("ppt/slides/slide1.xml", options).unwrap();
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<p:sld xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
+       xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
+       xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
+  <p:cSld>
+    <p:spTree>
+      <p:nvGrpSpPr>
+        <p:cNvPr id="1" name=""/>
+        <p:cNvGrpSpPr/>
+        <p:nvPr/>
+      </p:nvGrpSpPr>
+      <p:grpSpPr/>
+      <!-- Shape 1: Normal text -->
+      <p:sp>
+        <p:nvSpPr><p:cNvPr id="2" name="Title"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
+        <p:spPr/>
+        <p:txBody>
+          <a:bodyPr/><a:lstStyle/>
+          <a:p><a:r><a:t>First Text Shape</a:t></a:r></a:p>
+        </p:txBody>
+      </p:sp>
+      <!-- Shape 2: Image placeholder (NO txBody) -->
+      <p:sp>
+        <p:nvSpPr>
+          <p:cNvPr id="10" name="Picture Placeholder"/>
+          <p:cNvSpPr><a:spLocks noGrp="1"/></p:cNvSpPr>
+          <p:nvPr><p:ph type="pic"/></p:nvPr>
+        </p:nvSpPr>
+        <p:spPr/>
+      </p:sp>
+      <!-- Shape 3: Another text shape -->
+      <p:sp>
+        <p:nvSpPr><p:cNvPr id="3" name="Body"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
+        <p:spPr/>
+        <p:txBody>
+          <a:bodyPr/><a:lstStyle/>
+          <a:p><a:r><a:t>Second Text Shape</a:t></a:r></a:p>
+        </p:txBody>
+      </p:sp>
+      <!-- Shape 4: Chart placeholder (NO txBody) -->
+      <p:sp>
+        <p:nvSpPr>
+          <p:cNvPr id="11" name="Chart Placeholder"/>
+          <p:cNvSpPr><a:spLocks noGrp="1"/></p:cNvSpPr>
+          <p:nvPr><p:ph type="chart"/></p:nvPr>
+        </p:nvSpPr>
+        <p:spPr/>
+      </p:sp>
+      <!-- Shape 5: Content placeholder (NO txBody - empty) -->
+      <p:sp>
+        <p:nvSpPr>
+          <p:cNvPr id="12" name="Content Placeholder"/>
+          <p:cNvSpPr><a:spLocks noGrp="1"/></p:cNvSpPr>
+          <p:nvPr><p:ph type="body"/></p:nvPr>
+        </p:nvSpPr>
+        <p:spPr/>
+      </p:sp>
+      <!-- Shape 6: Final text shape -->
+      <p:sp>
+        <p:nvSpPr><p:cNvPr id="4" name="Footer"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
+        <p:spPr/>
+        <p:txBody>
+          <a:bodyPr/><a:lstStyle/>
+          <a:p><a:r><a:t>Third Text Shape</a:t></a:r></a:p>
+        </p:txBody>
+      </p:sp>
+    </p:spTree>
+  </p:cSld>
+</p:sld>"#,
+        )
+        .unwrap();
+        // Add empty rels
+        zip.start_file("ppt/slides/_rels/slide1.xml.rels", options).unwrap();
+        zip.write_all(
+            br#"<?xml version="1.0" encoding="UTF-8"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+</Relationships>"#,
+        )
+        .unwrap();
+        zip.finish().unwrap();
+    }
+    let result = extract_file(
+        temp_file.path(),
+        Some("application/vnd.openxmlformats-officedocument.presentationml.presentation"),
+        &ExtractionConfig::default(),
+    )
+    .await;
+    match result {
+        Ok(extraction) => {
+            // All three text shapes should be extracted
+            assert!(
+                extraction.content.contains("First Text Shape"),
+                "Should extract first text shape"
+            );
+            assert!(
+                extraction.content.contains("Second Text Shape"),
+                "Should extract second text shape (after image placeholder)"
+            );
+            assert!(
+                extraction.content.contains("Third Text Shape"),
+                "Should extract third text shape (after multiple placeholders)"
+            );
+            println!("✅ PPTX mixed shapes extraction succeeded!");
+            println!("   All text shapes extracted despite image/chart/content placeholders without txBody");
+        }
+        Err(e) => {
+            panic!(
+                "PPTX extraction failed: {:?}\n\
+                 Shapes without txBody should be skipped gracefully.",
+                e
+            );
+        }
+    }
+}

data/vendor/kreuzberg-tesseract/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg-tesseract"
-version = "4.1.0"
+version = "4.1.1"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kreuzberg
 version: !ruby/object:Gem::Version
-  version: 4.1.0
+  version: 4.1.1
 platform: ruby
 authors:
 - Na'aman Hirschfeld
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-01-22 00:00:00.000000000 Z
+date: 2026-01-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -761,6 +761,7 @@ files:
 - vendor/kreuzberg/tests/plugin_postprocessor_test.rs
 - vendor/kreuzberg/tests/plugin_system.rs
 - vendor/kreuzberg/tests/plugin_validator_test.rs
+- vendor/kreuzberg/tests/pptx_regression_tests.rs
 - vendor/kreuzberg/tests/registry_integration_tests.rs
 - vendor/kreuzberg/tests/rst_extractor_tests.rs
 - vendor/kreuzberg/tests/rtf_extractor_tests.rs