RubyGems - kreuzberg - Versions diffs - 4.7.3 → 4.7.4 - Mend

kreuzberg 4.7.3 → 4.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

checksums.yaml +4 -4
data/README.md +1 -1
data/ext/kreuzberg_rb/native/Cargo.lock +6 -6
data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
data/lib/kreuzberg/version.rb +1 -1
data/vendor/Cargo.toml +4 -4
data/vendor/kreuzberg/Cargo.toml +2 -2
data/vendor/kreuzberg/README.md +1 -1
data/vendor/kreuzberg/src/pdf/structure/classify.rs +157 -0
data/vendor/kreuzberg/src/pdf/structure/layout_classify.rs +6 -0
data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +127 -2
data/vendor/kreuzberg/src/rendering/comrak_bridge.rs +6 -0
data/vendor/kreuzberg/src/rendering/markdown.rs +46 -0
data/vendor/kreuzberg/tests/pdf_image_extraction_tests.rs +113 -0
data/vendor/kreuzberg/tests/pdf_output_quality.rs +200 -0
data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
data/vendor/kreuzberg-tesseract/src/api.rs +1 -1
data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +1 -1
data/vendor/kreuzberg-tesseract/src/leptonica.rs +1 -1
data/vendor/kreuzberg-tesseract/src/lib.rs +28 -0
data/vendor/kreuzberg-tesseract/src/monitor.rs +1 -1
data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +1 -1
data/vendor/kreuzberg-tesseract/src/page_iterator.rs +1 -1
data/vendor/kreuzberg-tesseract/src/result_iterator.rs +1 -1
data/vendor/kreuzberg-tesseract/src/result_renderer.rs +1 -1
metadata +4 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 0f26fc086a0221056b94cd10141832240ef9563783835ef66555445e0d33442d
-  data.tar.gz: ed8488fcdd8bd12266ec2be8984ec5f5b60a32aaa562e64b442d9a4a641c8841
+  metadata.gz: 3fffd5a1e2d066b0997155be101dade834002fd2805fd556b2e5b6b8a8d29be9
+  data.tar.gz: 0f1ce8406a8880327191fa4e49bdadd71f938ec47ecb3b7e38f4121008d16600
 SHA512:
-  metadata.gz: 485460f24ddbf58b82a41873a61c5d78408012b415e62a52daeed43df2d150b317115f3ca79bc029a9394188da70d6733f79ba8e61f589c58bee7d1a845b17d8
-  data.tar.gz: 5e80e6e34fe8125a934b141ad3ab058240b81f4b82d609e0e2aea822131cbd4954dd7112e152a90542fc56d494519182c62cc14b387f658944ebbc73bc5037a1
+  metadata.gz: f878aeea0ccb330d30f863707fc31ba0dce4a8b2eb5f6fca705e5d21337a68ea209328a8dc84ae565c20a996d3ae6f5e53149d570a5439f6746f27df3d1c5671
+  data.tar.gz: 1ca62afb51ffb9d85f3a820c7d44473d393bfdefab21f78723c52d3304082382e30d646c2751203d2f0a0d2ab346ec664856662ba7c212c565b35777fa8f167f

data/README.md CHANGED Viewed

@@ -22,7 +22,7 @@
     <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
   </a>
   <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
-    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.7.3" alt="Go">
+    <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.7.4" alt="Go">
   </a>
   <a href="https://www.nuget.org/packages/Kreuzberg/">
     <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">

data/ext/kreuzberg_rb/native/Cargo.lock CHANGED Viewed

@@ -2792,7 +2792,7 @@ dependencies = [
 [[package]]
 name = "kreuzberg"
-version = "4.7.2"
+version = "4.7.3"
 dependencies = [
  "ahash",
  "async-trait",
@@ -2885,7 +2885,7 @@ dependencies = [
 [[package]]
 name = "kreuzberg-ffi"
-version = "4.7.2"
+version = "4.7.3"
 dependencies = [
  "ahash",
  "async-trait",
@@ -2901,7 +2901,7 @@ dependencies = [
 [[package]]
 name = "kreuzberg-paddle-ocr"
-version = "4.7.2"
+version = "4.7.3"
 dependencies = [
  "geo-clipper",
  "geo-types",
@@ -2915,7 +2915,7 @@ dependencies = [
 [[package]]
 name = "kreuzberg-pdfium-render"
-version = "4.7.2"
+version = "4.7.3"
 dependencies = [
  "bitflags",
  "bytemuck",
@@ -2938,7 +2938,7 @@ dependencies = [
 [[package]]
 name = "kreuzberg-rb"
-version = "4.7.2"
+version = "4.7.3"
 dependencies = [
  "async-trait",
  "html-to-markdown-rs",
@@ -2955,7 +2955,7 @@ dependencies = [
 [[package]]
 name = "kreuzberg-tesseract"
-version = "4.7.2"
+version = "4.7.3"
 dependencies = [
  "cc",
  "cmake",

data/ext/kreuzberg_rb/native/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg-rb"
-version = "4.7.3"
+version = "4.7.4"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]

data/lib/kreuzberg/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Kreuzberg
-  VERSION = '4.7.3'
+  VERSION = '4.7.4'
 end

data/vendor/Cargo.toml CHANGED Viewed

@@ -2,7 +2,7 @@
 members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
 [workspace.package]
-version = "4.7.3"
+version = "4.7.4"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -32,8 +32,8 @@ html-to-markdown-rs = { version = "3.1.0", default-features = false }
 image = { version = "0.25.10", default-features = false }
 itertools = "0.14"
 js-sys = "0.3"
-kreuzberg = { path = "./crates/kreuzberg", version = "4.7.3", default-features = false }
-kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.7.3" }
+kreuzberg = { path = "./crates/kreuzberg", version = "4.7.4", default-features = false }
+kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.7.4" }
 lazy_static = "1.5.0"
 libc = "0.2.184"
 log = "0.4"
@@ -43,7 +43,7 @@ num_cpus = "1.17.0"
 once_cell = "1.21.4"
 ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
 parking_lot = "0.12.5"
-pdf_oxide = { version = "0.3.20", default-features = false }
+pdf_oxide = { version = "0.3.21", default-features = false }
 pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
 rayon = "1.11.0"
 reqwest = { version = "0.13.2", default-features = false }

data/vendor/kreuzberg/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg"
-version = "4.7.3"
+version = "4.7.4"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -307,7 +307,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
 outlook-pst = { version = "1.2.0", optional = true }
 parking_lot = "0.12.5"
 pastey = "0.2"
-pdf_oxide = { version = "0.3.20", default-features = false, optional = true }
+pdf_oxide = { version = "0.3.21", default-features = false, optional = true }
 pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
 pulldown-cmark = { version = "0.13" }
 quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }

data/vendor/kreuzberg/README.md CHANGED Viewed

@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
 This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
-> **🚀 Version 4.7.3 Release**
+> **🚀 Version 4.7.4 Release**
 > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
 >
 > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.

data/vendor/kreuzberg/src/pdf/structure/classify.rs CHANGED Viewed

@@ -1045,6 +1045,163 @@ fn is_greek_letter(c: char) -> bool {
     matches!(c, '\u{0391}'..='\u{03A9}' | '\u{03B1}'..='\u{03C9}')
 }
+/// Suppress arXiv watermark/sidebar text found on the opening pages of a document.
+///
+/// Only the first two pages are inspected, and paragraphs already flagged as page
+/// furniture are left alone. For every remaining paragraph whose plain text contains
+/// an arXiv identifier (`arXiv:NNNN.NNNNN`):
+/// - at most 25 whitespace-separated words → the whole paragraph is flagged as page
+///   furniture and its heading level is cleared;
+/// - longer paragraphs → if the identifier (optionally followed by version, category
+///   and date, and preceded by up to ~8 words) ends the paragraph, that tail is handed
+///   to `strip_trailing_text_from_paragraph` for removal.
+pub(super) fn mark_arxiv_noise(all_pages: &mut [Vec<PdfParagraph>]) {
+    // arXiv watermarks don't appear after the first couple of pages.
+    const PAGES_TO_SCAN: usize = 2;
+    // Paragraphs up to this many words are considered to be dominated by the identifier.
+    const STANDALONE_MAX_WORDS: usize = 25;
+    let id_pattern = regex::Regex::new(r"arXiv:\d{4}\.\d{4,5}").expect("valid regex");
+    // Sidebar text from LaTeX gets concatenated by pdfium with body text, so the tail
+    // pattern is anchored at end-of-string and may also swallow a short title/page-number
+    // fragment (up to ~8 words) sitting directly in front of `arXiv:`.
+    let tail_pattern = regex::Regex::new(
+        r"(?:\s+(?:\S+\s+){0,8})?arXiv:\d{4}\.\d{4,5}(?:v\d+)?(?:\s*\[[\w.-]+\])?\s*(?:\d{1,2}\s+\w+\s+\d{4})?\s*$",
+    )
+    .expect("valid regex");
+    let candidates = all_pages
+        .iter_mut()
+        .take(PAGES_TO_SCAN)
+        .flat_map(|page| page.iter_mut())
+        .filter(|para| !para.is_page_furniture);
+    for para in candidates {
+        let plain = paragraph_plain_text(para);
+        let body = plain.trim();
+        if !id_pattern.is_match(body) {
+            continue;
+        }
+        if body.split_whitespace().count() <= STANDALONE_MAX_WORDS {
+            // Standalone watermark: drop the paragraph from body output entirely.
+            tracing::trace!(
+                text = %body.chars().take(80).collect::<String>(),
+                "marking arXiv watermark as furniture"
+            );
+            para.is_page_furniture = true;
+            para.heading_level = None;
+            continue;
+        }
+        let Some(hit) = tail_pattern.find(body) else {
+            continue;
+        };
+        // The identifier trails a longer paragraph: remove only the matched tail.
+        let noise = &body[hit.start()..];
+        tracing::trace!(
+            stripped = %noise.chars().take(80).collect::<String>(),
+            "stripping trailing arXiv watermark from paragraph"
+        );
+        strip_trailing_text_from_paragraph(para, noise.trim());
+    }
+}
+/// Strip trailing noise text from the last segment(s) of a paragraph.
+///
+/// Segments are visited from the end of the paragraph backwards:
+/// - a segment that contains `noise` is cut at the *last* occurrence (whitespace left
+///   in front of the cut is trimmed) and the walk ends;
+/// - a non-empty segment whose trimmed text is a substring of `noise` is treated as a
+///   fragment of the noise and emptied, and the walk continues;
+/// - any other segment (including an empty or whitespace-only one) ends the walk.
+///
+/// An empty `noise` is a no-op: every string "contains" the empty string, so without
+/// the guard the last segment would be altered for no reason.
+fn strip_trailing_text_from_paragraph(para: &mut PdfParagraph, noise: &str) {
+    if noise.is_empty() {
+        return;
+    }
+    // Walk lines in reverse to find the segment containing the noise.
+    for line in para.lines.iter_mut().rev() {
+        for seg in line.segments.iter_mut().rev() {
+            // `rfind`, not `find`: the noise is trailing text, so when the same text also
+            // occurs earlier in the segment only the final occurrence may be removed.
+            if let Some(pos) = seg.text.rfind(noise) {
+                // `trim_end` yields a prefix ending on a char boundary, so truncating to
+                // its length is safe and avoids reallocating the segment text.
+                let keep = seg.text[..pos].trim_end().len();
+                seg.text.truncate(keep);
+                return;
+            }
+            // If the entire segment is part of the noise, clear it and keep walking back.
+            let seg_trimmed = seg.text.trim();
+            if seg_trimmed.is_empty() || !noise.contains(seg_trimmed) {
+                // Reached body text — stop.
+                return;
+            }
+            seg.text.clear();
+        }
+    }
+}
+/// Tier-2 detection of short text that repeats across pages.
+///
+/// Complements `mark_cross_page_repeating_text` by looking at every paragraph rather
+/// than only margin-positioned ones. A paragraph of at most 20 words is reduced to a
+/// key made of its lowercase alphanumeric characters; any key present on at least 70%
+/// of the pages (rounded up) causes all matching paragraphs to be flagged as page
+/// furniture with their heading level cleared. Documents with fewer than 5 pages are
+/// left untouched.
+pub(super) fn mark_cross_page_repeating_short_text(all_pages: &mut [Vec<PdfParagraph>]) {
+    const MIN_PAGES: usize = 5;
+    const MAX_WORDS: usize = 20;
+
+    /// Returns `(normalized_text, alphanumeric_key)` for a paragraph that may take part
+    /// in repeat detection, or `None` when it is already furniture, too long, or has
+    /// no alphanumeric content.
+    fn repeat_candidate(para: &PdfParagraph) -> Option<(String, String)> {
+        if para.is_page_furniture {
+            return None;
+        }
+        let normalized = paragraph_plain_text(para).trim().to_lowercase();
+        if normalized.split_whitespace().count() > MAX_WORDS {
+            return None;
+        }
+        let key: String = normalized.chars().filter(|c| c.is_alphanumeric()).collect();
+        if key.is_empty() { None } else { Some((normalized, key)) }
+    }
+
+    let page_total = all_pages.len();
+    if page_total < MIN_PAGES {
+        return;
+    }
+    let threshold = (page_total as f64 * 0.7).ceil() as usize;
+    // Count, per key, the number of distinct pages it shows up on.
+    let mut pages_per_key: ahash::AHashMap<String, usize> = ahash::AHashMap::new();
+    for page in all_pages.iter() {
+        // De-duplicate within the page so a key counts once per page.
+        let keys_on_page: ahash::AHashSet<String> = page
+            .iter()
+            .filter_map(repeat_candidate)
+            .map(|(_, key)| key)
+            .collect();
+        for key in keys_on_page {
+            *pages_per_key.entry(key).or_insert(0) += 1;
+        }
+    }
+    // Keep only the keys that reach the supermajority threshold.
+    let repeating: ahash::AHashSet<String> = pages_per_key
+        .into_iter()
+        .filter_map(|(key, pages)| (pages >= threshold).then_some(key))
+        .collect();
+    if repeating.is_empty() {
+        return;
+    }
+    tracing::debug!(
+        repeating_count = repeating.len(),
+        threshold,
+        total_pages = page_total,
+        "cross-page short-text repeating detection (tier 2)"
+    );
+    // Second pass: flag every paragraph whose key is in the repeating set.
+    for para in all_pages.iter_mut().flatten() {
+        let Some((normalized, key)) = repeat_candidate(para) else {
+            continue;
+        };
+        if !repeating.contains(&key) {
+            continue;
+        }
+        tracing::trace!(
+            text = %normalized.chars().take(60).collect::<String>(),
+            "marking repeating short text as furniture (tier 2)"
+        );
+        para.is_page_furniture = true;
+        para.heading_level = None;
+    }
+}
 #[cfg(test)]
 mod tests {
     use super::*;

data/vendor/kreuzberg/src/pdf/structure/layout_classify.rs CHANGED Viewed

@@ -380,6 +380,12 @@ pub(super) fn apply_hint_to_paragraph(para: &mut PdfParagraph, hint: &LayoutHint
         LayoutHintClass::PageHeader | LayoutHintClass::PageFooter => {
             para.is_page_furniture = true;
         }
+        LayoutHintClass::Picture => {
+            // Text classified as Picture by layout model is figure-internal text
+            // (diagram labels, axis text, etc.) — suppress from body output.
+            para.is_page_furniture = true;
+            para.heading_level = None;
+        }
         LayoutHintClass::Text | LayoutHintClass::Caption | LayoutHintClass::Footnote
             // Layout model says this is body text, not a heading.
             // Demote font-size-classified headings when layout has high confidence.

data/vendor/kreuzberg/src/pdf/structure/pipeline.rs CHANGED Viewed

@@ -11,8 +11,8 @@ use rayon::prelude::*;
 use super::assembly::assemble_internal_document;
 use super::bridge::{ImagePosition, extracted_blocks_to_paragraphs, filter_sidebar_blocks, objects_to_page_data};
 use super::classify::{
-    classify_paragraphs, demote_heading_runs, demote_unnumbered_subsections, mark_cross_page_repeating_text,
-    refine_heading_hierarchy,
+    classify_paragraphs, demote_heading_runs, demote_unnumbered_subsections, mark_arxiv_noise,
+    mark_cross_page_repeating_short_text, mark_cross_page_repeating_text, refine_heading_hierarchy,
 };
 use super::constants::{
     FULL_LINE_FRACTION, MIN_FONT_SIZE, MIN_HEADING_FONT_GAP, MIN_HEADING_FONT_RATIO, PAGE_BOTTOM_MARGIN_FRACTION,
@@ -1187,6 +1187,10 @@ pub fn extract_document_structure(
     // Mark short text that repeats across many pages as furniture (headers/footers/watermarks).
     mark_cross_page_repeating_text(&mut all_page_paragraphs, &page_heights);
+    // Tier 2: catch short repeating text outside margin zones (e.g. conference headers).
+    mark_cross_page_repeating_short_text(&mut all_page_paragraphs);
+    // Mark arXiv watermark identifiers on first pages.
+    mark_arxiv_noise(&mut all_page_paragraphs);
     for page in &mut all_page_paragraphs {
         retain_page_furniture_safely(page);
     }
@@ -1225,6 +1229,11 @@ pub fn extract_document_structure(
     let mut doc = assemble_internal_document(all_page_paragraphs, &combined_tables, &image_pos_pairs);
+    // Stage 4b: Populate doc.images with actual image data from pdfium.
+    // Image elements reference indices into doc.images, which must be populated
+    // for markdown/HTML rendering to produce `![desc](image_N.png)` instead of `![]()`.
+    populate_images_from_pdfium(document, &all_image_positions, &mut doc);
     let element_count = doc.elements.len();
     tracing::debug!(element_count, "PDF structure pipeline: assembly complete");
@@ -1748,6 +1757,122 @@ fn is_dedup_candidate(p: &PdfParagraph) -> bool {
         && p.caption_for.is_none()
 }
+/// Extract actual image data from pdfium and populate `doc.images`.
+///
+/// Each `ImagePosition` records (page_number, image_index) for image objects
+/// found during page scanning. This function re-traverses the pages to extract
+/// actual pixel data via pdfium's `get_processed_image`, then pushes each as an
+/// `ExtractedImage` into the document so that rendering produces proper
+/// `![desc](image_N.fmt)` references instead of empty `![]()`.
+///
+/// Exactly one entry is appended to `doc.images` per position: a PNG-encoded RGBA
+/// image on success, otherwise the result of `empty_image_placeholder`. Entries are
+/// appended in ascending page order and, within a page, in the order the positions
+/// were supplied.
+///
+/// NOTE(review): entries are appended, so `image_index` only equals the position in
+/// `doc.images` if the vector is empty on entry and positions arrive sorted by
+/// page — confirm against the caller and `assemble_internal_document`.
+fn populate_images_from_pdfium(
+    document: &PdfDocument,
+    image_positions: &[super::bridge::ImagePosition],
+    doc: &mut crate::types::internal::InternalDocument,
+) {
+    use bytes::Bytes;
+    use image::ImageEncoder;
+    // Nothing to do for documents without image objects.
+    if image_positions.is_empty() {
+        return;
+    }
+    // Group image positions by page number (1-indexed) for efficient traversal.
+    let mut by_page: std::collections::BTreeMap<usize, Vec<usize>> = std::collections::BTreeMap::new();
+    for pos in image_positions {
+        by_page.entry(pos.page_number).or_default().push(pos.image_index);
+    }
+    let pages = document.pages();
+    // Number of images successfully re-encoded; only used for the debug log below.
+    let mut extracted_count = 0u32;
+    for (&page_num, indices) in &by_page {
+        // Convert the 1-indexed page number to pdfium's 0-based index.
+        // NOTE(review): `as i32` would wrap for page numbers above i32::MAX.
+        let page_idx = page_num.saturating_sub(1) as i32;
+        // If the page cannot be loaded, still emit one placeholder per expected image.
+        let Ok(page) = pages.get(page_idx) else {
+            for &idx in indices {
+                doc.images.push(empty_image_placeholder(idx, page_num));
+            }
+            continue;
+        };
+        // Walk page objects, extracting image data for each matching index.
+        // NOTE(review): this assumes the indices on a page are consecutive, start at the
+        // smallest recorded index, and follow page-object order — verify against the
+        // code that produces `ImagePosition`.
+        let first_idx_on_page = indices.iter().copied().min().unwrap_or(0);
+        // Ordinal of the current image object among the image objects of this page.
+        let mut current_image = 0usize;
+        let mut extracted_on_page: std::collections::BTreeMap<usize, crate::types::ExtractedImage> =
+            std::collections::BTreeMap::new();
+        for obj in page.objects().iter() {
+            if let Some(image_obj) = obj.as_image_object() {
+                let global_idx = first_idx_on_page + current_image;
+                // Only decode images that were actually recorded for this page.
+                if indices.contains(&global_idx)
+                    && let Ok(dynamic_image) = image_obj.get_processed_image(document)
+                {
+                    let w = dynamic_image.width();
+                    let h = dynamic_image.height();
+                    // Normalise to 8-bit RGBA and re-encode as PNG, whatever the source format.
+                    let rgba = dynamic_image.to_rgba8();
+                    let mut png_buf: Vec<u8> = Vec::new();
+                    if image::codecs::png::PngEncoder::new(&mut png_buf)
+                        .write_image(rgba.as_raw(), w, h, image::ExtendedColorType::Rgba8)
+                        .is_ok()
+                    {
+                        extracted_count += 1;
+                        extracted_on_page.insert(
+                            global_idx,
+                            crate::types::ExtractedImage {
+                                data: Bytes::from(png_buf),
+                                format: std::borrow::Cow::Borrowed("png"),
+                                image_index: global_idx,
+                                page_number: Some(page_num),
+                                width: Some(w),
+                                height: Some(h),
+                                colorspace: Some("RGBA".to_string()),
+                                bits_per_component: Some(8),
+                                is_mask: false,
+                                description: None,
+                                ocr_result: None,
+                                bounding_box: None,
+                                source_path: None,
+                            },
+                        );
+                    }
+                }
+                // Advance for every image object, whether or not it was extracted.
+                current_image += 1;
+            }
+        }
+        // Emit one entry per recorded index; decode or encode failures become placeholders.
+        for &idx in indices {
+            let img = extracted_on_page
+                .remove(&idx)
+                .unwrap_or_else(|| empty_image_placeholder(idx, page_num));
+            doc.images.push(img);
+        }
+    }
+    tracing::debug!(
+        total_positions = image_positions.len(),
+        extracted = extracted_count,
+        "populated document images from pdfium"
+    );
+}
+/// Build a stand-in `ExtractedImage` for an image whose pixel data could not be obtained.
+///
+/// The placeholder carries only the image index and page number; its data is empty,
+/// its format is `"unknown"`, and every other optional field is `None`.
+fn empty_image_placeholder(idx: usize, page_num: usize) -> crate::types::ExtractedImage {
+    use crate::types::ExtractedImage;
+    let no_data = bytes::Bytes::new();
+    let unknown_format = std::borrow::Cow::Borrowed("unknown");
+    ExtractedImage {
+        // Identity of the missing image.
+        image_index: idx,
+        page_number: Some(page_num),
+        // No payload and no known encoding.
+        data: no_data,
+        format: unknown_format,
+        // Nothing is known about geometry or colour.
+        width: None,
+        height: None,
+        colorspace: None,
+        bits_per_component: None,
+        is_mask: false,
+        description: None,
+        ocr_result: None,
+        bounding_box: None,
+        source_path: None,
+    }
+}
 #[cfg(test)]
 mod tests {
     use super::*;

data/vendor/kreuzberg/src/rendering/comrak_bridge.rs CHANGED Viewed

@@ -676,6 +676,12 @@ pub fn build_comrak_ast<'a>(doc: &InternalDocument, arena: &'a comrak::Arena<'a>
                     })
                     .unwrap_or_default();
+                // Skip images with no URL and no description — they produce
+                // empty `![]()` nodes that add noise to the output.
+                if url.is_empty() && desc.is_empty() {
+                    continue;
+                }
                 let para = mk(arena, NodeValue::Paragraph);
                 let img_node = mk(
                     arena,

data/vendor/kreuzberg/src/rendering/markdown.rs CHANGED Viewed

@@ -86,6 +86,10 @@ pub fn render_markdown(doc: &InternalDocument) -> String {
             .join("\n");
     }
+    // Strip arXiv watermark/sidebar noise that gets concatenated with body text.
+    // Only the first ~6000 bytes (roughly the first two pages) are searched, to avoid touching references.
+    output = strip_arxiv_watermark_noise(output);
     // Trim trailing whitespace but keep single trailing newline
     let trimmed_len = output.trim_end().len();
     if trimmed_len == 0 {
@@ -97,6 +101,48 @@ pub fn render_markdown(doc: &InternalDocument) -> String {
     output
 }
+/// Strip arXiv watermark noise from rendered markdown.
+///
+/// LaTeX-generated PDFs often have a rotated sidebar with the arXiv identifier
+/// that pdfium concatenates with body text. This strips patterns like:
+/// "Title N arXiv:NNNN.NNNNNvN [cat.SC] DD Mon YYYY" from the first pages.
+///
+/// Only the first match inside the first ~6000 bytes of `text` is considered, and it
+/// is removed only when it directly follows a `.` or is directly followed by a
+/// newline. In every other case `text` is returned unchanged.
+fn strip_arxiv_watermark_noise(mut text: String) -> String {
+    // Size of the prefix that is searched (roughly the first 2 pages).
+    const SEARCH_LIMIT_BYTES: usize = 6000;
+    // Match: optional preceding short fragment + arXiv ID + optional version + category + date.
+    // Compiled once instead of on every render.
+    static WATERMARK_RE: std::sync::LazyLock<regex::Regex> = std::sync::LazyLock::new(|| {
+        regex::Regex::new(
+            r"(?:\s+\S+(?:\s+\S+){0,8})?\s*arXiv:\d{4}\.\d{4,5}(?:v\d+)?(?:\s*\[[\w.-]+\])?\s*(?:\d{1,2}\s+\w+\s+\d{4})?",
+        )
+        .expect("valid regex")
+    });
+    // Slicing a `str` in the middle of a multi-byte character panics, so back the limit
+    // off to the nearest char boundary (index 0 always is one, so the loop terminates).
+    let mut search_limit = text.len().min(SEARCH_LIMIT_BYTES);
+    while !text.is_char_boundary(search_limit) {
+        search_limit -= 1;
+    }
+    let strip_range = {
+        let search_area = &text[..search_limit];
+        WATERMARK_RE.find(search_area).and_then(|m| {
+            // Only strip if it looks like a watermark (appears near end of a paragraph,
+            // not in the middle of a sentence about arXiv).
+            let before_char = search_area[..m.start()].chars().next_back();
+            let after = &search_area[m.end()..];
+            // NOTE(review): the `\s*` in front of the optional date is greedy, so when no
+            // date follows it may already have consumed the newline(s) after the
+            // identifier; the `after` check then fails and, when stripping via the `.`
+            // rule, the paragraph break is removed as well — confirm this is intended.
+            let is_at_paragraph_boundary = before_char == Some('.') || after.starts_with('\n');
+            is_at_paragraph_boundary.then(|| m.start()..m.end())
+        })
+    };
+    if let Some(range) = strip_range {
+        let (start, end) = (range.start, range.end);
+        tracing::trace!(
+            stripped = %&text[start..end].chars().take(80).collect::<String>(),
+            "stripping arXiv watermark from markdown output"
+        );
+        text.replace_range(start..end, "");
+    }
+    text
+}
 /// Shared comrak options with all GFM extensions enabled.
 pub(crate) fn comrak_options<'a>() -> Options<'a> {
     let mut options = Options::default();

data/vendor/kreuzberg/tests/pdf_image_extraction_tests.rs ADDED Viewed

@@ -0,0 +1,113 @@
+//! Regression tests for PDF image extraction in markdown output.
+//!
+//! Verifies that embedded images in PDFs produce proper `![](image_N.fmt)`
+//! references instead of empty `![]()` placeholders.
+#![cfg(feature = "pdf")]
+use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+use kreuzberg::core::extractor::extract_file;
+use std::path::PathBuf;
+mod helpers;
+/// Location of the shared `test_documents` directory, two levels above this crate's
+/// manifest directory.
+fn test_documents_dir() -> PathBuf {
+    let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    let crates_dir = crate_dir.parent().unwrap();
+    let workspace_root = crates_dir.parent().unwrap();
+    workspace_root.join("test_documents")
+}
+/// Run extraction with Markdown output on a file given relative to the test-documents
+/// directory, blocking on a fresh Tokio runtime. Panics if the runtime cannot be built
+/// or extraction fails.
+fn extract_markdown(relative_path: &str) -> kreuzberg::types::ExtractionResult {
+    let document = test_documents_dir().join(relative_path);
+    let markdown_config = ExtractionConfig {
+        output_format: OutputFormat::Markdown,
+        ..Default::default()
+    };
+    let runtime = tokio::runtime::Runtime::new().unwrap();
+    let extraction = extract_file(&document, None, &markdown_config);
+    runtime.block_on(extraction).unwrap()
+}
+#[test]
+fn test_multipage_marketing_no_empty_image_refs() {
+    let markdown = extract_markdown("pdf/multipage_marketing.pdf").content;
+    // An image without URL and description would render as `![]()`.
+    let has_empty_ref = markdown.contains("![]()");
+    assert!(
+        !has_empty_ref,
+        "Markdown output must not contain empty image references ![](), got:\n{}",
+        markdown
+    );
+}
+#[test]
+fn test_multipage_marketing_has_image_refs() {
+    let markdown = extract_markdown("pdf/multipage_marketing.pdf").content;
+    // At least one image must be rendered with a real target.
+    let has_image_ref = markdown.contains("![](image_");
+    assert!(
+        has_image_ref,
+        "Markdown output must contain image references like ![](image_N.png), got:\n{}",
+        markdown
+    );
+}
+#[test]
+fn test_multipage_marketing_images_populated() {
+    let extraction = extract_markdown("pdf/multipage_marketing.pdf");
+    // The result must expose a non-empty image list...
+    let images = extraction.images.as_ref().expect("images field must be Some");
+    assert!(!images.is_empty(), "Extraction result must contain extracted images");
+    // ...and at least one entry must carry real bytes rather than being a placeholder.
+    let any_with_data = images.iter().any(|img| !img.data.is_empty());
+    assert!(
+        any_with_data,
+        "At least some images should have actual pixel data, got {} images total but none with data",
+        images.len()
+    );
+}
+#[test]
+fn test_docling_no_empty_image_refs() {
+    let markdown = extract_markdown("pdf/docling.pdf").content;
+    // No image may be rendered without both URL and description.
+    let has_empty_ref = markdown.contains("![]()");
+    assert!(
+        !has_empty_ref,
+        "Docling markdown must not contain empty image references ![](), got:\n{}",
+        markdown
+    );
+}
+#[test]
+fn test_docling_has_image_refs() {
+    let markdown = extract_markdown("pdf/docling.pdf").content;
+    // The Docling report has at least one figure, so one reference must be present.
+    let has_image_ref = markdown.contains("![](image_");
+    assert!(
+        has_image_ref,
+        "Docling markdown must contain image references, got:\n{}",
+        markdown
+    );
+}
+#[test]
+fn test_docling_content_quality() {
+    let markdown = extract_markdown("pdf/docling.pdf").content;
+    // Key terms from the Docling technical report must survive extraction.
+    for term in ["Docling", "PDF"] {
+        assert!(markdown.contains(term), "Must contain '{}'", term);
+    }
+    let mentions_table_model = ["table structure recognition", "TableFormer"]
+        .iter()
+        .any(|phrase| markdown.contains(phrase));
+    assert!(
+        mentions_table_model,
+        "Must mention table structure recognition or TableFormer"
+    );
+}

data/vendor/kreuzberg/tests/pdf_output_quality.rs ADDED Viewed

@@ -0,0 +1,200 @@
+//! PDF output quality integration tests.
+//!
+//! Regression tests verifying that extraction output is clean and free of
+//! common noise patterns (figure-internal text, arXiv watermarks, reference
+//! entries misclassified as headings, repeating conference headers).
+//!
+//! Benchmark documents:
+//! - `docling.pdf` — academic paper with figures, tables, arXiv sidebar
+//! - `multi_page.pdf` — clean multi-page document (no noise expected)
+#![cfg(feature = "pdf")]
+mod helpers;
+use helpers::*;
+use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
+use kreuzberg::extract_file_sync;
+fn extract_markdown(relative_path: &str) -> String {
+    let pdf_path = get_test_file_path(relative_path);
+    if !pdf_path.exists() {
+        panic!("Test document not found: {}", relative_path);
+    }
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Markdown,
+        ..Default::default()
+    };
+    extract_file_sync(&pdf_path, None, &config)
+        .expect("extraction should succeed")
+        .content
+}
+#[cfg(feature = "layout-detection")]
+fn extract_markdown_with_layout(relative_path: &str) -> String {
+    use kreuzberg::core::config::layout::LayoutDetectionConfig;
+    let pdf_path = get_test_file_path(relative_path);
+    if !pdf_path.exists() {
+        panic!("Test document not found: {}", relative_path);
+    }
+    let config = ExtractionConfig {
+        output_format: OutputFormat::Markdown,
+        layout: Some(LayoutDetectionConfig::default()),
+        ..Default::default()
+    };
+    extract_file_sync(&pdf_path, None, &config)
+        .expect("layout extraction should succeed")
+        .content
+}
+// ── Noise filtering: figure-internal text ────────────────────────────
+#[cfg(feature = "layout-detection")]
+#[test]
+fn test_docling_no_figure_internal_text() {
+    if !test_documents_available() {
+        return;
+    }
+    let content = extract_markdown_with_layout("pdf/docling.pdf");
+    // "Circling Minimums" is a heading from inside an appendix figure — should be suppressed
+    assert!(
+        !content.contains("Circling Minimums"),
+        "Figure-internal heading 'Circling Minimums' leaked into output"
+    );
+    // Figure diagram labels from Figure 1 should not appear as body text
+    assert!(
+        !content.contains("{;} Parse PDF pages"),
+        "Figure 1 diagram text leaked into output"
+    );
+}
+#[cfg(feature = "layout-detection")]
+#[test]
+fn test_docling_no_figure_text_as_headings() {
+    if !test_documents_available() {
+        return;
+    }
+    let content = extract_markdown_with_layout("pdf/docling.pdf");
+    // "{;} Parse PDF pages" is from the pipeline diagram (Figure 1)
+    for line in content.lines() {
+        if line.starts_with('#') {
+            assert!(
+                !line.contains("{;}"),
+                "Figure diagram text promoted to heading: {}",
+                line
+            );
+            assert!(
+                !line.contains("Parse PDF pages Table Structure OCR"),
+                "Figure diagram text promoted to heading: {}",
+                line
+            );
+        }
+    }
+}
+// ── Noise filtering: arXiv watermark ─────────────────────────────────
+#[cfg(feature = "layout-detection")]
+#[test]
+fn test_docling_no_arxiv_watermark() {
+    if !test_documents_available() {
+        return;
+    }
+    let content = extract_markdown_with_layout("pdf/docling.pdf");
+    // The arXiv sidebar watermark "arXiv:2408.09869v5" should be stripped.
+    // Legitimate references to arXiv in body text are fine (they don't include the ID).
+    assert!(
+        !content.contains("arXiv:2408.09869"),
+        "arXiv watermark identifier not stripped from output"
+    );
+}
+// ── Noise filtering: references as headings ──────────────────────────
+#[cfg(feature = "layout-detection")]
+#[test]
+fn test_docling_references_not_headings() {
+    if !test_documents_available() {
+        return;
+    }
+    let content = extract_markdown_with_layout("pdf/docling.pdf");
+    // Individual reference entries should not be promoted to ## headings
+    let heading_lines: Vec<&str> = content.lines().filter(|l| l.starts_with("## ")).collect();
+    for h in &heading_lines {
+        assert!(
+            !h.contains("PyPDFium2"),
+            "Reference entry misclassified as heading: {}",
+            h
+        );
+        assert!(
+            !h.contains("LlamaIndex"),
+            "Reference entry misclassified as heading: {}",
+            h
+        );
+        assert!(
+            !h.contains("PyttiuPDF"),
+            "Reference entry misclassified as heading: {}",
+            h
+        );
+    }
+}
+// ── Content preservation ─────────────────────────────────────────────
+#[cfg(feature = "layout-detection")]
+#[test]
+fn test_docling_key_content_preserved() {
+    if !test_documents_available() {
+        return;
+    }
+    let content = extract_markdown_with_layout("pdf/docling.pdf");
+    assert!(
+        content.contains("Docling Technical Report"),
+        "Title not found in output"
+    );
+    assert!(
+        content.contains("Processing pipeline") || content.contains("processing pipeline"),
+        "Section 'Processing pipeline' not found"
+    );
+    assert!(content.contains("TableFormer"), "'TableFormer' not found");
+    assert!(
+        content.contains("PDF backend") || content.contains("PDF backends"),
+        "'PDF backends' section not found"
+    );
+}
+#[test]
+fn test_multipage_clean_output() {
+    if !test_documents_available() {
+        return;
+    }
+    let content = extract_markdown("pdf/multi_page.pdf");
+    assert!(content.contains("Evolution of the Word Processor"), "Title not found");
+    assert!(
+        content.contains("Pre-Digital Era"),
+        "Section 'Pre-Digital Era' not found"
+    );
+    assert!(content.contains("IBM MT/ST"), "'IBM MT/ST' not found");
+}
+#[test]
+fn test_multipage_no_noise() {
+    if !test_documents_available() {
+        return;
+    }
+    let content = extract_markdown("pdf/multi_page.pdf");
+    // multi_page.pdf is a clean document — should have no arXiv noise
+    assert!(
+        !content.contains("arXiv:"),
+        "multipage.pdf should have no arXiv identifiers"
+    );
+}

data/vendor/kreuzberg-ffi/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg-ffi"
-version = "4.7.3"
+version = "4.7.4"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -41,7 +41,7 @@ serde_json = { version = "1.0.149" }
 tokio = { version = "1.51.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
 [target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
-kreuzberg = { path = "../kreuzberg", version = "4.7.3", default-features = false, features = [
+kreuzberg = { path = "../kreuzberg", version = "4.7.4", default-features = false, features = [
     "pdf",
     "excel",
     "office",
@@ -64,7 +64,7 @@ kreuzberg = { path = "../kreuzberg", version = "4.7.3", default-features = false
 ] }
 [target.'cfg(not(all(windows, target_env = "gnu")))'.dependencies]
-kreuzberg = { path = "../kreuzberg", version = "4.7.3", default-features = false, features = ["bundled-pdfium", "full"] }
+kreuzberg = { path = "../kreuzberg", version = "4.7.4", default-features = false, features = ["bundled-pdfium", "full"] }
 [build-dependencies]
 cbindgen = "0.29"

data/vendor/kreuzberg-ffi/kreuzberg.h CHANGED Viewed

@@ -9,8 +9,8 @@
 #define KREUZBERG_VERSION_MAJOR 4
 #define KREUZBERG_VERSION_MINOR 7
-#define KREUZBERG_VERSION_PATCH 3
-#define KREUZBERG_VERSION "4.7.3"
+#define KREUZBERG_VERSION_PATCH 4
+#define KREUZBERG_VERSION "4.7.4"
 #include <stdarg.h>

data/vendor/kreuzberg-paddle-ocr/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg-paddle-ocr"
-version = "4.7.3"
+version = "4.7.4"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]

data/vendor/kreuzberg-pdfium-render/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg-pdfium-render"
-version = "4.7.3"
+version = "4.7.4"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]

data/vendor/kreuzberg-tesseract/Cargo.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [package]
 name = "kreuzberg-tesseract"
-version = "4.7.3"
+version = "4.7.4"
 edition = "2024"
 rust-version = "1.91"
 authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]

data/vendor/kreuzberg-tesseract/src/api.rs CHANGED Viewed

@@ -2160,7 +2160,7 @@ impl Clone for TesseractAPI {
 }
 #[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
-unsafe extern "C-unwind" {
+ffi_extern! {
     fn TessBaseAPIMeanTextConf(handle: *mut c_void) -> c_int;
     fn TessBaseAPISetVariable(handle: *mut c_void, name: *const c_char, value: *const c_char) -> c_int;
     fn TessBaseAPIGetStringVariable(handle: *mut c_void, name: *const c_char) -> *const c_char;

data/vendor/kreuzberg-tesseract/src/choice_iterator.rs CHANGED Viewed

@@ -69,7 +69,7 @@ impl Drop for ChoiceIterator {
     }
 }
-unsafe extern "C-unwind" {
+ffi_extern! {
     fn TessChoiceIteratorDelete(handle: *mut c_void);
     fn TessChoiceIteratorNext(handle: *mut c_void) -> c_int;
     fn TessChoiceIteratorGetUTF8Text(handle: *mut c_void) -> *mut c_char;

data/vendor/kreuzberg-tesseract/src/leptonica.rs CHANGED Viewed

@@ -29,7 +29,7 @@ use std::ffi::c_void;
 // ---------------------------------------------------------------------------
 #[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
-unsafe extern "C-unwind" {
+ffi_extern! {
     /// Allocates a new Pix with the given dimensions and bit depth.
     fn pixCreate(width: i32, height: i32, depth: i32) -> *mut c_void;

data/vendor/kreuzberg-tesseract/src/lib.rs CHANGED Viewed

@@ -127,6 +127,34 @@
 //!     Ok(())
 //! }
 //! ```
+/// Declare FFI functions with `extern "C-unwind"` on native targets (so C++
+/// exceptions from Tesseract/Leptonica may unwind across the FFI boundary) and
+/// `extern "C"` on WASM (the LLVM backend lacks `cleanupret` / C++ unwinding).
+macro_rules! ffi_extern {
+    (
+        $(
+            $(#[$meta:meta])*
+            $vis:vis fn $name:ident($($arg:ident : $ty:ty),* $(,)?) $(-> $ret:ty)?;
+        )*
+    ) => {
+        #[cfg(not(target_arch = "wasm32"))]
+        unsafe extern "C-unwind" {
+            $(
+                $(#[$meta])*
+                $vis fn $name($($arg : $ty),*) $(-> $ret)?;
+            )*
+        }
+        #[cfg(target_arch = "wasm32")]
+        unsafe extern "C" {
+            $(
+                $(#[$meta])*
+                $vis fn $name($($arg : $ty),*) $(-> $ret)?;
+            )*
+        }
+    };
+}
 pub use error::{Result, TesseractError};
 mod error;

data/vendor/kreuzberg-tesseract/src/monitor.rs CHANGED Viewed

@@ -60,7 +60,7 @@ impl Drop for TessMonitor {
     }
 }
-unsafe extern "C-unwind" {
+ffi_extern! {
     pub fn TessMonitorCreate() -> *mut c_void;
     pub fn TessMonitorDelete(monitor: *mut c_void);
     pub fn TessMonitorSetDeadlineMSecs(monitor: *mut c_void, deadline: c_int);

data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs CHANGED Viewed

@@ -191,7 +191,7 @@ impl Drop for MutableIterator {
     }
 }
-unsafe extern "C-unwind" {
+ffi_extern! {
     pub fn TessResultIteratorDelete(handle: *mut c_void);
     pub fn TessDeleteText(text: *mut c_char);
 }

data/vendor/kreuzberg-tesseract/src/page_iterator.rs CHANGED Viewed

@@ -380,7 +380,7 @@ impl Drop for PageIterator {
     }
 }
-unsafe extern "C-unwind" {
+ffi_extern! {
     pub fn TessPageIteratorDelete(handle: *mut c_void);
     pub fn TessPageIteratorBegin(handle: *mut c_void);
     pub fn TessPageIteratorNext(handle: *mut c_void, level: c_int) -> c_int;

data/vendor/kreuzberg-tesseract/src/result_iterator.rs CHANGED Viewed

@@ -555,7 +555,7 @@ impl Drop for ResultIterator {
 }
 #[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
-unsafe extern "C-unwind" {
+ffi_extern! {
     pub fn TessResultIteratorDelete(handle: *mut c_void);
     pub fn TessPageIteratorBegin(handle: *mut c_void);
     pub fn TessResultIteratorGetUTF8Text(handle: *mut c_void, level: c_int) -> *mut c_char;

data/vendor/kreuzberg-tesseract/src/result_renderer.rs CHANGED Viewed

@@ -198,7 +198,7 @@ impl Drop for TessResultRenderer {
     }
 }
-unsafe extern "C-unwind" {
+ffi_extern! {
     pub fn TessTextRendererCreate(outputbase: *const c_char) -> *mut c_void;
     pub fn TessHOcrRendererCreate(outputbase: *const c_char) -> *mut c_void;
     pub fn TessPDFRendererCreate(outputbase: *const c_char, datadir: *const c_char, textonly: c_int) -> *mut c_void;

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kreuzberg
 version: !ruby/object:Gem::Version
-  version: 4.7.3
+  version: 4.7.4
 platform: ruby
 authors:
 - Na'aman Hirschfeld
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2026-04-05 00:00:00.000000000 Z
+date: 2026-04-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -1031,11 +1031,13 @@ files:
 - vendor/kreuzberg/tests/path_resolution/fixtures/typst_with_images.typ
 - vendor/kreuzberg/tests/pdf_hierarchy_detection.rs
 - vendor/kreuzberg/tests/pdf_hierarchy_quality.rs
+- vendor/kreuzberg/tests/pdf_image_extraction_tests.rs
 - vendor/kreuzberg/tests/pdf_integration.rs
 - vendor/kreuzberg/tests/pdf_markdown_extraction.rs
 - vendor/kreuzberg/tests/pdf_markdown_quality.rs
 - vendor/kreuzberg/tests/pdf_markdown_regression.rs
 - vendor/kreuzberg/tests/pdf_ocr_triggering.rs
+- vendor/kreuzberg/tests/pdf_output_quality.rs
 - vendor/kreuzberg/tests/pdf_table_detection.rs
 - vendor/kreuzberg/tests/pdf_table_ground_truth.rs
 - vendor/kreuzberg/tests/pdf_text_merging.rs