kreuzberg 4.7.3 → 4.7.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0f26fc086a0221056b94cd10141832240ef9563783835ef66555445e0d33442d
4
- data.tar.gz: ed8488fcdd8bd12266ec2be8984ec5f5b60a32aaa562e64b442d9a4a641c8841
3
+ metadata.gz: 3fffd5a1e2d066b0997155be101dade834002fd2805fd556b2e5b6b8a8d29be9
4
+ data.tar.gz: 0f1ce8406a8880327191fa4e49bdadd71f938ec47ecb3b7e38f4121008d16600
5
5
  SHA512:
6
- metadata.gz: 485460f24ddbf58b82a41873a61c5d78408012b415e62a52daeed43df2d150b317115f3ca79bc029a9394188da70d6733f79ba8e61f589c58bee7d1a845b17d8
7
- data.tar.gz: 5e80e6e34fe8125a934b141ad3ab058240b81f4b82d609e0e2aea822131cbd4954dd7112e152a90542fc56d494519182c62cc14b387f658944ebbc73bc5037a1
6
+ metadata.gz: f878aeea0ccb330d30f863707fc31ba0dce4a8b2eb5f6fca705e5d21337a68ea209328a8dc84ae565c20a996d3ae6f5e53149d570a5439f6746f27df3d1c5671
7
+ data.tar.gz: 1ca62afb51ffb9d85f3a820c7d44473d393bfdefab21f78723c52d3304082382e30d646c2751203d2f0a0d2ab346ec664856662ba7c212c565b35777fa8f167f
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.7.3" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.7.4" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -2792,7 +2792,7 @@ dependencies = [
2792
2792
 
2793
2793
  [[package]]
2794
2794
  name = "kreuzberg"
2795
- version = "4.7.2"
2795
+ version = "4.7.3"
2796
2796
  dependencies = [
2797
2797
  "ahash",
2798
2798
  "async-trait",
@@ -2885,7 +2885,7 @@ dependencies = [
2885
2885
 
2886
2886
  [[package]]
2887
2887
  name = "kreuzberg-ffi"
2888
- version = "4.7.2"
2888
+ version = "4.7.3"
2889
2889
  dependencies = [
2890
2890
  "ahash",
2891
2891
  "async-trait",
@@ -2901,7 +2901,7 @@ dependencies = [
2901
2901
 
2902
2902
  [[package]]
2903
2903
  name = "kreuzberg-paddle-ocr"
2904
- version = "4.7.2"
2904
+ version = "4.7.3"
2905
2905
  dependencies = [
2906
2906
  "geo-clipper",
2907
2907
  "geo-types",
@@ -2915,7 +2915,7 @@ dependencies = [
2915
2915
 
2916
2916
  [[package]]
2917
2917
  name = "kreuzberg-pdfium-render"
2918
- version = "4.7.2"
2918
+ version = "4.7.3"
2919
2919
  dependencies = [
2920
2920
  "bitflags",
2921
2921
  "bytemuck",
@@ -2938,7 +2938,7 @@ dependencies = [
2938
2938
 
2939
2939
  [[package]]
2940
2940
  name = "kreuzberg-rb"
2941
- version = "4.7.2"
2941
+ version = "4.7.3"
2942
2942
  dependencies = [
2943
2943
  "async-trait",
2944
2944
  "html-to-markdown-rs",
@@ -2955,7 +2955,7 @@ dependencies = [
2955
2955
 
2956
2956
  [[package]]
2957
2957
  name = "kreuzberg-tesseract"
2958
- version = "4.7.2"
2958
+ version = "4.7.3"
2959
2959
  dependencies = [
2960
2960
  "cc",
2961
2961
  "cmake",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-rb"
3
- version = "4.7.3"
3
+ version = "4.7.4"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.7.3'
4
+ VERSION = '4.7.4'
5
5
  end
data/vendor/Cargo.toml CHANGED
@@ -2,7 +2,7 @@
2
2
  members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
3
3
 
4
4
  [workspace.package]
5
- version = "4.7.3"
5
+ version = "4.7.4"
6
6
  edition = "2024"
7
7
  rust-version = "1.91"
8
8
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -32,8 +32,8 @@ html-to-markdown-rs = { version = "3.1.0", default-features = false }
32
32
  image = { version = "0.25.10", default-features = false }
33
33
  itertools = "0.14"
34
34
  js-sys = "0.3"
35
- kreuzberg = { path = "./crates/kreuzberg", version = "4.7.3", default-features = false }
36
- kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.7.3" }
35
+ kreuzberg = { path = "./crates/kreuzberg", version = "4.7.4", default-features = false }
36
+ kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.7.4" }
37
37
  lazy_static = "1.5.0"
38
38
  libc = "0.2.184"
39
39
  log = "0.4"
@@ -43,7 +43,7 @@ num_cpus = "1.17.0"
43
43
  once_cell = "1.21.4"
44
44
  ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
45
45
  parking_lot = "0.12.5"
46
- pdf_oxide = { version = "0.3.20", default-features = false }
46
+ pdf_oxide = { version = "0.3.21", default-features = false }
47
47
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
48
48
  rayon = "1.11.0"
49
49
  reqwest = { version = "0.13.2", default-features = false }
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg"
3
- version = "4.7.3"
3
+ version = "4.7.4"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -307,7 +307,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
307
307
  outlook-pst = { version = "1.2.0", optional = true }
308
308
  parking_lot = "0.12.5"
309
309
  pastey = "0.2"
310
- pdf_oxide = { version = "0.3.20", default-features = false, optional = true }
310
+ pdf_oxide = { version = "0.3.21", default-features = false, optional = true }
311
311
  pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
312
312
  pulldown-cmark = { version = "0.13" }
313
313
  quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
18
18
 
19
19
  This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
20
20
 
21
- > **🚀 Version 4.7.3 Release**
21
+ > **🚀 Version 4.7.4 Release**
22
22
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
23
23
  >
24
24
  > **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
@@ -1045,6 +1045,163 @@ fn is_greek_letter(c: char) -> bool {
1045
1045
  matches!(c, '\u{0391}'..='\u{03A9}' | '\u{03B1}'..='\u{03C9}')
1046
1046
  }
1047
1047
 
1048
+ /// Remove arXiv watermark/sidebar noise from paragraphs on the first pages.
1049
+ ///
1050
+ /// Handles two cases:
1051
+ /// 1. Short standalone paragraphs that are just the arXiv identifier → mark as furniture.
1052
+ /// 2. arXiv identifier appended to the end of a longer paragraph (LaTeX sidebar
1053
+ /// text that pdfium concatenates with body text) → strip the trailing noise.
1054
+ pub(super) fn mark_arxiv_noise(all_pages: &mut [Vec<PdfParagraph>]) {
1055
+ let arxiv_re = regex::Regex::new(r"arXiv:\d{4}\.\d{4,5}").expect("valid regex");
1056
+ // Match trailing sidebar noise: title/page-num + arXiv ID (+ category + date) at end.
1057
+ // The sidebar text from LaTeX gets concatenated by pdfium with body text.
1058
+ // We capture from the arXiv ID to end-of-string and also eat back any preceding
1059
+ // short title/page-number fragment (up to ~8 words before arXiv:).
1060
+ let trailing_re = regex::Regex::new(
1061
+ r"(?:\s+(?:\S+\s+){0,8})?arXiv:\d{4}\.\d{4,5}(?:v\d+)?(?:\s*\[[\w.-]+\])?\s*(?:\d{1,2}\s+\w+\s+\d{4})?\s*$",
1062
+ )
1063
+ .expect("valid regex");
1064
+
1065
+ // Only check first 2 pages — arXiv watermarks don't appear later.
1066
+ for page in all_pages.iter_mut().take(2) {
1067
+ for para in page.iter_mut() {
1068
+ if para.is_page_furniture {
1069
+ continue;
1070
+ }
1071
+ let text = paragraph_plain_text(para);
1072
+ let trimmed = text.trim();
1073
+ let word_count = trimmed.split_whitespace().count();
1074
+
1075
+ if !arxiv_re.is_match(trimmed) {
1076
+ continue;
1077
+ }
1078
+
1079
+ // Short paragraph dominated by the arXiv identifier → mark as furniture.
1080
+ if word_count <= 25 {
1081
+ tracing::trace!(
1082
+ text = %trimmed.chars().take(80).collect::<String>(),
1083
+ "marking arXiv watermark as furniture"
1084
+ );
1085
+ para.is_page_furniture = true;
1086
+ para.heading_level = None;
1087
+ } else if let Some(m) = trailing_re.find(trimmed) {
1088
+ // arXiv id is at the end of a longer paragraph — strip it from the last segment.
1089
+ let noise = &trimmed[m.start()..];
1090
+ tracing::trace!(
1091
+ stripped = %noise.chars().take(80).collect::<String>(),
1092
+ "stripping trailing arXiv watermark from paragraph"
1093
+ );
1094
+ strip_trailing_text_from_paragraph(para, noise.trim());
1095
+ }
1096
+ }
1097
+ }
1098
+ }
1099
+
1100
+ /// Strip trailing noise text from the last segment(s) of a paragraph.
1101
+ fn strip_trailing_text_from_paragraph(para: &mut PdfParagraph, noise: &str) {
1102
+ // Walk lines in reverse to find the segment containing the noise.
1103
+ for line in para.lines.iter_mut().rev() {
1104
+ for seg in line.segments.iter_mut().rev() {
1105
+ if let Some(pos) = seg.text.find(noise) {
1106
+ seg.text = seg.text[..pos].trim_end().to_string();
1107
+ return;
1108
+ }
1109
+ // If the entire segment is part of the noise, clear it.
1110
+ let seg_trimmed = seg.text.trim();
1111
+ if !seg_trimmed.is_empty() && noise.contains(seg_trimmed) {
1112
+ seg.text.clear();
1113
+ } else {
1114
+ // Reached body text — stop.
1115
+ return;
1116
+ }
1117
+ }
1118
+ }
1119
+ }
1120
+
1121
+ /// Second-tier cross-page repeating text detection.
1122
+ ///
1123
+ /// Supplements `mark_cross_page_repeating_text` by scanning ALL paragraphs
1124
+ /// (not just margin-positioned ones) for short text that repeats on a
1125
+ /// supermajority of pages. Catches inline conference headers, journal running
1126
+ /// titles, and similar repeated boilerplate that appears outside the margin zone.
1127
+ pub(super) fn mark_cross_page_repeating_short_text(all_pages: &mut [Vec<PdfParagraph>]) {
1128
+ if all_pages.len() < 5 {
1129
+ return;
1130
+ }
1131
+
1132
+ let max_words = 20;
1133
+ let threshold = (all_pages.len() as f64 * 0.7).ceil() as usize;
1134
+
1135
+ // Count how many pages each short text appears on.
1136
+ let mut text_page_count: ahash::AHashMap<String, usize> = ahash::AHashMap::new();
1137
+ for page in all_pages.iter() {
1138
+ let mut seen: ahash::AHashSet<String> = ahash::AHashSet::new();
1139
+ for para in page {
1140
+ if para.is_page_furniture {
1141
+ continue;
1142
+ }
1143
+ let text = paragraph_plain_text(para);
1144
+ let normalized = text.trim().to_lowercase();
1145
+ if normalized.is_empty() {
1146
+ continue;
1147
+ }
1148
+ let word_count = normalized.split_whitespace().count();
1149
+ if word_count > max_words {
1150
+ continue;
1151
+ }
1152
+ let alphanum_key: String = normalized.chars().filter(|c| c.is_alphanumeric()).collect();
1153
+ if alphanum_key.is_empty() {
1154
+ continue;
1155
+ }
1156
+ if seen.insert(alphanum_key.clone()) {
1157
+ *text_page_count.entry(alphanum_key).or_insert(0) += 1;
1158
+ }
1159
+ }
1160
+ }
1161
+
1162
+ // Collect keys that repeat on ≥70% of pages.
1163
+ let repeating: ahash::AHashSet<String> = text_page_count
1164
+ .into_iter()
1165
+ .filter(|(_, count)| *count >= threshold)
1166
+ .map(|(key, _)| key)
1167
+ .collect();
1168
+
1169
+ if repeating.is_empty() {
1170
+ return;
1171
+ }
1172
+
1173
+ tracing::debug!(
1174
+ repeating_count = repeating.len(),
1175
+ threshold,
1176
+ total_pages = all_pages.len(),
1177
+ "cross-page short-text repeating detection (tier 2)"
1178
+ );
1179
+
1180
+ // Mark matching paragraphs as furniture.
1181
+ for page in all_pages.iter_mut() {
1182
+ for para in page.iter_mut() {
1183
+ if para.is_page_furniture {
1184
+ continue;
1185
+ }
1186
+ let text = paragraph_plain_text(para);
1187
+ let normalized = text.trim().to_lowercase();
1188
+ let word_count = normalized.split_whitespace().count();
1189
+ if word_count > max_words {
1190
+ continue;
1191
+ }
1192
+ let alphanum_key: String = normalized.chars().filter(|c| c.is_alphanumeric()).collect();
1193
+ if repeating.contains(&alphanum_key) {
1194
+ tracing::trace!(
1195
+ text = %normalized.chars().take(60).collect::<String>(),
1196
+ "marking repeating short text as furniture (tier 2)"
1197
+ );
1198
+ para.is_page_furniture = true;
1199
+ para.heading_level = None;
1200
+ }
1201
+ }
1202
+ }
1203
+ }
1204
+
1048
1205
  #[cfg(test)]
1049
1206
  mod tests {
1050
1207
  use super::*;
@@ -380,6 +380,12 @@ pub(super) fn apply_hint_to_paragraph(para: &mut PdfParagraph, hint: &LayoutHint
380
380
  LayoutHintClass::PageHeader | LayoutHintClass::PageFooter => {
381
381
  para.is_page_furniture = true;
382
382
  }
383
+ LayoutHintClass::Picture => {
384
+ // Text classified as Picture by layout model is figure-internal text
385
+ // (diagram labels, axis text, etc.) — suppress from body output.
386
+ para.is_page_furniture = true;
387
+ para.heading_level = None;
388
+ }
383
389
  LayoutHintClass::Text | LayoutHintClass::Caption | LayoutHintClass::Footnote
384
390
  // Layout model says this is body text, not a heading.
385
391
  // Demote font-size-classified headings when layout has high confidence.
@@ -11,8 +11,8 @@ use rayon::prelude::*;
11
11
  use super::assembly::assemble_internal_document;
12
12
  use super::bridge::{ImagePosition, extracted_blocks_to_paragraphs, filter_sidebar_blocks, objects_to_page_data};
13
13
  use super::classify::{
14
- classify_paragraphs, demote_heading_runs, demote_unnumbered_subsections, mark_cross_page_repeating_text,
15
- refine_heading_hierarchy,
14
+ classify_paragraphs, demote_heading_runs, demote_unnumbered_subsections, mark_arxiv_noise,
15
+ mark_cross_page_repeating_short_text, mark_cross_page_repeating_text, refine_heading_hierarchy,
16
16
  };
17
17
  use super::constants::{
18
18
  FULL_LINE_FRACTION, MIN_FONT_SIZE, MIN_HEADING_FONT_GAP, MIN_HEADING_FONT_RATIO, PAGE_BOTTOM_MARGIN_FRACTION,
@@ -1187,6 +1187,10 @@ pub fn extract_document_structure(
1187
1187
 
1188
1188
  // Mark short text that repeats across many pages as furniture (headers/footers/watermarks).
1189
1189
  mark_cross_page_repeating_text(&mut all_page_paragraphs, &page_heights);
1190
+ // Tier 2: catch short repeating text outside margin zones (e.g. conference headers).
1191
+ mark_cross_page_repeating_short_text(&mut all_page_paragraphs);
1192
+ // Mark arXiv watermark identifiers on first pages.
1193
+ mark_arxiv_noise(&mut all_page_paragraphs);
1190
1194
  for page in &mut all_page_paragraphs {
1191
1195
  retain_page_furniture_safely(page);
1192
1196
  }
@@ -1225,6 +1229,11 @@ pub fn extract_document_structure(
1225
1229
 
1226
1230
  let mut doc = assemble_internal_document(all_page_paragraphs, &combined_tables, &image_pos_pairs);
1227
1231
 
1232
+ // Stage 4b: Populate doc.images with actual image data from pdfium.
1233
+ // Image elements reference indices into doc.images, which must be populated
1234
+ // for markdown/HTML rendering to produce `![desc](image_N.png)` instead of `![]()`.
1235
+ populate_images_from_pdfium(document, &all_image_positions, &mut doc);
1236
+
1228
1237
  let element_count = doc.elements.len();
1229
1238
  tracing::debug!(element_count, "PDF structure pipeline: assembly complete");
1230
1239
 
@@ -1748,6 +1757,122 @@ fn is_dedup_candidate(p: &PdfParagraph) -> bool {
1748
1757
  && p.caption_for.is_none()
1749
1758
  }
1750
1759
 
1760
+ /// Extract actual image data from pdfium and populate `doc.images`.
1761
+ ///
1762
+ /// Each `ImagePosition` records (page_number, image_index) for image objects
1763
+ /// found during page scanning. This function re-traverses the pages to extract
1764
+ /// actual pixel data via pdfium's `get_processed_image`, then pushes each as an
1765
+ /// `ExtractedImage` into the document so that rendering produces proper
1766
+ /// `![desc](image_N.fmt)` references instead of empty `![]()`.
1767
+ fn populate_images_from_pdfium(
1768
+ document: &PdfDocument,
1769
+ image_positions: &[super::bridge::ImagePosition],
1770
+ doc: &mut crate::types::internal::InternalDocument,
1771
+ ) {
1772
+ use bytes::Bytes;
1773
+ use image::ImageEncoder;
1774
+
1775
+ if image_positions.is_empty() {
1776
+ return;
1777
+ }
1778
+
1779
+ // Group image positions by page number (1-indexed) for efficient traversal.
1780
+ let mut by_page: std::collections::BTreeMap<usize, Vec<usize>> = std::collections::BTreeMap::new();
1781
+ for pos in image_positions {
1782
+ by_page.entry(pos.page_number).or_default().push(pos.image_index);
1783
+ }
1784
+
1785
+ let pages = document.pages();
1786
+ let mut extracted_count = 0u32;
1787
+
1788
+ for (&page_num, indices) in &by_page {
1789
+ let page_idx = page_num.saturating_sub(1) as i32;
1790
+ let Ok(page) = pages.get(page_idx) else {
1791
+ for &idx in indices {
1792
+ doc.images.push(empty_image_placeholder(idx, page_num));
1793
+ }
1794
+ continue;
1795
+ };
1796
+
1797
+ // Walk page objects, extracting image data for each matching index.
1798
+ let first_idx_on_page = indices.iter().copied().min().unwrap_or(0);
1799
+ let mut current_image = 0usize;
1800
+ let mut extracted_on_page: std::collections::BTreeMap<usize, crate::types::ExtractedImage> =
1801
+ std::collections::BTreeMap::new();
1802
+
1803
+ for obj in page.objects().iter() {
1804
+ if let Some(image_obj) = obj.as_image_object() {
1805
+ let global_idx = first_idx_on_page + current_image;
1806
+ if indices.contains(&global_idx)
1807
+ && let Ok(dynamic_image) = image_obj.get_processed_image(document)
1808
+ {
1809
+ let w = dynamic_image.width();
1810
+ let h = dynamic_image.height();
1811
+ let rgba = dynamic_image.to_rgba8();
1812
+ let mut png_buf: Vec<u8> = Vec::new();
1813
+ if image::codecs::png::PngEncoder::new(&mut png_buf)
1814
+ .write_image(rgba.as_raw(), w, h, image::ExtendedColorType::Rgba8)
1815
+ .is_ok()
1816
+ {
1817
+ extracted_count += 1;
1818
+ extracted_on_page.insert(
1819
+ global_idx,
1820
+ crate::types::ExtractedImage {
1821
+ data: Bytes::from(png_buf),
1822
+ format: std::borrow::Cow::Borrowed("png"),
1823
+ image_index: global_idx,
1824
+ page_number: Some(page_num),
1825
+ width: Some(w),
1826
+ height: Some(h),
1827
+ colorspace: Some("RGBA".to_string()),
1828
+ bits_per_component: Some(8),
1829
+ is_mask: false,
1830
+ description: None,
1831
+ ocr_result: None,
1832
+ bounding_box: None,
1833
+ source_path: None,
1834
+ },
1835
+ );
1836
+ }
1837
+ }
1838
+ current_image += 1;
1839
+ }
1840
+ }
1841
+
1842
+ for &idx in indices {
1843
+ let img = extracted_on_page
1844
+ .remove(&idx)
1845
+ .unwrap_or_else(|| empty_image_placeholder(idx, page_num));
1846
+ doc.images.push(img);
1847
+ }
1848
+ }
1849
+
1850
+ tracing::debug!(
1851
+ total_positions = image_positions.len(),
1852
+ extracted = extracted_count,
1853
+ "populated document images from pdfium"
1854
+ );
1855
+ }
1856
+
1857
+ /// Create an empty placeholder for an image that couldn't be extracted.
1858
+ fn empty_image_placeholder(idx: usize, page_num: usize) -> crate::types::ExtractedImage {
1859
+ crate::types::ExtractedImage {
1860
+ data: bytes::Bytes::new(),
1861
+ format: std::borrow::Cow::Borrowed("unknown"),
1862
+ image_index: idx,
1863
+ page_number: Some(page_num),
1864
+ width: None,
1865
+ height: None,
1866
+ colorspace: None,
1867
+ bits_per_component: None,
1868
+ is_mask: false,
1869
+ description: None,
1870
+ ocr_result: None,
1871
+ bounding_box: None,
1872
+ source_path: None,
1873
+ }
1874
+ }
1875
+
1751
1876
  #[cfg(test)]
1752
1877
  mod tests {
1753
1878
  use super::*;
@@ -676,6 +676,12 @@ pub fn build_comrak_ast<'a>(doc: &InternalDocument, arena: &'a comrak::Arena<'a>
676
676
  })
677
677
  .unwrap_or_default();
678
678
 
679
+ // Skip images with no URL and no description — they produce
680
+ // empty `![]()` nodes that add noise to the output.
681
+ if url.is_empty() && desc.is_empty() {
682
+ continue;
683
+ }
684
+
679
685
  let para = mk(arena, NodeValue::Paragraph);
680
686
  let img_node = mk(
681
687
  arena,
@@ -86,6 +86,10 @@ pub fn render_markdown(doc: &InternalDocument) -> String {
86
86
  .join("\n");
87
87
  }
88
88
 
89
+ // Strip arXiv watermark/sidebar noise that gets concatenated with body text.
90
+ // Only applies to the first ~6000 bytes (roughly the first two pages) to avoid touching references.
91
+ output = strip_arxiv_watermark_noise(output);
92
+
89
93
  // Trim trailing whitespace but keep single trailing newline
90
94
  let trimmed_len = output.trim_end().len();
91
95
  if trimmed_len == 0 {
@@ -97,6 +101,48 @@ pub fn render_markdown(doc: &InternalDocument) -> String {
97
101
  output
98
102
  }
99
103
 
104
+ /// Strip arXiv watermark noise from rendered markdown.
105
+ ///
106
+ /// LaTeX-generated PDFs often have a rotated sidebar with the arXiv identifier
107
+ /// that pdfium concatenates with body text. This strips patterns like:
108
+ /// "Title N arXiv:NNNN.NNNNNvN [cat.SC] DD Mon YYYY" from the first pages.
109
+ fn strip_arxiv_watermark_noise(mut text: String) -> String {
110
+ // Only search the first portion of the text (roughly first 2 pages)
111
+ let search_limit = text.len().min(6000);
112
+ let search_area = &text[..search_limit];
113
+
114
+ // Match: optional preceding short fragment + arXiv ID + optional version + category + date
115
+ let re = regex::Regex::new(
116
+ r"(?:\s+\S+(?:\s+\S+){0,8})?\s*arXiv:\d{4}\.\d{4,5}(?:v\d+)?(?:\s*\[[\w.-]+\])?\s*(?:\d{1,2}\s+\w+\s+\d{4})?",
117
+ )
118
+ .expect("valid regex");
119
+
120
+ if let Some(m) = re.find(search_area) {
121
+ // Only strip if it looks like a watermark (appears near end of a paragraph,
122
+ // not in the middle of a sentence about arXiv).
123
+ let after = &search_area[m.end()..];
124
+ let before_char = if m.start() > 0 {
125
+ search_area[..m.start()].chars().last()
126
+ } else {
127
+ None
128
+ };
129
+
130
+ // Strip if preceded by a sentence-ending period or is at end of paragraph
131
+ let is_at_paragraph_boundary = before_char == Some('.') || after.starts_with('\n') || after.starts_with("\n\n");
132
+ if is_at_paragraph_boundary {
133
+ let start = m.start();
134
+ let end = m.end();
135
+ tracing::trace!(
136
+ stripped = %&text[start..end].chars().take(80).collect::<String>(),
137
+ "stripping arXiv watermark from markdown output"
138
+ );
139
+ text.replace_range(start..end, "");
140
+ }
141
+ }
142
+
143
+ text
144
+ }
145
+
100
146
  /// Shared comrak options with all GFM extensions enabled.
101
147
  pub(crate) fn comrak_options<'a>() -> Options<'a> {
102
148
  let mut options = Options::default();
@@ -0,0 +1,113 @@
1
+ //! Regression tests for PDF image extraction in markdown output.
2
+ //!
3
+ //! Verifies that embedded images in PDFs produce proper `![](image_N.fmt)`
4
+ //! references instead of empty `![]()` placeholders.
5
+
6
+ #![cfg(feature = "pdf")]
7
+
8
+ use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
9
+ use kreuzberg::core::extractor::extract_file;
10
+ use std::path::PathBuf;
11
+
12
+ mod helpers;
13
+
14
+ fn test_documents_dir() -> PathBuf {
15
+ PathBuf::from(env!("CARGO_MANIFEST_DIR"))
16
+ .parent()
17
+ .unwrap()
18
+ .parent()
19
+ .unwrap()
20
+ .join("test_documents")
21
+ }
22
+
23
+ fn extract_markdown(relative_path: &str) -> kreuzberg::types::ExtractionResult {
24
+ let path = test_documents_dir().join(relative_path);
25
+ let config = ExtractionConfig {
26
+ output_format: OutputFormat::Markdown,
27
+ ..Default::default()
28
+ };
29
+ let rt = tokio::runtime::Runtime::new().unwrap();
30
+ rt.block_on(extract_file(&path, None, &config)).unwrap()
31
+ }
32
+
33
+ #[test]
34
+ fn test_multipage_marketing_no_empty_image_refs() {
35
+ let result = extract_markdown("pdf/multipage_marketing.pdf");
36
+ let content = &result.content;
37
+
38
+ // Must not contain empty image references
39
+ assert!(
40
+ !content.contains("![]()"),
41
+ "Markdown output must not contain empty image references ![](), got:\n{}",
42
+ content
43
+ );
44
+ }
45
+
46
+ #[test]
47
+ fn test_multipage_marketing_has_image_refs() {
48
+ let result = extract_markdown("pdf/multipage_marketing.pdf");
49
+ let content = &result.content;
50
+
51
+ // Must contain at least one proper image reference
52
+ assert!(
53
+ content.contains("![](image_"),
54
+ "Markdown output must contain image references like ![](image_N.png), got:\n{}",
55
+ content
56
+ );
57
+ }
58
+
59
+ #[test]
60
+ fn test_multipage_marketing_images_populated() {
61
+ let result = extract_markdown("pdf/multipage_marketing.pdf");
62
+
63
+ // Extraction result must have images with actual data
64
+ let images = result.images.as_ref().expect("images field must be Some");
65
+ assert!(!images.is_empty(), "Extraction result must contain extracted images");
66
+
67
+ // At least some images should have non-empty data
68
+ let images_with_data = images.iter().filter(|img| !img.data.is_empty()).count();
69
+ assert!(
70
+ images_with_data > 0,
71
+ "At least some images should have actual pixel data, got {} images total but none with data",
72
+ images.len()
73
+ );
74
+ }
75
+
76
+ #[test]
77
+ fn test_docling_no_empty_image_refs() {
78
+ let result = extract_markdown("pdf/docling.pdf");
79
+ let content = &result.content;
80
+
81
+ assert!(
82
+ !content.contains("![]()"),
83
+ "Docling markdown must not contain empty image references ![](), got:\n{}",
84
+ content
85
+ );
86
+ }
87
+
88
+ #[test]
89
+ fn test_docling_has_image_refs() {
90
+ let result = extract_markdown("pdf/docling.pdf");
91
+ let content = &result.content;
92
+
93
+ // Docling has at least 1 figure
94
+ assert!(
95
+ content.contains("![](image_"),
96
+ "Docling markdown must contain image references, got:\n{}",
97
+ content
98
+ );
99
+ }
100
+
101
+ #[test]
102
+ fn test_docling_content_quality() {
103
+ let result = extract_markdown("pdf/docling.pdf");
104
+ let content = &result.content;
105
+
106
+ // Verify key content from the Docling technical report is present
107
+ assert!(content.contains("Docling"), "Must contain 'Docling'");
108
+ assert!(content.contains("PDF"), "Must contain 'PDF'");
109
+ assert!(
110
+ content.contains("table structure recognition") || content.contains("TableFormer"),
111
+ "Must mention table structure recognition or TableFormer"
112
+ );
113
+ }
@@ -0,0 +1,200 @@
1
//! PDF output quality integration tests.
//!
//! Regression tests verifying that extraction output is clean and free of
//! common noise patterns (figure-internal text, arXiv watermarks, reference
//! entries misclassified as headings, repeating conference headers).
//!
//! Benchmark documents:
//! - `docling.pdf` — academic paper with figures, tables, arXiv sidebar
//! - `multi_page.pdf` — clean multi-page document (no noise expected)

#![cfg(feature = "pdf")]

mod helpers;

use helpers::*;
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
use kreuzberg::extract_file_sync;

/// Run a synchronous extraction of the given test document with `config` and
/// return the extracted content.
///
/// Panics when the document is missing or when extraction fails (using
/// `failure_msg` as the panic message in the latter case).
fn extract_content(relative_path: &str, config: &ExtractionConfig, failure_msg: &str) -> String {
    let pdf_path = get_test_file_path(relative_path);
    assert!(pdf_path.exists(), "Test document not found: {}", relative_path);
    extract_file_sync(&pdf_path, None, config).expect(failure_msg).content
}

/// Extract a test document as markdown with the default configuration.
fn extract_markdown(relative_path: &str) -> String {
    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        ..Default::default()
    };
    extract_content(relative_path, &config, "extraction should succeed")
}

/// Extract a test document as markdown with default layout detection enabled.
#[cfg(feature = "layout-detection")]
fn extract_markdown_with_layout(relative_path: &str) -> String {
    use kreuzberg::core::config::layout::LayoutDetectionConfig;

    let config = ExtractionConfig {
        output_format: OutputFormat::Markdown,
        layout: Some(LayoutDetectionConfig::default()),
        ..Default::default()
    };
    extract_content(relative_path, &config, "layout extraction should succeed")
}

// ── Noise filtering: figure-internal text ────────────────────────────

#[cfg(feature = "layout-detection")]
#[test]
fn test_docling_no_figure_internal_text() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown_with_layout("pdf/docling.pdf");

    // "Circling Minimums" is a heading from inside an appendix figure — should be suppressed
    let leaked_figure_heading = content.contains("Circling Minimums");
    assert!(
        !leaked_figure_heading,
        "Figure-internal heading 'Circling Minimums' leaked into output"
    );

    // Figure diagram labels from Figure 1 should not appear as body text
    let leaked_diagram_label = content.contains("{;} Parse PDF pages");
    assert!(!leaked_diagram_label, "Figure 1 diagram text leaked into output");
}

#[cfg(feature = "layout-detection")]
#[test]
fn test_docling_no_figure_text_as_headings() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown_with_layout("pdf/docling.pdf");

    // "{;} Parse PDF pages" is from the pipeline diagram (Figure 1)
    for heading in content.lines().filter(|line| line.starts_with('#')) {
        for diagram_text in ["{;}", "Parse PDF pages Table Structure OCR"] {
            assert!(
                !heading.contains(diagram_text),
                "Figure diagram text promoted to heading: {}",
                heading
            );
        }
    }
}

// ── Noise filtering: arXiv watermark ─────────────────────────────────

#[cfg(feature = "layout-detection")]
#[test]
fn test_docling_no_arxiv_watermark() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown_with_layout("pdf/docling.pdf");

    // The arXiv sidebar watermark "arXiv:2408.09869v5" should be stripped.
    // Legitimate references to arXiv in body text are fine (they don't include the ID).
    let has_watermark = content.contains("arXiv:2408.09869");
    assert!(!has_watermark, "arXiv watermark identifier not stripped from output");
}

// ── Noise filtering: references as headings ──────────────────────────

#[cfg(feature = "layout-detection")]
#[test]
fn test_docling_references_not_headings() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown_with_layout("pdf/docling.pdf");

    // Individual reference entries should not be promoted to ## headings
    let reference_markers = ["PyPDFium2", "LlamaIndex", "PyttiuPDF"];
    for h in content.lines().filter(|l| l.starts_with("## ")) {
        for marker in reference_markers {
            assert!(
                !h.contains(marker),
                "Reference entry misclassified as heading: {}",
                h
            );
        }
    }
}

// ── Content preservation ─────────────────────────────────────────────

#[cfg(feature = "layout-detection")]
#[test]
fn test_docling_key_content_preserved() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown_with_layout("pdf/docling.pdf");
    // True when at least one of the given spellings occurs in the output.
    let contains_any = |needles: &[&str]| needles.iter().any(|needle| content.contains(*needle));

    assert!(
        content.contains("Docling Technical Report"),
        "Title not found in output"
    );
    assert!(
        contains_any(&["Processing pipeline", "processing pipeline"]),
        "Section 'Processing pipeline' not found"
    );
    assert!(content.contains("TableFormer"), "'TableFormer' not found");
    assert!(
        contains_any(&["PDF backend", "PDF backends"]),
        "'PDF backends' section not found"
    );
}

#[test]
fn test_multipage_clean_output() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown("pdf/multi_page.pdf");

    assert!(content.contains("Evolution of the Word Processor"), "Title not found");
    assert!(
        content.contains("Pre-Digital Era"),
        "Section 'Pre-Digital Era' not found"
    );
    assert!(content.contains("IBM MT/ST"), "'IBM MT/ST' not found");
}

#[test]
fn test_multipage_no_noise() {
    if !test_documents_available() {
        return;
    }
    let content = extract_markdown("pdf/multi_page.pdf");

    // multi_page.pdf is a clean document — should have no arXiv noise
    let has_arxiv_id = content.contains("arXiv:");
    assert!(!has_arxiv_id, "multipage.pdf should have no arXiv identifiers");
}
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-ffi"
3
- version = "4.7.3"
3
+ version = "4.7.4"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -41,7 +41,7 @@ serde_json = { version = "1.0.149" }
41
41
  tokio = { version = "1.51.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
42
42
 
43
43
  [target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
44
- kreuzberg = { path = "../kreuzberg", version = "4.7.3", default-features = false, features = [
44
+ kreuzberg = { path = "../kreuzberg", version = "4.7.4", default-features = false, features = [
45
45
  "pdf",
46
46
  "excel",
47
47
  "office",
@@ -64,7 +64,7 @@ kreuzberg = { path = "../kreuzberg", version = "4.7.3", default-features = false
64
64
  ] }
65
65
 
66
66
  [target.'cfg(not(all(windows, target_env = "gnu")))'.dependencies]
67
- kreuzberg = { path = "../kreuzberg", version = "4.7.3", default-features = false, features = ["bundled-pdfium", "full"] }
67
+ kreuzberg = { path = "../kreuzberg", version = "4.7.4", default-features = false, features = ["bundled-pdfium", "full"] }
68
68
 
69
69
  [build-dependencies]
70
70
  cbindgen = "0.29"
@@ -9,8 +9,8 @@
9
9
 
10
10
  #define KREUZBERG_VERSION_MAJOR 4
11
11
  #define KREUZBERG_VERSION_MINOR 7
12
- #define KREUZBERG_VERSION_PATCH 3
13
- #define KREUZBERG_VERSION "4.7.3"
12
+ #define KREUZBERG_VERSION_PATCH 4
13
+ #define KREUZBERG_VERSION "4.7.4"
14
14
 
15
15
 
16
16
  #include <stdarg.h>
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-paddle-ocr"
3
- version = "4.7.3"
3
+ version = "4.7.4"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-pdfium-render"
3
- version = "4.7.3"
3
+ version = "4.7.4"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.7.3"
3
+ version = "4.7.4"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
@@ -2160,7 +2160,7 @@ impl Clone for TesseractAPI {
2160
2160
  }
2161
2161
 
2162
2162
  #[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
2163
- unsafe extern "C-unwind" {
2163
+ ffi_extern! {
2164
2164
  fn TessBaseAPIMeanTextConf(handle: *mut c_void) -> c_int;
2165
2165
  fn TessBaseAPISetVariable(handle: *mut c_void, name: *const c_char, value: *const c_char) -> c_int;
2166
2166
  fn TessBaseAPIGetStringVariable(handle: *mut c_void, name: *const c_char) -> *const c_char;
@@ -69,7 +69,7 @@ impl Drop for ChoiceIterator {
69
69
  }
70
70
  }
71
71
 
72
- unsafe extern "C-unwind" {
72
+ ffi_extern! {
73
73
  fn TessChoiceIteratorDelete(handle: *mut c_void);
74
74
  fn TessChoiceIteratorNext(handle: *mut c_void) -> c_int;
75
75
  fn TessChoiceIteratorGetUTF8Text(handle: *mut c_void) -> *mut c_char;
@@ -29,7 +29,7 @@ use std::ffi::c_void;
29
29
  // ---------------------------------------------------------------------------
30
30
 
31
31
  #[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
32
- unsafe extern "C-unwind" {
32
+ ffi_extern! {
33
33
  /// Allocates a new Pix with the given dimensions and bit depth.
34
34
  fn pixCreate(width: i32, height: i32, depth: i32) -> *mut c_void;
35
35
 
@@ -127,6 +127,34 @@
127
127
  //! Ok(())
128
128
  //! }
129
129
  //! ```
130
+ /// Declare FFI functions with `extern "C-unwind"` on native targets (to catch
131
+ /// C++ exceptions from Tesseract/Leptonica) and `extern "C"` on WASM (where
132
+ /// the LLVM backend does not support `cleanupret` / C++ unwinding).
133
+ macro_rules! ffi_extern {
134
+ (
135
+ $(
136
+ $(#[$meta:meta])*
137
+ $vis:vis fn $name:ident($($arg:ident : $ty:ty),* $(,)?) $(-> $ret:ty)?;
138
+ )*
139
+ ) => {
140
+ #[cfg(not(target_arch = "wasm32"))]
141
+ unsafe extern "C-unwind" {
142
+ $(
143
+ $(#[$meta])*
144
+ $vis fn $name($($arg : $ty),*) $(-> $ret)?;
145
+ )*
146
+ }
147
+
148
+ #[cfg(target_arch = "wasm32")]
149
+ unsafe extern "C" {
150
+ $(
151
+ $(#[$meta])*
152
+ $vis fn $name($($arg : $ty),*) $(-> $ret)?;
153
+ )*
154
+ }
155
+ };
156
+ }
157
+
130
158
  pub use error::{Result, TesseractError};
131
159
  mod error;
132
160
 
@@ -60,7 +60,7 @@ impl Drop for TessMonitor {
60
60
  }
61
61
  }
62
62
 
63
- unsafe extern "C-unwind" {
63
+ ffi_extern! {
64
64
  pub fn TessMonitorCreate() -> *mut c_void;
65
65
  pub fn TessMonitorDelete(monitor: *mut c_void);
66
66
  pub fn TessMonitorSetDeadlineMSecs(monitor: *mut c_void, deadline: c_int);
@@ -191,7 +191,7 @@ impl Drop for MutableIterator {
191
191
  }
192
192
  }
193
193
 
194
- unsafe extern "C-unwind" {
194
+ ffi_extern! {
195
195
  pub fn TessResultIteratorDelete(handle: *mut c_void);
196
196
  pub fn TessDeleteText(text: *mut c_char);
197
197
  }
@@ -380,7 +380,7 @@ impl Drop for PageIterator {
380
380
  }
381
381
  }
382
382
 
383
- unsafe extern "C-unwind" {
383
+ ffi_extern! {
384
384
  pub fn TessPageIteratorDelete(handle: *mut c_void);
385
385
  pub fn TessPageIteratorBegin(handle: *mut c_void);
386
386
  pub fn TessPageIteratorNext(handle: *mut c_void, level: c_int) -> c_int;
@@ -555,7 +555,7 @@ impl Drop for ResultIterator {
555
555
  }
556
556
 
557
557
  #[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
558
- unsafe extern "C-unwind" {
558
+ ffi_extern! {
559
559
  pub fn TessResultIteratorDelete(handle: *mut c_void);
560
560
  pub fn TessPageIteratorBegin(handle: *mut c_void);
561
561
  pub fn TessResultIteratorGetUTF8Text(handle: *mut c_void, level: c_int) -> *mut c_char;
@@ -198,7 +198,7 @@ impl Drop for TessResultRenderer {
198
198
  }
199
199
  }
200
200
 
201
- unsafe extern "C-unwind" {
201
+ ffi_extern! {
202
202
  pub fn TessTextRendererCreate(outputbase: *const c_char) -> *mut c_void;
203
203
  pub fn TessHOcrRendererCreate(outputbase: *const c_char) -> *mut c_void;
204
204
  pub fn TessPDFRendererCreate(outputbase: *const c_char, datadir: *const c_char, textonly: c_int) -> *mut c_void;
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.7.3
4
+ version: 4.7.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-04-05 00:00:00.000000000 Z
11
+ date: 2026-04-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -1031,11 +1031,13 @@ files:
1031
1031
  - vendor/kreuzberg/tests/path_resolution/fixtures/typst_with_images.typ
1032
1032
  - vendor/kreuzberg/tests/pdf_hierarchy_detection.rs
1033
1033
  - vendor/kreuzberg/tests/pdf_hierarchy_quality.rs
1034
+ - vendor/kreuzberg/tests/pdf_image_extraction_tests.rs
1034
1035
  - vendor/kreuzberg/tests/pdf_integration.rs
1035
1036
  - vendor/kreuzberg/tests/pdf_markdown_extraction.rs
1036
1037
  - vendor/kreuzberg/tests/pdf_markdown_quality.rs
1037
1038
  - vendor/kreuzberg/tests/pdf_markdown_regression.rs
1038
1039
  - vendor/kreuzberg/tests/pdf_ocr_triggering.rs
1040
+ - vendor/kreuzberg/tests/pdf_output_quality.rs
1039
1041
  - vendor/kreuzberg/tests/pdf_table_detection.rs
1040
1042
  - vendor/kreuzberg/tests/pdf_table_ground_truth.rs
1041
1043
  - vendor/kreuzberg/tests/pdf_text_merging.rs