kreuzberg 4.7.3 → 4.7.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +6 -6
- data/ext/kreuzberg_rb/native/Cargo.toml +1 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +4 -4
- data/vendor/kreuzberg/Cargo.toml +2 -2
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/pdf/structure/classify.rs +157 -0
- data/vendor/kreuzberg/src/pdf/structure/layout_classify.rs +6 -0
- data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +127 -2
- data/vendor/kreuzberg/src/rendering/comrak_bridge.rs +6 -0
- data/vendor/kreuzberg/src/rendering/markdown.rs +46 -0
- data/vendor/kreuzberg/tests/pdf_image_extraction_tests.rs +113 -0
- data/vendor/kreuzberg/tests/pdf_output_quality.rs +200 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +3 -3
- data/vendor/kreuzberg-ffi/kreuzberg.h +2 -2
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/src/api.rs +1 -1
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +1 -1
- data/vendor/kreuzberg-tesseract/src/leptonica.rs +1 -1
- data/vendor/kreuzberg-tesseract/src/lib.rs +28 -0
- data/vendor/kreuzberg-tesseract/src/monitor.rs +1 -1
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +1 -1
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +1 -1
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +1 -1
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3fffd5a1e2d066b0997155be101dade834002fd2805fd556b2e5b6b8a8d29be9
|
|
4
|
+
data.tar.gz: 0f1ce8406a8880327191fa4e49bdadd71f938ec47ecb3b7e38f4121008d16600
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f878aeea0ccb330d30f863707fc31ba0dce4a8b2eb5f6fca705e5d21337a68ea209328a8dc84ae565c20a996d3ae6f5e53149d570a5439f6746f27df3d1c5671
|
|
7
|
+
data.tar.gz: 1ca62afb51ffb9d85f3a820c7d44473d393bfdefab21f78723c52d3304082382e30d646c2751203d2f0a0d2ab346ec664856662ba7c212c565b35777fa8f167f
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.7.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.7.4" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -2792,7 +2792,7 @@ dependencies = [
|
|
|
2792
2792
|
|
|
2793
2793
|
[[package]]
|
|
2794
2794
|
name = "kreuzberg"
|
|
2795
|
-
version = "4.7.
|
|
2795
|
+
version = "4.7.3"
|
|
2796
2796
|
dependencies = [
|
|
2797
2797
|
"ahash",
|
|
2798
2798
|
"async-trait",
|
|
@@ -2885,7 +2885,7 @@ dependencies = [
|
|
|
2885
2885
|
|
|
2886
2886
|
[[package]]
|
|
2887
2887
|
name = "kreuzberg-ffi"
|
|
2888
|
-
version = "4.7.
|
|
2888
|
+
version = "4.7.3"
|
|
2889
2889
|
dependencies = [
|
|
2890
2890
|
"ahash",
|
|
2891
2891
|
"async-trait",
|
|
@@ -2901,7 +2901,7 @@ dependencies = [
|
|
|
2901
2901
|
|
|
2902
2902
|
[[package]]
|
|
2903
2903
|
name = "kreuzberg-paddle-ocr"
|
|
2904
|
-
version = "4.7.
|
|
2904
|
+
version = "4.7.3"
|
|
2905
2905
|
dependencies = [
|
|
2906
2906
|
"geo-clipper",
|
|
2907
2907
|
"geo-types",
|
|
@@ -2915,7 +2915,7 @@ dependencies = [
|
|
|
2915
2915
|
|
|
2916
2916
|
[[package]]
|
|
2917
2917
|
name = "kreuzberg-pdfium-render"
|
|
2918
|
-
version = "4.7.
|
|
2918
|
+
version = "4.7.3"
|
|
2919
2919
|
dependencies = [
|
|
2920
2920
|
"bitflags",
|
|
2921
2921
|
"bytemuck",
|
|
@@ -2938,7 +2938,7 @@ dependencies = [
|
|
|
2938
2938
|
|
|
2939
2939
|
[[package]]
|
|
2940
2940
|
name = "kreuzberg-rb"
|
|
2941
|
-
version = "4.7.
|
|
2941
|
+
version = "4.7.3"
|
|
2942
2942
|
dependencies = [
|
|
2943
2943
|
"async-trait",
|
|
2944
2944
|
"html-to-markdown-rs",
|
|
@@ -2955,7 +2955,7 @@ dependencies = [
|
|
|
2955
2955
|
|
|
2956
2956
|
[[package]]
|
|
2957
2957
|
name = "kreuzberg-tesseract"
|
|
2958
|
-
version = "4.7.
|
|
2958
|
+
version = "4.7.3"
|
|
2959
2959
|
dependencies = [
|
|
2960
2960
|
"cc",
|
|
2961
2961
|
"cmake",
|
data/lib/kreuzberg/version.rb
CHANGED
data/vendor/Cargo.toml
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
members = ["kreuzberg", "kreuzberg-ffi", "kreuzberg-tesseract", "kreuzberg-paddle-ocr", "kreuzberg-pdfium-render"]
|
|
3
3
|
|
|
4
4
|
[workspace.package]
|
|
5
|
-
version = "4.7.
|
|
5
|
+
version = "4.7.4"
|
|
6
6
|
edition = "2024"
|
|
7
7
|
rust-version = "1.91"
|
|
8
8
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -32,8 +32,8 @@ html-to-markdown-rs = { version = "3.1.0", default-features = false }
|
|
|
32
32
|
image = { version = "0.25.10", default-features = false }
|
|
33
33
|
itertools = "0.14"
|
|
34
34
|
js-sys = "0.3"
|
|
35
|
-
kreuzberg = { path = "./crates/kreuzberg", version = "4.7.
|
|
36
|
-
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.7.
|
|
35
|
+
kreuzberg = { path = "./crates/kreuzberg", version = "4.7.4", default-features = false }
|
|
36
|
+
kreuzberg-ffi = { path = "./crates/kreuzberg-ffi", version = "4.7.4" }
|
|
37
37
|
lazy_static = "1.5.0"
|
|
38
38
|
libc = "0.2.184"
|
|
39
39
|
log = "0.4"
|
|
@@ -43,7 +43,7 @@ num_cpus = "1.17.0"
|
|
|
43
43
|
once_cell = "1.21.4"
|
|
44
44
|
ort = { version = "2.0.0-rc.12", features = ["std", "api-18"], default-features = false }
|
|
45
45
|
parking_lot = "0.12.5"
|
|
46
|
-
pdf_oxide = { version = "0.3.
|
|
46
|
+
pdf_oxide = { version = "0.3.21", default-features = false }
|
|
47
47
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "crates/kreuzberg-pdfium-render", version = "4.3" }
|
|
48
48
|
rayon = "1.11.0"
|
|
49
49
|
reqwest = { version = "0.13.2", default-features = false }
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.7.
|
|
3
|
+
version = "4.7.4"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -307,7 +307,7 @@ ort = { version = "2.0.0-rc.12", default-features = false, features = [
|
|
|
307
307
|
outlook-pst = { version = "1.2.0", optional = true }
|
|
308
308
|
parking_lot = "0.12.5"
|
|
309
309
|
pastey = "0.2"
|
|
310
|
-
pdf_oxide = { version = "0.3.
|
|
310
|
+
pdf_oxide = { version = "0.3.21", default-features = false, optional = true }
|
|
311
311
|
pdfium-render = { package = "kreuzberg-pdfium-render", path = "../kreuzberg-pdfium-render", features = ["thread_safe", "image_latest"], optional = true }
|
|
312
312
|
pulldown-cmark = { version = "0.13" }
|
|
313
313
|
quick-xml = { version = "0.39.2", features = ["serialize"], optional = true }
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -18,7 +18,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
18
18
|
|
|
19
19
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
20
20
|
|
|
21
|
-
> **🚀 Version 4.7.
|
|
21
|
+
> **🚀 Version 4.7.4 Release**
|
|
22
22
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
23
23
|
>
|
|
24
24
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
@@ -1045,6 +1045,163 @@ fn is_greek_letter(c: char) -> bool {
|
|
|
1045
1045
|
matches!(c, '\u{0391}'..='\u{03A9}' | '\u{03B1}'..='\u{03C9}')
|
|
1046
1046
|
}
|
|
1047
1047
|
|
|
1048
|
+
/// Remove arXiv watermark/sidebar noise from paragraphs on the first pages.
|
|
1049
|
+
///
|
|
1050
|
+
/// Handles two cases:
|
|
1051
|
+
/// 1. Short standalone paragraphs that are just the arXiv identifier → mark as furniture.
|
|
1052
|
+
/// 2. arXiv identifier appended to the end of a longer paragraph (LaTeX sidebar
|
|
1053
|
+
/// text that pdfium concatenates with body text) → strip the trailing noise.
|
|
1054
|
+
pub(super) fn mark_arxiv_noise(all_pages: &mut [Vec<PdfParagraph>]) {
|
|
1055
|
+
let arxiv_re = regex::Regex::new(r"arXiv:\d{4}\.\d{4,5}").expect("valid regex");
|
|
1056
|
+
// Match trailing sidebar noise: title/page-num + arXiv ID (+ category + date) at end.
|
|
1057
|
+
// The sidebar text from LaTeX gets concatenated by pdfium with body text.
|
|
1058
|
+
// We capture from the arXiv ID to end-of-string and also eat back any preceding
|
|
1059
|
+
// short title/page-number fragment (up to ~8 words before arXiv:).
|
|
1060
|
+
let trailing_re = regex::Regex::new(
|
|
1061
|
+
r"(?:\s+(?:\S+\s+){0,8})?arXiv:\d{4}\.\d{4,5}(?:v\d+)?(?:\s*\[[\w.-]+\])?\s*(?:\d{1,2}\s+\w+\s+\d{4})?\s*$",
|
|
1062
|
+
)
|
|
1063
|
+
.expect("valid regex");
|
|
1064
|
+
|
|
1065
|
+
// Only check first 2 pages — arXiv watermarks don't appear later.
|
|
1066
|
+
for page in all_pages.iter_mut().take(2) {
|
|
1067
|
+
for para in page.iter_mut() {
|
|
1068
|
+
if para.is_page_furniture {
|
|
1069
|
+
continue;
|
|
1070
|
+
}
|
|
1071
|
+
let text = paragraph_plain_text(para);
|
|
1072
|
+
let trimmed = text.trim();
|
|
1073
|
+
let word_count = trimmed.split_whitespace().count();
|
|
1074
|
+
|
|
1075
|
+
if !arxiv_re.is_match(trimmed) {
|
|
1076
|
+
continue;
|
|
1077
|
+
}
|
|
1078
|
+
|
|
1079
|
+
// Short paragraph dominated by the arXiv identifier → mark as furniture.
|
|
1080
|
+
if word_count <= 25 {
|
|
1081
|
+
tracing::trace!(
|
|
1082
|
+
text = %trimmed.chars().take(80).collect::<String>(),
|
|
1083
|
+
"marking arXiv watermark as furniture"
|
|
1084
|
+
);
|
|
1085
|
+
para.is_page_furniture = true;
|
|
1086
|
+
para.heading_level = None;
|
|
1087
|
+
} else if let Some(m) = trailing_re.find(trimmed) {
|
|
1088
|
+
// arXiv id is at the end of a longer paragraph — strip it from the last segment.
|
|
1089
|
+
let noise = &trimmed[m.start()..];
|
|
1090
|
+
tracing::trace!(
|
|
1091
|
+
stripped = %noise.chars().take(80).collect::<String>(),
|
|
1092
|
+
"stripping trailing arXiv watermark from paragraph"
|
|
1093
|
+
);
|
|
1094
|
+
strip_trailing_text_from_paragraph(para, noise.trim());
|
|
1095
|
+
}
|
|
1096
|
+
}
|
|
1097
|
+
}
|
|
1098
|
+
}
|
|
1099
|
+
|
|
1100
|
+
/// Strip trailing noise text from the last segment(s) of a paragraph.
|
|
1101
|
+
fn strip_trailing_text_from_paragraph(para: &mut PdfParagraph, noise: &str) {
|
|
1102
|
+
// Walk lines in reverse to find the segment containing the noise.
|
|
1103
|
+
for line in para.lines.iter_mut().rev() {
|
|
1104
|
+
for seg in line.segments.iter_mut().rev() {
|
|
1105
|
+
if let Some(pos) = seg.text.find(noise) {
|
|
1106
|
+
seg.text = seg.text[..pos].trim_end().to_string();
|
|
1107
|
+
return;
|
|
1108
|
+
}
|
|
1109
|
+
// If the entire segment is part of the noise, clear it.
|
|
1110
|
+
let seg_trimmed = seg.text.trim();
|
|
1111
|
+
if !seg_trimmed.is_empty() && noise.contains(seg_trimmed) {
|
|
1112
|
+
seg.text.clear();
|
|
1113
|
+
} else {
|
|
1114
|
+
// Reached body text — stop.
|
|
1115
|
+
return;
|
|
1116
|
+
}
|
|
1117
|
+
}
|
|
1118
|
+
}
|
|
1119
|
+
}
|
|
1120
|
+
|
|
1121
|
+
/// Second-tier cross-page repeating text detection.
|
|
1122
|
+
///
|
|
1123
|
+
/// Supplements `mark_cross_page_repeating_text` by scanning ALL paragraphs
|
|
1124
|
+
/// (not just margin-positioned ones) for short text that repeats on a
|
|
1125
|
+
/// supermajority of pages. Catches inline conference headers, journal running
|
|
1126
|
+
/// titles, and similar repeated boilerplate that appears outside the margin zone.
|
|
1127
|
+
pub(super) fn mark_cross_page_repeating_short_text(all_pages: &mut [Vec<PdfParagraph>]) {
|
|
1128
|
+
if all_pages.len() < 5 {
|
|
1129
|
+
return;
|
|
1130
|
+
}
|
|
1131
|
+
|
|
1132
|
+
let max_words = 20;
|
|
1133
|
+
let threshold = (all_pages.len() as f64 * 0.7).ceil() as usize;
|
|
1134
|
+
|
|
1135
|
+
// Count how many pages each short text appears on.
|
|
1136
|
+
let mut text_page_count: ahash::AHashMap<String, usize> = ahash::AHashMap::new();
|
|
1137
|
+
for page in all_pages.iter() {
|
|
1138
|
+
let mut seen: ahash::AHashSet<String> = ahash::AHashSet::new();
|
|
1139
|
+
for para in page {
|
|
1140
|
+
if para.is_page_furniture {
|
|
1141
|
+
continue;
|
|
1142
|
+
}
|
|
1143
|
+
let text = paragraph_plain_text(para);
|
|
1144
|
+
let normalized = text.trim().to_lowercase();
|
|
1145
|
+
if normalized.is_empty() {
|
|
1146
|
+
continue;
|
|
1147
|
+
}
|
|
1148
|
+
let word_count = normalized.split_whitespace().count();
|
|
1149
|
+
if word_count > max_words {
|
|
1150
|
+
continue;
|
|
1151
|
+
}
|
|
1152
|
+
let alphanum_key: String = normalized.chars().filter(|c| c.is_alphanumeric()).collect();
|
|
1153
|
+
if alphanum_key.is_empty() {
|
|
1154
|
+
continue;
|
|
1155
|
+
}
|
|
1156
|
+
if seen.insert(alphanum_key.clone()) {
|
|
1157
|
+
*text_page_count.entry(alphanum_key).or_insert(0) += 1;
|
|
1158
|
+
}
|
|
1159
|
+
}
|
|
1160
|
+
}
|
|
1161
|
+
|
|
1162
|
+
// Collect keys that repeat on ≥70% of pages.
|
|
1163
|
+
let repeating: ahash::AHashSet<String> = text_page_count
|
|
1164
|
+
.into_iter()
|
|
1165
|
+
.filter(|(_, count)| *count >= threshold)
|
|
1166
|
+
.map(|(key, _)| key)
|
|
1167
|
+
.collect();
|
|
1168
|
+
|
|
1169
|
+
if repeating.is_empty() {
|
|
1170
|
+
return;
|
|
1171
|
+
}
|
|
1172
|
+
|
|
1173
|
+
tracing::debug!(
|
|
1174
|
+
repeating_count = repeating.len(),
|
|
1175
|
+
threshold,
|
|
1176
|
+
total_pages = all_pages.len(),
|
|
1177
|
+
"cross-page short-text repeating detection (tier 2)"
|
|
1178
|
+
);
|
|
1179
|
+
|
|
1180
|
+
// Mark matching paragraphs as furniture.
|
|
1181
|
+
for page in all_pages.iter_mut() {
|
|
1182
|
+
for para in page.iter_mut() {
|
|
1183
|
+
if para.is_page_furniture {
|
|
1184
|
+
continue;
|
|
1185
|
+
}
|
|
1186
|
+
let text = paragraph_plain_text(para);
|
|
1187
|
+
let normalized = text.trim().to_lowercase();
|
|
1188
|
+
let word_count = normalized.split_whitespace().count();
|
|
1189
|
+
if word_count > max_words {
|
|
1190
|
+
continue;
|
|
1191
|
+
}
|
|
1192
|
+
let alphanum_key: String = normalized.chars().filter(|c| c.is_alphanumeric()).collect();
|
|
1193
|
+
if repeating.contains(&alphanum_key) {
|
|
1194
|
+
tracing::trace!(
|
|
1195
|
+
text = %normalized.chars().take(60).collect::<String>(),
|
|
1196
|
+
"marking repeating short text as furniture (tier 2)"
|
|
1197
|
+
);
|
|
1198
|
+
para.is_page_furniture = true;
|
|
1199
|
+
para.heading_level = None;
|
|
1200
|
+
}
|
|
1201
|
+
}
|
|
1202
|
+
}
|
|
1203
|
+
}
|
|
1204
|
+
|
|
1048
1205
|
#[cfg(test)]
|
|
1049
1206
|
mod tests {
|
|
1050
1207
|
use super::*;
|
|
@@ -380,6 +380,12 @@ pub(super) fn apply_hint_to_paragraph(para: &mut PdfParagraph, hint: &LayoutHint
|
|
|
380
380
|
LayoutHintClass::PageHeader | LayoutHintClass::PageFooter => {
|
|
381
381
|
para.is_page_furniture = true;
|
|
382
382
|
}
|
|
383
|
+
LayoutHintClass::Picture => {
|
|
384
|
+
// Text classified as Picture by layout model is figure-internal text
|
|
385
|
+
// (diagram labels, axis text, etc.) — suppress from body output.
|
|
386
|
+
para.is_page_furniture = true;
|
|
387
|
+
para.heading_level = None;
|
|
388
|
+
}
|
|
383
389
|
LayoutHintClass::Text | LayoutHintClass::Caption | LayoutHintClass::Footnote
|
|
384
390
|
// Layout model says this is body text, not a heading.
|
|
385
391
|
// Demote font-size-classified headings when layout has high confidence.
|
|
@@ -11,8 +11,8 @@ use rayon::prelude::*;
|
|
|
11
11
|
use super::assembly::assemble_internal_document;
|
|
12
12
|
use super::bridge::{ImagePosition, extracted_blocks_to_paragraphs, filter_sidebar_blocks, objects_to_page_data};
|
|
13
13
|
use super::classify::{
|
|
14
|
-
classify_paragraphs, demote_heading_runs, demote_unnumbered_subsections,
|
|
15
|
-
refine_heading_hierarchy,
|
|
14
|
+
classify_paragraphs, demote_heading_runs, demote_unnumbered_subsections, mark_arxiv_noise,
|
|
15
|
+
mark_cross_page_repeating_short_text, mark_cross_page_repeating_text, refine_heading_hierarchy,
|
|
16
16
|
};
|
|
17
17
|
use super::constants::{
|
|
18
18
|
FULL_LINE_FRACTION, MIN_FONT_SIZE, MIN_HEADING_FONT_GAP, MIN_HEADING_FONT_RATIO, PAGE_BOTTOM_MARGIN_FRACTION,
|
|
@@ -1187,6 +1187,10 @@ pub fn extract_document_structure(
|
|
|
1187
1187
|
|
|
1188
1188
|
// Mark short text that repeats across many pages as furniture (headers/footers/watermarks).
|
|
1189
1189
|
mark_cross_page_repeating_text(&mut all_page_paragraphs, &page_heights);
|
|
1190
|
+
// Tier 2: catch short repeating text outside margin zones (e.g. conference headers).
|
|
1191
|
+
mark_cross_page_repeating_short_text(&mut all_page_paragraphs);
|
|
1192
|
+
// Mark arXiv watermark identifiers on first pages.
|
|
1193
|
+
mark_arxiv_noise(&mut all_page_paragraphs);
|
|
1190
1194
|
for page in &mut all_page_paragraphs {
|
|
1191
1195
|
retain_page_furniture_safely(page);
|
|
1192
1196
|
}
|
|
@@ -1225,6 +1229,11 @@ pub fn extract_document_structure(
|
|
|
1225
1229
|
|
|
1226
1230
|
let mut doc = assemble_internal_document(all_page_paragraphs, &combined_tables, &image_pos_pairs);
|
|
1227
1231
|
|
|
1232
|
+
// Stage 4b: Populate doc.images with actual image data from pdfium.
|
|
1233
|
+
// Image elements reference indices into doc.images, which must be populated
|
|
1234
|
+
// for markdown/HTML rendering to produce populated image references instead of `![]()`.
|
|
1235
|
+
populate_images_from_pdfium(document, &all_image_positions, &mut doc);
|
|
1236
|
+
|
|
1228
1237
|
let element_count = doc.elements.len();
|
|
1229
1238
|
tracing::debug!(element_count, "PDF structure pipeline: assembly complete");
|
|
1230
1239
|
|
|
@@ -1748,6 +1757,122 @@ fn is_dedup_candidate(p: &PdfParagraph) -> bool {
|
|
|
1748
1757
|
&& p.caption_for.is_none()
|
|
1749
1758
|
}
|
|
1750
1759
|
|
|
1760
|
+
/// Extract actual image data from pdfium and populate `doc.images`.
|
|
1761
|
+
///
|
|
1762
|
+
/// Each `ImagePosition` records (page_number, image_index) for image objects
|
|
1763
|
+
/// found during page scanning. This function re-traverses the pages to extract
|
|
1764
|
+
/// actual pixel data via pdfium's `get_processed_image`, then pushes each as an
|
|
1765
|
+
/// `ExtractedImage` into the document so that rendering produces proper
|
|
1766
|
+
/// `` references instead of empty `![]()`.
|
|
1767
|
+
fn populate_images_from_pdfium(
|
|
1768
|
+
document: &PdfDocument,
|
|
1769
|
+
image_positions: &[super::bridge::ImagePosition],
|
|
1770
|
+
doc: &mut crate::types::internal::InternalDocument,
|
|
1771
|
+
) {
|
|
1772
|
+
use bytes::Bytes;
|
|
1773
|
+
use image::ImageEncoder;
|
|
1774
|
+
|
|
1775
|
+
if image_positions.is_empty() {
|
|
1776
|
+
return;
|
|
1777
|
+
}
|
|
1778
|
+
|
|
1779
|
+
// Group image positions by page number (1-indexed) for efficient traversal.
|
|
1780
|
+
let mut by_page: std::collections::BTreeMap<usize, Vec<usize>> = std::collections::BTreeMap::new();
|
|
1781
|
+
for pos in image_positions {
|
|
1782
|
+
by_page.entry(pos.page_number).or_default().push(pos.image_index);
|
|
1783
|
+
}
|
|
1784
|
+
|
|
1785
|
+
let pages = document.pages();
|
|
1786
|
+
let mut extracted_count = 0u32;
|
|
1787
|
+
|
|
1788
|
+
for (&page_num, indices) in &by_page {
|
|
1789
|
+
let page_idx = page_num.saturating_sub(1) as i32;
|
|
1790
|
+
let Ok(page) = pages.get(page_idx) else {
|
|
1791
|
+
for &idx in indices {
|
|
1792
|
+
doc.images.push(empty_image_placeholder(idx, page_num));
|
|
1793
|
+
}
|
|
1794
|
+
continue;
|
|
1795
|
+
};
|
|
1796
|
+
|
|
1797
|
+
// Walk page objects, extracting image data for each matching index.
|
|
1798
|
+
let first_idx_on_page = indices.iter().copied().min().unwrap_or(0);
|
|
1799
|
+
let mut current_image = 0usize;
|
|
1800
|
+
let mut extracted_on_page: std::collections::BTreeMap<usize, crate::types::ExtractedImage> =
|
|
1801
|
+
std::collections::BTreeMap::new();
|
|
1802
|
+
|
|
1803
|
+
for obj in page.objects().iter() {
|
|
1804
|
+
if let Some(image_obj) = obj.as_image_object() {
|
|
1805
|
+
let global_idx = first_idx_on_page + current_image;
|
|
1806
|
+
if indices.contains(&global_idx)
|
|
1807
|
+
&& let Ok(dynamic_image) = image_obj.get_processed_image(document)
|
|
1808
|
+
{
|
|
1809
|
+
let w = dynamic_image.width();
|
|
1810
|
+
let h = dynamic_image.height();
|
|
1811
|
+
let rgba = dynamic_image.to_rgba8();
|
|
1812
|
+
let mut png_buf: Vec<u8> = Vec::new();
|
|
1813
|
+
if image::codecs::png::PngEncoder::new(&mut png_buf)
|
|
1814
|
+
.write_image(rgba.as_raw(), w, h, image::ExtendedColorType::Rgba8)
|
|
1815
|
+
.is_ok()
|
|
1816
|
+
{
|
|
1817
|
+
extracted_count += 1;
|
|
1818
|
+
extracted_on_page.insert(
|
|
1819
|
+
global_idx,
|
|
1820
|
+
crate::types::ExtractedImage {
|
|
1821
|
+
data: Bytes::from(png_buf),
|
|
1822
|
+
format: std::borrow::Cow::Borrowed("png"),
|
|
1823
|
+
image_index: global_idx,
|
|
1824
|
+
page_number: Some(page_num),
|
|
1825
|
+
width: Some(w),
|
|
1826
|
+
height: Some(h),
|
|
1827
|
+
colorspace: Some("RGBA".to_string()),
|
|
1828
|
+
bits_per_component: Some(8),
|
|
1829
|
+
is_mask: false,
|
|
1830
|
+
description: None,
|
|
1831
|
+
ocr_result: None,
|
|
1832
|
+
bounding_box: None,
|
|
1833
|
+
source_path: None,
|
|
1834
|
+
},
|
|
1835
|
+
);
|
|
1836
|
+
}
|
|
1837
|
+
}
|
|
1838
|
+
current_image += 1;
|
|
1839
|
+
}
|
|
1840
|
+
}
|
|
1841
|
+
|
|
1842
|
+
for &idx in indices {
|
|
1843
|
+
let img = extracted_on_page
|
|
1844
|
+
.remove(&idx)
|
|
1845
|
+
.unwrap_or_else(|| empty_image_placeholder(idx, page_num));
|
|
1846
|
+
doc.images.push(img);
|
|
1847
|
+
}
|
|
1848
|
+
}
|
|
1849
|
+
|
|
1850
|
+
tracing::debug!(
|
|
1851
|
+
total_positions = image_positions.len(),
|
|
1852
|
+
extracted = extracted_count,
|
|
1853
|
+
"populated document images from pdfium"
|
|
1854
|
+
);
|
|
1855
|
+
}
|
|
1856
|
+
|
|
1857
|
+
/// Create an empty placeholder for an image that couldn't be extracted.
|
|
1858
|
+
fn empty_image_placeholder(idx: usize, page_num: usize) -> crate::types::ExtractedImage {
|
|
1859
|
+
crate::types::ExtractedImage {
|
|
1860
|
+
data: bytes::Bytes::new(),
|
|
1861
|
+
format: std::borrow::Cow::Borrowed("unknown"),
|
|
1862
|
+
image_index: idx,
|
|
1863
|
+
page_number: Some(page_num),
|
|
1864
|
+
width: None,
|
|
1865
|
+
height: None,
|
|
1866
|
+
colorspace: None,
|
|
1867
|
+
bits_per_component: None,
|
|
1868
|
+
is_mask: false,
|
|
1869
|
+
description: None,
|
|
1870
|
+
ocr_result: None,
|
|
1871
|
+
bounding_box: None,
|
|
1872
|
+
source_path: None,
|
|
1873
|
+
}
|
|
1874
|
+
}
|
|
1875
|
+
|
|
1751
1876
|
#[cfg(test)]
|
|
1752
1877
|
mod tests {
|
|
1753
1878
|
use super::*;
|
|
@@ -676,6 +676,12 @@ pub fn build_comrak_ast<'a>(doc: &InternalDocument, arena: &'a comrak::Arena<'a>
|
|
|
676
676
|
})
|
|
677
677
|
.unwrap_or_default();
|
|
678
678
|
|
|
679
|
+
// Skip images with no URL and no description — they produce
|
|
680
|
+
// empty `![]()` nodes that add noise to the output.
|
|
681
|
+
if url.is_empty() && desc.is_empty() {
|
|
682
|
+
continue;
|
|
683
|
+
}
|
|
684
|
+
|
|
679
685
|
let para = mk(arena, NodeValue::Paragraph);
|
|
680
686
|
let img_node = mk(
|
|
681
687
|
arena,
|
|
@@ -86,6 +86,10 @@ pub fn render_markdown(doc: &InternalDocument) -> String {
|
|
|
86
86
|
.join("\n");
|
|
87
87
|
}
|
|
88
88
|
|
|
89
|
+
// Strip arXiv watermark/sidebar noise that gets concatenated with body text.
|
|
90
|
+
// Only the first 6000 bytes (roughly the first pages) are searched, to avoid touching references.
|
|
91
|
+
output = strip_arxiv_watermark_noise(output);
|
|
92
|
+
|
|
89
93
|
// Trim trailing whitespace but keep single trailing newline
|
|
90
94
|
let trimmed_len = output.trim_end().len();
|
|
91
95
|
if trimmed_len == 0 {
|
|
@@ -97,6 +101,48 @@ pub fn render_markdown(doc: &InternalDocument) -> String {
|
|
|
97
101
|
output
|
|
98
102
|
}
|
|
99
103
|
|
|
104
|
+
/// Strip arXiv watermark noise from rendered markdown.
|
|
105
|
+
///
|
|
106
|
+
/// LaTeX-generated PDFs often have a rotated sidebar with the arXiv identifier
|
|
107
|
+
/// that pdfium concatenates with body text. This strips patterns like:
|
|
108
|
+
/// "Title N arXiv:NNNN.NNNNNvN [cat.SC] DD Mon YYYY" from the first pages.
|
|
109
|
+
fn strip_arxiv_watermark_noise(mut text: String) -> String {
|
|
110
|
+
// Only search the first portion of the text (roughly first 2 pages)
|
|
111
|
+
let search_limit = text.len().min(6000);
|
|
112
|
+
let search_area = &text[..search_limit];
|
|
113
|
+
|
|
114
|
+
// Match: optional preceding short fragment + arXiv ID + optional version + category + date
|
|
115
|
+
let re = regex::Regex::new(
|
|
116
|
+
r"(?:\s+\S+(?:\s+\S+){0,8})?\s*arXiv:\d{4}\.\d{4,5}(?:v\d+)?(?:\s*\[[\w.-]+\])?\s*(?:\d{1,2}\s+\w+\s+\d{4})?",
|
|
117
|
+
)
|
|
118
|
+
.expect("valid regex");
|
|
119
|
+
|
|
120
|
+
if let Some(m) = re.find(search_area) {
|
|
121
|
+
// Only strip if it looks like a watermark (appears near end of a paragraph,
|
|
122
|
+
// not in the middle of a sentence about arXiv).
|
|
123
|
+
let after = &search_area[m.end()..];
|
|
124
|
+
let before_char = if m.start() > 0 {
|
|
125
|
+
search_area[..m.start()].chars().last()
|
|
126
|
+
} else {
|
|
127
|
+
None
|
|
128
|
+
};
|
|
129
|
+
|
|
130
|
+
// Strip if preceded by a sentence-ending period or is at end of paragraph
|
|
131
|
+
let is_at_paragraph_boundary = before_char == Some('.') || after.starts_with('\n') || after.starts_with("\n\n");
|
|
132
|
+
if is_at_paragraph_boundary {
|
|
133
|
+
let start = m.start();
|
|
134
|
+
let end = m.end();
|
|
135
|
+
tracing::trace!(
|
|
136
|
+
stripped = %&text[start..end].chars().take(80).collect::<String>(),
|
|
137
|
+
"stripping arXiv watermark from markdown output"
|
|
138
|
+
);
|
|
139
|
+
text.replace_range(start..end, "");
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
text
|
|
144
|
+
}
|
|
145
|
+
|
|
100
146
|
/// Shared comrak options with all GFM extensions enabled.
|
|
101
147
|
pub(crate) fn comrak_options<'a>() -> Options<'a> {
|
|
102
148
|
let mut options = Options::default();
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
//! Regression tests for PDF image extraction in markdown output.
|
|
2
|
+
//!
|
|
3
|
+
//! Verifies that embedded images in PDFs produce proper, non-empty image
|
|
4
|
+
//! references instead of empty `![]()` placeholders.
|
|
5
|
+
|
|
6
|
+
#![cfg(feature = "pdf")]
|
|
7
|
+
|
|
8
|
+
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
|
|
9
|
+
use kreuzberg::core::extractor::extract_file;
|
|
10
|
+
use std::path::PathBuf;
|
|
11
|
+
|
|
12
|
+
mod helpers;
|
|
13
|
+
|
|
14
|
+
fn test_documents_dir() -> PathBuf {
|
|
15
|
+
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
|
16
|
+
.parent()
|
|
17
|
+
.unwrap()
|
|
18
|
+
.parent()
|
|
19
|
+
.unwrap()
|
|
20
|
+
.join("test_documents")
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
fn extract_markdown(relative_path: &str) -> kreuzberg::types::ExtractionResult {
|
|
24
|
+
let path = test_documents_dir().join(relative_path);
|
|
25
|
+
let config = ExtractionConfig {
|
|
26
|
+
output_format: OutputFormat::Markdown,
|
|
27
|
+
..Default::default()
|
|
28
|
+
};
|
|
29
|
+
let rt = tokio::runtime::Runtime::new().unwrap();
|
|
30
|
+
rt.block_on(extract_file(&path, None, &config)).unwrap()
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
/// The marketing PDF must never render an image as an empty `![]()` reference.
#[test]
fn test_multipage_marketing_no_empty_image_refs() {
    let result = extract_markdown("pdf/multipage_marketing.pdf");
    let markdown = &result.content;

    let has_empty_ref = markdown.contains("![]()");
    assert!(
        !has_empty_ref,
        "Markdown output must not contain empty image references ![](), got:\n{}",
        markdown
    );
}
|
|
45
|
+
|
|
46
|
+
#[test]
|
|
47
|
+
fn test_multipage_marketing_has_image_refs() {
|
|
48
|
+
let result = extract_markdown("pdf/multipage_marketing.pdf");
|
|
49
|
+
let content = &result.content;
|
|
50
|
+
|
|
51
|
+
// Must contain at least one proper image reference
|
|
52
|
+
assert!(
|
|
53
|
+
content.contains(",
|
|
54
|
+
"Markdown output must contain image references like , got:\n{}",
|
|
55
|
+
content
|
|
56
|
+
);
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
/// The marketing PDF must yield a non-empty image list in which at least one
/// image carries actual bytes.
#[test]
fn test_multipage_marketing_images_populated() {
    let result = extract_markdown("pdf/multipage_marketing.pdf");

    let images = result.images.as_ref().expect("images field must be Some");
    assert!(!images.is_empty(), "Extraction result must contain extracted images");

    // A single image with data is enough to satisfy the check.
    let any_with_data = images.iter().any(|img| !img.data.is_empty());
    assert!(
        any_with_data,
        "At least some images should have actual pixel data, got {} images total but none with data",
        images.len()
    );
}
|
|
75
|
+
|
|
76
|
+
/// The Docling report must never render an image as an empty `![]()` reference.
#[test]
fn test_docling_no_empty_image_refs() {
    let result = extract_markdown("pdf/docling.pdf");
    let markdown = &result.content;

    let has_empty_ref = markdown.contains("![]()");
    assert!(
        !has_empty_ref,
        "Docling markdown must not contain empty image references ![](), got:\n{}",
        markdown
    );
}
|
|
87
|
+
|
|
88
|
+
#[test]
|
|
89
|
+
fn test_docling_has_image_refs() {
|
|
90
|
+
let result = extract_markdown("pdf/docling.pdf");
|
|
91
|
+
let content = &result.content;
|
|
92
|
+
|
|
93
|
+
// Docling has at least 1 figure
|
|
94
|
+
assert!(
|
|
95
|
+
content.contains(",
|
|
96
|
+
"Docling markdown must contain image references, got:\n{}",
|
|
97
|
+
content
|
|
98
|
+
);
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
/// Sanity-check that key terms from the Docling technical report survive extraction.
#[test]
fn test_docling_content_quality() {
    let result = extract_markdown("pdf/docling.pdf");
    let markdown = &result.content;

    let mentions_docling = markdown.contains("Docling");
    let mentions_pdf = markdown.contains("PDF");
    let mentions_tables = markdown.contains("table structure recognition") || markdown.contains("TableFormer");

    assert!(mentions_docling, "Must contain 'Docling'");
    assert!(mentions_pdf, "Must contain 'PDF'");
    assert!(
        mentions_tables,
        "Must mention table structure recognition or TableFormer"
    );
}
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
//! PDF output quality integration tests.
|
|
2
|
+
//!
|
|
3
|
+
//! Regression tests verifying that extraction output is clean and free of
|
|
4
|
+
//! common noise patterns (figure-internal text, arXiv watermarks, reference
|
|
5
|
+
//! entries misclassified as headings, repeating conference headers).
|
|
6
|
+
//!
|
|
7
|
+
//! Benchmark documents:
|
|
8
|
+
//! - `docling.pdf` — academic paper with figures, tables, arXiv sidebar
|
|
9
|
+
//! - `multi_page.pdf` — clean multi-page document (no noise expected)
|
|
10
|
+
|
|
11
|
+
#![cfg(feature = "pdf")]
|
|
12
|
+
|
|
13
|
+
mod helpers;
|
|
14
|
+
|
|
15
|
+
use helpers::*;
|
|
16
|
+
use kreuzberg::core::config::{ExtractionConfig, OutputFormat};
|
|
17
|
+
use kreuzberg::extract_file_sync;
|
|
18
|
+
|
|
19
|
+
fn extract_markdown(relative_path: &str) -> String {
|
|
20
|
+
let pdf_path = get_test_file_path(relative_path);
|
|
21
|
+
if !pdf_path.exists() {
|
|
22
|
+
panic!("Test document not found: {}", relative_path);
|
|
23
|
+
}
|
|
24
|
+
let config = ExtractionConfig {
|
|
25
|
+
output_format: OutputFormat::Markdown,
|
|
26
|
+
..Default::default()
|
|
27
|
+
};
|
|
28
|
+
extract_file_sync(&pdf_path, None, &config)
|
|
29
|
+
.expect("extraction should succeed")
|
|
30
|
+
.content
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
#[cfg(feature = "layout-detection")]
|
|
34
|
+
fn extract_markdown_with_layout(relative_path: &str) -> String {
|
|
35
|
+
use kreuzberg::core::config::layout::LayoutDetectionConfig;
|
|
36
|
+
|
|
37
|
+
let pdf_path = get_test_file_path(relative_path);
|
|
38
|
+
if !pdf_path.exists() {
|
|
39
|
+
panic!("Test document not found: {}", relative_path);
|
|
40
|
+
}
|
|
41
|
+
let config = ExtractionConfig {
|
|
42
|
+
output_format: OutputFormat::Markdown,
|
|
43
|
+
layout: Some(LayoutDetectionConfig::default()),
|
|
44
|
+
..Default::default()
|
|
45
|
+
};
|
|
46
|
+
extract_file_sync(&pdf_path, None, &config)
|
|
47
|
+
.expect("layout extraction should succeed")
|
|
48
|
+
.content
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
// ── Noise filtering: figure-internal text ────────────────────────────
|
|
52
|
+
|
|
53
|
+
#[cfg(feature = "layout-detection")]
|
|
54
|
+
#[test]
|
|
55
|
+
fn test_docling_no_figure_internal_text() {
|
|
56
|
+
if !test_documents_available() {
|
|
57
|
+
return;
|
|
58
|
+
}
|
|
59
|
+
let content = extract_markdown_with_layout("pdf/docling.pdf");
|
|
60
|
+
|
|
61
|
+
// "Circling Minimums" is a heading from inside an appendix figure — should be suppressed
|
|
62
|
+
assert!(
|
|
63
|
+
!content.contains("Circling Minimums"),
|
|
64
|
+
"Figure-internal heading 'Circling Minimums' leaked into output"
|
|
65
|
+
);
|
|
66
|
+
|
|
67
|
+
// Figure diagram labels from Figure 1 should not appear as body text
|
|
68
|
+
assert!(
|
|
69
|
+
!content.contains("{;} Parse PDF pages"),
|
|
70
|
+
"Figure 1 diagram text leaked into output"
|
|
71
|
+
);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
#[cfg(feature = "layout-detection")]
|
|
75
|
+
#[test]
|
|
76
|
+
fn test_docling_no_figure_text_as_headings() {
|
|
77
|
+
if !test_documents_available() {
|
|
78
|
+
return;
|
|
79
|
+
}
|
|
80
|
+
let content = extract_markdown_with_layout("pdf/docling.pdf");
|
|
81
|
+
|
|
82
|
+
// "{;} Parse PDF pages" is from the pipeline diagram (Figure 1)
|
|
83
|
+
for line in content.lines() {
|
|
84
|
+
if line.starts_with('#') {
|
|
85
|
+
assert!(
|
|
86
|
+
!line.contains("{;}"),
|
|
87
|
+
"Figure diagram text promoted to heading: {}",
|
|
88
|
+
line
|
|
89
|
+
);
|
|
90
|
+
assert!(
|
|
91
|
+
!line.contains("Parse PDF pages Table Structure OCR"),
|
|
92
|
+
"Figure diagram text promoted to heading: {}",
|
|
93
|
+
line
|
|
94
|
+
);
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
// ── Noise filtering: arXiv watermark ─────────────────────────────────
|
|
100
|
+
|
|
101
|
+
#[cfg(feature = "layout-detection")]
|
|
102
|
+
#[test]
|
|
103
|
+
fn test_docling_no_arxiv_watermark() {
|
|
104
|
+
if !test_documents_available() {
|
|
105
|
+
return;
|
|
106
|
+
}
|
|
107
|
+
let content = extract_markdown_with_layout("pdf/docling.pdf");
|
|
108
|
+
|
|
109
|
+
// The arXiv sidebar watermark "arXiv:2408.09869v5" should be stripped.
|
|
110
|
+
// Legitimate references to arXiv in body text are fine (they don't include the ID).
|
|
111
|
+
assert!(
|
|
112
|
+
!content.contains("arXiv:2408.09869"),
|
|
113
|
+
"arXiv watermark identifier not stripped from output"
|
|
114
|
+
);
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
// ── Noise filtering: references as headings ──────────────────────────
|
|
118
|
+
|
|
119
|
+
#[cfg(feature = "layout-detection")]
|
|
120
|
+
#[test]
|
|
121
|
+
fn test_docling_references_not_headings() {
|
|
122
|
+
if !test_documents_available() {
|
|
123
|
+
return;
|
|
124
|
+
}
|
|
125
|
+
let content = extract_markdown_with_layout("pdf/docling.pdf");
|
|
126
|
+
|
|
127
|
+
// Individual reference entries should not be promoted to ## headings
|
|
128
|
+
let heading_lines: Vec<&str> = content.lines().filter(|l| l.starts_with("## ")).collect();
|
|
129
|
+
for h in &heading_lines {
|
|
130
|
+
assert!(
|
|
131
|
+
!h.contains("PyPDFium2"),
|
|
132
|
+
"Reference entry misclassified as heading: {}",
|
|
133
|
+
h
|
|
134
|
+
);
|
|
135
|
+
assert!(
|
|
136
|
+
!h.contains("LlamaIndex"),
|
|
137
|
+
"Reference entry misclassified as heading: {}",
|
|
138
|
+
h
|
|
139
|
+
);
|
|
140
|
+
assert!(
|
|
141
|
+
!h.contains("PyttiuPDF"),
|
|
142
|
+
"Reference entry misclassified as heading: {}",
|
|
143
|
+
h
|
|
144
|
+
);
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
// ── Content preservation ─────────────────────────────────────────────
|
|
149
|
+
|
|
150
|
+
#[cfg(feature = "layout-detection")]
|
|
151
|
+
#[test]
|
|
152
|
+
fn test_docling_key_content_preserved() {
|
|
153
|
+
if !test_documents_available() {
|
|
154
|
+
return;
|
|
155
|
+
}
|
|
156
|
+
let content = extract_markdown_with_layout("pdf/docling.pdf");
|
|
157
|
+
|
|
158
|
+
assert!(
|
|
159
|
+
content.contains("Docling Technical Report"),
|
|
160
|
+
"Title not found in output"
|
|
161
|
+
);
|
|
162
|
+
assert!(
|
|
163
|
+
content.contains("Processing pipeline") || content.contains("processing pipeline"),
|
|
164
|
+
"Section 'Processing pipeline' not found"
|
|
165
|
+
);
|
|
166
|
+
assert!(content.contains("TableFormer"), "'TableFormer' not found");
|
|
167
|
+
assert!(
|
|
168
|
+
content.contains("PDF backend") || content.contains("PDF backends"),
|
|
169
|
+
"'PDF backends' section not found"
|
|
170
|
+
);
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
#[test]
|
|
174
|
+
fn test_multipage_clean_output() {
|
|
175
|
+
if !test_documents_available() {
|
|
176
|
+
return;
|
|
177
|
+
}
|
|
178
|
+
let content = extract_markdown("pdf/multi_page.pdf");
|
|
179
|
+
|
|
180
|
+
assert!(content.contains("Evolution of the Word Processor"), "Title not found");
|
|
181
|
+
assert!(
|
|
182
|
+
content.contains("Pre-Digital Era"),
|
|
183
|
+
"Section 'Pre-Digital Era' not found"
|
|
184
|
+
);
|
|
185
|
+
assert!(content.contains("IBM MT/ST"), "'IBM MT/ST' not found");
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
#[test]
|
|
189
|
+
fn test_multipage_no_noise() {
|
|
190
|
+
if !test_documents_available() {
|
|
191
|
+
return;
|
|
192
|
+
}
|
|
193
|
+
let content = extract_markdown("pdf/multi_page.pdf");
|
|
194
|
+
|
|
195
|
+
// multi_page.pdf is a clean document — should have no arXiv noise
|
|
196
|
+
assert!(
|
|
197
|
+
!content.contains("arXiv:"),
|
|
198
|
+
"multipage.pdf should have no arXiv identifiers"
|
|
199
|
+
);
|
|
200
|
+
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg-ffi"
|
|
3
|
-
version = "4.7.3"
|
|
3
|
+
version = "4.7.4"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <naaman@kreuzberg.dev>"]
|
|
@@ -41,7 +41,7 @@ serde_json = { version = "1.0.149" }
|
|
|
41
41
|
tokio = { version = "1.51.0", features = ["rt", "rt-multi-thread", "macros", "sync", "process", "fs", "time", "io-util"] }
|
|
42
42
|
|
|
43
43
|
[target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
|
|
44
|
-
kreuzberg = { path = "../kreuzberg", version = "4.7.3", default-features = false, features = [
|
|
44
|
+
kreuzberg = { path = "../kreuzberg", version = "4.7.4", default-features = false, features = [
|
|
45
45
|
"pdf",
|
|
46
46
|
"excel",
|
|
47
47
|
"office",
|
|
@@ -64,7 +64,7 @@ kreuzberg = { path = "../kreuzberg", version = "4.7.3", default-features = false
|
|
|
64
64
|
] }
|
|
65
65
|
|
|
66
66
|
[target.'cfg(not(all(windows, target_env = "gnu")))'.dependencies]
|
|
67
|
-
kreuzberg = { path = "../kreuzberg", version = "4.7.3", default-features = false, features = ["bundled-pdfium", "full"] }
|
|
67
|
+
kreuzberg = { path = "../kreuzberg", version = "4.7.4", default-features = false, features = ["bundled-pdfium", "full"] }
|
|
68
68
|
|
|
69
69
|
[build-dependencies]
|
|
70
70
|
cbindgen = "0.29"
|
|
@@ -2160,7 +2160,7 @@ impl Clone for TesseractAPI {
|
|
|
2160
2160
|
}
|
|
2161
2161
|
|
|
2162
2162
|
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
|
|
2163
|
-
|
|
2163
|
+
ffi_extern! {
|
|
2164
2164
|
fn TessBaseAPIMeanTextConf(handle: *mut c_void) -> c_int;
|
|
2165
2165
|
fn TessBaseAPISetVariable(handle: *mut c_void, name: *const c_char, value: *const c_char) -> c_int;
|
|
2166
2166
|
fn TessBaseAPIGetStringVariable(handle: *mut c_void, name: *const c_char) -> *const c_char;
|
|
@@ -69,7 +69,7 @@ impl Drop for ChoiceIterator {
|
|
|
69
69
|
}
|
|
70
70
|
}
|
|
71
71
|
|
|
72
|
-
|
|
72
|
+
ffi_extern! {
|
|
73
73
|
fn TessChoiceIteratorDelete(handle: *mut c_void);
|
|
74
74
|
fn TessChoiceIteratorNext(handle: *mut c_void) -> c_int;
|
|
75
75
|
fn TessChoiceIteratorGetUTF8Text(handle: *mut c_void) -> *mut c_char;
|
|
@@ -29,7 +29,7 @@ use std::ffi::c_void;
|
|
|
29
29
|
// ---------------------------------------------------------------------------
|
|
30
30
|
|
|
31
31
|
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
|
|
32
|
-
|
|
32
|
+
ffi_extern! {
|
|
33
33
|
/// Allocates a new Pix with the given dimensions and bit depth.
|
|
34
34
|
fn pixCreate(width: i32, height: i32, depth: i32) -> *mut c_void;
|
|
35
35
|
|
|
@@ -127,6 +127,34 @@
|
|
|
127
127
|
//! Ok(())
|
|
128
128
|
//! }
|
|
129
129
|
//! ```
|
|
130
|
+
/// Declare FFI functions with `extern "C-unwind"` on native targets (to catch
|
|
131
|
+
/// C++ exceptions from Tesseract/Leptonica) and `extern "C"` on WASM (where
|
|
132
|
+
/// the LLVM backend does not support `cleanupret` / C++ unwinding).
|
|
133
|
+
macro_rules! ffi_extern {
|
|
134
|
+
(
|
|
135
|
+
$(
|
|
136
|
+
$(#[$meta:meta])*
|
|
137
|
+
$vis:vis fn $name:ident($($arg:ident : $ty:ty),* $(,)?) $(-> $ret:ty)?;
|
|
138
|
+
)*
|
|
139
|
+
) => {
|
|
140
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
141
|
+
unsafe extern "C-unwind" {
|
|
142
|
+
$(
|
|
143
|
+
$(#[$meta])*
|
|
144
|
+
$vis fn $name($($arg : $ty),*) $(-> $ret)?;
|
|
145
|
+
)*
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
#[cfg(target_arch = "wasm32")]
|
|
149
|
+
unsafe extern "C" {
|
|
150
|
+
$(
|
|
151
|
+
$(#[$meta])*
|
|
152
|
+
$vis fn $name($($arg : $ty),*) $(-> $ret)?;
|
|
153
|
+
)*
|
|
154
|
+
}
|
|
155
|
+
};
|
|
156
|
+
}
|
|
157
|
+
|
|
130
158
|
pub use error::{Result, TesseractError};
|
|
131
159
|
mod error;
|
|
132
160
|
|
|
@@ -60,7 +60,7 @@ impl Drop for TessMonitor {
|
|
|
60
60
|
}
|
|
61
61
|
}
|
|
62
62
|
|
|
63
|
-
|
|
63
|
+
ffi_extern! {
|
|
64
64
|
pub fn TessMonitorCreate() -> *mut c_void;
|
|
65
65
|
pub fn TessMonitorDelete(monitor: *mut c_void);
|
|
66
66
|
pub fn TessMonitorSetDeadlineMSecs(monitor: *mut c_void, deadline: c_int);
|
|
@@ -380,7 +380,7 @@ impl Drop for PageIterator {
|
|
|
380
380
|
}
|
|
381
381
|
}
|
|
382
382
|
|
|
383
|
-
|
|
383
|
+
ffi_extern! {
|
|
384
384
|
pub fn TessPageIteratorDelete(handle: *mut c_void);
|
|
385
385
|
pub fn TessPageIteratorBegin(handle: *mut c_void);
|
|
386
386
|
pub fn TessPageIteratorNext(handle: *mut c_void, level: c_int) -> c_int;
|
|
@@ -555,7 +555,7 @@ impl Drop for ResultIterator {
|
|
|
555
555
|
}
|
|
556
556
|
|
|
557
557
|
#[cfg(any(feature = "build-tesseract", feature = "build-tesseract-wasm"))]
|
|
558
|
-
|
|
558
|
+
ffi_extern! {
|
|
559
559
|
pub fn TessResultIteratorDelete(handle: *mut c_void);
|
|
560
560
|
pub fn TessPageIteratorBegin(handle: *mut c_void);
|
|
561
561
|
pub fn TessResultIteratorGetUTF8Text(handle: *mut c_void, level: c_int) -> *mut c_char;
|
|
@@ -198,7 +198,7 @@ impl Drop for TessResultRenderer {
|
|
|
198
198
|
}
|
|
199
199
|
}
|
|
200
200
|
|
|
201
|
-
|
|
201
|
+
ffi_extern! {
|
|
202
202
|
pub fn TessTextRendererCreate(outputbase: *const c_char) -> *mut c_void;
|
|
203
203
|
pub fn TessHOcrRendererCreate(outputbase: *const c_char) -> *mut c_void;
|
|
204
204
|
pub fn TessPDFRendererCreate(outputbase: *const c_char, datadir: *const c_char, textonly: c_int) -> *mut c_void;
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.7.3
|
|
4
|
+
version: 4.7.4
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-04-
|
|
11
|
+
date: 2026-04-06 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -1031,11 +1031,13 @@ files:
|
|
|
1031
1031
|
- vendor/kreuzberg/tests/path_resolution/fixtures/typst_with_images.typ
|
|
1032
1032
|
- vendor/kreuzberg/tests/pdf_hierarchy_detection.rs
|
|
1033
1033
|
- vendor/kreuzberg/tests/pdf_hierarchy_quality.rs
|
|
1034
|
+
- vendor/kreuzberg/tests/pdf_image_extraction_tests.rs
|
|
1034
1035
|
- vendor/kreuzberg/tests/pdf_integration.rs
|
|
1035
1036
|
- vendor/kreuzberg/tests/pdf_markdown_extraction.rs
|
|
1036
1037
|
- vendor/kreuzberg/tests/pdf_markdown_quality.rs
|
|
1037
1038
|
- vendor/kreuzberg/tests/pdf_markdown_regression.rs
|
|
1038
1039
|
- vendor/kreuzberg/tests/pdf_ocr_triggering.rs
|
|
1040
|
+
- vendor/kreuzberg/tests/pdf_output_quality.rs
|
|
1039
1041
|
- vendor/kreuzberg/tests/pdf_table_detection.rs
|
|
1040
1042
|
- vendor/kreuzberg/tests/pdf_table_ground_truth.rs
|
|
1041
1043
|
- vendor/kreuzberg/tests/pdf_text_merging.rs
|