pdf2dotmd 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf2dotmd/__init__.py +3 -0
- pdf2dotmd/cli.py +100 -0
- pdf2dotmd/converter.py +172 -0
- pdf2dotmd/image_extractor.py +110 -0
- pdf2dotmd/layout_analyzer.py +478 -0
- pdf2dotmd/page_processor.py +119 -0
- pdf2dotmd/table_processor.py +93 -0
- pdf2dotmd/text_block.py +63 -0
- pdf2dotmd/utils.py +23 -0
- pdf2dotmd-0.0.1.dist-info/METADATA +89 -0
- pdf2dotmd-0.0.1.dist-info/RECORD +15 -0
- pdf2dotmd-0.0.1.dist-info/WHEEL +5 -0
- pdf2dotmd-0.0.1.dist-info/entry_points.txt +2 -0
- pdf2dotmd-0.0.1.dist-info/licenses/LICENSE +21 -0
- pdf2dotmd-0.0.1.dist-info/top_level.txt +1 -0
pdf2dotmd/__init__.py
ADDED
pdf2dotmd/cli.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Command line interface for PDF to Markdown converter."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
from glob import glob
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from .converter import PdfToMarkdownConverter
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def main():
    """CLI entry point: parse arguments and convert every matching PDF."""
    args = _build_parser().parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )

    converter = PdfToMarkdownConverter()

    try:
        for input_pattern in args.input_files:
            matching_files = glob(input_pattern)
            if not matching_files:
                logger.warning("No matching files found: %s", input_pattern)
                continue
            for file_path in matching_files:
                _convert_one(converter, file_path, args)
    except KeyboardInterrupt:
        logger.info("Conversion interrupted by user")
        sys.exit(1)
    except Exception as exc:  # pylint: disable=broad-except
        logger.error("Program execution failed: %s", exc)
        sys.exit(1)


def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the pdf2dotmd command line tool."""
    parser = argparse.ArgumentParser(
        description="Convert PDF files to Markdown format",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  %(prog)s input.pdf                  # Output to stdout
  %(prog)s input.pdf -o output.md     # Output to file
  %(prog)s input.pdf --ignore-images  # Skip images, single file output
  %(prog)s *.pdf -o output_dir/       # Batch conversion
  %(prog)s input.pdf -p 1-3           # Convert only pages 1-3
""",
    )

    parser.add_argument(
        "input_files",
        nargs="+",
        help="Input PDF file paths (supports wildcards)",
    )
    parser.add_argument("-o", "--output", help="Output file or directory path")
    parser.add_argument(
        "--ignore-images",
        "--no-images",
        action="store_true",
        dest="ignore_images",
        help="Ignore all images and output a single Markdown file",
    )
    parser.add_argument(
        "-p",
        "--pages",
        help="Page range to convert (e.g., '1-5,8,10-12')",
    )
    parser.add_argument("-v", "--verbose", action="store_true", help="Show verbose logs")
    return parser


def _convert_one(converter, file_path: str, args) -> None:
    """Convert one file according to the parsed CLI options.

    Prints the Markdown to stdout when no output path was requested.
    """
    if not file_path.lower().endswith(".pdf"):
        logger.warning("Skipping non-PDF file: %s", file_path)
        return

    output_path = None
    if args.output:
        # A trailing slash (or an existing directory) means "directory mode":
        # derive the per-file output name from the input stem.
        if os.path.isdir(args.output) or args.output.endswith("/"):
            output_path = os.path.join(args.output, f"{Path(file_path).stem}.md")
        else:
            output_path = args.output

    markdown_content = converter.convert_file(
        file_path,
        output_path=output_path,
        ignore_images=args.ignore_images,
        pages=args.pages,
    )

    if not output_path:
        print(f"\n=== {file_path} ===\n")
        print(markdown_content)


if __name__ == "__main__":
    main()
|
pdf2dotmd/converter.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""Core converter module for PDF to Markdown conversion."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import pdfplumber # type: ignore[import-not-found]
|
|
12
|
+
except ImportError: # pragma: no cover
|
|
13
|
+
pdfplumber = None # type: ignore[assignment]
|
|
14
|
+
|
|
15
|
+
from .image_extractor import ImageExtractor
|
|
16
|
+
from .layout_analyzer import LayoutAnalyzer
|
|
17
|
+
from .page_processor import PageProcessor
|
|
18
|
+
from .table_processor import TableProcessor
|
|
19
|
+
from .utils import clean_markdown_content
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _parse_page_range(page_spec: str, total_pages: int) -> list[int]:
    """Parse a page range string like '1-5,8,10-12' into sorted 0-based indices.

    Pages outside ``1..total_pages`` are silently dropped. Empty segments
    (e.g. a trailing comma such as ``"1-3,"``) are ignored instead of
    raising ``ValueError`` from ``int("")``.

    Args:
        page_spec: Comma-separated page numbers and ``start-end`` ranges (1-based).
        total_pages: Number of pages in the document.

    Returns:
        Sorted, de-duplicated list of 0-based page indices.

    Raises:
        ValueError: if a non-empty segment is not an integer or integer range.
    """
    indices: set[int] = set()
    for part in page_spec.split(","):
        part = part.strip()
        if not part:
            # Tolerate stray/trailing commas ("1-3,") — previously crashed.
            continue
        if "-" in part:
            start_str, end_str = part.split("-", 1)
            start = int(start_str.strip())
            end = int(end_str.strip())
        else:
            start = end = int(part)
        for p in range(start, end + 1):
            if 1 <= p <= total_pages:
                indices.add(p - 1)
    return sorted(indices)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class PdfToMarkdownConverter:
    """PDF to Markdown converter.

    Converts a text-based PDF into Markdown, optionally extracting embedded
    images into an ``assets`` directory that sits next to the generated
    ``.md`` file so relative image links resolve.
    """

    def __init__(self):
        # Directory that receives the generated .md file (and assets/).
        self.output_folder: str = ""
        # Directory for extracted images; empty string when images are ignored.
        self.assets_dir: str = ""

    def convert_file(
        self,
        input_path: str,
        output_path: Optional[str] = None,
        ignore_images: bool = False,
        pages: Optional[str] = None,
    ) -> str:
        """Convert one PDF file to Markdown.

        Args:
            input_path: Path to the source ``.pdf`` file.
            output_path: Optional output file or directory; derived from the
                input file name when omitted.
            ignore_images: Skip image extraction and produce a single file.
            pages: Optional 1-based page range spec such as ``'1-5,8,10-12'``.

        Returns:
            The generated Markdown content (also written to disk).

        Raises:
            FileNotFoundError: ``input_path`` does not exist.
            ValueError: input is not a ``.pdf`` or the page range selects nothing.
            RuntimeError: pdfplumber is not installed.
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input file does not exist: {input_path}")

        if not input_path.lower().endswith(".pdf"):
            raise ValueError(f"Only .pdf is supported: {input_path}")

        if pdfplumber is None:
            raise RuntimeError(
                "Missing required dependency 'pdfplumber'. Please run: pip install pdfplumber"
            )

        self._setup_output_structure(input_path, output_path, ignore_images)
        final_output_path = self._get_final_output_path(input_path, output_path)

        layout_analyzer = LayoutAnalyzer()
        table_processor = TableProcessor()
        image_extractor = (
            ImageExtractor(self.assets_dir) if not ignore_images and self.assets_dir else None
        )
        page_processor = PageProcessor(
            layout_analyzer=layout_analyzer,
            table_processor=table_processor,
            image_extractor=image_extractor,
            ignore_images=ignore_images,
        )

        output_lines: list[str] = []

        with pdfplumber.open(input_path) as pdf:
            total_pages = len(pdf.pages)

            if total_pages == 0:
                logger.warning("PDF has no pages: %s", input_path)
                markdown_content = "\n"
                self._write_output(markdown_content, final_output_path)
                # BUGFIX: the early return previously skipped this cleanup,
                # leaving an empty assets/ directory behind for empty PDFs.
                self._cleanup_empty_assets_dir()
                return markdown_content

            # Determine which 0-based page indices to convert.
            if pages:
                page_indices = _parse_page_range(pages, total_pages)
                if not page_indices:
                    raise ValueError(f"No valid pages in range '{pages}' (total: {total_pages})")
            else:
                page_indices = list(range(total_pages))

            # A PDF with no text layer is probably a scan; we cannot OCR it.
            if not any(pdf.pages[idx].chars for idx in page_indices):
                logger.warning(
                    "PDF appears to have no text layer (possibly scanned). "
                    "OCR is not supported. Output may be empty."
                )

            for idx in page_indices:
                page = pdf.pages[idx]
                page_number = idx + 1
                logger.debug("Processing page %d/%d", page_number, total_pages)
                output_lines.extend(page_processor.process_page(page, page_number))

        markdown_content = clean_markdown_content(output_lines)
        self._write_output(markdown_content, final_output_path)
        self._cleanup_empty_assets_dir()

        logger.info("Conversion completed, output file: %s", final_output_path)
        return markdown_content

    def _setup_output_structure(
        self, input_path: str, output_path: Optional[str], ignore_images: bool
    ):
        """Decide and create ``output_folder`` and, unless images are
        ignored, the ``assets`` subdirectory inside it."""
        input_stem = Path(input_path).stem

        if output_path:
            if os.path.isdir(output_path) or output_path.endswith("/"):
                # Output is a directory: create a per-document subfolder.
                self.output_folder = os.path.join(output_path, input_stem)
            else:
                # Output is a file path: use its parent directory
                # (fall back to a stem-named folder for bare filenames).
                self.output_folder = os.path.dirname(output_path) or input_stem
        else:
            self.output_folder = input_stem

        os.makedirs(self.output_folder, exist_ok=True)

        if ignore_images:
            self.assets_dir = ""
        else:
            self.assets_dir = os.path.join(self.output_folder, "assets")
            os.makedirs(self.assets_dir, exist_ok=True)

    def _get_final_output_path(self, input_path: str, output_path: Optional[str]) -> str:
        """Resolve the path of the ``.md`` file that will be written.

        Mirrors the directory-vs-file logic of ``_setup_output_structure``:
        an explicit file path wins; otherwise the name is derived from the
        input stem inside ``output_folder``.
        """
        input_stem = Path(input_path).stem

        if output_path:
            if os.path.isdir(output_path) or output_path.endswith("/"):
                return os.path.join(self.output_folder, f"{input_stem}.md")
            return output_path

        return os.path.join(self.output_folder, f"{input_stem}.md")

    def _write_output(self, content: str, output_path: str):
        """Write ``content`` to ``output_path``, creating parent dirs as needed."""
        output_dir = os.path.dirname(output_path)
        if output_dir:
            # exist_ok avoids the check-then-create race of the old code.
            os.makedirs(output_dir, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(content)

    def _cleanup_empty_assets_dir(self):
        """Remove the assets directory if no images were actually extracted."""
        if self.assets_dir and os.path.exists(self.assets_dir) and not os.listdir(self.assets_dir):
            os.rmdir(self.assets_dir)
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Image extraction from PDF pages."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ImageExtractor:
    """Extract images from PDF pages into an assets directory."""

    def __init__(self, assets_dir: str):
        # Target directory for extracted image files.
        self.assets_dir = assets_dir
        # Running count across all pages; used for sequential file naming.
        self._image_count = 0

    def extract_images(self, page, page_number: int) -> list[tuple[str, tuple[float, float, float, float]]]:
        """Extract images from a pdfplumber page.

        Args:
            page: pdfplumber page object (must expose ``.images``).
            page_number: 1-based page number, used for file naming/logging.

        Returns:
            List of (markdown_path, bounding_box) tuples; bounding boxes are
            ``(x0, top, x1, bottom)`` as reported by pdfplumber.
        """
        results: list[tuple[str, tuple[float, float, float, float]]] = []

        try:
            images = page.images
        except Exception as exc:
            logger.warning("Failed to get images from page %d: %s", page_number, exc)
            return results

        for img_info in images:
            self._image_count += 1
            bbox = (img_info["x0"], img_info["top"], img_info["x1"], img_info["bottom"])

            # Try to extract the actual image data; skip entries we can't save.
            image_path = self._save_image_from_page(page, page_number, self._image_count, img_info)
            if image_path:
                results.append((image_path, bbox))

        return results

    def _save_image_from_page(
        self, page, page_number: int, image_index: int, img_info: dict
    ) -> str:
        """Try to extract and save an image, return relative markdown path or empty string."""
        try:
            # Access underlying pdfminer page for image XObjects
            pdfminer_page = page.page
            resources = pdfminer_page.resources

            if resources is None:
                return ""

            xobjects = resources.get("XObject", {})
            if not xobjects:
                return ""

            # NOTE(review): this loop saves the first decodable image XObject
            # regardless of which img_info is being processed, so pages with
            # several distinct images may write duplicate data — verify.
            for obj_name in xobjects:
                try:
                    xobj = xobjects[obj_name].resolve()
                    if xobj.get("Subtype") != "Image":
                        continue

                    width = int(xobj.get("Width", 0))
                    height = int(xobj.get("Height", 0))
                    if width <= 0 or height <= 0:
                        continue

                    # DCT-encoded streams are JPEG data that can be written
                    # out verbatim; everything else defaults to .png.
                    filters = xobj.get("Filter", "")
                    if isinstance(filters, list):
                        ext = "jpg" if any("DCT" in str(f) for f in filters) else "png"
                    elif "DCT" in str(filters):
                        ext = "jpg"
                    else:
                        ext = "png"

                    filename = f"page{page_number:03d}_img{image_index:02d}.{ext}"
                    os.makedirs(self.assets_dir, exist_ok=True)
                    output_path = Path(self.assets_dir) / filename

                    # Extract raw stream data
                    stream = xobj.get_data()
                    output_path.write_bytes(stream)

                    # BUGFIX: previously returned the literal string
                    # "assets/(unknown)" instead of the saved file's relative
                    # path, breaking every generated Markdown image link.
                    return f"assets/{filename}"
                except Exception:
                    continue

            # Fallback: just record image position without data
            logger.debug(
                "Could not extract image data for image %d on page %d",
                image_index,
                page_number,
            )
            return ""

        except Exception as exc:
            logger.warning(
                "Failed to extract image %d on page %d: %s",
                image_index,
                page_number,
                exc,
            )
            return ""
|
|
@@ -0,0 +1,478 @@
|
|
|
1
|
+
"""Layout analysis for reconstructing reading order from PDF spatial data."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import re
|
|
7
|
+
from collections import Counter
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
from .text_block import TextBlock
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
# Minimum gap (as fraction of page width) to detect a column boundary
|
|
15
|
+
COLUMN_GAP_MIN_FRACTION = 0.15
|
|
16
|
+
|
|
17
|
+
# Y-tolerance for grouping characters into lines (points)
|
|
18
|
+
DEFAULT_Y_TOLERANCE = 3.0
|
|
19
|
+
|
|
20
|
+
# Maximum vertical gap (as multiple of line height) to group lines into a block
|
|
21
|
+
BLOCK_MAX_GAP_LINES = 1.5
|
|
22
|
+
|
|
23
|
+
# Minimum horizontal overlap ratio for lines to be in the same block
|
|
24
|
+
BLOCK_MIN_OVERLAP = 0.5
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class _TextLine:
    """Internal intermediate: a single line of characters on a page."""

    __slots__ = ("chars", "text", "x0", "y0", "x1", "y1", "font_name", "font_size")

    def __init__(self):
        self.chars: list[dict] = []
        self.text: str = ""
        # Bounding box starts inverted so the first add_char() sets it.
        self.x0: float = float("inf")
        self.y0: float = float("inf")
        self.x1: float = 0.0
        self.y1: float = 0.0
        self.font_name: str = ""
        self.font_size: float = 0.0

    def add_char(self, char: dict):
        """Append a character dict and grow the line's bounding box."""
        self.chars.append(char)
        if char["x0"] < self.x0:
            self.x0 = char["x0"]
        if char["y0"] < self.y0:
            self.y0 = char["y0"]
        if char["x1"] > self.x1:
            self.x1 = char["x1"]
        if char["y1"] > self.y1:
            self.y1 = char["y1"]

    def finalize(self):
        """Order chars left-to-right, build the text, pick the dominant font."""
        if not self.chars:
            return
        self.chars.sort(key=lambda c: c["x0"])
        self.text = "".join(c["text"] for c in self.chars)
        # Most frequent font name / size across the line wins.
        self.font_name = Counter(c.get("fontname", "") for c in self.chars).most_common(1)[0][0]
        self.font_size = Counter(c.get("size", 0) for c in self.chars).most_common(1)[0][0]

    @property
    def center_y(self) -> float:
        """Vertical center of the line's bounding box."""
        return (self.y0 + self.y1) / 2

    @property
    def width(self) -> float:
        """Width of the line's bounding box."""
        return self.x1 - self.x0

    @property
    def height(self) -> float:
        """Height of the line's bounding box."""
        return self.y1 - self.y0
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class LayoutAnalyzer:
|
|
74
|
+
"""Analyze PDF page layout to reconstruct reading order."""
|
|
75
|
+
|
|
76
|
+
    def __init__(self):
        # Accumulated header/footer strings. NOTE(review): not referenced by
        # any method visible in this file — possibly reserved for future use
        # or leftover state; confirm before removing.
        self._header_texts: list[str] = []
        self._footer_texts: list[str] = []
|
|
79
|
+
|
|
80
|
+
def analyze(self, page, page_number: int) -> list[TextBlock]:
|
|
81
|
+
"""Analyze a pdfplumber page and return TextBlocks in reading order."""
|
|
82
|
+
chars = page.chars
|
|
83
|
+
if not chars:
|
|
84
|
+
logger.debug("Page %d has no text characters", page_number)
|
|
85
|
+
return []
|
|
86
|
+
|
|
87
|
+
page_width = page.width
|
|
88
|
+
page_height = page.height
|
|
89
|
+
|
|
90
|
+
# Step 1: Group characters into text lines
|
|
91
|
+
lines = self._group_chars_to_lines(chars)
|
|
92
|
+
|
|
93
|
+
# Step 2: Group lines into text blocks (paragraphs)
|
|
94
|
+
blocks = self._group_lines_to_blocks(lines, page_number)
|
|
95
|
+
|
|
96
|
+
# Step 3: Detect columns
|
|
97
|
+
columns = self._detect_columns(blocks, page_width)
|
|
98
|
+
|
|
99
|
+
# Step 4: Sort blocks into reading order
|
|
100
|
+
ordered = self._apply_reading_order(blocks, columns, page_width)
|
|
101
|
+
|
|
102
|
+
# Step 5: Detect headers/footers
|
|
103
|
+
self._detect_headers_footers(ordered, page_height)
|
|
104
|
+
|
|
105
|
+
# Step 6: Merge hyphenated line breaks
|
|
106
|
+
self._merge_hyphenation(ordered)
|
|
107
|
+
|
|
108
|
+
return ordered
|
|
109
|
+
|
|
110
|
+
def analyze_multi_page(self, pages_with_numbers: list[tuple]) -> list[TextBlock]:
|
|
111
|
+
"""Analyze multiple pages and detect repeated headers/footers.
|
|
112
|
+
|
|
113
|
+
pages_with_numbers: list of (page, page_number) tuples
|
|
114
|
+
"""
|
|
115
|
+
all_blocks: list[TextBlock] = []
|
|
116
|
+
per_page_top: list[tuple[str, int]] = []
|
|
117
|
+
per_page_bottom: list[tuple[str, int]] = []
|
|
118
|
+
|
|
119
|
+
for page, page_number in pages_with_numbers:
|
|
120
|
+
blocks = self.analyze(page, page_number)
|
|
121
|
+
if blocks:
|
|
122
|
+
# topmost block (highest y1 in PDF coords = visually top)
|
|
123
|
+
top_block = max(blocks, key=lambda b: b.y1)
|
|
124
|
+
per_page_top.append((top_block.text.strip(), page_number))
|
|
125
|
+
# bottommost block
|
|
126
|
+
bottom_block = min(blocks, key=lambda b: b.y0)
|
|
127
|
+
per_page_bottom.append((bottom_block.text.strip(), page_number))
|
|
128
|
+
all_blocks.extend(blocks)
|
|
129
|
+
|
|
130
|
+
# Mark repeated headers/footers across pages
|
|
131
|
+
self._mark_repeated_elements(all_blocks, per_page_top, per_page_bottom)
|
|
132
|
+
|
|
133
|
+
return all_blocks
|
|
134
|
+
|
|
135
|
+
    def _group_chars_to_lines(self, chars: list[dict]) -> list[_TextLine]:
        """Group characters into text lines by y-coordinate clustering.

        Characters are processed in sorted order and appended to the current
        line while their vertical center stays within a font-size-based
        tolerance of the line's center; otherwise a new line is started.
        """
        if not chars:
            return []

        # Sort by y center descending (top of page first), then x.
        # NOTE(review): "top"/"bottom" are pdfplumber's top-down coordinates,
        # so sorting by the NEGATED top-down center actually visits the page
        # bottom-first; and char_cy (top-down) below is compared against
        # _TextLine.center_y, which is built from y0/y1 (bottom-up PDF
        # coordinates). The two centers live in different coordinate systems
        # — confirm against real pdfplumber output before relying on this.
        sorted_chars = sorted(chars, key=lambda c: (-(c["top"] + c["bottom"]) / 2, c["x0"]))

        lines: list[_TextLine] = []
        current_line = _TextLine()

        for char in sorted_chars:
            char_cy = (char["top"] + char["bottom"]) / 2
            # Use adaptive tolerance based on font size (min DEFAULT_Y_TOLERANCE pts).
            tolerance = max(DEFAULT_Y_TOLERANCE, char.get("size", 12) * 0.4)

            if current_line.chars:
                line_cy = current_line.center_y
                if abs(char_cy - line_cy) <= tolerance:
                    current_line.add_char(char)
                else:
                    # Vertical jump: close the current line (dropping
                    # whitespace-only lines) and start a new one.
                    current_line.finalize()
                    if current_line.text.strip():
                        lines.append(current_line)
                    current_line = _TextLine()
                    current_line.add_char(char)
            else:
                current_line.add_char(char)

        # Flush the trailing line.
        if current_line.chars:
            current_line.finalize()
            if current_line.text.strip():
                lines.append(current_line)

        # Sort lines top-to-bottom (center_y is bottom-up, hence the negation).
        lines.sort(key=lambda l: -l.center_y)
        return lines
|
|
172
|
+
|
|
173
|
+
def _group_lines_to_blocks(self, lines: list[_TextLine], page_number: int) -> list[TextBlock]:
|
|
174
|
+
"""Group adjacent text lines into TextBlock paragraphs."""
|
|
175
|
+
if not lines:
|
|
176
|
+
return []
|
|
177
|
+
|
|
178
|
+
blocks: list[TextBlock] = []
|
|
179
|
+
current_lines: list[_TextLine] = [lines[0]]
|
|
180
|
+
|
|
181
|
+
for i in range(1, len(lines)):
|
|
182
|
+
prev = current_lines[-1]
|
|
183
|
+
curr = lines[i]
|
|
184
|
+
|
|
185
|
+
# Check horizontal overlap
|
|
186
|
+
overlap = self._horizontal_overlap(prev, curr)
|
|
187
|
+
|
|
188
|
+
# Check vertical gap
|
|
189
|
+
avg_height = (prev.height + curr.height) / 2
|
|
190
|
+
if avg_height <= 0:
|
|
191
|
+
avg_height = 12
|
|
192
|
+
gap = prev.y0 - curr.y1 # positive means curr is below prev
|
|
193
|
+
if gap < 0:
|
|
194
|
+
gap = 0
|
|
195
|
+
|
|
196
|
+
if overlap >= BLOCK_MIN_OVERLAP and gap < BLOCK_MAX_GAP_LINES * avg_height:
|
|
197
|
+
current_lines.append(curr)
|
|
198
|
+
else:
|
|
199
|
+
blocks.append(self._lines_to_block(current_lines, page_number))
|
|
200
|
+
current_lines = [curr]
|
|
201
|
+
|
|
202
|
+
if current_lines:
|
|
203
|
+
blocks.append(self._lines_to_block(current_lines, page_number))
|
|
204
|
+
|
|
205
|
+
return blocks
|
|
206
|
+
|
|
207
|
+
def _horizontal_overlap(self, a: _TextLine, b: _TextLine) -> float:
|
|
208
|
+
"""Calculate horizontal overlap ratio between two lines."""
|
|
209
|
+
overlap = max(0.0, min(a.x1, b.x1) - max(a.x0, b.x0))
|
|
210
|
+
min_width = min(a.width, b.width)
|
|
211
|
+
if min_width <= 0:
|
|
212
|
+
return 0.0
|
|
213
|
+
return overlap / min_width
|
|
214
|
+
|
|
215
|
+
def _lines_to_block(self, lines: list[_TextLine], page_number: int) -> TextBlock:
|
|
216
|
+
"""Merge multiple lines into a single TextBlock."""
|
|
217
|
+
if not lines:
|
|
218
|
+
return TextBlock(page_number=page_number)
|
|
219
|
+
|
|
220
|
+
text_parts: list[str] = []
|
|
221
|
+
for i, line in enumerate(lines):
|
|
222
|
+
stripped = line.text.strip()
|
|
223
|
+
if i > 0 and stripped:
|
|
224
|
+
text_parts.append(" ")
|
|
225
|
+
text_parts.append(stripped)
|
|
226
|
+
|
|
227
|
+
text = "".join(text_parts).strip()
|
|
228
|
+
|
|
229
|
+
# Dominant font info
|
|
230
|
+
font_counter = Counter(l.font_name for l in lines)
|
|
231
|
+
font_name = font_counter.most_common(1)[0][0]
|
|
232
|
+
size_counter = Counter(l.font_size for l in lines)
|
|
233
|
+
font_size = size_counter.most_common(1)[0][0]
|
|
234
|
+
|
|
235
|
+
bold = "bold" in font_name.lower()
|
|
236
|
+
|
|
237
|
+
return TextBlock(
|
|
238
|
+
text=text,
|
|
239
|
+
x0=min(l.x0 for l in lines),
|
|
240
|
+
y0=min(l.y0 for l in lines),
|
|
241
|
+
x1=max(l.x1 for l in lines),
|
|
242
|
+
y1=max(l.y1 for l in lines),
|
|
243
|
+
font_name=font_name,
|
|
244
|
+
font_size=font_size,
|
|
245
|
+
bold=bold,
|
|
246
|
+
page_number=page_number,
|
|
247
|
+
)
|
|
248
|
+
|
|
249
|
+
    def _detect_columns(self, blocks: list[TextBlock], page_width: float) -> list[tuple[float, float]]:
        """Detect column boundaries. Returns list of (x0, x1) column ranges.

        The page is divided into 40 vertical strips; strips receiving no block
        text form "gaps", and any gap at least COLUMN_GAP_MIN_FRACTION of the
        page width is treated as a column separator. Falls back to a single
        full-width column whenever no convincing separator is found.
        """
        if not blocks or page_width <= 0:
            return [(0, page_width)]

        # Collect block left edges; a single shared left edge means one column.
        left_edges = sorted(set(round(b.x0) for b in blocks))

        if len(left_edges) <= 1:
            return [(0, page_width)]

        # Find the largest horizontal gap between block groups.
        # Divide page into vertical strips and count chars density: each
        # block's entire text length is credited to the strip containing
        # its horizontal center.
        strip_width = page_width / 40
        strip_counts: dict[int, int] = {}
        for b in blocks:
            center_strip = int(b.center_x / strip_width)
            strip_counts[center_strip] = strip_counts.get(center_strip, 0) + len(b.text)

        # Find maximal runs of strips with no text (inclusive index ranges).
        all_strips = range(40)
        gap_ranges: list[tuple[int, int]] = []
        gap_start = None

        for s in all_strips:
            if strip_counts.get(s, 0) == 0:
                if gap_start is None:
                    gap_start = s
            else:
                if gap_start is not None:
                    gap_ranges.append((gap_start, s - 1))
                    gap_start = None
        if gap_start is not None:
            # Trailing gap runs to the last strip (right page margin).
            gap_ranges.append((gap_start, 39))

        # Filter gaps that are wide enough to be column separators,
        # converting strip indices back to page coordinates.
        min_gap_strips = int(COLUMN_GAP_MIN_FRACTION * 40)
        significant_gaps = [
            (s * strip_width, (e + 1) * strip_width)
            for s, e in gap_ranges
            if (e - s + 1) >= min_gap_strips
        ]

        if not significant_gaps:
            return [(0, page_width)]

        # Build column ranges from the regions between consecutive gaps.
        # Margins (gaps touching x=0 or x=page_width) naturally produce no
        # column on their outer side.
        columns: list[tuple[float, float]] = []
        prev_end = 0.0
        for gap_start, gap_end in significant_gaps:
            if gap_start > prev_end:
                columns.append((prev_end, gap_start))
            prev_end = gap_end
        if prev_end < page_width:
            columns.append((prev_end, page_width))

        if len(columns) <= 1:
            return [(0, page_width)]

        return columns
|
|
309
|
+
|
|
310
|
+
def _apply_reading_order(
|
|
311
|
+
self,
|
|
312
|
+
blocks: list[TextBlock],
|
|
313
|
+
columns: list[tuple[float, float]],
|
|
314
|
+
page_width: float,
|
|
315
|
+
) -> list[TextBlock]:
|
|
316
|
+
"""Sort blocks into logical reading order based on detected columns."""
|
|
317
|
+
if len(columns) <= 1:
|
|
318
|
+
# Single column: sort top-to-bottom (descending y1 in PDF coords)
|
|
319
|
+
return sorted(blocks, key=lambda b: -b.y1)
|
|
320
|
+
|
|
321
|
+
# Multi-column layout
|
|
322
|
+
column_blocks: list[list[TextBlock]] = [[] for _ in columns]
|
|
323
|
+
spanning: list[TextBlock] = []
|
|
324
|
+
|
|
325
|
+
for block in blocks:
|
|
326
|
+
if block.is_spanning(page_width):
|
|
327
|
+
spanning.append(block)
|
|
328
|
+
else:
|
|
329
|
+
# Assign to column by center_x
|
|
330
|
+
best_col = 0
|
|
331
|
+
best_dist = float("inf")
|
|
332
|
+
for i, (cx0, cx1) in enumerate(columns):
|
|
333
|
+
col_center = (cx0 + cx1) / 2
|
|
334
|
+
dist = abs(block.center_x - col_center)
|
|
335
|
+
if dist < best_dist:
|
|
336
|
+
best_dist = dist
|
|
337
|
+
best_col = i
|
|
338
|
+
column_blocks[best_col].append(block)
|
|
339
|
+
|
|
340
|
+
# Sort each column top-to-bottom
|
|
341
|
+
for col in column_blocks:
|
|
342
|
+
col.sort(key=lambda b: -b.y1)
|
|
343
|
+
|
|
344
|
+
# Sort spanning blocks top-to-bottom
|
|
345
|
+
spanning.sort(key=lambda b: -b.y1)
|
|
346
|
+
|
|
347
|
+
# Merge: interleaving spanning blocks with column content
|
|
348
|
+
result: list[TextBlock] = []
|
|
349
|
+
col_indices = [0] * len(columns)
|
|
350
|
+
span_idx = 0
|
|
351
|
+
|
|
352
|
+
for block in spanning:
|
|
353
|
+
# Flush column blocks that are above this spanning block
|
|
354
|
+
for col_i, col in enumerate(column_blocks):
|
|
355
|
+
while col_indices[col_i] < len(col) and col[col_indices[col_i]].y1 > block.y1:
|
|
356
|
+
result.append(col[col_indices[col_i]])
|
|
357
|
+
col_indices[col_i] += 1
|
|
358
|
+
result.append(block)
|
|
359
|
+
|
|
360
|
+
# Flush remaining column blocks (left to right, top to bottom within)
|
|
361
|
+
while any(col_indices[i] < len(column_blocks[i]) for i in range(len(columns))):
|
|
362
|
+
# Find the topmost remaining block across all columns
|
|
363
|
+
top_block = None
|
|
364
|
+
top_col = -1
|
|
365
|
+
for col_i, col in enumerate(column_blocks):
|
|
366
|
+
if col_indices[col_i] < len(col):
|
|
367
|
+
b = col[col_indices[col_i]]
|
|
368
|
+
if top_block is None or b.y1 > top_block.y1:
|
|
369
|
+
top_block = b
|
|
370
|
+
top_col = col_i
|
|
371
|
+
if top_block is None:
|
|
372
|
+
break
|
|
373
|
+
result.append(top_block)
|
|
374
|
+
col_indices[top_col] += 1
|
|
375
|
+
|
|
376
|
+
return result
|
|
377
|
+
|
|
378
|
+
    def _detect_headers_footers(self, blocks: list[TextBlock], page_height: float):
        """Detect header/footer blocks based on position on the page.

        Mutates ``blocks`` in place by setting ``is_header``/``is_footer``.
        Coordinates are PDF-style (y grows upward from the page bottom).
        """
        if not blocks:
            return

        for block in blocks:
            # Top 8% of page (y1 above 0.92 * height) -> potential header.
            # (The original comment said "10%", which did not match the 0.92
            # threshold actually used.)
            if block.y1 > page_height * 0.92:
                block.is_header = True
            # Bottom 8% of page -> potential footer, but only when the text
            # looks like a bare page number: "12", "- 12 -", or "3 / 10".
            if block.y0 < page_height * 0.08:
                footer_text = block.text.strip()
                if re.match(r"^[-—\s]*\d+[-—\s]*$", footer_text):
                    block.is_footer = True
                elif re.match(r"^\d+\s*/\s*\d+$", footer_text):
                    block.is_footer = True
|
|
394
|
+
|
|
395
|
+
def _mark_repeated_elements(
|
|
396
|
+
self,
|
|
397
|
+
all_blocks: list[TextBlock],
|
|
398
|
+
per_page_top: list[tuple[str, int]],
|
|
399
|
+
per_page_bottom: list[tuple[str, int]],
|
|
400
|
+
):
|
|
401
|
+
"""Mark blocks as headers/footers if they repeat across pages."""
|
|
402
|
+
if len(per_page_top) < 3:
|
|
403
|
+
return
|
|
404
|
+
|
|
405
|
+
# Check for repeated top texts
|
|
406
|
+
top_texts = [t for t, _ in per_page_top]
|
|
407
|
+
top_counter = Counter(top_texts)
|
|
408
|
+
for text, count in top_counter.items():
|
|
409
|
+
if count >= 3:
|
|
410
|
+
for block in all_blocks:
|
|
411
|
+
if block.text.strip() == text:
|
|
412
|
+
block.is_header = True
|
|
413
|
+
|
|
414
|
+
# Check for repeated bottom texts
|
|
415
|
+
bottom_texts = [t for t, _ in per_page_bottom]
|
|
416
|
+
bottom_counter = Counter(bottom_texts)
|
|
417
|
+
for text, count in bottom_counter.items():
|
|
418
|
+
if count >= 3:
|
|
419
|
+
for block in all_blocks:
|
|
420
|
+
if block.text.strip() == text:
|
|
421
|
+
block.is_footer = True
|
|
422
|
+
|
|
423
|
+
def _merge_hyphenation(self, blocks: list[TextBlock]):
|
|
424
|
+
"""Merge hyphenated words split across consecutive blocks."""
|
|
425
|
+
i = 0
|
|
426
|
+
while i < len(blocks) - 1:
|
|
427
|
+
current = blocks[i]
|
|
428
|
+
next_block = blocks[i + 1]
|
|
429
|
+
if (
|
|
430
|
+
current.text.endswith("-")
|
|
431
|
+
and next_block.text
|
|
432
|
+
and next_block.text[0].islower()
|
|
433
|
+
and not current.is_header
|
|
434
|
+
and not current.is_footer
|
|
435
|
+
and not next_block.is_header
|
|
436
|
+
and not next_block.is_footer
|
|
437
|
+
):
|
|
438
|
+
# Remove hyphen and join
|
|
439
|
+
current.text = current.text[:-1] + next_block.text
|
|
440
|
+
# Expand bounding box
|
|
441
|
+
current.x1 = max(current.x1, next_block.x1)
|
|
442
|
+
current.y0 = min(current.y0, next_block.y0)
|
|
443
|
+
blocks.pop(i + 1)
|
|
444
|
+
# Don't increment i - check again in case of multi-line hyphenation
|
|
445
|
+
else:
|
|
446
|
+
i += 1
|
|
447
|
+
|
|
448
|
+
@staticmethod
|
|
449
|
+
def infer_heading_levels(blocks: list[TextBlock]) -> dict[int, int]:
|
|
450
|
+
"""Infer heading levels (1-6) from font sizes. Returns {block_index: level}."""
|
|
451
|
+
if not blocks:
|
|
452
|
+
return {}
|
|
453
|
+
|
|
454
|
+
# Collect font sizes from non-header/footer blocks
|
|
455
|
+
sizes = [b.font_size for b in blocks if not b.is_header and not b.is_footer]
|
|
456
|
+
if not sizes:
|
|
457
|
+
return {}
|
|
458
|
+
|
|
459
|
+
# Body text = most common size
|
|
460
|
+
size_counter = Counter(sizes)
|
|
461
|
+
body_size = size_counter.most_common(1)[0][0]
|
|
462
|
+
|
|
463
|
+
# Find unique sizes larger than body
|
|
464
|
+
larger_sizes = sorted(set(s for s in sizes if s > body_size * 1.1), reverse=True)
|
|
465
|
+
|
|
466
|
+
heading_map: dict[int, int] = {}
|
|
467
|
+
for idx, block in enumerate(blocks):
|
|
468
|
+
if block.is_header or block.is_footer:
|
|
469
|
+
continue
|
|
470
|
+
if block.font_size > body_size * 1.1:
|
|
471
|
+
try:
|
|
472
|
+
level = larger_sizes.index(block.font_size) + 1
|
|
473
|
+
except ValueError:
|
|
474
|
+
level = 6
|
|
475
|
+
level = min(level, 6)
|
|
476
|
+
heading_map[idx] = level
|
|
477
|
+
|
|
478
|
+
return heading_map
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Page-level processing: convert analyzed PDF page layout to Markdown lines."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from .image_extractor import ImageExtractor
|
|
9
|
+
from .layout_analyzer import LayoutAnalyzer
|
|
10
|
+
from .table_processor import TableProcessor
|
|
11
|
+
from .text_block import TextBlock
|
|
12
|
+
from .utils import escape_markdown
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PageProcessor:
    """Convert one PDF page into Markdown lines.

    Orchestrates the per-page pipeline: layout analysis for text blocks,
    table extraction, optional image extraction, and interleaving of all
    three element kinds in top-to-bottom reading order.
    """

    def __init__(
        self,
        layout_analyzer: LayoutAnalyzer,
        table_processor: TableProcessor,
        image_extractor: Optional[ImageExtractor],
        ignore_images: bool = False,
    ):
        self.layout_analyzer = layout_analyzer
        self.table_processor = table_processor
        # May be None when the caller skips image handling entirely.
        self.image_extractor = image_extractor
        self.ignore_images = ignore_images

    def process_page(self, page, page_number: int) -> list[str]:
        """Process a pdfplumber page and return Markdown lines.

        Args:
            page: a pdfplumber page object.
            page_number: page number, forwarded to the analyzers/extractors.

        Returns:
            Markdown lines for the page; a placeholder comment when the
            page yields no content.
        """
        # Analyze layout to get ordered text blocks
        blocks = self.layout_analyzer.analyze(page, page_number)

        # Extract tables and their bounding boxes.  Keep (y_center, lines)
        # pairs in a list: keying a dict by y_center (as before) silently
        # dropped a table whenever two tables shared the same vertical
        # center on one page.
        tables_with_bbox = self.table_processor.extract_tables(page)
        table_bboxes = [bbox for _, bbox in tables_with_bbox]
        table_entries: list[tuple[float, list[str]]] = []
        for table_data, bbox in tables_with_bbox:
            formatted = self.table_processor.format_table(table_data)
            if formatted:
                # Key by y-center of table for interleaving with text
                y_center = (bbox[1] + bbox[3]) / 2
                table_entries.append((y_center, formatted))

        # Extract images (if not ignored)
        images_with_bbox: list[tuple[str, tuple[float, float, float, float]]] = []
        if not self.ignore_images and self.image_extractor:
            images_with_bbox = self.image_extractor.extract_images(page, page_number)

        if not blocks and not tables_with_bbox and not images_with_bbox:
            return ["<!-- empty page -->"]

        # Infer heading levels
        heading_levels = LayoutAnalyzer.infer_heading_levels(blocks)

        # Collect all elements with their y-positions for proper
        # interleaving.  Each entry is (y, type, content_lines).
        elements: list[tuple[float, str, list[str]]] = []

        for idx, block in enumerate(blocks):
            # Skip page furniture and anything already rendered as a table.
            if block.is_header or block.is_footer:
                continue
            if any(block.overlaps_bbox(tb, threshold=0.6) for tb in table_bboxes):
                continue
            block_lines = self._block_to_markdown(block, idx, heading_levels)
            if block_lines:
                elements.append((block.y1, "text", block_lines))

        for y_center, tbl_lines in table_entries:
            elements.append((y_center, "table", tbl_lines))

        for img_path, bbox in images_with_bbox:
            # Reference the extracted image file; previously img_path was
            # ignored and an empty placeholder string was emitted.
            elements.append((bbox[3], "image", [f"![]({img_path})", ""]))

        # Sort by y-position (top to bottom = descending y in PDF coords)
        elements.sort(key=lambda e: -e[0])

        lines: list[str] = []
        for _, _, content_lines in elements:
            lines.extend(content_lines)

        return lines if lines else ["<!-- empty page -->"]

    def _block_to_markdown(
        self,
        block: TextBlock,
        block_idx: int,
        heading_levels: dict[int, int],
    ) -> list[str]:
        """Convert a single TextBlock to Markdown lines.

        Renders a heading when ``block_idx`` appears in ``heading_levels``,
        bold text when the block's font is bold, otherwise a plain
        paragraph.  Each result is followed by a blank separator line.
        """
        text = block.text.strip()
        if not text:
            return []

        # Check if this is a heading
        level = heading_levels.get(block_idx, 0)
        if level > 0:
            prefix = "#" * level
            return [f"{prefix} {escape_markdown(text)}", ""]

        # Bold text
        if block.bold:
            return [f"**{escape_markdown(text)}**", ""]

        # Regular paragraph
        return [escape_markdown(text), ""]
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Table extraction from PDF pages to Markdown format."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from .utils import escape_markdown
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
# Settings passed to pdfplumber's page.find_tables (see extract_tables).
# NOTE(review): the comments below reflect the apparent intent of each
# key — confirm exact semantics against pdfplumber's table-extraction
# documentation.
TABLE_SETTINGS = {
    "vertical_strategy": "lines",    # column edges from drawn vertical lines
    "horizontal_strategy": "lines",  # row edges from drawn horizontal lines
    "snap_tolerance": 5,             # snap nearly-aligned edges together
    "join_tolerance": 5,             # join nearly-touching segments
    "edge_min_length": 10,           # ignore very short edge fragments
    "min_words_vertical": 2,         # min words supporting a vertical edge
    "min_words_horizontal": 2,       # min words supporting a horizontal edge
}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class TableProcessor:
    """Extract tables from PDF pages and convert them to Markdown."""

    def extract_tables(self, page) -> list[tuple[list[list[str]], tuple[float, float, float, float]]]:
        """Extract tables from a pdfplumber page.

        Returns:
            A list of ``(table_data, bounding_box)`` pairs, where
            ``table_data`` is a list of rows (each a list of cell strings)
            and ``bounding_box`` is ``(x0, y0, x1, y1)``.
        """
        try:
            found = page.find_tables(table_settings=TABLE_SETTINGS)
        except Exception as exc:
            logger.warning("Failed to find tables on page: %s", exc)
            return []

        extracted: list[tuple[list[list[str]], tuple[float, float, float, float]]] = []
        for candidate in found:
            try:
                raw = candidate.extract()
                # Skip tables that are missing or consist solely of empty cells.
                if not raw or not any(any(row) for row in raw):
                    continue
                box = (candidate.bbox[0], candidate.bbox[1], candidate.bbox[2], candidate.bbox[3])
                extracted.append((self._normalize_table(raw), box))
            except Exception as exc:
                logger.warning("Failed to extract table: %s", exc)
        return extracted

    def format_table(self, table_data: list[list[str]]) -> list[str]:
        """Render table rows as Markdown pipe-table lines.

        The first row becomes the header; remaining rows are padded or
        truncated to the header's column count.
        """
        if not table_data:
            return []

        header, *body = table_data
        width = len(header)
        if width == 0:
            return []

        def render(cells: list[str]) -> str:
            return "| " + " | ".join(cells) + " |"

        out = [render(header), render(["---"] * width)]
        for row in body:
            # Pad short rows / truncate long rows to the header width.
            out.append(render((row + [""] * width)[:width]))
        return out

    def _normalize_table(self, table_data: list[list[Optional[str]]]) -> list[list[str]]:
        """Normalize cells: None -> "", collapse whitespace, escape Markdown."""
        return [
            [
                "" if cell is None else escape_markdown(" ".join(str(cell).split()))
                for cell in row
            ]
            for row in table_data
        ]
|
pdf2dotmd/text_block.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""TextBlock data structure for representing spatial text units on a PDF page."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class TextBlock:
    """A coherent unit of text from a PDF page, with spatial metadata.

    Coordinates are stored as a bounding box ``(x0, y0, x1, y1)``; the
    surrounding layout code treats larger ``y`` as higher on the page
    (PDF coordinates), so ``y1`` is the block's top edge.
    """

    text: str = ""
    x0: float = 0.0
    y0: float = 0.0
    x1: float = 0.0
    y1: float = 0.0
    font_name: str = ""
    font_size: float = 0.0
    bold: bool = False
    page_number: int = 0
    is_header: bool = False  # flagged by layout analysis as a running header
    is_footer: bool = False  # flagged by layout analysis as a running footer

    @property
    def width(self) -> float:
        """Horizontal extent of the bounding box."""
        return self.x1 - self.x0

    @property
    def height(self) -> float:
        """Vertical extent of the bounding box."""
        return self.y1 - self.y0

    @property
    def center_x(self) -> float:
        """X coordinate of the box center."""
        return (self.x0 + self.x1) / 2

    @property
    def center_y(self) -> float:
        """Y coordinate of the box center."""
        return (self.y0 + self.y1) / 2

    def horizontal_overlap_ratio(self, other: TextBlock) -> float:
        """Return the horizontal overlap as a fraction of the narrower block."""
        narrower = min(self.width, other.width)
        if narrower <= 0:
            return 0.0
        shared = min(self.x1, other.x1) - max(self.x0, other.x0)
        return max(0.0, shared) / narrower

    def overlaps_bbox(self, bbox: tuple[float, float, float, float], threshold: float = 0.5) -> bool:
        """Return True when more than ``threshold`` of this block's area lies in ``bbox``."""
        bx0, by0, bx1, by1 = bbox
        if self.width <= 0 or self.height <= 0:
            # A degenerate box has no area to compare against.
            return False
        dx = max(0.0, min(self.x1, bx1) - max(self.x0, bx0))
        dy = max(0.0, min(self.y1, by1) - max(self.y0, by0))
        return (dx * dy) / (self.width * self.height) > threshold

    def is_spanning(self, page_width: float, ratio: float = 0.8) -> bool:
        """Return True when the block covers more than ``ratio`` of the page width."""
        return page_width > 0 and self.width > page_width * ratio
|
pdf2dotmd/utils.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Utility helpers for markdown generation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Iterable
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Single translation table covering the backslash and every Markdown
# control character this module escapes, so escaping runs in one
# C-level pass instead of eleven chained str.replace calls.
_MD_ESCAPE_TABLE = str.maketrans({ch: f"\\{ch}" for ch in "\\`*_{}[]<>|"})


def escape_markdown(text: str) -> str:
    """Escape markdown control characters in plain text content.

    The backslash and each of ``` ` * _ { } [ ] < > | ``` is prefixed
    with a backslash.  Empty or falsy input yields "".
    """
    if not text:
        return ""
    return text.translate(_MD_ESCAPE_TABLE)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def clean_markdown_content(lines: Iterable[str]) -> str:
    """Join Markdown lines into one document, collapsing extra blank lines.

    Trailing whitespace is stripped, a single final newline is appended,
    and any run of three or more consecutive newlines is reduced to two
    (at most one blank line between paragraphs).
    """
    joined = "\n".join(lines).rstrip()
    return re.sub(r"\n{3,}", "\n\n", joined + "\n")
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdf2dotmd
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A Python tool for converting PDF files to Markdown
|
|
5
|
+
Author: hnrobert
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/HNRobert/pdf2dotmd
|
|
8
|
+
Project-URL: Repository, https://github.com/HNRobert/pdf2dotmd
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
19
|
+
Classifier: Topic :: Utilities
|
|
20
|
+
Requires-Python: >=3.8
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: pdfplumber>=0.11.0
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
26
|
+
# pdf2dotmd
|
|
27
|
+
|
|
28
|
+
A Python CLI tool that converts PDF files to Markdown format with intelligent layout analysis.
|
|
29
|
+
|
|
30
|
+
## Features
|
|
31
|
+
|
|
32
|
+
- **Layout-aware text extraction** — reconstructs logical reading order from PDF spatial data
|
|
33
|
+
- **Multi-column detection** — handles two-column and multi-column layouts
|
|
34
|
+
- **Table extraction** — converts PDF tables to Markdown pipe tables
|
|
35
|
+
- **Heading inference** — detects headings from font size hierarchy
|
|
36
|
+
- **Header/footer filtering** — automatically removes repeated page headers and footers
|
|
37
|
+
- **Image extraction** — extracts embedded images to an `assets/` directory
|
|
38
|
+
- **Ignore images mode** — `--ignore-images` flag for text-only output
|
|
39
|
+
- **Page range selection** — convert specific pages only
|
|
40
|
+
- **Batch conversion** — process multiple PDF files with wildcards
|
|
41
|
+
|
|
42
|
+
## Installation
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install pdf2dotmd
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Usage
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
# Output to stdout
|
|
52
|
+
pdf2dotmd input.pdf
|
|
53
|
+
|
|
54
|
+
# Output to file
|
|
55
|
+
pdf2dotmd input.pdf -o output.md
|
|
56
|
+
|
|
57
|
+
# Skip images, output single Markdown file
|
|
58
|
+
pdf2dotmd input.pdf --ignore-images
|
|
59
|
+
|
|
60
|
+
# Batch conversion
|
|
61
|
+
pdf2dotmd *.pdf -o output_dir/
|
|
62
|
+
|
|
63
|
+
# Convert only specific pages
|
|
64
|
+
pdf2dotmd input.pdf -p 1-3
|
|
65
|
+
pdf2dotmd input.pdf -p 1-5,8,10-12
|
|
66
|
+
|
|
67
|
+
# Verbose logging
|
|
68
|
+
pdf2dotmd input.pdf -v
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## How It Works
|
|
72
|
+
|
|
73
|
+
1. **Character extraction** — uses [pdfplumber](https://github.com/jsvine/pdfplumber) to extract individual characters with position data
|
|
74
|
+
2. **Line grouping** — clusters characters into text lines by y-coordinate proximity
|
|
75
|
+
3. **Block formation** — groups lines into paragraphs based on horizontal alignment and vertical spacing
|
|
76
|
+
4. **Column detection** — identifies multi-column layouts by analyzing horizontal text density gaps
|
|
77
|
+
5. **Reading order** — sorts blocks top-to-bottom, left-to-right, handling spanning titles
|
|
78
|
+
6. **Header/footer removal** — detects repeated elements across pages
|
|
79
|
+
7. **Heading inference** — maps font sizes to heading levels (H1-H6)
|
|
80
|
+
|
|
81
|
+
## Limitations
|
|
82
|
+
|
|
83
|
+
- **Scanned PDFs** — OCR is not supported; scanned/image-only PDFs will produce empty output
|
|
84
|
+
- **Encrypted PDFs** — password-protected PDFs are not supported
|
|
85
|
+
- **Complex layouts** — highly irregular layouts may not parse perfectly
|
|
86
|
+
|
|
87
|
+
## License
|
|
88
|
+
|
|
89
|
+
MIT
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
pdf2dotmd/__init__.py,sha256=50eR1xS6VIHHNF2ZUw2yjrBqWFn80nPuIRtn9bVMRAQ,45
|
|
2
|
+
pdf2dotmd/cli.py,sha256=Ni8Fy73uoak10dsXV5lC_OQ6b1TPz8Y0N41R00L6D_0,3182
|
|
3
|
+
pdf2dotmd/converter.py,sha256=a-fXjn_S5RmZJyAHTIQ8ltuwlSwxKpEDGVzQVrkCVjU,6156
|
|
4
|
+
pdf2dotmd/image_extractor.py,sha256=W1e1HySOEH69oHjTqn763AIRBLVmwjT8Z_o1afdWI6A,3726
|
|
5
|
+
pdf2dotmd/layout_analyzer.py,sha256=5k9QogF6wgMrsyx-MBZVY6RetUpO-PN8FOtT_KD_NmI,17193
|
|
6
|
+
pdf2dotmd/page_processor.py,sha256=M5qzTcSp-_khSLQaD0xuwAtSU33pWkg3SsUAuz2glpE,4266
|
|
7
|
+
pdf2dotmd/table_processor.py,sha256=9CyUp45NE9udxO_TBpWmsRK25QnlYfLzwOvc542xnq8,3038
|
|
8
|
+
pdf2dotmd/text_block.py,sha256=MiwC61MT0aY3EoYs3GFNYscl6DrOEvO6xmV9VCPvCMY,2028
|
|
9
|
+
pdf2dotmd/utils.py,sha256=w5deYI99DlsDa1CxfpcGZu2P2k7i28HzIEpB7XnvVq8,653
|
|
10
|
+
pdf2dotmd-0.0.1.dist-info/licenses/LICENSE,sha256=ZfEADodI9tn9tdS_ab8HR0oCeCMl9wTRgwPljxRTsXs,1066
|
|
11
|
+
pdf2dotmd-0.0.1.dist-info/METADATA,sha256=q8ueKEpOQpVGFXmyRVx9eXVxhXaW8TR-w8C2pn2jDj0,3130
|
|
12
|
+
pdf2dotmd-0.0.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
13
|
+
pdf2dotmd-0.0.1.dist-info/entry_points.txt,sha256=SQVMAmDp0so-_4paUGfSK6HRGfA4VYxrEEsrEx6TXRA,49
|
|
14
|
+
pdf2dotmd-0.0.1.dist-info/top_level.txt,sha256=fhJsRXlgjF6tT5Wiy50rI5PI66E5uHj_AJ8dR7K-3CI,10
|
|
15
|
+
pdf2dotmd-0.0.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Robert He
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pdf2dotmd
|