pdf2dotmd 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Robert He
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,89 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdf2dotmd
3
+ Version: 0.0.1
4
+ Summary: A Python tool for converting PDF files to Markdown
5
+ Author: hnrobert
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/HNRobert/pdf2dotmd
8
+ Project-URL: Repository, https://github.com/HNRobert/pdf2dotmd
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Text Processing :: Markup
19
+ Classifier: Topic :: Utilities
20
+ Requires-Python: >=3.8
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: pdfplumber>=0.11.0
24
+ Dynamic: license-file
25
+
26
+ # pdf2dotmd
27
+
28
+ A Python CLI tool that converts PDF files to Markdown format with intelligent layout analysis.
29
+
30
+ ## Features
31
+
32
+ - **Layout-aware text extraction** — reconstructs logical reading order from PDF spatial data
33
+ - **Multi-column detection** — handles two-column and multi-column layouts
34
+ - **Table extraction** — converts PDF tables to Markdown pipe tables
35
+ - **Heading inference** — detects headings from font size hierarchy
36
+ - **Header/footer filtering** — automatically removes repeated page headers and footers
37
+ - **Image extraction** — extracts embedded images to an `assets/` directory
38
+ - **Ignore images mode** — `--ignore-images` flag for text-only output
39
+ - **Page range selection** — convert specific pages only
40
+ - **Batch conversion** — process multiple PDF files with wildcards
41
+
42
+ ## Installation
43
+
44
+ ```bash
45
+ pip install pdf2dotmd
46
+ ```
47
+
48
+ ## Usage
49
+
50
+ ```bash
51
+ # Print to stdout (a copy is also written to input/input.md)
52
+ pdf2dotmd input.pdf
53
+
54
+ # Output to file
55
+ pdf2dotmd input.pdf -o output.md
56
+
57
+ # Skip images, output single Markdown file
58
+ pdf2dotmd input.pdf --ignore-images
59
+
60
+ # Batch conversion
61
+ pdf2dotmd *.pdf -o output_dir/
62
+
63
+ # Convert only specific pages
64
+ pdf2dotmd input.pdf -p 1-3
65
+ pdf2dotmd input.pdf -p 1-5,8,10-12
66
+
67
+ # Verbose logging
68
+ pdf2dotmd input.pdf -v
69
+ ```
70
+
71
+ ## How It Works
72
+
73
+ 1. **Character extraction** — uses [pdfplumber](https://github.com/jsvine/pdfplumber) to extract individual characters with position data
74
+ 2. **Line grouping** — clusters characters into text lines by y-coordinate proximity
75
+ 3. **Block formation** — groups lines into paragraphs based on horizontal alignment and vertical spacing
76
+ 4. **Column detection** — identifies multi-column layouts by analyzing horizontal text density gaps
77
+ 5. **Reading order** — sorts blocks top-to-bottom, left-to-right, handling spanning titles
78
+ 6. **Header/footer removal** — detects repeated elements across pages
79
+ 7. **Heading inference** — maps font sizes to heading levels (H1-H6)
80
+
81
+ ## Limitations
82
+
83
+ - **Scanned PDFs** — OCR is not supported; scanned/image-only PDFs will produce empty output
84
+ - **Encrypted PDFs** — password-protected PDFs are not supported
85
+ - **Complex layouts** — highly irregular layouts may not parse perfectly
86
+
87
+ ## License
88
+
89
+ MIT
@@ -0,0 +1,64 @@
1
+ # pdf2dotmd
2
+
3
+ A Python CLI tool that converts PDF files to Markdown format with intelligent layout analysis.
4
+
5
+ ## Features
6
+
7
+ - **Layout-aware text extraction** — reconstructs logical reading order from PDF spatial data
8
+ - **Multi-column detection** — handles two-column and multi-column layouts
9
+ - **Table extraction** — converts PDF tables to Markdown pipe tables
10
+ - **Heading inference** — detects headings from font size hierarchy
11
+ - **Header/footer filtering** — automatically removes repeated page headers and footers
12
+ - **Image extraction** — extracts embedded images to an `assets/` directory
13
+ - **Ignore images mode** — `--ignore-images` flag for text-only output
14
+ - **Page range selection** — convert specific pages only
15
+ - **Batch conversion** — process multiple PDF files with wildcards
16
+
17
+ ## Installation
18
+
19
+ ```bash
20
+ pip install pdf2dotmd
21
+ ```
22
+
23
+ ## Usage
24
+
25
+ ```bash
26
+ # Print to stdout (a copy is also written to input/input.md)
27
+ pdf2dotmd input.pdf
28
+
29
+ # Output to file
30
+ pdf2dotmd input.pdf -o output.md
31
+
32
+ # Skip images, output single Markdown file
33
+ pdf2dotmd input.pdf --ignore-images
34
+
35
+ # Batch conversion
36
+ pdf2dotmd *.pdf -o output_dir/
37
+
38
+ # Convert only specific pages
39
+ pdf2dotmd input.pdf -p 1-3
40
+ pdf2dotmd input.pdf -p 1-5,8,10-12
41
+
42
+ # Verbose logging
43
+ pdf2dotmd input.pdf -v
44
+ ```
45
+
46
+ ## How It Works
47
+
48
+ 1. **Character extraction** — uses [pdfplumber](https://github.com/jsvine/pdfplumber) to extract individual characters with position data
49
+ 2. **Line grouping** — clusters characters into text lines by y-coordinate proximity
50
+ 3. **Block formation** — groups lines into paragraphs based on horizontal alignment and vertical spacing
51
+ 4. **Column detection** — identifies multi-column layouts by analyzing horizontal text density gaps
52
+ 5. **Reading order** — sorts blocks top-to-bottom, left-to-right, handling spanning titles
53
+ 6. **Header/footer removal** — detects repeated elements across pages
54
+ 7. **Heading inference** — maps font sizes to heading levels (H1-H6)
55
+
56
+ ## Limitations
57
+
58
+ - **Scanned PDFs** — OCR is not supported; scanned/image-only PDFs will produce empty output
59
+ - **Encrypted PDFs** — password-protected PDFs are not supported
60
+ - **Complex layouts** — highly irregular layouts may not parse perfectly
61
+
62
+ ## License
63
+
64
+ MIT
@@ -0,0 +1,3 @@
1
+ """pdf2md package."""
2
+
3
+ __version__ = "0.0.1"
@@ -0,0 +1,100 @@
1
+ """Command line interface for PDF to Markdown converter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import logging
7
+ import os
8
+ import sys
9
+ from glob import glob
10
+ from pathlib import Path
11
+
12
+ from .converter import PdfToMarkdownConverter
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def main():
    """Run the PDF-to-Markdown command line interface.

    Parses the command line, expands each input argument into PDF files and
    converts them one by one with ``PdfToMarkdownConverter``. When no output
    path is given, the Markdown of each file is also printed to stdout.

    Exits with status 1 when the run is interrupted or any conversion fails.
    """
    parser = argparse.ArgumentParser(
        description="Convert PDF files to Markdown format",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  %(prog)s input.pdf                    # Output to stdout
  %(prog)s input.pdf -o output.md       # Output to file
  %(prog)s input.pdf --ignore-images    # Skip images, single file output
  %(prog)s *.pdf -o output_dir/         # Batch conversion
  %(prog)s input.pdf -p 1-3             # Convert only pages 1-3
        """,
    )

    parser.add_argument(
        "input_files",
        nargs="+",
        help="Input PDF file paths (supports wildcards)",
    )
    parser.add_argument("-o", "--output", help="Output file or directory path")
    parser.add_argument(
        "--ignore-images",
        "--no-images",
        action="store_true",
        dest="ignore_images",
        help="Ignore all images and output a single Markdown file",
    )
    parser.add_argument(
        "-p",
        "--pages",
        help="Page range to convert (e.g., '1-5,8,10-12')",
    )
    parser.add_argument("-v", "--verbose", action="store_true", help="Show verbose logs")

    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )

    converter = PdfToMarkdownConverter()

    # The output is a directory when it already is one or when it is spelled
    # with a trailing separator ('/' everywhere, plus the native separator).
    output_is_dir = bool(args.output) and (
        os.path.isdir(args.output) or args.output.endswith(("/", os.sep))
    )

    try:
        for input_pattern in args.input_files:
            # A path that exists is taken literally, so file names containing
            # glob metacharacters such as '[' or ']' are not misinterpreted.
            if os.path.exists(input_pattern):
                matching_files = [input_pattern]
            else:
                # glob() returns matches in arbitrary order; sort for
                # reproducible batch runs.
                matching_files = sorted(glob(input_pattern))

            if not matching_files:
                logger.warning("No matching files found: %s", input_pattern)
                continue

            for file_path in matching_files:
                if not file_path.lower().endswith(".pdf"):
                    logger.warning("Skipping non-PDF file: %s", file_path)
                    continue

                output_path = None
                if args.output:
                    if output_is_dir:
                        output_path = os.path.join(
                            args.output, f"{Path(file_path).stem}.md"
                        )
                    else:
                        output_path = args.output

                markdown_content = converter.convert_file(
                    file_path,
                    output_path=output_path,
                    ignore_images=args.ignore_images,
                    pages=args.pages,
                )

                if not output_path:
                    print(f"\n=== {file_path} ===\n")
                    print(markdown_content)
    except KeyboardInterrupt:
        logger.info("Conversion interrupted by user")
        sys.exit(1)
    except Exception as exc:  # pylint: disable=broad-except
        logger.error("Program execution failed: %s", exc)
        # Only visible with --verbose; keeps normal output short.
        logger.debug("Traceback for the failure above", exc_info=True)
        sys.exit(1)
98
+
99
+ if __name__ == "__main__":
100
+ main()
@@ -0,0 +1,172 @@
1
+ """Core converter module for PDF to Markdown conversion."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import os
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ try:
11
+ import pdfplumber # type: ignore[import-not-found]
12
+ except ImportError: # pragma: no cover
13
+ pdfplumber = None # type: ignore[assignment]
14
+
15
+ from .image_extractor import ImageExtractor
16
+ from .layout_analyzer import LayoutAnalyzer
17
+ from .page_processor import PageProcessor
18
+ from .table_processor import TableProcessor
19
+ from .utils import clean_markdown_content
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ def _parse_page_range(page_spec: str, total_pages: int) -> list[int]:
25
+ """Parse a page range string like '1-5,8,10-12' into 0-based page indices."""
26
+ indices: list[int] = []
27
+ for part in page_spec.split(","):
28
+ part = part.strip()
29
+ if "-" in part:
30
+ start, end = part.split("-", 1)
31
+ start = int(start.strip())
32
+ end = int(end.strip())
33
+ for p in range(start, end + 1):
34
+ if 1 <= p <= total_pages:
35
+ indices.append(p - 1)
36
+ else:
37
+ p = int(part)
38
+ if 1 <= p <= total_pages:
39
+ indices.append(p - 1)
40
+ return sorted(set(indices))
41
+
42
+
43
class PdfToMarkdownConverter:
    """PDF to Markdown converter.

    The output folder and assets directory chosen for the most recent
    ``convert_file`` call are kept as instance attributes.
    """

    def __init__(self):
        # Folder that receives the output of the current conversion.
        self.output_folder: str = ""
        # Directory for extracted images; empty string when images are ignored.
        self.assets_dir: str = ""

    def convert_file(
        self,
        input_path: str,
        output_path: Optional[str] = None,
        ignore_images: bool = False,
        pages: Optional[str] = None,
    ) -> str:
        """Convert one PDF file to Markdown, write it to disk and return it.

        Args:
            input_path: Path of the PDF file to convert.
            output_path: Target Markdown file, or a directory (existing, or
                spelled with a trailing separator). When omitted, a folder
                named after the input file's stem is used.
            ignore_images: When true, no assets directory is created and no
                image extractor is used.
            pages: Optional page selection such as ``'1-5,8,10-12'``.

        Returns:
            The generated Markdown text.

        Raises:
            FileNotFoundError: If ``input_path`` does not exist.
            ValueError: If the file is not a ``.pdf`` or ``pages`` selects no
                valid page.
            RuntimeError: If ``pdfplumber`` is not installed.
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input file does not exist: {input_path}")

        if not input_path.lower().endswith(".pdf"):
            raise ValueError(f"Only .pdf is supported: {input_path}")

        if pdfplumber is None:
            raise RuntimeError(
                "Missing required dependency 'pdfplumber'. Please run: pip install pdfplumber"
            )

        self._setup_output_structure(input_path, output_path, ignore_images)
        final_output_path = self._get_final_output_path(input_path, output_path)

        try:
            layout_analyzer = LayoutAnalyzer()
            table_processor = TableProcessor()
            image_extractor = (
                ImageExtractor(self.assets_dir) if not ignore_images and self.assets_dir else None
            )
            page_processor = PageProcessor(
                layout_analyzer=layout_analyzer,
                table_processor=table_processor,
                image_extractor=image_extractor,
                ignore_images=ignore_images,
            )

            output_lines: list[str] = []

            with pdfplumber.open(input_path) as pdf:
                total_pages = len(pdf.pages)

                if total_pages == 0:
                    logger.warning("PDF has no pages: %s", input_path)
                    markdown_content = "\n"
                    self._write_output(markdown_content, final_output_path)
                    return markdown_content

                # Determine page indices
                if pages:
                    page_indices = _parse_page_range(pages, total_pages)
                    if not page_indices:
                        raise ValueError(
                            f"No valid pages in range '{pages}' (total: {total_pages})"
                        )
                else:
                    page_indices = list(range(total_pages))

                # Warn when none of the selected pages has a text layer
                # (typical for scanned documents).
                if not any(pdf.pages[idx].chars for idx in page_indices):
                    logger.warning(
                        "PDF appears to have no text layer (possibly scanned). "
                        "OCR is not supported. Output may be empty."
                    )

                for idx in page_indices:
                    page = pdf.pages[idx]
                    page_number = idx + 1
                    logger.debug("Processing page %d/%d", page_number, total_pages)

                    page_lines = page_processor.process_page(page, page_number)
                    output_lines.extend(page_lines)

            markdown_content = clean_markdown_content(output_lines)
            self._write_output(markdown_content, final_output_path)
        finally:
            # Also runs on the zero-page and error paths so that an unused
            # assets directory is never left behind.
            self._cleanup_empty_assets_dir()

        logger.info("Conversion completed, output file: %s", final_output_path)
        return markdown_content

    @staticmethod
    def _is_directory_target(output_path: str) -> bool:
        """Return True when ``output_path`` denotes a directory.

        A path counts as a directory when it exists as one or ends with a
        path separator ('/' on every platform, plus the native separator).
        """
        return os.path.isdir(output_path) or output_path.endswith(("/", os.sep))

    def _setup_output_structure(
        self, input_path: str, output_path: Optional[str], ignore_images: bool
    ):
        """Choose and create the output folder and, unless ignored, ``assets``.

        Sets ``self.output_folder`` and ``self.assets_dir``.
        """
        input_stem = Path(input_path).stem

        if output_path:
            if self._is_directory_target(output_path):
                self.output_folder = os.path.join(output_path, input_stem)
            else:
                # Keep the assets next to the Markdown file. A bare file name
                # means the current directory; otherwise the relative
                # 'assets/...' links in the output would not resolve.
                self.output_folder = os.path.dirname(output_path) or os.curdir
        else:
            self.output_folder = input_stem

        os.makedirs(self.output_folder, exist_ok=True)

        if ignore_images:
            self.assets_dir = ""
        else:
            self.assets_dir = os.path.join(self.output_folder, "assets")
            os.makedirs(self.assets_dir, exist_ok=True)

    def _get_final_output_path(self, input_path: str, output_path: Optional[str]) -> str:
        """Return the path of the Markdown file to write.

        An explicit file path is returned unchanged; otherwise the file is
        ``<stem>.md`` inside ``self.output_folder``.
        """
        if output_path and not self._is_directory_target(output_path):
            return output_path

        return os.path.join(self.output_folder, f"{Path(input_path).stem}.md")

    def _write_output(self, content: str, output_path: str):
        """Write ``content`` to ``output_path`` as UTF-8, creating parent folders."""
        output_dir = os.path.dirname(output_path)
        if output_dir:
            # exist_ok avoids failing when the folder appears concurrently.
            os.makedirs(output_dir, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(content)

    def _cleanup_empty_assets_dir(self):
        """Remove the assets directory when it exists and is empty."""
        if self.assets_dir and os.path.exists(self.assets_dir) and not os.listdir(self.assets_dir):
            os.rmdir(self.assets_dir)
@@ -0,0 +1,110 @@
1
+ """Image extraction from PDF pages."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import os
7
+ from pathlib import Path
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class ImageExtractor:
    """Extract images from PDF pages into an assets directory."""

    def __init__(self, assets_dir: str):
        self.assets_dir = assets_dir
        # Running counter over all pages; used to number the saved files.
        self._image_count = 0

    def extract_images(self, page, page_number: int) -> list[tuple[str, tuple[float, float, float, float]]]:
        """Extract images from a pdfplumber page.

        Args:
            page: Page object exposing ``images`` (a list of dicts with
                ``x0``, ``top``, ``x1`` and ``bottom`` keys).
            page_number: 1-based page number, used in file names and logs.

        Returns:
            List of (markdown_path, bounding_box) tuples for every image that
            could be saved. The bounding box is ``(x0, top, x1, bottom)``.
        """
        results: list[tuple[str, tuple[float, float, float, float]]] = []

        try:
            images = page.images
        except Exception as exc:
            logger.warning("Failed to get images from page %d: %s", page_number, exc)
            return results

        for img_info in images:
            # Counted even when saving fails, so file numbers follow the
            # order in which images were encountered.
            self._image_count += 1
            bbox = (img_info["x0"], img_info["top"], img_info["x1"], img_info["bottom"])

            image_path = self._save_image_from_page(page, page_number, self._image_count, img_info)
            if image_path:
                results.append((image_path, bbox))

        return results

    def _save_image_from_page(
        self, page, page_number: int, image_index: int, img_info: dict
    ) -> str:
        """Try to extract and save an image, return relative markdown path or empty string.

        The stream belonging to ``img_info`` itself is saved, so several
        images on one page each get their own data.
        """
        try:
            stream = self._find_image_stream(page, img_info)
            if stream is None:
                logger.debug(
                    "Could not extract image data for image %d on page %d",
                    image_index,
                    page_number,
                )
                return ""

            ext = "jpg" if self._is_jpeg(stream) else "png"
            filename = f"page{page_number:03d}_img{image_index:02d}.{ext}"
            os.makedirs(self.assets_dir, exist_ok=True)
            output_path = Path(self.assets_dir) / filename

            # NOTE(review): for non-DCT images get_data() presumably yields
            # decoded sample data rather than an encoded PNG file - confirm
            # and consider re-encoding before relying on the '.png' output.
            output_path.write_bytes(stream.get_data())

            return f"assets/{filename}"
        except Exception as exc:
            logger.warning(
                "Failed to extract image %d on page %d: %s",
                image_index,
                page_number,
                exc,
            )
            return ""

    def _find_image_stream(self, page, img_info: dict):
        """Return the stream object holding the data of ``img_info``, or None."""
        # pdfplumber image dicts are expected to carry the image's own stream;
        # prefer it because it is unambiguous.
        stream = img_info.get("stream")
        if stream is not None and hasattr(stream, "get_data"):
            return stream

        # Otherwise look the image up by name among the page's XObjects.
        pdfminer_page = getattr(page, "page", None)
        resources = self._resolve(getattr(pdfminer_page, "resources", None))
        if not resources:
            return None

        xobjects = self._resolve(resources.get("XObject"))
        if not xobjects:
            return None

        name = img_info.get("name")
        if name not in xobjects:
            return None

        candidate = self._resolve(xobjects[name])
        if self._literal_name(candidate.get("Subtype")) != "Image":
            return None
        if int(candidate.get("Width", 0)) <= 0 or int(candidate.get("Height", 0)) <= 0:
            return None
        return candidate

    @staticmethod
    def _resolve(obj):
        """Follow an indirect reference (object with ``resolve()``), if any."""
        resolver = getattr(obj, "resolve", None)
        return resolver() if callable(resolver) else obj

    @staticmethod
    def _literal_name(value) -> str:
        """Return the plain name of a PDF name object or string."""
        # Name objects expose their text as ``.name``; plain strings do not.
        name = getattr(value, "name", value)
        if isinstance(name, bytes):
            name = name.decode("latin-1")
        return name if isinstance(name, str) else ""

    @staticmethod
    def _is_jpeg(stream) -> bool:
        """Return True when any of the stream's filters is a DCT (JPEG) filter."""
        filters = stream.get("Filter", "")
        if not isinstance(filters, list):
            filters = [filters]
        return any("DCT" in str(f) for f in filters)