pdf2dotmd 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdf2dotmd-0.0.1/LICENSE +21 -0
- pdf2dotmd-0.0.1/PKG-INFO +89 -0
- pdf2dotmd-0.0.1/README.md +64 -0
- pdf2dotmd-0.0.1/pdf2dotmd/__init__.py +3 -0
- pdf2dotmd-0.0.1/pdf2dotmd/cli.py +100 -0
- pdf2dotmd-0.0.1/pdf2dotmd/converter.py +172 -0
- pdf2dotmd-0.0.1/pdf2dotmd/image_extractor.py +110 -0
- pdf2dotmd-0.0.1/pdf2dotmd/layout_analyzer.py +478 -0
- pdf2dotmd-0.0.1/pdf2dotmd/page_processor.py +119 -0
- pdf2dotmd-0.0.1/pdf2dotmd/table_processor.py +93 -0
- pdf2dotmd-0.0.1/pdf2dotmd/text_block.py +63 -0
- pdf2dotmd-0.0.1/pdf2dotmd/utils.py +23 -0
- pdf2dotmd-0.0.1/pdf2dotmd.egg-info/PKG-INFO +89 -0
- pdf2dotmd-0.0.1/pdf2dotmd.egg-info/SOURCES.txt +18 -0
- pdf2dotmd-0.0.1/pdf2dotmd.egg-info/dependency_links.txt +1 -0
- pdf2dotmd-0.0.1/pdf2dotmd.egg-info/entry_points.txt +2 -0
- pdf2dotmd-0.0.1/pdf2dotmd.egg-info/requires.txt +1 -0
- pdf2dotmd-0.0.1/pdf2dotmd.egg-info/top_level.txt +1 -0
- pdf2dotmd-0.0.1/pyproject.toml +36 -0
- pdf2dotmd-0.0.1/setup.cfg +4 -0
pdf2dotmd-0.0.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Robert He
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
pdf2dotmd-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdf2dotmd
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A Python tool for converting PDF files to Markdown
|
|
5
|
+
Author: hnrobert
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/HNRobert/pdf2dotmd
|
|
8
|
+
Project-URL: Repository, https://github.com/HNRobert/pdf2dotmd
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
19
|
+
Classifier: Topic :: Utilities
|
|
20
|
+
Requires-Python: >=3.8
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: pdfplumber>=0.11.0
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
26
|
+
# pdf2dotmd
|
|
27
|
+
|
|
28
|
+
A Python CLI tool that converts PDF files to Markdown format with intelligent layout analysis.
|
|
29
|
+
|
|
30
|
+
## Features
|
|
31
|
+
|
|
32
|
+
- **Layout-aware text extraction** — reconstructs logical reading order from PDF spatial data
|
|
33
|
+
- **Multi-column detection** — handles two-column and multi-column layouts
|
|
34
|
+
- **Table extraction** — converts PDF tables to Markdown pipe tables
|
|
35
|
+
- **Heading inference** — detects headings from font size hierarchy
|
|
36
|
+
- **Header/footer filtering** — automatically removes repeated page headers and footers
|
|
37
|
+
- **Image extraction** — extracts embedded images to an `assets/` directory
|
|
38
|
+
- **Ignore images mode** — `--ignore-images` flag for text-only output
|
|
39
|
+
- **Page range selection** — convert specific pages only
|
|
40
|
+
- **Batch conversion** — process multiple PDF files with wildcards
|
|
41
|
+
|
|
42
|
+
## Installation
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install pdf2dotmd
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Usage
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
# Output to stdout
|
|
52
|
+
pdf2dotmd input.pdf
|
|
53
|
+
|
|
54
|
+
# Output to file
|
|
55
|
+
pdf2dotmd input.pdf -o output.md
|
|
56
|
+
|
|
57
|
+
# Skip images, output single Markdown file
|
|
58
|
+
pdf2dotmd input.pdf --ignore-images
|
|
59
|
+
|
|
60
|
+
# Batch conversion
|
|
61
|
+
pdf2dotmd *.pdf -o output_dir/
|
|
62
|
+
|
|
63
|
+
# Convert only specific pages
|
|
64
|
+
pdf2dotmd input.pdf -p 1-3
|
|
65
|
+
pdf2dotmd input.pdf -p 1-5,8,10-12
|
|
66
|
+
|
|
67
|
+
# Verbose logging
|
|
68
|
+
pdf2dotmd input.pdf -v
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## How It Works
|
|
72
|
+
|
|
73
|
+
1. **Character extraction** — uses [pdfplumber](https://github.com/jsvine/pdfplumber) to extract individual characters with position data
|
|
74
|
+
2. **Line grouping** — clusters characters into text lines by y-coordinate proximity
|
|
75
|
+
3. **Block formation** — groups lines into paragraphs based on horizontal alignment and vertical spacing
|
|
76
|
+
4. **Column detection** — identifies multi-column layouts by analyzing horizontal text density gaps
|
|
77
|
+
5. **Reading order** — sorts blocks top-to-bottom, left-to-right, handling spanning titles
|
|
78
|
+
6. **Header/footer removal** — detects repeated elements across pages
|
|
79
|
+
7. **Heading inference** — maps font sizes to heading levels (H1-H6)
|
|
80
|
+
|
|
81
|
+
## Limitations
|
|
82
|
+
|
|
83
|
+
- **Scanned PDFs** — OCR is not supported; scanned/image-only PDFs will produce empty output
|
|
84
|
+
- **Encrypted PDFs** — password-protected PDFs are not supported
|
|
85
|
+
- **Complex layouts** — highly irregular layouts may not parse perfectly
|
|
86
|
+
|
|
87
|
+
## License
|
|
88
|
+
|
|
89
|
+
MIT
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# pdf2dotmd
|
|
2
|
+
|
|
3
|
+
A Python CLI tool that converts PDF files to Markdown format with intelligent layout analysis.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Layout-aware text extraction** — reconstructs logical reading order from PDF spatial data
|
|
8
|
+
- **Multi-column detection** — handles two-column and multi-column layouts
|
|
9
|
+
- **Table extraction** — converts PDF tables to Markdown pipe tables
|
|
10
|
+
- **Heading inference** — detects headings from font size hierarchy
|
|
11
|
+
- **Header/footer filtering** — automatically removes repeated page headers and footers
|
|
12
|
+
- **Image extraction** — extracts embedded images to an `assets/` directory
|
|
13
|
+
- **Ignore images mode** — `--ignore-images` flag for text-only output
|
|
14
|
+
- **Page range selection** — convert specific pages only
|
|
15
|
+
- **Batch conversion** — process multiple PDF files with wildcards
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install pdf2dotmd
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
# Output to stdout
|
|
27
|
+
pdf2dotmd input.pdf
|
|
28
|
+
|
|
29
|
+
# Output to file
|
|
30
|
+
pdf2dotmd input.pdf -o output.md
|
|
31
|
+
|
|
32
|
+
# Skip images, output single Markdown file
|
|
33
|
+
pdf2dotmd input.pdf --ignore-images
|
|
34
|
+
|
|
35
|
+
# Batch conversion
|
|
36
|
+
pdf2dotmd *.pdf -o output_dir/
|
|
37
|
+
|
|
38
|
+
# Convert only specific pages
|
|
39
|
+
pdf2dotmd input.pdf -p 1-3
|
|
40
|
+
pdf2dotmd input.pdf -p 1-5,8,10-12
|
|
41
|
+
|
|
42
|
+
# Verbose logging
|
|
43
|
+
pdf2dotmd input.pdf -v
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## How It Works
|
|
47
|
+
|
|
48
|
+
1. **Character extraction** — uses [pdfplumber](https://github.com/jsvine/pdfplumber) to extract individual characters with position data
|
|
49
|
+
2. **Line grouping** — clusters characters into text lines by y-coordinate proximity
|
|
50
|
+
3. **Block formation** — groups lines into paragraphs based on horizontal alignment and vertical spacing
|
|
51
|
+
4. **Column detection** — identifies multi-column layouts by analyzing horizontal text density gaps
|
|
52
|
+
5. **Reading order** — sorts blocks top-to-bottom, left-to-right, handling spanning titles
|
|
53
|
+
6. **Header/footer removal** — detects repeated elements across pages
|
|
54
|
+
7. **Heading inference** — maps font sizes to heading levels (H1-H6)
|
|
55
|
+
|
|
56
|
+
## Limitations
|
|
57
|
+
|
|
58
|
+
- **Scanned PDFs** — OCR is not supported; scanned/image-only PDFs will produce empty output
|
|
59
|
+
- **Encrypted PDFs** — password-protected PDFs are not supported
|
|
60
|
+
- **Complex layouts** — highly irregular layouts may not parse perfectly
|
|
61
|
+
|
|
62
|
+
## License
|
|
63
|
+
|
|
64
|
+
MIT
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Command line interface for PDF to Markdown converter."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
from glob import glob
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from .converter import PdfToMarkdownConverter
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def main():
    """Entry point for the ``pdf2dotmd`` command-line interface.

    Parses the command line, expands every input pattern, and converts each
    matching ``.pdf`` file with :class:`PdfToMarkdownConverter`. When no
    ``-o/--output`` is given, the generated Markdown is also printed to
    stdout.

    Exit status:
        * ``1`` when the conversion is interrupted or fails.
        * ``2`` (via ``parser.error``) when several PDFs would be written to
          one single output file, which would silently overwrite all but the
          last result.
    """
    parser = argparse.ArgumentParser(
        description="Convert PDF files to Markdown format",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  %(prog)s input.pdf                      # Output to stdout
  %(prog)s input.pdf -o output.md         # Output to file
  %(prog)s input.pdf --ignore-images      # Skip images, single file output
  %(prog)s *.pdf -o output_dir/           # Batch conversion
  %(prog)s input.pdf -p 1-3               # Convert only pages 1-3
        """,
    )

    parser.add_argument(
        "input_files",
        nargs="+",
        help="Input PDF file paths (supports wildcards)",
    )
    parser.add_argument("-o", "--output", help="Output file or directory path")
    parser.add_argument(
        "--ignore-images",
        "--no-images",
        action="store_true",
        dest="ignore_images",
        help="Ignore all images and output a single Markdown file",
    )
    parser.add_argument(
        "-p",
        "--pages",
        help="Page range to convert (e.g., '1-5,8,10-12')",
    )
    parser.add_argument("-v", "--verbose", action="store_true", help="Show verbose logs")

    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )

    # Expand all patterns up front so the number of PDFs is known before
    # anything is written.
    pdf_files = []
    for input_pattern in args.input_files:
        matching_files = glob(input_pattern)
        if not matching_files and os.path.isfile(input_pattern):
            # A literal file name containing glob metacharacters (for example
            # "report[1].pdf") does not match itself as a pattern.
            matching_files = [input_pattern]
        if not matching_files:
            logger.warning("No matching files found: %s", input_pattern)
            continue

        for file_path in matching_files:
            if not file_path.lower().endswith(".pdf"):
                logger.warning("Skipping non-PDF file: %s", file_path)
                continue
            pdf_files.append(file_path)

    # A trailing separator marks a directory target even if it does not exist
    # yet; os.sep covers the native Windows separator as well as "/".
    output_is_dir = bool(args.output) and (
        os.path.isdir(args.output) or args.output.endswith(("/", os.sep))
    )

    if args.output and not output_is_dir and len(pdf_files) > 1:
        parser.error(
            f"{len(pdf_files)} PDF files would all be written to '{args.output}'; "
            "pass a directory (for example 'output_dir/') as --output instead"
        )

    converter = PdfToMarkdownConverter()

    try:
        for file_path in pdf_files:
            output_path = None
            if args.output:
                if output_is_dir:
                    output_path = os.path.join(args.output, f"{Path(file_path).stem}.md")
                else:
                    output_path = args.output

            markdown_content = converter.convert_file(
                file_path,
                output_path=output_path,
                ignore_images=args.ignore_images,
                pages=args.pages,
            )

            if not output_path:
                print(f"\n=== {file_path} ===\n")
                print(markdown_content)
    except KeyboardInterrupt:
        logger.info("Conversion interrupted by user")
        sys.exit(1)
    except Exception as exc:  # pylint: disable=broad-except
        # Top-level boundary: report the failure; include the traceback only
        # when the user asked for verbose output.
        logger.error("Program execution failed: %s", exc, exc_info=args.verbose)
        sys.exit(1)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
if __name__ == "__main__":
|
|
100
|
+
main()
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""Core converter module for PDF to Markdown conversion."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import pdfplumber # type: ignore[import-not-found]
|
|
12
|
+
except ImportError: # pragma: no cover
|
|
13
|
+
pdfplumber = None # type: ignore[assignment]
|
|
14
|
+
|
|
15
|
+
from .image_extractor import ImageExtractor
|
|
16
|
+
from .layout_analyzer import LayoutAnalyzer
|
|
17
|
+
from .page_processor import PageProcessor
|
|
18
|
+
from .table_processor import TableProcessor
|
|
19
|
+
from .utils import clean_markdown_content
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _parse_page_range(page_spec: str, total_pages: int) -> list[int]:
|
|
25
|
+
"""Parse a page range string like '1-5,8,10-12' into 0-based page indices."""
|
|
26
|
+
indices: list[int] = []
|
|
27
|
+
for part in page_spec.split(","):
|
|
28
|
+
part = part.strip()
|
|
29
|
+
if "-" in part:
|
|
30
|
+
start, end = part.split("-", 1)
|
|
31
|
+
start = int(start.strip())
|
|
32
|
+
end = int(end.strip())
|
|
33
|
+
for p in range(start, end + 1):
|
|
34
|
+
if 1 <= p <= total_pages:
|
|
35
|
+
indices.append(p - 1)
|
|
36
|
+
else:
|
|
37
|
+
p = int(part)
|
|
38
|
+
if 1 <= p <= total_pages:
|
|
39
|
+
indices.append(p - 1)
|
|
40
|
+
return sorted(set(indices))
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class PdfToMarkdownConverter:
    """PDF to Markdown converter.

    Each call to :meth:`convert_file` prepares an output folder (and, unless
    images are ignored, an ``assets`` sub-folder next to the Markdown file),
    runs every selected page through a ``PageProcessor`` and writes the
    cleaned Markdown to disk.
    """

    def __init__(self):
        # Folder that holds the assets directory for the current conversion.
        self.output_folder: str = ""
        # Folder for extracted images; empty string when images are ignored.
        self.assets_dir: str = ""

    def convert_file(
        self,
        input_path: str,
        output_path: Optional[str] = None,
        ignore_images: bool = False,
        pages: Optional[str] = None,
    ) -> str:
        """Convert one PDF file to Markdown and write the result to disk.

        Args:
            input_path: Path of the PDF to convert; must end in ``.pdf``.
            output_path: Target Markdown file, or a directory (existing, or
                written with a trailing path separator). When omitted the
                result is written to ``<stem>/<stem>.md``.
            ignore_images: When true, no image extractor and no assets
                directory are created.
            pages: Optional page selection such as ``"1-5,8,10-12"``.

        Returns:
            The generated Markdown text.

        Raises:
            FileNotFoundError: If ``input_path`` does not exist.
            ValueError: If the input is not a ``.pdf`` file, or ``pages``
                selects no page of the document.
            RuntimeError: If ``pdfplumber`` is not installed.
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input file does not exist: {input_path}")

        if not input_path.lower().endswith(".pdf"):
            raise ValueError(f"Only .pdf is supported: {input_path}")

        if pdfplumber is None:
            raise RuntimeError(
                "Missing required dependency 'pdfplumber'. Please run: pip install pdfplumber"
            )

        self._setup_output_structure(input_path, output_path, ignore_images)
        final_output_path = self._get_final_output_path(input_path, output_path)

        layout_analyzer = LayoutAnalyzer()
        table_processor = TableProcessor()
        image_extractor = (
            ImageExtractor(self.assets_dir) if not ignore_images and self.assets_dir else None
        )
        page_processor = PageProcessor(
            layout_analyzer=layout_analyzer,
            table_processor=table_processor,
            image_extractor=image_extractor,
            ignore_images=ignore_images,
        )

        output_lines: list[str] = []

        with pdfplumber.open(input_path) as pdf:
            total_pages = len(pdf.pages)

            if total_pages == 0:
                logger.warning("PDF has no pages: %s", input_path)
                markdown_content = "\n"
                self._write_output(markdown_content, final_output_path)
                # Nothing can have been extracted, so do not leave an empty
                # assets directory behind.
                self._cleanup_empty_assets_dir()
                return markdown_content

            # Determine page indices
            if pages:
                page_indices = _parse_page_range(pages, total_pages)
                if not page_indices:
                    raise ValueError(f"No valid pages in range '{pages}' (total: {total_pages})")
            else:
                page_indices = list(range(total_pages))

            # Check if the PDF is scanned (no text on any selected page)
            has_text = any(pdf.pages[idx].chars for idx in page_indices)
            if not has_text:
                logger.warning(
                    "PDF appears to have no text layer (possibly scanned). "
                    "OCR is not supported. Output may be empty."
                )

            for idx in page_indices:
                page = pdf.pages[idx]
                page_number = idx + 1
                logger.debug("Processing page %d/%d", page_number, total_pages)

                page_lines = page_processor.process_page(page, page_number)
                output_lines.extend(page_lines)

        markdown_content = clean_markdown_content(output_lines)
        self._write_output(markdown_content, final_output_path)
        self._cleanup_empty_assets_dir()

        logger.info("Conversion completed, output file: %s", final_output_path)
        return markdown_content

    @staticmethod
    def _is_directory_target(output_path: str) -> bool:
        """Return True when ``output_path`` names a directory rather than a file.

        A path counts as a directory when it already is one, or when it ends
        with a path separator ("/" or the platform's ``os.sep``).
        """
        return os.path.isdir(output_path) or output_path.endswith(("/", os.sep))

    def _setup_output_structure(
        self, input_path: str, output_path: Optional[str], ignore_images: bool
    ):
        """Create the output folder and, unless images are ignored, ``assets``.

        Sets ``self.output_folder`` and ``self.assets_dir`` as a side effect.
        """
        input_stem = Path(input_path).stem

        if not output_path:
            self.output_folder = input_stem
        elif self._is_directory_target(output_path):
            self.output_folder = os.path.join(output_path, input_stem)
        else:
            # A bare file name such as "out.md" has no directory component.
            # The Markdown is then written into the current directory, so the
            # assets folder must live there too for the relative "assets/..."
            # links to resolve.
            self.output_folder = os.path.dirname(output_path) or os.curdir

        os.makedirs(self.output_folder, exist_ok=True)

        if ignore_images:
            self.assets_dir = ""
        else:
            self.assets_dir = os.path.join(self.output_folder, "assets")
            os.makedirs(self.assets_dir, exist_ok=True)

    def _get_final_output_path(self, input_path: str, output_path: Optional[str]) -> str:
        """Return the path of the Markdown file to write.

        An explicit file path is returned unchanged; otherwise the file is
        ``<output_folder>/<stem>.md``. Relies on ``self.output_folder`` having
        been set by ``_setup_output_structure``.
        """
        if output_path and not self._is_directory_target(output_path):
            return output_path

        input_stem = Path(input_path).stem
        return os.path.join(self.output_folder, f"{input_stem}.md")

    def _write_output(self, content: str, output_path: str):
        """Write ``content`` to ``output_path`` as UTF-8, creating parent folders."""
        output_dir = os.path.dirname(output_path)
        if output_dir:
            # exist_ok avoids failing if the folder appears between a check
            # and the creation.
            os.makedirs(output_dir, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(content)

    def _cleanup_empty_assets_dir(self):
        """Remove the assets directory when no file was written into it."""
        if self.assets_dir and os.path.isdir(self.assets_dir) and not os.listdir(self.assets_dir):
            os.rmdir(self.assets_dir)
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Image extraction from PDF pages."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ImageExtractor:
    """Extract images from PDF pages into an assets directory."""

    def __init__(self, assets_dir: str):
        self.assets_dir = assets_dir
        # Running counter over every image seen by this extractor; it is part
        # of the output file name, so names stay unique across pages.
        self._image_count = 0

    def extract_images(self, page, page_number: int) -> list[tuple[str, tuple[float, float, float, float]]]:
        """Extract images from a pdfplumber page.

        Args:
            page: Object exposing an ``images`` list of dicts with ``x0``,
                ``top``, ``x1`` and ``bottom`` keys (a pdfplumber page).
            page_number: 1-based page number, used in file names and logs.

        Returns:
            List of ``(markdown_path, bounding_box)`` tuples, one per image
            whose data could be saved. The bounding box is
            ``(x0, top, x1, bottom)``.
        """
        results: list[tuple[str, tuple[float, float, float, float]]] = []

        try:
            images = page.images
        except Exception as exc:
            logger.warning("Failed to get images from page %d: %s", page_number, exc)
            return results

        for img_info in images:
            self._image_count += 1
            bbox = (img_info["x0"], img_info["top"], img_info["x1"], img_info["bottom"])

            image_path = self._save_image_from_page(page, page_number, self._image_count, img_info)
            if image_path:
                results.append((image_path, bbox))

        return results

    def _save_image_from_page(
        self, page, page_number: int, image_index: int, img_info: dict
    ) -> str:
        """Save the data of one image and return its relative Markdown path.

        The stream attached to ``img_info`` is used when present, so every
        image on a page is saved from its own data. Only when it is missing is
        the page's XObject table searched for the entry named
        ``img_info["name"]``.

        Returns:
            ``"assets/<file name>"`` on success, or an empty string when the
            image data could not be located or written.
        """
        try:
            stream = img_info.get("stream")
            if stream is None:
                stream = self._find_image_xobject(page, img_info.get("name"))

            if stream is None:
                logger.debug(
                    "Could not extract image data for image %d on page %d",
                    image_index,
                    page_number,
                )
                return ""

            # Inline images may carry the abbreviated keys (W, H, F).
            width = int(stream.get("Width", stream.get("W", 0)) or 0)
            height = int(stream.get("Height", stream.get("H", 0)) or 0)
            if width <= 0 or height <= 0:
                logger.debug(
                    "Could not extract image data for image %d on page %d",
                    image_index,
                    page_number,
                )
                return ""

            # Determine format from the stream filter(s).
            filters = stream.get("Filter", stream.get("F", ""))
            if not isinstance(filters, (list, tuple)):
                filters = [filters]
            ext = "jpg" if any("DCT" in str(f) for f in filters) else "png"

            filename = f"page{page_number:03d}_img{image_index:02d}.{ext}"
            os.makedirs(self.assets_dir, exist_ok=True)
            output_path = Path(self.assets_dir) / filename

            # NOTE(review): for non-DCT images the bytes returned by
            # get_data() are written unchanged under a ".png" name; they are
            # presumably decoded samples rather than an encoded PNG file --
            # confirm and add real PNG encoding if so.
            output_path.write_bytes(stream.get_data())

            return f"assets/{filename}"

        except Exception as exc:
            logger.warning(
                "Failed to extract image %d on page %d: %s",
                image_index,
                page_number,
                exc,
            )
            return ""

    @staticmethod
    def _resolve(obj):
        """Follow an indirect reference (anything with ``resolve()``), else return ``obj``."""
        resolver = getattr(obj, "resolve", None)
        return resolver() if callable(resolver) else obj

    @staticmethod
    def _literal_name(value) -> str:
        """Return the plain name of a PDF name object.

        Name objects expose their text through a ``name`` attribute and do not
        compare equal to ``str``, so they must be unwrapped before comparison.
        """
        name = getattr(value, "name", value)
        if isinstance(name, bytes):
            name = name.decode("latin-1")
        return str(name)

    @classmethod
    def _find_image_xobject(cls, page, name):
        """Look up the image XObject called ``name`` in the page resources.

        Returns the stream object, or ``None`` when the page has no such
        XObject or the entry is not an image.
        """
        if not name:
            return None

        resources = cls._resolve(getattr(getattr(page, "page", None), "resources", None))
        if not resources:
            return None

        xobjects = cls._resolve(resources.get("XObject"))
        if not xobjects:
            return None

        candidate = cls._resolve(xobjects.get(name))
        if candidate is None or not hasattr(candidate, "get_data"):
            return None
        if cls._literal_name(candidate.get("Subtype")) != "Image":
            return None
        return candidate
|