pdf2dotmd 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pdf2dotmd/__init__.py ADDED
@@ -0,0 +1,3 @@
1
"""pdf2dotmd package."""

__version__ = "0.0.1"
pdf2dotmd/cli.py ADDED
@@ -0,0 +1,100 @@
1
+ """Command line interface for PDF to Markdown converter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import logging
7
+ import os
8
+ import sys
9
+ from glob import glob
10
+ from pathlib import Path
11
+
12
+ from .converter import PdfToMarkdownConverter
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def main():
    """Command line entry point: convert one or more PDFs to Markdown.

    Expands each input pattern with glob (sorted for deterministic order),
    skips non-PDF matches, and converts every remaining file. Output goes
    to stdout unless -o is given; -o may name a file or a directory (a
    trailing '/' forces directory semantics even if it does not exist yet).

    Exits with status 1 on KeyboardInterrupt or any unexpected error.
    """
    parser = argparse.ArgumentParser(
        description="Convert PDF files to Markdown format",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example usage:
  %(prog)s input.pdf                  # Output to stdout
  %(prog)s input.pdf -o output.md     # Output to file
  %(prog)s input.pdf --ignore-images  # Skip images, single file output
  %(prog)s *.pdf -o output_dir/       # Batch conversion
  %(prog)s input.pdf -p 1-3           # Convert only pages 1-3
""",
    )

    parser.add_argument(
        "input_files",
        nargs="+",
        help="Input PDF file paths (supports wildcards)",
    )
    parser.add_argument("-o", "--output", help="Output file or directory path")
    parser.add_argument(
        "--ignore-images",
        "--no-images",
        action="store_true",
        dest="ignore_images",
        help="Ignore all images and output a single Markdown file",
    )
    parser.add_argument(
        "-p",
        "--pages",
        help="Page range to convert (e.g., '1-5,8,10-12')",
    )
    parser.add_argument("-v", "--verbose", action="store_true", help="Show verbose logs")

    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
    )

    converter = PdfToMarkdownConverter()

    try:
        for input_pattern in args.input_files:
            # glob() returns matches in arbitrary, platform-dependent order;
            # sort so batch runs process files deterministically.
            matching_files = sorted(glob(input_pattern))
            if not matching_files:
                logger.warning("No matching files found: %s", input_pattern)
                continue

            for file_path in matching_files:
                if not file_path.lower().endswith(".pdf"):
                    logger.warning("Skipping non-PDF file: %s", file_path)
                    continue

                output_path = None
                if args.output:
                    if os.path.isdir(args.output) or args.output.endswith("/"):
                        # Directory target: derive <stem>.md inside it.
                        output_path = os.path.join(
                            args.output, f"{Path(file_path).stem}.md"
                        )
                    else:
                        # NOTE(review): a single-file -o combined with multiple
                        # inputs overwrites the same file on each iteration —
                        # confirm whether that is intended.
                        output_path = args.output

                markdown_content = converter.convert_file(
                    file_path,
                    output_path=output_path,
                    ignore_images=args.ignore_images,
                    pages=args.pages,
                )

                # No explicit output target: dump to stdout with a banner.
                if not output_path:
                    print(f"\n=== {file_path} ===\n")
                    print(markdown_content)
    except KeyboardInterrupt:
        logger.info("Conversion interrupted by user")
        sys.exit(1)
    except Exception as exc:  # pylint: disable=broad-except
        # Top-level boundary: log and exit non-zero instead of dumping a traceback.
        logger.error("Program execution failed: %s", exc)
        sys.exit(1)
97
+
98
+
99
# Allow direct execution of this module as a script.
if __name__ == "__main__":
    main()
pdf2dotmd/converter.py ADDED
@@ -0,0 +1,172 @@
1
+ """Core converter module for PDF to Markdown conversion."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import os
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ try:
11
+ import pdfplumber # type: ignore[import-not-found]
12
+ except ImportError: # pragma: no cover
13
+ pdfplumber = None # type: ignore[assignment]
14
+
15
+ from .image_extractor import ImageExtractor
16
+ from .layout_analyzer import LayoutAnalyzer
17
+ from .page_processor import PageProcessor
18
+ from .table_processor import TableProcessor
19
+ from .utils import clean_markdown_content
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ def _parse_page_range(page_spec: str, total_pages: int) -> list[int]:
25
+ """Parse a page range string like '1-5,8,10-12' into 0-based page indices."""
26
+ indices: list[int] = []
27
+ for part in page_spec.split(","):
28
+ part = part.strip()
29
+ if "-" in part:
30
+ start, end = part.split("-", 1)
31
+ start = int(start.strip())
32
+ end = int(end.strip())
33
+ for p in range(start, end + 1):
34
+ if 1 <= p <= total_pages:
35
+ indices.append(p - 1)
36
+ else:
37
+ p = int(part)
38
+ if 1 <= p <= total_pages:
39
+ indices.append(p - 1)
40
+ return sorted(set(indices))
41
+
42
+
43
class PdfToMarkdownConverter:
    """PDF to Markdown converter.

    Orchestrates layout analysis, table extraction, and image extraction for
    one input file, writing the Markdown next to an ``assets/`` directory
    (unless images are ignored).
    """

    def __init__(self):
        # Directory that receives the .md file (and the assets/ subfolder).
        self.output_folder: str = ""
        # Directory for extracted images; empty string when images are ignored.
        self.assets_dir: str = ""

    def convert_file(
        self,
        input_path: str,
        output_path: Optional[str] = None,
        ignore_images: bool = False,
        pages: Optional[str] = None,
    ) -> str:
        """Convert one PDF file to Markdown and return the Markdown text.

        Args:
            input_path: Path to an existing ``.pdf`` file.
            output_path: Optional output file or directory (a trailing '/'
                forces directory semantics even if it does not exist yet).
            ignore_images: Skip image extraction entirely.
            pages: Optional 1-based page selection like ``'1-5,8,10-12'``.

        Raises:
            FileNotFoundError: ``input_path`` does not exist.
            ValueError: input is not a .pdf, or ``pages`` selects no pages.
            RuntimeError: pdfplumber is not installed.
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input file does not exist: {input_path}")

        if not input_path.lower().endswith(".pdf"):
            raise ValueError(f"Only .pdf is supported: {input_path}")

        if pdfplumber is None:
            raise RuntimeError(
                "Missing required dependency 'pdfplumber'. Please run: pip install pdfplumber"
            )

        self._setup_output_structure(input_path, output_path, ignore_images)

        layout_analyzer = LayoutAnalyzer()
        table_processor = TableProcessor()
        image_extractor = (
            ImageExtractor(self.assets_dir) if not ignore_images and self.assets_dir else None
        )
        page_processor = PageProcessor(
            layout_analyzer=layout_analyzer,
            table_processor=table_processor,
            image_extractor=image_extractor,
            ignore_images=ignore_images,
        )

        output_lines: list[str] = []

        with pdfplumber.open(input_path) as pdf:
            total_pages = len(pdf.pages)

            if total_pages == 0:
                logger.warning("PDF has no pages: %s", input_path)
                markdown_content = "\n"
                self._write_output(
                    markdown_content,
                    self._get_final_output_path(input_path, output_path),
                )
                # Fix: also remove the never-populated assets directory on this
                # early-return path, matching the normal path below (the
                # original left an empty assets/ folder behind here).
                self._cleanup_empty_assets_dir()
                return markdown_content

            # Determine which pages to convert (0-based indices).
            if pages:
                page_indices = _parse_page_range(pages, total_pages)
                if not page_indices:
                    raise ValueError(f"No valid pages in range '{pages}' (total: {total_pages})")
            else:
                page_indices = list(range(total_pages))

            # Warn when no selected page carries a text layer (likely a
            # scanned document) — extraction will then yield little or nothing.
            if not any(pdf.pages[idx].chars for idx in page_indices):
                logger.warning(
                    "PDF appears to have no text layer (possibly scanned). "
                    "OCR is not supported. Output may be empty."
                )

            for idx in page_indices:
                page = pdf.pages[idx]
                page_number = idx + 1
                logger.debug("Processing page %d/%d", page_number, total_pages)

                page_lines = page_processor.process_page(page, page_number)
                output_lines.extend(page_lines)

        markdown_content = clean_markdown_content(output_lines)
        final_output_path = self._get_final_output_path(input_path, output_path)
        self._write_output(markdown_content, final_output_path)
        self._cleanup_empty_assets_dir()

        logger.info("Conversion completed, output file: %s", final_output_path)
        return markdown_content

    def _setup_output_structure(
        self, input_path: str, output_path: Optional[str], ignore_images: bool
    ):
        """Decide output_folder/assets_dir and create them on disk.

        Default (no output_path): a folder named after the input stem in the
        current working directory.
        """
        input_stem = Path(input_path).stem

        if output_path:
            if os.path.isdir(output_path) or output_path.endswith("/"):
                # Directory target: nest a per-document folder inside it.
                self.output_folder = os.path.join(output_path, input_stem)
            else:
                # File target: assets live next to the output file.
                self.output_folder = os.path.dirname(output_path)
                if not self.output_folder:
                    self.output_folder = input_stem
        else:
            self.output_folder = input_stem

        os.makedirs(self.output_folder, exist_ok=True)

        if ignore_images:
            self.assets_dir = ""
        else:
            self.assets_dir = os.path.join(self.output_folder, "assets")
            os.makedirs(self.assets_dir, exist_ok=True)

    def _get_final_output_path(self, input_path: str, output_path: Optional[str]) -> str:
        """Resolve the concrete .md file path for this conversion."""
        input_stem = Path(input_path).stem

        if output_path:
            if os.path.isdir(output_path) or output_path.endswith("/"):
                return os.path.join(self.output_folder, f"{input_stem}.md")
            return output_path

        return os.path.join(self.output_folder, f"{input_stem}.md")

    def _write_output(self, content: str, output_path: str):
        """Write the Markdown content as UTF-8, creating parent dirs as needed."""
        output_dir = os.path.dirname(output_path)
        if output_dir:
            # exist_ok avoids the check-then-create race of the original.
            os.makedirs(output_dir, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(content)

    def _cleanup_empty_assets_dir(self):
        """Remove the assets directory if it exists and ended up empty."""
        if self.assets_dir and os.path.exists(self.assets_dir) and not os.listdir(self.assets_dir):
            os.rmdir(self.assets_dir)
@@ -0,0 +1,110 @@
1
+ """Image extraction from PDF pages."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import os
7
+ from pathlib import Path
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class ImageExtractor:
    """Extract images from PDF pages into an assets directory."""

    def __init__(self, assets_dir: str):
        # Target directory for extracted image files (created lazily on save).
        self.assets_dir = assets_dir
        # Running counter across all pages, used to build unique filenames.
        self._image_count = 0

    def extract_images(self, page, page_number: int) -> list[tuple[str, tuple[float, float, float, float]]]:
        """Extract images from a pdfplumber page.

        Returns:
            List of (markdown_path, bounding_box) tuples; bounding_box is
            (x0, top, x1, bottom) as reported by pdfplumber.
        """
        results: list[tuple[str, tuple[float, float, float, float]]] = []

        try:
            images = page.images
        except Exception as exc:
            logger.warning("Failed to get images from page %d: %s", page_number, exc)
            return results

        for img_info in images:
            self._image_count += 1
            bbox = (img_info["x0"], img_info["top"], img_info["x1"], img_info["bottom"])

            # Try to extract the actual image data; skip entries we cannot save.
            image_path = self._save_image_from_page(page, page_number, self._image_count, img_info)
            if image_path:
                results.append((image_path, bbox))

        return results

    def _save_image_from_page(
        self, page, page_number: int, image_index: int, img_info: dict
    ) -> str:
        """Try to extract and save one image.

        Returns the Markdown-relative path ("assets/<filename>") on success,
        or an empty string when no image stream could be saved.
        """
        try:
            # Access the underlying pdfminer page to reach image XObjects.
            pdfminer_page = page.page
            resources = pdfminer_page.resources

            if resources is None:
                return ""

            xobjects = resources.get("XObject", {})
            if not xobjects:
                return ""

            # Iterate XObjects until one yields a saveable image stream.
            for obj_name in xobjects:
                try:
                    xobj = xobjects[obj_name].resolve()
                    if xobj.get("Subtype") != "Image":
                        continue

                    width = int(xobj.get("Width", 0))
                    height = int(xobj.get("Height", 0))
                    if width <= 0 or height <= 0:
                        continue

                    # Choose extension from the stream filter: DCTDecode means
                    # the stream is raw JPEG data; everything else is written
                    # as .png. NOTE(review): non-JPEG streams are written
                    # verbatim, so the ".png" bytes may not actually be valid
                    # PNG — confirm downstream tolerance.
                    filters = xobj.get("Filter", "")
                    if isinstance(filters, list):
                        ext = "jpg" if any("DCT" in str(f) for f in filters) else "png"
                    elif "DCT" in str(filters):
                        ext = "jpg"
                    else:
                        ext = "png"

                    filename = f"page{page_number:03d}_img{image_index:02d}.{ext}"
                    os.makedirs(self.assets_dir, exist_ok=True)
                    output_path = Path(self.assets_dir) / filename

                    # Extract raw stream data and write it to disk.
                    stream = xobj.get_data()
                    output_path.write_bytes(stream)

                    # Fix: the original returned the literal "assets/(unknown)"
                    # (a placeholder that leaked into the release), so every
                    # generated image link pointed at a nonexistent file.
                    return f"assets/{filename}"
                except Exception:
                    continue

            # Fallback: no XObject produced a usable stream.
            logger.debug(
                "Could not extract image data for image %d on page %d",
                image_index,
                page_number,
            )
            return ""

        except Exception as exc:
            logger.warning(
                "Failed to extract image %d on page %d: %s",
                image_index,
                page_number,
                exc,
            )
            return ""
@@ -0,0 +1,478 @@
1
+ """Layout analysis for reconstructing reading order from PDF spatial data."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import re
7
+ from collections import Counter
8
+ from typing import Optional
9
+
10
+ from .text_block import TextBlock
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
# Minimum width of a whitespace run (as a fraction of page width) for it to
# count as a column boundary in LayoutAnalyzer._detect_columns.
COLUMN_GAP_MIN_FRACTION = 0.15

# Lower bound on the y-tolerance (points) when clustering characters into
# lines; a per-character adaptive tolerance based on font size may exceed it.
DEFAULT_Y_TOLERANCE = 3.0

# Maximum vertical gap (as a multiple of average line height) allowed when
# grouping consecutive lines into one paragraph block.
BLOCK_MAX_GAP_LINES = 1.5

# Minimum horizontal overlap ratio for two lines to belong to the same block.
BLOCK_MIN_OVERLAP = 0.5
25
+
26
+
27
+ class _TextLine:
28
+ """Internal intermediate: a single line of characters on a page."""
29
+
30
+ __slots__ = ("chars", "text", "x0", "y0", "x1", "y1", "font_name", "font_size")
31
+
32
+ def __init__(self):
33
+ self.chars: list[dict] = []
34
+ self.text: str = ""
35
+ self.x0: float = float("inf")
36
+ self.y0: float = float("inf")
37
+ self.x1: float = 0.0
38
+ self.y1: float = 0.0
39
+ self.font_name: str = ""
40
+ self.font_size: float = 0.0
41
+
42
+ def add_char(self, char: dict):
43
+ self.chars.append(char)
44
+ self.x0 = min(self.x0, char["x0"])
45
+ self.y0 = min(self.y0, char["y0"])
46
+ self.x1 = max(self.x1, char["x1"])
47
+ self.y1 = max(self.y1, char["y1"])
48
+
49
+ def finalize(self):
50
+ if not self.chars:
51
+ return
52
+ self.chars.sort(key=lambda c: c["x0"])
53
+ self.text = "".join(c["text"] for c in self.chars)
54
+ # Dominant font
55
+ font_counter = Counter(c.get("fontname", "") for c in self.chars)
56
+ self.font_name = font_counter.most_common(1)[0][0]
57
+ size_counter = Counter(c.get("size", 0) for c in self.chars)
58
+ self.font_size = size_counter.most_common(1)[0][0]
59
+
60
+ @property
61
+ def center_y(self) -> float:
62
+ return (self.y0 + self.y1) / 2
63
+
64
+ @property
65
+ def width(self) -> float:
66
+ return self.x1 - self.x0
67
+
68
+ @property
69
+ def height(self) -> float:
70
+ return self.y1 - self.y0
71
+
72
+
73
class LayoutAnalyzer:
    """Analyze PDF page layout to reconstruct reading order."""

    def __init__(self):
        # NOTE(review): these accumulators are never written by any method in
        # this class as released — kept only for interface stability.
        self._header_texts: list[str] = []
        self._footer_texts: list[str] = []

    def analyze(self, page, page_number: int) -> list[TextBlock]:
        """Analyze a pdfplumber page and return TextBlocks in reading order."""
        chars = page.chars
        if not chars:
            logger.debug("Page %d has no text characters", page_number)
            return []

        page_width = page.width
        page_height = page.height

        # Step 1: Group characters into text lines
        lines = self._group_chars_to_lines(chars)

        # Step 2: Group lines into text blocks (paragraphs)
        blocks = self._group_lines_to_blocks(lines, page_number)

        # Step 3: Detect columns
        columns = self._detect_columns(blocks, page_width)

        # Step 4: Sort blocks into reading order
        ordered = self._apply_reading_order(blocks, columns, page_width)

        # Step 5: Detect headers/footers
        self._detect_headers_footers(ordered, page_height)

        # Step 6: Merge hyphenated line breaks
        self._merge_hyphenation(ordered)

        return ordered

    def analyze_multi_page(self, pages_with_numbers: list[tuple]) -> list[TextBlock]:
        """Analyze multiple pages and detect repeated headers/footers.

        Args:
            pages_with_numbers: list of (page, page_number) tuples.
        """
        all_blocks: list[TextBlock] = []
        per_page_top: list[tuple[str, int]] = []
        per_page_bottom: list[tuple[str, int]] = []

        for page, page_number in pages_with_numbers:
            blocks = self.analyze(page, page_number)
            if blocks:
                # Topmost block (highest y1 in PDF coords = visually top).
                top_block = max(blocks, key=lambda b: b.y1)
                per_page_top.append((top_block.text.strip(), page_number))
                # Bottommost block.
                bottom_block = min(blocks, key=lambda b: b.y0)
                per_page_bottom.append((bottom_block.text.strip(), page_number))
            all_blocks.extend(blocks)

        # Mark repeated headers/footers across pages.
        self._mark_repeated_elements(all_blocks, per_page_top, per_page_bottom)

        return all_blocks

    def _group_chars_to_lines(self, chars: list[dict]) -> list[_TextLine]:
        """Group characters into text lines by y-coordinate clustering.

        Fix: the original sorted and compared character centers using
        pdfplumber's top-based 'top'/'bottom' coordinates while
        _TextLine.center_y is built from PDF 'y0'/'y1' coordinates (y grows
        upward), so the line-grouping tolerance test compared values from two
        different coordinate frames. All y math below now uses y0/y1.
        """
        if not chars:
            return []

        # Sort by y center descending (top of page first in PDF coords), then x.
        sorted_chars = sorted(chars, key=lambda c: (-(c["y0"] + c["y1"]) / 2, c["x0"]))

        lines: list[_TextLine] = []
        current_line = _TextLine()

        for char in sorted_chars:
            char_cy = (char["y0"] + char["y1"]) / 2
            # Adaptive tolerance: at least DEFAULT_Y_TOLERANCE, scaled by font size.
            tolerance = max(DEFAULT_Y_TOLERANCE, char.get("size", 12) * 0.4)

            if current_line.chars:
                if abs(char_cy - current_line.center_y) <= tolerance:
                    current_line.add_char(char)
                else:
                    # Start a new line; keep the finished one only if non-blank.
                    current_line.finalize()
                    if current_line.text.strip():
                        lines.append(current_line)
                    current_line = _TextLine()
                    current_line.add_char(char)
            else:
                current_line.add_char(char)

        if current_line.chars:
            current_line.finalize()
            if current_line.text.strip():
                lines.append(current_line)

        # Sort lines top-to-bottom (descending PDF y).
        lines.sort(key=lambda l: -l.center_y)
        return lines

    def _group_lines_to_blocks(self, lines: list[_TextLine], page_number: int) -> list[TextBlock]:
        """Group adjacent text lines into TextBlock paragraphs."""
        if not lines:
            return []

        blocks: list[TextBlock] = []
        current_lines: list[_TextLine] = [lines[0]]

        for i in range(1, len(lines)):
            prev = current_lines[-1]
            curr = lines[i]

            # Lines belong to the same block only if they overlap horizontally
            # and the vertical gap between them stays small.
            overlap = self._horizontal_overlap(prev, curr)

            avg_height = (prev.height + curr.height) / 2
            if avg_height <= 0:
                avg_height = 12  # fallback for degenerate (zero-height) lines
            gap = prev.y0 - curr.y1  # positive means curr is below prev
            if gap < 0:
                gap = 0

            if overlap >= BLOCK_MIN_OVERLAP and gap < BLOCK_MAX_GAP_LINES * avg_height:
                current_lines.append(curr)
            else:
                blocks.append(self._lines_to_block(current_lines, page_number))
                current_lines = [curr]

        if current_lines:
            blocks.append(self._lines_to_block(current_lines, page_number))

        return blocks

    def _horizontal_overlap(self, a: _TextLine, b: _TextLine) -> float:
        """Return the horizontal overlap ratio (0..1) between two lines."""
        overlap = max(0.0, min(a.x1, b.x1) - max(a.x0, b.x0))
        min_width = min(a.width, b.width)
        if min_width <= 0:
            return 0.0
        return overlap / min_width

    def _lines_to_block(self, lines: list[_TextLine], page_number: int) -> TextBlock:
        """Merge multiple lines into a single TextBlock."""
        if not lines:
            return TextBlock(page_number=page_number)

        text_parts: list[str] = []
        for i, line in enumerate(lines):
            stripped = line.text.strip()
            if i > 0 and stripped:
                text_parts.append(" ")
            text_parts.append(stripped)

        text = "".join(text_parts).strip()

        # Dominant font info across the block's lines.
        font_counter = Counter(l.font_name for l in lines)
        font_name = font_counter.most_common(1)[0][0]
        size_counter = Counter(l.font_size for l in lines)
        font_size = size_counter.most_common(1)[0][0]

        # Heuristic: treat any font whose name mentions "bold" as bold.
        bold = "bold" in font_name.lower()

        return TextBlock(
            text=text,
            x0=min(l.x0 for l in lines),
            y0=min(l.y0 for l in lines),
            x1=max(l.x1 for l in lines),
            y1=max(l.y1 for l in lines),
            font_name=font_name,
            font_size=font_size,
            bold=bold,
            page_number=page_number,
        )

    def _detect_columns(self, blocks: list[TextBlock], page_width: float) -> list[tuple[float, float]]:
        """Detect column boundaries. Returns list of (x0, x1) column ranges."""
        if not blocks or page_width <= 0:
            return [(0, page_width)]

        # Collect block left edges; a single shared left edge means one column.
        left_edges = sorted(set(round(b.x0) for b in blocks))

        if len(left_edges) <= 1:
            return [(0, page_width)]

        # Divide the page into 40 vertical strips and accumulate text density
        # (character counts) per strip to find empty gutters.
        strip_width = page_width / 40
        strip_counts: dict[int, int] = {}
        for b in blocks:
            center_strip = int(b.center_x / strip_width)
            strip_counts[center_strip] = strip_counts.get(center_strip, 0) + len(b.text)

        # Find contiguous runs of empty strips.
        all_strips = range(40)
        gap_ranges: list[tuple[int, int]] = []
        gap_start = None

        for s in all_strips:
            if strip_counts.get(s, 0) == 0:
                if gap_start is None:
                    gap_start = s
            else:
                if gap_start is not None:
                    gap_ranges.append((gap_start, s - 1))
                    gap_start = None
        if gap_start is not None:
            gap_ranges.append((gap_start, 39))

        # Keep only gaps wide enough to be column separators.
        min_gap_strips = int(COLUMN_GAP_MIN_FRACTION * 40)
        significant_gaps = [
            (s * strip_width, (e + 1) * strip_width)
            for s, e in gap_ranges
            if (e - s + 1) >= min_gap_strips
        ]

        if not significant_gaps:
            return [(0, page_width)]

        # Build column ranges from the gaps between them.
        columns: list[tuple[float, float]] = []
        prev_end = 0.0
        for gap_start, gap_end in significant_gaps:
            if gap_start > prev_end:
                columns.append((prev_end, gap_start))
            prev_end = gap_end
        if prev_end < page_width:
            columns.append((prev_end, page_width))

        if len(columns) <= 1:
            return [(0, page_width)]

        return columns

    def _apply_reading_order(
        self,
        blocks: list[TextBlock],
        columns: list[tuple[float, float]],
        page_width: float,
    ) -> list[TextBlock]:
        """Sort blocks into logical reading order based on detected columns."""
        if len(columns) <= 1:
            # Single column: sort top-to-bottom (descending y1 in PDF coords).
            return sorted(blocks, key=lambda b: -b.y1)

        # Multi-column layout: bucket blocks by nearest column; full-width
        # ("spanning") blocks are interleaved separately.
        column_blocks: list[list[TextBlock]] = [[] for _ in columns]
        spanning: list[TextBlock] = []

        for block in blocks:
            if block.is_spanning(page_width):
                spanning.append(block)
            else:
                # Assign to the column whose center is closest to the block's.
                best_col = 0
                best_dist = float("inf")
                for i, (cx0, cx1) in enumerate(columns):
                    col_center = (cx0 + cx1) / 2
                    dist = abs(block.center_x - col_center)
                    if dist < best_dist:
                        best_dist = dist
                        best_col = i
                column_blocks[best_col].append(block)

        # Sort each column (and the spanning blocks) top-to-bottom.
        for col in column_blocks:
            col.sort(key=lambda b: -b.y1)
        spanning.sort(key=lambda b: -b.y1)

        # Merge: flush column content above each spanning block, then the
        # spanning block itself, then whatever remains.
        result: list[TextBlock] = []
        col_indices = [0] * len(columns)

        for block in spanning:
            for col_i, col in enumerate(column_blocks):
                while col_indices[col_i] < len(col) and col[col_indices[col_i]].y1 > block.y1:
                    result.append(col[col_indices[col_i]])
                    col_indices[col_i] += 1
            result.append(block)

        # Flush remaining column blocks by picking the topmost across columns.
        while any(col_indices[i] < len(column_blocks[i]) for i in range(len(columns))):
            top_block = None
            top_col = -1
            for col_i, col in enumerate(column_blocks):
                if col_indices[col_i] < len(col):
                    b = col[col_indices[col_i]]
                    if top_block is None or b.y1 > top_block.y1:
                        top_block = b
                        top_col = col_i
            if top_block is None:
                break
            result.append(top_block)
            col_indices[top_col] += 1

        return result

    def _detect_headers_footers(self, blocks: list[TextBlock], page_height: float):
        """Flag header/footer blocks based on their position on the page."""
        if not blocks:
            return

        for block in blocks:
            # Top 8% of page (y1 above 92% of height) → potential header.
            if block.y1 > page_height * 0.92:
                block.is_header = True
            # Bottom 8% of page → potential footer, but only if it looks like
            # a page number ("- 3 -", "12", "3 / 10", ...).
            if block.y0 < page_height * 0.08:
                footer_text = block.text.strip()
                if re.match(r"^[-—\s]*\d+[-—\s]*$", footer_text):
                    block.is_footer = True
                elif re.match(r"^\d+\s*/\s*\d+$", footer_text):
                    block.is_footer = True

    def _mark_repeated_elements(
        self,
        all_blocks: list[TextBlock],
        per_page_top: list[tuple[str, int]],
        per_page_bottom: list[tuple[str, int]],
    ):
        """Mark blocks as headers/footers if they repeat across >= 3 pages."""
        if len(per_page_top) < 3:
            return

        # Repeated topmost texts → headers.
        top_texts = [t for t, _ in per_page_top]
        top_counter = Counter(top_texts)
        for text, count in top_counter.items():
            if count >= 3:
                for block in all_blocks:
                    if block.text.strip() == text:
                        block.is_header = True

        # Repeated bottommost texts → footers.
        bottom_texts = [t for t, _ in per_page_bottom]
        bottom_counter = Counter(bottom_texts)
        for text, count in bottom_counter.items():
            if count >= 3:
                for block in all_blocks:
                    if block.text.strip() == text:
                        block.is_footer = True

    def _merge_hyphenation(self, blocks: list[TextBlock]):
        """Merge hyphenated words split across consecutive blocks."""
        i = 0
        while i < len(blocks) - 1:
            current = blocks[i]
            next_block = blocks[i + 1]
            if (
                current.text.endswith("-")
                and next_block.text
                and next_block.text[0].islower()
                and not current.is_header
                and not current.is_footer
                and not next_block.is_header
                and not next_block.is_footer
            ):
                # Remove the hyphen, join the texts, and absorb the next
                # block's bounding box.
                current.text = current.text[:-1] + next_block.text
                current.x1 = max(current.x1, next_block.x1)
                current.y0 = min(current.y0, next_block.y0)
                blocks.pop(i + 1)
                # Don't increment i — the merged block may end in '-' again.
            else:
                i += 1

    @staticmethod
    def infer_heading_levels(blocks: list[TextBlock]) -> dict[int, int]:
        """Infer heading levels (1-6) from font sizes. Returns {block_index: level}."""
        if not blocks:
            return {}

        # Consider only body content when sampling font sizes.
        sizes = [b.font_size for b in blocks if not b.is_header and not b.is_footer]
        if not sizes:
            return {}

        # Body text size = the most common size on the page(s).
        size_counter = Counter(sizes)
        body_size = size_counter.most_common(1)[0][0]

        # Distinct sizes clearly larger than body text, largest first →
        # largest size becomes H1, next H2, etc.
        larger_sizes = sorted(set(s for s in sizes if s > body_size * 1.1), reverse=True)

        heading_map: dict[int, int] = {}
        for idx, block in enumerate(blocks):
            if block.is_header or block.is_footer:
                continue
            if block.font_size > body_size * 1.1:
                try:
                    level = larger_sizes.index(block.font_size) + 1
                except ValueError:
                    level = 6
                level = min(level, 6)
                heading_map[idx] = level

        return heading_map
@@ -0,0 +1,119 @@
1
+ """Page-level processing: convert analyzed PDF page layout to Markdown lines."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Optional
7
+
8
+ from .image_extractor import ImageExtractor
9
+ from .layout_analyzer import LayoutAnalyzer
10
+ from .table_processor import TableProcessor
11
+ from .text_block import TextBlock
12
+ from .utils import escape_markdown
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class PageProcessor:
    """Convert one PDF page into Markdown lines."""

    def __init__(
        self,
        layout_analyzer: LayoutAnalyzer,
        table_processor: TableProcessor,
        image_extractor: Optional[ImageExtractor],
        ignore_images: bool = False,
    ):
        self.layout_analyzer = layout_analyzer
        self.table_processor = table_processor
        self.image_extractor = image_extractor
        self.ignore_images = ignore_images

    def process_page(self, page, page_number: int) -> list[str]:
        """Process a pdfplumber page and return Markdown lines.

        Text blocks, tables, and images are interleaved by vertical position
        so the Markdown follows the visual order of the page.
        """
        # Analyze layout to get ordered text blocks.
        blocks = self.layout_analyzer.analyze(page, page_number)

        # Extract tables and their bounding boxes.
        tables_with_bbox = self.table_processor.extract_tables(page)
        table_bboxes = [bbox for _, bbox in tables_with_bbox]
        # NOTE(review): keying by y-center silently drops a table when two
        # tables on one page share the exact same center — confirm acceptable.
        table_lines_by_y: dict[float, list[str]] = {}

        for table_data, bbox in tables_with_bbox:
            formatted = self.table_processor.format_table(table_data)
            if formatted:
                # Key by y-center of the table for interleaving with text.
                y_center = (bbox[1] + bbox[3]) / 2
                table_lines_by_y[y_center] = formatted

        # Extract images (if not ignored).
        images_with_bbox: list[tuple[str, tuple[float, float, float, float]]] = []
        if not self.ignore_images and self.image_extractor:
            images_with_bbox = self.image_extractor.extract_images(page, page_number)

        if not blocks and not tables_with_bbox and not images_with_bbox:
            return ["<!-- empty page -->"]

        # Infer heading levels from font sizes across the page's blocks.
        heading_levels = LayoutAnalyzer.infer_heading_levels(blocks)

        # Collect all elements with their y-positions for interleaving.
        # NOTE(review): block.y1 comes from PDF coordinates (larger = higher)
        # while the table/image keys come from pdfplumber's top-based bbox
        # values (smaller = higher); the shared descending sort below assumes
        # a single convention — verify against TextBlock/TableProcessor
        # coordinate conventions.
        elements: list[tuple[float, str, list[str]]] = []  # (y, type, content_lines)

        for idx, block in enumerate(blocks):
            # Skip page furniture and anything already rendered as a table.
            # (The original also built an unused `text_blocks` list applying
            # these same filters; that dead code has been removed.)
            if block.is_header or block.is_footer:
                continue
            if any(block.overlaps_bbox(tb, threshold=0.6) for tb in table_bboxes):
                continue

            block_lines = self._block_to_markdown(block, idx, heading_levels)
            if block_lines:
                elements.append((block.y1, "text", block_lines))

        for y_center, tbl_lines in table_lines_by_y.items():
            elements.append((y_center, "table", tbl_lines))

        for img_path, bbox in images_with_bbox:
            elements.append((bbox[3], "image", [f"![]({img_path})", ""]))

        # Sort by y-position (top to bottom = descending y).
        elements.sort(key=lambda e: -e[0])

        lines: list[str] = []
        for _, _, content_lines in elements:
            lines.extend(content_lines)

        return lines if lines else ["<!-- empty page -->"]

    def _block_to_markdown(
        self,
        block: TextBlock,
        block_idx: int,
        heading_levels: dict[int, int],
    ) -> list[str]:
        """Convert a single TextBlock to Markdown lines (text + blank line)."""
        text = block.text.strip()
        if not text:
            return []

        # Heading, if the analyzer assigned this block a level.
        level = heading_levels.get(block_idx, 0)
        if level > 0:
            prefix = "#" * level
            return [f"{prefix} {escape_markdown(text)}", ""]

        # Bold paragraph.
        if block.bold:
            return [f"**{escape_markdown(text)}**", ""]

        # Regular paragraph.
        return [escape_markdown(text), ""]
@@ -0,0 +1,93 @@
1
+ """Table extraction from PDF pages to Markdown format."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Optional
7
+
8
+ from .utils import escape_markdown
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
# Settings passed to pdfplumber's ``page.find_tables`` (see extract_tables
# below); tuned for tables drawn with explicit ruling lines.
TABLE_SETTINGS = {
    # Detect column and row boundaries from drawn lines rather than from
    # text alignment.
    "vertical_strategy": "lines",
    "horizontal_strategy": "lines",
    # Tolerances (in points) for snapping nearly-aligned edges together and
    # joining broken line segments.
    "snap_tolerance": 5,
    "join_tolerance": 5,
    # Ignore stray edge segments shorter than this (points).
    "edge_min_length": 10,
    # Word-count thresholds for text-derived edges — presumably only
    # relevant when a "text" strategy is used; TODO confirm against the
    # pdfplumber table-settings documentation.
    "min_words_vertical": 2,
    "min_words_horizontal": 2,
}
21
+
22
+
23
class TableProcessor:
    """Extract tables from PDF pages and convert them to Markdown."""

    def extract_tables(self, page) -> list[tuple[list[list[str]], tuple[float, float, float, float]]]:
        """Extract tables from a pdfplumber page.

        Args:
            page: A pdfplumber page object.

        Returns:
            A list of ``(table_data, bounding_box)`` tuples, where
            ``table_data`` is a list of rows (each a list of cell strings)
            and ``bounding_box`` is ``(x0, y0, x1, y1)``. Pages where table
            detection fails yield an empty list.
        """
        try:
            found = page.find_tables(table_settings=TABLE_SETTINGS)
        except Exception as exc:
            # Best-effort: a page that trips up table detection should not
            # abort the whole conversion.
            logger.warning("Failed to find tables on page: %s", exc)
            return []

        extracted: list[tuple[list[list[str]], tuple[float, float, float, float]]] = []
        for candidate in found:
            try:
                raw = candidate.extract()
                # Skip tables that produced nothing, or whose every cell is
                # empty/None.
                if not raw or all(not any(row) for row in raw):
                    continue
                bbox = (
                    candidate.bbox[0],
                    candidate.bbox[1],
                    candidate.bbox[2],
                    candidate.bbox[3],
                )
                extracted.append((self._normalize_table(raw), bbox))
            except Exception as exc:
                logger.warning("Failed to extract table: %s", exc)
                continue

        return extracted

    def format_table(self, table_data: list[list[str]]) -> list[str]:
        """Render table rows as Markdown pipe-table lines.

        The first row is treated as the header; subsequent rows are padded
        or truncated to the header's column count.
        """
        if not table_data:
            return []

        header, *body = table_data
        width = len(header)
        if width == 0:
            return []

        def render(cells: list[str]) -> str:
            return "| " + " | ".join(cells) + " |"

        rendered = [render(header), render(["---"] * width)]
        for raw_row in body:
            # Pad with empty cells, then clip, so every row matches the header.
            rendered.append(render((raw_row + [""] * width)[:width]))

        return rendered

    def _normalize_table(self, table_data: list[list[Optional[str]]]) -> list[list[str]]:
        """Normalize cells: None becomes "", whitespace collapses to single
        spaces, and Markdown control characters are escaped."""
        return [
            [
                ""
                if cell is None
                else escape_markdown(" ".join(str(cell).split()))
                for cell in row
            ]
            for row in table_data
        ]
@@ -0,0 +1,63 @@
1
+ """TextBlock data structure for representing spatial text units on a PDF page."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+
7
+
8
@dataclass
class TextBlock:
    """A coherent unit of text extracted from a PDF page with spatial metadata."""

    text: str = ""
    x0: float = 0.0  # left edge
    y0: float = 0.0  # bottom edge (PDF coordinates)
    x1: float = 0.0  # right edge
    y1: float = 0.0  # top edge
    font_name: str = ""
    font_size: float = 0.0
    bold: bool = False
    page_number: int = 0
    is_header: bool = False  # flagged as a repeated page header
    is_footer: bool = False  # flagged as a repeated page footer

    @property
    def width(self) -> float:
        """Horizontal extent of the block."""
        return self.x1 - self.x0

    @property
    def height(self) -> float:
        """Vertical extent of the block."""
        return self.y1 - self.y0

    @property
    def center_x(self) -> float:
        """Horizontal midpoint of the block."""
        return (self.x0 + self.x1) / 2

    @property
    def center_y(self) -> float:
        """Vertical midpoint of the block."""
        return (self.y0 + self.y1) / 2

    def horizontal_overlap_ratio(self, other: "TextBlock") -> float:
        """Return the ratio of horizontal overlap to the narrower block's width."""
        shared = max(0.0, min(self.x1, other.x1) - max(self.x0, other.x0))
        narrower = min(self.width, other.width)
        # Degenerate (zero/negative width) blocks cannot overlap meaningfully.
        return shared / narrower if narrower > 0 else 0.0

    def overlaps_bbox(self, bbox: tuple[float, float, float, float], threshold: float = 0.5) -> bool:
        """Check if this block overlaps significantly with a bounding box.

        The overlap is measured as a fraction of this block's own area and
        compared against ``threshold``.
        """
        bx0, by0, bx1, by1 = bbox
        if self.width <= 0 or self.height <= 0:
            return False
        dx = max(0.0, min(self.x1, bx1) - max(self.x0, bx0))
        dy = max(0.0, min(self.y1, by1) - max(self.y0, by0))
        return (dx * dy) / (self.width * self.height) > threshold

    def is_spanning(self, page_width: float, ratio: float = 0.8) -> bool:
        """Check if this block spans most of the page width."""
        return page_width > 0 and self.width > page_width * ratio
pdf2dotmd/utils.py ADDED
@@ -0,0 +1,23 @@
1
+ """Utility helpers for markdown generation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Iterable
7
+
8
+
9
def escape_markdown(text: str) -> str:
    """Backslash-escape Markdown control characters in plain text content.

    Escapes ``\\ ` * _ { } [ ] < > |`` in a single pass; empty or falsy
    input yields the empty string.
    """
    if not text:
        return ""
    # One regex pass over the full special-character class is equivalent to
    # escaping the backslash first and then each remaining character.
    return re.sub(r"([\\`*_{}\[\]<>|])", r"\\\1", text)
18
+
19
+
20
def clean_markdown_content(lines: Iterable[str]) -> str:
    """Join markdown lines into one document, collapsing runs of blank lines.

    Trailing whitespace is stripped, exactly one final newline is kept, and
    any run of three or more consecutive newlines is reduced to two (i.e. a
    single blank line between paragraphs).
    """
    joined = "\n".join(lines).rstrip() + "\n"
    return re.sub(r"\n{3,}", "\n\n", joined)
@@ -0,0 +1,89 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdf2dotmd
3
+ Version: 0.0.1
4
+ Summary: A Python tool for converting PDF files to Markdown
5
+ Author: hnrobert
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/HNRobert/pdf2dotmd
8
+ Project-URL: Repository, https://github.com/HNRobert/pdf2dotmd
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Text Processing :: Markup
19
+ Classifier: Topic :: Utilities
20
+ Requires-Python: >=3.8
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: pdfplumber>=0.11.0
24
+ Dynamic: license-file
25
+
26
+ # pdf2dotmd
27
+
28
+ A Python CLI tool that converts PDF files to Markdown format with intelligent layout analysis.
29
+
30
+ ## Features
31
+
32
+ - **Layout-aware text extraction** — reconstructs logical reading order from PDF spatial data
33
+ - **Multi-column detection** — handles two-column and multi-column layouts
34
+ - **Table extraction** — converts PDF tables to Markdown pipe tables
35
+ - **Heading inference** — detects headings from font size hierarchy
36
+ - **Header/footer filtering** — automatically removes repeated page headers and footers
37
+ - **Image extraction** — extracts embedded images to an `assets/` directory
38
+ - **Ignore images mode** — `--ignore-images` flag for text-only output
39
+ - **Page range selection** — convert specific pages only
40
+ - **Batch conversion** — process multiple PDF files with wildcards
41
+
42
+ ## Installation
43
+
44
+ ```bash
45
+ pip install pdf2dotmd
46
+ ```
47
+
48
+ ## Usage
49
+
50
+ ```bash
51
+ # Output to stdout
52
+ pdf2dotmd input.pdf
53
+
54
+ # Output to file
55
+ pdf2dotmd input.pdf -o output.md
56
+
57
+ # Skip images, output single Markdown file
58
+ pdf2dotmd input.pdf --ignore-images
59
+
60
+ # Batch conversion
61
+ pdf2dotmd *.pdf -o output_dir/
62
+
63
+ # Convert only specific pages
64
+ pdf2dotmd input.pdf -p 1-3
65
+ pdf2dotmd input.pdf -p 1-5,8,10-12
66
+
67
+ # Verbose logging
68
+ pdf2dotmd input.pdf -v
69
+ ```
70
+
71
+ ## How It Works
72
+
73
+ 1. **Character extraction** — uses [pdfplumber](https://github.com/jsvine/pdfplumber) to extract individual characters with position data
74
+ 2. **Line grouping** — clusters characters into text lines by y-coordinate proximity
75
+ 3. **Block formation** — groups lines into paragraphs based on horizontal alignment and vertical spacing
76
+ 4. **Column detection** — identifies multi-column layouts by analyzing horizontal text density gaps
77
+ 5. **Reading order** — sorts blocks top-to-bottom, left-to-right, handling spanning titles
78
+ 6. **Header/footer removal** — detects repeated elements across pages
79
+ 7. **Heading inference** — maps font sizes to heading levels (H1-H6)
80
+
81
+ ## Limitations
82
+
83
+ - **Scanned PDFs** — OCR is not supported; scanned/image-only PDFs will produce empty output
84
+ - **Encrypted PDFs** — password-protected PDFs are not supported
85
+ - **Complex layouts** — highly irregular layouts may not parse perfectly
86
+
87
+ ## License
88
+
89
+ MIT
@@ -0,0 +1,15 @@
1
+ pdf2dotmd/__init__.py,sha256=50eR1xS6VIHHNF2ZUw2yjrBqWFn80nPuIRtn9bVMRAQ,45
2
+ pdf2dotmd/cli.py,sha256=Ni8Fy73uoak10dsXV5lC_OQ6b1TPz8Y0N41R00L6D_0,3182
3
+ pdf2dotmd/converter.py,sha256=a-fXjn_S5RmZJyAHTIQ8ltuwlSwxKpEDGVzQVrkCVjU,6156
4
+ pdf2dotmd/image_extractor.py,sha256=W1e1HySOEH69oHjTqn763AIRBLVmwjT8Z_o1afdWI6A,3726
5
+ pdf2dotmd/layout_analyzer.py,sha256=5k9QogF6wgMrsyx-MBZVY6RetUpO-PN8FOtT_KD_NmI,17193
6
+ pdf2dotmd/page_processor.py,sha256=M5qzTcSp-_khSLQaD0xuwAtSU33pWkg3SsUAuz2glpE,4266
7
+ pdf2dotmd/table_processor.py,sha256=9CyUp45NE9udxO_TBpWmsRK25QnlYfLzwOvc542xnq8,3038
8
+ pdf2dotmd/text_block.py,sha256=MiwC61MT0aY3EoYs3GFNYscl6DrOEvO6xmV9VCPvCMY,2028
9
+ pdf2dotmd/utils.py,sha256=w5deYI99DlsDa1CxfpcGZu2P2k7i28HzIEpB7XnvVq8,653
10
+ pdf2dotmd-0.0.1.dist-info/licenses/LICENSE,sha256=ZfEADodI9tn9tdS_ab8HR0oCeCMl9wTRgwPljxRTsXs,1066
11
+ pdf2dotmd-0.0.1.dist-info/METADATA,sha256=q8ueKEpOQpVGFXmyRVx9eXVxhXaW8TR-w8C2pn2jDj0,3130
12
+ pdf2dotmd-0.0.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
13
+ pdf2dotmd-0.0.1.dist-info/entry_points.txt,sha256=SQVMAmDp0so-_4paUGfSK6HRGfA4VYxrEEsrEx6TXRA,49
14
+ pdf2dotmd-0.0.1.dist-info/top_level.txt,sha256=fhJsRXlgjF6tT5Wiy50rI5PI66E5uHj_AJ8dR7K-3CI,10
15
+ pdf2dotmd-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ pdf2dotmd = pdf2dotmd.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Robert He
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ pdf2dotmd