pdfmd 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfmd/__init__.py +1 -0
- pdfmd/__main__.py +4 -0
- pdfmd/cli.py +95 -0
- pdfmd-0.1.0.dist-info/METADATA +40 -0
- pdfmd-0.1.0.dist-info/RECORD +7 -0
- pdfmd-0.1.0.dist-info/WHEEL +4 -0
- pdfmd-0.1.0.dist-info/entry_points.txt +2 -0
pdfmd/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version string; matches the "Version: 0.1.0" field in this wheel's METADATA.
__version__ = "0.1.0"
|
pdfmd/__main__.py
ADDED
pdfmd/cli.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""pdf2md - Convert PDF files to Markdown using pymupdf4llm."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import pymupdf4llm
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def parse_pages(spec: str) -> list[int]:
    """Parse a 1-indexed page spec like '1,3,5-8' into a sorted 0-indexed list.

    Args:
        spec: Comma-separated page numbers and inclusive ``start-end`` ranges,
            all 1-indexed. Whitespace around each item is ignored.

    Returns:
        The selected pages as sorted, de-duplicated 0-indexed integers.

    Raises:
        ValueError: If an item is not an integer, a page number is below 1,
            or a range ends before it starts.
    """
    pages: set[int] = set()
    for part in spec.split(","):
        part = part.strip()
        # partition() leaves `sep` empty when the item is a single page number.
        first, sep, last = part.partition("-")
        start = int(first)
        end = int(last) if sep else start
        # Page 0 would map to index -1, and a reversed range would silently
        # select nothing; both are user errors, so reject them explicitly.
        if start < 1 or end < start:
            raise ValueError(f"invalid page range: {part!r}")
        pages.update(range(start - 1, end))
    return sorted(pages)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the command-line interface: convert one PDF file to Markdown.

    Args:
        argv: Argument list to parse instead of the process command line.
            ``None`` (the default) makes argparse read ``sys.argv[1:]``,
            which is what the original zero-argument call did.

    Returns:
        0 on success.

    Raises:
        SystemExit: On any failure (missing input, invalid page spec,
            conversion error, unwritable output or image directory) via
            ``sys.exit`` with an ``error: ...`` message, and on argparse
            usage errors.
    """
    ap = argparse.ArgumentParser(
        description="Convert a PDF to Markdown.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    ap.add_argument("input", type=Path, help="input PDF file")
    ap.add_argument(
        "-o", "--output", type=Path,
        help="output .md file (default: input with .md extension, '-' for stdout)",
    )
    ap.add_argument(
        "-p", "--pages", type=str,
        help="pages to extract, e.g. '1,3,5-8' (1-indexed). default: all",
    )
    ap.add_argument(
        "--images", action="store_true",
        help="extract embedded images alongside the markdown",
    )
    ap.add_argument(
        "--image-dir", type=Path, default=Path("images"),
        help="directory for extracted images",
    )
    ap.add_argument(
        "--image-format", default="png", choices=["png", "jpg"],
        help="image format",
    )
    ap.add_argument(
        "--dpi", type=int, default=150,
        help="DPI for rendered/extracted images",
    )
    ap.add_argument(
        "--table-strategy", default="lines", choices=["lines", "lines_strict"],
        help="table detection strategy",
    )
    ap.add_argument(
        "-q", "--quiet", action="store_true",
        help="suppress progress output",
    )
    args = ap.parse_args(argv)

    if not args.input.is_file():
        sys.exit(f"error: {args.input} not found")

    # Keyword arguments forwarded verbatim to pymupdf4llm.to_markdown().
    kwargs = {
        "table_strategy": args.table_strategy,
        "dpi": args.dpi,
        "show_progress": not args.quiet,
    }
    if args.pages:
        try:
            kwargs["pages"] = parse_pages(args.pages)
        except ValueError:
            sys.exit(f"error: invalid page spec: {args.pages!r}")
    if args.images:
        # Report an uncreatable directory like every other failure instead
        # of letting the OSError surface as a traceback.
        try:
            args.image_dir.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            sys.exit(f"error: cannot create image directory {args.image_dir}: {e}")
        kwargs["write_images"] = True
        kwargs["image_path"] = str(args.image_dir)
        kwargs["image_format"] = args.image_format

    # Top-level boundary: any library failure becomes a one-line error.
    try:
        md = pymupdf4llm.to_markdown(str(args.input), **kwargs)
    except Exception as e:
        sys.exit(f"error: conversion failed: {e}")

    if args.output and str(args.output) == "-":
        sys.stdout.write(md)
    else:
        out = args.output or args.input.with_suffix(".md")
        try:
            out.write_text(md, encoding="utf-8")
        except OSError as e:
            sys.exit(f"error: cannot write {out}: {e}")
        if not args.quiet:
            # Status goes to stderr so it never mixes with redirected output.
            print(f"wrote {out} ({len(md):,} chars)", file=sys.stderr)

    return 0
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdfmd
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convert PDF files to Markdown using pymupdf4llm
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Requires-Dist: pymupdf4llm
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
|
|
9
|
+
# pdf2md
|
|
10
|
+
|
|
11
|
+
Convert PDF files to Markdown using [pymupdf4llm](https://github.com/pymupdf/RAG).
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# With uv
|
|
17
|
+
uv sync
|
|
18
|
+
|
|
19
|
+
# Or with pip
|
|
20
|
+
pip install .
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pdf2md input.pdf # writes input.md
|
|
27
|
+
pdf2md input.pdf -o output.md # explicit output path
|
|
28
|
+
pdf2md input.pdf -o - # stdout
|
|
29
|
+
pdf2md input.pdf -p "1,3,5-8" # specific pages (1-indexed)
|
|
30
|
+
pdf2md input.pdf --images --dpi 200 # extract images
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Run `pdf2md --help` for all options.
|
|
34
|
+
|
|
35
|
+
## Build & Publish
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
uv build # creates dist/*.whl and dist/*.tar.gz
|
|
39
|
+
uv publish # publish to PyPI
|
|
40
|
+
```
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
pdfmd/__init__.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
|
|
2
|
+
pdfmd/__main__.py,sha256=EurL_Wa0CJy1bMx4Ha0tsqLBcnCY-Gr8p7peeKGH408,57
|
|
3
|
+
pdfmd/cli.py,sha256=DnoY46QqpWUM7zejCCiyK4-MjFZJhMdRCRnIaw-f2-s,2913
|
|
4
|
+
pdfmd-0.1.0.dist-info/METADATA,sha256=gnzFYsyuH6oURDtPV5I9ra4r6mHtp1zL4cLWtmU4Tck,852
|
|
5
|
+
pdfmd-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
6
|
+
pdfmd-0.1.0.dist-info/entry_points.txt,sha256=EJ-1aBfge6N41zjl-ks12NE7H8LDd1FZhETWfhb98_Y,41
|
|
7
|
+
pdfmd-0.1.0.dist-info/RECORD,,
|