pdfmd 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pdfmd/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
pdfmd/__main__.py ADDED
@@ -0,0 +1,4 @@
1
"""Entry point for ``python -m pdfmd``: run the CLI and exit with its status."""
import sys

# The installed package is ``pdfmd`` (the same module the console script
# targets); importing from ``pdf2md`` raises ModuleNotFoundError.
from pdfmd.cli import main

sys.exit(main())
pdfmd/cli.py ADDED
@@ -0,0 +1,95 @@
1
+ """pdf2md - Convert PDF files to Markdown using pymupdf4llm."""
2
+
3
+ import argparse
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ import pymupdf4llm
8
+
9
+
10
def parse_pages(spec: str) -> list[int]:
    """Parse a 1-indexed page spec like '1,3,5-8' into a sorted 0-indexed list.

    The spec is a comma-separated list of single page numbers and inclusive
    ``start-end`` ranges. Whitespace around items is ignored and duplicate
    pages are collapsed.

    Args:
        spec: The page specification, e.g. ``"1,3,5-8"``.

    Returns:
        The selected pages as sorted, unique, 0-based indices.

    Raises:
        ValueError: If an item is empty or not an integer, if a page number
            is below 1, or if a range ends before it starts.
    """
    pages: set[int] = set()
    for part in spec.split(","):
        part = part.strip()
        if "-" in part:
            start_text, end_text = part.split("-", 1)
            start, end = int(start_text), int(end_text)
        else:
            start = end = int(part)
        # Page 0 would become index -1, and a reversed range would silently
        # select nothing; reject both instead of passing surprising indices on.
        if start < 1 or end < start:
            raise ValueError(f"invalid page range: {part!r}")
        pages.update(range(start - 1, end))
    return sorted(pages)
21
+
22
+
23
+ def main() -> int:
24
+ ap = argparse.ArgumentParser(
25
+ description="Convert a PDF to Markdown.",
26
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
27
+ )
28
+ ap.add_argument("input", type=Path, help="input PDF file")
29
+ ap.add_argument(
30
+ "-o", "--output", type=Path,
31
+ help="output .md file (default: input with .md extension, '-' for stdout)",
32
+ )
33
+ ap.add_argument(
34
+ "-p", "--pages", type=str,
35
+ help="pages to extract, e.g. '1,3,5-8' (1-indexed). default: all",
36
+ )
37
+ ap.add_argument(
38
+ "--images", action="store_true",
39
+ help="extract embedded images alongside the markdown",
40
+ )
41
+ ap.add_argument(
42
+ "--image-dir", type=Path, default=Path("images"),
43
+ help="directory for extracted images",
44
+ )
45
+ ap.add_argument(
46
+ "--image-format", default="png", choices=["png", "jpg"],
47
+ help="image format",
48
+ )
49
+ ap.add_argument(
50
+ "--dpi", type=int, default=150,
51
+ help="DPI for rendered/extracted images",
52
+ )
53
+ ap.add_argument(
54
+ "--table-strategy", default="lines", choices=["lines", "lines_strict"],
55
+ help="table detection strategy",
56
+ )
57
+ ap.add_argument(
58
+ "-q", "--quiet", action="store_true",
59
+ help="suppress progress output",
60
+ )
61
+ args = ap.parse_args()
62
+
63
+ if not args.input.is_file():
64
+ sys.exit(f"error: {args.input} not found")
65
+
66
+ kwargs = {
67
+ "table_strategy": args.table_strategy,
68
+ "dpi": args.dpi,
69
+ "show_progress": not args.quiet,
70
+ }
71
+ if args.pages:
72
+ try:
73
+ kwargs["pages"] = parse_pages(args.pages)
74
+ except ValueError:
75
+ sys.exit(f"error: invalid page spec: {args.pages!r}")
76
+ if args.images:
77
+ args.image_dir.mkdir(parents=True, exist_ok=True)
78
+ kwargs["write_images"] = True
79
+ kwargs["image_path"] = str(args.image_dir)
80
+ kwargs["image_format"] = args.image_format
81
+
82
+ try:
83
+ md = pymupdf4llm.to_markdown(str(args.input), **kwargs)
84
+ except Exception as e:
85
+ sys.exit(f"error: conversion failed: {e}")
86
+
87
+ if args.output and str(args.output) == "-":
88
+ sys.stdout.write(md)
89
+ else:
90
+ out = args.output or args.input.with_suffix(".md")
91
+ out.write_text(md, encoding="utf-8")
92
+ if not args.quiet:
93
+ print(f"wrote {out} ({len(md):,} chars)", file=sys.stderr)
94
+
95
+ return 0
@@ -0,0 +1,40 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdfmd
3
+ Version: 0.1.0
4
+ Summary: Convert PDF files to Markdown using pymupdf4llm
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: pymupdf4llm
7
+ Description-Content-Type: text/markdown
8
+
9
+ # pdf2md
10
+
11
+ Convert PDF files to Markdown using [pymupdf4llm](https://github.com/pymupdf/RAG).
12
+
13
+ ## Installation
14
+
15
+ ```bash
16
+ # With uv
17
+ uv sync
18
+
19
+ # Or with pip
20
+ pip install .
21
+ ```
22
+
23
+ ## Usage
24
+
25
+ ```bash
26
+ pdfmd input.pdf # writes input.md
27
+ pdfmd input.pdf -o output.md # explicit output path
28
+ pdfmd input.pdf -o - # stdout
29
+ pdfmd input.pdf -p "1,3,5-8" # specific pages (1-indexed)
30
+ pdfmd input.pdf --images --dpi 200 # extract images
31
+ ```
32
+
33
+ Run `pdfmd --help` for all options.
34
+
35
+ ## Build & Publish
36
+
37
+ ```bash
38
+ uv build # creates dist/*.whl and dist/*.tar.gz
39
+ uv publish # publish to PyPI
40
+ ```
@@ -0,0 +1,7 @@
1
+ pdfmd/__init__.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
2
+ pdfmd/__main__.py,sha256=EurL_Wa0CJy1bMx4Ha0tsqLBcnCY-Gr8p7peeKGH408,57
3
+ pdfmd/cli.py,sha256=DnoY46QqpWUM7zejCCiyK4-MjFZJhMdRCRnIaw-f2-s,2913
4
+ pdfmd-0.1.0.dist-info/METADATA,sha256=gnzFYsyuH6oURDtPV5I9ra4r6mHtp1zL4cLWtmU4Tck,852
5
+ pdfmd-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
6
+ pdfmd-0.1.0.dist-info/entry_points.txt,sha256=EJ-1aBfge6N41zjl-ks12NE7H8LDd1FZhETWfhb98_Y,41
7
+ pdfmd-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ pdfmd = pdfmd.cli:main