pdfmd 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfmd-0.1.0/.claude/settings.local.json +7 -0
- pdfmd-0.1.0/.gitignore +7 -0
- pdfmd-0.1.0/.python-version +1 -0
- pdfmd-0.1.0/CLAUDE.md +26 -0
- pdfmd-0.1.0/PKG-INFO +40 -0
- pdfmd-0.1.0/README.md +32 -0
- pdfmd-0.1.0/pdfmd.spec +38 -0
- pdfmd-0.1.0/pyproject.toml +16 -0
- pdfmd-0.1.0/src/pdfmd/__init__.py +1 -0
- pdfmd-0.1.0/src/pdfmd/__main__.py +4 -0
- pdfmd-0.1.0/src/pdfmd/cli.py +95 -0
pdfmd-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
pdfmd-0.1.0/CLAUDE.md
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Project Overview
|
|
6
|
+
|
|
7
|
+
pdf2md is a Python CLI tool that converts PDF files to Markdown using `pymupdf4llm`. Packaged with `hatchling`, managed with `uv`.
|
|
8
|
+
|
|
9
|
+
## Development
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
uv sync # install dependencies
|
|
13
|
+
uv run pdfmd --help # run the CLI
|
|
14
|
+
uv run python -m pdfmd # alternative invocation
|
|
15
|
+
uv build # build distributable
|
|
16
|
+
uv publish # publish to PyPI
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Architecture
|
|
20
|
+
|
|
21
|
+
src layout package (`src/pdfmd/`):
|
|
22
|
+
- `cli.py` — CLI entry point using argparse; `parse_pages()` converts 1-indexed page specs to 0-indexed lists, `main()` delegates to `pymupdf4llm.to_markdown()`
|
|
23
|
+
- `__main__.py` — enables `python -m pdfmd`
|
|
24
|
+
- `__init__.py` — version string
|
|
25
|
+
|
|
26
|
+
Console script entry point: `pdfmd = "pdfmd.cli:main"` (defined in `pyproject.toml`).
|
pdfmd-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdfmd
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convert PDF files to Markdown using pymupdf4llm
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Requires-Dist: pymupdf4llm
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
|
|
9
|
+
# pdf2md
|
|
10
|
+
|
|
11
|
+
Convert PDF files to Markdown using [pymupdf4llm](https://github.com/pymupdf/RAG).
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# With uv
|
|
17
|
+
uv sync
|
|
18
|
+
|
|
19
|
+
# Or with pip
|
|
20
|
+
pip install .
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pdf2md input.pdf # writes input.md
|
|
27
|
+
pdf2md input.pdf -o output.md # explicit output path
|
|
28
|
+
pdf2md input.pdf -o - # stdout
|
|
29
|
+
pdf2md input.pdf -p "1,3,5-8" # specific pages (1-indexed)
|
|
30
|
+
pdf2md input.pdf --images --dpi 200 # extract images
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Run `pdf2md --help` for all options.
|
|
34
|
+
|
|
35
|
+
## Build & Publish
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
uv build # creates dist/*.whl and dist/*.tar.gz
|
|
39
|
+
uv publish # publish to PyPI
|
|
40
|
+
```
|
pdfmd-0.1.0/README.md
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# pdf2md
|
|
2
|
+
|
|
3
|
+
Convert PDF files to Markdown using [pymupdf4llm](https://github.com/pymupdf/RAG).
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# With uv
|
|
9
|
+
uv sync
|
|
10
|
+
|
|
11
|
+
# Or with pip
|
|
12
|
+
pip install .
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Usage
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pdfmd input.pdf # writes input.md
|
|
19
|
+
pdfmd input.pdf -o output.md # explicit output path
|
|
20
|
+
pdfmd input.pdf -o - # stdout
|
|
21
|
+
pdfmd input.pdf -p "1,3,5-8" # specific pages (1-indexed)
|
|
22
|
+
pdfmd input.pdf --images --dpi 200 # extract images
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Run `pdfmd --help` for all options.
|
|
26
|
+
|
|
27
|
+
## Build & Publish
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
uv build # creates dist/*.whl and dist/*.tar.gz
|
|
31
|
+
uv publish # publish to PyPI
|
|
32
|
+
```
|
pdfmd-0.1.0/pdfmd.spec
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# -*- mode: python ; coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# Build recipe for a console executable named "pdfmd".
# Analysis, PYZ and EXE are not imported in this file; presumably PyInstaller
# injects them when it executes the spec -- confirm against the PyInstaller docs.

a = Analysis(
    # Forward slashes keep the entry-script path valid on every platform;
    # the previous 'src\\pdfmd\\cli.py' only resolved on Windows.
    ['src/pdfmd/cli.py'],
    pathex=[],
    binaries=[],
    datas=[],
    hiddenimports=[],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    noarchive=False,
    optimize=0,
)
pyz = PYZ(a.pure)

# Binaries and data files are handed straight to EXE; this spec has no
# separate COLLECT step.
exe = EXE(
    pyz,
    a.scripts,
    a.binaries,
    a.datas,
    [],
    name='pdfmd',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    upx_exclude=[],
    runtime_tmpdir=None,
    console=True,  # command-line tool: keep the console window/streams
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "pdfmd"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Convert PDF files to Markdown using pymupdf4llm"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"pymupdf4llm",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[project.scripts]
|
|
16
|
+
pdfmd = "pdfmd.cli:main"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version string; pyproject.toml declares the same "0.1.0" as the project version.
__version__ = "0.1.0"
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""pdf2md - Convert PDF files to Markdown using pymupdf4llm."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import pymupdf4llm
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def parse_pages(spec: str) -> list[int]:
    """Parse a 1-indexed page spec like '1,3,5-8' into a sorted 0-indexed list.

    Args:
        spec: Comma-separated page numbers and inclusive ``start-end`` ranges.
            Whitespace around each comma-separated part is ignored.

    Returns:
        The selected pages as unique, ascending 0-based indices.

    Raises:
        ValueError: If a part is not an integer or a ``start-end`` pair, if a
            page number is below 1, or if a range runs backwards.
    """
    pages: set[int] = set()
    for part in spec.split(","):
        part = part.strip()
        # Split on the first dash only, so "5-8-9" fails in int() below.
        first, dash, last = part.partition("-")
        start = int(first)
        end = int(last) if dash else start
        # Page 0 would become index -1, and a reversed range would silently
        # select nothing; reject both instead of passing them along.
        if start < 1 or end < start:
            raise ValueError(f"invalid page range: {part!r}")
        pages.update(range(start - 1, end))
    return sorted(pages)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def main() -> int:
|
|
24
|
+
ap = argparse.ArgumentParser(
|
|
25
|
+
description="Convert a PDF to Markdown.",
|
|
26
|
+
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
|
27
|
+
)
|
|
28
|
+
ap.add_argument("input", type=Path, help="input PDF file")
|
|
29
|
+
ap.add_argument(
|
|
30
|
+
"-o", "--output", type=Path,
|
|
31
|
+
help="output .md file (default: input with .md extension, '-' for stdout)",
|
|
32
|
+
)
|
|
33
|
+
ap.add_argument(
|
|
34
|
+
"-p", "--pages", type=str,
|
|
35
|
+
help="pages to extract, e.g. '1,3,5-8' (1-indexed). default: all",
|
|
36
|
+
)
|
|
37
|
+
ap.add_argument(
|
|
38
|
+
"--images", action="store_true",
|
|
39
|
+
help="extract embedded images alongside the markdown",
|
|
40
|
+
)
|
|
41
|
+
ap.add_argument(
|
|
42
|
+
"--image-dir", type=Path, default=Path("images"),
|
|
43
|
+
help="directory for extracted images",
|
|
44
|
+
)
|
|
45
|
+
ap.add_argument(
|
|
46
|
+
"--image-format", default="png", choices=["png", "jpg"],
|
|
47
|
+
help="image format",
|
|
48
|
+
)
|
|
49
|
+
ap.add_argument(
|
|
50
|
+
"--dpi", type=int, default=150,
|
|
51
|
+
help="DPI for rendered/extracted images",
|
|
52
|
+
)
|
|
53
|
+
ap.add_argument(
|
|
54
|
+
"--table-strategy", default="lines", choices=["lines", "lines_strict"],
|
|
55
|
+
help="table detection strategy",
|
|
56
|
+
)
|
|
57
|
+
ap.add_argument(
|
|
58
|
+
"-q", "--quiet", action="store_true",
|
|
59
|
+
help="suppress progress output",
|
|
60
|
+
)
|
|
61
|
+
args = ap.parse_args()
|
|
62
|
+
|
|
63
|
+
if not args.input.is_file():
|
|
64
|
+
sys.exit(f"error: {args.input} not found")
|
|
65
|
+
|
|
66
|
+
kwargs = {
|
|
67
|
+
"table_strategy": args.table_strategy,
|
|
68
|
+
"dpi": args.dpi,
|
|
69
|
+
"show_progress": not args.quiet,
|
|
70
|
+
}
|
|
71
|
+
if args.pages:
|
|
72
|
+
try:
|
|
73
|
+
kwargs["pages"] = parse_pages(args.pages)
|
|
74
|
+
except ValueError:
|
|
75
|
+
sys.exit(f"error: invalid page spec: {args.pages!r}")
|
|
76
|
+
if args.images:
|
|
77
|
+
args.image_dir.mkdir(parents=True, exist_ok=True)
|
|
78
|
+
kwargs["write_images"] = True
|
|
79
|
+
kwargs["image_path"] = str(args.image_dir)
|
|
80
|
+
kwargs["image_format"] = args.image_format
|
|
81
|
+
|
|
82
|
+
try:
|
|
83
|
+
md = pymupdf4llm.to_markdown(str(args.input), **kwargs)
|
|
84
|
+
except Exception as e:
|
|
85
|
+
sys.exit(f"error: conversion failed: {e}")
|
|
86
|
+
|
|
87
|
+
if args.output and str(args.output) == "-":
|
|
88
|
+
sys.stdout.write(md)
|
|
89
|
+
else:
|
|
90
|
+
out = args.output or args.input.with_suffix(".md")
|
|
91
|
+
out.write_text(md, encoding="utf-8")
|
|
92
|
+
if not args.quiet:
|
|
93
|
+
print(f"wrote {out} ({len(md):,} chars)", file=sys.stderr)
|
|
94
|
+
|
|
95
|
+
return 0
|