pdfmd 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(uv sync:*)"
5
+ ]
6
+ }
7
+ }
pdfmd-0.1.0/.gitignore ADDED
@@ -0,0 +1,7 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .venv/
7
+ uv.lock
@@ -0,0 +1 @@
1
+ 3.12
pdfmd-0.1.0/CLAUDE.md ADDED
@@ -0,0 +1,26 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ pdfmd is a Python CLI tool that converts PDF files to Markdown using `pymupdf4llm`. Packaged with `hatchling`, managed with `uv`.
8
+
9
+ ## Development
10
+
11
+ ```bash
12
+ uv sync # install dependencies
13
+ uv run pdfmd --help # run the CLI
14
+ uv run python -m pdfmd # alternative invocation
15
+ uv build # build distributable
16
+ uv publish # publish to PyPI
17
+ ```
18
+
19
+ ## Architecture
20
+
21
+ src layout package (`src/pdfmd/`):
22
+ - `cli.py` — CLI entry point using argparse; `parse_pages()` converts 1-indexed page specs to 0-indexed lists, `main()` delegates to `pymupdf4llm.to_markdown()`
23
+ - `__main__.py` — enables `python -m pdfmd`
24
+ - `__init__.py` — version string
25
+
26
+ Console script entry point: `pdfmd = "pdfmd.cli:main"` (defined in `pyproject.toml`).
pdfmd-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,40 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdfmd
3
+ Version: 0.1.0
4
+ Summary: Convert PDF files to Markdown using pymupdf4llm
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: pymupdf4llm
7
+ Description-Content-Type: text/markdown
8
+
9
+ # pdfmd
10
+
11
+ Convert PDF files to Markdown using [pymupdf4llm](https://github.com/pymupdf/RAG).
12
+
13
+ ## Installation
14
+
15
+ ```bash
16
+ # With uv
17
+ uv sync
18
+
19
+ # Or with pip
20
+ pip install .
21
+ ```
22
+
23
+ ## Usage
24
+
25
+ ```bash
26
+ pdfmd input.pdf # writes input.md
27
+ pdfmd input.pdf -o output.md # explicit output path
28
+ pdfmd input.pdf -o - # stdout
29
+ pdfmd input.pdf -p "1,3,5-8" # specific pages (1-indexed)
30
+ pdfmd input.pdf --images --dpi 200 # extract images
31
+ ```
32
+
33
+ Run `pdfmd --help` for all options.
34
+
35
+ ## Build & Publish
36
+
37
+ ```bash
38
+ uv build # creates dist/*.whl and dist/*.tar.gz
39
+ uv publish # publish to PyPI
40
+ ```
pdfmd-0.1.0/README.md ADDED
@@ -0,0 +1,32 @@
1
+ # pdfmd
2
+
3
+ Convert PDF files to Markdown using [pymupdf4llm](https://github.com/pymupdf/RAG).
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ # With uv
9
+ uv sync
10
+
11
+ # Or with pip
12
+ pip install .
13
+ ```
14
+
15
+ ## Usage
16
+
17
+ ```bash
18
+ pdfmd input.pdf # writes input.md
19
+ pdfmd input.pdf -o output.md # explicit output path
20
+ pdfmd input.pdf -o - # stdout
21
+ pdfmd input.pdf -p "1,3,5-8" # specific pages (1-indexed)
22
+ pdfmd input.pdf --images --dpi 200 # extract images
23
+ ```
24
+
25
+ Run `pdfmd --help` for all options.
26
+
27
+ ## Build & Publish
28
+
29
+ ```bash
30
+ uv build # creates dist/*.whl and dist/*.tar.gz
31
+ uv publish # publish to PyPI
32
+ ```
pdfmd-0.1.0/pdfmd.spec ADDED
@@ -0,0 +1,38 @@
1
# -*- mode: python ; coding: utf-8 -*-
#
# PyInstaller build spec for the `pdfmd` console executable.
# `Analysis`, `PYZ` and `EXE` are not imported here: PyInstaller supplies them
# in the namespace it uses to execute a spec file.
#
# NOTE(review): the entry script below is executed as `__main__` by the built
# executable, so it must call `main()` under an `if __name__ == "__main__":`
# guard — confirm that cli.py does so.


a = Analysis(
    # Forward slashes so the spec also resolves on Linux/macOS; a
    # backslash-separated path is only valid on Windows.
    ['src/pdfmd/cli.py'],
    pathex=[],
    binaries=[],
    datas=[],
    hiddenimports=[],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    noarchive=False,
    optimize=0,
)
pyz = PYZ(a.pure)

exe = EXE(
    pyz,
    a.scripts,
    a.binaries,
    a.datas,
    [],
    name='pdfmd',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    upx_exclude=[],
    runtime_tmpdir=None,
    # Command-line tool: keep the console attached.
    console=True,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
)
@@ -0,0 +1,16 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "pdfmd"
7
+ version = "0.1.0"
8
+ description = "Convert PDF files to Markdown using pymupdf4llm"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ dependencies = [
12
+ "pymupdf4llm",
13
+ ]
14
+
15
+ [project.scripts]
16
+ pdfmd = "pdfmd.cli:main"
@@ -0,0 +1 @@
1
+ # Package version string; keep in sync with `version` in pyproject.toml.
+ __version__ = "0.1.0"
@@ -0,0 +1,4 @@
1
"""Allow the package to be executed with ``python -m pdfmd``."""

import sys

# The import package is ``pdfmd`` (see ``[project.scripts]`` in
# pyproject.toml); importing ``pdf2md`` raises ModuleNotFoundError.
from pdfmd.cli import main

sys.exit(main())
@@ -0,0 +1,95 @@
1
+ """pdfmd - Convert PDF files to Markdown using pymupdf4llm."""
2
+
3
+ import argparse
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ import pymupdf4llm
8
+
9
+
10
def parse_pages(spec: str) -> list[int]:
    """Parse a 1-indexed page spec like '1,3,5-8' into a 0-indexed list.

    Args:
        spec: Comma-separated page numbers and inclusive ``start-end``
            ranges, all 1-indexed. Whitespace around items is ignored.

    Returns:
        The selected pages as a sorted, de-duplicated list of 0-indexed
        page numbers.

    Raises:
        ValueError: If an item is not an integer, a page number is below 1,
            or a range ends before it starts.
    """
    pages: set[int] = set()
    for part in spec.split(","):
        part = part.strip()
        # Split on the first "-" only; anything malformed after it (e.g.
        # "5-8-9") is rejected by int() below.
        start_text, dash, end_text = part.partition("-")
        start = int(start_text)
        end = int(end_text) if dash else start
        # Page 0 would otherwise become index -1 and silently select the
        # wrong page (or fail later with an unrelated error).
        if start < 1:
            raise ValueError(f"page numbers start at 1: {part!r}")
        # A reversed range would otherwise expand to nothing without notice.
        if end < start:
            raise ValueError(f"range end precedes range start: {part!r}")
        pages.update(range(start - 1, end))
    return sorted(pages)
21
+
22
+
23
+ def main() -> int:
24
+ ap = argparse.ArgumentParser(
25
+ description="Convert a PDF to Markdown.",
26
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
27
+ )
28
+ ap.add_argument("input", type=Path, help="input PDF file")
29
+ ap.add_argument(
30
+ "-o", "--output", type=Path,
31
+ help="output .md file (default: input with .md extension, '-' for stdout)",
32
+ )
33
+ ap.add_argument(
34
+ "-p", "--pages", type=str,
35
+ help="pages to extract, e.g. '1,3,5-8' (1-indexed). default: all",
36
+ )
37
+ ap.add_argument(
38
+ "--images", action="store_true",
39
+ help="extract embedded images alongside the markdown",
40
+ )
41
+ ap.add_argument(
42
+ "--image-dir", type=Path, default=Path("images"),
43
+ help="directory for extracted images",
44
+ )
45
+ ap.add_argument(
46
+ "--image-format", default="png", choices=["png", "jpg"],
47
+ help="image format",
48
+ )
49
+ ap.add_argument(
50
+ "--dpi", type=int, default=150,
51
+ help="DPI for rendered/extracted images",
52
+ )
53
+ ap.add_argument(
54
+ "--table-strategy", default="lines", choices=["lines", "lines_strict"],
55
+ help="table detection strategy",
56
+ )
57
+ ap.add_argument(
58
+ "-q", "--quiet", action="store_true",
59
+ help="suppress progress output",
60
+ )
61
+ args = ap.parse_args()
62
+
63
+ if not args.input.is_file():
64
+ sys.exit(f"error: {args.input} not found")
65
+
66
+ kwargs = {
67
+ "table_strategy": args.table_strategy,
68
+ "dpi": args.dpi,
69
+ "show_progress": not args.quiet,
70
+ }
71
+ if args.pages:
72
+ try:
73
+ kwargs["pages"] = parse_pages(args.pages)
74
+ except ValueError:
75
+ sys.exit(f"error: invalid page spec: {args.pages!r}")
76
+ if args.images:
77
+ args.image_dir.mkdir(parents=True, exist_ok=True)
78
+ kwargs["write_images"] = True
79
+ kwargs["image_path"] = str(args.image_dir)
80
+ kwargs["image_format"] = args.image_format
81
+
82
+ try:
83
+ md = pymupdf4llm.to_markdown(str(args.input), **kwargs)
84
+ except Exception as e:
85
+ sys.exit(f"error: conversion failed: {e}")
86
+
87
+ if args.output and str(args.output) == "-":
88
+ sys.stdout.write(md)
89
+ else:
90
+ out = args.output or args.input.with_suffix(".md")
91
+ out.write_text(md, encoding="utf-8")
92
+ if not args.quiet:
93
+ print(f"wrote {out} ({len(md):,} chars)", file=sys.stderr)
94
+
95
+ return 0