pdfmux 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
+ {
2
+ "version": "0.0.1",
3
+ "configurations": [
4
+ {
5
+ "name": "site",
6
+ "runtimeExecutable": "npx",
7
+ "runtimeArgs": ["serve", "site/", "-l", "3456"],
8
+ "port": 3456
9
+ }
10
+ ]
11
+ }
@@ -0,0 +1,48 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ lint:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ - uses: actions/setup-python@v5
15
+ with:
16
+ python-version: "3.11"
17
+ - run: pip install ruff
18
+ - run: ruff check src/ tests/
19
+ - run: ruff format --check src/ tests/
20
+
21
+ test:
22
+ runs-on: ubuntu-latest
23
+ needs: lint
24
+ strategy:
25
+ matrix:
26
+ python-version: ["3.11", "3.12", "3.13"]
27
+ steps:
28
+ - uses: actions/checkout@v4
29
+ - uses: actions/setup-python@v5
30
+ with:
31
+ python-version: ${{ matrix.python-version }}
32
+ - run: pip install -e ".[dev]"
33
+ - run: pytest -v
34
+
35
+ build:
36
+ runs-on: ubuntu-latest
37
+ needs: test
38
+ steps:
39
+ - uses: actions/checkout@v4
40
+ - uses: actions/setup-python@v5
41
+ with:
42
+ python-version: "3.11"
43
+ - run: pip install build
44
+ - run: python -m build
45
+ - uses: actions/upload-artifact@v4
46
+ with:
47
+ name: dist
48
+ path: dist/
@@ -0,0 +1,21 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ permissions:
8
+ id-token: write
9
+
10
+ jobs:
11
+ publish:
12
+ runs-on: ubuntu-latest
13
+ environment: pypi
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ - uses: actions/setup-python@v5
17
+ with:
18
+ python-version: "3.12"
19
+ - run: pip install build
20
+ - run: python -m build
21
+ - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,29 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .eggs/
8
+ *.egg
9
+
10
+ .env
11
+ .env.*
12
+ .env.local
13
+ .venv/
14
+ venv/
15
+
16
+ *.pem
17
+ *.key
18
+ credentials
19
+ credentials.*
20
+ secrets
21
+ secrets.*
22
+
23
+ .pytest_cache/
24
+ .ruff_cache/
25
+ .mypy_cache/
26
+ .wrangler/
27
+
28
+ *.pdf
29
+ !tests/fixtures/*.pdf
@@ -0,0 +1,18 @@
1
+ FROM python:3.11-slim AS builder
2
+
3
+ WORKDIR /app
4
+ COPY pyproject.toml README.md ./
5
+ COPY src/ ./src/
6
+
7
+ RUN pip install --no-cache-dir build && \
8
+ python -m build --wheel
9
+
10
+ FROM python:3.11-slim
11
+
12
+ WORKDIR /app
13
+ COPY --from=builder /app/dist/*.whl /tmp/
14
+ RUN pip install --no-cache-dir /tmp/*.whl && \
15
+ rm /tmp/*.whl
16
+
17
+ ENTRYPOINT ["pdfmux"]
18
+ CMD ["--help"]
pdfmux-0.2.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Nameet Potnis
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
pdfmux-0.2.0/PKG-INFO ADDED
@@ -0,0 +1,385 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdfmux
3
+ Version: 0.2.0
4
+ Summary: The smart PDF-to-Markdown router. One command, zero config, best extractor per document.
5
+ Project-URL: Homepage, https://pdfmux.com
6
+ Project-URL: Repository, https://github.com/NameetP/pdfmux
7
+ Project-URL: Issues, https://github.com/NameetP/pdfmux/issues
8
+ Author: Nameet Potnis
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: ai,converter,extraction,llm,markdown,mcp,pdf
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
21
+ Requires-Python: >=3.11
22
+ Requires-Dist: mcp>=1.0.0
23
+ Requires-Dist: pymupdf4llm>=0.0.10
24
+ Requires-Dist: pymupdf>=1.24.0
25
+ Requires-Dist: rich>=13.0.0
26
+ Requires-Dist: typer>=0.9.0
27
+ Provides-Extra: all
28
+ Requires-Dist: docling>=2.0.0; extra == 'all'
29
+ Requires-Dist: google-genai>=1.0.0; extra == 'all'
30
+ Requires-Dist: surya-ocr>=0.6.0; extra == 'all'
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
33
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
34
+ Requires-Dist: ruff>=0.4.0; extra == 'dev'
35
+ Provides-Extra: llm
36
+ Requires-Dist: google-genai>=1.0.0; extra == 'llm'
37
+ Provides-Extra: ocr
38
+ Requires-Dist: surya-ocr>=0.6.0; extra == 'ocr'
39
+ Provides-Extra: tables
40
+ Requires-Dist: docling>=2.0.0; extra == 'tables'
41
+ Description-Content-Type: text/markdown
42
+
43
+ # pdfmux
44
+
45
+ The smart PDF-to-Markdown router. One command, zero config.
46
+
47
+ ```
48
+ PDF ──→ pdfmux ──→ Markdown
49
+
50
+ ├─ digital? → PyMuPDF (0.01s/pg, free)
51
+ ├─ tables? → Docling (0.3-3s/pg, free)
52
+ ├─ scanned? → Surya OCR (1-5s/pg, free)
53
+ └─ complex? → Gemini Flash (2-5s/pg, ~$0.01/doc)
54
+ ```
55
+
56
+ We don't convert PDFs. We route them to whichever tool converts them best.
57
+
58
+ 90% of PDFs are digital — converted in milliseconds, for free.
59
+
60
+ ## Quick Start
61
+
62
+ ```bash
63
+ pip install pdfmux
64
+
65
+ pdfmux invoice.pdf
66
+ # ✓ invoice.pdf → invoice.md (2 pages, 95% confidence, via pymupdf4llm)
67
+ ```
68
+
69
+ That's it. No config, no flags, no API keys needed.
70
+
71
+ ## Install
72
+
73
+ ```bash
74
+ # core (handles digital PDFs — the vast majority)
75
+ pip install pdfmux
76
+
77
+ # add table extraction (Docling — 97.9% table accuracy)
78
+ pip install pdfmux[tables]
79
+
80
+ # add scanned PDF support (Surya OCR)
81
+ pip install pdfmux[ocr]
82
+
83
+ # add LLM fallback for hardest cases (Gemini Flash)
84
+ pip install pdfmux[llm]
85
+
86
+ # everything
87
+ pip install pdfmux[all]
88
+ ```
89
+
90
+ Requires Python 3.11+.
91
+
92
+ ## Usage
93
+
94
+ ### Convert a single file
95
+
96
+ ```bash
97
+ pdfmux invoice.pdf
98
+ # ✓ invoice.pdf → invoice.md (2 pages, 95% confidence, via pymupdf4llm)
99
+ ```
100
+
101
+ By default, output is written to the same directory as the input file, with a `.md` extension.
102
+
103
+ ### Specify output location
104
+
105
+ ```bash
106
+ pdfmux report.pdf -o ./converted/report.md
107
+ ```
108
+
109
+ ### Batch convert a directory
110
+
111
+ ```bash
112
+ pdfmux ./docs/ -o ./output/
113
+ # Converting 12 PDFs from ./docs/...
114
+ # ✓ invoice.pdf → invoice.md (95%)
115
+ # ✓ contract.pdf → contract.md (92%)
116
+ # ✓ scan.pdf → scan.md (87%)
117
+ # Done: 12 converted, 0 failed
118
+ ```
119
+
120
+ ### Output formats
121
+
122
+ ```bash
123
+ # markdown (default)
124
+ pdfmux report.pdf
125
+
126
+ # json — structured output with metadata
127
+ pdfmux report.pdf -f json
128
+
129
+ # csv — extracts tables only
130
+ pdfmux data.pdf -f csv
131
+ ```
132
+
133
+ ### Quality presets
134
+
135
+ ```bash
136
+ # fast — PyMuPDF only, no ML (instant, free)
137
+ pdfmux report.pdf -q fast
138
+
139
+ # standard — auto-detect and route (default)
140
+ pdfmux report.pdf -q standard
141
+
142
+ # high — use LLM for everything (slow, costs ~$0.01/doc)
143
+ pdfmux report.pdf -q high
144
+ ```
145
+
146
+ ### Other options
147
+
148
+ ```bash
149
+ # show confidence score in output
150
+ pdfmux report.pdf --confidence
151
+
152
+ # print to stdout instead of file
153
+ pdfmux report.pdf --stdout
154
+ ```
155
+
156
+ ### All CLI options
157
+
158
+ | Option | Short | Default | Description |
159
+ |--------|-------|---------|-------------|
160
+ | `--output` | `-o` | Same dir, `.md` ext | Output file or directory |
161
+ | `--format` | `-f` | `markdown` | Output format: `markdown`, `json`, `csv` |
162
+ | `--quality` | `-q` | `standard` | Quality: `fast`, `standard`, `high` |
163
+ | `--confidence` | | `false` | Include confidence score in output |
164
+ | `--stdout` | | `false` | Print to stdout instead of writing file |
165
+
166
+ ## How It Works
167
+
168
+ ### Detection
169
+
170
+ pdfmux opens each PDF with PyMuPDF and classifies it by inspecting every page:
171
+
172
+ ```
173
+ For each page:
174
+ ├─ Has >50 chars of extractable text? → digital
175
+ ├─ Has embedded images but no text? → scanned
176
+ └─ Empty or minimal content? → digital (empty page)
177
+
178
+ Classification:
179
+ ├─ ≥80% digital pages → digital PDF
180
+ ├─ ≥80% scanned pages → scanned PDF
181
+ └─ Otherwise → mixed PDF
182
+
183
+ Table detection:
184
+ ├─ Check for ruled line patterns (≥3 horizontal + ≥2 vertical lines)
185
+ └─ Check for tab-separated or multi-space aligned text patterns
186
+ ```
187
+
188
+ ### Routing
189
+
190
+ Based on classification, pdfmux picks the best extractor:
191
+
192
+ ```
193
+ classify(pdf)
194
+
195
+ ├─ quality=fast? ────────────────→ PyMuPDF (always)
196
+ ├─ quality=high? ────────────────→ Gemini Flash → Surya → PyMuPDF
197
+
198
+ └─ quality=standard (default):
199
+ ├─ digital, no tables ──────→ PyMuPDF
200
+ ├─ has tables ──────────────→ Docling → PyMuPDF fallback
201
+ ├─ scanned ─────────────────→ Surya OCR → PyMuPDF fallback
202
+ ├─ mixed ───────────────────→ PyMuPDF (digital pgs) + Surya (scanned pgs)
203
+ └─ default ─────────────────→ PyMuPDF
204
+ ```
205
+
206
+ If an optional extractor isn't installed, pdfmux silently falls back to the next best option. No errors, no config.
207
+
208
+ ### Post-processing
209
+
210
+ After extraction, every result goes through:
211
+
212
+ 1. **Cleanup** — remove control characters, fix broken hyphenation, normalize blank lines
213
+ 2. **Confidence scoring** — text completeness, encoding quality, structure preservation, whitespace sanity
214
+ 3. **Formatting** — heading normalization, list marker standardization, optional YAML frontmatter
215
+
216
+ ### Extractors
217
+
218
+ | Tier | Extractor | What it handles | Speed | Cost | Install |
219
+ |------|-----------|----------------|-------|------|---------|
220
+ | Fast | PyMuPDF / pymupdf4llm | Digital PDFs with clean text | 0.01s/page | Free | Base |
221
+ | Tables | Docling | Table-heavy documents | 0.3-3s/page | Free | `pdfmux[tables]` |
222
+ | OCR | Surya | Scanned / image-based PDFs | 1-5s/page | Free | `pdfmux[ocr]` |
223
+ | LLM | Gemini 2.5 Flash | Complex layouts, handwriting, edge cases | 2-5s/page | ~$0.01/doc | `pdfmux[llm]` |
224
+
225
+ ## Output Formats
226
+
227
+ ### Markdown (default)
228
+
229
+ Clean markdown optimized for LLM consumption:
230
+
231
+ ```markdown
232
+ # Quarterly Report
233
+
234
+ Revenue for Q3 increased by 15% year-over-year...
235
+
236
+ ## Financial Summary
237
+
238
+ | Metric | Q3 2025 | Q3 2024 |
239
+ |--------|---------|---------|
240
+ | Revenue | $12.3M | $10.7M |
241
+ | Profit | $3.1M | $2.4M |
242
+ ```
243
+
244
+ ### JSON
245
+
246
+ Structured output with metadata, useful for pipelines:
247
+
248
+ ```json
249
+ {
250
+ "source": "report.pdf",
251
+ "converter": "pdfmux",
252
+ "extractor": "pymupdf4llm (fast)",
253
+ "page_count": 5,
254
+ "confidence": 0.95,
255
+ "warnings": [],
256
+ "content": "# Quarterly Report\n\nRevenue for Q3...",
257
+ "pages": [
258
+ { "page": 1, "content": "# Quarterly Report..." },
259
+ { "page": 2, "content": "## Financial Summary..." }
260
+ ]
261
+ }
262
+ ```
263
+
264
+ ### CSV
265
+
266
+ Extracts tables from the document into CSV format:
267
+
268
+ ```csv
269
+ Metric,Q3 2025,Q3 2024
270
+ Revenue,$12.3M,$10.7M
271
+ Profit,$3.1M,$2.4M
272
+ ```
273
+
274
+ Raises an error if no tables are found in the document.
275
+
276
+ ## MCP Server
277
+
278
+ pdfmux includes a built-in MCP (Model Context Protocol) server so AI agents can read PDFs natively.
279
+
280
+ ```bash
281
+ pdfmux serve
282
+ ```
283
+
284
+ ### Claude Desktop / Cursor
285
+
286
+ Add to your config:
287
+
288
+ ```json
289
+ {
290
+ "mcpServers": {
291
+ "pdfmux": {
292
+ "command": "pdfmux",
293
+ "args": ["serve"]
294
+ }
295
+ }
296
+ }
297
+ ```
298
+
299
+ ### Claude Code
300
+
301
+ ```bash
302
+ claude mcp add pdfmux -- pdfmux serve
303
+ ```
304
+
305
+ ### Tool
306
+
307
+ The server exposes a single `convert_pdf` tool over stdio:
308
+
309
+ ```json
310
+ {
311
+ "name": "convert_pdf",
312
+ "description": "Convert a PDF to Markdown/JSON/CSV",
313
+ "parameters": {
314
+ "file_path": "string — path to the PDF file",
315
+ "format": "string — markdown | json | csv (default: markdown)",
316
+ "quality": "string — fast | standard | high (default: standard)"
317
+ }
318
+ }
319
+ ```
320
+
321
+ Your agent calls it and gets the extracted text back. No setup required.
322
+
323
+ ## Environment Variables
324
+
325
+ | Variable | Required | Description |
326
+ |----------|----------|-------------|
327
+ | `GEMINI_API_KEY` | Only for `pdfmux[llm]` | Google Gemini API key for LLM extraction |
328
+ | `GOOGLE_API_KEY` | Alternative to `GEMINI_API_KEY` | Alternative env var for Gemini API key |
329
+
330
+ No environment variables are needed for the base install or the `tables`/`ocr` extras.
331
+
332
+ ## Why Not Just Use X?
333
+
334
+ | Tool | Good at | Limitation |
335
+ |------|---------|-----------|
336
+ | Marker | GPU ML extraction | Overkill for simple digital PDFs, needs GPU |
337
+ | Docling | Tables (97.9% accuracy) | Slow on non-table documents |
338
+ | pymupdf4llm | Fast digital text | Can't handle scanned or complex layouts |
339
+ | MinerU | Full ML pipeline | Heavy, complex setup |
340
+ | MarkItDown | Microsoft tool, wide format support | Not optimized for any specific PDF type |
341
+ | **pdfmux** | Picking the right tool automatically | — |
342
+
343
+ pdfmux uses these tools. It doesn't compete with them — it orchestrates them.
344
+
345
+ The key insight: no single extractor wins on everything. PyMuPDF is 100x faster on digital PDFs. Docling is better at tables. Surya handles scans. Gemini catches what everything else misses. pdfmux routes each document to the right one.
346
+
347
+ ## Project Structure
348
+
349
+ ```
350
+ src/pdfmux/
351
+ ├── cli.py # Typer CLI (convert, serve, version)
352
+ ├── pipeline.py # Tiered routing logic
353
+ ├── detect.py # PDF type detection
354
+ ├── postprocess.py # Cleanup + confidence scoring
355
+ ├── mcp_server.py # MCP server (stdio JSON-RPC)
356
+ ├── extractors/
357
+ │ ├── fast.py # PyMuPDF — handles 90% of PDFs
358
+ │ ├── tables.py # Docling — table-heavy docs
359
+ │ ├── ocr.py # Surya — scanned PDFs
360
+ │ └── llm.py # Gemini Flash — hardest cases
361
+ └── formatters/
362
+ ├── markdown.py # Markdown output
363
+ ├── json_fmt.py # JSON output
364
+ └── csv_fmt.py # CSV output (tables only)
365
+ ```
366
+
367
+ ## Development
368
+
369
+ ```bash
370
+ git clone https://github.com/NameetP/pdfmux.git
371
+ cd pdfmux
372
+ python3.12 -m venv .venv && source .venv/bin/activate
373
+ pip install -e ".[dev]"
374
+
375
+ # run tests
376
+ pytest
377
+
378
+ # lint
379
+ ruff check src/ tests/
380
+ ruff format src/ tests/
381
+ ```
382
+
383
+ ## License
384
+
385
+ [MIT](LICENSE)