pdfmark-ocr 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfmark_ocr-0.1.0/LICENSE +21 -0
- pdfmark_ocr-0.1.0/PKG-INFO +103 -0
- pdfmark_ocr-0.1.0/README.md +71 -0
- pdfmark_ocr-0.1.0/pyproject.toml +48 -0
- pdfmark_ocr-0.1.0/setup.cfg +4 -0
- pdfmark_ocr-0.1.0/src/pdf2md_ocr/__init__.py +6 -0
- pdfmark_ocr-0.1.0/src/pdf2md_ocr/__main__.py +6 -0
- pdfmark_ocr-0.1.0/src/pdf2md_ocr/converter.py +245 -0
- pdfmark_ocr-0.1.0/src/pdfmark_ocr.egg-info/PKG-INFO +103 -0
- pdfmark_ocr-0.1.0/src/pdfmark_ocr.egg-info/SOURCES.txt +12 -0
- pdfmark_ocr-0.1.0/src/pdfmark_ocr.egg-info/dependency_links.txt +1 -0
- pdfmark_ocr-0.1.0/src/pdfmark_ocr.egg-info/entry_points.txt +2 -0
- pdfmark_ocr-0.1.0/src/pdfmark_ocr.egg-info/requires.txt +7 -0
- pdfmark_ocr-0.1.0/src/pdfmark_ocr.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ather
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdfmark-ocr
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Fast, fully-local PDF to Markdown converter with image OCR. No API calls, runs 100% offline.
|
|
5
|
+
Author: Ather
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/yourname/pdfmark-ocr
|
|
8
|
+
Project-URL: Issues, https://github.com/yourname/pdfmark-ocr/issues
|
|
9
|
+
Keywords: pdf,markdown,ocr,convert,rapidocr,offline,local
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
21
|
+
Classifier: Topic :: Utilities
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: PyMuPDF>=1.23
|
|
26
|
+
Requires-Dist: pillow>=9.0
|
|
27
|
+
Requires-Dist: rapidocr_onnxruntime>=1.3
|
|
28
|
+
Provides-Extra: easyocr
|
|
29
|
+
Requires-Dist: easyocr>=1.7; extra == "easyocr"
|
|
30
|
+
Requires-Dist: torch>=2.0; extra == "easyocr"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# pdfmark-ocr
|
|
34
|
+
|
|
35
|
+
**Fast, fully-local PDF → Markdown — with OCR for images.** No API calls, no
|
|
36
|
+
cloud, nothing leaves your machine.
|
|
37
|
+
|
|
38
|
+
Microsoft's `markitdown` only reads native text; if a PDF page is a scan or has
|
|
39
|
+
text inside images, you lose it. `pdfmark-ocr` reads native text directly (instant)
|
|
40
|
+
**and** runs OCR on embedded images, in parallel across your CPU cores.
|
|
41
|
+
|
|
42
|
+
## Install
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install pdfmark-ocr
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
That's it — works on **Windows, macOS, and Linux** (Python 3.9+). The default OCR
|
|
49
|
+
engine is [RapidOCR](https://github.com/RapidAI/RapidOCR) (ONNX, fast, small).
|
|
50
|
+
|
|
51
|
+
## Usage
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pdf2md document.pdf # -> full_stitched_output.md
|
|
55
|
+
pdf2md document.pdf -o notes.md # choose output file
|
|
56
|
+
pdf2md document.pdf --workers 4 # control parallelism
|
|
57
|
+
pdf2md document.pdf --min-image 0 # OCR every image, even tiny ones
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
You can also run it as a module:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
python -m pdf2md_ocr document.pdf
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Options
|
|
67
|
+
|
|
68
|
+
| Flag | Default | Meaning |
|
|
69
|
+
|------|---------|---------|
|
|
70
|
+
| `-o, --output` | `full_stitched_output.md` | Output markdown path |
|
|
71
|
+
| `--engine` | `rapidocr` | `rapidocr` (default) or `easyocr` |
|
|
72
|
+
| `--workers` | auto | Parallel page workers |
|
|
73
|
+
| `--max-dim` | `1500` | Downscale images larger than this (px) |
|
|
74
|
+
| `--min-image` | `16` | Skip images smaller than this (px); `0` keeps all |
|
|
75
|
+
|
|
76
|
+
## Optional: EasyOCR engine
|
|
77
|
+
|
|
78
|
+
EasyOCR is heavier (pulls in PyTorch, hundreds of MB) but you may prefer its
|
|
79
|
+
accuracy on some documents:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
pip install "pdfmark-ocr[easyocr]"
|
|
83
|
+
pdf2md document.pdf --engine easyocr
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## How it works
|
|
87
|
+
|
|
88
|
+
1. **Native text** is extracted directly from the PDF — instant, no OCR.
|
|
89
|
+
2. **Embedded images** are downscaled and sent to the chosen OCR engine.
|
|
90
|
+
3. Pages are processed **in parallel**, one OCR engine per worker process.
|
|
91
|
+
4. A per-worker cache means repeated logos/headers are OCR'd only once.
|
|
92
|
+
|
|
93
|
+
## Use from Python
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from pdf2md_ocr import stitch_full_pdf
|
|
97
|
+
|
|
98
|
+
stitch_full_pdf("document.pdf", "out.md", engine="rapidocr")
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## License
|
|
102
|
+
|
|
103
|
+
MIT
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# pdfmark-ocr
|
|
2
|
+
|
|
3
|
+
**Fast, fully-local PDF → Markdown — with OCR for images.** No API calls, no
|
|
4
|
+
cloud, nothing leaves your machine.
|
|
5
|
+
|
|
6
|
+
Microsoft's `markitdown` only reads native text; if a PDF page is a scan or has
|
|
7
|
+
text inside images, you lose it. `pdfmark-ocr` reads native text directly (instant)
|
|
8
|
+
**and** runs OCR on embedded images, in parallel across your CPU cores.
|
|
9
|
+
|
|
10
|
+
## Install
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install pdfmark-ocr
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
That's it — works on **Windows, macOS, and Linux** (Python 3.9+). The default OCR
|
|
17
|
+
engine is [RapidOCR](https://github.com/RapidAI/RapidOCR) (ONNX, fast, small).
|
|
18
|
+
|
|
19
|
+
## Usage
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pdf2md document.pdf # -> full_stitched_output.md
|
|
23
|
+
pdf2md document.pdf -o notes.md # choose output file
|
|
24
|
+
pdf2md document.pdf --workers 4 # control parallelism
|
|
25
|
+
pdf2md document.pdf --min-image 0 # OCR every image, even tiny ones
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
You can also run it as a module:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
python -m pdf2md_ocr document.pdf
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
### Options
|
|
35
|
+
|
|
36
|
+
| Flag | Default | Meaning |
|
|
37
|
+
|------|---------|---------|
|
|
38
|
+
| `-o, --output` | `full_stitched_output.md` | Output markdown path |
|
|
39
|
+
| `--engine` | `rapidocr` | `rapidocr` (default) or `easyocr` |
|
|
40
|
+
| `--workers` | auto | Parallel page workers |
|
|
41
|
+
| `--max-dim` | `1500` | Downscale images larger than this (px) |
|
|
42
|
+
| `--min-image` | `16` | Skip images smaller than this (px); `0` keeps all |
|
|
43
|
+
|
|
44
|
+
## Optional: EasyOCR engine
|
|
45
|
+
|
|
46
|
+
EasyOCR is heavier (pulls in PyTorch, hundreds of MB) but you may prefer its
|
|
47
|
+
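accuracy on some documents. EasyOCR downloads its model weights the first time it runs, so that first run needs network access: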
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install "pdfmark-ocr[easyocr]"
|
|
51
|
+
pdf2md document.pdf --engine easyocr
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## How it works
|
|
55
|
+
|
|
56
|
+
1. **Native text** is extracted directly from the PDF — instant, no OCR.
|
|
57
|
+
2. **Embedded images** are downscaled and sent to the chosen OCR engine.
|
|
58
|
+
3. Pages are processed **in parallel**, one OCR engine per worker process.
|
|
59
|
+
4. A per-worker cache means repeated logos/headers are OCR'd only once.
|
|
60
|
+
|
|
61
|
+
## Use from Python
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from pdf2md_ocr import stitch_full_pdf
|
|
65
|
+
|
|
66
|
+
stitch_full_pdf("document.pdf", "out.md", engine="rapidocr")
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## License
|
|
70
|
+
|
|
71
|
+
MIT
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "pdfmark-ocr"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Fast, fully-local PDF to Markdown converter with image OCR. No API calls, runs 100% offline."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Ather" }]
|
|
13
|
+
keywords = ["pdf", "markdown", "ocr", "convert", "rapidocr", "offline", "local"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Environment :: Console",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Operating System :: OS Independent",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.9",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
"Topic :: Text Processing :: Markup :: Markdown",
|
|
26
|
+
"Topic :: Utilities",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
# Default install is light: native PDF text + the fast ONNX OCR engine.
|
|
30
|
+
dependencies = [
|
|
31
|
+
"PyMuPDF>=1.23", # PDF parsing: native text + embedded image extraction
|
|
32
|
+
"pillow>=9.0", # image downscaling
|
|
33
|
+
"rapidocr_onnxruntime>=1.3", # fast CPU OCR engine (default), uses onnxruntime
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.optional-dependencies]
|
|
37
|
+
# Heavier alternative engine: pip install "pdfmark-ocr[easyocr]"
|
|
38
|
+
easyocr = ["easyocr>=1.7", "torch>=2.0"]
|
|
39
|
+
|
|
40
|
+
[project.urls]
|
|
41
|
+
Homepage = "https://github.com/yourname/pdfmark-ocr"
|
|
42
|
+
Issues = "https://github.com/yourname/pdfmark-ocr/issues"
|
|
43
|
+
|
|
44
|
+
[project.scripts]
|
|
45
|
+
pdf2md = "pdf2md_ocr.converter:main"
|
|
46
|
+
|
|
47
|
+
[tool.setuptools.packages.find]
|
|
48
|
+
where = ["src"]
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
"""
|
|
2
|
+
pdf2md_ocr.converter -- Fast, fully-local PDF -> Markdown
|
|
3
|
+
|
|
4
|
+
Native text is read directly from the PDF (instant). Only embedded IMAGES are
|
|
5
|
+
sent to OCR. Speed comes from:
|
|
6
|
+
|
|
7
|
+
1. Engine choice at runtime: --engine rapidocr (ONNX, fastest on CPU)
|
|
8
|
+
--engine easyocr (optional, pip install pdfmark-ocr[easyocr])
|
|
9
|
+
2. Parallel page processing across CPU cores (one OCR engine per worker).
|
|
10
|
+
3. A per-worker image cache so repeated logos/headers are OCR'd only once.
|
|
11
|
+
4. Image downscaling + thread tuning + EasyOCR canvas/quantization.
|
|
12
|
+
|
|
13
|
+
Everything runs locally. No API calls, no data leaves the machine.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import os
|
|
17
|
+
import io
|
|
18
|
+
import ssl
|
|
19
|
+
import time
|
|
20
|
+
import hashlib
|
|
21
|
+
import argparse
|
|
22
|
+
from multiprocessing import Pool
|
|
23
|
+
|
|
24
|
+
# --- PROXY / SSL OVERRIDE (opt-in) ---
# Certificate verification stays ON by default. Switching it off at import
# time would weaken every urllib-based HTTPS request made by the importing
# process, so it now needs an explicit opt-in, intended for users behind a
# TLS-intercepting proxy:
#     PDF2MD_INSECURE_SSL=1 pdf2md document.pdf
# As before, nothing is touched when PYTHONHTTPSVERIFY is already set.
_INSECURE_SSL_OPT_IN = os.environ.get("PDF2MD_INSECURE_SSL", "").strip().lower()
if (
    _INSECURE_SSL_OPT_IN in ("1", "true", "yes", "on")
    and not os.environ.get("PYTHONHTTPSVERIFY", "")
    and getattr(ssl, "_create_unverified_context", None)
):
    # NOTE(security): replaces the default HTTPS context factory with the
    # unverified one for the whole process. Only reachable via the opt-in.
    ssl._create_default_https_context = ssl._create_unverified_context
# -------------------------------------
|
|
31
|
+
|
|
32
|
+
# ============================================================================
|
|
33
|
+
# OCR ENGINE ABSTRACTION
|
|
34
|
+
# Both engines take PNG bytes and return a list[str] of text lines, so the rest
|
|
35
|
+
# of the pipeline doesn't care which one is used.
|
|
36
|
+
# ============================================================================
|
|
37
|
+
|
|
38
|
+
def build_engine(name, threads):
    """Instantiate the OCR backend selected by *name*.

    Building a backend is heavy, so it is meant to happen once per process.

    Args:
        name: ``"rapidocr"`` or ``"easyocr"``.
        threads: thread budget passed on to the backend.

    Returns:
        A ``(name, engine)`` tuple; the tag tells ``run_ocr`` which calling
        convention to use.

    Raises:
        SystemExit: the backend's package cannot be imported.
        ValueError: *name* is not one of the known backends.
    """
    if name == "rapidocr":
        return ("rapidocr", _make_rapidocr(threads))
    if name == "easyocr":
        return ("easyocr", _make_easyocr(threads))
    raise ValueError(f"unknown engine: {name}")


def _make_rapidocr(threads):
    """Import RapidOCR and construct it, with thread settings when accepted."""
    try:
        from rapidocr_onnxruntime import RapidOCR
    except ImportError as exc:
        raise SystemExit(
            "rapidocr is not installed. Run: pip install pdfmark-ocr"
        ) from exc

    thread_opts = {"intra_op_num_threads": threads, "inter_op_num_threads": 1}
    try:
        return RapidOCR(**thread_opts)
    except TypeError:
        # Older rapidocr releases reject these kwargs; use their defaults.
        return RapidOCR()


def _make_easyocr(threads):
    """Import torch + EasyOCR and construct an English, CPU-only reader."""
    try:
        import torch
        import easyocr
    except ImportError as exc:
        raise SystemExit(
            'easyocr is not installed. Run: pip install "pdfmark-ocr[easyocr]"'
        ) from exc

    torch.set_num_threads(max(1, threads))
    # quantize=True asks for int8 CPU inference; verbose=False keeps it quiet.
    return easyocr.Reader(
        ["en"],
        gpu=False,
        quantize=True,
        verbose=False,
        download_enabled=True,
    )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def run_ocr(engine, img_bytes, max_dim):
    """Recognise text in an encoded image and return the detected lines.

    Args:
        engine: ``(name, engine_object)`` pair as returned by ``build_engine``.
        img_bytes: encoded image bytes, handed to the backend unchanged.
        max_dim: only used on the EasyOCR path, as its ``canvas_size``.

    Returns:
        A list of text lines; empty when RapidOCR reports nothing.
    """
    backend, ocr = engine

    if backend != "rapidocr":
        # EasyOCR: cap the canvas at our downscale size so it does not
        # re-upscale the image.
        return ocr.readtext(
            img_bytes, detail=0, canvas_size=max_dim, mag_ratio=1.0
        )

    detections, _unused = ocr(img_bytes)
    if not detections:
        return []
    # The recognised text sits at index 1 of every detection entry.
    return [entry[1] for entry in detections]
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# ============================================================================
|
|
81
|
+
# PER-PAGE WORKER
|
|
82
|
+
# Each worker process opens the PDF once and keeps its own engine + image cache.
|
|
83
|
+
# ============================================================================
|
|
84
|
+
|
|
85
|
+
_W = {}  # per-process state: engine, doc, config, cache


def _init_worker(pdf_path, engine_name, threads, max_dim, min_image):
    """Fill this process's ``_W`` state: OCR engine, open PDF, settings, cache.

    Used as the pool initializer and also called directly on the sequential
    path.

    Args:
        pdf_path: PDF to open with ``fitz``.
        engine_name: backend name forwarded to ``build_engine``.
        threads: per-process thread budget.
        max_dim: stored for the image downscale step.
        min_image: stored for the small-image skip check.
    """
    # Cap the BLAS/OpenMP pools BEFORE the engine is built, so that
    # N workers x threads does not oversubscribe the CPU.
    thread_cap = str(max(1, threads))
    os.environ.update(
        OMP_NUM_THREADS=thread_cap,
        OPENBLAS_NUM_THREADS=thread_cap,
        MKL_NUM_THREADS=thread_cap,
    )

    import fitz

    _W["engine"] = build_engine(engine_name, threads)
    _W["doc"] = fitz.open(pdf_path)
    # Plain settings, plus an empty cache: md5(img_bytes) -> ocr text.
    _W.update(max_dim=max_dim, min_image=min_image, cache={})
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# Pillow modes the PNG writer accepts; any other mode (e.g. CMYK) is
# converted to RGB before re-encoding.
_PNG_WRITABLE_MODES = frozenset(
    {"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"}
)


def _ocr_image_block(block):
    """OCR one image block and return it as a markdown blockquote.

    Images are downscaled to at most ``_W["max_dim"]`` per side before OCR,
    and results are cached per process by the md5 of the original bytes.

    Args:
        block: image block dict; reads ``"image"`` (encoded bytes) and the
            optional ``"width"`` / ``"height"`` keys.

    Returns:
        ``None`` when either side is smaller than ``_W["min_image"]``;
        otherwise a markdown string holding the extracted text, a "no
        readable text" notice, or an error notice if processing failed.
        Failures are not cached.
    """
    min_side = _W["min_image"]
    if block.get("width", 0) < min_side or block.get("height", 0) < min_side:
        return None  # skip tiny logos/icons

    img_bytes = block["image"]
    # The digest is only a dedup key; flagging it as non-security keeps
    # md5 usable on FIPS-restricted Python builds.
    key = hashlib.md5(img_bytes, usedforsecurity=False).digest()
    cache = _W["cache"]

    if key in cache:  # same image seen before
        ocr_text = cache[key]
    else:
        from PIL import Image

        max_dim = _W["max_dim"]
        try:
            img = Image.open(io.BytesIO(img_bytes))
            if img.width > max_dim or img.height > max_dim:
                # Pillow 9.0 (the declared minimum) has no Image.Resampling
                # enum yet; fall back to the module-level constant there.
                lanczos = getattr(Image, "Resampling", Image).LANCZOS
                if img.mode not in _PNG_WRITABLE_MODES:
                    # The PNG writer rejects modes such as CMYK.
                    img = img.convert("RGB")
                img.thumbnail((max_dim, max_dim), lanczos)
                buf = io.BytesIO()
                img.save(buf, format="PNG")
                img_bytes = buf.getvalue()
            lines = run_ocr(_W["engine"], img_bytes, max_dim)
            ocr_text = "\n".join(lines).strip()
        except Exception as e:
            # Best effort: one bad image must not abort the whole page.
            return f"> **[Error processing image]** ({e})"
        cache[key] = ocr_text

    if ocr_text:
        body = ocr_text.replace("\n", "\n> ")
        return f"> **[Extracted from Image]**\n> {body}"
    return "> **[Image contained no readable text]**"
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _process_page(page_num):
    """Render one page of the worker's open PDF as markdown.

    Args:
        page_num: zero-based index into ``_W["doc"]``.

    Returns:
        ``(page_num, markdown, n_imgs)`` where ``n_imgs`` counts the image
        blocks that produced output (skipped images are not counted).
    """
    page = _W["doc"][page_num]
    # Order blocks by the top edge of their bounding box (top-to-bottom).
    layout = sorted(page.get_text("dict")["blocks"], key=lambda blk: blk["bbox"][1])

    chunks = [f"\n## Page {page_num + 1}\n"]
    ocr_count = 0

    for blk in layout:
        kind = blk["type"]

        if kind == 0:  # native text
            # Each span is followed by a space, each line by a newline.
            rendered_lines = (
                "".join(span["text"] + " " for span in ln["spans"]) + "\n"
                for ln in blk["lines"]
            )
            chunks.append("".join(rendered_lines).strip())
            continue

        if kind != 1:  # neither text nor image: ignored
            continue

        rendered = _ocr_image_block(blk)
        if rendered is None:
            continue
        ocr_count += 1
        chunks.append(rendered)

    chunks.append("\n---\n")
    return page_num, "\n\n".join(chunks), ocr_count
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# ============================================================================
|
|
162
|
+
# DRIVER
|
|
163
|
+
# ============================================================================
|
|
164
|
+
|
|
165
|
+
def stitch_full_pdf(pdf_path, output_name="full_stitched_output.md",
                    engine="rapidocr", workers=None, max_dim=1500, min_image=16):
    """Convert a PDF into one markdown file (native text + OCR of images).

    Args:
        pdf_path: input PDF path.
        output_name: markdown file to write (UTF-8).
        engine: ``"rapidocr"`` or ``"easyocr"``.
        workers: number of page workers; ``None`` picks
            ``min(cpu_count, pages)``. Always clamped to ``1..pages``.
        max_dim: images larger than this (px) are downscaled before OCR.
        min_image: images smaller than this (px) are skipped.

    Returns:
        ``None``. A missing input file is reported on stdout and nothing is
        written.

    Raises:
        ValueError: *engine* is not a known backend.
        SystemExit: the selected backend's package is not installed.
    """
    if not os.path.exists(pdf_path):
        print(f"Error: Cannot find '{pdf_path}'")
        return

    # Validate the engine in the parent process. If the pool initializer
    # raised instead, multiprocessing.Pool would keep replacing the dying
    # workers and this call would never return.
    _ensure_engine_importable(engine)

    import fitz
    total_start = time.perf_counter()

    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)
    finally:
        doc.close()

    cpu = os.cpu_count() or 4
    if workers is None:
        workers = min(cpu, total_pages)
    workers = max(1, min(workers, total_pages))
    threads = max(1, cpu // workers)          # split cores across workers

    print(f"--- ENGINE: {engine} | pages: {total_pages} | "
          f"workers: {workers} x {threads} threads ---")

    header = f"# Full Extraction: {os.path.basename(pdf_path)}\n"
    results = []

    if workers == 1:
        # Sequential: avoid process-spawn overhead for small PDFs.
        try:
            _init_worker(pdf_path, engine, threads, max_dim, min_image)
            print("[init] engine ready")
            for pn in range(total_pages):
                ps = time.perf_counter()
                results.append(_process_page(pn))
                print(f"  page {pn + 1}/{total_pages} done "
                      f"[{time.perf_counter() - ps:.2f}s, {results[-1][2]} imgs]")
        finally:
            # This path runs in the caller's process, so release the PDF
            # handle and the per-run state instead of leaving them in _W.
            opened = _W.pop("doc", None)
            if opened is not None:
                opened.close()
            _W.clear()
    else:
        with Pool(workers, initializer=_init_worker,
                  initargs=(pdf_path, engine, threads, max_dim, min_image)) as pool:
            for pn, md, n_imgs in pool.imap_unordered(_process_page,
                                                      range(total_pages)):
                results.append((pn, md, n_imgs))
                print(f"  page {pn + 1}/{total_pages} done [{n_imgs} imgs] "
                      f"({len(results)}/{total_pages})")

    results.sort(key=lambda r: r[0])          # restore page order
    final_md = header + "\n\n".join(r[1] for r in results)

    with open(output_name, "w", encoding="utf-8") as f:
        f.write(final_md)

    total_imgs = sum(r[2] for r in results)
    elapsed = time.perf_counter() - total_start
    print(f"\nSaved -> {output_name}")
    print(f"Pages: {total_pages} | Images OCR'd: {total_imgs} | "
          f"TOTAL: {elapsed:.2f}s ({elapsed / max(1, total_pages):.2f}s/page)")


def _ensure_engine_importable(engine):
    """Raise in the calling process if *engine* is unknown or not installed.

    Only looks the modules up; it does not import them.
    """
    import sys
    from importlib.util import find_spec

    requirements = {
        "rapidocr": (
            ("rapidocr_onnxruntime",),
            "rapidocr is not installed. Run: pip install pdfmark-ocr",
        ),
        "easyocr": (
            ("torch", "easyocr"),
            'easyocr is not installed. Run: pip install "pdfmark-ocr[easyocr]"',
        ),
    }
    entry = requirements.get(engine) if isinstance(engine, str) else None
    if entry is None:
        raise ValueError(f"unknown engine: {engine}")

    modules, message = entry
    for module in modules:
        if sys.modules.get(module) is not None:
            continue  # already imported, so it is available
        if find_spec(module) is None:
            raise SystemExit(message)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def main():
    """Command-line entry point: parse ``pdf2md`` arguments and convert."""
    parser = argparse.ArgumentParser(
        prog="pdf2md",
        description="Fast local PDF -> Markdown (native text + image OCR). Runs fully offline.",
    )
    parser.add_argument("pdf", help="input PDF file")

    # (flags, add_argument keyword settings), registered in this order.
    option_specs = (
        (("-o", "--output"), {
            "default": "full_stitched_output.md",
            "help": "output markdown file (default: full_stitched_output.md)",
        }),
        (("--engine",), {
            "choices": ["rapidocr", "easyocr"],
            "default": "rapidocr",
            "help": "OCR engine (default: rapidocr; easyocr needs the [easyocr] extra)",
        }),
        (("--workers",), {
            "type": int,
            "default": None,
            "help": "parallel page workers (default: auto)",
        }),
        (("--max-dim",), {
            "type": int,
            "default": 1500,
            "help": "downscale images larger than this (px)",
        }),
        (("--min-image",), {
            "type": int,
            "default": 16,
            "help": "skip images smaller than this (px); use 0 to keep all",
        }),
    )
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)

    opts = parser.parse_args()
    stitch_full_pdf(
        opts.pdf,
        opts.output,
        engine=opts.engine,
        workers=opts.workers,
        max_dim=opts.max_dim,
        min_image=opts.min_image,
    )


if __name__ == "__main__":      # required for multiprocessing on Windows
    main()
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdfmark-ocr
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Fast, fully-local PDF to Markdown converter with image OCR. No API calls, runs 100% offline.
|
|
5
|
+
Author: Ather
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/yourname/pdfmark-ocr
|
|
8
|
+
Project-URL: Issues, https://github.com/yourname/pdfmark-ocr/issues
|
|
9
|
+
Keywords: pdf,markdown,ocr,convert,rapidocr,offline,local
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
21
|
+
Classifier: Topic :: Utilities
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: PyMuPDF>=1.23
|
|
26
|
+
Requires-Dist: pillow>=9.0
|
|
27
|
+
Requires-Dist: rapidocr_onnxruntime>=1.3
|
|
28
|
+
Provides-Extra: easyocr
|
|
29
|
+
Requires-Dist: easyocr>=1.7; extra == "easyocr"
|
|
30
|
+
Requires-Dist: torch>=2.0; extra == "easyocr"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# pdfmark-ocr
|
|
34
|
+
|
|
35
|
+
**Fast, fully-local PDF → Markdown — with OCR for images.** No API calls, no
|
|
36
|
+
cloud, nothing leaves your machine.
|
|
37
|
+
|
|
38
|
+
Microsoft's `markitdown` only reads native text; if a PDF page is a scan or has
|
|
39
|
+
text inside images, you lose it. `pdfmark-ocr` reads native text directly (instant)
|
|
40
|
+
**and** runs OCR on embedded images, in parallel across your CPU cores.
|
|
41
|
+
|
|
42
|
+
## Install
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install pdfmark-ocr
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
That's it — works on **Windows, macOS, and Linux** (Python 3.9+). The default OCR
|
|
49
|
+
engine is [RapidOCR](https://github.com/RapidAI/RapidOCR) (ONNX, fast, small).
|
|
50
|
+
|
|
51
|
+
## Usage
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pdf2md document.pdf # -> full_stitched_output.md
|
|
55
|
+
pdf2md document.pdf -o notes.md # choose output file
|
|
56
|
+
pdf2md document.pdf --workers 4 # control parallelism
|
|
57
|
+
pdf2md document.pdf --min-image 0 # OCR every image, even tiny ones
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
You can also run it as a module:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
python -m pdf2md_ocr document.pdf
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Options
|
|
67
|
+
|
|
68
|
+
| Flag | Default | Meaning |
|
|
69
|
+
|------|---------|---------|
|
|
70
|
+
| `-o, --output` | `full_stitched_output.md` | Output markdown path |
|
|
71
|
+
| `--engine` | `rapidocr` | `rapidocr` (default) or `easyocr` |
|
|
72
|
+
| `--workers` | auto | Parallel page workers |
|
|
73
|
+
| `--max-dim` | `1500` | Downscale images larger than this (px) |
|
|
74
|
+
| `--min-image` | `16` | Skip images smaller than this (px); `0` keeps all |
|
|
75
|
+
|
|
76
|
+
## Optional: EasyOCR engine
|
|
77
|
+
|
|
78
|
+
EasyOCR is heavier (pulls in PyTorch, hundreds of MB) but you may prefer its
|
|
79
|
+
accuracy on some documents:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
pip install "pdfmark-ocr[easyocr]"
|
|
83
|
+
pdf2md document.pdf --engine easyocr
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## How it works
|
|
87
|
+
|
|
88
|
+
1. **Native text** is extracted directly from the PDF — instant, no OCR.
|
|
89
|
+
2. **Embedded images** are downscaled and sent to the chosen OCR engine.
|
|
90
|
+
3. Pages are processed **in parallel**, one OCR engine per worker process.
|
|
91
|
+
4. A per-worker cache means repeated logos/headers are OCR'd only once.
|
|
92
|
+
|
|
93
|
+
## Use from Python
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from pdf2md_ocr import stitch_full_pdf
|
|
97
|
+
|
|
98
|
+
stitch_full_pdf("document.pdf", "out.md", engine="rapidocr")
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## License
|
|
102
|
+
|
|
103
|
+
MIT
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/pdf2md_ocr/__init__.py
|
|
5
|
+
src/pdf2md_ocr/__main__.py
|
|
6
|
+
src/pdf2md_ocr/converter.py
|
|
7
|
+
src/pdfmark_ocr.egg-info/PKG-INFO
|
|
8
|
+
src/pdfmark_ocr.egg-info/SOURCES.txt
|
|
9
|
+
src/pdfmark_ocr.egg-info/dependency_links.txt
|
|
10
|
+
src/pdfmark_ocr.egg-info/entry_points.txt
|
|
11
|
+
src/pdfmark_ocr.egg-info/requires.txt
|
|
12
|
+
src/pdfmark_ocr.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pdf2md_ocr
|