pdfpeek 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfpeek-0.1.0/.gitignore +94 -0
- pdfpeek-0.1.0/CHANGELOG.md +33 -0
- pdfpeek-0.1.0/LICENSE +21 -0
- pdfpeek-0.1.0/PKG-INFO +212 -0
- pdfpeek-0.1.0/README.md +167 -0
- pdfpeek-0.1.0/pdf_engine/__init__.py +39 -0
- pdfpeek-0.1.0/pdf_engine/__main__.py +5 -0
- pdfpeek-0.1.0/pdf_engine/api.py +451 -0
- pdfpeek-0.1.0/pdf_engine/cli.py +532 -0
- pdfpeek-0.1.0/pdf_engine/models.py +281 -0
- pdfpeek-0.1.0/pdf_engine/py.typed +0 -0
- pdfpeek-0.1.0/pdf_engine/stage0_triage.py +478 -0
- pdfpeek-0.1.0/pdf_engine/stage1_extraction.py +694 -0
- pdfpeek-0.1.0/pdf_engine/stage2_layout.py +500 -0
- pdfpeek-0.1.0/pdf_engine/stage3_reading_order.py +563 -0
- pdfpeek-0.1.0/pdf_engine/stage4_tables.py +614 -0
- pdfpeek-0.1.0/pdf_engine/stage5_ocr.py +874 -0
- pdfpeek-0.1.0/pdf_engine/stage6_assembly.py +433 -0
- pdfpeek-0.1.0/pdf_engine/stage7_crosspage.py +583 -0
- pdfpeek-0.1.0/pdf_engine/stage8_postprocessing.py +569 -0
- pdfpeek-0.1.0/pdf_engine/stage9_confidence.py +196 -0
- pdfpeek-0.1.0/pdf_engine/surya_adapter.py +264 -0
- pdfpeek-0.1.0/pyproject.toml +89 -0
- pdfpeek-0.1.0/tests/__init__.py +0 -0
- pdfpeek-0.1.0/tests/fixtures/born_digital_academic.pdf +93 -0
- pdfpeek-0.1.0/tests/fixtures/born_digital_academic.pdf#Uf03aZone.Identifier +0 -0
- pdfpeek-0.1.0/tests/fixtures/encrypted_no_password.pdf +0 -0
- pdfpeek-0.1.0/tests/fixtures/encrypted_no_password.pdf#Uf03aZone.Identifier +0 -0
- pdfpeek-0.1.0/tests/fixtures/scanned_single_column.pdf +79 -0
- pdfpeek-0.1.0/tests/fixtures/scanned_single_column.pdf#Uf03aZone.Identifier +0 -0
- pdfpeek-0.1.0/tests/test_integration.py +370 -0
- pdfpeek-0.1.0/tests/test_stage0.py +546 -0
- pdfpeek-0.1.0/tests/test_stage1.py +642 -0
- pdfpeek-0.1.0/tests/test_stage2.py +430 -0
- pdfpeek-0.1.0/tests/test_stage3.py +418 -0
- pdfpeek-0.1.0/tests/test_stage4.py +571 -0
- pdfpeek-0.1.0/tests/test_stage5.py +531 -0
- pdfpeek-0.1.0/tests/test_stage6.py +642 -0
- pdfpeek-0.1.0/tests/test_stage7.py +590 -0
- pdfpeek-0.1.0/tests/test_stage8.py +354 -0
- pdfpeek-0.1.0/tests/test_stage9.py +322 -0
pdfpeek-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
|
|
7
|
+
# Distribution / packaging
|
|
8
|
+
.Python
|
|
9
|
+
build/
|
|
10
|
+
develop-eggs/
|
|
11
|
+
dist/
|
|
12
|
+
downloads/
|
|
13
|
+
eggs/
|
|
14
|
+
.eggs/
|
|
15
|
+
lib/
|
|
16
|
+
lib64/
|
|
17
|
+
parts/
|
|
18
|
+
sdist/
|
|
19
|
+
var/
|
|
20
|
+
wheels/
|
|
21
|
+
share/python-wheels/
|
|
22
|
+
*.egg-info/
|
|
23
|
+
.installed.cfg
|
|
24
|
+
*.egg
|
|
25
|
+
MANIFEST
|
|
26
|
+
|
|
27
|
+
# PyInstaller
|
|
28
|
+
*.manifest
|
|
29
|
+
*.spec
|
|
30
|
+
|
|
31
|
+
# Unit test / coverage reports
|
|
32
|
+
htmlcov/
|
|
33
|
+
.tox/
|
|
34
|
+
.nox/
|
|
35
|
+
.coverage
|
|
36
|
+
.coverage.*
|
|
37
|
+
.cache
|
|
38
|
+
nosetests.xml
|
|
39
|
+
coverage.xml
|
|
40
|
+
*.cover
|
|
41
|
+
*.py,cover
|
|
42
|
+
.hypothesis/
|
|
43
|
+
.pytest_cache/
|
|
44
|
+
cover/
|
|
45
|
+
|
|
46
|
+
# Virtual environments
|
|
47
|
+
venv/
|
|
48
|
+
env/
|
|
49
|
+
ENV/
|
|
50
|
+
env.bak/
|
|
51
|
+
venv.bak/
|
|
52
|
+
|
|
53
|
+
# IDE settings
|
|
54
|
+
.vscode/
|
|
55
|
+
.idea/
|
|
56
|
+
*.swp
|
|
57
|
+
*.swo
|
|
58
|
+
*~
|
|
59
|
+
.DS_Store
|
|
60
|
+
|
|
61
|
+
# Claude Code memory (local AI assistant data)
|
|
62
|
+
.claude/
|
|
63
|
+
|
|
64
|
+
# Test outputs and temporary files
|
|
65
|
+
test_*.txt
|
|
66
|
+
test_*.md
|
|
67
|
+
*.pdf
|
|
68
|
+
!tests/fixtures/*.pdf
|
|
69
|
+
|
|
70
|
+
# Internal review documents
|
|
71
|
+
Review.md
|
|
72
|
+
NOTES.md
|
|
73
|
+
TODO.md
|
|
74
|
+
|
|
75
|
+
# mypy
|
|
76
|
+
.mypy_cache/
|
|
77
|
+
.dmypy.json
|
|
78
|
+
dmypy.json
|
|
79
|
+
|
|
80
|
+
# Pyre type checker
|
|
81
|
+
.pyre/
|
|
82
|
+
|
|
83
|
+
# pytype static type analyzer
|
|
84
|
+
.pytype/
|
|
85
|
+
|
|
86
|
+
# Jupyter Notebook
|
|
87
|
+
.ipynb_checkpoints
|
|
88
|
+
|
|
89
|
+
# pyenv
|
|
90
|
+
.python-version
|
|
91
|
+
|
|
92
|
+
# Environments
|
|
93
|
+
.env
|
|
94
|
+
.venv
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to pdfpeek will be documented here.
|
|
4
|
+
|
|
5
|
+
Format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
|
6
|
+
Versioning follows [Semantic Versioning](https://semver.org/).
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
## [0.1.0] — 2025
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- 10-stage PDF extraction pipeline (triage → extraction → layout → reading order
|
|
14
|
+
→ tables → OCR → assembly → cross-page → post-processing → confidence scoring)
|
|
15
|
+
- Confidence scoring on every extracted block and at document level
|
|
16
|
+
- Three-tier install: core (~50 MB), `[ocr]` (~100 MB), `[layout]` (~5 GB)
|
|
17
|
+
- CLI: `pdfpeek extract` and `pdfpeek info`
|
|
18
|
+
- Born-digital, scanned, and hybrid PDF support
|
|
19
|
+
- Union-based image coverage for accurate page triage (W1)
|
|
20
|
+
- Page-size-normalised text density thresholds (W2)
|
|
21
|
+
- Center-containment block merging — no IoU false negatives (W19)
|
|
22
|
+
- Encrypted PDF detection with structured warnings (W29)
|
|
23
|
+
- RTL script direction support throughout (W11)
|
|
24
|
+
- O(n) header/footer detection via text fingerprinting (W20)
|
|
25
|
+
- Graceful surya fallback when layout model is not installed
|
|
26
|
+
- 337 passing unit tests
|
|
27
|
+
|
|
28
|
+
### Known limitations (planned for v0.2)
|
|
29
|
+
- Multi-column reading order: XY-cut works; very complex layouts may scramble
|
|
30
|
+
- Hybrid page sub-region isolation is best-effort only
|
|
31
|
+
- Signal 4 phantom detection (borderline 0.3–0.7 scores) deferred
|
|
32
|
+
- Non-English OCR error correction not yet implemented
|
|
33
|
+
- Documents > 1000 pages: memory warning only, no streaming
|
pdfpeek-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 pdfpeek contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
pdfpeek-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdfpeek
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: PDF to text extraction with confidence scoring — born-digital, scanned, and hybrid PDFs
|
|
5
|
+
Project-URL: Homepage, https://github.com/ibrah5em/pdfpeek
|
|
6
|
+
Project-URL: Bug Tracker, https://github.com/ibrah5em/pdfpeek/issues
|
|
7
|
+
Project-URL: Documentation, https://github.com/ibrah5em/pdfpeek#readme
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: document,nlp,ocr,pdf,text-extraction
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Text Processing
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Requires-Dist: click>=8.0
|
|
22
|
+
Requires-Dist: colorama>=0.4
|
|
23
|
+
Requires-Dist: numpy>=1.24
|
|
24
|
+
Requires-Dist: pdfplumber>=0.10
|
|
25
|
+
Requires-Dist: pillow>=10.0
|
|
26
|
+
Requires-Dist: pymupdf>=1.23
|
|
27
|
+
Requires-Dist: pypdfium2>=4.0
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pikepdf>=8.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytesseract>=0.3; extra == 'dev'
|
|
31
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
32
|
+
Requires-Dist: reportlab>=4.0; extra == 'dev'
|
|
33
|
+
Provides-Extra: full
|
|
34
|
+
Requires-Dist: opencv-python-headless>=4.8; extra == 'full'
|
|
35
|
+
Requires-Dist: pytesseract>=0.3; extra == 'full'
|
|
36
|
+
Requires-Dist: surya-ocr>=0.4; extra == 'full'
|
|
37
|
+
Provides-Extra: layout
|
|
38
|
+
Requires-Dist: surya-ocr>=0.4; extra == 'layout'
|
|
39
|
+
Provides-Extra: ocr
|
|
40
|
+
Requires-Dist: pytesseract>=0.3; extra == 'ocr'
|
|
41
|
+
Provides-Extra: ocr-full
|
|
42
|
+
Requires-Dist: opencv-python-headless>=4.8; extra == 'ocr-full'
|
|
43
|
+
Requires-Dist: pytesseract>=0.3; extra == 'ocr-full'
|
|
44
|
+
Description-Content-Type: text/markdown
|
|
45
|
+
|
|
46
|
+
# pdfpeek
|
|
47
|
+
|
|
48
|
+
**PDF to text — with a confidence score.**
|
|
49
|
+
|
|
50
|
+
[CI](https://github.com/ibrah5em/pdfpeek/actions)
|
|
51
|
+
[PyPI version](https://pypi.org/project/pdfpeek/)
|
|
52
|
+
[Python versions](https://pypi.org/project/pdfpeek/)
|
|
53
|
+
[License: MIT](LICENSE)
|
|
54
|
+
|
|
55
|
+
Most PDF tools dump text and leave you guessing whether it worked.
|
|
56
|
+
**pdfpeek** runs a 10-stage pipeline and tells you exactly how confident it is in every extraction.
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
$ pdfpeek extract report.pdf
|
|
60
|
+
|
|
61
|
+
pdfpeek → report.pdf
|
|
62
|
+
|
|
63
|
+
✔ Triage 2 page(s) text_native: 1 hybrid: 1
|
|
64
|
+
✔ Extraction 47 blocks
|
|
65
|
+
✔ Warnings 1 warning(s)
|
|
66
|
+
|
|
67
|
+
Confidence 0.893
|
|
68
|
+
Time 1.4s
|
|
69
|
+
|
|
70
|
+
Output → report.txt
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Install
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
# Born-digital PDFs only (~50 MB)
|
|
79
|
+
pip install pdfpeek
|
|
80
|
+
|
|
81
|
+
# + Scanned PDF support via Tesseract (~100 MB)
|
|
82
|
+
pip install "pdfpeek[ocr]"
|
|
83
|
+
|
|
84
|
+
# + AI layout detection via surya (~5 GB)
|
|
85
|
+
pip install "pdfpeek[layout]"
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
> **Tesseract** (system package) is required for `[ocr]`:
|
|
89
|
+
> - macOS: `brew install tesseract`
|
|
90
|
+
> - Ubuntu/Debian: `sudo apt install tesseract-ocr`
|
|
91
|
+
> - Windows: [UB-Mannheim installer](https://github.com/UB-Mannheim/tesseract/wiki)
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
## Usage
|
|
96
|
+
|
|
97
|
+
### CLI
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
# Extract to plain text (default)
|
|
101
|
+
pdfpeek extract document.pdf
|
|
102
|
+
|
|
103
|
+
# Extract to markdown
|
|
104
|
+
pdfpeek extract document.pdf --format markdown --out result.md
|
|
105
|
+
|
|
106
|
+
# Batch a whole folder
|
|
107
|
+
pdfpeek extract ./pdfs/ --out ./txts/
|
|
108
|
+
|
|
109
|
+
# Password-protected PDF
|
|
110
|
+
pdfpeek extract encrypted.pdf --password secret
|
|
111
|
+
|
|
112
|
+
# Inspect a PDF before extracting
|
|
113
|
+
pdfpeek info document.pdf
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### Python API
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
from pdf_engine import extract
|
|
120
|
+
|
|
121
|
+
result = extract("document.pdf")
|
|
122
|
+
|
|
123
|
+
print(result.text) # extracted text
|
|
124
|
+
print(result.confidence) # 0.0 – 1.0 document-level confidence
|
|
125
|
+
print(result.warnings) # any issues found during extraction
|
|
126
|
+
|
|
127
|
+
# Full structured output
|
|
128
|
+
for page in result.ir.pages:
|
|
129
|
+
    for block in page.blocks:
|
|
130
|
+
        print(block.text, block.confidence.final, block.block_type)
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## How it works
|
|
136
|
+
|
|
137
|
+
pdfpeek runs every PDF through a 10-stage pipeline:
|
|
138
|
+
|
|
139
|
+
| Stage | Name | What it does |
|
|
140
|
+
|-------|------|-------------|
|
|
141
|
+
| 0 | Triage | Classifies each page: text-native, scanned, or hybrid |
|
|
142
|
+
| 1 | Extraction | Pulls embedded text with phantom-layer detection |
|
|
143
|
+
| 2 | Layout | Detects block types (heading, body, table, figure) via surya |
|
|
144
|
+
| 3 | Reading Order | XY-cut partitioning, RTL-aware |
|
|
145
|
+
| 4 | Tables | Explicit (ruled) and implicit (whitespace) table detection |
|
|
146
|
+
| 5 | OCR | Tesseract + surya for scanned/hybrid pages |
|
|
147
|
+
| 6 | Assembly | Merges pymupdf and surya outputs; de-duplicates |
|
|
148
|
+
| 7 | Cross-page | Strips headers/footers; builds heading hierarchy |
|
|
149
|
+
| 8 | Post-processing | Rejoins hyphenation; corrects OCR errors |
|
|
150
|
+
| 9 | Confidence | Scores every block on text quality, method trust, order, and type |
|
|
151
|
+
|
|
152
|
+
Each block gets a confidence score from 0 to 1. The document score is the mean of all block scores.
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## Comparison
|
|
157
|
+
|
|
158
|
+
| | pdfpeek | pdfplumber | pypdf | unstructured |
|
|
159
|
+
|---|---|---|---|---|
|
|
160
|
+
| Born-digital | ✅ | ✅ | ✅ | ✅ |
|
|
161
|
+
| Scanned (OCR) | ✅ `[ocr]` | ❌ | ❌ | ✅ |
|
|
162
|
+
| Hybrid pages | ✅ | ⚠️ partial | ❌ | ✅ |
|
|
163
|
+
| Confidence score | ✅ | ❌ | ❌ | ❌ |
|
|
164
|
+
| Install size | **50 MB** | ~20 MB | ~5 MB | **5 GB+** |
|
|
165
|
+
| Python API | ✅ | ✅ | ✅ | ✅ |
|
|
166
|
+
| CLI | ✅ | ❌ | ❌ | ✅ |
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## Confidence scoring
|
|
171
|
+
|
|
172
|
+
Every `TextBlock` has a `BlockConfidence` with four dimensions plus a combined `final` score:
|
|
173
|
+
|
|
174
|
+
| Dimension | Set by | Meaning |
|
|
175
|
+
|-----------|--------|---------|
|
|
176
|
+
| `text_quality` | Stage 9 | Fraction of printable, non-garbled characters |
|
|
177
|
+
| `method_score` | Stage 9 | Trust in the extraction method (pymupdf=1.0, tesseract=0.7) |
|
|
178
|
+
| `order_quality` | Stage 3 | Confidence in reading-order placement |
|
|
179
|
+
| `type_quality` | Stage 2 | Confidence in block-type classification |
|
|
180
|
+
| `final` | Stage 9 | Geometric mean of the four above |
|
|
181
|
+
|
|
182
|
+
A document-level score above **0.8** means reliable extraction. Below **0.6** means you should check the output manually or try `[layout]`.
|
|
183
|
+
|
|
184
|
+
---
|
|
185
|
+
|
|
186
|
+
## Known limitations (v0.1)
|
|
187
|
+
|
|
188
|
+
- Very complex multi-column layouts (magazines, newspapers) may have reading-order issues
|
|
189
|
+
- Non-English OCR error correction is not yet implemented
|
|
190
|
+
- Documents > 1000 pages will be slow (surya processes ~2–3 pages/sec)
|
|
191
|
+
- Equations, handwriting, and deeply nested table-in-sidebar structures are best-effort
|
|
192
|
+
|
|
193
|
+
These are on the roadmap for v0.2.
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## Contributing
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
git clone https://github.com/ibrah5em/pdfpeek
|
|
201
|
+
cd pdfpeek
|
|
202
|
+
pip install -e ".[ocr,dev]"
|
|
203
|
+
pytest tests/
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
PRs welcome. Please add a test for any bug fix.
|
|
207
|
+
|
|
208
|
+
---
|
|
209
|
+
|
|
210
|
+
## License
|
|
211
|
+
|
|
212
|
+
MIT — see [LICENSE](LICENSE).
|
pdfpeek-0.1.0/README.md
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
# pdfpeek
|
|
2
|
+
|
|
3
|
+
**PDF to text — with a confidence score.**
|
|
4
|
+
|
|
5
|
+
[CI](https://github.com/ibrah5em/pdfpeek/actions)
|
|
6
|
+
[PyPI version](https://pypi.org/project/pdfpeek/)
|
|
7
|
+
[Python versions](https://pypi.org/project/pdfpeek/)
|
|
8
|
+
[License: MIT](LICENSE)
|
|
9
|
+
|
|
10
|
+
Most PDF tools dump text and leave you guessing whether it worked.
|
|
11
|
+
**pdfpeek** runs a 10-stage pipeline and tells you exactly how confident it is in every extraction.
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
$ pdfpeek extract report.pdf
|
|
15
|
+
|
|
16
|
+
pdfpeek → report.pdf
|
|
17
|
+
|
|
18
|
+
✔ Triage 2 page(s) text_native: 1 hybrid: 1
|
|
19
|
+
✔ Extraction 47 blocks
|
|
20
|
+
✔ Warnings 1 warning(s)
|
|
21
|
+
|
|
22
|
+
Confidence 0.893
|
|
23
|
+
Time 1.4s
|
|
24
|
+
|
|
25
|
+
Output → report.txt
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## Install
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
# Born-digital PDFs only (~50 MB)
|
|
34
|
+
pip install pdfpeek
|
|
35
|
+
|
|
36
|
+
# + Scanned PDF support via Tesseract (~100 MB)
|
|
37
|
+
pip install "pdfpeek[ocr]"
|
|
38
|
+
|
|
39
|
+
# + AI layout detection via surya (~5 GB)
|
|
40
|
+
pip install "pdfpeek[layout]"
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
> **Tesseract** (system package) is required for `[ocr]`:
|
|
44
|
+
> - macOS: `brew install tesseract`
|
|
45
|
+
> - Ubuntu/Debian: `sudo apt install tesseract-ocr`
|
|
46
|
+
> - Windows: [UB-Mannheim installer](https://github.com/UB-Mannheim/tesseract/wiki)
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## Usage
|
|
51
|
+
|
|
52
|
+
### CLI
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
# Extract to plain text (default)
|
|
56
|
+
pdfpeek extract document.pdf
|
|
57
|
+
|
|
58
|
+
# Extract to markdown
|
|
59
|
+
pdfpeek extract document.pdf --format markdown --out result.md
|
|
60
|
+
|
|
61
|
+
# Batch a whole folder
|
|
62
|
+
pdfpeek extract ./pdfs/ --out ./txts/
|
|
63
|
+
|
|
64
|
+
# Password-protected PDF
|
|
65
|
+
pdfpeek extract encrypted.pdf --password secret
|
|
66
|
+
|
|
67
|
+
# Inspect a PDF before extracting
|
|
68
|
+
pdfpeek info document.pdf
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Python API
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from pdf_engine import extract
|
|
75
|
+
|
|
76
|
+
result = extract("document.pdf")
|
|
77
|
+
|
|
78
|
+
print(result.text) # extracted text
|
|
79
|
+
print(result.confidence) # 0.0 – 1.0 document-level confidence
|
|
80
|
+
print(result.warnings) # any issues found during extraction
|
|
81
|
+
|
|
82
|
+
# Full structured output
|
|
83
|
+
for page in result.ir.pages:
|
|
84
|
+
    for block in page.blocks:
|
|
85
|
+
        print(block.text, block.confidence.final, block.block_type)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## How it works
|
|
91
|
+
|
|
92
|
+
pdfpeek runs every PDF through a 10-stage pipeline:
|
|
93
|
+
|
|
94
|
+
| Stage | Name | What it does |
|
|
95
|
+
|-------|------|-------------|
|
|
96
|
+
| 0 | Triage | Classifies each page: text-native, scanned, or hybrid |
|
|
97
|
+
| 1 | Extraction | Pulls embedded text with phantom-layer detection |
|
|
98
|
+
| 2 | Layout | Detects block types (heading, body, table, figure) via surya |
|
|
99
|
+
| 3 | Reading Order | XY-cut partitioning, RTL-aware |
|
|
100
|
+
| 4 | Tables | Explicit (ruled) and implicit (whitespace) table detection |
|
|
101
|
+
| 5 | OCR | Tesseract + surya for scanned/hybrid pages |
|
|
102
|
+
| 6 | Assembly | Merges pymupdf and surya outputs; de-duplicates |
|
|
103
|
+
| 7 | Cross-page | Strips headers/footers; builds heading hierarchy |
|
|
104
|
+
| 8 | Post-processing | Rejoins hyphenation; corrects OCR errors |
|
|
105
|
+
| 9 | Confidence | Scores every block on text quality, method trust, order, and type |
|
|
106
|
+
|
|
107
|
+
Each block gets a confidence score from 0 to 1. The document score is the mean of all block scores.
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## Comparison
|
|
112
|
+
|
|
113
|
+
| | pdfpeek | pdfplumber | pypdf | unstructured |
|
|
114
|
+
|---|---|---|---|---|
|
|
115
|
+
| Born-digital | ✅ | ✅ | ✅ | ✅ |
|
|
116
|
+
| Scanned (OCR) | ✅ `[ocr]` | ❌ | ❌ | ✅ |
|
|
117
|
+
| Hybrid pages | ✅ | ⚠️ partial | ❌ | ✅ |
|
|
118
|
+
| Confidence score | ✅ | ❌ | ❌ | ❌ |
|
|
119
|
+
| Install size | **50 MB** | ~20 MB | ~5 MB | **5 GB+** |
|
|
120
|
+
| Python API | ✅ | ✅ | ✅ | ✅ |
|
|
121
|
+
| CLI | ✅ | ❌ | ❌ | ✅ |
|
|
122
|
+
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
## Confidence scoring
|
|
126
|
+
|
|
127
|
+
Every `TextBlock` has a `BlockConfidence` with four dimensions plus a combined `final` score:
|
|
128
|
+
|
|
129
|
+
| Dimension | Set by | Meaning |
|
|
130
|
+
|-----------|--------|---------|
|
|
131
|
+
| `text_quality` | Stage 9 | Fraction of printable, non-garbled characters |
|
|
132
|
+
| `method_score` | Stage 9 | Trust in the extraction method (pymupdf=1.0, tesseract=0.7) |
|
|
133
|
+
| `order_quality` | Stage 3 | Confidence in reading-order placement |
|
|
134
|
+
| `type_quality` | Stage 2 | Confidence in block-type classification |
|
|
135
|
+
| `final` | Stage 9 | Geometric mean of the four above |
|
|
136
|
+
|
|
137
|
+
A document-level score above **0.8** means reliable extraction. Below **0.6** means you should check the output manually or try `[layout]`.
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
## Known limitations (v0.1)
|
|
142
|
+
|
|
143
|
+
- Very complex multi-column layouts (magazines, newspapers) may have reading-order issues
|
|
144
|
+
- Non-English OCR error correction is not yet implemented
|
|
145
|
+
- Documents > 1000 pages will be slow (surya processes ~2–3 pages/sec)
|
|
146
|
+
- Equations, handwriting, and deeply nested table-in-sidebar structures are best-effort
|
|
147
|
+
|
|
148
|
+
These are on the roadmap for v0.2.
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## Contributing
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
git clone https://github.com/ibrah5em/pdfpeek
|
|
156
|
+
cd pdfpeek
|
|
157
|
+
pip install -e ".[ocr,dev]"
|
|
158
|
+
pytest tests/
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
PRs welcome. Please add a test for any bug fix.
|
|
162
|
+
|
|
163
|
+
---
|
|
164
|
+
|
|
165
|
+
## License
|
|
166
|
+
|
|
167
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""
|
|
2
|
+
pdfpeek — PDF to text extraction with confidence scoring.
|
|
3
|
+
|
|
4
|
+
Quick start
|
|
5
|
+
-----------
|
|
6
|
+
from pdf_engine import extract
|
|
7
|
+
|
|
8
|
+
result = extract("document.pdf")
|
|
9
|
+
print(result.text)
|
|
10
|
+
print(f"Confidence: {result.confidence:.3f}")
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from pdf_engine.api import extract, ExtractionResult
|
|
14
|
+
from pdf_engine.models import (
|
|
15
|
+
DocumentIR,
|
|
16
|
+
PageIR,
|
|
17
|
+
TextBlock,
|
|
18
|
+
BlockConfidence,
|
|
19
|
+
BBox,
|
|
20
|
+
BlockType,
|
|
21
|
+
ExtractionMethod,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
try:
|
|
25
|
+
from importlib.metadata import version as _pkg_version
|
|
26
|
+
__version__ = _pkg_version("pdfpeek")
|
|
27
|
+
except Exception:
|
|
28
|
+
__version__ = "0.1.0" # fallback for editable / pre-install runs
|
|
29
|
+
__all__ = [
|
|
30
|
+
"extract",
|
|
31
|
+
"ExtractionResult",
|
|
32
|
+
"DocumentIR",
|
|
33
|
+
"PageIR",
|
|
34
|
+
"TextBlock",
|
|
35
|
+
"BlockConfidence",
|
|
36
|
+
"BBox",
|
|
37
|
+
"BlockType",
|
|
38
|
+
"ExtractionMethod",
|
|
39
|
+
]
|