pdfpeek 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. pdfpeek-0.1.0/.gitignore +94 -0
  2. pdfpeek-0.1.0/CHANGELOG.md +33 -0
  3. pdfpeek-0.1.0/LICENSE +21 -0
  4. pdfpeek-0.1.0/PKG-INFO +212 -0
  5. pdfpeek-0.1.0/README.md +167 -0
  6. pdfpeek-0.1.0/pdf_engine/__init__.py +39 -0
  7. pdfpeek-0.1.0/pdf_engine/__main__.py +5 -0
  8. pdfpeek-0.1.0/pdf_engine/api.py +451 -0
  9. pdfpeek-0.1.0/pdf_engine/cli.py +532 -0
  10. pdfpeek-0.1.0/pdf_engine/models.py +281 -0
  11. pdfpeek-0.1.0/pdf_engine/py.typed +0 -0
  12. pdfpeek-0.1.0/pdf_engine/stage0_triage.py +478 -0
  13. pdfpeek-0.1.0/pdf_engine/stage1_extraction.py +694 -0
  14. pdfpeek-0.1.0/pdf_engine/stage2_layout.py +500 -0
  15. pdfpeek-0.1.0/pdf_engine/stage3_reading_order.py +563 -0
  16. pdfpeek-0.1.0/pdf_engine/stage4_tables.py +614 -0
  17. pdfpeek-0.1.0/pdf_engine/stage5_ocr.py +874 -0
  18. pdfpeek-0.1.0/pdf_engine/stage6_assembly.py +433 -0
  19. pdfpeek-0.1.0/pdf_engine/stage7_crosspage.py +583 -0
  20. pdfpeek-0.1.0/pdf_engine/stage8_postprocessing.py +569 -0
  21. pdfpeek-0.1.0/pdf_engine/stage9_confidence.py +196 -0
  22. pdfpeek-0.1.0/pdf_engine/surya_adapter.py +264 -0
  23. pdfpeek-0.1.0/pyproject.toml +89 -0
  24. pdfpeek-0.1.0/tests/__init__.py +0 -0
  25. pdfpeek-0.1.0/tests/fixtures/born_digital_academic.pdf +93 -0
  26. pdfpeek-0.1.0/tests/fixtures/born_digital_academic.pdf#Uf03aZone.Identifier +0 -0
  27. pdfpeek-0.1.0/tests/fixtures/encrypted_no_password.pdf +0 -0
  28. pdfpeek-0.1.0/tests/fixtures/encrypted_no_password.pdf#Uf03aZone.Identifier +0 -0
  29. pdfpeek-0.1.0/tests/fixtures/scanned_single_column.pdf +79 -0
  30. pdfpeek-0.1.0/tests/fixtures/scanned_single_column.pdf#Uf03aZone.Identifier +0 -0
  31. pdfpeek-0.1.0/tests/test_integration.py +370 -0
  32. pdfpeek-0.1.0/tests/test_stage0.py +546 -0
  33. pdfpeek-0.1.0/tests/test_stage1.py +642 -0
  34. pdfpeek-0.1.0/tests/test_stage2.py +430 -0
  35. pdfpeek-0.1.0/tests/test_stage3.py +418 -0
  36. pdfpeek-0.1.0/tests/test_stage4.py +571 -0
  37. pdfpeek-0.1.0/tests/test_stage5.py +531 -0
  38. pdfpeek-0.1.0/tests/test_stage6.py +642 -0
  39. pdfpeek-0.1.0/tests/test_stage7.py +590 -0
  40. pdfpeek-0.1.0/tests/test_stage8.py +354 -0
  41. pdfpeek-0.1.0/tests/test_stage9.py +322 -0
@@ -0,0 +1,94 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+
7
+ # Distribution / packaging
8
+ .Python
9
+ build/
10
+ develop-eggs/
11
+ dist/
12
+ downloads/
13
+ eggs/
14
+ .eggs/
15
+ lib/
16
+ lib64/
17
+ parts/
18
+ sdist/
19
+ var/
20
+ wheels/
21
+ share/python-wheels/
22
+ *.egg-info/
23
+ .installed.cfg
24
+ *.egg
25
+ MANIFEST
26
+
27
+ # PyInstaller
28
+ *.manifest
29
+ *.spec
30
+
31
+ # Unit test / coverage reports
32
+ htmlcov/
33
+ .tox/
34
+ .nox/
35
+ .coverage
36
+ .coverage.*
37
+ .cache
38
+ nosetests.xml
39
+ coverage.xml
40
+ *.cover
41
+ *.py,cover
42
+ .hypothesis/
43
+ .pytest_cache/
44
+ cover/
45
+
46
+ # Virtual environments
47
+ venv/
48
+ env/
49
+ ENV/
50
+ env.bak/
51
+ venv.bak/
52
+
53
+ # IDE settings
54
+ .vscode/
55
+ .idea/
56
+ *.swp
57
+ *.swo
58
+ *~
59
+ .DS_Store
60
+
61
+ # Claude Code memory (local AI assistant data)
62
+ .claude/
63
+
64
+ # Test outputs and temporary files
65
+ test_*.txt
66
+ test_*.md
67
+ *.pdf
68
+ !tests/fixtures/*.pdf
69
+
70
+ # Internal review documents
71
+ Review.md
72
+ NOTES.md
73
+ TODO.md
74
+
75
+ # mypy
76
+ .mypy_cache/
77
+ .dmypy.json
78
+ dmypy.json
79
+
80
+ # Pyre type checker
81
+ .pyre/
82
+
83
+ # pytype static type analyzer
84
+ .pytype/
85
+
86
+ # Jupyter Notebook
87
+ .ipynb_checkpoints
88
+
89
+ # pyenv
90
+ .python-version
91
+
92
+ # Environments
93
+ .env
94
+ .venv
@@ -0,0 +1,33 @@
1
+ # Changelog
2
+
3
+ All notable changes to pdfpeek will be documented here.
4
+
5
+ Format follows [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
6
+ Versioning follows [Semantic Versioning](https://semver.org/).
7
+
8
+ ---
9
+
10
+ ## [0.1.0] — 2025
11
+
12
+ ### Added
13
+ - 10-stage PDF extraction pipeline (triage → extraction → layout → reading order
14
+ → tables → OCR → assembly → cross-page → post-processing → confidence scoring)
15
+ - Confidence scoring on every extracted block and at document level
16
+ - Three-tier install: core (~50 MB), `[ocr]` (~100 MB), `[layout]` (~5 GB)
17
+ - CLI: `pdfpeek extract` and `pdfpeek info`
18
+ - Born-digital, scanned, and hybrid PDF support
19
+ - Union-based image coverage for accurate page triage (W1)
20
+ - Page-size-normalised text density thresholds (W2)
21
+ - Center-containment block merging — no IoU false negatives (W19)
22
+ - Encrypted PDF detection with structured warnings (W29)
23
+ - RTL script direction support throughout (W11)
24
+ - O(n) header/footer detection via text fingerprinting (W20)
25
+ - Graceful surya fallback when layout model is not installed
26
+ - 337 passing unit tests
27
+
28
+ ### Known limitations (deferred to V2)
29
+ - Multi-column reading order: XY-cut works, but very complex layouts may still come out scrambled
30
+ - Hybrid page sub-region isolation is best-effort only
31
+ - Signal 4 phantom detection (borderline 0.3–0.7 scores) deferred
32
+ - Non-English OCR error correction not yet implemented
33
+ - Documents > 1000 pages: only a memory warning is issued; there is no streaming mode
pdfpeek-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 pdfpeek contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
pdfpeek-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,212 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdfpeek
3
+ Version: 0.1.0
4
+ Summary: PDF to text extraction with confidence scoring — born-digital, scanned, and hybrid PDFs
5
+ Project-URL: Homepage, https://github.com/ibrah5em/pdfpeek
6
+ Project-URL: Bug Tracker, https://github.com/ibrah5em/pdfpeek/issues
7
+ Project-URL: Documentation, https://github.com/ibrah5em/pdfpeek#readme
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: document,nlp,ocr,pdf,text-extraction
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Topic :: Text Processing
20
+ Requires-Python: >=3.10
21
+ Requires-Dist: click>=8.0
22
+ Requires-Dist: colorama>=0.4
23
+ Requires-Dist: numpy>=1.24
24
+ Requires-Dist: pdfplumber>=0.10
25
+ Requires-Dist: pillow>=10.0
26
+ Requires-Dist: pymupdf>=1.23
27
+ Requires-Dist: pypdfium2>=4.0
28
+ Provides-Extra: dev
29
+ Requires-Dist: pikepdf>=8.0; extra == 'dev'
30
+ Requires-Dist: pytesseract>=0.3; extra == 'dev'
31
+ Requires-Dist: pytest>=8.0; extra == 'dev'
32
+ Requires-Dist: reportlab>=4.0; extra == 'dev'
33
+ Provides-Extra: full
34
+ Requires-Dist: opencv-python-headless>=4.8; extra == 'full'
35
+ Requires-Dist: pytesseract>=0.3; extra == 'full'
36
+ Requires-Dist: surya-ocr>=0.4; extra == 'full'
37
+ Provides-Extra: layout
38
+ Requires-Dist: surya-ocr>=0.4; extra == 'layout'
39
+ Provides-Extra: ocr
40
+ Requires-Dist: pytesseract>=0.3; extra == 'ocr'
41
+ Provides-Extra: ocr-full
42
+ Requires-Dist: opencv-python-headless>=4.8; extra == 'ocr-full'
43
+ Requires-Dist: pytesseract>=0.3; extra == 'ocr-full'
44
+ Description-Content-Type: text/markdown
45
+
46
+ # pdfpeek
47
+
48
+ **PDF to text — with a confidence score.**
49
+
50
+ [![Tests](https://github.com/ibrah5em/pdfpeek/actions/workflows/test.yml/badge.svg)](https://github.com/ibrah5em/pdfpeek/actions)
51
+ [![PyPI](https://img.shields.io/pypi/v/pdfpeek)](https://pypi.org/project/pdfpeek/)
52
+ [![Python](https://img.shields.io/pypi/pyversions/pdfpeek)](https://pypi.org/project/pdfpeek/)
53
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
54
+
55
+ Most PDF tools dump text and leave you guessing whether it worked.
56
+ **pdfpeek** runs a 10-stage pipeline and tells you exactly how confident it is in every extraction.
57
+
58
+ ```
59
+ $ pdfpeek extract report.pdf
60
+
61
+ pdfpeek → report.pdf
62
+
63
+ ✔ Triage 2 page(s) text_native: 1 hybrid: 1
64
+ ✔ Extraction 47 blocks
65
+ ✔ Warnings 1 warning(s)
66
+
67
+ Confidence 0.893
68
+ Time 1.4s
69
+
70
+ Output → report.txt
71
+ ```
72
+
73
+ ---
74
+
75
+ ## Install
76
+
77
+ ```bash
78
+ # Born-digital PDFs only (~50 MB)
79
+ pip install pdfpeek
80
+
81
+ # + Scanned PDF support via Tesseract (~100 MB)
82
+ pip install pdfpeek[ocr]
83
+
84
+ # + AI layout detection via surya (~5 GB)
85
+ pip install pdfpeek[layout]
86
+ ```
87
+
88
+ > **Tesseract** (system package) is required for `[ocr]`:
89
+ > - macOS: `brew install tesseract`
90
+ > - Ubuntu/Debian: `sudo apt install tesseract-ocr`
91
+ > - Windows: [UB-Mannheim installer](https://github.com/UB-Mannheim/tesseract/wiki)
92
+
93
+ ---
94
+
95
+ ## Usage
96
+
97
+ ### CLI
98
+
99
+ ```bash
100
+ # Extract to plain text (default)
101
+ pdfpeek extract document.pdf
102
+
103
+ # Extract to markdown
104
+ pdfpeek extract document.pdf --format markdown --out result.md
105
+
106
+ # Batch a whole folder
107
+ pdfpeek extract ./pdfs/ --out ./txts/
108
+
109
+ # Password-protected PDF
110
+ pdfpeek extract encrypted.pdf --password secret
111
+
112
+ # Inspect a PDF before extracting
113
+ pdfpeek info document.pdf
114
+ ```
115
+
116
+ ### Python API
117
+
118
+ ```python
119
+ from pdf_engine import extract
120
+
121
+ result = extract("document.pdf")
122
+
123
+ print(result.text) # extracted text
124
+ print(result.confidence) # 0.0 – 1.0 document-level confidence
125
+ print(result.warnings) # any issues found during extraction
126
+
127
+ # Full structured output
128
+ for page in result.ir.pages:
129
+ for block in page.blocks:
130
+ print(block.text, block.confidence.final, block.block_type)
131
+ ```
132
+
133
+ ---
134
+
135
+ ## How it works
136
+
137
+ pdfpeek runs every PDF through a 10-stage pipeline:
138
+
139
+ | Stage | Name | What it does |
140
+ |-------|------|-------------|
141
+ | 0 | Triage | Classifies each page: text-native, scanned, or hybrid |
142
+ | 1 | Extraction | Pulls embedded text with phantom-layer detection |
143
+ | 2 | Layout | Detects block types (heading, body, table, figure) via surya |
144
+ | 3 | Reading Order | XY-cut partitioning, RTL-aware |
145
+ | 4 | Tables | Explicit (ruled) and implicit (whitespace) table detection |
146
+ | 5 | OCR | Tesseract + surya for scanned/hybrid pages |
147
+ | 6 | Assembly | Merges pymupdf and surya outputs; de-duplicates |
148
+ | 7 | Cross-page | Strips headers/footers; builds heading hierarchy |
149
+ | 8 | Post-processing | Rejoins hyphenation; corrects OCR errors |
150
+ | 9 | Confidence | Scores every block on text quality, method trust, order, and type |
151
+
152
+ Each block gets a confidence score from 0 to 1. The document score is the mean of all block scores.
153
+
154
+ ---
155
+
156
+ ## Comparison
157
+
158
+ | | pdfpeek | pdfplumber | pypdf | unstructured |
159
+ |---|---|---|---|---|
160
+ | Born-digital | ✅ | ✅ | ✅ | ✅ |
161
+ | Scanned (OCR) | ✅ `[ocr]` | ❌ | ❌ | ✅ |
162
+ | Hybrid pages | ✅ | ⚠️ partial | ❌ | ✅ |
163
+ | Confidence score | ✅ | ❌ | ❌ | ❌ |
164
+ | Install size | **50 MB** | ~20 MB | ~5 MB | **5 GB+** |
165
+ | Python API | ✅ | ✅ | ✅ | ✅ |
166
+ | CLI | ✅ | ❌ | ❌ | ✅ |
167
+
168
+ ---
169
+
170
+ ## Confidence scoring
171
+
172
+ Every `TextBlock` has a `BlockConfidence` with four dimensions plus a combined `final` score:
173
+
174
+ | Dimension | Set by | Meaning |
175
+ |-----------|--------|---------|
176
+ | `text_quality` | Stage 9 | Fraction of printable, non-garbled characters |
177
+ | `method_score` | Stage 9 | Trust in the extraction method (pymupdf=1.0, tesseract=0.7) |
178
+ | `order_quality` | Stage 3 | Confidence in reading-order placement |
179
+ | `type_quality` | Stage 2 | Confidence in block-type classification |
180
+ | `final` | Stage 9 | Geometric mean of the four above |
181
+
182
+ A document-level score above **0.8** means reliable extraction. A score below **0.6** means you should check the output manually or try the `[layout]` install extra.
183
+
184
+ ---
185
+
186
+ ## Known limitations (v0.1)
187
+
188
+ - Very complex multi-column layouts (magazines, newspapers) may have reading-order issues
189
+ - Non-English OCR error correction is not yet implemented
190
+ - Documents > 1000 pages will be slow (surya processes ~2–3 pages/sec)
191
+ - Equations, handwriting, and deeply nested table-in-sidebar structures are best-effort
192
+
193
+ These are on the roadmap for v0.2.
194
+
195
+ ---
196
+
197
+ ## Contributing
198
+
199
+ ```bash
200
+ git clone https://github.com/ibrah5em/pdfpeek
201
+ cd pdfpeek
202
+ pip install -e ".[ocr,dev]"
203
+ pytest tests/
204
+ ```
205
+
206
+ PRs welcome. Please add a test for any bug fix.
207
+
208
+ ---
209
+
210
+ ## License
211
+
212
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,167 @@
1
+ # pdfpeek
2
+
3
+ **PDF to text — with a confidence score.**
4
+
5
+ [![Tests](https://github.com/ibrah5em/pdfpeek/actions/workflows/test.yml/badge.svg)](https://github.com/ibrah5em/pdfpeek/actions)
6
+ [![PyPI](https://img.shields.io/pypi/v/pdfpeek)](https://pypi.org/project/pdfpeek/)
7
+ [![Python](https://img.shields.io/pypi/pyversions/pdfpeek)](https://pypi.org/project/pdfpeek/)
8
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
9
+
10
+ Most PDF tools dump text and leave you guessing whether it worked.
11
+ **pdfpeek** runs a 10-stage pipeline and tells you exactly how confident it is in every extraction.
12
+
13
+ ```
14
+ $ pdfpeek extract report.pdf
15
+
16
+ pdfpeek → report.pdf
17
+
18
+ ✔ Triage 2 page(s) text_native: 1 hybrid: 1
19
+ ✔ Extraction 47 blocks
20
+ ✔ Warnings 1 warning(s)
21
+
22
+ Confidence 0.893
23
+ Time 1.4s
24
+
25
+ Output → report.txt
26
+ ```
27
+
28
+ ---
29
+
30
+ ## Install
31
+
32
+ ```bash
33
+ # Born-digital PDFs only (~50 MB)
34
+ pip install pdfpeek
35
+
36
+ # + Scanned PDF support via Tesseract (~100 MB)
37
+ pip install pdfpeek[ocr]
38
+
39
+ # + AI layout detection via surya (~5 GB)
40
+ pip install pdfpeek[layout]
41
+ ```
42
+
43
+ > **Tesseract** (system package) is required for `[ocr]`:
44
+ > - macOS: `brew install tesseract`
45
+ > - Ubuntu/Debian: `sudo apt install tesseract-ocr`
46
+ > - Windows: [UB-Mannheim installer](https://github.com/UB-Mannheim/tesseract/wiki)
47
+
48
+ ---
49
+
50
+ ## Usage
51
+
52
+ ### CLI
53
+
54
+ ```bash
55
+ # Extract to plain text (default)
56
+ pdfpeek extract document.pdf
57
+
58
+ # Extract to markdown
59
+ pdfpeek extract document.pdf --format markdown --out result.md
60
+
61
+ # Batch a whole folder
62
+ pdfpeek extract ./pdfs/ --out ./txts/
63
+
64
+ # Password-protected PDF
65
+ pdfpeek extract encrypted.pdf --password secret
66
+
67
+ # Inspect a PDF before extracting
68
+ pdfpeek info document.pdf
69
+ ```
70
+
71
+ ### Python API
72
+
73
+ ```python
74
+ from pdf_engine import extract
75
+
76
+ result = extract("document.pdf")
77
+
78
+ print(result.text) # extracted text
79
+ print(result.confidence) # 0.0 – 1.0 document-level confidence
80
+ print(result.warnings) # any issues found during extraction
81
+
82
+ # Full structured output
83
+ for page in result.ir.pages:
84
+ for block in page.blocks:
85
+ print(block.text, block.confidence.final, block.block_type)
86
+ ```
87
+
88
+ ---
89
+
90
+ ## How it works
91
+
92
+ pdfpeek runs every PDF through a 10-stage pipeline:
93
+
94
+ | Stage | Name | What it does |
95
+ |-------|------|-------------|
96
+ | 0 | Triage | Classifies each page: text-native, scanned, or hybrid |
97
+ | 1 | Extraction | Pulls embedded text with phantom-layer detection |
98
+ | 2 | Layout | Detects block types (heading, body, table, figure) via surya |
99
+ | 3 | Reading Order | XY-cut partitioning, RTL-aware |
100
+ | 4 | Tables | Explicit (ruled) and implicit (whitespace) table detection |
101
+ | 5 | OCR | Tesseract + surya for scanned/hybrid pages |
102
+ | 6 | Assembly | Merges pymupdf and surya outputs; de-duplicates |
103
+ | 7 | Cross-page | Strips headers/footers; builds heading hierarchy |
104
+ | 8 | Post-processing | Rejoins hyphenation; corrects OCR errors |
105
+ | 9 | Confidence | Scores every block on text quality, method trust, order, and type |
106
+
107
+ Each block gets a confidence score from 0 to 1. The document score is the mean of all block scores.
108
+
109
+ ---
110
+
111
+ ## Comparison
112
+
113
+ | | pdfpeek | pdfplumber | pypdf | unstructured |
114
+ |---|---|---|---|---|
115
+ | Born-digital | ✅ | ✅ | ✅ | ✅ |
116
+ | Scanned (OCR) | ✅ `[ocr]` | ❌ | ❌ | ✅ |
117
+ | Hybrid pages | ✅ | ⚠️ partial | ❌ | ✅ |
118
+ | Confidence score | ✅ | ❌ | ❌ | ❌ |
119
+ | Install size | **50 MB** | ~20 MB | ~5 MB | **5 GB+** |
120
+ | Python API | ✅ | ✅ | ✅ | ✅ |
121
+ | CLI | ✅ | ❌ | ❌ | ✅ |
122
+
123
+ ---
124
+
125
+ ## Confidence scoring
126
+
127
+ Every `TextBlock` has a `BlockConfidence` with four dimensions plus a combined `final` score:
128
+
129
+ | Dimension | Set by | Meaning |
130
+ |-----------|--------|---------|
131
+ | `text_quality` | Stage 9 | Fraction of printable, non-garbled characters |
132
+ | `method_score` | Stage 9 | Trust in the extraction method (pymupdf=1.0, tesseract=0.7) |
133
+ | `order_quality` | Stage 3 | Confidence in reading-order placement |
134
+ | `type_quality` | Stage 2 | Confidence in block-type classification |
135
+ | `final` | Stage 9 | Geometric mean of the four above |
136
+
137
+ A document-level score above **0.8** means reliable extraction. A score below **0.6** means you should check the output manually or try the `[layout]` install extra.
138
+
139
+ ---
140
+
141
+ ## Known limitations (v0.1)
142
+
143
+ - Very complex multi-column layouts (magazines, newspapers) may have reading-order issues
144
+ - Non-English OCR error correction is not yet implemented
145
+ - Documents > 1000 pages will be slow (surya processes ~2–3 pages/sec)
146
+ - Equations, handwriting, and deeply nested table-in-sidebar structures are best-effort
147
+
148
+ These are on the roadmap for v0.2.
149
+
150
+ ---
151
+
152
+ ## Contributing
153
+
154
+ ```bash
155
+ git clone https://github.com/ibrah5em/pdfpeek
156
+ cd pdfpeek
157
+ pip install -e ".[ocr,dev]"
158
+ pytest tests/
159
+ ```
160
+
161
+ PRs welcome. Please add a test for any bug fix.
162
+
163
+ ---
164
+
165
+ ## License
166
+
167
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,39 @@
1
+ """
2
+ pdfpeek — PDF to text extraction with confidence scoring.
3
+
4
+ Quick start
5
+ -----------
6
+ from pdf_engine import extract
7
+
8
+ result = extract("document.pdf")
9
+ print(result.text)
10
+ print(f"Confidence: {result.confidence:.3f}")
11
+ """
12
+
13
+ from pdf_engine.api import extract, ExtractionResult
14
+ from pdf_engine.models import (
15
+ DocumentIR,
16
+ PageIR,
17
+ TextBlock,
18
+ BlockConfidence,
19
+ BBox,
20
+ BlockType,
21
+ ExtractionMethod,
22
+ )
23
+
24
+ try:
25
+ from importlib.metadata import version as _pkg_version
26
+ __version__ = _pkg_version("pdfpeek")
27
+ except Exception:
28
+ __version__ = "0.1.0" # fallback for editable / pre-install runs
29
+ __all__ = [
30
+ "extract",
31
+ "ExtractionResult",
32
+ "DocumentIR",
33
+ "PageIR",
34
+ "TextBlock",
35
+ "BlockConfidence",
36
+ "BBox",
37
+ "BlockType",
38
+ "ExtractionMethod",
39
+ ]
@@ -0,0 +1,5 @@
1
+ """Allow ``python -m pdf_engine`` as an alias for the ``pdfpeek`` CLI."""
2
+ from pdf_engine.cli import main
3
+
4
+ if __name__ == "__main__":
5
+ main()