pdforienter 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdforienter-0.1.0/LICENSE +21 -0
- pdforienter-0.1.0/PKG-INFO +259 -0
- pdforienter-0.1.0/README.md +222 -0
- pdforienter-0.1.0/pdforienter/__init__.py +30 -0
- pdforienter-0.1.0/pdforienter/cli.py +66 -0
- pdforienter-0.1.0/pdforienter/config.py +40 -0
- pdforienter-0.1.0/pdforienter/core/__init__.py +0 -0
- pdforienter-0.1.0/pdforienter/core/analyzer.py +80 -0
- pdforienter-0.1.0/pdforienter/core/classifier.py +18 -0
- pdforienter-0.1.0/pdforienter/core/corrector.py +61 -0
- pdforienter-0.1.0/pdforienter/core/detector.py +91 -0
- pdforienter-0.1.0/pdforienter/core/pipeline.py +55 -0
- pdforienter-0.1.0/pdforienter/core/processor.py +116 -0
- pdforienter-0.1.0/pdforienter/logging/__init__.py +0 -0
- pdforienter-0.1.0/pdforienter/logging/formatter.py +87 -0
- pdforienter-0.1.0/pdforienter/logging/writer.py +25 -0
- pdforienter-0.1.0/pdforienter/models.py +66 -0
- pdforienter-0.1.0/pdforienter/utils/__init__.py +0 -0
- pdforienter-0.1.0/pdforienter/utils/fs.py +32 -0
- pdforienter-0.1.0/pdforienter/utils/resources.py +22 -0
- pdforienter-0.1.0/pdforienter.egg-info/PKG-INFO +259 -0
- pdforienter-0.1.0/pdforienter.egg-info/SOURCES.txt +27 -0
- pdforienter-0.1.0/pdforienter.egg-info/dependency_links.txt +1 -0
- pdforienter-0.1.0/pdforienter.egg-info/entry_points.txt +2 -0
- pdforienter-0.1.0/pdforienter.egg-info/requires.txt +10 -0
- pdforienter-0.1.0/pdforienter.egg-info/top_level.txt +1 -0
- pdforienter-0.1.0/pyproject.toml +74 -0
- pdforienter-0.1.0/setup.cfg +4 -0
- pdforienter-0.1.0/tests/test_core.py +106 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 InfinitiBit GmbH
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdforienter
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Intelligent, parallel PDF page rotation correction.
|
|
5
|
+
Author: InfinitiBit GmbH
|
|
6
|
+
Maintainer: InfinitiBit GmbH
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/MdRahmatUllah/pdforienter
|
|
9
|
+
Project-URL: Repository, https://github.com/MdRahmatUllah/pdforienter
|
|
10
|
+
Project-URL: Issues, https://github.com/MdRahmatUllah/pdforienter/issues
|
|
11
|
+
Project-URL: Documentation, https://github.com/MdRahmatUllah/pdforienter/blob/main/TECHNICAL.md
|
|
12
|
+
Keywords: pdf,rotation,ocr,tesseract,document
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Utilities
|
|
22
|
+
Classifier: Topic :: Multimedia :: Graphics :: Graphics Conversion
|
|
23
|
+
Classifier: Topic :: Text Processing
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: PyMuPDF>=1.23
|
|
28
|
+
Requires-Dist: pytesseract>=0.3.10
|
|
29
|
+
Requires-Dist: Pillow>=10.0
|
|
30
|
+
Requires-Dist: psutil>=5.9
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
33
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
34
|
+
Requires-Dist: ruff; extra == "dev"
|
|
35
|
+
Requires-Dist: mypy; extra == "dev"
|
|
36
|
+
Dynamic: license-file
|
|
37
|
+
|
|
38
|
+
# PDFOrienter
|
|
39
|
+
|
|
40
|
+
**Intelligent, parallel PDF page rotation correction for Python.**
|
|
41
|
+
|
|
42
|
+
PDFOrienter analyses every page of one or more PDF files, detects incorrect orientations, and fixes them in a single write pass — with no unnecessary re-processing.
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## Features
|
|
47
|
+
|
|
48
|
+
- **Two-phase pipeline** — detect all pages in parallel, then apply all corrections in a single write
|
|
49
|
+
- **Smart strategy selection** — uses fast text-direction analysis for text-based pages; falls back to Tesseract OSD only for image/scanned pages
|
|
50
|
+
- **Dynamic parallelism** — automatically uses 75 % of available CPU cores; scales from 4 to 64+ cores without any configuration change
|
|
51
|
+
- **Detailed structured logging** — per-page and per-file timing, rotation details, confidence scores, RAM and CPU usage
|
|
52
|
+
- **Zero intermediate files** — corrected PDFs are written once; originals are never modified
|
|
53
|
+
- **Package-ready** — clean modular design, typed, fully testable
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Requirements
|
|
58
|
+
|
|
59
|
+
### Python
|
|
60
|
+
|
|
61
|
+
Python 3.10 or newer.
|
|
62
|
+
|
|
63
|
+
### System dependency — Tesseract
|
|
64
|
+
|
|
65
|
+
Tesseract must be installed on the host system **before** installing PDFOrienter.
|
|
66
|
+
|
|
67
|
+
**Ubuntu / Debian**
|
|
68
|
+
```bash
|
|
69
|
+
sudo apt-get update && sudo apt-get install -y tesseract-ocr
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
**macOS (Homebrew)**
|
|
73
|
+
```bash
|
|
74
|
+
brew install tesseract
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
**Windows**
|
|
78
|
+
|
|
79
|
+
Download and run the installer from the [Tesseract UB Mannheim releases](https://github.com/UB-Mannheim/tesseract/wiki), then add the install directory to your `PATH`.
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## Installation
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
pip install pdforienter
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
For development (includes linting + test tools):
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
git clone https://github.com/MdRahmatUllah/pdforienter.git
|
|
93
|
+
cd pdforienter
|
|
94
|
+
pip install -e ".[dev]"
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Quick Start
|
|
100
|
+
|
|
101
|
+
### Command line
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
# Fix a single PDF
|
|
105
|
+
pdforienter invoice.pdf --output ./fixed
|
|
106
|
+
|
|
107
|
+
# Fix every PDF in a directory
|
|
108
|
+
pdforienter /scans/ --output /corrected
|
|
109
|
+
|
|
110
|
+
# Mix files and directories
|
|
111
|
+
pdforienter report.pdf /archive/ receipts.pdf --output ./out
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Python API
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from pdforienter import run_pipeline
|
|
118
|
+
from pdforienter.logging.writer import write_log
|
|
119
|
+
|
|
120
|
+
result = run_pipeline(
|
|
121
|
+
pdf_paths=["invoice.pdf", "report.pdf"],
|
|
122
|
+
output_dir="./corrected",
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
# Write the structured log file
|
|
126
|
+
log_path = write_log(result, "./corrected")
|
|
127
|
+
|
|
128
|
+
print(f"{result.total_pages_changed} pages corrected in {result.total_duration_seconds:.1f}s")
|
|
129
|
+
print(f"Log: {log_path}")
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## Log File
|
|
135
|
+
|
|
136
|
+
Every run produces a timestamped `.log` file in the output directory.
|
|
137
|
+
|
|
138
|
+
```
|
|
139
|
+
PDFOrienter Run Log — 2024-11-01 14:32:05
|
|
140
|
+
============================================================
|
|
141
|
+
|
|
142
|
+
[RUN SUMMARY]
|
|
143
|
+
Total files processed : 3
|
|
144
|
+
Total pages : 247
|
|
145
|
+
Pages rotated : 18
|
|
146
|
+
Text pages : 201
|
|
147
|
+
Scanned pages (OCR) : 46
|
|
148
|
+
Skipped pages : 0
|
|
149
|
+
Workers used : 6
|
|
150
|
+
Peak RAM usage : 312.4 MB
|
|
151
|
+
Total time : 42.18s
|
|
152
|
+
|
|
153
|
+
------------------------------------------------------------
|
|
154
|
+
[FILE] /scans/invoice.pdf
|
|
155
|
+
Output : /corrected/invoice_corrected.pdf
|
|
156
|
+
Total pages : 12
|
|
157
|
+
Pages changed : 3
|
|
158
|
+
Text pages : 8
|
|
159
|
+
Scanned pages : 4
|
|
160
|
+
Skipped pages : 0
|
|
161
|
+
Detection time : 9.41s
|
|
162
|
+
Correction time : 0.23s
|
|
163
|
+
Total time : 9.64s
|
|
164
|
+
[PAGE DETAILS]
|
|
165
|
+
p 1 | text | OK | angle= 0° | conf= 98.2 | 0.11s | No rotation needed.
|
|
166
|
+
p 2 | scanned | CHANGED | angle= 90° | conf= 87.5 | 2.34s | Rotation of 90° detected (confidence 87.5).
|
|
167
|
+
...
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## Project Structure
|
|
173
|
+
|
|
174
|
+
```
|
|
175
|
+
pdforienter/
|
|
176
|
+
├── pdforienter/
|
|
177
|
+
│ ├── __init__.py # Public API: run_pipeline
|
|
178
|
+
│ ├── config.py # Tuneable constants (worker count, thresholds)
|
|
179
|
+
│ ├── models.py # Typed data classes (PageResult, FileResult, RunResult)
|
|
180
|
+
│ ├── cli.py # Command-line interface
|
|
181
|
+
│ ├── core/
|
|
182
|
+
│ │ ├── pipeline.py # Top-level orchestrator
|
|
183
|
+
│ │ ├── processor.py # Per-file orchestrator (Phase 1 + Phase 2)
|
|
184
|
+
│ │ ├── analyzer.py # Per-page worker (dispatched to subprocess)
|
|
185
|
+
│ │ ├── classifier.py # Text vs scanned page detection
|
|
186
|
+
│ │ ├── detector.py # Orientation detection (text + OSD strategies)
|
|
187
|
+
│ │ └── corrector.py # Single-pass rotation applier
|
|
188
|
+
│ ├── logging/
|
|
189
|
+
│ │ ├── formatter.py # RunResult → structured log string
|
|
190
|
+
│ │ └── writer.py # Write log file to disk
|
|
191
|
+
│ └── utils/
|
|
192
|
+
│ ├── fs.py # Filesystem helpers
|
|
193
|
+
│ └── resources.py # CPU / RAM telemetry
|
|
194
|
+
├── tests/
|
|
195
|
+
│ └── test_core.py
|
|
196
|
+
├── pyproject.toml
|
|
197
|
+
└── README.md
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
---
|
|
201
|
+
|
|
202
|
+
## Configuration
|
|
203
|
+
|
|
204
|
+
Most tuneable constants live in `pdforienter/config.py`; the underscore-prefixed ones are defined in the module named in parentheses.
|
|
205
|
+
|
|
206
|
+
| Constant | Default | Description |
|
|
207
|
+
|---|---|---|
|
|
208
|
+
| `MAX_WORKERS` | `floor(cpu_count × 0.75)` | Worker processes for parallel page analysis |
|
|
209
|
+
| `OSD_CONFIDENCE_THRESHOLD` | `10.0` | Minimum Tesseract OSD confidence to trust a result |
|
|
210
|
+
| `TESSERACT_OSD_PSM` | `0` | Tesseract page segmentation mode (0 = OSD only) |
|
|
211
|
+
| `_RENDER_DPI` (detector.py) | `150` | DPI used when rasterising pages for OSD |
|
|
212
|
+
| `_MIN_CHAR_COUNT` (classifier.py) | `20` | Minimum characters to classify a page as text-based |
|
|
213
|
+
|
|
214
|
+
---
|
|
215
|
+
|
|
216
|
+
## How It Works
|
|
217
|
+
|
|
218
|
+
### Phase 1 — Parallel Detection
|
|
219
|
+
|
|
220
|
+
Each page is dispatched to a subprocess worker via `ProcessPoolExecutor`. Workers run concurrently up to `MAX_WORKERS`.
|
|
221
|
+
|
|
222
|
+
For each page:
|
|
223
|
+
1. **Classify** — does the page have selectable text?
|
|
224
|
+
2. **Detect orientation**
|
|
225
|
+
- *Text page* → analyse character direction vectors (fast, no OCR)
|
|
226
|
+
- *Scanned page* → rasterise at 150 DPI and run Tesseract OSD
|
|
227
|
+
3. Return a `PageResult` with the detected angle, confidence, and timing
|
|
228
|
+
|
|
229
|
+
### Phase 2 — Single-Pass Correction
|
|
230
|
+
|
|
231
|
+
After all pages are analysed, a single `fitz.Document.save()` call applies every rotation and writes the corrected PDF. No intermediate files are created.
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
## Performance
|
|
236
|
+
|
|
237
|
+
Typical estimates on an 8-core server (6 workers) with mixed text/scanned PDFs:
|
|
238
|
+
|
|
239
|
+
| Scenario | Estimate |
|
|
240
|
+
|---|---|
|
|
241
|
+
| 2 000 pages, all text-based | ~1–2 minutes |
|
|
242
|
+
| 2 000 pages, mixed 50/50 | ~7–8 minutes |
|
|
243
|
+
| 2 000 pages, all scanned | ~15–17 minutes |
|
|
244
|
+
|
|
245
|
+
RAM usage is roughly 200–400 MB per Tesseract worker, so 6 workers peak at about 2.5 GB — well within a 16 GB server.
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
249
|
+
## Running Tests
|
|
250
|
+
|
|
251
|
+
```bash
|
|
252
|
+
pytest tests/ -v
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
## License
|
|
258
|
+
|
|
259
|
+
MIT
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
# PDFOrienter
|
|
2
|
+
|
|
3
|
+
**Intelligent, parallel PDF page rotation correction for Python.**
|
|
4
|
+
|
|
5
|
+
PDFOrienter analyses every page of one or more PDF files, detects incorrect orientations, and fixes them in a single write pass — with no unnecessary re-processing.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- **Two-phase pipeline** — detect all pages in parallel, then apply all corrections in a single write
|
|
12
|
+
- **Smart strategy selection** — uses fast text-direction analysis for text-based pages; falls back to Tesseract OSD only for image/scanned pages
|
|
13
|
+
- **Dynamic parallelism** — automatically uses 75 % of available CPU cores; scales from 4 to 64+ cores without any configuration change
|
|
14
|
+
- **Detailed structured logging** — per-page and per-file timing, rotation details, confidence scores, RAM and CPU usage
|
|
15
|
+
- **Zero intermediate files** — corrected PDFs are written once; originals are never modified
|
|
16
|
+
- **Package-ready** — clean modular design, typed, fully testable
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Requirements
|
|
21
|
+
|
|
22
|
+
### Python
|
|
23
|
+
|
|
24
|
+
Python 3.10 or newer.
|
|
25
|
+
|
|
26
|
+
### System dependency — Tesseract
|
|
27
|
+
|
|
28
|
+
Tesseract must be installed on the host system **before** installing PDFOrienter.
|
|
29
|
+
|
|
30
|
+
**Ubuntu / Debian**
|
|
31
|
+
```bash
|
|
32
|
+
sudo apt-get update && sudo apt-get install -y tesseract-ocr
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
**macOS (Homebrew)**
|
|
36
|
+
```bash
|
|
37
|
+
brew install tesseract
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
**Windows**
|
|
41
|
+
|
|
42
|
+
Download and run the installer from the [Tesseract UB Mannheim releases](https://github.com/UB-Mannheim/tesseract/wiki), then add the install directory to your `PATH`.
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## Installation
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install pdforienter
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
For development (includes linting + test tools):
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
git clone https://github.com/MdRahmatUllah/pdforienter.git
|
|
56
|
+
cd pdforienter
|
|
57
|
+
pip install -e ".[dev]"
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Quick Start
|
|
63
|
+
|
|
64
|
+
### Command line
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
# Fix a single PDF
|
|
68
|
+
pdforienter invoice.pdf --output ./fixed
|
|
69
|
+
|
|
70
|
+
# Fix every PDF in a directory
|
|
71
|
+
pdforienter /scans/ --output /corrected
|
|
72
|
+
|
|
73
|
+
# Mix files and directories
|
|
74
|
+
pdforienter report.pdf /archive/ receipts.pdf --output ./out
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Python API
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from pdforienter import run_pipeline
|
|
81
|
+
from pdforienter.logging.writer import write_log
|
|
82
|
+
|
|
83
|
+
result = run_pipeline(
|
|
84
|
+
pdf_paths=["invoice.pdf", "report.pdf"],
|
|
85
|
+
output_dir="./corrected",
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
# Write the structured log file
|
|
89
|
+
log_path = write_log(result, "./corrected")
|
|
90
|
+
|
|
91
|
+
print(f"{result.total_pages_changed} pages corrected in {result.total_duration_seconds:.1f}s")
|
|
92
|
+
print(f"Log: {log_path}")
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## Log File
|
|
98
|
+
|
|
99
|
+
Every run produces a timestamped `.log` file in the output directory.
|
|
100
|
+
|
|
101
|
+
```
|
|
102
|
+
PDFOrienter Run Log — 2024-11-01 14:32:05
|
|
103
|
+
============================================================
|
|
104
|
+
|
|
105
|
+
[RUN SUMMARY]
|
|
106
|
+
Total files processed : 3
|
|
107
|
+
Total pages : 247
|
|
108
|
+
Pages rotated : 18
|
|
109
|
+
Text pages : 201
|
|
110
|
+
Scanned pages (OCR) : 46
|
|
111
|
+
Skipped pages : 0
|
|
112
|
+
Workers used : 6
|
|
113
|
+
Peak RAM usage : 312.4 MB
|
|
114
|
+
Total time : 42.18s
|
|
115
|
+
|
|
116
|
+
------------------------------------------------------------
|
|
117
|
+
[FILE] /scans/invoice.pdf
|
|
118
|
+
Output : /corrected/invoice_corrected.pdf
|
|
119
|
+
Total pages : 12
|
|
120
|
+
Pages changed : 3
|
|
121
|
+
Text pages : 8
|
|
122
|
+
Scanned pages : 4
|
|
123
|
+
Skipped pages : 0
|
|
124
|
+
Detection time : 9.41s
|
|
125
|
+
Correction time : 0.23s
|
|
126
|
+
Total time : 9.64s
|
|
127
|
+
[PAGE DETAILS]
|
|
128
|
+
p 1 | text | OK | angle= 0° | conf= 98.2 | 0.11s | No rotation needed.
|
|
129
|
+
p 2 | scanned | CHANGED | angle= 90° | conf= 87.5 | 2.34s | Rotation of 90° detected (confidence 87.5).
|
|
130
|
+
...
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## Project Structure
|
|
136
|
+
|
|
137
|
+
```
|
|
138
|
+
pdforienter/
|
|
139
|
+
├── pdforienter/
|
|
140
|
+
│ ├── __init__.py # Public API: run_pipeline
|
|
141
|
+
│ ├── config.py # Tuneable constants (worker count, thresholds)
|
|
142
|
+
│ ├── models.py # Typed data classes (PageResult, FileResult, RunResult)
|
|
143
|
+
│ ├── cli.py # Command-line interface
|
|
144
|
+
│ ├── core/
|
|
145
|
+
│ │ ├── pipeline.py # Top-level orchestrator
|
|
146
|
+
│ │ ├── processor.py # Per-file orchestrator (Phase 1 + Phase 2)
|
|
147
|
+
│ │ ├── analyzer.py # Per-page worker (dispatched to subprocess)
|
|
148
|
+
│ │ ├── classifier.py # Text vs scanned page detection
|
|
149
|
+
│ │ ├── detector.py # Orientation detection (text + OSD strategies)
|
|
150
|
+
│ │ └── corrector.py # Single-pass rotation applier
|
|
151
|
+
│ ├── logging/
|
|
152
|
+
│ │ ├── formatter.py # RunResult → structured log string
|
|
153
|
+
│ │ └── writer.py # Write log file to disk
|
|
154
|
+
│ └── utils/
|
|
155
|
+
│ ├── fs.py # Filesystem helpers
|
|
156
|
+
│ └── resources.py # CPU / RAM telemetry
|
|
157
|
+
├── tests/
|
|
158
|
+
│ └── test_core.py
|
|
159
|
+
├── pyproject.toml
|
|
160
|
+
└── README.md
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
---
|
|
164
|
+
|
|
165
|
+
## Configuration
|
|
166
|
+
|
|
167
|
+
Most tuneable constants live in `pdforienter/config.py`; the underscore-prefixed ones are defined in the module named in parentheses.
|
|
168
|
+
|
|
169
|
+
| Constant | Default | Description |
|
|
170
|
+
|---|---|---|
|
|
171
|
+
| `MAX_WORKERS` | `floor(cpu_count × 0.75)` | Worker processes for parallel page analysis |
|
|
172
|
+
| `OSD_CONFIDENCE_THRESHOLD` | `10.0` | Minimum Tesseract OSD confidence to trust a result |
|
|
173
|
+
| `TESSERACT_OSD_PSM` | `0` | Tesseract page segmentation mode (0 = OSD only) |
|
|
174
|
+
| `_RENDER_DPI` (detector.py) | `150` | DPI used when rasterising pages for OSD |
|
|
175
|
+
| `_MIN_CHAR_COUNT` (classifier.py) | `20` | Minimum characters to classify a page as text-based |
|
|
176
|
+
|
|
177
|
+
---
|
|
178
|
+
|
|
179
|
+
## How It Works
|
|
180
|
+
|
|
181
|
+
### Phase 1 — Parallel Detection
|
|
182
|
+
|
|
183
|
+
Each page is dispatched to a subprocess worker via `ProcessPoolExecutor`. Workers run concurrently up to `MAX_WORKERS`.
|
|
184
|
+
|
|
185
|
+
For each page:
|
|
186
|
+
1. **Classify** — does the page have selectable text?
|
|
187
|
+
2. **Detect orientation**
|
|
188
|
+
- *Text page* → analyse character direction vectors (fast, no OCR)
|
|
189
|
+
- *Scanned page* → rasterise at 150 DPI and run Tesseract OSD
|
|
190
|
+
3. Return a `PageResult` with the detected angle, confidence, and timing
|
|
191
|
+
|
|
192
|
+
### Phase 2 — Single-Pass Correction
|
|
193
|
+
|
|
194
|
+
After all pages are analysed, a single `fitz.Document.save()` call applies every rotation and writes the corrected PDF. No intermediate files are created.
|
|
195
|
+
|
|
196
|
+
---
|
|
197
|
+
|
|
198
|
+
## Performance
|
|
199
|
+
|
|
200
|
+
Typical estimates on an 8-core server (6 workers) with mixed text/scanned PDFs:
|
|
201
|
+
|
|
202
|
+
| Scenario | Estimate |
|
|
203
|
+
|---|---|
|
|
204
|
+
| 2 000 pages, all text-based | ~1–2 minutes |
|
|
205
|
+
| 2 000 pages, mixed 50/50 | ~7–8 minutes |
|
|
206
|
+
| 2 000 pages, all scanned | ~15–17 minutes |
|
|
207
|
+
|
|
208
|
+
RAM usage is roughly 200–400 MB per Tesseract worker, so 6 workers peak at about 2.5 GB — well within a 16 GB server.
|
|
209
|
+
|
|
210
|
+
---
|
|
211
|
+
|
|
212
|
+
## Running Tests
|
|
213
|
+
|
|
214
|
+
```bash
|
|
215
|
+
pytest tests/ -v
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
---
|
|
219
|
+
|
|
220
|
+
## License
|
|
221
|
+
|
|
222
|
+
MIT
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PDFOrienter — Intelligent, parallel PDF page rotation correction.
|
|
3
|
+
|
|
4
|
+
Public API
|
|
5
|
+
----------
|
|
6
|
+
run_pipeline(pdf_paths, output_dir) -> RunResult
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from pdforienter.models import FileResult, PageResult, PageType, RunResult
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"run_pipeline",
|
|
15
|
+
"RunResult",
|
|
16
|
+
"FileResult",
|
|
17
|
+
"PageResult",
|
|
18
|
+
"PageType",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
__version__ = "0.1.0"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def __getattr__(name: str) -> Any:
    """Resolve lazily exported package attributes (module-level hook).

    ``run_pipeline`` is imported only when it is first requested, so that
    importing the package does not pull in the pipeline module and its
    transitive pytesseract / PyMuPDF dependencies.

    Args:
        name: Attribute name that was not found in the module namespace.

    Returns:
        The ``run_pipeline`` callable when *name* is ``"run_pipeline"``.

    Raises:
        AttributeError: For every other attribute name.
    """
    # Guard clause first: anything but the single lazy export is unknown.
    if name != "run_pipeline":
        raise AttributeError(f"module 'pdforienter' has no attribute {name!r}")

    from pdforienter.core.pipeline import run_pipeline

    return run_pipeline
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Command-line interface for PDFOrienter.
|
|
3
|
+
|
|
4
|
+
Usage
|
|
5
|
+
-----
|
|
6
|
+
pdforienter <pdf_or_dir> [<pdf_or_dir> ...] --output <dir>
|
|
7
|
+
|
|
8
|
+
Examples
|
|
9
|
+
--------
|
|
10
|
+
pdforienter invoice.pdf --output ./fixed
|
|
11
|
+
pdforienter /scans/ report.pdf --output /corrected
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import sys
|
|
18
|
+
|
|
19
|
+
from pdforienter.core.pipeline import run_pipeline
|
|
20
|
+
from pdforienter.logging.writer import write_log
|
|
21
|
+
from pdforienter.utils.fs import resolve_pdf_paths
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``pdforienter`` command.

    The parser takes one or more positional inputs (PDF files or
    directories) and a mandatory ``--output`` / ``-o`` directory.

    Returns:
        A configured :class:`argparse.ArgumentParser`.
    """
    cli = argparse.ArgumentParser(
        description="Automatically fix PDF page orientations.",
        prog="pdforienter",
    )

    # Positional: at least one file or directory must be supplied.
    cli.add_argument(
        "inputs",
        metavar="PDF_OR_DIR",
        nargs="+",
        help="One or more PDF files or directories to process.",
    )

    # Destination for corrected PDFs and the run log; always required.
    cli.add_argument(
        "--output", "-o",
        metavar="OUTPUT_DIR",
        required=True,
        help="Directory where corrected PDFs and the log will be saved.",
    )

    return cli
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the command-line interface and return a process exit code.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``1`` when the inputs resolve to no PDF files, otherwise ``0`` after
        the pipeline has run and the log file has been written.
    """
    options = build_parser().parse_args(argv)

    targets = resolve_pdf_paths(options.inputs)
    if not targets:
        # Nothing to process: report on stderr and signal failure.
        print("No PDF files found in the provided paths.", file=sys.stderr)
        return 1

    print(f"Processing {len(targets)} PDF file(s)…")
    outcome = run_pipeline(targets, options.output)
    log_file = write_log(outcome, options.output)

    summary = (
        f"\nDone. {outcome.total_pages_changed}/{outcome.total_pages} pages rotated "
        f"across {outcome.total_files} file(s) in {outcome.total_duration_seconds:.1f}s."
    )
    print(summary)
    print(f"Log written to: {log_file}")
    return 0
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
if __name__ == "__main__":
|
|
66
|
+
sys.exit(main())
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Global configuration and tuneable constants for PDFOrienter.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
import os
|
|
7
|
+
|
|
8
|
+
# ---------------------------------------------------------------------------
# Worker pool
# ---------------------------------------------------------------------------
# Size the pool at three quarters of the logical CPUs reported by the OS,
# never fewer than one worker, so the host keeps headroom for the OS and
# for other processes.
CPU_COUNT: int = os.cpu_count() or 1
MAX_WORKERS: int = max(math.floor(0.75 * CPU_COUNT), 1)

# ---------------------------------------------------------------------------
# Tesseract / OSD
# ---------------------------------------------------------------------------
# Page Segmentation Mode 0 asks Tesseract for Orientation and Script
# Detection only; no character recognition runs, so it is much faster than
# full OCR.
TESSERACT_OSD_PSM: int = 0

# OSD results with a confidence below this value are not trusted.
OSD_CONFIDENCE_THRESHOLD: float = 10.0

# ---------------------------------------------------------------------------
# Rotation
# ---------------------------------------------------------------------------
# The discrete angles, in degrees, that count as valid rotations.
VALID_ROTATIONS: tuple[int, ...] = (0, 90, 180, 270)

# ---------------------------------------------------------------------------
# File limits
# ---------------------------------------------------------------------------
# File-size limit in megabytes.
MAX_FILE_SIZE_MB: int = 200

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# strftime-style pattern for log timestamps.
LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S"
|
|
File without changes
|