pdforienter 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. pdforienter-0.1.0/LICENSE +21 -0
  2. pdforienter-0.1.0/PKG-INFO +259 -0
  3. pdforienter-0.1.0/README.md +222 -0
  4. pdforienter-0.1.0/pdforienter/__init__.py +30 -0
  5. pdforienter-0.1.0/pdforienter/cli.py +66 -0
  6. pdforienter-0.1.0/pdforienter/config.py +40 -0
  7. pdforienter-0.1.0/pdforienter/core/__init__.py +0 -0
  8. pdforienter-0.1.0/pdforienter/core/analyzer.py +80 -0
  9. pdforienter-0.1.0/pdforienter/core/classifier.py +18 -0
  10. pdforienter-0.1.0/pdforienter/core/corrector.py +61 -0
  11. pdforienter-0.1.0/pdforienter/core/detector.py +91 -0
  12. pdforienter-0.1.0/pdforienter/core/pipeline.py +55 -0
  13. pdforienter-0.1.0/pdforienter/core/processor.py +116 -0
  14. pdforienter-0.1.0/pdforienter/logging/__init__.py +0 -0
  15. pdforienter-0.1.0/pdforienter/logging/formatter.py +87 -0
  16. pdforienter-0.1.0/pdforienter/logging/writer.py +25 -0
  17. pdforienter-0.1.0/pdforienter/models.py +66 -0
  18. pdforienter-0.1.0/pdforienter/utils/__init__.py +0 -0
  19. pdforienter-0.1.0/pdforienter/utils/fs.py +32 -0
  20. pdforienter-0.1.0/pdforienter/utils/resources.py +22 -0
  21. pdforienter-0.1.0/pdforienter.egg-info/PKG-INFO +259 -0
  22. pdforienter-0.1.0/pdforienter.egg-info/SOURCES.txt +27 -0
  23. pdforienter-0.1.0/pdforienter.egg-info/dependency_links.txt +1 -0
  24. pdforienter-0.1.0/pdforienter.egg-info/entry_points.txt +2 -0
  25. pdforienter-0.1.0/pdforienter.egg-info/requires.txt +10 -0
  26. pdforienter-0.1.0/pdforienter.egg-info/top_level.txt +1 -0
  27. pdforienter-0.1.0/pyproject.toml +74 -0
  28. pdforienter-0.1.0/setup.cfg +4 -0
  29. pdforienter-0.1.0/tests/test_core.py +106 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 InfinitiBit GmbH
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,259 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdforienter
3
+ Version: 0.1.0
4
+ Summary: Intelligent, parallel PDF page rotation correction.
5
+ Author: InfinitiBit GmbH
6
+ Maintainer: InfinitiBit GmbH
7
+ License: MIT
8
+ Project-URL: Homepage, https://github.com/MdRahmatUllah/pdforienter
9
+ Project-URL: Repository, https://github.com/MdRahmatUllah/pdforienter
10
+ Project-URL: Issues, https://github.com/MdRahmatUllah/pdforienter/issues
11
+ Project-URL: Documentation, https://github.com/MdRahmatUllah/pdforienter/blob/main/TECHNICAL.md
12
+ Keywords: pdf,rotation,ocr,tesseract,document
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Utilities
22
+ Classifier: Topic :: Multimedia :: Graphics :: Graphics Conversion
23
+ Classifier: Topic :: Text Processing
24
+ Requires-Python: >=3.10
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: PyMuPDF>=1.23
28
+ Requires-Dist: pytesseract>=0.3.10
29
+ Requires-Dist: Pillow>=10.0
30
+ Requires-Dist: psutil>=5.9
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=7; extra == "dev"
33
+ Requires-Dist: pytest-cov; extra == "dev"
34
+ Requires-Dist: ruff; extra == "dev"
35
+ Requires-Dist: mypy; extra == "dev"
36
+ Dynamic: license-file
37
+
38
+ # PDFOrienter
39
+
40
+ **Intelligent, parallel PDF page rotation correction for Python.**
41
+
42
+ PDFOrienter analyses every page of one or more PDF files, detects incorrect orientations, and fixes them in a single write pass — with no unnecessary re-processing.
43
+
44
+ ---
45
+
46
+ ## Features
47
+
48
+ - **Two-phase pipeline** — detect all pages in parallel, then apply all corrections in a single write
49
+ - **Smart strategy selection** — uses fast text-direction analysis for text-based pages; falls back to Tesseract OSD only for image/scanned pages
50
+ - **Dynamic parallelism** — automatically uses 75 % of available CPU cores; scales from 4 to 64+ cores without any configuration change
51
+ - **Detailed structured logging** — per-page and per-file timing, rotation details, confidence scores, RAM and CPU usage
52
+ - **Zero intermediate files** — corrected PDFs are written once; originals are never modified
53
+ - **Package-ready** — clean modular design, typed, fully testable
54
+
55
+ ---
56
+
57
+ ## Requirements
58
+
59
+ ### Python
60
+
61
+ Python 3.10 or newer.
62
+
63
+ ### System dependency — Tesseract
64
+
65
+ Tesseract must be installed on the host system **before** installing PDFOrienter.
66
+
67
+ **Ubuntu / Debian**
68
+ ```bash
69
+ sudo apt-get update && sudo apt-get install -y tesseract-ocr
70
+ ```
71
+
72
+ **macOS (Homebrew)**
73
+ ```bash
74
+ brew install tesseract
75
+ ```
76
+
77
+ **Windows**
78
+
79
+ Download and run the installer from the [Tesseract UB Mannheim releases](https://github.com/UB-Mannheim/tesseract/wiki), then add the install directory to your `PATH`.
80
+
81
+ ---
82
+
83
+ ## Installation
84
+
85
+ ```bash
86
+ pip install pdforienter
87
+ ```
88
+
89
+ For development (includes linting + test tools):
90
+
91
+ ```bash
92
+ git clone https://github.com/MdRahmatUllah/pdforienter.git
93
+ cd pdforienter
94
+ pip install -e ".[dev]"
95
+ ```
96
+
97
+ ---
98
+
99
+ ## Quick Start
100
+
101
+ ### Command line
102
+
103
+ ```bash
104
+ # Fix a single PDF
105
+ pdforienter invoice.pdf --output ./fixed
106
+
107
+ # Fix every PDF in a directory
108
+ pdforienter /scans/ --output /corrected
109
+
110
+ # Mix files and directories
111
+ pdforienter report.pdf /archive/ receipts.pdf --output ./out
112
+ ```
113
+
114
+ ### Python API
115
+
116
+ ```python
117
+ from pdforienter import run_pipeline
118
+ from pdforienter.logging.writer import write_log
119
+
120
+ result = run_pipeline(
121
+ pdf_paths=["invoice.pdf", "report.pdf"],
122
+ output_dir="./corrected",
123
+ )
124
+
125
+ # Write the structured log file
126
+ log_path = write_log(result, "./corrected")
127
+
128
+ print(f"{result.total_pages_changed} pages corrected in {result.total_duration_seconds:.1f}s")
129
+ print(f"Log: {log_path}")
130
+ ```
131
+
132
+ ---
133
+
134
+ ## Log File
135
+
136
+ Every command-line run writes a timestamped `.log` file to the output directory; when using the Python API, call `write_log(result, output_dir)` yourself to produce it.
137
+
138
+ ```
139
+ PDFOrienter Run Log — 2024-11-01 14:32:05
140
+ ============================================================
141
+
142
+ [RUN SUMMARY]
143
+ Total files processed : 3
144
+ Total pages : 247
145
+ Pages rotated : 18
146
+ Text pages : 201
147
+ Scanned pages (OCR) : 46
148
+ Skipped pages : 0
149
+ Workers used : 6
150
+ Peak RAM usage : 312.4 MB
151
+ Total time : 42.18s
152
+
153
+ ------------------------------------------------------------
154
+ [FILE] /scans/invoice.pdf
155
+ Output : /corrected/invoice_corrected.pdf
156
+ Total pages : 12
157
+ Pages changed : 3
158
+ Text pages : 8
159
+ Scanned pages : 4
160
+ Skipped pages : 0
161
+ Detection time : 9.41s
162
+ Correction time : 0.23s
163
+ Total time : 9.64s
164
+ [PAGE DETAILS]
165
+ p 1 | text | OK | angle= 0° | conf= 98.2 | 0.11s | No rotation needed.
166
+ p 2 | scanned | CHANGED | angle= 90° | conf= 87.5 | 2.34s | Rotation of 90° detected (confidence 87.5).
167
+ ...
168
+ ```
169
+
170
+ ---
171
+
172
+ ## Project Structure
173
+
174
+ ```
175
+ pdforienter/
176
+ ├── pdforienter/
177
+ │ ├── __init__.py # Public API: run_pipeline
178
+ │ ├── config.py # Tuneable constants (worker count, thresholds)
179
+ │ ├── models.py # Typed data classes (PageResult, FileResult, RunResult)
180
+ │ ├── cli.py # Command-line interface
181
+ │ ├── core/
182
+ │ │ ├── pipeline.py # Top-level orchestrator
183
+ │ │ ├── processor.py # Per-file orchestrator (Phase 1 + Phase 2)
184
+ │ │ ├── analyzer.py # Per-page worker (dispatched to subprocess)
185
+ │ │ ├── classifier.py # Text vs scanned page detection
186
+ │ │ ├── detector.py # Orientation detection (text + OSD strategies)
187
+ │ │ └── corrector.py # Single-pass rotation applier
188
+ │ ├── logging/
189
+ │ │ ├── formatter.py # RunResult → structured log string
190
+ │ │ └── writer.py # Write log file to disk
191
+ │ └── utils/
192
+ │ ├── fs.py # Filesystem helpers
193
+ │ └── resources.py # CPU / RAM telemetry
194
+ ├── tests/
195
+ │ └── test_core.py
196
+ ├── pyproject.toml
197
+ └── README.md
198
+ ```
199
+
200
+ ---
201
+
202
+ ## Configuration
203
+
204
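+ Most tuneable constants live in `pdforienter/config.py`; the two underscore-prefixed entries below are module-level constants in the files named beside them.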
205
+
206
+ | Constant | Default | Description |
207
+ |---|---|---|
208
+ | `MAX_WORKERS` | `max(1, floor(cpu_count × 0.75))` | Worker processes for parallel page analysis |
209
+ | `OSD_CONFIDENCE_THRESHOLD` | `10.0` | Minimum Tesseract OSD confidence to trust a result |
210
+ | `TESSERACT_OSD_PSM` | `0` | Tesseract page segmentation mode (0 = OSD only) |
211
+ | `_RENDER_DPI` (detector.py) | `150` | DPI used when rasterising pages for OSD |
212
+ | `_MIN_CHAR_COUNT` (classifier.py) | `20` | Minimum characters to classify a page as text-based |
213
+
214
+ ---
215
+
216
+ ## How It Works
217
+
218
+ ### Phase 1 — Parallel Detection
219
+
220
+ Each page is dispatched to a subprocess worker via `ProcessPoolExecutor`. Workers run concurrently up to `MAX_WORKERS`.
221
+
222
+ For each page:
223
+ 1. **Classify** — does the page have selectable text?
224
+ 2. **Detect orientation**
225
+ - *Text page* → analyse character direction vectors (fast, no OCR)
226
+ - *Scanned page* → rasterise at 150 DPI and run Tesseract OSD
227
+ 3. Return a `PageResult` with the detected angle, confidence, and timing
228
+
229
+ ### Phase 2 — Single-Pass Correction
230
+
231
+ After all pages are analysed, a single `fitz.Document.save()` call applies every rotation and writes the corrected PDF. No intermediate files are created.
232
+
233
+ ---
234
+
235
+ ## Performance
236
+
237
+ Typical estimates on an 8-core server (6 workers) with mixed text/scanned PDFs:
238
+
239
+ | Scenario | Estimate |
240
+ |---|---|
241
+ | 2 000 pages, all text-based | ~1–2 minutes |
242
+ | 2 000 pages, mixed 50/50 | ~7–8 minutes |
243
+ | 2 000 pages, all scanned | ~15–17 minutes |
244
+
245
+ RAM usage: roughly 200–400 MB per Tesseract worker, so 6 workers peak at about 2.5 GB — well within the capacity of a 16 GB server.
246
+
247
+ ---
248
+
249
+ ## Running Tests
250
+
251
+ ```bash
252
+ pytest tests/ -v
253
+ ```
254
+
255
+ ---
256
+
257
+ ## License
258
+
259
+ MIT
@@ -0,0 +1,222 @@
1
+ # PDFOrienter
2
+
3
+ **Intelligent, parallel PDF page rotation correction for Python.**
4
+
5
+ PDFOrienter analyses every page of one or more PDF files, detects incorrect orientations, and fixes them in a single write pass — with no unnecessary re-processing.
6
+
7
+ ---
8
+
9
+ ## Features
10
+
11
+ - **Two-phase pipeline** — detect all pages in parallel, then apply all corrections in a single write
12
+ - **Smart strategy selection** — uses fast text-direction analysis for text-based pages; falls back to Tesseract OSD only for image/scanned pages
13
+ - **Dynamic parallelism** — automatically uses 75 % of available CPU cores; scales from 4 to 64+ cores without any configuration change
14
+ - **Detailed structured logging** — per-page and per-file timing, rotation details, confidence scores, RAM and CPU usage
15
+ - **Zero intermediate files** — corrected PDFs are written once; originals are never modified
16
+ - **Package-ready** — clean modular design, typed, fully testable
17
+
18
+ ---
19
+
20
+ ## Requirements
21
+
22
+ ### Python
23
+
24
+ Python 3.10 or newer.
25
+
26
+ ### System dependency — Tesseract
27
+
28
+ Tesseract must be installed on the host system **before** installing PDFOrienter.
29
+
30
+ **Ubuntu / Debian**
31
+ ```bash
32
+ sudo apt-get update && sudo apt-get install -y tesseract-ocr
33
+ ```
34
+
35
+ **macOS (Homebrew)**
36
+ ```bash
37
+ brew install tesseract
38
+ ```
39
+
40
+ **Windows**
41
+
42
+ Download and run the installer from the [Tesseract UB Mannheim releases](https://github.com/UB-Mannheim/tesseract/wiki), then add the install directory to your `PATH`.
43
+
44
+ ---
45
+
46
+ ## Installation
47
+
48
+ ```bash
49
+ pip install pdforienter
50
+ ```
51
+
52
+ For development (includes linting + test tools):
53
+
54
+ ```bash
55
+ git clone https://github.com/MdRahmatUllah/pdforienter.git
56
+ cd pdforienter
57
+ pip install -e ".[dev]"
58
+ ```
59
+
60
+ ---
61
+
62
+ ## Quick Start
63
+
64
+ ### Command line
65
+
66
+ ```bash
67
+ # Fix a single PDF
68
+ pdforienter invoice.pdf --output ./fixed
69
+
70
+ # Fix every PDF in a directory
71
+ pdforienter /scans/ --output /corrected
72
+
73
+ # Mix files and directories
74
+ pdforienter report.pdf /archive/ receipts.pdf --output ./out
75
+ ```
76
+
77
+ ### Python API
78
+
79
+ ```python
80
+ from pdforienter import run_pipeline
81
+ from pdforienter.logging.writer import write_log
82
+
83
+ result = run_pipeline(
84
+ pdf_paths=["invoice.pdf", "report.pdf"],
85
+ output_dir="./corrected",
86
+ )
87
+
88
+ # Write the structured log file
89
+ log_path = write_log(result, "./corrected")
90
+
91
+ print(f"{result.total_pages_changed} pages corrected in {result.total_duration_seconds:.1f}s")
92
+ print(f"Log: {log_path}")
93
+ ```
94
+
95
+ ---
96
+
97
+ ## Log File
98
+
99
+ Every command-line run writes a timestamped `.log` file to the output directory; when using the Python API, call `write_log(result, output_dir)` yourself to produce it.
100
+
101
+ ```
102
+ PDFOrienter Run Log — 2024-11-01 14:32:05
103
+ ============================================================
104
+
105
+ [RUN SUMMARY]
106
+ Total files processed : 3
107
+ Total pages : 247
108
+ Pages rotated : 18
109
+ Text pages : 201
110
+ Scanned pages (OCR) : 46
111
+ Skipped pages : 0
112
+ Workers used : 6
113
+ Peak RAM usage : 312.4 MB
114
+ Total time : 42.18s
115
+
116
+ ------------------------------------------------------------
117
+ [FILE] /scans/invoice.pdf
118
+ Output : /corrected/invoice_corrected.pdf
119
+ Total pages : 12
120
+ Pages changed : 3
121
+ Text pages : 8
122
+ Scanned pages : 4
123
+ Skipped pages : 0
124
+ Detection time : 9.41s
125
+ Correction time : 0.23s
126
+ Total time : 9.64s
127
+ [PAGE DETAILS]
128
+ p 1 | text | OK | angle= 0° | conf= 98.2 | 0.11s | No rotation needed.
129
+ p 2 | scanned | CHANGED | angle= 90° | conf= 87.5 | 2.34s | Rotation of 90° detected (confidence 87.5).
130
+ ...
131
+ ```
132
+
133
+ ---
134
+
135
+ ## Project Structure
136
+
137
+ ```
138
+ pdforienter/
139
+ ├── pdforienter/
140
+ │ ├── __init__.py # Public API: run_pipeline
141
+ │ ├── config.py # Tuneable constants (worker count, thresholds)
142
+ │ ├── models.py # Typed data classes (PageResult, FileResult, RunResult)
143
+ │ ├── cli.py # Command-line interface
144
+ │ ├── core/
145
+ │ │ ├── pipeline.py # Top-level orchestrator
146
+ │ │ ├── processor.py # Per-file orchestrator (Phase 1 + Phase 2)
147
+ │ │ ├── analyzer.py # Per-page worker (dispatched to subprocess)
148
+ │ │ ├── classifier.py # Text vs scanned page detection
149
+ │ │ ├── detector.py # Orientation detection (text + OSD strategies)
150
+ │ │ └── corrector.py # Single-pass rotation applier
151
+ │ ├── logging/
152
+ │ │ ├── formatter.py # RunResult → structured log string
153
+ │ │ └── writer.py # Write log file to disk
154
+ │ └── utils/
155
+ │ ├── fs.py # Filesystem helpers
156
+ │ └── resources.py # CPU / RAM telemetry
157
+ ├── tests/
158
+ │ └── test_core.py
159
+ ├── pyproject.toml
160
+ └── README.md
161
+ ```
162
+
163
+ ---
164
+
165
+ ## Configuration
166
+
167
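+ Most tuneable constants live in `pdforienter/config.py`; the two underscore-prefixed entries below are module-level constants in the files named beside them.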
168
+
169
+ | Constant | Default | Description |
170
+ |---|---|---|
171
+ | `MAX_WORKERS` | `max(1, floor(cpu_count × 0.75))` | Worker processes for parallel page analysis |
172
+ | `OSD_CONFIDENCE_THRESHOLD` | `10.0` | Minimum Tesseract OSD confidence to trust a result |
173
+ | `TESSERACT_OSD_PSM` | `0` | Tesseract page segmentation mode (0 = OSD only) |
174
+ | `_RENDER_DPI` (detector.py) | `150` | DPI used when rasterising pages for OSD |
175
+ | `_MIN_CHAR_COUNT` (classifier.py) | `20` | Minimum characters to classify a page as text-based |
176
+
177
+ ---
178
+
179
+ ## How It Works
180
+
181
+ ### Phase 1 — Parallel Detection
182
+
183
+ Each page is dispatched to a subprocess worker via `ProcessPoolExecutor`. Workers run concurrently up to `MAX_WORKERS`.
184
+
185
+ For each page:
186
+ 1. **Classify** — does the page have selectable text?
187
+ 2. **Detect orientation**
188
+ - *Text page* → analyse character direction vectors (fast, no OCR)
189
+ - *Scanned page* → rasterise at 150 DPI and run Tesseract OSD
190
+ 3. Return a `PageResult` with the detected angle, confidence, and timing
191
+
192
+ ### Phase 2 — Single-Pass Correction
193
+
194
+ After all pages are analysed, a single `fitz.Document.save()` call applies every rotation and writes the corrected PDF. No intermediate files are created.
195
+
196
+ ---
197
+
198
+ ## Performance
199
+
200
+ Typical estimates on an 8-core server (6 workers) with mixed text/scanned PDFs:
201
+
202
+ | Scenario | Estimate |
203
+ |---|---|
204
+ | 2 000 pages, all text-based | ~1–2 minutes |
205
+ | 2 000 pages, mixed 50/50 | ~7–8 minutes |
206
+ | 2 000 pages, all scanned | ~15–17 minutes |
207
+
208
+ RAM usage: roughly 200–400 MB per Tesseract worker, so 6 workers peak at about 2.5 GB — well within the capacity of a 16 GB server.
209
+
210
+ ---
211
+
212
+ ## Running Tests
213
+
214
+ ```bash
215
+ pytest tests/ -v
216
+ ```
217
+
218
+ ---
219
+
220
+ ## License
221
+
222
+ MIT
@@ -0,0 +1,30 @@
1
+ """
2
+ PDFOrienter — Intelligent, parallel PDF page rotation correction.
3
+
4
+ Public API
5
+ ----------
6
+ run_pipeline(pdf_paths, output_dir) -> RunResult
7
+ """
8
+
9
+ from typing import Any
10
+
11
+ from pdforienter.models import FileResult, PageResult, PageType, RunResult
12
+
13
+ __all__ = [
14
+ "run_pipeline",  # not bound at import time; resolved on first access by the module-level __getattr__
15
+ "RunResult",
16
+ "FileResult",
17
+ "PageResult",
18
+ "PageType",
19
+ ]
20
+
21
+ __version__ = "0.1.0"  # matches the Version field in the package metadata
22
+
23
+
24
+ def __getattr__(name: str) -> Any:  # module-level attribute hook (PEP 562): invoked only when normal lookup of `name` on the package fails
25
+ # Lazy export: keep `import pdforienter` cheap by deferring the pipeline import (and its
26
+ # transitive pytesseract / PyMuPDF dependencies) until `run_pipeline` is first accessed.
27
+ if name == "run_pipeline":
28
+ from pdforienter.core.pipeline import run_pipeline
29
+ return run_pipeline  # not stored in module globals, so each later access passes through this hook again
30
+ raise AttributeError(f"module 'pdforienter' has no attribute {name!r}")  # any other unknown name is reported as a missing attribute
@@ -0,0 +1,66 @@
1
+ """
2
+ Command-line interface for PDFOrienter.
3
+
4
+ Usage
5
+ -----
6
+ pdforienter <pdf_or_dir> [<pdf_or_dir> ...] --output <dir>
7
+
8
+ Examples
9
+ --------
10
+ pdforienter invoice.pdf --output ./fixed
11
+ pdforienter /scans/ report.pdf --output /corrected
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import argparse
17
+ import sys
18
+
19
+ from pdforienter.core.pipeline import run_pipeline
20
+ from pdforienter.logging.writer import write_log
21
+ from pdforienter.utils.fs import resolve_pdf_paths
22
+
23
+
24
+ def build_parser() -> argparse.ArgumentParser:  # builds the CLI parser: one or more positional inputs plus a mandatory --output/-o directory
25
+ parser = argparse.ArgumentParser(
26
+ prog="pdforienter",
27
+ description="Automatically fix PDF page orientations.",
28
+ )
29
+ parser.add_argument(
30
+ "inputs",
31
+ nargs="+",  # at least one input is required; the values are collected into a list
32
+ metavar="PDF_OR_DIR",
33
+ help="One or more PDF files or directories to process.",
34
+ )
35
+ parser.add_argument(
36
+ "--output", "-o",
37
+ required=True,  # no default output location: the caller must always choose one
38
+ metavar="OUTPUT_DIR",
39
+ help="Directory where corrected PDFs and the log will be saved.",
40
+ )
41
+ return parser
42
+
43
+
44
+ def main(argv: list[str] | None = None) -> int:  # CLI entry point; returns the process exit code (0 on success, 1 when no PDFs were found)
45
+ parser = build_parser()
46
+ args = parser.parse_args(argv)  # argv=None makes argparse read the arguments from sys.argv
47
+
48
+ pdf_paths = resolve_pdf_paths(args.inputs)  # NOTE(review): presumably expands directories into the PDF files they contain — confirm in utils/fs.py
49
+ if not pdf_paths:  # nothing to process: report on stderr and signal failure to the shell
50
+ print("No PDF files found in the provided paths.", file=sys.stderr)
51
+ return 1
52
+
53
+ print(f"Processing {len(pdf_paths)} PDF file(s)…")
54
+ result = run_pipeline(pdf_paths, args.output)
55
+ log_path = write_log(result, args.output)  # the log is written using the same output directory argument as the pipeline
56
+
57
+ print(
58
+ f"\nDone. {result.total_pages_changed}/{result.total_pages} pages rotated "
59
+ f"across {result.total_files} file(s) in {result.total_duration_seconds:.1f}s."
60
+ )
61
+ print(f"Log written to: {log_path}")
62
+ return 0
63
+
64
+
65
+ if __name__ == "__main__":  # script entry: runs only when this module is executed directly, not when imported
66
+ sys.exit(main())  # main()'s integer return value becomes the process exit status
@@ -0,0 +1,40 @@
1
+ """
2
+ Global configuration and tuneable constants for PDFOrienter.
3
+ """
4
+
5
+ import math
6
+ import os
7
+
8
+ # ---------------------------------------------------------------------------
9
+ # Worker pool
10
+ # ---------------------------------------------------------------------------
11
+ # Use 75 % of available logical CPUs, minimum 1, to leave headroom for the OS
12
+ # and other processes running on the same host. Both values are computed once, at import time.
13
+ CPU_COUNT: int = os.cpu_count() or 1  # os.cpu_count() can return None; fall back to a single CPU
14
+ MAX_WORKERS: int = max(1, math.floor(CPU_COUNT * 0.75))  # e.g. 8 CPUs -> 6 workers, 1 CPU -> 1 worker
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Tesseract / OSD
18
+ # ---------------------------------------------------------------------------
19
+ # Tesseract Page Segmentation Mode 0 = Orientation and Script Detection only.
20
+ # Much faster than full OCR — no character recognition is performed.
21
+ TESSERACT_OSD_PSM: int = 0
22
+
23
+ # Minimum OSD confidence (0-100) required to trust the detected orientation.
24
+ OSD_CONFIDENCE_THRESHOLD: float = 10.0
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Rotation
28
+ # ---------------------------------------------------------------------------
29
+ # Only these discrete angles (degrees) are considered valid rotations.
30
+ VALID_ROTATIONS: tuple[int, ...] = (0, 90, 180, 270)
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # File limits
34
+ # ---------------------------------------------------------------------------
35
+ MAX_FILE_SIZE_MB: int = 200  # NOTE(review): presumably the largest input PDF accepted, in megabytes — confirm where it is enforced
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Logging
39
+ # ---------------------------------------------------------------------------
40
+ LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S"  # strftime-style pattern, e.g. 2024-11-01 14:32:05
File without changes