pdforienter 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdforienter/__init__.py +30 -0
- pdforienter/cli.py +66 -0
- pdforienter/config.py +40 -0
- pdforienter/core/__init__.py +0 -0
- pdforienter/core/analyzer.py +80 -0
- pdforienter/core/classifier.py +18 -0
- pdforienter/core/corrector.py +61 -0
- pdforienter/core/detector.py +91 -0
- pdforienter/core/pipeline.py +55 -0
- pdforienter/core/processor.py +116 -0
- pdforienter/logging/__init__.py +0 -0
- pdforienter/logging/formatter.py +87 -0
- pdforienter/logging/writer.py +25 -0
- pdforienter/models.py +66 -0
- pdforienter/utils/__init__.py +0 -0
- pdforienter/utils/fs.py +32 -0
- pdforienter/utils/resources.py +22 -0
- pdforienter-0.1.0.dist-info/METADATA +259 -0
- pdforienter-0.1.0.dist-info/RECORD +23 -0
- pdforienter-0.1.0.dist-info/WHEEL +5 -0
- pdforienter-0.1.0.dist-info/entry_points.txt +2 -0
- pdforienter-0.1.0.dist-info/licenses/LICENSE +21 -0
- pdforienter-0.1.0.dist-info/top_level.txt +1 -0
pdforienter/__init__.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PDFOrienter — Intelligent, parallel PDF page rotation correction.
|
|
3
|
+
|
|
4
|
+
Public API
|
|
5
|
+
----------
|
|
6
|
+
run_pipeline(pdf_paths, output_dir) -> RunResult
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from pdforienter.models import FileResult, PageResult, PageType, RunResult
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"run_pipeline",
|
|
15
|
+
"RunResult",
|
|
16
|
+
"FileResult",
|
|
17
|
+
"PageResult",
|
|
18
|
+
"PageType",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
__version__ = "0.1.0"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def __getattr__(name: str) -> Any:
    """
    Resolve lazily exported package attributes (module-level ``__getattr__``).

    Only ``run_pipeline`` is served here.  It is imported from
    ``pdforienter.core.pipeline`` on first access, so importing the package
    itself does not load the pipeline module and whatever it imports.

    Raises
    ------
    AttributeError
        For every name other than ``"run_pipeline"``.
    """
    if name != "run_pipeline":
        raise AttributeError(f"module 'pdforienter' has no attribute {name!r}")

    # Deferred import: only paid for when the symbol is actually requested.
    from pdforienter.core.pipeline import run_pipeline as pipeline_entry

    return pipeline_entry
|
pdforienter/cli.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Command-line interface for PDFOrienter.
|
|
3
|
+
|
|
4
|
+
Usage
|
|
5
|
+
-----
|
|
6
|
+
pdforienter <pdf_or_dir> [<pdf_or_dir> ...] --output <dir>
|
|
7
|
+
|
|
8
|
+
Examples
|
|
9
|
+
--------
|
|
10
|
+
pdforienter invoice.pdf --output ./fixed
|
|
11
|
+
pdforienter /scans/ report.pdf --output /corrected
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import sys
|
|
18
|
+
|
|
19
|
+
from pdforienter.core.pipeline import run_pipeline
|
|
20
|
+
from pdforienter.logging.writer import write_log
|
|
21
|
+
from pdforienter.utils.fs import resolve_pdf_paths
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def build_parser() -> argparse.ArgumentParser:
    """
    Build the argument parser for the ``pdforienter`` command.

    The parser accepts one or more positional ``inputs`` (PDF files or
    directories) and a mandatory ``--output`` / ``-o`` directory.
    """
    cli = argparse.ArgumentParser(
        prog="pdforienter",
        description="Automatically fix PDF page orientations.",
    )

    # Positional: at least one file or directory must be supplied.
    input_options = {
        "nargs": "+",
        "metavar": "PDF_OR_DIR",
        "help": "One or more PDF files or directories to process.",
    }
    cli.add_argument("inputs", **input_options)

    # Required option: destination for corrected PDFs and the run log.
    output_options = {
        "required": True,
        "metavar": "OUTPUT_DIR",
        "help": "Directory where corrected PDFs and the log will be saved.",
    }
    cli.add_argument("--output", "-o", **output_options)

    return cli
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def main(argv: list[str] | None = None) -> int:
    """
    Run the command-line workflow and return a process exit code.

    Parameters
    ----------
    argv:
        Argument list to parse; ``None`` lets ``argparse`` read
        ``sys.argv``.

    Returns
    -------
    int
        ``1`` when the inputs resolve to no PDF files, otherwise ``0``
        once the pipeline has run and the log has been written.
    """
    options = build_parser().parse_args(argv)

    targets = resolve_pdf_paths(options.inputs)
    if not targets:
        print("No PDF files found in the provided paths.", file=sys.stderr)
        return 1

    print(f"Processing {len(targets)} PDF file(s)…")
    outcome = run_pipeline(targets, options.output)
    written_log = write_log(outcome, options.output)

    summary = (
        f"\nDone. {outcome.total_pages_changed}/{outcome.total_pages} pages rotated "
        f"across {outcome.total_files} file(s) in {outcome.total_duration_seconds:.1f}s."
    )
    print(summary)
    print(f"Log written to: {written_log}")
    return 0
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
if __name__ == "__main__":
|
|
66
|
+
sys.exit(main())
|
pdforienter/config.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Global configuration and tuneable constants for PDFOrienter.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
import os
|
|
7
|
+
|
|
8
|
+
# ===========================================================================
# Worker pool
# ===========================================================================
# Three quarters of the logical CPUs (rounded down, never fewer than one)
# are used for workers so the OS and other processes keep some headroom.
CPU_COUNT: int = os.cpu_count() or 1
MAX_WORKERS: int = max(1, (CPU_COUNT * 3) // 4)

# ===========================================================================
# Tesseract / OSD
# ===========================================================================
# Page Segmentation Mode 0 asks Tesseract for Orientation and Script
# Detection only; no character recognition is performed, which keeps it
# much faster than full OCR.
TESSERACT_OSD_PSM: int = 0

# OSD results whose confidence (0-100) falls below this value are not
# trusted.
OSD_CONFIDENCE_THRESHOLD: float = 10.0

# ===========================================================================
# Rotation
# ===========================================================================
# The only rotation angles (in degrees) regarded as valid.
VALID_ROTATIONS: tuple[int, ...] = (0, 90, 180, 270)

# ===========================================================================
# File limits
# ===========================================================================
MAX_FILE_SIZE_MB: int = 200

# ===========================================================================
# Logging
# ===========================================================================
LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S"
|
|
File without changes
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""
|
|
2
|
+
analyse_page — the unit of work dispatched to each worker process.
|
|
3
|
+
|
|
4
|
+
Responsibility: open one page, classify it, detect orientation, return
|
|
5
|
+
a PageResult. This function is intentionally self-contained so it can
|
|
6
|
+
be safely pickled and sent to a ProcessPoolExecutor worker.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import time
|
|
12
|
+
|
|
13
|
+
import fitz # PyMuPDF
|
|
14
|
+
|
|
15
|
+
from pdforienter.core.classifier import has_text_layer
|
|
16
|
+
from pdforienter.core.detector import osd_orientation, text_orientation
|
|
17
|
+
from pdforienter.models import PageResult, PageType
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def analyse_page(pdf_path: str, page_index: int) -> PageResult:
    """
    Classify one page, detect its orientation and report a *PageResult*.

    Pages for which ``has_text_layer`` is true are measured with
    ``text_orientation``; every other page goes through
    ``osd_orientation``.  Any exception raised while the opened page is
    being analysed is turned into a ``PageType.SKIPPED`` result rather
    than propagated.  A failure to open the document is not caught here.

    Parameters
    ----------
    pdf_path:
        Path to the source PDF.
    page_index:
        Zero-based page index within the document.

    Returns
    -------
    PageResult
        Outcome for the page, reported with a 1-based page number.
    """
    started = time.perf_counter()
    human_page_no = page_index + 1  # results are reported 1-based

    document = fitz.open(pdf_path)
    try:
        target = document[page_index]
        stored_rotation = int(target.rotation)

        # Pick the page type and the matching detector together.
        text_based = has_text_layer(target)
        kind = PageType.TEXT if text_based else PageType.SCANNED
        detect = text_orientation if text_based else osd_orientation
        angle, score = detect(target)

        needs_fix = angle != 0
        if needs_fix:
            note = f"Rotation of {angle}° detected (confidence {score:.1f})."
        else:
            note = "No rotation needed."

    except Exception as exc:  # noqa: BLE001
        # Best effort: one unreadable page must not abort the whole file.
        return PageResult(
            page_number=human_page_no,
            page_type=PageType.SKIPPED,
            detected_angle=0,
            existing_rotation=0,
            correction_applied=0,
            changed=False,
            confidence=-1.0,
            reason=f"Error during analysis: {exc}",
            duration_seconds=time.perf_counter() - started,
        )
    finally:
        document.close()

    return PageResult(
        page_number=human_page_no,
        page_type=kind,
        detected_angle=angle,
        existing_rotation=stored_rotation,
        correction_applied=angle if needs_fix else 0,
        changed=needs_fix,
        confidence=score,
        reason=note,
        duration_seconds=time.perf_counter() - started,
    )
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Determine whether a PDF page has a selectable text layer.
|
|
3
|
+
|
|
4
|
+
Responsibility: single — classify one page as TEXT or SCANNED.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import fitz # PyMuPDF
|
|
10
|
+
|
|
11
|
+
# A page needs at least this many characters (after stripping surrounding
# whitespace) to count as text-based.
_MIN_CHAR_COUNT: int = 20


def has_text_layer(page: fitz.Page) -> bool:
    """
    Tell whether *page* carries enough extractable text to analyse.

    The plain-text extraction of the page is stripped of leading and
    trailing whitespace and its length compared with ``_MIN_CHAR_COUNT``.
    """
    visible_text = page.get_text("text").strip()
    char_total = len(visible_text)
    return char_total >= _MIN_CHAR_COUNT
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Apply collected rotation corrections to a PDF in a single write pass.
|
|
3
|
+
|
|
4
|
+
Responsibility: given a list of PageResults, mutate the document's
|
|
5
|
+
rotation metadata for every changed page and save once.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import time
|
|
11
|
+
|
|
12
|
+
import fitz # PyMuPDF
|
|
13
|
+
|
|
14
|
+
from pdforienter.models import PageResult
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def apply_rotations(
    input_path: str,
    output_path: str,
    page_results: list[PageResult],
) -> float:
    """
    Rotate every changed page of *input_path* and save to *output_path*.

    For each result flagged ``changed`` the page's new rotation is the
    stored ``existing_rotation`` plus ``correction_applied``, reduced into
    [0, 360).  The document is saved once, after all pages are updated.

    Parameters
    ----------
    input_path:
        Source PDF path.
    output_path:
        Destination PDF path passed to the document's ``save``.
    page_results:
        Analysis results; entries with ``changed=False`` are ignored.

    Returns
    -------
    float
        Seconds elapsed between opening the source and closing it again.
    """
    started = time.perf_counter()

    document = fitz.open(input_path)
    try:
        for outcome in page_results:
            if outcome.changed:
                # page_number is 1-based; the document is indexed from 0.
                target = document[outcome.page_number - 1]
                combined = outcome.existing_rotation + outcome.correction_applied
                target.set_rotation(_normalise_rotation(combined))

        document.save(output_path, garbage=4, deflate=True)
    finally:
        document.close()

    return time.perf_counter() - started
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _normalise_rotation(degrees: int) -> int:
    """Wrap *degrees* into the half-open interval [0, 360)."""
    _full_turns, remainder = divmod(degrees, 360)
    return remainder
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Detect the orientation of a single PDF page.
|
|
3
|
+
|
|
4
|
+
Two strategies:
|
|
5
|
+
1. text_orientation — fast; uses PyMuPDF's character direction vectors.
|
|
6
|
+
2. osd_orientation — slower; renders the page and calls Tesseract OSD.
|
|
7
|
+
|
|
8
|
+
Responsibility: return (angle_degrees, confidence) for one page.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import fitz # PyMuPDF
|
|
14
|
+
|
|
15
|
+
from pdforienter.config import OSD_CONFIDENCE_THRESHOLD, TESSERACT_OSD_PSM
|
|
16
|
+
|
|
17
|
+
# Resolution used when rasterising a page for OSD.
|
|
18
|
+
_RENDER_DPI: int = 150 # lower = faster; sufficient for orientation detection
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# ---------------------------------------------------------------------------
|
|
22
|
+
# Text-layer strategy
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
def text_orientation(page: fitz.Page) -> tuple[int, float]:
    """
    Estimate orientation from the writing direction of text lines.

    Every line in the page's ``rawdict`` extraction votes for the angle
    its ``dir`` vector maps to; a line's vote is weighted by the number
    of spans it contains.  Lines without a ``dir`` entry count as (1, 0).

    Returns
    -------
    tuple[int, float]
        ``(angle, confidence)`` where *angle* is one of 0, 90, 180, 270
        and *confidence* is the winning angle's share of all votes as a
        percentage.  ``(0, 0.0)`` is returned when no votes were cast.
    """
    extraction = page.get_text("rawdict", flags=fitz.TEXT_PRESERVE_WHITESPACE)
    blocks = extraction["blocks"]

    votes: dict[int, int] = dict.fromkeys((0, 90, 180, 270), 0)
    for blk in blocks:
        for text_line in blk.get("lines", []):
            bucket = _direction_to_angle(text_line.get("dir", (1, 0)))
            votes[bucket] += len(text_line.get("spans", []))

    ballots = sum(votes.values())
    if not ballots:
        return 0, 0.0

    # On a tie the angle listed first in ``votes`` wins.
    winner = max(votes, key=votes.get)
    share = (votes[winner] / ballots) * 100.0
    return winner, share
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _direction_to_angle(direction: tuple[float, float]) -> int:
    """
    Map a writing-direction vector to the nearest 90-degree angle.

    The dominant axis is chosen by comparing ``abs(dx)`` with ``abs(dy)``,
    so the result really is the nearest axis and does not depend on the
    vector's length.  (Fixed cut-offs at 0.5 would misplace vectors lying
    between 45 and 60 degrees off the horizontal axis, as well as short,
    non-unit vectors.)

    Parameters
    ----------
    direction:
        ``(dx, dy)`` components of the writing direction.

    Returns
    -------
    int
        0 for +x, 180 for -x, 270 for +y and 90 for -y.  Exact diagonals
        resolve to the horizontal axis; a zero vector carries no
        direction and maps to 0 (no rotation).
    """
    dx, dy = direction

    # A degenerate vector says nothing about orientation: vote "upright".
    if dx == 0 and dy == 0:
        return 0

    if abs(dx) >= abs(dy):
        return 0 if dx > 0 else 180
    return 270 if dy > 0 else 90
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# ---------------------------------------------------------------------------
|
|
63
|
+
# OSD (Tesseract) strategy
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
|
|
66
|
+
def osd_orientation(page: fitz.Page) -> tuple[int, float]:
    """
    Detect orientation by rendering *page* and running Tesseract OSD.

    The page is rasterised to a greyscale image at ``_RENDER_DPI`` and
    handed to ``pytesseract.image_to_osd``.  When the reported
    ``orientation_conf`` is below ``OSD_CONFIDENCE_THRESHOLD`` the angle
    is replaced by 0.

    Returns
    -------
    tuple[int, float]
        ``(angle, confidence)``.  Any exception, including a failed import
        of the OCR dependencies, yields ``(0, 0.0)``.
    """
    try:
        # Imported lazily so workloads that never reach OSD do not have
        # to load the OCR stack.
        import pytesseract
        from PIL import Image

        raster = page.get_pixmap(dpi=_RENDER_DPI, colorspace=fitz.csGRAY)
        picture = Image.frombytes("L", (raster.width, raster.height), raster.samples)
        report = pytesseract.image_to_osd(
            picture,
            config=f"--psm {TESSERACT_OSD_PSM}",
            output_type=pytesseract.Output.DICT,
        )

        rotate_by = int(report.get("rotate", 0))
        score = float(report.get("orientation_conf", 0.0))
        # Below the threshold the detection is ignored, but the score is
        # still reported to the caller.
        return (0 if score < OSD_CONFIDENCE_THRESHOLD else rotate_by), score
    except Exception:  # noqa: BLE001
        return 0, 0.0
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""
|
|
2
|
+
run_pipeline — public entry point for PDFOrienter.
|
|
3
|
+
|
|
4
|
+
Accepts one or more PDF paths, processes them (potentially in parallel
|
|
5
|
+
across files), aggregates results, and returns a RunResult.
|
|
6
|
+
|
|
7
|
+
Responsibility: top-level orchestration only; no PDF logic here.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import time
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from pdforienter.config import MAX_WORKERS
|
|
16
|
+
from pdforienter.core.processor import process_file
|
|
17
|
+
from pdforienter.models import FileResult, RunResult
|
|
18
|
+
from pdforienter.utils.fs import ensure_dir
|
|
19
|
+
from pdforienter.utils.resources import peak_ram_mb
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def run_pipeline(
    pdf_paths: list[str],
    output_dir: str,
) -> RunResult:
    """
    Process each PDF in turn and summarise the run as a *RunResult*.

    The output directory is created first.  Files are then handled one
    after another by ``process_file`` (each path is resolved to an
    absolute path beforehand) and their per-file counters are summed.

    Parameters
    ----------
    pdf_paths:
        Absolute or relative paths to PDF files.
    output_dir:
        Directory that receives the corrected PDFs.

    Returns
    -------
    RunResult
        Aggregated totals plus the individual ``FileResult`` objects.
    """
    ensure_dir(output_dir)
    started = time.perf_counter()

    per_file: list[FileResult] = []
    for raw_path in pdf_paths:
        absolute = str(Path(raw_path).resolve())
        per_file.append(process_file(absolute, output_dir))

    def grand_total(attribute: str) -> int:
        """Sum one integer counter over all file results."""
        return sum(getattr(entry, attribute) for entry in per_file)

    return RunResult(
        total_files=len(per_file),
        total_pages=grand_total("total_pages"),
        total_pages_changed=grand_total("pages_changed"),
        total_text_pages=grand_total("text_pages"),
        total_scanned_pages=grand_total("scanned_pages"),
        total_skipped_pages=grand_total("skipped_pages"),
        workers_used=MAX_WORKERS,
        peak_ram_mb=peak_ram_mb(),
        total_duration_seconds=time.perf_counter() - started,
        file_results=per_file,
    )
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Orchestrate the two-phase pipeline for a single PDF file.
|
|
3
|
+
|
|
4
|
+
Phase 1 — Parallel detection : all pages analysed concurrently.
|
|
5
|
+
Phase 2 — Single-pass correction: all rotations applied in one write.
|
|
6
|
+
|
|
7
|
+
Responsibility: coordinate analyzer + corrector for one file.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import time
|
|
13
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
import fitz # PyMuPDF
|
|
17
|
+
|
|
18
|
+
from pdforienter.config import MAX_WORKERS
|
|
19
|
+
from pdforienter.core.analyzer import analyse_page
|
|
20
|
+
from pdforienter.core.corrector import apply_rotations
|
|
21
|
+
from pdforienter.models import FileResult, PageResult, PageType
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def process_file(input_path: str, output_dir: str) -> FileResult:
    """
    Detect and correct page orientation for a single PDF.

    The page count is read, every page is analysed, and the corrected
    file is written.  If any of those steps raises, a ``FileResult`` with
    zeroed counters and the exception text in ``error`` is returned.

    Parameters
    ----------
    input_path:
        Path to the source PDF.
    output_dir:
        Directory where the corrected PDF will be written.

    Returns
    -------
    FileResult
        Per-file counters, timings and the individual page results.
    """
    started = time.perf_counter()
    destination = _build_output_path(input_path, output_dir)

    try:
        n_pages = _page_count(input_path)
        outcomes = _detect_all_pages(input_path, n_pages)
        correcting_secs = _correct_file(input_path, destination, outcomes)
    except Exception as exc:  # noqa: BLE001
        return FileResult(
            input_path=input_path,
            output_path=destination,
            total_pages=0,
            pages_changed=0,
            text_pages=0,
            scanned_pages=0,
            skipped_pages=0,
            error=str(exc),
            total_duration_seconds=time.perf_counter() - started,
        )

    def tally(kind: PageType) -> int:
        """Count the analysed pages classified as *kind*."""
        return sum(1 for outcome in outcomes if outcome.page_type == kind)

    # Sum of the per-page durations reported by the workers.
    detecting_secs = sum(outcome.duration_seconds for outcome in outcomes)

    return FileResult(
        input_path=input_path,
        output_path=destination,
        total_pages=n_pages,
        pages_changed=sum(1 for outcome in outcomes if outcome.changed),
        text_pages=tally(PageType.TEXT),
        scanned_pages=tally(PageType.SCANNED),
        skipped_pages=tally(PageType.SKIPPED),
        page_results=outcomes,
        detection_duration_seconds=detecting_secs,
        correction_duration_seconds=correcting_secs,
        total_duration_seconds=time.perf_counter() - started,
    )
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# ---------------------------------------------------------------------------
|
|
73
|
+
# Internal helpers
|
|
74
|
+
# ---------------------------------------------------------------------------
|
|
75
|
+
|
|
76
|
+
def _page_count(pdf_path: str) -> int:
    """
    Return the number of pages in the PDF at *pdf_path*.

    The document is closed in a ``finally`` clause so the handle is
    released even if reading ``page_count`` raises.
    """
    doc = fitz.open(pdf_path)
    try:
        return int(doc.page_count)
    finally:
        doc.close()
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _detect_all_pages(pdf_path: str, page_count: int) -> list[PageResult]:
    """
    Analyse every page in a process pool and return results in page order.

    One ``analyse_page`` task is submitted per page index.  Results are
    gathered as tasks finish and re-ordered by index before returning.
    """
    by_index: dict[int, PageResult] = {}

    with ProcessPoolExecutor(max_workers=MAX_WORKERS) as pool:
        pending = {}
        for page_idx in range(page_count):
            pending[pool.submit(analyse_page, pdf_path, page_idx)] = page_idx

        for finished in as_completed(pending):
            by_index[pending[finished]] = finished.result()

    return list(map(by_index.__getitem__, range(page_count)))
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _correct_file(
    input_path: str,
    output_path: str,
    page_results: list[PageResult],
) -> float:
    """
    Produce the output file, rotating pages only when needed.

    If at least one page result is flagged ``changed`` the work is
    delegated to ``apply_rotations`` and its duration is returned.
    Otherwise the input is copied to the output verbatim and ``0.0`` is
    returned.
    """
    if any(outcome.changed for outcome in page_results):
        return apply_rotations(input_path, output_path, page_results)

    # No page needs rotating: a plain copy is sufficient.
    import shutil

    shutil.copy2(input_path, output_path)
    return 0.0
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _build_output_path(input_path: str, output_dir: str) -> str:
    """Return ``<output_dir>/<input stem>_corrected.pdf`` as a string."""
    corrected_name = f"{Path(input_path).stem}_corrected.pdf"
    return str(Path(output_dir).joinpath(corrected_name))
|
|
File without changes
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Format a RunResult into a human-readable, structured text log.
|
|
3
|
+
|
|
4
|
+
Responsibility: serialisation only — no I/O, no business logic.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
|
|
11
|
+
from pdforienter.config import LOG_DATE_FORMAT
|
|
12
|
+
from pdforienter.models import FileResult, RunResult
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def format_run_log(result: RunResult) -> str:
    """
    Render *result* as the complete log text.

    The log consists of the header, the run summary, a separator, one
    section per file result and the footer, joined by newlines.
    """
    parts = [_header(), _run_summary(result), _separator()]
    parts.extend(_file_section(entry) for entry in result.file_results)
    parts.append(_footer())
    return "\n".join(parts)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
# Section builders
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
|
|
31
|
+
def _header() -> str:
    """Return the title line stamped with the current local time, plus a rule."""
    stamp = datetime.now().strftime(LOG_DATE_FORMAT)
    title = f"PDFOrienter Run Log — {stamp}"
    rule = "=" * 60
    return f"{title}\n{rule}"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _run_summary(r: RunResult) -> str:
    """Return the ``[RUN SUMMARY]`` section listing the run-wide totals."""
    rows = [
        "\n[RUN SUMMARY]\n",
        f" Total files processed : {r.total_files}\n",
        f" Total pages : {r.total_pages}\n",
        f" Pages rotated : {r.total_pages_changed}\n",
        f" Text pages : {r.total_text_pages}\n",
        f" Scanned pages (OCR) : {r.total_scanned_pages}\n",
        f" Skipped pages : {r.total_skipped_pages}\n",
        f" Workers used : {r.workers_used}\n",
        f" Peak RAM usage : {r.peak_ram_mb:.1f} MB\n",
        f" Total time : {r.total_duration_seconds:.2f}s\n",
    ]
    return "".join(rows)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _file_section(fr: FileResult) -> str:
    """
    Return the ``[FILE]`` section for one file result.

    The section lists the per-file counters and timings.  It ends with
    either the error line (when ``fr.error`` is truthy) or a
    ``[PAGE DETAILS]`` table with one row per page result, followed by a
    blank line.
    """
    out = [
        f"[FILE] {fr.input_path}",
        f" Output : {fr.output_path}",
        f" Total pages : {fr.total_pages}",
        f" Pages changed : {fr.pages_changed}",
        f" Text pages : {fr.text_pages}",
        f" Scanned pages : {fr.scanned_pages}",
        f" Skipped pages : {fr.skipped_pages}",
        f" Detection time : {fr.detection_duration_seconds:.2f}s",
        f" Correction time : {fr.correction_duration_seconds:.2f}s",
        f" Total time : {fr.total_duration_seconds:.2f}s",
    ]

    if not fr.error:
        out.append(" [PAGE DETAILS]")
        for page in fr.page_results:
            marker = "OK"
            if page.changed:
                marker = "CHANGED"
            row = (
                f" p{page.page_number:>4} | {page.page_type.value:<7} | "
                f"{marker:<7} | angle={page.detected_angle:>3}° | "
                f"conf={page.confidence:>5.1f} | {page.duration_seconds:.2f}s | "
                f"{page.reason}"
            )
            out.append(row)
    else:
        out.append(f" ERROR : {fr.error}")

    out.append("")  # trailing blank line separates consecutive sections
    return "\n".join(out)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _separator() -> str:
    """Return a horizontal rule made of 60 hyphens."""
    return "".ljust(60, "-")
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _footer() -> str:
    """Return the closing rule of 60 ``=`` characters and the end-of-log line."""
    rule = "=" * 60
    return f"{rule}\nEnd of log.\n"
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Write formatted log content to disk.
|
|
3
|
+
|
|
4
|
+
Responsibility: file I/O for log output only.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from pdforienter.models import RunResult
|
|
13
|
+
from pdforienter.logging.formatter import format_run_log
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def write_log(result: RunResult, output_dir: str) -> str:
    """
    Format *result* and save it as a timestamped log in *output_dir*.

    The file is named ``pdforienter_<YYYYmmdd_HHMMSS>.log`` using the
    current local time and is written as UTF-8.

    Returns
    -------
    str
        Path of the written log file, built from *output_dir* as given.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    destination = Path(output_dir).joinpath(f"pdforienter_{stamp}.log")
    rendered = format_run_log(result)
    destination.write_text(rendered, encoding="utf-8")
    return str(destination)
|
pdforienter/models.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pure-data classes used throughout PDFOrienter.
|
|
3
|
+
|
|
4
|
+
No business logic lives here — only typed containers so every module
|
|
5
|
+
shares a common vocabulary.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from enum import Enum
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class PageType(str, Enum):
    """
    Classification assigned to a page during analysis.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    # A selectable text layer is present.
    TEXT = "text"
    # Image-only page; orientation comes from OCR/OSD.
    SCANNED = "scanned"
    # The page could not be analysed.
    SKIPPED = "skipped"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class PageResult:
    """
    Outcome of analysing (and possibly correcting) one PDF page.

    Attributes
    ----------
    page_number:
        1-based page number.
    page_type:
        Classification of the page.
    detected_angle:
        Angle reported by the detector, in degrees.
    existing_rotation:
        Rotation already stored in the PDF's page metadata.
    correction_applied:
        Rotation, in degrees, recorded as the correction for this page.
    changed:
        Whether the page is to be rotated.
    confidence:
        Detector confidence on a 0-100 scale; -1 when not applicable.
    reason:
        Human-readable explanation of the outcome.
    duration_seconds:
        Wall-clock time spent on this page.
    """

    page_number: int
    page_type: PageType
    detected_angle: int
    existing_rotation: int
    correction_applied: int
    changed: bool
    confidence: float
    reason: str
    duration_seconds: float
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class FileResult:
    """
    Aggregated outcome for a single PDF file.

    The first seven fields are required.  The page list, the three
    timings and ``error`` are optional and default to an empty list,
    ``0.0`` and ``None`` respectively.
    """

    # --- required -----------------------------------------------------
    input_path: str
    output_path: str
    total_pages: int
    pages_changed: int
    text_pages: int
    scanned_pages: int
    skipped_pages: int

    # --- optional -----------------------------------------------------
    # Each instance gets its own list (default_factory), never a shared one.
    page_results: list[PageResult] = field(default_factory=list)
    detection_duration_seconds: float = 0.0
    correction_duration_seconds: float = 0.0
    total_duration_seconds: float = 0.0
    # Error text when processing the file failed; None otherwise.
    error: str | None = None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class RunResult:
    """
    Summary of a complete PDFOrienter run across all processed files.

    Holds the run-wide counters, the worker count, a RAM reading in
    megabytes, the total duration, and the per-file results (an empty
    list unless provided).
    """

    # Counters summed over every processed file.
    total_files: int
    total_pages: int
    total_pages_changed: int
    total_text_pages: int
    total_scanned_pages: int
    total_skipped_pages: int

    # Run-level telemetry.
    workers_used: int
    peak_ram_mb: float
    total_duration_seconds: float

    # Each instance gets its own list (default_factory), never a shared one.
    file_results: list[FileResult] = field(default_factory=list)
|
|
File without changes
|
pdforienter/utils/fs.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Filesystem utility helpers.
|
|
3
|
+
|
|
4
|
+
Responsibility: path / directory operations only.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def ensure_dir(directory: str) -> Path:
    """
    Make sure *directory* exists, creating missing parents as needed.

    Returns
    -------
    Path
        The directory as a ``Path`` object.
    """
    target = Path(directory)
    # exist_ok makes repeated calls on the same directory harmless.
    target.mkdir(exist_ok=True, parents=True)
    return target
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def resolve_pdf_paths(inputs: list[str]) -> list[str]:
    """
    Expand a list of paths to absolute PDF file paths.

    Directories are walked recursively and their PDF files are added in
    sorted order; explicitly named files are added when they are PDFs.
    Both cases use the same test: a regular file whose suffix is ``.pdf``
    in any letter case.  ``SCAN.PDF`` inside a directory is therefore
    found just like an explicitly passed one, and directories that merely
    end in ``.pdf`` are not returned.

    Parameters
    ----------
    inputs:
        File or directory paths, absolute or relative.

    Returns
    -------
    list[str]
        Absolute paths of the PDF files found, in input order.  Inputs
        that are neither a directory nor a PDF file are ignored.
    """
    resolved: list[str] = []
    for raw in inputs:
        candidate = Path(raw).resolve()
        if candidate.is_dir():
            matches = sorted(
                found for found in candidate.rglob("*") if _is_pdf_file(found)
            )
            resolved.extend(str(found) for found in matches)
        elif _is_pdf_file(candidate):
            resolved.append(str(candidate))
    return resolved


def _is_pdf_file(path: Path) -> bool:
    """Return True when *path* is a regular file with a ``.pdf`` suffix (any case)."""
    return path.suffix.lower() == ".pdf" and path.is_file()
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""
|
|
2
|
+
System resource measurement helpers.
|
|
3
|
+
|
|
4
|
+
Responsibility: CPU and RAM telemetry only.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
import psutil
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def peak_ram_mb() -> float:
    """
    Return the resident set size of the current process, in megabytes.

    The value is the RSS at the moment of the call, converted from bytes
    using 1 MB = 1024 ** 2 bytes.
    """
    bytes_per_mb = 1024 ** 2
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return float(rss_bytes) / bytes_per_mb
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def cpu_count() -> int:
    """
    Return the number of logical CPUs available to the current process.

    ``os.cpu_count()`` reports every CPU in the system, which can exceed
    what this process may actually use (for example under CPU affinity
    restrictions).  Where the platform provides ``os.sched_getaffinity``
    the size of the process's affinity set is returned instead.
    Otherwise, or if that query fails, the system-wide count is used.

    Returns
    -------
    int
        A value of at least 1.
    """
    affinity = getattr(os, "sched_getaffinity", None)
    if affinity is not None:
        try:
            usable = len(affinity(0))  # 0 = the calling process
        except OSError:
            usable = 0
        if usable:
            return usable
    return os.cpu_count() or 1
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdforienter
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Intelligent, parallel PDF page rotation correction.
|
|
5
|
+
Author: InfinitiBit GmbH
|
|
6
|
+
Maintainer: InfinitiBit GmbH
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/MdRahmatUllah/pdforienter
|
|
9
|
+
Project-URL: Repository, https://github.com/MdRahmatUllah/pdforienter
|
|
10
|
+
Project-URL: Issues, https://github.com/MdRahmatUllah/pdforienter/issues
|
|
11
|
+
Project-URL: Documentation, https://github.com/MdRahmatUllah/pdforienter/blob/main/TECHNICAL.md
|
|
12
|
+
Keywords: pdf,rotation,ocr,tesseract,document
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Utilities
|
|
22
|
+
Classifier: Topic :: Multimedia :: Graphics :: Graphics Conversion
|
|
23
|
+
Classifier: Topic :: Text Processing
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: PyMuPDF>=1.23
|
|
28
|
+
Requires-Dist: pytesseract>=0.3.10
|
|
29
|
+
Requires-Dist: Pillow>=10.0
|
|
30
|
+
Requires-Dist: psutil>=5.9
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
33
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
34
|
+
Requires-Dist: ruff; extra == "dev"
|
|
35
|
+
Requires-Dist: mypy; extra == "dev"
|
|
36
|
+
Dynamic: license-file
|
|
37
|
+
|
|
38
|
+
# PDFOrienter
|
|
39
|
+
|
|
40
|
+
**Intelligent, parallel PDF page rotation correction for Python.**
|
|
41
|
+
|
|
42
|
+
PDFOrienter analyses every page of one or more PDF files, detects incorrect orientations, and fixes them in a single write pass — with no unnecessary re-processing.
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## Features
|
|
47
|
+
|
|
48
|
+
- **Two-phase pipeline** — detect all pages in parallel, then apply all corrections in a single write
|
|
49
|
+
- **Smart strategy selection** — uses fast text-direction analysis for text-based pages; falls back to Tesseract OSD only for image/scanned pages
|
|
50
|
+
- **Dynamic parallelism** — automatically uses 75 % of available CPU cores; scales from 4 to 64+ cores without any configuration change
|
|
51
|
+
- **Detailed structured logging** — per-page and per-file timing, rotation details, confidence scores, RAM and CPU usage
|
|
52
|
+
- **Zero intermediate files** — corrected PDFs are written once; originals are never modified
|
|
53
|
+
- **Package-ready** — clean modular design, typed, fully testable
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Requirements
|
|
58
|
+
|
|
59
|
+
### Python
|
|
60
|
+
|
|
61
|
+
Python 3.10 or newer.
|
|
62
|
+
|
|
63
|
+
### System dependency — Tesseract
|
|
64
|
+
|
|
65
|
+
Tesseract must be installed on the host system **before** installing PDFOrienter.
|
|
66
|
+
|
|
67
|
+
**Ubuntu / Debian**
|
|
68
|
+
```bash
|
|
69
|
+
sudo apt-get update && sudo apt-get install -y tesseract-ocr
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
**macOS (Homebrew)**
|
|
73
|
+
```bash
|
|
74
|
+
brew install tesseract
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
**Windows**
|
|
78
|
+
|
|
79
|
+
Download and run the installer from the [Tesseract UB Mannheim releases](https://github.com/UB-Mannheim/tesseract/wiki), then add the install directory to your `PATH`.
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## Installation
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
pip install pdforienter
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
For development (includes linting + test tools):
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
git clone https://github.com/MdRahmatUllah/pdforienter.git
|
|
93
|
+
cd pdforienter
|
|
94
|
+
pip install -e ".[dev]"
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Quick Start
|
|
100
|
+
|
|
101
|
+
### Command line
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
# Fix a single PDF
|
|
105
|
+
pdforienter invoice.pdf --output ./fixed
|
|
106
|
+
|
|
107
|
+
# Fix every PDF in a directory
|
|
108
|
+
pdforienter /scans/ --output /corrected
|
|
109
|
+
|
|
110
|
+
# Mix files and directories
|
|
111
|
+
pdforienter report.pdf /archive/ receipts.pdf --output ./out
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Python API
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from pdforienter import run_pipeline
|
|
118
|
+
from pdforienter.logging.writer import write_log
|
|
119
|
+
|
|
120
|
+
result = run_pipeline(
|
|
121
|
+
pdf_paths=["invoice.pdf", "report.pdf"],
|
|
122
|
+
output_dir="./corrected",
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
# Write the structured log file
|
|
126
|
+
log_path = write_log(result, "./corrected")
|
|
127
|
+
|
|
128
|
+
print(f"{result.total_pages_changed} pages corrected in {result.total_duration_seconds:.1f}s")
|
|
129
|
+
print(f"Log: {log_path}")
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## Log File
|
|
135
|
+
|
|
136
|
+
Every run produces a timestamped `.log` file in the output directory.
|
|
137
|
+
|
|
138
|
+
```
|
|
139
|
+
PDFOrienter Run Log — 2024-11-01 14:32:05
|
|
140
|
+
============================================================
|
|
141
|
+
|
|
142
|
+
[RUN SUMMARY]
|
|
143
|
+
Total files processed : 3
|
|
144
|
+
Total pages : 247
|
|
145
|
+
Pages rotated : 18
|
|
146
|
+
Text pages : 201
|
|
147
|
+
Scanned pages (OCR) : 46
|
|
148
|
+
Skipped pages : 0
|
|
149
|
+
Workers used : 6
|
|
150
|
+
Peak RAM usage : 312.4 MB
|
|
151
|
+
Total time : 42.18s
|
|
152
|
+
|
|
153
|
+
------------------------------------------------------------
|
|
154
|
+
[FILE] /scans/invoice.pdf
|
|
155
|
+
Output : /corrected/invoice_corrected.pdf
|
|
156
|
+
Total pages : 12
|
|
157
|
+
Pages changed : 3
|
|
158
|
+
Text pages : 8
|
|
159
|
+
Scanned pages : 4
|
|
160
|
+
Skipped pages : 0
|
|
161
|
+
Detection time : 9.41s
|
|
162
|
+
Correction time : 0.23s
|
|
163
|
+
Total time : 9.64s
|
|
164
|
+
[PAGE DETAILS]
|
|
165
|
+
p 1 | text | OK | angle= 0° | conf= 98.2 | 0.11s | No rotation needed.
|
|
166
|
+
p 2 | scanned | CHANGED | angle= 90° | conf= 87.5 | 2.34s | Rotation of 90° detected (confidence 87.5).
|
|
167
|
+
...
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## Project Structure
|
|
173
|
+
|
|
174
|
+
```
|
|
175
|
+
pdforienter/
|
|
176
|
+
├── pdforienter/
|
|
177
|
+
│ ├── __init__.py # Public API: run_pipeline
|
|
178
|
+
│ ├── config.py # Tuneable constants (worker count, thresholds)
|
|
179
|
+
│ ├── models.py # Typed data classes (PageResult, FileResult, RunResult)
|
|
180
|
+
│ ├── cli.py # Command-line interface
|
|
181
|
+
│ ├── core/
|
|
182
|
+
│ │ ├── pipeline.py # Top-level orchestrator
|
|
183
|
+
│ │ ├── processor.py # Per-file orchestrator (Phase 1 + Phase 2)
|
|
184
|
+
│ │ ├── analyzer.py # Per-page worker (dispatched to subprocess)
|
|
185
|
+
│ │ ├── classifier.py # Text vs scanned page detection
|
|
186
|
+
│ │ ├── detector.py # Orientation detection (text + OSD strategies)
|
|
187
|
+
│ │ └── corrector.py # Single-pass rotation applier
|
|
188
|
+
│ ├── logging/
|
|
189
|
+
│ │ ├── formatter.py # RunResult → structured log string
|
|
190
|
+
│ │ └── writer.py # Write log file to disk
|
|
191
|
+
│ └── utils/
|
|
192
|
+
│ ├── fs.py # Filesystem helpers
|
|
193
|
+
│ └── resources.py # CPU / RAM telemetry
|
|
194
|
+
├── tests/
|
|
195
|
+
│ └── test_core.py
|
|
196
|
+
├── pyproject.toml
|
|
197
|
+
└── README.md
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
---
|
|
201
|
+
|
|
202
|
+
## Configuration
|
|
203
|
+
|
|
204
|
+
All tuneable constants live in `pdforienter/config.py`.
|
|
205
|
+
|
|
206
|
+
| Constant | Default | Description |
|
|
207
|
+
|---|---|---|
|
|
208
|
+
| `MAX_WORKERS` | `floor(cpu_count × 0.75)` | Worker processes for parallel page analysis |
|
|
209
|
+
| `OSD_CONFIDENCE_THRESHOLD` | `10.0` | Minimum Tesseract OSD confidence to trust a result |
|
|
210
|
+
| `TESSERACT_OSD_PSM` | `0` | Tesseract page segmentation mode (0 = OSD only) |
|
|
211
|
+
| `_RENDER_DPI` (detector.py) | `150` | DPI used when rasterising pages for OSD |
|
|
212
|
+
| `_MIN_CHAR_COUNT` (classifier.py) | `20` | Minimum characters to classify a page as text-based |
|
|
213
|
+
|
|
214
|
+
---
|
|
215
|
+
|
|
216
|
+
## How It Works
|
|
217
|
+
|
|
218
|
+
### Phase 1 — Parallel Detection
|
|
219
|
+
|
|
220
|
+
Each page is dispatched to a subprocess worker via `ProcessPoolExecutor`. Workers run concurrently up to `MAX_WORKERS`.
|
|
221
|
+
|
|
222
|
+
For each page:
|
|
223
|
+
1. **Classify** — does the page have selectable text?
|
|
224
|
+
2. **Detect orientation**
|
|
225
|
+
- *Text page* → analyse character direction vectors (fast, no OCR)
|
|
226
|
+
- *Scanned page* → rasterise at 150 DPI and run Tesseract OSD
|
|
227
|
+
3. Return a `PageResult` with the detected angle, confidence, and timing
|
|
228
|
+
|
|
229
|
+
### Phase 2 — Single-Pass Correction
|
|
230
|
+
|
|
231
|
+
After all pages are analysed, a single `fitz.Document.save()` call applies every rotation and writes the corrected PDF. No intermediate files are created.
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
## Performance
|
|
236
|
+
|
|
237
|
+
Typical estimates on an 8-core server (6 workers) with mixed text/scanned PDFs:
|
|
238
|
+
|
|
239
|
+
| Scenario | Estimate |
|
|
240
|
+
|---|---|
|
|
241
|
+
| 2 000 pages, all text-based | ~1–2 minutes |
|
|
242
|
+
| 2 000 pages, mixed 50/50 | ~7–8 minutes |
|
|
243
|
+
| 2 000 pages, all scanned | ~15–17 minutes |
|
|
244
|
+
|
|
245
|
+
RAM usage: ~200–400 MB per Tesseract worker, so 6 workers peak at roughly 2.5 GB — well within the capacity of a 16 GB server.
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
249
|
+
## Running Tests
|
|
250
|
+
|
|
251
|
+
```bash
|
|
252
|
+
pytest tests/ -v
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
## License
|
|
258
|
+
|
|
259
|
+
MIT
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
pdforienter/__init__.py,sha256=zWu665aCLdZahwZo69vWM8J2yQKfjqdI2wGanGpbENQ,758
|
|
2
|
+
pdforienter/cli.py,sha256=4N7J4LPorKH8UejoRexdGgDoVyCz8jIHDhgCiloQSt0,1733
|
|
3
|
+
pdforienter/config.py,sha256=4urbIquQOfLUMQtlC-PswQt2Eo-qAtyXEQ2ZGuBzvIk,1651
|
|
4
|
+
pdforienter/models.py,sha256=zb7kWikz3omLSitjP3PKMBJcR8LiRDdD8wOOVF2kmWc,1883
|
|
5
|
+
pdforienter/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
pdforienter/core/analyzer.py,sha256=IHrZdExx79w-K-7YWE-cU8q_4_99i91xqp4lwTVeGwo,2407
|
|
7
|
+
pdforienter/core/classifier.py,sha256=3wZhXHmAF8uKu3feW9w-zsFjgKyMYoPJQun9hGzzNOk,496
|
|
8
|
+
pdforienter/core/corrector.py,sha256=Zf_RevNTNCpZw5UswVagsjwc2XhSx9Ld-OMojjIpsqg,1530
|
|
9
|
+
pdforienter/core/detector.py,sha256=9XV6b9E1HJkzfwaTtKNu2DlljTY9g8LRbI_PbE5wOb4,3094
|
|
10
|
+
pdforienter/core/pipeline.py,sha256=CNGOWz-1G5ziVNYmMxipHvpvocIDjtQQEE2g-6qZLaw,1702
|
|
11
|
+
pdforienter/core/processor.py,sha256=5cBX-LWDytH_8DdXAU5wBbE5P6gC-fpvmWVFg4aF8Eo,3858
|
|
12
|
+
pdforienter/logging/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
|
+
pdforienter/logging/formatter.py,sha256=Rp2SgjtAi54FGosVyXQwGqdI4KNj-6JLp7h78zqYNoc,2820
|
|
14
|
+
pdforienter/logging/writer.py,sha256=BtYWZVe-wiZ1r-JzpIc15K2vh1LFrU6_5Uq2urzAqjU,686
|
|
15
|
+
pdforienter/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
|
+
pdforienter/utils/fs.py,sha256=cNzJa5O6riV-rvpitLsfzG1CvcluJrqQnLIcquRnQU0,841
|
|
17
|
+
pdforienter/utils/resources.py,sha256=F6193tGVylCYpYtsOVz3w5J0gVtRwreLCY3aFGmasvg,485
|
|
18
|
+
pdforienter-0.1.0.dist-info/licenses/LICENSE,sha256=yBPpeen2dS_xfsiwE5iPbH-ekYEKDBLZO715EvKLIkc,1073
|
|
19
|
+
pdforienter-0.1.0.dist-info/METADATA,sha256=0Ec4TJfk336-JZsypMscNYEapp7fUykG5XfpkzICnEc,8105
|
|
20
|
+
pdforienter-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
21
|
+
pdforienter-0.1.0.dist-info/entry_points.txt,sha256=HNa3K0WeJyLE91iQyjk0xodYDr93Mgx9oLWywv2-AS0,53
|
|
22
|
+
pdforienter-0.1.0.dist-info/top_level.txt,sha256=8M7epoDEGKPZEBAUa1IpCgc_C9OjF4D6R7pLJpRtO0g,12
|
|
23
|
+
pdforienter-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 InfinitiBit GmbH
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pdforienter
|