pdforienter 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,30 @@
1
+ """
2
+ PDFOrienter — Intelligent, parallel PDF page rotation correction.
3
+
4
+ Public API
5
+ ----------
6
+ run_pipeline(pdf_paths, output_dir) -> RunResult
7
+ """
8
+
9
+ from typing import Any
10
+
11
+ from pdforienter.models import FileResult, PageResult, PageType, RunResult
12
+
13
+ __all__ = [
14
+ "run_pipeline",
15
+ "RunResult",
16
+ "FileResult",
17
+ "PageResult",
18
+ "PageType",
19
+ ]
20
+
21
+ __version__ = "0.1.0"
22
+
23
+
24
+ def __getattr__(name: str) -> Any:
25
+ # Lazy export — defer importing the pipeline (and its transitive
26
+ # pytesseract / PyMuPDF dependencies) until the symbol is actually used.
27
+ if name == "run_pipeline":
28
+ from pdforienter.core.pipeline import run_pipeline
29
+ return run_pipeline
30
+ raise AttributeError(f"module 'pdforienter' has no attribute {name!r}")
pdforienter/cli.py ADDED
@@ -0,0 +1,66 @@
1
+ """
2
+ Command-line interface for PDFOrienter.
3
+
4
+ Usage
5
+ -----
6
+ pdforienter <pdf_or_dir> [<pdf_or_dir> ...] --output <dir>
7
+
8
+ Examples
9
+ --------
10
+ pdforienter invoice.pdf --output ./fixed
11
+ pdforienter /scans/ report.pdf --output /corrected
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import argparse
17
+ import sys
18
+
19
+ from pdforienter.core.pipeline import run_pipeline
20
+ from pdforienter.logging.writer import write_log
21
+ from pdforienter.utils.fs import resolve_pdf_paths
22
+
23
+
24
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``pdforienter`` command."""
    cli = argparse.ArgumentParser(
        description="Automatically fix PDF page orientations.",
        prog="pdforienter",
    )

    # Positional: at least one PDF file or directory.
    inputs_options = {
        "nargs": "+",
        "metavar": "PDF_OR_DIR",
        "help": "One or more PDF files or directories to process.",
    }
    cli.add_argument("inputs", **inputs_options)

    # Mandatory destination for the corrected PDFs and the run log.
    output_options = {
        "required": True,
        "metavar": "OUTPUT_DIR",
        "help": "Directory where corrected PDFs and the log will be saved.",
    }
    cli.add_argument("--output", "-o", **output_options)

    return cli
42
+
43
+
44
def main(argv: list[str] | None = None) -> int:
    """
    Run the command-line interface.

    Parameters
    ----------
    argv:
        Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns
    -------
    int
        Process exit status: ``0`` when every file was processed without
        error, ``1`` when no PDF was found or at least one file failed.
    """
    args = build_parser().parse_args(argv)

    pdf_paths = resolve_pdf_paths(args.inputs)
    if not pdf_paths:
        print("No PDF files found in the provided paths.", file=sys.stderr)
        return 1

    print(f"Processing {len(pdf_paths)} PDF file(s)…")
    result = run_pipeline(pdf_paths, args.output)
    log_path = write_log(result, args.output)

    print(
        f"\nDone. {result.total_pages_changed}/{result.total_pages} pages rotated "
        f"across {result.total_files} file(s) in {result.total_duration_seconds:.1f}s."
    )
    print(f"Log written to: {log_path}")

    # A file whose processing raised is recorded with ``error`` set rather
    # than aborting the run; surface those failures instead of silently
    # reporting success to the calling shell.
    failed = [fr for fr in result.file_results if fr.error]
    for fr in failed:
        print(f"Failed: {fr.input_path}: {fr.error}", file=sys.stderr)
    return 1 if failed else 0
63
+
64
+
65
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
pdforienter/config.py ADDED
@@ -0,0 +1,40 @@
1
+ """
2
+ Global configuration and tuneable constants for PDFOrienter.
3
+ """
4
+
5
+ import math
6
+ import os
7
+
8
# === Worker pool ===========================================================
# Three quarters of the logical CPUs (never fewer than one) are used so the
# OS and other processes on the same host keep some headroom.
CPU_COUNT: int = os.cpu_count() or 1
MAX_WORKERS: int = max(math.floor(0.75 * CPU_COUNT), 1)

# === Tesseract / OSD =======================================================
# Page Segmentation Mode 0 asks Tesseract for Orientation and Script
# Detection only; no character recognition is performed, which is much
# faster than full OCR.
TESSERACT_OSD_PSM: int = 0

# OSD results whose confidence (0-100) is below this value are not trusted.
OSD_CONFIDENCE_THRESHOLD: float = 10.0

# === Rotation ==============================================================
# The only discrete angles (degrees) treated as valid rotations.
VALID_ROTATIONS: tuple[int, ...] = (0, 90, 180, 270)

# === File limits ===========================================================
MAX_FILE_SIZE_MB: int = 200

# === Logging ===============================================================
LOG_DATE_FORMAT: str = "%Y-%m-%d %H:%M:%S"
File without changes
@@ -0,0 +1,80 @@
1
+ """
2
+ analyse_page — the unit of work dispatched to each worker process.
3
+
4
+ Responsibility: open one page, classify it, detect orientation, return
5
+ a PageResult. This function is intentionally self-contained so it can
6
+ be safely pickled and sent to a ProcessPoolExecutor worker.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import time
12
+
13
+ import fitz # PyMuPDF
14
+
15
+ from pdforienter.core.classifier import has_text_layer
16
+ from pdforienter.core.detector import osd_orientation, text_orientation
17
+ from pdforienter.models import PageResult, PageType
18
+
19
+
20
def analyse_page(pdf_path: str, page_index: int) -> PageResult:
    """
    Analyse a single page and return its *PageResult*.

    Any failure, including a failure to open the document, is reported as
    a ``PageType.SKIPPED`` result carrying the error text, so one bad page
    does not raise out of the worker.

    Parameters
    ----------
    pdf_path:
        Absolute path to the source PDF.
    page_index:
        Zero-based page index within the document.

    Returns
    -------
    PageResult
        Classification, detected angle, confidence and timing for the page.
    """
    start = time.perf_counter()
    page_number = page_index + 1  # convert to 1-based for reporting

    doc = None
    try:
        # Opened inside the try block so that an unreadable file is handled
        # by the same SKIPPED fallback as any other analysis error.
        doc = fitz.open(pdf_path)
        page = doc[page_index]
        existing_rotation = int(page.rotation)

        if has_text_layer(page):
            page_type = PageType.TEXT
            detected_angle, confidence = text_orientation(page)
        else:
            page_type = PageType.SCANNED
            detected_angle, confidence = osd_orientation(page)

        if detected_angle == 0:
            changed = False
            correction = 0
            reason = "No rotation needed."
        else:
            changed = True
            correction = detected_angle
            reason = f"Rotation of {detected_angle}° detected (confidence {confidence:.1f})."

    except Exception as exc:  # noqa: BLE001
        return PageResult(
            page_number=page_number,
            page_type=PageType.SKIPPED,
            detected_angle=0,
            existing_rotation=0,
            correction_applied=0,
            changed=False,
            confidence=-1.0,
            reason=f"Error during analysis: {exc}",
            duration_seconds=time.perf_counter() - start,
        )
    finally:
        # ``doc`` is still None when fitz.open itself failed.
        if doc is not None:
            doc.close()

    return PageResult(
        page_number=page_number,
        page_type=page_type,
        detected_angle=detected_angle,
        existing_rotation=existing_rotation,
        correction_applied=correction,
        changed=changed,
        confidence=confidence,
        reason=reason,
        duration_seconds=time.perf_counter() - start,
    )
@@ -0,0 +1,18 @@
1
+ """
2
+ Determine whether a PDF page has a selectable text layer.
3
+
4
+ Responsibility: single — classify one page as TEXT or SCANNED.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import fitz # PyMuPDF
10
+
11
# A page is treated as text-based only when its extracted text, ignoring
# leading and trailing whitespace, has at least this many characters.
_MIN_CHAR_COUNT: int = 20


def has_text_layer(page: fitz.Page) -> bool:
    """Tell whether *page* carries enough selectable text to analyse."""
    extracted = page.get_text("text")
    meaningful_length = len(extracted.strip())
    return meaningful_length >= _MIN_CHAR_COUNT
@@ -0,0 +1,61 @@
1
+ """
2
+ Apply collected rotation corrections to a PDF in a single write pass.
3
+
4
+ Responsibility: given a list of PageResults, mutate the document's
5
+ rotation metadata for every changed page and save once.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import time
11
+
12
+ import fitz # PyMuPDF
13
+
14
+ from pdforienter.models import PageResult
15
+
16
+
17
def apply_rotations(
    input_path: str,
    output_path: str,
    page_results: list[PageResult],
) -> float:
    """
    Rotate every changed page of *input_path* and save to *output_path*.

    The document is opened once, each page flagged ``changed=True`` gets
    its rotation set to the existing rotation plus the correction, and the
    result is saved with a single call.

    Parameters
    ----------
    input_path:
        Source PDF path.
    output_path:
        Destination PDF path (will be created / overwritten).
    page_results:
        Analysis results; pages with ``changed=False`` are left untouched.

    Returns
    -------
    float
        Wall-clock seconds spent on the correction pass.
    """
    started = time.perf_counter()

    document = fitz.open(input_path)
    try:
        pending = (outcome for outcome in page_results if outcome.changed)
        for outcome in pending:
            # page_number is 1-based; the document is indexed from 0.
            target_page = document[outcome.page_number - 1]
            combined = outcome.existing_rotation + outcome.correction_applied
            target_page.set_rotation(_normalise_rotation(combined))

        document.save(output_path, garbage=4, deflate=True)
    finally:
        document.close()

    elapsed = time.perf_counter() - started
    return elapsed
57
+
58
+
59
+ def _normalise_rotation(degrees: int) -> int:
60
+ """Reduce *degrees* into the half-open range [0, 360)."""
61
+ return degrees % 360
@@ -0,0 +1,91 @@
1
+ """
2
+ Detect the orientation of a single PDF page.
3
+
4
+ Two strategies:
5
+ 1. text_orientation — fast; uses PyMuPDF's character direction vectors.
6
+ 2. osd_orientation — slower; renders the page and calls Tesseract OSD.
7
+
8
+ Responsibility: return (angle_degrees, confidence) for one page.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import fitz # PyMuPDF
14
+
15
+ from pdforienter.config import OSD_CONFIDENCE_THRESHOLD, TESSERACT_OSD_PSM
16
+
17
+ # Resolution used when rasterising a page for OSD.
18
+ _RENDER_DPI: int = 150 # lower = faster; sufficient for orientation detection
19
+
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # Text-layer strategy
23
+ # ---------------------------------------------------------------------------
24
+
25
def text_orientation(page: fitz.Page) -> tuple[int, float]:
    """
    Estimate orientation from the writing direction of the text lines.

    Every line votes for the angle its direction vector maps to, weighted
    by the number of spans it contains.

    Returns (angle, confidence) where angle ∈ {0, 90, 180, 270} and
    confidence is the winning angle's share of all votes, in [0, 100].
    A page that yields no votes gives (0, 0.0).
    """
    raw = page.get_text("rawdict", flags=fitz.TEXT_PRESERVE_WHITESPACE)
    tally: dict[int, int] = dict.fromkeys((0, 90, 180, 270), 0)

    for block in raw["blocks"]:
        # Blocks without a "lines" entry contribute nothing.
        for line in block.get("lines", []):
            bucket = _direction_to_angle(line.get("dir", (1, 0)))
            tally[bucket] += len(line.get("spans", []))

    vote_count = sum(tally.values())
    if vote_count == 0:
        return 0, 0.0

    # max() keeps the first key on ties, i.e. the smallest angle.
    winner = max(tally, key=tally.get)
    share = (tally[winner] / vote_count) * 100.0
    return winner, share
48
+
49
+
50
+ def _direction_to_angle(direction: tuple[float, float]) -> int:
51
+ """Map a (cos, sin) direction vector to the nearest 90-degree angle."""
52
+ dx, dy = direction
53
+ if dx > 0.5:
54
+ return 0
55
+ if dx < -0.5:
56
+ return 180
57
+ if dy > 0.5:
58
+ return 270
59
+ return 90
60
+
61
+
62
+ # ---------------------------------------------------------------------------
63
+ # OSD (Tesseract) strategy
64
+ # ---------------------------------------------------------------------------
65
+
66
def osd_orientation(page: fitz.Page) -> tuple[int, float]:
    """
    Rasterise *page* and ask Tesseract OSD for its orientation.

    Returns (angle, confidence). A confidence below
    ``OSD_CONFIDENCE_THRESHOLD`` yields angle 0 with the reported
    confidence; any error yields (0, 0.0).
    """
    try:
        # Imported here rather than at module level so that text-only
        # workloads never load Tesseract (pytesseract pulls in pandas at
        # module-load time).
        import pytesseract
        from PIL import Image

        pixmap = page.get_pixmap(dpi=_RENDER_DPI, colorspace=fitz.csGRAY)
        dimensions = (pixmap.width, pixmap.height)
        greyscale = Image.frombytes("L", dimensions, pixmap.samples)

        report = pytesseract.image_to_osd(
            greyscale,
            config=f"--psm {TESSERACT_OSD_PSM}",
            output_type=pytesseract.Output.DICT,
        )
        angle = int(report.get("rotate", 0))
        confidence = float(report.get("orientation_conf", 0.0))

        trusted_angle = 0 if confidence < OSD_CONFIDENCE_THRESHOLD else angle
        return trusted_angle, confidence
    except Exception:  # noqa: BLE001
        return 0, 0.0
@@ -0,0 +1,55 @@
1
+ """
2
+ run_pipeline — public entry point for PDFOrienter.
3
+
4
+ Accepts one or more PDF paths, processes them one file at a time (the pages
5
+ of each file are analysed in parallel), aggregates results, and returns a RunResult.
6
+
7
+ Responsibility: top-level orchestration only; no PDF logic here.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import time
13
+ from pathlib import Path
14
+
15
+ from pdforienter.config import MAX_WORKERS
16
+ from pdforienter.core.processor import process_file
17
+ from pdforienter.models import FileResult, RunResult
18
+ from pdforienter.utils.fs import ensure_dir
19
+ from pdforienter.utils.resources import peak_ram_mb
20
+
21
+
22
def run_pipeline(
    pdf_paths: list[str],
    output_dir: str,
) -> RunResult:
    """
    Process each PDF in *pdf_paths* in turn and summarise the run.

    Parameters
    ----------
    pdf_paths:
        List of absolute or relative paths to PDF files.
    output_dir:
        Directory where corrected PDFs and the log file will be written;
        it is created first if needed.

    Returns
    -------
    RunResult
        Totals across all files plus the individual file results.
    """
    ensure_dir(output_dir)
    started = time.perf_counter()

    outcomes: list[FileResult] = []
    for raw_path in pdf_paths:
        absolute_path = str(Path(raw_path).resolve())
        outcomes.append(process_file(absolute_path, output_dir))

    def _total(attribute: str) -> int:
        # Sum one numeric FileResult attribute over every processed file.
        return sum(getattr(outcome, attribute) for outcome in outcomes)

    return RunResult(
        total_files=len(outcomes),
        total_pages=_total("total_pages"),
        total_pages_changed=_total("pages_changed"),
        total_text_pages=_total("text_pages"),
        total_scanned_pages=_total("scanned_pages"),
        total_skipped_pages=_total("skipped_pages"),
        workers_used=MAX_WORKERS,
        peak_ram_mb=peak_ram_mb(),
        total_duration_seconds=time.perf_counter() - started,
        file_results=outcomes,
    )
@@ -0,0 +1,116 @@
1
+ """
2
+ Orchestrate the two-phase pipeline for a single PDF file.
3
+
4
+ Phase 1 — Parallel detection : all pages analysed concurrently.
5
+ Phase 2 — Single-pass correction: all rotations applied in one write.
6
+
7
+ Responsibility: coordinate analyzer + corrector for one file.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import time
13
+ from concurrent.futures import ProcessPoolExecutor, as_completed
14
+ from pathlib import Path
15
+
16
+ import fitz # PyMuPDF
17
+
18
+ from pdforienter.config import MAX_WORKERS
19
+ from pdforienter.core.analyzer import analyse_page
20
+ from pdforienter.core.corrector import apply_rotations
21
+ from pdforienter.models import FileResult, PageResult, PageType
22
+
23
+
24
def process_file(input_path: str, output_dir: str) -> FileResult:
    """
    Detect and correct page orientation for the PDF at *input_path*.

    Any exception raised while counting pages, detecting orientation or
    writing the output is captured in the returned result's ``error``
    field, with all counters set to zero.

    Parameters
    ----------
    input_path:
        Absolute path to the source PDF.
    output_dir:
        Directory where the corrected PDF will be written.
    """
    started = time.perf_counter()
    destination = _build_output_path(input_path, output_dir)

    try:
        n_pages = _page_count(input_path)
        pages = _detect_all_pages(input_path, n_pages)
        correction_seconds = _correct_file(input_path, destination, pages)
    except Exception as exc:  # noqa: BLE001
        return FileResult(
            input_path=input_path,
            output_path=destination,
            total_pages=0,
            pages_changed=0,
            text_pages=0,
            scanned_pages=0,
            skipped_pages=0,
            error=str(exc),
            total_duration_seconds=time.perf_counter() - started,
        )

    def _pages_of(kind: PageType) -> int:
        # Number of analysed pages classified as *kind*.
        return sum(1 for page in pages if page.page_type == kind)

    # Sum of per-page durations, not the elapsed time of the detection phase.
    detection_seconds = sum(page.duration_seconds for page in pages)
    rotated = sum(1 for page in pages if page.changed)

    return FileResult(
        input_path=input_path,
        output_path=destination,
        total_pages=n_pages,
        pages_changed=rotated,
        text_pages=_pages_of(PageType.TEXT),
        scanned_pages=_pages_of(PageType.SCANNED),
        skipped_pages=_pages_of(PageType.SKIPPED),
        page_results=pages,
        detection_duration_seconds=detection_seconds,
        correction_duration_seconds=correction_seconds,
        total_duration_seconds=time.perf_counter() - started,
    )
70
+
71
+
72
+ # ---------------------------------------------------------------------------
73
+ # Internal helpers
74
+ # ---------------------------------------------------------------------------
75
+
76
def _page_count(pdf_path: str) -> int:
    """Return the number of pages in the PDF at *pdf_path*."""
    doc = fitz.open(pdf_path)
    try:
        return int(doc.page_count)
    finally:
        # Close the document even if reading the page count fails.
        doc.close()
81
+
82
+
83
def _detect_all_pages(pdf_path: str, page_count: int) -> list[PageResult]:
    """
    Analyse every page of *pdf_path* in worker processes.

    One task is submitted per page. The pool size is capped at the number
    of pages so a short document never asks for more workers than it has
    tasks, and no pool is created at all for an empty document.

    Returns
    -------
    list[PageResult]
        One result per page, in page order.
    """
    if page_count <= 0:
        return []

    worker_count = max(1, min(MAX_WORKERS, page_count))
    results: dict[int, PageResult] = {}

    with ProcessPoolExecutor(max_workers=worker_count) as pool:
        future_map = {
            pool.submit(analyse_page, pdf_path, idx): idx
            for idx in range(page_count)
        }
        # Futures complete in arbitrary order; key them by page index so the
        # final list can be rebuilt in page order.
        for future in as_completed(future_map):
            results[future_map[future]] = future.result()

    return [results[i] for i in range(page_count)]
97
+
98
+
99
+ def _correct_file(
100
+ input_path: str,
101
+ output_path: str,
102
+ page_results: list[PageResult],
103
+ ) -> float:
104
+ """Apply rotations only when at least one page needs changing."""
105
+ if not any(r.changed for r in page_results):
106
+ # Nothing to do — copy input to output as-is.
107
+ import shutil
108
+ shutil.copy2(input_path, output_path)
109
+ return 0.0
110
+
111
+ return apply_rotations(input_path, output_path, page_results)
112
+
113
+
114
+ def _build_output_path(input_path: str, output_dir: str) -> str:
115
+ stem = Path(input_path).stem
116
+ return str(Path(output_dir) / f"{stem}_corrected.pdf")
File without changes
@@ -0,0 +1,87 @@
1
+ """
2
+ Format a RunResult into a human-readable, structured text log.
3
+
4
+ Responsibility: serialisation only — no I/O, no business logic.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from datetime import datetime
10
+
11
+ from pdforienter.config import LOG_DATE_FORMAT
12
+ from pdforienter.models import FileResult, RunResult
13
+
14
+
15
def format_run_log(result: RunResult) -> str:
    """
    Render *result* as the complete log text.

    The log consists of the header, the run summary, a separator, one
    section per file and the footer, joined by newlines.
    """
    parts = [_header(), _run_summary(result), _separator()]
    parts.extend(_file_section(file_result) for file_result in result.file_results)
    parts.append(_footer())
    return "\n".join(parts)
25
+
26
+
27
+ # ---------------------------------------------------------------------------
28
+ # Section builders
29
+ # ---------------------------------------------------------------------------
30
+
31
def _header() -> str:
    """Return the log title stamped with the current local time, plus a rule."""
    stamp = datetime.now().strftime(LOG_DATE_FORMAT)
    rule = "=" * 60
    return f"PDFOrienter Run Log — {stamp}\n{rule}"
34
+
35
+
36
+ def _run_summary(r: RunResult) -> str:
37
+ return (
38
+ f"\n[RUN SUMMARY]\n"
39
+ f" Total files processed : {r.total_files}\n"
40
+ f" Total pages : {r.total_pages}\n"
41
+ f" Pages rotated : {r.total_pages_changed}\n"
42
+ f" Text pages : {r.total_text_pages}\n"
43
+ f" Scanned pages (OCR) : {r.total_scanned_pages}\n"
44
+ f" Skipped pages : {r.total_skipped_pages}\n"
45
+ f" Workers used : {r.workers_used}\n"
46
+ f" Peak RAM usage : {r.peak_ram_mb:.1f} MB\n"
47
+ f" Total time : {r.total_duration_seconds:.2f}s\n"
48
+ )
49
+
50
+
51
+ def _file_section(fr: FileResult) -> str:
52
+ lines = [
53
+ f"[FILE] {fr.input_path}",
54
+ f" Output : {fr.output_path}",
55
+ f" Total pages : {fr.total_pages}",
56
+ f" Pages changed : {fr.pages_changed}",
57
+ f" Text pages : {fr.text_pages}",
58
+ f" Scanned pages : {fr.scanned_pages}",
59
+ f" Skipped pages : {fr.skipped_pages}",
60
+ f" Detection time : {fr.detection_duration_seconds:.2f}s",
61
+ f" Correction time : {fr.correction_duration_seconds:.2f}s",
62
+ f" Total time : {fr.total_duration_seconds:.2f}s",
63
+ ]
64
+
65
+ if fr.error:
66
+ lines.append(f" ERROR : {fr.error}")
67
+ else:
68
+ lines.append(" [PAGE DETAILS]")
69
+ for pr in fr.page_results:
70
+ status = "CHANGED" if pr.changed else "OK"
71
+ lines.append(
72
+ f" p{pr.page_number:>4} | {pr.page_type.value:<7} | "
73
+ f"{status:<7} | angle={pr.detected_angle:>3}° | "
74
+ f"conf={pr.confidence:>5.1f} | {pr.duration_seconds:.2f}s | "
75
+ f"{pr.reason}"
76
+ )
77
+
78
+ lines.append("")
79
+ return "\n".join(lines)
80
+
81
+
82
+ def _separator() -> str:
83
+ return "-" * 60
84
+
85
+
86
+ def _footer() -> str:
87
+ return "=" * 60 + "\nEnd of log.\n"
@@ -0,0 +1,25 @@
1
+ """
2
+ Write formatted log content to disk.
3
+
4
+ Responsibility: file I/O for log output only.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+
12
+ from pdforienter.models import RunResult
13
+ from pdforienter.logging.formatter import format_run_log
14
+
15
+
16
def write_log(result: RunResult, output_dir: str) -> str:
    """
    Serialise *result* and write the log to *output_dir*.

    The directory (and any missing parents) is created when needed, so the
    function can be called without running the pipeline first. The file is
    named ``pdforienter_<YYYYmmdd_HHMMSS>.log`` from the current local time.

    Parameters
    ----------
    result:
        Run summary to format.
    output_dir:
        Directory that receives the log file.

    Returns
    -------
    str
        The absolute path of the written log file.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # resolve() makes the returned path absolute even when *output_dir*
    # was given relative to the working directory.
    log_path = (target_dir / f"pdforienter_{timestamp}.log").resolve()
    log_path.write_text(format_run_log(result), encoding="utf-8")
    return str(log_path)
pdforienter/models.py ADDED
@@ -0,0 +1,66 @@
1
+ """
2
+ Pure-data classes used throughout PDFOrienter.
3
+
4
+ No business logic lives here — only typed containers so every module
5
+ shares a common vocabulary.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+ from enum import Enum
12
+
13
+
14
class PageType(str, Enum):
    """
    Classification assigned to a page during analysis.

    Members are also plain strings, so they compare equal to their value.
    """

    # Selectable text layer present.
    TEXT = "text"
    # Image-only page; OCR/OSD is required.
    SCANNED = "scanned"
    # Page could not be analysed (no text, low confidence …).
    SKIPPED = "skipped"
18
+
19
+
20
@dataclass
class PageResult:
    """
    Outcome of analysing (and possibly correcting) one PDF page.

    Attributes
    ----------
    page_number:
        1-based page number.
    page_type:
        Classification of the page.
    detected_angle:
        Angle reported by the detector, in degrees.
    existing_rotation:
        Rotation already stored in the PDF metadata.
    correction_applied:
        Net rotation written to the page, in degrees.
    changed:
        Whether the page is to be rotated.
    confidence:
        Detector confidence, 0-100; -1 when not applicable.
    reason:
        Human-readable explanation of the outcome.
    duration_seconds:
        Wall-clock time spent on this page.
    """

    page_number: int
    page_type: PageType
    detected_angle: int
    existing_rotation: int
    correction_applied: int
    changed: bool
    confidence: float
    reason: str
    duration_seconds: float
33
+
34
+
35
@dataclass
class FileResult:
    """
    Per-file summary of a processing run.

    Attributes
    ----------
    input_path, output_path:
        Source PDF and the path of its corrected copy.
    total_pages, pages_changed, text_pages, scanned_pages, skipped_pages:
        Page counters for the file.
    page_results:
        Individual page outcomes; empty by default.
    detection_duration_seconds, correction_duration_seconds, total_duration_seconds:
        Timings in seconds; 0.0 by default.
    error:
        Error message when processing the file failed, otherwise ``None``.
    """

    input_path: str
    output_path: str
    total_pages: int
    pages_changed: int
    text_pages: int
    scanned_pages: int
    skipped_pages: int
    # default_factory gives every instance its own list.
    page_results: list[PageResult] = field(default_factory=list)
    detection_duration_seconds: float = 0.0
    correction_duration_seconds: float = 0.0
    total_duration_seconds: float = 0.0
    error: str | None = None
51
+
52
+
53
@dataclass
class RunResult:
    """
    Run-wide summary covering every processed file.

    Attributes
    ----------
    total_files:
        Number of files in the run.
    total_pages, total_pages_changed, total_text_pages, total_scanned_pages, total_skipped_pages:
        Page counters for the whole run.
    workers_used:
        Worker count reported for the run.
    peak_ram_mb:
        Memory figure reported for the run, in megabytes.
    total_duration_seconds:
        Duration of the whole run, in seconds.
    file_results:
        Per-file results; empty by default.
    """

    total_files: int
    total_pages: int
    total_pages_changed: int
    total_text_pages: int
    total_scanned_pages: int
    total_skipped_pages: int
    workers_used: int
    peak_ram_mb: float
    total_duration_seconds: float
    # default_factory gives every instance its own list.
    file_results: list[FileResult] = field(default_factory=list)
File without changes
@@ -0,0 +1,32 @@
1
+ """
2
+ Filesystem utility helpers.
3
+
4
+ Responsibility: path / directory operations only.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+
11
+
12
def ensure_dir(directory: str) -> Path:
    """Make sure *directory* exists, creating missing parents, and return it as a Path."""
    target = Path(directory)
    target.mkdir(exist_ok=True, parents=True)
    return target
17
+
18
+
19
def resolve_pdf_paths(inputs: list[str]) -> list[str]:
    """
    Expand a list of paths to absolute PDF file paths.

    Directories are walked recursively. A path is accepted only when it is
    a regular file whose extension is ``.pdf`` in any letter case, whether
    it was found in a directory or passed directly. Each file is returned
    once even when several inputs refer to it. Everything else, including
    missing paths, is ignored.

    Parameters
    ----------
    inputs:
        Files and/or directories, absolute or relative.

    Returns
    -------
    list[str]
        Absolute paths in input order; files found inside a directory are
        sorted.
    """

    def _is_pdf_file(candidate: Path) -> bool:
        # Same rule for every candidate: ".pdf" extension, regular file.
        return candidate.suffix.lower() == ".pdf" and candidate.is_file()

    seen: set[str] = set()
    resolved: list[str] = []

    for raw in inputs:
        root = Path(raw).resolve()
        if root.is_dir():
            # rglob("*.pdf") would miss "SCAN.PDF" on case-sensitive file
            # systems and would also return directories named "*.pdf".
            candidates = sorted(f for f in root.rglob("*") if _is_pdf_file(f))
        elif _is_pdf_file(root):
            candidates = [root]
        else:
            candidates = []

        for candidate in candidates:
            key = str(candidate)
            if key not in seen:
                seen.add(key)
                resolved.append(key)

    return resolved
@@ -0,0 +1,22 @@
1
+ """
2
+ System resource measurement helpers.
3
+
4
+ Responsibility: CPU and RAM telemetry only.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+
11
+ import psutil
12
+
13
+
14
def peak_ram_mb() -> float:
    """
    Return the peak resident memory of the current process in megabytes.

    The high-water mark is read from ``resource.getrusage`` where that
    module exists, and from psutil's ``peak_wset`` field otherwise. The
    result is never lower than the current RSS, which is also the fallback
    when no peak figure is available. Only the calling process is
    measured.
    """
    import sys

    info = psutil.Process(os.getpid()).memory_info()
    current_bytes = float(info.rss)

    try:
        import resource
    except ImportError:
        # No ``resource`` module on this platform (e.g. Windows); psutil
        # reports the peak working set there as ``peak_wset``.
        peak_bytes = float(getattr(info, "peak_wset", info.rss))
    else:
        max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # ru_maxrss is expressed in bytes on macOS and in kilobytes on Linux.
        unit = 1 if sys.platform == "darwin" else 1024
        peak_bytes = float(max_rss) * unit

    return max(peak_bytes, current_bytes) / (1024 ** 2)
18
+
19
+
20
def cpu_count() -> int:
    """
    Return the number of logical CPUs available to the current process.

    Where the platform provides ``os.sched_getaffinity``, the size of the
    process's CPU affinity set is used, so restrictions such as
    ``taskset`` are respected. Elsewhere the system-wide ``os.cpu_count()``
    is returned. The result is always at least 1.
    """
    affinity = getattr(os, "sched_getaffinity", None)
    if affinity is not None:
        try:
            return len(affinity(0)) or 1
        except OSError:
            # Affinity could not be queried; use the system-wide count.
            pass
    return os.cpu_count() or 1
@@ -0,0 +1,259 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdforienter
3
+ Version: 0.1.0
4
+ Summary: Intelligent, parallel PDF page rotation correction.
5
+ Author: InfinitiBit GmbH
6
+ Maintainer: InfinitiBit GmbH
7
+ License: MIT
8
+ Project-URL: Homepage, https://github.com/MdRahmatUllah/pdforienter
9
+ Project-URL: Repository, https://github.com/MdRahmatUllah/pdforienter
10
+ Project-URL: Issues, https://github.com/MdRahmatUllah/pdforienter/issues
11
+ Project-URL: Documentation, https://github.com/MdRahmatUllah/pdforienter/blob/main/TECHNICAL.md
12
+ Keywords: pdf,rotation,ocr,tesseract,document
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Utilities
22
+ Classifier: Topic :: Multimedia :: Graphics :: Graphics Conversion
23
+ Classifier: Topic :: Text Processing
24
+ Requires-Python: >=3.10
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: PyMuPDF>=1.23
28
+ Requires-Dist: pytesseract>=0.3.10
29
+ Requires-Dist: Pillow>=10.0
30
+ Requires-Dist: psutil>=5.9
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=7; extra == "dev"
33
+ Requires-Dist: pytest-cov; extra == "dev"
34
+ Requires-Dist: ruff; extra == "dev"
35
+ Requires-Dist: mypy; extra == "dev"
36
+ Dynamic: license-file
37
+
38
+ # PDFOrienter
39
+
40
+ **Intelligent, parallel PDF page rotation correction for Python.**
41
+
42
+ PDFOrienter analyses every page of one or more PDF files, detects incorrect orientations, and fixes them in a single write pass — with no unnecessary re-processing.
43
+
44
+ ---
45
+
46
+ ## Features
47
+
48
+ - **Two-phase pipeline** — detect all pages in parallel, then apply all corrections in a single write
49
+ - **Smart strategy selection** — uses fast text-direction analysis for text-based pages; falls back to Tesseract OSD only for image/scanned pages
50
+ - **Dynamic parallelism** — automatically uses 75 % of available CPU cores; scales from 4 to 64+ cores without any configuration change
51
+ - **Detailed structured logging** — per-page and per-file timing, rotation details, confidence scores, RAM usage, and worker count
52
+ - **Zero intermediate files** — corrected PDFs are written once; originals are never modified
53
+ - **Package-ready** — clean modular design, typed, fully testable
54
+
55
+ ---
56
+
57
+ ## Requirements
58
+
59
+ ### Python
60
+
61
+ Python 3.10 or newer.
62
+
63
+ ### System dependency — Tesseract
64
+
65
+ Tesseract must be installed on the host system **before** installing PDFOrienter.
66
+
67
+ **Ubuntu / Debian**
68
+ ```bash
69
+ sudo apt-get update && sudo apt-get install -y tesseract-ocr
70
+ ```
71
+
72
+ **macOS (Homebrew)**
73
+ ```bash
74
+ brew install tesseract
75
+ ```
76
+
77
+ **Windows**
78
+
79
+ Download and run the installer from the [Tesseract UB Mannheim releases](https://github.com/UB-Mannheim/tesseract/wiki), then add the install directory to your `PATH`.
80
+
81
+ ---
82
+
83
+ ## Installation
84
+
85
+ ```bash
86
+ pip install pdforienter
87
+ ```
88
+
89
+ For development (includes linting + test tools):
90
+
91
+ ```bash
92
+ git clone https://github.com/MdRahmatUllah/pdforienter.git
93
+ cd pdforienter
94
+ pip install -e ".[dev]"
95
+ ```
96
+
97
+ ---
98
+
99
+ ## Quick Start
100
+
101
+ ### Command line
102
+
103
+ ```bash
104
+ # Fix a single PDF
105
+ pdforienter invoice.pdf --output ./fixed
106
+
107
+ # Fix every PDF in a directory
108
+ pdforienter /scans/ --output /corrected
109
+
110
+ # Mix files and directories
111
+ pdforienter report.pdf /archive/ receipts.pdf --output ./out
112
+ ```
113
+
114
+ ### Python API
115
+
116
+ ```python
117
+ from pdforienter import run_pipeline
118
+ from pdforienter.logging.writer import write_log
119
+
120
+ result = run_pipeline(
121
+ pdf_paths=["invoice.pdf", "report.pdf"],
122
+ output_dir="./corrected",
123
+ )
124
+
125
+ # Write the structured log file
126
+ log_path = write_log(result, "./corrected")
127
+
128
+ print(f"{result.total_pages_changed} pages corrected in {result.total_duration_seconds:.1f}s")
129
+ print(f"Log: {log_path}")
130
+ ```
131
+
132
+ ---
133
+
134
+ ## Log File
135
+
136
+ Every run produces a timestamped `.log` file in the output directory.
137
+
138
+ ```
139
+ PDFOrienter Run Log — 2024-11-01 14:32:05
140
+ ============================================================
141
+
142
+ [RUN SUMMARY]
143
+ Total files processed : 3
144
+ Total pages : 247
145
+ Pages rotated : 18
146
+ Text pages : 201
147
+ Scanned pages (OCR) : 46
148
+ Skipped pages : 0
149
+ Workers used : 6
150
+ Peak RAM usage : 312.4 MB
151
+ Total time : 42.18s
152
+
153
+ ------------------------------------------------------------
154
+ [FILE] /scans/invoice.pdf
155
+ Output : /corrected/invoice_corrected.pdf
156
+ Total pages : 12
157
+ Pages changed : 3
158
+ Text pages : 8
159
+ Scanned pages : 4
160
+ Skipped pages : 0
161
+ Detection time : 9.41s
162
+ Correction time : 0.23s
163
+ Total time : 9.64s
164
+ [PAGE DETAILS]
165
+ p 1 | text | OK | angle= 0° | conf= 98.2 | 0.11s | No rotation needed.
166
+ p 2 | scanned | CHANGED | angle= 90° | conf= 87.5 | 2.34s | Rotation of 90° detected (confidence 87.5).
167
+ ...
168
+ ```
169
+
170
+ ---
171
+
172
+ ## Project Structure
173
+
174
+ ```
175
+ pdforienter/
176
+ ├── pdforienter/
177
+ │ ├── __init__.py # Public API: run_pipeline
178
+ │ ├── config.py # Tuneable constants (worker count, thresholds)
179
+ │ ├── models.py # Typed data classes (PageResult, FileResult, RunResult)
180
+ │ ├── cli.py # Command-line interface
181
+ │ ├── core/
182
+ │ │ ├── pipeline.py # Top-level orchestrator
183
+ │ │ ├── processor.py # Per-file orchestrator (Phase 1 + Phase 2)
184
+ │ │ ├── analyzer.py # Per-page worker (dispatched to subprocess)
185
+ │ │ ├── classifier.py # Text vs scanned page detection
186
+ │ │ ├── detector.py # Orientation detection (text + OSD strategies)
187
+ │ │ └── corrector.py # Single-pass rotation applier
188
+ │ ├── logging/
189
+ │ │ ├── formatter.py # RunResult → structured log string
190
+ │ │ └── writer.py # Write log file to disk
191
+ │ └── utils/
192
+ │ ├── fs.py # Filesystem helpers
193
+ │ └── resources.py # CPU / RAM telemetry
194
+ ├── tests/
195
+ │ └── test_core.py
196
+ ├── pyproject.toml
197
+ └── README.md
198
+ ```
199
+
200
+ ---
201
+
202
+ ## Configuration
203
+
204
+ All tuneable constants live in `pdforienter/config.py`.
205
+
206
+ | Constant | Default | Description |
207
+ |---|---|---|
208
+ | `MAX_WORKERS` | `max(1, floor(cpu_count × 0.75))` | Worker processes for parallel page analysis |
209
+ | `OSD_CONFIDENCE_THRESHOLD` | `10.0` | Minimum Tesseract OSD confidence to trust a result |
210
+ | `TESSERACT_OSD_PSM` | `0` | Tesseract page segmentation mode (0 = OSD only) |
211
+ | `_RENDER_DPI` (detector.py) | `150` | DPI used when rasterising pages for OSD |
212
+ | `_MIN_CHAR_COUNT` (classifier.py) | `20` | Minimum characters to classify a page as text-based |
213
+
214
+ ---
215
+
216
+ ## How It Works
217
+
218
+ ### Phase 1 — Parallel Detection
219
+
220
+ Each page is dispatched to a subprocess worker via `ProcessPoolExecutor`. Workers run concurrently up to `MAX_WORKERS`.
221
+
222
+ For each page:
223
+ 1. **Classify** — does the page have selectable text?
224
+ 2. **Detect orientation**
225
+ - *Text page* → analyse character direction vectors (fast, no OCR)
226
+ - *Scanned page* → rasterise at 150 DPI and run Tesseract OSD
227
+ 3. Return a `PageResult` with the detected angle, confidence, and timing
228
+
229
+ ### Phase 2 — Single-Pass Correction
230
+
231
+ After all pages are analysed, a single `fitz.Document.save()` call applies every rotation and writes the corrected PDF. No intermediate files are created.
232
+
233
+ ---
234
+
235
+ ## Performance
236
+
237
+ Typical estimates on an 8-core server (6 workers) with mixed text/scanned PDFs:
238
+
239
+ | Scenario | Estimate |
240
+ |---|---|
241
+ | 2 000 pages, all text-based | ~1–2 minutes |
242
+ | 2 000 pages, mixed 50/50 | ~7–8 minutes |
243
+ | 2 000 pages, all scanned | ~15–17 minutes |
244
+
245
+ RAM usage: ~200–400 MB per Tesseract worker. 6 workers ≈ 2.5 GB peak. Well within a 16 GB server.
246
+
247
+ ---
248
+
249
+ ## Running Tests
250
+
251
+ ```bash
252
+ pytest tests/ -v
253
+ ```
254
+
255
+ ---
256
+
257
+ ## License
258
+
259
+ MIT
@@ -0,0 +1,23 @@
1
+ pdforienter/__init__.py,sha256=zWu665aCLdZahwZo69vWM8J2yQKfjqdI2wGanGpbENQ,758
2
+ pdforienter/cli.py,sha256=4N7J4LPorKH8UejoRexdGgDoVyCz8jIHDhgCiloQSt0,1733
3
+ pdforienter/config.py,sha256=4urbIquQOfLUMQtlC-PswQt2Eo-qAtyXEQ2ZGuBzvIk,1651
4
+ pdforienter/models.py,sha256=zb7kWikz3omLSitjP3PKMBJcR8LiRDdD8wOOVF2kmWc,1883
5
+ pdforienter/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ pdforienter/core/analyzer.py,sha256=IHrZdExx79w-K-7YWE-cU8q_4_99i91xqp4lwTVeGwo,2407
7
+ pdforienter/core/classifier.py,sha256=3wZhXHmAF8uKu3feW9w-zsFjgKyMYoPJQun9hGzzNOk,496
8
+ pdforienter/core/corrector.py,sha256=Zf_RevNTNCpZw5UswVagsjwc2XhSx9Ld-OMojjIpsqg,1530
9
+ pdforienter/core/detector.py,sha256=9XV6b9E1HJkzfwaTtKNu2DlljTY9g8LRbI_PbE5wOb4,3094
10
+ pdforienter/core/pipeline.py,sha256=CNGOWz-1G5ziVNYmMxipHvpvocIDjtQQEE2g-6qZLaw,1702
11
+ pdforienter/core/processor.py,sha256=5cBX-LWDytH_8DdXAU5wBbE5P6gC-fpvmWVFg4aF8Eo,3858
12
+ pdforienter/logging/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ pdforienter/logging/formatter.py,sha256=Rp2SgjtAi54FGosVyXQwGqdI4KNj-6JLp7h78zqYNoc,2820
14
+ pdforienter/logging/writer.py,sha256=BtYWZVe-wiZ1r-JzpIc15K2vh1LFrU6_5Uq2urzAqjU,686
15
+ pdforienter/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
+ pdforienter/utils/fs.py,sha256=cNzJa5O6riV-rvpitLsfzG1CvcluJrqQnLIcquRnQU0,841
17
+ pdforienter/utils/resources.py,sha256=F6193tGVylCYpYtsOVz3w5J0gVtRwreLCY3aFGmasvg,485
18
+ pdforienter-0.1.0.dist-info/licenses/LICENSE,sha256=yBPpeen2dS_xfsiwE5iPbH-ekYEKDBLZO715EvKLIkc,1073
19
+ pdforienter-0.1.0.dist-info/METADATA,sha256=0Ec4TJfk336-JZsypMscNYEapp7fUykG5XfpkzICnEc,8105
20
+ pdforienter-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
21
+ pdforienter-0.1.0.dist-info/entry_points.txt,sha256=HNa3K0WeJyLE91iQyjk0xodYDr93Mgx9oLWywv2-AS0,53
22
+ pdforienter-0.1.0.dist-info/top_level.txt,sha256=8M7epoDEGKPZEBAUa1IpCgc_C9OjF4D6R7pLJpRtO0g,12
23
+ pdforienter-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ pdforienter = pdforienter.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 InfinitiBit GmbH
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ pdforienter