docorient 0.2.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docorient-0.3.2/CHANGELOG.md +44 -0
- {docorient-0.2.0 → docorient-0.3.2}/PKG-INFO +8 -7
- {docorient-0.2.0 → docorient-0.3.2}/docs/architecture.md +52 -19
- {docorient-0.2.0 → docorient-0.3.2}/docs/contributing.md +1 -2
- {docorient-0.2.0 → docorient-0.3.2}/pyproject.toml +8 -7
- {docorient-0.2.0 → docorient-0.3.2}/src/docorient/__init__.py +8 -1
- docorient-0.3.2/src/docorient/_imaging.py +49 -0
- docorient-0.3.2/src/docorient/_version.py +1 -0
- {docorient-0.2.0 → docorient-0.3.2}/src/docorient/batch/processor.py +75 -62
- {docorient-0.2.0 → docorient-0.3.2}/src/docorient/batch/worker.py +46 -32
- docorient-0.3.2/src/docorient/correction.py +71 -0
- docorient-0.3.2/src/docorient/detection/__init__.py +13 -0
- docorient-0.3.2/src/docorient/detection/base.py +15 -0
- docorient-0.3.2/src/docorient/detection/engine.py +55 -0
- docorient-0.3.2/src/docorient/detection/primary.py +77 -0
- docorient-0.3.2/src/docorient/detection/secondary.py +75 -0
- docorient-0.3.2/src/docorient/rotation.py +9 -0
- docorient-0.3.2/src/docorient/voting.py +33 -0
- {docorient-0.2.0 → docorient-0.3.2}/tests/test_detection.py +10 -5
- docorient-0.2.0/CHANGELOG.md +0 -22
- docorient-0.2.0/src/docorient/_imaging.py +0 -46
- docorient-0.2.0/src/docorient/_version.py +0 -1
- docorient-0.2.0/src/docorient/correction.py +0 -122
- docorient-0.2.0/src/docorient/detection/__init__.py +0 -4
- docorient-0.2.0/src/docorient/detection/engine.py +0 -51
- docorient-0.2.0/src/docorient/detection/primary.py +0 -74
- docorient-0.2.0/src/docorient/detection/secondary.py +0 -72
- {docorient-0.2.0 → docorient-0.3.2}/.github/workflows/publish.yml +0 -0
- {docorient-0.2.0 → docorient-0.3.2}/.gitignore +0 -0
- {docorient-0.2.0 → docorient-0.3.2}/LICENSE +0 -0
- {docorient-0.2.0 → docorient-0.3.2}/README.md +0 -0
- {docorient-0.2.0 → docorient-0.3.2}/src/docorient/batch/__init__.py +0 -0
- {docorient-0.2.0 → docorient-0.3.2}/src/docorient/batch/scanner.py +0 -0
- {docorient-0.2.0 → docorient-0.3.2}/src/docorient/cli.py +0 -0
- {docorient-0.2.0 → docorient-0.3.2}/src/docorient/config.py +0 -0
- {docorient-0.2.0 → docorient-0.3.2}/src/docorient/exceptions.py +0 -0
- {docorient-0.2.0 → docorient-0.3.2}/src/docorient/types.py +0 -0
- {docorient-0.2.0 → docorient-0.3.2}/tests/__init__.py +0 -0
- {docorient-0.2.0 → docorient-0.3.2}/tests/conftest.py +0 -0
- {docorient-0.2.0 → docorient-0.3.2}/tests/test_batch.py +0 -0
- {docorient-0.2.0 → docorient-0.3.2}/tests/test_cli.py +0 -0
- {docorient-0.2.0 → docorient-0.3.2}/tests/test_correction.py +0 -0
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.3.2 (2026-02-27)
|
|
4
|
+
|
|
5
|
+
- Fix: update project URLs to correct GitHub repository
|
|
6
|
+
- Add changelog link to PyPI metadata
|
|
7
|
+
|
|
8
|
+
## 0.3.1 (2026-02-27)
|
|
9
|
+
|
|
10
|
+
- Fix: author metadata corrected to Lucas Gabriel Vaz
|
|
11
|
+
- Remove `tesseract` from PyPI keywords
|
|
12
|
+
|
|
13
|
+
## 0.3.0 (2026-02-27)
|
|
14
|
+
|
|
15
|
+
- **Breaking:** detection engines refactored to class-based architecture with `DetectionEngine` Protocol
|
|
16
|
+
- Introduce `DetectionPipeline` for extensible engine orchestration
|
|
17
|
+
- Introduce `PrimaryEngine` and `SecondaryEngine` classes
|
|
18
|
+
- Extract `rotation.py` and `voting.py` as standalone modules
|
|
19
|
+
- Encapsulate worker state in `WorkerContext` dataclass
|
|
20
|
+
- Transform `_imaging.py` functions into `ImageIO` class
|
|
21
|
+
- Decompose `process_directory` into focused sub-functions
|
|
22
|
+
- Apply custom exceptions (`DetectionError`, `CorrectionError`, `BatchProcessingError`) throughout codebase
|
|
23
|
+
- Export `DetectionEngine`, `DetectionPipeline`, `PrimaryEngine`, `SecondaryEngine` in public API
|
|
24
|
+
|
|
25
|
+
## 0.2.0 (2026-02-26)
|
|
26
|
+
|
|
27
|
+
- **Breaking:** renamed config params `osd_confidence_threshold` → `secondary_confidence_threshold`, `max_osd_dimension` → `secondary_max_dimension`, `projection_target_dimension` → `primary_max_dimension`
|
|
28
|
+
- **Breaking:** renamed CLI flag `--no-ocr` → `--no-secondary`
|
|
29
|
+
- Internal engines renamed to `primary` and `secondary`
|
|
30
|
+
- Updated `OrientationResult.method` trace strings
|
|
31
|
+
|
|
32
|
+
## 0.1.1 (2026-02-26)
|
|
33
|
+
|
|
34
|
+
- Docs: added `if __name__ == "__main__":` note for `process_directory` on macOS/Windows
|
|
35
|
+
|
|
36
|
+
## 0.1.0 (2026-02-25)
|
|
37
|
+
|
|
38
|
+
- Initial release
|
|
39
|
+
- Primary engine for 90°/270° detection
|
|
40
|
+
- Optional secondary engine for 180° detection
|
|
41
|
+
- Single image and batch directory processing
|
|
42
|
+
- Multi-page majority voting
|
|
43
|
+
- Resumable batch processing
|
|
44
|
+
- CLI interface
|
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docorient
|
|
3
|
-
Version: 0.2.0
|
|
4
|
-
Summary: Document image orientation detection and correction
|
|
5
|
-
Project-URL: Homepage, https://github.com/
|
|
6
|
-
Project-URL: Repository, https://github.com/
|
|
7
|
-
Project-URL: Issues, https://github.com/
|
|
8
|
-
|
|
3
|
+
Version: 0.3.2
|
|
4
|
+
Summary: Document image orientation detection and correction.
|
|
5
|
+
Project-URL: Homepage, https://github.com/lucasleirbag/DocOrient
|
|
6
|
+
Project-URL: Repository, https://github.com/lucasleirbag/DocOrient
|
|
7
|
+
Project-URL: Issues, https://github.com/lucasleirbag/DocOrient/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/lucasleirbag/DocOrient/blob/main/CHANGELOG.md
|
|
9
|
+
Author: Lucas Gabriel Vaz
|
|
9
10
|
License-Expression: MIT
|
|
10
11
|
License-File: LICENSE
|
|
11
|
-
Keywords: correction,document,image,ocr,orientation,rotation,tesseract
|
|
12
|
+
Keywords: correction,document,image,ocr,orientation,rotation
|
|
12
13
|
Classifier: Development Status :: 3 - Alpha
|
|
13
14
|
Classifier: Intended Audience :: Developers
|
|
14
15
|
Classifier: License :: OSI Approved :: MIT License
|
|
@@ -6,21 +6,24 @@
|
|
|
6
6
|
src/docorient/
|
|
7
7
|
├── __init__.py Public API re-exports
|
|
8
8
|
├── _version.py Version string
|
|
9
|
-
├── _imaging.py
|
|
9
|
+
├── _imaging.py ImageIO class for image operations
|
|
10
10
|
├── config.py OrientationConfig dataclass
|
|
11
11
|
├── types.py Result dataclasses
|
|
12
12
|
├── exceptions.py Exception hierarchy
|
|
13
|
+
├── rotation.py Image rotation utility
|
|
14
|
+
├── voting.py Majority voting logic
|
|
13
15
|
├── correction.py correct_image, correct_document_pages
|
|
14
16
|
├── cli.py CLI entry point
|
|
15
17
|
├── detection/
|
|
16
|
-
│ ├── __init__.py Re-exports
|
|
17
|
-
│ ├──
|
|
18
|
-
│ ├──
|
|
19
|
-
│
|
|
18
|
+
│ ├── __init__.py Re-exports detection API
|
|
19
|
+
│ ├── base.py DetectionEngine Protocol
|
|
20
|
+
│ ├── engine.py DetectionPipeline orchestrator
|
|
21
|
+
│ ├── primary.py PrimaryEngine (90°/270°)
|
|
22
|
+
│ └── secondary.py SecondaryEngine (180°)
|
|
20
23
|
└── batch/
|
|
21
24
|
├── __init__.py Re-exports process_directory
|
|
22
25
|
├── scanner.py Directory scanning and grouping
|
|
23
|
-
├── worker.py
|
|
26
|
+
├── worker.py WorkerContext and multiprocessing logic
|
|
24
27
|
└── processor.py Batch orchestrator with resume and progress
|
|
25
28
|
```
|
|
26
29
|
|
|
@@ -30,17 +33,22 @@ src/docorient/
|
|
|
30
33
|
detect_orientation()
|
|
31
34
|
│
|
|
32
35
|
▼
|
|
33
|
-
|
|
36
|
+
DetectionPipeline.run()
|
|
37
|
+
Iterates through list[DetectionEngine]
|
|
38
|
+
│
|
|
39
|
+
▼
|
|
40
|
+
PrimaryEngine.detect()
|
|
34
41
|
Analyzes pixel density distribution to determine text alignment.
|
|
35
42
|
│
|
|
36
43
|
├── angle ∈ {90, 270} ──► Return result immediately
|
|
37
44
|
│
|
|
38
|
-
└── angle = 0 (aligned) ──►
|
|
45
|
+
└── angle = 0 (aligned) ──► SecondaryEngine.detect()
|
|
46
|
+
│
|
|
47
|
+
is_available()?
|
|
39
48
|
│
|
|
40
49
|
Yes ─────────┴───────── No
|
|
41
50
|
▼ ▼
|
|
42
|
-
|
|
43
|
-
(detection/secondary.py)
|
|
51
|
+
Runs secondary analysis Return 0° (no change)
|
|
44
52
|
Checks for 180° inversion
|
|
45
53
|
with confidence scoring
|
|
46
54
|
│
|
|
@@ -51,27 +59,52 @@ Analyzes pixel density distribution to determine text alignment.
|
|
|
51
59
|
Return 180° Return 0°
|
|
52
60
|
```
|
|
53
61
|
|
|
62
|
+
## Engine Architecture
|
|
63
|
+
|
|
64
|
+
The detection system is built on a `DetectionEngine` Protocol:
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
DetectionEngine (Protocol)
|
|
68
|
+
├── name: str
|
|
69
|
+
└── detect(image, config) → OrientationResult | None
|
|
70
|
+
|
|
71
|
+
PrimaryEngine implements DetectionEngine
|
|
72
|
+
├── Always returns OrientationResult (never None)
|
|
73
|
+
└── Detects 0°, 90°, 270° via energy analysis
|
|
74
|
+
|
|
75
|
+
SecondaryEngine implements DetectionEngine
|
|
76
|
+
├── Returns None when unavailable or low confidence
|
|
77
|
+
└── Detects 180° via optional OCR dependency
|
|
78
|
+
|
|
79
|
+
DetectionPipeline
|
|
80
|
+
├── Holds list[DetectionEngine] (default: [PrimaryEngine(), SecondaryEngine()])
|
|
81
|
+
└── Executes engines in sequence with short-circuit logic
|
|
82
|
+
```
|
|
83
|
+
|
|
54
84
|
## Batch Processing Pipeline
|
|
55
85
|
|
|
56
86
|
```
|
|
57
87
|
process_directory()
|
|
58
88
|
│
|
|
59
89
|
▼
|
|
90
|
+
_resolve_output_directory()
|
|
91
|
+
Resolves explicit path or generates UUID
|
|
92
|
+
│
|
|
93
|
+
▼
|
|
60
94
|
scan_directory() ← scanner.py
|
|
61
95
|
Groups images by
|
|
62
96
|
source document name
|
|
63
97
|
│
|
|
64
98
|
▼
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
│
|
|
68
|
-
▼
|
|
69
|
-
Distribute pending
|
|
70
|
-
sources into N batches ← N = effective_workers
|
|
99
|
+
_filter_pending_sources()
|
|
100
|
+
Loads resume log, skips completed
|
|
71
101
|
│
|
|
72
102
|
▼
|
|
73
|
-
|
|
103
|
+
_run_parallel_processing()
|
|
74
104
|
┌─────────────────────────────────────┐
|
|
105
|
+
│ multiprocessing.Pool │
|
|
106
|
+
│ WorkerContext encapsulates state │
|
|
107
|
+
│ │
|
|
75
108
|
│ For each source in batch: │
|
|
76
109
|
│ 1. Run detection per page │
|
|
77
110
|
│ 2. Apply majority voting │
|
|
@@ -93,12 +126,12 @@ Majority voting resolves this:
|
|
|
93
126
|
2. Find the most common angle (`Counter.most_common`)
|
|
94
127
|
3. Override any unreliable detection that differs from the majority
|
|
95
128
|
|
|
96
|
-
Implemented in `
|
|
129
|
+
Implemented in `voting.apply_majority_voting()`, reused by both
|
|
97
130
|
`correct_document_pages()` and `batch/worker._process_single_source()`.
|
|
98
131
|
|
|
99
132
|
## Multiprocessing Design
|
|
100
133
|
|
|
101
|
-
- **
|
|
134
|
+
- **WorkerContext dataclass** encapsulates all shared state in a single object
|
|
102
135
|
- **Progress tracking** via `multiprocessing.Value` + `multiprocessing.Lock`
|
|
103
136
|
- **Resume log** written atomically per source file, protected by the shared lock
|
|
104
137
|
- **`maxtasksperchild=1`** prevents memory accumulation in long jobs
|
|
@@ -28,9 +28,8 @@ ruff check src/ tests/ --fix
|
|
|
28
28
|
- No comments — code must be self-explanatory through naming
|
|
29
29
|
- Descriptive variable names — never single-letter variables in non-trivial scopes
|
|
30
30
|
- Full type hints on all function signatures
|
|
31
|
-
- Docstrings only on public API functions
|
|
32
31
|
- Functions must do exactly one thing, max ~30 lines of logic
|
|
33
|
-
-
|
|
32
|
+
- Configuration is always passed as a parameter
|
|
34
33
|
|
|
35
34
|
## Publishing a New Version
|
|
36
35
|
|
|
@@ -4,13 +4,13 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docorient"
|
|
7
|
-
version = "0.2.0"
|
|
8
|
-
description = "Document image orientation detection and correction
|
|
7
|
+
version = "0.3.2"
|
|
8
|
+
description = "Document image orientation detection and correction."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
11
11
|
requires-python = ">= 3.10"
|
|
12
|
-
authors = [{ name = "
|
|
13
|
-
keywords = ["document", "orientation", "rotation", "ocr", "image", "correction", "tesseract"]
|
|
12
|
+
authors = [{ name = "Lucas Gabriel Vaz" }]
|
|
13
|
+
keywords = ["document", "orientation", "rotation", "ocr", "image", "correction"]
|
|
14
14
|
classifiers = [
|
|
15
15
|
"Development Status :: 3 - Alpha",
|
|
16
16
|
"Intended Audience :: Developers",
|
|
@@ -37,9 +37,10 @@ dev = ["pytest >= 8.0", "pytest-cov", "ruff >= 0.4", "build", "twine"]
|
|
|
37
37
|
docorient = "docorient.cli:main"
|
|
38
38
|
|
|
39
39
|
[project.urls]
|
|
40
|
-
Homepage = "https://github.com/
|
|
41
|
-
Repository = "https://github.com/
|
|
42
|
-
Issues = "https://github.com/
|
|
40
|
+
Homepage = "https://github.com/lucasleirbag/DocOrient"
|
|
41
|
+
Repository = "https://github.com/lucasleirbag/DocOrient"
|
|
42
|
+
Issues = "https://github.com/lucasleirbag/DocOrient/issues"
|
|
43
|
+
Changelog = "https://github.com/lucasleirbag/DocOrient/blob/main/CHANGELOG.md"
|
|
43
44
|
|
|
44
45
|
[tool.hatch.build.targets.wheel]
|
|
45
46
|
packages = ["src/docorient"]
|
|
@@ -2,7 +2,10 @@ from docorient._version import __version__
|
|
|
2
2
|
from docorient.batch.processor import process_directory
|
|
3
3
|
from docorient.config import OrientationConfig
|
|
4
4
|
from docorient.correction import correct_document_pages, correct_image
|
|
5
|
-
from docorient.detection.engine import detect_orientation
|
|
5
|
+
from docorient.detection.base import DetectionEngine
|
|
6
|
+
from docorient.detection.engine import DetectionPipeline, detect_orientation
|
|
7
|
+
from docorient.detection.primary import PrimaryEngine
|
|
8
|
+
from docorient.detection.secondary import SecondaryEngine
|
|
6
9
|
from docorient.exceptions import (
|
|
7
10
|
BatchProcessingError,
|
|
8
11
|
CorrectionError,
|
|
@@ -22,11 +25,15 @@ __all__ = [
|
|
|
22
25
|
"BatchSummary",
|
|
23
26
|
"CorrectionError",
|
|
24
27
|
"CorrectionResult",
|
|
28
|
+
"DetectionEngine",
|
|
25
29
|
"DetectionError",
|
|
30
|
+
"DetectionPipeline",
|
|
26
31
|
"DocorientError",
|
|
27
32
|
"OrientationConfig",
|
|
28
33
|
"OrientationResult",
|
|
29
34
|
"PageResult",
|
|
35
|
+
"PrimaryEngine",
|
|
36
|
+
"SecondaryEngine",
|
|
30
37
|
"TesseractNotAvailableError",
|
|
31
38
|
"__version__",
|
|
32
39
|
"correct_document_pages",
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from PIL import Image
|
|
6
|
+
|
|
7
|
+
FORMAT_MAPPING: dict[str, str] = {
|
|
8
|
+
".jpg": "JPEG",
|
|
9
|
+
".jpeg": "JPEG",
|
|
10
|
+
".png": "PNG",
|
|
11
|
+
".tiff": "TIFF",
|
|
12
|
+
".tif": "TIFF",
|
|
13
|
+
".bmp": "BMP",
|
|
14
|
+
".gif": "GIF",
|
|
15
|
+
".webp": "WEBP",
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ImageIO:
|
|
20
|
+
@staticmethod
|
|
21
|
+
def open_as_rgb(image_path: str | Path) -> Image.Image:
|
|
22
|
+
return Image.open(image_path).convert("RGB")
|
|
23
|
+
|
|
24
|
+
@staticmethod
|
|
25
|
+
def downscale(image: Image.Image, max_dimension: int) -> Image.Image:
|
|
26
|
+
image_width, image_height = image.size
|
|
27
|
+
largest_side = max(image_width, image_height)
|
|
28
|
+
|
|
29
|
+
if largest_side <= max_dimension:
|
|
30
|
+
return image
|
|
31
|
+
|
|
32
|
+
scale_factor = max_dimension / largest_side
|
|
33
|
+
target_width = int(image_width * scale_factor)
|
|
34
|
+
target_height = int(image_height * scale_factor)
|
|
35
|
+
return image.resize((target_width, target_height), Image.LANCZOS)
|
|
36
|
+
|
|
37
|
+
@staticmethod
|
|
38
|
+
def save(
|
|
39
|
+
image: Image.Image,
|
|
40
|
+
output_path: str | Path,
|
|
41
|
+
output_format: str = "JPEG",
|
|
42
|
+
quality: int = 92,
|
|
43
|
+
) -> None:
|
|
44
|
+
image.save(output_path, output_format, quality=quality)
|
|
45
|
+
|
|
46
|
+
@staticmethod
|
|
47
|
+
def resolve_format(file_path: str | Path) -> str:
|
|
48
|
+
extension = Path(file_path).suffix.lower()
|
|
49
|
+
return FORMAT_MAPPING.get(extension, "JPEG")
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.3.2"
|
|
@@ -12,6 +12,7 @@ from tqdm import tqdm
|
|
|
12
12
|
from docorient.batch.scanner import ScannedPage, scan_directory
|
|
13
13
|
from docorient.batch.worker import initialize_worker, process_batch
|
|
14
14
|
from docorient.config import RESUME_LOG_FILENAME, OrientationConfig
|
|
15
|
+
from docorient.exceptions import BatchProcessingError
|
|
15
16
|
from docorient.types import BatchSummary, PageResult
|
|
16
17
|
|
|
17
18
|
|
|
@@ -71,76 +72,43 @@ def _build_summary(
|
|
|
71
72
|
)
|
|
72
73
|
|
|
73
74
|
|
|
74
|
-
def process_directory(
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
output_dir: str | Path | None = None,
|
|
78
|
-
config: OrientationConfig | None = None,
|
|
79
|
-
limit: int = 0,
|
|
80
|
-
show_progress: bool = True,
|
|
81
|
-
) -> BatchSummary:
|
|
82
|
-
"""Process all images in a directory, detecting and correcting orientation.
|
|
83
|
-
|
|
84
|
-
Args:
|
|
85
|
-
input_dir: Path to directory containing document images.
|
|
86
|
-
output_dir: Path for corrected output. None generates a UUID-named directory.
|
|
87
|
-
config: Processing configuration. Uses defaults if not provided.
|
|
88
|
-
limit: Maximum number of images to process. 0 means all.
|
|
89
|
-
show_progress: Whether to display a tqdm progress bar.
|
|
90
|
-
|
|
91
|
-
Returns:
|
|
92
|
-
BatchSummary with statistics and per-page results.
|
|
93
|
-
"""
|
|
94
|
-
effective_config = config or OrientationConfig()
|
|
95
|
-
input_path = Path(input_dir).resolve()
|
|
96
|
-
|
|
75
|
+
def _resolve_output_directory(
|
|
76
|
+
input_path: Path, output_dir: str | Path | None
|
|
77
|
+
) -> Path:
|
|
97
78
|
if output_dir is None:
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
output_path = Path(output_dir).resolve()
|
|
79
|
+
return input_path.parent / str(uuid.uuid4())
|
|
80
|
+
return Path(output_dir).resolve()
|
|
101
81
|
|
|
102
|
-
output_path.mkdir(parents=True, exist_ok=True)
|
|
103
82
|
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
if total_pages == 0:
|
|
116
|
-
return _build_summary(str(input_path), str(output_path), 0, {}, [])
|
|
117
|
-
|
|
118
|
-
resume_log_path = output_path / RESUME_LOG_FILENAME
|
|
119
|
-
already_completed_sources = set()
|
|
120
|
-
|
|
121
|
-
if effective_config.resume_enabled:
|
|
122
|
-
already_completed_sources = _load_completed_sources(resume_log_path)
|
|
123
|
-
|
|
124
|
-
pending_sources = [
|
|
83
|
+
def _filter_pending_sources(
|
|
84
|
+
pages_by_source: dict[str, list[ScannedPage]],
|
|
85
|
+
source_file_names: list[str],
|
|
86
|
+
resume_log_path: Path,
|
|
87
|
+
resume_enabled: bool,
|
|
88
|
+
) -> list[tuple[str, list[ScannedPage]]]:
|
|
89
|
+
already_completed = set()
|
|
90
|
+
if resume_enabled:
|
|
91
|
+
already_completed = _load_completed_sources(resume_log_path)
|
|
92
|
+
|
|
93
|
+
return [
|
|
125
94
|
(source_name, pages_by_source[source_name])
|
|
126
95
|
for source_name in source_file_names
|
|
127
|
-
if source_name not in already_completed_sources
|
|
96
|
+
if source_name not in already_completed
|
|
128
97
|
]
|
|
129
98
|
|
|
130
|
-
all_page_results: dict[str, list[PageResult]] = {}
|
|
131
|
-
|
|
132
|
-
if not pending_sources:
|
|
133
|
-
return _build_summary(
|
|
134
|
-
str(input_path), str(output_path), total_files, all_page_results, source_file_names
|
|
135
|
-
)
|
|
136
99
|
|
|
137
|
-
|
|
100
|
+
def _run_parallel_processing(
|
|
101
|
+
pending_sources: list[tuple[str, list[ScannedPage]]],
|
|
102
|
+
config: OrientationConfig,
|
|
103
|
+
resume_log_path: Path,
|
|
104
|
+
show_progress: bool,
|
|
105
|
+
) -> dict[str, list[PageResult]]:
|
|
106
|
+
worker_count = min(config.effective_workers, len(pending_sources))
|
|
138
107
|
batches = _distribute_into_batches(pending_sources, worker_count)
|
|
139
108
|
|
|
140
109
|
progress_counter = multiprocessing.Value("i", 0)
|
|
141
110
|
progress_lock = multiprocessing.Lock()
|
|
142
|
-
|
|
143
|
-
config_as_dict = asdict(effective_config)
|
|
111
|
+
config_as_dict = asdict(config)
|
|
144
112
|
|
|
145
113
|
worker_pool = multiprocessing.Pool(
|
|
146
114
|
processes=worker_count,
|
|
@@ -154,6 +122,7 @@ def process_directory(
|
|
|
154
122
|
]
|
|
155
123
|
worker_pool.close()
|
|
156
124
|
|
|
125
|
+
progress_bar = None
|
|
157
126
|
if show_progress:
|
|
158
127
|
progress_bar = tqdm(
|
|
159
128
|
total=len(pending_sources),
|
|
@@ -161,8 +130,6 @@ def process_directory(
|
|
|
161
130
|
unit="file",
|
|
162
131
|
bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
|
|
163
132
|
)
|
|
164
|
-
else:
|
|
165
|
-
progress_bar = None
|
|
166
133
|
|
|
167
134
|
try:
|
|
168
135
|
while not all(async_result.ready() for async_result in async_results):
|
|
@@ -182,15 +149,61 @@ def process_directory(
|
|
|
182
149
|
progress_bar.refresh()
|
|
183
150
|
progress_bar.close()
|
|
184
151
|
|
|
152
|
+
all_page_results: dict[str, list[PageResult]] = {}
|
|
185
153
|
for async_result in async_results:
|
|
186
154
|
try:
|
|
187
155
|
batch_results = async_result.get(timeout=60)
|
|
188
156
|
for source_name, page_results in batch_results:
|
|
189
157
|
all_page_results[source_name] = page_results
|
|
190
|
-
except Exception:
|
|
191
|
-
|
|
158
|
+
except Exception as pool_error:
|
|
159
|
+
raise BatchProcessingError(
|
|
160
|
+
f"Worker pool error: {pool_error}"
|
|
161
|
+
) from pool_error
|
|
192
162
|
|
|
193
163
|
worker_pool.join()
|
|
164
|
+
return all_page_results
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def process_directory(
|
|
168
|
+
input_dir: str | Path,
|
|
169
|
+
*,
|
|
170
|
+
output_dir: str | Path | None = None,
|
|
171
|
+
config: OrientationConfig | None = None,
|
|
172
|
+
limit: int = 0,
|
|
173
|
+
show_progress: bool = True,
|
|
174
|
+
) -> BatchSummary:
|
|
175
|
+
effective_config = config or OrientationConfig()
|
|
176
|
+
input_path = Path(input_dir).resolve()
|
|
177
|
+
output_path = _resolve_output_directory(input_path, output_dir)
|
|
178
|
+
output_path.mkdir(parents=True, exist_ok=True)
|
|
179
|
+
|
|
180
|
+
pages_by_source = scan_directory(
|
|
181
|
+
input_path,
|
|
182
|
+
output_path,
|
|
183
|
+
supported_extensions=effective_config.supported_extensions,
|
|
184
|
+
limit=limit,
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
source_file_names = list(pages_by_source.keys())
|
|
188
|
+
total_files = len(source_file_names)
|
|
189
|
+
|
|
190
|
+
if not pages_by_source:
|
|
191
|
+
return _build_summary(str(input_path), str(output_path), 0, {}, [])
|
|
192
|
+
|
|
193
|
+
resume_log_path = output_path / RESUME_LOG_FILENAME
|
|
194
|
+
|
|
195
|
+
pending_sources = _filter_pending_sources(
|
|
196
|
+
pages_by_source, source_file_names, resume_log_path, effective_config.resume_enabled
|
|
197
|
+
)
|
|
198
|
+
|
|
199
|
+
if not pending_sources:
|
|
200
|
+
return _build_summary(
|
|
201
|
+
str(input_path), str(output_path), total_files, {}, source_file_names
|
|
202
|
+
)
|
|
203
|
+
|
|
204
|
+
all_page_results = _run_parallel_processing(
|
|
205
|
+
pending_sources, effective_config, resume_log_path, show_progress
|
|
206
|
+
)
|
|
194
207
|
|
|
195
208
|
return _build_summary(
|
|
196
209
|
str(input_path), str(output_path), total_files, all_page_results, source_file_names
|
|
@@ -1,19 +1,28 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import multiprocessing
|
|
4
|
+
from dataclasses import dataclass
|
|
4
5
|
from typing import Any
|
|
5
6
|
|
|
6
|
-
from docorient._imaging import
|
|
7
|
+
from docorient._imaging import ImageIO
|
|
7
8
|
from docorient.batch.scanner import ScannedPage
|
|
8
9
|
from docorient.config import OrientationConfig
|
|
9
|
-
from docorient.correction import _apply_majority_voting, _apply_rotation
|
|
10
10
|
from docorient.detection.engine import detect_orientation
|
|
11
|
+
from docorient.exceptions import CorrectionError, DetectionError
|
|
12
|
+
from docorient.rotation import apply_rotation
|
|
11
13
|
from docorient.types import OrientationResult, PageResult
|
|
14
|
+
from docorient.voting import apply_majority_voting
|
|
12
15
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
16
|
+
|
|
17
|
+
@dataclass
|
|
18
|
+
class WorkerContext:
|
|
19
|
+
progress_counter: Any
|
|
20
|
+
progress_lock: Any
|
|
21
|
+
resume_log_path: str
|
|
22
|
+
config: OrientationConfig
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
_worker_context: WorkerContext | None = None
|
|
17
26
|
|
|
18
27
|
|
|
19
28
|
def initialize_worker(
|
|
@@ -22,17 +31,13 @@ def initialize_worker(
|
|
|
22
31
|
resume_log_path: str,
|
|
23
32
|
config_dict: dict[str, Any],
|
|
24
33
|
) -> None:
|
|
25
|
-
global
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def _reconstruct_config() -> OrientationConfig:
|
|
33
|
-
if _shared_config_dict is None:
|
|
34
|
-
return OrientationConfig()
|
|
35
|
-
return OrientationConfig(**_shared_config_dict)
|
|
34
|
+
global _worker_context
|
|
35
|
+
_worker_context = WorkerContext(
|
|
36
|
+
progress_counter=counter,
|
|
37
|
+
progress_lock=lock,
|
|
38
|
+
resume_log_path=resume_log_path,
|
|
39
|
+
config=OrientationConfig(**config_dict),
|
|
40
|
+
)
|
|
36
41
|
|
|
37
42
|
|
|
38
43
|
def _process_single_source(
|
|
@@ -46,16 +51,19 @@ def _process_single_source(
|
|
|
46
51
|
|
|
47
52
|
for page_index, scanned_page in enumerate(valid_pages):
|
|
48
53
|
try:
|
|
49
|
-
image = open_as_rgb(scanned_page.image_path)
|
|
54
|
+
image = ImageIO.open_as_rgb(scanned_page.image_path)
|
|
50
55
|
orientation = detect_orientation(image, config=config)
|
|
51
56
|
detection_results.append(orientation)
|
|
52
57
|
image.close()
|
|
53
|
-
except Exception as
|
|
58
|
+
except Exception as original_error:
|
|
59
|
+
wrapped = DetectionError(
|
|
60
|
+
f"Detection failed for {scanned_page.image_name}: {original_error}"
|
|
61
|
+
)
|
|
54
62
|
detection_results.append(OrientationResult(angle=0, method="error", reliable=False))
|
|
55
|
-
page_errors[page_index] = str(
|
|
63
|
+
page_errors[page_index] = str(wrapped)
|
|
56
64
|
|
|
57
65
|
if len(valid_pages) > 1:
|
|
58
|
-
detection_results = _apply_majority_voting(detection_results)
|
|
66
|
+
detection_results = apply_majority_voting(detection_results)
|
|
59
67
|
|
|
60
68
|
page_results: list[PageResult] = []
|
|
61
69
|
|
|
@@ -64,10 +72,10 @@ def _process_single_source(
|
|
|
64
72
|
|
|
65
73
|
if error_message is None:
|
|
66
74
|
try:
|
|
67
|
-
image = open_as_rgb(scanned_page.image_path)
|
|
68
|
-
corrected_image =
|
|
69
|
-
output_format =
|
|
70
|
-
|
|
75
|
+
image = ImageIO.open_as_rgb(scanned_page.image_path)
|
|
76
|
+
corrected_image = apply_rotation(image, orientation.angle)
|
|
77
|
+
output_format = ImageIO.resolve_format(scanned_page.output_path)
|
|
78
|
+
ImageIO.save(
|
|
71
79
|
corrected_image,
|
|
72
80
|
scanned_page.output_path,
|
|
73
81
|
output_format=output_format,
|
|
@@ -75,8 +83,11 @@ def _process_single_source(
|
|
|
75
83
|
)
|
|
76
84
|
corrected_image.close()
|
|
77
85
|
image.close()
|
|
78
|
-
except Exception as
|
|
79
|
-
|
|
86
|
+
except Exception as original_error:
|
|
87
|
+
wrapped = CorrectionError(
|
|
88
|
+
f"Correction failed for {scanned_page.image_name}: {original_error}"
|
|
89
|
+
)
|
|
90
|
+
error_message = str(wrapped)
|
|
80
91
|
|
|
81
92
|
page_results.append(
|
|
82
93
|
PageResult(
|
|
@@ -94,10 +105,11 @@ def _process_single_source(
|
|
|
94
105
|
|
|
95
106
|
|
|
96
107
|
def _record_completion(source_file_name: str) -> None:
|
|
97
|
-
|
|
98
|
-
|
|
108
|
+
assert _worker_context is not None
|
|
109
|
+
with _worker_context.progress_lock:
|
|
110
|
+
_worker_context.progress_counter.value += 1
|
|
99
111
|
try:
|
|
100
|
-
with open(
|
|
112
|
+
with open(_worker_context.resume_log_path, "a") as resume_log:
|
|
101
113
|
resume_log.write(source_file_name + "\n")
|
|
102
114
|
resume_log.flush()
|
|
103
115
|
except OSError:
|
|
@@ -107,11 +119,13 @@ def _record_completion(source_file_name: str) -> None:
|
|
|
107
119
|
def process_batch(
|
|
108
120
|
batch: list[tuple[str, list[ScannedPage]]],
|
|
109
121
|
) -> list[tuple[str, list[PageResult]]]:
|
|
110
|
-
|
|
122
|
+
assert _worker_context is not None
|
|
111
123
|
batch_results: list[tuple[str, list[PageResult]]] = []
|
|
112
124
|
|
|
113
125
|
for source_file_name, scanned_pages in batch:
|
|
114
|
-
page_results = _process_single_source(
|
|
126
|
+
page_results = _process_single_source(
|
|
127
|
+
source_file_name, scanned_pages, _worker_context.config
|
|
128
|
+
)
|
|
115
129
|
batch_results.append((source_file_name, page_results))
|
|
116
130
|
_record_completion(source_file_name)
|
|
117
131
|
|