docorient 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
+ # Changelog
2
+
3
+ ## 0.1.0 (2026-02-25)
4
+
5
+ - Initial release
6
+ - Projection profile engine for 90°/270° detection
7
+ - Optional Tesseract OSD engine for 180° detection
8
+ - Single image and batch directory processing
9
+ - Multi-page majority voting
10
+ - Resumable batch processing
11
+ - Command-line interface (CLI)
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Cebraspe Lab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,146 @@
1
+ Metadata-Version: 2.4
2
+ Name: docorient
3
+ Version: 0.1.0
4
+ Summary: Document image orientation detection and correction using projection profile analysis and optional Tesseract OSD.
5
+ Project-URL: Homepage, https://github.com/cebraspe-lab/docorient
6
+ Project-URL: Repository, https://github.com/cebraspe-lab/docorient
7
+ Project-URL: Issues, https://github.com/cebraspe-lab/docorient/issues
8
+ Author: Cebraspe Lab
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: correction,document,image,ocr,orientation,rotation,tesseract
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Scientific/Engineering :: Image Processing
21
+ Classifier: Typing :: Typed
22
+ Requires-Python: >=3.10
23
+ Requires-Dist: numpy>=1.24
24
+ Requires-Dist: pillow>=10.0
25
+ Requires-Dist: tqdm>=4.60
26
+ Provides-Extra: dev
27
+ Requires-Dist: build; extra == 'dev'
28
+ Requires-Dist: pytest-cov; extra == 'dev'
29
+ Requires-Dist: pytest>=8.0; extra == 'dev'
30
+ Requires-Dist: ruff>=0.4; extra == 'dev'
31
+ Requires-Dist: twine; extra == 'dev'
32
+ Provides-Extra: ocr
33
+ Requires-Dist: pytesseract>=0.3.10; extra == 'ocr'
34
+ Description-Content-Type: text/markdown
35
+
36
+ # docorient
37
+
38
+ Document image orientation detection and correction.
39
+
40
+ Detects and fixes rotation (0°, 90°, 180°, 270°) in scanned document images using projection profile analysis and optional Tesseract OSD.
41
+
42
+ ## Installation
43
+
44
+ ```bash
45
+ pip install docorient
46
+ ```
47
+
48
+ For 180° detection via Tesseract OSD:
49
+
50
+ ```bash
51
+ pip install docorient[ocr]
52
+ ```
53
+
54
+ > **Note:** The `[ocr]` extra requires [Tesseract](https://github.com/tesseract-ocr/tesseract) installed on your system.
55
+
56
+ ## Quick Start
57
+
58
+ ### Detect orientation
59
+
60
+ ```python
61
+ from PIL import Image
62
+ from docorient import detect_orientation
63
+
64
+ image = Image.open("document.jpg")
65
+ result = detect_orientation(image)
66
+
67
+ print(result.angle) # 0, 90, 180, or 270
68
+ print(result.method) # detection method used
69
+ print(result.reliable) # confidence flag
70
+ ```
71
+
72
+ ### Correct a single image
73
+
74
+ ```python
75
+ from docorient import correct_image
76
+
77
+ corrected = correct_image(image)
78
+ corrected.save("fixed.jpg")
79
+ ```
80
+
81
+ ### Correct with metadata
82
+
83
+ ```python
84
+ from docorient import correct_image
85
+
86
+ result = correct_image(image, return_metadata=True)
87
+ print(result.orientation.angle)
88
+ result.image.save("fixed.jpg")
89
+ ```
90
+
91
+ ### Correct multi-page document (majority voting)
92
+
93
+ ```python
94
+ from docorient import correct_document_pages
95
+
96
+ pages = [Image.open(f"page_{i}.jpg") for i in range(5)]
97
+ corrected_pages = correct_document_pages(pages)
98
+ ```
99
+
100
+ ### Batch process a directory
101
+
102
+ ```python
103
+ from docorient import process_directory, OrientationConfig
104
+
105
+ config = OrientationConfig(workers=4, output_quality=95)
106
+ summary = process_directory("./scans", output_dir="./fixed", config=config)
107
+
108
+ print(f"Corrected: {summary.corrected}/{summary.total_pages}")
109
+ ```
110
+
111
+ ### CLI
112
+
113
+ ```bash
114
+ docorient ./scans --output ./fixed --workers 4
115
+ docorient ./scans --dry-run
116
+ docorient ./scans --no-ocr --limit 100
117
+ ```
118
+
119
+ ## How It Works
120
+
121
+ 1. **Projection profile analysis** detects 90° and 270° rotations by comparing horizontal vs vertical text energy
122
+ 2. **Tesseract OSD** (optional) detects 180° rotation with confidence thresholding
123
+ 3. **Majority voting** across pages of the same document improves reliability
124
+
125
+ ## Supported Formats
126
+
127
+ Any format readable by Pillow: JPEG, PNG, TIFF, BMP, GIF, WebP, and more.
128
+
129
+ ## Configuration
130
+
131
+ ```python
132
+ from docorient import OrientationConfig
133
+
134
+ config = OrientationConfig(
135
+ osd_confidence_threshold=2.0,
136
+ output_quality=92,
137
+ max_osd_dimension=1200,
138
+ projection_target_dimension=800,
139
+ workers=4,
140
+ resume_enabled=True,
141
+ )
142
+ ```
143
+
144
+ ## License
145
+
146
+ MIT
@@ -0,0 +1,111 @@
1
+ # docorient
2
+
3
+ Document image orientation detection and correction.
4
+
5
+ Detects and fixes rotation (0°, 90°, 180°, 270°) in scanned document images using projection profile analysis and optional Tesseract OSD.
6
+
7
+ ## Installation
8
+
9
+ ```bash
10
+ pip install docorient
11
+ ```
12
+
13
+ For 180° detection via Tesseract OSD:
14
+
15
+ ```bash
16
+ pip install docorient[ocr]
17
+ ```
18
+
19
+ > **Note:** The `[ocr]` extra requires [Tesseract](https://github.com/tesseract-ocr/tesseract) installed on your system.
20
+
21
+ ## Quick Start
22
+
23
+ ### Detect orientation
24
+
25
+ ```python
26
+ from PIL import Image
27
+ from docorient import detect_orientation
28
+
29
+ image = Image.open("document.jpg")
30
+ result = detect_orientation(image)
31
+
32
+ print(result.angle) # 0, 90, 180, or 270
33
+ print(result.method) # detection method used
34
+ print(result.reliable) # confidence flag
35
+ ```
36
+
37
+ ### Correct a single image
38
+
39
+ ```python
40
+ from docorient import correct_image
41
+
42
+ corrected = correct_image(image)
43
+ corrected.save("fixed.jpg")
44
+ ```
45
+
46
+ ### Correct with metadata
47
+
48
+ ```python
49
+ from docorient import correct_image
50
+
51
+ result = correct_image(image, return_metadata=True)
52
+ print(result.orientation.angle)
53
+ result.image.save("fixed.jpg")
54
+ ```
55
+
56
+ ### Correct multi-page document (majority voting)
57
+
58
+ ```python
59
+ from docorient import correct_document_pages
60
+
61
+ pages = [Image.open(f"page_{i}.jpg") for i in range(5)]
62
+ corrected_pages = correct_document_pages(pages)
63
+ ```
64
+
65
+ ### Batch process a directory
66
+
67
+ ```python
68
+ from docorient import process_directory, OrientationConfig
69
+
70
+ config = OrientationConfig(workers=4, output_quality=95)
71
+ summary = process_directory("./scans", output_dir="./fixed", config=config)
72
+
73
+ print(f"Corrected: {summary.corrected}/{summary.total_pages}")
74
+ ```
75
+
76
+ ### CLI
77
+
78
+ ```bash
79
+ docorient ./scans --output ./fixed --workers 4
80
+ docorient ./scans --dry-run
81
+ docorient ./scans --no-ocr --limit 100
82
+ ```
83
+
84
+ ## How It Works
85
+
86
+ 1. **Projection profile analysis** detects 90° and 270° rotations by comparing horizontal vs vertical text energy
87
+ 2. **Tesseract OSD** (optional) detects 180° rotation with confidence thresholding
88
+ 3. **Majority voting** across pages of the same document improves reliability
89
+
90
+ ## Supported Formats
91
+
92
+ Any format readable by Pillow: JPEG, PNG, TIFF, BMP, GIF, WebP, and more.
93
+
94
+ ## Configuration
95
+
96
+ ```python
97
+ from docorient import OrientationConfig
98
+
99
+ config = OrientationConfig(
100
+ osd_confidence_threshold=2.0,
101
+ output_quality=92,
102
+ max_osd_dimension=1200,
103
+ projection_target_dimension=800,
104
+ workers=4,
105
+ resume_enabled=True,
106
+ )
107
+ ```
108
+
109
+ ## License
110
+
111
+ MIT
@@ -0,0 +1,67 @@
1
+ [build-system]
2
+ requires = ["hatchling >= 1.26"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "docorient"
7
+ version = "0.1.0"
8
+ description = "Document image orientation detection and correction using projection profile analysis and optional Tesseract OSD."
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">= 3.10"
12
+ authors = [{ name = "Cebraspe Lab" }]
13
+ keywords = ["document", "orientation", "rotation", "ocr", "image", "correction", "tesseract"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Programming Language :: Python :: 3.13",
23
+ "Topic :: Scientific/Engineering :: Image Processing",
24
+ "Typing :: Typed",
25
+ ]
26
+ dependencies = [
27
+ "Pillow >= 10.0",
28
+ "numpy >= 1.24",
29
+ "tqdm >= 4.60",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ ocr = ["pytesseract >= 0.3.10"]
34
+ dev = ["pytest >= 8.0", "pytest-cov", "ruff >= 0.4", "build", "twine"]
35
+
36
+ [project.scripts]
37
+ docorient = "docorient.cli:main"
38
+
39
+ [project.urls]
40
+ Homepage = "https://github.com/cebraspe-lab/docorient"
41
+ Repository = "https://github.com/cebraspe-lab/docorient"
42
+ Issues = "https://github.com/cebraspe-lab/docorient/issues"
43
+
44
+ [tool.hatch.build.targets.wheel]
45
+ packages = ["src/docorient"]
46
+
47
+ [tool.ruff]
48
+ target-version = "py310"
49
+ line-length = 100
50
+
51
+ [tool.ruff.lint]
52
+ select = [
53
+ "E",
54
+ "F",
55
+ "W",
56
+ "I",
57
+ "N",
58
+ "UP",
59
+ "B",
60
+ "SIM",
61
+ "RUF",
62
+ ]
63
+ ignore = ["B905"]
64
+
65
+ [tool.pytest.ini_options]
66
+ testpaths = ["tests"]
67
+ pythonpath = ["src"]
@@ -0,0 +1,36 @@
1
+ from docorient._version import __version__
2
+ from docorient.batch.processor import process_directory
3
+ from docorient.config import OrientationConfig
4
+ from docorient.correction import correct_document_pages, correct_image
5
+ from docorient.detection.engine import detect_orientation
6
+ from docorient.exceptions import (
7
+ BatchProcessingError,
8
+ CorrectionError,
9
+ DetectionError,
10
+ DocorientError,
11
+ TesseractNotAvailableError,
12
+ )
13
+ from docorient.types import (
14
+ BatchSummary,
15
+ CorrectionResult,
16
+ OrientationResult,
17
+ PageResult,
18
+ )
19
+
20
+ __all__ = [
21
+ "BatchProcessingError",
22
+ "BatchSummary",
23
+ "CorrectionError",
24
+ "CorrectionResult",
25
+ "DetectionError",
26
+ "DocorientError",
27
+ "OrientationConfig",
28
+ "OrientationResult",
29
+ "PageResult",
30
+ "TesseractNotAvailableError",
31
+ "__version__",
32
+ "correct_document_pages",
33
+ "correct_image",
34
+ "detect_orientation",
35
+ "process_directory",
36
+ ]
@@ -0,0 +1,46 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from PIL import Image
6
+
7
+
8
def open_as_rgb(image_path: str | Path) -> Image.Image:
    """Load an image file and return it converted to RGB mode.

    The file is opened inside a context manager so the underlying file handle
    is released as soon as the conversion has produced its result, instead of
    lingering until garbage collection.

    Args:
        image_path: Location of the image file to read.

    Returns:
        The result of ``convert("RGB")`` on the opened image.
    """
    with Image.open(image_path) as source_image:
        return source_image.convert("RGB")
10
+
11
+
12
def downscale_to_max_dimension(image: Image.Image, max_dimension: int) -> Image.Image:
    """Shrink an image so that its longer side is at most ``max_dimension``.

    Both sides are scaled by the same factor. An image that already fits is
    returned as-is (the same object, not a copy).

    Args:
        image: Image to shrink.
        max_dimension: Upper bound, in pixels, for the longer side.

    Returns:
        ``image`` itself when no resize is needed, otherwise the result of
        ``image.resize`` using the LANCZOS filter.

    Raises:
        ValueError: If a resize is required but ``max_dimension`` is below 1.
    """
    image_width, image_height = image.size
    largest_side = max(image_width, image_height)

    if largest_side <= max_dimension:
        return image

    if max_dimension < 1:
        raise ValueError(f"max_dimension must be at least 1, got {max_dimension}")

    scale_factor = max_dimension / largest_side
    # int() truncates, so the short side of a very elongated image
    # (e.g. 10000x5 scaled to 800) would become 0; clamp to keep the size valid.
    target_width = max(1, int(image_width * scale_factor))
    target_height = max(1, int(image_height * scale_factor))
    return image.resize((target_width, target_height), Image.LANCZOS)
23
+
24
+
25
def save_image(
    image: Image.Image,
    output_path: str | Path,
    output_format: str = "JPEG",
    quality: int = 92,
) -> None:
    """Write ``image`` to ``output_path``.

    Args:
        image: Image to write.
        output_path: Destination passed to ``image.save``.
        output_format: Format name handed to ``image.save``.
        quality: Value forwarded as the ``quality`` save option.
    """
    image.save(output_path, format=output_format, quality=quality)
32
+
33
+
34
def determine_output_format(file_path: str | Path) -> str:
    """Return the image format name that corresponds to a file's extension.

    The extension is compared case-insensitively. Unrecognised or missing
    extensions fall back to ``"JPEG"``.

    Args:
        file_path: File name or path whose suffix selects the format.

    Returns:
        One of ``"JPEG"``, ``"PNG"``, ``"TIFF"``, ``"BMP"``, ``"GIF"`` or
        ``"WEBP"``.
    """
    suffix = Path(file_path).suffix.lower()

    if suffix in (".tif", ".tiff"):
        return "TIFF"
    if suffix in (".png", ".bmp", ".gif", ".webp"):
        # For these the format name is simply the upper-cased extension.
        return suffix[1:].upper()
    # Covers ".jpg"/".jpeg" as well as every unknown extension.
    return "JPEG"
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,3 @@
1
+ from docorient.batch.processor import process_directory
2
+
3
+ __all__ = ["process_directory"]
@@ -0,0 +1,197 @@
1
+ from __future__ import annotations
2
+
3
+ import multiprocessing
4
+ import sys
5
+ import time
6
+ import uuid
7
+ from dataclasses import asdict
8
+ from pathlib import Path
9
+
10
+ from tqdm import tqdm
11
+
12
+ from docorient.batch.scanner import ScannedPage, scan_directory
13
+ from docorient.batch.worker import initialize_worker, process_batch
14
+ from docorient.config import RESUME_LOG_FILENAME, OrientationConfig
15
+ from docorient.types import BatchSummary, PageResult
16
+
17
+
18
def _load_completed_sources(resume_log_path: Path) -> set[str]:
    """Read the resume log and return the source names listed in it.

    Each non-blank line contributes one entry, with surrounding whitespace
    stripped. A missing log file yields an empty set.

    Args:
        resume_log_path: Location of the resume log file.

    Returns:
        The set of names found in the log.
    """
    completed: set[str] = set()
    if resume_log_path.exists():
        with open(resume_log_path) as log_file:
            for raw_line in log_file:
                entry = raw_line.strip()
                if entry:
                    completed.add(entry)
    return completed
23
+
24
+
25
def _distribute_into_batches(
    items: list[tuple[str, list[ScannedPage]]],
    batch_count: int,
) -> list[list[tuple[str, list[ScannedPage]]]]:
    """Deal ``items`` round-robin into ``batch_count`` lists.

    Item ``i`` lands in batch ``i % batch_count``, so batch sizes differ by at
    most one and the relative order of items inside each batch is kept.

    Args:
        items: Entries to distribute.
        batch_count: Number of batches to create.

    Returns:
        A list of ``batch_count`` lists; some may be empty when there are
        fewer items than batches.
    """
    buckets: list[list[tuple[str, list[ScannedPage]]]] = [[] for _ in range(batch_count)]
    for position, entry in enumerate(items):
        buckets[position % batch_count].append(entry)
    return buckets
34
+
35
+
36
def _build_summary(
    input_directory: str,
    output_directory: str,
    total_files: int,
    all_page_results: dict[str, list[PageResult]],
    source_file_names: list[str],
) -> BatchSummary:
    """Aggregate per-page results into a ``BatchSummary``.

    Pages are gathered in the order of ``source_file_names``; sources with no
    entry in ``all_page_results`` contribute nothing. Each page is counted in
    exactly one bucket: error (``error`` is set), corrected (detected angle is
    non-zero) or already correct. Corrected pages whose detection method
    contains ``"->majority"`` are additionally counted as corrected by
    majority.

    Args:
        input_directory: Input directory path recorded in the summary.
        output_directory: Output directory path recorded in the summary.
        total_files: Number of source files recorded in the summary.
        all_page_results: Page results keyed by source name.
        source_file_names: Source names that define the page ordering.

    Returns:
        The populated summary.
    """
    ordered_pages: list[PageResult] = [
        page
        for name in source_file_names
        for page in all_page_results.get(name, [])
    ]

    # Only pages without an error have their orientation inspected.
    succeeded = [page for page in ordered_pages if page.error is None]
    rotated = [page for page in succeeded if page.orientation.angle != 0]
    majority_total = sum(1 for page in rotated if "->majority" in page.orientation.method)

    return BatchSummary(
        input_directory=input_directory,
        output_directory=output_directory,
        total_files=total_files,
        total_pages=len(ordered_pages),
        already_correct=len(succeeded) - len(rotated),
        corrected=len(rotated),
        corrected_by_majority=majority_total,
        errors=len(ordered_pages) - len(succeeded),
        pages=tuple(ordered_pages),
    )
72
+
73
+
74
def process_directory(
    input_dir: str | Path,
    *,
    output_dir: str | Path | None = None,
    config: OrientationConfig | None = None,
    limit: int = 0,
    show_progress: bool = True,
) -> BatchSummary:
    """Process all images in a directory, detecting and correcting orientation.

    Images are grouped by source document, the pending sources are dealt
    round-robin into one batch per worker process, and the per-page results
    returned by the workers are merged into a summary.

    Args:
        input_dir: Path to directory containing document images.
        output_dir: Path for corrected output. None generates a UUID-named
            directory next to the input directory.
        config: Processing configuration. Uses defaults if not provided.
        limit: Maximum number of images to process. 0 means all.
        show_progress: Whether to display a tqdm progress bar.

    Returns:
        BatchSummary with statistics and per-page results. Sources skipped
        because they are listed in the resume log contribute no pages, and a
        batch whose worker failed is logged and left out of the summary.

    Raises:
        SystemExit: With exit code 1 when interrupted by ``KeyboardInterrupt``
            while waiting for the workers (the pool is terminated first).
    """
    # Local import keeps the failure logging below self-contained.
    import logging

    logger = logging.getLogger(__name__)

    effective_config = config or OrientationConfig()
    input_path = Path(input_dir).resolve()

    if output_dir is None:
        output_path = input_path.parent / str(uuid.uuid4())
    else:
        output_path = Path(output_dir).resolve()

    output_path.mkdir(parents=True, exist_ok=True)

    pages_by_source = scan_directory(
        input_path,
        output_path,
        supported_extensions=effective_config.supported_extensions,
        limit=limit,
    )

    source_file_names = list(pages_by_source.keys())
    total_files = len(source_file_names)
    total_pages = sum(len(pages) for pages in pages_by_source.values())

    if total_pages == 0:
        return _build_summary(str(input_path), str(output_path), 0, {}, [])

    resume_log_path = output_path / RESUME_LOG_FILENAME
    already_completed_sources = set()

    if effective_config.resume_enabled:
        already_completed_sources = _load_completed_sources(resume_log_path)

    pending_sources = [
        (source_name, pages_by_source[source_name])
        for source_name in source_file_names
        if source_name not in already_completed_sources
    ]

    all_page_results: dict[str, list[PageResult]] = {}

    if not pending_sources:
        # Everything was already listed in the resume log: nothing to run.
        return _build_summary(
            str(input_path), str(output_path), total_files, all_page_results, source_file_names
        )

    # Never start more workers than there are sources to hand out.
    worker_count = min(effective_config.effective_workers, len(pending_sources))
    batches = _distribute_into_batches(pending_sources, worker_count)

    # Shared counter polled below to drive the progress bar.
    progress_counter = multiprocessing.Value("i", 0)
    progress_lock = multiprocessing.Lock()

    config_as_dict = asdict(effective_config)

    worker_pool = multiprocessing.Pool(
        processes=worker_count,
        initializer=initialize_worker,
        initargs=(progress_counter, progress_lock, str(resume_log_path), config_as_dict),
        maxtasksperchild=1,
    )

    async_results = [
        worker_pool.apply_async(process_batch, (batch,)) for batch in batches
    ]
    worker_pool.close()

    if show_progress:
        progress_bar = tqdm(
            total=len(pending_sources),
            desc="Correcting",
            unit="file",
            bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
        )
    else:
        progress_bar = None

    try:
        # Poll instead of blocking so the bar keeps moving and Ctrl+C is seen.
        while not all(async_result.ready() for async_result in async_results):
            if progress_bar is not None:
                progress_bar.n = progress_counter.value
                progress_bar.refresh()
            time.sleep(0.3)
    except KeyboardInterrupt:
        worker_pool.terminate()
        worker_pool.join()
        if progress_bar is not None:
            progress_bar.close()
        sys.exit(1)

    if progress_bar is not None:
        progress_bar.n = progress_counter.value
        progress_bar.refresh()
        progress_bar.close()

    for async_result in async_results:
        try:
            batch_results = async_result.get(timeout=60)
            for source_name, page_results in batch_results:
                all_page_results[source_name] = page_results
        except Exception:
            # Best effort: keep the results of the other batches, but make the
            # failure visible instead of silently dropping it.
            logger.exception(
                "A worker batch failed; its pages are missing from the summary"
            )

    worker_pool.join()

    return _build_summary(
        str(input_path), str(output_path), total_files, all_page_results, source_file_names
    )
@@ -0,0 +1,59 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+
7
# Matches file names of the form "<source>_p<digits>.<ext>": group 1 captures
# the part before "_p" and group 2 the digits that follow it.
PAGE_PATTERN = re.compile(r"^(.+)_p(\d+)\.\w+$")
8
+
9
+
10
@dataclass(frozen=True, slots=True)
class ScannedPage:
    """Immutable record describing one image file and where its output goes."""

    # Name of the source document this page is grouped under.
    source_file: str
    # Page number within the source document.
    page_number: int
    # File name of the image.
    image_name: str
    # Path of the input image, stored as a string.
    image_path: str
    # Path of the output image, stored as a string.
    output_path: str
17
+
18
+
19
def scan_directory(
    input_directory: Path,
    output_directory: Path,
    supported_extensions: tuple[str, ...],
    limit: int = 0,
) -> dict[str, list[ScannedPage]]:
    """Collect the image files of a directory, grouped by source document.

    Only regular files directly inside ``input_directory`` whose lower-cased
    suffix is in ``supported_extensions`` are considered. A file named
    ``<source>_p<N>.<ext>`` is page ``N`` of ``<source>``; any other file is
    page 1 of a source named after its stem.

    Args:
        input_directory: Directory to scan (not recursive).
        output_directory: Directory used to build each page's output path
            (``output_directory / <image name>``).
        supported_extensions: Accepted suffixes, compared against the
            lower-cased file suffix.
        limit: Keep only the first ``limit`` files in sorted path order.
            Values of 0 or less keep everything.

    Returns:
        Mapping from source name to its pages. Sources appear in the order
        their first file is encountered; pages of each source are ordered by
        page number.
    """
    candidate_paths = sorted(
        entry
        for entry in input_directory.iterdir()
        if entry.is_file() and entry.suffix.lower() in supported_extensions
    )

    if limit > 0:
        candidate_paths = candidate_paths[:limit]

    pages_by_source: dict[str, list[ScannedPage]] = {}

    for image_path in candidate_paths:
        image_name = image_path.name
        page_match = PAGE_PATTERN.match(image_name)

        if page_match:
            source_file_name = page_match.group(1)
            page_number = int(page_match.group(2))
        else:
            source_file_name = image_path.stem
            page_number = 1

        pages_by_source.setdefault(source_file_name, []).append(
            ScannedPage(
                source_file=source_file_name,
                page_number=page_number,
                image_name=image_name,
                image_path=str(image_path),
                output_path=str(output_directory / image_name),
            )
        )

    # Path sorting is lexicographic, which puts "_p10" before "_p2"; reorder
    # each document's pages numerically (stable, so ties keep path order).
    for source_pages in pages_by_source.values():
        source_pages.sort(key=lambda page: page.page_number)

    return pages_by_source