dforge-cli 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dforge/__init__.py +1 -0
- dforge/banner.py +31 -0
- dforge/batch.py +156 -0
- dforge/cli.py +525 -0
- dforge/config.py +38 -0
- dforge/config_manager.py +33 -0
- dforge/converter.py +167 -0
- dforge/dependencies.py +98 -0
- dforge/engine.py +236 -0
- dforge/extractor.py +201 -0
- dforge/loading.py +19 -0
- dforge/menu.py +115 -0
- dforge/operations.py +314 -0
- dforge/processor.py +251 -0
- dforge/setup.py +107 -0
- dforge/theme.py +12 -0
- dforge/utils.py +169 -0
- dforge/watcher.py +137 -0
- dforge/workflows/__init__.py +0 -0
- dforge/workflows/automation.py +21 -0
- dforge/workflows/batch.py +18 -0
- dforge/workflows/batch_ocr.py +61 -0
- dforge/workflows/common.py +133 -0
- dforge/workflows/compress.py +73 -0
- dforge/workflows/convert.py +148 -0
- dforge/workflows/decrypt.py +50 -0
- dforge/workflows/encrypt.py +50 -0
- dforge/workflows/extract.py +18 -0
- dforge/workflows/image.py +21 -0
- dforge/workflows/merge.py +109 -0
- dforge/workflows/ocr.py +104 -0
- dforge/workflows/ocr_folder.py +0 -0
- dforge/workflows/pages.py +57 -0
- dforge/workflows/rotate.py +53 -0
- dforge/workflows/searchable.py +51 -0
- dforge/workflows/settings.py +56 -0
- dforge/workflows/split.py +32 -0
- dforge/workflows/tables.py +45 -0
- dforge/workflows/watermark.py +54 -0
- dforge_cli-1.0.1.dist-info/METADATA +244 -0
- dforge_cli-1.0.1.dist-info/RECORD +44 -0
- dforge_cli-1.0.1.dist-info/WHEEL +5 -0
- dforge_cli-1.0.1.dist-info/entry_points.txt +2 -0
- dforge_cli-1.0.1.dist-info/top_level.txt +1 -0
dforge/loading.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from rich.console import Console
|
|
2
|
+
from rich.status import Status
|
|
3
|
+
|
|
4
|
+
console = Console()
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Loader:
    """Context manager that displays a rich ``Status`` spinner while a block runs."""

    def __init__(self, text):
        # Every loader message uses the same bold-cyan markup.
        label = f"[bold cyan]{text}[/bold cyan]"
        self.status = Status(label, spinner="dots")

    def __enter__(self):
        """Start the spinner and return this loader."""
        self.status.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop the spinner; returns None, so exceptions are not suppressed."""
        self.status.stop()
|
dforge/menu.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import questionary
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def show_menu(title: str, choices: list[str]):
    """Show a questionary select prompt and return whatever ``ask()`` yields."""
    prompt = questionary.select(title, choices=choices)
    return prompt.ask()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def main_menu():
    """Prompt with the top-level menu and return the result of ``show_menu``."""
    options = [
        "📄 PDF Tools",
        "🔍 OCR",
        "🔄 Conversion",
        "🖼 Image Processing",
        "⚡ Batch Processing",
        "👀 Watch Folder",
        "⚙ Settings",
        "❌ Exit",
    ]
    return show_menu("What would you like to do?", options)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def pdf_menu():
    """Prompt with the PDF tools submenu and return the result of ``show_menu``."""
    options = [
        "Merge PDFs",
        "Split PDF",
        "Compress PDF",
        "Rotate PDF",
        "Extract Pages",
        "Watermark PDF",
        "Encrypt PDF",
        "Decrypt PDF",
        "⬅ Back",
    ]
    return show_menu("PDF Tools", options)
|
|
42
|
+
def ocr_menu():
    """Prompt with the OCR tools submenu and return the result of ``show_menu``."""
    options = [
        "OCR Image/PDF",
        "Searchable PDF",
        "Batch OCR",
        "OCR Folder",
        "Extract Tables",
        "OCR Settings",
        "⬅ Back",
    ]
    return show_menu("OCR Tools", options)
|
|
55
|
+
def conversion_menu():
    """Prompt with the conversion submenu and return the result of ``show_menu``."""
    options = [
        "Markdown → PDF",
        "Markdown → DOCX",
        "DOCX → PDF",
        "DOCX → Markdown",
        "Images → PDF",
        "PDF → Images",
        "⬅ Back",
    ]
    return show_menu("Conversion Tools", options)
|
|
68
|
+
def extract_menu():
    """Prompt with the extraction submenu and return the result of ``show_menu``."""
    options = [
        "Extract Text",
        "Extract Images",
        "Extract Metadata",
        "⬅ Back",
    ]
    return show_menu("Extract Tools", options)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def batch_menu():
    """Prompt with the batch tools submenu and return the result of ``show_menu``."""
    options = [
        "Batch Convert",
        "Batch Compress",
        "Batch OCR",
        "⬅ Back",
    ]
    return show_menu("Batch Tools", options)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def automation_menu():
    """Prompt with the automation submenu and return the result of ``show_menu``."""
    options = [
        "Watch Folder",
        "Auto OCR",
        "Auto Convert",
        "Scheduled Tasks",
        "⬅ Back",
    ]
    return show_menu("Automation Tools", options)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def image_menu():
    """Prompt with the image tools submenu and return the result of ``show_menu``."""
    options = [
        "Resize Images",
        "Convert Format",
        "Crop Images",
        "Watermark Images",
        "⬅ Back",
    ]
    return show_menu("Image Tools", options)
|
dforge/operations.py
ADDED
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DForge PDF Operations
|
|
3
|
+
Handles: merge, split, compress, rotate, page extraction, watermark, encrypt, decrypt
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import subprocess
|
|
10
|
+
import sys
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import List, Optional
|
|
13
|
+
|
|
14
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn
|
|
15
|
+
|
|
16
|
+
from dforge.utils import (
|
|
17
|
+
abort, console, ensure_parent, ghostscript_bin,
|
|
18
|
+
info, require_ghostscript, resolve_output, success, warn,
|
|
19
|
+
)
|
|
20
|
+
from dforge.config import DEFAULT_COMPRESS_PRESET
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
# Merge
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
|
|
27
|
+
def merge(inputs: List[Path], output: Path) -> None:
    """Merge multiple PDF files into one.

    Pages are appended to the result in the order the input files are given.

    Args:
        inputs: PDF files to concatenate; every path must exist.
        output: Destination file; passed to ``ensure_parent`` before writing.

    Aborts (via ``abort``) when pypdf is missing, when ``inputs`` is empty,
    or when any input file does not exist.
    """
    # Import both names under one guard so a missing pypdf is always reported
    # through abort() rather than surfacing later from inside the merge loop.
    try:
        from pypdf import PdfReader, PdfWriter
    except ImportError:
        abort("pypdf is required. Run: pip install pypdf")

    # Without this guard an empty list would write a zero-page PDF and
    # still report success.
    if not inputs:
        abort("No input files provided.")

    for f in inputs:
        if not f.exists():
            abort(f"File not found: {f}")

    writer = PdfWriter()
    with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
        task = progress.add_task("Merging PDFs...", total=len(inputs))
        for f in inputs:
            reader = PdfReader(str(f))
            for page in reader.pages:
                writer.add_page(page)
            progress.advance(task)

    ensure_parent(output)
    with open(output, "wb") as fh:
        writer.write(fh)
    success(f"Merged {len(inputs)} files -> [bold]{output}[/bold]")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
# Split
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
|
|
58
|
+
def split(input_path: Path, output_dir: Optional[Path] = None) -> None:
    """Write every page of a PDF to its own single-page file.

    Args:
        input_path: PDF to split.
        output_dir: Directory for the page files; when not given, a sibling
            directory named ``<stem>_pages`` is used. It is created if needed.

    Output files are named ``<stem>_page_NNNN.pdf`` with 1-based numbering.
    """
    try:
        from pypdf import PdfReader, PdfWriter
    except ImportError:
        abort("pypdf is required. Run: pip install pypdf")

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    if output_dir:
        dest = output_dir
    else:
        dest = input_path.parent / (input_path.stem + "_pages")
    dest.mkdir(parents=True, exist_ok=True)

    reader = PdfReader(str(input_path))
    total = len(reader.pages)

    spinner_columns = (SpinnerColumn(), TextColumn("[progress.description]{task.description}"))
    with Progress(*spinner_columns, console=console) as progress:
        task = progress.add_task(f"Splitting {total} pages...", total=total)
        for number, page in enumerate(reader.pages, start=1):
            # A fresh writer per page keeps each output file single-page.
            single = PdfWriter()
            single.add_page(page)
            target = dest / f"{input_path.stem}_page_{number:04d}.pdf"
            with open(target, "wb") as fh:
                single.write(fh)
            progress.advance(task)

    success(f"Split into {total} pages -> [bold]{dest}/[/bold]")
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# ---------------------------------------------------------------------------
|
|
88
|
+
# Compress
|
|
89
|
+
# ---------------------------------------------------------------------------
|
|
90
|
+
|
|
91
|
+
def compress(input_path: Path, output: Optional[Path] = None, preset: str = DEFAULT_COMPRESS_PRESET) -> None:
    """Re-write a PDF through Ghostscript's ``pdfwrite`` device to shrink it.

    Args:
        input_path: PDF to compress.
        output: Destination file; when not given it is derived with
            ``resolve_output`` using the ``_compressed`` suffix.
        preset: Value inserted after ``-dPDFSETTINGS=/``.

    Aborts when the input is missing or Ghostscript exits non-zero, and
    otherwise reports the before/after size in KB.
    """
    require_ghostscript()

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    out = output if output else resolve_output(input_path, None, "_compressed", ".pdf")
    ensure_parent(out)

    gs_options = [
        "-sDEVICE=pdfwrite",
        "-dCompatibilityLevel=1.4",
        f"-dPDFSETTINGS=/{preset}",
        "-dNOPAUSE",
        "-dQUIET",
        "-dBATCH",
        f"-sOutputFile={out}",
    ]
    # Argument-list form (no shell), executable first and input file last.
    cmd = [ghostscript_bin(), *gs_options, str(input_path)]

    info(f"Compressing with preset '[bold]{preset}[/bold]'...")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        abort(f"Ghostscript error:\n{result.stderr}")

    original_kb = input_path.stat().st_size / 1024
    compressed_kb = out.stat().st_size / 1024
    # Guard the division for zero-byte inputs.
    if original_kb > 0:
        ratio = (1 - compressed_kb / original_kb) * 100
    else:
        ratio = 0
    success(
        f"Compressed: {original_kb:.1f} KB -> {compressed_kb:.1f} KB "
        f"([green]{ratio:.1f}% reduction[/green]) -> [bold]{out}[/bold]"
    )
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# ---------------------------------------------------------------------------
|
|
129
|
+
# Rotate
|
|
130
|
+
# ---------------------------------------------------------------------------
|
|
131
|
+
|
|
132
|
+
def rotate(input_path: Path, degrees: int, output: Optional[Path] = None) -> None:
    """Rotate every page of a PDF by 90, 180 or 270 degrees.

    Args:
        input_path: PDF to rotate.
        degrees: Rotation to apply; any value other than 90, 180 or 270 aborts.
        output: Destination file; when not given it is derived with
            ``resolve_output`` using the ``_rotated<degrees>`` suffix.
    """
    try:
        from pypdf import PdfReader, PdfWriter
    except ImportError:
        abort("pypdf is required.")

    allowed_angles = (90, 180, 270)
    if degrees not in allowed_angles:
        abort("Rotation must be 90, 180, or 270 degrees.")

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    source = PdfReader(str(input_path))
    writer = PdfWriter()

    for sheet in source.pages:
        sheet.rotate(degrees)
        writer.add_page(sheet)

    out = output if output else resolve_output(input_path, None, f"_rotated{degrees}", ".pdf")
    ensure_parent(out)
    with open(out, "wb") as fh:
        writer.write(fh)
    success(f"Rotated {len(source.pages)} pages by {degrees} deg -> [bold]{out}[/bold]")
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
# ---------------------------------------------------------------------------
|
|
160
|
+
# Extract page range
|
|
161
|
+
# ---------------------------------------------------------------------------
|
|
162
|
+
|
|
163
|
+
def extract_pages(input_path: Path, page_range: str, output: Optional[Path] = None) -> None:
    """
    Extract a page range from a PDF.

    page_range format: "1-5" or "3" or "2,4,6" (1-based, inclusive; parts may
    be combined with commas and are emitted in the order written).

    Args:
        input_path: PDF to read pages from.
        page_range: Page selection string as described above.
        output: Destination file; when not given it is derived with
            ``resolve_output`` using a ``_pages_<range>`` suffix.

    Aborts (via ``abort``) when pypdf is missing, the file does not exist,
    the range is malformed, reversed or empty, or a page is out of bounds.
    """
    try:
        from pypdf import PdfReader, PdfWriter
    except ImportError:
        abort("pypdf is required.")

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    reader = PdfReader(str(input_path))
    total = len(reader.pages)

    # Parse range
    pages: List[int] = []
    for part in page_range.split(","):
        part = part.strip()
        if not part:
            # Tolerate stray commas such as "1,2,".
            continue
        try:
            if "-" in part:
                start_s, end_s = part.split("-", 1)
                start, end = int(start_s), int(end_s)
            else:
                start = end = int(part)
        except ValueError:
            # Report bad input as a clean message, not an int() traceback.
            abort(f"Invalid page range '{page_range}'. Use formats like 1-5, 3, or 2,4,6.")
        if start > end:
            # range(start, end + 1) would be empty and silently drop the part.
            abort(f"Invalid page range '{part}': start is greater than end.")
        pages.extend(range(start, end + 1))

    if not pages:
        abort("No pages selected.")

    # Validate
    invalid = [p for p in pages if p < 1 or p > total]
    if invalid:
        abort(f"Page(s) out of range (document has {total} pages): {invalid}")

    writer = PdfWriter()
    for p in pages:
        # User-facing page numbers are 1-based; reader.pages is 0-based.
        writer.add_page(reader.pages[p - 1])

    out = output or resolve_output(input_path, None, f"_pages_{page_range.replace(',', '-')}", ".pdf")
    ensure_parent(out)
    with open(out, "wb") as fh:
        writer.write(fh)
    success(f"Extracted {len(pages)} pages -> [bold]{out}[/bold]")
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
# ---------------------------------------------------------------------------
|
|
207
|
+
# Watermark
|
|
208
|
+
# ---------------------------------------------------------------------------
|
|
209
|
+
|
|
210
|
+
def watermark(input_path: Path, watermark_file: Path, output: Optional[Path] = None) -> None:
    """Overlay a watermark (PDF or image) on every page.

    Args:
        input_path: PDF to watermark.
        watermark_file: Either a PDF, whose first page is used, or an image
            (.png, .jpg, .jpeg, .tif, .tiff, .bmp), which is first converted
            to a temporary single-page PDF.
        output: Destination file; when not given it is derived with
            ``resolve_output`` using the ``_watermarked`` suffix.

    Aborts (via ``abort``) when pypdf is missing or either file is absent.
    """
    try:
        from pypdf import PdfReader, PdfWriter
    except ImportError:
        abort("pypdf is required.")

    if not input_path.exists():
        abort(f"File not found: {input_path}")
    if not watermark_file.exists():
        abort(f"Watermark file not found: {watermark_file}")

    # If watermark is an image, convert it to a single-page PDF first
    image_suffixes = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp"}
    wm_path = watermark_file
    made_temp = False
    if watermark_file.suffix.lower() in image_suffixes:
        wm_path = _image_to_pdf_watermark(watermark_file)
        made_temp = True

    try:
        wm_reader = PdfReader(str(wm_path))
        wm_page = wm_reader.pages[0]

        reader = PdfReader(str(input_path))
        writer = PdfWriter()

        for page in reader.pages:
            page.merge_page(wm_page)
            writer.add_page(page)

        out = output or resolve_output(input_path, None, "_watermarked", ".pdf")
        ensure_parent(out)
        with open(out, "wb") as fh:
            writer.write(fh)
    finally:
        # The converted watermark is only needed until the output is written;
        # remove it so repeated runs do not accumulate temp files.
        if made_temp:
            try:
                wm_path.unlink()
            except OSError:
                # Best-effort cleanup: a leftover temp file must not turn a
                # successful watermark run into a failure.
                pass

    success(f"Watermarked {len(reader.pages)} pages -> [bold]{out}[/bold]")
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _image_to_pdf_watermark(image_path: Path) -> Path:
    """Convert an image file to a temporary single-page PDF for use as a watermark.

    Args:
        image_path: Image to convert with ``img2pdf``.

    Returns:
        Path of the newly created temporary PDF. The file is not deleted
        here; the caller is responsible for removing it.
    """
    import tempfile
    try:
        import img2pdf
        from PIL import Image  # noqa: F401 - imported only to verify Pillow is installed
    except ImportError:
        abort("img2pdf and Pillow are required for image watermarks.")

    # Convert before creating the temp file so a failed conversion leaves
    # nothing behind on disk.
    pdf_bytes = img2pdf.convert(str(image_path))

    # NamedTemporaryFile creates and opens the file in one step, unlike the
    # deprecated mktemp(), which only returns a name another process could
    # claim first. delete=False keeps the file for the caller.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as fh:
        fh.write(pdf_bytes)
    return Path(fh.name)
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
# ---------------------------------------------------------------------------
|
|
260
|
+
# Encrypt
|
|
261
|
+
# ---------------------------------------------------------------------------
|
|
262
|
+
|
|
263
|
+
def encrypt(input_path: Path, password: str, output: Optional[Path] = None) -> None:
    """Copy a PDF's pages into a new file protected by ``password``.

    Args:
        input_path: PDF to protect.
        password: Password handed to pypdf's ``PdfWriter.encrypt``.
        output: Destination file; when not given it is derived with
            ``resolve_output`` using the ``_encrypted`` suffix.
    """
    try:
        from pypdf import PdfReader, PdfWriter
    except ImportError:
        abort("pypdf is required.")

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    source = PdfReader(str(input_path))
    protected = PdfWriter()
    for sheet in source.pages:
        protected.add_page(sheet)

    protected.encrypt(password)

    out = output if output else resolve_output(input_path, None, "_encrypted", ".pdf")
    ensure_parent(out)
    with open(out, "wb") as fh:
        protected.write(fh)
    success(f"Encrypted -> [bold]{out}[/bold]")
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
# ---------------------------------------------------------------------------
|
|
288
|
+
# Decrypt
|
|
289
|
+
# ---------------------------------------------------------------------------
|
|
290
|
+
|
|
291
|
+
def decrypt(input_path: Path, password: str, output: Optional[Path] = None) -> None:
    """Write an unprotected copy of a password-protected PDF.

    A file that is not encrypted is copied page by page without using the
    password.

    Args:
        input_path: PDF to unlock.
        password: Password tried when the file reports itself as encrypted.
        output: Destination file; when not given it is derived with
            ``resolve_output`` using the ``_decrypted`` suffix.
    """
    try:
        from pypdf import PdfReader, PdfWriter
    except ImportError:
        abort("pypdf is required.")

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    source = PdfReader(str(input_path))
    # decrypt() is only attempted for encrypted files; a falsy result aborts.
    if source.is_encrypted and not source.decrypt(password):
        abort("Incorrect password or unsupported encryption.")

    unlocked = PdfWriter()
    for sheet in source.pages:
        unlocked.add_page(sheet)

    out = output if output else resolve_output(input_path, None, "_decrypted", ".pdf")
    ensure_parent(out)
    with open(out, "wb") as fh:
        unlocked.write(fh)
    success(f"Decrypted -> [bold]{out}[/bold]")
|
dforge/processor.py
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DForge Image Processing Module
|
|
3
|
+
Handles: enhance, deskew, denoise, resize, and the full OCR preprocessing pipeline
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional, Tuple
|
|
10
|
+
|
|
11
|
+
from dforge.utils import abort, info, success, warn
|
|
12
|
+
from dforge.config import DEFAULT_COMPRESS_QUALITY
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# ---------------------------------------------------------------------------
|
|
16
|
+
# Internal helpers
|
|
17
|
+
# ---------------------------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
def _load_pil(path: Path):
    """Load an image using Pillow, aborting on failure.

    Args:
        path: Image file to open.

    Returns:
        The object returned by ``PIL.Image.open``.

    Aborts (via ``abort``) when Pillow is missing, the file does not exist,
    or Pillow cannot open it.
    """
    try:
        from PIL import Image
    except ImportError:
        abort("Pillow is required. Run: pip install Pillow")

    if not path.exists():
        abort(f"File not found: {path}")
    try:
        return Image.open(str(path))
    except OSError:
        # Pillow's UnidentifiedImageError is an OSError subclass, so this
        # covers both unreadable files and unrecognised formats.
        abort(f"Could not read image: {path}")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _load_cv2(path: Path):
    """Read an image with ``cv2.imread`` and abort when nothing is returned."""
    try:
        import cv2
        import numpy as np
    except ImportError:
        abort("opencv-python-headless is required. Run: pip install opencv-python-headless")

    loaded = cv2.imread(str(path))
    # imread signals failure by returning None rather than raising.
    if loaded is None:
        abort(f"Could not read image: {path}")
    return loaded
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _save_pil(img, output: Path, quality: int = DEFAULT_COMPRESS_QUALITY) -> None:
    """Save a Pillow image, applying JPEG-specific handling when needed.

    Args:
        img: Pillow image to write.
        output: Destination path; its suffix decides whether JPEG rules apply.
        quality: JPEG quality setting; ignored for other formats.
    """
    kwargs = {}
    if output.suffix.lower() in {".jpg", ".jpeg"}:
        kwargs["quality"] = quality
        # Pillow's JPEG writer rejects modes such as RGBA, LA and P, so
        # convert those to RGB first; transparency is dropped.
        if img.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
            img = img.convert("RGB")
    img.save(str(output), **kwargs)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
# Enhance
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
|
|
56
|
+
def enhance(input_path: Path, output: Optional[Path] = None) -> None:
    """
    Make an image easier to read.

    Applies auto-contrast (1% cutoff), then a 1.5x sharpness boost, then a
    1.3x contrast boost. When ``output`` is not given the result is written
    next to the input with an ``_enhanced`` suffix.
    """
    try:
        from PIL import Image, ImageEnhance, ImageOps
    except ImportError:
        abort("Pillow is required.")

    picture = _load_pil(input_path)

    # Anything that is not already RGB or greyscale is converted to RGB.
    if picture.mode not in ("RGB", "L"):
        picture = picture.convert("RGB")

    picture = ImageOps.autocontrast(picture, cutoff=1)
    picture = ImageEnhance.Sharpness(picture).enhance(1.5)
    picture = ImageEnhance.Contrast(picture).enhance(1.3)

    if output:
        out = output
    else:
        out = input_path.with_name(input_path.stem + "_enhanced" + input_path.suffix)
    _save_pil(picture, out)
    success(f"Enhanced -> [bold]{out}[/bold]")
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# ---------------------------------------------------------------------------
|
|
84
|
+
# Deskew
|
|
85
|
+
# ---------------------------------------------------------------------------
|
|
86
|
+
|
|
87
|
+
def deskew(input_path: Path, output: Optional[Path] = None) -> None:
    """
    Detect and correct the skew angle of a scanned document image.

    The angle is taken from ``cv2.minAreaRect`` over the foreground pixels
    of an inverted, Otsu-thresholded copy, and the image is rotated about
    its centre by that angle.

    Args:
        input_path: Image to straighten.
        output: Destination file; when not given the result is written next
            to the input with a ``_deskewed`` suffix.

    When no foreground pixels are found, a warning is issued and nothing is
    written. Aborts (via ``abort``) when OpenCV is missing, the image cannot
    be read, or the result cannot be written.
    """
    try:
        import cv2
        import numpy as np
    except ImportError:
        abort("opencv-python-headless is required.")

    img = _load_cv2(input_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if len(img.shape) == 3 else img
    # Invert before thresholding so non-zero pixels are the foreground.
    gray = cv2.bitwise_not(gray)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    coords = np.column_stack(np.where(thresh > 0))
    if len(coords) == 0:
        warn("Could not detect text regions for deskewing.")
        return

    # NOTE(review): minAreaRect's angle range differs between OpenCV
    # releases - confirm this "< -45" normalisation matches the installed
    # version.
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = 90 + angle

    (h, w) = img.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(img, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

    out = output or input_path.with_name(input_path.stem + "_deskewed" + input_path.suffix)
    # imwrite reports failure through its return value, not an exception;
    # ignoring it would report success with no file written.
    if not cv2.imwrite(str(out), rotated):
        abort(f"Could not write image: {out}")
    success(f"Deskewed (corrected {angle:.2f} deg) -> [bold]{out}[/bold]")
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# ---------------------------------------------------------------------------
|
|
123
|
+
# Denoise
|
|
124
|
+
# ---------------------------------------------------------------------------
|
|
125
|
+
|
|
126
|
+
def denoise(input_path: Path, output: Optional[Path] = None) -> None:
    """Remove noise from an image using OpenCV Non-Local Means.

    Args:
        input_path: Image to clean.
        output: Destination file; when not given the result is written next
            to the input with a ``_denoised`` suffix.

    Aborts (via ``abort``) when OpenCV is missing, the image cannot be read,
    or the result cannot be written.
    """
    try:
        import cv2
    except ImportError:
        abort("opencv-python-headless is required.")

    img = _load_cv2(input_path)

    # Two-dimensional arrays are single-channel; anything else goes through
    # the colour variant.
    if len(img.shape) == 2:  # grayscale
        denoised = cv2.fastNlMeansDenoising(img, h=10, templateWindowSize=7, searchWindowSize=21)
    else:
        denoised = cv2.fastNlMeansDenoisingColored(img, h=10, hColor=10, templateWindowSize=7, searchWindowSize=21)

    out = output or input_path.with_name(input_path.stem + "_denoised" + input_path.suffix)
    # imwrite reports failure through its return value, not an exception;
    # ignoring it would report success with no file written.
    if not cv2.imwrite(str(out), denoised):
        abort(f"Could not write image: {out}")
    success(f"Denoised -> [bold]{out}[/bold]")
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# ---------------------------------------------------------------------------
|
|
146
|
+
# Resize
|
|
147
|
+
# ---------------------------------------------------------------------------
|
|
148
|
+
|
|
149
|
+
def resize(
    input_path: Path,
    width: Optional[int] = None,
    height: Optional[int] = None,
    scale: Optional[float] = None,
    output: Optional[Path] = None,
) -> None:
    """
    Resize an image.

    Provide either (width, height), one of them (maintains aspect ratio), or a scale factor.
    When ``scale`` is given it takes precedence over ``width`` and ``height``.

    Args:
        input_path: Image to resize.
        width: Target width in pixels.
        height: Target height in pixels.
        scale: Multiplier applied to both dimensions.
        output: Destination file; when not given the result is written next
            to the input with a ``_<width>x<height>`` suffix.

    Aborts (via ``abort``) when Pillow is missing, no size option is given,
    or any given size option is not greater than zero.
    """
    try:
        from PIL import Image
    except ImportError:
        abort("Pillow is required.")

    if width is None and height is None and scale is None:
        abort("Provide --width, --height, or --scale.")

    # Reject zero/negative values up front; a zero width would otherwise fall
    # through to the height branch and divide None by an integer.
    for option, value in (("--width", width), ("--height", height), ("--scale", scale)):
        if value is not None and value <= 0:
            abort(f"{option} must be greater than zero.")

    img = _load_pil(input_path)
    orig_w, orig_h = img.size

    if scale is not None:
        new_w = int(orig_w * scale)
        new_h = int(orig_h * scale)
    elif width is not None and height is not None:
        new_w, new_h = width, height
    elif width is not None:
        new_w = width
        new_h = int(orig_h * (width / orig_w))
    else:
        new_h = height
        new_w = int(orig_w * (height / orig_h))

    # int() truncation can reach 0 for tiny scales or extreme aspect ratios;
    # keep every dimension at least one pixel.
    new_w = max(1, new_w)
    new_h = max(1, new_h)

    resized = img.resize((new_w, new_h), Image.LANCZOS)
    out = output or input_path.with_name(input_path.stem + f"_{new_w}x{new_h}" + input_path.suffix)
    _save_pil(resized, out)
    success(f"Resized {orig_w}x{orig_h} -> {new_w}x{new_h} -> [bold]{out}[/bold]")
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# ---------------------------------------------------------------------------
|
|
191
|
+
# OCR Preprocessing Pipeline
|
|
192
|
+
# ---------------------------------------------------------------------------
|
|
193
|
+
|
|
194
|
+
def preprocess_for_ocr(input_path: Path, output: Optional[Path] = None) -> Path:
    """
    Full OCR preprocessing pipeline:
    1. Auto orientation detection (deskew of the greyscale image)
    2. Contrast enhancement
    3. Noise removal
    4. Threshold binarization

    Args:
        input_path: Image to prepare.
        output: Destination file; when not given the result is written next
            to the input as ``<stem>_preprocessed.png``.

    Returns path to the preprocessed image.

    Aborts (via ``abort``) when OpenCV or Pillow is missing, the input is
    absent or unreadable, or the result cannot be written.
    """
    try:
        import cv2
        import numpy as np
        from PIL import Image, ImageEnhance, ImageOps
    except ImportError:
        abort("opencv-python-headless and Pillow are required.")

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    info("Step 1/4: Orientation detection...")
    img_cv = _load_cv2(input_path)
    gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY) if len(img_cv.shape) == 3 else img_cv

    # Deskew
    inv = cv2.bitwise_not(gray)
    thresh = cv2.threshold(inv, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    coords = cv2.findNonZero(thresh)
    # findNonZero returns None when there are no foreground pixels, in which
    # case the rotation step is skipped.
    if coords is not None:
        # NOTE(review): minAreaRect's angle range differs between OpenCV
        # releases - confirm this "< -45" normalisation matches the installed
        # version.
        angle = cv2.minAreaRect(coords)[-1]
        if angle < -45:
            angle = 90 + angle
        (h, w) = gray.shape[:2]
        M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
        gray = cv2.warpAffine(gray, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

    info("Step 2/4: Contrast enhancement...")
    pil_img = Image.fromarray(gray)
    pil_img = ImageOps.autocontrast(pil_img, cutoff=2)
    enhancer = ImageEnhance.Contrast(pil_img)
    pil_img = enhancer.enhance(1.4)
    # Back to a NumPy array for the remaining OpenCV steps.
    gray = np.array(pil_img)

    info("Step 3/4: Noise removal...")
    gray = cv2.fastNlMeansDenoising(gray, h=10, templateWindowSize=7, searchWindowSize=21)

    info("Step 4/4: Threshold binarization...")
    binary = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        blockSize=15,
        C=4,
    )

    out = output or input_path.with_name(input_path.stem + "_preprocessed.png")
    # imwrite reports failure through its return value, not an exception;
    # ignoring it would hand callers a path to a file that was never written.
    if not cv2.imwrite(str(out), binary):
        abort(f"Could not write image: {out}")
    success(f"Preprocessing complete -> [bold]{out}[/bold]")
    return out
|