dforge-cli 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dforge/__init__.py +1 -0
- dforge/banner.py +31 -0
- dforge/batch.py +156 -0
- dforge/cli.py +525 -0
- dforge/config.py +38 -0
- dforge/config_manager.py +33 -0
- dforge/converter.py +167 -0
- dforge/dependencies.py +98 -0
- dforge/engine.py +236 -0
- dforge/extractor.py +201 -0
- dforge/loading.py +19 -0
- dforge/menu.py +115 -0
- dforge/operations.py +314 -0
- dforge/processor.py +251 -0
- dforge/setup.py +107 -0
- dforge/theme.py +12 -0
- dforge/utils.py +169 -0
- dforge/watcher.py +137 -0
- dforge/workflows/__init__.py +0 -0
- dforge/workflows/automation.py +21 -0
- dforge/workflows/batch.py +18 -0
- dforge/workflows/batch_ocr.py +61 -0
- dforge/workflows/common.py +133 -0
- dforge/workflows/compress.py +73 -0
- dforge/workflows/convert.py +148 -0
- dforge/workflows/decrypt.py +50 -0
- dforge/workflows/encrypt.py +50 -0
- dforge/workflows/extract.py +18 -0
- dforge/workflows/image.py +21 -0
- dforge/workflows/merge.py +109 -0
- dforge/workflows/ocr.py +104 -0
- dforge/workflows/ocr_folder.py +0 -0
- dforge/workflows/pages.py +57 -0
- dforge/workflows/rotate.py +53 -0
- dforge/workflows/searchable.py +51 -0
- dforge/workflows/settings.py +56 -0
- dforge/workflows/split.py +32 -0
- dforge/workflows/tables.py +45 -0
- dforge/workflows/watermark.py +54 -0
- dforge_cli-1.0.1.dist-info/METADATA +244 -0
- dforge_cli-1.0.1.dist-info/RECORD +44 -0
- dforge_cli-1.0.1.dist-info/WHEEL +5 -0
- dforge_cli-1.0.1.dist-info/entry_points.txt +2 -0
- dforge_cli-1.0.1.dist-info/top_level.txt +1 -0
dforge/converter.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DForge Conversion Module
|
|
3
|
+
Handles: document format conversion (docx, pdf, md, html, txt), img2pdf, pdf2img
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import subprocess
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import List, Optional
|
|
11
|
+
|
|
12
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn
|
|
13
|
+
from dforge.config_manager import get_tool_path
|
|
14
|
+
from dforge.utils import (
|
|
15
|
+
abort, console, ensure_parent, info, require_pandoc, success, warn,
|
|
16
|
+
)
|
|
17
|
+
from dforge.config import DEFAULT_IMAGE_DPI, SUPPORTED_IMAGE_EXTS
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Lookup table from user-facing format names / file extensions (lower-case,
# without the leading dot) to the format identifiers passed to Pandoc.
PANDOC_FORMAT_MAP = dict(
    pdf="pdf",
    docx="docx",
    md="markdown",
    markdown="markdown",
    html="html",
    txt="plain",
    text="plain",
    rst="rst",
    odt="odt",
    epub="epub",
)


def _ext_to_pandoc_format(ext: str) -> str:
    """Translate a file extension (or format alias) into a Pandoc format name.

    Leading dots are removed and the remainder is lower-cased before the
    lookup. Names that are not in ``PANDOC_FORMAT_MAP`` are returned in that
    normalised form unchanged.
    """
    key = ext.lstrip(".").lower()
    try:
        return PANDOC_FORMAT_MAP[key]
    except KeyError:
        return key
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
# Universal convert
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
|
|
44
|
+
def convert(
    input_path: Path,
    target_format: str,
    output: Optional[Path] = None,
) -> None:
    """
    Convert a document to the target format using Pandoc.

    Supports: pdf, docx, md, html, txt, rst, odt, epub

    Args:
        input_path: Document to convert; reported through ``abort`` if missing.
        target_format: Target extension or alias, with or without a leading dot.
        output: Destination file. Defaults to ``input_path`` with the target
            extension.

    Pandoc failures (non-zero exit status) are reported through ``abort``
    together with Pandoc's stderr.
    NOTE(review): this function assumes ``abort`` does not return - confirm
    in dforge.utils.
    """
    require_pandoc()

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    target_ext = f".{target_format.lstrip('.')}"
    out = output or input_path.with_suffix(target_ext)
    ensure_parent(out)

    pandoc_to = _ext_to_pandoc_format(target_format)
    pandoc_from = _ext_to_pandoc_format(input_path.suffix)

    # "plain" is only a Pandoc *output* format; passing it to -f makes every
    # .txt conversion fail. Markdown is what Pandoc itself assumes for text
    # files, so use that reader instead.
    if pandoc_from == "plain":
        pandoc_from = "markdown"

    cmd = ["pandoc", str(input_path)]
    # A file without a suffix yields an empty reader name; leave -f out in
    # that case so Pandoc applies its own default instead of rejecting "".
    if pandoc_from:
        cmd += ["-f", pandoc_from]
    cmd += ["-t", pandoc_to, "-o", str(out)]

    # PDF requires a PDF engine: prefer the configured xelatex path and fall
    # back to whatever "xelatex" resolves to on PATH.
    if pandoc_to == "pdf":
        xelatex = get_tool_path("xelatex")
        cmd.append(f"--pdf-engine={xelatex}" if xelatex else "--pdf-engine=xelatex")

    info(f"Converting [bold]{input_path.name}[/bold] -> [bold]{pandoc_to.upper()}[/bold]...")
    # Argument list (no shell), so file names are never interpreted by a shell.
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        abort(f"Pandoc error:\n{result.stderr}")

    success(f"Converted -> [bold]{out}[/bold]")
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# ---------------------------------------------------------------------------
|
|
86
|
+
# Images -> PDF
|
|
87
|
+
# ---------------------------------------------------------------------------
|
|
88
|
+
|
|
89
|
+
def images_to_pdf(
    source: Path,
    output: Optional[Path] = None,
    sort_files: bool = True,
) -> None:
    """
    Combine images from a directory (or a single image) into a PDF.

    source can be a directory of images or a single image file.

    Args:
        source: Directory that is scanned (non-recursively) for files whose
            suffix is in ``SUPPORTED_IMAGE_EXTS``, or a single image file.
        output: Destination PDF. Defaults to ``<dir>.pdf`` next to the
            directory, or the image path with a ``.pdf`` suffix.
        sort_files: Sort the directory's images by path before combining.

    Problems (missing img2pdf, no images, unsupported suffix, missing path)
    are reported through ``abort``.
    NOTE(review): this function assumes ``abort`` does not return - confirm
    in dforge.utils.
    """
    try:
        import img2pdf
    except ImportError:
        abort("img2pdf is required. Run: pip install img2pdf")

    images: List[Path] = []

    if source.is_dir():
        # Compare lower-cased suffixes, exactly like the single-file branch
        # below, so "SCAN.JPG" is picked up on case-sensitive filesystems too
        # (a "*.jpg" glob would silently skip it).
        images = [
            entry
            for entry in source.iterdir()
            if entry.is_file() and entry.suffix.lower() in SUPPORTED_IMAGE_EXTS
        ]
        if sort_files:
            images.sort()
        if not images:
            abort(f"No image files found in {source}")
        out = output or source.parent / (source.name + ".pdf")
    elif source.is_file():
        if source.suffix.lower() not in SUPPORTED_IMAGE_EXTS:
            abort(f"Not a supported image format: {source.suffix}")
        images = [source]
        out = output or source.with_suffix(".pdf")
    else:
        abort(f"Path not found: {source}")

    ensure_parent(out)

    info(f"Combining {len(images)} image(s) into PDF...")
    # Build the PDF in memory first: if img2pdf rejects an image, no empty or
    # truncated output file is left behind.
    pdf_bytes = img2pdf.convert([str(img) for img in images])
    with open(out, "wb") as fh:
        fh.write(pdf_bytes)

    success(f"{len(images)} image(s) -> [bold]{out}[/bold]")
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# ---------------------------------------------------------------------------
|
|
132
|
+
# PDF -> Images
|
|
133
|
+
# ---------------------------------------------------------------------------
|
|
134
|
+
|
|
135
|
+
def pdf_to_images(
    input_path: Path,
    output_dir: Optional[Path] = None,
    dpi: int = DEFAULT_IMAGE_DPI,
    fmt: str = "png",
) -> None:
    """
    Convert each page of a PDF to an image file.

    fmt: png, jpeg, tiff (the short forms "jpg" and "tif" and a leading dot
    are accepted as well)

    Args:
        input_path: PDF to render; reported through ``abort`` if missing.
        output_dir: Directory for the page images. Defaults to
            ``<stem>_images`` next to the PDF; created when absent.
        dpi: Rendering resolution handed to pdf2image.
        fmt: Image format; also used as the file extension.

    NOTE(review): this function assumes ``abort`` does not return - confirm
    in dforge.utils.
    """
    try:
        from pdf2image import convert_from_path
    except ImportError:
        abort("pdf2image is required. Run: pip install pdf2image")

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    dest = output_dir or input_path.parent / (input_path.stem + "_images")
    dest.mkdir(parents=True, exist_ok=True)

    # Drop a leading dot so ".png" does not produce "name..png".
    ext = fmt.lstrip(".")
    fmt_key = ext.lower()
    # Pillow's registered format names are "JPEG" and "TIFF"; upper-casing the
    # extensions "jpg"/"tif" gives "JPG"/"TIF", which Image.save() rejects.
    pil_format = {"jpg": "JPEG", "tif": "TIFF"}.get(fmt_key, fmt_key.upper())

    info(f"Converting PDF pages to {ext.upper()} images (DPI={dpi})...")
    pages = convert_from_path(str(input_path), dpi=dpi, fmt=fmt_key)

    with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
        task = progress.add_task(f"Exporting {len(pages)} pages...", total=len(pages))
        for i, page in enumerate(pages, start=1):
            out_file = dest / f"{input_path.stem}_page_{i:04d}.{ext}"
            page.save(str(out_file), pil_format)
            progress.advance(task)

    success(f"Exported {len(pages)} page(s) -> [bold]{dest}/[/bold]")
|
dforge/dependencies.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from rich.console import Console
|
|
4
|
+
from rich.table import Table
|
|
5
|
+
|
|
6
|
+
from dforge.config_manager import get_tool_path
|
|
7
|
+
|
|
8
|
+
console = Console()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _tool_installed(tool: str) -> bool:
    """Return True when a non-empty path is configured for *tool* and exists on disk."""
    configured = get_tool_path(tool)
    # An empty string must count as "not configured": Path("") is the current
    # directory, which always exists and would report a missing tool as found.
    return bool(configured) and Path(configured).exists()


def has_poppler() -> bool:
    """Report whether the configured Poppler path exists."""
    return _tool_installed("poppler")


def has_tesseract() -> bool:
    """Report whether the configured Tesseract path exists."""
    return _tool_installed("tesseract")


def has_pandoc() -> bool:
    """Report whether the configured Pandoc path exists."""
    return _tool_installed("pandoc")


def has_ghostscript() -> bool:
    """Report whether the configured Ghostscript path exists."""
    return _tool_installed("ghostscript")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def doctor():
    """Print a "DForge System Check" table with one row per has_* probe."""
    probes = (
        ("Poppler", has_poppler),
        ("Tesseract", has_tesseract),
        ("Ghostscript", has_ghostscript),
        ("Pandoc", has_pandoc),
    )
    # Evaluate every probe before any row is added to the table.
    results = [(label, probe()) for label, probe in probes]

    report = Table(title="DForge System Check")
    for heading in ("Dependency", "Status"):
        report.add_column(heading)

    for label, installed in results:
        if installed:
            verdict = "✓ Installed"
        else:
            verdict = "✗ Missing"
        report.add_row(label, verdict)

    console.print(report)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _require_tool(available: bool, headline: str) -> bool:
    """Return *available*; when it is False, first print *headline* and the setup hint."""
    if available:
        return True

    console.print(
        f"\n[red]{headline}[/red]\n"
        "Run:\n"
        "[cyan]dforge setup[/cyan]\n"
    )
    return False


def check_poppler() -> bool:
    """Return True if Poppler is available; otherwise print setup guidance and return False."""
    return _require_tool(has_poppler(), "Poppler is required for PDF OCR.")


def check_tesseract() -> bool:
    """Return True if Tesseract is available; otherwise print setup guidance and return False."""
    return _require_tool(has_tesseract(), "Tesseract OCR is not installed.")


def check_ghostscript() -> bool:
    """Return True if Ghostscript is available; otherwise print setup guidance and return False."""
    return _require_tool(has_ghostscript(), "Ghostscript is not installed.")


def check_pandoc() -> bool:
    """Return True if Pandoc is available; otherwise print setup guidance and return False."""
    return _require_tool(has_pandoc(), "Pandoc is not installed.")
|
dforge/engine.py
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DForge OCR Module
|
|
3
|
+
Handles: image OCR, PDF OCR, searchable PDF generation, batch OCR
|
|
4
|
+
Output formats: TXT, JSON, Markdown
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import tempfile
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import List, Optional
|
|
13
|
+
|
|
14
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
|
|
15
|
+
|
|
16
|
+
from dforge.utils import (
|
|
17
|
+
abort, console, ensure_parent, info, require_tesseract, success, warn,
|
|
18
|
+
)
|
|
19
|
+
from dforge.config import DEFAULT_OCR_LANG, DEFAULT_OCR_DPI, SUPPORTED_IMAGE_EXTS
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
# Core OCR helpers
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
def _run_tesseract(image_path: Path, lang: str) -> str:
    """OCR one image with Tesseract and return the text, stripped of surrounding whitespace.

    ``require_tesseract`` is called first; a missing ``pytesseract`` package is
    reported through ``abort``.
    """
    require_tesseract()

    try:
        import pytesseract
    except ImportError:
        abort("pytesseract is required. Run: pip install pytesseract")

    recognised = pytesseract.image_to_string(str(image_path), lang=lang)
    return recognised.strip()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _pdf_to_images(pdf_path: Path, dpi: int = DEFAULT_OCR_DPI) -> List[Path]:
    """Convert PDF pages to temporary image files for OCR.

    Args:
        pdf_path: PDF whose pages are rendered.
        dpi: Rendering resolution handed to pdf2image.

    Returns:
        Paths of the PNG files (as listed by pdf2image), all located in one
        freshly created temporary directory. The caller is responsible for
        deleting them.
    """
    try:
        from pdf2image import convert_from_path
    except ImportError:
        abort("pdf2image is required. Run: pip install pdf2image")

    tmp_dir = Path(tempfile.mkdtemp())
    # pdf2image already writes one PNG per page into output_folder. Asking for
    # the paths (paths_only=True) avoids loading every page as an image and
    # saving a second copy, which doubled the disk usage and left the first
    # set of files behind after OCR cleanup.
    rendered = convert_from_path(
        str(pdf_path),
        dpi=dpi,
        output_folder=str(tmp_dir),
        fmt="png",
        paths_only=True,
    )
    return [Path(p) for p in rendered]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _write_output(text: str, output_path: Path, fmt: str, source_name: str = "") -> None:
    """Serialise OCR text to *output_path* as JSON, Markdown or plain text.

    ``fmt == "json"`` writes an object with the source name, the text and its
    word/character counts; ``fmt == "md"`` prefixes a heading naming the
    source; any other value writes the text unchanged. Output is UTF-8.
    """
    ensure_parent(output_path)

    if fmt == "json":
        payload = json.dumps(
            {
                "source": source_name,
                "text": text,
                "word_count": len(text.split()),
                "char_count": len(text),
            },
            indent=2,
            ensure_ascii=False,
        )
    elif fmt == "md":
        payload = f"# OCR Output: {source_name}\n\n{text}\n"
    else:  # txt
        payload = text

    output_path.write_text(payload, encoding="utf-8")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
# Public API
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
def ocr_image(
    input_path: Path,
    output: Optional[Path] = None,
    lang: str = DEFAULT_OCR_LANG,
    fmt: str = "txt",
) -> str:
    """OCR a single image, write the result to disk and return the text.

    The result goes to *output* when given, otherwise next to the image with
    the extension matching *fmt* (``.txt`` for unrecognised formats).
    """
    if not input_path.exists():
        abort(f"File not found: {input_path}")

    info(f"Running OCR on [bold]{input_path.name}[/bold] (lang: {lang})...")
    recognised = _run_tesseract(input_path, lang)

    suffix = {"txt": ".txt", "json": ".json", "md": ".md"}.get(fmt, ".txt")
    destination = output if output else input_path.with_suffix(suffix)

    _write_output(recognised, destination, fmt, input_path.name)
    word_total = len(recognised.split())
    success(f"OCR complete -> [bold]{destination}[/bold] ({word_total} words)")
    return recognised
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def ocr_pdf(
    input_path: Path,
    output: Optional[Path] = None,
    lang: str = DEFAULT_OCR_LANG,
    fmt: str = "txt",
    dpi: int = DEFAULT_OCR_DPI,
) -> str:
    """Run OCR on all pages of a PDF.

    Each page is rendered to a temporary image, passed through Tesseract, and
    the per-page texts are joined with a ``---`` separator line.

    Args:
        input_path: PDF to process; reported through ``abort`` if missing.
        output: Destination file. Defaults to the PDF path with the extension
            matching *fmt* (``.txt`` for unrecognised formats).
        lang: Tesseract language code(s).
        fmt: Output format: ``txt``, ``json`` or ``md``.
        dpi: Resolution used when rendering the pages.

    Returns:
        The combined text of all pages.
    """
    import shutil

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    info(f"Converting PDF pages to images (DPI={dpi})...")
    images = _pdf_to_images(input_path, dpi)
    # _pdf_to_images renders into a directory it creates with mkdtemp(), so
    # the parent(s) of the returned files are scratch space owned by this call.
    scratch_dirs = {img.parent for img in images}
    all_text_parts: List[str] = []

    try:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TextColumn("{task.completed}/{task.total} pages"),
            console=console,
        ) as progress:
            task = progress.add_task("Running OCR...", total=len(images))
            for img in images:
                all_text_parts.append(_run_tesseract(img, lang))
                progress.advance(task)
    finally:
        # Remove the scratch directory and everything in it, even when OCR
        # fails part-way; unlinking files one by one left the directory (and
        # any page not yet processed) behind.
        for scratch in scratch_dirs:
            shutil.rmtree(scratch, ignore_errors=True)

    full_text = "\n\n---\n\n".join(all_text_parts)

    ext_map = {"txt": ".txt", "json": ".json", "md": ".md"}
    ext = ext_map.get(fmt, ".txt")
    out = output or input_path.with_suffix(ext)
    _write_output(full_text, out, fmt, input_path.name)
    success(f"OCR complete ({len(images)} pages) -> [bold]{out}[/bold]")
    return full_text
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def make_searchable_pdf(
    input_path: Path,
    output: Optional[Path] = None,
    lang: str = DEFAULT_OCR_LANG,
    dpi: int = DEFAULT_OCR_DPI,
) -> None:
    """Create a searchable PDF from a scanned PDF using OCR.

    Every page is rendered to an image, Tesseract produces a one-page PDF for
    it, and the resulting pages are merged into a single output file.

    Args:
        input_path: Scanned PDF; reported through ``abort`` if missing.
        output: Destination PDF. Defaults to ``<stem>_searchable.pdf`` next to
            the input.
        lang: Tesseract language code(s).
        dpi: Resolution used when rendering the pages.
    """
    import io

    require_tesseract()
    try:
        import pytesseract
        from pdf2image import convert_from_path
        from PIL import Image  # noqa: F401 - imported only to verify Pillow is installed
    except ImportError:
        abort("pytesseract, pdf2image and Pillow are required.")

    # Check for pypdf before any rendering/OCR happens, so a missing package
    # is reported immediately instead of after all pages were processed.
    try:
        from pypdf import PdfWriter, PdfReader
    except ImportError:
        abort("pypdf is required.")

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    out = output or input_path.with_name(input_path.stem + "_searchable.pdf")
    ensure_parent(out)

    info(f"Converting PDF to images (DPI={dpi})...")
    pages = convert_from_path(str(input_path), dpi=dpi)

    writer = PdfWriter()
    with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
        task = progress.add_task(f"Processing {len(pages)} pages...", total=len(pages))
        for page_img in pages:
            # pytesseract can produce a searchable PDF per-page; its pages are
            # appended to the writer right away rather than buffered in a list.
            pdf_bytes = pytesseract.image_to_pdf_or_hocr(page_img, extension="pdf", lang=lang)
            for page in PdfReader(io.BytesIO(pdf_bytes)).pages:
                writer.add_page(page)
            progress.advance(task)

    with open(out, "wb") as fh:
        writer.write(fh)

    success(f"Searchable PDF created -> [bold]{out}[/bold]")
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def batch_ocr(
    directory: Path,
    lang: str = DEFAULT_OCR_LANG,
    fmt: str = "txt",
    recursive: bool = True,
) -> None:
    """Run OCR on all supported image/PDF files in a directory.

    Files are processed one by one; a failure in one file is recorded and
    reported at the end instead of stopping the batch.

    Args:
        directory: Folder to scan; reported through ``abort`` if missing.
        lang: Tesseract language code(s).
        fmt: Output format passed to ``ocr_pdf`` / ``ocr_image``.
        recursive: Passed to ``collect_files`` to include sub-directories.
    """
    from dforge.utils import collect_files

    if not directory.exists():
        abort(f"Directory not found: {directory}")

    all_exts = SUPPORTED_IMAGE_EXTS | {".pdf"}
    files = collect_files(directory, all_exts, recursive=recursive)

    if not files:
        warn(f"No supported files found in {directory}")
        return

    total = len(files)
    info(f"Found {total} files to process...")
    errors = []

    # No live Progress around this loop: ocr_pdf() opens its own Progress on
    # the same console, and rich versions that permit only one live display
    # raise LiveError for the nested one, which made every PDF in a batch
    # fail. Plain per-file status lines avoid the conflict.
    for index, f in enumerate(files, start=1):
        info(f"Batch OCR {index}/{total}: [bold]{f.name}[/bold]")
        try:
            if f.suffix.lower() == ".pdf":
                ocr_pdf(f, lang=lang, fmt=fmt)
            else:
                ocr_image(f, lang=lang, fmt=fmt)
        except Exception as exc:
            # Deliberate best-effort: remember the failure and keep going.
            errors.append((f, str(exc)))

    if errors:
        warn(f"{len(errors)} file(s) failed:")
        for f, err in errors:
            console.print(f"  [red]{f.name}[/red]: {err}")

    success(f"Batch OCR complete. Processed {total - len(errors)}/{total} files.")
|
dforge/extractor.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DForge Extraction Module
|
|
3
|
+
Handles: text, images, metadata, and table extraction from PDFs
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Optional
|
|
11
|
+
|
|
12
|
+
from dforge.utils import abort, console, ensure_parent, info, success, warn
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# ---------------------------------------------------------------------------
|
|
16
|
+
# Extract Text
|
|
17
|
+
# ---------------------------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
def extract_text(input_path: Path, output: Optional[Path] = None) -> str:
    """Pull the text of every page out of a PDF, save it as UTF-8 and return it.

    Each page's text is preceded by a ``--- Page N ---`` marker line. The
    result is written to *output*, or next to the PDF with a ``.txt`` suffix.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        abort("pypdf is required.")

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    reader = PdfReader(str(input_path))
    # Pages for which extract_text() yields nothing contribute an empty body.
    sections = [
        f"--- Page {number} ---\n{page.extract_text() or ''}"
        for number, page in enumerate(reader.pages, start=1)
    ]
    full_text = "\n\n".join(sections)

    destination = output if output else input_path.with_suffix(".txt")
    ensure_parent(destination)
    destination.write_text(full_text, encoding="utf-8")
    success(f"Text extracted ({len(reader.pages)} pages) -> [bold]{destination}[/bold]")
    return full_text
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
# Extract Images
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
def extract_images(input_path: Path, output_dir: Optional[Path] = None) -> None:
    """Extract all embedded images from a PDF.

    Args:
        input_path: PDF to read; reported through ``abort`` if missing.
        output_dir: Directory for the images. Defaults to ``<stem>_images``
            next to the PDF; created when absent.

    Files are named ``page<NNN>_<image name>``, where the image name
    (including its extension) is the one pypdf reports for the image.
    NOTE(review): pypdf's ``page.images`` needs Pillow and may raise for
    image encodings it cannot handle - confirm against the pinned version.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        abort("pypdf is required.")

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    dest = output_dir or input_path.parent / (input_path.stem + "_images")
    dest.mkdir(parents=True, exist_ok=True)

    reader = PdfReader(str(input_path))
    count = 0

    for page_num, page in enumerate(reader.pages, start=1):
        # Use pypdf's image API instead of dumping the decoded XObject stream:
        # for Flate-compressed images the decoded stream is raw pixel data,
        # so writing it under a ".png" name produced unreadable files.
        for image in page.images:
            # Keep the name a plain file name even if it contains separators.
            safe_name = image.name.replace("/", "_").replace("\\", "_")
            target = dest / f"page{page_num:03d}_{safe_name}"
            target.write_bytes(image.data)
            count += 1

    if count == 0:
        warn("No extractable images found in this PDF.")
    else:
        success(f"Extracted {count} image(s) -> [bold]{dest}/[/bold]")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# ---------------------------------------------------------------------------
|
|
94
|
+
# Extract Metadata
|
|
95
|
+
# ---------------------------------------------------------------------------
|
|
96
|
+
|
|
97
|
+
def extract_metadata(input_path: Path, output: Optional[Path] = None) -> dict:
    """Extract PDF metadata and optionally save to JSON.

    Args:
        input_path: PDF to inspect; reported through ``abort`` if missing.
        output: When given, the metadata is also written there as UTF-8 JSON.

    Returns:
        A dict of string values: the document's metadata entries with the
        leading ``/`` removed from each key, plus ``PageCount`` and
        ``Encrypted``.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        abort("pypdf is required.")

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    reader = PdfReader(str(input_path))
    meta_raw = reader.metadata or {}

    # Clean metadata keys (strip leading '/')
    meta = {k.lstrip("/"): str(v) for k, v in meta_raw.items()}
    meta["PageCount"] = str(len(reader.pages))
    meta["Encrypted"] = str(reader.is_encrypted)

    # Print to console
    console.print("\n[bold cyan]PDF Metadata[/bold cyan]")
    for k, v in meta.items():
        console.print(f"  [dim]{k}:[/dim] {v}")

    if output:
        ensure_parent(output)
        # ensure_ascii=False keeps non-ASCII titles/authors readable in the
        # UTF-8 file, matching the other JSON writers in this package.
        output.write_text(json.dumps(meta, indent=2, ensure_ascii=False), encoding="utf-8")
        success(f"Metadata saved -> [bold]{output}[/bold]")

    return meta
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# ---------------------------------------------------------------------------
|
|
129
|
+
# Extract Tables
|
|
130
|
+
# ---------------------------------------------------------------------------
|
|
131
|
+
|
|
132
|
+
def extract_tables(
    input_path: Path,
    output: Optional[Path] = None,
    fmt: str = "csv",
) -> None:
    """
    Detect tables in a PDF with pdfplumber and export them.

    fmt: csv | xlsx | json

    * json - one file holding every table with its page and index
    * xlsx - one workbook, one sheet per table (needs pandas + openpyxl)
    * anything else - one CSV file per table inside a directory (needs pandas);
      here *output* is treated as that directory
    """
    try:
        import pdfplumber
    except ImportError:
        abort("pdfplumber is required. Run: pip install pdfplumber")

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    info(f"Scanning {input_path.name} for tables...")

    with pdfplumber.open(str(input_path)) as pdf:
        all_tables = [
            {"page": page_num, "table_index": t_idx, "data": table}
            for page_num, page in enumerate(pdf.pages, start=1)
            for t_idx, table in enumerate(page.extract_tables())
        ]

    if not all_tables:
        warn("No tables found in this PDF.")
        return

    total_tables = len(all_tables)
    info(f"Found {total_tables} table(s) across the document.")

    if fmt == "json":
        json_path = output or input_path.with_suffix(".tables.json")
        ensure_parent(json_path)
        serialised = json.dumps(all_tables, indent=2, ensure_ascii=False)
        json_path.write_text(serialised, encoding="utf-8")
        success(f"Tables (JSON) -> [bold]{json_path}[/bold]")
        return

    if fmt == "xlsx":
        try:
            import pandas as pd
        except ImportError:
            abort("pandas is required for XLSX export. Run: pip install pandas openpyxl")
        workbook_path = output or input_path.with_suffix(".tables.xlsx")
        ensure_parent(workbook_path)
        with pd.ExcelWriter(str(workbook_path), engine="openpyxl") as writer:
            for entry in all_tables:
                # The sheet title is cut to 31 characters.
                title = f"P{entry['page']}_T{entry['table_index'] + 1}"[:31]
                frame = pd.DataFrame(entry["data"])
                frame.to_excel(writer, sheet_name=title, index=False, header=False)
        success(f"Tables (XLSX, {total_tables} sheets) -> [bold]{workbook_path}[/bold]")
        return

    # csv (also the fallback for any unrecognised fmt)
    try:
        import pandas as pd
    except ImportError:
        abort("pandas is required for CSV export.")
    csv_dir = output or input_path.parent / (input_path.stem + "_tables")
    csv_dir.mkdir(parents=True, exist_ok=True)
    for entry in all_tables:
        file_name = f"page{entry['page']:03d}_table{entry['table_index'] + 1}.csv"
        frame = pd.DataFrame(entry["data"])
        frame.to_csv(str(csv_dir / file_name), index=False, header=False)
    success(f"Tables (CSV, {total_tables} files) -> [bold]{csv_dir}/[/bold]")
|