dforge-cli 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dforge/converter.py ADDED
@@ -0,0 +1,167 @@
1
+ """
2
+ DForge Conversion Module
3
+ Handles: document format conversion (docx, pdf, md, html, txt), img2pdf, pdf2img
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import subprocess
9
+ from pathlib import Path
10
+ from typing import List, Optional
11
+
12
+ from rich.progress import Progress, SpinnerColumn, TextColumn
13
+ from dforge.config_manager import get_tool_path
14
+ from dforge.utils import (
15
+ abort, console, ensure_parent, info, require_pandoc, success, warn,
16
+ )
17
+ from dforge.config import DEFAULT_IMAGE_DPI, SUPPORTED_IMAGE_EXTS
18
+
19
+
20
+ PANDOC_FORMAT_MAP = {
21
+ "pdf": "pdf",
22
+ "docx": "docx",
23
+ "md": "markdown",
24
+ "markdown": "markdown",
25
+ "html": "html",
26
+ "txt": "plain",
27
+ "text": "plain",
28
+ "rst": "rst",
29
+ "odt": "odt",
30
+ "epub": "epub",
31
+ }
32
+
33
+
34
+ def _ext_to_pandoc_format(ext: str) -> str:
35
+ """Convert a file extension to a Pandoc format name."""
36
+ cleaned = ext.lstrip(".").lower()
37
+ return PANDOC_FORMAT_MAP.get(cleaned, cleaned)
38
+
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Universal convert
42
+ # ---------------------------------------------------------------------------
43
+
44
def convert(
    input_path: Path,
    target_format: str,
    output: Optional[Path] = None,
) -> None:
    """
    Convert a document to the target format using Pandoc.

    Supports: pdf, docx, md, html, txt, rst, odt, epub

    Args:
        input_path: Document to convert; reported through ``abort`` if missing.
        target_format: Target extension or alias, with or without a leading dot.
        output: Destination file. Defaults to ``input_path`` with its suffix
            replaced by ``target_format``.

    A non-zero Pandoc exit status is reported through ``abort`` together with
    Pandoc's stderr.
    """
    require_pandoc()

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    target_ext = f".{target_format.lstrip('.')}"
    out = output or input_path.with_suffix(target_ext)
    ensure_parent(out)

    pandoc_to = _ext_to_pandoc_format(target_format)
    pandoc_from = _ext_to_pandoc_format(input_path.suffix)

    # "plain" exists only as a Pandoc *writer*. Plain-text input has to be
    # read as markdown, which is also what Pandoc assumes for .txt files.
    if pandoc_from == "plain":
        pandoc_from = "markdown"

    cmd = ["pandoc", str(input_path)]
    # A file without a suffix gives an empty format name; omit -f in that
    # case so Pandoc applies its own default instead of rejecting `-f ""`.
    if pandoc_from:
        cmd += ["-f", pandoc_from]
    cmd += ["-t", pandoc_to, "-o", str(out)]

    # PDF requires a PDF engine: use the configured xelatex path when there
    # is one, otherwise rely on `xelatex` being resolvable by Pandoc.
    if pandoc_to == "pdf":
        xelatex = get_tool_path("xelatex")
        cmd.append(f"--pdf-engine={xelatex or 'xelatex'}")

    info(f"Converting [bold]{input_path.name}[/bold] -> [bold]{pandoc_to.upper()}[/bold]...")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        abort(f"Pandoc error:\n{result.stderr}")

    success(f"Converted -> [bold]{out}[/bold]")
83
+
84
+
85
+ # ---------------------------------------------------------------------------
86
+ # Images -> PDF
87
+ # ---------------------------------------------------------------------------
88
+
89
def images_to_pdf(
    source: Path,
    output: Optional[Path] = None,
    sort_files: bool = True,
) -> None:
    """
    Combine images from a directory (or a single image) into a PDF.

    source can be a directory of images or a single image file.

    Args:
        source: Directory whose direct children are scanned for images, or a
            single image file.
        output: Destination PDF. Defaults to ``<dir name>.pdf`` next to the
            directory, or to the image path with a ``.pdf`` suffix.
        sort_files: Sort directory entries by path before combining them.
    """
    try:
        import img2pdf
    except ImportError:
        abort("img2pdf is required. Run: pip install img2pdf")

    images: List[Path] = []

    if source.is_dir():
        # Compare the lower-cased suffix, exactly like the single-file branch
        # below, so files such as IMG_0001.JPG are not silently skipped on
        # case-sensitive filesystems. One directory pass also means a file
        # can never be collected twice.
        images = [
            entry
            for entry in source.iterdir()
            if entry.is_file() and entry.suffix.lower() in SUPPORTED_IMAGE_EXTS
        ]
        if sort_files:
            images.sort()
        if not images:
            abort(f"No image files found in {source}")
        out = output or source.parent / (source.name + ".pdf")
    elif source.is_file():
        if source.suffix.lower() not in SUPPORTED_IMAGE_EXTS:
            abort(f"Not a supported image format: {source.suffix}")
        images = [source]
        out = output or source.with_suffix(".pdf")
    else:
        abort(f"Path not found: {source}")

    ensure_parent(out)

    info(f"Combining {len(images)} image(s) into PDF...")
    with open(out, "wb") as fh:
        fh.write(img2pdf.convert([str(img) for img in images]))

    success(f"{len(images)} image(s) -> [bold]{out}[/bold]")
129
+
130
+
131
+ # ---------------------------------------------------------------------------
132
+ # PDF -> Images
133
+ # ---------------------------------------------------------------------------
134
+
135
def pdf_to_images(
    input_path: Path,
    output_dir: Optional[Path] = None,
    dpi: int = DEFAULT_IMAGE_DPI,
    fmt: str = "png",
) -> None:
    """
    Convert each page of a PDF to an image file.

    fmt: png, jpeg, tiff (the aliases jpg and tif are accepted as well)

    Args:
        input_path: PDF to render; reported through ``abort`` if missing.
        output_dir: Directory for the page images. Defaults to
            ``<pdf stem>_images`` next to the PDF; created if necessary.
        dpi: Rendering resolution passed to pdf2image.
        fmt: Image format, also used as the file extension of each page.
    """
    try:
        from pdf2image import convert_from_path
    except ImportError:
        abort("pdf2image is required. Run: pip install pdf2image")

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    dest = output_dir or input_path.parent / (input_path.stem + "_images")
    dest.mkdir(parents=True, exist_ok=True)

    fmt_key = fmt.lower().lstrip(".")
    # Pillow registers its writers as "JPEG" and "TIFF"; upper-casing the
    # common "jpg"/"tif" spellings would name a format it does not know.
    pil_format = {"jpg": "JPEG", "tif": "TIFF"}.get(fmt_key, fmt_key.upper())

    info(f"Converting PDF pages to {fmt_key.upper()} images (DPI={dpi})...")
    pages = convert_from_path(str(input_path), dpi=dpi, fmt=fmt_key)

    with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
        task = progress.add_task(f"Exporting {len(pages)} pages...", total=len(pages))
        for i, page in enumerate(pages, start=1):
            out_file = dest / f"{input_path.stem}_page_{i:04d}.{fmt_key}"
            page.save(str(out_file), pil_format)
            progress.advance(task)

    success(f"Exported {len(pages)} page(s) -> [bold]{dest}/[/bold]")
dforge/dependencies.py ADDED
@@ -0,0 +1,98 @@
1
+ from pathlib import Path
2
+
3
+ from rich.console import Console
4
+ from rich.table import Table
5
+
6
+ from dforge.config_manager import get_tool_path
7
+
8
+ console = Console()
9
+
10
+
11
def has_poppler():
    """Return True when a Poppler path is configured and exists on disk."""
    configured = get_tool_path("poppler")
    if configured is None:
        return False
    return Path(configured).exists()
14
+
15
+
16
def has_tesseract():
    """Return True when a Tesseract path is configured and exists on disk."""
    configured = get_tool_path("tesseract")
    if configured is None:
        return False
    return Path(configured).exists()
19
+
20
+
21
def has_pandoc():
    """Return True when a Pandoc path is configured and exists on disk."""
    configured = get_tool_path("pandoc")
    if configured is None:
        return False
    return Path(configured).exists()
24
+
25
+
26
def has_ghostscript():
    """Return True when a Ghostscript path is configured and exists on disk."""
    configured = get_tool_path("ghostscript")
    if configured is None:
        return False
    return Path(configured).exists()
29
+
30
+
31
def doctor():
    """Print a table listing each external dependency as installed or missing."""
    report = Table(title="DForge System Check")
    for heading in ("Dependency", "Status"):
        report.add_column(heading)

    # Probes run in the order their rows are displayed.
    probes = (
        ("Poppler", has_poppler),
        ("Tesseract", has_tesseract),
        ("Ghostscript", has_ghostscript),
        ("Pandoc", has_pandoc),
    )
    results = [(label, probe()) for label, probe in probes]

    for label, found in results:
        if found:
            verdict = "✓ Installed"
        else:
            verdict = "✗ Missing"
        report.add_row(label, verdict)

    console.print(report)
51
+
52
+
53
def check_poppler():
    """Return True if Poppler is available; otherwise print setup advice and return False."""
    available = has_poppler()
    if not available:
        hint = (
            "\n[red]Poppler is required for PDF OCR.[/red]\n"
            "Run:\n"
            "[cyan]dforge setup[/cyan]\n"
        )
        console.print(hint)
        return False
    return True
63
+
64
+
65
def check_tesseract():
    """Return True if Tesseract is available; otherwise print setup advice and return False."""
    available = has_tesseract()
    if not available:
        hint = (
            "\n[red]Tesseract OCR is not installed.[/red]\n"
            "Run:\n"
            "[cyan]dforge setup[/cyan]\n"
        )
        console.print(hint)
        return False
    return True
75
+
76
+
77
def check_ghostscript():
    """Return True if Ghostscript is available; otherwise print setup advice and return False."""
    available = has_ghostscript()
    if not available:
        hint = (
            "\n[red]Ghostscript is not installed.[/red]\n"
            "Run:\n"
            "[cyan]dforge setup[/cyan]\n"
        )
        console.print(hint)
        return False
    return True
87
+
88
+
89
def check_pandoc():
    """Return True if Pandoc is available; otherwise print setup advice and return False."""
    available = has_pandoc()
    if not available:
        hint = (
            "\n[red]Pandoc is not installed.[/red]\n"
            "Run:\n"
            "[cyan]dforge setup[/cyan]\n"
        )
        console.print(hint)
        return False
    return True
dforge/engine.py ADDED
@@ -0,0 +1,236 @@
1
+ """
2
+ DForge OCR Module
3
+ Handles: image OCR, PDF OCR, searchable PDF generation, batch OCR
4
+ Output formats: TXT, JSON, Markdown
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import tempfile
11
+ from pathlib import Path
12
+ from typing import List, Optional
13
+
14
+ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
15
+
16
+ from dforge.utils import (
17
+ abort, console, ensure_parent, info, require_tesseract, success, warn,
18
+ )
19
+ from dforge.config import DEFAULT_OCR_LANG, DEFAULT_OCR_DPI, SUPPORTED_IMAGE_EXTS
20
+
21
+
22
+ # ---------------------------------------------------------------------------
23
+ # Core OCR helpers
24
+ # ---------------------------------------------------------------------------
25
+
26
def _run_tesseract(image_path: Path, lang: str) -> str:
    """OCR one image with Tesseract and return the text, stripped of surrounding whitespace.

    ``require_tesseract`` runs first; a missing ``pytesseract`` package is
    reported through ``abort``.
    """
    require_tesseract()
    try:
        import pytesseract
    except ImportError:
        abort("pytesseract is required. Run: pip install pytesseract")

    recognized = pytesseract.image_to_string(str(image_path), lang=lang)
    trimmed = recognized.strip()
    return trimmed
36
+
37
+
38
def _pdf_to_images(pdf_path: Path, dpi: int = DEFAULT_OCR_DPI) -> List[Path]:
    """Render every page of a PDF to a PNG file in a fresh temporary directory.

    Args:
        pdf_path: PDF to render.
        dpi: Rendering resolution passed to pdf2image.

    Returns:
        Paths of the rendered page images, all inside one directory created
        with ``tempfile.mkdtemp``. The caller owns these files and the
        directory and is responsible for deleting them.
    """
    import shutil

    try:
        from pdf2image import convert_from_path
    except ImportError:
        abort("pdf2image is required. Run: pip install pdf2image")

    tmp_dir = Path(tempfile.mkdtemp())
    try:
        # pdf2image already writes one PNG per page into output_folder.
        # Asking for the paths avoids re-encoding every page into a second
        # file and leaving the first set behind.
        rendered = convert_from_path(
            str(pdf_path),
            dpi=dpi,
            output_folder=str(tmp_dir),
            fmt="png",
            paths_only=True,
        )
    except BaseException:
        # Nothing is handed back on failure, so nobody else can clean this up.
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise
    return [Path(p) for p in rendered]
53
+
54
+
55
def _write_output(text: str, output_path: Path, fmt: str, source_name: str = "") -> None:
    """Serialize OCR text to ``output_path`` as JSON, Markdown, or plain text.

    ``fmt`` of ``"json"`` writes an object with the source name, the text and
    its word/character counts; ``"md"`` writes the text under a heading; any
    other value writes the text as-is. Output is always UTF-8.
    """
    ensure_parent(output_path)

    if fmt == "json":
        record = {
            "source": source_name,
            "text": text,
            "word_count": len(text.split()),
            "char_count": len(text),
        }
        payload = json.dumps(record, indent=2, ensure_ascii=False)
    elif fmt == "md":
        payload = f"# OCR Output: {source_name}\n\n{text}\n"
    else:
        # Plain text: the recognized text goes out unchanged.
        payload = text

    output_path.write_text(payload, encoding="utf-8")
71
+
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # Public API
75
+ # ---------------------------------------------------------------------------
76
+
77
def ocr_image(
    input_path: Path,
    output: Optional[Path] = None,
    lang: str = DEFAULT_OCR_LANG,
    fmt: str = "txt",
) -> str:
    """OCR a single image, write the result to disk, and return the text.

    When ``output`` is not given, the result is written next to the image
    with a suffix chosen from ``fmt`` (``.txt`` for unrecognized formats).
    """
    if not input_path.exists():
        abort(f"File not found: {input_path}")

    info(f"Running OCR on [bold]{input_path.name}[/bold] (lang: {lang})...")
    recognized = _run_tesseract(input_path, lang)

    suffix = {"txt": ".txt", "json": ".json", "md": ".md"}.get(fmt, ".txt")
    destination = output if output else input_path.with_suffix(suffix)
    _write_output(recognized, destination, fmt, input_path.name)

    word_total = len(recognized.split())
    success(f"OCR complete -> [bold]{destination}[/bold] ({word_total} words)")
    return recognized
96
+
97
+
98
def ocr_pdf(
    input_path: Path,
    output: Optional[Path] = None,
    lang: str = DEFAULT_OCR_LANG,
    fmt: str = "txt",
    dpi: int = DEFAULT_OCR_DPI,
) -> str:
    """Run OCR on all pages of a PDF.

    Args:
        input_path: PDF to process; reported through ``abort`` if missing.
        output: Destination file. Defaults to ``input_path`` with a suffix
            chosen from ``fmt``.
        lang: Tesseract language code(s).
        fmt: ``txt``, ``json`` or ``md``; anything else is written as text.
        dpi: Resolution used when rendering pages to images.

    Returns:
        The text of all pages, joined by a ``---`` separator line.
    """
    import shutil

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    info(f"Converting PDF pages to images (DPI={dpi})...")
    images = _pdf_to_images(input_path, dpi)
    all_text_parts: List[str] = []

    try:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TextColumn("{task.completed}/{task.total} pages"),
            console=console,
        ) as progress:
            task = progress.add_task("Running OCR...", total=len(images))
            for img in images:
                all_text_parts.append(_run_tesseract(img, lang))
                progress.advance(task)
                # Drop each page image as soon as it is processed to keep
                # disk usage low on long documents.
                img.unlink(missing_ok=True)
    finally:
        # _pdf_to_images renders into a directory it creates with mkdtemp.
        # Remove it entirely so nothing is left behind, including when OCR
        # fails part-way through.
        for tmp_dir in {img.parent for img in images}:
            shutil.rmtree(tmp_dir, ignore_errors=True)

    full_text = "\n\n---\n\n".join(all_text_parts)

    ext_map = {"txt": ".txt", "json": ".json", "md": ".md"}
    ext = ext_map.get(fmt, ".txt")
    out = output or input_path.with_suffix(ext)
    _write_output(full_text, out, fmt, input_path.name)
    success(f"OCR complete ({len(images)} pages) -> [bold]{out}[/bold]")
    return full_text
136
+
137
+
138
def make_searchable_pdf(
    input_path: Path,
    output: Optional[Path] = None,
    lang: str = DEFAULT_OCR_LANG,
    dpi: int = DEFAULT_OCR_DPI,
) -> None:
    """Create a searchable PDF from a scanned PDF using OCR.

    Each page is rendered to an image, converted by Tesseract into a
    single-page PDF, and the per-page PDFs are merged into one file.

    Args:
        input_path: Scanned PDF; reported through ``abort`` if missing.
        output: Destination file. Defaults to ``<stem>_searchable.pdf`` next
            to the input.
        lang: Tesseract language code(s).
        dpi: Resolution used when rendering pages to images.
    """
    import io

    require_tesseract()
    try:
        import pytesseract
        from pdf2image import convert_from_path
        from PIL import Image  # noqa: F401 - imported only to verify Pillow is installed
    except ImportError:
        abort("pytesseract, pdf2image and Pillow are required.")

    # Verify the merge dependency up front: discovering it is missing only
    # after every page has been OCR'd would throw all of that work away.
    try:
        from pypdf import PdfWriter, PdfReader
    except ImportError:
        abort("pypdf is required.")

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    out = output or input_path.with_name(input_path.stem + "_searchable.pdf")
    ensure_parent(out)

    info(f"Converting PDF to images (DPI={dpi})...")
    pages = convert_from_path(str(input_path), dpi=dpi)

    writer = PdfWriter()
    with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), console=console) as progress:
        task = progress.add_task(f"Processing {len(pages)} pages...", total=len(pages))
        for page_img in pages:
            # pytesseract can produce a searchable PDF per-page; append its
            # page(s) to the merged document right away.
            pdf_bytes = pytesseract.image_to_pdf_or_hocr(page_img, extension="pdf", lang=lang)
            for page in PdfReader(io.BytesIO(pdf_bytes)).pages:
                writer.add_page(page)
            progress.advance(task)

    with open(out, "wb") as fh:
        writer.write(fh)

    success(f"Searchable PDF created -> [bold]{out}[/bold]")
189
+
190
+
191
def batch_ocr(
    directory: Path,
    lang: str = DEFAULT_OCR_LANG,
    fmt: str = "txt",
    recursive: bool = True,
) -> None:
    """OCR every supported image and PDF found in a directory.

    Files are gathered with ``collect_files``; PDFs go through ``ocr_pdf``
    and everything else through ``ocr_image``. An exception raised for one
    file is recorded and reported at the end instead of stopping the batch.
    """
    from dforge.utils import collect_files

    if not directory.exists():
        abort(f"Directory not found: {directory}")

    wanted_exts = SUPPORTED_IMAGE_EXTS | {".pdf"}
    files = collect_files(directory, wanted_exts, recursive=recursive)

    if not files:
        warn(f"No supported files found in {directory}")
        return

    info(f"Found {len(files)} files to process...")
    failures = []

    columns = (
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("{task.completed}/{task.total}"),
    )
    # NOTE(review): ocr_pdf opens its own Progress on the same console while
    # this one is active; some Rich versions reject a second live display, in
    # which case PDFs would land in the failure list. Confirm against the
    # pinned Rich version.
    with Progress(*columns, console=console) as progress:
        task = progress.add_task("Batch OCR...", total=len(files))
        for path in files:
            handler = ocr_pdf if path.suffix.lower() == ".pdf" else ocr_image
            try:
                handler(path, lang=lang, fmt=fmt)
            except Exception as exc:
                failures.append((path, str(exc)))
            progress.advance(task)

    if failures:
        warn(f"{len(failures)} file(s) failed:")
        for path, message in failures:
            console.print(f"  [red]{path.name}[/red]: {message}")

    success(f"Batch OCR complete. Processed {len(files) - len(failures)}/{len(files)} files.")
dforge/extractor.py ADDED
@@ -0,0 +1,201 @@
1
+ """
2
+ DForge Extraction Module
3
+ Handles: text, images, metadata, and table extraction from PDFs
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ from pathlib import Path
10
+ from typing import Optional
11
+
12
+ from dforge.utils import abort, console, ensure_parent, info, success, warn
13
+
14
+
15
+ # ---------------------------------------------------------------------------
16
+ # Extract Text
17
+ # ---------------------------------------------------------------------------
18
+
19
def extract_text(input_path: Path, output: Optional[Path] = None) -> str:
    """Extract the text of every page of a PDF, save it, and return it.

    Each page is introduced by a ``--- Page N ---`` marker. The result is
    written as UTF-8 to ``output``, or next to the PDF with a ``.txt`` suffix
    when ``output`` is not given.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        abort("pypdf is required.")

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    reader = PdfReader(str(input_path))
    # extract_text() may yield nothing for a page; treat that as empty text.
    sections = [
        f"--- Page {number} ---\n{page.extract_text() or ''}"
        for number, page in enumerate(reader.pages, start=1)
    ]
    full_text = "\n\n".join(sections)

    destination = output if output else input_path.with_suffix(".txt")
    ensure_parent(destination)
    destination.write_text(full_text, encoding="utf-8")

    success(f"Text extracted ({len(reader.pages)} pages) -> [bold]{destination}[/bold]")
    return full_text
42
+
43
+
44
+ # ---------------------------------------------------------------------------
45
+ # Extract Images
46
+ # ---------------------------------------------------------------------------
47
+
48
def extract_images(input_path: Path, output_dir: Optional[Path] = None) -> None:
    """Extract all embedded images from a PDF.

    Args:
        input_path: PDF to read; reported through ``abort`` if missing.
        output_dir: Directory for the extracted files. Defaults to
            ``<pdf stem>_images`` next to the PDF; created if necessary.

    Files are named ``page<NNN>_<image name>``, where the image name and its
    extension come from pypdf.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        abort("pypdf is required.")

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    dest = output_dir or input_path.parent / (input_path.stem + "_images")
    dest.mkdir(parents=True, exist_ok=True)

    reader = PdfReader(str(input_path))
    count = 0

    try:
        for page_num, page in enumerate(reader.pages, start=1):
            # page.images yields each image re-encoded as a real image file
            # with a matching extension. Writing the decoded stream bytes
            # directly (e.g. raw /FlateDecode samples saved as ".png")
            # produces files no viewer can open.
            for image in page.images:
                # Keep only the final path component so an unusual resource
                # name cannot place the file outside dest.
                fname = dest / f"page{page_num:03d}_{Path(image.name).name}"
                fname.write_bytes(image.data)
                count += 1
    except ImportError:
        # pypdf relies on Pillow to decode and re-encode embedded images.
        abort("Pillow is required for image extraction. Run: pip install pillow")

    if count == 0:
        warn("No extractable images found in this PDF.")
    else:
        success(f"Extracted {count} image(s) -> [bold]{dest}/[/bold]")
91
+
92
+
93
+ # ---------------------------------------------------------------------------
94
+ # Extract Metadata
95
+ # ---------------------------------------------------------------------------
96
+
97
def extract_metadata(input_path: Path, output: Optional[Path] = None) -> dict:
    """Read a PDF's metadata, print it, optionally save it as JSON, and return it.

    Keys lose their leading ``/`` and every value is converted to ``str``.
    ``PageCount`` and ``Encrypted`` entries are added to the document's own
    metadata.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        abort("pypdf is required.")

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    reader = PdfReader(str(input_path))

    # A document without an info dictionary yields no metadata at all.
    meta = {}
    for raw_key, raw_value in (reader.metadata or {}).items():
        meta[raw_key.lstrip("/")] = str(raw_value)
    meta.update(
        PageCount=str(len(reader.pages)),
        Encrypted=str(reader.is_encrypted),
    )

    # Show the result on the console.
    console.print("\n[bold cyan]PDF Metadata[/bold cyan]")
    for field, value in meta.items():
        console.print(f"  [dim]{field}:[/dim] {value}")

    if not output:
        return meta

    ensure_parent(output)
    output.write_text(json.dumps(meta, indent=2), encoding="utf-8")
    success(f"Metadata saved -> [bold]{output}[/bold]")
    return meta
126
+
127
+
128
+ # ---------------------------------------------------------------------------
129
+ # Extract Tables
130
+ # ---------------------------------------------------------------------------
131
+
132
def extract_tables(
    input_path: Path,
    output: Optional[Path] = None,
    fmt: str = "csv",
) -> None:
    """
    Detect tables in a PDF with pdfplumber and export them.

    fmt: csv | xlsx | json (any other value is handled as csv)

    JSON and XLSX produce one file; CSV produces one file per table inside a
    directory, so ``output`` is treated as a directory in that case.
    """
    try:
        import pdfplumber
    except ImportError:
        abort("pdfplumber is required. Run: pip install pdfplumber")

    if not input_path.exists():
        abort(f"File not found: {input_path}")

    found = []
    info(f"Scanning {input_path.name} for tables...")

    with pdfplumber.open(str(input_path)) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            found.extend(
                {"page": page_no, "table_index": position, "data": rows}
                for position, rows in enumerate(page.extract_tables())
            )

    if not found:
        warn("No tables found in this PDF.")
        return

    total_tables = len(found)
    info(f"Found {total_tables} table(s) across the document.")

    if fmt == "json":
        target = output or input_path.with_suffix(".tables.json")
        ensure_parent(target)
        target.write_text(json.dumps(found, indent=2, ensure_ascii=False), encoding="utf-8")
        success(f"Tables (JSON) -> [bold]{target}[/bold]")
        return

    # Both remaining exporters go through pandas; only the hint differs.
    as_xlsx = fmt == "xlsx"
    try:
        import pandas as pd
    except ImportError:
        if as_xlsx:
            abort("pandas is required for XLSX export. Run: pip install pandas openpyxl")
        else:
            abort("pandas is required for CSV export.")

    if as_xlsx:
        target = output or input_path.with_suffix(".tables.xlsx")
        ensure_parent(target)
        with pd.ExcelWriter(str(target), engine="openpyxl") as writer:
            for item in found:
                # Sheet names are cut to 31 characters.
                label = f"P{item['page']}_T{item['table_index'] + 1}"[:31]
                frame = pd.DataFrame(item["data"])
                frame.to_excel(writer, sheet_name=label, index=False, header=False)
        success(f"Tables (XLSX, {total_tables} sheets) -> [bold]{target}[/bold]")
        return

    # CSV: one file per table.
    folder = output or input_path.parent / (input_path.stem + "_tables")
    folder.mkdir(parents=True, exist_ok=True)
    for item in found:
        csv_path = folder / f"page{item['page']:03d}_table{item['table_index'] + 1}.csv"
        frame = pd.DataFrame(item["data"])
        frame.to_csv(str(csv_path), index=False, header=False)
    success(f"Tables (CSV, {total_tables} files) -> [bold]{folder}/[/bold]")