dforge-cli 1.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dforge_cli-1.0.1/PKG-INFO +244 -0
- dforge_cli-1.0.1/README.md +197 -0
- dforge_cli-1.0.1/dforge/__init__.py +1 -0
- dforge_cli-1.0.1/dforge/banner.py +31 -0
- dforge_cli-1.0.1/dforge/batch.py +156 -0
- dforge_cli-1.0.1/dforge/cli.py +525 -0
- dforge_cli-1.0.1/dforge/config.py +38 -0
- dforge_cli-1.0.1/dforge/config_manager.py +33 -0
- dforge_cli-1.0.1/dforge/converter.py +167 -0
- dforge_cli-1.0.1/dforge/dependencies.py +98 -0
- dforge_cli-1.0.1/dforge/engine.py +236 -0
- dforge_cli-1.0.1/dforge/extractor.py +201 -0
- dforge_cli-1.0.1/dforge/loading.py +19 -0
- dforge_cli-1.0.1/dforge/menu.py +115 -0
- dforge_cli-1.0.1/dforge/operations.py +314 -0
- dforge_cli-1.0.1/dforge/processor.py +251 -0
- dforge_cli-1.0.1/dforge/setup.py +107 -0
- dforge_cli-1.0.1/dforge/theme.py +12 -0
- dforge_cli-1.0.1/dforge/utils.py +169 -0
- dforge_cli-1.0.1/dforge/watcher.py +137 -0
- dforge_cli-1.0.1/dforge/workflows/__init__.py +0 -0
- dforge_cli-1.0.1/dforge/workflows/automation.py +21 -0
- dforge_cli-1.0.1/dforge/workflows/batch.py +18 -0
- dforge_cli-1.0.1/dforge/workflows/batch_ocr.py +61 -0
- dforge_cli-1.0.1/dforge/workflows/common.py +133 -0
- dforge_cli-1.0.1/dforge/workflows/compress.py +73 -0
- dforge_cli-1.0.1/dforge/workflows/convert.py +148 -0
- dforge_cli-1.0.1/dforge/workflows/decrypt.py +50 -0
- dforge_cli-1.0.1/dforge/workflows/encrypt.py +50 -0
- dforge_cli-1.0.1/dforge/workflows/extract.py +18 -0
- dforge_cli-1.0.1/dforge/workflows/image.py +21 -0
- dforge_cli-1.0.1/dforge/workflows/merge.py +109 -0
- dforge_cli-1.0.1/dforge/workflows/ocr.py +104 -0
- dforge_cli-1.0.1/dforge/workflows/ocr_folder.py +0 -0
- dforge_cli-1.0.1/dforge/workflows/pages.py +57 -0
- dforge_cli-1.0.1/dforge/workflows/rotate.py +53 -0
- dforge_cli-1.0.1/dforge/workflows/searchable.py +51 -0
- dforge_cli-1.0.1/dforge/workflows/settings.py +56 -0
- dforge_cli-1.0.1/dforge/workflows/split.py +32 -0
- dforge_cli-1.0.1/dforge/workflows/tables.py +45 -0
- dforge_cli-1.0.1/dforge/workflows/watermark.py +54 -0
- dforge_cli-1.0.1/dforge_cli.egg-info/PKG-INFO +244 -0
- dforge_cli-1.0.1/dforge_cli.egg-info/SOURCES.txt +47 -0
- dforge_cli-1.0.1/dforge_cli.egg-info/dependency_links.txt +1 -0
- dforge_cli-1.0.1/dforge_cli.egg-info/entry_points.txt +2 -0
- dforge_cli-1.0.1/dforge_cli.egg-info/requires.txt +28 -0
- dforge_cli-1.0.1/dforge_cli.egg-info/top_level.txt +1 -0
- dforge_cli-1.0.1/pyproject.toml +66 -0
- dforge_cli-1.0.1/setup.cfg +4 -0
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dforge-cli
|
|
3
|
+
Version: 1.0.1
|
|
4
|
+
Summary: DForge — Unified Document Processing CLI. Forge your documents from your terminal.
|
|
5
|
+
Author: Punith Naidu
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: pdf,ocr,document,cli,conversion,tesseract,batch
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Environment :: Console
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Office/Business
|
|
19
|
+
Classifier: Topic :: Utilities
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
Requires-Dist: typer[all]>=0.9.0
|
|
23
|
+
Requires-Dist: pypdf>=3.0.0
|
|
24
|
+
Requires-Dist: pikepdf>=8.0.0
|
|
25
|
+
Requires-Dist: Pillow>=10.0.0
|
|
26
|
+
Requires-Dist: opencv-python-headless>=4.8.0
|
|
27
|
+
Requires-Dist: img2pdf>=0.4.4
|
|
28
|
+
Requires-Dist: watchdog>=3.0.0
|
|
29
|
+
Requires-Dist: tqdm>=4.66.0
|
|
30
|
+
Requires-Dist: rich>=13.0.0
|
|
31
|
+
Requires-Dist: questionary>=2.0.0
|
|
32
|
+
Requires-Dist: prompt_toolkit>=3.0.0
|
|
33
|
+
Requires-Dist: pyfiglet>=1.0.2
|
|
34
|
+
Provides-Extra: ocr
|
|
35
|
+
Requires-Dist: pytesseract>=0.3.10; extra == "ocr"
|
|
36
|
+
Requires-Dist: pdf2image>=1.16.3; extra == "ocr"
|
|
37
|
+
Provides-Extra: tables
|
|
38
|
+
Requires-Dist: pdfplumber>=0.10.0; extra == "tables"
|
|
39
|
+
Requires-Dist: pandas>=2.0.0; extra == "tables"
|
|
40
|
+
Requires-Dist: openpyxl>=3.1.0; extra == "tables"
|
|
41
|
+
Provides-Extra: full
|
|
42
|
+
Requires-Dist: pytesseract>=0.3.10; extra == "full"
|
|
43
|
+
Requires-Dist: pdf2image>=1.16.3; extra == "full"
|
|
44
|
+
Requires-Dist: pdfplumber>=0.10.0; extra == "full"
|
|
45
|
+
Requires-Dist: pandas>=2.0.0; extra == "full"
|
|
46
|
+
Requires-Dist: openpyxl>=3.1.0; extra == "full"
|
|
47
|
+
|
|
48
|
+
# DForge — Forge your documents from your terminal.
|
|
49
|
+
|
|
50
|
+
A unified, offline-first Python CLI for all your document processing needs.
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## Installation
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install dforge-cli
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### External Dependencies
|
|
61
|
+
|
|
62
|
+
| Tool | Purpose | Install |
|
|
63
|
+
|------|---------|---------|
|
|
64
|
+
| Tesseract OCR | OCR engine | [Install guide](https://tesseract-ocr.github.io/tessdoc/Installation.html) |
|
|
65
|
+
| Ghostscript | PDF compression | [ghostscript.com](https://ghostscript.com/releases/gsdnld.html) |
|
|
66
|
+
| Pandoc | Document conversion | [pandoc.org](https://pandoc.org/installing.html) |
|
|
67
|
+
| Poppler | PDF → image (pdf2image) | `apt install poppler-utils` / `brew install poppler` |
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## Quick Reference
|
|
72
|
+
|
|
73
|
+
### PDF Operations
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# Merge PDFs
|
|
77
|
+
dforge merge a.pdf b.pdf c.pdf -o merged.pdf
|
|
78
|
+
|
|
79
|
+
# Split into pages
|
|
80
|
+
dforge split report.pdf
|
|
81
|
+
|
|
82
|
+
# Compress (uses Ghostscript)
|
|
83
|
+
dforge compress large.pdf --preset ebook
|
|
84
|
+
|
|
85
|
+
# Rotate pages
|
|
86
|
+
dforge rotate file.pdf 90
|
|
87
|
+
|
|
88
|
+
# Extract page range
|
|
89
|
+
dforge pages file.pdf 1-5
|
|
90
|
+
|
|
91
|
+
# Watermark
|
|
92
|
+
dforge watermark file.pdf logo.png
|
|
93
|
+
|
|
94
|
+
# Encrypt / Decrypt
|
|
95
|
+
dforge encrypt file.pdf
|
|
96
|
+
dforge decrypt protected.pdf
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### OCR
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
# OCR an image
|
|
103
|
+
dforge ocr scan.png
|
|
104
|
+
|
|
105
|
+
# OCR a PDF
|
|
106
|
+
dforge ocr scan.pdf
|
|
107
|
+
|
|
108
|
+
# Output as JSON or Markdown
|
|
109
|
+
dforge ocr scan.pdf --fmt json
|
|
110
|
+
dforge ocr scan.pdf --fmt md
|
|
111
|
+
|
|
112
|
+
# Multi-language OCR
|
|
113
|
+
dforge ocr scan.png --lang eng+hin
|
|
114
|
+
|
|
115
|
+
# Make a scanned PDF searchable
|
|
116
|
+
dforge searchable scan.pdf
|
|
117
|
+
|
|
118
|
+
# Batch OCR an entire folder
|
|
119
|
+
dforge batch-ocr invoices/
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Document Conversion
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
# Convert DOCX → PDF
|
|
126
|
+
dforge convert report.docx pdf
|
|
127
|
+
|
|
128
|
+
# Convert Markdown → HTML
|
|
129
|
+
dforge convert notes.md html
|
|
130
|
+
|
|
131
|
+
# Combine images into a PDF
|
|
132
|
+
dforge img2pdf scans/
|
|
133
|
+
|
|
134
|
+
# Export PDF pages as images
|
|
135
|
+
dforge pdf2img report.pdf --dpi 300 --fmt png
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Content Extraction
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
# Extract text
|
|
142
|
+
dforge text report.pdf
|
|
143
|
+
|
|
144
|
+
# Extract embedded images
|
|
145
|
+
dforge images report.pdf
|
|
146
|
+
|
|
147
|
+
# Show / save metadata
|
|
148
|
+
dforge metadata report.pdf
|
|
149
|
+
dforge metadata report.pdf -o meta.json
|
|
150
|
+
|
|
151
|
+
# Extract tables
|
|
152
|
+
dforge tables invoice.pdf --fmt xlsx
|
|
153
|
+
dforge tables invoice.pdf --fmt csv
|
|
154
|
+
dforge tables invoice.pdf --fmt json
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Image Processing
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
# Enhance (contrast + sharpness)
|
|
161
|
+
dforge enhance scan.png
|
|
162
|
+
|
|
163
|
+
# Fix skewed scans
|
|
164
|
+
dforge deskew scan.png
|
|
165
|
+
|
|
166
|
+
# Remove noise
|
|
167
|
+
dforge denoise scan.png
|
|
168
|
+
|
|
169
|
+
# Resize
|
|
170
|
+
dforge resize photo.png --width 800
|
|
171
|
+
dforge resize photo.png --scale 0.5
|
|
172
|
+
|
|
173
|
+
# Full OCR preprocessing pipeline
|
|
174
|
+
dforge preprocess scan.png
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Batch Processing
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
# Batch OCR with 8 workers
|
|
181
|
+
dforge batch ./documents --ocr --workers 8
|
|
182
|
+
|
|
183
|
+
# Batch compress
|
|
184
|
+
dforge batch ./pdfs --compress
|
|
185
|
+
|
|
186
|
+
# Batch convert to markdown
|
|
187
|
+
dforge batch ./docs --convert md
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
### Watch Mode
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
# Auto-OCR new files dropped into a folder
|
|
194
|
+
dforge watch ./incoming --ocr
|
|
195
|
+
|
|
196
|
+
# Auto-make-searchable
|
|
197
|
+
dforge watch ./scans --searchable
|
|
198
|
+
|
|
199
|
+
# Auto-compress
|
|
200
|
+
dforge watch ./uploads --compress
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## Project Structure
|
|
206
|
+
|
|
207
|
+
```
|
|
208
|
+
dforge/
|
|
209
|
+
├── cli.py ← Typer CLI entry point
|
|
210
|
+
├── config.py ← Global configuration
|
|
211
|
+
├── utils.py ← Shared utilities
|
|
212
|
+
├── pdf/
|
|
213
|
+
│ └── operations.py ← merge, split, compress, rotate, pages, watermark, encrypt, decrypt
|
|
214
|
+
├── ocr/
|
|
215
|
+
│ └── engine.py ← ocr_image, ocr_pdf, make_searchable_pdf, batch_ocr
|
|
216
|
+
├── convert/
|
|
217
|
+
│ └── converter.py ← convert, images_to_pdf, pdf_to_images
|
|
218
|
+
├── extract/
|
|
219
|
+
│ └── extractor.py ← extract_text, extract_images, extract_metadata, extract_tables
|
|
220
|
+
├── image/
|
|
221
|
+
│ └── processor.py ← enhance, deskew, denoise, resize, preprocess_for_ocr
|
|
222
|
+
├── batch/
|
|
223
|
+
│ └── processor.py ← parallel batch processing
|
|
224
|
+
└── watch/
|
|
225
|
+
└── watcher.py ← watchdog-based directory monitor
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
## Supported Formats
|
|
231
|
+
|
|
232
|
+
| Category | Formats |
|
|
233
|
+
|----------|---------|
|
|
234
|
+
| Input documents | PDF, DOCX, ODT, MD, HTML, TXT, RST, EPUB |
|
|
235
|
+
| Input images | PNG, JPG/JPEG, TIFF/TIF, BMP, WebP |
|
|
236
|
+
| OCR output | TXT, JSON, Markdown |
|
|
237
|
+
| Table export | CSV, XLSX, JSON |
|
|
238
|
+
| Image export | PNG, JPEG, TIFF |
|
|
239
|
+
|
|
240
|
+
---
|
|
241
|
+
|
|
242
|
+
## License
|
|
243
|
+
|
|
244
|
+
MIT License — DForge Contributors
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
# DForge — Forge your documents from your terminal.
|
|
2
|
+
|
|
3
|
+
A unified, offline-first Python CLI for all your document processing needs.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install dforge-cli
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
### External Dependencies
|
|
14
|
+
|
|
15
|
+
| Tool | Purpose | Install |
|
|
16
|
+
|------|---------|---------|
|
|
17
|
+
| Tesseract OCR | OCR engine | [Install guide](https://tesseract-ocr.github.io/tessdoc/Installation.html) |
|
|
18
|
+
| Ghostscript | PDF compression | [ghostscript.com](https://ghostscript.com/releases/gsdnld.html) |
|
|
19
|
+
| Pandoc | Document conversion | [pandoc.org](https://pandoc.org/installing.html) |
|
|
20
|
+
| Poppler | PDF → image (pdf2image) | `apt install poppler-utils` / `brew install poppler` |
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Quick Reference
|
|
25
|
+
|
|
26
|
+
### PDF Operations
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# Merge PDFs
|
|
30
|
+
dforge merge a.pdf b.pdf c.pdf -o merged.pdf
|
|
31
|
+
|
|
32
|
+
# Split into pages
|
|
33
|
+
dforge split report.pdf
|
|
34
|
+
|
|
35
|
+
# Compress (uses Ghostscript)
|
|
36
|
+
dforge compress large.pdf --preset ebook
|
|
37
|
+
|
|
38
|
+
# Rotate pages
|
|
39
|
+
dforge rotate file.pdf 90
|
|
40
|
+
|
|
41
|
+
# Extract page range
|
|
42
|
+
dforge pages file.pdf 1-5
|
|
43
|
+
|
|
44
|
+
# Watermark
|
|
45
|
+
dforge watermark file.pdf logo.png
|
|
46
|
+
|
|
47
|
+
# Encrypt / Decrypt
|
|
48
|
+
dforge encrypt file.pdf
|
|
49
|
+
dforge decrypt protected.pdf
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### OCR
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
# OCR an image
|
|
56
|
+
dforge ocr scan.png
|
|
57
|
+
|
|
58
|
+
# OCR a PDF
|
|
59
|
+
dforge ocr scan.pdf
|
|
60
|
+
|
|
61
|
+
# Output as JSON or Markdown
|
|
62
|
+
dforge ocr scan.pdf --fmt json
|
|
63
|
+
dforge ocr scan.pdf --fmt md
|
|
64
|
+
|
|
65
|
+
# Multi-language OCR
|
|
66
|
+
dforge ocr scan.png --lang eng+hin
|
|
67
|
+
|
|
68
|
+
# Make a scanned PDF searchable
|
|
69
|
+
dforge searchable scan.pdf
|
|
70
|
+
|
|
71
|
+
# Batch OCR an entire folder
|
|
72
|
+
dforge batch-ocr invoices/
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### Document Conversion
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
# Convert DOCX → PDF
|
|
79
|
+
dforge convert report.docx pdf
|
|
80
|
+
|
|
81
|
+
# Convert Markdown → HTML
|
|
82
|
+
dforge convert notes.md html
|
|
83
|
+
|
|
84
|
+
# Combine images into a PDF
|
|
85
|
+
dforge img2pdf scans/
|
|
86
|
+
|
|
87
|
+
# Export PDF pages as images
|
|
88
|
+
dforge pdf2img report.pdf --dpi 300 --fmt png
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Content Extraction
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
# Extract text
|
|
95
|
+
dforge text report.pdf
|
|
96
|
+
|
|
97
|
+
# Extract embedded images
|
|
98
|
+
dforge images report.pdf
|
|
99
|
+
|
|
100
|
+
# Show / save metadata
|
|
101
|
+
dforge metadata report.pdf
|
|
102
|
+
dforge metadata report.pdf -o meta.json
|
|
103
|
+
|
|
104
|
+
# Extract tables
|
|
105
|
+
dforge tables invoice.pdf --fmt xlsx
|
|
106
|
+
dforge tables invoice.pdf --fmt csv
|
|
107
|
+
dforge tables invoice.pdf --fmt json
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Image Processing
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
# Enhance (contrast + sharpness)
|
|
114
|
+
dforge enhance scan.png
|
|
115
|
+
|
|
116
|
+
# Fix skewed scans
|
|
117
|
+
dforge deskew scan.png
|
|
118
|
+
|
|
119
|
+
# Remove noise
|
|
120
|
+
dforge denoise scan.png
|
|
121
|
+
|
|
122
|
+
# Resize
|
|
123
|
+
dforge resize photo.png --width 800
|
|
124
|
+
dforge resize photo.png --scale 0.5
|
|
125
|
+
|
|
126
|
+
# Full OCR preprocessing pipeline
|
|
127
|
+
dforge preprocess scan.png
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Batch Processing
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
# Batch OCR with 8 workers
|
|
134
|
+
dforge batch ./documents --ocr --workers 8
|
|
135
|
+
|
|
136
|
+
# Batch compress
|
|
137
|
+
dforge batch ./pdfs --compress
|
|
138
|
+
|
|
139
|
+
# Batch convert to markdown
|
|
140
|
+
dforge batch ./docs --convert md
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Watch Mode
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
# Auto-OCR new files dropped into a folder
|
|
147
|
+
dforge watch ./incoming --ocr
|
|
148
|
+
|
|
149
|
+
# Auto-make-searchable
|
|
150
|
+
dforge watch ./scans --searchable
|
|
151
|
+
|
|
152
|
+
# Auto-compress
|
|
153
|
+
dforge watch ./uploads --compress
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## Project Structure
|
|
159
|
+
|
|
160
|
+
```
|
|
161
|
+
dforge/
|
|
162
|
+
├── cli.py ← Typer CLI entry point
|
|
163
|
+
├── config.py ← Global configuration
|
|
164
|
+
├── utils.py ← Shared utilities
|
|
165
|
+
├── pdf/
|
|
166
|
+
│ └── operations.py ← merge, split, compress, rotate, pages, watermark, encrypt, decrypt
|
|
167
|
+
├── ocr/
|
|
168
|
+
│ └── engine.py ← ocr_image, ocr_pdf, make_searchable_pdf, batch_ocr
|
|
169
|
+
├── convert/
|
|
170
|
+
│ └── converter.py ← convert, images_to_pdf, pdf_to_images
|
|
171
|
+
├── extract/
|
|
172
|
+
│ └── extractor.py ← extract_text, extract_images, extract_metadata, extract_tables
|
|
173
|
+
├── image/
|
|
174
|
+
│ └── processor.py ← enhance, deskew, denoise, resize, preprocess_for_ocr
|
|
175
|
+
├── batch/
|
|
176
|
+
│ └── processor.py ← parallel batch processing
|
|
177
|
+
└── watch/
|
|
178
|
+
└── watcher.py ← watchdog-based directory monitor
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
---
|
|
182
|
+
|
|
183
|
+
## Supported Formats
|
|
184
|
+
|
|
185
|
+
| Category | Formats |
|
|
186
|
+
|----------|---------|
|
|
187
|
+
| Input documents | PDF, DOCX, ODT, MD, HTML, TXT, RST, EPUB |
|
|
188
|
+
| Input images | PNG, JPG/JPEG, TIFF/TIF, BMP, WebP |
|
|
189
|
+
| OCR output | TXT, JSON, Markdown |
|
|
190
|
+
| Table export | CSV, XLSX, JSON |
|
|
191
|
+
| Image export | PNG, JPEG, TIFF |
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
## License
|
|
196
|
+
|
|
197
|
+
MIT License — DForge Contributors
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.0.0"
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from rich.console import Console
|
|
2
|
+
from rich.panel import Panel
|
|
3
|
+
from rich.align import Align
|
|
4
|
+
|
|
5
|
+
# Module-level Rich console; show_banner() writes all of its output through it.
console = Console()
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def show_banner() -> None:
    """Print the DForge banner through the module-level ``console``.

    Output, in order: an empty line, a cyan-bordered panel containing the
    centred bold-cyan title text, a centred dim tagline, and a closing
    empty line. Takes no arguments and returns ``None``.
    """
    # Multi-line title; its line breaks become part of the panel content.
    # NOTE(review): confirm the whitespace inside this literal against the
    # packaged banner.py before relying on the exact layout.
    title = """
⚡ DFORGE

Fast Local Document Automation
"""

    console.print()

    # The title is wrapped in Rich markup and centred inside the panel.
    console.print(
        Panel(
            Align.center(f"[bold cyan]{title}[/bold cyan]"),
            border_style="cyan",
            padding=(1, 4),
        )
    )

    # Tagline printed below the panel, centred and dimmed.
    console.print(
        Align.center(
            "[dim]Forge your documents from your terminal[/dim]"
        )
    )

    console.print()
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DForge Batch Processing
|
|
3
|
+
Handles: batch OCR, batch PDF compression, batch document conversion
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Callable, Iterable
|
|
11
|
+
|
|
12
|
+
from rich.progress import Progress, TextColumn
|
|
13
|
+
|
|
14
|
+
from dforge.config import SUPPORTED_DOC_EXTS, SUPPORTED_IMAGE_EXTS, SUPPORTED_PDF_EXTS
|
|
15
|
+
from dforge.utils import (
|
|
16
|
+
abort,
|
|
17
|
+
collect_files,
|
|
18
|
+
console,
|
|
19
|
+
info,
|
|
20
|
+
success,
|
|
21
|
+
warn,
|
|
22
|
+
require_ghostscript,
|
|
23
|
+
require_pandoc,
|
|
24
|
+
require_tesseract,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _run_parallel(
    label: str,
    files: list[Path],
    workers: int,
    handler: Callable[[Path], None],
) -> list[tuple[Path, str]]:
    """Call ``handler`` once per path in ``files`` while showing a progress counter.

    Args:
        label: Description shown next to the ``completed/total`` counter.
        files: Paths to process.
        workers: Requested number of threads; ``None``, ``0`` and negative
            values are treated as ``1``.
        handler: Callable invoked with each path.

    Returns:
        One ``(path, str(exception))`` pair for every path whose handler
        raised an ``Exception``. With more than one worker the pairs appear
        in completion order.
    """
    pool_size = max(1, workers or 1)
    failures: list[tuple[Path, str]] = []

    with Progress(
        TextColumn("{task.description}"),
        TextColumn("{task.completed}/{task.total}"),
        console=console,
    ) as bar:
        task_id = bar.add_task(label, total=len(files))

        if pool_size > 1:
            # Threaded path: submit everything up front, then tick the
            # counter as each future finishes.
            with ThreadPoolExecutor(max_workers=pool_size) as pool:
                pending = {pool.submit(handler, item): item for item in files}
                for finished in as_completed(pending):
                    source = pending[finished]
                    try:
                        finished.result()
                    except Exception as exc:
                        failures.append((source, str(exc)))
                    bar.advance(task_id)
        else:
            # Single worker: run inline on the calling thread, no pool.
            for item in files:
                try:
                    handler(item)
                except Exception as exc:
                    failures.append((item, str(exc)))
                bar.advance(task_id)

    return failures
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _report_errors(errors: list[tuple[Path, str]]) -> None:
    """Print a summary of the files that failed during a batch run.

    Does nothing when ``errors`` is empty. Otherwise emits one warning with
    the number of failures, followed by one console line per failure showing
    the file name in red and the captured error text.

    Args:
        errors: ``(path, message)`` pairs, in the shape returned by
            ``_run_parallel``.
    """
    if not errors:
        return

    # Function-scope import, in the same style as the batch handlers' imports.
    from rich.markup import escape

    warn(f"{len(errors)} file(s) failed:")
    for path, err in errors:
        # File names and exception text are arbitrary strings. Without
        # escaping, a bracketed fragment such as "[final]" is parsed as a
        # markup tag and vanishes from the output, and a stray closing tag
        # such as "[/tmp]" makes Rich raise MarkupError while reporting.
        console.print(f" [red]{escape(path.name)}[/red]: {escape(err)}")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def batch_with_ocr(
    directory: Path,
    lang: str,
    fmt: str,
    recursive: bool = True,
    workers: int = 4,
) -> None:
    """Run OCR over every supported image and PDF found under ``directory``.

    Args:
        directory: Folder to scan.
        lang: Language value forwarded to the OCR functions.
        fmt: Output format value forwarded to the OCR functions.
        recursive: Forwarded to ``collect_files``.
        workers: Thread count forwarded to ``_run_parallel``.

    A missing path is reported through ``abort``; an empty scan result
    produces a warning and returns early. Per-file failures are listed by
    ``_report_errors`` before the final summary line.
    """
    if not directory.exists():
        abort(f"Directory not found: {directory}")

    require_tesseract()

    wanted_exts = SUPPORTED_IMAGE_EXTS | SUPPORTED_PDF_EXTS
    targets = collect_files(directory, wanted_exts, recursive=recursive)
    if not targets:
        warn(f"No supported files found in {directory}")
        return

    info(f"Found {len(targets)} file(s) to OCR...")

    def _ocr_one(target: Path) -> None:
        # The engine is imported per call, inside the handler, so an import
        # failure is captured like any other per-file error.
        if path_is_pdf := target.suffix.lower() == ".pdf":
            from dforge.engine import ocr_pdf
            ocr_pdf(target, lang=lang, fmt=fmt)
        if not path_is_pdf:
            from dforge.engine import ocr_image
            ocr_image(target, lang=lang, fmt=fmt)

    failures = _run_parallel("Batch OCR", targets, workers, _ocr_one)
    _report_errors(failures)

    total = len(targets)
    succeeded = total - len(failures)
    success(f"Batch OCR complete. Processed {succeeded}/{total} file(s).")
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def batch_compress(
    directory: Path,
    recursive: bool = True,
    workers: int = 4,
) -> None:
    """Compress every PDF found under ``directory``.

    Args:
        directory: Folder to scan.
        recursive: Forwarded to ``collect_files``.
        workers: Thread count forwarded to ``_run_parallel``.

    A missing path is reported through ``abort``; an empty scan result
    produces a warning and returns early. Per-file failures are listed by
    ``_report_errors`` before the final summary line.
    """
    if not directory.exists():
        abort(f"Directory not found: {directory}")

    require_ghostscript()

    pdfs = collect_files(directory, SUPPORTED_PDF_EXTS, recursive=recursive)
    if not pdfs:
        warn(f"No PDF files found in {directory}")
        return

    info(f"Found {len(pdfs)} PDF(s) to compress...")

    def _compress_one(pdf: Path) -> None:
        # Imported per call so an import failure is recorded per file.
        from dforge.operations import compress
        compress(pdf)

    failures = _run_parallel("Batch compress", pdfs, workers, _compress_one)
    _report_errors(failures)

    total = len(pdfs)
    succeeded = total - len(failures)
    success(f"Batch compress complete. Processed {succeeded}/{total} file(s).")
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def batch_convert(
    directory: Path,
    target_format: str,
    recursive: bool = True,
    workers: int = 4,
) -> None:
    """Convert every supported document found under ``directory``.

    Args:
        directory: Folder to scan.
        target_format: Format value passed to ``convert`` for each document.
        recursive: Forwarded to ``collect_files``.
        workers: Thread count forwarded to ``_run_parallel``.

    A missing path is reported through ``abort``; an empty scan result
    produces a warning and returns early. Per-file failures are listed by
    ``_report_errors`` before the final summary line.
    """
    if not directory.exists():
        abort(f"Directory not found: {directory}")

    require_pandoc()

    documents = collect_files(directory, SUPPORTED_DOC_EXTS, recursive=recursive)
    if not documents:
        warn(f"No convertible documents found in {directory}")
        return

    info(f"Found {len(documents)} document(s) to convert...")

    def _convert_one(document: Path) -> None:
        # Imported per call so an import failure is recorded per file.
        from dforge.converter import convert
        convert(document, target_format)

    failures = _run_parallel("Batch convert", documents, workers, _convert_one)
    _report_errors(failures)

    total = len(documents)
    succeeded = total - len(failures)
    success(f"Batch convert complete. Processed {succeeded}/{total} file(s).")
|