mdify-cli 2.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- assets/mdify.png +0 -0
- mdify/__init__.py +3 -0
- mdify/__main__.py +7 -0
- mdify/cli.py +915 -0
- mdify/container.py +167 -0
- mdify/docling_client.py +232 -0
- mdify_cli-2.7.0.dist-info/METADATA +274 -0
- mdify_cli-2.7.0.dist-info/RECORD +12 -0
- mdify_cli-2.7.0.dist-info/WHEEL +5 -0
- mdify_cli-2.7.0.dist-info/entry_points.txt +2 -0
- mdify_cli-2.7.0.dist-info/licenses/LICENSE +21 -0
- mdify_cli-2.7.0.dist-info/top_level.txt +1 -0
mdify/cli.py
ADDED
|
@@ -0,0 +1,915 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
CLI for converting documents to Markdown.
|
|
4
|
+
|
|
5
|
+
This CLI orchestrates document conversion by invoking a Docker/Podman
|
|
6
|
+
container that contains Docling and ML dependencies. The CLI itself
|
|
7
|
+
is lightweight and has no ML dependencies.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
import json
|
|
12
|
+
import os
|
|
13
|
+
import shutil
|
|
14
|
+
import subprocess
|
|
15
|
+
import sys
|
|
16
|
+
import threading
|
|
17
|
+
import time
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import List, Optional, Tuple
|
|
20
|
+
from urllib.error import URLError
|
|
21
|
+
from urllib.request import urlopen
|
|
22
|
+
|
|
23
|
+
from . import __version__
|
|
24
|
+
from mdify.container import DoclingContainer
|
|
25
|
+
from mdify.docling_client import convert_file
|
|
26
|
+
|
|
27
|
+
# Update-check settings
MDIFY_HOME = Path.home() / ".mdify"
# File read and written by the update-check helpers to remember the last check.
LAST_CHECK_FILE = MDIFY_HOME / ".last_check"
PYPI_API_URL = "https://pypi.org/pypi/mdify-cli/json"
CHECK_INTERVAL_SECONDS = 24 * 60 * 60  # 24 hours

# Container images and runtimes
DEFAULT_IMAGE = "ghcr.io/docling-project/docling-serve-cpu:main"
GPU_IMAGE = "ghcr.io/docling-project/docling-serve-cu126:main"
SUPPORTED_RUNTIMES = ("docker", "podman")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# =============================================================================
|
|
40
|
+
# Update checking functions
|
|
41
|
+
# =============================================================================
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _get_remote_version(timeout: int = 5, url: Optional[str] = None) -> Optional[str]:
    """
    Fetch the latest published version from the PyPI JSON API.

    Args:
        timeout: Network timeout in seconds for the request.
        url: JSON endpoint to query. Defaults to PYPI_API_URL when omitted.

    Returns:
        Version string (e.g., "1.1.0"), or None if the request failed or the
        response did not contain a non-empty string at info.version.
    """
    # Imported locally so this function does not need a new file-level import.
    from http.client import HTTPException

    endpoint = PYPI_API_URL if url is None else url
    try:
        with urlopen(endpoint, timeout=timeout) as response:
            data = json.loads(response.read().decode("utf-8"))
    except (OSError, ValueError, HTTPException):
        # OSError: URLError, timeouts and socket/SSL failures, including those
        # raised while reading the body. ValueError: invalid JSON, undecodable
        # bytes or a malformed URL. HTTPException: truncated/invalid responses.
        return None

    # Validate the shape explicitly so an unexpected payload (e.g. a list or
    # "info": null) yields None instead of raising AttributeError.
    info = data.get("info") if isinstance(data, dict) else None
    version = info.get("version") if isinstance(info, dict) else None
    if isinstance(version, str) and version:
        return version
    return None
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _should_check_for_update() -> bool:
    """
    Decide whether an update check is due.

    The check is disabled when MDIFY_NO_UPDATE_CHECK is "1", "true" or "yes"
    (case-insensitive). Otherwise it is due when the timestamp file is
    missing, unreadable or malformed, or when at least
    CHECK_INTERVAL_SECONDS have passed since the recorded time.

    Returns:
        True if check should be performed, False otherwise.
    """
    opt_out = os.environ.get("MDIFY_NO_UPDATE_CHECK", "").lower()
    if opt_out in ("1", "true", "yes"):
        return False

    if not LAST_CHECK_FILE.exists():
        return True

    try:
        recorded = LAST_CHECK_FILE.read_text().strip()
        previous = float(recorded)
    except (ValueError, OSError):
        # An unreadable or corrupt timestamp is treated as "never checked".
        return True

    return time.time() - previous >= CHECK_INTERVAL_SECONDS
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _update_last_check_time() -> None:
    """
    Record the current time in LAST_CHECK_FILE, creating its directory if needed.

    This is best effort: an OSError while creating the directory or writing
    the file is ignored.
    """
    stamp = str(time.time())
    marker_dir = LAST_CHECK_FILE.parent
    try:
        marker_dir.mkdir(parents=True, exist_ok=True)
        LAST_CHECK_FILE.write_text(stamp)
    except OSError:
        # Failing to persist the timestamp must never break the CLI.
        return
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _compare_versions(current: str, remote: str) -> bool:
    """
    Compare two dotted numeric version strings.

    The shorter version is padded with zeros, so "1.0" equals "1.0.0".
    Versions that are not strings or contain non-integer components make
    the comparison return False.

    Returns:
        True if remote version is newer than current.
    """
    try:
        mine = list(map(int, current.split(".")))
        theirs = list(map(int, remote.split(".")))
    except (ValueError, AttributeError):
        return False

    width = max(len(mine), len(theirs))
    mine += [0] * (width - len(mine))
    theirs += [0] * (width - len(theirs))
    return theirs > mine
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def check_for_update(force: bool = False) -> None:
    """
    Query PyPI for a newer release and print upgrade instructions if one exists.

    Args:
        force: If True, check regardless of last check time, report a failed
            lookup on stderr and exit with status 1, and confirm when the
            installed version is already current.
    """
    if not (force or _should_check_for_update()):
        return

    latest = _get_remote_version()
    if latest is None:
        # A failed lookup is only reported when the user asked for the check.
        if force:
            print(
                "Error: Failed to check for updates. "
                "Please check your internet connection.",
                file=sys.stderr,
            )
            sys.exit(1)
        return

    _update_last_check_time()

    if not _compare_versions(__version__, latest):
        if force:
            print(f"mdify is up to date (version {__version__})")
        return

    banner = "=" * 50
    notice = (
        f"\n{banner}",
        "A new version of mdify-cli is available!",
        f" Current version: {__version__}",
        f" Latest version: {latest}",
        banner,
        "\nTo upgrade, run:",
        " pipx upgrade mdify-cli",
        " # or: pip install --upgrade mdify-cli\n",
    )
    for line in notice:
        print(line)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# =============================================================================
|
|
150
|
+
# Container runtime functions
|
|
151
|
+
# =============================================================================
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def detect_runtime(preferred: str, explicit: bool = True) -> Optional[str]:
    """
    Locate a container runtime executable on PATH.

    The preferred runtime is tried first; if it is missing, the other one is
    tried ("podman" when preferred is "docker", otherwise "docker").

    Args:
        preferred: Preferred runtime ('docker' or 'podman')
        explicit: If True, print a warning to stderr when falling back to
            the alternative. If False, fall back silently. The search
            order is the same either way.

    Returns:
        Path to runtime executable, or None if not found.
    """
    fallback = "podman" if preferred == "docker" else "docker"

    for position, candidate in enumerate((preferred, fallback)):
        found = shutil.which(candidate)
        if not found:
            continue
        # Only the second candidate counts as a fallback worth warning about.
        if position == 1 and explicit:
            print(
                f"Warning: {preferred} not found, using {fallback}", file=sys.stderr
            )
        return found

    return None
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def check_image_exists(runtime: str, image: str) -> bool:
    """
    Report whether `<runtime> image inspect <image>` succeeds.

    Args:
        runtime: Path to container runtime
        image: Image name/tag

    Returns:
        True if the inspect command exits with status 0; False if it exits
        non-zero or the runtime cannot be executed.
    """
    command = [runtime, "image", "inspect", image]
    try:
        completed = subprocess.run(command, capture_output=True, check=False)
    except OSError:
        return False
    return completed.returncode == 0
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def pull_image(runtime: str, image: str, quiet: bool = False) -> bool:
    """
    Run `<runtime> pull <image>`.

    Args:
        runtime: Path to container runtime
        image: Image name/tag
        quiet: If True, skip the "Pulling image" message and capture the
            runtime's output instead of letting it reach the terminal.

    Returns:
        True if the pull command exits with status 0. False otherwise; when
        the runtime cannot be executed the error is printed to stderr.
    """
    if not quiet:
        print(f"Pulling image: {image}")

    command = [runtime, "pull", image]
    try:
        completed = subprocess.run(command, capture_output=quiet, check=False)
    except OSError as exc:
        print(f"Error pulling image: {exc}", file=sys.stderr)
        return False
    return completed.returncode == 0
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def get_image_size_estimate(runtime: str, image: str) -> Optional[int]:
    """
    Estimate image size by querying registry manifest.

    Runs `<runtime> manifest inspect --verbose <image>` and sums all layer sizes
    across all listed manifests, then applies 50% buffer for decompression.

    Args:
        runtime: Path to container runtime
        image: Image name/tag

    Returns:
        Estimated size in bytes with 50% buffer (0 when no layer sizes were
        found), or None if the command fails or its output is not usable JSON.
    """
    try:
        result = subprocess.run(
            [runtime, "manifest", "inspect", "--verbose", image],
            capture_output=True,
            check=False,
        )
    except OSError:
        return None
    if result.returncode != 0:
        return None

    try:
        manifest_data = json.loads(result.stdout.decode())
    except ValueError:
        # Covers both invalid JSON and undecodable bytes.
        return None

    # Accept an object with a "Manifests" list, a bare list of entries, or a
    # single entry object.
    # NOTE(review): the exact --verbose output shape differs between runtimes
    # and image types; confirm against the runtime versions in use.
    if isinstance(manifest_data, dict):
        entries = manifest_data.get("Manifests", [manifest_data])
    else:
        entries = manifest_data
    if not isinstance(entries, list):
        return None

    total_size = 0
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        for key in ("OCIManifest", "SchemaV2Manifest"):
            inner = entry.get(key)
            layers = inner.get("layers") if isinstance(inner, dict) else None
            if not isinstance(layers, list):
                continue
            for layer in layers:
                size = layer.get("size") if isinstance(layer, dict) else None
                if isinstance(size, int) and not isinstance(size, bool):
                    total_size += size
            # Count each entry once even if it carries both manifest keys.
            break

    # Apply 50% buffer for decompression (compressed -> uncompressed)
    return int(total_size * 1.5)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def format_size(size_bytes: int) -> str:
    """
    Render a byte count with a binary (1024-based) unit.

    Values below 1024 are shown as-is in "B"; larger values are shown with
    one decimal in KB, MB or GB, and anything beyond that in TB.
    """
    value = size_bytes
    if value < 1024:
        return f"{value} B"

    for unit in ("KB", "MB", "GB"):
        value /= 1024
        if value < 1024:
            return f"{value:.1f} {unit}"

    return f"{value / 1024:.1f} TB"
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def format_duration(seconds: float) -> str:
    """
    Format a duration in seconds as "S.Ss", "Mm Ss" or "Hh Mm Ss".

    Durations that display as less than a minute keep one decimal. Longer
    durations are rounded to whole seconds before being split into units,
    so the seconds field can never show "60".
    """
    # Compare the rounded value so 59.96 is not rendered as "60.0s".
    if round(seconds, 1) < 60:
        return f"{seconds:.1f}s"

    # Round first, then split; splitting first is what produced "1m 60s".
    total_seconds = int(round(seconds))
    minutes, secs = divmod(total_seconds, 60)
    if minutes < 60:
        return f"{minutes}m {secs}s"

    hours, mins = divmod(minutes, 60)
    return f"{hours}h {mins}m {secs}s"
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def get_free_space(path: str) -> int:
    """
    Return the free disk space in bytes for the filesystem containing path.

    Returns 0 when the path does not exist or cannot be queried.
    """
    try:
        usage = shutil.disk_usage(path)
    except (FileNotFoundError, OSError):
        return 0
    return usage.free
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def get_storage_root(runtime: str) -> Optional[str]:
    """
    Get the storage root directory for Docker or Podman.

    Docker is queried with `system info --format {{.DockerRootDir}}`, Podman
    with `info --format json` (reading store.graphRoot).

    Args:
        runtime: Path to container runtime executable

    Returns:
        Storage root path as string, or None if the runtime is neither
        docker nor podman, the command cannot be run or exits non-zero, or
        its output does not contain a usable path.
    """
    # Extract runtime name from path (e.g., /usr/bin/docker -> docker).
    # shutil.which() on Windows returns the file extension too, so strip it.
    runtime_name = os.path.basename(runtime).lower()
    if runtime_name.endswith(".exe"):
        runtime_name = runtime_name[: -len(".exe")]

    if runtime_name == "docker":
        command = [runtime, "system", "info", "--format", "{{.DockerRootDir}}"]
    elif runtime_name == "podman":
        command = [runtime, "info", "--format", "json"]
    else:
        return None

    try:
        result = subprocess.run(command, capture_output=True, check=False)
    except OSError:
        return None
    # Output from a failed command is not trusted as a path.
    if result.returncode != 0:
        return None

    output = result.stdout.decode(errors="replace").strip()
    if not output:
        return None
    if runtime_name == "docker":
        return output

    try:
        info = json.loads(output)
    except ValueError:
        return None
    # Validate each level so "store": null or a non-object payload gives None.
    store = info.get("store") if isinstance(info, dict) else None
    graph_root = store.get("graphRoot") if isinstance(store, dict) else None
    if isinstance(graph_root, str) and graph_root:
        return graph_root
    return None
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def confirm_proceed(message: str, default_no: bool = True) -> bool:
    """
    Prompt user for confirmation with a yes/no prompt on stderr.

    Args:
        message: The confirmation message to display
        default_no: If True, shows [y/N] (default no). If False, shows [Y/n] (default yes)

    Returns:
        True if the user answered "y" or "yes" (case-insensitive, surrounding
        whitespace ignored), or pressed Enter while the default is yes.
        False for any other answer, when input ends before an answer is
        read, or immediately if stdin is not a TTY (non-interactive).
    """
    if not sys.stdin.isatty():
        return False

    prompt = "[y/N]" if default_no else "[Y/n]"
    print(f"{message} {prompt}", file=sys.stderr)
    try:
        response = input().strip().lower()
    except EOFError:
        # Input closed (e.g. Ctrl-D): never treat that as consent.
        return False

    if not response:
        # Bare Enter selects the default advertised by the prompt.
        return not default_no
    return response in ("y", "yes")
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
class Spinner:
    """
    A simple spinner to show progress during long operations.

    start() launches a daemon thread that redraws "<prefix> <frame> (<elapsed>)"
    on the current stdout line roughly every 0.1 seconds; stop() ends the
    thread and blanks the line. One instance can be started and stopped
    repeatedly.
    """

    def __init__(self):
        self.frames = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
        self.running = False
        self.thread = None
        self.start_time = None
        # Text shown before the spinner frame; set by start().
        self.prefix = ""
        # Signals the worker of the current run to exit without waiting out
        # its sleep; replaced on every start() so an old worker cannot revive.
        self._stop_event = threading.Event()
        # Widest line drawn so far, so stop() can erase all of it.
        self._last_width = 0

    def _spin(self):
        """Worker loop: redraw the spinner line until this run is stopped."""
        stop_event = self._stop_event  # the event belonging to this run
        idx = 0
        while self.running and not stop_event.is_set():
            elapsed = time.time() - self.start_time
            frame = self.frames[idx % len(self.frames)]
            line = f"{self.prefix} {frame} ({format_duration(elapsed)})"
            self._last_width = max(self._last_width, len(line))
            print(f"\r{line}", end="", flush=True)
            idx += 1
            # Wait instead of sleep so stop() takes effect immediately.
            if stop_event.wait(0.1):
                break

    def start(self, prefix: str = ""):
        """Begin spinning with the given prefix text."""
        self.prefix = prefix
        self.running = True
        self.start_time = time.time()
        self._stop_event = threading.Event()
        self.thread = threading.Thread(target=self._spin, daemon=True)
        self.thread.start()

    def stop(self):
        """Stop the spinner thread and clear the spinner line."""
        self.running = False
        self._stop_event.set()
        if self.thread:
            self.thread.join(timeout=0.5)
        # Clear the spinner line (at least 80 columns, more if we drew more)
        width = max(80, self._last_width)
        print(f"\r{' ' * width}\r", end="", flush=True)
        self._last_width = 0
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
# =============================================================================
|
|
398
|
+
# File handling functions
|
|
399
|
+
# =============================================================================
|
|
400
|
+
|
|
401
|
+
# File suffixes accepted for conversion (based on Docling InputFormat).
# Compared against lower-cased suffixes, so every entry is lower-case.
SUPPORTED_EXTENSIONS = {
    # office documents and web pages
    ".pdf", ".docx", ".pptx", ".html", ".htm",
    # images
    ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".tif",
    # asciidoc
    ".asciidoc", ".adoc", ".asc",
    # markdown
    ".md", ".markdown",
    # spreadsheets
    ".csv", ".xlsx",
    # XML formats
    ".xml",
    # JSON docling
    ".json",
    # audio
    ".mp3", ".wav", ".m4a", ".flac",
    # subtitles
    ".vtt",
}
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def get_files_to_convert(input_path: Path, mask: str, recursive: bool) -> List[Path]:
    """
    Get list of files to convert based on input path and options.

    Args:
        input_path: A single file, or a directory to scan.
        mask: Glob pattern applied when input_path is a directory.
        recursive: If True, match the pattern in all subdirectories too.

    Returns:
        Sorted list of regular files whose name does not start with "." and
        whose lower-cased suffix is in SUPPORTED_EXTENSIONS. A single-file
        input is subject to the same filter.

    Raises:
        FileNotFoundError: If input_path is neither a file nor a directory.
    """
    if input_path.is_file():
        candidates = [input_path]
    elif input_path.is_dir():
        matches = input_path.rglob(mask) if recursive else input_path.glob(mask)
        # Filter to only files
        candidates = [path for path in matches if path.is_file()]
    else:
        raise FileNotFoundError(f"Input path does not exist: {input_path}")

    # Glob results come back in no particular order; sort them so the
    # processing order is the same on every run and filesystem.
    return sorted(
        path
        for path in candidates
        if not path.name.startswith(".")
        and path.suffix.lower() in SUPPORTED_EXTENSIONS
    )
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
def get_output_path(
    input_file: Path,
    input_base: Path,
    output_dir: Path,
    flat: bool,
) -> Path:
    """
    Calculate the Markdown output path for a given input file.

    Args:
        input_file: File being converted.
        input_base: Directory that input_file is made relative to.
        output_dir: Directory that receives the output.
        flat: If True, write directly into output_dir and encode the
            relative parent directories in the file name, joined with "_".
            If False, mirror the relative parent directories below
            output_dir.

    Returns:
        Path ending in "<stem>.md". When input_file is not located under
        input_base, the file goes straight into output_dir with no prefix.
    """
    try:
        relative = input_file.relative_to(input_base)
    except ValueError:
        relative = None

    if not flat:
        markdown_name = input_file.stem + ".md"
        if relative is None:
            return output_dir / markdown_name
        return output_dir / relative.parent / markdown_name

    parent_parts = [] if relative is None else list(relative.parts[:-1])
    prefix = "_".join(parent_parts)
    if prefix:
        markdown_name = f"{prefix}_{input_file.stem}.md"
    else:
        markdown_name = f"{input_file.stem}.md"
    return output_dir / markdown_name
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
# =============================================================================
|
|
493
|
+
# CLI argument parsing
|
|
494
|
+
# =============================================================================
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """
    Parse command line arguments.

    Args:
        argv: Argument list to parse. When None (the default), argparse
            reads the arguments from sys.argv.

    Returns:
        Namespace with: input, out_dir, glob, recursive, flat, overwrite,
        quiet, yes, mask, gpu, port, runtime, image, pull, timeout and
        check_update.
    """
    # RawDescriptionHelpFormatter prints the epilog verbatim, so the example
    # columns are aligned by hand here.
    parser = argparse.ArgumentParser(
        description="Convert documents to Markdown using Docling (via container)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  mdify document.pdf                  Convert a single file
  mdify ./docs -g "*.pdf" -r          Convert PDFs recursively
  mdify ./docs -g "*.pdf" -o out/     Specify output directory
  mdify document.pdf -m               Mask PII in images
  mdify ./docs --runtime podman       Use Podman instead of Docker
""",
    )

    # Optional so that --check-update works without an input path.
    parser.add_argument(
        "input",
        type=str,
        nargs="?",
        help="Input file or directory to convert",
    )

    parser.add_argument(
        "-o",
        "--out-dir",
        type=str,
        default="output",
        help="Output directory for converted files (default: output)",
    )

    parser.add_argument(
        "-g",
        "--glob",
        type=str,
        default="*",
        help="Glob pattern for filtering files in directory (default: *)",
    )

    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help="Recursively scan directories",
    )

    parser.add_argument(
        "--flat",
        action="store_true",
        help="Disable directory structure preservation in output",
    )

    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing output files",
    )

    parser.add_argument(
        "-q",
        "--quiet",
        action="store_true",
        help="Suppress progress messages",
    )

    parser.add_argument(
        "-y",
        "--yes",
        action="store_true",
        help="Skip confirmation prompts (for scripts/CI)",
    )

    parser.add_argument(
        "-m",
        "--mask",
        action="store_true",
        help="Mask PII and sensitive content in document images",
    )

    parser.add_argument(
        "--gpu",
        action="store_true",
        help="Use GPU-accelerated container image (docling-serve-cu126)",
    )

    parser.add_argument(
        "--port",
        type=int,
        default=5001,
        help="Port for docling-serve container (default: 5001)",
    )

    # Container options
    parser.add_argument(
        "--runtime",
        type=str,
        choices=SUPPORTED_RUNTIMES,
        default=None,
        help="Container runtime to use (auto-detects docker or podman if not specified)",
    )

    parser.add_argument(
        "--image",
        type=str,
        default=DEFAULT_IMAGE,
        help=f"Container image to use (default: {DEFAULT_IMAGE})",
    )

    parser.add_argument(
        "--pull",
        type=str,
        choices=("always", "missing", "never"),
        default="missing",
        help="Image pull policy: always, missing, never (default: missing)",
    )

    # Default is None so the caller can tell "not given" from an explicit value.
    parser.add_argument(
        "--timeout",
        type=int,
        default=None,
        help="Conversion timeout in seconds (default: 1200, can be set via MDIFY_TIMEOUT env var)",
    )

    # Utility options
    parser.add_argument(
        "--check-update",
        action="store_true",
        help="Check for available updates and exit",
    )

    parser.add_argument(
        "--version",
        action="version",
        version=f"mdify {__version__}",
    )

    return parser.parse_args(argv)
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
# =============================================================================
|
|
636
|
+
# Main entry point
|
|
637
|
+
# =============================================================================
|
|
638
|
+
|
|
639
|
+
|
|
640
|
+
def _resolve_timeout(cli_timeout: Optional[int]) -> int:
    """Return the conversion timeout: CLI value, else MDIFY_TIMEOUT, else 1200.

    Raises:
        ValueError: If MDIFY_TIMEOUT is set to something that is not an integer.
    """
    if cli_timeout:
        return cli_timeout
    raw = os.environ.get("MDIFY_TIMEOUT")
    if raw is None or not raw.strip():
        return 1200
    return int(raw)


def _confirm_low_disk_space(assume_yes: bool) -> Optional[int]:
    """Decide how to continue after a disk-space warning was printed.

    Returns:
        None to continue, 1 when running non-interactively without --yes,
        or 130 when the user declined the prompt.
    """
    if assume_yes:
        print(" Proceeding anyway (--yes flag set)", file=sys.stderr)
        return None
    if not sys.stdin.isatty():
        print(
            " Run with --yes to proceed anyway, or free up disk space",
            file=sys.stderr,
        )
        return 1
    if not confirm_proceed("Continue anyway?"):
        return 130
    return None


def _check_disk_space(runtime: str, image: str, assume_yes: bool) -> Optional[int]:
    """Warn when pulling the image would exhaust or nearly exhaust disk space.

    The check is skipped when the storage root or the image size estimate
    is unavailable.

    Returns:
        None to continue with the pull, or an exit code to abort with.
    """
    storage_root = get_storage_root(runtime)
    if not storage_root:
        return None
    image_size = get_image_size_estimate(runtime, image)
    if not image_size:
        return None

    free_space = get_free_space(storage_root)
    remaining = free_space - image_size
    if remaining < 0:
        print(
            f"Warning: Not enough free disk space on {storage_root}",
            file=sys.stderr,
        )
        print(f" Available: {format_size(free_space)}", file=sys.stderr)
        print(f" Required: {format_size(image_size)} (estimated)", file=sys.stderr)
    elif remaining < 1024 * 1024 * 1024:
        print(
            f"Warning: Less than 1 GB would remain after pulling image on {storage_root}",
            file=sys.stderr,
        )
        print(f" Available: {format_size(free_space)}", file=sys.stderr)
        print(f" Required: {format_size(image_size)} (estimated)", file=sys.stderr)
        print(f" Remaining: {format_size(remaining)}", file=sys.stderr)
    else:
        return None

    return _confirm_low_disk_space(assume_yes)


def main() -> int:
    """
    Main entry point for the CLI.

    Parses arguments, runs the update check, makes sure the container image
    is available, starts the docling-serve container and converts every
    matching input file to Markdown.

    Returns:
        Process exit code: 0 on success (including "everything skipped"),
        1 on usage, pull or conversion errors, 2 when no container runtime
        is installed, 130 when interrupted or when a prompt was declined.
    """
    args = parse_args()

    # Handle --check-update flag
    if args.check_update:
        check_for_update(force=True)
        return 0

    # Check for updates (daily, silent on errors)
    check_for_update(force=False)

    # Resolve timeout value: CLI > env > default 1200
    try:
        timeout = _resolve_timeout(args.timeout)
    except ValueError:
        print(
            "Error: MDIFY_TIMEOUT must be an integer number of seconds",
            file=sys.stderr,
        )
        return 1

    # Validate input is provided
    if args.input is None:
        print("Error: Input file or directory is required", file=sys.stderr)
        print("Usage: mdify <input> [options]", file=sys.stderr)
        print(" mdify --help for more information", file=sys.stderr)
        return 1

    # Detect container runtime
    preferred = args.runtime if args.runtime else "docker"
    explicit = args.runtime is not None
    runtime = detect_runtime(preferred, explicit=explicit)
    if runtime is None:
        print(
            f"Error: Container runtime not found ({', '.join(SUPPORTED_RUNTIMES)})",
            file=sys.stderr,
        )
        print("Please install Docker or Podman to use mdify.", file=sys.stderr)
        return 2

    # Determine image based on --gpu flag (--gpu takes precedence over --image)
    if args.gpu:
        image = GPU_IMAGE
    elif args.image:
        image = args.image
    else:
        image = DEFAULT_IMAGE

    image_exists = check_image_exists(runtime, image)

    # NOTE: Docker Desktop on macOS/Windows uses a VM, so disk space checks may not
    # accurately reflect available space in the container's filesystem. Remote Docker
    # daemons (DOCKER_HOST) are also not supported. In these cases, the check will
    # gracefully degrade (warn and proceed).

    # Handle image pull policy; disk space is only checked when a pull will happen.
    will_pull = args.pull == "always" or (args.pull == "missing" and not image_exists)
    if will_pull:
        abort_code = _check_disk_space(runtime, image, args.yes)
        if abort_code is not None:
            return abort_code
        if not pull_image(runtime, image, args.quiet):
            print(f"Error: Failed to pull image: {image}", file=sys.stderr)
            return 1
    elif args.pull == "never" and not image_exists:
        print(f"Error: Image not found locally: {image}", file=sys.stderr)
        # Name the runtime that was actually detected, and keep the hint on
        # stderr together with the error it belongs to.
        print(
            f"Run with --pull=missing or pull manually: {runtime} pull {image}",
            file=sys.stderr,
        )
        return 1

    # Resolve paths (use absolute() as fallback if resolve() fails due to permissions)
    try:
        input_path = Path(args.input).resolve()
    except PermissionError:
        input_path = Path(args.input).absolute()
    try:
        output_dir = Path(args.out_dir).resolve()
    except PermissionError:
        output_dir = Path(args.out_dir).absolute()

    # Validate input
    if not input_path.exists():
        print(f"Error: Input path does not exist: {input_path}", file=sys.stderr)
        return 1

    # Get files to convert
    try:
        files_to_convert = get_files_to_convert(input_path, args.glob, args.recursive)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    if not files_to_convert:
        print(f"No files found to convert in: {input_path}", file=sys.stderr)
        return 1

    total_files = len(files_to_convert)
    total_size = sum(f.stat().st_size for f in files_to_convert)

    if not args.quiet:
        print(f"Found {total_files} file(s) to convert ({format_size(total_size)})")
        print(f"Using runtime: {runtime}")
        print(f"Using image: {image}")
        print()

    if args.mask:
        print(
            "Warning: --mask is not supported with docling-serve and will be ignored",
            file=sys.stderr,
        )

    # Determine input base for directory structure preservation
    input_base = input_path.parent if input_path.is_file() else input_path

    success_count = 0
    skipped_count = 0
    failed_count = 0
    total_elapsed = 0.0

    try:
        if not args.quiet:
            print("Starting docling-serve container...")
            print()

        with DoclingContainer(runtime, image, args.port, timeout=timeout) as container:
            conversion_start = time.time()
            spinner = Spinner()

            for idx, input_file in enumerate(files_to_convert, 1):
                output_file = get_output_path(
                    input_file, input_base, output_dir, args.flat
                )
                file_size = input_file.stat().st_size
                progress = f"[{idx}/{total_files}]"

                # Check if output exists and skip if not overwriting
                if output_file.exists() and not args.overwrite:
                    if not args.quiet:
                        print(f"{progress} Skipped (exists): {input_file.name}")
                    skipped_count += 1
                    continue

                # Ensure output directory exists
                output_file.parent.mkdir(parents=True, exist_ok=True)

                # Show spinner while processing
                if not args.quiet:
                    spinner.start(
                        f"{progress} Processing: {input_file.name} ({format_size(file_size)})"
                    )

                start_time = time.time()
                error_msg = None
                try:
                    try:
                        # Convert via HTTP API
                        result = convert_file(
                            container.base_url, input_file, to_format="md"
                        )
                    finally:
                        # Runs on success, failure and Ctrl-C alike, so the
                        # spinner never keeps drawing over later output.
                        elapsed = time.time() - start_time
                        if not args.quiet:
                            spinner.stop()

                    if result.success:
                        # Write UTF-8 explicitly; the platform default
                        # encoding cannot represent all Markdown content.
                        output_file.write_text(result.content, encoding="utf-8")
                    else:
                        error_msg = result.error or "Unknown error"
                except Exception as e:
                    error_msg = str(e)

                if error_msg is None:
                    success_count += 1
                    if not args.quiet:
                        print(
                            f"{progress} {input_file.name} ✓ ({format_duration(elapsed)})"
                        )
                else:
                    failed_count += 1
                    if not args.quiet:
                        print(
                            f"{progress} {input_file.name} ✗ ({format_duration(elapsed)})"
                        )
                    # Errors go to stderr regardless of --quiet, like the
                    # other error paths in this function.
                    print(f" Error: {error_msg}", file=sys.stderr)

            total_elapsed = time.time() - conversion_start

        # Print summary
        if not args.quiet:
            print()
            print("=" * 50)
            print("Conversion Summary:")
            print(f" Total files: {total_files}")
            print(f" Successful: {success_count}")
            print(f" Skipped: {skipped_count}")
            print(f" Failed: {failed_count}")
            print(f" Total time: {format_duration(total_elapsed)}")
            print("=" * 50)

    except KeyboardInterrupt:
        if not args.quiet:
            print("\n\nInterrupted by user. Container stopped.")
            if success_count > 0 or skipped_count > 0 or failed_count > 0:
                print(
                    f"Partial progress: {success_count} successful, {failed_count} failed, {skipped_count} skipped"
                )
        return 130

    # Return appropriate exit code
    if failed_count > 0:
        return 1
    if success_count > 0 or skipped_count > 0:
        return 0
    return 1
|
|
912
|
+
|
|
913
|
+
|
|
914
|
+
# Run the CLI when executed as a script and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|