mdify-cli 1.4.1__py3-none-any.whl → 2.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdify/__init__.py +1 -1
- mdify/cli.py +587 -219
- mdify/container.py +167 -0
- mdify/docling_client.py +263 -0
- {mdify_cli-1.4.1.dist-info → mdify_cli-2.9.1.dist-info}/METADATA +92 -20
- mdify_cli-2.9.1.dist-info/RECORD +12 -0
- {mdify_cli-1.4.1.dist-info → mdify_cli-2.9.1.dist-info}/WHEEL +1 -1
- mdify_cli-1.4.1.dist-info/RECORD +0 -10
- {mdify_cli-1.4.1.dist-info → mdify_cli-2.9.1.dist-info}/entry_points.txt +0 -0
- {mdify_cli-1.4.1.dist-info → mdify_cli-2.9.1.dist-info}/licenses/LICENSE +0 -0
- {mdify_cli-1.4.1.dist-info → mdify_cli-2.9.1.dist-info}/top_level.txt +0 -0
mdify/cli.py
CHANGED
|
@@ -10,6 +10,7 @@ is lightweight and has no ML dependencies.
|
|
|
10
10
|
import argparse
|
|
11
11
|
import json
|
|
12
12
|
import os
|
|
13
|
+
import platform
|
|
13
14
|
import shutil
|
|
14
15
|
import subprocess
|
|
15
16
|
import sys
|
|
@@ -21,6 +22,8 @@ from urllib.error import URLError
|
|
|
21
22
|
from urllib.request import urlopen
|
|
22
23
|
|
|
23
24
|
from . import __version__
|
|
25
|
+
from mdify.container import DoclingContainer
|
|
26
|
+
from mdify.docling_client import convert_file
|
|
24
27
|
|
|
25
28
|
# Configuration
|
|
26
29
|
MDIFY_HOME = Path.home() / ".mdify"
|
|
@@ -29,18 +32,22 @@ PYPI_API_URL = "https://pypi.org/pypi/mdify-cli/json"
|
|
|
29
32
|
CHECK_INTERVAL_SECONDS = 86400 # 24 hours
|
|
30
33
|
|
|
31
34
|
# Container configuration
|
|
32
|
-
DEFAULT_IMAGE = "ghcr.io/
|
|
33
|
-
|
|
35
|
+
DEFAULT_IMAGE = "ghcr.io/docling-project/docling-serve-cpu:main"
|
|
36
|
+
GPU_IMAGE = "ghcr.io/docling-project/docling-serve-cu126:main"
|
|
37
|
+
SUPPORTED_RUNTIMES = ("docker", "podman", "orbstack", "colima", "container")
|
|
38
|
+
MACOS_RUNTIMES_PRIORITY = ("container", "orbstack", "colima", "podman", "docker")
|
|
39
|
+
OTHER_RUNTIMES_PRIORITY = ("docker", "podman")
|
|
34
40
|
|
|
35
41
|
|
|
36
42
|
# =============================================================================
|
|
37
43
|
# Update checking functions
|
|
38
44
|
# =============================================================================
|
|
39
45
|
|
|
46
|
+
|
|
40
47
|
def _get_remote_version(timeout: int = 5) -> Optional[str]:
|
|
41
48
|
"""
|
|
42
49
|
Fetch the latest version from PyPI.
|
|
43
|
-
|
|
50
|
+
|
|
44
51
|
Returns:
|
|
45
52
|
Version string (e.g., "1.1.0") or None if fetch failed.
|
|
46
53
|
"""
|
|
@@ -56,16 +63,16 @@ def _get_remote_version(timeout: int = 5) -> Optional[str]:
|
|
|
56
63
|
def _should_check_for_update() -> bool:
|
|
57
64
|
"""
|
|
58
65
|
Determine if we should check for updates based on last check time.
|
|
59
|
-
|
|
66
|
+
|
|
60
67
|
Returns:
|
|
61
68
|
True if check should be performed, False otherwise.
|
|
62
69
|
"""
|
|
63
70
|
if os.environ.get("MDIFY_NO_UPDATE_CHECK", "").lower() in ("1", "true", "yes"):
|
|
64
71
|
return False
|
|
65
|
-
|
|
72
|
+
|
|
66
73
|
if not LAST_CHECK_FILE.exists():
|
|
67
74
|
return True
|
|
68
|
-
|
|
75
|
+
|
|
69
76
|
try:
|
|
70
77
|
last_check = float(LAST_CHECK_FILE.read_text().strip())
|
|
71
78
|
elapsed = time.time() - last_check
|
|
@@ -86,18 +93,18 @@ def _update_last_check_time() -> None:
|
|
|
86
93
|
def _compare_versions(current: str, remote: str) -> bool:
|
|
87
94
|
"""
|
|
88
95
|
Compare version strings.
|
|
89
|
-
|
|
96
|
+
|
|
90
97
|
Returns:
|
|
91
98
|
True if remote version is newer than current.
|
|
92
99
|
"""
|
|
93
100
|
try:
|
|
94
101
|
current_parts = [int(x) for x in current.split(".")]
|
|
95
102
|
remote_parts = [int(x) for x in remote.split(".")]
|
|
96
|
-
|
|
103
|
+
|
|
97
104
|
max_len = max(len(current_parts), len(remote_parts))
|
|
98
105
|
current_parts.extend([0] * (max_len - len(current_parts)))
|
|
99
106
|
remote_parts.extend([0] * (max_len - len(remote_parts)))
|
|
100
|
-
|
|
107
|
+
|
|
101
108
|
return remote_parts > current_parts
|
|
102
109
|
except (ValueError, AttributeError):
|
|
103
110
|
return False
|
|
@@ -106,15 +113,15 @@ def _compare_versions(current: str, remote: str) -> bool:
|
|
|
106
113
|
def check_for_update(force: bool = False) -> None:
|
|
107
114
|
"""
|
|
108
115
|
Check for updates and prompt user to upgrade if available.
|
|
109
|
-
|
|
116
|
+
|
|
110
117
|
Args:
|
|
111
118
|
force: If True, check regardless of last check time and show errors.
|
|
112
119
|
"""
|
|
113
120
|
if not force and not _should_check_for_update():
|
|
114
121
|
return
|
|
115
|
-
|
|
122
|
+
|
|
116
123
|
remote_version = _get_remote_version()
|
|
117
|
-
|
|
124
|
+
|
|
118
125
|
if remote_version is None:
|
|
119
126
|
if force:
|
|
120
127
|
print(
|
|
@@ -124,19 +131,19 @@ def check_for_update(force: bool = False) -> None:
|
|
|
124
131
|
)
|
|
125
132
|
sys.exit(1)
|
|
126
133
|
return
|
|
127
|
-
|
|
134
|
+
|
|
128
135
|
_update_last_check_time()
|
|
129
|
-
|
|
136
|
+
|
|
130
137
|
if not _compare_versions(__version__, remote_version):
|
|
131
138
|
if force:
|
|
132
139
|
print(f"mdify is up to date (version {__version__})")
|
|
133
140
|
return
|
|
134
|
-
|
|
135
|
-
print(f"\n{'='*50}")
|
|
141
|
+
|
|
142
|
+
print(f"\n{'=' * 50}")
|
|
136
143
|
print(f"A new version of mdify-cli is available!")
|
|
137
144
|
print(f" Current version: {__version__}")
|
|
138
145
|
print(f" Latest version: {remote_version}")
|
|
139
|
-
print(f"{'='*50}")
|
|
146
|
+
print(f"{'=' * 50}")
|
|
140
147
|
print(f"\nTo upgrade, run:")
|
|
141
148
|
print(f" pipx upgrade mdify-cli")
|
|
142
149
|
print(f" # or: pip install --upgrade mdify-cli\n")
|
|
@@ -146,43 +153,155 @@ def check_for_update(force: bool = False) -> None:
|
|
|
146
153
|
# Container runtime functions
|
|
147
154
|
# =============================================================================
|
|
148
155
|
|
|
149
|
-
|
|
156
|
+
|
|
157
|
+
def is_daemon_running(runtime: str) -> bool:
    """
    Check if a container runtime daemon is running.

    The probe command is chosen per runtime so that it has to reach the
    daemon (or VM) instead of only proving that the client binary exists:
    Apple Container uses 'system status', docker and podman use 'info',
    colima uses 'status'. Any other executable falls back to '--version'.

    Args:
        runtime: Path to container runtime executable

    Returns:
        True if the probe command exits with status 0 within 5 seconds,
        False otherwise (including when the executable cannot be started).
    """
    runtime_name = os.path.basename(runtime)

    if runtime_name == "container":
        # Apple Container uses 'container system status' to check daemon
        probe = ["system", "status"]
    elif runtime_name in ("docker", "podman"):
        # '--version' succeeds even when the daemon/VM is down; 'info' does not
        probe = ["info"]
    elif runtime_name == "colima":
        probe = ["status"]
    else:
        # Unknown client: the best available check is that the binary runs
        probe = ["--version"]

    try:
        result = subprocess.run(
            [runtime, *probe],
            capture_output=True,
            timeout=5,
            check=False,
        )
    except (OSError, subprocess.TimeoutExpired):
        return False
    return result.returncode == 0
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def detect_runtime(preferred: Optional[str] = None, explicit: bool = True) -> Optional[str]:
    """
    Detect available container runtime.

    Resolution order:
      1. MDIFY_CONTAINER_RUNTIME environment variable, if it names a
         supported runtime that is found in PATH.
      2. The `preferred` runtime name, if it is supported and found in PATH.
      3. Auto-detection in priority order: MACOS_RUNTIMES_PRIORITY on macOS,
         OTHER_RUNTIMES_PRIORITY elsewhere. Only a runtime whose daemon
         responds (see is_daemon_running) is selected.

    Args:
        preferred: Preferred runtime name override (deprecated, use MDIFY_CONTAINER_RUNTIME)
        explicit: If True, print info about detection/fallback choices.

    Returns:
        Path to runtime executable, or None if not found.
    """
    # Check for explicit environment variable override
    env_runtime = os.environ.get("MDIFY_CONTAINER_RUNTIME", "").strip().lower()
    if env_runtime:
        if env_runtime not in SUPPORTED_RUNTIMES:
            print(
                f"Warning: MDIFY_CONTAINER_RUNTIME='{env_runtime}' is not supported. "
                f"Supported: {', '.join(SUPPORTED_RUNTIMES)}",
                file=sys.stderr,
            )
        else:
            runtime_path = shutil.which(env_runtime)
            if runtime_path:
                if explicit:
                    print(f"Using runtime from MDIFY_CONTAINER_RUNTIME: {env_runtime}")
                return runtime_path
            print(
                f"Warning: MDIFY_CONTAINER_RUNTIME='{env_runtime}' specified but not found in PATH",
                file=sys.stderr,
            )

    # Honour the caller-supplied runtime; previously this argument was ignored
    if preferred:
        preferred_name = preferred.strip().lower()
        if preferred_name not in SUPPORTED_RUNTIMES:
            print(
                f"Warning: requested runtime '{preferred_name}' is not supported. "
                f"Supported: {', '.join(SUPPORTED_RUNTIMES)}",
                file=sys.stderr,
            )
        else:
            preferred_path = shutil.which(preferred_name)
            if preferred_path:
                if explicit:
                    print(f"Using requested container runtime: {preferred_name}")
                return preferred_path
            print(
                f"Warning: requested runtime '{preferred_name}' not found in PATH",
                file=sys.stderr,
            )

    # Determine runtime priority based on OS
    is_macos = platform.system() == "Darwin"
    if is_macos:
        runtime_priority = MACOS_RUNTIMES_PRIORITY
        if explicit:
            print("Detected macOS: checking for native container tools...")
    else:
        runtime_priority = OTHER_RUNTIMES_PRIORITY

    # Try each runtime in priority order
    found_but_not_running = []
    for runtime_name in runtime_priority:
        runtime_path = shutil.which(runtime_name)
        if not runtime_path:
            continue
        # Check if daemon is running
        if is_daemon_running(runtime_path):
            if explicit:
                print(f"Using container runtime: {runtime_name}")
            return runtime_path
        found_but_not_running.append((runtime_name, runtime_path))

    # If we found tools but none are running, warn and ask user to start one
    if found_but_not_running:
        print(
            "\nWarning: Found container runtime(s) but daemon is not running:",
            file=sys.stderr,
        )
        for runtime_name, runtime_path in found_but_not_running:
            print(f"  - {runtime_name} ({runtime_path})", file=sys.stderr)
        print(
            "\nPlease start one of these tools before running mdify.",
            file=sys.stderr,
        )
        if is_macos:
            print(
                "  macOS tip: Start OrbStack, Colima, or Podman Desktop application",
                file=sys.stderr,
            )

    return None
|
|
172
270
|
|
|
173
271
|
|
|
174
272
|
def check_image_exists(runtime: str, image: str) -> bool:
|
|
175
273
|
"""
|
|
176
274
|
Check if container image exists locally.
|
|
177
|
-
|
|
275
|
+
|
|
178
276
|
Args:
|
|
179
277
|
runtime: Path to container runtime
|
|
180
278
|
image: Image name/tag
|
|
181
|
-
|
|
279
|
+
|
|
182
280
|
Returns:
|
|
183
281
|
True if image exists locally.
|
|
184
282
|
"""
|
|
185
283
|
try:
|
|
284
|
+
runtime_name = os.path.basename(runtime)
|
|
285
|
+
|
|
286
|
+
# Apple Container uses 'image-list' command
|
|
287
|
+
if runtime_name == "container":
|
|
288
|
+
result = subprocess.run(
|
|
289
|
+
[runtime, "image-list", "--format", "json"],
|
|
290
|
+
capture_output=True,
|
|
291
|
+
check=False,
|
|
292
|
+
)
|
|
293
|
+
if result.returncode == 0 and result.stdout:
|
|
294
|
+
try:
|
|
295
|
+
images = json.loads(result.stdout.decode())
|
|
296
|
+
# Check if image exists in the list
|
|
297
|
+
for img in images:
|
|
298
|
+
if img.get("name") == image or image in img.get("repoTags", []):
|
|
299
|
+
return True
|
|
300
|
+
except json.JSONDecodeError:
|
|
301
|
+
pass
|
|
302
|
+
return False
|
|
303
|
+
|
|
304
|
+
# Docker/Podman/OrbStack/Colima use standard 'image inspect'
|
|
186
305
|
result = subprocess.run(
|
|
187
306
|
[runtime, "image", "inspect", image],
|
|
188
307
|
capture_output=True,
|
|
@@ -196,19 +315,31 @@ def check_image_exists(runtime: str, image: str) -> bool:
|
|
|
196
315
|
def pull_image(runtime: str, image: str, quiet: bool = False) -> bool:
|
|
197
316
|
"""
|
|
198
317
|
Pull container image.
|
|
199
|
-
|
|
318
|
+
|
|
200
319
|
Args:
|
|
201
320
|
runtime: Path to container runtime
|
|
202
321
|
image: Image name/tag
|
|
203
322
|
quiet: Suppress progress output
|
|
204
|
-
|
|
323
|
+
|
|
205
324
|
Returns:
|
|
206
325
|
True if pull succeeded.
|
|
207
326
|
"""
|
|
208
327
|
if not quiet:
|
|
209
328
|
print(f"Pulling image: {image}")
|
|
210
|
-
|
|
329
|
+
|
|
211
330
|
try:
|
|
331
|
+
runtime_name = os.path.basename(runtime)
|
|
332
|
+
|
|
333
|
+
# Apple Container uses 'image-pull' command
|
|
334
|
+
if runtime_name == "container":
|
|
335
|
+
result = subprocess.run(
|
|
336
|
+
[runtime, "image-pull", image],
|
|
337
|
+
capture_output=quiet,
|
|
338
|
+
check=False,
|
|
339
|
+
)
|
|
340
|
+
return result.returncode == 0
|
|
341
|
+
|
|
342
|
+
# Docker/Podman/OrbStack/Colima use standard 'pull'
|
|
212
343
|
result = subprocess.run(
|
|
213
344
|
[runtime, "pull", image],
|
|
214
345
|
capture_output=quiet,
|
|
@@ -220,11 +351,49 @@ def pull_image(runtime: str, image: str, quiet: bool = False) -> bool:
|
|
|
220
351
|
return False
|
|
221
352
|
|
|
222
353
|
|
|
354
|
+
def get_image_size_estimate(runtime: str, image: str) -> Optional[int]:
    """
    Estimate image size by querying registry manifest.

    Runs `<runtime> manifest inspect --verbose <image>` and sums all layer sizes
    across all architectures, then applies 50% buffer for decompression.

    Three JSON shapes are accepted: an object with a "Manifests" list, a
    top-level list of manifest entries, or a single manifest entry object.
    Within each entry the layers are read from "OCIManifest" or, failing
    that, "SchemaV2Manifest". Entries or layers with an unexpected structure
    are skipped instead of raising.

    Args:
        runtime: Path to container runtime
        image: Image name/tag

    Returns:
        Estimated size in bytes with 50% buffer, or None if the command
        fails or its output is not usable JSON.
    """
    try:
        result = subprocess.run(
            [runtime, "manifest", "inspect", "--verbose", image],
            capture_output=True,
            check=False,
        )
        if result.returncode != 0:
            return None
        # ValueError covers both JSONDecodeError and UnicodeDecodeError
        manifest_data = json.loads(result.stdout.decode())
    except (ValueError, OSError):
        return None

    if isinstance(manifest_data, dict):
        entries = manifest_data.get("Manifests")
        if entries is None:
            # No "Manifests" key: treat the object itself as one manifest entry
            entries = [manifest_data]
    else:
        entries = manifest_data
    if not isinstance(entries, list):
        return None

    # Sum all layer sizes across all architectures
    total_size = 0
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        inner = entry.get("OCIManifest") or entry.get("SchemaV2Manifest") or {}
        if not isinstance(inner, dict):
            continue
        layers = inner.get("layers") or []
        if not isinstance(layers, list):
            continue
        for layer in layers:
            size = layer.get("size", 0) if isinstance(layer, dict) else 0
            if isinstance(size, (int, float)) and not isinstance(size, bool):
                total_size += size

    # Apply 50% buffer for decompression (compressed -> uncompressed)
    return int(total_size * 1.5)
|
|
390
|
+
|
|
391
|
+
|
|
223
392
|
def format_size(size_bytes: int) -> str:
    """
    Render a byte count as a short human-readable string.

    Values below 1024 are printed as-is with a "B" suffix; larger values are
    divided by 1024 per step and printed with one decimal as KB, MB, GB or TB.
    TB is the largest unit, so very large values stay in TB.
    """
    if size_bytes < 1024:
        return f"{size_bytes} B"

    scaled = size_bytes
    for suffix in ("KB", "MB", "GB"):
        scaled = scaled / 1024
        if scaled < 1024:
            return f"{scaled:.1f} {suffix}"

    # Still >= 1024 GB: one more division gives terabytes
    return f"{scaled / 1024:.1f} TB"
|
|
230
399
|
|
|
@@ -242,31 +411,112 @@ def format_duration(seconds: float) -> str:
|
|
|
242
411
|
return f"{hours}h {mins}m {secs:.0f}s"
|
|
243
412
|
|
|
244
413
|
|
|
414
|
+
def get_free_space(path: str) -> int:
    """Return the free bytes on the filesystem holding *path*, or 0 if it cannot be queried."""
    try:
        usage = shutil.disk_usage(path)
    except OSError:
        # FileNotFoundError is an OSError subclass, so missing paths land here too
        return 0
    return usage.free
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
def get_storage_root(runtime: str) -> Optional[str]:
    """
    Get the storage root directory for Docker, Podman, OrbStack, Colima, or Apple Container.

    Docker and Podman are asked via their `info` command; the other runtimes
    map to a fixed directory under the user's home.

    Args:
        runtime: Path to container runtime executable

    Returns:
        Storage root path as string, or None if the runtime is unknown, the
        command fails, or it reports no usable path.
    """
    # Extract runtime name from path (e.g., /usr/bin/docker -> docker)
    runtime_name = os.path.basename(runtime)

    home = os.path.expanduser("~")
    fixed_roots = {
        # OrbStack stores containers in ~/.orbstack
        "orbstack": os.path.join(home, ".orbstack"),
        # Colima stores containers in ~/.colima
        "colima": os.path.join(home, ".colima"),
        # Apple Container stores data in Application Support
        "container": os.path.join(home, "Library", "Application Support", "com.apple.container"),
    }
    if runtime_name in fixed_roots:
        return fixed_roots[runtime_name]

    try:
        if runtime_name == "docker":
            result = subprocess.run(
                [runtime, "system", "info", "--format", "{{.DockerRootDir}}"],
                capture_output=True,
                check=False,
            )
            # Whitespace-only output means no path was reported; never return ""
            root = result.stdout.decode().strip()
            return root or None

        if runtime_name == "podman":
            result = subprocess.run(
                [runtime, "info", "--format", "json"],
                capture_output=True,
                check=False,
            )
            if not result.stdout:
                return None
            info = json.loads(result.stdout.decode())
            store = info.get("store") if isinstance(info, dict) else None
            root = store.get("graphRoot") if isinstance(store, dict) else None
            return root if isinstance(root, str) and root else None
    except (OSError, ValueError):
        # ValueError covers JSONDecodeError and UnicodeDecodeError
        return None

    return None
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
def confirm_proceed(message: str, default_no: bool = True) -> bool:
    """
    Prompt user for confirmation with a y/N prompt.

    Args:
        message: The confirmation message to display
        default_no: If True, shows [y/N] (default no). If False, shows [Y/n] (default yes)

    Returns:
        True if the user answered 'y' or 'yes' (case-insensitive), or pressed
        Enter while the default is yes. False for any other answer.
        Returns False immediately if stdin is not a TTY (non-interactive),
        and also if stdin is closed before an answer is read.
    """
    if not sys.stdin.isatty():
        return False

    prompt = "[y/N]" if default_no else "[Y/n]"
    print(f"{message} {prompt}", file=sys.stderr)
    try:
        response = input().strip().lower()
    except EOFError:
        # No answer could be read: decline rather than crash
        return False

    if not response:
        # Bare Enter selects the default advertised by the prompt
        return not default_no
    return response in ("y", "yes")
|
|
489
|
+
|
|
490
|
+
|
|
245
491
|
class Spinner:
|
|
246
492
|
"""A simple spinner to show progress during long operations."""
|
|
247
|
-
|
|
493
|
+
|
|
248
494
|
def __init__(self):
|
|
249
|
-
self.frames = [
|
|
495
|
+
self.frames = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
|
|
250
496
|
self.running = False
|
|
251
497
|
self.thread = None
|
|
252
498
|
self.start_time = None
|
|
253
|
-
|
|
499
|
+
|
|
254
500
|
def _spin(self):
|
|
255
501
|
idx = 0
|
|
256
502
|
while self.running:
|
|
257
503
|
elapsed = time.time() - self.start_time
|
|
258
504
|
frame = self.frames[idx % len(self.frames)]
|
|
259
|
-
print(
|
|
505
|
+
print(
|
|
506
|
+
f"\r{self.prefix} {frame} ({format_duration(elapsed)})",
|
|
507
|
+
end="",
|
|
508
|
+
flush=True,
|
|
509
|
+
)
|
|
260
510
|
idx += 1
|
|
261
511
|
time.sleep(0.1)
|
|
262
|
-
|
|
512
|
+
|
|
263
513
|
def start(self, prefix: str = ""):
|
|
264
514
|
self.prefix = prefix
|
|
265
515
|
self.running = True
|
|
266
516
|
self.start_time = time.time()
|
|
267
517
|
self.thread = threading.Thread(target=self._spin, daemon=True)
|
|
268
518
|
self.thread.start()
|
|
269
|
-
|
|
519
|
+
|
|
270
520
|
def stop(self):
|
|
271
521
|
self.running = False
|
|
272
522
|
if self.thread:
|
|
@@ -275,93 +525,45 @@ class Spinner:
|
|
|
275
525
|
print(f"\r{' ' * 80}\r", end="", flush=True)
|
|
276
526
|
|
|
277
527
|
|
|
278
|
-
def run_container(
|
|
279
|
-
runtime: str,
|
|
280
|
-
image: str,
|
|
281
|
-
input_file: Path,
|
|
282
|
-
output_file: Path,
|
|
283
|
-
mask_pii: bool = False,
|
|
284
|
-
) -> Tuple[bool, str, float]:
|
|
285
|
-
"""
|
|
286
|
-
Run container to convert a single file.
|
|
287
|
-
|
|
288
|
-
Args:
|
|
289
|
-
runtime: Path to container runtime
|
|
290
|
-
image: Image name/tag
|
|
291
|
-
input_file: Absolute path to input file
|
|
292
|
-
output_file: Absolute path to output file
|
|
293
|
-
mask_pii: Whether to mask PII in images
|
|
294
|
-
|
|
295
|
-
Returns:
|
|
296
|
-
Tuple of (success: bool, message: str, elapsed_seconds: float)
|
|
297
|
-
"""
|
|
298
|
-
start_time = time.time()
|
|
299
|
-
|
|
300
|
-
# Ensure output directory exists
|
|
301
|
-
output_file.parent.mkdir(parents=True, exist_ok=True)
|
|
302
|
-
|
|
303
|
-
# Mount directories
|
|
304
|
-
input_dir = input_file.parent
|
|
305
|
-
output_dir = output_file.parent
|
|
306
|
-
|
|
307
|
-
# Container paths
|
|
308
|
-
container_in = f"/work/in/{input_file.name}"
|
|
309
|
-
container_out = f"/work/out/{output_file.name}"
|
|
310
|
-
|
|
311
|
-
cmd = [
|
|
312
|
-
runtime, "run", "--rm",
|
|
313
|
-
"-v", f"{input_dir}:/work/in:ro",
|
|
314
|
-
"-v", f"{output_dir}:/work/out",
|
|
315
|
-
image,
|
|
316
|
-
"--in", container_in,
|
|
317
|
-
"--out", container_out,
|
|
318
|
-
]
|
|
319
|
-
|
|
320
|
-
if mask_pii:
|
|
321
|
-
cmd.append("--mask")
|
|
322
|
-
|
|
323
|
-
try:
|
|
324
|
-
result = subprocess.run(
|
|
325
|
-
cmd,
|
|
326
|
-
capture_output=True,
|
|
327
|
-
text=True,
|
|
328
|
-
check=False,
|
|
329
|
-
)
|
|
330
|
-
elapsed = time.time() - start_time
|
|
331
|
-
|
|
332
|
-
if result.returncode == 0:
|
|
333
|
-
return True, "success", elapsed
|
|
334
|
-
else:
|
|
335
|
-
error_msg = result.stderr.strip() or result.stdout.strip() or "Unknown error"
|
|
336
|
-
return False, error_msg, elapsed
|
|
337
|
-
|
|
338
|
-
except OSError as e:
|
|
339
|
-
elapsed = time.time() - start_time
|
|
340
|
-
return False, str(e), elapsed
|
|
341
|
-
|
|
342
|
-
|
|
343
528
|
# =============================================================================
|
|
344
529
|
# File handling functions
|
|
345
530
|
# =============================================================================
|
|
346
531
|
|
|
347
532
|
# Supported file extensions (based on Docling InputFormat)
|
|
348
533
|
SUPPORTED_EXTENSIONS = {
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
534
|
+
".pdf",
|
|
535
|
+
".docx",
|
|
536
|
+
".pptx",
|
|
537
|
+
".html",
|
|
538
|
+
".htm",
|
|
539
|
+
".png",
|
|
540
|
+
".jpg",
|
|
541
|
+
".jpeg",
|
|
542
|
+
".gif",
|
|
543
|
+
".bmp",
|
|
544
|
+
".tiff",
|
|
545
|
+
".tif", # images
|
|
546
|
+
".asciidoc",
|
|
547
|
+
".adoc",
|
|
548
|
+
".asc", # asciidoc
|
|
549
|
+
".md",
|
|
550
|
+
".markdown", # markdown
|
|
551
|
+
".csv",
|
|
552
|
+
".xlsx", # spreadsheets
|
|
553
|
+
".xml", # XML formats
|
|
554
|
+
".json", # JSON docling
|
|
555
|
+
".mp3",
|
|
556
|
+
".wav",
|
|
557
|
+
".m4a",
|
|
558
|
+
".flac", # audio
|
|
559
|
+
".vtt", # subtitles
|
|
358
560
|
}
|
|
359
561
|
|
|
360
562
|
|
|
361
563
|
def get_files_to_convert(input_path: Path, mask: str, recursive: bool) -> List[Path]:
|
|
362
564
|
"""Get list of files to convert based on input path and options."""
|
|
363
565
|
files = []
|
|
364
|
-
|
|
566
|
+
|
|
365
567
|
if input_path.is_file():
|
|
366
568
|
files.append(input_path)
|
|
367
569
|
elif input_path.is_dir():
|
|
@@ -369,19 +571,19 @@ def get_files_to_convert(input_path: Path, mask: str, recursive: bool) -> List[P
|
|
|
369
571
|
files = list(input_path.rglob(mask))
|
|
370
572
|
else:
|
|
371
573
|
files = list(input_path.glob(mask))
|
|
372
|
-
|
|
574
|
+
|
|
373
575
|
# Filter to only files
|
|
374
576
|
files = [f for f in files if f.is_file()]
|
|
375
577
|
else:
|
|
376
578
|
raise FileNotFoundError(f"Input path does not exist: {input_path}")
|
|
377
|
-
|
|
579
|
+
|
|
378
580
|
# Filter out hidden files and unsupported formats
|
|
379
581
|
files = [
|
|
380
|
-
f
|
|
381
|
-
|
|
382
|
-
and f.suffix.lower() in SUPPORTED_EXTENSIONS
|
|
582
|
+
f
|
|
583
|
+
for f in files
|
|
584
|
+
if not f.name.startswith(".") and f.suffix.lower() in SUPPORTED_EXTENSIONS
|
|
383
585
|
]
|
|
384
|
-
|
|
586
|
+
|
|
385
587
|
return files
|
|
386
588
|
|
|
387
589
|
|
|
@@ -414,7 +616,7 @@ def get_output_path(
|
|
|
414
616
|
output_path = output_dir / relative_path.parent / output_name
|
|
415
617
|
except ValueError:
|
|
416
618
|
output_path = output_dir / output_name
|
|
417
|
-
|
|
619
|
+
|
|
418
620
|
return output_path
|
|
419
621
|
|
|
420
622
|
|
|
@@ -422,6 +624,7 @@ def get_output_path(
|
|
|
422
624
|
# CLI argument parsing
|
|
423
625
|
# =============================================================================
|
|
424
626
|
|
|
627
|
+
|
|
425
628
|
def parse_args() -> argparse.Namespace:
|
|
426
629
|
"""Parse command line arguments."""
|
|
427
630
|
parser = argparse.ArgumentParser(
|
|
@@ -436,74 +639,99 @@ Examples:
|
|
|
436
639
|
mdify ./docs --runtime podman Use Podman instead of Docker
|
|
437
640
|
""",
|
|
438
641
|
)
|
|
439
|
-
|
|
642
|
+
|
|
440
643
|
parser.add_argument(
|
|
441
644
|
"input",
|
|
442
645
|
type=str,
|
|
443
646
|
nargs="?",
|
|
444
647
|
help="Input file or directory to convert",
|
|
445
648
|
)
|
|
446
|
-
|
|
649
|
+
|
|
447
650
|
parser.add_argument(
|
|
448
|
-
"-o",
|
|
651
|
+
"-o",
|
|
652
|
+
"--out-dir",
|
|
449
653
|
type=str,
|
|
450
654
|
default="output",
|
|
451
655
|
help="Output directory for converted files (default: output)",
|
|
452
656
|
)
|
|
453
|
-
|
|
657
|
+
|
|
454
658
|
parser.add_argument(
|
|
455
|
-
"-g",
|
|
659
|
+
"-g",
|
|
660
|
+
"--glob",
|
|
456
661
|
type=str,
|
|
457
662
|
default="*",
|
|
458
663
|
help="Glob pattern for filtering files in directory (default: *)",
|
|
459
664
|
)
|
|
460
|
-
|
|
665
|
+
|
|
461
666
|
parser.add_argument(
|
|
462
|
-
"-r",
|
|
667
|
+
"-r",
|
|
668
|
+
"--recursive",
|
|
463
669
|
action="store_true",
|
|
464
670
|
help="Recursively scan directories",
|
|
465
671
|
)
|
|
466
|
-
|
|
672
|
+
|
|
467
673
|
parser.add_argument(
|
|
468
674
|
"--flat",
|
|
469
675
|
action="store_true",
|
|
470
676
|
help="Disable directory structure preservation in output",
|
|
471
677
|
)
|
|
472
|
-
|
|
678
|
+
|
|
473
679
|
parser.add_argument(
|
|
474
680
|
"--overwrite",
|
|
475
681
|
action="store_true",
|
|
476
682
|
help="Overwrite existing output files",
|
|
477
683
|
)
|
|
478
|
-
|
|
684
|
+
|
|
479
685
|
parser.add_argument(
|
|
480
|
-
"-q",
|
|
686
|
+
"-q",
|
|
687
|
+
"--quiet",
|
|
481
688
|
action="store_true",
|
|
482
689
|
help="Suppress progress messages",
|
|
483
690
|
)
|
|
484
|
-
|
|
691
|
+
|
|
485
692
|
parser.add_argument(
|
|
486
|
-
"-
|
|
693
|
+
"-y",
|
|
694
|
+
"--yes",
|
|
695
|
+
action="store_true",
|
|
696
|
+
help="Skip confirmation prompts (for scripts/CI)",
|
|
697
|
+
)
|
|
698
|
+
|
|
699
|
+
parser.add_argument(
|
|
700
|
+
"-m",
|
|
701
|
+
"--mask",
|
|
487
702
|
action="store_true",
|
|
488
703
|
help="Mask PII and sensitive content in document images",
|
|
489
704
|
)
|
|
490
|
-
|
|
705
|
+
|
|
706
|
+
parser.add_argument(
|
|
707
|
+
"--gpu",
|
|
708
|
+
action="store_true",
|
|
709
|
+
help="Use GPU-accelerated container image (docling-serve-cu126)",
|
|
710
|
+
)
|
|
711
|
+
|
|
712
|
+
parser.add_argument(
|
|
713
|
+
"--port",
|
|
714
|
+
type=int,
|
|
715
|
+
default=5001,
|
|
716
|
+
help="Port for docling-serve container (default: 5001)",
|
|
717
|
+
)
|
|
718
|
+
|
|
491
719
|
# Container options
|
|
492
720
|
parser.add_argument(
|
|
493
721
|
"--runtime",
|
|
494
722
|
type=str,
|
|
495
723
|
choices=SUPPORTED_RUNTIMES,
|
|
496
|
-
default=
|
|
497
|
-
help="Container runtime to use (
|
|
724
|
+
default=None,
|
|
725
|
+
help="Container runtime to use (auto-detects docker or podman if not specified)",
|
|
498
726
|
)
|
|
499
|
-
|
|
727
|
+
|
|
500
728
|
parser.add_argument(
|
|
501
729
|
"--image",
|
|
502
730
|
type=str,
|
|
503
731
|
default=DEFAULT_IMAGE,
|
|
504
732
|
help=f"Container image to use (default: {DEFAULT_IMAGE})",
|
|
505
733
|
)
|
|
506
|
-
|
|
734
|
+
|
|
507
735
|
parser.add_argument(
|
|
508
736
|
"--pull",
|
|
509
737
|
type=str,
|
|
@@ -511,20 +739,27 @@ Examples:
|
|
|
511
739
|
default="missing",
|
|
512
740
|
help="Image pull policy: always, missing, never (default: missing)",
|
|
513
741
|
)
|
|
514
|
-
|
|
742
|
+
|
|
743
|
+
parser.add_argument(
|
|
744
|
+
"--timeout",
|
|
745
|
+
type=int,
|
|
746
|
+
default=None,
|
|
747
|
+
help="Conversion timeout in seconds (default: 1200, can be set via MDIFY_TIMEOUT env var)",
|
|
748
|
+
)
|
|
749
|
+
|
|
515
750
|
# Utility options
|
|
516
751
|
parser.add_argument(
|
|
517
752
|
"--check-update",
|
|
518
753
|
action="store_true",
|
|
519
754
|
help="Check for available updates and exit",
|
|
520
755
|
)
|
|
521
|
-
|
|
756
|
+
|
|
522
757
|
parser.add_argument(
|
|
523
758
|
"--version",
|
|
524
759
|
action="version",
|
|
525
760
|
version=f"mdify {__version__}",
|
|
526
761
|
)
|
|
527
|
-
|
|
762
|
+
|
|
528
763
|
return parser.parse_args()
|
|
529
764
|
|
|
530
765
|
|
|
@@ -532,137 +767,270 @@ Examples:
|
|
|
532
767
|
# Main entry point
|
|
533
768
|
# =============================================================================
|
|
534
769
|
|
|
770
|
+
|
|
535
771
|
def main() -> int:
|
|
536
772
|
"""Main entry point for the CLI."""
|
|
537
773
|
args = parse_args()
|
|
538
|
-
|
|
774
|
+
|
|
539
775
|
# Handle --check-update flag
|
|
540
776
|
if args.check_update:
|
|
541
777
|
check_for_update(force=True)
|
|
542
778
|
return 0
|
|
543
|
-
|
|
779
|
+
|
|
544
780
|
# Check for updates (daily, silent on errors)
|
|
545
781
|
check_for_update(force=False)
|
|
546
|
-
|
|
782
|
+
|
|
783
|
+
# Resolve timeout value: CLI > env > default 1200
|
|
784
|
+
timeout = args.timeout or int(os.environ.get("MDIFY_TIMEOUT", 1200))
|
|
785
|
+
|
|
547
786
|
# Validate input is provided
|
|
548
787
|
if args.input is None:
|
|
549
788
|
print("Error: Input file or directory is required", file=sys.stderr)
|
|
550
789
|
print("Usage: mdify <input> [options]", file=sys.stderr)
|
|
551
790
|
print(" mdify --help for more information", file=sys.stderr)
|
|
552
791
|
return 1
|
|
553
|
-
|
|
792
|
+
|
|
554
793
|
# Detect container runtime
|
|
555
|
-
runtime
|
|
794
|
+
# If --runtime is specified, treat as explicit user choice
|
|
795
|
+
explicit = args.runtime is not None
|
|
796
|
+
runtime = detect_runtime(preferred=args.runtime, explicit=explicit)
|
|
556
797
|
if runtime is None:
|
|
557
798
|
print(
|
|
558
799
|
f"Error: Container runtime not found ({', '.join(SUPPORTED_RUNTIMES)})",
|
|
559
800
|
file=sys.stderr,
|
|
560
801
|
)
|
|
561
|
-
print("Please install Docker or Podman to use mdify.", file=sys.stderr)
|
|
562
802
|
return 2
|
|
563
|
-
|
|
803
|
+
|
|
564
804
|
# Handle image pull policy
|
|
565
|
-
image
|
|
805
|
+
# Determine image based on --gpu flag
|
|
806
|
+
if args.gpu:
|
|
807
|
+
image = GPU_IMAGE
|
|
808
|
+
elif args.image:
|
|
809
|
+
image = args.image
|
|
810
|
+
else:
|
|
811
|
+
image = DEFAULT_IMAGE
|
|
812
|
+
|
|
566
813
|
image_exists = check_image_exists(runtime, image)
|
|
567
|
-
|
|
814
|
+
|
|
815
|
+
# NOTE: Docker Desktop on macOS/Windows uses a VM, so disk space checks may not
|
|
816
|
+
# accurately reflect available space in the container's filesystem. Remote Docker
|
|
817
|
+
# daemons (DOCKER_HOST) are also not supported. In these cases, the check will
|
|
818
|
+
# gracefully degrade (warn and proceed).
|
|
819
|
+
|
|
820
|
+
# Check disk space before pulling image (skip if pull=never or image exists with pull=missing)
|
|
821
|
+
will_pull = args.pull == "always" or (args.pull == "missing" and not image_exists)
|
|
822
|
+
if will_pull:
|
|
823
|
+
storage_root = get_storage_root(runtime)
|
|
824
|
+
if storage_root:
|
|
825
|
+
image_size = get_image_size_estimate(runtime, image)
|
|
826
|
+
if image_size:
|
|
827
|
+
free_space = get_free_space(storage_root)
|
|
828
|
+
if free_space < image_size:
|
|
829
|
+
print(
|
|
830
|
+
f"Warning: Not enough free disk space on {storage_root}",
|
|
831
|
+
file=sys.stderr,
|
|
832
|
+
)
|
|
833
|
+
print(
|
|
834
|
+
f" Available: {format_size(free_space)}",
|
|
835
|
+
file=sys.stderr,
|
|
836
|
+
)
|
|
837
|
+
print(
|
|
838
|
+
f" Required: {format_size(image_size)} (estimated)",
|
|
839
|
+
file=sys.stderr,
|
|
840
|
+
)
|
|
841
|
+
if args.yes:
|
|
842
|
+
print(" Proceeding anyway (--yes flag set)", file=sys.stderr)
|
|
843
|
+
elif not sys.stdin.isatty():
|
|
844
|
+
print(
|
|
845
|
+
" Run with --yes to proceed anyway, or free up disk space",
|
|
846
|
+
file=sys.stderr,
|
|
847
|
+
)
|
|
848
|
+
return 1
|
|
849
|
+
elif not confirm_proceed("Continue anyway?"):
|
|
850
|
+
return 130
|
|
851
|
+
elif free_space - image_size < 1024 * 1024 * 1024:
|
|
852
|
+
print(
|
|
853
|
+
f"Warning: Less than 1 GB would remain after pulling image on {storage_root}",
|
|
854
|
+
file=sys.stderr,
|
|
855
|
+
)
|
|
856
|
+
print(
|
|
857
|
+
f" Available: {format_size(free_space)}",
|
|
858
|
+
file=sys.stderr,
|
|
859
|
+
)
|
|
860
|
+
print(
|
|
861
|
+
f" Required: {format_size(image_size)} (estimated)",
|
|
862
|
+
file=sys.stderr,
|
|
863
|
+
)
|
|
864
|
+
print(
|
|
865
|
+
f" Remaining: {format_size(free_space - image_size)}",
|
|
866
|
+
file=sys.stderr,
|
|
867
|
+
)
|
|
868
|
+
if args.yes:
|
|
869
|
+
print(" Proceeding anyway (--yes flag set)", file=sys.stderr)
|
|
870
|
+
elif not sys.stdin.isatty():
|
|
871
|
+
print(
|
|
872
|
+
" Run with --yes to proceed anyway, or free up disk space",
|
|
873
|
+
file=sys.stderr,
|
|
874
|
+
)
|
|
875
|
+
return 1
|
|
876
|
+
elif not confirm_proceed("Continue anyway?"):
|
|
877
|
+
return 130
|
|
878
|
+
|
|
568
879
|
if args.pull == "always" or (args.pull == "missing" and not image_exists):
|
|
569
880
|
if not pull_image(runtime, image, args.quiet):
|
|
570
881
|
print(f"Error: Failed to pull image: {image}", file=sys.stderr)
|
|
571
882
|
return 1
|
|
572
883
|
elif args.pull == "never" and not image_exists:
|
|
573
884
|
print(f"Error: Image not found locally: {image}", file=sys.stderr)
|
|
574
|
-
|
|
885
|
+
runtime_name = os.path.basename(runtime)
|
|
886
|
+
print(f"Run with --pull=missing or pull manually: {runtime_name} pull {image}")
|
|
575
887
|
return 1
|
|
576
|
-
|
|
577
|
-
# Resolve paths
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
888
|
+
|
|
889
|
+
# Resolve paths (use absolute() as fallback if resolve() fails due to permissions)
|
|
890
|
+
try:
|
|
891
|
+
input_path = Path(args.input).resolve()
|
|
892
|
+
except PermissionError:
|
|
893
|
+
input_path = Path(args.input).absolute()
|
|
894
|
+
try:
|
|
895
|
+
output_dir = Path(args.out_dir).resolve()
|
|
896
|
+
except PermissionError:
|
|
897
|
+
output_dir = Path(args.out_dir).absolute()
|
|
898
|
+
|
|
581
899
|
# Validate input
|
|
582
900
|
if not input_path.exists():
|
|
583
901
|
print(f"Error: Input path does not exist: {input_path}", file=sys.stderr)
|
|
584
902
|
return 1
|
|
585
|
-
|
|
903
|
+
|
|
586
904
|
# Get files to convert
|
|
587
905
|
try:
|
|
588
906
|
files_to_convert = get_files_to_convert(input_path, args.glob, args.recursive)
|
|
589
907
|
except Exception as e:
|
|
590
908
|
print(f"Error: {e}", file=sys.stderr)
|
|
591
909
|
return 1
|
|
592
|
-
|
|
910
|
+
|
|
593
911
|
if not files_to_convert:
|
|
594
912
|
print(f"No files found to convert in: {input_path}", file=sys.stderr)
|
|
595
913
|
return 1
|
|
596
|
-
|
|
914
|
+
|
|
597
915
|
total_files = len(files_to_convert)
|
|
598
916
|
total_size = sum(f.stat().st_size for f in files_to_convert)
|
|
599
|
-
|
|
917
|
+
|
|
600
918
|
if not args.quiet:
|
|
601
919
|
print(f"Found {total_files} file(s) to convert ({format_size(total_size)})")
|
|
602
920
|
print(f"Using runtime: {runtime}")
|
|
603
921
|
print(f"Using image: {image}")
|
|
604
922
|
print()
|
|
605
|
-
|
|
923
|
+
|
|
924
|
+
if args.mask:
|
|
925
|
+
print(
|
|
926
|
+
"Warning: --mask is not supported with docling-serve and will be ignored",
|
|
927
|
+
file=sys.stderr,
|
|
928
|
+
)
|
|
929
|
+
|
|
606
930
|
# Determine input base for directory structure preservation
|
|
607
931
|
if input_path.is_file():
|
|
608
932
|
input_base = input_path.parent
|
|
609
933
|
else:
|
|
610
934
|
input_base = input_path
|
|
611
|
-
|
|
612
|
-
# Convert files
|
|
935
|
+
|
|
613
936
|
success_count = 0
|
|
614
937
|
skipped_count = 0
|
|
615
938
|
failed_count = 0
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
for idx, input_file in enumerate(files_to_convert, 1):
|
|
620
|
-
output_file = get_output_path(input_file, input_base, output_dir, args.flat)
|
|
621
|
-
file_size = input_file.stat().st_size
|
|
622
|
-
progress = f"[{idx}/{total_files}]"
|
|
623
|
-
|
|
624
|
-
# Check if output exists and skip if not overwriting
|
|
625
|
-
if output_file.exists() and not args.overwrite:
|
|
626
|
-
if not args.quiet:
|
|
627
|
-
print(f"{progress} Skipped (exists): {input_file.name}")
|
|
628
|
-
skipped_count += 1
|
|
629
|
-
continue
|
|
630
|
-
|
|
631
|
-
# Show spinner while processing
|
|
939
|
+
total_elapsed = 0.0
|
|
940
|
+
|
|
941
|
+
try:
|
|
632
942
|
if not args.quiet:
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
943
|
+
print(f"Starting docling-serve container...")
|
|
944
|
+
print()
|
|
945
|
+
|
|
946
|
+
with DoclingContainer(runtime, image, args.port, timeout=timeout) as container:
|
|
947
|
+
# Convert files
|
|
948
|
+
conversion_start = time.time()
|
|
949
|
+
spinner = Spinner()
|
|
950
|
+
|
|
951
|
+
for idx, input_file in enumerate(files_to_convert, 1):
|
|
952
|
+
output_file = get_output_path(
|
|
953
|
+
input_file, input_base, output_dir, args.flat
|
|
954
|
+
)
|
|
955
|
+
file_size = input_file.stat().st_size
|
|
956
|
+
progress = f"[{idx}/{total_files}]"
|
|
957
|
+
|
|
958
|
+
# Check if output exists and skip if not overwriting
|
|
959
|
+
if output_file.exists() and not args.overwrite:
|
|
960
|
+
if not args.quiet:
|
|
961
|
+
print(f"{progress} Skipped (exists): {input_file.name}")
|
|
962
|
+
skipped_count += 1
|
|
963
|
+
continue
|
|
964
|
+
|
|
965
|
+
# Ensure output directory exists
|
|
966
|
+
output_file.parent.mkdir(parents=True, exist_ok=True)
|
|
967
|
+
|
|
968
|
+
# Show spinner while processing
|
|
969
|
+
if not args.quiet:
|
|
970
|
+
spinner.start(
|
|
971
|
+
f"{progress} Processing: {input_file.name} ({format_size(file_size)})"
|
|
972
|
+
)
|
|
973
|
+
|
|
974
|
+
start_time = time.time()
|
|
975
|
+
try:
|
|
976
|
+
# Convert via HTTP API
|
|
977
|
+
result = convert_file(
|
|
978
|
+
container.base_url, input_file, to_format="md"
|
|
979
|
+
)
|
|
980
|
+
elapsed = time.time() - start_time
|
|
981
|
+
|
|
982
|
+
if not args.quiet:
|
|
983
|
+
spinner.stop()
|
|
984
|
+
|
|
985
|
+
if result.success:
|
|
986
|
+
# Write result to output file
|
|
987
|
+
output_file.write_text(result.content)
|
|
988
|
+
success_count += 1
|
|
989
|
+
if not args.quiet:
|
|
990
|
+
print(
|
|
991
|
+
f"{progress} {input_file.name} ✓ ({format_duration(elapsed)})"
|
|
992
|
+
)
|
|
993
|
+
else:
|
|
994
|
+
failed_count += 1
|
|
995
|
+
error_msg = result.error or "Unknown error"
|
|
996
|
+
if not args.quiet:
|
|
997
|
+
print(
|
|
998
|
+
f"{progress} {input_file.name} ✗ ({format_duration(elapsed)})"
|
|
999
|
+
)
|
|
1000
|
+
print(f" Error: {error_msg}", file=sys.stderr)
|
|
1001
|
+
except Exception as e:
|
|
1002
|
+
elapsed = time.time() - start_time
|
|
1003
|
+
failed_count += 1
|
|
1004
|
+
if not args.quiet:
|
|
1005
|
+
spinner.stop()
|
|
1006
|
+
print(
|
|
1007
|
+
f"{progress} {input_file.name} ✗ ({format_duration(elapsed)})"
|
|
1008
|
+
)
|
|
1009
|
+
print(f" Error: {str(e)}", file=sys.stderr)
|
|
1010
|
+
|
|
1011
|
+
total_elapsed = time.time() - conversion_start
|
|
1012
|
+
|
|
1013
|
+
# Print summary
|
|
639
1014
|
if not args.quiet:
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
print(f" Total files: {total_files}")
|
|
660
|
-
print(f" Successful: {success_count}")
|
|
661
|
-
print(f" Skipped: {skipped_count}")
|
|
662
|
-
print(f" Failed: {failed_count}")
|
|
663
|
-
print(f" Total time: {format_duration(total_elapsed)}")
|
|
664
|
-
print("=" * 50)
|
|
665
|
-
|
|
1015
|
+
print()
|
|
1016
|
+
print("=" * 50)
|
|
1017
|
+
print("Conversion Summary:")
|
|
1018
|
+
print(f" Total files: {total_files}")
|
|
1019
|
+
print(f" Successful: {success_count}")
|
|
1020
|
+
print(f" Skipped: {skipped_count}")
|
|
1021
|
+
print(f" Failed: {failed_count}")
|
|
1022
|
+
print(f" Total time: {format_duration(total_elapsed)}")
|
|
1023
|
+
print("=" * 50)
|
|
1024
|
+
|
|
1025
|
+
except KeyboardInterrupt:
|
|
1026
|
+
if not args.quiet:
|
|
1027
|
+
print("\n\nInterrupted by user. Container stopped.")
|
|
1028
|
+
if success_count > 0 or skipped_count > 0 or failed_count > 0:
|
|
1029
|
+
print(
|
|
1030
|
+
f"Partial progress: {success_count} successful, {failed_count} failed, {skipped_count} skipped"
|
|
1031
|
+
)
|
|
1032
|
+
return 130
|
|
1033
|
+
|
|
666
1034
|
# Return appropriate exit code
|
|
667
1035
|
if failed_count > 0:
|
|
668
1036
|
return 1
|