mdify-cli 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdify/cli.py ADDED
@@ -0,0 +1,915 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ CLI for converting documents to Markdown.
4
+
5
+ This CLI orchestrates document conversion by invoking a Docker/Podman
6
+ container that contains Docling and ML dependencies. The CLI itself
7
+ is lightweight and has no ML dependencies.
8
+ """
9
+
10
+ import argparse
11
+ import json
12
+ import os
13
+ import shutil
14
+ import subprocess
15
+ import sys
16
+ import threading
17
+ import time
18
+ from pathlib import Path
19
+ from typing import List, Optional, Tuple
20
+ from urllib.error import URLError
21
+ from urllib.request import urlopen
22
+
23
+ from . import __version__
24
+ from mdify.container import DoclingContainer
25
+ from mdify.docling_client import convert_file
26
+
27
# Configuration
# Per-user state directory; the update-check timestamp lives inside it.
MDIFY_HOME = Path.home().joinpath(".mdify")
LAST_CHECK_FILE = MDIFY_HOME.joinpath(".last_check")
PYPI_API_URL = "https://pypi.org/pypi/mdify-cli/json"
CHECK_INTERVAL_SECONDS = 24 * 60 * 60  # 24 hours

# Container configuration
# CPU image is the default; the GPU image is selected by the --gpu flag.
DEFAULT_IMAGE = "ghcr.io/docling-project/docling-serve-cpu:main"
GPU_IMAGE = "ghcr.io/docling-project/docling-serve-cu126:main"
SUPPORTED_RUNTIMES = ("docker", "podman")
37
+
38
+
39
+ # =============================================================================
40
+ # Update checking functions
41
+ # =============================================================================
42
+
43
+
44
def _get_remote_version(timeout: int = 5) -> Optional[str]:
    """
    Fetch the latest version from PyPI.

    This runs as part of a best-effort update check, so every network,
    decoding or payload-shape problem is reported as ``None`` rather than
    raised to the caller.

    Args:
        timeout: Network timeout in seconds passed to ``urlopen``.

    Returns:
        Version string (e.g., "1.1.0") or None if fetch failed or the
        response did not contain a usable version.
    """
    # Local import: only this function needs it.
    from http.client import HTTPException

    try:
        with urlopen(PYPI_API_URL, timeout=timeout) as response:
            data = json.loads(response.read().decode("utf-8"))
    except (OSError, ValueError, HTTPException):
        # OSError covers URLError, timeouts and connection resets;
        # ValueError covers JSONDecodeError, UnicodeDecodeError and bad URLs;
        # HTTPException covers truncated/invalid HTTP responses.
        return None

    # The payload comes from the network: validate its shape instead of
    # assuming nested dicts (a null "info" would otherwise raise).
    info = data.get("info") if isinstance(data, dict) else None
    version = info.get("version") if isinstance(info, dict) else None
    if isinstance(version, str) and version:
        return version
    return None
58
+
59
+
60
def _should_check_for_update() -> bool:
    """
    Decide whether an update check is due.

    The check is disabled entirely when the MDIFY_NO_UPDATE_CHECK environment
    variable is set to "1", "true" or "yes" (case-insensitive). Otherwise a
    check is due when no timestamp file exists, when it cannot be read or
    parsed, or when at least CHECK_INTERVAL_SECONDS have passed since the
    recorded time.

    Returns:
        True if check should be performed, False otherwise.
    """
    opt_out = os.environ.get("MDIFY_NO_UPDATE_CHECK", "").lower()
    if opt_out in ("1", "true", "yes"):
        return False

    if not LAST_CHECK_FILE.exists():
        return True

    try:
        recorded = float(LAST_CHECK_FILE.read_text().strip())
    except (ValueError, OSError):
        # Unreadable or corrupt timestamp: behave as if never checked.
        return True

    return time.time() - recorded >= CHECK_INTERVAL_SECONDS
79
+
80
+
81
def _update_last_check_time() -> None:
    """Record the current time in LAST_CHECK_FILE, creating its directory if needed."""
    stamp_file = LAST_CHECK_FILE
    try:
        stamp_file.parent.mkdir(parents=True, exist_ok=True)
        stamp_file.write_text(str(time.time()))
    except OSError:
        # Best effort only: a failed write must never break the CLI.
        return
88
+
89
+
90
def _compare_versions(current: str, remote: str) -> bool:
    """
    Tell whether ``remote`` is a newer dotted-integer version than ``current``.

    Both strings are split on "." and compared numerically, with the shorter
    one padded with zeros (so "1.0" equals "1.0.0"). Anything that is not a
    purely numeric dotted version (or not a string at all) yields False.

    Returns:
        True if remote version is newer than current.
    """
    try:
        ours = list(map(int, current.split(".")))
        theirs = list(map(int, remote.split(".")))
    except (ValueError, AttributeError):
        return False

    # Pad to a common length so list comparison is component-wise.
    width = max(len(ours), len(theirs))
    ours += [0] * (width - len(ours))
    theirs += [0] * (width - len(theirs))
    return theirs > ours
108
+
109
+
110
def check_for_update(force: bool = False) -> None:
    """
    Look up the latest release and tell the user when an upgrade exists.

    Without ``force`` the lookup is throttled by ``_should_check_for_update``
    and any lookup failure is silent. With ``force`` the lookup always runs,
    a failure is reported on stderr and terminates the process with exit
    status 1, and an "up to date" message is printed when nothing is newer.

    Args:
        force: If True, check regardless of last check time and show errors.
    """
    if not (force or _should_check_for_update()):
        return

    latest = _get_remote_version()
    if latest is None:
        if not force:
            return
        print(
            "Error: Failed to check for updates. "
            "Please check your internet connection.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Only a successful lookup resets the throttle timer.
    _update_last_check_time()

    if _compare_versions(__version__, latest):
        rule = "=" * 50
        notice = (
            f"\n{rule}",
            "A new version of mdify-cli is available!",
            f" Current version: {__version__}",
            f" Latest version: {latest}",
            f"{rule}",
            "\nTo upgrade, run:",
            " pipx upgrade mdify-cli",
            " # or: pip install --upgrade mdify-cli\n",
        )
        for line in notice:
            print(line)
        return

    if force:
        print(f"mdify is up to date (version {__version__})")
147
+
148
+
149
+ # =============================================================================
150
+ # Container runtime functions
151
+ # =============================================================================
152
+
153
+
154
def detect_runtime(preferred: str, explicit: bool = True) -> Optional[str]:
    """
    Locate a container runtime executable on PATH.

    The preferred runtime is tried first; if it is missing, the other
    supported runtime ("podman" when docker was preferred, otherwise
    "docker") is tried.

    Args:
        preferred: Preferred runtime ('docker' or 'podman')
        explicit: If True, print a warning to stderr when the alternative is
            used. If False, fall back silently. This flag never changes
            which runtime is selected.

    Returns:
        Path to runtime executable, or None if not found.
    """
    found = shutil.which(preferred)
    if found:
        return found

    fallback = "docker" if preferred != "docker" else "podman"
    found = shutil.which(fallback)
    if not found:
        return None

    if explicit:
        print(f"Warning: {preferred} not found, using {fallback}", file=sys.stderr)
    return found
184
+
185
+
186
def check_image_exists(runtime: str, image: str) -> bool:
    """
    Report whether ``image`` is already present in local storage.

    Runs ``<runtime> image inspect <image>`` with output captured and treats
    a zero exit status as "present".

    Args:
        runtime: Path to container runtime
        image: Image name/tag

    Returns:
        True if image exists locally; False otherwise, including when the
        runtime could not be executed.
    """
    inspect_cmd = [runtime, "image", "inspect", image]
    try:
        completed = subprocess.run(inspect_cmd, capture_output=True, check=False)
    except OSError:
        return False
    return completed.returncode == 0
206
+
207
+
208
def pull_image(runtime: str, image: str, quiet: bool = False) -> bool:
    """
    Download ``image`` with ``<runtime> pull``.

    Args:
        runtime: Path to container runtime
        image: Image name/tag
        quiet: When True, skip the "Pulling image" line and capture the
            runtime's own output instead of letting it reach the terminal.

    Returns:
        True if pull succeeded (zero exit status); False otherwise, including
        when the runtime could not be executed (reported on stderr).
    """
    if not quiet:
        print(f"Pulling image: {image}")

    pull_cmd = [runtime, "pull", image]
    try:
        completed = subprocess.run(pull_cmd, capture_output=quiet, check=False)
    except OSError as exc:
        print(f"Error pulling image: {exc}", file=sys.stderr)
        return False
    else:
        return completed.returncode == 0
233
+
234
+
235
def get_image_size_estimate(runtime: str, image: str) -> Optional[int]:
    """
    Estimate image size by querying registry manifest.

    Runs `<runtime> manifest inspect --verbose <image>` and sums all layer sizes
    across all architectures, then applies 50% buffer for decompression.
    Because every architecture is counted, the result is an upper bound for
    multi-arch images.

    Accepted output shapes:
      * an object with a "Manifests" list,
      * a top-level list of manifest entries,
      * a single manifest entry object.
    Each entry's layers are read from its "OCIManifest" or
    "SchemaV2Manifest" member.
    NOTE(review): the list / single-entry shapes and the "SchemaV2Manifest"
    key follow Docker CLI verbose output as understood by the reviewer —
    confirm against the runtimes you support.

    Args:
        runtime: Path to container runtime
        image: Image name/tag

    Returns:
        Estimated size in bytes with 50% buffer, or None if the command
        fails or its output cannot be interpreted.
    """
    try:
        result = subprocess.run(
            [runtime, "manifest", "inspect", "--verbose", image],
            capture_output=True,
            check=False,
        )
        if result.returncode != 0:
            return None
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        manifest_data = json.loads(result.stdout.decode())
    except (OSError, ValueError):
        return None

    # Normalise the supported shapes into a list of manifest entries.
    if isinstance(manifest_data, dict):
        entries = manifest_data.get("Manifests")
        if entries is None:
            entries = [manifest_data]
    else:
        entries = manifest_data
    if not isinstance(entries, list):
        return None

    total_size = 0
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        for key in ("OCIManifest", "SchemaV2Manifest"):
            inner = entry.get(key)
            layers = inner.get("layers") if isinstance(inner, dict) else None
            if not isinstance(layers, list):
                continue
            for layer in layers:
                size = layer.get("size") if isinstance(layer, dict) else None
                # Skip null / non-numeric sizes instead of raising TypeError.
                if isinstance(size, int) and not isinstance(size, bool) and size > 0:
                    total_size += size

    # Apply 50% buffer for decompression (compressed -> uncompressed)
    return int(total_size * 1.5)
271
+
272
+
273
def format_size(size_bytes: int) -> str:
    """
    Render a byte count with a binary (1024-based) unit.

    Values below 1024 are shown as-is in "B"; larger values are shown with
    one decimal in KB, MB, GB, or TB (TB is used for everything beyond GB).
    """
    if size_bytes < 1024:
        return f"{size_bytes} B"

    scaled = size_bytes / 1024
    for suffix in ("KB", "MB", "GB"):
        if scaled < 1024:
            return f"{scaled:.1f} {suffix}"
        scaled /= 1024
    return f"{scaled:.1f} TB"
280
+
281
+
282
def format_duration(seconds: float) -> str:
    """
    Format duration in human-readable format.

    Durations that display as less than a minute are shown with one decimal
    ("12.3s"). Longer durations are rounded to whole seconds first and then
    split into units ("1m 30s", "1h 2m 3s"), so a unit never shows "60"
    (e.g. 119.6 seconds is "2m 0s", not "1m 60s").

    Args:
        seconds: Duration in seconds.

    Returns:
        The formatted duration string.
    """
    # Compare the value as it would be displayed, so 59.96 does not
    # render as "60.0s".
    if round(seconds, 1) < 60:
        return f"{seconds:.1f}s"

    # Round once, up front, so carries propagate into minutes and hours.
    total = int(round(seconds))
    minutes, secs = divmod(total, 60)
    if minutes < 60:
        return f"{minutes}m {secs}s"
    hours, mins = divmod(minutes, 60)
    return f"{hours}h {mins}m {secs}s"
293
+
294
+
295
def get_free_space(path: str) -> int:
    """Return the free bytes on the filesystem holding ``path``, or 0 if it cannot be queried."""
    try:
        usage = shutil.disk_usage(path)
    except OSError:
        # FileNotFoundError is an OSError, so a missing path lands here too.
        return 0
    return usage.free
301
+
302
+
303
def get_storage_root(runtime: str) -> Optional[str]:
    """
    Get the storage root directory for Docker or Podman.

    Docker is asked for ``{{.DockerRootDir}}`` via ``system info``; Podman is
    asked for JSON ``info`` and the ``store.graphRoot`` member is used.

    Args:
        runtime: Path to container runtime executable

    Returns:
        Storage root path as string, or None if the runtime is not
        recognised, the command fails, or no usable path is reported.
    """
    # Use the stem so Windows executables ("docker.exe") are recognised too.
    runtime_name = Path(runtime).stem.lower()

    if runtime_name == "docker":
        command = [runtime, "system", "info", "--format", "{{.DockerRootDir}}"]
    elif runtime_name == "podman":
        command = [runtime, "info", "--format", "json"]
    else:
        return None

    try:
        result = subprocess.run(command, capture_output=True, check=False)
        # A failed command may still emit partial output; do not trust it.
        if result.returncode != 0 or not result.stdout:
            return None
        output = result.stdout.decode().strip()

        if runtime_name == "docker":
            root = output
        else:
            info = json.loads(output)
            store = info.get("store") if isinstance(info, dict) else None
            root = store.get("graphRoot") if isinstance(store, dict) else None
    except (OSError, ValueError):
        # ValueError covers JSONDecodeError and UnicodeDecodeError.
        return None

    return root if isinstance(root, str) and root else None
337
+
338
+
339
def confirm_proceed(message: str, default_no: bool = True) -> bool:
    """
    Prompt user for confirmation with a y/N (or Y/n) prompt.

    Args:
        message: The confirmation message to display
        default_no: If True, shows [y/N] (default no). If False, shows [Y/n] (default yes)

    Returns:
        True if the user answered "y" or "yes" (case-insensitive, surrounding
        whitespace ignored). An empty answer selects the advertised default:
        False for [y/N], True for [Y/n]. Any other answer returns False.
        Returns False immediately if stdin is not a TTY (non-interactive),
        and also when stdin is closed before an answer is read.
    """
    if not sys.stdin.isatty():
        return False

    prompt = "[y/N]" if default_no else "[Y/n]"
    print(f"{message} {prompt}", file=sys.stderr)
    try:
        response = input().strip().lower()
    except EOFError:
        # Input ended without an answer: decline rather than crash.
        return False

    if not response:
        # Just pressing Enter picks the capitalised option in the prompt.
        return not default_no
    return response in ("y", "yes")
358
+
359
+
360
class Spinner:
    """
    A simple spinner to show progress during long operations.

    ``start(prefix)`` launches a daemon thread that redraws one terminal line
    (prefix, animation frame, elapsed time) roughly every 0.1 seconds;
    ``stop()`` ends the animation and blanks that line.
    """

    def __init__(self):
        self.frames = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
        self.running = False
        self.thread = None
        self.start_time = None
        # Defined up front so the attribute exists before start() is called.
        self.prefix = ""
        # Set by stop() to cut the inter-frame wait short.
        self._wake = threading.Event()
        # Longest line drawn so far, used to blank the whole line on stop().
        self._max_width = 0

    def _spin(self):
        """Thread body: redraw the spinner line until ``running`` is cleared."""
        idx = 0
        while self.running:
            elapsed = time.time() - self.start_time
            frame = self.frames[idx % len(self.frames)]
            line = f"{self.prefix} {frame} ({format_duration(elapsed)})"
            self._max_width = max(self._max_width, len(line))
            print(f"\r{line}", end="", flush=True)
            idx += 1
            # Behaves like sleep(0.1) but returns at once when stop() is called.
            self._wake.wait(0.1)

    def start(self, prefix: str = ""):
        """Begin animating with ``prefix`` shown before the spinner frame."""
        if self.running:
            # Never leave a previous animation thread running.
            self.stop()
        self.prefix = prefix
        self._max_width = 0
        self._wake.clear()
        self.running = True
        self.start_time = time.time()
        self.thread = threading.Thread(target=self._spin, daemon=True)
        self.thread.start()

    def stop(self):
        """Stop animating and clear the spinner line. Safe to call when idle."""
        self.running = False
        self._wake.set()
        if self.thread:
            self.thread.join(timeout=0.5)
        # Clear the spinner line (at least 80 columns, more for long prefixes)
        width = max(80, self._max_width)
        print(f"\r{' ' * width}\r", end="", flush=True)
395
+
396
+
397
+ # =============================================================================
398
+ # File handling functions
399
+ # =============================================================================
400
+
401
# Supported file extensions (based on Docling InputFormat).
# Entries are lowercase and include the leading dot.
SUPPORTED_EXTENSIONS = {
    # documents and web pages
    ".pdf", ".docx", ".pptx", ".html", ".htm",
    # images
    ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff", ".tif",
    # asciidoc
    ".asciidoc", ".adoc", ".asc",
    # markdown
    ".md", ".markdown",
    # spreadsheets
    ".csv", ".xlsx",
    # XML formats
    ".xml",
    # JSON docling
    ".json",
    # audio
    ".mp3", ".wav", ".m4a", ".flac",
    # subtitles
    ".vtt",
}
430
+
431
+
432
def get_files_to_convert(input_path: Path, mask: str, recursive: bool) -> List[Path]:
    """
    Get list of files to convert based on input path and options.

    Args:
        input_path: A single file, or a directory to scan.
        mask: Glob pattern applied when ``input_path`` is a directory.
        recursive: If True, scan the directory tree with ``rglob``;
            otherwise only the directory itself with ``glob``.

    Returns:
        Files whose name does not start with "." and whose suffix
        (case-insensitive) is in SUPPORTED_EXTENSIONS. Directory results are
        sorted so the processing order is the same on every filesystem.

    Raises:
        FileNotFoundError: If ``input_path`` is neither a file nor a directory.
    """
    if input_path.is_file():
        candidates = [input_path]
    elif input_path.is_dir():
        matcher = input_path.rglob if recursive else input_path.glob
        # Glob order depends on the filesystem; sort for reproducible runs.
        candidates = sorted(p for p in matcher(mask) if p.is_file())
    else:
        raise FileNotFoundError(f"Input path does not exist: {input_path}")

    # Filter out hidden files and unsupported formats
    return [
        f
        for f in candidates
        if not f.name.startswith(".") and f.suffix.lower() in SUPPORTED_EXTENSIONS
    ]
457
+
458
+
459
def get_output_path(
    input_file: Path,
    input_base: Path,
    output_dir: Path,
    flat: bool,
) -> Path:
    """
    Work out where the Markdown for ``input_file`` should be written.

    The output name is always the input stem plus ".md". When ``flat`` is
    False the input's directory (relative to ``input_base``) is recreated
    under ``output_dir``. When ``flat`` is True the file goes directly into
    ``output_dir`` and the relative parent directories are folded into the
    name, joined with "_". If ``input_file`` is not located under
    ``input_base`` the file is placed directly in ``output_dir``.
    """
    md_name = f"{input_file.stem}.md"

    try:
        relative = input_file.relative_to(input_base)
    except ValueError:
        # Not under the base directory: no structure to preserve or encode.
        return output_dir / md_name

    if not flat:
        return output_dir / relative.parent / md_name

    parent_prefix = "_".join(relative.parts[:-1])
    if parent_prefix:
        md_name = f"{parent_prefix}_{md_name}"
    return output_dir / md_name
490
+
491
+
492
+ # =============================================================================
493
+ # CLI argument parsing
494
+ # =============================================================================
495
+
496
+
497
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """
    Parse command line arguments.

    Args:
        argv: Argument list to parse (without the program name). When None,
            argparse reads ``sys.argv[1:]``, which is the behaviour existing
            callers rely on. Passing a list makes the parser usable from
            tests and other entry points.

    Returns:
        The populated argparse namespace.
    """
    parser = argparse.ArgumentParser(
        description="Convert documents to Markdown using Docling (via container)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  mdify document.pdf                  Convert a single file
  mdify ./docs -g "*.pdf" -r          Convert PDFs recursively
  mdify ./docs -g "*.pdf" -o out/     Specify output directory
  mdify document.pdf -m               Mask PII in images
  mdify ./docs --runtime podman       Use Podman instead of Docker
""",
    )

    parser.add_argument(
        "input",
        type=str,
        nargs="?",  # optional so that --check-update works without an input
        help="Input file or directory to convert",
    )

    parser.add_argument(
        "-o",
        "--out-dir",
        type=str,
        default="output",
        help="Output directory for converted files (default: output)",
    )

    parser.add_argument(
        "-g",
        "--glob",
        type=str,
        default="*",
        help="Glob pattern for filtering files in directory (default: *)",
    )

    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help="Recursively scan directories",
    )

    parser.add_argument(
        "--flat",
        action="store_true",
        help="Disable directory structure preservation in output",
    )

    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing output files",
    )

    parser.add_argument(
        "-q",
        "--quiet",
        action="store_true",
        help="Suppress progress messages",
    )

    parser.add_argument(
        "-y",
        "--yes",
        action="store_true",
        help="Skip confirmation prompts (for scripts/CI)",
    )

    # The CLI accepts this flag but only warns and ignores it when running
    # against docling-serve, so the help text says so.
    parser.add_argument(
        "-m",
        "--mask",
        action="store_true",
        help=(
            "Mask PII and sensitive content in document images "
            "(currently ignored: not supported with docling-serve)"
        ),
    )

    parser.add_argument(
        "--gpu",
        action="store_true",
        help="Use GPU-accelerated container image (docling-serve-cu126)",
    )

    parser.add_argument(
        "--port",
        type=int,
        default=5001,
        help="Port for docling-serve container (default: 5001)",
    )

    # Container options
    parser.add_argument(
        "--runtime",
        type=str,
        choices=SUPPORTED_RUNTIMES,
        default=None,
        help="Container runtime to use (auto-detects docker or podman if not specified)",
    )

    parser.add_argument(
        "--image",
        type=str,
        default=DEFAULT_IMAGE,
        help=f"Container image to use (default: {DEFAULT_IMAGE})",
    )

    parser.add_argument(
        "--pull",
        type=str,
        choices=("always", "missing", "never"),
        default="missing",
        help="Image pull policy: always, missing, never (default: missing)",
    )

    parser.add_argument(
        "--timeout",
        type=int,
        default=None,
        help="Conversion timeout in seconds (default: 1200, can be set via MDIFY_TIMEOUT env var)",
    )

    # Utility options
    parser.add_argument(
        "--check-update",
        action="store_true",
        help="Check for available updates and exit",
    )

    parser.add_argument(
        "--version",
        action="version",
        version=f"mdify {__version__}",
    )

    return parser.parse_args(argv)
633
+
634
+
635
+ # =============================================================================
636
+ # Main entry point
637
+ # =============================================================================
638
+
639
+
640
def _resolve_timeout(cli_timeout: Optional[int]) -> Optional[int]:
    """Pick the conversion timeout (CLI > MDIFY_TIMEOUT > 1200); None means the env value was invalid."""
    if cli_timeout:
        return cli_timeout

    raw = os.environ.get("MDIFY_TIMEOUT")
    if raw is None or not raw.strip():
        return 1200
    try:
        return int(raw)
    except ValueError:
        print(
            f"Error: Invalid MDIFY_TIMEOUT value {raw!r} (expected whole seconds)",
            file=sys.stderr,
        )
        return None


def _check_pull_disk_space(runtime: str, image: str, assume_yes: bool) -> Optional[int]:
    """Warn about low disk space before a pull; return an exit code to abort with, or None to proceed."""
    storage_root = get_storage_root(runtime)
    if not storage_root:
        return None

    # Docker Desktop VMs and remote daemons report a storage root that does
    # not exist on this host. Free space cannot be measured there, so warn
    # and proceed instead of treating it as "0 bytes free".
    if not os.path.isdir(storage_root):
        print(
            f"Warning: Cannot check free disk space on {storage_root} "
            "(not accessible from this host); skipping check",
            file=sys.stderr,
        )
        return None

    image_size = get_image_size_estimate(runtime, image)
    if not image_size:
        return None

    free_space = get_free_space(storage_root)
    remaining = free_space - image_size
    if remaining < 0:
        headline = f"Warning: Not enough free disk space on {storage_root}"
    elif remaining < 1024 * 1024 * 1024:
        headline = (
            f"Warning: Less than 1 GB would remain after pulling image on {storage_root}"
        )
    else:
        return None

    print(headline, file=sys.stderr)
    print(f" Available: {format_size(free_space)}", file=sys.stderr)
    print(f" Required: {format_size(image_size)} (estimated)", file=sys.stderr)
    if remaining >= 0:
        print(f" Remaining: {format_size(remaining)}", file=sys.stderr)

    if assume_yes:
        print(" Proceeding anyway (--yes flag set)", file=sys.stderr)
        return None
    if not sys.stdin.isatty():
        print(
            " Run with --yes to proceed anyway, or free up disk space",
            file=sys.stderr,
        )
        return 1
    if not confirm_proceed("Continue anyway?"):
        return 130
    return None


def main() -> int:
    """
    Main entry point for the CLI.

    Parses arguments, runs the (throttled) update check, locates a container
    runtime, makes sure the image is available according to the pull policy,
    then starts the docling-serve container and converts every selected file
    to Markdown.

    Returns:
        Process exit code: 0 when at least one file was converted or skipped
        and none failed; 1 on usage, input, pull or conversion errors; 2 when
        no container runtime is installed; 130 when the user declined a
        prompt or interrupted the run.
    """
    args = parse_args()

    # Handle --check-update flag
    if args.check_update:
        check_for_update(force=True)
        return 0

    # Check for updates (daily, silent on errors)
    check_for_update(force=False)

    # Resolve timeout value: CLI > env > default 1200
    timeout = _resolve_timeout(args.timeout)
    if timeout is None:
        return 1

    # Validate input is provided
    if args.input is None:
        print("Error: Input file or directory is required", file=sys.stderr)
        print("Usage: mdify <input> [options]", file=sys.stderr)
        print(" mdify --help for more information", file=sys.stderr)
        return 1

    # Detect container runtime; only warn about a fallback when the user
    # asked for a specific runtime.
    preferred = args.runtime if args.runtime else "docker"
    runtime = detect_runtime(preferred, explicit=args.runtime is not None)
    if runtime is None:
        print(
            f"Error: Container runtime not found ({', '.join(SUPPORTED_RUNTIMES)})",
            file=sys.stderr,
        )
        print("Please install Docker or Podman to use mdify.", file=sys.stderr)
        return 2

    # Determine image: --gpu takes precedence over --image.
    image = GPU_IMAGE if args.gpu else (args.image or DEFAULT_IMAGE)

    image_exists = check_image_exists(runtime, image)

    # Handle image pull policy, checking disk space before any pull.
    will_pull = args.pull == "always" or (args.pull == "missing" and not image_exists)
    if will_pull:
        abort_code = _check_pull_disk_space(runtime, image, args.yes)
        if abort_code is not None:
            return abort_code
        if not pull_image(runtime, image, args.quiet):
            print(f"Error: Failed to pull image: {image}", file=sys.stderr)
            return 1
    elif args.pull == "never" and not image_exists:
        print(f"Error: Image not found locally: {image}", file=sys.stderr)
        # Diagnostics belong on stderr, and the hint should name the runtime
        # that will actually be used.
        print(
            f"Run with --pull=missing or pull manually: {runtime} pull {image}",
            file=sys.stderr,
        )
        return 1

    # Resolve paths (use absolute() as fallback if resolve() fails due to permissions)
    try:
        input_path = Path(args.input).resolve()
    except PermissionError:
        input_path = Path(args.input).absolute()
    try:
        output_dir = Path(args.out_dir).resolve()
    except PermissionError:
        output_dir = Path(args.out_dir).absolute()

    # Validate input
    if not input_path.exists():
        print(f"Error: Input path does not exist: {input_path}", file=sys.stderr)
        return 1

    # Get files to convert
    try:
        files_to_convert = get_files_to_convert(input_path, args.glob, args.recursive)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    if not files_to_convert:
        print(f"No files found to convert in: {input_path}", file=sys.stderr)
        return 1

    total_files = len(files_to_convert)
    total_size = sum(f.stat().st_size for f in files_to_convert)

    if not args.quiet:
        print(f"Found {total_files} file(s) to convert ({format_size(total_size)})")
        print(f"Using runtime: {runtime}")
        print(f"Using image: {image}")
        print()

    if args.mask:
        print(
            "Warning: --mask is not supported with docling-serve and will be ignored",
            file=sys.stderr,
        )

    # Determine input base for directory structure preservation
    input_base = input_path.parent if input_path.is_file() else input_path

    success_count = 0
    skipped_count = 0
    failed_count = 0
    total_elapsed = 0.0

    # Created outside the try block so the interrupt handler can stop it.
    spinner = Spinner()

    try:
        if not args.quiet:
            print("Starting docling-serve container...")
            print()

        with DoclingContainer(runtime, image, args.port, timeout=timeout) as container:
            # Convert files
            conversion_start = time.time()

            for idx, input_file in enumerate(files_to_convert, 1):
                output_file = get_output_path(
                    input_file, input_base, output_dir, args.flat
                )
                file_size = input_file.stat().st_size
                progress = f"[{idx}/{total_files}]"

                # Check if output exists and skip if not overwriting
                if output_file.exists() and not args.overwrite:
                    if not args.quiet:
                        print(f"{progress} Skipped (exists): {input_file.name}")
                    skipped_count += 1
                    continue

                # Ensure output directory exists
                output_file.parent.mkdir(parents=True, exist_ok=True)

                # Show spinner while processing
                if not args.quiet:
                    spinner.start(
                        f"{progress} Processing: {input_file.name} ({format_size(file_size)})"
                    )

                start_time = time.time()
                try:
                    # Convert via HTTP API
                    result = convert_file(
                        container.base_url, input_file, to_format="md"
                    )
                    elapsed = time.time() - start_time

                    if not args.quiet:
                        spinner.stop()

                    if result.success:
                        # Write as UTF-8 explicitly: the locale default
                        # encoding cannot represent all Markdown content on
                        # every platform.
                        output_file.write_text(result.content, encoding="utf-8")
                        success_count += 1
                        if not args.quiet:
                            print(
                                f"{progress} {input_file.name} ✓ ({format_duration(elapsed)})"
                            )
                    else:
                        failed_count += 1
                        error_msg = result.error or "Unknown error"
                        if not args.quiet:
                            print(
                                f"{progress} {input_file.name} ✗ ({format_duration(elapsed)})"
                            )
                            print(f" Error: {error_msg}", file=sys.stderr)
                except Exception as e:
                    # One bad file must not abort the whole batch.
                    elapsed = time.time() - start_time
                    failed_count += 1
                    if not args.quiet:
                        spinner.stop()
                        print(
                            f"{progress} {input_file.name} ✗ ({format_duration(elapsed)})"
                        )
                        print(f" Error: {str(e)}", file=sys.stderr)

            total_elapsed = time.time() - conversion_start

            # Print summary
            if not args.quiet:
                print()
                print("=" * 50)
                print("Conversion Summary:")
                print(f" Total files: {total_files}")
                print(f" Successful: {success_count}")
                print(f" Skipped: {skipped_count}")
                print(f" Failed: {failed_count}")
                print(f" Total time: {format_duration(total_elapsed)}")
                print("=" * 50)

    except KeyboardInterrupt:
        if not args.quiet:
            # Stop the animation first so it cannot overwrite the message.
            spinner.stop()
            print("\n\nInterrupted by user. Container stopped.")
            if success_count > 0 or skipped_count > 0 or failed_count > 0:
                print(
                    f"Partial progress: {success_count} successful, {failed_count} failed, {skipped_count} skipped"
                )
        return 130

    # Return appropriate exit code
    if failed_count > 0:
        return 1
    if success_count > 0 or skipped_count > 0:
        return 0
    return 1
912
+
913
+
914
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())