mdify-cli 2.0.0__tar.gz → 2.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mdify-cli
3
- Version: 2.0.0
3
+ Version: 2.6.0
4
4
  Summary: Convert PDFs and document images into structured Markdown for LLM workflows
5
5
  Author: tiroq
6
6
  License-Expression: MIT
@@ -1,3 +1,3 @@
1
1
  """mdify - Convert documents to Markdown via Docling container."""
2
2
 
3
- __version__ = "2.0.0"
3
+ __version__ = "2.6.0"
@@ -232,6 +232,44 @@ def pull_image(runtime: str, image: str, quiet: bool = False) -> bool:
232
232
  return False
233
233
 
234
234
 
235
def get_image_size_estimate(runtime: str, image: str) -> Optional[int]:
    """
    Estimate image size by querying registry manifest.

    Runs `<runtime> manifest inspect --verbose <image>` and sums all layer sizes
    across all architectures, then applies 50% buffer for decompression.

    Accepted JSON shapes (anything else is ignored instead of raising):
      * an object with a "Manifests" list of per-platform entries,
      * a bare list of per-platform entries,
      * a single per-platform entry object.
    Each entry's layers are read from its "OCIManifest" member, falling back
    to "SchemaV2Manifest". Layers without an integer "size" count as 0.

    Args:
        runtime: Path to container runtime
        image: Image name/tag

    Returns:
        Estimated size in bytes with 50% buffer, or None if the command cannot
        be run, exits non-zero, or does not print usable JSON.
    """
    try:
        result = subprocess.run(
            [runtime, "manifest", "inspect", "--verbose", image],
            capture_output=True,
            check=False,
        )
    except OSError:
        # Runtime binary missing or not executable.
        return None
    if result.returncode != 0:
        return None

    try:
        manifest_data = json.loads(result.stdout.decode())
    except ValueError:
        # Covers json.JSONDecodeError and UnicodeDecodeError (both ValueError).
        return None

    # Normalise the payload to a list of per-platform entries.
    if isinstance(manifest_data, list):
        entries = manifest_data
    elif isinstance(manifest_data, dict):
        entries = manifest_data.get("Manifests", [manifest_data])
    else:
        return None
    if not isinstance(entries, list):
        return None

    # Sum all layer sizes across all architectures
    total_size = 0
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        platform_manifest = entry.get("OCIManifest") or entry.get("SchemaV2Manifest")
        if not isinstance(platform_manifest, dict):
            continue
        layers = platform_manifest.get("layers")
        if not isinstance(layers, list):
            continue
        for layer in layers:
            size = layer.get("size", 0) if isinstance(layer, dict) else 0
            if isinstance(size, int):
                total_size += size

    # Apply 50% buffer for decompression (compressed -> uncompressed)
    return int(total_size * 1.5)
271
+
272
+
235
273
  def format_size(size_bytes: int) -> str:
236
274
  """Format file size in human-readable format."""
237
275
  for unit in ["B", "KB", "MB", "GB"]:
@@ -254,6 +292,71 @@ def format_duration(seconds: float) -> str:
254
292
  return f"{hours}h {mins}m {secs:.0f}s"
255
293
 
256
294
 
295
def get_free_space(path: str) -> int:
    """Return the free bytes reported by ``shutil.disk_usage`` for *path*.

    Any OS-level failure (including a missing path) yields 0 instead of
    raising, so callers cannot distinguish "full disk" from "lookup failed".
    """
    try:
        usage = shutil.disk_usage(path)
    except OSError:
        # FileNotFoundError is an OSError subclass, so one clause covers both.
        return 0
    return usage.free
301
+
302
+
303
def get_storage_root(runtime: str) -> Optional[str]:
    """
    Get the storage root directory for Docker or Podman.

    Docker is asked via `system info --format {{.DockerRootDir}}`; Podman via
    `info --format json`, reading `store.graphRoot` from the result.

    Args:
        runtime: Path to container runtime executable

    Returns:
        Storage root path as a non-empty string, or None if the runtime is not
        docker/podman, the command cannot be run, exits non-zero, or its
        output does not contain a usable path.
    """
    # Extract runtime name from path (e.g., /usr/bin/docker -> docker).
    # Lower-case and drop a trailing ".exe" so Windows-style names also match.
    runtime_name = os.path.basename(runtime).lower()
    if runtime_name.endswith(".exe"):
        runtime_name = runtime_name[: -len(".exe")]

    if runtime_name == "docker":
        command = [runtime, "system", "info", "--format", "{{.DockerRootDir}}"]
    elif runtime_name == "podman":
        command = [runtime, "info", "--format", "json"]
    else:
        return None

    try:
        result = subprocess.run(command, capture_output=True, check=False)
    except OSError:
        return None
    # A failing command may still print a blank line; never report that as a path.
    if result.returncode != 0:
        return None

    try:
        output = result.stdout.decode().strip()
    except UnicodeDecodeError:
        return None
    if not output:
        return None

    if runtime_name == "docker":
        return output

    try:
        info = json.loads(output)
    except ValueError:
        return None
    store = info.get("store") if isinstance(info, dict) else None
    graph_root = store.get("graphRoot") if isinstance(store, dict) else None
    if isinstance(graph_root, str) and graph_root:
        return graph_root
    return None
337
+
338
+
339
def confirm_proceed(message: str, default_no: bool = True) -> bool:
    """
    Prompt user for confirmation with a y/N prompt.

    The prompt is written to stderr and the answer is read from stdin.

    Args:
        message: The confirmation message to display
        default_no: If True, shows [y/N] (default no). If False, shows [Y/n] (default yes)

    Returns:
        True if the user answered 'y' or 'yes' (case-insensitive, surrounding
        whitespace ignored). An empty answer returns the advertised default
        (False for [y/N], True for [Y/n]). Any other answer returns False.
        Returns False immediately if stdin is not a TTY (non-interactive), and
        also when stdin reaches end-of-file before an answer is given.
    """
    if not sys.stdin.isatty():
        return False

    prompt = "[y/N]" if default_no else "[Y/n]"
    print(f"{message} {prompt}", file=sys.stderr)
    try:
        response = input().strip().lower()
    except EOFError:
        # Ctrl-D / closed stdin: treat as a refusal rather than crashing.
        return False

    if not response:
        # Bare Enter selects whichever option the prompt capitalised.
        return not default_no
    return response in ("y", "yes")
358
+
359
+
257
360
  class Spinner:
258
361
  """A simple spinner to show progress during long operations."""
259
362
 
@@ -455,6 +558,13 @@ Examples:
455
558
  help="Suppress progress messages",
456
559
  )
457
560
 
561
+ parser.add_argument(
562
+ "-y",
563
+ "--yes",
564
+ action="store_true",
565
+ help="Skip confirmation prompts (for scripts/CI)",
566
+ )
567
+
458
568
  parser.add_argument(
459
569
  "-m",
460
570
  "--mask",
@@ -499,6 +609,13 @@ Examples:
499
609
  help="Image pull policy: always, missing, never (default: missing)",
500
610
  )
501
611
 
612
+ parser.add_argument(
613
+ "--timeout",
614
+ type=int,
615
+ default=None,
616
+ help="Conversion timeout in seconds (default: 1200, can be set via MDIFY_TIMEOUT env var)",
617
+ )
618
+
502
619
  # Utility options
503
620
  parser.add_argument(
504
621
  "--check-update",
@@ -532,6 +649,9 @@ def main() -> int:
532
649
  # Check for updates (daily, silent on errors)
533
650
  check_for_update(force=False)
534
651
 
652
+ # Resolve timeout value: CLI > env > default 1200
653
+ timeout = args.timeout or int(os.environ.get("MDIFY_TIMEOUT", 1200))
654
+
535
655
  # Validate input is provided
536
656
  if args.input is None:
537
657
  print("Error: Input file or directory is required", file=sys.stderr)
@@ -562,6 +682,70 @@ def main() -> int:
562
682
 
563
683
  image_exists = check_image_exists(runtime, image)
564
684
 
685
+ # NOTE: Docker Desktop on macOS/Windows uses a VM, so disk space checks may not
686
+ # accurately reflect available space in the container's filesystem. Remote Docker
687
+ # daemons (DOCKER_HOST) are also not supported. In these cases, the check will
688
+ # gracefully degrade (warn and proceed).
689
+
690
+ # Check disk space before pulling image (skip if pull=never or image exists with pull=missing)
691
+ will_pull = args.pull == "always" or (args.pull == "missing" and not image_exists)
692
+ if will_pull:
693
+ storage_root = get_storage_root(runtime)
694
+ if storage_root:
695
+ image_size = get_image_size_estimate(runtime, image)
696
+ if image_size:
697
+ free_space = get_free_space(storage_root)
698
+ if free_space < image_size:
699
+ print(
700
+ f"Warning: Not enough free disk space on {storage_root}",
701
+ file=sys.stderr,
702
+ )
703
+ print(
704
+ f" Available: {format_size(free_space)}",
705
+ file=sys.stderr,
706
+ )
707
+ print(
708
+ f" Required: {format_size(image_size)} (estimated)",
709
+ file=sys.stderr,
710
+ )
711
+ if args.yes:
712
+ print(" Proceeding anyway (--yes flag set)", file=sys.stderr)
713
+ elif not sys.stdin.isatty():
714
+ print(
715
+ " Run with --yes to proceed anyway, or free up disk space",
716
+ file=sys.stderr,
717
+ )
718
+ return 1
719
+ elif not confirm_proceed("Continue anyway?"):
720
+ return 130
721
+ elif free_space - image_size < 1024 * 1024 * 1024:
722
+ print(
723
+ f"Warning: Less than 1 GB would remain after pulling image on {storage_root}",
724
+ file=sys.stderr,
725
+ )
726
+ print(
727
+ f" Available: {format_size(free_space)}",
728
+ file=sys.stderr,
729
+ )
730
+ print(
731
+ f" Required: {format_size(image_size)} (estimated)",
732
+ file=sys.stderr,
733
+ )
734
+ print(
735
+ f" Remaining: {format_size(free_space - image_size)}",
736
+ file=sys.stderr,
737
+ )
738
+ if args.yes:
739
+ print(" Proceeding anyway (--yes flag set)", file=sys.stderr)
740
+ elif not sys.stdin.isatty():
741
+ print(
742
+ " Run with --yes to proceed anyway, or free up disk space",
743
+ file=sys.stderr,
744
+ )
745
+ return 1
746
+ elif not confirm_proceed("Continue anyway?"):
747
+ return 130
748
+
565
749
  if args.pull == "always" or (args.pull == "missing" and not image_exists):
566
750
  if not pull_image(runtime, image, args.quiet):
567
751
  print(f"Error: Failed to pull image: {image}", file=sys.stderr)
@@ -571,9 +755,15 @@ def main() -> int:
571
755
  print(f"Run with --pull=missing or pull manually: {preferred} pull {image}")
572
756
  return 1
573
757
 
574
- # Resolve paths
575
- input_path = Path(args.input).resolve()
576
- output_dir = Path(args.out_dir).resolve()
758
+ # Resolve paths (use absolute() as fallback if resolve() fails due to permissions)
759
+ try:
760
+ input_path = Path(args.input).resolve()
761
+ except PermissionError:
762
+ input_path = Path(args.input).absolute()
763
+ try:
764
+ output_dir = Path(args.out_dir).resolve()
765
+ except PermissionError:
766
+ output_dir = Path(args.out_dir).absolute()
577
767
 
578
768
  # Validate input
579
769
  if not input_path.exists():
@@ -622,7 +812,7 @@ def main() -> int:
622
812
  print(f"Starting docling-serve container...")
623
813
  print()
624
814
 
625
- with DoclingContainer(runtime, image, args.port) as container:
815
+ with DoclingContainer(runtime, image, args.port, timeout=timeout) as container:
626
816
  # Convert files
627
817
  conversion_start = time.time()
628
818
  spinner = Spinner()
@@ -20,17 +20,19 @@ class DoclingContainer:
20
20
  # Container automatically stopped and removed
21
21
  """
22
22
 
23
- def __init__(self, runtime: str, image: str, port: int = 5001):
23
+ def __init__(self, runtime: str, image: str, port: int = 5001, timeout: int = 1200):
24
24
  """Initialize container manager.
25
25
 
26
26
  Args:
27
27
  runtime: Container runtime ("docker" or "podman")
28
28
  image: Container image to use
29
29
  port: Host port to bind (default: 5001)
30
+ timeout: Conversion timeout in seconds (default: 1200)
30
31
  """
31
32
  self.runtime = runtime
32
33
  self.image = image
33
34
  self.port = port
35
+ self.timeout = timeout
34
36
  self.container_name = f"mdify-serve-{uuid.uuid4().hex[:8]}"
35
37
  self.container_id: Optional[str] = None
36
38
 
@@ -39,6 +41,39 @@ class DoclingContainer:
39
41
  """Return base URL for API requests."""
40
42
  return f"http://localhost:{self.port}"
41
43
 
44
+ def _cleanup_stale_containers(self) -> None:
45
+ """Stop any existing mdify-serve containers.
46
+
47
+ This handles the case where a previous run left a container running
48
+ (e.g., due to crash, interrupt, or timeout).
49
+ """
50
+ # Find running containers matching mdify-serve-* pattern
51
+ result = subprocess.run(
52
+ [
53
+ self.runtime,
54
+ "ps",
55
+ "--filter",
56
+ "name=mdify-serve-",
57
+ "--format",
58
+ "{{.Names}}",
59
+ ],
60
+ capture_output=True,
61
+ text=True,
62
+ check=False,
63
+ )
64
+
65
+ if result.returncode != 0 or not result.stdout.strip():
66
+ return
67
+
68
+ # Stop each stale container
69
+ for container_name in result.stdout.strip().split("\n"):
70
+ if container_name:
71
+ subprocess.run(
72
+ [self.runtime, "stop", container_name],
73
+ capture_output=True,
74
+ check=False,
75
+ )
76
+
42
77
  def start(self, timeout: int = 120) -> None:
43
78
  """Start container and wait for health check.
44
79
 
@@ -49,6 +84,8 @@ class DoclingContainer:
49
84
  subprocess.CalledProcessError: If container fails to start
50
85
  TimeoutError: If health check doesn't pass within timeout
51
86
  """
87
+ self._cleanup_stale_containers()
88
+
52
89
  # Start container in detached mode
53
90
  cmd = [
54
91
  self.runtime,
@@ -59,6 +96,8 @@ class DoclingContainer:
59
96
  self.container_name,
60
97
  "-p",
61
98
  f"{self.port}:5001",
99
+ "-e",
100
+ f"DOCLING_SERVE_MAX_SYNC_WAIT={self.timeout}",
62
101
  self.image,
63
102
  ]
64
103
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mdify-cli
3
- Version: 2.0.0
3
+ Version: 2.6.0
4
4
  Summary: Convert PDFs and document images into structured Markdown for LLM workflows
5
5
  Author: tiroq
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "mdify-cli"
3
- version = "2.0.0"
3
+ version = "2.6.0"
4
4
  description = "Convert PDFs and document images into structured Markdown for LLM workflows"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.8"