mdify-cli 1.5.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdify/__init__.py +1 -1
- mdify/cli.py +251 -204
- mdify/container.py +128 -0
- mdify/docling_client.py +224 -0
- {mdify_cli-1.5.0.dist-info → mdify_cli-2.0.0.dist-info}/METADATA +40 -15
- mdify_cli-2.0.0.dist-info/RECORD +12 -0
- {mdify_cli-1.5.0.dist-info → mdify_cli-2.0.0.dist-info}/WHEEL +1 -1
- mdify_cli-1.5.0.dist-info/RECORD +0 -10
- {mdify_cli-1.5.0.dist-info → mdify_cli-2.0.0.dist-info}/entry_points.txt +0 -0
- {mdify_cli-1.5.0.dist-info → mdify_cli-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {mdify_cli-1.5.0.dist-info → mdify_cli-2.0.0.dist-info}/top_level.txt +0 -0
mdify/__init__.py
CHANGED
mdify/cli.py
CHANGED
@@ -21,6 +21,8 @@ from urllib.error import URLError
 from urllib.request import urlopen
 
 from . import __version__
+from mdify.container import DoclingContainer
+from mdify.docling_client import convert_file
 
 # Configuration
 MDIFY_HOME = Path.home() / ".mdify"
@@ -29,7 +31,8 @@ PYPI_API_URL = "https://pypi.org/pypi/mdify-cli/json"
 CHECK_INTERVAL_SECONDS = 86400  # 24 hours
 
 # Container configuration
-DEFAULT_IMAGE = "ghcr.io/
+DEFAULT_IMAGE = "ghcr.io/docling-project/docling-serve-cpu:main"
+GPU_IMAGE = "ghcr.io/docling-project/docling-serve-cu126:main"
 SUPPORTED_RUNTIMES = ("docker", "podman")
 
 
@@ -37,10 +40,11 @@ SUPPORTED_RUNTIMES = ("docker", "podman")
 # Update checking functions
 # =============================================================================
 
+
 def _get_remote_version(timeout: int = 5) -> Optional[str]:
     """
     Fetch the latest version from PyPI.
-
+
     Returns:
         Version string (e.g., "1.1.0") or None if fetch failed.
     """
@@ -56,16 +60,16 @@ def _get_remote_version(timeout: int = 5) -> Optional[str]:
 def _should_check_for_update() -> bool:
     """
     Determine if we should check for updates based on last check time.
-
+
     Returns:
         True if check should be performed, False otherwise.
     """
     if os.environ.get("MDIFY_NO_UPDATE_CHECK", "").lower() in ("1", "true", "yes"):
         return False
-
+
     if not LAST_CHECK_FILE.exists():
         return True
-
+
     try:
         last_check = float(LAST_CHECK_FILE.read_text().strip())
         elapsed = time.time() - last_check
@@ -86,18 +90,18 @@ def _update_last_check_time() -> None:
 def _compare_versions(current: str, remote: str) -> bool:
     """
     Compare version strings.
-
+
     Returns:
         True if remote version is newer than current.
     """
     try:
         current_parts = [int(x) for x in current.split(".")]
         remote_parts = [int(x) for x in remote.split(".")]
-
+
         max_len = max(len(current_parts), len(remote_parts))
         current_parts.extend([0] * (max_len - len(current_parts)))
         remote_parts.extend([0] * (max_len - len(remote_parts)))
-
+
         return remote_parts > current_parts
     except (ValueError, AttributeError):
         return False
@@ -106,15 +110,15 @@ def _compare_versions(current: str, remote: str) -> bool:
 def check_for_update(force: bool = False) -> None:
     """
     Check for updates and prompt user to upgrade if available.
-
+
     Args:
         force: If True, check regardless of last check time and show errors.
     """
     if not force and not _should_check_for_update():
         return
-
+
     remote_version = _get_remote_version()
-
+
     if remote_version is None:
         if force:
             print(
@@ -124,19 +128,19 @@ def check_for_update(force: bool = False) -> None:
             )
             sys.exit(1)
         return
-
+
     _update_last_check_time()
-
+
     if not _compare_versions(__version__, remote_version):
         if force:
             print(f"mdify is up to date (version {__version__})")
         return
-
-    print(f"\n{'='*50}")
+
+    print(f"\n{'=' * 50}")
     print(f"A new version of mdify-cli is available!")
     print(f"  Current version: {__version__}")
     print(f"  Latest version:  {remote_version}")
-    print(f"{'='*50}")
+    print(f"{'=' * 50}")
     print(f"\nTo upgrade, run:")
     print(f"  pipx upgrade mdify-cli")
     print(f"  # or: pip install --upgrade mdify-cli\n")
@@ -146,13 +150,18 @@ def check_for_update(force: bool = False) -> None:
 # Container runtime functions
 # =============================================================================
 
-def detect_runtime(preferred: str) -> Optional[str]:
+
+def detect_runtime(preferred: str, explicit: bool = True) -> Optional[str]:
     """
     Detect available container runtime.
-
+
     Args:
         preferred: Preferred runtime ('docker' or 'podman')
-
+        explicit: If True, warn when falling back to alternative.
+            If False, silently use alternative without warning.
+            Note: This only controls warning emission; selection order
+            is always preferred → alternative regardless of this flag.
+
     Returns:
         Path to runtime executable, or None if not found.
     """
@@ -160,25 +169,28 @@ def detect_runtime(preferred: str) -> Optional[str]:
     runtime_path = shutil.which(preferred)
     if runtime_path:
         return runtime_path
-
+
     # Try alternative
     alternative = "podman" if preferred == "docker" else "docker"
     runtime_path = shutil.which(alternative)
     if runtime_path:
-
+        if explicit:
+            print(
+                f"Warning: {preferred} not found, using {alternative}", file=sys.stderr
+            )
         return runtime_path
-
+
     return None
 
 
 def check_image_exists(runtime: str, image: str) -> bool:
     """
     Check if container image exists locally.
-
+
     Args:
         runtime: Path to container runtime
         image: Image name/tag
-
+
     Returns:
         True if image exists locally.
     """
@@ -196,18 +208,18 @@ def check_image_exists(runtime: str, image: str) -> bool:
 def pull_image(runtime: str, image: str, quiet: bool = False) -> bool:
     """
     Pull container image.
-
+
     Args:
         runtime: Path to container runtime
         image: Image name/tag
         quiet: Suppress progress output
-
+
     Returns:
         True if pull succeeded.
     """
     if not quiet:
         print(f"Pulling image: {image}")
-
+
     try:
         result = subprocess.run(
             [runtime, "pull", image],
@@ -222,9 +234,9 @@ def pull_image(runtime: str, image: str, quiet: bool = False) -> bool:
 
 def format_size(size_bytes: int) -> str:
     """Format file size in human-readable format."""
-    for unit in [
+    for unit in ["B", "KB", "MB", "GB"]:
         if size_bytes < 1024:
-            return f"{size_bytes:.1f} {unit}" if unit !=
+            return f"{size_bytes:.1f} {unit}" if unit != "B" else f"{size_bytes} {unit}"
         size_bytes /= 1024
     return f"{size_bytes:.1f} TB"
 
@@ -244,29 +256,33 @@ def format_duration(seconds: float) -> str:
 
 class Spinner:
     """A simple spinner to show progress during long operations."""
-
+
     def __init__(self):
-        self.frames = [
+        self.frames = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
         self.running = False
         self.thread = None
         self.start_time = None
-
+
     def _spin(self):
         idx = 0
         while self.running:
             elapsed = time.time() - self.start_time
             frame = self.frames[idx % len(self.frames)]
-            print(
+            print(
+                f"\r{self.prefix} {frame} ({format_duration(elapsed)})",
+                end="",
+                flush=True,
+            )
             idx += 1
             time.sleep(0.1)
-
+
     def start(self, prefix: str = ""):
         self.prefix = prefix
         self.running = True
         self.start_time = time.time()
         self.thread = threading.Thread(target=self._spin, daemon=True)
         self.thread.start()
-
+
     def stop(self):
         self.running = False
         if self.thread:
@@ -275,93 +291,45 @@ class Spinner:
         print(f"\r{' ' * 80}\r", end="", flush=True)
 
 
-def run_container(
-    runtime: str,
-    image: str,
-    input_file: Path,
-    output_file: Path,
-    mask_pii: bool = False,
-) -> Tuple[bool, str, float]:
-    """
-    Run container to convert a single file.
-
-    Args:
-        runtime: Path to container runtime
-        image: Image name/tag
-        input_file: Absolute path to input file
-        output_file: Absolute path to output file
-        mask_pii: Whether to mask PII in images
-
-    Returns:
-        Tuple of (success: bool, message: str, elapsed_seconds: float)
-    """
-    start_time = time.time()
-
-    # Ensure output directory exists
-    output_file.parent.mkdir(parents=True, exist_ok=True)
-
-    # Mount directories
-    input_dir = input_file.parent
-    output_dir = output_file.parent
-
-    # Container paths
-    container_in = f"/work/in/{input_file.name}"
-    container_out = f"/work/out/{output_file.name}"
-
-    cmd = [
-        runtime, "run", "--rm",
-        "-v", f"{input_dir}:/work/in:ro",
-        "-v", f"{output_dir}:/work/out",
-        image,
-        "--in", container_in,
-        "--out", container_out,
-    ]
-
-    if mask_pii:
-        cmd.append("--mask")
-
-    try:
-        result = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-            check=False,
-        )
-        elapsed = time.time() - start_time
-
-        if result.returncode == 0:
-            return True, "success", elapsed
-        else:
-            error_msg = result.stderr.strip() or result.stdout.strip() or "Unknown error"
-            return False, error_msg, elapsed
-
-    except OSError as e:
-        elapsed = time.time() - start_time
-        return False, str(e), elapsed
-
-
 # =============================================================================
 # File handling functions
 # =============================================================================
 
 # Supported file extensions (based on Docling InputFormat)
 SUPPORTED_EXTENSIONS = {
-
-
-
-
-
-
-
-
-
+    ".pdf",
+    ".docx",
+    ".pptx",
+    ".html",
+    ".htm",
+    ".png",
+    ".jpg",
+    ".jpeg",
+    ".gif",
+    ".bmp",
+    ".tiff",
+    ".tif",  # images
+    ".asciidoc",
+    ".adoc",
+    ".asc",  # asciidoc
+    ".md",
+    ".markdown",  # markdown
+    ".csv",
+    ".xlsx",  # spreadsheets
+    ".xml",  # XML formats
+    ".json",  # JSON docling
+    ".mp3",
+    ".wav",
+    ".m4a",
+    ".flac",  # audio
+    ".vtt",  # subtitles
 }
 
 
 def get_files_to_convert(input_path: Path, mask: str, recursive: bool) -> List[Path]:
     """Get list of files to convert based on input path and options."""
     files = []
-
+
     if input_path.is_file():
         files.append(input_path)
     elif input_path.is_dir():
@@ -369,19 +337,19 @@ def get_files_to_convert(input_path: Path, mask: str, recursive: bool) -> List[Path]:
             files = list(input_path.rglob(mask))
         else:
             files = list(input_path.glob(mask))
-
+
         # Filter to only files
         files = [f for f in files if f.is_file()]
     else:
         raise FileNotFoundError(f"Input path does not exist: {input_path}")
-
+
     # Filter out hidden files and unsupported formats
     files = [
-        f
-
-        and f.suffix.lower() in SUPPORTED_EXTENSIONS
+        f
+        for f in files
+        if not f.name.startswith(".") and f.suffix.lower() in SUPPORTED_EXTENSIONS
     ]
-
+
     return files
 
 
@@ -414,7 +382,7 @@ def get_output_path(
             output_path = output_dir / relative_path.parent / output_name
         except ValueError:
             output_path = output_dir / output_name
-
+
     return output_path
 
 
@@ -422,6 +390,7 @@ def get_output_path(
 # CLI argument parsing
 # =============================================================================
 
+
 def parse_args() -> argparse.Namespace:
     """Parse command line arguments."""
     parser = argparse.ArgumentParser(
@@ -436,74 +405,92 @@ Examples:
   mdify ./docs --runtime podman    Use Podman instead of Docker
 """,
     )
-
+
     parser.add_argument(
         "input",
         type=str,
         nargs="?",
         help="Input file or directory to convert",
     )
-
+
     parser.add_argument(
-        "-o",
+        "-o",
+        "--out-dir",
         type=str,
         default="output",
         help="Output directory for converted files (default: output)",
     )
-
+
     parser.add_argument(
-        "-g",
+        "-g",
+        "--glob",
        type=str,
        default="*",
        help="Glob pattern for filtering files in directory (default: *)",
    )
-
+
    parser.add_argument(
-        "-r",
+        "-r",
+        "--recursive",
        action="store_true",
        help="Recursively scan directories",
    )
-
+
    parser.add_argument(
        "--flat",
        action="store_true",
        help="Disable directory structure preservation in output",
    )
-
+
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing output files",
    )
-
+
    parser.add_argument(
-        "-q",
+        "-q",
+        "--quiet",
        action="store_true",
        help="Suppress progress messages",
    )
-
+
    parser.add_argument(
-        "-m",
+        "-m",
+        "--mask",
        action="store_true",
        help="Mask PII and sensitive content in document images",
    )
-
+
+    parser.add_argument(
+        "--gpu",
+        action="store_true",
+        help="Use GPU-accelerated container image (docling-serve-cu126)",
+    )
+
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=5001,
+        help="Port for docling-serve container (default: 5001)",
+    )
+
    # Container options
    parser.add_argument(
        "--runtime",
        type=str,
        choices=SUPPORTED_RUNTIMES,
-        default=
-        help="Container runtime to use (
+        default=None,
+        help="Container runtime to use (auto-detects docker or podman if not specified)",
    )
-
+
    parser.add_argument(
        "--image",
        type=str,
        default=DEFAULT_IMAGE,
        help=f"Container image to use (default: {DEFAULT_IMAGE})",
    )
-
+
    parser.add_argument(
        "--pull",
        type=str,
@@ -511,20 +498,20 @@ Examples:
         default="missing",
         help="Image pull policy: always, missing, never (default: missing)",
     )
-
+
     # Utility options
     parser.add_argument(
         "--check-update",
         action="store_true",
         help="Check for available updates and exit",
     )
-
+
     parser.add_argument(
         "--version",
         action="version",
         version=f"mdify {__version__}",
     )
-
+
     return parser.parse_args()
 
 
@@ -532,27 +519,30 @@ Examples:
 # Main entry point
 # =============================================================================
 
+
 def main() -> int:
     """Main entry point for the CLI."""
     args = parse_args()
-
+
     # Handle --check-update flag
     if args.check_update:
         check_for_update(force=True)
         return 0
-
+
     # Check for updates (daily, silent on errors)
     check_for_update(force=False)
-
+
     # Validate input is provided
     if args.input is None:
         print("Error: Input file or directory is required", file=sys.stderr)
         print("Usage: mdify <input> [options]", file=sys.stderr)
         print("  mdify --help for more information", file=sys.stderr)
         return 1
-
+
     # Detect container runtime
-
+    preferred = args.runtime if args.runtime else "docker"
+    explicit = args.runtime is not None
+    runtime = detect_runtime(preferred, explicit=explicit)
     if runtime is None:
         print(
             f"Error: Container runtime not found ({', '.join(SUPPORTED_RUNTIMES)})",
@@ -560,109 +550,166 @@ def main() -> int:
         )
         print("Please install Docker or Podman to use mdify.", file=sys.stderr)
         return 2
-
+
     # Handle image pull policy
-    image
+    # Determine image based on --gpu flag
+    if args.gpu:
+        image = GPU_IMAGE
+    elif args.image:
+        image = args.image
+    else:
+        image = DEFAULT_IMAGE
+
     image_exists = check_image_exists(runtime, image)
-
+
     if args.pull == "always" or (args.pull == "missing" and not image_exists):
         if not pull_image(runtime, image, args.quiet):
             print(f"Error: Failed to pull image: {image}", file=sys.stderr)
             return 1
     elif args.pull == "never" and not image_exists:
         print(f"Error: Image not found locally: {image}", file=sys.stderr)
-        print(f"Run with --pull=missing or pull manually: {
+        print(f"Run with --pull=missing or pull manually: {preferred} pull {image}")
         return 1
-
+
     # Resolve paths
     input_path = Path(args.input).resolve()
     output_dir = Path(args.out_dir).resolve()
-
+
     # Validate input
     if not input_path.exists():
         print(f"Error: Input path does not exist: {input_path}", file=sys.stderr)
         return 1
-
+
     # Get files to convert
     try:
         files_to_convert = get_files_to_convert(input_path, args.glob, args.recursive)
     except Exception as e:
         print(f"Error: {e}", file=sys.stderr)
         return 1
-
+
     if not files_to_convert:
         print(f"No files found to convert in: {input_path}", file=sys.stderr)
         return 1
-
+
     total_files = len(files_to_convert)
     total_size = sum(f.stat().st_size for f in files_to_convert)
-
+
     if not args.quiet:
         print(f"Found {total_files} file(s) to convert ({format_size(total_size)})")
         print(f"Using runtime: {runtime}")
         print(f"Using image: {image}")
         print()
-
+
+    if args.mask:
+        print(
+            "Warning: --mask is not supported with docling-serve and will be ignored",
+            file=sys.stderr,
+        )
+
     # Determine input base for directory structure preservation
     if input_path.is_file():
         input_base = input_path.parent
     else:
         input_base = input_path
-
-    # Convert files
+
     success_count = 0
     skipped_count = 0
     failed_count = 0
-
-
-
-    for idx, input_file in enumerate(files_to_convert, 1):
-        output_file = get_output_path(input_file, input_base, output_dir, args.flat)
-        file_size = input_file.stat().st_size
-        progress = f"[{idx}/{total_files}]"
-
-        # Check if output exists and skip if not overwriting
-        if output_file.exists() and not args.overwrite:
-            if not args.quiet:
-                print(f"{progress} Skipped (exists): {input_file.name}")
-            skipped_count += 1
-            continue
-
-        # Show spinner while processing
+    total_elapsed = 0.0
+
+    try:
         if not args.quiet:
-
-
-
-
-
-
+            print(f"Starting docling-serve container...")
+            print()
+
+        with DoclingContainer(runtime, image, args.port) as container:
+            # Convert files
+            conversion_start = time.time()
+            spinner = Spinner()
+
+            for idx, input_file in enumerate(files_to_convert, 1):
+                output_file = get_output_path(
+                    input_file, input_base, output_dir, args.flat
+                )
+                file_size = input_file.stat().st_size
+                progress = f"[{idx}/{total_files}]"
+
+                # Check if output exists and skip if not overwriting
+                if output_file.exists() and not args.overwrite:
+                    if not args.quiet:
+                        print(f"{progress} Skipped (exists): {input_file.name}")
+                    skipped_count += 1
+                    continue
+
+                # Ensure output directory exists
+                output_file.parent.mkdir(parents=True, exist_ok=True)
+
+                # Show spinner while processing
+                if not args.quiet:
+                    spinner.start(
+                        f"{progress} Processing: {input_file.name} ({format_size(file_size)})"
+                    )
+
+                start_time = time.time()
+                try:
+                    # Convert via HTTP API
+                    result = convert_file(
+                        container.base_url, input_file, to_format="md"
+                    )
+                    elapsed = time.time() - start_time
+
+                    if not args.quiet:
+                        spinner.stop()
+
+                    if result.success:
+                        # Write result to output file
+                        output_file.write_text(result.content)
+                        success_count += 1
+                        if not args.quiet:
+                            print(
+                                f"{progress} {input_file.name} ✓ ({format_duration(elapsed)})"
+                            )
+                    else:
+                        failed_count += 1
+                        error_msg = result.error or "Unknown error"
+                        if not args.quiet:
+                            print(
+                                f"{progress} {input_file.name} ✗ ({format_duration(elapsed)})"
+                            )
+                            print(f"  Error: {error_msg}", file=sys.stderr)
+                except Exception as e:
+                    elapsed = time.time() - start_time
+                    failed_count += 1
+                    if not args.quiet:
+                        spinner.stop()
+                        print(
+                            f"{progress} {input_file.name} ✗ ({format_duration(elapsed)})"
+                        )
+                        print(f"  Error: {str(e)}", file=sys.stderr)
+
+            total_elapsed = time.time() - conversion_start
+
+        # Print summary
         if not args.quiet:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            print(f"  Total files: {total_files}")
-            print(f"  Successful: {success_count}")
-            print(f"  Skipped: {skipped_count}")
-            print(f"  Failed: {failed_count}")
-            print(f"  Total time: {format_duration(total_elapsed)}")
-            print("=" * 50)
-
+            print()
+            print("=" * 50)
+            print("Conversion Summary:")
+            print(f"  Total files: {total_files}")
+            print(f"  Successful: {success_count}")
+            print(f"  Skipped: {skipped_count}")
+            print(f"  Failed: {failed_count}")
+            print(f"  Total time: {format_duration(total_elapsed)}")
+            print("=" * 50)
+
+    except KeyboardInterrupt:
+        if not args.quiet:
+            print("\n\nInterrupted by user. Container stopped.")
+        if success_count > 0 or skipped_count > 0 or failed_count > 0:
+            print(
+                f"Partial progress: {success_count} successful, {failed_count} failed, {skipped_count} skipped"
+            )
+        return 130
+
     # Return appropriate exit code
     if failed_count > 0:
         return 1
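Taken together, the cli.py changes replace the old per-file `run_container` subprocess call with one long-lived docling-serve container and an HTTP request per file. A minimal sketch of that flow, using only the `DoclingContainer` and `convert_file` APIs added in this version (the runtime, image, file list, and output directory below are illustrative, not taken from the CLI):

```python
from pathlib import Path

from mdify.container import DoclingContainer
from mdify.docling_client import convert_file

runtime = "docker"  # illustrative; the CLI resolves this via detect_runtime()
image = "ghcr.io/docling-project/docling-serve-cpu:main"
files = [Path("docs/report.pdf"), Path("docs/slides.pptx")]
out_dir = Path("output")
out_dir.mkdir(parents=True, exist_ok=True)

# The container is started once, health-checked, and reused for every file.
with DoclingContainer(runtime, image, port=5001) as container:
    for src in files:
        result = convert_file(container.base_url, src, to_format="md")
        if result.success:
            (out_dir / src.with_suffix(".md").name).write_text(result.content)
        else:
            print(f"{src.name} failed: {result.error}")
```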
mdify/container.py
ADDED
@@ -0,0 +1,128 @@
+"""Container lifecycle management for docling-serve."""
+
+import subprocess
+import time
+import uuid
+from typing import Optional
+
+from mdify.docling_client import check_health
+
+
+class DoclingContainer:
+    """Manages docling-serve container lifecycle.
+
+    Provides context manager support for automatic startup and cleanup.
+
+    Usage:
+        with DoclingContainer("docker", "ghcr.io/docling-project/docling-serve-cpu:main") as container:
+            # Container is running and healthy
+            response = requests.post(f"{container.base_url}/v1/convert/file", ...)
+        # Container automatically stopped and removed
+    """
+
+    def __init__(self, runtime: str, image: str, port: int = 5001):
+        """Initialize container manager.
+
+        Args:
+            runtime: Container runtime ("docker" or "podman")
+            image: Container image to use
+            port: Host port to bind (default: 5001)
+        """
+        self.runtime = runtime
+        self.image = image
+        self.port = port
+        self.container_name = f"mdify-serve-{uuid.uuid4().hex[:8]}"
+        self.container_id: Optional[str] = None
+
+    @property
+    def base_url(self) -> str:
+        """Return base URL for API requests."""
+        return f"http://localhost:{self.port}"
+
+    def start(self, timeout: int = 120) -> None:
+        """Start container and wait for health check.
+
+        Args:
+            timeout: Maximum seconds to wait for health (default: 120)
+
+        Raises:
+            subprocess.CalledProcessError: If container fails to start
+            TimeoutError: If health check doesn't pass within timeout
+        """
+        # Start container in detached mode
+        cmd = [
+            self.runtime,
+            "run",
+            "-d",  # Detached mode
+            "--rm",  # Auto-remove on stop
+            "--name",
+            self.container_name,
+            "-p",
+            f"{self.port}:5001",
+            self.image,
+        ]
+
+        try:
+            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+            self.container_id = result.stdout.strip()
+        except subprocess.CalledProcessError as e:
+            error_msg = e.stderr.strip() or e.stdout.strip() or "Unknown error"
+            raise subprocess.CalledProcessError(
+                e.returncode,
+                e.cmd,
+                output=e.stdout,
+                stderr=f"Failed to start container: {error_msg}",
+            )
+
+        # Wait for health check
+        self._wait_for_health(timeout)
+
+    def stop(self) -> None:
+        """Stop and remove container. Safe to call multiple times."""
+        if self.container_name:
+            subprocess.run(
+                [self.runtime, "stop", self.container_name],
+                capture_output=True,
+                check=False,
+            )
+
+    def is_ready(self) -> bool:
+        """Check if container is healthy.
+
+        Returns:
+            True if container is healthy, False otherwise
+        """
+        try:
+            return check_health(self.base_url)
+        except Exception:
+            return False
+
+    def _wait_for_health(self, timeout: int) -> None:
+        """Poll health endpoint until ready.
+
+        Args:
+            timeout: Maximum seconds to wait
+
+        Raises:
+            TimeoutError: If health check doesn't pass within timeout
+        """
+        start_time = time.time()
+        while time.time() - start_time < timeout:
+            try:
+                if check_health(self.base_url):
+                    return
+            except Exception:
+                pass
+            time.sleep(2)  # Poll every 2 seconds
+
+        raise TimeoutError(f"Container failed to become healthy within {timeout}s")
+
+    def __enter__(self):
+        """Context manager entry."""
+        self.start()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit - ensures cleanup."""
+        self.stop()
+        return False
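The context-manager form shown in the class docstring is equivalent to calling `start()` and `stop()` explicitly, which can be preferable when the container should outlive a single `with` block. A minimal sketch using only the methods defined above (the runtime and image are illustrative):

```python
from mdify.container import DoclingContainer

container = DoclingContainer(
    "podman", "ghcr.io/docling-project/docling-serve-cpu:main", port=5001
)
container.start(timeout=120)  # runs `podman run -d --rm -p 5001:5001 ...`, then polls /health
try:
    if container.is_ready():
        print(f"docling-serve is listening at {container.base_url}")
finally:
    container.stop()  # `podman stop <name>`; --rm removes the container afterwards
```

Because each instance gets a random `mdify-serve-<hex>` name, several invocations can run side by side without colliding.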
mdify/docling_client.py
ADDED
@@ -0,0 +1,224 @@
+"""HTTP client for docling-serve REST API."""
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+
+import requests
+
+
+@dataclass
+class ConvertResult:
+    """Result from document conversion."""
+
+    content: str
+    format: str
+    success: bool
+    error: Optional[str] = None
+
+
+@dataclass
+class StatusResult:
+    """Status of async conversion task."""
+
+    status: str  # "pending", "completed", "failed"
+    task_id: str
+    error: Optional[str] = None
+
+
+class DoclingClientError(Exception):
+    """Base exception for docling client errors."""
+
+    pass
+
+
+class DoclingHTTPError(DoclingClientError):
+    """HTTP error from docling-serve API."""
+
+    def __init__(self, status_code: int, message: str):
+        self.status_code = status_code
+        super().__init__(f"HTTP {status_code}: {message}")
+
+
+def check_health(base_url: str) -> bool:
+    """Check if docling-serve is healthy.
+
+    Args:
+        base_url: Base URL of docling-serve (e.g., "http://localhost:8000")
+
+    Returns:
+        True if healthy, False otherwise
+    """
+    try:
+        response = requests.get(f"{base_url}/health")
+        return response.status_code == 200
+    except requests.RequestException:
+        return False
+
+
+def convert_file(
+    base_url: str, file_path: Path, to_format: str = "md", do_ocr: bool = True
+) -> ConvertResult:
+    """Convert a file synchronously.
+
+    Args:
+        base_url: Base URL of docling-serve
+        file_path: Path to file to convert
+        to_format: Output format (default: "md")
+        do_ocr: Whether to perform OCR (default: True)
+
+    Returns:
+        ConvertResult with conversion output
+
+    Raises:
+        DoclingHTTPError: If HTTP request fails
+    """
+    try:
+        with open(file_path, "rb") as f:
+            response = requests.post(
+                f"{base_url}/v1/convert/file",
+                files={"files": (file_path.name, f, "application/pdf")},
+                data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
+            )
+
+        if response.status_code != 200:
+            raise DoclingHTTPError(
+                response.status_code, response.text or "Conversion failed"
+            )
+
+        result_data = response.json()
+
+        # docling-serve returns results in a list format
+        if isinstance(result_data, list) and len(result_data) > 0:
+            first_result = result_data[0]
+            return ConvertResult(
+                content=first_result.get("content", ""), format=to_format, success=True
+            )
+        elif isinstance(result_data, dict):
+            return ConvertResult(
+                content=result_data.get("content", ""), format=to_format, success=True
+            )
+        else:
+            raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")
+
+    except requests.RequestException as e:
+        return ConvertResult(content="", format=to_format, success=False, error=str(e))
+
+
+def convert_file_async(
+    base_url: str, file_path: Path, to_format: str = "md", do_ocr: bool = True
+) -> str:
+    """Start async file conversion.
+
+    Args:
+        base_url: Base URL of docling-serve
+        file_path: Path to file to convert
+        to_format: Output format (default: "md")
+        do_ocr: Whether to perform OCR (default: True)
+
+    Returns:
+        Task ID for polling
+
+    Raises:
+        DoclingHTTPError: If HTTP request fails
+    """
+    try:
+        with open(file_path, "rb") as f:
+            response = requests.post(
+                f"{base_url}/v1/convert/file/async",
+                files={"files": (file_path.name, f, "application/pdf")},
+                data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
+            )
+
+        if response.status_code != 200:
+            raise DoclingHTTPError(
+                response.status_code, response.text or "Async conversion failed"
+            )
+
+        result_data = response.json()
+        task_id = result_data.get("task_id")
+
+        if not task_id:
+            raise DoclingHTTPError(200, f"No task_id in response: {result_data}")
+
+        return task_id
+
+    except requests.RequestException as e:
+        raise DoclingHTTPError(500, str(e))
+
+
+def poll_status(base_url: str, task_id: str) -> StatusResult:
+    """Poll status of async conversion task.
+
+    Args:
+        base_url: Base URL of docling-serve
+        task_id: Task ID from convert_file_async
+
+    Returns:
+        StatusResult with current status
+
+    Raises:
+        DoclingHTTPError: If HTTP request fails
+    """
+    try:
+        response = requests.get(f"{base_url}/v1/status/poll/{task_id}")
+
+        if response.status_code != 200:
+            raise DoclingHTTPError(
+                response.status_code, response.text or "Status poll failed"
+            )
+
+        result_data = response.json()
+
+        return StatusResult(
+            status=result_data.get("status", "unknown"),
+            task_id=task_id,
+            error=result_data.get("error"),
+        )
+
+    except requests.RequestException as e:
+        raise DoclingHTTPError(500, str(e))
+
+
+def get_result(base_url: str, task_id: str) -> ConvertResult:
+    """Get result of completed async conversion.
+
+    Args:
+        base_url: Base URL of docling-serve
+        task_id: Task ID from convert_file_async
+
+    Returns:
+        ConvertResult with conversion output
+
+    Raises:
+        DoclingHTTPError: If HTTP request fails or task not completed
+    """
+    try:
+        response = requests.get(f"{base_url}/v1/result/{task_id}")
+
+        if response.status_code != 200:
+            raise DoclingHTTPError(
+                response.status_code, response.text or "Result retrieval failed"
+            )
+
+        result_data = response.json()
+
+        # Similar to sync conversion, handle list or dict format
+        if isinstance(result_data, list) and len(result_data) > 0:
+            first_result = result_data[0]
+            return ConvertResult(
+                content=first_result.get("content", ""),
+                format=first_result.get("format", "md"),
+                success=True,
+            )
+        elif isinstance(result_data, dict):
+            return ConvertResult(
+                content=result_data.get("content", ""),
+                format=result_data.get("format", "md"),
+                success=True,
+            )
+        else:
+            raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")
+
+    except requests.RequestException as e:
+        return ConvertResult(content="", format="md", success=False, error=str(e))
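Only the synchronous `convert_file` path is exercised by cli.py in this release; the async helpers above are not used anywhere in the diff. A minimal sketch of how they compose against a running docling-serve instance — the 2-second poll interval is an assumption, and the "completed"/"failed" status strings are taken from the `StatusResult` docstring rather than verified against the server:

```python
import time
from pathlib import Path

from mdify.docling_client import convert_file_async, get_result, poll_status

base_url = "http://localhost:5001"  # e.g. DoclingContainer.base_url

# Submit the document; the server returns a task id for later polling.
task_id = convert_file_async(base_url, Path("docs/report.pdf"), to_format="md")

# Poll until the task leaves the pending state (interval is an assumption).
while True:
    status = poll_status(base_url, task_id)
    if status.status in ("completed", "failed"):
        break
    time.sleep(2)

if status.status == "completed":
    result = get_result(base_url, task_id)
    Path("report.md").write_text(result.content)
else:
    print(f"Conversion failed: {status.error}")
```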
{mdify_cli-1.5.0.dist-info → mdify_cli-2.0.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mdify-cli
-Version:
+Version: 2.0.0
 Summary: Convert PDFs and document images into structured Markdown for LLM workflows
 Author: tiroq
 License-Expression: MIT
@@ -24,6 +24,9 @@ Classifier: Topic :: Utilities
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: requests
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0; extra == "dev"
 Dynamic: license-file
 
 # mdify
@@ -98,15 +101,32 @@ Recursively convert files:
 mdify /path/to/documents -r -g "*.pdf"
 ```
 
-###
+### GPU Acceleration
 
-
+For faster processing with NVIDIA GPU:
 ```bash
-mdify
-mdify document.pdf --mask
+mdify --gpu documents/*.pdf
 ```
 
-
+Requires NVIDIA GPU with CUDA support and nvidia-container-toolkit.
+
+### ⚠️ PII Masking (Deprecated)
+
+The `--mask` flag is deprecated and will be ignored in this version. PII masking functionality was available in older versions using a custom runtime but is not supported with the current docling-serve backend.
+
+If PII masking is critical for your use case, please use mdify v1.5.x or earlier versions.
+
+## Performance
+
+mdify now uses docling-serve for significantly faster batch processing:
+
+- **Single model load**: Models are loaded once per session, not per file
+- **~10-20x speedup** for multiple file conversions compared to previous versions
+- **GPU acceleration**: Use `--gpu` for additional 2-6x speedup (requires NVIDIA GPU)
+
+### First Run Behavior
+
+The first conversion takes longer (~30-60s) as the container loads ML models into memory. Subsequent files in the same batch process quickly, typically in 1-3 seconds per file.
 
 ## Options
 
@@ -119,9 +139,11 @@ This uses Docling's content-aware masking to obscure sensitive information in em
 | `--flat` | Disable directory structure preservation |
 | `--overwrite` | Overwrite existing output files |
 | `-q, --quiet` | Suppress progress messages |
-| `-m, --mask` |
+| `-m, --mask` | ⚠️ **Deprecated**: PII masking not supported in current version |
+| `--gpu` | Use GPU-accelerated container (requires NVIDIA GPU and nvidia-container-toolkit) |
+| `--port PORT` | Container port (default: 5001) |
 | `--runtime RUNTIME` | Container runtime: docker or podman (auto-detected) |
-| `--image IMAGE` | Custom container image (default: ghcr.io/
+| `--image IMAGE` | Custom container image (default: ghcr.io/docling-project/docling-serve-cpu:main) |
 | `--pull POLICY` | Image pull policy: always, missing, never (default: missing) |
 | `--check-update` | Check for available updates and exit |
 | `--version` | Show version and exit |
@@ -175,19 +197,22 @@ The CLI:
 - Pulls the runtime container on first use
 - Mounts files and runs conversions in the container
 
-## Container
+## Container Images
+
+mdify uses official docling-serve containers:
 
-
+**CPU Version** (default):
 ```
-ghcr.io/
+ghcr.io/docling-project/docling-serve-cpu:main
 ```
 
-
-```
-
-docker build -t mdify-runtime .
+**GPU Version** (use with `--gpu` flag):
+```
+ghcr.io/docling-project/docling-serve-cu126:main
 ```
 
+These are official images from the [docling-serve project](https://github.com/DS4SD/docling-serve).
+
 ## Updates
 
 mdify checks for updates daily. When a new version is available:
mdify_cli-2.0.0.dist-info/RECORD
ADDED

@@ -0,0 +1,12 @@
+assets/mdify.png,sha256=qUj7WXWqNwpI2KNXOW79XJwqFqa-UI0JEkmt1mmy4Rg,1820418
+mdify/__init__.py,sha256=s7XlWmH4zJ5jFiPjpd7mXrCaU8bD-S9RaPzT2VHUdeQ,90
+mdify/__main__.py,sha256=bhpJ00co6MfaVOdH4XLoW04NtLYDa_oJK7ODzfLrn9M,143
+mdify/cli.py,sha256=LY0q8NlnKuN5aFz_OpO5hGro-tQNCxoYO_M0qVd6FJY,21493
+mdify/container.py,sha256=AVIhiq_wO5id5hQ_s83lUPkAPCsAoTs25azRT6JmKII,3962
+mdify/docling_client.py,sha256=_9qjL5yOOeJahOg6an2P6Iii1xkeR6wmNJZG4Q6NRkk,6553
+mdify_cli-2.0.0.dist-info/licenses/LICENSE,sha256=NWM66Uv-XuSMKaU-gaPmvfyk4WgE6zcIPr78wyg6GAo,1065
+mdify_cli-2.0.0.dist-info/METADATA,sha256=92_uBI2nnKK-YEf39TB7gX1KHbZBHqIHxLZBe7-GOqY,7923
+mdify_cli-2.0.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+mdify_cli-2.0.0.dist-info/entry_points.txt,sha256=0Xki8f5lADQUtwdt6Eq_FEaieI6Byhk8UE7BuDhChMg,41
+mdify_cli-2.0.0.dist-info/top_level.txt,sha256=qltzf7h8owHq7dxCdfCkSHY8gT21hn1_E8P-VWS_OKM,6
+mdify_cli-2.0.0.dist-info/RECORD,,
mdify_cli-1.5.0.dist-info/RECORD
DELETED
@@ -1,10 +0,0 @@
-assets/mdify.png,sha256=qUj7WXWqNwpI2KNXOW79XJwqFqa-UI0JEkmt1mmy4Rg,1820418
-mdify/__init__.py,sha256=GxfVEOJLubSaiA0jNE2zgZq7sxiJMAr6Qn-cLPK8XJU,90
-mdify/__main__.py,sha256=bhpJ00co6MfaVOdH4XLoW04NtLYDa_oJK7ODzfLrn9M,143
-mdify/cli.py,sha256=D8_1_6NgWXkexGWqkgB0JO7c1r2T2_Va7J7iGwvewQA,20038
-mdify_cli-1.5.0.dist-info/licenses/LICENSE,sha256=NWM66Uv-XuSMKaU-gaPmvfyk4WgE6zcIPr78wyg6GAo,1065
-mdify_cli-1.5.0.dist-info/METADATA,sha256=LRKZupINA7w6HM9FyuDdmrLzWYovHqxTnQRHNohmRM0,6658
-mdify_cli-1.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mdify_cli-1.5.0.dist-info/entry_points.txt,sha256=0Xki8f5lADQUtwdt6Eq_FEaieI6Byhk8UE7BuDhChMg,41
-mdify_cli-1.5.0.dist-info/top_level.txt,sha256=qltzf7h8owHq7dxCdfCkSHY8gT21hn1_E8P-VWS_OKM,6
-mdify_cli-1.5.0.dist-info/RECORD,,
{mdify_cli-1.5.0.dist-info → mdify_cli-2.0.0.dist-info}/entry_points.txt
File without changes

{mdify_cli-1.5.0.dist-info → mdify_cli-2.0.0.dist-info}/licenses/LICENSE
File without changes

{mdify_cli-1.5.0.dist-info → mdify_cli-2.0.0.dist-info}/top_level.txt
File without changes