mdify-cli 1.6.0__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdify/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """mdify - Convert documents to Markdown via Docling container."""
2
2
 
3
- __version__ = "1.6.0"
3
+ __version__ = "2.1.0"
mdify/cli.py CHANGED
@@ -21,6 +21,8 @@ from urllib.error import URLError
21
21
  from urllib.request import urlopen
22
22
 
23
23
  from . import __version__
24
+ from mdify.container import DoclingContainer
25
+ from mdify.docling_client import convert_file
24
26
 
25
27
  # Configuration
26
28
  MDIFY_HOME = Path.home() / ".mdify"
@@ -29,7 +31,8 @@ PYPI_API_URL = "https://pypi.org/pypi/mdify-cli/json"
29
31
  CHECK_INTERVAL_SECONDS = 86400 # 24 hours
30
32
 
31
33
  # Container configuration
32
- DEFAULT_IMAGE = "ghcr.io/tiroq/mdify-runtime:latest"
34
+ DEFAULT_IMAGE = "ghcr.io/docling-project/docling-serve-cpu:main"
35
+ GPU_IMAGE = "ghcr.io/docling-project/docling-serve-cu126:main"
33
36
  SUPPORTED_RUNTIMES = ("docker", "podman")
34
37
 
35
38
 
@@ -288,79 +291,6 @@ class Spinner:
288
291
  print(f"\r{' ' * 80}\r", end="", flush=True)
289
292
 
290
293
 
291
- def run_container(
292
- runtime: str,
293
- image: str,
294
- input_file: Path,
295
- output_file: Path,
296
- mask_pii: bool = False,
297
- ) -> Tuple[bool, str, float]:
298
- """
299
- Run container to convert a single file.
300
-
301
- Args:
302
- runtime: Path to container runtime
303
- image: Image name/tag
304
- input_file: Absolute path to input file
305
- output_file: Absolute path to output file
306
- mask_pii: Whether to mask PII in images
307
-
308
- Returns:
309
- Tuple of (success: bool, message: str, elapsed_seconds: float)
310
- """
311
- start_time = time.time()
312
-
313
- # Ensure output directory exists
314
- output_file.parent.mkdir(parents=True, exist_ok=True)
315
-
316
- # Mount directories
317
- input_dir = input_file.parent
318
- output_dir = output_file.parent
319
-
320
- # Container paths
321
- container_in = f"/work/in/{input_file.name}"
322
- container_out = f"/work/out/{output_file.name}"
323
-
324
- cmd = [
325
- runtime,
326
- "run",
327
- "--rm",
328
- "-v",
329
- f"{input_dir}:/work/in:ro",
330
- "-v",
331
- f"{output_dir}:/work/out",
332
- image,
333
- "--in",
334
- container_in,
335
- "--out",
336
- container_out,
337
- ]
338
-
339
- if mask_pii:
340
- cmd.append("--mask")
341
-
342
- try:
343
- result = subprocess.run(
344
- cmd,
345
- capture_output=True,
346
- text=True,
347
- check=False,
348
- )
349
- elapsed = time.time() - start_time
350
-
351
- if result.returncode == 0:
352
- return True, "success", elapsed
353
- else:
354
- error_msg = (
355
- result.stderr.strip() or result.stdout.strip() or "Unknown error"
356
- )
357
- return False, error_msg, elapsed
358
-
359
- except OSError as e:
360
- elapsed = time.time() - start_time
361
- return False, str(e), elapsed
362
-
363
-
364
294
  # =============================================================================
365
295
  # File handling functions
366
296
  # =============================================================================
@@ -532,6 +462,19 @@ Examples:
532
462
  help="Mask PII and sensitive content in document images",
533
463
  )
534
464
 
465
+ parser.add_argument(
466
+ "--gpu",
467
+ action="store_true",
468
+ help="Use GPU-accelerated container image (docling-serve-cu126)",
469
+ )
470
+
471
+ parser.add_argument(
472
+ "--port",
473
+ type=int,
474
+ default=5001,
475
+ help="Port for docling-serve container (default: 5001)",
476
+ )
477
+
535
478
  # Container options
536
479
  parser.add_argument(
537
480
  "--runtime",
@@ -609,7 +552,14 @@ def main() -> int:
609
552
  return 2
610
553
 
611
554
  # Handle image pull policy
612
- image = args.image
555
+ # Determine image based on --gpu flag
556
+ if args.gpu:
557
+ image = GPU_IMAGE
558
+ elif args.image:
559
+ image = args.image
560
+ else:
561
+ image = DEFAULT_IMAGE
562
+
613
563
  image_exists = check_image_exists(runtime, image)
614
564
 
615
565
  if args.pull == "always" or (args.pull == "missing" and not image_exists):
@@ -621,9 +571,15 @@ def main() -> int:
621
571
  print(f"Run with --pull=missing or pull manually: {preferred} pull {image}")
622
572
  return 1
623
573
 
624
- # Resolve paths
625
- input_path = Path(args.input).resolve()
626
- output_dir = Path(args.out_dir).resolve()
574
+ # Resolve paths (use absolute() as fallback if resolve() fails due to permissions)
575
+ try:
576
+ input_path = Path(args.input).resolve()
577
+ except PermissionError:
578
+ input_path = Path(args.input).absolute()
579
+ try:
580
+ output_dir = Path(args.out_dir).resolve()
581
+ except PermissionError:
582
+ output_dir = Path(args.out_dir).absolute()
627
583
 
628
584
  # Validate input
629
585
  if not input_path.exists():
@@ -650,67 +606,115 @@ def main() -> int:
650
606
  print(f"Using image: {image}")
651
607
  print()
652
608
 
609
+ if args.mask:
610
+ print(
611
+ "Warning: --mask is not supported with docling-serve and will be ignored",
612
+ file=sys.stderr,
613
+ )
614
+
653
615
  # Determine input base for directory structure preservation
654
616
  if input_path.is_file():
655
617
  input_base = input_path.parent
656
618
  else:
657
619
  input_base = input_path
658
620
 
659
- # Convert files
660
621
  success_count = 0
661
622
  skipped_count = 0
662
623
  failed_count = 0
663
- conversion_start = time.time()
664
- spinner = Spinner()
665
-
666
- for idx, input_file in enumerate(files_to_convert, 1):
667
- output_file = get_output_path(input_file, input_base, output_dir, args.flat)
668
- file_size = input_file.stat().st_size
669
- progress = f"[{idx}/{total_files}]"
670
-
671
- # Check if output exists and skip if not overwriting
672
- if output_file.exists() and not args.overwrite:
673
- if not args.quiet:
674
- print(f"{progress} Skipped (exists): {input_file.name}")
675
- skipped_count += 1
676
- continue
677
-
678
- # Show spinner while processing
679
- if not args.quiet:
680
- spinner.start(
681
- f"{progress} Processing: {input_file.name} ({format_size(file_size)})"
682
- )
683
-
684
- success, result, elapsed = run_container(
685
- runtime, image, input_file, output_file, args.mask
686
- )
624
+ total_elapsed = 0.0
687
625
 
626
+ try:
688
627
  if not args.quiet:
689
- spinner.stop()
690
-
691
- if success:
692
- success_count += 1
693
- if not args.quiet:
694
- print(f"{progress} {input_file.name} ✓ ({format_duration(elapsed)})")
695
- else:
696
- failed_count += 1
697
- if not args.quiet:
698
- print(f"{progress} {input_file.name} ({format_duration(elapsed)})")
699
- print(f" Error: {result}", file=sys.stderr)
700
-
701
- total_elapsed = time.time() - conversion_start
702
-
703
- # Print summary
704
- if not args.quiet:
705
- print()
706
- print("=" * 50)
707
- print("Conversion Summary:")
708
- print(f" Total files: {total_files}")
709
- print(f" Successful: {success_count}")
710
- print(f" Skipped: {skipped_count}")
711
- print(f" Failed: {failed_count}")
712
- print(f" Total time: {format_duration(total_elapsed)}")
713
- print("=" * 50)
628
+ print(f"Starting docling-serve container...")
629
+ print()
630
+
631
+ with DoclingContainer(runtime, image, args.port) as container:
632
+ # Convert files
633
+ conversion_start = time.time()
634
+ spinner = Spinner()
635
+
636
+ for idx, input_file in enumerate(files_to_convert, 1):
637
+ output_file = get_output_path(
638
+ input_file, input_base, output_dir, args.flat
639
+ )
640
+ file_size = input_file.stat().st_size
641
+ progress = f"[{idx}/{total_files}]"
642
+
643
+ # Check if output exists and skip if not overwriting
644
+ if output_file.exists() and not args.overwrite:
645
+ if not args.quiet:
646
+ print(f"{progress} Skipped (exists): {input_file.name}")
647
+ skipped_count += 1
648
+ continue
649
+
650
+ # Ensure output directory exists
651
+ output_file.parent.mkdir(parents=True, exist_ok=True)
652
+
653
+ # Show spinner while processing
654
+ if not args.quiet:
655
+ spinner.start(
656
+ f"{progress} Processing: {input_file.name} ({format_size(file_size)})"
657
+ )
658
+
659
+ start_time = time.time()
660
+ try:
661
+ # Convert via HTTP API
662
+ result = convert_file(
663
+ container.base_url, input_file, to_format="md"
664
+ )
665
+ elapsed = time.time() - start_time
666
+
667
+ if not args.quiet:
668
+ spinner.stop()
669
+
670
+ if result.success:
671
+ # Write result to output file
672
+ output_file.write_text(result.content)
673
+ success_count += 1
674
+ if not args.quiet:
675
+ print(
676
+ f"{progress} {input_file.name} ✓ ({format_duration(elapsed)})"
677
+ )
678
+ else:
679
+ failed_count += 1
680
+ error_msg = result.error or "Unknown error"
681
+ if not args.quiet:
682
+ print(
683
+ f"{progress} {input_file.name} ✗ ({format_duration(elapsed)})"
684
+ )
685
+ print(f" Error: {error_msg}", file=sys.stderr)
686
+ except Exception as e:
687
+ elapsed = time.time() - start_time
688
+ failed_count += 1
689
+ if not args.quiet:
690
+ spinner.stop()
691
+ print(
692
+ f"{progress} {input_file.name} ✗ ({format_duration(elapsed)})"
693
+ )
694
+ print(f" Error: {str(e)}", file=sys.stderr)
695
+
696
+ total_elapsed = time.time() - conversion_start
697
+
698
+ # Print summary
699
+ if not args.quiet:
700
+ print()
701
+ print("=" * 50)
702
+ print("Conversion Summary:")
703
+ print(f" Total files: {total_files}")
704
+ print(f" Successful: {success_count}")
705
+ print(f" Skipped: {skipped_count}")
706
+ print(f" Failed: {failed_count}")
707
+ print(f" Total time: {format_duration(total_elapsed)}")
708
+ print("=" * 50)
709
+
710
+ except KeyboardInterrupt:
711
+ if not args.quiet:
712
+ print("\n\nInterrupted by user. Container stopped.")
713
+ if success_count > 0 or skipped_count > 0 or failed_count > 0:
714
+ print(
715
+ f"Partial progress: {success_count} successful, {failed_count} failed, {skipped_count} skipped"
716
+ )
717
+ return 130
714
718
 
715
719
  # Return appropriate exit code
716
720
  if failed_count > 0:
mdify/container.py ADDED
@@ -0,0 +1,128 @@
1
+ """Container lifecycle management for docling-serve."""
2
+
3
+ import subprocess
4
+ import time
5
+ import uuid
6
+ from typing import Optional
7
+
8
+ from mdify.docling_client import check_health
9
+
10
+
11
+ class DoclingContainer:
12
+ """Manages docling-serve container lifecycle.
13
+
14
+ Provides context manager support for automatic startup and cleanup.
15
+
16
+ Usage:
17
+ with DoclingContainer("docker", "ghcr.io/docling-project/docling-serve-cpu:main") as container:
18
+ # Container is running and healthy
19
+ response = requests.post(f"{container.base_url}/v1/convert/file", ...)
20
+ # Container automatically stopped and removed
21
+ """
22
+
23
+ def __init__(self, runtime: str, image: str, port: int = 5001):
24
+ """Initialize container manager.
25
+
26
+ Args:
27
+ runtime: Container runtime ("docker" or "podman")
28
+ image: Container image to use
29
+ port: Host port to bind (default: 5001)
30
+ """
31
+ self.runtime = runtime
32
+ self.image = image
33
+ self.port = port
34
+ self.container_name = f"mdify-serve-{uuid.uuid4().hex[:8]}"
35
+ self.container_id: Optional[str] = None
36
+
37
+ @property
38
+ def base_url(self) -> str:
39
+ """Return base URL for API requests."""
40
+ return f"http://localhost:{self.port}"
41
+
42
+ def start(self, timeout: int = 120) -> None:
43
+ """Start container and wait for health check.
44
+
45
+ Args:
46
+ timeout: Maximum seconds to wait for health (default: 120)
47
+
48
+ Raises:
49
+ subprocess.CalledProcessError: If container fails to start
50
+ TimeoutError: If health check doesn't pass within timeout
51
+ """
52
+ # Start container in detached mode
53
+ cmd = [
54
+ self.runtime,
55
+ "run",
56
+ "-d", # Detached mode
57
+ "--rm", # Auto-remove on stop
58
+ "--name",
59
+ self.container_name,
60
+ "-p",
61
+ f"{self.port}:5001",
62
+ self.image,
63
+ ]
64
+
65
+ try:
66
+ result = subprocess.run(cmd, capture_output=True, text=True, check=True)
67
+ self.container_id = result.stdout.strip()
68
+ except subprocess.CalledProcessError as e:
69
+ error_msg = e.stderr.strip() or e.stdout.strip() or "Unknown error"
70
+ raise subprocess.CalledProcessError(
71
+ e.returncode,
72
+ e.cmd,
73
+ output=e.stdout,
74
+ stderr=f"Failed to start container: {error_msg}",
75
+ )
76
+
77
+ # Wait for health check
78
+ self._wait_for_health(timeout)
79
+
80
+ def stop(self) -> None:
81
+ """Stop and remove container. Safe to call multiple times."""
82
+ if self.container_name:
83
+ subprocess.run(
84
+ [self.runtime, "stop", self.container_name],
85
+ capture_output=True,
86
+ check=False,
87
+ )
88
+
89
+ def is_ready(self) -> bool:
90
+ """Check if container is healthy.
91
+
92
+ Returns:
93
+ True if container is healthy, False otherwise
94
+ """
95
+ try:
96
+ return check_health(self.base_url)
97
+ except Exception:
98
+ return False
99
+
100
+ def _wait_for_health(self, timeout: int) -> None:
101
+ """Poll health endpoint until ready.
102
+
103
+ Args:
104
+ timeout: Maximum seconds to wait
105
+
106
+ Raises:
107
+ TimeoutError: If health check doesn't pass within timeout
108
+ """
109
+ start_time = time.time()
110
+ while time.time() - start_time < timeout:
111
+ try:
112
+ if check_health(self.base_url):
113
+ return
114
+ except Exception:
115
+ pass
116
+ time.sleep(2) # Poll every 2 seconds
117
+
118
+ raise TimeoutError(f"Container failed to become healthy within {timeout}s")
119
+
120
+ def __enter__(self):
121
+ """Context manager entry."""
122
+ self.start()
123
+ return self
124
+
125
+ def __exit__(self, exc_type, exc_val, exc_tb):
126
+ """Context manager exit - ensures cleanup."""
127
+ self.stop()
128
+ return False
@@ -0,0 +1,224 @@
1
+ """HTTP client for docling-serve REST API."""
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ import requests
8
+
9
+
10
+ @dataclass
11
+ class ConvertResult:
12
+ """Result from document conversion."""
13
+
14
+ content: str
15
+ format: str
16
+ success: bool
17
+ error: Optional[str] = None
18
+
19
+
20
+ @dataclass
21
+ class StatusResult:
22
+ """Status of async conversion task."""
23
+
24
+ status: str # "pending", "completed", "failed"
25
+ task_id: str
26
+ error: Optional[str] = None
27
+
28
+
29
+ class DoclingClientError(Exception):
30
+ """Base exception for docling client errors."""
31
+
32
+ pass
33
+
34
+
35
+ class DoclingHTTPError(DoclingClientError):
36
+ """HTTP error from docling-serve API."""
37
+
38
+ def __init__(self, status_code: int, message: str):
39
+ self.status_code = status_code
40
+ super().__init__(f"HTTP {status_code}: {message}")
41
+
42
+
43
+ def check_health(base_url: str) -> bool:
44
+ """Check if docling-serve is healthy.
45
+
46
+ Args:
47
+ base_url: Base URL of docling-serve (e.g., "http://localhost:8000")
48
+
49
+ Returns:
50
+ True if healthy, False otherwise
51
+ """
52
+ try:
53
+ response = requests.get(f"{base_url}/health")
54
+ return response.status_code == 200
55
+ except requests.RequestException:
56
+ return False
57
+
58
+
59
+ def convert_file(
60
+ base_url: str, file_path: Path, to_format: str = "md", do_ocr: bool = True
61
+ ) -> ConvertResult:
62
+ """Convert a file synchronously.
63
+
64
+ Args:
65
+ base_url: Base URL of docling-serve
66
+ file_path: Path to file to convert
67
+ to_format: Output format (default: "md")
68
+ do_ocr: Whether to perform OCR (default: True)
69
+
70
+ Returns:
71
+ ConvertResult with conversion output
72
+
73
+ Raises:
74
+ DoclingHTTPError: If HTTP request fails
75
+ """
76
+ try:
77
+ with open(file_path, "rb") as f:
78
+ response = requests.post(
79
+ f"{base_url}/v1/convert/file",
80
+ files={"files": (file_path.name, f, "application/pdf")},
81
+ data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
82
+ )
83
+
84
+ if response.status_code != 200:
85
+ raise DoclingHTTPError(
86
+ response.status_code, response.text or "Conversion failed"
87
+ )
88
+
89
+ result_data = response.json()
90
+
91
+ # docling-serve returns results in a list format
92
+ if isinstance(result_data, list) and len(result_data) > 0:
93
+ first_result = result_data[0]
94
+ return ConvertResult(
95
+ content=first_result.get("content", ""), format=to_format, success=True
96
+ )
97
+ elif isinstance(result_data, dict):
98
+ return ConvertResult(
99
+ content=result_data.get("content", ""), format=to_format, success=True
100
+ )
101
+ else:
102
+ raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")
103
+
104
+ except requests.RequestException as e:
105
+ return ConvertResult(content="", format=to_format, success=False, error=str(e))
106
+
107
+
108
+ def convert_file_async(
109
+ base_url: str, file_path: Path, to_format: str = "md", do_ocr: bool = True
110
+ ) -> str:
111
+ """Start async file conversion.
112
+
113
+ Args:
114
+ base_url: Base URL of docling-serve
115
+ file_path: Path to file to convert
116
+ to_format: Output format (default: "md")
117
+ do_ocr: Whether to perform OCR (default: True)
118
+
119
+ Returns:
120
+ Task ID for polling
121
+
122
+ Raises:
123
+ DoclingHTTPError: If HTTP request fails
124
+ """
125
+ try:
126
+ with open(file_path, "rb") as f:
127
+ response = requests.post(
128
+ f"{base_url}/v1/convert/file/async",
129
+ files={"files": (file_path.name, f, "application/pdf")},
130
+ data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
131
+ )
132
+
133
+ if response.status_code != 200:
134
+ raise DoclingHTTPError(
135
+ response.status_code, response.text or "Async conversion failed"
136
+ )
137
+
138
+ result_data = response.json()
139
+ task_id = result_data.get("task_id")
140
+
141
+ if not task_id:
142
+ raise DoclingHTTPError(200, f"No task_id in response: {result_data}")
143
+
144
+ return task_id
145
+
146
+ except requests.RequestException as e:
147
+ raise DoclingHTTPError(500, str(e))
148
+
149
+
150
+ def poll_status(base_url: str, task_id: str) -> StatusResult:
151
+ """Poll status of async conversion task.
152
+
153
+ Args:
154
+ base_url: Base URL of docling-serve
155
+ task_id: Task ID from convert_file_async
156
+
157
+ Returns:
158
+ StatusResult with current status
159
+
160
+ Raises:
161
+ DoclingHTTPError: If HTTP request fails
162
+ """
163
+ try:
164
+ response = requests.get(f"{base_url}/v1/status/poll/{task_id}")
165
+
166
+ if response.status_code != 200:
167
+ raise DoclingHTTPError(
168
+ response.status_code, response.text or "Status poll failed"
169
+ )
170
+
171
+ result_data = response.json()
172
+
173
+ return StatusResult(
174
+ status=result_data.get("status", "unknown"),
175
+ task_id=task_id,
176
+ error=result_data.get("error"),
177
+ )
178
+
179
+ except requests.RequestException as e:
180
+ raise DoclingHTTPError(500, str(e))
181
+
182
+
183
+ def get_result(base_url: str, task_id: str) -> ConvertResult:
184
+ """Get result of completed async conversion.
185
+
186
+ Args:
187
+ base_url: Base URL of docling-serve
188
+ task_id: Task ID from convert_file_async
189
+
190
+ Returns:
191
+ ConvertResult with conversion output
192
+
193
+ Raises:
194
+ DoclingHTTPError: If HTTP request fails or task not completed
195
+ """
196
+ try:
197
+ response = requests.get(f"{base_url}/v1/result/{task_id}")
198
+
199
+ if response.status_code != 200:
200
+ raise DoclingHTTPError(
201
+ response.status_code, response.text or "Result retrieval failed"
202
+ )
203
+
204
+ result_data = response.json()
205
+
206
+ # Similar to sync conversion, handle list or dict format
207
+ if isinstance(result_data, list) and len(result_data) > 0:
208
+ first_result = result_data[0]
209
+ return ConvertResult(
210
+ content=first_result.get("content", ""),
211
+ format=first_result.get("format", "md"),
212
+ success=True,
213
+ )
214
+ elif isinstance(result_data, dict):
215
+ return ConvertResult(
216
+ content=result_data.get("content", ""),
217
+ format=result_data.get("format", "md"),
218
+ success=True,
219
+ )
220
+ else:
221
+ raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")
222
+
223
+ except requests.RequestException as e:
224
+ return ConvertResult(content="", format="md", success=False, error=str(e))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mdify-cli
3
- Version: 1.6.0
3
+ Version: 2.1.0
4
4
  Summary: Convert PDFs and document images into structured Markdown for LLM workflows
5
5
  Author: tiroq
6
6
  License-Expression: MIT
@@ -24,6 +24,7 @@ Classifier: Topic :: Utilities
24
24
  Requires-Python: >=3.8
25
25
  Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
+ Requires-Dist: requests
27
28
  Provides-Extra: dev
28
29
  Requires-Dist: pytest>=7.0; extra == "dev"
29
30
  Dynamic: license-file
@@ -100,15 +101,32 @@ Recursively convert files:
100
101
  mdify /path/to/documents -r -g "*.pdf"
101
102
  ```
102
103
 
103
- ### Masking sensitive content
104
+ ### GPU Acceleration
104
105
 
105
- Mask PII and sensitive content in images:
106
+ For faster processing with an NVIDIA GPU:
106
107
  ```bash
107
- mdify document.pdf -m
108
- mdify document.pdf --mask
108
+ mdify --gpu documents/*.pdf
109
109
  ```
110
110
 
111
- This uses Docling's content-aware masking to obscure sensitive information in embedded images.
111
+ Requires an NVIDIA GPU with CUDA support and the nvidia-container-toolkit.
112
+
113
+ ### ⚠️ PII Masking (Deprecated)
114
+
115
+ The `--mask` flag is deprecated and will be ignored in this version. PII masking functionality was available in older versions using a custom runtime but is not supported with the current docling-serve backend.
116
+
117
+ If PII masking is critical for your use case, please use mdify v1.6.x or earlier.
118
+
119
+ ## Performance
120
+
121
+ mdify now uses docling-serve for significantly faster batch processing:
122
+
123
+ - **Single model load**: Models are loaded once per session, not per file
124
+ - **~10-20x speedup** for multiple file conversions compared to previous versions
125
+ - **GPU acceleration**: Use `--gpu` for an additional 2-6x speedup (requires NVIDIA GPU)
126
+
127
+ ### First Run Behavior
128
+
129
+ The first conversion takes longer (~30-60s) as the container loads ML models into memory. Subsequent files in the same batch process quickly, typically in 1-3 seconds per file.
112
130
 
113
131
  ## Options
114
132
 
@@ -121,9 +139,11 @@ This uses Docling's content-aware masking to obscure sensitive information in em
121
139
  | `--flat` | Disable directory structure preservation |
122
140
  | `--overwrite` | Overwrite existing output files |
123
141
  | `-q, --quiet` | Suppress progress messages |
124
- | `-m, --mask` | Mask PII and sensitive content in images |
142
+ | `-m, --mask` | ⚠️ **Deprecated**: PII masking not supported in current version |
143
+ | `--gpu` | Use GPU-accelerated container (requires NVIDIA GPU and nvidia-container-toolkit) |
144
+ | `--port PORT` | Host port bound to the docling-serve container (default: 5001) |
125
145
  | `--runtime RUNTIME` | Container runtime: docker or podman (auto-detected) |
126
- | `--image IMAGE` | Custom container image (default: ghcr.io/tiroq/mdify-runtime:latest) |
146
+ | `--image IMAGE` | Custom container image (default: ghcr.io/docling-project/docling-serve-cpu:main) |
127
147
  | `--pull POLICY` | Image pull policy: always, missing, never (default: missing) |
128
148
  | `--check-update` | Check for available updates and exit |
129
149
  | `--version` | Show version and exit |
@@ -177,19 +197,22 @@ The CLI:
177
197
  - Pulls the runtime container on first use
178
198
  - Mounts files and runs conversions in the container
179
199
 
180
- ## Container Image
200
+ ## Container Images
201
+
202
+ mdify uses official docling-serve containers:
181
203
 
182
- The runtime container is hosted at:
204
+ **CPU Version** (default):
183
205
  ```
184
- ghcr.io/tiroq/mdify-runtime:latest
206
+ ghcr.io/docling-project/docling-serve-cpu:main
185
207
  ```
186
208
 
187
- To build locally:
188
- ```bash
189
- cd runtime
190
- docker build -t mdify-runtime .
209
+ **GPU Version** (use with `--gpu` flag):
210
+ ```
211
+ ghcr.io/docling-project/docling-serve-cu126:main
191
212
  ```
192
213
 
214
+ These are official images from the [docling-serve project](https://github.com/DS4SD/docling-serve).
215
+
193
216
  ## Updates
194
217
 
195
218
  mdify checks for updates daily. When a new version is available:
@@ -0,0 +1,12 @@
1
+ assets/mdify.png,sha256=qUj7WXWqNwpI2KNXOW79XJwqFqa-UI0JEkmt1mmy4Rg,1820418
2
+ mdify/__init__.py,sha256=OxKblQk_woS_-5Z6N5BMndbfQBdvopK1KrnSMAt-4A0,90
3
+ mdify/__main__.py,sha256=bhpJ00co6MfaVOdH4XLoW04NtLYDa_oJK7ODzfLrn9M,143
4
+ mdify/cli.py,sha256=0lRT1XCV9YMMJQ68kH93mrGfFtAC6GgXS-OGEn9mnGw,21742
5
+ mdify/container.py,sha256=AVIhiq_wO5id5hQ_s83lUPkAPCsAoTs25azRT6JmKII,3962
6
+ mdify/docling_client.py,sha256=_9qjL5yOOeJahOg6an2P6Iii1xkeR6wmNJZG4Q6NRkk,6553
7
+ mdify_cli-2.1.0.dist-info/licenses/LICENSE,sha256=NWM66Uv-XuSMKaU-gaPmvfyk4WgE6zcIPr78wyg6GAo,1065
8
+ mdify_cli-2.1.0.dist-info/METADATA,sha256=IaVJcjFLlyCKUIhWBntq47owI7Izrbj6t7Ce0EBrUwg,7923
9
+ mdify_cli-2.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
10
+ mdify_cli-2.1.0.dist-info/entry_points.txt,sha256=0Xki8f5lADQUtwdt6Eq_FEaieI6Byhk8UE7BuDhChMg,41
11
+ mdify_cli-2.1.0.dist-info/top_level.txt,sha256=qltzf7h8owHq7dxCdfCkSHY8gT21hn1_E8P-VWS_OKM,6
12
+ mdify_cli-2.1.0.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- assets/mdify.png,sha256=qUj7WXWqNwpI2KNXOW79XJwqFqa-UI0JEkmt1mmy4Rg,1820418
2
- mdify/__init__.py,sha256=tvxIF7MWdoaHBgdk4tT81csn-ZhTTOlfooBYqM4YsMg,90
3
- mdify/__main__.py,sha256=bhpJ00co6MfaVOdH4XLoW04NtLYDa_oJK7ODzfLrn9M,143
4
- mdify/cli.py,sha256=sDwkOf4H33l7WmfAR3tw2MjO-7kuIOHcrQXTZto6bF0,20460
5
- mdify_cli-1.6.0.dist-info/licenses/LICENSE,sha256=NWM66Uv-XuSMKaU-gaPmvfyk4WgE6zcIPr78wyg6GAo,1065
6
- mdify_cli-1.6.0.dist-info/METADATA,sha256=W-tubNyeCkt6_GAYmS59JHwka8FxQ4D5ZxbHDKFhaLQ,6721
7
- mdify_cli-1.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
8
- mdify_cli-1.6.0.dist-info/entry_points.txt,sha256=0Xki8f5lADQUtwdt6Eq_FEaieI6Byhk8UE7BuDhChMg,41
9
- mdify_cli-1.6.0.dist-info/top_level.txt,sha256=qltzf7h8owHq7dxCdfCkSHY8gT21hn1_E8P-VWS_OKM,6
10
- mdify_cli-1.6.0.dist-info/RECORD,,