mdify-cli 1.4.1__py3-none-any.whl → 2.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdify/__init__.py +1 -1
- mdify/cli.py +587 -219
- mdify/container.py +167 -0
- mdify/docling_client.py +263 -0
- {mdify_cli-1.4.1.dist-info → mdify_cli-2.9.1.dist-info}/METADATA +92 -20
- mdify_cli-2.9.1.dist-info/RECORD +12 -0
- {mdify_cli-1.4.1.dist-info → mdify_cli-2.9.1.dist-info}/WHEEL +1 -1
- mdify_cli-1.4.1.dist-info/RECORD +0 -10
- {mdify_cli-1.4.1.dist-info → mdify_cli-2.9.1.dist-info}/entry_points.txt +0 -0
- {mdify_cli-1.4.1.dist-info → mdify_cli-2.9.1.dist-info}/licenses/LICENSE +0 -0
- {mdify_cli-1.4.1.dist-info → mdify_cli-2.9.1.dist-info}/top_level.txt +0 -0
mdify/container.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Container lifecycle management for docling-serve."""
|
|
2
|
+
|
|
3
|
+
import subprocess
|
|
4
|
+
import time
|
|
5
|
+
import uuid
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from mdify.docling_client import check_health
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DoclingContainer:
    """Manage the lifecycle of a docling-serve container.

    The class shells out to a container runtime CLI to start a detached,
    auto-removing container, waits until its health endpoint answers, and
    stops it again on cleanup. It can be used as a context manager.

    Usage:
        with DoclingContainer("docker", "ghcr.io/docling-project/docling-serve-cpu:main") as container:
            # Container is running and healthy
            response = requests.post(f"{container.base_url}/v1/convert/file", ...)
        # Container automatically stopped and removed
    """

    # Seconds to wait between two health probes.
    _POLL_INTERVAL = 2

    def __init__(self, runtime: str, image: str, port: int = 5001, timeout: int = 1200):
        """Initialize container manager.

        Args:
            runtime: Container runtime executable ("docker" or "podman")
            image: Container image to use
            port: Host port to bind (default: 5001)
            timeout: Conversion timeout in seconds (default: 1200); passed to
                the container as DOCLING_SERVE_MAX_SYNC_WAIT
        """
        self.runtime = runtime
        self.image = image
        self.port = port
        self.timeout = timeout
        # A random suffix keeps names of separate instances from colliding.
        self.container_name = f"mdify-serve-{uuid.uuid4().hex[:8]}"
        self.container_id: Optional[str] = None

    @property
    def base_url(self) -> str:
        """Return base URL for API requests."""
        return f"http://localhost:{self.port}"

    def _cleanup_stale_containers(self) -> None:
        """Stop any running containers whose name matches mdify-serve-*.

        This handles the case where a previous run left a container running
        (e.g., due to crash, interrupt, or timeout). Failures are ignored:
        cleanup is best-effort.

        NOTE(review): the name filter also matches containers started by a
        concurrently running mdify process - confirm that is intended.
        """
        result = subprocess.run(
            [
                self.runtime,
                "ps",
                "--filter",
                "name=mdify-serve-",
                "--format",
                "{{.Names}}",
            ],
            capture_output=True,
            text=True,
            check=False,
        )

        if result.returncode != 0:
            return

        for container_name in result.stdout.splitlines():
            container_name = container_name.strip()
            if container_name:
                subprocess.run(
                    [self.runtime, "stop", container_name],
                    capture_output=True,
                    check=False,
                )

    def start(self, timeout: int = 120) -> None:
        """Start container and wait for health check.

        If the container starts but does not become healthy (or the wait is
        interrupted), the container is stopped again before the exception
        propagates, so no container is left running.

        Args:
            timeout: Maximum seconds to wait for health (default: 120)

        Raises:
            subprocess.CalledProcessError: If container fails to start
            TimeoutError: If health check doesn't pass within timeout
        """
        self._cleanup_stale_containers()

        cmd = [
            self.runtime,
            "run",
            "-d",  # Detached mode
            "--rm",  # Auto-remove on stop
            "--name",
            self.container_name,
            "-p",
            f"{self.port}:5001",
            "-e",
            f"DOCLING_SERVE_MAX_SYNC_WAIT={self.timeout}",
            self.image,
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as e:
            error_msg = (
                (e.stderr or "").strip() or (e.stdout or "").strip() or "Unknown error"
            )
            raise subprocess.CalledProcessError(
                e.returncode,
                e.cmd,
                output=e.stdout,
                stderr=f"Failed to start container: {error_msg}",
            ) from e
        self.container_id = result.stdout.strip()

        try:
            self._wait_for_health(timeout)
        except BaseException:
            # When start() is called from __enter__ and raises, __exit__ is
            # never invoked, so the cleanup has to happen here. BaseException
            # is used so that Ctrl-C during the wait is covered as well.
            self.stop()
            raise

    def stop(self) -> None:
        """Stop and remove container. Safe to call multiple times."""
        if self.container_name:
            # check=False: stopping an already-gone container is not an error.
            subprocess.run(
                [self.runtime, "stop", self.container_name],
                capture_output=True,
                check=False,
            )

    def is_ready(self) -> bool:
        """Check if container is healthy.

        Returns:
            True if container is healthy, False otherwise (including when the
            health probe itself raises)
        """
        try:
            return check_health(self.base_url)
        except Exception:
            return False

    def _wait_for_health(self, timeout: int) -> None:
        """Poll health endpoint until ready.

        Args:
            timeout: Maximum seconds to wait

        Raises:
            TimeoutError: If health check doesn't pass within timeout
        """
        # monotonic() is unaffected by wall-clock adjustments during the wait.
        deadline = time.monotonic() + timeout
        while True:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            if self.is_ready():
                return
            # Never sleep past the deadline.
            time.sleep(min(self._POLL_INTERVAL, max(deadline - time.monotonic(), 0)))

        raise TimeoutError(f"Container failed to become healthy within {timeout}s")

    def __enter__(self):
        """Context manager entry: start the container and return self."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit - ensures cleanup; exceptions are not suppressed."""
        self.stop()
        return False
|
mdify/docling_client.py
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
"""HTTP client for docling-serve REST API."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
import mimetypes
|
|
8
|
+
|
|
9
|
+
import requests
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class ConvertResult:
    """Outcome of one document conversion request."""

    # Text extracted from the service response ("" when nothing was found
    # or the request failed).
    content: str
    # Output format identifier, e.g. "md".
    format: str
    # False when the request failed at the transport level.
    success: bool
    # Human-readable failure description; None when there was no error.
    error: Optional[str] = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class StatusResult:
    """Snapshot of an asynchronous conversion task's state."""

    # State string reported by the service, e.g. "pending", "completed",
    # "failed".
    status: str
    # Identifier of the task this snapshot belongs to.
    task_id: str
    # Error text reported for the task, if any.
    error: Optional[str] = None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class DoclingClientError(Exception):
    """Root of the exception hierarchy raised by the docling client."""
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class DoclingHTTPError(DoclingClientError):
    """Error carrying the HTTP status code of a failed docling-serve call.

    The exception message has the form "HTTP <status_code>: <message>".
    """

    def __init__(self, status_code: int, message: str):
        detail = f"HTTP {status_code}: {message}"
        super().__init__(detail)
        # Kept as an attribute so callers can branch on the numeric status.
        self.status_code = status_code
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _get_mime_type(file_path: Path) -> str:
|
|
46
|
+
"""Get MIME type for file, with fallback for unknown types."""
|
|
47
|
+
mime_type, _ = mimetypes.guess_type(str(file_path))
|
|
48
|
+
return mime_type or "application/octet-stream"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _extract_content(result_data) -> str:
|
|
52
|
+
"""Extract content from API response, supporting both old and new formats.
|
|
53
|
+
|
|
54
|
+
Supports:
|
|
55
|
+
- New format: {"document": {"md_content": "..."}}
|
|
56
|
+
- Fallback: {"document": {"content": "..."}}
|
|
57
|
+
- Old format: {"content": "..."}
|
|
58
|
+
- List format: [{"document": {...}} or {"content": "..."}]
|
|
59
|
+
|
|
60
|
+
Args:
|
|
61
|
+
result_data: Response data from docling-serve API
|
|
62
|
+
|
|
63
|
+
Returns:
|
|
64
|
+
Extracted content string, or empty string if not found
|
|
65
|
+
"""
|
|
66
|
+
if isinstance(result_data, dict):
|
|
67
|
+
# New format with document field
|
|
68
|
+
if "document" in result_data:
|
|
69
|
+
doc = result_data["document"]
|
|
70
|
+
# Try md_content first, then content
|
|
71
|
+
return doc.get("md_content", "") or doc.get("content", "")
|
|
72
|
+
# Old format without document field
|
|
73
|
+
return result_data.get("content", "")
|
|
74
|
+
elif isinstance(result_data, list) and len(result_data) > 0:
|
|
75
|
+
# List format - process first item
|
|
76
|
+
first_result = result_data[0]
|
|
77
|
+
if isinstance(first_result, dict):
|
|
78
|
+
if "document" in first_result:
|
|
79
|
+
doc = first_result["document"]
|
|
80
|
+
# Try md_content first, then content
|
|
81
|
+
return doc.get("md_content", "") or doc.get("content", "")
|
|
82
|
+
# Old format without document field
|
|
83
|
+
return first_result.get("content", "")
|
|
84
|
+
return ""
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def check_health(base_url: str, timeout: float = 5.0) -> bool:
    """Check if docling-serve is healthy.

    Args:
        base_url: Base URL of docling-serve (e.g., "http://localhost:5001")
        timeout: Seconds to wait for the health endpoint before treating the
            service as unhealthy (default: 5.0). Without a timeout a single
            unresponsive connection could block the caller indefinitely.

    Returns:
        True if the /health endpoint answered with HTTP 200, False otherwise
        (including connection errors and timeouts)
    """
    try:
        response = requests.get(f"{base_url}/health", timeout=timeout)
    except requests.RequestException:
        return False
    return response.status_code == 200
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def convert_file(
    base_url: str,
    file_path: Path,
    to_format: str = "md",
    do_ocr: bool = True,
    timeout: Optional[float] = None,
) -> ConvertResult:
    """Convert a file synchronously.

    Args:
        base_url: Base URL of docling-serve
        file_path: Path to file to convert
        to_format: Output format (default: "md")
        do_ocr: Whether to perform OCR (default: True)
        timeout: Optional timeout in seconds handed to the HTTP request.
            The default (None) waits indefinitely, as before.

    Returns:
        ConvertResult with conversion output. If the request fails with a
        requests.RequestException (connection error, timeout, ...), a
        ConvertResult with success=False and the error text is returned
        instead of raising.

    Raises:
        DoclingHTTPError: If the server answers with a non-200 status or
            the response body has an unexpected shape
        OSError: If file_path cannot be opened
    """
    # Accept plain strings as well; .name below needs a Path.
    file_path = Path(file_path)

    try:
        with open(file_path, "rb") as f:
            response = requests.post(
                f"{base_url}/v1/convert/file",
                files={"files": (file_path.name, f, _get_mime_type(file_path))},
                data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
                timeout=timeout,
            )

        if response.status_code != 200:
            raise DoclingHTTPError(
                response.status_code, response.text or "Conversion failed"
            )

        result_data = response.json()
        content = _extract_content(result_data)

        # A dict or list payload counts as success even when no content
        # could be extracted from it.
        if not content and not isinstance(result_data, (dict, list)):
            raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")

        return ConvertResult(content=content, format=to_format, success=True)

    except requests.RequestException as e:
        return ConvertResult(content="", format=to_format, success=False, error=str(e))
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def convert_file_async(
    base_url: str,
    file_path: Path,
    to_format: str = "md",
    do_ocr: bool = True,
    timeout: Optional[float] = None,
) -> str:
    """Start async file conversion.

    Args:
        base_url: Base URL of docling-serve
        file_path: Path to file to convert
        to_format: Output format (default: "md")
        do_ocr: Whether to perform OCR (default: True)
        timeout: Optional timeout in seconds handed to the HTTP request.
            The default (None) waits indefinitely, as before.

    Returns:
        Task ID for polling

    Raises:
        DoclingHTTPError: If the server answers with a non-200 status, the
            response carries no task_id, or the request itself fails. In
            the last case the status code is a synthetic 500 and the
            original requests exception is attached as __cause__.
        OSError: If file_path cannot be opened
    """
    # Accept plain strings as well; .name below needs a Path.
    file_path = Path(file_path)

    try:
        with open(file_path, "rb") as f:
            response = requests.post(
                f"{base_url}/v1/convert/file/async",
                files={"files": (file_path.name, f, _get_mime_type(file_path))},
                data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
                timeout=timeout,
            )

        if response.status_code != 200:
            raise DoclingHTTPError(
                response.status_code, response.text or "Async conversion failed"
            )

        result_data = response.json()
        # Guard against a non-object JSON body (list, string, null, ...).
        task_id = result_data.get("task_id") if isinstance(result_data, dict) else None

        if not task_id:
            raise DoclingHTTPError(200, f"No task_id in response: {result_data}")

        return task_id

    except requests.RequestException as e:
        raise DoclingHTTPError(500, str(e)) from e
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def poll_status(
    base_url: str, task_id: str, timeout: Optional[float] = None
) -> StatusResult:
    """Poll status of async conversion task.

    Args:
        base_url: Base URL of docling-serve
        task_id: Task ID from convert_file_async
        timeout: Optional timeout in seconds handed to the HTTP request.
            The default (None) waits indefinitely, as before.

    Returns:
        StatusResult with current status; the status is "unknown" when the
        response carries no "status" field

    Raises:
        DoclingHTTPError: If the server answers with a non-200 status, the
            response body is not a JSON object, or the request itself
            fails. In the last case the status code is a synthetic 500 and
            the original requests exception is attached as __cause__.
    """
    try:
        response = requests.get(
            f"{base_url}/v1/status/poll/{task_id}", timeout=timeout
        )

        if response.status_code != 200:
            raise DoclingHTTPError(
                response.status_code, response.text or "Status poll failed"
            )

        result_data = response.json()

        # .get() below requires a JSON object.
        if not isinstance(result_data, dict):
            raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")

        return StatusResult(
            status=result_data.get("status", "unknown"),
            task_id=task_id,
            error=result_data.get("error"),
        )

    except requests.RequestException as e:
        raise DoclingHTTPError(500, str(e)) from e
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def get_result(
    base_url: str, task_id: str, timeout: Optional[float] = None
) -> ConvertResult:
    """Get result of completed async conversion.

    Args:
        base_url: Base URL of docling-serve
        task_id: Task ID from convert_file_async
        timeout: Optional timeout in seconds handed to the HTTP request.
            The default (None) waits indefinitely, as before.

    Returns:
        ConvertResult with conversion output. If the request fails with a
        requests.RequestException (connection error, timeout, ...), a
        ConvertResult with success=False and the error text is returned
        instead of raising.

    Raises:
        DoclingHTTPError: If the server answers with a non-200 status or
            the response body has an unexpected shape
    """
    try:
        response = requests.get(f"{base_url}/v1/result/{task_id}", timeout=timeout)

        if response.status_code != 200:
            raise DoclingHTTPError(
                response.status_code, response.text or "Result retrieval failed"
            )

        result_data = response.json()

        if not isinstance(result_data, (dict, list)):
            raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")

        content = _extract_content(result_data)

        # The format is read from the payload itself, or from the first
        # entry when the payload is a list; "md" is the default.
        source = result_data
        if isinstance(result_data, list):
            source = result_data[0] if result_data else None

        result_format = "md"
        if isinstance(source, dict):
            # "or" also covers an explicit null format in the payload.
            result_format = source.get("format") or "md"

        return ConvertResult(
            content=content,
            format=result_format,
            success=True,
        )

    except requests.RequestException as e:
        return ConvertResult(content="", format="md", success=False, error=str(e))
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mdify-cli
|
|
3
|
-
Version:
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 2.9.1
|
|
4
|
+
Summary: Convert PDFs and document images into structured Markdown for LLM workflows
|
|
5
5
|
Author: tiroq
|
|
6
6
|
License-Expression: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/tiroq/mdify
|
|
@@ -24,6 +24,9 @@ Classifier: Topic :: Utilities
|
|
|
24
24
|
Requires-Python: >=3.8
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
26
|
License-File: LICENSE
|
|
27
|
+
Requires-Dist: requests
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
27
30
|
Dynamic: license-file
|
|
28
31
|
|
|
29
32
|
# mdify
|
|
@@ -34,12 +37,15 @@ Dynamic: license-file
|
|
|
34
37
|
[](https://github.com/tiroq/mdify/pkgs/container/mdify-runtime)
|
|
35
38
|
[](https://opensource.org/licenses/MIT)
|
|
36
39
|
|
|
37
|
-
A lightweight CLI for converting documents to Markdown. The CLI is fast to install via pipx, while the heavy ML conversion
|
|
40
|
+
A lightweight CLI for converting documents to Markdown. The CLI is fast to install via pipx, while the heavy ML conversion runs inside a container.
|
|
38
41
|
|
|
39
42
|
## Requirements
|
|
40
43
|
|
|
41
44
|
- **Python 3.8+**
|
|
42
|
-
- **Docker** or
|
|
45
|
+
- **Docker**, **Podman**, or native macOS container tools (for document conversion)
|
|
46
|
+
- On macOS: Supports Apple Container (macOS 26+), OrbStack, Colima, Podman, or Docker Desktop
|
|
47
|
+
- On Linux: Docker or Podman
|
|
48
|
+
- Auto-detects available tools
|
|
43
49
|
|
|
44
50
|
## Installation
|
|
45
51
|
|
|
@@ -53,6 +59,13 @@ pipx install mdify-cli
|
|
|
53
59
|
|
|
54
60
|
Restart your terminal after installation.
|
|
55
61
|
|
|
62
|
+
For containerized document conversion, install one of these (or use Docker Desktop):
|
|
63
|
+
- **Apple Container** (macOS 26+): Download from https://github.com/apple/container/releases
|
|
64
|
+
- **OrbStack** (recommended): `brew install orbstack`
|
|
65
|
+
- **Colima**: `brew install colima && colima start`
|
|
66
|
+
- **Podman**: `brew install podman && podman machine init && podman machine start`
|
|
67
|
+
- **Docker Desktop**: Available at https://www.docker.com/products/docker-desktop
|
|
68
|
+
|
|
56
69
|
### Linux
|
|
57
70
|
|
|
58
71
|
```bash
|
|
@@ -98,15 +111,32 @@ Recursively convert files:
|
|
|
98
111
|
mdify /path/to/documents -r -g "*.pdf"
|
|
99
112
|
```
|
|
100
113
|
|
|
101
|
-
###
|
|
114
|
+
### GPU Acceleration
|
|
102
115
|
|
|
103
|
-
|
|
116
|
+
For faster processing with NVIDIA GPU:
|
|
104
117
|
```bash
|
|
105
|
-
mdify
|
|
106
|
-
mdify document.pdf --mask
|
|
118
|
+
mdify --gpu documents/*.pdf
|
|
107
119
|
```
|
|
108
120
|
|
|
109
|
-
|
|
121
|
+
Requires NVIDIA GPU with CUDA support and nvidia-container-toolkit.
|
|
122
|
+
|
|
123
|
+
### ⚠️ PII Masking (Deprecated)
|
|
124
|
+
|
|
125
|
+
The `--mask` flag is deprecated and will be ignored in this version. PII masking functionality was available in older versions using a custom runtime but is not supported with the current docling-serve backend.
|
|
126
|
+
|
|
127
|
+
If PII masking is critical for your use case, please use mdify v1.5.x or earlier versions.
|
|
128
|
+
|
|
129
|
+
## Performance
|
|
130
|
+
|
|
131
|
+
mdify now uses docling-serve for significantly faster batch processing:
|
|
132
|
+
|
|
133
|
+
- **Single model load**: Models are loaded once per session, not per file
|
|
134
|
+
- **~10-20x speedup** for multiple file conversions compared to previous versions
|
|
135
|
+
- **GPU acceleration**: Use `--gpu` for additional 2-6x speedup (requires NVIDIA GPU)
|
|
136
|
+
|
|
137
|
+
### First Run Behavior
|
|
138
|
+
|
|
139
|
+
The first conversion takes longer (~30-60s) as the container loads ML models into memory. Subsequent files in the same batch process quickly, typically in 1-3 seconds per file.
|
|
110
140
|
|
|
111
141
|
## Options
|
|
112
142
|
|
|
@@ -119,14 +149,53 @@ This uses Docling's content-aware masking to obscure sensitive information in em
|
|
|
119
149
|
| `--flat` | Disable directory structure preservation |
|
|
120
150
|
| `--overwrite` | Overwrite existing output files |
|
|
121
151
|
| `-q, --quiet` | Suppress progress messages |
|
|
122
|
-
| `-m, --mask` |
|
|
123
|
-
| `--
|
|
124
|
-
| `--
|
|
152
|
+
| `-m, --mask` | ⚠️ **Deprecated**: PII masking not supported in current version |
|
|
153
|
+
| `--gpu` | Use GPU-accelerated container (requires NVIDIA GPU and nvidia-container-toolkit) |
|
|
154
|
+
| `--port PORT` | Container port (default: 5001) |
|
|
155
|
+
| `--runtime RUNTIME` | Container runtime: docker, podman, orbstack, colima, or container (auto-detected) |
|
|
156
|
+
| `--image IMAGE` | Custom container image (default: ghcr.io/docling-project/docling-serve-cpu:main) |
|
|
125
157
|
| `--pull POLICY` | Image pull policy: always, missing, never (default: missing) |
|
|
126
158
|
| `--check-update` | Check for available updates and exit |
|
|
127
159
|
| `--version` | Show version and exit |
|
|
128
160
|
|
|
129
|
-
###
|
|
161
|
+
### Container Runtime Selection
|
|
162
|
+
|
|
163
|
+
mdify automatically detects and uses the best available container runtime. The detection order differs by platform:
|
|
164
|
+
|
|
165
|
+
**macOS (recommended):**
|
|
166
|
+
1. Apple Container (native, macOS 26+ required)
|
|
167
|
+
2. OrbStack (lightweight, fast)
|
|
168
|
+
3. Colima (open-source alternative)
|
|
169
|
+
4. Podman (via Podman machine)
|
|
170
|
+
5. Docker Desktop (full Docker)
|
|
171
|
+
|
|
172
|
+
**Linux:**
|
|
173
|
+
1. Docker
|
|
174
|
+
2. Podman
|
|
175
|
+
|
|
176
|
+
**Override runtime:**
|
|
177
|
+
Use the `MDIFY_CONTAINER_RUNTIME` environment variable to force a specific runtime:
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
export MDIFY_CONTAINER_RUNTIME=orbstack
|
|
181
|
+
mdify document.pdf
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
Or inline:
|
|
185
|
+
```bash
|
|
186
|
+
MDIFY_CONTAINER_RUNTIME=colima mdify document.pdf
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
**Supported values:** `docker`, `podman`, `orbstack`, `colima`, `container`
|
|
190
|
+
|
|
191
|
+
If the selected runtime is installed but not running, mdify will display a helpful warning:
|
|
192
|
+
```
|
|
193
|
+
Warning: Found container runtime(s) but daemon is not running:
|
|
194
|
+
- orbstack (/opt/homebrew/bin/orbstack)
|
|
195
|
+
|
|
196
|
+
Please start one of these tools before running mdify.
|
|
197
|
+
macOS tip: Start OrbStack, Colima, or Podman Desktop application
|
|
198
|
+
```
|
|
130
199
|
|
|
131
200
|
With `--flat`, all output files are placed directly in the output directory. Directory paths are incorporated into filenames to prevent collisions:
|
|
132
201
|
|
|
@@ -175,19 +244,22 @@ The CLI:
|
|
|
175
244
|
- Pulls the runtime container on first use
|
|
176
245
|
- Mounts files and runs conversions in the container
|
|
177
246
|
|
|
178
|
-
## Container
|
|
247
|
+
## Container Images
|
|
248
|
+
|
|
249
|
+
mdify uses official docling-serve containers:
|
|
179
250
|
|
|
180
|
-
|
|
251
|
+
**CPU Version** (default):
|
|
181
252
|
```
|
|
182
|
-
ghcr.io/
|
|
253
|
+
ghcr.io/docling-project/docling-serve-cpu:main
|
|
183
254
|
```
|
|
184
255
|
|
|
185
|
-
|
|
186
|
-
```
|
|
187
|
-
|
|
188
|
-
docker build -t mdify-runtime .
|
|
256
|
+
**GPU Version** (use with `--gpu` flag):
|
|
257
|
+
```
|
|
258
|
+
ghcr.io/docling-project/docling-serve-cu126:main
|
|
189
259
|
```
|
|
190
260
|
|
|
261
|
+
These are official images from the [docling-serve project](https://github.com/docling-project/docling-serve).
|
|
262
|
+
|
|
191
263
|
## Updates
|
|
192
264
|
|
|
193
265
|
mdify checks for updates daily. When a new version is available:
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
assets/mdify.png,sha256=qUj7WXWqNwpI2KNXOW79XJwqFqa-UI0JEkmt1mmy4Rg,1820418
|
|
2
|
+
mdify/__init__.py,sha256=6F0BWnZCqC4Il4XjrXJ4_Uaa1airfJKQXqY8INPQqPI,90
|
|
3
|
+
mdify/__main__.py,sha256=bhpJ00co6MfaVOdH4XLoW04NtLYDa_oJK7ODzfLrn9M,143
|
|
4
|
+
mdify/cli.py,sha256=V3WoX7Kmm5jzjtWt2a0v-kIkfYPrF-V7M5mLlZ0y-B0,33634
|
|
5
|
+
mdify/container.py,sha256=tkk0nv7EquL-rKUY4nkS_yGITb7mqw8B7eEfuqaeVrg,5239
|
|
6
|
+
mdify/docling_client.py,sha256=xuQR6sC1v3EPloOSwExoHCqT4uUxE8myYq-Yeby3C2I,7975
|
|
7
|
+
mdify_cli-2.9.1.dist-info/licenses/LICENSE,sha256=NWM66Uv-XuSMKaU-gaPmvfyk4WgE6zcIPr78wyg6GAo,1065
|
|
8
|
+
mdify_cli-2.9.1.dist-info/METADATA,sha256=ex5ghiFyguru9C7jWyXZSQDmtZUO65ROJcHWicuyC98,9622
|
|
9
|
+
mdify_cli-2.9.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
10
|
+
mdify_cli-2.9.1.dist-info/entry_points.txt,sha256=0Xki8f5lADQUtwdt6Eq_FEaieI6Byhk8UE7BuDhChMg,41
|
|
11
|
+
mdify_cli-2.9.1.dist-info/top_level.txt,sha256=qltzf7h8owHq7dxCdfCkSHY8gT21hn1_E8P-VWS_OKM,6
|
|
12
|
+
mdify_cli-2.9.1.dist-info/RECORD,,
|
mdify_cli-1.4.1.dist-info/RECORD
DELETED
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
assets/mdify.png,sha256=qUj7WXWqNwpI2KNXOW79XJwqFqa-UI0JEkmt1mmy4Rg,1820418
|
|
2
|
-
mdify/__init__.py,sha256=NWY-5XYsO7gQZs9c4utyzGda6anA_FDBB2LNNUIqsdo,90
|
|
3
|
-
mdify/__main__.py,sha256=bhpJ00co6MfaVOdH4XLoW04NtLYDa_oJK7ODzfLrn9M,143
|
|
4
|
-
mdify/cli.py,sha256=D8_1_6NgWXkexGWqkgB0JO7c1r2T2_Va7J7iGwvewQA,20038
|
|
5
|
-
mdify_cli-1.4.1.dist-info/licenses/LICENSE,sha256=NWM66Uv-XuSMKaU-gaPmvfyk4WgE6zcIPr78wyg6GAo,1065
|
|
6
|
-
mdify_cli-1.4.1.dist-info/METADATA,sha256=sZgTSq6CrpBgpJn0NCnLcBYNTp2e0byKeFkAOO6em3E,6667
|
|
7
|
-
mdify_cli-1.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
8
|
-
mdify_cli-1.4.1.dist-info/entry_points.txt,sha256=0Xki8f5lADQUtwdt6Eq_FEaieI6Byhk8UE7BuDhChMg,41
|
|
9
|
-
mdify_cli-1.4.1.dist-info/top_level.txt,sha256=qltzf7h8owHq7dxCdfCkSHY8gT21hn1_E8P-VWS_OKM,6
|
|
10
|
-
mdify_cli-1.4.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|