mdify-cli 2.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- assets/mdify.png +0 -0
- mdify/__init__.py +3 -0
- mdify/__main__.py +7 -0
- mdify/cli.py +915 -0
- mdify/container.py +167 -0
- mdify/docling_client.py +232 -0
- mdify_cli-2.7.0.dist-info/METADATA +274 -0
- mdify_cli-2.7.0.dist-info/RECORD +12 -0
- mdify_cli-2.7.0.dist-info/WHEEL +5 -0
- mdify_cli-2.7.0.dist-info/entry_points.txt +2 -0
- mdify_cli-2.7.0.dist-info/licenses/LICENSE +21 -0
- mdify_cli-2.7.0.dist-info/top_level.txt +1 -0
mdify/container.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Container lifecycle management for docling-serve."""
|
|
2
|
+
|
|
3
|
+
import subprocess
|
|
4
|
+
import time
|
|
5
|
+
import uuid
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from mdify.docling_client import check_health
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DoclingContainer:
    """Manage the lifecycle of a docling-serve container.

    Works as a context manager: entering starts the container and blocks
    until its health endpoint answers; leaving stops it again.

    Usage:
        with DoclingContainer("docker", "ghcr.io/docling-project/docling-serve-cpu:main") as container:
            # Container is running and healthy
            response = requests.post(f"{container.base_url}/v1/convert/file", ...)
        # Container has been stopped (it is started with --rm)
    """

    def __init__(self, runtime: str, image: str, port: int = 5001, timeout: int = 1200):
        """Initialize container manager.

        Args:
            runtime: Container runtime executable ("docker" or "podman")
            image: Container image to use
            port: Host port to bind (default: 5001)
            timeout: Conversion timeout in seconds (default: 1200); passed to
                the container as DOCLING_SERVE_MAX_SYNC_WAIT
        """
        self.runtime = runtime
        self.image = image
        self.port = port
        self.timeout = timeout
        # Random suffix so every manager instance gets its own container name.
        self.container_name = f"mdify-serve-{uuid.uuid4().hex[:8]}"
        # Filled in by start() with whatever the runtime prints for `run -d`.
        self.container_id: Optional[str] = None

    @property
    def base_url(self) -> str:
        """Return base URL for API requests."""
        return f"http://localhost:{self.port}"

    def _cleanup_stale_containers(self) -> None:
        """Stop any existing mdify-serve containers.

        This handles the case where a previous run left a container running
        (e.g., due to crash, interrupt, or timeout). Every running container
        reported by the ``name=mdify-serve-`` filter is stopped. Failures are
        ignored: cleanup is best-effort.
        """
        # Find running containers matching the mdify-serve-* naming scheme
        result = subprocess.run(
            [
                self.runtime,
                "ps",
                "--filter",
                "name=mdify-serve-",
                "--format",
                "{{.Names}}",
            ],
            capture_output=True,
            text=True,
            check=False,
        )

        if result.returncode != 0 or not result.stdout.strip():
            return

        # Stop each stale container
        for container_name in result.stdout.strip().split("\n"):
            if container_name:
                subprocess.run(
                    [self.runtime, "stop", container_name],
                    capture_output=True,
                    check=False,
                )

    def start(self, timeout: int = 120) -> None:
        """Start container and wait for health check.

        If the container starts but does not become healthy (or the wait is
        interrupted), it is stopped again before the exception propagates, so
        a failed start does not leave a container running.

        Args:
            timeout: Maximum seconds to wait for health (default: 120)

        Raises:
            subprocess.CalledProcessError: If container fails to start
            TimeoutError: If health check doesn't pass within timeout
        """
        self._cleanup_stale_containers()

        # Start container in detached mode
        cmd = [
            self.runtime,
            "run",
            "-d",  # Detached mode
            "--rm",  # Auto-remove on stop
            "--name",
            self.container_name,
            "-p",
            f"{self.port}:5001",
            "-e",
            f"DOCLING_SERVE_MAX_SYNC_WAIT={self.timeout}",
            self.image,
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as e:
            error_msg = (e.stderr or "").strip() or (e.stdout or "").strip() or "Unknown error"
            # Re-raise with a readable stderr, keeping the original as the cause.
            raise subprocess.CalledProcessError(
                e.returncode,
                e.cmd,
                output=e.stdout,
                stderr=f"Failed to start container: {error_msg}",
            ) from e
        self.container_id = result.stdout.strip()

        # Wait for health check. __exit__ is never invoked when __enter__
        # raises, so the cleanup has to happen here (also on Ctrl-C).
        try:
            self._wait_for_health(timeout)
        except BaseException:
            self.stop()
            raise

    def stop(self) -> None:
        """Stop the container. Safe to call multiple times.

        The container is started with ``--rm``, so stopping it is expected to
        remove it as well. Errors from the runtime are ignored.
        """
        if self.container_name:
            subprocess.run(
                [self.runtime, "stop", self.container_name],
                capture_output=True,
                check=False,
            )

    def is_ready(self) -> bool:
        """Check if container is healthy.

        Returns:
            True if container is healthy, False otherwise (including when the
            health check itself raises)
        """
        try:
            return check_health(self.base_url)
        except Exception:
            return False

    def _wait_for_health(self, timeout: int) -> None:
        """Poll health endpoint until ready.

        Args:
            timeout: Maximum seconds to wait

        Raises:
            TimeoutError: If health check doesn't pass within timeout
        """
        # Monotonic clock: immune to wall-clock adjustments during the wait.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            if self.is_ready():
                return
            # Poll every 2 seconds, but never sleep past the deadline.
            time.sleep(min(2.0, max(0.0, deadline - time.monotonic())))

        raise TimeoutError(f"Container failed to become healthy within {timeout}s")

    def __enter__(self):
        """Context manager entry: start the container and return self."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit - ensures cleanup; never suppresses exceptions."""
        self.stop()
        return False
|
mdify/docling_client.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
"""HTTP client for docling-serve REST API."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
import mimetypes
|
|
8
|
+
|
|
9
|
+
import requests
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class ConvertResult:
    """Outcome of one document conversion request.

    Produced by both the synchronous and the asynchronous conversion helpers
    in this module.
    """

    # Converted document text; empty when the request failed.
    content: str
    # Output format identifier, e.g. "md".
    format: str
    # False when the request could not be completed.
    success: bool
    # Human-readable failure description, if any.
    error: Optional[str] = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class StatusResult:
    """Snapshot of an asynchronous conversion task's state."""

    # State string as reported by the server, e.g. "pending", "completed",
    # "failed".
    status: str
    # Identifier of the task this snapshot belongs to.
    task_id: str
    # Error text reported for the task, if any.
    error: Optional[str] = None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class DoclingClientError(Exception):
    """Root of the exception hierarchy raised by the docling client."""
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class DoclingHTTPError(DoclingClientError):
    """Raised when a docling-serve API call fails.

    The exception message has the form ``HTTP <status_code>: <message>``.
    """

    def __init__(self, status_code: int, message: str):
        """Build the error message and keep the status code for callers.

        Args:
            status_code: HTTP status code associated with the failure
            message: Description of what went wrong
        """
        formatted = f"HTTP {status_code}: {message}"
        super().__init__(formatted)
        self.status_code = status_code
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _get_mime_type(file_path: Path) -> str:
    """Guess the MIME type of *file_path* from its name.

    Falls back to ``application/octet-stream`` when no type can be guessed.
    """
    guessed = mimetypes.guess_type(str(file_path))[0]
    if guessed:
        return guessed
    return "application/octet-stream"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def check_health(base_url: str, timeout: float = 5.0) -> bool:
    """Check if docling-serve is healthy.

    Args:
        base_url: Base URL of docling-serve (e.g., "http://localhost:5001")
        timeout: Seconds to wait for the health endpoint before giving up
            (default: 5.0). Without a timeout a connection that is accepted
            but never answered would block the caller indefinitely.

    Returns:
        True if the health endpoint answers with HTTP 200, False otherwise
        (including connection errors and timeouts)
    """
    try:
        response = requests.get(f"{base_url}/health", timeout=timeout)
    except requests.RequestException:
        return False
    return response.status_code == 200
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def convert_file(
    base_url: str, file_path: Path, to_format: str = "md", do_ocr: bool = True
) -> ConvertResult:
    """Convert a file synchronously.

    Args:
        base_url: Base URL of docling-serve
        file_path: Path to file to convert
        to_format: Output format (default: "md")
        do_ocr: Whether to perform OCR (default: True)

    Returns:
        ConvertResult with conversion output. When the request itself fails
        (connection error, timeout, ...) a ConvertResult with
        ``success=False`` and ``error`` set is returned instead of raising.

    Raises:
        DoclingHTTPError: If the server answers with a non-200 status or with
            a body of an unexpected shape
        OSError: If the input file cannot be opened
    """
    try:
        with open(file_path, "rb") as f:
            response = requests.post(
                f"{base_url}/v1/convert/file",
                files={"files": (file_path.name, f, _get_mime_type(file_path))},
                data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
            )

        if response.status_code != 200:
            raise DoclingHTTPError(
                response.status_code, response.text or "Conversion failed"
            )

        result_data = response.json()
    except requests.RequestException as e:
        return ConvertResult(content="", format=to_format, success=False, error=str(e))

    # Accept either a non-empty list of result objects or a single object.
    if isinstance(result_data, list) and len(result_data) > 0:
        payload = result_data[0]
    else:
        payload = result_data

    if not isinstance(payload, dict):
        raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")

    content = payload.get("content")
    if content is None:
        # NOTE(review): docling-serve's v1 API appears to nest the output as
        # document.<format>_content (e.g. "md_content") — confirm against the
        # server version in use. Only string values are accepted here.
        document = payload.get("document")
        nested = (
            document.get(f"{to_format}_content") if isinstance(document, dict) else None
        )
        content = nested if isinstance(nested, str) else ""

    return ConvertResult(content=content, format=to_format, success=True)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def convert_file_async(
    base_url: str, file_path: Path, to_format: str = "md", do_ocr: bool = True
) -> str:
    """Start async file conversion.

    Args:
        base_url: Base URL of docling-serve
        file_path: Path to file to convert
        to_format: Output format (default: "md")
        do_ocr: Whether to perform OCR (default: True)

    Returns:
        Task ID for polling

    Raises:
        DoclingHTTPError: If the server answers with a non-200 status, the
            response carries no task_id, or the request itself fails (the
            latter is reported with status code 500 and the original
            exception attached as the cause)
        OSError: If the input file cannot be opened
    """
    try:
        with open(file_path, "rb") as f:
            response = requests.post(
                f"{base_url}/v1/convert/file/async",
                files={"files": (file_path.name, f, _get_mime_type(file_path))},
                data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
            )

        if response.status_code != 200:
            raise DoclingHTTPError(
                response.status_code, response.text or "Async conversion failed"
            )

        result_data = response.json()
    except requests.RequestException as e:
        raise DoclingHTTPError(500, str(e)) from e

    # A non-object JSON body (list, string, ...) cannot carry a task_id.
    task_id = result_data.get("task_id") if isinstance(result_data, dict) else None

    if not task_id:
        raise DoclingHTTPError(200, f"No task_id in response: {result_data}")

    return task_id
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def poll_status(base_url: str, task_id: str) -> StatusResult:
    """Poll status of async conversion task.

    Args:
        base_url: Base URL of docling-serve
        task_id: Task ID from convert_file_async

    Returns:
        StatusResult with current status; the status is "unknown" when the
        response does not contain one

    Raises:
        DoclingHTTPError: If the server answers with a non-200 status or a
            body that is not a JSON object, or the request itself fails (the
            latter is reported with status code 500 and the original
            exception attached as the cause)
    """
    try:
        response = requests.get(f"{base_url}/v1/status/poll/{task_id}")

        if response.status_code != 200:
            raise DoclingHTTPError(
                response.status_code, response.text or "Status poll failed"
            )

        result_data = response.json()
    except requests.RequestException as e:
        raise DoclingHTTPError(500, str(e)) from e

    if not isinstance(result_data, dict):
        raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")

    if "status" in result_data:
        status = result_data["status"]
    else:
        # NOTE(review): docling-serve appears to report the state under
        # "task_status" — confirm against the server version in use.
        status = result_data.get("task_status", "unknown")

    return StatusResult(
        status=status,
        task_id=task_id,
        error=result_data.get("error"),
    )
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def get_result(base_url: str, task_id: str) -> ConvertResult:
    """Get result of completed async conversion.

    Args:
        base_url: Base URL of docling-serve
        task_id: Task ID from convert_file_async

    Returns:
        ConvertResult with conversion output. When the request itself fails
        (connection error, timeout, ...) a ConvertResult with
        ``success=False`` and ``error`` set is returned instead of raising.

    Raises:
        DoclingHTTPError: If the server answers with a non-200 status (for
            example because the task is not completed) or with a body of an
            unexpected shape
    """
    try:
        response = requests.get(f"{base_url}/v1/result/{task_id}")

        if response.status_code != 200:
            raise DoclingHTTPError(
                response.status_code, response.text or "Result retrieval failed"
            )

        result_data = response.json()
    except requests.RequestException as e:
        return ConvertResult(content="", format="md", success=False, error=str(e))

    # Similar to sync conversion: accept a non-empty list or a single object.
    if isinstance(result_data, list) and len(result_data) > 0:
        payload = result_data[0]
    else:
        payload = result_data

    if not isinstance(payload, dict):
        raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")

    fmt = payload.get("format", "md")
    content = payload.get("content")
    if content is None:
        # NOTE(review): docling-serve's v1 API appears to nest the output as
        # document.<format>_content (e.g. "md_content") — confirm against the
        # server version in use. Only string values are accepted here.
        document = payload.get("document")
        nested = document.get(f"{fmt}_content") if isinstance(document, dict) else None
        content = nested if isinstance(nested, str) else ""

    return ConvertResult(content=content, format=fmt, success=True)
|
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mdify-cli
|
|
3
|
+
Version: 2.7.0
|
|
4
|
+
Summary: Convert PDFs and document images into structured Markdown for LLM workflows
|
|
5
|
+
Author: tiroq
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/tiroq/mdify
|
|
8
|
+
Project-URL: Repository, https://github.com/tiroq/mdify
|
|
9
|
+
Project-URL: Issues, https://github.com/tiroq/mdify/issues
|
|
10
|
+
Keywords: markdown,conversion,pdf,docling,cli,document,docker
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
23
|
+
Classifier: Topic :: Utilities
|
|
24
|
+
Requires-Python: >=3.8
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: requests
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# mdify
|
|
33
|
+
|
|
34
|
+

|
|
35
|
+
|
|
36
|
+
[PyPI](https://pypi.org/project/mdify-cli/)
|
|
37
|
+
[Container](https://github.com/tiroq/mdify/pkgs/container/mdify-runtime)
|
|
38
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
39
|
+
|
|
40
|
+
A lightweight CLI for converting documents to Markdown. The CLI is fast to install via pipx, while the heavy ML conversion runs inside a container.
|
|
41
|
+
|
|
42
|
+
## Requirements
|
|
43
|
+
|
|
44
|
+
- **Python 3.8+**
|
|
45
|
+
- **Docker** or **Podman** (for document conversion)
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
### macOS (recommended)
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
brew install pipx
|
|
53
|
+
pipx ensurepath
|
|
54
|
+
pipx install mdify-cli
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Restart your terminal after installation.
|
|
58
|
+
|
|
59
|
+
### Linux
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
python3 -m pip install --user pipx
|
|
63
|
+
pipx ensurepath
|
|
64
|
+
pipx install mdify-cli
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Install via pip
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install mdify-cli
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Development install
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
git clone https://github.com/tiroq/mdify.git
|
|
77
|
+
cd mdify
|
|
78
|
+
pip install -e .
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Usage
|
|
82
|
+
|
|
83
|
+
### Basic conversion
|
|
84
|
+
|
|
85
|
+
Convert a single file:
|
|
86
|
+
```bash
|
|
87
|
+
mdify document.pdf
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
The first run will automatically pull the container image (~2GB) if not present.
|
|
91
|
+
|
|
92
|
+
### Convert multiple files
|
|
93
|
+
|
|
94
|
+
Convert all PDFs in a directory:
|
|
95
|
+
```bash
|
|
96
|
+
mdify /path/to/documents -g "*.pdf"
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Recursively convert files:
|
|
100
|
+
```bash
|
|
101
|
+
mdify /path/to/documents -r -g "*.pdf"
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### GPU Acceleration
|
|
105
|
+
|
|
106
|
+
For faster processing with NVIDIA GPU:
|
|
107
|
+
```bash
|
|
108
|
+
mdify --gpu documents/*.pdf
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Requires NVIDIA GPU with CUDA support and nvidia-container-toolkit.
|
|
112
|
+
|
|
113
|
+
### ⚠️ PII Masking (Deprecated)
|
|
114
|
+
|
|
115
|
+
The `--mask` flag is deprecated and will be ignored in this version. PII masking functionality was available in older versions using a custom runtime but is not supported with the current docling-serve backend.
|
|
116
|
+
|
|
117
|
+
If PII masking is critical for your use case, please use mdify v1.5.x or earlier versions.
|
|
118
|
+
|
|
119
|
+
## Performance
|
|
120
|
+
|
|
121
|
+
mdify now uses docling-serve for significantly faster batch processing:
|
|
122
|
+
|
|
123
|
+
- **Single model load**: Models are loaded once per session, not per file
|
|
124
|
+
- **~10-20x speedup** for multiple file conversions compared to previous versions
|
|
125
|
+
- **GPU acceleration**: Use `--gpu` for additional 2-6x speedup (requires NVIDIA GPU)
|
|
126
|
+
|
|
127
|
+
### First Run Behavior
|
|
128
|
+
|
|
129
|
+
The first conversion takes longer (~30-60s) as the container loads ML models into memory. Subsequent files in the same batch process quickly, typically in 1-3 seconds per file.
|
|
130
|
+
|
|
131
|
+
## Options
|
|
132
|
+
|
|
133
|
+
| Option | Description |
|
|
134
|
+
|--------|-------------|
|
|
135
|
+
| `input` | Input file or directory to convert (required) |
|
|
136
|
+
| `-o, --out-dir DIR` | Output directory for converted files (default: output) |
|
|
137
|
+
| `-g, --glob PATTERN` | Glob pattern for filtering files (default: *) |
|
|
138
|
+
| `-r, --recursive` | Recursively scan directories |
|
|
139
|
+
| `--flat` | Disable directory structure preservation |
|
|
140
|
+
| `--overwrite` | Overwrite existing output files |
|
|
141
|
+
| `-q, --quiet` | Suppress progress messages |
|
|
142
|
+
| `-m, --mask` | ⚠️ **Deprecated**: PII masking not supported in current version |
|
|
143
|
+
| `--gpu` | Use GPU-accelerated container (requires NVIDIA GPU and nvidia-container-toolkit) |
|
|
144
|
+
| `--port PORT` | Container port (default: 5001) |
|
|
145
|
+
| `--runtime RUNTIME` | Container runtime: docker or podman (auto-detected) |
|
|
146
|
+
| `--image IMAGE` | Custom container image (default: ghcr.io/docling-project/docling-serve-cpu:main) |
|
|
147
|
+
| `--pull POLICY` | Image pull policy: always, missing, never (default: missing) |
|
|
148
|
+
| `--check-update` | Check for available updates and exit |
|
|
149
|
+
| `--version` | Show version and exit |
|
|
150
|
+
|
|
151
|
+
### Flat Mode
|
|
152
|
+
|
|
153
|
+
With `--flat`, all output files are placed directly in the output directory. Directory paths are incorporated into filenames to prevent collisions:
|
|
154
|
+
|
|
155
|
+
- `docs/subdir1/file.pdf` → `output/subdir1_file.md`
|
|
156
|
+
- `docs/subdir2/file.pdf` → `output/subdir2_file.md`
|
|
157
|
+
|
|
158
|
+
## Examples
|
|
159
|
+
|
|
160
|
+
Convert all PDFs recursively, preserving structure:
|
|
161
|
+
```bash
|
|
162
|
+
mdify documents/ -r -g "*.pdf" -o markdown_output
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
Convert with Podman instead of Docker:
|
|
166
|
+
```bash
|
|
167
|
+
mdify document.pdf --runtime podman
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
Use a custom/local container image:
|
|
171
|
+
```bash
|
|
172
|
+
mdify document.pdf --image my-custom-image:latest
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
Force pull latest container image:
|
|
176
|
+
```bash
|
|
177
|
+
mdify document.pdf --pull always
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## Architecture
|
|
181
|
+
|
|
182
|
+
```
|
|
183
|
+
┌──────────────────┐ ┌─────────────────────────────────┐
|
|
184
|
+
│ mdify CLI │ │ Container (Docker/Podman) │
|
|
185
|
+
│ (lightweight) │────▶│ ┌───────────────────────────┐ │
|
|
186
|
+
│ │ │ │ Docling + ML Models │ │
|
|
187
|
+
│ - File handling │◀────│ │ - PDF parsing │ │
|
|
188
|
+
│ - Container │ │ │ - OCR (Tesseract) │ │
|
|
189
|
+
│ orchestration │ │ │ - Document conversion │ │
|
|
190
|
+
└──────────────────┘ │ └───────────────────────────┘ │
|
|
191
|
+
└─────────────────────────────────┘
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
The CLI:
|
|
195
|
+
- Installs in seconds via pipx (no ML dependencies)
|
|
196
|
+
- Automatically detects Docker or Podman
|
|
197
|
+
- Pulls the runtime container on first use
|
|
198
|
+
- Uploads files to the container over HTTP and runs conversions there
|
|
199
|
+
|
|
200
|
+
## Container Images
|
|
201
|
+
|
|
202
|
+
mdify uses official docling-serve containers:
|
|
203
|
+
|
|
204
|
+
**CPU Version** (default):
|
|
205
|
+
```
|
|
206
|
+
ghcr.io/docling-project/docling-serve-cpu:main
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
**GPU Version** (use with `--gpu` flag):
|
|
210
|
+
```
|
|
211
|
+
ghcr.io/docling-project/docling-serve-cu126:main
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
These are official images from the [docling-serve project](https://github.com/docling-project/docling-serve).
|
|
215
|
+
|
|
216
|
+
## Updates
|
|
217
|
+
|
|
218
|
+
mdify checks for updates daily. When a new version is available:
|
|
219
|
+
|
|
220
|
+
```
|
|
221
|
+
==================================================
|
|
222
|
+
A new version of mdify is available!
|
|
223
|
+
Current version: 0.3.0
|
|
224
|
+
Latest version: 0.4.0
|
|
225
|
+
==================================================
|
|
226
|
+
|
|
227
|
+
Run upgrade now? [y/N]
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### Disable update checks
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
export MDIFY_NO_UPDATE_CHECK=1
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
## Uninstall
|
|
237
|
+
|
|
238
|
+
```bash
|
|
239
|
+
pipx uninstall mdify-cli
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
Or if installed via pip:
|
|
243
|
+
|
|
244
|
+
```bash
|
|
245
|
+
pip uninstall mdify-cli
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
## Development
|
|
249
|
+
|
|
250
|
+
### Task automation
|
|
251
|
+
|
|
252
|
+
This project uses [Task](https://taskfile.dev) for automation:
|
|
253
|
+
|
|
254
|
+
```bash
|
|
255
|
+
# Show available tasks
|
|
256
|
+
task
|
|
257
|
+
|
|
258
|
+
# Build package
|
|
259
|
+
task build
|
|
260
|
+
|
|
261
|
+
# Build container locally
|
|
262
|
+
task container-build
|
|
263
|
+
|
|
264
|
+
# Release workflow
|
|
265
|
+
task release-patch
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
### Building for PyPI
|
|
269
|
+
|
|
270
|
+
See [PUBLISHING.md](PUBLISHING.md) for complete publishing instructions.
|
|
271
|
+
|
|
272
|
+
## License
|
|
273
|
+
|
|
274
|
+
MIT
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
assets/mdify.png,sha256=qUj7WXWqNwpI2KNXOW79XJwqFqa-UI0JEkmt1mmy4Rg,1820418
|
|
2
|
+
mdify/__init__.py,sha256=ymBvtqVt-BtORLCI0ZO674etO8tlMJxzghl39z6gCUg,90
|
|
3
|
+
mdify/__main__.py,sha256=bhpJ00co6MfaVOdH4XLoW04NtLYDa_oJK7ODzfLrn9M,143
|
|
4
|
+
mdify/cli.py,sha256=LqIibolYSKGCNYqxuIyFnvPkjJyNlXvfWeKaSaoOrqo,28542
|
|
5
|
+
mdify/container.py,sha256=tkk0nv7EquL-rKUY4nkS_yGITb7mqw8B7eEfuqaeVrg,5239
|
|
6
|
+
mdify/docling_client.py,sha256=9QWPmd0W5APzf6LeUrdDBAru6E4d89w2q8WqGVlJoHg,6807
|
|
7
|
+
mdify_cli-2.7.0.dist-info/licenses/LICENSE,sha256=NWM66Uv-XuSMKaU-gaPmvfyk4WgE6zcIPr78wyg6GAo,1065
|
|
8
|
+
mdify_cli-2.7.0.dist-info/METADATA,sha256=4v5CMHOhZ2LKgRgH7xm7hOUUYwahYCRJSCMcGtNja5g,7923
|
|
9
|
+
mdify_cli-2.7.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
10
|
+
mdify_cli-2.7.0.dist-info/entry_points.txt,sha256=0Xki8f5lADQUtwdt6Eq_FEaieI6Byhk8UE7BuDhChMg,41
|
|
11
|
+
mdify_cli-2.7.0.dist-info/top_level.txt,sha256=qltzf7h8owHq7dxCdfCkSHY8gT21hn1_E8P-VWS_OKM,6
|
|
12
|
+
mdify_cli-2.7.0.dist-info/RECORD,,
|