mdify-cli 1.2.0__tar.gz → 1.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mdify_cli-1.2.0/mdify_cli.egg-info → mdify_cli-1.6.0}/PKG-INFO +11 -3
- {mdify_cli-1.2.0 → mdify_cli-1.6.0}/README.md +7 -1
- mdify_cli-1.6.0/assets/mdify.png +0 -0
- {mdify_cli-1.2.0 → mdify_cli-1.6.0}/mdify/__init__.py +1 -1
- {mdify_cli-1.2.0 → mdify_cli-1.6.0}/mdify/cli.py +252 -172
- {mdify_cli-1.2.0 → mdify_cli-1.6.0/mdify_cli.egg-info}/PKG-INFO +11 -3
- {mdify_cli-1.2.0 → mdify_cli-1.6.0}/mdify_cli.egg-info/SOURCES.txt +4 -1
- mdify_cli-1.6.0/mdify_cli.egg-info/requires.txt +3 -0
- {mdify_cli-1.2.0 → mdify_cli-1.6.0}/pyproject.toml +8 -2
- mdify_cli-1.6.0/tests/test_cli.py +77 -0
- {mdify_cli-1.2.0 → mdify_cli-1.6.0}/LICENSE +0 -0
- {mdify_cli-1.2.0 → mdify_cli-1.6.0}/mdify/__main__.py +0 -0
- {mdify_cli-1.2.0 → mdify_cli-1.6.0}/mdify_cli.egg-info/dependency_links.txt +0 -0
- {mdify_cli-1.2.0 → mdify_cli-1.6.0}/mdify_cli.egg-info/entry_points.txt +0 -0
- {mdify_cli-1.2.0 → mdify_cli-1.6.0}/mdify_cli.egg-info/top_level.txt +0 -0
- {mdify_cli-1.2.0 → mdify_cli-1.6.0}/setup.cfg +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mdify-cli
|
|
3
|
-
Version: 1.2.0
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 1.6.0
|
|
4
|
+
Summary: Convert PDFs and document images into structured Markdown for LLM workflows
|
|
5
5
|
Author: tiroq
|
|
6
6
|
License-Expression: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/tiroq/mdify
|
|
@@ -24,11 +24,19 @@ Classifier: Topic :: Utilities
|
|
|
24
24
|
Requires-Python: >=3.8
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
26
|
License-File: LICENSE
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
27
29
|
Dynamic: license-file
|
|
28
30
|
|
|
29
31
|
# mdify
|
|
30
32
|
|
|
31
|
-
|
|
33
|
+

|
|
34
|
+
|
|
35
|
+
[](https://pypi.org/project/mdify-cli/)
|
|
36
|
+
[](https://github.com/tiroq/mdify/pkgs/container/mdify-runtime)
|
|
37
|
+
[](https://opensource.org/licenses/MIT)
|
|
38
|
+
|
|
39
|
+
A lightweight CLI for converting documents to Markdown. The CLI is fast to install via pipx, while the heavy ML conversion runs inside a container.
|
|
32
40
|
|
|
33
41
|
## Requirements
|
|
34
42
|
|
|
@@ -1,6 +1,12 @@
|
|
|
1
1
|
# mdify
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+

|
|
4
|
+
|
|
5
|
+
[](https://pypi.org/project/mdify-cli/)
|
|
6
|
+
[](https://github.com/tiroq/mdify/pkgs/container/mdify-runtime)
|
|
7
|
+
[](https://opensource.org/licenses/MIT)
|
|
8
|
+
|
|
9
|
+
A lightweight CLI for converting documents to Markdown. The CLI is fast to install via pipx, while the heavy ML conversion runs inside a container.
|
|
4
10
|
|
|
5
11
|
## Requirements
|
|
6
12
|
|
|
Binary file
|
|
@@ -13,6 +13,7 @@ import os
|
|
|
13
13
|
import shutil
|
|
14
14
|
import subprocess
|
|
15
15
|
import sys
|
|
16
|
+
import threading
|
|
16
17
|
import time
|
|
17
18
|
from pathlib import Path
|
|
18
19
|
from typing import List, Optional, Tuple
|
|
@@ -24,8 +25,7 @@ from . import __version__
|
|
|
24
25
|
# Configuration
|
|
25
26
|
MDIFY_HOME = Path.home() / ".mdify"
|
|
26
27
|
LAST_CHECK_FILE = MDIFY_HOME / ".last_check"
|
|
27
|
-
|
|
28
|
-
GITHUB_API_URL = "https://api.github.com/repos/tiroq/mdify/releases/latest"
|
|
28
|
+
PYPI_API_URL = "https://pypi.org/pypi/mdify-cli/json"
|
|
29
29
|
CHECK_INTERVAL_SECONDS = 86400 # 24 hours
|
|
30
30
|
|
|
31
31
|
# Container configuration
|
|
@@ -37,18 +37,19 @@ SUPPORTED_RUNTIMES = ("docker", "podman")
|
|
|
37
37
|
# Update checking functions
|
|
38
38
|
# =============================================================================
|
|
39
39
|
|
|
40
|
+
|
|
40
41
|
def _get_remote_version(timeout: int = 5) -> Optional[str]:
|
|
41
42
|
"""
|
|
42
|
-
Fetch the latest version from
|
|
43
|
-
|
|
43
|
+
Fetch the latest version from PyPI.
|
|
44
|
+
|
|
44
45
|
Returns:
|
|
45
|
-
Version string (e.g., "
|
|
46
|
+
Version string (e.g., "1.1.0") or None if fetch failed.
|
|
46
47
|
"""
|
|
47
48
|
try:
|
|
48
|
-
with urlopen(
|
|
49
|
+
with urlopen(PYPI_API_URL, timeout=timeout) as response:
|
|
49
50
|
data = json.loads(response.read().decode("utf-8"))
|
|
50
|
-
|
|
51
|
-
return
|
|
51
|
+
version = data.get("info", {}).get("version", "")
|
|
52
|
+
return version if version else None
|
|
52
53
|
except (URLError, json.JSONDecodeError, KeyError, TimeoutError):
|
|
53
54
|
return None
|
|
54
55
|
|
|
@@ -56,16 +57,16 @@ def _get_remote_version(timeout: int = 5) -> Optional[str]:
|
|
|
56
57
|
def _should_check_for_update() -> bool:
|
|
57
58
|
"""
|
|
58
59
|
Determine if we should check for updates based on last check time.
|
|
59
|
-
|
|
60
|
+
|
|
60
61
|
Returns:
|
|
61
62
|
True if check should be performed, False otherwise.
|
|
62
63
|
"""
|
|
63
64
|
if os.environ.get("MDIFY_NO_UPDATE_CHECK", "").lower() in ("1", "true", "yes"):
|
|
64
65
|
return False
|
|
65
|
-
|
|
66
|
+
|
|
66
67
|
if not LAST_CHECK_FILE.exists():
|
|
67
68
|
return True
|
|
68
|
-
|
|
69
|
+
|
|
69
70
|
try:
|
|
70
71
|
last_check = float(LAST_CHECK_FILE.read_text().strip())
|
|
71
72
|
elapsed = time.time() - last_check
|
|
@@ -86,63 +87,35 @@ def _update_last_check_time() -> None:
|
|
|
86
87
|
def _compare_versions(current: str, remote: str) -> bool:
|
|
87
88
|
"""
|
|
88
89
|
Compare version strings.
|
|
89
|
-
|
|
90
|
+
|
|
90
91
|
Returns:
|
|
91
92
|
True if remote version is newer than current.
|
|
92
93
|
"""
|
|
93
94
|
try:
|
|
94
95
|
current_parts = [int(x) for x in current.split(".")]
|
|
95
96
|
remote_parts = [int(x) for x in remote.split(".")]
|
|
96
|
-
|
|
97
|
+
|
|
97
98
|
max_len = max(len(current_parts), len(remote_parts))
|
|
98
99
|
current_parts.extend([0] * (max_len - len(current_parts)))
|
|
99
100
|
remote_parts.extend([0] * (max_len - len(remote_parts)))
|
|
100
|
-
|
|
101
|
+
|
|
101
102
|
return remote_parts > current_parts
|
|
102
103
|
except (ValueError, AttributeError):
|
|
103
104
|
return False
|
|
104
105
|
|
|
105
106
|
|
|
106
|
-
def _run_upgrade() -> bool:
|
|
107
|
-
"""
|
|
108
|
-
Run the upgrade installer.
|
|
109
|
-
|
|
110
|
-
Returns:
|
|
111
|
-
True if upgrade was successful, False otherwise.
|
|
112
|
-
"""
|
|
113
|
-
if not INSTALLER_PATH.exists():
|
|
114
|
-
print(
|
|
115
|
-
f"Installer not found at {INSTALLER_PATH}. "
|
|
116
|
-
"Please reinstall mdify manually.",
|
|
117
|
-
file=sys.stderr,
|
|
118
|
-
)
|
|
119
|
-
return False
|
|
120
|
-
|
|
121
|
-
try:
|
|
122
|
-
result = subprocess.run(
|
|
123
|
-
[str(INSTALLER_PATH), "--upgrade", "-y"],
|
|
124
|
-
check=True,
|
|
125
|
-
)
|
|
126
|
-
return result.returncode == 0
|
|
127
|
-
except subprocess.CalledProcessError:
|
|
128
|
-
return False
|
|
129
|
-
except OSError as e:
|
|
130
|
-
print(f"Failed to run installer: {e}", file=sys.stderr)
|
|
131
|
-
return False
|
|
132
|
-
|
|
133
|
-
|
|
134
107
|
def check_for_update(force: bool = False) -> None:
|
|
135
108
|
"""
|
|
136
109
|
Check for updates and prompt user to upgrade if available.
|
|
137
|
-
|
|
110
|
+
|
|
138
111
|
Args:
|
|
139
112
|
force: If True, check regardless of last check time and show errors.
|
|
140
113
|
"""
|
|
141
114
|
if not force and not _should_check_for_update():
|
|
142
115
|
return
|
|
143
|
-
|
|
116
|
+
|
|
144
117
|
remote_version = _get_remote_version()
|
|
145
|
-
|
|
118
|
+
|
|
146
119
|
if remote_version is None:
|
|
147
120
|
if force:
|
|
148
121
|
print(
|
|
@@ -152,49 +125,40 @@ def check_for_update(force: bool = False) -> None:
|
|
|
152
125
|
)
|
|
153
126
|
sys.exit(1)
|
|
154
127
|
return
|
|
155
|
-
|
|
128
|
+
|
|
156
129
|
_update_last_check_time()
|
|
157
|
-
|
|
130
|
+
|
|
158
131
|
if not _compare_versions(__version__, remote_version):
|
|
159
132
|
if force:
|
|
160
133
|
print(f"mdify is up to date (version {__version__})")
|
|
161
134
|
return
|
|
162
|
-
|
|
163
|
-
print(f"\n{'='*50}")
|
|
164
|
-
print(f"A new version of mdify is available!")
|
|
135
|
+
|
|
136
|
+
print(f"\n{'=' * 50}")
|
|
137
|
+
print(f"A new version of mdify-cli is available!")
|
|
165
138
|
print(f" Current version: {__version__}")
|
|
166
139
|
print(f" Latest version: {remote_version}")
|
|
167
|
-
print(f"{'='*50}
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
except (EOFError, KeyboardInterrupt):
|
|
172
|
-
print()
|
|
173
|
-
return
|
|
174
|
-
|
|
175
|
-
if response in ("y", "yes"):
|
|
176
|
-
print("\nStarting upgrade...\n")
|
|
177
|
-
if _run_upgrade():
|
|
178
|
-
print("\nUpgrade completed! Please restart mdify.")
|
|
179
|
-
sys.exit(0)
|
|
180
|
-
else:
|
|
181
|
-
print("\nUpgrade failed. You can try manually with:")
|
|
182
|
-
print(f" {INSTALLER_PATH} --upgrade")
|
|
183
|
-
else:
|
|
184
|
-
print(f"\nTo upgrade later, run: {INSTALLER_PATH} --upgrade\n")
|
|
140
|
+
print(f"{'=' * 50}")
|
|
141
|
+
print(f"\nTo upgrade, run:")
|
|
142
|
+
print(f" pipx upgrade mdify-cli")
|
|
143
|
+
print(f" # or: pip install --upgrade mdify-cli\n")
|
|
185
144
|
|
|
186
145
|
|
|
187
146
|
# =============================================================================
|
|
188
147
|
# Container runtime functions
|
|
189
148
|
# =============================================================================
|
|
190
149
|
|
|
191
|
-
|
|
150
|
+
|
|
151
|
+
def detect_runtime(preferred: str, explicit: bool = True) -> Optional[str]:
|
|
192
152
|
"""
|
|
193
153
|
Detect available container runtime.
|
|
194
|
-
|
|
154
|
+
|
|
195
155
|
Args:
|
|
196
156
|
preferred: Preferred runtime ('docker' or 'podman')
|
|
197
|
-
|
|
157
|
+
explicit: If True, warn when falling back to alternative.
|
|
158
|
+
If False, silently use alternative without warning.
|
|
159
|
+
Note: This only controls warning emission; selection order
|
|
160
|
+
is always preferred → alternative regardless of this flag.
|
|
161
|
+
|
|
198
162
|
Returns:
|
|
199
163
|
Path to runtime executable, or None if not found.
|
|
200
164
|
"""
|
|
@@ -202,25 +166,28 @@ def detect_runtime(preferred: str) -> Optional[str]:
|
|
|
202
166
|
runtime_path = shutil.which(preferred)
|
|
203
167
|
if runtime_path:
|
|
204
168
|
return runtime_path
|
|
205
|
-
|
|
169
|
+
|
|
206
170
|
# Try alternative
|
|
207
171
|
alternative = "podman" if preferred == "docker" else "docker"
|
|
208
172
|
runtime_path = shutil.which(alternative)
|
|
209
173
|
if runtime_path:
|
|
210
|
-
|
|
174
|
+
if explicit:
|
|
175
|
+
print(
|
|
176
|
+
f"Warning: {preferred} not found, using {alternative}", file=sys.stderr
|
|
177
|
+
)
|
|
211
178
|
return runtime_path
|
|
212
|
-
|
|
179
|
+
|
|
213
180
|
return None
|
|
214
181
|
|
|
215
182
|
|
|
216
183
|
def check_image_exists(runtime: str, image: str) -> bool:
|
|
217
184
|
"""
|
|
218
185
|
Check if container image exists locally.
|
|
219
|
-
|
|
186
|
+
|
|
220
187
|
Args:
|
|
221
188
|
runtime: Path to container runtime
|
|
222
189
|
image: Image name/tag
|
|
223
|
-
|
|
190
|
+
|
|
224
191
|
Returns:
|
|
225
192
|
True if image exists locally.
|
|
226
193
|
"""
|
|
@@ -238,18 +205,18 @@ def check_image_exists(runtime: str, image: str) -> bool:
|
|
|
238
205
|
def pull_image(runtime: str, image: str, quiet: bool = False) -> bool:
|
|
239
206
|
"""
|
|
240
207
|
Pull container image.
|
|
241
|
-
|
|
208
|
+
|
|
242
209
|
Args:
|
|
243
210
|
runtime: Path to container runtime
|
|
244
211
|
image: Image name/tag
|
|
245
212
|
quiet: Suppress progress output
|
|
246
|
-
|
|
213
|
+
|
|
247
214
|
Returns:
|
|
248
215
|
True if pull succeeded.
|
|
249
216
|
"""
|
|
250
217
|
if not quiet:
|
|
251
218
|
print(f"Pulling image: {image}")
|
|
252
|
-
|
|
219
|
+
|
|
253
220
|
try:
|
|
254
221
|
result = subprocess.run(
|
|
255
222
|
[runtime, "pull", image],
|
|
@@ -262,51 +229,116 @@ def pull_image(runtime: str, image: str, quiet: bool = False) -> bool:
|
|
|
262
229
|
return False
|
|
263
230
|
|
|
264
231
|
|
|
232
|
+
def format_size(size_bytes: int) -> str:
    """
    Format a file size in human-readable form using 1024-based units.

    Sizes below 1024 are printed verbatim with a "B" suffix; larger sizes
    are printed with one decimal place in KB, MB, GB or TB.

    Args:
        size_bytes: Size in bytes.

    Returns:
        Human-readable size string, e.g. "512 B", "1.5 KB", "2.0 GB".
    """
    if size_bytes < 1024:
        return f"{size_bytes} B"

    # Work on a separate float so the caller-visible parameter is not rebound.
    size = float(size_bytes)
    for unit in ("KB", "MB", "GB"):
        size /= 1024
        # Compare the value as it will be displayed: 1023.97 KB would print
        # as "1024.0 KB", so it has to roll over to the next unit instead.
        if round(size, 1) < 1024:
            return f"{size:.1f} {unit}"

    # Anything that does not fit in GB is reported in TB, however large.
    return f"{size / 1024:.1f} TB"
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def format_duration(seconds: float) -> str:
    """
    Format a duration in human-readable form.

    Durations that display as less than a minute are shown with one decimal
    place ("12.3s"); longer ones are shown as whole seconds split into
    minutes ("2m 5s") or hours ("1h 1m 1s").

    Args:
        seconds: Duration in seconds.

    Returns:
        Human-readable duration string.
    """
    # Decide on the displayed value: 59.96 would print as "60.0s", so it
    # belongs to the minutes branch.
    if round(seconds, 1) < 60:
        return f"{seconds:.1f}s"

    # Round to whole seconds *before* splitting into units; rounding the
    # remainder afterwards can produce impossible values like "1m 60s".
    total_seconds = round(seconds)
    minutes, secs = divmod(total_seconds, 60)
    if minutes < 60:
        return f"{minutes}m {secs}s"

    hours, mins = divmod(minutes, 60)
    return f"{hours}h {mins}m {secs}s"
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
class Spinner:
    """
    A simple terminal spinner to show progress during long operations.

    The animation runs on a daemon thread that redraws a single line
    (prefix, spinner frame, elapsed time) roughly ten times per second.
    Call start() before the long operation and stop() after it.
    """

    def __init__(self):
        self.frames = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
        self.running = False
        self.thread = None
        self.start_time = None
        # Defined up front so the attribute exists even before start().
        self.prefix = ""
        # Signals the animation thread to exit without waiting out a sleep.
        self._stop_event = threading.Event()
        # Widest line drawn so far; used to pad redraws and to clear fully.
        self._line_width = 0

    def _spin(self, stop_event=None):
        """Redraw the spinner line until stopped (runs on the worker thread)."""
        if stop_event is None:
            stop_event = self._stop_event
        idx = 0
        while self.running and not stop_event.is_set():
            elapsed = time.time() - self.start_time
            frame = self.frames[idx % len(self.frames)]
            line = f"{self.prefix} {frame} ({format_duration(elapsed)})"
            # Pad to the widest line so a shorter redraw leaves no stale
            # characters behind from a previous, longer one.
            self._line_width = max(self._line_width, len(line))
            print(f"\r{line.ljust(self._line_width)}", end="", flush=True)
            idx += 1
            # Event.wait returns as soon as stop() is called, unlike sleep.
            stop_event.wait(0.1)

    def start(self, prefix: str = ""):
        """
        Start the spinner animation.

        Args:
            prefix: Text shown before the spinner frame.
        """
        # Restarting an active spinner would otherwise leave two threads
        # writing to the same terminal line.
        if self.running:
            self.stop()
        self.prefix = prefix
        self.running = True
        self.start_time = time.time()
        self._line_width = 0
        # A fresh event per run means a previous thread that is still
        # winding down can never be revived by this start().
        self._stop_event = threading.Event()
        self.thread = threading.Thread(
            target=self._spin, args=(self._stop_event,), daemon=True
        )
        self.thread.start()

    def stop(self):
        """Stop the animation and clear the spinner line."""
        self.running = False
        self._stop_event.set()
        if self.thread:
            self.thread.join(timeout=0.5)
        # Clear the spinner line, covering at least the widest line drawn.
        width = max(80, self._line_width)
        print(f"\r{' ' * width}\r", end="", flush=True)
|
|
289
|
+
|
|
290
|
+
|
|
265
291
|
def run_container(
|
|
266
292
|
runtime: str,
|
|
267
293
|
image: str,
|
|
268
294
|
input_file: Path,
|
|
269
295
|
output_file: Path,
|
|
270
296
|
mask_pii: bool = False,
|
|
271
|
-
|
|
272
|
-
) -> Tuple[bool, str]:
|
|
297
|
+
) -> Tuple[bool, str, float]:
|
|
273
298
|
"""
|
|
274
299
|
Run container to convert a single file.
|
|
275
|
-
|
|
300
|
+
|
|
276
301
|
Args:
|
|
277
302
|
runtime: Path to container runtime
|
|
278
303
|
image: Image name/tag
|
|
279
304
|
input_file: Absolute path to input file
|
|
280
305
|
output_file: Absolute path to output file
|
|
281
306
|
mask_pii: Whether to mask PII in images
|
|
282
|
-
|
|
283
|
-
|
|
307
|
+
|
|
284
308
|
Returns:
|
|
285
|
-
Tuple of (success: bool, message: str)
|
|
309
|
+
Tuple of (success: bool, message: str, elapsed_seconds: float)
|
|
286
310
|
"""
|
|
311
|
+
start_time = time.time()
|
|
312
|
+
|
|
287
313
|
# Ensure output directory exists
|
|
288
314
|
output_file.parent.mkdir(parents=True, exist_ok=True)
|
|
289
|
-
|
|
315
|
+
|
|
290
316
|
# Mount directories
|
|
291
317
|
input_dir = input_file.parent
|
|
292
318
|
output_dir = output_file.parent
|
|
293
|
-
|
|
319
|
+
|
|
294
320
|
# Container paths
|
|
295
321
|
container_in = f"/work/in/{input_file.name}"
|
|
296
322
|
container_out = f"/work/out/{output_file.name}"
|
|
297
|
-
|
|
323
|
+
|
|
298
324
|
cmd = [
|
|
299
|
-
runtime,
|
|
300
|
-
"
|
|
301
|
-
"
|
|
325
|
+
runtime,
|
|
326
|
+
"run",
|
|
327
|
+
"--rm",
|
|
328
|
+
"-v",
|
|
329
|
+
f"{input_dir}:/work/in:ro",
|
|
330
|
+
"-v",
|
|
331
|
+
f"{output_dir}:/work/out",
|
|
302
332
|
image,
|
|
303
|
-
"--in",
|
|
304
|
-
|
|
333
|
+
"--in",
|
|
334
|
+
container_in,
|
|
335
|
+
"--out",
|
|
336
|
+
container_out,
|
|
305
337
|
]
|
|
306
|
-
|
|
338
|
+
|
|
307
339
|
if mask_pii:
|
|
308
340
|
cmd.append("--mask")
|
|
309
|
-
|
|
341
|
+
|
|
310
342
|
try:
|
|
311
343
|
result = subprocess.run(
|
|
312
344
|
cmd,
|
|
@@ -314,21 +346,19 @@ def run_container(
|
|
|
314
346
|
text=True,
|
|
315
347
|
check=False,
|
|
316
348
|
)
|
|
317
|
-
|
|
349
|
+
elapsed = time.time() - start_time
|
|
350
|
+
|
|
318
351
|
if result.returncode == 0:
|
|
319
|
-
|
|
320
|
-
print(f"Converted: {input_file} -> {output_file}")
|
|
321
|
-
return True, "success"
|
|
352
|
+
return True, "success", elapsed
|
|
322
353
|
else:
|
|
323
|
-
error_msg =
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
return False,
|
|
327
|
-
|
|
354
|
+
error_msg = (
|
|
355
|
+
result.stderr.strip() or result.stdout.strip() or "Unknown error"
|
|
356
|
+
)
|
|
357
|
+
return False, error_msg, elapsed
|
|
358
|
+
|
|
328
359
|
except OSError as e:
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
return False, f"error: {e}"
|
|
360
|
+
elapsed = time.time() - start_time
|
|
361
|
+
return False, str(e), elapsed
|
|
332
362
|
|
|
333
363
|
|
|
334
364
|
# =============================================================================
|
|
@@ -337,22 +367,39 @@ def run_container(
|
|
|
337
367
|
|
|
338
368
|
# Supported file extensions (based on Docling InputFormat)
|
|
339
369
|
SUPPORTED_EXTENSIONS = {
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
370
|
+
".pdf",
|
|
371
|
+
".docx",
|
|
372
|
+
".pptx",
|
|
373
|
+
".html",
|
|
374
|
+
".htm",
|
|
375
|
+
".png",
|
|
376
|
+
".jpg",
|
|
377
|
+
".jpeg",
|
|
378
|
+
".gif",
|
|
379
|
+
".bmp",
|
|
380
|
+
".tiff",
|
|
381
|
+
".tif", # images
|
|
382
|
+
".asciidoc",
|
|
383
|
+
".adoc",
|
|
384
|
+
".asc", # asciidoc
|
|
385
|
+
".md",
|
|
386
|
+
".markdown", # markdown
|
|
387
|
+
".csv",
|
|
388
|
+
".xlsx", # spreadsheets
|
|
389
|
+
".xml", # XML formats
|
|
390
|
+
".json", # JSON docling
|
|
391
|
+
".mp3",
|
|
392
|
+
".wav",
|
|
393
|
+
".m4a",
|
|
394
|
+
".flac", # audio
|
|
395
|
+
".vtt", # subtitles
|
|
349
396
|
}
|
|
350
397
|
|
|
351
398
|
|
|
352
399
|
def get_files_to_convert(input_path: Path, mask: str, recursive: bool) -> List[Path]:
|
|
353
400
|
"""Get list of files to convert based on input path and options."""
|
|
354
401
|
files = []
|
|
355
|
-
|
|
402
|
+
|
|
356
403
|
if input_path.is_file():
|
|
357
404
|
files.append(input_path)
|
|
358
405
|
elif input_path.is_dir():
|
|
@@ -360,19 +407,19 @@ def get_files_to_convert(input_path: Path, mask: str, recursive: bool) -> List[P
|
|
|
360
407
|
files = list(input_path.rglob(mask))
|
|
361
408
|
else:
|
|
362
409
|
files = list(input_path.glob(mask))
|
|
363
|
-
|
|
410
|
+
|
|
364
411
|
# Filter to only files
|
|
365
412
|
files = [f for f in files if f.is_file()]
|
|
366
413
|
else:
|
|
367
414
|
raise FileNotFoundError(f"Input path does not exist: {input_path}")
|
|
368
|
-
|
|
415
|
+
|
|
369
416
|
# Filter out hidden files and unsupported formats
|
|
370
417
|
files = [
|
|
371
|
-
f
|
|
372
|
-
|
|
373
|
-
and f.suffix.lower() in SUPPORTED_EXTENSIONS
|
|
418
|
+
f
|
|
419
|
+
for f in files
|
|
420
|
+
if not f.name.startswith(".") and f.suffix.lower() in SUPPORTED_EXTENSIONS
|
|
374
421
|
]
|
|
375
|
-
|
|
422
|
+
|
|
376
423
|
return files
|
|
377
424
|
|
|
378
425
|
|
|
@@ -405,7 +452,7 @@ def get_output_path(
|
|
|
405
452
|
output_path = output_dir / relative_path.parent / output_name
|
|
406
453
|
except ValueError:
|
|
407
454
|
output_path = output_dir / output_name
|
|
408
|
-
|
|
455
|
+
|
|
409
456
|
return output_path
|
|
410
457
|
|
|
411
458
|
|
|
@@ -413,6 +460,7 @@ def get_output_path(
|
|
|
413
460
|
# CLI argument parsing
|
|
414
461
|
# =============================================================================
|
|
415
462
|
|
|
463
|
+
|
|
416
464
|
def parse_args() -> argparse.Namespace:
|
|
417
465
|
"""Parse command line arguments."""
|
|
418
466
|
parser = argparse.ArgumentParser(
|
|
@@ -427,74 +475,79 @@ Examples:
|
|
|
427
475
|
mdify ./docs --runtime podman Use Podman instead of Docker
|
|
428
476
|
""",
|
|
429
477
|
)
|
|
430
|
-
|
|
478
|
+
|
|
431
479
|
parser.add_argument(
|
|
432
480
|
"input",
|
|
433
481
|
type=str,
|
|
434
482
|
nargs="?",
|
|
435
483
|
help="Input file or directory to convert",
|
|
436
484
|
)
|
|
437
|
-
|
|
485
|
+
|
|
438
486
|
parser.add_argument(
|
|
439
|
-
"-o",
|
|
487
|
+
"-o",
|
|
488
|
+
"--out-dir",
|
|
440
489
|
type=str,
|
|
441
490
|
default="output",
|
|
442
491
|
help="Output directory for converted files (default: output)",
|
|
443
492
|
)
|
|
444
|
-
|
|
493
|
+
|
|
445
494
|
parser.add_argument(
|
|
446
|
-
"-g",
|
|
495
|
+
"-g",
|
|
496
|
+
"--glob",
|
|
447
497
|
type=str,
|
|
448
498
|
default="*",
|
|
449
499
|
help="Glob pattern for filtering files in directory (default: *)",
|
|
450
500
|
)
|
|
451
|
-
|
|
501
|
+
|
|
452
502
|
parser.add_argument(
|
|
453
|
-
"-r",
|
|
503
|
+
"-r",
|
|
504
|
+
"--recursive",
|
|
454
505
|
action="store_true",
|
|
455
506
|
help="Recursively scan directories",
|
|
456
507
|
)
|
|
457
|
-
|
|
508
|
+
|
|
458
509
|
parser.add_argument(
|
|
459
510
|
"--flat",
|
|
460
511
|
action="store_true",
|
|
461
512
|
help="Disable directory structure preservation in output",
|
|
462
513
|
)
|
|
463
|
-
|
|
514
|
+
|
|
464
515
|
parser.add_argument(
|
|
465
516
|
"--overwrite",
|
|
466
517
|
action="store_true",
|
|
467
518
|
help="Overwrite existing output files",
|
|
468
519
|
)
|
|
469
|
-
|
|
520
|
+
|
|
470
521
|
parser.add_argument(
|
|
471
|
-
"-q",
|
|
522
|
+
"-q",
|
|
523
|
+
"--quiet",
|
|
472
524
|
action="store_true",
|
|
473
525
|
help="Suppress progress messages",
|
|
474
526
|
)
|
|
475
|
-
|
|
527
|
+
|
|
476
528
|
parser.add_argument(
|
|
477
|
-
"-m",
|
|
529
|
+
"-m",
|
|
530
|
+
"--mask",
|
|
478
531
|
action="store_true",
|
|
479
532
|
help="Mask PII and sensitive content in document images",
|
|
480
533
|
)
|
|
481
|
-
|
|
534
|
+
|
|
482
535
|
# Container options
|
|
483
536
|
parser.add_argument(
|
|
484
537
|
"--runtime",
|
|
485
538
|
type=str,
|
|
486
539
|
choices=SUPPORTED_RUNTIMES,
|
|
487
|
-
default=
|
|
488
|
-
help="Container runtime to use (
|
|
540
|
+
default=None,
|
|
541
|
+
help="Container runtime to use (auto-detects docker or podman if not specified)",
|
|
489
542
|
)
|
|
490
|
-
|
|
543
|
+
|
|
491
544
|
parser.add_argument(
|
|
492
545
|
"--image",
|
|
493
546
|
type=str,
|
|
494
547
|
default=DEFAULT_IMAGE,
|
|
495
548
|
help=f"Container image to use (default: {DEFAULT_IMAGE})",
|
|
496
549
|
)
|
|
497
|
-
|
|
550
|
+
|
|
498
551
|
parser.add_argument(
|
|
499
552
|
"--pull",
|
|
500
553
|
type=str,
|
|
@@ -502,20 +555,20 @@ Examples:
|
|
|
502
555
|
default="missing",
|
|
503
556
|
help="Image pull policy: always, missing, never (default: missing)",
|
|
504
557
|
)
|
|
505
|
-
|
|
558
|
+
|
|
506
559
|
# Utility options
|
|
507
560
|
parser.add_argument(
|
|
508
561
|
"--check-update",
|
|
509
562
|
action="store_true",
|
|
510
563
|
help="Check for available updates and exit",
|
|
511
564
|
)
|
|
512
|
-
|
|
565
|
+
|
|
513
566
|
parser.add_argument(
|
|
514
567
|
"--version",
|
|
515
568
|
action="version",
|
|
516
569
|
version=f"mdify {__version__}",
|
|
517
570
|
)
|
|
518
|
-
|
|
571
|
+
|
|
519
572
|
return parser.parse_args()
|
|
520
573
|
|
|
521
574
|
|
|
@@ -523,27 +576,30 @@ Examples:
|
|
|
523
576
|
# Main entry point
|
|
524
577
|
# =============================================================================
|
|
525
578
|
|
|
579
|
+
|
|
526
580
|
def main() -> int:
|
|
527
581
|
"""Main entry point for the CLI."""
|
|
528
582
|
args = parse_args()
|
|
529
|
-
|
|
583
|
+
|
|
530
584
|
# Handle --check-update flag
|
|
531
585
|
if args.check_update:
|
|
532
586
|
check_for_update(force=True)
|
|
533
587
|
return 0
|
|
534
|
-
|
|
588
|
+
|
|
535
589
|
# Check for updates (daily, silent on errors)
|
|
536
590
|
check_for_update(force=False)
|
|
537
|
-
|
|
591
|
+
|
|
538
592
|
# Validate input is provided
|
|
539
593
|
if args.input is None:
|
|
540
594
|
print("Error: Input file or directory is required", file=sys.stderr)
|
|
541
595
|
print("Usage: mdify <input> [options]", file=sys.stderr)
|
|
542
596
|
print(" mdify --help for more information", file=sys.stderr)
|
|
543
597
|
return 1
|
|
544
|
-
|
|
598
|
+
|
|
545
599
|
# Detect container runtime
|
|
546
|
-
|
|
600
|
+
preferred = args.runtime if args.runtime else "docker"
|
|
601
|
+
explicit = args.runtime is not None
|
|
602
|
+
runtime = detect_runtime(preferred, explicit=explicit)
|
|
547
603
|
if runtime is None:
|
|
548
604
|
print(
|
|
549
605
|
f"Error: Container runtime not found ({', '.join(SUPPORTED_RUNTIMES)})",
|
|
@@ -551,87 +607,111 @@ def main() -> int:
|
|
|
551
607
|
)
|
|
552
608
|
print("Please install Docker or Podman to use mdify.", file=sys.stderr)
|
|
553
609
|
return 2
|
|
554
|
-
|
|
610
|
+
|
|
555
611
|
# Handle image pull policy
|
|
556
612
|
image = args.image
|
|
557
613
|
image_exists = check_image_exists(runtime, image)
|
|
558
|
-
|
|
614
|
+
|
|
559
615
|
if args.pull == "always" or (args.pull == "missing" and not image_exists):
|
|
560
616
|
if not pull_image(runtime, image, args.quiet):
|
|
561
617
|
print(f"Error: Failed to pull image: {image}", file=sys.stderr)
|
|
562
618
|
return 1
|
|
563
619
|
elif args.pull == "never" and not image_exists:
|
|
564
620
|
print(f"Error: Image not found locally: {image}", file=sys.stderr)
|
|
565
|
-
print(f"Run with --pull=missing or pull manually: {
|
|
621
|
+
print(f"Run with --pull=missing or pull manually: {preferred} pull {image}")
|
|
566
622
|
return 1
|
|
567
|
-
|
|
623
|
+
|
|
568
624
|
# Resolve paths
|
|
569
625
|
input_path = Path(args.input).resolve()
|
|
570
626
|
output_dir = Path(args.out_dir).resolve()
|
|
571
|
-
|
|
627
|
+
|
|
572
628
|
# Validate input
|
|
573
629
|
if not input_path.exists():
|
|
574
630
|
print(f"Error: Input path does not exist: {input_path}", file=sys.stderr)
|
|
575
631
|
return 1
|
|
576
|
-
|
|
632
|
+
|
|
577
633
|
# Get files to convert
|
|
578
634
|
try:
|
|
579
635
|
files_to_convert = get_files_to_convert(input_path, args.glob, args.recursive)
|
|
580
636
|
except Exception as e:
|
|
581
637
|
print(f"Error: {e}", file=sys.stderr)
|
|
582
638
|
return 1
|
|
583
|
-
|
|
639
|
+
|
|
584
640
|
if not files_to_convert:
|
|
585
641
|
print(f"No files found to convert in: {input_path}", file=sys.stderr)
|
|
586
642
|
return 1
|
|
587
|
-
|
|
643
|
+
|
|
644
|
+
total_files = len(files_to_convert)
|
|
645
|
+
total_size = sum(f.stat().st_size for f in files_to_convert)
|
|
646
|
+
|
|
588
647
|
if not args.quiet:
|
|
589
|
-
print(f"Found {
|
|
648
|
+
print(f"Found {total_files} file(s) to convert ({format_size(total_size)})")
|
|
590
649
|
print(f"Using runtime: {runtime}")
|
|
591
650
|
print(f"Using image: {image}")
|
|
592
651
|
print()
|
|
593
|
-
|
|
652
|
+
|
|
594
653
|
# Determine input base for directory structure preservation
|
|
595
654
|
if input_path.is_file():
|
|
596
655
|
input_base = input_path.parent
|
|
597
656
|
else:
|
|
598
657
|
input_base = input_path
|
|
599
|
-
|
|
658
|
+
|
|
600
659
|
# Convert files
|
|
601
660
|
success_count = 0
|
|
602
661
|
skipped_count = 0
|
|
603
662
|
failed_count = 0
|
|
604
|
-
|
|
605
|
-
|
|
663
|
+
conversion_start = time.time()
|
|
664
|
+
spinner = Spinner()
|
|
665
|
+
|
|
666
|
+
for idx, input_file in enumerate(files_to_convert, 1):
|
|
606
667
|
output_file = get_output_path(input_file, input_base, output_dir, args.flat)
|
|
607
|
-
|
|
668
|
+
file_size = input_file.stat().st_size
|
|
669
|
+
progress = f"[{idx}/{total_files}]"
|
|
670
|
+
|
|
608
671
|
# Check if output exists and skip if not overwriting
|
|
609
672
|
if output_file.exists() and not args.overwrite:
|
|
610
673
|
if not args.quiet:
|
|
611
|
-
print(f"Skipped (exists): {input_file}
|
|
674
|
+
print(f"{progress} Skipped (exists): {input_file.name}")
|
|
612
675
|
skipped_count += 1
|
|
613
676
|
continue
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
677
|
+
|
|
678
|
+
# Show spinner while processing
|
|
679
|
+
if not args.quiet:
|
|
680
|
+
spinner.start(
|
|
681
|
+
f"{progress} Processing: {input_file.name} ({format_size(file_size)})"
|
|
682
|
+
)
|
|
683
|
+
|
|
684
|
+
success, result, elapsed = run_container(
|
|
685
|
+
runtime, image, input_file, output_file, args.mask
|
|
617
686
|
)
|
|
618
|
-
|
|
687
|
+
|
|
688
|
+
if not args.quiet:
|
|
689
|
+
spinner.stop()
|
|
690
|
+
|
|
619
691
|
if success:
|
|
620
692
|
success_count += 1
|
|
693
|
+
if not args.quiet:
|
|
694
|
+
print(f"{progress} {input_file.name} ✓ ({format_duration(elapsed)})")
|
|
621
695
|
else:
|
|
622
696
|
failed_count += 1
|
|
623
|
-
|
|
697
|
+
if not args.quiet:
|
|
698
|
+
print(f"{progress} {input_file.name} ✗ ({format_duration(elapsed)})")
|
|
699
|
+
print(f" Error: {result}", file=sys.stderr)
|
|
700
|
+
|
|
701
|
+
total_elapsed = time.time() - conversion_start
|
|
702
|
+
|
|
624
703
|
# Print summary
|
|
625
704
|
if not args.quiet:
|
|
626
705
|
print()
|
|
627
706
|
print("=" * 50)
|
|
628
707
|
print("Conversion Summary:")
|
|
629
|
-
print(f" Total files: {
|
|
708
|
+
print(f" Total files: {total_files}")
|
|
630
709
|
print(f" Successful: {success_count}")
|
|
631
710
|
print(f" Skipped: {skipped_count}")
|
|
632
711
|
print(f" Failed: {failed_count}")
|
|
712
|
+
print(f" Total time: {format_duration(total_elapsed)}")
|
|
633
713
|
print("=" * 50)
|
|
634
|
-
|
|
714
|
+
|
|
635
715
|
# Return appropriate exit code
|
|
636
716
|
if failed_count > 0:
|
|
637
717
|
return 1
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mdify-cli
|
|
3
|
-
Version: 1.2.0
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 1.6.0
|
|
4
|
+
Summary: Convert PDFs and document images into structured Markdown for LLM workflows
|
|
5
5
|
Author: tiroq
|
|
6
6
|
License-Expression: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/tiroq/mdify
|
|
@@ -24,11 +24,19 @@ Classifier: Topic :: Utilities
|
|
|
24
24
|
Requires-Python: >=3.8
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
26
|
License-File: LICENSE
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
27
29
|
Dynamic: license-file
|
|
28
30
|
|
|
29
31
|
# mdify
|
|
30
32
|
|
|
31
|
-
|
|
33
|
+

|
|
34
|
+
|
|
35
|
+
[](https://pypi.org/project/mdify-cli/)
|
|
36
|
+
[](https://github.com/tiroq/mdify/pkgs/container/mdify-runtime)
|
|
37
|
+
[](https://opensource.org/licenses/MIT)
|
|
38
|
+
|
|
39
|
+
A lightweight CLI for converting documents to Markdown. The CLI is fast to install via pipx, while the heavy ML conversion runs inside a container.
|
|
32
40
|
|
|
33
41
|
## Requirements
|
|
34
42
|
|
|
@@ -4,8 +4,11 @@ pyproject.toml
|
|
|
4
4
|
mdify/__init__.py
|
|
5
5
|
mdify/__main__.py
|
|
6
6
|
mdify/cli.py
|
|
7
|
+
mdify/../assets/mdify.png
|
|
7
8
|
mdify_cli.egg-info/PKG-INFO
|
|
8
9
|
mdify_cli.egg-info/SOURCES.txt
|
|
9
10
|
mdify_cli.egg-info/dependency_links.txt
|
|
10
11
|
mdify_cli.egg-info/entry_points.txt
|
|
11
|
-
mdify_cli.egg-info/
|
|
12
|
+
mdify_cli.egg-info/requires.txt
|
|
13
|
+
mdify_cli.egg-info/top_level.txt
|
|
14
|
+
tests/test_cli.py
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "mdify-cli"
|
|
3
|
-
version = "1.2.0"
|
|
4
|
-
description = "
|
|
3
|
+
version = "1.6.0"
|
|
4
|
+
description = "Convert PDFs and document images into structured Markdown for LLM workflows"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.8"
|
|
7
7
|
license = "MIT"
|
|
@@ -26,6 +26,9 @@ classifiers = [
|
|
|
26
26
|
]
|
|
27
27
|
dependencies = []
|
|
28
28
|
|
|
29
|
+
[project.optional-dependencies]
|
|
30
|
+
dev = ["pytest>=7.0"]
|
|
31
|
+
|
|
29
32
|
[project.urls]
|
|
30
33
|
Homepage = "https://github.com/tiroq/mdify"
|
|
31
34
|
Repository = "https://github.com/tiroq/mdify"
|
|
@@ -41,3 +44,6 @@ build-backend = "setuptools.build_meta"
|
|
|
41
44
|
[tool.setuptools.packages.find]
|
|
42
45
|
include = ["mdify", "mdify.*"]
|
|
43
46
|
exclude = ["runtime", "runtime.*"]
|
|
47
|
+
|
|
48
|
+
[tool.setuptools.package-data]
|
|
49
|
+
mdify = ["../assets/*.png"]
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Tests for mdify CLI runtime detection."""
|
|
2
|
+
|
|
3
|
+
from unittest.mock import patch
|
|
4
|
+
from mdify.cli import detect_runtime
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class TestDetectRuntime:
|
|
8
|
+
"""Tests for detect_runtime() function."""
|
|
9
|
+
|
|
10
|
+
def test_auto_docker_exists(self):
|
|
11
|
+
with patch("mdify.cli.shutil.which") as mock_which:
|
|
12
|
+
mock_which.side_effect = (
|
|
13
|
+
lambda x: "/usr/bin/docker" if x == "docker" else None
|
|
14
|
+
)
|
|
15
|
+
result = detect_runtime("docker", explicit=False)
|
|
16
|
+
assert result == "/usr/bin/docker"
|
|
17
|
+
|
|
18
|
+
def test_auto_only_podman_exists(self, capsys):
|
|
19
|
+
with patch("mdify.cli.shutil.which") as mock_which:
|
|
20
|
+
mock_which.side_effect = (
|
|
21
|
+
lambda x: "/usr/bin/podman" if x == "podman" else None
|
|
22
|
+
)
|
|
23
|
+
result = detect_runtime("docker", explicit=False)
|
|
24
|
+
assert result == "/usr/bin/podman"
|
|
25
|
+
captured = capsys.readouterr()
|
|
26
|
+
assert captured.err == ""
|
|
27
|
+
|
|
28
|
+
def test_auto_neither_exists(self):
|
|
29
|
+
with patch("mdify.cli.shutil.which", return_value=None):
|
|
30
|
+
result = detect_runtime("docker", explicit=False)
|
|
31
|
+
assert result is None
|
|
32
|
+
|
|
33
|
+
def test_explicit_docker_exists(self):
|
|
34
|
+
with patch("mdify.cli.shutil.which") as mock_which:
|
|
35
|
+
mock_which.side_effect = (
|
|
36
|
+
lambda x: "/usr/bin/docker" if x == "docker" else None
|
|
37
|
+
)
|
|
38
|
+
result = detect_runtime("docker", explicit=True)
|
|
39
|
+
assert result == "/usr/bin/docker"
|
|
40
|
+
|
|
41
|
+
def test_explicit_docker_fallback_to_podman(self, capsys):
|
|
42
|
+
with patch("mdify.cli.shutil.which") as mock_which:
|
|
43
|
+
mock_which.side_effect = (
|
|
44
|
+
lambda x: "/usr/bin/podman" if x == "podman" else None
|
|
45
|
+
)
|
|
46
|
+
result = detect_runtime("docker", explicit=True)
|
|
47
|
+
assert result == "/usr/bin/podman"
|
|
48
|
+
captured = capsys.readouterr()
|
|
49
|
+
assert "Warning: docker not found, using podman" in captured.err
|
|
50
|
+
|
|
51
|
+
def test_explicit_docker_neither_exists(self):
|
|
52
|
+
with patch("mdify.cli.shutil.which", return_value=None):
|
|
53
|
+
result = detect_runtime("docker", explicit=True)
|
|
54
|
+
assert result is None
|
|
55
|
+
|
|
56
|
+
def test_explicit_podman_exists(self):
|
|
57
|
+
with patch("mdify.cli.shutil.which") as mock_which:
|
|
58
|
+
mock_which.side_effect = (
|
|
59
|
+
lambda x: "/usr/bin/podman" if x == "podman" else None
|
|
60
|
+
)
|
|
61
|
+
result = detect_runtime("podman", explicit=True)
|
|
62
|
+
assert result == "/usr/bin/podman"
|
|
63
|
+
|
|
64
|
+
def test_explicit_podman_fallback_to_docker(self, capsys):
|
|
65
|
+
with patch("mdify.cli.shutil.which") as mock_which:
|
|
66
|
+
mock_which.side_effect = (
|
|
67
|
+
lambda x: "/usr/bin/docker" if x == "docker" else None
|
|
68
|
+
)
|
|
69
|
+
result = detect_runtime("podman", explicit=True)
|
|
70
|
+
assert result == "/usr/bin/docker"
|
|
71
|
+
captured = capsys.readouterr()
|
|
72
|
+
assert "Warning: podman not found, using docker" in captured.err
|
|
73
|
+
|
|
74
|
+
def test_explicit_podman_neither_exists(self):
|
|
75
|
+
with patch("mdify.cli.shutil.which", return_value=None):
|
|
76
|
+
result = detect_runtime("podman", explicit=True)
|
|
77
|
+
assert result is None
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|