mdify-cli 1.5.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdify/__init__.py +1 -1
- mdify/cli.py +251 -204
- mdify/container.py +128 -0
- mdify/docling_client.py +224 -0
- {mdify_cli-1.5.0.dist-info → mdify_cli-2.0.0.dist-info}/METADATA +40 -15
- mdify_cli-2.0.0.dist-info/RECORD +12 -0
- {mdify_cli-1.5.0.dist-info → mdify_cli-2.0.0.dist-info}/WHEEL +1 -1
- mdify_cli-1.5.0.dist-info/RECORD +0 -10
- {mdify_cli-1.5.0.dist-info → mdify_cli-2.0.0.dist-info}/entry_points.txt +0 -0
- {mdify_cli-1.5.0.dist-info → mdify_cli-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {mdify_cli-1.5.0.dist-info → mdify_cli-2.0.0.dist-info}/top_level.txt +0 -0
mdify/__init__.py
CHANGED
mdify/cli.py
CHANGED
@@ -21,6 +21,8 @@ from urllib.error import URLError
 from urllib.request import urlopen
 
 from . import __version__
+from mdify.container import DoclingContainer
+from mdify.docling_client import convert_file
 
 # Configuration
 MDIFY_HOME = Path.home() / ".mdify"
@@ -29,7 +31,8 @@ PYPI_API_URL = "https://pypi.org/pypi/mdify-cli/json"
 CHECK_INTERVAL_SECONDS = 86400  # 24 hours
 
 # Container configuration
-DEFAULT_IMAGE = "ghcr.io/
+DEFAULT_IMAGE = "ghcr.io/docling-project/docling-serve-cpu:main"
+GPU_IMAGE = "ghcr.io/docling-project/docling-serve-cu126:main"
 SUPPORTED_RUNTIMES = ("docker", "podman")
 
 
@@ -37,10 +40,11 @@ SUPPORTED_RUNTIMES = ("docker", "podman")
 # Update checking functions
 # =============================================================================
 
+
 def _get_remote_version(timeout: int = 5) -> Optional[str]:
     """
     Fetch the latest version from PyPI.
-
+
     Returns:
         Version string (e.g., "1.1.0") or None if fetch failed.
     """
@@ -56,16 +60,16 @@ def _get_remote_version(timeout: int = 5) -> Optional[str]:
 def _should_check_for_update() -> bool:
     """
     Determine if we should check for updates based on last check time.
-
+
     Returns:
         True if check should be performed, False otherwise.
     """
     if os.environ.get("MDIFY_NO_UPDATE_CHECK", "").lower() in ("1", "true", "yes"):
         return False
-
+
     if not LAST_CHECK_FILE.exists():
         return True
-
+
     try:
         last_check = float(LAST_CHECK_FILE.read_text().strip())
         elapsed = time.time() - last_check
@@ -86,18 +90,18 @@ def _update_last_check_time() -> None:
 def _compare_versions(current: str, remote: str) -> bool:
     """
     Compare version strings.
-
+
     Returns:
         True if remote version is newer than current.
     """
     try:
         current_parts = [int(x) for x in current.split(".")]
         remote_parts = [int(x) for x in remote.split(".")]
-
+
         max_len = max(len(current_parts), len(remote_parts))
         current_parts.extend([0] * (max_len - len(current_parts)))
         remote_parts.extend([0] * (max_len - len(remote_parts)))
-
+
         return remote_parts > current_parts
     except (ValueError, AttributeError):
         return False
@@ -106,15 +110,15 @@ def _compare_versions(current: str, remote: str) -> bool:
 def check_for_update(force: bool = False) -> None:
     """
     Check for updates and prompt user to upgrade if available.
-
+
     Args:
         force: If True, check regardless of last check time and show errors.
     """
     if not force and not _should_check_for_update():
         return
-
+
     remote_version = _get_remote_version()
-
+
     if remote_version is None:
         if force:
             print(
@@ -124,19 +128,19 @@ def check_for_update(force: bool = False) -> None:
             )
             sys.exit(1)
         return
-
+
     _update_last_check_time()
-
+
     if not _compare_versions(__version__, remote_version):
         if force:
             print(f"mdify is up to date (version {__version__})")
         return
-
-    print(f"\n{'='*50}")
+
+    print(f"\n{'=' * 50}")
     print(f"A new version of mdify-cli is available!")
     print(f"  Current version: {__version__}")
     print(f"  Latest version:  {remote_version}")
-    print(f"{'='*50}")
+    print(f"{'=' * 50}")
     print(f"\nTo upgrade, run:")
     print(f"  pipx upgrade mdify-cli")
     print(f"  # or: pip install --upgrade mdify-cli\n")
@@ -146,13 +150,18 @@ def check_for_update(force: bool = False) -> None:
 # Container runtime functions
 # =============================================================================
 
-def detect_runtime(preferred: str) -> Optional[str]:
+
+def detect_runtime(preferred: str, explicit: bool = True) -> Optional[str]:
     """
     Detect available container runtime.
-
+
     Args:
         preferred: Preferred runtime ('docker' or 'podman')
-
+        explicit: If True, warn when falling back to alternative.
+            If False, silently use alternative without warning.
+            Note: This only controls warning emission; selection order
+            is always preferred → alternative regardless of this flag.
+
     Returns:
         Path to runtime executable, or None if not found.
     """
@@ -160,25 +169,28 @@ def detect_runtime(preferred: str) -> Optional[str]:
     runtime_path = shutil.which(preferred)
     if runtime_path:
         return runtime_path
-
+
     # Try alternative
     alternative = "podman" if preferred == "docker" else "docker"
     runtime_path = shutil.which(alternative)
     if runtime_path:
-
+        if explicit:
+            print(
+                f"Warning: {preferred} not found, using {alternative}", file=sys.stderr
+            )
         return runtime_path
-
+
     return None
 
 
 def check_image_exists(runtime: str, image: str) -> bool:
     """
     Check if container image exists locally.
-
+
     Args:
         runtime: Path to container runtime
         image: Image name/tag
-
+
     Returns:
         True if image exists locally.
     """
@@ -196,18 +208,18 @@ def check_image_exists(runtime: str, image: str) -> bool:
 def pull_image(runtime: str, image: str, quiet: bool = False) -> bool:
     """
     Pull container image.
-
+
     Args:
         runtime: Path to container runtime
         image: Image name/tag
         quiet: Suppress progress output
-
+
     Returns:
         True if pull succeeded.
     """
     if not quiet:
         print(f"Pulling image: {image}")
-
+
     try:
         result = subprocess.run(
             [runtime, "pull", image],
@@ -222,9 +234,9 @@ def pull_image(runtime: str, image: str, quiet: bool = False) -> bool:
 
 def format_size(size_bytes: int) -> str:
     """Format file size in human-readable format."""
-    for unit in [
+    for unit in ["B", "KB", "MB", "GB"]:
         if size_bytes < 1024:
-            return f"{size_bytes:.1f} {unit}" if unit !=
+            return f"{size_bytes:.1f} {unit}" if unit != "B" else f"{size_bytes} {unit}"
         size_bytes /= 1024
     return f"{size_bytes:.1f} TB"
 
@@ -244,29 +256,33 @@ def format_duration(seconds: float) -> str:
 
 class Spinner:
     """A simple spinner to show progress during long operations."""
-
+
     def __init__(self):
-        self.frames = [
+        self.frames = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
         self.running = False
         self.thread = None
         self.start_time = None
-
+
     def _spin(self):
         idx = 0
         while self.running:
             elapsed = time.time() - self.start_time
             frame = self.frames[idx % len(self.frames)]
-            print(
+            print(
+                f"\r{self.prefix} {frame} ({format_duration(elapsed)})",
+                end="",
+                flush=True,
+            )
             idx += 1
             time.sleep(0.1)
-
+
     def start(self, prefix: str = ""):
         self.prefix = prefix
         self.running = True
         self.start_time = time.time()
         self.thread = threading.Thread(target=self._spin, daemon=True)
         self.thread.start()
-
+
     def stop(self):
         self.running = False
         if self.thread:
@@ -275,93 +291,45 @@ class Spinner:
         print(f"\r{' ' * 80}\r", end="", flush=True)
 
 
-def run_container(
-    runtime: str,
-    image: str,
-    input_file: Path,
-    output_file: Path,
-    mask_pii: bool = False,
-) -> Tuple[bool, str, float]:
-    """
-    Run container to convert a single file.
-
-    Args:
-        runtime: Path to container runtime
-        image: Image name/tag
-        input_file: Absolute path to input file
-        output_file: Absolute path to output file
-        mask_pii: Whether to mask PII in images
-
-    Returns:
-        Tuple of (success: bool, message: str, elapsed_seconds: float)
-    """
-    start_time = time.time()
-
-    # Ensure output directory exists
-    output_file.parent.mkdir(parents=True, exist_ok=True)
-
-    # Mount directories
-    input_dir = input_file.parent
-    output_dir = output_file.parent
-
-    # Container paths
-    container_in = f"/work/in/{input_file.name}"
-    container_out = f"/work/out/{output_file.name}"
-
-    cmd = [
-        runtime, "run", "--rm",
-        "-v", f"{input_dir}:/work/in:ro",
-        "-v", f"{output_dir}:/work/out",
-        image,
-        "--in", container_in,
-        "--out", container_out,
-    ]
-
-    if mask_pii:
-        cmd.append("--mask")
-
-    try:
-        result = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-            check=False,
-        )
-        elapsed = time.time() - start_time
-
-        if result.returncode == 0:
-            return True, "success", elapsed
-        else:
-            error_msg = result.stderr.strip() or result.stdout.strip() or "Unknown error"
-            return False, error_msg, elapsed
-
-    except OSError as e:
-        elapsed = time.time() - start_time
-        return False, str(e), elapsed
-
-
 # =============================================================================
 # File handling functions
 # =============================================================================
 
 # Supported file extensions (based on Docling InputFormat)
 SUPPORTED_EXTENSIONS = {
-
-
-
-
-
-
-
-
-
+    ".pdf",
+    ".docx",
+    ".pptx",
+    ".html",
+    ".htm",
+    ".png",
+    ".jpg",
+    ".jpeg",
+    ".gif",
+    ".bmp",
+    ".tiff",
+    ".tif",  # images
+    ".asciidoc",
+    ".adoc",
+    ".asc",  # asciidoc
+    ".md",
+    ".markdown",  # markdown
+    ".csv",
+    ".xlsx",  # spreadsheets
+    ".xml",  # XML formats
+    ".json",  # JSON docling
+    ".mp3",
+    ".wav",
+    ".m4a",
+    ".flac",  # audio
+    ".vtt",  # subtitles
 }
 
 
 def get_files_to_convert(input_path: Path, mask: str, recursive: bool) -> List[Path]:
     """Get list of files to convert based on input path and options."""
     files = []
-
+
     if input_path.is_file():
         files.append(input_path)
     elif input_path.is_dir():
@@ -369,19 +337,19 @@ def get_files_to_convert(input_path: Path, mask: str, recursive: bool) -> List[Path]:
             files = list(input_path.rglob(mask))
         else:
             files = list(input_path.glob(mask))
-
+
         # Filter to only files
         files = [f for f in files if f.is_file()]
     else:
         raise FileNotFoundError(f"Input path does not exist: {input_path}")
-
+
     # Filter out hidden files and unsupported formats
     files = [
-        f
-
-        and f.suffix.lower() in SUPPORTED_EXTENSIONS
+        f
+        for f in files
+        if not f.name.startswith(".") and f.suffix.lower() in SUPPORTED_EXTENSIONS
     ]
-
+
     return files
 
 
@@ -414,7 +382,7 @@ def get_output_path(
             output_path = output_dir / relative_path.parent / output_name
         except ValueError:
             output_path = output_dir / output_name
-
+
     return output_path
 
 
@@ -422,6 +390,7 @@ def get_output_path(
 # CLI argument parsing
 # =============================================================================
 
+
 def parse_args() -> argparse.Namespace:
     """Parse command line arguments."""
     parser = argparse.ArgumentParser(
@@ -436,74 +405,92 @@ Examples:
   mdify ./docs --runtime podman    Use Podman instead of Docker
 """,
     )
-
+
     parser.add_argument(
         "input",
         type=str,
         nargs="?",
         help="Input file or directory to convert",
     )
-
+
     parser.add_argument(
-        "-o",
+        "-o",
+        "--out-dir",
         type=str,
         default="output",
         help="Output directory for converted files (default: output)",
     )
-
+
     parser.add_argument(
-        "-g",
+        "-g",
+        "--glob",
        type=str,
        default="*",
        help="Glob pattern for filtering files in directory (default: *)",
    )
-
+
    parser.add_argument(
-        "-r",
+        "-r",
+        "--recursive",
        action="store_true",
        help="Recursively scan directories",
    )
-
+
    parser.add_argument(
        "--flat",
        action="store_true",
        help="Disable directory structure preservation in output",
    )
-
+
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing output files",
    )
-
+
    parser.add_argument(
-        "-q",
+        "-q",
+        "--quiet",
        action="store_true",
        help="Suppress progress messages",
    )
-
+
    parser.add_argument(
-        "-m",
+        "-m",
+        "--mask",
        action="store_true",
        help="Mask PII and sensitive content in document images",
    )
-
+
+    parser.add_argument(
+        "--gpu",
+        action="store_true",
+        help="Use GPU-accelerated container image (docling-serve-cu126)",
+    )
+
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=5001,
+        help="Port for docling-serve container (default: 5001)",
+    )
+
    # Container options
    parser.add_argument(
        "--runtime",
        type=str,
        choices=SUPPORTED_RUNTIMES,
-        default=
-        help="Container runtime to use (
+        default=None,
+        help="Container runtime to use (auto-detects docker or podman if not specified)",
    )
-
+
    parser.add_argument(
        "--image",
        type=str,
        default=DEFAULT_IMAGE,
        help=f"Container image to use (default: {DEFAULT_IMAGE})",
    )
-
+
    parser.add_argument(
        "--pull",
        type=str,
@@ -511,20 +498,20 @@ Examples:
         default="missing",
         help="Image pull policy: always, missing, never (default: missing)",
     )
-
+
     # Utility options
     parser.add_argument(
         "--check-update",
         action="store_true",
         help="Check for available updates and exit",
     )
-
+
     parser.add_argument(
         "--version",
         action="version",
         version=f"mdify {__version__}",
     )
-
+
     return parser.parse_args()
 
 
@@ -532,27 +519,30 @@ Examples:
 # Main entry point
 # =============================================================================
 
+
 def main() -> int:
     """Main entry point for the CLI."""
     args = parse_args()
-
+
     # Handle --check-update flag
     if args.check_update:
         check_for_update(force=True)
         return 0
-
+
     # Check for updates (daily, silent on errors)
     check_for_update(force=False)
-
+
     # Validate input is provided
     if args.input is None:
         print("Error: Input file or directory is required", file=sys.stderr)
         print("Usage: mdify <input> [options]", file=sys.stderr)
         print("  mdify --help for more information", file=sys.stderr)
         return 1
-
+
     # Detect container runtime
-
+    preferred = args.runtime if args.runtime else "docker"
+    explicit = args.runtime is not None
+    runtime = detect_runtime(preferred, explicit=explicit)
     if runtime is None:
         print(
             f"Error: Container runtime not found ({', '.join(SUPPORTED_RUNTIMES)})",
@@ -560,109 +550,166 @@ def main() -> int:
         )
         print("Please install Docker or Podman to use mdify.", file=sys.stderr)
         return 2
-
+
     # Handle image pull policy
-    image
+    # Determine image based on --gpu flag
+    if args.gpu:
+        image = GPU_IMAGE
+    elif args.image:
+        image = args.image
+    else:
+        image = DEFAULT_IMAGE
+
     image_exists = check_image_exists(runtime, image)
-
+
     if args.pull == "always" or (args.pull == "missing" and not image_exists):
         if not pull_image(runtime, image, args.quiet):
             print(f"Error: Failed to pull image: {image}", file=sys.stderr)
             return 1
     elif args.pull == "never" and not image_exists:
         print(f"Error: Image not found locally: {image}", file=sys.stderr)
-        print(f"Run with --pull=missing or pull manually: {
+        print(f"Run with --pull=missing or pull manually: {preferred} pull {image}")
         return 1
-
+
     # Resolve paths
     input_path = Path(args.input).resolve()
     output_dir = Path(args.out_dir).resolve()
-
+
     # Validate input
     if not input_path.exists():
         print(f"Error: Input path does not exist: {input_path}", file=sys.stderr)
         return 1
-
+
     # Get files to convert
     try:
         files_to_convert = get_files_to_convert(input_path, args.glob, args.recursive)
     except Exception as e:
         print(f"Error: {e}", file=sys.stderr)
         return 1
-
+
     if not files_to_convert:
         print(f"No files found to convert in: {input_path}", file=sys.stderr)
         return 1
-
+
     total_files = len(files_to_convert)
     total_size = sum(f.stat().st_size for f in files_to_convert)
-
+
     if not args.quiet:
         print(f"Found {total_files} file(s) to convert ({format_size(total_size)})")
         print(f"Using runtime: {runtime}")
         print(f"Using image: {image}")
         print()
-
+
+    if args.mask:
+        print(
+            "Warning: --mask is not supported with docling-serve and will be ignored",
+            file=sys.stderr,
+        )
+
     # Determine input base for directory structure preservation
     if input_path.is_file():
         input_base = input_path.parent
     else:
         input_base = input_path
-
-    # Convert files
+
     success_count = 0
     skipped_count = 0
     failed_count = 0
-
-
-
-    for idx, input_file in enumerate(files_to_convert, 1):
-        output_file = get_output_path(input_file, input_base, output_dir, args.flat)
-        file_size = input_file.stat().st_size
-        progress = f"[{idx}/{total_files}]"
-
-        # Check if output exists and skip if not overwriting
-        if output_file.exists() and not args.overwrite:
-            if not args.quiet:
-                print(f"{progress} Skipped (exists): {input_file.name}")
-            skipped_count += 1
-            continue
-
-        # Show spinner while processing
+    total_elapsed = 0.0
+
+    try:
         if not args.quiet:
-
-
-
-
-
-
+            print(f"Starting docling-serve container...")
+            print()
+
+        with DoclingContainer(runtime, image, args.port) as container:
+            # Convert files
+            conversion_start = time.time()
+            spinner = Spinner()
+
+            for idx, input_file in enumerate(files_to_convert, 1):
+                output_file = get_output_path(
+                    input_file, input_base, output_dir, args.flat
+                )
+                file_size = input_file.stat().st_size
+                progress = f"[{idx}/{total_files}]"
+
+                # Check if output exists and skip if not overwriting
+                if output_file.exists() and not args.overwrite:
+                    if not args.quiet:
+                        print(f"{progress} Skipped (exists): {input_file.name}")
+                    skipped_count += 1
+                    continue
+
+                # Ensure output directory exists
+                output_file.parent.mkdir(parents=True, exist_ok=True)
+
+                # Show spinner while processing
+                if not args.quiet:
+                    spinner.start(
+                        f"{progress} Processing: {input_file.name} ({format_size(file_size)})"
+                    )
+
+                start_time = time.time()
+                try:
+                    # Convert via HTTP API
+                    result = convert_file(
+                        container.base_url, input_file, to_format="md"
+                    )
+                    elapsed = time.time() - start_time
+
+                    if not args.quiet:
+                        spinner.stop()
+
+                    if result.success:
+                        # Write result to output file
+                        output_file.write_text(result.content)
+                        success_count += 1
+                        if not args.quiet:
+                            print(
+                                f"{progress} {input_file.name} ✓ ({format_duration(elapsed)})"
+                            )
+                    else:
+                        failed_count += 1
+                        error_msg = result.error or "Unknown error"
+                        if not args.quiet:
+                            print(
+                                f"{progress} {input_file.name} ✗ ({format_duration(elapsed)})"
+                            )
+                            print(f"  Error: {error_msg}", file=sys.stderr)
+                except Exception as e:
+                    elapsed = time.time() - start_time
+                    failed_count += 1
+                    if not args.quiet:
+                        spinner.stop()
+                        print(
+                            f"{progress} {input_file.name} ✗ ({format_duration(elapsed)})"
+                        )
+                        print(f"  Error: {str(e)}", file=sys.stderr)
+
+            total_elapsed = time.time() - conversion_start
+
+        # Print summary
         if not args.quiet:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            print(f"  Total files: {total_files}")
-            print(f"  Successful: {success_count}")
-            print(f"  Skipped: {skipped_count}")
-            print(f"  Failed: {failed_count}")
-            print(f"  Total time: {format_duration(total_elapsed)}")
-            print("=" * 50)
-
+            print()
+            print("=" * 50)
+            print("Conversion Summary:")
+            print(f"  Total files: {total_files}")
+            print(f"  Successful: {success_count}")
+            print(f"  Skipped: {skipped_count}")
+            print(f"  Failed: {failed_count}")
+            print(f"  Total time: {format_duration(total_elapsed)}")
+            print("=" * 50)
+
+    except KeyboardInterrupt:
+        if not args.quiet:
+            print("\n\nInterrupted by user. Container stopped.")
+        if success_count > 0 or skipped_count > 0 or failed_count > 0:
+            print(
+                f"Partial progress: {success_count} successful, {failed_count} failed, {skipped_count} skipped"
+            )
+        return 130
+
     # Return appropriate exit code
     if failed_count > 0:
         return 1
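Taken together, the cli.py changes replace the old per-file `run_container` subprocess call with one long-lived docling-serve container and an HTTP request per file. A minimal sketch of that flow, using only the `DoclingContainer` and `convert_file` APIs added in this version (the runtime, image, file list, and output directory below are illustrative, not taken from the CLI):

```python
from pathlib import Path

from mdify.container import DoclingContainer
from mdify.docling_client import convert_file

runtime = "docker"  # illustrative; the CLI resolves this via detect_runtime()
image = "ghcr.io/docling-project/docling-serve-cpu:main"
files = [Path("docs/report.pdf"), Path("docs/slides.pptx")]
out_dir = Path("output")
out_dir.mkdir(parents=True, exist_ok=True)

# The container is started once, health-checked, and reused for every file.
with DoclingContainer(runtime, image, port=5001) as container:
    for src in files:
        result = convert_file(container.base_url, src, to_format="md")
        if result.success:
            (out_dir / src.with_suffix(".md").name).write_text(result.content)
        else:
            print(f"{src.name} failed: {result.error}")
```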
mdify/container.py
ADDED
@@ -0,0 +1,128 @@
+"""Container lifecycle management for docling-serve."""
+
+import subprocess
+import time
+import uuid
+from typing import Optional
+
+from mdify.docling_client import check_health
+
+
+class DoclingContainer:
+    """Manages docling-serve container lifecycle.
+
+    Provides context manager support for automatic startup and cleanup.
+
+    Usage:
+        with DoclingContainer("docker", "ghcr.io/docling-project/docling-serve-cpu:main") as container:
+            # Container is running and healthy
+            response = requests.post(f"{container.base_url}/v1/convert/file", ...)
+        # Container automatically stopped and removed
+    """
+
+    def __init__(self, runtime: str, image: str, port: int = 5001):
+        """Initialize container manager.
+
+        Args:
+            runtime: Container runtime ("docker" or "podman")
+            image: Container image to use
+            port: Host port to bind (default: 5001)
+        """
+        self.runtime = runtime
+        self.image = image
+        self.port = port
+        self.container_name = f"mdify-serve-{uuid.uuid4().hex[:8]}"
+        self.container_id: Optional[str] = None
+
+    @property
+    def base_url(self) -> str:
+        """Return base URL for API requests."""
+        return f"http://localhost:{self.port}"
+
+    def start(self, timeout: int = 120) -> None:
+        """Start container and wait for health check.
+
+        Args:
+            timeout: Maximum seconds to wait for health (default: 120)
+
+        Raises:
+            subprocess.CalledProcessError: If container fails to start
+            TimeoutError: If health check doesn't pass within timeout
+        """
+        # Start container in detached mode
+        cmd = [
+            self.runtime,
+            "run",
+            "-d",  # Detached mode
+            "--rm",  # Auto-remove on stop
+            "--name",
+            self.container_name,
+            "-p",
+            f"{self.port}:5001",
+            self.image,
+        ]
+
+        try:
+            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+            self.container_id = result.stdout.strip()
+        except subprocess.CalledProcessError as e:
+            error_msg = e.stderr.strip() or e.stdout.strip() or "Unknown error"
+            raise subprocess.CalledProcessError(
+                e.returncode,
+                e.cmd,
+                output=e.stdout,
+                stderr=f"Failed to start container: {error_msg}",
+            )
+
+        # Wait for health check
+        self._wait_for_health(timeout)
+
+    def stop(self) -> None:
+        """Stop and remove container. Safe to call multiple times."""
+        if self.container_name:
+            subprocess.run(
+                [self.runtime, "stop", self.container_name],
+                capture_output=True,
+                check=False,
+            )
+
+    def is_ready(self) -> bool:
+        """Check if container is healthy.
+
+        Returns:
+            True if container is healthy, False otherwise
+        """
+        try:
+            return check_health(self.base_url)
+        except Exception:
+            return False
+
+    def _wait_for_health(self, timeout: int) -> None:
+        """Poll health endpoint until ready.
+
+        Args:
+            timeout: Maximum seconds to wait
+
+        Raises:
+            TimeoutError: If health check doesn't pass within timeout
+        """
+        start_time = time.time()
+        while time.time() - start_time < timeout:
+            try:
+                if check_health(self.base_url):
+                    return
+            except Exception:
+                pass
+            time.sleep(2)  # Poll every 2 seconds
+
+        raise TimeoutError(f"Container failed to become healthy within {timeout}s")
+
+    def __enter__(self):
+        """Context manager entry."""
+        self.start()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit - ensures cleanup."""
+        self.stop()
+        return False
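The context-manager form shown in the class docstring is equivalent to calling `start()` and `stop()` explicitly, which can be preferable when the container should outlive a single `with` block. A minimal sketch using only the methods defined above (the runtime and image are illustrative):

```python
from mdify.container import DoclingContainer

container = DoclingContainer(
    "podman", "ghcr.io/docling-project/docling-serve-cpu:main", port=5001
)
container.start(timeout=120)  # runs `podman run -d --rm -p 5001:5001 ...`, then polls /health
try:
    if container.is_ready():
        print(f"docling-serve is listening at {container.base_url}")
finally:
    container.stop()  # `podman stop <name>`; --rm removes the container afterwards
```

Because each instance gets a random `mdify-serve-<hex>` name, several invocations can run side by side without colliding.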
mdify/docling_client.py
ADDED
@@ -0,0 +1,224 @@
+"""HTTP client for docling-serve REST API."""
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+
+import requests
+
+
+@dataclass
+class ConvertResult:
+    """Result from document conversion."""
+
+    content: str
+    format: str
+    success: bool
+    error: Optional[str] = None
+
+
+@dataclass
+class StatusResult:
+    """Status of async conversion task."""
+
+    status: str  # "pending", "completed", "failed"
+    task_id: str
+    error: Optional[str] = None
+
+
+class DoclingClientError(Exception):
+    """Base exception for docling client errors."""
+
+    pass
+
+
+class DoclingHTTPError(DoclingClientError):
+    """HTTP error from docling-serve API."""
+
+    def __init__(self, status_code: int, message: str):
+        self.status_code = status_code
+        super().__init__(f"HTTP {status_code}: {message}")
+
+
+def check_health(base_url: str) -> bool:
+    """Check if docling-serve is healthy.
+
+    Args:
+        base_url: Base URL of docling-serve (e.g., "http://localhost:8000")
+
+    Returns:
+        True if healthy, False otherwise
+    """
+    try:
+        response = requests.get(f"{base_url}/health")
+        return response.status_code == 200
+    except requests.RequestException:
+        return False
+
+
+def convert_file(
+    base_url: str, file_path: Path, to_format: str = "md", do_ocr: bool = True
+) -> ConvertResult:
+    """Convert a file synchronously.
+
+    Args:
+        base_url: Base URL of docling-serve
+        file_path: Path to file to convert
+        to_format: Output format (default: "md")
+        do_ocr: Whether to perform OCR (default: True)
+
+    Returns:
+        ConvertResult with conversion output
+
+    Raises:
+        DoclingHTTPError: If HTTP request fails
+    """
+    try:
+        with open(file_path, "rb") as f:
+            response = requests.post(
+                f"{base_url}/v1/convert/file",
+                files={"files": (file_path.name, f, "application/pdf")},
+                data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
+            )
+
+        if response.status_code != 200:
+            raise DoclingHTTPError(
+                response.status_code, response.text or "Conversion failed"
+            )
+
+        result_data = response.json()
+
+        # docling-serve returns results in a list format
+        if isinstance(result_data, list) and len(result_data) > 0:
+            first_result = result_data[0]
+            return ConvertResult(
+                content=first_result.get("content", ""), format=to_format, success=True
+            )
+        elif isinstance(result_data, dict):
+            return ConvertResult(
+                content=result_data.get("content", ""), format=to_format, success=True
+            )
+        else:
+            raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")
+
+    except requests.RequestException as e:
+        return ConvertResult(content="", format=to_format, success=False, error=str(e))
+
+
+def convert_file_async(
+    base_url: str, file_path: Path, to_format: str = "md", do_ocr: bool = True
+) -> str:
+    """Start async file conversion.
+
+    Args:
+        base_url: Base URL of docling-serve
+        file_path: Path to file to convert
+        to_format: Output format (default: "md")
+        do_ocr: Whether to perform OCR (default: True)
+
+    Returns:
+        Task ID for polling
+
+    Raises:
+        DoclingHTTPError: If HTTP request fails
+    """
+    try:
+        with open(file_path, "rb") as f:
+            response = requests.post(
+                f"{base_url}/v1/convert/file/async",
+                files={"files": (file_path.name, f, "application/pdf")},
+                data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
+            )
+
+        if response.status_code != 200:
+            raise DoclingHTTPError(
+                response.status_code, response.text or "Async conversion failed"
+            )
+
+        result_data = response.json()
+        task_id = result_data.get("task_id")
+
+        if not task_id:
+            raise DoclingHTTPError(200, f"No task_id in response: {result_data}")
+
+        return task_id
+
+    except requests.RequestException as e:
+        raise DoclingHTTPError(500, str(e))
+
+
+def poll_status(base_url: str, task_id: str) -> StatusResult:
+    """Poll status of async conversion task.
+
+    Args:
+        base_url: Base URL of docling-serve
+        task_id: Task ID from convert_file_async
+
+    Returns:
+        StatusResult with current status
+
+    Raises:
+        DoclingHTTPError: If HTTP request fails
+    """
+    try:
+        response = requests.get(f"{base_url}/v1/status/poll/{task_id}")
+
+        if response.status_code != 200:
+            raise DoclingHTTPError(
+                response.status_code, response.text or "Status poll failed"
+            )
+
+        result_data = response.json()
+
+        return StatusResult(
+            status=result_data.get("status", "unknown"),
+            task_id=task_id,
+            error=result_data.get("error"),
+        )
+
+    except requests.RequestException as e:
+        raise DoclingHTTPError(500, str(e))
+
+
+def get_result(base_url: str, task_id: str) -> ConvertResult:
+    """Get result of completed async conversion.
+
+    Args:
+        base_url: Base URL of docling-serve
+        task_id: Task ID from convert_file_async
+
+    Returns:
+        ConvertResult with conversion output
+
+    Raises:
+        DoclingHTTPError: If HTTP request fails or task not completed
+    """
+    try:
+        response = requests.get(f"{base_url}/v1/result/{task_id}")
+
+        if response.status_code != 200:
+            raise DoclingHTTPError(
+                response.status_code, response.text or "Result retrieval failed"
+            )
+
+        result_data = response.json()
+
+        # Similar to sync conversion, handle list or dict format
+        if isinstance(result_data, list) and len(result_data) > 0:
+            first_result = result_data[0]
+            return ConvertResult(
+                content=first_result.get("content", ""),
+                format=first_result.get("format", "md"),
+                success=True,
+            )
+        elif isinstance(result_data, dict):
+            return ConvertResult(
+                content=result_data.get("content", ""),
+                format=result_data.get("format", "md"),
+                success=True,
+            )
+        else:
+            raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")
+
+    except requests.RequestException as e:
+        return ConvertResult(content="", format="md", success=False, error=str(e))
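Only the synchronous `convert_file` path is exercised by cli.py in this release; the async helpers above are not used anywhere in the diff. A minimal sketch of how they compose against a running docling-serve instance — the 2-second poll interval is an assumption, and the "completed"/"failed" status strings are taken from the `StatusResult` docstring rather than verified against the server:

```python
import time
from pathlib import Path

from mdify.docling_client import convert_file_async, get_result, poll_status

base_url = "http://localhost:5001"  # e.g. DoclingContainer.base_url

# Submit the document; the server returns a task id for later polling.
task_id = convert_file_async(base_url, Path("docs/report.pdf"), to_format="md")

# Poll until the task leaves the pending state (interval is an assumption).
while True:
    status = poll_status(base_url, task_id)
    if status.status in ("completed", "failed"):
        break
    time.sleep(2)

if status.status == "completed":
    result = get_result(base_url, task_id)
    Path("report.md").write_text(result.content)
else:
    print(f"Conversion failed: {status.error}")
```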
{mdify_cli-1.5.0.dist-info → mdify_cli-2.0.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mdify-cli
-Version:
+Version: 2.0.0
 Summary: Convert PDFs and document images into structured Markdown for LLM workflows
 Author: tiroq
 License-Expression: MIT
@@ -24,6 +24,9 @@ Classifier: Topic :: Utilities
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: requests
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0; extra == "dev"
 Dynamic: license-file
 
 # mdify
@@ -98,15 +101,32 @@ Recursively convert files:
 mdify /path/to/documents -r -g "*.pdf"
 ```
 
-###
+### GPU Acceleration
 
-
+For faster processing with NVIDIA GPU:
 ```bash
-mdify
-mdify document.pdf --mask
+mdify --gpu documents/*.pdf
 ```
 
-
+Requires NVIDIA GPU with CUDA support and nvidia-container-toolkit.
+
+### ⚠️ PII Masking (Deprecated)
+
+The `--mask` flag is deprecated and will be ignored in this version. PII masking functionality was available in older versions using a custom runtime but is not supported with the current docling-serve backend.
+
+If PII masking is critical for your use case, please use mdify v1.5.x or earlier versions.
+
+## Performance
+
+mdify now uses docling-serve for significantly faster batch processing:
+
+- **Single model load**: Models are loaded once per session, not per file
+- **~10-20x speedup** for multiple file conversions compared to previous versions
+- **GPU acceleration**: Use `--gpu` for additional 2-6x speedup (requires NVIDIA GPU)
+
+### First Run Behavior
+
+The first conversion takes longer (~30-60s) as the container loads ML models into memory. Subsequent files in the same batch process quickly, typically in 1-3 seconds per file.
 
 ## Options
 
@@ -119,9 +139,11 @@ This uses Docling's content-aware masking to obscure sensitive information in em
 | `--flat` | Disable directory structure preservation |
 | `--overwrite` | Overwrite existing output files |
 | `-q, --quiet` | Suppress progress messages |
-| `-m, --mask` |
+| `-m, --mask` | ⚠️ **Deprecated**: PII masking not supported in current version |
+| `--gpu` | Use GPU-accelerated container (requires NVIDIA GPU and nvidia-container-toolkit) |
+| `--port PORT` | Container port (default: 5001) |
 | `--runtime RUNTIME` | Container runtime: docker or podman (auto-detected) |
-| `--image IMAGE` | Custom container image (default: ghcr.io/
+| `--image IMAGE` | Custom container image (default: ghcr.io/docling-project/docling-serve-cpu:main) |
 | `--pull POLICY` | Image pull policy: always, missing, never (default: missing) |
 | `--check-update` | Check for available updates and exit |
 | `--version` | Show version and exit |
@@ -175,19 +197,22 @@ The CLI:
 - Pulls the runtime container on first use
 - Mounts files and runs conversions in the container
 
-## Container
+## Container Images
+
+mdify uses official docling-serve containers:
 
-
+**CPU Version** (default):
 ```
-ghcr.io/
+ghcr.io/docling-project/docling-serve-cpu:main
 ```
 
-
-```
-
-docker build -t mdify-runtime .
+**GPU Version** (use with `--gpu` flag):
+```
+ghcr.io/docling-project/docling-serve-cu126:main
 ```
 
+These are official images from the [docling-serve project](https://github.com/DS4SD/docling-serve).
+
 ## Updates
 
 mdify checks for updates daily. When a new version is available:
mdify_cli-2.0.0.dist-info/RECORD
ADDED

@@ -0,0 +1,12 @@
+assets/mdify.png,sha256=qUj7WXWqNwpI2KNXOW79XJwqFqa-UI0JEkmt1mmy4Rg,1820418
+mdify/__init__.py,sha256=s7XlWmH4zJ5jFiPjpd7mXrCaU8bD-S9RaPzT2VHUdeQ,90
+mdify/__main__.py,sha256=bhpJ00co6MfaVOdH4XLoW04NtLYDa_oJK7ODzfLrn9M,143
+mdify/cli.py,sha256=LY0q8NlnKuN5aFz_OpO5hGro-tQNCxoYO_M0qVd6FJY,21493
+mdify/container.py,sha256=AVIhiq_wO5id5hQ_s83lUPkAPCsAoTs25azRT6JmKII,3962
+mdify/docling_client.py,sha256=_9qjL5yOOeJahOg6an2P6Iii1xkeR6wmNJZG4Q6NRkk,6553
+mdify_cli-2.0.0.dist-info/licenses/LICENSE,sha256=NWM66Uv-XuSMKaU-gaPmvfyk4WgE6zcIPr78wyg6GAo,1065
+mdify_cli-2.0.0.dist-info/METADATA,sha256=92_uBI2nnKK-YEf39TB7gX1KHbZBHqIHxLZBe7-GOqY,7923
+mdify_cli-2.0.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+mdify_cli-2.0.0.dist-info/entry_points.txt,sha256=0Xki8f5lADQUtwdt6Eq_FEaieI6Byhk8UE7BuDhChMg,41
+mdify_cli-2.0.0.dist-info/top_level.txt,sha256=qltzf7h8owHq7dxCdfCkSHY8gT21hn1_E8P-VWS_OKM,6
+mdify_cli-2.0.0.dist-info/RECORD,,
mdify_cli-1.5.0.dist-info/RECORD
DELETED
@@ -1,10 +0,0 @@
-assets/mdify.png,sha256=qUj7WXWqNwpI2KNXOW79XJwqFqa-UI0JEkmt1mmy4Rg,1820418
-mdify/__init__.py,sha256=GxfVEOJLubSaiA0jNE2zgZq7sxiJMAr6Qn-cLPK8XJU,90
-mdify/__main__.py,sha256=bhpJ00co6MfaVOdH4XLoW04NtLYDa_oJK7ODzfLrn9M,143
-mdify/cli.py,sha256=D8_1_6NgWXkexGWqkgB0JO7c1r2T2_Va7J7iGwvewQA,20038
-mdify_cli-1.5.0.dist-info/licenses/LICENSE,sha256=NWM66Uv-XuSMKaU-gaPmvfyk4WgE6zcIPr78wyg6GAo,1065
-mdify_cli-1.5.0.dist-info/METADATA,sha256=LRKZupINA7w6HM9FyuDdmrLzWYovHqxTnQRHNohmRM0,6658
-mdify_cli-1.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mdify_cli-1.5.0.dist-info/entry_points.txt,sha256=0Xki8f5lADQUtwdt6Eq_FEaieI6Byhk8UE7BuDhChMg,41
-mdify_cli-1.5.0.dist-info/top_level.txt,sha256=qltzf7h8owHq7dxCdfCkSHY8gT21hn1_E8P-VWS_OKM,6
-mdify_cli-1.5.0.dist-info/RECORD,,
{mdify_cli-1.5.0.dist-info → mdify_cli-2.0.0.dist-info}/entry_points.txt
File without changes

{mdify_cli-1.5.0.dist-info → mdify_cli-2.0.0.dist-info}/licenses/LICENSE
File without changes

{mdify_cli-1.5.0.dist-info → mdify_cli-2.0.0.dist-info}/top_level.txt
File without changes