mdify-cli 1.6.0__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdify/__init__.py +1 -1
- mdify/cli.py +132 -128
- mdify/container.py +128 -0
- mdify/docling_client.py +224 -0
- {mdify_cli-1.6.0.dist-info → mdify_cli-2.1.0.dist-info}/METADATA +38 -15
- mdify_cli-2.1.0.dist-info/RECORD +12 -0
- mdify_cli-1.6.0.dist-info/RECORD +0 -10
- {mdify_cli-1.6.0.dist-info → mdify_cli-2.1.0.dist-info}/WHEEL +0 -0
- {mdify_cli-1.6.0.dist-info → mdify_cli-2.1.0.dist-info}/entry_points.txt +0 -0
- {mdify_cli-1.6.0.dist-info → mdify_cli-2.1.0.dist-info}/licenses/LICENSE +0 -0
- {mdify_cli-1.6.0.dist-info → mdify_cli-2.1.0.dist-info}/top_level.txt +0 -0
mdify/__init__.py
CHANGED
mdify/cli.py
CHANGED
|
@@ -21,6 +21,8 @@ from urllib.error import URLError
|
|
|
21
21
|
from urllib.request import urlopen
|
|
22
22
|
|
|
23
23
|
from . import __version__
|
|
24
|
+
from mdify.container import DoclingContainer
|
|
25
|
+
from mdify.docling_client import convert_file
|
|
24
26
|
|
|
25
27
|
# Configuration
|
|
26
28
|
MDIFY_HOME = Path.home() / ".mdify"
|
|
@@ -29,7 +31,8 @@ PYPI_API_URL = "https://pypi.org/pypi/mdify-cli/json"
|
|
|
29
31
|
CHECK_INTERVAL_SECONDS = 86400 # 24 hours
|
|
30
32
|
|
|
31
33
|
# Container configuration
|
|
32
|
-
DEFAULT_IMAGE = "ghcr.io/
|
|
34
|
+
DEFAULT_IMAGE = "ghcr.io/docling-project/docling-serve-cpu:main"
|
|
35
|
+
GPU_IMAGE = "ghcr.io/docling-project/docling-serve-cu126:main"
|
|
33
36
|
SUPPORTED_RUNTIMES = ("docker", "podman")
|
|
34
37
|
|
|
35
38
|
|
|
@@ -288,79 +291,6 @@ class Spinner:
|
|
|
288
291
|
print(f"\r{' ' * 80}\r", end="", flush=True)
|
|
289
292
|
|
|
290
293
|
|
|
291
|
-
def run_container(
|
|
292
|
-
runtime: str,
|
|
293
|
-
image: str,
|
|
294
|
-
input_file: Path,
|
|
295
|
-
output_file: Path,
|
|
296
|
-
mask_pii: bool = False,
|
|
297
|
-
) -> Tuple[bool, str, float]:
|
|
298
|
-
"""
|
|
299
|
-
Run container to convert a single file.
|
|
300
|
-
|
|
301
|
-
Args:
|
|
302
|
-
runtime: Path to container runtime
|
|
303
|
-
image: Image name/tag
|
|
304
|
-
input_file: Absolute path to input file
|
|
305
|
-
output_file: Absolute path to output file
|
|
306
|
-
mask_pii: Whether to mask PII in images
|
|
307
|
-
|
|
308
|
-
Returns:
|
|
309
|
-
Tuple of (success: bool, message: str, elapsed_seconds: float)
|
|
310
|
-
"""
|
|
311
|
-
start_time = time.time()
|
|
312
|
-
|
|
313
|
-
# Ensure output directory exists
|
|
314
|
-
output_file.parent.mkdir(parents=True, exist_ok=True)
|
|
315
|
-
|
|
316
|
-
# Mount directories
|
|
317
|
-
input_dir = input_file.parent
|
|
318
|
-
output_dir = output_file.parent
|
|
319
|
-
|
|
320
|
-
# Container paths
|
|
321
|
-
container_in = f"/work/in/{input_file.name}"
|
|
322
|
-
container_out = f"/work/out/{output_file.name}"
|
|
323
|
-
|
|
324
|
-
cmd = [
|
|
325
|
-
runtime,
|
|
326
|
-
"run",
|
|
327
|
-
"--rm",
|
|
328
|
-
"-v",
|
|
329
|
-
f"{input_dir}:/work/in:ro",
|
|
330
|
-
"-v",
|
|
331
|
-
f"{output_dir}:/work/out",
|
|
332
|
-
image,
|
|
333
|
-
"--in",
|
|
334
|
-
container_in,
|
|
335
|
-
"--out",
|
|
336
|
-
container_out,
|
|
337
|
-
]
|
|
338
|
-
|
|
339
|
-
if mask_pii:
|
|
340
|
-
cmd.append("--mask")
|
|
341
|
-
|
|
342
|
-
try:
|
|
343
|
-
result = subprocess.run(
|
|
344
|
-
cmd,
|
|
345
|
-
capture_output=True,
|
|
346
|
-
text=True,
|
|
347
|
-
check=False,
|
|
348
|
-
)
|
|
349
|
-
elapsed = time.time() - start_time
|
|
350
|
-
|
|
351
|
-
if result.returncode == 0:
|
|
352
|
-
return True, "success", elapsed
|
|
353
|
-
else:
|
|
354
|
-
error_msg = (
|
|
355
|
-
result.stderr.strip() or result.stdout.strip() or "Unknown error"
|
|
356
|
-
)
|
|
357
|
-
return False, error_msg, elapsed
|
|
358
|
-
|
|
359
|
-
except OSError as e:
|
|
360
|
-
elapsed = time.time() - start_time
|
|
361
|
-
return False, str(e), elapsed
|
|
362
|
-
|
|
363
|
-
|
|
364
294
|
# =============================================================================
|
|
365
295
|
# File handling functions
|
|
366
296
|
# =============================================================================
|
|
@@ -532,6 +462,19 @@ Examples:
|
|
|
532
462
|
help="Mask PII and sensitive content in document images",
|
|
533
463
|
)
|
|
534
464
|
|
|
465
|
+
parser.add_argument(
|
|
466
|
+
"--gpu",
|
|
467
|
+
action="store_true",
|
|
468
|
+
help="Use GPU-accelerated container image (docling-serve-cu126)",
|
|
469
|
+
)
|
|
470
|
+
|
|
471
|
+
parser.add_argument(
|
|
472
|
+
"--port",
|
|
473
|
+
type=int,
|
|
474
|
+
default=5001,
|
|
475
|
+
help="Port for docling-serve container (default: 5001)",
|
|
476
|
+
)
|
|
477
|
+
|
|
535
478
|
# Container options
|
|
536
479
|
parser.add_argument(
|
|
537
480
|
"--runtime",
|
|
@@ -609,7 +552,14 @@ def main() -> int:
|
|
|
609
552
|
return 2
|
|
610
553
|
|
|
611
554
|
# Handle image pull policy
|
|
612
|
-
image
|
|
555
|
+
# Determine image based on --gpu flag
|
|
556
|
+
if args.gpu:
|
|
557
|
+
image = GPU_IMAGE
|
|
558
|
+
elif args.image:
|
|
559
|
+
image = args.image
|
|
560
|
+
else:
|
|
561
|
+
image = DEFAULT_IMAGE
|
|
562
|
+
|
|
613
563
|
image_exists = check_image_exists(runtime, image)
|
|
614
564
|
|
|
615
565
|
if args.pull == "always" or (args.pull == "missing" and not image_exists):
|
|
@@ -621,9 +571,15 @@ def main() -> int:
|
|
|
621
571
|
print(f"Run with --pull=missing or pull manually: {preferred} pull {image}")
|
|
622
572
|
return 1
|
|
623
573
|
|
|
624
|
-
# Resolve paths
|
|
625
|
-
|
|
626
|
-
|
|
574
|
+
# Resolve paths (use absolute() as fallback if resolve() fails due to permissions)
|
|
575
|
+
try:
|
|
576
|
+
input_path = Path(args.input).resolve()
|
|
577
|
+
except PermissionError:
|
|
578
|
+
input_path = Path(args.input).absolute()
|
|
579
|
+
try:
|
|
580
|
+
output_dir = Path(args.out_dir).resolve()
|
|
581
|
+
except PermissionError:
|
|
582
|
+
output_dir = Path(args.out_dir).absolute()
|
|
627
583
|
|
|
628
584
|
# Validate input
|
|
629
585
|
if not input_path.exists():
|
|
@@ -650,67 +606,115 @@ def main() -> int:
|
|
|
650
606
|
print(f"Using image: {image}")
|
|
651
607
|
print()
|
|
652
608
|
|
|
609
|
+
if args.mask:
|
|
610
|
+
print(
|
|
611
|
+
"Warning: --mask is not supported with docling-serve and will be ignored",
|
|
612
|
+
file=sys.stderr,
|
|
613
|
+
)
|
|
614
|
+
|
|
653
615
|
# Determine input base for directory structure preservation
|
|
654
616
|
if input_path.is_file():
|
|
655
617
|
input_base = input_path.parent
|
|
656
618
|
else:
|
|
657
619
|
input_base = input_path
|
|
658
620
|
|
|
659
|
-
# Convert files
|
|
660
621
|
success_count = 0
|
|
661
622
|
skipped_count = 0
|
|
662
623
|
failed_count = 0
|
|
663
|
-
|
|
664
|
-
spinner = Spinner()
|
|
665
|
-
|
|
666
|
-
for idx, input_file in enumerate(files_to_convert, 1):
|
|
667
|
-
output_file = get_output_path(input_file, input_base, output_dir, args.flat)
|
|
668
|
-
file_size = input_file.stat().st_size
|
|
669
|
-
progress = f"[{idx}/{total_files}]"
|
|
670
|
-
|
|
671
|
-
# Check if output exists and skip if not overwriting
|
|
672
|
-
if output_file.exists() and not args.overwrite:
|
|
673
|
-
if not args.quiet:
|
|
674
|
-
print(f"{progress} Skipped (exists): {input_file.name}")
|
|
675
|
-
skipped_count += 1
|
|
676
|
-
continue
|
|
677
|
-
|
|
678
|
-
# Show spinner while processing
|
|
679
|
-
if not args.quiet:
|
|
680
|
-
spinner.start(
|
|
681
|
-
f"{progress} Processing: {input_file.name} ({format_size(file_size)})"
|
|
682
|
-
)
|
|
683
|
-
|
|
684
|
-
success, result, elapsed = run_container(
|
|
685
|
-
runtime, image, input_file, output_file, args.mask
|
|
686
|
-
)
|
|
624
|
+
total_elapsed = 0.0
|
|
687
625
|
|
|
626
|
+
try:
|
|
688
627
|
if not args.quiet:
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
628
|
+
print(f"Starting docling-serve container...")
|
|
629
|
+
print()
|
|
630
|
+
|
|
631
|
+
with DoclingContainer(runtime, image, args.port) as container:
|
|
632
|
+
# Convert files
|
|
633
|
+
conversion_start = time.time()
|
|
634
|
+
spinner = Spinner()
|
|
635
|
+
|
|
636
|
+
for idx, input_file in enumerate(files_to_convert, 1):
|
|
637
|
+
output_file = get_output_path(
|
|
638
|
+
input_file, input_base, output_dir, args.flat
|
|
639
|
+
)
|
|
640
|
+
file_size = input_file.stat().st_size
|
|
641
|
+
progress = f"[{idx}/{total_files}]"
|
|
642
|
+
|
|
643
|
+
# Check if output exists and skip if not overwriting
|
|
644
|
+
if output_file.exists() and not args.overwrite:
|
|
645
|
+
if not args.quiet:
|
|
646
|
+
print(f"{progress} Skipped (exists): {input_file.name}")
|
|
647
|
+
skipped_count += 1
|
|
648
|
+
continue
|
|
649
|
+
|
|
650
|
+
# Ensure output directory exists
|
|
651
|
+
output_file.parent.mkdir(parents=True, exist_ok=True)
|
|
652
|
+
|
|
653
|
+
# Show spinner while processing
|
|
654
|
+
if not args.quiet:
|
|
655
|
+
spinner.start(
|
|
656
|
+
f"{progress} Processing: {input_file.name} ({format_size(file_size)})"
|
|
657
|
+
)
|
|
658
|
+
|
|
659
|
+
start_time = time.time()
|
|
660
|
+
try:
|
|
661
|
+
# Convert via HTTP API
|
|
662
|
+
result = convert_file(
|
|
663
|
+
container.base_url, input_file, to_format="md"
|
|
664
|
+
)
|
|
665
|
+
elapsed = time.time() - start_time
|
|
666
|
+
|
|
667
|
+
if not args.quiet:
|
|
668
|
+
spinner.stop()
|
|
669
|
+
|
|
670
|
+
if result.success:
|
|
671
|
+
# Write result to output file
|
|
672
|
+
output_file.write_text(result.content)
|
|
673
|
+
success_count += 1
|
|
674
|
+
if not args.quiet:
|
|
675
|
+
print(
|
|
676
|
+
f"{progress} {input_file.name} ✓ ({format_duration(elapsed)})"
|
|
677
|
+
)
|
|
678
|
+
else:
|
|
679
|
+
failed_count += 1
|
|
680
|
+
error_msg = result.error or "Unknown error"
|
|
681
|
+
if not args.quiet:
|
|
682
|
+
print(
|
|
683
|
+
f"{progress} {input_file.name} ✗ ({format_duration(elapsed)})"
|
|
684
|
+
)
|
|
685
|
+
print(f" Error: {error_msg}", file=sys.stderr)
|
|
686
|
+
except Exception as e:
|
|
687
|
+
elapsed = time.time() - start_time
|
|
688
|
+
failed_count += 1
|
|
689
|
+
if not args.quiet:
|
|
690
|
+
spinner.stop()
|
|
691
|
+
print(
|
|
692
|
+
f"{progress} {input_file.name} ✗ ({format_duration(elapsed)})"
|
|
693
|
+
)
|
|
694
|
+
print(f" Error: {str(e)}", file=sys.stderr)
|
|
695
|
+
|
|
696
|
+
total_elapsed = time.time() - conversion_start
|
|
697
|
+
|
|
698
|
+
# Print summary
|
|
699
|
+
if not args.quiet:
|
|
700
|
+
print()
|
|
701
|
+
print("=" * 50)
|
|
702
|
+
print("Conversion Summary:")
|
|
703
|
+
print(f" Total files: {total_files}")
|
|
704
|
+
print(f" Successful: {success_count}")
|
|
705
|
+
print(f" Skipped: {skipped_count}")
|
|
706
|
+
print(f" Failed: {failed_count}")
|
|
707
|
+
print(f" Total time: {format_duration(total_elapsed)}")
|
|
708
|
+
print("=" * 50)
|
|
709
|
+
|
|
710
|
+
except KeyboardInterrupt:
|
|
711
|
+
if not args.quiet:
|
|
712
|
+
print("\n\nInterrupted by user. Container stopped.")
|
|
713
|
+
if success_count > 0 or skipped_count > 0 or failed_count > 0:
|
|
714
|
+
print(
|
|
715
|
+
f"Partial progress: {success_count} successful, {failed_count} failed, {skipped_count} skipped"
|
|
716
|
+
)
|
|
717
|
+
return 130
|
|
714
718
|
|
|
715
719
|
# Return appropriate exit code
|
|
716
720
|
if failed_count > 0:
|
mdify/container.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""Container lifecycle management for docling-serve."""
|
|
2
|
+
|
|
3
|
+
import subprocess
|
|
4
|
+
import time
|
|
5
|
+
import uuid
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from mdify.docling_client import check_health
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DoclingContainer:
    """Manage the lifecycle of a docling-serve container.

    The container is started detached with ``--rm`` and a random
    ``mdify-serve-<8 hex chars>`` name, and is stopped again by name. The class
    is a context manager, so startup and cleanup happen automatically:

        with DoclingContainer("docker", "ghcr.io/docling-project/docling-serve-cpu:main") as container:
            # Container is running and healthy
            response = requests.post(f"{container.base_url}/v1/convert/file", ...)
        # Container has been stopped (and removed, because of --rm)
    """

    def __init__(self, runtime: str, image: str, port: int = 5001):
        """Initialize the container manager without starting anything.

        Args:
            runtime: Container runtime executable ("docker" or "podman").
            image: Container image to run.
            port: Host port to publish the service on (default: 5001).
        """
        self.runtime = runtime
        self.image = image
        self.port = port
        self.container_name = f"mdify-serve-{uuid.uuid4().hex[:8]}"
        # Set by start() to the id printed by ``run -d``; None means "no
        # container launched by this object is currently running".
        self.container_id: Optional[str] = None

    @property
    def base_url(self) -> str:
        """Return the base URL for API requests against the published port."""
        return f"http://localhost:{self.port}"

    def start(self, timeout: int = 120) -> None:
        """Start the container and block until its health check passes.

        Args:
            timeout: Maximum seconds to wait for the service to become
                healthy (default: 120).

        Raises:
            subprocess.CalledProcessError: If the runtime fails to start the
                container. ``stderr`` is prefixed with
                "Failed to start container: ".
            TimeoutError: If the health check does not pass within ``timeout``.
                The container is stopped before the error propagates.
        """
        cmd = [
            self.runtime,
            "run",
            "-d",  # Detached mode
            "--rm",  # Auto-remove on stop
            "--name",
            self.container_name,
            "-p",
            f"{self.port}:5001",
            self.image,
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as e:
            error_msg = (
                (e.stderr or "").strip() or (e.stdout or "").strip() or "Unknown error"
            )
            # Same exception type as before, with a readable stderr; chain the
            # original so the underlying failure stays visible in tracebacks.
            raise subprocess.CalledProcessError(
                e.returncode,
                e.cmd,
                output=e.stdout,
                stderr=f"Failed to start container: {error_msg}",
            ) from e
        self.container_id = result.stdout.strip()

        try:
            self._wait_for_health(timeout)
        except BaseException:
            # __exit__ is never invoked when __enter__ raises, so a failed or
            # interrupted (Ctrl+C) startup would otherwise leave the detached
            # container running.
            self.stop()
            raise

    def stop(self) -> None:
        """Stop the container. Safe to call multiple times.

        Does nothing unless start() launched a container that has not been
        stopped yet, so the runtime is never invoked needlessly.
        """
        if self.container_id is None:
            return
        # Clear first so that repeated calls are no-ops.
        self.container_id = None
        subprocess.run(
            [self.runtime, "stop", self.container_name],
            capture_output=True,
            check=False,  # Best effort: the container may already be gone.
        )

    def is_ready(self) -> bool:
        """Check whether the service answers its health endpoint.

        Returns:
            True if the health check succeeds, False otherwise (including when
            the check itself raises).
        """
        try:
            return check_health(self.base_url)
        except Exception:
            # A failing probe simply means "not ready".
            return False

    def _wait_for_health(self, timeout: int) -> None:
        """Poll the health endpoint until it succeeds or ``timeout`` elapses.

        Args:
            timeout: Maximum seconds to wait.

        Raises:
            TimeoutError: If the health check does not pass within ``timeout``.
        """
        poll_interval = 2.0
        # Monotonic clock: unaffected by system clock adjustments.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            if self.is_ready():
                return
            remaining = deadline - time.monotonic()
            if remaining > 0:
                # Never sleep past the deadline.
                time.sleep(min(poll_interval, remaining))

        raise TimeoutError(f"Container failed to become healthy within {timeout}s")

    def __enter__(self):
        """Start the container and return this manager."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop the container; exceptions from the ``with`` body propagate."""
        self.stop()
        return False
|
mdify/docling_client.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
"""HTTP client for docling-serve REST API."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class ConvertResult:
    """Outcome of one document conversion request."""

    # Converted document text.
    content: str
    # Identifier of the output format that ``content`` is in (e.g. "md").
    format: str
    # True when the conversion produced a result, False when it failed.
    success: bool
    # Description of the failure; None unless one is supplied.
    error: Optional[str] = None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class StatusResult:
    """Snapshot of an asynchronous conversion task's state."""

    # State string, e.g. "pending", "completed", "failed".
    status: str
    # Identifier of the task this status belongs to.
    task_id: str
    # Error description; None unless one is supplied.
    error: Optional[str] = None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class DoclingClientError(Exception):
    """Root of the exception hierarchy raised by the docling client."""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class DoclingHTTPError(DoclingClientError):
    """Error carrying an HTTP status code from a docling-serve API call."""

    def __init__(self, status_code: int, message: str):
        """Build the error with the text ``HTTP <status_code>: <message>``.

        Args:
            status_code: HTTP status code associated with the failure; kept on
                the instance as ``status_code``.
            message: Human-readable description of the failure.
        """
        description = f"HTTP {status_code}: {message}"
        super().__init__(description)
        self.status_code = status_code
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def check_health(base_url: str, timeout: float = 5.0) -> bool:
    """Check if docling-serve is healthy.

    Args:
        base_url: Base URL of docling-serve (e.g., "http://localhost:5001").
        timeout: Seconds to wait for the health endpoint before treating the
            service as unhealthy (default: 5.0). Without a bound, a connection
            that is accepted but never answered would block the caller
            indefinitely.

    Returns:
        True if ``GET <base_url>/health`` answers with status 200; False for
        any other status or any ``requests`` error, including a timeout.
    """
    try:
        response = requests.get(f"{base_url}/health", timeout=timeout)
    except requests.RequestException:
        return False
    return response.status_code == 200
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _extract_content(entry: dict, to_format: str) -> str:
    """Return the converted text from one response entry.

    Prefers the nested ``document.<to_format>_content`` string when present
    (NOTE(review): this is the shape the docling-serve documentation describes
    for conversion responses - confirm against the deployed server version)
    and otherwise falls back to the flat ``content`` key.
    """
    document = entry.get("document")
    if isinstance(document, dict):
        nested = document.get(f"{to_format}_content")
        if isinstance(nested, str):
            return nested
    return entry.get("content", "")


def convert_file(
    base_url: str,
    file_path: Path,
    to_format: str = "md",
    do_ocr: bool = True,
    timeout: Optional[float] = None,
) -> ConvertResult:
    """Convert a file synchronously.

    Args:
        base_url: Base URL of docling-serve
        file_path: Path to file to convert
        to_format: Output format (default: "md")
        do_ocr: Whether to perform OCR (default: True)
        timeout: Optional request timeout in seconds. The default ``None``
            waits indefinitely, since conversions can be slow.

    Returns:
        ConvertResult with conversion output. Network-level failures
        (``requests.RequestException``) are not raised; they are returned as a
        result with ``success=False`` and ``error`` set.

    Raises:
        DoclingHTTPError: If the server answers with a non-200 status or with
            JSON that is neither a non-empty list nor a dict.
    """
    import mimetypes  # Function-scope import keeps this block self-contained.

    # Label the upload with its real type instead of always claiming PDF.
    content_type = (
        mimetypes.guess_type(file_path.name)[0] or "application/octet-stream"
    )

    try:
        with open(file_path, "rb") as f:
            response = requests.post(
                f"{base_url}/v1/convert/file",
                files={"files": (file_path.name, f, content_type)},
                data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
                timeout=timeout,
            )

        if response.status_code != 200:
            raise DoclingHTTPError(
                response.status_code, response.text or "Conversion failed"
            )

        result_data = response.json()

        # Accept either a list of results (first entry wins) or a single dict.
        if isinstance(result_data, list) and len(result_data) > 0:
            entry = result_data[0]
        elif isinstance(result_data, dict):
            entry = result_data
        else:
            raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")

        return ConvertResult(
            content=_extract_content(entry, to_format),
            format=to_format,
            success=True,
        )

    except requests.RequestException as e:
        return ConvertResult(content="", format=to_format, success=False, error=str(e))
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def convert_file_async(
    base_url: str,
    file_path: Path,
    to_format: str = "md",
    do_ocr: bool = True,
    timeout: Optional[float] = None,
) -> str:
    """Start async file conversion.

    Args:
        base_url: Base URL of docling-serve
        file_path: Path to file to convert
        to_format: Output format (default: "md")
        do_ocr: Whether to perform OCR (default: True)
        timeout: Optional request timeout in seconds. The default ``None``
            waits indefinitely.

    Returns:
        Task ID for polling

    Raises:
        DoclingHTTPError: If the server answers with a non-200 status, if the
            response carries no ``task_id``, or (reported with status 500 and
            chained to the original exception) if the request itself fails.
    """
    import mimetypes  # Function-scope import keeps this block self-contained.

    # Label the upload with its real type instead of always claiming PDF.
    content_type = (
        mimetypes.guess_type(file_path.name)[0] or "application/octet-stream"
    )

    try:
        with open(file_path, "rb") as f:
            response = requests.post(
                f"{base_url}/v1/convert/file/async",
                files={"files": (file_path.name, f, content_type)},
                data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
                timeout=timeout,
            )

        if response.status_code != 200:
            raise DoclingHTTPError(
                response.status_code, response.text or "Async conversion failed"
            )

        result_data = response.json()
        # Guard against non-object JSON, which has no .get().
        task_id = result_data.get("task_id") if isinstance(result_data, dict) else None

        if not task_id:
            raise DoclingHTTPError(200, f"No task_id in response: {result_data}")

        return task_id

    except requests.RequestException as e:
        raise DoclingHTTPError(500, str(e)) from e
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def poll_status(
    base_url: str, task_id: str, timeout: Optional[float] = None
) -> StatusResult:
    """Poll status of async conversion task.

    Args:
        base_url: Base URL of docling-serve
        task_id: Task ID from convert_file_async
        timeout: Optional request timeout in seconds. The default ``None``
            waits indefinitely.

    Returns:
        StatusResult with current status ("unknown" when the response carries
        no status field)

    Raises:
        DoclingHTTPError: If the server answers with a non-200 status or with
            JSON that is not an object, or (reported with status 500 and
            chained to the original exception) if the request itself fails.
    """
    try:
        response = requests.get(
            f"{base_url}/v1/status/poll/{task_id}", timeout=timeout
        )

        if response.status_code != 200:
            raise DoclingHTTPError(
                response.status_code, response.text or "Status poll failed"
            )

        result_data = response.json()
        if not isinstance(result_data, dict):
            raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")

        status = result_data.get("status")
        if status is None:
            # NOTE(review): the docling-serve documentation names this field
            # "task_status" - confirm against the deployed server version.
            status = result_data.get("task_status", "unknown")

        return StatusResult(
            status=status,
            task_id=task_id,
            error=result_data.get("error"),
        )

    except requests.RequestException as e:
        raise DoclingHTTPError(500, str(e)) from e
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def get_result(
    base_url: str, task_id: str, timeout: Optional[float] = None
) -> ConvertResult:
    """Get result of completed async conversion.

    Args:
        base_url: Base URL of docling-serve
        task_id: Task ID from convert_file_async
        timeout: Optional request timeout in seconds. The default ``None``
            waits indefinitely.

    Returns:
        ConvertResult with conversion output. Network-level failures
        (``requests.RequestException``) are not raised; they are returned as a
        result with ``success=False`` and ``error`` set.

    Raises:
        DoclingHTTPError: If the server answers with a non-200 status or with
            JSON that is neither a non-empty list nor a dict.
    """
    try:
        response = requests.get(f"{base_url}/v1/result/{task_id}", timeout=timeout)

        if response.status_code != 200:
            raise DoclingHTTPError(
                response.status_code, response.text or "Result retrieval failed"
            )

        result_data = response.json()

        # Accept either a list of results (first entry wins) or a single dict.
        if isinstance(result_data, list) and len(result_data) > 0:
            entry = result_data[0]
        elif isinstance(result_data, dict):
            entry = result_data
        else:
            raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")

        output_format = entry.get("format", "md")
        content = entry.get("content", "")
        # NOTE(review): the docling-serve documentation describes the text as
        # nested under document.<format>_content - confirm against the
        # deployed server version. Prefer it when present; otherwise keep the
        # flat "content" key.
        document = entry.get("document")
        if isinstance(document, dict):
            nested = document.get(f"{output_format}_content")
            if isinstance(nested, str):
                content = nested

        return ConvertResult(content=content, format=output_format, success=True)

    except requests.RequestException as e:
        return ConvertResult(content="", format="md", success=False, error=str(e))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mdify-cli
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 2.1.0
|
|
4
4
|
Summary: Convert PDFs and document images into structured Markdown for LLM workflows
|
|
5
5
|
Author: tiroq
|
|
6
6
|
License-Expression: MIT
|
|
@@ -24,6 +24,7 @@ Classifier: Topic :: Utilities
|
|
|
24
24
|
Requires-Python: >=3.8
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
26
|
License-File: LICENSE
|
|
27
|
+
Requires-Dist: requests
|
|
27
28
|
Provides-Extra: dev
|
|
28
29
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
29
30
|
Dynamic: license-file
|
|
@@ -100,15 +101,32 @@ Recursively convert files:
|
|
|
100
101
|
mdify /path/to/documents -r -g "*.pdf"
|
|
101
102
|
```
|
|
102
103
|
|
|
103
|
-
###
|
|
104
|
+
### GPU Acceleration
|
|
104
105
|
|
|
105
|
-
|
|
106
|
+
For faster processing with NVIDIA GPU:
|
|
106
107
|
```bash
|
|
107
|
-
mdify
|
|
108
|
-
mdify document.pdf --mask
|
|
108
|
+
mdify --gpu documents/*.pdf
|
|
109
109
|
```
|
|
110
110
|
|
|
111
|
-
|
|
111
|
+
Requires NVIDIA GPU with CUDA support and nvidia-container-toolkit.
|
|
112
|
+
|
|
113
|
+
### ⚠️ PII Masking (Deprecated)
|
|
114
|
+
|
|
115
|
+
The `--mask` flag is deprecated and will be ignored in this version. PII masking functionality was available in older versions using a custom runtime but is not supported with the current docling-serve backend.
|
|
116
|
+
|
|
117
|
+
If PII masking is critical for your use case, please use mdify v1.5.x or earlier versions.
|
|
118
|
+
|
|
119
|
+
## Performance
|
|
120
|
+
|
|
121
|
+
mdify now uses docling-serve for significantly faster batch processing:
|
|
122
|
+
|
|
123
|
+
- **Single model load**: Models are loaded once per session, not per file
|
|
124
|
+
- **~10-20x speedup** for multiple file conversions compared to previous versions
|
|
125
|
+
- **GPU acceleration**: Use `--gpu` for additional 2-6x speedup (requires NVIDIA GPU)
|
|
126
|
+
|
|
127
|
+
### First Run Behavior
|
|
128
|
+
|
|
129
|
+
The first conversion takes longer (~30-60s) as the container loads ML models into memory. Subsequent files in the same batch process quickly, typically in 1-3 seconds per file.
|
|
112
130
|
|
|
113
131
|
## Options
|
|
114
132
|
|
|
@@ -121,9 +139,11 @@ This uses Docling's content-aware masking to obscure sensitive information in em
|
|
|
121
139
|
| `--flat` | Disable directory structure preservation |
|
|
122
140
|
| `--overwrite` | Overwrite existing output files |
|
|
123
141
|
| `-q, --quiet` | Suppress progress messages |
|
|
124
|
-
| `-m, --mask` |
|
|
142
|
+
| `-m, --mask` | ⚠️ **Deprecated**: PII masking not supported in current version |
|
|
143
|
+
| `--gpu` | Use GPU-accelerated container (requires NVIDIA GPU and nvidia-container-toolkit) |
|
|
144
|
+
| `--port PORT` | Container port (default: 5001) |
|
|
125
145
|
| `--runtime RUNTIME` | Container runtime: docker or podman (auto-detected) |
|
|
126
|
-
| `--image IMAGE` | Custom container image (default: ghcr.io/
|
|
146
|
+
| `--image IMAGE` | Custom container image (default: ghcr.io/docling-project/docling-serve-cpu:main) |
|
|
127
147
|
| `--pull POLICY` | Image pull policy: always, missing, never (default: missing) |
|
|
128
148
|
| `--check-update` | Check for available updates and exit |
|
|
129
149
|
| `--version` | Show version and exit |
|
|
@@ -177,19 +197,22 @@ The CLI:
|
|
|
177
197
|
- Pulls the runtime container on first use
|
|
178
198
|
- Mounts files and runs conversions in the container
|
|
179
199
|
|
|
180
|
-
## Container
|
|
200
|
+
## Container Images
|
|
201
|
+
|
|
202
|
+
mdify uses official docling-serve containers:
|
|
181
203
|
|
|
182
|
-
|
|
204
|
+
**CPU Version** (default):
|
|
183
205
|
```
|
|
184
|
-
ghcr.io/
|
|
206
|
+
ghcr.io/docling-project/docling-serve-cpu:main
|
|
185
207
|
```
|
|
186
208
|
|
|
187
|
-
|
|
188
|
-
```
|
|
189
|
-
|
|
190
|
-
docker build -t mdify-runtime .
|
|
209
|
+
**GPU Version** (use with `--gpu` flag):
|
|
210
|
+
```
|
|
211
|
+
ghcr.io/docling-project/docling-serve-cu126:main
|
|
191
212
|
```
|
|
192
213
|
|
|
214
|
+
These are official images from the [docling-serve project](https://github.com/DS4SD/docling-serve).
|
|
215
|
+
|
|
193
216
|
## Updates
|
|
194
217
|
|
|
195
218
|
mdify checks for updates daily. When a new version is available:
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
assets/mdify.png,sha256=qUj7WXWqNwpI2KNXOW79XJwqFqa-UI0JEkmt1mmy4Rg,1820418
|
|
2
|
+
mdify/__init__.py,sha256=OxKblQk_woS_-5Z6N5BMndbfQBdvopK1KrnSMAt-4A0,90
|
|
3
|
+
mdify/__main__.py,sha256=bhpJ00co6MfaVOdH4XLoW04NtLYDa_oJK7ODzfLrn9M,143
|
|
4
|
+
mdify/cli.py,sha256=0lRT1XCV9YMMJQ68kH93mrGfFtAC6GgXS-OGEn9mnGw,21742
|
|
5
|
+
mdify/container.py,sha256=AVIhiq_wO5id5hQ_s83lUPkAPCsAoTs25azRT6JmKII,3962
|
|
6
|
+
mdify/docling_client.py,sha256=_9qjL5yOOeJahOg6an2P6Iii1xkeR6wmNJZG4Q6NRkk,6553
|
|
7
|
+
mdify_cli-2.1.0.dist-info/licenses/LICENSE,sha256=NWM66Uv-XuSMKaU-gaPmvfyk4WgE6zcIPr78wyg6GAo,1065
|
|
8
|
+
mdify_cli-2.1.0.dist-info/METADATA,sha256=IaVJcjFLlyCKUIhWBntq47owI7Izrbj6t7Ce0EBrUwg,7923
|
|
9
|
+
mdify_cli-2.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
10
|
+
mdify_cli-2.1.0.dist-info/entry_points.txt,sha256=0Xki8f5lADQUtwdt6Eq_FEaieI6Byhk8UE7BuDhChMg,41
|
|
11
|
+
mdify_cli-2.1.0.dist-info/top_level.txt,sha256=qltzf7h8owHq7dxCdfCkSHY8gT21hn1_E8P-VWS_OKM,6
|
|
12
|
+
mdify_cli-2.1.0.dist-info/RECORD,,
|
mdify_cli-1.6.0.dist-info/RECORD
DELETED
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
assets/mdify.png,sha256=qUj7WXWqNwpI2KNXOW79XJwqFqa-UI0JEkmt1mmy4Rg,1820418
|
|
2
|
-
mdify/__init__.py,sha256=tvxIF7MWdoaHBgdk4tT81csn-ZhTTOlfooBYqM4YsMg,90
|
|
3
|
-
mdify/__main__.py,sha256=bhpJ00co6MfaVOdH4XLoW04NtLYDa_oJK7ODzfLrn9M,143
|
|
4
|
-
mdify/cli.py,sha256=sDwkOf4H33l7WmfAR3tw2MjO-7kuIOHcrQXTZto6bF0,20460
|
|
5
|
-
mdify_cli-1.6.0.dist-info/licenses/LICENSE,sha256=NWM66Uv-XuSMKaU-gaPmvfyk4WgE6zcIPr78wyg6GAo,1065
|
|
6
|
-
mdify_cli-1.6.0.dist-info/METADATA,sha256=W-tubNyeCkt6_GAYmS59JHwka8FxQ4D5ZxbHDKFhaLQ,6721
|
|
7
|
-
mdify_cli-1.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
8
|
-
mdify_cli-1.6.0.dist-info/entry_points.txt,sha256=0Xki8f5lADQUtwdt6Eq_FEaieI6Byhk8UE7BuDhChMg,41
|
|
9
|
-
mdify_cli-1.6.0.dist-info/top_level.txt,sha256=qltzf7h8owHq7dxCdfCkSHY8gT21hn1_E8P-VWS_OKM,6
|
|
10
|
-
mdify_cli-1.6.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|