mdify-cli 1.3.1__py3-none-any.whl → 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdify/container.py ADDED
@@ -0,0 +1,132 @@
1
+ """Container lifecycle management for docling-serve."""
2
+
3
+ import subprocess
4
+ import time
5
+ import uuid
6
+ from typing import Optional
7
+
8
+ from mdify.docling_client import check_health
9
+
10
+
11
class DoclingContainer:
    """Manage the lifecycle of a docling-serve container.

    The container is launched detached with ``--rm`` so the runtime removes
    it as soon as it is stopped. The class is a context manager: entering
    starts the container and blocks until its health endpoint answers,
    exiting stops it.

    Usage:
        with DoclingContainer("docker", "ghcr.io/docling-project/docling-serve-cpu:main") as container:
            # Container is running and healthy
            response = requests.post(f"{container.base_url}/v1/convert/file", ...)
        # Container automatically stopped and removed
    """

    def __init__(self, runtime: str, image: str, port: int = 5001, timeout: int = 1200):
        """Initialize container manager.

        Args:
            runtime: Container runtime executable ("docker" or "podman").
            image: Container image to run.
            port: Host port published to the container's port 5001
                (default: 5001).
            timeout: Conversion timeout in seconds, passed to the container
                as ``DOCLING_SERVE_MAX_SYNC_WAIT`` (default: 1200).
        """
        self.runtime = runtime
        self.image = image
        self.port = port
        self.timeout = timeout
        # Random suffix so several instances can coexist on one host.
        self.container_name = f"mdify-serve-{uuid.uuid4().hex[:8]}"
        # Set by start() from the runtime's stdout; None while not running.
        self.container_id: Optional[str] = None

    @property
    def base_url(self) -> str:
        """Return base URL for API requests."""
        return f"http://localhost:{self.port}"

    def start(self, timeout: int = 120) -> None:
        """Start the container and wait for its health check to pass.

        If the health check does not pass (or the wait is interrupted), the
        container is stopped before the exception propagates, so a failed
        start never leaves a container running.

        Args:
            timeout: Maximum seconds to wait for health (default: 120).

        Raises:
            subprocess.CalledProcessError: If the container fails to start.
            TimeoutError: If the health check doesn't pass within timeout.
        """
        cmd = [
            self.runtime,
            "run",
            "-d",  # Detached mode
            "--rm",  # Auto-remove on stop
            "--name",
            self.container_name,
            "-p",
            f"{self.port}:5001",
            "-e",
            f"DOCLING_SERVE_MAX_SYNC_WAIT={self.timeout}",
            self.image,
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as e:
            # Either stream may be None/empty; fall back progressively.
            error_msg = (
                (e.stderr or "").strip() or (e.stdout or "").strip() or "Unknown error"
            )
            raise subprocess.CalledProcessError(
                e.returncode,
                e.cmd,
                output=e.stdout,
                stderr=f"Failed to start container: {error_msg}",
            ) from e

        self.container_id = result.stdout.strip()

        try:
            self._wait_for_health(timeout)
        except BaseException:
            # When start() is called from __enter__, __exit__ will NOT run
            # if we raise here, so clean up ourselves. BaseException also
            # covers Ctrl-C during the (potentially long) startup wait.
            self.stop()
            raise

    def stop(self) -> None:
        """Stop and remove container. Safe to call multiple times."""
        if self.container_name:
            # check=False: stopping an already-gone container is not an error.
            subprocess.run(
                [self.runtime, "stop", self.container_name],
                capture_output=True,
                check=False,
            )
        self.container_id = None

    def is_ready(self) -> bool:
        """Check if container is healthy.

        Returns:
            True if the health check succeeds, False if it fails or raises.
        """
        try:
            return check_health(self.base_url)
        except Exception:
            # Best-effort probe: any failure simply means "not ready".
            return False

    def _wait_for_health(self, timeout: int) -> None:
        """Poll the health endpoint until it reports ready.

        Args:
            timeout: Maximum seconds to wait.

        Raises:
            TimeoutError: If health check doesn't pass within timeout.
        """
        # Monotonic clock: immune to wall-clock adjustments during the wait.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            if self.is_ready():
                return
            remaining = deadline - time.monotonic()
            if remaining > 0:
                # Poll every 2 seconds, but never sleep past the deadline.
                time.sleep(min(2, remaining))

        raise TimeoutError(f"Container failed to become healthy within {timeout}s")

    def __enter__(self):
        """Context manager entry: start the container and return self."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit - ensures cleanup; never suppresses exceptions."""
        self.stop()
        return False
@@ -0,0 +1,224 @@
1
+ """HTTP client for docling-serve REST API."""
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ import requests
8
+
9
+
10
@dataclass
class ConvertResult:
    """Outcome of a single document conversion."""

    # Converted document text (empty when the conversion did not succeed).
    content: str
    # Output format identifier, e.g. "md".
    format: str
    # True when the conversion produced a result.
    success: bool
    # Human-readable failure description, if any.
    error: Optional[str] = None
18
+
19
+
20
@dataclass
class StatusResult:
    """Snapshot of an asynchronous conversion task's state."""

    # Task state string, e.g. "pending", "completed", "failed".
    status: str
    # Identifier of the task this status belongs to.
    task_id: str
    # Failure description reported for the task, if any.
    error: Optional[str] = None
27
+
28
+
29
class DoclingClientError(Exception):
    """Root of the docling client exception hierarchy."""
33
+
34
+
35
class DoclingHTTPError(DoclingClientError):
    """HTTP error from docling-serve API.

    Attributes:
        status_code: HTTP status code associated with the failure.
        message: The raw error message, without the "HTTP <code>: " prefix
            that ``str(exc)`` carries.
    """

    def __init__(self, status_code: int, message: str):
        self.status_code = status_code
        # Keep the unformatted message so callers need not parse str(exc).
        self.message = message
        super().__init__(f"HTTP {status_code}: {message}")
41
+
42
+
43
def check_health(base_url: str, timeout: float = 5.0) -> bool:
    """Check if docling-serve is healthy.

    Args:
        base_url: Base URL of docling-serve (e.g., "http://localhost:5001")
        timeout: Seconds to wait for the server to answer (default: 5.0).
            Without a bound, a single stalled connection would block the
            caller indefinitely and defeat any polling deadline.

    Returns:
        True if ``GET {base_url}/health`` answers with status 200, False on
        any other status or on a request error (including a timeout).
    """
    try:
        response = requests.get(f"{base_url}/health", timeout=timeout)
    except requests.RequestException:
        return False
    return response.status_code == 200
57
+
58
+
59
def convert_file(
    base_url: str,
    file_path: Path,
    to_format: str = "md",
    do_ocr: bool = True,
    timeout: Optional[float] = None,
) -> ConvertResult:
    """Convert a file synchronously.

    Args:
        base_url: Base URL of docling-serve
        file_path: Path to file to convert
        to_format: Output format (default: "md")
        do_ocr: Whether to perform OCR (default: True)
        timeout: Seconds to wait for the server, passed to ``requests``
            (default: None, i.e. wait indefinitely, as conversions can be
            slow).

    Returns:
        ConvertResult with conversion output. A transport-level failure or
        an undecodable response body is reported as a result with
        ``success=False`` and ``error`` set rather than raised.

    Raises:
        DoclingHTTPError: If the server answers with a non-200 status or
            with a JSON body of an unexpected shape.
        OSError: If ``file_path`` cannot be opened.
    """
    import mimetypes  # local import: only needed for the upload's content type

    file_path = Path(file_path)
    # Label the upload with its real type instead of always claiming PDF.
    content_type = mimetypes.guess_type(file_path.name)[0] or "application/octet-stream"

    try:
        with open(file_path, "rb") as f:
            response = requests.post(
                f"{base_url}/v1/convert/file",
                files={"files": (file_path.name, f, content_type)},
                data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
                timeout=timeout,
            )
    except requests.RequestException as e:
        return ConvertResult(content="", format=to_format, success=False, error=str(e))

    if response.status_code != 200:
        raise DoclingHTTPError(
            response.status_code, response.text or "Conversion failed"
        )

    try:
        result_data = response.json()
    except ValueError as e:
        # Covers json/requests JSONDecodeError on every requests version.
        return ConvertResult(content="", format=to_format, success=False, error=str(e))

    # The response is accepted either as a non-empty list (first entry is
    # used) or as a single object.
    if isinstance(result_data, list) and result_data:
        payload = result_data[0]
    else:
        payload = result_data

    if not isinstance(payload, dict):
        raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")

    # NOTE(review): the "content" key is taken on trust here; confirm it
    # against the docling-serve response schema for the deployed version.
    return ConvertResult(
        content=payload.get("content", ""), format=to_format, success=True
    )
106
+
107
+
108
def convert_file_async(
    base_url: str,
    file_path: Path,
    to_format: str = "md",
    do_ocr: bool = True,
    timeout: Optional[float] = None,
) -> str:
    """Start async file conversion.

    Args:
        base_url: Base URL of docling-serve
        file_path: Path to file to convert
        to_format: Output format (default: "md")
        do_ocr: Whether to perform OCR (default: True)
        timeout: Seconds to wait for the server, passed to ``requests``
            (default: None, i.e. wait indefinitely).

    Returns:
        Task ID for polling

    Raises:
        DoclingHTTPError: If the request fails at transport level (reported
            with status 500), the server answers with a non-200 status, or
            the response carries no ``task_id``.
        OSError: If ``file_path`` cannot be opened.
    """
    import mimetypes  # local import: only needed for the upload's content type

    file_path = Path(file_path)
    # Label the upload with its real type instead of always claiming PDF.
    content_type = mimetypes.guess_type(file_path.name)[0] or "application/octet-stream"

    try:
        with open(file_path, "rb") as f:
            response = requests.post(
                f"{base_url}/v1/convert/file/async",
                files={"files": (file_path.name, f, content_type)},
                data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
                timeout=timeout,
            )
    except requests.RequestException as e:
        raise DoclingHTTPError(500, str(e)) from e

    if response.status_code != 200:
        raise DoclingHTTPError(
            response.status_code, response.text or "Async conversion failed"
        )

    try:
        result_data = response.json()
    except ValueError as e:
        # Covers json/requests JSONDecodeError on every requests version.
        raise DoclingHTTPError(500, str(e)) from e

    # Guard against a non-object body so the failure is a DoclingHTTPError
    # rather than an AttributeError from ``.get``.
    task_id = result_data.get("task_id") if isinstance(result_data, dict) else None

    if not task_id:
        raise DoclingHTTPError(200, f"No task_id in response: {result_data}")

    return task_id
148
+
149
+
150
def poll_status(base_url: str, task_id: str, timeout: float = 30.0) -> StatusResult:
    """Poll status of async conversion task.

    Args:
        base_url: Base URL of docling-serve
        task_id: Task ID from convert_file_async
        timeout: Seconds to wait for the server to answer (default: 30.0),
            so a stalled connection cannot hang a polling loop forever.

    Returns:
        StatusResult with current status; ``status`` is "unknown" when the
        response does not include one.

    Raises:
        DoclingHTTPError: If the request fails at transport level (reported
            with status 500), the server answers with a non-200 status, or
            the response body is not a JSON object.
    """
    try:
        response = requests.get(
            f"{base_url}/v1/status/poll/{task_id}", timeout=timeout
        )
    except requests.RequestException as e:
        raise DoclingHTTPError(500, str(e)) from e

    if response.status_code != 200:
        raise DoclingHTTPError(
            response.status_code, response.text or "Status poll failed"
        )

    try:
        result_data = response.json()
    except ValueError as e:
        # Covers json/requests JSONDecodeError on every requests version.
        raise DoclingHTTPError(500, str(e)) from e

    if not isinstance(result_data, dict):
        raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")

    return StatusResult(
        status=result_data.get("status", "unknown"),
        task_id=task_id,
        error=result_data.get("error"),
    )
181
+
182
+
183
def get_result(base_url: str, task_id: str, timeout: float = 30.0) -> ConvertResult:
    """Get result of completed async conversion.

    Args:
        base_url: Base URL of docling-serve
        task_id: Task ID from convert_file_async
        timeout: Seconds to wait for the server to answer (default: 30.0).

    Returns:
        ConvertResult with conversion output. A transport-level failure or
        an undecodable response body is reported as a result with
        ``success=False`` and ``error`` set rather than raised.

    Raises:
        DoclingHTTPError: If the server answers with a non-200 status (for
            example because the task is not completed) or with a JSON body
            of an unexpected shape.
    """
    try:
        response = requests.get(f"{base_url}/v1/result/{task_id}", timeout=timeout)
    except requests.RequestException as e:
        return ConvertResult(content="", format="md", success=False, error=str(e))

    if response.status_code != 200:
        raise DoclingHTTPError(
            response.status_code, response.text or "Result retrieval failed"
        )

    try:
        result_data = response.json()
    except ValueError as e:
        # Covers json/requests JSONDecodeError on every requests version.
        return ConvertResult(content="", format="md", success=False, error=str(e))

    # Same shapes as the sync endpoint: a non-empty list (first entry is
    # used) or a single object.
    if isinstance(result_data, list) and result_data:
        payload = result_data[0]
    else:
        payload = result_data

    if not isinstance(payload, dict):
        raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")

    # NOTE(review): the "content"/"format" keys are taken on trust here;
    # confirm them against the docling-serve response schema.
    return ConvertResult(
        content=payload.get("content", ""),
        format=payload.get("format", "md"),
        success=True,
    )
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mdify-cli
3
- Version: 1.3.1
4
- Summary: Lightweight CLI for converting documents to Markdown via Docling container
3
+ Version: 2.5.0
4
+ Summary: Convert PDFs and document images into structured Markdown for LLM workflows
5
5
  Author: tiroq
6
6
  License-Expression: MIT
7
7
  Project-URL: Homepage, https://github.com/tiroq/mdify
@@ -24,17 +24,20 @@ Classifier: Topic :: Utilities
24
24
  Requires-Python: >=3.8
25
25
  Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
+ Requires-Dist: requests
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=7.0; extra == "dev"
27
30
  Dynamic: license-file
28
31
 
29
32
  # mdify
30
33
 
31
- ![MDify banner](assets/mdify.png)
34
+ ![mdify banner](https://raw.githubusercontent.com/tiroq/mdify/main/assets/mdify.png)
32
35
 
33
36
  [![PyPI](https://img.shields.io/pypi/v/mdify-cli?logo=python&style=flat-square)](https://pypi.org/project/mdify-cli/)
34
37
  [![Container](https://img.shields.io/badge/container-ghcr.io-blue?logo=docker&style=flat-square)](https://github.com/tiroq/mdify/pkgs/container/mdify-runtime)
35
38
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg?style=flat-square)](https://opensource.org/licenses/MIT)
36
39
 
37
- A lightweight CLI for converting documents to Markdown. The CLI is fast to install via pipx, while the heavy ML conversion (Docling) runs inside a container.
40
+ A lightweight CLI for converting documents to Markdown. The CLI is fast to install via pipx, while the heavy ML conversion runs inside a container.
38
41
 
39
42
  ## Requirements
40
43
 
@@ -98,15 +101,32 @@ Recursively convert files:
98
101
  mdify /path/to/documents -r -g "*.pdf"
99
102
  ```
100
103
 
101
- ### Masking sensitive content
104
+ ### GPU Acceleration
102
105
 
103
- Mask PII and sensitive content in images:
106
+ For faster processing with NVIDIA GPU:
104
107
  ```bash
105
- mdify document.pdf -m
106
- mdify document.pdf --mask
108
+ mdify --gpu documents/*.pdf
107
109
  ```
108
110
 
109
- This uses Docling's content-aware masking to obscure sensitive information in embedded images.
111
+ Requires NVIDIA GPU with CUDA support and nvidia-container-toolkit.
112
+
113
+ ### ⚠️ PII Masking (Deprecated)
114
+
115
+ The `--mask` flag is deprecated and will be ignored in this version. PII masking functionality was available in older versions using a custom runtime but is not supported with the current docling-serve backend.
116
+
117
+ If PII masking is critical for your use case, please use mdify v1.5.x or earlier versions.
118
+
119
+ ## Performance
120
+
121
+ mdify now uses docling-serve for significantly faster batch processing:
122
+
123
+ - **Single model load**: Models are loaded once per session, not per file
124
+ - **~10-20x speedup** for multiple file conversions compared to previous versions
125
+ - **GPU acceleration**: Use `--gpu` for additional 2-6x speedup (requires NVIDIA GPU)
126
+
127
+ ### First Run Behavior
128
+
129
+ The first conversion takes longer (~30-60s) as the container loads ML models into memory. Subsequent files in the same batch process quickly, typically in 1-3 seconds per file.
110
130
 
111
131
  ## Options
112
132
 
@@ -119,9 +139,11 @@ This uses Docling's content-aware masking to obscure sensitive information in em
119
139
  | `--flat` | Disable directory structure preservation |
120
140
  | `--overwrite` | Overwrite existing output files |
121
141
  | `-q, --quiet` | Suppress progress messages |
122
- | `-m, --mask` | Mask PII and sensitive content in images |
142
+ | `-m, --mask` | ⚠️ **Deprecated**: PII masking not supported in current version |
143
+ | `--gpu` | Use GPU-accelerated container (requires NVIDIA GPU and nvidia-container-toolkit) |
144
+ | `--port PORT` | Container port (default: 5001) |
123
145
  | `--runtime RUNTIME` | Container runtime: docker or podman (auto-detected) |
124
- | `--image IMAGE` | Custom container image (default: ghcr.io/tiroq/mdify-runtime:latest) |
146
+ | `--image IMAGE` | Custom container image (default: ghcr.io/docling-project/docling-serve-cpu:main) |
125
147
  | `--pull POLICY` | Image pull policy: always, missing, never (default: missing) |
126
148
  | `--check-update` | Check for available updates and exit |
127
149
  | `--version` | Show version and exit |
@@ -175,19 +197,22 @@ The CLI:
175
197
  - Pulls the runtime container on first use
176
198
  - Mounts files and runs conversions in the container
177
199
 
178
- ## Container Image
200
+ ## Container Images
201
+
202
+ mdify uses official docling-serve containers:
179
203
 
180
- The runtime container is hosted at:
204
+ **CPU Version** (default):
181
205
  ```
182
- ghcr.io/tiroq/mdify-runtime:latest
206
+ ghcr.io/docling-project/docling-serve-cpu:main
183
207
  ```
184
208
 
185
- To build locally:
186
- ```bash
187
- cd runtime
188
- docker build -t mdify-runtime .
209
+ **GPU Version** (use with `--gpu` flag):
210
+ ```
211
+ ghcr.io/docling-project/docling-serve-cu126:main
189
212
  ```
190
213
 
214
+ These are official images from the [docling-serve project](https://github.com/docling-project/docling-serve).
215
+
191
216
  ## Updates
192
217
 
193
218
  mdify checks for updates daily. When a new version is available:
@@ -0,0 +1,12 @@
1
+ assets/mdify.png,sha256=qUj7WXWqNwpI2KNXOW79XJwqFqa-UI0JEkmt1mmy4Rg,1820418
2
+ mdify/__init__.py,sha256=lH-hnX0KOG9_zJ_QZ-A_kQFPYghziohhpm7nmxVZc7w,90
3
+ mdify/__main__.py,sha256=bhpJ00co6MfaVOdH4XLoW04NtLYDa_oJK7ODzfLrn9M,143
4
+ mdify/cli.py,sha256=LqIibolYSKGCNYqxuIyFnvPkjJyNlXvfWeKaSaoOrqo,28542
5
+ mdify/container.py,sha256=2oh9NyvFr9lCRb2YYpM_qKP3PPmAin0DbxvNP3m69jw,4158
6
+ mdify/docling_client.py,sha256=_9qjL5yOOeJahOg6an2P6Iii1xkeR6wmNJZG4Q6NRkk,6553
7
+ mdify_cli-2.5.0.dist-info/licenses/LICENSE,sha256=NWM66Uv-XuSMKaU-gaPmvfyk4WgE6zcIPr78wyg6GAo,1065
8
+ mdify_cli-2.5.0.dist-info/METADATA,sha256=egwIWB2tV9F41fcUf3RvfszEJGb--AQVDN3ybI1FFt0,7923
9
+ mdify_cli-2.5.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
10
+ mdify_cli-2.5.0.dist-info/entry_points.txt,sha256=0Xki8f5lADQUtwdt6Eq_FEaieI6Byhk8UE7BuDhChMg,41
11
+ mdify_cli-2.5.0.dist-info/top_level.txt,sha256=qltzf7h8owHq7dxCdfCkSHY8gT21hn1_E8P-VWS_OKM,6
12
+ mdify_cli-2.5.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,9 +0,0 @@
1
- mdify/__init__.py,sha256=i8PTIA0EY8RsB6lf3pwGlb0oX30633B0o2KMcqaGl4c,90
2
- mdify/__main__.py,sha256=bhpJ00co6MfaVOdH4XLoW04NtLYDa_oJK7ODzfLrn9M,143
3
- mdify/cli.py,sha256=t1c3lSDwB5zco-gji-udZkx_5OPCmLNFRN05XULW7TM,21242
4
- mdify_cli-1.3.1.dist-info/licenses/LICENSE,sha256=NWM66Uv-XuSMKaU-gaPmvfyk4WgE6zcIPr78wyg6GAo,1065
5
- mdify_cli-1.3.1.dist-info/METADATA,sha256=pKbl1j497DivGmonSaXZ9tE8wE9x0lS5QXdpQ3ozLaM,6616
6
- mdify_cli-1.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
- mdify_cli-1.3.1.dist-info/entry_points.txt,sha256=0Xki8f5lADQUtwdt6Eq_FEaieI6Byhk8UE7BuDhChMg,41
8
- mdify_cli-1.3.1.dist-info/top_level.txt,sha256=qltzf7h8owHq7dxCdfCkSHY8gT21hn1_E8P-VWS_OKM,6
9
- mdify_cli-1.3.1.dist-info/RECORD,,