mdify-cli 2.6.0__py3-none-any.whl → 2.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdify/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """mdify - Convert documents to Markdown via Docling container."""
2
2
 
3
- __version__ = "2.6.0"
3
+ __version__ = "2.8.0"
mdify/docling_client.py CHANGED
@@ -4,6 +4,8 @@ from dataclasses import dataclass
4
4
  from pathlib import Path
5
5
  from typing import Optional
6
6
 
7
+ import mimetypes
8
+
7
9
  import requests
8
10
 
9
11
 
@@ -40,6 +42,48 @@ class DoclingHTTPError(DoclingClientError):
40
42
  super().__init__(f"HTTP {status_code}: {message}")
41
43
 
42
44
 
45
+ def _get_mime_type(file_path: Path) -> str:
46
+ """Get MIME type for file, with fallback for unknown types."""
47
+ mime_type, _ = mimetypes.guess_type(str(file_path))
48
+ return mime_type or "application/octet-stream"
49
+
50
+
51
+ def _extract_content(result_data) -> str:
52
+ """Extract content from API response, supporting both old and new formats.
53
+
54
+ Supports:
55
+ - New format: {"document": {"md_content": "..."}}
56
+ - Fallback: {"document": {"content": "..."}}
57
+ - Old format: {"content": "..."}
58
+ - List format: [{"document": {...}} or {"content": "..."}]
59
+
60
+ Args:
61
+ result_data: Response data from docling-serve API
62
+
63
+ Returns:
64
+ Extracted content string, or empty string if not found
65
+ """
66
+ if isinstance(result_data, dict):
67
+ # New format with document field
68
+ if "document" in result_data:
69
+ doc = result_data["document"]
70
+ # Try md_content first, then content
71
+ return doc.get("md_content", "") or doc.get("content", "")
72
+ # Old format without document field
73
+ return result_data.get("content", "")
74
+ elif isinstance(result_data, list) and len(result_data) > 0:
75
+ # List format - process first item
76
+ first_result = result_data[0]
77
+ if isinstance(first_result, dict):
78
+ if "document" in first_result:
79
+ doc = first_result["document"]
80
+ # Try md_content first, then content
81
+ return doc.get("md_content", "") or doc.get("content", "")
82
+ # Old format without document field
83
+ return first_result.get("content", "")
84
+ return ""
85
+
86
+
43
87
  def check_health(base_url: str) -> bool:
44
88
  """Check if docling-serve is healthy.
45
89
 
@@ -77,7 +121,7 @@ def convert_file(
77
121
  with open(file_path, "rb") as f:
78
122
  response = requests.post(
79
123
  f"{base_url}/v1/convert/file",
80
- files={"files": (file_path.name, f, "application/pdf")},
124
+ files={"files": (file_path.name, f, _get_mime_type(file_path))},
81
125
  data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
82
126
  )
83
127
 
@@ -87,17 +131,10 @@ def convert_file(
87
131
  )
88
132
 
89
133
  result_data = response.json()
134
+ content = _extract_content(result_data)
90
135
 
91
- # docling-serve returns results in a list format
92
- if isinstance(result_data, list) and len(result_data) > 0:
93
- first_result = result_data[0]
94
- return ConvertResult(
95
- content=first_result.get("content", ""), format=to_format, success=True
96
- )
97
- elif isinstance(result_data, dict):
98
- return ConvertResult(
99
- content=result_data.get("content", ""), format=to_format, success=True
100
- )
136
+ if content or isinstance(result_data, (dict, list)):
137
+ return ConvertResult(content=content, format=to_format, success=True)
101
138
  else:
102
139
  raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")
103
140
 
@@ -126,7 +163,7 @@ def convert_file_async(
126
163
  with open(file_path, "rb") as f:
127
164
  response = requests.post(
128
165
  f"{base_url}/v1/convert/file/async",
129
- files={"files": (file_path.name, f, "application/pdf")},
166
+ files={"files": (file_path.name, f, _get_mime_type(file_path))},
130
167
  data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
131
168
  )
132
169
 
@@ -202,19 +239,21 @@ def get_result(base_url: str, task_id: str) -> ConvertResult:
202
239
  )
203
240
 
204
241
  result_data = response.json()
242
+ content = _extract_content(result_data)
205
243
 
206
- # Similar to sync conversion, handle list or dict format
207
- if isinstance(result_data, list) and len(result_data) > 0:
244
+ # Determine format from response, defaulting to "md"
245
+ result_format = "md"
246
+ if isinstance(result_data, dict):
247
+ result_format = result_data.get("format", "md")
248
+ elif isinstance(result_data, list) and len(result_data) > 0:
208
249
  first_result = result_data[0]
250
+ if isinstance(first_result, dict):
251
+ result_format = first_result.get("format", "md")
252
+
253
+ if content or isinstance(result_data, (dict, list)):
209
254
  return ConvertResult(
210
- content=first_result.get("content", ""),
211
- format=first_result.get("format", "md"),
212
- success=True,
213
- )
214
- elif isinstance(result_data, dict):
215
- return ConvertResult(
216
- content=result_data.get("content", ""),
217
- format=result_data.get("format", "md"),
255
+ content=content,
256
+ format=result_format,
218
257
  success=True,
219
258
  )
220
259
  else:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mdify-cli
3
- Version: 2.6.0
3
+ Version: 2.8.0
4
4
  Summary: Convert PDFs and document images into structured Markdown for LLM workflows
5
5
  Author: tiroq
6
6
  License-Expression: MIT
@@ -0,0 +1,12 @@
1
+ assets/mdify.png,sha256=qUj7WXWqNwpI2KNXOW79XJwqFqa-UI0JEkmt1mmy4Rg,1820418
2
+ mdify/__init__.py,sha256=YBAx8MdkINw38Jx6zeS5ikx8buI1avmzcHF2v23nZQU,90
3
+ mdify/__main__.py,sha256=bhpJ00co6MfaVOdH4XLoW04NtLYDa_oJK7ODzfLrn9M,143
4
+ mdify/cli.py,sha256=LqIibolYSKGCNYqxuIyFnvPkjJyNlXvfWeKaSaoOrqo,28542
5
+ mdify/container.py,sha256=tkk0nv7EquL-rKUY4nkS_yGITb7mqw8B7eEfuqaeVrg,5239
6
+ mdify/docling_client.py,sha256=xuQR6sC1v3EPloOSwExoHCqT4uUxE8myYq-Yeby3C2I,7975
7
+ mdify_cli-2.8.0.dist-info/licenses/LICENSE,sha256=NWM66Uv-XuSMKaU-gaPmvfyk4WgE6zcIPr78wyg6GAo,1065
8
+ mdify_cli-2.8.0.dist-info/METADATA,sha256=LKU3PAHABNp5dT9KJ3hGeCMSXxjDkIFXNveXzRv2fIA,7923
9
+ mdify_cli-2.8.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
10
+ mdify_cli-2.8.0.dist-info/entry_points.txt,sha256=0Xki8f5lADQUtwdt6Eq_FEaieI6Byhk8UE7BuDhChMg,41
11
+ mdify_cli-2.8.0.dist-info/top_level.txt,sha256=qltzf7h8owHq7dxCdfCkSHY8gT21hn1_E8P-VWS_OKM,6
12
+ mdify_cli-2.8.0.dist-info/RECORD,,
@@ -1,12 +0,0 @@
1
- assets/mdify.png,sha256=qUj7WXWqNwpI2KNXOW79XJwqFqa-UI0JEkmt1mmy4Rg,1820418
2
- mdify/__init__.py,sha256=4mWutp3KF_BH9sz_oEPFBoN7Ee6vamK3cHDBpUtRQVY,90
3
- mdify/__main__.py,sha256=bhpJ00co6MfaVOdH4XLoW04NtLYDa_oJK7ODzfLrn9M,143
4
- mdify/cli.py,sha256=LqIibolYSKGCNYqxuIyFnvPkjJyNlXvfWeKaSaoOrqo,28542
5
- mdify/container.py,sha256=tkk0nv7EquL-rKUY4nkS_yGITb7mqw8B7eEfuqaeVrg,5239
6
- mdify/docling_client.py,sha256=_9qjL5yOOeJahOg6an2P6Iii1xkeR6wmNJZG4Q6NRkk,6553
7
- mdify_cli-2.6.0.dist-info/licenses/LICENSE,sha256=NWM66Uv-XuSMKaU-gaPmvfyk4WgE6zcIPr78wyg6GAo,1065
8
- mdify_cli-2.6.0.dist-info/METADATA,sha256=NcyfsGSLiSkz0NkRdc6g5pOervCpXJbWEIDSPnYSvFk,7923
9
- mdify_cli-2.6.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
10
- mdify_cli-2.6.0.dist-info/entry_points.txt,sha256=0Xki8f5lADQUtwdt6Eq_FEaieI6Byhk8UE7BuDhChMg,41
11
- mdify_cli-2.6.0.dist-info/top_level.txt,sha256=qltzf7h8owHq7dxCdfCkSHY8gT21hn1_E8P-VWS_OKM,6
12
- mdify_cli-2.6.0.dist-info/RECORD,,