mdify-cli 2.6.0__tar.gz → 2.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mdify_cli-2.6.0/mdify_cli.egg-info → mdify_cli-2.8.0}/PKG-INFO +1 -1
- {mdify_cli-2.6.0 → mdify_cli-2.8.0}/mdify/__init__.py +1 -1
- {mdify_cli-2.6.0 → mdify_cli-2.8.0}/mdify/docling_client.py +61 -22
- {mdify_cli-2.6.0 → mdify_cli-2.8.0/mdify_cli.egg-info}/PKG-INFO +1 -1
- {mdify_cli-2.6.0 → mdify_cli-2.8.0}/pyproject.toml +1 -1
- {mdify_cli-2.6.0 → mdify_cli-2.8.0}/tests/test_docling_client.py +133 -0
- {mdify_cli-2.6.0 → mdify_cli-2.8.0}/LICENSE +0 -0
- {mdify_cli-2.6.0 → mdify_cli-2.8.0}/README.md +0 -0
- {mdify_cli-2.6.0 → mdify_cli-2.8.0}/assets/mdify.png +0 -0
- {mdify_cli-2.6.0 → mdify_cli-2.8.0}/mdify/__main__.py +0 -0
- {mdify_cli-2.6.0 → mdify_cli-2.8.0}/mdify/cli.py +0 -0
- {mdify_cli-2.6.0 → mdify_cli-2.8.0}/mdify/container.py +0 -0
- {mdify_cli-2.6.0 → mdify_cli-2.8.0}/mdify_cli.egg-info/SOURCES.txt +0 -0
- {mdify_cli-2.6.0 → mdify_cli-2.8.0}/mdify_cli.egg-info/dependency_links.txt +0 -0
- {mdify_cli-2.6.0 → mdify_cli-2.8.0}/mdify_cli.egg-info/entry_points.txt +0 -0
- {mdify_cli-2.6.0 → mdify_cli-2.8.0}/mdify_cli.egg-info/requires.txt +0 -0
- {mdify_cli-2.6.0 → mdify_cli-2.8.0}/mdify_cli.egg-info/top_level.txt +0 -0
- {mdify_cli-2.6.0 → mdify_cli-2.8.0}/setup.cfg +0 -0
- {mdify_cli-2.6.0 → mdify_cli-2.8.0}/tests/test_cli.py +0 -0
- {mdify_cli-2.6.0 → mdify_cli-2.8.0}/tests/test_container.py +0 -0
|
@@ -4,6 +4,8 @@ from dataclasses import dataclass
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from typing import Optional
|
|
6
6
|
|
|
7
|
+
import mimetypes
|
|
8
|
+
|
|
7
9
|
import requests
|
|
8
10
|
|
|
9
11
|
|
|
@@ -40,6 +42,48 @@ class DoclingHTTPError(DoclingClientError):
|
|
|
40
42
|
super().__init__(f"HTTP {status_code}: {message}")
|
|
41
43
|
|
|
42
44
|
|
|
45
|
+
def _get_mime_type(file_path: Path) -> str:
|
|
46
|
+
"""Get MIME type for file, with fallback for unknown types."""
|
|
47
|
+
mime_type, _ = mimetypes.guess_type(str(file_path))
|
|
48
|
+
return mime_type or "application/octet-stream"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _extract_content(result_data) -> str:
|
|
52
|
+
"""Extract content from API response, supporting both old and new formats.
|
|
53
|
+
|
|
54
|
+
Supports:
|
|
55
|
+
- New format: {"document": {"md_content": "..."}}
|
|
56
|
+
- Fallback: {"document": {"content": "..."}}
|
|
57
|
+
- Old format: {"content": "..."}
|
|
58
|
+
- List format: [{"document": {...}} or {"content": "..."}]
|
|
59
|
+
|
|
60
|
+
Args:
|
|
61
|
+
result_data: Response data from docling-serve API
|
|
62
|
+
|
|
63
|
+
Returns:
|
|
64
|
+
Extracted content string, or empty string if not found
|
|
65
|
+
"""
|
|
66
|
+
if isinstance(result_data, dict):
|
|
67
|
+
# New format with document field
|
|
68
|
+
if "document" in result_data:
|
|
69
|
+
doc = result_data["document"]
|
|
70
|
+
# Try md_content first, then content
|
|
71
|
+
return doc.get("md_content", "") or doc.get("content", "")
|
|
72
|
+
# Old format without document field
|
|
73
|
+
return result_data.get("content", "")
|
|
74
|
+
elif isinstance(result_data, list) and len(result_data) > 0:
|
|
75
|
+
# List format - process first item
|
|
76
|
+
first_result = result_data[0]
|
|
77
|
+
if isinstance(first_result, dict):
|
|
78
|
+
if "document" in first_result:
|
|
79
|
+
doc = first_result["document"]
|
|
80
|
+
# Try md_content first, then content
|
|
81
|
+
return doc.get("md_content", "") or doc.get("content", "")
|
|
82
|
+
# Old format without document field
|
|
83
|
+
return first_result.get("content", "")
|
|
84
|
+
return ""
|
|
85
|
+
|
|
86
|
+
|
|
43
87
|
def check_health(base_url: str) -> bool:
|
|
44
88
|
"""Check if docling-serve is healthy.
|
|
45
89
|
|
|
@@ -77,7 +121,7 @@ def convert_file(
|
|
|
77
121
|
with open(file_path, "rb") as f:
|
|
78
122
|
response = requests.post(
|
|
79
123
|
f"{base_url}/v1/convert/file",
|
|
80
|
-
files={"files": (file_path.name, f,
|
|
124
|
+
files={"files": (file_path.name, f, _get_mime_type(file_path))},
|
|
81
125
|
data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
|
|
82
126
|
)
|
|
83
127
|
|
|
@@ -87,17 +131,10 @@ def convert_file(
|
|
|
87
131
|
)
|
|
88
132
|
|
|
89
133
|
result_data = response.json()
|
|
134
|
+
content = _extract_content(result_data)
|
|
90
135
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
first_result = result_data[0]
|
|
94
|
-
return ConvertResult(
|
|
95
|
-
content=first_result.get("content", ""), format=to_format, success=True
|
|
96
|
-
)
|
|
97
|
-
elif isinstance(result_data, dict):
|
|
98
|
-
return ConvertResult(
|
|
99
|
-
content=result_data.get("content", ""), format=to_format, success=True
|
|
100
|
-
)
|
|
136
|
+
if content or isinstance(result_data, (dict, list)):
|
|
137
|
+
return ConvertResult(content=content, format=to_format, success=True)
|
|
101
138
|
else:
|
|
102
139
|
raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")
|
|
103
140
|
|
|
@@ -126,7 +163,7 @@ def convert_file_async(
|
|
|
126
163
|
with open(file_path, "rb") as f:
|
|
127
164
|
response = requests.post(
|
|
128
165
|
f"{base_url}/v1/convert/file/async",
|
|
129
|
-
files={"files": (file_path.name, f,
|
|
166
|
+
files={"files": (file_path.name, f, _get_mime_type(file_path))},
|
|
130
167
|
data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
|
|
131
168
|
)
|
|
132
169
|
|
|
@@ -202,19 +239,21 @@ def get_result(base_url: str, task_id: str) -> ConvertResult:
|
|
|
202
239
|
)
|
|
203
240
|
|
|
204
241
|
result_data = response.json()
|
|
242
|
+
content = _extract_content(result_data)
|
|
205
243
|
|
|
206
|
-
#
|
|
207
|
-
|
|
244
|
+
# Determine format from response, defaulting to "md"
|
|
245
|
+
result_format = "md"
|
|
246
|
+
if isinstance(result_data, dict):
|
|
247
|
+
result_format = result_data.get("format", "md")
|
|
248
|
+
elif isinstance(result_data, list) and len(result_data) > 0:
|
|
208
249
|
first_result = result_data[0]
|
|
250
|
+
if isinstance(first_result, dict):
|
|
251
|
+
result_format = first_result.get("format", "md")
|
|
252
|
+
|
|
253
|
+
if content or isinstance(result_data, (dict, list)):
|
|
209
254
|
return ConvertResult(
|
|
210
|
-
content=
|
|
211
|
-
format=
|
|
212
|
-
success=True,
|
|
213
|
-
)
|
|
214
|
-
elif isinstance(result_data, dict):
|
|
215
|
-
return ConvertResult(
|
|
216
|
-
content=result_data.get("content", ""),
|
|
217
|
-
format=result_data.get("format", "md"),
|
|
255
|
+
content=content,
|
|
256
|
+
format=result_format,
|
|
218
257
|
success=True,
|
|
219
258
|
)
|
|
220
259
|
else:
|
|
@@ -133,6 +133,26 @@ class TestConvertFile:
|
|
|
133
133
|
|
|
134
134
|
assert result.format == "html"
|
|
135
135
|
|
|
136
|
+
def test_convert_file_new_document_format(self, tmp_path):
    """A ``document.md_content`` payload is surfaced as the converted content."""
    pdf_path = tmp_path / "test.pdf"
    pdf_path.write_bytes(b"fake pdf content")

    # Canned reply in the newer response shape, with the markdown nested
    # under "document".
    fake_response = Mock(status_code=200)
    fake_response.json.return_value = {
        "document": {"md_content": "# New Format Content\n\nMarkdown here."}
    }

    with patch("mdify.docling_client.requests.post") as mock_post:
        mock_post.return_value = fake_response
        result = convert_file("http://localhost:5001", pdf_path)

    assert result.success is True
    assert "# New Format Content" in result.content
    assert result.error is None
    assert result.format == "md"
|
|
155
|
+
|
|
136
156
|
|
|
137
157
|
class TestConvertFileAsync:
|
|
138
158
|
"""Test async file conversion."""
|
|
@@ -302,6 +322,25 @@ class TestGetResult:
|
|
|
302
322
|
assert result.success is False
|
|
303
323
|
assert result.error is not None
|
|
304
324
|
|
|
325
|
+
def test_get_result_new_document_format(self):
    """``get_result`` reads markdown from a ``document.md_content`` payload."""
    # Canned reply in the newer response shape, with the markdown nested
    # under "document" and no explicit "format" field.
    fake_response = Mock(status_code=200)
    fake_response.json.return_value = {
        "document": {
            "md_content": "# Result with MD Content\n\nFormatted markdown."
        }
    }

    with patch("mdify.docling_client.requests.get") as mock_get:
        mock_get.return_value = fake_response
        result = get_result("http://localhost:5001", "abc123")

    assert result.success is True
    assert "# Result with MD Content" in result.content
    assert result.error is None
    assert result.format == "md"
|
|
343
|
+
|
|
305
344
|
|
|
306
345
|
class TestDataClasses:
|
|
307
346
|
"""Test dataclass definitions."""
|
|
@@ -356,3 +395,97 @@ class TestDoclingHTTPError:
|
|
|
356
395
|
error = DoclingHTTPError(400, "Bad Request")
|
|
357
396
|
|
|
358
397
|
assert isinstance(error, Exception)
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
class TestMimeTypeDetection:
    """Verify the MIME type attached to uploads made by ``convert_file``."""

    def _posted_mime_type(self, upload_path, markdown):
        """Run ``convert_file`` against a mocked POST and return the MIME sent.

        Args:
            upload_path: Existing file handed to ``convert_file``.
            markdown: Content string placed in the canned list-format reply.

        Returns:
            The third element of the ``files["files"]`` tuple passed to
            ``requests.post``.
        """
        with patch("mdify.docling_client.requests.post") as mock_post:
            reply = Mock(status_code=200)
            reply.json.return_value = [{"content": markdown}]
            mock_post.return_value = reply

            convert_file("http://localhost:5001", upload_path)

            mock_post.assert_called_once()
            _positional, keyword_args = mock_post.call_args
            # The upload is a (filename, file object, MIME type) triple.
            _name, _handle, sent_mime = keyword_args["files"]["files"]
        return sent_mime

    def test_convert_file_sends_correct_mime_for_xlsx(self, tmp_path):
        """An .xlsx upload carries the spreadsheetml MIME type."""
        workbook = tmp_path / "test.xlsx"
        workbook.write_bytes(b"fake xlsx content")

        sent = self._posted_mime_type(
            workbook, "# Test Spreadsheet\n\nContent here."
        )

        assert (
            sent
            == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

    def test_convert_file_sends_correct_mime_for_pdf(self, tmp_path):
        """A .pdf upload still carries application/pdf (regression test)."""
        pdf = tmp_path / "test.pdf"
        pdf.write_bytes(b"fake pdf content")

        sent = self._posted_mime_type(pdf, "# Test Document\n\nContent here.")

        assert sent == "application/pdf"

    def test_convert_file_sends_correct_mime_for_docx(self, tmp_path):
        """A .docx upload carries the wordprocessingml MIME type."""
        word_doc = tmp_path / "test.docx"
        word_doc.write_bytes(b"fake docx content")

        sent = self._posted_mime_type(
            word_doc, "# Test Document\n\nContent here."
        )

        assert (
            sent
            == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        )

    def test_convert_file_fallback_for_unknown_extension(self, tmp_path):
        """An unrecognised extension falls back to application/octet-stream."""
        mystery = tmp_path / "test.unknownext123"
        mystery.write_bytes(b"fake unknown content")

        sent = self._posted_mime_type(mystery, "# Test Content\n\nContent here.")

        assert sent == "application/octet-stream"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|