mdify-cli 2.5.0__tar.gz → 2.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mdify-cli
3
- Version: 2.5.0
3
+ Version: 2.8.0
4
4
  Summary: Convert PDFs and document images into structured Markdown for LLM workflows
5
5
  Author: tiroq
6
6
  License-Expression: MIT
@@ -1,3 +1,3 @@
1
1
  """mdify - Convert documents to Markdown via Docling container."""
2
2
 
3
- __version__ = "2.5.0"
3
+ __version__ = "2.8.0"
@@ -41,6 +41,39 @@ class DoclingContainer:
41
41
  """Return base URL for API requests."""
42
42
  return f"http://localhost:{self.port}"
43
43
 
44
def _cleanup_stale_containers(self) -> None:
    """Stop leftover ``mdify-serve-*`` containers before a new one starts.

    A previous run can leave its container running (crash, interrupt or
    timeout). The sweep is best-effort: if the runtime cannot be launched,
    or listing/stopping fails, the method returns quietly and leaves error
    reporting to the caller's own runtime invocation.

    NOTE(review): every running container whose name starts with
    ``mdify-serve-`` is stopped, which would include one owned by another
    mdify process running at the same time -- confirm this is acceptable.
    """
    prefix = "mdify-serve-"

    try:
        listing = subprocess.run(
            [
                self.runtime,
                "ps",
                "--filter",
                f"name={prefix}",
                "--format",
                "{{.Names}}",
            ],
            capture_output=True,
            text=True,
            check=False,
        )
    except OSError:
        # Runtime binary missing or not executable: nothing we can sweep.
        return

    if listing.returncode != 0:
        return

    # splitlines() tolerates CRLF output. The runtime's name filter is
    # documented as a partial match, so the prefix is re-checked here
    # before anything is stopped.
    for raw_name in listing.stdout.splitlines():
        container_name = raw_name.strip()
        if not container_name.startswith(prefix):
            continue
        try:
            subprocess.run(
                [self.runtime, "stop", container_name],
                capture_output=True,
                check=False,
            )
        except OSError:
            # The runtime disappeared mid-sweep; stop trying.
            return
76
+
44
77
  def start(self, timeout: int = 120) -> None:
45
78
  """Start container and wait for health check.
46
79
 
@@ -51,6 +84,8 @@ class DoclingContainer:
51
84
  subprocess.CalledProcessError: If container fails to start
52
85
  TimeoutError: If health check doesn't pass within timeout
53
86
  """
87
+ self._cleanup_stale_containers()
88
+
54
89
  # Start container in detached mode
55
90
  cmd = [
56
91
  self.runtime,
@@ -4,6 +4,8 @@ from dataclasses import dataclass
4
4
  from pathlib import Path
5
5
  from typing import Optional
6
6
 
7
+ import mimetypes
8
+
7
9
  import requests
8
10
 
9
11
 
@@ -40,6 +42,48 @@ class DoclingHTTPError(DoclingClientError):
40
42
  super().__init__(f"HTTP {status_code}: {message}")
41
43
 
42
44
 
45
def _get_mime_type(file_path: Path) -> str:
    """Return the MIME type to send when uploading *file_path*.

    The standard ``mimetypes`` database is consulted first. Its contents
    depend on the Python version and on the host's MIME tables, so a small
    built-in table covers common document formats when the lookup finds
    nothing. Anything still unknown is reported as
    ``application/octet-stream``.

    Args:
        file_path: Path of the file about to be uploaded; only its name is
            inspected, the file is never opened.

    Returns:
        A MIME type string, never empty.
    """
    mime_type, _ = mimetypes.guess_type(str(file_path))
    if mime_type:
        return mime_type

    # Fallbacks for hosts whose MIME database lacks these extensions.
    fallback_types = {
        ".docx": (
            "application/vnd.openxmlformats-officedocument"
            ".wordprocessingml.document"
        ),
        ".xlsx": (
            "application/vnd.openxmlformats-officedocument"
            ".spreadsheetml.sheet"
        ),
        ".pptx": (
            "application/vnd.openxmlformats-officedocument"
            ".presentationml.presentation"
        ),
        ".md": "text/markdown",
    }
    suffix = Path(file_path).suffix.lower()
    return fallback_types.get(suffix, "application/octet-stream")
49
+
50
+
51
def _extract_content(result_data) -> str:
    """Extract converted text from a conversion API response.

    Supported shapes:
        - New format: {"document": {"md_content": "..."}}
        - Fallback: {"document": {"content": "..."}}
        - Other export fields inside "document" ("html_content",
          "text_content", "doctags_content"), tried only when neither of
          the two keys above holds text
        - Old format: {"content": "..."}
        - List format: a list whose first item has one of the shapes above
          (only the first item is consulted)

    Args:
        result_data: Decoded JSON response body.

    Returns:
        The first non-empty string found, or an empty string when the
        payload has none of the supported shapes. The result is always a
        ``str``; null or non-string field values are skipped.
    """
    if isinstance(result_data, list):
        result_data = result_data[0] if result_data else None
    if not isinstance(result_data, dict):
        return ""

    if "document" in result_data:
        source = result_data["document"]
        if not isinstance(source, dict):
            # e.g. {"document": null} -- nothing to read from.
            return ""
        candidate_keys = (
            "md_content",
            "content",
            "html_content",
            "text_content",
            "doctags_content",
        )
    else:
        # Old format keeps the text at the top level.
        source = result_data
        candidate_keys = ("content",)

    for key in candidate_keys:
        value = source.get(key)
        if isinstance(value, str) and value:
            return value
    return ""
85
+
86
+
43
87
  def check_health(base_url: str) -> bool:
44
88
  """Check if docling-serve is healthy.
45
89
 
@@ -77,7 +121,7 @@ def convert_file(
77
121
  with open(file_path, "rb") as f:
78
122
  response = requests.post(
79
123
  f"{base_url}/v1/convert/file",
80
- files={"files": (file_path.name, f, "application/pdf")},
124
+ files={"files": (file_path.name, f, _get_mime_type(file_path))},
81
125
  data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
82
126
  )
83
127
 
@@ -87,17 +131,10 @@ def convert_file(
87
131
  )
88
132
 
89
133
  result_data = response.json()
134
+ content = _extract_content(result_data)
90
135
 
91
- # docling-serve returns results in a list format
92
- if isinstance(result_data, list) and len(result_data) > 0:
93
- first_result = result_data[0]
94
- return ConvertResult(
95
- content=first_result.get("content", ""), format=to_format, success=True
96
- )
97
- elif isinstance(result_data, dict):
98
- return ConvertResult(
99
- content=result_data.get("content", ""), format=to_format, success=True
100
- )
136
+ if content or isinstance(result_data, (dict, list)):
137
+ return ConvertResult(content=content, format=to_format, success=True)
101
138
  else:
102
139
  raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")
103
140
 
@@ -126,7 +163,7 @@ def convert_file_async(
126
163
  with open(file_path, "rb") as f:
127
164
  response = requests.post(
128
165
  f"{base_url}/v1/convert/file/async",
129
- files={"files": (file_path.name, f, "application/pdf")},
166
+ files={"files": (file_path.name, f, _get_mime_type(file_path))},
130
167
  data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
131
168
  )
132
169
 
@@ -202,19 +239,21 @@ def get_result(base_url: str, task_id: str) -> ConvertResult:
202
239
  )
203
240
 
204
241
  result_data = response.json()
242
+ content = _extract_content(result_data)
205
243
 
206
- # Similar to sync conversion, handle list or dict format
207
- if isinstance(result_data, list) and len(result_data) > 0:
244
+ # Determine format from response, defaulting to "md"
245
+ result_format = "md"
246
+ if isinstance(result_data, dict):
247
+ result_format = result_data.get("format", "md")
248
+ elif isinstance(result_data, list) and len(result_data) > 0:
208
249
  first_result = result_data[0]
250
+ if isinstance(first_result, dict):
251
+ result_format = first_result.get("format", "md")
252
+
253
+ if content or isinstance(result_data, (dict, list)):
209
254
  return ConvertResult(
210
- content=first_result.get("content", ""),
211
- format=first_result.get("format", "md"),
212
- success=True,
213
- )
214
- elif isinstance(result_data, dict):
215
- return ConvertResult(
216
- content=result_data.get("content", ""),
217
- format=result_data.get("format", "md"),
255
+ content=content,
256
+ format=result_format,
218
257
  success=True,
219
258
  )
220
259
  else:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mdify-cli
3
- Version: 2.5.0
3
+ Version: 2.8.0
4
4
  Summary: Convert PDFs and document images into structured Markdown for LLM workflows
5
5
  Author: tiroq
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "mdify-cli"
3
- version = "2.5.0"
3
+ version = "2.8.0"
4
4
  description = "Convert PDFs and document images into structured Markdown for LLM workflows"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.8"
@@ -311,7 +311,110 @@ class TestDoclingContainerIntegration:
311
311
  container1 = DoclingContainer("docker", "image1", port=5001)
312
312
  container2 = DoclingContainer("docker", "image2", port=5002)
313
313
 
314
- # Each should be independent
315
314
  assert container1.port == 5001
316
315
  assert container2.port == 5002
317
316
  assert container1.container_name != container2.container_name
317
+
318
+
319
class TestDoclingContainerCleanup:
    """Check the stale-container sweep that runs at the top of ``start()``."""

    @staticmethod
    def _ps_outcome(returncode, stdout):
        """Build the fake result handed back for the listing call."""
        outcome = Mock()
        outcome.returncode = returncode
        outcome.stdout = stdout
        return outcome

    @staticmethod
    def _run_outcome(stdout):
        """Build the fake result handed back for the container launch."""
        outcome = Mock()
        outcome.stdout = stdout
        return outcome

    @staticmethod
    def _start_container(outcomes):
        """Start a container with subprocess and health check patched out.

        Returns the container and the recorded ``subprocess.run`` calls.
        """
        with patch("mdify.container.subprocess.run") as mock_run, patch(
            "mdify.container.check_health"
        ) as mock_health:
            mock_run.side_effect = outcomes
            mock_health.return_value = True

            container = DoclingContainer("docker", "test-image")
            container.start(timeout=5)

        return container, mock_run.call_args_list

    def test_cleanup_no_stale_containers(self):
        """The listing command is still issued when nothing is running."""
        _, recorded = self._start_container(
            [self._ps_outcome(0, ""), self._run_outcome("container_id\n")]
        )

        listing_argv = recorded[0][0][0]
        for expected in ("ps", "--filter", "name=mdify-serve-"):
            assert expected in listing_argv

    def test_cleanup_stops_stale_containers(self):
        """Each listed container gets its own stop command, in order."""
        _, recorded = self._start_container(
            [
                self._ps_outcome(0, "mdify-serve-abc123\nmdify-serve-def456\n"),
                Mock(),
                Mock(),
                self._run_outcome("container_id\n"),
            ]
        )

        assert "ps" in recorded[0][0][0]

        stop_calls = [entry for entry in recorded if "stop" in str(entry)]
        assert len(stop_calls) == 2
        assert "mdify-serve-abc123" in str(stop_calls[0])
        assert "mdify-serve-def456" in str(stop_calls[1])

    def test_cleanup_handles_subprocess_error(self):
        """A failing listing command does not prevent the start."""
        container, _ = self._start_container(
            [self._ps_outcome(1, ""), self._run_outcome("container_id\n")]
        )

        assert container.container_id == "container_id"

    def test_start_calls_cleanup(self):
        """``start()`` lists containers first and then launches one."""
        _, recorded = self._start_container(
            [self._ps_outcome(0, ""), self._run_outcome("new_container_id\n")]
        )

        ps_called = any("ps" in str(entry) for entry in recorded[:1])
        run_called = any("run" in str(entry) for entry in recorded)

        assert ps_called
        assert run_called
        assert "ps" in recorded[0][0][0]
@@ -133,6 +133,26 @@ class TestConvertFile:
133
133
 
134
134
  assert result.format == "html"
135
135
 
136
def test_convert_file_new_document_format(self, tmp_path):
    """``convert_file`` reads Markdown from a ``document.md_content`` body."""
    pdf_path = tmp_path / "test.pdf"
    pdf_path.write_bytes(b"fake pdf content")

    fake_response = Mock(status_code=200)
    fake_response.json.return_value = {
        "document": {"md_content": "# New Format Content\n\nMarkdown here."}
    }

    with patch(
        "mdify.docling_client.requests.post", return_value=fake_response
    ):
        outcome = convert_file("http://localhost:5001", pdf_path)

    assert outcome.success is True
    assert "# New Format Content" in outcome.content
    assert outcome.error is None
    assert outcome.format == "md"
155
+
136
156
 
137
157
  class TestConvertFileAsync:
138
158
  """Test async file conversion."""
@@ -302,6 +322,25 @@ class TestGetResult:
302
322
  assert result.success is False
303
323
  assert result.error is not None
304
324
 
325
def test_get_result_new_document_format(self):
    """``get_result`` reads Markdown from a ``document.md_content`` body."""
    fake_response = Mock(status_code=200)
    fake_response.json.return_value = {
        "document": {
            "md_content": "# Result with MD Content\n\nFormatted markdown."
        }
    }

    with patch(
        "mdify.docling_client.requests.get", return_value=fake_response
    ):
        outcome = get_result("http://localhost:5001", "abc123")

    assert outcome.success is True
    assert "# Result with MD Content" in outcome.content
    assert outcome.error is None
    assert outcome.format == "md"
343
+
305
344
 
306
345
  class TestDataClasses:
307
346
  """Test dataclass definitions."""
@@ -356,3 +395,97 @@ class TestDoclingHTTPError:
356
395
  error = DoclingHTTPError(400, "Bad Request")
357
396
 
358
397
  assert isinstance(error, Exception)
398
+
399
+
400
class TestMimeTypeDetection:
    """Check the MIME type that ``convert_file`` attaches to the upload."""

    @staticmethod
    def _uploaded_mime_type(tmp_path, filename, payload, markdown):
        """Run ``convert_file`` on a scratch file and return the MIME sent.

        ``requests.post`` is patched, so the value comes from the ``files``
        keyword argument of the single recorded call.
        """
        upload = tmp_path / filename
        upload.write_bytes(payload)

        fake_response = Mock(status_code=200)
        fake_response.json.return_value = [{"content": markdown}]

        with patch(
            "mdify.docling_client.requests.post", return_value=fake_response
        ) as mock_post:
            convert_file("http://localhost:5001", upload)

        mock_post.assert_called_once()
        _name, _handle, mime_type = mock_post.call_args[1]["files"]["files"]
        return mime_type

    def test_convert_file_sends_correct_mime_for_xlsx(self, tmp_path):
        """A .xlsx upload carries the spreadsheet MIME type."""
        sent = self._uploaded_mime_type(
            tmp_path,
            "test.xlsx",
            b"fake xlsx content",
            "# Test Spreadsheet\n\nContent here.",
        )
        assert (
            sent
            == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

    def test_convert_file_sends_correct_mime_for_pdf(self, tmp_path):
        """A .pdf upload still carries application/pdf (regression test)."""
        sent = self._uploaded_mime_type(
            tmp_path,
            "test.pdf",
            b"fake pdf content",
            "# Test Document\n\nContent here.",
        )
        assert sent == "application/pdf"

    def test_convert_file_sends_correct_mime_for_docx(self, tmp_path):
        """A .docx upload carries the word-processing MIME type."""
        sent = self._uploaded_mime_type(
            tmp_path,
            "test.docx",
            b"fake docx content",
            "# Test Document\n\nContent here.",
        )
        assert (
            sent
            == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        )

    def test_convert_file_fallback_for_unknown_extension(self, tmp_path):
        """An unrecognised extension falls back to application/octet-stream."""
        sent = self._uploaded_mime_type(
            tmp_path,
            "test.unknownext123",
            b"fake unknown content",
            "# Test Content\n\nContent here.",
        )
        assert sent == "application/octet-stream"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes