mdify-cli 2.5.0__tar.gz → 2.7.0__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mdify-cli
3
- Version: 2.5.0
3
+ Version: 2.7.0
4
4
  Summary: Convert PDFs and document images into structured Markdown for LLM workflows
5
5
  Author: tiroq
6
6
  License-Expression: MIT
@@ -1,3 +1,3 @@
1
1
  """mdify - Convert documents to Markdown via Docling container."""
2
2
 
3
- __version__ = "2.5.0"
3
+ __version__ = "2.7.0"
@@ -41,6 +41,39 @@ class DoclingContainer:
41
41
  """Return base URL for API requests."""
42
42
  return f"http://localhost:{self.port}"
43
43
 
44
def _cleanup_stale_containers(self) -> None:
    """Stop running containers left behind by earlier mdify runs.

    A previous run may have left its ``mdify-serve-*`` container running
    (e.g., due to crash, interrupt, or timeout). This asks the container
    runtime (``self.runtime``) for running containers matching that name
    and stops each one.

    Cleanup is best-effort: a non-zero exit status from the ``ps`` or
    ``stop`` command is ignored and nothing is reported to the caller.
    """
    name_prefix = "mdify-serve-"

    # List the names of running containers that match the name filter.
    listing = subprocess.run(
        [
            self.runtime,
            "ps",
            "--filter",
            f"name={name_prefix}",
            "--format",
            "{{.Names}}",
        ],
        capture_output=True,
        text=True,
        check=False,
    )
    if listing.returncode != 0:
        return

    # The runtime's name filter matches on part of a container name, so
    # re-check the prefix here to avoid stopping unrelated containers whose
    # names merely contain "mdify-serve-".
    stale_names = [
        name
        for name in (line.strip() for line in listing.stdout.splitlines())
        if name.startswith(name_prefix)
    ]

    # NOTE(review): this also stops containers that belong to another mdify
    # process running at the same time - confirm that is acceptable.
    for container_name in stale_names:
        subprocess.run(
            [self.runtime, "stop", container_name],
            capture_output=True,
            check=False,
        )
76
+
44
77
  def start(self, timeout: int = 120) -> None:
45
78
  """Start container and wait for health check.
46
79
 
@@ -51,6 +84,8 @@ class DoclingContainer:
51
84
  subprocess.CalledProcessError: If container fails to start
52
85
  TimeoutError: If health check doesn't pass within timeout
53
86
  """
87
+ self._cleanup_stale_containers()
88
+
54
89
  # Start container in detached mode
55
90
  cmd = [
56
91
  self.runtime,
@@ -4,6 +4,8 @@ from dataclasses import dataclass
4
4
  from pathlib import Path
5
5
  from typing import Optional
6
6
 
7
+ import mimetypes
8
+
7
9
  import requests
8
10
 
9
11
 
@@ -40,6 +42,12 @@ class DoclingHTTPError(DoclingClientError):
40
42
  super().__init__(f"HTTP {status_code}: {message}")
41
43
 
42
44
 
45
# Explicit MIME types for Office Open XML documents. They are consulted only
# when the ``mimetypes`` lookup yields nothing, which can happen where the
# platform MIME database does not list these extensions.
_FALLBACK_MIME_TYPES = {
    ".docx": (
        "application/vnd.openxmlformats-officedocument"
        ".wordprocessingml.document"
    ),
    ".xlsx": (
        "application/vnd.openxmlformats-officedocument"
        ".spreadsheetml.sheet"
    ),
    ".pptx": (
        "application/vnd.openxmlformats-officedocument"
        ".presentationml.presentation"
    ),
}


def _get_mime_type(file_path: Path) -> str:
    """Return the MIME type to send for *file_path*.

    The type is guessed from the file name only; the file is not opened.

    Args:
        file_path: Path of the file that is about to be uploaded.

    Returns:
        The type reported by ``mimetypes.guess_type`` when it knows the
        extension; otherwise the entry for the lower-cased suffix in
        ``_FALLBACK_MIME_TYPES``; otherwise ``"application/octet-stream"``.
    """
    mime_type, _ = mimetypes.guess_type(str(file_path))
    if mime_type:
        return mime_type

    # The standard lookup found nothing: fall back to the explicit table so
    # the result for common document formats does not depend on the host.
    suffix = Path(file_path).suffix.lower()
    return _FALLBACK_MIME_TYPES.get(suffix, "application/octet-stream")
49
+
50
+
43
51
  def check_health(base_url: str) -> bool:
44
52
  """Check if docling-serve is healthy.
45
53
 
@@ -77,7 +85,7 @@ def convert_file(
77
85
  with open(file_path, "rb") as f:
78
86
  response = requests.post(
79
87
  f"{base_url}/v1/convert/file",
80
- files={"files": (file_path.name, f, "application/pdf")},
88
+ files={"files": (file_path.name, f, _get_mime_type(file_path))},
81
89
  data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
82
90
  )
83
91
 
@@ -126,7 +134,7 @@ def convert_file_async(
126
134
  with open(file_path, "rb") as f:
127
135
  response = requests.post(
128
136
  f"{base_url}/v1/convert/file/async",
129
- files={"files": (file_path.name, f, "application/pdf")},
137
+ files={"files": (file_path.name, f, _get_mime_type(file_path))},
130
138
  data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
131
139
  )
132
140
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mdify-cli
3
- Version: 2.5.0
3
+ Version: 2.7.0
4
4
  Summary: Convert PDFs and document images into structured Markdown for LLM workflows
5
5
  Author: tiroq
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "mdify-cli"
3
- version = "2.5.0"
3
+ version = "2.7.0"
4
4
  description = "Convert PDFs and document images into structured Markdown for LLM workflows"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.8"
@@ -311,7 +311,110 @@ class TestDoclingContainerIntegration:
311
311
  container1 = DoclingContainer("docker", "image1", port=5001)
312
312
  container2 = DoclingContainer("docker", "image2", port=5002)
313
313
 
314
- # Each should be independent
315
314
  assert container1.port == 5001
316
315
  assert container2.port == 5002
317
316
  assert container1.container_name != container2.container_name
317
+
318
+
319
class TestDoclingContainerCleanup:
    """Exercise the stale-container cleanup performed by ``start()``."""

    @staticmethod
    def _start_container(subprocess_results):
        """Start a container with ``subprocess.run`` and the health check mocked.

        ``subprocess_results`` is the ordered list of objects returned by the
        successive ``subprocess.run`` calls. Returns the started container and
        the ``subprocess.run`` mock so callers can inspect the recorded calls.
        """
        with patch("mdify.container.subprocess.run") as mock_run, patch(
            "mdify.container.check_health"
        ) as mock_health:
            mock_run.side_effect = subprocess_results
            mock_health.return_value = True

            container = DoclingContainer("docker", "test-image")
            container.start(timeout=5)

        return container, mock_run

    def test_cleanup_no_stale_containers(self):
        """The runtime is still queried when nothing is left running."""
        listing = Mock(returncode=0, stdout="")
        launched = Mock(stdout="container_id\n")

        _, mock_run = self._start_container([listing, launched])

        # The first subprocess call must be the filtered "ps" query.
        ps_command = mock_run.call_args_list[0][0][0]
        for expected_arg in ("ps", "--filter", "name=mdify-serve-"):
            assert expected_arg in ps_command

    def test_cleanup_stops_stale_containers(self):
        """Every container reported by "ps" gets a "stop" call."""
        listing = Mock(
            returncode=0,
            stdout="mdify-serve-abc123\nmdify-serve-def456\n",
        )
        launched = Mock(stdout="container_id\n")

        # Order of subprocess calls: ps, stop, stop, run.
        _, mock_run = self._start_container([listing, Mock(), Mock(), launched])
        recorded = mock_run.call_args_list

        assert "ps" in recorded[0][0][0]

        stop_calls = [entry for entry in recorded if "stop" in str(entry)]
        assert len(stop_calls) == 2
        assert "mdify-serve-abc123" in str(stop_calls[0])
        assert "mdify-serve-def456" in str(stop_calls[1])

    def test_cleanup_handles_subprocess_error(self):
        """A failing "ps" query does not prevent the container from starting."""
        failed_listing = Mock(returncode=1, stdout="")
        launched = Mock(stdout="container_id\n")

        container, _ = self._start_container([failed_listing, launched])

        assert container.container_id == "container_id"

    def test_start_calls_cleanup(self):
        """``start()`` issues the cleanup query before launching the container."""
        listing = Mock(returncode=0, stdout="")
        launched = Mock(stdout="new_container_id\n")

        _, mock_run = self._start_container([listing, launched])
        recorded = mock_run.call_args_list

        assert any("ps" in str(entry) for entry in recorded[:1])
        assert any("run" in str(entry) for entry in recorded)
        assert "ps" in recorded[0][0][0]
@@ -356,3 +356,97 @@ class TestDoclingHTTPError:
356
356
  error = DoclingHTTPError(400, "Bad Request")
357
357
 
358
358
  assert isinstance(error, Exception)
359
+
360
+
361
class TestMimeTypeDetection:
    """Check the MIME type that ``convert_file`` attaches to the upload."""

    @staticmethod
    def _posted_mime_type(test_file, markdown):
        """Convert *test_file* against a mocked server and return the MIME type.

        ``requests.post`` is patched to answer with HTTP 200 and a single
        result whose content is *markdown*. The helper checks that exactly one
        request was made and returns the third element of the ``files`` tuple
        that was passed to it.
        """
        with patch("mdify.docling_client.requests.post") as mock_post:
            mock_response = Mock()
            mock_response.status_code = 200
            mock_response.json.return_value = [{"content": markdown}]
            mock_post.return_value = mock_response

            convert_file("http://localhost:5001", test_file)

        mock_post.assert_called_once()
        upload = mock_post.call_args[1]["files"]["files"]
        _filename, _file_obj, mime_type = upload
        return mime_type

    def test_convert_file_sends_correct_mime_for_xlsx(self, tmp_path):
        """An .xlsx upload carries the spreadsheet MIME type."""
        test_file = tmp_path / "test.xlsx"
        test_file.write_bytes(b"fake xlsx content")

        mime_type = self._posted_mime_type(
            test_file, "# Test Spreadsheet\n\nContent here."
        )

        assert (
            mime_type
            == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

    def test_convert_file_sends_correct_mime_for_pdf(self, tmp_path):
        """A .pdf upload still carries ``application/pdf`` (regression test)."""
        test_file = tmp_path / "test.pdf"
        test_file.write_bytes(b"fake pdf content")

        mime_type = self._posted_mime_type(
            test_file, "# Test Document\n\nContent here."
        )

        assert mime_type == "application/pdf"

    def test_convert_file_sends_correct_mime_for_docx(self, tmp_path):
        """A .docx upload carries the word-processing MIME type."""
        test_file = tmp_path / "test.docx"
        test_file.write_bytes(b"fake docx content")

        mime_type = self._posted_mime_type(
            test_file, "# Test Document\n\nContent here."
        )

        assert (
            mime_type
            == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        )

    def test_convert_file_fallback_for_unknown_extension(self, tmp_path):
        """An unrecognised extension falls back to ``application/octet-stream``."""
        test_file = tmp_path / "test.unknownext123"
        test_file.write_bytes(b"fake unknown content")

        mime_type = self._posted_mime_type(
            test_file, "# Test Content\n\nContent here."
        )

        assert mime_type == "application/octet-stream"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes