mdify-cli 1.5.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdify/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """mdify - Convert documents to Markdown via Docling container."""
2
2
 
3
- __version__ = "1.5.0"
3
+ __version__ = "2.0.0"
mdify/cli.py CHANGED
@@ -21,6 +21,8 @@ from urllib.error import URLError
21
21
  from urllib.request import urlopen
22
22
 
23
23
  from . import __version__
24
+ from mdify.container import DoclingContainer
25
+ from mdify.docling_client import convert_file
24
26
 
25
27
  # Configuration
26
28
  MDIFY_HOME = Path.home() / ".mdify"
@@ -29,7 +31,8 @@ PYPI_API_URL = "https://pypi.org/pypi/mdify-cli/json"
29
31
  CHECK_INTERVAL_SECONDS = 86400 # 24 hours
30
32
 
31
33
  # Container configuration
32
- DEFAULT_IMAGE = "ghcr.io/tiroq/mdify-runtime:latest"
34
+ DEFAULT_IMAGE = "ghcr.io/docling-project/docling-serve-cpu:main"
35
+ GPU_IMAGE = "ghcr.io/docling-project/docling-serve-cu126:main"
33
36
  SUPPORTED_RUNTIMES = ("docker", "podman")
34
37
 
35
38
 
@@ -37,10 +40,11 @@ SUPPORTED_RUNTIMES = ("docker", "podman")
37
40
  # Update checking functions
38
41
  # =============================================================================
39
42
 
43
+
40
44
  def _get_remote_version(timeout: int = 5) -> Optional[str]:
41
45
  """
42
46
  Fetch the latest version from PyPI.
43
-
47
+
44
48
  Returns:
45
49
  Version string (e.g., "1.1.0") or None if fetch failed.
46
50
  """
@@ -56,16 +60,16 @@ def _get_remote_version(timeout: int = 5) -> Optional[str]:
56
60
  def _should_check_for_update() -> bool:
57
61
  """
58
62
  Determine if we should check for updates based on last check time.
59
-
63
+
60
64
  Returns:
61
65
  True if check should be performed, False otherwise.
62
66
  """
63
67
  if os.environ.get("MDIFY_NO_UPDATE_CHECK", "").lower() in ("1", "true", "yes"):
64
68
  return False
65
-
69
+
66
70
  if not LAST_CHECK_FILE.exists():
67
71
  return True
68
-
72
+
69
73
  try:
70
74
  last_check = float(LAST_CHECK_FILE.read_text().strip())
71
75
  elapsed = time.time() - last_check
@@ -86,18 +90,18 @@ def _update_last_check_time() -> None:
86
90
  def _compare_versions(current: str, remote: str) -> bool:
87
91
  """
88
92
  Compare version strings.
89
-
93
+
90
94
  Returns:
91
95
  True if remote version is newer than current.
92
96
  """
93
97
  try:
94
98
  current_parts = [int(x) for x in current.split(".")]
95
99
  remote_parts = [int(x) for x in remote.split(".")]
96
-
100
+
97
101
  max_len = max(len(current_parts), len(remote_parts))
98
102
  current_parts.extend([0] * (max_len - len(current_parts)))
99
103
  remote_parts.extend([0] * (max_len - len(remote_parts)))
100
-
104
+
101
105
  return remote_parts > current_parts
102
106
  except (ValueError, AttributeError):
103
107
  return False
@@ -106,15 +110,15 @@ def _compare_versions(current: str, remote: str) -> bool:
106
110
  def check_for_update(force: bool = False) -> None:
107
111
  """
108
112
  Check for updates and prompt user to upgrade if available.
109
-
113
+
110
114
  Args:
111
115
  force: If True, check regardless of last check time and show errors.
112
116
  """
113
117
  if not force and not _should_check_for_update():
114
118
  return
115
-
119
+
116
120
  remote_version = _get_remote_version()
117
-
121
+
118
122
  if remote_version is None:
119
123
  if force:
120
124
  print(
@@ -124,19 +128,19 @@ def check_for_update(force: bool = False) -> None:
124
128
  )
125
129
  sys.exit(1)
126
130
  return
127
-
131
+
128
132
  _update_last_check_time()
129
-
133
+
130
134
  if not _compare_versions(__version__, remote_version):
131
135
  if force:
132
136
  print(f"mdify is up to date (version {__version__})")
133
137
  return
134
-
135
- print(f"\n{'='*50}")
138
+
139
+ print(f"\n{'=' * 50}")
136
140
  print(f"A new version of mdify-cli is available!")
137
141
  print(f" Current version: {__version__}")
138
142
  print(f" Latest version: {remote_version}")
139
- print(f"{'='*50}")
143
+ print(f"{'=' * 50}")
140
144
  print(f"\nTo upgrade, run:")
141
145
  print(f" pipx upgrade mdify-cli")
142
146
  print(f" # or: pip install --upgrade mdify-cli\n")
@@ -146,13 +150,18 @@ def check_for_update(force: bool = False) -> None:
146
150
  # Container runtime functions
147
151
  # =============================================================================
148
152
 
149
- def detect_runtime(preferred: str) -> Optional[str]:
153
+
154
+ def detect_runtime(preferred: str, explicit: bool = True) -> Optional[str]:
150
155
  """
151
156
  Detect available container runtime.
152
-
157
+
153
158
  Args:
154
159
  preferred: Preferred runtime ('docker' or 'podman')
155
-
160
+ explicit: If True, warn when falling back to alternative.
161
+ If False, silently use alternative without warning.
162
+ Note: This only controls warning emission; selection order
163
+ is always preferred → alternative regardless of this flag.
164
+
156
165
  Returns:
157
166
  Path to runtime executable, or None if not found.
158
167
  """
@@ -160,25 +169,28 @@ def detect_runtime(preferred: str) -> Optional[str]:
160
169
  runtime_path = shutil.which(preferred)
161
170
  if runtime_path:
162
171
  return runtime_path
163
-
172
+
164
173
  # Try alternative
165
174
  alternative = "podman" if preferred == "docker" else "docker"
166
175
  runtime_path = shutil.which(alternative)
167
176
  if runtime_path:
168
- print(f"Warning: {preferred} not found, using {alternative}", file=sys.stderr)
177
+ if explicit:
178
+ print(
179
+ f"Warning: {preferred} not found, using {alternative}", file=sys.stderr
180
+ )
169
181
  return runtime_path
170
-
182
+
171
183
  return None
172
184
 
173
185
 
174
186
  def check_image_exists(runtime: str, image: str) -> bool:
175
187
  """
176
188
  Check if container image exists locally.
177
-
189
+
178
190
  Args:
179
191
  runtime: Path to container runtime
180
192
  image: Image name/tag
181
-
193
+
182
194
  Returns:
183
195
  True if image exists locally.
184
196
  """
@@ -196,18 +208,18 @@ def check_image_exists(runtime: str, image: str) -> bool:
196
208
  def pull_image(runtime: str, image: str, quiet: bool = False) -> bool:
197
209
  """
198
210
  Pull container image.
199
-
211
+
200
212
  Args:
201
213
  runtime: Path to container runtime
202
214
  image: Image name/tag
203
215
  quiet: Suppress progress output
204
-
216
+
205
217
  Returns:
206
218
  True if pull succeeded.
207
219
  """
208
220
  if not quiet:
209
221
  print(f"Pulling image: {image}")
210
-
222
+
211
223
  try:
212
224
  result = subprocess.run(
213
225
  [runtime, "pull", image],
@@ -222,9 +234,9 @@ def pull_image(runtime: str, image: str, quiet: bool = False) -> bool:
222
234
 
223
235
  def format_size(size_bytes: int) -> str:
224
236
  """Format file size in human-readable format."""
225
- for unit in ['B', 'KB', 'MB', 'GB']:
237
+ for unit in ["B", "KB", "MB", "GB"]:
226
238
  if size_bytes < 1024:
227
- return f"{size_bytes:.1f} {unit}" if unit != 'B' else f"{size_bytes} {unit}"
239
+ return f"{size_bytes:.1f} {unit}" if unit != "B" else f"{size_bytes} {unit}"
228
240
  size_bytes /= 1024
229
241
  return f"{size_bytes:.1f} TB"
230
242
 
@@ -244,29 +256,33 @@ def format_duration(seconds: float) -> str:
244
256
 
245
257
  class Spinner:
246
258
  """A simple spinner to show progress during long operations."""
247
-
259
+
248
260
  def __init__(self):
249
- self.frames = ['', '', '', '', '', '', '', '', '', '']
261
+ self.frames = ["", "", "", "", "", "", "", "", "", ""]
250
262
  self.running = False
251
263
  self.thread = None
252
264
  self.start_time = None
253
-
265
+
254
266
  def _spin(self):
255
267
  idx = 0
256
268
  while self.running:
257
269
  elapsed = time.time() - self.start_time
258
270
  frame = self.frames[idx % len(self.frames)]
259
- print(f"\r{self.prefix} {frame} ({format_duration(elapsed)})", end="", flush=True)
271
+ print(
272
+ f"\r{self.prefix} {frame} ({format_duration(elapsed)})",
273
+ end="",
274
+ flush=True,
275
+ )
260
276
  idx += 1
261
277
  time.sleep(0.1)
262
-
278
+
263
279
  def start(self, prefix: str = ""):
264
280
  self.prefix = prefix
265
281
  self.running = True
266
282
  self.start_time = time.time()
267
283
  self.thread = threading.Thread(target=self._spin, daemon=True)
268
284
  self.thread.start()
269
-
285
+
270
286
  def stop(self):
271
287
  self.running = False
272
288
  if self.thread:
@@ -275,93 +291,45 @@ class Spinner:
275
291
  print(f"\r{' ' * 80}\r", end="", flush=True)
276
292
 
277
293
 
278
- def run_container(
279
- runtime: str,
280
- image: str,
281
- input_file: Path,
282
- output_file: Path,
283
- mask_pii: bool = False,
284
- ) -> Tuple[bool, str, float]:
285
- """
286
- Run container to convert a single file.
287
-
288
- Args:
289
- runtime: Path to container runtime
290
- image: Image name/tag
291
- input_file: Absolute path to input file
292
- output_file: Absolute path to output file
293
- mask_pii: Whether to mask PII in images
294
-
295
- Returns:
296
- Tuple of (success: bool, message: str, elapsed_seconds: float)
297
- """
298
- start_time = time.time()
299
-
300
- # Ensure output directory exists
301
- output_file.parent.mkdir(parents=True, exist_ok=True)
302
-
303
- # Mount directories
304
- input_dir = input_file.parent
305
- output_dir = output_file.parent
306
-
307
- # Container paths
308
- container_in = f"/work/in/{input_file.name}"
309
- container_out = f"/work/out/{output_file.name}"
310
-
311
- cmd = [
312
- runtime, "run", "--rm",
313
- "-v", f"{input_dir}:/work/in:ro",
314
- "-v", f"{output_dir}:/work/out",
315
- image,
316
- "--in", container_in,
317
- "--out", container_out,
318
- ]
319
-
320
- if mask_pii:
321
- cmd.append("--mask")
322
-
323
- try:
324
- result = subprocess.run(
325
- cmd,
326
- capture_output=True,
327
- text=True,
328
- check=False,
329
- )
330
- elapsed = time.time() - start_time
331
-
332
- if result.returncode == 0:
333
- return True, "success", elapsed
334
- else:
335
- error_msg = result.stderr.strip() or result.stdout.strip() or "Unknown error"
336
- return False, error_msg, elapsed
337
-
338
- except OSError as e:
339
- elapsed = time.time() - start_time
340
- return False, str(e), elapsed
341
-
342
-
343
294
  # =============================================================================
344
295
  # File handling functions
345
296
  # =============================================================================
346
297
 
347
298
  # Supported file extensions (based on Docling InputFormat)
348
299
  SUPPORTED_EXTENSIONS = {
349
- '.pdf', '.docx', '.pptx', '.html', '.htm',
350
- '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.tif', # images
351
- '.asciidoc', '.adoc', '.asc', # asciidoc
352
- '.md', '.markdown', # markdown
353
- '.csv', '.xlsx', # spreadsheets
354
- '.xml', # XML formats
355
- '.json', # JSON docling
356
- '.mp3', '.wav', '.m4a', '.flac', # audio
357
- '.vtt', # subtitles
300
+ ".pdf",
301
+ ".docx",
302
+ ".pptx",
303
+ ".html",
304
+ ".htm",
305
+ ".png",
306
+ ".jpg",
307
+ ".jpeg",
308
+ ".gif",
309
+ ".bmp",
310
+ ".tiff",
311
+ ".tif", # images
312
+ ".asciidoc",
313
+ ".adoc",
314
+ ".asc", # asciidoc
315
+ ".md",
316
+ ".markdown", # markdown
317
+ ".csv",
318
+ ".xlsx", # spreadsheets
319
+ ".xml", # XML formats
320
+ ".json", # JSON docling
321
+ ".mp3",
322
+ ".wav",
323
+ ".m4a",
324
+ ".flac", # audio
325
+ ".vtt", # subtitles
358
326
  }
359
327
 
360
328
 
361
329
  def get_files_to_convert(input_path: Path, mask: str, recursive: bool) -> List[Path]:
362
330
  """Get list of files to convert based on input path and options."""
363
331
  files = []
364
-
332
+
365
333
  if input_path.is_file():
366
334
  files.append(input_path)
367
335
  elif input_path.is_dir():
@@ -369,19 +337,19 @@ def get_files_to_convert(input_path: Path, mask: str, recursive: bool) -> List[P
369
337
  files = list(input_path.rglob(mask))
370
338
  else:
371
339
  files = list(input_path.glob(mask))
372
-
340
+
373
341
  # Filter to only files
374
342
  files = [f for f in files if f.is_file()]
375
343
  else:
376
344
  raise FileNotFoundError(f"Input path does not exist: {input_path}")
377
-
345
+
378
346
  # Filter out hidden files and unsupported formats
379
347
  files = [
380
- f for f in files
381
- if not f.name.startswith('.')
382
- and f.suffix.lower() in SUPPORTED_EXTENSIONS
348
+ f
349
+ for f in files
350
+ if not f.name.startswith(".") and f.suffix.lower() in SUPPORTED_EXTENSIONS
383
351
  ]
384
-
352
+
385
353
  return files
386
354
 
387
355
 
@@ -414,7 +382,7 @@ def get_output_path(
414
382
  output_path = output_dir / relative_path.parent / output_name
415
383
  except ValueError:
416
384
  output_path = output_dir / output_name
417
-
385
+
418
386
  return output_path
419
387
 
420
388
 
@@ -422,6 +390,7 @@ def get_output_path(
422
390
  # CLI argument parsing
423
391
  # =============================================================================
424
392
 
393
+
425
394
  def parse_args() -> argparse.Namespace:
426
395
  """Parse command line arguments."""
427
396
  parser = argparse.ArgumentParser(
@@ -436,74 +405,92 @@ Examples:
436
405
  mdify ./docs --runtime podman Use Podman instead of Docker
437
406
  """,
438
407
  )
439
-
408
+
440
409
  parser.add_argument(
441
410
  "input",
442
411
  type=str,
443
412
  nargs="?",
444
413
  help="Input file or directory to convert",
445
414
  )
446
-
415
+
447
416
  parser.add_argument(
448
- "-o", "--out-dir",
417
+ "-o",
418
+ "--out-dir",
449
419
  type=str,
450
420
  default="output",
451
421
  help="Output directory for converted files (default: output)",
452
422
  )
453
-
423
+
454
424
  parser.add_argument(
455
- "-g", "--glob",
425
+ "-g",
426
+ "--glob",
456
427
  type=str,
457
428
  default="*",
458
429
  help="Glob pattern for filtering files in directory (default: *)",
459
430
  )
460
-
431
+
461
432
  parser.add_argument(
462
- "-r", "--recursive",
433
+ "-r",
434
+ "--recursive",
463
435
  action="store_true",
464
436
  help="Recursively scan directories",
465
437
  )
466
-
438
+
467
439
  parser.add_argument(
468
440
  "--flat",
469
441
  action="store_true",
470
442
  help="Disable directory structure preservation in output",
471
443
  )
472
-
444
+
473
445
  parser.add_argument(
474
446
  "--overwrite",
475
447
  action="store_true",
476
448
  help="Overwrite existing output files",
477
449
  )
478
-
450
+
479
451
  parser.add_argument(
480
- "-q", "--quiet",
452
+ "-q",
453
+ "--quiet",
481
454
  action="store_true",
482
455
  help="Suppress progress messages",
483
456
  )
484
-
457
+
485
458
  parser.add_argument(
486
- "-m", "--mask",
459
+ "-m",
460
+ "--mask",
487
461
  action="store_true",
488
462
  help="Mask PII and sensitive content in document images",
489
463
  )
490
-
464
+
465
+ parser.add_argument(
466
+ "--gpu",
467
+ action="store_true",
468
+ help="Use GPU-accelerated container image (docling-serve-cu126)",
469
+ )
470
+
471
+ parser.add_argument(
472
+ "--port",
473
+ type=int,
474
+ default=5001,
475
+ help="Port for docling-serve container (default: 5001)",
476
+ )
477
+
491
478
  # Container options
492
479
  parser.add_argument(
493
480
  "--runtime",
494
481
  type=str,
495
482
  choices=SUPPORTED_RUNTIMES,
496
- default="docker",
497
- help="Container runtime to use (default: docker)",
483
+ default=None,
484
+ help="Container runtime to use (auto-detects docker or podman if not specified)",
498
485
  )
499
-
486
+
500
487
  parser.add_argument(
501
488
  "--image",
502
489
  type=str,
503
490
  default=DEFAULT_IMAGE,
504
491
  help=f"Container image to use (default: {DEFAULT_IMAGE})",
505
492
  )
506
-
493
+
507
494
  parser.add_argument(
508
495
  "--pull",
509
496
  type=str,
@@ -511,20 +498,20 @@ Examples:
511
498
  default="missing",
512
499
  help="Image pull policy: always, missing, never (default: missing)",
513
500
  )
514
-
501
+
515
502
  # Utility options
516
503
  parser.add_argument(
517
504
  "--check-update",
518
505
  action="store_true",
519
506
  help="Check for available updates and exit",
520
507
  )
521
-
508
+
522
509
  parser.add_argument(
523
510
  "--version",
524
511
  action="version",
525
512
  version=f"mdify {__version__}",
526
513
  )
527
-
514
+
528
515
  return parser.parse_args()
529
516
 
530
517
 
@@ -532,27 +519,30 @@ Examples:
532
519
  # Main entry point
533
520
  # =============================================================================
534
521
 
522
+
535
523
  def main() -> int:
536
524
  """Main entry point for the CLI."""
537
525
  args = parse_args()
538
-
526
+
539
527
  # Handle --check-update flag
540
528
  if args.check_update:
541
529
  check_for_update(force=True)
542
530
  return 0
543
-
531
+
544
532
  # Check for updates (daily, silent on errors)
545
533
  check_for_update(force=False)
546
-
534
+
547
535
  # Validate input is provided
548
536
  if args.input is None:
549
537
  print("Error: Input file or directory is required", file=sys.stderr)
550
538
  print("Usage: mdify <input> [options]", file=sys.stderr)
551
539
  print(" mdify --help for more information", file=sys.stderr)
552
540
  return 1
553
-
541
+
554
542
  # Detect container runtime
555
- runtime = detect_runtime(args.runtime)
543
+ preferred = args.runtime if args.runtime else "docker"
544
+ explicit = args.runtime is not None
545
+ runtime = detect_runtime(preferred, explicit=explicit)
556
546
  if runtime is None:
557
547
  print(
558
548
  f"Error: Container runtime not found ({', '.join(SUPPORTED_RUNTIMES)})",
@@ -560,109 +550,166 @@ def main() -> int:
560
550
  )
561
551
  print("Please install Docker or Podman to use mdify.", file=sys.stderr)
562
552
  return 2
563
-
553
+
564
554
  # Handle image pull policy
565
- image = args.image
555
+ # Determine image based on --gpu flag
556
+ if args.gpu:
557
+ image = GPU_IMAGE
558
+ elif args.image:
559
+ image = args.image
560
+ else:
561
+ image = DEFAULT_IMAGE
562
+
566
563
  image_exists = check_image_exists(runtime, image)
567
-
564
+
568
565
  if args.pull == "always" or (args.pull == "missing" and not image_exists):
569
566
  if not pull_image(runtime, image, args.quiet):
570
567
  print(f"Error: Failed to pull image: {image}", file=sys.stderr)
571
568
  return 1
572
569
  elif args.pull == "never" and not image_exists:
573
570
  print(f"Error: Image not found locally: {image}", file=sys.stderr)
574
- print(f"Run with --pull=missing or pull manually: {args.runtime} pull {image}")
571
+ print(f"Run with --pull=missing or pull manually: {preferred} pull {image}")
575
572
  return 1
576
-
573
+
577
574
  # Resolve paths
578
575
  input_path = Path(args.input).resolve()
579
576
  output_dir = Path(args.out_dir).resolve()
580
-
577
+
581
578
  # Validate input
582
579
  if not input_path.exists():
583
580
  print(f"Error: Input path does not exist: {input_path}", file=sys.stderr)
584
581
  return 1
585
-
582
+
586
583
  # Get files to convert
587
584
  try:
588
585
  files_to_convert = get_files_to_convert(input_path, args.glob, args.recursive)
589
586
  except Exception as e:
590
587
  print(f"Error: {e}", file=sys.stderr)
591
588
  return 1
592
-
589
+
593
590
  if not files_to_convert:
594
591
  print(f"No files found to convert in: {input_path}", file=sys.stderr)
595
592
  return 1
596
-
593
+
597
594
  total_files = len(files_to_convert)
598
595
  total_size = sum(f.stat().st_size for f in files_to_convert)
599
-
596
+
600
597
  if not args.quiet:
601
598
  print(f"Found {total_files} file(s) to convert ({format_size(total_size)})")
602
599
  print(f"Using runtime: {runtime}")
603
600
  print(f"Using image: {image}")
604
601
  print()
605
-
602
+
603
+ if args.mask:
604
+ print(
605
+ "Warning: --mask is not supported with docling-serve and will be ignored",
606
+ file=sys.stderr,
607
+ )
608
+
606
609
  # Determine input base for directory structure preservation
607
610
  if input_path.is_file():
608
611
  input_base = input_path.parent
609
612
  else:
610
613
  input_base = input_path
611
-
612
- # Convert files
614
+
613
615
  success_count = 0
614
616
  skipped_count = 0
615
617
  failed_count = 0
616
- conversion_start = time.time()
617
- spinner = Spinner()
618
-
619
- for idx, input_file in enumerate(files_to_convert, 1):
620
- output_file = get_output_path(input_file, input_base, output_dir, args.flat)
621
- file_size = input_file.stat().st_size
622
- progress = f"[{idx}/{total_files}]"
623
-
624
- # Check if output exists and skip if not overwriting
625
- if output_file.exists() and not args.overwrite:
626
- if not args.quiet:
627
- print(f"{progress} Skipped (exists): {input_file.name}")
628
- skipped_count += 1
629
- continue
630
-
631
- # Show spinner while processing
618
+ total_elapsed = 0.0
619
+
620
+ try:
632
621
  if not args.quiet:
633
- spinner.start(f"{progress} Processing: {input_file.name} ({format_size(file_size)})")
634
-
635
- success, result, elapsed = run_container(
636
- runtime, image, input_file, output_file, args.mask
637
- )
638
-
622
+ print(f"Starting docling-serve container...")
623
+ print()
624
+
625
+ with DoclingContainer(runtime, image, args.port) as container:
626
+ # Convert files
627
+ conversion_start = time.time()
628
+ spinner = Spinner()
629
+
630
+ for idx, input_file in enumerate(files_to_convert, 1):
631
+ output_file = get_output_path(
632
+ input_file, input_base, output_dir, args.flat
633
+ )
634
+ file_size = input_file.stat().st_size
635
+ progress = f"[{idx}/{total_files}]"
636
+
637
+ # Check if output exists and skip if not overwriting
638
+ if output_file.exists() and not args.overwrite:
639
+ if not args.quiet:
640
+ print(f"{progress} Skipped (exists): {input_file.name}")
641
+ skipped_count += 1
642
+ continue
643
+
644
+ # Ensure output directory exists
645
+ output_file.parent.mkdir(parents=True, exist_ok=True)
646
+
647
+ # Show spinner while processing
648
+ if not args.quiet:
649
+ spinner.start(
650
+ f"{progress} Processing: {input_file.name} ({format_size(file_size)})"
651
+ )
652
+
653
+ start_time = time.time()
654
+ try:
655
+ # Convert via HTTP API
656
+ result = convert_file(
657
+ container.base_url, input_file, to_format="md"
658
+ )
659
+ elapsed = time.time() - start_time
660
+
661
+ if not args.quiet:
662
+ spinner.stop()
663
+
664
+ if result.success:
665
+ # Write result to output file
666
+ output_file.write_text(result.content)
667
+ success_count += 1
668
+ if not args.quiet:
669
+ print(
670
+ f"{progress} {input_file.name} ✓ ({format_duration(elapsed)})"
671
+ )
672
+ else:
673
+ failed_count += 1
674
+ error_msg = result.error or "Unknown error"
675
+ if not args.quiet:
676
+ print(
677
+ f"{progress} {input_file.name} ✗ ({format_duration(elapsed)})"
678
+ )
679
+ print(f" Error: {error_msg}", file=sys.stderr)
680
+ except Exception as e:
681
+ elapsed = time.time() - start_time
682
+ failed_count += 1
683
+ if not args.quiet:
684
+ spinner.stop()
685
+ print(
686
+ f"{progress} {input_file.name} ✗ ({format_duration(elapsed)})"
687
+ )
688
+ print(f" Error: {str(e)}", file=sys.stderr)
689
+
690
+ total_elapsed = time.time() - conversion_start
691
+
692
+ # Print summary
639
693
  if not args.quiet:
640
- spinner.stop()
641
-
642
- if success:
643
- success_count += 1
644
- if not args.quiet:
645
- print(f"{progress} {input_file.name} ✓ ({format_duration(elapsed)})")
646
- else:
647
- failed_count += 1
648
- if not args.quiet:
649
- print(f"{progress} {input_file.name} ✗ ({format_duration(elapsed)})")
650
- print(f" Error: {result}", file=sys.stderr)
651
-
652
- total_elapsed = time.time() - conversion_start
653
-
654
- # Print summary
655
- if not args.quiet:
656
- print()
657
- print("=" * 50)
658
- print("Conversion Summary:")
659
- print(f" Total files: {total_files}")
660
- print(f" Successful: {success_count}")
661
- print(f" Skipped: {skipped_count}")
662
- print(f" Failed: {failed_count}")
663
- print(f" Total time: {format_duration(total_elapsed)}")
664
- print("=" * 50)
665
-
694
+ print()
695
+ print("=" * 50)
696
+ print("Conversion Summary:")
697
+ print(f" Total files: {total_files}")
698
+ print(f" Successful: {success_count}")
699
+ print(f" Skipped: {skipped_count}")
700
+ print(f" Failed: {failed_count}")
701
+ print(f" Total time: {format_duration(total_elapsed)}")
702
+ print("=" * 50)
703
+
704
+ except KeyboardInterrupt:
705
+ if not args.quiet:
706
+ print("\n\nInterrupted by user. Container stopped.")
707
+ if success_count > 0 or skipped_count > 0 or failed_count > 0:
708
+ print(
709
+ f"Partial progress: {success_count} successful, {failed_count} failed, {skipped_count} skipped"
710
+ )
711
+ return 130
712
+
666
713
  # Return appropriate exit code
667
714
  if failed_count > 0:
668
715
  return 1
mdify/container.py ADDED
@@ -0,0 +1,128 @@
1
+ """Container lifecycle management for docling-serve."""
2
+
3
+ import subprocess
4
+ import time
5
+ import uuid
6
+ from typing import Optional
7
+
8
+ from mdify.docling_client import check_health
9
+
10
+
11
class DoclingContainer:
    """Manage the lifecycle of a docling-serve container.

    The container is started detached with ``--rm`` under a unique name and
    is stopped by name. Context-manager support gives automatic startup and
    cleanup.

    Usage:
        with DoclingContainer("docker", "ghcr.io/docling-project/docling-serve-cpu:main") as container:
            # Container is running and healthy
            response = requests.post(f"{container.base_url}/v1/convert/file", ...)
        # Container automatically stopped and removed
    """

    def __init__(self, runtime: str, image: str, port: int = 5001):
        """Initialize container manager (does not start anything).

        Args:
            runtime: Container runtime executable ("docker" or "podman",
                or a path to one); used as argv[0] of every command.
            image: Container image to use
            port: Host port to bind (default: 5001)
        """
        self.runtime = runtime
        self.image = image
        self.port = port
        # Random suffix so several instances can coexist without name clashes.
        self.container_name = f"mdify-serve-{uuid.uuid4().hex[:8]}"
        self.container_id: Optional[str] = None

    @property
    def base_url(self) -> str:
        """Return base URL for API requests."""
        return f"http://localhost:{self.port}"

    def start(self, timeout: int = 120) -> None:
        """Start container and wait for health check.

        If the health check fails, times out, or is interrupted, the
        container is stopped before the exception propagates. Without this,
        ``__enter__`` raising would skip ``__exit__`` and leave a detached
        container running.

        Args:
            timeout: Maximum seconds to wait for health (default: 120)

        Raises:
            subprocess.CalledProcessError: If container fails to start
            TimeoutError: If health check doesn't pass within timeout
        """
        cmd = [
            self.runtime,
            "run",
            "-d",  # Detached mode
            "--rm",  # Auto-remove on stop
            "--name",
            self.container_name,
            "-p",
            f"{self.port}:5001",  # host port -> container port 5001
            self.image,
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as e:
            error_msg = (
                (e.stderr or "").strip() or (e.stdout or "").strip() or "Unknown error"
            )
            # Re-raise the same exception type with a readable stderr,
            # keeping the original as the cause.
            raise subprocess.CalledProcessError(
                e.returncode,
                e.cmd,
                output=e.stdout,
                stderr=f"Failed to start container: {error_msg}",
            ) from e
        self.container_id = result.stdout.strip()

        try:
            self._wait_for_health(timeout)
        except BaseException:
            # Includes KeyboardInterrupt: never leave the detached container
            # behind when startup does not complete.
            self.stop()
            raise

    def stop(self) -> None:
        """Stop and remove container. Safe to call multiple times."""
        if self.container_name:
            # check=False: stopping an already-gone container is not an error.
            subprocess.run(
                [self.runtime, "stop", self.container_name],
                capture_output=True,
                check=False,
            )

    def is_ready(self) -> bool:
        """Check if container is healthy.

        Returns:
            True if container is healthy, False otherwise (including when
            the health probe itself raises).
        """
        try:
            return check_health(self.base_url)
        except Exception:
            return False

    def _wait_for_health(self, timeout: int) -> None:
        """Poll health endpoint until ready.

        Args:
            timeout: Maximum seconds to wait

        Raises:
            TimeoutError: If health check doesn't pass within timeout
        """
        # Monotonic clock: the deadline is immune to wall-clock adjustments.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            if self.is_ready():
                return
            # Poll every 2 seconds, but never sleep past the deadline.
            time.sleep(min(2.0, max(0.0, deadline - time.monotonic())))

        raise TimeoutError(f"Container failed to become healthy within {timeout}s")

    def __enter__(self):
        """Context manager entry: start the container and return self."""
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit - ensures cleanup; never suppresses exceptions."""
        self.stop()
        return False
@@ -0,0 +1,224 @@
1
+ """HTTP client for docling-serve REST API."""
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ import requests
8
+
9
+
10
@dataclass
class ConvertResult:
    """Outcome of one document conversion request."""

    # Converted document text; the client functions use "" on failure.
    content: str
    # Output format identifier (for example "md").
    format: str
    # True when the conversion produced a result, False on request failure.
    success: bool
    # Error description when success is False; None otherwise.
    error: Optional[str] = None
18
+
19
+
20
@dataclass
class StatusResult:
    """Snapshot of an asynchronous conversion task's state."""

    # Task state string, e.g. "pending", "completed", "failed".
    status: str
    # Identifier of the task this snapshot describes.
    task_id: str
    # Error text reported for the task, if any; None otherwise.
    error: Optional[str] = None
27
+
28
+
29
class DoclingClientError(Exception):
    """Root of the docling client exception hierarchy."""


class DoclingHTTPError(DoclingClientError):
    """Raised when a docling-serve API call fails at the HTTP level.

    The numeric status is kept on ``status_code``; the exception message is
    rendered as ``HTTP <status_code>: <message>``.
    """

    def __init__(self, status_code: int, message: str):
        rendered = "HTTP {}: {}".format(status_code, message)
        super().__init__(rendered)
        self.status_code = status_code
41
+
42
+
43
def check_health(base_url: str, timeout: float = 5.0) -> bool:
    """Check if docling-serve is healthy.

    Issues ``GET {base_url}/health`` and treats HTTP 200 as healthy.

    Args:
        base_url: Base URL of docling-serve (e.g., "http://localhost:5001")
        timeout: Seconds to wait for the server before giving up
            (default: 5.0). Without a timeout, ``requests`` can block
            indefinitely on a stalled connection, which would defeat any
            polling deadline enforced by the caller.

    Returns:
        True if healthy, False otherwise (including connection errors and
        timeouts)
    """
    try:
        response = requests.get(f"{base_url}/health", timeout=timeout)
    except requests.RequestException:
        return False
    return response.status_code == 200
57
+
58
+
59
def _content_from_payload(payload: dict, to_format: str) -> str:
    """Extract the converted text for ``to_format`` from one result object."""
    import json

    candidates = []
    # NOTE(review): docling-serve v1 appears to nest output under
    # "document" as "<format>_content" (e.g. "md_content") - confirm against
    # the deployed server version. The flat "content" key is kept as a
    # fallback so previously working responses still parse.
    document = payload.get("document")
    if isinstance(document, dict):
        candidates.append(document.get(f"{to_format}_content"))
    candidates.append(payload.get(f"{to_format}_content"))
    candidates.append(payload.get("content"))

    for candidate in candidates:
        if candidate is None:
            continue
        if isinstance(candidate, str):
            return candidate
        # Structured output (e.g. a JSON export) is serialized to text so
        # the result still satisfies ``ConvertResult.content: str``.
        return json.dumps(candidate, ensure_ascii=False)
    return ""


def convert_file(
    base_url: str, file_path: Path, to_format: str = "md", do_ocr: bool = True
) -> ConvertResult:
    """Convert a file synchronously.

    Uploads the file to ``{base_url}/v1/convert/file`` as multipart form
    data and parses the JSON response.

    Args:
        base_url: Base URL of docling-serve
        file_path: Path to file to convert
        to_format: Output format (default: "md")
        do_ocr: Whether to perform OCR (default: True)

    Returns:
        ConvertResult with conversion output. Transport-level failures
        (``requests.RequestException``) are reported as a result with
        ``success=False`` rather than raised.

    Raises:
        DoclingHTTPError: If the server answers with a non-200 status or an
            unexpected response shape
    """
    import mimetypes

    # Label the upload with the file's real type instead of always claiming
    # PDF; fall back to a generic binary type when it cannot be guessed.
    mime_type = mimetypes.guess_type(file_path.name)[0] or "application/octet-stream"

    try:
        with open(file_path, "rb") as f:
            response = requests.post(
                f"{base_url}/v1/convert/file",
                files={"files": (file_path.name, f, mime_type)},
                data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
            )

        if response.status_code != 200:
            raise DoclingHTTPError(
                response.status_code, response.text or "Conversion failed"
            )

        result_data = response.json()

        # Accept either a list of results (first entry used) or one object.
        if isinstance(result_data, list) and result_data:
            payload = result_data[0]
        elif isinstance(result_data, dict):
            payload = result_data
        else:
            raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")

        if not isinstance(payload, dict):
            raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")

        return ConvertResult(
            content=_content_from_payload(payload, to_format),
            format=to_format,
            success=True,
        )

    except requests.RequestException as e:
        return ConvertResult(content="", format=to_format, success=False, error=str(e))
106
+
107
+
108
def convert_file_async(
    base_url: str, file_path: Path, to_format: str = "md", do_ocr: bool = True
) -> str:
    """Start async file conversion.

    Uploads the file to ``{base_url}/v1/convert/file/async`` and returns the
    task identifier from the JSON response.

    Args:
        base_url: Base URL of docling-serve
        file_path: Path to file to convert
        to_format: Output format (default: "md")
        do_ocr: Whether to perform OCR (default: True)

    Returns:
        Task ID for polling

    Raises:
        DoclingHTTPError: If the request fails, the server answers with a
            non-200 status, or the response carries no ``task_id``.
            Transport failures are reported with status code 500, which is a
            client-side placeholder and not a status sent by the server.
    """
    import mimetypes

    # Label the upload with the file's real type instead of always claiming
    # PDF; fall back to a generic binary type when it cannot be guessed.
    mime_type = mimetypes.guess_type(file_path.name)[0] or "application/octet-stream"

    try:
        with open(file_path, "rb") as f:
            response = requests.post(
                f"{base_url}/v1/convert/file/async",
                files={"files": (file_path.name, f, mime_type)},
                data={"to_formats": to_format, "do_ocr": str(do_ocr).lower()},
            )

        if response.status_code != 200:
            raise DoclingHTTPError(
                response.status_code, response.text or "Async conversion failed"
            )

        result_data = response.json()
        # A non-object body has no task_id; report it as such instead of
        # failing with AttributeError on .get().
        task_id = result_data.get("task_id") if isinstance(result_data, dict) else None

        if not task_id:
            raise DoclingHTTPError(200, f"No task_id in response: {result_data}")

        return task_id

    except requests.RequestException as e:
        # Chain the cause so the underlying network error stays visible.
        raise DoclingHTTPError(500, str(e)) from e
148
+
149
+
150
def poll_status(base_url: str, task_id: str) -> StatusResult:
    """Poll status of async conversion task.

    Issues ``GET {base_url}/v1/status/poll/{task_id}`` and maps the JSON
    response onto a StatusResult.

    Args:
        base_url: Base URL of docling-serve
        task_id: Task ID from convert_file_async

    Returns:
        StatusResult with current status ("unknown" when the response
        carries no recognizable status field)

    Raises:
        DoclingHTTPError: If the request fails or the server answers with a
            non-200 status. Transport failures are reported with status
            code 500, a client-side placeholder.
    """
    try:
        response = requests.get(f"{base_url}/v1/status/poll/{task_id}")

        if response.status_code != 200:
            raise DoclingHTTPError(
                response.status_code, response.text or "Status poll failed"
            )

        result_data = response.json()
        if not isinstance(result_data, dict):
            raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")

        # NOTE(review): docling-serve appears to report the state under
        # "task_status" - confirm against the deployed server version. The
        # original "status" key is kept as a fallback.
        status = result_data.get("task_status") or result_data.get("status", "unknown")

        return StatusResult(
            status=status,
            task_id=task_id,
            error=result_data.get("error"),
        )

    except requests.RequestException as e:
        # Chain the cause so the underlying network error stays visible.
        raise DoclingHTTPError(500, str(e)) from e
181
+
182
+
183
def _convert_result_from_payload(payload: dict) -> ConvertResult:
    """Build a successful ConvertResult from one async result object."""
    import json

    output_format = payload.get("format", "md")

    candidates = []
    # NOTE(review): docling-serve v1 appears to nest output under
    # "document" as "<format>_content" (e.g. "md_content") - confirm against
    # the deployed server version. The flat "content" key is kept as a
    # fallback so previously working responses still parse.
    document = payload.get("document")
    if isinstance(document, dict):
        candidates.append(document.get(f"{output_format}_content"))
    candidates.append(payload.get(f"{output_format}_content"))
    candidates.append(payload.get("content"))

    content = ""
    for candidate in candidates:
        if candidate is None:
            continue
        # Structured output is serialized so ``content`` stays a string.
        content = (
            candidate
            if isinstance(candidate, str)
            else json.dumps(candidate, ensure_ascii=False)
        )
        break

    return ConvertResult(content=content, format=output_format, success=True)


def get_result(base_url: str, task_id: str) -> ConvertResult:
    """Get result of completed async conversion.

    Issues ``GET {base_url}/v1/result/{task_id}`` and parses the JSON
    response.

    Args:
        base_url: Base URL of docling-serve
        task_id: Task ID from convert_file_async

    Returns:
        ConvertResult with conversion output. Transport-level failures
        (``requests.RequestException``) are reported as a result with
        ``success=False`` rather than raised.

    Raises:
        DoclingHTTPError: If the server answers with a non-200 status or an
            unexpected response shape
    """
    try:
        response = requests.get(f"{base_url}/v1/result/{task_id}")

        if response.status_code != 200:
            raise DoclingHTTPError(
                response.status_code, response.text or "Result retrieval failed"
            )

        result_data = response.json()

        # Same shapes as the sync endpoint: a list of results (first entry
        # used) or a single object.
        if isinstance(result_data, list) and result_data:
            payload = result_data[0]
        elif isinstance(result_data, dict):
            payload = result_data
        else:
            raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")

        if not isinstance(payload, dict):
            raise DoclingHTTPError(200, f"Unexpected response format: {result_data}")

        return _convert_result_from_payload(payload)

    except requests.RequestException as e:
        return ConvertResult(content="", format="md", success=False, error=str(e))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mdify-cli
3
- Version: 1.5.0
3
+ Version: 2.0.0
4
4
  Summary: Convert PDFs and document images into structured Markdown for LLM workflows
5
5
  Author: tiroq
6
6
  License-Expression: MIT
@@ -24,6 +24,9 @@ Classifier: Topic :: Utilities
24
24
  Requires-Python: >=3.8
25
25
  Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
+ Requires-Dist: requests
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=7.0; extra == "dev"
27
30
  Dynamic: license-file
28
31
 
29
32
  # mdify
@@ -98,15 +101,32 @@ Recursively convert files:
98
101
  mdify /path/to/documents -r -g "*.pdf"
99
102
  ```
100
103
 
101
- ### Masking sensitive content
104
+ ### GPU Acceleration
102
105
 
103
- Mask PII and sensitive content in images:
106
+ For faster processing with an NVIDIA GPU:
104
107
  ```bash
105
- mdify document.pdf -m
106
- mdify document.pdf --mask
108
+ mdify --gpu documents/*.pdf
107
109
  ```
108
110
 
109
- This uses Docling's content-aware masking to obscure sensitive information in embedded images.
111
+ Requires an NVIDIA GPU with CUDA support and the nvidia-container-toolkit.
112
+
113
+ ### ⚠️ PII Masking (Deprecated)
114
+
115
+ The `--mask` flag is deprecated and will be ignored in this version. PII masking functionality was available in older versions using a custom runtime but is not supported with the current docling-serve backend.
116
+
117
+ If PII masking is critical for your use case, please use mdify v1.5.x or earlier versions.
118
+
119
+ ## Performance
120
+
121
+ mdify now uses docling-serve for significantly faster batch processing:
122
+
123
+ - **Single model load**: Models are loaded once per session, not per file
124
+ - **~10-20x speedup** for multiple file conversions compared to previous versions
125
+ - **GPU acceleration**: Use `--gpu` for additional 2-6x speedup (requires NVIDIA GPU)
126
+
127
+ ### First Run Behavior
128
+
129
+ The first conversion takes longer (~30-60s) as the container loads ML models into memory. Subsequent files in the same batch process quickly, typically in 1-3 seconds per file.
110
130
 
111
131
  ## Options
112
132
 
@@ -119,9 +139,11 @@ This uses Docling's content-aware masking to obscure sensitive information in em
119
139
  | `--flat` | Disable directory structure preservation |
120
140
  | `--overwrite` | Overwrite existing output files |
121
141
  | `-q, --quiet` | Suppress progress messages |
122
- | `-m, --mask` | Mask PII and sensitive content in images |
142
+ | `-m, --mask` | ⚠️ **Deprecated**: PII masking not supported in current version |
143
+ | `--gpu` | Use GPU-accelerated container (requires NVIDIA GPU and nvidia-container-toolkit) |
144
+ | `--port PORT` | Container port (default: 5001) |
123
145
  | `--runtime RUNTIME` | Container runtime: docker or podman (auto-detected) |
124
- | `--image IMAGE` | Custom container image (default: ghcr.io/tiroq/mdify-runtime:latest) |
146
+ | `--image IMAGE` | Custom container image (default: ghcr.io/docling-project/docling-serve-cpu:main) |
125
147
  | `--pull POLICY` | Image pull policy: always, missing, never (default: missing) |
126
148
  | `--check-update` | Check for available updates and exit |
127
149
  | `--version` | Show version and exit |
@@ -175,19 +197,22 @@ The CLI:
175
197
  - Pulls the runtime container on first use
176
198
  - Mounts files and runs conversions in the container
177
199
 
178
- ## Container Image
200
+ ## Container Images
201
+
202
+ mdify uses official docling-serve containers:
179
203
 
180
- The runtime container is hosted at:
204
+ **CPU Version** (default):
181
205
  ```
182
- ghcr.io/tiroq/mdify-runtime:latest
206
+ ghcr.io/docling-project/docling-serve-cpu:main
183
207
  ```
184
208
 
185
- To build locally:
186
- ```bash
187
- cd runtime
188
- docker build -t mdify-runtime .
209
+ **GPU Version** (use with `--gpu` flag):
210
+ ```
211
+ ghcr.io/docling-project/docling-serve-cu126:main
189
212
  ```
190
213
 
214
+ These are official images from the [docling-serve project](https://github.com/docling-project/docling-serve).
215
+
191
216
  ## Updates
192
217
 
193
218
  mdify checks for updates daily. When a new version is available:
@@ -0,0 +1,12 @@
1
+ assets/mdify.png,sha256=qUj7WXWqNwpI2KNXOW79XJwqFqa-UI0JEkmt1mmy4Rg,1820418
2
+ mdify/__init__.py,sha256=s7XlWmH4zJ5jFiPjpd7mXrCaU8bD-S9RaPzT2VHUdeQ,90
3
+ mdify/__main__.py,sha256=bhpJ00co6MfaVOdH4XLoW04NtLYDa_oJK7ODzfLrn9M,143
4
+ mdify/cli.py,sha256=LY0q8NlnKuN5aFz_OpO5hGro-tQNCxoYO_M0qVd6FJY,21493
5
+ mdify/container.py,sha256=AVIhiq_wO5id5hQ_s83lUPkAPCsAoTs25azRT6JmKII,3962
6
+ mdify/docling_client.py,sha256=_9qjL5yOOeJahOg6an2P6Iii1xkeR6wmNJZG4Q6NRkk,6553
7
+ mdify_cli-2.0.0.dist-info/licenses/LICENSE,sha256=NWM66Uv-XuSMKaU-gaPmvfyk4WgE6zcIPr78wyg6GAo,1065
8
+ mdify_cli-2.0.0.dist-info/METADATA,sha256=92_uBI2nnKK-YEf39TB7gX1KHbZBHqIHxLZBe7-GOqY,7923
9
+ mdify_cli-2.0.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
10
+ mdify_cli-2.0.0.dist-info/entry_points.txt,sha256=0Xki8f5lADQUtwdt6Eq_FEaieI6Byhk8UE7BuDhChMg,41
11
+ mdify_cli-2.0.0.dist-info/top_level.txt,sha256=qltzf7h8owHq7dxCdfCkSHY8gT21hn1_E8P-VWS_OKM,6
12
+ mdify_cli-2.0.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,10 +0,0 @@
1
- assets/mdify.png,sha256=qUj7WXWqNwpI2KNXOW79XJwqFqa-UI0JEkmt1mmy4Rg,1820418
2
- mdify/__init__.py,sha256=GxfVEOJLubSaiA0jNE2zgZq7sxiJMAr6Qn-cLPK8XJU,90
3
- mdify/__main__.py,sha256=bhpJ00co6MfaVOdH4XLoW04NtLYDa_oJK7ODzfLrn9M,143
4
- mdify/cli.py,sha256=D8_1_6NgWXkexGWqkgB0JO7c1r2T2_Va7J7iGwvewQA,20038
5
- mdify_cli-1.5.0.dist-info/licenses/LICENSE,sha256=NWM66Uv-XuSMKaU-gaPmvfyk4WgE6zcIPr78wyg6GAo,1065
6
- mdify_cli-1.5.0.dist-info/METADATA,sha256=LRKZupINA7w6HM9FyuDdmrLzWYovHqxTnQRHNohmRM0,6658
7
- mdify_cli-1.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
- mdify_cli-1.5.0.dist-info/entry_points.txt,sha256=0Xki8f5lADQUtwdt6Eq_FEaieI6Byhk8UE7BuDhChMg,41
9
- mdify_cli-1.5.0.dist-info/top_level.txt,sha256=qltzf7h8owHq7dxCdfCkSHY8gT21hn1_E8P-VWS_OKM,6
10
- mdify_cli-1.5.0.dist-info/RECORD,,