mdify-cli 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mdify/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """mdify - Convert documents to Markdown via Docling container."""
2
+
3
+ __version__ = "1.3.1"
mdify/__main__.py ADDED
@@ -0,0 +1,7 @@
1
+ """Allow running mdify as a module: python -m mdify"""
2
+
3
+ import sys
4
+ from mdify.cli import main
5
+
6
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status.
    raise SystemExit(main())
mdify/cli.py ADDED
@@ -0,0 +1,721 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ CLI for converting documents to Markdown.
4
+
5
+ This CLI orchestrates document conversion by invoking a Docker/Podman
6
+ container that contains Docling and ML dependencies. The CLI itself
7
+ is lightweight and has no ML dependencies.
8
+ """
9
+
10
+ import argparse
11
+ import json
12
+ import os
13
+ import shutil
14
+ import subprocess
15
+ import sys
16
+ import threading
17
+ import time
18
+ from pathlib import Path
19
+ from typing import List, Optional, Tuple
20
+ from urllib.error import URLError
21
+ from urllib.request import urlopen
22
+
23
+ from . import __version__
24
+
25
+ # Configuration
26
+ MDIFY_HOME = Path.home() / ".mdify"
27
+ LAST_CHECK_FILE = MDIFY_HOME / ".last_check"
28
+ INSTALLER_PATH = MDIFY_HOME / "install.sh"
29
+ GITHUB_API_URL = "https://api.github.com/repos/tiroq/mdify/releases/latest"
30
+ CHECK_INTERVAL_SECONDS = 86400 # 24 hours
31
+
32
+ # Container configuration
33
+ DEFAULT_IMAGE = "ghcr.io/tiroq/mdify-runtime:latest"
34
+ SUPPORTED_RUNTIMES = ("docker", "podman")
35
+
36
+
37
+ # =============================================================================
38
+ # Update checking functions
39
+ # =============================================================================
40
+
41
def _get_remote_version(timeout: int = 5) -> Optional[str]:
    """
    Fetch the latest released version from the GitHub API.

    Args:
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        Version string without a single leading "v" (e.g., "0.2.0"), or
        None if the request failed or the response carries no usable
        ``tag_name``.
    """
    # Local import: keeps this block self-contained without touching the
    # module-level import section.
    from http.client import HTTPException

    try:
        with urlopen(GITHUB_API_URL, timeout=timeout) as response:
            data = json.loads(response.read().decode("utf-8"))
    except (OSError, HTTPException, ValueError):
        # OSError: URLError, connection resets and socket timeouts (which
        #   are not TimeoutError before Python 3.10 and can surface
        #   unwrapped from response.read()).
        # HTTPException: truncated/invalid HTTP responses.
        # ValueError: JSONDecodeError and UnicodeDecodeError.
        return None

    # The payload is expected to be an object; anything else is unusable.
    tag = data.get("tag_name") if isinstance(data, dict) else None
    if not isinstance(tag, str):
        return None

    version = tag.strip()
    # Drop exactly one "v" prefix ("v1.2.3" -> "1.2.3").
    if version.startswith("v"):
        version = version[1:]
    return version or None
55
+
56
+
57
def _should_check_for_update() -> bool:
    """
    Determine if we should check for updates based on last check time.

    The check is skipped entirely when the ``MDIFY_NO_UPDATE_CHECK``
    environment variable is set to "1", "true" or "yes" (case-insensitive).

    Returns:
        True if check should be performed, False otherwise.
    """
    if os.environ.get("MDIFY_NO_UPDATE_CHECK", "").lower() in ("1", "true", "yes"):
        return False

    try:
        # A missing file raises FileNotFoundError (an OSError), so no
        # separate exists() probe is needed.
        last_check = float(LAST_CHECK_FILE.read_text().strip())
    except (ValueError, OSError):
        return True

    elapsed = time.time() - last_check
    # Only a timestamp inside the current interval suppresses the check.
    # Future-dated (negative elapsed) or NaN timestamps fail the chained
    # comparison and therefore trigger a fresh check instead of disabling
    # update checks indefinitely.
    return not (0 <= elapsed < CHECK_INTERVAL_SECONDS)
76
+
77
+
78
def _update_last_check_time() -> None:
    """Write the current time to the last-check timestamp file.

    Best effort: any OSError while creating the directory or writing the
    file is ignored.
    """
    stamp = str(time.time())
    try:
        LAST_CHECK_FILE.parent.mkdir(parents=True, exist_ok=True)
        LAST_CHECK_FILE.write_text(stamp)
    except OSError:
        return
85
+
86
+
87
def _compare_versions(current: str, remote: str) -> bool:
    """
    Tell whether ``remote`` is a newer dotted-integer version than ``current``.

    The shorter version is padded with zeros, so "1.3" equals "1.3.0".

    Returns:
        True if remote version is newer than current. False otherwise,
        including when either value is not a dot-separated list of integers.
    """
    try:
        local_nums = [int(piece) for piece in current.split(".")]
        latest_nums = [int(piece) for piece in remote.split(".")]
    except (ValueError, AttributeError):
        return False

    width = max(len(local_nums), len(latest_nums))
    local_padded = local_nums + [0] * (width - len(local_nums))
    latest_padded = latest_nums + [0] * (width - len(latest_nums))
    return latest_padded > local_padded
105
+
106
+
107
def _run_upgrade() -> bool:
    """
    Execute the installer script in upgrade mode.

    Returns:
        True if the installer ran and exited with status 0, False if the
        installer is missing, could not be started, or exited non-zero.
    """
    if not INSTALLER_PATH.exists():
        print(
            f"Installer not found at {INSTALLER_PATH}. "
            "Please reinstall mdify manually.",
            file=sys.stderr,
        )
        return False

    command = [str(INSTALLER_PATH), "--upgrade", "-y"]
    try:
        completed = subprocess.run(command, check=True)
    except subprocess.CalledProcessError:
        # Non-zero exit status from the installer.
        return False
    except OSError as e:
        print(f"Failed to run installer: {e}", file=sys.stderr)
        return False
    return completed.returncode == 0
133
+
134
+
135
def check_for_update(force: bool = False) -> None:
    """
    Look for a newer release and offer to run the upgrade installer.

    Args:
        force: If True, check regardless of last check time and show errors.
            With force=True a failed lookup exits the process with status 1.
    """
    if not (force or _should_check_for_update()):
        return

    latest = _get_remote_version()
    if latest is None:
        if not force:
            # Scheduled checks fail silently.
            return
        print(
            "Error: Failed to check for updates. "
            "Please check your internet connection.",
            file=sys.stderr,
        )
        sys.exit(1)

    _update_last_check_time()

    update_available = _compare_versions(__version__, latest)
    if not update_available:
        if force:
            print(f"mdify is up to date (version {__version__})")
        return

    divider = "=" * 50
    print(f"\n{divider}")
    print("A new version of mdify is available!")
    print(f" Current version: {__version__}")
    print(f" Latest version: {latest}")
    print(f"{divider}\n")

    try:
        answer = input("Run upgrade now? [y/N] ").strip().lower()
    except (EOFError, KeyboardInterrupt):
        # No usable answer (closed stdin or Ctrl-C): leave quietly.
        print()
        return

    if answer not in ("y", "yes"):
        print(f"\nTo upgrade later, run: {INSTALLER_PATH} --upgrade\n")
        return

    print("\nStarting upgrade...\n")
    if _run_upgrade():
        print("\nUpgrade completed! Please restart mdify.")
        sys.exit(0)

    print("\nUpgrade failed. You can try manually with:")
    print(f" {INSTALLER_PATH} --upgrade")
186
+
187
+
188
+ # =============================================================================
189
+ # Container runtime functions
190
+ # =============================================================================
191
+
192
def detect_runtime(preferred: str) -> Optional[str]:
    """
    Locate a container runtime executable on PATH.

    Args:
        preferred: Preferred runtime ('docker' or 'podman')

    Returns:
        Path to the preferred runtime if found, otherwise the path to the
        alternative one (after a warning on stderr), or None if neither
        is installed.
    """
    fallback = "docker" if preferred != "docker" else "podman"

    for position, candidate in enumerate((preferred, fallback)):
        located = shutil.which(candidate)
        if not located:
            continue
        if position == 1:
            # Only the fallback lookup warrants a warning.
            print(f"Warning: {preferred} not found, using {fallback}", file=sys.stderr)
        return located

    return None
215
+
216
+
217
def check_image_exists(runtime: str, image: str) -> bool:
    """
    Report whether ``image`` is available locally.

    Runs ``<runtime> image inspect <image>`` with its output captured.

    Args:
        runtime: Path to container runtime
        image: Image name/tag

    Returns:
        True if the inspect command exits with status 0; False on any other
        status or if the runtime could not be executed.
    """
    inspect_cmd = [runtime, "image", "inspect", image]
    try:
        exit_status = subprocess.run(
            inspect_cmd, capture_output=True, check=False
        ).returncode
    except OSError:
        return False
    return exit_status == 0
237
+
238
+
239
def pull_image(runtime: str, image: str, quiet: bool = False) -> bool:
    """
    Download ``image`` by running ``<runtime> pull <image>``.

    Args:
        runtime: Path to container runtime
        image: Image name/tag
        quiet: Suppress progress output (the runtime's output is captured
            instead of being shown)

    Returns:
        True if the pull command exited with status 0.
    """
    announce = not quiet
    if announce:
        print(f"Pulling image: {image}")

    pull_cmd = [runtime, "pull", image]
    try:
        outcome = subprocess.run(pull_cmd, capture_output=quiet, check=False)
    except OSError as e:
        print(f"Error pulling image: {e}", file=sys.stderr)
        return False
    return outcome.returncode == 0
264
+
265
+
266
def format_size(size_bytes: int) -> str:
    """Render a byte count using B, KB, MB, GB or TB (1024-based).

    Values below 1024 are printed as-is with the "B" unit; larger values
    are shown with one decimal place.
    """
    remaining = size_bytes
    if remaining < 1024:
        return f"{remaining} B"

    for unit in ("KB", "MB", "GB"):
        remaining /= 1024
        if remaining < 1024:
            return f"{remaining:.1f} {unit}"

    # Anything at or above 1024 GB is reported in TB.
    return f"{remaining / 1024:.1f} TB"
273
+
274
+
275
def format_duration(seconds: float) -> str:
    """Format duration in human-readable format.

    Durations that display as less than a minute are shown with one
    decimal ("12.3s"). Longer durations are rounded to whole seconds and
    shown as "Xm Ys" or "Xh Ym Zs".

    Args:
        seconds: Duration in seconds.

    Returns:
        The formatted duration string.
    """
    # Compare the value as it would be displayed, so 59.96 does not
    # render as "60.0s".
    if round(seconds, 1) < 60:
        return f"{seconds:.1f}s"

    # Round the total first and split afterwards. Rounding the seconds
    # remainder on its own could produce "1m 60s".
    total = int(round(seconds))
    minutes, secs = divmod(total, 60)
    if minutes < 60:
        return f"{minutes}m {secs}s"

    hours, mins = divmod(minutes, 60)
    return f"{hours}h {mins}m {secs}s"
286
+
287
+
288
class Spinner:
    """A simple spinner to show progress during long operations.

    ``start()`` launches a daemon thread that redraws one terminal line
    roughly every 0.1 seconds with the prefix, an animation frame and the
    elapsed time. ``stop()`` ends the thread and blanks the line.
    """

    def __init__(self):
        self.frames = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏']
        self.running = False
        self.thread = None
        self.start_time = None
        # Defined up front so the attribute exists before start() is called.
        self.prefix = ""
        self._stop_event = threading.Event()
        # Widest line drawn so far; used to blank the whole line on stop().
        self._line_width = 0

    def _spin(self):
        """Thread body: redraw the spinner line until the stop event is set."""
        # Bind this run's event locally so a later start() (which installs
        # a fresh event) can never revive this thread.
        stop_event = self._stop_event
        idx = 0
        while not stop_event.is_set():
            elapsed = time.time() - self.start_time
            frame = self.frames[idx % len(self.frames)]
            line = f"{self.prefix} {frame} ({format_duration(elapsed)})"
            self._line_width = max(self._line_width, len(line))
            print(f"\r{line}", end="", flush=True)
            idx += 1
            # wait() returns as soon as stop() sets the event, so stopping
            # does not have to sit out a full sleep interval.
            stop_event.wait(0.1)

    def start(self, prefix: str = ""):
        """Begin animating, showing ``prefix`` before the spinner frame."""
        if self.running:
            # Restarting: shut the previous thread down instead of leaking it.
            self.stop()
        self.prefix = prefix
        self._line_width = 0
        self._stop_event = threading.Event()
        self.running = True
        self.start_time = time.time()
        self.thread = threading.Thread(target=self._spin, daemon=True)
        self.thread.start()

    def stop(self):
        """Stop animating and clear the spinner line; no-op if not running."""
        if not self.running:
            return
        self.running = False
        self._stop_event.set()
        if self.thread:
            self.thread.join(timeout=0.5)
        # Clear the spinner line, covering at least 80 columns and any
        # longer line that was actually drawn.
        width = max(80, self._line_width)
        print(f"\r{' ' * width}\r", end="", flush=True)
319
+
320
+
321
def run_container(
    runtime: str,
    image: str,
    input_file: Path,
    output_file: Path,
    mask_pii: bool = False,
) -> Tuple[bool, str, float]:
    """
    Run container to convert a single file.

    The input file's directory is mounted read-only at ``/work/in`` and the
    output file's directory at ``/work/out``; the container is invoked with
    ``--in``/``--out`` pointing at the file names inside those mounts.

    Args:
        runtime: Path to container runtime
        image: Image name/tag
        input_file: Absolute path to input file
        output_file: Absolute path to output file
        mask_pii: Whether to mask PII in images (adds ``--mask``)

    Returns:
        Tuple of (success: bool, message: str, elapsed_seconds: float).
        ``message`` is "success" on success, otherwise the container's
        stderr (or stdout) text, or the OS error description when the
        output directory could not be created or the runtime could not be
        started.
    """
    start_time = time.time()

    # Mount directories
    input_dir = input_file.parent
    output_dir = output_file.parent

    # Container paths
    container_in = f"/work/in/{input_file.name}"
    container_out = f"/work/out/{output_file.name}"

    cmd = [
        runtime, "run", "--rm",
        "-v", f"{input_dir}:/work/in:ro",
        "-v", f"{output_dir}:/work/out",
        image,
        "--in", container_in,
        "--out", container_out,
    ]

    if mask_pii:
        cmd.append("--mask")

    try:
        # Creating the output directory is inside the try so that a
        # filesystem error is reported through the return value, like any
        # other per-file failure, instead of aborting the whole run.
        output_dir.mkdir(parents=True, exist_ok=True)
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # Undecodable bytes in container output must not raise.
            errors="replace",
            check=False,
        )
    except OSError as e:
        return False, str(e), time.time() - start_time

    elapsed = time.time() - start_time
    if result.returncode == 0:
        return True, "success", elapsed

    error_msg = result.stderr.strip() or result.stdout.strip() or "Unknown error"
    return False, error_msg, elapsed
384
+
385
+
386
+ # =============================================================================
387
+ # File handling functions
388
+ # =============================================================================
389
+
390
# Supported file extensions (based on Docling InputFormat)
SUPPORTED_EXTENSIONS = {
    '.pdf', '.docx', '.pptx', '.html', '.htm',
    '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.tif',  # images
    '.asciidoc', '.adoc', '.asc',  # asciidoc
    '.md', '.markdown',  # markdown
    '.csv', '.xlsx',  # spreadsheets
    '.xml',  # XML formats
    '.json',  # JSON docling
    '.mp3', '.wav', '.m4a', '.flac',  # audio
    '.vtt',  # subtitles
}


def get_files_to_convert(input_path: Path, mask: str, recursive: bool) -> List[Path]:
    """Get list of files to convert based on input path and options.

    Args:
        input_path: A single file, or a directory to scan.
        mask: Glob pattern applied when ``input_path`` is a directory.
        recursive: Scan sub-directories too (``rglob`` instead of ``glob``).

    Returns:
        Files whose name does not start with "." and whose lower-cased
        suffix is in SUPPORTED_EXTENSIONS. Directory scans are returned in
        sorted order.

    Raises:
        FileNotFoundError: If ``input_path`` is neither a file nor a directory.
    """
    if input_path.is_file():
        candidates = [input_path]
    elif input_path.is_dir():
        matches = input_path.rglob(mask) if recursive else input_path.glob(mask)
        # glob order depends on the filesystem; sorting keeps the processing
        # order and the "[i/N]" progress numbering stable across runs.
        candidates = sorted(entry for entry in matches if entry.is_file())
    else:
        raise FileNotFoundError(f"Input path does not exist: {input_path}")

    # Filter out hidden files and unsupported formats
    return [
        f for f in candidates
        if not f.name.startswith('.')
        and f.suffix.lower() in SUPPORTED_EXTENSIONS
    ]
429
+
430
+
431
def get_output_path(
    input_file: Path,
    input_base: Path,
    output_dir: Path,
    flat: bool,
) -> Path:
    """Work out where the Markdown for ``input_file`` should be written.

    In the default mode the file's directory relative to ``input_base`` is
    recreated under ``output_dir``. In flat mode the file goes directly
    into ``output_dir`` and the relative directory names are joined with
    "_" and prepended to the file name. If ``input_file`` is not located
    under ``input_base``, only the file name is used in both modes.
    """
    try:
        rel_parent = input_file.relative_to(input_base).parent
    except ValueError:
        # input_file lies outside input_base.
        rel_parent = None

    md_name = f"{input_file.stem}.md"

    if not flat:
        if rel_parent is None:
            return output_dir / md_name
        return output_dir / rel_parent / md_name

    parent_parts = rel_parent.parts if rel_parent is not None else ()
    if parent_parts:
        return output_dir / ("_".join(parent_parts) + "_" + md_name)
    return output_dir / md_name
462
+
463
+
464
+ # =============================================================================
465
+ # CLI argument parsing
466
+ # =============================================================================
467
+
468
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse command line arguments.

    Args:
        argv: Argument list to parse. Defaults to None, in which case
            argparse reads ``sys.argv[1:]``; passing a list allows the
            parser to be used programmatically.

    Returns:
        The populated argparse namespace.
    """
    parser = argparse.ArgumentParser(
        description="Convert documents to Markdown using Docling (via container)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
mdify document.pdf Convert a single file
mdify ./docs -g "*.pdf" -r Convert PDFs recursively
mdify ./docs -g "*.pdf" -o out/ Specify output directory
mdify document.pdf -m Mask PII in images
mdify ./docs --runtime podman Use Podman instead of Docker
""",
    )

    # Optional at the parser level so that utility flags such as
    # --check-update can be used without an input path.
    parser.add_argument(
        "input",
        type=str,
        nargs="?",
        help="Input file or directory to convert",
    )

    parser.add_argument(
        "-o", "--out-dir",
        type=str,
        default="output",
        help="Output directory for converted files (default: output)",
    )

    parser.add_argument(
        "-g", "--glob",
        type=str,
        default="*",
        help="Glob pattern for filtering files in directory (default: *)",
    )

    parser.add_argument(
        "-r", "--recursive",
        action="store_true",
        help="Recursively scan directories",
    )

    parser.add_argument(
        "--flat",
        action="store_true",
        help="Disable directory structure preservation in output",
    )

    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing output files",
    )

    parser.add_argument(
        "-q", "--quiet",
        action="store_true",
        help="Suppress progress messages",
    )

    parser.add_argument(
        "-m", "--mask",
        action="store_true",
        help="Mask PII and sensitive content in document images",
    )

    # Container options
    parser.add_argument(
        "--runtime",
        type=str,
        choices=SUPPORTED_RUNTIMES,
        default="docker",
        help="Container runtime to use (default: docker)",
    )

    parser.add_argument(
        "--image",
        type=str,
        default=DEFAULT_IMAGE,
        help=f"Container image to use (default: {DEFAULT_IMAGE})",
    )

    parser.add_argument(
        "--pull",
        type=str,
        choices=("always", "missing", "never"),
        default="missing",
        help="Image pull policy: always, missing, never (default: missing)",
    )

    # Utility options
    parser.add_argument(
        "--check-update",
        action="store_true",
        help="Check for available updates and exit",
    )

    parser.add_argument(
        "--version",
        action="version",
        version=f"mdify {__version__}",
    )

    return parser.parse_args(argv)
572
+
573
+
574
+ # =============================================================================
575
+ # Main entry point
576
+ # =============================================================================
577
+
578
def main() -> int:
    """Main entry point for the CLI.

    Returns:
        Process exit status: 0 when every file was converted or skipped,
        1 on usage errors, missing input, image problems or any failed
        conversion, 2 when no container runtime is installed.
    """
    args = parse_args()

    # Handle --check-update flag
    if args.check_update:
        check_for_update(force=True)
        return 0

    # Check for updates (daily, silent on errors)
    check_for_update(force=False)

    # Validate input is provided
    if args.input is None:
        print("Error: Input file or directory is required", file=sys.stderr)
        print("Usage: mdify <input> [options]", file=sys.stderr)
        print(" mdify --help for more information", file=sys.stderr)
        return 1

    # Detect container runtime
    runtime = detect_runtime(args.runtime)
    if runtime is None:
        print(
            f"Error: Container runtime not found ({', '.join(SUPPORTED_RUNTIMES)})",
            file=sys.stderr,
        )
        print("Please install Docker or Podman to use mdify.", file=sys.stderr)
        return 2

    # Resolve and validate the input BEFORE touching the image, so a
    # mistyped path is reported immediately instead of after a pull.
    input_path = Path(args.input).resolve()
    output_dir = Path(args.out_dir).resolve()

    if not input_path.exists():
        print(f"Error: Input path does not exist: {input_path}", file=sys.stderr)
        return 1

    # Get files to convert
    try:
        files_to_convert = get_files_to_convert(input_path, args.glob, args.recursive)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    if not files_to_convert:
        print(f"No files found to convert in: {input_path}", file=sys.stderr)
        return 1

    # Handle image pull policy
    image = args.image
    image_exists = check_image_exists(runtime, image)

    if args.pull == "always" or (args.pull == "missing" and not image_exists):
        if not pull_image(runtime, image, args.quiet):
            print(f"Error: Failed to pull image: {image}", file=sys.stderr)
            return 1
    elif args.pull == "never" and not image_exists:
        print(f"Error: Image not found locally: {image}", file=sys.stderr)
        # Name the runtime actually in use (detect_runtime may have fallen
        # back to the alternative) and keep the hint on stderr with the error.
        print(
            f"Run with --pull=missing or pull manually: {Path(runtime).name} pull {image}",
            file=sys.stderr,
        )
        return 1

    total_files = len(files_to_convert)
    total_size = sum(f.stat().st_size for f in files_to_convert)

    if not args.quiet:
        print(f"Found {total_files} file(s) to convert ({format_size(total_size)})")
        print(f"Using runtime: {runtime}")
        print(f"Using image: {image}")
        print()

    # Determine input base for directory structure preservation
    input_base = input_path.parent if input_path.is_file() else input_path

    # Convert files
    success_count = 0
    skipped_count = 0
    failed_count = 0
    conversion_start = time.time()
    spinner = Spinner()

    for idx, input_file in enumerate(files_to_convert, 1):
        output_file = get_output_path(input_file, input_base, output_dir, args.flat)
        file_size = input_file.stat().st_size
        progress = f"[{idx}/{total_files}]"

        # Check if output exists and skip if not overwriting
        if output_file.exists() and not args.overwrite:
            if not args.quiet:
                print(f"{progress} Skipped (exists): {input_file.name}")
            skipped_count += 1
            continue

        # Show spinner while processing
        if not args.quiet:
            spinner.start(f"{progress} Processing: {input_file.name} ({format_size(file_size)})")

        try:
            success, result, elapsed = run_container(
                runtime, image, input_file, output_file, args.mask
            )
        finally:
            # Always stop the spinner, even if the conversion is interrupted.
            if not args.quiet:
                spinner.stop()

        if success:
            success_count += 1
            if not args.quiet:
                print(f"{progress} {input_file.name} ✓ ({format_duration(elapsed)})")
        else:
            failed_count += 1
            if args.quiet:
                # --quiet suppresses progress output only; a failure must
                # still say which file failed and why.
                print(f"Error: {input_file.name}: {result}", file=sys.stderr)
            else:
                print(f"{progress} {input_file.name} ✗ ({format_duration(elapsed)})")
                print(f" Error: {result}", file=sys.stderr)

    total_elapsed = time.time() - conversion_start

    # Print summary
    if not args.quiet:
        print()
        print("=" * 50)
        print("Conversion Summary:")
        print(f" Total files: {total_files}")
        print(f" Successful: {success_count}")
        print(f" Skipped: {skipped_count}")
        print(f" Failed: {failed_count}")
        print(f" Total time: {format_duration(total_elapsed)}")
        print("=" * 50)

    # At least one file was processed, and each one is counted as success,
    # skipped or failed, so the run succeeded iff nothing failed.
    return 1 if failed_count > 0 else 0
718
+
719
+
720
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
@@ -0,0 +1,249 @@
1
+ Metadata-Version: 2.4
2
+ Name: mdify-cli
3
+ Version: 1.3.1
4
+ Summary: Lightweight CLI for converting documents to Markdown via Docling container
5
+ Author: tiroq
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/tiroq/mdify
8
+ Project-URL: Repository, https://github.com/tiroq/mdify
9
+ Project-URL: Issues, https://github.com/tiroq/mdify/issues
10
+ Keywords: markdown,conversion,pdf,docling,cli,document,docker
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: End Users/Desktop
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.8
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
23
+ Classifier: Topic :: Utilities
24
+ Requires-Python: >=3.8
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Dynamic: license-file
28
+
29
+ # mdify
30
+
31
+ ![MDify banner](assets/mdify.png)
32
+
33
+ [![PyPI](https://img.shields.io/pypi/v/mdify-cli?logo=python&style=flat-square)](https://pypi.org/project/mdify-cli/)
34
+ [![Container](https://img.shields.io/badge/container-ghcr.io-blue?logo=docker&style=flat-square)](https://github.com/tiroq/mdify/pkgs/container/mdify-runtime)
35
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg?style=flat-square)](https://opensource.org/licenses/MIT)
36
+
37
+ A lightweight CLI for converting documents to Markdown. The CLI is fast to install via pipx, while the heavy ML conversion (Docling) runs inside a container.
38
+
39
+ ## Requirements
40
+
41
+ - **Python 3.8+**
42
+ - **Docker** or **Podman** (for document conversion)
43
+
44
+ ## Installation
45
+
46
+ ### macOS (recommended)
47
+
48
+ ```bash
49
+ brew install pipx
50
+ pipx ensurepath
51
+ pipx install mdify-cli
52
+ ```
53
+
54
+ Restart your terminal after installation.
55
+
56
+ ### Linux
57
+
58
+ ```bash
59
+ python3 -m pip install --user pipx
60
+ pipx ensurepath
61
+ pipx install mdify-cli
62
+ ```
63
+
64
+ ### Install via pip
65
+
66
+ ```bash
67
+ pip install mdify-cli
68
+ ```
69
+
70
+ ### Development install
71
+
72
+ ```bash
73
+ git clone https://github.com/tiroq/mdify.git
74
+ cd mdify
75
+ pip install -e .
76
+ ```
77
+
78
+ ## Usage
79
+
80
+ ### Basic conversion
81
+
82
+ Convert a single file:
83
+ ```bash
84
+ mdify document.pdf
85
+ ```
86
+
87
+ The first run will automatically pull the container image (~2GB) if not present.
88
+
89
+ ### Convert multiple files
90
+
91
+ Convert all PDFs in a directory:
92
+ ```bash
93
+ mdify /path/to/documents -g "*.pdf"
94
+ ```
95
+
96
+ Recursively convert files:
97
+ ```bash
98
+ mdify /path/to/documents -r -g "*.pdf"
99
+ ```
100
+
101
+ ### Masking sensitive content
102
+
103
+ Mask PII and sensitive content in images:
104
+ ```bash
105
+ mdify document.pdf -m
106
+ mdify document.pdf --mask
107
+ ```
108
+
109
+ This uses Docling's content-aware masking to obscure sensitive information in embedded images.
110
+
111
+ ## Options
112
+
113
+ | Option | Description |
114
+ |--------|-------------|
115
+ | `input` | Input file or directory to convert (required) |
116
+ | `-o, --out-dir DIR` | Output directory for converted files (default: output) |
117
+ | `-g, --glob PATTERN` | Glob pattern for filtering files (default: *) |
118
+ | `-r, --recursive` | Recursively scan directories |
119
+ | `--flat` | Disable directory structure preservation |
120
+ | `--overwrite` | Overwrite existing output files |
121
+ | `-q, --quiet` | Suppress progress messages |
122
+ | `-m, --mask` | Mask PII and sensitive content in images |
123
+ | `--runtime RUNTIME` | Preferred container runtime: docker or podman (default: docker; falls back to the other one if the preferred runtime is not installed) |
124
+ | `--image IMAGE` | Custom container image (default: ghcr.io/tiroq/mdify-runtime:latest) |
125
+ | `--pull POLICY` | Image pull policy: always, missing, never (default: missing) |
126
+ | `--check-update` | Check for available updates and exit |
127
+ | `--version` | Show version and exit |
128
+
129
+ ### Flat Mode
130
+
131
+ With `--flat`, all output files are placed directly in the output directory. Directory paths are incorporated into filenames to prevent collisions:
132
+
133
+ - `docs/subdir1/file.pdf` → `output/subdir1_file.md`
134
+ - `docs/subdir2/file.pdf` → `output/subdir2_file.md`
135
+
136
+ ## Examples
137
+
138
+ Convert all PDFs recursively, preserving structure:
139
+ ```bash
140
+ mdify documents/ -r -g "*.pdf" -o markdown_output
141
+ ```
142
+
143
+ Convert with Podman instead of Docker:
144
+ ```bash
145
+ mdify document.pdf --runtime podman
146
+ ```
147
+
148
+ Use a custom/local container image:
149
+ ```bash
150
+ mdify document.pdf --image my-custom-image:latest
151
+ ```
152
+
153
+ Force pull latest container image:
154
+ ```bash
155
+ mdify document.pdf --pull always
156
+ ```
157
+
158
+ ## Architecture
159
+
160
+ ```
161
+ ┌──────────────────┐ ┌─────────────────────────────────┐
162
+ │ mdify CLI │ │ Container (Docker/Podman) │
163
+ │ (lightweight) │────▶│ ┌───────────────────────────┐ │
164
+ │ │ │ │ Docling + ML Models │ │
165
+ │ - File handling │◀────│ │ - PDF parsing │ │
166
+ │ - Container │ │ │ - OCR (Tesseract) │ │
167
+ │ orchestration │ │ │ - Document conversion │ │
168
+ └──────────────────┘ │ └───────────────────────────┘ │
169
+ └─────────────────────────────────┘
170
+ ```
171
+
172
+ The CLI:
173
+ - Installs in seconds via pipx (no ML dependencies)
174
+ - Automatically detects Docker or Podman
175
+ - Pulls the runtime container on first use
176
+ - Mounts files and runs conversions in the container
177
+
178
+ ## Container Image
179
+
180
+ The runtime container is hosted at:
181
+ ```
182
+ ghcr.io/tiroq/mdify-runtime:latest
183
+ ```
184
+
185
+ To build locally:
186
+ ```bash
187
+ cd runtime
188
+ docker build -t mdify-runtime .
189
+ ```
190
+
191
+ ## Updates
192
+
193
+ mdify checks for updates daily. When a new version is available:
194
+
195
+ ```
196
+ ==================================================
197
+ A new version of mdify is available!
198
+ Current version: 0.3.0
199
+ Latest version: 0.4.0
200
+ ==================================================
201
+
202
+ Run upgrade now? [y/N]
203
+ ```
204
+
205
+ ### Disable update checks
206
+
207
+ ```bash
208
+ export MDIFY_NO_UPDATE_CHECK=1
209
+ ```
210
+
211
+ ## Uninstall
212
+
213
+ ```bash
214
+ pipx uninstall mdify-cli
215
+ ```
216
+
217
+ Or if installed via pip:
218
+
219
+ ```bash
220
+ pip uninstall mdify-cli
221
+ ```
222
+
223
+ ## Development
224
+
225
+ ### Task automation
226
+
227
+ This project uses [Task](https://taskfile.dev) for automation:
228
+
229
+ ```bash
230
+ # Show available tasks
231
+ task
232
+
233
+ # Build package
234
+ task build
235
+
236
+ # Build container locally
237
+ task container-build
238
+
239
+ # Release workflow
240
+ task release-patch
241
+ ```
242
+
243
+ ### Building for PyPI
244
+
245
+ See [PUBLISHING.md](PUBLISHING.md) for complete publishing instructions.
246
+
247
+ ## License
248
+
249
+ MIT
@@ -0,0 +1,9 @@
1
+ mdify/__init__.py,sha256=i8PTIA0EY8RsB6lf3pwGlb0oX30633B0o2KMcqaGl4c,90
2
+ mdify/__main__.py,sha256=bhpJ00co6MfaVOdH4XLoW04NtLYDa_oJK7ODzfLrn9M,143
3
+ mdify/cli.py,sha256=t1c3lSDwB5zco-gji-udZkx_5OPCmLNFRN05XULW7TM,21242
4
+ mdify_cli-1.3.1.dist-info/licenses/LICENSE,sha256=NWM66Uv-XuSMKaU-gaPmvfyk4WgE6zcIPr78wyg6GAo,1065
5
+ mdify_cli-1.3.1.dist-info/METADATA,sha256=pKbl1j497DivGmonSaXZ9tE8wE9x0lS5QXdpQ3ozLaM,6616
6
+ mdify_cli-1.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
+ mdify_cli-1.3.1.dist-info/entry_points.txt,sha256=0Xki8f5lADQUtwdt6Eq_FEaieI6Byhk8UE7BuDhChMg,41
8
+ mdify_cli-1.3.1.dist-info/top_level.txt,sha256=qltzf7h8owHq7dxCdfCkSHY8gT21hn1_E8P-VWS_OKM,6
9
+ mdify_cli-1.3.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ mdify = mdify.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Stranger
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ mdify