doc-to-md-cli 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
doc2md/__init__.py CHANGED
@@ -0,0 +1,84 @@
1
+ """
2
+ doc-to-md-cli: Convert DOCX files to Markdown using Playwright
3
+ """
4
+
5
+ from pathlib import Path
6
+ from playwright.sync_api import sync_playwright
7
+
8
+
9
+ __version__ = "0.1.1"
10
+
11
+
12
+ def convert_docx_to_md(
13
+ input_path: str | Path,
14
+ output_dir: str | Path = "./md",
15
+ headless: bool = True,
16
+ ) -> list[Path]:
17
+ """
18
+ Convert DOCX file(s) to Markdown.
19
+
20
+ Args:
21
+ input_path: Path to a DOCX file or directory containing DOCX files
22
+ output_dir: Directory where Markdown files will be saved
23
+ headless: Whether to run browser in headless mode
24
+
25
+ Returns:
26
+ List of paths to the generated Markdown files
27
+
28
+ Raises:
29
+ FileNotFoundError: If input_path doesn't exist
30
+ ValueError: If no DOCX files found
31
+
32
+ Example:
33
+ >>> from doc2md import convert_docx_to_md
34
+ >>> convert_docx_to_md("document.docx", "output")
35
+ [PosixPath('output/document.md')]
36
+ """
37
+ input_path = Path(input_path).expanduser()
38
+ output_dir = Path(output_dir).expanduser()
39
+ output_dir.mkdir(parents=True, exist_ok=True)
40
+
41
+ if not input_path.exists():
42
+ raise FileNotFoundError(f"Path not found: {input_path}")
43
+
44
+ # Determine files to convert
45
+ if input_path.is_file():
46
+ if input_path.suffix.lower() != ".docx":
47
+ raise ValueError("Input file must be a .docx file")
48
+ files = [input_path]
49
+ else:
50
+ files = list(input_path.glob("*.docx"))
51
+ if not files:
52
+ raise ValueError("No .docx files found in directory")
53
+
54
+ converted_files = []
55
+
56
+ with sync_playwright() as p:
57
+ browser = p.chromium.launch(headless=headless)
58
+ context = browser.new_context(
59
+ permissions=["clipboard-read", "clipboard-write"]
60
+ )
61
+ page = context.new_page()
62
+ page.goto("https://word2md.com/")
63
+
64
+ for file_path in files:
65
+ with page.expect_file_chooser() as fc_info:
66
+ page.click('input[type="file"]')
67
+ file_chooser = fc_info.value
68
+ file_chooser.set_files(str(file_path))
69
+
70
+ page.wait_for_selector("#copy-button", state="visible")
71
+ page.click("#copy-button")
72
+
73
+ md_content = page.evaluate("navigator.clipboard.readText()")
74
+
75
+ out_file = output_dir / (file_path.stem + ".md")
76
+ out_file.write_text(md_content, encoding="utf-8")
77
+ converted_files.append(out_file)
78
+
79
+ browser.close()
80
+
81
+ return converted_files
82
+
83
+
84
+ __all__ = ["convert_docx_to_md", "__version__"]
doc2md/cli.py CHANGED
@@ -1,44 +1,13 @@
1
1
  #!/usr/bin/env python3
2
2
  import argparse
3
3
  from pathlib import Path
4
- from playwright.sync_api import sync_playwright
5
4
 
6
-
7
- def convert_files(files, output_dir: Path, headless: bool):
8
- output_dir.mkdir(parents=True, exist_ok=True)
9
-
10
- with sync_playwright() as p:
11
- browser = p.chromium.launch(headless=headless)
12
- context = browser.new_context(
13
- permissions=["clipboard-read", "clipboard-write"]
14
- )
15
- page = context.new_page()
16
- page.goto("https://word2md.com/")
17
-
18
- for file_path in files:
19
- print(f"⬇ Converting: {file_path.name}")
20
-
21
- with page.expect_file_chooser() as fc_info:
22
- page.click('input[type="file"]')
23
- file_chooser = fc_info.value
24
- file_chooser.set_files(str(file_path))
25
-
26
- page.wait_for_selector("#copy-button", state="visible")
27
- page.click("#copy-button")
28
-
29
- md_content = page.evaluate("navigator.clipboard.readText()")
30
-
31
- out_file = output_dir / (file_path.stem + ".md")
32
- out_file.write_text(md_content, encoding="utf-8")
33
-
34
- print(f"✔ Saved: {out_file}")
35
-
36
- browser.close()
5
+ from . import convert_docx_to_md
37
6
 
38
7
 
39
8
  def main():
40
9
  parser = argparse.ArgumentParser(
41
- prog="doc2md",
10
+ prog="doc-to-md",
42
11
  description="Convert DOCX files to Markdown using word2md.com"
43
12
  )
44
13
 
@@ -56,29 +25,29 @@ def main():
56
25
  parser.add_argument(
57
26
  "--headless",
58
27
  action="store_true",
59
- help="Run browser in headless mode"
28
+ default=True,
29
+ help="Run browser in headless mode (default: True)"
60
30
  )
61
31
 
62
- args = parser.parse_args()
63
-
64
- input_path = Path(args.input).expanduser()
65
- output_dir = Path(args.out).expanduser()
66
-
67
- if not input_path.exists():
68
- raise SystemExit(f"❌ Path not found: {input_path}")
69
-
70
- # 🔥 NEW LOGIC
71
- if input_path.is_file():
72
- if input_path.suffix.lower() != ".docx":
73
- raise SystemExit("❌ Input file must be a .docx file")
74
- files = [input_path]
32
+ parser.add_argument(
33
+ "--no-headless",
34
+ action="store_false",
35
+ dest="headless",
36
+ help="Show browser window"
37
+ )
75
38
 
76
- else:
77
- files = list(input_path.glob("*.docx"))
78
- if not files:
79
- raise SystemExit("❌ No .docx files found")
39
+ args = parser.parse_args()
80
40
 
81
- convert_files(files, output_dir, args.headless)
41
+ try:
42
+ converted_files = convert_docx_to_md(
43
+ args.input,
44
+ output_dir=args.out,
45
+ headless=args.headless,
46
+ )
47
+ for f in converted_files:
48
+ print(f"✔ Saved: {f}")
49
+ except (FileNotFoundError, ValueError) as e:
50
+ raise SystemExit(f"❌ {e}")
82
51
 
83
52
 
84
53
  if __name__ == "__main__":
@@ -1,10 +1,10 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: doc-to-md-cli
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: Convert DOCX files to Markdown using Playwright
5
- Project-URL: Homepage, https://github.com/YOUR_USERNAME/doc2md-cli
6
- Project-URL: Repository, https://github.com/YOUR_USERNAME/doc2md-cli
7
- Project-URL: Issues, https://github.com/YOUR_USERNAME/doc2md-cli/issues
5
+ Project-URL: Homepage, https://github.com/ebinesh25/doc2md-cli
6
+ Project-URL: Repository, https://github.com/ebinesh25/doc2md-cli
7
+ Project-URL: Issues, https://github.com/ebinesh25/doc2md-cli/issues
8
8
  License: MIT
9
9
  License-File: LICENSE
10
10
  Keywords: cli,converter,docx,markdown,word
@@ -26,7 +26,7 @@ Description-Content-Type: text/markdown
26
26
 
27
27
  # doc-to-md-cli
28
28
 
29
- A command-line tool to convert DOCX files to Markdown using Playwright automation.
29
+ A Python library and CLI tool to convert DOCX files to Markdown using Playwright automation.
30
30
 
31
31
  ## Features
32
32
 
@@ -34,6 +34,7 @@ A command-line tool to convert DOCX files to Markdown using Playwright automatio
34
34
  - Batch processing support
35
35
  - Headless browser mode for automation
36
36
  - Clean Markdown output powered by word2md.com
37
+ - Use as a CLI tool or import as a Python library
37
38
 
38
39
  ## Installation
39
40
 
@@ -41,43 +42,62 @@ A command-line tool to convert DOCX files to Markdown using Playwright automatio
41
42
  pip install doc-to-md-cli
42
43
  ```
43
44
 
45
+ Install Playwright browser (required):
46
+
47
+ ```bash
48
+ playwright install chromium
49
+ ```
50
+
44
51
  ## Usage
45
52
 
46
- ### Convert a single file
53
+ ### CLI
54
+
55
+ Convert a single file:
47
56
 
48
57
  ```bash
49
58
  doc-to-md document.docx
50
59
  ```
51
60
 
52
- ### Convert to a specific output directory
61
+ Convert to a specific output directory:
53
62
 
54
63
  ```bash
55
64
  doc-to-md document.docx --out ./output
56
65
  ```
57
66
 
58
- ### Convert all DOCX files in a directory
67
+ Convert all DOCX files in a directory:
59
68
 
60
69
  ```bash
61
70
  doc-to-md ./my-docs --out ./converted
62
71
  ```
63
72
 
64
- ### Run in headless mode (no visible browser)
73
+ Show browser window (non-headless mode):
65
74
 
66
75
  ```bash
67
- doc-to-md document.docx --headless
76
+ doc-to-md document.docx --no-headless
68
77
  ```
69
78
 
70
- ## Requirements
79
+ ### Python API
71
80
 
72
- - Python 3.8 or higher
73
- - Playwright (installed automatically)
81
+ ```python
82
+ from doc2md import convert_docx_to_md
74
83
 
75
- After installing, you may need to install Playwright browsers:
84
+ # Convert a single file
85
+ files = convert_docx_to_md("document.docx", "output")
86
+ print(files) # [PosixPath('output/document.md')]
76
87
 
77
- ```bash
78
- playwright install chromium
88
+ # Convert all files in a directory
89
+ files = convert_docx_to_md("./my-docs", "output")
90
+ # [PosixPath('output/file1.md'), PosixPath('output/file2.md'), ...]
91
+
92
+ # Convert with browser visible
93
+ files = convert_docx_to_md("document.docx", headless=False)
79
94
  ```
80
95
 
96
+ ## Requirements
97
+
98
+ - Python 3.10 or higher
99
+ - Playwright (installed automatically)
100
+
81
101
  ## License
82
102
 
83
103
  MIT License - see LICENSE file for details.
@@ -0,0 +1,7 @@
1
+ doc2md/__init__.py,sha256=ZzsCLzhzHV0fKtIVNFNHT0dFPGzXeNgLSIBnUzwc55g,2478
2
+ doc2md/cli.py,sha256=y8PNOErH6cVm_4zOJgzlLj26Wo5Riue2u4dM735VN_I,1183
3
+ doc_to_md_cli-0.1.1.dist-info/METADATA,sha256=3EqwwL0WYKKUwqwbMZne03PqMgMdzAnQ2PzCmfldjwI,2447
4
+ doc_to_md_cli-0.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
5
+ doc_to_md_cli-0.1.1.dist-info/entry_points.txt,sha256=YyynNzAcKo_GQg_JVs7gKf3JKJvjf1Tg6HPPmRhzBvc,46
6
+ doc_to_md_cli-0.1.1.dist-info/licenses/LICENSE,sha256=iDvN_LXmDQT00XUgh0zJHPIWQqWpwd-YXOhR_7WD7uQ,1063
7
+ doc_to_md_cli-0.1.1.dist-info/RECORD,,
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2025 [Your Name]
3
+ Copyright (c) 2025 Ebinesh
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -1,7 +0,0 @@
1
- doc2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- doc2md/cli.py,sha256=GLfepA1qHLdPb1Jjl630srrDAAGL6NW86MW0S5ihp2s,2319
3
- doc_to_md_cli-0.1.0.dist-info/METADATA,sha256=rhWZqq_MsfmkGaWF5YNEUBuy8RLoTUF_s7VDYqLXLys,2014
4
- doc_to_md_cli-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
5
- doc_to_md_cli-0.1.0.dist-info/entry_points.txt,sha256=YyynNzAcKo_GQg_JVs7gKf3JKJvjf1Tg6HPPmRhzBvc,46
6
- doc_to_md_cli-0.1.0.dist-info/licenses/LICENSE,sha256=mnxgbhYn7K71v_xzsVUFRVeXIMYebeqdmubgALMnuIg,1068
7
- doc_to_md_cli-0.1.0.dist-info/RECORD,,