doc-to-md-cli 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doc2md/__init__.py +84 -0
- doc2md/cli.py +21 -52
- {doc_to_md_cli-0.1.0.dist-info → doc_to_md_cli-0.1.1.dist-info}/METADATA +36 -16
- doc_to_md_cli-0.1.1.dist-info/RECORD +7 -0
- {doc_to_md_cli-0.1.0.dist-info → doc_to_md_cli-0.1.1.dist-info}/licenses/LICENSE +1 -1
- doc_to_md_cli-0.1.0.dist-info/RECORD +0 -7
- {doc_to_md_cli-0.1.0.dist-info → doc_to_md_cli-0.1.1.dist-info}/WHEEL +0 -0
- {doc_to_md_cli-0.1.0.dist-info → doc_to_md_cli-0.1.1.dist-info}/entry_points.txt +0 -0
doc2md/__init__.py
CHANGED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""
|
|
2
|
+
doc-to-md-cli: Convert DOCX files to Markdown using Playwright
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from playwright.sync_api import sync_playwright
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
__version__ = "0.1.1"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def convert_docx_to_md(
|
|
13
|
+
input_path: str | Path,
|
|
14
|
+
output_dir: str | Path = "./md",
|
|
15
|
+
headless: bool = True,
|
|
16
|
+
) -> list[Path]:
|
|
17
|
+
"""
|
|
18
|
+
Convert DOCX file(s) to Markdown.
|
|
19
|
+
|
|
20
|
+
Args:
|
|
21
|
+
input_path: Path to a DOCX file or directory containing DOCX files
|
|
22
|
+
output_dir: Directory where Markdown files will be saved
|
|
23
|
+
headless: Whether to run browser in headless mode
|
|
24
|
+
|
|
25
|
+
Returns:
|
|
26
|
+
List of paths to the generated Markdown files
|
|
27
|
+
|
|
28
|
+
Raises:
|
|
29
|
+
FileNotFoundError: If input_path doesn't exist
|
|
30
|
+
ValueError: If no DOCX files found
|
|
31
|
+
|
|
32
|
+
Example:
|
|
33
|
+
>>> from doc2md import convert_docx_to_md
|
|
34
|
+
>>> convert_docx_to_md("document.docx", "output")
|
|
35
|
+
['output/document.md']
|
|
36
|
+
"""
|
|
37
|
+
input_path = Path(input_path).expanduser()
|
|
38
|
+
output_dir = Path(output_dir).expanduser()
|
|
39
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
40
|
+
|
|
41
|
+
if not input_path.exists():
|
|
42
|
+
raise FileNotFoundError(f"Path not found: {input_path}")
|
|
43
|
+
|
|
44
|
+
# Determine files to convert
|
|
45
|
+
if input_path.is_file():
|
|
46
|
+
if input_path.suffix.lower() != ".docx":
|
|
47
|
+
raise ValueError("Input file must be a .docx file")
|
|
48
|
+
files = [input_path]
|
|
49
|
+
else:
|
|
50
|
+
files = list(input_path.glob("*.docx"))
|
|
51
|
+
if not files:
|
|
52
|
+
raise ValueError("No .docx files found in directory")
|
|
53
|
+
|
|
54
|
+
converted_files = []
|
|
55
|
+
|
|
56
|
+
with sync_playwright() as p:
|
|
57
|
+
browser = p.chromium.launch(headless=headless)
|
|
58
|
+
context = browser.new_context(
|
|
59
|
+
permissions=["clipboard-read", "clipboard-write"]
|
|
60
|
+
)
|
|
61
|
+
page = context.new_page()
|
|
62
|
+
page.goto("https://word2md.com/")
|
|
63
|
+
|
|
64
|
+
for file_path in files:
|
|
65
|
+
with page.expect_file_chooser() as fc_info:
|
|
66
|
+
page.click('input[type="file"]')
|
|
67
|
+
file_chooser = fc_info.value
|
|
68
|
+
file_chooser.set_files(str(file_path))
|
|
69
|
+
|
|
70
|
+
page.wait_for_selector("#copy-button", state="visible")
|
|
71
|
+
page.click("#copy-button")
|
|
72
|
+
|
|
73
|
+
md_content = page.evaluate("navigator.clipboard.readText()")
|
|
74
|
+
|
|
75
|
+
out_file = output_dir / (file_path.stem + ".md")
|
|
76
|
+
out_file.write_text(md_content, encoding="utf-8")
|
|
77
|
+
converted_files.append(out_file)
|
|
78
|
+
|
|
79
|
+
browser.close()
|
|
80
|
+
|
|
81
|
+
return converted_files
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
__all__ = ["convert_docx_to_md", "__version__"]
|
doc2md/cli.py
CHANGED
|
@@ -1,44 +1,13 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
import argparse
|
|
3
3
|
from pathlib import Path
|
|
4
|
-
from playwright.sync_api import sync_playwright
|
|
5
4
|
|
|
6
|
-
|
|
7
|
-
def convert_files(files, output_dir: Path, headless: bool):
|
|
8
|
-
output_dir.mkdir(parents=True, exist_ok=True)
|
|
9
|
-
|
|
10
|
-
with sync_playwright() as p:
|
|
11
|
-
browser = p.chromium.launch(headless=headless)
|
|
12
|
-
context = browser.new_context(
|
|
13
|
-
permissions=["clipboard-read", "clipboard-write"]
|
|
14
|
-
)
|
|
15
|
-
page = context.new_page()
|
|
16
|
-
page.goto("https://word2md.com/")
|
|
17
|
-
|
|
18
|
-
for file_path in files:
|
|
19
|
-
print(f"⬇ Converting: {file_path.name}")
|
|
20
|
-
|
|
21
|
-
with page.expect_file_chooser() as fc_info:
|
|
22
|
-
page.click('input[type="file"]')
|
|
23
|
-
file_chooser = fc_info.value
|
|
24
|
-
file_chooser.set_files(str(file_path))
|
|
25
|
-
|
|
26
|
-
page.wait_for_selector("#copy-button", state="visible")
|
|
27
|
-
page.click("#copy-button")
|
|
28
|
-
|
|
29
|
-
md_content = page.evaluate("navigator.clipboard.readText()")
|
|
30
|
-
|
|
31
|
-
out_file = output_dir / (file_path.stem + ".md")
|
|
32
|
-
out_file.write_text(md_content, encoding="utf-8")
|
|
33
|
-
|
|
34
|
-
print(f"✔ Saved: {out_file}")
|
|
35
|
-
|
|
36
|
-
browser.close()
|
|
5
|
+
from . import convert_docx_to_md
|
|
37
6
|
|
|
38
7
|
|
|
39
8
|
def main():
|
|
40
9
|
parser = argparse.ArgumentParser(
|
|
41
|
-
prog="
|
|
10
|
+
prog="doc-to-md",
|
|
42
11
|
description="Convert DOCX files to Markdown using word2md.com"
|
|
43
12
|
)
|
|
44
13
|
|
|
@@ -56,29 +25,29 @@ def main():
|
|
|
56
25
|
parser.add_argument(
|
|
57
26
|
"--headless",
|
|
58
27
|
action="store_true",
|
|
59
|
-
|
|
28
|
+
default=True,
|
|
29
|
+
help="Run browser in headless mode (default: True)"
|
|
60
30
|
)
|
|
61
31
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
raise SystemExit(f"❌ Path not found: {input_path}")
|
|
69
|
-
|
|
70
|
-
# 🔥 NEW LOGIC
|
|
71
|
-
if input_path.is_file():
|
|
72
|
-
if input_path.suffix.lower() != ".docx":
|
|
73
|
-
raise SystemExit("❌ Input file must be a .docx file")
|
|
74
|
-
files = [input_path]
|
|
32
|
+
parser.add_argument(
|
|
33
|
+
"--no-headless",
|
|
34
|
+
action="store_false",
|
|
35
|
+
dest="headless",
|
|
36
|
+
help="Show browser window"
|
|
37
|
+
)
|
|
75
38
|
|
|
76
|
-
|
|
77
|
-
files = list(input_path.glob("*.docx"))
|
|
78
|
-
if not files:
|
|
79
|
-
raise SystemExit("❌ No .docx files found")
|
|
39
|
+
args = parser.parse_args()
|
|
80
40
|
|
|
81
|
-
|
|
41
|
+
try:
|
|
42
|
+
converted_files = convert_docx_to_md(
|
|
43
|
+
args.input,
|
|
44
|
+
output_dir=args.out,
|
|
45
|
+
headless=args.headless,
|
|
46
|
+
)
|
|
47
|
+
for f in converted_files:
|
|
48
|
+
print(f"✔ Saved: {f}")
|
|
49
|
+
except (FileNotFoundError, ValueError) as e:
|
|
50
|
+
raise SystemExit(f"❌ {e}")
|
|
82
51
|
|
|
83
52
|
|
|
84
53
|
if __name__ == "__main__":
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: doc-to-md-cli
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: Convert DOCX files to Markdown using Playwright
|
|
5
|
-
Project-URL: Homepage, https://github.com/
|
|
6
|
-
Project-URL: Repository, https://github.com/
|
|
7
|
-
Project-URL: Issues, https://github.com/
|
|
5
|
+
Project-URL: Homepage, https://github.com/ebinesh25/doc2md-cli
|
|
6
|
+
Project-URL: Repository, https://github.com/ebinesh25/doc2md-cli
|
|
7
|
+
Project-URL: Issues, https://github.com/ebinesh25/doc2md-cli/issues
|
|
8
8
|
License: MIT
|
|
9
9
|
License-File: LICENSE
|
|
10
10
|
Keywords: cli,converter,docx,markdown,word
|
|
@@ -26,7 +26,7 @@ Description-Content-Type: text/markdown
|
|
|
26
26
|
|
|
27
27
|
# doc-to-md-cli
|
|
28
28
|
|
|
29
|
-
A
|
|
29
|
+
A Python library and CLI tool to convert DOCX files to Markdown using Playwright automation.
|
|
30
30
|
|
|
31
31
|
## Features
|
|
32
32
|
|
|
@@ -34,6 +34,7 @@ A command-line tool to convert DOCX files to Markdown using Playwright automatio
|
|
|
34
34
|
- Batch processing support
|
|
35
35
|
- Headless browser mode for automation
|
|
36
36
|
- Clean Markdown output powered by word2md.com
|
|
37
|
+
- Use as a CLI tool or import as a Python library
|
|
37
38
|
|
|
38
39
|
## Installation
|
|
39
40
|
|
|
@@ -41,43 +42,62 @@ A command-line tool to convert DOCX files to Markdown using Playwright automatio
|
|
|
41
42
|
pip install doc-to-md-cli
|
|
42
43
|
```
|
|
43
44
|
|
|
45
|
+
Install Playwright browser (required):
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
playwright install chromium
|
|
49
|
+
```
|
|
50
|
+
|
|
44
51
|
## Usage
|
|
45
52
|
|
|
46
|
-
###
|
|
53
|
+
### CLI
|
|
54
|
+
|
|
55
|
+
Convert a single file:
|
|
47
56
|
|
|
48
57
|
```bash
|
|
49
58
|
doc-to-md document.docx
|
|
50
59
|
```
|
|
51
60
|
|
|
52
|
-
|
|
61
|
+
Convert to a specific output directory:
|
|
53
62
|
|
|
54
63
|
```bash
|
|
55
64
|
doc-to-md document.docx --out ./output
|
|
56
65
|
```
|
|
57
66
|
|
|
58
|
-
|
|
67
|
+
Convert all DOCX files in a directory:
|
|
59
68
|
|
|
60
69
|
```bash
|
|
61
70
|
doc-to-md ./my-docs --out ./converted
|
|
62
71
|
```
|
|
63
72
|
|
|
64
|
-
|
|
73
|
+
Show browser window (non-headless mode):
|
|
65
74
|
|
|
66
75
|
```bash
|
|
67
|
-
doc-to-md document.docx --headless
|
|
76
|
+
doc-to-md document.docx --no-headless
|
|
68
77
|
```
|
|
69
78
|
|
|
70
|
-
|
|
79
|
+
### Python API
|
|
71
80
|
|
|
72
|
-
|
|
73
|
-
|
|
81
|
+
```python
|
|
82
|
+
from doc2md import convert_docx_to_md
|
|
74
83
|
|
|
75
|
-
|
|
84
|
+
# Convert a single file
|
|
85
|
+
files = convert_docx_to_md("document.docx", "output")
|
|
86
|
+
print(files)  # [PosixPath('output/document.md')] - a list of pathlib.Path objects
|
|
76
87
|
|
|
77
|
-
|
|
78
|
-
|
|
88
|
+
# Convert all files in a directory
|
|
89
|
+
files = convert_docx_to_md("./my-docs", "output")
|
|
90
|
+
# [PosixPath('output/file1.md'), PosixPath('output/file2.md'), ...]
|
|
91
|
+
|
|
92
|
+
# Convert with browser visible
|
|
93
|
+
files = convert_docx_to_md("document.docx", headless=False)
|
|
79
94
|
```
|
|
80
95
|
|
|
96
|
+
## Requirements
|
|
97
|
+
|
|
98
|
+
- Python 3.10 or higher (the library's type hints use the `X | Y` union syntax)
|
|
99
|
+
- Playwright (installed automatically)
|
|
100
|
+
|
|
81
101
|
## License
|
|
82
102
|
|
|
83
103
|
MIT License - see LICENSE file for details.
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
doc2md/__init__.py,sha256=ZzsCLzhzHV0fKtIVNFNHT0dFPGzXeNgLSIBnUzwc55g,2478
|
|
2
|
+
doc2md/cli.py,sha256=y8PNOErH6cVm_4zOJgzlLj26Wo5Riue2u4dM735VN_I,1183
|
|
3
|
+
doc_to_md_cli-0.1.1.dist-info/METADATA,sha256=3EqwwL0WYKKUwqwbMZne03PqMgMdzAnQ2PzCmfldjwI,2447
|
|
4
|
+
doc_to_md_cli-0.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
5
|
+
doc_to_md_cli-0.1.1.dist-info/entry_points.txt,sha256=YyynNzAcKo_GQg_JVs7gKf3JKJvjf1Tg6HPPmRhzBvc,46
|
|
6
|
+
doc_to_md_cli-0.1.1.dist-info/licenses/LICENSE,sha256=iDvN_LXmDQT00XUgh0zJHPIWQqWpwd-YXOhR_7WD7uQ,1063
|
|
7
|
+
doc_to_md_cli-0.1.1.dist-info/RECORD,,
|
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
doc2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
doc2md/cli.py,sha256=GLfepA1qHLdPb1Jjl630srrDAAGL6NW86MW0S5ihp2s,2319
|
|
3
|
-
doc_to_md_cli-0.1.0.dist-info/METADATA,sha256=rhWZqq_MsfmkGaWF5YNEUBuy8RLoTUF_s7VDYqLXLys,2014
|
|
4
|
-
doc_to_md_cli-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
5
|
-
doc_to_md_cli-0.1.0.dist-info/entry_points.txt,sha256=YyynNzAcKo_GQg_JVs7gKf3JKJvjf1Tg6HPPmRhzBvc,46
|
|
6
|
-
doc_to_md_cli-0.1.0.dist-info/licenses/LICENSE,sha256=mnxgbhYn7K71v_xzsVUFRVeXIMYebeqdmubgALMnuIg,1068
|
|
7
|
-
doc_to_md_cli-0.1.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|