mdengine 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mdengine-0.1.0/LICENSE +21 -0
- mdengine-0.1.0/PKG-INFO +509 -0
- mdengine-0.1.0/README.md +423 -0
- mdengine-0.1.0/pyproject.toml +111 -0
- mdengine-0.1.0/setup.cfg +4 -0
- mdengine-0.1.0/src/md_generator/__init__.py +5 -0
- mdengine-0.1.0/src/md_generator/archive/__init__.py +1 -0
- mdengine-0.1.0/src/md_generator/archive/api/__init__.py +1 -0
- mdengine-0.1.0/src/md_generator/archive/api/convert_runner.py +29 -0
- mdengine-0.1.0/src/md_generator/archive/api/jobs.py +93 -0
- mdengine-0.1.0/src/md_generator/archive/api/main.py +204 -0
- mdengine-0.1.0/src/md_generator/archive/api/mcp_server.py +38 -0
- mdengine-0.1.0/src/md_generator/archive/api/mcp_setup.py +84 -0
- mdengine-0.1.0/src/md_generator/archive/api/query_options.py +36 -0
- mdengine-0.1.0/src/md_generator/archive/api/settings.py +32 -0
- mdengine-0.1.0/src/md_generator/archive/convert_impl.py +729 -0
- mdengine-0.1.0/src/md_generator/archive/converter.py +123 -0
- mdengine-0.1.0/src/md_generator/archive/options.py +26 -0
- mdengine-0.1.0/src/md_generator/image/__init__.py +5 -0
- mdengine-0.1.0/src/md_generator/image/api/__init__.py +1 -0
- mdengine-0.1.0/src/md_generator/image/api/main.py +314 -0
- mdengine-0.1.0/src/md_generator/image/api/mcp_server.py +169 -0
- mdengine-0.1.0/src/md_generator/image/api/query_options.py +42 -0
- mdengine-0.1.0/src/md_generator/image/api/settings.py +44 -0
- mdengine-0.1.0/src/md_generator/image/api/staging.py +46 -0
- mdengine-0.1.0/src/md_generator/image/api/zip_bundle.py +17 -0
- mdengine-0.1.0/src/md_generator/image/backends/__init__.py +10 -0
- mdengine-0.1.0/src/md_generator/image/backends/base.py +17 -0
- mdengine-0.1.0/src/md_generator/image/backends/easy.py +60 -0
- mdengine-0.1.0/src/md_generator/image/backends/paddle.py +88 -0
- mdengine-0.1.0/src/md_generator/image/backends/tesseract.py +45 -0
- mdengine-0.1.0/src/md_generator/image/convert_impl.py +117 -0
- mdengine-0.1.0/src/md_generator/image/converter.py +111 -0
- mdengine-0.1.0/src/md_generator/image/emit.py +67 -0
- mdengine-0.1.0/src/md_generator/image/io_util.py +53 -0
- mdengine-0.1.0/src/md_generator/image/utils.py +18 -0
- mdengine-0.1.0/src/md_generator/pdf/__init__.py +1 -0
- mdengine-0.1.0/src/md_generator/pdf/api/__init__.py +1 -0
- mdengine-0.1.0/src/md_generator/pdf/api/main.py +261 -0
- mdengine-0.1.0/src/md_generator/pdf/api/mcp_server.py +95 -0
- mdengine-0.1.0/src/md_generator/pdf/api/settings.py +40 -0
- mdengine-0.1.0/src/md_generator/pdf/api/zip_bundle.py +17 -0
- mdengine-0.1.0/src/md_generator/pdf/converter.py +70 -0
- mdengine-0.1.0/src/md_generator/pdf/md_emit.py +42 -0
- mdengine-0.1.0/src/md_generator/pdf/pdf_extract.py +227 -0
- mdengine-0.1.0/src/md_generator/pdf/utils.py +55 -0
- mdengine-0.1.0/src/md_generator/ppt/__init__.py +1 -0
- mdengine-0.1.0/src/md_generator/ppt/api/__init__.py +1 -0
- mdengine-0.1.0/src/md_generator/ppt/api/convert_runner.py +30 -0
- mdengine-0.1.0/src/md_generator/ppt/api/jobs.py +93 -0
- mdengine-0.1.0/src/md_generator/ppt/api/main.py +197 -0
- mdengine-0.1.0/src/md_generator/ppt/api/mcp_server.py +38 -0
- mdengine-0.1.0/src/md_generator/ppt/api/mcp_setup.py +81 -0
- mdengine-0.1.0/src/md_generator/ppt/api/query_options.py +35 -0
- mdengine-0.1.0/src/md_generator/ppt/api/settings.py +24 -0
- mdengine-0.1.0/src/md_generator/ppt/convert_impl.py +231 -0
- mdengine-0.1.0/src/md_generator/ppt/converter.py +81 -0
- mdengine-0.1.0/src/md_generator/ppt/embedded_extract.py +190 -0
- mdengine-0.1.0/src/md_generator/ppt/ooxml_media.py +94 -0
- mdengine-0.1.0/src/md_generator/ppt/options.py +39 -0
- mdengine-0.1.0/src/md_generator/ppt/post_assets.py +172 -0
- mdengine-0.1.0/src/md_generator/ppt/text_formatting.py +97 -0
- mdengine-0.1.0/src/md_generator/ppt/vendor_pdf_md/__init__.py +5 -0
- mdengine-0.1.0/src/md_generator/ppt/vendor_pdf_md/convert.py +76 -0
- mdengine-0.1.0/src/md_generator/ppt/vendor_word_md/__init__.py +5 -0
- mdengine-0.1.0/src/md_generator/ppt/vendor_word_md/convert.py +43 -0
- mdengine-0.1.0/src/md_generator/ppt/zip_deep.py +31 -0
- mdengine-0.1.0/src/md_generator/text/__init__.py +1 -0
- mdengine-0.1.0/src/md_generator/text/api/__init__.py +1 -0
- mdengine-0.1.0/src/md_generator/text/api/convert_runner.py +24 -0
- mdengine-0.1.0/src/md_generator/text/api/jobs.py +93 -0
- mdengine-0.1.0/src/md_generator/text/api/main.py +183 -0
- mdengine-0.1.0/src/md_generator/text/api/mcp_server.py +38 -0
- mdengine-0.1.0/src/md_generator/text/api/mcp_setup.py +78 -0
- mdengine-0.1.0/src/md_generator/text/api/query_options.py +17 -0
- mdengine-0.1.0/src/md_generator/text/api/settings.py +24 -0
- mdengine-0.1.0/src/md_generator/text/convert_impl.py +72 -0
- mdengine-0.1.0/src/md_generator/text/converter.py +73 -0
- mdengine-0.1.0/src/md_generator/text/format_detect.py +53 -0
- mdengine-0.1.0/src/md_generator/text/md_emit_json.py +144 -0
- mdengine-0.1.0/src/md_generator/text/md_emit_txt.py +115 -0
- mdengine-0.1.0/src/md_generator/text/md_emit_xml.py +127 -0
- mdengine-0.1.0/src/md_generator/text/options.py +27 -0
- mdengine-0.1.0/src/md_generator/word/__init__.py +3 -0
- mdengine-0.1.0/src/md_generator/word/api/__init__.py +1 -0
- mdengine-0.1.0/src/md_generator/word/api/convert_util.py +41 -0
- mdengine-0.1.0/src/md_generator/word/api/jobs.py +86 -0
- mdengine-0.1.0/src/md_generator/word/api/main.py +172 -0
- mdengine-0.1.0/src/md_generator/word/api/mcp_server.py +98 -0
- mdengine-0.1.0/src/md_generator/word/artifact.py +30 -0
- mdengine-0.1.0/src/md_generator/word/converter.py +192 -0
- mdengine-0.1.0/src/md_generator/word/settings.py +58 -0
- mdengine-0.1.0/src/md_generator/xlsx/__init__.py +8 -0
- mdengine-0.1.0/src/md_generator/xlsx/api/__init__.py +1 -0
- mdengine-0.1.0/src/md_generator/xlsx/api/app.py +293 -0
- mdengine-0.1.0/src/md_generator/xlsx/convert_config.py +58 -0
- mdengine-0.1.0/src/md_generator/xlsx/converter.py +67 -0
- mdengine-0.1.0/src/md_generator/xlsx/converter_core.py +135 -0
- mdengine-0.1.0/src/md_generator/xlsx/excel_reader.py +170 -0
- mdengine-0.1.0/src/md_generator/xlsx/markdown_emitter.py +143 -0
- mdengine-0.1.0/src/md_generator/xlsx/mcp_server.py +50 -0
- mdengine-0.1.0/src/mdengine.egg-info/PKG-INFO +509 -0
- mdengine-0.1.0/src/mdengine.egg-info/SOURCES.txt +105 -0
- mdengine-0.1.0/src/mdengine.egg-info/dependency_links.txt +1 -0
- mdengine-0.1.0/src/mdengine.egg-info/entry_points.txt +8 -0
- mdengine-0.1.0/src/mdengine.egg-info/requires.txt +80 -0
- mdengine-0.1.0/src/mdengine.egg-info/top_level.txt +1 -0
mdengine-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 mdengine contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
mdengine-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,509 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mdengine
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convert PDF, Office, images, text/JSON/XML, and ZIP archives to Markdown.
|
|
5
|
+
Author: mdengine contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/vishal7090/md-generator
|
|
8
|
+
Project-URL: Repository, https://github.com/vishal7090/md-generator
|
|
9
|
+
Project-URL: Issues, https://github.com/vishal7090/md-generator/issues
|
|
10
|
+
Keywords: markdown,pdf,docx,pptx,xlsx,ocr,zip
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
13
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
14
|
+
Requires-Python: >=3.10
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Provides-Extra: pdf
|
|
18
|
+
Requires-Dist: pymupdf>=1.24.0; extra == "pdf"
|
|
19
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "pdf"
|
|
20
|
+
Provides-Extra: word
|
|
21
|
+
Requires-Dist: mammoth>=1.8.0; extra == "word"
|
|
22
|
+
Requires-Dist: markdownify>=0.13.0; extra == "word"
|
|
23
|
+
Provides-Extra: ppt
|
|
24
|
+
Requires-Dist: python-pptx>=1.0.0; extra == "ppt"
|
|
25
|
+
Requires-Dist: Pillow>=10.0.0; extra == "ppt"
|
|
26
|
+
Requires-Dist: lxml>=5.0.0; extra == "ppt"
|
|
27
|
+
Requires-Dist: olefile>=0.47; extra == "ppt"
|
|
28
|
+
Requires-Dist: openpyxl>=3.1.0; extra == "ppt"
|
|
29
|
+
Requires-Dist: mammoth>=1.8.0; extra == "ppt"
|
|
30
|
+
Requires-Dist: markdownify>=0.14.0; extra == "ppt"
|
|
31
|
+
Requires-Dist: pymupdf>=1.24.0; extra == "ppt"
|
|
32
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "ppt"
|
|
33
|
+
Requires-Dist: pytesseract>=0.3.10; extra == "ppt"
|
|
34
|
+
Provides-Extra: xlsx
|
|
35
|
+
Requires-Dist: openpyxl>=3.1.0; extra == "xlsx"
|
|
36
|
+
Provides-Extra: image
|
|
37
|
+
Requires-Dist: Pillow>=10.0.0; extra == "image"
|
|
38
|
+
Provides-Extra: image-ocr
|
|
39
|
+
Requires-Dist: Pillow>=10.0.0; extra == "image-ocr"
|
|
40
|
+
Requires-Dist: pytesseract>=0.3.10; extra == "image-ocr"
|
|
41
|
+
Requires-Dist: numpy>=1.24.0; extra == "image-ocr"
|
|
42
|
+
Requires-Dist: paddlepaddle>=2.5.0; extra == "image-ocr"
|
|
43
|
+
Requires-Dist: paddleocr>=2.7.0.3; extra == "image-ocr"
|
|
44
|
+
Requires-Dist: easyocr>=1.7.0; extra == "image-ocr"
|
|
45
|
+
Provides-Extra: text
|
|
46
|
+
Provides-Extra: archive
|
|
47
|
+
Requires-Dist: Pillow>=10.0.0; extra == "archive"
|
|
48
|
+
Requires-Dist: pytesseract>=0.3.10; extra == "archive"
|
|
49
|
+
Provides-Extra: api
|
|
50
|
+
Requires-Dist: fastapi>=0.115.0; extra == "api"
|
|
51
|
+
Requires-Dist: uvicorn[standard]>=0.32.0; extra == "api"
|
|
52
|
+
Requires-Dist: python-multipart>=0.0.12; extra == "api"
|
|
53
|
+
Requires-Dist: httpx>=0.27.0; extra == "api"
|
|
54
|
+
Requires-Dist: pydantic-settings>=2.0.0; extra == "api"
|
|
55
|
+
Provides-Extra: mcp
|
|
56
|
+
Requires-Dist: mcp>=1.2.0; extra == "mcp"
|
|
57
|
+
Requires-Dist: fastmcp>=2.3.0; extra == "mcp"
|
|
58
|
+
Provides-Extra: dev
|
|
59
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
60
|
+
Requires-Dist: httpx>=0.27.0; extra == "dev"
|
|
61
|
+
Requires-Dist: mdengine[api]; extra == "dev"
|
|
62
|
+
Requires-Dist: mdengine[mcp]; extra == "dev"
|
|
63
|
+
Provides-Extra: all
|
|
64
|
+
Requires-Dist: pymupdf>=1.24.0; extra == "all"
|
|
65
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "all"
|
|
66
|
+
Requires-Dist: mammoth>=1.8.0; extra == "all"
|
|
67
|
+
Requires-Dist: markdownify>=0.14.0; extra == "all"
|
|
68
|
+
Requires-Dist: python-pptx>=1.0.0; extra == "all"
|
|
69
|
+
Requires-Dist: Pillow>=10.0.0; extra == "all"
|
|
70
|
+
Requires-Dist: lxml>=5.0.0; extra == "all"
|
|
71
|
+
Requires-Dist: olefile>=0.47; extra == "all"
|
|
72
|
+
Requires-Dist: openpyxl>=3.1.0; extra == "all"
|
|
73
|
+
Requires-Dist: pytesseract>=0.3.10; extra == "all"
|
|
74
|
+
Requires-Dist: numpy>=1.24.0; extra == "all"
|
|
75
|
+
Requires-Dist: paddlepaddle>=2.5.0; extra == "all"
|
|
76
|
+
Requires-Dist: paddleocr>=2.7.0.3; extra == "all"
|
|
77
|
+
Requires-Dist: easyocr>=1.7.0; extra == "all"
|
|
78
|
+
Requires-Dist: fastapi>=0.115.0; extra == "all"
|
|
79
|
+
Requires-Dist: uvicorn[standard]>=0.32.0; extra == "all"
|
|
80
|
+
Requires-Dist: python-multipart>=0.0.12; extra == "all"
|
|
81
|
+
Requires-Dist: httpx>=0.27.0; extra == "all"
|
|
82
|
+
Requires-Dist: pydantic-settings>=2.0.0; extra == "all"
|
|
83
|
+
Requires-Dist: mcp>=1.2.0; extra == "all"
|
|
84
|
+
Requires-Dist: fastmcp>=2.3.0; extra == "all"
|
|
85
|
+
Dynamic: license-file
|
|
86
|
+
|
|
87
|
+
# mdengine
|
|
88
|
+
|
|
89
|
+
Single Python distribution for converting **PDF**, **Word (.docx)**, **PowerPoint (.pptx)**, **Excel (.xlsx/.xlsm)**, **images** (OCR), **plain text / JSON / XML**, and **ZIP archives** into **Markdown** (and related assets). Install only the extras you need; everything imports under the **`md_generator`** package.
|
|
90
|
+
|
|
91
|
+
- **PyPI name:** `mdengine` (import package: `md_generator`)
|
|
92
|
+
- **Source:** [github.com/vishal7090/md-generator](https://github.com/vishal7090/md-generator)
|
|
93
|
+
- **Python:** 3.10+
|
|
94
|
+
- **License:** [MIT](LICENSE)
|
|
95
|
+
|
|
96
|
+
**Quick links:** [On a new computer](#on-a-new-computer) · [Command-line execution](#command-line-execution) · [Python library](#python-library) · [HTTP API](#http-api-fastapi) · [MCP](#mcp-model-context-protocol) · [Development](#development) · [Code of Conduct](CODE_OF_CONDUCT.md)
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## On a new computer
|
|
101
|
+
|
|
102
|
+
Use this checklist the first time you run the tools on a machine that does not have the project yet.
|
|
103
|
+
|
|
104
|
+
1. **Install Python 3.10 or newer** from [python.org](https://www.python.org/downloads/) (Windows: enable **Add python.exe to PATH** in the installer). Confirm in a new terminal: `python --version`.
|
|
105
|
+
2. **(Recommended)** Create an isolated environment so dependencies do not clash with other projects:
|
|
106
|
+
```bash
|
|
107
|
+
python -m venv .venv
|
|
108
|
+
```
|
|
109
|
+
Then activate it: **Windows (PowerShell)** `.\.venv\Scripts\Activate.ps1` · **Windows (CMD)** `.venv\Scripts\activate.bat` · **macOS / Linux** `source .venv/bin/activate`.
|
|
110
|
+
3. **Install this package with the extras you need** (see [Optional dependency extras](#optional-dependency-extras) for what each extra does):
|
|
111
|
+
```bash
|
|
112
|
+
pip install "mdengine[pdf,word]"
|
|
113
|
+
```
|
|
114
|
+
If the package is not on PyPI yet, clone [the repository](https://github.com/vishal7090/md-generator), `cd` into the repo root, then:
|
|
115
|
+
```bash
|
|
116
|
+
pip install -e ".[pdf,word]"
|
|
117
|
+
```
|
|
118
|
+
4. **Confirm the CLI is on your PATH:** `md-pdf --help` (or `md-word --help`, etc.). If you see “command not found”, the folder where `pip` puts scripts (often `.venv\Scripts` on Windows or `.venv/bin` on Unix) must be on your `PATH`, or you must run commands from an **activated** virtual environment.
|
|
119
|
+
5. **Run one conversion** with a real file path, for example:
|
|
120
|
+
```bash
|
|
121
|
+
md-pdf path/to/report.pdf out.md
|
|
122
|
+
```
|
|
123
|
+
Full flags and every `md-*` command are in [Command-line execution](#command-line-execution).
|
|
124
|
+
|
|
125
|
+
---
|
|
126
|
+
|
|
127
|
+
## Installation
|
|
128
|
+
|
|
129
|
+
From the repository root (editable install for development):
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
pip install -e .
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
With format-specific and HTTP extras:
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
pip install -e ".[pdf,word,api]"
|
|
139
|
+
pip install -e ".[ppt,xlsx,image,archive,api,mcp]"
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
From PyPI (once published):
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
pip install "mdengine[pdf,word]"
|
|
146
|
+
pip install "mdengine[all]"
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### Optional dependency extras
|
|
150
|
+
|
|
151
|
+
| Extra | Purpose |
|
|
152
|
+
|--------|---------|
|
|
153
|
+
| `pdf` | PDF extraction (PyMuPDF, pdfplumber) |
|
|
154
|
+
| `word` | DOCX → Markdown (mammoth, markdownify) |
|
|
155
|
+
| `ppt` | PPTX and embedded content (python-pptx, Pillow, lxml, mammoth, PyMuPDF, …) |
|
|
156
|
+
| `xlsx` | Excel → Markdown (openpyxl) |
|
|
157
|
+
| `image` | Image I/O for OCR pipelines (Pillow) |
|
|
158
|
+
| `image-ocr` | Heavy OCR backends (pytesseract, paddle, easyocr, …) |
|
|
159
|
+
| `text` | TXT / JSON / XML converter (stdlib-oriented; marker extra) |
|
|
160
|
+
| `archive` | ZIP → Markdown layout (Pillow; optional tesseract for inline image OCR) |
|
|
161
|
+
| `api` | FastAPI, uvicorn, python-multipart, httpx, pydantic-settings |
|
|
162
|
+
| `mcp` | MCP servers (`mcp`, `fastmcp` where used) |
|
|
163
|
+
| `dev` | pytest + API/MCP test helpers |
|
|
164
|
+
| `all` | Large superset of dependencies (use only if you need everything) |
|
|
165
|
+
|
|
166
|
+
Files nested inside an archive (PDFs, Office documents, further ZIPs) require the corresponding extras to be installed (e.g. `archive` plus `pdf` for PDFs inside a ZIP).
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## Command-line execution
|
|
171
|
+
|
|
172
|
+
All converters can be run from a terminal after you install the package (with the right **extras** for that format). Each tool is a normal executable on your `PATH` (no need to open Python yourself unless you choose the shim workflow below).
|
|
173
|
+
|
|
174
|
+
### 1. Install (once)
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
pip install "mdengine[pdf,word]" # adjust extras: ppt, xlsx, image, archive, text, …
|
|
178
|
+
# or from a clone:
|
|
179
|
+
pip install -e ".[pdf,word,archive]"
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
### 2. Check that the command is available
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
md-pdf --help
|
|
186
|
+
md-zip --help
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
If the shell reports “command not found”, ensure the Python **Scripts** directory is on your `PATH` (same place `pip` installs console scripts).
|
|
190
|
+
|
|
191
|
+
### 3. Commands (command-line entry points)
|
|
192
|
+
|
|
193
|
+
| Command | Implements | One-line example |
|
|
194
|
+
|---------|------------|------------------|
|
|
195
|
+
| `md-pdf` | `md_generator.pdf.converter:main` | `md-pdf report.pdf out.md` |
|
|
196
|
+
| `md-word` | `md_generator.word.converter:main` | `md-word notes.docx body.md` |
|
|
197
|
+
| `md-ppt` | `md_generator.ppt.converter:main` | `md-ppt deck.pptx ./ppt-out` |
|
|
198
|
+
| `md-xlsx` | `md_generator.xlsx.converter:main` | `md-xlsx -i data.xlsx -o ./excel-out` (also **`.csv`**) |
|
|
199
|
+
| `md-image` | `md_generator.image.converter:main` | `md-image ./scans page.md` |
|
|
200
|
+
| `md-text` | `md_generator.text.converter:main` | `md-text config.xml out.md` |
|
|
201
|
+
| `md-zip` | `md_generator.archive.converter:main` | `md-zip bundle.zip ./zip-out` |
|
|
202
|
+
|
|
203
|
+
Every command accepts **`-h` / `--help`** for full flags (artifact layout, OCR, ZIP options, etc.).
|
|
204
|
+
|
|
205
|
+
### 4. Copy-paste examples (terminal)
|
|
206
|
+
|
|
207
|
+
**bash / macOS / Linux**
|
|
208
|
+
|
|
209
|
+
```bash
|
|
210
|
+
md-pdf manual.pdf ./artifact --artifact-layout
|
|
211
|
+
md-word letter.docx letter.md --images-dir ./letter-images
|
|
212
|
+
md-ppt slides.pptx ./ppt-artifact --artifact-layout
|
|
213
|
+
md-xlsx -i sales.xlsx -o ./md-sheets --split
|
|
214
|
+
md-xlsx -i export.csv -o ./csv-out
|
|
215
|
+
md-image ./photos ocr.md --engines tess --strategy best
|
|
216
|
+
md-text data.json data.md
|
|
217
|
+
md-zip archive.zip ./unzipped-md
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
**Windows PowerShell** (same commands; use backslashes for paths if you prefer)
|
|
221
|
+
|
|
222
|
+
```powershell
|
|
223
|
+
md-pdf .\manual.pdf .\out\doc.md
|
|
224
|
+
md-zip .\archive.zip .\zip-out
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
**Windows CMD**
|
|
228
|
+
|
|
229
|
+
```cmd
|
|
230
|
+
md-pdf manual.pdf out\doc.md
|
|
231
|
+
md-zip archive.zip zip-out
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
### 5. Run without `pip install` (repo clone + `PYTHONPATH`)
|
|
235
|
+
|
|
236
|
+
The folders `pdf-to-md/`, `word-to-md/`, … each contain a thin `converter.py` that calls the same code as `md-pdf`, `md-word`, etc. From the **repository root**, point Python at `src` so that `md_generator` can be imported, then run the shim:
|
|
237
|
+
|
|
238
|
+
**PowerShell**
|
|
239
|
+
|
|
240
|
+
```powershell
|
|
241
|
+
$env:PYTHONPATH = "$PWD\src"
|
|
242
|
+
python pdf-to-md\converter.py input.pdf out.md
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
**CMD**
|
|
246
|
+
|
|
247
|
+
```cmd
|
|
248
|
+
set PYTHONPATH=src
|
|
249
|
+
python pdf-to-md\converter.py input.pdf out.md
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
**bash**
|
|
253
|
+
|
|
254
|
+
```bash
|
|
255
|
+
PYTHONPATH=src python pdf-to-md/converter.py input.pdf out.md
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
### 6. Convert every file in `docs/` (strictly command-line)
|
|
259
|
+
|
|
260
|
+
To process **all supported files** under the [`docs/`](docs/) folder using only the installed **`md-*`** tools (no Python snippets), use the batch driver:
|
|
261
|
+
|
|
262
|
+
| Platform | Command (run from **repository root** unless noted) |
|
|
263
|
+
|----------|------------------------------------------------------|
|
|
264
|
+
| Windows | `powershell -ExecutionPolicy Bypass -File scripts/run-docs-cli.ps1` |
|
|
265
|
+
| Windows | Or double-click / run [`docs/run-all-cli.cmd`](docs/run-all-cli.cmd) (changes to repo root, then runs the script on `docs\`) |
|
|
266
|
+
| macOS / Linux | `bash scripts/run-docs-cli.sh` |
|
|
267
|
+
|
|
268
|
+
Optional environment variables for the shell script: `DOCS_DIR`, `OUT_DIR`, `IMAGE_ENGINES` (default `tess`). PowerShell script parameters: `-DocsDir`, `-OutDir`, `-ImageEngines`.
|
|
269
|
+
|
|
270
|
+
Outputs are written to **`docs/cli-output/<basename>/`** (one subfolder per input file). **`.csv`** files are converted with **`md-xlsx`** (same engine as Excel). **`.md`** files are skipped.
|
|
271
|
+
|
|
272
|
+
---
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
## Python library
|
|
276
|
+
|
|
277
|
+
Import from `md_generator.<format>` after installing the matching extras.
|
|
278
|
+
|
|
279
|
+
### PDF
|
|
280
|
+
|
|
281
|
+
```python
|
|
282
|
+
from pathlib import Path
|
|
283
|
+
from md_generator.pdf.pdf_extract import ConvertOptions, convert_pdf
|
|
284
|
+
from md_generator.pdf.utils import resolve_output
|
|
285
|
+
|
|
286
|
+
pdf = Path("input.pdf")
|
|
287
|
+
out = resolve_output(Path("out-dir"), artifact_layout=True, images_dir=None)
|
|
288
|
+
convert_pdf(pdf, out, ConvertOptions(verbose=True))
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
### Word (DOCX)
|
|
292
|
+
|
|
293
|
+
```python
|
|
294
|
+
from pathlib import Path
|
|
295
|
+
from md_generator.word.converter import convert_docx_to_markdown
|
|
296
|
+
|
|
297
|
+
convert_docx_to_markdown(
|
|
298
|
+
Path("input.docx"),
|
|
299
|
+
Path("out/body.md"),
|
|
300
|
+
images_dir=Path("out/images"),
|
|
301
|
+
verbose=False,
|
|
302
|
+
)
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
### PowerPoint
|
|
306
|
+
|
|
307
|
+
```python
|
|
308
|
+
from pathlib import Path
|
|
309
|
+
from md_generator.ppt.convert_impl import convert_pptx
|
|
310
|
+
from md_generator.ppt.options import ConvertOptions
|
|
311
|
+
|
|
312
|
+
convert_pptx(
|
|
313
|
+
Path("slides.pptx"),
|
|
314
|
+
Path("artifact-dir"),
|
|
315
|
+
ConvertOptions(artifact_layout=True, extract_embedded_deep=False),
|
|
316
|
+
)
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
### Excel
|
|
320
|
+
|
|
321
|
+
```python
|
|
322
|
+
from pathlib import Path
|
|
323
|
+
from md_generator.xlsx.convert_config import ConvertConfig
|
|
324
|
+
from md_generator.xlsx.converter_core import convert_excel_to_markdown
|
|
325
|
+
|
|
326
|
+
result = convert_excel_to_markdown(
|
|
327
|
+
Path("book.xlsx"),
|
|
328
|
+
Path("out-dir"),
|
|
329
|
+
config=ConvertConfig(),
|
|
330
|
+
)
|
|
331
|
+
print(result.paths_written)
|
|
332
|
+
```
|
|
333
|
+
|
|
334
|
+
### Images (OCR)
|
|
335
|
+
|
|
336
|
+
```python
|
|
337
|
+
from pathlib import Path
|
|
338
|
+
from md_generator.image.convert_impl import ConvertOptions, convert_images
|
|
339
|
+
|
|
340
|
+
convert_images(
|
|
341
|
+
Path("scan.png"),
|
|
342
|
+
Path("out.md"),
|
|
343
|
+
ConvertOptions(
|
|
344
|
+
engines=("tess",),
|
|
345
|
+
strategy="best",
|
|
346
|
+
title="OCR",
|
|
347
|
+
tess_lang="eng",
|
|
348
|
+
tesseract_cmd=None,
|
|
349
|
+
paddle_lang="en",
|
|
350
|
+
paddle_use_angle_cls=True,
|
|
351
|
+
easy_langs=("en",),
|
|
352
|
+
verbose=False,
|
|
353
|
+
),
|
|
354
|
+
)
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
### Text / JSON / XML
|
|
358
|
+
|
|
359
|
+
```python
|
|
360
|
+
from pathlib import Path
|
|
361
|
+
from md_generator.text.convert_impl import convert_text_file
|
|
362
|
+
from md_generator.text.options import ConvertOptions
|
|
363
|
+
|
|
364
|
+
convert_text_file(
|
|
365
|
+
Path("data.json"),
|
|
366
|
+
Path("out.md"),
|
|
367
|
+
ConvertOptions(artifact_layout=False, verbose=False),
|
|
368
|
+
)
|
|
369
|
+
```
|
|
370
|
+
|
|
371
|
+
### ZIP archive
|
|
372
|
+
|
|
373
|
+
```python
|
|
374
|
+
from pathlib import Path
|
|
375
|
+
from md_generator.archive.convert_impl import convert_zip
|
|
376
|
+
from md_generator.archive.options import ConvertOptions
|
|
377
|
+
|
|
378
|
+
convert_zip(
|
|
379
|
+
Path("upload.zip"),
|
|
380
|
+
Path("artifact-out"),
|
|
381
|
+
ConvertOptions(
|
|
382
|
+
enable_office=True,
|
|
383
|
+
use_image_to_md=True,
|
|
384
|
+
verbose=False,
|
|
385
|
+
),
|
|
386
|
+
)
|
|
387
|
+
```
|
|
388
|
+
|
|
389
|
+
`repo_root` on `ConvertOptions` is **deprecated and ignored**; converters are loaded in-process from `md_generator`.
|
|
390
|
+
|
|
391
|
+
---
|
|
392
|
+
|
|
393
|
+
## HTTP API (FastAPI)
|
|
394
|
+
|
|
395
|
+
All format APIs follow a similar pattern:
|
|
396
|
+
|
|
397
|
+
- **`POST /convert/sync`** — upload a file; response is often a **ZIP** (artifact bundle) for larger formats.
|
|
398
|
+
- **`POST /convert/jobs`** — async job; returns `job_id`.
|
|
399
|
+
- **`GET /convert/jobs/{job_id}`** — status.
|
|
400
|
+
- **`GET /convert/jobs/{job_id}/download`** — download result when ready.
|
|
401
|
+
|
|
402
|
+
Upload field name is **`file`** (multipart form). Use `httpx` or `curl -F "file=@path/to/file"`.
|
|
403
|
+
|
|
404
|
+
### Run with Uvicorn
|
|
405
|
+
|
|
406
|
+
Install `mdengine[api]` plus the format extra(s), then run the **`app`** object from the table below.
|
|
407
|
+
|
|
408
|
+
| Service | Uvicorn target | Required extras (typical) |
|
|
409
|
+
|---------|----------------|---------------------------|
|
|
410
|
+
| PDF | `md_generator.pdf.api.main:app` | `pdf`, `api` |
|
|
411
|
+
| Word | `md_generator.word.api.main:app` | `word`, `api`, `mcp` (Word mounts FastMCP) |
|
|
412
|
+
| PPTX | `md_generator.ppt.api.main:app` | `ppt`, `api`, `mcp` |
|
|
413
|
+
| XLSX | `md_generator.xlsx.api.app:app` | `xlsx`, `api` |
|
|
414
|
+
| Image | `md_generator.image.api.main:app` | `image`, `api`, `mcp` |
|
|
415
|
+
| Text/JSON/XML | `md_generator.text.api.main:app` | `text`, `api`, `mcp` |
|
|
416
|
+
| ZIP | `md_generator.archive.api.main:app` | `archive`, `api`, `mcp` (+ extras for nested office/PDF) |
|
|
417
|
+
|
|
418
|
+
Examples:
|
|
419
|
+
|
|
420
|
+
```bash
|
|
421
|
+
uvicorn md_generator.pdf.api.main:app --host 127.0.0.1 --port 8001
|
|
422
|
+
uvicorn md_generator.word.api.main:app --host 127.0.0.1 --port 8002
|
|
423
|
+
uvicorn md_generator.archive.api.main:app --host 127.0.0.1 --port 8010
|
|
424
|
+
```
|
|
425
|
+
|
|
426
|
+
### MCP over HTTP on the same server
|
|
427
|
+
|
|
428
|
+
These apps mount an MCP HTTP app at **`/mcp`** (Streamable HTTP / framework-specific). Start the API as above, then point an MCP client at `http://<host>:<port>/mcp` where supported.
|
|
429
|
+
|
|
430
|
+
### Environment variables (limits & CORS)
|
|
431
|
+
|
|
432
|
+
Each service uses its own environment-variable prefix (values are often read from a `.env` file next to the process):
|
|
433
|
+
|
|
434
|
+
| Service | Prefix | Examples |
|
|
435
|
+
|---------|--------|----------|
|
|
436
|
+
| PDF | `PDF_TO_MD_` | `PDF_TO_MD_MAX_UPLOAD_MB`, `PDF_TO_MD_MAX_SYNC_UPLOAD_MB`, `PDF_TO_MD_TEMP_DIR`, `PDF_TO_MD_CORS_ORIGINS` |
|
|
437
|
+
| Word | `WORD_TO_MD_` | `WORD_TO_MD_MAX_UPLOAD_MB`, `WORD_TO_MD_MAX_SYNC_UPLOAD_MB`, `WORD_TO_MD_JOB_TTL_SECONDS`, `WORD_TO_MD_TEMP_DIR`, `WORD_TO_MD_CORS_ORIGINS` |
|
|
438
|
+
| ZIP | `ZIP_TO_MD_` | `ZIP_TO_MD_MAX_UPLOAD_MB`, `ZIP_TO_MD_MAX_SYNC_UPLOAD_MB`, `ZIP_TO_MD_JOB_TTL_SECONDS`, `ZIP_TO_MD_TEMP_DIR`, `ZIP_TO_MD_CORS_ORIGINS`, optional image post-pass defaults |
|
|
439
|
+
| PPTX | `PPT_TO_MD_` | `PPT_TO_MD_MAX_UPLOAD_MB`, … |
|
|
440
|
+
| Text | `TXT_JSON_XML_TO_MD_` | same pattern |
|
|
441
|
+
| XLSX | `XLSX_TO_MD_` | `XLSX_TO_MD_TEMP_DIR`, `XLSX_TO_MD_CORS_ORIGINS`, etc. (see `md_generator.xlsx.api.app`) |
|
|
442
|
+
|
|
443
|
+
Exact variable names match the `ApiSettings` / helper functions in each `api/settings` or `api/app` module.
|
|
444
|
+
|
|
445
|
+
---
|
|
446
|
+
|
|
447
|
+
## MCP (Model Context Protocol)
|
|
448
|
+
|
|
449
|
+
Two usage patterns:
|
|
450
|
+
|
|
451
|
+
1. **Bundled with FastAPI** — run Uvicorn as in the previous section; use path **`/mcp`** on the same host/port.
|
|
452
|
+
2. **Standalone process** — run a small `__main__` module (stdio, SSE, or streamable-http) for use with Cursor, Claude Desktop, or other MCP hosts.
|
|
453
|
+
|
|
454
|
+
### Standalone MCP processes
|
|
455
|
+
|
|
456
|
+
| Converter | Command (examples) |
|
|
457
|
+
|-----------|---------------------|
|
|
458
|
+
| ZIP | `python -m md_generator.archive.api.mcp_server` / `--transport sse` / `--transport streamable-http` |
|
|
459
|
+
| Text/JSON/XML | `python -m md_generator.text.api.mcp_server` |
|
|
460
|
+
| Word (FastMCP) | `python -m md_generator.word.api.mcp_server` / `--transport stdio` (default) or `streamable-http`, plus `--host` / `--port` when needed |
|
|
461
|
+
| PDF (FastMCP) | `python -m md_generator.pdf.api.mcp_server` / `--transport stdio` / `sse` / `streamable-http` |
|
|
462
|
+
| PPTX | `python -m md_generator.ppt.api.mcp_server` (see module docstring for flags) |
|
|
463
|
+
| Image | `python -m md_generator.image.api.mcp_server` (see module for CLI) |
|
|
464
|
+
|
|
465
|
+
**Word** and **XLSX** each also ship a small runner script (`run.py`) in the repo:
|
|
466
|
+
|
|
467
|
+
```bash
|
|
468
|
+
python word-to-md/run.py api --host 127.0.0.1 --port 8002
|
|
469
|
+
python word-to-md/run.py mcp --transport stdio
|
|
470
|
+
|
|
471
|
+
python xlsx-to-md/run.py api --port 8003
|
|
472
|
+
python xlsx-to-md/run.py mcp --transport stdio
|
|
473
|
+
```
|
|
474
|
+
|
|
475
|
+
The XLSX MCP server is built in code (`build_mcp_server()` in `md_generator.xlsx.mcp_server`) and is mounted on the XLSX FastAPI app when MCP dependencies are installed.
|
|
476
|
+
|
|
477
|
+
Install **`mdengine[mcp]`** (and usually **`[api]`** when using HTTP) for MCP-related imports to resolve.
|
|
478
|
+
|
|
479
|
+
---
|
|
480
|
+
|
|
481
|
+
## Development
|
|
482
|
+
|
|
483
|
+
```bash
|
|
484
|
+
pip install -e ".[dev,all]" # or a smaller subset of extras
|
|
485
|
+
python -m pytest
|
|
486
|
+
```
|
|
487
|
+
|
|
488
|
+
Tests live under each legacy folder’s `tests/` directory (e.g. `pdf-to-md/tests/`); `pyproject.toml` configures `pythonpath = ["src"]` so `md_generator` resolves without a separate `PYTHONPATH`.
|
|
489
|
+
|
|
490
|
+
---
|
|
491
|
+
|
|
492
|
+
## Repository layout
|
|
493
|
+
|
|
494
|
+
| Path | Role |
|
|
495
|
+
|------|------|
|
|
496
|
+
| `LICENSE` | MIT license text |
|
|
497
|
+
| `CODE_OF_CONDUCT.md` | [Contributor Covenant](https://www.contributor-covenant.org/) 2.1 |
|
|
498
|
+
| `src/md_generator/` | **Library source** (all formats + `api` subpackages) |
|
|
499
|
+
| `pyproject.toml` | Packaging, extras, CLI entry points, pytest |
|
|
500
|
+
| `*-to-md/` | **Docs, tests, fixtures**, thin `converter.py` shims, some `run.py` helpers |
|
|
501
|
+
| `README.md` | This document |
|
|
502
|
+
|
|
503
|
+
For deeper behavior per format, see the original README files under each `*-to-md/` folder where they still exist.
|
|
504
|
+
|
|
505
|
+
---
|
|
506
|
+
|
|
507
|
+
## Legal
|
|
508
|
+
|
|
509
|
+
This project is released under the [MIT License](LICENSE). A copy of the license text is included in the repository root.
|