doc2md-helper 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doc2md_helper-0.1.0/PKG-INFO +15 -0
- doc2md_helper-0.1.0/README.md +288 -0
- doc2md_helper-0.1.0/doc2md_helper.egg-info/PKG-INFO +15 -0
- doc2md_helper-0.1.0/doc2md_helper.egg-info/SOURCES.txt +15 -0
- doc2md_helper-0.1.0/doc2md_helper.egg-info/dependency_links.txt +1 -0
- doc2md_helper-0.1.0/doc2md_helper.egg-info/entry_points.txt +2 -0
- doc2md_helper-0.1.0/doc2md_helper.egg-info/requires.txt +11 -0
- doc2md_helper-0.1.0/doc2md_helper.egg-info/top_level.txt +1 -0
- doc2md_helper-0.1.0/mcp_document_converter/__init__.py +29 -0
- doc2md_helper-0.1.0/mcp_document_converter/cli.py +413 -0
- doc2md_helper-0.1.0/mcp_document_converter/docx2markdown.py +225 -0
- doc2md_helper-0.1.0/mcp_document_converter/excel2markdown.py +273 -0
- doc2md_helper-0.1.0/mcp_document_converter/pdf2markdown.py +259 -0
- doc2md_helper-0.1.0/mcp_document_converter/pdf2markdown_markitdown.py +104 -0
- doc2md_helper-0.1.0/mcp_document_converter/server.py +148 -0
- doc2md_helper-0.1.0/pyproject.toml +30 -0
- doc2md_helper-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: doc2md-helper
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Document converter for converting PDF, DOCX, Excel to Markdown, with MCP support
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Requires-Dist: mcp
|
|
7
|
+
Requires-Dist: markitdown
|
|
8
|
+
Requires-Dist: openpyxl
|
|
9
|
+
Requires-Dist: python-docx
|
|
10
|
+
Requires-Dist: mammoth
|
|
11
|
+
Requires-Dist: PyPDF2
|
|
12
|
+
Provides-Extra: marker
|
|
13
|
+
Requires-Dist: marker-pdf; extra == "marker"
|
|
14
|
+
Requires-Dist: torch; extra == "marker"
|
|
15
|
+
Requires-Dist: bitsandbytes; extra == "marker"
|
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
<h1 align="center">mcp-document-converter</h1>
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<strong>一键安装,让 Claude Code 直接读取 PDF、Word、Excel!</strong>
|
|
5
|
+
</p>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/License-MIT-yellow.svg?style=flat-square" alt="MIT Licence"></a>
|
|
9
|
+
<a href="https://www.python.org/"><img src="https://img.shields.io/badge/python-3.10%2B-blue.svg?style=flat-square" alt="Python 3.10+"></a>
|
|
10
|
+
<a href="https://modelcontextprotocol.io/"><img src="https://img.shields.io/badge/MCP-compatible-green.svg?style=flat-square" alt="MCP"></a>
|
|
11
|
+
</p>
|
|
12
|
+
|
|
13
|
+
<br>
|
|
14
|
+
|
|
15
|
+
将 PDF、Word、Excel 文件转换为 Markdown,支持 MCP 集成和直接 CLI 调用两种方式。
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## 🚀 快速开始(Windows 用户)
|
|
20
|
+
|
|
21
|
+
### 方式一:一键安装(最简单)
|
|
22
|
+
|
|
23
|
+
**双击运行 `install.bat`** 即可!(注意:`install.bat` 未包含在 PyPI 发布包中;通过 pip 安装的用户请使用方式二。)
|
|
24
|
+
|
|
25
|
+
安装完成后,重启 Claude Code 即可使用。
|
|
26
|
+
|
|
27
|
+
### 方式二:手动安装
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install doc2md-helper
|
|
31
|
+
mcp-document-converter install --platform claude-code # 为 Claude Code 配置
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## 💡 两种使用方式
|
|
37
|
+
|
|
38
|
+
### 方式一:直接让 Claude Code 调用(推荐)
|
|
39
|
+
|
|
40
|
+
安装后,在 Claude Code 中直接说:
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
请帮我读取这个 report.pdf 文件
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Claude Code 会先询问你想用哪种方式:
|
|
47
|
+
1. **快速版本 (MarkItDown)** - 轻量快速,适合文本型 PDF
|
|
48
|
+
2. **高精度版本 (Marker)** - 高精度 OCR,适合扫描版或复杂排版的 PDF
|
|
49
|
+
|
|
50
|
+
选择后,自动开始转换!
|
|
51
|
+
|
|
52
|
+
### 方式二:使用命令行
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
# 转换 PDF(轻量版)
|
|
56
|
+
mcp-document-converter convert-pdf document.pdf
|
|
57
|
+
|
|
58
|
+
# 转换 PDF(高精度 OCR 版)
|
|
59
|
+
mcp-document-converter convert-pdf-marker scanned.pdf
|
|
60
|
+
|
|
61
|
+
# 转换 Word
|
|
62
|
+
mcp-document-converter convert-docx report.docx
|
|
63
|
+
|
|
64
|
+
# 转换 Excel
|
|
65
|
+
mcp-document-converter convert-excel data.xlsx
|
|
66
|
+
|
|
67
|
+
# 保存到文件
|
|
68
|
+
mcp-document-converter convert-pdf document.pdf -o output.md
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## 📦 安装选项
|
|
74
|
+
|
|
75
|
+
### 基本安装
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
pip install doc2md-helper
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### 带 Marker 支持(PDF 高精度 OCR)
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
pip install "doc2md-helper[marker]"
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## ⚙️ 配置平台
|
|
90
|
+
|
|
91
|
+
### Claude Code
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
mcp-document-converter install --platform claude-code
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
配置将写入 `~/.claude/settings.json`。重启 Claude Code 后生效。
|
|
98
|
+
|
|
99
|
+
### 其他平台
|
|
100
|
+
|
|
101
|
+
同样支持 Cursor、Windsurf、Zed、Continue 等:
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
mcp-document-converter install --platform cursor
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
## 🎯 PDF 转换的两个版本
|
|
110
|
+
|
|
111
|
+
| 版本 | 说明 | 使用场景 | 依赖 |
|
|
112
|
+
|------|------|---------|------|
|
|
113
|
+
| **MarkItDown** | 轻量快速 | 文本型 PDF | markitdown |
|
|
114
|
+
| **Marker** | 高精度 OCR | 扫描版、复杂排版 | marker-pdf, torch |
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## 📚 完整 CLI 参考
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
# 转换命令
|
|
122
|
+
mcp-document-converter convert-pdf <文件路径>
|
|
123
|
+
mcp-document-converter convert-pdf-marker <文件路径>
|
|
124
|
+
mcp-document-converter convert-docx <文件路径>
|
|
125
|
+
mcp-document-converter convert-excel <文件路径>
|
|
126
|
+
|
|
127
|
+
# 配置命令
|
|
128
|
+
mcp-document-converter install # 自动配置
|
|
129
|
+
mcp-document-converter install --platform claude-code
|
|
130
|
+
mcp-document-converter install --platform cursor
|
|
131
|
+
|
|
132
|
+
# MCP 服务命令
|
|
133
|
+
mcp-document-converter serve
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
## 工作原理
|
|
139
|
+
|
|
140
|
+
安装本包后,AI 编码工具可以通过 MCP 调用以下文档转换工具:
|
|
141
|
+
|
|
142
|
+
1. 用户在对话中请求读取 PDF/Word/Excel 文件
|
|
143
|
+
2. AI 编码工具调用对应的 MCP 转换工具
|
|
144
|
+
3. 返回文件的 Markdown 内容,供 AI 理解和分析
|
|
145
|
+
|
|
146
|
+
### PDF 转换的两个版本
|
|
147
|
+
|
|
148
|
+
- **Marker 版本**(`convert_pdf_with_marker`):基于 marker-pdf 库,支持高精度 OCR 和复杂排版识别,适合扫描版 PDF。需要 GPU 和额外的依赖(`pip install "doc2md-helper[marker]"`)。
|
|
149
|
+
- **MarkItDown 版本**(`convert_pdf_with_markitdown`):基于 MarkItDown 库,轻量快速,适合文本型 PDF。无需 GPU。
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
## MCP 工具一览
|
|
154
|
+
|
|
155
|
+
| 工具 | 说明 | 依赖 |
|
|
156
|
+
|------|------|------|
|
|
157
|
+
| `convert_pdf_with_marker` | 使用 marker 库将 PDF 转为 Markdown(高精度 OCR) | marker-pdf, torch |
|
|
158
|
+
| `convert_pdf_with_markitdown` | 使用 MarkItDown 库将 PDF 转为 Markdown(轻量) | markitdown |
|
|
159
|
+
| `convert_docx_to_markdown` | 将 Word 文档(.docx/.doc)转为 Markdown | mammoth / python-docx / textract |
|
|
160
|
+
| `convert_excel_to_markdown` | 将 Excel 文件(.xlsx/.xls)转为 Markdown | markitdown / openpyxl |
|
|
161
|
+
|
|
162
|
+
---
|
|
163
|
+
|
|
164
|
+
## CLI 参考
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
mcp-document-converter install # 自动检测并配置所有支持的平台
|
|
168
|
+
mcp-document-converter install --platform claude-code # 仅配置 Claude Code
|
|
169
|
+
mcp-document-converter install --platform cursor # 仅配置 Cursor
|
|
170
|
+
mcp-document-converter serve # 启动 MCP 服务器
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## 安装
|
|
176
|
+
|
|
177
|
+
### 基本安装
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
pip install doc2md-helper
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
### 带 Marker 支持(PDF 高精度转换)
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
pip install "doc2md-helper[marker]"
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
### 开发模式
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
git clone <repository-url>
|
|
193
|
+
cd mcp-document-converter
|
|
194
|
+
pip install -e .
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
## 配置平台
|
|
200
|
+
|
|
201
|
+
### Claude Code
|
|
202
|
+
|
|
203
|
+
```bash
|
|
204
|
+
mcp-document-converter install --platform claude-code
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
配置将写入 `~/.claude/settings.json`。重启 Claude Code 后生效。
|
|
208
|
+
|
|
209
|
+
### Cursor
|
|
210
|
+
|
|
211
|
+
```bash
|
|
212
|
+
mcp-document-converter install --platform cursor
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
配置将写入当前项目的 `.cursor/mcp.json`。重启 Cursor 后生效。
|
|
216
|
+
|
|
217
|
+
### 手动配置
|
|
218
|
+
|
|
219
|
+
如果自动配置失败,可以手动在对应平台的 MCP 配置中添加:
|
|
220
|
+
|
|
221
|
+
```json
|
|
222
|
+
{
|
|
223
|
+
"mcpServers": {
|
|
224
|
+
"mcp-document-converter": {
|
|
225
|
+
"command": "mcp-document-converter",
|
|
226
|
+
"args": ["serve"]
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
}
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
如果使用 `uvx`:
|
|
233
|
+
|
|
234
|
+
```json
|
|
235
|
+
{
|
|
236
|
+
"mcpServers": {
|
|
237
|
+
"mcp-document-converter": {
|
|
238
|
+
"command": "uvx",
|
|
239
|
+
"args": ["mcp-document-converter", "serve"]
|
|
240
|
+
}
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
---
|
|
246
|
+
|
|
247
|
+
## 使用示例
|
|
248
|
+
|
|
249
|
+
### 在 Claude Code 中使用
|
|
250
|
+
|
|
251
|
+
安装并配置后,在对话中直接请求:
|
|
252
|
+
|
|
253
|
+
```
|
|
254
|
+
请读取 report.pdf 并总结内容
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
Claude Code 会自动调用 `convert_pdf_with_markitdown` 或 `convert_pdf_with_marker` 工具获取文件的 Markdown 内容。
|
|
258
|
+
|
|
259
|
+
### 指定使用哪个 PDF 转换版本
|
|
260
|
+
|
|
261
|
+
```
|
|
262
|
+
请使用 marker 版本转换 design.pdf(需要高精度 OCR)
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
```
|
|
266
|
+
请使用 markitdown 版本转换 notes.pdf(快速转换)
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
---
|
|
270
|
+
|
|
271
|
+
## 支持的格式
|
|
272
|
+
|
|
273
|
+
| 格式 | 扩展名 | 说明 |
|
|
274
|
+
|------|--------|------|
|
|
275
|
+
| PDF | .pdf | 支持文本型和扫描型 PDF |
|
|
276
|
+
| Word | .docx, .doc | 支持多种后端(mammoth, python-docx, textract) |
|
|
277
|
+
| Excel | .xlsx, .xls | 支持多工作表 |
|
|
278
|
+
|
|
279
|
+
---
|
|
280
|
+
|
|
281
|
+
## 开发
|
|
282
|
+
|
|
283
|
+
```bash
|
|
284
|
+
pip install -e .
|
|
285
|
+
pytest
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
添加新的文件格式支持:在 `mcp_document_converter/` 中创建新的转换模块,然后在 `server.py` 中注册为 MCP 工具。
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: doc2md-helper
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Document converter for converting PDF, DOCX, Excel to Markdown, with MCP support
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Requires-Dist: mcp
|
|
7
|
+
Requires-Dist: markitdown
|
|
8
|
+
Requires-Dist: openpyxl
|
|
9
|
+
Requires-Dist: python-docx
|
|
10
|
+
Requires-Dist: mammoth
|
|
11
|
+
Requires-Dist: PyPDF2
|
|
12
|
+
Provides-Extra: marker
|
|
13
|
+
Requires-Dist: marker-pdf; extra == "marker"
|
|
14
|
+
Requires-Dist: torch; extra == "marker"
|
|
15
|
+
Requires-Dist: bitsandbytes; extra == "marker"
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
doc2md_helper.egg-info/PKG-INFO
|
|
4
|
+
doc2md_helper.egg-info/SOURCES.txt
|
|
5
|
+
doc2md_helper.egg-info/dependency_links.txt
|
|
6
|
+
doc2md_helper.egg-info/entry_points.txt
|
|
7
|
+
doc2md_helper.egg-info/requires.txt
|
|
8
|
+
doc2md_helper.egg-info/top_level.txt
|
|
9
|
+
mcp_document_converter/__init__.py
|
|
10
|
+
mcp_document_converter/cli.py
|
|
11
|
+
mcp_document_converter/docx2markdown.py
|
|
12
|
+
mcp_document_converter/excel2markdown.py
|
|
13
|
+
mcp_document_converter/pdf2markdown.py
|
|
14
|
+
mcp_document_converter/pdf2markdown_markitdown.py
|
|
15
|
+
mcp_document_converter/server.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
mcp_document_converter
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Package entry point: re-exports the document-to-Markdown converters.
#
# The Marker-based PDF converter is an optional dependency, so its import is
# attempted first and an ImportError is tolerated. When that happens the name
# `convert_pdf_page_by_page` is still bound (to None) and `_has_marker`
# records that the converter is unavailable.
try:
    from mcp_document_converter.pdf2markdown import convert_pdf_page_by_page
except ImportError:
    convert_pdf_page_by_page = None
    _has_marker = False
else:
    _has_marker = True

# The remaining converters are imported unconditionally.
from mcp_document_converter.pdf2markdown_markitdown import (
    pdf_to_markdown_markitdown,
    pdf_to_markdown_simple,
)
from mcp_document_converter.docx2markdown import docx_to_markdown
from mcp_document_converter.excel2markdown import (
    excel_to_markdown,
    excel_to_markdown_simple,
)

# Public API. The Marker converter is listed last, and only when its import
# succeeded above.
__all__ = [
    "pdf_to_markdown_markitdown",
    "pdf_to_markdown_simple",
    "docx_to_markdown",
    "excel_to_markdown",
    "excel_to_markdown_simple",
    *(("convert_pdf_page_by_page",) if _has_marker else ()),
]
|