mcp-documents-reader 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_documents_reader-1.0.0/.gitignore +214 -0
- mcp_documents_reader-1.0.0/LICENSE +21 -0
- mcp_documents_reader-1.0.0/PKG-INFO +136 -0
- mcp_documents_reader-1.0.0/README.md +122 -0
- mcp_documents_reader-1.0.0/README.zh-CN.md +122 -0
- mcp_documents_reader-1.0.0/mcp_documents_reader.py +223 -0
- mcp_documents_reader-1.0.0/pyproject.toml +39 -0
- mcp_documents_reader-1.0.0/server.json +21 -0
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
# ---------------------------
|
|
2
|
+
# Project Specific Exclusions
|
|
3
|
+
# ---------------------------
|
|
4
|
+
|
|
5
|
+
# Document directory with test files
|
|
6
|
+
/documents/
|
|
7
|
+
|
|
8
|
+
# Test files
|
|
9
|
+
/test_*.py
|
|
10
|
+
**/test_*.py
|
|
11
|
+
|
|
12
|
+
# Trae IDE configuration
|
|
13
|
+
.trae/
|
|
14
|
+
|
|
15
|
+
# ---------------------------
|
|
16
|
+
# Python Exclusions
|
|
17
|
+
# ---------------------------
|
|
18
|
+
|
|
19
|
+
# Virtual environment
|
|
20
|
+
venv/
|
|
21
|
+
env/
|
|
22
|
+
.env/
|
|
23
|
+
.venv/
|
|
24
|
+
|
|
25
|
+
# Python bytecode
|
|
26
|
+
*.py[cod]
|
|
27
|
+
*$py.class
|
|
28
|
+
|
|
29
|
+
# Compiled output
|
|
30
|
+
build/
|
|
31
|
+
dist/
|
|
32
|
+
*.egg-info/
|
|
33
|
+
.installed.cfg
|
|
34
|
+
*.egg
|
|
35
|
+
|
|
36
|
+
# PIP files
|
|
37
|
+
pip-selfcheck.json
|
|
38
|
+
requirements.txt
|
|
39
|
+
|
|
40
|
+
# ---------------------------
|
|
41
|
+
# IDE Exclusions
|
|
42
|
+
# ---------------------------
|
|
43
|
+
|
|
44
|
+
# VSCode
|
|
45
|
+
.vscode/
|
|
46
|
+
*.code-workspace
|
|
47
|
+
|
|
48
|
+
# IntelliJ IDEA
|
|
49
|
+
.idea/
|
|
50
|
+
*.iml
|
|
51
|
+
*.ipr
|
|
52
|
+
*.iws
|
|
53
|
+
|
|
54
|
+
# Eclipse
|
|
55
|
+
.classpath
|
|
56
|
+
.project
|
|
57
|
+
.settings/
|
|
58
|
+
|
|
59
|
+
# PyCharm
|
|
60
|
+
.idea/
|
|
61
|
+
|
|
62
|
+
# Sublime Text
|
|
63
|
+
*.sublime-project
|
|
64
|
+
*.sublime-workspace
|
|
65
|
+
|
|
66
|
+
# Vim
|
|
67
|
+
*.swp
|
|
68
|
+
*.swo
|
|
69
|
+
|
|
70
|
+
# Emacs
|
|
71
|
+
*~
|
|
72
|
+
|
|
73
|
+
# ---------------------------
|
|
74
|
+
# OS Exclusions
|
|
75
|
+
# ---------------------------
|
|
76
|
+
|
|
77
|
+
# Windows
|
|
78
|
+
Thumbs.db
|
|
79
|
+
Thumbs.db:encryptable
|
|
80
|
+
ehthumbs.db
|
|
81
|
+
ehthumbs_vista.db
|
|
82
|
+
*.stackdump
|
|
83
|
+
[Dd]esktop.ini
|
|
84
|
+
$RECYCLE.BIN/
|
|
85
|
+
*.cab
|
|
86
|
+
*.msi
|
|
87
|
+
*.msix
|
|
88
|
+
*.msm
|
|
89
|
+
*.msp
|
|
90
|
+
*.lnk
|
|
91
|
+
|
|
92
|
+
# macOS
|
|
93
|
+
.DS_Store
|
|
94
|
+
.AppleDouble
|
|
95
|
+
.LSOverride
|
|
96
|
+
._*
|
|
97
|
+
.Spotlight-V100
|
|
98
|
+
.Trashes
|
|
99
|
+
ehthumbs.db
|
|
100
|
+
Thumbs.db
|
|
101
|
+
|
|
102
|
+
# Linux
|
|
103
|
+
*~
|
|
104
|
+
.fuse_hidden*
|
|
105
|
+
.directory
|
|
106
|
+
.Trash-*
|
|
107
|
+
.nfs*
|
|
108
|
+
|
|
109
|
+
# ---------------------------
|
|
110
|
+
# Build and Log Exclusions
|
|
111
|
+
# ---------------------------
|
|
112
|
+
|
|
113
|
+
# Logs
|
|
114
|
+
logs/
|
|
115
|
+
*.log
|
|
116
|
+
npm-debug.log*
|
|
117
|
+
yarn-debug.log*
|
|
118
|
+
yarn-error.log*
|
|
119
|
+
lerna-debug.log*
|
|
120
|
+
.pnpm-debug.log*
|
|
121
|
+
|
|
122
|
+
# Temporary files
|
|
123
|
+
*.tmp
|
|
124
|
+
*.temp
|
|
125
|
+
.cache/
|
|
126
|
+
.temp/
|
|
127
|
+
.tmp/
|
|
128
|
+
|
|
129
|
+
# Environment files
|
|
130
|
+
.env
|
|
131
|
+
.env.local
|
|
132
|
+
.env.development.local
|
|
133
|
+
.env.test.local
|
|
134
|
+
.env.production.local
|
|
135
|
+
.env.*.local
|
|
136
|
+
|
|
137
|
+
# Runtime data
|
|
138
|
+
pids
|
|
139
|
+
*.pid
|
|
140
|
+
*.seed
|
|
141
|
+
*.pid.lock
|
|
142
|
+
|
|
143
|
+
# Coverage directory used by tools like istanbul
|
|
144
|
+
coverage/
|
|
145
|
+
.nyc_output/
|
|
146
|
+
|
|
147
|
+
# Dependency directories
|
|
148
|
+
node_modules/
|
|
149
|
+
jspm_packages/
|
|
150
|
+
|
|
151
|
+
# Optional npm cache directory
|
|
152
|
+
.npm
|
|
153
|
+
|
|
154
|
+
# Optional eslint cache
|
|
155
|
+
.eslintcache
|
|
156
|
+
|
|
157
|
+
# Optional REPL history
|
|
158
|
+
.node_repl_history
|
|
159
|
+
|
|
160
|
+
# Output of 'npm pack'
|
|
161
|
+
*.tgz
|
|
162
|
+
|
|
163
|
+
# Yarn Integrity file
|
|
164
|
+
.yarn-integrity
|
|
165
|
+
|
|
166
|
+
# parcel-bundler cache (https://parceljs.org/)
|
|
167
|
+
.cache
|
|
168
|
+
.parcel-cache
|
|
169
|
+
|
|
170
|
+
# Next.js build output
|
|
171
|
+
.next
|
|
172
|
+
out
|
|
173
|
+
|
|
174
|
+
# Nuxt.js build / generate output
|
|
175
|
+
.nuxt
|
|
176
|
+
dist
|
|
177
|
+
|
|
178
|
+
# Gatsby files
|
|
179
|
+
.cache/
|
|
180
|
+
# Comment in the public line in if your project uses Gatsby and not Next.js
|
|
181
|
+
# https://nextjs.org/blog/next-9-1#public-directory-support
|
|
182
|
+
# public
|
|
183
|
+
|
|
184
|
+
# vuepress build output
|
|
185
|
+
.vuepress/dist
|
|
186
|
+
|
|
187
|
+
# vuepress v2.x temp and cache directory
|
|
188
|
+
.temp
|
|
189
|
+
.cache
|
|
190
|
+
|
|
191
|
+
# Docusaurus cache and generated files
|
|
192
|
+
.docusaurus
|
|
193
|
+
|
|
194
|
+
# Serverless directories
|
|
195
|
+
.serverless/
|
|
196
|
+
|
|
197
|
+
# FuseBox cache
|
|
198
|
+
.fusebox/
|
|
199
|
+
|
|
200
|
+
# DynamoDB Local files
|
|
201
|
+
.dynamodb/
|
|
202
|
+
|
|
203
|
+
# TernJS port file
|
|
204
|
+
.tern-port
|
|
205
|
+
|
|
206
|
+
# Stores VSCode versions used for testing VSCode extensions
|
|
207
|
+
.vscode-test
|
|
208
|
+
|
|
209
|
+
# yarn v2
|
|
210
|
+
.yarn/cache
|
|
211
|
+
.yarn/unplugged
|
|
212
|
+
.yarn/build-state.yml
|
|
213
|
+
.yarn/install-state.gz
|
|
214
|
+
.pnp.*
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 玄同765
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mcp_documents_reader
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: An MCP enabled multi-format document reader supporting DOCX, PDF, TXT, and Excel files
|
|
5
|
+
Author-email: xt765 <xt765@foxmail.com>
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Keywords: document-reader,docx,excel,mcp,model-context-protocol,pdf
|
|
8
|
+
Requires-Python: >=3.8
|
|
9
|
+
Requires-Dist: mcp>=0.1.0
|
|
10
|
+
Requires-Dist: openpyxl>=3.0.10
|
|
11
|
+
Requires-Dist: pypdf2>=3.0.1
|
|
12
|
+
Requires-Dist: python-docx>=0.8.11
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# MCP Document Reader
|
|
16
|
+
|
|
17
|
+
<!-- mcp-name: io.github.xt765/mcp_documents_reader -->
|
|
18
|
+
|
|
19
|
+
[](https://blog.csdn.net/Yunyi_Chi)
|
|
20
|
+
[](https://github.com/xt765/mcp_documents_reader)
|
|
21
|
+
[](https://gitee.com/xt765/mcp_documents_reader)
|
|
22
|
+
[](https://github.com/xt765/mcp_documents_reader/blob/main/LICENSE)
|
|
23
|
+
[](https://www.python.org/downloads/)
|
|
24
|
+
|
|
25
|
+
MCP (Model Context Protocol) Document Reader - A powerful MCP tool for reading documents in multiple formats, enabling AI agents to truly "read" your documents.
|
|
26
|
+
|
|
27
|
+
GitHub Repository: [https://github.com/xt765/mcp_documents_reader](https://github.com/xt765/mcp_documents_reader)
|
|
28
|
+
Gitee Repository: [https://gitee.com/xt765/mcp_documents_reader](https://gitee.com/xt765/mcp_documents_reader)
|
|
29
|
+
|
|
30
|
+
## Features
|
|
31
|
+
|
|
32
|
+
- **Multi-format Support**: Supports 4 mainstream document formats: Excel (XLSX/XLS), DOCX, PDF, and TXT
|
|
33
|
+
- **MCP Protocol**: Compliant with MCP standards, can be used as a tool for AI assistants like Trae IDE
|
|
34
|
+
- **Easy Integration**: Simple configuration for immediate use
|
|
35
|
+
- **Reliable Performance**: Successfully tested and running in Trae IDE
|
|
36
|
+
- **File System Support**: Reads documents directly from the file system
|
|
37
|
+
|
|
38
|
+
## Supported Formats
|
|
39
|
+
|
|
40
|
+
| Format | Extensions | MIME Type | Features |
|
|
41
|
+
|--------|------------|-----------|----------|
|
|
42
|
+
| Excel | .xlsx, .xls | application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | Sheet and cell data extraction |
|
|
43
|
+
| DOCX | .docx | application/vnd.openxmlformats-officedocument.wordprocessingml.document | Text and structure extraction |
|
|
44
|
+
| PDF | .pdf | application/pdf | Text extraction |
|
|
45
|
+
| Text | .txt | text/plain | Plain text reading |
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
### Prerequisites
|
|
50
|
+
|
|
51
|
+
- Python 3.8 or higher
|
|
52
|
+
- MCP-enabled AI tool such as Trae IDE
|
|
53
|
+
|
|
54
|
+
### Installation Steps
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
# Clone the repository
|
|
58
|
+
git clone https://github.com/xt765/mcp_documents_reader.git
|
|
59
|
+
cd mcp_documents_reader
|
|
60
|
+
|
|
61
|
+
# Install dependencies
|
|
62
|
+
pip install -e .
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Configuration
|
|
66
|
+
|
|
67
|
+
### Using in Trae IDE
|
|
68
|
+
|
|
69
|
+
Add the following to your Trae IDE's MCP configuration:
|
|
70
|
+
|
|
71
|
+
#### Option 1: Using GitHub repository (Recommended)
|
|
72
|
+
```json
|
|
73
|
+
{
|
|
74
|
+
"mcpServers": {
|
|
75
|
+
"mcp-document-reader": {
|
|
76
|
+
"command": "uvx",
|
|
77
|
+
"args": [
|
|
78
|
+
"--from",
|
|
79
|
+
"git+https://github.com/xt765/mcp_documents_reader",
|
|
80
|
+
"mcp_documents_reader"
|
|
81
|
+
]
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
#### Option 2: Using Gitee repository
|
|
88
|
+
```json
|
|
89
|
+
{
|
|
90
|
+
"mcpServers": {
|
|
91
|
+
"mcp-document-reader": {
|
|
92
|
+
"command": "uvx",
|
|
93
|
+
"args": [
|
|
94
|
+
"--from",
|
|
95
|
+
"git+https://gitee.com/xt765/mcp_documents_reader",
|
|
96
|
+
"mcp_documents_reader"
|
|
97
|
+
]
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Environment Variables
|
|
104
|
+
|
|
105
|
+
- `DOCUMENT_DIRECTORY` - Directory where documents are stored (default: "./documents")
|
|
106
|
+
|
|
107
|
+
## Usage
|
|
108
|
+
|
|
109
|
+
### As an MCP Tool
|
|
110
|
+
|
|
111
|
+
After configuration, AI assistants can directly call the following tool:
|
|
112
|
+
|
|
113
|
+
#### read_document (Recommended)
|
|
114
|
+
Read any supported document type with a unified interface.
|
|
115
|
+
|
|
116
|
+
```
|
|
117
|
+
read_document(filename="example.docx")
|
|
118
|
+
read_document(filename="example.pdf")
|
|
119
|
+
read_document(filename="example.xlsx")
|
|
120
|
+
read_document(filename="example.txt")
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Tool Interface Details
|
|
124
|
+
|
|
125
|
+
### read_document
|
|
126
|
+
Read any supported document type.
|
|
127
|
+
|
|
128
|
+
**Parameters:**
|
|
129
|
+
|
|
130
|
+
| Parameter | Type | Required | Description |
|
|
131
|
+
|-----------|------|----------|-------------|
|
|
132
|
+
| filename | string | ✅ | Document file path, supports absolute or relative paths |
|
|
133
|
+
|
|
134
|
+
## License
|
|
135
|
+
|
|
136
|
+
MIT
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# MCP Document Reader
|
|
2
|
+
|
|
3
|
+
<!-- mcp-name: io.github.xt765/mcp_documents_reader -->
|
|
4
|
+
|
|
5
|
+
[CSDN Blog](https://blog.csdn.net/Yunyi_Chi)
|
|
6
|
+
[GitHub](https://github.com/xt765/mcp_documents_reader)
|
|
7
|
+
[Gitee](https://gitee.com/xt765/mcp_documents_reader)
|
|
8
|
+
[License: MIT](https://github.com/xt765/mcp_documents_reader/blob/main/LICENSE)
|
|
9
|
+
[Python 3.8+](https://www.python.org/downloads/)
|
|
10
|
+
|
|
11
|
+
MCP (Model Context Protocol) Document Reader - A powerful MCP tool for reading documents in multiple formats, enabling AI agents to truly "read" your documents.
|
|
12
|
+
|
|
13
|
+
GitHub Repository: [https://github.com/xt765/mcp_documents_reader](https://github.com/xt765/mcp_documents_reader)
|
|
14
|
+
Gitee Repository: [https://gitee.com/xt765/mcp_documents_reader](https://gitee.com/xt765/mcp_documents_reader)
|
|
15
|
+
|
|
16
|
+
## Features
|
|
17
|
+
|
|
18
|
+
- **Multi-format Support**: Supports 4 mainstream document formats: Excel (XLSX/XLS), DOCX, PDF, and TXT
|
|
19
|
+
- **MCP Protocol**: Compliant with MCP standards, can be used as a tool for AI assistants like Trae IDE
|
|
20
|
+
- **Easy Integration**: Simple configuration for immediate use
|
|
21
|
+
- **Reliable Performance**: Successfully tested and running in Trae IDE
|
|
22
|
+
- **File System Support**: Reads documents directly from the file system
|
|
23
|
+
|
|
24
|
+
## Supported Formats
|
|
25
|
+
|
|
26
|
+
| Format | Extensions | MIME Type | Features |
|
|
27
|
+
|--------|------------|-----------|----------|
|
|
28
|
+
| Excel | .xlsx, .xls | application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | Sheet and cell data extraction |
|
|
29
|
+
| DOCX | .docx | application/vnd.openxmlformats-officedocument.wordprocessingml.document | Text and structure extraction |
|
|
30
|
+
| PDF | .pdf | application/pdf | Text extraction |
|
|
31
|
+
| Text | .txt | text/plain | Plain text reading |
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
### Prerequisites
|
|
36
|
+
|
|
37
|
+
- Python 3.8 or higher
|
|
38
|
+
- MCP-enabled AI tool such as Trae IDE
|
|
39
|
+
|
|
40
|
+
### Installation Steps
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
# Clone the repository
|
|
44
|
+
git clone https://github.com/xt765/mcp_documents_reader.git
|
|
45
|
+
cd mcp_documents_reader
|
|
46
|
+
|
|
47
|
+
# Install dependencies
|
|
48
|
+
pip install -e .
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Configuration
|
|
52
|
+
|
|
53
|
+
### Using in Trae IDE
|
|
54
|
+
|
|
55
|
+
Add the following to your Trae IDE's MCP configuration:
|
|
56
|
+
|
|
57
|
+
#### Option 1: Using GitHub repository (Recommended)
|
|
58
|
+
```json
|
|
59
|
+
{
|
|
60
|
+
"mcpServers": {
|
|
61
|
+
"mcp-document-reader": {
|
|
62
|
+
"command": "uvx",
|
|
63
|
+
"args": [
|
|
64
|
+
"--from",
|
|
65
|
+
"git+https://github.com/xt765/mcp_documents_reader",
|
|
66
|
+
"mcp_documents_reader"
|
|
67
|
+
]
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
#### Option 2: Using Gitee repository
|
|
74
|
+
```json
|
|
75
|
+
{
|
|
76
|
+
"mcpServers": {
|
|
77
|
+
"mcp-document-reader": {
|
|
78
|
+
"command": "uvx",
|
|
79
|
+
"args": [
|
|
80
|
+
"--from",
|
|
81
|
+
"git+https://gitee.com/xt765/mcp_documents_reader",
|
|
82
|
+
"mcp_documents_reader"
|
|
83
|
+
]
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Environment Variables
|
|
90
|
+
|
|
91
|
+
- `DOCUMENT_DIRECTORY` - Directory where documents are stored (default: "./documents")
|
|
92
|
+
|
|
93
|
+
## Usage
|
|
94
|
+
|
|
95
|
+
### As an MCP Tool
|
|
96
|
+
|
|
97
|
+
After configuration, AI assistants can directly call the following tool:
|
|
98
|
+
|
|
99
|
+
#### read_document (Recommended)
|
|
100
|
+
Read any supported document type with a unified interface.
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
read_document(filename="example.docx")
|
|
104
|
+
read_document(filename="example.pdf")
|
|
105
|
+
read_document(filename="example.xlsx")
|
|
106
|
+
read_document(filename="example.txt")
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Tool Interface Details
|
|
110
|
+
|
|
111
|
+
### read_document
|
|
112
|
+
Read any supported document type.
|
|
113
|
+
|
|
114
|
+
**Parameters:**
|
|
115
|
+
|
|
116
|
+
| Parameter | Type | Required | Description |
|
|
117
|
+
|-----------|------|----------|-------------|
|
|
118
|
+
| filename | string | ✅ | Path of the document to read. Relative paths are resolved against `DOCUMENT_DIRECTORY`; absolute paths are used as-is |
|
|
119
|
+
|
|
120
|
+
## License
|
|
121
|
+
|
|
122
|
+
MIT
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# MCP 文档读取器
|
|
2
|
+
|
|
3
|
+
<!-- mcp-name: io.github.xt765/mcp_documents_reader -->
|
|
4
|
+
|
|
5
|
+
[](https://blog.csdn.net/Yunyi_Chi)
|
|
6
|
+
[](https://github.com/xt765/mcp_documents_reader)
|
|
7
|
+
[](https://gitee.com/xt765/mcp_documents_reader)
|
|
8
|
+
[](https://github.com/xt765/mcp_documents_reader/blob/main/LICENSE)
|
|
9
|
+
[](https://www.python.org/downloads/)
|
|
10
|
+
|
|
11
|
+
MCP(模型上下文协议)文档读取器 - 一个强大的 MCP 工具,用于读取多种格式的文档,使 AI 智能体能够真正"读取"您的文档。
|
|
12
|
+
|
|
13
|
+
GitHub 仓库:[https://github.com/xt765/mcp_documents_reader](https://github.com/xt765/mcp_documents_reader)
|
|
14
|
+
Gitee 仓库:[https://gitee.com/xt765/mcp_documents_reader](https://gitee.com/xt765/mcp_documents_reader)
|
|
15
|
+
|
|
16
|
+
## 功能特性
|
|
17
|
+
|
|
18
|
+
- **多格式支持**:支持 4 种主流文档格式:Excel(XLSX/XLS)、DOCX、PDF 和 TXT
|
|
19
|
+
- **MCP 协议**:符合 MCP 标准,可作为 AI 助手(如 Trae IDE)的工具使用
|
|
20
|
+
- **易于集成**:简单配置即可立即使用
|
|
21
|
+
- **可靠性能**:已在 Trae IDE 中成功测试运行
|
|
22
|
+
- **文件系统支持**:直接从文件系统读取文档
|
|
23
|
+
|
|
24
|
+
## 支持的格式
|
|
25
|
+
|
|
26
|
+
| 格式 | 扩展名 | MIME 类型 | 特性 |
|
|
27
|
+
|------|--------|-----------|------|
|
|
28
|
+
| Excel | .xlsx, .xls | application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 工作表和单元格数据提取 |
|
|
29
|
+
| DOCX | .docx | application/vnd.openxmlformats-officedocument.wordprocessingml.document | 文本和结构提取 |
|
|
30
|
+
| PDF | .pdf | application/pdf | 文本提取 |
|
|
31
|
+
| Text | .txt | text/plain | 纯文本读取 |
|
|
32
|
+
|
|
33
|
+
## 安装
|
|
34
|
+
|
|
35
|
+
### 前提条件
|
|
36
|
+
|
|
37
|
+
- Python 3.8 或更高版本
|
|
38
|
+
- 支持 MCP 的 AI 工具,如 Trae IDE
|
|
39
|
+
|
|
40
|
+
### 安装步骤
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
# 克隆仓库
|
|
44
|
+
git clone https://github.com/xt765/mcp_documents_reader.git
|
|
45
|
+
cd mcp_documents_reader
|
|
46
|
+
|
|
47
|
+
# 安装依赖
|
|
48
|
+
pip install -e .
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## 配置
|
|
52
|
+
|
|
53
|
+
### 在 Trae IDE 中使用
|
|
54
|
+
|
|
55
|
+
将以下内容添加到 Trae IDE 的 MCP 配置中:
|
|
56
|
+
|
|
57
|
+
#### 选项 1:使用 GitHub 仓库(推荐)
|
|
58
|
+
```json
|
|
59
|
+
{
|
|
60
|
+
"mcpServers": {
|
|
61
|
+
"mcp-document-reader": {
|
|
62
|
+
"command": "uvx",
|
|
63
|
+
"args": [
|
|
64
|
+
"--from",
|
|
65
|
+
"git+https://github.com/xt765/mcp_documents_reader",
|
|
66
|
+
"mcp_documents_reader"
|
|
67
|
+
]
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
#### 选项 2:使用 Gitee 仓库
|
|
74
|
+
```json
|
|
75
|
+
{
|
|
76
|
+
"mcpServers": {
|
|
77
|
+
"mcp-document-reader": {
|
|
78
|
+
"command": "uvx",
|
|
79
|
+
"args": [
|
|
80
|
+
"--from",
|
|
81
|
+
"git+https://gitee.com/xt765/mcp_documents_reader",
|
|
82
|
+
"mcp_documents_reader"
|
|
83
|
+
]
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### 环境变量
|
|
90
|
+
|
|
91
|
+
- `DOCUMENT_DIRECTORY` - 存储文档的目录(默认:"./documents")
|
|
92
|
+
|
|
93
|
+
## 使用方法
|
|
94
|
+
|
|
95
|
+
### 作为 MCP 工具使用
|
|
96
|
+
|
|
97
|
+
配置完成后,AI 助手可以直接调用以下工具:
|
|
98
|
+
|
|
99
|
+
#### read_document(推荐)
|
|
100
|
+
使用统一接口读取任何支持的文档类型。
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
read_document(filename="example.docx")
|
|
104
|
+
read_document(filename="example.pdf")
|
|
105
|
+
read_document(filename="example.xlsx")
|
|
106
|
+
read_document(filename="example.txt")
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## 工具接口详情
|
|
110
|
+
|
|
111
|
+
### read_document
|
|
112
|
+
读取任何支持的文档类型。
|
|
113
|
+
|
|
114
|
+
**参数:**
|
|
115
|
+
|
|
116
|
+
| 参数 | 类型 | 必填 | 描述 |
|
|
117
|
+
|------|------|------|------|
|
|
118
|
+
| filename | string | ✅ | 文档文件路径,支持绝对路径或相对路径 |
|
|
119
|
+
|
|
120
|
+
## 许可证
|
|
121
|
+
|
|
122
|
+
MIT
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
import os
from abc import ABC, abstractmethod
from contextlib import asynccontextmanager
from dataclasses import dataclass
from typing import AsyncIterator, Dict, Type

from docx import Document as DocxDocument
from mcp.server.fastmcp import Context, FastMCP
from openpyxl import load_workbook
from PyPDF2 import PdfReader as PyPdfReader
|
|
10
|
+
|
|
11
|
+
# Base directory that document filenames are joined onto.
# Read once at import time from the DOCUMENT_DIRECTORY environment variable;
# falls back to the relative path "./documents" when the variable is unset.
DOCUMENT_DIRECTORY = os.getenv("DOCUMENT_DIRECTORY", "./documents")
|
|
13
|
+
|
|
14
|
+
@dataclass
class AppContext:
    """State object yielded by the server lifespan hook."""

    # Directory that holds the documents served by this process.
    document_directory: str
|
|
18
|
+
|
|
19
|
+
@asynccontextmanager
async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]:
    """Create the document directory on startup and expose it as lifespan state.

    :param server: The FastMCP instance being started (not used by this hook).
    :return: Async context manager yielding the ``AppContext`` for this run.
    """
    # Make sure the directory exists before any tool call tries to read from it.
    os.makedirs(DOCUMENT_DIRECTORY, exist_ok=True)
    # Nothing needs tearing down on shutdown, so no code follows the yield.
    yield AppContext(document_directory=DOCUMENT_DIRECTORY)


# Initialize the MCP server. The lifespan hook has to be handed to the
# constructor: FastMCP wires it into the underlying server at construction
# time, so assigning ``mcp.lifespan`` afterwards is never consulted and the
# hook would silently never run.
mcp = FastMCP("Document Reader", lifespan=app_lifespan)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# ------------------------- Document Reader Architecture -------------------------
|
|
38
|
+
|
|
39
|
+
class DocumentReader(ABC):
    """Common interface implemented by every format-specific reader."""

    @abstractmethod
    def read(self, file_path: str) -> str:
        """Return the text extracted from the document at ``file_path``."""
        ...
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class DocxReader(DocumentReader):
    """Reader that pulls plain text out of DOCX files."""

    def read(self, file_path: str) -> str:
        """Return paragraph text followed by table rows as one string.

        Non-empty paragraphs come first, one per line. Each table row that
        has at least one non-empty cell follows as a tab-separated line.

        :param file_path: Path of the DOCX file to open.
        :return: The extracted text, a "No text found" notice when nothing was
            extracted, or an "Error reading DOCX: ..." message on failure.
        """
        try:
            document = DocxDocument(file_path)

            # Body paragraphs, skipping the empty ones.
            lines = [para.text for para in document.paragraphs if para.text]

            # Tables: a cell's paragraphs are joined with spaces, and empty
            # cells are dropped before the row is joined with tabs.
            for table in document.tables:
                for row in table.rows:
                    cell_values = (
                        " ".join([p.text for p in cell.paragraphs]).strip()
                        for cell in row.cells
                    )
                    populated = [value for value in cell_values if value]
                    if populated:
                        lines.append("\t".join(populated))

            return "\n".join(lines) or "No text found in the DOCX."
        except Exception as exc:
            return f"Error reading DOCX: {str(exc)}"
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class PdfReader(DocumentReader):
    """Reader that extracts text from PDF files page by page."""

    def read(self, file_path: str) -> str:
        """Return the text of every page, separated by blank lines.

        :param file_path: Path of the PDF file to open.
        :return: The extracted text, a "No text found" notice when no page
            yielded text, or an "Error reading PDF: ..." message on failure.
        """
        try:
            with open(file_path, "rb") as stream:
                chunks = []
                # Pages are read while the file is still open.
                for page in PyPdfReader(stream).pages:
                    content = page.extract_text()
                    # Pages that yield no text are skipped.
                    if content:
                        chunks.append(content.strip())

                combined = "\n\n".join(chunks)
                if not combined:
                    return "No text found in the PDF."
                return combined
        except Exception as exc:
            return f"Error reading PDF: {str(exc)}"
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class TxtReader(DocumentReader):
    """TXT document reader that tries several encodings in turn."""

    def read(self, file_path: str) -> str:
        """Read a text file, trying each candidate encoding until one decodes.

        :param file_path: Path of the text file to open.
        :return: The file content, a "No text found" notice for an empty file,
            or an "Error reading TXT: ..." message when the file cannot be
            opened or decoded.
        """
        # Candidate encodings in priority order. "ansi" is a codec alias only
        # on Windows; "latin-1" maps every byte and is the final catch-all.
        encodings = ['utf-8', 'gbk', 'gb2312', 'ansi', 'latin-1']

        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    text = f.read()
            except (UnicodeDecodeError, LookupError):
                # UnicodeDecodeError: wrong guess for this file.
                # LookupError: this platform does not know the codec (e.g.
                # "ansi" outside Windows). Either way, move on to the next
                # candidate instead of aborting before "latin-1" is tried.
                continue
            except Exception as e:
                return f"Error reading TXT: {str(e)}"
            return text if text else "No text found in the TXT file."

        return "Error reading TXT: Could not decode file with any supported encoding."
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
class ExcelReader(DocumentReader):
    """Excel document reader implementation"""

    def read(self, file_path: str) -> str:
        """Read every sheet of a workbook and return its cells as text.

        Each sheet starts with a ``=== Sheet: <name> ===`` header, followed by
        one tab-separated line per row that has at least one non-empty cell,
        and ends with a blank line.

        :param file_path: Path of the workbook to open.
        :return: The extracted text, or an "Error reading Excel: ..." message
            on failure.
        """
        try:
            wb = load_workbook(file_path, read_only=True)
            try:
                text = []

                # Extract text from all sheets
                for sheet_name in wb.sheetnames:
                    sheet = wb[sheet_name]
                    text.append(f"=== Sheet: {sheet_name} ===")

                    # Extract cell content; None cells become empty strings
                    for row in sheet.iter_rows(values_only=True):
                        row_text = [str(cell) if cell is not None else "" for cell in row]
                        if any(row_text):  # Only add non-empty rows
                            text.append("\t".join(row_text))

                    text.append("")  # Add blank line between sheets
            finally:
                # Read-only workbooks keep the file open until closed, so
                # close it even when reading a sheet raises.
                wb.close()

            extracted_text = "\n".join(text)
            return extracted_text if extracted_text else "No text found in the Excel file."
        except Exception as e:
            return f"Error reading Excel: {str(e)}"
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class DocumentReaderFactory:
    """Selects the reader class that matches a file's extension."""

    # Lower-case extension (with leading dot) -> reader class.
    _readers: Dict[str, Type[DocumentReader]] = {
        '.txt': TxtReader,
        '.docx': DocxReader,
        '.pdf': PdfReader,
        '.xlsx': ExcelReader,
        '.xls': ExcelReader
    }

    @staticmethod
    def _extension_of(file_path: str) -> str:
        """Return the lower-cased extension of ``file_path``, dot included."""
        return os.path.splitext(file_path.lower())[1]

    @classmethod
    def get_reader(cls, file_path: str) -> DocumentReader:
        """Instantiate the reader registered for the file's extension.

        :param file_path: Path whose extension selects the reader.
        :return: A new reader instance.
        :raises ValueError: If no reader is registered for the extension.
        """
        extension = cls._extension_of(file_path)
        reader_cls = cls._readers.get(extension)
        if reader_cls is None:
            raise ValueError(f"Unsupported document type: {extension}")
        return reader_cls()

    @classmethod
    def is_supported(cls, file_path: str) -> bool:
        """Tell whether a reader is registered for the file's extension."""
        return cls._extension_of(file_path) in cls._readers
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# ------------------------- Tool Functions -------------------------
|
|
179
|
+
|
|
180
|
+
def _get_document_path(ctx, filename: str) -> str:
    """Build the path of a document from the context or the environment.

    :param ctx: Object that may carry a ``document_directory`` attribute;
        when it does not, the module-level ``DOCUMENT_DIRECTORY`` is used.
    :param filename: Document name or path. It is joined onto the directory
        with ``os.path.join``, so an absolute ``filename`` is returned as-is.
    :return: The joined path.
    """
    try:
        doc_dir = getattr(ctx, 'document_directory', DOCUMENT_DIRECTORY)
    except Exception:
        # A failing attribute lookup must not break the tool call. Unlike a
        # bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        doc_dir = DOCUMENT_DIRECTORY
    return os.path.join(doc_dir, filename)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
@mcp.tool()
def read_document(ctx: Context, filename: str) -> str:
    """
    Reads and extracts text from a specified document file.
    Supports multiple document types: TXT, DOCX, PDF, Excel (XLSX, XLS).

    :param ctx: FastMCP context, injected by the server (not a tool argument)
    :param filename: Name of the document file to read
    :return: Extracted text from the document, or an "Error ..." message
    """
    # The ``Context`` annotation is what makes FastMCP inject ``ctx``. Without
    # it the parameter is exposed to clients as a required tool argument.
    doc_path = _get_document_path(ctx, filename)

    if not os.path.exists(doc_path):
        return f"Error: File '{filename}' not found at {doc_path}."

    if not DocumentReaderFactory.is_supported(doc_path):
        return f"Error: Unsupported document type for file '{filename}'."

    try:
        reader = DocumentReaderFactory.get_reader(doc_path)
        return reader.read(doc_path)
    except Exception as e:
        # Report failures as text so the tool call itself does not raise.
        return f"Error reading document: {str(e)}"
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def main() -> None:
    """Start the MCP server (target of the ``mcp_documents_reader`` script)."""
    mcp.run()


# Allow running the module directly as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "mcp_documents_reader"
|
|
3
|
+
version = "1.0.0"
|
|
4
|
+
description = "An MCP enabled multi-format document reader supporting DOCX, PDF, TXT, and Excel files"
|
|
5
|
+
keywords = ["mcp", "model-context-protocol", "document-reader", "pdf", "docx", "excel"]
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "xt765", email = "xt765@foxmail.com" }
|
|
8
|
+
]
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"mcp>=0.1.0",
|
|
13
|
+
"python-docx>=0.8.11",
|
|
14
|
+
"PyPDF2>=3.0.1",
|
|
15
|
+
"openpyxl>=3.0.10"
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[project.scripts]
|
|
19
|
+
mcp_documents_reader = "mcp_documents_reader:main"
|
|
20
|
+
|
|
21
|
+
[build-system]
|
|
22
|
+
requires = ["hatchling>=1.14.0"]
|
|
23
|
+
build-backend = "hatchling.build"
|
|
24
|
+
|
|
25
|
+
[tool.hatch.build.targets.wheel]
|
|
26
|
+
include = ["mcp_documents_reader.py"]
|
|
27
|
+
|
|
28
|
+
[tool.ruff]
|
|
29
|
+
select = ["E", "F", "I"]
|
|
30
|
+
line-length = 88
|
|
31
|
+
|
|
32
|
+
[tool.pytest.ini_options]
|
|
33
|
+
pythonpath = "."
|
|
34
|
+
testpaths = ["tests"]
|
|
35
|
+
addopts = "-v"
|
|
36
|
+
|
|
37
|
+
[tool.black]
|
|
38
|
+
line-length = 88
|
|
39
|
+
target-version = ["py38"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json",
|
|
3
|
+
"name": "io.github.xt765/mcp_documents_reader",
|
|
4
|
+
"title": "MCP Document Reader",
|
|
5
|
+
"description": "An MCP enabled multi-format document reader supporting DOCX, PDF, TXT, and Excel files",
|
|
6
|
+
"repository": {
|
|
7
|
+
"url": "https://github.com/xt765/mcp_documents_reader",
|
|
8
|
+
"source": "github"
|
|
9
|
+
},
|
|
10
|
+
"version": "1.0.0",
|
|
11
|
+
"packages": [
|
|
12
|
+
{
|
|
13
|
+
"registryType": "pypi",
|
|
14
|
+
"identifier": "mcp_documents_reader",
|
|
15
|
+
"version": "1.0.0",
|
|
16
|
+
"transport": {
|
|
17
|
+
"type": "stdio"
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
]
|
|
21
|
+
}
|