mcp-documents-reader 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,214 @@
1
+ # ---------------------------
2
+ # Project Specific Exclusions
3
+ # ---------------------------
4
+
5
+ # Document directory with test files
6
+ /documents/
7
+
8
+ # Test files
9
+ /test_*.py
10
+ **/test_*.py
11
+
12
+ # Trae IDE configuration
13
+ .trae/
14
+
15
+ # ---------------------------
16
+ # Python Exclusions
17
+ # ---------------------------
18
+
19
+ # Virtual environment
20
+ venv/
21
+ env/
22
+ .env/
23
+ .venv/
24
+
25
+ # Python bytecode
26
+ *.py[cod]
27
+ *$py.class
28
+
29
+ # Compiled output
30
+ build/
31
+ dist/
32
+ *.egg-info/
33
+ .installed.cfg
34
+ *.egg
35
+
36
+ # PIP files
37
+ pip-selfcheck.json
38
+ requirements.txt
39
+
40
+ # ---------------------------
41
+ # IDE Exclusions
42
+ # ---------------------------
43
+
44
+ # VSCode
45
+ .vscode/
46
+ *.code-workspace
47
+
48
+ # IntelliJ IDEA
49
+ .idea/
50
+ *.iml
51
+ *.ipr
52
+ *.iws
53
+
54
+ # Eclipse
55
+ .classpath
56
+ .project
57
+ .settings/
58
+
59
+ # PyCharm
60
+ .idea/
61
+
62
+ # Sublime Text
63
+ *.sublime-project
64
+ *.sublime-workspace
65
+
66
+ # Vim
67
+ *.swp
68
+ *.swo
69
+
70
+ # Emacs
71
+ *~
72
+
73
+ # ---------------------------
74
+ # OS Exclusions
75
+ # ---------------------------
76
+
77
+ # Windows
78
+ Thumbs.db
79
+ Thumbs.db:encryptable
80
+ ehthumbs.db
81
+ ehthumbs_vista.db
82
+ *.stackdump
83
+ [Dd]esktop.ini
84
+ $RECYCLE.BIN/
85
+ *.cab
86
+ *.msi
87
+ *.msix
88
+ *.msm
89
+ *.msp
90
+ *.lnk
91
+
92
+ # macOS
93
+ .DS_Store
94
+ .AppleDouble
95
+ .LSOverride
96
+ ._*
97
+ .Spotlight-V100
98
+ .Trashes
99
+ ehthumbs.db
100
+ Thumbs.db
101
+
102
+ # Linux
103
+ *~
104
+ .fuse_hidden*
105
+ .directory
106
+ .Trash-*
107
+ .nfs*
108
+
109
+ # ---------------------------
110
+ # Build and Log Exclusions
111
+ # ---------------------------
112
+
113
+ # Logs
114
+ logs/
115
+ *.log
116
+ npm-debug.log*
117
+ yarn-debug.log*
118
+ yarn-error.log*
119
+ lerna-debug.log*
120
+ .pnpm-debug.log*
121
+
122
+ # Temporary files
123
+ *.tmp
124
+ *.temp
125
+ .cache/
126
+ .temp/
127
+ .tmp/
128
+
129
+ # Environment files
130
+ .env
131
+ .env.local
132
+ .env.development.local
133
+ .env.test.local
134
+ .env.production.local
135
+ .env.*.local
136
+
137
+ # Runtime data
138
+ pids
139
+ *.pid
140
+ *.seed
141
+ *.pid.lock
142
+
143
+ # Coverage directory used by tools like istanbul
144
+ coverage/
145
+ .nyc_output/
146
+
147
+ # Dependency directories
148
+ node_modules/
149
+ jspm_packages/
150
+
151
+ # Optional npm cache directory
152
+ .npm
153
+
154
+ # Optional eslint cache
155
+ .eslintcache
156
+
157
+ # Optional REPL history
158
+ .node_repl_history
159
+
160
+ # Output of 'npm pack'
161
+ *.tgz
162
+
163
+ # Yarn Integrity file
164
+ .yarn-integrity
165
+
166
+ # parcel-bundler cache (https://parceljs.org/)
167
+ .cache
168
+ .parcel-cache
169
+
170
+ # Next.js build output
171
+ .next
172
+ out
173
+
174
+ # Nuxt.js build / generate output
175
+ .nuxt
176
+ dist
177
+
178
+ # Gatsby files
179
+ .cache/
180
+ # Uncomment the `public` line below if your project uses Gatsby and not Next.js
181
+ # https://nextjs.org/blog/next-9-1#public-directory-support
182
+ # public
183
+
184
+ # vuepress build output
185
+ .vuepress/dist
186
+
187
+ # vuepress v2.x temp and cache directory
188
+ .temp
189
+ .cache
190
+
191
+ # Docusaurus cache and generated files
192
+ .docusaurus
193
+
194
+ # Serverless directories
195
+ .serverless/
196
+
197
+ # FuseBox cache
198
+ .fusebox/
199
+
200
+ # DynamoDB Local files
201
+ .dynamodb/
202
+
203
+ # TernJS port file
204
+ .tern-port
205
+
206
+ # Stores VSCode versions used for testing VSCode extensions
207
+ .vscode-test
208
+
209
+ # yarn v2
210
+ .yarn/cache
211
+ .yarn/unplugged
212
+ .yarn/build-state.yml
213
+ .yarn/install-state.gz
214
+ .pnp.*
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 玄同765
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,136 @@
1
+ Metadata-Version: 2.4
2
+ Name: mcp_documents_reader
3
+ Version: 1.0.0
4
+ Summary: An MCP enabled multi-format document reader supporting DOCX, PDF, TXT, and Excel files
5
+ Author-email: xt765 <xt765@foxmail.com>
6
+ License-File: LICENSE
7
+ Keywords: document-reader,docx,excel,mcp,model-context-protocol,pdf
8
+ Requires-Python: >=3.8
9
+ Requires-Dist: mcp>=0.1.0
10
+ Requires-Dist: openpyxl>=3.0.10
11
+ Requires-Dist: pypdf2>=3.0.1
12
+ Requires-Dist: python-docx>=0.8.11
13
+ Description-Content-Type: text/markdown
14
+
15
+ # MCP Document Reader
16
+
17
+ <!-- mcp-name: io.github.xt765/mcp_documents_reader -->
18
+
19
+ [![CSDN Blog](https://img.shields.io/badge/CSDN-玄同765-orange.svg?style=flat&logo=csdn)](https://blog.csdn.net/Yunyi_Chi)
20
+ [![GitHub Repository](https://img.shields.io/badge/GitHub-mcp_documents_reader-black.svg?style=flat&logo=github)](https://github.com/xt765/mcp_documents_reader)
21
+ [![Gitee Repository](https://img.shields.io/badge/Gitee-mcp_documents_reader-red.svg?style=flat&logo=gitee)](https://gitee.com/xt765/mcp_documents_reader)
22
+ [![GitHub License](https://img.shields.io/github/license/xt765/mcp_documents_reader.svg?style=flat&logo=github)](https://github.com/xt765/mcp_documents_reader/blob/main/LICENSE)
23
+ [![Python Version](https://img.shields.io/badge/python-3.8%2B-blue.svg?style=flat&logo=python)](https://www.python.org/downloads/)
24
+
25
+ MCP (Model Context Protocol) Document Reader - A powerful MCP tool for reading documents in multiple formats, enabling AI agents to truly "read" your documents.
26
+
27
+ GitHub Repository: [https://github.com/xt765/mcp_documents_reader](https://github.com/xt765/mcp_documents_reader)
28
+ Gitee Repository: [https://gitee.com/xt765/mcp_documents_reader](https://gitee.com/xt765/mcp_documents_reader)
29
+
30
+ ## Features
31
+
32
+ - **Multi-format Support**: Supports 4 mainstream document formats: Excel (XLSX/XLS), DOCX, PDF, and TXT
33
+ - **MCP Protocol**: Compliant with MCP standards, can be used as a tool for AI assistants like Trae IDE
34
+ - **Easy Integration**: Simple configuration for immediate use
35
+ - **Reliable Performance**: Successfully tested and running in Trae IDE
36
+ - **File System Support**: Reads documents directly from the file system
37
+
38
+ ## Supported Formats
39
+
40
+ | Format | Extensions | MIME Type | Features |
41
+ |--------|------------|-----------|----------|
42
+ | Excel | .xlsx, .xls | application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | Sheet and cell data extraction |
43
+ | DOCX | .docx | application/vnd.openxmlformats-officedocument.wordprocessingml.document | Text and structure extraction |
44
+ | PDF | .pdf | application/pdf | Text extraction |
45
+ | Text | .txt | text/plain | Plain text reading |
46
+
47
+ ## Installation
48
+
49
+ ### Prerequisites
50
+
51
+ - Python 3.8 or higher
52
+ - MCP-enabled AI tool such as Trae IDE
53
+
54
+ ### Installation Steps
55
+
56
+ ```bash
57
+ # Clone the repository
58
+ git clone https://github.com/xt765/mcp_documents_reader.git
59
+ cd mcp_documents_reader
60
+
61
+ # Install dependencies
62
+ pip install -e .
63
+ ```
64
+
65
+ ## Configuration
66
+
67
+ ### Using in Trae IDE
68
+
69
+ Add the following to your Trae IDE's MCP configuration:
70
+
71
+ #### Option 1: Using GitHub repository (Recommended)
72
+ ```json
73
+ {
74
+ "mcpServers": {
75
+ "mcp-document-reader": {
76
+ "command": "uvx",
77
+ "args": [
78
+ "--from",
79
+ "git+https://github.com/xt765/mcp_documents_reader",
80
+ "mcp_documents_reader"
81
+ ]
82
+ }
83
+ }
84
+ }
85
+ ```
86
+
87
+ #### Option 2: Using Gitee repository
88
+ ```json
89
+ {
90
+ "mcpServers": {
91
+ "mcp-document-reader": {
92
+ "command": "uvx",
93
+ "args": [
94
+ "--from",
95
+ "git+https://gitee.com/xt765/mcp_documents_reader",
96
+ "mcp_documents_reader"
97
+ ]
98
+ }
99
+ }
100
+ }
101
+ ```
102
+
103
+ ### Environment Variables
104
+
105
+ - `DOCUMENT_DIRECTORY` - Directory where documents are stored (default: "./documents")
106
+
107
+ ## Usage
108
+
109
+ ### As an MCP Tool
110
+
111
+ After configuration, AI assistants can directly call the following tool:
112
+
113
+ #### read_document (Recommended)
114
+ Read any supported document type with a unified interface.
115
+
116
+ ```
117
+ read_document(filename="example.docx")
118
+ read_document(filename="example.pdf")
119
+ read_document(filename="example.xlsx")
120
+ read_document(filename="example.txt")
121
+ ```
122
+
123
+ ## Tool Interface Details
124
+
125
+ ### read_document
126
+ Read any supported document type.
127
+
128
+ **Parameters:**
129
+
130
+ | Parameter | Type | Required | Description |
131
+ |-----------|------|----------|-------------|
132
+ | filename | string | ✅ | Path to the document file; absolute paths are used as-is, relative paths are resolved against `DOCUMENT_DIRECTORY` |
133
+
134
+ ## License
135
+
136
+ MIT
@@ -0,0 +1,122 @@
1
+ # MCP Document Reader
2
+
3
+ <!-- mcp-name: io.github.xt765/mcp_documents_reader -->
4
+
5
+ [![CSDN Blog](https://img.shields.io/badge/CSDN-玄同765-orange.svg?style=flat&logo=csdn)](https://blog.csdn.net/Yunyi_Chi)
6
+ [![GitHub Repository](https://img.shields.io/badge/GitHub-mcp_documents_reader-black.svg?style=flat&logo=github)](https://github.com/xt765/mcp_documents_reader)
7
+ [![Gitee Repository](https://img.shields.io/badge/Gitee-mcp_documents_reader-red.svg?style=flat&logo=gitee)](https://gitee.com/xt765/mcp_documents_reader)
8
+ [![GitHub License](https://img.shields.io/github/license/xt765/mcp_documents_reader.svg?style=flat&logo=github)](https://github.com/xt765/mcp_documents_reader/blob/main/LICENSE)
9
+ [![Python Version](https://img.shields.io/badge/python-3.8%2B-blue.svg?style=flat&logo=python)](https://www.python.org/downloads/)
10
+
11
+ MCP (Model Context Protocol) Document Reader - A powerful MCP tool for reading documents in multiple formats, enabling AI agents to truly "read" your documents.
12
+
13
+ GitHub Repository: [https://github.com/xt765/mcp_documents_reader](https://github.com/xt765/mcp_documents_reader)
14
+ Gitee Repository: [https://gitee.com/xt765/mcp_documents_reader](https://gitee.com/xt765/mcp_documents_reader)
15
+
16
+ ## Features
17
+
18
+ - **Multi-format Support**: Supports 4 mainstream document formats: Excel (XLSX/XLS), DOCX, PDF, and TXT
19
+ - **MCP Protocol**: Compliant with MCP standards, can be used as a tool for AI assistants like Trae IDE
20
+ - **Easy Integration**: Simple configuration for immediate use
21
+ - **Reliable Performance**: Successfully tested and running in Trae IDE
22
+ - **File System Support**: Reads documents directly from the file system
23
+
24
+ ## Supported Formats
25
+
26
+ | Format | Extensions | MIME Type | Features |
27
+ |--------|------------|-----------|----------|
28
+ | Excel | .xlsx, .xls | application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | Sheet and cell data extraction |
29
+ | DOCX | .docx | application/vnd.openxmlformats-officedocument.wordprocessingml.document | Text and structure extraction |
30
+ | PDF | .pdf | application/pdf | Text extraction |
31
+ | Text | .txt | text/plain | Plain text reading |
32
+
33
+ ## Installation
34
+
35
+ ### Prerequisites
36
+
37
+ - Python 3.8 or higher
38
+ - MCP-enabled AI tool such as Trae IDE
39
+
40
+ ### Installation Steps
41
+
42
+ ```bash
43
+ # Clone the repository
44
+ git clone https://github.com/xt765/mcp_documents_reader.git
45
+ cd mcp_documents_reader
46
+
47
+ # Install dependencies
48
+ pip install -e .
49
+ ```
50
+
51
+ ## Configuration
52
+
53
+ ### Using in Trae IDE
54
+
55
+ Add the following to your Trae IDE's MCP configuration:
56
+
57
+ #### Option 1: Using GitHub repository (Recommended)
58
+ ```json
59
+ {
60
+ "mcpServers": {
61
+ "mcp-document-reader": {
62
+ "command": "uvx",
63
+ "args": [
64
+ "--from",
65
+ "git+https://github.com/xt765/mcp_documents_reader",
66
+ "mcp_documents_reader"
67
+ ]
68
+ }
69
+ }
70
+ }
71
+ ```
72
+
73
+ #### Option 2: Using Gitee repository
74
+ ```json
75
+ {
76
+ "mcpServers": {
77
+ "mcp-document-reader": {
78
+ "command": "uvx",
79
+ "args": [
80
+ "--from",
81
+ "git+https://gitee.com/xt765/mcp_documents_reader",
82
+ "mcp_documents_reader"
83
+ ]
84
+ }
85
+ }
86
+ }
87
+ ```
88
+
89
+ ### Environment Variables
90
+
91
+ - `DOCUMENT_DIRECTORY` - Directory where documents are stored (default: "./documents")
92
+
93
+ ## Usage
94
+
95
+ ### As an MCP Tool
96
+
97
+ After configuration, AI assistants can directly call the following tool:
98
+
99
+ #### read_document (Recommended)
100
+ Read any supported document type with a unified interface.
101
+
102
+ ```
103
+ read_document(filename="example.docx")
104
+ read_document(filename="example.pdf")
105
+ read_document(filename="example.xlsx")
106
+ read_document(filename="example.txt")
107
+ ```
108
+
109
+ ## Tool Interface Details
110
+
111
+ ### read_document
112
+ Read any supported document type.
113
+
114
+ **Parameters:**
115
+
116
+ | Parameter | Type | Required | Description |
117
+ |-----------|------|----------|-------------|
118
+ | filename | string | ✅ | Path to the document file; absolute paths are used as-is, relative paths are resolved against `DOCUMENT_DIRECTORY` |
119
+
120
+ ## License
121
+
122
+ MIT
@@ -0,0 +1,122 @@
1
+ # MCP 文档读取器
2
+
3
+ <!-- mcp-name: io.github.xt765/mcp_documents_reader -->
4
+
5
+ [![CSDN Blog](https://img.shields.io/badge/CSDN-玄同765-orange.svg?style=flat&logo=csdn)](https://blog.csdn.net/Yunyi_Chi)
6
+ [![GitHub Repository](https://img.shields.io/badge/GitHub-mcp_documents_reader-black.svg?style=flat&logo=github)](https://github.com/xt765/mcp_documents_reader)
7
+ [![Gitee Repository](https://img.shields.io/badge/Gitee-mcp_documents_reader-red.svg?style=flat&logo=gitee)](https://gitee.com/xt765/mcp_documents_reader)
8
+ [![GitHub License](https://img.shields.io/github/license/xt765/mcp_documents_reader.svg?style=flat&logo=github)](https://github.com/xt765/mcp_documents_reader/blob/main/LICENSE)
9
+ [![Python Version](https://img.shields.io/badge/python-3.8%2B-blue.svg?style=flat&logo=python)](https://www.python.org/downloads/)
10
+
11
+ MCP(模型上下文协议)文档读取器 - 一个强大的 MCP 工具,用于读取多种格式的文档,使 AI 智能体能够真正"读取"您的文档。
12
+
13
+ GitHub 仓库:[https://github.com/xt765/mcp_documents_reader](https://github.com/xt765/mcp_documents_reader)
14
+ Gitee 仓库:[https://gitee.com/xt765/mcp_documents_reader](https://gitee.com/xt765/mcp_documents_reader)
15
+
16
+ ## 功能特性
17
+
18
+ - **多格式支持**:支持 4 种主流文档格式:Excel(XLSX/XLS)、DOCX、PDF 和 TXT
19
+ - **MCP 协议**:符合 MCP 标准,可作为 AI 助手(如 Trae IDE)的工具使用
20
+ - **易于集成**:简单配置即可立即使用
21
+ - **可靠性能**:已在 Trae IDE 中成功测试运行
22
+ - **文件系统支持**:直接从文件系统读取文档
23
+
24
+ ## 支持的格式
25
+
26
+ | 格式 | 扩展名 | MIME 类型 | 特性 |
27
+ |------|--------|-----------|------|
28
+ | Excel | .xlsx, .xls | application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 工作表和单元格数据提取 |
29
+ | DOCX | .docx | application/vnd.openxmlformats-officedocument.wordprocessingml.document | 文本和结构提取 |
30
+ | PDF | .pdf | application/pdf | 文本提取 |
31
+ | Text | .txt | text/plain | 纯文本读取 |
32
+
33
+ ## 安装
34
+
35
+ ### 前提条件
36
+
37
+ - Python 3.8 或更高版本
38
+ - 支持 MCP 的 AI 工具,如 Trae IDE
39
+
40
+ ### 安装步骤
41
+
42
+ ```bash
43
+ # 克隆仓库
44
+ git clone https://github.com/xt765/mcp_documents_reader.git
45
+ cd mcp_documents_reader
46
+
47
+ # 安装依赖
48
+ pip install -e .
49
+ ```
50
+
51
+ ## 配置
52
+
53
+ ### 在 Trae IDE 中使用
54
+
55
+ 将以下内容添加到 Trae IDE 的 MCP 配置中:
56
+
57
+ #### 选项 1:使用 GitHub 仓库(推荐)
58
+ ```json
59
+ {
60
+ "mcpServers": {
61
+ "mcp-document-reader": {
62
+ "command": "uvx",
63
+ "args": [
64
+ "--from",
65
+ "git+https://github.com/xt765/mcp_documents_reader",
66
+ "mcp_documents_reader"
67
+ ]
68
+ }
69
+ }
70
+ }
71
+ ```
72
+
73
+ #### 选项 2:使用 Gitee 仓库
74
+ ```json
75
+ {
76
+ "mcpServers": {
77
+ "mcp-document-reader": {
78
+ "command": "uvx",
79
+ "args": [
80
+ "--from",
81
+ "git+https://gitee.com/xt765/mcp_documents_reader",
82
+ "mcp_documents_reader"
83
+ ]
84
+ }
85
+ }
86
+ }
87
+ ```
88
+
89
+ ### 环境变量
90
+
91
+ - `DOCUMENT_DIRECTORY` - 存储文档的目录(默认:"./documents")
92
+
93
+ ## 使用方法
94
+
95
+ ### 作为 MCP 工具使用
96
+
97
+ 配置完成后,AI 助手可以直接调用以下工具:
98
+
99
+ #### read_document(推荐)
100
+ 使用统一接口读取任何支持的文档类型。
101
+
102
+ ```
103
+ read_document(filename="example.docx")
104
+ read_document(filename="example.pdf")
105
+ read_document(filename="example.xlsx")
106
+ read_document(filename="example.txt")
107
+ ```
108
+
109
+ ## 工具接口详情
110
+
111
+ ### read_document
112
+ 读取任何支持的文档类型。
113
+
114
+ **参数:**
115
+
116
+ | 参数 | 类型 | 必填 | 描述 |
117
+ |------|------|------|------|
118
+ | filename | string | ✅ | 文档文件路径,支持绝对路径或相对路径 |
119
+
120
+ ## 许可证
121
+
122
+ MIT
@@ -0,0 +1,223 @@
1
import os
from abc import ABC, abstractmethod
from contextlib import asynccontextmanager
from dataclasses import dataclass
from typing import AsyncIterator, Dict, Type

from docx import Document as DocxDocument
from mcp.server.fastmcp import Context, FastMCP
from openpyxl import load_workbook
from PyPDF2 import PdfReader as PyPdfReader
+
11
# Folder that document names are resolved against; taken from the
# DOCUMENT_DIRECTORY environment variable, falling back to "./documents".
DOCUMENT_DIRECTORY = os.environ.get("DOCUMENT_DIRECTORY", "./documents")
13
+
14
@dataclass
class AppContext:
    """Shared application state handed out for the lifetime of the server."""

    # Folder that document file names are resolved against.
    document_directory: str
18
+
19
@asynccontextmanager
async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]:
    """Create the document directory and expose it while the server runs.

    :param server: FastMCP instance the lifespan is attached to (unused here)
    :return: Async context manager yielding the shared ``AppContext``
    """
    # Make sure the directory exists before any tool call looks inside it.
    os.makedirs(DOCUMENT_DIRECTORY, exist_ok=True)
    yield AppContext(document_directory=DOCUMENT_DIRECTORY)


# Initialize the MCP server. The lifespan is passed to the constructor because
# FastMCP wires it into the underlying server at construction time; assigning
# ``mcp.lifespan`` afterwards is not consulted, so the startup hook above
# would never run.
mcp = FastMCP("Document Reader", lifespan=app_lifespan)
35
+
36
+
37
+ # ------------------------- Document Reader Architecture -------------------------
38
+
39
class DocumentReader(ABC):
    """Common interface implemented by every format-specific reader."""

    @abstractmethod
    def read(self, file_path: str) -> str:
        """Return the text extracted from the document at ``file_path``."""
46
+
47
+
48
class DocxReader(DocumentReader):
    """Reader that pulls paragraph and table text out of DOCX files."""

    def read(self, file_path: str) -> str:
        """Return the text of the DOCX file at ``file_path``.

        Non-empty paragraphs come first, one per line, followed by one line per
        table row with its non-blank cells joined by tabs. Failures are
        reported as an ``"Error reading DOCX: ..."`` string instead of raised.
        """
        try:
            document = DocxDocument(file_path)

            # Body paragraphs, skipping the empty ones
            lines = [para.text for para in document.paragraphs if para.text]

            # Table rows: emit a line only when the row has a non-blank cell
            for table in document.tables:
                for row in table.rows:
                    cell_values = [
                        ' '.join(p.text for p in cell.paragraphs).strip()
                        for cell in row.cells
                    ]
                    filled = [value for value in cell_values if value]
                    if filled:
                        lines.append('\t'.join(filled))

            content = "\n".join(lines)
            if not content:
                return "No text found in the DOCX."
            return content
        except Exception as e:
            return f"Error reading DOCX: {str(e)}"
77
+
78
+
79
class PdfReader(DocumentReader):
    """Reader that extracts page text from PDF files."""

    def read(self, file_path: str) -> str:
        """Return the text of the PDF file at ``file_path``.

        Pages that yield text are stripped and joined with a blank line between
        them. Failures are reported as an ``"Error reading PDF: ..."`` string
        instead of raised.
        """
        try:
            with open(file_path, 'rb') as stream:
                chunks = []
                # Walk the pages while the file is still open
                for page in PyPdfReader(stream).pages:
                    content = page.extract_text()
                    if content:
                        chunks.append(content.strip())

            joined = "\n\n".join(chunks)
            return joined or "No text found in the PDF."
        except Exception as e:
            return f"Error reading PDF: {str(e)}"
99
+
100
+
101
class TxtReader(DocumentReader):
    """Reader for plain-text files that tries several encodings in turn."""

    def read(self, file_path: str) -> str:
        """Return the contents of the text file at ``file_path``.

        Candidate encodings are tried in priority order and the first one that
        decodes the whole file wins.

        :param file_path: Path of the text file to read
        :return: The decoded text, a placeholder message when the file is
            empty, or an ``"Error reading TXT: ..."`` message on failure
        """
        # Supported encodings in priority order
        encodings = ['utf-8', 'gbk', 'gb2312', 'ansi', 'latin-1']

        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    text = f.read()
                return text if text else "No text found in the TXT file."
            except (UnicodeDecodeError, LookupError):
                # UnicodeDecodeError: wrong guess, move on to the next one.
                # LookupError: the codec is not available in this interpreter
                # ('ansi' is only usable on Windows); skip it so the
                # 'latin-1' fallback still gets its turn instead of the whole
                # read being reported as failed.
                continue
            except Exception as e:
                return f"Error reading TXT: {str(e)}"

        return "Error reading TXT: Could not decode file with any supported encoding."
120
+
121
+
122
class ExcelReader(DocumentReader):
    """Reader that dumps every sheet of an Excel workbook as tab-separated text."""

    def read(self, file_path: str) -> str:
        """Return the cell contents of the workbook at ``file_path``.

        Each sheet is introduced by a ``=== Sheet: <name> ===`` line, followed
        by one tab-separated line per non-empty row and a trailing blank line.

        :param file_path: Path of the workbook to read
        :return: The extracted text, or an ``"Error reading Excel: ..."``
            message on failure
        """
        try:
            wb = load_workbook(file_path, read_only=True)
            try:
                text = []

                # Extract text from all sheets
                for sheet_name in wb.sheetnames:
                    sheet = wb[sheet_name]
                    text.append(f"=== Sheet: {sheet_name} ===")

                    # Empty cells become "" so column positions stay aligned
                    for row in sheet.iter_rows(values_only=True):
                        row_text = [str(cell) if cell is not None else "" for cell in row]
                        if any(row_text):  # Only add non-empty rows
                            text.append("\t".join(row_text))

                    text.append("")  # Add blank line between sheets
            finally:
                # Close the workbook even when a sheet fails to parse, so the
                # file handle opened for read-only mode is not left dangling.
                wb.close()

            extracted_text = "\n".join(text)
            return extracted_text if extracted_text else "No text found in the Excel file."
        except Exception as e:
            return f"Error reading Excel: {str(e)}"
149
+
150
+
151
class DocumentReaderFactory:
    """Chooses the reader class that matches a file's extension."""

    # Lower-case extension (leading dot included) -> reader class.
    # NOTE(review): '.xls' is routed to the openpyxl-based ExcelReader;
    # confirm openpyxl can actually open legacy .xls workbooks.
    _readers: Dict[str, Type[DocumentReader]] = {
        '.txt': TxtReader,
        '.docx': DocxReader,
        '.pdf': PdfReader,
        '.xlsx': ExcelReader,
        '.xls': ExcelReader
    }

    @classmethod
    def get_reader(cls, file_path: str) -> DocumentReader:
        """Instantiate the reader registered for ``file_path``'s extension.

        Raises ``ValueError`` when the extension has no registered reader.
        """
        ext = os.path.splitext(file_path.lower())[1]
        reader_cls = cls._readers.get(ext)
        if reader_cls is None:
            raise ValueError(f"Unsupported document type: {ext}")
        return reader_cls()

    @classmethod
    def is_supported(cls, file_path: str) -> bool:
        """Tell whether a reader is registered for ``file_path``'s extension."""
        suffix = os.path.splitext(file_path.lower())[1]
        return suffix in cls._readers
176
+
177
+
178
+ # ------------------------- Tool Functions -------------------------
179
+
180
+ def _get_document_path(ctx, filename: str) -> str:
181
+ """Get full document path from context or environment"""
182
+ try:
183
+ doc_dir = getattr(ctx, 'document_directory', DOCUMENT_DIRECTORY)
184
+ except:
185
+ doc_dir = DOCUMENT_DIRECTORY
186
+ return os.path.join(doc_dir, filename)
187
+
188
+
189
+
190
+
191
+
192
@mcp.tool()
def read_document(ctx: Context, filename: str) -> str:
    """
    Reads and extracts text from a specified document file.
    Supports multiple document types: TXT, DOCX, PDF, Excel (XLSX, XLS).

    :param ctx: FastMCP context
    :param filename: Name of the document file to read
    :return: Extracted text from the document
    """
    # ``ctx`` is annotated with ``Context`` so that FastMCP injects it rather
    # than exposing it as a tool argument the client has to supply.
    doc_path = _get_document_path(ctx, filename)

    if not os.path.exists(doc_path):
        return f"Error: File '{filename}' not found at {doc_path}."

    if not DocumentReaderFactory.is_supported(doc_path):
        return f"Error: Unsupported document type for file '{filename}'."

    try:
        reader = DocumentReaderFactory.get_reader(doc_path)
        return reader.read(doc_path)
    except Exception as e:
        # Report failures as text so the tool call itself never raises.
        return f"Error reading document: {str(e)}"
215
+
216
+
217
def main():
    """Entry point: start the MCP server."""
    mcp.run()


# Also allow running the module directly as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,39 @@
1
+ [project]
2
+ name = "mcp_documents_reader"
3
+ version = "1.0.0"
4
+ description = "An MCP enabled multi-format document reader supporting DOCX, PDF, TXT, and Excel files"
5
+ keywords = ["mcp", "model-context-protocol", "document-reader", "pdf", "docx", "excel"]
6
+ authors = [
7
+ { name = "xt765", email = "xt765@foxmail.com" }
8
+ ]
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ dependencies = [
12
+ "mcp>=0.1.0",
13
+ "python-docx>=0.8.11",
14
+ "PyPDF2>=3.0.1",
15
+ "openpyxl>=3.0.10"
16
+ ]
17
+
18
+ [project.scripts]
19
+ mcp_documents_reader = "mcp_documents_reader:main"
20
+
21
+ [build-system]
22
+ requires = ["hatchling>=1.14.0"]
23
+ build-backend = "hatchling.build"
24
+
25
+ [tool.hatch.build.targets.wheel]
26
+ include = ["mcp_documents_reader.py"]
27
+
28
+ [tool.ruff]
29
+ select = ["E", "F", "I"]
30
+ line-length = 88
31
+
32
+ [tool.pytest.ini_options]
33
+ pythonpath = "."
34
+ testpaths = ["tests"]
35
+ addopts = "-v"
36
+
37
+ [tool.black]
38
+ line-length = 88
39
+ target-version = ["py38"]
@@ -0,0 +1,21 @@
1
+ {
2
+ "$schema": "https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json",
3
+ "name": "io.github.xt765/mcp_documents_reader",
4
+ "title": "MCP Document Reader",
5
+ "description": "An MCP enabled multi-format document reader supporting DOCX, PDF, TXT, and Excel files",
6
+ "repository": {
7
+ "url": "https://github.com/xt765/mcp_documents_reader",
8
+ "source": "github"
9
+ },
10
+ "version": "1.0.0",
11
+ "packages": [
12
+ {
13
+ "registryType": "pypi",
14
+ "identifier": "mcp_documents_reader",
15
+ "version": "1.0.0",
16
+ "transport": {
17
+ "type": "stdio"
18
+ }
19
+ }
20
+ ]
21
+ }