docsmith-mcp 0.0.1-beta.1 → 0.0.1-beta.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +750 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +1 -0
- package/dist/index.js +150 -103
- package/dist/index.js.map +1 -1
- package/package.json +24 -2
- package/python/pptx_handler.py +169 -0
- package/scripts/preload-packages.mjs +2 -0
- package/.github/workflows/test.yml +0 -35
- package/dist/python/excel_handler.py +0 -97
- package/dist/python/pdf_handler.py +0 -81
- package/dist/python/text_handler.py +0 -331
- package/dist/python/word_handler.py +0 -98
- package/examples/sample_data.csv +0 -6
- package/examples/sample_data.json +0 -9
- package/examples/sample_document.pdf +0 -80
- package/examples/sample_report.docx +0 -0
- package/examples/sample_sales_data.xlsx +0 -0
- package/examples/sample_text.txt +0 -10
- package/src/code-runner.ts +0 -136
- package/src/index.ts +0 -496
- package/src/utils.ts +0 -45
- package/tests/document-processing.test.ts +0 -230
- package/tsconfig.json +0 -20
- package/tsdown.config.ts +0 -21
- package/vitest.config.ts +0 -15
package/package.json
CHANGED
|
@@ -1,9 +1,31 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "docsmith-mcp",
|
|
3
|
-
"version": "0.0.1-beta.1",
|
|
3
|
+
"version": "0.0.1-beta.3",
|
|
4
4
|
"description": "Python-powered document processing MCP for Excel, Word, PDF",
|
|
5
5
|
"type": "module",
|
|
6
|
-
"main": "dist/index.js",
|
|
6
|
+
"main": "./dist/index.js",
|
|
7
|
+
"module": "./dist/index.js",
|
|
8
|
+
"types": "./dist/index.d.ts",
|
|
9
|
+
"exports": {
|
|
10
|
+
".": {
|
|
11
|
+
"import": {
|
|
12
|
+
"types": "./dist/index.d.ts",
|
|
13
|
+
"default": "./dist/index.js"
|
|
14
|
+
},
|
|
15
|
+
"require": {
|
|
16
|
+
"types": "./dist/index.d.cts",
|
|
17
|
+
"default": "./dist/index.cjs"
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"bin": {
|
|
22
|
+
"docsmith-mcp": "./dist/index.js"
|
|
23
|
+
},
|
|
24
|
+
"files": [
|
|
25
|
+
"dist",
|
|
26
|
+
"python",
|
|
27
|
+
"scripts"
|
|
28
|
+
],
|
|
7
29
|
"scripts": {
|
|
8
30
|
"build": "tsdown",
|
|
9
31
|
"dev": "tsdown --watch",
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PowerPoint document handler - read/write PPTX files
|
|
3
|
+
"""
|
|
4
|
+
import json
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def read_pptx(file_path: str, page: int = None, page_size: int = 100):
|
|
10
|
+
"""Read PowerPoint presentation with optional pagination by slides"""
|
|
11
|
+
from pptx import Presentation
|
|
12
|
+
|
|
13
|
+
prs = Presentation(file_path)
|
|
14
|
+
|
|
15
|
+
# Extract all slides content
|
|
16
|
+
slides = []
|
|
17
|
+
for i, slide in enumerate(prs.slides):
|
|
18
|
+
slide_data = {
|
|
19
|
+
"slide_number": i + 1,
|
|
20
|
+
"title": "",
|
|
21
|
+
"content": [],
|
|
22
|
+
"notes": ""
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
# Extract text from shapes
|
|
26
|
+
for shape in slide.shapes:
|
|
27
|
+
if hasattr(shape, "text") and shape.text.strip():
|
|
28
|
+
# Try to detect if it's a title
|
|
29
|
+
if hasattr(shape, "is_placeholder") and shape.is_placeholder:
|
|
30
|
+
placeholder = shape.placeholder_format
|
|
31
|
+
if placeholder.type == 1: # Title placeholder
|
|
32
|
+
slide_data["title"] = shape.text
|
|
33
|
+
continue
|
|
34
|
+
|
|
35
|
+
slide_data["content"].append(shape.text)
|
|
36
|
+
|
|
37
|
+
# Extract table data
|
|
38
|
+
if hasattr(shape, "table"):
|
|
39
|
+
table_data = []
|
|
40
|
+
for row in shape.table.rows:
|
|
41
|
+
row_data = [cell.text for cell in row.cells]
|
|
42
|
+
table_data.append(row_data)
|
|
43
|
+
slide_data["content"].append({"table": table_data})
|
|
44
|
+
|
|
45
|
+
# Extract notes
|
|
46
|
+
if slide.has_notes_slide:
|
|
47
|
+
notes_frame = slide.notes_slide.notes_text_frame
|
|
48
|
+
if notes_frame:
|
|
49
|
+
slide_data["notes"] = notes_frame.text
|
|
50
|
+
|
|
51
|
+
slides.append(slide_data)
|
|
52
|
+
|
|
53
|
+
total_slides = len(slides)
|
|
54
|
+
|
|
55
|
+
# Handle pagination
|
|
56
|
+
if page is not None:
|
|
57
|
+
start = (page - 1) * page_size
|
|
58
|
+
end = start + page_size
|
|
59
|
+
slides = slides[start:end]
|
|
60
|
+
total_pages = (total_slides + page_size - 1) // page_size if total_slides else 1
|
|
61
|
+
else:
|
|
62
|
+
total_pages = 1
|
|
63
|
+
|
|
64
|
+
return {
|
|
65
|
+
"total_slides": total_slides,
|
|
66
|
+
"slides": slides,
|
|
67
|
+
"current_page": page,
|
|
68
|
+
"page_size": page_size if page else None,
|
|
69
|
+
"total_pages": total_pages
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def get_pptx_info(file_path: str):
|
|
74
|
+
"""Get PowerPoint metadata"""
|
|
75
|
+
from pptx import Presentation
|
|
76
|
+
|
|
77
|
+
prs = Presentation(file_path)
|
|
78
|
+
|
|
79
|
+
info = {
|
|
80
|
+
"slides": len(prs.slides),
|
|
81
|
+
"file_size": Path(file_path).stat().st_size
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
# Try to get presentation properties
|
|
85
|
+
if prs.core_properties:
|
|
86
|
+
props = prs.core_properties
|
|
87
|
+
metadata = {}
|
|
88
|
+
if props.title:
|
|
89
|
+
metadata["title"] = props.title
|
|
90
|
+
if props.author:
|
|
91
|
+
metadata["author"] = props.author
|
|
92
|
+
if props.subject:
|
|
93
|
+
metadata["subject"] = props.subject
|
|
94
|
+
if props.created:
|
|
95
|
+
metadata["created"] = str(props.created)
|
|
96
|
+
if props.modified:
|
|
97
|
+
metadata["modified"] = str(props.modified)
|
|
98
|
+
|
|
99
|
+
if metadata:
|
|
100
|
+
info["metadata"] = metadata
|
|
101
|
+
|
|
102
|
+
return info
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def write_pptx(file_path: str, slides_data: list):
|
|
106
|
+
"""Write data to PowerPoint presentation"""
|
|
107
|
+
from pptx import Presentation
|
|
108
|
+
from pptx.util import Inches, Pt
|
|
109
|
+
|
|
110
|
+
prs = Presentation()
|
|
111
|
+
prs.slide_width = Inches(10)
|
|
112
|
+
prs.slide_height = Inches(7.5)
|
|
113
|
+
|
|
114
|
+
for slide_info in slides_data:
|
|
115
|
+
# Add blank slide
|
|
116
|
+
blank_layout = prs.slide_layouts[6] # Blank layout
|
|
117
|
+
slide = prs.slides.add_slide(blank_layout)
|
|
118
|
+
|
|
119
|
+
# Add title if provided
|
|
120
|
+
title = slide_info.get("title", "")
|
|
121
|
+
if title:
|
|
122
|
+
left = Inches(0.5)
|
|
123
|
+
top = Inches(0.5)
|
|
124
|
+
width = Inches(9)
|
|
125
|
+
height = Inches(1)
|
|
126
|
+
title_box = slide.shapes.add_textbox(left, top, width, height)
|
|
127
|
+
title_frame = title_box.text_frame
|
|
128
|
+
title_frame.text = title
|
|
129
|
+
title_frame.paragraphs[0].font.size = Pt(32)
|
|
130
|
+
title_frame.paragraphs[0].font.bold = True
|
|
131
|
+
|
|
132
|
+
# Add content
|
|
133
|
+
content = slide_info.get("content", [])
|
|
134
|
+
if content:
|
|
135
|
+
left = Inches(0.5)
|
|
136
|
+
top = Inches(2)
|
|
137
|
+
width = Inches(9)
|
|
138
|
+
height = Inches(5)
|
|
139
|
+
content_box = slide.shapes.add_textbox(left, top, width, height)
|
|
140
|
+
text_frame = content_box.text_frame
|
|
141
|
+
|
|
142
|
+
for item in content:
|
|
143
|
+
if isinstance(item, str):
|
|
144
|
+
p = text_frame.add_paragraph()
|
|
145
|
+
p.text = item
|
|
146
|
+
p.level = 0
|
|
147
|
+
|
|
148
|
+
prs.save(file_path)
|
|
149
|
+
return {"success": True, "file_path": file_path}
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
if __name__ == "__main__":
|
|
153
|
+
command = sys.argv[1]
|
|
154
|
+
file_path = sys.argv[2]
|
|
155
|
+
|
|
156
|
+
if command == "read":
|
|
157
|
+
page = int(sys.argv[3]) if len(sys.argv) > 3 else None
|
|
158
|
+
page_size = int(sys.argv[4]) if len(sys.argv) > 4 else 100
|
|
159
|
+
result = read_pptx(file_path, page, page_size)
|
|
160
|
+
elif command == "info":
|
|
161
|
+
result = get_pptx_info(file_path)
|
|
162
|
+
elif command == "write":
|
|
163
|
+
# Data passed as JSON string
|
|
164
|
+
slides_data = json.loads(sys.argv[3])
|
|
165
|
+
result = write_pptx(file_path, slides_data)
|
|
166
|
+
else:
|
|
167
|
+
result = {"error": f"Unknown command: {command}"}
|
|
168
|
+
|
|
169
|
+
print(json.dumps(result, default=str))
|
|
@@ -10,6 +10,7 @@ import { runPy } from "@mcpc-tech/code-runner-mcp";
|
|
|
10
10
|
const PACKAGES = [
|
|
11
11
|
"openpyxl",
|
|
12
12
|
"python-docx",
|
|
13
|
+
"python-pptx",
|
|
13
14
|
"PyPDF2",
|
|
14
15
|
];
|
|
15
16
|
|
|
@@ -39,6 +40,7 @@ asyncio.run(main())
|
|
|
39
40
|
packages: {
|
|
40
41
|
openpyxl: "openpyxl",
|
|
41
42
|
"python-docx": "python-docx",
|
|
43
|
+
"python-pptx": "python-pptx",
|
|
42
44
|
PyPDF2: "PyPDF2",
|
|
43
45
|
},
|
|
44
46
|
});
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
name: Test
|
|
2
|
-
|
|
3
|
-
on:
|
|
4
|
-
push:
|
|
5
|
-
branches: [main, master]
|
|
6
|
-
pull_request:
|
|
7
|
-
branches: [main, master]
|
|
8
|
-
|
|
9
|
-
jobs:
|
|
10
|
-
test:
|
|
11
|
-
runs-on: ubuntu-latest
|
|
12
|
-
|
|
13
|
-
steps:
|
|
14
|
-
- name: Checkout code
|
|
15
|
-
uses: actions/checkout@v4
|
|
16
|
-
|
|
17
|
-
- name: Setup pnpm
|
|
18
|
-
uses: pnpm/action-setup@v2
|
|
19
|
-
with:
|
|
20
|
-
version: 9
|
|
21
|
-
|
|
22
|
-
- name: Setup Node.js
|
|
23
|
-
uses: actions/setup-node@v4
|
|
24
|
-
with:
|
|
25
|
-
node-version: 24
|
|
26
|
-
cache: "pnpm"
|
|
27
|
-
|
|
28
|
-
- name: Install dependencies
|
|
29
|
-
run: pnpm install --frozen-lockfile
|
|
30
|
-
|
|
31
|
-
- name: Run tests
|
|
32
|
-
run: pnpm test:run
|
|
33
|
-
|
|
34
|
-
- name: Build
|
|
35
|
-
run: pnpm build
|
|
@@ -1,97 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Excel document handler - read/write Excel files
|
|
3
|
-
"""
|
|
4
|
-
import json
|
|
5
|
-
import sys
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
|
|
8
|
-
def read_excel(file_path: str, sheet_name: str = None, page: int = None, page_size: int = 100):
|
|
9
|
-
"""Read Excel file with optional pagination"""
|
|
10
|
-
import openpyxl
|
|
11
|
-
|
|
12
|
-
wb = openpyxl.load_workbook(file_path, data_only=True)
|
|
13
|
-
|
|
14
|
-
if sheet_name is None:
|
|
15
|
-
sheet_name = wb.sheetnames[0]
|
|
16
|
-
|
|
17
|
-
ws = wb[sheet_name]
|
|
18
|
-
|
|
19
|
-
# Get all data
|
|
20
|
-
data = []
|
|
21
|
-
for row in ws.iter_rows(values_only=True):
|
|
22
|
-
data.append(row)
|
|
23
|
-
|
|
24
|
-
# Handle pagination
|
|
25
|
-
if page is not None:
|
|
26
|
-
start = (page - 1) * page_size
|
|
27
|
-
end = start + page_size
|
|
28
|
-
data = data[start:end]
|
|
29
|
-
total_pages = (len(data) + page_size - 1) // page_size if data else 1
|
|
30
|
-
else:
|
|
31
|
-
total_pages = 1
|
|
32
|
-
|
|
33
|
-
return {
|
|
34
|
-
"sheet_name": sheet_name,
|
|
35
|
-
"sheets": wb.sheetnames,
|
|
36
|
-
"total_rows": ws.max_row,
|
|
37
|
-
"total_cols": ws.max_column,
|
|
38
|
-
"current_page": page,
|
|
39
|
-
"page_size": page_size if page else None,
|
|
40
|
-
"total_pages": total_pages,
|
|
41
|
-
"data": data
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
def get_excel_info(file_path: str):
|
|
45
|
-
"""Get Excel file metadata"""
|
|
46
|
-
import openpyxl
|
|
47
|
-
|
|
48
|
-
wb = openpyxl.load_workbook(file_path, data_only=True)
|
|
49
|
-
info = {
|
|
50
|
-
"sheets": [],
|
|
51
|
-
"file_size": Path(file_path).stat().st_size
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
for sheet_name in wb.sheetnames:
|
|
55
|
-
ws = wb[sheet_name]
|
|
56
|
-
info["sheets"].append({
|
|
57
|
-
"name": sheet_name,
|
|
58
|
-
"rows": ws.max_row,
|
|
59
|
-
"cols": ws.max_column
|
|
60
|
-
})
|
|
61
|
-
|
|
62
|
-
return info
|
|
63
|
-
|
|
64
|
-
def write_excel(file_path: str, data: list, sheet_name: str = "Sheet1"):
|
|
65
|
-
"""Write data to Excel file"""
|
|
66
|
-
import openpyxl
|
|
67
|
-
|
|
68
|
-
wb = openpyxl.Workbook()
|
|
69
|
-
ws = wb.active
|
|
70
|
-
ws.title = sheet_name
|
|
71
|
-
|
|
72
|
-
for row in data:
|
|
73
|
-
ws.append(row)
|
|
74
|
-
|
|
75
|
-
wb.save(file_path)
|
|
76
|
-
return {"success": True, "file_path": file_path}
|
|
77
|
-
|
|
78
|
-
if __name__ == "__main__":
|
|
79
|
-
command = sys.argv[1]
|
|
80
|
-
file_path = sys.argv[2]
|
|
81
|
-
|
|
82
|
-
if command == "read":
|
|
83
|
-
sheet = sys.argv[3] if len(sys.argv) > 3 else None
|
|
84
|
-
page = int(sys.argv[4]) if len(sys.argv) > 4 else None
|
|
85
|
-
page_size = int(sys.argv[5]) if len(sys.argv) > 5 else 100
|
|
86
|
-
result = read_excel(file_path, sheet, page, page_size)
|
|
87
|
-
elif command == "info":
|
|
88
|
-
result = get_excel_info(file_path)
|
|
89
|
-
elif command == "write":
|
|
90
|
-
# Data passed as JSON string
|
|
91
|
-
data = json.loads(sys.argv[3])
|
|
92
|
-
sheet = sys.argv[4] if len(sys.argv) > 4 else "Sheet1"
|
|
93
|
-
result = write_excel(file_path, data, sheet)
|
|
94
|
-
else:
|
|
95
|
-
result = {"error": f"Unknown command: {command}"}
|
|
96
|
-
|
|
97
|
-
print(json.dumps(result, default=str))
|
|
@@ -1,81 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
PDF document handler - read PDF files using PyPDF2
|
|
3
|
-
"""
|
|
4
|
-
import json
|
|
5
|
-
import sys
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
|
|
8
|
-
def read_pdf(file_path: str, page: int = None, page_size: int = 100):
|
|
9
|
-
"""Read PDF with optional pagination by pages"""
|
|
10
|
-
from PyPDF2 import PdfReader
|
|
11
|
-
|
|
12
|
-
reader = PdfReader(file_path)
|
|
13
|
-
total_pages = len(reader.pages)
|
|
14
|
-
|
|
15
|
-
# Handle pagination
|
|
16
|
-
if page is not None:
|
|
17
|
-
start_page = (page - 1) * page_size
|
|
18
|
-
end_page = min(start_page + page_size, total_pages)
|
|
19
|
-
pages_to_read = range(start_page, end_page)
|
|
20
|
-
current_page = page
|
|
21
|
-
total_page_groups = (total_pages + page_size - 1) // page_size
|
|
22
|
-
else:
|
|
23
|
-
pages_to_read = range(total_pages)
|
|
24
|
-
current_page = None
|
|
25
|
-
total_page_groups = 1
|
|
26
|
-
|
|
27
|
-
content = []
|
|
28
|
-
for i in pages_to_read:
|
|
29
|
-
page_obj = reader.pages[i]
|
|
30
|
-
text = page_obj.extract_text()
|
|
31
|
-
content.append({
|
|
32
|
-
"page_number": i + 1,
|
|
33
|
-
"text": text or "",
|
|
34
|
-
"words": len(text.split()) if text else 0
|
|
35
|
-
})
|
|
36
|
-
|
|
37
|
-
return {
|
|
38
|
-
"total_pages": total_pages,
|
|
39
|
-
"current_page_group": current_page,
|
|
40
|
-
"page_size": page_size if page else None,
|
|
41
|
-
"total_page_groups": total_page_groups,
|
|
42
|
-
"content": content
|
|
43
|
-
}
|
|
44
|
-
|
|
45
|
-
def get_pdf_info(file_path: str):
|
|
46
|
-
"""Get PDF metadata"""
|
|
47
|
-
from PyPDF2 import PdfReader
|
|
48
|
-
|
|
49
|
-
reader = PdfReader(file_path)
|
|
50
|
-
info = {
|
|
51
|
-
"pages": len(reader.pages),
|
|
52
|
-
"file_size": Path(file_path).stat().st_size
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
# Try to get PDF metadata
|
|
56
|
-
if reader.metadata:
|
|
57
|
-
info["metadata"] = {k: str(v) for k, v in reader.metadata.items()}
|
|
58
|
-
|
|
59
|
-
# Count total words
|
|
60
|
-
total_words = 0
|
|
61
|
-
for page in reader.pages:
|
|
62
|
-
text = page.extract_text() or ""
|
|
63
|
-
total_words += len(text.split())
|
|
64
|
-
info["total_words"] = total_words
|
|
65
|
-
|
|
66
|
-
return info
|
|
67
|
-
|
|
68
|
-
if __name__ == "__main__":
|
|
69
|
-
command = sys.argv[1]
|
|
70
|
-
file_path = sys.argv[2]
|
|
71
|
-
|
|
72
|
-
if command == "read":
|
|
73
|
-
page = int(sys.argv[3]) if len(sys.argv) > 3 else None
|
|
74
|
-
page_size = int(sys.argv[4]) if len(sys.argv) > 4 else 10
|
|
75
|
-
result = read_pdf(file_path, page, page_size)
|
|
76
|
-
elif command == "info":
|
|
77
|
-
result = get_pdf_info(file_path)
|
|
78
|
-
else:
|
|
79
|
-
result = {"error": f"Unknown command: {command}"}
|
|
80
|
-
|
|
81
|
-
print(json.dumps(result, default=str))
|