docsmith-mcp 0.0.1-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,679 @@
1
+ #!/usr/bin/env node
2
+ import { Server } from "@modelcontextprotocol/sdk/server/index.js";
3
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
4
+ import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js";
5
+ import { z } from "zod";
6
+ import { runPy } from "@mcpc-tech/code-runner-mcp";
7
+ import { readFileSync } from "fs";
8
+ import { fileURLToPath } from "url";
9
+ import { dirname, join, resolve } from "path";
10
+
11
+ //#region src/code-runner.ts
// ES modules have no built-in __filename/__dirname, so both are derived from this module's URL.
// __dirname is the base for locating the bundled Python scripts.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
/**
 * Derive the file-system mapping for a document path.
 *
 * The path is resolved to an absolute path. Its parent directory becomes the
 * mount root, and the absolute path itself is reused unchanged as the virtual path.
 *
 * @param filePath - Path to the file (a relative path is resolved against the current working directory)
 * @returns Object with mountRoot (directory containing the file) and virtualPath (the absolute file path)
 */
function getFileSystemMapping(filePath) {
  const virtualPath = resolve(filePath);
  return { mountRoot: dirname(virtualPath), virtualPath };
}
/**
 * Build the Python source that sets sys.argv and then runs the script body.
 *
 * The argv list is emitted with JSON.stringify: for an array of strings the
 * output is also a valid Python list literal, so quotes and backslashes in the
 * script path or arguments cannot break the generated code.
 *
 * @param scriptPath - Script path, used as sys.argv[0]
 * @param args - Command line arguments (each coerced to a string)
 * @param scriptContent - Source text of the Python script
 * @returns Python source code to execute
 */
function buildPythonWrapper(scriptPath, args, scriptContent) {
  const argvLiteral = JSON.stringify([scriptPath, ...args].map(String));
  // The template must stay at column 0: Python is indentation sensitive.
  return `
import sys
import json

# Set command line arguments
sys.argv = ${argvLiteral}

# Execute the script
${scriptContent}
`;
}
/**
 * Run a Python script file using code-runner-mcp
 *
 * The script is read from `<module dir>/../<baseDir>/<scriptPath>`, wrapped so
 * that sys.argv carries `args`, and executed through runPy. The directory of
 * the first entry in `filePaths` (or the package root when none is given) is
 * passed to runPy as both nodeFSMountPoint and nodeFSRoot.
 *
 * @param scriptPath - Path to the Python script (relative to baseDir)
 * @param options - Execution options: args, packages, baseDir (default "python"), filePaths
 * @returns The object parsed from the last stdout line when it is a JSON object/array;
 *          `{ error }` when the runner reported an error or the stream failed;
 *          otherwise `{ stdout, stderr }` with the raw captured output
 * @throws If the script file cannot be read (readFileSync) or runPy itself rejects
 */
async function runPythonFile(scriptPath, options = {}) {
  const { args = [], packages = {}, baseDir = "python", filePaths = [] } = options;
  const fullPath = join(__dirname, "..", baseDir, scriptPath);
  const scriptContent = readFileSync(fullPath, "utf-8");
  const wrapperCode = buildPythonWrapper(scriptPath, args, scriptContent);
  const mountRoot = filePaths.length > 0
    ? getFileSystemMapping(filePaths[0]).mountRoot
    : join(__dirname, "..");
  const stream = await runPy(wrapperCode, {
    packages,
    nodeFSMountPoint: mountRoot,
    nodeFSRoot: mountRoot
  });
  const reader = stream.getReader();
  const decoder = new TextDecoder();
  let stdout = "";
  let stderr = "";
  let error = "";
  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      const chunk = decoder.decode(value, { stream: true });
      // NOTE(review): classification is per chunk and assumes every chunk starts
      // with its channel prefix — confirm against the runPy stream format.
      if (chunk.startsWith("[stderr] ")) stderr += chunk.slice(9);
      else if (chunk.startsWith("[err]")) error += chunk;
      else stdout += chunk;
    }
  } catch (streamError) {
    return { error: String(streamError) };
  } finally {
    // Always give the stream back, whether reading finished or failed.
    reader.releaseLock();
  }
  if (error) return { error: error.replace(/\[err\]\[py\]\s*/g, "").trim() };
  const lines = stdout.trim().split("\n");
  const lastLine = lines[lines.length - 1];
  try {
    const parsed = JSON.parse(lastLine);
    // A bare number/string/null on the last line is ordinary output, not a result object.
    if (parsed !== null && typeof parsed === "object") return parsed;
  } catch {
    // Last line is not JSON: fall through and return the raw output.
  }
  return {
    stdout,
    stderr
  };
}
91
+
92
+ //#endregion
93
+ //#region src/utils.ts
94
+ /**
95
+ * Utility functions for document processing
96
+ */
/** Supported file extensions (lower-case, without the dot) and the handler family each maps to. */
const FILE_TYPE_BY_EXTENSION = new Map([
  ["xlsx", "excel"],
  ["xls", "excel"],
  ["docx", "word"],
  ["pdf", "pdf"],
  ["txt", "text"],
  ["csv", "text"],
  ["md", "text"],
  ["json", "text"],
  ["yaml", "text"],
  ["yml", "text"]
]);
/**
 * Detect file type from file extension
 *
 * The extension is whatever follows the last "." in the path, compared
 * case-insensitively. A path without any "." has no extension and yields null,
 * so a bare name such as "pdf" is not mistaken for a PDF.
 *
 * @param filePath - Path or file name to inspect
 * @returns "excel", "word", "pdf", "text", or null when the extension is missing or unsupported
 */
function detectFileType(filePath) {
  const dotIndex = filePath.lastIndexOf(".");
  if (dotIndex === -1) return null;
  const ext = filePath.slice(dotIndex + 1).toLowerCase();
  return FILE_TYPE_BY_EXTENSION.has(ext) ? FILE_TYPE_BY_EXTENSION.get(ext) : null;
}
/** Python packages per file type, as `{ importName: installName }` pairs. */
const PACKAGES_BY_FILE_TYPE = {
  excel: { openpyxl: "openpyxl" },
  word: { docx: "python-docx" },
  pdf: { PyPDF2: "PyPDF2" },
  text: {}
};
/**
 * Get required packages for each file type
 *
 * Only own keys of the table are honoured, so names inherited from
 * Object.prototype ("constructor", "toString", ...) yield an empty map instead
 * of a prototype member.
 *
 * @param fileType - File type as returned by detectFileType ("excel", "word", "pdf", "text")
 * @returns A fresh package map for the type, or an empty object for unknown types
 */
function getPackages(fileType) {
  if (!Object.prototype.hasOwnProperty.call(PACKAGES_BY_FILE_TYPE, fileType)) return {};
  // Copy so callers can never mutate the shared table.
  return { ...PACKAGES_BY_FILE_TYPE[fileType] };
}
/**
 * Read a positive integer from an environment variable.
 *
 * @param name - Environment variable name
 * @param fallback - Value used when the variable is unset, empty, non-numeric, zero or negative
 * @returns The parsed positive integer, or the fallback
 */
function readPositiveIntEnv(name, fallback) {
  const parsed = Number.parseInt(process.env[name], 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}
/**
 * Get environment configuration
 *
 * - DOC_RAW_FULL_READ: "true" enables raw full reads (any other value disables them)
 * - DOC_PAGE_SIZE: items per page, default 100
 * - DOC_MAX_FILE_SIZE: size limit in megabytes, default 50 (returned in bytes)
 *
 * Invalid numeric values (non-numeric, zero, negative) fall back to the defaults
 * instead of propagating NaN or nonsensical sizes.
 *
 * @returns Object with rawFullRead (boolean), pageSize (number) and maxFileSize (bytes)
 */
function getConfig() {
  return {
    rawFullRead: process.env.DOC_RAW_FULL_READ === "true",
    pageSize: readPositiveIntEnv("DOC_PAGE_SIZE", 100),
    maxFileSize: readPositiveIntEnv("DOC_MAX_FILE_SIZE", 50) * 1024 * 1024
  };
}
130
+
131
+ //#endregion
132
+ //#region src/index.ts
/** Validates the arguments of the read_document tool. */
const ReadDocumentSchema = z.object({
  file_path: z.string().describe("Absolute path to the document file"),
  mode: z
    .enum(["raw", "paginated"])
    .optional()
    .describe("Read mode: 'raw' for full content, 'paginated' for chunked reading"),
  page: z.number().optional().describe("Page number for paginated mode (1-based)"),
  page_size: z.number().optional().describe("Items per page for paginated mode"),
  sheet_name: z.string().optional().describe("Sheet name for Excel files")
});
/** Validates the arguments of the write_document tool; `data` is intentionally unconstrained. */
const WriteDocumentSchema = z.object({
  file_path: z.string().describe("Absolute path to save the document"),
  format: z.enum(["excel", "word", "text"]).describe("Document format"),
  data: z.any().describe("Document data structure")
});
/** Validates the arguments of the get_document_info tool. */
const GetDocumentInfoSchema = z.object({
  file_path: z.string().describe("Absolute path to the document file")
});
// MCP server instance; only the "tools" capability is advertised.
// NOTE(review): the reported version ("0.1.0") differs from the published package
// version shown for this build (0.0.1-beta.1) — confirm which one is intended.
const server = new Server(
  { name: "docsmith-mcp", version: "0.1.0" },
  { capabilities: { tools: {} } }
);
/** Status fields whose property definitions are spread into several output schemas. */
const BaseOutputSchema = {
  type: "object",
  properties: {
    success: { type: "boolean", description: "Operation success status" },
    error: { type: "string", description: "Error message if failed" }
  }
};
/** Pagination property definitions, spread into the `properties` of paginated read schemas. */
const PaginationSchema = {
  current_page: { type: "number", description: "Current page number" },
  page_size: { type: "number", description: "Items per page" },
  total_pages: { type: "number", description: "Total number of pages" },
  page: { type: "number", description: "Current page number (alternative)" },
  has_more: { type: "boolean", description: "Whether more pages exist" }
};
/** JSON Schema for Excel read results: sheet metadata, row data and pagination fields. */
const ExcelReadOutputSchema = {
  type: "object",
  properties: {
    sheet_name: { type: "string", description: "Active sheet name" },
    sheets: { type: "array", items: { type: "string" }, description: "All sheet names" },
    total_rows: { type: "number", description: "Total rows in sheet" },
    total_cols: { type: "number", description: "Total columns in sheet" },
    data: {
      type: "array",
      items: { type: "array", items: {} },
      description: "Sheet data as array of rows"
    },
    ...PaginationSchema
  }
};
/** JSON Schema for Word read results: paragraphs, tables (rows of string cells), counts and pagination fields. */
const WordReadOutputSchema = {
  type: "object",
  properties: {
    paragraphs: { type: "array", items: { type: "string" }, description: "Document paragraphs" },
    tables: {
      type: "array",
      items: {
        type: "array",
        items: { type: "array", items: { type: "string" } }
      },
      description: "Tables data"
    },
    total_paragraphs: { type: "number", description: "Total paragraph count" },
    total_tables: { type: "number", description: "Total table count" },
    ...PaginationSchema
  }
};
/** JSON Schema for PDF read results: total page count plus per-page text entries. */
const PDFReadOutputSchema = {
  type: "object",
  properties: {
    total_pages: { type: "number", description: "Total pages in PDF" },
    content: {
      type: "array",
      items: {
        type: "object",
        properties: {
          page_number: { type: "number" },
          text: { type: "string" }
        }
      },
      description: "Page content array"
    },
    current_page_group: { type: "number" },
    page_size: { type: "number" }
  }
};
/** JSON Schema for plain-text read results: status fields, content, line count, encoding and pagination fields. */
const TextReadOutputSchema = {
  type: "object",
  properties: {
    ...BaseOutputSchema.properties,
    content: { type: "string", description: "Text content" },
    total_lines: { type: "number", description: "Total line count" },
    encoding: { type: "string", description: "File encoding" },
    ...PaginationSchema
  }
};
/** JSON Schema for CSV read results: status fields, headers, row objects, encoding and pagination fields. */
const CSVReadOutputSchema = {
  type: "object",
  properties: {
    ...BaseOutputSchema.properties,
    headers: { type: "array", items: { type: "string" }, description: "CSV headers" },
    data: {
      type: "array",
      items: { type: "object" },
      description: "Structured data as array of objects"
    },
    total_rows: { type: "number", description: "Total data rows" },
    encoding: { type: "string", description: "File encoding" },
    ...PaginationSchema
  }
};
/** JSON Schema for JSON read results: status fields, the parsed data and the encoding. */
const JSONReadOutputSchema = {
  type: "object",
  properties: {
    ...BaseOutputSchema.properties,
    data: { type: "object", description: "Parsed JSON data" },
    encoding: { type: "string", description: "File encoding" }
  }
};
/** JSON Schema advertised as the output of the write_document tool. */
const WriteOutputSchema = {
  type: "object",
  properties: {
    success: { type: "boolean", description: "Write operation success" },
    file_path: { type: "string", description: "Written file path" },
    message: { type: "string", description: "Success message" },
    error: { type: "string", description: "Error message if failed" }
  }
};
/** JSON Schema for Excel metadata: per-sheet name/rows/cols and the file size. */
const ExcelInfoOutputSchema = {
  type: "object",
  properties: {
    sheets: {
      type: "array",
      items: {
        type: "object",
        properties: {
          name: { type: "string" },
          rows: { type: "number" },
          cols: { type: "number" }
        }
      },
      description: "Sheet information"
    },
    file_size: { type: "number", description: "File size in bytes" }
  }
};
/** JSON Schema for Word metadata: paragraph count, table count and the file size. */
const WordInfoOutputSchema = {
  type: "object",
  properties: {
    paragraphs: { type: "number", description: "Paragraph count" },
    tables: { type: "number", description: "Table count" },
    file_size: { type: "number", description: "File size in bytes" }
  }
};
/** JSON Schema for PDF metadata: page count, file size and total word count. */
const PDFInfoOutputSchema = {
  type: "object",
  properties: {
    pages: { type: "number", description: "Page count" },
    file_size: { type: "number", description: "File size in bytes" },
    total_words: { type: "number", description: "Total word count" }
  }
};
/**
 * JSON Schema for metadata of text-family files: status fields, size, line count,
 * encoding and extension, plus optional tabular (headers/rows/cols) and
 * collection (item/key) counts.
 */
const TextInfoOutputSchema = {
  type: "object",
  properties: {
    ...BaseOutputSchema.properties,
    file_size: { type: "number", description: "File size in bytes" },
    line_count: { type: "number", description: "Line count" },
    encoding: { type: "string", description: "File encoding" },
    file_type: { type: "string", description: "File extension" },
    headers: { type: "array", items: { type: "string" } },
    total_rows: { type: "number" },
    total_cols: { type: "number" },
    item_count: { type: "number" },
    key_count: { type: "number" }
  }
};
// Tool catalogue returned for tools/list.
// The inputSchema objects restate the zod schemas by hand; keep the two in sync.
//
// Polymorphic outputs are combined with `anyOf`, not `oneOf`: none of the
// per-format schemas declares `required` fields or forbids extra properties,
// so any result object matches several of them at once. `oneOf` demands
// exactly one match and would therefore reject every result when the
// structured content is validated against the advertised schema.
server.setRequestHandler(ListToolsRequestSchema, async () => {
  return {
    tools: [
      {
        name: "read_document",
        description: "Read document content (Excel, Word, PDF, TXT, CSV, Markdown, JSON, YAML). Supports raw full read or paginated mode.",
        inputSchema: {
          type: "object",
          properties: {
            file_path: { type: "string", description: "Absolute path to the document file" },
            mode: { type: "string", enum: ["raw", "paginated"], description: "Read mode" },
            page: { type: "number", description: "Page number for paginated mode" },
            page_size: { type: "number", description: "Items per page" },
            sheet_name: { type: "string", description: "Sheet name for Excel files" }
          },
          required: ["file_path"]
        },
        outputSchema: {
          type: "object",
          description: "Returns different structures based on file type: Excel (sheet data), Word (paragraphs/tables), PDF (page content), Text (plain text), CSV (structured rows), JSON (parsed object)",
          anyOf: [
            ExcelReadOutputSchema,
            WordReadOutputSchema,
            PDFReadOutputSchema,
            TextReadOutputSchema,
            CSVReadOutputSchema,
            JSONReadOutputSchema
          ]
        }
      },
      {
        name: "write_document",
        description: "Write document content (Excel, Word, Text)",
        inputSchema: {
          type: "object",
          properties: {
            file_path: { type: "string", description: "Absolute path to save the document" },
            format: { type: "string", enum: ["excel", "word", "text"], description: "Document format" },
            data: { description: "Document data structure. Excel: array of rows [[cell1, cell2], ...]. Word: {paragraphs: string[], tables?: [[[cell]]]}. Text/CSV/JSON: string or object" }
          },
          required: ["file_path", "format", "data"]
        },
        outputSchema: WriteOutputSchema
      },
      {
        name: "get_document_info",
        description: "Get document metadata (page count, sheet count, file size, etc.)",
        inputSchema: {
          type: "object",
          properties: {
            file_path: { type: "string", description: "Absolute path to the document file" }
          },
          required: ["file_path"]
        },
        outputSchema: {
          type: "object",
          description: "Returns metadata based on file type",
          anyOf: [
            ExcelInfoOutputSchema,
            WordInfoOutputSchema,
            PDFInfoOutputSchema,
            TextInfoOutputSchema
          ]
        }
      }
    ]
  };
});
/** Python handler script for each detected file type / write format. */
const DOCSMITH_HANDLER_SCRIPTS = {
  excel: "excel_handler.py",
  word: "word_handler.py",
  pdf: "pdf_handler.py",
  text: "text_handler.py"
};
/** Upper bound applied to the page size forwarded to the PDF handler. */
const DOCSMITH_MAX_PDF_PAGE_SIZE = 10;
/**
 * Build the positional arguments for a handler script's "read" command.
 *
 * @param fileType - Detected file type ("excel", "word", "pdf" or "text")
 * @param params - Validated read_document arguments
 * @param page - Page number, or undefined when reading in raw mode
 * @param pageSize - Requested items per page
 * @returns Argument list: "read", the file path, then optional sheet name and page/page-size
 */
function buildReadScriptArgs(fileType, params, page, pageSize) {
  const scriptArgs = ["read", params.file_path];
  // NOTE(review): when sheet_name is omitted the page number occupies its
  // positional slot — confirm excel_handler.py copes with that.
  if (fileType === "excel" && params.sheet_name) scriptArgs.push(params.sheet_name);
  if (page) {
    const effectivePageSize = fileType === "pdf" ? Math.min(pageSize, DOCSMITH_MAX_PDF_PAGE_SIZE) : pageSize;
    scriptArgs.push(String(page), String(effectivePageSize));
  }
  return scriptArgs;
}
/**
 * Build the positional arguments for a handler script's "write" command.
 *
 * @param params - Validated write_document arguments
 * @returns Argument list: "write", the file path, then the serialized payload(s)
 * @throws Error when `data` is missing or the format is not supported
 */
function buildWriteScriptArgs(params) {
  const { file_path: filePath, format, data } = params;
  // z.any() accepts undefined/null; fail with a clear message instead of a
  // TypeError on property access or an unusable argument list.
  if (data === void 0 || data === null) throw new Error("Missing required 'data' for write_document");
  if (format === "excel") return ["write", filePath, JSON.stringify(data)];
  if (format === "word") {
    const paragraphs = data.paragraphs || [];
    const tables = data.tables || null;
    const scriptArgs = ["write", filePath, JSON.stringify(paragraphs)];
    if (tables) scriptArgs.push(JSON.stringify(tables));
    return scriptArgs;
  }
  if (format === "text") {
    const content = typeof data === "string" ? data : JSON.stringify(data);
    return ["write", filePath, content];
  }
  throw new Error(`Unsupported write format: ${format}`);
}
/**
 * Wrap a handler result in the tool-call response envelope.
 *
 * The result is returned both as pretty-printed JSON text and as structured
 * content. Results that report a failure (`success === false` or a non-empty
 * `error` string, which is what the Python runner returns when execution
 * fails) are additionally flagged with `isError` so clients can tell them
 * apart from successful calls.
 *
 * @param result - Value returned by runPythonFile
 * @returns Tool-call response object
 */
function toDocsmithToolResult(result) {
  const response = {
    content: [{
      type: "text",
      text: JSON.stringify(result, null, 2)
    }],
    structuredContent: result
  };
  const isObject = result !== null && typeof result === "object";
  const hasErrorText = isObject && typeof result.error === "string" && result.error.length > 0;
  if (isObject && (result.success === false || hasErrorText)) response.isError = true;
  return response;
}
// Dispatches tools/call requests to the matching Python handler script.
// Every failure (validation, unsupported type, unknown tool, runner exception)
// is converted into an `isError` text response rather than a protocol error.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name, arguments: args } = request.params;
  try {
    if (name === "read_document") {
      const params = ReadDocumentSchema.parse(args);
      const fileType = detectFileType(params.file_path);
      if (!fileType) throw new Error(`Unsupported file type: ${params.file_path}`);
      const config = getConfig();
      // An explicit mode wins; otherwise the environment decides the default.
      const mode = params.mode || (config.rawFullRead ? "raw" : "paginated");
      const page = mode === "paginated" ? params.page || 1 : void 0;
      const pageSize = params.page_size || config.pageSize;
      const result = await runPythonFile(DOCSMITH_HANDLER_SCRIPTS[fileType], {
        args: buildReadScriptArgs(fileType, params, page, pageSize),
        packages: getPackages(fileType),
        filePaths: [params.file_path]
      });
      return toDocsmithToolResult(result);
    }
    if (name === "write_document") {
      const params = WriteDocumentSchema.parse(args);
      const scriptArgs = buildWriteScriptArgs(params);
      const result = await runPythonFile(DOCSMITH_HANDLER_SCRIPTS[params.format], {
        args: scriptArgs,
        packages: getPackages(params.format),
        filePaths: [params.file_path]
      });
      return toDocsmithToolResult(result);
    }
    if (name === "get_document_info") {
      const params = GetDocumentInfoSchema.parse(args);
      const fileType = detectFileType(params.file_path);
      if (!fileType) throw new Error(`Unsupported file type: ${params.file_path}`);
      const result = await runPythonFile(DOCSMITH_HANDLER_SCRIPTS[fileType], {
        args: ["info", params.file_path],
        packages: getPackages(fileType),
        filePaths: [params.file_path]
      });
      return toDocsmithToolResult(result);
    }
    throw new Error(`Unknown tool: ${name}`);
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    return {
      content: [{
        type: "text",
        text: `Error: ${errorMessage}`
      }],
      isError: true
    };
  }
});
/**
 * Entry point: attach the server to a stdio transport.
 *
 * The startup notice goes to stderr via console.error; stdout is not written to here.
 */
async function main() {
  await server.connect(new StdioServerTransport());
  console.error("Docsmith MCP server running on stdio");
}
// A failed startup is logged and ends the process with a non-zero exit code.
main().catch((startupError) => {
  console.error("Server error:", startupError);
  process.exit(1);
});
677
+
678
+ //#endregion
679
+ //# sourceMappingURL=index.js.map