@sylphx/pdf-reader-mcp 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +205 -91
- package/dist/index.js +620 -49
- package/package.json +44 -42
- package/dist/handlers/index.js +0 -4
- package/dist/handlers/readPdf.js +0 -170
- package/dist/pdf/extractor.js +0 -394
- package/dist/pdf/loader.js +0 -53
- package/dist/pdf/parser.js +0 -96
- package/dist/schemas/readPdf.js +0 -59
- package/dist/types/pdf.js +0 -2
- package/dist/utils/pathUtils.js +0 -25
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sylphx/pdf-reader-mcp",
|
|
3
|
-
"version": "1.3.0",
|
|
3
|
+
"version": "1.4.0",
|
|
4
4
|
"description": "An MCP server providing tools to read PDF files.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -11,6 +11,12 @@
|
|
|
11
11
|
"README.md",
|
|
12
12
|
"LICENSE"
|
|
13
13
|
],
|
|
14
|
+
"exports": {
|
|
15
|
+
".": {
|
|
16
|
+
"import": "./dist/index.js",
|
|
17
|
+
"types": "./dist/index.d.ts"
|
|
18
|
+
}
|
|
19
|
+
},
|
|
14
20
|
"publishConfig": {
|
|
15
21
|
"access": "public"
|
|
16
22
|
},
|
|
@@ -19,12 +25,12 @@
|
|
|
19
25
|
},
|
|
20
26
|
"repository": {
|
|
21
27
|
"type": "git",
|
|
22
|
-
"url": "git+https://github.com/
|
|
28
|
+
"url": "git+https://github.com/SylphxAI/pdf-reader-mcp.git"
|
|
23
29
|
},
|
|
24
30
|
"bugs": {
|
|
25
|
-
"url": "https://github.com/
|
|
31
|
+
"url": "https://github.com/SylphxAI/pdf-reader-mcp/issues"
|
|
26
32
|
},
|
|
27
|
-
"homepage": "https://github.com/
|
|
33
|
+
"homepage": "https://github.com/SylphxAI/pdf-reader-mcp#readme",
|
|
28
34
|
"author": "Sylphx <contact@sylphx.com> (https://sylphx.com)",
|
|
29
35
|
"license": "MIT",
|
|
30
36
|
"keywords": [
|
|
@@ -40,65 +46,61 @@
|
|
|
40
46
|
"tool"
|
|
41
47
|
],
|
|
42
48
|
"scripts": {
|
|
43
|
-
"build": "
|
|
49
|
+
"build": "bunup",
|
|
44
50
|
"watch": "tsc --watch",
|
|
45
51
|
"inspector": "npx @modelcontextprotocol/inspector dist/index.js",
|
|
46
|
-
"test": "
|
|
47
|
-
"test:watch": "
|
|
48
|
-
"test:cov": "
|
|
52
|
+
"test": "bun test",
|
|
53
|
+
"test:watch": "bun test --watch",
|
|
54
|
+
"test:cov": "bun test --coverage",
|
|
49
55
|
"lint": "biome lint .",
|
|
50
56
|
"lint:fix": "biome lint --write .",
|
|
51
57
|
"format": "biome format --write .",
|
|
52
58
|
"check-format": "biome format .",
|
|
53
59
|
"check": "biome check .",
|
|
54
60
|
"check:fix": "biome check --write .",
|
|
55
|
-
"validate": "
|
|
56
|
-
"docs:dev": "
|
|
57
|
-
"docs:build": "
|
|
58
|
-
"docs:preview": "
|
|
61
|
+
"validate": "bun run check && bun run test",
|
|
62
|
+
"docs:dev": "leaf dev docs",
|
|
63
|
+
"docs:build": "leaf build docs",
|
|
64
|
+
"docs:preview": "leaf preview docs",
|
|
59
65
|
"start": "node dist/index.js",
|
|
60
66
|
"typecheck": "tsc --noEmit",
|
|
61
|
-
"benchmark": "
|
|
67
|
+
"benchmark": "bun bench",
|
|
62
68
|
"clean": "rm -rf dist coverage",
|
|
63
69
|
"docs:api": "typedoc --entryPoints src/index.ts --tsconfig tsconfig.json --plugin typedoc-plugin-markdown --out docs/api --readme none",
|
|
64
|
-
"prepublishOnly": "
|
|
70
|
+
"prepublishOnly": "bunx @sylphx/doctor prepublish && bun run clean && bun run build",
|
|
65
71
|
"release": "standard-version",
|
|
66
|
-
"prepare": "
|
|
72
|
+
"prepare": "node_modules/.bin/lefthook install || true"
|
|
67
73
|
},
|
|
68
74
|
"dependencies": {
|
|
69
|
-
"@
|
|
70
|
-
"glob": "^11.0
|
|
71
|
-
"pdfjs-dist": "^5.4.
|
|
75
|
+
"@sylphx/mcp-server-sdk": "1.0.0",
|
|
76
|
+
"glob": "^11.1.0",
|
|
77
|
+
"pdfjs-dist": "^5.4.394",
|
|
72
78
|
"pngjs": "^7.0.0",
|
|
73
|
-
"zod": "
|
|
74
|
-
"zod-to-json-schema": "^3.
|
|
79
|
+
"zod": "4.2.0-canary.20251124T022609",
|
|
80
|
+
"zod-to-json-schema": "^3.25.0"
|
|
75
81
|
},
|
|
76
82
|
"devDependencies": {
|
|
77
|
-
"@biomejs/biome": "^2.3.
|
|
78
|
-
"@
|
|
79
|
-
"@
|
|
83
|
+
"@biomejs/biome": "^2.3.8",
|
|
84
|
+
"@solidjs/router": "^0.15.4",
|
|
85
|
+
"@sylphx/biome-config": "^0.4.0",
|
|
86
|
+
"@sylphx/bump": "^0.12.1",
|
|
87
|
+
"@sylphx/doctor": "^1.23.3",
|
|
88
|
+
"@sylphx/leaf": "^1.0.0",
|
|
89
|
+
"@sylphx/leaf-theme-default": "^1.0.0",
|
|
90
|
+
"@sylphx/tsconfig": "^0.3.0",
|
|
80
91
|
"@types/glob": "^8.1.0",
|
|
81
|
-
"@types/node": "^24.
|
|
92
|
+
"@types/node": "^24.10.1",
|
|
82
93
|
"@types/pngjs": "^6.0.5",
|
|
83
|
-
"
|
|
84
|
-
"
|
|
85
|
-
"
|
|
86
|
-
"
|
|
87
|
-
"typedoc": "^0.28.2",
|
|
94
|
+
"bunup": "^0.16.10",
|
|
95
|
+
"lefthook": "^2.0.4",
|
|
96
|
+
"solid-js": "^1.9.10",
|
|
97
|
+
"typedoc": "^0.28.14",
|
|
88
98
|
"typedoc-plugin-markdown": "^4.9.0",
|
|
89
|
-
"typescript": "^5.
|
|
90
|
-
"
|
|
91
|
-
"vitest": "^4.0.7",
|
|
92
|
-
"vue": "^3.5.13"
|
|
93
|
-
},
|
|
94
|
-
"commitlint": {
|
|
95
|
-
"extends": [
|
|
96
|
-
"@commitlint/config-conventional"
|
|
97
|
-
]
|
|
99
|
+
"typescript": "^5.9.3",
|
|
100
|
+
"vite": "^7.2.4"
|
|
98
101
|
},
|
|
99
|
-
"
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
]
|
|
102
|
+
"packageManager": "bun@1.3.1",
|
|
103
|
+
"overrides": {
|
|
104
|
+
"js-yaml": "^4.1.0"
|
|
103
105
|
}
|
|
104
106
|
}
|
package/dist/handlers/index.js
DELETED
package/dist/handlers/readPdf.js
DELETED
|
@@ -1,170 +0,0 @@
|
|
|
1
|
-
// PDF reading handler - orchestrates PDF processing workflow
|
|
2
|
-
import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';
|
|
3
|
-
import { z } from 'zod';
|
|
4
|
-
import { buildWarnings, extractMetadataAndPageCount, extractPageContent, } from '../pdf/extractor.js';
|
|
5
|
-
import { loadPdfDocument } from '../pdf/loader.js';
|
|
6
|
-
import { determinePagesToProcess, getTargetPages } from '../pdf/parser.js';
|
|
7
|
-
import { readPdfArgsSchema } from '../schemas/readPdf.js';
|
|
8
|
-
/**
|
|
9
|
-
* Process a single PDF source
|
|
10
|
-
*/
|
|
11
|
-
const processSingleSource = async (source, options) => {
|
|
12
|
-
const sourceDescription = source.path ?? source.url ?? 'unknown source';
|
|
13
|
-
let individualResult = { source: sourceDescription, success: false };
|
|
14
|
-
try {
|
|
15
|
-
// Parse target pages
|
|
16
|
-
const targetPages = getTargetPages(source.pages, sourceDescription);
|
|
17
|
-
// Load PDF document
|
|
18
|
-
const { pages: _pages, ...loadArgs } = source;
|
|
19
|
-
const pdfDocument = await loadPdfDocument(loadArgs, sourceDescription);
|
|
20
|
-
const totalPages = pdfDocument.numPages;
|
|
21
|
-
// Extract metadata and page count
|
|
22
|
-
const metadataOutput = await extractMetadataAndPageCount(pdfDocument, options.includeMetadata, options.includePageCount);
|
|
23
|
-
const output = { ...metadataOutput };
|
|
24
|
-
// Determine pages to process
|
|
25
|
-
const { pagesToProcess, invalidPages } = determinePagesToProcess(targetPages, totalPages, options.includeFullText);
|
|
26
|
-
// Add warnings for invalid pages
|
|
27
|
-
const warnings = buildWarnings(invalidPages, totalPages);
|
|
28
|
-
if (warnings.length > 0) {
|
|
29
|
-
output.warnings = warnings;
|
|
30
|
-
}
|
|
31
|
-
// Extract content with ordering preserved
|
|
32
|
-
if (pagesToProcess.length > 0) {
|
|
33
|
-
// Use new extractPageContent to preserve Y-coordinate ordering
|
|
34
|
-
const pageContents = await Promise.all(pagesToProcess.map((pageNum) => extractPageContent(pdfDocument, pageNum, options.includeImages, sourceDescription)));
|
|
35
|
-
// Store page contents for ordered retrieval
|
|
36
|
-
output.page_contents = pageContents.map((items, idx) => ({
|
|
37
|
-
page: pagesToProcess[idx],
|
|
38
|
-
items,
|
|
39
|
-
}));
|
|
40
|
-
// For backward compatibility, also provide text-only outputs
|
|
41
|
-
const extractedPageTexts = pageContents.map((items, idx) => ({
|
|
42
|
-
page: pagesToProcess[idx],
|
|
43
|
-
text: items
|
|
44
|
-
.filter((item) => item.type === 'text')
|
|
45
|
-
.map((item) => item.textContent)
|
|
46
|
-
.join(''),
|
|
47
|
-
}));
|
|
48
|
-
if (targetPages) {
|
|
49
|
-
// Specific pages requested
|
|
50
|
-
output.page_texts = extractedPageTexts;
|
|
51
|
-
}
|
|
52
|
-
else {
|
|
53
|
-
// Full text requested
|
|
54
|
-
output.full_text = extractedPageTexts.map((p) => p.text).join('\n\n');
|
|
55
|
-
}
|
|
56
|
-
// Extract image metadata for JSON response
|
|
57
|
-
if (options.includeImages) {
|
|
58
|
-
const extractedImages = pageContents
|
|
59
|
-
.flatMap((items) => items.filter((item) => item.type === 'image' && item.imageData))
|
|
60
|
-
.map((item) => item.imageData)
|
|
61
|
-
.filter((img) => img !== undefined);
|
|
62
|
-
if (extractedImages.length > 0) {
|
|
63
|
-
output.images = extractedImages;
|
|
64
|
-
}
|
|
65
|
-
}
|
|
66
|
-
}
|
|
67
|
-
individualResult = { ...individualResult, data: output, success: true };
|
|
68
|
-
}
|
|
69
|
-
catch (error) {
|
|
70
|
-
let errorMessage = `Failed to process PDF from ${sourceDescription}.`;
|
|
71
|
-
if (error instanceof McpError) {
|
|
72
|
-
errorMessage = error.message;
|
|
73
|
-
} /* c8 ignore next */
|
|
74
|
-
else if (error instanceof Error) {
|
|
75
|
-
errorMessage += ` Reason: ${error.message}`;
|
|
76
|
-
}
|
|
77
|
-
else {
|
|
78
|
-
errorMessage += ` Unknown error: ${JSON.stringify(error)}`;
|
|
79
|
-
}
|
|
80
|
-
individualResult.error = errorMessage;
|
|
81
|
-
individualResult.success = false;
|
|
82
|
-
individualResult.data = undefined;
|
|
83
|
-
}
|
|
84
|
-
return individualResult;
|
|
85
|
-
};
|
|
86
|
-
/**
|
|
87
|
-
* Main handler function for read_pdf tool
|
|
88
|
-
*/
|
|
89
|
-
export const handleReadPdfFunc = async (args) => {
|
|
90
|
-
let parsedArgs;
|
|
91
|
-
try {
|
|
92
|
-
parsedArgs = readPdfArgsSchema.parse(args);
|
|
93
|
-
}
|
|
94
|
-
catch (error) {
|
|
95
|
-
if (error instanceof z.ZodError) {
|
|
96
|
-
throw new McpError(ErrorCode.InvalidParams, `Invalid arguments: ${error.issues.map((e) => `${e.path.join('.')} (${e.message})`).join(', ')}`);
|
|
97
|
-
}
|
|
98
|
-
/* c8 ignore next */
|
|
99
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
100
|
-
/* c8 ignore next */
|
|
101
|
-
throw new McpError(ErrorCode.InvalidParams, `Argument validation failed: ${message}`);
|
|
102
|
-
}
|
|
103
|
-
const { sources, include_full_text, include_metadata, include_page_count, include_images } = parsedArgs;
|
|
104
|
-
// Process all sources concurrently
|
|
105
|
-
const results = await Promise.all(sources.map((source) => processSingleSource(source, {
|
|
106
|
-
includeFullText: include_full_text,
|
|
107
|
-
includeMetadata: include_metadata,
|
|
108
|
-
includePageCount: include_page_count,
|
|
109
|
-
includeImages: include_images,
|
|
110
|
-
})));
|
|
111
|
-
// Build content parts - start with structured JSON for backward compatibility
|
|
112
|
-
const content = [];
|
|
113
|
-
// Strip image data and page_contents from JSON to keep it manageable
|
|
114
|
-
const resultsForJson = results.map((result) => {
|
|
115
|
-
if (result.data) {
|
|
116
|
-
const { images, page_contents, ...dataWithoutBinaryContent } = result.data;
|
|
117
|
-
// Include image count and metadata in JSON, but not the base64 data
|
|
118
|
-
if (images) {
|
|
119
|
-
const imageInfo = images.map((img) => ({
|
|
120
|
-
page: img.page,
|
|
121
|
-
index: img.index,
|
|
122
|
-
width: img.width,
|
|
123
|
-
height: img.height,
|
|
124
|
-
format: img.format,
|
|
125
|
-
}));
|
|
126
|
-
return { ...result, data: { ...dataWithoutBinaryContent, image_info: imageInfo } };
|
|
127
|
-
}
|
|
128
|
-
return { ...result, data: dataWithoutBinaryContent };
|
|
129
|
-
}
|
|
130
|
-
return result;
|
|
131
|
-
});
|
|
132
|
-
// First content part: Structured JSON results
|
|
133
|
-
content.push({
|
|
134
|
-
type: 'text',
|
|
135
|
-
text: JSON.stringify({ results: resultsForJson }, null, 2),
|
|
136
|
-
});
|
|
137
|
-
// Add page content in exact Y-coordinate order
|
|
138
|
-
for (const result of results) {
|
|
139
|
-
if (!result.success || !result.data?.page_contents)
|
|
140
|
-
continue;
|
|
141
|
-
// Process each page's content items in order
|
|
142
|
-
for (const pageContent of result.data.page_contents) {
|
|
143
|
-
for (const item of pageContent.items) {
|
|
144
|
-
if (item.type === 'text' && item.textContent) {
|
|
145
|
-
// Add text content part
|
|
146
|
-
content.push({
|
|
147
|
-
type: 'text',
|
|
148
|
-
text: item.textContent,
|
|
149
|
-
});
|
|
150
|
-
}
|
|
151
|
-
else if (item.type === 'image' && item.imageData) {
|
|
152
|
-
// Add image content part (all images are now encoded as PNG)
|
|
153
|
-
content.push({
|
|
154
|
-
type: 'image',
|
|
155
|
-
data: item.imageData.data,
|
|
156
|
-
mimeType: 'image/png',
|
|
157
|
-
});
|
|
158
|
-
}
|
|
159
|
-
}
|
|
160
|
-
}
|
|
161
|
-
}
|
|
162
|
-
return { content };
|
|
163
|
-
};
|
|
164
|
-
// Export the tool definition
|
|
165
|
-
export const readPdfToolDefinition = {
|
|
166
|
-
name: 'read_pdf',
|
|
167
|
-
description: 'Reads content/metadata/images from one or more PDFs (local/URL). Each source can specify pages to extract.',
|
|
168
|
-
schema: readPdfArgsSchema,
|
|
169
|
-
handler: handleReadPdfFunc,
|
|
170
|
-
};
|