@sylphx/pdf-reader-mcp 1.1.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +508 -246
- package/dist/handlers/readPdf.js +64 -55
- package/dist/index.js +1 -1
- package/dist/pdf/extractor.js +255 -14
- package/dist/pdf/parser.js +6 -4
- package/dist/schemas/readPdf.js +5 -1
- package/dist/utils/pathUtils.js +7 -12
- package/package.json +37 -33
package/dist/handlers/readPdf.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
// PDF reading handler - orchestrates PDF processing workflow
|
|
2
2
|
import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';
|
|
3
3
|
import { z } from 'zod';
|
|
4
|
-
import { buildWarnings,
|
|
4
|
+
import { buildWarnings, extractMetadataAndPageCount, extractPageContent, } from '../pdf/extractor.js';
|
|
5
5
|
import { loadPdfDocument } from '../pdf/loader.js';
|
|
6
6
|
import { determinePagesToProcess, getTargetPages } from '../pdf/parser.js';
|
|
7
7
|
import { readPdfArgsSchema } from '../schemas/readPdf.js';
|
|
@@ -28,9 +28,23 @@ const processSingleSource = async (source, options) => {
|
|
|
28
28
|
if (warnings.length > 0) {
|
|
29
29
|
output.warnings = warnings;
|
|
30
30
|
}
|
|
31
|
-
// Extract
|
|
31
|
+
// Extract content with ordering preserved
|
|
32
32
|
if (pagesToProcess.length > 0) {
|
|
33
|
-
|
|
33
|
+
// Use new extractPageContent to preserve Y-coordinate ordering
|
|
34
|
+
const pageContents = await Promise.all(pagesToProcess.map((pageNum) => extractPageContent(pdfDocument, pageNum, options.includeImages, sourceDescription)));
|
|
35
|
+
// Store page contents for ordered retrieval
|
|
36
|
+
output.page_contents = pageContents.map((items, idx) => ({
|
|
37
|
+
page: pagesToProcess[idx],
|
|
38
|
+
items,
|
|
39
|
+
}));
|
|
40
|
+
// For backward compatibility, also provide text-only outputs
|
|
41
|
+
const extractedPageTexts = pageContents.map((items, idx) => ({
|
|
42
|
+
page: pagesToProcess[idx],
|
|
43
|
+
text: items
|
|
44
|
+
.filter((item) => item.type === 'text')
|
|
45
|
+
.map((item) => item.textContent)
|
|
46
|
+
.join(''),
|
|
47
|
+
}));
|
|
34
48
|
if (targetPages) {
|
|
35
49
|
// Specific pages requested
|
|
36
50
|
output.page_texts = extractedPageTexts;
|
|
@@ -39,12 +53,15 @@ const processSingleSource = async (source, options) => {
|
|
|
39
53
|
// Full text requested
|
|
40
54
|
output.full_text = extractedPageTexts.map((p) => p.text).join('\n\n');
|
|
41
55
|
}
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
56
|
+
// Extract image metadata for JSON response
|
|
57
|
+
if (options.includeImages) {
|
|
58
|
+
const extractedImages = pageContents
|
|
59
|
+
.flatMap((items) => items.filter((item) => item.type === 'image' && item.imageData))
|
|
60
|
+
.map((item) => item.imageData)
|
|
61
|
+
.filter((img) => img !== undefined);
|
|
62
|
+
if (extractedImages.length > 0) {
|
|
63
|
+
output.images = extractedImages;
|
|
64
|
+
}
|
|
48
65
|
}
|
|
49
66
|
}
|
|
50
67
|
individualResult = { ...individualResult, data: output, success: true };
|
|
@@ -53,7 +70,7 @@ const processSingleSource = async (source, options) => {
|
|
|
53
70
|
let errorMessage = `Failed to process PDF from ${sourceDescription}.`;
|
|
54
71
|
if (error instanceof McpError) {
|
|
55
72
|
errorMessage = error.message;
|
|
56
|
-
}
|
|
73
|
+
} /* c8 ignore next */
|
|
57
74
|
else if (error instanceof Error) {
|
|
58
75
|
errorMessage += ` Reason: ${error.message}`;
|
|
59
76
|
}
|
|
@@ -76,9 +93,11 @@ export const handleReadPdfFunc = async (args) => {
|
|
|
76
93
|
}
|
|
77
94
|
catch (error) {
|
|
78
95
|
if (error instanceof z.ZodError) {
|
|
79
|
-
throw new McpError(ErrorCode.InvalidParams, `Invalid arguments: ${error.
|
|
96
|
+
throw new McpError(ErrorCode.InvalidParams, `Invalid arguments: ${error.issues.map((e) => `${e.path.join('.')} (${e.message})`).join(', ')}`);
|
|
80
97
|
}
|
|
98
|
+
/* c8 ignore next */
|
|
81
99
|
const message = error instanceof Error ? error.message : String(error);
|
|
100
|
+
/* c8 ignore next */
|
|
82
101
|
throw new McpError(ErrorCode.InvalidParams, `Argument validation failed: ${message}`);
|
|
83
102
|
}
|
|
84
103
|
const { sources, include_full_text, include_metadata, include_page_count, include_images } = parsedArgs;
|
|
@@ -91,19 +110,22 @@ export const handleReadPdfFunc = async (args) => {
|
|
|
91
110
|
})));
|
|
92
111
|
// Build content parts - start with structured JSON for backward compatibility
|
|
93
112
|
const content = [];
|
|
94
|
-
// Strip image data from JSON to keep it manageable
|
|
113
|
+
// Strip image data and page_contents from JSON to keep it manageable
|
|
95
114
|
const resultsForJson = results.map((result) => {
|
|
96
|
-
if (result.data
|
|
97
|
-
const { images, ...
|
|
115
|
+
if (result.data) {
|
|
116
|
+
const { images, page_contents, ...dataWithoutBinaryContent } = result.data;
|
|
98
117
|
// Include image count and metadata in JSON, but not the base64 data
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
118
|
+
if (images) {
|
|
119
|
+
const imageInfo = images.map((img) => ({
|
|
120
|
+
page: img.page,
|
|
121
|
+
index: img.index,
|
|
122
|
+
width: img.width,
|
|
123
|
+
height: img.height,
|
|
124
|
+
format: img.format,
|
|
125
|
+
}));
|
|
126
|
+
return { ...result, data: { ...dataWithoutBinaryContent, image_info: imageInfo } };
|
|
127
|
+
}
|
|
128
|
+
return { ...result, data: dataWithoutBinaryContent };
|
|
107
129
|
}
|
|
108
130
|
return result;
|
|
109
131
|
});
|
|
@@ -112,40 +134,27 @@ export const handleReadPdfFunc = async (args) => {
|
|
|
112
134
|
type: 'text',
|
|
113
135
|
text: JSON.stringify({ results: resultsForJson }, null, 2),
|
|
114
136
|
});
|
|
115
|
-
// Add page content in
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
// Add
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
type: 'image',
|
|
129
|
-
data: image.data,
|
|
130
|
-
mimeType: image.format === 'rgba' ? 'image/png' : 'image/jpeg',
|
|
131
|
-
});
|
|
132
|
-
}
|
|
133
|
-
}
|
|
137
|
+
// Add page content in exact Y-coordinate order
|
|
138
|
+
for (const result of results) {
|
|
139
|
+
if (!result.success || !result.data?.page_contents)
|
|
140
|
+
continue;
|
|
141
|
+
// Process each page's content items in order
|
|
142
|
+
for (const pageContent of result.data.page_contents) {
|
|
143
|
+
for (const item of pageContent.items) {
|
|
144
|
+
if (item.type === 'text' && item.textContent) {
|
|
145
|
+
// Add text content part
|
|
146
|
+
content.push({
|
|
147
|
+
type: 'text',
|
|
148
|
+
text: item.textContent,
|
|
149
|
+
});
|
|
134
150
|
}
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
for (const image of pageImages) {
|
|
143
|
-
content.push({
|
|
144
|
-
type: 'image',
|
|
145
|
-
data: image.data,
|
|
146
|
-
mimeType: image.format === 'rgba' ? 'image/png' : 'image/jpeg',
|
|
147
|
-
});
|
|
148
|
-
}
|
|
151
|
+
else if (item.type === 'image' && item.imageData) {
|
|
152
|
+
// Add image content part (all images are now encoded as PNG)
|
|
153
|
+
content.push({
|
|
154
|
+
type: 'image',
|
|
155
|
+
data: item.imageData.data,
|
|
156
|
+
mimeType: 'image/png',
|
|
157
|
+
});
|
|
149
158
|
}
|
|
150
159
|
}
|
|
151
160
|
}
|
package/dist/index.js
CHANGED
|
@@ -11,7 +11,7 @@ import { allToolDefinitions } from './handlers/index.js';
|
|
|
11
11
|
// --- Server Setup ---
|
|
12
12
|
const server = new Server({
|
|
13
13
|
name: 'pdf-reader-mcp',
|
|
14
|
-
version: '1.
|
|
14
|
+
version: '1.3.0',
|
|
15
15
|
description: 'MCP Server for reading PDF files and extracting text, metadata, images, and page information.',
|
|
16
16
|
}, {
|
|
17
17
|
capabilities: { tools: {} },
|
package/dist/pdf/extractor.js
CHANGED
|
@@ -1,5 +1,42 @@
|
|
|
1
1
|
// PDF text and metadata extraction utilities
|
|
2
2
|
import { OPS } from 'pdfjs-dist/legacy/build/pdf.mjs';
|
|
3
|
+
import { PNG } from 'pngjs';
|
|
4
|
+
/**
 * Encode raw pixel data as a base64-encoded PNG.
 *
 * The pixels are expanded to the 8-bit RGBA layout that pngjs expects and
 * then written synchronously with `PNG.sync.write`.
 *
 * Supported layouts, selected by `channels`:
 *  - 4: RGBA, one byte per component. A buffer whose length is not exactly
 *       `width * height * 4` is truncated / zero-padded to that size so the
 *       encoder never reads past the end of the pixel data.
 *  - 3: RGB, one byte per component; alpha is set to fully opaque.
 *  - 1: grayscale. Either one byte per pixel, or (what pdf.js delivers for
 *       image kind 1, GRAYSCALE_1BPP) one *bit* per pixel, most significant
 *       bit first, with every row padded to a whole byte. The packed layout
 *       is detected from the buffer length.
 *
 * Any other `channels` value leaves the image buffer untouched (fully
 * transparent pixels), exactly as before.
 *
 * @param {ArrayLike<number>} pixelData Raw pixel bytes.
 * @param {number} width Image width in pixels.
 * @param {number} height Image height in pixels.
 * @param {number} channels Number of channels in `pixelData` (1, 3 or 4).
 * @returns {string} Base64 representation of the encoded PNG file.
 */
const encodePixelsToPNG = (pixelData, width, height, channels) => {
    const png = new PNG({ width, height });
    const pixelCount = width * height;
    if (channels === 4) {
        // Already RGBA. Only adopt the buffer as-is when it has exactly the
        // expected size; otherwise fit it into a correctly sized buffer.
        const rgba = Buffer.from(pixelData);
        if (rgba.length === pixelCount * 4) {
            png.data = rgba;
        }
        else {
            const fitted = Buffer.alloc(pixelCount * 4);
            rgba.copy(fitted); // copy() clamps to the target's capacity
            png.data = fitted;
        }
    }
    else if (channels === 3) {
        // RGB -> RGBA (add an opaque alpha channel)
        for (let i = 0; i < pixelCount; i++) {
            const srcIdx = i * 3;
            const dstIdx = i * 4;
            png.data[dstIdx] = pixelData[srcIdx] ?? 0; // R
            png.data[dstIdx + 1] = pixelData[srcIdx + 1] ?? 0; // G
            png.data[dstIdx + 2] = pixelData[srcIdx + 2] ?? 0; // B
            png.data[dstIdx + 3] = 255; // A (fully opaque)
        }
    }
    else if (channels === 1) {
        // Grayscale -> RGBA.
        // A buffer that is too short for one byte per pixel but large enough
        // for byte-aligned rows of single bits is treated as packed 1bpp data.
        // (For width === 1 both layouts have the same size; the historical
        // byte-per-pixel interpretation is kept in that case.)
        const packedStride = (width + 7) >> 3;
        const isPacked1Bpp = pixelData.length < pixelCount && pixelData.length >= packedStride * height;
        for (let i = 0; i < pixelCount; i++) {
            let gray;
            if (isPacked1Bpp) {
                const row = Math.floor(i / width);
                const col = i % width;
                const packedByte = pixelData[row * packedStride + (col >> 3)] ?? 0;
                // MSB-first: a set bit is white, a cleared bit is black.
                gray = (packedByte & (0x80 >> (col & 7))) !== 0 ? 255 : 0;
            }
            else {
                gray = pixelData[i] ?? 0;
            }
            const dstIdx = i * 4;
            png.data[dstIdx] = gray; // R
            png.data[dstIdx + 1] = gray; // G
            png.data[dstIdx + 2] = gray; // B
            png.data[dstIdx + 3] = 255; // A
        }
    }
    // Encode to PNG and convert to base64
    const pngBuffer = PNG.sync.write(png);
    return pngBuffer.toString('base64');
};
|
|
3
40
|
/**
|
|
4
41
|
* Extract metadata and page count from a PDF document
|
|
5
42
|
*/
|
|
@@ -68,6 +105,7 @@ export const extractPageTexts = async (pdfDocument, pagesToProcess, sourceDescri
|
|
|
68
105
|
*/
|
|
69
106
|
const extractImagesFromPage = async (page, pageNum) => {
|
|
70
107
|
const images = [];
|
|
108
|
+
/* c8 ignore next */
|
|
71
109
|
try {
|
|
72
110
|
const operatorList = await page.getOperatorList();
|
|
73
111
|
// Find all image painting operations
|
|
@@ -78,7 +116,7 @@ const extractImagesFromPage = async (page, pageNum) => {
|
|
|
78
116
|
imageIndices.push(i);
|
|
79
117
|
}
|
|
80
118
|
}
|
|
81
|
-
// Extract each image
|
|
119
|
+
// Extract each image - try sync first, then async if needed
|
|
82
120
|
const imagePromises = imageIndices.map((imgIndex, arrayIndex) => new Promise((resolve) => {
|
|
83
121
|
const argsArray = operatorList.argsArray[imgIndex];
|
|
84
122
|
if (!argsArray || argsArray.length === 0) {
|
|
@@ -86,30 +124,75 @@ const extractImagesFromPage = async (page, pageNum) => {
|
|
|
86
124
|
return;
|
|
87
125
|
}
|
|
88
126
|
const imageName = argsArray[0];
|
|
89
|
-
//
|
|
90
|
-
|
|
127
|
+
// Helper to process image data
|
|
128
|
+
const processImageData = (imageData) => {
|
|
91
129
|
if (!imageData || typeof imageData !== 'object') {
|
|
92
|
-
|
|
93
|
-
return;
|
|
130
|
+
return null;
|
|
94
131
|
}
|
|
95
132
|
const img = imageData;
|
|
96
133
|
if (!img.data || !img.width || !img.height) {
|
|
97
|
-
|
|
98
|
-
return;
|
|
134
|
+
return null;
|
|
99
135
|
}
|
|
100
|
-
// Determine
|
|
101
|
-
// kind === 1 = grayscale, 2 = RGB, 3 = RGBA
|
|
136
|
+
// Determine number of channels based on kind
|
|
137
|
+
// kind === 1 = grayscale (1 channel), 2 = RGB (3 channels), 3 = RGBA (4 channels)
|
|
138
|
+
const channels = img.kind === 1 ? 1 : img.kind === 3 ? 4 : 3;
|
|
102
139
|
const format = img.kind === 1 ? 'grayscale' : img.kind === 3 ? 'rgba' : 'rgb';
|
|
103
|
-
//
|
|
104
|
-
const
|
|
105
|
-
|
|
140
|
+
// Encode raw pixel data to PNG format
|
|
141
|
+
const pngBase64 = encodePixelsToPNG(img.data, img.width, img.height, channels);
|
|
142
|
+
return {
|
|
106
143
|
page: pageNum,
|
|
107
144
|
index: arrayIndex,
|
|
108
145
|
width: img.width,
|
|
109
146
|
height: img.height,
|
|
110
147
|
format,
|
|
111
|
-
data:
|
|
112
|
-
}
|
|
148
|
+
data: pngBase64,
|
|
149
|
+
};
|
|
150
|
+
};
|
|
151
|
+
// Try to get from commonObjs first if it starts with 'g_'
|
|
152
|
+
if (imageName.startsWith('g_')) {
|
|
153
|
+
try {
|
|
154
|
+
const imageData = page.commonObjs.get(imageName);
|
|
155
|
+
if (imageData) {
|
|
156
|
+
const result = processImageData(imageData);
|
|
157
|
+
resolve(result);
|
|
158
|
+
return;
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
catch (error) {
|
|
162
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
163
|
+
console.warn(`[PDF Reader MCP] Error getting image from commonObjs ${imageName}: ${message}`);
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
// Try synchronous get first - if image is already loaded
|
|
167
|
+
try {
|
|
168
|
+
const imageData = page.objs.get(imageName);
|
|
169
|
+
if (imageData !== undefined) {
|
|
170
|
+
const result = processImageData(imageData);
|
|
171
|
+
resolve(result);
|
|
172
|
+
return;
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
catch (error) {
|
|
176
|
+
// Synchronous get failed or not supported, fall through to async
|
|
177
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
178
|
+
console.warn(`[PDF Reader MCP] Sync image get failed for ${imageName}, trying async: ${message}`);
|
|
179
|
+
}
|
|
180
|
+
// Fallback to async callback-based get with timeout
|
|
181
|
+
let resolved = false;
|
|
182
|
+
const timeout = setTimeout(() => {
|
|
183
|
+
if (!resolved) {
|
|
184
|
+
resolved = true;
|
|
185
|
+
console.warn(`[PDF Reader MCP] Image extraction timeout for ${imageName} on page ${String(pageNum)}`);
|
|
186
|
+
resolve(null);
|
|
187
|
+
}
|
|
188
|
+
}, 10000); // 10 second timeout as a safety net
|
|
189
|
+
page.objs.get(imageName, (imageData) => {
|
|
190
|
+
if (!resolved) {
|
|
191
|
+
resolved = true;
|
|
192
|
+
clearTimeout(timeout);
|
|
193
|
+
const result = processImageData(imageData);
|
|
194
|
+
resolve(result);
|
|
195
|
+
}
|
|
113
196
|
});
|
|
114
197
|
}));
|
|
115
198
|
const resolvedImages = await Promise.all(imagePromises);
|
|
@@ -151,3 +234,161 @@ export const buildWarnings = (invalidPages, totalPages) => {
|
|
|
151
234
|
`Requested page numbers ${invalidPages.join(', ')} exceed total pages (${String(totalPages)}).`,
|
|
152
235
|
];
|
|
153
236
|
};
|
|
237
|
+
/**
 * Extract all content (text and, optionally, images) from a single page and
 * return it ordered from the top of the page to the bottom.
 *
 * Text items are grouped into lines by their rounded Y coordinate
 * (`transform[5]`); the strings of one line are concatenated in the order
 * pdf.js reports them. Lines that contain only whitespace are dropped.
 *
 * When `includeImages` is truthy, every `paintImageXObject` / `paintXObject`
 * operation in the page's operator list is resolved to pixel data, encoded
 * with `encodePixelsToPNG`, and emitted as an image item. The Y position of
 * an image is taken from the current transformation matrix that is in effect
 * when the image is painted (tracked through save / restore / transform and
 * form-XObject begin / end operations). If the paint operation itself carries
 * a matrix as its second argument, that matrix takes precedence.
 *
 * Failures while resolving or converting a single image are logged and that
 * image is skipped; they never discard the text of the page. Any other
 * failure is logged and reported as a single text item describing the error.
 *
 * @param {object} pdfDocument Loaded pdf.js document (must provide `getPage`).
 * @param {number} pageNum 1-based page number to extract.
 * @param {boolean} includeImages Whether image items should be extracted.
 * @param {string} sourceDescription Human readable source name used in log messages.
 * @returns {Promise<Array<object>>} Content items
 *   (`{ type: 'text', yPosition, textContent }` or
 *   `{ type: 'image', yPosition, imageData }`) sorted by descending `yPosition`.
 */
export const extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescription) => {
    const IMAGE_TIMEOUT_MS = 10000; // safety net for the callback-based lookup
    const contentItems = [];
    const describeError = (error) => (error instanceof Error ? error.message : String(error));
    // A transformation matrix is any array-like with at least six entries.
    const isMatrix = (m) => m !== null && m !== undefined && typeof m === 'object' && m.length >= 6;
    // Concatenate `m2` onto the current matrix `m1` (same convention as canvas `transform`).
    const multiply = (m1, m2) => [
        m1[0] * m2[0] + m1[2] * m2[1],
        m1[1] * m2[0] + m1[3] * m2[1],
        m1[0] * m2[2] + m1[2] * m2[3],
        m1[1] * m2[2] + m1[3] * m2[3],
        m1[0] * m2[4] + m1[2] * m2[5] + m1[4],
        m1[1] * m2[4] + m1[3] * m2[5] + m1[5],
    ];
    try {
        const page = await pdfDocument.getPage(pageNum);
        // Extract text content with Y-coordinates
        const textContent = await page.getTextContent();
        // Group text items by Y-coordinate (items on same line have similar Y values)
        const textByY = new Map();
        for (const textItem of textContent.items) {
            // transform[5] is the Y coordinate; items without a transform are skipped
            const yCoord = textItem.transform?.[5];
            if (yCoord === undefined)
                continue;
            const y = Math.round(yCoord);
            const lineParts = textByY.get(y);
            if (lineParts) {
                lineParts.push(textItem.str);
            }
            else {
                textByY.set(y, [textItem.str]);
            }
        }
        // Convert grouped text to content items
        for (const [y, textParts] of textByY.entries()) {
            const lineText = textParts.join('');
            if (lineText.trim()) {
                contentItems.push({
                    type: 'text',
                    yPosition: y,
                    textContent: lineText,
                });
            }
        }
        // Extract images with Y-coordinates if requested
        if (includeImages) {
            const operatorList = await page.getOperatorList();
            // Find all image painting operations and remember the Y translation
            // of the transformation matrix that is active when each one runs.
            const imageOps = [];
            const ctmStack = [];
            let ctm = [1, 0, 0, 1, 0, 0];
            for (let i = 0; i < operatorList.fnArray.length; i++) {
                const op = operatorList.fnArray[i];
                const args = operatorList.argsArray[i];
                if (op === OPS.save) {
                    ctmStack.push(ctm);
                }
                else if (op === OPS.restore) {
                    ctm = ctmStack.pop() ?? ctm;
                }
                else if (op === OPS.transform) {
                    if (isMatrix(args)) {
                        ctm = multiply(ctm, args);
                    }
                }
                else if (op === OPS.paintFormXObjectBegin) {
                    // A form XObject opens its own graphics state and may carry a matrix.
                    ctmStack.push(ctm);
                    if (isMatrix(args?.[0])) {
                        ctm = multiply(ctm, args[0]);
                    }
                }
                else if (op === OPS.paintFormXObjectEnd) {
                    ctm = ctmStack.pop() ?? ctm;
                }
                else if (op === OPS.paintImageXObject || op === OPS.paintXObject) {
                    imageOps.push({ opIndex: i, ctmY: ctm[5] });
                }
            }
            // Extract each image with its Y-coordinate - try sync first, then async if needed
            const imagePromises = imageOps.map(({ opIndex, ctmY }, arrayIndex) => new Promise((resolve) => {
                const argsArray = operatorList.argsArray[opIndex];
                if (!argsArray || argsArray.length === 0) {
                    resolve(null);
                    return;
                }
                const imageName = argsArray[0];
                if (typeof imageName !== 'string') {
                    // Without an object id there is nothing to look up.
                    resolve(null);
                    return;
                }
                // Prefer a matrix passed with the paint operation, otherwise use the tracked one.
                let yPosition = Number.isFinite(ctmY) ? Math.round(ctmY) : 0;
                if (argsArray.length > 1 && Array.isArray(argsArray[1])) {
                    const yCoord = argsArray[1][5];
                    if (yCoord !== undefined) {
                        yPosition = Math.round(yCoord);
                    }
                }
                // Helper to turn pdf.js image data into a content item (null when unusable)
                const processImageData = (imageData) => {
                    if (!imageData || typeof imageData !== 'object') {
                        return null;
                    }
                    const img = imageData;
                    if (!img.data || !img.width || !img.height) {
                        return null;
                    }
                    try {
                        // Determine number of channels based on kind
                        // kind === 1 = grayscale, 3 = RGBA, anything else is treated as RGB
                        const channels = img.kind === 1 ? 1 : img.kind === 3 ? 4 : 3;
                        const format = img.kind === 1 ? 'grayscale' : img.kind === 3 ? 'rgba' : 'rgb';
                        // Encode raw pixel data to PNG format
                        const pngBase64 = encodePixelsToPNG(img.data, img.width, img.height, channels);
                        return {
                            type: 'image',
                            yPosition,
                            imageData: {
                                page: pageNum,
                                index: arrayIndex,
                                width: img.width,
                                height: img.height,
                                format,
                                data: pngBase64,
                            },
                        };
                    }
                    catch (error) {
                        // A conversion failure must not leave the promise pending
                        // or take the rest of the page down with it.
                        console.warn(`[PDF Reader MCP] Error encoding image ${imageName} on page ${String(pageNum)}: ${describeError(error)}`);
                        return null;
                    }
                };
                // Try to get from commonObjs first if it starts with 'g_'
                if (imageName.startsWith('g_')) {
                    try {
                        const imageData = page.commonObjs.get(imageName);
                        if (imageData) {
                            resolve(processImageData(imageData));
                            return;
                        }
                    }
                    catch (error) {
                        console.warn(`[PDF Reader MCP] Error getting image from commonObjs ${imageName}: ${describeError(error)}`);
                    }
                }
                // Try synchronous get first - if image is already loaded
                try {
                    const imageData = page.objs.get(imageName);
                    if (imageData !== undefined) {
                        resolve(processImageData(imageData));
                        return;
                    }
                }
                catch (error) {
                    // Synchronous get failed or not supported, fall through to async
                    console.warn(`[PDF Reader MCP] Sync image get failed for ${imageName}, trying async: ${describeError(error)}`);
                }
                // Fallback to async callback-based get with timeout
                let resolved = false;
                const timeout = setTimeout(() => {
                    if (!resolved) {
                        resolved = true;
                        console.warn(`[PDF Reader MCP] Image extraction timeout for ${imageName} on page ${String(pageNum)}`);
                        resolve(null);
                    }
                }, IMAGE_TIMEOUT_MS);
                try {
                    page.objs.get(imageName, (imageData) => {
                        if (!resolved) {
                            resolved = true;
                            clearTimeout(timeout);
                            resolve(processImageData(imageData));
                        }
                    });
                }
                catch (error) {
                    // Registering the callback failed: give up on this image only.
                    if (!resolved) {
                        resolved = true;
                        clearTimeout(timeout);
                        console.warn(`[PDF Reader MCP] Async image get failed for ${imageName}: ${describeError(error)}`);
                        resolve(null);
                    }
                }
            }));
            const resolvedImages = await Promise.all(imagePromises);
            contentItems.push(...resolvedImages.filter((item) => item !== null));
        }
    }
    catch (error) {
        const message = describeError(error);
        console.warn(`[PDF Reader MCP] Error extracting page content for page ${String(pageNum)} in ${sourceDescription}: ${message}`);
        // Return error message as text content
        return [
            {
                type: 'text',
                yPosition: 0,
                textContent: `Error processing page: ${message}`,
            },
        ];
    }
    // Sort by Y-position (descending = top to bottom in PDF coordinates)
    return contentItems.sort((a, b) => b.yPosition - a.yPosition);
};
|
package/dist/pdf/parser.js
CHANGED
|
@@ -7,10 +7,9 @@ const MAX_RANGE_SIZE = 10000; // Prevent infinite loops for open ranges
|
|
|
7
7
|
const parseRangePart = (part, pages) => {
|
|
8
8
|
const trimmedPart = part.trim();
|
|
9
9
|
if (trimmedPart.includes('-')) {
|
|
10
|
-
const
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
}
|
|
10
|
+
const splitResult = trimmedPart.split('-');
|
|
11
|
+
const startStr = splitResult[0] || '';
|
|
12
|
+
const endStr = splitResult[1];
|
|
14
13
|
const start = parseInt(startStr, 10);
|
|
15
14
|
const end = endStr === '' || endStr === undefined ? Infinity : parseInt(endStr, 10);
|
|
16
15
|
if (Number.isNaN(start) || Number.isNaN(end) || start <= 0 || start > end) {
|
|
@@ -43,6 +42,9 @@ export const parsePageRanges = (ranges) => {
|
|
|
43
42
|
for (const part of parts) {
|
|
44
43
|
parseRangePart(part, pages);
|
|
45
44
|
}
|
|
45
|
+
// This should never happen as parseRangePart would have thrown an error
|
|
46
|
+
// if no valid pages were found, but we keep this as a safety check
|
|
47
|
+
/* c8 ignore next */
|
|
46
48
|
if (pages.size === 0) {
|
|
47
49
|
throw new Error('Page range string resulted in zero valid pages.');
|
|
48
50
|
}
|
package/dist/schemas/readPdf.js
CHANGED
|
@@ -14,7 +14,11 @@ export const pageSpecifierSchema = z.union([
|
|
|
14
14
|
// Schema for a single PDF source (path or URL)
|
|
15
15
|
export const pdfSourceSchema = z
|
|
16
16
|
.object({
|
|
17
|
-
path: z
|
|
17
|
+
path: z
|
|
18
|
+
.string()
|
|
19
|
+
.min(1)
|
|
20
|
+
.optional()
|
|
21
|
+
.describe('Path to the local PDF file (absolute or relative to cwd).'),
|
|
18
22
|
url: z.string().url().optional().describe('URL of the PDF file.'),
|
|
19
23
|
pages: pageSpecifierSchema
|
|
20
24
|
.optional()
|
package/dist/utils/pathUtils.js
CHANGED
|
@@ -6,10 +6,9 @@ import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';
|
|
|
6
6
|
export const PROJECT_ROOT = process.cwd();
|
|
7
7
|
console.info(`[Filesystem MCP - pathUtils] Project Root determined from CWD: ${PROJECT_ROOT}`); // Use info instead of log
|
|
8
8
|
/**
|
|
9
|
-
* Resolves a user-provided
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
* @param userPath The relative path provided by the user.
|
|
9
|
+
* Resolves a user-provided path, accepting both absolute and relative paths.
|
|
10
|
+
* Relative paths are resolved against the current working directory (PROJECT_ROOT).
|
|
11
|
+
* @param userPath The path provided by the user (absolute or relative).
|
|
13
12
|
* @returns The resolved absolute path.
|
|
14
13
|
*/
|
|
15
14
|
export const resolvePath = (userPath) => {
|
|
@@ -17,14 +16,10 @@ export const resolvePath = (userPath) => {
|
|
|
17
16
|
throw new McpError(ErrorCode.InvalidParams, 'Path must be a string.');
|
|
18
17
|
}
|
|
19
18
|
const normalizedUserPath = path.normalize(userPath);
|
|
19
|
+
// If absolute path, return it normalized
|
|
20
20
|
if (path.isAbsolute(normalizedUserPath)) {
|
|
21
|
-
|
|
21
|
+
return normalizedUserPath;
|
|
22
22
|
}
|
|
23
|
-
//
|
|
24
|
-
|
|
25
|
-
// Security check: Ensure the resolved path is still within the project root
|
|
26
|
-
if (!resolved.startsWith(PROJECT_ROOT)) {
|
|
27
|
-
throw new McpError(ErrorCode.InvalidRequest, 'Path traversal detected. Access denied.');
|
|
28
|
-
}
|
|
29
|
-
return resolved;
|
|
23
|
+
// If relative path, resolve against the PROJECT_ROOT (cwd)
|
|
24
|
+
return path.resolve(PROJECT_ROOT, normalizedUserPath);
|
|
30
25
|
};
|