@sylphx/pdf-reader-mcp 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +205 -91
- package/dist/index.js +620 -49
- package/package.json +44 -42
- package/dist/handlers/index.js +0 -4
- package/dist/handlers/readPdf.js +0 -170
- package/dist/pdf/extractor.js +0 -394
- package/dist/pdf/loader.js +0 -53
- package/dist/pdf/parser.js +0 -96
- package/dist/schemas/readPdf.js +0 -59
- package/dist/types/pdf.js +0 -2
- package/dist/utils/pathUtils.js +0 -25
package/dist/index.js
CHANGED
|
@@ -1,57 +1,628 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
import {
|
|
5
|
-
|
|
6
|
-
//
|
|
7
|
-
import {
|
|
8
|
-
|
|
9
|
-
//
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
2
|
+
|
|
3
|
+
// src/index.ts
|
|
4
|
+
import { createServer, stdio } from "@sylphx/mcp-server-sdk";
|
|
5
|
+
|
|
6
|
+
// src/handlers/readPdf.ts
|
|
7
|
+
import { image, text, tool, toolError } from "@sylphx/mcp-server-sdk";
|
|
8
|
+
|
|
9
|
+
// src/pdf/extractor.ts
|
|
10
|
+
import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
11
|
+
import { PNG } from "pngjs";
|
|
12
|
+
|
|
13
|
+
// src/utils/logger.ts
|
|
14
|
+
/**
 * Minimal levelled logger used by every component of the server.
 *
 * Numeric levels: 0 = DEBUG, 1 = INFO, 2 = WARN, 3 = ERROR. A message is
 * emitted when the logger's `minLevel` is less than or equal to the level of
 * the call.
 *
 * All output goes to stderr. The server talks to its client over the stdio
 * transport, so stdout must carry protocol frames only; `console.info` and
 * `console.log` write to stdout in Node.js and are therefore never used.
 */
class Logger {
  prefix;
  minLevel;
  /**
   * @param {string} [component] - Optional component name appended to the prefix.
   * @param {number} [minLevel=1] - Lowest numeric level that is emitted.
   */
  constructor(component, minLevel = 1 /* INFO */) {
    this.prefix = `[PDF Reader MCP${component ? ` - ${component}` : ""}]`;
    this.minLevel = minLevel;
  }
  /** Changes the lowest level that is emitted. */
  setLevel(level) {
    this.minLevel = level;
  }
  debug(message, context) {
    if (this.minLevel <= 0 /* DEBUG */) {
      this.log("debug", message, context);
    }
  }
  info(message, context) {
    if (this.minLevel <= 1 /* INFO */) {
      this.log("info", message, context);
    }
  }
  warn(message, context) {
    if (this.minLevel <= 2 /* WARN */) {
      this.log("warn", message, context);
    }
  }
  error(message, context) {
    if (this.minLevel <= 3 /* ERROR */) {
      this.log("error", message, context);
    }
  }
  /**
   * Writes a message that carries structured context. The JSON record is
   * emitted for "error" and "warn" only; lower levels print the plain line.
   */
  logWithContext(level, logMessage, structuredLog) {
    if (level === "error") {
      console.error(logMessage);
      console.error(JSON.stringify(structuredLog));
    } else if (level === "warn") {
      console.warn(logMessage);
      console.warn(JSON.stringify(structuredLog));
    } else {
      // info/debug: stderr instead of console.info/console.log (stdout).
      console.error(logMessage);
    }
  }
  /** Writes a plain message without structured context. */
  logSimple(level, logMessage) {
    if (level === "warn") {
      console.warn(logMessage);
    } else {
      // error, info and debug all go to stderr.
      console.error(logMessage);
    }
  }
  /**
   * Formats the message with the component prefix and dispatches it to the
   * structured or the simple writer depending on whether context was given.
   */
  log(level, message, context) {
    const logMessage = `${this.prefix} ${message}`;
    if (context && Object.keys(context).length > 0) {
      const timestamp = new Date().toISOString();
      const structuredLog = {
        timestamp,
        level,
        component: this.prefix,
        message,
        ...context
      };
      this.logWithContext(level, logMessage, structuredLog);
    } else {
      this.logSimple(level, logMessage);
    }
  }
}
/** Creates a logger for the named component (default minimum level: INFO). */
var createLogger = (component, minLevel) => {
  return new Logger(component, minLevel);
};
/** Shared component-less logger that only emits warnings and errors. */
var logger = new Logger("", 2 /* WARN */);
|
|
89
|
+
|
|
90
|
+
// src/pdf/extractor.ts
|
|
91
|
+
var logger2 = createLogger("Extractor");
/**
 * Encodes raw pixel bytes as a PNG and returns it base64-encoded.
 *
 * @param pixelData - Pixel bytes, `channels` bytes per pixel.
 * @param width - Image width in pixels.
 * @param height - Image height in pixels.
 * @param channels - 4 (RGBA, copied as-is), 3 (RGB, alpha set to 255) or
 *   1 (gray, replicated to R/G/B with alpha 255). Any other value leaves the
 *   PNG buffer as allocated by the encoder.
 * @returns Base64 string of the encoded PNG.
 */
var encodePixelsToPNG = (pixelData, width, height, channels) => {
  const png = new PNG({ width, height });
  // Missing source bytes are treated as 0 instead of failing.
  const byteAt = (offset) => pixelData[offset] ?? 0;
  switch (channels) {
    case 4:
      png.data = Buffer.from(pixelData);
      break;
    case 3:
      for (let pixel = 0; pixel < width * height; pixel++) {
        const from = pixel * 3;
        const to = pixel * 4;
        png.data[to] = byteAt(from);
        png.data[to + 1] = byteAt(from + 1);
        png.data[to + 2] = byteAt(from + 2);
        png.data[to + 3] = 255;
      }
      break;
    case 1:
      for (let pixel = 0; pixel < width * height; pixel++) {
        const shade = byteAt(pixel);
        const to = pixel * 4;
        png.data[to] = shade;
        png.data[to + 1] = shade;
        png.data[to + 2] = shade;
        png.data[to + 3] = 255;
      }
      break;
    default:
      break;
  }
  return PNG.sync.write(png).toString("base64");
};
|
|
118
|
+
/**
 * Expands 1-bit-per-pixel rows into one byte per pixel.
 * Rows are padded to whole bytes, most significant bit first; a set bit
 * becomes 255 (white) and a clear bit 0 (black).
 */
var unpackBilevelPixels = (packed, width, height) => {
  const rowBytes = (width + 7) >> 3;
  const expanded = new Uint8Array(width * height);
  for (let row = 0; row < height; row++) {
    for (let col = 0; col < width; col++) {
      const byte = packed[row * rowBytes + (col >> 3)] ?? 0;
      expanded[row * width + col] = byte & (0x80 >> (col & 7)) ? 255 : 0;
    }
  }
  return expanded;
};
/**
 * Converts a decoded PDF.js image object into the extracted-image record
 * returned to the client.
 *
 * @param imageData - Object with `data`, `width`, `height` and `kind`; anything
 *   else (or an object missing data/width/height) yields `null`.
 * @param pageNum - 1-based page number the image was found on.
 * @param arrayIndex - Position of the image among the page's images.
 * @returns `{ page, index, width, height, format, data }` with `data` holding
 *   the base64 PNG, or `null` when the input is unusable.
 */
var processImageData = (imageData, pageNum, arrayIndex) => {
  if (!imageData || typeof imageData !== "object") {
    return null;
  }
  const img = imageData;
  if (!img.data || !img.width || !img.height) {
    return null;
  }
  // PDF.js ImageKind: 1 = GRAYSCALE_1BPP, 2 = RGB_24BPP, 3 = RGBA_32BPP.
  const isBilevel = img.kind === 1;
  const channels = isBilevel ? 1 : img.kind === 3 ? 4 : 3;
  const format = isBilevel ? "grayscale" : img.kind === 3 ? "rgba" : "rgb";
  let pixels = img.data;
  if (isBilevel) {
    // Kind 1 data is bit-packed, but the encoder expects one byte per pixel.
    // Only unpack when the buffer really has the packed size, so data that
    // is already one byte per pixel keeps passing through unchanged.
    const rowBytes = (img.width + 7) >> 3;
    if (pixels.length === rowBytes * img.height) {
      pixels = unpackBilevelPixels(pixels, img.width, img.height);
    }
  }
  const pngBase64 = encodePixelsToPNG(pixels, img.width, img.height, channels);
  return {
    page: pageNum,
    index: arrayIndex,
    width: img.width,
    height: img.height,
    format,
    data: pngBase64
  };
};
|
|
138
|
+
/**
 * Looks up the decoded data of an image referenced by a page.
 *
 * Lookup order:
 *  1. `page.commonObjs` for names starting with "g_" (any truthy hit wins);
 *  2. a synchronous `page.objs.get(name)` (any value other than `undefined` wins);
 *  3. the callback form `page.objs.get(name, cb)`, bounded by a 10 s timeout.
 *
 * Failures are logged as warnings and never thrown; the promise resolves to
 * `null` when the callback lookup throws or times out.
 */
var retrieveImageData = async (page, imageName, pageNum) => {
  const reasonOf = (err) => (err instanceof Error ? err.message : String(err));
  if (imageName.startsWith("g_")) {
    try {
      const shared = page.commonObjs.get(imageName);
      if (shared) {
        return shared;
      }
    } catch (error) {
      logger2.warn("Error getting image from commonObjs", { imageName, error: reasonOf(error) });
    }
  }
  try {
    const local = page.objs.get(imageName);
    if (local !== undefined) {
      return local;
    }
  } catch (error) {
    logger2.warn("Sync image get failed, trying async", { imageName, error: reasonOf(error) });
  }
  return new Promise((resolve) => {
    let settled = false;
    let timer = null;
    // Returns true for the first caller only, and cancels the pending timeout.
    const claim = () => {
      if (settled) {
        return false;
      }
      settled = true;
      if (timer !== null) {
        clearTimeout(timer);
        timer = null;
      }
      return true;
    };
    timer = setTimeout(() => {
      if (claim()) {
        logger2.warn("Image extraction timeout", { imageName, pageNum });
        resolve(null);
      }
    }, 1e4);
    try {
      page.objs.get(imageName, (imageData) => {
        if (claim()) {
          resolve(imageData);
        }
      });
    } catch (error) {
      if (claim()) {
        logger2.warn("Error in async image get", { imageName, error: reasonOf(error) });
        resolve(null);
      }
    }
  });
};
|
|
195
|
+
/**
 * Collects the page count and/or document metadata of a loaded PDF.
 *
 * @param pdfDocument - Loaded document exposing `numPages` and `getMetadata()`.
 * @param includeMetadata - When true, adds `info` and `metadata` if available.
 * @param includePageCount - When true, adds `num_pages`.
 * @returns Object with any of `num_pages`, `info`, `metadata`. Metadata
 *   failures are logged as warnings and never thrown.
 */
var extractMetadataAndPageCount = async (pdfDocument, includeMetadata, includePageCount) => {
  const output = {};
  if (includePageCount) {
    output.num_pages = pdfDocument.numPages;
  }
  if (!includeMetadata) {
    return output;
  }
  try {
    const pdfMetadata = await pdfDocument.getMetadata();
    const infoData = pdfMetadata.info;
    if (infoData !== undefined) {
      output.info = infoData;
    }
    const metadataObj = pdfMetadata.metadata;
    if (metadataObj == null) {
      // PDFs without an XMP packet report no metadata object. That is a
      // normal case, not an error: leave `metadata` unset and do not warn.
    } else if (typeof metadataObj.getAll === "function") {
      output.metadata = metadataObj.getAll();
    } else {
      // Plain-object fallback: copy own enumerable properties only.
      output.metadata = Object.fromEntries(Object.entries(metadataObj));
    }
  } catch (metaError) {
    const message = metaError instanceof Error ? metaError.message : String(metaError);
    logger2.warn("Error extracting metadata", { error: message });
  }
  return output;
};
|
|
226
|
+
/**
 * Builds the warning list for requested pages that do not exist.
 *
 * Runs of three or more consecutive page numbers are collapsed to
 * "first-last". An open-ended request such as "5-" can otherwise put
 * thousands of individual page numbers into a single warning string.
 * Shorter lists are rendered exactly as before ("6, 9").
 *
 * @param invalidPages - Page numbers beyond the end of the document, in the
 *   order they should be reported.
 * @param totalPages - Number of pages in the document.
 * @returns Empty array when nothing is invalid, otherwise one warning string.
 */
var buildWarnings = (invalidPages, totalPages) => {
  if (invalidPages.length === 0) {
    return [];
  }
  const segments = [];
  let runStart = 0;
  for (let i = 1; i <= invalidPages.length; i++) {
    const continuesRun = i < invalidPages.length && invalidPages[i] === invalidPages[i - 1] + 1;
    if (continuesRun) {
      continue;
    }
    if (i - runStart >= 3) {
      segments.push(`${invalidPages[runStart]}-${invalidPages[i - 1]}`);
    } else {
      for (let j = runStart; j < i; j++) {
        segments.push(String(invalidPages[j]));
      }
    }
    runStart = i;
  }
  return [
    `Requested page numbers ${segments.join(", ")} exceed total pages (${String(totalPages)}).`
  ];
};
|
|
234
|
+
/**
 * Extracts the text lines and, optionally, the images of a single page.
 *
 * Text items are grouped by their rounded y coordinate (transform[5]) and
 * each group is joined into one line. The result is sorted by descending
 * y position.
 *
 * A failure while extracting one image is logged and that image is skipped,
 * so the page's text and other images are still returned. A failure to load
 * the page, its text or its operator list replaces the whole result with a
 * single text item describing the error.
 *
 * @param pdfDocument - Loaded document exposing `getPage(pageNum)`.
 * @param pageNum - 1-based page number.
 * @param includeImages - Whether to extract images as well.
 * @param sourceDescription - Source label used in log output.
 * @returns Array of `{ type: "text", yPosition, textContent }` and
 *   `{ type: "image", yPosition, imageData }` items.
 */
var extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescription) => {
  const reasonOf = (err) => (err instanceof Error ? err.message : String(err));
  const contentItems = [];
  try {
    const page = await pdfDocument.getPage(pageNum);
    const textContent = await page.getTextContent();
    const textByY = new Map();
    for (const item of textContent.items) {
      // Items without a transform carry no position; skip them instead of
      // throwing and losing the whole page.
      const yCoord = item.transform?.[5];
      if (yCoord === undefined) {
        continue;
      }
      const y = Math.round(yCoord);
      const lineParts = textByY.get(y);
      if (lineParts) {
        lineParts.push(item.str);
      } else {
        textByY.set(y, [item.str]);
      }
    }
    for (const [y, lineParts] of textByY) {
      const line = lineParts.join("");
      if (line.trim()) {
        contentItems.push({ type: "text", yPosition: y, textContent: line });
      }
    }
    if (includeImages) {
      const operatorList = await page.getOperatorList();
      const imageIndices = [];
      for (let i = 0; i < operatorList.fnArray.length; i++) {
        const op = operatorList.fnArray[i];
        if (op === OPS.paintImageXObject || op === OPS.paintXObject) {
          imageIndices.push(i);
        }
      }
      const imagePromises = imageIndices.map(async (imgIndex, arrayIndex) => {
        const argsArray = operatorList.argsArray[imgIndex];
        if (!argsArray || argsArray.length === 0) {
          return null;
        }
        const imageName = argsArray[0];
        if (typeof imageName !== "string") {
          // Not an object id we can look up.
          return null;
        }
        try {
          // NOTE(review): yPosition is only taken from the operator args when
          // the second argument is an array; otherwise it stays 0 and the
          // image sorts below the text. Confirm against the PDF.js operator
          // argument layout whether a transform is ever passed here.
          let yPosition = 0;
          if (argsArray.length > 1 && Array.isArray(argsArray[1])) {
            const yCoord = argsArray[1][5];
            if (yCoord !== undefined) {
              yPosition = Math.round(yCoord);
            }
          }
          const imageData = await retrieveImageData(page, imageName, pageNum);
          const extractedImage = processImageData(imageData, pageNum, arrayIndex);
          return extractedImage ? { type: "image", yPosition, imageData: extractedImage } : null;
        } catch (error) {
          // One broken image must not discard the rest of the page.
          logger2.warn("Error extracting image", {
            pageNum,
            imageName,
            sourceDescription,
            error: reasonOf(error)
          });
          return null;
        }
      });
      const resolvedImages = await Promise.all(imagePromises);
      for (const imageItem of resolvedImages) {
        if (imageItem !== null) {
          contentItems.push(imageItem);
        }
      }
    }
  } catch (error) {
    const message = reasonOf(error);
    logger2.warn("Error extracting page content", {
      pageNum,
      sourceDescription,
      error: message
    });
    return [
      {
        type: "text",
        yPosition: 0,
        textContent: `Error processing page: ${message}`
      }
    ];
  }
  return contentItems.sort((a, b) => b.yPosition - a.yPosition);
};
|
|
316
|
+
|
|
317
|
+
// src/pdf/loader.ts
|
|
318
|
+
import fs from "node:fs/promises";
|
|
319
|
+
import { getDocument } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
320
|
+
|
|
321
|
+
// src/utils/errors.ts
|
|
322
|
+
/**
 * Error raised by the PDF pipeline, tagged with a numeric error code.
 * A truthy `options.cause` is forwarded to `Error` as the cause.
 */
class PdfError extends Error {
  code;
  /**
   * @param code - Numeric error code.
   * @param message - Human-readable description.
   * @param options - Optional `{ cause }`.
   */
  constructor(code, message, options) {
    const errorOptions = options?.cause ? { cause: options.cause } : undefined;
    super(message, errorOptions);
    this.name = "PdfError";
    this.code = code;
  }
}
|
|
330
|
+
|
|
331
|
+
// src/utils/pathUtils.ts
|
|
332
|
+
import os from "node:os";
|
|
333
|
+
import path from "node:path";
|
|
334
|
+
var PROJECT_ROOT = process.cwd();
var ALLOWED_ROOTS = [PROJECT_ROOT, os.homedir()];
/**
 * Resolves a user-supplied path and verifies that it lies strictly inside
 * one of the allowed roots (the working directory or the home directory).
 *
 * Relative paths are resolved against the working directory. The check is
 * lexical: it works on the normalized path text.
 *
 * @param userPath - Absolute path, or path relative to the working directory.
 * @returns The absolute path.
 * @throws {PdfError} code -32602 when `userPath` is not a string or resolves
 *   outside every allowed root (a root directory itself is also rejected).
 */
var resolvePath = (userPath) => {
  if (typeof userPath !== "string") {
    throw new PdfError(-32602 /* InvalidParams */, "Path must be a string.");
  }
  const normalizedUserPath = path.normalize(userPath);
  const resolvedPath = path.isAbsolute(normalizedUserPath) ? normalizedUserPath : path.resolve(PROJECT_ROOT, normalizedUserPath);
  const isWithinAllowedRoot = ALLOWED_ROOTS.some((allowedRoot) => {
    const relativePath = path.relative(allowedRoot, resolvedPath);
    // Only ".." itself or a leading "../" segment leaves the root. A plain
    // prefix test would also reject legitimate names such as "..cache/a.pdf".
    const escapesRoot = relativePath === ".." || relativePath.startsWith(`..${path.sep}`);
    // An absolute result means a different drive/root (possible on Windows).
    return relativePath !== "" && !escapesRoot && !path.isAbsolute(relativePath);
  });
  if (!isWithinAllowedRoot) {
    throw new PdfError(-32602 /* InvalidParams */, "Access denied: Path resolves outside allowed directories.");
  }
  return resolvedPath;
};
|
|
351
|
+
|
|
352
|
+
// src/pdf/loader.ts
|
|
353
|
+
var logger3 = createLogger("Loader");
var MAX_PDF_SIZE = 100 * 1024 * 1024;
/**
 * Loads a PDF from a local path or a URL.
 *
 * Local files are size-checked with `stat` before they are read, so an
 * oversized file is rejected without first being pulled into memory. The
 * length is checked again after reading in case the file grew in between.
 * URL sources are handed to the PDF library as `{ url }`.
 *
 * @param source - `{ path }` or `{ url }`.
 * @param sourceDescription - Label used in error and log messages.
 * @returns The loaded document.
 * @throws {PdfError} -32602 when neither path nor url is given (or the path
 *   is rejected by `resolvePath`); -32600 when the file is missing, too
 *   large, unreadable, or cannot be parsed.
 */
var loadPdfDocument = async (source, sourceDescription) => {
  const oversizeError = (actualSize) => new PdfError(-32600 /* InvalidRequest */, `PDF file exceeds maximum size of ${MAX_PDF_SIZE} bytes (${(MAX_PDF_SIZE / 1024 / 1024).toFixed(0)}MB). File size: ${actualSize} bytes.`);
  let pdfDataSource;
  try {
    if (source.path) {
      const safePath = resolvePath(source.path);
      const { size } = await fs.stat(safePath);
      if (size > MAX_PDF_SIZE) {
        throw oversizeError(size);
      }
      const buffer = await fs.readFile(safePath);
      if (buffer.length > MAX_PDF_SIZE) {
        throw oversizeError(buffer.length);
      }
      pdfDataSource = new Uint8Array(buffer);
    } else if (source.url) {
      pdfDataSource = { url: source.url };
    } else {
      throw new PdfError(-32602 /* InvalidParams */, `Source ${sourceDescription} missing 'path' or 'url'.`);
    }
  } catch (err) {
    if (err instanceof PdfError) {
      throw err;
    }
    const message = err instanceof Error ? err.message : String(err);
    const errorCode = -32600 /* InvalidRequest */;
    if (typeof err === "object" && err !== null && "code" in err && err.code === "ENOENT" && source.path) {
      throw new PdfError(errorCode, `File not found at '${source.path}'.`, {
        cause: err instanceof Error ? err : undefined
      });
    }
    throw new PdfError(errorCode, `Failed to prepare PDF source ${sourceDescription}. Reason: ${message}`, { cause: err instanceof Error ? err : undefined });
  }
  const loadingTask = getDocument(pdfDataSource);
  try {
    return await loadingTask.promise;
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    logger3.error("PDF.js loading error", { sourceDescription, error: message });
    throw new PdfError(-32600 /* InvalidRequest */, `Failed to load PDF document from ${sourceDescription}. Reason: ${message || "Unknown loading error"}`, { cause: err instanceof Error ? err : undefined });
  }
};
|
|
392
|
+
|
|
393
|
+
// src/pdf/parser.ts
|
|
394
|
+
var logger4 = createLogger("Parser");
var MAX_RANGE_SIZE = 1e4;
/**
 * Parses one comma-separated piece of a page specification and adds the
 * resulting page numbers to `pages`.
 *
 * Accepted forms: "7" (single page), "2-5" (closed range), "3-" (open-ended
 * range). A range is capped at `start + MAX_RANGE_SIZE`; a warning is logged
 * whenever that cap cuts pages off.
 *
 * Rejected with an Error: anything that is not plain decimal digits
 * ("3x", "abc"), more than one hyphen ("1-2-3"), a missing start ("-4"),
 * page 0, a start greater than the end, and numbers that are not safe
 * integers. The last case matters because an increment stops changing the
 * value above 2^53, which would make the range loop spin forever.
 *
 * @param part - One piece of the specification; surrounding whitespace is ignored.
 * @param pages - Set that receives the page numbers.
 */
var parseRangePart = (part, pages) => {
  const trimmedPart = part.trim();
  const toPageNumber = (token) => {
    if (!/^\d+$/.test(token)) {
      return Number.NaN;
    }
    const value = Number.parseInt(token, 10);
    return Number.isSafeInteger(value) ? value : Number.NaN;
  };
  if (!trimmedPart.includes("-")) {
    const page = toPageNumber(trimmedPart);
    if (Number.isNaN(page) || page <= 0) {
      throw new Error(`Invalid page number: ${trimmedPart}`);
    }
    pages.add(page);
    return;
  }
  const bounds = trimmedPart.split("-").map((token) => token.trim());
  if (bounds.length !== 2) {
    throw new Error(`Invalid page range values: ${trimmedPart}`);
  }
  const [startStr, endStr] = bounds;
  const start = toPageNumber(startStr);
  const end = endStr === "" ? Infinity : toPageNumber(endStr);
  if (Number.isNaN(start) || Number.isNaN(end) || start <= 0 || start > end) {
    throw new Error(`Invalid page range values: ${trimmedPart}`);
  }
  const practicalEnd = Math.min(end, start + MAX_RANGE_SIZE);
  // Count offsets rather than incrementing the page value, so the loop is
  // bounded by the range length no matter how large `start` is.
  const lastOffset = practicalEnd - start;
  for (let offset = 0; offset <= lastOffset; offset++) {
    pages.add(start + offset);
  }
  if (practicalEnd < end) {
    const reason = end === Infinity ? "Open-ended range truncated" : "Page range truncated";
    logger4.warn(reason, { start, practicalEnd });
  }
};
|
|
422
|
+
/**
 * Parses a comma-separated page specification such as "1-5,10,15-20".
 *
 * @param ranges - The specification string.
 * @returns Sorted array of unique page numbers.
 * @throws {Error} when a piece is invalid (raised by `parseRangePart`) or
 *   when no page results from the string.
 */
var parsePageRanges = (ranges) => {
  const collected = new Set();
  ranges.split(",").forEach((segment) => {
    parseRangePart(segment, collected);
  });
  if (collected.size === 0) {
    throw new Error("Page range string resulted in zero valid pages.");
  }
  return [...collected].sort((lhs, rhs) => lhs - rhs);
};
|
|
433
|
+
/**
 * Normalizes the `pages` option of a source into a sorted list of unique
 * page numbers.
 *
 * @param sourcePages - Range string, array of page numbers, or a falsy value.
 * @param sourceDescription - Source label used in the error message.
 * @returns Sorted unique page numbers, or `undefined` when no pages were given.
 * @throws {PdfError} code -32602 when the specification is invalid.
 */
var getTargetPages = (sourcePages, sourceDescription) => {
  if (!sourcePages) {
    return undefined;
  }
  const normalize = () => {
    if (typeof sourcePages === "string") {
      return parsePageRanges(sourcePages);
    }
    const hasInvalidEntry = sourcePages.some((candidate) => !Number.isInteger(candidate) || candidate <= 0);
    if (hasInvalidEntry) {
      throw new Error("Page numbers in array must be positive integers.");
    }
    const distinct = Array.from(new Set(sourcePages));
    distinct.sort((lhs, rhs) => lhs - rhs);
    if (distinct.length === 0) {
      throw new Error("Page specification resulted in an empty set of pages.");
    }
    return distinct;
  };
  try {
    return normalize();
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new PdfError(-32602 /* InvalidParams */, `Invalid page specification for source ${sourceDescription}: ${reason}`);
  }
};
|
|
454
|
+
/**
 * Decides which pages of a document are extracted.
 *
 * - With explicit `targetPages`: pages up to `totalPages` are processed, the
 *   rest are reported as invalid.
 * - Without them: every page when `includeFullText` is set, otherwise none.
 *
 * @returns `{ pagesToProcess, invalidPages }`.
 */
var determinePagesToProcess = (targetPages, totalPages, includeFullText) => {
  if (!targetPages) {
    const everyPage = includeFullText ? Array.from({ length: totalPages }, (_unused, offset) => offset + 1) : [];
    return { pagesToProcess: everyPage, invalidPages: [] };
  }
  const pagesToProcess = [];
  const invalidPages = [];
  targetPages.forEach((pageNumber) => {
    if (pageNumber <= totalPages) {
      pagesToProcess.push(pageNumber);
    } else if (pageNumber > totalPages) {
      invalidPages.push(pageNumber);
    }
  });
  return { pagesToProcess, invalidPages };
};
|
|
466
|
+
|
|
467
|
+
// src/schemas/readPdf.ts
|
|
468
|
+
import { z } from "zod";
|
|
469
|
+
// Characters allowed in a page-range string once whitespace is removed.
var PAGE_RANGE_CHARACTERS = /^[0-9,-]+$/;
// Page selection for one source: an array of 1-based page numbers or a
// range string such as "1-5,10,15-20".
var pageSpecifierSchema = z.union([
  z.array(z.number().int().min(1)).min(1).describe("Array of page numbers (1-based)"),
  z.string().min(1).refine((val) => {
    const withoutWhitespace = val.replace(/\s/g, "");
    return PAGE_RANGE_CHARACTERS.test(withoutWhitespace);
  }, {
    message: "Page string must contain only numbers, commas, and hyphens."
  }).describe('Page range string (e.g., "1-5,10,15-20")')
]);
// One PDF source: exactly one of `path` or `url`, plus optional `pages`.
// Unknown keys are rejected.
var pdfSourceSchema = z.object({
  path: z.string().min(1).optional().describe("Path to the local PDF file (absolute or relative to cwd)."),
  url: z.string().url().optional().describe("URL of the PDF file."),
  pages: pageSpecifierSchema.optional().describe("Extract text only from specific pages (1-based) or ranges for this source. If provided, 'include_full_text' is ignored for this source.")
}).strict().refine((data) => {
  // Valid when exactly one of the two locations is set.
  const hasPath = Boolean(data.path);
  const hasUrl = Boolean(data.url);
  return hasPath !== hasUrl;
}, {
  message: "Each source must have either 'path' or 'url', but not both."
});
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
482
|
+
// Optional boolean tool argument with a default value and a description.
var booleanFlag = (defaultValue, description) => z.boolean().optional().default(defaultValue).describe(description);
// Arguments of the read_pdf tool: at least one source plus output switches.
// Unknown keys are rejected.
var readPdfArgsSchema = z.object({
  sources: z.array(pdfSourceSchema).min(1).describe("An array of PDF sources to process, each can optionally specify pages."),
  include_full_text: booleanFlag(false, "Include the full text content of each PDF (only if 'pages' is not specified for that source)."),
  include_metadata: booleanFlag(true, "Include metadata and info objects for each PDF."),
  include_page_count: booleanFlag(true, "Include the total number of pages for each PDF."),
  include_images: booleanFlag(false, "Extract and include embedded images from the PDF pages as base64-encoded data.")
}).strict();
|
|
489
|
+
|
|
490
|
+
// src/handlers/readPdf.ts
|
|
491
|
+
var logger5 = createLogger("ReadPdf");
/**
 * Processes one PDF source end to end: resolves the requested pages, loads
 * the document, extracts metadata, page contents and images, and always
 * destroys the document afterwards.
 *
 * Never throws: failures are reported in the returned record.
 *
 * @param source - `{ path | url, pages? }`.
 * @param options - `{ includeFullText, includeMetadata, includePageCount, includeImages }`.
 * @returns `{ source, success: true, data }` on success, or
 *   `{ source, success: false, error, data: undefined }` on failure. `data`
 *   may contain `num_pages`, `info`, `metadata`, `warnings`, `page_contents`,
 *   `page_texts` (explicit pages) or `full_text` (all pages), and `images`.
 */
var processSingleSource = async (source, options) => {
  const sourceDescription = source.path ?? source.url ?? "unknown source";
  let individualResult = { source: sourceDescription, success: false };
  let pdfDocument = null;
  try {
    const targetPages = getTargetPages(source.pages, sourceDescription);
    const { pages: _pages, ...loadArgs } = source;
    pdfDocument = await loadPdfDocument(loadArgs, sourceDescription);
    const totalPages = pdfDocument.numPages;
    const metadataOutput = await extractMetadataAndPageCount(pdfDocument, options.includeMetadata, options.includePageCount);
    const output = { ...metadataOutput };
    const { pagesToProcess, invalidPages } = determinePagesToProcess(targetPages, totalPages, options.includeFullText);
    const warnings = buildWarnings(invalidPages, totalPages);
    if (warnings.length > 0) {
      output.warnings = warnings;
    }
    if (pagesToProcess.length > 0) {
      const pageContents = await Promise.all(pagesToProcess.map((pageNum) => extractPageContent(pdfDocument, pageNum, options.includeImages, sourceDescription)));
      output.page_contents = pageContents.map((items, idx) => ({
        page: pagesToProcess[idx],
        items
      }));
      const extractedPageTexts = pageContents.map((items, idx) => ({
        page: pagesToProcess[idx],
        // Each text item is one visual line of the page; keep the line
        // breaks so consecutive lines do not run into each other.
        text: items.filter((item) => item.type === "text").map((item) => item.textContent).join("\n")
      }));
      if (targetPages) {
        output.page_texts = extractedPageTexts;
      } else {
        // Pages are separated by a blank line.
        output.full_text = extractedPageTexts.map((p) => p.text).join("\n\n");
      }
      if (options.includeImages) {
        const extractedImages = pageContents.flatMap((items) => items.filter((item) => item.type === "image" && item.imageData)).map((item) => item.imageData).filter((img) => img !== undefined);
        if (extractedImages.length > 0) {
          output.images = extractedImages;
        }
      }
    }
    individualResult = { ...individualResult, data: output, success: true };
  } catch (error) {
    let errorMessage = `Failed to process PDF from ${sourceDescription}.`;
    if (error instanceof Error) {
      errorMessage += ` Reason: ${error.message}`;
    } else {
      errorMessage += ` Unknown error: ${JSON.stringify(error)}`;
    }
    individualResult.error = errorMessage;
    individualResult.success = false;
    individualResult.data = undefined;
  } finally {
    // Release the document's resources on both the success and failure path.
    if (pdfDocument && typeof pdfDocument.destroy === "function") {
      try {
        await pdfDocument.destroy();
      } catch (destroyError) {
        const message = destroyError instanceof Error ? destroyError.message : String(destroyError);
        logger5.warn("Error destroying PDF document", { sourceDescription, error: message });
      }
    }
  }
  return individualResult;
};
|
|
555
|
+
/**
 * The read_pdf tool.
 *
 * Sources are processed in batches of three. The reply starts with one text
 * part holding a JSON summary of every source (image bytes and per-page
 * items are left out; images are summarized as `image_info`). It is followed
 * by one part per extracted text line or image, in page order. When every
 * source fails, a single tool error listing all failures is returned.
 */
var readPdf = tool().description("Reads content/metadata/images from one or more PDFs (local/URL). Each source can specify pages to extract.").input(readPdfArgsSchema).handler(async ({ input }) => {
  const MAX_CONCURRENT_SOURCES = 3;
  const { sources } = input;
  const options = {
    includeFullText: input.include_full_text ?? false,
    includeMetadata: input.include_metadata ?? true,
    includePageCount: input.include_page_count ?? true,
    includeImages: input.include_images ?? false
  };
  // Batches run one after another; sources within a batch run in parallel.
  const results = [];
  let offset = 0;
  while (offset < sources.length) {
    const batch = sources.slice(offset, offset + MAX_CONCURRENT_SOURCES);
    const settled = await Promise.all(batch.map((source) => processSingleSource(source, options)));
    results.push(...settled);
    offset += MAX_CONCURRENT_SOURCES;
  }
  if (results.every((entry) => !entry.success)) {
    const errorMessages = results.map((entry) => entry.error).join("; ");
    return toolError(`All PDF sources failed to process: ${errorMessages}`);
  }
  // Drops binary and per-item payloads from a result for the JSON summary.
  const summarize = (result) => {
    if (!result.data) {
      return result;
    }
    const { images, page_contents, ...summaryData } = result.data;
    if (!images) {
      return { ...result, data: summaryData };
    }
    const image_info = images.map(({ page, index, width, height, format }) => ({ page, index, width, height, format }));
    return { ...result, data: { ...summaryData, image_info } };
  };
  const resultsForJson = results.map((result) => summarize(result));
  const content = [text(JSON.stringify({ results: resultsForJson }, null, 2))];
  const pageItems = results
    .filter((result) => result.success && result.data?.page_contents)
    .flatMap((result) => result.data.page_contents)
    .flatMap((pageContent) => pageContent.items);
  for (const item of pageItems) {
    if (item.type === "text" && item.textContent) {
      content.push(text(item.textContent));
    } else if (item.type === "image" && item.imageData) {
      content.push(image(item.imageData.data, "image/png"));
    }
  }
  return content;
});
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
// The handler itself will perform Zod validation on the arguments
|
|
45
|
-
return toolDefinition.handler(request.params.arguments);
|
|
609
|
+
|
|
610
|
+
// src/index.ts
|
|
611
|
+
// Server definition: a single `read_pdf` tool served over the stdio transport.
var server = createServer({
  name: "pdf-reader-mcp",
  // Reported to clients; kept in step with the published package version.
  version: "1.4.0",
  instructions: "MCP Server for reading PDF files and extracting text, metadata, images, and page information.",
  tools: { read_pdf: readPdf },
  transport: stdio()
});
/**
 * Starts the server. Diagnostics are printed only when DEBUG_MCP is set and
 * always go to stderr, leaving stdout to the transport.
 */
async function main() {
  await server.start();
  if (process.env.DEBUG_MCP) {
    console.error("[PDF Reader MCP] Server running on stdio");
    console.error("[PDF Reader MCP] Project root:", process.cwd());
  }
}
// A startup failure is fatal: report it and exit with a non-zero status.
main().catch((error) => {
  console.error("[PDF Reader MCP] Server error:", error);
  process.exit(1);
});
|