@sylphx/pdf-reader-mcp 1.3.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,170 +0,0 @@
1
- // PDF reading handler - orchestrates PDF processing workflow
2
- import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';
3
- import { z } from 'zod';
4
- import { buildWarnings, extractMetadataAndPageCount, extractPageContent, } from '../pdf/extractor.js';
5
- import { loadPdfDocument } from '../pdf/loader.js';
6
- import { determinePagesToProcess, getTargetPages } from '../pdf/parser.js';
7
- import { readPdfArgsSchema } from '../schemas/readPdf.js';
8
/**
 * Process a single PDF source.
 *
 * Parses the requested pages, loads the document, collects optional
 * metadata / page count, extracts per-page content and derives the
 * text-only outputs from it. This function does not reject: any failure is
 * reported through the `error` field of the returned object.
 *
 * @param {object} source - Source descriptor. `path` / `url` are used for the
 *   description, `pages` for the page selection; everything except `pages` is
 *   forwarded to `loadPdfDocument`.
 * @param {object} options - Flags read here: `includeFullText`,
 *   `includeMetadata`, `includePageCount`, `includeImages`.
 * @returns {Promise<object>} `{ source, success, data }` on success or
 *   `{ source, success: false, error, data: undefined }` on failure.
 */
const processSingleSource = async (source, options) => {
    const sourceDescription = source.path ?? source.url ?? 'unknown source';
    let individualResult = { source: sourceDescription, success: false };
    // Kept outside the try block so the finally clause can release it.
    let pdfDocument;
    try {
        // Parse target pages
        const targetPages = getTargetPages(source.pages, sourceDescription);
        // Load PDF document (the page selection is not part of the load arguments)
        const { pages: _pages, ...loadArgs } = source;
        pdfDocument = await loadPdfDocument(loadArgs, sourceDescription);
        const totalPages = pdfDocument.numPages;
        // Extract metadata and page count
        const metadataOutput = await extractMetadataAndPageCount(pdfDocument, options.includeMetadata, options.includePageCount);
        const output = { ...metadataOutput };
        // Determine pages to process
        const { pagesToProcess, invalidPages } = determinePagesToProcess(targetPages, totalPages, options.includeFullText);
        // Add warnings for invalid pages
        const warnings = buildWarnings(invalidPages, totalPages);
        if (warnings.length > 0) {
            output.warnings = warnings;
        }
        if (pagesToProcess.length > 0) {
            // Pages are extracted concurrently; Promise.all keeps the results
            // aligned with pagesToProcess, so idx maps back to the page number.
            const pageContents = await Promise.all(pagesToProcess.map((pageNum) => extractPageContent(pdfDocument, pageNum, options.includeImages, sourceDescription)));
            // Store page contents for ordered retrieval
            output.page_contents = pageContents.map((items, idx) => ({
                page: pagesToProcess[idx],
                items,
            }));
            // For backward compatibility, also provide text-only outputs
            const extractedPageTexts = pageContents.map((items, idx) => ({
                page: pagesToProcess[idx],
                text: items
                    .filter((item) => item.type === 'text')
                    .map((item) => item.textContent)
                    .join(''),
            }));
            if (targetPages) {
                // Specific pages requested
                output.page_texts = extractedPageTexts;
            }
            else {
                // Full text requested
                output.full_text = extractedPageTexts.map((p) => p.text).join('\n\n');
            }
            // Extract image metadata for JSON response
            if (options.includeImages) {
                const extractedImages = pageContents
                    .flatMap((items) => items.filter((item) => item.type === 'image' && item.imageData))
                    .map((item) => item.imageData)
                    .filter((img) => img !== undefined);
                if (extractedImages.length > 0) {
                    output.images = extractedImages;
                }
            }
        }
        individualResult = { ...individualResult, data: output, success: true };
    }
    catch (error) {
        let errorMessage = `Failed to process PDF from ${sourceDescription}.`;
        if (error instanceof McpError) {
            errorMessage = error.message;
        } /* c8 ignore next */
        else if (error instanceof Error) {
            errorMessage += ` Reason: ${error.message}`;
        }
        else {
            // JSON.stringify throws on cycles and BigInt; an exception here would
            // escape this handler and reject the caller's whole batch.
            let serialized;
            try {
                serialized = JSON.stringify(error);
            }
            catch {
                serialized = String(error);
            }
            errorMessage += ` Unknown error: ${serialized}`;
        }
        individualResult.error = errorMessage;
        individualResult.success = false;
        individualResult.data = undefined;
    }
    finally {
        // Release the document once extraction is finished. The result only
        // holds values already copied out of it. The typeof guard keeps
        // document stand-ins without a destroy method working.
        if (pdfDocument && typeof pdfDocument.destroy === 'function') {
            try {
                await pdfDocument.destroy();
            }
            catch (destroyError) {
                const message = destroyError instanceof Error ? destroyError.message : String(destroyError);
                console.warn(`[PDF Reader MCP] Error releasing PDF document for ${sourceDescription}: ${message}`);
            }
        }
    }
    return individualResult;
};
86
/**
 * Entry point for the read_pdf tool.
 *
 * Validates the raw arguments against `readPdfArgsSchema`, processes every
 * source concurrently, and returns an MCP-style `{ content }` payload: first a
 * JSON text part summarising all results (without image data or per-page
 * content items), then the per-page text / image items in the order stored in
 * each result's `page_contents`.
 *
 * @param {unknown} args - Raw tool arguments.
 * @returns {Promise<{content: object[]}>} Content parts for the response.
 * @throws {McpError} With `ErrorCode.InvalidParams` when validation fails.
 */
export const handleReadPdfFunc = async (args) => {
    let validated;
    try {
        validated = readPdfArgsSchema.parse(args);
    }
    catch (parseFailure) {
        if (parseFailure instanceof z.ZodError) {
            const issueList = parseFailure.issues
                .map((issue) => `${issue.path.join('.')} (${issue.message})`)
                .join(', ');
            throw new McpError(ErrorCode.InvalidParams, `Invalid arguments: ${issueList}`);
        }
        /* c8 ignore next */
        const reason = parseFailure instanceof Error ? parseFailure.message : String(parseFailure);
        /* c8 ignore next */
        throw new McpError(ErrorCode.InvalidParams, `Argument validation failed: ${reason}`);
    }
    const { sources, include_full_text, include_metadata, include_page_count, include_images } = validated;
    // Every source gets its own options object and runs concurrently.
    const pending = sources.map((source) => processSingleSource(source, {
        includeFullText: include_full_text,
        includeMetadata: include_metadata,
        includePageCount: include_page_count,
        includeImages: include_images,
    }));
    const results = await Promise.all(pending);
    // JSON summary: drop base64 image payloads and page_contents, keeping
    // only lightweight image descriptors under image_info.
    const summarised = results.map((entry) => {
        if (!entry.data) {
            return entry;
        }
        const { images, page_contents, ...slimData } = entry.data;
        if (!images) {
            return { ...entry, data: slimData };
        }
        const image_info = images.map(({ page, index, width, height, format }) => ({
            page,
            index,
            width,
            height,
            format,
        }));
        return { ...entry, data: { ...slimData, image_info } };
    });
    // Translate one page content item into zero or one response part.
    const toContentParts = (item) => {
        if (item.type === 'text' && item.textContent) {
            return [{ type: 'text', text: item.textContent }];
        }
        if (item.type === 'image' && item.imageData) {
            // All images are encoded as PNG by the extractor.
            return [{ type: 'image', data: item.imageData.data, mimeType: 'image/png' }];
        }
        return [];
    };
    const orderedParts = results
        .filter((entry) => entry.success && entry.data?.page_contents)
        .flatMap((entry) => entry.data.page_contents)
        .flatMap((pageContent) => pageContent.items)
        .flatMap(toContentParts);
    const content = [
        {
            type: 'text',
            text: JSON.stringify({ results: summarised }, null, 2),
        },
        ...orderedParts,
    ];
    return { content };
};
164
/**
 * Tool definition record for `read_pdf`: bundles the tool name, its
 * description text, the argument schema and the handler function.
 */
export const readPdfToolDefinition = {
    name: 'read_pdf',
    description: 'Reads content/metadata/images from one or more PDFs (local/URL). Each source can specify pages to extract.',
    schema: readPdfArgsSchema,
    handler: handleReadPdfFunc,
};
@@ -1,394 +0,0 @@
1
- // PDF text and metadata extraction utilities
2
- import { OPS } from 'pdfjs-dist/legacy/build/pdf.mjs';
3
- import { PNG } from 'pngjs';
4
/**
 * Encode raw pixel data as a base64 PNG string.
 *
 * The input is expanded to the RGBA layout that pngjs expects:
 *   - channels === 4: used as-is (already RGBA)
 *   - channels === 3: RGB, alpha forced to 255
 *   - channels === 1: grayscale, either one byte per pixel or bit-packed
 *     (1 bit per pixel, rows padded to whole bytes, set bit = white), which
 *     is the layout pdf.js documents for its GRAYSCALE_1BPP image kind.
 * Any other channel count leaves the pngjs buffer untouched.
 *
 * @param {ArrayLike<number>} pixelData - Raw samples.
 * @param {number} width - Image width in pixels.
 * @param {number} height - Image height in pixels.
 * @param {number} channels - 1, 3 or 4 (see above).
 * @returns {string} Base64-encoded PNG.
 */
const encodePixelsToPNG = (pixelData, width, height, channels) => {
    const png = new PNG({ width, height });
    const pixelCount = width * height;
    if (channels === 4) {
        // Already RGBA
        png.data = Buffer.from(pixelData);
    }
    else if (channels === 3) {
        // RGB -> RGBA (add alpha channel)
        for (let i = 0; i < pixelCount; i++) {
            const srcIdx = i * 3;
            const dstIdx = i * 4;
            png.data[dstIdx] = pixelData[srcIdx] ?? 0; // R
            png.data[dstIdx + 1] = pixelData[srcIdx + 1] ?? 0; // G
            png.data[dstIdx + 2] = pixelData[srcIdx + 2] ?? 0; // B
            png.data[dstIdx + 3] = 255; // A (fully opaque)
        }
    }
    else if (channels === 1) {
        const packedRowBytes = (width + 7) >> 3;
        // Treat the buffer as bit-packed only when its length matches the
        // packed layout and cannot also be an 8-bit buffer; ambiguous sizes
        // (width 1) keep the one-byte-per-pixel interpretation.
        const isBitPacked = pixelData.length === packedRowBytes * height && pixelData.length !== pixelCount;
        for (let row = 0; row < height; row++) {
            for (let col = 0; col < width; col++) {
                let gray;
                if (isBitPacked) {
                    const byte = pixelData[row * packedRowBytes + (col >> 3)] ?? 0;
                    // Most significant bit is the leftmost pixel.
                    gray = byte & (0x80 >> (col & 7)) ? 255 : 0;
                }
                else {
                    gray = pixelData[row * width + col] ?? 0;
                }
                const dstIdx = (row * width + col) * 4;
                png.data[dstIdx] = gray; // R
                png.data[dstIdx + 1] = gray; // G
                png.data[dstIdx + 2] = gray; // B
                png.data[dstIdx + 3] = 255; // A
            }
        }
    }
    // Encode to PNG and convert to base64
    const pngBuffer = PNG.sync.write(png);
    return pngBuffer.toString('base64');
};
40
/**
 * Extract metadata and page count from a PDF document.
 *
 * @param {object} pdfDocument - Document exposing `numPages` and `getMetadata()`.
 * @param {boolean} includeMetadata - When truthy, adds `info` and `metadata`.
 * @param {boolean} includePageCount - When truthy, adds `num_pages`.
 * @returns {Promise<object>} Object with any of `num_pages`, `info`,
 *   `metadata`. Metadata failures are logged and do not reject.
 */
export const extractMetadataAndPageCount = async (pdfDocument, includeMetadata, includePageCount) => {
    const output = {};
    if (includePageCount) {
        output.num_pages = pdfDocument.numPages;
    }
    if (!includeMetadata) {
        return output;
    }
    try {
        const pdfMetadata = await pdfDocument.getMetadata();
        const infoData = pdfMetadata.info;
        if (infoData !== undefined) {
            output.info = infoData;
        }
        const metadataObj = pdfMetadata.metadata;
        if (metadataObj == null) {
            // No metadata object: nothing to add. Without this guard the
            // property access below throws and logs a misleading warning.
        }
        else if (typeof metadataObj.getAll === 'function') {
            // Check if it has a getAll method (as used in tests)
            output.metadata = metadataObj.getAll();
        }
        else {
            // Otherwise copy the object's own enumerable properties into a plain object
            output.metadata = Object.fromEntries(Object.entries(metadataObj));
        }
    }
    catch (metaError) {
        console.warn(`[PDF Reader MCP] Error extracting metadata: ${metaError instanceof Error ? metaError.message : String(metaError)}`);
    }
    return output;
};
77
/**
 * Read one page and concatenate the `str` of every text item, without any
 * separator. If the page cannot be read, the failure is logged and an
 * "Error processing page: ..." string is returned as the page text instead
 * of rejecting.
 *
 * @param {object} pdfDocument - Document exposing `getPage(pageNum)`.
 * @param {number} pageNum - Page number passed to `getPage`.
 * @param {string} sourceDescription - Used in the warning message only.
 * @returns {Promise<{page: number, text: string}>}
 */
const extractSinglePageText = async (pdfDocument, pageNum, sourceDescription) => {
    let text;
    try {
        const pdfPage = await pdfDocument.getPage(pageNum);
        const content = await pdfPage.getTextContent();
        const fragments = content.items.map((entry) => entry.str);
        text = fragments.join('');
    }
    catch (pageError) {
        const reason = pageError instanceof Error ? pageError.message : String(pageError);
        console.warn(`[PDF Reader MCP] Error getting text content for page ${String(pageNum)} in ${sourceDescription}: ${reason}`);
        text = `Error processing page: ${reason}`;
    }
    return { page: pageNum, text };
};
95
/**
 * Extract the text of several pages concurrently and return the results
 * ordered by ascending page number.
 *
 * @param {object} pdfDocument - Document handed to the per-page extractor.
 * @param {number[]} pagesToProcess - Page numbers to extract.
 * @param {string} sourceDescription - Forwarded for log messages.
 * @returns {Promise<Array<{page: number, text: string}>>}
 */
export const extractPageTexts = async (pdfDocument, pagesToProcess, sourceDescription) => {
    // Start every page at once, then wait for all of them.
    const pending = pagesToProcess.map((pageNum) => extractSinglePageText(pdfDocument, pageNum, sourceDescription));
    const pageTexts = await Promise.all(pending);
    pageTexts.sort((left, right) => left.page - right.page);
    return pageTexts;
};
103
/**
 * Extract images from a single page.
 *
 * Scans the page's operator list for image paint operations and resolves
 * each referenced image object: from `page.commonObjs` for names starting
 * with 'g_', then via a synchronous `page.objs.get`, and finally via the
 * callback form of `page.objs.get` guarded by a 10 second timeout. Images
 * that cannot be resolved or decoded are skipped. Errors are logged and the
 * function does not reject.
 *
 * @param {object} page - Page exposing `getOperatorList()`, `objs`, `commonObjs`.
 * @param {number} pageNum - Page number recorded on each returned image.
 * @returns {Promise<object[]>} Entries `{ page, index, width, height, format, data }`
 *   where `data` is the base64 PNG and `index` is the position among the
 *   page's image operations.
 */
const extractImagesFromPage = async (page, pageNum) => {
    const images = [];
    /* c8 ignore next */
    try {
        const operatorList = await page.getOperatorList();
        // Find all image painting operations
        const imageIndices = [];
        for (let i = 0; i < operatorList.fnArray.length; i++) {
            const op = operatorList.fnArray[i];
            if (op === OPS.paintImageXObject || op === OPS.paintXObject) {
                imageIndices.push(i);
            }
        }
        // Extract each image - try sync first, then async if needed
        const imagePromises = imageIndices.map((imgIndex, arrayIndex) => new Promise((resolve) => {
            const argsArray = operatorList.argsArray[imgIndex];
            if (!argsArray || argsArray.length === 0) {
                resolve(null);
                return;
            }
            const imageName = argsArray[0];
            // A non-string name cannot be looked up; skip just this image
            // instead of letting a TypeError discard every image on the page.
            if (typeof imageName !== 'string') {
                resolve(null);
                return;
            }
            // Helper to process image data
            const processImageData = (imageData) => {
                if (!imageData || typeof imageData !== 'object') {
                    return null;
                }
                const img = imageData;
                if (!img.data || !img.width || !img.height) {
                    return null;
                }
                // Determine number of channels based on kind
                // kind === 1 = grayscale (1 channel), 2 = RGB (3 channels), 3 = RGBA (4 channels)
                const channels = img.kind === 1 ? 1 : img.kind === 3 ? 4 : 3;
                const format = img.kind === 1 ? 'grayscale' : img.kind === 3 ? 'rgba' : 'rgb';
                // Encode raw pixel data to PNG format
                const pngBase64 = encodePixelsToPNG(img.data, img.width, img.height, channels);
                return {
                    page: pageNum,
                    index: arrayIndex,
                    width: img.width,
                    height: img.height,
                    format,
                    data: pngBase64,
                };
            };
            // Try to get from commonObjs first if it starts with 'g_'
            if (imageName.startsWith('g_')) {
                try {
                    const imageData = page.commonObjs.get(imageName);
                    if (imageData) {
                        const result = processImageData(imageData);
                        resolve(result);
                        return;
                    }
                }
                catch (error) {
                    const message = error instanceof Error ? error.message : String(error);
                    console.warn(`[PDF Reader MCP] Error getting image from commonObjs ${imageName}: ${message}`);
                }
            }
            // Try synchronous get first - if image is already loaded
            try {
                const imageData = page.objs.get(imageName);
                if (imageData !== undefined) {
                    const result = processImageData(imageData);
                    resolve(result);
                    return;
                }
            }
            catch (error) {
                // Synchronous get failed or not supported, fall through to async
                const message = error instanceof Error ? error.message : String(error);
                console.warn(`[PDF Reader MCP] Sync image get failed for ${imageName}, trying async: ${message}`);
            }
            // Fallback to async callback-based get with timeout
            let resolved = false;
            const timeout = setTimeout(() => {
                if (!resolved) {
                    resolved = true;
                    console.warn(`[PDF Reader MCP] Image extraction timeout for ${imageName} on page ${String(pageNum)}`);
                    resolve(null);
                }
            }, 10000); // 10 second timeout as a safety net
            try {
                page.objs.get(imageName, (imageData) => {
                    if (resolved) {
                        return;
                    }
                    resolved = true;
                    clearTimeout(timeout);
                    // The timeout is already cancelled here, so a throw from
                    // decoding/encoding would leave this promise pending
                    // forever; always settle it.
                    try {
                        resolve(processImageData(imageData));
                    }
                    catch (error) {
                        const message = error instanceof Error ? error.message : String(error);
                        console.warn(`[PDF Reader MCP] Error processing image ${imageName} on page ${String(pageNum)}: ${message}`);
                        resolve(null);
                    }
                });
            }
            catch (error) {
                // Registering the callback failed: cancel the timer and skip the image.
                if (!resolved) {
                    resolved = true;
                    clearTimeout(timeout);
                    const message = error instanceof Error ? error.message : String(error);
                    console.warn(`[PDF Reader MCP] Async image get failed for ${imageName} on page ${String(pageNum)}: ${message}`);
                    resolve(null);
                }
            }
        }));
        const resolvedImages = await Promise.all(imagePromises);
        images.push(...resolvedImages.filter((img) => img !== null));
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`[PDF Reader MCP] Error extracting images from page ${String(pageNum)}: ${message}`);
    }
    return images;
};
207
/**
 * Collect the images of every requested page, one page after another.
 * A page that fails to load is logged and skipped; the remaining pages are
 * still processed.
 *
 * @param {object} pdfDocument - Document exposing `getPage(pageNum)`.
 * @param {Iterable<number>} pagesToProcess - Page numbers to scan.
 * @returns {Promise<object[]>} All extracted images in page order.
 */
export const extractImages = async (pdfDocument, pagesToProcess) => {
    const collected = [];
    // Deliberately sequential: each page is awaited before the next starts.
    for (const pageNum of pagesToProcess) {
        try {
            const currentPage = await pdfDocument.getPage(pageNum);
            const found = await extractImagesFromPage(currentPage, pageNum);
            collected.push(...found);
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : String(error);
            console.warn(`[PDF Reader MCP] Error getting page ${String(pageNum)} for image extraction: ${reason}`);
        }
    }
    return collected;
};
226
/**
 * Produce the warnings list for page numbers that could not be served.
 *
 * @param {number[]} invalidPages - Page numbers flagged as invalid by the caller.
 * @param {number} totalPages - Page count quoted in the warning text.
 * @returns {string[]} Empty when there are no invalid pages, otherwise a
 *   single-element array with one combined warning.
 */
export const buildWarnings = (invalidPages, totalPages) => invalidPages.length === 0
    ? []
    : [`Requested page numbers ${invalidPages.join(', ')} exceed total pages (${String(totalPages)}).`];
237
/**
 * Extract all content (text and, optionally, images) from a single page,
 * sorted by descending Y position.
 *
 * Text items are grouped by their rounded Y coordinate (`transform[5]`) and
 * each group is joined into one text item; groups containing only whitespace
 * are dropped. Images are resolved from `page.commonObjs` / `page.objs` (sync
 * first, then callback with a 10 second timeout) and encoded as PNG. If the
 * page as a whole fails, the error is logged and a single text item carrying
 * the error message is returned instead of rejecting.
 *
 * @param {object} pdfDocument - Document exposing `getPage(pageNum)`.
 * @param {number} pageNum - Page number to extract.
 * @param {boolean} includeImages - When truthy, image items are extracted too.
 * @param {string} sourceDescription - Used in the warning message only.
 * @returns {Promise<object[]>} Items of shape
 *   `{ type: 'text', yPosition, textContent }` or
 *   `{ type: 'image', yPosition, imageData }`.
 */
export const extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescription) => {
    const contentItems = [];
    try {
        const page = await pdfDocument.getPage(pageNum);
        // Extract text content with Y-coordinates
        const pageText = await page.getTextContent();
        // Group text items by Y-coordinate (items on same line have similar Y values)
        const textByY = new Map();
        for (const textItem of pageText.items) {
            // transform[5] is the Y coordinate. Items without a transform are
            // skipped rather than failing the whole page.
            const yCoord = textItem.transform?.[5];
            if (yCoord === undefined)
                continue;
            const y = Math.round(yCoord);
            if (!textByY.has(y)) {
                textByY.set(y, []);
            }
            textByY.get(y)?.push(textItem.str);
        }
        // Convert grouped text to content items
        for (const [y, textParts] of textByY.entries()) {
            const lineText = textParts.join('');
            if (lineText.trim()) {
                contentItems.push({
                    type: 'text',
                    yPosition: y,
                    textContent: lineText,
                });
            }
        }
        // Extract images with Y-coordinates if requested
        if (includeImages) {
            const operatorList = await page.getOperatorList();
            // Find all image painting operations
            const imageIndices = [];
            for (let i = 0; i < operatorList.fnArray.length; i++) {
                const op = operatorList.fnArray[i];
                if (op === OPS.paintImageXObject || op === OPS.paintXObject) {
                    imageIndices.push(i);
                }
            }
            // Extract each image with its Y-coordinate - try sync first, then async if needed
            const imagePromises = imageIndices.map((imgIndex, arrayIndex) => new Promise((resolve) => {
                const argsArray = operatorList.argsArray[imgIndex];
                if (!argsArray || argsArray.length === 0) {
                    resolve(null);
                    return;
                }
                const imageName = argsArray[0];
                // A non-string name cannot be looked up; skip just this image.
                if (typeof imageName !== 'string') {
                    resolve(null);
                    return;
                }
                // Get transform matrix from the args (if available).
                // NOTE(review): this only fires when the second operator
                // argument is an array; if pdf.js passes [objId, width, height]
                // for image operations, yPosition stays 0 and images sort
                // after all text with positive Y - confirm against pdf.js.
                let yPosition = 0;
                if (argsArray.length > 1 && Array.isArray(argsArray[1])) {
                    const transform = argsArray[1];
                    const yCoord = transform[5];
                    if (yCoord !== undefined) {
                        yPosition = Math.round(yCoord);
                    }
                }
                // Helper to process image data
                const processImageData = (imageData) => {
                    if (!imageData || typeof imageData !== 'object') {
                        return null;
                    }
                    const img = imageData;
                    if (!img.data || !img.width || !img.height) {
                        return null;
                    }
                    // Determine number of channels based on kind
                    const channels = img.kind === 1 ? 1 : img.kind === 3 ? 4 : 3;
                    const format = img.kind === 1 ? 'grayscale' : img.kind === 3 ? 'rgba' : 'rgb';
                    // Encode raw pixel data to PNG format
                    const pngBase64 = encodePixelsToPNG(img.data, img.width, img.height, channels);
                    return {
                        type: 'image',
                        yPosition,
                        imageData: {
                            page: pageNum,
                            index: arrayIndex,
                            width: img.width,
                            height: img.height,
                            format,
                            data: pngBase64,
                        },
                    };
                };
                // Try to get from commonObjs first if it starts with 'g_'
                if (imageName.startsWith('g_')) {
                    try {
                        const imageData = page.commonObjs.get(imageName);
                        if (imageData) {
                            const result = processImageData(imageData);
                            resolve(result);
                            return;
                        }
                        /* c8 ignore next */
                    }
                    catch (error) {
                        /* c8 ignore next */ const message = error instanceof Error ? error.message : String(error);
                        /* c8 ignore next */ console.warn(
                        /* c8 ignore next */ `[PDF Reader MCP] Error getting image from commonObjs ${imageName}: ${message}`
                        /* c8 ignore next */
                        );
                    }
                }
                // Try synchronous get first - if image is already loaded
                try {
                    const imageData = page.objs.get(imageName);
                    if (imageData !== undefined) {
                        const result = processImageData(imageData);
                        resolve(result);
                        return;
                    }
                }
                catch (error) {
                    const message = error instanceof Error ? error.message : String(error);
                    console.warn(`[PDF Reader MCP] Sync image get failed for ${imageName}, trying async: ${message}`);
                }
                // Fallback to async callback-based get with timeout
                let resolved = false;
                const timeout = setTimeout(() => {
                    if (!resolved) {
                        resolved = true;
                        console.warn(`[PDF Reader MCP] Image extraction timeout for ${imageName} on page ${String(pageNum)}`);
                        resolve(null);
                    }
                }, 10000); // 10 second timeout as a safety net
                try {
                    page.objs.get(imageName, (imageData) => {
                        if (resolved) {
                            return;
                        }
                        resolved = true;
                        clearTimeout(timeout);
                        // The timeout is already cancelled here, so a throw
                        // from decoding/encoding would leave this promise
                        // pending forever; always settle it.
                        try {
                            resolve(processImageData(imageData));
                        }
                        catch (error) {
                            const message = error instanceof Error ? error.message : String(error);
                            console.warn(`[PDF Reader MCP] Error processing image ${imageName} on page ${String(pageNum)}: ${message}`);
                            resolve(null);
                        }
                    });
                }
                catch (error) {
                    // Registering the callback failed: cancel the timer and skip the image.
                    if (!resolved) {
                        resolved = true;
                        clearTimeout(timeout);
                        const message = error instanceof Error ? error.message : String(error);
                        console.warn(`[PDF Reader MCP] Async image get failed for ${imageName} on page ${String(pageNum)}: ${message}`);
                        resolve(null);
                    }
                }
            }));
            const resolvedImages = await Promise.all(imagePromises);
            contentItems.push(...resolvedImages.filter((item) => item !== null));
        }
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`[PDF Reader MCP] Error extracting page content for page ${String(pageNum)} in ${sourceDescription}: ${message}`);
        // Return error message as text content
        return [
            {
                type: 'text',
                yPosition: 0,
                textContent: `Error processing page: ${message}`,
            },
        ];
    }
    // Sort by Y-position (descending = top to bottom in PDF coordinates)
    return contentItems.sort((a, b) => b.yPosition - a.yPosition);
};
@@ -1,53 +0,0 @@
1
- // PDF document loading utilities
2
- import fs from 'node:fs/promises';
3
- import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';
4
- import { getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs';
5
- import { resolvePath } from '../utils/pathUtils.js';
6
/**
 * Open a PDF from a local path or a URL.
 *
 * A `path` takes precedence over a `url`: the file is read into memory and
 * handed to `getDocument` as bytes, while a URL is passed through as
 * `{ url }`.
 *
 * @param source - Object containing either path or url
 * @param sourceDescription - Description for error messages
 * @returns PDF document proxy
 * @throws {McpError} InvalidParams when neither path nor url is present;
 *   InvalidRequest when the file cannot be read or the PDF fails to load.
 */
export const loadPdfDocument = async (source, sourceDescription) => {
    let pdfDataSource;
    try {
        if (source.path) {
            const fileBytes = await fs.readFile(resolvePath(source.path));
            pdfDataSource = new Uint8Array(fileBytes);
        }
        else if (source.url) {
            pdfDataSource = { url: source.url };
        }
        else {
            throw new McpError(ErrorCode.InvalidParams, `Source ${sourceDescription} missing 'path' or 'url'.`);
        }
    }
    catch (prepError) {
        // Errors raised above on purpose pass through untouched.
        if (prepError instanceof McpError) {
            throw prepError;
        }
        const reason = prepError instanceof Error ? prepError.message : String(prepError);
        const cause = prepError instanceof Error ? prepError : undefined;
        const isMissingFile = typeof prepError === 'object' &&
            prepError !== null &&
            'code' in prepError &&
            prepError.code === 'ENOENT' &&
            Boolean(source.path);
        const text = isMissingFile
            ? `File not found at '${source.path}'.`
            : `Failed to prepare PDF source ${sourceDescription}. Reason: ${reason}`;
        throw new McpError(ErrorCode.InvalidRequest, text, { cause });
    }
    // Created outside the try block: only the awaited promise is wrapped.
    const loadingTask = getDocument(pdfDataSource);
    try {
        const documentProxy = await loadingTask.promise;
        return documentProxy;
    }
    catch (loadError) {
        console.error(`[PDF Reader MCP] PDF.js loading error for ${sourceDescription}:`, loadError);
        const reason = loadError instanceof Error ? loadError.message : String(loadError);
        throw new McpError(ErrorCode.InvalidRequest, `Failed to load PDF document from ${sourceDescription}. Reason: ${reason || 'Unknown loading error'}`, { cause: loadError instanceof Error ? loadError : undefined });
    }
};