@sylphx/pdf-reader-mcp 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/handlers/readPdf.js +60 -53
- package/dist/index.js +1 -1
- package/dist/pdf/extractor.js +112 -0
- package/package.json +1 -1
package/dist/handlers/readPdf.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
// PDF reading handler - orchestrates PDF processing workflow
|
|
2
2
|
import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';
|
|
3
3
|
import { z } from 'zod';
|
|
4
|
-
import { buildWarnings,
|
|
4
|
+
import { buildWarnings, extractMetadataAndPageCount, extractPageContent, } from '../pdf/extractor.js';
|
|
5
5
|
import { loadPdfDocument } from '../pdf/loader.js';
|
|
6
6
|
import { determinePagesToProcess, getTargetPages } from '../pdf/parser.js';
|
|
7
7
|
import { readPdfArgsSchema } from '../schemas/readPdf.js';
|
|
@@ -28,9 +28,23 @@ const processSingleSource = async (source, options) => {
|
|
|
28
28
|
if (warnings.length > 0) {
|
|
29
29
|
output.warnings = warnings;
|
|
30
30
|
}
|
|
31
|
-
// Extract
|
|
31
|
+
// Extract content with ordering preserved
|
|
32
32
|
if (pagesToProcess.length > 0) {
|
|
33
|
-
|
|
33
|
+
// Use new extractPageContent to preserve Y-coordinate ordering
|
|
34
|
+
const pageContents = await Promise.all(pagesToProcess.map((pageNum) => extractPageContent(pdfDocument, pageNum, options.includeImages, sourceDescription)));
|
|
35
|
+
// Store page contents for ordered retrieval
|
|
36
|
+
output.page_contents = pageContents.map((items, idx) => ({
|
|
37
|
+
page: pagesToProcess[idx],
|
|
38
|
+
items,
|
|
39
|
+
}));
|
|
40
|
+
// For backward compatibility, also provide text-only outputs
|
|
41
|
+
const extractedPageTexts = pageContents.map((items, idx) => ({
|
|
42
|
+
page: pagesToProcess[idx],
|
|
43
|
+
text: items
|
|
44
|
+
.filter((item) => item.type === 'text')
|
|
45
|
+
.map((item) => item.textContent)
|
|
46
|
+
.join(''),
|
|
47
|
+
}));
|
|
34
48
|
if (targetPages) {
|
|
35
49
|
// Specific pages requested
|
|
36
50
|
output.page_texts = extractedPageTexts;
|
|
@@ -39,12 +53,15 @@ const processSingleSource = async (source, options) => {
|
|
|
39
53
|
// Full text requested
|
|
40
54
|
output.full_text = extractedPageTexts.map((p) => p.text).join('\n\n');
|
|
41
55
|
}
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
56
|
+
// Extract image metadata for JSON response
|
|
57
|
+
if (options.includeImages) {
|
|
58
|
+
const extractedImages = pageContents
|
|
59
|
+
.flatMap((items) => items.filter((item) => item.type === 'image' && item.imageData))
|
|
60
|
+
.map((item) => item.imageData)
|
|
61
|
+
.filter((img) => img !== undefined);
|
|
62
|
+
if (extractedImages.length > 0) {
|
|
63
|
+
output.images = extractedImages;
|
|
64
|
+
}
|
|
48
65
|
}
|
|
49
66
|
}
|
|
50
67
|
individualResult = { ...individualResult, data: output, success: true };
|
|
@@ -91,19 +108,22 @@ export const handleReadPdfFunc = async (args) => {
|
|
|
91
108
|
})));
|
|
92
109
|
// Build content parts - start with structured JSON for backward compatibility
|
|
93
110
|
const content = [];
|
|
94
|
-
// Strip image data from JSON to keep it manageable
|
|
111
|
+
// Strip image data and page_contents from JSON to keep it manageable
|
|
95
112
|
const resultsForJson = results.map((result) => {
|
|
96
|
-
if (result.data
|
|
97
|
-
const { images, ...
|
|
113
|
+
if (result.data) {
|
|
114
|
+
const { images, page_contents, ...dataWithoutBinaryContent } = result.data;
|
|
98
115
|
// Include image count and metadata in JSON, but not the base64 data
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
116
|
+
if (images) {
|
|
117
|
+
const imageInfo = images.map((img) => ({
|
|
118
|
+
page: img.page,
|
|
119
|
+
index: img.index,
|
|
120
|
+
width: img.width,
|
|
121
|
+
height: img.height,
|
|
122
|
+
format: img.format,
|
|
123
|
+
}));
|
|
124
|
+
return { ...result, data: { ...dataWithoutBinaryContent, image_info: imageInfo } };
|
|
125
|
+
}
|
|
126
|
+
return { ...result, data: dataWithoutBinaryContent };
|
|
107
127
|
}
|
|
108
128
|
return result;
|
|
109
129
|
});
|
|
@@ -112,40 +132,27 @@ export const handleReadPdfFunc = async (args) => {
|
|
|
112
132
|
type: 'text',
|
|
113
133
|
text: JSON.stringify({ results: resultsForJson }, null, 2),
|
|
114
134
|
});
|
|
115
|
-
// Add page content in
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
// Add
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
type: 'image',
|
|
129
|
-
data: image.data,
|
|
130
|
-
mimeType: image.format === 'rgba' ? 'image/png' : 'image/jpeg',
|
|
131
|
-
});
|
|
132
|
-
}
|
|
133
|
-
}
|
|
135
|
+
// Add page content in exact Y-coordinate order
|
|
136
|
+
for (const result of results) {
|
|
137
|
+
if (!result.success || !result.data?.page_contents)
|
|
138
|
+
continue;
|
|
139
|
+
// Process each page's content items in order
|
|
140
|
+
for (const pageContent of result.data.page_contents) {
|
|
141
|
+
for (const item of pageContent.items) {
|
|
142
|
+
if (item.type === 'text' && item.textContent) {
|
|
143
|
+
// Add text content part
|
|
144
|
+
content.push({
|
|
145
|
+
type: 'text',
|
|
146
|
+
text: item.textContent,
|
|
147
|
+
});
|
|
134
148
|
}
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
for (const image of pageImages) {
|
|
143
|
-
content.push({
|
|
144
|
-
type: 'image',
|
|
145
|
-
data: image.data,
|
|
146
|
-
mimeType: image.format === 'rgba' ? 'image/png' : 'image/jpeg',
|
|
147
|
-
});
|
|
148
|
-
}
|
|
149
|
+
else if (item.type === 'image' && item.imageData) {
|
|
150
|
+
// Add image content part
|
|
151
|
+
content.push({
|
|
152
|
+
type: 'image',
|
|
153
|
+
data: item.imageData.data,
|
|
154
|
+
mimeType: item.imageData.format === 'rgba' ? 'image/png' : 'image/jpeg',
|
|
155
|
+
});
|
|
149
156
|
}
|
|
150
157
|
}
|
|
151
158
|
}
|
package/dist/index.js
CHANGED
|
@@ -11,7 +11,7 @@ import { allToolDefinitions } from './handlers/index.js';
|
|
|
11
11
|
// --- Server Setup ---
|
|
12
12
|
const server = new Server({
|
|
13
13
|
name: 'pdf-reader-mcp',
|
|
14
|
-
version: '1.
|
|
14
|
+
version: '1.2.0',
|
|
15
15
|
description: 'MCP Server for reading PDF files and extracting text, metadata, images, and page information.',
|
|
16
16
|
}, {
|
|
17
17
|
capabilities: { tools: {} },
|
package/dist/pdf/extractor.js
CHANGED
|
@@ -151,3 +151,115 @@ export const buildWarnings = (invalidPages, totalPages) => {
|
|
|
151
151
|
`Requested page numbers ${invalidPages.join(', ')} exceed total pages (${String(totalPages)}).`,
|
|
152
152
|
];
|
|
153
153
|
};
|
|
154
|
+
/**
 * Extract all content (text and, optionally, images) from a single PDF page,
 * ordered top-to-bottom by Y-coordinate.
 *
 * Text fragments whose rounded Y-coordinate is equal are merged into one
 * "line" item (fragments are concatenated in the order pdf.js reports them).
 * Lines that are empty after trimming are dropped.
 *
 * This function never rejects: if the page cannot be processed at all, a
 * warning is logged and a single text item describing the error is returned.
 * A failure while fetching one image only drops that image.
 *
 * @param {object} pdfDocument - Loaded document exposing `getPage(pageNum)`.
 * @param {number} pageNum - Page number handed to `pdfDocument.getPage`.
 * @param {boolean} includeImages - When truthy, painted images are extracted as well.
 * @param {string} sourceDescription - Only used to label the warning log message.
 * @returns {Promise<Array<object>>} Items of the form
 *   `{ type: 'text', yPosition, textContent }` or
 *   `{ type: 'image', yPosition, imageData: { page, index, width, height, format, data } }`
 *   (`data` is base64), sorted by descending `yPosition`.
 */
export const extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescription) => {
    const contentItems = [];
    try {
        const page = await pdfDocument.getPage(pageNum);
        const pageText = await page.getTextContent();
        // Group text fragments by rounded Y so fragments on the same line end up together.
        const textByY = new Map();
        for (const item of pageText.items) {
            // transform[5] is the Y coordinate. Entries without a usable transform
            // (e.g. marked-content entries) are skipped instead of throwing, which
            // would otherwise discard the whole page.
            const yCoord = item?.transform?.[5];
            if (!Number.isFinite(yCoord)) {
                continue;
            }
            const y = Math.round(yCoord);
            const lineParts = textByY.get(y);
            if (lineParts) {
                lineParts.push(item.str);
            }
            else {
                textByY.set(y, [item.str]);
            }
        }
        // Convert grouped text to content items, skipping whitespace-only lines.
        for (const [y, lineParts] of textByY) {
            const lineText = lineParts.join('');
            if (lineText.trim()) {
                contentItems.push({
                    type: 'text',
                    yPosition: y,
                    textContent: lineText,
                });
            }
        }
        if (includeImages) {
            const operatorList = await page.getOperatorList();
            // Positions (within the operator list) of all image painting operations.
            const imageIndices = [];
            operatorList.fnArray.forEach((op, opIndex) => {
                if (op === OPS.paintImageXObject || op === OPS.paintXObject) {
                    imageIndices.push(opIndex);
                }
            });
            // Turn a resolved pdf.js image object into a content item, or null if unusable.
            const toImageItem = (imageData, yPosition, arrayIndex) => {
                if (!imageData || typeof imageData !== 'object') {
                    return null;
                }
                if (!imageData.data || !imageData.width || !imageData.height) {
                    return null;
                }
                // Determine image format based on kind
                const format = imageData.kind === 1 ? 'grayscale' : imageData.kind === 3 ? 'rgba' : 'rgb';
                return {
                    type: 'image',
                    yPosition,
                    imageData: {
                        page: pageNum,
                        index: arrayIndex,
                        width: imageData.width,
                        height: imageData.height,
                        format,
                        // Convert Uint8Array to base64
                        data: Buffer.from(imageData.data).toString('base64'),
                    },
                };
            };
            const logImageFailure = (imageName, error) => {
                const message = error instanceof Error ? error.message : String(error);
                console.warn(`[PDF Reader MCP] Error extracting image ${String(imageName)} on page ${String(pageNum)} in ${sourceDescription}: ${message}`);
            };
            const imagePromises = imageIndices.map((opIndex, arrayIndex) => new Promise((resolve) => {
                const args = operatorList.argsArray[opIndex];
                if (!args || args.length === 0) {
                    resolve(null);
                    return;
                }
                const imageName = args[0];
                // Use a transform matrix from the operator args when one is present.
                // NOTE(review): for paintImageXObject pdf.js appears to pass
                // [objId, width, height], in which case yPosition stays 0 and images
                // sort after all text - confirm and track the CTM if ordering matters.
                let yPosition = 0;
                const maybeTransform = args[1];
                if (Array.isArray(maybeTransform) && maybeTransform[5] !== undefined) {
                    yPosition = Math.round(maybeTransform[5]);
                }
                // pdf.js keeps document-wide objects (ids prefixed "g_") in commonObjs;
                // asking page.objs for them would register a callback that never fires.
                const isGlobalObject = typeof imageName === 'string' && imageName.startsWith('g_');
                const objectStore = isGlobalObject && page.commonObjs ? page.commonObjs : page.objs;
                try {
                    // Use callback-based get() as images may not be resolved yet
                    objectStore.get(imageName, (imageData) => {
                        try {
                            resolve(toImageItem(imageData, yPosition, arrayIndex));
                        }
                        catch (error) {
                            logImageFailure(imageName, error);
                            resolve(null);
                        }
                    });
                }
                catch (error) {
                    // One unreadable image must not discard the rest of the page.
                    logImageFailure(imageName, error);
                    resolve(null);
                }
            }));
            const resolvedImages = await Promise.all(imagePromises);
            contentItems.push(...resolvedImages.filter((item) => item !== null));
        }
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`[PDF Reader MCP] Error extracting page content for page ${String(pageNum)} in ${sourceDescription}: ${message}`);
        // Return error message as text content
        return [
            {
                type: 'text',
                yPosition: 0,
                textContent: `Error processing page: ${message}`,
            },
        ];
    }
    // Sort by Y-position (descending = top to bottom in PDF coordinates)
    return contentItems.sort((a, b) => b.yPosition - a.yPosition);
};
|