@sylphx/pdf-reader-mcp 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  // PDF reading handler - orchestrates PDF processing workflow
2
2
  import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';
3
3
  import { z } from 'zod';
4
- import { buildWarnings, extractImages, extractMetadataAndPageCount, extractPageTexts, } from '../pdf/extractor.js';
4
+ import { buildWarnings, extractMetadataAndPageCount, extractPageContent, } from '../pdf/extractor.js';
5
5
  import { loadPdfDocument } from '../pdf/loader.js';
6
6
  import { determinePagesToProcess, getTargetPages } from '../pdf/parser.js';
7
7
  import { readPdfArgsSchema } from '../schemas/readPdf.js';
@@ -28,9 +28,23 @@ const processSingleSource = async (source, options) => {
28
28
  if (warnings.length > 0) {
29
29
  output.warnings = warnings;
30
30
  }
31
- // Extract text if needed
31
+ // Extract content with ordering preserved
32
32
  if (pagesToProcess.length > 0) {
33
- const extractedPageTexts = await extractPageTexts(pdfDocument, pagesToProcess, sourceDescription);
33
+ // Use new extractPageContent to preserve Y-coordinate ordering
34
+ const pageContents = await Promise.all(pagesToProcess.map((pageNum) => extractPageContent(pdfDocument, pageNum, options.includeImages, sourceDescription)));
35
+ // Store page contents for ordered retrieval
36
+ output.page_contents = pageContents.map((items, idx) => ({
37
+ page: pagesToProcess[idx],
38
+ items,
39
+ }));
40
+ // For backward compatibility, also provide text-only outputs
41
+ const extractedPageTexts = pageContents.map((items, idx) => ({
42
+ page: pagesToProcess[idx],
43
+ text: items
44
+ .filter((item) => item.type === 'text')
45
+ .map((item) => item.textContent)
46
+ .join(''),
47
+ }));
34
48
  if (targetPages) {
35
49
  // Specific pages requested
36
50
  output.page_texts = extractedPageTexts;
@@ -39,12 +53,15 @@ const processSingleSource = async (source, options) => {
39
53
  // Full text requested
40
54
  output.full_text = extractedPageTexts.map((p) => p.text).join('\n\n');
41
55
  }
42
- }
43
- // Extract images if needed
44
- if (options.includeImages && pagesToProcess.length > 0) {
45
- const extractedImages = await extractImages(pdfDocument, pagesToProcess);
46
- if (extractedImages.length > 0) {
47
- output.images = extractedImages;
56
+ // Extract image metadata for JSON response
57
+ if (options.includeImages) {
58
+ const extractedImages = pageContents
59
+ .flatMap((items) => items.filter((item) => item.type === 'image' && item.imageData))
60
+ .map((item) => item.imageData)
61
+ .filter((img) => img !== undefined);
62
+ if (extractedImages.length > 0) {
63
+ output.images = extractedImages;
64
+ }
48
65
  }
49
66
  }
50
67
  individualResult = { ...individualResult, data: output, success: true };
@@ -91,19 +108,22 @@ export const handleReadPdfFunc = async (args) => {
91
108
  })));
92
109
  // Build content parts - start with structured JSON for backward compatibility
93
110
  const content = [];
94
- // Strip image data from JSON to keep it manageable
111
+ // Strip image data and page_contents from JSON to keep it manageable
95
112
  const resultsForJson = results.map((result) => {
96
- if (result.data?.images) {
97
- const { images, ...dataWithoutImages } = result.data;
113
+ if (result.data) {
114
+ const { images, page_contents, ...dataWithoutBinaryContent } = result.data;
98
115
  // Include image count and metadata in JSON, but not the base64 data
99
- const imageInfo = images.map((img) => ({
100
- page: img.page,
101
- index: img.index,
102
- width: img.width,
103
- height: img.height,
104
- format: img.format,
105
- }));
106
- return { ...result, data: { ...dataWithoutImages, image_info: imageInfo } };
116
+ if (images) {
117
+ const imageInfo = images.map((img) => ({
118
+ page: img.page,
119
+ index: img.index,
120
+ width: img.width,
121
+ height: img.height,
122
+ format: img.format,
123
+ }));
124
+ return { ...result, data: { ...dataWithoutBinaryContent, image_info: imageInfo } };
125
+ }
126
+ return { ...result, data: dataWithoutBinaryContent };
107
127
  }
108
128
  return result;
109
129
  });
@@ -112,40 +132,27 @@ export const handleReadPdfFunc = async (args) => {
112
132
  type: 'text',
113
133
  text: JSON.stringify({ results: resultsForJson }, null, 2),
114
134
  });
115
- // Add page content in order: text then images for each page
116
- if (include_images) {
117
- for (const result of results) {
118
- if (!result.success || !result.data)
119
- continue;
120
- // Handle page_texts (specific pages requested)
121
- if (result.data.page_texts) {
122
- for (const pageText of result.data.page_texts) {
123
- // Add images for this page (if any) right after page text
124
- if (result.data.images) {
125
- const pageImages = result.data.images.filter((img) => img.page === pageText.page);
126
- for (const image of pageImages) {
127
- content.push({
128
- type: 'image',
129
- data: image.data,
130
- mimeType: image.format === 'rgba' ? 'image/png' : 'image/jpeg',
131
- });
132
- }
133
- }
135
+ // Add page content in exact Y-coordinate order
136
+ for (const result of results) {
137
+ if (!result.success || !result.data?.page_contents)
138
+ continue;
139
+ // Process each page's content items in order
140
+ for (const pageContent of result.data.page_contents) {
141
+ for (const item of pageContent.items) {
142
+ if (item.type === 'text' && item.textContent) {
143
+ // Add text content part
144
+ content.push({
145
+ type: 'text',
146
+ text: item.textContent,
147
+ });
134
148
  }
135
- }
136
- // Handle full_text mode - add all images by page order
137
- if (result.data.full_text && result.data.images) {
138
- // Group images by page and add in order
139
- const pageNumbers = [...new Set(result.data.images.map((img) => img.page))].sort((a, b) => a - b);
140
- for (const pageNum of pageNumbers) {
141
- const pageImages = result.data.images.filter((img) => img.page === pageNum);
142
- for (const image of pageImages) {
143
- content.push({
144
- type: 'image',
145
- data: image.data,
146
- mimeType: image.format === 'rgba' ? 'image/png' : 'image/jpeg',
147
- });
148
- }
149
+ else if (item.type === 'image' && item.imageData) {
150
+ // Add image content part
151
+ content.push({
152
+ type: 'image',
153
+ data: item.imageData.data,
154
+ mimeType: item.imageData.format === 'rgba' ? 'image/png' : 'image/jpeg',
155
+ });
149
156
  }
150
157
  }
151
158
  }
package/dist/index.js CHANGED
@@ -11,7 +11,7 @@ import { allToolDefinitions } from './handlers/index.js';
11
11
  // --- Server Setup ---
12
12
  const server = new Server({
13
13
  name: 'pdf-reader-mcp',
14
- version: '1.1.0',
14
+ version: '1.2.0',
15
15
  description: 'MCP Server for reading PDF files and extracting text, metadata, images, and page information.',
16
16
  }, {
17
17
  capabilities: { tools: {} },
@@ -151,3 +151,115 @@ export const buildWarnings = (invalidPages, totalPages) => {
151
151
  `Requested page numbers ${invalidPages.join(', ')} exceed total pages (${String(totalPages)}).`,
152
152
  ];
153
153
  };
/** Upper bound (ms) on waiting for pdf.js to hand over one decoded image. */
const IMAGE_RESOLVE_TIMEOUT_MS = 10000;

/** Identity affine matrix in the `[a, b, c, d, e, f]` layout used by PDF operators. */
const IDENTITY_MATRIX = [1, 0, 0, 1, 0, 0];

/** True when `value` is an array-like holding the six numbers of an affine matrix. */
const isAffineMatrix = (value) => value != null && typeof value === 'object' && value.length === 6;

/**
 * Concatenate two affine matrices: the result applies `m2` first, then `m1`.
 * This is how a `transform` operator's operands are folded into the current matrix.
 */
const concatMatrices = (m1, m2) => [
    m1[0] * m2[0] + m1[2] * m2[1],
    m1[1] * m2[0] + m1[3] * m2[1],
    m1[0] * m2[2] + m1[2] * m2[3],
    m1[1] * m2[2] + m1[3] * m2[3],
    m1[0] * m2[4] + m1[2] * m2[5] + m1[4],
    m1[1] * m2[4] + m1[3] * m2[5] + m1[5],
];

/**
 * Walk an operator list once and record, for every image painting operation,
 * its index in the list and the Y translation of the transformation matrix in
 * effect at that point. The matrix is tracked through save / restore /
 * transform operators and through form XObject begin / end markers.
 */
const collectImagePlacements = (operatorList) => {
    const placements = [];
    const savedMatrices = [];
    let currentMatrix = IDENTITY_MATRIX;
    const { fnArray, argsArray } = operatorList;
    for (let i = 0; i < fnArray.length; i++) {
        const op = fnArray[i];
        const args = argsArray[i];
        if (op === OPS.save) {
            savedMatrices.push(currentMatrix);
        }
        else if (op === OPS.restore || op === OPS.paintFormXObjectEnd) {
            // An unbalanced restore keeps the current matrix instead of failing.
            currentMatrix = savedMatrices.pop() ?? currentMatrix;
        }
        else if (op === OPS.transform) {
            if (isAffineMatrix(args)) {
                currentMatrix = concatMatrices(currentMatrix, args);
            }
        }
        else if (op === OPS.paintFormXObjectBegin) {
            // A form XObject opens its own graphics state and may carry a matrix.
            savedMatrices.push(currentMatrix);
            const formMatrix = args?.[0];
            if (isAffineMatrix(formMatrix)) {
                currentMatrix = concatMatrices(currentMatrix, formMatrix);
            }
        }
        else if (op === OPS.paintImageXObject || op === OPS.paintXObject) {
            placements.push({ opIndex: i, y: currentMatrix[5] });
        }
    }
    return placements;
};

/**
 * Fetch a decoded image object from the page, resolving to `null` when the
 * lookup throws or does not call back within IMAGE_RESOLVE_TIMEOUT_MS.
 * Ids prefixed with `g_` are looked up in `page.commonObjs` (pdf.js keeps
 * document-wide objects there); everything else comes from `page.objs`.
 */
const resolveImageObject = (page, imageName, sourceDescription) => new Promise((resolve) => {
    const isShared = typeof imageName === 'string' && imageName.startsWith('g_');
    const store = isShared && page.commonObjs ? page.commonObjs : page.objs;
    let settled = false;
    const finish = (value) => {
        if (settled) {
            return;
        }
        settled = true;
        clearTimeout(timer);
        resolve(value);
    };
    const timer = setTimeout(() => {
        console.warn(`[PDF Reader MCP] Timed out waiting for image ${String(imageName)} in ${sourceDescription}`);
        finish(null);
    }, IMAGE_RESOLVE_TIMEOUT_MS);
    try {
        // Callback form of get(): the image may not be decoded yet.
        store.get(imageName, (imageData) => finish(imageData));
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`[PDF Reader MCP] Error retrieving image ${String(imageName)} in ${sourceDescription}: ${message}`);
        finish(null);
    }
});

/**
 * Build the content item for one image painting operation, or `null` when the
 * operation has no operands or the image object is missing or incomplete.
 */
const buildImageItem = async (page, args, trackedY, pageNum, index, sourceDescription) => {
    if (!args || args.length === 0) {
        return null;
    }
    const imageName = args[0];
    let yPosition = Number.isFinite(trackedY) ? Math.round(trackedY) : 0;
    // An explicit matrix operand, when an operation carries one, takes precedence.
    if (args.length > 1 && Array.isArray(args[1]) && args[1][5] !== undefined) {
        yPosition = Math.round(args[1][5]);
    }
    const img = await resolveImageObject(page, imageName, sourceDescription);
    if (!img || typeof img !== 'object') {
        return null;
    }
    if (!img.data || !img.width || !img.height) {
        return null;
    }
    // kind 1 -> grayscale, kind 3 -> rgba, anything else -> rgb
    const format = img.kind === 1 ? 'grayscale' : img.kind === 3 ? 'rgba' : 'rgb';
    return {
        type: 'image',
        yPosition,
        imageData: {
            page: pageNum,
            index,
            width: img.width,
            height: img.height,
            format,
            data: Buffer.from(img.data).toString('base64'),
        },
    };
};

/**
 * Extract all content (text and, optionally, images) from a single page,
 * ordered top to bottom by Y coordinate.
 *
 * Text items are grouped into lines by their rounded Y coordinate
 * (`transform[5]`); items without a usable coordinate are skipped. Each image
 * is positioned by the transformation matrix in effect when it is painted, so
 * it can interleave with the text.
 *
 * @param {object} pdfDocument - Document exposing `getPage(pageNum)`.
 * @param {number} pageNum - Page number handed to `getPage`.
 * @param {boolean} includeImages - When truthy, image items are extracted too.
 * @param {string} sourceDescription - Label used in warning messages.
 * @returns {Promise<Array<object>>} Items shaped `{ type: 'text', yPosition, textContent }`
 *   or `{ type: 'image', yPosition, imageData }`, sorted by descending `yPosition`.
 *   If page processing throws, a single text item describing the error is returned.
 */
export const extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescription) => {
    const contentItems = [];
    try {
        const page = await pdfDocument.getPage(pageNum);
        const textContent = await page.getTextContent();
        // Group text fragments into lines keyed by rounded Y coordinate.
        const textByY = new Map();
        for (const item of textContent.items) {
            const yCoord = item?.transform?.[5];
            if (typeof yCoord !== 'number' || !Number.isFinite(yCoord)) {
                // Items without a position cannot be placed on a line.
                continue;
            }
            const y = Math.round(yCoord);
            const fragment = item.str ?? '';
            const parts = textByY.get(y);
            if (parts) {
                parts.push(fragment);
            }
            else {
                textByY.set(y, [fragment]);
            }
        }
        for (const [y, textParts] of textByY) {
            const lineText = textParts.join('');
            if (lineText.trim()) {
                contentItems.push({
                    type: 'text',
                    yPosition: y,
                    textContent: lineText,
                });
            }
        }
        if (includeImages) {
            const operatorList = await page.getOperatorList();
            const placements = collectImagePlacements(operatorList);
            const resolvedImages = await Promise.all(placements.map((placement, arrayIndex) => buildImageItem(page, operatorList.argsArray[placement.opIndex], placement.y, pageNum, arrayIndex, sourceDescription)));
            contentItems.push(...resolvedImages.filter((item) => item !== null));
        }
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`[PDF Reader MCP] Error extracting page content for page ${String(pageNum)} in ${sourceDescription}: ${message}`);
        // Surface the failure as text content so the caller still gets a result.
        return [
            {
                type: 'text',
                yPosition: 0,
                textContent: `Error processing page: ${message}`,
            },
        ];
    }
    // PDF Y coordinates grow upward, so descending order reads top to bottom.
    return contentItems.sort((a, b) => b.yPosition - a.yPosition);
};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sylphx/pdf-reader-mcp",
3
- "version": "1.1.0",
3
+ "version": "1.2.0",
4
4
  "description": "An MCP server providing tools to read PDF files.",
5
5
  "type": "module",
6
6
  "bin": {