@sylphx/pdf-reader-mcp 1.2.0 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +544 -250
- package/dist/index.js +524 -45
- package/package.json +46 -36
- package/dist/handlers/index.js +0 -4
- package/dist/handlers/readPdf.js +0 -168
- package/dist/pdf/extractor.js +0 -265
- package/dist/pdf/loader.js +0 -53
- package/dist/pdf/parser.js +0 -94
- package/dist/schemas/readPdf.js +0 -55
- package/dist/types/pdf.js +0 -2
- package/dist/utils/pathUtils.js +0 -30
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sylphx/pdf-reader-mcp",
|
|
3
|
-
"version": "1.2.0",
|
|
3
|
+
"version": "1.3.2",
|
|
4
4
|
"description": "An MCP server providing tools to read PDF files.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -19,12 +19,12 @@
|
|
|
19
19
|
},
|
|
20
20
|
"repository": {
|
|
21
21
|
"type": "git",
|
|
22
|
-
"url": "git+https://github.com/
|
|
22
|
+
"url": "git+https://github.com/SylphxAI/pdf-reader-mcp.git"
|
|
23
23
|
},
|
|
24
24
|
"bugs": {
|
|
25
|
-
"url": "https://github.com/
|
|
25
|
+
"url": "https://github.com/SylphxAI/pdf-reader-mcp/issues"
|
|
26
26
|
},
|
|
27
|
-
"homepage": "https://github.com/
|
|
27
|
+
"homepage": "https://github.com/SylphxAI/pdf-reader-mcp#readme",
|
|
28
28
|
"author": "Sylphx <contact@sylphx.com> (https://sylphx.com)",
|
|
29
29
|
"license": "MIT",
|
|
30
30
|
"keywords": [
|
|
@@ -39,20 +39,53 @@
|
|
|
39
39
|
"agent",
|
|
40
40
|
"tool"
|
|
41
41
|
],
|
|
42
|
+
"scripts": {
|
|
43
|
+
"build": "bunup",
|
|
44
|
+
"watch": "tsc --watch",
|
|
45
|
+
"inspector": "npx @modelcontextprotocol/inspector dist/index.js",
|
|
46
|
+
"test": "vitest run",
|
|
47
|
+
"test:watch": "vitest watch",
|
|
48
|
+
"test:cov": "vitest run --coverage",
|
|
49
|
+
"lint": "biome lint .",
|
|
50
|
+
"lint:fix": "biome lint --write .",
|
|
51
|
+
"format": "biome format --write .",
|
|
52
|
+
"check-format": "biome format .",
|
|
53
|
+
"check": "biome check .",
|
|
54
|
+
"check:fix": "biome check --write .",
|
|
55
|
+
"validate": "bun run check && bun run test",
|
|
56
|
+
"docs:dev": "vitepress dev docs",
|
|
57
|
+
"docs:build": "vitepress build docs",
|
|
58
|
+
"docs:preview": "vitepress preview docs",
|
|
59
|
+
"start": "node dist/index.js",
|
|
60
|
+
"typecheck": "tsc --noEmit",
|
|
61
|
+
"benchmark": "vitest bench",
|
|
62
|
+
"clean": "rm -rf dist coverage",
|
|
63
|
+
"docs:api": "typedoc --entryPoints src/index.ts --tsconfig tsconfig.json --plugin typedoc-plugin-markdown --out docs/api --readme none",
|
|
64
|
+
"prepublishOnly": "bun run clean && bun run build",
|
|
65
|
+
"release": "standard-version",
|
|
66
|
+
"prepare": "husky",
|
|
67
|
+
"changeset": "changeset",
|
|
68
|
+
"version-packages": "changeset version",
|
|
69
|
+
"release:new": "bun run build && changeset publish"
|
|
70
|
+
},
|
|
42
71
|
"dependencies": {
|
|
43
|
-
"@modelcontextprotocol/sdk": "1.
|
|
72
|
+
"@modelcontextprotocol/sdk": "^1.21.0",
|
|
44
73
|
"glob": "^11.0.1",
|
|
45
74
|
"pdfjs-dist": "^5.4.296",
|
|
46
|
-
"
|
|
47
|
-
"zod
|
|
75
|
+
"pngjs": "^7.0.0",
|
|
76
|
+
"zod": "^3.25.76",
|
|
77
|
+
"zod-to-json-schema": "^3.24.6"
|
|
48
78
|
},
|
|
49
79
|
"devDependencies": {
|
|
50
80
|
"@biomejs/biome": "^2.3.2",
|
|
51
|
-
"@
|
|
52
|
-
"@commitlint/
|
|
81
|
+
"@changesets/cli": "^2.29.7",
|
|
82
|
+
"@commitlint/cli": "^20.1.0",
|
|
83
|
+
"@commitlint/config-conventional": "^20.0.0",
|
|
53
84
|
"@types/glob": "^8.1.0",
|
|
54
85
|
"@types/node": "^24.0.7",
|
|
55
|
-
"@
|
|
86
|
+
"@types/pngjs": "^6.0.5",
|
|
87
|
+
"@vitest/coverage-v8": "^4.0.8",
|
|
88
|
+
"bunup": "^0.16.10",
|
|
56
89
|
"husky": "^9.1.7",
|
|
57
90
|
"lint-staged": "^16.2.6",
|
|
58
91
|
"standard-version": "^9.5.0",
|
|
@@ -60,7 +93,7 @@
|
|
|
60
93
|
"typedoc-plugin-markdown": "^4.9.0",
|
|
61
94
|
"typescript": "^5.8.3",
|
|
62
95
|
"vitepress": "^1.6.3",
|
|
63
|
-
"vitest": "^
|
|
96
|
+
"vitest": "^4.0.7",
|
|
64
97
|
"vue": "^3.5.13"
|
|
65
98
|
},
|
|
66
99
|
"commitlint": {
|
|
@@ -73,28 +106,5 @@
|
|
|
73
106
|
"biome check --write --no-errors-on-unmatched --files-ignore-unknown=true"
|
|
74
107
|
]
|
|
75
108
|
},
|
|
76
|
-
"scripts": {
|
|
77
|
-
|
|
78
|
-
"watch": "tsc --watch",
|
|
79
|
-
"inspector": "npx @modelcontextprotocol/inspector dist/index.js",
|
|
80
|
-
"test": "vitest run",
|
|
81
|
-
"test:watch": "vitest watch",
|
|
82
|
-
"test:cov": "vitest run --coverage --reporter=junit --outputFile=test-report.junit.xml",
|
|
83
|
-
"lint": "biome lint .",
|
|
84
|
-
"lint:fix": "biome lint --write .",
|
|
85
|
-
"format": "biome format --write .",
|
|
86
|
-
"check-format": "biome format .",
|
|
87
|
-
"check": "biome check .",
|
|
88
|
-
"check:fix": "biome check --write .",
|
|
89
|
-
"validate": "npm run check && npm run test",
|
|
90
|
-
"docs:dev": "vitepress dev docs",
|
|
91
|
-
"docs:build": "vitepress build docs",
|
|
92
|
-
"docs:preview": "vitepress preview docs",
|
|
93
|
-
"start": "node dist/index.js",
|
|
94
|
-
"typecheck": "tsc --noEmit",
|
|
95
|
-
"benchmark": "vitest bench",
|
|
96
|
-
"clean": "rm -rf dist coverage",
|
|
97
|
-
"docs:api": "typedoc --entryPoints src/index.ts --tsconfig tsconfig.json --plugin typedoc-plugin-markdown --out docs/api --readme none",
|
|
98
|
-
"release": "standard-version"
|
|
99
|
-
}
|
|
100
|
-
}
|
|
109
|
+
"packageManager": "bun@1.3.1"
|
|
110
|
+
}
|
package/dist/handlers/index.js
DELETED
package/dist/handlers/readPdf.js
DELETED
|
@@ -1,168 +0,0 @@
|
|
|
1
|
-
// PDF reading handler - orchestrates PDF processing workflow
|
|
2
|
-
import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';
|
|
3
|
-
import { z } from 'zod';
|
|
4
|
-
import { buildWarnings, extractMetadataAndPageCount, extractPageContent, } from '../pdf/extractor.js';
|
|
5
|
-
import { loadPdfDocument } from '../pdf/loader.js';
|
|
6
|
-
import { determinePagesToProcess, getTargetPages } from '../pdf/parser.js';
|
|
7
|
-
import { readPdfArgsSchema } from '../schemas/readPdf.js';
|
|
8
|
-
/**
|
|
9
|
-
* Process a single PDF source
|
|
10
|
-
*/
|
|
11
|
-
const processSingleSource = async (source, options) => {
|
|
12
|
-
const sourceDescription = source.path ?? source.url ?? 'unknown source';
|
|
13
|
-
let individualResult = { source: sourceDescription, success: false };
|
|
14
|
-
try {
|
|
15
|
-
// Parse target pages
|
|
16
|
-
const targetPages = getTargetPages(source.pages, sourceDescription);
|
|
17
|
-
// Load PDF document
|
|
18
|
-
const { pages: _pages, ...loadArgs } = source;
|
|
19
|
-
const pdfDocument = await loadPdfDocument(loadArgs, sourceDescription);
|
|
20
|
-
const totalPages = pdfDocument.numPages;
|
|
21
|
-
// Extract metadata and page count
|
|
22
|
-
const metadataOutput = await extractMetadataAndPageCount(pdfDocument, options.includeMetadata, options.includePageCount);
|
|
23
|
-
const output = { ...metadataOutput };
|
|
24
|
-
// Determine pages to process
|
|
25
|
-
const { pagesToProcess, invalidPages } = determinePagesToProcess(targetPages, totalPages, options.includeFullText);
|
|
26
|
-
// Add warnings for invalid pages
|
|
27
|
-
const warnings = buildWarnings(invalidPages, totalPages);
|
|
28
|
-
if (warnings.length > 0) {
|
|
29
|
-
output.warnings = warnings;
|
|
30
|
-
}
|
|
31
|
-
// Extract content with ordering preserved
|
|
32
|
-
if (pagesToProcess.length > 0) {
|
|
33
|
-
// Use new extractPageContent to preserve Y-coordinate ordering
|
|
34
|
-
const pageContents = await Promise.all(pagesToProcess.map((pageNum) => extractPageContent(pdfDocument, pageNum, options.includeImages, sourceDescription)));
|
|
35
|
-
// Store page contents for ordered retrieval
|
|
36
|
-
output.page_contents = pageContents.map((items, idx) => ({
|
|
37
|
-
page: pagesToProcess[idx],
|
|
38
|
-
items,
|
|
39
|
-
}));
|
|
40
|
-
// For backward compatibility, also provide text-only outputs
|
|
41
|
-
const extractedPageTexts = pageContents.map((items, idx) => ({
|
|
42
|
-
page: pagesToProcess[idx],
|
|
43
|
-
text: items
|
|
44
|
-
.filter((item) => item.type === 'text')
|
|
45
|
-
.map((item) => item.textContent)
|
|
46
|
-
.join(''),
|
|
47
|
-
}));
|
|
48
|
-
if (targetPages) {
|
|
49
|
-
// Specific pages requested
|
|
50
|
-
output.page_texts = extractedPageTexts;
|
|
51
|
-
}
|
|
52
|
-
else {
|
|
53
|
-
// Full text requested
|
|
54
|
-
output.full_text = extractedPageTexts.map((p) => p.text).join('\n\n');
|
|
55
|
-
}
|
|
56
|
-
// Extract image metadata for JSON response
|
|
57
|
-
if (options.includeImages) {
|
|
58
|
-
const extractedImages = pageContents
|
|
59
|
-
.flatMap((items) => items.filter((item) => item.type === 'image' && item.imageData))
|
|
60
|
-
.map((item) => item.imageData)
|
|
61
|
-
.filter((img) => img !== undefined);
|
|
62
|
-
if (extractedImages.length > 0) {
|
|
63
|
-
output.images = extractedImages;
|
|
64
|
-
}
|
|
65
|
-
}
|
|
66
|
-
}
|
|
67
|
-
individualResult = { ...individualResult, data: output, success: true };
|
|
68
|
-
}
|
|
69
|
-
catch (error) {
|
|
70
|
-
let errorMessage = `Failed to process PDF from ${sourceDescription}.`;
|
|
71
|
-
if (error instanceof McpError) {
|
|
72
|
-
errorMessage = error.message;
|
|
73
|
-
}
|
|
74
|
-
else if (error instanceof Error) {
|
|
75
|
-
errorMessage += ` Reason: ${error.message}`;
|
|
76
|
-
}
|
|
77
|
-
else {
|
|
78
|
-
errorMessage += ` Unknown error: ${JSON.stringify(error)}`;
|
|
79
|
-
}
|
|
80
|
-
individualResult.error = errorMessage;
|
|
81
|
-
individualResult.success = false;
|
|
82
|
-
individualResult.data = undefined;
|
|
83
|
-
}
|
|
84
|
-
return individualResult;
|
|
85
|
-
};
|
|
86
|
-
/**
|
|
87
|
-
* Main handler function for read_pdf tool
|
|
88
|
-
*/
|
|
89
|
-
export const handleReadPdfFunc = async (args) => {
|
|
90
|
-
let parsedArgs;
|
|
91
|
-
try {
|
|
92
|
-
parsedArgs = readPdfArgsSchema.parse(args);
|
|
93
|
-
}
|
|
94
|
-
catch (error) {
|
|
95
|
-
if (error instanceof z.ZodError) {
|
|
96
|
-
throw new McpError(ErrorCode.InvalidParams, `Invalid arguments: ${error.errors.map((e) => `${e.path.join('.')} (${e.message})`).join(', ')}`);
|
|
97
|
-
}
|
|
98
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
99
|
-
throw new McpError(ErrorCode.InvalidParams, `Argument validation failed: ${message}`);
|
|
100
|
-
}
|
|
101
|
-
const { sources, include_full_text, include_metadata, include_page_count, include_images } = parsedArgs;
|
|
102
|
-
// Process all sources concurrently
|
|
103
|
-
const results = await Promise.all(sources.map((source) => processSingleSource(source, {
|
|
104
|
-
includeFullText: include_full_text,
|
|
105
|
-
includeMetadata: include_metadata,
|
|
106
|
-
includePageCount: include_page_count,
|
|
107
|
-
includeImages: include_images,
|
|
108
|
-
})));
|
|
109
|
-
// Build content parts - start with structured JSON for backward compatibility
|
|
110
|
-
const content = [];
|
|
111
|
-
// Strip image data and page_contents from JSON to keep it manageable
|
|
112
|
-
const resultsForJson = results.map((result) => {
|
|
113
|
-
if (result.data) {
|
|
114
|
-
const { images, page_contents, ...dataWithoutBinaryContent } = result.data;
|
|
115
|
-
// Include image count and metadata in JSON, but not the base64 data
|
|
116
|
-
if (images) {
|
|
117
|
-
const imageInfo = images.map((img) => ({
|
|
118
|
-
page: img.page,
|
|
119
|
-
index: img.index,
|
|
120
|
-
width: img.width,
|
|
121
|
-
height: img.height,
|
|
122
|
-
format: img.format,
|
|
123
|
-
}));
|
|
124
|
-
return { ...result, data: { ...dataWithoutBinaryContent, image_info: imageInfo } };
|
|
125
|
-
}
|
|
126
|
-
return { ...result, data: dataWithoutBinaryContent };
|
|
127
|
-
}
|
|
128
|
-
return result;
|
|
129
|
-
});
|
|
130
|
-
// First content part: Structured JSON results
|
|
131
|
-
content.push({
|
|
132
|
-
type: 'text',
|
|
133
|
-
text: JSON.stringify({ results: resultsForJson }, null, 2),
|
|
134
|
-
});
|
|
135
|
-
// Add page content in exact Y-coordinate order
|
|
136
|
-
for (const result of results) {
|
|
137
|
-
if (!result.success || !result.data?.page_contents)
|
|
138
|
-
continue;
|
|
139
|
-
// Process each page's content items in order
|
|
140
|
-
for (const pageContent of result.data.page_contents) {
|
|
141
|
-
for (const item of pageContent.items) {
|
|
142
|
-
if (item.type === 'text' && item.textContent) {
|
|
143
|
-
// Add text content part
|
|
144
|
-
content.push({
|
|
145
|
-
type: 'text',
|
|
146
|
-
text: item.textContent,
|
|
147
|
-
});
|
|
148
|
-
}
|
|
149
|
-
else if (item.type === 'image' && item.imageData) {
|
|
150
|
-
// Add image content part
|
|
151
|
-
content.push({
|
|
152
|
-
type: 'image',
|
|
153
|
-
data: item.imageData.data,
|
|
154
|
-
mimeType: item.imageData.format === 'rgba' ? 'image/png' : 'image/jpeg',
|
|
155
|
-
});
|
|
156
|
-
}
|
|
157
|
-
}
|
|
158
|
-
}
|
|
159
|
-
}
|
|
160
|
-
return { content };
|
|
161
|
-
};
|
|
162
|
-
// Export the tool definition
|
|
163
|
-
export const readPdfToolDefinition = {
|
|
164
|
-
name: 'read_pdf',
|
|
165
|
-
description: 'Reads content/metadata/images from one or more PDFs (local/URL). Each source can specify pages to extract.',
|
|
166
|
-
schema: readPdfArgsSchema,
|
|
167
|
-
handler: handleReadPdfFunc,
|
|
168
|
-
};
|
package/dist/pdf/extractor.js
DELETED
|
@@ -1,265 +0,0 @@
|
|
|
1
|
-
// PDF text and metadata extraction utilities
|
|
2
|
-
import { OPS } from 'pdfjs-dist/legacy/build/pdf.mjs';
|
|
3
|
-
/**
|
|
4
|
-
* Extract metadata and page count from a PDF document
|
|
5
|
-
*/
|
|
6
|
-
export const extractMetadataAndPageCount = async (pdfDocument, includeMetadata, includePageCount) => {
|
|
7
|
-
const output = {};
|
|
8
|
-
if (includePageCount) {
|
|
9
|
-
output.num_pages = pdfDocument.numPages;
|
|
10
|
-
}
|
|
11
|
-
if (includeMetadata) {
|
|
12
|
-
try {
|
|
13
|
-
const pdfMetadata = await pdfDocument.getMetadata();
|
|
14
|
-
const infoData = pdfMetadata.info;
|
|
15
|
-
if (infoData !== undefined) {
|
|
16
|
-
output.info = infoData;
|
|
17
|
-
}
|
|
18
|
-
const metadataObj = pdfMetadata.metadata;
|
|
19
|
-
// Check if it has a getAll method (as used in tests)
|
|
20
|
-
if (typeof metadataObj.getAll === 'function') {
|
|
21
|
-
output.metadata = metadataObj.getAll();
|
|
22
|
-
}
|
|
23
|
-
else {
|
|
24
|
-
// For real PDF.js metadata, convert to plain object
|
|
25
|
-
const metadataRecord = {};
|
|
26
|
-
for (const key in metadataObj) {
|
|
27
|
-
if (Object.hasOwn(metadataObj, key)) {
|
|
28
|
-
metadataRecord[key] = metadataObj[key];
|
|
29
|
-
}
|
|
30
|
-
}
|
|
31
|
-
output.metadata = metadataRecord;
|
|
32
|
-
}
|
|
33
|
-
}
|
|
34
|
-
catch (metaError) {
|
|
35
|
-
console.warn(`[PDF Reader MCP] Error extracting metadata: ${metaError instanceof Error ? metaError.message : String(metaError)}`);
|
|
36
|
-
}
|
|
37
|
-
}
|
|
38
|
-
return output;
|
|
39
|
-
};
|
|
40
|
-
/**
|
|
41
|
-
* Extract text from a single page
|
|
42
|
-
*/
|
|
43
|
-
const extractSinglePageText = async (pdfDocument, pageNum, sourceDescription) => {
|
|
44
|
-
try {
|
|
45
|
-
const page = await pdfDocument.getPage(pageNum);
|
|
46
|
-
const textContent = await page.getTextContent();
|
|
47
|
-
const pageText = textContent.items
|
|
48
|
-
.map((item) => item.str)
|
|
49
|
-
.join('');
|
|
50
|
-
return { page: pageNum, text: pageText };
|
|
51
|
-
}
|
|
52
|
-
catch (pageError) {
|
|
53
|
-
const message = pageError instanceof Error ? pageError.message : String(pageError);
|
|
54
|
-
console.warn(`[PDF Reader MCP] Error getting text content for page ${String(pageNum)} in ${sourceDescription}: ${message}`);
|
|
55
|
-
return { page: pageNum, text: `Error processing page: ${message}` };
|
|
56
|
-
}
|
|
57
|
-
};
|
|
58
|
-
/**
|
|
59
|
-
* Extract text from specified pages (parallel processing for performance)
|
|
60
|
-
*/
|
|
61
|
-
export const extractPageTexts = async (pdfDocument, pagesToProcess, sourceDescription) => {
|
|
62
|
-
// Process all pages in parallel for better performance
|
|
63
|
-
const extractedPageTexts = await Promise.all(pagesToProcess.map((pageNum) => extractSinglePageText(pdfDocument, pageNum, sourceDescription)));
|
|
64
|
-
return extractedPageTexts.sort((a, b) => a.page - b.page);
|
|
65
|
-
};
|
|
66
|
-
/**
|
|
67
|
-
* Extract images from a single page
|
|
68
|
-
*/
|
|
69
|
-
const extractImagesFromPage = async (page, pageNum) => {
|
|
70
|
-
const images = [];
|
|
71
|
-
try {
|
|
72
|
-
const operatorList = await page.getOperatorList();
|
|
73
|
-
// Find all image painting operations
|
|
74
|
-
const imageIndices = [];
|
|
75
|
-
for (let i = 0; i < operatorList.fnArray.length; i++) {
|
|
76
|
-
const op = operatorList.fnArray[i];
|
|
77
|
-
if (op === OPS.paintImageXObject || op === OPS.paintXObject) {
|
|
78
|
-
imageIndices.push(i);
|
|
79
|
-
}
|
|
80
|
-
}
|
|
81
|
-
// Extract each image using Promise-based approach
|
|
82
|
-
const imagePromises = imageIndices.map((imgIndex, arrayIndex) => new Promise((resolve) => {
|
|
83
|
-
const argsArray = operatorList.argsArray[imgIndex];
|
|
84
|
-
if (!argsArray || argsArray.length === 0) {
|
|
85
|
-
resolve(null);
|
|
86
|
-
return;
|
|
87
|
-
}
|
|
88
|
-
const imageName = argsArray[0];
|
|
89
|
-
// Use callback-based get() as images may not be resolved yet
|
|
90
|
-
page.objs.get(imageName, (imageData) => {
|
|
91
|
-
if (!imageData || typeof imageData !== 'object') {
|
|
92
|
-
resolve(null);
|
|
93
|
-
return;
|
|
94
|
-
}
|
|
95
|
-
const img = imageData;
|
|
96
|
-
if (!img.data || !img.width || !img.height) {
|
|
97
|
-
resolve(null);
|
|
98
|
-
return;
|
|
99
|
-
}
|
|
100
|
-
// Determine image format based on kind
|
|
101
|
-
// kind === 1 = grayscale, 2 = RGB, 3 = RGBA
|
|
102
|
-
const format = img.kind === 1 ? 'grayscale' : img.kind === 3 ? 'rgba' : 'rgb';
|
|
103
|
-
// Convert Uint8Array to base64
|
|
104
|
-
const base64 = Buffer.from(img.data).toString('base64');
|
|
105
|
-
resolve({
|
|
106
|
-
page: pageNum,
|
|
107
|
-
index: arrayIndex,
|
|
108
|
-
width: img.width,
|
|
109
|
-
height: img.height,
|
|
110
|
-
format,
|
|
111
|
-
data: base64,
|
|
112
|
-
});
|
|
113
|
-
});
|
|
114
|
-
}));
|
|
115
|
-
const resolvedImages = await Promise.all(imagePromises);
|
|
116
|
-
images.push(...resolvedImages.filter((img) => img !== null));
|
|
117
|
-
}
|
|
118
|
-
catch (error) {
|
|
119
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
120
|
-
console.warn(`[PDF Reader MCP] Error extracting images from page ${String(pageNum)}: ${message}`);
|
|
121
|
-
}
|
|
122
|
-
return images;
|
|
123
|
-
};
|
|
124
|
-
/**
|
|
125
|
-
* Extract images from specified pages
|
|
126
|
-
*/
|
|
127
|
-
export const extractImages = async (pdfDocument, pagesToProcess) => {
|
|
128
|
-
const allImages = [];
|
|
129
|
-
// Process pages sequentially to avoid overwhelming PDF.js
|
|
130
|
-
for (const pageNum of pagesToProcess) {
|
|
131
|
-
try {
|
|
132
|
-
const page = await pdfDocument.getPage(pageNum);
|
|
133
|
-
const pageImages = await extractImagesFromPage(page, pageNum);
|
|
134
|
-
allImages.push(...pageImages);
|
|
135
|
-
}
|
|
136
|
-
catch (error) {
|
|
137
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
138
|
-
console.warn(`[PDF Reader MCP] Error getting page ${String(pageNum)} for image extraction: ${message}`);
|
|
139
|
-
}
|
|
140
|
-
}
|
|
141
|
-
return allImages;
|
|
142
|
-
};
|
|
143
|
-
/**
|
|
144
|
-
* Build warnings array for invalid page numbers
|
|
145
|
-
*/
|
|
146
|
-
export const buildWarnings = (invalidPages, totalPages) => {
|
|
147
|
-
if (invalidPages.length === 0) {
|
|
148
|
-
return [];
|
|
149
|
-
}
|
|
150
|
-
return [
|
|
151
|
-
`Requested page numbers ${invalidPages.join(', ')} exceed total pages (${String(totalPages)}).`,
|
|
152
|
-
];
|
|
153
|
-
};
|
|
154
|
-
/**
|
|
155
|
-
* Extract all content (text and images) from a single page with Y-coordinate ordering
|
|
156
|
-
*/
|
|
157
|
-
export const extractPageContent = async (pdfDocument, pageNum, includeImages, sourceDescription) => {
|
|
158
|
-
const contentItems = [];
|
|
159
|
-
try {
|
|
160
|
-
const page = await pdfDocument.getPage(pageNum);
|
|
161
|
-
// Extract text content with Y-coordinates
|
|
162
|
-
const textContent = await page.getTextContent();
|
|
163
|
-
// Group text items by Y-coordinate (items on same line have similar Y values)
|
|
164
|
-
const textByY = new Map();
|
|
165
|
-
for (const item of textContent.items) {
|
|
166
|
-
const textItem = item;
|
|
167
|
-
// transform[5] is the Y coordinate
|
|
168
|
-
const yCoord = textItem.transform[5];
|
|
169
|
-
if (yCoord === undefined)
|
|
170
|
-
continue;
|
|
171
|
-
const y = Math.round(yCoord);
|
|
172
|
-
if (!textByY.has(y)) {
|
|
173
|
-
textByY.set(y, []);
|
|
174
|
-
}
|
|
175
|
-
textByY.get(y)?.push(textItem.str);
|
|
176
|
-
}
|
|
177
|
-
// Convert grouped text to content items
|
|
178
|
-
for (const [y, textParts] of textByY.entries()) {
|
|
179
|
-
const textContent = textParts.join('');
|
|
180
|
-
if (textContent.trim()) {
|
|
181
|
-
contentItems.push({
|
|
182
|
-
type: 'text',
|
|
183
|
-
yPosition: y,
|
|
184
|
-
textContent,
|
|
185
|
-
});
|
|
186
|
-
}
|
|
187
|
-
}
|
|
188
|
-
// Extract images with Y-coordinates if requested
|
|
189
|
-
if (includeImages) {
|
|
190
|
-
const operatorList = await page.getOperatorList();
|
|
191
|
-
// Find all image painting operations
|
|
192
|
-
const imageIndices = [];
|
|
193
|
-
for (let i = 0; i < operatorList.fnArray.length; i++) {
|
|
194
|
-
const op = operatorList.fnArray[i];
|
|
195
|
-
if (op === OPS.paintImageXObject || op === OPS.paintXObject) {
|
|
196
|
-
imageIndices.push(i);
|
|
197
|
-
}
|
|
198
|
-
}
|
|
199
|
-
// Extract each image with its Y-coordinate
|
|
200
|
-
const imagePromises = imageIndices.map((imgIndex, arrayIndex) => new Promise((resolve) => {
|
|
201
|
-
const argsArray = operatorList.argsArray[imgIndex];
|
|
202
|
-
if (!argsArray || argsArray.length === 0) {
|
|
203
|
-
resolve(null);
|
|
204
|
-
return;
|
|
205
|
-
}
|
|
206
|
-
const imageName = argsArray[0];
|
|
207
|
-
// Get transform matrix from the args (if available)
|
|
208
|
-
// The transform is typically in argsArray[1] for some ops
|
|
209
|
-
let yPosition = 0;
|
|
210
|
-
if (argsArray.length > 1 && Array.isArray(argsArray[1])) {
|
|
211
|
-
const transform = argsArray[1];
|
|
212
|
-
// transform[5] is the Y coordinate
|
|
213
|
-
const yCoord = transform[5];
|
|
214
|
-
if (yCoord !== undefined) {
|
|
215
|
-
yPosition = Math.round(yCoord);
|
|
216
|
-
}
|
|
217
|
-
}
|
|
218
|
-
// Use callback-based get() as images may not be resolved yet
|
|
219
|
-
page.objs.get(imageName, (imageData) => {
|
|
220
|
-
if (!imageData || typeof imageData !== 'object') {
|
|
221
|
-
resolve(null);
|
|
222
|
-
return;
|
|
223
|
-
}
|
|
224
|
-
const img = imageData;
|
|
225
|
-
if (!img.data || !img.width || !img.height) {
|
|
226
|
-
resolve(null);
|
|
227
|
-
return;
|
|
228
|
-
}
|
|
229
|
-
// Determine image format based on kind
|
|
230
|
-
const format = img.kind === 1 ? 'grayscale' : img.kind === 3 ? 'rgba' : 'rgb';
|
|
231
|
-
// Convert Uint8Array to base64
|
|
232
|
-
const base64 = Buffer.from(img.data).toString('base64');
|
|
233
|
-
resolve({
|
|
234
|
-
type: 'image',
|
|
235
|
-
yPosition,
|
|
236
|
-
imageData: {
|
|
237
|
-
page: pageNum,
|
|
238
|
-
index: arrayIndex,
|
|
239
|
-
width: img.width,
|
|
240
|
-
height: img.height,
|
|
241
|
-
format,
|
|
242
|
-
data: base64,
|
|
243
|
-
},
|
|
244
|
-
});
|
|
245
|
-
});
|
|
246
|
-
}));
|
|
247
|
-
const resolvedImages = await Promise.all(imagePromises);
|
|
248
|
-
contentItems.push(...resolvedImages.filter((item) => item !== null));
|
|
249
|
-
}
|
|
250
|
-
}
|
|
251
|
-
catch (error) {
|
|
252
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
253
|
-
console.warn(`[PDF Reader MCP] Error extracting page content for page ${String(pageNum)} in ${sourceDescription}: ${message}`);
|
|
254
|
-
// Return error message as text content
|
|
255
|
-
return [
|
|
256
|
-
{
|
|
257
|
-
type: 'text',
|
|
258
|
-
yPosition: 0,
|
|
259
|
-
textContent: `Error processing page: ${message}`,
|
|
260
|
-
},
|
|
261
|
-
];
|
|
262
|
-
}
|
|
263
|
-
// Sort by Y-position (descending = top to bottom in PDF coordinates)
|
|
264
|
-
return contentItems.sort((a, b) => b.yPosition - a.yPosition);
|
|
265
|
-
};
|
package/dist/pdf/loader.js
DELETED
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
// PDF document loading utilities
|
|
2
|
-
import fs from 'node:fs/promises';
|
|
3
|
-
import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';
|
|
4
|
-
import { getDocument } from 'pdfjs-dist/legacy/build/pdf.mjs';
|
|
5
|
-
import { resolvePath } from '../utils/pathUtils.js';
|
|
6
|
-
/**
|
|
7
|
-
* Load a PDF document from a local file path or URL
|
|
8
|
-
* @param source - Object containing either path or url
|
|
9
|
-
* @param sourceDescription - Description for error messages
|
|
10
|
-
* @returns PDF document proxy
|
|
11
|
-
*/
|
|
12
|
-
export const loadPdfDocument = async (source, sourceDescription) => {
|
|
13
|
-
let pdfDataSource;
|
|
14
|
-
try {
|
|
15
|
-
if (source.path) {
|
|
16
|
-
const safePath = resolvePath(source.path);
|
|
17
|
-
const buffer = await fs.readFile(safePath);
|
|
18
|
-
pdfDataSource = new Uint8Array(buffer);
|
|
19
|
-
}
|
|
20
|
-
else if (source.url) {
|
|
21
|
-
pdfDataSource = { url: source.url };
|
|
22
|
-
}
|
|
23
|
-
else {
|
|
24
|
-
throw new McpError(ErrorCode.InvalidParams, `Source ${sourceDescription} missing 'path' or 'url'.`);
|
|
25
|
-
}
|
|
26
|
-
}
|
|
27
|
-
catch (err) {
|
|
28
|
-
if (err instanceof McpError) {
|
|
29
|
-
throw err;
|
|
30
|
-
}
|
|
31
|
-
const message = err instanceof Error ? err.message : String(err);
|
|
32
|
-
const errorCode = ErrorCode.InvalidRequest;
|
|
33
|
-
if (typeof err === 'object' &&
|
|
34
|
-
err !== null &&
|
|
35
|
-
'code' in err &&
|
|
36
|
-
err.code === 'ENOENT' &&
|
|
37
|
-
source.path) {
|
|
38
|
-
throw new McpError(errorCode, `File not found at '${source.path}'.`, {
|
|
39
|
-
cause: err instanceof Error ? err : undefined,
|
|
40
|
-
});
|
|
41
|
-
}
|
|
42
|
-
throw new McpError(errorCode, `Failed to prepare PDF source ${sourceDescription}. Reason: ${message}`, { cause: err instanceof Error ? err : undefined });
|
|
43
|
-
}
|
|
44
|
-
const loadingTask = getDocument(pdfDataSource);
|
|
45
|
-
try {
|
|
46
|
-
return await loadingTask.promise;
|
|
47
|
-
}
|
|
48
|
-
catch (err) {
|
|
49
|
-
console.error(`[PDF Reader MCP] PDF.js loading error for ${sourceDescription}:`, err);
|
|
50
|
-
const message = err instanceof Error ? err.message : String(err);
|
|
51
|
-
throw new McpError(ErrorCode.InvalidRequest, `Failed to load PDF document from ${sourceDescription}. Reason: ${message || 'Unknown loading error'}`, { cause: err instanceof Error ? err : undefined });
|
|
52
|
-
}
|
|
53
|
-
};
|