@sylphx/pdf-reader-mcp 1.3.0 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +110 -78
- package/dist/index.js +524 -45
- package/package.json +17 -11
- package/dist/handlers/index.js +0 -4
- package/dist/handlers/readPdf.js +0 -170
- package/dist/pdf/extractor.js +0 -394
- package/dist/pdf/loader.js +0 -53
- package/dist/pdf/parser.js +0 -96
- package/dist/schemas/readPdf.js +0 -59
- package/dist/types/pdf.js +0 -2
- package/dist/utils/pathUtils.js +0 -25
package/dist/pdf/parser.js
DELETED
|
@@ -1,96 +0,0 @@
|
|
|
1
|
-
// Page range parsing utilities
|
|
2
|
-
import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';
|
|
3
|
-
// Cap on how far past its start a single range may extend. A range covers
// `start` through `start + MAX_RANGE_SIZE` inclusive at most.
const MAX_RANGE_SIZE = 10000;
/** Matches a token made up solely of decimal digits. */
const PAGE_TOKEN_PATTERN = /^\d+$/;
/**
 * Strictly convert a page token to a number.
 * Surrounding whitespace is ignored; anything other than plain decimal digits
 * (e.g. "3abc", "1 2", "") yields NaN instead of being partially parsed.
 * @param token - Raw token text
 * @returns The parsed number, or NaN when the token is not purely digits
 */
const toPageNumber = (token) => {
    const cleaned = token.trim();
    return PAGE_TOKEN_PATTERN.test(cleaned) ? Number.parseInt(cleaned, 10) : Number.NaN;
};
/**
 * Parse a single range part (e.g., "1-3", "5", "7-") and add its pages to `pages`.
 * @param part - One comma-separated piece of a page specification
 * @param pages - Set that receives every page number described by `part`
 * @throws {Error} If the part is not a positive page number or a valid
 *   `start-end` / `start-` range (more than one hyphen is rejected)
 */
const parseRangePart = (part, pages) => {
    const trimmedPart = part.trim();
    if (!trimmedPart.includes('-')) {
        const page = toPageNumber(trimmedPart);
        if (Number.isNaN(page) || page <= 0) {
            throw new Error(`Invalid page number: ${trimmedPart}`);
        }
        pages.add(page);
        return;
    }
    const segments = trimmedPart.split('-');
    // Exactly one hyphen is allowed; "1-3-5" or "1--3" must not be silently
    // reinterpreted as a shorter or open-ended range.
    if (segments.length !== 2) {
        throw new Error(`Invalid page range values: ${trimmedPart}`);
    }
    const [startToken, endToken] = segments;
    const start = toPageNumber(startToken);
    const isOpenEnded = endToken.trim() === '';
    const end = isOpenEnded ? Infinity : toPageNumber(endToken);
    // `start` must be a safe integer: beyond 2^53 `i++` no longer changes the
    // value and the loop below would never terminate.
    if (!Number.isSafeInteger(start) || Number.isNaN(end) || start <= 0 || start > end) {
        throw new Error(`Invalid page range values: ${trimmedPart}`);
    }
    const practicalEnd = Math.min(end, start + MAX_RANGE_SIZE, Number.MAX_SAFE_INTEGER);
    for (let i = start; i <= practicalEnd; i++) {
        pages.add(i);
    }
    if (end > practicalEnd) {
        // Report every truncation, not only the open-ended case.
        console.warn(isOpenEnded
            ? `[PDF Reader MCP] Open-ended range starting at ${String(start)} was truncated at page ${String(practicalEnd)}.`
            : `[PDF Reader MCP] Range ${trimmedPart} was truncated at page ${String(practicalEnd)}.`);
    }
};
|
|
34
|
-
/**
 * Expand a comma-separated page specification into concrete page numbers.
 * Each comma-separated piece is handed to `parseRangePart`, which adds the
 * pages it describes to a shared set (so duplicates collapse).
 * @param ranges - Range string (e.g., "1-3,5,7-10")
 * @returns Unique page numbers in ascending order
 * @throws {Error} If any piece is rejected by `parseRangePart`, or if no page
 *   was collected at all
 */
export const parsePageRanges = (ranges) => {
    const collected = new Set();
    ranges.split(',').forEach((segment) => {
        parseRangePart(segment, collected);
    });
    // Defensive only: parseRangePart throws on any piece that yields no page,
    // so an empty set is not expected here.
    /* c8 ignore next */
    if (collected.size === 0) {
        throw new Error('Page range string resulted in zero valid pages.');
    }
    return [...collected].sort((left, right) => left - right);
};
|
|
53
|
-
/**
 * Normalise a page specification into a sorted list of unique page numbers.
 * A falsy specification means "no page filter" and yields `undefined`.
 * A string is parsed with `parsePageRanges`; any other value is treated as an
 * array whose entries must all be positive integers.
 * @param sourcePages - Page specification (range string or array of numbers)
 * @param sourceDescription - Label of the source, used in the error message
 * @returns Ascending array of unique page numbers, or undefined
 * @throws {McpError} InvalidParams when the specification cannot be turned
 *   into a non-empty list of pages
 */
export const getTargetPages = (sourcePages, sourceDescription) => {
    if (!sourcePages) {
        return undefined;
    }
    try {
        if (typeof sourcePages === 'string') {
            return parsePageRanges(sourcePages);
        }
        // Array of page numbers
        const allPositiveIntegers = sourcePages.every((entry) => Number.isInteger(entry) && entry > 0);
        if (!allPositiveIntegers) {
            throw new Error('Page numbers in array must be positive integers.');
        }
        const deduplicated = Array.from(new Set(sourcePages));
        deduplicated.sort((left, right) => left - right);
        if (deduplicated.length === 0) {
            throw new Error('Page specification resulted in an empty set of pages.');
        }
        return deduplicated;
    }
    catch (error) {
        // Re-raise every failure as a single InvalidParams error naming the source.
        const reason = error instanceof Error ? error.message : String(error);
        throw new McpError(ErrorCode.InvalidParams, `Invalid page specification for source ${sourceDescription}: ${reason}`);
    }
};
|
|
82
|
-
/**
 * Determine which pages to process based on target pages and document size.
 * @param targetPages - Requested page numbers, or a falsy value for "none requested"
 * @param totalPages - Number of pages in the document
 * @param includeFullText - When no pages were requested, whether to process every page
 * @returns `pagesToProcess` (requested pages that exist in the document, in
 *   request order) and `invalidPages` (requested pages that do not)
 */
export const determinePagesToProcess = (targetPages, totalPages, includeFullText) => {
    if (targetPages) {
        const pagesToProcess = [];
        const invalidPages = [];
        for (const page of targetPages) {
            // A page is usable only if it is a whole number within 1..totalPages;
            // zero, negative, fractional and NaN entries are reported as invalid
            // rather than being passed on or silently dropped.
            const isInDocument = Number.isInteger(page) && page >= 1 && page <= totalPages;
            (isInDocument ? pagesToProcess : invalidPages).push(page);
        }
        return { pagesToProcess, invalidPages };
    }
    if (includeFullText) {
        const pagesToProcess = Array.from({ length: totalPages }, (_, i) => i + 1);
        return { pagesToProcess, invalidPages: [] };
    }
    return { pagesToProcess: [], invalidPages: [] };
};
|
package/dist/schemas/readPdf.js
DELETED
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
// Zod validation schemas for PDF reading
|
|
2
|
-
import { z } from 'zod';
|
|
3
|
-
// Page specification: either an explicit list of page numbers or a range string
/**
 * Check that a page string, with all whitespace removed, consists only of
 * digits, commas, and hyphens.
 */
const hasOnlyPageStringChars = (val) => {
    const withoutWhitespace = val.replace(/\s/g, '');
    return /^[0-9,-]+$/.test(withoutWhitespace);
};
// Non-empty list of integers >= 1
const pageNumberListSchema = z
    .array(z.number().int().min(1))
    .min(1)
    .describe('Array of page numbers (1-based)');
// Non-empty string restricted to the characters a range expression may use
const pageRangeStringSchema = z
    .string()
    .min(1)
    .refine(hasOnlyPageStringChars, {
    message: 'Page string must contain only numbers, commas, and hyphens.',
})
    .describe('Page range string (e.g., "1-5,10,15-20")');
export const pageSpecifierSchema = z.union([pageNumberListSchema, pageRangeStringSchema]);
|
|
14
|
-
// One PDF source: identified by a local path or a URL, with an optional page filter
/** True when exactly one of `path` / `url` is set (truthy) on the source. */
const hasExactlyOneLocation = (data) => Boolean(data.path) !== Boolean(data.url);
const pdfSourceShape = {
    path: z
        .string()
        .min(1)
        .optional()
        .describe('Path to the local PDF file (absolute or relative to cwd).'),
    url: z.string().url().optional().describe('URL of the PDF file.'),
    pages: pageSpecifierSchema
        .optional()
        .describe("Extract text only from specific pages (1-based) or ranges for this source. If provided, 'include_full_text' is ignored for this source."),
};
export const pdfSourceSchema = z
    .object(pdfSourceShape)
    .strict()
    .refine(hasExactlyOneLocation, {
    message: "Each source must have either 'path' or 'url', but not both.",
});
|
|
31
|
-
// Arguments accepted by the read_pdf tool
/**
 * Build an optional boolean field with the given default and description.
 */
const optionalFlag = (defaultValue, description) => z.boolean().optional().default(defaultValue).describe(description);
export const readPdfArgsSchema = z
    .object({
    sources: z
        .array(pdfSourceSchema)
        .min(1)
        .describe('An array of PDF sources to process, each can optionally specify pages.'),
    include_full_text: optionalFlag(false, "Include the full text content of each PDF (only if 'pages' is not specified for that source)."),
    include_metadata: optionalFlag(true, 'Include metadata and info objects for each PDF.'),
    include_page_count: optionalFlag(true, 'Include the total number of pages for each PDF.'),
    include_images: optionalFlag(false, 'Extract and include embedded images from the PDF pages as base64-encoded data.'),
})
    .strict();
|
package/dist/types/pdf.js
DELETED
package/dist/utils/pathUtils.js
DELETED
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
// Removed unused import: import { fileURLToPath } from 'url';
|
|
2
|
-
import path from 'node:path';
|
|
3
|
-
import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';
|
|
4
|
-
// Use the server's current working directory as the project root.
// This relies on the process launching the server to set the CWD correctly.
export const PROJECT_ROOT = process.cwd();
// Log via console.error so the message goes to stderr. In Node.js,
// console.info is an alias of console.log and writes to stdout.
// NOTE(review): assumes the server talks MCP over stdio, where stdout must
// carry protocol messages only — confirm against the server entry point.
console.error(`[PDF Reader MCP - pathUtils] Project Root determined from CWD: ${PROJECT_ROOT}`);
|
|
8
|
-
/**
 * Turn a user-supplied path into a normalized absolute path.
 * Absolute input is returned after normalization; relative input is resolved
 * against PROJECT_ROOT (the working directory captured at module load).
 * @param userPath The path provided by the user (absolute or relative).
 * @returns The resolved absolute path.
 * @throws {McpError} InvalidParams when `userPath` is not a string.
 */
export const resolvePath = (userPath) => {
    if (typeof userPath !== 'string') {
        throw new McpError(ErrorCode.InvalidParams, 'Path must be a string.');
    }
    const cleanedPath = path.normalize(userPath);
    // Absolute paths are used as-is; only relative ones are anchored at the project root.
    return path.isAbsolute(cleanedPath)
        ? cleanedPath
        : path.resolve(PROJECT_ROOT, cleanedPath);
};
|