@sylphx/pdf-reader-mcp 2.1.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +58 -2
- package/dist/index.js +366 -16
- package/package.json +14 -19
package/README.md
CHANGED
|
@@ -593,6 +593,63 @@ Restart MCP client completely.
|
|
|
593
593
|
|
|
594
594
|
---
|
|
595
595
|
|
|
596
|
+
## 🌐 HTTP Transport (Remote Access)
|
|
597
|
+
|
|
598
|
+
By default, PDF Reader MCP uses stdio transport for local use. You can also run it as an HTTP server for remote access from multiple machines.
|
|
599
|
+
|
|
600
|
+
### Quick Start
|
|
601
|
+
|
|
602
|
+
```bash
|
|
603
|
+
# Run as HTTP server on port 8080
|
|
604
|
+
MCP_TRANSPORT=http npx @sylphx/pdf-reader-mcp
|
|
605
|
+
```
|
|
606
|
+
|
|
607
|
+
### Environment Variables
|
|
608
|
+
|
|
609
|
+
| Variable | Default | Description |
|
|
610
|
+
|----------|---------|-------------|
|
|
611
|
+
| `MCP_TRANSPORT` | `stdio` | Transport type: `stdio` or `http` |
|
|
612
|
+
| `MCP_HTTP_PORT` | `8080` | HTTP server port |
|
|
613
|
+
| `MCP_HTTP_HOST` | `0.0.0.0` | HTTP server hostname |
|
|
614
|
+
| `MCP_API_KEY` | - | Optional API key for authentication |
|
|
615
|
+
|
|
616
|
+
### Docker Deployment
|
|
617
|
+
|
|
618
|
+
```dockerfile
|
|
619
|
+
FROM oven/bun:1
|
|
620
|
+
WORKDIR /app
|
|
621
|
+
RUN bun add @sylphx/pdf-reader-mcp
|
|
622
|
+
ENV MCP_TRANSPORT=http
|
|
623
|
+
ENV MCP_HTTP_PORT=8080
|
|
624
|
+
EXPOSE 8080
|
|
625
|
+
CMD ["bun", "node_modules/@sylphx/pdf-reader-mcp/dist/index.js"]
|
|
626
|
+
```
|
|
627
|
+
|
|
628
|
+
### MCP Client Configuration (HTTP)
|
|
629
|
+
|
|
630
|
+
```json
|
|
631
|
+
{
|
|
632
|
+
"servers": {
|
|
633
|
+
"pdf-reader": {
|
|
634
|
+
"type": "http",
|
|
635
|
+
"url": "https://your-server.com/mcp",
|
|
636
|
+
"headers": {
|
|
637
|
+
"X-API-Key": "your-api-key"
|
|
638
|
+
}
|
|
639
|
+
}
|
|
640
|
+
}
|
|
641
|
+
}
|
|
642
|
+
```
|
|
643
|
+
|
|
644
|
+
### Endpoints
|
|
645
|
+
|
|
646
|
+
| Endpoint | Method | Description |
|
|
647
|
+
|----------|--------|-------------|
|
|
648
|
+
| `/mcp` | POST | JSON-RPC endpoint |
|
|
649
|
+
| `/mcp/health` | GET | Health check |
|
|
650
|
+
|
|
651
|
+
---
|
|
652
|
+
|
|
596
653
|
## 🏗️ Architecture
|
|
597
654
|
|
|
598
655
|
### Tech Stack
|
|
@@ -772,12 +829,11 @@ Special thanks to the open source community ❤️
|
|
|
772
829
|
This project uses the following [@sylphx](https://github.com/SylphxAI) packages:
|
|
773
830
|
|
|
774
831
|
- [@sylphx/mcp-server-sdk](https://github.com/SylphxAI/mcp-server-sdk) - MCP server framework
|
|
832
|
+
- [@sylphx/vex](https://github.com/SylphxAI/vex) - Schema validation
|
|
775
833
|
- [@sylphx/biome-config](https://github.com/SylphxAI/biome-config) - Biome configuration
|
|
776
834
|
- [@sylphx/tsconfig](https://github.com/SylphxAI/tsconfig) - TypeScript configuration
|
|
777
835
|
- [@sylphx/bump](https://github.com/SylphxAI/bump) - Version management
|
|
778
836
|
- [@sylphx/doctor](https://github.com/SylphxAI/doctor) - Project health checker
|
|
779
|
-
- [@sylphx/leaf](https://github.com/SylphxAI/leaf) - Documentation framework
|
|
780
|
-
- [@sylphx/leaf-theme-default](https://github.com/SylphxAI/leaf-theme-default) - Documentation theme
|
|
781
837
|
|
|
782
838
|
---
|
|
783
839
|
|
package/dist/index.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/index.ts
|
|
4
|
-
import { createServer, stdio } from "@sylphx/mcp-server-sdk";
|
|
4
|
+
import { createServer, http, stdio } from "@sylphx/mcp-server-sdk";
|
|
5
5
|
|
|
6
6
|
// src/handlers/readPdf.ts
|
|
7
7
|
import { image, text, tool, toolError } from "@sylphx/mcp-server-sdk";
|
|
@@ -462,6 +462,280 @@ var determinePagesToProcess = (targetPages, totalPages, includeFullText) => {
|
|
|
462
462
|
return { pagesToProcess: [], invalidPages: [] };
|
|
463
463
|
};
|
|
464
464
|
|
|
465
|
+
// src/pdf/tableExtractor.ts
|
|
466
|
+
// Logger used for the warnings emitted by the table-extraction helpers below.
var logger5 = createLogger("TableExtractor");

// Minimum shape a detected region must have to be treated as a table.
var MIN_ROWS = 2;
var MIN_COLS = 2;
// Rows with fewer text items than this are not considered table-row candidates.
var MIN_ROW_ITEMS = 2;

// Distance thresholds, expressed in the same units as the text item x/y coordinates.
// Items whose y differs from a row's anchor y by at most this much share that row.
var Y_TOLERANCE = 5;
// A horizontal gap of at least this much between sorted x positions starts a new column.
var COLUMN_GAP_THRESHOLD = 15;
|
|
472
|
+
/**
 * Collects the non-blank text items of a page together with their positions.
 *
 * Only `page.getTextContent()` is used. Items are skipped when they carry no
 * string payload, are whitespace-only, have a transform with fewer than six
 * entries, or have a non-finite x/y offset.
 *
 * @param {{ getTextContent: () => Promise<{ items: Iterable<object> }> }} page
 *   Page-like object exposing `getTextContent()`.
 * @returns {Promise<Array<{ text: string, x: number, y: number, width: number }>>}
 *   One entry per usable item, in the order `getTextContent()` reported them.
 */
var extractTextItemsWithPositions = async (page) => {
  const { items: rawItems } = await page.getTextContent();
  const positioned = [];
  for (const rawItem of rawItems) {
    if (!rawItem) {
      continue;
    }
    const { str, transform, width } = rawItem;
    // Entries without a string `str` (for example non-text marker entries) previously
    // threw on `.trim()`, which made the caller drop every table on the page.
    if (typeof str !== "string" || str.trim() === "") {
      continue;
    }
    if (!transform || transform.length < 6) {
      continue;
    }
    // The 5th and 6th transform entries are used as the item's x and y offsets.
    const x = transform[4];
    const y = transform[5];
    // Non-finite coordinates would poison the numeric sorts done by later stages.
    if (!Number.isFinite(x) || !Number.isFinite(y)) {
      continue;
    }
    positioned.push({
      text: str,
      x,
      y,
      // Fallback estimate of 6 units per character when the item reports no width.
      width: width ?? str.length * 6
    });
  }
  return positioned;
};
|
|
494
|
+
/**
 * Groups positioned text items into rows by vertical position.
 *
 * Items are visited from the largest y to the smallest. An item joins the
 * currently open row when its y is within `tolerance` of that row's anchor y
 * (the y of the row's first item); otherwise it opens a new row. Each row's
 * items are finally ordered by ascending x. The input array is not modified.
 *
 * @param {Array<{ x: number, y: number }>} items Positioned text items.
 * @param {number} [tolerance=Y_TOLERANCE] Maximum distance from a row's anchor y.
 * @returns {Array<{ y: number, items: Array<object> }>} Rows ordered by descending y.
 */
var clusterByY = (items, tolerance = Y_TOLERANCE) => {
  const [anchor, ...remaining] = [...items].sort((a, b) => b.y - a.y);
  if (!anchor) {
    return [];
  }
  let openRow = { y: anchor.y, items: [anchor] };
  const rows = [openRow];
  for (const candidate of remaining) {
    if (!candidate) {
      continue;
    }
    if (Math.abs(openRow.y - candidate.y) <= tolerance) {
      openRow.items.push(candidate);
    } else {
      openRow = { y: candidate.y, items: [candidate] };
      rows.push(openRow);
    }
  }
  for (const { items: members } of rows) {
    members.sort((left, right) => left.x - right.x);
  }
  return rows;
};
|
|
521
|
+
/**
 * Derives column start positions from the x coordinates of all items in `rows`.
 *
 * All x values are pooled and sorted ascending. The smallest x is always a
 * boundary; every later x that lies at least `gapThreshold` beyond the x just
 * before it in the sorted list starts another column.
 *
 * @param {Array<{ items: Array<{ x: number }> }>} rows Rows of positioned items.
 * @param {number} [gapThreshold=COLUMN_GAP_THRESHOLD] Minimum gap that separates columns.
 * @returns {number[]} Ascending column boundaries; empty when there are no items.
 */
var detectColumnBoundaries = (rows, gapThreshold = COLUMN_GAP_THRESHOLD) => {
  const sortedXs = rows
    .flatMap((row) => row.items.map((item) => item.x))
    .sort((a, b) => a - b);
  if (sortedXs.length === 0 || sortedXs[0] === undefined) {
    return [];
  }
  return sortedXs.filter((x, position) => {
    if (position === 0) {
      return true;
    }
    const before = sortedXs[position - 1];
    if (x === undefined || before === undefined) {
      return false;
    }
    return x - before >= gapThreshold;
  });
};
|
|
549
|
+
/**
 * Distributes one row's items over the detected columns.
 *
 * Each item goes to the right-most column whose boundary, shifted left by
 * `tolerance`, is not greater than the item's x. Items that match no boundary
 * fall into column 0. Texts landing in the same column are joined with a space
 * in item order.
 *
 * @param {{ items: Array<{ text: string, x: number }> }} row Row to split into cells.
 * @param {number[]} columnBoundaries Column start positions.
 * @param {number} [tolerance=COLUMN_GAP_THRESHOLD / 2] Leftward slack applied to each boundary.
 * @returns {string[]} One cell string per column boundary ("" when a column is empty).
 */
var assignToColumns = (row, columnBoundaries, tolerance = COLUMN_GAP_THRESHOLD / 2) => {
  const cells = new Array(columnBoundaries.length).fill("");
  const matches = (x, column) => {
    const start = columnBoundaries[column];
    return start !== undefined && x >= start - tolerance;
  };
  for (const { x, text: label } of row.items) {
    let column = columnBoundaries.length - 1;
    while (column >= 0 && !matches(x, column)) {
      column -= 1;
    }
    if (column < 0) {
      column = 0;
    }
    const existing = cells[column];
    cells[column] = existing ? `${existing} ${label}` : label;
  }
  return cells;
};
|
|
565
|
+
/**
 * Scores how table-like a region is, as a value between 0 and 1.
 *
 * The score is the mean of:
 *  - one sample per row: the fraction of columns that receive at least one item
 *    (same column matching rule as `assignToColumns` with its default tolerance);
 *  - one sample for row-spacing regularity: `1 - stdDev / mean` of the distances
 *    between consecutive rows, floored at 0 (0 when the mean spacing is not positive).
 *
 * Returns 0 when there are fewer than MIN_ROWS rows or MIN_COLS columns.
 *
 * @param {Array<{ y: number, items: Array<{ x: number }> }>} rows Rows of the region.
 * @param {number[]} columnBoundaries Column start positions.
 * @returns {number} Confidence in the range [0, 1].
 */
var calculateConfidence = (rows, columnBoundaries) => {
  const columnCount = columnBoundaries.length;
  if (rows.length < MIN_ROWS || columnCount < MIN_COLS) {
    return 0;
  }

  // Right-most column whose slack-adjusted boundary is at or left of x, or -1 if none.
  const columnOf = (x) => {
    for (let column = columnCount - 1; column >= 0; column--) {
      const start = columnBoundaries[column];
      if (start !== undefined && x >= start - COLUMN_GAP_THRESHOLD / 2) {
        return column;
      }
    }
    return -1;
  };

  let total = 0;
  let samples = 0;

  for (const row of rows) {
    const occupied = new Set();
    for (const item of row.items) {
      const column = columnOf(item.x);
      if (column !== -1) {
        occupied.add(column);
      }
    }
    total += occupied.size / columnCount;
    samples += 1;
  }

  if (rows.length >= 2) {
    const gaps = [];
    rows.forEach((row, position) => {
      const above = rows[position - 1];
      if (position > 0 && above && row) {
        gaps.push(Math.abs(above.y - row.y));
      }
    });
    if (gaps.length > 0) {
      let gapSum = 0;
      for (const gap of gaps) {
        gapSum += gap;
      }
      const meanGap = gapSum / gaps.length;
      let squaredSum = 0;
      for (const gap of gaps) {
        squaredSum += (gap - meanGap) ** 2;
      }
      const deviation = Math.sqrt(squaredSum / gaps.length);
      total += meanGap > 0 ? Math.max(0, 1 - deviation / meanGap) : 0;
      samples += 1;
    }
  }

  return samples > 0 ? Math.min(1, total / samples) : 0;
};
|
|
605
|
+
/**
 * Finds runs of consecutive candidate rows that look like table rows.
 *
 * Rows with fewer than MIN_ROW_ITEMS items are discarded up front. Column
 * boundaries are detected once from the remaining rows and shared by every
 * region. A row extends the current run when at least `MIN_COLS - 1` of its
 * items lie closer than COLUMN_GAP_THRESHOLD to some boundary; any other row
 * ends the run. Runs of at least MIN_ROWS rows become regions.
 *
 * @param {Array<{ y: number, items: Array<{ x: number }> }>} rows Rows, typically from `clusterByY`.
 * @returns {Array<{ rows: Array<object>, columnBoundaries: number[], startY: number, endY: number }>}
 *   Detected regions in the order their rows appear.
 */
var identifyTableRegions = (rows) => {
  const regions = [];
  const candidates = rows.filter((row) => row.items.length >= MIN_ROW_ITEMS);
  if (candidates.length < MIN_ROWS) {
    return regions;
  }
  const columnBoundaries = detectColumnBoundaries(candidates);
  if (columnBoundaries.length < MIN_COLS) {
    return regions;
  }

  const nearSomeBoundary = (item) =>
    columnBoundaries.some((boundary) => Math.abs(item.x - boundary) < COLUMN_GAP_THRESHOLD);

  let run = [];
  // Emits the current run as a region when it is long enough, then starts a fresh run.
  const closeRun = () => {
    if (run.length >= MIN_ROWS) {
      const head = run[0];
      const tail = run[run.length - 1];
      if (head && tail) {
        regions.push({
          rows: run,
          columnBoundaries,
          startY: head.y,
          endY: tail.y
        });
      }
    }
    run = [];
  };

  for (const row of candidates) {
    const alignedCount = row.items.filter(nearSomeBoundary).length;
    if (alignedCount >= MIN_COLS - 1) {
      run.push(row);
    } else {
      closeRun();
    }
  }
  closeRun();
  return regions;
};
|
|
652
|
+
/**
 * Detects tables on a single page.
 *
 * Pipeline: positioned text items -> rows (`clusterByY`) -> table regions
 * (`identifyTableRegions`) -> cell grid per region (`assignToColumns`).
 * Regions whose confidence is below 0.3 are dropped. `tableIndex` is the
 * region's position among all detected regions, so dropped regions leave gaps
 * in the numbering. Any error is logged as a warning and the tables gathered
 * so far are returned.
 *
 * @param {object} page Page object handed to `extractTextItemsWithPositions`.
 * @param {number} pageNum Page number recorded on each table and in log output.
 * @returns {Promise<Array<{ page: number, tableIndex: number, rows: string[][], rowCount: number, colCount: number, confidence: number }>>}
 */
var extractTablesFromPage = async (page, pageNum) => {
  const tables = [];
  try {
    const positionedItems = await extractTextItemsWithPositions(page);
    if (positionedItems.length === 0) {
      return tables;
    }
    const regions = identifyTableRegions(clusterByY(positionedItems));
    for (const [tableIndex, region] of regions.entries()) {
      if (!region) {
        continue;
      }
      const { rows: regionRows, columnBoundaries } = region;
      const grid = regionRows.map((row) => assignToColumns(row, columnBoundaries));
      const confidence = calculateConfidence(regionRows, columnBoundaries);
      if (!(confidence >= 0.3)) {
        continue;
      }
      tables.push({
        page: pageNum,
        tableIndex,
        rows: grid,
        rowCount: grid.length,
        colCount: columnBoundaries.length,
        // Reported with two decimal places.
        confidence: Math.round(confidence * 100) / 100
      });
    }
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    logger5.warn("Error extracting tables from page", { pageNum, error: message });
  }
  return tables;
};
|
|
688
|
+
/**
 * Extracts tables from the requested pages, one page at a time.
 *
 * A page that cannot be loaded or processed is logged as a warning and
 * skipped; the remaining pages are still handled.
 *
 * @param {{ getPage: (pageNum: number) => Promise<object> }} pdfDocument Document exposing `getPage`.
 * @param {Iterable<number>} pagesToProcess Page numbers to inspect, in output order.
 * @returns {Promise<Array<object>>} Tables of all pages, concatenated in page order.
 */
var extractTables = async (pdfDocument, pagesToProcess) => {
  const collected = [];
  const tablesOfPage = async (pageNum) =>
    extractTablesFromPage(await pdfDocument.getPage(pageNum), pageNum);

  for (const pageNum of pagesToProcess) {
    try {
      const found = await tablesOfPage(pageNum);
      collected.push(...found);
    } catch (error) {
      logger5.warn("Error getting page for table extraction", {
        pageNum,
        error: error instanceof Error ? error.message : String(error)
      });
    }
  }
  return collected;
};
|
|
702
|
+
/**
 * Renders one extracted table as a Markdown pipe table.
 *
 * The first row is used as the header, followed by a `---` separator row.
 * Body rows shorter than the header are padded with empty cells. Empty cells
 * are rendered as a single space. Line breaks inside a cell are collapsed to a
 * space and literal `|` characters are backslash-escaped, because either one
 * would otherwise break the table's row/column structure.
 *
 * @param {{ rows: string[][] }} table Table whose `rows` hold the cell texts.
 * @returns {string} Markdown table, or "" when the table has no rows.
 */
var tableToMarkdown = (table) => {
  const [headerRow, ...bodyRows] = table.rows;
  if (!headerRow) {
    return "";
  }

  const renderCell = (cell) => {
    const singleLine = cell.replace(/\r\n|\r|\n/g, " ").trim();
    return singleLine.replace(/\|/g, "\\|") || " ";
  };
  const renderRow = (cells) => `| ${cells.map(renderCell).join(" | ")} |`;

  const lines = [renderRow(headerRow), `| ${headerRow.map(() => "---").join(" | ")} |`];
  for (const row of bodyRows) {
    if (!row) {
      continue;
    }
    const padded = [...row];
    while (padded.length < headerRow.length) {
      padded.push("");
    }
    lines.push(renderRow(padded));
  }
  return lines.join("\n");
};
|
|
724
|
+
/**
 * Renders all extracted tables as one Markdown section.
 *
 * The output starts with an "## Extracted Tables" heading. Each table then
 * gets a "### Page P, Table N" heading (N is `tableIndex + 1`), an italic
 * confidence line with the percentage rounded to a whole number, and the
 * table body produced by `tableToMarkdown`.
 *
 * @param {Array<{ page: number, tableIndex: number, confidence: number }>} tables Tables to render.
 * @returns {string} Markdown text, or "" when `tables` is empty.
 */
var tablesToMarkdown = (tables) => {
  if (tables.length === 0) {
    return "";
  }
  const perTable = tables.flatMap((table) => [
    `### Page ${table.page}, Table ${table.tableIndex + 1}`,
    `*Confidence: ${(table.confidence * 100).toFixed(0)}%*`,
    "",
    tableToMarkdown(table),
    ""
  ]);
  return ["## Extracted Tables", "", ...perTable].join("\n");
};
|
|
738
|
+
|
|
465
739
|
// src/schemas/readPdf.ts
|
|
466
740
|
import {
|
|
467
741
|
array,
|
|
@@ -487,11 +761,12 @@ var readPdfArgsSchema = object({
|
|
|
487
761
|
include_full_text: optional(bool(description("Include the full text content of each PDF (only if 'pages' is not specified for that source)."))),
|
|
488
762
|
include_metadata: optional(bool(description("Include metadata and info objects for each PDF."))),
|
|
489
763
|
include_page_count: optional(bool(description("Include the total number of pages for each PDF."))),
|
|
490
|
-
include_images: optional(bool(description("Extract and include embedded images from the PDF pages as base64-encoded data.")))
|
|
764
|
+
include_images: optional(bool(description("Extract and include embedded images from the PDF pages as base64-encoded data."))),
|
|
765
|
+
include_tables: optional(bool(description("Detect and extract tables from PDF pages. Uses spatial clustering of text coordinates to identify tabular structures.")))
|
|
491
766
|
});
|
|
492
767
|
|
|
493
768
|
// src/handlers/readPdf.ts
|
|
494
|
-
var
|
|
769
|
+
var logger6 = createLogger("ReadPdf");
|
|
495
770
|
var processSingleSource = async (source, options) => {
|
|
496
771
|
const sourceDescription = source.path ?? source.url ?? "unknown source";
|
|
497
772
|
let individualResult = { source: sourceDescription, success: false };
|
|
@@ -509,7 +784,16 @@ var processSingleSource = async (source, options) => {
|
|
|
509
784
|
output.warnings = warnings;
|
|
510
785
|
}
|
|
511
786
|
if (pagesToProcess.length > 0) {
|
|
512
|
-
const
|
|
787
|
+
const MAX_CONCURRENT_PAGES = 5;
|
|
788
|
+
const pageContents = [];
|
|
789
|
+
for (let i = 0;i < pagesToProcess.length; i += MAX_CONCURRENT_PAGES) {
|
|
790
|
+
const batch = pagesToProcess.slice(i, i + MAX_CONCURRENT_PAGES);
|
|
791
|
+
const batchResults = await Promise.all(batch.map((pageNum) => extractPageContent(pdfDocument, pageNum, options.includeImages, sourceDescription)));
|
|
792
|
+
pageContents.push(...batchResults);
|
|
793
|
+
if (i + MAX_CONCURRENT_PAGES < pagesToProcess.length) {
|
|
794
|
+
await new Promise((resolve) => setImmediate(resolve));
|
|
795
|
+
}
|
|
796
|
+
}
|
|
513
797
|
output.page_contents = pageContents.map((items, idx) => ({
|
|
514
798
|
page: pagesToProcess[idx],
|
|
515
799
|
items
|
|
@@ -531,6 +815,12 @@ var processSingleSource = async (source, options) => {
|
|
|
531
815
|
output.images = extractedImages;
|
|
532
816
|
}
|
|
533
817
|
}
|
|
818
|
+
if (options.includeTables) {
|
|
819
|
+
const extractedTables = await extractTables(pdfDocument, pagesToProcess);
|
|
820
|
+
if (extractedTables.length > 0) {
|
|
821
|
+
output.tables = extractedTables;
|
|
822
|
+
}
|
|
823
|
+
}
|
|
534
824
|
}
|
|
535
825
|
individualResult = { ...individualResult, data: output, success: true };
|
|
536
826
|
} catch (error) {
|
|
@@ -549,21 +839,29 @@ var processSingleSource = async (source, options) => {
|
|
|
549
839
|
await pdfDocument.destroy();
|
|
550
840
|
} catch (destroyError) {
|
|
551
841
|
const message = destroyError instanceof Error ? destroyError.message : String(destroyError);
|
|
552
|
-
|
|
842
|
+
logger6.warn("Error destroying PDF document", { sourceDescription, error: message });
|
|
553
843
|
}
|
|
554
844
|
}
|
|
555
845
|
}
|
|
556
846
|
return individualResult;
|
|
557
847
|
};
|
|
558
848
|
var readPdf = tool().description("Reads content/metadata/images from one or more PDFs (local/URL). Each source can specify pages to extract.").input(readPdfArgsSchema).handler(async ({ input }) => {
|
|
559
|
-
const {
|
|
849
|
+
const {
|
|
850
|
+
sources,
|
|
851
|
+
include_full_text,
|
|
852
|
+
include_metadata,
|
|
853
|
+
include_page_count,
|
|
854
|
+
include_images,
|
|
855
|
+
include_tables
|
|
856
|
+
} = input;
|
|
560
857
|
const MAX_CONCURRENT_SOURCES = 3;
|
|
561
858
|
const results = [];
|
|
562
859
|
const options = {
|
|
563
860
|
includeFullText: include_full_text ?? false,
|
|
564
861
|
includeMetadata: include_metadata ?? true,
|
|
565
862
|
includePageCount: include_page_count ?? true,
|
|
566
|
-
includeImages: include_images ?? false
|
|
863
|
+
includeImages: include_images ?? false,
|
|
864
|
+
includeTables: include_tables ?? false
|
|
567
865
|
};
|
|
568
866
|
for (let i = 0;i < sources.length; i += MAX_CONCURRENT_SOURCES) {
|
|
569
867
|
const batch = sources.slice(i, i + MAX_CONCURRENT_SOURCES);
|
|
@@ -578,18 +876,27 @@ var readPdf = tool().description("Reads content/metadata/images from one or more
|
|
|
578
876
|
const content = [];
|
|
579
877
|
const resultsForJson = results.map((result) => {
|
|
580
878
|
if (result.data) {
|
|
581
|
-
const { images, page_contents, ...dataWithoutBinaryContent } = result.data;
|
|
879
|
+
const { images, page_contents, tables, ...dataWithoutBinaryContent } = result.data;
|
|
880
|
+
const processedData = { ...dataWithoutBinaryContent };
|
|
582
881
|
if (images) {
|
|
583
|
-
|
|
882
|
+
processedData["image_info"] = images.map((img) => ({
|
|
584
883
|
page: img.page,
|
|
585
884
|
index: img.index,
|
|
586
885
|
width: img.width,
|
|
587
886
|
height: img.height,
|
|
588
887
|
format: img.format
|
|
589
888
|
}));
|
|
590
|
-
return { ...result, data: { ...dataWithoutBinaryContent, image_info: imageInfo } };
|
|
591
889
|
}
|
|
592
|
-
|
|
890
|
+
if (tables && tables.length > 0) {
|
|
891
|
+
processedData["table_info"] = tables.map((tbl) => ({
|
|
892
|
+
page: tbl.page,
|
|
893
|
+
tableIndex: tbl.tableIndex,
|
|
894
|
+
rowCount: tbl.rowCount,
|
|
895
|
+
colCount: tbl.colCount,
|
|
896
|
+
confidence: tbl.confidence
|
|
897
|
+
}));
|
|
898
|
+
}
|
|
899
|
+
return { ...result, data: processedData };
|
|
593
900
|
}
|
|
594
901
|
return result;
|
|
595
902
|
});
|
|
@@ -598,29 +905,72 @@ var readPdf = tool().description("Reads content/metadata/images from one or more
|
|
|
598
905
|
if (!result.success || !result.data?.page_contents)
|
|
599
906
|
continue;
|
|
600
907
|
for (const pageContent of result.data.page_contents) {
|
|
908
|
+
const pageTextParts = [];
|
|
909
|
+
const pageImages = [];
|
|
601
910
|
for (const item of pageContent.items) {
|
|
602
911
|
if (item.type === "text" && item.textContent) {
|
|
603
|
-
|
|
912
|
+
pageTextParts.push(item.textContent);
|
|
604
913
|
} else if (item.type === "image" && item.imageData) {
|
|
605
|
-
|
|
914
|
+
pageImages.push(item.imageData);
|
|
606
915
|
}
|
|
607
916
|
}
|
|
917
|
+
if (pageTextParts.length > 0) {
|
|
918
|
+
content.push(text(`[Page ${pageContent.page}]
|
|
919
|
+
${pageTextParts.join(`
|
|
920
|
+
`)}`));
|
|
921
|
+
}
|
|
922
|
+
for (const img of pageImages) {
|
|
923
|
+
content.push(image(img.data, "image/png"));
|
|
924
|
+
}
|
|
925
|
+
}
|
|
926
|
+
}
|
|
927
|
+
if (options.includeTables) {
|
|
928
|
+
const allTables = [];
|
|
929
|
+
for (const result of results) {
|
|
930
|
+
if (result.success && result.data?.tables) {
|
|
931
|
+
allTables.push(...result.data.tables);
|
|
932
|
+
}
|
|
933
|
+
}
|
|
934
|
+
if (allTables.length > 0) {
|
|
935
|
+
const markdownTables = tablesToMarkdown(allTables);
|
|
936
|
+
content.push(text(markdownTables));
|
|
608
937
|
}
|
|
609
938
|
}
|
|
610
939
|
return content;
|
|
611
940
|
});
|
|
612
941
|
|
|
613
942
|
// src/index.ts
|
|
943
|
+
// Transport configuration, read once from the environment at startup.
var transportType = process.env["MCP_TRANSPORT"] ?? "stdio";
var httpPort = Number.parseInt(process.env["MCP_HTTP_PORT"] ?? "8080", 10);
var httpHost = process.env["MCP_HTTP_HOST"] ?? "0.0.0.0";
// NOTE(review): apiKey is only read here; nothing in this block passes it to http().
// Confirm where (or whether) the X-API-Key check is actually enforced.
var apiKey = process.env["MCP_API_KEY"];

/**
 * Builds the server transport selected by MCP_TRANSPORT.
 *
 * Returns the HTTP transport when MCP_TRANSPORT is exactly "http"; any other
 * value selects stdio. The HTTP transport is created with CORS set to "*".
 *
 * @returns {object} Transport created by `http()` or `stdio()`.
 * @throws {Error} When HTTP is selected and MCP_HTTP_PORT is not an integer in 0-65535.
 */
function createTransport() {
  if (transportType !== "http") {
    return stdio();
  }
  // parseInt yields NaN for unparsable input; fail with a clear message instead of
  // passing an unusable port on to the transport.
  if (!Number.isInteger(httpPort) || httpPort < 0 || httpPort > 65535) {
    throw new Error(
      `Invalid MCP_HTTP_PORT: ${JSON.stringify(process.env["MCP_HTTP_PORT"])} (expected an integer between 0 and 65535)`
    );
  }
  return http({
    port: httpPort,
    hostname: httpHost,
    cors: "*"
  });
}
|
|
614
957
|
// MCP server instance exposing the single `read_pdf` tool over the configured transport.
var server = createServer({
  name: "pdf-reader-mcp",
  // Reported server version; keep in sync with the "version" field in package.json.
  version: "2.3.0",
  instructions: "MCP Server for reading PDF files and extracting text, metadata, images, and page information.",
  tools: { read_pdf: readPdf },
  transport: createTransport()
});
|
|
621
964
|
async function main() {
|
|
622
965
|
await server.start();
|
|
623
|
-
if (
|
|
966
|
+
if (transportType === "http") {
|
|
967
|
+
console.log(`[PDF Reader MCP] Server running on http://${httpHost}:${httpPort}/mcp`);
|
|
968
|
+
console.log(`[PDF Reader MCP] Health check: http://${httpHost}:${httpPort}/mcp/health`);
|
|
969
|
+
if (apiKey) {
|
|
970
|
+
console.log("[PDF Reader MCP] API key authentication enabled (X-API-Key header)");
|
|
971
|
+
}
|
|
972
|
+
console.log("[PDF Reader MCP] Project root:", process.cwd());
|
|
973
|
+
} else if (process.env["DEBUG_MCP"]) {
|
|
624
974
|
console.error("[PDF Reader MCP] Server running on stdio");
|
|
625
975
|
console.error("[PDF Reader MCP] Project root:", process.cwd());
|
|
626
976
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sylphx/pdf-reader-mcp",
|
|
3
|
-
"version": "2.
|
|
3
|
+
"version": "2.3.0",
|
|
4
4
|
"description": "An MCP server providing tools to read PDF files.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -12,9 +12,7 @@
|
|
|
12
12
|
"LICENSE"
|
|
13
13
|
],
|
|
14
14
|
"exports": {
|
|
15
|
-
".":
|
|
16
|
-
"import": "./dist/index.js"
|
|
17
|
-
}
|
|
15
|
+
".": "./dist/index.js"
|
|
18
16
|
},
|
|
19
17
|
"publishConfig": {
|
|
20
18
|
"access": "public"
|
|
@@ -45,7 +43,7 @@
|
|
|
45
43
|
"tool"
|
|
46
44
|
],
|
|
47
45
|
"scripts": {
|
|
48
|
-
"build": "bunup",
|
|
46
|
+
"build": "bunup --no-dts",
|
|
49
47
|
"watch": "tsc --watch",
|
|
50
48
|
"inspector": "npx @modelcontextprotocol/inspector dist/index.js",
|
|
51
49
|
"test": "bun test",
|
|
@@ -58,9 +56,9 @@
|
|
|
58
56
|
"check": "biome check .",
|
|
59
57
|
"check:fix": "biome check --write .",
|
|
60
58
|
"validate": "bun run check && bun run test",
|
|
61
|
-
"docs:dev": "
|
|
62
|
-
"docs:build": "
|
|
63
|
-
"docs:preview": "
|
|
59
|
+
"docs:dev": "vitepress dev docs",
|
|
60
|
+
"docs:build": "vitepress build docs",
|
|
61
|
+
"docs:preview": "vitepress preview docs",
|
|
64
62
|
"start": "node dist/index.js",
|
|
65
63
|
"typecheck": "tsc --noEmit",
|
|
66
64
|
"benchmark": "bun bench",
|
|
@@ -77,28 +75,25 @@
|
|
|
77
75
|
"pdfjs-dist": "^5.4.449",
|
|
78
76
|
"pngjs": "^7.0.0"
|
|
79
77
|
},
|
|
78
|
+
"overrides": {
|
|
79
|
+
"esbuild": "^0.25.0",
|
|
80
|
+
"preact": "^10.28.2"
|
|
81
|
+
},
|
|
80
82
|
"devDependencies": {
|
|
81
83
|
"@biomejs/biome": "^2.3.8",
|
|
82
|
-
"@solidjs/router": "^0.15.4",
|
|
83
84
|
"@sylphx/biome-config": "^0.4.1",
|
|
84
85
|
"@sylphx/bump": "^1.6.1",
|
|
85
86
|
"@sylphx/doctor": "^1.32.1",
|
|
86
|
-
"@sylphx/leaf": "^1.0.0",
|
|
87
|
-
"@sylphx/leaf-theme-default": "^1.0.0",
|
|
88
87
|
"@sylphx/tsconfig": "^0.3.1",
|
|
89
88
|
"@types/glob": "^8.1.0",
|
|
90
|
-
"@types/node": "^
|
|
89
|
+
"@types/node": "^25.0.3",
|
|
91
90
|
"@types/pngjs": "^6.0.5",
|
|
92
|
-
"bunup": "
|
|
91
|
+
"bunup": "0.16.10",
|
|
93
92
|
"lefthook": "^2.0.7",
|
|
94
|
-
"solid-js": "^1.9.10",
|
|
95
93
|
"typedoc": "^0.28.15",
|
|
96
94
|
"typedoc-plugin-markdown": "^4.9.0",
|
|
97
95
|
"typescript": "^5.9.3",
|
|
98
|
-
"
|
|
96
|
+
"vitepress": "^1.6.4"
|
|
99
97
|
},
|
|
100
|
-
"packageManager": "bun@1.3.1"
|
|
101
|
-
"overrides": {
|
|
102
|
-
"js-yaml": "^4.1.0"
|
|
103
|
-
}
|
|
98
|
+
"packageManager": "bun@1.3.1"
|
|
104
99
|
}
|