@sylphx/pdf-reader-mcp 2.2.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +319 -9
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -462,6 +462,280 @@ var determinePagesToProcess = (targetPages, totalPages, includeFullText) => {
|
|
|
462
462
|
return { pagesToProcess: [], invalidPages: [] };
|
|
463
463
|
};
|
|
464
464
|
|
|
465
|
+
// src/pdf/tableExtractor.ts
|
|
466
|
+
// Logger instance created with the "TableExtractor" label; used for the warnings emitted by the table extraction functions.
var logger5 = createLogger("TableExtractor");
|
|
467
|
+
// Default tolerance for clusterByY: an item joins the current row when its y differs from the row's reference y by at most this much.
var Y_TOLERANCE = 5;
|
|
468
|
+
// Default gap for detectColumnBoundaries: a new column boundary starts when consecutive sorted x positions differ by at least this much.
// Half of this value is the default slack assignToColumns allows to the left of a boundary.
var COLUMN_GAP_THRESHOLD = 15;
|
|
469
|
+
// Fewer rows than this: calculateConfidence returns 0 and identifyTableRegions emits no region.
var MIN_ROWS = 2;
|
|
470
|
+
// Fewer column boundaries than this: calculateConfidence returns 0 and identifyTableRegions emits no region.
var MIN_COLS = 2;
|
|
471
|
+
// Rows with fewer text items than this are not considered table candidates by identifyTableRegions.
var MIN_ROW_ITEMS = 2;
|
|
472
|
+
/**
 * Collects the non-blank text items of a page together with their position.
 *
 * The x/y coordinates are read from indices 4 and 5 of each item's
 * `transform` array. When an item has no `width`, a rough fallback of
 * 6 units per character is used.
 *
 * @param {object} page - Object exposing an async `getTextContent()` that
 *   resolves to `{ items }`.
 * @returns {Promise<Array<{text: string, x: number, y: number, width: number}>>}
 *   One entry per usable text item, in the order the page reported them.
 */
var extractTextItemsWithPositions = async (page) => {
  const textContent = await page.getTextContent();
  const items = [];
  for (const textItem of textContent.items) {
    // Entries without a string `str` (e.g. marked-content markers) carry no
    // text; skip them instead of throwing on `.trim()` and losing the page.
    if (typeof textItem?.str !== "string" || !textItem.str.trim()) {
      continue;
    }
    const { transform } = textItem;
    if (!transform || transform.length < 6) {
      continue;
    }
    const x = transform[4];
    const y = transform[5];
    // Reject NaN/Infinity/non-numeric coordinates as well as missing ones:
    // they would corrupt the numeric sorts performed on these positions.
    if (!Number.isFinite(x) || !Number.isFinite(y)) {
      continue;
    }
    items.push({
      text: textItem.str,
      x,
      y,
      width: textItem.width ?? textItem.str.length * 6
    });
  }
  return items;
};
|
|
494
|
+
/**
 * Groups positioned text items into rows by vertical position.
 *
 * Items are visited from the largest y to the smallest. An item joins the
 * row currently being built when its y is within `tolerance` of that row's
 * reference y (the y of the first item placed in the row); otherwise it
 * starts a new row. Items inside each row are finally ordered by ascending x.
 * The input array itself is not reordered.
 *
 * @param {Array<{x: number, y: number}>} items - Positioned text items.
 * @param {number} [tolerance=Y_TOLERANCE] - Maximum y distance to the row's reference y.
 * @returns {Array<{y: number, items: Array}>} Rows ordered from largest y to smallest.
 */
var clusterByY = (items, tolerance = Y_TOLERANCE) => {
  const [head, ...rest] = Array.from(items).sort((left, right) => right.y - left.y);
  if (!head) {
    return [];
  }
  const rows = [{ y: head.y, items: [head] }];
  for (const entry of rest) {
    if (!entry) {
      continue;
    }
    const openRow = rows[rows.length - 1];
    if (Math.abs(openRow.y - entry.y) <= tolerance) {
      openRow.items.push(entry);
    } else {
      rows.push({ y: entry.y, items: [entry] });
    }
  }
  rows.forEach((row) => {
    row.items.sort((left, right) => left.x - right.x);
  });
  return rows;
};
|
|
521
|
+
/**
 * Derives column start positions from the x coordinates of all row items.
 *
 * Every item x is collected and sorted ascending. The smallest x is always a
 * boundary; each later x becomes a boundary when it lies at least
 * `gapThreshold` to the right of the x immediately before it in sorted order.
 *
 * @param {Array<{items: Array<{x: number}>}>} rows - Rows whose items carry an x position.
 * @param {number} [gapThreshold=COLUMN_GAP_THRESHOLD] - Minimum gap that opens a new column.
 * @returns {number[]} Ascending boundary x positions; empty when there are no items.
 */
var detectColumnBoundaries = (rows, gapThreshold = COLUMN_GAP_THRESHOLD) => {
  const xs = rows
    .flatMap((row) => row.items.map((item) => item.x))
    .sort((left, right) => left - right);
  if (xs.length === 0 || xs[0] === undefined) {
    return [];
  }
  const boundaries = [xs[0]];
  let previous = xs[0];
  for (const current of xs.slice(1)) {
    const comparable = current !== undefined && previous !== undefined;
    if (comparable && current - previous >= gapThreshold) {
      boundaries.push(current);
    }
    previous = current;
  }
  return boundaries;
};
|
|
549
|
+
/**
 * Distributes the text of a row's items over the detected columns.
 *
 * Each item goes to the right-most column whose boundary, shifted left by
 * `tolerance`, is not greater than the item's x. Items that match no
 * boundary fall back to column 0. Several items landing in the same column
 * are joined with a single space, in row order.
 *
 * @param {{items: Array<{x: number, text: string}>}} row - Row to convert.
 * @param {number[]} columnBoundaries - Ascending column start positions.
 * @param {number} [tolerance=COLUMN_GAP_THRESHOLD / 2] - Slack allowed to the left of a boundary.
 * @returns {string[]} One string per column; untouched columns are "".
 */
var assignToColumns = (row, columnBoundaries, tolerance = COLUMN_GAP_THRESHOLD / 2) => {
  const cells = Array.from({ length: columnBoundaries.length }, () => "");
  row.items.forEach((item) => {
    // Scan boundaries right-to-left and stop at the first one the item reaches.
    let slot = columnBoundaries.length - 1;
    while (slot >= 0) {
      const edge = columnBoundaries[slot];
      if (edge !== undefined && item.x >= edge - tolerance) {
        break;
      }
      slot -= 1;
    }
    if (slot < 0) {
      slot = 0;
    }
    const existing = cells[slot];
    cells[slot] = existing ? `${existing} ${item.text}` : item.text;
  });
  return cells;
};
|
|
565
|
+
/**
 * Scores how table-like a group of rows is, as a number between 0 and 1.
 *
 * Returns 0 when there are fewer than MIN_ROWS rows or fewer than MIN_COLS
 * column boundaries. Otherwise the result is the average of:
 *  - one sample per row: the fraction of columns that contain at least one item;
 *  - one extra sample for row-spacing regularity:
 *    max(0, 1 - stdDev / mean) of the y distances between consecutive rows
 *    (0 when the mean distance is not positive).
 *
 * @param {Array<{y: number, items: Array<{x: number}>}>} rows - Rows of the candidate table.
 * @param {number[]} columnBoundaries - Ascending column start positions.
 * @returns {number} Confidence capped at 1.
 */
var calculateConfidence = (rows, columnBoundaries) => {
  const columnCount = columnBoundaries.length;
  if (rows.length < MIN_ROWS || columnCount < MIN_COLS) {
    return 0;
  }
  const slack = COLUMN_GAP_THRESHOLD / 2;
  // Right-most column whose (boundary - slack) the x position reaches, or -1.
  const columnOf = (x) => {
    for (let idx = columnCount - 1; idx >= 0; idx -= 1) {
      const edge = columnBoundaries[idx];
      if (edge !== undefined && x >= edge - slack) {
        return idx;
      }
    }
    return -1;
  };
  let total = 0;
  let samples = 0;
  for (const row of rows) {
    const occupied = new Set();
    for (const item of row.items) {
      const idx = columnOf(item.x);
      if (idx !== -1) {
        occupied.add(idx);
      }
    }
    total += occupied.size / columnCount;
    samples += 1;
  }
  if (rows.length >= 2) {
    const gaps = [];
    for (let idx = 1; idx < rows.length; idx += 1) {
      const above = rows[idx - 1];
      const below = rows[idx];
      if (above && below) {
        gaps.push(Math.abs(above.y - below.y));
      }
    }
    if (gaps.length > 0) {
      const mean = gaps.reduce((sum, gap) => sum + gap, 0) / gaps.length;
      const variance = gaps.reduce((sum, gap) => sum + (gap - mean) ** 2, 0) / gaps.length;
      const deviation = Math.sqrt(variance);
      total += mean > 0 ? Math.max(0, 1 - deviation / mean) : 0;
      samples += 1;
    }
  }
  return samples > 0 ? Math.min(1, total / samples) : 0;
};
|
|
605
|
+
/**
 * Splits clustered rows into runs of consecutive rows that look like a table.
 *
 * Only rows with at least MIN_ROW_ITEMS items are considered. Column
 * boundaries are detected once from all such rows. A row extends the run in
 * progress when at least (MIN_COLS - 1) of its items lie closer than
 * COLUMN_GAP_THRESHOLD to some boundary; any other row ends the run. A run
 * is reported only when it holds at least MIN_ROWS rows.
 *
 * @param {Array<{y: number, items: Array<{x: number}>}>} rows - Rows, e.g. from clusterByY.
 * @returns {Array<{rows: Array, columnBoundaries: number[], startY: number, endY: number}>}
 *   Regions in row order; every region carries the same boundary array.
 */
var identifyTableRegions = (rows) => {
  const regions = [];
  const candidateRows = rows.filter((row) => row.items.length >= MIN_ROW_ITEMS);
  if (candidateRows.length < MIN_ROWS) {
    return regions;
  }
  const columnBoundaries = detectColumnBoundaries(candidateRows);
  if (columnBoundaries.length < MIN_COLS) {
    return regions;
  }
  const isAligned = (item) =>
    columnBoundaries.some((boundary) => Math.abs(item.x - boundary) < COLUMN_GAP_THRESHOLD);
  let pending = [];
  // Emits the run collected so far when it is long enough, then starts a new one.
  const closeRun = () => {
    if (pending.length >= MIN_ROWS) {
      const top = pending[0];
      const bottom = pending[pending.length - 1];
      if (top && bottom) {
        regions.push({
          rows: pending,
          columnBoundaries,
          startY: top.y,
          endY: bottom.y
        });
      }
    }
    pending = [];
  };
  for (const row of candidateRows) {
    const alignedCount = row.items.filter(isAligned).length;
    if (alignedCount >= MIN_COLS - 1) {
      pending.push(row);
    } else {
      closeRun();
    }
  }
  closeRun();
  return regions;
};
|
|
652
|
+
/**
 * Detects tables on a single page and converts them to cell grids.
 *
 * Pipeline: positioned text items -> rows (clusterByY) -> table regions
 * (identifyTableRegions) -> cells per row (assignToColumns). A region is
 * reported only when its confidence is at least 0.3; the reported confidence
 * is rounded to two decimals. `tableIndex` is the region's position among all
 * detected regions, so indices of rejected regions are not reused.
 *
 * Any error is logged as a warning and the tables gathered so far are
 * returned; the function itself does not throw.
 *
 * @param {object} page - Page object handed to extractTextItemsWithPositions.
 * @param {number} pageNum - Page number stored on each table and in the log entry.
 * @returns {Promise<Array<{page: number, tableIndex: number, rows: string[][], rowCount: number, colCount: number, confidence: number}>>}
 */
var extractTablesFromPage = async (page, pageNum) => {
  const tables = [];
  try {
    const positioned = await extractTextItemsWithPositions(page);
    if (positioned.length === 0) {
      return tables;
    }
    const regions = identifyTableRegions(clusterByY(positioned));
    regions.forEach((region, tableIndex) => {
      if (!region) {
        return;
      }
      const grid = region.rows.map((row) => assignToColumns(row, region.columnBoundaries));
      const confidence = calculateConfidence(region.rows, region.columnBoundaries);
      if (!(confidence >= 0.3)) {
        return;
      }
      tables.push({
        page: pageNum,
        tableIndex,
        rows: grid,
        rowCount: grid.length,
        colCount: region.columnBoundaries.length,
        confidence: Math.round(confidence * 100) / 100
      });
    });
  } catch (error) {
    logger5.warn("Error extracting tables from page", {
      pageNum,
      error: error instanceof Error ? error.message : String(error)
    });
  }
  return tables;
};
|
|
688
|
+
/**
 * Extracts tables from the requested pages of a document, one page at a time.
 *
 * Pages are processed sequentially in the given order. A page that cannot be
 * loaded or processed is logged as a warning and skipped, so one failing
 * page does not discard tables found on the others.
 *
 * @param {object} pdfDocument - Object exposing an async `getPage(pageNum)`.
 * @param {Iterable<number>} pagesToProcess - Page numbers to inspect.
 * @returns {Promise<Array>} Tables from all pages, in page order.
 */
var extractTables = async (pdfDocument, pagesToProcess) => {
  const collected = [];
  for (const pageNum of pagesToProcess) {
    try {
      const loadedPage = await pdfDocument.getPage(pageNum);
      collected.push(...(await extractTablesFromPage(loadedPage, pageNum)));
    } catch (failure) {
      logger5.warn("Error getting page for table extraction", {
        pageNum,
        error: failure instanceof Error ? failure.message : String(failure)
      });
    }
  }
  return collected;
};
|
|
702
|
+
/**
 * Renders an extracted table as a Markdown pipe table.
 *
 * The first row is used as the header, followed by a `---` separator row.
 * Body rows shorter than the header are padded with empty cells. Empty cells
 * are rendered as a single space.
 *
 * Cell text is sanitized so it cannot break the table layout: line breaks
 * are replaced by a space and `|` is escaped as `\|`. Missing (null or
 * undefined) cells are treated as empty instead of throwing.
 *
 * @param {{rows: Array<Array<string>>}} table - Table whose `rows` hold the cell texts.
 * @returns {string} Markdown lines joined by "\n", or "" when there is no header row.
 */
var tableToMarkdown = (table) => {
  const [headerRow, ...bodyRows] = table.rows;
  if (!headerRow) {
    return "";
  }
  const formatCell = (cell) =>
    String(cell ?? "")
      .replace(/\r\n|\r|\n/g, " ") // a raw line break would end the table row
      .replace(/\|/g, "\\|") // an unescaped pipe would start a new cell
      .trim() || " ";
  const formatRow = (cells) => `| ${cells.map((cell) => formatCell(cell)).join(" | ")} |`;
  const lines = [
    formatRow(headerRow),
    `| ${headerRow.map(() => "---").join(" | ")} |`
  ];
  for (const row of bodyRows) {
    if (!row) {
      continue;
    }
    const paddedRow = [...row];
    while (paddedRow.length < headerRow.length) {
      paddedRow.push("");
    }
    lines.push(formatRow(paddedRow));
  }
  return lines.join("\n");
};
|
|
724
|
+
/**
 * Builds the Markdown report section for a list of extracted tables.
 *
 * The output starts with an "## Extracted Tables" heading. Each table adds a
 * "### Page P, Table N" heading (N is `tableIndex + 1`), an italic
 * confidence line in whole percent, a blank line, the table rendered by
 * tableToMarkdown, and a trailing blank line.
 *
 * @param {Array<{page: number, tableIndex: number, confidence: number, rows: Array}>} tables
 * @returns {string} The section joined with "\n", or "" when `tables` is empty.
 */
var tablesToMarkdown = (tables) => {
  if (tables.length === 0) {
    return "";
  }
  const perTable = tables.flatMap((table) => [
    `### Page ${table.page}, Table ${table.tableIndex + 1}`,
    `*Confidence: ${(table.confidence * 100).toFixed(0)}%*`,
    "",
    tableToMarkdown(table),
    ""
  ]);
  return ["## Extracted Tables", "", ...perTable].join("\n");
};
|
|
738
|
+
|
|
465
739
|
// src/schemas/readPdf.ts
|
|
466
740
|
import {
|
|
467
741
|
array,
|
|
@@ -487,11 +761,12 @@ var readPdfArgsSchema = object({
|
|
|
487
761
|
include_full_text: optional(bool(description("Include the full text content of each PDF (only if 'pages' is not specified for that source)."))),
|
|
488
762
|
include_metadata: optional(bool(description("Include metadata and info objects for each PDF."))),
|
|
489
763
|
include_page_count: optional(bool(description("Include the total number of pages for each PDF."))),
|
|
490
|
-
include_images: optional(bool(description("Extract and include embedded images from the PDF pages as base64-encoded data.")))
|
|
764
|
+
include_images: optional(bool(description("Extract and include embedded images from the PDF pages as base64-encoded data."))),
|
|
765
|
+
include_tables: optional(bool(description("Detect and extract tables from PDF pages. Uses spatial clustering of text coordinates to identify tabular structures.")))
|
|
491
766
|
});
|
|
492
767
|
|
|
493
768
|
// src/handlers/readPdf.ts
|
|
494
|
-
var
|
|
769
|
+
var logger6 = createLogger("ReadPdf");
|
|
495
770
|
var processSingleSource = async (source, options) => {
|
|
496
771
|
const sourceDescription = source.path ?? source.url ?? "unknown source";
|
|
497
772
|
let individualResult = { source: sourceDescription, success: false };
|
|
@@ -540,6 +815,12 @@ var processSingleSource = async (source, options) => {
|
|
|
540
815
|
output.images = extractedImages;
|
|
541
816
|
}
|
|
542
817
|
}
|
|
818
|
+
if (options.includeTables) {
|
|
819
|
+
const extractedTables = await extractTables(pdfDocument, pagesToProcess);
|
|
820
|
+
if (extractedTables.length > 0) {
|
|
821
|
+
output.tables = extractedTables;
|
|
822
|
+
}
|
|
823
|
+
}
|
|
543
824
|
}
|
|
544
825
|
individualResult = { ...individualResult, data: output, success: true };
|
|
545
826
|
} catch (error) {
|
|
@@ -558,21 +839,29 @@ var processSingleSource = async (source, options) => {
|
|
|
558
839
|
await pdfDocument.destroy();
|
|
559
840
|
} catch (destroyError) {
|
|
560
841
|
const message = destroyError instanceof Error ? destroyError.message : String(destroyError);
|
|
561
|
-
|
|
842
|
+
logger6.warn("Error destroying PDF document", { sourceDescription, error: message });
|
|
562
843
|
}
|
|
563
844
|
}
|
|
564
845
|
}
|
|
565
846
|
return individualResult;
|
|
566
847
|
};
|
|
567
848
|
var readPdf = tool().description("Reads content/metadata/images from one or more PDFs (local/URL). Each source can specify pages to extract.").input(readPdfArgsSchema).handler(async ({ input }) => {
|
|
568
|
-
const {
|
|
849
|
+
const {
|
|
850
|
+
sources,
|
|
851
|
+
include_full_text,
|
|
852
|
+
include_metadata,
|
|
853
|
+
include_page_count,
|
|
854
|
+
include_images,
|
|
855
|
+
include_tables
|
|
856
|
+
} = input;
|
|
569
857
|
const MAX_CONCURRENT_SOURCES = 3;
|
|
570
858
|
const results = [];
|
|
571
859
|
const options = {
|
|
572
860
|
includeFullText: include_full_text ?? false,
|
|
573
861
|
includeMetadata: include_metadata ?? true,
|
|
574
862
|
includePageCount: include_page_count ?? true,
|
|
575
|
-
includeImages: include_images ?? false
|
|
863
|
+
includeImages: include_images ?? false,
|
|
864
|
+
includeTables: include_tables ?? false
|
|
576
865
|
};
|
|
577
866
|
for (let i = 0;i < sources.length; i += MAX_CONCURRENT_SOURCES) {
|
|
578
867
|
const batch = sources.slice(i, i + MAX_CONCURRENT_SOURCES);
|
|
@@ -587,18 +876,27 @@ var readPdf = tool().description("Reads content/metadata/images from one or more
|
|
|
587
876
|
const content = [];
|
|
588
877
|
const resultsForJson = results.map((result) => {
|
|
589
878
|
if (result.data) {
|
|
590
|
-
const { images, page_contents, ...dataWithoutBinaryContent } = result.data;
|
|
879
|
+
const { images, page_contents, tables, ...dataWithoutBinaryContent } = result.data;
|
|
880
|
+
const processedData = { ...dataWithoutBinaryContent };
|
|
591
881
|
if (images) {
|
|
592
|
-
|
|
882
|
+
processedData["image_info"] = images.map((img) => ({
|
|
593
883
|
page: img.page,
|
|
594
884
|
index: img.index,
|
|
595
885
|
width: img.width,
|
|
596
886
|
height: img.height,
|
|
597
887
|
format: img.format
|
|
598
888
|
}));
|
|
599
|
-
return { ...result, data: { ...dataWithoutBinaryContent, image_info: imageInfo } };
|
|
600
889
|
}
|
|
601
|
-
|
|
890
|
+
if (tables && tables.length > 0) {
|
|
891
|
+
processedData["table_info"] = tables.map((tbl) => ({
|
|
892
|
+
page: tbl.page,
|
|
893
|
+
tableIndex: tbl.tableIndex,
|
|
894
|
+
rowCount: tbl.rowCount,
|
|
895
|
+
colCount: tbl.colCount,
|
|
896
|
+
confidence: tbl.confidence
|
|
897
|
+
}));
|
|
898
|
+
}
|
|
899
|
+
return { ...result, data: processedData };
|
|
602
900
|
}
|
|
603
901
|
return result;
|
|
604
902
|
});
|
|
@@ -626,6 +924,18 @@ ${pageTextParts.join(`
|
|
|
626
924
|
}
|
|
627
925
|
}
|
|
628
926
|
}
|
|
927
|
+
if (options.includeTables) {
|
|
928
|
+
const allTables = [];
|
|
929
|
+
for (const result of results) {
|
|
930
|
+
if (result.success && result.data?.tables) {
|
|
931
|
+
allTables.push(...result.data.tables);
|
|
932
|
+
}
|
|
933
|
+
}
|
|
934
|
+
if (allTables.length > 0) {
|
|
935
|
+
const markdownTables = tablesToMarkdown(allTables);
|
|
936
|
+
content.push(text(markdownTables));
|
|
937
|
+
}
|
|
938
|
+
}
|
|
629
939
|
return content;
|
|
630
940
|
});
|
|
631
941
|
|