@sylphx/pdf-reader-mcp 2.2.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.js +319 -9
  2. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -462,6 +462,280 @@ var determinePagesToProcess = (targetPages, totalPages, includeFullText) => {
462
462
  return { pagesToProcess: [], invalidPages: [] };
463
463
  };
464
464
 
465
+ // src/pdf/tableExtractor.ts
466
+ var logger5 = createLogger("TableExtractor"); // logger used by extractTablesFromPage / extractTables to emit warnings
467
+ var Y_TOLERANCE = 5; // default `tolerance` of clusterByY: max |y| difference for an item to join the current row
468
+ var COLUMN_GAP_THRESHOLD = 15; // default x-gap that starts a new column in detectColumnBoundaries; halved as the slack in assignToColumns / calculateConfidence; alignment window in identifyTableRegions
469
+ var MIN_ROWS = 2; // minimum number of rows for a table region (identifyTableRegions) and for a non-zero confidence (calculateConfidence)
470
+ var MIN_COLS = 2; // minimum number of column boundaries required by identifyTableRegions and calculateConfidence
471
+ var MIN_ROW_ITEMS = 2; // rows with fewer text items than this are ignored by identifyTableRegions
472
/**
 * Collects the non-blank text items of a page together with their position.
 *
 * @param {{ getTextContent: () => Promise<{ items: Array<object> }> }} page
 *   Object exposing an async `getTextContent()` whose result has an `items` array.
 * @returns {Promise<Array<{ text: string, x: number, y: number, width: number }>>}
 *   One entry per usable item, in the order the items were reported. `x` and `y`
 *   are elements 4 and 5 of the item's `transform`; `width` falls back to
 *   `text.length * 6` when the item reports none.
 */
var extractTextItemsWithPositions = async (page) => {
  const textContent = await page.getTextContent();
  const items = [];
  for (const textItem of textContent.items) {
    // Entries without a string `str` (e.g. marked-content markers) carry no text.
    // Skip them instead of throwing on `.trim()`, which would abort the whole page.
    if (typeof textItem?.str !== "string" || !textItem.str.trim()) {
      continue;
    }
    const { transform } = textItem;
    if (!transform || transform.length < 6) {
      continue;
    }
    const x = transform[4];
    const y = transform[5];
    // Reject missing AND non-finite coordinates: NaN would make every later
    // numeric comparison on these positions meaningless.
    if (!Number.isFinite(x) || !Number.isFinite(y)) {
      continue;
    }
    items.push({
      text: textItem.str,
      x,
      y,
      width: textItem.width ?? textItem.str.length * 6
    });
  }
  return items;
};
494
/**
 * Groups positioned text items into rows by their vertical position.
 *
 * Items are visited from the largest `y` to the smallest. An item joins the
 * current row while its `y` is within `tolerance` of that row's `y` (the `y`
 * of the row's first item); otherwise it opens a new row.
 *
 * @param {Array<{ x: number, y: number }>} items - Positioned items; not mutated.
 * @param {number} [tolerance=Y_TOLERANCE] - Maximum |y| distance to the row's `y`.
 * @returns {Array<{ y: number, items: Array<object> }>} Rows ordered by
 *   descending `y`, each with its items ordered by ascending `x`.
 */
var clusterByY = (items, tolerance = Y_TOLERANCE) => {
  if (items.length === 0) {
    return [];
  }
  const topToBottom = Array.from(items).sort((p, q) => q.y - p.y);
  const [head, ...rest] = topToBottom;
  if (!head) {
    return [];
  }
  const clusters = [{ y: head.y, items: [head] }];
  for (const entry of rest) {
    if (!entry) {
      continue;
    }
    const active = clusters[clusters.length - 1];
    if (Math.abs(active.y - entry.y) <= tolerance) {
      active.items.push(entry);
    } else {
      clusters.push({ y: entry.y, items: [entry] });
    }
  }
  clusters.forEach((cluster) => {
    cluster.items.sort((p, q) => p.x - q.x);
  });
  return clusters;
};
521
/**
 * Derives column start positions from the x coordinates of all items in `rows`.
 *
 * All x values are pooled and sorted ascending. The smallest x is always a
 * boundary; every further x that lies at least `gapThreshold` after its
 * predecessor in the sorted list becomes a boundary as well.
 *
 * @param {Array<{ items: Array<{ x: number }> }>} rows - Rows of positioned items.
 * @param {number} [gapThreshold=COLUMN_GAP_THRESHOLD] - Minimum gap that opens a column.
 * @returns {number[]} Ascending boundary x positions; empty when there are no items.
 */
var detectColumnBoundaries = (rows, gapThreshold = COLUMN_GAP_THRESHOLD) => {
  if (rows.length === 0) {
    return [];
  }
  const xs = [];
  for (const { items } of rows) {
    for (const entry of items) {
      xs.push(entry.x);
    }
  }
  xs.sort((lhs, rhs) => lhs - rhs);
  const smallest = xs[0];
  if (smallest === undefined) {
    return [];
  }
  const edges = [smallest];
  xs.forEach((value, idx) => {
    if (idx === 0) {
      return;
    }
    const before = xs[idx - 1];
    if (value === undefined || before === undefined) {
      return;
    }
    if (value - before >= gapThreshold) {
      edges.push(value);
    }
  });
  return edges;
};
549
/**
 * Distributes the text of a row's items over the detected columns.
 *
 * Each item goes to the right-most column whose boundary, reduced by
 * `tolerance`, is not greater than the item's `x`; items matching no boundary
 * land in column 0. Texts sharing a column are joined with a single space in
 * item order.
 *
 * @param {{ items: Array<{ x: number, text: string }> }} row - Row to split.
 * @param {number[]} columnBoundaries - Column start positions.
 * @param {number} [tolerance=COLUMN_GAP_THRESHOLD / 2] - Slack allowed left of a boundary.
 * @returns {string[]} One string per column ("" for empty columns).
 */
var assignToColumns = (row, columnBoundaries, tolerance = COLUMN_GAP_THRESHOLD / 2) => {
  const cells = Array.from({ length: columnBoundaries.length }, () => "");
  for (const entry of row.items) {
    let target = 0;
    let idx = columnBoundaries.length;
    while (idx-- > 0) {
      const edge = columnBoundaries[idx];
      if (edge !== undefined && entry.x >= edge - tolerance) {
        target = idx;
        break;
      }
    }
    const existing = cells[target];
    if (existing) {
      cells[target] = `${existing} ${entry.text}`;
    } else {
      cells[target] = entry.text;
    }
  }
  return cells;
};
565
/**
 * Scores how table-like a group of rows is, as a value in [0, 1].
 *
 * The result is the mean of:
 *  - per row, the fraction of columns that contain at least one item, and
 *  - one regularity term, `max(0, 1 - stdDev / mean)` of the vertical distances
 *    between consecutive rows (0 when the mean distance is 0).
 *
 * @param {Array<{ y: number, items: Array<{ x: number }> }>} rows - Candidate rows.
 * @param {number[]} columnBoundaries - Column start positions.
 * @returns {number} 0 when there are fewer than MIN_ROWS rows or MIN_COLS
 *   boundaries, otherwise the averaged score capped at 1.
 */
var calculateConfidence = (rows, columnBoundaries) => {
  const columnCount = columnBoundaries.length;
  if (rows.length < MIN_ROWS || columnCount < MIN_COLS) {
    return 0;
  }
  const slack = COLUMN_GAP_THRESHOLD / 2;
  // Right-most column whose (boundary - slack) does not exceed x, or -1 if none.
  const findColumn = (x) => {
    for (let col = columnCount - 1; col >= 0; col--) {
      const edge = columnBoundaries[col];
      if (edge !== undefined && x >= edge - slack) {
        return col;
      }
    }
    return -1;
  };
  let total = 0;
  let parts = 0;
  for (const row of rows) {
    const occupied = new Set();
    for (const entry of row.items) {
      const col = findColumn(entry.x);
      if (col >= 0) {
        occupied.add(col);
      }
    }
    total += occupied.size / columnCount;
    parts += 1;
  }
  if (rows.length >= 2) {
    const gaps = [];
    for (let i = 1; i < rows.length; i++) {
      const above = rows[i - 1];
      const below = rows[i];
      if (above && below) {
        gaps.push(Math.abs(above.y - below.y));
      }
    }
    if (gaps.length > 0) {
      const mean = gaps.reduce((acc, gap) => acc + gap, 0) / gaps.length;
      const variance = gaps.reduce((acc, gap) => acc + (gap - mean) ** 2, 0) / gaps.length;
      const spread = Math.sqrt(variance);
      total += mean > 0 ? Math.max(0, 1 - spread / mean) : 0;
      parts += 1;
    }
  }
  return parts > 0 ? Math.min(1, total / parts) : 0;
};
605
/**
 * Splits clustered rows into runs of consecutive rows that look tabular.
 *
 * Only rows with at least MIN_ROW_ITEMS items are considered. Column
 * boundaries are detected once over all of those rows. A row continues the
 * current run when at least `MIN_COLS - 1` of its items lie closer than
 * COLUMN_GAP_THRESHOLD to some boundary; any other row ends the run. Runs
 * with at least MIN_ROWS rows are reported.
 *
 * @param {Array<{ y: number, items: Array<{ x: number }> }>} rows - Rows in page order.
 * @returns {Array<{ rows: Array<object>, columnBoundaries: number[], startY: number, endY: number }>}
 *   Detected regions; every region carries the same `columnBoundaries` array.
 */
var identifyTableRegions = (rows) => {
  const candidateRows = rows.filter((row) => row.items.length >= MIN_ROW_ITEMS);
  if (candidateRows.length < MIN_ROWS) {
    return [];
  }
  const columnBoundaries = detectColumnBoundaries(candidateRows);
  if (columnBoundaries.length < MIN_COLS) {
    return [];
  }
  const regions = [];
  let pending = [];
  // Emits the pending run if it is long enough, then starts a fresh one.
  const closePending = () => {
    if (pending.length >= MIN_ROWS) {
      const top = pending[0];
      const bottom = pending[pending.length - 1];
      if (top && bottom) {
        regions.push({
          rows: pending,
          columnBoundaries,
          startY: top.y,
          endY: bottom.y
        });
      }
    }
    pending = [];
  };
  const isNearBoundary = (entry) =>
    columnBoundaries.some((edge) => Math.abs(entry.x - edge) < COLUMN_GAP_THRESHOLD);
  for (const row of candidateRows) {
    const alignedCount = row.items.filter((entry) => isNearBoundary(entry)).length;
    if (alignedCount >= MIN_COLS - 1) {
      pending.push(row);
    } else {
      closePending();
    }
  }
  closePending();
  return regions;
};
652
/**
 * Detects tables on a single page and returns them as rows of cell strings.
 *
 * Pipeline: positioned text items -> rows clustered by y -> table regions ->
 * confidence score per region -> cells per row. Regions scoring below 0.3 are
 * dropped. Any error is logged as a warning and the tables collected so far
 * (possibly none) are returned; this function does not throw.
 *
 * @param {object} page - Page object handed to extractTextItemsWithPositions.
 * @param {number} pageNum - Page number recorded on each table and in warnings.
 * @returns {Promise<Array<{ page: number, tableIndex: number, rows: string[][],
 *   rowCount: number, colCount: number, confidence: number }>>}
 *   Accepted tables; `tableIndex` counts accepted tables from 0 without gaps
 *   and `confidence` is rounded to two decimals.
 */
var extractTablesFromPage = async (page, pageNum) => {
  const tables = [];
  try {
    const textItems = await extractTextItemsWithPositions(page);
    if (textItems.length === 0) {
      return tables;
    }
    const rows = clusterByY(textItems);
    for (const region of identifyTableRegions(rows)) {
      if (!region) {
        continue;
      }
      // Score first so that no cells are built for regions that get discarded.
      const confidence = calculateConfidence(region.rows, region.columnBoundaries);
      if (confidence >= 0.3) {
        const tableRows = region.rows.map((row) => assignToColumns(row, region.columnBoundaries));
        tables.push({
          page: pageNum,
          // Number by position among the accepted tables; using the region index
          // would leave gaps whenever a low-confidence region is skipped.
          tableIndex: tables.length,
          rows: tableRows,
          rowCount: tableRows.length,
          colCount: region.columnBoundaries.length,
          confidence: Math.round(confidence * 100) / 100
        });
      }
    }
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    logger5.warn("Error extracting tables from page", { pageNum, error: message });
  }
  return tables;
};
688
/**
 * Runs table extraction over the given pages, one page after another.
 *
 * A page that cannot be loaded or processed is reported through
 * `logger5.warn` and skipped; the remaining pages are still processed.
 *
 * @param {{ getPage: (pageNum: number) => Promise<object> }} pdfDocument - Document to read pages from.
 * @param {Iterable<number>} pagesToProcess - Page numbers passed to `getPage`.
 * @returns {Promise<Array<object>>} Tables of all pages, in page-processing order.
 */
var extractTables = async (pdfDocument, pagesToProcess) => {
  const collected = [];
  const describe = (failure) => (failure instanceof Error ? failure.message : String(failure));
  for (const pageNum of pagesToProcess) {
    try {
      const page = await pdfDocument.getPage(pageNum);
      const found = await extractTablesFromPage(page, pageNum);
      for (const table of found) {
        collected.push(table);
      }
    } catch (error) {
      logger5.warn("Error getting page for table extraction", { pageNum, error: describe(error) });
    }
  }
  return collected;
};
702
/**
 * Renders one extracted table as a Markdown pipe table.
 *
 * The first row is used as the header. Body rows shorter than the header are
 * padded with empty cells. Cell text is trimmed, line breaks inside a cell are
 * collapsed to a single space and `|` is escaped as `\|`, so cell content can
 * never add or remove columns or rows in the rendered table.
 *
 * @param {{ rows: string[][] }} table - Table whose `rows` hold cell strings.
 * @returns {string} Markdown text, or "" when the table has no header row.
 */
var tableToMarkdown = (table) => {
  const [headerRow, ...bodyRows] = table.rows;
  if (!headerRow) {
    return "";
  }
  const formatCell = (cell) => {
    const cleaned = cell
      .replace(/\s*[\r\n]+\s*/g, " ") // a raw line break would end the table row
      .replace(/\|/g, "\\|") // a raw pipe would be read as a column separator
      .trim();
    // Keep a single space so an empty cell still renders as a cell.
    return cleaned || " ";
  };
  const formatRow = (cells) => `| ${cells.map(formatCell).join(" | ")} |`;
  const lines = [formatRow(headerRow), `| ${headerRow.map(() => "---").join(" | ")} |`];
  for (const row of bodyRows) {
    if (!row) {
      continue;
    }
    const paddedRow = [...row];
    while (paddedRow.length < headerRow.length) {
      paddedRow.push("");
    }
    lines.push(formatRow(paddedRow));
  }
  return lines.join("\n");
};
724
/**
 * Renders all extracted tables as one Markdown section.
 *
 * Output starts with a "## Extracted Tables" heading; each table then gets a
 * "### Page P, Table N" heading (N is `tableIndex + 1`), an italic confidence
 * line in whole percent, and its Markdown rendering from `tableToMarkdown`.
 *
 * @param {Array<{ page: number, tableIndex: number, confidence: number }>} tables - Tables to render.
 * @returns {string} The Markdown document, or "" when `tables` is empty.
 */
var tablesToMarkdown = (tables) => {
  if (tables.length === 0) {
    return "";
  }
  const perTable = tables.flatMap((table) => {
    const percent = (table.confidence * 100).toFixed(0);
    return [
      `### Page ${table.page}, Table ${table.tableIndex + 1}`,
      `*Confidence: ${percent}%*`,
      "",
      tableToMarkdown(table),
      ""
    ];
  });
  return ["## Extracted Tables", "", ...perTable].join("\n");
};
738
+
465
739
  // src/schemas/readPdf.ts
466
740
  import {
467
741
  array,
@@ -487,11 +761,12 @@ var readPdfArgsSchema = object({
487
761
  include_full_text: optional(bool(description("Include the full text content of each PDF (only if 'pages' is not specified for that source)."))),
488
762
  include_metadata: optional(bool(description("Include metadata and info objects for each PDF."))),
489
763
  include_page_count: optional(bool(description("Include the total number of pages for each PDF."))),
490
- include_images: optional(bool(description("Extract and include embedded images from the PDF pages as base64-encoded data.")))
764
+ include_images: optional(bool(description("Extract and include embedded images from the PDF pages as base64-encoded data."))),
765
+ include_tables: optional(bool(description("Detect and extract tables from PDF pages. Uses spatial clustering of text coordinates to identify tabular structures.")))
491
766
  });
492
767
 
493
768
  // src/handlers/readPdf.ts
494
- var logger5 = createLogger("ReadPdf");
769
+ var logger6 = createLogger("ReadPdf");
495
770
  var processSingleSource = async (source, options) => {
496
771
  const sourceDescription = source.path ?? source.url ?? "unknown source";
497
772
  let individualResult = { source: sourceDescription, success: false };
@@ -540,6 +815,12 @@ var processSingleSource = async (source, options) => {
540
815
  output.images = extractedImages;
541
816
  }
542
817
  }
818
+ if (options.includeTables) {
819
+ const extractedTables = await extractTables(pdfDocument, pagesToProcess);
820
+ if (extractedTables.length > 0) {
821
+ output.tables = extractedTables;
822
+ }
823
+ }
543
824
  }
544
825
  individualResult = { ...individualResult, data: output, success: true };
545
826
  } catch (error) {
@@ -558,21 +839,29 @@ var processSingleSource = async (source, options) => {
558
839
  await pdfDocument.destroy();
559
840
  } catch (destroyError) {
560
841
  const message = destroyError instanceof Error ? destroyError.message : String(destroyError);
561
- logger5.warn("Error destroying PDF document", { sourceDescription, error: message });
842
+ logger6.warn("Error destroying PDF document", { sourceDescription, error: message });
562
843
  }
563
844
  }
564
845
  }
565
846
  return individualResult;
566
847
  };
567
848
  var readPdf = tool().description("Reads content/metadata/images from one or more PDFs (local/URL). Each source can specify pages to extract.").input(readPdfArgsSchema).handler(async ({ input }) => {
568
- const { sources, include_full_text, include_metadata, include_page_count, include_images } = input;
849
+ const {
850
+ sources,
851
+ include_full_text,
852
+ include_metadata,
853
+ include_page_count,
854
+ include_images,
855
+ include_tables
856
+ } = input;
569
857
  const MAX_CONCURRENT_SOURCES = 3;
570
858
  const results = [];
571
859
  const options = {
572
860
  includeFullText: include_full_text ?? false,
573
861
  includeMetadata: include_metadata ?? true,
574
862
  includePageCount: include_page_count ?? true,
575
- includeImages: include_images ?? false
863
+ includeImages: include_images ?? false,
864
+ includeTables: include_tables ?? false
576
865
  };
577
866
  for (let i = 0;i < sources.length; i += MAX_CONCURRENT_SOURCES) {
578
867
  const batch = sources.slice(i, i + MAX_CONCURRENT_SOURCES);
@@ -587,18 +876,27 @@ var readPdf = tool().description("Reads content/metadata/images from one or more
587
876
  const content = [];
588
877
  const resultsForJson = results.map((result) => {
589
878
  if (result.data) {
590
- const { images, page_contents, ...dataWithoutBinaryContent } = result.data;
879
+ const { images, page_contents, tables, ...dataWithoutBinaryContent } = result.data;
880
+ const processedData = { ...dataWithoutBinaryContent };
591
881
  if (images) {
592
- const imageInfo = images.map((img) => ({
882
+ processedData["image_info"] = images.map((img) => ({
593
883
  page: img.page,
594
884
  index: img.index,
595
885
  width: img.width,
596
886
  height: img.height,
597
887
  format: img.format
598
888
  }));
599
- return { ...result, data: { ...dataWithoutBinaryContent, image_info: imageInfo } };
600
889
  }
601
- return { ...result, data: dataWithoutBinaryContent };
890
+ if (tables && tables.length > 0) {
891
+ processedData["table_info"] = tables.map((tbl) => ({
892
+ page: tbl.page,
893
+ tableIndex: tbl.tableIndex,
894
+ rowCount: tbl.rowCount,
895
+ colCount: tbl.colCount,
896
+ confidence: tbl.confidence
897
+ }));
898
+ }
899
+ return { ...result, data: processedData };
602
900
  }
603
901
  return result;
604
902
  });
@@ -626,6 +924,18 @@ ${pageTextParts.join(`
626
924
  }
627
925
  }
628
926
  }
927
+ if (options.includeTables) {
928
+ const allTables = [];
929
+ for (const result of results) {
930
+ if (result.success && result.data?.tables) {
931
+ allTables.push(...result.data.tables);
932
+ }
933
+ }
934
+ if (allTables.length > 0) {
935
+ const markdownTables = tablesToMarkdown(allTables);
936
+ content.push(text(markdownTables));
937
+ }
938
+ }
629
939
  return content;
630
940
  });
631
941
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sylphx/pdf-reader-mcp",
3
- "version": "2.2.0",
3
+ "version": "2.3.0",
4
4
  "description": "An MCP server providing tools to read PDF files.",
5
5
  "type": "module",
6
6
  "bin": {