@sylphx/pdf-reader-mcp 2.2.0 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.js +328 -11
  2. package/package.json +14 -11
package/dist/index.js CHANGED
@@ -343,7 +343,11 @@ var resolvePath = (userPath) => {
343
343
  // src/pdf/loader.ts
344
344
  var logger3 = createLogger("Loader");
345
345
  var require2 = createRequire(import.meta.url);
346
- var CMAP_URL = require2.resolve("pdfjs-dist/package.json").replace("package.json", "cmaps/");
346
+ var PDFJS_ROOT = require2.resolve("pdfjs-dist/package.json").replace("package.json", ""); // resolved location of pdfjs-dist's package.json with the file name stripped, so it ends right where a sub-directory name can be appended
347
+ var CMAP_URL = `${PDFJS_ROOT}cmaps/`; // passed to getDocument() as `cMapUrl`
348
+ var STANDARD_FONT_DATA_URL = `${PDFJS_ROOT}standard_fonts/`; // passed to getDocument() as `standardFontDataUrl`
349
+ var WASM_URL = `${PDFJS_ROOT}wasm/`; // passed to getDocument() as `wasmUrl`
350
+ var ICC_URL = `${PDFJS_ROOT}iccs/`; // passed to getDocument() as `iccUrl`
347
351
  var MAX_PDF_SIZE = 100 * 1024 * 1024;
348
352
  var loadPdfDocument = async (source, sourceDescription) => {
349
353
  let pdfDataSource;
@@ -377,7 +381,10 @@ var loadPdfDocument = async (source, sourceDescription) => {
377
381
  const loadingTask = getDocument({
378
382
  ...documentParams,
379
383
  cMapUrl: CMAP_URL,
380
- cMapPacked: true
384
+ cMapPacked: true,
385
+ standardFontDataUrl: STANDARD_FONT_DATA_URL,
386
+ wasmUrl: WASM_URL,
387
+ iccUrl: ICC_URL
381
388
  });
382
389
  try {
383
390
  return await loadingTask.promise;
@@ -462,6 +469,280 @@ var determinePagesToProcess = (targetPages, totalPages, includeFullText) => {
462
469
  return { pagesToProcess: [], invalidPages: [] };
463
470
  };
464
471
 
472
+ // src/pdf/tableExtractor.ts
473
+ var logger5 = createLogger("TableExtractor"); // logger used by the table-extraction helpers to report warnings
474
+ var Y_TOLERANCE = 5; // default for clusterByY: largest |y difference| from a row's first item that still joins that row
475
+ var COLUMN_GAP_THRESHOLD = 15; // default for detectColumnBoundaries: smallest gap between consecutive sorted x values that starts a new column; half of it is the slack used when assigning items to columns
476
+ var MIN_ROWS = 2; // fewest rows a region needs before it is reported or scored as a table
477
+ var MIN_COLS = 2; // fewest column boundaries needed before a table is reported or scored
478
+ var MIN_ROW_ITEMS = 2; // identifyTableRegions ignores rows holding fewer text items than this
/**
 * Collects the non-blank text runs of a PDF page together with their positions.
 *
 * Entries returned by `page.getTextContent()` are skipped when they have no
 * string `str` (for example marked-content markers), when the text is only
 * whitespace, when `transform` has fewer than six components, or when the
 * position read from `transform[4]` / `transform[5]` is not a finite number.
 *
 * @param page - Object exposing an async `getTextContent()`; nothing else is used.
 * @returns Promise resolving to `{ text, x, y, width }` records in source order.
 *   `width` falls back to `6 * text.length` when the entry does not report one.
 */
var extractTextItemsWithPositions = async (page) => {
  const textContent = await page.getTextContent();
  const items = [];
  for (const textItem of textContent.items) {
    // Entries without a string `str` would make `.trim()` throw, and the
    // caller's catch would then drop every table on the page; skip them instead.
    if (typeof textItem?.str !== "string" || !textItem.str.trim()) {
      continue;
    }
    const { transform } = textItem;
    if (!transform || transform.length < 6) {
      continue;
    }
    const x = transform[4];
    const y = transform[5];
    // Rejects undefined as before, plus NaN/Infinity, which would make the
    // numeric sort comparators used downstream inconsistent.
    if (!Number.isFinite(x) || !Number.isFinite(y)) {
      continue;
    }
    items.push({
      text: textItem.str,
      x,
      y,
      width: textItem.width ?? textItem.str.length * 6
    });
  }
  return items;
};
/**
 * Groups positioned text items into rows by their `y` coordinate.
 *
 * Items are visited from the largest `y` to the smallest. An item joins the
 * row currently being built when its `y` is within `tolerance` of that row's
 * first item; otherwise it opens a new row. Items inside each row end up
 * ordered by ascending `x`. The input array is not modified.
 *
 * @param items - Records with numeric `x` and `y`.
 * @param tolerance - Largest allowed |y difference| from a row's first item.
 * @returns Rows as `{ y, items }`, ordered from largest to smallest `y`.
 */
var clusterByY = (items, tolerance = Y_TOLERANCE) => {
  const topDown = items.slice().sort((p, q) => q.y - p.y);
  const bands = [];
  let open;
  for (const entry of topDown) {
    if (!entry) {
      continue;
    }
    const fitsOpenBand = open !== undefined && Math.abs(open.y - entry.y) <= tolerance;
    if (fitsOpenBand) {
      open.items.push(entry);
    } else {
      // The band's reference y stays fixed at the first item placed in it.
      open = { y: entry.y, items: [entry] };
      bands.push(open);
    }
  }
  bands.forEach((band) => {
    band.items.sort((left, right) => left.x - right.x);
  });
  return bands;
};
/**
 * Derives column start positions from the `x` values of every item in `rows`.
 *
 * All `x` values are pooled and sorted ascending. The smallest one is always a
 * boundary; each later value becomes a boundary when it lies at least
 * `gapThreshold` beyond the value just before it in the sorted list.
 *
 * @param rows - Rows as `{ items: [{ x, ... }] }`.
 * @param gapThreshold - Minimum jump between consecutive sorted `x` values.
 * @returns Ascending boundary positions, or `[]` when there are no items.
 */
var detectColumnBoundaries = (rows, gapThreshold = COLUMN_GAP_THRESHOLD) => {
  const xs = rows
    .flatMap((row) => row.items.map((item) => item.x))
    .sort((lo, hi) => lo - hi);
  if (xs.length === 0 || xs[0] === undefined) {
    return [];
  }
  return xs.filter((x, idx) => {
    if (idx === 0) {
      return true;
    }
    const before = xs[idx - 1];
    return x !== undefined && before !== undefined && x - before >= gapThreshold;
  });
};
/**
 * Distributes a row's text items over the detected columns.
 *
 * Each item goes to the right-most column whose boundary, reduced by
 * `tolerance`, is not greater than the item's `x`; items left of every
 * boundary land in column 0. Texts sharing a column are joined with a single
 * space in the order the items appear in the row.
 *
 * @param row - Row as `{ items: [{ x, text }] }`.
 * @param columnBoundaries - Ascending column start positions.
 * @param tolerance - Slack subtracted from a boundary before comparing.
 * @returns One string per column; columns without items hold `""`.
 */
var assignToColumns = (row, columnBoundaries, tolerance = COLUMN_GAP_THRESHOLD / 2) => {
  const cells = Array.from({ length: columnBoundaries.length }, () => "");
  for (const { x, text } of row.items) {
    // Column 0 is the fallback, so only indices above it need to be probed.
    let target = 0;
    for (let idx = columnBoundaries.length - 1; idx > 0; idx -= 1) {
      const edge = columnBoundaries[idx];
      if (edge !== undefined && x >= edge - tolerance) {
        target = idx;
        break;
      }
    }
    const existing = cells[target];
    if (existing) {
      cells[target] = `${existing} ${text}`;
    } else {
      cells[target] = text;
    }
  }
  return cells;
};
/**
 * Scores how table-like a group of rows is, as a number from 0 to 1.
 *
 * Returns 0 outright when there are fewer than `MIN_ROWS` rows or fewer than
 * `MIN_COLS` column boundaries. Otherwise the result is the average of:
 *   - one value per row: the share of columns that received at least one item
 *     (items are matched to columns with a slack of half
 *     `COLUMN_GAP_THRESHOLD`), and
 *   - one value for vertical regularity: `1 - stdDev / mean` of the distances
 *     between consecutive rows, floored at 0 (0 when the mean distance is 0).
 * The average is capped at 1.
 *
 * @param rows - Rows as `{ y, items: [{ x }] }`.
 * @param columnBoundaries - Ascending column start positions.
 * @returns Confidence in the range [0, 1].
 */
var calculateConfidence = (rows, columnBoundaries) => {
  const columnCount = columnBoundaries.length;
  if (rows.length < MIN_ROWS || columnCount < MIN_COLS) {
    return 0;
  }
  const slack = COLUMN_GAP_THRESHOLD / 2;
  let score = 0;
  let checks = 0;

  // Column coverage, one check per row.
  for (const row of rows) {
    const filled = new Set();
    for (const { x } of row.items) {
      let idx = columnCount - 1;
      while (idx >= 0) {
        const edge = columnBoundaries[idx];
        if (edge !== undefined && x >= edge - slack) {
          filled.add(idx);
          break;
        }
        idx -= 1;
      }
    }
    score += filled.size / columnCount;
    checks += 1;
  }

  // Row-spacing regularity, a single additional check.
  const spacings = [];
  for (let idx = 1; idx < rows.length; idx += 1) {
    const above = rows[idx - 1];
    const below = rows[idx];
    if (above && below) {
      spacings.push(Math.abs(above.y - below.y));
    }
  }
  if (spacings.length > 0) {
    const mean = spacings.reduce((acc, s) => acc + s, 0) / spacings.length;
    const variance = spacings.reduce((acc, s) => acc + (s - mean) ** 2, 0) / spacings.length;
    const stdDev = Math.sqrt(variance);
    score += mean > 0 ? Math.max(0, 1 - stdDev / mean) : 0;
    checks += 1;
  }

  return checks > 0 ? Math.min(1, score / checks) : 0;
};
/**
 * Finds runs of rows that look like they belong to a table.
 *
 * Only rows with at least `MIN_ROW_ITEMS` items are considered. Column
 * boundaries are detected once from those rows and shared by every region.
 * A row extends the current run when at least `MIN_COLS - 1` of its items sit
 * closer than `COLUMN_GAP_THRESHOLD` to some boundary; any other row ends the
 * run. A run is reported only if it holds at least `MIN_ROWS` rows.
 *
 * @param rows - Rows as `{ y, items: [{ x }] }`.
 * @returns Regions as `{ rows, columnBoundaries, startY, endY }`, where
 *   `startY` / `endY` are the `y` of the run's first and last row.
 */
var identifyTableRegions = (rows) => {
  const candidates = rows.filter((row) => row.items.length >= MIN_ROW_ITEMS);
  if (candidates.length < MIN_ROWS) {
    return [];
  }
  const columnBoundaries = detectColumnBoundaries(candidates);
  if (columnBoundaries.length < MIN_COLS) {
    return [];
  }

  const regions = [];
  let pending = [];

  // Emits the pending run when it is long enough, then starts a fresh one.
  const closeRun = () => {
    if (pending.length >= MIN_ROWS) {
      const head = pending[0];
      const tail = pending[pending.length - 1];
      if (head && tail) {
        regions.push({
          rows: pending,
          columnBoundaries,
          startY: head.y,
          endY: tail.y
        });
      }
    }
    pending = [];
  };

  const isNearBoundary = (item) =>
    columnBoundaries.some((edge) => Math.abs(item.x - edge) < COLUMN_GAP_THRESHOLD);

  for (const row of candidates) {
    const alignedCount = row.items.filter(isNearBoundary).length;
    if (alignedCount >= MIN_COLS - 1) {
      pending.push(row);
    } else {
      closeRun();
    }
  }
  closeRun();

  return regions;
};
/**
 * Detects tables on a single page and converts them to rows of cell strings.
 *
 * Pipeline: positioned text items -> rows clustered by y -> table regions ->
 * per-row column assignment. A region is kept only when its confidence is at
 * least 0.3; the reported confidence is rounded to two decimals. `tableIndex`
 * is the region's position among all detected regions on the page.
 *
 * Any error is logged as a warning and whatever was collected so far is
 * returned, so this function does not reject.
 *
 * @param page - Page object handed to `extractTextItemsWithPositions`.
 * @param pageNum - Page number recorded on each table and in log output.
 * @returns Tables as `{ page, tableIndex, rows, rowCount, colCount, confidence }`.
 */
var extractTablesFromPage = async (page, pageNum) => {
  const found = [];
  try {
    const positioned = await extractTextItemsWithPositions(page);
    if (positioned.length === 0) {
      return found;
    }
    const regions = identifyTableRegions(clusterByY(positioned));
    regions.forEach((region, tableIndex) => {
      if (!region) {
        return;
      }
      const { columnBoundaries } = region;
      const grid = region.rows.map((row) => assignToColumns(row, columnBoundaries));
      const confidence = calculateConfidence(region.rows, columnBoundaries);
      if (confidence >= 0.3) {
        found.push({
          page: pageNum,
          tableIndex,
          rows: grid,
          rowCount: grid.length,
          colCount: columnBoundaries.length,
          confidence: Math.round(confidence * 100) / 100
        });
      }
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    logger5.warn("Error extracting tables from page", { pageNum, error: message });
  }
  return found;
};
/**
 * Runs table extraction over the requested pages of a document, one page at
 * a time and in the order given.
 *
 * A page that cannot be loaded is logged as a warning and skipped; the
 * remaining pages are still processed.
 *
 * @param pdfDocument - Object exposing an async `getPage(pageNum)`.
 * @param pagesToProcess - Page numbers to inspect.
 * @returns All tables found, grouped in page-processing order.
 */
var extractTables = async (pdfDocument, pagesToProcess) => {
  const collected = [];
  for (const pageNum of pagesToProcess) {
    try {
      const page = await pdfDocument.getPage(pageNum);
      const fromPage = await extractTablesFromPage(page, pageNum);
      for (const table of fromPage) {
        collected.push(table);
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      logger5.warn("Error getting page for table extraction", { pageNum, error: message });
    }
  }
  return collected;
};
/**
 * Renders one extracted table as a Markdown pipe table.
 *
 * The first row is used as the header and is followed by a `---` separator
 * row. Later rows that are shorter than the header are padded with blank
 * cells. Empty cells are rendered as a single space.
 *
 * Cell text is made safe for the table syntax: line breaks are replaced by a
 * space and `|` is written as `\|`, so the content of a cell can no longer
 * split a row or add columns.
 *
 * @param table - Object with `rows`, an array of arrays of cell strings.
 * @returns The Markdown text, or `""` when the table has no rows.
 */
var tableToMarkdown = (table) => {
  const headerRow = table.rows[0];
  if (!headerRow) {
    return "";
  }

  const renderCell = (cell) => {
    const text = String(cell ?? "")
      .replace(/\r\n|\r|\n/g, " ")
      .replace(/\|/g, "\\|")
      .trim();
    return text || " ";
  };
  const renderRow = (cells) => `| ${cells.map((cell) => renderCell(cell)).join(" | ")} |`;

  const lines = [renderRow(headerRow), `| ${headerRow.map(() => "---").join(" | ")} |`];
  for (const row of table.rows.slice(1)) {
    if (!row) {
      continue;
    }
    const padded = [...row];
    while (padded.length < headerRow.length) {
      padded.push("");
    }
    lines.push(renderRow(padded));
  }
  return lines.join("\n");
};
/**
 * Builds the "Extracted Tables" Markdown section for a list of tables.
 *
 * Each table contributes a `### Page P, Table N` heading (N is
 * `tableIndex + 1`), an italic confidence line expressed as a whole-number
 * percentage, a blank line, the table rendered by `tableToMarkdown`, and a
 * trailing blank line.
 *
 * @param tables - Tables as `{ page, tableIndex, confidence, rows }`.
 * @returns The Markdown section, or `""` when `tables` is empty.
 */
var tablesToMarkdown = (tables) => {
  if (tables.length === 0) {
    return "";
  }
  const perTable = tables.flatMap((table) => [
    `### Page ${table.page}, Table ${table.tableIndex + 1}`,
    `*Confidence: ${(table.confidence * 100).toFixed(0)}%*`,
    "",
    tableToMarkdown(table),
    ""
  ]);
  return ["## Extracted Tables", "", ...perTable].join("\n");
};
745
+
465
746
  // src/schemas/readPdf.ts
466
747
  import {
467
748
  array,
@@ -487,11 +768,12 @@ var readPdfArgsSchema = object({
487
768
  include_full_text: optional(bool(description("Include the full text content of each PDF (only if 'pages' is not specified for that source)."))),
488
769
  include_metadata: optional(bool(description("Include metadata and info objects for each PDF."))),
489
770
  include_page_count: optional(bool(description("Include the total number of pages for each PDF."))),
490
- include_images: optional(bool(description("Extract and include embedded images from the PDF pages as base64-encoded data.")))
771
+ include_images: optional(bool(description("Extract and include embedded images from the PDF pages as base64-encoded data."))),
772
+ include_tables: optional(bool(description("Detect and extract tables from PDF pages. Uses spatial clustering of text coordinates to identify tabular structures.")))
491
773
  });
492
774
 
493
775
  // src/handlers/readPdf.ts
494
- var logger5 = createLogger("ReadPdf");
776
+ var logger6 = createLogger("ReadPdf");
495
777
  var processSingleSource = async (source, options) => {
496
778
  const sourceDescription = source.path ?? source.url ?? "unknown source";
497
779
  let individualResult = { source: sourceDescription, success: false };
@@ -540,6 +822,12 @@ var processSingleSource = async (source, options) => {
540
822
  output.images = extractedImages;
541
823
  }
542
824
  }
825
+ if (options.includeTables) {
826
+ const extractedTables = await extractTables(pdfDocument, pagesToProcess);
827
+ if (extractedTables.length > 0) {
828
+ output.tables = extractedTables;
829
+ }
830
+ }
543
831
  }
544
832
  individualResult = { ...individualResult, data: output, success: true };
545
833
  } catch (error) {
@@ -558,21 +846,29 @@ var processSingleSource = async (source, options) => {
558
846
  await pdfDocument.destroy();
559
847
  } catch (destroyError) {
560
848
  const message = destroyError instanceof Error ? destroyError.message : String(destroyError);
561
- logger5.warn("Error destroying PDF document", { sourceDescription, error: message });
849
+ logger6.warn("Error destroying PDF document", { sourceDescription, error: message });
562
850
  }
563
851
  }
564
852
  }
565
853
  return individualResult;
566
854
  };
567
855
  var readPdf = tool().description("Reads content/metadata/images from one or more PDFs (local/URL). Each source can specify pages to extract.").input(readPdfArgsSchema).handler(async ({ input }) => {
568
- const { sources, include_full_text, include_metadata, include_page_count, include_images } = input;
856
+ const {
857
+ sources,
858
+ include_full_text,
859
+ include_metadata,
860
+ include_page_count,
861
+ include_images,
862
+ include_tables
863
+ } = input;
569
864
  const MAX_CONCURRENT_SOURCES = 3;
570
865
  const results = [];
571
866
  const options = {
572
867
  includeFullText: include_full_text ?? false,
573
868
  includeMetadata: include_metadata ?? true,
574
869
  includePageCount: include_page_count ?? true,
575
- includeImages: include_images ?? false
870
+ includeImages: include_images ?? false,
871
+ includeTables: include_tables ?? false
576
872
  };
577
873
  for (let i = 0;i < sources.length; i += MAX_CONCURRENT_SOURCES) {
578
874
  const batch = sources.slice(i, i + MAX_CONCURRENT_SOURCES);
@@ -587,18 +883,27 @@ var readPdf = tool().description("Reads content/metadata/images from one or more
587
883
  const content = [];
588
884
  const resultsForJson = results.map((result) => {
589
885
  if (result.data) {
590
- const { images, page_contents, ...dataWithoutBinaryContent } = result.data;
886
+ const { images, page_contents, tables, ...dataWithoutBinaryContent } = result.data;
887
+ const processedData = { ...dataWithoutBinaryContent };
591
888
  if (images) {
592
- const imageInfo = images.map((img) => ({
889
+ processedData["image_info"] = images.map((img) => ({
593
890
  page: img.page,
594
891
  index: img.index,
595
892
  width: img.width,
596
893
  height: img.height,
597
894
  format: img.format
598
895
  }));
599
- return { ...result, data: { ...dataWithoutBinaryContent, image_info: imageInfo } };
600
896
  }
601
- return { ...result, data: dataWithoutBinaryContent };
897
+ if (tables && tables.length > 0) {
898
+ processedData["table_info"] = tables.map((tbl) => ({
899
+ page: tbl.page,
900
+ tableIndex: tbl.tableIndex,
901
+ rowCount: tbl.rowCount,
902
+ colCount: tbl.colCount,
903
+ confidence: tbl.confidence
904
+ }));
905
+ }
906
+ return { ...result, data: processedData };
602
907
  }
603
908
  return result;
604
909
  });
@@ -626,6 +931,18 @@ ${pageTextParts.join(`
626
931
  }
627
932
  }
628
933
  }
934
+ if (options.includeTables) {
935
+ const allTables = [];
936
+ for (const result of results) {
937
+ if (result.success && result.data?.tables) {
938
+ allTables.push(...result.data.tables);
939
+ }
940
+ }
941
+ if (allTables.length > 0) {
942
+ const markdownTables = tablesToMarkdown(allTables);
943
+ content.push(text(markdownTables));
944
+ }
945
+ }
629
946
  return content;
630
947
  });
631
948
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sylphx/pdf-reader-mcp",
3
- "version": "2.2.0",
3
+ "version": "2.3.1",
4
4
  "description": "An MCP server providing tools to read PDF files.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -69,29 +69,32 @@
69
69
  "prepare": "node_modules/.bin/lefthook install || true"
70
70
  },
71
71
  "dependencies": {
72
- "@sylphx/mcp-server-sdk": "^2.1.0",
72
+ "@sylphx/mcp-server-sdk": "^2.1.1",
73
73
  "@sylphx/vex": "^0.1.11",
74
- "glob": "^13.0.0",
75
- "pdfjs-dist": "^5.4.449",
74
+ "glob": "^13.0.6",
75
+ "pdfjs-dist": "^5.6.205",
76
76
  "pngjs": "^7.0.0"
77
77
  },
78
78
  "overrides": {
79
79
  "esbuild": "^0.25.0",
80
- "preact": "^10.28.2"
80
+ "preact": "^10.28.2",
81
+ "defu": "^6.1.7",
82
+ "rollup": "^4.60.2",
83
+ "vite": "^6.4.2"
81
84
  },
82
85
  "devDependencies": {
83
- "@biomejs/biome": "^2.3.8",
86
+ "@biomejs/biome": "^2.4.12",
84
87
  "@sylphx/biome-config": "^0.4.1",
85
88
  "@sylphx/bump": "^1.6.1",
86
- "@sylphx/doctor": "^1.32.1",
89
+ "@sylphx/doctor": "^1.34.0",
87
90
  "@sylphx/tsconfig": "^0.3.1",
88
91
  "@types/glob": "^8.1.0",
89
- "@types/node": "^25.0.3",
92
+ "@types/node": "^25.6.0",
90
93
  "@types/pngjs": "^6.0.5",
91
94
  "bunup": "0.16.10",
92
- "lefthook": "^2.0.7",
93
- "typedoc": "^0.28.15",
94
- "typedoc-plugin-markdown": "^4.9.0",
95
+ "lefthook": "^2.1.6",
96
+ "typedoc": "^0.28.19",
97
+ "typedoc-plugin-markdown": "^4.11.0",
95
98
  "typescript": "^5.9.3",
96
99
  "vitepress": "^1.6.4"
97
100
  },