pagesight 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/src/tools/robots.ts +64 -30
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pagesight",
3
- "version": "0.2.0",
3
+ "version": "0.2.1",
4
4
  "description": "See your site the way search engines and AI see it.",
5
5
  "keywords": [
6
6
  "seo",
@@ -2,31 +2,15 @@ import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
2
2
  import { z } from "zod";
3
3
  import { auditAiCrawlers, type CrawlerStatus, fetchRobotsTxt, isAllowed, type RobotsTxt } from "../lib/robots.js";
4
4
 
5
- function formatCrawlerStatus(statuses: CrawlerStatus[]): string {
6
- const lines: string[] = [];
7
-
8
- // Group by category
9
- const categories = new Map<string, CrawlerStatus[]>();
10
- for (const s of statuses) {
11
- const cat = s.category;
12
- if (!categories.has(cat)) categories.set(cat, []);
13
- categories.get(cat)?.push(s);
14
- }
15
-
16
- for (const [cat, bots] of categories) {
17
- lines.push(`--- ${cat} (${bots.length}) ---`, "");
18
-
19
- for (const bot of bots) {
20
- const status = bot.allowed ? "ALLOWED" : "BLOCKED";
21
- lines.push(` ${status} ${bot.name} (${bot.company})`);
22
- if (bot.matchedRule) {
23
- lines.push(` Rule: ${bot.matchedRule.type}: ${bot.matchedRule.path} (group: ${bot.matchedGroup})`);
24
- }
25
- }
26
- lines.push("");
27
- }
28
-
29
- return lines.join("\n").trimEnd();
5
+ // Normalize the messy registry categories into clean buckets
6
+ function normalizeCategory(raw: string): string {
7
+ const lower = raw.toLowerCase();
8
+ if (lower.includes("training") || lower.includes("train") || lower.includes("scrape") || lower.includes("dataset"))
9
+ return "Training";
10
+ if (lower.includes("search") && !lower.includes("assistant")) return "Search";
11
+ if (lower.includes("assistant") || lower.includes("user prompt") || lower.includes("user quer")) return "Assistant";
12
+ if (lower.includes("agent")) return "Agent";
13
+ return "Other";
30
14
  }
31
15
 
32
16
  function formatRobotsAudit(origin: string, robots: RobotsTxt, statusCode: number, crawlers: CrawlerStatus[]): string {
@@ -63,17 +47,67 @@ function formatRobotsAudit(origin: string, robots: RobotsTxt, statusCode: number
63
47
  }
64
48
 
65
49
  // AI Crawler audit
66
- const allowed = crawlers.filter((c) => c.allowed);
67
50
  const blocked = crawlers.filter((c) => !c.allowed);
51
+ const allowed = crawlers.filter((c) => c.allowed);
68
52
 
69
53
  lines.push(
70
54
  "",
71
55
  `--- AI Crawlers: ${blocked.length} blocked, ${allowed.length} allowed (of ${crawlers.length} known) ---`,
72
56
  "",
73
- `Source: github.com/ai-robots-txt/ai.robots.txt (${crawlers.length} bots)`,
74
- "",
57
+ `Source: github.com/ai-robots-txt/ai.robots.txt`,
75
58
  );
76
- lines.push(formatCrawlerStatus(crawlers));
59
+
60
+ if (blocked.length === 0) {
61
+ lines.push("", "All 139 known AI crawlers are allowed. No bots are explicitly blocked.");
62
+ } else if (blocked.length === crawlers.length) {
63
+ lines.push("", "All known AI crawlers are blocked.");
64
+ // Show how they're blocked
65
+ const byGroup = new Map<string, string[]>();
66
+ for (const bot of blocked) {
67
+ const group = bot.matchedGroup ?? "wildcard";
68
+ if (!byGroup.has(group)) byGroup.set(group, []);
69
+ byGroup.get(group)?.push(bot.name);
70
+ }
71
+ for (const [group, bots] of byGroup) {
72
+ lines.push(` via group "${group}": ${bots.length} bots`);
73
+ }
74
+ } else {
75
+ // Mixed — show blocked bots in detail, grouped by normalized category
76
+ lines.push("");
77
+
78
+ const blockedByCategory = new Map<string, CrawlerStatus[]>();
79
+ for (const bot of blocked) {
80
+ const cat = normalizeCategory(bot.category);
81
+ if (!blockedByCategory.has(cat)) blockedByCategory.set(cat, []);
82
+ blockedByCategory.get(cat)?.push(bot);
83
+ }
84
+
85
+ const categoryOrder = ["Training", "Search", "Assistant", "Agent", "Other"];
86
+ for (const cat of categoryOrder) {
87
+ const bots = blockedByCategory.get(cat);
88
+ if (!bots) continue;
89
+
90
+ lines.push(`Blocked ${cat} (${bots.length}):`);
91
+ for (const bot of bots) {
92
+ lines.push(` BLOCKED ${bot.name} (${bot.company})`);
93
+ }
94
+ lines.push("");
95
+ }
96
+
97
+ // Summary of allowed by category
98
+ const allowedByCategory = new Map<string, number>();
99
+ for (const bot of allowed) {
100
+ const cat = normalizeCategory(bot.category);
101
+ allowedByCategory.set(cat, (allowedByCategory.get(cat) ?? 0) + 1);
102
+ }
103
+
104
+ const allowedSummary = categoryOrder
105
+ .filter((cat) => allowedByCategory.has(cat))
106
+ .map((cat) => `${cat}: ${allowedByCategory.get(cat)}`)
107
+ .join(", ");
108
+
109
+ lines.push(`Allowed (${allowed.length}): ${allowedSummary}`);
110
+ }
77
111
 
78
112
  return lines.join("\n");
79
113
  }
@@ -81,7 +115,7 @@ function formatRobotsAudit(origin: string, robots: RobotsTxt, statusCode: number
81
115
  export function registerRobotsTool(server: McpServer): void {
82
116
  server.tool(
83
117
  "robots",
84
- "Fetch and analyze a site's robots.txt. Validates syntax per RFC 9309, audits AI crawler access (130+ bots from ai-robots-txt registry), lists sitemaps, and reports blocked vs allowed bots by category.",
118
+ "Fetch and analyze a site's robots.txt. Validates syntax per RFC 9309, audits AI crawler access (139+ bots), lists sitemaps. Shows blocked bots in detail, summarizes allowed.",
85
119
  {
86
120
  url: z
87
121
  .string()