pagesight 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/tools/robots.ts +64 -30
package/package.json
CHANGED
package/src/tools/robots.ts
CHANGED
|
@@ -2,31 +2,15 @@ import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
|
2
2
|
import { z } from "zod";
|
|
3
3
|
import { auditAiCrawlers, type CrawlerStatus, fetchRobotsTxt, isAllowed, type RobotsTxt } from "../lib/robots.js";
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
}
|
|
15
|
-
|
|
16
|
-
for (const [cat, bots] of categories) {
|
|
17
|
-
lines.push(`--- ${cat} (${bots.length}) ---`, "");
|
|
18
|
-
|
|
19
|
-
for (const bot of bots) {
|
|
20
|
-
const status = bot.allowed ? "ALLOWED" : "BLOCKED";
|
|
21
|
-
lines.push(` ${status} ${bot.name} (${bot.company})`);
|
|
22
|
-
if (bot.matchedRule) {
|
|
23
|
-
lines.push(` Rule: ${bot.matchedRule.type}: ${bot.matchedRule.path} (group: ${bot.matchedGroup})`);
|
|
24
|
-
}
|
|
25
|
-
}
|
|
26
|
-
lines.push("");
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
return lines.join("\n").trimEnd();
|
|
5
|
+
/**
 * Collapses a free-form registry category label into one of five display
 * buckets: "Training", "Search", "Assistant", "Agent", or "Other".
 *
 * Matching is a case-insensitive substring test, evaluated in priority order;
 * the first rule that matches decides the bucket.
 *
 * @param raw - Category text as it appears in the crawler registry.
 * @returns The normalized bucket name.
 */
function normalizeCategory(raw: string): string {
  const text = raw.toLowerCase();
  const mentions = (...needles: string[]): boolean => needles.some((needle) => text.includes(needle));

  // "train" is a substring of "training", so one needle covers both spellings.
  if (mentions("train", "scrape", "dataset")) {
    return "Training";
  }

  // Labels that mention both search and assistant belong to the Assistant bucket below.
  if (mentions("search") && !mentions("assistant")) {
    return "Search";
  }

  if (mentions("assistant", "user prompt", "user quer")) {
    return "Assistant";
  }

  return mentions("agent") ? "Agent" : "Other";
}
|
|
31
15
|
|
|
32
16
|
function formatRobotsAudit(origin: string, robots: RobotsTxt, statusCode: number, crawlers: CrawlerStatus[]): string {
|
|
@@ -63,17 +47,67 @@ function formatRobotsAudit(origin: string, robots: RobotsTxt, statusCode: number
|
|
|
63
47
|
}
|
|
64
48
|
|
|
65
49
|
// AI Crawler audit
|
|
66
|
-
const allowed = crawlers.filter((c) => c.allowed);
|
|
67
50
|
const blocked = crawlers.filter((c) => !c.allowed);
|
|
51
|
+
const allowed = crawlers.filter((c) => c.allowed);
|
|
68
52
|
|
|
69
53
|
lines.push(
|
|
70
54
|
"",
|
|
71
55
|
`--- AI Crawlers: ${blocked.length} blocked, ${allowed.length} allowed (of ${crawlers.length} known) ---`,
|
|
72
56
|
"",
|
|
73
|
-
`Source: github.com/ai-robots-txt/ai.robots.txt
|
|
74
|
-
"",
|
|
57
|
+
`Source: github.com/ai-robots-txt/ai.robots.txt`,
|
|
75
58
|
);
|
|
76
|
-
|
|
59
|
+
|
|
60
|
+
if (blocked.length === 0) {
|
|
61
|
+
lines.push("", "All 139 known AI crawlers are allowed. No bots are explicitly blocked.");
|
|
62
|
+
} else if (blocked.length === crawlers.length) {
|
|
63
|
+
lines.push("", "All known AI crawlers are blocked.");
|
|
64
|
+
// Show how they're blocked
|
|
65
|
+
const byGroup = new Map<string, string[]>();
|
|
66
|
+
for (const bot of blocked) {
|
|
67
|
+
const group = bot.matchedGroup ?? "wildcard";
|
|
68
|
+
if (!byGroup.has(group)) byGroup.set(group, []);
|
|
69
|
+
byGroup.get(group)?.push(bot.name);
|
|
70
|
+
}
|
|
71
|
+
for (const [group, bots] of byGroup) {
|
|
72
|
+
lines.push(` via group "${group}": ${bots.length} bots`);
|
|
73
|
+
}
|
|
74
|
+
} else {
|
|
75
|
+
// Mixed — show blocked bots in detail, grouped by normalized category
|
|
76
|
+
lines.push("");
|
|
77
|
+
|
|
78
|
+
const blockedByCategory = new Map<string, CrawlerStatus[]>();
|
|
79
|
+
for (const bot of blocked) {
|
|
80
|
+
const cat = normalizeCategory(bot.category);
|
|
81
|
+
if (!blockedByCategory.has(cat)) blockedByCategory.set(cat, []);
|
|
82
|
+
blockedByCategory.get(cat)?.push(bot);
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
const categoryOrder = ["Training", "Search", "Assistant", "Agent", "Other"];
|
|
86
|
+
for (const cat of categoryOrder) {
|
|
87
|
+
const bots = blockedByCategory.get(cat);
|
|
88
|
+
if (!bots) continue;
|
|
89
|
+
|
|
90
|
+
lines.push(`Blocked ${cat} (${bots.length}):`);
|
|
91
|
+
for (const bot of bots) {
|
|
92
|
+
lines.push(` BLOCKED ${bot.name} (${bot.company})`);
|
|
93
|
+
}
|
|
94
|
+
lines.push("");
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
// Summary of allowed by category
|
|
98
|
+
const allowedByCategory = new Map<string, number>();
|
|
99
|
+
for (const bot of allowed) {
|
|
100
|
+
const cat = normalizeCategory(bot.category);
|
|
101
|
+
allowedByCategory.set(cat, (allowedByCategory.get(cat) ?? 0) + 1);
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
const allowedSummary = categoryOrder
|
|
105
|
+
.filter((cat) => allowedByCategory.has(cat))
|
|
106
|
+
.map((cat) => `${cat}: ${allowedByCategory.get(cat)}`)
|
|
107
|
+
.join(", ");
|
|
108
|
+
|
|
109
|
+
lines.push(`Allowed (${allowed.length}): ${allowedSummary}`);
|
|
110
|
+
}
|
|
77
111
|
|
|
78
112
|
return lines.join("\n");
|
|
79
113
|
}
|
|
@@ -81,7 +115,7 @@ function formatRobotsAudit(origin: string, robots: RobotsTxt, statusCode: number
|
|
|
81
115
|
export function registerRobotsTool(server: McpServer): void {
|
|
82
116
|
server.tool(
|
|
83
117
|
"robots",
|
|
84
|
-
"Fetch and analyze a site's robots.txt. Validates syntax per RFC 9309, audits AI crawler access (
|
|
118
|
+
"Fetch and analyze a site's robots.txt. Validates syntax per RFC 9309, audits AI crawler access (139+ bots), lists sitemaps. Shows blocked bots in detail, summarizes allowed.",
|
|
85
119
|
{
|
|
86
120
|
url: z
|
|
87
121
|
.string()
|