@agentmarkup/audit 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -3
- package/dist/bin.js +1 -1
- package/dist/{chunk-PNE6FBX2.js → chunk-VYQOM2ID.js} +329 -129
- package/dist/index.d.ts +11 -1
- package/dist/index.js +7 -1
- package/package.json +5 -3
package/README.md
CHANGED
|
@@ -26,11 +26,14 @@ Bare domains are normalized to `https://`. Exit code is `1` when any **error**-l
|
|
|
26
26
|
|
|
27
27
|
| Area | What it does |
|
|
28
28
|
| --- | --- |
|
|
29
|
-
| **Crawler access** | Fetches as each AI crawler user-agent and diffs
|
|
29
|
+
| **Crawler access** | Fetches as each AI crawler user-agent and diffs against a browser control. Flags challenges, differential blocks, rate limits, origin errors, and when an *accessible* crawler gets materially less content than a browser (JS-gated or cloaked pages). |
|
|
30
30
|
| **JS dependence** | Measures whether the raw (un-executed) HTML actually contains content, or is an empty `#root`/`#app` shell that only fills in after JavaScript runs. |
|
|
31
31
|
| **robots.txt** | Reuses `@agentmarkup/core` to detect whether the crawlers you likely want are shadowed by a wildcard `Disallow`, and whether a canonical Content-Signal policy is present. |
|
|
32
|
-
| **llms.txt** | Fetches `/llms.txt
|
|
33
|
-
| **JSON-LD** | Extracts and
|
|
32
|
+
| **llms.txt** | Fetches `/llms.txt` (guarding against HTML soft-404s), validates it, and checks the homepage links it for discovery. |
|
|
33
|
+
| **JSON-LD** | Extracts JSON-LD and flags only unparseable or type-less blocks; parseable structured data (including `@graph`) passes. |
|
|
34
|
+
| **Markdown mirror** | Detects a fetchable markdown mirror or a `text/markdown` alternate link — the clean, low-noise version agents prefer. |
|
|
35
|
+
| **Sitemap** | Checks for `/sitemap.xml`, a `Sitemap:` directive in robots.txt, or common non-standard sitemap paths. |
|
|
36
|
+
| **Page metadata** | Checks for a title, meta description, and canonical link. |
|
|
34
37
|
|
|
35
38
|
## An honest note on "blocked" crawlers
|
|
36
39
|
|
package/dist/bin.js
CHANGED
|
@@ -49,122 +49,27 @@ var CRAWLER_AGENTS = [
|
|
|
49
49
|
];
|
|
50
50
|
var ALL_AGENTS = [BROWSER_CONTROL, ...CRAWLER_AGENTS];
|
|
51
51
|
|
|
52
|
-
// src/analyzers/crawler-access.ts
|
|
53
|
-
var CHALLENGE_MARKERS = [
|
|
54
|
-
"cf-browser-verification",
|
|
55
|
-
"challenge-platform",
|
|
56
|
-
"just a moment",
|
|
57
|
-
"attention required",
|
|
58
|
-
"enable javascript and cookies to continue"
|
|
59
|
-
];
|
|
60
|
-
function looksLikeBotChallenge(result) {
|
|
61
|
-
const mitigated = result.headers["cf-mitigated"];
|
|
62
|
-
if (mitigated && mitigated.toLowerCase().includes("challenge")) return true;
|
|
63
|
-
const body = (result.body ?? "").toLowerCase();
|
|
64
|
-
return CHALLENGE_MARKERS.some((marker) => body.includes(marker));
|
|
65
|
-
}
|
|
66
|
-
function statusClass(status) {
|
|
67
|
-
return status === null ? null : Math.floor(status / 100);
|
|
68
|
-
}
|
|
69
|
-
function analyzeCrawlerAccess(control, probes) {
|
|
70
|
-
const findings = [];
|
|
71
|
-
const controlClass = statusClass(control.status);
|
|
72
|
-
if (control.error || controlClass !== 2) {
|
|
73
|
-
findings.push({
|
|
74
|
-
code: "crawler.control-failed",
|
|
75
|
-
level: "warn",
|
|
76
|
-
title: "Could not establish a browser baseline",
|
|
77
|
-
detail: "The control request (normal browser user-agent) did not return a 2xx response, so bot-vs-browser differences cannot be judged reliably.",
|
|
78
|
-
evidence: `browser control: status=${control.status ?? "none"}${control.error ? ` error=${control.error}` : ""}`,
|
|
79
|
-
fix: "Confirm the URL is reachable and returns 200 in a browser, then re-run the audit."
|
|
80
|
-
});
|
|
81
|
-
return findings;
|
|
82
|
-
}
|
|
83
|
-
for (const { agent, result } of probes) {
|
|
84
|
-
const botClass = statusClass(result.status);
|
|
85
|
-
const evidence = `${agent.id} \u2192 status=${result.status ?? "none"}${result.error ? ` error=${result.error}` : ""}; browser \u2192 status=${control.status}`;
|
|
86
|
-
if (result.error === "timeout" || result.error === "network-error") {
|
|
87
|
-
findings.push({
|
|
88
|
-
code: "crawler.probe-failed",
|
|
89
|
-
level: "warn",
|
|
90
|
-
title: `Could not probe as ${agent.vendor} ${agent.id}`,
|
|
91
|
-
detail: `The request as ${agent.id} failed (${result.error}); no conclusion drawn for this crawler.`,
|
|
92
|
-
evidence
|
|
93
|
-
});
|
|
94
|
-
continue;
|
|
95
|
-
}
|
|
96
|
-
if (botClass === 2) {
|
|
97
|
-
findings.push({
|
|
98
|
-
code: "crawler.accessible",
|
|
99
|
-
level: "pass",
|
|
100
|
-
title: `${agent.vendor} ${agent.id} can reach the page`,
|
|
101
|
-
detail: `A request with the ${agent.id} user-agent returned the same success class as a browser.`,
|
|
102
|
-
evidence
|
|
103
|
-
});
|
|
104
|
-
continue;
|
|
105
|
-
}
|
|
106
|
-
if (result.status === 429) {
|
|
107
|
-
findings.push({
|
|
108
|
-
code: "crawler.rate-limited",
|
|
109
|
-
level: "warn",
|
|
110
|
-
title: `${agent.vendor} ${agent.id} is rate-limited`,
|
|
111
|
-
detail: `The ${agent.id} request was rate-limited (429). This is usually transient, but aggressive rate limits can starve crawlers of your content.`,
|
|
112
|
-
evidence
|
|
113
|
-
});
|
|
114
|
-
continue;
|
|
115
|
-
}
|
|
116
|
-
if (result.status === 403 || result.status === 401) {
|
|
117
|
-
const challenge = looksLikeBotChallenge(result);
|
|
118
|
-
if (challenge) {
|
|
119
|
-
findings.push({
|
|
120
|
-
code: "crawler.bot-challenge",
|
|
121
|
-
level: "warn",
|
|
122
|
-
title: `${agent.vendor} ${agent.id} hit a bot challenge`,
|
|
123
|
-
detail: `The ${agent.id} user-agent got a challenge/verification response (${result.status}). Because ${agent.id} is verified by ${agent.verification ?? "its published identity"}, the real crawler may pass where this spoofed user-agent does not. Confirm the verified bot is allowlisted at your CDN.`,
|
|
124
|
-
evidence,
|
|
125
|
-
fix: "Allowlist the crawler by its published IP ranges (verified bots) rather than relying on user-agent rules."
|
|
126
|
-
});
|
|
127
|
-
} else {
|
|
128
|
-
findings.push({
|
|
129
|
-
code: "crawler.ua-differential-block",
|
|
130
|
-
level: "warn",
|
|
131
|
-
title: `${agent.vendor} ${agent.id} is blocked from a generic IP`,
|
|
132
|
-
detail: `A browser gets ${control.status} but the ${agent.id} user-agent gets ${result.status}, with no challenge signal. Two things cause this and they mean opposite things: a user-agent-string WAF rule (which also blocks the real ${agent.id}) or IP allowlisting (where the verified ${agent.id} is fine). Check which it is at your CDN.`,
|
|
133
|
-
evidence,
|
|
134
|
-
fix: `If a WAF rule blocks the "${agent.id}" user-agent, remove or narrow it. If you allowlist verified bots by IP, no action is needed.`
|
|
135
|
-
});
|
|
136
|
-
}
|
|
137
|
-
continue;
|
|
138
|
-
}
|
|
139
|
-
if (botClass === 5) {
|
|
140
|
-
findings.push({
|
|
141
|
-
code: "crawler.origin-error",
|
|
142
|
-
level: "warn",
|
|
143
|
-
title: `${agent.vendor} ${agent.id} triggered a server error`,
|
|
144
|
-
detail: `The ${agent.id} user-agent got a ${result.status} while the browser got ${control.status}. Something in the stack treats this crawler differently and errors.`,
|
|
145
|
-
evidence
|
|
146
|
-
});
|
|
147
|
-
continue;
|
|
148
|
-
}
|
|
149
|
-
findings.push({
|
|
150
|
-
code: "crawler.differential-unknown",
|
|
151
|
-
level: "warn",
|
|
152
|
-
title: `${agent.vendor} ${agent.id} is treated differently than a browser`,
|
|
153
|
-
detail: `The ${agent.id} user-agent returned ${result.status} while a browser returned ${control.status}. The cause is unclear from the response; inspect the evidence.`,
|
|
154
|
-
evidence
|
|
155
|
-
});
|
|
156
|
-
}
|
|
157
|
-
return findings;
|
|
158
|
-
}
|
|
159
|
-
|
|
160
52
|
// src/analyzers/site-checks.ts
|
|
161
53
|
import {
|
|
162
54
|
extractJsonLdScriptContents,
|
|
163
55
|
findBlockedCrawlers,
|
|
164
56
|
hasLlmsTxtDiscoveryLink,
|
|
165
|
-
validateJsonLdNode,
|
|
166
57
|
validateLlmsTxt
|
|
167
58
|
} from "@agentmarkup/core";
|
|
59
|
+
// Matches a body that opens like an HTML document (a doctype or an <html> tag).
// Used to spot HTML pages returned in place of a plain-text or XML resource.
var HTML_BODY_RE = /^\s*(?:<!doctype\s+html|<html[\s>])/i;

/**
 * Decide whether a fetch result is a genuine non-HTML text resource.
 *
 * Returns false when the fetch errored, the status is 400 or above, the body is
 * missing, empty, or whitespace-only, the content-type says `text/html`, or the
 * body itself starts like an HTML document.
 *
 * @param {object} res - Fetch result; reads `error`, `status`, `body`, `headers`.
 * @returns {boolean} True when the response looks like real text content.
 */
function isRealTextResource(res) {
  if (res.error || (res.status ?? 0) >= 400) {
    return false;
  }
  // An empty body was already rejected; a whitespace-only body carries no more
  // content than an empty one, so treat both the same way.
  if (!res.body || String(res.body).trim() === "") {
    return false;
  }
  // Tolerate a result without a headers object instead of throwing.
  const contentType = String(res.headers?.["content-type"] ?? "").toLowerCase();
  if (contentType.includes("text/html")) {
    return false;
  }
  return !HTML_BODY_RE.test(res.body);
}
|
|
70
|
+
/**
 * Tell whether a parsed JSON-LD value is an object whose `@graph` property is
 * an array.
 *
 * @param {*} value - Any parsed JSON value.
 * @returns {boolean} True only for a non-null object with an array `@graph`.
 */
function isGraphContainer(value) {
  if (!value || typeof value !== "object") {
    return false;
  }
  return Array.isArray(value["@graph"]);
}
|
|
168
73
|
// Maps each crawler's product token (the part of its user-agent string before
// the first "/") to the string "allow".
var EXPECTED_CRAWLERS = Object.fromEntries(
  CRAWLER_AGENTS.map(({ ua }) => {
    const productToken = ua.split("/")[0];
    return [productToken, "allow"];
  })
);
|
|
@@ -214,7 +119,7 @@ function analyzeJsDependence(control) {
|
|
|
214
119
|
}
|
|
215
120
|
function analyzeRobots(robots) {
|
|
216
121
|
const findings = [];
|
|
217
|
-
const has =
|
|
122
|
+
const has = isRealTextResource(robots);
|
|
218
123
|
if (!has) {
|
|
219
124
|
findings.push({
|
|
220
125
|
code: "robots.missing",
|
|
@@ -267,7 +172,7 @@ function analyzeRobots(robots) {
|
|
|
267
172
|
function analyzeMachineReadable(control, llms) {
|
|
268
173
|
const findings = [];
|
|
269
174
|
const html = control.body ?? "";
|
|
270
|
-
const llmsOk =
|
|
175
|
+
const llmsOk = isRealTextResource(llms);
|
|
271
176
|
if (llmsOk) {
|
|
272
177
|
const results = validateLlmsTxt(llms.body ?? "");
|
|
273
178
|
const errors = results.filter((r) => r.severity === "error");
|
|
@@ -313,37 +218,274 @@ function analyzeMachineReadable(control, llms) {
|
|
|
313
218
|
fix: "Add JSON-LD with agentmarkup schema presets (webSite, organization, article, \u2026)."
|
|
314
219
|
});
|
|
315
220
|
} else {
|
|
316
|
-
|
|
221
|
+
let parseError = false;
|
|
222
|
+
let anyTyped = false;
|
|
317
223
|
for (const block of blocks) {
|
|
224
|
+
let parsed;
|
|
318
225
|
try {
|
|
319
|
-
|
|
320
|
-
|
|
226
|
+
parsed = JSON.parse(block);
|
|
227
|
+
} catch {
|
|
228
|
+
parseError = true;
|
|
229
|
+
continue;
|
|
230
|
+
}
|
|
231
|
+
const roots = Array.isArray(parsed) ? parsed : [parsed];
|
|
232
|
+
for (const root of roots) {
|
|
233
|
+
const nodes = isGraphContainer(root) ? root["@graph"] : [root];
|
|
321
234
|
for (const node of nodes) {
|
|
322
|
-
|
|
323
|
-
|
|
235
|
+
if (node && typeof node === "object" && "@type" in node) {
|
|
236
|
+
anyTyped = true;
|
|
324
237
|
}
|
|
325
238
|
}
|
|
326
|
-
} catch {
|
|
327
|
-
errors.push("a JSON-LD script block is not valid JSON");
|
|
328
239
|
}
|
|
329
240
|
}
|
|
330
|
-
|
|
331
|
-
|
|
241
|
+
if (parseError) {
|
|
242
|
+
findings.push({
|
|
243
|
+
code: "jsonld.invalid",
|
|
244
|
+
level: "error",
|
|
245
|
+
title: "JSON-LD has errors",
|
|
246
|
+
detail: "a JSON-LD script block is not valid JSON"
|
|
247
|
+
});
|
|
248
|
+
} else if (!anyTyped) {
|
|
249
|
+
findings.push({
|
|
332
250
|
code: "jsonld.invalid",
|
|
333
251
|
level: "error",
|
|
334
252
|
title: "JSON-LD has errors",
|
|
335
|
-
detail:
|
|
336
|
-
}
|
|
253
|
+
detail: "a JSON-LD block has no @type, so it is not usable structured data"
|
|
254
|
+
});
|
|
255
|
+
} else {
|
|
256
|
+
findings.push({
|
|
337
257
|
code: "jsonld.present",
|
|
338
258
|
level: "pass",
|
|
339
259
|
title: "JSON-LD structured data present",
|
|
340
|
-
detail: `${blocks.length} JSON-LD block(s) found and
|
|
341
|
-
}
|
|
342
|
-
|
|
260
|
+
detail: `${blocks.length} JSON-LD block(s) found and parseable.`
|
|
261
|
+
});
|
|
262
|
+
}
|
|
343
263
|
}
|
|
344
264
|
}
|
|
345
265
|
return findings;
|
|
346
266
|
}
|
|
267
|
+
/**
 * Report whether the HTML contains a `<link>` tag that is both
 * `rel="alternate"` and `type="text/markdown"`.
 *
 * Attribute names are matched as whole names, so `data-rel` or `content-type`
 * do not count. Whitespace around `=` is tolerated, as HTML allows it, and
 * `alternate` must be a whole token rather than a prefix such as `alternate-x`.
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {boolean} True when at least one matching `<link>` tag is found.
 */
function hasMarkdownAlternate(html) {
  const linkTags = html.match(/<link\b[^>]*>/gi) ?? [];
  // No `g` flag on these patterns, so repeated `.test` calls stay stateless.
  const relAlternate = /(?<![\w-])rel\s*=\s*["']?[^"'>]*(?<![\w-])alternate(?![\w-])/i;
  const typeMarkdown = /(?<![\w-])type\s*=\s*["']?\s*text\/markdown(?![\w-])/i;
  return linkTags.some((tag) => relAlternate.test(tag) && typeMarkdown.test(tag));
}
|
|
273
|
+
/**
 * Emit a single pass finding when the page offers a markdown version, either
 * through a `text/markdown` alternate link in the HTML or through a fetched
 * mirror that looks like markdown. Emits nothing when neither is present.
 *
 * @param {object} control - Browser-control fetch result; reads `body`.
 * @param {object} mirror - Fetch result for the candidate mirror; reads
 *   `headers` and `body`.
 * @returns {object[]} Zero or one finding.
 */
function analyzeMarkdown(control, mirror) {
  const pageHtml = control.body ?? "";
  const advertisedByLink = pageHtml.length > 0 && hasMarkdownAlternate(pageHtml);
  const mirrorContentType = (mirror.headers["content-type"] ?? "").toLowerCase();

  // The mirror counts only if it is a real text resource that either declares
  // a markdown content type or begins with "#".
  let mirrorIsMarkdown = false;
  if (isRealTextResource(mirror)) {
    mirrorIsMarkdown = mirrorContentType.includes("markdown") || /^\s*#/.test(mirror.body ?? "");
  }

  if (!(advertisedByLink || mirrorIsMarkdown)) {
    return [];
  }

  let detail;
  if (mirrorIsMarkdown) {
    detail = "A markdown mirror of the page is fetchable, giving agents a clean, low-noise version of the content.";
  } else {
    detail = "The page advertises a text/markdown alternate link for agents.";
  }

  return [
    {
      code: "markdown.present",
      level: "pass",
      title: "A markdown alternate is available for agents",
      detail
    }
  ];
}
|
|
290
|
+
/**
 * Decide whether a fetch result looks like an XML sitemap: reachable with a
 * non-empty body, XML-looking (a `<urlset>` / `<sitemapindex>` element or an
 * XML prolog), and not an HTML page.
 *
 * @param {object} sitemap - Fetch result; reads `body`, `headers`, `error`,
 *   `status`.
 * @returns {boolean} True when all three conditions hold.
 */
function isXmlSitemap(sitemap) {
  const text = sitemap.body ?? "";
  const declaredType = (sitemap.headers["content-type"] ?? "").toLowerCase();

  const statusOk = (sitemap.status ?? 0) < 400;
  if (sitemap.error || !statusOk || text.length === 0) {
    return false;
  }

  const hasSitemapRoot = /<(?:urlset|sitemapindex)\b/i.test(text);
  const hasXmlProlog = /^\s*<\?xml/i.test(text);
  if (!hasSitemapRoot && !hasXmlProlog) {
    return false;
  }

  if (declaredType.includes("text/html")) {
    return false;
  }
  return !HTML_BODY_RE.test(text);
}
|
|
298
|
+
/**
 * Report whether a sitemap is discoverable: a pass when the fetched sitemap
 * looks like XML or robots.txt has a `Sitemap:` line, otherwise a warning.
 *
 * @param {object} sitemap - Fetch result for the sitemap candidate.
 * @param {object} robots - Fetch result for robots.txt; reads `body`.
 * @returns {object[]} Exactly one finding.
 */
function analyzeSitemap(sitemap, robots) {
  const declaredInRobots = /^\s*sitemap\s*:/im.test(robots.body ?? "");
  const fetchedLooksValid = isXmlSitemap(sitemap);

  if (!fetchedLooksValid && !declaredInRobots) {
    return [
      {
        code: "sitemap.missing",
        level: "warn",
        title: "No sitemap.xml found",
        detail: "No reachable sitemap.xml. A sitemap helps crawlers and AI systems discover pages they would not reach by following links.",
        fix: "Generate a sitemap.xml and reference it from robots.txt."
      }
    ];
  }

  // When both signals are present, the robots.txt declaration is the one reported.
  let detail;
  if (declaredInRobots) {
    detail = "A sitemap is declared in robots.txt, which helps crawlers and AI systems discover all of your pages.";
  } else {
    detail = "A sitemap.xml is reachable, which helps crawlers and AI systems discover all of your pages.";
  }

  return [
    {
      code: "sitemap.present",
      level: "pass",
      title: "Sitemap found",
      detail
    }
  ];
}
|
|
320
|
+
/**
 * Check the page HTML for core head metadata: a non-empty `<title>`, a
 * `<meta name="description">` with non-blank content, and a `<link>` whose
 * `rel` contains the `canonical` token.
 *
 * Attributes are tokenised per tag rather than sniffed with loose patterns, so
 * unquoted values, whitespace around `=`, a `>` inside a quoted value, and
 * multi-token `rel` values are read correctly, and look-alike attributes such
 * as `data-content` are not mistaken for `content`.
 *
 * @param {object} control - Browser-control fetch result; reads `error`,
 *   `status`, and `body`.
 * @returns {object[]} Empty when the control fetch failed or has no body;
 *   otherwise one `meta.complete` or `meta.incomplete` finding.
 */
function analyzeMetadata(control) {
  if (control.error || (control.status ?? 0) >= 400 || !control.body) {
    return [];
  }
  const html = control.body;

  // Parse one start tag into a Map of lowercase attribute name -> value.
  // A valueless attribute maps to ""; the first occurrence of a name wins.
  const readAttributes = (tag) => {
    const attributes = new Map();
    // Drop the leading "<tagname" and the trailing ">" or "/>".
    const inner = tag.replace(/^<[^\s/>]+/, "").replace(/\/?>$/, "");
    const attributePattern = /([^\s"'<>/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+)))?/g;
    for (const [, rawName, doubleQuoted, singleQuoted, bare] of inner.matchAll(attributePattern)) {
      const name = rawName.toLowerCase();
      if (!attributes.has(name)) {
        attributes.set(name, doubleQuoted ?? singleQuoted ?? bare ?? "");
      }
    }
    return attributes;
  };

  const missing = [];

  const titleMatch = /<title\b[^>]*>([\s\S]*?)<\/title>/i.exec(html);
  if (!titleMatch || titleMatch[1].trim().length === 0) {
    missing.push("title");
  }

  // Quote-aware tag matching: a ">" inside a quoted attribute value does not
  // end the tag.
  const metaTags = html.match(/<meta\b(?:"[^"]*"|'[^']*'|[^"'>])*>/gi) ?? [];
  const hasDescription = metaTags.some((tag) => {
    const attributes = readAttributes(tag);
    const name = (attributes.get("name") ?? "").trim().toLowerCase();
    const content = (attributes.get("content") ?? "").trim();
    return name === "description" && content.length > 0;
  });
  if (!hasDescription) {
    missing.push("description");
  }

  const linkTags = html.match(/<link\b(?:"[^"]*"|'[^']*'|[^"'>])*>/gi) ?? [];
  const hasCanonical = linkTags.some((tag) => {
    // `rel` is a space-separated token list; "canonical" may appear anywhere in it.
    const relTokens = (readAttributes(tag).get("rel") ?? "").toLowerCase().split(/\s+/);
    return relTokens.includes("canonical");
  });
  if (!hasCanonical) {
    missing.push("canonical");
  }

  if (missing.length === 0) {
    return [
      {
        code: "meta.complete",
        level: "pass",
        title: "Core page metadata present",
        detail: "The page has a title, a meta description, and a canonical link, which help AI systems and search attribute the page."
      }
    ];
  }

  const missingList = missing.join(", ");
  return [
    {
      code: "meta.incomplete",
      level: "warn",
      title: "Core page metadata is incomplete",
      detail: `Missing: ${missingList}. Title, meta description, and canonical link help AI systems and search understand and correctly attribute the page.`,
      evidence: `missing: ${missingList}`,
      fix: "Add the missing head tags; agentmarkup keeps these consistent on generated pages."
    }
  ];
}
|
|
363
|
+
|
|
364
|
+
// src/analyzers/crawler-access.ts
|
|
365
|
+
/**
 * Decide whether a crawler received materially less text than the browser.
 *
 * True only when the browser text is at least 500 characters, the crawler text
 * is under 40% of it, and the absolute gap is at least 500 characters.
 *
 * @param {number} controlTextLength - Tag-stripped text length of the browser response.
 * @param {string|null|undefined} crawlerBody - Raw body of the crawler response.
 * @returns {boolean} Whether the crawler response is materially thinner.
 */
function isThinnerThanBrowser(controlTextLength, crawlerBody) {
  if (controlTextLength < 500) {
    return false;
  }
  const crawlerText = stripTags(crawlerBody ?? "");
  const underRatio = crawlerText.length < controlTextLength * 0.4;
  const shortfall = controlTextLength - crawlerText.length;
  return underRatio && shortfall >= 500;
}
|
|
370
|
+
// Lowercase substrings looked for in a response body to recognise a
// challenge / verification page.
var CHALLENGE_MARKERS = [
  "cf-browser-verification",
  "challenge-platform",
  "just a moment",
  "attention required",
  "enable javascript and cookies to continue"
];

/**
 * Tell whether a response looks like a bot challenge: either the
 * `cf-mitigated` header mentions "challenge", or the lowercased body contains
 * one of the known marker strings.
 *
 * @param {object} result - Fetch result; reads `headers` and `body`.
 * @returns {boolean} True when a challenge signal is found.
 */
function looksLikeBotChallenge(result) {
  const mitigated = result.headers["cf-mitigated"];
  if (mitigated) {
    if (mitigated.toLowerCase().includes("challenge")) {
      return true;
    }
  }
  const loweredBody = (result.body ?? "").toLowerCase();
  for (const marker of CHALLENGE_MARKERS) {
    if (loweredBody.includes(marker)) {
      return true;
    }
  }
  return false;
}
|
|
383
|
+
/**
 * Reduce an HTTP status code to its hundreds class (200 -> 2, 404 -> 4).
 *
 * @param {number|null} status - HTTP status, or null when there was no response.
 * @returns {number|null} The status class, or null for a null status.
 */
function statusClass(status) {
  if (status === null) {
    return null;
  }
  return Math.floor(status / 100);
}
|
|
386
|
+
/**
 * Compare each crawler probe against the browser control and classify it.
 *
 * If the control itself errored or was not 2xx, a single
 * `crawler.control-failed` warning is returned and no probe is judged.
 * Otherwise each probe yields exactly one finding, decided in this order:
 * probe failure (timeout / network error), 2xx (thinner content vs.
 * accessible), 429, 401/403 (challenge vs. plain block), 5xx, anything else.
 *
 * @param {object} control - Browser-control fetch result; reads `status`,
 *   `error`, `body`.
 * @param {Iterable<{agent: object, result: object}>} probes - One entry per
 *   crawler; reads `agent.id`, `agent.vendor`, `agent.verification` and the
 *   probe's `status`, `error`, `body`, `headers`.
 * @returns {object[]} Findings in probe order.
 */
function analyzeCrawlerAccess(control, probes) {
  const findings = [];
  const controlClass = statusClass(control.status);
  const controlTextLength = stripTags(control.body ?? "").length;

  // Without a 2xx browser baseline there is nothing to compare against.
  if (control.error || controlClass !== 2) {
    const controlErrorSuffix = control.error ? ` error=${control.error}` : "";
    findings.push({
      code: "crawler.control-failed",
      level: "warn",
      title: "Could not establish a browser baseline",
      detail: "The control request (normal browser user-agent) did not return a 2xx response, so bot-vs-browser differences cannot be judged reliably.",
      evidence: `browser control: status=${control.status ?? "none"}${controlErrorSuffix}`,
      fix: "Confirm the URL is reachable and returns 200 in a browser, then re-run the audit."
    });
    return findings;
  }

  for (const probe of probes) {
    const { agent, result } = probe;
    const botClass = statusClass(result.status);
    const probeErrorSuffix = result.error ? ` error=${result.error}` : "";
    const evidence = `${agent.id} \u2192 status=${result.status ?? "none"}${probeErrorSuffix}; browser \u2192 status=${control.status}`;
    const label = `${agent.vendor} ${agent.id}`;

    if (result.error === "timeout" || result.error === "network-error") {
      findings.push({
        code: "crawler.probe-failed",
        level: "warn",
        title: `Could not probe as ${label}`,
        detail: `The request as ${agent.id} failed (${result.error}); no conclusion drawn for this crawler.`,
        evidence
      });
    } else if (botClass === 2) {
      if (isThinnerThanBrowser(controlTextLength, result.body)) {
        // Reachable, but the crawler's HTML carries far less text than the browser's.
        const crawlerTextLength = stripTags(result.body ?? "").length;
        findings.push({
          code: "crawler.content-differential",
          level: "warn",
          title: `${label} gets much less content than a browser`,
          detail: `The ${agent.id} user-agent reached the page (${result.status}) but its HTML has far less text than the browser's (${crawlerTextLength} vs ${controlTextLength} characters). Content may be gated behind JavaScript or served only to browsers, so the crawler indexes a thinner page.`,
          evidence: `${agent.id} text=${crawlerTextLength} chars; browser text=${controlTextLength} chars`,
          fix: "Server-render or prerender the shared content, or provide a markdown mirror, so crawlers get the same text as browsers."
        });
      } else {
        findings.push({
          code: "crawler.accessible",
          level: "pass",
          title: `${label} can reach the page`,
          detail: `A request with the ${agent.id} user-agent returned the same success class as a browser.`,
          evidence
        });
      }
    } else if (result.status === 429) {
      findings.push({
        code: "crawler.rate-limited",
        level: "warn",
        title: `${label} is rate-limited`,
        detail: `The ${agent.id} request was rate-limited (429). This is usually transient, but aggressive rate limits can starve crawlers of your content.`,
        evidence
      });
    } else if (result.status === 403 || result.status === 401) {
      if (looksLikeBotChallenge(result)) {
        findings.push({
          code: "crawler.bot-challenge",
          level: "warn",
          title: `${label} hit a bot challenge`,
          detail: `The ${agent.id} user-agent got a challenge/verification response (${result.status}). Because ${agent.id} is verified by ${agent.verification ?? "its published identity"}, the real crawler may pass where this spoofed user-agent does not. Confirm the verified bot is allowlisted at your CDN.`,
          evidence,
          fix: "Allowlist the crawler by its published IP ranges (verified bots) rather than relying on user-agent rules."
        });
      } else {
        findings.push({
          code: "crawler.ua-differential-block",
          level: "warn",
          title: `${label} is blocked from a generic IP`,
          detail: `A browser gets ${control.status} but the ${agent.id} user-agent gets ${result.status}, with no challenge signal. Two things cause this and they mean opposite things: a user-agent-string WAF rule (which also blocks the real ${agent.id}) or IP allowlisting (where the verified ${agent.id} is fine). Check which it is at your CDN.`,
          evidence,
          fix: `If a WAF rule blocks the "${agent.id}" user-agent, remove or narrow it. If you allowlist verified bots by IP, no action is needed.`
        });
      }
    } else if (botClass === 5) {
      findings.push({
        code: "crawler.origin-error",
        level: "warn",
        title: `${label} triggered a server error`,
        detail: `The ${agent.id} user-agent got a ${result.status} while the browser got ${control.status}. Something in the stack treats this crawler differently and errors.`,
        evidence
      });
    } else {
      // Any other status that differs from the browser's 2xx (for example 3xx or 404).
      findings.push({
        code: "crawler.differential-unknown",
        level: "warn",
        title: `${label} is treated differently than a browser`,
        detail: `The ${agent.id} user-agent returned ${result.status} while a browser returned ${control.status}. The cause is unclear from the response; inspect the evidence.`,
        evidence
      });
    }
  }

  return findings;
}
|
|
347
489
|
|
|
348
490
|
// src/findings.ts
|
|
349
491
|
function worstLevel(findings) {
|
|
@@ -584,6 +726,12 @@ async function safeFetch(targetUrl, options) {
|
|
|
584
726
|
}
|
|
585
727
|
|
|
586
728
|
// src/audit.ts
|
|
729
|
+
// Non-standard sitemap locations, listed in the order they should be tried.
var SITEMAP_FALLBACK_PATHS = ["/sitemap_index.xml", "/sitemap-index.xml", "/wp-sitemap.xml", "/sitemap/sitemap.xml"];
|
|
587
735
|
function originOf(url) {
|
|
588
736
|
try {
|
|
589
737
|
return new URL(url).origin;
|
|
@@ -591,6 +739,30 @@ function originOf(url) {
|
|
|
591
739
|
return url.replace(/\/+$/, "");
|
|
592
740
|
}
|
|
593
741
|
}
|
|
742
|
+
/**
 * Derive the URL where a markdown mirror of a page would live.
 *
 * The site root maps to `/index.md`, and an extension-less path maps to the
 * same path with `.md` appended (trailing slashes, query, and hash dropped).
 * A path that already ends in a file extension has no candidate.
 *
 * @param {string} pageUrl - Absolute URL of the page.
 * @returns {string|null} Candidate mirror URL, or null when the URL cannot be
 *   parsed or its path already has an extension.
 */
function markdownMirrorUrl(pageUrl) {
  let parsed;
  try {
    parsed = new URL(pageUrl);
  } catch {
    return null;
  }

  const trimmedPath = parsed.pathname.replace(/\/+$/, "");
  if (trimmedPath === "") {
    return `${parsed.origin}/index.md`;
  }

  const hasExtension = /\.[a-z0-9]+$/i.test(trimmedPath);
  return hasExtension ? null : `${parsed.origin}${trimmedPath}.md`;
}
|
|
753
|
+
/**
 * Build a placeholder fetch result for a URL that was never requested: no
 * status, no body, empty headers, and `ok` / `blocked` both false.
 *
 * @param {string} url - The URL recorded as both requested and final.
 * @returns {object} A fresh result object on every call.
 */
function notFetched(url) {
  const placeholder = { requestedUrl: url, finalUrl: url };
  placeholder.status = null;
  placeholder.ok = false;
  placeholder.headers = {};
  placeholder.body = null;
  placeholder.bodyBytes = 0;
  placeholder.redirects = 0;
  placeholder.blocked = false;
  return placeholder;
}
|
|
594
766
|
async function audit(targetUrl, options) {
|
|
595
767
|
const doFetch = options.fetchImpl ?? safeFetch;
|
|
596
768
|
const timeoutMs = options.timeoutMs;
|
|
@@ -605,8 +777,7 @@ async function audit(targetUrl, options) {
|
|
|
605
777
|
const result = await doFetch(targetUrl, {
|
|
606
778
|
userAgent: agent.ua,
|
|
607
779
|
timeoutMs,
|
|
608
|
-
readBody: true,
|
|
609
|
-
maxBytes: 64 * 1024
|
|
780
|
+
readBody: true
|
|
610
781
|
});
|
|
611
782
|
probes.push({ agent, result });
|
|
612
783
|
}
|
|
@@ -622,11 +793,37 @@ async function audit(targetUrl, options) {
|
|
|
622
793
|
readBody: true,
|
|
623
794
|
maxBytes: 1024 * 1024
|
|
624
795
|
});
|
|
796
|
+
const fetchSitemap = (path) => doFetch(`${origin}${path}`, {
|
|
797
|
+
userAgent: BROWSER_CONTROL.ua,
|
|
798
|
+
timeoutMs,
|
|
799
|
+
readBody: true,
|
|
800
|
+
maxBytes: 1024 * 1024
|
|
801
|
+
});
|
|
802
|
+
let sitemap = await fetchSitemap("/sitemap.xml");
|
|
803
|
+
if (!isXmlSitemap(sitemap) && !/^\s*sitemap\s*:/im.test(robots.body ?? "")) {
|
|
804
|
+
for (const path of SITEMAP_FALLBACK_PATHS) {
|
|
805
|
+
const candidate = await fetchSitemap(path);
|
|
806
|
+
if (isXmlSitemap(candidate)) {
|
|
807
|
+
sitemap = candidate;
|
|
808
|
+
break;
|
|
809
|
+
}
|
|
810
|
+
}
|
|
811
|
+
}
|
|
812
|
+
const mirrorUrl = markdownMirrorUrl(control.finalUrl || targetUrl);
|
|
813
|
+
const mirror = mirrorUrl ? await doFetch(mirrorUrl, {
|
|
814
|
+
userAgent: BROWSER_CONTROL.ua,
|
|
815
|
+
timeoutMs,
|
|
816
|
+
readBody: true,
|
|
817
|
+
maxBytes: 1024 * 1024
|
|
818
|
+
}) : notFetched(`${origin}/index.md`);
|
|
625
819
|
const findings = [
|
|
626
820
|
...analyzeCrawlerAccess(control, probes),
|
|
627
821
|
...analyzeJsDependence(control),
|
|
628
822
|
...analyzeRobots(robots),
|
|
629
|
-
...analyzeMachineReadable(control, llms)
|
|
823
|
+
...analyzeMachineReadable(control, llms),
|
|
824
|
+
...analyzeMarkdown(control, mirror),
|
|
825
|
+
...analyzeSitemap(sitemap, robots),
|
|
826
|
+
...analyzeMetadata(control)
|
|
630
827
|
];
|
|
631
828
|
const counts = countByLevel(findings);
|
|
632
829
|
const passed = counts.pass;
|
|
@@ -748,10 +945,13 @@ export {
|
|
|
748
945
|
BROWSER_CONTROL,
|
|
749
946
|
CRAWLER_AGENTS,
|
|
750
947
|
ALL_AGENTS,
|
|
751
|
-
analyzeCrawlerAccess,
|
|
752
948
|
analyzeJsDependence,
|
|
753
949
|
analyzeRobots,
|
|
754
950
|
analyzeMachineReadable,
|
|
951
|
+
analyzeMarkdown,
|
|
952
|
+
analyzeSitemap,
|
|
953
|
+
analyzeMetadata,
|
|
954
|
+
analyzeCrawlerAccess,
|
|
755
955
|
worstLevel,
|
|
756
956
|
countByLevel,
|
|
757
957
|
parseIpv4,
|
package/dist/index.d.ts
CHANGED
|
@@ -135,5 +135,15 @@ declare function analyzeJsDependence(control: FetchResult): AuditFinding[];
|
|
|
135
135
|
declare function analyzeRobots(robots: FetchResult): AuditFinding[];
|
|
136
136
|
/** Machine-readability surface on the homepage HTML plus a fetched llms.txt. */
|
|
137
137
|
declare function analyzeMachineReadable(control: FetchResult, llms: FetchResult): AuditFinding[];
|
|
138
|
+
/**
|
|
139
|
+
* Markdown mirrors / alternates are optional but valuable: they give agents a
|
|
140
|
+
* clean, low-noise version of the page (agentmarkup can generate them, and some
|
|
141
|
+
* CDNs serve runtime markdown). Present is a pass; absent emits no finding
|
|
142
|
+
* because a content-rich HTML page does not need one.
|
|
143
|
+
*/
|
|
144
|
+
declare function analyzeMarkdown(control: FetchResult, mirror: FetchResult): AuditFinding[];
|
|
145
|
+
declare function analyzeSitemap(sitemap: FetchResult, robots: FetchResult): AuditFinding[];
|
|
146
|
+
/** Core head metadata (title / description / canonical) crawlers use to attribute a page. */
|
|
147
|
+
declare function analyzeMetadata(control: FetchResult): AuditFinding[];
|
|
138
148
|
|
|
139
|
-
export { ALL_AGENTS, type AgentProbe, type AuditFinding, type AuditLevel, type AuditOptions, type AuditReport, BROWSER_CONTROL, CRAWLER_AGENTS, type CrawlerAgent, type FetchOptions, type FetchResult, type RunContext, analyzeCrawlerAccess, analyzeJsDependence, analyzeMachineReadable, analyzeRobots, audit, countByLevel, isBlockedHostname, parseIpv4, parseIpv6, renderJson, renderText, run, safeFetch, worstLevel };
|
|
149
|
+
export { ALL_AGENTS, type AgentProbe, type AuditFinding, type AuditLevel, type AuditOptions, type AuditReport, BROWSER_CONTROL, CRAWLER_AGENTS, type CrawlerAgent, type FetchOptions, type FetchResult, type RunContext, analyzeCrawlerAccess, analyzeJsDependence, analyzeMachineReadable, analyzeMarkdown, analyzeMetadata, analyzeRobots, analyzeSitemap, audit, countByLevel, isBlockedHostname, parseIpv4, parseIpv6, renderJson, renderText, run, safeFetch, worstLevel };
|
package/dist/index.js
CHANGED
|
@@ -5,7 +5,10 @@ import {
|
|
|
5
5
|
analyzeCrawlerAccess,
|
|
6
6
|
analyzeJsDependence,
|
|
7
7
|
analyzeMachineReadable,
|
|
8
|
+
analyzeMarkdown,
|
|
9
|
+
analyzeMetadata,
|
|
8
10
|
analyzeRobots,
|
|
11
|
+
analyzeSitemap,
|
|
9
12
|
audit,
|
|
10
13
|
countByLevel,
|
|
11
14
|
isBlockedHostname,
|
|
@@ -16,7 +19,7 @@ import {
|
|
|
16
19
|
run,
|
|
17
20
|
safeFetch,
|
|
18
21
|
worstLevel
|
|
19
|
-
} from "./chunk-PNE6FBX2.js";
|
|
22
|
+
} from "./chunk-VYQOM2ID.js";
|
|
20
23
|
export {
|
|
21
24
|
ALL_AGENTS,
|
|
22
25
|
BROWSER_CONTROL,
|
|
@@ -24,7 +27,10 @@ export {
|
|
|
24
27
|
analyzeCrawlerAccess,
|
|
25
28
|
analyzeJsDependence,
|
|
26
29
|
analyzeMachineReadable,
|
|
30
|
+
analyzeMarkdown,
|
|
31
|
+
analyzeMetadata,
|
|
27
32
|
analyzeRobots,
|
|
33
|
+
analyzeSitemap,
|
|
28
34
|
audit,
|
|
29
35
|
countByLevel,
|
|
30
36
|
isBlockedHostname,
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@agentmarkup/audit",
|
|
3
|
-
"version": "0.1.0",
|
|
4
|
-
"description": "Audit a live URL the way AI crawlers see it: fetch as GPTBot, ClaudeBot, PerplexityBot and more, diff against a browser to catch accidental CDN blocks, plus llms.txt, JSON-LD, robots.txt intent, Content-Signal, and
|
|
3
|
+
"version": "0.2.1",
|
|
4
|
+
"description": "Audit a live URL the way AI crawlers see it: fetch as GPTBot, ClaudeBot, PerplexityBot and more, diff against a browser to catch accidental CDN blocks and JS-gated content, plus llms.txt, JSON-LD, robots.txt intent, Content-Signal, markdown mirror, sitemap, and page-metadata checks",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"license": "MIT",
|
|
7
7
|
"author": "Sebastian Cochinescu <hello@animafelix.com> (https://animafelix.com)",
|
|
@@ -23,6 +23,8 @@
|
|
|
23
23
|
"robots-txt",
|
|
24
24
|
"content-signal",
|
|
25
25
|
"json-ld",
|
|
26
|
+
"sitemap",
|
|
27
|
+
"markdown",
|
|
26
28
|
"geo",
|
|
27
29
|
"aeo",
|
|
28
30
|
"seo",
|
|
@@ -45,7 +47,7 @@
|
|
|
45
47
|
"dist"
|
|
46
48
|
],
|
|
47
49
|
"dependencies": {
|
|
48
|
-
"@agentmarkup/core": "0.5.
|
|
50
|
+
"@agentmarkup/core": "0.5.3"
|
|
49
51
|
},
|
|
50
52
|
"devDependencies": {
|
|
51
53
|
"eslint": "^9.0.0",
|