@agentmarkup/audit 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -26,11 +26,14 @@ Bare domains are normalized to `https://`. Exit code is `1` when any **error**-l
26
26
 
27
27
  | Area | What it does |
28
28
  | --- | --- |
29
- | **Crawler access** | Fetches as each AI crawler user-agent and diffs status against a browser control. Flags challenges, differential blocks, rate limits, and origin errors. |
29
+ | **Crawler access** | Fetches as each AI crawler user-agent and diffs against a browser control. Flags challenges, differential blocks, rate limits, origin errors, and when an *accessible* crawler gets materially less content than a browser (JS-gated or cloaked pages). |
30
30
  | **JS dependence** | Measures whether the raw (un-executed) HTML actually contains content, or is an empty `#root`/`#app` shell that only fills in after JavaScript runs. |
31
31
  | **robots.txt** | Reuses `@agentmarkup/core` to detect whether the crawlers you likely want are shadowed by a wildcard `Disallow`, and whether a canonical Content-Signal policy is present. |
32
- | **llms.txt** | Fetches `/llms.txt`, validates it, and checks the homepage links it for discovery. |
33
- | **JSON-LD** | Extracts and structurally validates JSON-LD blocks on the page. |
32
+ | **llms.txt** | Fetches `/llms.txt` (guarding against HTML soft-404s), validates it, and checks the homepage links it for discovery. |
33
+ | **JSON-LD** | Extracts JSON-LD and flags only unparseable or type-less blocks; parseable structured data (including `@graph`) passes. |
34
+ | **Markdown mirror** | Detects a fetchable markdown mirror or a `text/markdown` alternate link — the clean, low-noise version agents prefer. |
35
+ | **Sitemap** | Checks for `/sitemap.xml`, a `Sitemap:` directive in robots.txt, or common non-standard sitemap paths. |
36
+ | **Page metadata** | Checks for a title, meta description, and canonical link. |
34
37
 
35
38
  ## An honest note on "blocked" crawlers
36
39
 
package/dist/bin.js CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
  import {
3
3
  run
4
- } from "./chunk-PNE6FBX2.js";
4
+ } from "./chunk-VYQOM2ID.js";
5
5
 
6
6
  // src/bin.ts
7
7
  import { createRequire } from "module";
@@ -49,122 +49,27 @@ var CRAWLER_AGENTS = [
49
49
  ];
50
50
  var ALL_AGENTS = [BROWSER_CONTROL, ...CRAWLER_AGENTS];
51
51
 
52
- // src/analyzers/crawler-access.ts
53
- var CHALLENGE_MARKERS = [
54
- "cf-browser-verification",
55
- "challenge-platform",
56
- "just a moment",
57
- "attention required",
58
- "enable javascript and cookies to continue"
59
- ];
60
- function looksLikeBotChallenge(result) {
61
- const mitigated = result.headers["cf-mitigated"];
62
- if (mitigated && mitigated.toLowerCase().includes("challenge")) return true;
63
- const body = (result.body ?? "").toLowerCase();
64
- return CHALLENGE_MARKERS.some((marker) => body.includes(marker));
65
- }
66
- function statusClass(status) {
67
- return status === null ? null : Math.floor(status / 100);
68
- }
69
- function analyzeCrawlerAccess(control, probes) {
70
- const findings = [];
71
- const controlClass = statusClass(control.status);
72
- if (control.error || controlClass !== 2) {
73
- findings.push({
74
- code: "crawler.control-failed",
75
- level: "warn",
76
- title: "Could not establish a browser baseline",
77
- detail: "The control request (normal browser user-agent) did not return a 2xx response, so bot-vs-browser differences cannot be judged reliably.",
78
- evidence: `browser control: status=${control.status ?? "none"}${control.error ? ` error=${control.error}` : ""}`,
79
- fix: "Confirm the URL is reachable and returns 200 in a browser, then re-run the audit."
80
- });
81
- return findings;
82
- }
83
- for (const { agent, result } of probes) {
84
- const botClass = statusClass(result.status);
85
- const evidence = `${agent.id} \u2192 status=${result.status ?? "none"}${result.error ? ` error=${result.error}` : ""}; browser \u2192 status=${control.status}`;
86
- if (result.error === "timeout" || result.error === "network-error") {
87
- findings.push({
88
- code: "crawler.probe-failed",
89
- level: "warn",
90
- title: `Could not probe as ${agent.vendor} ${agent.id}`,
91
- detail: `The request as ${agent.id} failed (${result.error}); no conclusion drawn for this crawler.`,
92
- evidence
93
- });
94
- continue;
95
- }
96
- if (botClass === 2) {
97
- findings.push({
98
- code: "crawler.accessible",
99
- level: "pass",
100
- title: `${agent.vendor} ${agent.id} can reach the page`,
101
- detail: `A request with the ${agent.id} user-agent returned the same success class as a browser.`,
102
- evidence
103
- });
104
- continue;
105
- }
106
- if (result.status === 429) {
107
- findings.push({
108
- code: "crawler.rate-limited",
109
- level: "warn",
110
- title: `${agent.vendor} ${agent.id} is rate-limited`,
111
- detail: `The ${agent.id} request was rate-limited (429). This is usually transient, but aggressive rate limits can starve crawlers of your content.`,
112
- evidence
113
- });
114
- continue;
115
- }
116
- if (result.status === 403 || result.status === 401) {
117
- const challenge = looksLikeBotChallenge(result);
118
- if (challenge) {
119
- findings.push({
120
- code: "crawler.bot-challenge",
121
- level: "warn",
122
- title: `${agent.vendor} ${agent.id} hit a bot challenge`,
123
- detail: `The ${agent.id} user-agent got a challenge/verification response (${result.status}). Because ${agent.id} is verified by ${agent.verification ?? "its published identity"}, the real crawler may pass where this spoofed user-agent does not. Confirm the verified bot is allowlisted at your CDN.`,
124
- evidence,
125
- fix: "Allowlist the crawler by its published IP ranges (verified bots) rather than relying on user-agent rules."
126
- });
127
- } else {
128
- findings.push({
129
- code: "crawler.ua-differential-block",
130
- level: "warn",
131
- title: `${agent.vendor} ${agent.id} is blocked from a generic IP`,
132
- detail: `A browser gets ${control.status} but the ${agent.id} user-agent gets ${result.status}, with no challenge signal. Two things cause this and they mean opposite things: a user-agent-string WAF rule (which also blocks the real ${agent.id}) or IP allowlisting (where the verified ${agent.id} is fine). Check which it is at your CDN.`,
133
- evidence,
134
- fix: `If a WAF rule blocks the "${agent.id}" user-agent, remove or narrow it. If you allowlist verified bots by IP, no action is needed.`
135
- });
136
- }
137
- continue;
138
- }
139
- if (botClass === 5) {
140
- findings.push({
141
- code: "crawler.origin-error",
142
- level: "warn",
143
- title: `${agent.vendor} ${agent.id} triggered a server error`,
144
- detail: `The ${agent.id} user-agent got a ${result.status} while the browser got ${control.status}. Something in the stack treats this crawler differently and errors.`,
145
- evidence
146
- });
147
- continue;
148
- }
149
- findings.push({
150
- code: "crawler.differential-unknown",
151
- level: "warn",
152
- title: `${agent.vendor} ${agent.id} is treated differently than a browser`,
153
- detail: `The ${agent.id} user-agent returned ${result.status} while a browser returned ${control.status}. The cause is unclear from the response; inspect the evidence.`,
154
- evidence
155
- });
156
- }
157
- return findings;
158
- }
159
-
160
52
  // src/analyzers/site-checks.ts
161
53
  import {
162
54
  extractJsonLdScriptContents,
163
55
  findBlockedCrawlers,
164
56
  hasLlmsTxtDiscoveryLink,
165
- validateJsonLdNode,
166
57
  validateLlmsTxt
167
58
  } from "@agentmarkup/core";
59
+ var HTML_BODY_RE = /^\s*(?:<!doctype\s+html|<html[\s>])/i;
60
+ function isRealTextResource(res) {
61
+ if (res.error || (res.status ?? 0) >= 400 || !res.body) {
62
+ return false;
63
+ }
64
+ const contentType = (res.headers["content-type"] ?? "").toLowerCase();
65
+ if (contentType.includes("text/html")) {
66
+ return false;
67
+ }
68
+ return !HTML_BODY_RE.test(res.body);
69
+ }
70
+ function isGraphContainer(value) {
71
+ return !!value && typeof value === "object" && Array.isArray(value["@graph"]);
72
+ }
168
73
  var EXPECTED_CRAWLERS = Object.fromEntries(
169
74
  CRAWLER_AGENTS.map((agent) => [agent.ua.split("/")[0], "allow"])
170
75
  );
@@ -214,7 +119,7 @@ function analyzeJsDependence(control) {
214
119
  }
215
120
  function analyzeRobots(robots) {
216
121
  const findings = [];
217
- const has = !robots.error && (robots.status ?? 0) < 400 && Boolean(robots.body);
122
+ const has = isRealTextResource(robots);
218
123
  if (!has) {
219
124
  findings.push({
220
125
  code: "robots.missing",
@@ -267,7 +172,7 @@ function analyzeRobots(robots) {
267
172
  function analyzeMachineReadable(control, llms) {
268
173
  const findings = [];
269
174
  const html = control.body ?? "";
270
- const llmsOk = !llms.error && (llms.status ?? 0) < 400 && Boolean(llms.body);
175
+ const llmsOk = isRealTextResource(llms);
271
176
  if (llmsOk) {
272
177
  const results = validateLlmsTxt(llms.body ?? "");
273
178
  const errors = results.filter((r) => r.severity === "error");
@@ -313,37 +218,274 @@ function analyzeMachineReadable(control, llms) {
313
218
  fix: "Add JSON-LD with agentmarkup schema presets (webSite, organization, article, \u2026)."
314
219
  });
315
220
  } else {
316
- const errors = [];
221
+ let parseError = false;
222
+ let anyTyped = false;
317
223
  for (const block of blocks) {
224
+ let parsed;
318
225
  try {
319
- const parsed = JSON.parse(block);
320
- const nodes = Array.isArray(parsed) ? parsed : [parsed];
226
+ parsed = JSON.parse(block);
227
+ } catch {
228
+ parseError = true;
229
+ continue;
230
+ }
231
+ const roots = Array.isArray(parsed) ? parsed : [parsed];
232
+ for (const root of roots) {
233
+ const nodes = isGraphContainer(root) ? root["@graph"] : [root];
321
234
  for (const node of nodes) {
322
- for (const r of validateJsonLdNode(node)) {
323
- if (r.severity === "error") errors.push(r.message);
235
+ if (node && typeof node === "object" && "@type" in node) {
236
+ anyTyped = true;
324
237
  }
325
238
  }
326
- } catch {
327
- errors.push("a JSON-LD script block is not valid JSON");
328
239
  }
329
240
  }
330
- findings.push(
331
- errors.length > 0 ? {
241
+ if (parseError) {
242
+ findings.push({
243
+ code: "jsonld.invalid",
244
+ level: "error",
245
+ title: "JSON-LD has errors",
246
+ detail: "a JSON-LD script block is not valid JSON"
247
+ });
248
+ } else if (!anyTyped) {
249
+ findings.push({
332
250
  code: "jsonld.invalid",
333
251
  level: "error",
334
252
  title: "JSON-LD has errors",
335
- detail: errors.join("; ")
336
- } : {
253
+ detail: "a JSON-LD block has no @type, so it is not usable structured data"
254
+ });
255
+ } else {
256
+ findings.push({
337
257
  code: "jsonld.present",
338
258
  level: "pass",
339
259
  title: "JSON-LD structured data present",
340
- detail: `${blocks.length} JSON-LD block(s) found and structurally valid.`
341
- }
342
- );
260
+ detail: `${blocks.length} JSON-LD block(s) found and parseable.`
261
+ });
262
+ }
343
263
  }
344
264
  }
345
265
  return findings;
346
266
  }
267
+ function hasMarkdownAlternate(html) {
268
+ const links = html.match(/<link\b[^>]*>/gi) ?? [];
269
+ return links.some(
270
+ (link) => /\brel=["']?[^"'>]*\balternate\b/i.test(link) && /\btype=["']?text\/markdown\b/i.test(link)
271
+ );
272
+ }
273
+ function analyzeMarkdown(control, mirror) {
274
+ const html = control.body ?? "";
275
+ const viaLink = html.length > 0 && hasMarkdownAlternate(html);
276
+ const mirrorType = (mirror.headers["content-type"] ?? "").toLowerCase();
277
+ const viaMirror = isRealTextResource(mirror) && (mirrorType.includes("markdown") || /^\s*#/.test(mirror.body ?? ""));
278
+ if (!viaLink && !viaMirror) {
279
+ return [];
280
+ }
281
+ return [
282
+ {
283
+ code: "markdown.present",
284
+ level: "pass",
285
+ title: "A markdown alternate is available for agents",
286
+ detail: viaMirror ? "A markdown mirror of the page is fetchable, giving agents a clean, low-noise version of the content." : "The page advertises a text/markdown alternate link for agents."
287
+ }
288
+ ];
289
+ }
290
+ function isXmlSitemap(sitemap) {
291
+ const body = sitemap.body ?? "";
292
+ const contentType = (sitemap.headers["content-type"] ?? "").toLowerCase();
293
+ const reachable = !sitemap.error && (sitemap.status ?? 0) < 400 && body.length > 0;
294
+ const looksXml = /<(?:urlset|sitemapindex)\b/i.test(body) || /^\s*<\?xml/i.test(body);
295
+ const isHtml = contentType.includes("text/html") || HTML_BODY_RE.test(body);
296
+ return reachable && looksXml && !isHtml;
297
+ }
298
+ function analyzeSitemap(sitemap, robots) {
299
+ const declaredInRobots = /^\s*sitemap\s*:/im.test(robots.body ?? "");
300
+ if (isXmlSitemap(sitemap) || declaredInRobots) {
301
+ return [
302
+ {
303
+ code: "sitemap.present",
304
+ level: "pass",
305
+ title: "Sitemap found",
306
+ detail: declaredInRobots ? "A sitemap is declared in robots.txt, which helps crawlers and AI systems discover all of your pages." : "A sitemap.xml is reachable, which helps crawlers and AI systems discover all of your pages."
307
+ }
308
+ ];
309
+ }
310
+ return [
311
+ {
312
+ code: "sitemap.missing",
313
+ level: "warn",
314
+ title: "No sitemap.xml found",
315
+ detail: "No reachable sitemap.xml. A sitemap helps crawlers and AI systems discover pages they would not reach by following links.",
316
+ fix: "Generate a sitemap.xml and reference it from robots.txt."
317
+ }
318
+ ];
319
+ }
320
+ function analyzeMetadata(control) {
321
+ if (control.error || (control.status ?? 0) >= 400 || !control.body) {
322
+ return [];
323
+ }
324
+ const html = control.body;
325
+ const missing = [];
326
+ const titleMatch = /<title\b[^>]*>([\s\S]*?)<\/title>/i.exec(html);
327
+ if (!titleMatch || titleMatch[1].trim().length === 0) {
328
+ missing.push("title");
329
+ }
330
+ const metas = html.match(/<meta\b[^>]*>/gi) ?? [];
331
+ const hasDescription = metas.some(
332
+ (tag) => /\bname=["']?description["']?/i.test(tag) && /\bcontent=["'][^"']*\S[^"']*["']/i.test(tag)
333
+ );
334
+ if (!hasDescription) missing.push("description");
335
+ const links = html.match(/<link\b[^>]*>/gi) ?? [];
336
+ const hasCanonical = links.some(
337
+ (link) => /\brel=["']?canonical\b/i.test(link)
338
+ );
339
+ if (!hasCanonical) missing.push("canonical");
340
+ if (missing.length === 0) {
341
+ return [
342
+ {
343
+ code: "meta.complete",
344
+ level: "pass",
345
+ title: "Core page metadata present",
346
+ detail: "The page has a title, a meta description, and a canonical link, which help AI systems and search attribute the page."
347
+ }
348
+ ];
349
+ }
350
+ return [
351
+ {
352
+ code: "meta.incomplete",
353
+ level: "warn",
354
+ title: "Core page metadata is incomplete",
355
+ detail: `Missing: ${missing.join(
356
+ ", "
357
+ )}. Title, meta description, and canonical link help AI systems and search understand and correctly attribute the page.`,
358
+ evidence: `missing: ${missing.join(", ")}`,
359
+ fix: "Add the missing head tags; agentmarkup keeps these consistent on generated pages."
360
+ }
361
+ ];
362
+ }
363
+
364
+ // src/analyzers/crawler-access.ts
365
+ function isThinnerThanBrowser(controlTextLength, crawlerBody) {
366
+ if (controlTextLength < 500) return false;
367
+ const crawlerTextLength = stripTags(crawlerBody ?? "").length;
368
+ return crawlerTextLength < controlTextLength * 0.4 && controlTextLength - crawlerTextLength >= 500;
369
+ }
370
+ var CHALLENGE_MARKERS = [
371
+ "cf-browser-verification",
372
+ "challenge-platform",
373
+ "just a moment",
374
+ "attention required",
375
+ "enable javascript and cookies to continue"
376
+ ];
377
+ function looksLikeBotChallenge(result) {
378
+ const mitigated = result.headers["cf-mitigated"];
379
+ if (mitigated && mitigated.toLowerCase().includes("challenge")) return true;
380
+ const body = (result.body ?? "").toLowerCase();
381
+ return CHALLENGE_MARKERS.some((marker) => body.includes(marker));
382
+ }
383
+ function statusClass(status) {
384
+ return status === null ? null : Math.floor(status / 100);
385
+ }
386
+ function analyzeCrawlerAccess(control, probes) {
387
+ const findings = [];
388
+ const controlClass = statusClass(control.status);
389
+ const controlTextLength = stripTags(control.body ?? "").length;
390
+ if (control.error || controlClass !== 2) {
391
+ findings.push({
392
+ code: "crawler.control-failed",
393
+ level: "warn",
394
+ title: "Could not establish a browser baseline",
395
+ detail: "The control request (normal browser user-agent) did not return a 2xx response, so bot-vs-browser differences cannot be judged reliably.",
396
+ evidence: `browser control: status=${control.status ?? "none"}${control.error ? ` error=${control.error}` : ""}`,
397
+ fix: "Confirm the URL is reachable and returns 200 in a browser, then re-run the audit."
398
+ });
399
+ return findings;
400
+ }
401
+ for (const { agent, result } of probes) {
402
+ const botClass = statusClass(result.status);
403
+ const evidence = `${agent.id} \u2192 status=${result.status ?? "none"}${result.error ? ` error=${result.error}` : ""}; browser \u2192 status=${control.status}`;
404
+ if (result.error === "timeout" || result.error === "network-error") {
405
+ findings.push({
406
+ code: "crawler.probe-failed",
407
+ level: "warn",
408
+ title: `Could not probe as ${agent.vendor} ${agent.id}`,
409
+ detail: `The request as ${agent.id} failed (${result.error}); no conclusion drawn for this crawler.`,
410
+ evidence
411
+ });
412
+ continue;
413
+ }
414
+ if (botClass === 2) {
415
+ if (isThinnerThanBrowser(controlTextLength, result.body)) {
416
+ const crawlerTextLength = stripTags(result.body ?? "").length;
417
+ findings.push({
418
+ code: "crawler.content-differential",
419
+ level: "warn",
420
+ title: `${agent.vendor} ${agent.id} gets much less content than a browser`,
421
+ detail: `The ${agent.id} user-agent reached the page (${result.status}) but its HTML has far less text than the browser's (${crawlerTextLength} vs ${controlTextLength} characters). Content may be gated behind JavaScript or served only to browsers, so the crawler indexes a thinner page.`,
422
+ evidence: `${agent.id} text=${crawlerTextLength} chars; browser text=${controlTextLength} chars`,
423
+ fix: "Server-render or prerender the shared content, or provide a markdown mirror, so crawlers get the same text as browsers."
424
+ });
425
+ continue;
426
+ }
427
+ findings.push({
428
+ code: "crawler.accessible",
429
+ level: "pass",
430
+ title: `${agent.vendor} ${agent.id} can reach the page`,
431
+ detail: `A request with the ${agent.id} user-agent returned the same success class as a browser.`,
432
+ evidence
433
+ });
434
+ continue;
435
+ }
436
+ if (result.status === 429) {
437
+ findings.push({
438
+ code: "crawler.rate-limited",
439
+ level: "warn",
440
+ title: `${agent.vendor} ${agent.id} is rate-limited`,
441
+ detail: `The ${agent.id} request was rate-limited (429). This is usually transient, but aggressive rate limits can starve crawlers of your content.`,
442
+ evidence
443
+ });
444
+ continue;
445
+ }
446
+ if (result.status === 403 || result.status === 401) {
447
+ const challenge = looksLikeBotChallenge(result);
448
+ if (challenge) {
449
+ findings.push({
450
+ code: "crawler.bot-challenge",
451
+ level: "warn",
452
+ title: `${agent.vendor} ${agent.id} hit a bot challenge`,
453
+ detail: `The ${agent.id} user-agent got a challenge/verification response (${result.status}). Because ${agent.id} is verified by ${agent.verification ?? "its published identity"}, the real crawler may pass where this spoofed user-agent does not. Confirm the verified bot is allowlisted at your CDN.`,
454
+ evidence,
455
+ fix: "Allowlist the crawler by its published IP ranges (verified bots) rather than relying on user-agent rules."
456
+ });
457
+ } else {
458
+ findings.push({
459
+ code: "crawler.ua-differential-block",
460
+ level: "warn",
461
+ title: `${agent.vendor} ${agent.id} is blocked from a generic IP`,
462
+ detail: `A browser gets ${control.status} but the ${agent.id} user-agent gets ${result.status}, with no challenge signal. Two things cause this and they mean opposite things: a user-agent-string WAF rule (which also blocks the real ${agent.id}) or IP allowlisting (where the verified ${agent.id} is fine). Check which it is at your CDN.`,
463
+ evidence,
464
+ fix: `If a WAF rule blocks the "${agent.id}" user-agent, remove or narrow it. If you allowlist verified bots by IP, no action is needed.`
465
+ });
466
+ }
467
+ continue;
468
+ }
469
+ if (botClass === 5) {
470
+ findings.push({
471
+ code: "crawler.origin-error",
472
+ level: "warn",
473
+ title: `${agent.vendor} ${agent.id} triggered a server error`,
474
+ detail: `The ${agent.id} user-agent got a ${result.status} while the browser got ${control.status}. Something in the stack treats this crawler differently and errors.`,
475
+ evidence
476
+ });
477
+ continue;
478
+ }
479
+ findings.push({
480
+ code: "crawler.differential-unknown",
481
+ level: "warn",
482
+ title: `${agent.vendor} ${agent.id} is treated differently than a browser`,
483
+ detail: `The ${agent.id} user-agent returned ${result.status} while a browser returned ${control.status}. The cause is unclear from the response; inspect the evidence.`,
484
+ evidence
485
+ });
486
+ }
487
+ return findings;
488
+ }
347
489
 
348
490
  // src/findings.ts
349
491
  function worstLevel(findings) {
@@ -584,6 +726,12 @@ async function safeFetch(targetUrl, options) {
584
726
  }
585
727
 
586
728
  // src/audit.ts
729
+ var SITEMAP_FALLBACK_PATHS = [
730
+ "/sitemap_index.xml",
731
+ "/sitemap-index.xml",
732
+ "/wp-sitemap.xml",
733
+ "/sitemap/sitemap.xml"
734
+ ];
587
735
  function originOf(url) {
588
736
  try {
589
737
  return new URL(url).origin;
@@ -591,6 +739,30 @@ function originOf(url) {
591
739
  return url.replace(/\/+$/, "");
592
740
  }
593
741
  }
742
+ function markdownMirrorUrl(pageUrl) {
743
+ try {
744
+ const url = new URL(pageUrl);
745
+ const path = url.pathname.replace(/\/+$/, "");
746
+ if (path === "") return `${url.origin}/index.md`;
747
+ if (/\.[a-z0-9]+$/i.test(path)) return null;
748
+ return `${url.origin}${path}.md`;
749
+ } catch {
750
+ return null;
751
+ }
752
+ }
753
+ function notFetched(url) {
754
+ return {
755
+ requestedUrl: url,
756
+ finalUrl: url,
757
+ status: null,
758
+ ok: false,
759
+ headers: {},
760
+ body: null,
761
+ bodyBytes: 0,
762
+ redirects: 0,
763
+ blocked: false
764
+ };
765
+ }
594
766
  async function audit(targetUrl, options) {
595
767
  const doFetch = options.fetchImpl ?? safeFetch;
596
768
  const timeoutMs = options.timeoutMs;
@@ -605,8 +777,7 @@ async function audit(targetUrl, options) {
605
777
  const result = await doFetch(targetUrl, {
606
778
  userAgent: agent.ua,
607
779
  timeoutMs,
608
- readBody: true,
609
- maxBytes: 64 * 1024
780
+ readBody: true
610
781
  });
611
782
  probes.push({ agent, result });
612
783
  }
@@ -622,11 +793,37 @@ async function audit(targetUrl, options) {
622
793
  readBody: true,
623
794
  maxBytes: 1024 * 1024
624
795
  });
796
+ const fetchSitemap = (path) => doFetch(`${origin}${path}`, {
797
+ userAgent: BROWSER_CONTROL.ua,
798
+ timeoutMs,
799
+ readBody: true,
800
+ maxBytes: 1024 * 1024
801
+ });
802
+ let sitemap = await fetchSitemap("/sitemap.xml");
803
+ if (!isXmlSitemap(sitemap) && !/^\s*sitemap\s*:/im.test(robots.body ?? "")) {
804
+ for (const path of SITEMAP_FALLBACK_PATHS) {
805
+ const candidate = await fetchSitemap(path);
806
+ if (isXmlSitemap(candidate)) {
807
+ sitemap = candidate;
808
+ break;
809
+ }
810
+ }
811
+ }
812
+ const mirrorUrl = markdownMirrorUrl(control.finalUrl || targetUrl);
813
+ const mirror = mirrorUrl ? await doFetch(mirrorUrl, {
814
+ userAgent: BROWSER_CONTROL.ua,
815
+ timeoutMs,
816
+ readBody: true,
817
+ maxBytes: 1024 * 1024
818
+ }) : notFetched(`${origin}/index.md`);
625
819
  const findings = [
626
820
  ...analyzeCrawlerAccess(control, probes),
627
821
  ...analyzeJsDependence(control),
628
822
  ...analyzeRobots(robots),
629
- ...analyzeMachineReadable(control, llms)
823
+ ...analyzeMachineReadable(control, llms),
824
+ ...analyzeMarkdown(control, mirror),
825
+ ...analyzeSitemap(sitemap, robots),
826
+ ...analyzeMetadata(control)
630
827
  ];
631
828
  const counts = countByLevel(findings);
632
829
  const passed = counts.pass;
@@ -748,10 +945,13 @@ export {
748
945
  BROWSER_CONTROL,
749
946
  CRAWLER_AGENTS,
750
947
  ALL_AGENTS,
751
- analyzeCrawlerAccess,
752
948
  analyzeJsDependence,
753
949
  analyzeRobots,
754
950
  analyzeMachineReadable,
951
+ analyzeMarkdown,
952
+ analyzeSitemap,
953
+ analyzeMetadata,
954
+ analyzeCrawlerAccess,
755
955
  worstLevel,
756
956
  countByLevel,
757
957
  parseIpv4,
package/dist/index.d.ts CHANGED
@@ -135,5 +135,15 @@ declare function analyzeJsDependence(control: FetchResult): AuditFinding[];
135
135
  declare function analyzeRobots(robots: FetchResult): AuditFinding[];
136
136
  /** Machine-readability surface on the homepage HTML plus a fetched llms.txt. */
137
137
  declare function analyzeMachineReadable(control: FetchResult, llms: FetchResult): AuditFinding[];
138
+ /**
139
+ * Markdown mirrors / alternates are optional but valuable: they give agents a
140
+ * clean, low-noise version of the page (agentmarkup can generate them, and some
141
+ * CDNs serve runtime markdown). Present is a pass; absent emits no finding
142
+ * because a content-rich HTML page does not need one.
143
+ */
144
+ declare function analyzeMarkdown(control: FetchResult, mirror: FetchResult): AuditFinding[];
145
+ declare function analyzeSitemap(sitemap: FetchResult, robots: FetchResult): AuditFinding[];
146
+ /** Core head metadata (title / description / canonical) crawlers use to attribute a page. */
147
+ declare function analyzeMetadata(control: FetchResult): AuditFinding[];
138
148
 
139
- export { ALL_AGENTS, type AgentProbe, type AuditFinding, type AuditLevel, type AuditOptions, type AuditReport, BROWSER_CONTROL, CRAWLER_AGENTS, type CrawlerAgent, type FetchOptions, type FetchResult, type RunContext, analyzeCrawlerAccess, analyzeJsDependence, analyzeMachineReadable, analyzeRobots, audit, countByLevel, isBlockedHostname, parseIpv4, parseIpv6, renderJson, renderText, run, safeFetch, worstLevel };
149
+ export { ALL_AGENTS, type AgentProbe, type AuditFinding, type AuditLevel, type AuditOptions, type AuditReport, BROWSER_CONTROL, CRAWLER_AGENTS, type CrawlerAgent, type FetchOptions, type FetchResult, type RunContext, analyzeCrawlerAccess, analyzeJsDependence, analyzeMachineReadable, analyzeMarkdown, analyzeMetadata, analyzeRobots, analyzeSitemap, audit, countByLevel, isBlockedHostname, parseIpv4, parseIpv6, renderJson, renderText, run, safeFetch, worstLevel };
package/dist/index.js CHANGED
@@ -5,7 +5,10 @@ import {
5
5
  analyzeCrawlerAccess,
6
6
  analyzeJsDependence,
7
7
  analyzeMachineReadable,
8
+ analyzeMarkdown,
9
+ analyzeMetadata,
8
10
  analyzeRobots,
11
+ analyzeSitemap,
9
12
  audit,
10
13
  countByLevel,
11
14
  isBlockedHostname,
@@ -16,7 +19,7 @@ import {
16
19
  run,
17
20
  safeFetch,
18
21
  worstLevel
19
- } from "./chunk-PNE6FBX2.js";
22
+ } from "./chunk-VYQOM2ID.js";
20
23
  export {
21
24
  ALL_AGENTS,
22
25
  BROWSER_CONTROL,
@@ -24,7 +27,10 @@ export {
24
27
  analyzeCrawlerAccess,
25
28
  analyzeJsDependence,
26
29
  analyzeMachineReadable,
30
+ analyzeMarkdown,
31
+ analyzeMetadata,
27
32
  analyzeRobots,
33
+ analyzeSitemap,
28
34
  audit,
29
35
  countByLevel,
30
36
  isBlockedHostname,
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@agentmarkup/audit",
3
- "version": "0.1.0",
4
- "description": "Audit a live URL the way AI crawlers see it: fetch as GPTBot, ClaudeBot, PerplexityBot and more, diff against a browser to catch accidental CDN blocks, plus llms.txt, JSON-LD, robots.txt intent, Content-Signal, and JS-dependence checks",
3
+ "version": "0.2.0",
4
+ "description": "Audit a live URL the way AI crawlers see it: fetch as GPTBot, ClaudeBot, PerplexityBot and more, diff against a browser to catch accidental CDN blocks and JS-gated content, plus llms.txt, JSON-LD, robots.txt intent, Content-Signal, markdown mirror, sitemap, and page-metadata checks",
5
5
  "type": "module",
6
6
  "license": "MIT",
7
7
  "author": "Sebastian Cochinescu <hello@animafelix.com> (https://animafelix.com)",
@@ -23,6 +23,8 @@
23
23
  "robots-txt",
24
24
  "content-signal",
25
25
  "json-ld",
26
+ "sitemap",
27
+ "markdown",
26
28
  "geo",
27
29
  "aeo",
28
30
  "seo",