npm - @agentmarkup/audit - Versions diffs - 0.1.0 → 0.2.1 - Mend

@agentmarkup/audit 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/README.md +6 -3
package/dist/bin.js +1 -1
package/dist/{chunk-PNE6FBX2.js → chunk-VYQOM2ID.js} +329 -129
package/dist/index.d.ts +11 -1
package/dist/index.js +7 -1
package/package.json +5 -3

package/README.md CHANGED Viewed

@@ -26,11 +26,14 @@ Bare domains are normalized to `https://`. Exit code is `1` when any **error**-l
 | Area | What it does |
 | --- | --- |
-| **Crawler access** | Fetches as each AI crawler user-agent and diffs status against a browser control. Flags challenges, differential blocks, rate limits, and origin errors. |
+| **Crawler access** | Fetches as each AI crawler user-agent and diffs against a browser control. Flags challenges, differential blocks, rate limits, origin errors, and when an *accessible* crawler gets materially less content than a browser (JS-gated or cloaked pages). |
 | **JS dependence** | Measures whether the raw (un-executed) HTML actually contains content, or is an empty `#root`/`#app` shell that only fills in after JavaScript runs. |
 | **robots.txt** | Reuses `@agentmarkup/core` to detect whether the crawlers you likely want are shadowed by a wildcard `Disallow`, and whether a canonical Content-Signal policy is present. |
-| **llms.txt** | Fetches `/llms.txt`, validates it, and checks the homepage links it for discovery. |
-| **JSON-LD** | Extracts and structurally validates JSON-LD blocks on the page. |
+| **llms.txt** | Fetches `/llms.txt` (guarding against HTML soft-404s), validates it, and checks the homepage links it for discovery. |
+| **JSON-LD** | Extracts JSON-LD and flags only unparseable or type-less blocks; parseable structured data (including `@graph`) passes. |
+| **Markdown mirror** | Detects a fetchable markdown mirror or a `text/markdown` alternate link — the clean, low-noise version agents prefer. |
+| **Sitemap** | Checks for `/sitemap.xml`, a `Sitemap:` directive in robots.txt, or common non-standard sitemap paths. |
+| **Page metadata** | Checks for a title, meta description, and canonical link. |
 ## An honest note on "blocked" crawlers

package/dist/bin.js CHANGED Viewed

@@ -1,7 +1,7 @@
 #!/usr/bin/env node
 import {
   run
-} from "./chunk-PNE6FBX2.js";
+} from "./chunk-VYQOM2ID.js";
 // src/bin.ts
 import { createRequire } from "module";

package/dist/{chunk-PNE6FBX2.js → chunk-VYQOM2ID.js} RENAMED Viewed

@@ -49,122 +49,27 @@ var CRAWLER_AGENTS = [
 ];
 var ALL_AGENTS = [BROWSER_CONTROL, ...CRAWLER_AGENTS];
-// src/analyzers/crawler-access.ts
-var CHALLENGE_MARKERS = [
-  "cf-browser-verification",
-  "challenge-platform",
-  "just a moment",
-  "attention required",
-  "enable javascript and cookies to continue"
-];
-function looksLikeBotChallenge(result) {
-  const mitigated = result.headers["cf-mitigated"];
-  if (mitigated && mitigated.toLowerCase().includes("challenge")) return true;
-  const body = (result.body ?? "").toLowerCase();
-  return CHALLENGE_MARKERS.some((marker) => body.includes(marker));
-}
-function statusClass(status) {
-  return status === null ? null : Math.floor(status / 100);
-}
-function analyzeCrawlerAccess(control, probes) {
-  const findings = [];
-  const controlClass = statusClass(control.status);
-  if (control.error || controlClass !== 2) {
-    findings.push({
-      code: "crawler.control-failed",
-      level: "warn",
-      title: "Could not establish a browser baseline",
-      detail: "The control request (normal browser user-agent) did not return a 2xx response, so bot-vs-browser differences cannot be judged reliably.",
-      evidence: `browser control: status=${control.status ?? "none"}${control.error ? ` error=${control.error}` : ""}`,
-      fix: "Confirm the URL is reachable and returns 200 in a browser, then re-run the audit."
-    });
-    return findings;
-  }
-  for (const { agent, result } of probes) {
-    const botClass = statusClass(result.status);
-    const evidence = `${agent.id} \u2192 status=${result.status ?? "none"}${result.error ? ` error=${result.error}` : ""}; browser \u2192 status=${control.status}`;
-    if (result.error === "timeout" || result.error === "network-error") {
-      findings.push({
-        code: "crawler.probe-failed",
-        level: "warn",
-        title: `Could not probe as ${agent.vendor} ${agent.id}`,
-        detail: `The request as ${agent.id} failed (${result.error}); no conclusion drawn for this crawler.`,
-        evidence
-      });
-      continue;
-    }
-    if (botClass === 2) {
-      findings.push({
-        code: "crawler.accessible",
-        level: "pass",
-        title: `${agent.vendor} ${agent.id} can reach the page`,
-        detail: `A request with the ${agent.id} user-agent returned the same success class as a browser.`,
-        evidence
-      });
-      continue;
-    }
-    if (result.status === 429) {
-      findings.push({
-        code: "crawler.rate-limited",
-        level: "warn",
-        title: `${agent.vendor} ${agent.id} is rate-limited`,
-        detail: `The ${agent.id} request was rate-limited (429). This is usually transient, but aggressive rate limits can starve crawlers of your content.`,
-        evidence
-      });
-      continue;
-    }
-    if (result.status === 403 || result.status === 401) {
-      const challenge = looksLikeBotChallenge(result);
-      if (challenge) {
-        findings.push({
-          code: "crawler.bot-challenge",
-          level: "warn",
-          title: `${agent.vendor} ${agent.id} hit a bot challenge`,
-          detail: `The ${agent.id} user-agent got a challenge/verification response (${result.status}). Because ${agent.id} is verified by ${agent.verification ?? "its published identity"}, the real crawler may pass where this spoofed user-agent does not. Confirm the verified bot is allowlisted at your CDN.`,
-          evidence,
-          fix: "Allowlist the crawler by its published IP ranges (verified bots) rather than relying on user-agent rules."
-        });
-      } else {
-        findings.push({
-          code: "crawler.ua-differential-block",
-          level: "warn",
-          title: `${agent.vendor} ${agent.id} is blocked from a generic IP`,
-          detail: `A browser gets ${control.status} but the ${agent.id} user-agent gets ${result.status}, with no challenge signal. Two things cause this and they mean opposite things: a user-agent-string WAF rule (which also blocks the real ${agent.id}) or IP allowlisting (where the verified ${agent.id} is fine). Check which it is at your CDN.`,
-          evidence,
-          fix: `If a WAF rule blocks the "${agent.id}" user-agent, remove or narrow it. If you allowlist verified bots by IP, no action is needed.`
-        });
-      }
-      continue;
-    }
-    if (botClass === 5) {
-      findings.push({
-        code: "crawler.origin-error",
-        level: "warn",
-        title: `${agent.vendor} ${agent.id} triggered a server error`,
-        detail: `The ${agent.id} user-agent got a ${result.status} while the browser got ${control.status}. Something in the stack treats this crawler differently and errors.`,
-        evidence
-      });
-      continue;
-    }
-    findings.push({
-      code: "crawler.differential-unknown",
-      level: "warn",
-      title: `${agent.vendor} ${agent.id} is treated differently than a browser`,
-      detail: `The ${agent.id} user-agent returned ${result.status} while a browser returned ${control.status}. The cause is unclear from the response; inspect the evidence.`,
-      evidence
-    });
-  }
-  return findings;
-}
 // src/analyzers/site-checks.ts
 import {
   extractJsonLdScriptContents,
   findBlockedCrawlers,
   hasLlmsTxtDiscoveryLink,
-  validateJsonLdNode,
   validateLlmsTxt
 } from "@agentmarkup/core";
+var HTML_BODY_RE = /^\s*(?:<!doctype\s+html|<html[\s>])/i;
+function isRealTextResource(res) {
+  if (res.error || (res.status ?? 0) >= 400 || !res.body) {
+    return false;
+  }
+  const contentType = (res.headers["content-type"] ?? "").toLowerCase();
+  if (contentType.includes("text/html")) {
+    return false;
+  }
+  return !HTML_BODY_RE.test(res.body);
+}
+function isGraphContainer(value) {
+  return !!value && typeof value === "object" && Array.isArray(value["@graph"]);
+}
 var EXPECTED_CRAWLERS = Object.fromEntries(
   CRAWLER_AGENTS.map((agent) => [agent.ua.split("/")[0], "allow"])
 );
@@ -214,7 +119,7 @@ function analyzeJsDependence(control) {
 }
 function analyzeRobots(robots) {
   const findings = [];
-  const has = !robots.error && (robots.status ?? 0) < 400 && Boolean(robots.body);
+  const has = isRealTextResource(robots);
   if (!has) {
     findings.push({
       code: "robots.missing",
@@ -267,7 +172,7 @@ function analyzeRobots(robots) {
 function analyzeMachineReadable(control, llms) {
   const findings = [];
   const html = control.body ?? "";
-  const llmsOk = !llms.error && (llms.status ?? 0) < 400 && Boolean(llms.body);
+  const llmsOk = isRealTextResource(llms);
   if (llmsOk) {
     const results = validateLlmsTxt(llms.body ?? "");
     const errors = results.filter((r) => r.severity === "error");
@@ -313,37 +218,274 @@ function analyzeMachineReadable(control, llms) {
         fix: "Add JSON-LD with agentmarkup schema presets (webSite, organization, article, \u2026)."
       });
     } else {
-      const errors = [];
+      let parseError = false;
+      let anyTyped = false;
       for (const block of blocks) {
+        let parsed;
         try {
-          const parsed = JSON.parse(block);
-          const nodes = Array.isArray(parsed) ? parsed : [parsed];
+          parsed = JSON.parse(block);
+        } catch {
+          parseError = true;
+          continue;
+        }
+        const roots = Array.isArray(parsed) ? parsed : [parsed];
+        for (const root of roots) {
+          const nodes = isGraphContainer(root) ? root["@graph"] : [root];
           for (const node of nodes) {
-            for (const r of validateJsonLdNode(node)) {
-              if (r.severity === "error") errors.push(r.message);
+            if (node && typeof node === "object" && "@type" in node) {
+              anyTyped = true;
             }
           }
-        } catch {
-          errors.push("a JSON-LD script block is not valid JSON");
         }
       }
-      findings.push(
-        errors.length > 0 ? {
+      if (parseError) {
+        findings.push({
+          code: "jsonld.invalid",
+          level: "error",
+          title: "JSON-LD has errors",
+          detail: "a JSON-LD script block is not valid JSON"
+        });
+      } else if (!anyTyped) {
+        findings.push({
           code: "jsonld.invalid",
           level: "error",
           title: "JSON-LD has errors",
-          detail: errors.join("; ")
-        } : {
+          detail: "a JSON-LD block has no @type, so it is not usable structured data"
+        });
+      } else {
+        findings.push({
           code: "jsonld.present",
           level: "pass",
           title: "JSON-LD structured data present",
-          detail: `${blocks.length} JSON-LD block(s) found and structurally valid.`
-        }
-      );
+          detail: `${blocks.length} JSON-LD block(s) found and parseable.`
+        });
+      }
     }
   }
   return findings;
 }
+function hasMarkdownAlternate(html) {
+  const links = html.match(/<link\b[^>]*>/gi) ?? [];
+  return links.some(
+    (link) => /\brel=["']?[^"'>]*\balternate\b/i.test(link) && /\btype=["']?text\/markdown\b/i.test(link)
+  );
+}
+function analyzeMarkdown(control, mirror) {
+  const html = control.body ?? "";
+  const viaLink = html.length > 0 && hasMarkdownAlternate(html);
+  const mirrorType = (mirror.headers["content-type"] ?? "").toLowerCase();
+  const viaMirror = isRealTextResource(mirror) && (mirrorType.includes("markdown") || /^\s*#/.test(mirror.body ?? ""));
+  if (!viaLink && !viaMirror) {
+    return [];
+  }
+  return [
+    {
+      code: "markdown.present",
+      level: "pass",
+      title: "A markdown alternate is available for agents",
+      detail: viaMirror ? "A markdown mirror of the page is fetchable, giving agents a clean, low-noise version of the content." : "The page advertises a text/markdown alternate link for agents."
+    }
+  ];
+}
+function isXmlSitemap(sitemap) {
+  const body = sitemap.body ?? "";
+  const contentType = (sitemap.headers["content-type"] ?? "").toLowerCase();
+  const reachable = !sitemap.error && (sitemap.status ?? 0) < 400 && body.length > 0;
+  const looksXml = /<(?:urlset|sitemapindex)\b/i.test(body) || /^\s*<\?xml/i.test(body);
+  const isHtml = contentType.includes("text/html") || HTML_BODY_RE.test(body);
+  return reachable && looksXml && !isHtml;
+}
+function analyzeSitemap(sitemap, robots) {
+  const declaredInRobots = /^\s*sitemap\s*:/im.test(robots.body ?? "");
+  if (isXmlSitemap(sitemap) || declaredInRobots) {
+    return [
+      {
+        code: "sitemap.present",
+        level: "pass",
+        title: "Sitemap found",
+        detail: declaredInRobots ? "A sitemap is declared in robots.txt, which helps crawlers and AI systems discover all of your pages." : "A sitemap.xml is reachable, which helps crawlers and AI systems discover all of your pages."
+      }
+    ];
+  }
+  return [
+    {
+      code: "sitemap.missing",
+      level: "warn",
+      title: "No sitemap.xml found",
+      detail: "No reachable sitemap.xml. A sitemap helps crawlers and AI systems discover pages they would not reach by following links.",
+      fix: "Generate a sitemap.xml and reference it from robots.txt."
+    }
+  ];
+}
+function analyzeMetadata(control) {
+  if (control.error || (control.status ?? 0) >= 400 || !control.body) {
+    return [];
+  }
+  const html = control.body;
+  const missing = [];
+  const titleMatch = /<title\b[^>]*>([\s\S]*?)<\/title>/i.exec(html);
+  if (!titleMatch || titleMatch[1].trim().length === 0) {
+    missing.push("title");
+  }
+  const metas = html.match(/<meta\b[^>]*>/gi) ?? [];
+  const hasDescription = metas.some(
+    (tag) => /\bname=["']?description["']?/i.test(tag) && /\bcontent=["'][^"']*\S[^"']*["']/i.test(tag)
+  );
+  if (!hasDescription) missing.push("description");
+  const links = html.match(/<link\b[^>]*>/gi) ?? [];
+  const hasCanonical = links.some(
+    (link) => /\brel=["']?canonical\b/i.test(link)
+  );
+  if (!hasCanonical) missing.push("canonical");
+  if (missing.length === 0) {
+    return [
+      {
+        code: "meta.complete",
+        level: "pass",
+        title: "Core page metadata present",
+        detail: "The page has a title, a meta description, and a canonical link, which help AI systems and search attribute the page."
+      }
+    ];
+  }
+  return [
+    {
+      code: "meta.incomplete",
+      level: "warn",
+      title: "Core page metadata is incomplete",
+      detail: `Missing: ${missing.join(
+        ", "
+      )}. Title, meta description, and canonical link help AI systems and search understand and correctly attribute the page.`,
+      evidence: `missing: ${missing.join(", ")}`,
+      fix: "Add the missing head tags; agentmarkup keeps these consistent on generated pages."
+    }
+  ];
+}
+// src/analyzers/crawler-access.ts
+function isThinnerThanBrowser(controlTextLength, crawlerBody) {
+  if (controlTextLength < 500) return false;
+  const crawlerTextLength = stripTags(crawlerBody ?? "").length;
+  return crawlerTextLength < controlTextLength * 0.4 && controlTextLength - crawlerTextLength >= 500;
+}
+var CHALLENGE_MARKERS = [
+  "cf-browser-verification",
+  "challenge-platform",
+  "just a moment",
+  "attention required",
+  "enable javascript and cookies to continue"
+];
+function looksLikeBotChallenge(result) {
+  const mitigated = result.headers["cf-mitigated"];
+  if (mitigated && mitigated.toLowerCase().includes("challenge")) return true;
+  const body = (result.body ?? "").toLowerCase();
+  return CHALLENGE_MARKERS.some((marker) => body.includes(marker));
+}
+function statusClass(status) {
+  return status === null ? null : Math.floor(status / 100);
+}
+function analyzeCrawlerAccess(control, probes) {
+  const findings = [];
+  const controlClass = statusClass(control.status);
+  const controlTextLength = stripTags(control.body ?? "").length;
+  if (control.error || controlClass !== 2) {
+    findings.push({
+      code: "crawler.control-failed",
+      level: "warn",
+      title: "Could not establish a browser baseline",
+      detail: "The control request (normal browser user-agent) did not return a 2xx response, so bot-vs-browser differences cannot be judged reliably.",
+      evidence: `browser control: status=${control.status ?? "none"}${control.error ? ` error=${control.error}` : ""}`,
+      fix: "Confirm the URL is reachable and returns 200 in a browser, then re-run the audit."
+    });
+    return findings;
+  }
+  for (const { agent, result } of probes) {
+    const botClass = statusClass(result.status);
+    const evidence = `${agent.id} \u2192 status=${result.status ?? "none"}${result.error ? ` error=${result.error}` : ""}; browser \u2192 status=${control.status}`;
+    if (result.error === "timeout" || result.error === "network-error") {
+      findings.push({
+        code: "crawler.probe-failed",
+        level: "warn",
+        title: `Could not probe as ${agent.vendor} ${agent.id}`,
+        detail: `The request as ${agent.id} failed (${result.error}); no conclusion drawn for this crawler.`,
+        evidence
+      });
+      continue;
+    }
+    if (botClass === 2) {
+      if (isThinnerThanBrowser(controlTextLength, result.body)) {
+        const crawlerTextLength = stripTags(result.body ?? "").length;
+        findings.push({
+          code: "crawler.content-differential",
+          level: "warn",
+          title: `${agent.vendor} ${agent.id} gets much less content than a browser`,
+          detail: `The ${agent.id} user-agent reached the page (${result.status}) but its HTML has far less text than the browser's (${crawlerTextLength} vs ${controlTextLength} characters). Content may be gated behind JavaScript or served only to browsers, so the crawler indexes a thinner page.`,
+          evidence: `${agent.id} text=${crawlerTextLength} chars; browser text=${controlTextLength} chars`,
+          fix: "Server-render or prerender the shared content, or provide a markdown mirror, so crawlers get the same text as browsers."
+        });
+        continue;
+      }
+      findings.push({
+        code: "crawler.accessible",
+        level: "pass",
+        title: `${agent.vendor} ${agent.id} can reach the page`,
+        detail: `A request with the ${agent.id} user-agent returned the same success class as a browser.`,
+        evidence
+      });
+      continue;
+    }
+    if (result.status === 429) {
+      findings.push({
+        code: "crawler.rate-limited",
+        level: "warn",
+        title: `${agent.vendor} ${agent.id} is rate-limited`,
+        detail: `The ${agent.id} request was rate-limited (429). This is usually transient, but aggressive rate limits can starve crawlers of your content.`,
+        evidence
+      });
+      continue;
+    }
+    if (result.status === 403 || result.status === 401) {
+      const challenge = looksLikeBotChallenge(result);
+      if (challenge) {
+        findings.push({
+          code: "crawler.bot-challenge",
+          level: "warn",
+          title: `${agent.vendor} ${agent.id} hit a bot challenge`,
+          detail: `The ${agent.id} user-agent got a challenge/verification response (${result.status}). Because ${agent.id} is verified by ${agent.verification ?? "its published identity"}, the real crawler may pass where this spoofed user-agent does not. Confirm the verified bot is allowlisted at your CDN.`,
+          evidence,
+          fix: "Allowlist the crawler by its published IP ranges (verified bots) rather than relying on user-agent rules."
+        });
+      } else {
+        findings.push({
+          code: "crawler.ua-differential-block",
+          level: "warn",
+          title: `${agent.vendor} ${agent.id} is blocked from a generic IP`,
+          detail: `A browser gets ${control.status} but the ${agent.id} user-agent gets ${result.status}, with no challenge signal. Two things cause this and they mean opposite things: a user-agent-string WAF rule (which also blocks the real ${agent.id}) or IP allowlisting (where the verified ${agent.id} is fine). Check which it is at your CDN.`,
+          evidence,
+          fix: `If a WAF rule blocks the "${agent.id}" user-agent, remove or narrow it. If you allowlist verified bots by IP, no action is needed.`
+        });
+      }
+      continue;
+    }
+    if (botClass === 5) {
+      findings.push({
+        code: "crawler.origin-error",
+        level: "warn",
+        title: `${agent.vendor} ${agent.id} triggered a server error`,
+        detail: `The ${agent.id} user-agent got a ${result.status} while the browser got ${control.status}. Something in the stack treats this crawler differently and errors.`,
+        evidence
+      });
+      continue;
+    }
+    findings.push({
+      code: "crawler.differential-unknown",
+      level: "warn",
+      title: `${agent.vendor} ${agent.id} is treated differently than a browser`,
+      detail: `The ${agent.id} user-agent returned ${result.status} while a browser returned ${control.status}. The cause is unclear from the response; inspect the evidence.`,
+      evidence
+    });
+  }
+  return findings;
+}
 // src/findings.ts
 function worstLevel(findings) {
@@ -584,6 +726,12 @@ async function safeFetch(targetUrl, options) {
 }
 // src/audit.ts
+var SITEMAP_FALLBACK_PATHS = [
+  "/sitemap_index.xml",
+  "/sitemap-index.xml",
+  "/wp-sitemap.xml",
+  "/sitemap/sitemap.xml"
+];
 function originOf(url) {
   try {
     return new URL(url).origin;
@@ -591,6 +739,30 @@ function originOf(url) {
     return url.replace(/\/+$/, "");
   }
 }
+function markdownMirrorUrl(pageUrl) {
+  try {
+    const url = new URL(pageUrl);
+    const path = url.pathname.replace(/\/+$/, "");
+    if (path === "") return `${url.origin}/index.md`;
+    if (/\.[a-z0-9]+$/i.test(path)) return null;
+    return `${url.origin}${path}.md`;
+  } catch {
+    return null;
+  }
+}
+function notFetched(url) {
+  return {
+    requestedUrl: url,
+    finalUrl: url,
+    status: null,
+    ok: false,
+    headers: {},
+    body: null,
+    bodyBytes: 0,
+    redirects: 0,
+    blocked: false
+  };
+}
 async function audit(targetUrl, options) {
   const doFetch = options.fetchImpl ?? safeFetch;
   const timeoutMs = options.timeoutMs;
@@ -605,8 +777,7 @@ async function audit(targetUrl, options) {
     const result = await doFetch(targetUrl, {
       userAgent: agent.ua,
       timeoutMs,
-      readBody: true,
-      maxBytes: 64 * 1024
+      readBody: true
     });
     probes.push({ agent, result });
   }
@@ -622,11 +793,37 @@ async function audit(targetUrl, options) {
     readBody: true,
     maxBytes: 1024 * 1024
   });
+  const fetchSitemap = (path) => doFetch(`${origin}${path}`, {
+    userAgent: BROWSER_CONTROL.ua,
+    timeoutMs,
+    readBody: true,
+    maxBytes: 1024 * 1024
+  });
+  let sitemap = await fetchSitemap("/sitemap.xml");
+  if (!isXmlSitemap(sitemap) && !/^\s*sitemap\s*:/im.test(robots.body ?? "")) {
+    for (const path of SITEMAP_FALLBACK_PATHS) {
+      const candidate = await fetchSitemap(path);
+      if (isXmlSitemap(candidate)) {
+        sitemap = candidate;
+        break;
+      }
+    }
+  }
+  const mirrorUrl = markdownMirrorUrl(control.finalUrl || targetUrl);
+  const mirror = mirrorUrl ? await doFetch(mirrorUrl, {
+    userAgent: BROWSER_CONTROL.ua,
+    timeoutMs,
+    readBody: true,
+    maxBytes: 1024 * 1024
+  }) : notFetched(`${origin}/index.md`);
   const findings = [
     ...analyzeCrawlerAccess(control, probes),
     ...analyzeJsDependence(control),
     ...analyzeRobots(robots),
-    ...analyzeMachineReadable(control, llms)
+    ...analyzeMachineReadable(control, llms),
+    ...analyzeMarkdown(control, mirror),
+    ...analyzeSitemap(sitemap, robots),
+    ...analyzeMetadata(control)
   ];
   const counts = countByLevel(findings);
   const passed = counts.pass;
@@ -748,10 +945,13 @@ export {
   BROWSER_CONTROL,
   CRAWLER_AGENTS,
   ALL_AGENTS,
-  analyzeCrawlerAccess,
   analyzeJsDependence,
   analyzeRobots,
   analyzeMachineReadable,
+  analyzeMarkdown,
+  analyzeSitemap,
+  analyzeMetadata,
+  analyzeCrawlerAccess,
   worstLevel,
   countByLevel,
   parseIpv4,

package/dist/index.d.ts CHANGED Viewed

@@ -135,5 +135,15 @@ declare function analyzeJsDependence(control: FetchResult): AuditFinding[];
 declare function analyzeRobots(robots: FetchResult): AuditFinding[];
 /** Machine-readability surface on the homepage HTML plus a fetched llms.txt. */
 declare function analyzeMachineReadable(control: FetchResult, llms: FetchResult): AuditFinding[];
+/**
+ * Markdown mirrors / alternates are optional but valuable: they give agents a
+ * clean, low-noise version of the page (agentmarkup can generate them, and some
+ * CDNs serve runtime markdown). Present is a pass; absent emits no finding
+ * because a content-rich HTML page does not need one.
+ */
+declare function analyzeMarkdown(control: FetchResult, mirror: FetchResult): AuditFinding[];
+declare function analyzeSitemap(sitemap: FetchResult, robots: FetchResult): AuditFinding[];
+/** Core head metadata (title / description / canonical) crawlers use to attribute a page. */
+declare function analyzeMetadata(control: FetchResult): AuditFinding[];
-export { ALL_AGENTS, type AgentProbe, type AuditFinding, type AuditLevel, type AuditOptions, type AuditReport, BROWSER_CONTROL, CRAWLER_AGENTS, type CrawlerAgent, type FetchOptions, type FetchResult, type RunContext, analyzeCrawlerAccess, analyzeJsDependence, analyzeMachineReadable, analyzeRobots, audit, countByLevel, isBlockedHostname, parseIpv4, parseIpv6, renderJson, renderText, run, safeFetch, worstLevel };
+export { ALL_AGENTS, type AgentProbe, type AuditFinding, type AuditLevel, type AuditOptions, type AuditReport, BROWSER_CONTROL, CRAWLER_AGENTS, type CrawlerAgent, type FetchOptions, type FetchResult, type RunContext, analyzeCrawlerAccess, analyzeJsDependence, analyzeMachineReadable, analyzeMarkdown, analyzeMetadata, analyzeRobots, analyzeSitemap, audit, countByLevel, isBlockedHostname, parseIpv4, parseIpv6, renderJson, renderText, run, safeFetch, worstLevel };

package/dist/index.js CHANGED Viewed

@@ -5,7 +5,10 @@ import {
   analyzeCrawlerAccess,
   analyzeJsDependence,
   analyzeMachineReadable,
+  analyzeMarkdown,
+  analyzeMetadata,
   analyzeRobots,
+  analyzeSitemap,
   audit,
   countByLevel,
   isBlockedHostname,
@@ -16,7 +19,7 @@ import {
   run,
   safeFetch,
   worstLevel
-} from "./chunk-PNE6FBX2.js";
+} from "./chunk-VYQOM2ID.js";
 export {
   ALL_AGENTS,
   BROWSER_CONTROL,
@@ -24,7 +27,10 @@ export {
   analyzeCrawlerAccess,
   analyzeJsDependence,
   analyzeMachineReadable,
+  analyzeMarkdown,
+  analyzeMetadata,
   analyzeRobots,
+  analyzeSitemap,
   audit,
   countByLevel,
   isBlockedHostname,

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@agentmarkup/audit",
-  "version": "0.1.0",
-  "description": "Audit a live URL the way AI crawlers see it: fetch as GPTBot, ClaudeBot, PerplexityBot and more, diff against a browser to catch accidental CDN blocks, plus llms.txt, JSON-LD, robots.txt intent, Content-Signal, and JS-dependence checks",
+  "version": "0.2.1",
+  "description": "Audit a live URL the way AI crawlers see it: fetch as GPTBot, ClaudeBot, PerplexityBot and more, diff against a browser to catch accidental CDN blocks and JS-gated content, plus llms.txt, JSON-LD, robots.txt intent, Content-Signal, markdown mirror, sitemap, and page-metadata checks",
   "type": "module",
   "license": "MIT",
   "author": "Sebastian Cochinescu <hello@animafelix.com> (https://animafelix.com)",
@@ -23,6 +23,8 @@
     "robots-txt",
     "content-signal",
     "json-ld",
+    "sitemap",
+    "markdown",
     "geo",
     "aeo",
     "seo",
@@ -45,7 +47,7 @@
     "dist"
   ],
   "dependencies": {
-    "@agentmarkup/core": "0.5.2"
+    "@agentmarkup/core": "0.5.3"
   },
   "devDependencies": {
     "eslint": "^9.0.0",