npm - @agentmarkup/audit - Versions diffs - 0.1.0 - Mend

@agentmarkup/audit 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Sebastian Cochinescu and Anima Felix
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

package/README.md ADDED Viewed

@@ -0,0 +1,66 @@
+# @agentmarkup/audit
+Audit any live URL the way AI crawlers actually see it.
+Most SEO tools fetch a page once, as a browser, and grade the HTML. `@agentmarkup/audit` fetches the **same URL as GPTBot, ClaudeBot, PerplexityBot, OAI-SearchBot, and Google-Extended**, diffs each response against a normal browser, and reports where AI systems get a different — often worse — view than your human visitors. It also checks the machine-readable surface: `robots.txt` intent, Content-Signal, `llms.txt`, JSON-LD, and JavaScript-dependence.
+It is deterministic (pass / warn / error, no invented scores) and CI-friendly.
+## Usage
+```bash
+npx @agentmarkup/audit https://example.com
+```
+```bash
+# JSON for CI / league tables
+npx @agentmarkup/audit https://example.com --json
+# custom per-request timeout
+npx @agentmarkup/audit example.com --timeout 15000
+```
+Bare domains are normalized to `https://`. The exit code is `0` when there are no **error**-level findings, `1` when at least one is present (a CI gate), and `2` on a usage error.
+## What it checks
+| Area | What it does |
+| --- | --- |
+| **Crawler access** | Fetches as each AI crawler user-agent and diffs status against a browser control. Flags challenges, differential blocks, rate limits, and origin errors. |
+| **JS dependence** | Measures whether the raw (un-executed) HTML actually contains content, or is an empty `#root`/`#app` shell that only fills in after JavaScript runs. |
+| **robots.txt** | Reuses `@agentmarkup/core` to detect whether the crawlers you likely want are shadowed by a wildcard `Disallow`, and whether a canonical Content-Signal policy is present. |
+| **llms.txt** | Fetches `/llms.txt`, validates it, and checks that the homepage links to it for discovery. |
+| **JSON-LD** | Extracts and structurally validates JSON-LD blocks on the page. |
+## An honest note on "blocked" crawlers
+This tool spoofs a crawler's **user-agent** from an ordinary IP. That is exactly what a browser extension or a curious developer can do, and it is *not* what the real, verified bot does. So a `403` for a spoofed `GPTBot` user-agent is genuinely ambiguous:
+- it can be a **user-agent WAF rule** — which also blocks the real GPTBot (a real problem), **or**
+- it can be **IP allowlisting** — where the verified GPTBot, coming from OpenAI's published IP ranges, is let through just fine (no problem at all).
+From a spoofed request we cannot tell these apart, so the audit reports them as **warnings with both explanations and the raw evidence**, never as a bare "your site blocks AI" error. Error-level findings are reserved for things that are provable from the response itself: a `robots.txt` that literally disallows the crawler, an empty JavaScript shell, or invalid `llms.txt` / JSON-LD.
+## Programmatic use
+```ts
+import { audit, renderText } from '@agentmarkup/audit';
+const report = await audit('https://example.com', {
+  fetchedAt: new Date().toISOString(),
+});
+console.log(report.summary); // { pass, warn, error, checks, passed, worst }
+process.stdout.write(renderText(report));
+```
+The exported analyzers (`analyzeCrawlerAccess`, `analyzeRobots`, `analyzeJsDependence`, `analyzeMachineReadable`) and the SSRF-safe `safeFetch` are available for building custom pipelines.
+## Safety
+Requests are made with an SSRF-safe fetch: `localhost`, private, loopback, link-local, CGNAT, and IPv6-bypass address forms are refused, redirects are followed manually and re-validated per hop, and responses are size- and time-bounded. The blocklist mirrors the hosted checker at [agentmarkup.dev](https://agentmarkup.dev).
+## License
+MIT © Sebastian Cochinescu and Anima Felix
+Part of [agentmarkup](https://agentmarkup.dev) — build-time tooling to make websites machine-readable for LLMs and AI agents.

package/dist/bin.d.ts ADDED Viewed

	@@ -0,0 +1 @@
1	+ #!/usr/bin/env node

package/dist/bin.js ADDED Viewed

@@ -0,0 +1,25 @@
+#!/usr/bin/env node
+import {
+  run
+} from "./chunk-PNE6FBX2.js";
+// src/bin.ts
+import { createRequire } from "module";
+/**
+ * Read this package's version from the `package.json` one directory up.
+ * Returns "0.0.0" when the manifest cannot be loaded or carries no version.
+ */
+function resolveVersion() {
+  const fallback = "0.0.0";
+  try {
+    const { version } = createRequire(import.meta.url)("../package.json");
+    return version ?? fallback;
+  } catch {
+    return fallback;
+  }
+}
+// Entry point: run the CLI with the user's arguments. The resolved value becomes
+// the process exit code; a rejection is reported on stderr and exits with 1.
+const applyExitCode = (code) => {
+  process.exitCode = code;
+};
+const reportFatal = (error) => {
+  process.stderr.write(
+    `agentmarkup-audit: ${error instanceof Error ? error.message : String(error)}
+`
+  );
+  process.exitCode = 1;
+};
+run(process.argv.slice(2), { version: resolveVersion() }).then(applyExitCode).catch(reportFatal);

package/dist/chunk-PNE6FBX2.js ADDED Viewed

@@ -0,0 +1,765 @@
+// src/agents.ts
+// Baseline request identity: a desktop Chrome user-agent string. audit() sends it
+// for the control fetch of the page and for the robots.txt / llms.txt requests.
+var BROWSER_CONTROL = {
+  id: "browser",
+  ua: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
+  vendor: "Control",
+  control: true
+};
+// AI crawler identities probed by audit(), one request per entry.
+//   id           - short identifier used in finding titles and evidence strings
+//   ua           - user-agent header value sent for the probe
+//   vendor       - operator name shown in finding titles
+//   verification - how the vendor's real bot is verified; quoted in the bot-challenge finding
+//   intent       - "training" or "search"
+//   docsUrl      - vendor documentation link
+// NOTE(review): Google's crawler documentation describes Google-Extended as a
+// robots.txt product token with no separate HTTP user-agent string — confirm that
+// probing with the "Google-Extended/1.0" UA below tells you anything about how
+// Google actually fetches the page.
+var CRAWLER_AGENTS = [
+  {
+    id: "gptbot",
+    ua: "Mozilla/5.0 (compatible; GPTBot/1.1; +https://openai.com/gptbot)",
+    vendor: "OpenAI",
+    verification: "ip-range",
+    intent: "training",
+    docsUrl: "https://platform.openai.com/docs/bots"
+  },
+  {
+    id: "oai-searchbot",
+    ua: "Mozilla/5.0 (compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot)",
+    vendor: "OpenAI",
+    verification: "ip-range",
+    intent: "search",
+    docsUrl: "https://platform.openai.com/docs/bots"
+  },
+  {
+    id: "claudebot",
+    ua: "Mozilla/5.0 (compatible; ClaudeBot/1.0; +https://www.anthropic.com/claude-bot)",
+    vendor: "Anthropic",
+    verification: "ip-range",
+    intent: "training",
+    docsUrl: "https://support.anthropic.com/en/articles/8896518"
+  },
+  {
+    id: "perplexitybot",
+    ua: "Mozilla/5.0 (compatible; PerplexityBot/1.0; +https://perplexity.ai/perplexitybot)",
+    vendor: "Perplexity",
+    verification: "ip-range",
+    intent: "search",
+    docsUrl: "https://docs.perplexity.ai/guides/bots"
+  },
+  {
+    id: "google-extended",
+    ua: "Mozilla/5.0 (compatible; Google-Extended/1.0; +http://www.google.com/bot.html)",
+    vendor: "Google",
+    verification: "reverse-dns",
+    intent: "training",
+    docsUrl: "https://developers.google.com/search/docs/crawling-indexing/google-common-crawlers"
+  }
+];
+// The browser control followed by every crawler identity.
+var ALL_AGENTS = [BROWSER_CONTROL, ...CRAWLER_AGENTS];
+// src/analyzers/crawler-access.ts
+// Lower-case substrings whose presence in a response body marks it as a challenge page.
+var CHALLENGE_MARKERS = [
+  "cf-browser-verification",
+  "challenge-platform",
+  "just a moment",
+  "attention required",
+  "enable javascript and cookies to continue"
+];
+/**
+ * Decide whether a fetch result looks like a bot challenge / verification page.
+ * True when the `cf-mitigated` header mentions "challenge" or when the body
+ * (compared case-insensitively) contains one of CHALLENGE_MARKERS.
+ */
+function looksLikeBotChallenge(result) {
+  const mitigatedHeader = result.headers["cf-mitigated"];
+  if (mitigatedHeader) {
+    if (mitigatedHeader.toLowerCase().includes("challenge")) {
+      return true;
+    }
+  }
+  const haystack = (result.body ?? "").toLowerCase();
+  for (const marker of CHALLENGE_MARKERS) {
+    if (haystack.includes(marker)) {
+      return true;
+    }
+  }
+  return false;
+}
+/** Map an HTTP status to its hundreds class (404 -> 4); a null status stays null. */
+function statusClass(status) {
+  if (status === null) {
+    return null;
+  }
+  return Math.floor(status / 100);
+}
+/**
+ * Classify how each spoofed crawler user-agent was treated relative to the browser control.
+ *
+ * @param control Fetch result for the browser user-agent (the baseline).
+ * @param probes  Array of `{ agent, result }` pairs, one per crawler user-agent.
+ * @returns Findings: a single `crawler.control-failed` warning when the baseline is not an
+ *   error-free 2xx, otherwise exactly one finding per probe. Only a 2xx probe yields `pass`;
+ *   every other outcome is a `warn`, never an `error`.
+ */
+function analyzeCrawlerAccess(control, probes) {
+  const findings = [];
+  const controlClass = statusClass(control.status);
+  // Without a healthy baseline there is nothing to diff the probes against.
+  if (control.error || controlClass !== 2) {
+    findings.push({
+      code: "crawler.control-failed",
+      level: "warn",
+      title: "Could not establish a browser baseline",
+      detail: "The control request (normal browser user-agent) did not return a 2xx response, so bot-vs-browser differences cannot be judged reliably.",
+      evidence: `browser control: status=${control.status ?? "none"}${control.error ? ` error=${control.error}` : ""}`,
+      fix: "Confirm the URL is reachable and returns 200 in a browser, then re-run the audit."
+    });
+    return findings;
+  }
+  for (const { agent, result } of probes) {
+    const botClass = statusClass(result.status);
+    const evidence = `${agent.id} \u2192 status=${result.status ?? "none"}${result.error ? ` error=${result.error}` : ""}; browser \u2192 status=${control.status}`;
+    // Any transport-level failure (timeout, network error, SSRF refusal, redirect
+    // problems) means no usable response was observed, so no access conclusion is
+    // drawn. Previously only "timeout" and "network-error" were caught here and the
+    // other error codes were misreported as a differential with a null status.
+    if (result.error) {
+      findings.push({
+        code: "crawler.probe-failed",
+        level: "warn",
+        title: `Could not probe as ${agent.vendor} ${agent.id}`,
+        detail: `The request as ${agent.id} failed (${result.error}); no conclusion drawn for this crawler.`,
+        evidence
+      });
+      continue;
+    }
+    if (botClass === 2) {
+      findings.push({
+        code: "crawler.accessible",
+        level: "pass",
+        title: `${agent.vendor} ${agent.id} can reach the page`,
+        detail: `A request with the ${agent.id} user-agent returned the same success class as a browser.`,
+        evidence
+      });
+      continue;
+    }
+    if (result.status === 429) {
+      findings.push({
+        code: "crawler.rate-limited",
+        level: "warn",
+        title: `${agent.vendor} ${agent.id} is rate-limited`,
+        detail: `The ${agent.id} request was rate-limited (429). This is usually transient, but aggressive rate limits can starve crawlers of your content.`,
+        evidence
+      });
+      continue;
+    }
+    if (result.status === 403 || result.status === 401) {
+      // A challenge page and a plain block call for different advice, so split them.
+      const challenge = looksLikeBotChallenge(result);
+      if (challenge) {
+        findings.push({
+          code: "crawler.bot-challenge",
+          level: "warn",
+          title: `${agent.vendor} ${agent.id} hit a bot challenge`,
+          detail: `The ${agent.id} user-agent got a challenge/verification response (${result.status}). Because ${agent.id} is verified by ${agent.verification ?? "its published identity"}, the real crawler may pass where this spoofed user-agent does not. Confirm the verified bot is allowlisted at your CDN.`,
+          evidence,
+          fix: "Allowlist the crawler by its published IP ranges (verified bots) rather than relying on user-agent rules."
+        });
+      } else {
+        findings.push({
+          code: "crawler.ua-differential-block",
+          level: "warn",
+          title: `${agent.vendor} ${agent.id} is blocked from a generic IP`,
+          detail: `A browser gets ${control.status} but the ${agent.id} user-agent gets ${result.status}, with no challenge signal. Two things cause this and they mean opposite things: a user-agent-string WAF rule (which also blocks the real ${agent.id}) or IP allowlisting (where the verified ${agent.id} is fine). Check which it is at your CDN.`,
+          evidence,
+          fix: `If a WAF rule blocks the "${agent.id}" user-agent, remove or narrow it. If you allowlist verified bots by IP, no action is needed.`
+        });
+      }
+      continue;
+    }
+    if (botClass === 5) {
+      findings.push({
+        code: "crawler.origin-error",
+        level: "warn",
+        title: `${agent.vendor} ${agent.id} triggered a server error`,
+        detail: `The ${agent.id} user-agent got a ${result.status} while the browser got ${control.status}. Something in the stack treats this crawler differently and errors.`,
+        evidence
+      });
+      continue;
+    }
+    // Anything else (other 4xx, unexpected classes): report the difference without a diagnosis.
+    findings.push({
+      code: "crawler.differential-unknown",
+      level: "warn",
+      title: `${agent.vendor} ${agent.id} is treated differently than a browser`,
+      detail: `The ${agent.id} user-agent returned ${result.status} while a browser returned ${control.status}. The cause is unclear from the response; inspect the evidence.`,
+      evidence
+    });
+  }
+  return findings;
+}
+// src/analyzers/site-checks.ts
+import {
+  extractJsonLdScriptContents,
+  findBlockedCrawlers,
+  hasLlmsTxtDiscoveryLink,
+  validateJsonLdNode,
+  validateLlmsTxt
+} from "@agentmarkup/core";
+// Crawler name -> expected policy, passed to findBlockedCrawlers in analyzeRobots.
+// The name is the product token inside the UA's "(compatible; <Name>/<version>; ...)"
+// section. Splitting the whole UA on "/" yields "Mozilla" for every agent, which
+// collapses this map to the single key "Mozilla" and leaves the real crawlers unchecked.
+// NOTE(review): presumably findBlockedCrawlers (from @agentmarkup/core) matches these
+// keys against robots.txt User-agent lines — confirm the casing it expects.
+var EXPECTED_CRAWLERS = Object.fromEntries(
+  CRAWLER_AGENTS.map((agent) => {
+    const product = /compatible;\s*([^\/;\s]+)\//i.exec(agent.ua);
+    // Fall back to the previous derivation if a UA ever lacks a "compatible;" section.
+    return [product ? product[1] : agent.ua.split("/")[0], "allow"];
+  })
+);
+/**
+ * Reduce HTML to its visible text: drop script/style/template/noscript elements
+ * together with their contents, replace every remaining tag with a space, then
+ * collapse whitespace runs and trim.
+ */
+function stripTags(html) {
+  const withoutHiddenBlocks = html.replace(/<(script|style|template|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, " ");
+  const withoutTags = withoutHiddenBlocks.replace(/<[^>]+>/g, " ");
+  return withoutTags.replace(/\s+/g, " ").trim();
+}
+// Matches an empty mount-point element such as <div id="root"></div>; the id must be quoted.
+var EMPTY_ROOT_RE = /<(?:div|main)\b[^>]*\bid=(['"])(?:root|app|__next|__nuxt|svelte)\1[^>]*>\s*<\/(?:div|main)>/i;
+/**
+ * Judge whether the raw (un-executed) HTML of the control response carries real content.
+ *
+ * @param control Fetch result for the browser user-agent.
+ * @returns An empty array when the control has an error, no body, or a status >= 400;
+ *   otherwise one finding: `js.empty-shell` (error) for under 200 characters of text plus
+ *   an empty root container, `js.thin-html` (warn) for under 200 characters alone, or
+ *   `js.server-rendered` (pass).
+ */
+function analyzeJsDependence(control) {
+  const status = control.status ?? 0;
+  const usable = !control.error && Boolean(control.body) && !(status >= 400);
+  if (!usable) {
+    return [];
+  }
+  const text = stripTags(control.body);
+  const isThin = text.length < 200;
+  if (!isThin) {
+    return [
+      {
+        code: "js.server-rendered",
+        level: "pass",
+        title: "Content is present without JavaScript",
+        detail: "The raw HTML already contains meaningful text, so crawlers that do not execute JavaScript can read the page.",
+        evidence: `raw text length=${text.length}`
+      }
+    ];
+  }
+  if (EMPTY_ROOT_RE.test(control.body)) {
+    return [
+      {
+        code: "js.empty-shell",
+        level: "error",
+        title: "Page content requires JavaScript",
+        detail: "The raw HTML has an empty root container and almost no text. Most AI crawlers do not run JavaScript, so they see an empty page. Server-render or prerender the content.",
+        evidence: `raw text length=${text.length}; empty root container detected`,
+        fix: "Prerender or SSR the page, or add markdown mirrors (agentmarkup markdownPages) so agents get real content."
+      }
+    ];
+  }
+  return [
+    {
+      code: "js.thin-html",
+      level: "warn",
+      title: "Raw HTML is very thin",
+      detail: "The raw (un-executed) HTML contains little text. If the real content is injected by JavaScript, crawlers that do not run JS will miss it.",
+      evidence: `raw text length=${text.length}`,
+      fix: "Confirm meaningful content is present without JavaScript; consider markdown mirrors."
+    }
+  ];
+}
+/**
+ * Evaluate the fetched robots.txt.
+ *
+ * @param robots Fetch result for `<origin>/robots.txt`.
+ * @returns A single `robots.missing` warning when the file is unreachable or empty;
+ *   otherwise two findings: whether findBlockedCrawlers reports any of
+ *   EXPECTED_CRAWLERS as blocked, and whether a `Content-Signal:` line is present.
+ */
+function analyzeRobots(robots) {
+  const reachable = !robots.error && (robots.status ?? 0) < 400 && Boolean(robots.body);
+  if (!reachable) {
+    return [
+      {
+        code: "robots.missing",
+        level: "warn",
+        title: "No robots.txt found",
+        detail: "No reachable robots.txt. Crawlers assume full access, but you also cannot express AI-specific or Content-Signal preferences.",
+        fix: "Generate robots.txt with agentmarkup (aiCrawlers + contentSignalHeaders)."
+      }
+    ];
+  }
+  const body = robots.body ?? "";
+  const blocked = findBlockedCrawlers(body, EXPECTED_CRAWLERS);
+  const crawlerFinding = blocked.length > 0 ? {
+    code: "robots.blocks-crawlers",
+    level: "error",
+    title: "robots.txt blocks AI crawlers you likely want",
+    detail: `A wildcard disallow shadows these crawlers: ${blocked.join(", ")}. Blocking search/retrieval crawlers drops you from AI answers.`,
+    evidence: blocked.join(", "),
+    fix: "Split rules by intent: block training crawlers if you must, but keep search/retrieval crawlers allowed."
+  } : {
+    code: "robots.crawlers-allowed",
+    level: "pass",
+    title: "robots.txt does not block the expected AI crawlers",
+    detail: "None of the checked AI crawlers are shadowed by a wildcard disallow."
+  };
+  // Case-insensitive match on a line that starts with "Content-Signal:".
+  const hasContentSignal = /^\s*content-signal\s*:/im.test(body);
+  const signalFinding = hasContentSignal ? {
+    code: "robots.content-signal",
+    level: "pass",
+    title: "Content-Signal policy present in robots.txt",
+    detail: "The canonical Content-Signal directive is in robots.txt, where the Content Signals Policy and Lighthouse look for it."
+  } : {
+    code: "robots.no-content-signal",
+    level: "warn",
+    title: "No Content-Signal policy in robots.txt",
+    detail: "Content-Signal in robots.txt is the canonical place to state training/search/ai-input preferences. It may still be set as an HTTP header, which fewer tools read.",
+    fix: "Enable agentmarkup contentSignalHeaders so Content-Signal is written into robots.txt."
+  };
+  return [crawlerFinding, signalFinding];
+}
+/**
+ * Check the machine-readable surface: llms.txt validity and discovery, and JSON-LD.
+ *
+ * @param control Fetch result for the page (browser user-agent); its body is the HTML inspected.
+ * @param llms    Fetch result for `<origin>/llms.txt`.
+ * @returns One llms.txt finding (invalid / present / missing), an optional
+ *   `llms.no-discovery-link` warning, and — only when the page has a body — one
+ *   JSON-LD finding (missing / invalid / present).
+ */
+function analyzeMachineReadable(control, llms) {
+  const html = control.body ?? "";
+  const llmsOk = !llms.error && (llms.status ?? 0) < 400 && Boolean(llms.body);
+  const findings = [];
+  if (!llmsOk) {
+    findings.push({
+      code: "llms.missing",
+      level: "warn",
+      title: "No llms.txt found",
+      detail: "No reachable /llms.txt. This is optional \u2014 it helps AI coding tools and some assistants, but major crawlers do not require it.",
+      fix: "Generate llms.txt with agentmarkup if you want a curated agent manifest."
+    });
+  } else {
+    const llmsErrors = validateLlmsTxt(llms.body ?? "").filter((r) => r.severity === "error");
+    if (llmsErrors.length > 0) {
+      findings.push({
+        code: "llms.invalid",
+        level: "error",
+        title: "llms.txt has errors",
+        detail: llmsErrors.map((r) => r.message).join("; ")
+      });
+    } else {
+      findings.push({
+        code: "llms.present",
+        level: "pass",
+        title: "llms.txt is present and well-formed",
+        detail: "A parseable llms.txt was found. Note: most AI crawlers do not yet fetch llms.txt, but AI coding tools and some assistants do."
+      });
+    }
+  }
+  // The remaining checks inspect the page HTML, so there is nothing to do without it.
+  if (!html) {
+    return findings;
+  }
+  if (!hasLlmsTxtDiscoveryLink(html) && llmsOk) {
+    findings.push({
+      code: "llms.no-discovery-link",
+      level: "warn",
+      title: "llms.txt is not linked from the homepage",
+      detail: 'An llms.txt exists but the homepage has no <link rel="alternate" type="text/plain" href="/llms.txt">, so agents cannot discover it from the page.',
+      fix: "agentmarkup injects this discovery link automatically."
+    });
+  }
+  const blocks = extractJsonLdScriptContents(html);
+  if (blocks.length === 0) {
+    findings.push({
+      code: "jsonld.missing",
+      level: "warn",
+      title: "No JSON-LD structured data",
+      detail: "The page has no JSON-LD. Structured data helps AI systems and search understand the page entity.",
+      fix: "Add JSON-LD with agentmarkup schema presets (webSite, organization, article, \u2026)."
+    });
+    return findings;
+  }
+  const jsonLdErrors = [];
+  for (const block of blocks) {
+    try {
+      const parsed = JSON.parse(block);
+      // A block may hold a single node or an array of nodes; validate each node.
+      const nodes = Array.isArray(parsed) ? parsed : [parsed];
+      for (const node of nodes) {
+        for (const r of validateJsonLdNode(node)) {
+          if (r.severity === "error") jsonLdErrors.push(r.message);
+        }
+      }
+    } catch {
+      jsonLdErrors.push("a JSON-LD script block is not valid JSON");
+    }
+  }
+  if (jsonLdErrors.length > 0) {
+    findings.push({
+      code: "jsonld.invalid",
+      level: "error",
+      title: "JSON-LD has errors",
+      detail: jsonLdErrors.join("; ")
+    });
+  } else {
+    findings.push({
+      code: "jsonld.present",
+      level: "pass",
+      title: "JSON-LD structured data present",
+      detail: `${blocks.length} JSON-LD block(s) found and structurally valid.`
+    });
+  }
+  return findings;
+}
+// src/findings.ts
+/** Return the most severe level among the findings: "error" over "warn" over "pass". */
+function worstLevel(findings) {
+  let worst = "pass";
+  for (const finding of findings) {
+    if (finding.level === "error") {
+      return "error";
+    }
+    if (finding.level === "warn") {
+      worst = "warn";
+    }
+  }
+  return worst;
+}
+/** Tally the findings per level; all three keys are always present in the result. */
+function countByLevel(findings) {
+  const tally = { pass: 0, warn: 0, error: 0 };
+  for (const finding of findings) {
+    tally[finding.level] += 1;
+  }
+  return tally;
+}
+// src/net.ts
+// Defaults safeFetch applies when the caller omits the matching option.
+// Timeout per request: 10 000 ms.
+var DEFAULT_TIMEOUT_MS = 1e4;
+// Response-body cap: 5 MiB.
+var DEFAULT_MAX_BYTES = 5 * 1024 * 1024;
+// Bound on the safeFetch request loop, which runs one iteration per hop.
+var MAX_REDIRECTS = 5;
+/**
+ * Parse a dotted-quad IPv4 string into its four octets.
+ * Returns null when the text is not four 1-3 digit groups or any octet exceeds 255.
+ */
+function parseIpv4(value) {
+  const parts = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(value);
+  if (parts === null) {
+    return null;
+  }
+  const octets = [];
+  for (const part of parts.slice(1)) {
+    const octet = Number(part);
+    if (octet > 255) {
+      return null;
+    }
+    octets.push(octet);
+  }
+  return octets;
+}
+/**
+ * Report whether an IPv4 address (as four octets) falls in a refused range:
+ * 0.0.0.0/8, 10.0.0.0/8, 127.0.0.0/8, 100.64.0.0/10, 169.254.0.0/16,
+ * 172.16.0.0/12, or 192.168.0.0/16.
+ */
+function isBlockedIpv4(octets) {
+  const [a, b] = octets;
+  if (a === 0 || a === 10 || a === 127) {
+    return true;
+  }
+  if (a === 100) {
+    return b >= 64 && b <= 127;
+  }
+  if (a === 169) {
+    return b === 254;
+  }
+  if (a === 172) {
+    return b >= 16 && b <= 31;
+  }
+  if (a === 192) {
+    return b === 168;
+  }
+  return false;
+}
+/**
+ * Parse an IPv6 address (without brackets) into eight 16-bit groups.
+ *
+ * Accepts the full form, a single `::` compression, and a trailing dotted-quad
+ * IPv4 suffix, including when the suffix directly follows `::` (for example
+ * `::127.0.0.1` or `64:ff9b::10.0.0.1`). Hex digits may be upper or lower case.
+ *
+ * @param value Candidate address text.
+ * @returns Eight numbers in the range 0-65535, or null when the text is not a valid address.
+ */
+function parseIpv6(value) {
+  if (!value.includes(":")) return null;
+  let head = value;
+  const embedded = [];
+  const lastColon = value.lastIndexOf(":");
+  const suffix = value.slice(lastColon + 1);
+  if (suffix.includes(".")) {
+    const v4 = parseIpv4(suffix);
+    if (!v4) return null;
+    // The dotted quad occupies the last two 16-bit groups.
+    embedded.push(v4[0] << 8 | v4[1], v4[2] << 8 | v4[3]);
+    head = value.slice(0, lastColon);
+    // When the suffix sat right after "::", the slice removed the second colon of
+    // the compression marker; restore it so "::1.2.3.4" still reads as compressed.
+    if (head.endsWith(":")) head += ":";
+  }
+  const halves = head.split("::");
+  if (halves.length > 2) return null;
+  const parseGroups = (part) => part === "" ? [] : part.split(":").map(
+    (group) => /^[0-9a-f]{1,4}$/i.test(group) ? parseInt(group, 16) : NaN
+  );
+  const left = parseGroups(halves[0]);
+  const right = halves.length === 2 ? parseGroups(halves[1]) : null;
+  let groups;
+  if (right === null) {
+    groups = [...left, ...embedded];
+  } else {
+    const known = left.length + right.length + embedded.length;
+    const missing = 8 - known;
+    // "::" must stand for at least one zero group.
+    if (missing < 1) return null;
+    groups = [...left, ...Array(missing).fill(0), ...right, ...embedded];
+  }
+  if (groups.length !== 8 || groups.some((group) => Number.isNaN(group))) {
+    return null;
+  }
+  return groups;
+}
+/**
+ * Report whether an IPv6 address (as eight 16-bit groups) must be refused.
+ * Refused: `::`, `::1`, IPv4-mapped and IPv4-compatible forms whose embedded IPv4
+ * address is itself blocked, and the fc00::/7, fe80::/10 and fec0::/10 prefixes.
+ */
+function isBlockedIpv6(groups) {
+  const allZero = (part) => part.every((group) => group === 0);
+  if (allZero(groups)) {
+    return true;
+  }
+  if (allZero(groups.slice(0, 7)) && groups[7] === 1) {
+    return true;
+  }
+  const isMapped = allZero(groups.slice(0, 5)) && groups[5] === 0xffff;
+  const isCompatible = allZero(groups.slice(0, 6));
+  if (isMapped || isCompatible) {
+    // The low 32 bits carry an IPv4 address; judge it by the IPv4 rules.
+    const high = groups[6];
+    const low = groups[7];
+    return isBlockedIpv4([high >> 8, high & 0xff, low >> 8, low & 0xff]);
+  }
+  const first = groups[0];
+  const isUniqueLocal = (first & 0xfe00) === 0xfc00;
+  const isLinkLocal = (first & 0xffc0) === 0xfe80;
+  const isSiteLocal = (first & 0xffc0) === 0xfec0;
+  return isUniqueLocal || isLinkLocal || isSiteLocal;
+}
+/**
+ * Report whether a URL hostname must be refused by the SSRF guard.
+ *
+ * Refused: `localhost`, any `.localhost` or `.local` name, and IPv4 / IPv6 literals
+ * (brackets optional) that fall in a blocked range. Comparison is case-insensitive
+ * and ignores trailing dots, so the fully-qualified spelling `localhost.` is
+ * treated the same as `localhost`.
+ *
+ * @param hostname Host part of a URL, e.g. `URL.hostname`.
+ * @returns true when the host must not be fetched.
+ */
+function isBlockedHostname(hostname) {
+  // A trailing dot names the same host ("localhost." still resolves to loopback),
+  // so strip it before any comparison.
+  const lower = hostname.toLowerCase().replace(/\.+$/, "");
+  if (lower === "localhost" || lower.endsWith(".localhost") || lower.endsWith(".local")) {
+    return true;
+  }
+  const ipv4 = parseIpv4(lower);
+  if (ipv4) return isBlockedIpv4(ipv4);
+  const ipv6 = parseIpv6(lower.replace(/^\[|\]$/g, ""));
+  if (ipv6) return isBlockedIpv6(ipv6);
+  return false;
+}
+/** A URL may be fetched only when it is http(s) and its hostname is not blocked. */
+function isFetchableUrl(url) {
+  const isHttp = url.protocol === "http:" || url.protocol === "https:";
+  if (!isHttp) {
+    return false;
+  }
+  return !isBlockedHostname(url.hostname);
+}
+/**
+ * Read a response body while keeping at most `maxBytes` of it.
+ *
+ * @param response A fetch Response (or an object exposing `body` and `text()`).
+ * @param maxBytes Cap on the retained body.
+ * @returns `{ text, bytes }`: the retained body decoded as UTF-8 and the number of
+ *   bytes received before reading stopped (which can exceed the retained amount).
+ */
+async function readBounded(response, maxBytes) {
+  const stream = response.body;
+  if (!stream) {
+    // No stream available: read everything as text, then truncate.
+    const whole = await response.text();
+    return { text: whole.slice(0, maxBytes), bytes: Buffer.byteLength(whole) };
+  }
+  const reader = stream.getReader();
+  const kept = [];
+  let received = 0;
+  let stored = 0;
+  while (true) {
+    const { done, value } = await reader.read();
+    if (done) break;
+    received += value.byteLength;
+    const room = maxBytes - stored;
+    if (room > 0) {
+      const piece = value.byteLength > room ? value.slice(0, room) : value;
+      kept.push(Buffer.from(piece));
+      stored += piece.byteLength;
+    }
+    if (received >= maxBytes) {
+      // Cap reached: stop the transfer; a failing cancel is ignored on purpose.
+      await reader.cancel().catch(() => void 0);
+      break;
+    }
+  }
+  return { text: Buffer.concat(kept).toString("utf8"), bytes: received };
+}
+/**
+ * Fetch a URL with SSRF checks, manual redirect handling, a timeout and a body cap.
+ *
+ * Never rejects: every failure is reported through the `error` field of the result.
+ *
+ * @param targetUrl Absolute URL to request.
+ * @param options   `userAgent` (sent as the `user-agent` header), plus optional
+ *   `timeoutMs` (default DEFAULT_TIMEOUT_MS, applied to each hop), `maxBytes`
+ *   (default DEFAULT_MAX_BYTES) and `readBody` (default true).
+ * @returns A result object. `error` is one of "invalid-url", "blocked-by-ssrf-rules",
+ *   "redirect-without-location", "invalid-redirect-target", "timeout",
+ *   "network-error" or "too-many-redirects"; it is absent on success.
+ */
+async function safeFetch(targetUrl, options) {
+  const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
+  const maxBytes = options.maxBytes ?? DEFAULT_MAX_BYTES;
+  const readBody = options.readBody ?? true;
+  // Template for every failure result; successful results are built explicitly below.
+  const base = {
+    requestedUrl: targetUrl,
+    finalUrl: targetUrl,
+    status: null,
+    ok: false,
+    headers: {},
+    body: null,
+    bodyBytes: 0,
+    redirects: 0,
+    blocked: false
+  };
+  let currentUrl;
+  try {
+    const parsed = new URL(targetUrl);
+    if (!isFetchableUrl(parsed)) {
+      return { ...base, blocked: true, error: "blocked-by-ssrf-rules" };
+    }
+    currentUrl = parsed.toString();
+  } catch {
+    return { ...base, error: "invalid-url" };
+  }
+  // `hop` is the number of redirects already followed. The bound is inclusive so that
+  // exactly MAX_REDIRECTS redirects can be followed before giving up; the previous
+  // exclusive bound stopped one redirect short of the constant's name.
+  for (let hop = 0; hop <= MAX_REDIRECTS; hop += 1) {
+    const controller = new AbortController();
+    const timer = setTimeout(() => controller.abort(), timeoutMs);
+    try {
+      const response = await fetch(currentUrl, {
+        redirect: "manual",
+        signal: controller.signal,
+        headers: {
+          "user-agent": options.userAgent,
+          accept: "text/html,text/plain,application/xml,application/json;q=0.9,*/*;q=0.1"
+        }
+      });
+      const headers = {};
+      response.headers.forEach((value, key) => {
+        headers[key.toLowerCase()] = value;
+      });
+      if (response.status >= 300 && response.status < 400) {
+        // A redirect body is never used; release it before any return path so the
+        // connection is not left holding an unread stream.
+        await response.body?.cancel().catch(() => void 0);
+        const redirectFailure = (extra) => ({
+          ...base,
+          finalUrl: currentUrl,
+          status: response.status,
+          headers,
+          redirects: hop,
+          ...extra
+        });
+        const location = response.headers.get("location");
+        if (!location) {
+          return redirectFailure({ error: "redirect-without-location" });
+        }
+        let nextUrl;
+        try {
+          nextUrl = new URL(location, currentUrl);
+        } catch {
+          return redirectFailure({ error: "invalid-redirect-target" });
+        }
+        // Re-validate every hop: a public URL may redirect to an internal one.
+        if (!isFetchableUrl(nextUrl)) {
+          return redirectFailure({
+            finalUrl: nextUrl.toString(),
+            blocked: true,
+            error: "blocked-by-ssrf-rules"
+          });
+        }
+        currentUrl = nextUrl.toString();
+        continue;
+      }
+      let body = null;
+      let bodyBytes = 0;
+      if (readBody) {
+        const read = await readBounded(response, maxBytes);
+        body = read.text;
+        bodyBytes = read.bytes;
+      } else {
+        await response.body?.cancel().catch(() => void 0);
+      }
+      return {
+        requestedUrl: targetUrl,
+        finalUrl: currentUrl,
+        status: response.status,
+        ok: response.ok,
+        headers,
+        body,
+        bodyBytes,
+        redirects: hop,
+        blocked: false
+      };
+    } catch (error) {
+      // An abort raised by the timer surfaces as AbortError; anything else is a network failure.
+      const aborted = error instanceof Error && error.name === "AbortError";
+      return {
+        ...base,
+        finalUrl: currentUrl,
+        redirects: hop,
+        error: aborted ? "timeout" : "network-error"
+      };
+    } finally {
+      clearTimeout(timer);
+    }
+  }
+  // Every permitted hop ended in another redirect.
+  return {
+    ...base,
+    finalUrl: currentUrl,
+    redirects: MAX_REDIRECTS,
+    error: "too-many-redirects"
+  };
+}
+// src/audit.ts
+/**
+ * Return the origin (scheme + host + port) of a URL string.
+ * For text that does not parse as a URL, return it with trailing slashes removed.
+ */
+function originOf(url) {
+  let parsed;
+  try {
+    parsed = new URL(url);
+  } catch {
+    return url.replace(/\/+$/, "");
+  }
+  return parsed.origin;
+}
+/**
+ * Run the full audit for one URL and assemble the report.
+ *
+ * Requests are awaited one at a time: the browser control, one probe per entry in
+ * CRAWLER_AGENTS, then `<origin>/robots.txt` and `<origin>/llms.txt`.
+ *
+ * @param targetUrl Absolute URL of the page to audit.
+ * @param options   Optional settings: `fetchImpl` (replacement for safeFetch with the
+ *   same signature), `timeoutMs` (per-request timeout) and `fetchedAt` (timestamp
+ *   copied into the report). May be omitted entirely.
+ * @returns `{ url, finalUrl, fetchedAt, findings, summary }`, where `summary` holds
+ *   the per-level counts plus `checks`, `passed` and `worst`.
+ */
+async function audit(targetUrl, options = {}) {
+  const doFetch = options.fetchImpl ?? safeFetch;
+  const timeoutMs = options.timeoutMs;
+  const origin = originOf(targetUrl);
+  const control = await doFetch(targetUrl, {
+    userAgent: BROWSER_CONTROL.ua,
+    timeoutMs,
+    readBody: true
+  });
+  const probes = [];
+  for (const agent of CRAWLER_AGENTS) {
+    // Probe bodies are only scanned for challenge markers, so a small cap is enough.
+    const result = await doFetch(targetUrl, {
+      userAgent: agent.ua,
+      timeoutMs,
+      readBody: true,
+      maxBytes: 64 * 1024
+    });
+    probes.push({ agent, result });
+  }
+  const robots = await doFetch(`${origin}/robots.txt`, {
+    userAgent: BROWSER_CONTROL.ua,
+    timeoutMs,
+    readBody: true,
+    maxBytes: 256 * 1024
+  });
+  const llms = await doFetch(`${origin}/llms.txt`, {
+    userAgent: BROWSER_CONTROL.ua,
+    timeoutMs,
+    readBody: true,
+    maxBytes: 1024 * 1024
+  });
+  const findings = [
+    ...analyzeCrawlerAccess(control, probes),
+    ...analyzeJsDependence(control),
+    ...analyzeRobots(robots),
+    ...analyzeMachineReadable(control, llms)
+  ];
+  const counts = countByLevel(findings);
+  return {
+    url: targetUrl,
+    finalUrl: control.finalUrl,
+    fetchedAt: options.fetchedAt,
+    findings,
+    summary: {
+      ...counts,
+      checks: findings.length,
+      passed: counts.pass,
+      worst: worstLevel(findings)
+    }
+  };
+}
+// src/report.ts
+// ANSI escape sequences used by the text report.
+var RESET = "\x1B[0m";
+var GREEN = "\x1B[32m";
+var YELLOW = "\x1B[33m";
+var RED = "\x1B[31m";
+var BOLD = "\x1B[1m";
+var DIM = "\x1B[2m";
+// Coloured marker printed in front of each finding title.
+var GLYPH = {
+  pass: `${GREEN}\u2713${RESET}`,
+  warn: `${YELLOW}\u26A0${RESET}`,
+  error: `${RED}\u2717${RESET}`
+};
+// Sort rank per level: errors first, passes last.
+var ORDER = { error: 0, warn: 1, pass: 2 };
+/**
+ * Render a report as ANSI-coloured text: a header line, the findings ordered by
+ * severity (each with detail and optional evidence / fix lines), and a summary line.
+ */
+function renderText(report) {
+  const dim = (text) => `${DIM}${text}${RESET}`;
+  const bySeverity = [...report.findings].sort(
+    (a, b) => ORDER[a.level] - ORDER[b.level]
+  );
+  const lines = ["", `${BOLD}agentmarkup audit${RESET} ${dim(report.url)}`, ""];
+  for (const finding of bySeverity) {
+    lines.push(`  ${GLYPH[finding.level]} ${finding.title}`);
+    lines.push(`    ${dim(finding.detail)}`);
+    if (finding.evidence) {
+      lines.push(`    ${dim(`evidence: ${finding.evidence}`)}`);
+    }
+    if (finding.fix) {
+      lines.push(`    ${dim(`fix: ${finding.fix}`)}`);
+    }
+    lines.push("");
+  }
+  const { passed, checks, error, warn } = report.summary;
+  let headline;
+  if (error > 0) {
+    headline = `${RED}${error} error(s)${RESET}, ${warn} warning(s)`;
+  } else if (warn > 0) {
+    headline = `${YELLOW}${warn} warning(s)${RESET}`;
+  } else {
+    headline = `${GREEN}all clear${RESET}`;
+  }
+  lines.push(`  ${BOLD}${passed}/${checks} checks passed${RESET} \u2014 ${headline}`);
+  lines.push("");
+  return lines.join("\n");
+}
+/** Serialize a report as JSON indented by two spaces. */
+function renderJson(report) {
+  const indent = 2;
+  return JSON.stringify(report, null, indent);
+}
+// src/cli.ts
+// Usage text: written to stdout for --help / -h and to stderr after the
+// "missing <url>" error.
+var HELP = `agentmarkup audit \u2014 see a URL the way AI crawlers do
+Usage:
+  agentmarkup-audit <url> [options]
+Options:
+  --json            Output the full report as JSON (for CI / league tables)
+  --timeout <ms>    Per-request timeout in milliseconds (default 10000)
+  --version         Print version
+  --help            Show this help
+Exit codes:
+  0  no error-level findings
+  1  at least one error-level finding (CI gate)
+  2  usage error
+`;
+/** Prefix `https://` unless the input already starts with an http(s) scheme. */
+function normalizeUrl(input) {
+  const hasHttpScheme = /^https?:\/\//i.test(input);
+  return hasHttpScheme ? input : `https://${input}`;
+}
+/**
+ * CLI driver: parse the arguments, run the audit and print the report.
+ *
+ * @param argv Command-line arguments without the node / script entries.
+ * @param ctx  `version` plus optional `stdout` / `stderr` writers and a `now` clock.
+ * @returns The exit code: 0 when no error-level finding exists (also for --help and
+ *   --version), 1 when at least one does, 2 on a usage error.
+ */
+async function run(argv, ctx) {
+  const out = ctx.stdout ?? ((t) => process.stdout.write(t));
+  const err = ctx.stderr ?? ((t) => process.stderr.write(t));
+  const hasFlag = (flag) => argv.includes(flag);
+  if (hasFlag("--help") || hasFlag("-h")) {
+    out(HELP);
+    return 0;
+  }
+  if (hasFlag("--version")) {
+    out(`${ctx.version}
+`);
+    return 0;
+  }
+  const wantsJson = hasFlag("--json");
+  let timeoutMs;
+  const timeoutAt = argv.indexOf("--timeout");
+  if (timeoutAt !== -1) {
+    const requested = Number(argv[timeoutAt + 1]);
+    const valid = Number.isFinite(requested) && requested > 0;
+    if (!valid) {
+      err("agentmarkup-audit: --timeout expects a positive number of milliseconds\n");
+      return 2;
+    }
+    timeoutMs = requested;
+  }
+  // The target is the first argument that is neither a flag nor the value of --timeout.
+  const target = argv.find(
+    (arg, i) => !arg.startsWith("-") && argv[i - 1] !== "--timeout"
+  );
+  if (!target) {
+    err("agentmarkup-audit: missing <url>\n\n");
+    err(HELP);
+    return 2;
+  }
+  const clock = ctx.now ?? (() => (/* @__PURE__ */ new Date()).toISOString());
+  const fetchedAt = clock();
+  const report = await audit(normalizeUrl(target), { timeoutMs, fetchedAt });
+  if (wantsJson) {
+    out(`${renderJson(report)}
+`);
+  } else {
+    out(renderText(report));
+  }
+  return report.summary.worst === "error" ? 1 : 0;
+}
+export {
+  BROWSER_CONTROL,
+  CRAWLER_AGENTS,
+  ALL_AGENTS,
+  analyzeCrawlerAccess,
+  analyzeJsDependence,
+  analyzeRobots,
+  analyzeMachineReadable,
+  worstLevel,
+  countByLevel,
+  parseIpv4,
+  parseIpv6,
+  isBlockedHostname,
+  safeFetch,
+  audit,
+  renderText,
+  renderJson,
+  run
+};

package/dist/index.d.ts ADDED Viewed

@@ -0,0 +1,139 @@
+/**
+ * Severity level of an audit finding (see `AuditFinding`). Findings are
+ * deterministic pass/warn/error — no scores, matching the repo's checker
+ * convention. A finding's `evidence` carries the raw observation so the
+ * report never asserts more than it saw (see the crawler-access classifier).
+ */
+type AuditLevel = 'pass' | 'warn' | 'error';
+interface AuditFinding { // One audit finding; the analyze* functions return arrays of these.
+    /** Stable machine-readable code, e.g. `crawler.ua-waf-block`. */
+    code: string;
+    level: AuditLevel; // Severity: 'pass', 'warn' or 'error'.
+    /** One-line summary. */
+    title: string;
+    /** Human explanation of what was checked and what it means. */
+    detail: string;
+    /** Raw evidence for the finding (status codes, headers), when applicable. */
+    evidence?: string;
+    /** Concrete next step, e.g. an agentmarkup config snippet or CDN setting. */
+    fix?: string;
+}
+declare function worstLevel(findings: AuditFinding[]): AuditLevel; // Reduces findings to a single level — presumably the most severe one present (implementation is not in this declaration file; confirm).
+declare function countByLevel(findings: AuditFinding[]): Record<AuditLevel, number>; // Returns one number per level ('pass' / 'warn' / 'error') — presumably how many findings carry that level (confirm).
+/**
+ * Options for `safeFetch`, the SSRF-safe fetch used by every audit probe. The
+ * hostname blocklist mirrors the hardened checker worker
+ * (`website/public/_worker.js`) — evaluated as numeric IPs so IPv4-mapped
+ * IPv6, `::`, `fe80::/10`, and CGNAT cannot slip through.
+ *
+ * The audit CLI runs on the user's own machine auditing their own site, so the
+ * SSRF surface is lower than the hosted checker, but the same guard keeps the
+ * probe honest and lets this module be reused by a future hosted audit.
+ */
+interface FetchOptions {
+    userAgent: string; // User-agent to present; `safeFetch` is documented as fetching with a spoofed user-agent.
+    timeoutMs?: number; // Timeout, in milliseconds going by the name; the default is not visible here — confirm.
+    maxBytes?: number; // Size bound — presumably caps how much of the response body is read (confirm).
+    /** Read and return the response body (default true). */
+    readBody?: boolean;
+}
+interface FetchResult { // What `safeFetch` resolves to for a single request.
+    requestedUrl: string;
+    finalUrl: string; // Presumably the URL after following redirects (see `redirects`) — confirm.
+    status: number | null; // Nullable; presumably null when no HTTP response was obtained — confirm.
+    ok: boolean;
+    headers: Record<string, string>;
+    body: string | null; // Nullable; `FetchOptions.readBody` controls whether the body is read and returned.
+    bodyBytes: number;
+    redirects: number;
+    blocked: boolean; // NOTE(review): likely set when the SSRF hostname guard refused the URL — confirm.
+    error?: string;
+}
+declare function parseIpv4(value: string): number[] | null; // NOTE(review): null presumably means `value` is not an IPv4 literal; the array layout is not visible here.
+declare function parseIpv6(value: string): number[] | null; // NOTE(review): null presumably means `value` is not an IPv6 literal; the array layout is not visible here.
+declare function isBlockedHostname(hostname: string): boolean; // Hostname blocklist check for the SSRF guard; presumably `true` means the host must not be fetched — confirm.
+/**
+ * Fetch a URL with a spoofed user-agent, SSRF-safe manual redirects, timeout, and a size bound.
+ *
+ * @param targetUrl URL to fetch.
+ * @param options User-agent plus optional timeout, size bound and body-read switch.
+ * @returns A promise for the `FetchResult` describing the request.
+ */
+declare function safeFetch(targetUrl: string, options: FetchOptions): Promise<FetchResult>;
+interface AuditOptions { // Optional settings accepted by `audit`.
+    timeoutMs?: number; // Timeout in milliseconds; the CLI forwards its `--timeout` value here.
+    /** Injected for tests; defaults to the real SSRF-safe fetch. */
+    fetchImpl?: typeof safeFetch;
+}
+interface AuditReport { // Result of `audit`; the CLI prints it through `renderText` or `renderJson`.
+    url: string;
+    finalUrl: string;
+    fetchedAt: string; // Timestamp supplied by the caller of `audit`; the CLI defaults it to `new Date().toISOString()`.
+    findings: AuditFinding[];
+    summary: {
+        pass: number;
+        warn: number;
+        error: number;
+        checks: number; // NOTE(review): how this relates to the pass/warn/error counts is not visible here — confirm.
+        passed: number; // NOTE(review): how this differs from `pass` is not visible here — confirm.
+        worst: 'pass' | 'warn' | 'error'; // The CLI exits with code 1 when this is 'error', otherwise 0.
+    };
+}
+/**
+ * Audit a live URL. Fetches the page as a browser control and as each AI
+ * crawler user-agent, plus robots.txt and llms.txt, then runs the analyzers.
+ * `fetchedAt` is injected by the caller so the core stays deterministic/testable.
+ *
+ * @param targetUrl URL to audit.
+ * @param options Audit options plus the required `fetchedAt` timestamp string.
+ * @returns A promise for the assembled `AuditReport`.
+ */
+declare function audit(targetUrl: string, options: AuditOptions & {
+    fetchedAt: string;
+}): Promise<AuditReport>;
+interface RunContext { // Host hooks passed to `run`.
+    version: string; // Printed, followed by a newline, for `--version`.
+    now?: () => string; // Supplies the report's `fetchedAt`; defaults to the current time as an ISO string.
+    stdout?: (text: string) => void; // Output sink; defaults to `process.stdout.write`.
+    stderr?: (text: string) => void; // Error sink; defaults to `process.stderr.write`.
+}
+declare function run(argv: string[], ctx: RunContext): Promise<number>; // CLI entry point; resolves to the exit code (0 = no error-level findings, 1 = at least one error-level finding, 2 = usage error).
+declare function renderText(report: AuditReport): string; // Text rendering of a report; the CLI writes it as-is.
+declare function renderJson(report: AuditReport): string; // JSON rendering of a report; the CLI appends a trailing newline when `--json` is set.
+/**
+ * How the real bot behind a crawler user-agent proves its identity. The audit
+ * fetches as each crawler user-agent plus the control browser; recording
+ * `verification` is what lets the crawler-access classifier avoid false
+ * positives: a 403 for a spoofed UA whose real bot verifies by IP range may be
+ * intentional, not a block bug.
+ */
+type Verification = 'ip-range' | 'reverse-dns' | 'ua-only';
+interface CrawlerAgent { // One user-agent the audit fetches as: an AI crawler or the browser control.
+    /** Short id used in findings and --json output. */
+    id: string;
+    /** The User-Agent string sent. */
+    ua: string;
+    vendor: string;
+    /** Whether this agent is the browser control rather than a crawler. */
+    control?: boolean;
+    verification?: Verification; // How the real bot proves identity (see `Verification`).
+    intent?: 'training' | 'search' | 'user-fetch'; // NOTE(review): presumably what the crawler fetches content for — confirm.
+    docsUrl?: string; // NOTE(review): presumably a link to the vendor's crawler documentation — confirm.
+}
+declare const BROWSER_CONTROL: CrawlerAgent; // The browser control agent that crawler responses are diffed against.
+declare const CRAWLER_AGENTS: CrawlerAgent[]; // The crawler user-agents the audit fetches as.
+declare const ALL_AGENTS: CrawlerAgent[]; // NOTE(review): presumably the control plus every crawler agent — confirm.
+interface AgentProbe { // Pairs an agent with a fetch result; `analyzeCrawlerAccess` takes an array of these.
+    agent: CrawlerAgent;
+    result: FetchResult; // Presumably the response obtained when fetching as `agent` — confirm.
+}
+/**
+ * Diffs each crawler user-agent's response against the browser control and
+ * classifies the difference. Every finding states the evidence and never
+ * asserts "your site blocks AI" from a user-agent-only 403 (see plan §6):
+ * a 403 for a spoofed UA can mean an intentional IP-verification policy, not a
+ * block bug, so those are surfaced as warnings, not errors.
+ *
+ * @param control Fetch result for the browser control.
+ * @param probes One agent/result pair per crawler user-agent probed.
+ * @returns The findings produced by the comparison.
+ */
+declare function analyzeCrawlerAccess(control: FetchResult, probes: AgentProbe[]): AuditFinding[];
+/**
+ * Flags pages whose raw HTML has no meaningful content — invisible to crawlers that do not run JS.
+ *
+ * @param control Fetch result for the browser control; presumably its body is the raw HTML inspected (confirm).
+ * @returns The findings produced by the check.
+ */
+declare function analyzeJsDependence(control: FetchResult): AuditFinding[];
+/**
+ * robots.txt intent: are the crawlers we expect to allow actually blocked?
+ *
+ * @param robots Fetch result — presumably for the site's robots.txt (confirm against `audit`).
+ * @returns The findings produced by the check.
+ */
+declare function analyzeRobots(robots: FetchResult): AuditFinding[];
+/**
+ * Machine-readability surface on the homepage HTML plus a fetched llms.txt.
+ *
+ * @param control Fetch result for the browser control (the homepage HTML).
+ * @param llms Fetch result for llms.txt.
+ * @returns The findings produced by the checks.
+ */
+declare function analyzeMachineReadable(control: FetchResult, llms: FetchResult): AuditFinding[];
+// Public declarations: type-only exports first, then runtime values.
+export {
+    type AgentProbe,
+    type AuditFinding,
+    type AuditLevel,
+    type AuditOptions,
+    type AuditReport,
+    type CrawlerAgent,
+    type FetchOptions,
+    type FetchResult,
+    type RunContext,
+    ALL_AGENTS,
+    BROWSER_CONTROL,
+    CRAWLER_AGENTS,
+    analyzeCrawlerAccess,
+    analyzeJsDependence,
+    analyzeMachineReadable,
+    analyzeRobots,
+    audit,
+    countByLevel,
+    isBlockedHostname,
+    parseIpv4,
+    parseIpv6,
+    renderJson,
+    renderText,
+    run,
+    safeFetch,
+    worstLevel
+};

package/dist/index.js ADDED Viewed

@@ -0,0 +1,38 @@
+import {
+  ALL_AGENTS,
+  BROWSER_CONTROL,
+  CRAWLER_AGENTS,
+  analyzeCrawlerAccess,
+  analyzeJsDependence,
+  analyzeMachineReadable,
+  analyzeRobots,
+  audit,
+  countByLevel,
+  isBlockedHostname,
+  parseIpv4,
+  parseIpv6,
+  renderJson,
+  renderText,
+  run,
+  safeFetch,
+  worstLevel
+} from "./chunk-PNE6FBX2.js";
+export {
+  ALL_AGENTS,
+  BROWSER_CONTROL,
+  CRAWLER_AGENTS,
+  analyzeCrawlerAccess,
+  analyzeJsDependence,
+  analyzeMachineReadable,
+  analyzeRobots,
+  audit,
+  countByLevel,
+  isBlockedHostname,
+  parseIpv4,
+  parseIpv6,
+  renderJson,
+  renderText,
+  run,
+  safeFetch,
+  worstLevel
+};

package/package.json ADDED Viewed

@@ -0,0 +1,68 @@
+{
+  "name": "@agentmarkup/audit",
+  "version": "0.1.0",
+  "description": "Audit a live URL the way AI crawlers see it: fetch as GPTBot, ClaudeBot, PerplexityBot and more, diff against a browser to catch accidental CDN blocks, plus llms.txt, JSON-LD, robots.txt intent, Content-Signal, and JS-dependence checks",
+  "type": "module",
+  "license": "MIT",
+  "author": "Sebastian Cochinescu <hello@animafelix.com> (https://animafelix.com)",
+  "homepage": "https://agentmarkup.dev",
+  "bugs": {
+    "url": "https://github.com/agentmarkup/agentmarkup/issues"
+  },
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/agentmarkup/agentmarkup",
+    "directory": "packages/audit"
+  },
+  "keywords": [
+    "ai-crawler",
+    "gptbot",
+    "claudebot",
+    "perplexitybot",
+    "llms-txt",
+    "robots-txt",
+    "content-signal",
+    "json-ld",
+    "geo",
+    "aeo",
+    "seo",
+    "audit",
+    "machine-readable",
+    "ci"
+  ],
+  "bin": {
+    "agentmarkup-audit": "./dist/bin.js"
+  },
+  "exports": {
+    ".": {
+      "types": "./dist/index.d.ts",
+      "import": "./dist/index.js"
+    }
+  },
+  "main": "./dist/index.js",
+  "types": "./dist/index.d.ts",
+  "files": [
+    "dist"
+  ],
+  "dependencies": {
+    "@agentmarkup/core": "0.5.2"
+  },
+  "devDependencies": {
+    "eslint": "^9.0.0",
+    "tsup": "^8.4.0",
+    "typescript": "^5.7.0",
+    "vitest": "^3.0.0"
+  },
+  "scripts": {
+    "prebuild": "pnpm -C ../core build",
+    "build": "tsup",
+    "predev": "pnpm -C ../core build",
+    "dev": "tsup --watch",
+    "pretest": "pnpm -C ../core build",
+    "test": "vitest run",
+    "test:watch": "vitest",
+    "lint": "eslint src/ test/",
+    "pretypecheck": "pnpm -C ../core build",
+    "typecheck": "tsc --noEmit"
+  }
+}