@braedenbuilds/crawl-sim 1.3.0 → 1.4.0

@@ -9,7 +9,7 @@
  "name": "crawl-sim",
  "source": "./",
  "description": "Multi-bot web crawler simulator — audit how Googlebot, GPTBot, ClaudeBot, and PerplexityBot see your site",
- "version": "1.3.0"
+ "version": "1.4.0"
  }
  ]
  }
@@ -1,6 +1,6 @@
  {
  "name": "crawl-sim",
- "version": "1.3.0",
+ "version": "1.4.0",
  "description": "Multi-bot web crawler simulator — audit how Googlebot, GPTBot, ClaudeBot, and PerplexityBot see your site",
  "author": {
  "name": "BraedenBDev",
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@braedenbuilds/crawl-sim",
- "version": "1.3.0",
+ "version": "1.4.0",
  "description": "Agent-native multi-bot web crawler simulator. See your site through the eyes of Googlebot, GPTBot, ClaudeBot, and PerplexityBot.",
  "bin": {
  "crawl-sim": "bin/install.js"
@@ -31,6 +31,9 @@ Keep status lines short, active, and specific to this URL. Never use the same se
  /crawl-sim <url> --bot gptbot # single bot
  /crawl-sim <url> --category structured-data # category deep dive
  /crawl-sim <url> --json # JSON output only (for CI)
+ /crawl-sim <url> --pdf # audit + PDF report to Desktop
+ /crawl-sim <url> --compare <url2> # side-by-side comparison of two sites
+ /crawl-sim <url> --compare <url2> --pdf # comparison + PDF report
  ```

  ## Prerequisites — check once at the start
@@ -214,11 +217,17 @@ Then produce **prioritized findings** ranked by total point impact across bots:
  - **Framework detection.** Scan the HTML body for signals: `<meta name="next-head-count">` or `_next/static` → Next.js (Pages Router or App Router respectively), `<div id="__nuxt">` → Nuxt, `<div id="app">` with thin content → SPA (Vue/React CSR), `<!--$-->` placeholder tags → React 18 Suspense. Use these to tailor fix recommendations.
  - **No speculation beyond the data.** If server HTML has 0 `<a>` tags inside a component, say "component not present in server HTML" — not "JavaScript hydration failed" unless the diff-render data proves it.
  - **Known extractor limitations.** The bash meta extractor sometimes reports `h1Text: null` even when `h1.count: 1` — that happens when the H1 contains nested tags (`<br>`, `<span>`, `<svg>`). The count is still correct. Don't flag this as a site bug — it's tracked in GitHub issue #4.
+ - **robots.txt enforceability.** Each bot in the score output carries `robotsTxtEnforceability` — one of `enforced`, `advisory_only`, or `stealth_risk`. When robots.txt blocks a bot:
+   - `enforced`: The block works. State it directly: *"GPTBot is blocked by robots.txt."*
+   - `advisory_only`: The block is unenforceable via robots.txt alone. Flag it: *"robots.txt blocks ChatGPT-User, but OpenAI has stated user-initiated fetches may not respect robots.txt. Network-level enforcement (e.g., Cloudflare WAF rules) is needed to actually block this bot."*
+   - `stealth_risk`: The bot claims compliance but has been caught bypassing it. Note: *"PerplexityBot is blocked by robots.txt, but Cloudflare has documented instances of Perplexity using undeclared crawlers with generic user-agent strings to access blocked sites."*
+ - **Cloudflare context.** Since July 2025, Cloudflare blocks all AI training crawlers (GPTBot, ClaudeBot, CCBot, etc.) **by default** for new domains (~20% of the web). If a site uses Cloudflare, robots.txt may be redundant for training bots — the CDN blocks them at the network level before they reach the origin. The score output's `cloudflareCategory` field (`ai_crawler`, `ai_search`, `ai_assistant`) indicates which tier each bot falls into.
  - **Per-bot quirks to surface:**
-   - Googlebot: renders JS. If `diff-render.sh` was skipped, note that comparison was unavailable and recommend installing Playwright.
-   - GPTBot / ClaudeBot / PerplexityBot: `rendersJavaScript: false` at observed confidence — flag any server-vs-rendered delta as invisible-to-AI content.
-   - `chatgpt-user` / `perplexity-user`: officially ignore robots.txt for user-initiated fetches. Blocking these via robots.txt has no effect.
-   - PerplexityBot: third-party reports of stealth/undeclared crawling. Mention if relevant, don't assert.
+   - `chatgpt-user` / `perplexity-user`: `robotsTxtEnforceability: advisory_only`. Blocking these via robots.txt alone has no effect — always flag this in findings.
+   - `claude-user`: Anthropic is notably stricter — it commits to respecting robots.txt even for user-initiated fetches (`robotsTxtEnforceability: enforced`).
+   - PerplexityBot: `robotsTxtEnforceability: stealth_risk` — third-party and Cloudflare reports of stealth/undeclared crawling. Mention if relevant, don't assert.

  After findings, write a **Summary** paragraph: what's working well, biggest wins, confidence caveats. Keep it short — two to three sentences.

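As a minimal sketch of how an agent or wrapper script might turn the enforceability tiers above into findings language — the `enforceability_note` helper and its exact wording are illustrative, not part of the package:

```bash
#!/usr/bin/env bash
# Hypothetical helper: map a robotsTxtEnforceability tier (as emitted in the
# score output) to a one-line finding. Tier names mirror the score output;
# the finding text is illustrative only.
enforceability_note() {
  local bot="$1" tier="$2"
  case "$tier" in
    enforced)      printf '%s is blocked by robots.txt.\n' "$bot" ;;
    advisory_only) printf '%s: robots.txt is advisory only; add network-level enforcement.\n' "$bot" ;;
    stealth_risk)  printf '%s: blocked on paper, but stealth crawling has been reported.\n' "$bot" ;;
    *)             printf '%s: enforceability unknown.\n' "$bot" ;;
  esac
}
```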
@@ -233,6 +242,37 @@ After findings, write a **Summary** paragraph: what's working well, biggest wins
  - If `jq` or `curl` is missing, exit with install instructions.
  - If `diff-render.sh` skips, the narrative must note that per-bot differentiation is reduced.

+ ## PDF Report (`--pdf`)
+
+ When the user passes `--pdf`, after the narrative output, generate a PDF report:
+
+ ```bash
+ "$SKILL_DIR/scripts/generate-report-html.sh" ./crawl-sim-report.json "$RUN_DIR/report.html"
+ "$SKILL_DIR/scripts/html-to-pdf.sh" "$RUN_DIR/report.html" "$HOME/Desktop/crawl-sim-audit.pdf"
+ ```
+
+ Tell the user where the PDF was saved. If `html-to-pdf.sh` fails (no Chrome or Playwright), the HTML file is still available — tell the user and suggest installing a renderer.
+
+ ## Comparative Audit (`--compare <url2>`)
+
+ When the user passes `--compare <url2>`, run two full audits and produce a side-by-side report:
+
+ 1. Run the complete 5-stage pipeline for `<url>` — save the report as `./crawl-sim-report-a.json`
+ 2. Run the complete 5-stage pipeline for `<url2>` — save the report as `./crawl-sim-report-b.json`
+ 3. Generate the comparison:
+
+ ```bash
+ "$SKILL_DIR/scripts/generate-compare-html.sh" ./crawl-sim-report-a.json ./crawl-sim-report-b.json "$RUN_DIR/compare.html"
+ ```
+
+ 4. If `--pdf` was also passed:
+
+ ```bash
+ "$SKILL_DIR/scripts/html-to-pdf.sh" "$RUN_DIR/compare.html" "$HOME/Desktop/crawl-sim-compare.pdf"
+ ```
+
+ The narrative for a comparison should lead with: which site wins overall, by how many points, and in which categories. Then highlight the biggest deltas — what Site A does better, what Site B does better, and what both share.
+
  ## Cleanup

  `$RUN_DIR` is small and informative — leave it in place and print the path. The user may want to inspect the raw JSON for any of the 23+ intermediate files.
@@ -4,7 +4,7 @@
  "vendor": "OpenAI",
  "userAgent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot",
  "robotsTxtToken": "ChatGPT-User",
- "purpose": "user-initiated",
+ "purpose": "user_retrieval",
  "rendersJavaScript": "unknown",
  "respectsRobotsTxt": "partial",
  "crawlDelaySupported": "unknown",
@@ -23,6 +23,11 @@
  }
  },
  "lastVerified": "2026-04-11",
- "relatedBots": ["gptbot", "oai-searchbot"],
- "notes": "Not used for automatic crawling. Not used to determine search appearance. User-initiated fetches in ChatGPT and Custom GPTs."
+ "relatedBots": [
+ "gptbot",
+ "oai-searchbot"
+ ],
+ "notes": "Not used for automatic crawling. Not used to determine search appearance. User-initiated fetches in ChatGPT and Custom GPTs.",
+ "cloudflareCategory": "ai_assistant",
+ "robotsTxtEnforceability": "advisory_only"
  }
@@ -23,6 +23,11 @@
  }
  },
  "lastVerified": "2026-04-11",
- "relatedBots": ["claudebot", "claude-user"],
- "notes": "Navigates the web to improve search result quality. Focused on search indexing, not training."
+ "relatedBots": [
+ "claudebot",
+ "claude-user"
+ ],
+ "notes": "Navigates the web to improve search result quality. Focused on search indexing, not training.",
+ "cloudflareCategory": "ai_search",
+ "robotsTxtEnforceability": "enforced"
  }
@@ -4,7 +4,7 @@
  "vendor": "Anthropic",
  "userAgent": "Claude-User",
  "robotsTxtToken": "Claude-User",
- "purpose": "user-initiated",
+ "purpose": "user_retrieval",
  "rendersJavaScript": "unknown",
  "respectsRobotsTxt": true,
  "crawlDelaySupported": "unknown",
@@ -23,6 +23,11 @@
  }
  },
  "lastVerified": "2026-04-11",
- "relatedBots": ["claudebot", "claude-searchbot"],
- "notes": "When individuals ask questions to Claude, it may access websites. Blocking prevents Claude from retrieving content in response to user queries."
+ "relatedBots": [
+ "claudebot",
+ "claude-searchbot"
+ ],
+ "notes": "When individuals ask questions to Claude, it may access websites. Blocking prevents Claude from retrieving content in response to user queries.",
+ "cloudflareCategory": "ai_assistant",
+ "robotsTxtEnforceability": "enforced"
  }
@@ -23,6 +23,11 @@
  }
  },
  "lastVerified": "2026-04-11",
- "relatedBots": ["claude-user", "claude-searchbot"],
- "notes": "Collects web content that could potentially contribute to AI model training. Crawl-delay explicitly supported (non-standard). Blocking IP addresses will not reliably work."
+ "relatedBots": [
+ "claude-user",
+ "claude-searchbot"
+ ],
+ "notes": "Collects web content that could potentially contribute to AI model training. Crawl-delay explicitly supported (non-standard). Blocking IP addresses will not reliably work.",
+ "cloudflareCategory": "ai_crawler",
+ "robotsTxtEnforceability": "enforced"
  }
@@ -4,7 +4,7 @@
  "vendor": "Google",
  "userAgent": "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/W.X.Y.Z Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
  "robotsTxtToken": "Googlebot",
- "purpose": "search-indexing",
+ "purpose": "search",
  "rendersJavaScript": true,
  "respectsRobotsTxt": true,
  "crawlDelaySupported": false,
@@ -24,5 +24,7 @@
  },
  "lastVerified": "2026-04-11",
  "relatedBots": [],
- "notes": "Two-phase: initial fetch (HTML) then queued render (headless Chrome via WRS). Evergreen Chromium. Stateless sessions. ~5s default timeout. Mobile-first indexing."
+ "notes": "Two-phase: initial fetch (HTML) then queued render (headless Chrome via WRS). Evergreen Chromium. Stateless sessions. ~5s default timeout. Mobile-first indexing.",
+ "cloudflareCategory": "search_engine",
+ "robotsTxtEnforceability": "enforced"
  }
@@ -23,6 +23,11 @@
  }
  },
  "lastVerified": "2026-04-11",
- "relatedBots": ["oai-searchbot", "chatgpt-user"],
- "notes": "Disallowing GPTBot indicates a site's content should not be used in training generative AI foundation models."
+ "relatedBots": [
+ "oai-searchbot",
+ "chatgpt-user"
+ ],
+ "notes": "Disallowing GPTBot indicates a site's content should not be used in training generative AI foundation models.",
+ "cloudflareCategory": "ai_crawler",
+ "robotsTxtEnforceability": "enforced"
  }
@@ -23,6 +23,11 @@
  }
  },
  "lastVerified": "2026-04-11",
- "relatedBots": ["gptbot", "chatgpt-user"],
- "notes": "Sites opted out of OAI-SearchBot will not be shown in ChatGPT search answers, though can still appear as navigational links."
+ "relatedBots": [
+ "gptbot",
+ "chatgpt-user"
+ ],
+ "notes": "Sites opted out of OAI-SearchBot will not be shown in ChatGPT search answers, though can still appear as navigational links.",
+ "cloudflareCategory": "ai_search",
+ "robotsTxtEnforceability": "enforced"
  }
@@ -4,7 +4,7 @@
  "vendor": "Perplexity",
  "userAgent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Perplexity-User/1.0; +https://perplexity.ai/perplexity-user)",
  "robotsTxtToken": "Perplexity-User",
- "purpose": "user-initiated",
+ "purpose": "user_retrieval",
  "rendersJavaScript": "unknown",
  "respectsRobotsTxt": false,
  "crawlDelaySupported": "unknown",
@@ -23,6 +23,10 @@
  }
  },
  "lastVerified": "2026-04-11",
- "relatedBots": ["perplexitybot"],
- "notes": "Supports user actions within Perplexity. Not used for web crawling or AI training. Generally ignores robots.txt since fetches are user-initiated."
+ "relatedBots": [
+ "perplexitybot"
+ ],
+ "notes": "Supports user actions within Perplexity. Not used for web crawling or AI training. Generally ignores robots.txt since fetches are user-initiated.",
+ "cloudflareCategory": "ai_assistant",
+ "robotsTxtEnforceability": "advisory_only"
  }
@@ -4,7 +4,7 @@
  "vendor": "Perplexity",
  "userAgent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; PerplexityBot/1.0; +https://perplexity.ai/perplexitybot)",
  "robotsTxtToken": "PerplexityBot",
- "purpose": "search-indexing",
+ "purpose": "search",
  "rendersJavaScript": false,
  "respectsRobotsTxt": true,
  "crawlDelaySupported": "unknown",
@@ -23,6 +23,10 @@
  }
  },
  "lastVerified": "2026-04-11",
- "relatedBots": ["perplexity-user"],
- "notes": "Designed to surface and link websites in search results on Perplexity. NOT used to crawl content for AI foundation models. Changes may take up to 24 hours to reflect."
+ "relatedBots": [
+ "perplexity-user"
+ ],
+ "notes": "Designed to surface and link websites in search results on Perplexity. NOT used to crawl content for AI foundation models. Changes may take up to 24 hours to reflect.",
+ "cloudflareCategory": "ai_search",
+ "robotsTxtEnforceability": "stealth_risk"
  }
@@ -73,9 +73,14 @@ page_type_for_url() {

  # Fetch a URL to a local file and return the HTTP status code on stdout.
  # Usage: status=$(fetch_to_file <url> <output-file> [timeout-seconds])
+ # Retries once on transient failure (same SSL/DNS flake that caused #11).
  fetch_to_file() {
  local url="$1"
  local out="$2"
  local timeout="${3:-15}"
- curl -sS -L -o "$out" -w '%{http_code}' --max-time "$timeout" "$url" 2>/dev/null || echo "000"
+ local status
+ status=$(curl -sS -L -o "$out" -w '%{http_code}' --max-time "$timeout" "$url" 2>/dev/null) && echo "$status" && return
+ # Retry once on transient failure
+ status=$(curl -sS -L -o "$out" -w '%{http_code}' --max-time "$timeout" "$url" 2>/dev/null) && echo "$status" && return
+ echo "000"
  }
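The retry-once shape added above can be isolated from curl. A minimal sketch, under the assumption that the wrapped command prints its result on success (`run_twice` is a hypothetical name, not in the package):

```bash
#!/usr/bin/env bash
# Hypothetical illustration of the retry-once pattern used by fetch_to_file:
# try the command, retry a single time on failure, fall back to "000".
run_twice() {
  local out
  out=$("$@") && printf '%s\n' "$out" && return
  out=$("$@") && printf '%s\n' "$out" && return
  echo "000"
}
```

A command that fails once but succeeds on the second attempt still yields its real output; only a double failure produces the sentinel.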
@@ -312,14 +312,16 @@ for bot_id in $BOTS; do
  continue
  fi

- # Batch-read fields from fetch file (1 jq call instead of 4)
- read -r STATUS TOTAL_TIME SERVER_WORD_COUNT RENDERS_JS <<< \
+ # Batch-read fields from fetch file
+ read -r STATUS TOTAL_TIME SERVER_WORD_COUNT RENDERS_JS PURPOSE_TIER ROBOTS_ENFORCE <<< \
  "$(jq -r '[
  (.status // 0),
  (.timing.total // 0),
  (.wordCount // 0),
- (.bot.rendersJavaScript | if . == null then "unknown" else tostring end)
- ] | @tsv' "$FETCH" 2>/dev/null || echo "0 0 0 unknown")"
+ (.bot.rendersJavaScript | if . == null then "unknown" else tostring end),
+ (.bot.purpose // "unknown"),
+ (.bot.robotsTxtEnforceability // "unknown")
+ ] | @tsv' "$FETCH" 2>/dev/null || echo "0 0 0 unknown unknown unknown")"

  ROBOTS_ALLOWED=$(jq -r '.allowed // false | tostring' "$ROBOTS" 2>/dev/null || echo "false")

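The `read -r … <<< "$(jq … || echo …)"` shape above only works if the fallback string supplies exactly one whitespace-separated field per variable. A sketch of that contract with the jq call stubbed out — `read_fields` and its field values are illustrative, not part of the package:

```bash
#!/usr/bin/env bash
# Hypothetical stand-in for the batch-read pattern: a producer that either
# emits tab-separated fields or fails, with || echo supplying defaults so
# read -r always populates every variable.
read_fields() {
  local mode="$1" STATUS TIME WORDS RENDERS
  read -r STATUS TIME WORDS RENDERS <<< \
    "$( [ "$mode" = ok ] && printf '200\t0.41\t523\ttrue' || echo "0 0 0 unknown" )"
  printf '%s %s %s %s\n' "$STATUS" "$TIME" "$WORDS" "$RENDERS"
}
```

Default IFS splits on both tabs (from `@tsv`) and spaces (from the fallback), so the same `read` handles either path.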
@@ -547,13 +549,13 @@

  # --- Category 5: AI Readiness (0-100) ---
  AI=0
- # Batch-read llmstxt fields (1 jq call instead of 4)
+ # Batch-read llmstxt fields — use top-level exists (M1) which covers both variants
  read -r LLMS_EXISTS LLMS_HAS_TITLE LLMS_HAS_DESC LLMS_URLS <<< \
  "$(jq -r '[
- (.llmsTxt.exists // false | tostring),
- (.llmsTxt.hasTitle // false | tostring),
- (.llmsTxt.hasDescription // false | tostring),
- (.llmsTxt.urlCount // 0)
+ (.exists // (.llmsTxt.exists or .llmsFullTxt.exists) | tostring),
+ ((.llmsTxt.hasTitle // .llmsFullTxt.hasTitle // false) | tostring),
+ ((.llmsTxt.hasDescription // .llmsFullTxt.hasDescription // false) | tostring),
+ ((.llmsTxt.urlCount // 0) + (.llmsFullTxt.urlCount // 0))
  ] | @tsv' "$LLMSTXT_FILE" 2>/dev/null || echo "false false false 0")"

  if [ "$LLMS_EXISTS" = "true" ]; then
@@ -586,6 +588,8 @@
  --arg id "$bot_id" \
  --arg name "$BOT_NAME" \
  --arg rendersJs "$RENDERS_JS" \
+ --arg purpose "$PURPOSE_TIER" \
+ --arg robotsEnforce "$ROBOTS_ENFORCE" \
  --argjson score "$BOT_SCORE" \
  --arg grade "$BOT_GRADE" \
  --argjson acc "$ACC" \
@@ -605,6 +609,8 @@
  id: $id,
  name: $name,
  rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end),
+ purpose: $purpose,
+ robotsTxtEnforceability: $robotsEnforce,
  score: $score,
  grade: $grade,
  visibility: {
@@ -15,6 +15,8 @@ BOT_ID=$(jq -r '.id' "$PROFILE")
  BOT_NAME=$(jq -r '.name' "$PROFILE")
  UA=$(jq -r '.userAgent' "$PROFILE")
  RENDERS_JS=$(jq -r '.rendersJavaScript' "$PROFILE")
+ PURPOSE=$(jq -r '.purpose // "unknown"' "$PROFILE")
+ ROBOTS_ENFORCE=$(jq -r '.robotsTxtEnforceability // "unknown"' "$PROFILE")

  TMPDIR="${TMPDIR:-/tmp}"
  HEADERS_FILE=$(mktemp "$TMPDIR/crawlsim-headers.XXXXXX")
@@ -48,6 +50,8 @@ if [ "$CURL_EXIT" -ne 0 ]; then
  --arg botName "$BOT_NAME" \
  --arg ua "$UA" \
  --arg rendersJs "$RENDERS_JS" \
+ --arg purpose "$PURPOSE" \
+ --arg robotsEnforce "$ROBOTS_ENFORCE" \
  --arg error "$CURL_ERR" \
  --argjson exitCode "$CURL_EXIT" \
  '{
@@ -56,7 +60,9 @@
  id: $botId,
  name: $botName,
  userAgent: $ua,
- rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
+ rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end),
+ purpose: $purpose,
+ robotsTxtEnforceability: $robotsEnforce
  },
  fetchFailed: true,
  error: $error,
@@ -121,6 +127,8 @@ jq -n \
  --arg botName "$BOT_NAME" \
  --arg ua "$UA" \
  --arg rendersJs "$RENDERS_JS" \
+ --arg purpose "$PURPOSE" \
+ --arg robotsEnforce "$ROBOTS_ENFORCE" \
  --argjson status "$STATUS" \
  --argjson totalTime "$TOTAL_TIME" \
  --argjson ttfb "$TTFB" \
@@ -137,7 +145,9 @@
  id: $botId,
  name: $botName,
  userAgent: $ua,
- rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
+ rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end),
+ purpose: $purpose,
+ robotsTxtEnforceability: $robotsEnforce
  },
  status: $status,
  timing: { total: $totalTime, ttfb: $ttfb },
@@ -0,0 +1,158 @@
+ #!/usr/bin/env bash
+ set -eu
+
+ # generate-compare-html.sh — Generate a side-by-side comparison HTML from two crawl-sim reports
+ # Usage: generate-compare-html.sh <report-a.json> <report-b.json> [output.html]
+
+ REPORT_A="${1:?Usage: generate-compare-html.sh <report-a.json> <report-b.json> [output.html]}"
+ REPORT_B="${2:?Usage: generate-compare-html.sh <report-a.json> <report-b.json> [output.html]}"
+ OUTPUT="${3:-}"
+
+ for f in "$REPORT_A" "$REPORT_B"; do
+ [ -f "$f" ] || { echo "Error: report not found: $f" >&2; exit 1; }
+ done
+
+ # Extract key data from both reports
+ URL_A=$(jq -r '.url' "$REPORT_A")
+ URL_B=$(jq -r '.url' "$REPORT_B")
+ SCORE_A=$(jq -r '.overall.score' "$REPORT_A")
+ SCORE_B=$(jq -r '.overall.score' "$REPORT_B")
+ GRADE_A=$(jq -r '.overall.grade' "$REPORT_A")
+ GRADE_B=$(jq -r '.overall.grade' "$REPORT_B")
+ PARITY_A=$(jq -r '.parity.score' "$REPORT_A")
+ PARITY_B=$(jq -r '.parity.score' "$REPORT_B")
+
+ # Build category comparison rows
+ CAT_COMPARE=$(jq -r --slurpfile b "$REPORT_B" '
+ .categories | to_entries[] |
+ . as $cat |
+ ($b[0].categories[$cat.key]) as $bcat |
+ (if $cat.value.score > $bcat.score then "winner-a"
+ elif $cat.value.score < $bcat.score then "winner-b"
+ else "tie" end) as $cls |
+ ($cat.value.score - $bcat.score) as $delta |
+ "<tr class=\"\($cls)\"><td>\($cat.key)</td>" +
+ "<td>\($cat.value.score) (\($cat.value.grade))</td>" +
+ "<td>\($bcat.score) (\($bcat.grade))</td>" +
+ "<td>\(if $delta > 0 then "+\($delta)" elif $delta < 0 then "\($delta)" else "=" end)</td></tr>"
+ ' "$REPORT_A")
+
+ # Build per-bot comparison (using the 4 main bots)
+ BOT_COMPARE=$(jq -r --slurpfile b "$REPORT_B" '
+ . as $a | ["googlebot", "gptbot", "claudebot", "perplexitybot"] | .[] |
+ . as $id |
+ ($a.bots[$id] // {score: 0, grade: "N/A"}) as $ba |
+ ($b[0].bots[$id] // {score: 0, grade: "N/A"}) as $bb |
+ ($ba.score - $bb.score) as $delta |
+ "<tr><td>\($id)</td>" +
+ "<td>\($ba.score) (\($ba.grade))</td>" +
+ "<td>\($bb.score) (\($bb.grade))</td>" +
+ "<td>\(if $delta > 0 then "+\($delta)" elif $delta < 0 then "\($delta)" else "=" end)</td></tr>"
+ ' "$REPORT_A")
+
+ # Determine overall winner
+ if [ "$SCORE_A" -gt "$SCORE_B" ]; then
+ WINNER="Site A leads by $((SCORE_A - SCORE_B)) points"
+ elif [ "$SCORE_B" -gt "$SCORE_A" ]; then
+ WINNER="Site B leads by $((SCORE_B - SCORE_A)) points"
+ else
+ WINNER="Both sites tied at ${SCORE_A}/100"
+ fi
+
+ # Count category wins
+ WINS_A=$(jq --slurpfile b "$REPORT_B" '
+ [.categories | to_entries[] | select(.value.score > ($b[0].categories[.key].score))] | length
+ ' "$REPORT_A")
+ WINS_B=$(jq --slurpfile b "$REPORT_B" '
+ [.categories | to_entries[] | select(.value.score < ($b[0].categories[.key].score))] | length
+ ' "$REPORT_A")
+
+ HTML=$(cat <<HTMLEOF
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+ <meta charset="utf-8">
+ <title>crawl-sim Comparison</title>
+ <style>
+ @page { size: A4 landscape; margin: 15mm; }
+ * { box-sizing: border-box; margin: 0; padding: 0; }
+ body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; color: #1a1a1a; line-height: 1.5; padding: 40px; max-width: 1100px; margin: 0 auto; }
+ h1 { font-size: 24px; margin-bottom: 4px; }
+ .subtitle { color: #666; font-size: 13px; margin-bottom: 24px; }
+ .vs-hero { display: grid; grid-template-columns: 1fr auto 1fr; gap: 20px; align-items: center; margin-bottom: 32px; }
+ .site-card { background: #f8f9fa; border-radius: 12px; padding: 24px; text-align: center; }
+ .site-card.winner { background: #e8f5e9; border: 2px solid #27ae60; }
+ .site-score { font-size: 56px; font-weight: 800; line-height: 1; }
+ .site-grade { font-size: 36px; font-weight: 700; color: #2d7d46; }
+ .site-url { font-size: 12px; color: #666; word-break: break-all; margin-top: 8px; }
+ .vs { font-size: 32px; font-weight: 800; color: #999; }
+ .verdict { text-align: center; font-size: 16px; font-weight: 600; margin-bottom: 24px; padding: 12px; background: #f0f0f0; border-radius: 8px; }
+ table { width: 100%; border-collapse: collapse; margin-bottom: 24px; font-size: 13px; }
+ th { background: #1a1a1a; color: white; padding: 8px 12px; text-align: left; }
+ td { padding: 8px 12px; border-bottom: 1px solid #e0e0e0; }
+ tr:nth-child(even) { background: #f8f9fa; }
+ .winner-a td:nth-child(2) { color: #27ae60; font-weight: 600; }
+ .winner-b td:nth-child(3) { color: #27ae60; font-weight: 600; }
+ .winner-a td:last-child { color: #27ae60; }
+ .winner-b td:last-child { color: #c0392b; }
+ h2 { font-size: 18px; margin: 24px 0 12px; border-bottom: 2px solid #1a1a1a; padding-bottom: 4px; }
+ .footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #e0e0e0; font-size: 11px; color: #999; }
+ @media print { body { padding: 0; } }
+ </style>
+ </head>
+ <body>
+
+ <h1>crawl-sim — Comparative Audit</h1>
+ <div class="subtitle">Generated $(date -u +"%Y-%m-%d %H:%M UTC")</div>
+
+ <div class="vs-hero">
+ <div class="site-card$([ "$SCORE_A" -ge "$SCORE_B" ] && echo ' winner' || echo '')">
+ <div style="font-size:12px;font-weight:600;color:#666;margin-bottom:8px">SITE A</div>
+ <div class="site-score">${SCORE_A}</div>
+ <div class="site-grade">${GRADE_A}</div>
+ <div class="site-url">${URL_A}</div>
+ </div>
+ <div class="vs">VS</div>
+ <div class="site-card$([ "$SCORE_B" -gt "$SCORE_A" ] && echo ' winner' || echo '')">
+ <div style="font-size:12px;font-weight:600;color:#666;margin-bottom:8px">SITE B</div>
+ <div class="site-score">${SCORE_B}</div>
+ <div class="site-grade">${GRADE_B}</div>
+ <div class="site-url">${URL_B}</div>
+ </div>
+ </div>
+
+ <div class="verdict">${WINNER} &middot; Site A wins ${WINS_A} categories, Site B wins ${WINS_B}</div>
+
+ <h2>Category Breakdown</h2>
+ <table>
+ <tr><th>Category</th><th>Site A</th><th>Site B</th><th>Delta</th></tr>
+ ${CAT_COMPARE}
+ <tr style="font-weight:600;border-top:2px solid #1a1a1a">
+ <td>Content Parity</td>
+ <td>${PARITY_A}</td>
+ <td>${PARITY_B}</td>
+ <td>$([ "$PARITY_A" -gt "$PARITY_B" ] 2>/dev/null && echo "+$((PARITY_A - PARITY_B))" || ([ "$PARITY_B" -gt "$PARITY_A" ] 2>/dev/null && echo "-$((PARITY_B - PARITY_A))" || echo "="))</td>
+ </tr>
+ </table>
+
+ <h2>Per-Bot Scores</h2>
+ <table>
+ <tr><th>Bot</th><th>Site A</th><th>Site B</th><th>Delta</th></tr>
+ ${BOT_COMPARE}
+ </table>
+
+ <div class="footer">
+ Generated by crawl-sim v1.4.0 &middot; <a href="https://github.com/BraedenBDev/crawl-sim">github.com/BraedenBDev/crawl-sim</a>
+ </div>
+
+ </body>
+ </html>
+ HTMLEOF
+ )
+
+ if [ -n "$OUTPUT" ]; then
+ printf '%s' "$HTML" > "$OUTPUT"
+ printf '[generate-compare-html] wrote %s\n' "$OUTPUT" >&2
+ else
+ printf '%s' "$HTML"
+ fi
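The winner-determination branch in this script reduces to an integer comparison of the two overall scores; extracted as a standalone sketch (the `verdict` function name is illustrative):

```bash
#!/usr/bin/env bash
# Mirrors the overall-winner logic of generate-compare-html.sh for two
# integer scores out of 100.
verdict() {
  local a="$1" b="$2"
  if [ "$a" -gt "$b" ]; then
    echo "Site A leads by $((a - b)) points"
  elif [ "$b" -gt "$a" ]; then
    echo "Site B leads by $((b - a)) points"
  else
    echo "Both sites tied at ${a}/100"
  fi
}
```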
@@ -0,0 +1,148 @@
1
+ #!/usr/bin/env bash
2
+ set -eu
3
+
4
+ # generate-report-html.sh — Generate a styled HTML audit report from crawl-sim-report.json
5
+ # Usage: generate-report-html.sh <report.json> [output.html]
6
+ # Output: HTML to stdout (or file if second arg given)
7
+
8
+ REPORT="${1:?Usage: generate-report-html.sh <report.json> [output.html]}"
9
+ OUTPUT="${2:-}"
10
+
11
+ if [ ! -f "$REPORT" ]; then
12
+ echo "Error: report not found: $REPORT" >&2
13
+ exit 1
14
+ fi
15
+
16
+ # Extract key data
17
+ URL=$(jq -r '.url' "$REPORT")
18
+ TIMESTAMP=$(jq -r '.timestamp' "$REPORT")
19
+ PAGE_TYPE=$(jq -r '.pageType' "$REPORT")
20
+ OVERALL_SCORE=$(jq -r '.overall.score' "$REPORT")
21
+ OVERALL_GRADE=$(jq -r '.overall.grade' "$REPORT")
22
+ PARITY_SCORE=$(jq -r '.parity.score' "$REPORT")
23
+ PARITY_GRADE=$(jq -r '.parity.grade' "$REPORT")
24
+ PARITY_INTERP=$(jq -r '.parity.interpretation' "$REPORT")
25
+
26
+ # Build per-bot table rows
27
+ BOT_ROWS=$(jq -r '
28
+ .bots | to_entries[] |
29
+ "<tr><td>\(.value.name)</td><td>\(.value.score)</td><td>\(.value.grade)</td>" +
30
+ "<td>\(.value.categories.accessibility.score)</td>" +
31
+ "<td>\(.value.categories.contentVisibility.score)</td>" +
32
+ "<td>\(.value.categories.structuredData.score)</td>" +
33
+ "<td>\(.value.categories.technicalSignals.score)</td>" +
34
+ "<td>\(.value.categories.aiReadiness.score)</td>" +
35
+ "<td>\(.value.purpose // "-")</td>" +
36
+ "<td class=\"enforce-\(.value.robotsTxtEnforceability // "unknown")\">\(.value.robotsTxtEnforceability // "-")</td></tr>"
37
+ ' "$REPORT")
38
+
39
+ # Build category averages
40
+ CAT_ROWS=$(jq -r '
41
+ .categories | to_entries[] |
42
+ "<tr><td>\(.key)</td><td>\(.value.score)</td><td>\(.value.grade)</td></tr>"
43
+ ' "$REPORT")
44
+
45
+ # Build warnings
46
+ WARNINGS_HTML=$(jq -r '
47
+ if (.warnings | length) > 0 then
48
+ (.warnings[] | "<div class=\"warning\"><strong>⚠ \(.code)</strong>: \(.message)</div>")
49
+ else
50
+ "<div class=\"ok\">No warnings.</div>"
51
+ end
52
+ ' "$REPORT")
53
+
54
+ # Build structured data details for first bot
55
+ SD_DETAILS=$(jq -r '
56
+ .bots | to_entries[0].value.categories.structuredData |
57
+ "<p><strong>Page type:</strong> \(.pageType)</p>" +
58
+ "<p><strong>Present:</strong> \(.present | join(", "))</p>" +
59
+ "<p><strong>Missing:</strong> \(if (.missing | length) > 0 then (.missing | join(", ")) else "none" end)</p>" +
60
+ "<p><strong>Violations:</strong> \(if (.violations | length) > 0 then (.violations | map("\(.kind): \(.schema // .field // "")") | join(", ")) else "none" end)</p>" +
61
+ "<p><strong>Notes:</strong> \(.notes)</p>"
62
+ ' "$REPORT")
+
+ # Generate HTML
+ HTML=$(cat <<HTMLEOF
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+ <meta charset="utf-8">
+ <title>crawl-sim Audit — ${URL}</title>
+ <style>
+ @page { size: A4; margin: 20mm; }
+ * { box-sizing: border-box; margin: 0; padding: 0; }
+ body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; color: #1a1a1a; line-height: 1.5; padding: 40px; max-width: 900px; margin: 0 auto; }
+ h1 { font-size: 28px; margin-bottom: 4px; }
+ .subtitle { color: #666; font-size: 14px; margin-bottom: 24px; }
+ .score-hero { display: flex; align-items: center; gap: 24px; background: #f8f9fa; border-radius: 12px; padding: 24px; margin-bottom: 24px; }
+ .score-big { font-size: 64px; font-weight: 800; line-height: 1; }
+ .grade-big { font-size: 48px; font-weight: 700; color: #2d7d46; }
+ .score-meta { font-size: 14px; color: #666; }
+ table { width: 100%; border-collapse: collapse; margin-bottom: 24px; font-size: 13px; }
+ th { background: #1a1a1a; color: white; padding: 8px 12px; text-align: left; font-weight: 600; }
+ td { padding: 8px 12px; border-bottom: 1px solid #e0e0e0; }
+ tr:nth-child(even) { background: #f8f9fa; }
+ .enforce-advisory_only { color: #c0392b; font-weight: 600; }
+ .enforce-stealth_risk { color: #e67e22; font-weight: 600; }
+ .enforce-enforced { color: #27ae60; }
+ h2 { font-size: 18px; margin: 32px 0 12px; border-bottom: 2px solid #1a1a1a; padding-bottom: 4px; }
+ .warning { background: #fff3cd; border-left: 4px solid #ffc107; padding: 12px 16px; margin-bottom: 8px; border-radius: 4px; font-size: 13px; }
+ .ok { color: #27ae60; font-size: 13px; }
+ .parity { display: flex; gap: 16px; align-items: center; background: #e8f5e9; border-radius: 8px; padding: 16px; margin-bottom: 24px; }
+ .parity.low { background: #ffebee; }
+ .footer { margin-top: 40px; padding-top: 16px; border-top: 1px solid #e0e0e0; font-size: 11px; color: #999; }
+ @media print { body { padding: 0; } .score-hero { break-inside: avoid; } table { break-inside: avoid; } }
+ </style>
+ </head>
+ <body>
+
+ <h1>crawl-sim — Bot Visibility Audit</h1>
+ <div class="subtitle">${URL} &middot; ${TIMESTAMP} &middot; Page type: ${PAGE_TYPE}</div>
+
+ <div class="score-hero">
+ <div>
+ <span class="score-big">${OVERALL_SCORE}</span><span style="font-size:24px;color:#666">/100</span>
+ </div>
+ <div>
+ <div class="grade-big">${OVERALL_GRADE}</div>
+ <div class="score-meta">Overall Score</div>
+ </div>
+ </div>
+
+ <div class="parity${PARITY_SCORE:+ }$([ "$PARITY_SCORE" -lt 50 ] 2>/dev/null && echo 'low' || echo '')">
+ <div><strong>Content Parity:</strong> ${PARITY_SCORE}/100 (${PARITY_GRADE})</div>
+ <div>${PARITY_INTERP}</div>
+ </div>
+
+ ${WARNINGS_HTML}
+
+ <h2>Per-Bot Scores</h2>
+ <table>
+ <tr><th>Bot</th><th>Score</th><th>Grade</th><th>Access</th><th>Content</th><th>Schema</th><th>Technical</th><th>AI</th><th>Purpose</th><th>robots.txt</th></tr>
+ ${BOT_ROWS}
+ </table>
+
+ <h2>Category Averages</h2>
+ <table>
+ <tr><th>Category</th><th>Score</th><th>Grade</th></tr>
+ ${CAT_ROWS}
+ </table>
+
+ <h2>Structured Data Details</h2>
+ ${SD_DETAILS}
+
+ <div class="footer">
+ Generated by crawl-sim v1.4.0 &middot; <a href="https://github.com/BraedenBDev/crawl-sim">github.com/BraedenBDev/crawl-sim</a>
+ </div>
+
+ </body>
+ </html>
+ HTMLEOF
+ )
+
+ if [ -n "$OUTPUT" ]; then
+ printf '%s' "$HTML" > "$OUTPUT"
+ printf '[generate-report-html] wrote %s\n' "$OUTPUT" >&2
+ else
+ printf '%s' "$HTML"
+ fi
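The report generator above builds the whole document in one command-substituted heredoc and only then decides whether to write it to a file or to stdout. A minimal standalone sketch of that pattern (the `URL` and `SCORE` values here are hypothetical, not package code):

```shell
#!/bin/sh
# Build an HTML fragment in a command-substituted heredoc, then emit it
# with printf. Variables inside the heredoc are expanded at capture time.
URL="https://example.com"   # hypothetical sample values
SCORE=87

HTML=$(cat <<HTMLEOF
<p>${URL} scored ${SCORE}/100</p>
HTMLEOF
)

# printf '%s' avoids echo's portability quirks with backslashes/flags.
printf '%s\n' "$HTML"
```

Capturing into a variable first is what lets the real script branch on `-n "$OUTPUT"` afterward without generating the document twice.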
@@ -0,0 +1,85 @@
+ #!/usr/bin/env bash
+ set -eu
+
+ # html-to-pdf.sh — Convert an HTML file to PDF using the best available renderer.
+ # Usage: html-to-pdf.sh <input.html> <output.pdf>
+ #
+ # Detection order:
+ # 1. Chrome/Chromium at known system paths
+ # 2. Playwright's bundled Chromium (npx playwright pdf)
+ # 3. Neither → exit 1 with instructions
+ #
+ # This script is intentionally renderer-agnostic. Callers don't need to know
+ # which engine is available — they just pass HTML in and get PDF out.
+
+ INPUT="${1:?Usage: html-to-pdf.sh <input.html> <output.pdf>}"
+ OUTPUT="${2:?Usage: html-to-pdf.sh <input.html> <output.pdf>}"
+
+ if [ ! -f "$INPUT" ]; then
+ echo "Error: input file not found: $INPUT" >&2
+ exit 1
+ fi
+
+ # Convert to a file:// URL for Chrome (it needs an absolute path)
+ case "$INPUT" in
+ /*) INPUT_URL="file://$INPUT" ;;
+ *) INPUT_URL="file://$(pwd)/$INPUT" ;;
+ esac
+
+ # --- Strategy 1: System Chrome/Chromium ---
+
+ find_chrome() {
+ # macOS
+ for path in \
+ "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" \
+ "/Applications/Chromium.app/Contents/MacOS/Chromium" \
+ "/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary" \
+ "/Applications/Brave Browser.app/Contents/MacOS/Brave Browser"; do
+ [ -x "$path" ] && echo "$path" && return 0
+ done
+ # Linux / WSL: command -v prints the resolved path on success
+ for cmd in google-chrome chromium-browser chromium google-chrome-stable; do
+ command -v "$cmd" 2>/dev/null && return 0
+ done
+ return 1
+ }
+
+ if CHROME=$(find_chrome); then
+ printf '[html-to-pdf] using Chrome: %s\n' "$CHROME" >&2
+ "$CHROME" \
+ --headless \
+ --disable-gpu \
+ --no-sandbox \
+ --print-to-pdf="$OUTPUT" \
+ --no-margins \
+ "$INPUT_URL" 2>/dev/null || true  # set -e: fall through to Playwright on failure
+ if [ -s "$OUTPUT" ]; then
+ printf '[html-to-pdf] wrote %s (%s bytes)\n' "$OUTPUT" "$(wc -c < "$OUTPUT" | tr -d ' ')" >&2
+ exit 0
+ fi
+ printf '[html-to-pdf] Chrome produced empty output, trying Playwright fallback\n' >&2
+ fi
+
+ # --- Strategy 2: Playwright's bundled Chromium ---
+
+ if command -v npx >/dev/null 2>&1; then
+ # Check if playwright is installed (don't auto-install)
+ if npx playwright --version >/dev/null 2>&1; then
+ printf '[html-to-pdf] using Playwright bundled Chromium\n' >&2
+ npx playwright pdf "$INPUT_URL" "$OUTPUT" 2>/dev/null || true  # set -e: fall through on failure
+ if [ -s "$OUTPUT" ]; then
+ printf '[html-to-pdf] wrote %s (%s bytes)\n' "$OUTPUT" "$(wc -c < "$OUTPUT" | tr -d ' ')" >&2
+ exit 0
+ fi
+ printf '[html-to-pdf] Playwright produced empty output\n' >&2
+ fi
+ fi
+
+ # --- No renderer available ---
+
+ echo "Error: no PDF renderer found." >&2
+ echo " Install one of:" >&2
+ echo " - Google Chrome (recommended — already handles print CSS)" >&2
+ echo " - Playwright: npx playwright install chromium" >&2
+ echo " The HTML report is still available at: $INPUT" >&2
+ exit 1
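The detection order in html-to-pdf.sh boils down to one reusable idea: walk a preference-ordered list of commands and take the first one that exists on `PATH`. A minimal sketch of that chain (`pick_first_available` and `no-such-renderer` are illustrative names, not part of the package):

```shell
#!/bin/sh
# Try each candidate command in order; print and use the first that exists.
pick_first_available() {
  for cmd in "$@"; do
    # command -v is silent-checked here; echo the name only on a hit.
    if command -v "$cmd" >/dev/null 2>&1; then
      echo "$cmd"
      return 0
    fi
  done
  return 1
}

# "no-such-renderer" is deliberately missing; "sh" exists on any POSIX system,
# so the chain should settle on it.
RENDERER=$(pick_first_available no-such-renderer sh) || RENDERER=none
echo "renderer: $RENDERER"
```

The `|| RENDERER=none` guard mirrors the script's final branch: when every strategy fails, fall through to an explicit "nothing available" path instead of dying mid-chain under `set -e`.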