@braedenbuilds/crawl-sim 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. package/.claude-plugin/marketplace.json +15 -0
  2. package/.claude-plugin/plugin.json +13 -0
  3. package/README.md +19 -6
  4. package/bin/install.js +6 -2
  5. package/package.json +5 -3
  6. package/{SKILL.md → skills/crawl-sim/SKILL.md} +14 -2
  7. package/{scripts → skills/crawl-sim/scripts}/compute-score.sh +144 -5
  8. package/{scripts → skills/crawl-sim/scripts}/extract-jsonld.sh +12 -0
  9. package/skills/crawl-sim/scripts/fetch-as-bot.sh +151 -0
  10. package/skills/crawl-sim/scripts/schema-fields.sh +25 -0
  11. package/scripts/fetch-as-bot.sh +0 -87
  12. /package/{profiles → skills/crawl-sim/profiles}/chatgpt-user.json +0 -0
  13. /package/{profiles → skills/crawl-sim/profiles}/claude-searchbot.json +0 -0
  14. /package/{profiles → skills/crawl-sim/profiles}/claude-user.json +0 -0
  15. /package/{profiles → skills/crawl-sim/profiles}/claudebot.json +0 -0
  16. /package/{profiles → skills/crawl-sim/profiles}/googlebot.json +0 -0
  17. /package/{profiles → skills/crawl-sim/profiles}/gptbot.json +0 -0
  18. /package/{profiles → skills/crawl-sim/profiles}/oai-searchbot.json +0 -0
  19. /package/{profiles → skills/crawl-sim/profiles}/perplexity-user.json +0 -0
  20. /package/{profiles → skills/crawl-sim/profiles}/perplexitybot.json +0 -0
  21. /package/{scripts → skills/crawl-sim/scripts}/_lib.sh +0 -0
  22. /package/{scripts → skills/crawl-sim/scripts}/check-llmstxt.sh +0 -0
  23. /package/{scripts → skills/crawl-sim/scripts}/check-robots.sh +0 -0
  24. /package/{scripts → skills/crawl-sim/scripts}/check-sitemap.sh +0 -0
  25. /package/{scripts → skills/crawl-sim/scripts}/diff-render.sh +0 -0
  26. /package/{scripts → skills/crawl-sim/scripts}/extract-links.sh +0 -0
  27. /package/{scripts → skills/crawl-sim/scripts}/extract-meta.sh +0 -0
@@ -0,0 +1,15 @@
1
+ {
2
+ "name": "crawl-sim",
3
+ "owner": {
4
+ "name": "BraedenBDev",
5
+ "url": "https://github.com/BraedenBDev"
6
+ },
7
+ "plugins": [
8
+ {
9
+ "name": "crawl-sim",
10
+ "source": "./",
11
+ "description": "Multi-bot web crawler simulator — audit how Googlebot, GPTBot, ClaudeBot, and PerplexityBot see your site",
12
+ "version": "1.2.0"
13
+ }
14
+ ]
15
+ }
@@ -0,0 +1,13 @@
1
+ {
2
+ "name": "crawl-sim",
3
+ "version": "1.2.0",
4
+ "description": "Multi-bot web crawler simulator — audit how Googlebot, GPTBot, ClaudeBot, and PerplexityBot see your site",
5
+ "author": {
6
+ "name": "BraedenBDev",
7
+ "url": "https://github.com/BraedenBDev"
8
+ },
9
+ "homepage": "https://github.com/BraedenBDev/crawl-sim#readme",
10
+ "repository": "https://github.com/BraedenBDev/crawl-sim",
11
+ "license": "MIT",
12
+ "keywords": ["seo", "crawler", "ai-visibility", "claude-code-skill", "googlebot", "gptbot", "claudebot", "perplexitybot"]
13
+ }
package/README.md CHANGED
@@ -44,15 +44,20 @@ The concept was validated manually: a curl-as-GPTBot + Claude analysis caught a
44
44
 
45
45
  ## Quick start
46
46
 
47
- ### In Claude Code (recommended)
47
+ ### As a Claude Code plugin (recommended)
48
+
49
+ ```
50
+ /plugin install BraedenBDev/crawl-sim@github
51
+ ```
52
+
53
+ Or add as a marketplace for easy updates:
48
54
 
49
- ```bash
50
- npm install -g @braedenbuilds/crawl-sim
51
- crawl-sim install # → ~/.claude/skills/crawl-sim/
52
- crawl-sim install --project # → .claude/skills/crawl-sim/
55
+ ```
56
+ /plugin marketplace add BraedenBDev/crawl-sim
57
+ /plugin install crawl-sim@crawl-sim
53
58
  ```
54
59
 
55
- Then in Claude Code:
60
+ Then invoke:
56
61
 
57
62
  ```
58
63
  /crawl-sim https://yoursite.com
@@ -60,6 +65,14 @@ Then in Claude Code:
60
65
 
61
66
  Claude runs the full pipeline, interprets the results, and returns a score card plus prioritized findings.
62
67
 
68
+ ### Via npm (alternative)
69
+
70
+ ```bash
71
+ npm install -g @braedenbuilds/crawl-sim
72
+ crawl-sim install # → ~/.claude/skills/crawl-sim/
73
+ crawl-sim install --project # → .claude/skills/crawl-sim/
74
+ ```
75
+
63
76
  > **Why `npm install -g` instead of `npx`?** Recent versions of npx have a known issue linking bins for scoped single-bin packages in ephemeral installs. A persistent global install avoids the problem entirely. The git clone path below is the zero-npm fallback.
64
77
 
65
78
  ### As a standalone CLI
package/bin/install.js CHANGED
@@ -14,6 +14,7 @@ const os = require('os');
14
14
  const { execFileSync } = require('child_process');
15
15
 
16
16
  const SOURCE_DIR = path.resolve(__dirname, '..');
17
+ const SKILL_ROOT = path.resolve(SOURCE_DIR, 'skills', 'crawl-sim');
17
18
  const SKILL_FILES = ['SKILL.md'];
18
19
  const SKILL_DIRS = ['profiles', 'scripts'];
19
20
 
@@ -80,7 +81,9 @@ function install(target) {
80
81
  fs.mkdirSync(target, { recursive: true });
81
82
 
82
83
  for (const file of SKILL_FILES) {
83
- const src = path.join(SOURCE_DIR, file);
84
+ // Look in skills/crawl-sim/ first (canonical), fallback to root (symlink)
85
+ let src = path.join(SKILL_ROOT, file);
86
+ if (!fs.existsSync(src)) src = path.join(SOURCE_DIR, file);
84
87
  const dest = path.join(target, file);
85
88
  if (fs.existsSync(src)) {
86
89
  fs.copyFileSync(src, dest);
@@ -92,7 +95,8 @@ function install(target) {
92
95
  }
93
96
 
94
97
  for (const dir of SKILL_DIRS) {
95
- const src = path.join(SOURCE_DIR, dir);
98
+ let src = path.join(SKILL_ROOT, dir);
99
+ if (!fs.existsSync(src)) src = path.join(SOURCE_DIR, dir);
96
100
  const dest = path.join(target, dir);
97
101
  if (fs.existsSync(src)) {
98
102
  if (fs.existsSync(dest)) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@braedenbuilds/crawl-sim",
3
- "version": "1.1.0",
3
+ "version": "1.2.0",
4
4
  "description": "Agent-native multi-bot web crawler simulator. See your site through the eyes of Googlebot, GPTBot, ClaudeBot, and PerplexityBot.",
5
5
  "bin": {
6
6
  "crawl-sim": "bin/install.js"
@@ -40,9 +40,11 @@
40
40
  },
41
41
  "files": [
42
42
  "bin/",
43
+ "skills/",
44
+ ".claude-plugin/",
43
45
  "SKILL.md",
44
- "profiles/",
45
- "scripts/",
46
+ "profiles",
47
+ "scripts",
46
48
  "README.md",
47
49
  "LICENSE"
48
50
  ]
@@ -40,7 +40,10 @@ command -v curl >/dev/null 2>&1 || { echo "ERROR: curl is required"; exit 1; }
40
40
  command -v jq >/dev/null 2>&1 || { echo "ERROR: jq is required (brew install jq)"; exit 1; }
41
41
  ```
42
42
 
43
- Locate the skill directory: typically `~/.claude/skills/crawl-sim/` or `.claude/skills/crawl-sim/`. Use `$CLAUDE_PLUGIN_ROOT` if set, otherwise find the directory containing this `SKILL.md`.
43
+ Locate the skill directory. Check in this order:
44
+ 1. `$CLAUDE_PLUGIN_ROOT/skills/crawl-sim` (plugin install)
45
+ 2. `~/.claude/skills/crawl-sim/` (global npm install)
46
+ 3. `.claude/skills/crawl-sim/` (project-level install)
44
47
 
45
48
  ## Orchestration — five narrated stages
46
49
 
@@ -51,7 +54,16 @@ Split the work into **five Bash invocations**, each with a clear `description` f
51
54
  Tell the user: "Fetching as Googlebot, GPTBot, ClaudeBot, and PerplexityBot..."
52
55
 
53
56
  ```bash
54
- SKILL_DIR="$HOME/.claude/skills/crawl-sim" # or wherever this SKILL.md lives
57
+ # Resolve skill directory
58
+ if [ -n "${CLAUDE_PLUGIN_ROOT:-}" ] && [ -d "$CLAUDE_PLUGIN_ROOT/skills/crawl-sim" ]; then
59
+ SKILL_DIR="$CLAUDE_PLUGIN_ROOT/skills/crawl-sim"
60
+ elif [ -d "$HOME/.claude/skills/crawl-sim" ]; then
61
+ SKILL_DIR="$HOME/.claude/skills/crawl-sim"
62
+ elif [ -d ".claude/skills/crawl-sim" ]; then
63
+ SKILL_DIR=".claude/skills/crawl-sim"
64
+ else
65
+ echo "ERROR: cannot find crawl-sim skill directory" >&2; exit 1
66
+ fi
55
67
  RUN_DIR=$(mktemp -d -t crawl-sim.XXXXXX)
56
68
  URL="<user-provided-url>"
57
69
  for bot in googlebot gptbot claudebot perplexitybot; do
@@ -21,6 +21,8 @@ set -eu
21
21
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
22
22
  # shellcheck source=_lib.sh
23
23
  . "$SCRIPT_DIR/_lib.sh"
24
+ # shellcheck source=schema-fields.sh
25
+ . "$SCRIPT_DIR/schema-fields.sh"
24
26
 
25
27
  PAGE_TYPE_OVERRIDE=""
26
28
  while [ $# -gt 0 ]; do
@@ -276,6 +278,40 @@ for bot_id in $BOTS; do
276
278
  ROBOTS="$RESULTS_DIR/robots-$bot_id.json"
277
279
 
278
280
  BOT_NAME=$(jget "$FETCH" '.bot.name' "$bot_id")
281
+
282
+ # Check for fetch failure — skip scoring, emit F grade (AC-A3)
283
+ FETCH_FAILED=$(jget_bool "$FETCH" '.fetchFailed')
284
+ if [ "$FETCH_FAILED" = "true" ]; then
285
+ FETCH_ERROR=$(jget "$FETCH" '.error' "unknown error")
286
+ RENDERS_JS=$(jq -r '.bot.rendersJavaScript | if . == null then "unknown" else tostring end' "$FETCH" 2>/dev/null || echo "unknown")
287
+ BOT_OBJ=$(jq -n \
288
+ --arg id "$bot_id" \
289
+ --arg name "$BOT_NAME" \
290
+ --arg rendersJs "$RENDERS_JS" \
291
+ --arg error "$FETCH_ERROR" \
292
+ '{
293
+ id: $id,
294
+ name: $name,
295
+ rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end),
296
+ fetchFailed: true,
297
+ error: $error,
298
+ score: 0,
299
+ grade: "F",
300
+ visibility: { serverWords: 0, effectiveWords: 0, missedWordsVsRendered: 0, hydrationPenaltyPts: 0 },
301
+ categories: {
302
+ accessibility: { score: 0, grade: "F" },
303
+ contentVisibility: { score: 0, grade: "F" },
304
+ structuredData: { score: 0, grade: "F", pageType: "unknown", expected: [], optional: [], forbidden: [], present: [], missing: [], extras: [], violations: [{ kind: "fetch_failed", impact: -100 }], calculation: "fetch failed — no data to score", notes: ("Fetch failed: " + $error) },
305
+ technicalSignals: { score: 0, grade: "F" },
306
+ aiReadiness: { score: 0, grade: "F" }
307
+ }
308
+ }')
309
+ BOTS_JSON=$(printf '%s' "$BOTS_JSON" | jq --argjson bot "$BOT_OBJ" --arg id "$bot_id" '.[$id] = $bot')
310
+ printf '[compute-score] %s: fetch failed, scoring as F\n' "$bot_id" >&2
311
+ CAT_N=$((CAT_N + 1))
312
+ continue
313
+ fi
314
+
279
315
  STATUS=$(jget_num "$FETCH" '.status')
280
316
  TOTAL_TIME=$(jget_num "$FETCH" '.timing.total')
281
317
  SERVER_WORD_COUNT=$(jget_num "$FETCH" '.wordCount')
@@ -376,15 +412,42 @@ for bot_id in $BOTS; do
376
412
  [ $VALID_PENALTY -gt 20 ] && VALID_PENALTY=20
377
413
  fi
378
414
 
379
- STRUCTURED=$((BASE + BONUS - FORBID_PENALTY - VALID_PENALTY))
415
+ # Field-level validation (C3): check required fields per schema type
416
+ FIELD_PENALTY=0
417
+ FIELD_VIOLATIONS_JSON="[]"
418
+ BLOCK_COUNT_FOR_FIELDS=0
419
+ if [ -f "$JSONLD" ]; then
420
+ BLOCK_COUNT_FOR_FIELDS=$(jq 'if has("blocks") then .blocks | length else 0 end' "$JSONLD" 2>/dev/null || echo "0")
421
+ fi
422
+ if [ "$BLOCK_COUNT_FOR_FIELDS" -gt 0 ]; then
423
+ i=0
424
+ while [ "$i" -lt "$BLOCK_COUNT_FOR_FIELDS" ]; do
425
+ BLOCK_TYPE=$(jq -r ".blocks[$i].type" "$JSONLD" 2>/dev/null || echo "")
426
+ BLOCK_FIELDS=$(jq -r ".blocks[$i].fields[]?" "$JSONLD" 2>/dev/null | tr '\n' ' ')
427
+ REQUIRED=$(required_fields_for "$BLOCK_TYPE")
428
+ for field in $REQUIRED; do
429
+ # shellcheck disable=SC2086
430
+ if ! list_contains "$field" $BLOCK_FIELDS; then
431
+ FIELD_VIOLATIONS_JSON=$(printf '%s' "$FIELD_VIOLATIONS_JSON" | jq \
432
+ --arg schema "$BLOCK_TYPE" --arg field "$field" \
433
+ '. + [{kind: "missing_required_field", schema: $schema, field: $field, impact: -5}]')
434
+ FIELD_PENALTY=$((FIELD_PENALTY + 5))
435
+ fi
436
+ done
437
+ i=$((i + 1))
438
+ done
439
+ fi
440
+ [ $FIELD_PENALTY -gt 30 ] && FIELD_PENALTY=30
441
+
442
+ STRUCTURED=$((BASE + BONUS - FORBID_PENALTY - VALID_PENALTY - FIELD_PENALTY))
380
443
  [ $STRUCTURED -gt 100 ] && STRUCTURED=100
381
444
  [ $STRUCTURED -lt 0 ] && STRUCTURED=0
382
445
 
383
- CALCULATION=$(printf 'base: %d/%d expected present = %d; +%d optional bonus; -%d forbidden penalty; -%d validity penalty; clamp [0,100] = %d' \
446
+ CALCULATION=$(printf 'base: %d/%d expected present = %d; +%d optional bonus; -%d forbidden penalty; -%d validity penalty; -%d field penalty; clamp [0,100] = %d' \
384
447
  "$PRESENT_EXPECTED_COUNT" "$EXPECTED_COUNT" "$BASE" \
385
- "$BONUS" "$FORBID_PENALTY" "$VALID_PENALTY" "$STRUCTURED")
448
+ "$BONUS" "$FORBID_PENALTY" "$VALID_PENALTY" "$FIELD_PENALTY" "$STRUCTURED")
386
449
 
387
- if [ "$STRUCTURED" -ge 100 ] && [ -z "$PRESENT_FORBIDDEN" ] && [ "$VALID_PENALTY" -eq 0 ]; then
450
+ if [ "$STRUCTURED" -ge 100 ] && [ -z "$PRESENT_FORBIDDEN" ] && [ "$VALID_PENALTY" -eq 0 ] && [ "$FIELD_PENALTY" -eq 0 ]; then
388
451
  NOTES="All expected schemas for pageType=$PAGE_TYPE are present. No structured-data action needed."
389
452
  elif [ -n "$MISSING_EXPECTED" ] && [ -z "$PRESENT_FORBIDDEN" ]; then
390
453
  NOTES="Missing expected schemas for pageType=$PAGE_TYPE: $MISSING_EXPECTED. Add these to raise the score."
@@ -392,8 +455,12 @@ for bot_id in $BOTS; do
392
455
  NOTES="Forbidden schemas present for pageType=$PAGE_TYPE: $PRESENT_FORBIDDEN. Remove these (or re-classify the page type with --page-type)."
393
456
  elif [ -n "$PRESENT_FORBIDDEN" ] && [ -n "$MISSING_EXPECTED" ]; then
394
457
  NOTES="Mixed: missing $MISSING_EXPECTED and forbidden present $PRESENT_FORBIDDEN for pageType=$PAGE_TYPE."
395
- else
458
+ elif [ "$FIELD_PENALTY" -gt 0 ]; then
459
+ NOTES="Schemas for pageType=$PAGE_TYPE are present but missing required fields. See violations for details."
460
+ elif [ "$VALID_PENALTY" -gt 0 ]; then
396
461
  NOTES="Score reduced by $VALID_PENALTY pts due to invalid JSON-LD blocks."
462
+ else
463
+ NOTES="Structured data scored for pageType=$PAGE_TYPE."
397
464
  fi
398
465
 
399
466
  STRUCTURED_GRADE=$(grade_for "$STRUCTURED")
@@ -410,6 +477,7 @@ for bot_id in $BOTS; do
410
477
  --arg forbiddenPresent "$PRESENT_FORBIDDEN" \
411
478
  --argjson invalidCount "$JSONLD_INVALID" \
412
479
  --argjson validPenalty "$VALID_PENALTY" \
480
+ --argjson fieldViolations "$FIELD_VIOLATIONS_JSON" \
413
481
  --arg calculation "$CALCULATION" \
414
482
  --arg notes "$NOTES" \
415
483
  '
@@ -430,6 +498,7 @@ for bot_id in $BOTS; do
430
498
  then [{kind: "invalid_jsonld", count: $invalidCount, impact: (0 - $validPenalty)}]
431
499
  else []
432
500
  end)
501
+ + $fieldViolations
433
502
  ),
434
503
  calculation: $calculation,
435
504
  notes: $notes
@@ -566,6 +635,60 @@ CAT_STRUCTURED_GRADE=$(grade_for "$CAT_STRUCTURED_AVG")
566
635
  CAT_TECHNICAL_GRADE=$(grade_for "$CAT_TECHNICAL_AVG")
567
636
  CAT_AI_GRADE=$(grade_for "$CAT_AI_AVG")
568
637
 
638
+ # --- Cross-bot content parity (C4) ---
639
+ PARITY_MIN_WORDS=999999999
640
+ PARITY_MAX_WORDS=0
641
+ PARITY_BOT_COUNT=0
642
+ for bot_id in $BOTS; do
643
+ FETCH="$RESULTS_DIR/fetch-$bot_id.json"
644
+ P_FETCH_FAILED=$(jget_bool "$FETCH" '.fetchFailed')
645
+ [ "$P_FETCH_FAILED" = "true" ] && continue
646
+ WC=$(jget_num "$FETCH" '.wordCount')
647
+ [ "$WC" -lt "$PARITY_MIN_WORDS" ] && PARITY_MIN_WORDS=$WC
648
+ [ "$WC" -gt "$PARITY_MAX_WORDS" ] && PARITY_MAX_WORDS=$WC
649
+ PARITY_BOT_COUNT=$((PARITY_BOT_COUNT + 1))
650
+ done
651
+
652
+ if [ "$PARITY_BOT_COUNT" -le 1 ]; then
653
+ PARITY_SCORE=100
654
+ PARITY_MAX_DELTA=0
655
+ elif [ "$PARITY_MAX_WORDS" -gt 0 ]; then
656
+ PARITY_SCORE=$(awk -v min="$PARITY_MIN_WORDS" -v max="$PARITY_MAX_WORDS" \
657
+ 'BEGIN { printf "%d", (min / max) * 100 + 0.5 }')
658
+ PARITY_MAX_DELTA=$(awk -v min="$PARITY_MIN_WORDS" -v max="$PARITY_MAX_WORDS" \
659
+ 'BEGIN { printf "%d", ((max - min) / max) * 100 + 0.5 }')
660
+ else
661
+ PARITY_SCORE=100
662
+ PARITY_MAX_DELTA=0
663
+ fi
664
+
665
+ [ "$PARITY_SCORE" -gt 100 ] && PARITY_SCORE=100
666
+ PARITY_GRADE=$(grade_for "$PARITY_SCORE")
667
+
668
+ if [ "$PARITY_SCORE" -ge 95 ]; then
669
+ PARITY_INTERP="Content is consistent across all bots."
670
+ elif [ "$PARITY_SCORE" -ge 50 ]; then
671
+ PARITY_INTERP="Moderate content divergence between bots — likely partial client-side rendering hydration."
672
+ else
673
+ PARITY_INTERP="Severe content divergence — site likely relies on client-side rendering. AI bots see significantly less content than Googlebot."
674
+ fi
675
+
676
+ # --- Warnings (H2) ---
677
+ WARNINGS="[]"
678
+ if [ "$DIFF_AVAILABLE" != "true" ]; then
679
+ DIFF_REASON="not_found"
680
+ if [ -f "$DIFF_RENDER_FILE" ]; then
681
+ DIFF_REASON=$(jq -r '.reason // "skipped"' "$DIFF_RENDER_FILE" 2>/dev/null || echo "skipped")
682
+ fi
683
+ WARNINGS=$(printf '%s' "$WARNINGS" | jq --arg reason "$DIFF_REASON" \
684
+ '. + [{
685
+ code: "diff_render_unavailable",
686
+ severity: "high",
687
+ message: "JS rendering comparison was skipped. If this site uses CSR, non-JS bot scores may be inaccurate.",
688
+ reason: $reason
689
+ }]')
690
+ fi
691
+
569
692
  TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
570
693
 
571
694
  jq -n \
@@ -587,6 +710,13 @@ jq -n \
587
710
  --arg catTechnicalGrade "$CAT_TECHNICAL_GRADE" \
588
711
  --argjson catAi "$CAT_AI_AVG" \
589
712
  --arg catAiGrade "$CAT_AI_GRADE" \
713
+ --argjson warnings "$WARNINGS" \
714
+ --argjson parityScore "$PARITY_SCORE" \
715
+ --arg parityGrade "$PARITY_GRADE" \
716
+ --argjson parityMinWords "$PARITY_MIN_WORDS" \
717
+ --argjson parityMaxWords "$PARITY_MAX_WORDS" \
718
+ --argjson parityMaxDelta "$PARITY_MAX_DELTA" \
719
+ --arg parityInterp "$PARITY_INTERP" \
590
720
  '{
591
721
  url: $url,
592
722
  timestamp: $timestamp,
@@ -594,6 +724,15 @@ jq -n \
594
724
  pageType: $pageType,
595
725
  pageTypeOverridden: ($pageTypeOverride | length > 0),
596
726
  overall: { score: $overallScore, grade: $overallGrade },
727
+ parity: {
728
+ score: $parityScore,
729
+ grade: $parityGrade,
730
+ minWords: (if $parityMinWords >= 999999999 then 0 else $parityMinWords end),
731
+ maxWords: $parityMaxWords,
732
+ maxDeltaPct: $parityMaxDelta,
733
+ interpretation: $parityInterp
734
+ },
735
+ warnings: $warnings,
597
736
  bots: $bots,
598
737
  categories: {
599
738
  accessibility: { score: $catAcc, grade: $catAccGrade },
@@ -62,6 +62,7 @@ fi
62
62
 
63
63
  VALID_COUNT=0
64
64
  INVALID_COUNT=0
65
+ BLOCKS_JSON="[]"
65
66
 
66
67
  if [ "$BLOCK_COUNT" -gt 0 ]; then
67
68
  while IFS= read -r block; do
@@ -79,6 +80,15 @@ if [ "$BLOCK_COUNT" -gt 0 ]; then
79
80
  else empty end;
80
81
  collect_types
81
82
  ' 2>/dev/null >> "$TYPES_FILE" || true
83
+
84
+ # Extract per-block type + top-level field names for field validation (AC-B1)
85
+ BLOCK_INFO=$(printf '%s' "$block" | jq -c '
86
+ {
87
+ type: (if has("@type") then (.["@type"] | if type == "array" then .[0] else . end) else "unknown" end),
88
+ fields: (keys | map(select(startswith("@") | not)))
89
+ }
90
+ ' 2>/dev/null || echo '{"type":"unknown","fields":[]}')
91
+ BLOCKS_JSON=$(printf '%s' "$BLOCKS_JSON" | jq --argjson b "$BLOCK_INFO" '. + [$b]')
82
92
  else
83
93
  INVALID_COUNT=$((INVALID_COUNT + 1))
84
94
  fi
@@ -109,6 +119,7 @@ jq -n \
109
119
  --argjson valid "$VALID_COUNT" \
110
120
  --argjson invalid "$INVALID_COUNT" \
111
121
  --argjson types "$TYPES_JSON" \
122
+ --argjson blocks "$BLOCKS_JSON" \
112
123
  --argjson hasOrg "$HAS_ORG" \
113
124
  --argjson hasBreadcrumb "$HAS_BREADCRUMB" \
114
125
  --argjson hasWebsite "$HAS_WEBSITE" \
@@ -121,6 +132,7 @@ jq -n \
121
132
  validCount: $valid,
122
133
  invalidCount: $invalid,
123
134
  types: $types,
135
+ blocks: $blocks,
124
136
  flags: {
125
137
  hasOrganization: $hasOrg,
126
138
  hasBreadcrumbList: $hasBreadcrumb,
@@ -0,0 +1,151 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ # fetch-as-bot.sh — Fetch a URL as a specific bot User-Agent
5
+ # Usage: fetch-as-bot.sh <url> <profile.json>
6
+
7
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
8
+ # shellcheck source=_lib.sh
9
+ . "$SCRIPT_DIR/_lib.sh"
10
+
11
+ URL="${1:?Usage: fetch-as-bot.sh <url> <profile.json>}"
12
+ PROFILE="${2:?Usage: fetch-as-bot.sh <url> <profile.json>}"
13
+
14
+ BOT_ID=$(jq -r '.id' "$PROFILE")
15
+ BOT_NAME=$(jq -r '.name' "$PROFILE")
16
+ UA=$(jq -r '.userAgent' "$PROFILE")
17
+ RENDERS_JS=$(jq -r '.rendersJavaScript' "$PROFILE")
18
+
19
+ TMPDIR="${TMPDIR:-/tmp}"
20
+ HEADERS_FILE=$(mktemp "$TMPDIR/crawlsim-headers.XXXXXX")
21
+ BODY_FILE=$(mktemp "$TMPDIR/crawlsim-body.XXXXXX")
22
+ CURL_STDERR_FILE=$(mktemp "$TMPDIR/crawlsim-stderr.XXXXXX")
23
+ trap 'rm -f "$HEADERS_FILE" "$BODY_FILE" "$CURL_STDERR_FILE"' EXIT
24
+
25
+ printf '[%s] fetching %s\n' "$BOT_ID" "$URL" >&2
26
+
27
+ set +e
28
+ TIMING=$(curl -sS -L \
29
+ -H "User-Agent: $UA" \
30
+ -D "$HEADERS_FILE" \
31
+ -o "$BODY_FILE" \
32
+ -w '{"total":%{time_total},"ttfb":%{time_starttransfer},"connect":%{time_connect},"statusCode":%{http_code},"sizeDownload":%{size_download},"redirectCount":%{num_redirects},"finalUrl":"%{url_effective}"}' \
33
+ --max-time 30 \
34
+ "$URL" 2>"$CURL_STDERR_FILE")
35
+ CURL_EXIT=$?
36
+ set -e
37
+
38
+ CURL_ERR=""
39
+ if [ -s "$CURL_STDERR_FILE" ]; then
40
+ CURL_ERR=$(cat "$CURL_STDERR_FILE")
41
+ fi
42
+
43
+ if [ "$CURL_EXIT" -ne 0 ]; then
44
+ printf '[%s] FAILED: curl exit %d — %s\n' "$BOT_ID" "$CURL_EXIT" "$CURL_ERR" >&2
45
+ jq -n \
46
+ --arg url "$URL" \
47
+ --arg botId "$BOT_ID" \
48
+ --arg botName "$BOT_NAME" \
49
+ --arg ua "$UA" \
50
+ --arg rendersJs "$RENDERS_JS" \
51
+ --arg error "$CURL_ERR" \
52
+ --argjson exitCode "$CURL_EXIT" \
53
+ '{
54
+ url: $url,
55
+ bot: {
56
+ id: $botId,
57
+ name: $botName,
58
+ userAgent: $ua,
59
+ rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
60
+ },
61
+ fetchFailed: true,
62
+ error: $error,
63
+ curlExitCode: $exitCode,
64
+ status: 0,
65
+ timing: { total: 0, ttfb: 0 },
66
+ size: 0,
67
+ wordCount: 0,
68
+ headers: {},
69
+ bodyBase64: ""
70
+ }'
71
+ exit 0
72
+ fi
73
+
74
+ read -r STATUS TOTAL_TIME TTFB SIZE REDIRECT_COUNT FINAL_URL <<< \
75
+ "$(echo "$TIMING" | jq -r '[.statusCode, .total, .ttfb, .sizeDownload, .redirectCount, .finalUrl] | @tsv')"
76
+
77
+ # Parse response headers into a JSON object using jq for safe escaping.
78
+ # curl -L writes multiple blocks on redirect; jq keeps the last definition
79
+ # of each header since `add` overwrites left-to-right.
80
+ HEADERS_JSON=$(tr -d '\r' < "$HEADERS_FILE" \
81
+ | grep -E '^[A-Za-z][A-Za-z0-9-]*:[[:space:]]' \
82
+ | jq -Rs '
83
+ split("\n")
84
+ | map(select(length > 0))
85
+ | map(capture("^(?<k>[^:]+):[[:space:]]*(?<v>.*)$"))
86
+ | map({(.k): .v})
87
+ | add // {}
88
+ ')
89
+
90
+ # Parse redirect chain from headers dump.
91
+ # curl -D writes multiple HTTP response blocks on redirect — each starts with HTTP/.
92
+ REDIRECT_CHAIN="[]"
93
+ if [ "$REDIRECT_COUNT" -gt 0 ]; then
94
+ REDIRECT_CHAIN=$(tr -d '\r' < "$HEADERS_FILE" | awk '
95
+ /^HTTP\// { status=$2; url="" }
96
+ /^[Ll]ocation:/ { url=$2 }
97
+ /^$/ && status && url { printf "%s %s\n", status, url; status=""; url="" }
98
+ ' | jq -Rs '
99
+ split("\n") | map(select(length > 0)) |
100
+ to_entries | map({
101
+ hop: .key,
102
+ status: (.value | split(" ")[0] | tonumber),
103
+ location: (.value | split(" ")[1:] | join(" "))
104
+ })
105
+ ')
106
+ fi
107
+
108
+ WORD_COUNT=$(count_words "$BODY_FILE")
109
+ [ -z "$WORD_COUNT" ] && WORD_COUNT=0
110
+
111
+ BODY_B64=""
112
+ if [ -s "$BODY_FILE" ]; then
113
+ BODY_B64=$(base64 < "$BODY_FILE")
114
+ fi
115
+
116
+ printf '[%s] ok: status=%s size=%s words=%s time=%ss\n' "$BOT_ID" "$STATUS" "$SIZE" "$WORD_COUNT" "$TOTAL_TIME" >&2
117
+
118
+ jq -n \
119
+ --arg url "$URL" \
120
+ --arg botId "$BOT_ID" \
121
+ --arg botName "$BOT_NAME" \
122
+ --arg ua "$UA" \
123
+ --arg rendersJs "$RENDERS_JS" \
124
+ --argjson status "$STATUS" \
125
+ --argjson totalTime "$TOTAL_TIME" \
126
+ --argjson ttfb "$TTFB" \
127
+ --argjson size "$SIZE" \
128
+ --argjson wordCount "$WORD_COUNT" \
129
+ --argjson headers "$HEADERS_JSON" \
130
+ --argjson redirectCount "$REDIRECT_COUNT" \
131
+ --arg finalUrl "$FINAL_URL" \
132
+ --argjson redirectChain "$REDIRECT_CHAIN" \
133
+ --arg bodyBase64 "$BODY_B64" \
134
+ '{
135
+ url: $url,
136
+ bot: {
137
+ id: $botId,
138
+ name: $botName,
139
+ userAgent: $ua,
140
+ rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
141
+ },
142
+ status: $status,
143
+ timing: { total: $totalTime, ttfb: $ttfb },
144
+ size: $size,
145
+ wordCount: $wordCount,
146
+ redirectCount: $redirectCount,
147
+ finalUrl: $finalUrl,
148
+ redirectChain: $redirectChain,
149
+ headers: $headers,
150
+ bodyBase64: $bodyBase64
151
+ }'
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env bash
2
+ # schema-fields.sh — Required field definitions per schema.org type.
3
+ # Source this file, then call required_fields_for <SchemaType>.
4
+
5
+ required_fields_for() {
6
+ case "$1" in
7
+ Organization) echo "name url" ;;
8
+ WebSite) echo "name url" ;;
9
+ Article) echo "headline author datePublished" ;;
10
+ NewsArticle) echo "headline author datePublished" ;;
11
+ FAQPage) echo "mainEntity" ;;
12
+ BreadcrumbList) echo "itemListElement" ;;
13
+ CollectionPage) echo "name" ;;
14
+ ItemList) echo "itemListElement" ;;
15
+ AboutPage) echo "name" ;;
16
+ ContactPage) echo "name" ;;
17
+ Product) echo "name" ;;
18
+ LocalBusiness) echo "name address" ;;
19
+ ProfessionalService) echo "name" ;;
20
+ Person) echo "name" ;;
21
+ ImageObject) echo "contentUrl" ;;
22
+ PostalAddress) echo "streetAddress" ;;
23
+ *) echo "" ;;
24
+ esac
25
+ }
@@ -1,87 +0,0 @@
1
- #!/usr/bin/env bash
2
- set -euo pipefail
3
-
4
- # fetch-as-bot.sh — Fetch a URL as a specific bot User-Agent
5
- # Usage: fetch-as-bot.sh <url> <profile.json>
6
-
7
- SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
8
- # shellcheck source=_lib.sh
9
- . "$SCRIPT_DIR/_lib.sh"
10
-
11
- URL="${1:?Usage: fetch-as-bot.sh <url> <profile.json>}"
12
- PROFILE="${2:?Usage: fetch-as-bot.sh <url> <profile.json>}"
13
-
14
- BOT_ID=$(jq -r '.id' "$PROFILE")
15
- BOT_NAME=$(jq -r '.name' "$PROFILE")
16
- UA=$(jq -r '.userAgent' "$PROFILE")
17
- RENDERS_JS=$(jq -r '.rendersJavaScript' "$PROFILE")
18
-
19
- printf '[fetch-as-bot] %s <- %s\n' "$BOT_NAME" "$URL" >&2
20
-
21
- TMPDIR="${TMPDIR:-/tmp}"
22
- HEADERS_FILE=$(mktemp "$TMPDIR/crawlsim-headers.XXXXXX")
23
- BODY_FILE=$(mktemp "$TMPDIR/crawlsim-body.XXXXXX")
24
- trap 'rm -f "$HEADERS_FILE" "$BODY_FILE"' EXIT
25
-
26
- TIMING=$(curl -sS -L \
27
- -H "User-Agent: $UA" \
28
- -D "$HEADERS_FILE" \
29
- -o "$BODY_FILE" \
30
- -w '{"total":%{time_total},"ttfb":%{time_starttransfer},"connect":%{time_connect},"statusCode":%{http_code},"sizeDownload":%{size_download}}' \
31
- --max-time 30 \
32
- "$URL" 2>/dev/null || echo '{"total":0,"ttfb":0,"connect":0,"statusCode":0,"sizeDownload":0}')
33
-
34
- STATUS=$(echo "$TIMING" | jq -r '.statusCode')
35
- TOTAL_TIME=$(echo "$TIMING" | jq -r '.total')
36
- TTFB=$(echo "$TIMING" | jq -r '.ttfb')
37
- SIZE=$(echo "$TIMING" | jq -r '.sizeDownload')
38
-
39
- # Parse response headers into a JSON object using jq for safe escaping.
40
- # curl -L writes multiple blocks on redirect; jq keeps the last definition
41
- # of each header since `add` overwrites left-to-right.
42
- HEADERS_JSON=$(tr -d '\r' < "$HEADERS_FILE" \
43
- | grep -E '^[A-Za-z][A-Za-z0-9-]*:[[:space:]]' \
44
- | jq -Rs '
45
- split("\n")
46
- | map(select(length > 0))
47
- | map(capture("^(?<k>[^:]+):[[:space:]]*(?<v>.*)$"))
48
- | map({(.k): .v})
49
- | add // {}
50
- ')
51
-
52
- WORD_COUNT=$(count_words "$BODY_FILE")
53
- [ -z "$WORD_COUNT" ] && WORD_COUNT=0
54
-
55
- BODY_B64=""
56
- if [ -s "$BODY_FILE" ]; then
57
- BODY_B64=$(base64 < "$BODY_FILE")
58
- fi
59
-
60
- jq -n \
61
- --arg url "$URL" \
62
- --arg botId "$BOT_ID" \
63
- --arg botName "$BOT_NAME" \
64
- --arg ua "$UA" \
65
- --arg rendersJs "$RENDERS_JS" \
66
- --argjson status "$STATUS" \
67
- --argjson totalTime "$TOTAL_TIME" \
68
- --argjson ttfb "$TTFB" \
69
- --argjson size "$SIZE" \
70
- --argjson wordCount "$WORD_COUNT" \
71
- --argjson headers "$HEADERS_JSON" \
72
- --arg bodyBase64 "$BODY_B64" \
73
- '{
74
- url: $url,
75
- bot: {
76
- id: $botId,
77
- name: $botName,
78
- userAgent: $ua,
79
- rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
80
- },
81
- status: $status,
82
- timing: { total: $totalTime, ttfb: $ttfb },
83
- size: $size,
84
- wordCount: $wordCount,
85
- headers: $headers,
86
- bodyBase64: $bodyBase64
87
- }'
File without changes