npm - @braedenbuilds/crawl-sim - Versions diffs - 1.2.0 → 1.3.1 - Mend

@braedenbuilds/crawl-sim 1.2.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/.claude-plugin/marketplace.json +1 -1
package/.claude-plugin/plugin.json +1 -1
package/package.json +1 -1
package/skills/crawl-sim/SKILL.md +24 -3
package/skills/crawl-sim/scripts/_lib.sh +6 -1
package/skills/crawl-sim/scripts/build-report.sh +45 -0
package/skills/crawl-sim/scripts/check-llmstxt.sh +5 -0
package/skills/crawl-sim/scripts/check-sitemap.sh +10 -1
package/skills/crawl-sim/scripts/compute-score.sh +59 -37
package/skills/crawl-sim/scripts/extract-links.sh +5 -7

package/.claude-plugin/marketplace.json CHANGED Viewed

@@ -9,7 +9,7 @@
       "name": "crawl-sim",
       "source": "./",
       "description": "Multi-bot web crawler simulator — audit how Googlebot, GPTBot, ClaudeBot, and PerplexityBot see your site",
-      "version": "1.2.0"
+      "version": "1.3.1"
     }
   ]
 }

package/.claude-plugin/plugin.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "crawl-sim",
-  "version": "1.2.0",
+  "version": "1.3.1",
   "description": "Multi-bot web crawler simulator — audit how Googlebot, GPTBot, ClaudeBot, and PerplexityBot see your site",
   "author": {
     "name": "BraedenBDev",

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@braedenbuilds/crawl-sim",
-  "version": "1.2.0",
+  "version": "1.3.1",
   "description": "Agent-native multi-bot web crawler simulator. See your site through the eyes of Googlebot, GPTBot, ClaudeBot, and PerplexityBot.",
   "bin": {
     "crawl-sim": "bin/install.js"

package/skills/crawl-sim/SKILL.md CHANGED Viewed

@@ -51,7 +51,7 @@ Split the work into **five Bash invocations**, each with a clear `description` f
 ### Stage 1 — Fetch
-Tell the user: "Fetching as Googlebot, GPTBot, ClaudeBot, and PerplexityBot..."
+Tell the user: "Fetching as Googlebot, GPTBot, ClaudeBot, and PerplexityBot in parallel..."
 ```bash
 # Resolve skill directory
@@ -67,7 +67,16 @@ fi
 RUN_DIR=$(mktemp -d -t crawl-sim.XXXXXX)
 URL="<user-provided-url>"
 for bot in googlebot gptbot claudebot perplexitybot; do
-  "$SKILL_DIR/scripts/fetch-as-bot.sh" "$URL" "$SKILL_DIR/profiles/${bot}.json" > "$RUN_DIR/fetch-${bot}.json"
+  "$SKILL_DIR/scripts/fetch-as-bot.sh" "$URL" "$SKILL_DIR/profiles/${bot}.json" > "$RUN_DIR/fetch-${bot}.json" &
+done
+wait
+# Verify no empty fetch files (guard against silent parallel failures)
+for bot in googlebot gptbot claudebot perplexitybot; do
+  if [ ! -s "$RUN_DIR/fetch-${bot}.json" ]; then
+    echo "WARNING: fetch-${bot}.json is empty — retrying serially" >&2
+    "$SKILL_DIR/scripts/fetch-as-bot.sh" "$URL" "$SKILL_DIR/profiles/${bot}.json" > "$RUN_DIR/fetch-${bot}.json"
+  fi
 done
 ```
@@ -124,7 +133,7 @@ Tell the user: "Computing per-bot scores and finalizing the report..."
 ```bash
 "$SKILL_DIR/scripts/compute-score.sh" "$RUN_DIR" > "$RUN_DIR/score.json"
-cp "$RUN_DIR/score.json" ./crawl-sim-report.json
+"$SKILL_DIR/scripts/build-report.sh" "$RUN_DIR" > ./crawl-sim-report.json
 ```
 **Page-type awareness.** `compute-score.sh` derives a page type from the target URL (`root` / `detail` / `archive` / `faq` / `about` / `contact` / `generic`) and picks a schema rubric accordingly. Root pages are expected to ship `Organization` + `WebSite` — penalizing them for missing `BreadcrumbList` or `FAQPage` would be wrong, so the scorer doesn't. If the URL heuristic picks the wrong type (e.g., a homepage at `/en/` that URL-parses as generic), pass `--page-type <type>`:
@@ -162,6 +171,18 @@ Print a boxed score card to the terminal:
 Progress bars are 20 chars wide using `█` and `░` (each char = 5%).
+**Parity-aware display.** When `parity.score >= 95` AND all per-bot composite scores are within 5 points of each other, collapse the four bot rows into one:
+```
+║  All 4 bots     98  A   ███████████████████░  (parity: content identical)  ║
+```
+Only show individual bot rows when scores diverge — that's when per-bot detail adds information. Always show the parity line in the category breakdown:
+```
+║  Content Parity   100  A   (all bots see the same content)                 ║
+```
 ## Output Layer 2 — Narrative Audit
 Lead with a **Bot differentiation summary** — state up front whether the bots scored the same or differently, and why. If they scored the same, explicitly say so:

package/skills/crawl-sim/scripts/_lib.sh CHANGED Viewed

@@ -73,9 +73,14 @@ page_type_for_url() {
 # Fetch a URL to a local file and return the HTTP status code on stdout.
 # Usage: status=$(fetch_to_file <url> <output-file> [timeout-seconds])
+# Retries once on transient failure (same SSL/DNS flake that caused #11).
 fetch_to_file() {
   local url="$1"
   local out="$2"
   local timeout="${3:-15}"
-  curl -sS -L -o "$out" -w '%{http_code}' --max-time "$timeout" "$url" 2>/dev/null || echo "000"
+  local status
+  status=$(curl -sS -L -o "$out" -w '%{http_code}' --max-time "$timeout" "$url" 2>/dev/null) && echo "$status" && return
+  # Retry once on transient failure
+  status=$(curl -sS -L -o "$out" -w '%{http_code}' --max-time "$timeout" "$url" 2>/dev/null) && echo "$status" && return
+  echo "000"
 }

package/skills/crawl-sim/scripts/build-report.sh ADDED Viewed

@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+set -eu
+# build-report.sh — Consolidate all crawl-sim outputs into a single JSON report
+# Usage: build-report.sh <results-dir>
+# Output: JSON to stdout — the contents of score.json plus a `raw` object that
+#         holds the per-bot and site-wide check outputs found in <results-dir>
+RESULTS_DIR="${1:?Usage: build-report.sh <results-dir>}"
+SCORE_FILE="$RESULTS_DIR/score.json"
+# -s rather than -f: an empty score.json would otherwise produce an empty report.
+if [ ! -s "$SCORE_FILE" ]; then
+  echo "Error: score.json not found or empty in $RESULTS_DIR — run compute-score.sh first" >&2
+  exit 1
+fi
+# json_or <file> <jq-filter> <fallback-json>
+# Prints <jq-filter> applied to the first JSON document in <file> (compact).
+# Prints <fallback-json> when the file is missing, empty, or cannot be parsed.
+# jq exits 0 and prints nothing for an empty file, and an empty string passed
+# to --argjson makes jq fail (aborting this script under `set -e`), so the
+# output is checked for emptiness in addition to jq's exit status.
+json_or() {
+  local file="$1" filter="$2" fallback="$3" out
+  if out=$(jq -c -s "first(.[] | ($filter))" "$file" 2>/dev/null) && [ -n "$out" ]; then
+    printf '%s' "$out"
+  else
+    printf '%s' "$fallback"
+  fi
+}
+# Collect per-bot raw data, keyed by bot id
+PER_BOT="{}"
+for f in "$RESULTS_DIR"/fetch-*.json; do
+  # An unmatched glob stays literal; skip it.
+  [ -f "$f" ] || continue
+  # <results-dir>/fetch-<bot>.json → <bot>
+  bot_id="${f##*/}"
+  bot_id="${bot_id#fetch-}"
+  bot_id="${bot_id%.json}"
+  PER_BOT=$(jq -c -n \
+    --argjson acc "$PER_BOT" \
+    --arg id "$bot_id" \
+    --argjson fetch "$(json_or "$f" '{status, timing, size, wordCount, redirectCount, finalUrl, redirectChain, fetchFailed, error}' '{}')" \
+    --argjson meta "$(json_or "$RESULTS_DIR/meta-$bot_id.json" '.' '{}')" \
+    --argjson jsonld "$(json_or "$RESULTS_DIR/jsonld-$bot_id.json" '{blockCount, types, blocks}' '{}')" \
+    --argjson links "$(json_or "$RESULTS_DIR/links-$bot_id.json" '.' '{}')" \
+    --argjson robots "$(json_or "$RESULTS_DIR/robots-$bot_id.json" '.' '{}')" \
+    '$acc | .[$id] = {fetch: $fetch, meta: $meta, jsonld: $jsonld, links: $links, robots: $robots}')
+done
+# Collect independent (non-per-bot) data
+INDEPENDENT=$(jq -c -n \
+  --argjson sitemap "$(json_or "$RESULTS_DIR/sitemap.json" '.' '{}')" \
+  --argjson llmstxt "$(json_or "$RESULTS_DIR/llmstxt.json" '.' '{}')" \
+  --argjson diffRender "$(json_or "$RESULTS_DIR/diff-render.json" '.' '{"skipped":true,"reason":"not_found"}')" \
+  '{sitemap: $sitemap, llmstxt: $llmstxt, diffRender: $diffRender}')
+# Merge score + raw data
+jq \
+  --argjson perBot "$PER_BOT" \
+  --argjson independent "$INDEPENDENT" \
+  '. + {raw: {perBot: $perBot, independent: $independent}}' \
+  "$SCORE_FILE"

package/skills/crawl-sim/scripts/check-llmstxt.sh CHANGED Viewed

@@ -79,8 +79,12 @@ LLMS_FULL_HAS_TITLE=$HAS_TITLE
 LLMS_FULL_HAS_DESC=$HAS_DESCRIPTION
 LLMS_FULL_URLS=$URL_COUNT
+TOP_EXISTS=false
+[ "$LLMS_EXISTS" = "true" ] || [ "$LLMS_FULL_EXISTS" = "true" ] && TOP_EXISTS=true
 jq -n \
   --arg url "$URL" \
+  --argjson topExists "$TOP_EXISTS" \
   --arg llmsUrl "${ORIGIN}/llms.txt" \
   --arg llmsFullUrl "${ORIGIN}/llms-full.txt" \
   --argjson llmsExists "$LLMS_EXISTS" \
@@ -96,6 +100,7 @@ jq -n \
   --argjson llmsFullUrls "$LLMS_FULL_URLS" \
   '{
     url: $url,
+    exists: $topExists,
     llmsTxt: {
       url: $llmsUrl,
       exists: $llmsExists,

package/skills/crawl-sim/scripts/check-sitemap.sh CHANGED Viewed

@@ -25,6 +25,7 @@ CONTAINS_TARGET=false
 HAS_LASTMOD=false
 IS_INDEX=false
 CHILD_SITEMAP_COUNT=0
+SAMPLE_URLS="[]"
 if [ "$HTTP_STATUS" = "200" ] && [ -s "$SITEMAP_FILE" ]; then
   # Check if content looks like XML (not HTML fallback)
@@ -43,6 +44,12 @@ if [ "$HTTP_STATUS" = "200" ] && [ -s "$SITEMAP_FILE" ]; then
       # Count <loc> tags (URLs, or child sitemaps in an index)
       URL_COUNT=$(grep -oE '<loc>' "$SITEMAP_FILE" | wc -l | tr -d ' ')
+      # Extract first 10 <loc> URLs as sample
+      SAMPLE_URLS=$(grep -oE '<loc>[^<]+</loc>' "$SITEMAP_FILE" \
+        | sed -E 's/<\/?loc>//g' \
+        | head -10 \
+        | jq -R . | jq -s .)
       # Check if target URL appears anywhere in the sitemap
       # Match both with and without trailing slash
       URL_NO_TRAILING=$(printf '%s' "$URL" | sed -E 's#/$##')
@@ -67,6 +74,7 @@ jq -n \
   --argjson childSitemapCount "$CHILD_SITEMAP_COUNT" \
   --argjson containsTarget "$CONTAINS_TARGET" \
   --argjson hasLastmod "$HAS_LASTMOD" \
+  --argjson sampleUrls "$SAMPLE_URLS" \
   '{
     url: $url,
     sitemapUrl: $sitemapUrl,
@@ -75,5 +83,6 @@ jq -n \
     urlCount: $urlCount,
     childSitemapCount: $childSitemapCount,
     containsTarget: $containsTarget,
-    hasLastmod: $hasLastmod
+    hasLastmod: $hasLastmod,
+    sampleUrls: $sampleUrls
   }'

package/skills/crawl-sim/scripts/compute-score.sh CHANGED Viewed

@@ -312,12 +312,16 @@ for bot_id in $BOTS; do
     continue
   fi
-  STATUS=$(jget_num "$FETCH" '.status')
-  TOTAL_TIME=$(jget_num "$FETCH" '.timing.total')
-  SERVER_WORD_COUNT=$(jget_num "$FETCH" '.wordCount')
-  RENDERS_JS=$(jq -r '.bot.rendersJavaScript | if . == null then "unknown" else tostring end' "$FETCH" 2>/dev/null || echo "unknown")
+  # Batch-read fields from fetch file (1 jq call instead of 4)
+  read -r STATUS TOTAL_TIME SERVER_WORD_COUNT RENDERS_JS <<< \
+    "$(jq -r '[
+      (.status // 0),
+      (.timing.total // 0),
+      (.wordCount // 0),
+      (.bot.rendersJavaScript | if . == null then "unknown" else tostring end)
+    ] | @tsv' "$FETCH" 2>/dev/null || echo "0	0	0	unknown")"
-  ROBOTS_ALLOWED=$(jget_bool "$ROBOTS" '.allowed')
+  ROBOTS_ALLOWED=$(jq -r '.allowed // false | tostring' "$ROBOTS" 2>/dev/null || echo "false")
   EFFECTIVE_WORD_COUNT=$SERVER_WORD_COUNT
   HYDRATION_PENALTY=0
@@ -341,10 +345,15 @@ for bot_id in $BOTS; do
   # --- Category 1: Accessibility (0-100) ---
   ACC=0
-  [ "$ROBOTS_ALLOWED" = "true" ] && ACC=$((ACC + 40))
-  [ "$STATUS" = "200" ] && ACC=$((ACC + 40))
-  TIME_SCORE=$(awk -v t="$TOTAL_TIME" 'BEGIN { if (t < 2) print 20; else if (t < 5) print 10; else print 0 }')
-  ACC=$((ACC + TIME_SCORE))
+  if [ "$ROBOTS_ALLOWED" != "true" ]; then
+    # R4 critical-fail: robots blocking overrides accessibility to 0/F
+    ACC=0
+  else
+    ACC=$((ACC + 40))
+    [ "$STATUS" = "200" ] && ACC=$((ACC + 40))
+    TIME_SCORE=$(awk -v t="$TOTAL_TIME" 'BEGIN { if (t < 2) print 20; else if (t < 5) print 10; else print 0 }')
+    ACC=$((ACC + TIME_SCORE))
+  fi
   # --- Category 2: Content Visibility (0-100) ---
   CONTENT=0
@@ -353,18 +362,23 @@ for bot_id in $BOTS; do
   elif [ "$EFFECTIVE_WORD_COUNT" -ge 50 ]; then CONTENT=$((CONTENT + 10))
   fi
-  H1_COUNT=$(jget_num "$META" '.headings.h1.count')
-  H2_COUNT=$(jget_num "$META" '.headings.h2.count')
+  # Batch-read fields from meta + links (1 jq call instead of 4 + 1)
+  read -r H1_COUNT H2_COUNT IMG_TOTAL IMG_WITH_ALT <<< \
+    "$(jq -r '[
+      (.headings.h1.count // 0),
+      (.headings.h2.count // 0),
+      (.images.total // 0),
+      (.images.withAlt // 0)
+    ] | @tsv' "$META" 2>/dev/null || echo "0	0	0	0")"
+  INTERNAL_LINKS=$(jq -r 'if (.internal | type) == "number" then .internal else .counts.internal // 0 end' "$LINKS" 2>/dev/null || echo "0")
   [ "$H1_COUNT" -ge 1 ] && CONTENT=$((CONTENT + 20))
   [ "$H2_COUNT" -ge 1 ] && CONTENT=$((CONTENT + 15))
-  INTERNAL_LINKS=$(jget_num "$LINKS" '.counts.internal')
   if [ "$INTERNAL_LINKS" -ge 5 ]; then CONTENT=$((CONTENT + 20))
   elif [ "$INTERNAL_LINKS" -ge 1 ]; then CONTENT=$((CONTENT + 10))
   fi
-  IMG_TOTAL=$(jget_num "$META" '.images.total')
-  IMG_WITH_ALT=$(jget_num "$META" '.images.withAlt')
   if [ "$IMG_TOTAL" -eq 0 ]; then
     CONTENT=$((CONTENT + 15))
   else
@@ -430,7 +444,7 @@ for bot_id in $BOTS; do
         if ! list_contains "$field" $BLOCK_FIELDS; then
           FIELD_VIOLATIONS_JSON=$(printf '%s' "$FIELD_VIOLATIONS_JSON" | jq \
             --arg schema "$BLOCK_TYPE" --arg field "$field" \
-            '. + [{kind: "missing_required_field", schema: $schema, field: $field, impact: -5}]')
+            '. + [{kind: "missing_required_field", schema: $schema, field: $field, impact: -5, confidence: "high"}]')
           FIELD_PENALTY=$((FIELD_PENALTY + 5))
         fi
       done
@@ -493,9 +507,9 @@ for bot_id in $BOTS; do
       missing:    ($missingList    | to_arr),
       extras:     ($extrasList     | to_arr),
       violations: (
-        ($forbiddenPresent | to_arr | map({kind: "forbidden_schema", schema: ., impact: -10}))
+        ($forbiddenPresent | to_arr | map({kind: "forbidden_schema", schema: ., impact: -10, confidence: "high"}))
         + (if $validPenalty > 0
-             then [{kind: "invalid_jsonld", count: $invalidCount, impact: (0 - $validPenalty)}]
+             then [{kind: "invalid_jsonld", count: $invalidCount, impact: (0 - $validPenalty), confidence: "high"}]
              else []
            end)
         + $fieldViolations
@@ -507,20 +521,24 @@ for bot_id in $BOTS; do
   # --- Category 4: Technical Signals (0-100) ---
   TECHNICAL=0
-  TITLE=$(jget "$META" '.title' "")
-  DESCRIPTION=$(jget "$META" '.description' "")
-  CANONICAL=$(jget "$META" '.canonical' "")
-  OG_TITLE=$(jget "$META" '.og.title' "")
-  OG_DESC=$(jget "$META" '.og.description' "")
-  [ -n "$TITLE" ] && [ "$TITLE" != "null" ] && TECHNICAL=$((TECHNICAL + 25))
-  [ -n "$DESCRIPTION" ] && [ "$DESCRIPTION" != "null" ] && TECHNICAL=$((TECHNICAL + 25))
-  [ -n "$CANONICAL" ] && [ "$CANONICAL" != "null" ] && TECHNICAL=$((TECHNICAL + 20))
-  if [ -n "$OG_TITLE" ] && [ "$OG_TITLE" != "null" ]; then TECHNICAL=$((TECHNICAL + 8)); fi
-  if [ -n "$OG_DESC" ] && [ "$OG_DESC" != "null" ]; then TECHNICAL=$((TECHNICAL + 7)); fi
-  SITEMAP_EXISTS=$(jget_bool "$SITEMAP_FILE" '.exists')
-  SITEMAP_CONTAINS=$(jget_bool "$SITEMAP_FILE" '.containsTarget')
+  # Batch-read meta fields for technical scoring (1 jq call instead of 5).
+  # Fields are joined with the ASCII unit separator (0x1F) instead of a tab:
+  # tab is IFS whitespace, so `read` strips leading tabs and collapses runs of
+  # them, which would shift every value one variable to the left whenever an
+  # earlier field (e.g. the title) is empty. A non-whitespace delimiter keeps
+  # empty fields in their own variables. Separator, tab, CR and LF characters
+  # inside a value are replaced by a space so the record stays on one line.
+  # If jq fails, the substitution is empty and all five variables end up empty.
+  IFS=$'\037' read -r TITLE DESCRIPTION CANONICAL OG_TITLE OG_DESC <<< \
+    "$(jq -r '
+      def clean: (. // "") | tostring | gsub("[\u001f\t\r\n]"; " ");
+      [.title, .description, .canonical, .og.title, .og.description]
+      | map(clean) | join("\u001f")
+    ' "$META" 2>/dev/null || true)"
+  [ -n "$TITLE" ] && TECHNICAL=$((TECHNICAL + 25))
+  [ -n "$DESCRIPTION" ] && TECHNICAL=$((TECHNICAL + 25))
+  [ -n "$CANONICAL" ] && TECHNICAL=$((TECHNICAL + 20))
+  [ -n "$OG_TITLE" ] && TECHNICAL=$((TECHNICAL + 8))
+  [ -n "$OG_DESC" ] && TECHNICAL=$((TECHNICAL + 7))
+  SITEMAP_EXISTS=$(jq -r '.exists // false | tostring' "$SITEMAP_FILE" 2>/dev/null || echo "false")
+  SITEMAP_CONTAINS=$(jq -r '.containsTarget // false | tostring' "$SITEMAP_FILE" 2>/dev/null || echo "false")
   if [ "$SITEMAP_EXISTS" = "true" ] && [ "$SITEMAP_CONTAINS" = "true" ]; then
     TECHNICAL=$((TECHNICAL + 15))
   elif [ "$SITEMAP_EXISTS" = "true" ]; then
@@ -529,10 +547,14 @@ for bot_id in $BOTS; do
   # --- Category 5: AI Readiness (0-100) ---
   AI=0
-  LLMS_EXISTS=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.exists')
-  LLMS_HAS_TITLE=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.hasTitle')
-  LLMS_HAS_DESC=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.hasDescription')
-  LLMS_URLS=$(jget_num "$LLMSTXT_FILE" '.llmsTxt.urlCount')
+  # Batch-read llmstxt fields — use top-level exists (M1) which covers both variants
+  read -r LLMS_EXISTS LLMS_HAS_TITLE LLMS_HAS_DESC LLMS_URLS <<< \
+    "$(jq -r '[
+      (.exists // (.llmsTxt.exists or .llmsFullTxt.exists) | tostring),
+      ((.llmsTxt.hasTitle // .llmsFullTxt.hasTitle // false) | tostring),
+      ((.llmsTxt.hasDescription // .llmsFullTxt.hasDescription // false) | tostring),
+      ((.llmsTxt.urlCount // 0) + (.llmsFullTxt.urlCount // 0))
+    ] | @tsv' "$LLMSTXT_FILE" 2>/dev/null || echo "false	false	false	0")"
   if [ "$LLMS_EXISTS" = "true" ]; then
     AI=$((AI + 40))
@@ -541,7 +563,7 @@ for bot_id in $BOTS; do
     [ "$LLMS_URLS" -ge 1 ] && AI=$((AI + 6))
   fi
   [ "$EFFECTIVE_WORD_COUNT" -ge 200 ] && AI=$((AI + 20))
-  if [ "$H1_COUNT" -ge 1 ] && [ -n "$DESCRIPTION" ] && [ "$DESCRIPTION" != "null" ]; then
+  if [ "$H1_COUNT" -ge 1 ] && [ -n "$DESCRIPTION" ]; then
     AI=$((AI + 20))
   fi

package/skills/crawl-sim/scripts/extract-links.sh CHANGED Viewed

@@ -93,11 +93,9 @@ jq -n \
   --argjson internalSample "$INTERNAL_SAMPLE" \
   --argjson externalSample "$EXTERNAL_SAMPLE" \
   '{
-    counts: {
-      internal: $internalCount,
-      external: $externalCount,
-      total: ($internalCount + $externalCount)
-    },
-    internal: $internalSample,
-    external: $externalSample
+    total: ($internalCount + $externalCount),
+    internal: $internalCount,
+    external: $externalCount,
+    internalUrls: $internalSample,
+    externalUrls: $externalSample
   }'