@braedenbuilds/crawl-sim 1.0.5 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. package/.claude-plugin/marketplace.json +15 -0
  2. package/.claude-plugin/plugin.json +13 -0
  3. package/README.md +32 -9
  4. package/bin/install.js +6 -2
  5. package/package.json +8 -3
  6. package/{SKILL.md → skills/crawl-sim/SKILL.md} +23 -2
  7. package/{scripts → skills/crawl-sim/scripts}/_lib.sh +30 -0
  8. package/skills/crawl-sim/scripts/compute-score.sh +744 -0
  9. package/{scripts → skills/crawl-sim/scripts}/extract-jsonld.sh +12 -0
  10. package/skills/crawl-sim/scripts/fetch-as-bot.sh +151 -0
  11. package/skills/crawl-sim/scripts/schema-fields.sh +25 -0
  12. package/scripts/compute-score.sh +0 -424
  13. package/scripts/fetch-as-bot.sh +0 -87
  14. package/{profiles → skills/crawl-sim/profiles}/chatgpt-user.json +0 -0
  15. package/{profiles → skills/crawl-sim/profiles}/claude-searchbot.json +0 -0
  16. package/{profiles → skills/crawl-sim/profiles}/claude-user.json +0 -0
  17. package/{profiles → skills/crawl-sim/profiles}/claudebot.json +0 -0
  18. package/{profiles → skills/crawl-sim/profiles}/googlebot.json +0 -0
  19. package/{profiles → skills/crawl-sim/profiles}/gptbot.json +0 -0
  20. package/{profiles → skills/crawl-sim/profiles}/oai-searchbot.json +0 -0
  21. package/{profiles → skills/crawl-sim/profiles}/perplexity-user.json +0 -0
  22. package/{profiles → skills/crawl-sim/profiles}/perplexitybot.json +0 -0
  23. package/{scripts → skills/crawl-sim/scripts}/check-llmstxt.sh +0 -0
  24. package/{scripts → skills/crawl-sim/scripts}/check-robots.sh +0 -0
  25. package/{scripts → skills/crawl-sim/scripts}/check-sitemap.sh +0 -0
  26. package/{scripts → skills/crawl-sim/scripts}/diff-render.sh +0 -0
  27. package/{scripts → skills/crawl-sim/scripts}/extract-links.sh +0 -0
  28. package/{scripts → skills/crawl-sim/scripts}/extract-meta.sh +0 -0
@@ -62,6 +62,7 @@ fi
62
62
 
63
63
  VALID_COUNT=0
64
64
  INVALID_COUNT=0
65
+ BLOCKS_JSON="[]"
65
66
 
66
67
  if [ "$BLOCK_COUNT" -gt 0 ]; then
67
68
  while IFS= read -r block; do
@@ -79,6 +80,15 @@ if [ "$BLOCK_COUNT" -gt 0 ]; then
79
80
  else empty end;
80
81
  collect_types
81
82
  ' 2>/dev/null >> "$TYPES_FILE" || true
83
+
84
+ # Extract per-block type + top-level field names for field validation (AC-B1)
85
+ BLOCK_INFO=$(printf '%s' "$block" | jq -c '
86
+ {
87
+ type: (if has("@type") then (.["@type"] | if type == "array" then .[0] else . end) else "unknown" end),
88
+ fields: (keys | map(select(startswith("@") | not)))
89
+ }
90
+ ' 2>/dev/null || echo '{"type":"unknown","fields":[]}')
91
+ BLOCKS_JSON=$(printf '%s' "$BLOCKS_JSON" | jq --argjson b "$BLOCK_INFO" '. + [$b]')
82
92
  else
83
93
  INVALID_COUNT=$((INVALID_COUNT + 1))
84
94
  fi
@@ -109,6 +119,7 @@ jq -n \
109
119
  --argjson valid "$VALID_COUNT" \
110
120
  --argjson invalid "$INVALID_COUNT" \
111
121
  --argjson types "$TYPES_JSON" \
122
+ --argjson blocks "$BLOCKS_JSON" \
112
123
  --argjson hasOrg "$HAS_ORG" \
113
124
  --argjson hasBreadcrumb "$HAS_BREADCRUMB" \
114
125
  --argjson hasWebsite "$HAS_WEBSITE" \
@@ -121,6 +132,7 @@ jq -n \
121
132
  validCount: $valid,
122
133
  invalidCount: $invalid,
123
134
  types: $types,
135
+ blocks: $blocks,
124
136
  flags: {
125
137
  hasOrganization: $hasOrg,
126
138
  hasBreadcrumbList: $hasBreadcrumb,
@@ -0,0 +1,151 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ # fetch-as-bot.sh — Fetch a URL as a specific bot User-Agent
5
+ # Usage: fetch-as-bot.sh <url> <profile.json>
6
+
7
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
8
+ # shellcheck source=_lib.sh
9
+ . "$SCRIPT_DIR/_lib.sh"
10
+
11
+ URL="${1:?Usage: fetch-as-bot.sh <url> <profile.json>}"
12
+ PROFILE="${2:?Usage: fetch-as-bot.sh <url> <profile.json>}"
13
+
14
+ BOT_ID=$(jq -r '.id' "$PROFILE")
15
+ BOT_NAME=$(jq -r '.name' "$PROFILE")
16
+ UA=$(jq -r '.userAgent' "$PROFILE")
17
+ RENDERS_JS=$(jq -r '.rendersJavaScript' "$PROFILE")
18
+
19
+ TMPDIR="${TMPDIR:-/tmp}"
20
+ HEADERS_FILE=$(mktemp "$TMPDIR/crawlsim-headers.XXXXXX")
21
+ BODY_FILE=$(mktemp "$TMPDIR/crawlsim-body.XXXXXX")
22
+ CURL_STDERR_FILE=$(mktemp "$TMPDIR/crawlsim-stderr.XXXXXX")
23
+ trap 'rm -f "$HEADERS_FILE" "$BODY_FILE" "$CURL_STDERR_FILE"' EXIT
24
+
25
+ printf '[%s] fetching %s\n' "$BOT_ID" "$URL" >&2
26
+
27
+ set +e
28
+ TIMING=$(curl -sS -L \
29
+ -H "User-Agent: $UA" \
30
+ -D "$HEADERS_FILE" \
31
+ -o "$BODY_FILE" \
32
+ -w '{"total":%{time_total},"ttfb":%{time_starttransfer},"connect":%{time_connect},"statusCode":%{http_code},"sizeDownload":%{size_download},"redirectCount":%{num_redirects},"finalUrl":"%{url_effective}"}' \
33
+ --max-time 30 \
34
+ "$URL" 2>"$CURL_STDERR_FILE")
35
+ CURL_EXIT=$?
36
+ set -e
37
+
38
+ CURL_ERR=""
39
+ if [ -s "$CURL_STDERR_FILE" ]; then
40
+ CURL_ERR=$(cat "$CURL_STDERR_FILE")
41
+ fi
42
+
43
+ if [ "$CURL_EXIT" -ne 0 ]; then
44
+ printf '[%s] FAILED: curl exit %d — %s\n' "$BOT_ID" "$CURL_EXIT" "$CURL_ERR" >&2
45
+ jq -n \
46
+ --arg url "$URL" \
47
+ --arg botId "$BOT_ID" \
48
+ --arg botName "$BOT_NAME" \
49
+ --arg ua "$UA" \
50
+ --arg rendersJs "$RENDERS_JS" \
51
+ --arg error "$CURL_ERR" \
52
+ --argjson exitCode "$CURL_EXIT" \
53
+ '{
54
+ url: $url,
55
+ bot: {
56
+ id: $botId,
57
+ name: $botName,
58
+ userAgent: $ua,
59
+ rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
60
+ },
61
+ fetchFailed: true,
62
+ error: $error,
63
+ curlExitCode: $exitCode,
64
+ status: 0,
65
+ timing: { total: 0, ttfb: 0 },
66
+ size: 0,
67
+ wordCount: 0,
68
+ headers: {},
69
+ bodyBase64: ""
70
+ }'
71
+ exit 0
72
+ fi
73
+
74
+ read -r STATUS TOTAL_TIME TTFB SIZE REDIRECT_COUNT FINAL_URL <<< \
75
+ "$(echo "$TIMING" | jq -r '[.statusCode, .total, .ttfb, .sizeDownload, .redirectCount, .finalUrl] | @tsv')"
76
+
77
+ # Parse response headers into a JSON object using jq for safe escaping.
78
+ # curl -L writes multiple blocks on redirect; jq keeps the last definition
79
+ # of each header since `add` overwrites left-to-right.
80
+ HEADERS_JSON=$(tr -d '\r' < "$HEADERS_FILE" \
81
+ | grep -E '^[A-Za-z][A-Za-z0-9-]*:[[:space:]]' \
82
+ | jq -Rs '
83
+ split("\n")
84
+ | map(select(length > 0))
85
+ | map(capture("^(?<k>[^:]+):[[:space:]]*(?<v>.*)$"))
86
+ | map({(.k): .v})
87
+ | add // {}
88
+ ')
89
+
90
+ # Parse redirect chain from headers dump.
91
+ # curl -D writes multiple HTTP response blocks on redirect — each starts with HTTP/.
92
+ REDIRECT_CHAIN="[]"
93
+ if [ "$REDIRECT_COUNT" -gt 0 ]; then
94
+ REDIRECT_CHAIN=$(tr -d '\r' < "$HEADERS_FILE" | awk '
95
+ /^HTTP\// { status=$2; url="" }
96
+ /^[Ll]ocation:/ { url=$2 }
97
+ /^$/ && status && url { printf "%s %s\n", status, url; status=""; url="" }
98
+ ' | jq -Rs '
99
+ split("\n") | map(select(length > 0)) |
100
+ to_entries | map({
101
+ hop: .key,
102
+ status: (.value | split(" ")[0] | tonumber),
103
+ location: (.value | split(" ")[1:] | join(" "))
104
+ })
105
+ ')
106
+ fi
107
+
108
+ WORD_COUNT=$(count_words "$BODY_FILE")
109
+ [ -z "$WORD_COUNT" ] && WORD_COUNT=0
110
+
111
+ BODY_B64=""
112
+ if [ -s "$BODY_FILE" ]; then
113
+ BODY_B64=$(base64 < "$BODY_FILE")
114
+ fi
115
+
116
+ printf '[%s] ok: status=%s size=%s words=%s time=%ss\n' "$BOT_ID" "$STATUS" "$SIZE" "$WORD_COUNT" "$TOTAL_TIME" >&2
117
+
118
+ jq -n \
119
+ --arg url "$URL" \
120
+ --arg botId "$BOT_ID" \
121
+ --arg botName "$BOT_NAME" \
122
+ --arg ua "$UA" \
123
+ --arg rendersJs "$RENDERS_JS" \
124
+ --argjson status "$STATUS" \
125
+ --argjson totalTime "$TOTAL_TIME" \
126
+ --argjson ttfb "$TTFB" \
127
+ --argjson size "$SIZE" \
128
+ --argjson wordCount "$WORD_COUNT" \
129
+ --argjson headers "$HEADERS_JSON" \
130
+ --argjson redirectCount "$REDIRECT_COUNT" \
131
+ --arg finalUrl "$FINAL_URL" \
132
+ --argjson redirectChain "$REDIRECT_CHAIN" \
133
+ --arg bodyBase64 "$BODY_B64" \
134
+ '{
135
+ url: $url,
136
+ bot: {
137
+ id: $botId,
138
+ name: $botName,
139
+ userAgent: $ua,
140
+ rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
141
+ },
142
+ status: $status,
143
+ timing: { total: $totalTime, ttfb: $ttfb },
144
+ size: $size,
145
+ wordCount: $wordCount,
146
+ redirectCount: $redirectCount,
147
+ finalUrl: $finalUrl,
148
+ redirectChain: $redirectChain,
149
+ headers: $headers,
150
+ bodyBase64: $bodyBase64
151
+ }'
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env bash
2
+ # schema-fields.sh — Required field definitions per schema.org type.
3
+ # Source this file, then call required_fields_for <SchemaType>.
4
+
5
+ required_fields_for() {
6
+ case "$1" in
7
+ Organization) echo "name url" ;;
8
+ WebSite) echo "name url" ;;
9
+ Article) echo "headline author datePublished" ;;
10
+ NewsArticle) echo "headline author datePublished" ;;
11
+ FAQPage) echo "mainEntity" ;;
12
+ BreadcrumbList) echo "itemListElement" ;;
13
+ CollectionPage) echo "name" ;;
14
+ ItemList) echo "itemListElement" ;;
15
+ AboutPage) echo "name" ;;
16
+ ContactPage) echo "name" ;;
17
+ Product) echo "name" ;;
18
+ LocalBusiness) echo "name address" ;;
19
+ ProfessionalService) echo "name" ;;
20
+ Person) echo "name" ;;
21
+ ImageObject) echo "contentUrl" ;;
22
+ PostalAddress) echo "streetAddress" ;;
23
+ *) echo "" ;;
24
+ esac
25
+ }
@@ -1,424 +0,0 @@
1
- #!/usr/bin/env bash
2
- set -eu
3
-
4
- # compute-score.sh — Aggregate check outputs into per-bot + per-category scores
5
- # Usage: compute-score.sh <results-dir>
6
- # Output: JSON to stdout
7
- #
8
- # Expected filenames in <results-dir>:
9
- # fetch-<bot_id>.json — fetch-as-bot.sh output
10
- # meta-<bot_id>.json — extract-meta.sh output
11
- # jsonld-<bot_id>.json — extract-jsonld.sh output
12
- # links-<bot_id>.json — extract-links.sh output
13
- # robots-<bot_id>.json — check-robots.sh output
14
- # llmstxt.json — check-llmstxt.sh output (bot-independent)
15
- # sitemap.json — check-sitemap.sh output (bot-independent)
16
- # diff-render.json — diff-render.sh output (optional, Googlebot only)
17
-
18
- RESULTS_DIR="${1:?Usage: compute-score.sh <results-dir>}"
19
- printf '[compute-score] aggregating %s\n' "$RESULTS_DIR" >&2
20
-
21
- if [ ! -d "$RESULTS_DIR" ]; then
22
- echo "Error: results dir not found: $RESULTS_DIR" >&2
23
- exit 1
24
- fi
25
-
26
- # Category weights (as percentages of per-bot composite)
27
- W_ACCESSIBILITY=25
28
- W_CONTENT=30
29
- W_STRUCTURED=20
30
- W_TECHNICAL=15
31
- W_AI=10
32
-
33
- # Overall composite weights (per bot)
34
- # Default: Googlebot 40, GPTBot 20, ClaudeBot 20, PerplexityBot 20
35
- overall_weight() {
36
- case "$1" in
37
- googlebot) echo 40 ;;
38
- gptbot) echo 20 ;;
39
- claudebot) echo 20 ;;
40
- perplexitybot) echo 20 ;;
41
- *) echo 0 ;;
42
- esac
43
- }
44
-
45
- # Grade from score (0-100)
46
- grade_for() {
47
- local s=$1
48
- if [ "$s" -ge 93 ]; then echo "A"
49
- elif [ "$s" -ge 90 ]; then echo "A-"
50
- elif [ "$s" -ge 87 ]; then echo "B+"
51
- elif [ "$s" -ge 83 ]; then echo "B"
52
- elif [ "$s" -ge 80 ]; then echo "B-"
53
- elif [ "$s" -ge 77 ]; then echo "C+"
54
- elif [ "$s" -ge 73 ]; then echo "C"
55
- elif [ "$s" -ge 70 ]; then echo "C-"
56
- elif [ "$s" -ge 67 ]; then echo "D+"
57
- elif [ "$s" -ge 63 ]; then echo "D"
58
- elif [ "$s" -ge 60 ]; then echo "D-"
59
- else echo "F"
60
- fi
61
- }
62
-
63
- # Read a jq value from a file with a default fallback
64
- jget() {
65
- local file="$1"
66
- local query="$2"
67
- local default="${3:-null}"
68
- if [ -f "$file" ]; then
69
- jq -r --arg d "$default" "$query // \$d" "$file" 2>/dev/null || echo "$default"
70
- else
71
- echo "$default"
72
- fi
73
- }
74
-
75
- jget_num() {
76
- local v
77
- v=$(jget "$1" "$2" "0")
78
- # Replace "null" or non-numeric with 0
79
- if ! printf '%s' "$v" | grep -qE '^-?[0-9]+(\.[0-9]+)?$'; then
80
- echo "0"
81
- else
82
- echo "$v"
83
- fi
84
- }
85
-
86
- jget_bool() {
87
- local v
88
- v=$(jget "$1" "$2" "false")
89
- if [ "$v" = "true" ]; then echo "true"; else echo "false"; fi
90
- }
91
-
92
- BOTS=""
93
- for f in "$RESULTS_DIR"/fetch-*.json; do
94
- [ -f "$f" ] || continue
95
- bot_id=$(basename "$f" .json | sed 's/^fetch-//')
96
- BOTS="$BOTS $bot_id"
97
- done
98
-
99
- if [ -z "$BOTS" ]; then
100
- echo "Error: no fetch-*.json files found in $RESULTS_DIR" >&2
101
- exit 1
102
- fi
103
-
104
- LLMSTXT_FILE="$RESULTS_DIR/llmstxt.json"
105
- SITEMAP_FILE="$RESULTS_DIR/sitemap.json"
106
- DIFF_RENDER_FILE="$RESULTS_DIR/diff-render.json"
107
-
108
- # Load Playwright render-delta data once (used to differentiate JS-rendering
109
- # bots from non-rendering ones). If the comparison was skipped or missing,
110
- # all bots score against server HTML only.
111
- DIFF_AVAILABLE=false
112
- DIFF_RENDERED_WORDS=0
113
- DIFF_DELTA_PCT=0
114
- if [ -f "$DIFF_RENDER_FILE" ]; then
115
- # Explicit null check — `.skipped // true` would treat real false as null.
116
- DIFF_SKIPPED=$(jq -r '.skipped | if . == null then "true" else tostring end' "$DIFF_RENDER_FILE" 2>/dev/null || echo "true")
117
- if [ "$DIFF_SKIPPED" = "false" ]; then
118
- DIFF_AVAILABLE=true
119
- DIFF_RENDERED_WORDS=$(jq -r '.renderedWordCount // 0' "$DIFF_RENDER_FILE")
120
- DIFF_DELTA_PCT=$(jq -r '.deltaPct // 0' "$DIFF_RENDER_FILE")
121
- fi
122
- fi
123
-
124
- BOTS_JSON="{}"
125
-
126
- # Accumulators for per-category averages (across bots)
127
- CAT_ACCESSIBILITY_SUM=0
128
- CAT_CONTENT_SUM=0
129
- CAT_STRUCTURED_SUM=0
130
- CAT_TECHNICAL_SUM=0
131
- CAT_AI_SUM=0
132
- CAT_N=0
133
-
134
- # Accumulators for overall weighted composite
135
- OVERALL_WEIGHTED_SUM=0
136
- OVERALL_WEIGHT_TOTAL=0
137
-
138
- for bot_id in $BOTS; do
139
- FETCH="$RESULTS_DIR/fetch-$bot_id.json"
140
- META="$RESULTS_DIR/meta-$bot_id.json"
141
- JSONLD="$RESULTS_DIR/jsonld-$bot_id.json"
142
- LINKS="$RESULTS_DIR/links-$bot_id.json"
143
- ROBOTS="$RESULTS_DIR/robots-$bot_id.json"
144
-
145
- BOT_NAME=$(jget "$FETCH" '.bot.name' "$bot_id")
146
- STATUS=$(jget_num "$FETCH" '.status')
147
- TOTAL_TIME=$(jget_num "$FETCH" '.timing.total')
148
- SERVER_WORD_COUNT=$(jget_num "$FETCH" '.wordCount')
149
- # Read with explicit null fallback — jq's `//` is unsafe here because it
150
- # treats boolean false as falsy, which is exactly the value we need to see.
151
- RENDERS_JS=$(jq -r '.bot.rendersJavaScript | if . == null then "unknown" else tostring end' "$FETCH" 2>/dev/null || echo "unknown")
152
-
153
- ROBOTS_ALLOWED=$(jget_bool "$ROBOTS" '.allowed')
154
-
155
- # Effective word count depends on JS rendering capability:
156
- # - true (e.g. Googlebot) + diff-render data → rendered DOM word count
157
- # - false (AI training/search bots, observed) → server HTML only, with
158
- # penalty proportional to the rendering delta
159
- # - unknown → conservative: server HTML (same as false but no penalty)
160
- EFFECTIVE_WORD_COUNT=$SERVER_WORD_COUNT
161
- HYDRATION_PENALTY=0
162
- MISSED_WORDS=0
163
- if [ "$DIFF_AVAILABLE" = "true" ]; then
164
- if [ "$RENDERS_JS" = "true" ]; then
165
- EFFECTIVE_WORD_COUNT=$DIFF_RENDERED_WORDS
166
- elif [ "$RENDERS_JS" = "false" ]; then
167
- # Absolute-value delta: if rendered DOM has materially more than server,
168
- # AI bots are missing that content.
169
- ABS_DELTA=$(awk -v d="$DIFF_DELTA_PCT" 'BEGIN { printf "%d", (d < 0 ? -d : d) + 0.5 }')
170
- if [ "$ABS_DELTA" -gt 5 ]; then
171
- # Scale penalty: 5% delta = 0, 10% = 5, 20%+ = 15 (cap)
172
- HYDRATION_PENALTY=$(awk -v d="$ABS_DELTA" 'BEGIN {
173
- p = (d - 5)
174
- if (p > 15) p = 15
175
- printf "%d", p
176
- }')
177
- fi
178
- MISSED_WORDS=$((DIFF_RENDERED_WORDS - SERVER_WORD_COUNT))
179
- [ "$MISSED_WORDS" -lt 0 ] && MISSED_WORDS=0
180
- fi
181
- fi
182
-
183
- # --- Category 1: Accessibility (0-100) ---
184
- ACC=0
185
- # robots.txt allows: 40
186
- [ "$ROBOTS_ALLOWED" = "true" ] && ACC=$((ACC + 40))
187
- # HTTP 200: 40
188
- [ "$STATUS" = "200" ] && ACC=$((ACC + 40))
189
- # Response time: <2s = 20, <5s = 10, else 0
190
- TIME_SCORE=$(awk -v t="$TOTAL_TIME" 'BEGIN { if (t < 2) print 20; else if (t < 5) print 10; else print 0 }')
191
- ACC=$((ACC + TIME_SCORE))
192
-
193
- # --- Category 2: Content Visibility (0-100) ---
194
- CONTENT=0
195
- if [ "$EFFECTIVE_WORD_COUNT" -ge 300 ]; then CONTENT=$((CONTENT + 30))
196
- elif [ "$EFFECTIVE_WORD_COUNT" -ge 150 ]; then CONTENT=$((CONTENT + 20))
197
- elif [ "$EFFECTIVE_WORD_COUNT" -ge 50 ]; then CONTENT=$((CONTENT + 10))
198
- fi
199
-
200
- H1_COUNT=$(jget_num "$META" '.headings.h1.count')
201
- H2_COUNT=$(jget_num "$META" '.headings.h2.count')
202
- [ "$H1_COUNT" -ge 1 ] && CONTENT=$((CONTENT + 20))
203
- [ "$H2_COUNT" -ge 1 ] && CONTENT=$((CONTENT + 15))
204
-
205
- INTERNAL_LINKS=$(jget_num "$LINKS" '.counts.internal')
206
- if [ "$INTERNAL_LINKS" -ge 5 ]; then CONTENT=$((CONTENT + 20))
207
- elif [ "$INTERNAL_LINKS" -ge 1 ]; then CONTENT=$((CONTENT + 10))
208
- fi
209
-
210
- IMG_TOTAL=$(jget_num "$META" '.images.total')
211
- IMG_WITH_ALT=$(jget_num "$META" '.images.withAlt')
212
- if [ "$IMG_TOTAL" -eq 0 ]; then
213
- CONTENT=$((CONTENT + 15))
214
- else
215
- ALT_SCORE=$(awk -v a="$IMG_WITH_ALT" -v t="$IMG_TOTAL" 'BEGIN { printf "%d", (a / t) * 15 }')
216
- CONTENT=$((CONTENT + ALT_SCORE))
217
- fi
218
-
219
- # Apply hydration penalty for non-rendering bots that are missing content
220
- CONTENT=$((CONTENT - HYDRATION_PENALTY))
221
- [ $CONTENT -lt 0 ] && CONTENT=0
222
-
223
- # --- Category 3: Structured Data (0-100) ---
224
- STRUCTURED=0
225
- JSONLD_COUNT=$(jget_num "$JSONLD" '.blockCount')
226
- JSONLD_VALID=$(jget_num "$JSONLD" '.validCount')
227
- JSONLD_INVALID=$(jget_num "$JSONLD" '.invalidCount')
228
- HAS_ORG=$(jget_bool "$JSONLD" '.flags.hasOrganization')
229
- HAS_WEBSITE=$(jget_bool "$JSONLD" '.flags.hasWebSite')
230
- HAS_BREADCRUMB=$(jget_bool "$JSONLD" '.flags.hasBreadcrumbList')
231
- HAS_ARTICLE=$(jget_bool "$JSONLD" '.flags.hasArticle')
232
- HAS_PRODUCT=$(jget_bool "$JSONLD" '.flags.hasProduct')
233
- HAS_FAQ=$(jget_bool "$JSONLD" '.flags.hasFAQPage')
234
-
235
- [ "$JSONLD_COUNT" -ge 1 ] && STRUCTURED=$((STRUCTURED + 30))
236
- if [ "$JSONLD_COUNT" -ge 1 ] && [ "$JSONLD_INVALID" -eq 0 ]; then
237
- STRUCTURED=$((STRUCTURED + 20))
238
- fi
239
- if [ "$HAS_ORG" = "true" ] || [ "$HAS_WEBSITE" = "true" ]; then
240
- STRUCTURED=$((STRUCTURED + 20))
241
- fi
242
- [ "$HAS_BREADCRUMB" = "true" ] && STRUCTURED=$((STRUCTURED + 15))
243
- if [ "$HAS_ARTICLE" = "true" ] || [ "$HAS_PRODUCT" = "true" ] || [ "$HAS_FAQ" = "true" ]; then
244
- STRUCTURED=$((STRUCTURED + 15))
245
- fi
246
-
247
- # --- Category 4: Technical Signals (0-100) ---
248
- TECHNICAL=0
249
- TITLE=$(jget "$META" '.title' "")
250
- DESCRIPTION=$(jget "$META" '.description' "")
251
- CANONICAL=$(jget "$META" '.canonical' "")
252
- OG_TITLE=$(jget "$META" '.og.title' "")
253
- OG_DESC=$(jget "$META" '.og.description' "")
254
-
255
- [ -n "$TITLE" ] && [ "$TITLE" != "null" ] && TECHNICAL=$((TECHNICAL + 25))
256
- [ -n "$DESCRIPTION" ] && [ "$DESCRIPTION" != "null" ] && TECHNICAL=$((TECHNICAL + 25))
257
- [ -n "$CANONICAL" ] && [ "$CANONICAL" != "null" ] && TECHNICAL=$((TECHNICAL + 20))
258
- if [ -n "$OG_TITLE" ] && [ "$OG_TITLE" != "null" ]; then TECHNICAL=$((TECHNICAL + 8)); fi
259
- if [ -n "$OG_DESC" ] && [ "$OG_DESC" != "null" ]; then TECHNICAL=$((TECHNICAL + 7)); fi
260
-
261
- SITEMAP_EXISTS=$(jget_bool "$SITEMAP_FILE" '.exists')
262
- SITEMAP_CONTAINS=$(jget_bool "$SITEMAP_FILE" '.containsTarget')
263
- if [ "$SITEMAP_EXISTS" = "true" ] && [ "$SITEMAP_CONTAINS" = "true" ]; then
264
- TECHNICAL=$((TECHNICAL + 15))
265
- elif [ "$SITEMAP_EXISTS" = "true" ]; then
266
- TECHNICAL=$((TECHNICAL + 10))
267
- fi
268
-
269
- # --- Category 5: AI Readiness (0-100) ---
270
- AI=0
271
- LLMS_EXISTS=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.exists')
272
- LLMS_HAS_TITLE=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.hasTitle')
273
- LLMS_HAS_DESC=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.hasDescription')
274
- LLMS_URLS=$(jget_num "$LLMSTXT_FILE" '.llmsTxt.urlCount')
275
-
276
- if [ "$LLMS_EXISTS" = "true" ]; then
277
- AI=$((AI + 40))
278
- [ "$LLMS_HAS_TITLE" = "true" ] && AI=$((AI + 7))
279
- [ "$LLMS_HAS_DESC" = "true" ] && AI=$((AI + 7))
280
- [ "$LLMS_URLS" -ge 1 ] && AI=$((AI + 6))
281
- fi
282
- # Content citable (>= 200 words, effective for this bot)
283
- [ "$EFFECTIVE_WORD_COUNT" -ge 200 ] && AI=$((AI + 20))
284
- # Semantic clarity: has H1 + description
285
- if [ "$H1_COUNT" -ge 1 ] && [ -n "$DESCRIPTION" ] && [ "$DESCRIPTION" != "null" ]; then
286
- AI=$((AI + 20))
287
- fi
288
-
289
- # Cap categories at 100
290
- [ $ACC -gt 100 ] && ACC=100
291
- [ $CONTENT -gt 100 ] && CONTENT=100
292
- [ $STRUCTURED -gt 100 ] && STRUCTURED=100
293
- [ $TECHNICAL -gt 100 ] && TECHNICAL=100
294
- [ $AI -gt 100 ] && AI=100
295
-
296
- # Per-bot composite score (weighted average of 5 categories)
297
- BOT_SCORE=$(awk -v a=$ACC -v c=$CONTENT -v s=$STRUCTURED -v t=$TECHNICAL -v ai=$AI \
298
- -v wa=$W_ACCESSIBILITY -v wc=$W_CONTENT -v ws=$W_STRUCTURED -v wt=$W_TECHNICAL -v wai=$W_AI \
299
- 'BEGIN { printf "%d", (a*wa + c*wc + s*ws + t*wt + ai*wai) / (wa+wc+ws+wt+wai) + 0.5 }')
300
-
301
- BOT_GRADE=$(grade_for "$BOT_SCORE")
302
- ACC_GRADE=$(grade_for "$ACC")
303
- CONTENT_GRADE=$(grade_for "$CONTENT")
304
- STRUCTURED_GRADE=$(grade_for "$STRUCTURED")
305
- TECHNICAL_GRADE=$(grade_for "$TECHNICAL")
306
- AI_GRADE=$(grade_for "$AI")
307
-
308
- BOT_OBJ=$(jq -n \
309
- --arg id "$bot_id" \
310
- --arg name "$BOT_NAME" \
311
- --arg rendersJs "$RENDERS_JS" \
312
- --argjson score "$BOT_SCORE" \
313
- --arg grade "$BOT_GRADE" \
314
- --argjson acc "$ACC" \
315
- --arg accGrade "$ACC_GRADE" \
316
- --argjson content "$CONTENT" \
317
- --arg contentGrade "$CONTENT_GRADE" \
318
- --argjson structured "$STRUCTURED" \
319
- --arg structuredGrade "$STRUCTURED_GRADE" \
320
- --argjson technical "$TECHNICAL" \
321
- --arg technicalGrade "$TECHNICAL_GRADE" \
322
- --argjson ai "$AI" \
323
- --arg aiGrade "$AI_GRADE" \
324
- --argjson serverWords "$SERVER_WORD_COUNT" \
325
- --argjson effectiveWords "$EFFECTIVE_WORD_COUNT" \
326
- --argjson missedWords "$MISSED_WORDS" \
327
- --argjson hydrationPenalty "$HYDRATION_PENALTY" \
328
- '{
329
- id: $id,
330
- name: $name,
331
- rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end),
332
- score: $score,
333
- grade: $grade,
334
- visibility: {
335
- serverWords: $serverWords,
336
- effectiveWords: $effectiveWords,
337
- missedWordsVsRendered: $missedWords,
338
- hydrationPenaltyPts: $hydrationPenalty
339
- },
340
- categories: {
341
- accessibility: { score: $acc, grade: $accGrade },
342
- contentVisibility: { score: $content, grade: $contentGrade },
343
- structuredData: { score: $structured, grade: $structuredGrade },
344
- technicalSignals: { score: $technical, grade: $technicalGrade },
345
- aiReadiness: { score: $ai, grade: $aiGrade }
346
- }
347
- }')
348
-
349
- BOTS_JSON=$(printf '%s' "$BOTS_JSON" | jq --argjson bot "$BOT_OBJ" --arg id "$bot_id" '.[$id] = $bot')
350
-
351
- # Accumulate category averages
352
- CAT_ACCESSIBILITY_SUM=$((CAT_ACCESSIBILITY_SUM + ACC))
353
- CAT_CONTENT_SUM=$((CAT_CONTENT_SUM + CONTENT))
354
- CAT_STRUCTURED_SUM=$((CAT_STRUCTURED_SUM + STRUCTURED))
355
- CAT_TECHNICAL_SUM=$((CAT_TECHNICAL_SUM + TECHNICAL))
356
- CAT_AI_SUM=$((CAT_AI_SUM + AI))
357
- CAT_N=$((CAT_N + 1))
358
-
359
- # Accumulate weighted overall
360
- W=$(overall_weight "$bot_id")
361
- if [ "$W" -gt 0 ]; then
362
- OVERALL_WEIGHTED_SUM=$((OVERALL_WEIGHTED_SUM + BOT_SCORE * W))
363
- OVERALL_WEIGHT_TOTAL=$((OVERALL_WEIGHT_TOTAL + W))
364
- fi
365
- done
366
-
367
- # Per-category averages (across all bots)
368
- CAT_ACC_AVG=$((CAT_ACCESSIBILITY_SUM / CAT_N))
369
- CAT_CONTENT_AVG=$((CAT_CONTENT_SUM / CAT_N))
370
- CAT_STRUCTURED_AVG=$((CAT_STRUCTURED_SUM / CAT_N))
371
- CAT_TECHNICAL_AVG=$((CAT_TECHNICAL_SUM / CAT_N))
372
- CAT_AI_AVG=$((CAT_AI_SUM / CAT_N))
373
-
374
- # Overall composite
375
- if [ "$OVERALL_WEIGHT_TOTAL" -gt 0 ]; then
376
- OVERALL_SCORE=$((OVERALL_WEIGHTED_SUM / OVERALL_WEIGHT_TOTAL))
377
- else
378
- # Fall back to simple average if none of the 4 standard bots are present
379
- OVERALL_SCORE=$(((CAT_ACC_AVG + CAT_CONTENT_AVG + CAT_STRUCTURED_AVG + CAT_TECHNICAL_AVG + CAT_AI_AVG) / 5))
380
- fi
381
-
382
- OVERALL_GRADE=$(grade_for "$OVERALL_SCORE")
383
- CAT_ACC_GRADE=$(grade_for "$CAT_ACC_AVG")
384
- CAT_CONTENT_GRADE=$(grade_for "$CAT_CONTENT_AVG")
385
- CAT_STRUCTURED_GRADE=$(grade_for "$CAT_STRUCTURED_AVG")
386
- CAT_TECHNICAL_GRADE=$(grade_for "$CAT_TECHNICAL_AVG")
387
- CAT_AI_GRADE=$(grade_for "$CAT_AI_AVG")
388
-
389
- # Get the URL from the first fetch file
390
- FIRST_FETCH=$(ls "$RESULTS_DIR"/fetch-*.json | head -1)
391
- TARGET_URL=$(jget "$FIRST_FETCH" '.url' "")
392
- TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
393
-
394
- jq -n \
395
- --arg url "$TARGET_URL" \
396
- --arg timestamp "$TIMESTAMP" \
397
- --arg version "0.1.0" \
398
- --argjson overallScore "$OVERALL_SCORE" \
399
- --arg overallGrade "$OVERALL_GRADE" \
400
- --argjson bots "$BOTS_JSON" \
401
- --argjson catAcc "$CAT_ACC_AVG" \
402
- --arg catAccGrade "$CAT_ACC_GRADE" \
403
- --argjson catContent "$CAT_CONTENT_AVG" \
404
- --arg catContentGrade "$CAT_CONTENT_GRADE" \
405
- --argjson catStructured "$CAT_STRUCTURED_AVG" \
406
- --arg catStructuredGrade "$CAT_STRUCTURED_GRADE" \
407
- --argjson catTechnical "$CAT_TECHNICAL_AVG" \
408
- --arg catTechnicalGrade "$CAT_TECHNICAL_GRADE" \
409
- --argjson catAi "$CAT_AI_AVG" \
410
- --arg catAiGrade "$CAT_AI_GRADE" \
411
- '{
412
- url: $url,
413
- timestamp: $timestamp,
414
- version: $version,
415
- overall: { score: $overallScore, grade: $overallGrade },
416
- bots: $bots,
417
- categories: {
418
- accessibility: { score: $catAcc, grade: $catAccGrade },
419
- contentVisibility: { score: $catContent, grade: $catContentGrade },
420
- structuredData: { score: $catStructured, grade: $catStructuredGrade },
421
- technicalSignals: { score: $catTechnical, grade: $catTechnicalGrade },
422
- aiReadiness: { score: $catAi, grade: $catAiGrade }
423
- }
424
- }'