@braedenbuilds/crawl-sim 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,79 @@
1
#!/usr/bin/env bash
set -eu

# check-sitemap.sh — Fetch sitemap.xml, check URL inclusion and structure
# Usage: check-sitemap.sh <url>
# Output: a single JSON object on stdout; diagnostics go to stderr.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=_lib.sh
. "$SCRIPT_DIR/_lib.sh"   # provides origin_from_url and fetch_to_file

URL="${1:?Usage: check-sitemap.sh <url>}"
printf '[check-sitemap] %s\n' "$URL" >&2
ORIGIN=$(origin_from_url "$URL")
SITEMAP_URL="${ORIGIN}/sitemap.xml"

TMPDIR="${TMPDIR:-/tmp}"
SITEMAP_FILE=$(mktemp "$TMPDIR/crawlsim-sitemap.XXXXXX")
trap 'rm -f "$SITEMAP_FILE"' EXIT

# FIX: a network failure in fetch_to_file would previously abort the whole
# script under `set -e` instead of reporting a missing sitemap. Fall back to
# status "000" (curl's convention for "no response") so we still emit JSON
# with exists=false.
HTTP_STATUS=$(fetch_to_file "$SITEMAP_URL" "$SITEMAP_FILE") || HTTP_STATUS="000"

EXISTS=false
URL_COUNT=0
CONTAINS_TARGET=false
HAS_LASTMOD=false
IS_INDEX=false
CHILD_SITEMAP_COUNT=0

if [ "$HTTP_STATUS" = "200" ] && [ -s "$SITEMAP_FILE" ]; then
  # Check if content looks like XML, not an HTML fallback page (some servers
  # answer 200 with their app shell for any path, including /sitemap.xml).
  FIRST_BYTES=$(head -c 200 "$SITEMAP_FILE" | tr '[:upper:]' '[:lower:]')
  case "$FIRST_BYTES" in
    *"<!doctype html"*|*"<html"*) ;;
    *)
      EXISTS=true

      # Is this a sitemap index (a sitemap of sitemaps)?
      if grep -qi '<sitemapindex' "$SITEMAP_FILE"; then
        IS_INDEX=true
        # grep -o | wc -l counts occurrences even when several tags share a line.
        CHILD_SITEMAP_COUNT=$(grep -oE '<sitemap>' "$SITEMAP_FILE" | wc -l | tr -d ' ')
      fi

      # Count <loc> tags (URLs, or child sitemaps in an index)
      URL_COUNT=$(grep -oE '<loc>' "$SITEMAP_FILE" | wc -l | tr -d ' ')

      # Check if target URL appears anywhere in the sitemap.
      # Match both with and without trailing slash; the closing '<' anchors
      # the match to the end of a tag body so prefixes don't false-positive.
      URL_NO_TRAILING=$(printf '%s' "$URL" | sed -E 's#/$##')
      if grep -qF "$URL_NO_TRAILING<" "$SITEMAP_FILE" || grep -qF "${URL_NO_TRAILING}/<" "$SITEMAP_FILE"; then
        CONTAINS_TARGET=true
      fi

      # Has lastmod dates?
      if grep -qi '<lastmod>' "$SITEMAP_FILE"; then
        HAS_LASTMOD=true
      fi
      ;;
  esac
fi

jq -n \
  --arg url "$URL" \
  --arg sitemapUrl "$SITEMAP_URL" \
  --argjson exists "$EXISTS" \
  --argjson isIndex "$IS_INDEX" \
  --argjson urlCount "$URL_COUNT" \
  --argjson childSitemapCount "$CHILD_SITEMAP_COUNT" \
  --argjson containsTarget "$CONTAINS_TARGET" \
  --argjson hasLastmod "$HAS_LASTMOD" \
  '{
    url: $url,
    sitemapUrl: $sitemapUrl,
    exists: $exists,
    isIndex: $isIndex,
    urlCount: $urlCount,
    childSitemapCount: $childSitemapCount,
    containsTarget: $containsTarget,
    hasLastmod: $hasLastmod
  }'
@@ -0,0 +1,424 @@
1
#!/usr/bin/env bash
set -eu

# compute-score.sh — Aggregate check outputs into per-bot + per-category scores
# Usage: compute-score.sh <results-dir>
# Output: JSON to stdout
#
# Expected filenames in <results-dir>:
# fetch-<bot_id>.json — fetch-as-bot.sh output
# meta-<bot_id>.json — extract-meta.sh output
# jsonld-<bot_id>.json — extract-jsonld.sh output
# links-<bot_id>.json — extract-links.sh output
# robots-<bot_id>.json — check-robots.sh output
# llmstxt.json — check-llmstxt.sh output (bot-independent)
# sitemap.json — check-sitemap.sh output (bot-independent)
# diff-render.json — diff-render.sh output (optional, Googlebot only)

RESULTS_DIR="${1:?Usage: compute-score.sh <results-dir>}"
printf '[compute-score] aggregating %s\n' "$RESULTS_DIR" >&2

if [ ! -d "$RESULTS_DIR" ]; then
  echo "Error: results dir not found: $RESULTS_DIR" >&2
  exit 1
fi

# Category weights (as percentages of per-bot composite); they sum to 100,
# so the weighted average below stays on the 0-100 scale.
W_ACCESSIBILITY=25
W_CONTENT=30
W_STRUCTURED=20
W_TECHNICAL=15
W_AI=10
32
+
33
# Overall composite weights (per bot).
# Default: Googlebot 40, GPTBot 20, ClaudeBot 20, PerplexityBot 20.
# Any unrecognized bot id gets weight 0 and is excluded from the
# weighted overall average.
overall_weight() {
  local bot="$1"
  local weight=0
  case "$bot" in
    googlebot)                      weight=40 ;;
    gptbot|claudebot|perplexitybot) weight=20 ;;
  esac
  echo "$weight"
}
44
+
45
# Grade from score (0-100).
# Maps an integer score to a US letter grade via a cutoff table:
# >=93 A, >=90 A-, >=87 B+, ... >=60 D-, otherwise F.
grade_for() {
  local score=$1
  local entry cutoff letter
  for entry in \
      "93:A" "90:A-" "87:B+" "83:B" "80:B-" \
      "77:C+" "73:C" "70:C-" "67:D+" "63:D" "60:D-"; do
    cutoff=${entry%%:*}
    letter=${entry#*:}
    if [ "$score" -ge "$cutoff" ]; then
      echo "$letter"
      return 0
    fi
  done
  echo "F"
}
62
+
63
# Read a jq value from a file with a default fallback.
#   $1 - JSON file path
#   $2 - jq filter (e.g. '.bot.name')
#   $3 - default string, used when the file is missing, jq fails,
#        or the filter yields null (defaults to "null")
# NOTE(review): jq's `//` operator treats BOTH null and false as absent,
# so a literal `false` value falls through to the default. Callers that
# read booleans (jget_bool) pass "false" as the default, which keeps the
# result correct by coincidence; don't use jget directly for booleans
# with any other default.
jget() {
  local file="$1"
  local query="$2"
  local default="${3:-null}"
  if [ -f "$file" ]; then
    # jq errors (malformed JSON, bad filter) are silenced and mapped to
    # the default rather than aborting under `set -e`.
    jq -r --arg d "$default" "$query // \$d" "$file" 2>/dev/null || echo "$default"
  else
    echo "$default"
  fi
}
74
+
75
# Like jget, but guarantees a numeric result: any non-numeric value
# (including the literal string "null") collapses to 0. Accepts optional
# sign and decimal fraction.
jget_num() {
  local raw
  raw=$(jget "$1" "$2" "0")
  if [[ "$raw" =~ ^-?[0-9]+(\.[0-9]+)?$ ]]; then
    echo "$raw"
  else
    echo "0"
  fi
}
85
+
86
# Like jget, but normalizes the result to the literal strings "true" or
# "false". Anything that is not exactly "true" reads as "false".
jget_bool() {
  local value
  value=$(jget "$1" "$2" "false")
  case "$value" in
    true) echo "true" ;;
    *)    echo "false" ;;
  esac
}
91
+
92
# Discover which bots were simulated by listing fetch-*.json files.
# BOTS becomes a space-separated list of bot ids (safe to word-split below
# because ids come from our own filenames).
BOTS=""
for f in "$RESULTS_DIR"/fetch-*.json; do
  [ -f "$f" ] || continue   # unmatched glob stays literal; skip it
  bot_id=$(basename "$f" .json | sed 's/^fetch-//')
  BOTS="$BOTS $bot_id"
done

if [ -z "$BOTS" ]; then
  echo "Error: no fetch-*.json files found in $RESULTS_DIR" >&2
  exit 1
fi

# Bot-independent check outputs.
LLMSTXT_FILE="$RESULTS_DIR/llmstxt.json"
SITEMAP_FILE="$RESULTS_DIR/sitemap.json"
DIFF_RENDER_FILE="$RESULTS_DIR/diff-render.json"

# Load Playwright render-delta data once (used to differentiate JS-rendering
# bots from non-rendering ones). If the comparison was skipped or missing,
# all bots score against server HTML only.
DIFF_AVAILABLE=false
DIFF_RENDERED_WORDS=0
DIFF_DELTA_PCT=0
if [ -f "$DIFF_RENDER_FILE" ]; then
  # Explicit null check — `.skipped // true` would treat real false as null.
  DIFF_SKIPPED=$(jq -r '.skipped | if . == null then "true" else tostring end' "$DIFF_RENDER_FILE" 2>/dev/null || echo "true")
  if [ "$DIFF_SKIPPED" = "false" ]; then
    DIFF_AVAILABLE=true
    DIFF_RENDERED_WORDS=$(jq -r '.renderedWordCount // 0' "$DIFF_RENDER_FILE")
    DIFF_DELTA_PCT=$(jq -r '.deltaPct // 0' "$DIFF_RENDER_FILE")
  fi
fi

# Per-bot result objects accumulate into this JSON object, keyed by bot id.
BOTS_JSON="{}"

# Accumulators for per-category averages (across bots)
CAT_ACCESSIBILITY_SUM=0
CAT_CONTENT_SUM=0
CAT_STRUCTURED_SUM=0
CAT_TECHNICAL_SUM=0
CAT_AI_SUM=0
CAT_N=0

# Accumulators for overall weighted composite
OVERALL_WEIGHTED_SUM=0
OVERALL_WEIGHT_TOTAL=0
137
+
138
# Score each bot independently across the five categories, then fold the
# result into BOTS_JSON and the cross-bot accumulators.
for bot_id in $BOTS; do
  FETCH="$RESULTS_DIR/fetch-$bot_id.json"
  META="$RESULTS_DIR/meta-$bot_id.json"
  JSONLD="$RESULTS_DIR/jsonld-$bot_id.json"
  LINKS="$RESULTS_DIR/links-$bot_id.json"
  ROBOTS="$RESULTS_DIR/robots-$bot_id.json"

  BOT_NAME=$(jget "$FETCH" '.bot.name' "$bot_id")
  STATUS=$(jget_num "$FETCH" '.status')
  TOTAL_TIME=$(jget_num "$FETCH" '.timing.total')
  SERVER_WORD_COUNT=$(jget_num "$FETCH" '.wordCount')
  # Read with explicit null fallback — jq's `//` is unsafe here because it
  # treats boolean false as falsy, which is exactly the value we need to see.
  RENDERS_JS=$(jq -r '.bot.rendersJavaScript | if . == null then "unknown" else tostring end' "$FETCH" 2>/dev/null || echo "unknown")

  ROBOTS_ALLOWED=$(jget_bool "$ROBOTS" '.allowed')

  # Effective word count depends on JS rendering capability:
  #   - true (e.g. Googlebot) + diff-render data → rendered DOM word count
  #   - false (AI training/search bots, observed) → server HTML only, with
  #     penalty proportional to the rendering delta
  #   - unknown → conservative: server HTML (same as false but no penalty)
  EFFECTIVE_WORD_COUNT=$SERVER_WORD_COUNT
  HYDRATION_PENALTY=0
  MISSED_WORDS=0
  if [ "$DIFF_AVAILABLE" = "true" ]; then
    if [ "$RENDERS_JS" = "true" ]; then
      EFFECTIVE_WORD_COUNT=$DIFF_RENDERED_WORDS
    elif [ "$RENDERS_JS" = "false" ]; then
      # Absolute-value delta: if rendered DOM has materially more than server,
      # AI bots are missing that content. The +0.5 rounds to nearest integer.
      ABS_DELTA=$(awk -v d="$DIFF_DELTA_PCT" 'BEGIN { printf "%d", (d < 0 ? -d : d) + 0.5 }')
      if [ "$ABS_DELTA" -gt 5 ]; then
        # Scale penalty: 5% delta = 0, 10% = 5, 20%+ = 15 (cap)
        HYDRATION_PENALTY=$(awk -v d="$ABS_DELTA" 'BEGIN {
          p = (d - 5)
          if (p > 15) p = 15
          printf "%d", p
        }')
      fi
      MISSED_WORDS=$((DIFF_RENDERED_WORDS - SERVER_WORD_COUNT))
      [ "$MISSED_WORDS" -lt 0 ] && MISSED_WORDS=0
    fi
  fi

  # --- Category 1: Accessibility (0-100) ---
  ACC=0
  # robots.txt allows: 40
  [ "$ROBOTS_ALLOWED" = "true" ] && ACC=$((ACC + 40))
  # HTTP 200: 40
  [ "$STATUS" = "200" ] && ACC=$((ACC + 40))
  # Response time: <2s = 20, <5s = 10, else 0 (awk handles fractional seconds)
  TIME_SCORE=$(awk -v t="$TOTAL_TIME" 'BEGIN { if (t < 2) print 20; else if (t < 5) print 10; else print 0 }')
  ACC=$((ACC + TIME_SCORE))

  # --- Category 2: Content Visibility (0-100) ---
  CONTENT=0
  if [ "$EFFECTIVE_WORD_COUNT" -ge 300 ]; then CONTENT=$((CONTENT + 30))
  elif [ "$EFFECTIVE_WORD_COUNT" -ge 150 ]; then CONTENT=$((CONTENT + 20))
  elif [ "$EFFECTIVE_WORD_COUNT" -ge 50 ]; then CONTENT=$((CONTENT + 10))
  fi

  H1_COUNT=$(jget_num "$META" '.headings.h1.count')
  H2_COUNT=$(jget_num "$META" '.headings.h2.count')
  [ "$H1_COUNT" -ge 1 ] && CONTENT=$((CONTENT + 20))
  [ "$H2_COUNT" -ge 1 ] && CONTENT=$((CONTENT + 15))

  INTERNAL_LINKS=$(jget_num "$LINKS" '.counts.internal')
  if [ "$INTERNAL_LINKS" -ge 5 ]; then CONTENT=$((CONTENT + 20))
  elif [ "$INTERNAL_LINKS" -ge 1 ]; then CONTENT=$((CONTENT + 10))
  fi

  # Image alt coverage: full 15 points if there are no images at all,
  # otherwise proportional to the alt-text fraction (awk %d truncates).
  IMG_TOTAL=$(jget_num "$META" '.images.total')
  IMG_WITH_ALT=$(jget_num "$META" '.images.withAlt')
  if [ "$IMG_TOTAL" -eq 0 ]; then
    CONTENT=$((CONTENT + 15))
  else
    ALT_SCORE=$(awk -v a="$IMG_WITH_ALT" -v t="$IMG_TOTAL" 'BEGIN { printf "%d", (a / t) * 15 }')
    CONTENT=$((CONTENT + ALT_SCORE))
  fi

  # Apply hydration penalty for non-rendering bots that are missing content
  CONTENT=$((CONTENT - HYDRATION_PENALTY))
  [ $CONTENT -lt 0 ] && CONTENT=0

  # --- Category 3: Structured Data (0-100) ---
  STRUCTURED=0
  JSONLD_COUNT=$(jget_num "$JSONLD" '.blockCount')
  JSONLD_VALID=$(jget_num "$JSONLD" '.validCount')   # NOTE(review): currently unused
  JSONLD_INVALID=$(jget_num "$JSONLD" '.invalidCount')
  HAS_ORG=$(jget_bool "$JSONLD" '.flags.hasOrganization')
  HAS_WEBSITE=$(jget_bool "$JSONLD" '.flags.hasWebSite')
  HAS_BREADCRUMB=$(jget_bool "$JSONLD" '.flags.hasBreadcrumbList')
  HAS_ARTICLE=$(jget_bool "$JSONLD" '.flags.hasArticle')
  HAS_PRODUCT=$(jget_bool "$JSONLD" '.flags.hasProduct')
  HAS_FAQ=$(jget_bool "$JSONLD" '.flags.hasFAQPage')

  # 30 any JSON-LD, +20 all blocks valid, +20 site identity, +15 breadcrumbs,
  # +15 a content-type schema (Article/Product/FAQ).
  [ "$JSONLD_COUNT" -ge 1 ] && STRUCTURED=$((STRUCTURED + 30))
  if [ "$JSONLD_COUNT" -ge 1 ] && [ "$JSONLD_INVALID" -eq 0 ]; then
    STRUCTURED=$((STRUCTURED + 20))
  fi
  if [ "$HAS_ORG" = "true" ] || [ "$HAS_WEBSITE" = "true" ]; then
    STRUCTURED=$((STRUCTURED + 20))
  fi
  [ "$HAS_BREADCRUMB" = "true" ] && STRUCTURED=$((STRUCTURED + 15))
  if [ "$HAS_ARTICLE" = "true" ] || [ "$HAS_PRODUCT" = "true" ] || [ "$HAS_FAQ" = "true" ]; then
    STRUCTURED=$((STRUCTURED + 15))
  fi

  # --- Category 4: Technical Signals (0-100) ---
  TECHNICAL=0
  TITLE=$(jget "$META" '.title' "")
  DESCRIPTION=$(jget "$META" '.description' "")
  CANONICAL=$(jget "$META" '.canonical' "")
  OG_TITLE=$(jget "$META" '.og.title' "")
  OG_DESC=$(jget "$META" '.og.description' "")

  # jget may return the literal string "null"; treat that as missing.
  [ -n "$TITLE" ] && [ "$TITLE" != "null" ] && TECHNICAL=$((TECHNICAL + 25))
  [ -n "$DESCRIPTION" ] && [ "$DESCRIPTION" != "null" ] && TECHNICAL=$((TECHNICAL + 25))
  [ -n "$CANONICAL" ] && [ "$CANONICAL" != "null" ] && TECHNICAL=$((TECHNICAL + 20))
  if [ -n "$OG_TITLE" ] && [ "$OG_TITLE" != "null" ]; then TECHNICAL=$((TECHNICAL + 8)); fi
  if [ -n "$OG_DESC" ] && [ "$OG_DESC" != "null" ]; then TECHNICAL=$((TECHNICAL + 7)); fi

  SITEMAP_EXISTS=$(jget_bool "$SITEMAP_FILE" '.exists')
  SITEMAP_CONTAINS=$(jget_bool "$SITEMAP_FILE" '.containsTarget')
  if [ "$SITEMAP_EXISTS" = "true" ] && [ "$SITEMAP_CONTAINS" = "true" ]; then
    TECHNICAL=$((TECHNICAL + 15))
  elif [ "$SITEMAP_EXISTS" = "true" ]; then
    TECHNICAL=$((TECHNICAL + 10))
  fi

  # --- Category 5: AI Readiness (0-100) ---
  AI=0
  LLMS_EXISTS=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.exists')
  LLMS_HAS_TITLE=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.hasTitle')
  LLMS_HAS_DESC=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.hasDescription')
  LLMS_URLS=$(jget_num "$LLMSTXT_FILE" '.llmsTxt.urlCount')

  if [ "$LLMS_EXISTS" = "true" ]; then
    AI=$((AI + 40))
    [ "$LLMS_HAS_TITLE" = "true" ] && AI=$((AI + 7))
    [ "$LLMS_HAS_DESC" = "true" ] && AI=$((AI + 7))
    [ "$LLMS_URLS" -ge 1 ] && AI=$((AI + 6))
  fi
  # Content citable (>= 200 words, effective for this bot)
  [ "$EFFECTIVE_WORD_COUNT" -ge 200 ] && AI=$((AI + 20))
  # Semantic clarity: has H1 + description
  if [ "$H1_COUNT" -ge 1 ] && [ -n "$DESCRIPTION" ] && [ "$DESCRIPTION" != "null" ]; then
    AI=$((AI + 20))
  fi

  # Cap categories at 100
  [ $ACC -gt 100 ] && ACC=100
  [ $CONTENT -gt 100 ] && CONTENT=100
  [ $STRUCTURED -gt 100 ] && STRUCTURED=100
  [ $TECHNICAL -gt 100 ] && TECHNICAL=100
  [ $AI -gt 100 ] && AI=100

  # Per-bot composite score (weighted average of 5 categories; the +0.5
  # rounds to nearest integer before %d truncates)
  BOT_SCORE=$(awk -v a=$ACC -v c=$CONTENT -v s=$STRUCTURED -v t=$TECHNICAL -v ai=$AI \
    -v wa=$W_ACCESSIBILITY -v wc=$W_CONTENT -v ws=$W_STRUCTURED -v wt=$W_TECHNICAL -v wai=$W_AI \
    'BEGIN { printf "%d", (a*wa + c*wc + s*ws + t*wt + ai*wai) / (wa+wc+ws+wt+wai) + 0.5 }')

  BOT_GRADE=$(grade_for "$BOT_SCORE")
  ACC_GRADE=$(grade_for "$ACC")
  CONTENT_GRADE=$(grade_for "$CONTENT")
  STRUCTURED_GRADE=$(grade_for "$STRUCTURED")
  TECHNICAL_GRADE=$(grade_for "$TECHNICAL")
  AI_GRADE=$(grade_for "$AI")

  # Assemble this bot's JSON object. rendersJavaScript is re-typed back to
  # a real boolean when known; the string "unknown" passes through as-is.
  BOT_OBJ=$(jq -n \
    --arg id "$bot_id" \
    --arg name "$BOT_NAME" \
    --arg rendersJs "$RENDERS_JS" \
    --argjson score "$BOT_SCORE" \
    --arg grade "$BOT_GRADE" \
    --argjson acc "$ACC" \
    --arg accGrade "$ACC_GRADE" \
    --argjson content "$CONTENT" \
    --arg contentGrade "$CONTENT_GRADE" \
    --argjson structured "$STRUCTURED" \
    --arg structuredGrade "$STRUCTURED_GRADE" \
    --argjson technical "$TECHNICAL" \
    --arg technicalGrade "$TECHNICAL_GRADE" \
    --argjson ai "$AI" \
    --arg aiGrade "$AI_GRADE" \
    --argjson serverWords "$SERVER_WORD_COUNT" \
    --argjson effectiveWords "$EFFECTIVE_WORD_COUNT" \
    --argjson missedWords "$MISSED_WORDS" \
    --argjson hydrationPenalty "$HYDRATION_PENALTY" \
    '{
      id: $id,
      name: $name,
      rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end),
      score: $score,
      grade: $grade,
      visibility: {
        serverWords: $serverWords,
        effectiveWords: $effectiveWords,
        missedWordsVsRendered: $missedWords,
        hydrationPenaltyPts: $hydrationPenalty
      },
      categories: {
        accessibility: { score: $acc, grade: $accGrade },
        contentVisibility: { score: $content, grade: $contentGrade },
        structuredData: { score: $structured, grade: $structuredGrade },
        technicalSignals: { score: $technical, grade: $technicalGrade },
        aiReadiness: { score: $ai, grade: $aiGrade }
      }
    }')

  BOTS_JSON=$(printf '%s' "$BOTS_JSON" | jq --argjson bot "$BOT_OBJ" --arg id "$bot_id" '.[$id] = $bot')

  # Accumulate category averages
  CAT_ACCESSIBILITY_SUM=$((CAT_ACCESSIBILITY_SUM + ACC))
  CAT_CONTENT_SUM=$((CAT_CONTENT_SUM + CONTENT))
  CAT_STRUCTURED_SUM=$((CAT_STRUCTURED_SUM + STRUCTURED))
  CAT_TECHNICAL_SUM=$((CAT_TECHNICAL_SUM + TECHNICAL))
  CAT_AI_SUM=$((CAT_AI_SUM + AI))
  CAT_N=$((CAT_N + 1))

  # Accumulate weighted overall (bots with weight 0 are excluded)
  W=$(overall_weight "$bot_id")
  if [ "$W" -gt 0 ]; then
    OVERALL_WEIGHTED_SUM=$((OVERALL_WEIGHTED_SUM + BOT_SCORE * W))
    OVERALL_WEIGHT_TOTAL=$((OVERALL_WEIGHT_TOTAL + W))
  fi
done
366
+
367
# Per-category averages (across all bots). CAT_N >= 1 is guaranteed because
# the script exits earlier when no fetch-*.json files were found. Integer
# division truncates.
CAT_ACC_AVG=$((CAT_ACCESSIBILITY_SUM / CAT_N))
CAT_CONTENT_AVG=$((CAT_CONTENT_SUM / CAT_N))
CAT_STRUCTURED_AVG=$((CAT_STRUCTURED_SUM / CAT_N))
CAT_TECHNICAL_AVG=$((CAT_TECHNICAL_SUM / CAT_N))
CAT_AI_AVG=$((CAT_AI_SUM / CAT_N))

# Overall composite
if [ "$OVERALL_WEIGHT_TOTAL" -gt 0 ]; then
  OVERALL_SCORE=$((OVERALL_WEIGHTED_SUM / OVERALL_WEIGHT_TOTAL))
else
  # Fall back to simple average if none of the 4 standard bots are present
  OVERALL_SCORE=$(((CAT_ACC_AVG + CAT_CONTENT_AVG + CAT_STRUCTURED_AVG + CAT_TECHNICAL_AVG + CAT_AI_AVG) / 5))
fi

OVERALL_GRADE=$(grade_for "$OVERALL_SCORE")
CAT_ACC_GRADE=$(grade_for "$CAT_ACC_AVG")
CAT_CONTENT_GRADE=$(grade_for "$CAT_CONTENT_AVG")
CAT_STRUCTURED_GRADE=$(grade_for "$CAT_STRUCTURED_AVG")
CAT_TECHNICAL_GRADE=$(grade_for "$CAT_TECHNICAL_AVG")
CAT_AI_GRADE=$(grade_for "$CAT_AI_AVG")

# Get the URL from the first fetch file. FIX: use the glob directly instead
# of parsing `ls` output (fragile with unusual filenames; ShellCheck SC2012).
FIRST_FETCH=""
for f in "$RESULTS_DIR"/fetch-*.json; do
  [ -f "$f" ] || continue
  FIRST_FETCH="$f"
  break
done
TARGET_URL=$(jget "$FIRST_FETCH" '.url' "")
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

# Final report: overall composite, per-bot breakdown, and cross-bot
# per-category averages.
jq -n \
  --arg url "$TARGET_URL" \
  --arg timestamp "$TIMESTAMP" \
  --arg version "0.1.0" \
  --argjson overallScore "$OVERALL_SCORE" \
  --arg overallGrade "$OVERALL_GRADE" \
  --argjson bots "$BOTS_JSON" \
  --argjson catAcc "$CAT_ACC_AVG" \
  --arg catAccGrade "$CAT_ACC_GRADE" \
  --argjson catContent "$CAT_CONTENT_AVG" \
  --arg catContentGrade "$CAT_CONTENT_GRADE" \
  --argjson catStructured "$CAT_STRUCTURED_AVG" \
  --arg catStructuredGrade "$CAT_STRUCTURED_GRADE" \
  --argjson catTechnical "$CAT_TECHNICAL_AVG" \
  --arg catTechnicalGrade "$CAT_TECHNICAL_GRADE" \
  --argjson catAi "$CAT_AI_AVG" \
  --arg catAiGrade "$CAT_AI_GRADE" \
  '{
    url: $url,
    timestamp: $timestamp,
    version: $version,
    overall: { score: $overallScore, grade: $overallGrade },
    bots: $bots,
    categories: {
      accessibility: { score: $catAcc, grade: $catAccGrade },
      contentVisibility: { score: $catContent, grade: $catContentGrade },
      structuredData: { score: $catStructured, grade: $catStructuredGrade },
      technicalSignals: { score: $catTechnical, grade: $catTechnicalGrade },
      aiReadiness: { score: $catAi, grade: $catAiGrade }
    }
  }'
@@ -0,0 +1,136 @@
1
+ #!/usr/bin/env bash
2
+ set -eu
3
+
4
+ # diff-render.sh — Compare server HTML word count vs JS-rendered word count
5
+ # Usage: diff-render.sh <url>
6
+ # Requires Playwright. Gracefully outputs { skipped: true } if unavailable.
7
+
8
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
9
+ # shellcheck source=_lib.sh
10
+ . "$SCRIPT_DIR/_lib.sh"
11
+
12
+ URL="${1:?Usage: diff-render.sh <url>}"
13
+ printf '[diff-render] comparing server HTML vs Playwright render for %s\n' "$URL" >&2
14
+
15
# Emit a { skipped: true } result with null metrics and exit 0, so callers
# treat a missing Playwright/node environment as "no data" rather than an
# error. Reads the URL global; $1 is the human-readable reason.
emit_skipped() {
  local why="$1"
  jq -n --arg url "$URL" --arg reason "$why" \
    '{
      url: $url,
      skipped: true,
      reason: $reason,
      serverWordCount: null,
      renderedWordCount: null,
      deltaPct: null,
      significantDelta: null
    }'
  exit 0
}
31
+
32
# Check for Node.js
if ! command -v node >/dev/null 2>&1; then
  emit_skipped "node not installed"
fi

# Check for Playwright — try to require it from the current dir or globally.
# Both full 'playwright' and 'playwright-core' are acceptable.
PLAYWRIGHT_CHECK=$(node -e "
try {
  require('playwright');
  console.log('ok');
} catch (e) {
  try {
    require('playwright-core');
    console.log('ok');
  } catch (e2) {
    console.log('missing');
  }
}" 2>/dev/null || echo "missing")

if [ "$PLAYWRIGHT_CHECK" != "ok" ]; then
  emit_skipped "playwright not installed (run: npm install playwright && npx playwright install chromium)"
fi

# Fetch server HTML and count words
TMPDIR="${TMPDIR:-/tmp}"
SERVER_HTML=$(mktemp "$TMPDIR/crawlsim-server.XXXXXX")
RENDERED_HTML=$(mktemp "$TMPDIR/crawlsim-rendered.XXXXXX")
trap 'rm -f "$SERVER_HTML" "$RENDERED_HTML"' EXIT

# Fetch server HTML with Googlebot UA
UA="Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
curl -sS -L -A "$UA" -o "$SERVER_HTML" --max-time 30 "$URL" 2>/dev/null || {
  emit_skipped "failed to fetch server HTML"
}

SERVER_WORDS=$(count_words "$SERVER_HTML")
[ -z "$SERVER_WORDS" ] && SERVER_WORDS=0

# Use Playwright to render and capture the final DOM.
# With `node -e`, process.argv[1] is the first CLI argument after the inline
# script, so argv[1] = URL and argv[2] = output path for the rendered HTML.
node -e "
(async () => {
  const { chromium } = require('playwright');
  const browser = await chromium.launch({ headless: true });
  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
    });
    const page = await context.newPage();
    await page.goto(process.argv[1], { waitUntil: 'networkidle', timeout: 30000 });
    const html = await page.content();
    const fs = require('fs');
    fs.writeFileSync(process.argv[2], html);
  } finally {
    await browser.close();
  }
})().catch(err => {
  console.error('RENDER_ERROR:', err.message);
  process.exit(1);
});
" "$URL" "$RENDERED_HTML" 2>/dev/null || {
  emit_skipped "playwright render failed"
}

RENDERED_WORDS=$(count_words "$RENDERED_HTML")
[ -z "$RENDERED_WORDS" ] && RENDERED_WORDS=0

# Compute delta percentage (rendered vs server). Positive delta means the
# rendered DOM has MORE words than the server HTML.
DELTA_PCT=0
SIGNIFICANT=false
if [ "$SERVER_WORDS" -gt 0 ]; then
  DELTA_PCT=$(awk -v s="$SERVER_WORDS" -v r="$RENDERED_WORDS" \
    'BEGIN { printf "%.1f", ((r - s) / s) * 100 }')
  # %d truncates toward zero here (no rounding) before the >20 threshold test.
  ABS_DELTA=$(awk -v d="$DELTA_PCT" 'BEGIN { printf "%d", (d < 0 ? -d : d) }')
  if [ "$ABS_DELTA" -gt 20 ]; then
    SIGNIFICANT=true
  fi
elif [ "$RENDERED_WORDS" -gt 0 ]; then
  # Server had nothing, rendered has content — significant
  DELTA_PCT=100
  SIGNIFICANT=true
fi

jq -n \
  --arg url "$URL" \
  --argjson serverWords "$SERVER_WORDS" \
  --argjson renderedWords "$RENDERED_WORDS" \
  --argjson deltaPct "$DELTA_PCT" \
  --argjson significant "$SIGNIFICANT" \
  '{
    url: $url,
    skipped: false,
    serverWordCount: $serverWords,
    renderedWordCount: $renderedWords,
    deltaPct: $deltaPct,
    significantDelta: $significant,
    interpretation: (
      if $significant and $deltaPct > 0 then
        "JS rendering reveals significantly more content than server HTML — non-rendering bots (GPTBot/ClaudeBot/Perplexity) will see less."
      elif $significant and $deltaPct < 0 then
        "Server HTML has more content than rendered DOM — unusual, possibly JS removing content."
      else
        "Server HTML and rendered DOM word counts are close — no significant hydration delta."
      end
    )
  }'