@braedenbuilds/crawl-sim 1.0.5 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. package/.claude-plugin/marketplace.json +15 -0
  2. package/.claude-plugin/plugin.json +13 -0
  3. package/README.md +32 -9
  4. package/bin/install.js +6 -2
  5. package/package.json +8 -3
  6. package/{SKILL.md → skills/crawl-sim/SKILL.md} +23 -2
  7. package/{scripts → skills/crawl-sim/scripts}/_lib.sh +30 -0
  8. package/skills/crawl-sim/scripts/compute-score.sh +744 -0
  9. package/{scripts → skills/crawl-sim/scripts}/extract-jsonld.sh +12 -0
  10. package/skills/crawl-sim/scripts/fetch-as-bot.sh +151 -0
  11. package/skills/crawl-sim/scripts/schema-fields.sh +25 -0
  12. package/scripts/compute-score.sh +0 -424
  13. package/scripts/fetch-as-bot.sh +0 -87
  14. /package/{profiles → skills/crawl-sim/profiles}/chatgpt-user.json +0 -0
  15. /package/{profiles → skills/crawl-sim/profiles}/claude-searchbot.json +0 -0
  16. /package/{profiles → skills/crawl-sim/profiles}/claude-user.json +0 -0
  17. /package/{profiles → skills/crawl-sim/profiles}/claudebot.json +0 -0
  18. /package/{profiles → skills/crawl-sim/profiles}/googlebot.json +0 -0
  19. /package/{profiles → skills/crawl-sim/profiles}/gptbot.json +0 -0
  20. /package/{profiles → skills/crawl-sim/profiles}/oai-searchbot.json +0 -0
  21. /package/{profiles → skills/crawl-sim/profiles}/perplexity-user.json +0 -0
  22. /package/{profiles → skills/crawl-sim/profiles}/perplexitybot.json +0 -0
  23. /package/{scripts → skills/crawl-sim/scripts}/check-llmstxt.sh +0 -0
  24. /package/{scripts → skills/crawl-sim/scripts}/check-robots.sh +0 -0
  25. /package/{scripts → skills/crawl-sim/scripts}/check-sitemap.sh +0 -0
  26. /package/{scripts → skills/crawl-sim/scripts}/diff-render.sh +0 -0
  27. /package/{scripts → skills/crawl-sim/scripts}/extract-links.sh +0 -0
  28. /package/{scripts → skills/crawl-sim/scripts}/extract-meta.sh +0 -0
@@ -0,0 +1,744 @@
1
#!/usr/bin/env bash
set -eu

# compute-score.sh — Aggregate check outputs into per-bot + per-category scores
# Usage: compute-score.sh [--page-type <type>] <results-dir>
# Output: JSON to stdout
#
# Expected filenames in <results-dir>:
#   fetch-<bot_id>.json  — fetch-as-bot.sh output
#   meta-<bot_id>.json   — extract-meta.sh output
#   jsonld-<bot_id>.json — extract-jsonld.sh output
#   links-<bot_id>.json  — extract-links.sh output
#   robots-<bot_id>.json — check-robots.sh output
#   llmstxt.json         — check-llmstxt.sh output (bot-independent)
#   sitemap.json         — check-sitemap.sh output (bot-independent)
#   diff-render.json     — diff-render.sh output (optional, Googlebot only)
#
# The --page-type flag overrides URL-based page-type detection. Valid values:
#   root, detail, archive, faq, about, contact, generic.

# Resolve the directory this script lives in so the sibling helpers can be
# sourced regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=_lib.sh
. "$SCRIPT_DIR/_lib.sh"
# shellcheck source=schema-fields.sh
. "$SCRIPT_DIR/schema-fields.sh"

# --- Flag parsing: everything up to "--" or the first non-flag word ---
PAGE_TYPE_OVERRIDE=""
while [ $# -gt 0 ]; do
  case "$1" in
    --page-type)
      # Two-token form: "--page-type detail"
      [ $# -ge 2 ] || { echo "--page-type requires a value" >&2; exit 2; }
      PAGE_TYPE_OVERRIDE="$2"
      shift 2
      ;;
    --page-type=*)
      # Single-token form: "--page-type=detail"
      PAGE_TYPE_OVERRIDE="${1#--page-type=}"
      shift
      ;;
    -h|--help)
      echo "Usage: compute-score.sh [--page-type <type>] <results-dir>"
      exit 0
      ;;
    --)
      shift
      break
      ;;
    -*)
      echo "Unknown flag: $1" >&2
      exit 2
      ;;
    *)
      break
      ;;
  esac
done

RESULTS_DIR="${1:?Usage: compute-score.sh [--page-type <type>] <results-dir>}"

# Reject unknown page types up front, before any aggregation work happens.
if [ -n "$PAGE_TYPE_OVERRIDE" ]; then
  case "$PAGE_TYPE_OVERRIDE" in
    root|detail|archive|faq|about|contact|generic) ;;
    *)
      echo "Error: invalid --page-type '$PAGE_TYPE_OVERRIDE' (valid: root, detail, archive, faq, about, contact, generic)" >&2
      exit 2
      ;;
  esac
fi

printf '[compute-score] aggregating %s\n' "$RESULTS_DIR" >&2

if [ ! -d "$RESULTS_DIR" ]; then
  echo "Error: results dir not found: $RESULTS_DIR" >&2
  exit 1
fi

# Category weights (as percentages of per-bot composite)
W_ACCESSIBILITY=25
W_CONTENT=30
W_STRUCTURED=20
W_TECHNICAL=15
W_AI=10
82
+
83
# Overall composite weights (per bot)
# overall_weight <bot_id> — prints this bot's weight in the overall composite.
# Bots outside the core four get weight 0 and are excluded from the average.
overall_weight() {
  local w=0
  case "$1" in
    googlebot) w=40 ;;
    gptbot|claudebot|perplexitybot) w=20 ;;
  esac
  echo "$w"
}
93
+
94
# grade_for <score> — map an integer score (0-100) onto a US-style letter
# grade. Thresholds are inclusive lower bounds, checked highest-first.
grade_for() {
  local score=$1 pair
  for pair in 93:A 90:A- 87:B+ 83:B 80:B- 77:C+ 73:C 70:C- 67:D+ 63:D 60:D-; do
    if [ "$score" -ge "${pair%%:*}" ]; then
      echo "${pair#*:}"
      return 0
    fi
  done
  # Anything below 60 fails.
  echo "F"
}
110
+
111
# Rubric: expected schema types per page type.
# rubric_expected <page_type> — prints the space-separated list of schema.org
# types that MUST be present on this page type. Unknown types fall through
# to the generic WebPage rubric.
rubric_expected() {
  case "$1" in
    root)    printf '%s\n' "Organization WebSite" ;;
    detail)  printf '%s\n' "Article BreadcrumbList" ;;
    archive) printf '%s\n' "CollectionPage ItemList BreadcrumbList" ;;
    faq)     printf '%s\n' "FAQPage BreadcrumbList" ;;
    about)   printf '%s\n' "AboutPage BreadcrumbList Organization" ;;
    contact) printf '%s\n' "ContactPage BreadcrumbList" ;;
    *)       printf '%s\n' "WebPage BreadcrumbList" ;;
  esac
}
123
+
124
# rubric_optional <page_type> — prints schema types that earn bonus points
# on this page type but are not required. May print an empty string.
rubric_optional() {
  case "$1" in
    root)    printf '%s\n' "ProfessionalService LocalBusiness" ;;
    detail)  printf '%s\n' "NewsArticle ImageObject Person" ;;
    archive) printf '%s\n' "" ;;
    faq)     printf '%s\n' "WebPage" ;;
    about)   printf '%s\n' "Person" ;;
    contact) printf '%s\n' "PostalAddress" ;;
    *)       printf '%s\n' "" ;;
  esac
}
135
+
136
# rubric_forbidden <page_type> — prints schema types that incur a penalty if
# found on this page type (they signal a mis-classified page). May be empty.
rubric_forbidden() {
  case "$1" in
    root)    printf '%s\n' "BreadcrumbList Article FAQPage" ;;
    detail)  printf '%s\n' "CollectionPage ItemList" ;;
    archive) printf '%s\n' "Article Product" ;;
    faq)     printf '%s\n' "Article CollectionPage" ;;
    about)   printf '%s\n' "Article Product" ;;
    contact) printf '%s\n' "Article Product" ;;
    *)       printf '%s\n' "" ;;
  esac
}
147
+
148
# list_contains <needle> <word>... — succeed (0) iff <needle> equals one of
# the remaining arguments exactly. Callers pass the haystack unquoted so the
# shell splits it into words.
list_contains() {
  local target=$1 candidate
  shift
  for candidate in "$@"; do
    if [ "$candidate" = "$target" ]; then
      return 0
    fi
  done
  return 1
}
157
+
158
# list_count <space-separated-list> — prints the number of words in the list.
# An empty string counts as zero.
list_count() {
  # Intentional word-splitting: the single argument is a word list.
  # shellcheck disable=SC2086
  set -- $1
  printf '%s\n' "$#"
}
163
+
164
# list_intersect <list-a> <list-b> — prints the words of <list-a> that also
# appear in <list-b>, preserving <list-a> order, space-separated, no newline.
list_intersect() {
  local left="$1" right="$2"
  local acc="" tok
  # shellcheck disable=SC2086
  for tok in $left; do
    # shellcheck disable=SC2086
    if list_contains "$tok" $right; then
      acc="$acc $tok"
    fi
  done
  printf '%s' "${acc# }"
}
176
+
177
# list_diff <list-a> <list-b> — prints the words of <list-a> that do NOT
# appear in <list-b>, preserving <list-a> order, space-separated, no newline.
list_diff() {
  local left="$1" right="$2"
  local acc="" tok
  # shellcheck disable=SC2086
  for tok in $left; do
    # shellcheck disable=SC2086
    if ! list_contains "$tok" $right; then
      acc="$acc $tok"
    fi
  done
  printf '%s' "${acc# }"
}
189
+
190
# jget <file> <jq-filter> [default] — evaluate a jq filter against a JSON
# file, falling back to <default> ("null" if omitted) when the file is
# missing, jq fails, or the filter yields null/false (jq's // operator).
jget() {
  local path="$1" filter="$2" fallback="${3:-null}"
  if [ -f "$path" ]; then
    jq -r --arg d "$fallback" "$filter // \$d" "$path" 2>/dev/null || echo "$fallback"
  else
    echo "$fallback"
  fi
}
200
+
201
# jget_num <file> <jq-filter> — like jget but coerces the result to a number:
# anything that is not a plain (optionally negative, optionally decimal)
# numeric literal collapses to 0.
jget_num() {
  local raw
  raw=$(jget "$1" "$2" "0")
  if printf '%s' "$raw" | grep -qE '^-?[0-9]+(\.[0-9]+)?$'; then
    echo "$raw"
  else
    echo "0"
  fi
}
210
+
211
# jget_bool <file> <jq-filter> — like jget but normalizes the result to the
# literal string "true" or "false" (anything other than "true" is false).
jget_bool() {
  local raw
  raw=$(jget "$1" "$2" "false")
  case "$raw" in
    true) echo "true" ;;
    *)    echo "false" ;;
  esac
}
216
+
217
# --- Discover which bots were fetched ---------------------------------------
# Every fetch-<bot>.json present in the results dir defines one bot to score.
BOTS=""
FIRST_FETCH=""
for f in "$RESULTS_DIR"/fetch-*.json; do
  [ -f "$f" ] || continue
  if [ -z "$FIRST_FETCH" ]; then
    FIRST_FETCH="$f"
  fi
  # fetch-<bot_id>.json -> <bot_id>, via parameter expansion.
  bot_id=${f##*/}
  bot_id=${bot_id%.json}
  bot_id=${bot_id#fetch-}
  BOTS="$BOTS $bot_id"
done

if [ -z "$BOTS" ]; then
  echo "Error: no fetch-*.json files found in $RESULTS_DIR" >&2
  exit 1
fi

# --- Bot-independent inputs --------------------------------------------------
LLMSTXT_FILE="$RESULTS_DIR/llmstxt.json"
SITEMAP_FILE="$RESULTS_DIR/sitemap.json"
DIFF_RENDER_FILE="$RESULTS_DIR/diff-render.json"

# The JS-render diff is optional; treat a missing .skipped field as skipped.
DIFF_AVAILABLE=false
DIFF_RENDERED_WORDS=0
DIFF_DELTA_PCT=0
if [ -f "$DIFF_RENDER_FILE" ]; then
  DIFF_SKIPPED=$(jq -r '.skipped | if . == null then "true" else tostring end' "$DIFF_RENDER_FILE" 2>/dev/null || echo "true")
  if [ "$DIFF_SKIPPED" = "false" ]; then
    DIFF_AVAILABLE=true
    DIFF_RENDERED_WORDS=$(jq -r '.renderedWordCount // 0' "$DIFF_RENDER_FILE")
    DIFF_DELTA_PCT=$(jq -r '.deltaPct // 0' "$DIFF_RENDER_FILE")
  fi
fi

# Resolve page type once from the first fetch file's URL, unless overridden.
TARGET_URL=$(jget "$FIRST_FETCH" '.url' "")
if [ -n "$PAGE_TYPE_OVERRIDE" ]; then
  PAGE_TYPE="$PAGE_TYPE_OVERRIDE"
else
  PAGE_TYPE=$(page_type_for_url "$TARGET_URL")
fi
printf '[compute-score] page type: %s (url: %s)\n' "$PAGE_TYPE" "$TARGET_URL" >&2

# Freeze the structured-data rubric for this page type.
RUBRIC_EXPECTED="$(rubric_expected "$PAGE_TYPE")"
RUBRIC_OPTIONAL="$(rubric_optional "$PAGE_TYPE")"
RUBRIC_FORBIDDEN="$(rubric_forbidden "$PAGE_TYPE")"
EXPECTED_COUNT=$(list_count "$RUBRIC_EXPECTED")

# --- Accumulators shared by the per-bot loop below ---------------------------
BOTS_JSON="{}"

CAT_ACCESSIBILITY_SUM=0
CAT_CONTENT_SUM=0
CAT_STRUCTURED_SUM=0
CAT_TECHNICAL_SUM=0
CAT_AI_SUM=0
CAT_N=0

OVERALL_WEIGHTED_SUM=0
OVERALL_WEIGHT_TOTAL=0
272
+
273
# --- Score each bot ----------------------------------------------------------
for bot_id in $BOTS; do
  FETCH="$RESULTS_DIR/fetch-$bot_id.json"
  META="$RESULTS_DIR/meta-$bot_id.json"
  JSONLD="$RESULTS_DIR/jsonld-$bot_id.json"
  LINKS="$RESULTS_DIR/links-$bot_id.json"
  ROBOTS="$RESULTS_DIR/robots-$bot_id.json"

  BOT_NAME=$(jget "$FETCH" '.bot.name' "$bot_id")

  # A failed fetch short-circuits scoring: the bot gets an all-zero F entry
  # (AC-A3) and still counts toward the category-average denominator.
  FETCH_FAILED=$(jget_bool "$FETCH" '.fetchFailed')
  if [ "$FETCH_FAILED" = "true" ]; then
    FETCH_ERROR=$(jget "$FETCH" '.error' "unknown error")
    RENDERS_JS=$(jq -r '.bot.rendersJavaScript | if . == null then "unknown" else tostring end' "$FETCH" 2>/dev/null || echo "unknown")
    BOT_OBJ=$(jq -n \
      --arg id "$bot_id" \
      --arg name "$BOT_NAME" \
      --arg rendersJs "$RENDERS_JS" \
      --arg error "$FETCH_ERROR" \
      '{
        id: $id,
        name: $name,
        rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end),
        fetchFailed: true,
        error: $error,
        score: 0,
        grade: "F",
        visibility: { serverWords: 0, effectiveWords: 0, missedWordsVsRendered: 0, hydrationPenaltyPts: 0 },
        categories: {
          accessibility: { score: 0, grade: "F" },
          contentVisibility: { score: 0, grade: "F" },
          structuredData: { score: 0, grade: "F", pageType: "unknown", expected: [], optional: [], forbidden: [], present: [], missing: [], extras: [], violations: [{ kind: "fetch_failed", impact: -100 }], calculation: "fetch failed — no data to score", notes: ("Fetch failed: " + $error) },
          technicalSignals: { score: 0, grade: "F" },
          aiReadiness: { score: 0, grade: "F" }
        }
      }')
    BOTS_JSON=$(printf '%s' "$BOTS_JSON" | jq --argjson bot "$BOT_OBJ" --arg id "$bot_id" '.[$id] = $bot')
    printf '[compute-score] %s: fetch failed, scoring as F\n' "$bot_id" >&2
    CAT_N=$((CAT_N + 1))
    continue
  fi

  STATUS=$(jget_num "$FETCH" '.status')
  TOTAL_TIME=$(jget_num "$FETCH" '.timing.total')
  SERVER_WORD_COUNT=$(jget_num "$FETCH" '.wordCount')
  RENDERS_JS=$(jq -r '.bot.rendersJavaScript | if . == null then "unknown" else tostring end' "$FETCH" 2>/dev/null || echo "unknown")

  ROBOTS_ALLOWED=$(jget_bool "$ROBOTS" '.allowed')

  # Effective word count: JS-rendering bots see the rendered DOM; non-JS bots
  # see only the server HTML and take a hydration penalty when the rendered
  # page diverges by more than 5% (capped at 15 pts).
  EFFECTIVE_WORD_COUNT=$SERVER_WORD_COUNT
  HYDRATION_PENALTY=0
  MISSED_WORDS=0
  if [ "$DIFF_AVAILABLE" = "true" ]; then
    if [ "$RENDERS_JS" = "true" ]; then
      EFFECTIVE_WORD_COUNT=$DIFF_RENDERED_WORDS
    elif [ "$RENDERS_JS" = "false" ]; then
      ABS_DELTA=$(awk -v d="$DIFF_DELTA_PCT" 'BEGIN { printf "%d", (d < 0 ? -d : d) + 0.5 }')
      if [ "$ABS_DELTA" -gt 5 ]; then
        HYDRATION_PENALTY=$(awk -v d="$ABS_DELTA" 'BEGIN {
          p = (d - 5)
          if (p > 15) p = 15
          printf "%d", p
        }')
      fi
      MISSED_WORDS=$((DIFF_RENDERED_WORDS - SERVER_WORD_COUNT))
      [ "$MISSED_WORDS" -lt 0 ] && MISSED_WORDS=0
    fi
  fi

  # --- Category 1: Accessibility (0-100): robots 40 + status 40 + speed 20 ---
  ACC=0
  [ "$ROBOTS_ALLOWED" = "true" ] && ACC=$((ACC + 40))
  [ "$STATUS" = "200" ] && ACC=$((ACC + 40))
  TIME_SCORE=$(awk -v t="$TOTAL_TIME" 'BEGIN { if (t < 2) print 20; else if (t < 5) print 10; else print 0 }')
  ACC=$((ACC + TIME_SCORE))

  # --- Category 2: Content Visibility (0-100) --------------------------------
  # words 30 + h1 20 + h2 15 + internal links 20 + alt coverage 15, minus the
  # hydration penalty, floored at 0.
  CONTENT=0
  if [ "$EFFECTIVE_WORD_COUNT" -ge 300 ]; then CONTENT=$((CONTENT + 30))
  elif [ "$EFFECTIVE_WORD_COUNT" -ge 150 ]; then CONTENT=$((CONTENT + 20))
  elif [ "$EFFECTIVE_WORD_COUNT" -ge 50 ]; then CONTENT=$((CONTENT + 10))
  fi

  H1_COUNT=$(jget_num "$META" '.headings.h1.count')
  H2_COUNT=$(jget_num "$META" '.headings.h2.count')
  [ "$H1_COUNT" -ge 1 ] && CONTENT=$((CONTENT + 20))
  [ "$H2_COUNT" -ge 1 ] && CONTENT=$((CONTENT + 15))

  INTERNAL_LINKS=$(jget_num "$LINKS" '.counts.internal')
  if [ "$INTERNAL_LINKS" -ge 5 ]; then CONTENT=$((CONTENT + 20))
  elif [ "$INTERNAL_LINKS" -ge 1 ]; then CONTENT=$((CONTENT + 10))
  fi

  IMG_TOTAL=$(jget_num "$META" '.images.total')
  IMG_WITH_ALT=$(jget_num "$META" '.images.withAlt')
  if [ "$IMG_TOTAL" -eq 0 ]; then
    # No images at all: full alt-text credit.
    CONTENT=$((CONTENT + 15))
  else
    ALT_SCORE=$(awk -v a="$IMG_WITH_ALT" -v t="$IMG_TOTAL" 'BEGIN { printf "%d", (a / t) * 15 }')
    CONTENT=$((CONTENT + ALT_SCORE))
  fi

  CONTENT=$((CONTENT - HYDRATION_PENALTY))
  [ $CONTENT -lt 0 ] && CONTENT=0

  # --- Category 3: Structured Data (0-100) -----------------------------------
  JSONLD_COUNT=$(jget_num "$JSONLD" '.blockCount')
  JSONLD_VALID=$(jget_num "$JSONLD" '.validCount')
  JSONLD_INVALID=$(jget_num "$JSONLD" '.invalidCount')

  # De-duplicated, order-preserving list of schema types seen on the page.
  if [ -f "$JSONLD" ]; then
    PRESENT_TYPES=$(jq -r '.types[]? // empty' "$JSONLD" 2>/dev/null | awk 'NF && !seen[$0]++' | tr '\n' ' ')
    PRESENT_TYPES=${PRESENT_TYPES% }
  else
    PRESENT_TYPES=""
  fi

  PRESENT_EXPECTED=$(list_intersect "$RUBRIC_EXPECTED" "$PRESENT_TYPES")
  PRESENT_OPTIONAL=$(list_intersect "$RUBRIC_OPTIONAL" "$PRESENT_TYPES")
  PRESENT_FORBIDDEN=$(list_intersect "$RUBRIC_FORBIDDEN" "$PRESENT_TYPES")
  MISSING_EXPECTED=$(list_diff "$RUBRIC_EXPECTED" "$PRESENT_TYPES")
  RUBRIC_KNOWN="$RUBRIC_EXPECTED $RUBRIC_OPTIONAL $RUBRIC_FORBIDDEN"
  EXTRAS=$(list_diff "$PRESENT_TYPES" "$RUBRIC_KNOWN")

  PRESENT_EXPECTED_COUNT=$(list_count "$PRESENT_EXPECTED")
  PRESENT_OPTIONAL_COUNT=$(list_count "$PRESENT_OPTIONAL")
  PRESENT_FORBIDDEN_COUNT=$(list_count "$PRESENT_FORBIDDEN")

  # Base = fraction of expected types present, rounded to a percentage.
  BASE=$(awk -v h="$PRESENT_EXPECTED_COUNT" -v t="$EXPECTED_COUNT" \
    'BEGIN { if (t == 0) print 0; else printf "%d", (h / t) * 100 + 0.5 }')

  # +10 per optional type present, capped at +20.
  BONUS=$((PRESENT_OPTIONAL_COUNT * 10))
  [ $BONUS -gt 20 ] && BONUS=20

  # -10 per forbidden type present (uncapped).
  FORBID_PENALTY=$((PRESENT_FORBIDDEN_COUNT * 10))

  # -5 per invalid JSON-LD block, capped at -20.
  VALID_PENALTY=0
  if [ "$JSONLD_COUNT" -gt 0 ] && [ "$JSONLD_INVALID" -gt 0 ]; then
    VALID_PENALTY=$((JSONLD_INVALID * 5))
    [ $VALID_PENALTY -gt 20 ] && VALID_PENALTY=20
  fi

  # Field-level validation (C3): -5 per missing required field per block,
  # capped at -30. required_fields_for comes from schema-fields.sh.
  FIELD_PENALTY=0
  FIELD_VIOLATIONS_JSON="[]"
  BLOCK_COUNT_FOR_FIELDS=0
  if [ -f "$JSONLD" ]; then
    BLOCK_COUNT_FOR_FIELDS=$(jq 'if has("blocks") then .blocks | length else 0 end' "$JSONLD" 2>/dev/null || echo "0")
  fi
  if [ "$BLOCK_COUNT_FOR_FIELDS" -gt 0 ]; then
    i=0
    while [ "$i" -lt "$BLOCK_COUNT_FOR_FIELDS" ]; do
      BLOCK_TYPE=$(jq -r ".blocks[$i].type" "$JSONLD" 2>/dev/null || echo "")
      BLOCK_FIELDS=$(jq -r ".blocks[$i].fields[]?" "$JSONLD" 2>/dev/null | tr '\n' ' ')
      REQUIRED=$(required_fields_for "$BLOCK_TYPE")
      for field in $REQUIRED; do
        # shellcheck disable=SC2086
        if ! list_contains "$field" $BLOCK_FIELDS; then
          FIELD_VIOLATIONS_JSON=$(printf '%s' "$FIELD_VIOLATIONS_JSON" | jq \
            --arg schema "$BLOCK_TYPE" --arg field "$field" \
            '. + [{kind: "missing_required_field", schema: $schema, field: $field, impact: -5}]')
          FIELD_PENALTY=$((FIELD_PENALTY + 5))
        fi
      done
      i=$((i + 1))
    done
  fi
  [ $FIELD_PENALTY -gt 30 ] && FIELD_PENALTY=30

  STRUCTURED=$((BASE + BONUS - FORBID_PENALTY - VALID_PENALTY - FIELD_PENALTY))
  [ $STRUCTURED -gt 100 ] && STRUCTURED=100
  [ $STRUCTURED -lt 0 ] && STRUCTURED=0

  # Human-readable audit trail of the arithmetic above.
  CALCULATION=$(printf 'base: %d/%d expected present = %d; +%d optional bonus; -%d forbidden penalty; -%d validity penalty; -%d field penalty; clamp [0,100] = %d' \
    "$PRESENT_EXPECTED_COUNT" "$EXPECTED_COUNT" "$BASE" \
    "$BONUS" "$FORBID_PENALTY" "$VALID_PENALTY" "$FIELD_PENALTY" "$STRUCTURED")

  # Pick the single most actionable note for this bot's structured data.
  if [ "$STRUCTURED" -ge 100 ] && [ -z "$PRESENT_FORBIDDEN" ] && [ "$VALID_PENALTY" -eq 0 ] && [ "$FIELD_PENALTY" -eq 0 ]; then
    NOTES="All expected schemas for pageType=$PAGE_TYPE are present. No structured-data action needed."
  elif [ -n "$MISSING_EXPECTED" ] && [ -z "$PRESENT_FORBIDDEN" ]; then
    NOTES="Missing expected schemas for pageType=$PAGE_TYPE: $MISSING_EXPECTED. Add these to raise the score."
  elif [ -n "$PRESENT_FORBIDDEN" ] && [ -z "$MISSING_EXPECTED" ]; then
    NOTES="Forbidden schemas present for pageType=$PAGE_TYPE: $PRESENT_FORBIDDEN. Remove these (or re-classify the page type with --page-type)."
  elif [ -n "$PRESENT_FORBIDDEN" ] && [ -n "$MISSING_EXPECTED" ]; then
    NOTES="Mixed: missing $MISSING_EXPECTED and forbidden present $PRESENT_FORBIDDEN for pageType=$PAGE_TYPE."
  elif [ "$FIELD_PENALTY" -gt 0 ]; then
    NOTES="Schemas for pageType=$PAGE_TYPE are present but missing required fields. See violations for details."
  elif [ "$VALID_PENALTY" -gt 0 ]; then
    NOTES="Score reduced by $VALID_PENALTY pts due to invalid JSON-LD blocks."
  else
    NOTES="Structured data scored for pageType=$PAGE_TYPE."
  fi

  STRUCTURED_GRADE=$(grade_for "$STRUCTURED")
  STRUCTURED_OBJ=$(jq -n \
    --argjson score "$STRUCTURED" \
    --arg grade "$STRUCTURED_GRADE" \
    --arg pageType "$PAGE_TYPE" \
    --arg expectedList "$RUBRIC_EXPECTED" \
    --arg optionalList "$RUBRIC_OPTIONAL" \
    --arg forbiddenList "$RUBRIC_FORBIDDEN" \
    --arg presentList "$PRESENT_TYPES" \
    --arg missingList "$MISSING_EXPECTED" \
    --arg extrasList "$EXTRAS" \
    --arg forbiddenPresent "$PRESENT_FORBIDDEN" \
    --argjson invalidCount "$JSONLD_INVALID" \
    --argjson validPenalty "$VALID_PENALTY" \
    --argjson fieldViolations "$FIELD_VIOLATIONS_JSON" \
    --arg calculation "$CALCULATION" \
    --arg notes "$NOTES" \
    '
    def to_arr: split(" ") | map(select(length > 0));
    {
      score: $score,
      grade: $grade,
      pageType: $pageType,
      expected: ($expectedList | to_arr),
      optional: ($optionalList | to_arr),
      forbidden: ($forbiddenList | to_arr),
      present: ($presentList | to_arr),
      missing: ($missingList | to_arr),
      extras: ($extrasList | to_arr),
      violations: (
        ($forbiddenPresent | to_arr | map({kind: "forbidden_schema", schema: ., impact: -10}))
        + (if $validPenalty > 0
           then [{kind: "invalid_jsonld", count: $invalidCount, impact: (0 - $validPenalty)}]
           else []
           end)
        + $fieldViolations
      ),
      calculation: $calculation,
      notes: $notes
    }
    ')

  # --- Category 4: Technical Signals (0-100) ---------------------------------
  # title 25 + description 25 + canonical 20 + og:title 8 + og:description 7
  # + sitemap (15 if it lists this page, 10 if it merely exists).
  TECHNICAL=0
  TITLE=$(jget "$META" '.title' "")
  DESCRIPTION=$(jget "$META" '.description' "")
  CANONICAL=$(jget "$META" '.canonical' "")
  OG_TITLE=$(jget "$META" '.og.title' "")
  OG_DESC=$(jget "$META" '.og.description' "")

  [ -n "$TITLE" ] && [ "$TITLE" != "null" ] && TECHNICAL=$((TECHNICAL + 25))
  [ -n "$DESCRIPTION" ] && [ "$DESCRIPTION" != "null" ] && TECHNICAL=$((TECHNICAL + 25))
  [ -n "$CANONICAL" ] && [ "$CANONICAL" != "null" ] && TECHNICAL=$((TECHNICAL + 20))
  if [ -n "$OG_TITLE" ] && [ "$OG_TITLE" != "null" ]; then TECHNICAL=$((TECHNICAL + 8)); fi
  if [ -n "$OG_DESC" ] && [ "$OG_DESC" != "null" ]; then TECHNICAL=$((TECHNICAL + 7)); fi

  SITEMAP_EXISTS=$(jget_bool "$SITEMAP_FILE" '.exists')
  SITEMAP_CONTAINS=$(jget_bool "$SITEMAP_FILE" '.containsTarget')
  if [ "$SITEMAP_EXISTS" = "true" ] && [ "$SITEMAP_CONTAINS" = "true" ]; then
    TECHNICAL=$((TECHNICAL + 15))
  elif [ "$SITEMAP_EXISTS" = "true" ]; then
    TECHNICAL=$((TECHNICAL + 10))
  fi

  # --- Category 5: AI Readiness (0-100) --------------------------------------
  # llms.txt 40 (+7 title, +7 description, +6 urls) + words>=200 20
  # + (h1 present AND meta description present) 20.
  AI=0
  LLMS_EXISTS=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.exists')
  LLMS_HAS_TITLE=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.hasTitle')
  LLMS_HAS_DESC=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.hasDescription')
  LLMS_URLS=$(jget_num "$LLMSTXT_FILE" '.llmsTxt.urlCount')

  if [ "$LLMS_EXISTS" = "true" ]; then
    AI=$((AI + 40))
    [ "$LLMS_HAS_TITLE" = "true" ] && AI=$((AI + 7))
    [ "$LLMS_HAS_DESC" = "true" ] && AI=$((AI + 7))
    [ "$LLMS_URLS" -ge 1 ] && AI=$((AI + 6))
  fi
  [ "$EFFECTIVE_WORD_COUNT" -ge 200 ] && AI=$((AI + 20))
  if [ "$H1_COUNT" -ge 1 ] && [ -n "$DESCRIPTION" ] && [ "$DESCRIPTION" != "null" ]; then
    AI=$((AI + 20))
  fi

  # Clamp every category to 100 before weighting.
  [ $ACC -gt 100 ] && ACC=100
  [ $CONTENT -gt 100 ] && CONTENT=100
  [ $TECHNICAL -gt 100 ] && TECHNICAL=100
  [ $AI -gt 100 ] && AI=100

  # Weighted per-bot composite, rounded to the nearest integer.
  BOT_SCORE=$(awk -v a=$ACC -v c=$CONTENT -v s=$STRUCTURED -v t=$TECHNICAL -v ai=$AI \
    -v wa=$W_ACCESSIBILITY -v wc=$W_CONTENT -v ws=$W_STRUCTURED -v wt=$W_TECHNICAL -v wai=$W_AI \
    'BEGIN { printf "%d", (a*wa + c*wc + s*ws + t*wt + ai*wai) / (wa+wc+ws+wt+wai) + 0.5 }')

  BOT_GRADE=$(grade_for "$BOT_SCORE")
  ACC_GRADE=$(grade_for "$ACC")
  CONTENT_GRADE=$(grade_for "$CONTENT")
  TECHNICAL_GRADE=$(grade_for "$TECHNICAL")
  AI_GRADE=$(grade_for "$AI")

  BOT_OBJ=$(jq -n \
    --arg id "$bot_id" \
    --arg name "$BOT_NAME" \
    --arg rendersJs "$RENDERS_JS" \
    --argjson score "$BOT_SCORE" \
    --arg grade "$BOT_GRADE" \
    --argjson acc "$ACC" \
    --arg accGrade "$ACC_GRADE" \
    --argjson content "$CONTENT" \
    --arg contentGrade "$CONTENT_GRADE" \
    --argjson structured "$STRUCTURED_OBJ" \
    --argjson technical "$TECHNICAL" \
    --arg technicalGrade "$TECHNICAL_GRADE" \
    --argjson ai "$AI" \
    --arg aiGrade "$AI_GRADE" \
    --argjson serverWords "$SERVER_WORD_COUNT" \
    --argjson effectiveWords "$EFFECTIVE_WORD_COUNT" \
    --argjson missedWords "$MISSED_WORDS" \
    --argjson hydrationPenalty "$HYDRATION_PENALTY" \
    '{
      id: $id,
      name: $name,
      rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end),
      score: $score,
      grade: $grade,
      visibility: {
        serverWords: $serverWords,
        effectiveWords: $effectiveWords,
        missedWordsVsRendered: $missedWords,
        hydrationPenaltyPts: $hydrationPenalty
      },
      categories: {
        accessibility: { score: $acc, grade: $accGrade },
        contentVisibility: { score: $content, grade: $contentGrade },
        structuredData: $structured,
        technicalSignals: { score: $technical, grade: $technicalGrade },
        aiReadiness: { score: $ai, grade: $aiGrade }
      }
    }')

  BOTS_JSON=$(printf '%s' "$BOTS_JSON" | jq --argjson bot "$BOT_OBJ" --arg id "$bot_id" '.[$id] = $bot')

  # Fold this bot into the category averages and the overall composite.
  CAT_ACCESSIBILITY_SUM=$((CAT_ACCESSIBILITY_SUM + ACC))
  CAT_CONTENT_SUM=$((CAT_CONTENT_SUM + CONTENT))
  CAT_STRUCTURED_SUM=$((CAT_STRUCTURED_SUM + STRUCTURED))
  CAT_TECHNICAL_SUM=$((CAT_TECHNICAL_SUM + TECHNICAL))
  CAT_AI_SUM=$((CAT_AI_SUM + AI))
  CAT_N=$((CAT_N + 1))

  W=$(overall_weight "$bot_id")
  if [ "$W" -gt 0 ]; then
    OVERALL_WEIGHTED_SUM=$((OVERALL_WEIGHTED_SUM + BOT_SCORE * W))
    OVERALL_WEIGHT_TOTAL=$((OVERALL_WEIGHT_TOTAL + W))
  fi
done
618
+
619
# --- Category averages across all bots (failed fetches count as zeros) -------
CAT_ACC_AVG=$((CAT_ACCESSIBILITY_SUM / CAT_N))
CAT_CONTENT_AVG=$((CAT_CONTENT_SUM / CAT_N))
CAT_STRUCTURED_AVG=$((CAT_STRUCTURED_SUM / CAT_N))
CAT_TECHNICAL_AVG=$((CAT_TECHNICAL_SUM / CAT_N))
CAT_AI_AVG=$((CAT_AI_SUM / CAT_N))

# Overall composite: weighted by bot importance when any weighted bot scored;
# otherwise fall back to the plain mean of the category averages.
if [ "$OVERALL_WEIGHT_TOTAL" -gt 0 ]; then
  OVERALL_SCORE=$((OVERALL_WEIGHTED_SUM / OVERALL_WEIGHT_TOTAL))
else
  OVERALL_SCORE=$(((CAT_ACC_AVG + CAT_CONTENT_AVG + CAT_STRUCTURED_AVG + CAT_TECHNICAL_AVG + CAT_AI_AVG) / 5))
fi

OVERALL_GRADE=$(grade_for "$OVERALL_SCORE")
CAT_ACC_GRADE=$(grade_for "$CAT_ACC_AVG")
CAT_CONTENT_GRADE=$(grade_for "$CAT_CONTENT_AVG")
CAT_STRUCTURED_GRADE=$(grade_for "$CAT_STRUCTURED_AVG")
CAT_TECHNICAL_GRADE=$(grade_for "$CAT_TECHNICAL_AVG")
CAT_AI_GRADE=$(grade_for "$CAT_AI_AVG")

# --- Cross-bot content parity (C4): min/max server word counts --------------
PARITY_MIN_WORDS=999999999
PARITY_MAX_WORDS=0
PARITY_BOT_COUNT=0
for bot_id in $BOTS; do
  FETCH="$RESULTS_DIR/fetch-$bot_id.json"
  P_FETCH_FAILED=$(jget_bool "$FETCH" '.fetchFailed')
  [ "$P_FETCH_FAILED" = "true" ] && continue
  WC=$(jget_num "$FETCH" '.wordCount')
  [ "$WC" -lt "$PARITY_MIN_WORDS" ] && PARITY_MIN_WORDS=$WC
  [ "$WC" -gt "$PARITY_MAX_WORDS" ] && PARITY_MAX_WORDS=$WC
  PARITY_BOT_COUNT=$((PARITY_BOT_COUNT + 1))
done

# Parity score = min/max as a rounded percentage; trivially 100 with at most
# one successful bot or when no bot saw any words.
if [ "$PARITY_BOT_COUNT" -le 1 ]; then
  PARITY_SCORE=100
  PARITY_MAX_DELTA=0
elif [ "$PARITY_MAX_WORDS" -gt 0 ]; then
  PARITY_SCORE=$(awk -v min="$PARITY_MIN_WORDS" -v max="$PARITY_MAX_WORDS" \
    'BEGIN { printf "%d", (min / max) * 100 + 0.5 }')
  PARITY_MAX_DELTA=$(awk -v min="$PARITY_MIN_WORDS" -v max="$PARITY_MAX_WORDS" \
    'BEGIN { printf "%d", ((max - min) / max) * 100 + 0.5 }')
else
  PARITY_SCORE=100
  PARITY_MAX_DELTA=0
fi

[ "$PARITY_SCORE" -gt 100 ] && PARITY_SCORE=100
PARITY_GRADE=$(grade_for "$PARITY_SCORE")

if [ "$PARITY_SCORE" -ge 95 ]; then
  PARITY_INTERP="Content is consistent across all bots."
elif [ "$PARITY_SCORE" -ge 50 ]; then
  PARITY_INTERP="Moderate content divergence between bots — likely partial client-side rendering hydration."
else
  PARITY_INTERP="Severe content divergence — site likely relies on client-side rendering. AI bots see significantly less content than Googlebot."
fi

# --- Warnings (H2): surface when the JS-render diff never ran ----------------
WARNINGS="[]"
if [ "$DIFF_AVAILABLE" != "true" ]; then
  DIFF_REASON="not_found"
  if [ -f "$DIFF_RENDER_FILE" ]; then
    DIFF_REASON=$(jq -r '.reason // "skipped"' "$DIFF_RENDER_FILE" 2>/dev/null || echo "skipped")
  fi
  WARNINGS=$(printf '%s' "$WARNINGS" | jq --arg reason "$DIFF_REASON" \
    '. + [{
      code: "diff_render_unavailable",
      severity: "high",
      message: "JS rendering comparison was skipped. If this site uses CSR, non-JS bot scores may be inaccurate.",
      reason: $reason
    }]')
fi

TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

# --- Final report to stdout --------------------------------------------------
jq -n \
  --arg url "$TARGET_URL" \
  --arg timestamp "$TIMESTAMP" \
  --arg version "0.2.0" \
  --arg pageType "$PAGE_TYPE" \
  --arg pageTypeOverride "$PAGE_TYPE_OVERRIDE" \
  --argjson overallScore "$OVERALL_SCORE" \
  --arg overallGrade "$OVERALL_GRADE" \
  --argjson bots "$BOTS_JSON" \
  --argjson catAcc "$CAT_ACC_AVG" \
  --arg catAccGrade "$CAT_ACC_GRADE" \
  --argjson catContent "$CAT_CONTENT_AVG" \
  --arg catContentGrade "$CAT_CONTENT_GRADE" \
  --argjson catStructured "$CAT_STRUCTURED_AVG" \
  --arg catStructuredGrade "$CAT_STRUCTURED_GRADE" \
  --argjson catTechnical "$CAT_TECHNICAL_AVG" \
  --arg catTechnicalGrade "$CAT_TECHNICAL_GRADE" \
  --argjson catAi "$CAT_AI_AVG" \
  --arg catAiGrade "$CAT_AI_GRADE" \
  --argjson warnings "$WARNINGS" \
  --argjson parityScore "$PARITY_SCORE" \
  --arg parityGrade "$PARITY_GRADE" \
  --argjson parityMinWords "$PARITY_MIN_WORDS" \
  --argjson parityMaxWords "$PARITY_MAX_WORDS" \
  --argjson parityMaxDelta "$PARITY_MAX_DELTA" \
  --arg parityInterp "$PARITY_INTERP" \
  '{
    url: $url,
    timestamp: $timestamp,
    version: $version,
    pageType: $pageType,
    pageTypeOverridden: ($pageTypeOverride | length > 0),
    overall: { score: $overallScore, grade: $overallGrade },
    parity: {
      score: $parityScore,
      grade: $parityGrade,
      minWords: (if $parityMinWords >= 999999999 then 0 else $parityMinWords end),
      maxWords: $parityMaxWords,
      maxDeltaPct: $parityMaxDelta,
      interpretation: $parityInterp
    },
    warnings: $warnings,
    bots: $bots,
    categories: {
      accessibility: { score: $catAcc, grade: $catAccGrade },
      contentVisibility: { score: $catContent, grade: $catContentGrade },
      structuredData: { score: $catStructured, grade: $catStructuredGrade },
      technicalSignals: { score: $catTechnical, grade: $catTechnicalGrade },
      aiReadiness: { score: $catAi, grade: $catAiGrade }
    }
  }'