@braedenbuilds/crawl-sim 1.1.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. package/.claude-plugin/marketplace.json +15 -0
  2. package/.claude-plugin/plugin.json +13 -0
  3. package/README.md +19 -6
  4. package/bin/install.js +6 -2
  5. package/package.json +5 -3
  6. package/{SKILL.md → skills/crawl-sim/SKILL.md} +38 -5
  7. package/skills/crawl-sim/scripts/build-report.sh +45 -0
  8. package/{scripts → skills/crawl-sim/scripts}/check-llmstxt.sh +5 -0
  9. package/{scripts → skills/crawl-sim/scripts}/check-sitemap.sh +10 -1
  10. package/{scripts → skills/crawl-sim/scripts}/compute-score.sh +202 -41
  11. package/{scripts → skills/crawl-sim/scripts}/extract-jsonld.sh +12 -0
  12. package/{scripts → skills/crawl-sim/scripts}/extract-links.sh +5 -7
  13. package/skills/crawl-sim/scripts/fetch-as-bot.sh +151 -0
  14. package/skills/crawl-sim/scripts/schema-fields.sh +25 -0
  15. package/scripts/fetch-as-bot.sh +0 -87
  16. /package/{profiles → skills/crawl-sim/profiles}/chatgpt-user.json +0 -0
  17. /package/{profiles → skills/crawl-sim/profiles}/claude-searchbot.json +0 -0
  18. /package/{profiles → skills/crawl-sim/profiles}/claude-user.json +0 -0
  19. /package/{profiles → skills/crawl-sim/profiles}/claudebot.json +0 -0
  20. /package/{profiles → skills/crawl-sim/profiles}/googlebot.json +0 -0
  21. /package/{profiles → skills/crawl-sim/profiles}/gptbot.json +0 -0
  22. /package/{profiles → skills/crawl-sim/profiles}/oai-searchbot.json +0 -0
  23. /package/{profiles → skills/crawl-sim/profiles}/perplexity-user.json +0 -0
  24. /package/{profiles → skills/crawl-sim/profiles}/perplexitybot.json +0 -0
  25. /package/{scripts → skills/crawl-sim/scripts}/_lib.sh +0 -0
  26. /package/{scripts → skills/crawl-sim/scripts}/check-robots.sh +0 -0
  27. /package/{scripts → skills/crawl-sim/scripts}/diff-render.sh +0 -0
  28. /package/{scripts → skills/crawl-sim/scripts}/extract-meta.sh +0 -0
package/.claude-plugin/marketplace.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "name": "crawl-sim",
3
+ "owner": {
4
+ "name": "BraedenBDev",
5
+ "url": "https://github.com/BraedenBDev"
6
+ },
7
+ "plugins": [
8
+ {
9
+ "name": "crawl-sim",
10
+ "source": "./",
11
+ "description": "Multi-bot web crawler simulator — audit how Googlebot, GPTBot, ClaudeBot, and PerplexityBot see your site",
12
+ "version": "1.3.0"
13
+ }
14
+ ]
15
+ }
package/.claude-plugin/plugin.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "name": "crawl-sim",
3
+ "version": "1.3.0",
4
+ "description": "Multi-bot web crawler simulator — audit how Googlebot, GPTBot, ClaudeBot, and PerplexityBot see your site",
5
+ "author": {
6
+ "name": "BraedenBDev",
7
+ "url": "https://github.com/BraedenBDev"
8
+ },
9
+ "homepage": "https://github.com/BraedenBDev/crawl-sim#readme",
10
+ "repository": "https://github.com/BraedenBDev/crawl-sim",
11
+ "license": "MIT",
12
+ "keywords": ["seo", "crawler", "ai-visibility", "claude-code-skill", "googlebot", "gptbot", "claudebot", "perplexitybot"]
13
+ }
package/README.md CHANGED
@@ -44,15 +44,20 @@ The concept was validated manually: a curl-as-GPTBot + Claude analysis caught a
44
44
 
45
45
  ## Quick start
46
46
 
47
- ### In Claude Code (recommended)
47
+ ### As a Claude Code plugin (recommended)
48
+
49
+ ```
50
+ /plugin install BraedenBDev/crawl-sim@github
51
+ ```
52
+
53
+ Or add as a marketplace for easy updates:
48
54
 
49
- ```bash
50
- npm install -g @braedenbuilds/crawl-sim
51
- crawl-sim install # → ~/.claude/skills/crawl-sim/
52
- crawl-sim install --project # → .claude/skills/crawl-sim/
55
+ ```
56
+ /plugin marketplace add BraedenBDev/crawl-sim
57
+ /plugin install crawl-sim@crawl-sim
53
58
  ```
54
59
 
55
- Then in Claude Code:
60
+ Then invoke:
56
61
 
57
62
  ```
58
63
  /crawl-sim https://yoursite.com
@@ -60,6 +65,14 @@ Then in Claude Code:
60
65
 
61
66
  Claude runs the full pipeline, interprets the results, and returns a score card plus prioritized findings.
62
67
 
68
+ ### Via npm (alternative)
69
+
70
+ ```bash
71
+ npm install -g @braedenbuilds/crawl-sim
72
+ crawl-sim install # → ~/.claude/skills/crawl-sim/
73
+ crawl-sim install --project # → .claude/skills/crawl-sim/
74
+ ```
75
+
63
76
  > **Why `npm install -g` instead of `npx`?** Recent versions of npx have a known issue linking bins for scoped single-bin packages in ephemeral installs. A persistent global install avoids the problem entirely. The git clone path below is the zero-npm fallback.
64
77
 
65
78
  ### As a standalone CLI
package/bin/install.js CHANGED
@@ -14,6 +14,7 @@ const os = require('os');
14
14
  const { execFileSync } = require('child_process');
15
15
 
16
16
  const SOURCE_DIR = path.resolve(__dirname, '..');
17
+ const SKILL_ROOT = path.resolve(SOURCE_DIR, 'skills', 'crawl-sim');
17
18
  const SKILL_FILES = ['SKILL.md'];
18
19
  const SKILL_DIRS = ['profiles', 'scripts'];
19
20
 
@@ -80,7 +81,9 @@ function install(target) {
80
81
  fs.mkdirSync(target, { recursive: true });
81
82
 
82
83
  for (const file of SKILL_FILES) {
83
- const src = path.join(SOURCE_DIR, file);
84
+ // Look in skills/crawl-sim/ first (canonical), fallback to root (symlink)
85
+ let src = path.join(SKILL_ROOT, file);
86
+ if (!fs.existsSync(src)) src = path.join(SOURCE_DIR, file);
84
87
  const dest = path.join(target, file);
85
88
  if (fs.existsSync(src)) {
86
89
  fs.copyFileSync(src, dest);
@@ -92,7 +95,8 @@ function install(target) {
92
95
  }
93
96
 
94
97
  for (const dir of SKILL_DIRS) {
95
- const src = path.join(SOURCE_DIR, dir);
98
+ let src = path.join(SKILL_ROOT, dir);
99
+ if (!fs.existsSync(src)) src = path.join(SOURCE_DIR, dir);
96
100
  const dest = path.join(target, dir);
97
101
  if (fs.existsSync(src)) {
98
102
  if (fs.existsSync(dest)) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@braedenbuilds/crawl-sim",
3
- "version": "1.1.0",
3
+ "version": "1.3.0",
4
4
  "description": "Agent-native multi-bot web crawler simulator. See your site through the eyes of Googlebot, GPTBot, ClaudeBot, and PerplexityBot.",
5
5
  "bin": {
6
6
  "crawl-sim": "bin/install.js"
@@ -40,9 +40,11 @@
40
40
  },
41
41
  "files": [
42
42
  "bin/",
43
+ "skills/",
44
+ ".claude-plugin/",
43
45
  "SKILL.md",
44
- "profiles/",
45
- "scripts/",
46
+ "profiles",
47
+ "scripts",
46
48
  "README.md",
47
49
  "LICENSE"
48
50
  ]
package/{SKILL.md → skills/crawl-sim/SKILL.md} CHANGED
@@ -40,7 +40,10 @@ command -v curl >/dev/null 2>&1 || { echo "ERROR: curl is required"; exit 1; }
40
40
  command -v jq >/dev/null 2>&1 || { echo "ERROR: jq is required (brew install jq)"; exit 1; }
41
41
  ```
42
42
 
43
- Locate the skill directory: typically `~/.claude/skills/crawl-sim/` or `.claude/skills/crawl-sim/`. Use `$CLAUDE_PLUGIN_ROOT` if set, otherwise find the directory containing this `SKILL.md`.
43
+ Locate the skill directory. Check in this order:
44
+ 1. `$CLAUDE_PLUGIN_ROOT/skills/crawl-sim` (plugin install)
45
+ 2. `~/.claude/skills/crawl-sim/` (global npm install)
46
+ 3. `.claude/skills/crawl-sim/` (project-level install)
44
47
 
45
48
  ## Orchestration — five narrated stages
46
49
 
@@ -48,14 +51,32 @@ Split the work into **five Bash invocations**, each with a clear `description` f
48
51
 
49
52
  ### Stage 1 — Fetch
50
53
 
51
- Tell the user: "Fetching as Googlebot, GPTBot, ClaudeBot, and PerplexityBot..."
54
+ Tell the user: "Fetching as Googlebot, GPTBot, ClaudeBot, and PerplexityBot in parallel..."
52
55
 
53
56
  ```bash
54
- SKILL_DIR="$HOME/.claude/skills/crawl-sim" # or wherever this SKILL.md lives
57
+ # Resolve skill directory
58
+ if [ -n "${CLAUDE_PLUGIN_ROOT:-}" ] && [ -d "$CLAUDE_PLUGIN_ROOT/skills/crawl-sim" ]; then
59
+ SKILL_DIR="$CLAUDE_PLUGIN_ROOT/skills/crawl-sim"
60
+ elif [ -d "$HOME/.claude/skills/crawl-sim" ]; then
61
+ SKILL_DIR="$HOME/.claude/skills/crawl-sim"
62
+ elif [ -d ".claude/skills/crawl-sim" ]; then
63
+ SKILL_DIR=".claude/skills/crawl-sim"
64
+ else
65
+ echo "ERROR: cannot find crawl-sim skill directory" >&2; exit 1
66
+ fi
55
67
  RUN_DIR=$(mktemp -d -t crawl-sim.XXXXXX)
56
68
  URL="<user-provided-url>"
57
69
  for bot in googlebot gptbot claudebot perplexitybot; do
58
- "$SKILL_DIR/scripts/fetch-as-bot.sh" "$URL" "$SKILL_DIR/profiles/${bot}.json" > "$RUN_DIR/fetch-${bot}.json"
70
+ "$SKILL_DIR/scripts/fetch-as-bot.sh" "$URL" "$SKILL_DIR/profiles/${bot}.json" > "$RUN_DIR/fetch-${bot}.json" &
71
+ done
72
+ wait
73
+
74
+ # Verify no empty fetch files (guard against silent parallel failures)
75
+ for bot in googlebot gptbot claudebot perplexitybot; do
76
+ if [ ! -s "$RUN_DIR/fetch-${bot}.json" ]; then
77
+ echo "WARNING: fetch-${bot}.json is empty — retrying serially" >&2
78
+ "$SKILL_DIR/scripts/fetch-as-bot.sh" "$URL" "$SKILL_DIR/profiles/${bot}.json" > "$RUN_DIR/fetch-${bot}.json"
79
+ fi
59
80
  done
60
81
  ```
61
82
 
@@ -112,7 +133,7 @@ Tell the user: "Computing per-bot scores and finalizing the report..."
112
133
 
113
134
  ```bash
114
135
  "$SKILL_DIR/scripts/compute-score.sh" "$RUN_DIR" > "$RUN_DIR/score.json"
115
- cp "$RUN_DIR/score.json" ./crawl-sim-report.json
136
+ "$SKILL_DIR/scripts/build-report.sh" "$RUN_DIR" > ./crawl-sim-report.json
116
137
  ```
117
138
 
118
139
  **Page-type awareness.** `compute-score.sh` derives a page type from the target URL (`root` / `detail` / `archive` / `faq` / `about` / `contact` / `generic`) and picks a schema rubric accordingly. Root pages are expected to ship `Organization` + `WebSite` — penalizing them for missing `BreadcrumbList` or `FAQPage` would be wrong, so the scorer doesn't. If the URL heuristic picks the wrong type (e.g., a homepage at `/en/` that URL-parses as generic), pass `--page-type <type>`:
@@ -150,6 +171,18 @@ Print a boxed score card to the terminal:
150
171
 
151
172
  Progress bars are 20 chars wide using `█` and `░` (each char = 5%).
152
173
 
174
+ **Parity-aware display.** When `parity.score >= 95` AND all per-bot composite scores are within 5 points of each other, collapse the four bot rows into one:
175
+
176
+ ```
177
+ ║ All 4 bots 98 A ███████████████████░ (parity: content identical) ║
178
+ ```
179
+
180
+ Only show individual bot rows when scores diverge — that's when per-bot detail adds information. Always show the parity line in the category breakdown:
181
+
182
+ ```
183
+ ║ Content Parity 100 A (all bots see the same content) ║
184
+ ```
185
+
153
186
  ## Output Layer 2 — Narrative Audit
154
187
 
155
188
  Lead with a **Bot differentiation summary** — state up front whether the bots scored the same or differently, and why. If they scored the same, explicitly say so:
package/skills/crawl-sim/scripts/build-report.sh ADDED
@@ -0,0 +1,45 @@
1
+ #!/usr/bin/env bash
2
+ set -eu
3
+
4
+ # build-report.sh — Consolidate all crawl-sim outputs into a single JSON report
5
+ # Usage: build-report.sh <results-dir>
6
+ # Output: JSON to stdout
7
+
8
+ RESULTS_DIR="${1:?Usage: build-report.sh <results-dir>}"
9
+
10
+ if [ ! -f "$RESULTS_DIR/score.json" ]; then
11
+ echo "Error: score.json not found in $RESULTS_DIR — run compute-score.sh first" >&2
12
+ exit 1
13
+ fi
14
+
15
+ SCORE=$(cat "$RESULTS_DIR/score.json")
16
+
17
+ # Collect per-bot raw data
18
+ PER_BOT="{}"
19
+ for f in "$RESULTS_DIR"/fetch-*.json; do
20
+ [ -f "$f" ] || continue
21
+ bot_id=$(basename "$f" .json | sed 's/^fetch-//')
22
+
23
+ BOT_RAW=$(jq -n \
24
+ --argjson fetch "$(jq '{status, timing, size, wordCount, redirectCount, finalUrl, redirectChain, fetchFailed, error}' "$f" 2>/dev/null || echo '{}')" \
25
+ --argjson meta "$(jq '.' "$RESULTS_DIR/meta-$bot_id.json" 2>/dev/null || echo '{}')" \
26
+ --argjson jsonld "$(jq '{blockCount, types, blocks}' "$RESULTS_DIR/jsonld-$bot_id.json" 2>/dev/null || echo '{}')" \
27
+ --argjson links "$(jq '.' "$RESULTS_DIR/links-$bot_id.json" 2>/dev/null || echo '{}')" \
28
+ --argjson robots "$(jq '.' "$RESULTS_DIR/robots-$bot_id.json" 2>/dev/null || echo '{}')" \
29
+ '{fetch: $fetch, meta: $meta, jsonld: $jsonld, links: $links, robots: $robots}')
30
+
31
+ PER_BOT=$(printf '%s' "$PER_BOT" | jq --argjson raw "$BOT_RAW" --arg id "$bot_id" '.[$id] = $raw')
32
+ done
33
+
34
+ # Collect independent (non-per-bot) data
35
+ INDEPENDENT=$(jq -n \
36
+ --argjson sitemap "$(jq '.' "$RESULTS_DIR/sitemap.json" 2>/dev/null || echo '{}')" \
37
+ --argjson llmstxt "$(jq '.' "$RESULTS_DIR/llmstxt.json" 2>/dev/null || echo '{}')" \
38
+ --argjson diffRender "$(jq '.' "$RESULTS_DIR/diff-render.json" 2>/dev/null || echo '{"skipped":true,"reason":"not_found"}')" \
39
+ '{sitemap: $sitemap, llmstxt: $llmstxt, diffRender: $diffRender}')
40
+
41
+ # Merge score + raw data
42
+ printf '%s' "$SCORE" | jq \
43
+ --argjson perBot "$PER_BOT" \
44
+ --argjson independent "$INDEPENDENT" \
45
+ '. + {raw: {perBot: $perBot, independent: $independent}}'
package/{scripts → skills/crawl-sim/scripts}/check-llmstxt.sh CHANGED
@@ -79,8 +79,12 @@ LLMS_FULL_HAS_TITLE=$HAS_TITLE
79
79
  LLMS_FULL_HAS_DESC=$HAS_DESCRIPTION
80
80
  LLMS_FULL_URLS=$URL_COUNT
81
81
 
82
+ TOP_EXISTS=false
83
+ [ "$LLMS_EXISTS" = "true" ] || [ "$LLMS_FULL_EXISTS" = "true" ] && TOP_EXISTS=true
84
+
82
85
  jq -n \
83
86
  --arg url "$URL" \
87
+ --argjson topExists "$TOP_EXISTS" \
84
88
  --arg llmsUrl "${ORIGIN}/llms.txt" \
85
89
  --arg llmsFullUrl "${ORIGIN}/llms-full.txt" \
86
90
  --argjson llmsExists "$LLMS_EXISTS" \
@@ -96,6 +100,7 @@ jq -n \
96
100
  --argjson llmsFullUrls "$LLMS_FULL_URLS" \
97
101
  '{
98
102
  url: $url,
103
+ exists: $topExists,
99
104
  llmsTxt: {
100
105
  url: $llmsUrl,
101
106
  exists: $llmsExists,
package/{scripts → skills/crawl-sim/scripts}/check-sitemap.sh CHANGED
@@ -25,6 +25,7 @@ CONTAINS_TARGET=false
25
25
  HAS_LASTMOD=false
26
26
  IS_INDEX=false
27
27
  CHILD_SITEMAP_COUNT=0
28
+ SAMPLE_URLS="[]"
28
29
 
29
30
  if [ "$HTTP_STATUS" = "200" ] && [ -s "$SITEMAP_FILE" ]; then
30
31
  # Check if content looks like XML (not HTML fallback)
@@ -43,6 +44,12 @@ if [ "$HTTP_STATUS" = "200" ] && [ -s "$SITEMAP_FILE" ]; then
43
44
  # Count <loc> tags (URLs, or child sitemaps in an index)
44
45
  URL_COUNT=$(grep -oE '<loc>' "$SITEMAP_FILE" | wc -l | tr -d ' ')
45
46
 
47
+ # Extract first 10 <loc> URLs as sample
48
+ SAMPLE_URLS=$(grep -oE '<loc>[^<]+</loc>' "$SITEMAP_FILE" \
49
+ | sed -E 's/<\/?loc>//g' \
50
+ | head -10 \
51
+ | jq -R . | jq -s .)
52
+
46
53
  # Check if target URL appears anywhere in the sitemap
47
54
  # Match both with and without trailing slash
48
55
  URL_NO_TRAILING=$(printf '%s' "$URL" | sed -E 's#/$##')
@@ -67,6 +74,7 @@ jq -n \
67
74
  --argjson childSitemapCount "$CHILD_SITEMAP_COUNT" \
68
75
  --argjson containsTarget "$CONTAINS_TARGET" \
69
76
  --argjson hasLastmod "$HAS_LASTMOD" \
77
+ --argjson sampleUrls "$SAMPLE_URLS" \
70
78
  '{
71
79
  url: $url,
72
80
  sitemapUrl: $sitemapUrl,
@@ -75,5 +83,6 @@ jq -n \
75
83
  urlCount: $urlCount,
76
84
  childSitemapCount: $childSitemapCount,
77
85
  containsTarget: $containsTarget,
78
- hasLastmod: $hasLastmod
86
+ hasLastmod: $hasLastmod,
87
+ sampleUrls: $sampleUrls
79
88
  }'
package/{scripts → skills/crawl-sim/scripts}/compute-score.sh CHANGED
@@ -21,6 +21,8 @@ set -eu
21
21
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
22
22
  # shellcheck source=_lib.sh
23
23
  . "$SCRIPT_DIR/_lib.sh"
24
+ # shellcheck source=schema-fields.sh
25
+ . "$SCRIPT_DIR/schema-fields.sh"
24
26
 
25
27
  PAGE_TYPE_OVERRIDE=""
26
28
  while [ $# -gt 0 ]; do
@@ -276,12 +278,50 @@ for bot_id in $BOTS; do
276
278
  ROBOTS="$RESULTS_DIR/robots-$bot_id.json"
277
279
 
278
280
  BOT_NAME=$(jget "$FETCH" '.bot.name' "$bot_id")
279
- STATUS=$(jget_num "$FETCH" '.status')
280
- TOTAL_TIME=$(jget_num "$FETCH" '.timing.total')
281
- SERVER_WORD_COUNT=$(jget_num "$FETCH" '.wordCount')
282
- RENDERS_JS=$(jq -r '.bot.rendersJavaScript | if . == null then "unknown" else tostring end' "$FETCH" 2>/dev/null || echo "unknown")
283
281
 
284
- ROBOTS_ALLOWED=$(jget_bool "$ROBOTS" '.allowed')
282
+ # Check for fetch failure — skip scoring, emit F grade (AC-A3)
283
+ FETCH_FAILED=$(jget_bool "$FETCH" '.fetchFailed')
284
+ if [ "$FETCH_FAILED" = "true" ]; then
285
+ FETCH_ERROR=$(jget "$FETCH" '.error' "unknown error")
286
+ RENDERS_JS=$(jq -r '.bot.rendersJavaScript | if . == null then "unknown" else tostring end' "$FETCH" 2>/dev/null || echo "unknown")
287
+ BOT_OBJ=$(jq -n \
288
+ --arg id "$bot_id" \
289
+ --arg name "$BOT_NAME" \
290
+ --arg rendersJs "$RENDERS_JS" \
291
+ --arg error "$FETCH_ERROR" \
292
+ '{
293
+ id: $id,
294
+ name: $name,
295
+ rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end),
296
+ fetchFailed: true,
297
+ error: $error,
298
+ score: 0,
299
+ grade: "F",
300
+ visibility: { serverWords: 0, effectiveWords: 0, missedWordsVsRendered: 0, hydrationPenaltyPts: 0 },
301
+ categories: {
302
+ accessibility: { score: 0, grade: "F" },
303
+ contentVisibility: { score: 0, grade: "F" },
304
+ structuredData: { score: 0, grade: "F", pageType: "unknown", expected: [], optional: [], forbidden: [], present: [], missing: [], extras: [], violations: [{ kind: "fetch_failed", impact: -100 }], calculation: "fetch failed — no data to score", notes: ("Fetch failed: " + $error) },
305
+ technicalSignals: { score: 0, grade: "F" },
306
+ aiReadiness: { score: 0, grade: "F" }
307
+ }
308
+ }')
309
+ BOTS_JSON=$(printf '%s' "$BOTS_JSON" | jq --argjson bot "$BOT_OBJ" --arg id "$bot_id" '.[$id] = $bot')
310
+ printf '[compute-score] %s: fetch failed, scoring as F\n' "$bot_id" >&2
311
+ CAT_N=$((CAT_N + 1))
312
+ continue
313
+ fi
314
+
315
+ # Batch-read fields from fetch file (1 jq call instead of 4)
316
+ read -r STATUS TOTAL_TIME SERVER_WORD_COUNT RENDERS_JS <<< \
317
+ "$(jq -r '[
318
+ (.status // 0),
319
+ (.timing.total // 0),
320
+ (.wordCount // 0),
321
+ (.bot.rendersJavaScript | if . == null then "unknown" else tostring end)
322
+ ] | @tsv' "$FETCH" 2>/dev/null || echo "0 0 0 unknown")"
323
+
324
+ ROBOTS_ALLOWED=$(jq -r '.allowed // false | tostring' "$ROBOTS" 2>/dev/null || echo "false")
285
325
 
286
326
  EFFECTIVE_WORD_COUNT=$SERVER_WORD_COUNT
287
327
  HYDRATION_PENALTY=0
@@ -305,10 +345,15 @@ for bot_id in $BOTS; do
305
345
 
306
346
  # --- Category 1: Accessibility (0-100) ---
307
347
  ACC=0
308
- [ "$ROBOTS_ALLOWED" = "true" ] && ACC=$((ACC + 40))
309
- [ "$STATUS" = "200" ] && ACC=$((ACC + 40))
310
- TIME_SCORE=$(awk -v t="$TOTAL_TIME" 'BEGIN { if (t < 2) print 20; else if (t < 5) print 10; else print 0 }')
311
- ACC=$((ACC + TIME_SCORE))
348
+ if [ "$ROBOTS_ALLOWED" != "true" ]; then
349
+ # R4 critical-fail: robots blocking overrides accessibility to 0/F
350
+ ACC=0
351
+ else
352
+ ACC=$((ACC + 40))
353
+ [ "$STATUS" = "200" ] && ACC=$((ACC + 40))
354
+ TIME_SCORE=$(awk -v t="$TOTAL_TIME" 'BEGIN { if (t < 2) print 20; else if (t < 5) print 10; else print 0 }')
355
+ ACC=$((ACC + TIME_SCORE))
356
+ fi
312
357
 
313
358
  # --- Category 2: Content Visibility (0-100) ---
314
359
  CONTENT=0
@@ -317,18 +362,23 @@ for bot_id in $BOTS; do
317
362
  elif [ "$EFFECTIVE_WORD_COUNT" -ge 50 ]; then CONTENT=$((CONTENT + 10))
318
363
  fi
319
364
 
320
- H1_COUNT=$(jget_num "$META" '.headings.h1.count')
321
- H2_COUNT=$(jget_num "$META" '.headings.h2.count')
365
+ # Batch-read fields from meta + links (1 jq call instead of 4 + 1)
366
+ read -r H1_COUNT H2_COUNT IMG_TOTAL IMG_WITH_ALT <<< \
367
+ "$(jq -r '[
368
+ (.headings.h1.count // 0),
369
+ (.headings.h2.count // 0),
370
+ (.images.total // 0),
371
+ (.images.withAlt // 0)
372
+ ] | @tsv' "$META" 2>/dev/null || echo "0 0 0 0")"
373
+
374
+ INTERNAL_LINKS=$(jq -r 'if (.internal | type) == "number" then .internal else .counts.internal // 0 end' "$LINKS" 2>/dev/null || echo "0")
375
+
322
376
  [ "$H1_COUNT" -ge 1 ] && CONTENT=$((CONTENT + 20))
323
377
  [ "$H2_COUNT" -ge 1 ] && CONTENT=$((CONTENT + 15))
324
378
 
325
- INTERNAL_LINKS=$(jget_num "$LINKS" '.counts.internal')
326
379
  if [ "$INTERNAL_LINKS" -ge 5 ]; then CONTENT=$((CONTENT + 20))
327
380
  elif [ "$INTERNAL_LINKS" -ge 1 ]; then CONTENT=$((CONTENT + 10))
328
381
  fi
329
-
330
- IMG_TOTAL=$(jget_num "$META" '.images.total')
331
- IMG_WITH_ALT=$(jget_num "$META" '.images.withAlt')
332
382
  if [ "$IMG_TOTAL" -eq 0 ]; then
333
383
  CONTENT=$((CONTENT + 15))
334
384
  else
@@ -376,15 +426,42 @@ for bot_id in $BOTS; do
376
426
  [ $VALID_PENALTY -gt 20 ] && VALID_PENALTY=20
377
427
  fi
378
428
 
379
- STRUCTURED=$((BASE + BONUS - FORBID_PENALTY - VALID_PENALTY))
429
+ # Field-level validation (C3): check required fields per schema type
430
+ FIELD_PENALTY=0
431
+ FIELD_VIOLATIONS_JSON="[]"
432
+ BLOCK_COUNT_FOR_FIELDS=0
433
+ if [ -f "$JSONLD" ]; then
434
+ BLOCK_COUNT_FOR_FIELDS=$(jq 'if has("blocks") then .blocks | length else 0 end' "$JSONLD" 2>/dev/null || echo "0")
435
+ fi
436
+ if [ "$BLOCK_COUNT_FOR_FIELDS" -gt 0 ]; then
437
+ i=0
438
+ while [ "$i" -lt "$BLOCK_COUNT_FOR_FIELDS" ]; do
439
+ BLOCK_TYPE=$(jq -r ".blocks[$i].type" "$JSONLD" 2>/dev/null || echo "")
440
+ BLOCK_FIELDS=$(jq -r ".blocks[$i].fields[]?" "$JSONLD" 2>/dev/null | tr '\n' ' ')
441
+ REQUIRED=$(required_fields_for "$BLOCK_TYPE")
442
+ for field in $REQUIRED; do
443
+ # shellcheck disable=SC2086
444
+ if ! list_contains "$field" $BLOCK_FIELDS; then
445
+ FIELD_VIOLATIONS_JSON=$(printf '%s' "$FIELD_VIOLATIONS_JSON" | jq \
446
+ --arg schema "$BLOCK_TYPE" --arg field "$field" \
447
+ '. + [{kind: "missing_required_field", schema: $schema, field: $field, impact: -5, confidence: "high"}]')
448
+ FIELD_PENALTY=$((FIELD_PENALTY + 5))
449
+ fi
450
+ done
451
+ i=$((i + 1))
452
+ done
453
+ fi
454
+ [ $FIELD_PENALTY -gt 30 ] && FIELD_PENALTY=30
455
+
456
+ STRUCTURED=$((BASE + BONUS - FORBID_PENALTY - VALID_PENALTY - FIELD_PENALTY))
380
457
  [ $STRUCTURED -gt 100 ] && STRUCTURED=100
381
458
  [ $STRUCTURED -lt 0 ] && STRUCTURED=0
382
459
 
383
- CALCULATION=$(printf 'base: %d/%d expected present = %d; +%d optional bonus; -%d forbidden penalty; -%d validity penalty; clamp [0,100] = %d' \
460
+ CALCULATION=$(printf 'base: %d/%d expected present = %d; +%d optional bonus; -%d forbidden penalty; -%d validity penalty; -%d field penalty; clamp [0,100] = %d' \
384
461
  "$PRESENT_EXPECTED_COUNT" "$EXPECTED_COUNT" "$BASE" \
385
- "$BONUS" "$FORBID_PENALTY" "$VALID_PENALTY" "$STRUCTURED")
462
+ "$BONUS" "$FORBID_PENALTY" "$VALID_PENALTY" "$FIELD_PENALTY" "$STRUCTURED")
386
463
 
387
- if [ "$STRUCTURED" -ge 100 ] && [ -z "$PRESENT_FORBIDDEN" ] && [ "$VALID_PENALTY" -eq 0 ]; then
464
+ if [ "$STRUCTURED" -ge 100 ] && [ -z "$PRESENT_FORBIDDEN" ] && [ "$VALID_PENALTY" -eq 0 ] && [ "$FIELD_PENALTY" -eq 0 ]; then
388
465
  NOTES="All expected schemas for pageType=$PAGE_TYPE are present. No structured-data action needed."
389
466
  elif [ -n "$MISSING_EXPECTED" ] && [ -z "$PRESENT_FORBIDDEN" ]; then
390
467
  NOTES="Missing expected schemas for pageType=$PAGE_TYPE: $MISSING_EXPECTED. Add these to raise the score."
@@ -392,8 +469,12 @@ for bot_id in $BOTS; do
392
469
  NOTES="Forbidden schemas present for pageType=$PAGE_TYPE: $PRESENT_FORBIDDEN. Remove these (or re-classify the page type with --page-type)."
393
470
  elif [ -n "$PRESENT_FORBIDDEN" ] && [ -n "$MISSING_EXPECTED" ]; then
394
471
  NOTES="Mixed: missing $MISSING_EXPECTED and forbidden present $PRESENT_FORBIDDEN for pageType=$PAGE_TYPE."
395
- else
472
+ elif [ "$FIELD_PENALTY" -gt 0 ]; then
473
+ NOTES="Schemas for pageType=$PAGE_TYPE are present but missing required fields. See violations for details."
474
+ elif [ "$VALID_PENALTY" -gt 0 ]; then
396
475
  NOTES="Score reduced by $VALID_PENALTY pts due to invalid JSON-LD blocks."
476
+ else
477
+ NOTES="Structured data scored for pageType=$PAGE_TYPE."
397
478
  fi
398
479
 
399
480
  STRUCTURED_GRADE=$(grade_for "$STRUCTURED")
@@ -410,6 +491,7 @@ for bot_id in $BOTS; do
410
491
  --arg forbiddenPresent "$PRESENT_FORBIDDEN" \
411
492
  --argjson invalidCount "$JSONLD_INVALID" \
412
493
  --argjson validPenalty "$VALID_PENALTY" \
494
+ --argjson fieldViolations "$FIELD_VIOLATIONS_JSON" \
413
495
  --arg calculation "$CALCULATION" \
414
496
  --arg notes "$NOTES" \
415
497
  '
@@ -425,11 +507,12 @@ for bot_id in $BOTS; do
425
507
  missing: ($missingList | to_arr),
426
508
  extras: ($extrasList | to_arr),
427
509
  violations: (
428
- ($forbiddenPresent | to_arr | map({kind: "forbidden_schema", schema: ., impact: -10}))
510
+ ($forbiddenPresent | to_arr | map({kind: "forbidden_schema", schema: ., impact: -10, confidence: "high"}))
429
511
  + (if $validPenalty > 0
430
- then [{kind: "invalid_jsonld", count: $invalidCount, impact: (0 - $validPenalty)}]
512
+ then [{kind: "invalid_jsonld", count: $invalidCount, impact: (0 - $validPenalty), confidence: "high"}]
431
513
  else []
432
514
  end)
515
+ + $fieldViolations
433
516
  ),
434
517
  calculation: $calculation,
435
518
  notes: $notes
@@ -438,20 +521,24 @@ for bot_id in $BOTS; do
438
521
 
439
522
  # --- Category 4: Technical Signals (0-100) ---
440
523
  TECHNICAL=0
441
- TITLE=$(jget "$META" '.title' "")
442
- DESCRIPTION=$(jget "$META" '.description' "")
443
- CANONICAL=$(jget "$META" '.canonical' "")
444
- OG_TITLE=$(jget "$META" '.og.title' "")
445
- OG_DESC=$(jget "$META" '.og.description' "")
446
-
447
- [ -n "$TITLE" ] && [ "$TITLE" != "null" ] && TECHNICAL=$((TECHNICAL + 25))
448
- [ -n "$DESCRIPTION" ] && [ "$DESCRIPTION" != "null" ] && TECHNICAL=$((TECHNICAL + 25))
449
- [ -n "$CANONICAL" ] && [ "$CANONICAL" != "null" ] && TECHNICAL=$((TECHNICAL + 20))
450
- if [ -n "$OG_TITLE" ] && [ "$OG_TITLE" != "null" ]; then TECHNICAL=$((TECHNICAL + 8)); fi
451
- if [ -n "$OG_DESC" ] && [ "$OG_DESC" != "null" ]; then TECHNICAL=$((TECHNICAL + 7)); fi
452
-
453
- SITEMAP_EXISTS=$(jget_bool "$SITEMAP_FILE" '.exists')
454
- SITEMAP_CONTAINS=$(jget_bool "$SITEMAP_FILE" '.containsTarget')
524
+ # Batch-read meta fields for technical scoring (1 jq call instead of 5)
525
+ IFS=$'\t' read -r TITLE DESCRIPTION CANONICAL OG_TITLE OG_DESC <<< \
526
+ "$(jq -r '[
527
+ (.title // "" | gsub("\t"; " ")),
528
+ (.description // "" | gsub("\t"; " ")),
529
+ (.canonical // "" | gsub("\t"; " ")),
530
+ (.og.title // "" | gsub("\t"; " ")),
531
+ (.og.description // "" | gsub("\t"; " "))
532
+ ] | @tsv' "$META" 2>/dev/null || printf '\t\t\t\t')"
533
+
534
+ [ -n "$TITLE" ] && TECHNICAL=$((TECHNICAL + 25))
535
+ [ -n "$DESCRIPTION" ] && TECHNICAL=$((TECHNICAL + 25))
536
+ [ -n "$CANONICAL" ] && TECHNICAL=$((TECHNICAL + 20))
537
+ [ -n "$OG_TITLE" ] && TECHNICAL=$((TECHNICAL + 8))
538
+ [ -n "$OG_DESC" ] && TECHNICAL=$((TECHNICAL + 7))
539
+
540
+ SITEMAP_EXISTS=$(jq -r '.exists // false | tostring' "$SITEMAP_FILE" 2>/dev/null || echo "false")
541
+ SITEMAP_CONTAINS=$(jq -r '.containsTarget // false | tostring' "$SITEMAP_FILE" 2>/dev/null || echo "false")
455
542
  if [ "$SITEMAP_EXISTS" = "true" ] && [ "$SITEMAP_CONTAINS" = "true" ]; then
456
543
  TECHNICAL=$((TECHNICAL + 15))
457
544
  elif [ "$SITEMAP_EXISTS" = "true" ]; then
@@ -460,10 +547,14 @@ for bot_id in $BOTS; do
460
547
 
461
548
  # --- Category 5: AI Readiness (0-100) ---
462
549
  AI=0
463
- LLMS_EXISTS=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.exists')
464
- LLMS_HAS_TITLE=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.hasTitle')
465
- LLMS_HAS_DESC=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.hasDescription')
466
- LLMS_URLS=$(jget_num "$LLMSTXT_FILE" '.llmsTxt.urlCount')
550
+ # Batch-read llmstxt fields (1 jq call instead of 4)
551
+ read -r LLMS_EXISTS LLMS_HAS_TITLE LLMS_HAS_DESC LLMS_URLS <<< \
552
+ "$(jq -r '[
553
+ (.llmsTxt.exists // false | tostring),
554
+ (.llmsTxt.hasTitle // false | tostring),
555
+ (.llmsTxt.hasDescription // false | tostring),
556
+ (.llmsTxt.urlCount // 0)
557
+ ] | @tsv' "$LLMSTXT_FILE" 2>/dev/null || echo "false false false 0")"
467
558
 
468
559
  if [ "$LLMS_EXISTS" = "true" ]; then
469
560
  AI=$((AI + 40))
@@ -472,7 +563,7 @@ for bot_id in $BOTS; do
472
563
  [ "$LLMS_URLS" -ge 1 ] && AI=$((AI + 6))
473
564
  fi
474
565
  [ "$EFFECTIVE_WORD_COUNT" -ge 200 ] && AI=$((AI + 20))
475
- if [ "$H1_COUNT" -ge 1 ] && [ -n "$DESCRIPTION" ] && [ "$DESCRIPTION" != "null" ]; then
566
+ if [ "$H1_COUNT" -ge 1 ] && [ -n "$DESCRIPTION" ]; then
476
567
  AI=$((AI + 20))
477
568
  fi
478
569
 
@@ -566,6 +657,60 @@ CAT_STRUCTURED_GRADE=$(grade_for "$CAT_STRUCTURED_AVG")
566
657
  CAT_TECHNICAL_GRADE=$(grade_for "$CAT_TECHNICAL_AVG")
567
658
  CAT_AI_GRADE=$(grade_for "$CAT_AI_AVG")
568
659
 
660
+ # --- Cross-bot content parity (C4) ---
661
+ PARITY_MIN_WORDS=999999999
662
+ PARITY_MAX_WORDS=0
663
+ PARITY_BOT_COUNT=0
664
+ for bot_id in $BOTS; do
665
+ FETCH="$RESULTS_DIR/fetch-$bot_id.json"
666
+ P_FETCH_FAILED=$(jget_bool "$FETCH" '.fetchFailed')
667
+ [ "$P_FETCH_FAILED" = "true" ] && continue
668
+ WC=$(jget_num "$FETCH" '.wordCount')
669
+ [ "$WC" -lt "$PARITY_MIN_WORDS" ] && PARITY_MIN_WORDS=$WC
670
+ [ "$WC" -gt "$PARITY_MAX_WORDS" ] && PARITY_MAX_WORDS=$WC
671
+ PARITY_BOT_COUNT=$((PARITY_BOT_COUNT + 1))
672
+ done
673
+
674
+ if [ "$PARITY_BOT_COUNT" -le 1 ]; then
675
+ PARITY_SCORE=100
676
+ PARITY_MAX_DELTA=0
677
+ elif [ "$PARITY_MAX_WORDS" -gt 0 ]; then
678
+ PARITY_SCORE=$(awk -v min="$PARITY_MIN_WORDS" -v max="$PARITY_MAX_WORDS" \
679
+ 'BEGIN { printf "%d", (min / max) * 100 + 0.5 }')
680
+ PARITY_MAX_DELTA=$(awk -v min="$PARITY_MIN_WORDS" -v max="$PARITY_MAX_WORDS" \
681
+ 'BEGIN { printf "%d", ((max - min) / max) * 100 + 0.5 }')
682
+ else
683
+ PARITY_SCORE=100
684
+ PARITY_MAX_DELTA=0
685
+ fi
686
+
687
+ [ "$PARITY_SCORE" -gt 100 ] && PARITY_SCORE=100
688
+ PARITY_GRADE=$(grade_for "$PARITY_SCORE")
689
+
690
+ if [ "$PARITY_SCORE" -ge 95 ]; then
691
+ PARITY_INTERP="Content is consistent across all bots."
692
+ elif [ "$PARITY_SCORE" -ge 50 ]; then
693
+ PARITY_INTERP="Moderate content divergence between bots — likely partial client-side rendering hydration."
694
+ else
695
+ PARITY_INTERP="Severe content divergence — site likely relies on client-side rendering. AI bots see significantly less content than Googlebot."
696
+ fi
697
+
698
+ # --- Warnings (H2) ---
699
+ WARNINGS="[]"
700
+ if [ "$DIFF_AVAILABLE" != "true" ]; then
701
+ DIFF_REASON="not_found"
702
+ if [ -f "$DIFF_RENDER_FILE" ]; then
703
+ DIFF_REASON=$(jq -r '.reason // "skipped"' "$DIFF_RENDER_FILE" 2>/dev/null || echo "skipped")
704
+ fi
705
+ WARNINGS=$(printf '%s' "$WARNINGS" | jq --arg reason "$DIFF_REASON" \
706
+ '. + [{
707
+ code: "diff_render_unavailable",
708
+ severity: "high",
709
+ message: "JS rendering comparison was skipped. If this site uses CSR, non-JS bot scores may be inaccurate.",
710
+ reason: $reason
711
+ }]')
712
+ fi
713
+
569
714
  TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
570
715
 
571
716
  jq -n \
@@ -587,6 +732,13 @@ jq -n \
587
732
  --arg catTechnicalGrade "$CAT_TECHNICAL_GRADE" \
588
733
  --argjson catAi "$CAT_AI_AVG" \
589
734
  --arg catAiGrade "$CAT_AI_GRADE" \
735
+ --argjson warnings "$WARNINGS" \
736
+ --argjson parityScore "$PARITY_SCORE" \
737
+ --arg parityGrade "$PARITY_GRADE" \
738
+ --argjson parityMinWords "$PARITY_MIN_WORDS" \
739
+ --argjson parityMaxWords "$PARITY_MAX_WORDS" \
740
+ --argjson parityMaxDelta "$PARITY_MAX_DELTA" \
741
+ --arg parityInterp "$PARITY_INTERP" \
590
742
  '{
591
743
  url: $url,
592
744
  timestamp: $timestamp,
@@ -594,6 +746,15 @@ jq -n \
594
746
  pageType: $pageType,
595
747
  pageTypeOverridden: ($pageTypeOverride | length > 0),
596
748
  overall: { score: $overallScore, grade: $overallGrade },
749
+ parity: {
750
+ score: $parityScore,
751
+ grade: $parityGrade,
752
+ minWords: (if $parityMinWords >= 999999999 then 0 else $parityMinWords end),
753
+ maxWords: $parityMaxWords,
754
+ maxDeltaPct: $parityMaxDelta,
755
+ interpretation: $parityInterp
756
+ },
757
+ warnings: $warnings,
597
758
  bots: $bots,
598
759
  categories: {
599
760
  accessibility: { score: $catAcc, grade: $catAccGrade },
package/{scripts → skills/crawl-sim/scripts}/extract-jsonld.sh CHANGED
@@ -62,6 +62,7 @@ fi
62
62
 
63
63
  VALID_COUNT=0
64
64
  INVALID_COUNT=0
65
+ BLOCKS_JSON="[]"
65
66
 
66
67
  if [ "$BLOCK_COUNT" -gt 0 ]; then
67
68
  while IFS= read -r block; do
@@ -79,6 +80,15 @@ if [ "$BLOCK_COUNT" -gt 0 ]; then
79
80
  else empty end;
80
81
  collect_types
81
82
  ' 2>/dev/null >> "$TYPES_FILE" || true
83
+
84
+ # Extract per-block type + top-level field names for field validation (AC-B1)
85
+ BLOCK_INFO=$(printf '%s' "$block" | jq -c '
86
+ {
87
+ type: (if has("@type") then (.["@type"] | if type == "array" then .[0] else . end) else "unknown" end),
88
+ fields: (keys | map(select(startswith("@") | not)))
89
+ }
90
+ ' 2>/dev/null || echo '{"type":"unknown","fields":[]}')
91
+ BLOCKS_JSON=$(printf '%s' "$BLOCKS_JSON" | jq --argjson b "$BLOCK_INFO" '. + [$b]')
82
92
  else
83
93
  INVALID_COUNT=$((INVALID_COUNT + 1))
84
94
  fi
@@ -109,6 +119,7 @@ jq -n \
109
119
  --argjson valid "$VALID_COUNT" \
110
120
  --argjson invalid "$INVALID_COUNT" \
111
121
  --argjson types "$TYPES_JSON" \
122
+ --argjson blocks "$BLOCKS_JSON" \
112
123
  --argjson hasOrg "$HAS_ORG" \
113
124
  --argjson hasBreadcrumb "$HAS_BREADCRUMB" \
114
125
  --argjson hasWebsite "$HAS_WEBSITE" \
@@ -121,6 +132,7 @@ jq -n \
121
132
  validCount: $valid,
122
133
  invalidCount: $invalid,
123
134
  types: $types,
135
+ blocks: $blocks,
124
136
  flags: {
125
137
  hasOrganization: $hasOrg,
126
138
  hasBreadcrumbList: $hasBreadcrumb,
@@ -93,11 +93,9 @@ jq -n \
93
93
  --argjson internalSample "$INTERNAL_SAMPLE" \
94
94
  --argjson externalSample "$EXTERNAL_SAMPLE" \
95
95
  '{
96
- counts: {
97
- internal: $internalCount,
98
- external: $externalCount,
99
- total: ($internalCount + $externalCount)
100
- },
101
- internal: $internalSample,
102
- external: $externalSample
96
+ total: ($internalCount + $externalCount),
97
+ internal: $internalCount,
98
+ external: $externalCount,
99
+ internalUrls: $internalSample,
100
+ externalUrls: $externalSample
103
101
  }'
@@ -0,0 +1,151 @@
1
#!/usr/bin/env bash
set -euo pipefail

# fetch-as-bot.sh — Fetch a URL as a specific bot User-Agent
# Usage: fetch-as-bot.sh <url> <profile.json>
#
# Reads .id, .name, .userAgent and .rendersJavaScript from the profile JSON,
# fetches the URL with curl (following redirects, 30 s cap) and prints a single
# JSON object on stdout. Progress and error lines go to stderr.
#
# Success output: url, bot, status, timing, size, wordCount, redirectCount,
# finalUrl, redirectChain, headers, bodyBase64.
# Failure output (curl exited non-zero): the script still exits 0 and emits an
# object with fetchFailed: true, curl's stderr text and exit code, and zeroed
# metrics, so callers can tell "fetch failed" apart from "empty page".

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=_lib.sh
. "$SCRIPT_DIR/_lib.sh"

URL="${1:?Usage: fetch-as-bot.sh <url> <profile.json>}"
PROFILE="${2:?Usage: fetch-as-bot.sh <url> <profile.json>}"

BOT_ID=$(jq -r '.id' "$PROFILE")
BOT_NAME=$(jq -r '.name' "$PROFILE")
UA=$(jq -r '.userAgent' "$PROFILE")
RENDERS_JS=$(jq -r '.rendersJavaScript' "$PROFILE")

TMPDIR="${TMPDIR:-/tmp}"
HEADERS_FILE=$(mktemp "$TMPDIR/crawlsim-headers.XXXXXX")
BODY_FILE=$(mktemp "$TMPDIR/crawlsim-body.XXXXXX")
CURL_STDERR_FILE=$(mktemp "$TMPDIR/crawlsim-stderr.XXXXXX")
trap 'rm -f "$HEADERS_FILE" "$BODY_FILE" "$CURL_STDERR_FILE"' EXIT

printf '[%s] fetching %s\n' "$BOT_ID" "$URL" >&2

# errexit is suspended around curl so a failed fetch can be reported as JSON
# instead of aborting the script.
#
# The write-out is tab-separated rather than hand-built JSON: %{url_effective}
# is not JSON-escaped by curl, so a quote or backslash in the final URL would
# otherwise corrupt every metric parsed from the same string.
set +e
TIMING=$(curl -sS -L \
  -H "User-Agent: $UA" \
  -D "$HEADERS_FILE" \
  -o "$BODY_FILE" \
  -w '%{http_code}\t%{time_total}\t%{time_starttransfer}\t%{size_download}\t%{num_redirects}\t%{url_effective}' \
  --max-time 30 \
  "$URL" 2>"$CURL_STDERR_FILE")
CURL_EXIT=$?
set -e

CURL_ERR=""
if [ -s "$CURL_STDERR_FILE" ]; then
  CURL_ERR=$(cat "$CURL_STDERR_FILE")
fi

if [ "$CURL_EXIT" -ne 0 ]; then
  printf '[%s] FAILED: curl exit %d — %s\n' "$BOT_ID" "$CURL_EXIT" "$CURL_ERR" >&2
  jq -n \
    --arg url "$URL" \
    --arg botId "$BOT_ID" \
    --arg botName "$BOT_NAME" \
    --arg ua "$UA" \
    --arg rendersJs "$RENDERS_JS" \
    --arg error "$CURL_ERR" \
    --argjson exitCode "$CURL_EXIT" \
    '{
      url: $url,
      bot: {
        id: $botId,
        name: $botName,
        userAgent: $ua,
        rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
      },
      fetchFailed: true,
      error: $error,
      curlExitCode: $exitCode,
      status: 0,
      timing: { total: 0, ttfb: 0 },
      size: 0,
      wordCount: 0,
      headers: {},
      bodyBase64: ""
    }'
  exit 0
fi

# FINAL_URL is last so it receives the remainder of the line.
IFS=$'\t' read -r STATUS TOTAL_TIME TTFB SIZE REDIRECT_COUNT FINAL_URL <<< "$TIMING"

# Force base 10 on the integers: curl prints "000" when there is no HTTP status
# (e.g. non-HTTP schemes), which is not a valid JSON number for --argjson.
STATUS=$((10#${STATUS:-0}))
REDIRECT_COUNT=$((10#${REDIRECT_COUNT:-0}))
TOTAL_TIME="${TOTAL_TIME:-0}"
TTFB="${TTFB:-0}"
SIZE="${SIZE:-0}"

# Parse response headers into a JSON object using jq for safe escaping.
# curl -L writes multiple blocks on redirect; jq keeps the last definition
# of each header since `add` overwrites left-to-right.
# grep exits 1 when no line matches; under pipefail that would abort the
# script, so a header-less response is mapped to {} instead.
HEADERS_JSON=$(tr -d '\r' < "$HEADERS_FILE" \
  | { grep -E '^[A-Za-z][A-Za-z0-9-]*:[[:space:]]' || true; } \
  | jq -Rs '
      split("\n")
      | map(select(length > 0))
      | map(capture("^(?<k>[^:]+):[[:space:]]*(?<v>.*)$"))
      | map({(.k): .v})
      | add // {}
    ')

# Parse redirect chain from headers dump.
# curl -D writes multiple HTTP response blocks on redirect — each starts with HTTP/.
# Header names are case-insensitive, so Location is matched in any casing.
REDIRECT_CHAIN="[]"
if [ "$REDIRECT_COUNT" -gt 0 ]; then
  REDIRECT_CHAIN=$(tr -d '\r' < "$HEADERS_FILE" | awk '
    /^HTTP\// { status=$2; url="" }
    tolower($0) ~ /^location:/ { url=$2 }
    /^$/ && status && url { printf "%s %s\n", status, url; status=""; url="" }
  ' | jq -Rs '
    split("\n") | map(select(length > 0)) |
    to_entries | map({
      hop: .key,
      status: (.value | split(" ")[0] | tonumber),
      location: (.value | split(" ")[1:] | join(" "))
    })
  ')
fi

# count_words is not defined in this script; presumably provided by _lib.sh.
WORD_COUNT=$(count_words "$BODY_FILE")
[ -z "$WORD_COUNT" ] && WORD_COUNT=0

printf '[%s] ok: status=%s size=%s words=%s time=%ss\n' "$BOT_ID" "$STATUS" "$SIZE" "$WORD_COUNT" "$TOTAL_TIME" >&2

# The base64 body is streamed to jq on stdin instead of being passed with
# --arg: a single argv string is capped at 128 KiB on Linux, so any page whose
# encoded body exceeds that made jq fail with "Argument list too long".
# An empty body produces empty stdin, which -Rs reads as "".
if [ -s "$BODY_FILE" ]; then
  base64 < "$BODY_FILE"
fi | jq -Rs \
  --arg url "$URL" \
  --arg botId "$BOT_ID" \
  --arg botName "$BOT_NAME" \
  --arg ua "$UA" \
  --arg rendersJs "$RENDERS_JS" \
  --argjson status "$STATUS" \
  --argjson totalTime "$TOTAL_TIME" \
  --argjson ttfb "$TTFB" \
  --argjson size "$SIZE" \
  --argjson wordCount "$WORD_COUNT" \
  --argjson headers "$HEADERS_JSON" \
  --argjson redirectCount "$REDIRECT_COUNT" \
  --arg finalUrl "$FINAL_URL" \
  --argjson redirectChain "$REDIRECT_CHAIN" \
  '{
    url: $url,
    bot: {
      id: $botId,
      name: $botName,
      userAgent: $ua,
      rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
    },
    status: $status,
    timing: { total: $totalTime, ttfb: $ttfb },
    size: $size,
    wordCount: $wordCount,
    redirectCount: $redirectCount,
    finalUrl: $finalUrl,
    redirectChain: $redirectChain,
    headers: $headers,
    bodyBase64: (. | rtrimstr("\n"))
  }'
@@ -0,0 +1,25 @@
1
#!/usr/bin/env bash
# schema-fields.sh — Required field definitions per schema.org type.
# Source this file, then call required_fields_for <SchemaType>.

# required_fields_for <SchemaType>
# Prints the space-separated list of required top-level field names for the
# given schema.org type on one line. Types with no definition here print an
# empty line. The match is exact and case-sensitive.
required_fields_for() {
  local schema_type="$1"
  local required=""

  # Types that share the same required set are grouped into one pattern.
  case "$schema_type" in
    Organization | WebSite)
      required="name url"
      ;;
    Article | NewsArticle)
      required="headline author datePublished"
      ;;
    FAQPage)
      required="mainEntity"
      ;;
    BreadcrumbList | ItemList)
      required="itemListElement"
      ;;
    CollectionPage | AboutPage | ContactPage | Product | ProfessionalService | Person)
      required="name"
      ;;
    LocalBusiness)
      required="name address"
      ;;
    ImageObject)
      required="contentUrl"
      ;;
    PostalAddress)
      required="streetAddress"
      ;;
  esac

  printf '%s\n' "$required"
}
@@ -1,87 +0,0 @@
1
- #!/usr/bin/env bash
2
- set -euo pipefail
3
-
4
- # fetch-as-bot.sh — Fetch a URL as a specific bot User-Agent
5
- # Usage: fetch-as-bot.sh <url> <profile.json>
6
-
7
- SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
8
- # shellcheck source=_lib.sh
9
- . "$SCRIPT_DIR/_lib.sh"
10
-
11
- URL="${1:?Usage: fetch-as-bot.sh <url> <profile.json>}"
12
- PROFILE="${2:?Usage: fetch-as-bot.sh <url> <profile.json>}"
13
-
14
- BOT_ID=$(jq -r '.id' "$PROFILE")
15
- BOT_NAME=$(jq -r '.name' "$PROFILE")
16
- UA=$(jq -r '.userAgent' "$PROFILE")
17
- RENDERS_JS=$(jq -r '.rendersJavaScript' "$PROFILE")
18
-
19
- printf '[fetch-as-bot] %s <- %s\n' "$BOT_NAME" "$URL" >&2
20
-
21
- TMPDIR="${TMPDIR:-/tmp}"
22
- HEADERS_FILE=$(mktemp "$TMPDIR/crawlsim-headers.XXXXXX")
23
- BODY_FILE=$(mktemp "$TMPDIR/crawlsim-body.XXXXXX")
24
- trap 'rm -f "$HEADERS_FILE" "$BODY_FILE"' EXIT
25
-
26
- TIMING=$(curl -sS -L \
27
- -H "User-Agent: $UA" \
28
- -D "$HEADERS_FILE" \
29
- -o "$BODY_FILE" \
30
- -w '{"total":%{time_total},"ttfb":%{time_starttransfer},"connect":%{time_connect},"statusCode":%{http_code},"sizeDownload":%{size_download}}' \
31
- --max-time 30 \
32
- "$URL" 2>/dev/null || echo '{"total":0,"ttfb":0,"connect":0,"statusCode":0,"sizeDownload":0}')
33
-
34
- STATUS=$(echo "$TIMING" | jq -r '.statusCode')
35
- TOTAL_TIME=$(echo "$TIMING" | jq -r '.total')
36
- TTFB=$(echo "$TIMING" | jq -r '.ttfb')
37
- SIZE=$(echo "$TIMING" | jq -r '.sizeDownload')
38
-
39
- # Parse response headers into a JSON object using jq for safe escaping.
40
- # curl -L writes multiple blocks on redirect; jq keeps the last definition
41
- # of each header since `add` overwrites left-to-right.
42
- HEADERS_JSON=$(tr -d '\r' < "$HEADERS_FILE" \
43
- | grep -E '^[A-Za-z][A-Za-z0-9-]*:[[:space:]]' \
44
- | jq -Rs '
45
- split("\n")
46
- | map(select(length > 0))
47
- | map(capture("^(?<k>[^:]+):[[:space:]]*(?<v>.*)$"))
48
- | map({(.k): .v})
49
- | add // {}
50
- ')
51
-
52
- WORD_COUNT=$(count_words "$BODY_FILE")
53
- [ -z "$WORD_COUNT" ] && WORD_COUNT=0
54
-
55
- BODY_B64=""
56
- if [ -s "$BODY_FILE" ]; then
57
- BODY_B64=$(base64 < "$BODY_FILE")
58
- fi
59
-
60
- jq -n \
61
- --arg url "$URL" \
62
- --arg botId "$BOT_ID" \
63
- --arg botName "$BOT_NAME" \
64
- --arg ua "$UA" \
65
- --arg rendersJs "$RENDERS_JS" \
66
- --argjson status "$STATUS" \
67
- --argjson totalTime "$TOTAL_TIME" \
68
- --argjson ttfb "$TTFB" \
69
- --argjson size "$SIZE" \
70
- --argjson wordCount "$WORD_COUNT" \
71
- --argjson headers "$HEADERS_JSON" \
72
- --arg bodyBase64 "$BODY_B64" \
73
- '{
74
- url: $url,
75
- bot: {
76
- id: $botId,
77
- name: $botName,
78
- userAgent: $ua,
79
- rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
80
- },
81
- status: $status,
82
- timing: { total: $totalTime, ttfb: $ttfb },
83
- size: $size,
84
- wordCount: $wordCount,
85
- headers: $headers,
86
- bodyBase64: $bodyBase64
87
- }'
File without changes