@braedenbuilds/crawl-sim 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. package/.claude-plugin/marketplace.json +15 -0
  2. package/.claude-plugin/plugin.json +13 -0
  3. package/README.md +19 -6
  4. package/bin/install.js +6 -2
  5. package/package.json +5 -3
  6. package/{SKILL.md → skills/crawl-sim/SKILL.md} +14 -2
  7. package/{scripts → skills/crawl-sim/scripts}/compute-score.sh +144 -5
  8. package/{scripts → skills/crawl-sim/scripts}/extract-jsonld.sh +12 -0
  9. package/skills/crawl-sim/scripts/fetch-as-bot.sh +151 -0
  10. package/skills/crawl-sim/scripts/schema-fields.sh +25 -0
  11. package/scripts/fetch-as-bot.sh +0 -87
  12. /package/{profiles → skills/crawl-sim/profiles}/chatgpt-user.json +0 -0
  13. /package/{profiles → skills/crawl-sim/profiles}/claude-searchbot.json +0 -0
  14. /package/{profiles → skills/crawl-sim/profiles}/claude-user.json +0 -0
  15. /package/{profiles → skills/crawl-sim/profiles}/claudebot.json +0 -0
  16. /package/{profiles → skills/crawl-sim/profiles}/googlebot.json +0 -0
  17. /package/{profiles → skills/crawl-sim/profiles}/gptbot.json +0 -0
  18. /package/{profiles → skills/crawl-sim/profiles}/oai-searchbot.json +0 -0
  19. /package/{profiles → skills/crawl-sim/profiles}/perplexity-user.json +0 -0
  20. /package/{profiles → skills/crawl-sim/profiles}/perplexitybot.json +0 -0
  21. /package/{scripts → skills/crawl-sim/scripts}/_lib.sh +0 -0
  22. /package/{scripts → skills/crawl-sim/scripts}/check-llmstxt.sh +0 -0
  23. /package/{scripts → skills/crawl-sim/scripts}/check-robots.sh +0 -0
  24. /package/{scripts → skills/crawl-sim/scripts}/check-sitemap.sh +0 -0
  25. /package/{scripts → skills/crawl-sim/scripts}/diff-render.sh +0 -0
  26. /package/{scripts → skills/crawl-sim/scripts}/extract-links.sh +0 -0
  27. /package/{scripts → skills/crawl-sim/scripts}/extract-meta.sh +0 -0
@@ -0,0 +1,15 @@
1
+ {
2
+ "name": "crawl-sim",
3
+ "owner": {
4
+ "name": "BraedenBDev",
5
+ "url": "https://github.com/BraedenBDev"
6
+ },
7
+ "plugins": [
8
+ {
9
+ "name": "crawl-sim",
10
+ "source": "./",
11
+ "description": "Multi-bot web crawler simulator — audit how Googlebot, GPTBot, ClaudeBot, and PerplexityBot see your site",
12
+ "version": "1.2.0"
13
+ }
14
+ ]
15
+ }
@@ -0,0 +1,13 @@
1
+ {
2
+ "name": "crawl-sim",
3
+ "version": "1.2.0",
4
+ "description": "Multi-bot web crawler simulator — audit how Googlebot, GPTBot, ClaudeBot, and PerplexityBot see your site",
5
+ "author": {
6
+ "name": "BraedenBDev",
7
+ "url": "https://github.com/BraedenBDev"
8
+ },
9
+ "homepage": "https://github.com/BraedenBDev/crawl-sim#readme",
10
+ "repository": "https://github.com/BraedenBDev/crawl-sim",
11
+ "license": "MIT",
12
+ "keywords": ["seo", "crawler", "ai-visibility", "claude-code-skill", "googlebot", "gptbot", "claudebot", "perplexitybot"]
13
+ }
package/README.md CHANGED
@@ -44,15 +44,20 @@ The concept was validated manually: a curl-as-GPTBot + Claude analysis caught a
44
44
 
45
45
  ## Quick start
46
46
 
47
- ### In Claude Code (recommended)
47
+ ### As a Claude Code plugin (recommended)
48
+
49
+ ```
50
+ /plugin install BraedenBDev/crawl-sim@github
51
+ ```
52
+
53
+ Or add as a marketplace for easy updates:
48
54
 
49
- ```bash
50
- npm install -g @braedenbuilds/crawl-sim
51
- crawl-sim install # → ~/.claude/skills/crawl-sim/
52
- crawl-sim install --project # → .claude/skills/crawl-sim/
55
+ ```
56
+ /plugin marketplace add BraedenBDev/crawl-sim
57
+ /plugin install crawl-sim@crawl-sim
53
58
  ```
54
59
 
55
- Then in Claude Code:
60
+ Then invoke:
56
61
 
57
62
  ```
58
63
  /crawl-sim https://yoursite.com
@@ -60,6 +65,14 @@ Then in Claude Code:
60
65
 
61
66
  Claude runs the full pipeline, interprets the results, and returns a score card plus prioritized findings.
62
67
 
68
+ ### Via npm (alternative)
69
+
70
+ ```bash
71
+ npm install -g @braedenbuilds/crawl-sim
72
+ crawl-sim install # → ~/.claude/skills/crawl-sim/
73
+ crawl-sim install --project # → .claude/skills/crawl-sim/
74
+ ```
75
+
63
76
  > **Why `npm install -g` instead of `npx`?** Recent versions of npx have a known issue linking bins for scoped single-bin packages in ephemeral installs. A persistent global install avoids the problem entirely. The git clone path below is the zero-npm fallback.
64
77
 
65
78
  ### As a standalone CLI
package/bin/install.js CHANGED
@@ -14,6 +14,7 @@ const os = require('os');
14
14
  const { execFileSync } = require('child_process');
15
15
 
16
16
  const SOURCE_DIR = path.resolve(__dirname, '..');
17
+ const SKILL_ROOT = path.resolve(SOURCE_DIR, 'skills', 'crawl-sim');
17
18
  const SKILL_FILES = ['SKILL.md'];
18
19
  const SKILL_DIRS = ['profiles', 'scripts'];
19
20
 
@@ -80,7 +81,9 @@ function install(target) {
80
81
  fs.mkdirSync(target, { recursive: true });
81
82
 
82
83
  for (const file of SKILL_FILES) {
83
- const src = path.join(SOURCE_DIR, file);
84
+ // Look in skills/crawl-sim/ first (canonical), fallback to root (symlink)
85
+ let src = path.join(SKILL_ROOT, file);
86
+ if (!fs.existsSync(src)) src = path.join(SOURCE_DIR, file);
84
87
  const dest = path.join(target, file);
85
88
  if (fs.existsSync(src)) {
86
89
  fs.copyFileSync(src, dest);
@@ -92,7 +95,8 @@ function install(target) {
92
95
  }
93
96
 
94
97
  for (const dir of SKILL_DIRS) {
95
- const src = path.join(SOURCE_DIR, dir);
98
+ let src = path.join(SKILL_ROOT, dir);
99
+ if (!fs.existsSync(src)) src = path.join(SOURCE_DIR, dir);
96
100
  const dest = path.join(target, dir);
97
101
  if (fs.existsSync(src)) {
98
102
  if (fs.existsSync(dest)) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@braedenbuilds/crawl-sim",
3
- "version": "1.1.0",
3
+ "version": "1.2.0",
4
4
  "description": "Agent-native multi-bot web crawler simulator. See your site through the eyes of Googlebot, GPTBot, ClaudeBot, and PerplexityBot.",
5
5
  "bin": {
6
6
  "crawl-sim": "bin/install.js"
@@ -40,9 +40,11 @@
40
40
  },
41
41
  "files": [
42
42
  "bin/",
43
+ "skills/",
44
+ ".claude-plugin/",
43
45
  "SKILL.md",
44
- "profiles/",
45
- "scripts/",
46
+ "profiles",
47
+ "scripts",
46
48
  "README.md",
47
49
  "LICENSE"
48
50
  ]
@@ -40,7 +40,10 @@ command -v curl >/dev/null 2>&1 || { echo "ERROR: curl is required"; exit 1; }
40
40
  command -v jq >/dev/null 2>&1 || { echo "ERROR: jq is required (brew install jq)"; exit 1; }
41
41
  ```
42
42
 
43
- Locate the skill directory: typically `~/.claude/skills/crawl-sim/` or `.claude/skills/crawl-sim/`. Use `$CLAUDE_PLUGIN_ROOT` if set, otherwise find the directory containing this `SKILL.md`.
43
+ Locate the skill directory. Check in this order:
44
+ 1. `$CLAUDE_PLUGIN_ROOT/skills/crawl-sim` (plugin install)
45
+ 2. `~/.claude/skills/crawl-sim/` (global npm install)
46
+ 3. `.claude/skills/crawl-sim/` (project-level install)
44
47
 
45
48
  ## Orchestration — five narrated stages
46
49
 
@@ -51,7 +54,16 @@ Split the work into **five Bash invocations**, each with a clear `description` f
51
54
  Tell the user: "Fetching as Googlebot, GPTBot, ClaudeBot, and PerplexityBot..."
52
55
 
53
56
  ```bash
54
- SKILL_DIR="$HOME/.claude/skills/crawl-sim" # or wherever this SKILL.md lives
57
+ # Resolve skill directory
58
+ if [ -n "${CLAUDE_PLUGIN_ROOT:-}" ] && [ -d "$CLAUDE_PLUGIN_ROOT/skills/crawl-sim" ]; then
59
+ SKILL_DIR="$CLAUDE_PLUGIN_ROOT/skills/crawl-sim"
60
+ elif [ -d "$HOME/.claude/skills/crawl-sim" ]; then
61
+ SKILL_DIR="$HOME/.claude/skills/crawl-sim"
62
+ elif [ -d ".claude/skills/crawl-sim" ]; then
63
+ SKILL_DIR=".claude/skills/crawl-sim"
64
+ else
65
+ echo "ERROR: cannot find crawl-sim skill directory" >&2; exit 1
66
+ fi
55
67
  RUN_DIR=$(mktemp -d -t crawl-sim.XXXXXX)
56
68
  URL="<user-provided-url>"
57
69
  for bot in googlebot gptbot claudebot perplexitybot; do
@@ -21,6 +21,8 @@ set -eu
21
21
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
22
22
  # shellcheck source=_lib.sh
23
23
  . "$SCRIPT_DIR/_lib.sh"
24
+ # shellcheck source=schema-fields.sh
25
+ . "$SCRIPT_DIR/schema-fields.sh"
24
26
 
25
27
  PAGE_TYPE_OVERRIDE=""
26
28
  while [ $# -gt 0 ]; do
@@ -276,6 +278,40 @@ for bot_id in $BOTS; do
276
278
  ROBOTS="$RESULTS_DIR/robots-$bot_id.json"
277
279
 
278
280
  BOT_NAME=$(jget "$FETCH" '.bot.name' "$bot_id")
281
+
282
+ # Check for fetch failure — skip scoring, emit F grade (AC-A3)
283
+ FETCH_FAILED=$(jget_bool "$FETCH" '.fetchFailed')
284
+ if [ "$FETCH_FAILED" = "true" ]; then
285
+ FETCH_ERROR=$(jget "$FETCH" '.error' "unknown error")
286
+ RENDERS_JS=$(jq -r '.bot.rendersJavaScript | if . == null then "unknown" else tostring end' "$FETCH" 2>/dev/null || echo "unknown")
287
+ BOT_OBJ=$(jq -n \
288
+ --arg id "$bot_id" \
289
+ --arg name "$BOT_NAME" \
290
+ --arg rendersJs "$RENDERS_JS" \
291
+ --arg error "$FETCH_ERROR" \
292
+ '{
293
+ id: $id,
294
+ name: $name,
295
+ rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end),
296
+ fetchFailed: true,
297
+ error: $error,
298
+ score: 0,
299
+ grade: "F",
300
+ visibility: { serverWords: 0, effectiveWords: 0, missedWordsVsRendered: 0, hydrationPenaltyPts: 0 },
301
+ categories: {
302
+ accessibility: { score: 0, grade: "F" },
303
+ contentVisibility: { score: 0, grade: "F" },
304
+ structuredData: { score: 0, grade: "F", pageType: "unknown", expected: [], optional: [], forbidden: [], present: [], missing: [], extras: [], violations: [{ kind: "fetch_failed", impact: -100 }], calculation: "fetch failed — no data to score", notes: ("Fetch failed: " + $error) },
305
+ technicalSignals: { score: 0, grade: "F" },
306
+ aiReadiness: { score: 0, grade: "F" }
307
+ }
308
+ }')
309
+ BOTS_JSON=$(printf '%s' "$BOTS_JSON" | jq --argjson bot "$BOT_OBJ" --arg id "$bot_id" '.[$id] = $bot')
310
+ printf '[compute-score] %s: fetch failed, scoring as F\n' "$bot_id" >&2
311
+ CAT_N=$((CAT_N + 1))
312
+ continue
313
+ fi
314
+
279
315
  STATUS=$(jget_num "$FETCH" '.status')
280
316
  TOTAL_TIME=$(jget_num "$FETCH" '.timing.total')
281
317
  SERVER_WORD_COUNT=$(jget_num "$FETCH" '.wordCount')
@@ -376,15 +412,42 @@ for bot_id in $BOTS; do
376
412
  [ $VALID_PENALTY -gt 20 ] && VALID_PENALTY=20
377
413
  fi
378
414
 
379
- STRUCTURED=$((BASE + BONUS - FORBID_PENALTY - VALID_PENALTY))
415
+ # Field-level validation (C3): check required fields per schema type
416
+ FIELD_PENALTY=0
417
+ FIELD_VIOLATIONS_JSON="[]"
418
+ BLOCK_COUNT_FOR_FIELDS=0
419
+ if [ -f "$JSONLD" ]; then
420
+ BLOCK_COUNT_FOR_FIELDS=$(jq 'if has("blocks") then .blocks | length else 0 end' "$JSONLD" 2>/dev/null || echo "0")
421
+ fi
422
+ if [ "$BLOCK_COUNT_FOR_FIELDS" -gt 0 ]; then
423
+ i=0
424
+ while [ "$i" -lt "$BLOCK_COUNT_FOR_FIELDS" ]; do
425
+ BLOCK_TYPE=$(jq -r ".blocks[$i].type" "$JSONLD" 2>/dev/null || echo "")
426
+ BLOCK_FIELDS=$(jq -r ".blocks[$i].fields[]?" "$JSONLD" 2>/dev/null | tr '\n' ' ')
427
+ REQUIRED=$(required_fields_for "$BLOCK_TYPE")
428
+ for field in $REQUIRED; do
429
+ # shellcheck disable=SC2086
430
+ if ! list_contains "$field" $BLOCK_FIELDS; then
431
+ FIELD_VIOLATIONS_JSON=$(printf '%s' "$FIELD_VIOLATIONS_JSON" | jq \
432
+ --arg schema "$BLOCK_TYPE" --arg field "$field" \
433
+ '. + [{kind: "missing_required_field", schema: $schema, field: $field, impact: -5}]')
434
+ FIELD_PENALTY=$((FIELD_PENALTY + 5))
435
+ fi
436
+ done
437
+ i=$((i + 1))
438
+ done
439
+ fi
440
+ [ $FIELD_PENALTY -gt 30 ] && FIELD_PENALTY=30
441
+
442
+ STRUCTURED=$((BASE + BONUS - FORBID_PENALTY - VALID_PENALTY - FIELD_PENALTY))
380
443
  [ $STRUCTURED -gt 100 ] && STRUCTURED=100
381
444
  [ $STRUCTURED -lt 0 ] && STRUCTURED=0
382
445
 
383
- CALCULATION=$(printf 'base: %d/%d expected present = %d; +%d optional bonus; -%d forbidden penalty; -%d validity penalty; clamp [0,100] = %d' \
446
+ CALCULATION=$(printf 'base: %d/%d expected present = %d; +%d optional bonus; -%d forbidden penalty; -%d validity penalty; -%d field penalty; clamp [0,100] = %d' \
384
447
  "$PRESENT_EXPECTED_COUNT" "$EXPECTED_COUNT" "$BASE" \
385
- "$BONUS" "$FORBID_PENALTY" "$VALID_PENALTY" "$STRUCTURED")
448
+ "$BONUS" "$FORBID_PENALTY" "$VALID_PENALTY" "$FIELD_PENALTY" "$STRUCTURED")
386
449
 
387
- if [ "$STRUCTURED" -ge 100 ] && [ -z "$PRESENT_FORBIDDEN" ] && [ "$VALID_PENALTY" -eq 0 ]; then
450
+ if [ "$STRUCTURED" -ge 100 ] && [ -z "$PRESENT_FORBIDDEN" ] && [ "$VALID_PENALTY" -eq 0 ] && [ "$FIELD_PENALTY" -eq 0 ]; then
388
451
  NOTES="All expected schemas for pageType=$PAGE_TYPE are present. No structured-data action needed."
389
452
  elif [ -n "$MISSING_EXPECTED" ] && [ -z "$PRESENT_FORBIDDEN" ]; then
390
453
  NOTES="Missing expected schemas for pageType=$PAGE_TYPE: $MISSING_EXPECTED. Add these to raise the score."
@@ -392,8 +455,12 @@ for bot_id in $BOTS; do
392
455
  NOTES="Forbidden schemas present for pageType=$PAGE_TYPE: $PRESENT_FORBIDDEN. Remove these (or re-classify the page type with --page-type)."
393
456
  elif [ -n "$PRESENT_FORBIDDEN" ] && [ -n "$MISSING_EXPECTED" ]; then
394
457
  NOTES="Mixed: missing $MISSING_EXPECTED and forbidden present $PRESENT_FORBIDDEN for pageType=$PAGE_TYPE."
395
- else
458
+ elif [ "$FIELD_PENALTY" -gt 0 ]; then
459
+ NOTES="Schemas for pageType=$PAGE_TYPE are present but missing required fields. See violations for details."
460
+ elif [ "$VALID_PENALTY" -gt 0 ]; then
396
461
  NOTES="Score reduced by $VALID_PENALTY pts due to invalid JSON-LD blocks."
462
+ else
463
+ NOTES="Structured data scored for pageType=$PAGE_TYPE."
397
464
  fi
398
465
 
399
466
  STRUCTURED_GRADE=$(grade_for "$STRUCTURED")
@@ -410,6 +477,7 @@ for bot_id in $BOTS; do
410
477
  --arg forbiddenPresent "$PRESENT_FORBIDDEN" \
411
478
  --argjson invalidCount "$JSONLD_INVALID" \
412
479
  --argjson validPenalty "$VALID_PENALTY" \
480
+ --argjson fieldViolations "$FIELD_VIOLATIONS_JSON" \
413
481
  --arg calculation "$CALCULATION" \
414
482
  --arg notes "$NOTES" \
415
483
  '
@@ -430,6 +498,7 @@ for bot_id in $BOTS; do
430
498
  then [{kind: "invalid_jsonld", count: $invalidCount, impact: (0 - $validPenalty)}]
431
499
  else []
432
500
  end)
501
+ + $fieldViolations
433
502
  ),
434
503
  calculation: $calculation,
435
504
  notes: $notes
@@ -566,6 +635,60 @@ CAT_STRUCTURED_GRADE=$(grade_for "$CAT_STRUCTURED_AVG")
566
635
  CAT_TECHNICAL_GRADE=$(grade_for "$CAT_TECHNICAL_AVG")
567
636
  CAT_AI_GRADE=$(grade_for "$CAT_AI_AVG")
568
637
 
638
+ # --- Cross-bot content parity (C4) ---
639
+ PARITY_MIN_WORDS=999999999
640
+ PARITY_MAX_WORDS=0
641
+ PARITY_BOT_COUNT=0
642
+ for bot_id in $BOTS; do
643
+ FETCH="$RESULTS_DIR/fetch-$bot_id.json"
644
+ P_FETCH_FAILED=$(jget_bool "$FETCH" '.fetchFailed')
645
+ [ "$P_FETCH_FAILED" = "true" ] && continue
646
+ WC=$(jget_num "$FETCH" '.wordCount')
647
+ [ "$WC" -lt "$PARITY_MIN_WORDS" ] && PARITY_MIN_WORDS=$WC
648
+ [ "$WC" -gt "$PARITY_MAX_WORDS" ] && PARITY_MAX_WORDS=$WC
649
+ PARITY_BOT_COUNT=$((PARITY_BOT_COUNT + 1))
650
+ done
651
+
652
+ if [ "$PARITY_BOT_COUNT" -le 1 ]; then
653
+ PARITY_SCORE=100
654
+ PARITY_MAX_DELTA=0
655
+ elif [ "$PARITY_MAX_WORDS" -gt 0 ]; then
656
+ PARITY_SCORE=$(awk -v min="$PARITY_MIN_WORDS" -v max="$PARITY_MAX_WORDS" \
657
+ 'BEGIN { printf "%d", (min / max) * 100 + 0.5 }')
658
+ PARITY_MAX_DELTA=$(awk -v min="$PARITY_MIN_WORDS" -v max="$PARITY_MAX_WORDS" \
659
+ 'BEGIN { printf "%d", ((max - min) / max) * 100 + 0.5 }')
660
+ else
661
+ PARITY_SCORE=100
662
+ PARITY_MAX_DELTA=0
663
+ fi
664
+
665
+ [ "$PARITY_SCORE" -gt 100 ] && PARITY_SCORE=100
666
+ PARITY_GRADE=$(grade_for "$PARITY_SCORE")
667
+
668
+ if [ "$PARITY_SCORE" -ge 95 ]; then
669
+ PARITY_INTERP="Content is consistent across all bots."
670
+ elif [ "$PARITY_SCORE" -ge 50 ]; then
671
+ PARITY_INTERP="Moderate content divergence between bots — likely partial client-side rendering hydration."
672
+ else
673
+ PARITY_INTERP="Severe content divergence — site likely relies on client-side rendering. AI bots see significantly less content than Googlebot."
674
+ fi
675
+
676
+ # --- Warnings (H2) ---
677
+ WARNINGS="[]"
678
+ if [ "$DIFF_AVAILABLE" != "true" ]; then
679
+ DIFF_REASON="not_found"
680
+ if [ -f "$DIFF_RENDER_FILE" ]; then
681
+ DIFF_REASON=$(jq -r '.reason // "skipped"' "$DIFF_RENDER_FILE" 2>/dev/null || echo "skipped")
682
+ fi
683
+ WARNINGS=$(printf '%s' "$WARNINGS" | jq --arg reason "$DIFF_REASON" \
684
+ '. + [{
685
+ code: "diff_render_unavailable",
686
+ severity: "high",
687
+ message: "JS rendering comparison was skipped. If this site uses CSR, non-JS bot scores may be inaccurate.",
688
+ reason: $reason
689
+ }]')
690
+ fi
691
+
569
692
  TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
570
693
 
571
694
  jq -n \
@@ -587,6 +710,13 @@ jq -n \
587
710
  --arg catTechnicalGrade "$CAT_TECHNICAL_GRADE" \
588
711
  --argjson catAi "$CAT_AI_AVG" \
589
712
  --arg catAiGrade "$CAT_AI_GRADE" \
713
+ --argjson warnings "$WARNINGS" \
714
+ --argjson parityScore "$PARITY_SCORE" \
715
+ --arg parityGrade "$PARITY_GRADE" \
716
+ --argjson parityMinWords "$PARITY_MIN_WORDS" \
717
+ --argjson parityMaxWords "$PARITY_MAX_WORDS" \
718
+ --argjson parityMaxDelta "$PARITY_MAX_DELTA" \
719
+ --arg parityInterp "$PARITY_INTERP" \
590
720
  '{
591
721
  url: $url,
592
722
  timestamp: $timestamp,
@@ -594,6 +724,15 @@ jq -n \
594
724
  pageType: $pageType,
595
725
  pageTypeOverridden: ($pageTypeOverride | length > 0),
596
726
  overall: { score: $overallScore, grade: $overallGrade },
727
+ parity: {
728
+ score: $parityScore,
729
+ grade: $parityGrade,
730
+ minWords: (if $parityMinWords >= 999999999 then 0 else $parityMinWords end),
731
+ maxWords: $parityMaxWords,
732
+ maxDeltaPct: $parityMaxDelta,
733
+ interpretation: $parityInterp
734
+ },
735
+ warnings: $warnings,
597
736
  bots: $bots,
598
737
  categories: {
599
738
  accessibility: { score: $catAcc, grade: $catAccGrade },
@@ -62,6 +62,7 @@ fi
62
62
 
63
63
  VALID_COUNT=0
64
64
  INVALID_COUNT=0
65
+ BLOCKS_JSON="[]"
65
66
 
66
67
  if [ "$BLOCK_COUNT" -gt 0 ]; then
67
68
  while IFS= read -r block; do
@@ -79,6 +80,15 @@ if [ "$BLOCK_COUNT" -gt 0 ]; then
79
80
  else empty end;
80
81
  collect_types
81
82
  ' 2>/dev/null >> "$TYPES_FILE" || true
83
+
84
+ # Extract per-block type + top-level field names for field validation (AC-B1)
85
+ BLOCK_INFO=$(printf '%s' "$block" | jq -c '
86
+ {
87
+ type: (if has("@type") then (.["@type"] | if type == "array" then .[0] else . end) else "unknown" end),
88
+ fields: (keys | map(select(startswith("@") | not)))
89
+ }
90
+ ' 2>/dev/null || echo '{"type":"unknown","fields":[]}')
91
+ BLOCKS_JSON=$(printf '%s' "$BLOCKS_JSON" | jq --argjson b "$BLOCK_INFO" '. + [$b]')
82
92
  else
83
93
  INVALID_COUNT=$((INVALID_COUNT + 1))
84
94
  fi
@@ -109,6 +119,7 @@ jq -n \
109
119
  --argjson valid "$VALID_COUNT" \
110
120
  --argjson invalid "$INVALID_COUNT" \
111
121
  --argjson types "$TYPES_JSON" \
122
+ --argjson blocks "$BLOCKS_JSON" \
112
123
  --argjson hasOrg "$HAS_ORG" \
113
124
  --argjson hasBreadcrumb "$HAS_BREADCRUMB" \
114
125
  --argjson hasWebsite "$HAS_WEBSITE" \
@@ -121,6 +132,7 @@ jq -n \
121
132
  validCount: $valid,
122
133
  invalidCount: $invalid,
123
134
  types: $types,
135
+ blocks: $blocks,
124
136
  flags: {
125
137
  hasOrganization: $hasOrg,
126
138
  hasBreadcrumbList: $hasBreadcrumb,
@@ -0,0 +1,151 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ # fetch-as-bot.sh — Fetch a URL as a specific bot User-Agent
5
+ # Usage: fetch-as-bot.sh <url> <profile.json>
6
+
7
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
8
+ # shellcheck source=_lib.sh
9
+ . "$SCRIPT_DIR/_lib.sh"
10
+
11
+ URL="${1:?Usage: fetch-as-bot.sh <url> <profile.json>}"
12
+ PROFILE="${2:?Usage: fetch-as-bot.sh <url> <profile.json>}"
13
+
14
+ BOT_ID=$(jq -r '.id' "$PROFILE")
15
+ BOT_NAME=$(jq -r '.name' "$PROFILE")
16
+ UA=$(jq -r '.userAgent' "$PROFILE")
17
+ RENDERS_JS=$(jq -r '.rendersJavaScript' "$PROFILE")
18
+
19
+ TMPDIR="${TMPDIR:-/tmp}"
20
+ HEADERS_FILE=$(mktemp "$TMPDIR/crawlsim-headers.XXXXXX")
21
+ BODY_FILE=$(mktemp "$TMPDIR/crawlsim-body.XXXXXX")
22
+ CURL_STDERR_FILE=$(mktemp "$TMPDIR/crawlsim-stderr.XXXXXX")
23
+ trap 'rm -f "$HEADERS_FILE" "$BODY_FILE" "$CURL_STDERR_FILE"' EXIT
24
+
25
+ printf '[%s] fetching %s\n' "$BOT_ID" "$URL" >&2
26
+
27
+ set +e
28
+ TIMING=$(curl -sS -L \
29
+ -H "User-Agent: $UA" \
30
+ -D "$HEADERS_FILE" \
31
+ -o "$BODY_FILE" \
32
+ -w '{"total":%{time_total},"ttfb":%{time_starttransfer},"connect":%{time_connect},"statusCode":%{http_code},"sizeDownload":%{size_download},"redirectCount":%{num_redirects},"finalUrl":"%{url_effective}"}' \
33
+ --max-time 30 \
34
+ "$URL" 2>"$CURL_STDERR_FILE")
35
+ CURL_EXIT=$?
36
+ set -e
37
+
38
+ CURL_ERR=""
39
+ if [ -s "$CURL_STDERR_FILE" ]; then
40
+ CURL_ERR=$(cat "$CURL_STDERR_FILE")
41
+ fi
42
+
43
+ if [ "$CURL_EXIT" -ne 0 ]; then
44
+ printf '[%s] FAILED: curl exit %d — %s\n' "$BOT_ID" "$CURL_EXIT" "$CURL_ERR" >&2
45
+ jq -n \
46
+ --arg url "$URL" \
47
+ --arg botId "$BOT_ID" \
48
+ --arg botName "$BOT_NAME" \
49
+ --arg ua "$UA" \
50
+ --arg rendersJs "$RENDERS_JS" \
51
+ --arg error "$CURL_ERR" \
52
+ --argjson exitCode "$CURL_EXIT" \
53
+ '{
54
+ url: $url,
55
+ bot: {
56
+ id: $botId,
57
+ name: $botName,
58
+ userAgent: $ua,
59
+ rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
60
+ },
61
+ fetchFailed: true,
62
+ error: $error,
63
+ curlExitCode: $exitCode,
64
+ status: 0,
65
+ timing: { total: 0, ttfb: 0 },
66
+ size: 0,
67
+ wordCount: 0,
68
+ headers: {},
69
+ bodyBase64: ""
70
+ }'
71
+ exit 0
72
+ fi
73
+
74
+ read -r STATUS TOTAL_TIME TTFB SIZE REDIRECT_COUNT FINAL_URL <<< \
75
+ "$(echo "$TIMING" | jq -r '[.statusCode, .total, .ttfb, .sizeDownload, .redirectCount, .finalUrl] | @tsv')"
76
+
77
+ # Parse response headers into a JSON object using jq for safe escaping.
78
+ # curl -L writes multiple blocks on redirect; jq keeps the last definition
79
+ # of each header since `add` overwrites left-to-right.
80
+ HEADERS_JSON=$(tr -d '\r' < "$HEADERS_FILE" \
81
+ | grep -E '^[A-Za-z][A-Za-z0-9-]*:[[:space:]]' \
82
+ | jq -Rs '
83
+ split("\n")
84
+ | map(select(length > 0))
85
+ | map(capture("^(?<k>[^:]+):[[:space:]]*(?<v>.*)$"))
86
+ | map({(.k): .v})
87
+ | add // {}
88
+ ')
89
+
90
+ # Parse redirect chain from headers dump.
91
+ # curl -D writes multiple HTTP response blocks on redirect — each starts with HTTP/.
92
+ REDIRECT_CHAIN="[]"
93
+ if [ "$REDIRECT_COUNT" -gt 0 ]; then
94
+ REDIRECT_CHAIN=$(tr -d '\r' < "$HEADERS_FILE" | awk '
95
+ /^HTTP\// { status=$2; url="" }
96
+ /^[Ll]ocation:/ { url=$2 }
97
+ /^$/ && status && url { printf "%s %s\n", status, url; status=""; url="" }
98
+ ' | jq -Rs '
99
+ split("\n") | map(select(length > 0)) |
100
+ to_entries | map({
101
+ hop: .key,
102
+ status: (.value | split(" ")[0] | tonumber),
103
+ location: (.value | split(" ")[1:] | join(" "))
104
+ })
105
+ ')
106
+ fi
107
+
108
+ WORD_COUNT=$(count_words "$BODY_FILE")
109
+ [ -z "$WORD_COUNT" ] && WORD_COUNT=0
110
+
111
+ BODY_B64=""
112
+ if [ -s "$BODY_FILE" ]; then
113
+ BODY_B64=$(base64 < "$BODY_FILE")
114
+ fi
115
+
116
+ printf '[%s] ok: status=%s size=%s words=%s time=%ss\n' "$BOT_ID" "$STATUS" "$SIZE" "$WORD_COUNT" "$TOTAL_TIME" >&2
117
+
118
+ jq -n \
119
+ --arg url "$URL" \
120
+ --arg botId "$BOT_ID" \
121
+ --arg botName "$BOT_NAME" \
122
+ --arg ua "$UA" \
123
+ --arg rendersJs "$RENDERS_JS" \
124
+ --argjson status "$STATUS" \
125
+ --argjson totalTime "$TOTAL_TIME" \
126
+ --argjson ttfb "$TTFB" \
127
+ --argjson size "$SIZE" \
128
+ --argjson wordCount "$WORD_COUNT" \
129
+ --argjson headers "$HEADERS_JSON" \
130
+ --argjson redirectCount "$REDIRECT_COUNT" \
131
+ --arg finalUrl "$FINAL_URL" \
132
+ --argjson redirectChain "$REDIRECT_CHAIN" \
133
+ --arg bodyBase64 "$BODY_B64" \
134
+ '{
135
+ url: $url,
136
+ bot: {
137
+ id: $botId,
138
+ name: $botName,
139
+ userAgent: $ua,
140
+ rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
141
+ },
142
+ status: $status,
143
+ timing: { total: $totalTime, ttfb: $ttfb },
144
+ size: $size,
145
+ wordCount: $wordCount,
146
+ redirectCount: $redirectCount,
147
+ finalUrl: $finalUrl,
148
+ redirectChain: $redirectChain,
149
+ headers: $headers,
150
+ bodyBase64: $bodyBase64
151
+ }'
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env bash
2
+ # schema-fields.sh — Required field definitions per schema.org type.
3
+ # Source this file, then call required_fields_for <SchemaType>.
4
+
5
+ required_fields_for() {
6
+ case "$1" in
7
+ Organization) echo "name url" ;;
8
+ WebSite) echo "name url" ;;
9
+ Article) echo "headline author datePublished" ;;
10
+ NewsArticle) echo "headline author datePublished" ;;
11
+ FAQPage) echo "mainEntity" ;;
12
+ BreadcrumbList) echo "itemListElement" ;;
13
+ CollectionPage) echo "name" ;;
14
+ ItemList) echo "itemListElement" ;;
15
+ AboutPage) echo "name" ;;
16
+ ContactPage) echo "name" ;;
17
+ Product) echo "name" ;;
18
+ LocalBusiness) echo "name address" ;;
19
+ ProfessionalService) echo "name" ;;
20
+ Person) echo "name" ;;
21
+ ImageObject) echo "contentUrl" ;;
22
+ PostalAddress) echo "streetAddress" ;;
23
+ *) echo "" ;;
24
+ esac
25
+ }
@@ -1,87 +0,0 @@
1
- #!/usr/bin/env bash
2
- set -euo pipefail
3
-
4
- # fetch-as-bot.sh — Fetch a URL as a specific bot User-Agent
5
- # Usage: fetch-as-bot.sh <url> <profile.json>
6
-
7
- SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
8
- # shellcheck source=_lib.sh
9
- . "$SCRIPT_DIR/_lib.sh"
10
-
11
- URL="${1:?Usage: fetch-as-bot.sh <url> <profile.json>}"
12
- PROFILE="${2:?Usage: fetch-as-bot.sh <url> <profile.json>}"
13
-
14
- BOT_ID=$(jq -r '.id' "$PROFILE")
15
- BOT_NAME=$(jq -r '.name' "$PROFILE")
16
- UA=$(jq -r '.userAgent' "$PROFILE")
17
- RENDERS_JS=$(jq -r '.rendersJavaScript' "$PROFILE")
18
-
19
- printf '[fetch-as-bot] %s <- %s\n' "$BOT_NAME" "$URL" >&2
20
-
21
- TMPDIR="${TMPDIR:-/tmp}"
22
- HEADERS_FILE=$(mktemp "$TMPDIR/crawlsim-headers.XXXXXX")
23
- BODY_FILE=$(mktemp "$TMPDIR/crawlsim-body.XXXXXX")
24
- trap 'rm -f "$HEADERS_FILE" "$BODY_FILE"' EXIT
25
-
26
- TIMING=$(curl -sS -L \
27
- -H "User-Agent: $UA" \
28
- -D "$HEADERS_FILE" \
29
- -o "$BODY_FILE" \
30
- -w '{"total":%{time_total},"ttfb":%{time_starttransfer},"connect":%{time_connect},"statusCode":%{http_code},"sizeDownload":%{size_download}}' \
31
- --max-time 30 \
32
- "$URL" 2>/dev/null || echo '{"total":0,"ttfb":0,"connect":0,"statusCode":0,"sizeDownload":0}')
33
-
34
- STATUS=$(echo "$TIMING" | jq -r '.statusCode')
35
- TOTAL_TIME=$(echo "$TIMING" | jq -r '.total')
36
- TTFB=$(echo "$TIMING" | jq -r '.ttfb')
37
- SIZE=$(echo "$TIMING" | jq -r '.sizeDownload')
38
-
39
- # Parse response headers into a JSON object using jq for safe escaping.
40
- # curl -L writes multiple blocks on redirect; jq keeps the last definition
41
- # of each header since `add` overwrites left-to-right.
42
- HEADERS_JSON=$(tr -d '\r' < "$HEADERS_FILE" \
43
- | grep -E '^[A-Za-z][A-Za-z0-9-]*:[[:space:]]' \
44
- | jq -Rs '
45
- split("\n")
46
- | map(select(length > 0))
47
- | map(capture("^(?<k>[^:]+):[[:space:]]*(?<v>.*)$"))
48
- | map({(.k): .v})
49
- | add // {}
50
- ')
51
-
52
- WORD_COUNT=$(count_words "$BODY_FILE")
53
- [ -z "$WORD_COUNT" ] && WORD_COUNT=0
54
-
55
- BODY_B64=""
56
- if [ -s "$BODY_FILE" ]; then
57
- BODY_B64=$(base64 < "$BODY_FILE")
58
- fi
59
-
60
- jq -n \
61
- --arg url "$URL" \
62
- --arg botId "$BOT_ID" \
63
- --arg botName "$BOT_NAME" \
64
- --arg ua "$UA" \
65
- --arg rendersJs "$RENDERS_JS" \
66
- --argjson status "$STATUS" \
67
- --argjson totalTime "$TOTAL_TIME" \
68
- --argjson ttfb "$TTFB" \
69
- --argjson size "$SIZE" \
70
- --argjson wordCount "$WORD_COUNT" \
71
- --argjson headers "$HEADERS_JSON" \
72
- --arg bodyBase64 "$BODY_B64" \
73
- '{
74
- url: $url,
75
- bot: {
76
- id: $botId,
77
- name: $botName,
78
- userAgent: $ua,
79
- rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
80
- },
81
- status: $status,
82
- timing: { total: $totalTime, ttfb: $ttfb },
83
- size: $size,
84
- wordCount: $wordCount,
85
- headers: $headers,
86
- bodyBase64: $bodyBase64
87
- }'
File without changes