@braedenbuilds/crawl-sim 1.1.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +15 -0
- package/.claude-plugin/plugin.json +13 -0
- package/README.md +19 -6
- package/bin/install.js +6 -2
- package/package.json +5 -3
- package/{SKILL.md → skills/crawl-sim/SKILL.md} +38 -5
- package/skills/crawl-sim/scripts/build-report.sh +45 -0
- package/{scripts → skills/crawl-sim/scripts}/check-llmstxt.sh +5 -0
- package/{scripts → skills/crawl-sim/scripts}/check-sitemap.sh +10 -1
- package/{scripts → skills/crawl-sim/scripts}/compute-score.sh +202 -41
- package/{scripts → skills/crawl-sim/scripts}/extract-jsonld.sh +12 -0
- package/{scripts → skills/crawl-sim/scripts}/extract-links.sh +5 -7
- package/skills/crawl-sim/scripts/fetch-as-bot.sh +151 -0
- package/skills/crawl-sim/scripts/schema-fields.sh +25 -0
- package/scripts/fetch-as-bot.sh +0 -87
- /package/{profiles → skills/crawl-sim/profiles}/chatgpt-user.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/claude-searchbot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/claude-user.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/claudebot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/googlebot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/gptbot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/oai-searchbot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/perplexity-user.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/perplexitybot.json +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/_lib.sh +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/check-robots.sh +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/diff-render.sh +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/extract-meta.sh +0 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "crawl-sim",
|
|
3
|
+
"owner": {
|
|
4
|
+
"name": "BraedenBDev",
|
|
5
|
+
"url": "https://github.com/BraedenBDev"
|
|
6
|
+
},
|
|
7
|
+
"plugins": [
|
|
8
|
+
{
|
|
9
|
+
"name": "crawl-sim",
|
|
10
|
+
"source": "./",
|
|
11
|
+
"description": "Multi-bot web crawler simulator — audit how Googlebot, GPTBot, ClaudeBot, and PerplexityBot see your site",
|
|
12
|
+
"version": "1.3.0"
|
|
13
|
+
}
|
|
14
|
+
]
|
|
15
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "crawl-sim",
|
|
3
|
+
"version": "1.3.0",
|
|
4
|
+
"description": "Multi-bot web crawler simulator — audit how Googlebot, GPTBot, ClaudeBot, and PerplexityBot see your site",
|
|
5
|
+
"author": {
|
|
6
|
+
"name": "BraedenBDev",
|
|
7
|
+
"url": "https://github.com/BraedenBDev"
|
|
8
|
+
},
|
|
9
|
+
"homepage": "https://github.com/BraedenBDev/crawl-sim#readme",
|
|
10
|
+
"repository": "https://github.com/BraedenBDev/crawl-sim",
|
|
11
|
+
"license": "MIT",
|
|
12
|
+
"keywords": ["seo", "crawler", "ai-visibility", "claude-code-skill", "googlebot", "gptbot", "claudebot", "perplexitybot"]
|
|
13
|
+
}
|
package/README.md
CHANGED
|
@@ -44,15 +44,20 @@ The concept was validated manually: a curl-as-GPTBot + Claude analysis caught a
|
|
|
44
44
|
|
|
45
45
|
## Quick start
|
|
46
46
|
|
|
47
|
-
###
|
|
47
|
+
### As a Claude Code plugin (recommended)
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
/plugin install BraedenBDev/crawl-sim@github
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Or add as a marketplace for easy updates:
|
|
48
54
|
|
|
49
|
-
```
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
crawl-sim install --project # → .claude/skills/crawl-sim/
|
|
55
|
+
```
|
|
56
|
+
/plugin marketplace add BraedenBDev/crawl-sim
|
|
57
|
+
/plugin install crawl-sim@crawl-sim
|
|
53
58
|
```
|
|
54
59
|
|
|
55
|
-
Then in Claude Code:
|
|
60
|
+
Then invoke:
|
|
56
61
|
|
|
57
62
|
```
|
|
58
63
|
/crawl-sim https://yoursite.com
|
|
@@ -60,6 +65,14 @@ Then in Claude Code:
|
|
|
60
65
|
|
|
61
66
|
Claude runs the full pipeline, interprets the results, and returns a score card plus prioritized findings.
|
|
62
67
|
|
|
68
|
+
### Via npm (alternative)
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
npm install -g @braedenbuilds/crawl-sim
|
|
72
|
+
crawl-sim install # → ~/.claude/skills/crawl-sim/
|
|
73
|
+
crawl-sim install --project # → .claude/skills/crawl-sim/
|
|
74
|
+
```
|
|
75
|
+
|
|
63
76
|
> **Why `npm install -g` instead of `npx`?** Recent versions of npx have a known issue linking bins for scoped single-bin packages in ephemeral installs. A persistent global install avoids the problem entirely. The git clone path below is the zero-npm fallback.
|
|
64
77
|
|
|
65
78
|
### As a standalone CLI
|
package/bin/install.js
CHANGED
|
@@ -14,6 +14,7 @@ const os = require('os');
|
|
|
14
14
|
const { execFileSync } = require('child_process');
|
|
15
15
|
|
|
16
16
|
const SOURCE_DIR = path.resolve(__dirname, '..');
|
|
17
|
+
const SKILL_ROOT = path.resolve(SOURCE_DIR, 'skills', 'crawl-sim');
|
|
17
18
|
const SKILL_FILES = ['SKILL.md'];
|
|
18
19
|
const SKILL_DIRS = ['profiles', 'scripts'];
|
|
19
20
|
|
|
@@ -80,7 +81,9 @@ function install(target) {
|
|
|
80
81
|
fs.mkdirSync(target, { recursive: true });
|
|
81
82
|
|
|
82
83
|
for (const file of SKILL_FILES) {
|
|
83
|
-
|
|
84
|
+
// Look in skills/crawl-sim/ first (canonical), fallback to root (symlink)
|
|
85
|
+
let src = path.join(SKILL_ROOT, file);
|
|
86
|
+
if (!fs.existsSync(src)) src = path.join(SOURCE_DIR, file);
|
|
84
87
|
const dest = path.join(target, file);
|
|
85
88
|
if (fs.existsSync(src)) {
|
|
86
89
|
fs.copyFileSync(src, dest);
|
|
@@ -92,7 +95,8 @@ function install(target) {
|
|
|
92
95
|
}
|
|
93
96
|
|
|
94
97
|
for (const dir of SKILL_DIRS) {
|
|
95
|
-
|
|
98
|
+
let src = path.join(SKILL_ROOT, dir);
|
|
99
|
+
if (!fs.existsSync(src)) src = path.join(SOURCE_DIR, dir);
|
|
96
100
|
const dest = path.join(target, dir);
|
|
97
101
|
if (fs.existsSync(src)) {
|
|
98
102
|
if (fs.existsSync(dest)) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@braedenbuilds/crawl-sim",
|
|
3
|
-
"version": "1.1.0",
|
|
3
|
+
"version": "1.3.0",
|
|
4
4
|
"description": "Agent-native multi-bot web crawler simulator. See your site through the eyes of Googlebot, GPTBot, ClaudeBot, and PerplexityBot.",
|
|
5
5
|
"bin": {
|
|
6
6
|
"crawl-sim": "bin/install.js"
|
|
@@ -40,9 +40,11 @@
|
|
|
40
40
|
},
|
|
41
41
|
"files": [
|
|
42
42
|
"bin/",
|
|
43
|
+
"skills/",
|
|
44
|
+
".claude-plugin/",
|
|
43
45
|
"SKILL.md",
|
|
44
|
-
"profiles
|
|
45
|
-
"scripts
|
|
46
|
+
"profiles",
|
|
47
|
+
"scripts",
|
|
46
48
|
"README.md",
|
|
47
49
|
"LICENSE"
|
|
48
50
|
]
|
|
@@ -40,7 +40,10 @@ command -v curl >/dev/null 2>&1 || { echo "ERROR: curl is required"; exit 1; }
|
|
|
40
40
|
command -v jq >/dev/null 2>&1 || { echo "ERROR: jq is required (brew install jq)"; exit 1; }
|
|
41
41
|
```
|
|
42
42
|
|
|
43
|
-
Locate the skill directory
|
|
43
|
+
Locate the skill directory. Check in this order:
|
|
44
|
+
1. `$CLAUDE_PLUGIN_ROOT/skills/crawl-sim` (plugin install)
|
|
45
|
+
2. `~/.claude/skills/crawl-sim/` (global npm install)
|
|
46
|
+
3. `.claude/skills/crawl-sim/` (project-level install)
|
|
44
47
|
|
|
45
48
|
## Orchestration — five narrated stages
|
|
46
49
|
|
|
@@ -48,14 +51,32 @@ Split the work into **five Bash invocations**, each with a clear `description` f
|
|
|
48
51
|
|
|
49
52
|
### Stage 1 — Fetch
|
|
50
53
|
|
|
51
|
-
Tell the user: "Fetching as Googlebot, GPTBot, ClaudeBot, and PerplexityBot..."
|
|
54
|
+
Tell the user: "Fetching as Googlebot, GPTBot, ClaudeBot, and PerplexityBot in parallel..."
|
|
52
55
|
|
|
53
56
|
```bash
|
|
54
|
-
|
|
57
|
+
# Resolve skill directory
|
|
58
|
+
if [ -n "${CLAUDE_PLUGIN_ROOT:-}" ] && [ -d "$CLAUDE_PLUGIN_ROOT/skills/crawl-sim" ]; then
|
|
59
|
+
SKILL_DIR="$CLAUDE_PLUGIN_ROOT/skills/crawl-sim"
|
|
60
|
+
elif [ -d "$HOME/.claude/skills/crawl-sim" ]; then
|
|
61
|
+
SKILL_DIR="$HOME/.claude/skills/crawl-sim"
|
|
62
|
+
elif [ -d ".claude/skills/crawl-sim" ]; then
|
|
63
|
+
SKILL_DIR=".claude/skills/crawl-sim"
|
|
64
|
+
else
|
|
65
|
+
echo "ERROR: cannot find crawl-sim skill directory" >&2; exit 1
|
|
66
|
+
fi
|
|
55
67
|
RUN_DIR=$(mktemp -d -t crawl-sim.XXXXXX)
|
|
56
68
|
URL="<user-provided-url>"
|
|
57
69
|
for bot in googlebot gptbot claudebot perplexitybot; do
|
|
58
|
-
"$SKILL_DIR/scripts/fetch-as-bot.sh" "$URL" "$SKILL_DIR/profiles/${bot}.json" > "$RUN_DIR/fetch-${bot}.json"
|
|
70
|
+
"$SKILL_DIR/scripts/fetch-as-bot.sh" "$URL" "$SKILL_DIR/profiles/${bot}.json" > "$RUN_DIR/fetch-${bot}.json" &
|
|
71
|
+
done
|
|
72
|
+
wait
|
|
73
|
+
|
|
74
|
+
# Verify no empty fetch files (guard against silent parallel failures)
|
|
75
|
+
for bot in googlebot gptbot claudebot perplexitybot; do
|
|
76
|
+
if [ ! -s "$RUN_DIR/fetch-${bot}.json" ]; then
|
|
77
|
+
echo "WARNING: fetch-${bot}.json is empty — retrying serially" >&2
|
|
78
|
+
"$SKILL_DIR/scripts/fetch-as-bot.sh" "$URL" "$SKILL_DIR/profiles/${bot}.json" > "$RUN_DIR/fetch-${bot}.json"
|
|
79
|
+
fi
|
|
59
80
|
done
|
|
60
81
|
```
|
|
61
82
|
|
|
@@ -112,7 +133,7 @@ Tell the user: "Computing per-bot scores and finalizing the report..."
|
|
|
112
133
|
|
|
113
134
|
```bash
|
|
114
135
|
"$SKILL_DIR/scripts/compute-score.sh" "$RUN_DIR" > "$RUN_DIR/score.json"
|
|
115
|
-
|
|
136
|
+
"$SKILL_DIR/scripts/build-report.sh" "$RUN_DIR" > ./crawl-sim-report.json
|
|
116
137
|
```
|
|
117
138
|
|
|
118
139
|
**Page-type awareness.** `compute-score.sh` derives a page type from the target URL (`root` / `detail` / `archive` / `faq` / `about` / `contact` / `generic`) and picks a schema rubric accordingly. Root pages are expected to ship `Organization` + `WebSite` — penalizing them for missing `BreadcrumbList` or `FAQPage` would be wrong, so the scorer doesn't. If the URL heuristic picks the wrong type (e.g., a homepage at `/en/` that URL-parses as generic), pass `--page-type <type>`:
|
|
@@ -150,6 +171,18 @@ Print a boxed score card to the terminal:
|
|
|
150
171
|
|
|
151
172
|
Progress bars are 20 chars wide using `█` and `░` (each char = 5%).
|
|
152
173
|
|
|
174
|
+
**Parity-aware display.** When `parity.score >= 95` AND all per-bot composite scores are within 5 points of each other, collapse the four bot rows into one:
|
|
175
|
+
|
|
176
|
+
```
|
|
177
|
+
║ All 4 bots 98 A ███████████████████░ (parity: content identical) ║
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
Only show individual bot rows when scores diverge — that's when per-bot detail adds information. Always show the parity line in the category breakdown:
|
|
181
|
+
|
|
182
|
+
```
|
|
183
|
+
║ Content Parity 100 A (all bots see the same content) ║
|
|
184
|
+
```
|
|
185
|
+
|
|
153
186
|
## Output Layer 2 — Narrative Audit
|
|
154
187
|
|
|
155
188
|
Lead with a **Bot differentiation summary** — state up front whether the bots scored the same or differently, and why. If they scored the same, explicitly say so:
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
set -eu
|
|
3
|
+
|
|
4
|
+
# build-report.sh — Consolidate all crawl-sim outputs into a single JSON report
|
|
5
|
+
# Usage: build-report.sh <results-dir>
|
|
6
|
+
# Output: JSON to stdout
|
|
7
|
+
|
|
8
|
+
RESULTS_DIR="${1:?Usage: build-report.sh <results-dir>}"
|
|
9
|
+
|
|
10
|
+
if [ ! -f "$RESULTS_DIR/score.json" ]; then
|
|
11
|
+
echo "Error: score.json not found in $RESULTS_DIR — run compute-score.sh first" >&2
|
|
12
|
+
exit 1
|
|
13
|
+
fi
|
|
14
|
+
|
|
15
|
+
SCORE=$(cat "$RESULTS_DIR/score.json")
|
|
16
|
+
|
|
17
|
+
# Collect per-bot raw data
|
|
18
|
+
PER_BOT="{}"
|
|
19
|
+
for f in "$RESULTS_DIR"/fetch-*.json; do
|
|
20
|
+
[ -f "$f" ] || continue
|
|
21
|
+
bot_id=$(basename "$f" .json | sed 's/^fetch-//')
|
|
22
|
+
|
|
23
|
+
BOT_RAW=$(jq -n \
|
|
24
|
+
--argjson fetch "$(jq '{status, timing, size, wordCount, redirectCount, finalUrl, redirectChain, fetchFailed, error}' "$f" 2>/dev/null || echo '{}')" \
|
|
25
|
+
--argjson meta "$(jq '.' "$RESULTS_DIR/meta-$bot_id.json" 2>/dev/null || echo '{}')" \
|
|
26
|
+
--argjson jsonld "$(jq '{blockCount, types, blocks}' "$RESULTS_DIR/jsonld-$bot_id.json" 2>/dev/null || echo '{}')" \
|
|
27
|
+
--argjson links "$(jq '.' "$RESULTS_DIR/links-$bot_id.json" 2>/dev/null || echo '{}')" \
|
|
28
|
+
--argjson robots "$(jq '.' "$RESULTS_DIR/robots-$bot_id.json" 2>/dev/null || echo '{}')" \
|
|
29
|
+
'{fetch: $fetch, meta: $meta, jsonld: $jsonld, links: $links, robots: $robots}')
|
|
30
|
+
|
|
31
|
+
PER_BOT=$(printf '%s' "$PER_BOT" | jq --argjson raw "$BOT_RAW" --arg id "$bot_id" '.[$id] = $raw')
|
|
32
|
+
done
|
|
33
|
+
|
|
34
|
+
# Collect independent (non-per-bot) data
|
|
35
|
+
INDEPENDENT=$(jq -n \
|
|
36
|
+
--argjson sitemap "$(jq '.' "$RESULTS_DIR/sitemap.json" 2>/dev/null || echo '{}')" \
|
|
37
|
+
--argjson llmstxt "$(jq '.' "$RESULTS_DIR/llmstxt.json" 2>/dev/null || echo '{}')" \
|
|
38
|
+
--argjson diffRender "$(jq '.' "$RESULTS_DIR/diff-render.json" 2>/dev/null || echo '{"skipped":true,"reason":"not_found"}')" \
|
|
39
|
+
'{sitemap: $sitemap, llmstxt: $llmstxt, diffRender: $diffRender}')
|
|
40
|
+
|
|
41
|
+
# Merge score + raw data
|
|
42
|
+
printf '%s' "$SCORE" | jq \
|
|
43
|
+
--argjson perBot "$PER_BOT" \
|
|
44
|
+
--argjson independent "$INDEPENDENT" \
|
|
45
|
+
'. + {raw: {perBot: $perBot, independent: $independent}}'
|
|
@@ -79,8 +79,12 @@ LLMS_FULL_HAS_TITLE=$HAS_TITLE
|
|
|
79
79
|
LLMS_FULL_HAS_DESC=$HAS_DESCRIPTION
|
|
80
80
|
LLMS_FULL_URLS=$URL_COUNT
|
|
81
81
|
|
|
82
|
+
TOP_EXISTS=false
|
|
83
|
+
[ "$LLMS_EXISTS" = "true" ] || [ "$LLMS_FULL_EXISTS" = "true" ] && TOP_EXISTS=true
|
|
84
|
+
|
|
82
85
|
jq -n \
|
|
83
86
|
--arg url "$URL" \
|
|
87
|
+
--argjson topExists "$TOP_EXISTS" \
|
|
84
88
|
--arg llmsUrl "${ORIGIN}/llms.txt" \
|
|
85
89
|
--arg llmsFullUrl "${ORIGIN}/llms-full.txt" \
|
|
86
90
|
--argjson llmsExists "$LLMS_EXISTS" \
|
|
@@ -96,6 +100,7 @@ jq -n \
|
|
|
96
100
|
--argjson llmsFullUrls "$LLMS_FULL_URLS" \
|
|
97
101
|
'{
|
|
98
102
|
url: $url,
|
|
103
|
+
exists: $topExists,
|
|
99
104
|
llmsTxt: {
|
|
100
105
|
url: $llmsUrl,
|
|
101
106
|
exists: $llmsExists,
|
|
@@ -25,6 +25,7 @@ CONTAINS_TARGET=false
|
|
|
25
25
|
HAS_LASTMOD=false
|
|
26
26
|
IS_INDEX=false
|
|
27
27
|
CHILD_SITEMAP_COUNT=0
|
|
28
|
+
SAMPLE_URLS="[]"
|
|
28
29
|
|
|
29
30
|
if [ "$HTTP_STATUS" = "200" ] && [ -s "$SITEMAP_FILE" ]; then
|
|
30
31
|
# Check if content looks like XML (not HTML fallback)
|
|
@@ -43,6 +44,12 @@ if [ "$HTTP_STATUS" = "200" ] && [ -s "$SITEMAP_FILE" ]; then
|
|
|
43
44
|
# Count <loc> tags (URLs, or child sitemaps in an index)
|
|
44
45
|
URL_COUNT=$(grep -oE '<loc>' "$SITEMAP_FILE" | wc -l | tr -d ' ')
|
|
45
46
|
|
|
47
|
+
# Extract first 10 <loc> URLs as sample
|
|
48
|
+
SAMPLE_URLS=$(grep -oE '<loc>[^<]+</loc>' "$SITEMAP_FILE" \
|
|
49
|
+
| sed -E 's/<\/?loc>//g' \
|
|
50
|
+
| head -10 \
|
|
51
|
+
| jq -R . | jq -s .)
|
|
52
|
+
|
|
46
53
|
# Check if target URL appears anywhere in the sitemap
|
|
47
54
|
# Match both with and without trailing slash
|
|
48
55
|
URL_NO_TRAILING=$(printf '%s' "$URL" | sed -E 's#/$##')
|
|
@@ -67,6 +74,7 @@ jq -n \
|
|
|
67
74
|
--argjson childSitemapCount "$CHILD_SITEMAP_COUNT" \
|
|
68
75
|
--argjson containsTarget "$CONTAINS_TARGET" \
|
|
69
76
|
--argjson hasLastmod "$HAS_LASTMOD" \
|
|
77
|
+
--argjson sampleUrls "$SAMPLE_URLS" \
|
|
70
78
|
'{
|
|
71
79
|
url: $url,
|
|
72
80
|
sitemapUrl: $sitemapUrl,
|
|
@@ -75,5 +83,6 @@ jq -n \
|
|
|
75
83
|
urlCount: $urlCount,
|
|
76
84
|
childSitemapCount: $childSitemapCount,
|
|
77
85
|
containsTarget: $containsTarget,
|
|
78
|
-
hasLastmod: $hasLastmod
|
|
86
|
+
hasLastmod: $hasLastmod,
|
|
87
|
+
sampleUrls: $sampleUrls
|
|
79
88
|
}'
|
|
@@ -21,6 +21,8 @@ set -eu
|
|
|
21
21
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
22
22
|
# shellcheck source=_lib.sh
|
|
23
23
|
. "$SCRIPT_DIR/_lib.sh"
|
|
24
|
+
# shellcheck source=schema-fields.sh
|
|
25
|
+
. "$SCRIPT_DIR/schema-fields.sh"
|
|
24
26
|
|
|
25
27
|
PAGE_TYPE_OVERRIDE=""
|
|
26
28
|
while [ $# -gt 0 ]; do
|
|
@@ -276,12 +278,50 @@ for bot_id in $BOTS; do
|
|
|
276
278
|
ROBOTS="$RESULTS_DIR/robots-$bot_id.json"
|
|
277
279
|
|
|
278
280
|
BOT_NAME=$(jget "$FETCH" '.bot.name' "$bot_id")
|
|
279
|
-
STATUS=$(jget_num "$FETCH" '.status')
|
|
280
|
-
TOTAL_TIME=$(jget_num "$FETCH" '.timing.total')
|
|
281
|
-
SERVER_WORD_COUNT=$(jget_num "$FETCH" '.wordCount')
|
|
282
|
-
RENDERS_JS=$(jq -r '.bot.rendersJavaScript | if . == null then "unknown" else tostring end' "$FETCH" 2>/dev/null || echo "unknown")
|
|
283
281
|
|
|
284
|
-
|
|
282
|
+
# Check for fetch failure — skip scoring, emit F grade (AC-A3)
|
|
283
|
+
FETCH_FAILED=$(jget_bool "$FETCH" '.fetchFailed')
|
|
284
|
+
if [ "$FETCH_FAILED" = "true" ]; then
|
|
285
|
+
FETCH_ERROR=$(jget "$FETCH" '.error' "unknown error")
|
|
286
|
+
RENDERS_JS=$(jq -r '.bot.rendersJavaScript | if . == null then "unknown" else tostring end' "$FETCH" 2>/dev/null || echo "unknown")
|
|
287
|
+
BOT_OBJ=$(jq -n \
|
|
288
|
+
--arg id "$bot_id" \
|
|
289
|
+
--arg name "$BOT_NAME" \
|
|
290
|
+
--arg rendersJs "$RENDERS_JS" \
|
|
291
|
+
--arg error "$FETCH_ERROR" \
|
|
292
|
+
'{
|
|
293
|
+
id: $id,
|
|
294
|
+
name: $name,
|
|
295
|
+
rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end),
|
|
296
|
+
fetchFailed: true,
|
|
297
|
+
error: $error,
|
|
298
|
+
score: 0,
|
|
299
|
+
grade: "F",
|
|
300
|
+
visibility: { serverWords: 0, effectiveWords: 0, missedWordsVsRendered: 0, hydrationPenaltyPts: 0 },
|
|
301
|
+
categories: {
|
|
302
|
+
accessibility: { score: 0, grade: "F" },
|
|
303
|
+
contentVisibility: { score: 0, grade: "F" },
|
|
304
|
+
structuredData: { score: 0, grade: "F", pageType: "unknown", expected: [], optional: [], forbidden: [], present: [], missing: [], extras: [], violations: [{ kind: "fetch_failed", impact: -100 }], calculation: "fetch failed — no data to score", notes: ("Fetch failed: " + $error) },
|
|
305
|
+
technicalSignals: { score: 0, grade: "F" },
|
|
306
|
+
aiReadiness: { score: 0, grade: "F" }
|
|
307
|
+
}
|
|
308
|
+
}')
|
|
309
|
+
BOTS_JSON=$(printf '%s' "$BOTS_JSON" | jq --argjson bot "$BOT_OBJ" --arg id "$bot_id" '.[$id] = $bot')
|
|
310
|
+
printf '[compute-score] %s: fetch failed, scoring as F\n' "$bot_id" >&2
|
|
311
|
+
CAT_N=$((CAT_N + 1))
|
|
312
|
+
continue
|
|
313
|
+
fi
|
|
314
|
+
|
|
315
|
+
# Batch-read fields from fetch file (1 jq call instead of 4)
|
|
316
|
+
read -r STATUS TOTAL_TIME SERVER_WORD_COUNT RENDERS_JS <<< \
|
|
317
|
+
"$(jq -r '[
|
|
318
|
+
(.status // 0),
|
|
319
|
+
(.timing.total // 0),
|
|
320
|
+
(.wordCount // 0),
|
|
321
|
+
(.bot.rendersJavaScript | if . == null then "unknown" else tostring end)
|
|
322
|
+
] | @tsv' "$FETCH" 2>/dev/null || echo "0 0 0 unknown")"
|
|
323
|
+
|
|
324
|
+
ROBOTS_ALLOWED=$(jq -r '.allowed // false | tostring' "$ROBOTS" 2>/dev/null || echo "false")
|
|
285
325
|
|
|
286
326
|
EFFECTIVE_WORD_COUNT=$SERVER_WORD_COUNT
|
|
287
327
|
HYDRATION_PENALTY=0
|
|
@@ -305,10 +345,15 @@ for bot_id in $BOTS; do
|
|
|
305
345
|
|
|
306
346
|
# --- Category 1: Accessibility (0-100) ---
|
|
307
347
|
ACC=0
|
|
308
|
-
[ "$ROBOTS_ALLOWED"
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
348
|
+
if [ "$ROBOTS_ALLOWED" != "true" ]; then
|
|
349
|
+
# R4 critical-fail: robots blocking overrides accessibility to 0/F
|
|
350
|
+
ACC=0
|
|
351
|
+
else
|
|
352
|
+
ACC=$((ACC + 40))
|
|
353
|
+
[ "$STATUS" = "200" ] && ACC=$((ACC + 40))
|
|
354
|
+
TIME_SCORE=$(awk -v t="$TOTAL_TIME" 'BEGIN { if (t < 2) print 20; else if (t < 5) print 10; else print 0 }')
|
|
355
|
+
ACC=$((ACC + TIME_SCORE))
|
|
356
|
+
fi
|
|
312
357
|
|
|
313
358
|
# --- Category 2: Content Visibility (0-100) ---
|
|
314
359
|
CONTENT=0
|
|
@@ -317,18 +362,23 @@ for bot_id in $BOTS; do
|
|
|
317
362
|
elif [ "$EFFECTIVE_WORD_COUNT" -ge 50 ]; then CONTENT=$((CONTENT + 10))
|
|
318
363
|
fi
|
|
319
364
|
|
|
320
|
-
|
|
321
|
-
H2_COUNT
|
|
365
|
+
# Batch-read fields from meta + links (1 jq call instead of 4 + 1)
|
|
366
|
+
read -r H1_COUNT H2_COUNT IMG_TOTAL IMG_WITH_ALT <<< \
|
|
367
|
+
"$(jq -r '[
|
|
368
|
+
(.headings.h1.count // 0),
|
|
369
|
+
(.headings.h2.count // 0),
|
|
370
|
+
(.images.total // 0),
|
|
371
|
+
(.images.withAlt // 0)
|
|
372
|
+
] | @tsv' "$META" 2>/dev/null || echo "0 0 0 0")"
|
|
373
|
+
|
|
374
|
+
INTERNAL_LINKS=$(jq -r 'if (.internal | type) == "number" then .internal else .counts.internal // 0 end' "$LINKS" 2>/dev/null || echo "0")
|
|
375
|
+
|
|
322
376
|
[ "$H1_COUNT" -ge 1 ] && CONTENT=$((CONTENT + 20))
|
|
323
377
|
[ "$H2_COUNT" -ge 1 ] && CONTENT=$((CONTENT + 15))
|
|
324
378
|
|
|
325
|
-
INTERNAL_LINKS=$(jget_num "$LINKS" '.counts.internal')
|
|
326
379
|
if [ "$INTERNAL_LINKS" -ge 5 ]; then CONTENT=$((CONTENT + 20))
|
|
327
380
|
elif [ "$INTERNAL_LINKS" -ge 1 ]; then CONTENT=$((CONTENT + 10))
|
|
328
381
|
fi
|
|
329
|
-
|
|
330
|
-
IMG_TOTAL=$(jget_num "$META" '.images.total')
|
|
331
|
-
IMG_WITH_ALT=$(jget_num "$META" '.images.withAlt')
|
|
332
382
|
if [ "$IMG_TOTAL" -eq 0 ]; then
|
|
333
383
|
CONTENT=$((CONTENT + 15))
|
|
334
384
|
else
|
|
@@ -376,15 +426,42 @@ for bot_id in $BOTS; do
|
|
|
376
426
|
[ $VALID_PENALTY -gt 20 ] && VALID_PENALTY=20
|
|
377
427
|
fi
|
|
378
428
|
|
|
379
|
-
|
|
429
|
+
# Field-level validation (C3): check required fields per schema type
|
|
430
|
+
FIELD_PENALTY=0
|
|
431
|
+
FIELD_VIOLATIONS_JSON="[]"
|
|
432
|
+
BLOCK_COUNT_FOR_FIELDS=0
|
|
433
|
+
if [ -f "$JSONLD" ]; then
|
|
434
|
+
BLOCK_COUNT_FOR_FIELDS=$(jq 'if has("blocks") then .blocks | length else 0 end' "$JSONLD" 2>/dev/null || echo "0")
|
|
435
|
+
fi
|
|
436
|
+
if [ "$BLOCK_COUNT_FOR_FIELDS" -gt 0 ]; then
|
|
437
|
+
i=0
|
|
438
|
+
while [ "$i" -lt "$BLOCK_COUNT_FOR_FIELDS" ]; do
|
|
439
|
+
BLOCK_TYPE=$(jq -r ".blocks[$i].type" "$JSONLD" 2>/dev/null || echo "")
|
|
440
|
+
BLOCK_FIELDS=$(jq -r ".blocks[$i].fields[]?" "$JSONLD" 2>/dev/null | tr '\n' ' ')
|
|
441
|
+
REQUIRED=$(required_fields_for "$BLOCK_TYPE")
|
|
442
|
+
for field in $REQUIRED; do
|
|
443
|
+
# shellcheck disable=SC2086
|
|
444
|
+
if ! list_contains "$field" $BLOCK_FIELDS; then
|
|
445
|
+
FIELD_VIOLATIONS_JSON=$(printf '%s' "$FIELD_VIOLATIONS_JSON" | jq \
|
|
446
|
+
--arg schema "$BLOCK_TYPE" --arg field "$field" \
|
|
447
|
+
'. + [{kind: "missing_required_field", schema: $schema, field: $field, impact: -5, confidence: "high"}]')
|
|
448
|
+
FIELD_PENALTY=$((FIELD_PENALTY + 5))
|
|
449
|
+
fi
|
|
450
|
+
done
|
|
451
|
+
i=$((i + 1))
|
|
452
|
+
done
|
|
453
|
+
fi
|
|
454
|
+
[ $FIELD_PENALTY -gt 30 ] && FIELD_PENALTY=30
|
|
455
|
+
|
|
456
|
+
STRUCTURED=$((BASE + BONUS - FORBID_PENALTY - VALID_PENALTY - FIELD_PENALTY))
|
|
380
457
|
[ $STRUCTURED -gt 100 ] && STRUCTURED=100
|
|
381
458
|
[ $STRUCTURED -lt 0 ] && STRUCTURED=0
|
|
382
459
|
|
|
383
|
-
CALCULATION=$(printf 'base: %d/%d expected present = %d; +%d optional bonus; -%d forbidden penalty; -%d validity penalty; clamp [0,100] = %d' \
|
|
460
|
+
CALCULATION=$(printf 'base: %d/%d expected present = %d; +%d optional bonus; -%d forbidden penalty; -%d validity penalty; -%d field penalty; clamp [0,100] = %d' \
|
|
384
461
|
"$PRESENT_EXPECTED_COUNT" "$EXPECTED_COUNT" "$BASE" \
|
|
385
|
-
"$BONUS" "$FORBID_PENALTY" "$VALID_PENALTY" "$STRUCTURED")
|
|
462
|
+
"$BONUS" "$FORBID_PENALTY" "$VALID_PENALTY" "$FIELD_PENALTY" "$STRUCTURED")
|
|
386
463
|
|
|
387
|
-
if [ "$STRUCTURED" -ge 100 ] && [ -z "$PRESENT_FORBIDDEN" ] && [ "$VALID_PENALTY" -eq 0 ]; then
|
|
464
|
+
if [ "$STRUCTURED" -ge 100 ] && [ -z "$PRESENT_FORBIDDEN" ] && [ "$VALID_PENALTY" -eq 0 ] && [ "$FIELD_PENALTY" -eq 0 ]; then
|
|
388
465
|
NOTES="All expected schemas for pageType=$PAGE_TYPE are present. No structured-data action needed."
|
|
389
466
|
elif [ -n "$MISSING_EXPECTED" ] && [ -z "$PRESENT_FORBIDDEN" ]; then
|
|
390
467
|
NOTES="Missing expected schemas for pageType=$PAGE_TYPE: $MISSING_EXPECTED. Add these to raise the score."
|
|
@@ -392,8 +469,12 @@ for bot_id in $BOTS; do
|
|
|
392
469
|
NOTES="Forbidden schemas present for pageType=$PAGE_TYPE: $PRESENT_FORBIDDEN. Remove these (or re-classify the page type with --page-type)."
|
|
393
470
|
elif [ -n "$PRESENT_FORBIDDEN" ] && [ -n "$MISSING_EXPECTED" ]; then
|
|
394
471
|
NOTES="Mixed: missing $MISSING_EXPECTED and forbidden present $PRESENT_FORBIDDEN for pageType=$PAGE_TYPE."
|
|
395
|
-
|
|
472
|
+
elif [ "$FIELD_PENALTY" -gt 0 ]; then
|
|
473
|
+
NOTES="Schemas for pageType=$PAGE_TYPE are present but missing required fields. See violations for details."
|
|
474
|
+
elif [ "$VALID_PENALTY" -gt 0 ]; then
|
|
396
475
|
NOTES="Score reduced by $VALID_PENALTY pts due to invalid JSON-LD blocks."
|
|
476
|
+
else
|
|
477
|
+
NOTES="Structured data scored for pageType=$PAGE_TYPE."
|
|
397
478
|
fi
|
|
398
479
|
|
|
399
480
|
STRUCTURED_GRADE=$(grade_for "$STRUCTURED")
|
|
@@ -410,6 +491,7 @@ for bot_id in $BOTS; do
|
|
|
410
491
|
--arg forbiddenPresent "$PRESENT_FORBIDDEN" \
|
|
411
492
|
--argjson invalidCount "$JSONLD_INVALID" \
|
|
412
493
|
--argjson validPenalty "$VALID_PENALTY" \
|
|
494
|
+
--argjson fieldViolations "$FIELD_VIOLATIONS_JSON" \
|
|
413
495
|
--arg calculation "$CALCULATION" \
|
|
414
496
|
--arg notes "$NOTES" \
|
|
415
497
|
'
|
|
@@ -425,11 +507,12 @@ for bot_id in $BOTS; do
|
|
|
425
507
|
missing: ($missingList | to_arr),
|
|
426
508
|
extras: ($extrasList | to_arr),
|
|
427
509
|
violations: (
|
|
428
|
-
($forbiddenPresent | to_arr | map({kind: "forbidden_schema", schema: ., impact: -10}))
|
|
510
|
+
($forbiddenPresent | to_arr | map({kind: "forbidden_schema", schema: ., impact: -10, confidence: "high"}))
|
|
429
511
|
+ (if $validPenalty > 0
|
|
430
|
-
then [{kind: "invalid_jsonld", count: $invalidCount, impact: (0 - $validPenalty)}]
|
|
512
|
+
then [{kind: "invalid_jsonld", count: $invalidCount, impact: (0 - $validPenalty), confidence: "high"}]
|
|
431
513
|
else []
|
|
432
514
|
end)
|
|
515
|
+
+ $fieldViolations
|
|
433
516
|
),
|
|
434
517
|
calculation: $calculation,
|
|
435
518
|
notes: $notes
|
|
@@ -438,20 +521,24 @@ for bot_id in $BOTS; do
|
|
|
438
521
|
|
|
439
522
|
# --- Category 4: Technical Signals (0-100) ---
|
|
440
523
|
TECHNICAL=0
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
524
|
+
# Batch-read meta fields for technical scoring (1 jq call instead of 5)
|
|
525
|
+
IFS=$'\t' read -r TITLE DESCRIPTION CANONICAL OG_TITLE OG_DESC <<< \
|
|
526
|
+
"$(jq -r '[
|
|
527
|
+
(.title // "" | gsub("\t"; " ")),
|
|
528
|
+
(.description // "" | gsub("\t"; " ")),
|
|
529
|
+
(.canonical // "" | gsub("\t"; " ")),
|
|
530
|
+
(.og.title // "" | gsub("\t"; " ")),
|
|
531
|
+
(.og.description // "" | gsub("\t"; " "))
|
|
532
|
+
] | @tsv' "$META" 2>/dev/null || printf '\t\t\t\t')"
|
|
533
|
+
|
|
534
|
+
[ -n "$TITLE" ] && TECHNICAL=$((TECHNICAL + 25))
|
|
535
|
+
[ -n "$DESCRIPTION" ] && TECHNICAL=$((TECHNICAL + 25))
|
|
536
|
+
[ -n "$CANONICAL" ] && TECHNICAL=$((TECHNICAL + 20))
|
|
537
|
+
[ -n "$OG_TITLE" ] && TECHNICAL=$((TECHNICAL + 8))
|
|
538
|
+
[ -n "$OG_DESC" ] && TECHNICAL=$((TECHNICAL + 7))
|
|
539
|
+
|
|
540
|
+
SITEMAP_EXISTS=$(jq -r '.exists // false | tostring' "$SITEMAP_FILE" 2>/dev/null || echo "false")
|
|
541
|
+
SITEMAP_CONTAINS=$(jq -r '.containsTarget // false | tostring' "$SITEMAP_FILE" 2>/dev/null || echo "false")
|
|
455
542
|
if [ "$SITEMAP_EXISTS" = "true" ] && [ "$SITEMAP_CONTAINS" = "true" ]; then
|
|
456
543
|
TECHNICAL=$((TECHNICAL + 15))
|
|
457
544
|
elif [ "$SITEMAP_EXISTS" = "true" ]; then
|
|
@@ -460,10 +547,14 @@ for bot_id in $BOTS; do
|
|
|
460
547
|
|
|
461
548
|
# --- Category 5: AI Readiness (0-100) ---
|
|
462
549
|
AI=0
|
|
463
|
-
|
|
464
|
-
LLMS_HAS_TITLE
|
|
465
|
-
|
|
466
|
-
|
|
550
|
+
# Batch-read llmstxt fields (1 jq call instead of 4)
|
|
551
|
+
read -r LLMS_EXISTS LLMS_HAS_TITLE LLMS_HAS_DESC LLMS_URLS <<< \
|
|
552
|
+
"$(jq -r '[
|
|
553
|
+
(.llmsTxt.exists // false | tostring),
|
|
554
|
+
(.llmsTxt.hasTitle // false | tostring),
|
|
555
|
+
(.llmsTxt.hasDescription // false | tostring),
|
|
556
|
+
(.llmsTxt.urlCount // 0)
|
|
557
|
+
] | @tsv' "$LLMSTXT_FILE" 2>/dev/null || echo "false false false 0")"
|
|
467
558
|
|
|
468
559
|
if [ "$LLMS_EXISTS" = "true" ]; then
|
|
469
560
|
AI=$((AI + 40))
|
|
@@ -472,7 +563,7 @@ for bot_id in $BOTS; do
|
|
|
472
563
|
[ "$LLMS_URLS" -ge 1 ] && AI=$((AI + 6))
|
|
473
564
|
fi
|
|
474
565
|
[ "$EFFECTIVE_WORD_COUNT" -ge 200 ] && AI=$((AI + 20))
|
|
475
|
-
if [ "$H1_COUNT" -ge 1 ] && [ -n "$DESCRIPTION" ]
|
|
566
|
+
if [ "$H1_COUNT" -ge 1 ] && [ -n "$DESCRIPTION" ]; then
|
|
476
567
|
AI=$((AI + 20))
|
|
477
568
|
fi
|
|
478
569
|
|
|
@@ -566,6 +657,60 @@ CAT_STRUCTURED_GRADE=$(grade_for "$CAT_STRUCTURED_AVG")
|
|
|
566
657
|
CAT_TECHNICAL_GRADE=$(grade_for "$CAT_TECHNICAL_AVG")
|
|
567
658
|
CAT_AI_GRADE=$(grade_for "$CAT_AI_AVG")
|
|
568
659
|
|
|
660
|
+
# --- Cross-bot content parity (C4) ---
|
|
661
|
+
PARITY_MIN_WORDS=999999999
|
|
662
|
+
PARITY_MAX_WORDS=0
|
|
663
|
+
PARITY_BOT_COUNT=0
|
|
664
|
+
for bot_id in $BOTS; do
|
|
665
|
+
FETCH="$RESULTS_DIR/fetch-$bot_id.json"
|
|
666
|
+
P_FETCH_FAILED=$(jget_bool "$FETCH" '.fetchFailed')
|
|
667
|
+
[ "$P_FETCH_FAILED" = "true" ] && continue
|
|
668
|
+
WC=$(jget_num "$FETCH" '.wordCount')
|
|
669
|
+
[ "$WC" -lt "$PARITY_MIN_WORDS" ] && PARITY_MIN_WORDS=$WC
|
|
670
|
+
[ "$WC" -gt "$PARITY_MAX_WORDS" ] && PARITY_MAX_WORDS=$WC
|
|
671
|
+
PARITY_BOT_COUNT=$((PARITY_BOT_COUNT + 1))
|
|
672
|
+
done
|
|
673
|
+
|
|
674
|
+
if [ "$PARITY_BOT_COUNT" -le 1 ]; then
|
|
675
|
+
PARITY_SCORE=100
|
|
676
|
+
PARITY_MAX_DELTA=0
|
|
677
|
+
elif [ "$PARITY_MAX_WORDS" -gt 0 ]; then
|
|
678
|
+
PARITY_SCORE=$(awk -v min="$PARITY_MIN_WORDS" -v max="$PARITY_MAX_WORDS" \
|
|
679
|
+
'BEGIN { printf "%d", (min / max) * 100 + 0.5 }')
|
|
680
|
+
PARITY_MAX_DELTA=$(awk -v min="$PARITY_MIN_WORDS" -v max="$PARITY_MAX_WORDS" \
|
|
681
|
+
'BEGIN { printf "%d", ((max - min) / max) * 100 + 0.5 }')
|
|
682
|
+
else
|
|
683
|
+
PARITY_SCORE=100
|
|
684
|
+
PARITY_MAX_DELTA=0
|
|
685
|
+
fi
|
|
686
|
+
|
|
687
|
+
[ "$PARITY_SCORE" -gt 100 ] && PARITY_SCORE=100
|
|
688
|
+
PARITY_GRADE=$(grade_for "$PARITY_SCORE")
|
|
689
|
+
|
|
690
|
+
if [ "$PARITY_SCORE" -ge 95 ]; then
|
|
691
|
+
PARITY_INTERP="Content is consistent across all bots."
|
|
692
|
+
elif [ "$PARITY_SCORE" -ge 50 ]; then
|
|
693
|
+
PARITY_INTERP="Moderate content divergence between bots — likely partial client-side rendering hydration."
|
|
694
|
+
else
|
|
695
|
+
PARITY_INTERP="Severe content divergence — site likely relies on client-side rendering. AI bots see significantly less content than Googlebot."
|
|
696
|
+
fi
|
|
697
|
+
|
|
698
|
+
# --- Warnings (H2) ---
|
|
699
|
+
WARNINGS="[]"
|
|
700
|
+
if [ "$DIFF_AVAILABLE" != "true" ]; then
|
|
701
|
+
DIFF_REASON="not_found"
|
|
702
|
+
if [ -f "$DIFF_RENDER_FILE" ]; then
|
|
703
|
+
DIFF_REASON=$(jq -r '.reason // "skipped"' "$DIFF_RENDER_FILE" 2>/dev/null || echo "skipped")
|
|
704
|
+
fi
|
|
705
|
+
WARNINGS=$(printf '%s' "$WARNINGS" | jq --arg reason "$DIFF_REASON" \
|
|
706
|
+
'. + [{
|
|
707
|
+
code: "diff_render_unavailable",
|
|
708
|
+
severity: "high",
|
|
709
|
+
message: "JS rendering comparison was skipped. If this site uses CSR, non-JS bot scores may be inaccurate.",
|
|
710
|
+
reason: $reason
|
|
711
|
+
}]')
|
|
712
|
+
fi
|
|
713
|
+
|
|
569
714
|
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
|
|
570
715
|
|
|
571
716
|
jq -n \
|
|
@@ -587,6 +732,13 @@ jq -n \
|
|
|
587
732
|
--arg catTechnicalGrade "$CAT_TECHNICAL_GRADE" \
|
|
588
733
|
--argjson catAi "$CAT_AI_AVG" \
|
|
589
734
|
--arg catAiGrade "$CAT_AI_GRADE" \
|
|
735
|
+
--argjson warnings "$WARNINGS" \
|
|
736
|
+
--argjson parityScore "$PARITY_SCORE" \
|
|
737
|
+
--arg parityGrade "$PARITY_GRADE" \
|
|
738
|
+
--argjson parityMinWords "$PARITY_MIN_WORDS" \
|
|
739
|
+
--argjson parityMaxWords "$PARITY_MAX_WORDS" \
|
|
740
|
+
--argjson parityMaxDelta "$PARITY_MAX_DELTA" \
|
|
741
|
+
--arg parityInterp "$PARITY_INTERP" \
|
|
590
742
|
'{
|
|
591
743
|
url: $url,
|
|
592
744
|
timestamp: $timestamp,
|
|
@@ -594,6 +746,15 @@ jq -n \
|
|
|
594
746
|
pageType: $pageType,
|
|
595
747
|
pageTypeOverridden: ($pageTypeOverride | length > 0),
|
|
596
748
|
overall: { score: $overallScore, grade: $overallGrade },
|
|
749
|
+
parity: {
|
|
750
|
+
score: $parityScore,
|
|
751
|
+
grade: $parityGrade,
|
|
752
|
+
minWords: (if $parityMinWords >= 999999999 then 0 else $parityMinWords end),
|
|
753
|
+
maxWords: $parityMaxWords,
|
|
754
|
+
maxDeltaPct: $parityMaxDelta,
|
|
755
|
+
interpretation: $parityInterp
|
|
756
|
+
},
|
|
757
|
+
warnings: $warnings,
|
|
597
758
|
bots: $bots,
|
|
598
759
|
categories: {
|
|
599
760
|
accessibility: { score: $catAcc, grade: $catAccGrade },
|
|
@@ -62,6 +62,7 @@ fi
|
|
|
62
62
|
|
|
63
63
|
VALID_COUNT=0
|
|
64
64
|
INVALID_COUNT=0
|
|
65
|
+
BLOCKS_JSON="[]"
|
|
65
66
|
|
|
66
67
|
if [ "$BLOCK_COUNT" -gt 0 ]; then
|
|
67
68
|
while IFS= read -r block; do
|
|
@@ -79,6 +80,15 @@ if [ "$BLOCK_COUNT" -gt 0 ]; then
|
|
|
79
80
|
else empty end;
|
|
80
81
|
collect_types
|
|
81
82
|
' 2>/dev/null >> "$TYPES_FILE" || true
|
|
83
|
+
|
|
84
|
+
# Extract per-block type + top-level field names for field validation (AC-B1)
|
|
85
|
+
BLOCK_INFO=$(printf '%s' "$block" | jq -c '
|
|
86
|
+
{
|
|
87
|
+
type: (if has("@type") then (.["@type"] | if type == "array" then .[0] else . end) else "unknown" end),
|
|
88
|
+
fields: (keys | map(select(startswith("@") | not)))
|
|
89
|
+
}
|
|
90
|
+
' 2>/dev/null || echo '{"type":"unknown","fields":[]}')
|
|
91
|
+
BLOCKS_JSON=$(printf '%s' "$BLOCKS_JSON" | jq --argjson b "$BLOCK_INFO" '. + [$b]')
|
|
82
92
|
else
|
|
83
93
|
INVALID_COUNT=$((INVALID_COUNT + 1))
|
|
84
94
|
fi
|
|
@@ -109,6 +119,7 @@ jq -n \
|
|
|
109
119
|
--argjson valid "$VALID_COUNT" \
|
|
110
120
|
--argjson invalid "$INVALID_COUNT" \
|
|
111
121
|
--argjson types "$TYPES_JSON" \
|
|
122
|
+
--argjson blocks "$BLOCKS_JSON" \
|
|
112
123
|
--argjson hasOrg "$HAS_ORG" \
|
|
113
124
|
--argjson hasBreadcrumb "$HAS_BREADCRUMB" \
|
|
114
125
|
--argjson hasWebsite "$HAS_WEBSITE" \
|
|
@@ -121,6 +132,7 @@ jq -n \
|
|
|
121
132
|
validCount: $valid,
|
|
122
133
|
invalidCount: $invalid,
|
|
123
134
|
types: $types,
|
|
135
|
+
blocks: $blocks,
|
|
124
136
|
flags: {
|
|
125
137
|
hasOrganization: $hasOrg,
|
|
126
138
|
hasBreadcrumbList: $hasBreadcrumb,
|
|
@@ -93,11 +93,9 @@ jq -n \
|
|
|
93
93
|
--argjson internalSample "$INTERNAL_SAMPLE" \
|
|
94
94
|
--argjson externalSample "$EXTERNAL_SAMPLE" \
|
|
95
95
|
'{
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
internal: $internalSample,
|
|
102
|
-
external: $externalSample
|
|
96
|
+
total: ($internalCount + $externalCount),
|
|
97
|
+
internal: $internalCount,
|
|
98
|
+
external: $externalCount,
|
|
99
|
+
internalUrls: $internalSample,
|
|
100
|
+
externalUrls: $externalSample
|
|
103
101
|
}'
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
set -euo pipefail
|
|
3
|
+
|
|
4
|
+
# fetch-as-bot.sh — Fetch a URL as a specific bot User-Agent
|
|
5
|
+
# Usage: fetch-as-bot.sh <url> <profile.json>
|
|
6
|
+
|
|
7
|
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
8
|
+
# shellcheck source=_lib.sh
|
|
9
|
+
. "$SCRIPT_DIR/_lib.sh"
|
|
10
|
+
|
|
11
|
+
URL="${1:?Usage: fetch-as-bot.sh <url> <profile.json>}"
|
|
12
|
+
PROFILE="${2:?Usage: fetch-as-bot.sh <url> <profile.json>}"
|
|
13
|
+
|
|
14
|
+
BOT_ID=$(jq -r '.id' "$PROFILE")
|
|
15
|
+
BOT_NAME=$(jq -r '.name' "$PROFILE")
|
|
16
|
+
UA=$(jq -r '.userAgent' "$PROFILE")
|
|
17
|
+
RENDERS_JS=$(jq -r '.rendersJavaScript' "$PROFILE")
|
|
18
|
+
|
|
19
|
+
TMPDIR="${TMPDIR:-/tmp}"
|
|
20
|
+
HEADERS_FILE=$(mktemp "$TMPDIR/crawlsim-headers.XXXXXX")
|
|
21
|
+
BODY_FILE=$(mktemp "$TMPDIR/crawlsim-body.XXXXXX")
|
|
22
|
+
CURL_STDERR_FILE=$(mktemp "$TMPDIR/crawlsim-stderr.XXXXXX")
|
|
23
|
+
trap 'rm -f "$HEADERS_FILE" "$BODY_FILE" "$CURL_STDERR_FILE"' EXIT
|
|
24
|
+
|
|
25
|
+
printf '[%s] fetching %s\n' "$BOT_ID" "$URL" >&2
|
|
26
|
+
|
|
27
|
+
set +e
|
|
28
|
+
TIMING=$(curl -sS -L \
|
|
29
|
+
-H "User-Agent: $UA" \
|
|
30
|
+
-D "$HEADERS_FILE" \
|
|
31
|
+
-o "$BODY_FILE" \
|
|
32
|
+
-w '{"total":%{time_total},"ttfb":%{time_starttransfer},"connect":%{time_connect},"statusCode":%{http_code},"sizeDownload":%{size_download},"redirectCount":%{num_redirects},"finalUrl":"%{url_effective}"}' \
|
|
33
|
+
--max-time 30 \
|
|
34
|
+
"$URL" 2>"$CURL_STDERR_FILE")
|
|
35
|
+
CURL_EXIT=$?
|
|
36
|
+
set -e
|
|
37
|
+
|
|
38
|
+
CURL_ERR=""
|
|
39
|
+
if [ -s "$CURL_STDERR_FILE" ]; then
|
|
40
|
+
CURL_ERR=$(cat "$CURL_STDERR_FILE")
|
|
41
|
+
fi
|
|
42
|
+
|
|
43
|
+
if [ "$CURL_EXIT" -ne 0 ]; then
|
|
44
|
+
printf '[%s] FAILED: curl exit %d — %s\n' "$BOT_ID" "$CURL_EXIT" "$CURL_ERR" >&2
|
|
45
|
+
jq -n \
|
|
46
|
+
--arg url "$URL" \
|
|
47
|
+
--arg botId "$BOT_ID" \
|
|
48
|
+
--arg botName "$BOT_NAME" \
|
|
49
|
+
--arg ua "$UA" \
|
|
50
|
+
--arg rendersJs "$RENDERS_JS" \
|
|
51
|
+
--arg error "$CURL_ERR" \
|
|
52
|
+
--argjson exitCode "$CURL_EXIT" \
|
|
53
|
+
'{
|
|
54
|
+
url: $url,
|
|
55
|
+
bot: {
|
|
56
|
+
id: $botId,
|
|
57
|
+
name: $botName,
|
|
58
|
+
userAgent: $ua,
|
|
59
|
+
rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
|
|
60
|
+
},
|
|
61
|
+
fetchFailed: true,
|
|
62
|
+
error: $error,
|
|
63
|
+
curlExitCode: $exitCode,
|
|
64
|
+
status: 0,
|
|
65
|
+
timing: { total: 0, ttfb: 0 },
|
|
66
|
+
size: 0,
|
|
67
|
+
wordCount: 0,
|
|
68
|
+
headers: {},
|
|
69
|
+
bodyBase64: ""
|
|
70
|
+
}'
|
|
71
|
+
exit 0
|
|
72
|
+
fi
|
|
73
|
+
|
|
74
|
+
read -r STATUS TOTAL_TIME TTFB SIZE REDIRECT_COUNT FINAL_URL <<< \
|
|
75
|
+
"$(echo "$TIMING" | jq -r '[.statusCode, .total, .ttfb, .sizeDownload, .redirectCount, .finalUrl] | @tsv')"
|
|
76
|
+
|
|
77
|
+
# Parse response headers into a JSON object using jq for safe escaping.
|
|
78
|
+
# curl -L writes multiple blocks on redirect; jq keeps the last definition
|
|
79
|
+
# of each header since `add` overwrites left-to-right.
|
|
80
|
+
HEADERS_JSON=$(tr -d '\r' < "$HEADERS_FILE" \
|
|
81
|
+
| grep -E '^[A-Za-z][A-Za-z0-9-]*:[[:space:]]' \
|
|
82
|
+
| jq -Rs '
|
|
83
|
+
split("\n")
|
|
84
|
+
| map(select(length > 0))
|
|
85
|
+
| map(capture("^(?<k>[^:]+):[[:space:]]*(?<v>.*)$"))
|
|
86
|
+
| map({(.k): .v})
|
|
87
|
+
| add // {}
|
|
88
|
+
')
|
|
89
|
+
|
|
90
|
+
# Parse redirect chain from headers dump.
|
|
91
|
+
# curl -D writes multiple HTTP response blocks on redirect — each starts with HTTP/.
|
|
92
|
+
REDIRECT_CHAIN="[]"
|
|
93
|
+
if [ "$REDIRECT_COUNT" -gt 0 ]; then
|
|
94
|
+
REDIRECT_CHAIN=$(tr -d '\r' < "$HEADERS_FILE" | awk '
|
|
95
|
+
/^HTTP\// { status=$2; url="" }
|
|
96
|
+
/^[Ll]ocation:/ { url=$2 }
|
|
97
|
+
/^$/ && status && url { printf "%s %s\n", status, url; status=""; url="" }
|
|
98
|
+
' | jq -Rs '
|
|
99
|
+
split("\n") | map(select(length > 0)) |
|
|
100
|
+
to_entries | map({
|
|
101
|
+
hop: .key,
|
|
102
|
+
status: (.value | split(" ")[0] | tonumber),
|
|
103
|
+
location: (.value | split(" ")[1:] | join(" "))
|
|
104
|
+
})
|
|
105
|
+
')
|
|
106
|
+
fi
|
|
107
|
+
|
|
108
|
+
WORD_COUNT=$(count_words "$BODY_FILE")
|
|
109
|
+
[ -z "$WORD_COUNT" ] && WORD_COUNT=0
|
|
110
|
+
|
|
111
|
+
BODY_B64=""
|
|
112
|
+
if [ -s "$BODY_FILE" ]; then
|
|
113
|
+
BODY_B64=$(base64 < "$BODY_FILE")
|
|
114
|
+
fi
|
|
115
|
+
|
|
116
|
+
printf '[%s] ok: status=%s size=%s words=%s time=%ss\n' "$BOT_ID" "$STATUS" "$SIZE" "$WORD_COUNT" "$TOTAL_TIME" >&2
|
|
117
|
+
|
|
118
|
+
jq -n \
|
|
119
|
+
--arg url "$URL" \
|
|
120
|
+
--arg botId "$BOT_ID" \
|
|
121
|
+
--arg botName "$BOT_NAME" \
|
|
122
|
+
--arg ua "$UA" \
|
|
123
|
+
--arg rendersJs "$RENDERS_JS" \
|
|
124
|
+
--argjson status "$STATUS" \
|
|
125
|
+
--argjson totalTime "$TOTAL_TIME" \
|
|
126
|
+
--argjson ttfb "$TTFB" \
|
|
127
|
+
--argjson size "$SIZE" \
|
|
128
|
+
--argjson wordCount "$WORD_COUNT" \
|
|
129
|
+
--argjson headers "$HEADERS_JSON" \
|
|
130
|
+
--argjson redirectCount "$REDIRECT_COUNT" \
|
|
131
|
+
--arg finalUrl "$FINAL_URL" \
|
|
132
|
+
--argjson redirectChain "$REDIRECT_CHAIN" \
|
|
133
|
+
--arg bodyBase64 "$BODY_B64" \
|
|
134
|
+
'{
|
|
135
|
+
url: $url,
|
|
136
|
+
bot: {
|
|
137
|
+
id: $botId,
|
|
138
|
+
name: $botName,
|
|
139
|
+
userAgent: $ua,
|
|
140
|
+
rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
|
|
141
|
+
},
|
|
142
|
+
status: $status,
|
|
143
|
+
timing: { total: $totalTime, ttfb: $ttfb },
|
|
144
|
+
size: $size,
|
|
145
|
+
wordCount: $wordCount,
|
|
146
|
+
redirectCount: $redirectCount,
|
|
147
|
+
finalUrl: $finalUrl,
|
|
148
|
+
redirectChain: $redirectChain,
|
|
149
|
+
headers: $headers,
|
|
150
|
+
bodyBase64: $bodyBase64
|
|
151
|
+
}'
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# schema-fields.sh — Required field definitions per schema.org type.
|
|
3
|
+
# Source this file, then call required_fields_for <SchemaType>.
|
|
4
|
+
|
|
5
|
+
required_fields_for() {
|
|
6
|
+
case "$1" in
|
|
7
|
+
Organization) echo "name url" ;;
|
|
8
|
+
WebSite) echo "name url" ;;
|
|
9
|
+
Article) echo "headline author datePublished" ;;
|
|
10
|
+
NewsArticle) echo "headline author datePublished" ;;
|
|
11
|
+
FAQPage) echo "mainEntity" ;;
|
|
12
|
+
BreadcrumbList) echo "itemListElement" ;;
|
|
13
|
+
CollectionPage) echo "name" ;;
|
|
14
|
+
ItemList) echo "itemListElement" ;;
|
|
15
|
+
AboutPage) echo "name" ;;
|
|
16
|
+
ContactPage) echo "name" ;;
|
|
17
|
+
Product) echo "name" ;;
|
|
18
|
+
LocalBusiness) echo "name address" ;;
|
|
19
|
+
ProfessionalService) echo "name" ;;
|
|
20
|
+
Person) echo "name" ;;
|
|
21
|
+
ImageObject) echo "contentUrl" ;;
|
|
22
|
+
PostalAddress) echo "streetAddress" ;;
|
|
23
|
+
*) echo "" ;;
|
|
24
|
+
esac
|
|
25
|
+
}
|
package/scripts/fetch-as-bot.sh
DELETED
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
|
|
2
|
-
set -euo pipefail
|
|
3
|
-
|
|
4
|
-
# fetch-as-bot.sh — Fetch a URL as a specific bot User-Agent
|
|
5
|
-
# Usage: fetch-as-bot.sh <url> <profile.json>
|
|
6
|
-
|
|
7
|
-
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
8
|
-
# shellcheck source=_lib.sh
|
|
9
|
-
. "$SCRIPT_DIR/_lib.sh"
|
|
10
|
-
|
|
11
|
-
URL="${1:?Usage: fetch-as-bot.sh <url> <profile.json>}"
|
|
12
|
-
PROFILE="${2:?Usage: fetch-as-bot.sh <url> <profile.json>}"
|
|
13
|
-
|
|
14
|
-
BOT_ID=$(jq -r '.id' "$PROFILE")
|
|
15
|
-
BOT_NAME=$(jq -r '.name' "$PROFILE")
|
|
16
|
-
UA=$(jq -r '.userAgent' "$PROFILE")
|
|
17
|
-
RENDERS_JS=$(jq -r '.rendersJavaScript' "$PROFILE")
|
|
18
|
-
|
|
19
|
-
printf '[fetch-as-bot] %s <- %s\n' "$BOT_NAME" "$URL" >&2
|
|
20
|
-
|
|
21
|
-
TMPDIR="${TMPDIR:-/tmp}"
|
|
22
|
-
HEADERS_FILE=$(mktemp "$TMPDIR/crawlsim-headers.XXXXXX")
|
|
23
|
-
BODY_FILE=$(mktemp "$TMPDIR/crawlsim-body.XXXXXX")
|
|
24
|
-
trap 'rm -f "$HEADERS_FILE" "$BODY_FILE"' EXIT
|
|
25
|
-
|
|
26
|
-
TIMING=$(curl -sS -L \
|
|
27
|
-
-H "User-Agent: $UA" \
|
|
28
|
-
-D "$HEADERS_FILE" \
|
|
29
|
-
-o "$BODY_FILE" \
|
|
30
|
-
-w '{"total":%{time_total},"ttfb":%{time_starttransfer},"connect":%{time_connect},"statusCode":%{http_code},"sizeDownload":%{size_download}}' \
|
|
31
|
-
--max-time 30 \
|
|
32
|
-
"$URL" 2>/dev/null || echo '{"total":0,"ttfb":0,"connect":0,"statusCode":0,"sizeDownload":0}')
|
|
33
|
-
|
|
34
|
-
STATUS=$(echo "$TIMING" | jq -r '.statusCode')
|
|
35
|
-
TOTAL_TIME=$(echo "$TIMING" | jq -r '.total')
|
|
36
|
-
TTFB=$(echo "$TIMING" | jq -r '.ttfb')
|
|
37
|
-
SIZE=$(echo "$TIMING" | jq -r '.sizeDownload')
|
|
38
|
-
|
|
39
|
-
# Parse response headers into a JSON object using jq for safe escaping.
|
|
40
|
-
# curl -L writes multiple blocks on redirect; jq keeps the last definition
|
|
41
|
-
# of each header since `add` overwrites left-to-right.
|
|
42
|
-
HEADERS_JSON=$(tr -d '\r' < "$HEADERS_FILE" \
|
|
43
|
-
| grep -E '^[A-Za-z][A-Za-z0-9-]*:[[:space:]]' \
|
|
44
|
-
| jq -Rs '
|
|
45
|
-
split("\n")
|
|
46
|
-
| map(select(length > 0))
|
|
47
|
-
| map(capture("^(?<k>[^:]+):[[:space:]]*(?<v>.*)$"))
|
|
48
|
-
| map({(.k): .v})
|
|
49
|
-
| add // {}
|
|
50
|
-
')
|
|
51
|
-
|
|
52
|
-
WORD_COUNT=$(count_words "$BODY_FILE")
|
|
53
|
-
[ -z "$WORD_COUNT" ] && WORD_COUNT=0
|
|
54
|
-
|
|
55
|
-
BODY_B64=""
|
|
56
|
-
if [ -s "$BODY_FILE" ]; then
|
|
57
|
-
BODY_B64=$(base64 < "$BODY_FILE")
|
|
58
|
-
fi
|
|
59
|
-
|
|
60
|
-
jq -n \
|
|
61
|
-
--arg url "$URL" \
|
|
62
|
-
--arg botId "$BOT_ID" \
|
|
63
|
-
--arg botName "$BOT_NAME" \
|
|
64
|
-
--arg ua "$UA" \
|
|
65
|
-
--arg rendersJs "$RENDERS_JS" \
|
|
66
|
-
--argjson status "$STATUS" \
|
|
67
|
-
--argjson totalTime "$TOTAL_TIME" \
|
|
68
|
-
--argjson ttfb "$TTFB" \
|
|
69
|
-
--argjson size "$SIZE" \
|
|
70
|
-
--argjson wordCount "$WORD_COUNT" \
|
|
71
|
-
--argjson headers "$HEADERS_JSON" \
|
|
72
|
-
--arg bodyBase64 "$BODY_B64" \
|
|
73
|
-
'{
|
|
74
|
-
url: $url,
|
|
75
|
-
bot: {
|
|
76
|
-
id: $botId,
|
|
77
|
-
name: $botName,
|
|
78
|
-
userAgent: $ua,
|
|
79
|
-
rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
|
|
80
|
-
},
|
|
81
|
-
status: $status,
|
|
82
|
-
timing: { total: $totalTime, ttfb: $ttfb },
|
|
83
|
-
size: $size,
|
|
84
|
-
wordCount: $wordCount,
|
|
85
|
-
headers: $headers,
|
|
86
|
-
bodyBase64: $bodyBase64
|
|
87
|
-
}'
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|