@braedenbuilds/crawl-sim 1.0.5 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +15 -0
- package/.claude-plugin/plugin.json +13 -0
- package/README.md +32 -9
- package/bin/install.js +6 -2
- package/package.json +8 -3
- package/{SKILL.md → skills/crawl-sim/SKILL.md} +23 -2
- package/{scripts → skills/crawl-sim/scripts}/_lib.sh +30 -0
- package/skills/crawl-sim/scripts/compute-score.sh +744 -0
- package/{scripts → skills/crawl-sim/scripts}/extract-jsonld.sh +12 -0
- package/skills/crawl-sim/scripts/fetch-as-bot.sh +151 -0
- package/skills/crawl-sim/scripts/schema-fields.sh +25 -0
- package/scripts/compute-score.sh +0 -424
- package/scripts/fetch-as-bot.sh +0 -87
- /package/{profiles → skills/crawl-sim/profiles}/chatgpt-user.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/claude-searchbot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/claude-user.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/claudebot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/googlebot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/gptbot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/oai-searchbot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/perplexity-user.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/perplexitybot.json +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/check-llmstxt.sh +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/check-robots.sh +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/check-sitemap.sh +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/diff-render.sh +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/extract-links.sh +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/extract-meta.sh +0 -0
|
@@ -62,6 +62,7 @@ fi
|
|
|
62
62
|
|
|
63
63
|
VALID_COUNT=0
|
|
64
64
|
INVALID_COUNT=0
|
|
65
|
+
BLOCKS_JSON="[]"
|
|
65
66
|
|
|
66
67
|
if [ "$BLOCK_COUNT" -gt 0 ]; then
|
|
67
68
|
while IFS= read -r block; do
|
|
@@ -79,6 +80,15 @@ if [ "$BLOCK_COUNT" -gt 0 ]; then
|
|
|
79
80
|
else empty end;
|
|
80
81
|
collect_types
|
|
81
82
|
' 2>/dev/null >> "$TYPES_FILE" || true
|
|
83
|
+
|
|
84
|
+
# Extract per-block type + top-level field names for field validation (AC-B1)
|
|
85
|
+
BLOCK_INFO=$(printf '%s' "$block" | jq -c '
|
|
86
|
+
{
|
|
87
|
+
type: (if has("@type") then (.["@type"] | if type == "array" then .[0] else . end) else "unknown" end),
|
|
88
|
+
fields: (keys | map(select(startswith("@") | not)))
|
|
89
|
+
}
|
|
90
|
+
' 2>/dev/null || echo '{"type":"unknown","fields":[]}')
|
|
91
|
+
BLOCKS_JSON=$(printf '%s' "$BLOCKS_JSON" | jq --argjson b "$BLOCK_INFO" '. + [$b]')
|
|
82
92
|
else
|
|
83
93
|
INVALID_COUNT=$((INVALID_COUNT + 1))
|
|
84
94
|
fi
|
|
@@ -109,6 +119,7 @@ jq -n \
|
|
|
109
119
|
--argjson valid "$VALID_COUNT" \
|
|
110
120
|
--argjson invalid "$INVALID_COUNT" \
|
|
111
121
|
--argjson types "$TYPES_JSON" \
|
|
122
|
+
--argjson blocks "$BLOCKS_JSON" \
|
|
112
123
|
--argjson hasOrg "$HAS_ORG" \
|
|
113
124
|
--argjson hasBreadcrumb "$HAS_BREADCRUMB" \
|
|
114
125
|
--argjson hasWebsite "$HAS_WEBSITE" \
|
|
@@ -121,6 +132,7 @@ jq -n \
|
|
|
121
132
|
validCount: $valid,
|
|
122
133
|
invalidCount: $invalid,
|
|
123
134
|
types: $types,
|
|
135
|
+
blocks: $blocks,
|
|
124
136
|
flags: {
|
|
125
137
|
hasOrganization: $hasOrg,
|
|
126
138
|
hasBreadcrumbList: $hasBreadcrumb,
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
#!/usr/bin/env bash
set -euo pipefail

# fetch-as-bot.sh — Fetch a URL as a specific bot User-Agent
# Usage: fetch-as-bot.sh <url> <profile.json>
#
# Reads .id, .name, .userAgent and .rendersJavaScript from <profile.json>,
# fetches <url> with curl (redirects followed, 30 s overall cap) and prints a
# single JSON object on stdout. Progress and diagnostics go to stderr.
#
# A curl failure is reported in-band: the script still exits 0 and prints an
# object with fetchFailed=true, curl's exit code and curl's stderr text, with
# status/timing/size/wordCount zeroed.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=_lib.sh
. "$SCRIPT_DIR/_lib.sh"

URL="${1:?Usage: fetch-as-bot.sh <url> <profile.json>}"
PROFILE="${2:?Usage: fetch-as-bot.sh <url> <profile.json>}"

BOT_ID=$(jq -r '.id' "$PROFILE")
BOT_NAME=$(jq -r '.name' "$PROFILE")
UA=$(jq -r '.userAgent' "$PROFILE")
RENDERS_JS=$(jq -r '.rendersJavaScript' "$PROFILE")

TMPDIR="${TMPDIR:-/tmp}"
HEADERS_FILE=$(mktemp "$TMPDIR/crawlsim-headers.XXXXXX")
BODY_FILE=$(mktemp "$TMPDIR/crawlsim-body.XXXXXX")
CURL_STDERR_FILE=$(mktemp "$TMPDIR/crawlsim-stderr.XXXXXX")
BODY_B64_FILE=$(mktemp "$TMPDIR/crawlsim-b64.XXXXXX")
trap 'rm -f "$HEADERS_FILE" "$BODY_FILE" "$CURL_STDERR_FILE" "$BODY_B64_FILE"' EXIT

printf '[%s] fetching %s\n' "$BOT_ID" "$URL" >&2

# errexit is suspended around curl so a failed fetch can be turned into a
# JSON error record instead of killing the script.
#
# The write-out template is tab-separated rather than hand-built JSON: curl
# does not escape %{url_effective}, so a quote or backslash in the final URL
# would otherwise produce unparseable JSON.
set +e
TIMING=$(curl -sS -L \
  -H "User-Agent: $UA" \
  -D "$HEADERS_FILE" \
  -o "$BODY_FILE" \
  -w '%{http_code}\t%{time_total}\t%{time_starttransfer}\t%{size_download}\t%{num_redirects}\t%{url_effective}' \
  --max-time 30 \
  "$URL" 2>"$CURL_STDERR_FILE")
CURL_EXIT=$?
set -e

CURL_ERR=""
if [ -s "$CURL_STDERR_FILE" ]; then
  CURL_ERR=$(cat "$CURL_STDERR_FILE")
fi

if [ "$CURL_EXIT" -ne 0 ]; then
  printf '[%s] FAILED: curl exit %d — %s\n' "$BOT_ID" "$CURL_EXIT" "$CURL_ERR" >&2
  jq -n \
    --arg url "$URL" \
    --arg botId "$BOT_ID" \
    --arg botName "$BOT_NAME" \
    --arg ua "$UA" \
    --arg rendersJs "$RENDERS_JS" \
    --arg error "$CURL_ERR" \
    --argjson exitCode "$CURL_EXIT" \
    '{
      url: $url,
      bot: {
        id: $botId,
        name: $botName,
        userAgent: $ua,
        rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
      },
      fetchFailed: true,
      error: $error,
      curlExitCode: $exitCode,
      status: 0,
      timing: { total: 0, ttfb: 0 },
      size: 0,
      wordCount: 0,
      headers: {},
      bodyBase64: ""
    }'
  exit 0
fi

IFS=$'\t' read -r STATUS TOTAL_TIME TTFB SIZE REDIRECT_COUNT FINAL_URL <<< "$TIMING"
# curl prints the status zero-padded to three digits (e.g. 000 when there is
# no HTTP response); force base 10 so it is a plain JSON number for --argjson.
STATUS=$((10#$STATUS))

# Parse response headers into a JSON object using jq for safe escaping.
# curl -L writes multiple blocks on redirect; jq keeps the last definition
# of each header since `add` overwrites left-to-right.
# grep exits 1 when nothing matches; with pipefail that would abort the
# script, so a no-match is mapped to empty input (which yields {}).
HEADERS_JSON=$(tr -d '\r' < "$HEADERS_FILE" \
  | { grep -E '^[A-Za-z][A-Za-z0-9-]*:[[:space:]]' || true; } \
  | jq -Rs '
      split("\n")
      | map(select(length > 0))
      | map(capture("^(?<k>[^:]+):[[:space:]]*(?<v>.*)$"))
      | map({(.k): .v})
      | add // {}
    ')

# Parse redirect chain from headers dump.
# curl -D writes multiple HTTP response blocks on redirect — each starts with HTTP/.
# A hop is emitted at the blank line closing a block that carried both a
# status line and a Location header.
REDIRECT_CHAIN="[]"
if [ "$REDIRECT_COUNT" -gt 0 ]; then
  REDIRECT_CHAIN=$(tr -d '\r' < "$HEADERS_FILE" | awk '
    /^HTTP\// { status=$2; url="" }
    /^[Ll]ocation:/ { url=$2 }
    /^$/ && status && url { printf "%s %s\n", status, url; status=""; url="" }
  ' | jq -Rs '
    split("\n") | map(select(length > 0)) |
    to_entries | map({
      hop: .key,
      status: (.value | split(" ")[0] | tonumber),
      location: (.value | split(" ")[1:] | join(" "))
    })
  ')
fi

# count_words is not defined in this file; presumably provided by _lib.sh.
WORD_COUNT=$(count_words "$BODY_FILE")
[ -z "$WORD_COUNT" ] && WORD_COUNT=0

# Encode the body to a file instead of a shell variable. Passing it to jq
# with --arg puts the whole payload in one argv string, and the kernel caps
# the size of a single argument (128 KiB on Linux), which the base64 of an
# ordinary HTML page exceeds. An empty body leaves the file empty, which
# decodes to bodyBase64 "" below.
if [ -s "$BODY_FILE" ]; then
  base64 < "$BODY_FILE" > "$BODY_B64_FILE"
fi

printf '[%s] ok: status=%s size=%s words=%s time=%ss\n' "$BOT_ID" "$STATUS" "$SIZE" "$WORD_COUNT" "$TOTAL_TIME" >&2

# -Rs slurps stdin (the base64 text) as one raw string; rtrimstr drops the
# trailing newline that command substitution used to strip.
jq -Rs \
  --arg url "$URL" \
  --arg botId "$BOT_ID" \
  --arg botName "$BOT_NAME" \
  --arg ua "$UA" \
  --arg rendersJs "$RENDERS_JS" \
  --argjson status "$STATUS" \
  --argjson totalTime "$TOTAL_TIME" \
  --argjson ttfb "$TTFB" \
  --argjson size "$SIZE" \
  --argjson wordCount "$WORD_COUNT" \
  --argjson headers "$HEADERS_JSON" \
  --argjson redirectCount "$REDIRECT_COUNT" \
  --arg finalUrl "$FINAL_URL" \
  --argjson redirectChain "$REDIRECT_CHAIN" \
  '{
    url: $url,
    bot: {
      id: $botId,
      name: $botName,
      userAgent: $ua,
      rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
    },
    status: $status,
    timing: { total: $totalTime, ttfb: $ttfb },
    size: $size,
    wordCount: $wordCount,
    redirectCount: $redirectCount,
    finalUrl: $finalUrl,
    redirectChain: $redirectChain,
    headers: $headers,
    bodyBase64: rtrimstr("\n")
  }' < "$BODY_B64_FILE"
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# schema-fields.sh — Required field definitions per schema.org type.
# Source this file, then call required_fields_for <SchemaType>.

#######################################
# Print the required top-level field names for a schema.org type.
# Arguments: $1 - schema.org type name (case-sensitive, e.g. "Article")
# Outputs:   one line on stdout: the field names separated by single spaces,
#            or an empty line when the type is not listed (or $1 is missing).
# Returns:   0 always.
#######################################
required_fields_for() {
  # ${1-} rather than $1: this file is meant to be sourced, and a caller
  # running under `set -u` would otherwise abort on a no-argument call.
  case "${1-}" in
    Organization | WebSite)
      echo "name url" ;;
    Article | NewsArticle)
      echo "headline author datePublished" ;;
    FAQPage)
      echo "mainEntity" ;;
    BreadcrumbList | ItemList)
      echo "itemListElement" ;;
    CollectionPage | AboutPage | ContactPage | Product | ProfessionalService | Person)
      echo "name" ;;
    LocalBusiness)
      echo "name address" ;;
    ImageObject)
      echo "contentUrl" ;;
    PostalAddress)
      echo "streetAddress" ;;
    *)
      echo "" ;;
  esac
}
|
package/scripts/compute-score.sh
DELETED
|
@@ -1,424 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
set -eu

# compute-score.sh — Aggregate check outputs into per-bot + per-category scores
# Usage: compute-score.sh <results-dir>
# Output: JSON to stdout
#
# Expected filenames in <results-dir>:
#   fetch-<bot_id>.json       — fetch-as-bot.sh output
#   meta-<bot_id>.json        — extract-meta.sh output
#   jsonld-<bot_id>.json      — extract-jsonld.sh output
#   links-<bot_id>.json       — extract-links.sh output
#   robots-<bot_id>.json      — check-robots.sh output
#   llmstxt.json              — check-llmstxt.sh output (bot-independent)
#   sitemap.json              — check-sitemap.sh output (bot-independent)
#   diff-render.json          — diff-render.sh output (optional, Googlebot only)

# The results directory is mandatory; ${1:?} aborts with the usage line when
# it is missing or empty.
RESULTS_DIR="${1:?Usage: compute-score.sh <results-dir>}"
printf '[compute-score] aggregating %s\n' "$RESULTS_DIR" >&2

if [ ! -d "$RESULTS_DIR" ]; then
  echo "Error: results dir not found: $RESULTS_DIR" >&2
  exit 1
fi

# Category weights (as percentages of per-bot composite)
# Only read after this point, so they are locked against accidental
# reassignment.
readonly W_ACCESSIBILITY=25
readonly W_CONTENT=30
readonly W_STRUCTURED=20
readonly W_TECHNICAL=15
readonly W_AI=10
|
|
32
|
-
|
|
33
|
-
# Overall composite weights (per bot)
# Default: Googlebot 40, GPTBot 20, ClaudeBot 20, PerplexityBot 20
#
# overall_weight <bot_id>
# Prints the weight of the given bot id in the overall composite score.
# Any id other than the four listed below (including a missing argument)
# prints 0.
overall_weight() {
  # ${1-} keeps a no-argument call from tripping `set -u`.
  case "${1-}" in
    googlebot)
      echo 40 ;;
    gptbot | claudebot | perplexitybot)
      echo 20 ;;
    *)
      echo 0 ;;
  esac
}
|
|
44
|
-
|
|
45
|
-
# Grade from score (0-100)
#
# grade_for <score>
# Prints the letter grade (A, A-, B+ ... D-, F) for a numeric score.
# A fractional score is compared by its integer part, which is exact for the
# integer ">=" cut-offs below (92.9 is still below 93). Empty or non-numeric
# input is graded as 0, i.e. "F", without emitting test(1) errors.
grade_for() {
  local s=${1-}
  local int_re='^-?[0-9]+$'

  # `[ x -ge y ]` only accepts integers: drop any fractional part, then fall
  # back to 0 for anything that still is not an integer.
  s=${s%%.*}
  [[ $s =~ $int_re ]] || s=0

  if   [ "$s" -ge 93 ]; then echo "A"
  elif [ "$s" -ge 90 ]; then echo "A-"
  elif [ "$s" -ge 87 ]; then echo "B+"
  elif [ "$s" -ge 83 ]; then echo "B"
  elif [ "$s" -ge 80 ]; then echo "B-"
  elif [ "$s" -ge 77 ]; then echo "C+"
  elif [ "$s" -ge 73 ]; then echo "C"
  elif [ "$s" -ge 70 ]; then echo "C-"
  elif [ "$s" -ge 67 ]; then echo "D+"
  elif [ "$s" -ge 63 ]; then echo "D"
  elif [ "$s" -ge 60 ]; then echo "D-"
  else echo "F"
  fi
}
|
|
62
|
-
|
|
63
|
-
# Read a jq value from a file with a default fallback
#
# jget <file> <jq-query> [default]
#   file    - JSON file to read; if it is not a regular file the default is
#             printed and jq is never run
#   query   - jq expression; it is spliced into the program text, so it must
#             be a trusted literal, never external input
#   default - printed when the query yields null/false, jq fails, or the file
#             is missing. An omitted OR empty default becomes the literal
#             string "null" (callers compare against "null").
# Note: the jq `//` operator treats boolean false like null, so a stored
# false is reported as the default.
jget() {
  local file=$1
  local query=$2
  local default=${3:-null}

  # printf instead of echo: echo would swallow a default such as "-n".
  if [ -f "$file" ]; then
    # jq errors are intentionally silenced; any failure degrades to the default.
    jq -r --arg d "$default" "$query // \$d" "$file" 2>/dev/null \
      || printf '%s\n' "$default"
  else
    printf '%s\n' "$default"
  fi
}
|
|
74
|
-
|
|
75
|
-
# jget_num <file> <jq-query>
# Numeric variant of jget: prints the queried value when it is a plain
# decimal number (optional leading minus, optional fraction), otherwise 0.
# The value may be fractional, so callers that need an integer must not feed
# it to `[ -ge ]` / $(( )) unless the field is known to be integral.
jget_num() {
  local v
  local num_re='^-?[0-9]+(\.[0-9]+)?$'
  v=$(jget "$1" "$2" "0")

  # Match against the whole string. A per-line grep would accept multi-line
  # jq output as soon as any single line looked numeric.
  if [[ $v =~ $num_re ]]; then
    printf '%s\n' "$v"
  else
    # Replace "null" or non-numeric with 0
    echo "0"
  fi
}
|
|
85
|
-
|
|
86
|
-
# jget_bool <file> <jq-query>
# Boolean variant of jget: prints "true" only when the queried value is
# exactly the string "true"; every other outcome (false, null, missing file,
# any other text) prints "false".
jget_bool() {
  local v
  v=$(jget "$1" "$2" "false")
  case "$v" in
    true) echo "true" ;;
    *)    echo "false" ;;
  esac
}
|
|
91
|
-
|
|
92
|
-
# --- Bot discovery ---
# Every fetch-<bot_id>.json in the results dir contributes one bot. The first
# file seen (glob order) is remembered; the report's target URL is read from
# it at the end.
BOTS=""
FIRST_FETCH=""
for f in "$RESULTS_DIR"/fetch-*.json; do
  # An unmatched glob stays literal; skip it.
  [ -f "$f" ] || continue
  [ -n "$FIRST_FETCH" ] || FIRST_FETCH=$f
  # Strip directory, ".json" suffix and "fetch-" prefix without forking.
  bot_id=${f##*/}
  bot_id=${bot_id%.json}
  bot_id=${bot_id#fetch-}
  BOTS="$BOTS $bot_id"
done

if [ -z "$BOTS" ]; then
  echo "Error: no fetch-*.json files found in $RESULTS_DIR" >&2
  exit 1
fi

LLMSTXT_FILE="$RESULTS_DIR/llmstxt.json"
SITEMAP_FILE="$RESULTS_DIR/sitemap.json"
DIFF_RENDER_FILE="$RESULTS_DIR/diff-render.json"

# Load Playwright render-delta data once (used to differentiate JS-rendering
# bots from non-rendering ones). If the comparison was skipped or missing,
# all bots score against server HTML only.
DIFF_AVAILABLE=false
DIFF_RENDERED_WORDS=0
DIFF_DELTA_PCT=0
if [ -f "$DIFF_RENDER_FILE" ]; then
  # Explicit null check — `.skipped // true` would treat real false as null.
  DIFF_SKIPPED=$(jq -r '.skipped | if . == null then "true" else tostring end' "$DIFF_RENDER_FILE" 2>/dev/null || echo "true")
  if [ "$DIFF_SKIPPED" = "false" ]; then
    DIFF_AVAILABLE=true
    # Used in $(( )) and `[ -ge ]` below, which need an integer: floor
    # numbers and map anything else to 0.
    DIFF_RENDERED_WORDS=$(jq -r '(.renderedWordCount // 0) | if type == "number" then floor else 0 end' "$DIFF_RENDER_FILE")
    DIFF_DELTA_PCT=$(jq -r '.deltaPct // 0' "$DIFF_RENDER_FILE")
  fi
fi

BOTS_JSON="{}"

# Accumulators for per-category averages (across bots)
CAT_ACCESSIBILITY_SUM=0
CAT_CONTENT_SUM=0
CAT_STRUCTURED_SUM=0
CAT_TECHNICAL_SUM=0
CAT_AI_SUM=0
CAT_N=0

# Accumulators for overall weighted composite
OVERALL_WEIGHTED_SUM=0
OVERALL_WEIGHT_TOTAL=0

# $BOTS is a space-separated list, so it is deliberately left unquoted here.
for bot_id in $BOTS; do
  FETCH="$RESULTS_DIR/fetch-$bot_id.json"
  META="$RESULTS_DIR/meta-$bot_id.json"
  JSONLD="$RESULTS_DIR/jsonld-$bot_id.json"
  LINKS="$RESULTS_DIR/links-$bot_id.json"
  ROBOTS="$RESULTS_DIR/robots-$bot_id.json"

  BOT_NAME=$(jget "$FETCH" '.bot.name' "$bot_id")
  STATUS=$(jget_num "$FETCH" '.status')
  TOTAL_TIME=$(jget_num "$FETCH" '.timing.total')
  SERVER_WORD_COUNT=$(jget_num "$FETCH" '.wordCount')
  # Read with explicit null fallback — jq's `//` is unsafe here because it
  # treats boolean false as falsy, which is exactly the value we need to see.
  RENDERS_JS=$(jq -r '.bot.rendersJavaScript | if . == null then "unknown" else tostring end' "$FETCH" 2>/dev/null || echo "unknown")

  ROBOTS_ALLOWED=$(jget_bool "$ROBOTS" '.allowed')

  # Effective word count depends on JS rendering capability:
  #   - true (e.g. Googlebot) + diff-render data → rendered DOM word count
  #   - false (AI training/search bots, observed) → server HTML only, with
  #     penalty proportional to the rendering delta
  #   - unknown → conservative: server HTML (same as false but no penalty)
  EFFECTIVE_WORD_COUNT=$SERVER_WORD_COUNT
  HYDRATION_PENALTY=0
  MISSED_WORDS=0
  if [ "$DIFF_AVAILABLE" = "true" ]; then
    if [ "$RENDERS_JS" = "true" ]; then
      EFFECTIVE_WORD_COUNT=$DIFF_RENDERED_WORDS
    elif [ "$RENDERS_JS" = "false" ]; then
      # Absolute-value delta: if rendered DOM has materially more than server,
      # AI bots are missing that content.
      ABS_DELTA=$(awk -v d="$DIFF_DELTA_PCT" 'BEGIN { printf "%d", (d < 0 ? -d : d) + 0.5 }')
      if [ "$ABS_DELTA" -gt 5 ]; then
        # Penalty = (delta - 5) points, capped at 15 (reached at a 20% delta).
        HYDRATION_PENALTY=$(awk -v d="$ABS_DELTA" 'BEGIN {
          p = (d - 5)
          if (p > 15) p = 15
          printf "%d", p
        }')
      fi
      MISSED_WORDS=$((DIFF_RENDERED_WORDS - SERVER_WORD_COUNT))
      [ "$MISSED_WORDS" -lt 0 ] && MISSED_WORDS=0
    fi
  fi

  # --- Category 1: Accessibility (0-100) ---
  ACC=0
  # robots.txt allows: 40
  [ "$ROBOTS_ALLOWED" = "true" ] && ACC=$((ACC + 40))
  # HTTP 200: 40
  [ "$STATUS" = "200" ] && ACC=$((ACC + 40))
  # Response time: <2s = 20, <5s = 10, else 0
  TIME_SCORE=$(awk -v t="$TOTAL_TIME" 'BEGIN { if (t < 2) print 20; else if (t < 5) print 10; else print 0 }')
  ACC=$((ACC + TIME_SCORE))

  # --- Category 2: Content Visibility (0-100) ---
  CONTENT=0
  # Word count: >=300 = 30, >=150 = 20, >=50 = 10
  if [ "$EFFECTIVE_WORD_COUNT" -ge 300 ]; then CONTENT=$((CONTENT + 30))
  elif [ "$EFFECTIVE_WORD_COUNT" -ge 150 ]; then CONTENT=$((CONTENT + 20))
  elif [ "$EFFECTIVE_WORD_COUNT" -ge 50 ]; then CONTENT=$((CONTENT + 10))
  fi

  # Headings: at least one H1 = 20, at least one H2 = 15
  H1_COUNT=$(jget_num "$META" '.headings.h1.count')
  H2_COUNT=$(jget_num "$META" '.headings.h2.count')
  [ "$H1_COUNT" -ge 1 ] && CONTENT=$((CONTENT + 20))
  [ "$H2_COUNT" -ge 1 ] && CONTENT=$((CONTENT + 15))

  # Internal links: >=5 = 20, >=1 = 10
  INTERNAL_LINKS=$(jget_num "$LINKS" '.counts.internal')
  if [ "$INTERNAL_LINKS" -ge 5 ]; then CONTENT=$((CONTENT + 20))
  elif [ "$INTERNAL_LINKS" -ge 1 ]; then CONTENT=$((CONTENT + 10))
  fi

  # Image alt coverage: up to 15, pro rata; a page without images gets all 15
  IMG_TOTAL=$(jget_num "$META" '.images.total')
  IMG_WITH_ALT=$(jget_num "$META" '.images.withAlt')
  if [ "$IMG_TOTAL" -eq 0 ]; then
    CONTENT=$((CONTENT + 15))
  else
    ALT_SCORE=$(awk -v a="$IMG_WITH_ALT" -v t="$IMG_TOTAL" 'BEGIN { printf "%d", (a / t) * 15 }')
    CONTENT=$((CONTENT + ALT_SCORE))
  fi

  # Apply hydration penalty for non-rendering bots that are missing content
  CONTENT=$((CONTENT - HYDRATION_PENALTY))
  [ "$CONTENT" -lt 0 ] && CONTENT=0

  # --- Category 3: Structured Data (0-100) ---
  STRUCTURED=0
  JSONLD_COUNT=$(jget_num "$JSONLD" '.blockCount')
  JSONLD_VALID=$(jget_num "$JSONLD" '.validCount')
  JSONLD_INVALID=$(jget_num "$JSONLD" '.invalidCount')
  HAS_ORG=$(jget_bool "$JSONLD" '.flags.hasOrganization')
  HAS_WEBSITE=$(jget_bool "$JSONLD" '.flags.hasWebSite')
  HAS_BREADCRUMB=$(jget_bool "$JSONLD" '.flags.hasBreadcrumbList')
  HAS_ARTICLE=$(jget_bool "$JSONLD" '.flags.hasArticle')
  HAS_PRODUCT=$(jget_bool "$JSONLD" '.flags.hasProduct')
  HAS_FAQ=$(jget_bool "$JSONLD" '.flags.hasFAQPage')

  # Any JSON-LD block: 30; all blocks valid: +20
  [ "$JSONLD_COUNT" -ge 1 ] && STRUCTURED=$((STRUCTURED + 30))
  if [ "$JSONLD_COUNT" -ge 1 ] && [ "$JSONLD_INVALID" -eq 0 ]; then
    STRUCTURED=$((STRUCTURED + 20))
  fi
  # Organization or WebSite: 20
  if [ "$HAS_ORG" = "true" ] || [ "$HAS_WEBSITE" = "true" ]; then
    STRUCTURED=$((STRUCTURED + 20))
  fi
  # BreadcrumbList: 15
  [ "$HAS_BREADCRUMB" = "true" ] && STRUCTURED=$((STRUCTURED + 15))
  # Article, Product or FAQPage: 15
  if [ "$HAS_ARTICLE" = "true" ] || [ "$HAS_PRODUCT" = "true" ] || [ "$HAS_FAQ" = "true" ]; then
    STRUCTURED=$((STRUCTURED + 15))
  fi

  # --- Category 4: Technical Signals (0-100) ---
  TECHNICAL=0
  # jget turns an empty default into "null", hence the "null" comparisons.
  TITLE=$(jget "$META" '.title' "")
  DESCRIPTION=$(jget "$META" '.description' "")
  CANONICAL=$(jget "$META" '.canonical' "")
  OG_TITLE=$(jget "$META" '.og.title' "")
  OG_DESC=$(jget "$META" '.og.description' "")

  [ -n "$TITLE" ] && [ "$TITLE" != "null" ] && TECHNICAL=$((TECHNICAL + 25))
  [ -n "$DESCRIPTION" ] && [ "$DESCRIPTION" != "null" ] && TECHNICAL=$((TECHNICAL + 25))
  [ -n "$CANONICAL" ] && [ "$CANONICAL" != "null" ] && TECHNICAL=$((TECHNICAL + 20))
  if [ -n "$OG_TITLE" ] && [ "$OG_TITLE" != "null" ]; then TECHNICAL=$((TECHNICAL + 8)); fi
  if [ -n "$OG_DESC" ] && [ "$OG_DESC" != "null" ]; then TECHNICAL=$((TECHNICAL + 7)); fi

  # Sitemap: exists and lists the target = 15, exists only = 10
  SITEMAP_EXISTS=$(jget_bool "$SITEMAP_FILE" '.exists')
  SITEMAP_CONTAINS=$(jget_bool "$SITEMAP_FILE" '.containsTarget')
  if [ "$SITEMAP_EXISTS" = "true" ] && [ "$SITEMAP_CONTAINS" = "true" ]; then
    TECHNICAL=$((TECHNICAL + 15))
  elif [ "$SITEMAP_EXISTS" = "true" ]; then
    TECHNICAL=$((TECHNICAL + 10))
  fi

  # --- Category 5: AI Readiness (0-100) ---
  AI=0
  LLMS_EXISTS=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.exists')
  LLMS_HAS_TITLE=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.hasTitle')
  LLMS_HAS_DESC=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.hasDescription')
  LLMS_URLS=$(jget_num "$LLMSTXT_FILE" '.llmsTxt.urlCount')

  # llms.txt present: 40, plus 7 / 7 / 6 for title, description, >=1 URL
  if [ "$LLMS_EXISTS" = "true" ]; then
    AI=$((AI + 40))
    [ "$LLMS_HAS_TITLE" = "true" ] && AI=$((AI + 7))
    [ "$LLMS_HAS_DESC" = "true" ] && AI=$((AI + 7))
    [ "$LLMS_URLS" -ge 1 ] && AI=$((AI + 6))
  fi
  # Content citable (>= 200 words, effective for this bot)
  [ "$EFFECTIVE_WORD_COUNT" -ge 200 ] && AI=$((AI + 20))
  # Semantic clarity: has H1 + description
  if [ "$H1_COUNT" -ge 1 ] && [ -n "$DESCRIPTION" ] && [ "$DESCRIPTION" != "null" ]; then
    AI=$((AI + 20))
  fi

  # Cap categories at 100
  [ "$ACC" -gt 100 ] && ACC=100
  [ "$CONTENT" -gt 100 ] && CONTENT=100
  [ "$STRUCTURED" -gt 100 ] && STRUCTURED=100
  [ "$TECHNICAL" -gt 100 ] && TECHNICAL=100
  [ "$AI" -gt 100 ] && AI=100

  # Per-bot composite score (weighted average of 5 categories), rounded to
  # the nearest integer by the +0.5 before %d truncation.
  BOT_SCORE=$(awk -v a="$ACC" -v c="$CONTENT" -v s="$STRUCTURED" -v t="$TECHNICAL" -v ai="$AI" \
    -v wa="$W_ACCESSIBILITY" -v wc="$W_CONTENT" -v ws="$W_STRUCTURED" -v wt="$W_TECHNICAL" -v wai="$W_AI" \
    'BEGIN { printf "%d", (a*wa + c*wc + s*ws + t*wt + ai*wai) / (wa+wc+ws+wt+wai) + 0.5 }')

  BOT_GRADE=$(grade_for "$BOT_SCORE")
  ACC_GRADE=$(grade_for "$ACC")
  CONTENT_GRADE=$(grade_for "$CONTENT")
  STRUCTURED_GRADE=$(grade_for "$STRUCTURED")
  TECHNICAL_GRADE=$(grade_for "$TECHNICAL")
  AI_GRADE=$(grade_for "$AI")

  BOT_OBJ=$(jq -n \
    --arg id "$bot_id" \
    --arg name "$BOT_NAME" \
    --arg rendersJs "$RENDERS_JS" \
    --argjson score "$BOT_SCORE" \
    --arg grade "$BOT_GRADE" \
    --argjson acc "$ACC" \
    --arg accGrade "$ACC_GRADE" \
    --argjson content "$CONTENT" \
    --arg contentGrade "$CONTENT_GRADE" \
    --argjson structured "$STRUCTURED" \
    --arg structuredGrade "$STRUCTURED_GRADE" \
    --argjson technical "$TECHNICAL" \
    --arg technicalGrade "$TECHNICAL_GRADE" \
    --argjson ai "$AI" \
    --arg aiGrade "$AI_GRADE" \
    --argjson serverWords "$SERVER_WORD_COUNT" \
    --argjson effectiveWords "$EFFECTIVE_WORD_COUNT" \
    --argjson missedWords "$MISSED_WORDS" \
    --argjson hydrationPenalty "$HYDRATION_PENALTY" \
    '{
      id: $id,
      name: $name,
      rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end),
      score: $score,
      grade: $grade,
      visibility: {
        serverWords: $serverWords,
        effectiveWords: $effectiveWords,
        missedWordsVsRendered: $missedWords,
        hydrationPenaltyPts: $hydrationPenalty
      },
      categories: {
        accessibility: { score: $acc, grade: $accGrade },
        contentVisibility: { score: $content, grade: $contentGrade },
        structuredData: { score: $structured, grade: $structuredGrade },
        technicalSignals: { score: $technical, grade: $technicalGrade },
        aiReadiness: { score: $ai, grade: $aiGrade }
      }
    }')

  BOTS_JSON=$(printf '%s' "$BOTS_JSON" | jq --argjson bot "$BOT_OBJ" --arg id "$bot_id" '.[$id] = $bot')

  # Accumulate category averages
  CAT_ACCESSIBILITY_SUM=$((CAT_ACCESSIBILITY_SUM + ACC))
  CAT_CONTENT_SUM=$((CAT_CONTENT_SUM + CONTENT))
  CAT_STRUCTURED_SUM=$((CAT_STRUCTURED_SUM + STRUCTURED))
  CAT_TECHNICAL_SUM=$((CAT_TECHNICAL_SUM + TECHNICAL))
  CAT_AI_SUM=$((CAT_AI_SUM + AI))
  CAT_N=$((CAT_N + 1))

  # Accumulate weighted overall (bots with weight 0 are left out)
  W=$(overall_weight "$bot_id")
  if [ "$W" -gt 0 ]; then
    OVERALL_WEIGHTED_SUM=$((OVERALL_WEIGHTED_SUM + BOT_SCORE * W))
    OVERALL_WEIGHT_TOTAL=$((OVERALL_WEIGHT_TOTAL + W))
  fi
done

# Per-category averages (across all bots). Integer division truncates.
# CAT_N >= 1 here because an empty bot list exited above.
CAT_ACC_AVG=$((CAT_ACCESSIBILITY_SUM / CAT_N))
CAT_CONTENT_AVG=$((CAT_CONTENT_SUM / CAT_N))
CAT_STRUCTURED_AVG=$((CAT_STRUCTURED_SUM / CAT_N))
CAT_TECHNICAL_AVG=$((CAT_TECHNICAL_SUM / CAT_N))
CAT_AI_AVG=$((CAT_AI_SUM / CAT_N))

# Overall composite
if [ "$OVERALL_WEIGHT_TOTAL" -gt 0 ]; then
  OVERALL_SCORE=$((OVERALL_WEIGHTED_SUM / OVERALL_WEIGHT_TOTAL))
else
  # Fall back to simple average if none of the 4 standard bots are present
  OVERALL_SCORE=$(((CAT_ACC_AVG + CAT_CONTENT_AVG + CAT_STRUCTURED_AVG + CAT_TECHNICAL_AVG + CAT_AI_AVG) / 5))
fi

OVERALL_GRADE=$(grade_for "$OVERALL_SCORE")
CAT_ACC_GRADE=$(grade_for "$CAT_ACC_AVG")
CAT_CONTENT_GRADE=$(grade_for "$CAT_CONTENT_AVG")
CAT_STRUCTURED_GRADE=$(grade_for "$CAT_STRUCTURED_AVG")
CAT_TECHNICAL_GRADE=$(grade_for "$CAT_TECHNICAL_AVG")
CAT_AI_GRADE=$(grade_for "$CAT_AI_AVG")

# Get the URL from the first fetch file (recorded during bot discovery, so
# no `ls` output has to be parsed)
TARGET_URL=$(jget "$FIRST_FETCH" '.url' "")
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

jq -n \
  --arg url "$TARGET_URL" \
  --arg timestamp "$TIMESTAMP" \
  --arg version "0.1.0" \
  --argjson overallScore "$OVERALL_SCORE" \
  --arg overallGrade "$OVERALL_GRADE" \
  --argjson bots "$BOTS_JSON" \
  --argjson catAcc "$CAT_ACC_AVG" \
  --arg catAccGrade "$CAT_ACC_GRADE" \
  --argjson catContent "$CAT_CONTENT_AVG" \
  --arg catContentGrade "$CAT_CONTENT_GRADE" \
  --argjson catStructured "$CAT_STRUCTURED_AVG" \
  --arg catStructuredGrade "$CAT_STRUCTURED_GRADE" \
  --argjson catTechnical "$CAT_TECHNICAL_AVG" \
  --arg catTechnicalGrade "$CAT_TECHNICAL_GRADE" \
  --argjson catAi "$CAT_AI_AVG" \
  --arg catAiGrade "$CAT_AI_GRADE" \
  '{
    url: $url,
    timestamp: $timestamp,
    version: $version,
    overall: { score: $overallScore, grade: $overallGrade },
    bots: $bots,
    categories: {
      accessibility: { score: $catAcc, grade: $catAccGrade },
      contentVisibility: { score: $catContent, grade: $catContentGrade },
      structuredData: { score: $catStructured, grade: $catStructuredGrade },
      technicalSignals: { score: $catTechnical, grade: $catTechnicalGrade },
      aiReadiness: { score: $catAi, grade: $catAiGrade }
    }
  }'
|