@braedenbuilds/crawl-sim 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +261 -0
- package/SKILL.md +196 -0
- package/bin/install.js +159 -0
- package/package.json +46 -0
- package/profiles/chatgpt-user.json +28 -0
- package/profiles/claude-searchbot.json +28 -0
- package/profiles/claude-user.json +28 -0
- package/profiles/claudebot.json +28 -0
- package/profiles/googlebot.json +28 -0
- package/profiles/gptbot.json +28 -0
- package/profiles/oai-searchbot.json +28 -0
- package/profiles/perplexity-user.json +28 -0
- package/profiles/perplexitybot.json +28 -0
- package/scripts/_lib.sh +51 -0
- package/scripts/check-llmstxt.sh +116 -0
- package/scripts/check-robots.sh +196 -0
- package/scripts/check-sitemap.sh +79 -0
- package/scripts/compute-score.sh +424 -0
- package/scripts/diff-render.sh +136 -0
- package/scripts/extract-jsonld.sh +133 -0
- package/scripts/extract-links.sh +103 -0
- package/scripts/extract-meta.sh +117 -0
- package/scripts/fetch-as-bot.sh +87 -0
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
#!/usr/bin/env bash
set -eu

# check-sitemap.sh — Fetch sitemap.xml, check URL inclusion and structure
# Usage: check-sitemap.sh <url>
#
# Emits a JSON report on stdout:
#   exists            — /sitemap.xml returned 200 and looks like XML
#   isIndex           — the file is a <sitemapindex> rather than a <urlset>
#   urlCount          — number of <loc> entries (URLs, or child sitemaps)
#   childSitemapCount — number of <sitemap> entries when isIndex is true
#   containsTarget    — the target URL (± trailing slash) appears in the file
#   hasLastmod        — at least one <lastmod> date is present

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=_lib.sh
. "$SCRIPT_DIR/_lib.sh"

URL="${1:?Usage: check-sitemap.sh <url>}"
printf '[check-sitemap] %s\n' "$URL" >&2
ORIGIN=$(origin_from_url "$URL")
SITEMAP_URL="${ORIGIN}/sitemap.xml"

TMPDIR="${TMPDIR:-/tmp}"
SITEMAP_FILE=$(mktemp "$TMPDIR/crawlsim-sitemap.XXXXXX")
trap 'rm -f "$SITEMAP_FILE"' EXIT

HTTP_STATUS=$(fetch_to_file "$SITEMAP_URL" "$SITEMAP_FILE")

EXISTS=false
URL_COUNT=0
CONTAINS_TARGET=false
HAS_LASTMOD=false
IS_INDEX=false
CHILD_SITEMAP_COUNT=0

if [ "$HTTP_STATUS" = "200" ] && [ -s "$SITEMAP_FILE" ]; then
  # Check if content looks like XML (not an HTML fallback/error page)
  FIRST_BYTES=$(head -c 200 "$SITEMAP_FILE" | tr '[:upper:]' '[:lower:]')
  case "$FIRST_BYTES" in
    *"<!doctype html"*|*"<html"*) ;;
    *)
      EXISTS=true

      # Is this a sitemap index?
      # NOTE: counting uses `grep -oi` (case-insensitive) so it stays
      # consistent with the case-insensitive `-qi` detection — previously
      # an uppercase-tag sitemap was detected as an index but counted 0.
      if grep -qi '<sitemapindex' "$SITEMAP_FILE"; then
        IS_INDEX=true
        CHILD_SITEMAP_COUNT=$(grep -oi '<sitemap>' "$SITEMAP_FILE" | wc -l | tr -d ' ')
      fi

      # Count <loc> tags (URLs, or child sitemaps in an index).
      # Pipeline ends in `wc`, so a zero-match grep does not trip `set -e`.
      URL_COUNT=$(grep -oi '<loc>' "$SITEMAP_FILE" | wc -l | tr -d ' ')

      # Check if target URL appears anywhere in the sitemap.
      # Match both with and without trailing slash; the "<" suffix anchors
      # the match at the end of a <loc> value.
      URL_NO_TRAILING=$(printf '%s' "$URL" | sed -E 's#/$##')
      if grep -qF "$URL_NO_TRAILING<" "$SITEMAP_FILE" || grep -qF "${URL_NO_TRAILING}/<" "$SITEMAP_FILE"; then
        CONTAINS_TARGET=true
      fi

      # Has lastmod dates?
      if grep -qi '<lastmod>' "$SITEMAP_FILE"; then
        HAS_LASTMOD=true
      fi
      ;;
  esac
fi

jq -n \
  --arg url "$URL" \
  --arg sitemapUrl "$SITEMAP_URL" \
  --argjson exists "$EXISTS" \
  --argjson isIndex "$IS_INDEX" \
  --argjson urlCount "$URL_COUNT" \
  --argjson childSitemapCount "$CHILD_SITEMAP_COUNT" \
  --argjson containsTarget "$CONTAINS_TARGET" \
  --argjson hasLastmod "$HAS_LASTMOD" \
  '{
    url: $url,
    sitemapUrl: $sitemapUrl,
    exists: $exists,
    isIndex: $isIndex,
    urlCount: $urlCount,
    childSitemapCount: $childSitemapCount,
    containsTarget: $containsTarget,
    hasLastmod: $hasLastmod
  }'
|
|
@@ -0,0 +1,424 @@
|
|
|
1
|
+
#!/usr/bin/env bash
set -eu

# compute-score.sh — Aggregate check outputs into per-bot + per-category scores
# Usage: compute-score.sh <results-dir>
# Output: JSON to stdout
#
# Expected filenames in <results-dir>:
#   fetch-<bot_id>.json   — fetch-as-bot.sh output
#   meta-<bot_id>.json    — extract-meta.sh output
#   jsonld-<bot_id>.json  — extract-jsonld.sh output
#   links-<bot_id>.json   — extract-links.sh output
#   robots-<bot_id>.json  — check-robots.sh output
#   llmstxt.json          — check-llmstxt.sh output (bot-independent)
#   sitemap.json          — check-sitemap.sh output (bot-independent)
#   diff-render.json      — diff-render.sh output (optional, Googlebot only)

RESULTS_DIR="${1:?Usage: compute-score.sh <results-dir>}"
printf '[compute-score] aggregating %s\n' "$RESULTS_DIR" >&2

# Fail fast before touching any per-bot files.
if [ ! -d "$RESULTS_DIR" ]; then
  echo "Error: results dir not found: $RESULTS_DIR" >&2
  exit 1
fi

# Category weights (as percentages of per-bot composite).
# These five sum to 100; the composite below divides by their sum anyway,
# so changing one here rebalances the others proportionally.
W_ACCESSIBILITY=25
W_CONTENT=30
W_STRUCTURED=20
W_TECHNICAL=15
W_AI=10
|
|
32
|
+
|
|
33
|
+
# Overall composite weights (per bot).
# Default: Googlebot 40, GPTBot 20, ClaudeBot 20, PerplexityBot 20.
# Any bot not listed here contributes nothing to the overall composite
# (weight 0), though it still appears in the per-bot breakdown.
overall_weight() {
  local bot="$1"
  local weight=0
  case "$bot" in
    googlebot)
      weight=40
      ;;
    gptbot|claudebot|perplexitybot)
      weight=20
      ;;
  esac
  echo "$weight"
}
|
|
44
|
+
|
|
45
|
+
# Map an integer score (0-100) to a US-style letter grade.
# Thresholds are inclusive lower bounds, checked highest-first;
# anything below 60 is an F.
grade_for() {
  local score=$1
  local pair cutoff letter
  for pair in 93:A 90:A- 87:B+ 83:B 80:B- 77:C+ 73:C 70:C- 67:D+ 63:D 60:D-; do
    cutoff=${pair%%:*}
    letter=${pair#*:}
    if [ "$score" -ge "$cutoff" ]; then
      echo "$letter"
      return 0
    fi
  done
  echo "F"
}
|
|
62
|
+
|
|
63
|
+
# Read a jq value from a file with a default fallback.
#   $1 — JSON file path
#   $2 — jq filter (e.g. '.bot.name')
#   $3 — default string, optional (defaults to "null")
# Falls back to the default when the file is missing, jq fails, or the
# filter yields null.
# NOTE: uses an explicit null test instead of jq's `//` operator — `//`
# treats boolean false as falsy and would silently swap a real false for
# the default. That is the same pitfall this script already works around
# inline for `.skipped` and `.bot.rendersJavaScript`.
jget() {
  local file="$1"
  local query="$2"
  local default="${3:-null}"
  if [ -f "$file" ]; then
    jq -r --arg d "$default" "($query) as \$v | if \$v == null then \$d else \$v end" "$file" 2>/dev/null || echo "$default"
  else
    echo "$default"
  fi
}

# Like jget, but guarantees a numeric result: anything that is not an
# integer or decimal (including "null") is replaced by 0.
jget_num() {
  local v
  v=$(jget "$1" "$2" "0")
  # Replace "null" or non-numeric with 0
  if ! printf '%s' "$v" | grep -qE '^-?[0-9]+(\.[0-9]+)?$'; then
    echo "0"
  else
    echo "$v"
  fi
}

# Like jget, but guarantees "true"/"false": anything other than the exact
# string "true" (missing key, null, garbage) collapses to "false".
jget_bool() {
  local v
  v=$(jget "$1" "$2" "false")
  if [ "$v" = "true" ]; then echo "true"; else echo "false"; fi
}
|
|
91
|
+
|
|
92
|
+
# Discover which bots were checked by scanning for fetch-<bot_id>.json files.
BOTS=""
for fetch_file in "$RESULTS_DIR"/fetch-*.json; do
  [ -f "$fetch_file" ] || continue
  bot_name=${fetch_file##*/}      # strip directory
  bot_name=${bot_name#fetch-}     # strip "fetch-" prefix
  BOTS="$BOTS ${bot_name%.json}"  # strip ".json" suffix and collect
done

if [ -z "$BOTS" ]; then
  echo "Error: no fetch-*.json files found in $RESULTS_DIR" >&2
  exit 1
fi

# Bot-independent inputs.
LLMSTXT_FILE="$RESULTS_DIR/llmstxt.json"
SITEMAP_FILE="$RESULTS_DIR/sitemap.json"
DIFF_RENDER_FILE="$RESULTS_DIR/diff-render.json"

# Load Playwright render-delta data once (used to differentiate JS-rendering
# bots from non-rendering ones). If the comparison was skipped or missing,
# all bots score against server HTML only.
DIFF_AVAILABLE=false
DIFF_RENDERED_WORDS=0
DIFF_DELTA_PCT=0
if [ -f "$DIFF_RENDER_FILE" ]; then
  # Explicit null check — `.skipped // true` would treat real false as null.
  DIFF_SKIPPED=$(jq -r '.skipped | if . == null then "true" else tostring end' "$DIFF_RENDER_FILE" 2>/dev/null || echo "true")
  case "$DIFF_SKIPPED" in
    false)
      DIFF_AVAILABLE=true
      DIFF_RENDERED_WORDS=$(jq -r '.renderedWordCount // 0' "$DIFF_RENDER_FILE")
      DIFF_DELTA_PCT=$(jq -r '.deltaPct // 0' "$DIFF_RENDER_FILE")
      ;;
  esac
fi

# Per-bot results accumulate into this JSON object, keyed by bot id.
BOTS_JSON="{}"

# Accumulators for per-category averages (across bots).
CAT_ACCESSIBILITY_SUM=0
CAT_CONTENT_SUM=0
CAT_STRUCTURED_SUM=0
CAT_TECHNICAL_SUM=0
CAT_AI_SUM=0
CAT_N=0

# Accumulators for the overall weighted composite.
OVERALL_WEIGHTED_SUM=0
OVERALL_WEIGHT_TOTAL=0
|
|
137
|
+
|
|
138
|
+
# Score each discovered bot independently across the five categories, emit a
# per-bot JSON object, and fold its scores into the category/overall
# accumulators initialized above.
for bot_id in $BOTS; do
  FETCH="$RESULTS_DIR/fetch-$bot_id.json"
  META="$RESULTS_DIR/meta-$bot_id.json"
  JSONLD="$RESULTS_DIR/jsonld-$bot_id.json"
  LINKS="$RESULTS_DIR/links-$bot_id.json"
  ROBOTS="$RESULTS_DIR/robots-$bot_id.json"

  BOT_NAME=$(jget "$FETCH" '.bot.name' "$bot_id")
  STATUS=$(jget_num "$FETCH" '.status')
  TOTAL_TIME=$(jget_num "$FETCH" '.timing.total')
  SERVER_WORD_COUNT=$(jget_num "$FETCH" '.wordCount')
  # Read with explicit null fallback — jq's `//` is unsafe here because it
  # treats boolean false as falsy, which is exactly the value we need to see.
  RENDERS_JS=$(jq -r '.bot.rendersJavaScript | if . == null then "unknown" else tostring end' "$FETCH" 2>/dev/null || echo "unknown")

  ROBOTS_ALLOWED=$(jget_bool "$ROBOTS" '.allowed')

  # Effective word count depends on JS rendering capability:
  #   - true (e.g. Googlebot) + diff-render data → rendered DOM word count
  #   - false (AI training/search bots, observed) → server HTML only, with
  #     penalty proportional to the rendering delta
  #   - unknown → conservative: server HTML (same as false but no penalty)
  EFFECTIVE_WORD_COUNT=$SERVER_WORD_COUNT
  HYDRATION_PENALTY=0
  MISSED_WORDS=0
  if [ "$DIFF_AVAILABLE" = "true" ]; then
    if [ "$RENDERS_JS" = "true" ]; then
      EFFECTIVE_WORD_COUNT=$DIFF_RENDERED_WORDS
    elif [ "$RENDERS_JS" = "false" ]; then
      # Absolute-value delta: if rendered DOM has materially more than server,
      # AI bots are missing that content. (+0.5 rounds to nearest int.)
      ABS_DELTA=$(awk -v d="$DIFF_DELTA_PCT" 'BEGIN { printf "%d", (d < 0 ? -d : d) + 0.5 }')
      if [ "$ABS_DELTA" -gt 5 ]; then
        # Scale penalty: 5% delta = 0, 10% = 5, 20%+ = 15 (cap)
        HYDRATION_PENALTY=$(awk -v d="$ABS_DELTA" 'BEGIN {
          p = (d - 5)
          if (p > 15) p = 15
          printf "%d", p
        }')
      fi
      MISSED_WORDS=$((DIFF_RENDERED_WORDS - SERVER_WORD_COUNT))
      [ "$MISSED_WORDS" -lt 0 ] && MISSED_WORDS=0
    fi
  fi

  # --- Category 1: Accessibility (0-100) ---
  ACC=0
  # robots.txt allows: 40
  [ "$ROBOTS_ALLOWED" = "true" ] && ACC=$((ACC + 40))
  # HTTP 200: 40
  [ "$STATUS" = "200" ] && ACC=$((ACC + 40))
  # Response time: <2s = 20, <5s = 10, else 0
  TIME_SCORE=$(awk -v t="$TOTAL_TIME" 'BEGIN { if (t < 2) print 20; else if (t < 5) print 10; else print 0 }')
  ACC=$((ACC + TIME_SCORE))

  # --- Category 2: Content Visibility (0-100) ---
  # Word-count tiers: 300+ = 30, 150+ = 20, 50+ = 10, below = 0.
  CONTENT=0
  if [ "$EFFECTIVE_WORD_COUNT" -ge 300 ]; then CONTENT=$((CONTENT + 30))
  elif [ "$EFFECTIVE_WORD_COUNT" -ge 150 ]; then CONTENT=$((CONTENT + 20))
  elif [ "$EFFECTIVE_WORD_COUNT" -ge 50 ]; then CONTENT=$((CONTENT + 10))
  fi

  H1_COUNT=$(jget_num "$META" '.headings.h1.count')
  H2_COUNT=$(jget_num "$META" '.headings.h2.count')
  [ "$H1_COUNT" -ge 1 ] && CONTENT=$((CONTENT + 20))
  [ "$H2_COUNT" -ge 1 ] && CONTENT=$((CONTENT + 15))

  INTERNAL_LINKS=$(jget_num "$LINKS" '.counts.internal')
  if [ "$INTERNAL_LINKS" -ge 5 ]; then CONTENT=$((CONTENT + 20))
  elif [ "$INTERNAL_LINKS" -ge 1 ]; then CONTENT=$((CONTENT + 10))
  fi

  # Image alt coverage: full 15 points if there are no images at all,
  # otherwise proportional to the fraction of images with alt text.
  IMG_TOTAL=$(jget_num "$META" '.images.total')
  IMG_WITH_ALT=$(jget_num "$META" '.images.withAlt')
  if [ "$IMG_TOTAL" -eq 0 ]; then
    CONTENT=$((CONTENT + 15))
  else
    ALT_SCORE=$(awk -v a="$IMG_WITH_ALT" -v t="$IMG_TOTAL" 'BEGIN { printf "%d", (a / t) * 15 }')
    CONTENT=$((CONTENT + ALT_SCORE))
  fi

  # Apply hydration penalty for non-rendering bots that are missing content
  CONTENT=$((CONTENT - HYDRATION_PENALTY))
  [ $CONTENT -lt 0 ] && CONTENT=0

  # --- Category 3: Structured Data (0-100) ---
  STRUCTURED=0
  JSONLD_COUNT=$(jget_num "$JSONLD" '.blockCount')
  JSONLD_VALID=$(jget_num "$JSONLD" '.validCount')
  JSONLD_INVALID=$(jget_num "$JSONLD" '.invalidCount')
  HAS_ORG=$(jget_bool "$JSONLD" '.flags.hasOrganization')
  HAS_WEBSITE=$(jget_bool "$JSONLD" '.flags.hasWebSite')
  HAS_BREADCRUMB=$(jget_bool "$JSONLD" '.flags.hasBreadcrumbList')
  HAS_ARTICLE=$(jget_bool "$JSONLD" '.flags.hasArticle')
  HAS_PRODUCT=$(jget_bool "$JSONLD" '.flags.hasProduct')
  HAS_FAQ=$(jget_bool "$JSONLD" '.flags.hasFAQPage')

  # 30 for having any JSON-LD, +20 when none of it is invalid, +20 for a
  # site-level type, +15 for breadcrumbs, +15 for a page-content type.
  [ "$JSONLD_COUNT" -ge 1 ] && STRUCTURED=$((STRUCTURED + 30))
  if [ "$JSONLD_COUNT" -ge 1 ] && [ "$JSONLD_INVALID" -eq 0 ]; then
    STRUCTURED=$((STRUCTURED + 20))
  fi
  if [ "$HAS_ORG" = "true" ] || [ "$HAS_WEBSITE" = "true" ]; then
    STRUCTURED=$((STRUCTURED + 20))
  fi
  [ "$HAS_BREADCRUMB" = "true" ] && STRUCTURED=$((STRUCTURED + 15))
  if [ "$HAS_ARTICLE" = "true" ] || [ "$HAS_PRODUCT" = "true" ] || [ "$HAS_FAQ" = "true" ]; then
    STRUCTURED=$((STRUCTURED + 15))
  fi

  # --- Category 4: Technical Signals (0-100) ---
  # jget returns the literal string "null" for a missing key, so each check
  # tests both non-empty and != "null".
  TECHNICAL=0
  TITLE=$(jget "$META" '.title' "")
  DESCRIPTION=$(jget "$META" '.description' "")
  CANONICAL=$(jget "$META" '.canonical' "")
  OG_TITLE=$(jget "$META" '.og.title' "")
  OG_DESC=$(jget "$META" '.og.description' "")

  [ -n "$TITLE" ] && [ "$TITLE" != "null" ] && TECHNICAL=$((TECHNICAL + 25))
  [ -n "$DESCRIPTION" ] && [ "$DESCRIPTION" != "null" ] && TECHNICAL=$((TECHNICAL + 25))
  [ -n "$CANONICAL" ] && [ "$CANONICAL" != "null" ] && TECHNICAL=$((TECHNICAL + 20))
  if [ -n "$OG_TITLE" ] && [ "$OG_TITLE" != "null" ]; then TECHNICAL=$((TECHNICAL + 8)); fi
  if [ -n "$OG_DESC" ] && [ "$OG_DESC" != "null" ]; then TECHNICAL=$((TECHNICAL + 7)); fi

  # Sitemap: 15 when the target URL is listed, 10 for just having a sitemap.
  SITEMAP_EXISTS=$(jget_bool "$SITEMAP_FILE" '.exists')
  SITEMAP_CONTAINS=$(jget_bool "$SITEMAP_FILE" '.containsTarget')
  if [ "$SITEMAP_EXISTS" = "true" ] && [ "$SITEMAP_CONTAINS" = "true" ]; then
    TECHNICAL=$((TECHNICAL + 15))
  elif [ "$SITEMAP_EXISTS" = "true" ]; then
    TECHNICAL=$((TECHNICAL + 10))
  fi

  # --- Category 5: AI Readiness (0-100) ---
  AI=0
  LLMS_EXISTS=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.exists')
  LLMS_HAS_TITLE=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.hasTitle')
  LLMS_HAS_DESC=$(jget_bool "$LLMSTXT_FILE" '.llmsTxt.hasDescription')
  LLMS_URLS=$(jget_num "$LLMSTXT_FILE" '.llmsTxt.urlCount')

  if [ "$LLMS_EXISTS" = "true" ]; then
    AI=$((AI + 40))
    [ "$LLMS_HAS_TITLE" = "true" ] && AI=$((AI + 7))
    [ "$LLMS_HAS_DESC" = "true" ] && AI=$((AI + 7))
    [ "$LLMS_URLS" -ge 1 ] && AI=$((AI + 6))
  fi
  # Content citable (>= 200 words, effective for this bot)
  [ "$EFFECTIVE_WORD_COUNT" -ge 200 ] && AI=$((AI + 20))
  # Semantic clarity: has H1 + description
  if [ "$H1_COUNT" -ge 1 ] && [ -n "$DESCRIPTION" ] && [ "$DESCRIPTION" != "null" ]; then
    AI=$((AI + 20))
  fi

  # Cap categories at 100
  [ $ACC -gt 100 ] && ACC=100
  [ $CONTENT -gt 100 ] && CONTENT=100
  [ $STRUCTURED -gt 100 ] && STRUCTURED=100
  [ $TECHNICAL -gt 100 ] && TECHNICAL=100
  [ $AI -gt 100 ] && AI=100

  # Per-bot composite score (weighted average of 5 categories, rounded to
  # nearest integer via the +0.5 / %d truncation idiom).
  BOT_SCORE=$(awk -v a=$ACC -v c=$CONTENT -v s=$STRUCTURED -v t=$TECHNICAL -v ai=$AI \
    -v wa=$W_ACCESSIBILITY -v wc=$W_CONTENT -v ws=$W_STRUCTURED -v wt=$W_TECHNICAL -v wai=$W_AI \
    'BEGIN { printf "%d", (a*wa + c*wc + s*ws + t*wt + ai*wai) / (wa+wc+ws+wt+wai) + 0.5 }')

  BOT_GRADE=$(grade_for "$BOT_SCORE")
  ACC_GRADE=$(grade_for "$ACC")
  CONTENT_GRADE=$(grade_for "$CONTENT")
  STRUCTURED_GRADE=$(grade_for "$STRUCTURED")
  TECHNICAL_GRADE=$(grade_for "$TECHNICAL")
  AI_GRADE=$(grade_for "$AI")

  # Build this bot's JSON report. rendersJavaScript is re-typed back to a
  # real boolean where known; the string "unknown" passes through as-is.
  BOT_OBJ=$(jq -n \
    --arg id "$bot_id" \
    --arg name "$BOT_NAME" \
    --arg rendersJs "$RENDERS_JS" \
    --argjson score "$BOT_SCORE" \
    --arg grade "$BOT_GRADE" \
    --argjson acc "$ACC" \
    --arg accGrade "$ACC_GRADE" \
    --argjson content "$CONTENT" \
    --arg contentGrade "$CONTENT_GRADE" \
    --argjson structured "$STRUCTURED" \
    --arg structuredGrade "$STRUCTURED_GRADE" \
    --argjson technical "$TECHNICAL" \
    --arg technicalGrade "$TECHNICAL_GRADE" \
    --argjson ai "$AI" \
    --arg aiGrade "$AI_GRADE" \
    --argjson serverWords "$SERVER_WORD_COUNT" \
    --argjson effectiveWords "$EFFECTIVE_WORD_COUNT" \
    --argjson missedWords "$MISSED_WORDS" \
    --argjson hydrationPenalty "$HYDRATION_PENALTY" \
    '{
      id: $id,
      name: $name,
      rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end),
      score: $score,
      grade: $grade,
      visibility: {
        serverWords: $serverWords,
        effectiveWords: $effectiveWords,
        missedWordsVsRendered: $missedWords,
        hydrationPenaltyPts: $hydrationPenalty
      },
      categories: {
        accessibility: { score: $acc, grade: $accGrade },
        contentVisibility: { score: $content, grade: $contentGrade },
        structuredData: { score: $structured, grade: $structuredGrade },
        technicalSignals: { score: $technical, grade: $technicalGrade },
        aiReadiness: { score: $ai, grade: $aiGrade }
      }
    }')

  # Merge into the accumulated bots object, keyed by bot id.
  BOTS_JSON=$(printf '%s' "$BOTS_JSON" | jq --argjson bot "$BOT_OBJ" --arg id "$bot_id" '.[$id] = $bot')

  # Accumulate category averages
  CAT_ACCESSIBILITY_SUM=$((CAT_ACCESSIBILITY_SUM + ACC))
  CAT_CONTENT_SUM=$((CAT_CONTENT_SUM + CONTENT))
  CAT_STRUCTURED_SUM=$((CAT_STRUCTURED_SUM + STRUCTURED))
  CAT_TECHNICAL_SUM=$((CAT_TECHNICAL_SUM + TECHNICAL))
  CAT_AI_SUM=$((CAT_AI_SUM + AI))
  CAT_N=$((CAT_N + 1))

  # Accumulate weighted overall
  W=$(overall_weight "$bot_id")
  if [ "$W" -gt 0 ]; then
    OVERALL_WEIGHTED_SUM=$((OVERALL_WEIGHTED_SUM + BOT_SCORE * W))
    OVERALL_WEIGHT_TOTAL=$((OVERALL_WEIGHT_TOTAL + W))
  fi
done
|
|
366
|
+
|
|
367
|
+
# Per-category averages (across all bots). CAT_N >= 1 is guaranteed here:
# the script exits earlier when no fetch-*.json files are found.
CAT_ACC_AVG=$((CAT_ACCESSIBILITY_SUM / CAT_N))
CAT_CONTENT_AVG=$((CAT_CONTENT_SUM / CAT_N))
CAT_STRUCTURED_AVG=$((CAT_STRUCTURED_SUM / CAT_N))
CAT_TECHNICAL_AVG=$((CAT_TECHNICAL_SUM / CAT_N))
CAT_AI_AVG=$((CAT_AI_SUM / CAT_N))

# Overall composite
if [ "$OVERALL_WEIGHT_TOTAL" -gt 0 ]; then
  OVERALL_SCORE=$((OVERALL_WEIGHTED_SUM / OVERALL_WEIGHT_TOTAL))
else
  # Fall back to simple average if none of the 4 standard bots are present
  OVERALL_SCORE=$(((CAT_ACC_AVG + CAT_CONTENT_AVG + CAT_STRUCTURED_AVG + CAT_TECHNICAL_AVG + CAT_AI_AVG) / 5))
fi

OVERALL_GRADE=$(grade_for "$OVERALL_SCORE")
CAT_ACC_GRADE=$(grade_for "$CAT_ACC_AVG")
CAT_CONTENT_GRADE=$(grade_for "$CAT_CONTENT_AVG")
CAT_STRUCTURED_GRADE=$(grade_for "$CAT_STRUCTURED_AVG")
CAT_TECHNICAL_GRADE=$(grade_for "$CAT_TECHNICAL_AVG")
CAT_AI_GRADE=$(grade_for "$CAT_AI_AVG")

# Get the URL from the first fetch file. Expand the glob directly instead of
# parsing `ls` output (fragile with unusual filenames); glob expansion is
# sorted, so this picks the same file `ls | head -1` would have.
FIRST_FETCH=
for f in "$RESULTS_DIR"/fetch-*.json; do
  FIRST_FETCH=$f
  break
done
TARGET_URL=$(jget "$FIRST_FETCH" '.url' "")
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

# Final aggregate report on stdout.
jq -n \
  --arg url "$TARGET_URL" \
  --arg timestamp "$TIMESTAMP" \
  --arg version "0.1.0" \
  --argjson overallScore "$OVERALL_SCORE" \
  --arg overallGrade "$OVERALL_GRADE" \
  --argjson bots "$BOTS_JSON" \
  --argjson catAcc "$CAT_ACC_AVG" \
  --arg catAccGrade "$CAT_ACC_GRADE" \
  --argjson catContent "$CAT_CONTENT_AVG" \
  --arg catContentGrade "$CAT_CONTENT_GRADE" \
  --argjson catStructured "$CAT_STRUCTURED_AVG" \
  --arg catStructuredGrade "$CAT_STRUCTURED_GRADE" \
  --argjson catTechnical "$CAT_TECHNICAL_AVG" \
  --arg catTechnicalGrade "$CAT_TECHNICAL_GRADE" \
  --argjson catAi "$CAT_AI_AVG" \
  --arg catAiGrade "$CAT_AI_GRADE" \
  '{
    url: $url,
    timestamp: $timestamp,
    version: $version,
    overall: { score: $overallScore, grade: $overallGrade },
    bots: $bots,
    categories: {
      accessibility: { score: $catAcc, grade: $catAccGrade },
      contentVisibility: { score: $catContent, grade: $catContentGrade },
      structuredData: { score: $catStructured, grade: $catStructuredGrade },
      technicalSignals: { score: $catTechnical, grade: $catTechnicalGrade },
      aiReadiness: { score: $catAi, grade: $catAiGrade }
    }
  }'
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
#!/usr/bin/env bash
set -eu

# diff-render.sh — Compare server HTML word count vs JS-rendered word count
# Usage: diff-render.sh <url>
# Requires Playwright. Gracefully outputs { skipped: true } if unavailable.
#
# Every failure path goes through emit_skipped, which prints a well-formed
# JSON report and exits 0 — so callers can always parse stdout and check
# the `skipped` field rather than the exit status.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=_lib.sh
. "$SCRIPT_DIR/_lib.sh"

URL="${1:?Usage: diff-render.sh <url>}"
printf '[diff-render] comparing server HTML vs Playwright render for %s\n' "$URL" >&2

# Print a skipped-report JSON (same schema as the success output, with null
# counts) and terminate the script successfully.
emit_skipped() {
  local reason="$1"
  jq -n \
    --arg url "$URL" \
    --arg reason "$reason" \
    '{
      url: $url,
      skipped: true,
      reason: $reason,
      serverWordCount: null,
      renderedWordCount: null,
      deltaPct: null,
      significantDelta: null
    }'
  exit 0
}

# Check for Node.js
if ! command -v node >/dev/null 2>&1; then
  emit_skipped "node not installed"
fi

# Check for Playwright — try to require it from the current dir or globally
PLAYWRIGHT_CHECK=$(node -e "
try {
  require('playwright');
  console.log('ok');
} catch (e) {
  try {
    require('playwright-core');
    console.log('ok');
  } catch (e2) {
    console.log('missing');
  }
}" 2>/dev/null || echo "missing")

if [ "$PLAYWRIGHT_CHECK" != "ok" ]; then
  emit_skipped "playwright not installed (run: npm install playwright && npx playwright install chromium)"
fi

# Fetch server HTML and count words
TMPDIR="${TMPDIR:-/tmp}"
SERVER_HTML=$(mktemp "$TMPDIR/crawlsim-server.XXXXXX")
RENDERED_HTML=$(mktemp "$TMPDIR/crawlsim-rendered.XXXXXX")
trap 'rm -f "$SERVER_HTML" "$RENDERED_HTML"' EXIT

# Fetch server HTML with Googlebot UA
UA="Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
curl -sS -L -A "$UA" -o "$SERVER_HTML" --max-time 30 "$URL" 2>/dev/null || {
  emit_skipped "failed to fetch server HTML"
}

SERVER_WORDS=$(count_words "$SERVER_HTML")
[ -z "$SERVER_WORDS" ] && SERVER_WORDS=0

# Use Playwright to render and capture the final DOM.
# With `node -e`, process.argv[1] and [2] are the first two extra args,
# i.e. the URL and the output file path passed after the script string.
node -e "
(async () => {
  const { chromium } = require('playwright');
  const browser = await chromium.launch({ headless: true });
  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
    });
    const page = await context.newPage();
    await page.goto(process.argv[1], { waitUntil: 'networkidle', timeout: 30000 });
    const html = await page.content();
    const fs = require('fs');
    fs.writeFileSync(process.argv[2], html);
  } finally {
    await browser.close();
  }
})().catch(err => {
  console.error('RENDER_ERROR:', err.message);
  process.exit(1);
});
" "$URL" "$RENDERED_HTML" 2>/dev/null || {
  emit_skipped "playwright render failed"
}

RENDERED_WORDS=$(count_words "$RENDERED_HTML")
[ -z "$RENDERED_WORDS" ] && RENDERED_WORDS=0

# Compute delta percentage (rendered vs server); a delta whose absolute
# value exceeds 20% is flagged as significant.
DELTA_PCT=0
SIGNIFICANT=false
if [ "$SERVER_WORDS" -gt 0 ]; then
  DELTA_PCT=$(awk -v s="$SERVER_WORDS" -v r="$RENDERED_WORDS" \
    'BEGIN { printf "%.1f", ((r - s) / s) * 100 }')
  ABS_DELTA=$(awk -v d="$DELTA_PCT" 'BEGIN { printf "%d", (d < 0 ? -d : d) }')
  if [ "$ABS_DELTA" -gt 20 ]; then
    SIGNIFICANT=true
  fi
elif [ "$RENDERED_WORDS" -gt 0 ]; then
  # Server had nothing, rendered has content — significant
  DELTA_PCT=100
  SIGNIFICANT=true
fi

jq -n \
  --arg url "$URL" \
  --argjson serverWords "$SERVER_WORDS" \
  --argjson renderedWords "$RENDERED_WORDS" \
  --argjson deltaPct "$DELTA_PCT" \
  --argjson significant "$SIGNIFICANT" \
  '{
    url: $url,
    skipped: false,
    serverWordCount: $serverWords,
    renderedWordCount: $renderedWords,
    deltaPct: $deltaPct,
    significantDelta: $significant,
    interpretation: (
      if $significant and $deltaPct > 0 then
        "JS rendering reveals significantly more content than server HTML — non-rendering bots (GPTBot/ClaudeBot/Perplexity) will see less."
      elif $significant and $deltaPct < 0 then
        "Server HTML has more content than rendered DOM — unusual, possibly JS removing content."
      else
        "Server HTML and rendered DOM word counts are close — no significant hydration delta."
      end
    )
  }'
|