@braedenbuilds/crawl-sim 1.0.5 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +15 -0
- package/.claude-plugin/plugin.json +13 -0
- package/README.md +32 -9
- package/bin/install.js +6 -2
- package/package.json +8 -3
- package/{SKILL.md → skills/crawl-sim/SKILL.md} +23 -2
- package/{scripts → skills/crawl-sim/scripts}/_lib.sh +30 -0
- package/skills/crawl-sim/scripts/compute-score.sh +744 -0
- package/{scripts → skills/crawl-sim/scripts}/extract-jsonld.sh +12 -0
- package/skills/crawl-sim/scripts/fetch-as-bot.sh +151 -0
- package/skills/crawl-sim/scripts/schema-fields.sh +25 -0
- package/scripts/compute-score.sh +0 -424
- package/scripts/fetch-as-bot.sh +0 -87
- /package/{profiles → skills/crawl-sim/profiles}/chatgpt-user.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/claude-searchbot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/claude-user.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/claudebot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/googlebot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/gptbot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/oai-searchbot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/perplexity-user.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/perplexitybot.json +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/check-llmstxt.sh +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/check-robots.sh +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/check-sitemap.sh +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/diff-render.sh +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/extract-links.sh +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/extract-meta.sh +0 -0
package/scripts/fetch-as-bot.sh
DELETED
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
|
|
2
|
-
set -euo pipefail
|
|
3
|
-
|
|
4
|
-
# fetch-as-bot.sh — Fetch a URL as a specific bot User-Agent
|
|
5
|
-
# Usage: fetch-as-bot.sh <url> <profile.json>
|
|
6
|
-
|
|
7
|
-
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
8
|
-
# shellcheck source=_lib.sh
|
|
9
|
-
. "$SCRIPT_DIR/_lib.sh"
|
|
10
|
-
|
|
11
|
-
URL="${1:?Usage: fetch-as-bot.sh <url> <profile.json>}"
|
|
12
|
-
PROFILE="${2:?Usage: fetch-as-bot.sh <url> <profile.json>}"
|
|
13
|
-
|
|
14
|
-
BOT_ID=$(jq -r '.id' "$PROFILE")
|
|
15
|
-
BOT_NAME=$(jq -r '.name' "$PROFILE")
|
|
16
|
-
UA=$(jq -r '.userAgent' "$PROFILE")
|
|
17
|
-
RENDERS_JS=$(jq -r '.rendersJavaScript' "$PROFILE")
|
|
18
|
-
|
|
19
|
-
printf '[fetch-as-bot] %s <- %s\n' "$BOT_NAME" "$URL" >&2
|
|
20
|
-
|
|
21
|
-
TMPDIR="${TMPDIR:-/tmp}"
|
|
22
|
-
HEADERS_FILE=$(mktemp "$TMPDIR/crawlsim-headers.XXXXXX")
|
|
23
|
-
BODY_FILE=$(mktemp "$TMPDIR/crawlsim-body.XXXXXX")
|
|
24
|
-
trap 'rm -f "$HEADERS_FILE" "$BODY_FILE"' EXIT
|
|
25
|
-
|
|
26
|
-
TIMING=$(curl -sS -L \
|
|
27
|
-
-H "User-Agent: $UA" \
|
|
28
|
-
-D "$HEADERS_FILE" \
|
|
29
|
-
-o "$BODY_FILE" \
|
|
30
|
-
-w '{"total":%{time_total},"ttfb":%{time_starttransfer},"connect":%{time_connect},"statusCode":%{http_code},"sizeDownload":%{size_download}}' \
|
|
31
|
-
--max-time 30 \
|
|
32
|
-
"$URL" 2>/dev/null || echo '{"total":0,"ttfb":0,"connect":0,"statusCode":0,"sizeDownload":0}')
|
|
33
|
-
|
|
34
|
-
STATUS=$(echo "$TIMING" | jq -r '.statusCode')
|
|
35
|
-
TOTAL_TIME=$(echo "$TIMING" | jq -r '.total')
|
|
36
|
-
TTFB=$(echo "$TIMING" | jq -r '.ttfb')
|
|
37
|
-
SIZE=$(echo "$TIMING" | jq -r '.sizeDownload')
|
|
38
|
-
|
|
39
|
-
# Parse response headers into a JSON object using jq for safe escaping.
|
|
40
|
-
# curl -L writes multiple blocks on redirect; jq keeps the last definition
|
|
41
|
-
# of each header since `add` overwrites left-to-right.
|
|
42
|
-
HEADERS_JSON=$(tr -d '\r' < "$HEADERS_FILE" \
|
|
43
|
-
| grep -E '^[A-Za-z][A-Za-z0-9-]*:[[:space:]]' \
|
|
44
|
-
| jq -Rs '
|
|
45
|
-
split("\n")
|
|
46
|
-
| map(select(length > 0))
|
|
47
|
-
| map(capture("^(?<k>[^:]+):[[:space:]]*(?<v>.*)$"))
|
|
48
|
-
| map({(.k): .v})
|
|
49
|
-
| add // {}
|
|
50
|
-
')
|
|
51
|
-
|
|
52
|
-
WORD_COUNT=$(count_words "$BODY_FILE")
|
|
53
|
-
[ -z "$WORD_COUNT" ] && WORD_COUNT=0
|
|
54
|
-
|
|
55
|
-
BODY_B64=""
|
|
56
|
-
if [ -s "$BODY_FILE" ]; then
|
|
57
|
-
BODY_B64=$(base64 < "$BODY_FILE")
|
|
58
|
-
fi
|
|
59
|
-
|
|
60
|
-
jq -n \
|
|
61
|
-
--arg url "$URL" \
|
|
62
|
-
--arg botId "$BOT_ID" \
|
|
63
|
-
--arg botName "$BOT_NAME" \
|
|
64
|
-
--arg ua "$UA" \
|
|
65
|
-
--arg rendersJs "$RENDERS_JS" \
|
|
66
|
-
--argjson status "$STATUS" \
|
|
67
|
-
--argjson totalTime "$TOTAL_TIME" \
|
|
68
|
-
--argjson ttfb "$TTFB" \
|
|
69
|
-
--argjson size "$SIZE" \
|
|
70
|
-
--argjson wordCount "$WORD_COUNT" \
|
|
71
|
-
--argjson headers "$HEADERS_JSON" \
|
|
72
|
-
--arg bodyBase64 "$BODY_B64" \
|
|
73
|
-
'{
|
|
74
|
-
url: $url,
|
|
75
|
-
bot: {
|
|
76
|
-
id: $botId,
|
|
77
|
-
name: $botName,
|
|
78
|
-
userAgent: $ua,
|
|
79
|
-
rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
|
|
80
|
-
},
|
|
81
|
-
status: $status,
|
|
82
|
-
timing: { total: $totalTime, ttfb: $ttfb },
|
|
83
|
-
size: $size,
|
|
84
|
-
wordCount: $wordCount,
|
|
85
|
-
headers: $headers,
|
|
86
|
-
bodyBase64: $bodyBase64
|
|
87
|
-
}'
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|