@braedenbuilds/crawl-sim 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +261 -0
- package/SKILL.md +196 -0
- package/bin/install.js +159 -0
- package/package.json +46 -0
- package/profiles/chatgpt-user.json +28 -0
- package/profiles/claude-searchbot.json +28 -0
- package/profiles/claude-user.json +28 -0
- package/profiles/claudebot.json +28 -0
- package/profiles/googlebot.json +28 -0
- package/profiles/gptbot.json +28 -0
- package/profiles/oai-searchbot.json +28 -0
- package/profiles/perplexity-user.json +28 -0
- package/profiles/perplexitybot.json +28 -0
- package/scripts/_lib.sh +51 -0
- package/scripts/check-llmstxt.sh +116 -0
- package/scripts/check-robots.sh +196 -0
- package/scripts/check-sitemap.sh +79 -0
- package/scripts/compute-score.sh +424 -0
- package/scripts/diff-render.sh +136 -0
- package/scripts/extract-jsonld.sh +133 -0
- package/scripts/extract-links.sh +103 -0
- package/scripts/extract-meta.sh +117 -0
- package/scripts/fetch-as-bot.sh +87 -0
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
#!/usr/bin/env bash
set -eu

# extract-jsonld.sh — Extract JSON-LD structured data from HTML
# Usage: extract-jsonld.sh [file] | extract-jsonld.sh < html
# Output: JSON to stdout with count, types, and flags

# Read HTML from file or stdin
if [ $# -ge 1 ] && [ -f "$1" ]; then
  HTML=$(cat "$1")
  printf '[extract-jsonld] %s\n' "$1" >&2
else
  HTML=$(cat)
  printf '[extract-jsonld] (stdin)\n' >&2
fi

# Extract JSON-LD blocks into a temp file (one block per line, flattened)
# Match <script type="application/ld+json">...</script> across lines
TMPDIR="${TMPDIR:-/tmp}"
BLOCKS_FILE=$(mktemp "$TMPDIR/crawlsim-jsonld.XXXXXX")
TYPES_FILE=$(mktemp "$TMPDIR/crawlsim-types.XXXXXX")
trap 'rm -f "$BLOCKS_FILE" "$TYPES_FILE"' EXIT

# Use awk as a small state machine: in_block toggles between "scanning for an
# opening tag" and "accumulating until </script>". Blocks are flattened to one
# line each (newlines become spaces) so later processing can be line-oriented.
# The ["'\''"] class matches either quote char around the type attribute.
printf '%s' "$HTML" | awk '
BEGIN { in_block = 0; block = "" }
{
  line = $0
  while (length(line) > 0) {
    if (in_block == 0) {
      # Look for opening script tag (case-insensitive via tolower; offsets
      # from match() are valid on the original since tolower preserves length)
      idx = match(tolower(line), /<script[^>]*type=["'\''"]application\/ld\+json["'\''"][^>]*>/)
      if (idx == 0) break
      # Skip past the opening tag
      end_of_open = idx + RLENGTH - 1
      line = substr(line, end_of_open + 1)
      in_block = 1
      block = ""
    } else {
      # Look for closing tag
      idx = match(tolower(line), /<\/script>/)
      if (idx == 0) {
        block = block line " "
        break
      }
      block = block substr(line, 1, idx - 1)
      print block
      line = substr(line, idx + RLENGTH)
      in_block = 0
    }
  }
}
' > "$BLOCKS_FILE"

# Count blocks
BLOCK_COUNT=$(wc -l < "$BLOCKS_FILE" | tr -d ' ')
# Handle empty file (wc returns 0 for empty)
if [ ! -s "$BLOCKS_FILE" ]; then
  BLOCK_COUNT=0
fi

VALID_COUNT=0
INVALID_COUNT=0

if [ "$BLOCK_COUNT" -gt 0 ]; then
  while IFS= read -r block; do
    [ -z "$block" ] && continue
    # Try to parse as JSON. NOTE: jq -e exits nonzero for a bare `null` or
    # `false` document, so those count as invalid — acceptable for JSON-LD,
    # which must be an object or array.
    if printf '%s' "$block" | jq -e . >/dev/null 2>&1; then
      VALID_COUNT=$((VALID_COUNT + 1))
      # Extract @type values (may be single string, array, or nested under @graph)
      printf '%s' "$block" | jq -r '
        def collect_types:
          if type == "object" then
            (if has("@type") then (.["@type"] | if type == "array" then .[] else . end) else empty end),
            (if has("@graph") then (.["@graph"][] | collect_types) else empty end)
          elif type == "array" then .[] | collect_types
          else empty end;
        collect_types
      ' 2>/dev/null >> "$TYPES_FILE" || true
    else
      INVALID_COUNT=$((INVALID_COUNT + 1))
    fi
  done < "$BLOCKS_FILE"
fi

# Deduplicate and sort types
TYPES_JSON="[]"
if [ -s "$TYPES_FILE" ]; then
  TYPES_JSON=$(sort -u "$TYPES_FILE" | jq -R . | jq -s .)
fi

# Boolean flag for a single schema.org type: prints "true"/"false" for direct
# interpolation into jq --argjson below.
has_type() {
  printf '%s' "$TYPES_JSON" | jq -e --arg t "$1" 'any(. == $t)' >/dev/null 2>&1 && echo true || echo false
}

HAS_ORG=$(has_type "Organization")
HAS_BREADCRUMB=$(has_type "BreadcrumbList")
HAS_WEBSITE=$(has_type "WebSite")
HAS_ARTICLE=$(has_type "Article")
HAS_FAQ=$(has_type "FAQPage")
HAS_PRODUCT=$(has_type "Product")
HAS_PROFESSIONAL_SERVICE=$(has_type "ProfessionalService")

# Assemble the final report. --argjson is used for numbers/booleans/arrays so
# they land as native JSON values rather than strings.
jq -n \
  --argjson count "$BLOCK_COUNT" \
  --argjson valid "$VALID_COUNT" \
  --argjson invalid "$INVALID_COUNT" \
  --argjson types "$TYPES_JSON" \
  --argjson hasOrg "$HAS_ORG" \
  --argjson hasBreadcrumb "$HAS_BREADCRUMB" \
  --argjson hasWebsite "$HAS_WEBSITE" \
  --argjson hasArticle "$HAS_ARTICLE" \
  --argjson hasFaq "$HAS_FAQ" \
  --argjson hasProduct "$HAS_PRODUCT" \
  --argjson hasProfService "$HAS_PROFESSIONAL_SERVICE" \
  '{
    blockCount: $count,
    validCount: $valid,
    invalidCount: $invalid,
    types: $types,
    flags: {
      hasOrganization: $hasOrg,
      hasBreadcrumbList: $hasBreadcrumb,
      hasWebSite: $hasWebsite,
      hasArticle: $hasArticle,
      hasFAQPage: $hasFaq,
      hasProduct: $hasProduct,
      hasProfessionalService: $hasProfService
    }
  }'
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
#!/usr/bin/env bash
set -eu

# extract-links.sh — Extract and classify internal/external links from HTML
# Usage: extract-links.sh <base-url> [file] | extract-links.sh <base-url> < html
# Output: JSON with internal/external counts and up to 50 sample URLs each.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=_lib.sh
. "$SCRIPT_DIR/_lib.sh"

BASE_URL="${1:?Usage: extract-links.sh <base-url> [file]}"
shift || true

if [ $# -ge 1 ] && [ -f "$1" ]; then
  HTML=$(cat "$1")
  printf '[extract-links] %s (base: %s)\n' "$1" "$BASE_URL" >&2
else
  HTML=$(cat)
  printf '[extract-links] (stdin) (base: %s)\n' "$BASE_URL" >&2
fi

BASE_HOST=$(host_from_url "$BASE_URL")
BASE_ORIGIN=$(origin_from_url "$BASE_URL")
BASE_DIR=$(dir_from_url "$BASE_URL")

# Flatten so <a ...> tags split across lines still match
HTML_FLAT=$(printf '%s' "$HTML" | tr '\n' ' ')

TMPDIR="${TMPDIR:-/tmp}"
HREFS_FILE=$(mktemp "$TMPDIR/crawlsim-hrefs.XXXXXX")
INTERNAL_FILE=$(mktemp "$TMPDIR/crawlsim-internal.XXXXXX")
EXTERNAL_FILE=$(mktemp "$TMPDIR/crawlsim-external.XXXXXX")
trap 'rm -f "$HREFS_FILE" "$INTERNAL_FILE" "$EXTERNAL_FILE"' EXIT

# Extract hrefs from <a> tags — handle double and single quoting separately.
{
  printf '%s' "$HTML_FLAT" \
    | grep -oiE '<a[[:space:]][^>]*href="[^"]*"' \
    | sed -E 's/.*href="([^"]*)".*/\1/' || true
  printf '%s' "$HTML_FLAT" \
    | grep -oiE "<a[[:space:]][^>]*href='[^']*'" \
    | sed -E "s/.*href='([^']*)'.*/\\1/" || true
} > "$HREFS_FILE"

while IFS= read -r href; do
  [ -z "$href" ] && continue
  # Classify on a lowercase copy so schemes like HTTP:// or MAILTO: are
  # recognized — the previous grep -qE '^https?://' check was case-sensitive,
  # so an uppercase-scheme absolute link was wrongly treated as
  # document-relative. A single `case` also avoids forking grep three times
  # per link.
  href_lc=$(printf '%s' "$href" | tr '[:upper:]' '[:lower:]')
  case "$href_lc" in
    mailto:*|tel:*|javascript:*|"#"*)
      # Non-navigable link kinds — skip
      continue
      ;;
    http://*|https://*)
      # Absolute URL — internal iff the host matches the base host
      HREF_HOST=$(host_from_url "$href")
      if [ "$HREF_HOST" = "$BASE_HOST" ]; then
        echo "$href" >> "$INTERNAL_FILE"
      else
        echo "$href" >> "$EXTERNAL_FILE"
      fi
      ;;
    //*)
      # Protocol-relative — inherit base scheme
      scheme=$(printf '%s' "$BASE_URL" | sed -E 's#^(https?):.*#\1#')
      abs="${scheme}:${href}"
      HREF_HOST=$(host_from_url "$abs")
      if [ "$HREF_HOST" = "$BASE_HOST" ]; then
        echo "$abs" >> "$INTERNAL_FILE"
      else
        echo "$abs" >> "$EXTERNAL_FILE"
      fi
      ;;
    /*)
      # Root-relative — attach to origin
      echo "${BASE_ORIGIN}${href}" >> "$INTERNAL_FILE"
      ;;
    *)
      # Document-relative — attach to base directory
      echo "${BASE_DIR}${href}" >> "$INTERNAL_FILE"
      ;;
  esac
done < "$HREFS_FILE"

INTERNAL_COUNT=0
EXTERNAL_COUNT=0
[ -s "$INTERNAL_FILE" ] && INTERNAL_COUNT=$(wc -l < "$INTERNAL_FILE" | tr -d ' ')
[ -s "$EXTERNAL_FILE" ] && EXTERNAL_COUNT=$(wc -l < "$EXTERNAL_FILE" | tr -d ' ')

# Samples are capped at 50 entries to bound output size
INTERNAL_SAMPLE="[]"
EXTERNAL_SAMPLE="[]"
if [ -s "$INTERNAL_FILE" ]; then
  INTERNAL_SAMPLE=$(head -50 "$INTERNAL_FILE" | jq -R . | jq -s .)
fi
if [ -s "$EXTERNAL_FILE" ]; then
  EXTERNAL_SAMPLE=$(head -50 "$EXTERNAL_FILE" | jq -R . | jq -s .)
fi

jq -n \
  --argjson internalCount "$INTERNAL_COUNT" \
  --argjson externalCount "$EXTERNAL_COUNT" \
  --argjson internalSample "$INTERNAL_SAMPLE" \
  --argjson externalSample "$EXTERNAL_SAMPLE" \
  '{
    counts: {
      internal: $internalCount,
      external: $externalCount,
      total: ($internalCount + $externalCount)
    },
    internal: $internalSample,
    external: $externalSample
  }'
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
#!/usr/bin/env bash
set -eu

# extract-meta.sh — Extract title, meta, OG, headings, images from HTML
# Usage: extract-meta.sh [file] | extract-meta.sh < html
# Output: JSON to stdout with title/description/canonical, OG/Twitter tags,
# heading counts, and image alt-text coverage.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=_lib.sh
. "$SCRIPT_DIR/_lib.sh"

if [ $# -ge 1 ] && [ -f "$1" ]; then
  HTML=$(cat "$1")
  printf '[extract-meta] %s\n' "$1" >&2
else
  HTML=$(cat)
  printf '[extract-meta] (stdin)\n' >&2
fi

# Flatten to one line so tag regexes can't be split by newlines
HTML_FLAT=$(printf '%s' "$HTML" | tr '\n' ' ' | tr -s ' ')

# Match a tag by regex, then pull a named attribute value that respects the
# actual opening quote char. Works for both "…" and '…' quoting.
# $1 = grep regex to find the tag
# $2 = attribute name (e.g. "content", "href")
get_attr() {
  local tag_regex="$1"
  local attr="$2"
  local tag
  tag=$(printf '%s' "$HTML_FLAT" | grep -oiE "$tag_regex" | head -1 || true)
  [ -z "$tag" ] && return 0
  # Try double-quoted first, then single-quoted
  local val
  val=$(printf '%s' "$tag" | sed -n -E "s/.*${attr}=\"([^\"]*)\".*/\\1/p" | head -1)
  if [ -z "$val" ]; then
    val=$(printf '%s' "$tag" | sed -n -E "s/.*${attr}='([^']*)'.*/\\1/p" | head -1)
  fi
  printf '%s' "$val"
}

# Count case-insensitive occurrences of a tag pattern in the flattened HTML.
count_pattern() {
  local n
  n=$(printf '%s' "$HTML_FLAT" | grep -oiE "$1" | wc -l | tr -d ' ' || true)
  printf '%s' "${n:-0}"
}

# Strip the surrounding tags from a full-tag match like <title ...>text</title>.
# Uses parameter expansion instead of `sed 's///I'`: the I (case-insensitive)
# flag is a GNU extension that is a hard error on BSD/macOS sed, which would
# kill the script under `set -eu`. Safe because the grep patterns below match
# content as [^<]*, so the content itself contains no '<'.
inner_text() {
  local t="$1"
  t="${t#*>}"   # drop through the end of the opening tag
  t="${t%<*}"   # drop the closing tag
  printf '%s' "$t"
}

TITLE_TAG=$(printf '%s' "$HTML_FLAT" | grep -oiE '<title[^>]*>[^<]*</title>' | head -1 || true)
TITLE=""
if [ -n "$TITLE_TAG" ]; then
  TITLE=$(inner_text "$TITLE_TAG")
fi

DESCRIPTION=$(get_attr '<meta[^>]*name=["'\''"]description["'\''"][^>]*>' 'content')
OG_TITLE=$(get_attr '<meta[^>]*property=["'\''"]og:title["'\''"][^>]*>' 'content')
OG_DESCRIPTION=$(get_attr '<meta[^>]*property=["'\''"]og:description["'\''"][^>]*>' 'content')
OG_IMAGE=$(get_attr '<meta[^>]*property=["'\''"]og:image["'\''"][^>]*>' 'content')
OG_TYPE=$(get_attr '<meta[^>]*property=["'\''"]og:type["'\''"][^>]*>' 'content')
TWITTER_CARD=$(get_attr '<meta[^>]*name=["'\''"]twitter:card["'\''"][^>]*>' 'content')
VIEWPORT=$(get_attr '<meta[^>]*name=["'\''"]viewport["'\''"][^>]*>' 'content')
CANONICAL=$(get_attr '<link[^>]*rel=["'\''"]canonical["'\''"][^>]*>' 'href')
LANG_VAL=$(get_attr '<html[^>]*>' 'lang')

H1_COUNT=$(count_pattern '<h1[^>]*>')
H2_COUNT=$(count_pattern '<h2[^>]*>')
H3_COUNT=$(count_pattern '<h3[^>]*>')

H1_TAG=$(printf '%s' "$HTML_FLAT" | grep -oiE '<h1[^>]*>[^<]*</h1>' | head -1 || true)
H1_TEXT=""
if [ -n "$H1_TAG" ]; then
  H1_TEXT=$(inner_text "$H1_TAG")
fi

# Alt coverage: an empty alt="" still counts as "with alt" (decorative images)
IMG_TOTAL=$(count_pattern '<img[^>]*>')
IMG_WITH_ALT=$(count_pattern '<img[^>]*alt=("[^"]*"|'\''[^'\'']*'\'')[^>]*>')

# Empty strings become JSON null so consumers can distinguish "absent"
jq -n \
  --arg title "$TITLE" \
  --arg description "$DESCRIPTION" \
  --arg canonical "$CANONICAL" \
  --arg ogTitle "$OG_TITLE" \
  --arg ogDescription "$OG_DESCRIPTION" \
  --arg ogImage "$OG_IMAGE" \
  --arg ogType "$OG_TYPE" \
  --arg twitterCard "$TWITTER_CARD" \
  --arg h1Text "$H1_TEXT" \
  --arg lang "$LANG_VAL" \
  --arg viewport "$VIEWPORT" \
  --argjson h1Count "$H1_COUNT" \
  --argjson h2Count "$H2_COUNT" \
  --argjson h3Count "$H3_COUNT" \
  --argjson imgTotal "$IMG_TOTAL" \
  --argjson imgWithAlt "$IMG_WITH_ALT" \
  '{
    title: (if $title == "" then null else $title end),
    description: (if $description == "" then null else $description end),
    canonical: (if $canonical == "" then null else $canonical end),
    lang: (if $lang == "" then null else $lang end),
    viewport: (if $viewport == "" then null else $viewport end),
    og: {
      title: (if $ogTitle == "" then null else $ogTitle end),
      description: (if $ogDescription == "" then null else $ogDescription end),
      image: (if $ogImage == "" then null else $ogImage end),
      type: (if $ogType == "" then null else $ogType end)
    },
    twitter: {
      card: (if $twitterCard == "" then null else $twitterCard end)
    },
    headings: {
      h1: { count: $h1Count, firstText: (if $h1Text == "" then null else $h1Text end) },
      h2: { count: $h2Count },
      h3: { count: $h3Count }
    },
    images: {
      total: $imgTotal,
      withAlt: $imgWithAlt,
      missingAlt: ($imgTotal - $imgWithAlt)
    }
  }'
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
#!/usr/bin/env bash
set -euo pipefail

# fetch-as-bot.sh — Fetch a URL as a specific bot User-Agent
# Usage: fetch-as-bot.sh <url> <profile.json>
# Output: JSON to stdout with status, timing, response headers, word count,
# and the base64-encoded response body.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=_lib.sh
. "$SCRIPT_DIR/_lib.sh"

URL="${1:?Usage: fetch-as-bot.sh <url> <profile.json>}"
PROFILE="${2:?Usage: fetch-as-bot.sh <url> <profile.json>}"

BOT_ID=$(jq -r '.id' "$PROFILE")
BOT_NAME=$(jq -r '.name' "$PROFILE")
UA=$(jq -r '.userAgent' "$PROFILE")
RENDERS_JS=$(jq -r '.rendersJavaScript' "$PROFILE")

printf '[fetch-as-bot] %s <- %s\n' "$BOT_NAME" "$URL" >&2

TMPDIR="${TMPDIR:-/tmp}"
HEADERS_FILE=$(mktemp "$TMPDIR/crawlsim-headers.XXXXXX")
BODY_FILE=$(mktemp "$TMPDIR/crawlsim-body.XXXXXX")
trap 'rm -f "$HEADERS_FILE" "$BODY_FILE"' EXIT

# Fetch with curl, emitting timing metrics as a JSON object via -w.
# On failure the whole TIMING value is REPLACED with zeros rather than
# appended to (the old `|| echo` form): curl may already have written a
# partial -w block to stdout before exiting nonzero — including
# `"statusCode":000`, which is not valid JSON — and appending a fallback to
# that captured output produced an unparseable two-object string.
if ! TIMING=$(curl -sS -L \
  -H "User-Agent: $UA" \
  -D "$HEADERS_FILE" \
  -o "$BODY_FILE" \
  -w '{"total":%{time_total},"ttfb":%{time_starttransfer},"connect":%{time_connect},"statusCode":%{http_code},"sizeDownload":%{size_download}}' \
  --max-time 30 \
  "$URL" 2>/dev/null); then
  TIMING='{"total":0,"ttfb":0,"connect":0,"statusCode":0,"sizeDownload":0}'
fi

STATUS=$(echo "$TIMING" | jq -r '.statusCode')
TOTAL_TIME=$(echo "$TIMING" | jq -r '.total')
TTFB=$(echo "$TIMING" | jq -r '.ttfb')
SIZE=$(echo "$TIMING" | jq -r '.sizeDownload')

# Parse response headers into a JSON object using jq for safe escaping.
# curl -L writes multiple blocks on redirect; jq keeps the last definition
# of each header since `add` overwrites left-to-right.
# grep is guarded with `|| true`: with pipefail set, an empty headers file
# (e.g. after a failed fetch) would otherwise abort the whole script.
HEADERS_JSON=$({ tr -d '\r' < "$HEADERS_FILE" \
  | grep -E '^[A-Za-z][A-Za-z0-9-]*:[[:space:]]' || true; } \
  | jq -Rs '
    split("\n")
    | map(select(length > 0))
    | map(capture("^(?<k>[^:]+):[[:space:]]*(?<v>.*)$"))
    | map({(.k): .v})
    | add // {}
  ')

WORD_COUNT=$(count_words "$BODY_FILE")
[ -z "$WORD_COUNT" ] && WORD_COUNT=0

# Encode the body for embedding in JSON. Strip the newlines GNU base64
# inserts every 76 columns (BSD base64 does not wrap) so output is a single
# line on every platform; decoders accept either form.
BODY_B64=""
if [ -s "$BODY_FILE" ]; then
  BODY_B64=$(base64 < "$BODY_FILE" | tr -d '\n')
fi

jq -n \
  --arg url "$URL" \
  --arg botId "$BOT_ID" \
  --arg botName "$BOT_NAME" \
  --arg ua "$UA" \
  --arg rendersJs "$RENDERS_JS" \
  --argjson status "$STATUS" \
  --argjson totalTime "$TOTAL_TIME" \
  --argjson ttfb "$TTFB" \
  --argjson size "$SIZE" \
  --argjson wordCount "$WORD_COUNT" \
  --argjson headers "$HEADERS_JSON" \
  --arg bodyBase64 "$BODY_B64" \
  '{
    url: $url,
    bot: {
      id: $botId,
      name: $botName,
      userAgent: $ua,
      rendersJavaScript: (if $rendersJs == "true" then true elif $rendersJs == "false" then false else $rendersJs end)
    },
    status: $status,
    timing: { total: $totalTime, ttfb: $ttfb },
    size: $size,
    wordCount: $wordCount,
    headers: $headers,
    bodyBase64: $bodyBase64
  }'
|