@braedenbuilds/crawl-sim 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,133 @@
1
#!/usr/bin/env bash
set -eu

# extract-jsonld.sh — Extract JSON-LD structured data from HTML
# Usage: extract-jsonld.sh [file] | extract-jsonld.sh < html
# Output: JSON object on stdout with block count, @type values, and flags

# Read HTML from the named file, or from stdin when no file is given.
# A named-but-unreadable file is an error: silently falling through to
# stdin (the old behavior) makes a mistyped path hang the script.
if [ $# -ge 1 ]; then
  [ -f "$1" ] || { printf '[extract-jsonld] no such file: %s\n' "$1" >&2; exit 1; }
  HTML=$(cat "$1")
  printf '[extract-jsonld] %s\n' "$1" >&2
else
  HTML=$(cat)
  printf '[extract-jsonld] (stdin)\n' >&2
fi

# Working files: extracted JSON-LD blocks (one flattened block per line),
# collected @type values, and a reserved validity scratch file.
# All are removed on any exit path via the trap.
TMPDIR="${TMPDIR:-/tmp}"
BLOCKS_FILE=$(mktemp "$TMPDIR/crawlsim-jsonld.XXXXXX")
TYPES_FILE=$(mktemp "$TMPDIR/crawlsim-types.XXXXXX")
VALID_FILE=$(mktemp "$TMPDIR/crawlsim-valid.XXXXXX")
trap 'rm -f "$BLOCKS_FILE" "$TYPES_FILE" "$VALID_FILE"' EXIT

# Pull every <script type="application/ld+json">…</script> payload out
# of the document. The scanner is a tiny state machine that walks each
# input line left to right; payloads spanning several lines are joined
# (with a space at each original break) and emitted as one line.
printf '%s' "$HTML" | awk '
  BEGIN { collecting = 0; buf = "" }
  {
    rest = $0
    while (length(rest) > 0) {
      if (!collecting) {
        # Case-insensitive scan for the opening tag.
        pos = match(tolower(rest), /<script[^>]*type=["'\''"]application\/ld\+json["'\''"][^>]*>/)
        if (pos == 0) break
        # Resume just past the opening tag and start a fresh buffer.
        rest = substr(rest, pos + RLENGTH)
        collecting = 1
        buf = ""
      } else {
        pos = match(tolower(rest), /<\/script>/)
        if (pos == 0) {
          # Payload continues on the next input line.
          buf = buf rest " "
          break
        }
        buf = buf substr(rest, 1, pos - 1)
        print buf
        rest = substr(rest, pos + RLENGTH)
        collecting = 0
      }
    }
  }
' > "$BLOCKS_FILE"

# Count extracted blocks. wc -l already reports 0 for an empty file, so
# no separate empty-file special case is needed.
BLOCK_COUNT=$(wc -l < "$BLOCKS_FILE" | tr -d ' ')

VALID_COUNT=0
INVALID_COUNT=0

if [ "$BLOCK_COUNT" -gt 0 ]; then
  while IFS= read -r block; do
    [ -z "$block" ] && continue
    # Parse check only: plain `jq .` exits non-zero solely on a parse
    # error. (The previous `jq -e .` also exited non-zero for valid
    # top-level `null`/`false` documents and miscounted them as invalid.)
    if printf '%s' "$block" | jq . >/dev/null 2>&1; then
      VALID_COUNT=$((VALID_COUNT + 1))
      # Collect @type values. Handles a bare string, an array of
      # strings, top-level arrays, and items nested under @graph.
      printf '%s' "$block" | jq -r '
        def collect_types:
          if type == "object" then
            (if has("@type") then (.["@type"] | if type == "array" then .[] else . end) else empty end),
            (if has("@graph") then (.["@graph"][] | collect_types) else empty end)
          elif type == "array" then .[] | collect_types
          else empty end;
        collect_types
      ' 2>/dev/null >> "$TYPES_FILE" || true
    else
      INVALID_COUNT=$((INVALID_COUNT + 1))
    fi
  done < "$BLOCKS_FILE"
fi

# Sorted, de-duplicated @type values as a JSON string array.
TYPES_JSON="[]"
if [ -s "$TYPES_FILE" ]; then
  TYPES_JSON=$(sort -u "$TYPES_FILE" | jq -R . | jq -s .)
fi

# has_type NAME — echo "true" when NAME occurs in TYPES_JSON,
# otherwise "false".
has_type() {
  if printf '%s' "$TYPES_JSON" | jq -e --arg t "$1" 'any(. == $t)' >/dev/null 2>&1; then
    echo true
  else
    echo false
  fi
}

# Presence flags for the schema.org types the report cares about.
FLAG_ORG=$(has_type "Organization")
FLAG_BREADCRUMB=$(has_type "BreadcrumbList")
FLAG_WEBSITE=$(has_type "WebSite")
FLAG_ARTICLE=$(has_type "Article")
FLAG_FAQ=$(has_type "FAQPage")
FLAG_PRODUCT=$(has_type "Product")
FLAG_PROF_SERVICE=$(has_type "ProfessionalService")

# Final report on stdout.
jq -n \
  --argjson count "$BLOCK_COUNT" \
  --argjson valid "$VALID_COUNT" \
  --argjson invalid "$INVALID_COUNT" \
  --argjson types "$TYPES_JSON" \
  --argjson hasOrg "$FLAG_ORG" \
  --argjson hasBreadcrumb "$FLAG_BREADCRUMB" \
  --argjson hasWebsite "$FLAG_WEBSITE" \
  --argjson hasArticle "$FLAG_ARTICLE" \
  --argjson hasFaq "$FLAG_FAQ" \
  --argjson hasProduct "$FLAG_PRODUCT" \
  --argjson hasProfService "$FLAG_PROF_SERVICE" \
  '{
    blockCount: $count,
    validCount: $valid,
    invalidCount: $invalid,
    types: $types,
    flags: {
      hasOrganization: $hasOrg,
      hasBreadcrumbList: $hasBreadcrumb,
      hasWebSite: $hasWebsite,
      hasArticle: $hasArticle,
      hasFAQPage: $hasFaq,
      hasProduct: $hasProduct,
      hasProfessionalService: $hasProfService
    }
  }'
@@ -0,0 +1,103 @@
1
#!/usr/bin/env bash
set -eu

# extract-links.sh — Extract and classify internal/external links from HTML
# Usage: extract-links.sh <base-url> [file] | extract-links.sh <base-url> < html

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=_lib.sh
. "$SCRIPT_DIR/_lib.sh"

BASE_URL="${1:?Usage: extract-links.sh <base-url> [file]}"
shift || true

# Read HTML from the named file, or from stdin when no file is given.
# A named-but-missing file is an error: silently falling through to
# stdin (the old behavior) makes a mistyped path hang the script.
if [ $# -ge 1 ]; then
  [ -f "$1" ] || { printf '[extract-links] no such file: %s\n' "$1" >&2; exit 1; }
  HTML=$(cat "$1")
  printf '[extract-links] %s (base: %s)\n' "$1" "$BASE_URL" >&2
else
  HTML=$(cat)
  printf '[extract-links] (stdin) (base: %s)\n' "$BASE_URL" >&2
fi

# Pieces of the base URL used for classification and relative-link
# resolution (helpers come from _lib.sh).
BASE_HOST=$(host_from_url "$BASE_URL")
BASE_ORIGIN=$(origin_from_url "$BASE_URL")
BASE_DIR=$(dir_from_url "$BASE_URL")

# Flatten so anchor tags split across lines still match.
HTML_FLAT=$(printf '%s' "$HTML" | tr '\n' ' ')

TMPDIR="${TMPDIR:-/tmp}"
HREFS_FILE=$(mktemp "$TMPDIR/crawlsim-hrefs.XXXXXX")
INTERNAL_FILE=$(mktemp "$TMPDIR/crawlsim-internal.XXXXXX")
EXTERNAL_FILE=$(mktemp "$TMPDIR/crawlsim-external.XXXXXX")
trap 'rm -f "$HREFS_FILE" "$INTERNAL_FILE" "$EXTERNAL_FILE"' EXIT

# Extract hrefs from <a> tags — double- and single-quoted attributes
# are handled by separate passes.
{
  printf '%s' "$HTML_FLAT" \
    | grep -oiE '<a[[:space:]][^>]*href="[^"]*"' \
    | sed -E 's/.*href="([^"]*)".*/\1/' || true
  printf '%s' "$HTML_FLAT" \
    | grep -oiE "<a[[:space:]][^>]*href='[^']*'" \
    | sed -E "s/.*href='([^']*)'.*/\\1/" || true
} > "$HREFS_FILE"

# Resolve each href against the base URL and classify it as internal
# (same host as the base) or external. Scheme tests use case globs
# instead of the previous per-href `printf | grep` pipelines, removing
# three process spawns per link; the base scheme for protocol-relative
# links is loop-invariant, so it is computed once up front.
BASE_SCHEME=$(printf '%s' "$BASE_URL" | sed -E 's#^(https?):.*#\1#')
while IFS= read -r href; do
  [ -z "$href" ] && continue
  case "$href" in
    mailto:*|tel:*|javascript:*|"#"*)
      # Non-navigational scheme or bare fragment — skip.
      continue
      ;;
    http://*|https://*)
      # Absolute URL — classify by host.
      HREF_HOST=$(host_from_url "$href")
      if [ "$HREF_HOST" = "$BASE_HOST" ]; then
        echo "$href" >> "$INTERNAL_FILE"
      else
        echo "$href" >> "$EXTERNAL_FILE"
      fi
      ;;
    //*)
      # Protocol-relative — inherit the base scheme, then classify.
      abs="${BASE_SCHEME}:${href}"
      HREF_HOST=$(host_from_url "$abs")
      if [ "$HREF_HOST" = "$BASE_HOST" ]; then
        echo "$abs" >> "$INTERNAL_FILE"
      else
        echo "$abs" >> "$EXTERNAL_FILE"
      fi
      ;;
    /*)
      # Root-relative — attach to the origin.
      echo "${BASE_ORIGIN}${href}" >> "$INTERNAL_FILE"
      ;;
    *)
      # Document-relative — attach to the base directory.
      echo "${BASE_DIR}${href}" >> "$INTERNAL_FILE"
      ;;
  esac
done < "$HREFS_FILE"

# Tally both buckets and keep the first 50 of each as JSON samples.
INTERNAL_COUNT=0
EXTERNAL_COUNT=0
if [ -s "$INTERNAL_FILE" ]; then
  INTERNAL_COUNT=$(wc -l < "$INTERNAL_FILE" | tr -d ' ')
fi
if [ -s "$EXTERNAL_FILE" ]; then
  EXTERNAL_COUNT=$(wc -l < "$EXTERNAL_FILE" | tr -d ' ')
fi

INTERNAL_SAMPLE="[]"
EXTERNAL_SAMPLE="[]"
[ -s "$INTERNAL_FILE" ] && INTERNAL_SAMPLE=$(head -50 "$INTERNAL_FILE" | jq -R . | jq -s .)
[ -s "$EXTERNAL_FILE" ] && EXTERNAL_SAMPLE=$(head -50 "$EXTERNAL_FILE" | jq -R . | jq -s .)

# Final report on stdout.
jq -n \
  --argjson internalCount "$INTERNAL_COUNT" \
  --argjson externalCount "$EXTERNAL_COUNT" \
  --argjson internalSample "$INTERNAL_SAMPLE" \
  --argjson externalSample "$EXTERNAL_SAMPLE" \
  '{
    counts: {
      internal: $internalCount,
      external: $externalCount,
      total: ($internalCount + $externalCount)
    },
    internal: $internalSample,
    external: $externalSample
  }'
@@ -0,0 +1,117 @@
1
#!/usr/bin/env bash
set -eu

# extract-meta.sh — Extract title, meta, OG, headings, images from HTML
# Usage: extract-meta.sh [file] | extract-meta.sh < html

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=_lib.sh
. "$SCRIPT_DIR/_lib.sh"

# Read HTML from the named file, or from stdin when no file is given.
# A named-but-missing file is an error: silently falling through to
# stdin (the old behavior) makes a mistyped path hang the script.
if [ $# -ge 1 ]; then
  [ -f "$1" ] || { printf '[extract-meta] no such file: %s\n' "$1" >&2; exit 1; }
  HTML=$(cat "$1")
  printf '[extract-meta] %s\n' "$1" >&2
else
  HTML=$(cat)
  printf '[extract-meta] (stdin)\n' >&2
fi

# Flatten to one line and squeeze space runs so multi-line tags can be
# matched with line-oriented tools.
HTML_FLAT=$(printf '%s' "$HTML" | tr '\n' ' ' | tr -s ' ')

# get_attr TAG_REGEX ATTR — find the first tag in HTML_FLAT matching
# TAG_REGEX (case-insensitive) and print ATTR's value, honoring the
# actual quote style ("…" tried first, then '…'). Prints nothing when
# the tag or the attribute is absent.
get_attr() {
  local pattern="$1"
  local name="$2"
  local match
  match=$(printf '%s' "$HTML_FLAT" | grep -oiE "$pattern" | head -1 || true)
  if [ -z "$match" ]; then
    return 0
  fi
  local value
  value=$(printf '%s' "$match" | sed -n -E "s/.*${name}=\"([^\"]*)\".*/\\1/p" | head -1)
  if [ -z "$value" ]; then
    value=$(printf '%s' "$match" | sed -n -E "s/.*${name}='([^']*)'.*/\\1/p" | head -1)
  fi
  printf '%s' "$value"
}

# count_pattern REGEX — number of case-insensitive occurrences of REGEX
# in HTML_FLAT (grep -o emits one line per hit; wc -l counts them).
count_pattern() {
  local hits
  hits=$(printf '%s' "$HTML_FLAT" | grep -oiE "$1" | wc -l | tr -d ' ' || true)
  printf '%s' "${hits:-0}"
}

# <title> and first-<h1> text. The grep patterns constrain the element
# text to [^<]*, so stripping every tag with a plain substitution
# recovers the text exactly — without GNU sed's non-portable `s///I`
# flag, which BSD/macOS sed rejects (and, under `set -e`, the failing
# sed would abort the whole script there).
TITLE_TAG=$(printf '%s' "$HTML_FLAT" | grep -oiE '<title[^>]*>[^<]*</title>' | head -1 || true)
TITLE=""
if [ -n "$TITLE_TAG" ]; then
  TITLE=$(printf '%s' "$TITLE_TAG" | sed -E 's/<[^>]*>//g')
fi

# Named meta/link/html attributes. The ["'"'"'"] bracket in each
# pattern matches either quote character.
DESCRIPTION=$(get_attr '<meta[^>]*name=["'\''"]description["'\''"][^>]*>' 'content')
OG_TITLE=$(get_attr '<meta[^>]*property=["'\''"]og:title["'\''"][^>]*>' 'content')
OG_DESCRIPTION=$(get_attr '<meta[^>]*property=["'\''"]og:description["'\''"][^>]*>' 'content')
OG_IMAGE=$(get_attr '<meta[^>]*property=["'\''"]og:image["'\''"][^>]*>' 'content')
OG_TYPE=$(get_attr '<meta[^>]*property=["'\''"]og:type["'\''"][^>]*>' 'content')
TWITTER_CARD=$(get_attr '<meta[^>]*name=["'\''"]twitter:card["'\''"][^>]*>' 'content')
VIEWPORT=$(get_attr '<meta[^>]*name=["'\''"]viewport["'\''"][^>]*>' 'content')
CANONICAL=$(get_attr '<link[^>]*rel=["'\''"]canonical["'\''"][^>]*>' 'href')
LANG_VAL=$(get_attr '<html[^>]*>' 'lang')

# Heading tallies.
H1_COUNT=$(count_pattern '<h1[^>]*>')
H2_COUNT=$(count_pattern '<h2[^>]*>')
H3_COUNT=$(count_pattern '<h3[^>]*>')

H1_TAG=$(printf '%s' "$HTML_FLAT" | grep -oiE '<h1[^>]*>[^<]*</h1>' | head -1 || true)
H1_TEXT=""
if [ -n "$H1_TAG" ]; then
  H1_TEXT=$(printf '%s' "$H1_TAG" | sed -E 's/<[^>]*>//g')
fi

# Image accessibility: total <img> tags vs tags carrying a quoted alt=.
IMG_TOTAL=$(count_pattern '<img[^>]*>')
IMG_WITH_ALT=$(count_pattern '<img[^>]*alt=("[^"]*"|'\''[^'\'']*'\'')[^>]*>')

# Final report on stdout. blank_to_null maps "" to null so consumers
# can tell "absent" from "present but empty"; the emitted JSON is
# identical to spelling the conditional out per field.
jq -n \
  --arg title "$TITLE" \
  --arg description "$DESCRIPTION" \
  --arg canonical "$CANONICAL" \
  --arg ogTitle "$OG_TITLE" \
  --arg ogDescription "$OG_DESCRIPTION" \
  --arg ogImage "$OG_IMAGE" \
  --arg ogType "$OG_TYPE" \
  --arg twitterCard "$TWITTER_CARD" \
  --arg h1Text "$H1_TEXT" \
  --arg lang "$LANG_VAL" \
  --arg viewport "$VIEWPORT" \
  --argjson h1Count "$H1_COUNT" \
  --argjson h2Count "$H2_COUNT" \
  --argjson h3Count "$H3_COUNT" \
  --argjson imgTotal "$IMG_TOTAL" \
  --argjson imgWithAlt "$IMG_WITH_ALT" \
  'def blank_to_null: if . == "" then null else . end;
  {
    title: ($title | blank_to_null),
    description: ($description | blank_to_null),
    canonical: ($canonical | blank_to_null),
    lang: ($lang | blank_to_null),
    viewport: ($viewport | blank_to_null),
    og: {
      title: ($ogTitle | blank_to_null),
      description: ($ogDescription | blank_to_null),
      image: ($ogImage | blank_to_null),
      type: ($ogType | blank_to_null)
    },
    twitter: {
      card: ($twitterCard | blank_to_null)
    },
    headings: {
      h1: { count: $h1Count, firstText: ($h1Text | blank_to_null) },
      h2: { count: $h2Count },
      h3: { count: $h3Count }
    },
    images: {
      total: $imgTotal,
      withAlt: $imgWithAlt,
      missingAlt: ($imgTotal - $imgWithAlt)
    }
  }'
@@ -0,0 +1,87 @@
1
#!/usr/bin/env bash
set -euo pipefail

# fetch-as-bot.sh — Fetch a URL as a specific bot User-Agent
# Usage: fetch-as-bot.sh <url> <profile.json>

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=_lib.sh
. "$SCRIPT_DIR/_lib.sh"

URL="${1:?Usage: fetch-as-bot.sh <url> <profile.json>}"
PROFILE="${2:?Usage: fetch-as-bot.sh <url> <profile.json>}"

# Fail fast with a clear message instead of a raw jq error when the
# profile path is wrong.
[ -f "$PROFILE" ] || { printf '[fetch-as-bot] profile not found: %s\n' "$PROFILE" >&2; exit 1; }

# Bot identity fields from the profile JSON.
BOT_ID=$(jq -r '.id' "$PROFILE")
BOT_NAME=$(jq -r '.name' "$PROFILE")
UA=$(jq -r '.userAgent' "$PROFILE")
RENDERS_JS=$(jq -r '.rendersJavaScript' "$PROFILE")

printf '[fetch-as-bot] %s <- %s\n' "$BOT_NAME" "$URL" >&2

# Response headers and body land in temp files, removed on any exit.
TMPDIR="${TMPDIR:-/tmp}"
HEADERS_FILE=$(mktemp "$TMPDIR/crawlsim-headers.XXXXXX")
BODY_FILE=$(mktemp "$TMPDIR/crawlsim-body.XXXXXX")
trap 'rm -f "$HEADERS_FILE" "$BODY_FILE"' EXIT

# Fetch the page with the bot's User-Agent, following redirects.
# Timing metrics come from curl's -w format string; on any transport
# failure fall back to a zeroed metrics object so the script still
# emits a complete report.
TIMING=$(curl -sS -L \
  -H "User-Agent: $UA" \
  -D "$HEADERS_FILE" \
  -o "$BODY_FILE" \
  -w '{"total":%{time_total},"ttfb":%{time_starttransfer},"connect":%{time_connect},"statusCode":%{http_code},"sizeDownload":%{size_download}}' \
  --max-time 30 \
  "$URL" 2>/dev/null || echo '{"total":0,"ttfb":0,"connect":0,"statusCode":0,"sizeDownload":0}')

STATUS=$(echo "$TIMING" | jq -r '.statusCode')
TOTAL_TIME=$(echo "$TIMING" | jq -r '.total')
TTFB=$(echo "$TIMING" | jq -r '.ttfb')
SIZE=$(echo "$TIMING" | jq -r '.sizeDownload')

# Parse response headers into a JSON object using jq for safe escaping.
# curl -L writes one header block per redirect hop; jq keeps the last
# definition of each header since `add` overwrites left-to-right.
# The `|| true` guard is essential: when no line matches (e.g. the
# fetch failed and the headers file is empty), grep exits 1, and under
# `set -o pipefail` the failing command substitution would abort the
# whole script instead of yielding the empty {} object.
HEADERS_JSON=$({ tr -d '\r' < "$HEADERS_FILE" \
  | grep -E '^[A-Za-z][A-Za-z0-9-]*:[[:space:]]' || true; } \
  | jq -Rs '
    split("\n")
    | map(select(length > 0))
    | map(capture("^(?<k>[^:]+):[[:space:]]*(?<v>.*)$"))
    | map({(.k): .v})
    | add // {}
  ')

# Word count of the downloaded body (count_words comes from _lib.sh).
WORD_COUNT=$(count_words "$BODY_FILE")
[ -z "$WORD_COUNT" ] && WORD_COUNT=0

# Base64-encode the body for embedding in the JSON report. GNU base64
# wraps output at 76 columns while BSD/macOS base64 does not; stripping
# the newlines yields identical single-line output on both platforms
# (decoders accept unwrapped base64).
BODY_B64=""
if [ -s "$BODY_FILE" ]; then
  BODY_B64=$(base64 < "$BODY_FILE" | tr -d '\n')
fi

# Final report on stdout. rendersJavaScript arrives as the string form
# of a JSON scalar; to_bool maps "true"/"false" back to real booleans
# and passes anything else through unchanged.
jq -n \
  --arg url "$URL" \
  --arg botId "$BOT_ID" \
  --arg botName "$BOT_NAME" \
  --arg ua "$UA" \
  --arg rendersJs "$RENDERS_JS" \
  --argjson status "$STATUS" \
  --argjson totalTime "$TOTAL_TIME" \
  --argjson ttfb "$TTFB" \
  --argjson size "$SIZE" \
  --argjson wordCount "$WORD_COUNT" \
  --argjson headers "$HEADERS_JSON" \
  --arg bodyBase64 "$BODY_B64" \
  'def to_bool: if . == "true" then true elif . == "false" then false else . end;
  {
    url: $url,
    bot: {
      id: $botId,
      name: $botName,
      userAgent: $ua,
      rendersJavaScript: ($rendersJs | to_bool)
    },
    status: $status,
    timing: { total: $totalTime, ttfb: $ttfb },
    size: $size,
    wordCount: $wordCount,
    headers: $headers,
    bodyBase64: $bodyBase64
  }'