@braedenbuilds/crawl-sim 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "name": "@braedenbuilds/crawl-sim",
3
+ "version": "1.0.1",
4
+ "description": "Agent-native multi-bot web crawler simulator. See your site through the eyes of Googlebot, GPTBot, ClaudeBot, and PerplexityBot.",
5
+ "bin": {
6
+ "crawl-sim": "bin/install.js"
7
+ },
8
+ "keywords": [
9
+ "seo",
10
+ "crawler",
11
+ "googlebot",
12
+ "gptbot",
13
+ "claudebot",
14
+ "perplexitybot",
15
+ "ai-visibility",
16
+ "ai-seo",
17
+ "geo",
18
+ "llms-txt",
19
+ "claude-code",
20
+ "claude-code-skill"
21
+ ],
22
+ "author": "BraedenBDev",
23
+ "license": "MIT",
24
+ "homepage": "https://github.com/BraedenBDev/crawl-sim#readme",
25
+ "bugs": {
26
+ "url": "https://github.com/BraedenBDev/crawl-sim/issues"
27
+ },
28
+ "repository": {
29
+ "type": "git",
30
+ "url": "git+https://github.com/BraedenBDev/crawl-sim.git"
31
+ },
32
+ "publishConfig": {
33
+ "access": "public"
34
+ },
35
+ "engines": {
36
+ "node": ">=18"
37
+ },
38
+ "files": [
39
+ "bin/",
40
+ "SKILL.md",
41
+ "profiles/",
42
+ "scripts/",
43
+ "README.md",
44
+ "LICENSE"
45
+ ]
46
+ }
@@ -0,0 +1,28 @@
1
+ {
2
+ "id": "chatgpt-user",
3
+ "name": "ChatGPT-User",
4
+ "vendor": "OpenAI",
5
+ "userAgent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot",
6
+ "robotsTxtToken": "ChatGPT-User",
7
+ "purpose": "user-initiated",
8
+ "rendersJavaScript": "unknown",
9
+ "respectsRobotsTxt": "partial",
10
+ "crawlDelaySupported": "unknown",
11
+ "ipRangesUrl": "https://openai.com/chatgpt-user.json",
12
+ "docs": "https://developers.openai.com/api/docs/bots",
13
+ "confidence": {
14
+ "rendersJavaScript": {
15
+ "value": "unknown",
16
+ "level": "inferred",
17
+ "source": "Not documented by OpenAI"
18
+ },
19
+ "respectsRobotsTxt": {
20
+ "value": "partial",
21
+ "level": "official",
22
+ "source": "Official docs state: 'Because these actions are initiated by a user, robots.txt rules may not apply.'"
23
+ }
24
+ },
25
+ "lastVerified": "2026-04-11",
26
+ "relatedBots": ["gptbot", "oai-searchbot"],
27
+ "notes": "Not used for automatic crawling. Not used to determine search appearance. User-initiated fetches in ChatGPT and Custom GPTs."
28
+ }
@@ -0,0 +1,28 @@
1
+ {
2
+ "id": "claude-searchbot",
3
+ "name": "Claude-SearchBot",
4
+ "vendor": "Anthropic",
5
+ "userAgent": "Claude-SearchBot",
6
+ "robotsTxtToken": "Claude-SearchBot",
7
+ "purpose": "search",
8
+ "rendersJavaScript": "unknown",
9
+ "respectsRobotsTxt": true,
10
+ "crawlDelaySupported": "unknown",
11
+ "ipRangesUrl": null,
12
+ "docs": "https://privacy.claude.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler",
13
+ "confidence": {
14
+ "rendersJavaScript": {
15
+ "value": "unknown",
16
+ "level": "inferred",
17
+ "source": "Not documented by Anthropic"
18
+ },
19
+ "respectsRobotsTxt": {
20
+ "value": true,
21
+ "level": "official",
22
+ "source": "Official docs, which note that blocking may reduce visibility and accuracy in user search results"
23
+ }
24
+ },
25
+ "lastVerified": "2026-04-11",
26
+ "relatedBots": ["claudebot", "claude-user"],
27
+ "notes": "Navigates the web to improve search result quality. Focused on search indexing, not training."
28
+ }
@@ -0,0 +1,28 @@
1
+ {
2
+ "id": "claude-user",
3
+ "name": "Claude-User",
4
+ "vendor": "Anthropic",
5
+ "userAgent": "Claude-User",
6
+ "robotsTxtToken": "Claude-User",
7
+ "purpose": "user-initiated",
8
+ "rendersJavaScript": "unknown",
9
+ "respectsRobotsTxt": true,
10
+ "crawlDelaySupported": "unknown",
11
+ "ipRangesUrl": null,
12
+ "docs": "https://privacy.claude.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler",
13
+ "confidence": {
14
+ "rendersJavaScript": {
15
+ "value": "unknown",
16
+ "level": "inferred",
17
+ "source": "Not documented by Anthropic"
18
+ },
19
+ "respectsRobotsTxt": {
20
+ "value": true,
21
+ "level": "official",
22
+ "source": "Official docs, which note that blocking may reduce visibility for user-directed web search"
23
+ }
24
+ },
25
+ "lastVerified": "2026-04-11",
26
+ "relatedBots": ["claudebot", "claude-searchbot"],
27
+ "notes": "When individuals ask questions to Claude, it may access websites. Blocking prevents Claude from retrieving content in response to user queries."
28
+ }
@@ -0,0 +1,28 @@
1
+ {
2
+ "id": "claudebot",
3
+ "name": "ClaudeBot",
4
+ "vendor": "Anthropic",
5
+ "userAgent": "ClaudeBot",
6
+ "robotsTxtToken": "ClaudeBot",
7
+ "purpose": "training",
8
+ "rendersJavaScript": false,
9
+ "respectsRobotsTxt": true,
10
+ "crawlDelaySupported": true,
11
+ "ipRangesUrl": null,
12
+ "docs": "https://privacy.claude.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler",
13
+ "confidence": {
14
+ "rendersJavaScript": {
15
+ "value": false,
16
+ "level": "observed",
17
+ "source": "Observational evidence consistent with no JS rendering"
18
+ },
19
+ "respectsRobotsTxt": {
20
+ "value": true,
21
+ "level": "official",
22
+ "source": "https://privacy.claude.com"
23
+ }
24
+ },
25
+ "lastVerified": "2026-04-11",
26
+ "relatedBots": ["claude-user", "claude-searchbot"],
27
+ "notes": "Collects web content that could potentially contribute to AI model training. Crawl-delay explicitly supported (non-standard). Blocking IP addresses will not reliably work."
28
+ }
@@ -0,0 +1,28 @@
1
+ {
2
+ "id": "googlebot",
3
+ "name": "Googlebot",
4
+ "vendor": "Google",
5
+ "userAgent": "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/W.X.Y.Z Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
6
+ "robotsTxtToken": "Googlebot",
7
+ "purpose": "search-indexing",
8
+ "rendersJavaScript": true,
9
+ "respectsRobotsTxt": true,
10
+ "crawlDelaySupported": false,
11
+ "ipRangesUrl": null,
12
+ "docs": "https://developers.google.com/search/docs/crawling-indexing/javascript/fix-search-javascript",
13
+ "confidence": {
14
+ "rendersJavaScript": {
15
+ "value": true,
16
+ "level": "official",
17
+ "source": "https://developers.google.com/search/docs/crawling-indexing/javascript/fix-search-javascript"
18
+ },
19
+ "respectsRobotsTxt": {
20
+ "value": true,
21
+ "level": "official",
22
+ "source": "RFC 9309 compliant"
23
+ }
24
+ },
25
+ "lastVerified": "2026-04-11",
26
+ "relatedBots": [],
27
+ "notes": "Two-phase: initial fetch (HTML) then queued render (headless Chrome via WRS). Evergreen Chromium. Stateless sessions. ~5s default timeout. Mobile-first indexing."
28
+ }
@@ -0,0 +1,28 @@
1
+ {
2
+ "id": "gptbot",
3
+ "name": "GPTBot",
4
+ "vendor": "OpenAI",
5
+ "userAgent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.3; +https://openai.com/gptbot",
6
+ "robotsTxtToken": "GPTBot",
7
+ "purpose": "training",
8
+ "rendersJavaScript": false,
9
+ "respectsRobotsTxt": true,
10
+ "crawlDelaySupported": "unknown",
11
+ "ipRangesUrl": "https://openai.com/gptbot.json",
12
+ "docs": "https://developers.openai.com/api/docs/bots",
13
+ "confidence": {
14
+ "rendersJavaScript": {
15
+ "value": false,
16
+ "level": "observed",
17
+ "source": "Multiple third-party tests with JS-only pages show empty content"
18
+ },
19
+ "respectsRobotsTxt": {
20
+ "value": true,
21
+ "level": "official",
22
+ "source": "https://developers.openai.com/api/docs/bots"
23
+ }
24
+ },
25
+ "lastVerified": "2026-04-11",
26
+ "relatedBots": ["oai-searchbot", "chatgpt-user"],
27
+ "notes": "Disallowing GPTBot indicates a site's content should not be used in training generative AI foundation models."
28
+ }
@@ -0,0 +1,28 @@
1
+ {
2
+ "id": "oai-searchbot",
3
+ "name": "OAI-SearchBot",
4
+ "vendor": "OpenAI",
5
+ "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36; compatible; OAI-SearchBot/1.3; +https://openai.com/searchbot",
6
+ "robotsTxtToken": "OAI-SearchBot",
7
+ "purpose": "search",
8
+ "rendersJavaScript": "unknown",
9
+ "respectsRobotsTxt": true,
10
+ "crawlDelaySupported": "unknown",
11
+ "ipRangesUrl": "https://openai.com/searchbot.json",
12
+ "docs": "https://developers.openai.com/api/docs/bots",
13
+ "confidence": {
14
+ "rendersJavaScript": {
15
+ "value": "unknown",
16
+ "level": "inferred",
17
+ "source": "UA mimics Chrome 131 — may indicate rendering capability, but unconfirmed"
18
+ },
19
+ "respectsRobotsTxt": {
20
+ "value": true,
21
+ "level": "official",
22
+ "source": "https://developers.openai.com/api/docs/bots"
23
+ }
24
+ },
25
+ "lastVerified": "2026-04-11",
26
+ "relatedBots": ["gptbot", "chatgpt-user"],
27
+ "notes": "Sites opted out of OAI-SearchBot will not be shown in ChatGPT search answers, though can still appear as navigational links."
28
+ }
@@ -0,0 +1,28 @@
1
+ {
2
+ "id": "perplexity-user",
3
+ "name": "Perplexity-User",
4
+ "vendor": "Perplexity",
5
+ "userAgent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Perplexity-User/1.0; +https://perplexity.ai/perplexity-user)",
6
+ "robotsTxtToken": "Perplexity-User",
7
+ "purpose": "user-initiated",
8
+ "rendersJavaScript": "unknown",
9
+ "respectsRobotsTxt": false,
10
+ "crawlDelaySupported": "unknown",
11
+ "ipRangesUrl": "https://www.perplexity.com/perplexity-user.json",
12
+ "docs": "https://docs.perplexity.ai/docs/resources/perplexity-crawlers",
13
+ "confidence": {
14
+ "rendersJavaScript": {
15
+ "value": "unknown",
16
+ "level": "inferred",
17
+ "source": "Not documented by Perplexity"
18
+ },
19
+ "respectsRobotsTxt": {
20
+ "value": false,
21
+ "level": "official",
22
+ "source": "Official docs state: 'Since a user requested the fetch, this fetcher generally ignores robots.txt rules.'"
23
+ }
24
+ },
25
+ "lastVerified": "2026-04-11",
26
+ "relatedBots": ["perplexitybot"],
27
+ "notes": "Supports user actions within Perplexity. Not used for web crawling or AI training. Generally ignores robots.txt since fetches are user-initiated."
28
+ }
@@ -0,0 +1,28 @@
1
+ {
2
+ "id": "perplexitybot",
3
+ "name": "PerplexityBot",
4
+ "vendor": "Perplexity",
5
+ "userAgent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; PerplexityBot/1.0; +https://perplexity.ai/perplexitybot)",
6
+ "robotsTxtToken": "PerplexityBot",
7
+ "purpose": "search-indexing",
8
+ "rendersJavaScript": false,
9
+ "respectsRobotsTxt": true,
10
+ "crawlDelaySupported": "unknown",
11
+ "ipRangesUrl": "https://www.perplexity.com/perplexitybot.json",
12
+ "docs": "https://docs.perplexity.ai/docs/resources/perplexity-crawlers",
13
+ "confidence": {
14
+ "rendersJavaScript": {
15
+ "value": false,
16
+ "level": "observed",
17
+ "source": "Most third-party reports indicate no JS rendering"
18
+ },
19
+ "respectsRobotsTxt": {
20
+ "value": true,
21
+ "level": "official",
22
+ "source": "https://docs.perplexity.ai/docs/resources/perplexity-crawlers"
23
+ }
24
+ },
25
+ "lastVerified": "2026-04-11",
26
+ "relatedBots": ["perplexity-user"],
27
+ "notes": "Designed to surface and link websites in search results on Perplexity. NOT used to crawl content for AI foundation models. Changes may take up to 24 hours to reflect."
28
+ }
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env bash
2
+ # crawl-sim shared helpers. Source this from other scripts:
3
+ # SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
4
+ # . "$SCRIPT_DIR/_lib.sh"
5
+
6
# Extract the origin ("scheme://host[:port]") from an http(s) URL.
# The authority ends at the first "/", "?" or "#", so a URL that carries a
# query string or fragment but no path (https://example.com?x=1) still yields
# a clean origin that is safe to append "/robots.txt" or "/llms.txt" to.
# Input that does not start with http:// or https:// is printed unchanged.
origin_from_url() {
  # "," is the sed delimiter so "#" can appear inside the bracket expression.
  printf '%s' "$1" | sed -E 's,^(https?://[^/?#]+).*,\1,'
}
10
+
11
# Extract the host (including ":port" when present) from an http(s) URL and
# drop a leading "www.". Everything from the first "/", "?" or "#" after the
# authority is discarded, so query-only URLs such as https://example.com?x=1
# do not leak the query string into the host.
host_from_url() {
  # "," is the sed delimiter so "#" can appear inside the bracket expression.
  printf '%s' "$1" | sed -E \
    -e 's,^https?://,,' \
    -e 's,[/?#].*$,,' \
    -e 's,^www\.,,'
}
15
+
16
# Print whatever follows the authority of an http(s) URL (the path together
# with any query string or fragment). Prints "/" when nothing follows it.
path_from_url() {
  local remainder
  remainder=$(printf '%s' "$1" | sed -E 's#^https?://[^/]+##')
  if [ -z "$remainder" ]; then
    remainder="/"
  fi
  printf '%s' "$remainder"
}
22
+
23
# Print the "directory" of a URL: origin plus the path up to and including its
# last "/". Used as the base when resolving relative URLs against a page URL.
# Example: https://example.com/blog/index.html -> https://example.com/blog/
dir_from_url() {
  local url="$1"
  local origin p
  origin=$(origin_from_url "$url")
  p=$(path_from_url "$url")
  # A path that already ends in "/" is a directory; otherwise drop the final
  # segment and put the trailing slash back.
  if [[ "$p" != */ ]]; then
    p="$(printf '%s' "$p" | sed -E 's#/[^/]*$##')/"
  fi
  printf '%s%s' "$origin" "$p"
}
38
+
39
# Print the number of word-like tokens in an HTML file.
# Tags are deleted (per line, without inserting a separator), the remainder is
# split on whitespace, and every token containing at least one ASCII letter or
# digit is counted. Always exits 0, printing 0 when nothing matches.
count_words() {
  local html_file="$1"
  sed 's/<[^>]*>//g' "$html_file" \
    | tr -s '[:space:]' '\n' \
    | grep -c '[a-zA-Z0-9]' \
    || true   # grep -c exits 1 on a zero count; that is not an error here
}
43
+
44
# Fetch a URL to a local file and print the HTTP status code on stdout.
# Usage: status=$(fetch_to_file <url> <output-file> [timeout-seconds])
#   - redirects are followed (-L)
#   - timeout defaults to 15 seconds (curl --max-time)
# Prints exactly "000" whenever curl exits non-zero (DNS failure, refused
# connection, timeout, ...). curl's -w output is captured instead of streamed:
# on failure curl has already written a code of its own, so echoing a fallback
# after it would hand callers strings such as "000000" or "200000".
# Always returns 0 so callers running under `set -e` can branch on the code.
fetch_to_file() {
  local url="$1"
  local out="$2"
  local timeout="${3:-15}"
  local code
  if code=$(curl -sS -L -o "$out" -w '%{http_code}' --max-time "$timeout" "$url" 2>/dev/null); then
    printf '%s' "$code"
  else
    echo "000"
  fi
}
@@ -0,0 +1,116 @@
1
+ #!/usr/bin/env bash
2
+ set -eu
3
+
4
+ # check-llmstxt.sh — Check for llms.txt and llms-full.txt presence + structure
5
+ # Usage: check-llmstxt.sh <url>
6
+
7
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
8
+ # shellcheck source=_lib.sh
9
+ . "$SCRIPT_DIR/_lib.sh"
10
+
11
+ URL="${1:?Usage: check-llmstxt.sh <url>}"
12
+ printf '[check-llmstxt] %s\n' "$URL" >&2
13
+ ORIGIN=$(origin_from_url "$URL")
14
+
15
+ TMPDIR="${TMPDIR:-/tmp}"
16
+ LLMS_FILE=$(mktemp "$TMPDIR/crawlsim-llms.XXXXXX")
17
+ LLMS_FULL_FILE=$(mktemp "$TMPDIR/crawlsim-llms-full.XXXXXX")
18
+ trap 'rm -f "$LLMS_FILE" "$LLMS_FULL_FILE"' EXIT
19
+
20
# Inspect a downloaded llms.txt-style file and publish the findings through the
# globals EXISTS, LINE_COUNT, HAS_TITLE, TITLE, HAS_DESCRIPTION and URL_COUNT
# (bash functions cannot return structured values).
#   $1  path of the downloaded body
#   $2  HTTP status code returned by the download
# The file counts as present only when the status is 200, the body is
# non-empty, and its first 100 bytes contain neither "<!doctype" nor "<html"
# (case-insensitive) -- that looks like an HTML fallback page, not llms.txt.
analyze_file() {
  local file="$1"
  local status_code="$2"

  # Defaults describe an absent file; they are upgraded below.
  EXISTS=false
  LINE_COUNT=0
  HAS_TITLE=false
  TITLE=""
  HAS_DESCRIPTION=false
  URL_COUNT=0

  if [ "$status_code" != "200" ] || [ ! -s "$file" ]; then
    return 0
  fi

  local head_lc
  head_lc=$(head -c 100 "$file" | tr '[:upper:]' '[:lower:]')
  if [[ "$head_lc" == *"<!doctype"* || "$head_lc" == *"<html"* ]]; then
    return 0
  fi

  EXISTS=true
  LINE_COUNT=$(wc -l < "$file" | tr -d ' ')

  # Title: only the very first line is checked, and it must be "# <text>".
  if head -1 "$file" | grep -qE '^#[[:space:]]+'; then
    HAS_TITLE=true
    TITLE=$(head -1 "$file" | sed -E 's/^#[[:space:]]+//' | tr -d '\r')
  fi

  # Description: a block quote anywhere, or a line 2-5 starting with a letter.
  if grep -qE '^>[[:space:]]+' "$file" || sed -n '2,5p' "$file" | grep -qE '^[A-Za-z]'; then
    HAS_DESCRIPTION=true
  fi

  # Markdown links whose target is an absolute http(s) URL.
  URL_COUNT=$(grep -oE '\[[^]]*\]\(https?://[^)]+\)' "$file" 2>/dev/null | wc -l | tr -d ' ' || echo 0)
}
64
+
65
# Probe both well-known files at the site origin. analyze_file reports through
# shared globals that the next call overwrites, so each result set is copied
# into its own variables right after the call.
LLMS_URL="${ORIGIN}/llms.txt"
LLMS_FULL_URL="${ORIGIN}/llms-full.txt"

LLMS_STATUS=$(fetch_to_file "$LLMS_URL" "$LLMS_FILE")
analyze_file "$LLMS_FILE" "$LLMS_STATUS"
LLMS_EXISTS=$EXISTS
LLMS_LINES=$LINE_COUNT
LLMS_HAS_TITLE=$HAS_TITLE
LLMS_TITLE=$TITLE
LLMS_HAS_DESC=$HAS_DESCRIPTION
LLMS_URLS=$URL_COUNT

LLMS_FULL_STATUS=$(fetch_to_file "$LLMS_FULL_URL" "$LLMS_FULL_FILE")
analyze_file "$LLMS_FULL_FILE" "$LLMS_FULL_STATUS"
LLMS_FULL_EXISTS=$EXISTS
LLMS_FULL_LINES=$LINE_COUNT
LLMS_FULL_HAS_TITLE=$HAS_TITLE
LLMS_FULL_HAS_DESC=$HAS_DESCRIPTION
LLMS_FULL_URLS=$URL_COUNT

# Emit the report. Strings go through --arg; booleans and counts go through
# --argjson so they appear as JSON literals. An empty title is reported as null.
jq -n \
  --arg     url              "$URL" \
  --arg     llmsUrl          "$LLMS_URL" \
  --arg     llmsFullUrl      "$LLMS_FULL_URL" \
  --arg     llmsTitle        "$LLMS_TITLE" \
  --argjson llmsExists       "$LLMS_EXISTS" \
  --argjson llmsLines        "$LLMS_LINES" \
  --argjson llmsHasTitle     "$LLMS_HAS_TITLE" \
  --argjson llmsHasDesc      "$LLMS_HAS_DESC" \
  --argjson llmsUrls         "$LLMS_URLS" \
  --argjson llmsFullExists   "$LLMS_FULL_EXISTS" \
  --argjson llmsFullLines    "$LLMS_FULL_LINES" \
  --argjson llmsFullHasTitle "$LLMS_FULL_HAS_TITLE" \
  --argjson llmsFullHasDesc  "$LLMS_FULL_HAS_DESC" \
  --argjson llmsFullUrls     "$LLMS_FULL_URLS" \
  '{
    url: $url,
    llmsTxt: {
      url: $llmsUrl,
      exists: $llmsExists,
      lineCount: $llmsLines,
      hasTitle: $llmsHasTitle,
      title: (if $llmsTitle == "" then null else $llmsTitle end),
      hasDescription: $llmsHasDesc,
      urlCount: $llmsUrls
    },
    llmsFullTxt: {
      url: $llmsFullUrl,
      exists: $llmsFullExists,
      lineCount: $llmsFullLines,
      hasTitle: $llmsFullHasTitle,
      hasDescription: $llmsFullHasDesc,
      urlCount: $llmsFullUrls
    }
  }'
@@ -0,0 +1,196 @@
1
+ #!/usr/bin/env bash
2
+ set -eu
3
+
4
+ # check-robots.sh — Fetch robots.txt and parse rules for a given UA token
5
+ # Usage: check-robots.sh <url> <ua-token>
6
+
7
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
8
+ # shellcheck source=_lib.sh
9
+ . "$SCRIPT_DIR/_lib.sh"
10
+
11
+ URL="${1:?Usage: check-robots.sh <url> <ua-token>}"
12
+ UA_TOKEN="${2:?Usage: check-robots.sh <url> <ua-token>}"
13
+
14
+ printf '[check-robots] %s for %s\n' "$URL" "$UA_TOKEN" >&2
15
+
16
+ ORIGIN=$(origin_from_url "$URL")
17
+ URL_PATH=$(path_from_url "$URL")
18
+ ROBOTS_URL="${ORIGIN}/robots.txt"
19
+
20
+ TMPDIR="${TMPDIR:-/tmp}"
21
+ ROBOTS_FILE=$(mktemp "$TMPDIR/crawlsim-robots.XXXXXX")
22
+ RAW_FILE=$(mktemp "$TMPDIR/crawlsim-robots-raw.XXXXXX")
23
+ DISALLOWED_PATHS_FILE=$(mktemp "$TMPDIR/crawlsim-disallowed.XXXXXX")
24
+ ALLOW_PATHS_FILE=$(mktemp "$TMPDIR/crawlsim-allow.XXXXXX")
25
+ SITEMAPS_FILE=$(mktemp "$TMPDIR/crawlsim-sitemaps.XXXXXX")
26
+ trap 'rm -f "$ROBOTS_FILE" "$RAW_FILE" "$DISALLOWED_PATHS_FILE" "$ALLOW_PATHS_FILE" "$SITEMAPS_FILE"' EXIT
27
+
28
+ HTTP_STATUS=$(fetch_to_file "$ROBOTS_URL" "$ROBOTS_FILE")
29
+
30
+ EXISTS=false
31
+ if [ "$HTTP_STATUS" = "200" ] && [ -s "$ROBOTS_FILE" ]; then
32
+ EXISTS=true
33
+ fi
34
+
35
+ ALLOWED=true
36
+ CRAWL_DELAY="null"
37
+
38
+ if [ "$EXISTS" = "true" ]; then
39
+ # Extract sitemap directives
40
+ grep -iE '^[[:space:]]*sitemap[[:space:]]*:' "$ROBOTS_FILE" 2>/dev/null \
41
+ | sed -E 's/^[[:space:]]*[sS][iI][tT][eE][mM][aA][pP][[:space:]]*:[[:space:]]*//' \
42
+ | tr -d '\r' \
43
+ | sed -E 's/[[:space:]]+$//' \
44
+ > "$SITEMAPS_FILE" || true
45
+
46
+ # Parse User-agent blocks using portable awk
47
+ # State machine: track current UA group(s), emit rules tagged EXACT_ or WILD_
48
+ awk -v ua="$UA_TOKEN" '
49
+ function lower(s) { return tolower(s) }
50
+ function trim(s) {
51
+ sub(/^[ \t\r]+/, "", s)
52
+ sub(/[ \t\r]+$/, "", s)
53
+ return s
54
+ }
55
+ function parse_directive(line, colon, key, val) {
56
+ colon = index(line, ":")
57
+ if (colon == 0) return ""
58
+ key = lower(trim(substr(line, 1, colon - 1)))
59
+ val = trim(substr(line, colon + 1))
60
+ return key "\t" val
61
+ }
62
+ function emit(kind, value, i, u) {
63
+ for (i = 1; i <= n_uas; i++) {
64
+ u = uas[i]
65
+ if (lower(u) == lower(ua)) {
66
+ print "EXACT_" kind "\t" value
67
+ }
68
+ if (u == "*") {
69
+ print "WILD_" kind "\t" value
70
+ }
71
+ }
72
+ }
73
+ BEGIN { n_uas = 0; prev_was_rule = 0 }
74
+ {
75
+ line = $0
76
+ # Strip comments
77
+ hash = index(line, "#")
78
+ if (hash > 0) line = substr(line, 1, hash - 1)
79
+ line = trim(line)
80
+ if (line == "") next
81
+
82
+ parsed = parse_directive(line)
83
+ if (parsed == "") next
84
+
85
+ tab = index(parsed, "\t")
86
+ key = substr(parsed, 1, tab - 1)
87
+ val = substr(parsed, tab + 1)
88
+
89
+ if (key == "user-agent") {
90
+ if (prev_was_rule) {
91
+ n_uas = 0
92
+ prev_was_rule = 0
93
+ }
94
+ n_uas++
95
+ uas[n_uas] = val
96
+ next
97
+ }
98
+ if (key == "disallow") { prev_was_rule = 1; emit("DISALLOW", val); next }
99
+ if (key == "allow") { prev_was_rule = 1; emit("ALLOW", val); next }
100
+ if (key == "crawl-delay") { prev_was_rule = 1; emit("DELAY", val); next }
101
+ }
102
+ ' "$ROBOTS_FILE" > "$RAW_FILE"
103
+
104
+ # Prefer exact UA rules if present, else wildcard
105
+ PREFIX="WILD_"
106
+ if grep -q '^EXACT_' "$RAW_FILE"; then
107
+ PREFIX="EXACT_"
108
+ fi
109
+
110
+ grep "^${PREFIX}DISALLOW" "$RAW_FILE" 2>/dev/null \
111
+ | cut -f2- \
112
+ | grep -v '^$' \
113
+ > "$DISALLOWED_PATHS_FILE" || true
114
+
115
+ grep "^${PREFIX}ALLOW" "$RAW_FILE" 2>/dev/null \
116
+ | cut -f2- \
117
+ > "$ALLOW_PATHS_FILE" || true
118
+
119
+ DELAY_LINE=$(grep "^${PREFIX}DELAY" "$RAW_FILE" 2>/dev/null | head -1 | cut -f2- || true)
120
+ if [ -n "$DELAY_LINE" ]; then
121
+ if printf '%s' "$DELAY_LINE" | grep -qE '^[0-9]+(\.[0-9]+)?$'; then
122
+ CRAWL_DELAY="$DELAY_LINE"
123
+ fi
124
+ fi
125
+
126
+ # Longest-match path check (allow overrides disallow at equal or longer length)
127
+ BEST_MATCH_LEN=-1
128
+ BEST_MATCH_KIND="allow"
129
+
130
# Test a URL path against one robots.txt path pattern.
#   $1  pattern from an Allow/Disallow rule
#   $2  URL path to test
# Returns 0 on match, non-zero otherwise.
# Matching rules:
#   - the pattern is a prefix match anchored at the start of the path
#   - "*" matches any run of characters (including none)
#   - a trailing "$" anchors the match at the end of the path
#   - every other character, including a "$" that is not last, is literal
match_pattern() {
  local pat="$1"
  local path="$2"
  local anchor=""
  # Peel off the end-of-path anchor before escaping; otherwise the escaping
  # step turns it into a literal dollar sign that no ordinary URL contains.
  case "$pat" in
    *'$')
      pat="${pat%?}"
      anchor='$'
      ;;
  esac
  # Escape regex metacharacters, then expand the robots.txt wildcard.
  local esc
  esc=$(printf '%s' "$pat" | sed 's/[].[\^$()+?{|]/\\&/g' | sed 's/\*/.*/g')
  printf '%s' "$path" | grep -qE "^${esc}${anchor}"
}
139
+
140
+ while IFS= read -r pat; do
141
+ [ -z "$pat" ] && continue
142
+ if match_pattern "$pat" "$URL_PATH"; then
143
+ PAT_LEN=${#pat}
144
+ if [ "$PAT_LEN" -gt "$BEST_MATCH_LEN" ]; then
145
+ BEST_MATCH_LEN=$PAT_LEN
146
+ BEST_MATCH_KIND="disallow"
147
+ fi
148
+ fi
149
+ done < "$DISALLOWED_PATHS_FILE"
150
+
151
+ while IFS= read -r pat; do
152
+ [ -z "$pat" ] && continue
153
+ if match_pattern "$pat" "$URL_PATH"; then
154
+ PAT_LEN=${#pat}
155
+ if [ "$PAT_LEN" -ge "$BEST_MATCH_LEN" ]; then
156
+ BEST_MATCH_LEN=$PAT_LEN
157
+ BEST_MATCH_KIND="allow"
158
+ fi
159
+ fi
160
+ done < "$ALLOW_PATHS_FILE"
161
+
162
+ if [ "$BEST_MATCH_KIND" = "disallow" ]; then
163
+ ALLOWED=false
164
+ fi
165
+ fi
166
+
167
+ # Build JSON arrays
168
+ DISALLOWED_JSON="[]"
169
+ if [ -s "$DISALLOWED_PATHS_FILE" ]; then
170
+ DISALLOWED_JSON=$(head -100 "$DISALLOWED_PATHS_FILE" | jq -R . | jq -s .)
171
+ fi
172
+
173
+ SITEMAPS_JSON="[]"
174
+ if [ -s "$SITEMAPS_FILE" ]; then
175
+ SITEMAPS_JSON=$(jq -R . < "$SITEMAPS_FILE" | jq -s .)
176
+ fi
177
+
178
+ jq -n \
179
+ --arg url "$URL" \
180
+ --arg uaToken "$UA_TOKEN" \
181
+ --arg robotsUrl "$ROBOTS_URL" \
182
+ --argjson exists "$EXISTS" \
183
+ --argjson allowed "$ALLOWED" \
184
+ --argjson crawlDelay "$CRAWL_DELAY" \
185
+ --argjson disallowedPaths "$DISALLOWED_JSON" \
186
+ --argjson sitemaps "$SITEMAPS_JSON" \
187
+ '{
188
+ url: $url,
189
+ uaToken: $uaToken,
190
+ robotsUrl: $robotsUrl,
191
+ exists: $exists,
192
+ allowed: $allowed,
193
+ crawlDelay: $crawlDelay,
194
+ disallowedPaths: $disallowedPaths,
195
+ sitemaps: $sitemaps
196
+ }'