@braedenbuilds/crawl-sim 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +261 -0
- package/SKILL.md +196 -0
- package/bin/install.js +159 -0
- package/package.json +46 -0
- package/profiles/chatgpt-user.json +28 -0
- package/profiles/claude-searchbot.json +28 -0
- package/profiles/claude-user.json +28 -0
- package/profiles/claudebot.json +28 -0
- package/profiles/googlebot.json +28 -0
- package/profiles/gptbot.json +28 -0
- package/profiles/oai-searchbot.json +28 -0
- package/profiles/perplexity-user.json +28 -0
- package/profiles/perplexitybot.json +28 -0
- package/scripts/_lib.sh +51 -0
- package/scripts/check-llmstxt.sh +116 -0
- package/scripts/check-robots.sh +196 -0
- package/scripts/check-sitemap.sh +79 -0
- package/scripts/compute-score.sh +424 -0
- package/scripts/diff-render.sh +136 -0
- package/scripts/extract-jsonld.sh +133 -0
- package/scripts/extract-links.sh +103 -0
- package/scripts/extract-meta.sh +117 -0
- package/scripts/fetch-as-bot.sh +87 -0
package/package.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@braedenbuilds/crawl-sim",
|
|
3
|
+
"version": "1.0.1",
|
|
4
|
+
"description": "Agent-native multi-bot web crawler simulator. See your site through the eyes of Googlebot, GPTBot, ClaudeBot, and PerplexityBot.",
|
|
5
|
+
"bin": {
|
|
6
|
+
"crawl-sim": "bin/install.js"
|
|
7
|
+
},
|
|
8
|
+
"keywords": [
|
|
9
|
+
"seo",
|
|
10
|
+
"crawler",
|
|
11
|
+
"googlebot",
|
|
12
|
+
"gptbot",
|
|
13
|
+
"claudebot",
|
|
14
|
+
"perplexitybot",
|
|
15
|
+
"ai-visibility",
|
|
16
|
+
"ai-seo",
|
|
17
|
+
"geo",
|
|
18
|
+
"llms-txt",
|
|
19
|
+
"claude-code",
|
|
20
|
+
"claude-code-skill"
|
|
21
|
+
],
|
|
22
|
+
"author": "BraedenBDev",
|
|
23
|
+
"license": "MIT",
|
|
24
|
+
"homepage": "https://github.com/BraedenBDev/crawl-sim#readme",
|
|
25
|
+
"bugs": {
|
|
26
|
+
"url": "https://github.com/BraedenBDev/crawl-sim/issues"
|
|
27
|
+
},
|
|
28
|
+
"repository": {
|
|
29
|
+
"type": "git",
|
|
30
|
+
"url": "git+https://github.com/BraedenBDev/crawl-sim.git"
|
|
31
|
+
},
|
|
32
|
+
"publishConfig": {
|
|
33
|
+
"access": "public"
|
|
34
|
+
},
|
|
35
|
+
"engines": {
|
|
36
|
+
"node": ">=18"
|
|
37
|
+
},
|
|
38
|
+
"files": [
|
|
39
|
+
"bin/",
|
|
40
|
+
"SKILL.md",
|
|
41
|
+
"profiles/",
|
|
42
|
+
"scripts/",
|
|
43
|
+
"README.md",
|
|
44
|
+
"LICENSE"
|
|
45
|
+
]
|
|
46
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "chatgpt-user",
|
|
3
|
+
"name": "ChatGPT-User",
|
|
4
|
+
"vendor": "OpenAI",
|
|
5
|
+
"userAgent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot",
|
|
6
|
+
"robotsTxtToken": "ChatGPT-User",
|
|
7
|
+
"purpose": "user-initiated",
|
|
8
|
+
"rendersJavaScript": "unknown",
|
|
9
|
+
"respectsRobotsTxt": "partial",
|
|
10
|
+
"crawlDelaySupported": "unknown",
|
|
11
|
+
"ipRangesUrl": "https://openai.com/chatgpt-user.json",
|
|
12
|
+
"docs": "https://developers.openai.com/api/docs/bots",
|
|
13
|
+
"confidence": {
|
|
14
|
+
"rendersJavaScript": {
|
|
15
|
+
"value": "unknown",
|
|
16
|
+
"level": "inferred",
|
|
17
|
+
"source": "Not documented by OpenAI"
|
|
18
|
+
},
|
|
19
|
+
"respectsRobotsTxt": {
|
|
20
|
+
"value": "partial",
|
|
21
|
+
"level": "official",
|
|
22
|
+
"source": "Official docs state: 'Because these actions are initiated by a user, robots.txt rules may not apply.'"
|
|
23
|
+
}
|
|
24
|
+
},
|
|
25
|
+
"lastVerified": "2026-04-11",
|
|
26
|
+
"relatedBots": ["gptbot", "oai-searchbot"],
|
|
27
|
+
"notes": "Not used for automatic crawling. Not used to determine search appearance. User-initiated fetches in ChatGPT and Custom GPTs."
|
|
28
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "claude-searchbot",
|
|
3
|
+
"name": "Claude-SearchBot",
|
|
4
|
+
"vendor": "Anthropic",
|
|
5
|
+
"userAgent": "Claude-SearchBot",
|
|
6
|
+
"robotsTxtToken": "Claude-SearchBot",
|
|
7
|
+
"purpose": "search",
|
|
8
|
+
"rendersJavaScript": "unknown",
|
|
9
|
+
"respectsRobotsTxt": true,
|
|
10
|
+
"crawlDelaySupported": "unknown",
|
|
11
|
+
"ipRangesUrl": null,
|
|
12
|
+
"docs": "https://privacy.claude.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler",
|
|
13
|
+
"confidence": {
|
|
14
|
+
"rendersJavaScript": {
|
|
15
|
+
"value": "unknown",
|
|
16
|
+
"level": "inferred",
|
|
17
|
+
"source": "Not documented by Anthropic"
|
|
18
|
+
},
|
|
19
|
+
"respectsRobotsTxt": {
|
|
20
|
+
"value": true,
|
|
21
|
+
"level": "official",
|
|
22
|
+
"source": "Official docs, but notes blocking may reduce visibility and accuracy in user search results"
|
|
23
|
+
}
|
|
24
|
+
},
|
|
25
|
+
"lastVerified": "2026-04-11",
|
|
26
|
+
"relatedBots": ["claudebot", "claude-user"],
|
|
27
|
+
"notes": "Navigates the web to improve search result quality. Focused on search indexing, not training."
|
|
28
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "claude-user",
|
|
3
|
+
"name": "Claude-User",
|
|
4
|
+
"vendor": "Anthropic",
|
|
5
|
+
"userAgent": "Claude-User",
|
|
6
|
+
"robotsTxtToken": "Claude-User",
|
|
7
|
+
"purpose": "user-initiated",
|
|
8
|
+
"rendersJavaScript": "unknown",
|
|
9
|
+
"respectsRobotsTxt": true,
|
|
10
|
+
"crawlDelaySupported": "unknown",
|
|
11
|
+
"ipRangesUrl": null,
|
|
12
|
+
"docs": "https://privacy.claude.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler",
|
|
13
|
+
"confidence": {
|
|
14
|
+
"rendersJavaScript": {
|
|
15
|
+
"value": "unknown",
|
|
16
|
+
"level": "inferred",
|
|
17
|
+
"source": "Not documented by Anthropic"
|
|
18
|
+
},
|
|
19
|
+
"respectsRobotsTxt": {
|
|
20
|
+
"value": true,
|
|
21
|
+
"level": "official",
|
|
22
|
+
"source": "Official docs, but notes blocking may reduce visibility for user-directed web search"
|
|
23
|
+
}
|
|
24
|
+
},
|
|
25
|
+
"lastVerified": "2026-04-11",
|
|
26
|
+
"relatedBots": ["claudebot", "claude-searchbot"],
|
|
27
|
+
"notes": "When individuals ask questions to Claude, it may access websites. Blocking prevents Claude from retrieving content in response to user queries."
|
|
28
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "claudebot",
|
|
3
|
+
"name": "ClaudeBot",
|
|
4
|
+
"vendor": "Anthropic",
|
|
5
|
+
"userAgent": "ClaudeBot",
|
|
6
|
+
"robotsTxtToken": "ClaudeBot",
|
|
7
|
+
"purpose": "training",
|
|
8
|
+
"rendersJavaScript": false,
|
|
9
|
+
"respectsRobotsTxt": true,
|
|
10
|
+
"crawlDelaySupported": true,
|
|
11
|
+
"ipRangesUrl": null,
|
|
12
|
+
"docs": "https://privacy.claude.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler",
|
|
13
|
+
"confidence": {
|
|
14
|
+
"rendersJavaScript": {
|
|
15
|
+
"value": false,
|
|
16
|
+
"level": "observed",
|
|
17
|
+
"source": "Observational evidence consistent with no JS rendering"
|
|
18
|
+
},
|
|
19
|
+
"respectsRobotsTxt": {
|
|
20
|
+
"value": true,
|
|
21
|
+
"level": "official",
|
|
22
|
+
"source": "https://privacy.claude.com"
|
|
23
|
+
}
|
|
24
|
+
},
|
|
25
|
+
"lastVerified": "2026-04-11",
|
|
26
|
+
"relatedBots": ["claude-user", "claude-searchbot"],
|
|
27
|
+
"notes": "Collects web content that could potentially contribute to AI model training. Crawl-delay explicitly supported (non-standard). Blocking IP addresses will not reliably work."
|
|
28
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "googlebot",
|
|
3
|
+
"name": "Googlebot",
|
|
4
|
+
"vendor": "Google",
|
|
5
|
+
"userAgent": "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/W.X.Y.Z Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
|
|
6
|
+
"robotsTxtToken": "Googlebot",
|
|
7
|
+
"purpose": "search-indexing",
|
|
8
|
+
"rendersJavaScript": true,
|
|
9
|
+
"respectsRobotsTxt": true,
|
|
10
|
+
"crawlDelaySupported": false,
|
|
11
|
+
"ipRangesUrl": null,
|
|
12
|
+
"docs": "https://developers.google.com/search/docs/crawling-indexing/javascript/fix-search-javascript",
|
|
13
|
+
"confidence": {
|
|
14
|
+
"rendersJavaScript": {
|
|
15
|
+
"value": true,
|
|
16
|
+
"level": "official",
|
|
17
|
+
"source": "https://developers.google.com/search/docs/crawling-indexing/javascript/fix-search-javascript"
|
|
18
|
+
},
|
|
19
|
+
"respectsRobotsTxt": {
|
|
20
|
+
"value": true,
|
|
21
|
+
"level": "official",
|
|
22
|
+
"source": "RFC 9309 compliant"
|
|
23
|
+
}
|
|
24
|
+
},
|
|
25
|
+
"lastVerified": "2026-04-11",
|
|
26
|
+
"relatedBots": [],
|
|
27
|
+
"notes": "Two-phase: initial fetch (HTML) then queued render (headless Chrome via WRS). Evergreen Chromium. Stateless sessions. ~5s default timeout. Mobile-first indexing."
|
|
28
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "gptbot",
|
|
3
|
+
"name": "GPTBot",
|
|
4
|
+
"vendor": "OpenAI",
|
|
5
|
+
"userAgent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.3; +https://openai.com/gptbot",
|
|
6
|
+
"robotsTxtToken": "GPTBot",
|
|
7
|
+
"purpose": "training",
|
|
8
|
+
"rendersJavaScript": false,
|
|
9
|
+
"respectsRobotsTxt": true,
|
|
10
|
+
"crawlDelaySupported": "unknown",
|
|
11
|
+
"ipRangesUrl": "https://openai.com/gptbot.json",
|
|
12
|
+
"docs": "https://developers.openai.com/api/docs/bots",
|
|
13
|
+
"confidence": {
|
|
14
|
+
"rendersJavaScript": {
|
|
15
|
+
"value": false,
|
|
16
|
+
"level": "observed",
|
|
17
|
+
"source": "Multiple third-party tests with JS-only pages show empty content"
|
|
18
|
+
},
|
|
19
|
+
"respectsRobotsTxt": {
|
|
20
|
+
"value": true,
|
|
21
|
+
"level": "official",
|
|
22
|
+
"source": "https://developers.openai.com/api/docs/bots"
|
|
23
|
+
}
|
|
24
|
+
},
|
|
25
|
+
"lastVerified": "2026-04-11",
|
|
26
|
+
"relatedBots": ["oai-searchbot", "chatgpt-user"],
|
|
27
|
+
"notes": "Disallowing GPTBot indicates a site's content should not be used in training generative AI foundation models."
|
|
28
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "oai-searchbot",
|
|
3
|
+
"name": "OAI-SearchBot",
|
|
4
|
+
"vendor": "OpenAI",
|
|
5
|
+
"userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36; compatible; OAI-SearchBot/1.3; +https://openai.com/searchbot",
|
|
6
|
+
"robotsTxtToken": "OAI-SearchBot",
|
|
7
|
+
"purpose": "search",
|
|
8
|
+
"rendersJavaScript": "unknown",
|
|
9
|
+
"respectsRobotsTxt": true,
|
|
10
|
+
"crawlDelaySupported": "unknown",
|
|
11
|
+
"ipRangesUrl": "https://openai.com/searchbot.json",
|
|
12
|
+
"docs": "https://developers.openai.com/api/docs/bots",
|
|
13
|
+
"confidence": {
|
|
14
|
+
"rendersJavaScript": {
|
|
15
|
+
"value": "unknown",
|
|
16
|
+
"level": "inferred",
|
|
17
|
+
"source": "UA mimics Chrome 131 — may indicate rendering capability, but unconfirmed"
|
|
18
|
+
},
|
|
19
|
+
"respectsRobotsTxt": {
|
|
20
|
+
"value": true,
|
|
21
|
+
"level": "official",
|
|
22
|
+
"source": "https://developers.openai.com/api/docs/bots"
|
|
23
|
+
}
|
|
24
|
+
},
|
|
25
|
+
"lastVerified": "2026-04-11",
|
|
26
|
+
"relatedBots": ["gptbot", "chatgpt-user"],
|
|
27
|
+
"notes": "Sites opted out of OAI-SearchBot will not be shown in ChatGPT search answers, though can still appear as navigational links."
|
|
28
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "perplexity-user",
|
|
3
|
+
"name": "Perplexity-User",
|
|
4
|
+
"vendor": "Perplexity",
|
|
5
|
+
"userAgent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Perplexity-User/1.0; +https://perplexity.ai/perplexity-user)",
|
|
6
|
+
"robotsTxtToken": "Perplexity-User",
|
|
7
|
+
"purpose": "user-initiated",
|
|
8
|
+
"rendersJavaScript": "unknown",
|
|
9
|
+
"respectsRobotsTxt": false,
|
|
10
|
+
"crawlDelaySupported": "unknown",
|
|
11
|
+
"ipRangesUrl": "https://www.perplexity.com/perplexity-user.json",
|
|
12
|
+
"docs": "https://docs.perplexity.ai/docs/resources/perplexity-crawlers",
|
|
13
|
+
"confidence": {
|
|
14
|
+
"rendersJavaScript": {
|
|
15
|
+
"value": "unknown",
|
|
16
|
+
"level": "inferred",
|
|
17
|
+
"source": "Not documented by Perplexity"
|
|
18
|
+
},
|
|
19
|
+
"respectsRobotsTxt": {
|
|
20
|
+
"value": false,
|
|
21
|
+
"level": "official",
|
|
22
|
+
"source": "Official docs state: 'Since a user requested the fetch, this fetcher generally ignores robots.txt rules.'"
|
|
23
|
+
}
|
|
24
|
+
},
|
|
25
|
+
"lastVerified": "2026-04-11",
|
|
26
|
+
"relatedBots": ["perplexitybot"],
|
|
27
|
+
"notes": "Supports user actions within Perplexity. Not used for web crawling or AI training. Generally ignores robots.txt since fetches are user-initiated."
|
|
28
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "perplexitybot",
|
|
3
|
+
"name": "PerplexityBot",
|
|
4
|
+
"vendor": "Perplexity",
|
|
5
|
+
"userAgent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; PerplexityBot/1.0; +https://perplexity.ai/perplexitybot)",
|
|
6
|
+
"robotsTxtToken": "PerplexityBot",
|
|
7
|
+
"purpose": "search-indexing",
|
|
8
|
+
"rendersJavaScript": false,
|
|
9
|
+
"respectsRobotsTxt": true,
|
|
10
|
+
"crawlDelaySupported": "unknown",
|
|
11
|
+
"ipRangesUrl": "https://www.perplexity.com/perplexitybot.json",
|
|
12
|
+
"docs": "https://docs.perplexity.ai/docs/resources/perplexity-crawlers",
|
|
13
|
+
"confidence": {
|
|
14
|
+
"rendersJavaScript": {
|
|
15
|
+
"value": false,
|
|
16
|
+
"level": "observed",
|
|
17
|
+
"source": "Most third-party reports indicate no JS rendering"
|
|
18
|
+
},
|
|
19
|
+
"respectsRobotsTxt": {
|
|
20
|
+
"value": true,
|
|
21
|
+
"level": "official",
|
|
22
|
+
"source": "https://docs.perplexity.ai/docs/resources/perplexity-crawlers"
|
|
23
|
+
}
|
|
24
|
+
},
|
|
25
|
+
"lastVerified": "2026-04-11",
|
|
26
|
+
"relatedBots": ["perplexity-user"],
|
|
27
|
+
"notes": "Designed to surface and link websites in search results on Perplexity. NOT used to crawl content for AI foundation models. Changes may take up to 24 hours to reflect."
|
|
28
|
+
}
|
package/scripts/_lib.sh
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# crawl-sim shared helpers. Source this from other scripts:
|
|
3
|
+
# SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
4
|
+
# . "$SCRIPT_DIR/_lib.sh"
|
|
5
|
+
|
|
6
|
+
# Reduce a URL to its "https://host[:port]" origin.
# Input that does not look like an http(s) URL passes through unchanged
# (the sed substitution simply does not fire).
origin_from_url() {
  local url="$1"
  printf '%s' "$url" | sed -E 's#(^https?://[^/]+).*#\1#'
}
|
|
10
|
+
|
|
11
|
+
# Host of a URL with any "www." prefix removed (port, if present, is kept).
# Uses shell parameter expansion instead of spawning sed.
host_from_url() {
  local rest="${1#http://}"
  rest="${rest#https://}"
  rest="${rest%%/*}"
  printf '%s' "${rest#www.}"
}
|
|
15
|
+
|
|
16
|
+
# Path component of a URL (everything after the authority). An empty path
# (e.g. "https://example.com") is reported as "/".
path_from_url() {
  local remainder
  remainder=$(printf '%s' "$1" | sed -E 's#^https?://[^/]+##')
  if [ -n "$remainder" ]; then
    printf '%s' "$remainder"
  else
    printf '/'
  fi
}
|
|
22
|
+
|
|
23
|
+
# Directory portion of a URL's path, for resolving relative links against a
# base page URL. The result always ends with "/".
# Example: https://example.com/blog/index.html -> https://example.com/blog/
dir_from_url() {
  local base
  base=$(origin_from_url "$1")
  local rel
  rel=$(path_from_url "$1")
  # A path already ending in "/" is its own directory; otherwise drop the
  # final segment and restore the trailing slash.
  if [ "${rel%/}" = "$rel" ]; then
    rel="$(printf '%s' "$rel" | sed -E 's#/[^/]*$##')/"
  fi
  printf '%s%s' "$base" "$rel"
}
|
|
38
|
+
|
|
39
|
+
# Count visible words in an HTML file: strip tags, split on whitespace, and
# count tokens containing at least one alphanumeric character.
count_words() {
  local html_file="$1"
  sed 's/<[^>]*>//g' "$html_file" \
    | tr -s '[:space:]' '\n' \
    | grep -c '[a-zA-Z0-9]' || true
}
|
|
43
|
+
|
|
44
|
+
# Fetch a URL to a local file and print the HTTP status code on stdout.
# Usage: status=$(fetch_to_file <url> <output-file> [timeout-seconds])
# On any transport failure the printed status is "000".
fetch_to_file() {
  local url="$1"
  local out="$2"
  local timeout="${3:-15}"
  local code
  # NOTE: modern curl writes the -w '%{http_code}' string ("000") even when
  # the transfer fails, so an unconditional `|| echo "000"` would emit the
  # sentinel twice ("000000"). Fall back to "000" only if nothing was captured.
  code=$(curl -sS -L -o "$out" -w '%{http_code}' --max-time "$timeout" "$url" 2>/dev/null) \
    || code="${code:-000}"
  printf '%s' "${code:-000}"
}
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
set -eu
|
|
3
|
+
|
|
4
|
+
# check-llmstxt.sh — Check for llms.txt and llms-full.txt presence + structure
|
|
5
|
+
# Usage: check-llmstxt.sh <url>
|
|
6
|
+
|
|
7
|
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
8
|
+
# shellcheck source=_lib.sh
|
|
9
|
+
. "$SCRIPT_DIR/_lib.sh"
|
|
10
|
+
|
|
11
|
+
URL="${1:?Usage: check-llmstxt.sh <url>}"
|
|
12
|
+
printf '[check-llmstxt] %s\n' "$URL" >&2
|
|
13
|
+
ORIGIN=$(origin_from_url "$URL")
|
|
14
|
+
|
|
15
|
+
TMPDIR="${TMPDIR:-/tmp}"
|
|
16
|
+
LLMS_FILE=$(mktemp "$TMPDIR/crawlsim-llms.XXXXXX")
|
|
17
|
+
LLMS_FULL_FILE=$(mktemp "$TMPDIR/crawlsim-llms-full.XXXXXX")
|
|
18
|
+
trap 'rm -f "$LLMS_FILE" "$LLMS_FULL_FILE"' EXIT
|
|
19
|
+
|
|
20
|
+
analyze_file() {
  # Analyze a fetched llms.txt-style file and publish results through the
  # globals EXISTS, LINE_COUNT, HAS_TITLE, TITLE, HAS_DESCRIPTION, URL_COUNT
  # (bash functions cannot return structured data).
  # $1 = path to fetched body, $2 = HTTP status code from the fetch.
  local body="$1"
  local code="$2"

  # Defaults: "not present".
  EXISTS=false
  LINE_COUNT=0
  HAS_TITLE=false
  TITLE=""
  HAS_DESCRIPTION=false
  URL_COUNT=0

  # Non-200 or empty responses count as absent.
  [ "$code" = "200" ] || return 0
  [ -s "$body" ] || return 0

  # Heuristic: an HTML document here means the site served a fallback page,
  # not a real llms.txt.
  local head_lc
  head_lc=$(head -c 100 "$body" | tr '[:upper:]' '[:lower:]')
  case "$head_lc" in
    *"<!doctype"*|*"<html"*) return 0 ;;
  esac

  EXISTS=true
  LINE_COUNT=$(wc -l < "$body" | tr -d ' ')

  # Title: first line starting with "# "
  if head -1 "$body" | grep -qE '^#[[:space:]]+'; then
    HAS_TITLE=true
    TITLE=$(head -1 "$body" | sed -E 's/^#[[:space:]]+//' | tr -d '\r')
  fi

  # Description: a "> " blockquote anywhere, or prose in lines 2-5.
  if grep -qE '^>[[:space:]]+' "$body" || sed -n '2,5p' "$body" | grep -qE '^[A-Za-z]'; then
    HAS_DESCRIPTION=true
  fi

  # Count markdown links with absolute http(s) targets.
  URL_COUNT=$(grep -oE '\[[^]]*\]\(https?://[^)]+\)' "$body" 2>/dev/null | wc -l | tr -d ' ' || echo 0)
  return 0
}
|
|
64
|
+
|
|
65
|
+
LLMS_STATUS=$(fetch_to_file "${ORIGIN}/llms.txt" "$LLMS_FILE")
|
|
66
|
+
analyze_file "$LLMS_FILE" "$LLMS_STATUS"
|
|
67
|
+
LLMS_EXISTS=$EXISTS
|
|
68
|
+
LLMS_LINES=$LINE_COUNT
|
|
69
|
+
LLMS_HAS_TITLE=$HAS_TITLE
|
|
70
|
+
LLMS_TITLE=$TITLE
|
|
71
|
+
LLMS_HAS_DESC=$HAS_DESCRIPTION
|
|
72
|
+
LLMS_URLS=$URL_COUNT
|
|
73
|
+
|
|
74
|
+
LLMS_FULL_STATUS=$(fetch_to_file "${ORIGIN}/llms-full.txt" "$LLMS_FULL_FILE")
|
|
75
|
+
analyze_file "$LLMS_FULL_FILE" "$LLMS_FULL_STATUS"
|
|
76
|
+
LLMS_FULL_EXISTS=$EXISTS
|
|
77
|
+
LLMS_FULL_LINES=$LINE_COUNT
|
|
78
|
+
LLMS_FULL_HAS_TITLE=$HAS_TITLE
|
|
79
|
+
LLMS_FULL_HAS_DESC=$HAS_DESCRIPTION
|
|
80
|
+
LLMS_FULL_URLS=$URL_COUNT
|
|
81
|
+
|
|
82
|
+
jq -n \
|
|
83
|
+
--arg url "$URL" \
|
|
84
|
+
--arg llmsUrl "${ORIGIN}/llms.txt" \
|
|
85
|
+
--arg llmsFullUrl "${ORIGIN}/llms-full.txt" \
|
|
86
|
+
--argjson llmsExists "$LLMS_EXISTS" \
|
|
87
|
+
--argjson llmsLines "$LLMS_LINES" \
|
|
88
|
+
--argjson llmsHasTitle "$LLMS_HAS_TITLE" \
|
|
89
|
+
--arg llmsTitle "$LLMS_TITLE" \
|
|
90
|
+
--argjson llmsHasDesc "$LLMS_HAS_DESC" \
|
|
91
|
+
--argjson llmsUrls "$LLMS_URLS" \
|
|
92
|
+
--argjson llmsFullExists "$LLMS_FULL_EXISTS" \
|
|
93
|
+
--argjson llmsFullLines "$LLMS_FULL_LINES" \
|
|
94
|
+
--argjson llmsFullHasTitle "$LLMS_FULL_HAS_TITLE" \
|
|
95
|
+
--argjson llmsFullHasDesc "$LLMS_FULL_HAS_DESC" \
|
|
96
|
+
--argjson llmsFullUrls "$LLMS_FULL_URLS" \
|
|
97
|
+
'{
|
|
98
|
+
url: $url,
|
|
99
|
+
llmsTxt: {
|
|
100
|
+
url: $llmsUrl,
|
|
101
|
+
exists: $llmsExists,
|
|
102
|
+
lineCount: $llmsLines,
|
|
103
|
+
hasTitle: $llmsHasTitle,
|
|
104
|
+
title: (if $llmsTitle == "" then null else $llmsTitle end),
|
|
105
|
+
hasDescription: $llmsHasDesc,
|
|
106
|
+
urlCount: $llmsUrls
|
|
107
|
+
},
|
|
108
|
+
llmsFullTxt: {
|
|
109
|
+
url: $llmsFullUrl,
|
|
110
|
+
exists: $llmsFullExists,
|
|
111
|
+
lineCount: $llmsFullLines,
|
|
112
|
+
hasTitle: $llmsFullHasTitle,
|
|
113
|
+
hasDescription: $llmsFullHasDesc,
|
|
114
|
+
urlCount: $llmsFullUrls
|
|
115
|
+
}
|
|
116
|
+
}'
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
set -eu
|
|
3
|
+
|
|
4
|
+
# check-robots.sh — Fetch robots.txt and parse rules for a given UA token
|
|
5
|
+
# Usage: check-robots.sh <url> <ua-token>
|
|
6
|
+
|
|
7
|
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
8
|
+
# shellcheck source=_lib.sh
|
|
9
|
+
. "$SCRIPT_DIR/_lib.sh"
|
|
10
|
+
|
|
11
|
+
URL="${1:?Usage: check-robots.sh <url> <ua-token>}"
|
|
12
|
+
UA_TOKEN="${2:?Usage: check-robots.sh <url> <ua-token>}"
|
|
13
|
+
|
|
14
|
+
printf '[check-robots] %s for %s\n' "$URL" "$UA_TOKEN" >&2
|
|
15
|
+
|
|
16
|
+
ORIGIN=$(origin_from_url "$URL")
|
|
17
|
+
URL_PATH=$(path_from_url "$URL")
|
|
18
|
+
ROBOTS_URL="${ORIGIN}/robots.txt"
|
|
19
|
+
|
|
20
|
+
TMPDIR="${TMPDIR:-/tmp}"
|
|
21
|
+
ROBOTS_FILE=$(mktemp "$TMPDIR/crawlsim-robots.XXXXXX")
|
|
22
|
+
RAW_FILE=$(mktemp "$TMPDIR/crawlsim-robots-raw.XXXXXX")
|
|
23
|
+
DISALLOWED_PATHS_FILE=$(mktemp "$TMPDIR/crawlsim-disallowed.XXXXXX")
|
|
24
|
+
ALLOW_PATHS_FILE=$(mktemp "$TMPDIR/crawlsim-allow.XXXXXX")
|
|
25
|
+
SITEMAPS_FILE=$(mktemp "$TMPDIR/crawlsim-sitemaps.XXXXXX")
|
|
26
|
+
trap 'rm -f "$ROBOTS_FILE" "$RAW_FILE" "$DISALLOWED_PATHS_FILE" "$ALLOW_PATHS_FILE" "$SITEMAPS_FILE"' EXIT
|
|
27
|
+
|
|
28
|
+
HTTP_STATUS=$(fetch_to_file "$ROBOTS_URL" "$ROBOTS_FILE")
|
|
29
|
+
|
|
30
|
+
EXISTS=false
|
|
31
|
+
if [ "$HTTP_STATUS" = "200" ] && [ -s "$ROBOTS_FILE" ]; then
|
|
32
|
+
EXISTS=true
|
|
33
|
+
fi
|
|
34
|
+
|
|
35
|
+
ALLOWED=true
|
|
36
|
+
CRAWL_DELAY="null"
|
|
37
|
+
|
|
38
|
+
if [ "$EXISTS" = "true" ]; then
|
|
39
|
+
# Extract sitemap directives
|
|
40
|
+
grep -iE '^[[:space:]]*sitemap[[:space:]]*:' "$ROBOTS_FILE" 2>/dev/null \
|
|
41
|
+
| sed -E 's/^[[:space:]]*[sS][iI][tT][eE][mM][aA][pP][[:space:]]*:[[:space:]]*//' \
|
|
42
|
+
| tr -d '\r' \
|
|
43
|
+
| sed -E 's/[[:space:]]+$//' \
|
|
44
|
+
> "$SITEMAPS_FILE" || true
|
|
45
|
+
|
|
46
|
+
# Parse User-agent blocks using portable awk
|
|
47
|
+
# State machine: track current UA group(s), emit rules tagged EXACT_ or WILD_
|
|
48
|
+
awk -v ua="$UA_TOKEN" '
|
|
49
|
+
function lower(s) { return tolower(s) }
|
|
50
|
+
function trim(s) {
|
|
51
|
+
sub(/^[ \t\r]+/, "", s)
|
|
52
|
+
sub(/[ \t\r]+$/, "", s)
|
|
53
|
+
return s
|
|
54
|
+
}
|
|
55
|
+
function parse_directive(line, colon, key, val) {
|
|
56
|
+
colon = index(line, ":")
|
|
57
|
+
if (colon == 0) return ""
|
|
58
|
+
key = lower(trim(substr(line, 1, colon - 1)))
|
|
59
|
+
val = trim(substr(line, colon + 1))
|
|
60
|
+
return key "\t" val
|
|
61
|
+
}
|
|
62
|
+
function emit(kind, value, i, u) {
|
|
63
|
+
for (i = 1; i <= n_uas; i++) {
|
|
64
|
+
u = uas[i]
|
|
65
|
+
if (lower(u) == lower(ua)) {
|
|
66
|
+
print "EXACT_" kind "\t" value
|
|
67
|
+
}
|
|
68
|
+
if (u == "*") {
|
|
69
|
+
print "WILD_" kind "\t" value
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
BEGIN { n_uas = 0; prev_was_rule = 0 }
|
|
74
|
+
{
|
|
75
|
+
line = $0
|
|
76
|
+
# Strip comments
|
|
77
|
+
hash = index(line, "#")
|
|
78
|
+
if (hash > 0) line = substr(line, 1, hash - 1)
|
|
79
|
+
line = trim(line)
|
|
80
|
+
if (line == "") next
|
|
81
|
+
|
|
82
|
+
parsed = parse_directive(line)
|
|
83
|
+
if (parsed == "") next
|
|
84
|
+
|
|
85
|
+
tab = index(parsed, "\t")
|
|
86
|
+
key = substr(parsed, 1, tab - 1)
|
|
87
|
+
val = substr(parsed, tab + 1)
|
|
88
|
+
|
|
89
|
+
if (key == "user-agent") {
|
|
90
|
+
if (prev_was_rule) {
|
|
91
|
+
n_uas = 0
|
|
92
|
+
prev_was_rule = 0
|
|
93
|
+
}
|
|
94
|
+
n_uas++
|
|
95
|
+
uas[n_uas] = val
|
|
96
|
+
next
|
|
97
|
+
}
|
|
98
|
+
if (key == "disallow") { prev_was_rule = 1; emit("DISALLOW", val); next }
|
|
99
|
+
if (key == "allow") { prev_was_rule = 1; emit("ALLOW", val); next }
|
|
100
|
+
if (key == "crawl-delay") { prev_was_rule = 1; emit("DELAY", val); next }
|
|
101
|
+
}
|
|
102
|
+
' "$ROBOTS_FILE" > "$RAW_FILE"
|
|
103
|
+
|
|
104
|
+
# Prefer exact UA rules if present, else wildcard
|
|
105
|
+
PREFIX="WILD_"
|
|
106
|
+
if grep -q '^EXACT_' "$RAW_FILE"; then
|
|
107
|
+
PREFIX="EXACT_"
|
|
108
|
+
fi
|
|
109
|
+
|
|
110
|
+
grep "^${PREFIX}DISALLOW" "$RAW_FILE" 2>/dev/null \
|
|
111
|
+
| cut -f2- \
|
|
112
|
+
| grep -v '^$' \
|
|
113
|
+
> "$DISALLOWED_PATHS_FILE" || true
|
|
114
|
+
|
|
115
|
+
grep "^${PREFIX}ALLOW" "$RAW_FILE" 2>/dev/null \
|
|
116
|
+
| cut -f2- \
|
|
117
|
+
> "$ALLOW_PATHS_FILE" || true
|
|
118
|
+
|
|
119
|
+
DELAY_LINE=$(grep "^${PREFIX}DELAY" "$RAW_FILE" 2>/dev/null | head -1 | cut -f2- || true)
|
|
120
|
+
if [ -n "$DELAY_LINE" ]; then
|
|
121
|
+
if printf '%s' "$DELAY_LINE" | grep -qE '^[0-9]+(\.[0-9]+)?$'; then
|
|
122
|
+
CRAWL_DELAY="$DELAY_LINE"
|
|
123
|
+
fi
|
|
124
|
+
fi
|
|
125
|
+
|
|
126
|
+
# Longest-match path check (allow overrides disallow at equal or longer length)
|
|
127
|
+
BEST_MATCH_LEN=-1
|
|
128
|
+
BEST_MATCH_KIND="allow"
|
|
129
|
+
|
|
130
|
+
match_pattern() {
  # Match a URL path against a robots.txt pattern (prefix match).
  # Supported metacharacters per RFC 9309: '*' matches any sequence, and a
  # trailing '$' anchors the pattern to the end of the path.
  local pat="$1"
  local path="$2"
  # BUG FIX: the escape step below escapes '$' as a literal character, so a
  # trailing end-anchor (e.g. "/*.pdf$") could never match a real path.
  # Peel a trailing '$' off first and re-attach it as a real regex anchor;
  # any interior '$' is still escaped as a literal.
  local anchor=""
  case "$pat" in
    *'$') anchor='$'; pat="${pat%?}" ;;
  esac
  # Escape regex special chars except '*', which becomes '.*'
  local esc
  esc=$(printf '%s' "$pat" | sed 's/[].[\^$()+?{|]/\\&/g' | sed 's/\*/.*/g')
  printf '%s' "$path" | grep -qE "^${esc}${anchor}"
}
|
|
139
|
+
|
|
140
|
+
while IFS= read -r pat; do
|
|
141
|
+
[ -z "$pat" ] && continue
|
|
142
|
+
if match_pattern "$pat" "$URL_PATH"; then
|
|
143
|
+
PAT_LEN=${#pat}
|
|
144
|
+
if [ "$PAT_LEN" -gt "$BEST_MATCH_LEN" ]; then
|
|
145
|
+
BEST_MATCH_LEN=$PAT_LEN
|
|
146
|
+
BEST_MATCH_KIND="disallow"
|
|
147
|
+
fi
|
|
148
|
+
fi
|
|
149
|
+
done < "$DISALLOWED_PATHS_FILE"
|
|
150
|
+
|
|
151
|
+
while IFS= read -r pat; do
|
|
152
|
+
[ -z "$pat" ] && continue
|
|
153
|
+
if match_pattern "$pat" "$URL_PATH"; then
|
|
154
|
+
PAT_LEN=${#pat}
|
|
155
|
+
if [ "$PAT_LEN" -ge "$BEST_MATCH_LEN" ]; then
|
|
156
|
+
BEST_MATCH_LEN=$PAT_LEN
|
|
157
|
+
BEST_MATCH_KIND="allow"
|
|
158
|
+
fi
|
|
159
|
+
fi
|
|
160
|
+
done < "$ALLOW_PATHS_FILE"
|
|
161
|
+
|
|
162
|
+
if [ "$BEST_MATCH_KIND" = "disallow" ]; then
|
|
163
|
+
ALLOWED=false
|
|
164
|
+
fi
|
|
165
|
+
fi
|
|
166
|
+
|
|
167
|
+
# Build JSON arrays
|
|
168
|
+
DISALLOWED_JSON="[]"
|
|
169
|
+
if [ -s "$DISALLOWED_PATHS_FILE" ]; then
|
|
170
|
+
DISALLOWED_JSON=$(head -100 "$DISALLOWED_PATHS_FILE" | jq -R . | jq -s .)
|
|
171
|
+
fi
|
|
172
|
+
|
|
173
|
+
SITEMAPS_JSON="[]"
|
|
174
|
+
if [ -s "$SITEMAPS_FILE" ]; then
|
|
175
|
+
SITEMAPS_JSON=$(jq -R . < "$SITEMAPS_FILE" | jq -s .)
|
|
176
|
+
fi
|
|
177
|
+
|
|
178
|
+
jq -n \
|
|
179
|
+
--arg url "$URL" \
|
|
180
|
+
--arg uaToken "$UA_TOKEN" \
|
|
181
|
+
--arg robotsUrl "$ROBOTS_URL" \
|
|
182
|
+
--argjson exists "$EXISTS" \
|
|
183
|
+
--argjson allowed "$ALLOWED" \
|
|
184
|
+
--argjson crawlDelay "$CRAWL_DELAY" \
|
|
185
|
+
--argjson disallowedPaths "$DISALLOWED_JSON" \
|
|
186
|
+
--argjson sitemaps "$SITEMAPS_JSON" \
|
|
187
|
+
'{
|
|
188
|
+
url: $url,
|
|
189
|
+
uaToken: $uaToken,
|
|
190
|
+
robotsUrl: $robotsUrl,
|
|
191
|
+
exists: $exists,
|
|
192
|
+
allowed: $allowed,
|
|
193
|
+
crawlDelay: $crawlDelay,
|
|
194
|
+
disallowedPaths: $disallowedPaths,
|
|
195
|
+
sitemaps: $sitemaps
|
|
196
|
+
}'
|