@braedenbuilds/crawl-sim 1.0.5 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +15 -0
- package/.claude-plugin/plugin.json +13 -0
- package/README.md +32 -9
- package/bin/install.js +6 -2
- package/package.json +8 -3
- package/{SKILL.md → skills/crawl-sim/SKILL.md} +23 -2
- package/{scripts → skills/crawl-sim/scripts}/_lib.sh +30 -0
- package/skills/crawl-sim/scripts/compute-score.sh +744 -0
- package/{scripts → skills/crawl-sim/scripts}/extract-jsonld.sh +12 -0
- package/skills/crawl-sim/scripts/fetch-as-bot.sh +151 -0
- package/skills/crawl-sim/scripts/schema-fields.sh +25 -0
- package/scripts/compute-score.sh +0 -424
- package/scripts/fetch-as-bot.sh +0 -87
- /package/{profiles → skills/crawl-sim/profiles}/chatgpt-user.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/claude-searchbot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/claude-user.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/claudebot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/googlebot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/gptbot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/oai-searchbot.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/perplexity-user.json +0 -0
- /package/{profiles → skills/crawl-sim/profiles}/perplexitybot.json +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/check-llmstxt.sh +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/check-robots.sh +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/check-sitemap.sh +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/diff-render.sh +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/extract-links.sh +0 -0
- /package/{scripts → skills/crawl-sim/scripts}/extract-meta.sh +0 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "crawl-sim",
|
|
3
|
+
"owner": {
|
|
4
|
+
"name": "BraedenBDev",
|
|
5
|
+
"url": "https://github.com/BraedenBDev"
|
|
6
|
+
},
|
|
7
|
+
"plugins": [
|
|
8
|
+
{
|
|
9
|
+
"name": "crawl-sim",
|
|
10
|
+
"source": "./",
|
|
11
|
+
"description": "Multi-bot web crawler simulator — audit how Googlebot, GPTBot, ClaudeBot, and PerplexityBot see your site",
|
|
12
|
+
"version": "1.2.0"
|
|
13
|
+
}
|
|
14
|
+
]
|
|
15
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "crawl-sim",
|
|
3
|
+
"version": "1.2.0",
|
|
4
|
+
"description": "Multi-bot web crawler simulator — audit how Googlebot, GPTBot, ClaudeBot, and PerplexityBot see your site",
|
|
5
|
+
"author": {
|
|
6
|
+
"name": "BraedenBDev",
|
|
7
|
+
"url": "https://github.com/BraedenBDev"
|
|
8
|
+
},
|
|
9
|
+
"homepage": "https://github.com/BraedenBDev/crawl-sim#readme",
|
|
10
|
+
"repository": "https://github.com/BraedenBDev/crawl-sim",
|
|
11
|
+
"license": "MIT",
|
|
12
|
+
"keywords": ["seo", "crawler", "ai-visibility", "claude-code-skill", "googlebot", "gptbot", "claudebot", "perplexitybot"]
|
|
13
|
+
}
|
package/README.md
CHANGED
|
@@ -44,15 +44,20 @@ The concept was validated manually: a curl-as-GPTBot + Claude analysis caught a
|
|
|
44
44
|
|
|
45
45
|
## Quick start
|
|
46
46
|
|
|
47
|
-
###
|
|
47
|
+
### As a Claude Code plugin (recommended)
|
|
48
48
|
|
|
49
|
-
```
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
49
|
+
```
|
|
50
|
+
/plugin install BraedenBDev/crawl-sim@github
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Or add as a marketplace for easy updates:
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
/plugin marketplace add BraedenBDev/crawl-sim
|
|
57
|
+
/plugin install crawl-sim@crawl-sim
|
|
53
58
|
```
|
|
54
59
|
|
|
55
|
-
Then in Claude Code:
|
|
60
|
+
Then invoke:
|
|
56
61
|
|
|
57
62
|
```
|
|
58
63
|
/crawl-sim https://yoursite.com
|
|
@@ -60,7 +65,15 @@ Then in Claude Code:
|
|
|
60
65
|
|
|
61
66
|
Claude runs the full pipeline, interprets the results, and returns a score card plus prioritized findings.
|
|
62
67
|
|
|
63
|
-
|
|
68
|
+
### Via npm (alternative)
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
npm install -g @braedenbuilds/crawl-sim
|
|
72
|
+
crawl-sim install # → ~/.claude/skills/crawl-sim/
|
|
73
|
+
crawl-sim install --project # → .claude/skills/crawl-sim/
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
> **Why `npm install -g` instead of `npx`?** Recent versions of npx have a known issue linking bins for scoped single-bin packages in ephemeral installs. A persistent global install avoids the problem entirely. The git clone path below is the zero-npm fallback.
|
|
64
77
|
|
|
65
78
|
### As a standalone CLI
|
|
66
79
|
|
|
@@ -88,10 +101,13 @@ git clone https://github.com/BraedenBDev/crawl-sim.git ~/.claude/skills/crawl-si
|
|
|
88
101
|
|
|
89
102
|
- **Multi-bot simulation.** Nine verified bot profiles covering Google, OpenAI, Anthropic, and Perplexity — including the bot-vs-user-agent distinction (e.g., `ChatGPT-User` officially ignores robots.txt; `claude-user` respects it).
|
|
90
103
|
- **Quantified scoring.** Each bot is graded 0–100 across five categories with letter grades A through F, plus a weighted composite score.
|
|
104
|
+
- **Page-type-aware rubric.** The structured-data category derives the page type from the URL (`root` / `detail` / `archive` / `faq` / `about` / `contact` / `generic`) and applies a per-type schema rubric. A homepage shipping `Organization` + `WebSite` scores 100 without being penalized for not having `BreadcrumbList` or `FAQPage`. Override the detection with `--page-type <type>` when the URL heuristic picks wrong.
|
|
105
|
+
- **Self-explaining scores.** Every `structuredData` block in the JSON report ships `pageType`, `expected`, `optional`, `forbidden`, `present`, `missing`, `extras`, `violations`, `calculation`, and `notes` — so the narrative layer reads the scorer's reasoning directly instead of guessing what was penalized.
|
|
91
106
|
- **Agent-native interpretation.** The Claude Code skill reads raw data, identifies root causes (framework signals, hydration boundaries, soft-404s), and recommends specific fixes.
|
|
92
107
|
- **Three-layer output.** Terminal score card, prose narrative, and structured JSON — so humans and CI both get what they need.
|
|
93
108
|
- **Confidence transparency.** Every claim is tagged `official`, `observed`, or `inferred`. The skill notes when recommendations depend on observed-but-undocumented behavior.
|
|
94
109
|
- **Shell-native core.** All checks use only `curl` + `jq`. No Node, no Python, no Docker. Each script is independently invokable.
|
|
110
|
+
- **Regression-tested.** `npm test` runs a 37-assertion scoring suite against synthetic fixtures, covering URL→page-type detection, per-type rubrics, missing/forbidden schema flagging, and golden non-structured output.
|
|
95
111
|
- **Extensible.** Drop a new profile JSON into `profiles/` and it's auto-discovered.
|
|
96
112
|
|
|
97
113
|
---
|
|
@@ -107,6 +123,8 @@ git clone https://github.com/BraedenBDev/crawl-sim.git ~/.claude/skills/crawl-si
|
|
|
107
123
|
/crawl-sim https://yoursite.com --json # JSON only (for CI)
|
|
108
124
|
```
|
|
109
125
|
|
|
126
|
+
The skill auto-detects page type from the URL. Pass `--page-type root|detail|archive|faq|about|contact|generic` to the underlying `compute-score.sh` when the URL heuristic picks the wrong type (e.g., a homepage at `/en/` that URL-parses as `generic`).
|
|
127
|
+
|
|
110
128
|
Output is a three-layer report:
|
|
111
129
|
|
|
112
130
|
1. **Score card** — ASCII overview with per-bot and per-category scores.
|
|
@@ -126,6 +144,7 @@ Every script is standalone and outputs JSON to stdout:
|
|
|
126
144
|
./scripts/check-llmstxt.sh https://yoursite.com
|
|
127
145
|
./scripts/check-sitemap.sh https://yoursite.com
|
|
128
146
|
./scripts/compute-score.sh /tmp/audit-data/
|
|
147
|
+
./scripts/compute-score.sh --page-type root /tmp/audit-data/ # override URL heuristic
|
|
129
148
|
```
|
|
130
149
|
|
|
131
150
|
### CI/CD
|
|
@@ -148,7 +167,7 @@ Each bot is scored 0–100 across five weighted categories:
|
|
|
148
167
|
|----------|:------:|----------|
|
|
149
168
|
| **Accessibility** | 25 | robots.txt allows, HTTP 200, response time |
|
|
150
169
|
| **Content Visibility** | 30 | server HTML word count, heading structure, internal links, image alt text |
|
|
151
|
-
| **Structured Data** | 20 | JSON-LD presence, validity, page-
|
|
170
|
+
| **Structured Data** | 20 | JSON-LD presence, validity, page-type-aware `@type` rubric (root / detail / archive / faq / about / contact / generic) |
|
|
152
171
|
| **Technical Signals** | 15 | title / description / canonical / OG meta, sitemap inclusion |
|
|
153
172
|
| **AI Readiness** | 10 | `llms.txt` structure, content citability |
|
|
154
173
|
|
|
@@ -218,6 +237,7 @@ crawl-sim/
|
|
|
218
237
|
├── bin/install.js # npm installer
|
|
219
238
|
├── profiles/ # 9 verified bot profiles (JSON)
|
|
220
239
|
├── scripts/
|
|
240
|
+
│ ├── _lib.sh # shared helpers (URL parsing, page-type detection)
|
|
221
241
|
│ ├── fetch-as-bot.sh # curl with bot UA → JSON (status/headers/body/timing)
|
|
222
242
|
│ ├── extract-meta.sh # title, description, OG, headings, images
|
|
223
243
|
│ ├── extract-jsonld.sh # JSON-LD @type detection
|
|
@@ -227,8 +247,11 @@ crawl-sim/
|
|
|
227
247
|
│ ├── check-sitemap.sh # sitemap.xml URL inclusion
|
|
228
248
|
│ ├── diff-render.sh # optional Playwright server-vs-rendered comparison
|
|
229
249
|
│ └── compute-score.sh # aggregates all checks → per-bot + per-category scores
|
|
250
|
+
├── test/
|
|
251
|
+
│ ├── run-scoring-tests.sh # 37-assertion bash harness (run with `npm test`)
|
|
252
|
+
│ └── fixtures/ # synthetic RUN_DIR fixtures for regression tests
|
|
230
253
|
├── research/ # Verified bot data sources
|
|
231
|
-
└── docs/
|
|
254
|
+
└── docs/ # Design docs, issues, accuracy handoffs
|
|
232
255
|
```
|
|
233
256
|
|
|
234
257
|
The shell scripts are the plumbing. The Claude Code skill is the intelligence — it reads the raw JSON, understands framework context (Next.js, Nuxt, SPAs), identifies root causes, and writes actionable recommendations.
|
package/bin/install.js
CHANGED
|
@@ -14,6 +14,7 @@ const os = require('os');
|
|
|
14
14
|
const { execFileSync } = require('child_process');
|
|
15
15
|
|
|
16
16
|
const SOURCE_DIR = path.resolve(__dirname, '..');
|
|
17
|
+
const SKILL_ROOT = path.resolve(SOURCE_DIR, 'skills', 'crawl-sim');
|
|
17
18
|
const SKILL_FILES = ['SKILL.md'];
|
|
18
19
|
const SKILL_DIRS = ['profiles', 'scripts'];
|
|
19
20
|
|
|
@@ -80,7 +81,9 @@ function install(target) {
|
|
|
80
81
|
fs.mkdirSync(target, { recursive: true });
|
|
81
82
|
|
|
82
83
|
for (const file of SKILL_FILES) {
|
|
83
|
-
|
|
84
|
+
// Look in skills/crawl-sim/ first (canonical), fallback to root (symlink)
|
|
85
|
+
let src = path.join(SKILL_ROOT, file);
|
|
86
|
+
if (!fs.existsSync(src)) src = path.join(SOURCE_DIR, file);
|
|
84
87
|
const dest = path.join(target, file);
|
|
85
88
|
if (fs.existsSync(src)) {
|
|
86
89
|
fs.copyFileSync(src, dest);
|
|
@@ -92,7 +95,8 @@ function install(target) {
|
|
|
92
95
|
}
|
|
93
96
|
|
|
94
97
|
for (const dir of SKILL_DIRS) {
|
|
95
|
-
|
|
98
|
+
let src = path.join(SKILL_ROOT, dir);
|
|
99
|
+
if (!fs.existsSync(src)) src = path.join(SOURCE_DIR, dir);
|
|
96
100
|
const dest = path.join(target, dir);
|
|
97
101
|
if (fs.existsSync(src)) {
|
|
98
102
|
if (fs.existsSync(dest)) {
|
package/package.json
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@braedenbuilds/crawl-sim",
|
|
3
|
-
"version": "1.0
|
|
3
|
+
"version": "1.2.0",
|
|
4
4
|
"description": "Agent-native multi-bot web crawler simulator. See your site through the eyes of Googlebot, GPTBot, ClaudeBot, and PerplexityBot.",
|
|
5
5
|
"bin": {
|
|
6
6
|
"crawl-sim": "bin/install.js"
|
|
7
7
|
},
|
|
8
|
+
"scripts": {
|
|
9
|
+
"test": "./test/run-scoring-tests.sh"
|
|
10
|
+
},
|
|
8
11
|
"keywords": [
|
|
9
12
|
"seo",
|
|
10
13
|
"crawler",
|
|
@@ -37,9 +40,11 @@
|
|
|
37
40
|
},
|
|
38
41
|
"files": [
|
|
39
42
|
"bin/",
|
|
43
|
+
"skills/",
|
|
44
|
+
".claude-plugin/",
|
|
40
45
|
"SKILL.md",
|
|
41
|
-
"profiles
|
|
42
|
-
"scripts
|
|
46
|
+
"profiles",
|
|
47
|
+
"scripts",
|
|
43
48
|
"README.md",
|
|
44
49
|
"LICENSE"
|
|
45
50
|
]
|
|
@@ -40,7 +40,10 @@ command -v curl >/dev/null 2>&1 || { echo "ERROR: curl is required"; exit 1; }
|
|
|
40
40
|
command -v jq >/dev/null 2>&1 || { echo "ERROR: jq is required (brew install jq)"; exit 1; }
|
|
41
41
|
```
|
|
42
42
|
|
|
43
|
-
Locate the skill directory
|
|
43
|
+
Locate the skill directory. Check in this order:
|
|
44
|
+
1. `$CLAUDE_PLUGIN_ROOT/skills/crawl-sim` (plugin install)
|
|
45
|
+
2. `~/.claude/skills/crawl-sim/` (global npm install)
|
|
46
|
+
3. `.claude/skills/crawl-sim/` (project-level install)
|
|
44
47
|
|
|
45
48
|
## Orchestration — five narrated stages
|
|
46
49
|
|
|
@@ -51,7 +54,16 @@ Split the work into **five Bash invocations**, each with a clear `description` f
|
|
|
51
54
|
Tell the user: "Fetching as Googlebot, GPTBot, ClaudeBot, and PerplexityBot..."
|
|
52
55
|
|
|
53
56
|
```bash
|
|
54
|
-
|
|
57
|
+
# Resolve skill directory
|
|
58
|
+
if [ -n "${CLAUDE_PLUGIN_ROOT:-}" ] && [ -d "$CLAUDE_PLUGIN_ROOT/skills/crawl-sim" ]; then
|
|
59
|
+
SKILL_DIR="$CLAUDE_PLUGIN_ROOT/skills/crawl-sim"
|
|
60
|
+
elif [ -d "$HOME/.claude/skills/crawl-sim" ]; then
|
|
61
|
+
SKILL_DIR="$HOME/.claude/skills/crawl-sim"
|
|
62
|
+
elif [ -d ".claude/skills/crawl-sim" ]; then
|
|
63
|
+
SKILL_DIR=".claude/skills/crawl-sim"
|
|
64
|
+
else
|
|
65
|
+
echo "ERROR: cannot find crawl-sim skill directory" >&2; exit 1
|
|
66
|
+
fi
|
|
55
67
|
RUN_DIR=$(mktemp -d -t crawl-sim.XXXXXX)
|
|
56
68
|
URL="<user-provided-url>"
|
|
57
69
|
for bot in googlebot gptbot claudebot perplexitybot; do
|
|
@@ -115,6 +127,14 @@ Tell the user: "Computing per-bot scores and finalizing the report..."
|
|
|
115
127
|
cp "$RUN_DIR/score.json" ./crawl-sim-report.json
|
|
116
128
|
```
|
|
117
129
|
|
|
130
|
+
**Page-type awareness.** `compute-score.sh` derives a page type from the target URL (`root` / `detail` / `archive` / `faq` / `about` / `contact` / `generic`) and picks a schema rubric accordingly. Root pages are expected to ship `Organization` + `WebSite` — penalizing them for missing `BreadcrumbList` or `FAQPage` would be wrong, so the scorer doesn't. If the URL heuristic picks the wrong type (e.g., a homepage at `/en/` that URL-parses as generic), pass `--page-type <type>`:
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
"$SKILL_DIR/scripts/compute-score.sh" --page-type root "$RUN_DIR" > "$RUN_DIR/score.json"
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Valid values: `root`, `detail`, `archive`, `faq`, `about`, `contact`, `generic`. The detected (or overridden) page type is exposed on `score.pageType`, and `score.pageTypeOverridden` flips `true` when `--page-type` was used.
|
|
137
|
+
|
|
118
138
|
## Output Layer 1 — Score Card (ASCII)
|
|
119
139
|
|
|
120
140
|
Print a boxed score card to the terminal:
|
|
@@ -168,6 +188,7 @@ Then produce **prioritized findings** ranked by total point impact across bots:
|
|
|
168
188
|
### Interpretation rules
|
|
169
189
|
|
|
170
190
|
- **Cross-bot deltas are the headline.** Compare `visibility.effectiveWords` across bots — if Googlebot has significantly more than the AI bots, that's finding #1. The raw delta is in `visibility.missedWordsVsRendered`.
|
|
191
|
+
- **Trust the structuredData rubric.** Every `bots.<bot>.categories.structuredData` block now carries `pageType`, `expected`, `optional`, `forbidden`, `present`, `missing`, `extras`, `violations`, `calculation`, and `notes`. Read `missing` and `violations` directly — never guess what the scorer was penalizing for. If `notes` says the page scores 100 with no action needed, that IS the finding; don't invent fixes. If the rubric looks wrong for this specific page (e.g., a homepage detected as `generic` because the URL ends in `/en/`), rerun with `--page-type <correct-type>` instead of arguing with the score. Never recommend adding a schema that already appears in `present` or `extras`.
|
|
171
192
|
- **Confidence transparency.** If a claim depends on a bot profile's `rendersJavaScript: false` at `observed` confidence (not `official`), note it: *"Based on observed behavior, not official documentation."*
|
|
172
193
|
- **Framework detection.** Scan the HTML body for signals: `<meta name="next-head-count">` or `_next/static` → Next.js (Pages Router or App Router respectively), `<div id="__nuxt">` → Nuxt, `<div id="app">` with thin content → SPA (Vue/React CSR), `<!--$-->` placeholder tags → React 18 Suspense. Use these to tailor fix recommendations.
|
|
173
194
|
- **No speculation beyond the data.** If server HTML has 0 `<a>` tags inside a component, say "component not present in server HTML" — not "JavaScript hydration failed" unless the diff-render data proves it.
|
|
@@ -41,6 +41,36 @@ count_words() {
|
|
|
41
41
|
sed 's/<[^>]*>//g' "$1" | tr -s '[:space:]' '\n' | grep -c '[a-zA-Z0-9]' || true
|
|
42
42
|
}
|
|
43
43
|
|
|
44
|
+
# Detect the structural page type of a URL based on its path.
# Returns one of: root, detail, archive, faq, about, contact, generic.
#
# Used by compute-score.sh to pick a schema rubric, but also exposed here
# so other tooling (narrative layer, planned multi-URL mode) can classify
# URLs consistently without re-implementing the heuristic.
#
# Arguments: $1 - URL to classify (handed to path_from_url to get the path)
# Outputs:   the page type, followed by a newline, on stdout
#
# Matching is case-insensitive and first-match-wins in the order of the
# case arms below, so e.g. "blog/faq" is "detail", not "faq".
page_type_for_url() {
  local url="$1"
  local path
  path=$(path_from_url "$url")
  # Drop any query string / fragment. Done with parameter expansion rather
  # than sed 's#[?#].*##': the GNU sed manual only allows the s-command
  # delimiter inside the regexp when it is backslash-escaped, so a bare '#'
  # inside the bracket expression is not portable across sed implementations.
  path="${path%%[?#]*}"
  if [ "$path" = "/" ]; then
    echo "root"
    return
  fi
  local trimmed lower
  # Strip one leading and one trailing slash so "/blog/" and "/blog" match
  # the same case arm.
  trimmed="${path#/}"
  trimmed="${trimmed%/}"
  # tr keeps this working on shells without ${var,,} case conversion.
  lower=$(printf '%s' "$trimmed" | tr '[:upper:]' '[:lower:]')
  case "$lower" in
    "")
      echo "root" ;;
    work|journal|blog|articles|news|careers|projects|case-studies|cases)
      echo "archive" ;;
    work/*|articles/*|journal/*|blog/*|news/*|case-studies/*|cases/*|case/*|careers/*|projects/*)
      echo "detail" ;;
    *faq*)
      echo "faq" ;;
    *about*|*team*|*purpose*|*who-we-are*)
      echo "about" ;;
    *contact*)
      echo "contact" ;;
    *)
      echo "generic" ;;
  esac
}
|
|
73
|
+
|
|
44
74
|
# Fetch a URL to a local file and return the HTTP status code on stdout.
|
|
45
75
|
# Usage: status=$(fetch_to_file <url> <output-file> [timeout-seconds])
|
|
46
76
|
fetch_to_file() {
|