@houseofmvps/claude-rank 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +59 -5
- package/bin/claude-rank.mjs +87 -9
- package/package.json +1 -1
- package/tools/lib/crawler.mjs +248 -0
- package/tools/lib/html-parser.mjs +45 -0
- package/tools/lib/report-generator.mjs +160 -0
- package/tools/seo-scanner.mjs +13 -4
- package/tools/url-scanner.mjs +165 -4
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
<div align="center">
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
<img src="assets/hero-banner.png" alt="claude-rank — SEO/GEO/AEO Plugin for Claude Code" width="100%"/>
|
|
4
4
|
|
|
5
5
|
### The most comprehensive SEO/GEO/AEO plugin for Claude Code. 74+ rules. Auto-fix everything. Dominate search — traditional and AI.
|
|
6
6
|
|
|
@@ -26,6 +26,58 @@
|
|
|
26
26
|
|
|
27
27
|
---
|
|
28
28
|
|
|
29
|
+
## See It In Action
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
$ claude-rank scan ./my-saas-landing
|
|
33
|
+
|
|
34
|
+
╔════════════════════════════════════════════════╗
|
|
35
|
+
║ claude-rank SEO Audit ║
|
|
36
|
+
╠════════════════════════════════════════════════╣
|
|
37
|
+
║ Score: 65/100 ██████████░░░░░ NEEDS WORK ║
|
|
38
|
+
╠════════════════════════════════════════════════╣
|
|
39
|
+
║ Files scanned: 26 ║
|
|
40
|
+
║ Findings: 41 ║
|
|
41
|
+
║ Critical: 0 High: 1 Medium: 40 Low: 0 ║
|
|
42
|
+
╚════════════════════════════════════════════════╝
|
|
43
|
+
|
|
44
|
+
Findings:
|
|
45
|
+
HIGH thin-content
|
|
46
|
+
Page has only 190 words (minimum recommended: 300)
|
|
47
|
+
Files: dist/contact/index.html
|
|
48
|
+
|
|
49
|
+
MEDIUM title-too-long (18 pages)
|
|
50
|
+
Title is 63 chars (max recommended: 60)
|
|
51
|
+
Files: dist/about/index.html, dist/blog/..., +15 more
|
|
52
|
+
|
|
53
|
+
MEDIUM missing-main-landmark (9 pages)
|
|
54
|
+
Page is missing a <main> landmark element
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
```
|
|
58
|
+
$ claude-rank geo ./my-saas-landing
|
|
59
|
+
|
|
60
|
+
╔════════════════════════════════════════════════╗
|
|
61
|
+
║ claude-rank GEO Audit ║
|
|
62
|
+
╠════════════════════════════════════════════════╣
|
|
63
|
+
║ Score: 95/100 ██████████████░ EXCELLENT ║
|
|
64
|
+
╚════════════════════════════════════════════════╝
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
```
|
|
68
|
+
$ claude-rank scan https://houseofmvps.com # Scan any live URL
|
|
69
|
+
|
|
70
|
+
╔════════════════════════════════════════════════╗
|
|
71
|
+
║ claude-rank SEO Audit ║
|
|
72
|
+
╠════════════════════════════════════════════════╣
|
|
73
|
+
║ Score: 83/100 ████████████░░░ GOOD ║
|
|
74
|
+
╚════════════════════════════════════════════════╝
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
*Real output from scanning [savemrr.co](https://savemrr.co) (26-page SaaS landing) and [houseofmvps.com](https://houseofmvps.com).*
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
29
81
|
## Quick Start
|
|
30
82
|
|
|
31
83
|
### Install as a Claude Code plugin (recommended)
|
|
@@ -39,10 +91,12 @@ That's it. Restart Claude Code and all 6 skills + 4 agents are active.
|
|
|
39
91
|
### Or use standalone — no plugin install needed
|
|
40
92
|
|
|
41
93
|
```bash
|
|
42
|
-
npx @houseofmvps/claude-rank scan ./my-project
|
|
43
|
-
npx @houseofmvps/claude-rank
|
|
44
|
-
npx @houseofmvps/claude-rank
|
|
45
|
-
npx @houseofmvps/claude-rank
|
|
94
|
+
npx @houseofmvps/claude-rank scan ./my-project # Local directory
|
|
95
|
+
npx @houseofmvps/claude-rank scan https://example.com # Live URL
|
|
96
|
+
npx @houseofmvps/claude-rank geo ./my-project # AI search audit
|
|
97
|
+
npx @houseofmvps/claude-rank aeo ./my-project # Answer engine audit
|
|
98
|
+
npx @houseofmvps/claude-rank schema ./my-project # Structured data
|
|
99
|
+
npx @houseofmvps/claude-rank scan ./site --json # Raw JSON output
|
|
46
100
|
```
|
|
47
101
|
|
|
48
102
|
### Or install globally
|
package/bin/claude-rank.mjs
CHANGED
|
@@ -4,7 +4,26 @@
|
|
|
4
4
|
|
|
5
5
|
const args = process.argv.slice(2);
|
|
6
6
|
const jsonFlag = args.includes('--json');
|
|
7
|
-
const
|
|
7
|
+
const singleFlag = args.includes('--single');
|
|
8
|
+
const reportFlag = args.includes('--report') ? args[args.indexOf('--report') + 1] : null;
|
|
9
|
+
const thresholdIdx = args.indexOf('--threshold');
|
|
10
|
+
const thresholdFlag = thresholdIdx !== -1 ? Number(args[thresholdIdx + 1]) : null;
|
|
11
|
+
|
|
12
|
+
// Parse --pages N flag (default: 50)
|
|
13
|
+
let maxPages = 50;
|
|
14
|
+
const pagesIdx = args.indexOf('--pages');
|
|
15
|
+
if (pagesIdx !== -1 && args[pagesIdx + 1]) {
|
|
16
|
+
const parsed = parseInt(args[pagesIdx + 1], 10);
|
|
17
|
+
if (!isNaN(parsed) && parsed > 0) maxPages = parsed;
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
const positional = args.filter((a, i) => {
|
|
21
|
+
if (a === '--json' || a === '--single') return false;
|
|
22
|
+
if (a === '--report' || a === '--threshold' || a === '--pages') return false;
|
|
23
|
+
// Skip the value after --report, --threshold, or --pages
|
|
24
|
+
if (i > 0 && (args[i - 1] === '--report' || args[i - 1] === '--threshold' || args[i - 1] === '--pages')) return false;
|
|
25
|
+
return true;
|
|
26
|
+
});
|
|
8
27
|
const [command = 'scan', dir = '.'] = positional;
|
|
9
28
|
|
|
10
29
|
const commands = {
|
|
@@ -17,7 +36,7 @@ const commands = {
|
|
|
17
36
|
if (command === 'help' || command === '--help') {
|
|
18
37
|
console.log(`claude-rank — SEO/GEO/AEO toolkit
|
|
19
38
|
|
|
20
|
-
Usage: claude-rank <command> [directory|url] [
|
|
39
|
+
Usage: claude-rank <command> [directory|url] [flags]
|
|
21
40
|
|
|
22
41
|
Commands:
|
|
23
42
|
scan Run core SEO scanner (default)
|
|
@@ -27,17 +46,28 @@ Commands:
|
|
|
27
46
|
help Show this help message
|
|
28
47
|
|
|
29
48
|
Flags:
|
|
30
|
-
--json
|
|
49
|
+
--json Output raw JSON (for programmatic use)
|
|
50
|
+
--single Scan only one page (skip multi-page crawl for URLs)
|
|
51
|
+
--pages N Max pages to crawl (default: 50, URL scanning only)
|
|
52
|
+
--report html Run all scanners and save HTML report to claude-rank-report.html
|
|
53
|
+
--threshold N Exit code 1 if score < N (for CI/CD pipelines)
|
|
31
54
|
|
|
32
55
|
URL scanning:
|
|
33
|
-
Pass a URL instead of a directory to scan a live
|
|
56
|
+
Pass a URL instead of a directory to scan a live site via HTTP.
|
|
57
|
+
By default, crawls up to 50 pages following internal links.
|
|
58
|
+
Use --single to scan only the given URL without crawling.
|
|
34
59
|
Only the "scan" command supports URL scanning.
|
|
35
60
|
|
|
36
61
|
Examples:
|
|
37
62
|
claude-rank scan ./my-project
|
|
38
63
|
claude-rank scan https://savemrr.co
|
|
64
|
+
claude-rank scan https://savemrr.co --pages 10
|
|
65
|
+
claude-rank scan https://savemrr.co --single
|
|
39
66
|
npx @houseofmvps/claude-rank geo .
|
|
40
67
|
claude-rank scan ./site --json
|
|
68
|
+
claude-rank scan ./site --report html
|
|
69
|
+
claude-rank scan ./site --threshold 80
|
|
70
|
+
claude-rank scan . --report html --threshold 80
|
|
41
71
|
`);
|
|
42
72
|
process.exit(0);
|
|
43
73
|
}
|
|
@@ -79,9 +109,11 @@ if (isUrl) {
|
|
|
79
109
|
process.exit(1);
|
|
80
110
|
}
|
|
81
111
|
|
|
82
|
-
const { scanUrl } = await import(new URL('../tools/url-scanner.mjs', import.meta.url));
|
|
112
|
+
const { scanUrl, scanSite } = await import(new URL('../tools/url-scanner.mjs', import.meta.url));
|
|
83
113
|
try {
|
|
84
|
-
const result =
|
|
114
|
+
const result = singleFlag
|
|
115
|
+
? await scanUrl(dir)
|
|
116
|
+
: await scanSite(dir, { maxPages });
|
|
85
117
|
if (jsonFlag) {
|
|
86
118
|
console.log(JSON.stringify(result, null, 2));
|
|
87
119
|
} else {
|
|
@@ -93,12 +125,47 @@ if (isUrl) {
|
|
|
93
125
|
}
|
|
94
126
|
} else {
|
|
95
127
|
// Directory-based scanning
|
|
96
|
-
const mod = await import(new URL(toolPath, import.meta.url));
|
|
97
128
|
const targetDir = resolve(dir);
|
|
98
129
|
|
|
99
|
-
|
|
130
|
+
// --report html: run ALL scanners, generate HTML report
|
|
131
|
+
if (reportFlag === 'html') {
|
|
132
|
+
const { writeFileSync } = await import('node:fs');
|
|
133
|
+
const { generateHtmlReport } = await import(new URL('../tools/lib/report-generator.mjs', import.meta.url));
|
|
134
|
+
|
|
135
|
+
const seoMod = await import(new URL('../tools/seo-scanner.mjs', import.meta.url));
|
|
136
|
+
const geoMod = await import(new URL('../tools/geo-scanner.mjs', import.meta.url));
|
|
137
|
+
const aeoMod = await import(new URL('../tools/aeo-scanner.mjs', import.meta.url));
|
|
138
|
+
|
|
139
|
+
const seo = seoMod.scanDirectory(targetDir);
|
|
140
|
+
const geo = geoMod.scanDirectory(targetDir);
|
|
141
|
+
const aeo = aeoMod.scanDirectory(targetDir);
|
|
142
|
+
|
|
143
|
+
const html = generateHtmlReport({
|
|
144
|
+
seo, geo, aeo,
|
|
145
|
+
target: dir,
|
|
146
|
+
timestamp: new Date().toISOString(),
|
|
147
|
+
});
|
|
148
|
+
|
|
149
|
+
const outPath = resolve('claude-rank-report.html');
|
|
150
|
+
writeFileSync(outPath, html, 'utf-8');
|
|
151
|
+
console.log(`HTML report saved to ${outPath}`);
|
|
152
|
+
|
|
153
|
+
// Also print terminal summaries
|
|
154
|
+
console.log(formatSeoReport(seo));
|
|
155
|
+
console.log(formatGeoReport(geo));
|
|
156
|
+
console.log(formatAeoReport(aeo));
|
|
157
|
+
|
|
158
|
+
// Check threshold against the primary (SEO) score
|
|
159
|
+
if (thresholdFlag != null) {
|
|
160
|
+
const score = seo.scores?.seo ?? 0;
|
|
161
|
+
if (score < thresholdFlag) {
|
|
162
|
+
console.error(`Score ${score} is below threshold ${thresholdFlag}`);
|
|
163
|
+
process.exit(1);
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
} else if (command === 'schema') {
|
|
100
167
|
// schema-engine exports detectSchema (per-file) and findHtmlFiles via html-parser.
|
|
101
|
-
|
|
168
|
+
const mod = await import(new URL(toolPath, import.meta.url));
|
|
102
169
|
const { findHtmlFiles } = await import(new URL('../tools/lib/html-parser.mjs', import.meta.url));
|
|
103
170
|
const { readFileSync } = await import('node:fs');
|
|
104
171
|
const files = findHtmlFiles(targetDir);
|
|
@@ -116,11 +183,22 @@ if (isUrl) {
|
|
|
116
183
|
console.log(formatSchemaReport(results));
|
|
117
184
|
}
|
|
118
185
|
} else {
|
|
186
|
+
const mod = await import(new URL(toolPath, import.meta.url));
|
|
119
187
|
const result = mod.scanDirectory(targetDir);
|
|
120
188
|
if (jsonFlag) {
|
|
121
189
|
console.log(JSON.stringify(result, null, 2));
|
|
122
190
|
} else {
|
|
123
191
|
console.log(formatters[command](result));
|
|
124
192
|
}
|
|
193
|
+
|
|
194
|
+
// Check threshold
|
|
195
|
+
if (thresholdFlag != null) {
|
|
196
|
+
const scoreKey = command === 'scan' ? 'seo' : command;
|
|
197
|
+
const score = result.scores?.[scoreKey] ?? 0;
|
|
198
|
+
if (score < thresholdFlag) {
|
|
199
|
+
console.error(`Score ${score} is below threshold ${thresholdFlag}`);
|
|
200
|
+
process.exit(1);
|
|
201
|
+
}
|
|
202
|
+
}
|
|
125
203
|
}
|
|
126
204
|
}
|
package/package.json
CHANGED
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* crawler.mjs — Multi-page site crawler using BFS with concurrency control.
|
|
3
|
+
* Follows internal links on the same domain. Uses fetchPage() for SSRF protection.
|
|
4
|
+
* No external dependencies.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { fetchPage } from './url-fetcher.mjs';
|
|
8
|
+
|
|
9
|
+
// ---------------------------------------------------------------------------
|
|
10
|
+
// URL helpers (exported for testing)
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
|
|
13
|
+
/** File extensions to skip (non-HTML resources) */
|
|
14
|
+
const SKIP_EXTENSIONS = new Set([
|
|
15
|
+
'.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.ico', '.bmp', '.avif',
|
|
16
|
+
'.css', '.js', '.mjs', '.cjs', '.map',
|
|
17
|
+
'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
|
|
18
|
+
'.zip', '.tar', '.gz', '.rar', '.7z',
|
|
19
|
+
'.mp3', '.mp4', '.wav', '.avi', '.mov', '.webm', '.ogg',
|
|
20
|
+
'.woff', '.woff2', '.ttf', '.eot', '.otf',
|
|
21
|
+
'.xml', '.json', '.csv', '.txt', '.rss', '.atom',
|
|
22
|
+
]);
|
|
23
|
+
|
|
24
|
+
/** URL path patterns to skip (non-page routes) */
|
|
25
|
+
const SKIP_PATTERNS = [
|
|
26
|
+
/\/api\//i,
|
|
27
|
+
/\/auth\//i,
|
|
28
|
+
/\/login\b/i,
|
|
29
|
+
/\/logout\b/i,
|
|
30
|
+
/\/wp-admin/i,
|
|
31
|
+
/\/cdn-cgi\//i,
|
|
32
|
+
/\/wp-json\//i,
|
|
33
|
+
/\/feed\/?$/i,
|
|
34
|
+
/\/xmlrpc\.php/i,
|
|
35
|
+
/\/wp-login/i,
|
|
36
|
+
/\/admin\//i,
|
|
37
|
+
/\?/, // skip URLs with query strings to avoid crawl traps
|
|
38
|
+
];
|
|
39
|
+
|
|
40
|
+
/**
|
|
41
|
+
* Normalize a URL: remove fragment, remove trailing slash (except root path).
|
|
42
|
+
* @param {string} urlStr
|
|
43
|
+
* @returns {string}
|
|
44
|
+
*/
|
|
45
|
+
export function normalizeUrl(urlStr) {
|
|
46
|
+
try {
|
|
47
|
+
const url = new URL(urlStr);
|
|
48
|
+
url.hash = '';
|
|
49
|
+
// Remove trailing slash unless it's just the root "/"
|
|
50
|
+
if (url.pathname.length > 1 && url.pathname.endsWith('/')) {
|
|
51
|
+
url.pathname = url.pathname.slice(0, -1);
|
|
52
|
+
}
|
|
53
|
+
return url.href;
|
|
54
|
+
} catch {
|
|
55
|
+
return urlStr;
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
/**
|
|
60
|
+
* Check if a URL should be skipped based on extension or path pattern.
|
|
61
|
+
* @param {string} urlStr
|
|
62
|
+
* @returns {boolean}
|
|
63
|
+
*/
|
|
64
|
+
export function shouldSkipUrl(urlStr) {
|
|
65
|
+
try {
|
|
66
|
+
const url = new URL(urlStr);
|
|
67
|
+
const pathname = url.pathname.toLowerCase();
|
|
68
|
+
|
|
69
|
+
// Check file extension
|
|
70
|
+
const lastDot = pathname.lastIndexOf('.');
|
|
71
|
+
if (lastDot !== -1) {
|
|
72
|
+
const ext = pathname.slice(lastDot);
|
|
73
|
+
if (SKIP_EXTENSIONS.has(ext)) return true;
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
// Check path patterns
|
|
77
|
+
for (const pattern of SKIP_PATTERNS) {
|
|
78
|
+
if (pattern.test(url.pathname + url.search)) return true;
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
return false;
|
|
82
|
+
} catch {
|
|
83
|
+
return true;
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
/**
|
|
88
|
+
* Check if two URLs share the same hostname.
|
|
89
|
+
* @param {string} urlA
|
|
90
|
+
* @param {string} urlB
|
|
91
|
+
* @returns {boolean}
|
|
92
|
+
*/
|
|
93
|
+
export function isSameDomain(urlA, urlB) {
|
|
94
|
+
try {
|
|
95
|
+
const a = new URL(urlA);
|
|
96
|
+
const b = new URL(urlB);
|
|
97
|
+
return a.hostname === b.hostname;
|
|
98
|
+
} catch {
|
|
99
|
+
return false;
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
/**
|
|
104
|
+
* Extract internal links from HTML content.
|
|
105
|
+
* Returns an array of absolute URL strings on the same domain as baseUrl.
|
|
106
|
+
* @param {string} html
|
|
107
|
+
* @param {string} baseUrl
|
|
108
|
+
* @returns {string[]}
|
|
109
|
+
*/
|
|
110
|
+
export function extractLinks(html, baseUrl) {
|
|
111
|
+
const links = [];
|
|
112
|
+
// Match <a href="..."> with both single and double quotes
|
|
113
|
+
const regex = /<a\s[^>]*href\s*=\s*(?:"([^"]*)"|'([^']*)')/gi;
|
|
114
|
+
let match;
|
|
115
|
+
|
|
116
|
+
while ((match = regex.exec(html)) !== null) {
|
|
117
|
+
const href = match[1] ?? match[2];
|
|
118
|
+
if (!href) continue;
|
|
119
|
+
|
|
120
|
+
// Skip javascript:, mailto:, tel:, data: schemes
|
|
121
|
+
if (/^(javascript|mailto|tel|data):/i.test(href)) continue;
|
|
122
|
+
// Skip empty or fragment-only
|
|
123
|
+
if (href === '' || href === '#' || href.startsWith('#')) continue;
|
|
124
|
+
|
|
125
|
+
try {
|
|
126
|
+
const resolved = new URL(href, baseUrl).href;
|
|
127
|
+
const normalized = normalizeUrl(resolved);
|
|
128
|
+
|
|
129
|
+
if (isSameDomain(normalized, baseUrl) && !shouldSkipUrl(normalized)) {
|
|
130
|
+
links.push(normalized);
|
|
131
|
+
}
|
|
132
|
+
} catch {
|
|
133
|
+
// Invalid URL — skip
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
// Deduplicate
|
|
138
|
+
return [...new Set(links)];
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
// ---------------------------------------------------------------------------
|
|
142
|
+
// Semaphore for concurrency control
|
|
143
|
+
// ---------------------------------------------------------------------------
|
|
144
|
+
|
|
145
|
+
class Semaphore {
|
|
146
|
+
constructor(max) {
|
|
147
|
+
this._max = max;
|
|
148
|
+
this._active = 0;
|
|
149
|
+
this._queue = [];
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
async acquire() {
|
|
153
|
+
if (this._active < this._max) {
|
|
154
|
+
this._active++;
|
|
155
|
+
return;
|
|
156
|
+
}
|
|
157
|
+
return new Promise(resolve => {
|
|
158
|
+
this._queue.push(resolve);
|
|
159
|
+
});
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
release() {
|
|
163
|
+
this._active--;
|
|
164
|
+
if (this._queue.length > 0) {
|
|
165
|
+
this._active++;
|
|
166
|
+
const next = this._queue.shift();
|
|
167
|
+
next();
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
// ---------------------------------------------------------------------------
|
|
173
|
+
// Main crawler
|
|
174
|
+
// ---------------------------------------------------------------------------
|
|
175
|
+
|
|
176
|
+
/**
|
|
177
|
+
* Crawl a site starting from startUrl, following internal links (BFS).
|
|
178
|
+
* @param {string} startUrl — starting URL
|
|
179
|
+
* @param {object} options
|
|
180
|
+
* @param {number} [options.maxPages=50] — max pages to crawl
|
|
181
|
+
* @param {number} [options.concurrency=3] — concurrent fetches
|
|
182
|
+
* @param {function} [options.onPage] — callback(url, html) called per page
|
|
183
|
+
* @returns {Promise<{ pages: Array<{url: string, html: string, statusCode: number}>, errors: Array<{url: string, error: string}> }>}
|
|
184
|
+
*/
|
|
185
|
+
export async function crawlSite(startUrl, options = {}) {
|
|
186
|
+
const {
|
|
187
|
+
maxPages = 50,
|
|
188
|
+
concurrency = 3,
|
|
189
|
+
onPage,
|
|
190
|
+
} = options;
|
|
191
|
+
|
|
192
|
+
const normalizedStart = normalizeUrl(startUrl);
|
|
193
|
+
const visited = new Set();
|
|
194
|
+
const queue = [normalizedStart]; // BFS queue
|
|
195
|
+
const pages = [];
|
|
196
|
+
const errors = [];
|
|
197
|
+
const semaphore = new Semaphore(concurrency);
|
|
198
|
+
|
|
199
|
+
let queued = new Set([normalizedStart]);
|
|
200
|
+
let pagesProcessed = 0;
|
|
201
|
+
|
|
202
|
+
// Process BFS in waves for concurrency
|
|
203
|
+
while (queue.length > 0 && pagesProcessed < maxPages) {
|
|
204
|
+
// Take a batch from the queue (up to concurrency size)
|
|
205
|
+
const batchSize = Math.min(queue.length, maxPages - pagesProcessed, concurrency);
|
|
206
|
+
const batch = queue.splice(0, batchSize);
|
|
207
|
+
|
|
208
|
+
const promises = batch.map(async (url) => {
|
|
209
|
+
if (visited.has(url) || pagesProcessed >= maxPages) return;
|
|
210
|
+
visited.add(url);
|
|
211
|
+
|
|
212
|
+
await semaphore.acquire();
|
|
213
|
+
try {
|
|
214
|
+
pagesProcessed++;
|
|
215
|
+
const num = pagesProcessed;
|
|
216
|
+
process.stderr.write(`Crawling [${num}/${maxPages}] ${url}\n`);
|
|
217
|
+
|
|
218
|
+
const result = await fetchPage(url);
|
|
219
|
+
pages.push({
|
|
220
|
+
url: result.finalUrl,
|
|
221
|
+
html: result.html,
|
|
222
|
+
statusCode: result.statusCode,
|
|
223
|
+
});
|
|
224
|
+
|
|
225
|
+
if (onPage) {
|
|
226
|
+
onPage(result.finalUrl, result.html);
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
// Extract links and add new ones to queue
|
|
230
|
+
const links = extractLinks(result.html, result.finalUrl);
|
|
231
|
+
for (const link of links) {
|
|
232
|
+
if (!queued.has(link) && !visited.has(link) && pagesProcessed + queue.length < maxPages) {
|
|
233
|
+
queued.add(link);
|
|
234
|
+
queue.push(link);
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
} catch (err) {
|
|
238
|
+
errors.push({ url, error: err.message });
|
|
239
|
+
} finally {
|
|
240
|
+
semaphore.release();
|
|
241
|
+
}
|
|
242
|
+
});
|
|
243
|
+
|
|
244
|
+
await Promise.all(promises);
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
return { pages, errors };
|
|
248
|
+
}
|
|
@@ -444,6 +444,51 @@ export function parseHtml(htmlString) {
|
|
|
444
444
|
return state;
|
|
445
445
|
}
|
|
446
446
|
|
|
447
|
+
// ---------------------------------------------------------------------------
|
|
448
|
+
// detectPageType — classify page type from URL path + parsed state
|
|
449
|
+
// ---------------------------------------------------------------------------
|
|
450
|
+
|
|
451
|
+
/**
|
|
452
|
+
* Page type patterns — ordered by priority (first match wins).
|
|
453
|
+
* Each entry: { type, patterns[] } where patterns are matched against
|
|
454
|
+
* the lowercase URL path, title, and h1 text.
|
|
455
|
+
*/
|
|
456
|
+
const PAGE_TYPE_RULES = [
|
|
457
|
+
{ type: 'contact', patterns: ['contact', 'get in touch', 'reach us'] },
|
|
458
|
+
{ type: 'terms', patterns: ['terms', 'conditions', 'tos', 'terms-of-service'] },
|
|
459
|
+
{ type: 'privacy', patterns: ['privacy', 'cookie policy', 'gdpr'] },
|
|
460
|
+
{ type: 'legal', patterns: ['legal', 'disclaimer', 'imprint'] },
|
|
461
|
+
{ type: 'login', patterns: ['login', 'signin', 'sign-in', 'register', 'signup'] },
|
|
462
|
+
{ type: '404', patterns: ['404', 'not found', 'page not found'] },
|
|
463
|
+
{ type: 'sitemap', patterns: ['sitemap'] },
|
|
464
|
+
];
|
|
465
|
+
|
|
466
|
+
/**
|
|
467
|
+
* Detect the page type from the file path / URL and parsed HTML state.
|
|
468
|
+
* Returns a page type string: 'contact', 'terms', 'privacy', 'legal',
|
|
469
|
+
* 'login', '404', 'sitemap', or 'content' (default).
|
|
470
|
+
*
|
|
471
|
+
* @param {string} filePath — file path or URL (used for path-based signals)
|
|
472
|
+
* @param {object} state — PageState from parseHtml
|
|
473
|
+
* @returns {string} page type
|
|
474
|
+
*/
|
|
475
|
+
export function detectPageType(filePath, state) {
|
|
476
|
+
// Build a combined haystack from path, title, and h1
|
|
477
|
+
const pathLower = (filePath || '').toLowerCase();
|
|
478
|
+
const titleLower = (state.titleText || '').toLowerCase();
|
|
479
|
+
const h1Lower = (state.h1Text || '').toLowerCase();
|
|
480
|
+
|
|
481
|
+
for (const { type, patterns } of PAGE_TYPE_RULES) {
|
|
482
|
+
for (const pattern of patterns) {
|
|
483
|
+
if (pathLower.includes(pattern) || titleLower.includes(pattern) || h1Lower.includes(pattern)) {
|
|
484
|
+
return type;
|
|
485
|
+
}
|
|
486
|
+
}
|
|
487
|
+
}
|
|
488
|
+
|
|
489
|
+
return 'content';
|
|
490
|
+
}
|
|
491
|
+
|
|
447
492
|
// ---------------------------------------------------------------------------
|
|
448
493
|
// parseHtmlFile — read file then parseHtml
|
|
449
494
|
// ---------------------------------------------------------------------------
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* report-generator.mjs — Generate self-contained HTML audit reports.
|
|
3
|
+
* No external dependencies. All CSS is inline.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Generate a self-contained HTML report from scan results.
|
|
8
|
+
* @param {object} options
|
|
9
|
+
* @param {object} options.seo — SEO scan result (optional)
|
|
10
|
+
* @param {object} options.geo — GEO scan result (optional)
|
|
11
|
+
* @param {object} options.aeo — AEO scan result (optional)
|
|
12
|
+
* @param {string} options.target — directory or URL that was scanned
|
|
13
|
+
* @param {string} options.timestamp — ISO timestamp
|
|
14
|
+
* @returns {string} — complete HTML document
|
|
15
|
+
*/
|
|
16
|
+
export function generateHtmlReport({ seo, geo, aeo, target, timestamp }) {
|
|
17
|
+
const scanners = [];
|
|
18
|
+
if (seo && !seo.skipped) scanners.push({ label: 'SEO', key: 'seo', data: seo });
|
|
19
|
+
if (geo && !geo.skipped) scanners.push({ label: 'GEO', key: 'geo', data: geo });
|
|
20
|
+
if (aeo && !aeo.skipped) scanners.push({ label: 'AEO', key: 'aeo', data: aeo });
|
|
21
|
+
|
|
22
|
+
const scoreCards = scanners.map(s => {
|
|
23
|
+
const score = s.data.scores[s.key];
|
|
24
|
+
const { color, label } = scoreStyle(score);
|
|
25
|
+
return `
|
|
26
|
+
<div class="score-card">
|
|
27
|
+
<div class="score-ring" style="--score: ${score}; --color: ${color}">
|
|
28
|
+
<svg viewBox="0 0 120 120">
|
|
29
|
+
<circle cx="60" cy="60" r="52" class="ring-bg"/>
|
|
30
|
+
<circle cx="60" cy="60" r="52" class="ring-fill" style="stroke-dashoffset: calc(327 - (327 * ${score} / 100))"/>
|
|
31
|
+
</svg>
|
|
32
|
+
<span class="score-value">${score}</span>
|
|
33
|
+
</div>
|
|
34
|
+
<div class="score-label" style="color: ${color}">${label}</div>
|
|
35
|
+
<div class="score-type">${s.label}</div>
|
|
36
|
+
<div class="score-meta">${s.data.files_scanned} files · ${s.data.findings.length} findings</div>
|
|
37
|
+
</div>`;
|
|
38
|
+
}).join('\n');
|
|
39
|
+
|
|
40
|
+
const allFindings = [];
|
|
41
|
+
for (const s of scanners) {
|
|
42
|
+
for (const f of s.data.findings) {
|
|
43
|
+
allFindings.push({ ...f, scanner: s.label });
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
const SEVERITY_ORDER = { critical: 0, high: 1, medium: 2, low: 3 };
|
|
48
|
+
allFindings.sort((a, b) => (SEVERITY_ORDER[a.severity] ?? 9) - (SEVERITY_ORDER[b.severity] ?? 9));
|
|
49
|
+
|
|
50
|
+
// Group by rule
|
|
51
|
+
const groups = new Map();
|
|
52
|
+
for (const f of allFindings) {
|
|
53
|
+
const key = `${f.scanner}:${f.rule}`;
|
|
54
|
+
if (!groups.has(key)) {
|
|
55
|
+
groups.set(key, { rule: f.rule, severity: f.severity, message: f.message, scanner: f.scanner, files: [] });
|
|
56
|
+
}
|
|
57
|
+
if (f.file && !groups.get(key).files.includes(f.file)) {
|
|
58
|
+
groups.get(key).files.push(f.file);
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
const findingsRows = [...groups.values()].map(g => {
|
|
63
|
+
const badgeColor = severityBadgeColor(g.severity);
|
|
64
|
+
const filesStr = g.files.length > 0
|
|
65
|
+
? g.files.slice(0, 3).map(f => esc(f)).join(', ') + (g.files.length > 3 ? `, +${g.files.length - 3} more` : '')
|
|
66
|
+
: '—';
|
|
67
|
+
return `
|
|
68
|
+
<tr>
|
|
69
|
+
<td><span class="badge" style="background: ${badgeColor}">${esc(g.severity.toUpperCase())}</span></td>
|
|
70
|
+
<td class="rule-name">${esc(g.rule)}<span class="scanner-tag">${esc(g.scanner)}</span></td>
|
|
71
|
+
<td>${esc(g.message)}</td>
|
|
72
|
+
<td class="files-cell">${filesStr}</td>
|
|
73
|
+
</tr>`;
|
|
74
|
+
}).join('\n');
|
|
75
|
+
|
|
76
|
+
const displayDate = timestamp ? new Date(timestamp).toLocaleString('en-US', {
|
|
77
|
+
dateStyle: 'long', timeStyle: 'short',
|
|
78
|
+
}) : '';
|
|
79
|
+
|
|
80
|
+
return `<!DOCTYPE html>
|
|
81
|
+
<html lang="en">
|
|
82
|
+
<head>
|
|
83
|
+
<meta charset="utf-8"/>
|
|
84
|
+
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
|
85
|
+
<title>claude-rank Audit Report — ${esc(target)}</title>
|
|
86
|
+
<style>
|
|
87
|
+
*{margin:0;padding:0;box-sizing:border-box}
|
|
88
|
+
body{font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',Roboto,sans-serif;background:#0f172a;color:#e2e8f0;line-height:1.6;padding:2rem}
|
|
89
|
+
.container{max-width:960px;margin:0 auto}
|
|
90
|
+
header{text-align:center;margin-bottom:2.5rem;padding-bottom:1.5rem;border-bottom:1px solid #1e293b}
|
|
91
|
+
header h1{font-size:1.75rem;color:#f8fafc;margin-bottom:.25rem}
|
|
92
|
+
header p{color:#94a3b8;font-size:.875rem}
|
|
93
|
+
.scores{display:flex;gap:2rem;justify-content:center;flex-wrap:wrap;margin-bottom:2.5rem}
|
|
94
|
+
.score-card{text-align:center;background:#1e293b;border-radius:12px;padding:1.5rem 2rem;min-width:180px}
|
|
95
|
+
.score-ring{position:relative;width:100px;height:100px;margin:0 auto .75rem}
|
|
96
|
+
.score-ring svg{width:100%;height:100%;transform:rotate(-90deg)}
|
|
97
|
+
.ring-bg{fill:none;stroke:#334155;stroke-width:8}
|
|
98
|
+
.ring-fill{fill:none;stroke:var(--color);stroke-width:8;stroke-linecap:round;stroke-dasharray:327;transition:stroke-dashoffset .5s}
|
|
99
|
+
.score-value{position:absolute;inset:0;display:flex;align-items:center;justify-content:center;font-size:1.5rem;font-weight:700;color:#f8fafc}
|
|
100
|
+
.score-label{font-weight:600;font-size:.875rem;text-transform:uppercase;letter-spacing:.05em}
|
|
101
|
+
.score-type{font-size:1.125rem;font-weight:600;color:#f8fafc;margin-top:.25rem}
|
|
102
|
+
.score-meta{color:#64748b;font-size:.75rem;margin-top:.25rem}
|
|
103
|
+
h2{font-size:1.25rem;color:#f8fafc;margin-bottom:1rem}
|
|
104
|
+
table{width:100%;border-collapse:collapse;font-size:.85rem;margin-bottom:2rem}
|
|
105
|
+
th{text-align:left;color:#94a3b8;font-weight:600;padding:.75rem .5rem;border-bottom:2px solid #1e293b}
|
|
106
|
+
td{padding:.65rem .5rem;border-bottom:1px solid #1e293b;vertical-align:top}
|
|
107
|
+
.badge{display:inline-block;padding:2px 8px;border-radius:4px;font-size:.7rem;font-weight:700;color:#fff;text-transform:uppercase}
|
|
108
|
+
.rule-name{font-weight:600;color:#f8fafc}
|
|
109
|
+
.scanner-tag{margin-left:.5rem;font-size:.65rem;color:#64748b;font-weight:400}
|
|
110
|
+
.files-cell{color:#94a3b8;font-size:.8rem;max-width:200px;word-break:break-all}
|
|
111
|
+
footer{text-align:center;color:#475569;font-size:.75rem;margin-top:2rem;padding-top:1rem;border-top:1px solid #1e293b}
|
|
112
|
+
footer a{color:#64748b}
|
|
113
|
+
.empty{text-align:center;color:#22c55e;padding:2rem;font-size:1rem}
|
|
114
|
+
@media print{body{background:#fff;color:#1e293b;padding:1rem}.score-card{background:#f1f5f9}th{color:#475569;border-color:#cbd5e1}td{border-color:#e2e8f0}.rule-name{color:#0f172a}header{border-color:#cbd5e1}footer{border-color:#cbd5e1;color:#94a3b8}}
|
|
115
|
+
</style>
|
|
116
|
+
</head>
|
|
117
|
+
<body>
|
|
118
|
+
<div class="container">
|
|
119
|
+
<header>
|
|
120
|
+
<h1>claude-rank Audit Report</h1>
|
|
121
|
+
<p>${esc(target)} — ${esc(displayDate)}</p>
|
|
122
|
+
</header>
|
|
123
|
+
|
|
124
|
+
<section class="scores">
|
|
125
|
+
${scoreCards || '<p style="color:#94a3b8">No scan results available.</p>'}
|
|
126
|
+
</section>
|
|
127
|
+
|
|
128
|
+
<h2>Findings</h2>
|
|
129
|
+
${groups.size > 0 ? `
|
|
130
|
+
<table>
|
|
131
|
+
<thead><tr><th>Severity</th><th>Rule</th><th>Message</th><th>Files</th></tr></thead>
|
|
132
|
+
<tbody>
|
|
133
|
+
${findingsRows}
|
|
134
|
+
</tbody>
|
|
135
|
+
</table>` : '<div class="empty">No findings — looking great!</div>'}
|
|
136
|
+
|
|
137
|
+
<footer>Generated by claude-rank v1.2.1 — <a href="https://github.com/Houseofmvps/claude-rank">github.com/Houseofmvps/claude-rank</a></footer>
|
|
138
|
+
</div>
|
|
139
|
+
</body>
|
|
140
|
+
</html>`;
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
function scoreStyle(score) {
|
|
144
|
+
if (score >= 90) return { color: '#22c55e', label: 'Excellent' };
|
|
145
|
+
if (score >= 80) return { color: '#3b82f6', label: 'Good' };
|
|
146
|
+
if (score >= 60) return { color: '#eab308', label: 'Needs Work' };
|
|
147
|
+
return { color: '#ef4444', label: 'Poor' };
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
function severityBadgeColor(severity) {
|
|
151
|
+
if (severity === 'critical') return '#dc2626';
|
|
152
|
+
if (severity === 'high') return '#ef4444';
|
|
153
|
+
if (severity === 'medium') return '#eab308';
|
|
154
|
+
return '#64748b';
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
function esc(str) {
|
|
158
|
+
if (!str) return '';
|
|
159
|
+
return String(str).replace(/&/g, '&').replace(/</g, '<').replace(/>/g, '>').replace(/"/g, '"');
|
|
160
|
+
}
|
package/tools/seo-scanner.mjs
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
import fs from 'node:fs';
|
|
7
7
|
import path from 'node:path';
|
|
8
|
-
import { parseHtml, findHtmlFiles } from './lib/html-parser.mjs';
|
|
8
|
+
import { parseHtml, findHtmlFiles, detectPageType } from './lib/html-parser.mjs';
|
|
9
9
|
import { checkFileSize } from './lib/security.mjs';
|
|
10
10
|
|
|
11
11
|
// ---------------------------------------------------------------------------
|
|
@@ -97,6 +97,13 @@ const RULES = {
|
|
|
97
97
|
// Per-file rule checks
|
|
98
98
|
// ---------------------------------------------------------------------------
|
|
99
99
|
|
|
100
|
+
// Page types where thin content is expected and should not be flagged
|
|
101
|
+
const THIN_CONTENT_EXEMPT = new Set(['contact', 'terms', 'privacy', 'legal', 'login', '404', 'sitemap']);
|
|
102
|
+
// Page types where missing analytics is expected
|
|
103
|
+
const NO_ANALYTICS_EXEMPT = new Set(['terms', 'privacy', 'legal']);
|
|
104
|
+
// Page types where missing OG image is expected
|
|
105
|
+
const NO_OG_IMAGE_EXEMPT = new Set(['terms', 'privacy', 'legal']);
|
|
106
|
+
|
|
100
107
|
/**
|
|
101
108
|
* Run per-file checks. Returns array of finding objects.
|
|
102
109
|
* @param {object} state — PageState from parseHtml
|
|
@@ -107,6 +114,7 @@ const RULES = {
|
|
|
107
114
|
function checkFile(state, filePath, rootDir, opts = {}) {
|
|
108
115
|
const findings = [];
|
|
109
116
|
const rel = path.relative(rootDir, filePath);
|
|
117
|
+
const pageType = detectPageType(filePath, state);
|
|
110
118
|
|
|
111
119
|
function add(rule, message, context = {}) {
|
|
112
120
|
const def = RULES[rule];
|
|
@@ -115,6 +123,7 @@ function checkFile(state, filePath, rootDir, opts = {}) {
|
|
|
115
123
|
severity: def.severity,
|
|
116
124
|
file: rel,
|
|
117
125
|
message,
|
|
126
|
+
pageType,
|
|
118
127
|
...context,
|
|
119
128
|
});
|
|
120
129
|
}
|
|
@@ -151,7 +160,7 @@ function checkFile(state, filePath, rootDir, opts = {}) {
|
|
|
151
160
|
add('missing-h1', 'Page has no <h1> heading');
|
|
152
161
|
}
|
|
153
162
|
|
|
154
|
-
if (state.wordCount > 0 && state.wordCount < 300) {
|
|
163
|
+
if (state.wordCount > 0 && state.wordCount < 300 && !THIN_CONTENT_EXEMPT.has(pageType)) {
|
|
155
164
|
add('thin-content', `Page has only ${state.wordCount} words (minimum recommended: 300)`);
|
|
156
165
|
}
|
|
157
166
|
|
|
@@ -196,7 +205,7 @@ function checkFile(state, filePath, rootDir, opts = {}) {
|
|
|
196
205
|
add('missing-og-description', 'Page is missing og:description Open Graph tag');
|
|
197
206
|
}
|
|
198
207
|
|
|
199
|
-
if (!state.hasOgImage) {
|
|
208
|
+
if (!state.hasOgImage && !NO_OG_IMAGE_EXEMPT.has(pageType)) {
|
|
200
209
|
add('missing-og-image', 'Page is missing og:image Open Graph tag');
|
|
201
210
|
}
|
|
202
211
|
|
|
@@ -238,7 +247,7 @@ function checkFile(state, filePath, rootDir, opts = {}) {
|
|
|
238
247
|
add('missing-favicon', 'Page is missing a favicon link');
|
|
239
248
|
}
|
|
240
249
|
|
|
241
|
-
if (!state.hasAnalytics) {
|
|
250
|
+
if (!state.hasAnalytics && !NO_ANALYTICS_EXEMPT.has(pageType)) {
|
|
242
251
|
add('no-analytics', 'No analytics provider detected on this page');
|
|
243
252
|
}
|
|
244
253
|
|
package/tools/url-scanner.mjs
CHANGED
|
@@ -2,10 +2,12 @@
|
|
|
2
2
|
* url-scanner.mjs — Scan a live URL for SEO issues.
|
|
3
3
|
* Fetches HTML from a URL and runs the same per-page analysis as seo-scanner.
|
|
4
4
|
* Cross-page rules (duplicates, orphans, canonicals) are skipped for single-URL scans.
|
|
5
|
+
* scanSite() crawls multiple pages and adds cross-page analysis.
|
|
5
6
|
*/
|
|
6
7
|
|
|
7
|
-
import { parseHtml } from './lib/html-parser.mjs';
|
|
8
|
+
import { parseHtml, detectPageType } from './lib/html-parser.mjs';
|
|
8
9
|
import { fetchPage } from './lib/url-fetcher.mjs';
|
|
10
|
+
import { crawlSite } from './lib/crawler.mjs';
|
|
9
11
|
|
|
10
12
|
// ---------------------------------------------------------------------------
|
|
11
13
|
// Rule definitions (same as seo-scanner, minus cross-page-only rules)
|
|
@@ -52,6 +54,11 @@ const RULES = {
|
|
|
52
54
|
'no-manifest': { severity: 'low', deduction: 2 },
|
|
53
55
|
'all-scripts-blocking': { severity: 'low', deduction: 2 },
|
|
54
56
|
|
|
57
|
+
// Cross-page rules (multi-page crawl only)
|
|
58
|
+
'duplicate-title': { severity: 'high', deduction: 10 },
|
|
59
|
+
'duplicate-meta-description':{ severity: 'high', deduction: 10 },
|
|
60
|
+
'canonical-conflict': { severity: 'high', deduction: 10 },
|
|
61
|
+
|
|
55
62
|
// HTTP-level rules (URL-scan only)
|
|
56
63
|
'http-error': { severity: 'critical', deduction: 20 },
|
|
57
64
|
'redirect-detected': { severity: 'low', deduction: 2 },
|
|
@@ -61,8 +68,16 @@ const RULES = {
|
|
|
61
68
|
// Per-page rule checks (reused from seo-scanner logic)
|
|
62
69
|
// ---------------------------------------------------------------------------
|
|
63
70
|
|
|
71
|
+
// Page types where thin content is expected and should not be flagged
|
|
72
|
+
const THIN_CONTENT_EXEMPT = new Set(['contact', 'terms', 'privacy', 'legal', 'login', '404', 'sitemap']);
|
|
73
|
+
// Page types where missing analytics is expected
|
|
74
|
+
const NO_ANALYTICS_EXEMPT = new Set(['terms', 'privacy', 'legal']);
|
|
75
|
+
// Page types where missing OG image is expected
|
|
76
|
+
const NO_OG_IMAGE_EXEMPT = new Set(['terms', 'privacy', 'legal']);
|
|
77
|
+
|
|
64
78
|
function checkPage(state, pageUrl) {
|
|
65
79
|
const findings = [];
|
|
80
|
+
const pageType = detectPageType(pageUrl, state);
|
|
66
81
|
|
|
67
82
|
function add(rule, message, context = {}) {
|
|
68
83
|
const def = RULES[rule];
|
|
@@ -71,6 +86,7 @@ function checkPage(state, pageUrl) {
|
|
|
71
86
|
severity: def.severity,
|
|
72
87
|
file: pageUrl,
|
|
73
88
|
message,
|
|
89
|
+
pageType,
|
|
74
90
|
...context,
|
|
75
91
|
});
|
|
76
92
|
}
|
|
@@ -109,7 +125,7 @@ function checkPage(state, pageUrl) {
|
|
|
109
125
|
add('missing-h1', 'Page has no <h1> heading');
|
|
110
126
|
}
|
|
111
127
|
|
|
112
|
-
if (state.wordCount > 0 && state.wordCount < 300) {
|
|
128
|
+
if (state.wordCount > 0 && state.wordCount < 300 && !THIN_CONTENT_EXEMPT.has(pageType)) {
|
|
113
129
|
add('thin-content', `Page has only ${state.wordCount} words (minimum recommended: 300)`);
|
|
114
130
|
}
|
|
115
131
|
|
|
@@ -150,7 +166,7 @@ function checkPage(state, pageUrl) {
|
|
|
150
166
|
add('missing-og-description', 'Page is missing og:description Open Graph tag');
|
|
151
167
|
}
|
|
152
168
|
|
|
153
|
-
if (!state.hasOgImage) {
|
|
169
|
+
if (!state.hasOgImage && !NO_OG_IMAGE_EXEMPT.has(pageType)) {
|
|
154
170
|
add('missing-og-image', 'Page is missing og:image Open Graph tag');
|
|
155
171
|
}
|
|
156
172
|
|
|
@@ -191,7 +207,7 @@ function checkPage(state, pageUrl) {
|
|
|
191
207
|
add('missing-favicon', 'Page is missing a favicon link');
|
|
192
208
|
}
|
|
193
209
|
|
|
194
|
-
if (!state.hasAnalytics) {
|
|
210
|
+
if (!state.hasAnalytics && !NO_ANALYTICS_EXEMPT.has(pageType)) {
|
|
195
211
|
add('no-analytics', 'No analytics provider detected on this page');
|
|
196
212
|
}
|
|
197
213
|
|
|
@@ -333,6 +349,151 @@ export async function scanUrl(url) {
|
|
|
333
349
|
};
|
|
334
350
|
}
|
|
335
351
|
|
|
352
|
+
// ---------------------------------------------------------------------------
|
|
353
|
+
// Cross-page checks (for multi-page crawl)
|
|
354
|
+
// ---------------------------------------------------------------------------
|
|
355
|
+
|
|
356
|
+
/**
 * Run cross-page SEO checks over all crawled pages.
 * Flags (1) duplicate <title> values, (2) duplicate meta descriptions, and
 * (3) canonical URLs shared by more than one page. One finding is emitted
 * per affected page, each carrying the full list of duplicate URLs.
 *
 * Refactor: the three detection passes previously triplicated the same
 * group-then-flag logic; they now share one local helper. Finding order is
 * preserved (all title findings, then descriptions, then canonicals, with
 * Map insertion order within each pass).
 *
 * @param {Array<{url: string, state: object}>} allStates - URL + PageState pairs.
 * @returns {Array<object>} Findings: { rule, severity, file, message, duplicates }.
 */
function crossPageChecks(allStates) {
  const findings = [];

  // Group page URLs by a normalized key, then emit one finding per page in
  // every group containing more than one URL. `extractKey` returns null to
  // skip a page (missing/empty state field), mirroring the original guards.
  function flagDuplicates(rule, extractKey, buildMessage) {
    const groups = new Map();
    for (const { url, state } of allStates) {
      const key = extractKey(state);
      if (key === null) continue;
      if (!groups.has(key)) groups.set(key, []);
      groups.get(key).push(url);
    }
    for (const [key, urls] of groups) {
      if (urls.length < 2) continue;
      for (const pageUrl of urls) {
        findings.push({
          rule,
          severity: RULES[rule].severity,
          file: pageUrl,
          message: buildMessage(key, urls),
          duplicates: urls,
        });
      }
    }
  }

  // Titles and descriptions compare case-insensitively; canonical URLs are
  // compared verbatim after trimming (URL paths are case-sensitive).
  flagDuplicates(
    'duplicate-title',
    (s) => (s.hasTitle && s.titleText ? s.titleText.trim().toLowerCase() : null),
    (title, urls) => `Duplicate title "${title}" shared across ${urls.length} pages`,
  );

  flagDuplicates(
    'duplicate-meta-description',
    (s) => (s.hasMetaDescription && s.metaDescriptionText ? s.metaDescriptionText.trim().toLowerCase() : null),
    (_desc, urls) => `Duplicate meta description shared across ${urls.length} pages`,
  );

  flagDuplicates(
    'canonical-conflict',
    (s) => (s.hasCanonical && s.canonicalUrl ? s.canonicalUrl.trim() : null),
    (canonical, urls) => `Multiple pages share canonical URL "${canonical}"`,
  );

  return findings;
}
|
|
430
|
+
|
|
431
|
+
// ---------------------------------------------------------------------------
|
|
432
|
+
// scanSite — crawl + analyse multiple pages
|
|
433
|
+
// ---------------------------------------------------------------------------
|
|
434
|
+
|
|
435
|
+
/**
 * Crawl and scan an entire site starting from `startUrl`.
 * Runs per-page checks on every crawled page, prepends an HTTP-error finding
 * for pages returning status >= 400, and — when more than one page was
 * fetched — adds cross-page checks (duplicate titles/descriptions, canonical
 * conflicts). Produces a single aggregated score and severity summary.
 *
 * @param {string} startUrl - Entry URL for the crawl.
 * @param {object} [options] - Passed to crawlSite (maxPages, concurrency).
 * @returns {Promise<object>} { url, pages_scanned, files_scanned, findings, scores, summary, errors }
 */
export async function scanSite(startUrl, options = {}) {
  // Step 1: crawl the site.
  const crawl = await crawlSite(startUrl, options);

  // Step 2: parse each fetched page and collect its per-page findings.
  const parsedPages = [];
  const findings = [];

  for (const page of crawl.pages) {
    const state = parseHtml(page.html);
    parsedPages.push({ url: page.url, state });

    const pageFindings = checkPage(state, page.url);

    // Surface HTTP failures ahead of the page's content findings.
    if (page.statusCode >= 400) {
      pageFindings.unshift({
        rule: 'http-error',
        severity: RULES['http-error'].severity,
        file: page.url,
        message: `HTTP ${page.statusCode} error response`,
      });
    }

    findings.push(...pageFindings);
  }

  // Step 3: cross-page analysis only makes sense with at least two pages.
  if (parsedPages.length > 1) {
    findings.push(...crossPageChecks(parsedPages));
  }

  // Step 4: deduplicated score over the combined findings.
  const seoScore = calculateScore(findings);

  // Step 5: tally findings by severity (unknown severities are ignored).
  const summary = { critical: 0, high: 0, medium: 0, low: 0 };
  for (const finding of findings) {
    if (summary[finding.severity] !== undefined) {
      summary[finding.severity] += 1;
    }
  }

  return {
    url: startUrl,
    pages_scanned: crawl.pages.length,
    // files_scanned mirrors pages_scanned for parity with the file scanner's shape.
    files_scanned: crawl.pages.length,
    findings,
    scores: { seo: seoScore },
    summary,
    errors: crawl.errors,
  };
}
|
|
496
|
+
|
|
336
497
|
// ---------------------------------------------------------------------------
|
|
337
498
|
// CLI entry point
|
|
338
499
|
// ---------------------------------------------------------------------------
|