@houseofmvps/claude-rank 1.0.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,7 +2,10 @@
2
2
  // Standalone CLI: npx claude-rank <command> <directory>
3
3
  // Commands: scan, geo, aeo, schema, fix
4
4
 
5
- const [,, command = 'scan', dir = '.'] = process.argv;
5
+ const args = process.argv.slice(2);
6
+ const jsonFlag = args.includes('--json');
7
+ const positional = args.filter(a => a !== '--json');
8
+ const [command = 'scan', dir = '.'] = positional;
6
9
 
7
10
  const commands = {
8
11
  scan: '../tools/seo-scanner.mjs',
@@ -14,7 +17,7 @@ const commands = {
14
17
  if (command === 'help' || command === '--help') {
15
18
  console.log(`claude-rank — SEO/GEO/AEO toolkit
16
19
 
17
- Usage: claude-rank <command> [directory]
20
+ Usage: claude-rank <command> [directory|url] [--json]
18
21
 
19
22
  Commands:
20
23
  scan Run core SEO scanner (default)
@@ -23,9 +26,18 @@ Commands:
23
26
  schema Detect and validate structured data
24
27
  help Show this help message
25
28
 
29
+ Flags:
30
+ --json Output raw JSON (for programmatic use)
31
+
32
+ URL scanning:
33
+ Pass a URL instead of a directory to scan a live page via HTTP.
34
+ Only the "scan" command supports URL scanning.
35
+
26
36
  Examples:
27
37
  claude-rank scan ./my-project
38
+ claude-rank scan https://savemrr.co
28
39
  npx @houseofmvps/claude-rank geo .
40
+ claude-rank scan ./site --json
29
41
  `);
30
42
  process.exit(0);
31
43
  }
@@ -36,31 +48,79 @@ if (!toolPath) {
36
48
  process.exit(1);
37
49
  }
38
50
 
51
+ // Detect if the target is a URL (http:// or https://)
52
+ const isUrl = dir.startsWith('http://') || dir.startsWith('https://');
53
+
39
54
  // Dynamic import and run the scanner on the target directory
40
55
  import { resolve } from 'path';
41
56
 
42
57
  // Clear argv before importing tool modules so their inline CLI guards don't fire.
43
58
  // The tool files check `process.argv.slice(2).length > 0` to auto-run on import.
44
59
  process.argv = process.argv.slice(0, 2);
45
- const mod = await import(new URL(toolPath, import.meta.url));
46
- const targetDir = resolve(dir);
47
-
48
- if (command === 'schema') {
49
- // schema-engine exports detectSchema (per-file) and findHtmlFiles via html-parser.
50
- // Build a directory-level result by importing the html-parser helper and scanning each file.
51
- const { findHtmlFiles } = await import(new URL('../tools/lib/html-parser.mjs', import.meta.url));
52
- const { readFileSync } = await import('node:fs');
53
- const files = findHtmlFiles(targetDir);
54
- const results = [];
55
- for (const file of files) {
56
- const html = readFileSync(file, 'utf-8');
57
- const schemas = mod.detectSchema(html);
58
- if (schemas.length > 0) {
59
- results.push({ file, schemas });
60
+
61
+ const {
62
+ formatSeoReport,
63
+ formatGeoReport,
64
+ formatAeoReport,
65
+ formatSchemaReport,
66
+ } = await import(new URL('../tools/lib/formatter.mjs', import.meta.url));
67
+
68
+ const formatters = {
69
+ scan: formatSeoReport,
70
+ geo: formatGeoReport,
71
+ aeo: formatAeoReport,
72
+ schema: formatSchemaReport,
73
+ };
74
+
75
+ // URL-based scanning (scan command only)
76
+ if (isUrl) {
77
+ if (command !== 'scan') {
78
+ console.error(`URL scanning is only supported for the "scan" command, not "${command}".`);
79
+ process.exit(1);
80
+ }
81
+
82
+ const { scanUrl } = await import(new URL('../tools/url-scanner.mjs', import.meta.url));
83
+ try {
84
+ const result = await scanUrl(dir);
85
+ if (jsonFlag) {
86
+ console.log(JSON.stringify(result, null, 2));
87
+ } else {
88
+ console.log(formatSeoReport(result));
60
89
  }
90
+ } catch (err) {
91
+ console.error(`Error scanning URL: ${err.message}`);
92
+ process.exit(1);
61
93
  }
62
- console.log(JSON.stringify(results, null, 2));
63
94
  } else {
64
- const result = mod.scanDirectory(targetDir);
65
- console.log(JSON.stringify(result, null, 2));
95
+ // Directory-based scanning
96
+ const mod = await import(new URL(toolPath, import.meta.url));
97
+ const targetDir = resolve(dir);
98
+
99
+ if (command === 'schema') {
100
+ // schema-engine exports detectSchema (per-file) and findHtmlFiles via html-parser.
101
+ // Build a directory-level result by importing the html-parser helper and scanning each file.
102
+ const { findHtmlFiles } = await import(new URL('../tools/lib/html-parser.mjs', import.meta.url));
103
+ const { readFileSync } = await import('node:fs');
104
+ const files = findHtmlFiles(targetDir);
105
+ const results = [];
106
+ for (const file of files) {
107
+ const html = readFileSync(file, 'utf-8');
108
+ const schemas = mod.detectSchema(html);
109
+ if (schemas.length > 0) {
110
+ results.push({ file, schemas });
111
+ }
112
+ }
113
+ if (jsonFlag) {
114
+ console.log(JSON.stringify(results, null, 2));
115
+ } else {
116
+ console.log(formatSchemaReport(results));
117
+ }
118
+ } else {
119
+ const result = mod.scanDirectory(targetDir);
120
+ if (jsonFlag) {
121
+ console.log(JSON.stringify(result, null, 2));
122
+ } else {
123
+ console.log(formatters[command](result));
124
+ }
125
+ }
66
126
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@houseofmvps/claude-rank",
3
- "version": "1.0.2",
3
+ "version": "1.2.0",
4
4
  "description": "The most comprehensive SEO/GEO/AEO plugin for Claude Code. Audit, fix, and dominate search — traditional and AI.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -208,7 +208,20 @@ function analyzePage(filePath) {
208
208
  * @returns {{ files_scanned, findings, scores: { aeo }, summary }}
209
209
  */
210
210
  export function scanDirectory(rootDir) {
211
- const htmlFiles = findHtmlFiles(rootDir);
211
+ let htmlFiles = findHtmlFiles(rootDir);
212
+
213
+ // If dist/build/out has HTML, exclude root index.html (Vite/webpack source template)
214
+ const hasBuildDir = htmlFiles.some(f => {
215
+ const rel = path.relative(rootDir, f);
216
+ return rel.startsWith('dist' + path.sep) || rel.startsWith('build' + path.sep) || rel.startsWith('out' + path.sep);
217
+ });
218
+ if (hasBuildDir) {
219
+ htmlFiles = htmlFiles.filter(f => {
220
+ const rel = path.relative(rootDir, f);
221
+ return rel !== 'index.html' && rel !== 'index.htm';
222
+ });
223
+ }
224
+
212
225
  const findings = [];
213
226
 
214
227
  // Per-file analyses
@@ -155,16 +155,26 @@ function parseRobotsTxt(content) {
155
155
  */
156
156
  function extractSchemaTypes(jsonLdContent) {
157
157
  const types = new Set();
158
+
159
+ function walkSchema(obj) {
160
+ if (!obj || typeof obj !== 'object') return;
161
+ if (Array.isArray(obj)) {
162
+ for (const item of obj) walkSchema(item);
163
+ return;
164
+ }
165
+ if (obj['@type']) {
166
+ const t = Array.isArray(obj['@type']) ? obj['@type'] : [obj['@type']];
167
+ for (const type of t) types.add(type);
168
+ }
169
+ // Walk all nested objects to find embedded schemas (e.g., author: { @type: "Person" })
170
+ for (const val of Object.values(obj)) {
171
+ if (val && typeof val === 'object') walkSchema(val);
172
+ }
173
+ }
174
+
158
175
  for (const raw of jsonLdContent) {
159
176
  try {
160
- const parsed = JSON.parse(raw);
161
- const items = Array.isArray(parsed) ? parsed : [parsed];
162
- for (const item of items) {
163
- if (item && item['@type']) {
164
- const t = Array.isArray(item['@type']) ? item['@type'] : [item['@type']];
165
- for (const type of t) types.add(type);
166
- }
167
- }
177
+ walkSchema(JSON.parse(raw));
168
178
  } catch {
169
179
  // Non-parseable JSON-LD — skip
170
180
  }
@@ -307,7 +317,20 @@ export function scanDirectory(rootDir) {
307
317
  // 3. Scan HTML files
308
318
  // -------------------------------------------------------------------------
309
319
 
310
- const htmlFiles = findHtmlFiles(rootDir);
320
+ let htmlFiles = findHtmlFiles(rootDir);
321
+
322
+ // If dist/build/out has HTML, exclude root index.html (Vite/webpack source template)
323
+ const hasBuildDir = htmlFiles.some(f => {
324
+ const rel = path.relative(rootDir, f);
325
+ return rel.startsWith('dist' + path.sep) || rel.startsWith('build' + path.sep) || rel.startsWith('out' + path.sep);
326
+ });
327
+ if (hasBuildDir) {
328
+ htmlFiles = htmlFiles.filter(f => {
329
+ const rel = path.relative(rootDir, f);
330
+ return rel !== 'index.html' && rel !== 'index.htm';
331
+ });
332
+ }
333
+
311
334
  let filesScanned = 0;
312
335
 
313
336
  // Aggregate data across all pages
@@ -0,0 +1,173 @@
1
+ /**
2
+ * formatter.mjs — Pretty terminal output for claude-rank CLI reports.
3
+ * No external dependencies — uses raw ANSI escape codes.
4
+ */
5
+
6
/**
 * ANSI color helpers — each wraps a string in an SGR escape pair.
 * No external dependencies.
 */
const c = {
  red: s => `\x1b[31m${s}\x1b[0m`,
  yellow: s => `\x1b[33m${s}\x1b[0m`,
  green: s => `\x1b[32m${s}\x1b[0m`,
  cyan: s => `\x1b[36m${s}\x1b[0m`,
  bold: s => `\x1b[1m${s}\x1b[0m`,
  dim: s => `\x1b[2m${s}\x1b[0m`,
};

// Width of the score progress bar, in characters.
const BAR_WIDTH = 15;

/** Map a 0-100 score to a colored qualitative label. */
function scoreLabel(score) {
  if (score >= 90) return c.green('EXCELLENT');
  if (score >= 80) return c.green('GOOD');
  if (score >= 60) return c.yellow('NEEDS WORK');
  return c.red('POOR');
}

/**
 * Render a filled/empty block bar for a 0-100 score.
 * The fill count is clamped so out-of-range scores cannot produce a negative
 * repeat count (String.prototype.repeat throws RangeError on negatives).
 */
function scoreBar(score) {
  const filled = Math.min(BAR_WIDTH, Math.max(0, Math.round((score / 100) * BAR_WIDTH)));
  return '\u2588'.repeat(filled) + '\u2591'.repeat(BAR_WIDTH - filled);
}

/** Pick the color function for a finding severity. */
function severityColor(severity) {
  if (severity === 'critical' || severity === 'high') return c.red;
  if (severity === 'medium') return c.yellow;
  return c.dim;
}

/**
 * Pad `str` with trailing spaces to a VISIBLE width of `len`.
 * ANSI escape codes are stripped before measuring, so callers pass the
 * desired on-screen width directly — never compensate for escape-code length.
 */
function pad(str, len) {
  const visible = str.replace(/\x1b\[[0-9;]*m/g, '');
  return str + ' '.repeat(Math.max(0, len - visible.length));
}

/**
 * Group findings by rule, aggregating affected files; the first finding of a
 * rule supplies the group's severity and message.
 */
function groupFindings(findings) {
  const groups = new Map();
  for (const f of findings) {
    if (!groups.has(f.rule)) {
      groups.set(f.rule, {
        rule: f.rule,
        severity: f.severity,
        message: f.message,
        files: [],
      });
    }
    const g = groups.get(f.rule);
    if (f.file && !g.files.includes(f.file)) {
      g.files.push(f.file);
    }
  }
  return [...groups.values()];
}

/** Render up to `max` file names, then "+N more" for the remainder. */
function formatFileList(files, max = 3) {
  if (files.length === 0) return '';
  const shown = files.slice(0, max);
  const rest = files.length - max;
  let out = shown.join(', ');
  if (rest > 0) out += `, +${rest} more`;
  return out;
}

// Sort order for finding groups (most severe first); unknown severities last.
const SEVERITY_ORDER = { critical: 0, high: 1, medium: 2, low: 3 };

/**
 * Format a scanner report (SEO, GEO, or AEO) with a box header and grouped
 * findings.
 *
 * BUGFIX: every box row is padded to exactly W visible columns. The previous
 * code added fudge factors (W + 9, W + 22, 10 + 9) to compensate for ANSI
 * escape-code length, but pad() already strips escapes before measuring, so
 * the compensation made colored rows overflow the box border by 9-22 columns.
 *
 * @param {object} result — scanner output { files_scanned, findings, scores, summary }
 *   (or { skipped, reason } for skipped scans)
 * @param {string} title — box title line
 * @param {string} scoreKey — key into result.scores ('seo' | 'geo' | 'aeo')
 * @returns {string} multi-line ANSI-colored report
 */
function formatReport(result, title, scoreKey) {
  if (result.skipped) {
    return c.yellow(`Skipped: ${result.reason}`);
  }

  const score = result.scores[scoreKey];
  const { files_scanned, findings, summary } = result;
  const groups = groupFindings(findings);
  groups.sort((a, b) => (SEVERITY_ORDER[a.severity] ?? 9) - (SEVERITY_ORDER[b.severity] ?? 9));

  const W = 48; // inner width of the box, in visible columns
  const hr = '\u2550'.repeat(W);
  const lines = [];

  lines.push(`\u2554${hr}\u2557`);
  lines.push(`\u2551${pad(c.bold(` ${title}`), W)}\u2551`);
  lines.push(`\u2560${hr}\u2563`);

  const barStr = ` Score: ${score}/100 ${scoreBar(score)} ${scoreLabel(score)}`;
  lines.push(`\u2551${pad(barStr, W)}\u2551`);
  lines.push(`\u2560${hr}\u2563`);

  lines.push(`\u2551${pad(` Files scanned: ${files_scanned}`, W)}\u2551`);
  lines.push(`\u2551${pad(` Findings: ${findings.length}`, W)}\u2551`);
  const countsLine = ` Critical: ${summary.critical} High: ${summary.high} Medium: ${summary.medium} Low: ${summary.low}`;
  lines.push(`\u2551${pad(countsLine, W)}\u2551`);
  lines.push(`\u255A${hr}\u255D`);
  lines.push('');

  if (groups.length === 0) {
    lines.push(c.green('No findings — looking great!'));
    return lines.join('\n');
  }

  lines.push(c.bold('Findings:'));
  for (const g of groups) {
    const colorFn = severityColor(g.severity);
    // Fixed-width severity label column (10 visible columns).
    const tag = pad(colorFn(g.severity.toUpperCase()), 10);
    const countSuffix = g.files.length > 1 ? ` (${g.files.length} pages)` : '';
    lines.push(` ${tag}${c.bold(g.rule)}${c.dim(countSuffix)}`);
    lines.push(` ${g.message}`);
    if (g.files.length > 0) {
      lines.push(` ${c.dim('Files: ' + formatFileList(g.files))}`);
    }
    lines.push('');
  }

  return lines.join('\n');
}

/** Format the core SEO scanner result for the terminal. */
export function formatSeoReport(result) {
  return formatReport(result, 'claude-rank SEO Audit', 'seo');
}

/** Format the GEO scanner result for the terminal. */
export function formatGeoReport(result) {
  return formatReport(result, 'claude-rank GEO Audit', 'geo');
}

/** Format the AEO scanner result for the terminal. */
export function formatAeoReport(result) {
  return formatReport(result, 'claude-rank AEO Audit', 'aeo');
}

/**
 * Format schema detection results for the terminal.
 * @param {Array<{file: string, schemas: Array<object>}>} results
 * @returns {string} multi-line ANSI-colored report
 */
export function formatSchemaReport(results) {
  if (!results || results.length === 0) {
    return c.yellow('No structured data (JSON-LD, Microdata, RDFa) detected.');
  }

  const lines = [];
  const W = 48;
  const hr = '\u2550'.repeat(W);

  lines.push(`\u2554${hr}\u2557`);
  // Pad to W visible columns — pad() strips ANSI codes before measuring.
  lines.push(`\u2551${pad(c.bold(' claude-rank Schema Report'), W)}\u2551`);
  lines.push(`\u2560${hr}\u2563`);
  lines.push(`\u2551${pad(` Files with schemas: ${results.length}`, W)}\u2551`);
  const totalSchemas = results.reduce((n, r) => n + r.schemas.length, 0);
  lines.push(`\u2551${pad(` Total schemas found: ${totalSchemas}`, W)}\u2551`);
  lines.push(`\u255A${hr}\u255D`);
  lines.push('');

  for (const r of results) {
    lines.push(c.bold(r.file));
    for (const s of r.schemas) {
      const type = s.type || s['@type'] || 'Unknown';
      const format = s.format || 'JSON-LD';
      lines.push(` ${c.cyan(type)} ${c.dim(`(${format})`)}`);
    }
    lines.push('');
  }

  return lines.join('\n');
}
@@ -127,6 +127,8 @@ export function parseHtml(htmlString) {
127
127
  let currentHeadingLevel = 0;
128
128
  let isJsonLd = false;
129
129
  let currentHeadingText = '';
130
+ let currentScriptSrc = '';
131
+ let inlineScriptBuffer = '';
130
132
  let bodyTextBuffer = '';
131
133
 
132
134
  const parser = new Parser(
@@ -252,8 +254,9 @@ export function parseHtml(htmlString) {
252
254
  }
253
255
 
254
256
  // Count total and deferred scripts
257
+ // type="module" is deferred by default per HTML spec
255
258
  state.totalScripts++;
256
- if (attribs.async !== undefined || attribs.defer !== undefined) {
259
+ if (attribs.async !== undefined || attribs.defer !== undefined || scriptType === 'module') {
257
260
  state.deferredScripts++;
258
261
  }
259
262
 
@@ -269,6 +272,7 @@ export function parseHtml(htmlString) {
269
272
  }
270
273
 
271
274
  inScript = true;
275
+ currentScriptSrc = src;
272
276
  return;
273
277
  }
274
278
 
@@ -349,6 +353,12 @@ export function parseHtml(htmlString) {
349
353
  return;
350
354
  }
351
355
 
356
+ // Inline script content — accumulate for analytics detection
357
+ if (inScript && !isJsonLd) {
358
+ inlineScriptBuffer += text;
359
+ return;
360
+ }
361
+
352
362
  // Body text (skip script/style)
353
363
  if (inBody && !inScript && !inStyle) {
354
364
  bodyTextBuffer += text + ' ';
@@ -372,7 +382,19 @@ export function parseHtml(htmlString) {
372
382
  state.jsonLdScripts++;
373
383
  isJsonLd = false;
374
384
  }
385
+ // Check inline script content for analytics patterns (catches lazy-loaded GA etc.)
386
+ if (!state.hasAnalytics && !currentScriptSrc && inlineScriptBuffer) {
387
+ for (const { pattern, provider } of ANALYTICS_PATTERNS) {
388
+ if (inlineScriptBuffer.includes(pattern)) {
389
+ state.hasAnalytics = true;
390
+ state.analyticsProvider = provider;
391
+ break;
392
+ }
393
+ }
394
+ }
375
395
  inScript = false;
396
+ currentScriptSrc = '';
397
+ inlineScriptBuffer = '';
376
398
  return;
377
399
  }
378
400
 
@@ -451,7 +473,9 @@ export async function parseHtmlFile(filePath) {
451
473
  // findHtmlFiles — recursively find .html/.htm files
452
474
  // ---------------------------------------------------------------------------
453
475
 
454
- const SKIP_DIRS = new Set(['node_modules', '.git', '.next', '.nuxt', '.svelte-kit', '.cache', '.turbo']);
476
+ const SKIP_DIRS = new Set(['node_modules', '.git', '.next', '.nuxt', '.svelte-kit', '.cache', '.turbo', 'public']);
477
+ // Files that look like HTML but aren't real pages (e.g., Google/Bing site verification)
478
+ const SKIP_FILE_PATTERNS = [/^google[a-f0-9]+\.html$/, /^bing[a-f0-9]+\.html$/, /^yandex_[a-f0-9]+\.html$/];
455
479
 
456
480
  /**
457
481
  * Recursively find all .html/.htm files under a directory.
@@ -479,6 +503,8 @@ export function findHtmlFiles(dir) {
479
503
  } else if (entry.isFile()) {
480
504
  const ext = path.extname(entry.name).toLowerCase();
481
505
  if (ext === '.html' || ext === '.htm') {
506
+ // Skip search engine verification files
507
+ if (SKIP_FILE_PATTERNS.some(p => p.test(entry.name))) continue;
482
508
  results.push(fullPath);
483
509
  }
484
510
  }
@@ -0,0 +1,79 @@
1
+ /**
2
+ * url-fetcher.mjs — Fetch a live URL with SSRF protection and size limits.
3
+ * Uses Node.js built-in fetch() (Node 18+). No external dependencies.
4
+ */
5
+
6
+ import { validateUrl, createResponseAccumulator } from './security.mjs';
7
+
8
+ const USER_AGENT = 'claude-rank/1.1.0 (https://github.com/Houseofmvps/claude-rank)';
9
+ const TIMEOUT_MS = 15_000;
10
+
11
/**
 * Fetch a page by URL with SSRF protection, a hard deadline, and size limits.
 *
 * @param {string} url — the URL to fetch
 * @returns {Promise<{ html: string, url: string, statusCode: number, redirected: boolean, finalUrl: string }>}
 * @throws {Error} when the URL fails SSRF validation, the request times out,
 *   the response is not HTML, or the network request fails
 */
export async function fetchPage(url) {
  // 1. SSRF validation — reject blocked targets before any network I/O.
  const validation = validateUrl(url);
  if (!validation.valid) {
    throw new Error(`URL blocked: ${validation.reason}`);
  }

  // 2. One deadline for the WHOLE exchange (headers AND body). The previous
  //    version cleared the timer as soon as headers arrived, so a server that
  //    sent headers quickly but trickled the body could hang the CLI forever.
  //    The timer is now cleared in `finally` once the body has been read.
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), TIMEOUT_MS);
  const timeoutError = () =>
    new Error(`Request timed out after ${TIMEOUT_MS / 1000}s: ${url}`);

  try {
    let response;
    try {
      response = await fetch(url, {
        signal: controller.signal,
        headers: {
          'User-Agent': USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml,*/*',
        },
        redirect: 'follow',
      });
    } catch (err) {
      if (err.name === 'AbortError') {
        throw timeoutError();
      }
      throw new Error(`Fetch failed for ${url}: ${err.message}`);
    }

    // 3. Check Content-Type — only scan HTML responses.
    const contentType = response.headers.get('content-type') || '';
    if (!contentType.includes('text/html') && !contentType.includes('application/xhtml+xml')) {
      throw new Error(`Not an HTML page (Content-Type: ${contentType}): ${url}`);
    }

    // 4. Read body with size limits using the response accumulator.
    const accumulator = createResponseAccumulator();
    try {
      // Use response.body (ReadableStream) for streaming size control so the
      // size cap can cut the download short mid-stream.
      if (response.body && typeof response.body[Symbol.asyncIterator] === 'function') {
        const decoder = new TextDecoder();
        for await (const chunk of response.body) {
          accumulator.onData(decoder.decode(chunk, { stream: true }));
          if (accumulator.isTruncated()) break;
        }
        // Flush the decoder: streaming decode may hold back a partial
        // multi-byte character at the final chunk boundary.
        accumulator.onData(decoder.decode());
      } else {
        // Fallback for environments where body isn't async-iterable.
        accumulator.onData(await response.text());
      }
    } catch (err) {
      // An abort during the body read is the deadline firing, not a protocol
      // error — surface it with the same timeout message as a header timeout.
      if (err.name === 'AbortError') {
        throw timeoutError();
      }
      throw new Error(`Fetch failed for ${url}: ${err.message}`);
    }

    return {
      html: accumulator.getBody(),
      url,
      statusCode: response.status,
      redirected: response.redirected,
      finalUrl: response.url,
    };
  } finally {
    clearTimeout(timeoutId);
  }
}
@@ -436,7 +436,19 @@ function calculateScore(findings) {
436
436
  */
437
437
  export function scanDirectory(rootDir) {
438
438
  const absRoot = path.resolve(rootDir);
439
- const htmlFiles = findHtmlFiles(absRoot);
439
+ let htmlFiles = findHtmlFiles(absRoot);
440
+
441
+ // If dist/ or build/ has HTML, exclude root index.html (Vite/webpack source template)
442
+ const hasBuildDir = htmlFiles.some(f => {
443
+ const rel = path.relative(absRoot, f);
444
+ return rel.startsWith('dist' + path.sep) || rel.startsWith('build' + path.sep) || rel.startsWith('out' + path.sep);
445
+ });
446
+ if (hasBuildDir) {
447
+ htmlFiles = htmlFiles.filter(f => {
448
+ const rel = path.relative(absRoot, f);
449
+ return rel !== 'index.html' && rel !== 'index.htm';
450
+ });
451
+ }
440
452
 
441
453
  // Backend-only detection
442
454
  if (isBackendOnlyProject(absRoot, htmlFiles)) {
@@ -0,0 +1,348 @@
1
+ /**
2
+ * url-scanner.mjs — Scan a live URL for SEO issues.
3
+ * Fetches HTML from a URL and runs the same per-page analysis as seo-scanner.
4
+ * Cross-page rules (duplicates, orphans, canonicals) are skipped for single-URL scans.
5
+ */
6
+
7
+ import { parseHtml } from './lib/html-parser.mjs';
8
+ import { fetchPage } from './lib/url-fetcher.mjs';
9
+
10
+ // ---------------------------------------------------------------------------
11
+ // Rule definitions (same as seo-scanner, minus cross-page-only rules)
12
+ // ---------------------------------------------------------------------------
13
+
14
// Rule table: maps rule id -> { severity bucket, score deduction }.
// Each triggered rule's `deduction` is subtracted from 100 once per rule
// (not per occurrence); severities drive the report's summary counts.
const RULES = {
  // Critical
  'has-noindex': { severity: 'critical', deduction: 20 },
  'canonical-points-elsewhere': { severity: 'critical', deduction: 20 },

  // High
  'missing-title': { severity: 'high', deduction: 10 },
  'missing-meta-description': { severity: 'high', deduction: 10 },
  'missing-h1': { severity: 'high', deduction: 10 },
  'thin-content': { severity: 'high', deduction: 10 },
  'missing-lang': { severity: 'high', deduction: 10 },

  // Medium
  'title-too-long': { severity: 'medium', deduction: 5 },
  'title-too-short': { severity: 'medium', deduction: 5 },
  'meta-description-too-long': { severity: 'medium', deduction: 5 },
  'meta-description-too-short': { severity: 'medium', deduction: 5 },
  'missing-viewport': { severity: 'medium', deduction: 5 },
  'missing-charset': { severity: 'medium', deduction: 5 },
  'missing-og-title': { severity: 'medium', deduction: 5 },
  'missing-og-description': { severity: 'medium', deduction: 5 },
  'missing-og-image': { severity: 'medium', deduction: 5 },
  'missing-canonical': { severity: 'medium', deduction: 5 },
  'multiple-h1': { severity: 'medium', deduction: 5 },
  'skipped-heading-level': { severity: 'medium', deduction: 5 },
  'images-missing-alt': { severity: 'medium', deduction: 5 },
  'images-missing-dimensions': { severity: 'medium', deduction: 5 },
  'missing-main-landmark': { severity: 'medium', deduction: 5 },
  'missing-json-ld': { severity: 'medium', deduction: 5 },
  'missing-favicon': { severity: 'medium', deduction: 5 },
  'no-analytics': { severity: 'medium', deduction: 5 },

  // Low
  'missing-og-url': { severity: 'low', deduction: 2 },
  'missing-twitter-card': { severity: 'low', deduction: 2 },
  'missing-twitter-image': { severity: 'low', deduction: 2 },
  'missing-nav-landmark': { severity: 'low', deduction: 2 },
  'missing-footer-landmark': { severity: 'low', deduction: 2 },
  'no-manifest': { severity: 'low', deduction: 2 },
  'all-scripts-blocking': { severity: 'low', deduction: 2 },

  // HTTP-level rules (URL-scan only)
  'http-error': { severity: 'critical', deduction: 20 },
  'redirect-detected': { severity: 'low', deduction: 2 },
};
59
+
60
+ // ---------------------------------------------------------------------------
61
+ // Per-page rule checks (reused from seo-scanner logic)
62
+ // ---------------------------------------------------------------------------
63
+
64
/**
 * Run every single-page SEO rule check against a parsed page state.
 *
 * Cross-page rules (duplicates, orphans, canonical graphs) are intentionally
 * absent — this module analyses one page at a time.
 *
 * @param {object} state — per-page flags/counters produced by parseHtml()
 *   (hasTitle, h1Count, wordCount, imagesWithoutAlt, …)
 * @param {string} pageUrl — URL (or path) recorded on each finding's `file`
 * @returns {Array<{rule: string, severity: string, file: string, message: string}>}
 */
function checkPage(state, pageUrl) {
  const findings = [];

  // Record one finding; severity is looked up in the RULES table.
  function add(rule, message, context = {}) {
    const def = RULES[rule];
    findings.push({
      rule,
      severity: def.severity,
      file: pageUrl,
      message,
      ...context,
    });
  }

  // Critical
  if (state.hasNoindex) {
    add('has-noindex', 'Page has noindex directive — will be excluded from search engines');
  }

  if (state.hasCanonical && state.canonicalUrl) {
    const canonical = state.canonicalUrl.trim();
    // For URL scans: flag if canonical points to a completely different domain.
    // Relative canonicals are skipped — they cannot point off-domain.
    if (canonical.startsWith('http://') || canonical.startsWith('https://')) {
      try {
        const canonicalHost = new URL(canonical).hostname;
        const pageHost = new URL(pageUrl).hostname;
        if (canonicalHost !== pageHost) {
          add('canonical-points-elsewhere', `Canonical URL "${canonical}" points to a different domain`);
        }
      } catch {
        // Invalid canonical URL (or non-URL pageUrl) — skip this check
      }
    }
  }

  // High
  if (!state.hasTitle) {
    add('missing-title', 'Page is missing a <title> tag');
  }

  if (!state.hasMetaDescription) {
    add('missing-meta-description', 'Page is missing a meta description');
  }

  if (state.h1Count === 0) {
    add('missing-h1', 'Page has no <h1> heading');
  }

  // NOTE(review): wordCount === 0 skips the thin-content rule — presumably it
  // means "no body text parsed" rather than a genuinely empty page; confirm.
  if (state.wordCount > 0 && state.wordCount < 300) {
    add('thin-content', `Page has only ${state.wordCount} words (minimum recommended: 300)`);
  }

  if (!state.hasLang) {
    add('missing-lang', 'HTML element is missing a lang attribute');
  }

  // Medium
  if (state.hasTitle && state.titleText.length > 60) {
    add('title-too-long', `Title is ${state.titleText.length} chars (max recommended: 60)`);
  }

  if (state.hasTitle && state.titleText.length < 20) {
    add('title-too-short', `Title is only ${state.titleText.length} chars (min recommended: 20)`);
  }

  if (state.hasMetaDescription && state.metaDescriptionText.length > 160) {
    add('meta-description-too-long', `Meta description is ${state.metaDescriptionText.length} chars (max recommended: 160)`);
  }

  if (state.hasMetaDescription && state.metaDescriptionText.length > 0 && state.metaDescriptionText.length < 70) {
    add('meta-description-too-short', `Meta description is only ${state.metaDescriptionText.length} chars (min recommended: 70)`);
  }

  if (!state.hasViewport) {
    add('missing-viewport', 'Page is missing a viewport meta tag');
  }

  if (!state.hasCharset) {
    add('missing-charset', 'Page is missing a charset declaration');
  }

  if (!state.hasOgTitle) {
    add('missing-og-title', 'Page is missing og:title Open Graph tag');
  }

  if (!state.hasOgDescription) {
    add('missing-og-description', 'Page is missing og:description Open Graph tag');
  }

  if (!state.hasOgImage) {
    add('missing-og-image', 'Page is missing og:image Open Graph tag');
  }

  if (!state.hasCanonical) {
    add('missing-canonical', 'Page is missing a canonical link tag');
  }

  if (state.h1Count > 1) {
    add('multiple-h1', `Page has ${state.h1Count} <h1> tags (should have exactly 1)`);
  }

  // Only the FIRST skipped heading level is reported (break after one finding).
  if (state.headingLevels.length > 1) {
    for (let i = 1; i < state.headingLevels.length; i++) {
      if (state.headingLevels[i] - state.headingLevels[i - 1] > 1) {
        add('skipped-heading-level', `Heading level skipped: h${state.headingLevels[i - 1]} → h${state.headingLevels[i]}`);
        break;
      }
    }
  }

  if (state.imagesWithoutAlt > 0) {
    add('images-missing-alt', `${state.imagesWithoutAlt} image(s) missing alt attribute`);
  }

  if (state.imagesWithoutDimensions > 0) {
    add('images-missing-dimensions', `${state.imagesWithoutDimensions} image(s) missing width/height attributes`);
  }

  if (!state.hasMain) {
    add('missing-main-landmark', 'Page is missing a <main> landmark element');
  }

  if (state.jsonLdScripts === 0) {
    add('missing-json-ld', 'Page has no JSON-LD structured data');
  }

  if (!state.hasFavicon) {
    add('missing-favicon', 'Page is missing a favicon link');
  }

  if (!state.hasAnalytics) {
    add('no-analytics', 'No analytics provider detected on this page');
  }

  // Low
  if (!state.hasOgUrl) {
    add('missing-og-url', 'Page is missing og:url Open Graph tag');
  }

  if (!state.hasTwitterCard) {
    add('missing-twitter-card', 'Page is missing twitter:card meta tag');
  }

  if (!state.hasTwitterImage) {
    add('missing-twitter-image', 'Page is missing twitter:image meta tag');
  }

  if (!state.hasNav) {
    add('missing-nav-landmark', 'Page is missing a <nav> landmark element');
  }

  if (!state.hasFooter) {
    add('missing-footer-landmark', 'Page is missing a <footer> landmark element');
  }

  if (!state.hasManifest) {
    add('no-manifest', 'Page is missing a web app manifest link');
  }

  if (state.totalScripts > 0 && state.deferredScripts === 0) {
    add('all-scripts-blocking', `All ${state.totalScripts} script(s) are render-blocking (no async/defer)`);
  }

  return findings;
}
229
+
230
+ // ---------------------------------------------------------------------------
231
+ // Score calculation
232
+ // ---------------------------------------------------------------------------
233
+
234
/**
 * Compute the 0-100 SEO score for a set of findings.
 * Each triggered rule's deduction is applied exactly once — repeated findings
 * for the same rule do not stack. The result never goes below zero.
 *
 * @param {Array<{rule: string}>} findings
 * @returns {number} score in [0, 100]
 */
function calculateScore(findings) {
  const uniqueRules = new Set(findings.map((finding) => finding.rule));
  let totalDeduction = 0;
  uniqueRules.forEach((rule) => {
    const def = RULES[rule];
    if (def) {
      totalDeduction += def.deduction;
    }
  });
  return Math.max(0, 100 - totalDeduction);
}
245
+
246
+ // ---------------------------------------------------------------------------
247
+ // scanHtml — analyse raw HTML (for testing without HTTP)
248
+ // ---------------------------------------------------------------------------
249
+
250
/**
 * Analyse an HTML string as if it were fetched from the given URL.
 * Same analysis as scanUrl but takes HTML directly (no network request).
 * @param {string} html — raw HTML string
 * @param {string} [url='https://example.com'] — URL for context in findings
 * @returns {object} { url, findings, scores, summary }
 */
export function scanHtml(html, url = 'https://example.com') {
  const pageState = parseHtml(html);
  const findings = checkPage(pageState, url);

  // Tally findings per severity bucket; unknown severities are ignored.
  const summary = { critical: 0, high: 0, medium: 0, low: 0 };
  findings.forEach((finding) => {
    if (summary[finding.severity] !== undefined) {
      summary[finding.severity] += 1;
    }
  });

  return {
    url,
    findings,
    scores: { seo: calculateScore(findings) },
    summary,
  };
}
277
+
278
+ // ---------------------------------------------------------------------------
279
+ // scanUrl — fetch + analyse
280
+ // ---------------------------------------------------------------------------
281
+
282
/**
 * Fetch a live URL and run SEO analysis on the returned HTML.
 *
 * Per-page findings are attributed to the post-redirect URL (page.finalUrl);
 * the redirect finding itself references the originally requested URL.
 *
 * @param {string} url — the URL to scan
 * @returns {Promise<object>} { url, findings, scores, summary, http }
 * @throws propagates fetchPage errors (SSRF block, timeout, non-HTML, network)
 */
export async function scanUrl(url) {
  const page = await fetchPage(url);

  const state = parseHtml(page.html);
  const findings = checkPage(state, page.finalUrl);

  // HTTP-level checks
  // 4xx/5xx response: unshift so the most severe issue is listed first.
  if (page.statusCode >= 400) {
    const def = RULES['http-error'];
    findings.unshift({
      rule: 'http-error',
      severity: def.severity,
      file: page.finalUrl,
      message: `HTTP ${page.statusCode} error response`,
    });
  }

  if (page.redirected) {
    const def = RULES['redirect-detected'];
    findings.push({
      rule: 'redirect-detected',
      severity: def.severity,
      file: url,
      message: `URL redirected: ${url} → ${page.finalUrl}`,
    });
  }

  // Score is computed AFTER the HTTP-level findings so they deduct too.
  const seoScore = calculateScore(findings);

  // Tally findings per severity bucket; unknown severities are ignored.
  const summary = { critical: 0, high: 0, medium: 0, low: 0 };
  for (const f of findings) {
    if (summary[f.severity] !== undefined) {
      summary[f.severity]++;
    }
  }

  return {
    url: page.finalUrl,
    findings,
    scores: { seo: seoScore },
    summary,
    // Raw HTTP metadata for programmatic consumers (--json output).
    http: {
      statusCode: page.statusCode,
      redirected: page.redirected,
      finalUrl: page.finalUrl,
    },
  };
}
335
+
336
+ // ---------------------------------------------------------------------------
337
+ // CLI entry point
338
+ // ---------------------------------------------------------------------------
339
+
340
// Auto-run guard: when invoked directly with an argument
// (e.g. `node url-scanner.mjs https://example.com`), scan that URL and print
// the raw JSON result. The bin CLI wrapper clears process.argv before
// importing this module precisely so this block does NOT fire on import.
const args = process.argv.slice(2);
if (args.length > 0) {
  scanUrl(args[0]).then(result => {
    console.log(JSON.stringify(result, null, 2));
  }).catch(err => {
    // Any scan failure (SSRF block, timeout, non-HTML) exits non-zero.
    console.error(`Error: ${err.message}`);
    process.exit(1);
  });
}