seo-intel 1.5.21 → 1.5.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +26 -0
- package/analyses/aeo/scorer.js +60 -6
- package/analyses/templates/index.js +1 -1
- package/analysis/prompt-builder.js +167 -2
- package/analysis/technical-audit.js +177 -0
- package/cli.js +246 -64
- package/crawler/index.js +36 -2
- package/crawler/sitemap.js +44 -0
- package/db/db.js +62 -9
- package/db/schema.sql +19 -0
- package/exports/queries.js +32 -0
- package/exports/technical.js +181 -1
- package/extractor/qwen.js +135 -13
- package/lib/scan-export.js +33 -9
- package/package.json +1 -1
- package/reports/generate-html.js +27 -6
- package/server.js +25 -8
- package/setup/checks.js +65 -5
- package/setup/engine.js +1 -0
- package/setup/web-routes.js +22 -3
- package/setup/wizard.html +8 -6
package/cli.js
CHANGED
|
@@ -39,6 +39,7 @@ import {
|
|
|
39
39
|
getCompetitorSummary, getKeywordMatrix, getHeadingStructure,
|
|
40
40
|
getPageHash, getSchemasByProject,
|
|
41
41
|
upsertInsightsFromAnalysis, upsertInsightsFromKeywords,
|
|
42
|
+
upsertSitemapUrls,
|
|
42
43
|
} from './db/db.js';
|
|
43
44
|
import { generateMultiDashboard } from './reports/generate-html.js';
|
|
44
45
|
import { buildTechnicalActions } from './exports/technical.js';
|
|
@@ -73,13 +74,13 @@ function resolveExtractionRuntime(config) {
|
|
|
73
74
|
const norm = h => String(h || '').trim().replace(/\/+$/, '');
|
|
74
75
|
|
|
75
76
|
const candidates = [
|
|
76
|
-
{ host: norm(primaryUrl), model: String(primaryModel).trim() || 'gemma4:e4b' },
|
|
77
|
+
{ host: norm(primaryUrl), model: String(primaryModel).trim() || 'gemma4:e4b', type: 'ollama' },
|
|
77
78
|
];
|
|
78
79
|
|
|
79
80
|
// Legacy single fallback — always use project-selected model, not OLLAMA_FALLBACK_MODEL
|
|
80
81
|
const fallbackUrl = norm(process.env.OLLAMA_FALLBACK_URL || '');
|
|
81
82
|
if (fallbackUrl && !candidates.some(c => c.host === fallbackUrl)) {
|
|
82
|
-
candidates.push({ host: fallbackUrl, model: String(primaryModel).trim() || 'gemma4:e4b' });
|
|
83
|
+
candidates.push({ host: fallbackUrl, model: String(primaryModel).trim() || 'gemma4:e4b', type: 'ollama' });
|
|
83
84
|
}
|
|
84
85
|
|
|
85
86
|
// OLLAMA_HOSTS — comma-separated LAN hosts from setup wizard
|
|
@@ -87,13 +88,20 @@ function resolveExtractionRuntime(config) {
|
|
|
87
88
|
for (const h of process.env.OLLAMA_HOSTS.split(',')) {
|
|
88
89
|
const host = norm(h);
|
|
89
90
|
if (host && !candidates.some(c => c.host === host)) {
|
|
90
|
-
candidates.push({ host, model: String(primaryModel).trim() || 'gemma4:e4b' });
|
|
91
|
+
candidates.push({ host, model: String(primaryModel).trim() || 'gemma4:e4b', type: 'ollama' });
|
|
91
92
|
}
|
|
92
93
|
}
|
|
93
94
|
}
|
|
94
95
|
|
|
95
96
|
if (!candidates.some(candidate => candidate.host === localhost)) {
|
|
96
|
-
candidates.push({ host: localhost, model: String(primaryModel).trim() || 'gemma4:e4b' });
|
|
97
|
+
candidates.push({ host: localhost, model: String(primaryModel).trim() || 'gemma4:e4b', type: 'ollama' });
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
// LM Studio — always probe default port; env vars override URL/model
|
|
101
|
+
const lmStudioUrl = norm(process.env.LMSTUDIO_URL || '') || 'http://localhost:1234';
|
|
102
|
+
const lmStudioModel = String(process.env.LMSTUDIO_MODEL || '').trim();
|
|
103
|
+
if (!candidates.some(c => c.host === lmStudioUrl)) {
|
|
104
|
+
candidates.push({ host: lmStudioUrl, model: lmStudioModel, type: 'lmstudio' });
|
|
97
105
|
}
|
|
98
106
|
|
|
99
107
|
const seen = new Set();
|
|
@@ -115,33 +123,55 @@ function applyExtractionRuntimeConfig(config) {
|
|
|
115
123
|
// ── AI AVAILABILITY PREFLIGHT ────────────────────────────────────────────
|
|
116
124
|
/**
|
|
117
125
|
* Check if any AI extraction backend is reachable.
|
|
118
|
-
* Tries: primary Ollama → fallback Ollama → returns false.
|
|
126
|
+
* Tries: primary Ollama → fallback Ollama → LM Studio → returns false.
|
|
119
127
|
* Fast: 2s timeout per host, runs sequentially.
|
|
120
128
|
*/
|
|
121
129
|
async function checkOllamaAvailability(config) {
|
|
122
130
|
const candidates = resolveExtractionRuntime(config);
|
|
123
|
-
let
|
|
131
|
+
let sawOllamaHostNoModel = false;
|
|
124
132
|
|
|
125
133
|
for (const candidate of candidates) {
|
|
126
134
|
try {
|
|
127
135
|
const controller = new AbortController();
|
|
128
136
|
const timeout = setTimeout(() => controller.abort(), 2000);
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
const
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
137
|
+
|
|
138
|
+
if (candidate.type === 'lmstudio') {
|
|
139
|
+
// LM Studio: GET /api/v1/models
|
|
140
|
+
const res = await fetch(`${candidate.host}/api/v1/models`, { signal: controller.signal });
|
|
141
|
+
clearTimeout(timeout);
|
|
142
|
+
if (res.ok) {
|
|
143
|
+
const data = await res.json().catch(() => ({ data: [] }));
|
|
144
|
+
const models = (data.data || []).map(m => m.id || m.model).filter(Boolean);
|
|
145
|
+
// Accept any loaded model when no specific model was requested
|
|
146
|
+
if (!candidate.model || models.some(id => id === candidate.model || id.endsWith('/' + candidate.model))) {
|
|
147
|
+
console.log(chalk.dim(` LM Studio: ${candidate.host} ✓ (${models[0] || 'model loaded'})`));
|
|
148
|
+
return true;
|
|
149
|
+
}
|
|
150
|
+
if (models.length > 0) {
|
|
151
|
+
// Model mismatch but something is loaded — still usable
|
|
152
|
+
console.log(chalk.dim(` LM Studio: ${candidate.host} ✓ (using ${models[0]})`));
|
|
153
|
+
return true;
|
|
154
|
+
}
|
|
155
|
+
console.log(chalk.yellow(` ⚠️ LM Studio reachable but no models loaded`));
|
|
156
|
+
console.log(chalk.dim(` Load a model in LM Studio to enable extraction`));
|
|
157
|
+
}
|
|
158
|
+
} else {
|
|
159
|
+
// Ollama
|
|
160
|
+
const res = await fetch(`${candidate.host}/api/tags`, { signal: controller.signal });
|
|
161
|
+
clearTimeout(timeout);
|
|
162
|
+
if (res.ok) {
|
|
163
|
+
const data = await res.json();
|
|
164
|
+
const models = (data.models || []).map(m => m.name);
|
|
165
|
+
sawOllamaHostNoModel = true;
|
|
166
|
+
const hasModel = models.some(m => m && m.split(':')[0] === candidate.model.split(':')[0]);
|
|
167
|
+
if (hasModel) return true;
|
|
138
168
|
}
|
|
139
169
|
}
|
|
140
170
|
} catch { /* host unreachable, try next */ }
|
|
141
171
|
}
|
|
142
172
|
|
|
143
|
-
if (
|
|
144
|
-
const primary = candidates[0];
|
|
173
|
+
if (sawOllamaHostNoModel) {
|
|
174
|
+
const primary = candidates.find(c => c.type !== 'lmstudio') || candidates[0];
|
|
145
175
|
console.log(chalk.yellow(` ⚠️ Ollama is reachable but model "${primary?.model || 'gemma4:e4b'}" was not found on any live host`));
|
|
146
176
|
console.log(chalk.dim(` Run: ollama pull ${primary?.model || 'gemma4:e4b'}`));
|
|
147
177
|
}
|
|
@@ -480,9 +510,9 @@ program
|
|
|
480
510
|
if (opts.extract !== false) {
|
|
481
511
|
const ollamaAvailable = await checkOllamaAvailability(config);
|
|
482
512
|
if (!ollamaAvailable) {
|
|
483
|
-
console.log(chalk.yellow('\n ⚠️ No AI extraction available (Ollama unreachable, no API keys configured)'));
|
|
513
|
+
console.log(chalk.yellow('\n ⚠️ No AI extraction available (Ollama/LM Studio unreachable, no API keys configured)'));
|
|
484
514
|
console.log(chalk.white(' → Switching to ') + chalk.bold.green('crawl-only mode') + chalk.white(' — raw data will be collected without AI extraction'));
|
|
485
|
-
console.log(chalk.dim(' Tip: Install Ollama (ollama.com)
|
|
515
|
+
console.log(chalk.dim(' Tip: Install Ollama (ollama.com) or LM Studio (lmstudio.ai) to enable local AI extraction\n'));
|
|
486
516
|
opts.extract = false;
|
|
487
517
|
}
|
|
488
518
|
}
|
|
@@ -538,6 +568,10 @@ program
|
|
|
538
568
|
stealth: !!opts.stealth,
|
|
539
569
|
tiered: opts.tiered !== false,
|
|
540
570
|
strictHost: !!opts.domain, // BUG-006: enforce exact hostname when --domain is set
|
|
571
|
+
onSitemapDiscovered: (urls) => {
|
|
572
|
+
try { upsertSitemapUrls(db, domainId, urls.map(u => u.url), `${site.url}/sitemap.xml`); }
|
|
573
|
+
catch (e) { console.warn(`[sitemap] inventory save failed: ${e.message}`); }
|
|
574
|
+
},
|
|
541
575
|
};
|
|
542
576
|
|
|
543
577
|
for await (const page of crawlDomain(site.url, crawlOpts)) {
|
|
@@ -568,6 +602,9 @@ program
|
|
|
568
602
|
title: page.title || null,
|
|
569
603
|
metaDesc: page.metaDesc || null,
|
|
570
604
|
bodyText: page.fullBodyText || page.bodyText || null,
|
|
605
|
+
finalUrl: page.finalUrl || null,
|
|
606
|
+
redirectChain: page.redirectChain || null,
|
|
607
|
+
xRobotsTag: page.xRobotsTag || null,
|
|
571
608
|
});
|
|
572
609
|
const pageId = pageRes?.id;
|
|
573
610
|
|
|
@@ -1101,7 +1138,7 @@ function getOpenClawToken() {
|
|
|
1101
1138
|
return null;
|
|
1102
1139
|
}
|
|
1103
1140
|
|
|
1104
|
-
async function callOpenClaw(prompt, model = '
|
|
1141
|
+
async function callOpenClaw(prompt, model = 'openclaw') {
|
|
1105
1142
|
const token = getOpenClawToken();
|
|
1106
1143
|
if (!token) throw new Error('OpenClaw token not found');
|
|
1107
1144
|
|
|
@@ -1109,6 +1146,9 @@ async function callOpenClaw(prompt, model = 'default') {
|
|
|
1109
1146
|
const controller = new AbortController();
|
|
1110
1147
|
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
|
1111
1148
|
|
|
1149
|
+
// OpenClaw gateway expects 'openclaw' or 'openclaw/<agentId>'
|
|
1150
|
+
const clawModel = (!model || model === 'default') ? 'openclaw' : model;
|
|
1151
|
+
|
|
1112
1152
|
try {
|
|
1113
1153
|
const res = await fetch('http://127.0.0.1:18789/v1/chat/completions', {
|
|
1114
1154
|
method: 'POST',
|
|
@@ -1118,7 +1158,7 @@ async function callOpenClaw(prompt, model = 'default') {
|
|
|
1118
1158
|
'Content-Type': 'application/json',
|
|
1119
1159
|
},
|
|
1120
1160
|
body: JSON.stringify({
|
|
1121
|
-
model:
|
|
1161
|
+
model: clawModel,
|
|
1122
1162
|
messages: [{ role: 'user', content: prompt }],
|
|
1123
1163
|
temperature: 0.2,
|
|
1124
1164
|
max_tokens: 4000,
|
|
@@ -1138,15 +1178,18 @@ async function callAnalysisModel(prompt, model = 'gemini') {
|
|
|
1138
1178
|
const requestedModel = String(model || 'gemini').trim();
|
|
1139
1179
|
const normalizedModel = requestedModel.toLowerCase();
|
|
1140
1180
|
|
|
1181
|
+
// Non-Gemini model: try OpenClaw first, then fall back to Gemini CLI
|
|
1141
1182
|
if (normalizedModel !== 'gemini') {
|
|
1142
1183
|
try {
|
|
1143
1184
|
return await callOpenClaw(prompt, requestedModel);
|
|
1144
1185
|
} catch (err) {
|
|
1145
|
-
console.
|
|
1146
|
-
|
|
1186
|
+
console.warn(chalk.dim(` [openclaw] ${err.message}`));
|
|
1187
|
+
console.log(chalk.yellow(` Falling back to Gemini CLI...\n`));
|
|
1188
|
+
// Fall through to Gemini CLI below
|
|
1147
1189
|
}
|
|
1148
1190
|
}
|
|
1149
1191
|
|
|
1192
|
+
// Try Gemini CLI
|
|
1150
1193
|
const timeoutMs = parseInt(process.env.GEMINI_TIMEOUT_MS || '120000', 10);
|
|
1151
1194
|
try {
|
|
1152
1195
|
const result = spawnSync('gemini', ['-p', '-'], {
|
|
@@ -1163,7 +1206,17 @@ async function callAnalysisModel(prompt, model = 'gemini') {
|
|
|
1163
1206
|
|
|
1164
1207
|
return result.stdout;
|
|
1165
1208
|
} catch (err) {
|
|
1166
|
-
|
|
1209
|
+
// Gemini CLI failed — try OpenClaw as last resort (if we haven't already)
|
|
1210
|
+
const fallbackModel = process.env.OPENCLAW_ANALYSIS_MODEL || 'openclaw';
|
|
1211
|
+
if (normalizedModel !== 'gemini') {
|
|
1212
|
+
// Already tried OpenClaw above, show combined error
|
|
1213
|
+
const geminiMsg = err.message || '';
|
|
1214
|
+
console.error(chalk.red('\n ✗ Analysis failed — no model available\n'));
|
|
1215
|
+
console.error(chalk.dim(` Gemini: ${geminiMsg}`));
|
|
1216
|
+
console.error(chalk.dim(` OpenClaw: already tried (${requestedModel})`));
|
|
1217
|
+
console.error(chalk.dim('\n Docs: https://ukkometa.fi/en/seo-intel/setup/\n'));
|
|
1218
|
+
return null;
|
|
1219
|
+
}
|
|
1167
1220
|
try {
|
|
1168
1221
|
console.warn(`[gemini] ${err.message}`);
|
|
1169
1222
|
console.log(chalk.yellow(`Gemini CLI unavailable, retrying via OpenClaw (${fallbackModel})...\n`));
|
|
@@ -1269,7 +1322,12 @@ program
|
|
|
1269
1322
|
let pageCount = 0;
|
|
1270
1323
|
let skipped = 0;
|
|
1271
1324
|
let blocked = false;
|
|
1272
|
-
for await (const page of crawlDomain(next.url
|
|
1325
|
+
for await (const page of crawlDomain(next.url, {
|
|
1326
|
+
onSitemapDiscovered: (urls) => {
|
|
1327
|
+
try { upsertSitemapUrls(db, domainId, urls.map(u => u.url), `${next.url}/sitemap.xml`); }
|
|
1328
|
+
catch (e) { console.warn(`[sitemap] inventory save failed: ${e.message}`); }
|
|
1329
|
+
},
|
|
1330
|
+
})) {
|
|
1273
1331
|
// ── Handle blocked pages from backoff system ──
|
|
1274
1332
|
if (page._blocked) {
|
|
1275
1333
|
blocked = true;
|
|
@@ -1291,6 +1349,9 @@ program
|
|
|
1291
1349
|
title: page.title || null,
|
|
1292
1350
|
metaDesc: page.metaDesc || null,
|
|
1293
1351
|
bodyText: page.fullBodyText || page.bodyText || null,
|
|
1352
|
+
finalUrl: page.finalUrl || null,
|
|
1353
|
+
redirectChain: page.redirectChain || null,
|
|
1354
|
+
xRobotsTag: page.xRobotsTag || null,
|
|
1294
1355
|
});
|
|
1295
1356
|
const pageId = pageRes?.id;
|
|
1296
1357
|
|
|
@@ -2425,6 +2486,73 @@ program
|
|
|
2425
2486
|
console.log(chalk.gray(' Feed this to Gemini: "Find the gaps in each heading structure above."\n'));
|
|
2426
2487
|
});
|
|
2427
2488
|
|
|
2489
|
+
// ── TECHNICAL AUDIT (extended-data) ───────────────────────────────────────
// `tech-audit <project>` runs runTechnicalAudit() once per domain and prints
// either one JSON document (--format json) or a per-domain brief summary.
program
  .command('tech-audit <project>')
  .description('Technical SEO audit from crawled data (titles, meta, noindex, redirects, sitemap diff)')
  .option('--domain <domain>', 'Audit a single domain (defaults to all target domains)')
  .option('--head', 'Also run HEAD checks against sitemap URLs (network-heavy)')
  .option('--concurrency <n>', 'Parallel HEAD requests when --head is set', '6')
  .option('--format <type>', 'Output format: brief or json', 'brief')
  .action(async (project, opts) => {
    const { runTechnicalAudit } = await import('./analysis/technical-audit.js');
    const isJson = opts.format === 'json';
    const db = getDb();

    // Explicit radix, and fall back to the default for NaN, zero or negative
    // values so a bad --concurrency flag can never request an empty worker pool.
    const parsedConcurrency = Number.parseInt(opts.concurrency, 10);
    const sitemapConcurrency = parsedConcurrency > 0 ? parsedConcurrency : 6;

    // --domain audits exactly that domain; otherwise every target/owned domain
    // registered for the project is audited.
    const domainRows = opts.domain
      ? [{ domain: opts.domain }]
      : db.prepare("SELECT domain FROM domains WHERE project = ? AND role IN ('target','owned')").all(project);

    if (!domainRows.length) {
      if (isJson) console.log(JSON.stringify({ command: 'tech-audit', project, error: 'no target domains', domains: [] }));
      else console.log(chalk.yellow(`No target domains found for project ${project}.`));
      return;
    }

    // Domains are audited one at a time; each result is tagged with its domain.
    const results = [];
    for (const { domain } of domainRows) {
      const res = await runTechnicalAudit(db, {
        project,
        domain,
        runSitemapHead: !!opts.head,
        sitemapConcurrency,
      });
      results.push({ domain, ...res });
    }

    if (isJson) {
      console.log(JSON.stringify({ command: 'tech-audit', project, timestamp: new Date().toISOString(), domains: results }));
      return;
    }

    // Brief output: errors first, then warnings, then info; unknown severities last.
    const order = { error: 0, warn: 1, info: 2 };
    const maxShown = 40;

    for (const r of results) {
      console.log(chalk.bold.cyan(`\n🔧 Technical audit — ${r.domain}`));
      if (r.gated) {
        console.log(chalk.gray(' (extended-data gate closed — upgrade to unlock technical audits)'));
        continue;
      }
      if (r.error) { console.log(chalk.red(` ✗ ${r.error}`)); continue; }

      // NOTE(review): runTechnicalAudit's return shape is not visible from this
      // file; these defaults only stop the renderer from throwing if a field is
      // missing — confirm against analysis/technical-audit.js.
      const stats = r.stats ?? {};
      const findings = Array.isArray(r.findings) ? r.findings : [];
      const sev = stats.findings_by_severity || {};
      console.log(chalk.gray(` ${stats.pages} pages · ${stats.sitemap_urls} sitemap URLs · ${stats.findings_total} findings`));
      console.log(chalk.gray(` ${chalk.red(sev.error || 0)} errors · ${chalk.yellow(sev.warn || 0)} warnings · ${chalk.blue(sev.info || 0)} info`));
      if (stats.sitemap_head) {
        const sh = stats.sitemap_head;
        console.log(chalk.gray(` sitemap HEAD — ${sh.ok} ok · ${sh.redirected} 3xx · ${sh.broken} 4xx/5xx · ${sh.errored} errors`));
      }

      // Sort a copy so the audit result itself is left untouched.
      const sorted = [...findings].sort((a, b) => (order[a.severity] ?? 3) - (order[b.severity] ?? 3));
      for (const f of sorted.slice(0, maxShown)) {
        const icon = f.severity === 'error' ? chalk.red('✗') : f.severity === 'warn' ? chalk.yellow('⚠') : chalk.blue('ℹ');
        // Strip scheme + host so only the path is shown next to the finding type.
        const target = f.url ? f.url.replace(/https?:\/\/[^/]+/, '') : '';
        console.log(` ${icon} ${chalk.bold(f.type)} ${chalk.gray(target)} — ${f.details}`);
      }
      if (sorted.length > maxShown) console.log(chalk.gray(` … +${sorted.length - maxShown} more`));
    }
  });
|
|
2555
|
+
|
|
2428
2556
|
// ── ORPHAN ENTITIES ───────────────────────────────────────────────────────
|
|
2429
2557
|
program
|
|
2430
2558
|
.command('orphans <project>')
|
|
@@ -4780,15 +4908,47 @@ program
|
|
|
4780
4908
|
.option('--pages <n>', 'Max pages to crawl', '100')
|
|
4781
4909
|
.option('--no-ai', 'Skip AI-enriched export (deterministic only)')
|
|
4782
4910
|
.option('--model <name>', 'Model for analysis + AI export (gemini, claude, gpt)', 'gemini')
|
|
4783
|
-
.option('--
|
|
4911
|
+
.option('--stealth', 'Enable stealth browser mode (Playwright) for JS-heavy sites')
|
|
4784
4912
|
.action(async (domainInput, opts) => {
|
|
4785
4913
|
if (!requirePro('scan')) return;
|
|
4786
4914
|
|
|
4787
4915
|
// ── Parse domain ──
|
|
4788
|
-
const
|
|
4789
|
-
const projectSlug = '_scan-' +
|
|
4790
|
-
|
|
4791
|
-
|
|
4916
|
+
const domainRaw = domainInput.replace(/^https?:\/\//, '').replace(/\/.*$/, '').replace(/^www\./, '');
|
|
4917
|
+
const projectSlug = '_scan-' + domainRaw.replace(/[^a-z0-9]/gi, '-').toLowerCase();
|
|
4918
|
+
|
|
4919
|
+
// Resolve the actual reachable URL (handles www redirects and bare-domain failures)
|
|
4920
|
+
let domain = domainRaw;
|
|
4921
|
+
let siteUrl = defaultSiteUrl(domain);
|
|
4922
|
+
let wwwRedirectMissing = false;
|
|
4923
|
+
try {
|
|
4924
|
+
const controller = new AbortController();
|
|
4925
|
+
const timer = setTimeout(() => controller.abort(), 8000);
|
|
4926
|
+
const probe = await fetch(siteUrl, { method: 'HEAD', redirect: 'follow', signal: controller.signal });
|
|
4927
|
+
clearTimeout(timer);
|
|
4928
|
+
const finalUrl = new URL(probe.url);
|
|
4929
|
+
if (finalUrl.hostname !== domain) {
|
|
4930
|
+
console.log(chalk.dim(` Resolved: ${domain} → ${finalUrl.hostname}`));
|
|
4931
|
+
domain = finalUrl.hostname.replace(/^www\./, '') === domainRaw ? domainRaw : finalUrl.hostname;
|
|
4932
|
+
siteUrl = finalUrl.origin;
|
|
4933
|
+
}
|
|
4934
|
+
} catch {
|
|
4935
|
+
// Bare domain unreachable — try www variant
|
|
4936
|
+
const wwwUrl = `https://www.${domainRaw}`;
|
|
4937
|
+
try {
|
|
4938
|
+
const controller = new AbortController();
|
|
4939
|
+
const timer = setTimeout(() => controller.abort(), 8000);
|
|
4940
|
+
const probe = await fetch(wwwUrl, { method: 'HEAD', redirect: 'follow', signal: controller.signal });
|
|
4941
|
+
clearTimeout(timer);
|
|
4942
|
+
if (probe.ok || probe.status < 400) {
|
|
4943
|
+
console.log(chalk.dim(` ${domainRaw} unreachable, using www.${domainRaw}`));
|
|
4944
|
+
console.log(chalk.yellow(` ⚠ Missing redirect: ${domainRaw} should 301 to www.${domainRaw}`));
|
|
4945
|
+
siteUrl = wwwUrl;
|
|
4946
|
+
domain = `www.${domainRaw}`;
|
|
4947
|
+
wwwRedirectMissing = true;
|
|
4948
|
+
}
|
|
4949
|
+
} catch { /* www also unreachable — proceed with original, crawler will report error */ }
|
|
4950
|
+
}
|
|
4951
|
+
const useStealth = opts.stealth === true;
|
|
4792
4952
|
const useAi = opts.ai !== false;
|
|
4793
4953
|
const maxPages = Math.min(parseInt(opts.pages) || 100, capPages(9999));
|
|
4794
4954
|
|
|
@@ -4828,7 +4988,7 @@ program
|
|
|
4828
4988
|
let doExtract = true;
|
|
4829
4989
|
const ollamaAvailable = await checkOllamaAvailability(config);
|
|
4830
4990
|
if (!ollamaAvailable) {
|
|
4831
|
-
console.log(chalk.yellow(' ⚠ No AI extraction available (Ollama unreachable)'));
|
|
4991
|
+
console.log(chalk.yellow(' ⚠ No AI extraction available (Ollama/LM Studio unreachable)'));
|
|
4832
4992
|
console.log(chalk.gray(' → Crawl-only mode — body text still captured for analysis'));
|
|
4833
4993
|
console.log('');
|
|
4834
4994
|
doExtract = false;
|
|
@@ -4840,44 +5000,66 @@ program
|
|
|
4840
5000
|
let pageCount = 0, extracted = 0, failed = 0;
|
|
4841
5001
|
const tag = chalk.cyan(`[${domain.split('.')[0]}]`);
|
|
4842
5002
|
|
|
4843
|
-
|
|
4844
|
-
|
|
4845
|
-
|
|
4846
|
-
|
|
4847
|
-
|
|
5003
|
+
try {
|
|
5004
|
+
for await (const page of crawlDomain(siteUrl, {
|
|
5005
|
+
maxPages, stealth: useStealth, tiered: true,
|
|
5006
|
+
onSitemapDiscovered: (urls) => {
|
|
5007
|
+
try { upsertSitemapUrls(db, domainId, urls.map(u => u.url), `${siteUrl}/sitemap.xml`); }
|
|
5008
|
+
catch (e) { console.warn(`[sitemap] inventory save failed: ${e.message}`); }
|
|
5009
|
+
},
|
|
5010
|
+
})) {
|
|
5011
|
+
if (page._blocked) {
|
|
5012
|
+
console.log(chalk.bold.red(` ${tag} ⛔ BLOCKED: ${page._blockReason}`));
|
|
5013
|
+
break;
|
|
5014
|
+
}
|
|
4848
5015
|
|
|
4849
|
-
|
|
4850
|
-
|
|
4851
|
-
|
|
4852
|
-
|
|
4853
|
-
|
|
4854
|
-
|
|
4855
|
-
|
|
4856
|
-
|
|
4857
|
-
|
|
5016
|
+
try {
|
|
5017
|
+
const pageRes = upsertPage(db, {
|
|
5018
|
+
domainId, url: page.url, statusCode: page.status,
|
|
5019
|
+
wordCount: page.wordCount, loadMs: page.loadMs,
|
|
5020
|
+
isIndexable: page.isIndexable, clickDepth: page.depth ?? 0,
|
|
5021
|
+
publishedDate: page.publishedDate || null, modifiedDate: page.modifiedDate || null,
|
|
5022
|
+
contentHash: page.contentHash || null, title: page.title || null,
|
|
5023
|
+
metaDesc: page.metaDesc || null, bodyText: page.fullBodyText || page.bodyText || null,
|
|
5024
|
+
finalUrl: page.finalUrl || null, redirectChain: page.redirectChain || null, xRobotsTag: page.xRobotsTag || null,
|
|
5025
|
+
});
|
|
5026
|
+
const pageId = pageRes?.id;
|
|
4858
5027
|
|
|
4859
|
-
|
|
4860
|
-
|
|
4861
|
-
|
|
4862
|
-
|
|
5028
|
+
upsertTechnical(db, { pageId, hasCanonical: page.hasCanonical, hasOgTags: page.hasOgTags, hasSchema: page.hasSchema, hasRobots: page.hasRobots });
|
|
5029
|
+
insertHeadings(db, pageId, page.headings);
|
|
5030
|
+
insertLinks(db, pageId, page.links);
|
|
5031
|
+
if (page.parsedSchemas?.length) insertPageSchemas(db, pageId, page.parsedSchemas);
|
|
4863
5032
|
|
|
4864
|
-
|
|
4865
|
-
|
|
4866
|
-
|
|
4867
|
-
|
|
4868
|
-
|
|
4869
|
-
|
|
4870
|
-
|
|
4871
|
-
|
|
4872
|
-
|
|
4873
|
-
|
|
4874
|
-
|
|
5033
|
+
if (doExtract) {
|
|
5034
|
+
process.stdout.write(chalk.gray(` ${tag} [${pageCount + 1}] d${page.depth ?? 0} ${page.url.slice(0, 60)} → extracting...`));
|
|
5035
|
+
try {
|
|
5036
|
+
const extractFn = await getExtractPage();
|
|
5037
|
+
const extraction = await extractFn(page);
|
|
5038
|
+
insertExtraction(db, { pageId, data: extraction });
|
|
5039
|
+
insertKeywords(db, pageId, extraction.keywords);
|
|
5040
|
+
process.stdout.write(chalk.green(` ✓\n`));
|
|
5041
|
+
extracted++;
|
|
5042
|
+
} catch (err) {
|
|
5043
|
+
process.stdout.write(chalk.red(` ✗ ${err.message}\n`));
|
|
5044
|
+
failed++;
|
|
5045
|
+
}
|
|
5046
|
+
} else {
|
|
5047
|
+
process.stdout.write(chalk.gray(` ${tag} [${pageCount + 1}] d${page.depth ?? 0} ${page.url.slice(0, 65)} ✓\n`));
|
|
5048
|
+
}
|
|
5049
|
+
pageCount++;
|
|
5050
|
+
} catch (pageErr) {
|
|
5051
|
+
console.log(chalk.yellow(` ${tag} ⚠ Skipped ${page.url?.slice(0, 60) || 'unknown'}: ${pageErr.message}`));
|
|
4875
5052
|
failed++;
|
|
4876
5053
|
}
|
|
4877
|
-
} else {
|
|
4878
|
-
process.stdout.write(chalk.gray(` ${tag} [${pageCount + 1}] d${page.depth ?? 0} ${page.url.slice(0, 65)} ✓\n`));
|
|
4879
5054
|
}
|
|
4880
|
-
|
|
5055
|
+
} catch (crawlErr) {
|
|
5056
|
+
console.log(chalk.yellow(`\n ⚠ Crawl stopped early: ${crawlErr.message}`));
|
|
5057
|
+
if (pageCount === 0) {
|
|
5058
|
+
console.log(chalk.red(` ✗ Could not reach ${domain} — check the URL and try again.\n`));
|
|
5059
|
+
try { unlinkSync(configPath); } catch { /* fine */ }
|
|
5060
|
+
return;
|
|
5061
|
+
}
|
|
5062
|
+
console.log(chalk.dim(` → Continuing with ${pageCount} pages already captured...\n`));
|
|
4881
5063
|
}
|
|
4882
5064
|
|
|
4883
5065
|
const crawlSec = ((Date.now() - scanStart) / 1000).toFixed(1);
|
|
@@ -4959,7 +5141,7 @@ program
|
|
|
4959
5141
|
|
|
4960
5142
|
// Inline the deterministic markdown builder from server.js
|
|
4961
5143
|
const { buildScanMarkdown } = await import('./lib/scan-export.js');
|
|
4962
|
-
let md = buildScanMarkdown(dash, projectSlug, domain);
|
|
5144
|
+
let md = buildScanMarkdown(dash, projectSlug, domain, { wwwRedirectMissing, bareDomain: domainRaw });
|
|
4963
5145
|
|
|
4964
5146
|
// AI enrichment
|
|
4965
5147
|
if (useAi) {
|
package/crawler/index.js
CHANGED
|
@@ -263,6 +263,10 @@ export async function* crawlDomain(startUrl, opts = {}) {
|
|
|
263
263
|
// ── Sitemap-first: seed queue from sitemap.xml (section-aware) ──
|
|
264
264
|
try {
|
|
265
265
|
const sitemapUrls = await fetchSitemap(startUrl);
|
|
266
|
+
// Report full sitemap inventory to caller (for DB persistence / audit diff)
|
|
267
|
+
if (sitemapUrls.length > 0 && typeof opts.onSitemapDiscovered === 'function') {
|
|
268
|
+
try { await opts.onSitemapDiscovered(sitemapUrls); } catch { /* ignore */ }
|
|
269
|
+
}
|
|
266
270
|
if (sitemapUrls.length > 0) {
|
|
267
271
|
// Apply section budgets if tiered crawling is enabled
|
|
268
272
|
const budgeted = tiered ? applySectionBudgets(sitemapUrls, maxPages) : sitemapUrls;
|
|
@@ -452,9 +456,36 @@ async function processPage(page, url, base, depth, queue, maxDepth) {
|
|
|
452
456
|
status = res?.status() || 0;
|
|
453
457
|
const loadMs = Date.now() - t0;
|
|
454
458
|
|
|
459
|
+
// ── Final URL after redirects ──
|
|
460
|
+
let finalUrl = null;
|
|
461
|
+
try { finalUrl = page.url() || null; } catch { /* ignore */ }
|
|
462
|
+
|
|
463
|
+
// ── Redirect chain (walk request.redirectedFrom() backwards) ──
|
|
464
|
+
const redirectChain = [];
|
|
465
|
+
try {
|
|
466
|
+
let req = res?.request();
|
|
467
|
+
const chain = [];
|
|
468
|
+
while (req) {
|
|
469
|
+
const prev = req.redirectedFrom?.();
|
|
470
|
+
if (!prev) break;
|
|
471
|
+
const prevRes = await prev.response().catch(() => null);
|
|
472
|
+
chain.push({ url: prev.url(), status: prevRes?.status() ?? null });
|
|
473
|
+
req = prev;
|
|
474
|
+
}
|
|
475
|
+
// chain is in reverse order (closest redirect first); reverse for chronological
|
|
476
|
+
redirectChain.push(...chain.reverse());
|
|
477
|
+
} catch { /* ignore */ }
|
|
478
|
+
|
|
479
|
+
// ── X-Robots-Tag header ──
|
|
480
|
+
let xRobotsTag = null;
|
|
481
|
+
try {
|
|
482
|
+
const headers = res?.headers?.() || {};
|
|
483
|
+
xRobotsTag = headers['x-robots-tag'] || null;
|
|
484
|
+
} catch { /* ignore */ }
|
|
485
|
+
|
|
455
486
|
// ── Return status for backoff logic (don't silently drop 4xx) ──
|
|
456
487
|
if (status === 429 || status === 503 || status === 403) {
|
|
457
|
-
return { url, depth, status, loadMs, wordCount: 0, isIndexable: false, title: '', metaDesc: '', headings: [], links: [], bodyText: '', schemaTypes: [], vitals: {}, publishedDate: null, modifiedDate: null, contentHash: null };
|
|
488
|
+
return { url, depth, status, loadMs, wordCount: 0, isIndexable: false, title: '', metaDesc: '', headings: [], links: [], bodyText: '', schemaTypes: [], vitals: {}, publishedDate: null, modifiedDate: null, contentHash: null, finalUrl, redirectChain, xRobotsTag };
|
|
458
489
|
}
|
|
459
490
|
if (status >= 400) return null;
|
|
460
491
|
|
|
@@ -507,7 +538,9 @@ async function processPage(page, url, base, depth, queue, maxDepth) {
|
|
|
507
538
|
const wordCount = await page.$eval('body', el => el.innerText.split(/\s+/).filter(Boolean).length).catch(() => 0);
|
|
508
539
|
|
|
509
540
|
const robotsMeta = await page.$eval('meta[name="robots"]', el => el.content).catch(() => '');
|
|
510
|
-
const
|
|
541
|
+
const metaNoindex = robotsMeta.toLowerCase().includes('noindex');
|
|
542
|
+
const headerNoindex = (xRobotsTag || '').toLowerCase().includes('noindex');
|
|
543
|
+
const isIndexable = !(metaNoindex || headerNoindex);
|
|
511
544
|
const hasCanonical = await page.$('link[rel="canonical"]').then(el => !!el).catch(() => false);
|
|
512
545
|
const hasOgTags = await page.$('meta[property^="og:"]').then(el => !!el).catch(() => false);
|
|
513
546
|
|
|
@@ -576,6 +609,7 @@ async function processPage(page, url, base, depth, queue, maxDepth) {
|
|
|
576
609
|
hasCanonical, hasOgTags,
|
|
577
610
|
hasRobots: !!robotsMeta,
|
|
578
611
|
hasSchema: schemaTypes.length > 0,
|
|
612
|
+
finalUrl, redirectChain, xRobotsTag,
|
|
579
613
|
};
|
|
580
614
|
}
|
|
581
615
|
|
package/crawler/sitemap.js
CHANGED
|
@@ -101,3 +101,47 @@ function extractTagContent(xml, tagName) {
|
|
|
101
101
|
}
|
|
102
102
|
return results;
|
|
103
103
|
}
|
|
104
|
+
|
|
105
|
+
/**
 * HEAD-check a single URL without following redirects.
 *
 * @param {string} url - URL to probe.
 * @param {{ timeoutMs?: number }} [options] - `timeoutMs`: abort the request
 *   after this many milliseconds (default 8000).
 * @returns {Promise<{ status: number, location: string|null, error?: string }>}
 *   On a response: its status code and the Location header (null when absent).
 *   On any failure: `status: 0`, `location: null` and an `error` message.
 *   Never throws.
 */
export async function headCheck(url, { timeoutMs = 8000 } = {}) {
  const ctrl = new AbortController();
  const timer = setTimeout(() => ctrl.abort(), timeoutMs);
  try {
    const res = await fetch(url, {
      method: 'HEAD',
      // 'manual' keeps the 3XX response itself so the caller sees the redirect.
      redirect: 'manual',
      signal: ctrl.signal,
      headers: { 'User-Agent': 'SEOIntelBot/1.0' },
    });
    return {
      status: res.status,
      location: res.headers.get('location') || null,
    };
  } catch (err) {
    let error;
    if (err?.name === 'AbortError') {
      // Our own timer fired: say so instead of a generic "operation was aborted".
      error = `timed out after ${timeoutMs}ms`;
    } else {
      // Node's fetch reports network failures as "fetch failed" and carries the
      // real reason (DNS, refused connection, bad URL) on err.cause.
      const base = err?.message ?? String(err);
      const reason = err?.cause?.message;
      error = reason && reason !== base ? `${base}: ${reason}` : base;
    }
    // Same keys as the success path so callers can read `location` unconditionally.
    return { status: 0, location: null, error };
  } finally {
    clearTimeout(timer);
  }
}
|
|
128
|
+
|
|
129
|
+
/**
 * Run HEAD checks against an array of sitemap URL rows in parallel (capped).
 *
 * @param {Array<{ id: *, url: string }>} rows - Rows to check; the input is
 *   not mutated. Null/empty input is a no-op.
 * @param {object} [options]
 * @param {number} [options.concurrency=6] - Maximum checks in flight. Values
 *   below 1 (or non-numeric) are treated as 1 so the work is never skipped.
 * @param {(row: object, result: object) => (void|Promise<void>)} [options.onResult]
 *   Invoked once per checked row with the headCheck() result. Errors thrown by
 *   the callback are ignored so one bad handler cannot stop the run.
 * @param {number} [options.timeoutMs] - Per-request timeout forwarded to
 *   headCheck(); when omitted, headCheck's own default applies.
 * @returns {Promise<void>} Resolves when every row has been processed.
 */
export async function headCheckAll(rows, { concurrency = 6, onResult, timeoutMs } = {}) {
  // Snapshot so later mutation of the caller's array cannot affect the run.
  const pending = [...(rows ?? [])];
  if (pending.length === 0) return;

  // Clamp to at least one worker: 0, negatives and NaN previously produced an
  // empty worker pool and the function returned without checking anything.
  const requested = Math.floor(Number(concurrency));
  const poolSize = Math.min(requested >= 1 ? requested : 1, pending.length);

  // Shared cursor instead of Array#shift (which is O(n) per call). Safe without
  // locking: the read-and-increment happens synchronously between awaits.
  let cursor = 0;
  const worker = async () => {
    while (cursor < pending.length) {
      const row = pending[cursor++];
      // Skip holes/falsy entries rather than abandoning the rest of the queue.
      if (!row) continue;
      const result = await headCheck(row.url, { timeoutMs });
      if (onResult) {
        try { await onResult(row, result); } catch { /* best-effort: callback errors are ignored */ }
      }
    }
  };

  await Promise.all(Array.from({ length: poolSize }, () => worker()));
}
|