seo-intel 1.5.2 → 1.5.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cli.js CHANGED
@@ -39,6 +39,7 @@ import {
39
39
  getCompetitorSummary, getKeywordMatrix, getHeadingStructure,
40
40
  getPageHash, getSchemasByProject,
41
41
  upsertInsightsFromAnalysis, upsertInsightsFromKeywords,
42
+ upsertSitemapUrls,
42
43
  } from './db/db.js';
43
44
  import { generateMultiDashboard } from './reports/generate-html.js';
44
45
  import { buildTechnicalActions } from './exports/technical.js';
@@ -73,13 +74,13 @@ function resolveExtractionRuntime(config) {
73
74
  const norm = h => String(h || '').trim().replace(/\/+$/, '');
74
75
 
75
76
  const candidates = [
76
- { host: norm(primaryUrl), model: String(primaryModel).trim() || 'gemma4:e4b' },
77
+ { host: norm(primaryUrl), model: String(primaryModel).trim() || 'gemma4:e4b', type: 'ollama' },
77
78
  ];
78
79
 
79
80
  // Legacy single fallback — always use project-selected model, not OLLAMA_FALLBACK_MODEL
80
81
  const fallbackUrl = norm(process.env.OLLAMA_FALLBACK_URL || '');
81
82
  if (fallbackUrl && !candidates.some(c => c.host === fallbackUrl)) {
82
- candidates.push({ host: fallbackUrl, model: String(primaryModel).trim() || 'gemma4:e4b' });
83
+ candidates.push({ host: fallbackUrl, model: String(primaryModel).trim() || 'gemma4:e4b', type: 'ollama' });
83
84
  }
84
85
 
85
86
  // OLLAMA_HOSTS — comma-separated LAN hosts from setup wizard
@@ -87,13 +88,20 @@ function resolveExtractionRuntime(config) {
87
88
  for (const h of process.env.OLLAMA_HOSTS.split(',')) {
88
89
  const host = norm(h);
89
90
  if (host && !candidates.some(c => c.host === host)) {
90
- candidates.push({ host, model: String(primaryModel).trim() || 'gemma4:e4b' });
91
+ candidates.push({ host, model: String(primaryModel).trim() || 'gemma4:e4b', type: 'ollama' });
91
92
  }
92
93
  }
93
94
  }
94
95
 
95
96
  if (!candidates.some(candidate => candidate.host === localhost)) {
96
- candidates.push({ host: localhost, model: String(primaryModel).trim() || 'gemma4:e4b' });
97
+ candidates.push({ host: localhost, model: String(primaryModel).trim() || 'gemma4:e4b', type: 'ollama' });
98
+ }
99
+
100
+ // LM Studio — always probe default port; env vars override URL/model
101
+ const lmStudioUrl = norm(process.env.LMSTUDIO_URL || '') || 'http://localhost:1234';
102
+ const lmStudioModel = String(process.env.LMSTUDIO_MODEL || '').trim();
103
+ if (!candidates.some(c => c.host === lmStudioUrl)) {
104
+ candidates.push({ host: lmStudioUrl, model: lmStudioModel, type: 'lmstudio' });
97
105
  }
98
106
 
99
107
  const seen = new Set();
@@ -115,33 +123,55 @@ function applyExtractionRuntimeConfig(config) {
115
123
  // ── AI AVAILABILITY PREFLIGHT ────────────────────────────────────────────
116
124
  /**
117
125
  * Check if any AI extraction backend is reachable.
118
- * Tries: primary Ollama → fallback Ollama → returns false.
126
+ * Tries: primary Ollama → fallback Ollama → LM Studio → returns false.
119
127
  * Fast: 2s timeout per host, runs sequentially.
120
128
  */
121
129
  async function checkOllamaAvailability(config) {
122
130
  const candidates = resolveExtractionRuntime(config);
123
- let sawReachableHost = false;
131
+ let sawOllamaHostNoModel = false;
124
132
 
125
133
  for (const candidate of candidates) {
126
134
  try {
127
135
  const controller = new AbortController();
128
136
  const timeout = setTimeout(() => controller.abort(), 2000);
129
- const res = await fetch(`${candidate.host}/api/tags`, { signal: controller.signal });
130
- clearTimeout(timeout);
131
- if (res.ok) {
132
- const data = await res.json();
133
- const models = (data.models || []).map(m => m.name);
134
- sawReachableHost = true;
135
- const hasModel = models.some(m => m && m.split(':')[0] === candidate.model.split(':')[0]);
136
- if (hasModel) {
137
- return true; // Ollama reachable + model available
137
+
138
+ if (candidate.type === 'lmstudio') {
139
+ // LM Studio: GET /api/v1/models
140
+ const res = await fetch(`${candidate.host}/api/v1/models`, { signal: controller.signal });
141
+ clearTimeout(timeout);
142
+ if (res.ok) {
143
+ const data = await res.json().catch(() => ({ data: [] }));
144
+ const models = (data.data || []).map(m => m.id || m.model).filter(Boolean);
145
+ // Accept when no specific model was requested or the requested one is listed (NOTE(review): with no model requested this also passes when the list is empty)
146
+ if (!candidate.model || models.some(id => id === candidate.model || id.endsWith('/' + candidate.model))) {
147
+ console.log(chalk.dim(` LM Studio: ${candidate.host} ✓ (${models[0] || 'model loaded'})`));
148
+ return true;
149
+ }
150
+ if (models.length > 0) {
151
+ // Model mismatch but something is loaded — still usable
152
+ console.log(chalk.dim(` LM Studio: ${candidate.host} ✓ (using ${models[0]})`));
153
+ return true;
154
+ }
155
+ console.log(chalk.yellow(` ⚠️ LM Studio reachable but no models loaded`));
156
+ console.log(chalk.dim(` Load a model in LM Studio to enable extraction`));
157
+ }
158
+ } else {
159
+ // Ollama
160
+ const res = await fetch(`${candidate.host}/api/tags`, { signal: controller.signal });
161
+ clearTimeout(timeout);
162
+ if (res.ok) {
163
+ const data = await res.json();
164
+ const models = (data.models || []).map(m => m.name);
165
+ sawOllamaHostNoModel = true;
166
+ const hasModel = models.some(m => m && m.split(':')[0] === candidate.model.split(':')[0]);
167
+ if (hasModel) return true;
138
168
  }
139
169
  }
140
170
  } catch { /* host unreachable, try next */ }
141
171
  }
142
172
 
143
- if (sawReachableHost) {
144
- const primary = candidates[0];
173
+ if (sawOllamaHostNoModel) {
174
+ const primary = candidates.find(c => c.type !== 'lmstudio') || candidates[0];
145
175
  console.log(chalk.yellow(` ⚠️ Ollama is reachable but model "${primary?.model || 'gemma4:e4b'}" was not found on any live host`));
146
176
  console.log(chalk.dim(` Run: ollama pull ${primary?.model || 'gemma4:e4b'}`));
147
177
  }
@@ -480,9 +510,9 @@ program
480
510
  if (opts.extract !== false) {
481
511
  const ollamaAvailable = await checkOllamaAvailability(config);
482
512
  if (!ollamaAvailable) {
483
- console.log(chalk.yellow('\n ⚠️ No AI extraction available (Ollama unreachable, no API keys configured)'));
513
+ console.log(chalk.yellow('\n ⚠️ No AI extraction available (Ollama/LM Studio unreachable, no API keys configured)'));
484
514
  console.log(chalk.white(' → Switching to ') + chalk.bold.green('crawl-only mode') + chalk.white(' — raw data will be collected without AI extraction'));
485
- console.log(chalk.dim(' Tip: Install Ollama (ollama.com) + run `ollama pull gemma4:e4b` to enable local AI extraction\n'));
515
+ console.log(chalk.dim(' Tip: Install Ollama (ollama.com) or LM Studio (lmstudio.ai) to enable local AI extraction\n'));
486
516
  opts.extract = false;
487
517
  }
488
518
  }
@@ -538,6 +568,10 @@ program
538
568
  stealth: !!opts.stealth,
539
569
  tiered: opts.tiered !== false,
540
570
  strictHost: !!opts.domain, // BUG-006: enforce exact hostname when --domain is set
571
+ onSitemapDiscovered: (urls) => {
572
+ try { upsertSitemapUrls(db, domainId, urls.map(u => u.url), `${site.url}/sitemap.xml`); }
573
+ catch (e) { console.warn(`[sitemap] inventory save failed: ${e.message}`); }
574
+ },
541
575
  };
542
576
 
543
577
  for await (const page of crawlDomain(site.url, crawlOpts)) {
@@ -568,6 +602,9 @@ program
568
602
  title: page.title || null,
569
603
  metaDesc: page.metaDesc || null,
570
604
  bodyText: page.fullBodyText || page.bodyText || null,
605
+ finalUrl: page.finalUrl || null,
606
+ redirectChain: page.redirectChain || null,
607
+ xRobotsTag: page.xRobotsTag || null,
571
608
  });
572
609
  const pageId = pageRes?.id;
573
610
 
@@ -1101,7 +1138,7 @@ function getOpenClawToken() {
1101
1138
  return null;
1102
1139
  }
1103
1140
 
1104
- async function callOpenClaw(prompt, model = 'default') {
1141
+ async function callOpenClaw(prompt, model = 'openclaw') {
1105
1142
  const token = getOpenClawToken();
1106
1143
  if (!token) throw new Error('OpenClaw token not found');
1107
1144
 
@@ -1109,6 +1146,9 @@ async function callOpenClaw(prompt, model = 'default') {
1109
1146
  const controller = new AbortController();
1110
1147
  const timeout = setTimeout(() => controller.abort(), timeoutMs);
1111
1148
 
1149
+ // OpenClaw gateway expects 'openclaw' or 'openclaw/<agentId>'
1150
+ const clawModel = (!model || model === 'default') ? 'openclaw' : model;
1151
+
1112
1152
  try {
1113
1153
  const res = await fetch('http://127.0.0.1:18789/v1/chat/completions', {
1114
1154
  method: 'POST',
@@ -1118,7 +1158,7 @@ async function callOpenClaw(prompt, model = 'default') {
1118
1158
  'Content-Type': 'application/json',
1119
1159
  },
1120
1160
  body: JSON.stringify({
1121
- model: model === 'openclaw' ? 'default' : model,
1161
+ model: clawModel,
1122
1162
  messages: [{ role: 'user', content: prompt }],
1123
1163
  temperature: 0.2,
1124
1164
  max_tokens: 4000,
@@ -1138,15 +1178,18 @@ async function callAnalysisModel(prompt, model = 'gemini') {
1138
1178
  const requestedModel = String(model || 'gemini').trim();
1139
1179
  const normalizedModel = requestedModel.toLowerCase();
1140
1180
 
1181
+ // Non-Gemini model: try OpenClaw first, then fall back to Gemini CLI
1141
1182
  if (normalizedModel !== 'gemini') {
1142
1183
  try {
1143
1184
  return await callOpenClaw(prompt, requestedModel);
1144
1185
  } catch (err) {
1145
- console.error('[openclaw]', err.message);
1146
- return null;
1186
+ console.warn(chalk.dim(` [openclaw] ${err.message}`));
1187
+ console.log(chalk.yellow(` Falling back to Gemini CLI...\n`));
1188
+ // Fall through to Gemini CLI below
1147
1189
  }
1148
1190
  }
1149
1191
 
1192
+ // Try Gemini CLI
1150
1193
  const timeoutMs = parseInt(process.env.GEMINI_TIMEOUT_MS || '120000', 10);
1151
1194
  try {
1152
1195
  const result = spawnSync('gemini', ['-p', '-'], {
@@ -1163,7 +1206,17 @@ async function callAnalysisModel(prompt, model = 'gemini') {
1163
1206
 
1164
1207
  return result.stdout;
1165
1208
  } catch (err) {
1166
- const fallbackModel = process.env.OPENCLAW_ANALYSIS_MODEL || 'default';
1209
+ // Gemini CLI failed — try OpenClaw as a last resort (if we haven't already)
1210
+ const fallbackModel = process.env.OPENCLAW_ANALYSIS_MODEL || 'openclaw';
1211
+ if (normalizedModel !== 'gemini') {
1212
+ // Already tried OpenClaw above, show combined error
1213
+ const geminiMsg = err.message || '';
1214
+ console.error(chalk.red('\n ✗ Analysis failed — no model available\n'));
1215
+ console.error(chalk.dim(` Gemini: ${geminiMsg}`));
1216
+ console.error(chalk.dim(` OpenClaw: already tried (${requestedModel})`));
1217
+ console.error(chalk.dim('\n Docs: https://ukkometa.fi/en/seo-intel/setup/\n'));
1218
+ return null;
1219
+ }
1167
1220
  try {
1168
1221
  console.warn(`[gemini] ${err.message}`);
1169
1222
  console.log(chalk.yellow(`Gemini CLI unavailable, retrying via OpenClaw (${fallbackModel})...\n`));
@@ -1269,7 +1322,12 @@ program
1269
1322
  let pageCount = 0;
1270
1323
  let skipped = 0;
1271
1324
  let blocked = false;
1272
- for await (const page of crawlDomain(next.url)) {
1325
+ for await (const page of crawlDomain(next.url, {
1326
+ onSitemapDiscovered: (urls) => {
1327
+ try { upsertSitemapUrls(db, domainId, urls.map(u => u.url), `${next.url}/sitemap.xml`); }
1328
+ catch (e) { console.warn(`[sitemap] inventory save failed: ${e.message}`); }
1329
+ },
1330
+ })) {
1273
1331
  // ── Handle blocked pages from backoff system ──
1274
1332
  if (page._blocked) {
1275
1333
  blocked = true;
@@ -1291,6 +1349,9 @@ program
1291
1349
  title: page.title || null,
1292
1350
  metaDesc: page.metaDesc || null,
1293
1351
  bodyText: page.fullBodyText || page.bodyText || null,
1352
+ finalUrl: page.finalUrl || null,
1353
+ redirectChain: page.redirectChain || null,
1354
+ xRobotsTag: page.xRobotsTag || null,
1294
1355
  });
1295
1356
  const pageId = pageRes?.id;
1296
1357
 
@@ -2425,6 +2486,73 @@ program
2425
2486
  console.log(chalk.gray(' Feed this to Gemini: "Find the gaps in each heading structure above."\n'));
2426
2487
  });
2427
2488
 
2489
+ // ── TECHNICAL AUDIT (extended-data) ───────────────────────────────────────
2490
+ program
2491
+ .command('tech-audit <project>')
2492
+ .description('Technical SEO audit from crawled data (titles, meta, noindex, redirects, sitemap diff)')
2493
+ .option('--domain <domain>', 'Audit a single domain (defaults to all target domains)')
2494
+ .option('--head', 'Also run HEAD checks against sitemap URLs (network-heavy)')
2495
+ .option('--concurrency <n>', 'Parallel HEAD requests when --head is set', '6')
2496
+ .option('--format <type>', 'Output format: brief or json', 'brief')
2497
+ .action(async (project, opts) => {
2498
+ const { runTechnicalAudit } = await import('./analysis/technical-audit.js');
2499
+ const isJson = opts.format === 'json';
2500
+ const db = getDb();
2501
+
2502
+ const domainRows = opts.domain
2503
+ ? [{ domain: opts.domain }]
2504
+ : db.prepare("SELECT domain FROM domains WHERE project = ? AND role IN ('target','owned')").all(project);
2505
+
2506
+ if (!domainRows.length) {
2507
+ if (isJson) console.log(JSON.stringify({ command: 'tech-audit', project, error: 'no target domains', domains: [] }));
2508
+ else console.log(chalk.yellow(`No target domains found for project ${project}.`));
2509
+ return;
2510
+ }
2511
+
2512
+ const results = [];
2513
+ for (const { domain } of domainRows) {
2514
+ const res = await runTechnicalAudit(db, {
2515
+ project,
2516
+ domain,
2517
+ runSitemapHead: !!opts.head,
2518
+ sitemapConcurrency: parseInt(opts.concurrency) || 6,
2519
+ });
2520
+ results.push({ domain, ...res });
2521
+ }
2522
+
2523
+ if (isJson) {
2524
+ console.log(JSON.stringify({ command: 'tech-audit', project, timestamp: new Date().toISOString(), domains: results }));
2525
+ return;
2526
+ }
2527
+
2528
+ for (const r of results) {
2529
+ console.log(chalk.bold.cyan(`\n🔧 Technical audit — ${r.domain}`));
2530
+ if (r.gated) {
2531
+ console.log(chalk.gray(' (extended-data gate closed — upgrade to unlock technical audits)'));
2532
+ continue;
2533
+ }
2534
+ if (r.error) { console.log(chalk.red(` ✗ ${r.error}`)); continue; }
2535
+
2536
+ const { stats, findings } = r;
2537
+ const sev = stats.findings_by_severity || {};
2538
+ console.log(chalk.gray(` ${stats.pages} pages · ${stats.sitemap_urls} sitemap URLs · ${stats.findings_total} findings`));
2539
+ console.log(chalk.gray(` ${chalk.red(sev.error || 0)} errors · ${chalk.yellow(sev.warn || 0)} warnings · ${chalk.blue(sev.info || 0)} info`));
2540
+ if (stats.sitemap_head) {
2541
+ const sh = stats.sitemap_head;
2542
+ console.log(chalk.gray(` sitemap HEAD — ${sh.ok} ok · ${sh.redirected} 3xx · ${sh.broken} 4xx/5xx · ${sh.errored} errors`));
2543
+ }
2544
+
2545
+ const order = { error: 0, warn: 1, info: 2 };
2546
+ const sorted = [...findings].sort((a, b) => (order[a.severity] ?? 3) - (order[b.severity] ?? 3));
2547
+ for (const f of sorted.slice(0, 40)) {
2548
+ const icon = f.severity === 'error' ? chalk.red('✗') : f.severity === 'warn' ? chalk.yellow('⚠') : chalk.blue('ℹ');
2549
+ const target = f.url ? f.url.replace(/https?:\/\/[^/]+/, '') : '';
2550
+ console.log(` ${icon} ${chalk.bold(f.type)} ${chalk.gray(target)} — ${f.details}`);
2551
+ }
2552
+ if (sorted.length > 40) console.log(chalk.gray(` … +${sorted.length - 40} more`));
2553
+ }
2554
+ });
2555
+
2428
2556
  // ── ORPHAN ENTITIES ───────────────────────────────────────────────────────
2429
2557
  program
2430
2558
  .command('orphans <project>')
@@ -4439,6 +4567,7 @@ program
4439
4567
  .description('Generate an AEO-optimised blog post draft from Intelligence Ledger data')
4440
4568
  .option('--topic <keyword>', 'Focus the post on a specific topic')
4441
4569
  .option('--lang <code>', 'Language: en or fi', 'en')
4570
+ .option('--type <type>', 'Content type: blog, docs, or social', 'blog')
4442
4571
  .option('--model <name>', 'Model to use for generation (gemini, claude, gpt, deepseek)', 'gemini')
4443
4572
  .option('--save', 'Save the generated draft to reports/')
4444
4573
  .action(async (project, opts) => {
@@ -4480,6 +4609,7 @@ program
4480
4609
  config,
4481
4610
  lang: opts.lang,
4482
4611
  topic: opts.topic || null,
4612
+ contentType: opts.type || 'blog',
4483
4613
  });
4484
4614
  console.log(chalk.gray(` Prompt size: ${(prompt.length / 1024).toFixed(1)}KB`));
4485
4615
 
@@ -4771,6 +4901,297 @@ program.hook('preAction', async () => {
4771
4901
  }
4772
4902
  });
4773
4903
 
4904
+ // ── SCAN — One-shot full audit ────────────────────────────────────────────────
4905
+ program
4906
+ .command('scan <domain>')
4907
+ .description('One-shot full audit: crawl → extract → analyze → export (no config needed)')
4908
+ .option('--pages <n>', 'Max pages to crawl', '100')
4909
+ .option('--no-ai', 'Skip AI-enriched export (deterministic only)')
4910
+ .option('--model <name>', 'Model for analysis + AI export (gemini, claude, gpt)', 'gemini')
4911
+ .option('--stealth', 'Enable stealth browser mode (Playwright) for JS-heavy sites')
4912
+ .action(async (domainInput, opts) => {
4913
+ if (!requirePro('scan')) return;
4914
+
4915
+ // ── Parse domain ──
4916
+ const domainRaw = domainInput.replace(/^https?:\/\//, '').replace(/\/.*$/, '').replace(/^www\./, '');
4917
+ const projectSlug = '_scan-' + domainRaw.replace(/[^a-z0-9]/gi, '-').toLowerCase();
4918
+
4919
+ // Resolve the actual reachable URL (handles www redirects and bare-domain failures)
4920
+ let domain = domainRaw;
4921
+ let siteUrl = defaultSiteUrl(domain);
4922
+ let wwwRedirectMissing = false;
4923
+ try {
4924
+ const controller = new AbortController();
4925
+ const timer = setTimeout(() => controller.abort(), 8000);
4926
+ const probe = await fetch(siteUrl, { method: 'HEAD', redirect: 'follow', signal: controller.signal });
4927
+ clearTimeout(timer);
4928
+ const finalUrl = new URL(probe.url);
4929
+ if (finalUrl.hostname !== domain) {
4930
+ console.log(chalk.dim(` Resolved: ${domain} → ${finalUrl.hostname}`));
4931
+ domain = finalUrl.hostname.replace(/^www\./, '') === domainRaw ? domainRaw : finalUrl.hostname;
4932
+ siteUrl = finalUrl.origin;
4933
+ }
4934
+ } catch {
4935
+ // Bare domain unreachable — try www variant
4936
+ const wwwUrl = `https://www.${domainRaw}`;
4937
+ try {
4938
+ const controller = new AbortController();
4939
+ const timer = setTimeout(() => controller.abort(), 8000);
4940
+ const probe = await fetch(wwwUrl, { method: 'HEAD', redirect: 'follow', signal: controller.signal });
4941
+ clearTimeout(timer);
4942
+ if (probe.ok || probe.status < 400) {
4943
+ console.log(chalk.dim(` ${domainRaw} unreachable, using www.${domainRaw}`));
4944
+ console.log(chalk.yellow(` ⚠ Missing redirect: ${domainRaw} should 301 to www.${domainRaw}`));
4945
+ siteUrl = wwwUrl;
4946
+ domain = `www.${domainRaw}`;
4947
+ wwwRedirectMissing = true;
4948
+ }
4949
+ } catch { /* www also unreachable — proceed with original, crawler will report error */ }
4950
+ }
4951
+ const useStealth = opts.stealth === true;
4952
+ const useAi = opts.ai !== false;
4953
+ const maxPages = Math.min(parseInt(opts.pages) || 100, capPages(9999));
4954
+
4955
+ console.log(chalk.bold.hex('#d4af37')(`\n${'═'.repeat(60)}`));
4956
+ console.log(chalk.bold.hex('#d4af37')(` ⚡ SCAN — Full SEO Audit`));
4957
+ console.log(chalk.bold.hex('#d4af37')(`${'═'.repeat(60)}`));
4958
+ console.log('');
4959
+ console.log(chalk.white(` Target: ${chalk.bold(domain)}`));
4960
+ console.log(chalk.white(` Pages: ${maxPages}`));
4961
+ console.log(chalk.white(` Stealth: ${useStealth ? chalk.green('yes') : chalk.gray('no')}`));
4962
+ console.log(chalk.white(` AI Export: ${useAi ? chalk.green('yes') : chalk.gray('no')}`));
4963
+ console.log(chalk.white(` Model: ${opts.model}`));
4964
+ console.log('');
4965
+
4966
+ const scanStart = Date.now();
4967
+ const db = getDb();
4968
+
4969
+ // ── Ephemeral config ──
4970
+ const config = {
4971
+ project: projectSlug,
4972
+ context: { siteName: domain, industry: '', audience: '', goal: '' },
4973
+ target: { domain, url: siteUrl, maxPages, crawlMode: 'standard' },
4974
+ competitors: [],
4975
+ owned: [],
4976
+ };
4977
+ // Save config so dashboard/export functions work
4978
+ const configPath = join(__dirname, `config/${projectSlug}.json`);
4979
+ writeFileSync(configPath, JSON.stringify(config, null, 2), 'utf8');
4980
+
4981
+ applyExtractionRuntimeConfig(config);
4982
+
4983
+ // ── Step 1: Crawl + Extract ──
4984
+ console.log(chalk.bold.cyan(' ⏱ Step 1/3 — Crawl + Extract'));
4985
+ console.log('');
4986
+
4987
+ // Check extraction availability
4988
+ let doExtract = true;
4989
+ const ollamaAvailable = await checkOllamaAvailability(config);
4990
+ if (!ollamaAvailable) {
4991
+ console.log(chalk.yellow(' ⚠ No AI extraction available (Ollama/LM Studio unreachable)'));
4992
+ console.log(chalk.gray(' → Crawl-only mode — body text still captured for analysis'));
4993
+ console.log('');
4994
+ doExtract = false;
4995
+ }
4996
+
4997
+ upsertDomain(db, { domain, project: projectSlug, role: 'target' });
4998
+ const domainId = db.prepare('SELECT id FROM domains WHERE domain = ? AND project = ?').get(domain, projectSlug)?.id;
4999
+
5000
+ let pageCount = 0, extracted = 0, failed = 0;
5001
+ const tag = chalk.cyan(`[${domain.split('.')[0]}]`);
5002
+
5003
+ try {
5004
+ for await (const page of crawlDomain(siteUrl, {
5005
+ maxPages, stealth: useStealth, tiered: true,
5006
+ onSitemapDiscovered: (urls) => {
5007
+ try { upsertSitemapUrls(db, domainId, urls.map(u => u.url), `${siteUrl}/sitemap.xml`); }
5008
+ catch (e) { console.warn(`[sitemap] inventory save failed: ${e.message}`); }
5009
+ },
5010
+ })) {
5011
+ if (page._blocked) {
5012
+ console.log(chalk.bold.red(` ${tag} ⛔ BLOCKED: ${page._blockReason}`));
5013
+ break;
5014
+ }
5015
+
5016
+ try {
5017
+ const pageRes = upsertPage(db, {
5018
+ domainId, url: page.url, statusCode: page.status,
5019
+ wordCount: page.wordCount, loadMs: page.loadMs,
5020
+ isIndexable: page.isIndexable, clickDepth: page.depth ?? 0,
5021
+ publishedDate: page.publishedDate || null, modifiedDate: page.modifiedDate || null,
5022
+ contentHash: page.contentHash || null, title: page.title || null,
5023
+ metaDesc: page.metaDesc || null, bodyText: page.fullBodyText || page.bodyText || null,
5024
+ finalUrl: page.finalUrl || null, redirectChain: page.redirectChain || null, xRobotsTag: page.xRobotsTag || null,
5025
+ });
5026
+ const pageId = pageRes?.id;
5027
+
5028
+ upsertTechnical(db, { pageId, hasCanonical: page.hasCanonical, hasOgTags: page.hasOgTags, hasSchema: page.hasSchema, hasRobots: page.hasRobots });
5029
+ insertHeadings(db, pageId, page.headings);
5030
+ insertLinks(db, pageId, page.links);
5031
+ if (page.parsedSchemas?.length) insertPageSchemas(db, pageId, page.parsedSchemas);
5032
+
5033
+ if (doExtract) {
5034
+ process.stdout.write(chalk.gray(` ${tag} [${pageCount + 1}] d${page.depth ?? 0} ${page.url.slice(0, 60)} → extracting...`));
5035
+ try {
5036
+ const extractFn = await getExtractPage();
5037
+ const extraction = await extractFn(page);
5038
+ insertExtraction(db, { pageId, data: extraction });
5039
+ insertKeywords(db, pageId, extraction.keywords);
5040
+ process.stdout.write(chalk.green(` ✓\n`));
5041
+ extracted++;
5042
+ } catch (err) {
5043
+ process.stdout.write(chalk.red(` ✗ ${err.message}\n`));
5044
+ failed++;
5045
+ }
5046
+ } else {
5047
+ process.stdout.write(chalk.gray(` ${tag} [${pageCount + 1}] d${page.depth ?? 0} ${page.url.slice(0, 65)} ✓\n`));
5048
+ }
5049
+ pageCount++;
5050
+ } catch (pageErr) {
5051
+ console.log(chalk.yellow(` ${tag} ⚠ Skipped ${page.url?.slice(0, 60) || 'unknown'}: ${pageErr.message}`));
5052
+ failed++;
5053
+ }
5054
+ }
5055
+ } catch (crawlErr) {
5056
+ console.log(chalk.yellow(`\n ⚠ Crawl stopped early: ${crawlErr.message}`));
5057
+ if (pageCount === 0) {
5058
+ console.log(chalk.red(` ✗ Could not reach ${domain} — check the URL and try again.\n`));
5059
+ try { unlinkSync(configPath); } catch { /* fine */ }
5060
+ return;
5061
+ }
5062
+ console.log(chalk.dim(` → Continuing with ${pageCount} pages already captured...\n`));
5063
+ }
5064
+
5065
+ const crawlSec = ((Date.now() - scanStart) / 1000).toFixed(1);
5066
+ console.log(chalk.green(`\n ✅ Crawl done: ${pageCount} pages, ${extracted} extracted (${crawlSec}s)\n`));
5067
+
5068
+ // ── Step 2: Analyze ──
5069
+ console.log(chalk.bold.cyan(' ⏱ Step 2/3 — Analyze'));
5070
+ console.log('');
5071
+
5072
+ const summary = getCompetitorSummary(db, projectSlug);
5073
+ const target = summary.find(s => s.role === 'target');
5074
+
5075
+ if (!target) {
5076
+ console.log(chalk.yellow(' ⚠ No target data found — skipping analysis'));
5077
+ } else {
5078
+ target.domain = domain;
5079
+ const keywordMatrix = getKeywordMatrix(db, projectSlug);
5080
+ const headings = getHeadingStructure(db, projectSlug);
5081
+
5082
+ const buildPromptFn = await getBuildAnalysisPrompt();
5083
+ const prompt = buildPromptFn({
5084
+ project: projectSlug, target, competitors: [],
5085
+ keywordMatrix, headingStructure: headings, context: config.context,
5086
+ });
5087
+
5088
+ console.log(chalk.gray(` Prompt: ~${Math.round(prompt.length / 4)} tokens → ${opts.model}...`));
5089
+ process.env._SEO_INTEL_PROJECT = projectSlug;
5090
+ const result = await callAnalysisModel(prompt, opts.model);
5091
+
5092
+ if (result) {
5093
+ try {
5094
+ const jsonMatch = result.match(/\{[\s\S]*\}/);
5095
+ const analysis = JSON.parse(jsonMatch[0]);
5096
+
5097
+ // Save to DB
5098
+ const analysisTs = Date.now();
5099
+ db.prepare(`
5100
+ INSERT INTO analyses (project, generated_at, model, keyword_gaps, long_tails, quick_wins, new_pages, content_gaps, positioning, technical_gaps, raw)
5101
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
5102
+ `).run(
5103
+ projectSlug, analysisTs, opts.model,
5104
+ JSON.stringify(analysis.keyword_gaps || []),
5105
+ JSON.stringify(analysis.long_tails || []),
5106
+ JSON.stringify(analysis.quick_wins || []),
5107
+ JSON.stringify(analysis.new_pages || []),
5108
+ JSON.stringify(analysis.content_gaps || []),
5109
+ JSON.stringify(analysis.positioning || {}),
5110
+ JSON.stringify(analysis.technical_gaps || []),
5111
+ result,
5112
+ );
5113
+ const analysisRowId = db.prepare('SELECT last_insert_rowid() as id').get().id;
5114
+ upsertInsightsFromAnalysis(db, projectSlug, analysisRowId, analysis, analysisTs);
5115
+
5116
+ printAnalysisSummary(analysis, projectSlug);
5117
+ } catch (parseErr) {
5118
+ console.log(chalk.yellow(` ⚠ Could not parse analysis: ${parseErr.message}`));
5119
+ const rawPath = join(__dirname, `reports/${projectSlug}-raw-${new Date().toISOString().slice(0, 10)}.md`);
5120
+ writeFileSync(rawPath, result, 'utf8');
5121
+ console.log(chalk.gray(` Raw output saved: ${rawPath}`));
5122
+ }
5123
+ } else {
5124
+ console.log(chalk.yellow(' ⚠ No response from model — skipping analysis'));
5125
+ }
5126
+ }
5127
+
5128
+ // ── Step 3: Export ──
5129
+ console.log(chalk.bold.cyan('\n ⏱ Step 3/3 — Export Report'));
5130
+ console.log('');
5131
+
5132
+ // Generate dashboard (so gatherProjectData works)
5133
+ try {
5134
+ const allConfigs = loadAllConfigs();
5135
+ generateMultiDashboard(db, allConfigs);
5136
+ } catch { /* ok if this fails */ }
5137
+
5138
+ // Build export markdown via the server's export logic
5139
+ const { gatherProjectData } = await import('./reports/generate-html.js');
5140
+ const dash = gatherProjectData(db, projectSlug, config);
5141
+
5142
+ // Deterministic markdown builder, imported from lib/scan-export.js (not inlined here)
5143
+ const { buildScanMarkdown } = await import('./lib/scan-export.js');
5144
+ let md = buildScanMarkdown(dash, projectSlug, domain, { wwwRedirectMissing, bareDomain: domainRaw });
5145
+
5146
+ // AI enrichment
5147
+ if (useAi) {
5148
+ console.log(chalk.gray(` Enriching with AI (${opts.model})...`));
5149
+ const aiPrompt = `You are an SEO strategist reviewing a data export report. Your job is to ENRICH this report, NOT rewrite it.
5150
+
5151
+ Rules:
5152
+ - Keep ALL existing data, tables, headers, and instruction blocks exactly as they are
5153
+ - Fill any empty table cells (marked with empty | | columns) with concise, actionable content
5154
+ - For empty "Parent" cells: infer the parent keyword cluster
5155
+ - For empty "Opportunity" cells: classify as how-to guide, comparison, tutorial, landing page, etc.
5156
+ - For empty "Gap"/"Suggestion"/"Rationale"/"Potential" cells: fill with concise actionable content
5157
+ - After the last section, add "## AI Action Plan" with a numbered top-10 highest-impact actions
5158
+ - Keep markdown format — tables, headers, blockquotes
5159
+ - Be concise — table cells under 80 chars
5160
+ - Do NOT add commentary outside the report
5161
+
5162
+ Here is the report:
5163
+
5164
+ ${md}`;
5165
+ const aiResult = await callAnalysisModel(aiPrompt, opts.model);
5166
+ if (aiResult && aiResult.trim().length > md.length * 0.5) {
5167
+ md = aiResult;
5168
+ console.log(chalk.green(' ✓ AI enrichment applied'));
5169
+ } else {
5170
+ console.log(chalk.yellow(' ⚠ AI enrichment failed — using deterministic export'));
5171
+ }
5172
+ }
5173
+
5174
+ // Save
5175
+ const dateStr = new Date().toISOString().slice(0, 10);
5176
+ const fileName = `scan-${domain.replace(/[^a-z0-9]/gi, '-')}-${dateStr}.md`;
5177
+ const outPath = join(__dirname, 'reports', fileName);
5178
+ writeFileSync(outPath, md, 'utf8');
5179
+
5180
+ const totalSec = ((Date.now() - scanStart) / 1000).toFixed(1);
5181
+ console.log('');
5182
+ console.log(chalk.bold.hex('#d4af37')(`${'═'.repeat(60)}`));
5183
+ console.log(chalk.bold.hex('#d4af37')(` ✅ Scan Complete — ${totalSec}s`));
5184
+ console.log(chalk.bold.hex('#d4af37')(`${'═'.repeat(60)}`));
5185
+ console.log('');
5186
+ console.log(chalk.white(` Report: ${chalk.bold(outPath)}`));
5187
+ console.log(chalk.white(` Pages: ${pageCount} crawled, ${extracted} extracted`));
5188
+ console.log(chalk.white(` Export: ${useAi ? 'AI-enriched' : 'deterministic'} markdown`));
5189
+ console.log('');
5190
+
5191
+ // Clean up ephemeral config (keep the report)
5192
+ try { unlinkSync(configPath); } catch { /* fine if already gone */ }
5193
+ });
5194
+
4774
5195
  // ── BUG-002: No-args getting-started handler ─────────────────────────────────
4775
5196
  // When run with no command, show a friendly entry point instead of generic help
4776
5197
  if (process.argv.length <= 2) {