seo-intel 1.5.21 → 1.5.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cli.js CHANGED
@@ -39,6 +39,7 @@ import {
39
39
  getCompetitorSummary, getKeywordMatrix, getHeadingStructure,
40
40
  getPageHash, getSchemasByProject,
41
41
  upsertInsightsFromAnalysis, upsertInsightsFromKeywords,
42
+ upsertSitemapUrls,
42
43
  } from './db/db.js';
43
44
  import { generateMultiDashboard } from './reports/generate-html.js';
44
45
  import { buildTechnicalActions } from './exports/technical.js';
@@ -73,13 +74,13 @@ function resolveExtractionRuntime(config) {
73
74
  const norm = h => String(h || '').trim().replace(/\/+$/, '');
74
75
 
75
76
  const candidates = [
76
- { host: norm(primaryUrl), model: String(primaryModel).trim() || 'gemma4:e4b' },
77
+ { host: norm(primaryUrl), model: String(primaryModel).trim() || 'gemma4:e4b', type: 'ollama' },
77
78
  ];
78
79
 
79
80
  // Legacy single fallback — always use project-selected model, not OLLAMA_FALLBACK_MODEL
80
81
  const fallbackUrl = norm(process.env.OLLAMA_FALLBACK_URL || '');
81
82
  if (fallbackUrl && !candidates.some(c => c.host === fallbackUrl)) {
82
- candidates.push({ host: fallbackUrl, model: String(primaryModel).trim() || 'gemma4:e4b' });
83
+ candidates.push({ host: fallbackUrl, model: String(primaryModel).trim() || 'gemma4:e4b', type: 'ollama' });
83
84
  }
84
85
 
85
86
  // OLLAMA_HOSTS — comma-separated LAN hosts from setup wizard
@@ -87,13 +88,20 @@ function resolveExtractionRuntime(config) {
87
88
  for (const h of process.env.OLLAMA_HOSTS.split(',')) {
88
89
  const host = norm(h);
89
90
  if (host && !candidates.some(c => c.host === host)) {
90
- candidates.push({ host, model: String(primaryModel).trim() || 'gemma4:e4b' });
91
+ candidates.push({ host, model: String(primaryModel).trim() || 'gemma4:e4b', type: 'ollama' });
91
92
  }
92
93
  }
93
94
  }
94
95
 
95
96
  if (!candidates.some(candidate => candidate.host === localhost)) {
96
- candidates.push({ host: localhost, model: String(primaryModel).trim() || 'gemma4:e4b' });
97
+ candidates.push({ host: localhost, model: String(primaryModel).trim() || 'gemma4:e4b', type: 'ollama' });
98
+ }
99
+
100
+ // LM Studio — always probe default port; env vars override URL/model
101
+ const lmStudioUrl = norm(process.env.LMSTUDIO_URL || '') || 'http://localhost:1234';
102
+ const lmStudioModel = String(process.env.LMSTUDIO_MODEL || '').trim();
103
+ if (!candidates.some(c => c.host === lmStudioUrl)) {
104
+ candidates.push({ host: lmStudioUrl, model: lmStudioModel, type: 'lmstudio' });
97
105
  }
98
106
 
99
107
  const seen = new Set();
@@ -115,33 +123,55 @@ function applyExtractionRuntimeConfig(config) {
115
123
  // ── AI AVAILABILITY PREFLIGHT ────────────────────────────────────────────
116
124
  /**
117
125
  * Check if any AI extraction backend is reachable.
118
- * Tries: primary Ollama → fallback Ollama → returns false.
126
+ * Tries: primary Ollama → fallback Ollama → LM Studio → returns false.
119
127
  * Fast: 2s timeout per host, runs sequentially.
120
128
  */
121
129
  async function checkOllamaAvailability(config) {
122
130
  const candidates = resolveExtractionRuntime(config);
123
- let sawReachableHost = false;
131
+ let sawOllamaHostNoModel = false;
124
132
 
125
133
  for (const candidate of candidates) {
126
134
  try {
127
135
  const controller = new AbortController();
128
136
  const timeout = setTimeout(() => controller.abort(), 2000);
129
- const res = await fetch(`${candidate.host}/api/tags`, { signal: controller.signal });
130
- clearTimeout(timeout);
131
- if (res.ok) {
132
- const data = await res.json();
133
- const models = (data.models || []).map(m => m.name);
134
- sawReachableHost = true;
135
- const hasModel = models.some(m => m && m.split(':')[0] === candidate.model.split(':')[0]);
136
- if (hasModel) {
137
- return true; // Ollama reachable + model available
137
+
138
+ if (candidate.type === 'lmstudio') {
139
+ // LM Studio: GET /api/v1/models
140
+ const res = await fetch(`${candidate.host}/api/v1/models`, { signal: controller.signal });
141
+ clearTimeout(timeout);
142
+ if (res.ok) {
143
+ const data = await res.json().catch(() => ({ data: [] }));
144
+ const models = (data.data || []).map(m => m.id || m.model).filter(Boolean);
145
+ // Accept any loaded model when no specific model was requested
146
+ if (!candidate.model || models.some(id => id === candidate.model || id.endsWith('/' + candidate.model))) {
147
+ console.log(chalk.dim(` LM Studio: ${candidate.host} ✓ (${models[0] || 'model loaded'})`));
148
+ return true;
149
+ }
150
+ if (models.length > 0) {
151
+ // Model mismatch but something is loaded — still usable
152
+ console.log(chalk.dim(` LM Studio: ${candidate.host} ✓ (using ${models[0]})`));
153
+ return true;
154
+ }
155
+ console.log(chalk.yellow(` ⚠️ LM Studio reachable but no models loaded`));
156
+ console.log(chalk.dim(` Load a model in LM Studio to enable extraction`));
157
+ }
158
+ } else {
159
+ // Ollama
160
+ const res = await fetch(`${candidate.host}/api/tags`, { signal: controller.signal });
161
+ clearTimeout(timeout);
162
+ if (res.ok) {
163
+ const data = await res.json();
164
+ const models = (data.models || []).map(m => m.name);
165
+ sawOllamaHostNoModel = true;
166
+ const hasModel = models.some(m => m && m.split(':')[0] === candidate.model.split(':')[0]);
167
+ if (hasModel) return true;
138
168
  }
139
169
  }
140
170
  } catch { /* host unreachable, try next */ }
141
171
  }
142
172
 
143
- if (sawReachableHost) {
144
- const primary = candidates[0];
173
+ if (sawOllamaHostNoModel) {
174
+ const primary = candidates.find(c => c.type !== 'lmstudio') || candidates[0];
145
175
  console.log(chalk.yellow(` ⚠️ Ollama is reachable but model "${primary?.model || 'gemma4:e4b'}" was not found on any live host`));
146
176
  console.log(chalk.dim(` Run: ollama pull ${primary?.model || 'gemma4:e4b'}`));
147
177
  }
@@ -480,9 +510,9 @@ program
480
510
  if (opts.extract !== false) {
481
511
  const ollamaAvailable = await checkOllamaAvailability(config);
482
512
  if (!ollamaAvailable) {
483
- console.log(chalk.yellow('\n ⚠️ No AI extraction available (Ollama unreachable, no API keys configured)'));
513
+ console.log(chalk.yellow('\n ⚠️ No AI extraction available (Ollama/LM Studio unreachable, no API keys configured)'));
484
514
  console.log(chalk.white(' → Switching to ') + chalk.bold.green('crawl-only mode') + chalk.white(' — raw data will be collected without AI extraction'));
485
- console.log(chalk.dim(' Tip: Install Ollama (ollama.com) + run `ollama pull gemma4:e4b` to enable local AI extraction\n'));
515
+ console.log(chalk.dim(' Tip: Install Ollama (ollama.com) or LM Studio (lmstudio.ai) to enable local AI extraction\n'));
486
516
  opts.extract = false;
487
517
  }
488
518
  }
@@ -538,6 +568,10 @@ program
538
568
  stealth: !!opts.stealth,
539
569
  tiered: opts.tiered !== false,
540
570
  strictHost: !!opts.domain, // BUG-006: enforce exact hostname when --domain is set
571
+ onSitemapDiscovered: (urls) => {
572
+ try { upsertSitemapUrls(db, domainId, urls.map(u => u.url), `${site.url}/sitemap.xml`); }
573
+ catch (e) { console.warn(`[sitemap] inventory save failed: ${e.message}`); }
574
+ },
541
575
  };
542
576
 
543
577
  for await (const page of crawlDomain(site.url, crawlOpts)) {
@@ -568,6 +602,9 @@ program
568
602
  title: page.title || null,
569
603
  metaDesc: page.metaDesc || null,
570
604
  bodyText: page.fullBodyText || page.bodyText || null,
605
+ finalUrl: page.finalUrl || null,
606
+ redirectChain: page.redirectChain || null,
607
+ xRobotsTag: page.xRobotsTag || null,
571
608
  });
572
609
  const pageId = pageRes?.id;
573
610
 
@@ -1101,7 +1138,7 @@ function getOpenClawToken() {
1101
1138
  return null;
1102
1139
  }
1103
1140
 
1104
- async function callOpenClaw(prompt, model = 'default') {
1141
+ async function callOpenClaw(prompt, model = 'openclaw') {
1105
1142
  const token = getOpenClawToken();
1106
1143
  if (!token) throw new Error('OpenClaw token not found');
1107
1144
 
@@ -1109,6 +1146,9 @@ async function callOpenClaw(prompt, model = 'default') {
1109
1146
  const controller = new AbortController();
1110
1147
  const timeout = setTimeout(() => controller.abort(), timeoutMs);
1111
1148
 
1149
+ // OpenClaw gateway expects 'openclaw' or 'openclaw/<agentId>'
1150
+ const clawModel = (!model || model === 'default') ? 'openclaw' : model;
1151
+
1112
1152
  try {
1113
1153
  const res = await fetch('http://127.0.0.1:18789/v1/chat/completions', {
1114
1154
  method: 'POST',
@@ -1118,7 +1158,7 @@ async function callOpenClaw(prompt, model = 'default') {
1118
1158
  'Content-Type': 'application/json',
1119
1159
  },
1120
1160
  body: JSON.stringify({
1121
- model: model === 'openclaw' ? 'default' : model,
1161
+ model: clawModel,
1122
1162
  messages: [{ role: 'user', content: prompt }],
1123
1163
  temperature: 0.2,
1124
1164
  max_tokens: 4000,
@@ -1138,15 +1178,18 @@ async function callAnalysisModel(prompt, model = 'gemini') {
1138
1178
  const requestedModel = String(model || 'gemini').trim();
1139
1179
  const normalizedModel = requestedModel.toLowerCase();
1140
1180
 
1181
+ // Non-Gemini model: try OpenClaw first, then fall back to Gemini CLI
1141
1182
  if (normalizedModel !== 'gemini') {
1142
1183
  try {
1143
1184
  return await callOpenClaw(prompt, requestedModel);
1144
1185
  } catch (err) {
1145
- console.error('[openclaw]', err.message);
1146
- return null;
1186
+ console.warn(chalk.dim(` [openclaw] ${err.message}`));
1187
+ console.log(chalk.yellow(` Falling back to Gemini CLI...\n`));
1188
+ // Fall through to Gemini CLI below
1147
1189
  }
1148
1190
  }
1149
1191
 
1192
+ // Try Gemini CLI
1150
1193
  const timeoutMs = parseInt(process.env.GEMINI_TIMEOUT_MS || '120000', 10);
1151
1194
  try {
1152
1195
  const result = spawnSync('gemini', ['-p', '-'], {
@@ -1163,7 +1206,17 @@ async function callAnalysisModel(prompt, model = 'gemini') {
1163
1206
 
1164
1207
  return result.stdout;
1165
1208
  } catch (err) {
1166
- const fallbackModel = process.env.OPENCLAW_ANALYSIS_MODEL || 'default';
1209
+ // Gemini CLI failed — try OpenClaw as last resort (if we haven't already)
1210
+ const fallbackModel = process.env.OPENCLAW_ANALYSIS_MODEL || 'openclaw';
1211
+ if (normalizedModel !== 'gemini') {
1212
+ // Already tried OpenClaw above — show combined error
1213
+ const geminiMsg = err.message || '';
1214
+ console.error(chalk.red('\n ✗ Analysis failed — no model available\n'));
1215
+ console.error(chalk.dim(` Gemini: ${geminiMsg}`));
1216
+ console.error(chalk.dim(` OpenClaw: already tried (${requestedModel})`));
1217
+ console.error(chalk.dim('\n Docs: https://ukkometa.fi/en/seo-intel/setup/\n'));
1218
+ return null;
1219
+ }
1167
1220
  try {
1168
1221
  console.warn(`[gemini] ${err.message}`);
1169
1222
  console.log(chalk.yellow(`Gemini CLI unavailable, retrying via OpenClaw (${fallbackModel})...\n`));
@@ -1269,7 +1322,12 @@ program
1269
1322
  let pageCount = 0;
1270
1323
  let skipped = 0;
1271
1324
  let blocked = false;
1272
- for await (const page of crawlDomain(next.url)) {
1325
+ for await (const page of crawlDomain(next.url, {
1326
+ onSitemapDiscovered: (urls) => {
1327
+ try { upsertSitemapUrls(db, domainId, urls.map(u => u.url), `${next.url}/sitemap.xml`); }
1328
+ catch (e) { console.warn(`[sitemap] inventory save failed: ${e.message}`); }
1329
+ },
1330
+ })) {
1273
1331
  // ── Handle blocked pages from backoff system ──
1274
1332
  if (page._blocked) {
1275
1333
  blocked = true;
@@ -1291,6 +1349,9 @@ program
1291
1349
  title: page.title || null,
1292
1350
  metaDesc: page.metaDesc || null,
1293
1351
  bodyText: page.fullBodyText || page.bodyText || null,
1352
+ finalUrl: page.finalUrl || null,
1353
+ redirectChain: page.redirectChain || null,
1354
+ xRobotsTag: page.xRobotsTag || null,
1294
1355
  });
1295
1356
  const pageId = pageRes?.id;
1296
1357
 
@@ -2425,6 +2486,73 @@ program
2425
2486
  console.log(chalk.gray(' Feed this to Gemini: "Find the gaps in each heading structure above."\n'));
2426
2487
  });
2427
2488
 
2489
+ // ── TECHNICAL AUDIT (extended-data) ───────────────────────────────────────
2490
+ program
2491
+ .command('tech-audit <project>')
2492
+ .description('Technical SEO audit from crawled data (titles, meta, noindex, redirects, sitemap diff)')
2493
+ .option('--domain <domain>', 'Audit a single domain (defaults to all target domains)')
2494
+ .option('--head', 'Also run HEAD checks against sitemap URLs (network-heavy)')
2495
+ .option('--concurrency <n>', 'Parallel HEAD requests when --head is set', '6')
2496
+ .option('--format <type>', 'Output format: brief or json', 'brief')
2497
+ .action(async (project, opts) => {
2498
+ const { runTechnicalAudit } = await import('./analysis/technical-audit.js');
2499
+ const isJson = opts.format === 'json';
2500
+ const db = getDb();
2501
+
2502
+ const domainRows = opts.domain
2503
+ ? [{ domain: opts.domain }]
2504
+ : db.prepare("SELECT domain FROM domains WHERE project = ? AND role IN ('target','owned')").all(project);
2505
+
2506
+ if (!domainRows.length) {
2507
+ if (isJson) console.log(JSON.stringify({ command: 'tech-audit', project, error: 'no target domains', domains: [] }));
2508
+ else console.log(chalk.yellow(`No target domains found for project ${project}.`));
2509
+ return;
2510
+ }
2511
+
2512
+ const results = [];
2513
+ for (const { domain } of domainRows) {
2514
+ const res = await runTechnicalAudit(db, {
2515
+ project,
2516
+ domain,
2517
+ runSitemapHead: !!opts.head,
2518
+ sitemapConcurrency: parseInt(opts.concurrency) || 6,
2519
+ });
2520
+ results.push({ domain, ...res });
2521
+ }
2522
+
2523
+ if (isJson) {
2524
+ console.log(JSON.stringify({ command: 'tech-audit', project, timestamp: new Date().toISOString(), domains: results }));
2525
+ return;
2526
+ }
2527
+
2528
+ for (const r of results) {
2529
+ console.log(chalk.bold.cyan(`\n🔧 Technical audit — ${r.domain}`));
2530
+ if (r.gated) {
2531
+ console.log(chalk.gray(' (extended-data gate closed — upgrade to unlock technical audits)'));
2532
+ continue;
2533
+ }
2534
+ if (r.error) { console.log(chalk.red(` ✗ ${r.error}`)); continue; }
2535
+
2536
+ const { stats, findings } = r;
2537
+ const sev = stats.findings_by_severity || {};
2538
+ console.log(chalk.gray(` ${stats.pages} pages · ${stats.sitemap_urls} sitemap URLs · ${stats.findings_total} findings`));
2539
+ console.log(chalk.gray(` ${chalk.red(sev.error || 0)} errors · ${chalk.yellow(sev.warn || 0)} warnings · ${chalk.blue(sev.info || 0)} info`));
2540
+ if (stats.sitemap_head) {
2541
+ const sh = stats.sitemap_head;
2542
+ console.log(chalk.gray(` sitemap HEAD — ${sh.ok} ok · ${sh.redirected} 3xx · ${sh.broken} 4xx/5xx · ${sh.errored} errors`));
2543
+ }
2544
+
2545
+ const order = { error: 0, warn: 1, info: 2 };
2546
+ const sorted = [...findings].sort((a, b) => (order[a.severity] ?? 3) - (order[b.severity] ?? 3));
2547
+ for (const f of sorted.slice(0, 40)) {
2548
+ const icon = f.severity === 'error' ? chalk.red('✗') : f.severity === 'warn' ? chalk.yellow('⚠') : chalk.blue('ℹ');
2549
+ const target = f.url ? f.url.replace(/https?:\/\/[^/]+/, '') : '';
2550
+ console.log(` ${icon} ${chalk.bold(f.type)} ${chalk.gray(target)} — ${f.details}`);
2551
+ }
2552
+ if (sorted.length > 40) console.log(chalk.gray(` … +${sorted.length - 40} more`));
2553
+ }
2554
+ });
2555
+
2428
2556
  // ── ORPHAN ENTITIES ───────────────────────────────────────────────────────
2429
2557
  program
2430
2558
  .command('orphans <project>')
@@ -4780,15 +4908,47 @@ program
4780
4908
  .option('--pages <n>', 'Max pages to crawl', '100')
4781
4909
  .option('--no-ai', 'Skip AI-enriched export (deterministic only)')
4782
4910
  .option('--model <name>', 'Model for analysis + AI export (gemini, claude, gpt)', 'gemini')
4783
- .option('--no-stealth', 'Disable stealth browser mode')
4911
+ .option('--stealth', 'Enable stealth browser mode (Playwright) for JS-heavy sites')
4784
4912
  .action(async (domainInput, opts) => {
4785
4913
  if (!requirePro('scan')) return;
4786
4914
 
4787
4915
  // ── Parse domain ──
4788
- const domain = domainInput.replace(/^https?:\/\//, '').replace(/\/.*$/, '').replace(/^www\./, '');
4789
- const projectSlug = '_scan-' + domain.replace(/[^a-z0-9]/gi, '-').toLowerCase();
4790
- const siteUrl = defaultSiteUrl(domain);
4791
- const useStealth = opts.stealth !== false;
4916
+ const domainRaw = domainInput.replace(/^https?:\/\//, '').replace(/\/.*$/, '').replace(/^www\./, '');
4917
+ const projectSlug = '_scan-' + domainRaw.replace(/[^a-z0-9]/gi, '-').toLowerCase();
4918
+
4919
+ // Resolve the actual reachable URL (handles www redirects and bare-domain failures)
4920
+ let domain = domainRaw;
4921
+ let siteUrl = defaultSiteUrl(domain);
4922
+ let wwwRedirectMissing = false;
4923
+ try {
4924
+ const controller = new AbortController();
4925
+ const timer = setTimeout(() => controller.abort(), 8000);
4926
+ const probe = await fetch(siteUrl, { method: 'HEAD', redirect: 'follow', signal: controller.signal });
4927
+ clearTimeout(timer);
4928
+ const finalUrl = new URL(probe.url);
4929
+ if (finalUrl.hostname !== domain) {
4930
+ console.log(chalk.dim(` Resolved: ${domain} → ${finalUrl.hostname}`));
4931
+ domain = finalUrl.hostname.replace(/^www\./, '') === domainRaw ? domainRaw : finalUrl.hostname;
4932
+ siteUrl = finalUrl.origin;
4933
+ }
4934
+ } catch {
4935
+ // Bare domain unreachable — try www variant
4936
+ const wwwUrl = `https://www.${domainRaw}`;
4937
+ try {
4938
+ const controller = new AbortController();
4939
+ const timer = setTimeout(() => controller.abort(), 8000);
4940
+ const probe = await fetch(wwwUrl, { method: 'HEAD', redirect: 'follow', signal: controller.signal });
4941
+ clearTimeout(timer);
4942
+ if (probe.ok || probe.status < 400) {
4943
+ console.log(chalk.dim(` ${domainRaw} unreachable, using www.${domainRaw}`));
4944
+ console.log(chalk.yellow(` ⚠ Missing redirect: ${domainRaw} should 301 to www.${domainRaw}`));
4945
+ siteUrl = wwwUrl;
4946
+ domain = `www.${domainRaw}`;
4947
+ wwwRedirectMissing = true;
4948
+ }
4949
+ } catch { /* www also unreachable — proceed with original, crawler will report error */ }
4950
+ }
4951
+ const useStealth = opts.stealth === true;
4792
4952
  const useAi = opts.ai !== false;
4793
4953
  const maxPages = Math.min(parseInt(opts.pages) || 100, capPages(9999));
4794
4954
 
@@ -4828,7 +4988,7 @@ program
4828
4988
  let doExtract = true;
4829
4989
  const ollamaAvailable = await checkOllamaAvailability(config);
4830
4990
  if (!ollamaAvailable) {
4831
- console.log(chalk.yellow(' ⚠ No AI extraction available (Ollama unreachable)'));
4991
+ console.log(chalk.yellow(' ⚠ No AI extraction available (Ollama/LM Studio unreachable)'));
4832
4992
  console.log(chalk.gray(' → Crawl-only mode — body text still captured for analysis'));
4833
4993
  console.log('');
4834
4994
  doExtract = false;
@@ -4840,44 +5000,66 @@ program
4840
5000
  let pageCount = 0, extracted = 0, failed = 0;
4841
5001
  const tag = chalk.cyan(`[${domain.split('.')[0]}]`);
4842
5002
 
4843
- for await (const page of crawlDomain(siteUrl, { maxPages, stealth: useStealth, tiered: true })) {
4844
- if (page._blocked) {
4845
- console.log(chalk.bold.red(` ${tag} BLOCKED: ${page._blockReason}`));
4846
- break;
4847
- }
5003
+ try {
5004
+ for await (const page of crawlDomain(siteUrl, {
5005
+ maxPages, stealth: useStealth, tiered: true,
5006
+ onSitemapDiscovered: (urls) => {
5007
+ try { upsertSitemapUrls(db, domainId, urls.map(u => u.url), `${siteUrl}/sitemap.xml`); }
5008
+ catch (e) { console.warn(`[sitemap] inventory save failed: ${e.message}`); }
5009
+ },
5010
+ })) {
5011
+ if (page._blocked) {
5012
+ console.log(chalk.bold.red(` ${tag} ⛔ BLOCKED: ${page._blockReason}`));
5013
+ break;
5014
+ }
4848
5015
 
4849
- const pageRes = upsertPage(db, {
4850
- domainId, url: page.url, statusCode: page.status,
4851
- wordCount: page.wordCount, loadMs: page.loadMs,
4852
- isIndexable: page.isIndexable, clickDepth: page.depth ?? 0,
4853
- publishedDate: page.publishedDate || null, modifiedDate: page.modifiedDate || null,
4854
- contentHash: page.contentHash || null, title: page.title || null,
4855
- metaDesc: page.metaDesc || null, bodyText: page.fullBodyText || page.bodyText || null,
4856
- });
4857
- const pageId = pageRes?.id;
5016
+ try {
5017
+ const pageRes = upsertPage(db, {
5018
+ domainId, url: page.url, statusCode: page.status,
5019
+ wordCount: page.wordCount, loadMs: page.loadMs,
5020
+ isIndexable: page.isIndexable, clickDepth: page.depth ?? 0,
5021
+ publishedDate: page.publishedDate || null, modifiedDate: page.modifiedDate || null,
5022
+ contentHash: page.contentHash || null, title: page.title || null,
5023
+ metaDesc: page.metaDesc || null, bodyText: page.fullBodyText || page.bodyText || null,
5024
+ finalUrl: page.finalUrl || null, redirectChain: page.redirectChain || null, xRobotsTag: page.xRobotsTag || null,
5025
+ });
5026
+ const pageId = pageRes?.id;
4858
5027
 
4859
- upsertTechnical(db, { pageId, hasCanonical: page.hasCanonical, hasOgTags: page.hasOgTags, hasSchema: page.hasSchema, hasRobots: page.hasRobots });
4860
- insertHeadings(db, pageId, page.headings);
4861
- insertLinks(db, pageId, page.links);
4862
- if (page.parsedSchemas?.length) insertPageSchemas(db, pageId, page.parsedSchemas);
5028
+ upsertTechnical(db, { pageId, hasCanonical: page.hasCanonical, hasOgTags: page.hasOgTags, hasSchema: page.hasSchema, hasRobots: page.hasRobots });
5029
+ insertHeadings(db, pageId, page.headings);
5030
+ insertLinks(db, pageId, page.links);
5031
+ if (page.parsedSchemas?.length) insertPageSchemas(db, pageId, page.parsedSchemas);
4863
5032
 
4864
- if (doExtract) {
4865
- process.stdout.write(chalk.gray(` ${tag} [${pageCount + 1}] d${page.depth ?? 0} ${page.url.slice(0, 60)} → extracting...`));
4866
- try {
4867
- const extractFn = await getExtractPage();
4868
- const extraction = await extractFn(page);
4869
- insertExtraction(db, { pageId, data: extraction });
4870
- insertKeywords(db, pageId, extraction.keywords);
4871
- process.stdout.write(chalk.green(` ✓\n`));
4872
- extracted++;
4873
- } catch (err) {
4874
- process.stdout.write(chalk.red(` ✗ ${err.message}\n`));
5033
+ if (doExtract) {
5034
+ process.stdout.write(chalk.gray(` ${tag} [${pageCount + 1}] d${page.depth ?? 0} ${page.url.slice(0, 60)} → extracting...`));
5035
+ try {
5036
+ const extractFn = await getExtractPage();
5037
+ const extraction = await extractFn(page);
5038
+ insertExtraction(db, { pageId, data: extraction });
5039
+ insertKeywords(db, pageId, extraction.keywords);
5040
+ process.stdout.write(chalk.green(` ✓\n`));
5041
+ extracted++;
5042
+ } catch (err) {
5043
+ process.stdout.write(chalk.red(` ✗ ${err.message}\n`));
5044
+ failed++;
5045
+ }
5046
+ } else {
5047
+ process.stdout.write(chalk.gray(` ${tag} [${pageCount + 1}] d${page.depth ?? 0} ${page.url.slice(0, 65)} ✓\n`));
5048
+ }
5049
+ pageCount++;
5050
+ } catch (pageErr) {
5051
+ console.log(chalk.yellow(` ${tag} ⚠ Skipped ${page.url?.slice(0, 60) || 'unknown'}: ${pageErr.message}`));
4875
5052
  failed++;
4876
5053
  }
4877
- } else {
4878
- process.stdout.write(chalk.gray(` ${tag} [${pageCount + 1}] d${page.depth ?? 0} ${page.url.slice(0, 65)} ✓\n`));
4879
5054
  }
4880
- pageCount++;
5055
+ } catch (crawlErr) {
5056
+ console.log(chalk.yellow(`\n ⚠ Crawl stopped early: ${crawlErr.message}`));
5057
+ if (pageCount === 0) {
5058
+ console.log(chalk.red(` ✗ Could not reach ${domain} — check the URL and try again.\n`));
5059
+ try { unlinkSync(configPath); } catch { /* fine */ }
5060
+ return;
5061
+ }
5062
+ console.log(chalk.dim(` → Continuing with ${pageCount} pages already captured...\n`));
4881
5063
  }
4882
5064
 
4883
5065
  const crawlSec = ((Date.now() - scanStart) / 1000).toFixed(1);
@@ -4959,7 +5141,7 @@ program
4959
5141
 
4960
5142
  // Inline the deterministic markdown builder from server.js
4961
5143
  const { buildScanMarkdown } = await import('./lib/scan-export.js');
4962
- let md = buildScanMarkdown(dash, projectSlug, domain);
5144
+ let md = buildScanMarkdown(dash, projectSlug, domain, { wwwRedirectMissing, bareDomain: domainRaw });
4963
5145
 
4964
5146
  // AI enrichment
4965
5147
  if (useAi) {
package/crawler/index.js CHANGED
@@ -263,6 +263,10 @@ export async function* crawlDomain(startUrl, opts = {}) {
263
263
  // ── Sitemap-first: seed queue from sitemap.xml (section-aware) ──
264
264
  try {
265
265
  const sitemapUrls = await fetchSitemap(startUrl);
266
+ // Report full sitemap inventory to caller (for DB persistence / audit diff)
267
+ if (sitemapUrls.length > 0 && typeof opts.onSitemapDiscovered === 'function') {
268
+ try { await opts.onSitemapDiscovered(sitemapUrls); } catch { /* ignore */ }
269
+ }
266
270
  if (sitemapUrls.length > 0) {
267
271
  // Apply section budgets if tiered crawling is enabled
268
272
  const budgeted = tiered ? applySectionBudgets(sitemapUrls, maxPages) : sitemapUrls;
@@ -452,9 +456,36 @@ async function processPage(page, url, base, depth, queue, maxDepth) {
452
456
  status = res?.status() || 0;
453
457
  const loadMs = Date.now() - t0;
454
458
 
459
+ // ── Final URL after redirects ──
460
+ let finalUrl = null;
461
+ try { finalUrl = page.url() || null; } catch { /* ignore */ }
462
+
463
+ // ── Redirect chain (walk request.redirectedFrom() backwards) ──
464
+ const redirectChain = [];
465
+ try {
466
+ let req = res?.request();
467
+ const chain = [];
468
+ while (req) {
469
+ const prev = req.redirectedFrom?.();
470
+ if (!prev) break;
471
+ const prevRes = await prev.response().catch(() => null);
472
+ chain.push({ url: prev.url(), status: prevRes?.status() ?? null });
473
+ req = prev;
474
+ }
475
+ // chain is in reverse order (closest redirect first); reverse for chronological
476
+ redirectChain.push(...chain.reverse());
477
+ } catch { /* ignore */ }
478
+
479
+ // ── X-Robots-Tag header ──
480
+ let xRobotsTag = null;
481
+ try {
482
+ const headers = res?.headers?.() || {};
483
+ xRobotsTag = headers['x-robots-tag'] || null;
484
+ } catch { /* ignore */ }
485
+
455
486
  // ── Return status for backoff logic (don't silently drop 4xx) ──
456
487
  if (status === 429 || status === 503 || status === 403) {
457
- return { url, depth, status, loadMs, wordCount: 0, isIndexable: false, title: '', metaDesc: '', headings: [], links: [], bodyText: '', schemaTypes: [], vitals: {}, publishedDate: null, modifiedDate: null, contentHash: null };
488
+ return { url, depth, status, loadMs, wordCount: 0, isIndexable: false, title: '', metaDesc: '', headings: [], links: [], bodyText: '', schemaTypes: [], vitals: {}, publishedDate: null, modifiedDate: null, contentHash: null, finalUrl, redirectChain, xRobotsTag };
458
489
  }
459
490
  if (status >= 400) return null;
460
491
 
@@ -507,7 +538,9 @@ async function processPage(page, url, base, depth, queue, maxDepth) {
507
538
  const wordCount = await page.$eval('body', el => el.innerText.split(/\s+/).filter(Boolean).length).catch(() => 0);
508
539
 
509
540
  const robotsMeta = await page.$eval('meta[name="robots"]', el => el.content).catch(() => '');
510
- const isIndexable = !robotsMeta.toLowerCase().includes('noindex');
541
+ const metaNoindex = robotsMeta.toLowerCase().includes('noindex');
542
+ const headerNoindex = (xRobotsTag || '').toLowerCase().includes('noindex');
543
+ const isIndexable = !(metaNoindex || headerNoindex);
511
544
  const hasCanonical = await page.$('link[rel="canonical"]').then(el => !!el).catch(() => false);
512
545
  const hasOgTags = await page.$('meta[property^="og:"]').then(el => !!el).catch(() => false);
513
546
 
@@ -576,6 +609,7 @@ async function processPage(page, url, base, depth, queue, maxDepth) {
576
609
  hasCanonical, hasOgTags,
577
610
  hasRobots: !!robotsMeta,
578
611
  hasSchema: schemaTypes.length > 0,
612
+ finalUrl, redirectChain, xRobotsTag,
579
613
  };
580
614
  }
581
615
 
@@ -101,3 +101,47 @@ function extractTagContent(xml, tagName) {
101
101
  }
102
102
  return results;
103
103
  }
104
+
105
+ /**
106
+ * HEAD-check a single URL without following redirects.
107
+ * Returns { status, location } — location is the Location header when 3XX.
108
+ * Never throws — errors return { status: 0, error: msg }.
109
+ */
110
+ export async function headCheck(url, { timeoutMs = 8000 } = {}) {
111
+ try {
112
+ const ctrl = new AbortController();
113
+ const t = setTimeout(() => ctrl.abort(), timeoutMs);
114
+ const res = await fetch(url, {
115
+ method: 'HEAD',
116
+ redirect: 'manual',
117
+ signal: ctrl.signal,
118
+ headers: { 'User-Agent': 'SEOIntelBot/1.0' },
119
+ }).finally(() => clearTimeout(t));
120
+ return {
121
+ status: res.status,
122
+ location: res.headers.get('location') || null,
123
+ };
124
+ } catch (err) {
125
+ return { status: 0, error: err.message };
126
+ }
127
+ }
128
+
129
+ /**
130
+ * Run HEAD checks against an array of sitemap URL rows in parallel (capped).
131
+ * Accepts [{ id, url }]. Invokes onResult(row, result) per check.
132
+ */
133
+ export async function headCheckAll(rows, { concurrency = 6, onResult } = {}) {
134
+ const queue = [...rows];
135
+ const worker = async () => {
136
+ while (queue.length) {
137
+ const row = queue.shift();
138
+ if (!row) break;
139
+ const result = await headCheck(row.url);
140
+ if (onResult) {
141
+ try { await onResult(row, result); } catch { /* swallow */ }
142
+ }
143
+ }
144
+ };
145
+ const workers = Array.from({ length: Math.min(concurrency, rows.length) }, () => worker());
146
+ await Promise.all(workers);
147
+ }