seo-intel 1.1.5 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cli.js CHANGED
@@ -24,11 +24,11 @@ import { getNextCrawlTarget, needsAnalysis, getCrawlStatus, loadAllConfigs } fro
24
24
  import {
25
25
  getDb, upsertDomain, upsertPage, insertExtraction,
26
26
  insertKeywords, insertHeadings, insertLinks, insertPageSchemas,
27
- upsertTechnical,
27
+ upsertTechnical, pruneStaleDomains,
28
28
  getCompetitorSummary, getKeywordMatrix, getHeadingStructure,
29
29
  getPageHash, getSchemasByProject
30
30
  } from './db/db.js';
31
- import { generateHtmlDashboard, generateMultiDashboard } from './reports/generate-html.js';
31
+ import { generateMultiDashboard } from './reports/generate-html.js';
32
32
  import { buildTechnicalActions } from './exports/technical.js';
33
33
  import { buildCompetitiveActions } from './exports/competitive.js';
34
34
  import { buildSuggestiveActions } from './exports/suggestive.js';
@@ -91,6 +91,38 @@ async function checkOllamaAvailability() {
91
91
  // ── EXTRACTION PROGRESS TRACKER ──────────────────────────────────────────
92
92
  const PROGRESS_FILE = join(__dirname, '.extraction-progress.json');
93
93
 
94
// ── Graceful shutdown support ──
// Cleanup callbacks registered by crawl/extract commands (e.g. close browser).
const _shutdownCallbacks = [];
// True once the first SIGINT/SIGTERM has started the shutdown sequence.
let _shuttingDown = false;

/**
 * Register a cleanup callback to run when the process receives SIGINT/SIGTERM.
 * Callbacks run in registration order; each may be sync or return a promise.
 * @param {Function} fn - cleanup callback, invoked with no arguments
 */
function onShutdown(fn) {
  _shutdownCallbacks.push(fn);
}

/** Remove every registered cleanup callback (the array is emptied in place). */
function clearShutdownCallbacks() {
  _shutdownCallbacks.length = 0;
}

/**
 * Signal handler: mark the progress file as stopped, run the registered
 * cleanup callbacks one after another, then exit with code 0.
 *
 * If a second signal arrives while the first shutdown is still in progress
 * (for example a cleanup callback is hanging), exit immediately with code 1
 * so the user is never stuck with a process that ignores Ctrl-C.
 *
 * @param {string} signal - name of the received signal, used in the log line
 * @returns {Promise<void>}
 */
async function _gracefulExit(signal) {
  if (_shuttingDown) {
    // Cleanup from the first signal has not finished — do not wait any longer.
    console.log(chalk.red(`\n⏹ Received ${signal} again — forcing exit.`));
    process.exit(1);
    return;
  }
  _shuttingDown = true;
  console.log(chalk.yellow(`\n⏹ Received ${signal} — stopping gracefully…`));

  // Update progress file, but only when this process owns the running entry.
  try {
    const progress = readProgress();
    if (progress && progress.status === 'running' && progress.pid === process.pid) {
      writeProgress({ ...progress, status: 'stopped', stopped_at: Date.now() });
    }
  } catch { /* best-effort */ }

  // Run cleanup callbacks (close browsers, etc.). A failing callback must not
  // prevent the remaining ones from running.
  for (const fn of _shutdownCallbacks) {
    try { await Promise.resolve(fn()); } catch { /* best-effort */ }
  }

  process.exit(0);
}

process.on('SIGTERM', () => _gracefulExit('SIGTERM'));
process.on('SIGINT', () => _gracefulExit('SIGINT'));
125
+
94
126
  function writeProgress(data) {
95
127
  try {
96
128
  writeFileSync(PROGRESS_FILE, JSON.stringify({
@@ -361,6 +393,21 @@ program
361
393
  }
362
394
  }
363
395
 
396
+ // ── Prune stale domains (DB entries no longer in config) ─────────────
397
+ {
398
+ const configDomains = new Set([
399
+ config.target?.domain,
400
+ ...(config.owned || []).map(o => o.domain),
401
+ ...(config.competitors || []).map(c => c.domain),
402
+ ].filter(Boolean));
403
+
404
+ const pruned = pruneStaleDomains(db, project, configDomains);
405
+ if (pruned.length) {
406
+ console.log(chalk.yellow(`\n 🧹 Pruned ${pruned.length} stale domain(s) from DB (no longer in config):`));
407
+ for (const d of pruned) console.log(chalk.dim(` − ${d}`));
408
+ }
409
+ }
410
+
364
411
  // ── Tier gate: Free tier = crawl-only, no AI extraction ──────────────
365
412
  if (opts.extract !== false && !isPro()) {
366
413
  console.log(chalk.dim('\n ℹ Free tier: crawl-only mode (AI extraction requires Solo/Agency)'));
@@ -456,6 +503,9 @@ program
456
503
  publishedDate: page.publishedDate || null,
457
504
  modifiedDate: page.modifiedDate || null,
458
505
  contentHash: page.contentHash || null,
506
+ title: page.title || null,
507
+ metaDesc: page.metaDesc || null,
508
+ bodyText: page.fullBodyText || page.bodyText || null,
459
509
  });
460
510
  const pageId = pageRes?.id;
461
511
 
@@ -481,6 +531,7 @@ program
481
531
  page_index: totalExtracted + 1,
482
532
  started_at: crawlStart,
483
533
  failed: totalFailed,
534
+ stealth: !!crawlOpts.stealth,
484
535
  });
485
536
  upsertTechnical(db, { pageId, hasCanonical: page.hasCanonical, hasOgTags: page.hasOgTags, hasSchema: page.hasSchema, hasRobots: page.hasRobots });
486
537
  try {
@@ -546,9 +597,10 @@ program
546
597
  if (totalSkipped > 0) console.log(chalk.blue(`\n📊 Incremental: ${totalSkipped} unchanged pages skipped (same content hash)`));
547
598
  if (totalBlocked > 0) console.log(chalk.red(`\n⛔ ${totalBlocked} domain(s) blocked (rate-limited or WAF)`));
548
599
  const elapsed = ((Date.now() - crawlStart) / 1000).toFixed(1);
549
- // Auto-regenerate dashboard so it never goes stale after a crawl
600
+ // Auto-regenerate dashboard (always multi-project so all projects stay current)
550
601
  try {
551
- const dashPath = generateHtmlDashboard(db, project, config);
602
+ const allConfigs = loadAllConfigs();
603
+ const dashPath = generateMultiDashboard(db, allConfigs);
552
604
  console.log(chalk.dim(` 📊 Dashboard refreshed → ${dashPath}`));
553
605
  } catch (dashErr) {
554
606
  console.log(chalk.dim(` ⚠ Dashboard refresh skipped: ${dashErr.message}`));
@@ -664,9 +716,10 @@ program
664
716
  // Print summary
665
717
  printAnalysisSummary(analysis, project);
666
718
 
667
- // Auto-regenerate dashboard so it reflects the new analysis immediately
719
+ // Auto-regenerate dashboard (always multi-project so all projects stay current)
668
720
  try {
669
- const dashPath = generateHtmlDashboard(db, project, config);
721
+ const allConfigs = loadAllConfigs();
722
+ const dashPath = generateMultiDashboard(db, allConfigs);
670
723
  console.log(chalk.dim(` 📊 Dashboard refreshed → ${dashPath}`));
671
724
  } catch (dashErr) {
672
725
  console.log(chalk.dim(` ⚠ Dashboard refresh skipped: ${dashErr.message}`));
@@ -1041,6 +1094,9 @@ program
1041
1094
  publishedDate: page.publishedDate || null,
1042
1095
  modifiedDate: page.modifiedDate || null,
1043
1096
  contentHash: page.contentHash || null,
1097
+ title: page.title || null,
1098
+ metaDesc: page.metaDesc || null,
1099
+ bodyText: page.fullBodyText || page.bodyText || null,
1044
1100
  });
1045
1101
  const pageId = pageRes?.id;
1046
1102
 
@@ -1346,6 +1402,7 @@ program
1346
1402
  .option('--add-owned <domain>', 'Add an owned subdomain')
1347
1403
  .option('--remove-owned <domain>', 'Remove an owned subdomain')
1348
1404
  .option('--set-target <domain>', 'Change the target domain')
1405
+ .option('--prune', 'Remove DB data for domains no longer in config')
1349
1406
  .action((project, opts) => {
1350
1407
  const configPath = join(__dirname, `config/${project}.json`);
1351
1408
  let config;
@@ -1438,6 +1495,24 @@ program
1438
1495
  console.log(chalk.dim(`\n Saved → config/${project}.json`));
1439
1496
  }
1440
1497
 
1498
+ // ── Prune stale DB data (auto on remove, or manual --prune) ─────────
1499
+ if (modified || opts.prune) {
1500
+ const db = getDb();
1501
+ const configDomains = new Set([
1502
+ config.target?.domain,
1503
+ ...(config.owned || []).map(o => o.domain),
1504
+ ...(config.competitors || []).map(c => c.domain),
1505
+ ].filter(Boolean));
1506
+
1507
+ const pruned = pruneStaleDomains(db, project, configDomains);
1508
+ if (pruned.length) {
1509
+ console.log(chalk.yellow(`\n 🧹 Pruned ${pruned.length} stale domain(s) from DB:`));
1510
+ for (const d of pruned) console.log(chalk.dim(` − ${d}`));
1511
+ } else if (opts.prune) {
1512
+ console.log(chalk.dim('\n ✓ No stale domains to prune'));
1513
+ }
1514
+ }
1515
+
1441
1516
  // ── Always show current config
1442
1517
  console.log(chalk.bold.cyan(`\n 📋 ${project} — Domain Configuration\n`));
1443
1518
  console.log(chalk.white(' Target:'));
@@ -1527,13 +1602,14 @@ async function runAnalysis(project, db) {
1527
1602
  program
1528
1603
  .command('extract <project>')
1529
1604
  .description('Run AI extraction on all crawled-but-not-yet-extracted pages (requires Solo/Agency)')
1530
- .option('--stealth', 'Advanced browser mode for JS-heavy and dynamic sites')
1531
- .action(async (project, opts) => {
1605
+ .action(async (project) => {
1532
1606
  if (!requirePro('extract')) return;
1533
1607
  const db = getDb();
1608
+
1609
+ // Query pages that have body_text stored (from crawl) but no extraction yet
1534
1610
  const pendingPages = db.prepare(`
1535
- SELECT p.id, p.url, p.word_count,
1536
- e.id as extracted
1611
+ SELECT p.id, p.url, p.word_count, p.title, p.meta_desc, p.body_text,
1612
+ p.published_date, p.modified_date
1537
1613
  FROM pages p
1538
1614
  JOIN domains d ON d.id = p.domain_id
1539
1615
  LEFT JOIN extractions e ON e.page_id = p.id
@@ -1545,94 +1621,241 @@ program
1545
1621
  process.exit(0);
1546
1622
  }
1547
1623
 
1548
- const mode = opts.stealth ? chalk.magenta('STEALTH') : chalk.gray('standard');
1549
- console.log(chalk.bold.cyan(`\n⚙️ Extracting ${pendingPages.length} pages for ${project} via Qwen [${mode}]...\n`));
1624
+ // Check how many have body_text stored vs need re-crawl
1625
+ const withContent = pendingPages.filter(r => r.body_text);
1626
+ const needsRecrawl = pendingPages.length - withContent.length;
1627
+
1628
+ console.log(chalk.bold.cyan(`\n⚙️ Extracting ${pendingPages.length} pages for ${project} via Qwen...\n`));
1629
+ if (needsRecrawl > 0) {
1630
+ console.log(chalk.yellow(` ⚠ ${needsRecrawl} pages have no stored content (crawled before v1.1.6). Re-crawl to populate.\n`));
1631
+ }
1550
1632
 
1551
1633
  const extractStart = Date.now();
1552
- let done = 0, failed = 0;
1634
+ let done = 0, failed = 0, skipped = 0;
1553
1635
 
1554
- // ── Stealth: single session across all pages (cookie accumulation) ──
1555
- let stealthSession = null;
1556
- if (opts.stealth) {
1557
- const { createStealthSession } = await import('./crawler/stealth.js');
1558
- stealthSession = await createStealthSession();
1559
- console.log(chalk.magenta(' 🥷 Advanced mode — full browser rendering, persistent sessions\n'));
1560
- }
1636
+ // ── Pre-extract template grouping: sample N per group, skip the rest ──
1637
+ const SAMPLE_PER_GROUP = 5;
1638
+ const MIN_GROUP_FOR_SAMPLING = 10;
1639
+ let extractQueue = pendingPages.filter(r => r.body_text); // only pages with stored content
1561
1640
 
1562
1641
  try {
1563
- for (const row of pendingPages) {
1564
- process.stdout.write(chalk.gray(` [${done + failed + 1}/${pendingPages.length}] ${row.url.slice(0, 65)} `));
1565
- if (opts.stealth) process.stdout.write(chalk.magenta('stealth '));
1566
- process.stdout.write(chalk.gray('fetching...'));
1567
-
1568
- writeProgress({
1569
- status: 'running', command: 'extract', project,
1570
- current_url: row.url,
1571
- page_index: done + failed + 1, total: pendingPages.length,
1572
- percent: Math.round(((done + failed) / pendingPages.length) * 100),
1573
- started_at: extractStart, failed,
1574
- stealth: !!opts.stealth,
1575
- });
1642
+ const { clusterUrls } = await import('./analyses/templates/cluster.js');
1643
+ const { groups } = clusterUrls(
1644
+ extractQueue.map(r => ({ url: r.url })),
1645
+ { minGroupSize: MIN_GROUP_FOR_SAMPLING }
1646
+ );
1576
1647
 
1577
- try {
1578
- let pageData;
1648
+ if (groups.length > 0) {
1649
+ const skipUrls = new Set();
1579
1650
 
1580
- if (stealthSession) {
1581
- // Stealth: reuse persistent browser session
1582
- pageData = await stealthSession.fetchPage(row.url);
1583
- } else {
1584
- // Standard: quick single-page crawl
1585
- const { crawlAll } = await import('./crawler/index.js');
1586
- const crawled = await crawlAll(row.url);
1587
- pageData = crawled[0] || null;
1588
- }
1651
+ for (const group of groups) {
1652
+ const urls = group.urls;
1653
+ if (urls.length <= SAMPLE_PER_GROUP) continue;
1589
1654
 
1590
- if (!pageData || pageData.status >= 400) {
1591
- const reason = pageData ? `HTTP ${pageData.status}` : 'no data';
1592
- process.stdout.write(chalk.red(` ${reason}\n`));
1593
- failed++;
1594
- if (stealthSession) {
1595
- // Jittered delay even on failure — don't hammer a blocking site
1596
- await new Promise(r => setTimeout(r, 1500 + Math.random() * 2000));
1597
- }
1598
- continue;
1599
- }
1655
+ const sampleSet = new Set();
1656
+ sampleSet.add(urls[0]); sampleSet.add(urls[1]);
1657
+ sampleSet.add(urls[urls.length - 1]); sampleSet.add(urls[urls.length - 2]);
1658
+ sampleSet.add(urls[Math.floor(urls.length / 2)]);
1600
1659
 
1601
- process.stdout.write(chalk.gray(' extracting...'));
1602
- const extractFn = await getExtractPage();
1603
- const extraction = await extractFn(pageData);
1604
- insertExtraction(db, { pageId: row.id, data: extraction });
1605
- insertKeywords(db, row.id, extraction.keywords);
1606
-
1607
- // Also update headings + links + schemas with fresh data from stealth fetch
1608
- if (stealthSession) {
1609
- insertHeadings(db, row.id, pageData.headings);
1610
- insertLinks(db, row.id, pageData.links);
1611
- if (pageData.parsedSchemas?.length) insertPageSchemas(db, row.id, pageData.parsedSchemas);
1660
+ const skippedCount = urls.length - sampleSet.size;
1661
+ for (const u of urls) {
1662
+ if (!sampleSet.has(u)) skipUrls.add(u);
1612
1663
  }
1664
+ console.log(chalk.yellow(` [template] ${group.pattern} → ${urls.length} pages, sampling ${sampleSet.size}, skipping ${skippedCount}`));
1665
+ }
1613
1666
 
1614
- process.stdout.write(chalk.green(` ✓${pageData.parsedSchemas?.length ? ` [${pageData.parsedSchemas.length} schema]` : ''}\n`));
1667
+ if (skipUrls.size > 0) {
1668
+ extractQueue = extractQueue.filter(r => !skipUrls.has(r.url));
1669
+ skipped += skipUrls.size;
1670
+ console.log(chalk.yellow(` [template] ${withContent.length} extractable → ${extractQueue.length} to extract (${skipUrls.size} template-skipped)\n`));
1671
+ }
1672
+ }
1673
+ } catch (e) {
1674
+ console.log(chalk.gray(` [template] Pattern detection skipped: ${e.message}`));
1675
+ }
1676
+
1677
// ── Consecutive failure tracking per URL pattern ──
// After this many failures in a row for one pattern, the pattern is skipped.
const CONSEC_FAIL_THRESHOLD = 3;
// pattern key → current count of consecutive failures
const patternFailCounts = new Map();
// pattern keys whose remaining pages are no longer processed
const skippedPatterns = new Set();

/**
 * Reduce a URL to a coarse pattern key: hostname plus path, where every path
 * segment that looks variable is replaced by the literal `{var}`.
 * A segment counts as variable when it is longer than 20 characters, is a run
 * of 8 or more hex digits, starts with `0x`, or contains `-` or `_`.
 * Query string and fragment are not part of the key.
 *
 * @param {string} url - absolute URL
 * @returns {string} pattern key, or the input unchanged if it cannot be parsed
 */
function getPatternKey(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return url;
  }

  const looksVariable = (segment) =>
    segment.length > 20 ||
    /^[0-9a-fA-F]{8,}$/.test(segment) ||
    /^0x/.test(segment) ||
    /[-_]/.test(segment);

  const normalized = [];
  for (const segment of parsed.pathname.split('/')) {
    if (segment === '') continue; // drop empty pieces from leading/trailing slashes
    normalized.push(looksVariable(segment) ? '{var}' : segment);
  }
  return `${parsed.hostname}/${normalized.join('/')}`;
}
1691
+
1692
// ── Content similarity detection ──
// Minimum pairwise Jaccard similarity for sampled pages to count as "identical".
const SIMILARITY_THRESHOLD = 0.80;
// Number of pages per pattern that are compared before a decision is made.
const SIMILARITY_SAMPLE_SIZE = 3;
// pattern key → shingle sets of the first SIMILARITY_SAMPLE_SIZE pages seen
const patternFingerprints = new Map();

/**
 * Split text into a set of overlapping n-word shingles.
 * Text is lower-cased and everything except letters, combining marks, digits
 * and whitespace is removed. The character classes are Unicode-aware so that
 * non-English text keeps its words instead of being stripped to nothing.
 *
 * @param {string} text - source text; null/undefined/empty yields an empty set
 * @param {number} [n=3] - words per shingle
 * @returns {Set<string>} shingles, each a single-space-joined run of n words
 */
function textToShingles(text, n = 3) {
  const words = (text || '')
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}\s]/gu, '')
    .split(/\s+/)
    .filter(Boolean);
  const shingles = new Set();
  for (let i = 0; i <= words.length - n; i++) {
    shingles.add(words.slice(i, i + n).join(' '));
  }
  return shingles;
}

/**
 * Jaccard similarity |a ∩ b| / |a ∪ b| of two sets.
 * @param {Set<string>} a
 * @param {Set<string>} b
 * @returns {number} value in [0, 1]; 0 when either set is empty
 */
function jaccardSimilarity(a, b) {
  if (!a.size || !b.size) return 0;
  let intersection = 0;
  for (const s of a) { if (b.has(s)) intersection++; }
  return intersection / (a.size + b.size - intersection);
}

/**
 * Record a page's shingles for its URL pattern and report whether the sample
 * for that pattern has just been completed with all pages near-identical.
 *
 * Returns true exactly once per pattern at most: on the call that brings the
 * sample to SIMILARITY_SAMPLE_SIZE, and only if every pair in the sample
 * reaches SIMILARITY_THRESHOLD. Once the sample is full, further shingle sets
 * are not stored, so memory per pattern stays bounded by the sample size.
 *
 * @param {string} patKey - URL pattern key
 * @param {Set<string>} newShingles - shingles of the page just processed
 * @returns {boolean}
 */
function checkPatternSimilarity(patKey, newShingles) {
  let fps = patternFingerprints.get(patKey);
  if (!fps) {
    fps = [];
    patternFingerprints.set(patKey, fps);
  }
  // The decision for this pattern was already made — nothing left to compare,
  // so do not retain this page's shingle set.
  if (fps.length >= SIMILARITY_SAMPLE_SIZE) return false;

  fps.push(newShingles);
  if (fps.length < SIMILARITY_SAMPLE_SIZE) return false;

  for (let i = 0; i < fps.length; i++) {
    for (let j = i + 1; j < fps.length; j++) {
      if (jaccardSimilarity(fps[i], fps[j]) < SIMILARITY_THRESHOLD) return false;
    }
  }
  return true;
}
1725
+
1726
+ // ── Prepare headings + schema queries (per-page lookups from DB) ──
1727
+ const getHeadings = db.prepare('SELECT level, text FROM headings WHERE page_id = ? ORDER BY id');
1728
+ const getSchemaTypes = db.prepare('SELECT DISTINCT schema_type FROM page_schemas WHERE page_id = ?');
1729
+
1730
+ const totalToProcess = extractQueue.length;
1731
+ console.log(chalk.gray(` 📖 Reading from DB — no network needed\n`));
1732
+
1733
+ for (const row of extractQueue) {
1734
+ const patKey = getPatternKey(row.url);
1735
+ if (skippedPatterns.has(patKey)) {
1736
+ skipped++;
1737
+ continue;
1738
+ }
1739
+
1740
+ const pos = done + failed + 1;
1741
+ process.stdout.write(chalk.gray(` [${pos}/${totalToProcess}] ${row.url.slice(0, 70)} → `));
1742
+ process.stdout.write(chalk.gray('extracting...'));
1743
+
1744
+ writeProgress({
1745
+ status: 'running', command: 'extract', project,
1746
+ current_url: row.url,
1747
+ page_index: pos, total: totalToProcess,
1748
+ percent: Math.round(((done + failed) / totalToProcess) * 100),
1749
+ started_at: extractStart, failed, skipped,
1750
+ });
1751
+
1752
+ let pageFailed = false;
1753
+
1754
+ try {
1755
+ // Read headings + schema types from DB
1756
+ const headings = getHeadings.all(row.id);
1757
+ const schemaTypes = getSchemaTypes.all(row.id).map(r => r.schema_type);
1758
+
1759
+ const extractFn = await getExtractPage();
1760
+ const extraction = await extractFn({
1761
+ url: row.url,
1762
+ title: row.title || '',
1763
+ metaDesc: row.meta_desc || '',
1764
+ headings,
1765
+ bodyText: row.body_text,
1766
+ schemaTypes,
1767
+ publishedDate: row.published_date,
1768
+ modifiedDate: row.modified_date,
1769
+ });
1770
+ insertExtraction(db, { pageId: row.id, data: extraction });
1771
+ insertKeywords(db, row.id, extraction.keywords);
1772
+
1773
+ const isDegraded = extraction.extraction_source === 'degraded';
1774
+ if (isDegraded) {
1775
+ process.stdout.write(chalk.yellow(` ⚠ degraded\n`));
1615
1776
  done++;
1616
- } catch (err) {
1617
- process.stdout.write(chalk.red(` ${err.message}\n`));
1618
- failed++;
1777
+ pageFailed = true;
1778
+ } else {
1779
+ process.stdout.write(chalk.green(` ✓\n`));
1780
+ done++;
1781
+ patternFailCounts.set(patKey, 0);
1619
1782
  }
1620
1783
 
1621
- // Jittered delay in stealth mode (2-5s) to mimic human browsing
1622
- if (stealthSession) {
1623
- await new Promise(r => setTimeout(r, 2000 + Math.random() * 3000));
1784
+ // ── Content similarity detection ──
1785
+ if (row.body_text.length > 50) {
1786
+ const shingles = textToShingles(row.body_text);
1787
+ if (checkPatternSimilarity(patKey, shingles) && !skippedPatterns.has(patKey)) {
1788
+ const remaining = extractQueue.filter(r => getPatternKey(r.url) === patKey).length - (patternFingerprints.get(patKey)?.length || 0);
1789
+ skippedPatterns.add(patKey);
1790
+ if (remaining > 0) {
1791
+ console.log(chalk.yellow(` [similarity] 🔍 ${SIMILARITY_SAMPLE_SIZE} pages from ${patKey} are ${Math.round(SIMILARITY_THRESHOLD * 100)}%+ identical — skipping ${remaining} remaining`));
1792
+ }
1793
+ }
1624
1794
  }
1795
+ } catch (err) {
1796
+ process.stdout.write(chalk.red(` ✗ ${err.message}\n`));
1797
+ failed++;
1798
+ pageFailed = true;
1625
1799
  }
1626
- } finally {
1627
- // Always close stealth session
1628
- if (stealthSession) {
1629
- await stealthSession.close();
1630
- console.log(chalk.magenta(`\n 🥷 Stealth session closed (${stealthSession.getPageCount()} pages fetched)`));
1800
+
1801
+ // ── Track consecutive failures per pattern ──
1802
+ if (pageFailed) {
1803
+ const count = (patternFailCounts.get(patKey) || 0) + 1;
1804
+ patternFailCounts.set(patKey, count);
1805
+ if (count >= CONSEC_FAIL_THRESHOLD) {
1806
+ const remaining = extractQueue.filter(r => !skippedPatterns.has(getPatternKey(r.url)) && getPatternKey(r.url) === patKey).length;
1807
+ skippedPatterns.add(patKey);
1808
+ console.log(chalk.yellow(` [template] ⚡ ${count} consecutive failures for ${patKey} — skipping ~${remaining} remaining pages`));
1809
+ }
1631
1810
  }
1632
1811
  }
1633
1812
 
1634
- writeProgress({ status: 'completed', command: 'extract', project, extracted: done, failed, total: pendingPages.length, started_at: extractStart, finished_at: Date.now() });
1635
- console.log(chalk.bold.green(`\n✅ Extraction complete: ${done} extracted, ${failed} failed\n`));
1813
+ writeProgress({ status: 'completed', command: 'extract', project, extracted: done, failed, skipped, total: pendingPages.length, started_at: extractStart, finished_at: Date.now() });
1814
+ const skipMsg = skipped > 0 ? chalk.yellow(`, ${skipped} template-skipped`) : '';
1815
+ const recrawlMsg = needsRecrawl > 0 ? chalk.yellow(`, ${needsRecrawl} need re-crawl`) : '';
1816
+ console.log(chalk.bold.green(`\n✅ Extraction complete: ${done} extracted, ${failed} failed${skipMsg}${recrawlMsg}\n`));
1817
+ });
1818
+
1819
// ── TEMPLATES ANALYSIS ────────────────────────────────────────────────────
// Registers the `templates <project>` command. The action is gated by
// requirePro('templates'), loads the analysis module lazily, and prints a
// short summary; the process exits 0 when no groups are found and 1 on error.
program
  .command('templates <project>')
  .description('Detect programmatic template pages — assess SEO value without crawling all of them')
  .option('--min-group <n>', 'Minimum URLs to qualify as a template group', '10')
  .option('--sample-size <n>', 'Pages to stealth-crawl per template group', '20')
  .option('--skip-crawl', 'Skip sample crawl (pattern analysis + GSC only)')
  .option('--skip-gsc', 'Skip GSC overlay phase')
  .option('--skip-competitors', 'Skip competitor sitemap census')
  .action(async (project, opts) => {
    if (!requirePro('templates')) return;

    // Parse a numeric CLI option as a base-10 positive integer; anything else
    // (non-numeric, zero, negative) falls back to the documented default.
    const toPositiveInt = (value, fallback) => {
      const parsed = Number.parseInt(value, 10);
      return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
    };

    console.log(chalk.bold.cyan(`\n🔍 SEO Intel — Template Analysis`));
    console.log(chalk.dim(` Project: ${project}`));

    try {
      const { runTemplatesAnalysis } = await import('./analyses/templates/index.js');
      const report = await runTemplatesAnalysis(project, {
        minGroupSize: toPositiveInt(opts.minGroup, 10),
        sampleSize: toPositiveInt(opts.sampleSize, 20),
        skipCrawl: !!opts.skipCrawl,
        skipGsc: !!opts.skipGsc,
        skipCompetitors: !!opts.skipCompetitors,
        log: (msg) => console.log(chalk.gray(msg)),
      });

      if (report.groups.length === 0) {
        console.log(chalk.yellow(`\n No template patterns detected.\n`));
        process.exit(0);
      }

      // Summary
      console.log(chalk.bold.green(`\n✅ Template analysis complete`));
      console.log(chalk.dim(` ${report.stats.totalGroups} groups · ${report.stats.totalGrouped.toLocaleString()} URLs · ${(report.stats.coverage * 100).toFixed(0)}% of sitemap`));
      console.log(chalk.dim(` Run ${chalk.white('seo-intel html ' + project)} to see the full dashboard.\n`));
    } catch (err) {
      console.error(chalk.red(`\n Error: ${err.message}\n`));
      // Full stack only when DEBUG is set in the environment.
      if (process.env.DEBUG) console.error(err.stack);
      process.exit(1);
    }
  });
1637
1860
 
1638
1861
  // ── HTML DASHBOARD ─────────────────────────────────────────────────────────
@@ -1700,10 +1923,10 @@ program
1700
1923
  }
1701
1924
  });
1702
1925
 
1703
- // ── HTML ALL-PROJECTS DASHBOARD ──────────────────────────────────────────────
1926
+ // ── HTML ALL-PROJECTS DASHBOARD (alias for html — kept for backwards compat) ──
1704
1927
  program
1705
1928
  .command('html-all')
1706
- .description('Generate a single HTML dashboard with all projects (dropdown switcher)')
1929
+ .description('Alias for "html" generates the all-projects dashboard')
1707
1930
  .action(() => {
1708
1931
  const db = getDb();
1709
1932
  const configs = loadAllConfigs();
package/crawler/index.js CHANGED
@@ -547,11 +547,16 @@ async function processPage(page, url, base, depth, queue, maxDepth) {
547
547
  // ── Quality gate — detect shells, blocked pages, empty content ──
548
548
  const quality = assessQuality({ wordCount, bodyText, title, status });
549
549
 
550
+ // Full body text for DB storage (extraction reads this); truncated for log output
551
+ const fullBodyText = sanitize(bodyText, 50000); // ~200K chars — enough for any real page
552
+ const shortBodyText = sanitize(bodyText, 2000); // compact version for logging
553
+
550
554
  return {
551
555
  url, depth, status, loadMs, wordCount, isIndexable,
552
556
  title, metaDesc, headings,
553
557
  links: [...internalLinks, ...externalLinks],
554
- bodyText: sanitize(bodyText, 2000),
558
+ bodyText: shortBodyText,
559
+ fullBodyText,
555
560
  schemaTypes, parsedSchemas, vitals, publishedDate, modifiedDate,
556
561
  contentHash: hash,
557
562
  quality: quality.ok, qualityReason: quality.reason,