seo-intel 1.1.5 → 1.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli.js +304 -81
- package/crawler/index.js +6 -1
- package/db/db.js +152 -5
- package/db/schema.sql +48 -0
- package/lib/gate.js +1 -0
- package/package.json +1 -1
- package/reports/generate-html.js +224 -124
- package/server.js +71 -12
package/cli.js
CHANGED
|
@@ -24,11 +24,11 @@ import { getNextCrawlTarget, needsAnalysis, getCrawlStatus, loadAllConfigs } fro
|
|
|
24
24
|
import {
|
|
25
25
|
getDb, upsertDomain, upsertPage, insertExtraction,
|
|
26
26
|
insertKeywords, insertHeadings, insertLinks, insertPageSchemas,
|
|
27
|
-
upsertTechnical,
|
|
27
|
+
upsertTechnical, pruneStaleDomains,
|
|
28
28
|
getCompetitorSummary, getKeywordMatrix, getHeadingStructure,
|
|
29
29
|
getPageHash, getSchemasByProject
|
|
30
30
|
} from './db/db.js';
|
|
31
|
-
import {
|
|
31
|
+
import { generateMultiDashboard } from './reports/generate-html.js';
|
|
32
32
|
import { buildTechnicalActions } from './exports/technical.js';
|
|
33
33
|
import { buildCompetitiveActions } from './exports/competitive.js';
|
|
34
34
|
import { buildSuggestiveActions } from './exports/suggestive.js';
|
|
@@ -91,6 +91,38 @@ async function checkOllamaAvailability() {
|
|
|
91
91
|
// ── EXTRACTION PROGRESS TRACKER ──────────────────────────────────────────
|
|
92
92
|
const PROGRESS_FILE = join(__dirname, '.extraction-progress.json');
|
|
93
93
|
|
|
94
|
+
// ── Graceful shutdown support ──
|
|
95
|
+
// Cleanup callbacks registered by crawl/extract commands (e.g. close browser)
|
|
96
|
+
const _shutdownCallbacks = [];
|
|
97
|
+
let _shuttingDown = false;
|
|
98
|
+
|
|
99
|
+
/**
 * Register a cleanup callback for graceful shutdown.
 *
 * The callback is appended to the module-level `_shutdownCallbacks` list,
 * which the signal handler walks in registration order.
 *
 * @param {Function} fn - Cleanup routine (may return a promise).
 */
function onShutdown(fn) {
  _shutdownCallbacks.push(fn);
}
|
|
100
|
+
/**
 * Drop every registered shutdown callback.
 *
 * The shared `_shutdownCallbacks` array is emptied in place, so any code
 * holding a reference to it sees the cleared list.
 */
function clearShutdownCallbacks() {
  _shutdownCallbacks.splice(0, _shutdownCallbacks.length);
}
|
|
101
|
+
|
|
102
|
+
/**
 * Handle a termination signal: flag the progress file as stopped, run the
 * registered cleanup callbacks, then exit with status 0.
 *
 * Re-entrant calls are ignored — only the first signal starts the shutdown.
 *
 * @param {string} signal - Name of the received signal (used in the log line).
 * @returns {Promise<void>}
 */
async function _gracefulExit(signal) {
  if (_shuttingDown) {
    return;
  }
  _shuttingDown = true;
  console.log(chalk.yellow(`\n⏹ Received ${signal} — stopping gracefully…`));

  // Mark the progress file as stopped, but only when it describes a run
  // that is still "running" and owned by this very process.
  try {
    const current = readProgress();
    const ownsRunningJob = current?.status === 'running' && current.pid === process.pid;
    if (ownsRunningJob) {
      writeProgress({ ...current, status: 'stopped', stopped_at: Date.now() });
    }
  } catch {
    /* best-effort */
  }

  // Run cleanup callbacks one at a time (close browsers, etc.); a failing
  // callback must not prevent the remaining ones from running.
  for (const cleanup of _shutdownCallbacks) {
    try {
      await cleanup();
    } catch {
      /* best-effort */
    }
  }

  process.exit(0);
}
|
|
122
|
+
|
|
123
|
+
// Route both termination signals through the same graceful-exit path.
for (const signalName of ['SIGTERM', 'SIGINT']) {
  process.on(signalName, () => _gracefulExit(signalName));
}
|
|
125
|
+
|
|
94
126
|
function writeProgress(data) {
|
|
95
127
|
try {
|
|
96
128
|
writeFileSync(PROGRESS_FILE, JSON.stringify({
|
|
@@ -361,6 +393,21 @@ program
|
|
|
361
393
|
}
|
|
362
394
|
}
|
|
363
395
|
|
|
396
|
+
// ── Prune stale domains (DB entries no longer in config) ─────────────
|
|
397
|
+
{
|
|
398
|
+
const configDomains = new Set([
|
|
399
|
+
config.target?.domain,
|
|
400
|
+
...(config.owned || []).map(o => o.domain),
|
|
401
|
+
...(config.competitors || []).map(c => c.domain),
|
|
402
|
+
].filter(Boolean));
|
|
403
|
+
|
|
404
|
+
const pruned = pruneStaleDomains(db, project, configDomains);
|
|
405
|
+
if (pruned.length) {
|
|
406
|
+
console.log(chalk.yellow(`\n 🧹 Pruned ${pruned.length} stale domain(s) from DB (no longer in config):`));
|
|
407
|
+
for (const d of pruned) console.log(chalk.dim(` − ${d}`));
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
|
|
364
411
|
// ── Tier gate: Free tier = crawl-only, no AI extraction ──────────────
|
|
365
412
|
if (opts.extract !== false && !isPro()) {
|
|
366
413
|
console.log(chalk.dim('\n ℹ Free tier: crawl-only mode (AI extraction requires Solo/Agency)'));
|
|
@@ -456,6 +503,9 @@ program
|
|
|
456
503
|
publishedDate: page.publishedDate || null,
|
|
457
504
|
modifiedDate: page.modifiedDate || null,
|
|
458
505
|
contentHash: page.contentHash || null,
|
|
506
|
+
title: page.title || null,
|
|
507
|
+
metaDesc: page.metaDesc || null,
|
|
508
|
+
bodyText: page.fullBodyText || page.bodyText || null,
|
|
459
509
|
});
|
|
460
510
|
const pageId = pageRes?.id;
|
|
461
511
|
|
|
@@ -481,6 +531,7 @@ program
|
|
|
481
531
|
page_index: totalExtracted + 1,
|
|
482
532
|
started_at: crawlStart,
|
|
483
533
|
failed: totalFailed,
|
|
534
|
+
stealth: !!crawlOpts.stealth,
|
|
484
535
|
});
|
|
485
536
|
upsertTechnical(db, { pageId, hasCanonical: page.hasCanonical, hasOgTags: page.hasOgTags, hasSchema: page.hasSchema, hasRobots: page.hasRobots });
|
|
486
537
|
try {
|
|
@@ -546,9 +597,10 @@ program
|
|
|
546
597
|
if (totalSkipped > 0) console.log(chalk.blue(`\n📊 Incremental: ${totalSkipped} unchanged pages skipped (same content hash)`));
|
|
547
598
|
if (totalBlocked > 0) console.log(chalk.red(`\n⛔ ${totalBlocked} domain(s) blocked (rate-limited or WAF)`));
|
|
548
599
|
const elapsed = ((Date.now() - crawlStart) / 1000).toFixed(1);
|
|
549
|
-
// Auto-regenerate dashboard
|
|
600
|
+
// Auto-regenerate dashboard (always multi-project so all projects stay current)
|
|
550
601
|
try {
|
|
551
|
-
const
|
|
602
|
+
const allConfigs = loadAllConfigs();
|
|
603
|
+
const dashPath = generateMultiDashboard(db, allConfigs);
|
|
552
604
|
console.log(chalk.dim(` 📊 Dashboard refreshed → ${dashPath}`));
|
|
553
605
|
} catch (dashErr) {
|
|
554
606
|
console.log(chalk.dim(` ⚠ Dashboard refresh skipped: ${dashErr.message}`));
|
|
@@ -664,9 +716,10 @@ program
|
|
|
664
716
|
// Print summary
|
|
665
717
|
printAnalysisSummary(analysis, project);
|
|
666
718
|
|
|
667
|
-
// Auto-regenerate dashboard
|
|
719
|
+
// Auto-regenerate dashboard (always multi-project so all projects stay current)
|
|
668
720
|
try {
|
|
669
|
-
const
|
|
721
|
+
const allConfigs = loadAllConfigs();
|
|
722
|
+
const dashPath = generateMultiDashboard(db, allConfigs);
|
|
670
723
|
console.log(chalk.dim(` 📊 Dashboard refreshed → ${dashPath}`));
|
|
671
724
|
} catch (dashErr) {
|
|
672
725
|
console.log(chalk.dim(` ⚠ Dashboard refresh skipped: ${dashErr.message}`));
|
|
@@ -1041,6 +1094,9 @@ program
|
|
|
1041
1094
|
publishedDate: page.publishedDate || null,
|
|
1042
1095
|
modifiedDate: page.modifiedDate || null,
|
|
1043
1096
|
contentHash: page.contentHash || null,
|
|
1097
|
+
title: page.title || null,
|
|
1098
|
+
metaDesc: page.metaDesc || null,
|
|
1099
|
+
bodyText: page.fullBodyText || page.bodyText || null,
|
|
1044
1100
|
});
|
|
1045
1101
|
const pageId = pageRes?.id;
|
|
1046
1102
|
|
|
@@ -1346,6 +1402,7 @@ program
|
|
|
1346
1402
|
.option('--add-owned <domain>', 'Add an owned subdomain')
|
|
1347
1403
|
.option('--remove-owned <domain>', 'Remove an owned subdomain')
|
|
1348
1404
|
.option('--set-target <domain>', 'Change the target domain')
|
|
1405
|
+
.option('--prune', 'Remove DB data for domains no longer in config')
|
|
1349
1406
|
.action((project, opts) => {
|
|
1350
1407
|
const configPath = join(__dirname, `config/${project}.json`);
|
|
1351
1408
|
let config;
|
|
@@ -1438,6 +1495,24 @@ program
|
|
|
1438
1495
|
console.log(chalk.dim(`\n Saved → config/${project}.json`));
|
|
1439
1496
|
}
|
|
1440
1497
|
|
|
1498
|
+
// ── Prune stale DB data (auto on remove, or manual --prune) ─────────
|
|
1499
|
+
if (modified || opts.prune) {
|
|
1500
|
+
const db = getDb();
|
|
1501
|
+
const configDomains = new Set([
|
|
1502
|
+
config.target?.domain,
|
|
1503
|
+
...(config.owned || []).map(o => o.domain),
|
|
1504
|
+
...(config.competitors || []).map(c => c.domain),
|
|
1505
|
+
].filter(Boolean));
|
|
1506
|
+
|
|
1507
|
+
const pruned = pruneStaleDomains(db, project, configDomains);
|
|
1508
|
+
if (pruned.length) {
|
|
1509
|
+
console.log(chalk.yellow(`\n 🧹 Pruned ${pruned.length} stale domain(s) from DB:`));
|
|
1510
|
+
for (const d of pruned) console.log(chalk.dim(` − ${d}`));
|
|
1511
|
+
} else if (opts.prune) {
|
|
1512
|
+
console.log(chalk.dim('\n ✓ No stale domains to prune'));
|
|
1513
|
+
}
|
|
1514
|
+
}
|
|
1515
|
+
|
|
1441
1516
|
// ── Always show current config
|
|
1442
1517
|
console.log(chalk.bold.cyan(`\n 📋 ${project} — Domain Configuration\n`));
|
|
1443
1518
|
console.log(chalk.white(' Target:'));
|
|
@@ -1527,13 +1602,14 @@ async function runAnalysis(project, db) {
|
|
|
1527
1602
|
program
|
|
1528
1603
|
.command('extract <project>')
|
|
1529
1604
|
.description('Run AI extraction on all crawled-but-not-yet-extracted pages (requires Solo/Agency)')
|
|
1530
|
-
.
|
|
1531
|
-
.action(async (project, opts) => {
|
|
1605
|
+
.action(async (project) => {
|
|
1532
1606
|
if (!requirePro('extract')) return;
|
|
1533
1607
|
const db = getDb();
|
|
1608
|
+
|
|
1609
|
+
// Query pages that have body_text stored (from crawl) but no extraction yet
|
|
1534
1610
|
const pendingPages = db.prepare(`
|
|
1535
|
-
SELECT p.id, p.url, p.word_count,
|
|
1536
|
-
|
|
1611
|
+
SELECT p.id, p.url, p.word_count, p.title, p.meta_desc, p.body_text,
|
|
1612
|
+
p.published_date, p.modified_date
|
|
1537
1613
|
FROM pages p
|
|
1538
1614
|
JOIN domains d ON d.id = p.domain_id
|
|
1539
1615
|
LEFT JOIN extractions e ON e.page_id = p.id
|
|
@@ -1545,94 +1621,241 @@ program
|
|
|
1545
1621
|
process.exit(0);
|
|
1546
1622
|
}
|
|
1547
1623
|
|
|
1548
|
-
|
|
1549
|
-
|
|
1624
|
+
// Check how many have body_text stored vs need re-crawl
|
|
1625
|
+
const withContent = pendingPages.filter(r => r.body_text);
|
|
1626
|
+
const needsRecrawl = pendingPages.length - withContent.length;
|
|
1627
|
+
|
|
1628
|
+
console.log(chalk.bold.cyan(`\n⚙️ Extracting ${pendingPages.length} pages for ${project} via Qwen...\n`));
|
|
1629
|
+
if (needsRecrawl > 0) {
|
|
1630
|
+
console.log(chalk.yellow(` ⚠ ${needsRecrawl} pages have no stored content (crawled before v1.1.6). Re-crawl to populate.\n`));
|
|
1631
|
+
}
|
|
1550
1632
|
|
|
1551
1633
|
const extractStart = Date.now();
|
|
1552
|
-
let done = 0, failed = 0;
|
|
1634
|
+
let done = 0, failed = 0, skipped = 0;
|
|
1553
1635
|
|
|
1554
|
-
// ──
|
|
1555
|
-
|
|
1556
|
-
|
|
1557
|
-
|
|
1558
|
-
stealthSession = await createStealthSession();
|
|
1559
|
-
console.log(chalk.magenta(' 🥷 Advanced mode — full browser rendering, persistent sessions\n'));
|
|
1560
|
-
}
|
|
1636
|
+
// ── Pre-extract template grouping: sample N per group, skip the rest ──
|
|
1637
|
+
const SAMPLE_PER_GROUP = 5;
|
|
1638
|
+
const MIN_GROUP_FOR_SAMPLING = 10;
|
|
1639
|
+
let extractQueue = pendingPages.filter(r => r.body_text); // only pages with stored content
|
|
1561
1640
|
|
|
1562
1641
|
try {
|
|
1563
|
-
|
|
1564
|
-
|
|
1565
|
-
|
|
1566
|
-
|
|
1567
|
-
|
|
1568
|
-
writeProgress({
|
|
1569
|
-
status: 'running', command: 'extract', project,
|
|
1570
|
-
current_url: row.url,
|
|
1571
|
-
page_index: done + failed + 1, total: pendingPages.length,
|
|
1572
|
-
percent: Math.round(((done + failed) / pendingPages.length) * 100),
|
|
1573
|
-
started_at: extractStart, failed,
|
|
1574
|
-
stealth: !!opts.stealth,
|
|
1575
|
-
});
|
|
1642
|
+
const { clusterUrls } = await import('./analyses/templates/cluster.js');
|
|
1643
|
+
const { groups } = clusterUrls(
|
|
1644
|
+
extractQueue.map(r => ({ url: r.url })),
|
|
1645
|
+
{ minGroupSize: MIN_GROUP_FOR_SAMPLING }
|
|
1646
|
+
);
|
|
1576
1647
|
|
|
1577
|
-
|
|
1578
|
-
|
|
1648
|
+
if (groups.length > 0) {
|
|
1649
|
+
const skipUrls = new Set();
|
|
1579
1650
|
|
|
1580
|
-
|
|
1581
|
-
|
|
1582
|
-
|
|
1583
|
-
} else {
|
|
1584
|
-
// Standard: quick single-page crawl
|
|
1585
|
-
const { crawlAll } = await import('./crawler/index.js');
|
|
1586
|
-
const crawled = await crawlAll(row.url);
|
|
1587
|
-
pageData = crawled[0] || null;
|
|
1588
|
-
}
|
|
1651
|
+
for (const group of groups) {
|
|
1652
|
+
const urls = group.urls;
|
|
1653
|
+
if (urls.length <= SAMPLE_PER_GROUP) continue;
|
|
1589
1654
|
|
|
1590
|
-
|
|
1591
|
-
|
|
1592
|
-
|
|
1593
|
-
|
|
1594
|
-
if (stealthSession) {
|
|
1595
|
-
// Jittered delay even on failure — don't hammer a blocking site
|
|
1596
|
-
await new Promise(r => setTimeout(r, 1500 + Math.random() * 2000));
|
|
1597
|
-
}
|
|
1598
|
-
continue;
|
|
1599
|
-
}
|
|
1655
|
+
const sampleSet = new Set();
|
|
1656
|
+
sampleSet.add(urls[0]); sampleSet.add(urls[1]);
|
|
1657
|
+
sampleSet.add(urls[urls.length - 1]); sampleSet.add(urls[urls.length - 2]);
|
|
1658
|
+
sampleSet.add(urls[Math.floor(urls.length / 2)]);
|
|
1600
1659
|
|
|
1601
|
-
|
|
1602
|
-
const
|
|
1603
|
-
|
|
1604
|
-
insertExtraction(db, { pageId: row.id, data: extraction });
|
|
1605
|
-
insertKeywords(db, row.id, extraction.keywords);
|
|
1606
|
-
|
|
1607
|
-
// Also update headings + links + schemas with fresh data from stealth fetch
|
|
1608
|
-
if (stealthSession) {
|
|
1609
|
-
insertHeadings(db, row.id, pageData.headings);
|
|
1610
|
-
insertLinks(db, row.id, pageData.links);
|
|
1611
|
-
if (pageData.parsedSchemas?.length) insertPageSchemas(db, row.id, pageData.parsedSchemas);
|
|
1660
|
+
const skippedCount = urls.length - sampleSet.size;
|
|
1661
|
+
for (const u of urls) {
|
|
1662
|
+
if (!sampleSet.has(u)) skipUrls.add(u);
|
|
1612
1663
|
}
|
|
1664
|
+
console.log(chalk.yellow(` [template] ${group.pattern} → ${urls.length} pages, sampling ${sampleSet.size}, skipping ${skippedCount}`));
|
|
1665
|
+
}
|
|
1613
1666
|
|
|
1614
|
-
|
|
1667
|
+
if (skipUrls.size > 0) {
|
|
1668
|
+
extractQueue = extractQueue.filter(r => !skipUrls.has(r.url));
|
|
1669
|
+
skipped += skipUrls.size;
|
|
1670
|
+
console.log(chalk.yellow(` [template] ${withContent.length} extractable → ${extractQueue.length} to extract (${skipUrls.size} template-skipped)\n`));
|
|
1671
|
+
}
|
|
1672
|
+
}
|
|
1673
|
+
} catch (e) {
|
|
1674
|
+
console.log(chalk.gray(` [template] Pattern detection skipped: ${e.message}`));
|
|
1675
|
+
}
|
|
1676
|
+
|
|
1677
|
+
// ── Consecutive failure tracking per URL pattern ──
|
|
1678
|
+
const CONSEC_FAIL_THRESHOLD = 3;
|
|
1679
|
+
const patternFailCounts = new Map();
|
|
1680
|
+
const skippedPatterns = new Set();
|
|
1681
|
+
|
|
1682
|
+
/**
 * Reduce a URL to a coarse "pattern key" so pages generated from the same
 * template share a key.
 *
 * The key is `<hostname>/<segments…>` where a path segment is replaced by
 * the literal `{var}` when it is longer than 20 characters, is a hex run of
 * 8+ characters, starts with `0x`, or contains `-` or `_`. Query string and
 * fragment are not part of the key.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} Pattern key, or the input unchanged when it cannot be parsed.
 */
function getPatternKey(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return url;
  }

  const looksVariable = (segment) =>
    segment.length > 20 ||
    /^[0-9a-fA-F]{8,}$/.test(segment) ||
    /^0x/.test(segment) ||
    /[-_]/.test(segment);

  const normalized = [];
  for (const segment of parsed.pathname.split('/')) {
    if (!segment) continue; // drop the empty pieces around slashes
    normalized.push(looksVariable(segment) ? '{var}' : segment);
  }
  return `${parsed.hostname}/${normalized.join('/')}`;
}
|
|
1691
|
+
|
|
1692
|
+
// ── Content similarity detection ──
|
|
1693
|
+
const SIMILARITY_THRESHOLD = 0.80;
|
|
1694
|
+
const SIMILARITY_SAMPLE_SIZE = 3;
|
|
1695
|
+
const patternFingerprints = new Map();
|
|
1696
|
+
|
|
1697
|
+
/**
 * Break text into a set of overlapping n-word "shingles" for similarity
 * comparison.
 *
 * The text is lower-cased, every character that is not a letter, combining
 * mark, digit or whitespace is removed, and the remaining words are grouped
 * into consecutive runs of `n` joined by a single space.
 *
 * Unicode-aware: letters and digits of any script are kept. The previous
 * ASCII-only filter (`[^a-z0-9\s]`) erased non-Latin text entirely, so such
 * pages always produced an empty set and could never be detected as similar.
 *
 * @param {string} [text] - Source text; null/undefined/empty yields an empty set.
 * @param {number} [n=3] - Words per shingle.
 * @returns {Set<string>} Distinct shingles; empty when there are fewer than `n` words.
 */
function textToShingles(text, n = 3) {
  const words = (text || '')
    .toLowerCase()
    .replace(/[^\p{L}\p{M}\p{N}\s]/gu, '')
    .split(/\s+/)
    .filter(Boolean);

  const shingles = new Set();
  for (let i = 0; i <= words.length - n; i++) {
    shingles.add(words.slice(i, i + n).join(' '));
  }
  return shingles;
}
|
|
1705
|
+
|
|
1706
|
+
/**
 * Jaccard similarity of two sets: |A ∩ B| / |A ∪ B|.
 *
 * @param {Set<*>} a - First set.
 * @param {Set<*>} b - Second set.
 * @returns {number} Value in [0, 1]; 0 when either set is empty.
 */
function jaccardSimilarity(a, b) {
  if (!(a.size && b.size)) return 0;

  const shared = [...a].filter((item) => b.has(item)).length;
  const unionSize = a.size + b.size - shared;
  return shared / unionSize;
}
|
|
1712
|
+
|
|
1713
|
+
/**
 * Record a page's shingle set under its URL pattern and report whether the
 * pattern's sample pages are near-duplicates of each other.
 *
 * A verdict is produced exactly once per pattern: on the call that brings
 * the stored sample count to SIMILARITY_SAMPLE_SIZE. Every earlier and later
 * call returns false.
 *
 * Once the sample is full, further shingle sets are NOT stored. They can
 * never influence a result, and keeping one large set per processed page
 * would grow memory without bound on big crawls.
 *
 * @param {string} patKey - Pattern key the page belongs to.
 * @param {Set<string>} newShingles - Shingle set of the page just processed.
 * @returns {boolean} true only when this call completes the sample and every
 *   pair of sampled pages has Jaccard similarity >= SIMILARITY_THRESHOLD.
 */
function checkPatternSimilarity(patKey, newShingles) {
  let samples = patternFingerprints.get(patKey);
  if (!samples) {
    samples = [];
    patternFingerprints.set(patKey, samples);
  }

  // Sample already complete: the one-time verdict has been given.
  if (samples.length >= SIMILARITY_SAMPLE_SIZE) return false;

  samples.push(newShingles);
  if (samples.length !== SIMILARITY_SAMPLE_SIZE) return false;

  // Every pair in the sample must clear the threshold.
  for (let i = 0; i < samples.length; i++) {
    for (let j = i + 1; j < samples.length; j++) {
      if (jaccardSimilarity(samples[i], samples[j]) < SIMILARITY_THRESHOLD) return false;
    }
  }
  return true;
}
|
|
1725
|
+
|
|
1726
|
+
// ── Prepare headings + schema queries (per-page lookups from DB) ──
|
|
1727
|
+
const getHeadings = db.prepare('SELECT level, text FROM headings WHERE page_id = ? ORDER BY id');
|
|
1728
|
+
const getSchemaTypes = db.prepare('SELECT DISTINCT schema_type FROM page_schemas WHERE page_id = ?');
|
|
1729
|
+
|
|
1730
|
+
const totalToProcess = extractQueue.length;
|
|
1731
|
+
console.log(chalk.gray(` 📖 Reading from DB — no network needed\n`));
|
|
1732
|
+
|
|
1733
|
+
for (const row of extractQueue) {
|
|
1734
|
+
const patKey = getPatternKey(row.url);
|
|
1735
|
+
if (skippedPatterns.has(patKey)) {
|
|
1736
|
+
skipped++;
|
|
1737
|
+
continue;
|
|
1738
|
+
}
|
|
1739
|
+
|
|
1740
|
+
const pos = done + failed + 1;
|
|
1741
|
+
process.stdout.write(chalk.gray(` [${pos}/${totalToProcess}] ${row.url.slice(0, 70)} → `));
|
|
1742
|
+
process.stdout.write(chalk.gray('extracting...'));
|
|
1743
|
+
|
|
1744
|
+
writeProgress({
|
|
1745
|
+
status: 'running', command: 'extract', project,
|
|
1746
|
+
current_url: row.url,
|
|
1747
|
+
page_index: pos, total: totalToProcess,
|
|
1748
|
+
percent: Math.round(((done + failed) / totalToProcess) * 100),
|
|
1749
|
+
started_at: extractStart, failed, skipped,
|
|
1750
|
+
});
|
|
1751
|
+
|
|
1752
|
+
let pageFailed = false;
|
|
1753
|
+
|
|
1754
|
+
try {
|
|
1755
|
+
// Read headings + schema types from DB
|
|
1756
|
+
const headings = getHeadings.all(row.id);
|
|
1757
|
+
const schemaTypes = getSchemaTypes.all(row.id).map(r => r.schema_type);
|
|
1758
|
+
|
|
1759
|
+
const extractFn = await getExtractPage();
|
|
1760
|
+
const extraction = await extractFn({
|
|
1761
|
+
url: row.url,
|
|
1762
|
+
title: row.title || '',
|
|
1763
|
+
metaDesc: row.meta_desc || '',
|
|
1764
|
+
headings,
|
|
1765
|
+
bodyText: row.body_text,
|
|
1766
|
+
schemaTypes,
|
|
1767
|
+
publishedDate: row.published_date,
|
|
1768
|
+
modifiedDate: row.modified_date,
|
|
1769
|
+
});
|
|
1770
|
+
insertExtraction(db, { pageId: row.id, data: extraction });
|
|
1771
|
+
insertKeywords(db, row.id, extraction.keywords);
|
|
1772
|
+
|
|
1773
|
+
const isDegraded = extraction.extraction_source === 'degraded';
|
|
1774
|
+
if (isDegraded) {
|
|
1775
|
+
process.stdout.write(chalk.yellow(` ⚠ degraded\n`));
|
|
1615
1776
|
done++;
|
|
1616
|
-
|
|
1617
|
-
|
|
1618
|
-
|
|
1777
|
+
pageFailed = true;
|
|
1778
|
+
} else {
|
|
1779
|
+
process.stdout.write(chalk.green(` ✓\n`));
|
|
1780
|
+
done++;
|
|
1781
|
+
patternFailCounts.set(patKey, 0);
|
|
1619
1782
|
}
|
|
1620
1783
|
|
|
1621
|
-
//
|
|
1622
|
-
if (
|
|
1623
|
-
|
|
1784
|
+
// ── Content similarity detection ──
|
|
1785
|
+
if (row.body_text.length > 50) {
|
|
1786
|
+
const shingles = textToShingles(row.body_text);
|
|
1787
|
+
if (checkPatternSimilarity(patKey, shingles) && !skippedPatterns.has(patKey)) {
|
|
1788
|
+
const remaining = extractQueue.filter(r => getPatternKey(r.url) === patKey).length - (patternFingerprints.get(patKey)?.length || 0);
|
|
1789
|
+
skippedPatterns.add(patKey);
|
|
1790
|
+
if (remaining > 0) {
|
|
1791
|
+
console.log(chalk.yellow(` [similarity] 🔍 ${SIMILARITY_SAMPLE_SIZE} pages from ${patKey} are ${Math.round(SIMILARITY_THRESHOLD * 100)}%+ identical — skipping ${remaining} remaining`));
|
|
1792
|
+
}
|
|
1793
|
+
}
|
|
1624
1794
|
}
|
|
1795
|
+
} catch (err) {
|
|
1796
|
+
process.stdout.write(chalk.red(` ✗ ${err.message}\n`));
|
|
1797
|
+
failed++;
|
|
1798
|
+
pageFailed = true;
|
|
1625
1799
|
}
|
|
1626
|
-
|
|
1627
|
-
//
|
|
1628
|
-
if (
|
|
1629
|
-
|
|
1630
|
-
|
|
1800
|
+
|
|
1801
|
+
// ── Track consecutive failures per pattern ──
|
|
1802
|
+
if (pageFailed) {
|
|
1803
|
+
const count = (patternFailCounts.get(patKey) || 0) + 1;
|
|
1804
|
+
patternFailCounts.set(patKey, count);
|
|
1805
|
+
if (count >= CONSEC_FAIL_THRESHOLD) {
|
|
1806
|
+
const remaining = extractQueue.filter(r => !skippedPatterns.has(getPatternKey(r.url)) && getPatternKey(r.url) === patKey).length;
|
|
1807
|
+
skippedPatterns.add(patKey);
|
|
1808
|
+
console.log(chalk.yellow(` [template] ⚡ ${count} consecutive failures for ${patKey} — skipping ~${remaining} remaining pages`));
|
|
1809
|
+
}
|
|
1631
1810
|
}
|
|
1632
1811
|
}
|
|
1633
1812
|
|
|
1634
|
-
writeProgress({ status: 'completed', command: 'extract', project, extracted: done, failed, total: pendingPages.length, started_at: extractStart, finished_at: Date.now() });
|
|
1635
|
-
|
|
1813
|
+
writeProgress({ status: 'completed', command: 'extract', project, extracted: done, failed, skipped, total: pendingPages.length, started_at: extractStart, finished_at: Date.now() });
|
|
1814
|
+
const skipMsg = skipped > 0 ? chalk.yellow(`, ${skipped} template-skipped`) : '';
|
|
1815
|
+
const recrawlMsg = needsRecrawl > 0 ? chalk.yellow(`, ${needsRecrawl} need re-crawl`) : '';
|
|
1816
|
+
console.log(chalk.bold.green(`\n✅ Extraction complete: ${done} extracted, ${failed} failed${skipMsg}${recrawlMsg}\n`));
|
|
1817
|
+
});
|
|
1818
|
+
|
|
1819
|
+
// ── TEMPLATES ANALYSIS ────────────────────────────────────────────────────
|
|
1820
|
+
// `templates <project>` — Pro-gated command that delegates to the lazily
// imported template analysis module and prints a short summary of the report.
program
  .command('templates <project>')
  .description('Detect programmatic template pages — assess SEO value without crawling all of them')
  .option('--min-group <n>', 'Minimum URLs to qualify as a template group', '10')
  .option('--sample-size <n>', 'Pages to stealth-crawl per template group', '20')
  .option('--skip-crawl', 'Skip sample crawl (pattern analysis + GSC only)')
  .option('--skip-gsc', 'Skip GSC overlay phase')
  .option('--skip-competitors', 'Skip competitor sitemap census')
  .action(async (project, opts) => {
    if (!requirePro('templates')) return;

    console.log(chalk.bold.cyan(`\n🔍 SEO Intel — Template Analysis`));
    console.log(chalk.dim(` Project: ${project}`));

    try {
      // Loaded on demand so the analysis module is only evaluated when this command runs.
      const { runTemplatesAnalysis } = await import('./analyses/templates/index.js');
      const report = await runTemplatesAnalysis(project, {
        // Explicit radix: without it a value such as "0x10" would be read as hex.
        // Unparseable or zero values fall back to the documented defaults.
        minGroupSize: Number.parseInt(opts.minGroup, 10) || 10,
        sampleSize: Number.parseInt(opts.sampleSize, 10) || 20,
        skipCrawl: !!opts.skipCrawl,
        skipGsc: !!opts.skipGsc,
        skipCompetitors: !!opts.skipCompetitors,
        log: (msg) => console.log(chalk.gray(msg)),
      });

      if (report.groups.length === 0) {
        console.log(chalk.yellow(`\n No template patterns detected.\n`));
        process.exit(0);
      }

      // Summary
      console.log(chalk.bold.green(`\n✅ Template analysis complete`));
      console.log(chalk.dim(` ${report.stats.totalGroups} groups · ${report.stats.totalGrouped.toLocaleString()} URLs · ${(report.stats.coverage * 100).toFixed(0)}% of sitemap`));
      console.log(chalk.dim(` Run ${chalk.white('seo-intel html ' + project)} to see the full dashboard.\n`));
    } catch (err) {
      console.error(chalk.red(`\n Error: ${err.message}\n`));
      // Stack traces are only printed when DEBUG is set in the environment.
      if (process.env.DEBUG) console.error(err.stack);
      process.exit(1);
    }
  });
|
|
1637
1860
|
|
|
1638
1861
|
// ── HTML DASHBOARD ─────────────────────────────────────────────────────────
|
|
@@ -1700,10 +1923,10 @@ program
|
|
|
1700
1923
|
}
|
|
1701
1924
|
});
|
|
1702
1925
|
|
|
1703
|
-
// ── HTML ALL-PROJECTS DASHBOARD
|
|
1926
|
+
// ── HTML ALL-PROJECTS DASHBOARD (alias for html — kept for backwards compat) ──
|
|
1704
1927
|
program
|
|
1705
1928
|
.command('html-all')
|
|
1706
|
-
.description('
|
|
1929
|
+
.description('Alias for "html" — generates the all-projects dashboard')
|
|
1707
1930
|
.action(() => {
|
|
1708
1931
|
const db = getDb();
|
|
1709
1932
|
const configs = loadAllConfigs();
|
package/crawler/index.js
CHANGED
|
@@ -547,11 +547,16 @@ async function processPage(page, url, base, depth, queue, maxDepth) {
|
|
|
547
547
|
// ── Quality gate — detect shells, blocked pages, empty content ──
|
|
548
548
|
const quality = assessQuality({ wordCount, bodyText, title, status });
|
|
549
549
|
|
|
550
|
+
// Full body text for DB storage (extraction reads this); truncated for log output
|
|
551
|
+
const fullBodyText = sanitize(bodyText, 50000); // ~200K chars — enough for any real page
|
|
552
|
+
const shortBodyText = sanitize(bodyText, 2000); // compact version for logging
|
|
553
|
+
|
|
550
554
|
return {
|
|
551
555
|
url, depth, status, loadMs, wordCount, isIndexable,
|
|
552
556
|
title, metaDesc, headings,
|
|
553
557
|
links: [...internalLinks, ...externalLinks],
|
|
554
|
-
bodyText:
|
|
558
|
+
bodyText: shortBodyText,
|
|
559
|
+
fullBodyText,
|
|
555
560
|
schemaTypes, parsedSchemas, vitals, publishedDate, modifiedDate,
|
|
556
561
|
contentHash: hash,
|
|
557
562
|
quality: quality.ok, qualityReason: quality.reason,
|