seo-intel 1.1.6 → 1.1.8
This diff shows the changes between two publicly released versions of this package, as they appear in the public registry they were published to. It is provided for informational purposes only.
- package/.env.example +1 -1
- package/CHANGELOG.md +42 -0
- package/LICENSE +4 -5
- package/README.md +3 -3
- package/cli.js +271 -89
- package/crawler/index.js +7 -2
- package/crawler/subdomain-discovery.js +1 -1
- package/db/db.js +152 -5
- package/db/schema.sql +48 -0
- package/lib/gate.js +6 -5
- package/lib/license.js +2 -2
- package/lib/updater.js +3 -3
- package/package.json +3 -2
- package/reports/generate-html.js +124 -59
- package/seo-audit.js +2 -2
- package/server.js +54 -9
- package/setup/wizard.html +1 -1
package/.env.example
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
# Run `node cli.js setup` to configure interactively
|
|
3
3
|
|
|
4
4
|
# ── License (Pro features) ───────────────────────────────────────────────
|
|
5
|
-
# Get your key at https://
|
|
5
|
+
# Get your key at https://ukkometa.fi/en/seo-intel/
|
|
6
6
|
# SEO_INTEL_LICENSE=SI-xxxx-xxxx-xxxx-xxxx
|
|
7
7
|
|
|
8
8
|
# ── Analysis Model (cloud, pick one) ──────────────────────────────────────
|
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 1.1.8 (2026-03-27)
|
|
4
|
+
|
|
5
|
+
- Rebranded all references from froggo.pro to ukkometa.fi (endpoints, dashboard links, license validation, bot user-agents, skill)
|
|
6
|
+
- Pricing updated: €9.99/mo · €79/yr
|
|
7
|
+
- Contact updated: ukko@ukkometa.fi
|
|
8
|
+
- Added README.md and CHANGELOG.md to npm package and LS zip
|
|
9
|
+
|
|
10
|
+
## 1.1.7 (2026-03-26)
|
|
11
|
+
|
|
12
|
+
### New Features
|
|
13
|
+
- **Programmatic Template Intelligence** (`seo-intel templates <project>`) — detect URL pattern groups (e.g. `/token/*`, `/blog/*`), stealth-crawl samples, overlay GSC data, and score each group with keep/noindex/improve verdicts. Pro-gated.
|
|
14
|
+
- **Stale domain auto-pruning** — domains removed from config are now automatically cleaned from the DB (pages, keywords, extractions, schemas, headings, links) on next crawl. No more ghost data from renamed/removed subdomains.
|
|
15
|
+
- **Manual prune** — `seo-intel competitors <project> --prune` to clean stale DB entries on demand.
|
|
16
|
+
- **Full body text storage** — crawler now stores full page body text in DB (up to 200K chars) for richer extraction and analysis. Log output stays compact.
|
|
17
|
+
|
|
18
|
+
### Improvements
|
|
19
|
+
- **Background crawl/extract** — long-running crawl and extract jobs now survive browser tab close. Terminal shows "backgrounded" instead of "disconnected", and jobs continue server-side.
|
|
20
|
+
- **Dashboard terminal** — stealth flag now visible in terminal command display. Stop button properly closes SSE + server-side process. Status bar syncs with terminal state.
|
|
21
|
+
- **Templates button** added to dashboard terminal panel.
|
|
22
|
+
- **Dashboard refresh** — crawl and analyze now always regenerate the multi-project dashboard, keeping all projects current.
|
|
23
|
+
- **Config remove = DB remove** — `--remove` and `--remove-owned` now auto-prune matching DB data, not just config JSON.
|
|
24
|
+
|
|
25
|
+
### Fixes
|
|
26
|
+
- SSE disconnect no longer kills crawl/extract processes (detached child process).
|
|
27
|
+
- Terminal command display now shows `--stealth` flag when enabled.
|
|
28
|
+
|
|
29
|
+
## 1.1.6 (2026-03-24)
|
|
30
|
+
|
|
31
|
+
- Stop button, stealth sync, extraction layout, EADDRINUSE recovery.
|
|
32
|
+
|
|
33
|
+
## 1.1.5 (2026-03-21)
|
|
34
|
+
|
|
35
|
+
- Update checker, job stop API, background analyze, LAN Ollama hosts, `html` CLI command, wizard UX improvements.
|
|
36
|
+
|
|
37
|
+
## 1.1.8 (2026-03-27)
|
|
38
|
+
|
|
39
|
+
- Rebranded all references from froggo.pro to ukkometa.fi (endpoints, dashboard links, license validation, bot user-agents, skill)
|
|
40
|
+
- Pricing updated: €9.99/mo · €79/yr
|
|
41
|
+
- Contact updated: ukko@ukkometa.fi
|
|
42
|
+
- Added README.md and CHANGELOG.md to npm package and LS zip
|
package/LICENSE
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
SEO Intel — Dual License
|
|
2
2
|
|
|
3
|
-
Copyright (c) 2024-2026
|
|
3
|
+
Copyright (c) 2024-2026 Ukkometa (ukkometa.fi)
|
|
4
4
|
|
|
5
5
|
This project uses a dual license structure:
|
|
6
6
|
|
|
@@ -65,11 +65,10 @@ You MAY:
|
|
|
65
65
|
- Share generated reports and dashboards (outputs are yours)
|
|
66
66
|
|
|
67
67
|
License:
|
|
68
|
-
Solo — €19.99/month or €199/year — full AI analysis, all commands
|
|
69
|
-
Also available at $9.99/month via froggo.pro marketplace
|
|
68
|
+
Solo — €19.99/month or €199.99/year — full AI analysis, all commands
|
|
70
69
|
|
|
71
|
-
Purchase at: https://ukkometa.fi
|
|
70
|
+
Purchase at: https://ukkometa.fi/en/seo-intel/
|
|
72
71
|
|
|
73
72
|
================================================================================
|
|
74
73
|
|
|
75
|
-
For questions:
|
|
74
|
+
For questions: ukko@ukkometa.fi
|
package/README.md
CHANGED
|
@@ -75,7 +75,7 @@ seo-intel suggest-usecases myproject --scope docs # infer what pages/docs s
|
|
|
75
75
|
| `schemas <project>` | Schema.org coverage analysis |
|
|
76
76
|
| `update` | Check for updates |
|
|
77
77
|
|
|
78
|
-
### Solo (€19.99/mo · [ukkometa.fi/seo-intel](https://ukkometa.fi/seo-intel))
|
|
78
|
+
### Solo (€19.99/mo · [ukkometa.fi/seo-intel](https://ukkometa.fi/en/seo-intel/))
|
|
79
79
|
|
|
80
80
|
| Command | Description |
|
|
81
81
|
|---------|-------------|
|
|
@@ -194,7 +194,7 @@ Upload your GSC data for ranking insights:
|
|
|
194
194
|
- 1 project, 500 pages/domain
|
|
195
195
|
- Crawl, extract, setup, basic reports
|
|
196
196
|
|
|
197
|
-
###
|
|
197
|
+
### Solo (€19.99/mo · €199.99/yr)
|
|
198
198
|
- Unlimited projects and pages
|
|
199
199
|
- All analysis commands, GSC insights, scheduling
|
|
200
200
|
|
|
@@ -203,7 +203,7 @@ Upload your GSC data for ranking insights:
|
|
|
203
203
|
echo "SEO_INTEL_LICENSE=SI-xxxx-xxxx-xxxx-xxxx" >> .env
|
|
204
204
|
```
|
|
205
205
|
|
|
206
|
-
Get a key at [
|
|
206
|
+
Get a key at [ukkometa.fi/seo-intel](https://ukkometa.fi/en/seo-intel/)
|
|
207
207
|
|
|
208
208
|
## Updates
|
|
209
209
|
|
package/cli.js
CHANGED
|
@@ -24,11 +24,11 @@ import { getNextCrawlTarget, needsAnalysis, getCrawlStatus, loadAllConfigs } fro
|
|
|
24
24
|
import {
|
|
25
25
|
getDb, upsertDomain, upsertPage, insertExtraction,
|
|
26
26
|
insertKeywords, insertHeadings, insertLinks, insertPageSchemas,
|
|
27
|
-
upsertTechnical,
|
|
27
|
+
upsertTechnical, pruneStaleDomains,
|
|
28
28
|
getCompetitorSummary, getKeywordMatrix, getHeadingStructure,
|
|
29
29
|
getPageHash, getSchemasByProject
|
|
30
30
|
} from './db/db.js';
|
|
31
|
-
import {
|
|
31
|
+
import { generateMultiDashboard } from './reports/generate-html.js';
|
|
32
32
|
import { buildTechnicalActions } from './exports/technical.js';
|
|
33
33
|
import { buildCompetitiveActions } from './exports/competitive.js';
|
|
34
34
|
import { buildSuggestiveActions } from './exports/suggestive.js';
|
|
@@ -393,6 +393,21 @@ program
|
|
|
393
393
|
}
|
|
394
394
|
}
|
|
395
395
|
|
|
396
|
+
// ── Prune stale domains (DB entries no longer in config) ─────────────
|
|
397
|
+
{
|
|
398
|
+
const configDomains = new Set([
|
|
399
|
+
config.target?.domain,
|
|
400
|
+
...(config.owned || []).map(o => o.domain),
|
|
401
|
+
...(config.competitors || []).map(c => c.domain),
|
|
402
|
+
].filter(Boolean));
|
|
403
|
+
|
|
404
|
+
const pruned = pruneStaleDomains(db, project, configDomains);
|
|
405
|
+
if (pruned.length) {
|
|
406
|
+
console.log(chalk.yellow(`\n 🧹 Pruned ${pruned.length} stale domain(s) from DB (no longer in config):`));
|
|
407
|
+
for (const d of pruned) console.log(chalk.dim(` − ${d}`));
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
|
|
396
411
|
// ── Tier gate: Free tier = crawl-only, no AI extraction ──────────────
|
|
397
412
|
if (opts.extract !== false && !isPro()) {
|
|
398
413
|
console.log(chalk.dim('\n ℹ Free tier: crawl-only mode (AI extraction requires Solo/Agency)'));
|
|
@@ -488,6 +503,9 @@ program
|
|
|
488
503
|
publishedDate: page.publishedDate || null,
|
|
489
504
|
modifiedDate: page.modifiedDate || null,
|
|
490
505
|
contentHash: page.contentHash || null,
|
|
506
|
+
title: page.title || null,
|
|
507
|
+
metaDesc: page.metaDesc || null,
|
|
508
|
+
bodyText: page.fullBodyText || page.bodyText || null,
|
|
491
509
|
});
|
|
492
510
|
const pageId = pageRes?.id;
|
|
493
511
|
|
|
@@ -579,9 +597,10 @@ program
|
|
|
579
597
|
if (totalSkipped > 0) console.log(chalk.blue(`\n📊 Incremental: ${totalSkipped} unchanged pages skipped (same content hash)`));
|
|
580
598
|
if (totalBlocked > 0) console.log(chalk.red(`\n⛔ ${totalBlocked} domain(s) blocked (rate-limited or WAF)`));
|
|
581
599
|
const elapsed = ((Date.now() - crawlStart) / 1000).toFixed(1);
|
|
582
|
-
// Auto-regenerate dashboard
|
|
600
|
+
// Auto-regenerate dashboard (always multi-project so all projects stay current)
|
|
583
601
|
try {
|
|
584
|
-
const
|
|
602
|
+
const allConfigs = loadAllConfigs();
|
|
603
|
+
const dashPath = generateMultiDashboard(db, allConfigs);
|
|
585
604
|
console.log(chalk.dim(` 📊 Dashboard refreshed → ${dashPath}`));
|
|
586
605
|
} catch (dashErr) {
|
|
587
606
|
console.log(chalk.dim(` ⚠ Dashboard refresh skipped: ${dashErr.message}`));
|
|
@@ -697,9 +716,10 @@ program
|
|
|
697
716
|
// Print summary
|
|
698
717
|
printAnalysisSummary(analysis, project);
|
|
699
718
|
|
|
700
|
-
// Auto-regenerate dashboard
|
|
719
|
+
// Auto-regenerate dashboard (always multi-project so all projects stay current)
|
|
701
720
|
try {
|
|
702
|
-
const
|
|
721
|
+
const allConfigs = loadAllConfigs();
|
|
722
|
+
const dashPath = generateMultiDashboard(db, allConfigs);
|
|
703
723
|
console.log(chalk.dim(` 📊 Dashboard refreshed → ${dashPath}`));
|
|
704
724
|
} catch (dashErr) {
|
|
705
725
|
console.log(chalk.dim(` ⚠ Dashboard refresh skipped: ${dashErr.message}`));
|
|
@@ -1074,6 +1094,9 @@ program
|
|
|
1074
1094
|
publishedDate: page.publishedDate || null,
|
|
1075
1095
|
modifiedDate: page.modifiedDate || null,
|
|
1076
1096
|
contentHash: page.contentHash || null,
|
|
1097
|
+
title: page.title || null,
|
|
1098
|
+
metaDesc: page.metaDesc || null,
|
|
1099
|
+
bodyText: page.fullBodyText || page.bodyText || null,
|
|
1077
1100
|
});
|
|
1078
1101
|
const pageId = pageRes?.id;
|
|
1079
1102
|
|
|
@@ -1246,7 +1269,7 @@ program
|
|
|
1246
1269
|
console.log(chalk.gray(' npm registry: ') + chalk.white(info.npmVersion));
|
|
1247
1270
|
}
|
|
1248
1271
|
if (info.froggoVersion) {
|
|
1249
|
-
console.log(chalk.gray('
|
|
1272
|
+
console.log(chalk.gray(' ukkometa.fi: ') + chalk.white(info.froggoVersion));
|
|
1250
1273
|
}
|
|
1251
1274
|
|
|
1252
1275
|
if (!info.hasUpdate) {
|
|
@@ -1379,6 +1402,7 @@ program
|
|
|
1379
1402
|
.option('--add-owned <domain>', 'Add an owned subdomain')
|
|
1380
1403
|
.option('--remove-owned <domain>', 'Remove an owned subdomain')
|
|
1381
1404
|
.option('--set-target <domain>', 'Change the target domain')
|
|
1405
|
+
.option('--prune', 'Remove DB data for domains no longer in config')
|
|
1382
1406
|
.action((project, opts) => {
|
|
1383
1407
|
const configPath = join(__dirname, `config/${project}.json`);
|
|
1384
1408
|
let config;
|
|
@@ -1471,6 +1495,24 @@ program
|
|
|
1471
1495
|
console.log(chalk.dim(`\n Saved → config/${project}.json`));
|
|
1472
1496
|
}
|
|
1473
1497
|
|
|
1498
|
+
// ── Prune stale DB data (auto on remove, or manual --prune) ─────────
|
|
1499
|
+
if (modified || opts.prune) {
|
|
1500
|
+
const db = getDb();
|
|
1501
|
+
const configDomains = new Set([
|
|
1502
|
+
config.target?.domain,
|
|
1503
|
+
...(config.owned || []).map(o => o.domain),
|
|
1504
|
+
...(config.competitors || []).map(c => c.domain),
|
|
1505
|
+
].filter(Boolean));
|
|
1506
|
+
|
|
1507
|
+
const pruned = pruneStaleDomains(db, project, configDomains);
|
|
1508
|
+
if (pruned.length) {
|
|
1509
|
+
console.log(chalk.yellow(`\n 🧹 Pruned ${pruned.length} stale domain(s) from DB:`));
|
|
1510
|
+
for (const d of pruned) console.log(chalk.dim(` − ${d}`));
|
|
1511
|
+
} else if (opts.prune) {
|
|
1512
|
+
console.log(chalk.dim('\n ✓ No stale domains to prune'));
|
|
1513
|
+
}
|
|
1514
|
+
}
|
|
1515
|
+
|
|
1474
1516
|
// ── Always show current config
|
|
1475
1517
|
console.log(chalk.bold.cyan(`\n 📋 ${project} — Domain Configuration\n`));
|
|
1476
1518
|
console.log(chalk.white(' Target:'));
|
|
@@ -1560,13 +1602,14 @@ async function runAnalysis(project, db) {
|
|
|
1560
1602
|
program
|
|
1561
1603
|
.command('extract <project>')
|
|
1562
1604
|
.description('Run AI extraction on all crawled-but-not-yet-extracted pages (requires Solo/Agency)')
|
|
1563
|
-
.
|
|
1564
|
-
.action(async (project, opts) => {
|
|
1605
|
+
.action(async (project) => {
|
|
1565
1606
|
if (!requirePro('extract')) return;
|
|
1566
1607
|
const db = getDb();
|
|
1608
|
+
|
|
1609
|
+
// Query pages that have body_text stored (from crawl) but no extraction yet
|
|
1567
1610
|
const pendingPages = db.prepare(`
|
|
1568
|
-
SELECT p.id, p.url, p.word_count,
|
|
1569
|
-
|
|
1611
|
+
SELECT p.id, p.url, p.word_count, p.title, p.meta_desc, p.body_text,
|
|
1612
|
+
p.published_date, p.modified_date
|
|
1570
1613
|
FROM pages p
|
|
1571
1614
|
JOIN domains d ON d.id = p.domain_id
|
|
1572
1615
|
LEFT JOIN extractions e ON e.page_id = p.id
|
|
@@ -1578,102 +1621,241 @@ program
|
|
|
1578
1621
|
process.exit(0);
|
|
1579
1622
|
}
|
|
1580
1623
|
|
|
1581
|
-
|
|
1582
|
-
|
|
1624
|
+
// Check how many have body_text stored vs need re-crawl
|
|
1625
|
+
const withContent = pendingPages.filter(r => r.body_text);
|
|
1626
|
+
const needsRecrawl = pendingPages.length - withContent.length;
|
|
1627
|
+
|
|
1628
|
+
console.log(chalk.bold.cyan(`\n⚙️ Extracting ${pendingPages.length} pages for ${project} via Qwen...\n`));
|
|
1629
|
+
if (needsRecrawl > 0) {
|
|
1630
|
+
console.log(chalk.yellow(` ⚠ ${needsRecrawl} pages have no stored content (crawled before v1.1.6). Re-crawl to populate.\n`));
|
|
1631
|
+
}
|
|
1583
1632
|
|
|
1584
1633
|
const extractStart = Date.now();
|
|
1585
|
-
let done = 0, failed = 0;
|
|
1634
|
+
let done = 0, failed = 0, skipped = 0;
|
|
1635
|
+
|
|
1636
|
+
// ── Pre-extract template grouping: sample N per group, skip the rest ──
|
|
1637
|
+
const SAMPLE_PER_GROUP = 5;
|
|
1638
|
+
const MIN_GROUP_FOR_SAMPLING = 10;
|
|
1639
|
+
let extractQueue = pendingPages.filter(r => r.body_text); // only pages with stored content
|
|
1640
|
+
|
|
1641
|
+
try {
|
|
1642
|
+
const { clusterUrls } = await import('./analyses/templates/cluster.js');
|
|
1643
|
+
const { groups } = clusterUrls(
|
|
1644
|
+
extractQueue.map(r => ({ url: r.url })),
|
|
1645
|
+
{ minGroupSize: MIN_GROUP_FOR_SAMPLING }
|
|
1646
|
+
);
|
|
1647
|
+
|
|
1648
|
+
if (groups.length > 0) {
|
|
1649
|
+
const skipUrls = new Set();
|
|
1586
1650
|
|
|
1587
|
-
|
|
1588
|
-
|
|
1589
|
-
|
|
1590
|
-
|
|
1591
|
-
|
|
1592
|
-
|
|
1651
|
+
for (const group of groups) {
|
|
1652
|
+
const urls = group.urls;
|
|
1653
|
+
if (urls.length <= SAMPLE_PER_GROUP) continue;
|
|
1654
|
+
|
|
1655
|
+
const sampleSet = new Set();
|
|
1656
|
+
sampleSet.add(urls[0]); sampleSet.add(urls[1]);
|
|
1657
|
+
sampleSet.add(urls[urls.length - 1]); sampleSet.add(urls[urls.length - 2]);
|
|
1658
|
+
sampleSet.add(urls[Math.floor(urls.length / 2)]);
|
|
1659
|
+
|
|
1660
|
+
const skippedCount = urls.length - sampleSet.size;
|
|
1661
|
+
for (const u of urls) {
|
|
1662
|
+
if (!sampleSet.has(u)) skipUrls.add(u);
|
|
1663
|
+
}
|
|
1664
|
+
console.log(chalk.yellow(` [template] ${group.pattern} → ${urls.length} pages, sampling ${sampleSet.size}, skipping ${skippedCount}`));
|
|
1665
|
+
}
|
|
1666
|
+
|
|
1667
|
+
if (skipUrls.size > 0) {
|
|
1668
|
+
extractQueue = extractQueue.filter(r => !skipUrls.has(r.url));
|
|
1669
|
+
skipped += skipUrls.size;
|
|
1670
|
+
console.log(chalk.yellow(` [template] ${withContent.length} extractable → ${extractQueue.length} to extract (${skipUrls.size} template-skipped)\n`));
|
|
1671
|
+
}
|
|
1672
|
+
}
|
|
1673
|
+
} catch (e) {
|
|
1674
|
+
console.log(chalk.gray(` [template] Pattern detection skipped: ${e.message}`));
|
|
1593
1675
|
}
|
|
1594
1676
|
|
|
1595
|
-
//
|
|
1596
|
-
|
|
1597
|
-
|
|
1598
|
-
|
|
1599
|
-
|
|
1677
|
+
// ── Consecutive failure tracking per URL pattern ──
|
|
1678
|
+
const CONSEC_FAIL_THRESHOLD = 3;
|
|
1679
|
+
const patternFailCounts = new Map();
|
|
1680
|
+
const skippedPatterns = new Set();
|
|
1681
|
+
|
|
1682
|
+
function getPatternKey(url) {
|
|
1683
|
+
try {
|
|
1684
|
+
const u = new URL(url);
|
|
1685
|
+
const parts = u.pathname.split('/').filter(Boolean);
|
|
1686
|
+
return u.hostname + '/' + parts.map(p =>
|
|
1687
|
+
(p.length > 20 || /^[0-9a-fA-F]{8,}$/.test(p) || /^0x/.test(p) || /[-_]/.test(p)) ? '{var}' : p
|
|
1688
|
+
).join('/');
|
|
1689
|
+
} catch { return url; }
|
|
1690
|
+
}
|
|
1691
|
+
|
|
1692
|
+
// ── Content similarity detection ──
|
|
1693
|
+
const SIMILARITY_THRESHOLD = 0.80;
|
|
1694
|
+
const SIMILARITY_SAMPLE_SIZE = 3;
|
|
1695
|
+
const patternFingerprints = new Map();
|
|
1696
|
+
|
|
1697
|
+
function textToShingles(text, n = 3) {
|
|
1698
|
+
const words = (text || '').toLowerCase().replace(/[^a-z0-9\s]/g, '').split(/\s+/).filter(Boolean);
|
|
1699
|
+
const shingles = new Set();
|
|
1700
|
+
for (let i = 0; i <= words.length - n; i++) {
|
|
1701
|
+
shingles.add(words.slice(i, i + n).join(' '));
|
|
1702
|
+
}
|
|
1703
|
+
return shingles;
|
|
1704
|
+
}
|
|
1705
|
+
|
|
1706
|
+
function jaccardSimilarity(a, b) {
|
|
1707
|
+
if (!a.size || !b.size) return 0;
|
|
1708
|
+
let intersection = 0;
|
|
1709
|
+
for (const s of a) { if (b.has(s)) intersection++; }
|
|
1710
|
+
return intersection / (a.size + b.size - intersection);
|
|
1711
|
+
}
|
|
1712
|
+
|
|
1713
|
+
function checkPatternSimilarity(patKey, newShingles) {
|
|
1714
|
+
if (!patternFingerprints.has(patKey)) patternFingerprints.set(patKey, []);
|
|
1715
|
+
const fps = patternFingerprints.get(patKey);
|
|
1716
|
+
fps.push(newShingles);
|
|
1717
|
+
if (fps.length < SIMILARITY_SAMPLE_SIZE || fps.length > SIMILARITY_SAMPLE_SIZE) return false;
|
|
1718
|
+
for (let i = 0; i < fps.length; i++) {
|
|
1719
|
+
for (let j = i + 1; j < fps.length; j++) {
|
|
1720
|
+
if (jaccardSimilarity(fps[i], fps[j]) < SIMILARITY_THRESHOLD) return false;
|
|
1721
|
+
}
|
|
1600
1722
|
}
|
|
1601
|
-
|
|
1723
|
+
return true;
|
|
1724
|
+
}
|
|
1602
1725
|
|
|
1603
|
-
|
|
1604
|
-
|
|
1605
|
-
|
|
1606
|
-
if (opts.stealth) process.stdout.write(chalk.magenta('stealth '));
|
|
1607
|
-
process.stdout.write(chalk.gray('fetching...'));
|
|
1608
|
-
|
|
1609
|
-
writeProgress({
|
|
1610
|
-
status: 'running', command: 'extract', project,
|
|
1611
|
-
current_url: row.url,
|
|
1612
|
-
page_index: done + failed + 1, total: pendingPages.length,
|
|
1613
|
-
percent: Math.round(((done + failed) / pendingPages.length) * 100),
|
|
1614
|
-
started_at: extractStart, failed,
|
|
1615
|
-
stealth: !!opts.stealth,
|
|
1616
|
-
});
|
|
1726
|
+
// ── Prepare headings + schema queries (per-page lookups from DB) ──
|
|
1727
|
+
const getHeadings = db.prepare('SELECT level, text FROM headings WHERE page_id = ? ORDER BY id');
|
|
1728
|
+
const getSchemaTypes = db.prepare('SELECT DISTINCT schema_type FROM page_schemas WHERE page_id = ?');
|
|
1617
1729
|
|
|
1618
|
-
|
|
1619
|
-
|
|
1730
|
+
const totalToProcess = extractQueue.length;
|
|
1731
|
+
console.log(chalk.gray(` 📖 Reading from DB — no network needed\n`));
|
|
1620
1732
|
|
|
1621
|
-
|
|
1622
|
-
|
|
1623
|
-
|
|
1624
|
-
|
|
1625
|
-
|
|
1626
|
-
|
|
1627
|
-
const crawled = await crawlAll(row.url);
|
|
1628
|
-
pageData = crawled[0] || null;
|
|
1629
|
-
}
|
|
1733
|
+
for (const row of extractQueue) {
|
|
1734
|
+
const patKey = getPatternKey(row.url);
|
|
1735
|
+
if (skippedPatterns.has(patKey)) {
|
|
1736
|
+
skipped++;
|
|
1737
|
+
continue;
|
|
1738
|
+
}
|
|
1630
1739
|
|
|
1631
|
-
|
|
1632
|
-
|
|
1633
|
-
|
|
1634
|
-
failed++;
|
|
1635
|
-
if (stealthSession) {
|
|
1636
|
-
// Jittered delay even on failure — don't hammer a blocking site
|
|
1637
|
-
await new Promise(r => setTimeout(r, 1500 + Math.random() * 2000));
|
|
1638
|
-
}
|
|
1639
|
-
continue;
|
|
1640
|
-
}
|
|
1740
|
+
const pos = done + failed + 1;
|
|
1741
|
+
process.stdout.write(chalk.gray(` [${pos}/${totalToProcess}] ${row.url.slice(0, 70)} → `));
|
|
1742
|
+
process.stdout.write(chalk.gray('extracting...'));
|
|
1641
1743
|
|
|
1642
|
-
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
insertLinks(db, row.id, pageData.links);
|
|
1652
|
-
if (pageData.parsedSchemas?.length) insertPageSchemas(db, row.id, pageData.parsedSchemas);
|
|
1653
|
-
}
|
|
1744
|
+
writeProgress({
|
|
1745
|
+
status: 'running', command: 'extract', project,
|
|
1746
|
+
current_url: row.url,
|
|
1747
|
+
page_index: pos, total: totalToProcess,
|
|
1748
|
+
percent: Math.round(((done + failed) / totalToProcess) * 100),
|
|
1749
|
+
started_at: extractStart, failed, skipped,
|
|
1750
|
+
});
|
|
1751
|
+
|
|
1752
|
+
let pageFailed = false;
|
|
1654
1753
|
|
|
1655
|
-
|
|
1754
|
+
try {
|
|
1755
|
+
// Read headings + schema types from DB
|
|
1756
|
+
const headings = getHeadings.all(row.id);
|
|
1757
|
+
const schemaTypes = getSchemaTypes.all(row.id).map(r => r.schema_type);
|
|
1758
|
+
|
|
1759
|
+
const extractFn = await getExtractPage();
|
|
1760
|
+
const extraction = await extractFn({
|
|
1761
|
+
url: row.url,
|
|
1762
|
+
title: row.title || '',
|
|
1763
|
+
metaDesc: row.meta_desc || '',
|
|
1764
|
+
headings,
|
|
1765
|
+
bodyText: row.body_text,
|
|
1766
|
+
schemaTypes,
|
|
1767
|
+
publishedDate: row.published_date,
|
|
1768
|
+
modifiedDate: row.modified_date,
|
|
1769
|
+
});
|
|
1770
|
+
insertExtraction(db, { pageId: row.id, data: extraction });
|
|
1771
|
+
insertKeywords(db, row.id, extraction.keywords);
|
|
1772
|
+
|
|
1773
|
+
const isDegraded = extraction.extraction_source === 'degraded';
|
|
1774
|
+
if (isDegraded) {
|
|
1775
|
+
process.stdout.write(chalk.yellow(` ⚠ degraded\n`));
|
|
1656
1776
|
done++;
|
|
1657
|
-
|
|
1658
|
-
|
|
1659
|
-
|
|
1777
|
+
pageFailed = true;
|
|
1778
|
+
} else {
|
|
1779
|
+
process.stdout.write(chalk.green(` ✓\n`));
|
|
1780
|
+
done++;
|
|
1781
|
+
patternFailCounts.set(patKey, 0);
|
|
1660
1782
|
}
|
|
1661
1783
|
|
|
1662
|
-
//
|
|
1663
|
-
if (
|
|
1664
|
-
|
|
1784
|
+
// ── Content similarity detection ──
|
|
1785
|
+
if (row.body_text.length > 50) {
|
|
1786
|
+
const shingles = textToShingles(row.body_text);
|
|
1787
|
+
if (checkPatternSimilarity(patKey, shingles) && !skippedPatterns.has(patKey)) {
|
|
1788
|
+
const remaining = extractQueue.filter(r => getPatternKey(r.url) === patKey).length - (patternFingerprints.get(patKey)?.length || 0);
|
|
1789
|
+
skippedPatterns.add(patKey);
|
|
1790
|
+
if (remaining > 0) {
|
|
1791
|
+
console.log(chalk.yellow(` [similarity] 🔍 ${SIMILARITY_SAMPLE_SIZE} pages from ${patKey} are ${Math.round(SIMILARITY_THRESHOLD * 100)}%+ identical — skipping ${remaining} remaining`));
|
|
1792
|
+
}
|
|
1793
|
+
}
|
|
1665
1794
|
}
|
|
1795
|
+
} catch (err) {
|
|
1796
|
+
process.stdout.write(chalk.red(` ✗ ${err.message}\n`));
|
|
1797
|
+
failed++;
|
|
1798
|
+
pageFailed = true;
|
|
1666
1799
|
}
|
|
1667
|
-
|
|
1668
|
-
//
|
|
1669
|
-
if (
|
|
1670
|
-
|
|
1671
|
-
|
|
1800
|
+
|
|
1801
|
+
// ── Track consecutive failures per pattern ──
|
|
1802
|
+
if (pageFailed) {
|
|
1803
|
+
const count = (patternFailCounts.get(patKey) || 0) + 1;
|
|
1804
|
+
patternFailCounts.set(patKey, count);
|
|
1805
|
+
if (count >= CONSEC_FAIL_THRESHOLD) {
|
|
1806
|
+
const remaining = extractQueue.filter(r => !skippedPatterns.has(getPatternKey(r.url)) && getPatternKey(r.url) === patKey).length;
|
|
1807
|
+
skippedPatterns.add(patKey);
|
|
1808
|
+
console.log(chalk.yellow(` [template] ⚡ ${count} consecutive failures for ${patKey} — skipping ~${remaining} remaining pages`));
|
|
1809
|
+
}
|
|
1672
1810
|
}
|
|
1673
1811
|
}
|
|
1674
1812
|
|
|
1675
|
-
writeProgress({ status: 'completed', command: 'extract', project, extracted: done, failed, total: pendingPages.length, started_at: extractStart, finished_at: Date.now() });
|
|
1676
|
-
|
|
1813
|
+
writeProgress({ status: 'completed', command: 'extract', project, extracted: done, failed, skipped, total: pendingPages.length, started_at: extractStart, finished_at: Date.now() });
|
|
1814
|
+
const skipMsg = skipped > 0 ? chalk.yellow(`, ${skipped} template-skipped`) : '';
|
|
1815
|
+
const recrawlMsg = needsRecrawl > 0 ? chalk.yellow(`, ${needsRecrawl} need re-crawl`) : '';
|
|
1816
|
+
console.log(chalk.bold.green(`\n✅ Extraction complete: ${done} extracted, ${failed} failed${skipMsg}${recrawlMsg}\n`));
|
|
1817
|
+
});
|
|
1818
|
+
|
|
1819
|
+
// ── TEMPLATES ANALYSIS ────────────────────────────────────────────────────
|
|
1820
|
+
program
|
|
1821
|
+
.command('templates <project>')
|
|
1822
|
+
.description('Detect programmatic template pages — assess SEO value without crawling all of them')
|
|
1823
|
+
.option('--min-group <n>', 'Minimum URLs to qualify as a template group', '10')
|
|
1824
|
+
.option('--sample-size <n>', 'Pages to stealth-crawl per template group', '20')
|
|
1825
|
+
.option('--skip-crawl', 'Skip sample crawl (pattern analysis + GSC only)')
|
|
1826
|
+
.option('--skip-gsc', 'Skip GSC overlay phase')
|
|
1827
|
+
.option('--skip-competitors', 'Skip competitor sitemap census')
|
|
1828
|
+
.action(async (project, opts) => {
|
|
1829
|
+
if (!requirePro('templates')) return;
|
|
1830
|
+
|
|
1831
|
+
console.log(chalk.bold.cyan(`\n🔍 SEO Intel — Template Analysis`));
|
|
1832
|
+
console.log(chalk.dim(` Project: ${project}`));
|
|
1833
|
+
|
|
1834
|
+
try {
|
|
1835
|
+
const { runTemplatesAnalysis } = await import('./analyses/templates/index.js');
|
|
1836
|
+
const report = await runTemplatesAnalysis(project, {
|
|
1837
|
+
minGroupSize: parseInt(opts.minGroup) || 10,
|
|
1838
|
+
sampleSize: parseInt(opts.sampleSize) || 20,
|
|
1839
|
+
skipCrawl: !!opts.skipCrawl,
|
|
1840
|
+
skipGsc: !!opts.skipGsc,
|
|
1841
|
+
skipCompetitors: !!opts.skipCompetitors,
|
|
1842
|
+
log: (msg) => console.log(chalk.gray(msg)),
|
|
1843
|
+
});
|
|
1844
|
+
|
|
1845
|
+
if (report.groups.length === 0) {
|
|
1846
|
+
console.log(chalk.yellow(`\n No template patterns detected.\n`));
|
|
1847
|
+
process.exit(0);
|
|
1848
|
+
}
|
|
1849
|
+
|
|
1850
|
+
// Summary
|
|
1851
|
+
console.log(chalk.bold.green(`\n✅ Template analysis complete`));
|
|
1852
|
+
console.log(chalk.dim(` ${report.stats.totalGroups} groups · ${report.stats.totalGrouped.toLocaleString()} URLs · ${(report.stats.coverage * 100).toFixed(0)}% of sitemap`));
|
|
1853
|
+
console.log(chalk.dim(` Run ${chalk.white('seo-intel html ' + project)} to see the full dashboard.\n`));
|
|
1854
|
+
} catch (err) {
|
|
1855
|
+
console.error(chalk.red(`\n Error: ${err.message}\n`));
|
|
1856
|
+
if (process.env.DEBUG) console.error(err.stack);
|
|
1857
|
+
process.exit(1);
|
|
1858
|
+
}
|
|
1677
1859
|
});
|
|
1678
1860
|
|
|
1679
1861
|
// ── HTML DASHBOARD ─────────────────────────────────────────────────────────
|
|
@@ -1741,10 +1923,10 @@ program
|
|
|
1741
1923
|
}
|
|
1742
1924
|
});
|
|
1743
1925
|
|
|
1744
|
-
// ── HTML ALL-PROJECTS DASHBOARD
|
|
1926
|
+
// ── HTML ALL-PROJECTS DASHBOARD (alias for html — kept for backwards compat) ──
|
|
1745
1927
|
program
|
|
1746
1928
|
.command('html-all')
|
|
1747
|
-
.description('
|
|
1929
|
+
.description('Alias for "html" — generates the all-projects dashboard')
|
|
1748
1930
|
.action(() => {
|
|
1749
1931
|
const db = getDb();
|
|
1750
1932
|
const configs = loadAllConfigs();
|
package/crawler/index.js
CHANGED
|
@@ -186,7 +186,7 @@ export async function* crawlDomain(startUrl, opts = {}) {
|
|
|
186
186
|
// When hostname contains "docs.", spoof Googlebot UA to reduce WAF friction.
|
|
187
187
|
const isDocsHostname = base.hostname.toLowerCase().includes('docs.');
|
|
188
188
|
const GOOGLEBOT_UA = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';
|
|
189
|
-
const defaultUA = 'Mozilla/5.0 (compatible; SEOIntelBot/1.0; +https://
|
|
189
|
+
const defaultUA = 'Mozilla/5.0 (compatible; SEOIntelBot/1.0; +https://ukkometa.fi/en/seo-intel/bot)';
|
|
190
190
|
const effectiveUA = isDocsHostname ? GOOGLEBOT_UA : defaultUA;
|
|
191
191
|
|
|
192
192
|
async function tryLoadLlmsTxt() {
|
|
@@ -547,11 +547,16 @@ async function processPage(page, url, base, depth, queue, maxDepth) {
|
|
|
547
547
|
// ── Quality gate — detect shells, blocked pages, empty content ──
|
|
548
548
|
const quality = assessQuality({ wordCount, bodyText, title, status });
|
|
549
549
|
|
|
550
|
+
// Full body text for DB storage (extraction reads this); truncated for log output
|
|
551
|
+
const fullBodyText = sanitize(bodyText, 50000); // ~200K chars — enough for any real page
|
|
552
|
+
const shortBodyText = sanitize(bodyText, 2000); // compact version for logging
|
|
553
|
+
|
|
550
554
|
return {
|
|
551
555
|
url, depth, status, loadMs, wordCount, isIndexable,
|
|
552
556
|
title, metaDesc, headings,
|
|
553
557
|
links: [...internalLinks, ...externalLinks],
|
|
554
|
-
bodyText:
|
|
558
|
+
bodyText: shortBodyText,
|
|
559
|
+
fullBodyText,
|
|
555
560
|
schemaTypes, parsedSchemas, vitals, publishedDate, modifiedDate,
|
|
556
561
|
contentHash: hash,
|
|
557
562
|
quality: quality.ok, qualityReason: quality.reason,
|
|
package/crawler/subdomain-discovery.js
CHANGED
|
@@ -181,7 +181,7 @@ async function checkHttp(hostname) {
|
|
|
181
181
|
signal: controller.signal,
|
|
182
182
|
redirect: 'follow',
|
|
183
183
|
headers: {
|
|
184
|
-
'User-Agent': 'Mozilla/5.0 (compatible; SEOIntelBot/1.0; +https://
|
|
184
|
+
'User-Agent': 'Mozilla/5.0 (compatible; SEOIntelBot/1.0; +https://ukkometa.fi/en/seo-intel/bot)',
|
|
185
185
|
},
|
|
186
186
|
});
|
|
187
187
|
|