seo-intel 1.1.6 → 1.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.env.example CHANGED
@@ -2,7 +2,7 @@
2
2
  # Run `node cli.js setup` to configure interactively
3
3
 
4
4
  # ── License (Pro features) ───────────────────────────────────────────────
5
- # Get your key at https://froggo.pro/seo-intel
5
+ # Get your key at https://ukkometa.fi/en/seo-intel/
6
6
  # SEO_INTEL_LICENSE=SI-xxxx-xxxx-xxxx-xxxx
7
7
 
8
8
  # ── Analysis Model (cloud, pick one) ──────────────────────────────────────
package/CHANGELOG.md ADDED
@@ -0,0 +1,36 @@
1
+ # Changelog
2
+
3
+ ## 1.1.8 (2026-03-27)
4
+
5
+ - Rebranded all references from froggo.pro → ukkometa.fi (endpoints, dashboard links, license validation, bot user-agents, skill)
6
+ - Pricing updated: €9.99/mo · €79/yr
7
+ - Contact updated: ukko@ukkometa.fi
8
+ - Added README.md and CHANGELOG.md to npm package and LS zip
9
+
10
+ ## 1.1.7 (2026-03-26)
11
+
12
+ ### New Features
13
+ - **Programmatic Template Intelligence** (`seo-intel templates <project>`) — detect URL pattern groups (e.g. `/token/*`, `/blog/*`), stealth-crawl samples, overlay GSC data, and score each group with keep/noindex/improve verdicts. Pro-gated.
14
+ - **Stale domain auto-pruning** — domains removed from config are now automatically cleaned from the DB (pages, keywords, extractions, schemas, headings, links) on next crawl. No more ghost data from renamed/removed subdomains.
15
+ - **Manual prune** — `seo-intel competitors <project> --prune` to clean stale DB entries on demand.
16
+ - **Full body text storage** — crawler now stores full page body text in DB (up to 200K chars) for richer extraction and analysis. Log output stays compact.
17
+
18
+ ### Improvements
19
+ - **Background crawl/extract** — long-running crawl and extract jobs now survive browser tab close. Terminal shows "backgrounded" instead of "disconnected", and jobs continue server-side.
20
+ - **Dashboard terminal** — stealth flag now visible in terminal command display. Stop button properly closes SSE + server-side process. Status bar syncs with terminal state.
21
+ - **Templates button** added to dashboard terminal panel.
22
+ - **Dashboard refresh** — crawl and analyze now always regenerate the multi-project dashboard, keeping all projects current.
23
+ - **Config remove = DB remove** — `--remove` and `--remove-owned` now auto-prune matching DB data, not just config JSON.
24
+
25
+ ### Fixes
26
+ - SSE disconnect no longer kills crawl/extract processes (detached child process).
27
+ - Terminal command display now shows `--stealth` flag when enabled.
28
+
29
+ ## 1.1.6 (2026-03-24)
30
+
31
+ - Stop button, stealth sync, extraction layout, EADDRINUSE recovery.
32
+
33
+ ## 1.1.5 (2026-03-21)
34
+
35
+ - Update checker, job stop API, background analyze, LAN Ollama hosts, `html` CLI command, wizard UX improvements.
36
+
package/LICENSE CHANGED
@@ -1,6 +1,6 @@
1
1
  SEO Intel — Dual License
2
2
 
3
- Copyright (c) 2024-2026 froggo.pro
3
+ Copyright (c) 2024-2026 Ukkometa (ukkometa.fi)
4
4
 
5
5
  This project uses a dual license structure:
6
6
 
@@ -65,11 +65,10 @@ You MAY:
65
65
  - Share generated reports and dashboards (outputs are yours)
66
66
 
67
67
  License:
68
- Solo — €19.99/month or €199/year — full AI analysis, all commands
69
- Also available at $9.99/month via froggo.pro marketplace
68
+ Solo — €19.99/month or €199.99/year — full AI analysis, all commands
70
69
 
71
- Purchase at: https://ukkometa.fi or https://froggo.pro/seo-intel
70
+ Purchase at: https://ukkometa.fi/en/seo-intel/
72
71
 
73
72
  ================================================================================
74
73
 
75
- For questions: hello@froggo.pro
74
+ For questions: ukko@ukkometa.fi
package/README.md CHANGED
@@ -75,7 +75,7 @@ seo-intel suggest-usecases myproject --scope docs # infer what pages/docs s
75
75
  | `schemas <project>` | Schema.org coverage analysis |
76
76
  | `update` | Check for updates |
77
77
 
78
- ### Solo (€19.99/mo · [ukkometa.fi/seo-intel](https://ukkometa.fi/seo-intel))
78
+ ### Solo (€19.99/mo · [ukkometa.fi/seo-intel](https://ukkometa.fi/en/seo-intel/))
79
79
 
80
80
  | Command | Description |
81
81
  |---------|-------------|
@@ -194,7 +194,7 @@ Upload your GSC data for ranking insights:
194
194
  - 1 project, 500 pages/domain
195
195
  - Crawl, extract, setup, basic reports
196
196
 
197
- ### Pro Tier ($49 one-time)
197
+ ### Solo (€19.99/mo · €199.99/yr)
198
198
  - Unlimited projects and pages
199
199
  - All analysis commands, GSC insights, scheduling
200
200
 
@@ -203,7 +203,7 @@ Upload your GSC data for ranking insights:
203
203
  echo "SEO_INTEL_LICENSE=SI-xxxx-xxxx-xxxx-xxxx" >> .env
204
204
  ```
205
205
 
206
- Get a key at [froggo.pro/seo-intel](https://froggo.pro/seo-intel)
206
+ Get a key at [ukkometa.fi/seo-intel](https://ukkometa.fi/en/seo-intel/)
207
207
 
208
208
  ## Updates
209
209
 
package/cli.js CHANGED
@@ -24,11 +24,11 @@ import { getNextCrawlTarget, needsAnalysis, getCrawlStatus, loadAllConfigs } fro
24
24
  import {
25
25
  getDb, upsertDomain, upsertPage, insertExtraction,
26
26
  insertKeywords, insertHeadings, insertLinks, insertPageSchemas,
27
- upsertTechnical,
27
+ upsertTechnical, pruneStaleDomains,
28
28
  getCompetitorSummary, getKeywordMatrix, getHeadingStructure,
29
29
  getPageHash, getSchemasByProject
30
30
  } from './db/db.js';
31
- import { generateHtmlDashboard, generateMultiDashboard } from './reports/generate-html.js';
31
+ import { generateMultiDashboard } from './reports/generate-html.js';
32
32
  import { buildTechnicalActions } from './exports/technical.js';
33
33
  import { buildCompetitiveActions } from './exports/competitive.js';
34
34
  import { buildSuggestiveActions } from './exports/suggestive.js';
@@ -393,6 +393,21 @@ program
393
393
  }
394
394
  }
395
395
 
396
+ // ── Prune stale domains (DB entries no longer in config) ─────────────
397
+ {
398
+ const configDomains = new Set([
399
+ config.target?.domain,
400
+ ...(config.owned || []).map(o => o.domain),
401
+ ...(config.competitors || []).map(c => c.domain),
402
+ ].filter(Boolean));
403
+
404
+ const pruned = pruneStaleDomains(db, project, configDomains);
405
+ if (pruned.length) {
406
+ console.log(chalk.yellow(`\n 🧹 Pruned ${pruned.length} stale domain(s) from DB (no longer in config):`));
407
+ for (const d of pruned) console.log(chalk.dim(` − ${d}`));
408
+ }
409
+ }
410
+
396
411
  // ── Tier gate: Free tier = crawl-only, no AI extraction ──────────────
397
412
  if (opts.extract !== false && !isPro()) {
398
413
  console.log(chalk.dim('\n ℹ Free tier: crawl-only mode (AI extraction requires Solo/Agency)'));
@@ -488,6 +503,9 @@ program
488
503
  publishedDate: page.publishedDate || null,
489
504
  modifiedDate: page.modifiedDate || null,
490
505
  contentHash: page.contentHash || null,
506
+ title: page.title || null,
507
+ metaDesc: page.metaDesc || null,
508
+ bodyText: page.fullBodyText || page.bodyText || null,
491
509
  });
492
510
  const pageId = pageRes?.id;
493
511
 
@@ -579,9 +597,10 @@ program
579
597
  if (totalSkipped > 0) console.log(chalk.blue(`\n📊 Incremental: ${totalSkipped} unchanged pages skipped (same content hash)`));
580
598
  if (totalBlocked > 0) console.log(chalk.red(`\n⛔ ${totalBlocked} domain(s) blocked (rate-limited or WAF)`));
581
599
  const elapsed = ((Date.now() - crawlStart) / 1000).toFixed(1);
582
- // Auto-regenerate dashboard so it never goes stale after a crawl
600
+ // Auto-regenerate dashboard (always multi-project so all projects stay current)
583
601
  try {
584
- const dashPath = generateHtmlDashboard(db, project, config);
602
+ const allConfigs = loadAllConfigs();
603
+ const dashPath = generateMultiDashboard(db, allConfigs);
585
604
  console.log(chalk.dim(` 📊 Dashboard refreshed → ${dashPath}`));
586
605
  } catch (dashErr) {
587
606
  console.log(chalk.dim(` ⚠ Dashboard refresh skipped: ${dashErr.message}`));
@@ -697,9 +716,10 @@ program
697
716
  // Print summary
698
717
  printAnalysisSummary(analysis, project);
699
718
 
700
- // Auto-regenerate dashboard so it reflects the new analysis immediately
719
+ // Auto-regenerate dashboard (always multi-project so all projects stay current)
701
720
  try {
702
- const dashPath = generateHtmlDashboard(db, project, config);
721
+ const allConfigs = loadAllConfigs();
722
+ const dashPath = generateMultiDashboard(db, allConfigs);
703
723
  console.log(chalk.dim(` 📊 Dashboard refreshed → ${dashPath}`));
704
724
  } catch (dashErr) {
705
725
  console.log(chalk.dim(` ⚠ Dashboard refresh skipped: ${dashErr.message}`));
@@ -1074,6 +1094,9 @@ program
1074
1094
  publishedDate: page.publishedDate || null,
1075
1095
  modifiedDate: page.modifiedDate || null,
1076
1096
  contentHash: page.contentHash || null,
1097
+ title: page.title || null,
1098
+ metaDesc: page.metaDesc || null,
1099
+ bodyText: page.fullBodyText || page.bodyText || null,
1077
1100
  });
1078
1101
  const pageId = pageRes?.id;
1079
1102
 
@@ -1246,7 +1269,7 @@ program
1246
1269
  console.log(chalk.gray(' npm registry: ') + chalk.white(info.npmVersion));
1247
1270
  }
1248
1271
  if (info.froggoVersion) {
1249
- console.log(chalk.gray(' froggo.pro: ') + chalk.white(info.froggoVersion));
1272
+ console.log(chalk.gray(' ukkometa.fi: ') + chalk.white(info.froggoVersion));
1250
1273
  }
1251
1274
 
1252
1275
  if (!info.hasUpdate) {
@@ -1379,6 +1402,7 @@ program
1379
1402
  .option('--add-owned <domain>', 'Add an owned subdomain')
1380
1403
  .option('--remove-owned <domain>', 'Remove an owned subdomain')
1381
1404
  .option('--set-target <domain>', 'Change the target domain')
1405
+ .option('--prune', 'Remove DB data for domains no longer in config')
1382
1406
  .action((project, opts) => {
1383
1407
  const configPath = join(__dirname, `config/${project}.json`);
1384
1408
  let config;
@@ -1471,6 +1495,24 @@ program
1471
1495
  console.log(chalk.dim(`\n Saved → config/${project}.json`));
1472
1496
  }
1473
1497
 
1498
+ // ── Prune stale DB data (auto on remove, or manual --prune) ─────────
1499
+ if (modified || opts.prune) {
1500
+ const db = getDb();
1501
+ const configDomains = new Set([
1502
+ config.target?.domain,
1503
+ ...(config.owned || []).map(o => o.domain),
1504
+ ...(config.competitors || []).map(c => c.domain),
1505
+ ].filter(Boolean));
1506
+
1507
+ const pruned = pruneStaleDomains(db, project, configDomains);
1508
+ if (pruned.length) {
1509
+ console.log(chalk.yellow(`\n 🧹 Pruned ${pruned.length} stale domain(s) from DB:`));
1510
+ for (const d of pruned) console.log(chalk.dim(` − ${d}`));
1511
+ } else if (opts.prune) {
1512
+ console.log(chalk.dim('\n ✓ No stale domains to prune'));
1513
+ }
1514
+ }
1515
+
1474
1516
  // ── Always show current config
1475
1517
  console.log(chalk.bold.cyan(`\n 📋 ${project} — Domain Configuration\n`));
1476
1518
  console.log(chalk.white(' Target:'));
@@ -1560,13 +1602,14 @@ async function runAnalysis(project, db) {
1560
1602
  program
1561
1603
  .command('extract <project>')
1562
1604
  .description('Run AI extraction on all crawled-but-not-yet-extracted pages (requires Solo/Agency)')
1563
- .option('--stealth', 'Advanced browser mode for JS-heavy and dynamic sites')
1564
- .action(async (project, opts) => {
1605
+ .action(async (project) => {
1565
1606
  if (!requirePro('extract')) return;
1566
1607
  const db = getDb();
1608
+
1609
+ // Query pages that have body_text stored (from crawl) but no extraction yet
1567
1610
  const pendingPages = db.prepare(`
1568
- SELECT p.id, p.url, p.word_count,
1569
- e.id as extracted
1611
+ SELECT p.id, p.url, p.word_count, p.title, p.meta_desc, p.body_text,
1612
+ p.published_date, p.modified_date
1570
1613
  FROM pages p
1571
1614
  JOIN domains d ON d.id = p.domain_id
1572
1615
  LEFT JOIN extractions e ON e.page_id = p.id
@@ -1578,102 +1621,241 @@ program
1578
1621
  process.exit(0);
1579
1622
  }
1580
1623
 
1581
- const mode = opts.stealth ? chalk.magenta('STEALTH') : chalk.gray('standard');
1582
- console.log(chalk.bold.cyan(`\n⚙️ Extracting ${pendingPages.length} pages for ${project} via Qwen [${mode}]...\n`));
1624
+ // Check how many have body_text stored vs need re-crawl
1625
+ const withContent = pendingPages.filter(r => r.body_text);
1626
+ const needsRecrawl = pendingPages.length - withContent.length;
1627
+
1628
+ console.log(chalk.bold.cyan(`\n⚙️ Extracting ${pendingPages.length} pages for ${project} via Qwen...\n`));
1629
+ if (needsRecrawl > 0) {
1630
+ console.log(chalk.yellow(` ⚠ ${needsRecrawl} pages have no stored content (crawled before v1.1.6). Re-crawl to populate.\n`));
1631
+ }
1583
1632
 
1584
1633
  const extractStart = Date.now();
1585
- let done = 0, failed = 0;
1634
+ let done = 0, failed = 0, skipped = 0;
1635
+
1636
+ // ── Pre-extract template grouping: sample N per group, skip the rest ──
1637
+ const SAMPLE_PER_GROUP = 5;
1638
+ const MIN_GROUP_FOR_SAMPLING = 10;
1639
+ let extractQueue = pendingPages.filter(r => r.body_text); // only pages with stored content
1640
+
1641
+ try {
1642
+ const { clusterUrls } = await import('./analyses/templates/cluster.js');
1643
+ const { groups } = clusterUrls(
1644
+ extractQueue.map(r => ({ url: r.url })),
1645
+ { minGroupSize: MIN_GROUP_FOR_SAMPLING }
1646
+ );
1647
+
1648
+ if (groups.length > 0) {
1649
+ const skipUrls = new Set();
1586
1650
 
1587
- // ── Stealth: single session across all pages (cookie accumulation) ──
1588
- let stealthSession = null;
1589
- if (opts.stealth) {
1590
- const { createStealthSession } = await import('./crawler/stealth.js');
1591
- stealthSession = await createStealthSession();
1592
- console.log(chalk.magenta(' 🥷 Advanced mode — full browser rendering, persistent sessions\n'));
1651
+ for (const group of groups) {
1652
+ const urls = group.urls;
1653
+ if (urls.length <= SAMPLE_PER_GROUP) continue;
1654
+
1655
+ const sampleSet = new Set();
1656
+ sampleSet.add(urls[0]); sampleSet.add(urls[1]);
1657
+ sampleSet.add(urls[urls.length - 1]); sampleSet.add(urls[urls.length - 2]);
1658
+ sampleSet.add(urls[Math.floor(urls.length / 2)]);
1659
+
1660
+ const skippedCount = urls.length - sampleSet.size;
1661
+ for (const u of urls) {
1662
+ if (!sampleSet.has(u)) skipUrls.add(u);
1663
+ }
1664
+ console.log(chalk.yellow(` [template] ${group.pattern} → ${urls.length} pages, sampling ${sampleSet.size}, skipping ${skippedCount}`));
1665
+ }
1666
+
1667
+ if (skipUrls.size > 0) {
1668
+ extractQueue = extractQueue.filter(r => !skipUrls.has(r.url));
1669
+ skipped += skipUrls.size;
1670
+ console.log(chalk.yellow(` [template] ${withContent.length} extractable → ${extractQueue.length} to extract (${skipUrls.size} template-skipped)\n`));
1671
+ }
1672
+ }
1673
+ } catch (e) {
1674
+ console.log(chalk.gray(` [template] Pattern detection skipped: ${e.message}`));
1593
1675
  }
1594
1676
 
1595
- // Register cleanup so SIGTERM closes the browser gracefully
1596
- onShutdown(async () => {
1597
- if (stealthSession) {
1598
- await stealthSession.close();
1599
- console.log(chalk.magenta(' 🥷 Stealth session closed'));
1677
+ // ── Consecutive failure tracking per URL pattern ──
1678
+ const CONSEC_FAIL_THRESHOLD = 3;
1679
+ const patternFailCounts = new Map();
1680
+ const skippedPatterns = new Set();
1681
+
1682
+ function getPatternKey(url) {
1683
+ try {
1684
+ const u = new URL(url);
1685
+ const parts = u.pathname.split('/').filter(Boolean);
1686
+ return u.hostname + '/' + parts.map(p =>
1687
+ (p.length > 20 || /^[0-9a-fA-F]{8,}$/.test(p) || /^0x/.test(p) || /[-_]/.test(p)) ? '{var}' : p
1688
+ ).join('/');
1689
+ } catch { return url; }
1690
+ }
1691
+
1692
+ // ── Content similarity detection ──
1693
+ const SIMILARITY_THRESHOLD = 0.80;
1694
+ const SIMILARITY_SAMPLE_SIZE = 3;
1695
+ const patternFingerprints = new Map();
1696
+
1697
+ function textToShingles(text, n = 3) {
1698
+ const words = (text || '').toLowerCase().replace(/[^a-z0-9\s]/g, '').split(/\s+/).filter(Boolean);
1699
+ const shingles = new Set();
1700
+ for (let i = 0; i <= words.length - n; i++) {
1701
+ shingles.add(words.slice(i, i + n).join(' '));
1702
+ }
1703
+ return shingles;
1704
+ }
1705
+
1706
+ function jaccardSimilarity(a, b) {
1707
+ if (!a.size || !b.size) return 0;
1708
+ let intersection = 0;
1709
+ for (const s of a) { if (b.has(s)) intersection++; }
1710
+ return intersection / (a.size + b.size - intersection);
1711
+ }
1712
+
1713
+ function checkPatternSimilarity(patKey, newShingles) {
1714
+ if (!patternFingerprints.has(patKey)) patternFingerprints.set(patKey, []);
1715
+ const fps = patternFingerprints.get(patKey);
1716
+ fps.push(newShingles);
1717
+ if (fps.length < SIMILARITY_SAMPLE_SIZE || fps.length > SIMILARITY_SAMPLE_SIZE) return false;
1718
+ for (let i = 0; i < fps.length; i++) {
1719
+ for (let j = i + 1; j < fps.length; j++) {
1720
+ if (jaccardSimilarity(fps[i], fps[j]) < SIMILARITY_THRESHOLD) return false;
1721
+ }
1600
1722
  }
1601
- });
1723
+ return true;
1724
+ }
1602
1725
 
1603
- try {
1604
- for (const row of pendingPages) {
1605
- process.stdout.write(chalk.gray(` [${done + failed + 1}/${pendingPages.length}] ${row.url.slice(0, 65)} `));
1606
- if (opts.stealth) process.stdout.write(chalk.magenta('stealth '));
1607
- process.stdout.write(chalk.gray('fetching...'));
1608
-
1609
- writeProgress({
1610
- status: 'running', command: 'extract', project,
1611
- current_url: row.url,
1612
- page_index: done + failed + 1, total: pendingPages.length,
1613
- percent: Math.round(((done + failed) / pendingPages.length) * 100),
1614
- started_at: extractStart, failed,
1615
- stealth: !!opts.stealth,
1616
- });
1726
+ // ── Prepare headings + schema queries (per-page lookups from DB) ──
1727
+ const getHeadings = db.prepare('SELECT level, text FROM headings WHERE page_id = ? ORDER BY id');
1728
+ const getSchemaTypes = db.prepare('SELECT DISTINCT schema_type FROM page_schemas WHERE page_id = ?');
1617
1729
 
1618
- try {
1619
- let pageData;
1730
+ const totalToProcess = extractQueue.length;
1731
+ console.log(chalk.gray(` 📖 Reading from DB — no network needed\n`));
1620
1732
 
1621
- if (stealthSession) {
1622
- // Stealth: reuse persistent browser session
1623
- pageData = await stealthSession.fetchPage(row.url);
1624
- } else {
1625
- // Standard: quick single-page crawl
1626
- const { crawlAll } = await import('./crawler/index.js');
1627
- const crawled = await crawlAll(row.url);
1628
- pageData = crawled[0] || null;
1629
- }
1733
+ for (const row of extractQueue) {
1734
+ const patKey = getPatternKey(row.url);
1735
+ if (skippedPatterns.has(patKey)) {
1736
+ skipped++;
1737
+ continue;
1738
+ }
1630
1739
 
1631
- if (!pageData || pageData.status >= 400) {
1632
- const reason = pageData ? `HTTP ${pageData.status}` : 'no data';
1633
- process.stdout.write(chalk.red(` ✗ ${reason}\n`));
1634
- failed++;
1635
- if (stealthSession) {
1636
- // Jittered delay even on failure — don't hammer a blocking site
1637
- await new Promise(r => setTimeout(r, 1500 + Math.random() * 2000));
1638
- }
1639
- continue;
1640
- }
1740
+ const pos = done + failed + 1;
1741
+ process.stdout.write(chalk.gray(` [${pos}/${totalToProcess}] ${row.url.slice(0, 70)} `));
1742
+ process.stdout.write(chalk.gray('extracting...'));
1641
1743
 
1642
- process.stdout.write(chalk.gray(' extracting...'));
1643
- const extractFn = await getExtractPage();
1644
- const extraction = await extractFn(pageData);
1645
- insertExtraction(db, { pageId: row.id, data: extraction });
1646
- insertKeywords(db, row.id, extraction.keywords);
1647
-
1648
- // Also update headings + links + schemas with fresh data from stealth fetch
1649
- if (stealthSession) {
1650
- insertHeadings(db, row.id, pageData.headings);
1651
- insertLinks(db, row.id, pageData.links);
1652
- if (pageData.parsedSchemas?.length) insertPageSchemas(db, row.id, pageData.parsedSchemas);
1653
- }
1744
+ writeProgress({
1745
+ status: 'running', command: 'extract', project,
1746
+ current_url: row.url,
1747
+ page_index: pos, total: totalToProcess,
1748
+ percent: Math.round(((done + failed) / totalToProcess) * 100),
1749
+ started_at: extractStart, failed, skipped,
1750
+ });
1751
+
1752
+ let pageFailed = false;
1654
1753
 
1655
- process.stdout.write(chalk.green(` ✓${pageData.parsedSchemas?.length ? ` [${pageData.parsedSchemas.length} schema]` : ''}\n`));
1754
+ try {
1755
+ // Read headings + schema types from DB
1756
+ const headings = getHeadings.all(row.id);
1757
+ const schemaTypes = getSchemaTypes.all(row.id).map(r => r.schema_type);
1758
+
1759
+ const extractFn = await getExtractPage();
1760
+ const extraction = await extractFn({
1761
+ url: row.url,
1762
+ title: row.title || '',
1763
+ metaDesc: row.meta_desc || '',
1764
+ headings,
1765
+ bodyText: row.body_text,
1766
+ schemaTypes,
1767
+ publishedDate: row.published_date,
1768
+ modifiedDate: row.modified_date,
1769
+ });
1770
+ insertExtraction(db, { pageId: row.id, data: extraction });
1771
+ insertKeywords(db, row.id, extraction.keywords);
1772
+
1773
+ const isDegraded = extraction.extraction_source === 'degraded';
1774
+ if (isDegraded) {
1775
+ process.stdout.write(chalk.yellow(` ⚠ degraded\n`));
1656
1776
  done++;
1657
- } catch (err) {
1658
- process.stdout.write(chalk.red(` ${err.message}\n`));
1659
- failed++;
1777
+ pageFailed = true;
1778
+ } else {
1779
+ process.stdout.write(chalk.green(` ✓\n`));
1780
+ done++;
1781
+ patternFailCounts.set(patKey, 0);
1660
1782
  }
1661
1783
 
1662
- // Jittered delay in stealth mode (2-5s) to mimic human browsing
1663
- if (stealthSession) {
1664
- await new Promise(r => setTimeout(r, 2000 + Math.random() * 3000));
1784
+ // ── Content similarity detection ──
1785
+ if (row.body_text.length > 50) {
1786
+ const shingles = textToShingles(row.body_text);
1787
+ if (checkPatternSimilarity(patKey, shingles) && !skippedPatterns.has(patKey)) {
1788
+ const remaining = extractQueue.filter(r => getPatternKey(r.url) === patKey).length - (patternFingerprints.get(patKey)?.length || 0);
1789
+ skippedPatterns.add(patKey);
1790
+ if (remaining > 0) {
1791
+ console.log(chalk.yellow(` [similarity] 🔍 ${SIMILARITY_SAMPLE_SIZE} pages from ${patKey} are ${Math.round(SIMILARITY_THRESHOLD * 100)}%+ identical — skipping ${remaining} remaining`));
1792
+ }
1793
+ }
1665
1794
  }
1795
+ } catch (err) {
1796
+ process.stdout.write(chalk.red(` ✗ ${err.message}\n`));
1797
+ failed++;
1798
+ pageFailed = true;
1666
1799
  }
1667
- } finally {
1668
- // Always close stealth session
1669
- if (stealthSession) {
1670
- await stealthSession.close();
1671
- console.log(chalk.magenta(`\n 🥷 Stealth session closed (${stealthSession.getPageCount()} pages fetched)`));
1800
+
1801
+ // ── Track consecutive failures per pattern ──
1802
+ if (pageFailed) {
1803
+ const count = (patternFailCounts.get(patKey) || 0) + 1;
1804
+ patternFailCounts.set(patKey, count);
1805
+ if (count >= CONSEC_FAIL_THRESHOLD) {
1806
+ const remaining = extractQueue.filter(r => !skippedPatterns.has(getPatternKey(r.url)) && getPatternKey(r.url) === patKey).length;
1807
+ skippedPatterns.add(patKey);
1808
+ console.log(chalk.yellow(` [template] ⚡ ${count} consecutive failures for ${patKey} — skipping ~${remaining} remaining pages`));
1809
+ }
1672
1810
  }
1673
1811
  }
1674
1812
 
1675
- writeProgress({ status: 'completed', command: 'extract', project, extracted: done, failed, total: pendingPages.length, started_at: extractStart, finished_at: Date.now() });
1676
- console.log(chalk.bold.green(`\n✅ Extraction complete: ${done} extracted, ${failed} failed\n`));
1813
+ writeProgress({ status: 'completed', command: 'extract', project, extracted: done, failed, skipped, total: pendingPages.length, started_at: extractStart, finished_at: Date.now() });
1814
+ const skipMsg = skipped > 0 ? chalk.yellow(`, ${skipped} template-skipped`) : '';
1815
+ const recrawlMsg = needsRecrawl > 0 ? chalk.yellow(`, ${needsRecrawl} need re-crawl`) : '';
1816
+ console.log(chalk.bold.green(`\n✅ Extraction complete: ${done} extracted, ${failed} failed${skipMsg}${recrawlMsg}\n`));
1817
+ });
1818
+
1819
+ // ── TEMPLATES ANALYSIS ────────────────────────────────────────────────────
1820
+ program
1821
+ .command('templates <project>')
1822
+ .description('Detect programmatic template pages — assess SEO value without crawling all of them')
1823
+ .option('--min-group <n>', 'Minimum URLs to qualify as a template group', '10')
1824
+ .option('--sample-size <n>', 'Pages to stealth-crawl per template group', '20')
1825
+ .option('--skip-crawl', 'Skip sample crawl (pattern analysis + GSC only)')
1826
+ .option('--skip-gsc', 'Skip GSC overlay phase')
1827
+ .option('--skip-competitors', 'Skip competitor sitemap census')
1828
+ .action(async (project, opts) => {
1829
+ if (!requirePro('templates')) return;
1830
+
1831
+ console.log(chalk.bold.cyan(`\n🔍 SEO Intel — Template Analysis`));
1832
+ console.log(chalk.dim(` Project: ${project}`));
1833
+
1834
+ try {
1835
+ const { runTemplatesAnalysis } = await import('./analyses/templates/index.js');
1836
+ const report = await runTemplatesAnalysis(project, {
1837
+ minGroupSize: parseInt(opts.minGroup) || 10,
1838
+ sampleSize: parseInt(opts.sampleSize) || 20,
1839
+ skipCrawl: !!opts.skipCrawl,
1840
+ skipGsc: !!opts.skipGsc,
1841
+ skipCompetitors: !!opts.skipCompetitors,
1842
+ log: (msg) => console.log(chalk.gray(msg)),
1843
+ });
1844
+
1845
+ if (report.groups.length === 0) {
1846
+ console.log(chalk.yellow(`\n No template patterns detected.\n`));
1847
+ process.exit(0);
1848
+ }
1849
+
1850
+ // Summary
1851
+ console.log(chalk.bold.green(`\n✅ Template analysis complete`));
1852
+ console.log(chalk.dim(` ${report.stats.totalGroups} groups · ${report.stats.totalGrouped.toLocaleString()} URLs · ${(report.stats.coverage * 100).toFixed(0)}% of sitemap`));
1853
+ console.log(chalk.dim(` Run ${chalk.white('seo-intel html ' + project)} to see the full dashboard.\n`));
1854
+ } catch (err) {
1855
+ console.error(chalk.red(`\n Error: ${err.message}\n`));
1856
+ if (process.env.DEBUG) console.error(err.stack);
1857
+ process.exit(1);
1858
+ }
1677
1859
  });
1678
1860
 
1679
1861
  // ── HTML DASHBOARD ─────────────────────────────────────────────────────────
@@ -1741,10 +1923,10 @@ program
1741
1923
  }
1742
1924
  });
1743
1925
 
1744
- // ── HTML ALL-PROJECTS DASHBOARD ──────────────────────────────────────────────
1926
+ // ── HTML ALL-PROJECTS DASHBOARD (alias for html — kept for backwards compat) ──
1745
1927
  program
1746
1928
  .command('html-all')
1747
- .description('Generate a single HTML dashboard with all projects (dropdown switcher)')
1929
+ .description('Alias for "html" generates the all-projects dashboard')
1748
1930
  .action(() => {
1749
1931
  const db = getDb();
1750
1932
  const configs = loadAllConfigs();
package/crawler/index.js CHANGED
@@ -186,7 +186,7 @@ export async function* crawlDomain(startUrl, opts = {}) {
186
186
  // When hostname contains "docs.", spoof Googlebot UA to reduce WAF friction.
187
187
  const isDocsHostname = base.hostname.toLowerCase().includes('docs.');
188
188
  const GOOGLEBOT_UA = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';
189
- const defaultUA = 'Mozilla/5.0 (compatible; SEOIntelBot/1.0; +https://froggo.pro/seo-intel/bot)';
189
+ const defaultUA = 'Mozilla/5.0 (compatible; SEOIntelBot/1.0; +https://ukkometa.fi/en/seo-intel/bot)';
190
190
  const effectiveUA = isDocsHostname ? GOOGLEBOT_UA : defaultUA;
191
191
 
192
192
  async function tryLoadLlmsTxt() {
@@ -547,11 +547,16 @@ async function processPage(page, url, base, depth, queue, maxDepth) {
547
547
  // ── Quality gate — detect shells, blocked pages, empty content ──
548
548
  const quality = assessQuality({ wordCount, bodyText, title, status });
549
549
 
550
+ // Full body text for DB storage (extraction reads this); truncated for log output
551
+ const fullBodyText = sanitize(bodyText, 50000); // ~200K chars — enough for any real page
552
+ const shortBodyText = sanitize(bodyText, 2000); // compact version for logging
553
+
550
554
  return {
551
555
  url, depth, status, loadMs, wordCount, isIndexable,
552
556
  title, metaDesc, headings,
553
557
  links: [...internalLinks, ...externalLinks],
554
- bodyText: sanitize(bodyText, 2000),
558
+ bodyText: shortBodyText,
559
+ fullBodyText,
555
560
  schemaTypes, parsedSchemas, vitals, publishedDate, modifiedDate,
556
561
  contentHash: hash,
557
562
  quality: quality.ok, qualityReason: quality.reason,
@@ -181,7 +181,7 @@ async function checkHttp(hostname) {
181
181
  signal: controller.signal,
182
182
  redirect: 'follow',
183
183
  headers: {
184
- 'User-Agent': 'Mozilla/5.0 (compatible; SEOIntelBot/1.0; +https://froggo.pro/seo-intel/bot)',
184
+ 'User-Agent': 'Mozilla/5.0 (compatible; SEOIntelBot/1.0; +https://ukkometa.fi/en/seo-intel/bot)',
185
185
  },
186
186
  });
187
187