seo-intel 1.5.2 → 1.5.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +26 -0
- package/Start SEO Intel.command +10 -0
- package/analyses/aeo/scorer.js +60 -6
- package/analyses/blog-draft/index.js +62 -10
- package/analyses/templates/index.js +1 -1
- package/analysis/prompt-builder.js +167 -2
- package/analysis/technical-audit.js +177 -0
- package/cli.js +446 -25
- package/crawler/index.js +36 -2
- package/crawler/sitemap.js +44 -0
- package/db/db.js +62 -9
- package/db/schema.sql +19 -0
- package/exports/queries.js +32 -0
- package/exports/technical.js +181 -1
- package/extractor/qwen.js +135 -13
- package/lib/scan-export.js +204 -0
- package/package.json +1 -1
- package/reports/generate-html.js +517 -50
- package/server.js +319 -25
- package/setup/checks.js +65 -5
- package/setup/engine.js +1 -0
- package/setup/web-routes.js +22 -3
- package/setup/wizard.html +8 -6
package/crawler/index.js
CHANGED
|
@@ -263,6 +263,10 @@ export async function* crawlDomain(startUrl, opts = {}) {
|
|
|
263
263
|
// ── Sitemap-first: seed queue from sitemap.xml (section-aware) ──
|
|
264
264
|
try {
|
|
265
265
|
const sitemapUrls = await fetchSitemap(startUrl);
|
|
266
|
+
// Report full sitemap inventory to caller (for DB persistence / audit diff)
|
|
267
|
+
if (sitemapUrls.length > 0 && typeof opts.onSitemapDiscovered === 'function') {
|
|
268
|
+
try { await opts.onSitemapDiscovered(sitemapUrls); } catch { /* ignore */ }
|
|
269
|
+
}
|
|
266
270
|
if (sitemapUrls.length > 0) {
|
|
267
271
|
// Apply section budgets if tiered crawling is enabled
|
|
268
272
|
const budgeted = tiered ? applySectionBudgets(sitemapUrls, maxPages) : sitemapUrls;
|
|
@@ -452,9 +456,36 @@ async function processPage(page, url, base, depth, queue, maxDepth) {
|
|
|
452
456
|
status = res?.status() || 0;
|
|
453
457
|
const loadMs = Date.now() - t0;
|
|
454
458
|
|
|
459
|
+
// ── Final URL after redirects ──
|
|
460
|
+
let finalUrl = null;
|
|
461
|
+
try { finalUrl = page.url() || null; } catch { /* ignore */ }
|
|
462
|
+
|
|
463
|
+
// ── Redirect chain (walk request.redirectedFrom() backwards) ──
|
|
464
|
+
const redirectChain = [];
|
|
465
|
+
try {
|
|
466
|
+
let req = res?.request();
|
|
467
|
+
const chain = [];
|
|
468
|
+
while (req) {
|
|
469
|
+
const prev = req.redirectedFrom?.();
|
|
470
|
+
if (!prev) break;
|
|
471
|
+
const prevRes = await prev.response().catch(() => null);
|
|
472
|
+
chain.push({ url: prev.url(), status: prevRes?.status() ?? null });
|
|
473
|
+
req = prev;
|
|
474
|
+
}
|
|
475
|
+
// chain is in reverse order (closest redirect first); reverse for chronological
|
|
476
|
+
redirectChain.push(...chain.reverse());
|
|
477
|
+
} catch { /* ignore */ }
|
|
478
|
+
|
|
479
|
+
// ── X-Robots-Tag header ──
|
|
480
|
+
let xRobotsTag = null;
|
|
481
|
+
try {
|
|
482
|
+
const headers = res?.headers?.() || {};
|
|
483
|
+
xRobotsTag = headers['x-robots-tag'] || null;
|
|
484
|
+
} catch { /* ignore */ }
|
|
485
|
+
|
|
455
486
|
// ── Return status for backoff logic (don't silently drop 4xx) ──
|
|
456
487
|
if (status === 429 || status === 503 || status === 403) {
|
|
457
|
-
return { url, depth, status, loadMs, wordCount: 0, isIndexable: false, title: '', metaDesc: '', headings: [], links: [], bodyText: '', schemaTypes: [], vitals: {}, publishedDate: null, modifiedDate: null, contentHash: null };
|
|
488
|
+
return { url, depth, status, loadMs, wordCount: 0, isIndexable: false, title: '', metaDesc: '', headings: [], links: [], bodyText: '', schemaTypes: [], vitals: {}, publishedDate: null, modifiedDate: null, contentHash: null, finalUrl, redirectChain, xRobotsTag };
|
|
458
489
|
}
|
|
459
490
|
if (status >= 400) return null;
|
|
460
491
|
|
|
@@ -507,7 +538,9 @@ async function processPage(page, url, base, depth, queue, maxDepth) {
|
|
|
507
538
|
const wordCount = await page.$eval('body', el => el.innerText.split(/\s+/).filter(Boolean).length).catch(() => 0);
|
|
508
539
|
|
|
509
540
|
const robotsMeta = await page.$eval('meta[name="robots"]', el => el.content).catch(() => '');
|
|
510
|
-
const
|
|
541
|
+
const metaNoindex = robotsMeta.toLowerCase().includes('noindex');
|
|
542
|
+
const headerNoindex = (xRobotsTag || '').toLowerCase().includes('noindex');
|
|
543
|
+
const isIndexable = !(metaNoindex || headerNoindex);
|
|
511
544
|
const hasCanonical = await page.$('link[rel="canonical"]').then(el => !!el).catch(() => false);
|
|
512
545
|
const hasOgTags = await page.$('meta[property^="og:"]').then(el => !!el).catch(() => false);
|
|
513
546
|
|
|
@@ -576,6 +609,7 @@ async function processPage(page, url, base, depth, queue, maxDepth) {
|
|
|
576
609
|
hasCanonical, hasOgTags,
|
|
577
610
|
hasRobots: !!robotsMeta,
|
|
578
611
|
hasSchema: schemaTypes.length > 0,
|
|
612
|
+
finalUrl, redirectChain, xRobotsTag,
|
|
579
613
|
};
|
|
580
614
|
}
|
|
581
615
|
|
package/crawler/sitemap.js
CHANGED
|
@@ -101,3 +101,47 @@ function extractTagContent(xml, tagName) {
|
|
|
101
101
|
}
|
|
102
102
|
return results;
|
|
103
103
|
}
|
|
104
|
+
|
|
105
|
+
/**
 * Probe a single URL with a HEAD request, leaving redirects unfollowed.
 *
 * @param {string} url - Address to request.
 * @param {{ timeoutMs?: number }} [options] - `timeoutMs` is how long to wait
 *   before the request is aborted (default 8000).
 * @returns {Promise<{status: number, location: (string|null)} | {status: number, error: string}>}
 *   On a response: its status plus the `Location` header (null when absent).
 *   On any failure: `{ status: 0, error }` — this function never rejects.
 */
export async function headCheck(url, { timeoutMs = 8000 } = {}) {
  let abortTimer;
  try {
    const controller = new AbortController();
    abortTimer = setTimeout(() => controller.abort(), timeoutMs);

    const response = await fetch(url, {
      method: 'HEAD',
      redirect: 'manual',
      signal: controller.signal,
      headers: { 'User-Agent': 'SEOIntelBot/1.0' },
    });

    const location = response.headers.get('location') || null;
    return { status: response.status, location };
  } catch (err) {
    return { status: 0, error: err.message };
  } finally {
    // Runs on both success and failure so the abort timer never outlives the request.
    clearTimeout(abortTimer);
  }
}
|
|
128
|
+
|
|
129
|
+
/**
 * Run HEAD checks against an array of sitemap URL rows in parallel (capped).
 *
 * Workers pull rows from a shared queue until it is empty, so at most
 * `concurrency` checks are in flight at once.
 *
 * @param {Array<{id: *, url: string}>} rows - Rows to check. A null/undefined
 *   list is treated as empty; falsy entries inside the list are skipped.
 * @param {object} [options]
 * @param {number} [options.concurrency=6] - Maximum simultaneous checks.
 *   Values below 1 (or non-numeric values) fall back to a single worker so
 *   the rows are still processed instead of being silently ignored.
 * @param {Function} [options.onResult] - Invoked as `onResult(row, result)`
 *   after each check; anything it throws is ignored (best-effort reporting).
 * @returns {Promise<void>} Resolves once every row has been checked.
 */
export async function headCheckAll(rows, { concurrency = 6, onResult } = {}) {
  const queue = rows == null ? [] : [...rows];
  if (queue.length === 0) return;

  // NaN, 0 and negatives would otherwise yield zero workers and skip all rows.
  const requested = Math.floor(Number(concurrency));
  const workerCount = Math.min(requested >= 1 ? requested : 1, queue.length);

  const worker = async () => {
    while (queue.length > 0) {
      const row = queue.shift();
      // Skip holes/nulls rather than stopping: later rows still need checking.
      if (!row) continue;
      const result = await headCheck(row.url);
      if (typeof onResult === 'function') {
        try {
          await onResult(row, result);
        } catch {
          /* swallow: a failing callback must not abort the remaining checks */
        }
      }
    }
  };

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
}
|
package/db/db.js
CHANGED
|
@@ -24,7 +24,11 @@ export function getDb(dbPath = './seo-intel.db') {
|
|
|
24
24
|
try { _db.exec('ALTER TABLE pages ADD COLUMN title TEXT'); } catch { /* already exists */ }
|
|
25
25
|
try { _db.exec('ALTER TABLE pages ADD COLUMN meta_desc TEXT'); } catch { /* already exists */ }
|
|
26
26
|
try { _db.exec('ALTER TABLE pages ADD COLUMN body_text TEXT'); } catch { /* already exists */ }
|
|
27
|
+
try { _db.exec('ALTER TABLE pages ADD COLUMN final_url TEXT'); } catch { /* already exists */ }
|
|
28
|
+
try { _db.exec('ALTER TABLE pages ADD COLUMN redirect_chain TEXT'); } catch { /* already exists */ }
|
|
29
|
+
try { _db.exec('ALTER TABLE pages ADD COLUMN x_robots_tag TEXT'); } catch { /* already exists */ }
|
|
27
30
|
try { _db.exec('ALTER TABLE analyses ADD COLUMN technical_gaps TEXT'); } catch { /* already exists */ }
|
|
31
|
+
try { _db.exec('ALTER TABLE extractions ADD COLUMN intent_scores TEXT'); } catch { /* already exists */ }
|
|
28
32
|
|
|
29
33
|
// Backfill first_seen_at from crawled_at for existing rows
|
|
30
34
|
_db.exec('UPDATE pages SET first_seen_at = crawled_at WHERE first_seen_at IS NULL');
|
|
@@ -279,12 +283,13 @@ function normalizePageUrl(rawUrl) {
|
|
|
279
283
|
} catch { return rawUrl; }
|
|
280
284
|
}
|
|
281
285
|
|
|
282
|
-
export function upsertPage(db, { domainId, url, statusCode, wordCount, loadMs, isIndexable, clickDepth = 0, publishedDate = null, modifiedDate = null, contentHash = null, title = null, metaDesc = null, bodyText = null }) {
|
|
286
|
+
export function upsertPage(db, { domainId, url, statusCode, wordCount, loadMs, isIndexable, clickDepth = 0, publishedDate = null, modifiedDate = null, contentHash = null, title = null, metaDesc = null, bodyText = null, finalUrl = null, redirectChain = null, xRobotsTag = null }) {
|
|
283
287
|
url = normalizePageUrl(url);
|
|
284
288
|
const now = Date.now();
|
|
289
|
+
const redirectChainJson = redirectChain ? JSON.stringify(redirectChain) : null;
|
|
285
290
|
db.prepare(`
|
|
286
|
-
INSERT INTO pages (domain_id, url, crawled_at, first_seen_at, status_code, word_count, load_ms, is_indexable, click_depth, published_date, modified_date, content_hash, title, meta_desc, body_text)
|
|
287
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
291
|
+
INSERT INTO pages (domain_id, url, crawled_at, first_seen_at, status_code, word_count, load_ms, is_indexable, click_depth, published_date, modified_date, content_hash, title, meta_desc, body_text, final_url, redirect_chain, x_robots_tag)
|
|
292
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
288
293
|
ON CONFLICT(url) DO UPDATE SET
|
|
289
294
|
crawled_at = excluded.crawled_at,
|
|
290
295
|
status_code = excluded.status_code,
|
|
@@ -296,8 +301,11 @@ export function upsertPage(db, { domainId, url, statusCode, wordCount, loadMs, i
|
|
|
296
301
|
content_hash = excluded.content_hash,
|
|
297
302
|
title = excluded.title,
|
|
298
303
|
meta_desc = excluded.meta_desc,
|
|
299
|
-
body_text = excluded.body_text
|
|
300
|
-
|
|
304
|
+
body_text = excluded.body_text,
|
|
305
|
+
final_url = excluded.final_url,
|
|
306
|
+
redirect_chain = excluded.redirect_chain,
|
|
307
|
+
x_robots_tag = excluded.x_robots_tag
|
|
308
|
+
`).run(domainId, url, now, now, statusCode, wordCount, loadMs, isIndexable ? 1 : 0, clickDepth, publishedDate, modifiedDate, contentHash, title || null, metaDesc || null, bodyText || null, finalUrl || null, redirectChainJson, xRobotsTag || null);
|
|
301
309
|
// first_seen_at is NOT in the ON CONFLICT UPDATE — it stays from original INSERT
|
|
302
310
|
return db.prepare('SELECT id FROM pages WHERE url = ?').get(url);
|
|
303
311
|
}
|
|
@@ -327,14 +335,15 @@ export function insertExtraction(db, { pageId, data }) {
|
|
|
327
335
|
return db.prepare(`
|
|
328
336
|
INSERT OR REPLACE INTO extractions
|
|
329
337
|
(page_id, title, meta_desc, h1, product_type, pricing_tier, cta_primary,
|
|
330
|
-
tech_stack, schema_types, search_intent, primary_entities, extracted_at)
|
|
331
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
338
|
+
tech_stack, schema_types, search_intent, intent_scores, primary_entities, extracted_at)
|
|
339
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
332
340
|
`).run(
|
|
333
341
|
pageId, data.title, data.meta_desc, data.h1,
|
|
334
342
|
data.product_type, data.pricing_tier, data.cta_primary,
|
|
335
343
|
JSON.stringify(data.tech_stack || []),
|
|
336
344
|
JSON.stringify(data.schema_types || []),
|
|
337
345
|
data.search_intent || 'Informational',
|
|
346
|
+
JSON.stringify(data.intent_scores || {}),
|
|
338
347
|
JSON.stringify(data.primary_entities || []),
|
|
339
348
|
Date.now()
|
|
340
349
|
);
|
|
@@ -421,10 +430,14 @@ export function getSchemasByProject(db, project) {
|
|
|
421
430
|
}
|
|
422
431
|
|
|
423
432
|
export function getCompetitorSummary(db, project) {
|
|
433
|
+
// target + owned rows are merged into a single 'target' row.
|
|
434
|
+
// This handles the common case where the target domain (e.g. dgents.ai) redirects
|
|
435
|
+
// to www.dgents.ai, which gets crawled as an owned subdomain — the parallel crawl
|
|
436
|
+
// race means pages end up under 'owned', leaving the target with 0 pages.
|
|
424
437
|
return db.prepare(`
|
|
425
438
|
SELECT
|
|
426
439
|
d.domain,
|
|
427
|
-
d.role,
|
|
440
|
+
CASE WHEN d.role IN ('target', 'owned') THEN 'target' ELSE d.role END AS role,
|
|
428
441
|
COUNT(DISTINCT p.id) as page_count,
|
|
429
442
|
AVG(p.word_count) as avg_word_count,
|
|
430
443
|
GROUP_CONCAT(DISTINCT e.product_type) as product_types,
|
|
@@ -434,7 +447,9 @@ export function getCompetitorSummary(db, project) {
|
|
|
434
447
|
JOIN pages p ON p.domain_id = d.id
|
|
435
448
|
LEFT JOIN extractions e ON e.page_id = p.id
|
|
436
449
|
WHERE d.project = ?
|
|
437
|
-
GROUP BY
|
|
450
|
+
GROUP BY
|
|
451
|
+
CASE WHEN d.role IN ('target', 'owned') THEN 'target-group' ELSE d.domain END,
|
|
452
|
+
CASE WHEN d.role IN ('target', 'owned') THEN 'target' ELSE d.role END
|
|
438
453
|
`).all(project);
|
|
439
454
|
}
|
|
440
455
|
|
|
@@ -538,6 +553,41 @@ export function getTemplateSamples(db, groupId) {
|
|
|
538
553
|
).all(groupId);
|
|
539
554
|
}
|
|
540
555
|
|
|
556
|
+
// ── Sitemap URL inventory ─────────────────────────────────────────────────
|
|
557
|
+
|
|
558
|
+
/**
 * Insert or refresh sitemap inventory rows for one domain in a single transaction.
 *
 * Each URL is passed through `normalizePageUrl` before being written. On a
 * (domain_id, url) conflict the row keeps its previous `sitemap_source` when
 * the new one is null, and `discovered_at` is refreshed.
 *
 * @param {object} db - Database handle exposing `prepare()` and `exec()`.
 * @param {number} domainId - Owning domain id.
 * @param {string[]} urls - URLs declared in the sitemap.
 * @param {string|null} [sitemapSource=null] - Sitemap file the URLs came from.
 * @returns {number} Number of input URLs processed (0 for an empty/missing list).
 * @throws Rethrows the error that made the batch fail, after attempting a rollback.
 */
export function upsertSitemapUrls(db, domainId, urls, sitemapSource = null) {
  if (!urls || !urls.length) return 0;

  const now = Date.now();
  const stmt = db.prepare(`
    INSERT INTO sitemap_urls (domain_id, url, sitemap_source, discovered_at)
    VALUES (?, ?, ?, ?)
    ON CONFLICT(domain_id, url) DO UPDATE SET
      sitemap_source = COALESCE(excluded.sitemap_source, sitemap_urls.sitemap_source),
      discovered_at = excluded.discovered_at
  `);

  db.exec('BEGIN');
  try {
    for (const u of urls) {
      stmt.run(domainId, normalizePageUrl(u), sitemapSource, now);
    }
    db.exec('COMMIT');
  } catch (err) {
    // ROLLBACK itself can fail (e.g. the engine already rolled the transaction
    // back); never let that replace the error that actually caused the failure.
    try {
      db.exec('ROLLBACK');
    } catch {
      /* keep the original error */
    }
    throw err;
  }
  return urls.length;
}
|
|
578
|
+
|
|
579
|
+
/**
 * Load every `sitemap_urls` row stored for one domain.
 *
 * @param {object} db - Database handle exposing `prepare()`.
 * @param {number} domainId - Domain id to filter on.
 * @returns {Array<object>} Whatever the prepared statement's `all()` yields.
 */
export function getSitemapUrlsForDomain(db, domainId) {
  const selectForDomain = db.prepare('SELECT * FROM sitemap_urls WHERE domain_id = ?');
  return selectForDomain.all(domainId);
}
|
|
584
|
+
|
|
585
|
+
/**
 * Store the outcome of a HEAD check on one `sitemap_urls` row.
 *
 * @param {object} db - Database handle exposing `prepare()`.
 * @param {number} id - Primary key of the row to update.
 * @param {{ status?: number, location?: string }} result - HEAD result; a
 *   missing `status` or `location` is written as NULL.
 * @returns {void}
 */
export function updateSitemapHeadResult(db, id, { status, location }) {
  const updateHead = db.prepare(
    'UPDATE sitemap_urls SET head_status = ?, head_location = ?, head_checked_at = ? WHERE id = ?'
  );
  const headStatus = status ?? null;
  const headLocation = location ?? null;
  const checkedAt = Date.now();
  updateHead.run(headStatus, headLocation, checkedAt, id);
}
|
|
590
|
+
|
|
541
591
|
// ── Domain sync / prune ───────────────────────────────────────────────────
|
|
542
592
|
|
|
543
593
|
/**
|
|
@@ -576,6 +626,9 @@ export function pruneStaleDomains(db, project, configDomains) {
|
|
|
576
626
|
db.prepare(`DELETE FROM pages WHERE domain_id = ?`).run(id);
|
|
577
627
|
}
|
|
578
628
|
|
|
629
|
+
// Sitemap URLs for this domain
|
|
630
|
+
try { db.prepare('DELETE FROM sitemap_urls WHERE domain_id = ?').run(id); } catch { /* table may not exist */ }
|
|
631
|
+
|
|
579
632
|
// Template groups for this domain
|
|
580
633
|
db.prepare(
|
|
581
634
|
'DELETE FROM template_samples WHERE group_id IN (SELECT id FROM template_groups WHERE project = ? AND domain = ?)'
|
package/db/schema.sql
CHANGED
|
@@ -26,6 +26,9 @@ CREATE TABLE IF NOT EXISTS pages (
|
|
|
26
26
|
title TEXT, -- page <title>
|
|
27
27
|
meta_desc TEXT, -- meta description
|
|
28
28
|
body_text TEXT, -- cleaned body text for extraction (stored at crawl time)
|
|
29
|
+
final_url TEXT, -- URL after redirects (page.url() post-nav)
|
|
30
|
+
redirect_chain TEXT, -- JSON array of [{url, status}] hops, empty array if none
|
|
31
|
+
x_robots_tag TEXT, -- X-Robots-Tag response header value (raw)
|
|
29
32
|
FOREIGN KEY (domain_id) REFERENCES domains(id)
|
|
30
33
|
);
|
|
31
34
|
|
|
@@ -41,6 +44,7 @@ CREATE TABLE IF NOT EXISTS extractions (
|
|
|
41
44
|
tech_stack TEXT, -- JSON array
|
|
42
45
|
schema_types TEXT, -- JSON array (Article, Product, FAQ, etc.)
|
|
43
46
|
search_intent TEXT, -- 'Informational' | 'Navigational' | 'Commercial' | 'Transactional'
|
|
47
|
+
intent_scores TEXT, -- JSON object: {"commercial":70,"informational":20,"comparison":10}
|
|
44
48
|
primary_entities TEXT, -- JSON array of 3-7 core concept strings
|
|
45
49
|
extracted_at INTEGER NOT NULL
|
|
46
50
|
);
|
|
@@ -194,6 +198,21 @@ CREATE TABLE IF NOT EXISTS citability_scores (
|
|
|
194
198
|
|
|
195
199
|
CREATE INDEX IF NOT EXISTS idx_citability_page ON citability_scores(page_id);
|
|
196
200
|
|
|
201
|
+
-- Sitemap URL inventory (one row per URL declared in a sitemap)
|
|
202
|
+
CREATE TABLE IF NOT EXISTS sitemap_urls (
|
|
203
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
204
|
+
domain_id INTEGER NOT NULL REFERENCES domains(id),
|
|
205
|
+
url TEXT NOT NULL,
|
|
206
|
+
sitemap_source TEXT, -- which sitemap file this came from
|
|
207
|
+
discovered_at INTEGER NOT NULL,
|
|
208
|
+
head_status INTEGER, -- HTTP status from HEAD check (null until audit runs)
|
|
209
|
+
head_location TEXT, -- Location header when redirected
|
|
210
|
+
head_checked_at INTEGER,
|
|
211
|
+
UNIQUE(domain_id, url)
|
|
212
|
+
);
|
|
213
|
+
|
|
214
|
+
CREATE INDEX IF NOT EXISTS idx_sitemap_urls_domain ON sitemap_urls(domain_id);
|
|
215
|
+
|
|
197
216
|
-- Indexes
|
|
198
217
|
CREATE INDEX IF NOT EXISTS idx_pages_domain ON pages(domain_id);
|
|
199
218
|
CREATE INDEX IF NOT EXISTS idx_keywords_page ON keywords(page_id);
|
package/exports/queries.js
CHANGED
|
@@ -76,6 +76,9 @@ export function getTechnicalDataset(db, project) {
|
|
|
76
76
|
p.word_count,
|
|
77
77
|
p.click_depth,
|
|
78
78
|
p.is_indexable,
|
|
79
|
+
p.title,
|
|
80
|
+
p.published_date,
|
|
81
|
+
p.modified_date,
|
|
79
82
|
d.domain,
|
|
80
83
|
d.role,
|
|
81
84
|
COALESCE(e.meta_desc, '') AS meta_desc,
|
|
@@ -86,6 +89,17 @@ export function getTechnicalDataset(db, project) {
|
|
|
86
89
|
COALESCE((SELECT COUNT(*) FROM page_schemas ps WHERE ps.page_id = p.id), 0) AS schema_count,
|
|
87
90
|
COALESCE((SELECT COUNT(*) FROM page_schemas ps WHERE ps.page_id = p.id AND LOWER(ps.schema_type) = 'breadcrumblist'), 0) AS breadcrumb_count,
|
|
88
91
|
COALESCE((SELECT COUNT(*) FROM headings h WHERE h.page_id = p.id AND h.level = 1), 0) AS h1_count,
|
|
92
|
+
COALESCE((SELECT COUNT(*) FROM headings h WHERE h.page_id = p.id AND h.level = 1), 0) > 1 AS has_multiple_h1,
|
|
93
|
+
COALESCE((SELECT COUNT(*) FROM page_schemas ps WHERE ps.page_id = p.id AND LOWER(ps.schema_type) IN ('faqpage', 'faq')), 0) AS faq_schema_count,
|
|
94
|
+
COALESCE((SELECT COUNT(*) FROM page_schemas ps WHERE ps.page_id = p.id AND LOWER(ps.schema_type) = 'howto'), 0) AS howto_schema_count,
|
|
95
|
+
COALESCE((
|
|
96
|
+
SELECT COUNT(*) FROM headings h
|
|
97
|
+
WHERE h.page_id = p.id AND h.level IN (2, 3)
|
|
98
|
+
AND (h.text LIKE 'what %' OR h.text LIKE 'how %' OR h.text LIKE 'why %'
|
|
99
|
+
OR h.text LIKE 'when %' OR h.text LIKE 'which %' OR h.text LIKE 'can %'
|
|
100
|
+
OR h.text LIKE 'does %' OR h.text LIKE 'is %' OR h.text LIKE 'are %'
|
|
101
|
+
OR h.text LIKE '%?')
|
|
102
|
+
), 0) AS question_heading_count,
|
|
89
103
|
COALESCE((
|
|
90
104
|
SELECT COUNT(*)
|
|
91
105
|
FROM links l
|
|
@@ -115,6 +129,24 @@ export function getTechnicalDataset(db, project) {
|
|
|
115
129
|
`).all(project);
|
|
116
130
|
}
|
|
117
131
|
|
|
132
|
+
/**
 * Get keywords associated with pages missing a specific schema type.
 * Used to show "Missing FAQ Schema → Low PAA chance for query X".
 *
 * The ids are de-duplicated, sorted and queried in batches so the number of
 * bound parameters per statement stays below SQLite's variable limit no
 * matter how many pages are flagged. Because batches are taken from the
 * sorted id list and each batch is ordered by (page_id, location), the
 * concatenated result keeps the same overall ordering as a single query.
 *
 * @param {object} db - Database handle exposing `prepare()`.
 * @param {string} project - Accepted for interface compatibility; the query
 *   is scoped by `pageIds` only.
 * @param {number[]} pageIds - Page ids to look up (assumed numeric — the
 *   batch ordering relies on a numeric sort). Null/empty returns `[]`.
 * @returns {Array<object>} Rows of { keyword, location, page_id, url, search_intent }.
 */
export function getKeywordsForSchemaDeficientPages(db, project, pageIds) {
  if (!pageIds || !pageIds.length) return [];

  // Stay under the historical SQLite default of 999 bound variables.
  const MAX_IDS_PER_QUERY = 900;
  const ids = [...new Set(pageIds)].sort((a, b) => a - b);

  const results = [];
  for (let start = 0; start < ids.length; start += MAX_IDS_PER_QUERY) {
    const batch = ids.slice(start, start + MAX_IDS_PER_QUERY);
    const placeholders = batch.map(() => '?').join(',');
    const rows = db.prepare(`
      SELECT k.keyword, k.location, k.page_id, p.url,
        e.search_intent
      FROM keywords k
      JOIN pages p ON p.id = k.page_id
      LEFT JOIN extractions e ON e.page_id = p.id
      WHERE k.page_id IN (${placeholders})
      ORDER BY k.page_id, k.location
    `).all(...batch);
    for (const row of rows) results.push(row);
  }
  return results;
}
|
|
149
|
+
|
|
118
150
|
export function getSchemaCoverage(db, project, vsDomain = null) {
|
|
119
151
|
const params = [project];
|
|
120
152
|
let competitorFilter = '';
|
package/exports/technical.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { collectTop, inferPriorityFromCount, makeAction, sortActions } from './heuristics.js';
|
|
2
|
-
import { getTechnicalDataset } from './queries.js';
|
|
2
|
+
import { getTechnicalDataset, getKeywordsForSchemaDeficientPages } from './queries.js';
|
|
3
3
|
|
|
4
4
|
export function buildTechnicalActions(db, project) {
|
|
5
5
|
const rows = getTechnicalDataset(db, project);
|
|
@@ -176,5 +176,185 @@ export function buildTechnicalActions(db, project) {
|
|
|
176
176
|
}));
|
|
177
177
|
}
|
|
178
178
|
|
|
179
|
+
// ── Title length issues ──────────────────────────────────────────────────
|
|
180
|
+
const titleTooLong = rows.filter(r =>
|
|
181
|
+
r.title && r.title.length > 65 && Number(r.status_code) < 400 && r.is_indexable
|
|
182
|
+
);
|
|
183
|
+
if (titleTooLong.length) {
|
|
184
|
+
actions.push(makeAction({
|
|
185
|
+
id: 'technical-title-too-long',
|
|
186
|
+
type: 'improve',
|
|
187
|
+
priority: inferPriorityFromCount(titleTooLong.length, { critical: 20, high: 8, medium: 3 }),
|
|
188
|
+
area: 'content',
|
|
189
|
+
title: `Shorten page titles on ${titleTooLong.length} pages exceeding 65 characters`,
|
|
190
|
+
why: 'Titles over 65 characters are truncated in SERPs, hiding your key message and reducing CTR.',
|
|
191
|
+
evidence: collectTop(titleTooLong.map(r => `${r.url} (${r.title.length} chars)`), 8),
|
|
192
|
+
implementationHints: [
|
|
193
|
+
'Keep titles under 60–65 characters to avoid SERP truncation.',
|
|
194
|
+
'Lead with the primary keyword and brand separator at the end.',
|
|
195
|
+
],
|
|
196
|
+
}));
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
const titleTooShort = rows.filter(r =>
|
|
200
|
+
r.title && r.title.length < 30 && Number(r.status_code) < 400 && r.is_indexable
|
|
201
|
+
);
|
|
202
|
+
if (titleTooShort.length) {
|
|
203
|
+
actions.push(makeAction({
|
|
204
|
+
id: 'technical-title-too-short',
|
|
205
|
+
type: 'improve',
|
|
206
|
+
priority: inferPriorityFromCount(titleTooShort.length, { critical: 15, high: 6, medium: 2 }),
|
|
207
|
+
area: 'content',
|
|
208
|
+
title: `Expand thin page titles on ${titleTooShort.length} pages under 30 characters`,
|
|
209
|
+
why: 'Very short titles waste valuable SERP real estate and under-signal page relevance to search engines.',
|
|
210
|
+
evidence: collectTop(titleTooShort.map(r => `${r.url} ("${r.title}")`), 8),
|
|
211
|
+
implementationHints: [
|
|
212
|
+
'Include the primary keyword, secondary modifier, and brand in the title.',
|
|
213
|
+
'Target 50–60 characters for maximum SERP visibility.',
|
|
214
|
+
],
|
|
215
|
+
}));
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
// ── Missing date metadata ────────────────────────────────────────────────
|
|
219
|
+
const missingDates = rows.filter(r =>
|
|
220
|
+
!r.published_date && !r.modified_date &&
|
|
221
|
+
(r.word_count || 0) >= 500 &&
|
|
222
|
+
Number(r.status_code) < 400 && r.is_indexable
|
|
223
|
+
);
|
|
224
|
+
if (missingDates.length) {
|
|
225
|
+
actions.push(makeAction({
|
|
226
|
+
id: 'technical-missing-dates',
|
|
227
|
+
type: 'improve',
|
|
228
|
+
priority: inferPriorityFromCount(missingDates.length, { critical: 20, high: 8, medium: 3 }),
|
|
229
|
+
area: 'schema',
|
|
230
|
+
title: `Add publish/modified dates to ${missingDates.length} content pages`,
|
|
231
|
+
why: 'Date metadata in schema and HTML signals freshness to AI models and search engines, boosting citability and freshness scoring.',
|
|
232
|
+
evidence: collectTop(missingDates.map(r => `${r.url} (${r.word_count} words)`), 8),
|
|
233
|
+
implementationHints: [
|
|
234
|
+
'Add datePublished and dateModified in Article/BlogPosting/NewsArticle schema JSON-LD.',
|
|
235
|
+
'Include <time datetime="..."> or meta date tags in the HTML head.',
|
|
236
|
+
'Keep dateModified updated on meaningful content revisions.',
|
|
237
|
+
],
|
|
238
|
+
}));
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
// ── FAQ content without FAQPage schema ──────────────────────────────────
|
|
242
|
+
const faqContentNoSchema = rows.filter(r =>
|
|
243
|
+
r.question_heading_count >= 3 && !r.faq_schema_count &&
|
|
244
|
+
Number(r.status_code) < 400 && r.is_indexable
|
|
245
|
+
);
|
|
246
|
+
if (faqContentNoSchema.length) {
|
|
247
|
+
// Enrich with affected keywords to show SERP impact
|
|
248
|
+
const faqPageIds = faqContentNoSchema.map(r => r.id);
|
|
249
|
+
const faqKeywords = getKeywordsForSchemaDeficientPages(db, project, faqPageIds);
|
|
250
|
+
const faqImpact = faqKeywords
|
|
251
|
+
.filter(k => k.location === 'h2' || k.location === 'h1')
|
|
252
|
+
.slice(0, 5)
|
|
253
|
+
.map(k => `"${k.keyword}" on ${k.url.replace(/^https?:\/\/[^/]+/, '')} → low People Also Ask chance without FAQ schema`);
|
|
254
|
+
|
|
255
|
+
actions.push(makeAction({
|
|
256
|
+
id: 'technical-faq-content-no-schema',
|
|
257
|
+
type: 'add_schema',
|
|
258
|
+
priority: inferPriorityFromCount(faqContentNoSchema.length, { critical: 10, high: 4, medium: 2 }),
|
|
259
|
+
area: 'schema',
|
|
260
|
+
title: `Add FAQPage schema to ${faqContentNoSchema.length} pages with Q&A content`,
|
|
261
|
+
why: 'Pages with multiple question headings but no FAQPage schema miss FAQ rich results and lose AI citability score.',
|
|
262
|
+
evidence: collectTop(faqContentNoSchema.map(r => `${r.url} (${r.question_heading_count} question headings)`), 8),
|
|
263
|
+
impact: faqImpact.length ? faqImpact : undefined,
|
|
264
|
+
implementationHints: [
|
|
265
|
+
'Wrap each question heading + answer paragraph in FAQPage JSON-LD with Question/Answer entities.',
|
|
266
|
+
'Keep answers under 300 words each — Google truncates longer ones in rich results.',
|
|
267
|
+
],
|
|
268
|
+
}));
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
// ── HowTo content without HowTo schema ──────────────────────────────────
|
|
272
|
+
const howtoContentNoSchema = rows.filter(r => {
|
|
273
|
+
const title = String(r.title || '').toLowerCase();
|
|
274
|
+
const h1 = String(r.h1 || '').toLowerCase();
|
|
275
|
+
const hasHowToSignal = /\bhow to\b|\bstep[- ]by[- ]step\b|\bsetup guide\b|\binstall guide\b/.test(title) ||
|
|
276
|
+
/\bhow to\b|\bstep[- ]by[- ]step\b|\bsetup guide\b|\binstall guide\b/.test(h1);
|
|
277
|
+
return hasHowToSignal && !r.howto_schema_count &&
|
|
278
|
+
Number(r.status_code) < 400 && r.is_indexable;
|
|
279
|
+
});
|
|
280
|
+
if (howtoContentNoSchema.length) {
|
|
281
|
+
const howtoPageIds = howtoContentNoSchema.map(r => r.id);
|
|
282
|
+
const howtoKeywords = getKeywordsForSchemaDeficientPages(db, project, howtoPageIds);
|
|
283
|
+
const howtoImpact = howtoKeywords
|
|
284
|
+
.filter(k => k.location === 'title' || k.location === 'h1')
|
|
285
|
+
.slice(0, 5)
|
|
286
|
+
.map(k => `"${k.keyword}" → missing HowTo rich result (step-by-step carousel)`);
|
|
287
|
+
|
|
288
|
+
actions.push(makeAction({
|
|
289
|
+
id: 'technical-howto-content-no-schema',
|
|
290
|
+
type: 'add_schema',
|
|
291
|
+
priority: inferPriorityFromCount(howtoContentNoSchema.length, { critical: 8, high: 3, medium: 1 }),
|
|
292
|
+
area: 'schema',
|
|
293
|
+
title: `Add HowTo schema to ${howtoContentNoSchema.length} step-by-step guide pages`,
|
|
294
|
+
why: 'How-to guides without HowTo schema miss rich results and rank lower for procedural queries.',
|
|
295
|
+
evidence: collectTop(howtoContentNoSchema.map(r => `${r.url}`), 8),
|
|
296
|
+
impact: howtoImpact.length ? howtoImpact : undefined,
|
|
297
|
+
implementationHints: [
|
|
298
|
+
'Wrap numbered steps in HowTo JSON-LD with HowToStep entities.',
|
|
299
|
+
'Include tool, supply, and time/cost fields where applicable.',
|
|
300
|
+
],
|
|
301
|
+
}));
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
// ── Multiple H1 headings ─────────────────────────────────────────────────
|
|
305
|
+
const multipleH1 = rows.filter(r =>
|
|
306
|
+
r.has_multiple_h1 && Number(r.status_code) < 400 && r.is_indexable
|
|
307
|
+
);
|
|
308
|
+
if (multipleH1.length) {
|
|
309
|
+
actions.push(makeAction({
|
|
310
|
+
id: 'technical-multiple-h1',
|
|
311
|
+
type: 'fix',
|
|
312
|
+
priority: inferPriorityFromCount(multipleH1.length, { critical: 15, high: 6, medium: 2 }),
|
|
313
|
+
area: 'content',
|
|
314
|
+
title: `Fix multiple H1 headings on ${multipleH1.length} pages`,
|
|
315
|
+
why: 'Multiple H1s dilute topical focus and create ambiguity about the primary page topic for search engines.',
|
|
316
|
+
evidence: collectTop(multipleH1.map(r => r.url), 10),
|
|
317
|
+
implementationHints: [
|
|
318
|
+
'Keep exactly one H1 that matches the page\'s primary keyword intent.',
|
|
319
|
+
'Demote secondary H1s to H2 or H3 as appropriate.',
|
|
320
|
+
],
|
|
321
|
+
}));
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
// ── Homepage links to external sites (nav leak) ──────────────────────
|
|
325
|
+
// Flag when homepage has external links in nav-like positions (anchor text
|
|
326
|
+
// suggests navigation: short text like "Deck", "Docs", "Blog" etc.)
|
|
327
|
+
const homepage = rows.find(r => {
|
|
328
|
+
const path = new URL(r.url).pathname;
|
|
329
|
+
return (path === '/' || path === '') && Number(r.status_code) < 400;
|
|
330
|
+
});
|
|
331
|
+
if (homepage) {
|
|
332
|
+
const navAnchors = ['deck', 'docs', 'blog', 'about', 'home', 'pricing', 'features', 'faq', 'team', 'contact', 'app', 'dashboard', 'whitepaper', 'roadmap', 'litepaper'];
|
|
333
|
+
const externalNavLinks = db.prepare(`
|
|
334
|
+
SELECT l.target_url, l.anchor_text
|
|
335
|
+
FROM links l
|
|
336
|
+
WHERE l.source_id = ? AND l.is_internal = 0
|
|
337
|
+
AND LENGTH(l.anchor_text) > 0 AND LENGTH(l.anchor_text) < 20
|
|
338
|
+
`).all(homepage.id)
|
|
339
|
+
.filter(l => navAnchors.some(n => l.anchor_text.toLowerCase().includes(n)));
|
|
340
|
+
|
|
341
|
+
if (externalNavLinks.length) {
|
|
342
|
+
actions.push(makeAction({
|
|
343
|
+
id: 'technical-nav-links-external',
|
|
344
|
+
type: 'fix',
|
|
345
|
+
priority: 'high',
|
|
346
|
+
area: 'structure',
|
|
347
|
+
title: `${externalNavLinks.length} navigation link(s) on homepage point to external sites`,
|
|
348
|
+
why: 'Nav-level links to external domains leak PageRank and confuse users expecting to stay on-site. Use internal landing pages or relative paths instead.',
|
|
349
|
+
evidence: externalNavLinks.map(l => `"${l.anchor_text}" → ${l.target_url}`),
|
|
350
|
+
implementationHints: [
|
|
351
|
+
'Replace external nav links with internal pages (e.g. /deck instead of Google Docs link).',
|
|
352
|
+
'If the content must be external, use a landing page wrapper with canonical.',
|
|
353
|
+
'Ensure the logo/brand link always points to the homepage.',
|
|
354
|
+
],
|
|
355
|
+
}));
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
|
|
179
359
|
return sortActions(actions);
|
|
180
360
|
}
|