seo-intel 1.5.2 → 1.5.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/crawler/index.js CHANGED
@@ -263,6 +263,10 @@ export async function* crawlDomain(startUrl, opts = {}) {
263
263
  // ── Sitemap-first: seed queue from sitemap.xml (section-aware) ──
264
264
  try {
265
265
  const sitemapUrls = await fetchSitemap(startUrl);
266
+ // Report full sitemap inventory to caller (for DB persistence / audit diff)
267
+ if (sitemapUrls.length > 0 && typeof opts.onSitemapDiscovered === 'function') {
268
+ try { await opts.onSitemapDiscovered(sitemapUrls); } catch { /* ignore */ }
269
+ }
266
270
  if (sitemapUrls.length > 0) {
267
271
  // Apply section budgets if tiered crawling is enabled
268
272
  const budgeted = tiered ? applySectionBudgets(sitemapUrls, maxPages) : sitemapUrls;
@@ -452,9 +456,36 @@ async function processPage(page, url, base, depth, queue, maxDepth) {
452
456
  status = res?.status() || 0;
453
457
  const loadMs = Date.now() - t0;
454
458
 
459
+ // ── Final URL after redirects ──
460
+ let finalUrl = null;
461
+ try { finalUrl = page.url() || null; } catch { /* ignore */ }
462
+
463
+ // ── Redirect chain (walk request.redirectedFrom() backwards) ──
464
+ const redirectChain = [];
465
+ try {
466
+ let req = res?.request();
467
+ const chain = [];
468
+ while (req) {
469
+ const prev = req.redirectedFrom?.();
470
+ if (!prev) break;
471
+ const prevRes = await prev.response().catch(() => null);
472
+ chain.push({ url: prev.url(), status: prevRes?.status() ?? null });
473
+ req = prev;
474
+ }
475
+ // chain is built most-recent-hop first (walking redirectedFrom backwards); reverse so it reads first hop → last hop
476
+ redirectChain.push(...chain.reverse());
477
+ } catch { /* ignore */ }
478
+
479
+ // ── X-Robots-Tag header ──
480
+ let xRobotsTag = null;
481
+ try {
482
+ const headers = res?.headers?.() || {};
483
+ xRobotsTag = headers['x-robots-tag'] || null;
484
+ } catch { /* ignore */ }
485
+
455
486
  // ── Return status for backoff logic (don't silently drop 4xx) ──
456
487
  if (status === 429 || status === 503 || status === 403) {
457
- return { url, depth, status, loadMs, wordCount: 0, isIndexable: false, title: '', metaDesc: '', headings: [], links: [], bodyText: '', schemaTypes: [], vitals: {}, publishedDate: null, modifiedDate: null, contentHash: null };
488
+ return { url, depth, status, loadMs, wordCount: 0, isIndexable: false, title: '', metaDesc: '', headings: [], links: [], bodyText: '', schemaTypes: [], vitals: {}, publishedDate: null, modifiedDate: null, contentHash: null, finalUrl, redirectChain, xRobotsTag };
458
489
  }
459
490
  if (status >= 400) return null;
460
491
 
@@ -507,7 +538,9 @@ async function processPage(page, url, base, depth, queue, maxDepth) {
507
538
  const wordCount = await page.$eval('body', el => el.innerText.split(/\s+/).filter(Boolean).length).catch(() => 0);
508
539
 
509
540
  const robotsMeta = await page.$eval('meta[name="robots"]', el => el.content).catch(() => '');
510
- const isIndexable = !robotsMeta.toLowerCase().includes('noindex');
541
+ const metaNoindex = robotsMeta.toLowerCase().includes('noindex');
542
+ const headerNoindex = (xRobotsTag || '').toLowerCase().includes('noindex');
543
+ const isIndexable = !(metaNoindex || headerNoindex);
511
544
  const hasCanonical = await page.$('link[rel="canonical"]').then(el => !!el).catch(() => false);
512
545
  const hasOgTags = await page.$('meta[property^="og:"]').then(el => !!el).catch(() => false);
513
546
 
@@ -576,6 +609,7 @@ async function processPage(page, url, base, depth, queue, maxDepth) {
576
609
  hasCanonical, hasOgTags,
577
610
  hasRobots: !!robotsMeta,
578
611
  hasSchema: schemaTypes.length > 0,
612
+ finalUrl, redirectChain, xRobotsTag,
579
613
  };
580
614
  }
581
615
 
@@ -101,3 +101,47 @@ function extractTagContent(xml, tagName) {
101
101
  }
102
102
  return results;
103
103
  }
104
+
105
+ /**
106
+ * HEAD-check a single URL without following redirects.
107
+ * Returns { status, location } — location is the Location response header when present (typically on 3XX), else null.
108
+ * Never throws — errors return { status: 0, error: msg }.
109
+ */
110
+ export async function headCheck(url, { timeoutMs = 8000 } = {}) {
111
+ try {
112
+ const ctrl = new AbortController();
113
+ const t = setTimeout(() => ctrl.abort(), timeoutMs);
114
+ const res = await fetch(url, {
115
+ method: 'HEAD',
116
+ redirect: 'manual',
117
+ signal: ctrl.signal,
118
+ headers: { 'User-Agent': 'SEOIntelBot/1.0' },
119
+ }).finally(() => clearTimeout(t));
120
+ return {
121
+ status: res.status,
122
+ location: res.headers.get('location') || null,
123
+ };
124
+ } catch (err) {
125
+ return { status: 0, error: err.message };
126
+ }
127
+ }
128
+
129
+ /**
130
+ * Run HEAD checks against an array of sitemap URL rows in parallel (capped).
131
+ * Accepts [{ id, url }]. Invokes onResult(row, result) per check.
132
+ */
133
+ export async function headCheckAll(rows, { concurrency = 6, onResult } = {}) {
134
+ const queue = [...rows];
135
+ const worker = async () => {
136
+ while (queue.length) {
137
+ const row = queue.shift();
138
+ if (!row) break;
139
+ const result = await headCheck(row.url);
140
+ if (onResult) {
141
+ try { await onResult(row, result); } catch { /* swallow */ }
142
+ }
143
+ }
144
+ };
145
+ const workers = Array.from({ length: Math.min(concurrency, rows.length) }, () => worker());
146
+ await Promise.all(workers);
147
+ }
package/db/db.js CHANGED
@@ -24,7 +24,11 @@ export function getDb(dbPath = './seo-intel.db') {
24
24
  try { _db.exec('ALTER TABLE pages ADD COLUMN title TEXT'); } catch { /* already exists */ }
25
25
  try { _db.exec('ALTER TABLE pages ADD COLUMN meta_desc TEXT'); } catch { /* already exists */ }
26
26
  try { _db.exec('ALTER TABLE pages ADD COLUMN body_text TEXT'); } catch { /* already exists */ }
27
+ try { _db.exec('ALTER TABLE pages ADD COLUMN final_url TEXT'); } catch { /* already exists */ }
28
+ try { _db.exec('ALTER TABLE pages ADD COLUMN redirect_chain TEXT'); } catch { /* already exists */ }
29
+ try { _db.exec('ALTER TABLE pages ADD COLUMN x_robots_tag TEXT'); } catch { /* already exists */ }
27
30
  try { _db.exec('ALTER TABLE analyses ADD COLUMN technical_gaps TEXT'); } catch { /* already exists */ }
31
+ try { _db.exec('ALTER TABLE extractions ADD COLUMN intent_scores TEXT'); } catch { /* already exists */ }
28
32
 
29
33
  // Backfill first_seen_at from crawled_at for existing rows
30
34
  _db.exec('UPDATE pages SET first_seen_at = crawled_at WHERE first_seen_at IS NULL');
@@ -279,12 +283,13 @@ function normalizePageUrl(rawUrl) {
279
283
  } catch { return rawUrl; }
280
284
  }
281
285
 
282
- export function upsertPage(db, { domainId, url, statusCode, wordCount, loadMs, isIndexable, clickDepth = 0, publishedDate = null, modifiedDate = null, contentHash = null, title = null, metaDesc = null, bodyText = null }) {
286
+ export function upsertPage(db, { domainId, url, statusCode, wordCount, loadMs, isIndexable, clickDepth = 0, publishedDate = null, modifiedDate = null, contentHash = null, title = null, metaDesc = null, bodyText = null, finalUrl = null, redirectChain = null, xRobotsTag = null }) {
283
287
  url = normalizePageUrl(url);
284
288
  const now = Date.now();
289
+ const redirectChainJson = redirectChain ? JSON.stringify(redirectChain) : null;
285
290
  db.prepare(`
286
- INSERT INTO pages (domain_id, url, crawled_at, first_seen_at, status_code, word_count, load_ms, is_indexable, click_depth, published_date, modified_date, content_hash, title, meta_desc, body_text)
287
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
291
+ INSERT INTO pages (domain_id, url, crawled_at, first_seen_at, status_code, word_count, load_ms, is_indexable, click_depth, published_date, modified_date, content_hash, title, meta_desc, body_text, final_url, redirect_chain, x_robots_tag)
292
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
288
293
  ON CONFLICT(url) DO UPDATE SET
289
294
  crawled_at = excluded.crawled_at,
290
295
  status_code = excluded.status_code,
@@ -296,8 +301,11 @@ export function upsertPage(db, { domainId, url, statusCode, wordCount, loadMs, i
296
301
  content_hash = excluded.content_hash,
297
302
  title = excluded.title,
298
303
  meta_desc = excluded.meta_desc,
299
- body_text = excluded.body_text
300
- `).run(domainId, url, now, now, statusCode, wordCount, loadMs, isIndexable ? 1 : 0, clickDepth, publishedDate, modifiedDate, contentHash, title || null, metaDesc || null, bodyText || null);
304
+ body_text = excluded.body_text,
305
+ final_url = excluded.final_url,
306
+ redirect_chain = excluded.redirect_chain,
307
+ x_robots_tag = excluded.x_robots_tag
308
+ `).run(domainId, url, now, now, statusCode, wordCount, loadMs, isIndexable ? 1 : 0, clickDepth, publishedDate, modifiedDate, contentHash, title || null, metaDesc || null, bodyText || null, finalUrl || null, redirectChainJson, xRobotsTag || null);
301
309
  // first_seen_at is NOT in the ON CONFLICT UPDATE — it stays from original INSERT
302
310
  return db.prepare('SELECT id FROM pages WHERE url = ?').get(url);
303
311
  }
@@ -327,14 +335,15 @@ export function insertExtraction(db, { pageId, data }) {
327
335
  return db.prepare(`
328
336
  INSERT OR REPLACE INTO extractions
329
337
  (page_id, title, meta_desc, h1, product_type, pricing_tier, cta_primary,
330
- tech_stack, schema_types, search_intent, primary_entities, extracted_at)
331
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
338
+ tech_stack, schema_types, search_intent, intent_scores, primary_entities, extracted_at)
339
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
332
340
  `).run(
333
341
  pageId, data.title, data.meta_desc, data.h1,
334
342
  data.product_type, data.pricing_tier, data.cta_primary,
335
343
  JSON.stringify(data.tech_stack || []),
336
344
  JSON.stringify(data.schema_types || []),
337
345
  data.search_intent || 'Informational',
346
+ JSON.stringify(data.intent_scores || {}),
338
347
  JSON.stringify(data.primary_entities || []),
339
348
  Date.now()
340
349
  );
@@ -421,10 +430,14 @@ export function getSchemasByProject(db, project) {
421
430
  }
422
431
 
423
432
  export function getCompetitorSummary(db, project) {
433
+ // target + owned rows are merged into a single 'target' row.
434
+ // This handles the common case where the target domain (e.g. dgents.ai) redirects
435
+ // to www.dgents.ai, which gets crawled as an owned subdomain — the parallel crawl
436
+ // race means pages end up under 'owned', leaving the target with 0 pages.
424
437
  return db.prepare(`
425
438
  SELECT
426
439
  d.domain,
427
- d.role,
440
+ CASE WHEN d.role IN ('target', 'owned') THEN 'target' ELSE d.role END AS role,
428
441
  COUNT(DISTINCT p.id) as page_count,
429
442
  AVG(p.word_count) as avg_word_count,
430
443
  GROUP_CONCAT(DISTINCT e.product_type) as product_types,
@@ -434,7 +447,9 @@ export function getCompetitorSummary(db, project) {
434
447
  JOIN pages p ON p.domain_id = d.id
435
448
  LEFT JOIN extractions e ON e.page_id = p.id
436
449
  WHERE d.project = ?
437
- GROUP BY d.domain, d.role
450
+ GROUP BY
451
+ CASE WHEN d.role IN ('target', 'owned') THEN 'target-group' ELSE d.domain END,
452
+ CASE WHEN d.role IN ('target', 'owned') THEN 'target' ELSE d.role END
438
453
  `).all(project);
439
454
  }
440
455
 
@@ -538,6 +553,41 @@ export function getTemplateSamples(db, groupId) {
538
553
  ).all(groupId);
539
554
  }
540
555
 
556
+ // ── Sitemap URL inventory ─────────────────────────────────────────────────
557
+
558
+ export function upsertSitemapUrls(db, domainId, urls, sitemapSource = null) {
559
+ if (!urls || !urls.length) return 0;
560
+ const now = Date.now();
561
+ const stmt = db.prepare(`
562
+ INSERT INTO sitemap_urls (domain_id, url, sitemap_source, discovered_at)
563
+ VALUES (?, ?, ?, ?)
564
+ ON CONFLICT(domain_id, url) DO UPDATE SET
565
+ sitemap_source = COALESCE(excluded.sitemap_source, sitemap_urls.sitemap_source),
566
+ discovered_at = excluded.discovered_at
567
+ `);
568
+ db.exec('BEGIN');
569
+ try {
570
+ for (const u of urls) {
571
+ const normalized = normalizePageUrl(u);
572
+ stmt.run(domainId, normalized, sitemapSource, now);
573
+ }
574
+ db.exec('COMMIT');
575
+ } catch (e) { db.exec('ROLLBACK'); throw e; }
576
+ return urls.length;
577
+ }
578
+
579
+ export function getSitemapUrlsForDomain(db, domainId) {
580
+ return db.prepare(
581
+ 'SELECT * FROM sitemap_urls WHERE domain_id = ?'
582
+ ).all(domainId);
583
+ }
584
+
585
+ export function updateSitemapHeadResult(db, id, { status, location }) {
586
+ db.prepare(
587
+ 'UPDATE sitemap_urls SET head_status = ?, head_location = ?, head_checked_at = ? WHERE id = ?'
588
+ ).run(status ?? null, location ?? null, Date.now(), id);
589
+ }
590
+
541
591
  // ── Domain sync / prune ───────────────────────────────────────────────────
542
592
 
543
593
  /**
@@ -576,6 +626,9 @@ export function pruneStaleDomains(db, project, configDomains) {
576
626
  db.prepare(`DELETE FROM pages WHERE domain_id = ?`).run(id);
577
627
  }
578
628
 
629
+ // Sitemap URLs for this domain
630
+ try { db.prepare('DELETE FROM sitemap_urls WHERE domain_id = ?').run(id); } catch { /* table may not exist */ }
631
+
579
632
  // Template groups for this domain
580
633
  db.prepare(
581
634
  'DELETE FROM template_samples WHERE group_id IN (SELECT id FROM template_groups WHERE project = ? AND domain = ?)'
package/db/schema.sql CHANGED
@@ -26,6 +26,9 @@ CREATE TABLE IF NOT EXISTS pages (
26
26
  title TEXT, -- page <title>
27
27
  meta_desc TEXT, -- meta description
28
28
  body_text TEXT, -- cleaned body text for extraction (stored at crawl time)
29
+ final_url TEXT, -- URL after redirects (page.url() post-nav)
30
+ redirect_chain TEXT, -- JSON array of {url, status} hops; "[]" when crawled with no redirects, NULL for rows written before this column existed
31
+ x_robots_tag TEXT, -- X-Robots-Tag response header value (raw)
29
32
  FOREIGN KEY (domain_id) REFERENCES domains(id)
30
33
  );
31
34
 
@@ -41,6 +44,7 @@ CREATE TABLE IF NOT EXISTS extractions (
41
44
  tech_stack TEXT, -- JSON array
42
45
  schema_types TEXT, -- JSON array (Article, Product, FAQ, etc.)
43
46
  search_intent TEXT, -- 'Informational' | 'Navigational' | 'Commercial' | 'Transactional'
47
+ intent_scores TEXT, -- JSON object: {"commercial":70,"informational":20,"comparison":10}
44
48
  primary_entities TEXT, -- JSON array of 3-7 core concept strings
45
49
  extracted_at INTEGER NOT NULL
46
50
  );
@@ -194,6 +198,21 @@ CREATE TABLE IF NOT EXISTS citability_scores (
194
198
 
195
199
  CREATE INDEX IF NOT EXISTS idx_citability_page ON citability_scores(page_id);
196
200
 
201
+ -- Sitemap URL inventory (one row per URL declared in a sitemap)
202
+ CREATE TABLE IF NOT EXISTS sitemap_urls (
203
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
204
+ domain_id INTEGER NOT NULL REFERENCES domains(id),
205
+ url TEXT NOT NULL,
206
+ sitemap_source TEXT, -- which sitemap file this came from
207
+ discovered_at INTEGER NOT NULL,
208
+ head_status INTEGER, -- HTTP status from HEAD check (null until audit runs)
209
+ head_location TEXT, -- Location header when redirected
210
+ head_checked_at INTEGER,
211
+ UNIQUE(domain_id, url)
212
+ );
213
+
214
+ CREATE INDEX IF NOT EXISTS idx_sitemap_urls_domain ON sitemap_urls(domain_id);
215
+
197
216
  -- Indexes
198
217
  CREATE INDEX IF NOT EXISTS idx_pages_domain ON pages(domain_id);
199
218
  CREATE INDEX IF NOT EXISTS idx_keywords_page ON keywords(page_id);
@@ -76,6 +76,9 @@ export function getTechnicalDataset(db, project) {
76
76
  p.word_count,
77
77
  p.click_depth,
78
78
  p.is_indexable,
79
+ p.title,
80
+ p.published_date,
81
+ p.modified_date,
79
82
  d.domain,
80
83
  d.role,
81
84
  COALESCE(e.meta_desc, '') AS meta_desc,
@@ -86,6 +89,17 @@ export function getTechnicalDataset(db, project) {
86
89
  COALESCE((SELECT COUNT(*) FROM page_schemas ps WHERE ps.page_id = p.id), 0) AS schema_count,
87
90
  COALESCE((SELECT COUNT(*) FROM page_schemas ps WHERE ps.page_id = p.id AND LOWER(ps.schema_type) = 'breadcrumblist'), 0) AS breadcrumb_count,
88
91
  COALESCE((SELECT COUNT(*) FROM headings h WHERE h.page_id = p.id AND h.level = 1), 0) AS h1_count,
92
+ COALESCE((SELECT COUNT(*) FROM headings h WHERE h.page_id = p.id AND h.level = 1), 0) > 1 AS has_multiple_h1,
93
+ COALESCE((SELECT COUNT(*) FROM page_schemas ps WHERE ps.page_id = p.id AND LOWER(ps.schema_type) IN ('faqpage', 'faq')), 0) AS faq_schema_count,
94
+ COALESCE((SELECT COUNT(*) FROM page_schemas ps WHERE ps.page_id = p.id AND LOWER(ps.schema_type) = 'howto'), 0) AS howto_schema_count,
95
+ COALESCE((
96
+ SELECT COUNT(*) FROM headings h
97
+ WHERE h.page_id = p.id AND h.level IN (2, 3)
98
+ AND (h.text LIKE 'what %' OR h.text LIKE 'how %' OR h.text LIKE 'why %'
99
+ OR h.text LIKE 'when %' OR h.text LIKE 'which %' OR h.text LIKE 'can %'
100
+ OR h.text LIKE 'does %' OR h.text LIKE 'is %' OR h.text LIKE 'are %'
101
+ OR h.text LIKE '%?')
102
+ ), 0) AS question_heading_count,
89
103
  COALESCE((
90
104
  SELECT COUNT(*)
91
105
  FROM links l
@@ -115,6 +129,24 @@ export function getTechnicalDataset(db, project) {
115
129
  `).all(project);
116
130
  }
117
131
 
132
+ /**
133
+ * Get keywords associated with pages missing a specific schema type.
134
+ * Used to show "Missing FAQ Schema → Low PAA chance for query X".
135
+ */
136
+ export function getKeywordsForSchemaDeficientPages(db, project, pageIds) {
137
+ if (!pageIds.length) return [];
138
+ const placeholders = pageIds.map(() => '?').join(',');
139
+ return db.prepare(`
140
+ SELECT k.keyword, k.location, k.page_id, p.url,
141
+ e.search_intent
142
+ FROM keywords k
143
+ JOIN pages p ON p.id = k.page_id
144
+ LEFT JOIN extractions e ON e.page_id = p.id
145
+ WHERE k.page_id IN (${placeholders})
146
+ ORDER BY k.page_id, k.location
147
+ `).all(...pageIds);
148
+ }
149
+
118
150
  export function getSchemaCoverage(db, project, vsDomain = null) {
119
151
  const params = [project];
120
152
  let competitorFilter = '';
@@ -1,5 +1,5 @@
1
1
  import { collectTop, inferPriorityFromCount, makeAction, sortActions } from './heuristics.js';
2
- import { getTechnicalDataset } from './queries.js';
2
+ import { getTechnicalDataset, getKeywordsForSchemaDeficientPages } from './queries.js';
3
3
 
4
4
  export function buildTechnicalActions(db, project) {
5
5
  const rows = getTechnicalDataset(db, project);
@@ -176,5 +176,185 @@ export function buildTechnicalActions(db, project) {
176
176
  }));
177
177
  }
178
178
 
179
+ // ── Title length issues ──────────────────────────────────────────────────
180
+ const titleTooLong = rows.filter(r =>
181
+ r.title && r.title.length > 65 && Number(r.status_code) < 400 && r.is_indexable
182
+ );
183
+ if (titleTooLong.length) {
184
+ actions.push(makeAction({
185
+ id: 'technical-title-too-long',
186
+ type: 'improve',
187
+ priority: inferPriorityFromCount(titleTooLong.length, { critical: 20, high: 8, medium: 3 }),
188
+ area: 'content',
189
+ title: `Shorten page titles on ${titleTooLong.length} pages exceeding 65 characters`,
190
+ why: 'Titles over 65 characters are truncated in SERPs, hiding your key message and reducing CTR.',
191
+ evidence: collectTop(titleTooLong.map(r => `${r.url} (${r.title.length} chars)`), 8),
192
+ implementationHints: [
193
+ 'Keep titles under 60–65 characters to avoid SERP truncation.',
194
+ 'Lead with the primary keyword and brand separator at the end.',
195
+ ],
196
+ }));
197
+ }
198
+
199
+ const titleTooShort = rows.filter(r =>
200
+ r.title && r.title.length < 30 && Number(r.status_code) < 400 && r.is_indexable
201
+ );
202
+ if (titleTooShort.length) {
203
+ actions.push(makeAction({
204
+ id: 'technical-title-too-short',
205
+ type: 'improve',
206
+ priority: inferPriorityFromCount(titleTooShort.length, { critical: 15, high: 6, medium: 2 }),
207
+ area: 'content',
208
+ title: `Expand thin page titles on ${titleTooShort.length} pages under 30 characters`,
209
+ why: 'Very short titles waste valuable SERP real estate and under-signal page relevance to search engines.',
210
+ evidence: collectTop(titleTooShort.map(r => `${r.url} ("${r.title}")`), 8),
211
+ implementationHints: [
212
+ 'Include the primary keyword, secondary modifier, and brand in the title.',
213
+ 'Target 50–60 characters for maximum SERP visibility.',
214
+ ],
215
+ }));
216
+ }
217
+
218
+ // ── Missing date metadata ────────────────────────────────────────────────
219
+ const missingDates = rows.filter(r =>
220
+ !r.published_date && !r.modified_date &&
221
+ (r.word_count || 0) >= 500 &&
222
+ Number(r.status_code) < 400 && r.is_indexable
223
+ );
224
+ if (missingDates.length) {
225
+ actions.push(makeAction({
226
+ id: 'technical-missing-dates',
227
+ type: 'improve',
228
+ priority: inferPriorityFromCount(missingDates.length, { critical: 20, high: 8, medium: 3 }),
229
+ area: 'schema',
230
+ title: `Add publish/modified dates to ${missingDates.length} content pages`,
231
+ why: 'Date metadata in schema and HTML signals freshness to AI models and search engines, boosting citability and freshness scoring.',
232
+ evidence: collectTop(missingDates.map(r => `${r.url} (${r.word_count} words)`), 8),
233
+ implementationHints: [
234
+ 'Add datePublished and dateModified in Article/BlogPosting/NewsArticle schema JSON-LD.',
235
+ 'Include <time datetime="..."> or meta date tags in the HTML head.',
236
+ 'Keep dateModified updated on meaningful content revisions.',
237
+ ],
238
+ }));
239
+ }
240
+
241
+ // ── FAQ content without FAQPage schema ──────────────────────────────────
242
+ const faqContentNoSchema = rows.filter(r =>
243
+ r.question_heading_count >= 3 && !r.faq_schema_count &&
244
+ Number(r.status_code) < 400 && r.is_indexable
245
+ );
246
+ if (faqContentNoSchema.length) {
247
+ // Enrich with affected keywords to show SERP impact
248
+ const faqPageIds = faqContentNoSchema.map(r => r.id);
249
+ const faqKeywords = getKeywordsForSchemaDeficientPages(db, project, faqPageIds);
250
+ const faqImpact = faqKeywords
251
+ .filter(k => k.location === 'h2' || k.location === 'h1')
252
+ .slice(0, 5)
253
+ .map(k => `"${k.keyword}" on ${k.url.replace(/^https?:\/\/[^/]+/, '')} → low People Also Ask chance without FAQ schema`);
254
+
255
+ actions.push(makeAction({
256
+ id: 'technical-faq-content-no-schema',
257
+ type: 'add_schema',
258
+ priority: inferPriorityFromCount(faqContentNoSchema.length, { critical: 10, high: 4, medium: 2 }),
259
+ area: 'schema',
260
+ title: `Add FAQPage schema to ${faqContentNoSchema.length} pages with Q&A content`,
261
+ why: 'Pages with multiple question headings but no FAQPage schema miss FAQ rich results and lose AI citability score.',
262
+ evidence: collectTop(faqContentNoSchema.map(r => `${r.url} (${r.question_heading_count} question headings)`), 8),
263
+ impact: faqImpact.length ? faqImpact : undefined,
264
+ implementationHints: [
265
+ 'Wrap each question heading + answer paragraph in FAQPage JSON-LD with Question/Answer entities.',
266
+ 'Keep answers under 300 words each — Google truncates longer ones in rich results.',
267
+ ],
268
+ }));
269
+ }
270
+
271
+ // ── HowTo content without HowTo schema ──────────────────────────────────
272
+ const howtoContentNoSchema = rows.filter(r => {
273
+ const title = String(r.title || '').toLowerCase();
274
+ const h1 = String(r.h1 || '').toLowerCase();
275
+ const hasHowToSignal = /\bhow to\b|\bstep[- ]by[- ]step\b|\bsetup guide\b|\binstall guide\b/.test(title) ||
276
+ /\bhow to\b|\bstep[- ]by[- ]step\b|\bsetup guide\b|\binstall guide\b/.test(h1);
277
+ return hasHowToSignal && !r.howto_schema_count &&
278
+ Number(r.status_code) < 400 && r.is_indexable;
279
+ });
280
+ if (howtoContentNoSchema.length) {
281
+ const howtoPageIds = howtoContentNoSchema.map(r => r.id);
282
+ const howtoKeywords = getKeywordsForSchemaDeficientPages(db, project, howtoPageIds);
283
+ const howtoImpact = howtoKeywords
284
+ .filter(k => k.location === 'title' || k.location === 'h1')
285
+ .slice(0, 5)
286
+ .map(k => `"${k.keyword}" → missing HowTo rich result (step-by-step carousel)`);
287
+
288
+ actions.push(makeAction({
289
+ id: 'technical-howto-content-no-schema',
290
+ type: 'add_schema',
291
+ priority: inferPriorityFromCount(howtoContentNoSchema.length, { critical: 8, high: 3, medium: 1 }),
292
+ area: 'schema',
293
+ title: `Add HowTo schema to ${howtoContentNoSchema.length} step-by-step guide pages`,
294
+ why: 'How-to guides without HowTo schema miss rich results and rank lower for procedural queries.',
295
+ evidence: collectTop(howtoContentNoSchema.map(r => `${r.url}`), 8),
296
+ impact: howtoImpact.length ? howtoImpact : undefined,
297
+ implementationHints: [
298
+ 'Wrap numbered steps in HowTo JSON-LD with HowToStep entities.',
299
+ 'Include tool, supply, and time/cost fields where applicable.',
300
+ ],
301
+ }));
302
+ }
303
+
304
+ // ── Multiple H1 headings ─────────────────────────────────────────────────
305
+ const multipleH1 = rows.filter(r =>
306
+ r.has_multiple_h1 && Number(r.status_code) < 400 && r.is_indexable
307
+ );
308
+ if (multipleH1.length) {
309
+ actions.push(makeAction({
310
+ id: 'technical-multiple-h1',
311
+ type: 'fix',
312
+ priority: inferPriorityFromCount(multipleH1.length, { critical: 15, high: 6, medium: 2 }),
313
+ area: 'content',
314
+ title: `Fix multiple H1 headings on ${multipleH1.length} pages`,
315
+ why: 'Multiple H1s dilute topical focus and create ambiguity about the primary page topic for search engines.',
316
+ evidence: collectTop(multipleH1.map(r => r.url), 10),
317
+ implementationHints: [
318
+ 'Keep exactly one H1 that matches the page\'s primary keyword intent.',
319
+ 'Demote secondary H1s to H2 or H3 as appropriate.',
320
+ ],
321
+ }));
322
+ }
323
+
324
+ // ── Homepage links to external sites (nav leak) ──────────────────────
325
+ // Flag when homepage has external links in nav-like positions (anchor text
326
+ // suggests navigation: short text like "Deck", "Docs", "Blog" etc.)
327
+ const homepage = rows.find(r => {
328
+ const path = new URL(r.url).pathname;
329
+ return (path === '/' || path === '') && Number(r.status_code) < 400;
330
+ });
331
+ if (homepage) {
332
+ const navAnchors = ['deck', 'docs', 'blog', 'about', 'home', 'pricing', 'features', 'faq', 'team', 'contact', 'app', 'dashboard', 'whitepaper', 'roadmap', 'litepaper'];
333
+ const externalNavLinks = db.prepare(`
334
+ SELECT l.target_url, l.anchor_text
335
+ FROM links l
336
+ WHERE l.source_id = ? AND l.is_internal = 0
337
+ AND LENGTH(l.anchor_text) > 0 AND LENGTH(l.anchor_text) < 20
338
+ `).all(homepage.id)
339
+ .filter(l => navAnchors.some(n => l.anchor_text.toLowerCase().includes(n)));
340
+
341
+ if (externalNavLinks.length) {
342
+ actions.push(makeAction({
343
+ id: 'technical-nav-links-external',
344
+ type: 'fix',
345
+ priority: 'high',
346
+ area: 'structure',
347
+ title: `${externalNavLinks.length} navigation link(s) on homepage point to external sites`,
348
+ why: 'Nav-level links to external domains leak PageRank and confuse users expecting to stay on-site. Use internal landing pages or relative paths instead.',
349
+ evidence: externalNavLinks.map(l => `"${l.anchor_text}" → ${l.target_url}`),
350
+ implementationHints: [
351
+ 'Replace external nav links with internal pages (e.g. /deck instead of Google Docs link).',
352
+ 'If the content must be external, use a landing page wrapper with canonical.',
353
+ 'Ensure the logo/brand link always points to the homepage.',
354
+ ],
355
+ }));
356
+ }
357
+ }
358
+
179
359
  return sortActions(actions);
180
360
  }