seo-intel 1.5.2 → 1.5.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +26 -0
- package/Start SEO Intel.command +10 -0
- package/analyses/aeo/scorer.js +60 -6
- package/analyses/blog-draft/index.js +62 -10
- package/analyses/templates/index.js +1 -1
- package/analysis/prompt-builder.js +167 -2
- package/analysis/technical-audit.js +177 -0
- package/cli.js +446 -25
- package/crawler/index.js +36 -2
- package/crawler/sitemap.js +44 -0
- package/db/db.js +62 -9
- package/db/schema.sql +19 -0
- package/exports/queries.js +32 -0
- package/exports/technical.js +181 -1
- package/extractor/qwen.js +135 -13
- package/lib/scan-export.js +204 -0
- package/package.json +1 -1
- package/reports/generate-html.js +517 -50
- package/server.js +319 -25
- package/setup/checks.js +65 -5
- package/setup/engine.js +1 -0
- package/setup/web-routes.js +22 -3
- package/setup/wizard.html +8 -6
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,31 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 1.5.23 (2026-04-23)
|
|
4
|
+
|
|
5
|
+
### Technical Audit — extended-data checks
|
|
6
|
+
- New `seo-intel tech-audit <project>` command — runs technical SEO validation off the crawl DB
|
|
7
|
+
- Findings: title length, meta description length, noindex detection (meta + `X-Robots-Tag`), redirect chains, indexable-but-not-in-sitemap, redirect-target cross-reference
|
|
8
|
+
- `--head` pass runs bounded-concurrency HEAD checks against sitemap URLs (flags 3XX redirects as warnings and 4XX/5XX responses as errors)
|
|
9
|
+
- Gated under the `extended-data` banner — same tier surface as other audit extensions
|
|
10
|
+
|
|
11
|
+
### Crawler — new signal capture
|
|
12
|
+
- Captures final URL after redirects (`page.url()`)
|
|
13
|
+
- Walks the Playwright redirect chain and persists it as JSON
|
|
14
|
+
- Reads `X-Robots-Tag` response header (no-index detection now covers meta **and** header)
|
|
15
|
+
- Sitemap URLs discovered during crawl are persisted to a new `sitemap_urls` table
|
|
16
|
+
|
|
17
|
+
### Schema
|
|
18
|
+
- `pages` table gains `final_url`, `redirect_chain`, `x_robots_tag` (additive `ALTER TABLE`, safe on existing DBs)
|
|
19
|
+
- New `sitemap_urls` table for the HEAD-check inventory pass
|
|
20
|
+
|
|
21
|
+
### Accumulated since last changelog (1.5.3–1.5.22)
|
|
22
|
+
- LM Studio extraction backend + auto-discovery
|
|
23
|
+
- Scan command auto-resolves `www` when bare domain is unreachable
|
|
24
|
+
- Intelligence modules: intent scores, schema impact, rich-result probability
|
|
25
|
+
- Nav-link detection for external sites + missing-www redirect warning
|
|
26
|
+
- Solo audit prompt rewrite — no more hallucinated competitors
|
|
27
|
+
- Scan/serve/dashboard resilience fixes
|
|
28
|
+
|
|
3
29
|
## 1.5.2 (2026-04-11)
|
|
4
30
|
|
|
5
31
|
### Unified Export
|
package/Start SEO Intel.command
CHANGED
|
@@ -5,4 +5,14 @@ echo ""
|
|
|
5
5
|
echo " Starting SEO Intel..."
|
|
6
6
|
echo " Dashboard will open in your browser."
|
|
7
7
|
echo ""
|
|
8
|
+
|
|
9
|
+
# Kill any stale server on the same port so new code is always loaded.
PORT="${SEO_INTEL_PORT:-3000}"

# Only match processes LISTENING on the port. A bare `lsof -ti :PORT` also
# returns clients with an open connection (e.g. a browser tab showing the
# dashboard), which must not be killed.
OLD_PIDS=$(lsof -nP -t -iTCP:"$PORT" -sTCP:LISTEN 2>/dev/null)

if [ -n "$OLD_PIDS" ]; then
  # lsof prints one PID per line; flatten to a single line for the message.
  echo "  Restarting server (killing stale PID $(echo $OLD_PIDS) on port $PORT)..."
  # Intentionally unquoted: word-splitting passes each PID as its own argument.
  # Quoting would hand kill a single "pid1\npid2" string and the kill would fail.
  kill $OLD_PIDS 2>/dev/null
  sleep 1
fi
|
|
17
|
+
|
|
8
18
|
node cli.js serve --open
|
package/analyses/aeo/scorer.js
CHANGED
|
@@ -123,7 +123,7 @@ function answerDensityScore(bodyText, wordCount) {
|
|
|
123
123
|
}
|
|
124
124
|
|
|
125
125
|
// ── Q&A proximity ──────────────────────────────────────────────────────────
|
|
126
|
-
function qaProximityScore(headings, bodyText) {
|
|
126
|
+
function qaProximityScore(headings, bodyText, schemaTypes) {
|
|
127
127
|
if (!headings.length || !bodyText) return 0;
|
|
128
128
|
|
|
129
129
|
const questionHeadings = headings.filter(h =>
|
|
@@ -138,8 +138,8 @@ function qaProximityScore(headings, bodyText) {
|
|
|
138
138
|
const qRatio = questionHeadings.length / headings.filter(h => h.level >= 2).length;
|
|
139
139
|
score += Math.min(qRatio * 60, 40);
|
|
140
140
|
|
|
141
|
-
// FAQ schema present? Huge bonus
|
|
142
|
-
score += 30;
|
|
141
|
+
// FAQ schema present? Huge bonus — only award if schema actually exists
|
|
142
|
+
if (Array.isArray(schemaTypes) && schemaTypes.includes('FAQPage')) score += 30;
|
|
143
143
|
|
|
144
144
|
// Heading density (one H2/H3 per ~300 words is ideal)
|
|
145
145
|
const h2h3Count = headings.filter(h => h.level >= 2 && h.level <= 3).length;
|
|
@@ -201,6 +201,59 @@ function classifyAiIntent(headings, bodyText, searchIntent) {
|
|
|
201
201
|
return intents;
|
|
202
202
|
}
|
|
203
203
|
|
|
204
|
+
// ── Rich Result Probability Predictor ──────────────────────────────────────
|
|
205
|
+
|
|
206
|
+
/**
 * Estimate probability of achieving rich results based on page signals.
 * Returns per-type probability (FAQ, HowTo, Article) and overall best chance.
 *
 * Each score is an additive heuristic capped at 95. Missing or non-array
 * `headings` / `schemaTypes` are treated as empty so a page without schema
 * rows is scored instead of throwing (same guard style as qaProximityScore).
 *
 * @param {object[]} headings - heading rows; `level` and `text` are read
 * @param {string} bodyText
 * @param {string[]} schemaTypes - current schema on page
 * @param {number} wordCount
 * @returns {object} { faq, howto, article, best: { type, probability } }
 */
export function richResultProbability(headings, bodyText, schemaTypes, wordCount) {
  const text = (bodyText || '').toLowerCase();
  const headingList = Array.isArray(headings) ? headings : [];
  const types = Array.isArray(schemaTypes) ? schemaTypes : [];

  // ── FAQ Rich Result ──
  const questionHeadings = headingList.filter(
    h => h.level >= 2 && h.level <= 3 && QUESTION_RE.test(h.text)
  );
  let faq = 0;
  if (types.includes('FAQPage')) faq += 45;
  if (questionHeadings.length >= 3) faq += 25;
  else if (questionHeadings.length >= 1) faq += 10;
  if (wordCount >= 500) faq += 10;
  if (wordCount >= 1500) faq += 5;
  // Short average paragraphs (< 150 words) read like Q&A answers.
  const paras = text.split(/\n\s*\n/).filter(p => p.trim()).length;
  if (paras >= 3 && wordCount / paras < 150) faq += 15;
  faq = Math.min(faq, 95);

  // ── HowTo Rich Result ──
  let howto = 0;
  if (types.includes('HowTo')) howto += 45;
  if (IMPL_RE.test(text)) howto += 20;
  // Count numbered list items ("1. ", "2) ") and "step N" markers at line starts.
  const steps = (text.match(/(?:^|\n)\s*(?:\d+[.)]\s|step\s+\d)/gm) || []).length;
  if (steps >= 3) howto += 20;
  else if (steps >= 1) howto += 8;
  if (wordCount >= 300) howto += 10;
  howto = Math.min(howto, 95);

  // ── Article Rich Result ──
  const articleSchemas = ['Article', 'TechArticle', 'BlogPosting', 'NewsArticle'];
  let article = 0;
  if (types.some(t => articleSchemas.includes(t))) article += 35;
  if (wordCount >= 800) article += 20;
  else if (wordCount >= 400) article += 10;
  if (headingList.filter(h => h.level === 2).length >= 2) article += 15;
  if (types.includes('BreadcrumbList')) article += 10;
  article = Math.min(article, 95);

  const results = { faq, howto, article };
  // Highest score wins; on a tie the earlier key (faq, howto, article) is kept.
  const best = Object.entries(results).sort((a, b) => b[1] - a[1])[0];

  return { ...results, best: { type: best[0], probability: best[1] } };
}
|
|
256
|
+
|
|
204
257
|
// ── Main scorer ────────────────────────────────────────────────────────────
|
|
205
258
|
|
|
206
259
|
/**
|
|
@@ -212,7 +265,7 @@ function classifyAiIntent(headings, bodyText, searchIntent) {
|
|
|
212
265
|
* @param {string[]} schemaTypes - schema type strings present on page
|
|
213
266
|
* @param {object[]} schemas - full page_schemas rows
|
|
214
267
|
* @param {string} searchIntent - from extraction
|
|
215
|
-
* @returns {object} { score, breakdown, aiIntents, tier }
|
|
268
|
+
* @returns {object} { score, breakdown, aiIntents, tier, richResult }
|
|
216
269
|
*/
|
|
217
270
|
export function scorePage(page, headings, entities, schemaTypes, schemas, searchIntent) {
|
|
218
271
|
const bodyText = page.body_text || '';
|
|
@@ -222,7 +275,7 @@ export function scorePage(page, headings, entities, schemaTypes, schemas, search
|
|
|
222
275
|
entity_authority: entityAuthorityScore(entities, headings, wordCount),
|
|
223
276
|
structured_claims: structuredClaimsScore(bodyText, headings),
|
|
224
277
|
answer_density: answerDensityScore(bodyText, wordCount),
|
|
225
|
-
qa_proximity: qaProximityScore(headings, bodyText),
|
|
278
|
+
qa_proximity: qaProximityScore(headings, bodyText, schemaTypes),
|
|
226
279
|
freshness: freshnessScore(page, schemas),
|
|
227
280
|
schema_coverage: schemaCoverageScore(schemaTypes),
|
|
228
281
|
};
|
|
@@ -242,6 +295,7 @@ export function scorePage(page, headings, entities, schemaTypes, schemas, search
|
|
|
242
295
|
);
|
|
243
296
|
|
|
244
297
|
const aiIntents = classifyAiIntent(headings, bodyText, searchIntent);
|
|
298
|
+
const richResult = richResultProbability(headings, bodyText, schemaTypes, wordCount);
|
|
245
299
|
|
|
246
300
|
// Tier classification
|
|
247
301
|
let tier;
|
|
@@ -250,5 +304,5 @@ export function scorePage(page, headings, entities, schemaTypes, schemas, search
|
|
|
250
304
|
else if (score >= 35) tier = 'needs_work';
|
|
251
305
|
else tier = 'poor';
|
|
252
306
|
|
|
253
|
-
return { score, breakdown, aiIntents, tier };
|
|
307
|
+
return { score, breakdown, aiIntents, tier, richResult };
|
|
254
308
|
}
|
|
@@ -89,9 +89,10 @@ export function gatherBlogDraftContext(db, project, topic = null) {
|
|
|
89
89
|
|
|
90
90
|
// ── Prompt Builder ──────────────────────────────────────────────────────────
|
|
91
91
|
|
|
92
|
-
export function buildBlogDraftPrompt(context, { config, lang = 'en', topic = null }) {
|
|
92
|
+
export function buildBlogDraftPrompt(context, { config, lang = 'en', topic = null, contentType = 'blog' }) {
|
|
93
93
|
const { longTails, keywordGaps, citabilityGaps, entityRows, topCitablePages, kwInventor, contentGaps, insights } = context;
|
|
94
94
|
const isFi = lang === 'fi';
|
|
95
|
+
const langName = isFi ? 'Finnish' : 'English';
|
|
95
96
|
|
|
96
97
|
// Extract unique entities from extraction data
|
|
97
98
|
const allEntities = new Set();
|
|
@@ -103,11 +104,19 @@ export function buildBlogDraftPrompt(context, { config, lang = 'en', topic = nul
|
|
|
103
104
|
}
|
|
104
105
|
const topEntities = [...allEntities].slice(0, 15);
|
|
105
106
|
|
|
106
|
-
// ── Section 1: Role ──
|
|
107
|
+
// ── Section 1: Role — adapts to content type ──
|
|
108
|
+
const typeInstructions = {
|
|
109
|
+
blog: `Your task: write a complete, publish-ready blog post draft in ${langName}.
|
|
110
|
+
The post must score 70+ on the AEO citability scale (entity authority, structured claims, answer density, Q&A proximity, freshness signals, schema coverage).`,
|
|
111
|
+
docs: `Your task: write a complete, publish-ready documentation page in ${langName}.
|
|
112
|
+
The page must be technically precise, well-structured, and scannable. Use step-by-step instructions where applicable. Optimise for developers and technical users searching for how-to answers.`,
|
|
113
|
+
social: `Your task: write a set of social media posts in ${langName}.
|
|
114
|
+
Create 5-7 distinct posts suitable for LinkedIn/X/Twitter. Each should be self-contained, engaging, and drive traffic to the site. Include hashtag suggestions. Vary formats: thread opener, hot take, stat-based, question-based, listicle.`,
|
|
115
|
+
};
|
|
116
|
+
|
|
107
117
|
let prompt = `You are an expert content strategist and copywriter specialising in AEO (Answer Engine Optimisation).
|
|
108
118
|
|
|
109
|
-
|
|
110
|
-
The post must score 70+ on the AEO citability scale (entity authority, structured claims, answer density, Q&A proximity, freshness signals, schema coverage).
|
|
119
|
+
${typeInstructions[contentType] || typeInstructions.blog}
|
|
111
120
|
|
|
112
121
|
`;
|
|
113
122
|
|
|
@@ -183,12 +192,46 @@ The post must score 70+ on the AEO citability scale (entity authority, structure
|
|
|
183
192
|
}
|
|
184
193
|
}
|
|
185
194
|
|
|
186
|
-
// ── Section 5:
|
|
187
|
-
|
|
195
|
+
// ── Section 5: Structural requirements — adapts to content type ──
|
|
196
|
+
const siteName = config.context?.siteName || config.target?.domain;
|
|
197
|
+
const today = new Date().toISOString().slice(0, 10);
|
|
198
|
+
|
|
199
|
+
if (contentType === 'social') {
|
|
200
|
+
prompt += `
|
|
201
|
+
## Social Media Requirements
|
|
202
|
+
|
|
203
|
+
1. Create 5-7 distinct posts, each separated by ---
|
|
204
|
+
2. Each post must be self-contained (not a thread unless marked as such)
|
|
205
|
+
3. Include a mix of: hot takes, statistics/data, questions, listicles, how-to snippets
|
|
206
|
+
4. Optimise for engagement: hooks in the first line, clear value proposition
|
|
207
|
+
5. Include 3-5 relevant hashtags per post
|
|
208
|
+
6. Keep posts under 280 characters for X/Twitter variants; LinkedIn variants can be longer (600-1,200 chars)
|
|
209
|
+
7. Reference ${siteName} naturally where appropriate (not in every post)
|
|
210
|
+
8. Include one thread idea (3-5 connected posts) marked with [THREAD]
|
|
211
|
+
9. Language: ${isFi ? 'Finnish' : 'English'}
|
|
212
|
+
`;
|
|
213
|
+
} else if (contentType === 'docs') {
|
|
214
|
+
prompt += `
|
|
215
|
+
## Documentation Requirements
|
|
216
|
+
|
|
217
|
+
The page MUST include:
|
|
218
|
+
1. YAML frontmatter with: title, slug, description (155 chars max), primary_keyword, secondary_keywords[], date (${today}), lang (${lang}), tags[]
|
|
219
|
+
2. An H1 that clearly states what this page covers
|
|
220
|
+
3. A 1-2 sentence overview immediately after the H1
|
|
221
|
+
4. Prerequisites section (if applicable)
|
|
222
|
+
5. Step-by-step instructions with numbered lists
|
|
223
|
+
6. Code examples with language-tagged fenced code blocks
|
|
224
|
+
7. At least one table for reference data (parameters, options, etc.)
|
|
225
|
+
8. A "Troubleshooting" or "Common Issues" section at the end
|
|
226
|
+
9. Word count: 800-2,000 words
|
|
227
|
+
10. Internal link suggestions: include 2-3 \`[anchor text](URL)\` links to related pages
|
|
228
|
+
`;
|
|
229
|
+
} else {
|
|
230
|
+
prompt += `
|
|
188
231
|
## AEO Structural Requirements
|
|
189
232
|
|
|
190
233
|
The draft MUST include:
|
|
191
|
-
1. YAML frontmatter with: title, slug, description (155 chars max), primary_keyword, secondary_keywords[], date (${
|
|
234
|
+
1. YAML frontmatter with: title, slug, description (155 chars max), primary_keyword, secondary_keywords[], date (${today}), updated (same), lang (${lang}), tags[]${!topic ? ', topic_selection_rationale' : ''}
|
|
192
235
|
2. An H1 that contains the primary keyword
|
|
193
236
|
3. A 2-3 sentence summary immediately after the H1 (answer-first structure — inverted pyramid). This paragraph will be cited by AI assistants.
|
|
194
237
|
4. Minimum 6 H2 subheadings
|
|
@@ -196,10 +239,11 @@ The draft MUST include:
|
|
|
196
239
|
6. At least one numbered or bulleted list with 4+ items
|
|
197
240
|
7. At least one "X is Y because Z" definitional sentence per major concept
|
|
198
241
|
8. A FAQ section at the end with minimum 4 Q&A pairs (### H3 questions, 2-4 sentence answers)
|
|
199
|
-
9. A closing CTA paragraph referencing ${
|
|
242
|
+
9. A closing CTA paragraph referencing ${siteName}
|
|
200
243
|
10. Word count: 1,200-2,000 words
|
|
201
244
|
11. Internal link suggestions: include 2-3 \`[anchor text](URL)\` links back to the site where natural
|
|
202
245
|
`;
|
|
246
|
+
}
|
|
203
247
|
|
|
204
248
|
// ── Section 6: Language ──
|
|
205
249
|
if (isFi) {
|
|
@@ -217,11 +261,19 @@ Write in clear, direct international English. No filler phrases. No "in today's
|
|
|
217
261
|
}
|
|
218
262
|
|
|
219
263
|
// ── Section 7: Output format ──
|
|
220
|
-
|
|
264
|
+
if (contentType === 'social') {
|
|
265
|
+
prompt += `
|
|
266
|
+
## Output Format
|
|
267
|
+
|
|
268
|
+
Respond with ONLY the social media posts. Separate each post with ---. No explanation before or after. No triple backticks wrapping the response.
|
|
269
|
+
`;
|
|
270
|
+
} else {
|
|
271
|
+
prompt += `
|
|
221
272
|
## Output Format
|
|
222
273
|
|
|
223
|
-
Respond with ONLY the complete markdown document. Start with --- (YAML frontmatter open fence). End with the FAQ section and CTA. No explanation before or after. No triple backticks wrapping the response.
|
|
274
|
+
Respond with ONLY the complete markdown document. Start with --- (YAML frontmatter open fence). End with the ${contentType === 'docs' ? 'Troubleshooting section' : 'FAQ section and CTA'}. No explanation before or after. No triple backticks wrapping the response.
|
|
224
275
|
`;
|
|
276
|
+
}
|
|
225
277
|
|
|
226
278
|
return prompt;
|
|
227
279
|
}
|
|
@@ -44,7 +44,7 @@ export async function runTemplatesAnalysis(project, opts = {}) {
|
|
|
44
44
|
if (!config) throw new Error(`Project "${project}" not found. Run: seo-intel setup`);
|
|
45
45
|
|
|
46
46
|
const targetDomain = config.target.domain;
|
|
47
|
-
const targetUrl = config.target.url || `https://${targetDomain}
|
|
47
|
+
const targetUrl = (config.target.url || `https://${targetDomain}`).replace(/\/+$/, '');
|
|
48
48
|
|
|
49
49
|
log(`\n Target: ${targetDomain}`);
|
|
50
50
|
|
|
@@ -25,6 +25,171 @@
|
|
|
25
25
|
* @param {object} params.context - project context (industry, audience, goals)
|
|
26
26
|
*/
|
|
27
27
|
export function buildAnalysisPrompt({ project, target, competitors, keywordMatrix, headingStructure, context }) {
  // With at least one competitor we build the comparative prompt; otherwise
  // we fall back to the solo audit prompt, which never mentions competitors.
  const hasCompetitors = Boolean(competitors) && competitors.length !== 0;

  if (hasCompetitors) {
    return buildCompetitivePrompt({ target, competitors, keywordMatrix, headingStructure, context });
  }

  return buildSoloPrompt({ target, keywordMatrix, headingStructure, context });
}
|
|
33
|
+
|
|
34
|
+
// ── Solo audit prompt (no competitors) ─────────────────────────────────────
|
|
35
|
+
|
|
36
|
+
/**
 * Build the solo-audit prompt (used when no competitors are configured).
 *
 * The prompt contains only the crawled site data and explicitly forbids the
 * model from inventing competitor domains. Missing context fields fall back
 * to neutral placeholders so the literal string "undefined" never reaches
 * the prompt, and a site architecture without a `properties` array renders
 * an empty property list instead of throwing.
 *
 * @param {object} params
 * @param {object} params.target - target site summary (domain, page_count / pageCount, …)
 * @param {*} params.keywordMatrix - passed through to formatKeywordTable
 * @param {*} params.headingStructure - passed through to formatHeadings
 * @param {object} [params.context] - project context (siteName, url, industry, audience, goal, maturity, site_architecture)
 * @returns {string} trimmed prompt text
 */
function buildSoloPrompt({ target, keywordMatrix, headingStructure, context = {} }) {
  // Same fallback the blog-draft prompt uses: site name, else the domain.
  const siteName = context.siteName || target.domain;
  const siteUrl = context.url || 'not specified';
  const industry = context.industry || 'not specified';
  const audience = context.audience || 'not specified';
  const goal = context.goal || 'not specified';
  const maturity = context.maturity || 'early stage';

  const architecture = context.site_architecture;
  let architectureSection = 'No site architecture configured — recommend generic URL slugs.';
  if (architecture) {
    // `properties` may be absent on a partially configured architecture.
    const properties = Array.isArray(architecture.properties) ? architecture.properties : [];
    const propertyLines = properties.map(p =>
      `- **${p.id}** (${p.url}, platform: ${p.platform})\n Best for: ${p.best_for}\n Difficulty: ${p.difficulty}${p.seo_note ? `\n SEO note: ${p.seo_note}` : ''}`
    ).join('\n');
    architectureSection = `\n${architecture.note || ''}\n\nAvailable publishing properties:\n${propertyLines}\n`;
  }

  return `
# SEO Site Audit — ${siteName}

You are an expert SEO strategist performing a solo site audit. You have ONLY the crawled site data below — no competitor data.

**CRITICAL RULES:**
- You have ZERO competitor data. Do NOT invent, hallucinate, or reference any competitor domains.
- Never fill "covered_by" with domain names you were not given.
- Base keyword and content recommendations on: (1) the crawled site data, (2) your knowledge of the "${industry}" industry and what audiences search for.
- Label all industry-knowledge suggestions as "industry research" — not "data-driven".
- Every URL slug you suggest must be a real path (e.g. "/blog/how-to-x"), never "/undefined".

---

## CONTEXT

**Site:** ${siteName} (${siteUrl})
**Industry:** ${industry}
**Target audience:** ${audience}
**Business goal:** ${goal}
**Current SEO maturity:** ${maturity}

### Site Architecture
${architectureSection}

---

## SITE DATA

${formatSiteSummary(target)}

### Pages crawled: ${target.page_count || target.pageCount || 0}
### Keyword coverage:
${formatKeywordTable(keywordMatrix, target.domain)}

### Heading structure:
${formatHeadings(headingStructure, target.domain)}

---

## ANALYSIS TASKS

### 1. KEYWORD OPPORTUNITIES
- Based on the site's existing content and the "${industry}" industry, identify 5-10 keyword phrases the site should target
- For each: search intent, estimated search demand (low/medium/high), difficulty, and whether to add to an existing page or create a new one
- Focus on keywords that match the site's actual product/service — no speculative gaps

### 2. LONG-TAIL OPPORTUNITIES
- Generate 10-20 specific long-tail phrases (3-6 words) from the site's content themes
- Focus on: question queries, feature queries, use-case queries
- For each: intent, page type, priority
- Weight toward commercial intent

### 3. CONTENT EXPANSION
- Topic areas the site should cover based on industry norms and audience needs
- Do NOT reference competitor domains — use "industry standard" or "common in ${industry}" instead
- For each: why it matters for this audience, suggested format, suggested title

### 4. QUICK WINS (existing pages to improve)
- Pages with thin content, missing structure, or weak metadata
- Only reference pages that appear in the crawled data above
- For each: specific fix, estimated impact

### 5. NEW PAGE SUGGESTIONS
- Specific new pages to create based on keyword opportunities
- For each: URL slug (real path like /blog/topic), title, target keyword, content angle

### 6. TECHNICAL SEO AUDIT
- Schema markup opportunities (FAQ, HowTo, Product, etc.)
- Meta description quality assessment
- H1/heading structure recommendations
- Do NOT compare to competitors — assess against SEO best practices

### 7. MARKET POSITIONING
- Based on the site's content and industry, what positioning should this site own?
- What audience need is underserved in this space?
- What is the site's clearest differentiator from its current content?

---

## OUTPUT SCHEMA

Respond ONLY with valid JSON in this exact structure:

{
"keyword_gaps": [
{
"keyword": "string — 2-4 word SEO phrase",
"intent": "informational|commercial|navigational|transactional",
"search_demand": "low|medium|high",
"difficulty": "low|medium|high",
"suggested_action": "add_to_existing|new_page",
"suggested_page": "string — URL path like /blog/topic or existing page URL",
"priority": "high|medium|low",
"source": "site_content|industry_research"
}
],
"long_tails": [
{
"phrase": "string",
"intent": "string",
"page_type": "blog|landing|doc|faq|comparison|glossary",
"priority": "high|medium|low",
"notes": "string"
}
],
"content_gaps": [
{
"topic": "string",
"why_it_matters": "string",
"format": "blog|comparison|use_case|glossary|how_to|landing",
"suggested_title": "string"
}
],
"quick_wins": [
{
"page": "string — URL from crawled data",
"issue": "string",
"fix": "string",
"impact": "high|medium|low"
}
],
"new_pages": [
{
"title": "string",
"target_keyword": "string",
"content_angle": "string",
"why": "string",
"priority": "high|medium|low"
}
],
"technical_gaps": [
{
"gap": "string",
"fix": "string"
}
],
"positioning": {
"market_context": "string — 2-3 sentences on the industry landscape",
"open_angle": "string — what positioning this site should own",
"target_differentiator": "string — clearest differentiator from current content"
}
}
`.trim();
}
|
|
189
|
+
|
|
190
|
+
// ── Competitive prompt (with competitors) ──────────────────────────────────
|
|
191
|
+
|
|
192
|
+
function buildCompetitivePrompt({ target, competitors, keywordMatrix, headingStructure, context }) {
|
|
28
193
|
return `
|
|
29
194
|
# SEO Competitive Intelligence Analysis — ${context.siteName}
|
|
30
195
|
|
|
@@ -59,7 +224,7 @@ For each content recommendation, rank ALL available properties as placement opti
|
|
|
59
224
|
|
|
60
225
|
${formatSiteSummary(target)}
|
|
61
226
|
|
|
62
|
-
### Pages crawled: ${target.pageCount}
|
|
227
|
+
### Pages crawled: ${target.page_count || target.pageCount || 0}
|
|
63
228
|
### Keyword coverage:
|
|
64
229
|
${formatKeywordTable(keywordMatrix, target.domain)}
|
|
65
230
|
|
|
@@ -218,7 +383,7 @@ Respond ONLY with valid JSON in this exact structure:
|
|
|
218
383
|
function formatSiteSummary(site) {
|
|
219
384
|
return `
|
|
220
385
|
- Domain: ${site.domain}
|
|
221
|
-
- Pages crawled: ${site.pageCount || 0}
|
|
386
|
+
- Pages crawled: ${site.page_count || site.pageCount || 0}
|
|
222
387
|
- Avg word count: ${Math.round(site.avg_word_count || 0)}
|
|
223
388
|
- Product types detected: ${site.product_types || 'unknown'}
|
|
224
389
|
- Pricing model: ${site.pricing_tiers || 'unknown'}
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Technical SEO Audit — reads crawl data from the DB and produces findings.
|
|
3
|
+
*
|
|
4
|
+
* Extended-data checks (gated via lib/gate.js `extended-data`):
|
|
5
|
+
* 1. Title length (>60 warn, missing err)
|
|
6
|
+
* 2. Meta description length (>160 warn, >320 err, missing err)
|
|
7
|
+
* 3. Noindex detection (meta robots OR X-Robots-Tag header)
|
|
8
|
+
* 4. Indexable pages missing from sitemap (set diff)
|
|
9
|
+
* 5. Redirect chain surfacing (uses final_url + redirect_chain columns)
|
|
10
|
+
* 6. Redirect-target summary for manual canonical cross-reference (canonical URLs are not stored, so "canonical points to a redirect" is not checked automatically)
|
|
11
|
+
*
|
|
12
|
+
* Additional optional pass (network-heavy, must be explicitly enabled):
|
|
13
|
+
* - Sitemap HEAD check: flags 3XX (warn) and 4XX/5XX (error) URLs in the sitemap itself.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import { gateSection } from '../lib/gate.js';
|
|
17
|
+
import { headCheckAll } from '../crawler/sitemap.js';
|
|
18
|
+
import {
|
|
19
|
+
getSitemapUrlsForDomain,
|
|
20
|
+
updateSitemapHeadResult,
|
|
21
|
+
} from '../db/db.js';
|
|
22
|
+
|
|
23
|
+
const TITLE_WARN = 60;
|
|
24
|
+
const DESC_WARN = 160;
|
|
25
|
+
const DESC_ERR = 320;
|
|
26
|
+
|
|
27
|
+
/**
 * Run the technical audit for a single domain.
 *
 * Gated: the checks only run when the `extended-data` gate is open; otherwise
 * `{ gated: true, findings: [], stats: {} }` is returned immediately.
 *
 * Findings produced:
 *   - title_missing / title_too_long
 *   - meta_desc_missing / meta_desc_too_long (warn above DESC_WARN, error above DESC_ERR)
 *   - noindex_header (X-Robots-Tag contains "noindex") / noindex_meta (page marked
 *     non-indexable without such a header)
 *   - redirect_chain (+ one redirect_targets_summary for manual canonical review)
 *   - indexable_missing_from_sitemap, or a single sitemap_inventory_empty when no
 *     sitemap URLs are recorded (a set diff against nothing would flag every page)
 *   - sitemap_redirect / sitemap_broken (only with `runSitemapHead: true`)
 *
 * @param {object} db - database handle exposing `prepare(sql).get()/.all()`
 * @param {object} [opts]
 * @param {string} opts.project - project name the domain belongs to
 * @param {string} opts.domain - domain to audit
 * @param {boolean} [opts.runSitemapHead=false] - run the HEAD pass over the sitemap inventory
 * @param {number} [opts.sitemapConcurrency=6] - concurrency passed to headCheckAll
 * @returns {Promise<object>} { gated, findings, stats } plus `error` when the domain is unknown
 */
export async function runTechnicalAudit(db, { project, domain, runSitemapHead = false, sitemapConcurrency = 6 } = {}) {
  if (!gateSection('extended-data')) {
    return { gated: true, findings: [], stats: {} };
  }

  const domainRow = db.prepare(
    'SELECT id, domain FROM domains WHERE domain = ? AND project = ?'
  ).get(domain, project);
  if (!domainRow) {
    return { gated: false, findings: [], stats: {}, error: `domain not found: ${domain}` };
  }
  const domainId = domainRow.id;

  const findings = [];

  // ── Page-level checks (read from pages + technical) ──
  const pages = db.prepare(`
    SELECT
      p.id, p.url, p.final_url, p.redirect_chain, p.x_robots_tag,
      p.is_indexable, p.status_code, p.title, p.meta_desc,
      t.has_canonical
    FROM pages p
    LEFT JOIN technical t ON t.page_id = p.id
    WHERE p.domain_id = ?
  `).all(domainId);

  const redirectTargets = new Set();

  for (const p of pages) {
    // 1. Title length
    if (!p.title) {
      findings.push({ type: 'title_missing', severity: 'error', url: p.url, details: 'No <title>' });
    } else if (p.title.length > TITLE_WARN) {
      findings.push({ type: 'title_too_long', severity: 'warn', url: p.url, details: `${p.title.length}/${TITLE_WARN}` });
    }

    // 2. Meta description length
    if (!p.meta_desc) {
      findings.push({ type: 'meta_desc_missing', severity: 'error', url: p.url, details: 'No meta description' });
    } else if (p.meta_desc.length > DESC_ERR) {
      findings.push({ type: 'meta_desc_too_long', severity: 'error', url: p.url, details: `${p.meta_desc.length}/${DESC_ERR}` });
    } else if (p.meta_desc.length > DESC_WARN) {
      findings.push({ type: 'meta_desc_too_long', severity: 'warn', url: p.url, details: `${p.meta_desc.length}/${DESC_WARN}` });
    }

    // 3. Noindex (meta OR X-Robots-Tag) — informational only (valid decision, not error).
    // A noindex header is reported whatever the stored is_indexable flag says.
    const xrt = (p.x_robots_tag || '').toLowerCase();
    if (xrt.includes('noindex')) {
      findings.push({ type: 'noindex_header', severity: 'info', url: p.url, details: `X-Robots-Tag: ${p.x_robots_tag}` });
    } else if (p.is_indexable === 0) {
      // NOTE(review): assumes is_indexable = 0 without a noindex header comes from
      // the meta robots tag — confirm against the crawler's is_indexable logic.
      findings.push({ type: 'noindex_meta', severity: 'info', url: p.url, details: 'Page is marked non-indexable without an X-Robots-Tag noindex header (likely meta robots)' });
    }

    // 5. Redirect chain — stored as JSON; anything that is not an array is ignored
    // (a JSON string would otherwise be counted character-by-character as hops).
    let chain = [];
    try {
      const parsed = p.redirect_chain ? JSON.parse(p.redirect_chain) : [];
      chain = Array.isArray(parsed) ? parsed : [];
    } catch {
      chain = [];
    }
    if (chain.length > 0) {
      const finalUrl = p.final_url || p.url;
      findings.push({
        type: 'redirect_chain',
        severity: chain.length >= 2 ? 'warn' : 'info',
        url: p.url,
        details: `${chain.length} hop(s) → ${finalUrl}`,
        hops: chain,
        finalUrl,
      });
      redirectTargets.add(finalUrl);
    }
  }

  // 6. Canonical-points-to-redirect — requires a second pass with canonical URLs.
  // `technical.has_canonical` is a boolean; the canonical URL itself isn't stored.
  // For now we surface the set of redirect *targets* so reviewers can cross-reference.
  if (redirectTargets.size > 0) {
    findings.push({
      type: 'redirect_targets_summary',
      severity: 'info',
      details: `${redirectTargets.size} redirect target URL(s) — review canonical tags pointing to any of these`,
      urls: [...redirectTargets],
    });
  }

  // 4. Indexable-but-not-in-sitemap (set diff)
  const sitemapRows = getSitemapUrlsForDomain(db, domainId) || [];
  if (sitemapRows.length === 0) {
    // With no inventory the diff would flag every indexable page; report once instead.
    if (pages.length > 0) {
      findings.push({
        type: 'sitemap_inventory_empty',
        severity: 'warn',
        details: 'No sitemap URLs recorded for this domain — indexable-vs-sitemap comparison skipped',
      });
    }
  } else {
    const sitemapSet = new Set(sitemapRows.map(r => r.url));
    const missing = pages.filter(p =>
      p.is_indexable === 1 &&
      p.status_code === 200 &&
      !sitemapSet.has(p.url) &&
      !sitemapSet.has(p.final_url || '')
    );
    for (const m of missing) {
      findings.push({
        type: 'indexable_missing_from_sitemap',
        severity: 'warn',
        url: m.url,
        details: 'Page is indexable (200) but not declared in sitemap',
      });
    }
  }

  // Optional: run HEAD pass over sitemap inventory
  let sitemapHeadStats = null;
  if (runSitemapHead && sitemapRows.length > 0) {
    // Prefer rows never checked before; if all were checked, re-check everything.
    // `== null` also covers rows where the column is absent (undefined).
    const uncheckedRows = sitemapRows.filter(r => r.head_checked_at == null);
    const rowsToCheck = uncheckedRows.length ? uncheckedRows : sitemapRows;
    let ok = 0;
    let redirected = 0;
    let broken = 0;
    let errored = 0;
    await headCheckAll(rowsToCheck, {
      concurrency: sitemapConcurrency,
      onResult: (row, res) => {
        updateSitemapHeadResult(db, row.id, res);
        if (!res.status) {
          // No status at all: counted as errored, no finding.
          errored++;
        } else if (res.status >= 200 && res.status < 300) {
          ok++;
        } else if (res.status >= 300 && res.status < 400) {
          redirected++;
          findings.push({
            type: 'sitemap_redirect',
            severity: 'warn',
            url: row.url,
            details: `Sitemap URL returns ${res.status}${res.location ? ` → ${res.location}` : ''}`,
          });
        } else if (res.status >= 400) {
          broken++;
          findings.push({
            type: 'sitemap_broken',
            severity: 'error',
            url: row.url,
            details: `Sitemap URL returns ${res.status}`,
          });
        }
      },
    });
    sitemapHeadStats = { checked: rowsToCheck.length, ok, redirected, broken, errored };
  }

  const stats = {
    pages: pages.length,
    sitemap_urls: sitemapRows.length,
    findings_total: findings.length,
    findings_by_severity: findings.reduce((acc, f) => {
      acc[f.severity] = (acc[f.severity] || 0) + 1;
      return acc;
    }, {}),
    sitemap_head: sitemapHeadStats,
  };

  return { gated: false, findings, stats };
}
|