seo-intel 1.2.5 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +29 -0
- package/analyses/aeo/index.js +252 -0
- package/analyses/aeo/scorer.js +254 -0
- package/analyses/blog-draft/index.js +227 -0
- package/analyses/blog-draft/prescorer.js +60 -0
- package/analyses/templates/cluster.js +209 -0
- package/analyses/templates/gsc-overlay.js +93 -0
- package/analyses/templates/index.js +425 -0
- package/analyses/templates/sampler.js +198 -0
- package/analyses/templates/scorer.js +149 -0
- package/analyses/templates/similarity.js +174 -0
- package/analysis/prompt-builder.js +272 -0
- package/analysis/topic-cluster-mapper.js +427 -0
- package/cli.js +124 -1
- package/extractor/qwen.js +558 -0
- package/lib/gate.js +1 -0
- package/package.json +4 -1
- package/reports/generate-html.js +183 -0
- package/server.js +6 -1
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AEO Blog Draft Generator — Data Gathering & Prompt Builder
|
|
3
|
+
*
|
|
4
|
+
* Pulls intelligence from the Ledger (keyword gaps, long-tails, citability gaps,
|
|
5
|
+
* entities, positioning) and builds a prompt that produces a publish-ready,
|
|
6
|
+
* AEO-optimised blog post in .md format with YAML frontmatter.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { getActiveInsights } from '../../db/db.js';
|
|
10
|
+
|
|
11
|
+
// ── Data Gathering ──────────────────────────────────────────────────────────
|
|
12
|
+
|
|
13
|
+
/**
 * Collect the Ledger data that the blog-draft prompt is built from.
 *
 * Reads the grouped insights via getActiveInsights() and runs three
 * best-effort queries (citability-gap insights, entity extractions and
 * high-scoring citability pages for target/owned domains). A failing query
 * leaves its list empty instead of aborting the whole gather step.
 *
 * @param {object} db - Database handle exposing `prepare(sql).all(...params)`.
 * @param {string} project - Project identifier used to scope every query.
 * @param {string|null} [topic=null] - Optional topic. Matching is a
 *   case-insensitive substring test. Long-tails and keyword gaps list matching
 *   entries first and then a few non-matching ones; keyword-inventor phrases
 *   are restricted to entries whose phrase or cluster mentions the topic.
 * @returns {{insights: object, citabilityGaps: object[], entityRows: object[],
 *   topCitablePages: object[], kwInventor: object[], longTails: object[],
 *   keywordGaps: object[], contentGaps: Array, topic: (string|null)}}
 */
export function gatherBlogDraftContext(db, project, topic = null) {
  const insights = getActiveInsights(db, project) ?? {};

  // Guard every insight list the same way content_gaps was already guarded.
  const allKeywordInventor = insights.keyword_inventor || [];
  const allLongTails = insights.long_tails || [];
  const allKeywordGaps = insights.keyword_gaps || [];

  // citability_gap insights — not in getActiveInsights grouped return
  let citabilityGaps = [];
  try {
    const rows = db.prepare(
      `SELECT data FROM insights WHERE project = ? AND type = 'citability_gap' AND status = 'active' ORDER BY last_seen DESC LIMIT 15`
    ).all(project);
    // Parse row by row so one malformed payload does not discard the rest.
    citabilityGaps = rows.flatMap((row) => {
      try {
        return [JSON.parse(row.data)];
      } catch {
        return [];
      }
    });
  } catch { /* table may not exist yet */ }

  // Top entities across target pages
  let entityRows = [];
  try {
    entityRows = db.prepare(`
      SELECT e.primary_entities, p.title, p.url
      FROM extractions e
      JOIN pages p ON p.id = e.page_id
      JOIN domains d ON d.id = p.domain_id
      WHERE d.project = ? AND (d.role = 'target' OR d.role = 'owned')
      AND e.primary_entities IS NOT NULL AND e.primary_entities != '[]'
      ORDER BY p.word_count DESC LIMIT 20
    `).all(project);
  } catch { /* extraction may not have run */ }

  // Best AEO-scoring pages (content to emulate)
  let topCitablePages = [];
  try {
    topCitablePages = db.prepare(`
      SELECT p.url, p.title, cs.total_score as score, cs.ai_intents, cs.tier
      FROM citability_scores cs
      JOIN pages p ON p.id = cs.page_id
      JOIN domains d ON d.id = p.domain_id
      WHERE d.project = ? AND (d.role = 'target' OR d.role = 'owned') AND cs.total_score >= 55
      ORDER BY cs.total_score DESC LIMIT 5
    `).all(project);
  } catch { /* AEO may not have run */ }

  const needle = topic ? String(topic).toLowerCase() : null;

  // Lenient matcher: no topic or no text counts as a match. Used to split a
  // list into "matching" and "other" halves.
  const matchesTopic = (text) => {
    if (!needle || !text) return true;
    return String(text).toLowerCase().includes(needle);
  };

  // Strict matcher: the text must exist and contain the topic. Needed when
  // two fields are OR-ed together, otherwise a missing `cluster` would make
  // every phrase pass the filter regardless of the topic.
  const mentionsTopic = (text) =>
    Boolean(text) && String(text).toLowerCase().includes(needle);

  const kwInventor = allKeywordInventor
    .filter((k) => !needle || mentionsTopic(k.phrase) || mentionsTopic(k.cluster))
    .slice(0, 30);

  const longTails = topic
    ? [
        ...allLongTails.filter((lt) => matchesTopic(lt.phrase)).slice(0, 20),
        ...allLongTails.filter((lt) => !matchesTopic(lt.phrase)).slice(0, 10),
      ]
    : allLongTails.slice(0, 30);

  const keywordGaps = topic
    ? [
        ...allKeywordGaps.filter((kg) => matchesTopic(kg.keyword)).slice(0, 15),
        ...allKeywordGaps.filter((kg) => !matchesTopic(kg.keyword)).slice(0, 10),
      ]
    : allKeywordGaps.filter((kg) => kg.priority === 'high').slice(0, 25);

  const contentGaps = (insights.content_gaps || []).slice(0, 8);

  return {
    insights,
    citabilityGaps,
    entityRows,
    topCitablePages,
    kwInventor,
    longTails,
    keywordGaps,
    contentGaps,
    topic,
  };
}
|
|
89
|
+
|
|
90
|
+
// ── Prompt Builder ──────────────────────────────────────────────────────────
|
|
91
|
+
|
|
92
|
+
/**
 * Render a value as a single markdown table cell: stringified, line breaks
 * collapsed to spaces and pipes escaped so the row keeps its column count.
 */
function toTableCell(value) {
  return String(value).replace(/\s*\r?\n\s*/g, ' ').replace(/\|/g, '\\|');
}

/**
 * Build the LLM prompt for an AEO-optimised blog post draft.
 *
 * The prompt is assembled from seven sections: role, site context, topic,
 * intelligence tables (keyword gaps, long-tails, keyword-inventor phrases,
 * citability gaps, content gaps), structural requirements, language and
 * output format. Sections whose source list is empty are omitted.
 *
 * @param {object} context - Result of gatherBlogDraftContext(). Missing lists
 *   are treated as empty.
 * @param {object} options
 * @param {object} options.config - Project config; `context.siteName`,
 *   `context.industry`, `context.audience`, `context.goal`, `target.domain`
 *   and `target.url` are read.
 * @param {string} [options.lang='en'] - `'fi'` selects Finnish; anything else
 *   selects English.
 * @param {string|null} [options.topic=null] - Topic to focus on. Falls back to
 *   `context.topic` so the prompt agrees with how the data was filtered.
 * @returns {string} The complete prompt text.
 */
export function buildBlogDraftPrompt(context, { config, lang = 'en', topic = null }) {
  const {
    longTails = [],
    keywordGaps = [],
    citabilityGaps = [],
    entityRows = [],
    topCitablePages = [],
    kwInventor = [],
    contentGaps = [],
    insights = {},
  } = context ?? {};
  const isFi = lang === 'fi';
  const focusTopic = topic ?? context?.topic ?? null;

  const siteContext = config?.context ?? {};
  const target = config?.target ?? {};
  const siteName = siteContext.siteName || target.domain || 'N/A';
  const siteUrl = target.url || 'N/A';

  // Extract unique entity names from extraction data. Entries are either plain
  // strings or objects carrying a `name`; anything else is skipped so it can
  // neither throw nor end up as "[object Object]" in the prompt.
  const allEntities = new Set();
  for (const row of entityRows) {
    let parsed;
    try {
      parsed = JSON.parse(row.primary_entities);
    } catch {
      continue;
    }
    if (!Array.isArray(parsed)) continue;
    for (const entity of parsed) {
      const name = typeof entity === 'string' ? entity : entity?.name;
      if (typeof name === 'string' && name.trim()) allEntities.add(name.trim());
    }
  }
  const topEntities = [...allEntities].slice(0, 15);

  // ── Section 1: Role ──
  let prompt = `You are an expert content strategist and copywriter specialising in AEO (Answer Engine Optimisation).

Your task: write a complete, publish-ready blog post draft in ${isFi ? 'Finnish' : 'English'}.
The post must score 70+ on the AEO citability scale (entity authority, structured claims, answer density, Q&A proximity, freshness signals, schema coverage).

`;

  // ── Section 2: Site intelligence ──
  prompt += `## Site Context

- **Site:** ${siteName} (${siteUrl})
- **Industry:** ${siteContext.industry || 'N/A'}
- **Audience:** ${siteContext.audience || 'N/A'}
- **Goal:** ${siteContext.goal || 'N/A'}
`;

  if (insights?.positioning) {
    prompt += `- **Positioning:** ${typeof insights.positioning === 'string' ? insights.positioning : JSON.stringify(insights.positioning)}\n`;
  }

  if (topEntities.length) {
    prompt += `- **Core entities:** ${topEntities.join(', ')}\n`;
  }

  if (topCitablePages.length) {
    prompt += `\n### Highest-scoring pages on the site (emulate their structure)\n`;
    for (const p of topCitablePages) {
      prompt += `- ${p.url} — AEO score: ${p.score}/100 (${p.tier})\n`;
    }
  }

  // ── Section 3: Topic focus ──
  prompt += `\n## Topic\n\n`;
  if (focusTopic) {
    prompt += `Primary focus: **${focusTopic}**. All keyword and gap data below has been filtered to this topic. Build the entire post around this subject.\n`;
  } else {
    prompt += `Select the highest-opportunity topic from the gaps below. Choose the gap that: (a) has the most keyword_gap entries or (b) is flagged as a high priority long-tail. Explain your topic choice in the frontmatter \`topic_selection_rationale\` field.\n`;
  }

  // ── Section 4: Intelligence data ──
  if (keywordGaps.length) {
    prompt += `\n## Keyword Gaps to Target (include these as primary/secondary keywords)\n\n`;
    prompt += `| Keyword | Priority | Notes |\n|---|---|---|\n`;
    for (const kg of keywordGaps) {
      const notes = String(kg.notes || '').slice(0, 80);
      prompt += `| ${toTableCell(kg.keyword || kg.phrase || '—')} | ${toTableCell(kg.priority || 'medium')} | ${toTableCell(notes)} |\n`;
    }
  }

  if (longTails.length) {
    prompt += `\n## Long-tail Phrases to Answer (each should have a direct answer in the post)\n\n`;
    prompt += `| Phrase | Intent | Priority |\n|---|---|---|\n`;
    for (const lt of longTails) {
      prompt += `| ${toTableCell(lt.phrase || '—')} | ${toTableCell(lt.intent || '—')} | ${toTableCell(lt.priority || 'medium')} |\n`;
    }
  }

  if (kwInventor.length) {
    prompt += `\n## Keyword Inventor Phrases (weave these naturally into headings/body)\n\n`;
    for (const kw of kwInventor.slice(0, 20)) {
      prompt += `- "${kw.phrase}" (${kw.type || 'traditional'}, ${kw.intent || '—'})\n`;
    }
  }

  if (citabilityGaps.length) {
    prompt += `\n## Citability Gaps (pages scoring <60 on AEO — model the fix in this post)\n\n`;
    prompt += `| URL | Score | Weakest Signals |\n|---|---|---|\n`;
    for (const cg of citabilityGaps) {
      // `??` rather than `||`: a score of 0 is a real score, not a missing one.
      prompt += `| ${toTableCell(cg.url || '—')} | ${toTableCell(cg.score ?? '—')} | ${toTableCell(cg.weakest || cg.weakest_signal || '—')} |\n`;
    }
  }

  if (contentGaps.length) {
    prompt += `\n## Content Gaps (topics competitors cover that you don't)\n\n`;
    for (const cg of contentGaps) {
      const desc = typeof cg === 'string' ? cg : (cg.topic || cg.description || cg.gap || JSON.stringify(cg));
      prompt += `- ${desc}\n`;
    }
  }

  // ── Section 5: AEO structural requirements ──
  prompt += `
## AEO Structural Requirements

The draft MUST include:
1. YAML frontmatter with: title, slug, description (155 chars max), primary_keyword, secondary_keywords[], date (${new Date().toISOString().slice(0, 10)}), updated (same), lang (${lang}), tags[]${!focusTopic ? ', topic_selection_rationale' : ''}
2. An H1 that contains the primary keyword
3. A 2-3 sentence summary immediately after the H1 (answer-first structure — inverted pyramid). This paragraph will be cited by AI assistants.
4. Minimum 6 H2 subheadings
5. At least 3 H2s phrased as direct questions (What is / How to / Why / When)
6. At least one numbered or bulleted list with 4+ items
7. At least one "X is Y because Z" definitional sentence per major concept
8. A FAQ section at the end with minimum 4 Q&A pairs (### H3 questions, 2-4 sentence answers)
9. A closing CTA paragraph referencing ${siteName}
10. Word count: 1,200-2,000 words
11. Internal link suggestions: include 2-3 \`[anchor text](URL)\` links back to the site where natural
`;

  // ── Section 6: Language ──
  if (isFi) {
    prompt += `
## Language: Finnish

Write in Finnish. Use informal, direct register (sinuttelu where natural). Avoid marketing clichés common in Finnish B2B copy. Prefer short sentences. Finnish SEO keywords must appear in their exact searched base form in headings — Finnish inflection reduces exact-match keyword presence.
`;
  } else {
    prompt += `
## Language: English

Write in clear, direct international English. No filler phrases. No "in today's digital landscape" or "it's no secret that" openers. Every sentence should contain a fact, insight, or actionable point.
`;
  }

  // ── Section 7: Output format ──
  prompt += `
## Output Format

Respond with ONLY the complete markdown document. Start with --- (YAML frontmatter open fence). End with the FAQ section and CTA. No explanation before or after. No triple backticks wrapping the response.
`;

  return prompt;
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AEO Pre-Scorer — scores a generated markdown draft against citability signals
|
|
3
|
+
*
|
|
4
|
+
* Uses the same scorePage() function as the full AEO audit, but constructs
|
|
5
|
+
* synthetic inputs from the markdown text instead of reading from the DB.
|
|
6
|
+
*
|
|
7
|
+
* Freshness always scores 0 (no publish date yet) — the reported score
|
|
8
|
+
* accounts for this by adding +10 for "what it will score once published."
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { scorePage } from '../aeo/scorer.js';
|
|
12
|
+
|
|
13
|
+
/**
 * Score a generated markdown draft with the shared scorePage() function.
 *
 * Builds the inputs scorePage() expects from the markdown text alone: body
 * text and word count, the heading outline, entity candidates taken from
 * capitalised words in H1–H3 headings, and an optional `schema_type` read
 * from the YAML frontmatter. Publish/modified dates are passed as null
 * because the draft has not been published.
 *
 * @param {string} markdownText - Draft markdown, optionally starting with a
 *   `---` fenced YAML frontmatter block. LF and CRLF line endings are accepted.
 * @returns {object} The scorePage() result plus `wordCount` and `headingCount`.
 */
export function prescore(markdownText) {
  // Normalise line endings first; otherwise the frontmatter fence never
  // matches on CRLF input and the YAML would be scored as body text.
  const source = String(markdownText ?? '').replace(/\r\n?/g, '\n');

  // Split off YAML frontmatter: an opening `---` line at the very start and
  // the next line consisting only of `---`.
  const fm = source.match(/^---[ \t]*\n(?:([\s\S]*?)\n)?---[ \t]*(?:\n|$)/);
  const frontmatter = fm ? (fm[1] ?? '') : '';
  const body = fm ? source.slice(fm[0].length) : source;

  // Extract headings, ignoring `#` lines inside fenced code blocks (shell
  // comments and the like are not part of the outline).
  const headings = [];
  let openFence = null;
  for (const line of body.split('\n')) {
    const fence = line.match(/^ {0,3}(`{3,}|~{3,})/);
    if (fence) {
      const marker = fence[1][0];
      if (openFence === null) openFence = marker;
      else if (openFence === marker) openFence = null;
      continue;
    }
    if (openFence !== null) continue;
    const m = line.match(/^(#{1,6})\s+(.+)$/);
    if (m) headings.push({ level: m[1].length, text: m[2].trim() });
  }

  // Word count
  const wordCount = body.split(/\s+/).filter(Boolean).length;

  // Extract frontmatter fields (surrounding quotes are not part of the value)
  let fmSchemaType = null;
  const schemaLine = frontmatter.match(/^[ \t]*schema_type:[ \t]*(.+)$/m);
  if (schemaLine) {
    fmSchemaType = schemaLine[1].trim().replace(/^["']|["']$/g, '') || null;
  }

  // Build synthetic page object
  const syntheticPage = {
    body_text: body,
    word_count: wordCount,
    published_date: null, // not published yet
    modified_date: null,
  };

  // Entity candidates: runs of capitalised words in H1–H3 headings. A
  // lookbehind replaces `\b` because `\b` is ASCII-only: it never matches in
  // front of Ä/Ö/Å at the start of a word and wrongly matches after ä/ö/å.
  const entityPattern =
    /(?<![A-Za-zÄÖÅäöå0-9_])[A-ZÄÖÅ][a-zäöå]+(?:\s+[A-ZÄÖÅ][a-zäöå]+)*/g;
  const entityCandidates = headings
    .filter((h) => h.level <= 3)
    .flatMap((h) => h.text.match(entityPattern) || []);
  const entities = [...new Set(entityCandidates)].slice(0, 8);

  const schemaTypes = fmSchemaType ? [fmSchemaType] : [];
  const schemas = [];

  const result = scorePage(syntheticPage, headings, entities, schemaTypes, schemas, 'Informational');

  return {
    ...result,
    wordCount,
    headingCount: headings.length,
  };
}
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* URL Pattern Clustering — Phase 1
|
|
3
|
+
*
|
|
4
|
+
* Takes sitemap URLs, detects parametric patterns, groups them.
|
|
5
|
+
* Pure function — no I/O, no side effects.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// Path segments that are always treated as structural, compared in lower case.
// Built once at module load instead of on every isVariable() call.
const STRUCTURAL_SEGMENTS = new Set([
  'api', 'docs', 'blog', 'news', 'about', 'pricing', 'features',
  'help', 'support', 'contact', 'legal', 'terms', 'privacy',
  'login', 'signup', 'register', 'dashboard', 'settings',
  'token', 'tokens', 'swap', 'trade', 'perps', 'perpetuals',
  'pool', 'pools', 'stake', 'staking', 'bridge', 'earn',
  'governance', 'vote', 'proposals', 'stats', 'analytics',
  'markets', 'pairs', 'explorer', 'episodes', 'categories',
  'tags', 'products', 'collections', 'pages', 'posts',
]);

/**
 * Is this path segment a "variable" (one of N possible values)
 * vs a "constant" (structural path like 'swap', 'docs', 'blog')?
 *
 * Heuristic only. Checks run in order and the first match decides: version
 * prefixes and known structural words are constants; numbers, hex/UUID-like
 * strings, segments containing `-`, `_` or `.`, short all-uppercase strings,
 * uppercase-plus-digit mixes and very long segments are variables.
 *
 * @param {string} segment - A single URL path segment (no slashes).
 * @returns {boolean} true when the segment looks like a parameter value.
 */
function isVariable(segment) {
  // Version prefixes stay constant: v1, v2, v3... Case-insensitive so `V2` is
  // treated like `v2`, matching the case-insensitive structural check below.
  if (/^v\d+$/i.test(segment)) return false;

  // Common structural words stay constant
  if (STRUCTURAL_SEGMENTS.has(segment.toLowerCase())) return false;

  // Purely numeric → variable (IDs, dates)
  if (/^\d+$/.test(segment)) return true;

  // UUID or hash-like
  if (/^[0-9a-f-]{8,}$/i.test(segment)) return true;

  // Hex address (0x...)
  if (/^0x[0-9a-fA-F]+$/.test(segment)) return true;

  // Contains separator characters typical of slugs/pairs: SOL-USDC, my-blog-post
  if (/[-_.]/.test(segment) && segment.length > 2) return true;

  // All uppercase short string → likely a ticker: SOL, BONK, USDT, ETH
  if (/^[A-Z0-9]{2,10}$/.test(segment)) return true;

  // Mixed case with digits → product codes, IDs
  if (/[A-Z]/.test(segment) && /\d/.test(segment)) return true;

  // Very long segments are likely slugs or IDs
  if (segment.length > 30) return true;

  return false;
}
|
|
52
|
+
|
|
53
|
+
// Ordered [name, pattern] detectors; the first one matching more than 60% of
// the sampled values wins. None of the patterns use the `g` flag, so calling
// `.test()` repeatedly is stateless.
const PARAM_NAME_DETECTORS = [
  // Pairs: two alphanumeric parts joined by a single hyphen (SOL-USDC)
  ['pair', /^[A-Za-z0-9]+-[A-Za-z0-9]+$/],
  // Token tickers: 2-10 uppercase/digit chars containing at least one letter.
  // The letter requirement keeps purely numeric IDs out of this bucket so
  // they can reach the `id` detector below.
  ['symbol', /^(?=[A-Z0-9]*[A-Z])[A-Z0-9]{2,10}$/],
  // Slugs: lowercase with hyphens
  ['slug', /^[a-z0-9]+(-[a-z0-9]+)+$/],
  // Numeric IDs
  ['id', /^\d+$/],
  // Hex hashes/addresses
  ['hash', /^(0x)?[0-9a-f]{8,}$/i],
];

/**
 * Infer a semantic name for a param position based on observed values.
 *
 * Only the first 100 values are inspected.
 *
 * @param {string[]} values - Segment values observed at this position.
 * @param {number} position - Zero-based index of the param within its pattern,
 *   used for the fallback name.
 * @returns {string} 'pair', 'symbol', 'slug', 'id', 'hash', or `param<position>`
 *   when no detector reaches the 60% threshold (including for empty input).
 */
function inferParamName(values, position) {
  const sample = values.slice(0, 100);
  const threshold = sample.length * 0.6;

  for (const [name, pattern] of PARAM_NAME_DETECTORS) {
    const hits = sample.filter((v) => pattern.test(v)).length;
    if (hits > threshold) return name;
  }

  return `param${position}`;
}
|
|
81
|
+
|
|
82
|
+
/**
 * Cluster sitemap URLs into template groups.
 *
 * Each URL path is reduced to a pattern key in which variable segments (as
 * decided by isVariable()) become `{p0}`, `{p1}`, … and constant segments are
 * lower-cased. URLs sharing a key form a cluster; clusters with at least
 * `minGroupSize` URLs become template groups whose placeholders are renamed
 * via inferParamName(). Homepage URLs and unparseable URLs are skipped and
 * appear in neither `groups` nor `ungrouped`, but still count in
 * `stats.totalUrls`.
 *
 * @param {Array<{url: string, lastmod?: string}>} sitemapEntries
 * @param {object} opts
 * @param {number} opts.minGroupSize — min URLs to qualify as template (default 10)
 * @param {number} opts.maxSegments — max path depth to consider (default 8)
 * @returns {{ groups: TemplateGroup[], ungrouped: string[], stats: object }}
 */
export function clusterUrls(sitemapEntries, opts = {}) {
  const minGroupSize = opts.minGroupSize || 10;
  const maxSegments = opts.maxSegments || 8;
  // Cap on stored sample values per param position, to bound memory.
  const maxParamSamples = 200;

  // patternKey → { pattern parts, urls[], paramValues by position }
  const clusters = new Map();

  for (const entry of sitemapEntries) {
    let pathname;
    try {
      pathname = new URL(entry.url).pathname;
    } catch {
      continue;
    }

    // Normalize
    pathname = pathname.replace(/\/+$/, '') || '/';

    // Homepage is always unique
    if (pathname === '/') continue;

    const patternParts = [];
    const paramValues = []; // value of each variable segment, in path order
    for (const segment of pathname.split('/').filter(Boolean).slice(0, maxSegments)) {
      if (isVariable(segment)) {
        patternParts.push(`{p${paramValues.length}}`);
        paramValues.push(segment);
      } else {
        patternParts.push(segment.toLowerCase());
      }
    }

    const patternKey = `/${patternParts.join('/')}`;

    let cluster = clusters.get(patternKey);
    if (!cluster) {
      cluster = {
        patternKey,
        patternParts,
        urls: [],
        paramPositions: {},
        lastmods: [],
      };
      clusters.set(patternKey, cluster);
    }

    cluster.urls.push(entry.url);
    if (entry.lastmod) cluster.lastmods.push(entry.lastmod);

    paramValues.forEach((value, idx) => {
      const key = `p${idx}`;
      if (!cluster.paramPositions[key]) cluster.paramPositions[key] = [];
      if (cluster.paramPositions[key].length < maxParamSamples) {
        cluster.paramPositions[key].push(value);
      }
    });
  }

  // Separate into template groups (>= minGroupSize) and ungrouped
  const groups = [];
  const ungrouped = [];

  for (const [patternKey, cluster] of clusters) {
    if (cluster.urls.length < minGroupSize) {
      ungrouped.push(...cluster.urls);
      continue;
    }

    // Rename params to semantic names. Names are made unique within a pattern
    // (slug, slug2, …); otherwise two positions inferring the same name would
    // overwrite each other in `params` and produce an ambiguous pattern.
    const params = {};
    const takenNames = new Set();
    let paramIdx = 0;
    const renamedParts = cluster.patternParts.map((part) => {
      const match = part.match(/^\{(p\d+)\}$/);
      if (!match) return part;

      const values = cluster.paramPositions[match[1]] || [];
      const baseName = inferParamName(values, paramIdx);
      paramIdx++;

      let name = baseName;
      for (let suffix = 2; takenNames.has(name); suffix++) {
        name = `${baseName}${suffix}`;
      }
      takenNames.add(name);

      params[name] = values.slice(0, 50);
      return `{${name}}`;
    });

    const pattern = `/${renamedParts.join('/')}`;
    // Sort a copy so the cluster's own list keeps its insertion order.
    const sortedLastmods = [...cluster.lastmods].sort();

    groups.push({
      pattern,
      patternKey,
      params,
      urls: cluster.urls,
      urlCount: cluster.urls.length,
      depth: cluster.patternParts.length,
      firstSeen: sortedLastmods[0] || null,
      lastSeen: sortedLastmods[sortedLastmods.length - 1] || null,
    });
  }

  // Sort by URL count descending
  groups.sort((a, b) => b.urlCount - a.urlCount);

  const totalGrouped = groups.reduce((sum, g) => sum + g.urlCount, 0);
  const totalUrls = sitemapEntries.length;

  return {
    groups,
    ungrouped,
    stats: {
      totalUrls,
      totalGroups: groups.length,
      totalGrouped,
      largestGroup: groups[0]?.urlCount || 0,
      coverage: totalUrls > 0 ? totalGrouped / totalUrls : 0,
    },
  };
}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GSC Overlay — Phase 3
|
|
3
|
+
*
|
|
4
|
+
* Cross-references template groups against Google Search Console per-URL data.
|
|
5
|
+
* Pure computation — no I/O.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Normalize a URL into a lookup key for GSC matching.
 * GSC reports URLs inconsistently — trailing slashes, www, http vs https.
 *
 * The key is `host + path`, lower-cased, with the scheme, a leading `www.`,
 * the query string, the fragment and trailing slashes removed, so those
 * variants of the same page all map to the same key. Input that cannot be
 * parsed as a URL is only lower-cased and stripped of trailing slashes.
 *
 * @param {string} url - Absolute URL as found in the sitemap or in GSC data.
 * @returns {string} Normalized key; intended for comparison, not for fetching.
 */
function normalizeUrl(url) {
  const raw = String(url ?? '');
  try {
    const { hostname, pathname } = new URL(raw);
    const host = hostname.replace(/^www\./i, '');
    return `${host}${pathname}`.replace(/\/+$/, '').toLowerCase();
  } catch {
    return raw.toLowerCase().replace(/\/+$/, '');
  }
}
|
|
20
|
+
|
|
21
|
+
/**
 * Cross-reference template groups with GSC pages data.
 *
 * Every group is returned as a shallow copy extended with GSC aggregates.
 * When no GSC data is supplied the aggregate fields are null and `topGscUrls`
 * is empty. Otherwise each group URL is looked up by its normalized form:
 *  - `gscUrlsWithImpressions` counts group URLs whose GSC entry has
 *    impressions > 0, and `indexationEfficiency` is that count divided by
 *    `group.urlCount`;
 *  - clicks, impressions, average position and `topGscUrls` count each GSC
 *    entry once per group, even if several group URLs normalize to it.
 *
 * @param {TemplateGroup[]} groups — from cluster.js
 * @param {Array<{url: string, clicks: number, impressions: number, ctr: number, position: number}>|null} gscPages
 * @returns {GscOverlayResult[]}
 */
export function overlayGsc(groups, gscPages) {
  if (!gscPages || gscPages.length === 0) {
    // No GSC data — return groups with null GSC fields
    return groups.map((g) => ({
      ...g,
      gscUrlsWithImpressions: null,
      gscTotalClicks: null,
      gscTotalImpressions: null,
      gscAvgPosition: null,
      indexationEfficiency: null,
      topGscUrls: [],
    }));
  }

  // Build normalized URL → GSC entry lookup
  const gscMap = new Map();
  for (const entry of gscPages) {
    const key = normalizeUrl(entry.url);
    // Keep the one with more impressions if dupes
    const existing = gscMap.get(key);
    if (!existing || entry.impressions > existing.impressions) {
      gscMap.set(key, entry);
    }
  }

  return groups.map((group) => {
    let urlsWithImpressions = 0;
    let totalClicks = 0;
    let totalImpressions = 0;
    let positionSum = 0;
    let positionCount = 0;
    const topUrls = [];
    // Normalized keys already added to the totals for this group. Two sitemap
    // URLs (e.g. with and without trailing slash) can resolve to the same GSC
    // entry; its traffic must only be counted once.
    const countedKeys = new Set();

    for (const url of group.urls) {
      const key = normalizeUrl(url);
      const gscEntry = gscMap.get(key);
      if (!gscEntry || !(gscEntry.impressions > 0)) continue;

      urlsWithImpressions++;
      if (countedKeys.has(key)) continue;
      countedKeys.add(key);

      totalClicks += gscEntry.clicks || 0;
      totalImpressions += gscEntry.impressions || 0;
      if (gscEntry.position > 0) {
        positionSum += gscEntry.position;
        positionCount++;
      }
      topUrls.push({
        url: gscEntry.url,
        clicks: gscEntry.clicks,
        impressions: gscEntry.impressions,
        position: gscEntry.position,
      });
    }

    // Sort top URLs by impressions desc, take top 10
    topUrls.sort((a, b) => b.impressions - a.impressions);

    return {
      ...group,
      gscUrlsWithImpressions: urlsWithImpressions,
      gscTotalClicks: totalClicks,
      gscTotalImpressions: totalImpressions,
      gscAvgPosition: positionCount > 0 ? Math.round((positionSum / positionCount) * 10) / 10 : null,
      indexationEfficiency: group.urlCount > 0 ? urlsWithImpressions / group.urlCount : 0,
      topGscUrls: topUrls.slice(0, 10),
    };
  });
}
|