seo-intel 1.2.5 → 1.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +9 -0
- package/analyses/aeo/index.js +252 -0
- package/analyses/aeo/scorer.js +254 -0
- package/analyses/templates/cluster.js +209 -0
- package/analyses/templates/gsc-overlay.js +93 -0
- package/analyses/templates/index.js +425 -0
- package/analyses/templates/sampler.js +198 -0
- package/analyses/templates/scorer.js +149 -0
- package/analyses/templates/similarity.js +174 -0
- package/analysis/prompt-builder.js +272 -0
- package/analysis/topic-cluster-mapper.js +427 -0
- package/cli.js +0 -1
- package/extractor/qwen.js +558 -0
- package/package.json +4 -1
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* URL Pattern Clustering — Phase 1
|
|
3
|
+
*
|
|
4
|
+
* Takes sitemap URLs, detects parametric patterns, groups them.
|
|
5
|
+
* Pure function — no I/O, no side effects.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Path segments that are treated as structural (constant) parts of a URL
 * even though a later heuristic might otherwise flag them as variable.
 * Built once at module load instead of on every isVariable() call.
 */
const STRUCTURAL_SEGMENTS = new Set([
  'api', 'docs', 'blog', 'news', 'about', 'pricing', 'features',
  'help', 'support', 'contact', 'legal', 'terms', 'privacy',
  'login', 'signup', 'register', 'dashboard', 'settings',
  'token', 'tokens', 'swap', 'trade', 'perps', 'perpetuals',
  'pool', 'pools', 'stake', 'staking', 'bridge', 'earn',
  'governance', 'vote', 'proposals', 'stats', 'analytics',
  'markets', 'pairs', 'explorer', 'episodes', 'categories',
  'tags', 'products', 'collections', 'pages', 'posts',
]);

/**
 * Is this path segment a "variable" (one of N possible values)
 * vs a "constant" (structural path like 'swap', 'docs', 'blog')?
 *
 * The checks run in a fixed order and the first match wins, so the
 * "constant" rules (version prefix, structural word) take precedence
 * over every "variable" heuristic below them.
 *
 * @param {string} segment — a single, non-empty path segment
 * @returns {boolean} true when the segment looks like a parameter value
 */
function isVariable(segment) {
  // Version prefixes stay constant: v1, v2, v3... (lowercase "v" only)
  if (/^v\d+$/.test(segment)) return false;

  // Common structural words stay constant (case-insensitive)
  if (STRUCTURAL_SEGMENTS.has(segment.toLowerCase())) return false;

  // Purely numeric → variable (IDs, dates)
  if (/^\d+$/.test(segment)) return true;

  // UUID or hash-like: 8+ hex digits and/or hyphens
  if (/^[0-9a-f-]{8,}$/i.test(segment)) return true;

  // Hex address (0x...)
  if (/^0x[0-9a-fA-F]+$/.test(segment)) return true;

  // Contains separator characters typical of slugs/pairs: SOL-USDC, my-blog-post
  if (/[-_.]/.test(segment) && segment.length > 2) return true;

  // All uppercase short string → likely a ticker: SOL, BONK, USDT, ETH
  if (/^[A-Z0-9]{2,10}$/.test(segment)) return true;

  // Uppercase letter together with a digit → product codes, IDs
  if (/[A-Z]/.test(segment) && /\d/.test(segment)) return true;

  // Very long segments are likely slugs or IDs
  if (segment.length > 30) return true;

  return false;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Infer a semantic name for a param position based on observed values.
 *
 * Only the first 100 values are inspected. Categories are tried in a fixed
 * order — 'pair', 'symbol', 'slug', 'id', 'hash' — and the first one that
 * more than 60% of the sampled values match is returned. When nothing
 * reaches that share (including an empty list) the generic name
 * `param<position>` is returned.
 *
 * @param {string[]} values — observed values for this param position
 * @param {number} position — zero-based index of the param in its pattern
 * @returns {string} the inferred name
 */
function inferParamName(values, position) {
  const sample = values.slice(0, 100);
  const threshold = sample.length * 0.6;

  // True when strictly more than 60% of the sample matches the regex.
  // None of the regexes below use the g/y flags, so .test() is stateless.
  const mostMatch = (re) => sample.filter(v => re.test(v)).length > threshold;

  // Pairs: two alphanumeric tokens joined by a single hyphen (e.g. SOL-USDC).
  // Case and token length are not restricted.
  if (mostMatch(/^[A-Za-z0-9]+-[A-Za-z0-9]+$/)) return 'pair';

  // Token tickers: 2-10 uppercase alphanumerics containing at least one
  // letter. Requiring a letter keeps purely numeric IDs out of this bucket
  // so they can reach the 'id' check below.
  if (mostMatch(/^(?=[A-Z0-9]*[A-Z])[A-Z0-9]{2,10}$/)) return 'symbol';

  // Slugs: lowercase alphanumerics with hyphens
  if (mostMatch(/^[a-z0-9]+(-[a-z0-9]+)+$/)) return 'slug';

  // Numeric IDs
  if (mostMatch(/^\d+$/)) return 'id';

  // Hex hashes/addresses (optionally 0x-prefixed, 8+ hex digits)
  if (mostMatch(/^(0x)?[0-9a-f]{8,}$/i)) return 'hash';

  return `param${position}`;
}
|
|
81
|
+
|
|
82
|
+
/**
 * Cluster sitemap URLs into template groups.
 *
 * Each URL path is split into segments; segments that isVariable() flags
 * are replaced by positional placeholders ({p0}, {p1}, ...) and the
 * resulting pattern string is used as the cluster key. Clusters with at
 * least `minGroupSize` URLs become template groups whose placeholders are
 * renamed via inferParamName(); URLs of smaller clusters go to `ungrouped`.
 *
 * Entries whose URL cannot be parsed, and the homepage ("/"), are skipped
 * but still counted in `stats.totalUrls`.
 *
 * @param {Array<{url: string, lastmod?: string}>} sitemapEntries
 * @param {object} opts
 * @param {number} opts.minGroupSize — min URLs to qualify as template (default 10)
 * @param {number} opts.maxSegments — max path depth to consider (default 8)
 * @returns {{ groups: TemplateGroup[], ungrouped: string[], stats: object }}
 */
export function clusterUrls(sitemapEntries, opts = {}) {
  // Falsy values (including 0) fall back to the defaults.
  const minGroupSize = opts.minGroupSize || 10;
  const maxSegments = opts.maxSegments || 8;

  // patternKey → { patternKey, patternParts, urls[], paramPositions, lastmods[] }
  const clusters = new Map();

  for (const entry of sitemapEntries) {
    let pathname;
    try {
      pathname = new URL(entry.url).pathname;
    } catch {
      // Unparseable entry — skip it rather than abort the whole run.
      continue;
    }

    // Normalize: drop trailing slashes
    pathname = pathname.replace(/\/+$/, '') || '/';

    // Homepage is always unique
    if (pathname === '/') continue;

    const segments = pathname.split('/').filter(Boolean).slice(0, maxSegments);
    const patternParts = [];
    // paramValues[i] is the concrete value this URL has for placeholder {p<i>}
    const paramValues = [];

    for (const segment of segments) {
      if (isVariable(segment)) {
        patternParts.push(`{p${paramValues.length}}`);
        paramValues.push(segment);
      } else {
        patternParts.push(segment.toLowerCase());
      }
    }

    const patternKey = '/' + patternParts.join('/');

    let cluster = clusters.get(patternKey);
    if (!cluster) {
      cluster = {
        patternKey,
        patternParts,
        urls: [],
        paramPositions: {},
        lastmods: [],
      };
      clusters.set(patternKey, cluster);
    }

    cluster.urls.push(entry.url);
    if (entry.lastmod) cluster.lastmods.push(entry.lastmod);

    // Collect param values (cap at 200 per position for memory)
    paramValues.forEach((value, idx) => {
      const key = `p${idx}`;
      if (!cluster.paramPositions[key]) cluster.paramPositions[key] = [];
      if (cluster.paramPositions[key].length < 200) {
        cluster.paramPositions[key].push(value);
      }
    });
  }

  // Separate into template groups (>= minGroupSize) and ungrouped
  const groups = [];
  const ungrouped = [];

  for (const [patternKey, cluster] of clusters) {
    if (cluster.urls.length < minGroupSize) {
      for (const url of cluster.urls) ungrouped.push(url);
      continue;
    }

    // Rename positional placeholders to semantic names
    const params = {};
    const usedNames = new Set();
    let paramIdx = 0;

    const renamedParts = cluster.patternParts.map((part) => {
      const match = part.match(/^\{(p\d+)\}$/);
      if (!match) return part;

      const values = cluster.paramPositions[match[1]] || [];
      const baseName = inferParamName(values, paramIdx);
      paramIdx++;

      // Two positions can infer the same name (e.g. two slugs). Suffix the
      // later ones so placeholders stay unique and params[...] entries are
      // not overwritten.
      let name = baseName;
      for (let n = 2; usedNames.has(name); n++) {
        name = `${baseName}${n}`;
      }
      usedNames.add(name);

      params[name] = values.slice(0, 50);
      return `{${name}}`;
    });

    const pattern = '/' + renamedParts.join('/');
    // Default (string) sort order.
    const sortedLastmods = [...cluster.lastmods].sort();

    groups.push({
      pattern,
      patternKey,
      params,
      urls: cluster.urls,
      urlCount: cluster.urls.length,
      depth: cluster.patternParts.length,
      firstSeen: sortedLastmods[0] || null,
      lastSeen: sortedLastmods[sortedLastmods.length - 1] || null,
    });
  }

  // Sort by URL count descending
  groups.sort((a, b) => b.urlCount - a.urlCount);

  const totalGrouped = groups.reduce((sum, g) => sum + g.urlCount, 0);
  const totalUrls = sitemapEntries.length;

  return {
    groups,
    ungrouped,
    stats: {
      totalUrls,
      totalGroups: groups.length,
      totalGrouped,
      largestGroup: groups[0]?.urlCount || 0,
      coverage: totalUrls > 0 ? totalGrouped / totalUrls : 0,
    },
  };
}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GSC Overlay — Phase 3
|
|
3
|
+
*
|
|
4
|
+
* Cross-references template groups against Google Search Console per-URL data.
|
|
5
|
+
* Pure computation — no I/O.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Normalize a URL into a key for GSC matching.
 * GSC reports URLs inconsistently — trailing slashes, www, http vs https —
 * so the key ignores all three: it is the hostname (without a leading
 * "www.") plus the pathname, lowercased, with trailing slashes removed.
 * Query string, fragment and port are not part of the key.
 *
 * Input that cannot be parsed as an absolute URL is lowercased and has its
 * trailing slashes stripped instead; non-string input is stringified
 * (null/undefined become the empty string) rather than throwing.
 *
 * @param {string} url
 * @returns {string} normalized lookup key
 */
function normalizeUrl(url) {
  const stripTrailingSlashes = (s) => s.replace(/\/+$/, '');
  try {
    const { hostname, pathname } = new URL(url);
    const host = hostname.replace(/^www\./i, '');
    return stripTrailingSlashes(host + pathname).toLowerCase();
  } catch {
    // Not an absolute URL — fall back to plain string normalization.
    return stripTrailingSlashes(String(url ?? '').toLowerCase());
  }
}
|
|
20
|
+
|
|
21
|
+
/**
 * Annotate template groups with Google Search Console metrics.
 *
 * Every group is returned as a shallow copy extended with:
 * gscUrlsWithImpressions, gscTotalClicks, gscTotalImpressions,
 * gscAvgPosition (rounded to one decimal), indexationEfficiency
 * (URLs with impressions / urlCount) and topGscUrls (up to 10 entries,
 * highest impressions first). Without GSC data the numeric fields are
 * null and topGscUrls is empty.
 *
 * @param {TemplateGroup[]} groups — from cluster.js
 * @param {Array<{url: string, clicks: number, impressions: number, ctr: number, position: number}>|null} gscPages
 * @returns {GscOverlayResult[]}
 */
export function overlayGsc(groups, gscPages) {
  // Nothing to overlay: hand back the groups with placeholder GSC fields.
  if (!gscPages || gscPages.length === 0) {
    return groups.map((group) => {
      return {
        ...group,
        gscUrlsWithImpressions: null,
        gscTotalClicks: null,
        gscTotalImpressions: null,
        gscAvgPosition: null,
        indexationEfficiency: null,
        topGscUrls: [],
      };
    });
  }

  // Index GSC rows by normalized URL; on duplicates the row with strictly
  // more impressions wins, otherwise the first one seen is kept.
  const byNormalizedUrl = new Map();
  for (const page of gscPages) {
    const lookupKey = normalizeUrl(page.url);
    const current = byNormalizedUrl.get(lookupKey);
    if (current && !(page.impressions > current.impressions)) continue;
    byNormalizedUrl.set(lookupKey, page);
  }

  return groups.map((group) => {
    // GSC rows for this group's URLs that actually received impressions,
    // in the same order as group.urls.
    const hits = [];
    for (const groupUrl of group.urls) {
      const row = byNormalizedUrl.get(normalizeUrl(groupUrl));
      if (row && row.impressions > 0) hits.push(row);
    }

    let clicks = 0;
    let impressions = 0;
    let positionTotal = 0;
    let positionSamples = 0;
    for (const row of hits) {
      clicks += row.clicks || 0;
      impressions += row.impressions || 0;
      // Only rows with a positive position contribute to the average.
      if (row.position > 0) {
        positionTotal += row.position;
        positionSamples++;
      }
    }

    // Rank by impressions, highest first; only the top 10 are reported.
    const ranked = hits.map((row) => ({
      url: row.url,
      clicks: row.clicks,
      impressions: row.impressions,
      position: row.position,
    }));
    ranked.sort((a, b) => b.impressions - a.impressions);

    let avgPosition = null;
    if (positionSamples > 0) {
      avgPosition = Math.round((positionTotal / positionSamples) * 10) / 10;
    }

    let efficiency = 0;
    if (group.urlCount > 0) {
      efficiency = hits.length / group.urlCount;
    }

    return {
      ...group,
      gscUrlsWithImpressions: hits.length,
      gscTotalClicks: clicks,
      gscTotalImpressions: impressions,
      gscAvgPosition: avgPosition,
      indexationEfficiency: efficiency,
      topGscUrls: ranked.slice(0, 10),
    };
  });
}
|