seo-intel 1.2.5 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,209 @@
1
+ /**
2
+ * URL Pattern Clustering — Phase 1
3
+ *
4
+ * Takes sitemap URLs, detects parametric patterns, groups them.
5
+ * Pure function — no I/O, no side effects.
6
+ */
7
+
8
/**
 * Path segments that are always treated as structural (constant) parts of a
 * URL, compared case-insensitively. Built once at module load instead of on
 * every isVariable() call, which runs for every segment of every URL.
 */
const STRUCTURAL_SEGMENTS = new Set([
  'api', 'docs', 'blog', 'news', 'about', 'pricing', 'features',
  'help', 'support', 'contact', 'legal', 'terms', 'privacy',
  'login', 'signup', 'register', 'dashboard', 'settings',
  'token', 'tokens', 'swap', 'trade', 'perps', 'perpetuals',
  'pool', 'pools', 'stake', 'staking', 'bridge', 'earn',
  'governance', 'vote', 'proposals', 'stats', 'analytics',
  'markets', 'pairs', 'explorer', 'episodes', 'categories',
  'tags', 'products', 'collections', 'pages', 'posts',
]);

/**
 * Is this path segment a "variable" (one of N possible values)
 * vs a "constant" (structural path like 'swap', 'docs', 'blog')?
 *
 * The checks are heuristics and are order-sensitive: the "constant" rules run
 * first so that e.g. `v2` or `API` is never reclassified by a later rule.
 *
 * @param {string} segment — a single, non-empty path segment (no slashes)
 * @returns {boolean} true when the segment looks like a parameter value
 */
function isVariable(segment) {
  // Version prefixes stay constant: v1, v2, v3... (lowercase `v` only)
  if (/^v\d+$/.test(segment)) return false;

  // Common structural words stay constant, regardless of case
  if (STRUCTURAL_SEGMENTS.has(segment.toLowerCase())) return false;

  // Purely numeric → variable (IDs, dates)
  if (/^\d+$/.test(segment)) return true;

  // UUID or hash-like: 8+ characters drawn from hex digits and hyphens
  if (/^[0-9a-f-]{8,}$/i.test(segment)) return true;

  // Hex address (0x...)
  if (/^0x[0-9a-fA-F]+$/.test(segment)) return true;

  // Contains separator characters typical of slugs/pairs: SOL-USDC, my-blog-post
  if (segment.length > 2 && /[-_.]/.test(segment)) return true;

  // All uppercase short string → likely a ticker: SOL, BONK, USDT, ETH
  if (/^[A-Z0-9]{2,10}$/.test(segment)) return true;

  // Uppercase letter together with a digit → product codes, IDs
  if (/[A-Z]/.test(segment) && /\d/.test(segment)) return true;

  // Very long segments are likely slugs or IDs; anything else is structural
  return segment.length > 30;
}
52
+
53
/**
 * Ordered [name, matcher] rules used to label a parameter position.
 * The first rule matched by more than 60% of the sampled values wins.
 */
const PARAM_NAME_RULES = [
  // Pairs: exactly two alphanumeric parts joined by one hyphen (SOL-USDC).
  // Case and length are not restricted, so two-word slugs also land here.
  ['pair', /^[A-Za-z0-9]+-[A-Za-z0-9]+$/],
  // Token tickers: 2-10 uppercase letters/digits with at least one letter.
  // Without the letter requirement, plain numeric IDs such as "12345" would
  // match here first and the 'id' rule below would almost never be reached.
  ['symbol', /^(?=[A-Z0-9]*[A-Z])[A-Z0-9]{2,10}$/],
  // Slugs: lowercase words/digits joined by hyphens
  ['slug', /^[a-z0-9]+(-[a-z0-9]+)+$/],
  // Numeric IDs
  ['id', /^\d+$/],
  // Hex hashes/addresses, with or without a 0x prefix
  ['hash', /^(0x)?[0-9a-f]{8,}$/i],
];

/**
 * Infer a semantic name for a param position based on observed values.
 *
 * Only the first 100 values are inspected. When no rule reaches the 60%
 * threshold (including when `values` is empty) a positional fallback name
 * is returned.
 *
 * @param {string[]} values — values observed at this parameter position
 * @param {number} position — zero-based parameter index, used for the fallback
 * @returns {string} 'pair' | 'symbol' | 'slug' | 'id' | 'hash' | `param${position}`
 */
function inferParamName(values, position) {
  const sample = values.slice(0, 100);
  const threshold = sample.length * 0.6;

  for (const [name, matcher] of PARAM_NAME_RULES) {
    const hits = sample.filter((value) => matcher.test(value)).length;
    if (hits > threshold) return name;
  }

  return `param${position}`;
}
81
+
82
/**
 * Cluster sitemap URLs into template groups.
 *
 * Each URL path is reduced to a pattern in which segments classified as
 * variable by isVariable() are replaced with placeholders. URLs sharing a
 * pattern form a cluster; clusters with at least `minGroupSize` URLs become
 * template groups and the rest are reported as ungrouped. Entries whose URL
 * cannot be parsed, and homepage URLs (path `/`), are left out of both lists
 * but still count towards `stats.totalUrls`.
 *
 * @param {Array<{url: string, lastmod?: string}>} sitemapEntries
 * @param {object} opts
 * @param {number} opts.minGroupSize — min URLs to qualify as template (default 10)
 * @param {number} opts.maxSegments — max path depth to consider (default 8)
 * @returns {{ groups: TemplateGroup[], ungrouped: string[], stats: object }}
 *   `groups` is sorted by URL count, largest first. Each group carries
 *   `pattern` (semantic placeholder names), `patternKey` (positional
 *   placeholders `{p0}`, `{p1}`, …), `params` (up to 50 sample values per
 *   placeholder name), `urls`, `urlCount`, `depth`, `firstSeen`, `lastSeen`.
 */
export function clusterUrls(sitemapEntries, opts = {}) {
  // `||` is deliberate: a falsy setting (0, undefined) falls back to the default.
  const minGroupSize = opts.minGroupSize || 10;
  const maxSegments = opts.maxSegments || 8;

  // patternKey → { patternKey, patternParts, urls[], paramPositions, lastmods[] }
  const clusters = new Map();

  for (const entry of sitemapEntries) {
    let pathname;
    try {
      pathname = new URL(entry.url).pathname;
    } catch {
      // Best-effort: an unparseable or missing URL is simply not clustered.
      continue;
    }

    // Normalize trailing slashes; the homepage is always unique, so skip it.
    pathname = pathname.replace(/\/+$/, '') || '/';
    if (pathname === '/') continue;

    const segments = pathname.split('/').filter(Boolean).slice(0, maxSegments);
    const patternParts = [];
    const observed = []; // [placeholderKey, segmentValue] for this URL

    for (const segment of segments) {
      if (isVariable(segment)) {
        const key = `p${observed.length}`;
        patternParts.push(`{${key}}`);
        observed.push([key, segment]);
      } else {
        patternParts.push(segment.toLowerCase());
      }
    }

    const patternKey = '/' + patternParts.join('/');

    let cluster = clusters.get(patternKey);
    if (!cluster) {
      cluster = {
        patternKey,
        patternParts,
        urls: [],
        paramPositions: {},
        lastmods: [],
      };
      clusters.set(patternKey, cluster);
    }

    cluster.urls.push(entry.url);
    if (entry.lastmod) cluster.lastmods.push(entry.lastmod);

    // Collect param values (cap at 200 per position for memory)
    for (const [key, value] of observed) {
      if (!cluster.paramPositions[key]) cluster.paramPositions[key] = [];
      if (cluster.paramPositions[key].length < 200) {
        cluster.paramPositions[key].push(value);
      }
    }
  }

  // Separate into template groups (>= minGroupSize) and ungrouped
  const groups = [];
  const ungrouped = [];

  for (const [patternKey, cluster] of clusters) {
    if (cluster.urls.length < minGroupSize) {
      for (const url of cluster.urls) ungrouped.push(url);
      continue;
    }

    // Rename positional placeholders to semantic names
    const params = {};
    const usedNames = new Set();
    const renamedParts = [...cluster.patternParts];
    let paramIdx = 0;

    for (let i = 0; i < renamedParts.length; i++) {
      const match = renamedParts[i].match(/^\{(p\d+)\}$/);
      if (!match) continue;

      const values = cluster.paramPositions[match[1]] || [];
      const baseName = inferParamName(values, paramIdx);

      // Two positions can infer the same name (e.g. /{id}/{id}/{slug}).
      // Suffix later ones so `params` keeps every position's samples and
      // the pattern has no duplicate placeholders.
      let name = baseName;
      for (let suffix = 2; usedNames.has(name); suffix++) {
        name = `${baseName}${suffix}`;
      }
      usedNames.add(name);

      renamedParts[i] = `{${name}}`;
      params[name] = values.slice(0, 50);
      paramIdx++;
    }

    // Plain string sort — presumably lastmod values are ISO-8601 style
    // strings, for which this is chronological; verify against the caller.
    const sortedLastmods = cluster.lastmods.sort();

    groups.push({
      pattern: '/' + renamedParts.join('/'),
      patternKey,
      params,
      urls: cluster.urls,
      urlCount: cluster.urls.length,
      depth: cluster.patternParts.length,
      firstSeen: sortedLastmods[0] || null,
      lastSeen: sortedLastmods[sortedLastmods.length - 1] || null,
    });
  }

  // Sort by URL count descending
  groups.sort((a, b) => b.urlCount - a.urlCount);

  const totalGrouped = groups.reduce((sum, g) => sum + g.urlCount, 0);
  const totalUrls = sitemapEntries.length;

  return {
    groups,
    ungrouped,
    stats: {
      totalUrls,
      totalGroups: groups.length,
      totalGrouped,
      largestGroup: groups[0]?.urlCount || 0,
      // Share of all input entries (including skipped ones) that landed in a group
      coverage: totalUrls > 0 ? totalGrouped / totalUrls : 0,
    },
  };
}
@@ -0,0 +1,93 @@
1
+ /**
2
+ * GSC Overlay — Phase 3
3
+ *
4
+ * Cross-references template groups against Google Search Console per-URL data.
5
+ * Pure computation — no I/O.
6
+ */
7
+
8
/**
 * Normalize a URL into a lookup key for GSC matching.
 * GSC reports URLs inconsistently — trailing slashes, www, http vs https —
 * so the key ignores all three: it is the lowercased hostname (without a
 * leading `www.`) plus the pathname, with trailing slashes removed. Query
 * string, fragment and port are not part of the key.
 *
 * Input that cannot be parsed as an absolute URL is lowercased and stripped
 * of a leading `www.` and trailing slashes instead; null/undefined yield ''.
 *
 * @param {string} url
 * @returns {string} normalized key, e.g. `example.com/blog/post`
 */
function normalizeUrl(url) {
  // Coerce first so a missing or non-string url cannot throw in the fallback.
  const raw = String(url ?? '');
  try {
    const { hostname, pathname } = new URL(raw);
    return `${hostname.replace(/^www\./i, '')}${pathname}`
      .replace(/\/+$/, '')
      .toLowerCase();
  } catch {
    // Not an absolute URL — best-effort textual normalization.
    return raw.toLowerCase().replace(/^www\./, '').replace(/\/+$/, '');
  }
}
20
+
21
/**
 * Cross-reference template groups with GSC pages data.
 *
 * Every group is returned as a shallow copy extended with GSC aggregates.
 * URLs are matched through normalizeUrl(); only GSC rows with more than zero
 * impressions count as a match. When `gscPages` is null or empty, the GSC
 * fields are null (and `topGscUrls` is an empty array) to signal "no data".
 *
 * @param {TemplateGroup[]} groups — from cluster.js
 * @param {Array<{url: string, clicks: number, impressions: number, ctr: number, position: number}>|null} gscPages
 * @returns {GscOverlayResult[]} one result per group, in the same order, with
 *   `gscUrlsWithImpressions`, `gscTotalClicks`, `gscTotalImpressions`,
 *   `gscAvgPosition` (unweighted mean, one decimal, or null),
 *   `indexationEfficiency` (matched URLs / group.urlCount) and
 *   `topGscUrls` (up to 10 matched rows, most impressions first).
 */
export function overlayGsc(groups, gscPages) {
  if (!gscPages || gscPages.length === 0) {
    // No GSC data — return groups with null GSC fields
    return groups.map((group) => ({
      ...group,
      gscUrlsWithImpressions: null,
      gscTotalClicks: null,
      gscTotalImpressions: null,
      gscAvgPosition: null,
      indexationEfficiency: null,
      topGscUrls: [],
    }));
  }

  // Build normalized URL → GSC entry lookup
  const gscByKey = new Map();
  for (const entry of gscPages) {
    // A row without a URL cannot be matched; skip it rather than abort.
    if (entry?.url == null) continue;

    const key = normalizeUrl(entry.url);
    // Keep the one with more impressions if dupes
    const existing = gscByKey.get(key);
    if (!existing || entry.impressions > existing.impressions) {
      gscByKey.set(key, entry);
    }
  }

  return groups.map((group) => {
    let urlsWithImpressions = 0;
    let totalClicks = 0;
    let totalImpressions = 0;
    let positionSum = 0;
    let positionCount = 0;
    const topUrls = [];

    // Sitemap variants of one page (e.g. `/a` and `/a/`) normalize to the
    // same key; count each key once so its GSC row is not added twice.
    const seenKeys = new Set();

    for (const url of group.urls) {
      const key = normalizeUrl(url);
      if (seenKeys.has(key)) continue;
      seenKeys.add(key);

      const gscEntry = gscByKey.get(key);
      if (!gscEntry || !(gscEntry.impressions > 0)) continue;

      urlsWithImpressions++;
      totalClicks += gscEntry.clicks || 0;
      totalImpressions += gscEntry.impressions || 0;
      // Only positive positions contribute to the average.
      if (gscEntry.position > 0) {
        positionSum += gscEntry.position;
        positionCount++;
      }
      topUrls.push({
        url: gscEntry.url,
        clicks: gscEntry.clicks,
        impressions: gscEntry.impressions,
        position: gscEntry.position,
      });
    }

    // Sort top URLs by impressions desc, take top 10
    topUrls.sort((a, b) => b.impressions - a.impressions);

    return {
      ...group,
      gscUrlsWithImpressions: urlsWithImpressions,
      gscTotalClicks: totalClicks,
      gscTotalImpressions: totalImpressions,
      gscAvgPosition: positionCount > 0 ? Math.round((positionSum / positionCount) * 10) / 10 : null,
      indexationEfficiency: group.urlCount > 0 ? urlsWithImpressions / group.urlCount : 0,
      topGscUrls: topUrls.slice(0, 10),
    };
  });
}