seo-intel 1.2.4 → 1.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +21 -0
- package/analyses/aeo/index.js +252 -0
- package/analyses/aeo/scorer.js +254 -0
- package/analyses/templates/cluster.js +209 -0
- package/analyses/templates/gsc-overlay.js +93 -0
- package/analyses/templates/index.js +425 -0
- package/analyses/templates/sampler.js +198 -0
- package/analyses/templates/scorer.js +149 -0
- package/analyses/templates/similarity.js +174 -0
- package/analysis/prompt-builder.js +272 -0
- package/analysis/topic-cluster-mapper.js +427 -0
- package/cli.js +0 -1
- package/extractor/qwen.js +558 -0
- package/package.json +4 -1
- package/setup/wizard.html +3 -3
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Template Analysis Orchestrator
|
|
3
|
+
*
|
|
4
|
+
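* Runs five core phases (a sixth, the competitor sitemap census, follows when competitors are configured and not skipped):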
|
|
5
|
+
* 1. URL Pattern Clustering (sitemap parse → cluster)
|
|
6
|
+
* 2. Smart Sampling (stealth crawl ~20 pages/group)
|
|
7
|
+
* 3. GSC Overlay (cross-reference with Search Console data)
|
|
8
|
+
* 4. Scoring & Recommendations
|
|
9
|
+
* 5. Template Profile Extrapolation (infer extraction fields for all URLs from samples)
|
|
10
|
+
*
|
|
11
|
+
* Then writes results to DB and returns the report.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { fetchSitemap } from '../../crawler/sitemap.js';
|
|
15
|
+
import { loadGscData } from '../../reports/gsc-loader.js';
|
|
16
|
+
import { loadAllConfigs } from '../../scheduler.js';
|
|
17
|
+
import { getDb, upsertTemplateGroup, getTemplateGroupId, upsertTemplateSample } from '../../db/db.js';
|
|
18
|
+
import { clusterUrls } from './cluster.js';
|
|
19
|
+
import { selectSample, crawlSample } from './sampler.js';
|
|
20
|
+
import { averageSimilarity, averageFingerprintSimilarity } from './similarity.js';
|
|
21
|
+
import { overlayGsc } from './gsc-overlay.js';
|
|
22
|
+
import { scoreGroup } from './scorer.js';
|
|
23
|
+
|
|
24
|
+
/**
 * Run full template analysis for a project.
 *
 * Phases, in order:
 *   1. URL pattern clustering of the target's sitemap
 *   2. Sample crawl of each template group (unless `opts.skipCrawl`)
 *   3. GSC overlay (unless `opts.skipGsc`)
 *   4. Scoring & recommendations
 *   5. Template profile extrapolation from the crawled samples
 *   6. Competitor sitemap census (unless `opts.skipCompetitors` or no competitors configured)
 * Results for the target's groups and samples are then written to the DB.
 *
 * All progress output, including per-page crawl progress, goes through `opts.log`.
 *
 * @param {string} project — project name, matched against each config's `project` field
 * @param {object} [opts]
 * @param {number} [opts.minGroupSize] — min URLs per template (default 10)
 * @param {number} [opts.sampleSize] — pages to crawl per group (default 20)
 * @param {boolean} [opts.skipCrawl] — skip Phase 2 (pattern + GSC only)
 * @param {boolean} [opts.skipGsc] — skip Phase 3
 * @param {boolean} [opts.skipCompetitors] — skip Phase 6
 * @param {Function} [opts.log] — (message) => void (default console.log)
 * @returns {Promise<TemplatesReport>}
 * @throws {Error} when no config is found for `project`
 */
export async function runTemplatesAnalysis(project, opts = {}) {
  const log = opts.log || console.log;
  const minGroupSize = opts.minGroupSize || 10;
  const sampleSize = opts.sampleSize || 20;

  // ── Load project config ──
  const configs = loadAllConfigs();
  const config = configs.find(c => c.project === project);
  if (!config) throw new Error(`Project "${project}" not found. Run: seo-intel setup`);

  const targetDomain = config.target.domain;
  const targetUrl = config.target.url || `https://${targetDomain}`;

  log(`\n Target: ${targetDomain}`);

  // ═══ PHASE 1: URL Pattern Clustering ═══
  log(`\n Phase 1: URL Pattern Clustering`);
  log(` Fetching sitemap...`);

  const sitemapEntries = await fetchSitemap(targetUrl);

  if (!sitemapEntries.length) {
    log(` ⚠️ No sitemap URLs found for ${targetDomain}`);
    log(` Ensure sitemap.xml is accessible at ${targetUrl}/sitemap.xml`);
    return { groups: [], stats: { totalUrls: 0, totalGroups: 0, coverage: 0 }, project, domain: targetDomain };
  }

  log(` Found ${sitemapEntries.length.toLocaleString()} URLs in sitemap`);

  const { groups, ungrouped, stats } = clusterUrls(sitemapEntries, { minGroupSize });

  log(` ${stats.totalGroups} template groups found`);
  log(` Coverage: ${stats.totalGrouped.toLocaleString()} URLs (${(stats.coverage * 100).toFixed(1)}% of sitemap)`);
  log('');

  if (groups.length === 0) {
    log(` No template patterns detected (all pages are unique).`);
    return { groups: [], ungrouped, stats, project, domain: targetDomain };
  }

  // Show discovered patterns
  const maxPatternLen = Math.max(...groups.map(g => g.pattern.length), 7);
  log(` ${'Pattern'.padEnd(maxPatternLen)} ${'URLs'.padStart(8)} Verdict`);
  log(` ${'─'.repeat(maxPatternLen)} ${'─'.repeat(8)} ─────────`);
  for (const g of groups) {
    log(` ${g.pattern.padEnd(maxPatternLen)} ${g.urlCount.toLocaleString().padStart(8)} [pending]`);
  }
  log('');

  // ═══ PHASE 2: Smart Sample Crawl ═══
  if (!opts.skipCrawl) {
    log(` Phase 2: Smart Sample Crawl (stealth)`);

    for (const group of groups) {
      const sample = selectSample(group.urls, sampleSize);
      log(` Sampling ${group.pattern}... ${sample.length} pages`);

      try {
        const results = await crawlSample(sample, {
          hostname: targetDomain,
          onPage: (result, idx, total) => {
            const status = result.statusCode >= 400 ? '✗' : result.statusCode > 0 ? '✓' : '?';
            // Routed through `log` (not process.stdout) so a caller-supplied logger sees progress too.
            log(` [${idx + 1}/${total}] ${status} ${result.url.replace(/https?:\/\/[^/]+/, '').slice(0, 50)}`);
          },
        });

        // A sample counts as successful when it got a non-error HTTP status.
        const successful = results.filter(r => r.statusCode > 0 && r.statusCode < 400);

        group.samples = results;
        group.sampleSize = successful.length;

        // Similarity stats need at least two pages to compare.
        if (successful.length >= 2) {
          const bodyTexts = successful.map(r => r.bodyText).filter(Boolean);
          const fingerprints = successful.map(r => r.domFingerprintStr).filter(Boolean);

          group.avgWordCount = successful.reduce((s, r) => s + (r.wordCount || 0), 0) / successful.length;
          group.contentSimilarity = averageSimilarity(bodyTexts);
          group.domSimilarity = averageFingerprintSimilarity(fingerprints);
          group.canonicalRate = successful.filter(r => r.hasCanonical).length / successful.length;
        } else {
          group.avgWordCount = successful[0]?.wordCount || 0;
          group.contentSimilarity = null;
          group.domSimilarity = null;
          group.canonicalRate = null;
        }

        log(` ✓ ${group.sampleSize} successful, similarity: ${group.contentSimilarity != null ? (group.contentSimilarity * 100).toFixed(0) + '%' : 'N/A'}`);
      } catch (err) {
        // Best effort: one group's crawl failure must not abort the whole analysis.
        log(` ✗ Sample crawl failed: ${err.message}`);
        group.samples = [];
        group.sampleSize = 0;
      }
    }
    log('');
  } else {
    log(` Phase 2: Skipped (--skip-crawl)`);
    for (const g of groups) {
      g.samples = [];
      g.sampleSize = 0;
    }
    log('');
  }

  // ═══ PHASE 3: GSC Overlay ═══
  if (!opts.skipGsc) {
    log(` Phase 3: GSC Overlay`);
    const gscData = loadGscData(project);
    if (gscData?.pages?.length) {
      log(` Loaded GSC data: ${gscData.pages.length.toLocaleString()} pages with data`);
      const overlayed = overlayGsc(groups, gscData.pages);
      // Merge GSC fields back into groups (overlayGsc results are index-aligned with `groups`)
      for (let i = 0; i < groups.length; i++) {
        Object.assign(groups[i], {
          gscUrlsWithImpressions: overlayed[i].gscUrlsWithImpressions,
          gscTotalClicks: overlayed[i].gscTotalClicks,
          gscTotalImpressions: overlayed[i].gscTotalImpressions,
          gscAvgPosition: overlayed[i].gscAvgPosition,
          indexationEfficiency: overlayed[i].indexationEfficiency,
          topGscUrls: overlayed[i].topGscUrls,
        });
      }
      log(` Matched template URLs against GSC data`);
    } else {
      log(` No GSC data found for ${project}`);
      for (const g of groups) {
        g.gscUrlsWithImpressions = null;
        g.gscTotalClicks = null;
        g.gscTotalImpressions = null;
        g.gscAvgPosition = null;
        g.indexationEfficiency = null;
        g.topGscUrls = [];
      }
    }
    log('');
  } else {
    log(` Phase 3: Skipped (--skip-gsc)\n`);
  }

  // ═══ PHASE 4: Scoring & Recommendations ═══
  log(` Phase 4: Scoring & Recommendations`);

  const verdictColor = { 'high-value': '🟢', mixed: '🟡', thin: '🟠', invisible: '🔴' };

  for (const group of groups) {
    const result = scoreGroup(group);
    group.score = result.score;
    group.verdict = result.verdict;
    group.recommendation = result.recommendations;

    log(` ${(verdictColor[group.verdict] || '⚪')} ${group.pattern.padEnd(maxPatternLen)} → ${group.verdict} (score: ${group.score})`);
  }
  log('');

  // ═══ PHASE 5: Template Profile Extrapolation ═══
  // For each group with samples, build an "inferred profile" — the common fields
  // extrapolated from the sampled pages to every URL in the group.
  log(` Phase 5: Template Profile Extrapolation`);

  for (const group of groups) {
    group.profile = buildTemplateProfile(group);
    if (group.profile) {
      const p = group.profile;
      log(` ${group.pattern}: ${group.urlCount.toLocaleString()} pages inferred`);
      log(` schema: ${p.schemaPresence}% · canonical: ${p.canonicalPresence}% · indexable: ${p.indexablePresence}% · avg words: ${Math.round(p.avgWordCount)}`);
    }
  }
  log('');

  // ═══ PHASE 6: Competitor Sitemap Census ═══
  // Fetch competitor sitemaps and cluster them — no page crawling, just URL counting.
  const competitorCensus = [];
  const competitors = config.competitors || [];

  if (competitors.length > 0 && !opts.skipCompetitors) {
    log(` Phase 6: Competitor Sitemap Census`);

    for (const comp of competitors) {
      const compUrl = comp.url || `https://${comp.domain}`;
      log(` Scanning ${comp.domain}...`);

      try {
        const compEntries = await fetchSitemap(compUrl);
        if (compEntries.length === 0) {
          log(` No sitemap found`);
          competitorCensus.push({ domain: comp.domain, totalUrls: 0, groups: [] });
          continue;
        }

        const compResult = clusterUrls(compEntries, { minGroupSize });
        competitorCensus.push({
          domain: comp.domain,
          totalUrls: compResult.stats.totalUrls,
          groups: compResult.groups.map(g => ({
            pattern: g.pattern,
            urlCount: g.urlCount,
          })),
          stats: compResult.stats,
        });

        log(` ${compEntries.length.toLocaleString()} URLs → ${compResult.stats.totalGroups} templates`);
        for (const g of compResult.groups.slice(0, 5)) {
          log(` ${g.pattern.padEnd(30)} ${g.urlCount.toLocaleString().padStart(8)} URLs`);
        }
        if (compResult.groups.length > 5) {
          log(` ... and ${compResult.groups.length - 5} more`);
        }
      } catch (err) {
        // A failing competitor is recorded with its error; the census continues.
        log(` ✗ Failed: ${err.message}`);
        competitorCensus.push({ domain: comp.domain, totalUrls: 0, groups: [], error: err.message });
      }
    }
    log('');
  }

  // ═══ Write to DB ═══
  const db = getDb();
  const analyzedAt = Date.now();

  for (const group of groups) {
    // Metrics are never assigned when Phase 2/3 was skipped or a crawl failed.
    // Normalise those to null here so the DB layer gets an explicit "no value"
    // instead of `undefined` (the "no GSC data" branch above already uses null).
    upsertTemplateGroup(db, {
      project,
      domain: targetDomain,
      pattern: group.pattern,
      urlCount: group.urlCount,
      sampleSize: group.sampleSize || 0,
      avgWordCount: group.avgWordCount ?? null,
      contentSimilarity: group.contentSimilarity ?? null,
      domSimilarity: group.domSimilarity ?? null,
      gscUrlsWithImpressions: group.gscUrlsWithImpressions ?? null,
      gscTotalClicks: group.gscTotalClicks ?? null,
      gscTotalImpressions: group.gscTotalImpressions ?? null,
      gscAvgPosition: group.gscAvgPosition ?? null,
      indexationEfficiency: group.indexationEfficiency ?? null,
      score: group.score,
      verdict: group.verdict,
      recommendation: group.recommendation,
      analyzedAt,
    });

    // Save samples (needs the group's row id, looked up after the upsert)
    if (group.samples?.length) {
      const groupId = getTemplateGroupId(db, project, targetDomain, group.pattern);
      if (groupId) {
        for (const s of group.samples) {
          upsertTemplateSample(db, {
            groupId,
            url: s.url,
            sampleRole: s.sampleRole,
            statusCode: s.statusCode,
            wordCount: s.wordCount,
            title: s.title,
            metaDesc: s.metaDesc,
            hasCanonical: s.hasCanonical,
            hasSchema: s.hasSchema,
            isIndexable: s.isIndexable,
            domFingerprint: s.domFingerprintStr,
            contentHash: s.contentHash,
            bodyText: s.bodyText,
            crawledAt: s.crawledAt,
          });
        }
      }
    }
  }

  log(` Results saved to database.`);

  return {
    project,
    domain: targetDomain,
    groups,
    ungrouped,
    stats,
    competitorCensus,
    analyzedAt,
  };
}
|
|
313
|
+
|
|
314
|
+
/**
 * Derive an inferred profile for a template group from its crawled samples.
 *
 * Only samples with a status code in (0, 400) are considered. Presence rates
 * (schema / canonical / indexable) and the average word count measured on
 * those samples are scaled by `group.urlCount` to estimate totals for the
 * whole group, so the remaining pages do not have to be crawled.
 *
 * @param {object} group — template group with `.samples[]` and `.urlCount`
 * @returns {object|null} — inferred profile, or null when fewer than two usable samples exist
 */
function buildTemplateProfile(group) {
  const usable = [];
  for (const sample of group.samples || []) {
    if (sample.statusCode > 0 && sample.statusCode < 400) usable.push(sample);
  }
  if (usable.length < 2) return null;

  const sampleCount = usable.length;
  const toPercent = (hits) => Math.round((hits / sampleCount) * 100);
  const scaleToGroup = (percent) => Math.round(group.urlCount * (percent / 100));

  // Single pass over the usable samples collecting every statistic.
  let schemaHits = 0;
  let canonicalHits = 0;
  let indexableHits = 0;
  let wordTotal = 0;
  let minWordCount = Infinity;
  let maxWordCount = -Infinity;
  const titles = [];
  const metaDescs = [];
  const hashes = new Set();

  for (const sample of usable) {
    if (sample.hasSchema) schemaHits += 1;
    if (sample.hasCanonical) canonicalHits += 1;
    if (sample.isIndexable) indexableHits += 1;

    const words = sample.wordCount || 0;
    wordTotal += words;
    minWordCount = Math.min(minWordCount, words);
    maxWordCount = Math.max(maxWordCount, words);

    if (sample.title) titles.push(sample.title);
    if (sample.metaDesc) metaDescs.push(sample.metaDesc);
    if (sample.contentHash) hashes.add(sample.contentHash);
  }

  // ── Rates (%) ──
  const schemaPresence = toPercent(schemaHits);
  const canonicalPresence = toPercent(canonicalHits);
  const indexablePresence = toPercent(indexableHits);

  // ── Content ──
  const avgWordCount = wordTotal / sampleCount;

  return {
    sampleCount,
    totalInferred: group.urlCount,

    // Rates (%)
    schemaPresence,
    canonicalPresence,
    indexablePresence,

    // Content
    avgWordCount,
    minWordCount,
    maxWordCount,
    contentDiversity: hashes.size / sampleCount, // 1.0 = every sample unique, lower = duplicates
    estimatedTotalWords: Math.round(group.urlCount * avgWordCount),

    // Patterns (shared prefix/suffix across sampled titles and meta descriptions)
    titlePattern: detectPattern(titles),
    metaPattern: detectPattern(metaDescs),

    // Extrapolated totals
    estimatedWithSchema: scaleToGroup(schemaPresence),
    estimatedWithCanonical: scaleToGroup(canonicalPresence),
    estimatedIndexable: scaleToGroup(indexablePresence),
  };
}
|
|
386
|
+
|
|
387
|
+
/**
 * Detect the common template pattern in a set of strings.
 * Returns the shared prefix + "{…}" + shared suffix.
 *
 * e.g. ["Swap SOL to USDC | Jupiter", "Swap BONK to USDT | Jupiter"]
 * → "Swap {…} | Jupiter"
 *
 * Comparison is done per Unicode code point, so a shared prefix/suffix never
 * ends in half of a surrogate pair. Prefix and suffix are bounded by the
 * shortest string, which makes the result independent of input order.
 *
 * @param {string[]} strings — candidate strings (callers pass non-empty strings)
 * @returns {string|null} — the single/identical string as-is, a "prefix {…} suffix"
 *   pattern, or null when the input is empty or nothing is shared
 */
function detectPattern(strings) {
  if (strings.length < 2) return strings[0] || null;

  // Truly identical inputs: the "pattern" is the string itself.
  const first = strings[0];
  if (strings.every((s) => s === first)) return first;

  // Work on code points rather than UTF-16 units.
  const chars = strings.map((s) => Array.from(s));
  const base = chars[0];
  const shortest = Math.min(...chars.map((c) => c.length));

  // Longest common prefix, never longer than the shortest string.
  let prefixLen = 0;
  while (prefixLen < shortest && chars.every((c) => c[prefixLen] === base[prefixLen])) {
    prefixLen += 1;
  }

  // Longest common suffix, limited so it cannot overlap the prefix in the shortest string.
  const maxSuffixLen = shortest - prefixLen;
  let suffixLen = 0;
  while (
    suffixLen < maxSuffixLen &&
    chars.every((c) => c[c.length - 1 - suffixLen] === base[base.length - 1 - suffixLen])
  ) {
    suffixLen += 1;
  }

  if (prefixLen === 0 && suffixLen === 0) return null; // no common pattern

  const prefix = base.slice(0, prefixLen).join('');
  const suffix = base.slice(base.length - suffixLen).join('');

  return `${prefix.trim()} {…} ${suffix.trim()}`.trim();
}
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Smart Sample Selection & Stealth Crawl — Phase 2
|
|
3
|
+
*
|
|
4
|
+
* Selects a strategic sample from each template group,
|
|
5
|
+
* then stealth-crawls those pages for content analysis.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { createHash } from 'crypto';
|
|
9
|
+
import { domFingerprint } from './similarity.js';
|
|
10
|
+
|
|
11
|
+
/**
 * Select which URLs to crawl from a group.
 * No I/O; the only non-determinism is the random fill at the end.
 *
 * Strategy (URLs are ordered by pathname length, then alphabetically):
 * - high-value: shortest paths (likely most important) — ~30% of the sample
 * - middle: around the middle of the sorted list — ~25%
 * - long-tail: longest paths (most specific/obscure) — ~25%
 * - random: random picks from whatever is left, filling the remainder
 *
 * The per-role quotas are rounded up, so every role is additionally capped by
 * the overall budget: the result never holds more than `sampleSize` entries
 * and never contains the same URL twice.
 *
 * @param {string[]} urls — absolute URLs
 * @param {number} sampleSize — default 20
 * @returns {{ url: string, role: string }[]} — every URL with role 'all' when the
 *   group has at most `sampleSize` URLs, otherwise the stratified sample
 * @throws {TypeError} when sampling is needed and a URL cannot be parsed by `new URL()`
 */
export function selectSample(urls, sampleSize = 20) {
  if (urls.length <= sampleSize) {
    return urls.map((url) => ({ url, role: 'all' }));
  }

  // Parse every distinct URL once instead of on each comparator call.
  const pathByUrl = new Map();
  for (const url of urls) {
    if (!pathByUrl.has(url)) pathByUrl.set(url, new URL(url).pathname);
  }

  // Sort by path length (shorter = likely higher value), ties alphabetically.
  const sorted = [...pathByUrl.keys()].sort((a, b) => {
    const pathA = pathByUrl.get(a);
    const pathB = pathByUrl.get(b);
    return pathA.length - pathB.length || pathA.localeCompare(pathB);
  });

  const used = new Set();
  const result = [];

  // Walk `sorted` from `start` in direction `step`, taking up to `quota` unused
  // URLs for `role` without ever exceeding the overall sample budget.
  const take = (start, step, role, quota) => {
    let taken = 0;
    for (
      let i = start;
      i >= 0 && i < sorted.length && taken < quota && result.length < sampleSize;
      i += step
    ) {
      const url = sorted[i];
      if (used.has(url)) continue;
      result.push({ url, role });
      used.add(url);
      taken += 1;
    }
  };

  const nHighValue = Math.ceil(sampleSize * 0.30);
  const nMiddle = Math.ceil(sampleSize * 0.25);
  const nLongTail = Math.ceil(sampleSize * 0.25);

  // High-value: top of sorted (shortest paths)
  take(0, 1, 'high-value', nHighValue);

  // Middle: a window starting just before the center
  const mid = Math.floor(sorted.length / 2);
  take(Math.max(0, mid - Math.floor(nMiddle / 2)), 1, 'middle', nMiddle);

  // Long-tail: bottom of sorted (longest paths)
  take(sorted.length - 1, -1, 'long-tail', nLongTail);

  // Random: fill the remainder from a Fisher-Yates shuffle of the unused URLs
  const remaining = sampleSize - result.length;
  const unused = sorted.filter((url) => !used.has(url));
  for (let i = unused.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [unused[i], unused[j]] = [unused[j], unused[i]];
  }
  for (let i = 0; i < Math.min(remaining, unused.length); i++) {
    result.push({ url: unused[i], role: 'random' });
  }

  return result;
}
|
|
84
|
+
|
|
85
|
+
/**
 * Stealth-crawl a sample of URLs from a template group.
 *
 * Launches one headless Chromium instance, creates a single browser context
 * configured from `getStealthConfig()`, and visits the sample URLs one after
 * another with a jittered 2-4s pause between pages. The browser is always
 * closed, including when context setup itself fails.
 *
 * @param {{ url: string, role: string }[]} sample — from selectSample()
 * @param {object} opts
 * @param {string} opts.hostname — for session persistence
 *   (NOTE(review): not read anywhere in this function — confirm whether it is still needed)
 * @param {Function} opts.onPage — (result, index, total) => void, called after each page
 * @returns {Promise<SampleResult[]>} — one result per sample entry, in sample order
 */
export async function crawlSample(sample, opts = {}) {
  const { getStealthConfig, STEALTH_INIT_SCRIPT, applyStealthRoutes } = await import('../../crawler/stealth.js');
  const { chromium } = await import('playwright');

  const stealthCfg = getStealthConfig();
  const browser = await chromium.launch({ headless: true, ...stealthCfg.launchArgs });

  const results = [];

  try {
    // Context setup lives inside the try so a failure here cannot leak the browser process.
    const context = await browser.newContext(stealthCfg.contextOpts);
    await context.addInitScript(STEALTH_INIT_SCRIPT);
    await applyStealthRoutes(context);

    for (let i = 0; i < sample.length; i++) {
      const { url, role } = sample[i];
      const result = await crawlSinglePage(context, url, role);
      results.push(result);

      if (opts.onPage) opts.onPage(result, i, sample.length);

      // Jittered delay: 2-4s (skipped after the last page)
      if (i < sample.length - 1) {
        await new Promise(r => setTimeout(r, 2000 + Math.random() * 2000));
      }
    }
  } finally {
    // Best-effort shutdown: a close error must not mask the crawl outcome.
    await browser.close().catch(() => {});
  }

  return results;
}
|
|
125
|
+
|
|
126
|
+
/**
 * Crawl a single page and extract template analysis fields.
 *
 * Never rejects because of a page-level failure: navigation or extraction
 * errors are caught and the partially filled result is returned, with the
 * failure message recorded in `result.error`. `statusCode` stays 0 when
 * navigation itself failed; if a later step fails it keeps the HTTP status
 * and the fields extracted so far.
 *
 * @param {object} context — browser context used to open the page
 * @param {string} url — page to visit
 * @param {string} role — sample role, copied to `result.sampleRole`
 * @returns {Promise<object>} — sample result (status, word count, title, meta
 *   description, canonical/schema/indexable flags, DOM fingerprint, content
 *   hash, body text capped at 5000 chars, crawl timestamp)
 */
async function crawlSinglePage(context, url, role) {
  const page = await context.newPage();
  const result = {
    url,
    sampleRole: role,
    statusCode: 0,
    wordCount: 0,
    title: '',
    metaDesc: '',
    hasCanonical: false,
    hasSchema: false,
    isIndexable: true,
    domFingerprintStr: '',
    contentHash: '',
    bodyText: '',
    crawledAt: Date.now(),
  };

  try {
    const response = await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 20000 });
    result.statusCode = response?.status() || 0;

    // Error pages carry no template content; the `finally` below closes the page.
    if (result.statusCode >= 400) {
      return result;
    }

    // Wait for dynamic content
    await page.waitForTimeout(2000);

    // Extract page data (this callback runs inside the page, not in Node)
    const data = await page.evaluate(() => {
      const title = document.title || '';
      const metaDesc = document.querySelector('meta[name="description"]')?.content || '';
      const canonical = document.querySelector('link[rel="canonical"]');
      const hasCanonical = !!canonical;
      const hasSchema = !!document.querySelector('script[type="application/ld+json"]');

      // Indexability: check robots meta. Compared case-insensitively so that
      // "NOINDEX" / "NoIndex" are recognised as well.
      const robotsMeta = document.querySelector('meta[name="robots"]')?.content || '';
      const isIndexable = !robotsMeta.toLowerCase().includes('noindex');

      // Body text; single-character tokens are not counted as words
      const bodyText = document.body?.innerText || '';
      const wordCount = bodyText.split(/\s+/).filter(w => w.length > 1).length;

      return { title, metaDesc, hasCanonical, hasSchema, isIndexable, bodyText, wordCount };
    });

    result.title = data.title;
    result.metaDesc = data.metaDesc;
    result.hasCanonical = data.hasCanonical;
    result.hasSchema = data.hasSchema;
    result.isIndexable = data.isIndexable;
    result.wordCount = data.wordCount;
    // Cap body text at 5000 chars for similarity computation;
    // the hash is taken over the full, uncapped text.
    result.bodyText = data.bodyText.slice(0, 5000);
    result.contentHash = createHash('sha256').update(data.bodyText).digest('hex').slice(0, 16);

    // DOM fingerprint
    result.domFingerprintStr = await domFingerprint(page);
  } catch (err) {
    // Best effort: keep whatever was collected and record why the page failed
    // instead of discarding the error.
    result.error = err?.message || String(err);
  } finally {
    await page.close().catch(() => {});
  }

  return result;
}
|