seo-intel 1.2.4 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,425 @@
1
+ /**
2
+ * Template Analysis Orchestrator
3
+ *
4
+ * Runs five phases:
5
+ * 1. URL Pattern Clustering (sitemap parse → cluster)
6
+ * 2. Smart Sampling (stealth crawl ~20 pages/group)
7
+ * 3. GSC Overlay (cross-reference with Search Console data)
8
+ * 4. Scoring & Recommendations
9
+ * 5. Template Profile Extrapolation (infer extraction fields for all URLs from samples)
10
+ *
11
+ * Then writes results to DB and returns the report.
12
+ */
13
+
14
+ import { fetchSitemap } from '../../crawler/sitemap.js';
15
+ import { loadGscData } from '../../reports/gsc-loader.js';
16
+ import { loadAllConfigs } from '../../scheduler.js';
17
+ import { getDb, upsertTemplateGroup, getTemplateGroupId, upsertTemplateSample } from '../../db/db.js';
18
+ import { clusterUrls } from './cluster.js';
19
+ import { selectSample, crawlSample } from './sampler.js';
20
+ import { averageSimilarity, averageFingerprintSimilarity } from './similarity.js';
21
+ import { overlayGsc } from './gsc-overlay.js';
22
+ import { scoreGroup } from './scorer.js';
23
+
24
/**
 * Run full template analysis for a project.
 *
 * Orchestrates all phases: clustering, sampling, GSC overlay, scoring,
 * profile extrapolation, competitor census, then persists to the DB.
 *
 * @param {string} project — project key as registered via `seo-intel setup`
 * @param {object} opts
 * @param {number} opts.minGroupSize — min URLs per template (default 10)
 * @param {number} opts.sampleSize — pages to crawl per group (default 20)
 * @param {boolean} opts.skipCrawl — skip Phase 2 (pattern + GSC only)
 * @param {boolean} opts.skipGsc — skip Phase 3
 * @param {boolean} opts.skipCompetitors — skip Phase 6 (competitor sitemap census)
 * @param {Function} opts.log — (message) => void (default console.log)
 * @returns {Promise<TemplatesReport>}
 * @throws {Error} when the project has no stored config
 */
export async function runTemplatesAnalysis(project, opts = {}) {
  const log = opts.log ?? console.log;
  // `??` (not `||`) so explicitly-passed falsy values are honored.
  const minGroupSize = opts.minGroupSize ?? 10;
  const sampleSize = opts.sampleSize ?? 20;

  // ── Load project config ──
  const configs = loadAllConfigs();
  const config = configs.find(c => c.project === project);
  if (!config) throw new Error(`Project "${project}" not found. Run: seo-intel setup`);

  const targetDomain = config.target.domain;
  const targetUrl = config.target.url || `https://${targetDomain}`;

  log(`\n Target: ${targetDomain}`);

  // ═══ PHASE 1: URL Pattern Clustering ═══
  log(`\n Phase 1: URL Pattern Clustering`);
  log(` Fetching sitemap...`);

  const sitemapEntries = await fetchSitemap(targetUrl);

  if (!sitemapEntries.length) {
    log(` ⚠️ No sitemap URLs found for ${targetDomain}`);
    log(` Ensure sitemap.xml is accessible at ${targetUrl}/sitemap.xml`);
    return { groups: [], stats: { totalUrls: 0, totalGroups: 0, coverage: 0 }, project, domain: targetDomain };
  }

  log(` Found ${sitemapEntries.length.toLocaleString()} URLs in sitemap`);

  const { groups, ungrouped, stats } = clusterUrls(sitemapEntries, { minGroupSize });

  log(` ${stats.totalGroups} template groups found`);
  log(` Coverage: ${stats.totalGrouped.toLocaleString()} URLs (${(stats.coverage * 100).toFixed(1)}% of sitemap)`);
  log('');

  if (groups.length === 0) {
    log(` No template patterns detected (all pages are unique).`);
    return { groups: [], ungrouped, stats, project, domain: targetDomain };
  }

  // Show discovered patterns (maxPatternLen is reused for Phase 4 alignment)
  const maxPatternLen = Math.max(...groups.map(g => g.pattern.length), 7);
  log(` ${'Pattern'.padEnd(maxPatternLen)} ${'URLs'.padStart(8)} Verdict`);
  log(` ${'─'.repeat(maxPatternLen)} ${'─'.repeat(8)} ─────────`);
  for (const g of groups) {
    log(` ${g.pattern.padEnd(maxPatternLen)} ${g.urlCount.toLocaleString().padStart(8)} [pending]`);
  }
  log('');

  // ═══ PHASE 2: Smart Sample Crawl ═══
  if (!opts.skipCrawl) {
    log(` Phase 2: Smart Sample Crawl (stealth)`);

    for (const group of groups) {
      const sample = selectSample(group.urls, sampleSize);
      log(` Sampling ${group.pattern}... ${sample.length} pages`);

      try {
        const results = await crawlSample(sample, {
          hostname: targetDomain,
          onPage: (result, idx, total) => {
            const status = result.statusCode >= 400 ? '✗' : result.statusCode > 0 ? '✓' : '?';
            process.stdout.write(` [${idx + 1}/${total}] ${status} ${result.url.replace(/https?:\/\/[^/]+/, '').slice(0, 50)}\n`);
          },
        });

        group.samples = results;
        group.sampleSize = results.filter(r => r.statusCode > 0 && r.statusCode < 400).length;

        // Compute similarity stats from successful samples
        const successful = results.filter(r => r.statusCode > 0 && r.statusCode < 400);
        if (successful.length >= 2) {
          const bodyTexts = successful.map(r => r.bodyText).filter(Boolean);
          const fingerprints = successful.map(r => r.domFingerprintStr).filter(Boolean);

          group.avgWordCount = successful.reduce((s, r) => s + (r.wordCount || 0), 0) / successful.length;
          group.contentSimilarity = averageSimilarity(bodyTexts);
          group.domSimilarity = averageFingerprintSimilarity(fingerprints);
          group.canonicalRate = successful.filter(r => r.hasCanonical).length / successful.length;
        } else {
          // Fewer than 2 usable pages: similarity is undefined, not zero.
          group.avgWordCount = successful[0]?.wordCount || 0;
          group.contentSimilarity = null;
          group.domSimilarity = null;
          group.canonicalRate = null;
        }

        log(` ✓ ${group.sampleSize} successful, similarity: ${group.contentSimilarity != null ? (group.contentSimilarity * 100).toFixed(0) + '%' : 'N/A'}`);
      } catch (err) {
        log(` ✗ Sample crawl failed: ${err.message}`);
        group.samples = [];
        group.sampleSize = 0;
      }
    }
    log('');
  } else {
    log(` Phase 2: Skipped (--skip-crawl)`);
    for (const g of groups) {
      g.samples = [];
      g.sampleSize = 0;
    }
    log('');
  }

  // ═══ PHASE 3: GSC Overlay ═══
  if (!opts.skipGsc) {
    log(` Phase 3: GSC Overlay`);
    const gscData = loadGscData(project);
    if (gscData?.pages?.length) {
      log(` Loaded GSC data: ${gscData.pages.length.toLocaleString()} pages with data`);
      const overlayed = overlayGsc(groups, gscData.pages);
      // Merge GSC fields back into groups (overlayGsc returns a parallel array)
      for (let i = 0; i < groups.length; i++) {
        Object.assign(groups[i], {
          gscUrlsWithImpressions: overlayed[i].gscUrlsWithImpressions,
          gscTotalClicks: overlayed[i].gscTotalClicks,
          gscTotalImpressions: overlayed[i].gscTotalImpressions,
          gscAvgPosition: overlayed[i].gscAvgPosition,
          indexationEfficiency: overlayed[i].indexationEfficiency,
          topGscUrls: overlayed[i].topGscUrls,
        });
      }
      log(` Matched template URLs against GSC data`);
    } else {
      log(` No GSC data found for ${project}`);
      // null (not 0) marks "no data", so scoring can distinguish absence
      for (const g of groups) {
        g.gscUrlsWithImpressions = null;
        g.gscTotalClicks = null;
        g.gscTotalImpressions = null;
        g.gscAvgPosition = null;
        g.indexationEfficiency = null;
        g.topGscUrls = [];
      }
    }
    log('');
  } else {
    log(` Phase 3: Skipped (--skip-gsc)\n`);
  }

  // ═══ PHASE 4: Scoring & Recommendations ═══
  log(` Phase 4: Scoring & Recommendations`);

  // Hoisted out of the loop — the map is constant across groups.
  const verdictColor = { 'high-value': '🟢', mixed: '🟡', thin: '🟠', invisible: '🔴' };
  for (const group of groups) {
    const result = scoreGroup(group);
    group.score = result.score;
    group.verdict = result.verdict;
    group.recommendation = result.recommendations;

    log(` ${(verdictColor[group.verdict] || '⚪')} ${group.pattern.padEnd(maxPatternLen)} → ${group.verdict} (score: ${group.score})`);
  }
  log('');

  // ═══ PHASE 5: Template Profile Extrapolation ═══
  // For each group with samples, build an "inferred profile" — the common fields
  // that apply to ALL URLs in the group. This lets us "know" 47k pages from 20 samples.
  log(` Phase 5: Template Profile Extrapolation`);

  for (const group of groups) {
    group.profile = buildTemplateProfile(group);
    if (group.profile) {
      const p = group.profile;
      log(` ${group.pattern}: ${group.urlCount.toLocaleString()} pages inferred`);
      log(` schema: ${p.schemaPresence}% · canonical: ${p.canonicalPresence}% · indexable: ${p.indexablePresence}% · avg words: ${Math.round(p.avgWordCount)}`);
    }
  }
  log('');

  // ═══ PHASE 6: Competitor Sitemap Census ═══
  // Fetch competitor sitemaps and cluster them — zero crawling, just URL counting.
  // Shows: "You have 200 swap pages, Jupiter has 47k" — instant competitive intel.
  const competitorCensus = [];
  const competitors = config.competitors || [];

  if (competitors.length > 0 && !opts.skipCompetitors) {
    log(` Phase 6: Competitor Sitemap Census`);

    for (const comp of competitors) {
      const compUrl = comp.url || `https://${comp.domain}`;
      log(` Scanning ${comp.domain}...`);

      try {
        const compEntries = await fetchSitemap(compUrl);
        if (compEntries.length === 0) {
          log(` No sitemap found`);
          competitorCensus.push({ domain: comp.domain, totalUrls: 0, groups: [] });
          continue;
        }

        const compResult = clusterUrls(compEntries, { minGroupSize });
        competitorCensus.push({
          domain: comp.domain,
          totalUrls: compResult.stats.totalUrls,
          groups: compResult.groups.map(g => ({
            pattern: g.pattern,
            urlCount: g.urlCount,
          })),
          stats: compResult.stats,
        });

        log(` ${compEntries.length.toLocaleString()} URLs → ${compResult.stats.totalGroups} templates`);
        for (const g of compResult.groups.slice(0, 5)) {
          log(` ${g.pattern.padEnd(30)} ${g.urlCount.toLocaleString().padStart(8)} URLs`);
        }
        if (compResult.groups.length > 5) {
          log(` ... and ${compResult.groups.length - 5} more`);
        }
      } catch (err) {
        // One failing competitor must not abort the census; record and continue.
        log(` ✗ Failed: ${err.message}`);
        competitorCensus.push({ domain: comp.domain, totalUrls: 0, groups: [], error: err.message });
      }
    }
    log('');
  }

  // ═══ Write to DB ═══
  const db = getDb();
  const analyzedAt = Date.now();

  for (const group of groups) {
    upsertTemplateGroup(db, {
      project,
      domain: targetDomain,
      pattern: group.pattern,
      urlCount: group.urlCount,
      sampleSize: group.sampleSize || 0,
      avgWordCount: group.avgWordCount,
      contentSimilarity: group.contentSimilarity,
      domSimilarity: group.domSimilarity,
      gscUrlsWithImpressions: group.gscUrlsWithImpressions,
      gscTotalClicks: group.gscTotalClicks,
      gscTotalImpressions: group.gscTotalImpressions,
      gscAvgPosition: group.gscAvgPosition,
      indexationEfficiency: group.indexationEfficiency,
      score: group.score,
      verdict: group.verdict,
      recommendation: group.recommendation,
      analyzedAt,
    });

    // Save samples (requires the group row's id, so upsert must happen first)
    if (group.samples?.length) {
      const groupId = getTemplateGroupId(db, project, targetDomain, group.pattern);
      if (groupId) {
        for (const s of group.samples) {
          upsertTemplateSample(db, {
            groupId,
            url: s.url,
            sampleRole: s.sampleRole,
            statusCode: s.statusCode,
            wordCount: s.wordCount,
            title: s.title,
            metaDesc: s.metaDesc,
            hasCanonical: s.hasCanonical,
            hasSchema: s.hasSchema,
            isIndexable: s.isIndexable,
            domFingerprint: s.domFingerprintStr,
            contentHash: s.contentHash,
            bodyText: s.bodyText,
            crawledAt: s.crawledAt,
          });
        }
      }
    }
  }

  log(` Results saved to database.`);

  return {
    project,
    domain: targetDomain,
    groups,
    ungrouped,
    stats,
    competitorCensus,
    analyzedAt,
  };
}
313
+
314
/**
 * Build an inferred profile for a template group from its samples.
 *
 * If 20 sampled pages from /swap/{pair} show:
 *   - 95% have schema markup
 *   - 100% have canonical tags
 *   - avg 180 words
 *   - all use the same DOM structure
 *
 * We can extrapolate that to all 47,000 pages in the group.
 * This replaces the need to crawl+extract every page.
 *
 * @param {object} group — template group with .samples[] and .urlCount
 * @returns {object|null} — inferred profile, or null if fewer than 2 usable samples
 */
function buildTemplateProfile(group) {
  // Only successful crawls (2xx/3xx) count as evidence.
  const samples = (group.samples || []).filter(s => s.statusCode > 0 && s.statusCode < 400);
  if (samples.length < 2) return null;

  const n = samples.length;

  // ── Presence rates (%) — extrapolated to all URLs in the group ──
  const schemaPresence = Math.round((samples.filter(s => s.hasSchema).length / n) * 100);
  const canonicalPresence = Math.round((samples.filter(s => s.hasCanonical).length / n) * 100);
  const indexablePresence = Math.round((samples.filter(s => s.isIndexable).length / n) * 100);

  // ── Content stats ──
  const avgWordCount = samples.reduce((sum, s) => sum + (s.wordCount || 0), 0) / n;
  const minWordCount = Math.min(...samples.map(s => s.wordCount || 0));
  const maxWordCount = Math.max(...samples.map(s => s.wordCount || 0));

  // ── Title/meta pattern detection ──
  // Find the common template in titles by extracting shared prefixes/suffixes
  const titlePattern = detectPattern(samples.map(s => s.title).filter(Boolean));
  const metaPattern = detectPattern(samples.map(s => s.metaDesc).filter(Boolean));

  // ── Unique content hashes ──
  // Fix: diversity is measured over samples that actually produced a hash —
  // dividing by ALL samples deflated the ratio whenever extraction failed
  // mid-crawl and left contentHash empty.
  const hashes = samples.map(s => s.contentHash).filter(Boolean);
  const uniqueHashes = new Set(hashes);
  const contentDiversity = hashes.length > 0
    ? uniqueHashes.size / hashes.length // 1.0 = all unique, low = duplicates
    : 0;

  // ── Inferred totals (extrapolated from sample rates to group.urlCount) ──
  const estimatedWithSchema = Math.round(group.urlCount * (schemaPresence / 100));
  const estimatedWithCanonical = Math.round(group.urlCount * (canonicalPresence / 100));
  const estimatedIndexable = Math.round(group.urlCount * (indexablePresence / 100));
  const estimatedTotalWords = Math.round(group.urlCount * avgWordCount);

  return {
    sampleCount: n,
    totalInferred: group.urlCount,

    // Rates (%)
    schemaPresence,
    canonicalPresence,
    indexablePresence,

    // Content
    avgWordCount,
    minWordCount,
    maxWordCount,
    contentDiversity,
    estimatedTotalWords,

    // Patterns
    titlePattern,
    metaPattern,

    // Extrapolated totals
    estimatedWithSchema,
    estimatedWithCanonical,
    estimatedIndexable,
  };
}
386
+
387
/**
 * Detect the common template pattern in a set of strings.
 * Returns the shared prefix + "{variable}" + shared suffix.
 *
 * e.g. ["Swap SOL to USDC | Jupiter", "Swap BONK to USDT | Jupiter"]
 *      → "Swap {…} | Jupiter"
 *
 * @param {string[]} strings
 * @returns {string|null} — pattern, the sole/identical string, or null when
 *   the inputs share no common prefix or suffix
 */
function detectPattern(strings) {
  if (strings.length < 2) return strings[0] || null;

  // The shortest input bounds both scans AND the overlap guard. Using
  // strings[0].length (as before) let prefix and suffix double-count
  // characters of a shorter string, yielding impossible patterns.
  const minLen = Math.min(...strings.map(s => s.length));

  // Longest common prefix.
  let prefixLen = 0;
  while (
    prefixLen < minLen &&
    strings.every(s => s[prefixLen] === strings[0][prefixLen])
  ) {
    prefixLen++;
  }

  // Longest common suffix, never overlapping the prefix within any string.
  let suffixLen = 0;
  while (
    suffixLen < minLen - prefixLen &&
    strings.every(s => s[s.length - 1 - suffixLen] === strings[0][strings[0].length - 1 - suffixLen])
  ) {
    suffixLen++;
  }

  // Prefix+suffix cover the whole first string → effectively identical.
  if (prefixLen + suffixLen >= strings[0].length) {
    return strings[0];
  }

  if (prefixLen === 0 && suffixLen === 0) return null; // no common pattern

  const prefix = strings[0].slice(0, prefixLen);
  const suffix = suffixLen > 0 ? strings[0].slice(-suffixLen) : '';
  return (prefix.trim() + ' {…} ' + suffix.trim()).trim();
}
@@ -0,0 +1,198 @@
1
+ /**
2
+ * Smart Sample Selection & Stealth Crawl — Phase 2
3
+ *
4
+ * Selects a strategic sample from each template group,
5
+ * then stealth-crawls those pages for content analysis.
6
+ */
7
+
8
+ import { createHash } from 'crypto';
9
+ import { domFingerprint } from './similarity.js';
10
+
11
/**
 * Select which URLs to crawl from a group.
 * Pure function — no I/O (aside from Math.random for the 'random' stratum).
 *
 * Strategy:
 *   - high-value: shortest paths (likely most important)
 *   - middle: middle of sorted list
 *   - long-tail: longest paths (most specific/obscure)
 *   - random: random picks across the full list
 *
 * Fix: stratum quotas are clamped so the result NEVER exceeds sampleSize
 * (previously Math.ceil quotas summed past the budget for small sizes,
 * e.g. sampleSize=1 returned 3 URLs).
 *
 * @param {string[]} urls — absolute URLs (parseable by `new URL`)
 * @param {number} sampleSize — default 20
 * @returns {{ url: string, role: string }[]}
 */
export function selectSample(urls, sampleSize = 20) {
  if (urls.length <= sampleSize) {
    return urls.map(url => ({ url, role: 'all' }));
  }

  // Sort by path length (shorter = likely higher value); ties broken
  // lexicographically for determinism.
  const sorted = [...urls].sort((a, b) => {
    const pathA = new URL(a).pathname;
    const pathB = new URL(b).pathname;
    return pathA.length - pathB.length || pathA.localeCompare(pathB);
  });

  const used = new Set();
  const result = [];

  // Clamp each quota to the budget left after the previous strata.
  const nHighValue = Math.min(Math.ceil(sampleSize * 0.30), sampleSize);
  const nMiddle = Math.min(Math.ceil(sampleSize * 0.25), sampleSize - nHighValue);
  const nLongTail = Math.min(Math.ceil(sampleSize * 0.25), sampleSize - nHighValue - nMiddle);

  // Pick sorted[index] for `role` unless already taken; returns success.
  const take = (index, role) => {
    const url = sorted[index];
    if (used.has(url)) return false;
    used.add(url);
    result.push({ url, role });
    return true;
  };

  // High-value: top of sorted (shortest paths)
  let taken = 0;
  for (let i = 0; i < sorted.length && taken < nHighValue; i++) {
    if (take(i, 'high-value')) taken++;
  }

  // Middle: around the center
  const mid = Math.floor(sorted.length / 2);
  const midStart = Math.max(0, mid - Math.floor(nMiddle / 2));
  taken = 0;
  for (let i = midStart; i < sorted.length && taken < nMiddle; i++) {
    if (take(i, 'middle')) taken++;
  }

  // Long-tail: bottom of sorted (longest paths)
  taken = 0;
  for (let i = sorted.length - 1; i >= 0 && taken < nLongTail; i--) {
    if (take(i, 'long-tail')) taken++;
  }

  // Random: fill whatever budget remains (clamped at 0)
  const remaining = Math.max(0, sampleSize - result.length);
  const unused = sorted.filter(u => !used.has(u));
  // Fisher–Yates shuffle
  for (let i = unused.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [unused[i], unused[j]] = [unused[j], unused[i]];
  }
  for (let i = 0; i < Math.min(remaining, unused.length); i++) {
    result.push({ url: unused[i], role: 'random' });
  }

  return result;
}
84
+
85
/**
 * Stealth-crawl a sample of URLs from a template group.
 *
 * Pages are visited sequentially with a jittered 2–4s pause between them;
 * the browser is always closed, even when a page crawl throws.
 *
 * @param {{ url: string, role: string }[]} sample — from selectSample()
 * @param {object} opts
 * @param {string} opts.hostname — for session persistence
 * @param {Function} opts.onPage — (result, index, total) => void
 * @returns {Promise<SampleResult[]>}
 */
export async function crawlSample(sample, opts = {}) {
  // Heavy dependencies are loaded lazily so importing this module stays cheap.
  const { getStealthConfig, STEALTH_INIT_SCRIPT, applyStealthRoutes } = await import('../../crawler/stealth.js');
  const { chromium } = await import('playwright');

  const cfg = getStealthConfig();
  const browser = await chromium.launch({ headless: true, ...cfg.launchArgs });
  const context = await browser.newContext(cfg.contextOpts);
  await context.addInitScript(STEALTH_INIT_SCRIPT);
  await applyStealthRoutes(context);

  const results = [];

  try {
    for (const [index, { url, role }] of sample.entries()) {
      const pageResult = await crawlSinglePage(context, url, role);
      results.push(pageResult);

      if (opts.onPage) opts.onPage(pageResult, index, sample.length);

      // Jittered delay (2–4s) between pages; skipped after the final one.
      const isLast = index === sample.length - 1;
      if (!isLast) {
        const delayMs = 2000 + Math.random() * 2000;
        await new Promise(resolve => setTimeout(resolve, delayMs));
      }
    }
  } finally {
    await browser.close().catch(() => {});
  }

  return results;
}
125
+
126
/**
 * Crawl a single page and extract template analysis fields.
 *
 * Never throws: on navigation/extraction failure the result is returned
 * with its defaults (statusCode stays 0 if navigation failed) and the
 * failure reason is recorded in `result.error`.
 *
 * @param {object} context — Playwright BrowserContext
 * @param {string} url
 * @param {string} role — sample role assigned by selectSample()
 * @returns {Promise<object>} SampleResult
 */
async function crawlSinglePage(context, url, role) {
  const page = await context.newPage();
  const result = {
    url,
    sampleRole: role,
    statusCode: 0,
    wordCount: 0,
    title: '',
    metaDesc: '',
    hasCanonical: false,
    hasSchema: false,
    isIndexable: true,
    domFingerprintStr: '',
    contentHash: '',
    bodyText: '',
    crawledAt: Date.now(),
    error: null, // populated when navigation or extraction fails
  };

  try {
    const response = await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 20000 });
    result.statusCode = response?.status() || 0;

    // Error page — nothing worth extracting. The finally block closes the
    // page, so no explicit close here (the old pre-return close double-closed
    // the page and was unguarded).
    if (result.statusCode >= 400) {
      return result;
    }

    // Wait for dynamic content
    await page.waitForTimeout(2000);

    // Extract page data
    const data = await page.evaluate(() => {
      const title = document.title || '';
      const metaDesc = document.querySelector('meta[name="description"]')?.content || '';
      const canonical = document.querySelector('link[rel="canonical"]');
      const hasCanonical = !!canonical;
      const hasSchema = !!document.querySelector('script[type="application/ld+json"]');

      // Indexability: check robots meta
      const robotsMeta = document.querySelector('meta[name="robots"]')?.content || '';
      const isIndexable = !robotsMeta.includes('noindex');

      // Body text
      const bodyText = document.body?.innerText || '';
      const wordCount = bodyText.split(/\s+/).filter(w => w.length > 1).length;

      return { title, metaDesc, hasCanonical, hasSchema, isIndexable, bodyText, wordCount };
    });

    result.title = data.title;
    result.metaDesc = data.metaDesc;
    result.hasCanonical = data.hasCanonical;
    result.hasSchema = data.hasSchema;
    result.isIndexable = data.isIndexable;
    result.wordCount = data.wordCount;
    // Cap body text at 5000 chars for similarity computation;
    // the hash is taken over the FULL text so truncation can't collide.
    result.bodyText = data.bodyText.slice(0, 5000);
    result.contentHash = createHash('sha256').update(data.bodyText).digest('hex').slice(0, 16);

    // DOM fingerprint
    result.domFingerprintStr = await domFingerprint(page);

  } catch (err) {
    // Page failed — keep the defaults, but surface WHY instead of
    // swallowing the error silently.
    result.error = err.message;
  } finally {
    await page.close().catch(() => {});
  }

  return result;
}