headhunt 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/scraper.js ADDED
@@ -0,0 +1,815 @@
1
+ const axios = require('axios');
2
+ const cheerio = require('cheerio');
3
+ const { URL } = require('url');
4
+ const { fmt } = require('./colors');
5
+ const { httpsAgent, USER_AGENT, SCORE_WEIGHTS } = require('./constants');
6
+ const log = require('./logger');
7
+
8
+ class SEOScraper {
9
+ constructor(url, options = {}) {
10
+ this.url = url;
11
+ this.options = options;
12
+ this.html = '';
13
+ this.$ = null;
14
+ this.fetchTime = 0;
15
+ this.httpStatus = null;
16
+ this.responseHeaders = {};
17
+ this.finalUrl = url;
18
+
19
+ this.metadata = {
20
+ url,
21
+ finalUrl: url,
22
+ timestamp: new Date().toISOString(),
23
+ httpStatus: null,
24
+ responseHeaders: {},
25
+ fetchTimeMs: 0,
26
+ seo: {},
27
+ scores: {},
28
+ recommendations: [],
29
+ overallScore: 0,
30
+ grade: '',
31
+ summary: '',
32
+ };
33
+ }
34
+
35
+ async fetchPage() {
36
+ const start = Date.now();
37
+ log.info(`Fetching \u2192 ${fmt.cyan(this.url)}`);
38
+
39
+ const response = await axios.get(this.url, {
40
+ headers: {
41
+ 'User-Agent': USER_AGENT,
42
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43
+ 'Accept-Language': 'en-US,en;q=0.5',
44
+ 'Accept-Encoding': 'gzip, deflate, br',
45
+ 'Connection': 'keep-alive',
46
+ 'Upgrade-Insecure-Requests': '1',
47
+ 'Cache-Control': 'no-cache',
48
+ },
49
+ httpsAgent,
50
+ timeout: 30000,
51
+ maxRedirects: 10,
52
+ });
53
+
54
+ this.fetchTime = Date.now() - start;
55
+ this.html = response.data;
56
+ this.$ = cheerio.load(this.html);
57
+ this.httpStatus = response.status;
58
+ this.responseHeaders = response.headers;
59
+ this.finalUrl = response.request?.res?.responseUrl || this.url;
60
+
61
+ this.metadata.httpStatus = this.httpStatus;
62
+ this.metadata.fetchTimeMs = this.fetchTime;
63
+ this.metadata.finalUrl = this.finalUrl;
64
+ this.metadata.responseHeaders = {
65
+ 'content-type': response.headers['content-type'] || null,
66
+ 'x-powered-by': response.headers['x-powered-by'] || null,
67
+ 'server': response.headers['server'] || null,
68
+ 'cache-control': response.headers['cache-control'] || null,
69
+ 'strict-transport-security': response.headers['strict-transport-security'] || null,
70
+ 'x-frame-options': response.headers['x-frame-options'] || null,
71
+ 'x-content-type-options': response.headers['x-content-type-options'] || null,
72
+ 'content-security-policy': response.headers['content-security-policy'] ? '(present)' : null,
73
+ 'referrer-policy': response.headers['referrer-policy'] || null,
74
+ 'permissions-policy': response.headers['permissions-policy'] ? '(present)' : null,
75
+ };
76
+
77
+ log.ok(`Fetched in ${fmt.yellow(this.fetchTime + 'ms')} \u2014 HTTP ${fmt.green(this.httpStatus)}`);
78
+ if (this.finalUrl !== this.url) {
79
+ log.info(`Redirected \u2192 ${fmt.dim(this.finalUrl)}`);
80
+ }
81
+ }
82
+
83
+ extractBasicMetadata() {
84
+ const title = this.$('title').text().trim() || null;
85
+ const desc = this.$('meta[name="description"]').attr('content') || null;
86
+ const keywords = this.$('meta[name="keywords"]').attr('content') || null;
87
+ const canonical = this.$('link[rel="canonical"]').attr('href') || null;
88
+ const robots = this.$('meta[name="robots"]').attr('content') || null;
89
+ const googlebot = this.$('meta[name="googlebot"]').attr('content') || null;
90
+ const viewport = this.$('meta[name="viewport"]').attr('content') || null;
91
+ const charset = this.$('meta[charset]').attr('charset') ||
92
+ (this.$('meta[http-equiv="Content-Type"]').attr('content') || '').match(/charset=([^\s;]+)/i)?.[1] || null;
93
+ const language = this.$('html').attr('lang') || null;
94
+ const themeColor = this.$('meta[name="theme-color"]').attr('content') || null;
95
+ const author = this.$('meta[name="author"]').attr('content') || null;
96
+ const generator = this.$('meta[name="generator"]').attr('content') || null;
97
+ const rating = this.$('meta[name="rating"]').attr('content') || null;
98
+ const revisit = this.$('meta[name="revisit-after"]').attr('content') || null;
99
+
100
+ const cms = this.detectCMS(generator);
101
+
102
+ const hreflang = [];
103
+ this.$('link[rel="alternate"][hreflang]').each((i, el) => {
104
+ hreflang.push({
105
+ lang: this.$(el).attr('hreflang'),
106
+ href: this.$(el).attr('href'),
107
+ });
108
+ });
109
+
110
+ const prevPage = this.$('link[rel="prev"]').attr('href') || null;
111
+ const nextPage = this.$('link[rel="next"]').attr('href') || null;
112
+
113
+ const basic = {
114
+ title, titleLength: title?.length ?? 0,
115
+ metaDescription: desc, descriptionLength: desc?.length ?? 0,
116
+ keywords, keywordCount: keywords ? keywords.split(',').length : 0,
117
+ canonical, canonicalMatchesFinal: canonical ? this.urlsMatch(canonical, this.finalUrl) : null,
118
+ robots, googlebot,
119
+ viewport, charset, language, themeColor,
120
+ author, generator, cms, rating, revisit,
121
+ hreflang, pagination: { prev: prevPage, next: nextPage },
122
+ };
123
+
124
+ this.metadata.seo.basic = basic;
125
+ }
126
+
127
+ detectCMS(generator) {
128
+ const html = this.html.toLowerCase();
129
+
130
+ if (html.includes('wp-content') || html.includes('wp-json')) return 'WordPress';
131
+ if (html.includes('shopify')) return 'Shopify';
132
+ if (html.includes('drupal')) return 'Drupal';
133
+ if (html.includes('joomla')) return 'Joomla';
134
+ if (html.includes('squarespace')) return 'Squarespace';
135
+ if (html.includes('webflow')) return 'Webflow';
136
+ if (html.includes('wix.com')) return 'Wix';
137
+ if (html.includes('ghost')) return 'Ghost';
138
+ if (html.includes('nextjs') || html.includes('__next')) return 'Next.js';
139
+ if (html.includes('nuxt') || html.includes('__nuxt')) return 'Nuxt.js';
140
+ if (generator) return generator.split(' ')[0];
141
+ return null;
142
+ }
143
+
144
+ urlsMatch(a, b) {
145
+ try {
146
+ const ua = new URL(a), ub = new URL(b);
147
+ return ua.hostname === ub.hostname && ua.pathname.replace(/\/$/, '') === ub.pathname.replace(/\/$/, '');
148
+ } catch { return null; }
149
+ }
150
+
151
+ extractOpenGraphMetadata() {
152
+ const og = {};
153
+ this.$('meta[property^="og:"]').each((i, el) => {
154
+ const prop = this.$(el).attr('property')?.replace('og:', '');
155
+ const content = this.$(el).attr('content');
156
+ if (prop && content) og[prop] = content;
157
+ });
158
+ this.metadata.seo.openGraph = og;
159
+ }
160
+
161
+ extractTwitterMetadata() {
162
+ const tw = {};
163
+ this.$('meta[name^="twitter:"]').each((i, el) => {
164
+ const name = this.$(el).attr('name')?.replace('twitter:', '');
165
+ const content = this.$(el).attr('content');
166
+ if (name && content) tw[name] = content;
167
+ });
168
+ this.metadata.seo.twitter = tw;
169
+ }
170
+
171
+ extractSchemaMetadata() {
172
+ const jsonLd = [];
173
+ const schemaTypes = [];
174
+
175
+ this.$('script[type="application/ld+json"]').each((i, el) => {
176
+ try {
177
+ const parsed = JSON.parse(this.$(el).html());
178
+ jsonLd.push(parsed);
179
+ const type = parsed['@type'];
180
+ if (type) schemaTypes.push(Array.isArray(type) ? type.join(', ') : type);
181
+ } catch (_) { }
182
+ });
183
+
184
+ const microdata = {};
185
+ this.$('[itemscope]').each((i, el) => {
186
+ const itemtype = this.$(el).attr('itemtype');
187
+ if (!itemtype) return;
188
+ const type = itemtype.split('/').pop();
189
+ if (!microdata[type]) microdata[type] = [];
190
+ const item = {};
191
+ this.$(el).find('[itemprop]').each((j, prop) => {
192
+ const name = this.$(prop).attr('itemprop');
193
+ item[name] = this.$(prop).attr('content') || this.$(prop).text().trim().substring(0, 200);
194
+ });
195
+ microdata[type].push(item);
196
+ });
197
+
198
+ this.metadata.seo.schema = { jsonLd, schemaTypes, microdata };
199
+ }
200
+
201
+ extractTechnicalMetadata() {
202
+ const $ = this.$;
203
+ const headers = this.metadata.responseHeaders;
204
+
205
+ const technical = {
206
+ doctype: this.html.match(/<!DOCTYPE[^>]*>/i)?.[0] || null,
207
+ htmlVersion: this.getHtmlVersion(),
208
+ hasViewport: $('meta[name="viewport"]').length > 0,
209
+ hasCharset: $('meta[charset]').length > 0,
210
+ hasCanonical: $('link[rel="canonical"]').length > 0,
211
+ hasRobots: $('meta[name="robots"]').length > 0,
212
+ hasSitemap: $('link[rel="sitemap"]').length > 0,
213
+ hasRSS: $('link[type="application/rss+xml"]').length > 0,
214
+ hasAMP: $('link[rel="amphtml"]').length > 0,
215
+ hasFavicon: $('link[rel="icon"], link[rel="shortcut icon"]').length > 0,
216
+ hasManifest: $('link[rel="manifest"]').length > 0,
217
+ hasServiceWorker: this.html.includes('serviceWorker') || this.html.includes('service-worker'),
218
+ sitemapUrl: $('link[rel="sitemap"]').attr('href') || null,
219
+ rssUrl: $('link[type="application/rss+xml"]').attr('href') || null,
220
+ ampUrl: $('link[rel="amphtml"]').attr('href') || null,
221
+ manifestUrl: $('link[rel="manifest"]').attr('href') || null,
222
+
223
+ https: this.url.startsWith('https://'),
224
+ hsts: !!headers['strict-transport-security'],
225
+ xFrameOptions: !!headers['x-frame-options'],
226
+ xContentTypeOptions: !!headers['x-content-type-options'],
227
+ csp: !!headers['content-security-policy'],
228
+ referrerPolicy: !!headers['referrer-policy'],
229
+ permissionsPolicy: !!headers['permissions-policy'],
230
+ serverInfo: headers['server'] || null,
231
+ poweredBy: headers['x-powered-by'] || null,
232
+
233
+ wasRedirected: this.finalUrl !== this.url,
234
+ redirectTarget: this.finalUrl !== this.url ? this.finalUrl : null,
235
+
236
+ fetchTimeMs: this.fetchTime,
237
+ httpStatus: this.httpStatus,
238
+ };
239
+
240
+ this.metadata.seo.technical = technical;
241
+ }
242
+
243
+ getHtmlVersion() {
244
+ const d = (this.html.match(/<!DOCTYPE[^>]*>/i)?.[0] || '').toLowerCase();
245
+ if (!d) return 'Unknown';
246
+ if (d === '<!doctype html>') return 'HTML5';
247
+ if (d.includes('html 4.01')) return 'HTML 4.01';
248
+ if (d.includes('xhtml 1.0')) return 'XHTML 1.0';
249
+ if (d.includes('xhtml 1.1')) return 'XHTML 1.1';
250
+ return 'Unknown';
251
+ }
252
+
253
+ extractHeadings() {
254
+ const headings = { h1: [], h2: [], h3: [], h4: [], h5: [], h6: [], counts: {} };
255
+ let order = [];
256
+
257
+ for (let i = 1; i <= 6; i++) {
258
+ const tag = `h${i}`;
259
+ this.$(tag).each((idx, el) => {
260
+ const text = this.$(el).text().trim();
261
+ const id = this.$(el).attr('id') || null;
262
+ if (text) {
263
+ headings[tag].push({ text: text.substring(0, 200), length: text.length, id });
264
+ order.push({ level: i, text: text.substring(0, 100) });
265
+ }
266
+ });
267
+ headings.counts[tag] = headings[tag].length;
268
+ }
269
+
270
+ const issues = [];
271
+ let lastLevel = 0;
272
+ for (const h of order) {
273
+ if (lastLevel && h.level > lastLevel + 1) {
274
+ issues.push(`Skipped from H${lastLevel} to H${h.level}: "${h.text.substring(0, 60)}"`);
275
+ }
276
+ lastLevel = h.level;
277
+ }
278
+
279
+ headings.hierarchyIssues = issues;
280
+ headings.order = order.slice(0, 30);
281
+
282
+ this.metadata.seo.headings = headings;
283
+ }
284
+
285
+ extractImages() {
286
+ if (this.options.noImages) { this.metadata.seo.images = { skipped: true }; return; }
287
+
288
+ const images = { withAlt: [], withoutAlt: [], lazyLoaded: [], oversized: [], counts: {} };
289
+
290
+ this.$('img').each((i, el) => {
291
+ const src = this.$(el).attr('src') || this.$(el).attr('data-src');
292
+ const alt = this.$(el).attr('alt');
293
+ const loading = this.$(el).attr('loading');
294
+ const width = parseInt(this.$(el).attr('width') || '0');
295
+ const height = parseInt(this.$(el).attr('height') || '0');
296
+ const srcset = this.$(el).attr('srcset');
297
+
298
+ if (!src) return;
299
+
300
+ const img = {
301
+ src: src.substring(0, 300),
302
+ alt: alt || null,
303
+ hasAlt: alt !== undefined && alt !== null,
304
+ altEmpty: alt === '',
305
+ isLazy: loading === 'lazy',
306
+ hasDimensions: width > 0 && height > 0,
307
+ hasSrcset: !!srcset,
308
+ title: this.$(el).attr('title') || null,
309
+ };
310
+
311
+ if (img.hasAlt && !img.altEmpty) images.withAlt.push(img);
312
+ else images.withoutAlt.push(img);
313
+ if (img.isLazy) images.lazyLoaded.push(img);
314
+ });
315
+
316
+ const total = images.withAlt.length + images.withoutAlt.length;
317
+ images.counts = {
318
+ total,
319
+ withAlt: images.withAlt.length,
320
+ withoutAlt: images.withoutAlt.length,
321
+ lazyLoaded: images.lazyLoaded.length,
322
+ altCoverage: total > 0 ? Math.round((images.withAlt.length / total) * 100) : 100,
323
+ };
324
+
325
+ this.metadata.seo.images = images;
326
+ }
327
+
328
+ extractLinks() {
329
+ if (this.options.noLinks) { this.metadata.seo.links = { skipped: true }; return; }
330
+
331
+ const links = { internal: [], external: [], nofollow: [], sponsored: [], ugc: [] };
332
+ const seen = new Set();
333
+
334
+ this.$('a[href]').each((i, el) => {
335
+ const href = this.$(el).attr('href') || '';
336
+ const rel = (this.$(el).attr('rel') || '').toLowerCase();
337
+ const text = this.$(el).text().trim().substring(0, 100);
338
+
339
+ if (!href || href.startsWith('#') || href.startsWith('javascript:') || href.startsWith('mailto:') || href.startsWith('tel:')) return;
340
+
341
+ const link = {
342
+ url: href.substring(0, 500),
343
+ text: text || null,
344
+ rel: this.$(el).attr('rel') || null,
345
+ isNofollow: rel.includes('nofollow'),
346
+ isSponsored: rel.includes('sponsored'),
347
+ isUGC: rel.includes('ugc'),
348
+ title: this.$(el).attr('title') || null,
349
+ hasText: !!text,
350
+ };
351
+
352
+ const key = href;
353
+ if (!seen.has(key)) {
354
+ seen.add(key);
355
+ if (href.startsWith('http')) {
356
+ try {
357
+ const isInternal = new URL(href).hostname === new URL(this.url).hostname;
358
+ if (isInternal) links.internal.push(link);
359
+ else links.external.push(link);
360
+ } catch { links.external.push(link); }
361
+ } else {
362
+ links.internal.push(link);
363
+ }
364
+ }
365
+
366
+ if (link.isNofollow) links.nofollow.push(link);
367
+ if (link.isSponsored) links.sponsored.push(link);
368
+ if (link.isUGC) links.ugc.push(link);
369
+ });
370
+
371
+ const emptyTextLinks = [...links.internal, ...links.external].filter(l => !l.hasText);
372
+
373
+ links.counts = {
374
+ total: links.internal.length + links.external.length,
375
+ internal: links.internal.length,
376
+ external: links.external.length,
377
+ nofollow: links.nofollow.length,
378
+ emptyText: emptyTextLinks.length,
379
+ };
380
+
381
+ this.metadata.seo.links = links;
382
+ }
383
+
384
+ extractContentMetadata() {
385
+ const $ = this.$;
386
+
387
+ $('script, style, noscript, nav, footer, header').remove();
388
+ const bodyText = $('body').text().replace(/\s+/g, ' ').trim();
389
+ const wordCount = bodyText ? bodyText.split(/\s+/).length : 0;
390
+
391
+ const title = this.metadata.seo.basic?.title || '';
392
+ const titleWords = title.toLowerCase().split(/\s+/).filter(w => w.length > 3);
393
+ const densities = {};
394
+ if (bodyText && titleWords.length) {
395
+ const lowerText = bodyText.toLowerCase();
396
+ for (const word of titleWords) {
397
+ const count = (lowerText.match(new RegExp(`\\b${word}\\b`, 'g')) || []).length;
398
+ densities[word] = {
399
+ count,
400
+ density: wordCount > 0 ? ((count / wordCount) * 100).toFixed(2) + '%' : '0%',
401
+ };
402
+ }
403
+ }
404
+
405
+ const paragraphs = $('p').filter((i, el) => $(el).text().trim().length > 50).length;
406
+ const iframes = $('iframe').length;
407
+ const htmlLang = this.metadata.seo.basic?.language || '';
408
+
409
+ this.metadata.seo.content = {
410
+ wordCount,
411
+ paragraphCount: paragraphs,
412
+ iframeCount: iframes,
413
+ estimatedReadingMinutes: Math.ceil(wordCount / 200),
414
+ keywordDensities: densities,
415
+ htmlLang,
416
+ };
417
+ }
418
+
419
+ extractPerformanceMetrics() {
420
+ const $ = this.$;
421
+ const html = this.html;
422
+
423
+ const scripts = $('script[src]').map((i, el) => $(el).attr('src')).get();
424
+ const styles = $('link[rel="stylesheet"]').map((i, el) => $(el).attr('href')).get();
425
+
426
+ const renderBlocking = {
427
+ scripts: scripts.filter(s => s).length,
428
+ styles: styles.filter(s => s).length,
429
+ };
430
+
431
+ const headScripts = $('head script[src]:not([defer]):not([async])').length;
432
+ const deferScripts = $('script[defer]').length;
433
+ const asyncScripts = $('script[async]').length;
434
+ const moduleScripts = $('script[type="module"]').length;
435
+
436
+ const htmlSize = Buffer.byteLength(html, 'utf8');
437
+ const estimatedResources = renderBlocking.styles * 15000 + renderBlocking.scripts * 50000 + $('img').length * 80000;
438
+
439
+ this.metadata.seo.performance = {
440
+ htmlSizeBytes: htmlSize,
441
+ htmlSizeKB: Math.round(htmlSize / 1024),
442
+ fetchTimeMs: this.fetchTime,
443
+ totalImages: $('img').length,
444
+ totalScripts: $('script').length,
445
+ totalExternalScripts: scripts.length,
446
+ totalStylesheets: styles.length,
447
+ totalInlineScripts: $('script:not([src])').length,
448
+ totalInlineStyles: $('style').length,
449
+ headScripts,
450
+ deferScripts,
451
+ asyncScripts,
452
+ moduleScripts,
453
+ renderBlockingRisk: headScripts > 0,
454
+ estimated: {
455
+ externalResourcesBytes: estimatedResources,
456
+ totalPageBytes: htmlSize + estimatedResources,
457
+ totalPageKB: Math.round((htmlSize + estimatedResources) / 1024),
458
+ },
459
+ };
460
+ }
461
+
462
+ computeScores() {
463
+ const s = this.metadata.seo;
464
+ const scores = {};
465
+
466
+ {
467
+ let score = 0; const max = SCORE_WEIGHTS.title.max;
468
+ const title = s.basic?.title;
469
+ if (title) {
470
+ score += 5;
471
+ if (title.length >= 30 && title.length <= 60) score += 7;
472
+ else if (title.length >= 20 && title.length <= 70) score += 4;
473
+ else score += 1;
474
+ if (title.length > 0) score += 3;
475
+ }
476
+ scores.title = { score: Math.min(score, max), max };
477
+ }
478
+
479
+ {
480
+ let score = 0; const max = SCORE_WEIGHTS.description.max;
481
+ const desc = s.basic?.metaDescription;
482
+ if (desc) {
483
+ score += 4;
484
+ if (desc.length >= 120 && desc.length <= 160) score += 6;
485
+ else if (desc.length >= 80 && desc.length <= 200) score += 3;
486
+ else score += 1;
487
+ if (desc.length > 0) score += 2;
488
+ }
489
+ scores.description = { score: Math.min(score, max), max };
490
+ }
491
+
492
+ {
493
+ let score = 0; const max = SCORE_WEIGHTS.headings.max;
494
+ const h = s.headings;
495
+ if (h) {
496
+ const h1count = h.counts.h1 || 0;
497
+ if (h1count === 1) score += 5;
498
+ else if (h1count > 1) score += 2;
499
+ if (h.counts.h2 > 0) score += 3;
500
+ if (h.hierarchyIssues?.length === 0) score += 2;
501
+ }
502
+ scores.headings = { score: Math.min(score, max), max };
503
+ }
504
+
505
+ {
506
+ let score = 0; const max = SCORE_WEIGHTS.canonical.max;
507
+ if (s.basic?.canonical) {
508
+ score += 3;
509
+ if (s.basic.canonicalMatchesFinal) score += 2;
510
+ else score += 1;
511
+ }
512
+ scores.canonical = { score: Math.min(score, max), max };
513
+ }
514
+
515
+ {
516
+ let score = 0; const max = SCORE_WEIGHTS.robots.max;
517
+ const robots = s.basic?.robots;
518
+ if (robots) {
519
+ if (robots.includes('noindex')) score += 1;
520
+ else score += 3;
521
+ } else {
522
+ score += 2;
523
+ }
524
+ scores.robots = { score: Math.min(score, max), max };
525
+ }
526
+
527
+ {
528
+ let score = 0; const max = SCORE_WEIGHTS.openGraph.max;
529
+ const og = s.openGraph || {};
530
+ if (og.title) score += 2;
531
+ if (og.description) score += 2;
532
+ if (og.image) score += 2;
533
+ if (og.type) score += 1;
534
+ if (og.url) score += 1;
535
+ scores.openGraph = { score: Math.min(score, max), max };
536
+ }
537
+
538
+ {
539
+ let score = 0; const max = SCORE_WEIGHTS.twitterCard.max;
540
+ const tw = s.twitter || {};
541
+ if (tw.card) score += 2;
542
+ if (tw.title) score += 1;
543
+ if (tw.description) score += 1;
544
+ if (tw.image) score += 1;
545
+ scores.twitterCard = { score: Math.min(score, max), max };
546
+ }
547
+
548
+ {
549
+ let score = 0; const max = SCORE_WEIGHTS.schema.max;
550
+ const schema = s.schema;
551
+ if (schema?.jsonLd?.length > 0) {
552
+ score += 5;
553
+ if (schema.jsonLd.length >= 2) score += 2;
554
+ if (schema.schemaTypes?.some(t => ['WebPage', 'WebSite', 'Article', 'Product', 'Organization', 'LocalBusiness', 'BreadcrumbList'].includes(t))) score += 3;
555
+ }
556
+ if (Object.keys(schema?.microdata || {}).length > 0) score += 2;
557
+ scores.schema = { score: Math.min(score, max), max };
558
+ }
559
+
560
+ {
561
+ let score = 0; const max = SCORE_WEIGHTS.images.max;
562
+ const img = s.images;
563
+ if (!img || img.skipped) { scores.images = { score: max, max }; }
564
+ else {
565
+ const coverage = img.counts.altCoverage || 100;
566
+ if (coverage === 100) score += 4;
567
+ else if (coverage >= 80) score += 3;
568
+ else if (coverage >= 50) score += 1;
569
+
570
+ const lazyRatio = img.counts.total > 0 ? img.counts.lazyLoaded / img.counts.total : 1;
571
+ if (lazyRatio >= 0.5) score += 2;
572
+ if (img.counts.total === 0 || coverage >= 80) score += 2;
573
+ scores.images = { score: Math.min(score, max), max };
574
+ }
575
+ }
576
+
577
+ {
578
+ let score = 0; const max = SCORE_WEIGHTS.mobile.max;
579
+ if (s.technical?.hasViewport) score += 3;
580
+ const vp = s.basic?.viewport || '';
581
+ if (vp.includes('width=device-width')) score += 2;
582
+ scores.mobile = { score: Math.min(score, max), max };
583
+ }
584
+
585
+ {
586
+ let score = 0; const max = SCORE_WEIGHTS.performance.max;
587
+ const p = s.performance;
588
+ if (p) {
589
+ if (p.htmlSizeKB < 100) score += 2;
590
+ else if (p.htmlSizeKB < 200) score += 1;
591
+ if (p.fetchTimeMs < 2000) score += 2;
592
+ else if (p.fetchTimeMs < 4000) score += 1;
593
+ if (!p.renderBlockingRisk) score += 2;
594
+ else if (p.deferScripts + p.asyncScripts > 0) score += 1;
595
+ if (p.totalImages === 0 || (s.images?.counts?.lazyLoaded || 0) / Math.max(p.totalImages, 1) > 0.3) score += 1;
596
+ }
597
+ scores.performance = { score: Math.min(score, max), max };
598
+ }
599
+
600
+ {
601
+ let score = 0; const max = SCORE_WEIGHTS.links.max;
602
+ const lk = s.links;
603
+ if (lk && !lk.skipped) {
604
+ if (lk.counts.internal > 0) score += 2;
605
+ if (lk.counts.external > 0) score += 1;
606
+ if (lk.counts.emptyText === 0) score += 2;
607
+ else if (lk.counts.emptyText < 5) score += 1;
608
+ } else {
609
+ score = max;
610
+ }
611
+ scores.links = { score: Math.min(score, max), max };
612
+ }
613
+
614
+ {
615
+ let score = 0; const max = SCORE_WEIGHTS.security.max;
616
+ const tech = s.technical;
617
+ if (tech) {
618
+ if (tech.https) score += 2;
619
+ if (tech.hsts) score += 1;
620
+ if (tech.xFrameOptions || tech.csp) score += 1;
621
+ if (tech.xContentTypeOptions) score += 1;
622
+ }
623
+ scores.security = { score: Math.min(score, max), max };
624
+ }
625
+
626
+ {
627
+ let score = 0; const max = SCORE_WEIGHTS.content.max;
628
+ const c = s.content;
629
+ if (c) {
630
+ if (c.wordCount >= 300) score += 2;
631
+ else if (c.wordCount >= 100) score += 1;
632
+ if (c.paragraphCount >= 3) score += 1;
633
+ if (c.htmlLang) score += 1;
634
+ }
635
+ scores.content = { score: Math.min(score, max), max };
636
+ }
637
+
638
+ const totalScore = Object.values(scores).reduce((sum, v) => sum + v.score, 0);
639
+ const totalMax = Object.values(scores).reduce((sum, v) => sum + v.max, 0);
640
+ const overallPercent = Math.round((totalScore / totalMax) * 100);
641
+
642
+ this.metadata.scores = scores;
643
+ this.metadata.overallScore = overallPercent;
644
+ this.metadata.grade = this.computeGrade(overallPercent);
645
+ this.metadata.summary = this.computeSummary(overallPercent, scores);
646
+ }
647
+
648
+ computeGrade(score) {
649
+ if (score >= 90) return 'A+';
650
+ if (score >= 85) return 'A';
651
+ if (score >= 80) return 'A-';
652
+ if (score >= 75) return 'B+';
653
+ if (score >= 70) return 'B';
654
+ if (score >= 65) return 'B-';
655
+ if (score >= 60) return 'C+';
656
+ if (score >= 55) return 'C';
657
+ if (score >= 50) return 'C-';
658
+ if (score >= 40) return 'D';
659
+ return 'F';
660
+ }
661
+
662
+ computeSummary(score, scores) {
663
+ const weakest = Object.entries(scores)
664
+ .map(([k, v]) => ({ key: k, pct: v.score / v.max }))
665
+ .sort((a, b) => a.pct - b.pct)
666
+ .slice(0, 2)
667
+ .map(v => SCORE_WEIGHTS[v.key]?.label || v.key);
668
+
669
+ if (score >= 85) return `Excellent SEO foundation detected. Minor refinements in ${weakest.join(' and ')} would push this to peak performance.`;
670
+ if (score >= 70) return `Solid SEO signals present. Primary opportunities in ${weakest.join(' and ')} \u2014 addressing these could meaningfully boost rankings.`;
671
+ if (score >= 55) return `Moderate SEO health. Significant gaps in ${weakest.join(' and ')} are likely suppressing organic visibility.`;
672
+ if (score >= 40) return `Below-average SEO configuration. Core issues in ${weakest.join(' and ')} suggest this page is leaving substantial traffic on the table.`;
673
+ return `Critical SEO deficiencies detected. Foundational work needed across multiple categories before this page can effectively compete in search.`;
674
+ }
675
+
676
+ generateRecommendations() {
677
+ const s = this.metadata.seo;
678
+ const recs = [];
679
+
680
+ const add = (priority, category, issue, recommendation, impact, effort) => {
681
+ recs.push({ priority, category, issue, recommendation, impact, effort });
682
+ };
683
+
684
+ const title = s.basic?.title;
685
+ if (!title) {
686
+ add('CRITICAL', 'Title Tag', 'Page is missing a <title> tag.', 'Add a descriptive title tag (50\u201360 characters) containing your primary keyword near the beginning. Search engines use the title as the primary ranking signal and click-through label in SERPs.', 'Very High', 'Low');
687
+ } else if (title.length < 30) {
688
+ add('HIGH', 'Title Tag', `Title is too short (${title.length} chars): "${title}"`, 'Expand your title to 50\u201360 characters. Short titles fail to capture keyword opportunities and may appear truncated or thin to search engines.', 'High', 'Low');
689
+ } else if (title.length > 60) {
690
+ add('MEDIUM', 'Title Tag', `Title may be truncated in SERPs (${title.length} chars).`, 'Trim your title to under 60 characters. Google typically truncates display titles beyond this threshold, reducing click-through rates.', 'Medium', 'Low');
691
+ }
692
+
693
+ const desc = s.basic?.metaDescription;
694
+ if (!desc) {
695
+ add('HIGH', 'Meta Description', 'No meta description found.', 'Write a compelling meta description of 120\u2013160 characters that summarizes the page and includes a call-to-action. While not a direct ranking factor, descriptions strongly influence click-through rates.', 'High', 'Low');
696
+ } else if (desc.length < 120) {
697
+ add('MEDIUM', 'Meta Description', `Description is too short (${desc.length} chars).`, 'Expand your meta description to at least 120 characters. Longer descriptions provide more context to searchers and reduce the chance Google auto-generates a less compelling snippet.', 'Medium', 'Low');
698
+ } else if (desc.length > 160) {
699
+ add('LOW', 'Meta Description', `Description may be truncated (${desc.length} chars).`, 'Consider trimming your description to 160 characters to ensure the full text displays in desktop SERPs.', 'Low', 'Low');
700
+ }
701
+
702
+ const h1count = s.headings?.counts?.h1 || 0;
703
+ if (h1count === 0) {
704
+ add('CRITICAL', 'Heading Structure', 'No H1 tag found on the page.', 'Add exactly one H1 tag that clearly describes the page topic and includes your primary keyword. The H1 is the strongest on-page heading signal search engines use to understand page context.', 'Very High', 'Low');
705
+ } else if (h1count > 1) {
706
+ add('MEDIUM', 'Heading Structure', `Multiple H1 tags detected (${h1count} found).`, 'Reduce to a single H1 per page. Multiple H1s dilute topical focus and can confuse search engine interpretation of the primary page topic.', 'Medium', 'Low');
707
+ }
708
+
709
+ if (s.headings?.hierarchyIssues?.length > 0) {
710
+ add('MEDIUM', 'Heading Structure', `${s.headings.hierarchyIssues.length} heading hierarchy jump(s) detected.`, `Fix heading order to follow a logical H1 \u2192 H2 \u2192 H3 sequence without skipping levels. Issues: ${s.headings.hierarchyIssues.slice(0, 2).join('; ')}`, 'Medium', 'Medium');
711
+ }
712
+
713
+ if (!s.basic?.canonical) {
714
+ add('HIGH', 'Canonical URL', 'No canonical tag found.', 'Add a <link rel="canonical"> tag pointing to the preferred version of this URL. This prevents duplicate content penalties from URL parameter variants, trailing slashes, and protocol differences.', 'High', 'Low');
715
+ } else if (s.basic.canonicalMatchesFinal === false) {
716
+ add('HIGH', 'Canonical URL', 'Canonical URL does not match the final URL after redirects.', 'Update your canonical tag to point to the resolved final URL. A mismatched canonical can signal conflicting authority signals to search engines.', 'High', 'Low');
717
+ }
718
+
719
+ const og = s.openGraph || {};
720
+ const missingOG = ['title', 'description', 'image', 'type'].filter(k => !og[k]);
721
+ if (missingOG.length === 4) {
722
+ add('HIGH', 'Open Graph', 'No Open Graph tags found.', 'Implement Open Graph meta tags (og:title, og:description, og:image, og:type). These directly control how your content appears when shared on Facebook, LinkedIn, Slack, and other platforms \u2014 significantly affecting social click-through rates.', 'High', 'Medium');
723
+ } else if (missingOG.length > 0) {
724
+ add('MEDIUM', 'Open Graph', `Missing OG properties: ${missingOG.join(', ')}.`, `Add the missing og:${missingOG[0]} tag${missingOG.length > 1 ? ' (and others)' : ''}. Incomplete Open Graph data often results in poor social sharing previews.`, 'Medium', 'Low');
725
+ }
726
+
727
+ const tw = s.twitter || {};
728
+ if (!tw.card) {
729
+ add('MEDIUM', 'Twitter Card', 'No Twitter Card meta tags found.', 'Add twitter:card, twitter:title, twitter:description, and twitter:image tags. These control your content\'s appearance in Twitter/X shares and can significantly increase engagement from social traffic.', 'Medium', 'Low');
730
+ }
731
+
732
+ if (!s.schema?.jsonLd?.length && !Object.keys(s.schema?.microdata || {}).length) {
733
+ add('HIGH', 'Structured Data', 'No structured data (schema.org) found.', 'Implement JSON-LD structured data appropriate to your content type (WebPage, Article, Product, Organization, LocalBusiness, BreadcrumbList). Structured data enables rich results in SERPs and is increasingly used by AI search summaries.', 'Very High', 'Medium');
734
+ }
735
+
736
+ const img = s.images;
737
+ if (img && !img.skipped) {
738
+ if (img.counts.withoutAlt > 0) {
739
+ const pct = 100 - img.counts.altCoverage;
740
+ const priority = pct > 50 ? 'HIGH' : 'MEDIUM';
741
+ add(priority, 'Image Optimization', `${img.counts.withoutAlt} image(s) missing alt text (${pct}% uncovered).`, 'Add descriptive alt attributes to all meaningful images. Alt text is critical for accessibility, image search visibility, and provides contextual signals to search engines about image content.', 'High', 'Medium');
742
+ }
743
+ if (img.counts.lazyLoaded === 0 && img.counts.total > 2) {
744
+ add('MEDIUM', 'Image Optimization', 'No lazy-loading detected on images.', 'Add loading="lazy" to all below-the-fold images. Lazy loading reduces initial page weight and improves Core Web Vitals scores (LCP, FID), which are Google ranking factors.', 'Medium', 'Low');
745
+ }
746
+ }
747
+
748
+ if (!s.technical?.hasViewport) {
749
+ add('CRITICAL', 'Mobile Readiness', 'No viewport meta tag found.', 'Add <meta name="viewport" content="width=device-width, initial-scale=1"> to the <head>. Without this, Google\'s mobile-first indexing will penalize this page and mobile users will see a broken zoomed-out layout.', 'Very High', 'Low');
750
+ }
751
+
752
+ const perf = s.performance;
753
+ if (perf?.renderBlockingRisk) {
754
+ add('MEDIUM', 'Performance', `${perf.headScripts} render-blocking script(s) in <head>.`, 'Add defer or async attributes to non-critical scripts in <head>. Render-blocking scripts delay First Contentful Paint (FCP) and increase Time to Interactive (TTI), both measured by Core Web Vitals.', 'Medium', 'Low');
755
+ }
756
+ if (perf?.htmlSizeKB > 200) {
757
+ add('MEDIUM', 'Performance', `HTML size is large (${perf.htmlSizeKB}KB).`, 'Investigate opportunities to reduce HTML payload: remove unused markup, minimize inline scripts/styles, and consider server-side rendering optimizations. Large HTML documents increase parse time and bandwidth consumption.', 'Medium', 'High');
758
+ }
759
+ if (perf?.fetchTimeMs > 3000) {
760
+ add('HIGH', 'Performance', `Server response time is slow (${perf.fetchTimeMs}ms TTFB).`, 'Investigate server-side performance: enable caching, use a CDN, optimize database queries, or upgrade hosting. Google\'s recommended TTFB threshold is under 800ms for a good user experience.', 'High', 'High');
761
+ }
762
+
763
+ if (!s.technical?.https) {
764
+ add('CRITICAL', 'Security', 'Page is served over HTTP (not HTTPS).', 'Migrate to HTTPS immediately. Google has used HTTPS as a ranking signal since 2014, and browsers now display "Not Secure" warnings on HTTP pages, drastically reducing user trust and conversions.', 'Very High', 'High');
765
+ }
766
+ if (!s.technical?.hsts && s.technical?.https) {
767
+ add('LOW', 'Security', 'HSTS header not detected.', 'Enable HTTP Strict Transport Security (HSTS) on your server. HSTS prevents protocol downgrade attacks and ensures all connections use HTTPS, which signals security maturity to search engines and users.', 'Low', 'Medium');
768
+ }
769
+
770
+ if (!s.technical?.hasFavicon) {
771
+ add('LOW', 'Technical', 'No favicon detected.', 'Add a favicon (<link rel="icon">). While not a ranking factor, favicons appear in browser tabs and bookmarks, reinforcing brand recognition and user trust during search result navigation.', 'Low', 'Low');
772
+ }
773
+
774
+ if (s.links?.counts?.emptyText > 3) {
775
+ add('MEDIUM', 'Link Structure', `${s.links.counts.emptyText} links have no anchor text.`, 'Add descriptive anchor text to all links. Search engines use anchor text to understand the context and relevance of linked pages \u2014 empty anchors waste this signal entirely.', 'Medium', 'Medium');
776
+ }
777
+
778
+ if (s.content?.wordCount < 300 && s.content?.wordCount > 0) {
779
+ add('MEDIUM', 'Content Quality', `Page has low word count (${s.content.wordCount} words).`, 'Expand page content to at least 300 words. Thin content is a known quality signal issue in Google\'s algorithms. More comprehensive content tends to rank for broader keyword sets.', 'High', 'High');
780
+ }
781
+
782
+ const priorityOrder = { CRITICAL: 0, HIGH: 1, MEDIUM: 2, LOW: 3, GOOD: 4 };
783
+ recs.sort((a, b) => (priorityOrder[a.priority] ?? 9) - (priorityOrder[b.priority] ?? 9));
784
+
785
+ this.metadata.recommendations = recs;
786
+ }
787
+
788
+ async analyze() {
789
+ const steps = [
790
+ ['Basic Metadata', () => this.extractBasicMetadata()],
791
+ ['Open Graph', () => this.extractOpenGraphMetadata()],
792
+ ['Twitter Card', () => this.extractTwitterMetadata()],
793
+ ['Schema Markup', () => this.extractSchemaMetadata()],
794
+ ['Technical SEO', () => this.extractTechnicalMetadata()],
795
+ ['Link Analysis', () => this.extractLinks()],
796
+ ['Heading Structure', () => this.extractHeadings()],
797
+ ['Image Analysis', () => this.extractImages()],
798
+ ['Content Analysis', () => this.extractContentMetadata()],
799
+ ['Performance Metrics', () => this.extractPerformanceMetrics()],
800
+ ['Scoring Engine', () => this.computeScores()],
801
+ ['Recommendations', () => this.generateRecommendations()],
802
+ ];
803
+
804
+ for (const [name, fn] of steps) {
805
+ try {
806
+ fn();
807
+ log.ok(name);
808
+ } catch (e) {
809
+ log.warn(`${name}: ${e.message}`);
810
+ }
811
+ }
812
+ }
813
+ }
814
+
815
+ module.exports = SEOScraper;