recker 1.0.72 → 1.0.75-next.2e5a94f
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -18
- package/dist/browser/core/client.d.ts +14 -8
- package/dist/browser/core/client.js +199 -17
- package/dist/browser/core/errors.d.ts +15 -1
- package/dist/browser/core/errors.js +140 -9
- package/dist/browser/core/request.d.ts +5 -0
- package/dist/browser/core/request.js +33 -2
- package/dist/browser/core-runtime/plugin-manifest.d.ts +24 -0
- package/dist/browser/core-runtime/plugin-manifest.js +159 -0
- package/dist/browser/core-runtime/request-context.d.ts +13 -0
- package/dist/browser/core-runtime/request-context.js +24 -0
- package/dist/browser/core-runtime/typed-events.d.ts +89 -0
- package/dist/browser/core-runtime/typed-events.js +34 -0
- package/dist/browser/index.iife.min.js +79 -79
- package/dist/browser/index.min.js +79 -79
- package/dist/browser/index.mini.iife.js +913 -97
- package/dist/browser/index.mini.iife.min.js +46 -46
- package/dist/browser/index.mini.min.js +46 -46
- package/dist/browser/index.mini.umd.js +913 -97
- package/dist/browser/index.mini.umd.min.js +46 -46
- package/dist/browser/index.umd.min.js +79 -79
- package/dist/browser/plugins/auth/aws-sigv4.d.ts +1 -0
- package/dist/browser/plugins/auth/aws-sigv4.js +19 -2
- package/dist/browser/plugins/retry.js +29 -1
- package/dist/browser/presets/aws.d.ts +1 -0
- package/dist/browser/presets/aws.js +62 -1
- package/dist/browser/runner/request-runner.d.ts +15 -5
- package/dist/browser/runner/request-runner.js +164 -30
- package/dist/browser/scrape/parser/nodes/html.d.ts +6 -0
- package/dist/browser/scrape/parser/nodes/html.js +70 -18
- package/dist/browser/scrape/parser/nodes/node.d.ts +1 -0
- package/dist/browser/scrape/parser/nodes/node.js +5 -0
- package/dist/browser/scrape/spider.d.ts +1 -0
- package/dist/browser/scrape/spider.js +39 -26
- package/dist/browser/seo/analyzer.d.ts +1 -1
- package/dist/browser/seo/analyzer.js +73 -42
- package/dist/browser/seo/index.d.ts +1 -1
- package/dist/browser/seo/rules/types.d.ts +2 -0
- package/dist/browser/seo/seo-spider.d.ts +2 -3
- package/dist/browser/seo/seo-spider.js +26 -202
- package/dist/browser/seo/types.d.ts +4 -0
- package/dist/browser/seo/validators/sitemap.js +9 -2
- package/dist/browser/transport/fetch.js +38 -5
- package/dist/browser/transport/undici.js +73 -11
- package/dist/browser/transport/worker.d.ts +0 -1
- package/dist/browser/transport/worker.js +1 -3
- package/dist/browser/types/index.d.ts +24 -0
- package/dist/cli/commands/mcp.js +5 -3
- package/dist/core/client.d.ts +14 -8
- package/dist/core/client.js +199 -17
- package/dist/core/errors.d.ts +15 -1
- package/dist/core/errors.js +140 -9
- package/dist/core/request.d.ts +5 -0
- package/dist/core/request.js +33 -2
- package/dist/core-runtime/plugin-manifest.d.ts +24 -0
- package/dist/core-runtime/plugin-manifest.js +159 -0
- package/dist/core-runtime/request-context.d.ts +13 -0
- package/dist/core-runtime/request-context.js +24 -0
- package/dist/core-runtime/typed-events.d.ts +89 -0
- package/dist/core-runtime/typed-events.js +34 -0
- package/dist/index.d.ts +2 -1
- package/dist/index.js +2 -1
- package/dist/mcp/cli.js +10 -8
- package/dist/mcp/profiles.d.ts +1 -1
- package/dist/mcp/profiles.js +31 -6
- package/dist/mcp/tools/categories.js +0 -1
- package/dist/mcp/tools/seo.js +320 -4
- package/dist/plugins/auth/aws-sigv4.d.ts +1 -0
- package/dist/plugins/auth/aws-sigv4.js +19 -2
- package/dist/plugins/retry.js +29 -1
- package/dist/presets/aws.d.ts +1 -0
- package/dist/presets/aws.js +62 -1
- package/dist/recker.d.ts +3 -0
- package/dist/recker.js +5 -0
- package/dist/runner/request-runner.d.ts +15 -5
- package/dist/runner/request-runner.js +164 -30
- package/dist/scrape/parser/nodes/html.d.ts +6 -0
- package/dist/scrape/parser/nodes/html.js +70 -18
- package/dist/scrape/parser/nodes/node.d.ts +1 -0
- package/dist/scrape/parser/nodes/node.js +5 -0
- package/dist/scrape/spider.d.ts +1 -0
- package/dist/scrape/spider.js +39 -26
- package/dist/search/google.d.ts +67 -0
- package/dist/search/google.js +480 -0
- package/dist/search/index.d.ts +3 -0
- package/dist/search/index.js +1 -0
- package/dist/seo/analyzer.d.ts +1 -1
- package/dist/seo/analyzer.js +73 -42
- package/dist/seo/index.d.ts +1 -1
- package/dist/seo/rules/types.d.ts +2 -0
- package/dist/seo/seo-spider.d.ts +2 -3
- package/dist/seo/seo-spider.js +26 -202
- package/dist/seo/types.d.ts +4 -0
- package/dist/seo/validators/sitemap.js +9 -2
- package/dist/transport/fetch.js +38 -5
- package/dist/transport/undici.js +73 -11
- package/dist/transport/worker.d.ts +0 -1
- package/dist/transport/worker.js +1 -3
- package/dist/types/index.d.ts +24 -0
- package/dist/version.js +1 -1
- package/package.json +9 -1
|
@@ -98,11 +98,31 @@ export default class HTMLElement extends Node {
|
|
|
98
98
|
voidTag;
|
|
99
99
|
_attrs;
|
|
100
100
|
_rawAttrs;
|
|
101
|
+
_queryCache;
|
|
101
102
|
_parseOptions;
|
|
102
103
|
rawTagName;
|
|
103
104
|
id;
|
|
104
105
|
classList;
|
|
105
106
|
nodeType = NodeType.ELEMENT_NODE;
|
|
107
|
+
get isSelectorCacheEnabled() {
|
|
108
|
+
return this._parseOptions?.selectorCache !== false;
|
|
109
|
+
}
|
|
110
|
+
getQueryCache() {
|
|
111
|
+
if (!this._queryCache) {
|
|
112
|
+
this._queryCache = new Map();
|
|
113
|
+
}
|
|
114
|
+
return this._queryCache;
|
|
115
|
+
}
|
|
116
|
+
clearQueryCache() {
|
|
117
|
+
this._queryCache = undefined;
|
|
118
|
+
}
|
|
119
|
+
invalidateSelectorCacheRecursively() {
|
|
120
|
+
let current = this;
|
|
121
|
+
while (current) {
|
|
122
|
+
current.clearQueryCache();
|
|
123
|
+
current = current.parentNode;
|
|
124
|
+
}
|
|
125
|
+
}
|
|
106
126
|
quoteAttribute(attr) {
|
|
107
127
|
if (attr == null) {
|
|
108
128
|
return 'null';
|
|
@@ -144,6 +164,7 @@ export default class HTMLElement extends Node {
|
|
|
144
164
|
this.childNodes = this.childNodes.filter((child) => {
|
|
145
165
|
return child !== node;
|
|
146
166
|
});
|
|
167
|
+
this.invalidateSelectorCacheRecursively();
|
|
147
168
|
return this;
|
|
148
169
|
}
|
|
149
170
|
exchangeChild(oldNode, newNode) {
|
|
@@ -154,6 +175,7 @@ export default class HTMLElement extends Node {
|
|
|
154
175
|
}
|
|
155
176
|
return child;
|
|
156
177
|
});
|
|
178
|
+
this.invalidateSelectorCacheRecursively();
|
|
157
179
|
return this;
|
|
158
180
|
}
|
|
159
181
|
get tagName() {
|
|
@@ -182,6 +204,7 @@ export default class HTMLElement extends Node {
|
|
|
182
204
|
set textContent(val) {
|
|
183
205
|
const content = [new TextNode(val, this)];
|
|
184
206
|
this.childNodes = content;
|
|
207
|
+
this.invalidateSelectorCacheRecursively();
|
|
185
208
|
}
|
|
186
209
|
get text() {
|
|
187
210
|
return decode(this.rawText);
|
|
@@ -249,6 +272,7 @@ export default class HTMLElement extends Node {
|
|
|
249
272
|
resetParent(nodes, this);
|
|
250
273
|
resetParent(this.childNodes, null);
|
|
251
274
|
this.childNodes = nodes;
|
|
275
|
+
this.invalidateSelectorCacheRecursively();
|
|
252
276
|
}
|
|
253
277
|
set_content(content, options = {}) {
|
|
254
278
|
if (content instanceof Node) {
|
|
@@ -264,6 +288,7 @@ export default class HTMLElement extends Node {
|
|
|
264
288
|
resetParent(this.childNodes, null);
|
|
265
289
|
resetParent(content, this);
|
|
266
290
|
this.childNodes = content;
|
|
291
|
+
this.invalidateSelectorCacheRecursively();
|
|
267
292
|
return this;
|
|
268
293
|
}
|
|
269
294
|
replaceWith(...nodes) {
|
|
@@ -293,6 +318,7 @@ export default class HTMLElement extends Node {
|
|
|
293
318
|
...resetParent(content, parent),
|
|
294
319
|
...parent.childNodes.slice(idx + 1),
|
|
295
320
|
];
|
|
321
|
+
parent.invalidateSelectorCacheRecursively();
|
|
296
322
|
return this;
|
|
297
323
|
}
|
|
298
324
|
get outerHTML() {
|
|
@@ -312,6 +338,7 @@ export default class HTMLElement extends Node {
|
|
|
312
338
|
}
|
|
313
339
|
}
|
|
314
340
|
}
|
|
341
|
+
this.invalidateSelectorCacheRecursively();
|
|
315
342
|
return this;
|
|
316
343
|
}
|
|
317
344
|
get structure() {
|
|
@@ -357,6 +384,7 @@ export default class HTMLElement extends Node {
|
|
|
357
384
|
this.childNodes[o++] = node;
|
|
358
385
|
});
|
|
359
386
|
this.childNodes.length = o;
|
|
387
|
+
this.invalidateSelectorCacheRecursively();
|
|
360
388
|
const attrs = Object.keys(this.rawAttributes)
|
|
361
389
|
.map((key) => {
|
|
362
390
|
const val = this.rawAttributes[key];
|
|
@@ -368,16 +396,49 @@ export default class HTMLElement extends Node {
|
|
|
368
396
|
return this;
|
|
369
397
|
}
|
|
370
398
|
querySelectorAll(selector) {
|
|
371
|
-
|
|
399
|
+
if (this.isSelectorCacheEnabled) {
|
|
400
|
+
const cached = this.getQueryCache().get(selector);
|
|
401
|
+
if (cached?.all) {
|
|
402
|
+
return cached.all.slice();
|
|
403
|
+
}
|
|
404
|
+
}
|
|
405
|
+
const nodes = selectAll(selector, this, {
|
|
372
406
|
xmlMode: false,
|
|
373
407
|
adapter: Matcher,
|
|
374
408
|
});
|
|
409
|
+
if (this.isSelectorCacheEnabled) {
|
|
410
|
+
const cacheEntry = this.getQueryCache().get(selector) || {};
|
|
411
|
+
cacheEntry.all = nodes;
|
|
412
|
+
if (cacheEntry.first === undefined) {
|
|
413
|
+
cacheEntry.first = nodes[0] || null;
|
|
414
|
+
}
|
|
415
|
+
this.getQueryCache().set(selector, cacheEntry);
|
|
416
|
+
}
|
|
417
|
+
return nodes;
|
|
375
418
|
}
|
|
376
419
|
querySelector(selector) {
|
|
377
|
-
|
|
420
|
+
if (this.isSelectorCacheEnabled) {
|
|
421
|
+
const cached = this.getQueryCache().get(selector);
|
|
422
|
+
if (cached?.first !== undefined) {
|
|
423
|
+
return cached.first || null;
|
|
424
|
+
}
|
|
425
|
+
if (cached?.all) {
|
|
426
|
+
const first = cached.all[0] || null;
|
|
427
|
+
cached.first = first;
|
|
428
|
+
this.getQueryCache().set(selector, cached);
|
|
429
|
+
return first;
|
|
430
|
+
}
|
|
431
|
+
}
|
|
432
|
+
const result = selectOne(selector, this, {
|
|
378
433
|
xmlMode: false,
|
|
379
434
|
adapter: Matcher,
|
|
380
435
|
});
|
|
436
|
+
if (this.isSelectorCacheEnabled) {
|
|
437
|
+
const cacheEntry = this.getQueryCache().get(selector) || {};
|
|
438
|
+
cacheEntry.first = result;
|
|
439
|
+
this.getQueryCache().set(selector, cacheEntry);
|
|
440
|
+
}
|
|
441
|
+
return result;
|
|
381
442
|
}
|
|
382
443
|
getElementsByTagName(tagName) {
|
|
383
444
|
const upperCasedTagName = tagName.toUpperCase();
|
|
@@ -440,22 +501,6 @@ export default class HTMLElement extends Node {
|
|
|
440
501
|
const mapChild = new Map();
|
|
441
502
|
let el = this;
|
|
442
503
|
let old = null;
|
|
443
|
-
function findOne(test, elems) {
|
|
444
|
-
let elem = null;
|
|
445
|
-
for (let i = 0, l = elems.length; i < l && !elem; i++) {
|
|
446
|
-
const el = elems[i];
|
|
447
|
-
if (test(el)) {
|
|
448
|
-
elem = el;
|
|
449
|
-
}
|
|
450
|
-
else {
|
|
451
|
-
const child = mapChild.get(el);
|
|
452
|
-
if (child) {
|
|
453
|
-
elem = findOne(test, [child]);
|
|
454
|
-
}
|
|
455
|
-
}
|
|
456
|
-
}
|
|
457
|
-
return elem;
|
|
458
|
-
}
|
|
459
504
|
while (el) {
|
|
460
505
|
if (old)
|
|
461
506
|
mapChild.set(el, old);
|
|
@@ -545,6 +590,7 @@ export default class HTMLElement extends Node {
|
|
|
545
590
|
if (key === 'id') {
|
|
546
591
|
this.id = '';
|
|
547
592
|
}
|
|
593
|
+
this.invalidateSelectorCacheRecursively();
|
|
548
594
|
return this;
|
|
549
595
|
}
|
|
550
596
|
hasAttribute(key) {
|
|
@@ -580,6 +626,7 @@ export default class HTMLElement extends Node {
|
|
|
580
626
|
if (key === 'id') {
|
|
581
627
|
this.id = value;
|
|
582
628
|
}
|
|
629
|
+
this.invalidateSelectorCacheRecursively();
|
|
583
630
|
return this;
|
|
584
631
|
}
|
|
585
632
|
setAttributes(attributes) {
|
|
@@ -597,6 +644,7 @@ export default class HTMLElement extends Node {
|
|
|
597
644
|
return `${name}=${this.quoteAttribute(String(val))}`;
|
|
598
645
|
})
|
|
599
646
|
.join(' ');
|
|
647
|
+
this.invalidateSelectorCacheRecursively();
|
|
600
648
|
return this;
|
|
601
649
|
}
|
|
602
650
|
insertAdjacentHTML(where, html) {
|
|
@@ -625,11 +673,13 @@ export default class HTMLElement extends Node {
|
|
|
625
673
|
const nodes = resolveInsertable(insertable, this._parseOptions);
|
|
626
674
|
resetParent(nodes, this);
|
|
627
675
|
this.childNodes.unshift(...nodes);
|
|
676
|
+
this.invalidateSelectorCacheRecursively();
|
|
628
677
|
}
|
|
629
678
|
append(...insertable) {
|
|
630
679
|
const nodes = resolveInsertable(insertable, this._parseOptions);
|
|
631
680
|
resetParent(nodes, this);
|
|
632
681
|
this.childNodes.push(...nodes);
|
|
682
|
+
this.invalidateSelectorCacheRecursively();
|
|
633
683
|
}
|
|
634
684
|
before(...insertable) {
|
|
635
685
|
if (!this.parentNode)
|
|
@@ -638,6 +688,7 @@ export default class HTMLElement extends Node {
|
|
|
638
688
|
const siblings = this.parentNode.childNodes;
|
|
639
689
|
resetParent(nodes, this.parentNode);
|
|
640
690
|
siblings.splice(siblings.indexOf(this), 0, ...nodes);
|
|
691
|
+
this.parentNode.invalidateSelectorCacheRecursively();
|
|
641
692
|
}
|
|
642
693
|
after(...insertable) {
|
|
643
694
|
if (!this.parentNode)
|
|
@@ -646,6 +697,7 @@ export default class HTMLElement extends Node {
|
|
|
646
697
|
const siblings = this.parentNode.childNodes;
|
|
647
698
|
resetParent(nodes, this.parentNode);
|
|
648
699
|
siblings.splice(siblings.indexOf(this) + 1, 0, ...nodes);
|
|
700
|
+
this.parentNode.invalidateSelectorCacheRecursively();
|
|
649
701
|
}
|
|
650
702
|
get nextSibling() {
|
|
651
703
|
if (this.parentNode) {
|
|
@@ -12,6 +12,7 @@ export default abstract class Node {
|
|
|
12
12
|
abstract clone(): Node;
|
|
13
13
|
constructor(parentNode?: HTMLElement | null, range?: [number, number]);
|
|
14
14
|
remove(): this;
|
|
15
|
+
invalidateSelectorCacheRecursively(): void;
|
|
15
16
|
get innerText(): string;
|
|
16
17
|
get textContent(): string;
|
|
17
18
|
set textContent(val: string);
|
|
@@ -11,6 +11,9 @@ export default class Node {
|
|
|
11
11
|
}
|
|
12
12
|
remove() {
|
|
13
13
|
if (this.parentNode) {
|
|
14
|
+
if (typeof this.parentNode.invalidateSelectorCacheRecursively === 'function') {
|
|
15
|
+
this.parentNode.invalidateSelectorCacheRecursively();
|
|
16
|
+
}
|
|
14
17
|
const children = this.parentNode.childNodes;
|
|
15
18
|
this.parentNode.childNodes = children.filter((child) => {
|
|
16
19
|
return this !== child;
|
|
@@ -19,6 +22,8 @@ export default class Node {
|
|
|
19
22
|
}
|
|
20
23
|
return this;
|
|
21
24
|
}
|
|
25
|
+
invalidateSelectorCacheRecursively() {
|
|
26
|
+
}
|
|
22
27
|
get innerText() {
|
|
23
28
|
return this.rawText;
|
|
24
29
|
}
|
|
@@ -123,6 +123,7 @@ export declare class Spider {
|
|
|
123
123
|
private robotsData;
|
|
124
124
|
private sitemapValidation;
|
|
125
125
|
private robotsValidation;
|
|
126
|
+
private toHeaderRecord;
|
|
126
127
|
constructor(options?: SpiderOptions);
|
|
127
128
|
crawl(startUrl: string): Promise<SpiderResult>;
|
|
128
129
|
private fetchRobotsTxt;
|
|
@@ -76,9 +76,6 @@ function shouldCrawl(url, baseHost, options) {
|
|
|
76
76
|
return false;
|
|
77
77
|
}
|
|
78
78
|
}
|
|
79
|
-
function sleep(ms) {
|
|
80
|
-
return new Promise(resolve => setTimeout(resolve, ms));
|
|
81
|
-
}
|
|
82
79
|
function parseExtractSelectors(selectors) {
|
|
83
80
|
const schema = {};
|
|
84
81
|
for (const sel of selectors) {
|
|
@@ -115,6 +112,13 @@ export class Spider {
|
|
|
115
112
|
robotsData = null;
|
|
116
113
|
sitemapValidation = null;
|
|
117
114
|
robotsValidation = null;
|
|
115
|
+
toHeaderRecord(headers) {
|
|
116
|
+
const headerRecord = {};
|
|
117
|
+
headers.forEach((value, key) => {
|
|
118
|
+
headerRecord[key] = value;
|
|
119
|
+
});
|
|
120
|
+
return headerRecord;
|
|
121
|
+
}
|
|
118
122
|
constructor(options = {}) {
|
|
119
123
|
let extractSchema;
|
|
120
124
|
if (options.extract) {
|
|
@@ -194,7 +198,7 @@ export class Spider {
|
|
|
194
198
|
await this.fetchSitemaps(baseUrl);
|
|
195
199
|
}
|
|
196
200
|
const pending = new Map();
|
|
197
|
-
const scheduleUrl = (item
|
|
201
|
+
const scheduleUrl = (item) => {
|
|
198
202
|
const normalized = normalizeUrl(item.url);
|
|
199
203
|
if (this.visited.has(normalized))
|
|
200
204
|
return;
|
|
@@ -230,7 +234,7 @@ export class Spider {
|
|
|
230
234
|
try {
|
|
231
235
|
const urlHost = new URL(sitemapUrl.loc).hostname;
|
|
232
236
|
if (urlHost === this.baseHost) {
|
|
233
|
-
scheduleUrl({ url: sitemapUrl.loc, depth: 1 }
|
|
237
|
+
scheduleUrl({ url: sitemapUrl.loc, depth: 1 });
|
|
234
238
|
}
|
|
235
239
|
}
|
|
236
240
|
catch {
|
|
@@ -303,7 +307,7 @@ export class Spider {
|
|
|
303
307
|
return {
|
|
304
308
|
status: response.status,
|
|
305
309
|
text: await response.text(),
|
|
306
|
-
headers:
|
|
310
|
+
headers: this.toHeaderRecord(response.headers),
|
|
307
311
|
};
|
|
308
312
|
};
|
|
309
313
|
try {
|
|
@@ -351,40 +355,49 @@ export class Spider {
|
|
|
351
355
|
}
|
|
352
356
|
buildSitemapAnalysis() {
|
|
353
357
|
const crawledUrls = new Set(this.results.map(r => normalizeUrl(r.url)));
|
|
354
|
-
const
|
|
358
|
+
const sitemapUrlSet = this.sitemapUrlSet.size > 0
|
|
359
|
+
? this.sitemapUrlSet
|
|
360
|
+
: new Set(this.sitemapUrls.map((u) => normalizeUrl(u.loc)));
|
|
361
|
+
const crawledFromSitemap = Array.from(sitemapUrlSet)
|
|
362
|
+
.filter(url => crawledUrls.has(url))
|
|
363
|
+
.length;
|
|
355
364
|
const linkedUrls = new Set();
|
|
356
|
-
|
|
357
|
-
for (const link of page.links) {
|
|
358
|
-
if (link.href) {
|
|
359
|
-
linkedUrls.add(normalizeUrl(link.href));
|
|
360
|
-
}
|
|
361
|
-
}
|
|
362
|
-
}
|
|
363
|
-
const orphanUrls = this.sitemapUrls
|
|
364
|
-
.filter(u => {
|
|
365
|
-
const normalized = normalizeUrl(u.loc);
|
|
366
|
-
return !linkedUrls.has(normalized) && crawledUrls.has(normalized);
|
|
367
|
-
})
|
|
368
|
-
.map(u => u.loc);
|
|
369
|
-
const missingFromSitemap = Array.from(crawledUrls)
|
|
370
|
-
.filter(url => !this.sitemapUrlSet.has(url));
|
|
371
|
-
const blockedBySitemapRobots = [];
|
|
365
|
+
const blockedBySitemapRobotsSet = new Set();
|
|
372
366
|
if (this.robotsData) {
|
|
373
367
|
for (const sitemapUrl of this.sitemapUrls) {
|
|
374
368
|
try {
|
|
369
|
+
const normalized = normalizeUrl(sitemapUrl.loc);
|
|
375
370
|
const urlPath = new URL(sitemapUrl.loc).pathname;
|
|
376
371
|
if (!isPathAllowed(this.robotsData, urlPath, this.options.userAgent)) {
|
|
377
|
-
|
|
372
|
+
blockedBySitemapRobotsSet.add(normalized);
|
|
378
373
|
}
|
|
379
374
|
}
|
|
380
375
|
catch {
|
|
381
376
|
}
|
|
382
377
|
}
|
|
383
378
|
}
|
|
379
|
+
for (const page of this.results) {
|
|
380
|
+
for (const link of page.links) {
|
|
381
|
+
if (link.href) {
|
|
382
|
+
linkedUrls.add(normalizeUrl(link.href));
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
const orphanUrlSet = new Set();
|
|
387
|
+
for (const u of this.sitemapUrls) {
|
|
388
|
+
const normalized = normalizeUrl(u.loc);
|
|
389
|
+
if (!linkedUrls.has(normalized) && !blockedBySitemapRobotsSet.has(normalized)) {
|
|
390
|
+
orphanUrlSet.add(normalized);
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
const orphanUrls = Array.from(orphanUrlSet);
|
|
394
|
+
const missingFromSitemap = Array.from(crawledUrls)
|
|
395
|
+
.filter(url => !sitemapUrlSet.has(url));
|
|
396
|
+
const blockedBySitemapRobots = Array.from(blockedBySitemapRobotsSet);
|
|
384
397
|
return {
|
|
385
398
|
found: this.sitemapUrls.length > 0,
|
|
386
|
-
url: this.
|
|
387
|
-
totalUrls:
|
|
399
|
+
url: this.sitemapUrls[0]?.loc,
|
|
400
|
+
totalUrls: sitemapUrlSet.size,
|
|
388
401
|
crawledFromSitemap,
|
|
389
402
|
orphanUrls,
|
|
390
403
|
missingFromSitemap,
|
|
@@ -12,6 +12,7 @@ export declare class SeoAnalyzer {
|
|
|
12
12
|
static fromHtml(html: string, options?: SeoAnalyzerFullOptions): Promise<SeoAnalyzer>;
|
|
13
13
|
analyze(): SeoReport;
|
|
14
14
|
private getMainBody;
|
|
15
|
+
private detectPageType;
|
|
15
16
|
private getVisibleText;
|
|
16
17
|
private buildRuleContext;
|
|
17
18
|
private analyzeUrlQuality;
|
|
@@ -32,7 +33,6 @@ export declare class SeoAnalyzer {
|
|
|
32
33
|
private analyzeAnalytics;
|
|
33
34
|
private analyzeFeeds;
|
|
34
35
|
private analyzeConversionElements;
|
|
35
|
-
private analyzeAdvancedImages;
|
|
36
36
|
private calculateTextHtmlRatio;
|
|
37
37
|
private convertToCheckResults;
|
|
38
38
|
private buildSummary;
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { parse } from '../scrape/parser/index.js';
|
|
2
2
|
import { extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractLinks, extractImages, } from '../scrape/extractors.js';
|
|
3
3
|
import { generateKeywordCloud } from './keywords.js';
|
|
4
|
-
import { createRulesEngine, SEO_THRESHOLDS, } from './rules/index.js';
|
|
4
|
+
import { createRulesEngine, SEO_THRESHOLDS, calculateWeightedScore, } from './rules/index.js';
|
|
5
5
|
export class SeoAnalyzer {
|
|
6
6
|
root;
|
|
7
7
|
options;
|
|
@@ -47,7 +47,9 @@ export class SeoAnalyzer {
|
|
|
47
47
|
const analytics = this.analyzeAnalytics();
|
|
48
48
|
const feeds = this.analyzeFeeds();
|
|
49
49
|
const conversion = this.analyzeConversionElements(links, visibleText);
|
|
50
|
+
const pageType = this.detectPageType(jsonLd);
|
|
50
51
|
const context = this.buildRuleContext({
|
|
52
|
+
pageType,
|
|
51
53
|
meta,
|
|
52
54
|
og,
|
|
53
55
|
twitter,
|
|
@@ -68,7 +70,7 @@ export class SeoAnalyzer {
|
|
|
68
70
|
});
|
|
69
71
|
const ruleResults = this.rulesEngine.evaluate(context);
|
|
70
72
|
const checks = this.convertToCheckResults(ruleResults);
|
|
71
|
-
const { score, grade } = this.calculateScore(
|
|
73
|
+
const { score, grade } = this.calculateScore(ruleResults);
|
|
72
74
|
const summary = this.buildSummary(ruleResults, checks, {
|
|
73
75
|
content,
|
|
74
76
|
imageAnalysis,
|
|
@@ -77,13 +79,17 @@ export class SeoAnalyzer {
|
|
|
77
79
|
og,
|
|
78
80
|
twitter,
|
|
79
81
|
technical,
|
|
82
|
+
pageType,
|
|
83
|
+
timings: this.options.timings,
|
|
80
84
|
});
|
|
81
85
|
return {
|
|
82
86
|
url,
|
|
83
87
|
timestamp: new Date(),
|
|
84
88
|
grade,
|
|
85
89
|
score,
|
|
90
|
+
timing: this.options.timings,
|
|
86
91
|
summary,
|
|
92
|
+
pageType,
|
|
87
93
|
checks,
|
|
88
94
|
title: meta.title
|
|
89
95
|
? { text: meta.title, length: meta.title.length }
|
|
@@ -134,6 +140,47 @@ export class SeoAnalyzer {
|
|
|
134
140
|
return bodies[0];
|
|
135
141
|
return bodies.reduce((prev, curr) => curr.text.length > prev.text.length ? curr : prev);
|
|
136
142
|
}
|
|
143
|
+
detectPageType(jsonLd) {
|
|
144
|
+
if (!this.options.baseUrl) {
|
|
145
|
+
return 'other';
|
|
146
|
+
}
|
|
147
|
+
try {
|
|
148
|
+
const parsed = new URL(this.options.baseUrl);
|
|
149
|
+
const pathname = parsed.pathname.toLowerCase();
|
|
150
|
+
const hasQueryKeyword = (value) => parsed.searchParams.has(value);
|
|
151
|
+
if (pathname === '/' || pathname === '') {
|
|
152
|
+
return 'homepage';
|
|
153
|
+
}
|
|
154
|
+
if (/(^|\/)(search|busca|s|results|query)\b/.test(pathname) ||
|
|
155
|
+
hasQueryKeyword('q') ||
|
|
156
|
+
hasQueryKeyword('query') ||
|
|
157
|
+
hasQueryKeyword('search')) {
|
|
158
|
+
return 'search';
|
|
159
|
+
}
|
|
160
|
+
const productSignals = ['product', 'produto', 'item', 'sku', 'shop'];
|
|
161
|
+
if (productSignals.some((segment) => pathname.includes(`/${segment}/`))) {
|
|
162
|
+
return 'product';
|
|
163
|
+
}
|
|
164
|
+
const articleSignals = ['article', 'post', 'blog', 'noticia', 'news'];
|
|
165
|
+
if (articleSignals.some((segment) => pathname.includes(`/${segment}/`)) ||
|
|
166
|
+
this.root.querySelectorAll('article').length > 0) {
|
|
167
|
+
return 'article';
|
|
168
|
+
}
|
|
169
|
+
if (/(^|\/)(categoria|category|tag|section|topic)\b/.test(pathname)) {
|
|
170
|
+
return 'category';
|
|
171
|
+
}
|
|
172
|
+
const hasProductJsonLd = jsonLd
|
|
173
|
+
.map((node) => node['@type'])
|
|
174
|
+
.some((type) => typeof type === 'string' && type.toLowerCase() === 'product');
|
|
175
|
+
if (hasProductJsonLd) {
|
|
176
|
+
return 'product';
|
|
177
|
+
}
|
|
178
|
+
return 'other';
|
|
179
|
+
}
|
|
180
|
+
catch {
|
|
181
|
+
return 'other';
|
|
182
|
+
}
|
|
183
|
+
}
|
|
137
184
|
getVisibleText() {
|
|
138
185
|
const body = this.getMainBody();
|
|
139
186
|
if (!body)
|
|
@@ -159,7 +206,7 @@ export class SeoAnalyzer {
|
|
|
159
206
|
return clone.text.replace(/\s+/g, ' ').trim();
|
|
160
207
|
}
|
|
161
208
|
buildRuleContext(data) {
|
|
162
|
-
const { meta, og, twitter, jsonLd, headings, content, linkAnalysis, imageAnalysis, links, keywords, resources, emailsFound, socialLinksFound, socialLinkDetails, analytics, feeds, conversion, } = data;
|
|
209
|
+
const { meta, og, twitter, jsonLd, headings, content, pageType, linkAnalysis, imageAnalysis, links, keywords, resources, emailsFound, socialLinksFound, socialLinkDetails, analytics, feeds, conversion, } = data;
|
|
163
210
|
const html = this.root.querySelector('html');
|
|
164
211
|
const htmlLang = html ? html.getAttribute('lang') : undefined;
|
|
165
212
|
const hreflangTags = [];
|
|
@@ -198,7 +245,6 @@ export class SeoAnalyzer {
|
|
|
198
245
|
const hasMixedContent = this.checkMixedContent();
|
|
199
246
|
const h1Elements = this.root.querySelectorAll('h1');
|
|
200
247
|
const h1Text = h1Elements.length > 0 ? h1Elements[0].text.trim() : '';
|
|
201
|
-
const iframeCount = this.root.querySelectorAll('iframe').length;
|
|
202
248
|
const topKeywords = keywords.topKeywords.slice(0, 5).map(k => k.word);
|
|
203
249
|
const mainKeyword = topKeywords.length > 0 ? topKeywords[0] : undefined;
|
|
204
250
|
const keywordsInTitle = topKeywords.some(kw => meta.title?.toLowerCase().includes(kw));
|
|
@@ -234,7 +280,6 @@ export class SeoAnalyzer {
|
|
|
234
280
|
const structuralHtml = this.analyzeStructuralHtml();
|
|
235
281
|
const breadcrumbs = this.analyzeBreadcrumbs(jsonLd.map((j) => j['@type']).filter(Boolean));
|
|
236
282
|
const multimedia = this.analyzeMultimedia();
|
|
237
|
-
const advancedImages = this.analyzeAdvancedImages();
|
|
238
283
|
const responsiveImages = this.analyzeResponsiveImages();
|
|
239
284
|
const inlineImages = this.analyzeInlineImages();
|
|
240
285
|
const trustSignals = this.analyzeTrustSignals(links);
|
|
@@ -245,6 +290,7 @@ export class SeoAnalyzer {
|
|
|
245
290
|
: 0;
|
|
246
291
|
const textHtmlRatio = this.calculateTextHtmlRatio(content.characterCount);
|
|
247
292
|
return {
|
|
293
|
+
pageType,
|
|
248
294
|
jsFilesCount: resources.jsFilesCount,
|
|
249
295
|
cssFilesCount: resources.cssFilesCount,
|
|
250
296
|
unminifiedResources: resources.unminifiedResources,
|
|
@@ -774,7 +820,6 @@ export class SeoAnalyzer {
|
|
|
774
820
|
}
|
|
775
821
|
analyzeAnalytics() {
|
|
776
822
|
const providers = [];
|
|
777
|
-
const html = this.root.innerHTML || '';
|
|
778
823
|
const scripts = this.root.querySelectorAll('script');
|
|
779
824
|
const scriptSources = [];
|
|
780
825
|
const scriptContents = [];
|
|
@@ -899,21 +944,6 @@ export class SeoAnalyzer {
|
|
|
899
944
|
hasPhoneOnPage,
|
|
900
945
|
};
|
|
901
946
|
}
|
|
902
|
-
analyzeAdvancedImages() {
|
|
903
|
-
let imagesWithSrcset = 0;
|
|
904
|
-
let largeBase64ImagesCount = 0;
|
|
905
|
-
const imgs = this.root.querySelectorAll('img');
|
|
906
|
-
imgs.forEach((img) => {
|
|
907
|
-
if (img.getAttribute('srcset') || (img.parentNode && img.parentNode.tagName === 'PICTURE')) {
|
|
908
|
-
imagesWithSrcset++;
|
|
909
|
-
}
|
|
910
|
-
const src = img.getAttribute('src') || '';
|
|
911
|
-
if (src.startsWith('data:image') && src.length > 5 * 1024) {
|
|
912
|
-
largeBase64ImagesCount++;
|
|
913
|
-
}
|
|
914
|
-
});
|
|
915
|
-
return { imagesWithSrcset, largeBase64ImagesCount };
|
|
916
|
-
}
|
|
917
947
|
calculateTextHtmlRatio(bodyTextLength) {
|
|
918
948
|
const htmlSize = this.root.innerHTML?.length;
|
|
919
949
|
if (htmlSize && htmlSize > 0) {
|
|
@@ -926,6 +956,7 @@ export class SeoAnalyzer {
|
|
|
926
956
|
id: r.id,
|
|
927
957
|
name: r.name,
|
|
928
958
|
category: r.category,
|
|
959
|
+
severity: r.severity,
|
|
929
960
|
status: r.status,
|
|
930
961
|
message: r.message,
|
|
931
962
|
value: r.value,
|
|
@@ -934,6 +965,8 @@ export class SeoAnalyzer {
|
|
|
934
965
|
}));
|
|
935
966
|
}
|
|
936
967
|
buildSummary(ruleResults, checks, data) {
|
|
968
|
+
const pageType = data.pageType;
|
|
969
|
+
const timings = data.timings;
|
|
937
970
|
const passed = checks.filter((c) => c.status === 'pass').length;
|
|
938
971
|
const warnings = checks.filter((c) => c.status === 'warn').length;
|
|
939
972
|
const errors = checks.filter((c) => c.status === 'fail').length;
|
|
@@ -954,21 +987,25 @@ export class SeoAnalyzer {
|
|
|
954
987
|
else if (result.status === 'fail')
|
|
955
988
|
issuesByCategory[cat].errors++;
|
|
956
989
|
}
|
|
957
|
-
const topIssues =
|
|
958
|
-
.filter((
|
|
990
|
+
const topIssues = checks
|
|
991
|
+
.filter((c) => c.status === 'fail' || c.status === 'warn')
|
|
959
992
|
.sort((a, b) => {
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
if (
|
|
963
|
-
return
|
|
964
|
-
|
|
993
|
+
const severityOrder = (status) => status === 'fail' ? 2 : 1;
|
|
994
|
+
const statusDiff = severityOrder(b.status) - severityOrder(a.status);
|
|
995
|
+
if (statusDiff !== 0)
|
|
996
|
+
return statusDiff;
|
|
997
|
+
const aSeverity = a.severity || (a.status === 'fail' ? 'error' : 'warning');
|
|
998
|
+
const bSeverity = b.severity || (b.status === 'fail' ? 'error' : 'warning');
|
|
999
|
+
if (aSeverity === bSeverity)
|
|
1000
|
+
return 0;
|
|
1001
|
+
return aSeverity === 'error' ? -1 : 1;
|
|
965
1002
|
})
|
|
966
1003
|
.slice(0, 5)
|
|
967
1004
|
.map((r) => ({
|
|
968
1005
|
name: r.name,
|
|
969
1006
|
message: r.message,
|
|
970
1007
|
category: r.category,
|
|
971
|
-
severity: (r.status === 'fail' ? 'error' : 'warning'),
|
|
1008
|
+
severity: (r.severity || (r.status === 'fail' ? 'error' : 'warning')),
|
|
972
1009
|
}));
|
|
973
1010
|
const quickWins = [];
|
|
974
1011
|
if (!data.meta.title)
|
|
@@ -993,8 +1030,8 @@ export class SeoAnalyzer {
|
|
|
993
1030
|
const vitals = {
|
|
994
1031
|
htmlSize,
|
|
995
1032
|
domElements,
|
|
996
|
-
ttfb:
|
|
997
|
-
totalTime:
|
|
1033
|
+
ttfb: timings?.ttfb,
|
|
1034
|
+
totalTime: timings?.total,
|
|
998
1035
|
wordCount: data.content.wordCount,
|
|
999
1036
|
totalWordCount: data.content.totalWordCount,
|
|
1000
1037
|
readingTime: data.content.readingTimeMinutes,
|
|
@@ -1017,6 +1054,7 @@ export class SeoAnalyzer {
|
|
|
1017
1054
|
infos,
|
|
1018
1055
|
passRate,
|
|
1019
1056
|
issuesByCategory,
|
|
1057
|
+
pageType: pageType,
|
|
1020
1058
|
topIssues,
|
|
1021
1059
|
quickWins: limitedQuickWins,
|
|
1022
1060
|
vitals,
|
|
@@ -1387,18 +1425,11 @@ export class SeoAnalyzer {
|
|
|
1387
1425
|
unminifiedResourceUrls: unminified
|
|
1388
1426
|
};
|
|
1389
1427
|
}
|
|
1390
|
-
calculateScore(
|
|
1391
|
-
|
|
1392
|
-
pass: 100,
|
|
1393
|
-
warn: 50,
|
|
1394
|
-
fail: 0,
|
|
1395
|
-
info: 100,
|
|
1396
|
-
};
|
|
1397
|
-
const scoringChecks = checks.filter((c) => c.status !== 'info');
|
|
1398
|
-
if (scoringChecks.length === 0)
|
|
1428
|
+
calculateScore(results) {
|
|
1429
|
+
if (results.length === 0)
|
|
1399
1430
|
return { score: 100, grade: 'A' };
|
|
1400
|
-
const
|
|
1401
|
-
const score =
|
|
1431
|
+
const { score: weightedScore } = calculateWeightedScore(results);
|
|
1432
|
+
const score = weightedScore;
|
|
1402
1433
|
let grade;
|
|
1403
1434
|
if (score >= 90)
|
|
1404
1435
|
grade = 'A';
|
|
@@ -6,7 +6,7 @@ export type { SeoSpiderOptions, SeoPageResult, SiteWideIssue, SeoSpiderResult, }
|
|
|
6
6
|
export { SeoRulesEngine, createRulesEngine, SEO_THRESHOLDS, ALL_SEO_RULES, } from './rules/index.js';
|
|
7
7
|
export { generateSeoFilename, resolveOutputPath, writeReport, formatReportForJson, } from './output.js';
|
|
8
8
|
export type { SeoOutputType, OutputOptions, WriteOptions } from './output.js';
|
|
9
|
-
export type { SeoReport, SeoCheckResult, SeoStatus, SeoTiming, HeadingAnalysis, HeadingInfo, ContentMetrics, LinkAnalysis, ImageAnalysis, SocialMetaAnalysis, TechnicalSeo, SeoAnalyzerOptions, } from './types.js';
|
|
9
|
+
export type { SeoReport, SeoCheckResult, SeoStatus, SeoPageType, SeoTiming, HeadingAnalysis, HeadingInfo, ContentMetrics, LinkAnalysis, ImageAnalysis, SocialMetaAnalysis, TechnicalSeo, SeoAnalyzerOptions, } from './types.js';
|
|
10
10
|
export type { SeoRule, RuleContext, RuleResult, RuleEvidence, RuleCategory, RuleSeverity, RulesEngineOptions, } from './rules/index.js';
|
|
11
11
|
export type { SeoAnalyzerFullOptions } from './analyzer.js';
|
|
12
12
|
export { parseRobotsTxt, validateRobotsTxt, isPathAllowed, fetchAndValidateRobotsTxt, } from './validators/robots.js';
|
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
import type { SeoStatus } from '../types.js';
|
|
2
2
|
import type { ExtractedLink } from '../../scrape/types.js';
|
|
3
|
+
import type { SeoPageType } from '../types.js';
|
|
3
4
|
export type RuleSeverity = 'error' | 'warning' | 'info';
|
|
4
5
|
export type RuleCategory = 'title' | 'meta' | 'og' | 'twitter' | 'headings' | 'images' | 'links' | 'content' | 'technical' | 'security' | 'mobile' | 'structured-data' | 'performance' | 'accessibility' | 'i18n' | 'ai-search' | 'resources' | 'crawlability' | 'canonicalization';
|
|
5
6
|
export interface RuleContext {
|
|
7
|
+
pageType?: SeoPageType;
|
|
6
8
|
keywordsInTitle?: boolean;
|
|
7
9
|
keywordsInDescription?: boolean;
|
|
8
10
|
keywordsInH1?: boolean;
|
|
@@ -66,19 +66,18 @@ export interface SeoSpiderResult extends Omit<SpiderResult, 'pages'> {
|
|
|
66
66
|
export declare class SeoSpider {
|
|
67
67
|
private spider;
|
|
68
68
|
private options;
|
|
69
|
-
private seoResults;
|
|
70
69
|
private seoPages;
|
|
71
70
|
private homeHtml;
|
|
71
|
+
private normalizeUrl;
|
|
72
|
+
private toHeaderRecord;
|
|
72
73
|
constructor(options?: SeoSpiderOptions);
|
|
73
74
|
private analyzePageDuringCrawl;
|
|
74
75
|
crawl(startUrl: string): Promise<SeoSpiderResult>;
|
|
75
76
|
private checkSiteFiles;
|
|
76
77
|
private validateManifest;
|
|
77
78
|
private validateSitemap;
|
|
78
|
-
private createReportFromPageData;
|
|
79
79
|
private detectSiteWideIssues;
|
|
80
80
|
private calculateSummary;
|
|
81
|
-
private scoreToGrade;
|
|
82
81
|
private saveReport;
|
|
83
82
|
abort(): void;
|
|
84
83
|
isRunning(): boolean;
|