recker 1.0.73 → 1.0.75-next.2e5a94f

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. package/README.md +5 -18
  2. package/dist/browser/core/client.d.ts +14 -8
  3. package/dist/browser/core/client.js +199 -17
  4. package/dist/browser/core/errors.d.ts +15 -1
  5. package/dist/browser/core/errors.js +140 -9
  6. package/dist/browser/core/request.d.ts +5 -0
  7. package/dist/browser/core/request.js +33 -2
  8. package/dist/browser/core-runtime/plugin-manifest.d.ts +24 -0
  9. package/dist/browser/core-runtime/plugin-manifest.js +159 -0
  10. package/dist/browser/core-runtime/request-context.d.ts +13 -0
  11. package/dist/browser/core-runtime/request-context.js +24 -0
  12. package/dist/browser/core-runtime/typed-events.d.ts +89 -0
  13. package/dist/browser/core-runtime/typed-events.js +34 -0
  14. package/dist/browser/index.iife.min.js +79 -79
  15. package/dist/browser/index.min.js +79 -79
  16. package/dist/browser/index.mini.iife.js +913 -97
  17. package/dist/browser/index.mini.iife.min.js +46 -46
  18. package/dist/browser/index.mini.min.js +46 -46
  19. package/dist/browser/index.mini.umd.js +913 -97
  20. package/dist/browser/index.mini.umd.min.js +46 -46
  21. package/dist/browser/index.umd.min.js +79 -79
  22. package/dist/browser/plugins/auth/aws-sigv4.d.ts +1 -0
  23. package/dist/browser/plugins/auth/aws-sigv4.js +19 -2
  24. package/dist/browser/plugins/retry.js +29 -1
  25. package/dist/browser/presets/aws.d.ts +1 -0
  26. package/dist/browser/presets/aws.js +62 -1
  27. package/dist/browser/runner/request-runner.d.ts +15 -5
  28. package/dist/browser/runner/request-runner.js +164 -30
  29. package/dist/browser/scrape/parser/nodes/html.d.ts +6 -0
  30. package/dist/browser/scrape/parser/nodes/html.js +70 -18
  31. package/dist/browser/scrape/parser/nodes/node.d.ts +1 -0
  32. package/dist/browser/scrape/parser/nodes/node.js +5 -0
  33. package/dist/browser/scrape/spider.d.ts +1 -0
  34. package/dist/browser/scrape/spider.js +39 -26
  35. package/dist/browser/seo/analyzer.d.ts +1 -1
  36. package/dist/browser/seo/analyzer.js +73 -42
  37. package/dist/browser/seo/index.d.ts +1 -1
  38. package/dist/browser/seo/rules/types.d.ts +2 -0
  39. package/dist/browser/seo/seo-spider.d.ts +2 -3
  40. package/dist/browser/seo/seo-spider.js +26 -202
  41. package/dist/browser/seo/types.d.ts +4 -0
  42. package/dist/browser/seo/validators/sitemap.js +9 -2
  43. package/dist/browser/transport/fetch.js +38 -5
  44. package/dist/browser/transport/undici.js +73 -11
  45. package/dist/browser/transport/worker.d.ts +0 -1
  46. package/dist/browser/transport/worker.js +1 -3
  47. package/dist/browser/types/index.d.ts +24 -0
  48. package/dist/cli/commands/mcp.js +5 -3
  49. package/dist/core/client.d.ts +14 -8
  50. package/dist/core/client.js +199 -17
  51. package/dist/core/errors.d.ts +15 -1
  52. package/dist/core/errors.js +140 -9
  53. package/dist/core/request.d.ts +5 -0
  54. package/dist/core/request.js +33 -2
  55. package/dist/core-runtime/plugin-manifest.d.ts +24 -0
  56. package/dist/core-runtime/plugin-manifest.js +159 -0
  57. package/dist/core-runtime/request-context.d.ts +13 -0
  58. package/dist/core-runtime/request-context.js +24 -0
  59. package/dist/core-runtime/typed-events.d.ts +89 -0
  60. package/dist/core-runtime/typed-events.js +34 -0
  61. package/dist/index.d.ts +2 -1
  62. package/dist/index.js +2 -1
  63. package/dist/mcp/cli.js +10 -8
  64. package/dist/mcp/profiles.d.ts +1 -1
  65. package/dist/mcp/profiles.js +31 -6
  66. package/dist/mcp/tools/categories.js +0 -1
  67. package/dist/plugins/auth/aws-sigv4.d.ts +1 -0
  68. package/dist/plugins/auth/aws-sigv4.js +19 -2
  69. package/dist/plugins/retry.js +29 -1
  70. package/dist/presets/aws.d.ts +1 -0
  71. package/dist/presets/aws.js +62 -1
  72. package/dist/recker.d.ts +3 -0
  73. package/dist/recker.js +5 -0
  74. package/dist/runner/request-runner.d.ts +15 -5
  75. package/dist/runner/request-runner.js +164 -30
  76. package/dist/scrape/parser/nodes/html.d.ts +6 -0
  77. package/dist/scrape/parser/nodes/html.js +70 -18
  78. package/dist/scrape/parser/nodes/node.d.ts +1 -0
  79. package/dist/scrape/parser/nodes/node.js +5 -0
  80. package/dist/scrape/spider.d.ts +1 -0
  81. package/dist/scrape/spider.js +39 -26
  82. package/dist/search/google.d.ts +67 -0
  83. package/dist/search/google.js +480 -0
  84. package/dist/search/index.d.ts +3 -0
  85. package/dist/search/index.js +1 -0
  86. package/dist/seo/analyzer.d.ts +1 -1
  87. package/dist/seo/analyzer.js +73 -42
  88. package/dist/seo/index.d.ts +1 -1
  89. package/dist/seo/rules/types.d.ts +2 -0
  90. package/dist/seo/seo-spider.d.ts +2 -3
  91. package/dist/seo/seo-spider.js +26 -202
  92. package/dist/seo/types.d.ts +4 -0
  93. package/dist/seo/validators/sitemap.js +9 -2
  94. package/dist/transport/fetch.js +38 -5
  95. package/dist/transport/undici.js +73 -11
  96. package/dist/transport/worker.d.ts +0 -1
  97. package/dist/transport/worker.js +1 -3
  98. package/dist/types/index.d.ts +24 -0
  99. package/dist/version.js +1 -1
  100. package/package.json +9 -1
@@ -1,3 +1,4 @@
1
+ import { QueueCancelledError } from '../core/errors.js';
1
2
  class SimpleEmitter {
2
3
  listeners = new Map();
3
4
  on(event, listener) {
@@ -43,10 +44,14 @@ export class RequestRunner extends SimpleEmitter {
43
44
  queue = [];
44
45
  activeCount = 0;
45
46
  paused = false;
46
- results = new Map();
47
47
  stats = { total: 0, successful: 0, failed: 0 };
48
48
  startTime = 0;
49
49
  pendingRetries = 0;
50
+ isCancelled = false;
51
+ cancelReason = new QueueCancelledError('Request runner cancelled');
52
+ retryTimers = new Map();
53
+ timeoutId;
54
+ abortUnsubscribe;
50
55
  constructor(options = {}) {
51
56
  super();
52
57
  this.concurrency = options.concurrency || 5;
@@ -67,48 +72,146 @@ export class RequestRunner extends SimpleEmitter {
67
72
  this.processNext();
68
73
  }
69
74
  async run(items, processor, options = {}) {
75
+ this.queue = [];
76
+ this.activeCount = 0;
77
+ this.pendingRetries = 0;
78
+ this.stats = { total: 0, successful: 0, failed: 0 };
79
+ this.isCancelled = false;
80
+ if (this.retryTimers.size > 0) {
81
+ for (const [, timer] of this.retryTimers) {
82
+ clearTimeout(timer);
83
+ }
84
+ this.retryTimers.clear();
85
+ }
86
+ if (this.timeoutId) {
87
+ clearTimeout(this.timeoutId);
88
+ this.timeoutId = undefined;
89
+ }
90
+ if (this.abortUnsubscribe) {
91
+ this.abortUnsubscribe();
92
+ this.abortUnsubscribe = undefined;
93
+ }
70
94
  this.startTime = Date.now();
71
95
  this.stats = { total: items.length, successful: 0, failed: 0 };
72
- this.results.clear();
73
- const promises = items.map((item, index) => {
74
- return new Promise((resolve) => {
75
- this.add(() => processor(item, index), {
76
- priority: options.priority,
77
- id: String(index),
78
- retries: options.retries,
79
- resolve,
80
- trackTotal: false
96
+ this.isCancelled = false;
97
+ this.cancelReason = new QueueCancelledError('Request runner cancelled', {
98
+ queueName: 'request-runner',
99
+ request: undefined
100
+ });
101
+ if (options.signal) {
102
+ const signal = options.signal;
103
+ if (signal.aborted) {
104
+ const reason = signal.reason instanceof Error
105
+ ? signal.reason
106
+ : new QueueCancelledError('Request runner signal was aborted', {
107
+ queueName: 'request-runner',
108
+ request: undefined
109
+ });
110
+ this.cancelAll(reason);
111
+ }
112
+ else {
113
+ const handleAbort = () => {
114
+ const reason = signal.reason instanceof Error
115
+ ? signal.reason
116
+ : new QueueCancelledError('Request runner signal was aborted', {
117
+ queueName: 'request-runner',
118
+ request: undefined
119
+ });
120
+ this.cancelAll(reason);
121
+ };
122
+ signal.addEventListener('abort', handleAbort, { once: true });
123
+ this.abortUnsubscribe = () => signal.removeEventListener('abort', handleAbort);
124
+ }
125
+ }
126
+ if (options.deadlineMs !== undefined) {
127
+ const deadline = options.deadlineMs;
128
+ if (deadline <= 0) {
129
+ this.cancelAll(new QueueCancelledError('Request runner deadline elapsed', {
130
+ queueName: 'request-runner',
131
+ request: undefined
132
+ }));
133
+ }
134
+ else {
135
+ this.timeoutId = setTimeout(() => {
136
+ this.cancelAll(new QueueCancelledError('Request runner deadline exceeded', {
137
+ queueName: 'request-runner',
138
+ request: undefined
139
+ }));
140
+ }, deadline);
141
+ }
142
+ }
143
+ try {
144
+ const promises = items.map((item, index) => {
145
+ return new Promise((resolve) => {
146
+ this.add(() => processor(item, index), {
147
+ priority: options.priority,
148
+ id: String(index),
149
+ retries: options.retries,
150
+ resolve,
151
+ trackTotal: false
152
+ });
81
153
  });
82
154
  });
83
- });
84
- const results = await Promise.all(promises);
85
- return {
86
- results,
87
- stats: {
88
- ...this.stats,
89
- duration: Date.now() - this.startTime
155
+ const results = await Promise.all(promises);
156
+ return {
157
+ results,
158
+ stats: {
159
+ ...this.stats,
160
+ duration: Date.now() - this.startTime
161
+ }
162
+ };
163
+ }
164
+ finally {
165
+ if (this.timeoutId) {
166
+ clearTimeout(this.timeoutId);
167
+ this.timeoutId = undefined;
90
168
  }
91
- };
169
+ if (this.abortUnsubscribe) {
170
+ this.abortUnsubscribe();
171
+ this.abortUnsubscribe = undefined;
172
+ }
173
+ }
92
174
  }
93
175
  queueTask(task) {
94
176
  this.queue.push(task);
95
177
  this.queue.sort((a, b) => b.priority - a.priority);
96
178
  }
97
179
  scheduleRetry(task, delay) {
98
- this.pendingRetries++;
99
- if (delay > 0) {
100
- setTimeout(() => {
101
- this.pendingRetries--;
102
- this.queueTask(task);
103
- this.processNext();
104
- }, delay);
180
+ if (this.isCancelled) {
181
+ this.resolveTask(task, this.cancelReason);
105
182
  return;
106
183
  }
107
- this.pendingRetries--;
108
- this.queueTask(task);
109
- this.processNext();
184
+ if (delay <= 0) {
185
+ this.queueTask(task);
186
+ this.processNext();
187
+ return;
188
+ }
189
+ const timerKey = `${task.id}-${Date.now()}-${Math.random().toString(16).slice(2, 8)}`;
190
+ const enqueueTask = () => {
191
+ this.retryTimers.delete(timerKey);
192
+ this.pendingRetries--;
193
+ if (this.isCancelled) {
194
+ return;
195
+ }
196
+ this.queueTask(task);
197
+ this.processNext();
198
+ };
199
+ this.retryTimers.set(timerKey, setTimeout(() => {
200
+ if (this.isCancelled) {
201
+ this.pendingRetries--;
202
+ this.retryTimers.delete(timerKey);
203
+ this.resolveTask(task, this.cancelReason);
204
+ return;
205
+ }
206
+ enqueueTask();
207
+ }, delay));
208
+ this.pendingRetries++;
110
209
  }
111
210
  async processNext() {
211
+ if (this.isCancelled) {
212
+ this.resolveQueue();
213
+ return;
214
+ }
112
215
  if (this.paused || this.activeCount >= this.concurrency || this.queue.length === 0) {
113
216
  return;
114
217
  }
@@ -124,6 +227,10 @@ export class RequestRunner extends SimpleEmitter {
124
227
  this.emit('taskComplete', { task, result });
125
228
  }
126
229
  catch (error) {
230
+ if (this.isCancelled) {
231
+ this.resolveTask(task, this.cancelReason);
232
+ return;
233
+ }
127
234
  const remaining = task.retries ?? 0;
128
235
  if (remaining > 0) {
129
236
  task.retries = remaining - 1;
@@ -131,8 +238,7 @@ export class RequestRunner extends SimpleEmitter {
131
238
  this.scheduleRetry(task, this.retryDelay);
132
239
  }
133
240
  else {
134
- this.stats.failed++;
135
- task.resolve?.(error);
241
+ this.resolveTask(task, error);
136
242
  this.emit('taskError', { task, error });
137
243
  }
138
244
  }
@@ -145,6 +251,34 @@ export class RequestRunner extends SimpleEmitter {
145
251
  this.processNext();
146
252
  }
147
253
  }
254
+ cancelAll(reason) {
255
+ if (this.isCancelled) {
256
+ return;
257
+ }
258
+ this.isCancelled = true;
259
+ this.cancelReason = reason;
260
+ for (const [, timer] of this.retryTimers) {
261
+ clearTimeout(timer);
262
+ }
263
+ this.retryTimers.clear();
264
+ this.pendingRetries = 0;
265
+ this.resolveQueue();
266
+ }
267
+ resolveQueue() {
268
+ while (this.queue.length > 0) {
269
+ const task = this.queue.shift();
270
+ if (task) {
271
+ this.resolveTask(task, this.cancelReason);
272
+ }
273
+ }
274
+ }
275
+ resolveTask(task, error) {
276
+ if (!task.resolve) {
277
+ return;
278
+ }
279
+ this.stats.failed++;
280
+ task.resolve?.(error);
281
+ }
148
282
  getProgress() {
149
283
  const completed = this.stats.successful + this.stats.failed;
150
284
  return {
@@ -33,11 +33,16 @@ export default class HTMLElement extends Node {
33
33
  private voidTag;
34
34
  private _attrs?;
35
35
  private _rawAttrs?;
36
+ private _queryCache?;
36
37
  private _parseOptions;
37
38
  rawTagName: string;
38
39
  id: string;
39
40
  classList: DOMTokenList;
40
41
  nodeType: NodeType;
42
+ private get isSelectorCacheEnabled();
43
+ private getQueryCache;
44
+ private clearQueryCache;
45
+ invalidateSelectorCacheRecursively(): void;
41
46
  private quoteAttribute;
42
47
  constructor(tagName: string, keyAttrs: KeyAttributes, rawAttrs?: string, parentNode?: HTMLElement | null, range?: [number, number], voidTag?: VoidTag, _parseOptions?: Partial<Options>);
43
48
  removeChild(node: Node): this;
@@ -104,6 +109,7 @@ export interface Options {
104
109
  tags?: string[];
105
110
  closingSlash?: boolean;
106
111
  };
112
+ selectorCache?: boolean;
107
113
  }
108
114
  export declare function base_parse(data: string, options?: Partial<Options>): HTMLElement[];
109
115
  export declare function parse(data: string, options?: Partial<Options>): HTMLElement;
@@ -98,11 +98,31 @@ export default class HTMLElement extends Node {
98
98
  voidTag;
99
99
  _attrs;
100
100
  _rawAttrs;
101
+ _queryCache;
101
102
  _parseOptions;
102
103
  rawTagName;
103
104
  id;
104
105
  classList;
105
106
  nodeType = NodeType.ELEMENT_NODE;
107
+ get isSelectorCacheEnabled() {
108
+ return this._parseOptions?.selectorCache !== false;
109
+ }
110
+ getQueryCache() {
111
+ if (!this._queryCache) {
112
+ this._queryCache = new Map();
113
+ }
114
+ return this._queryCache;
115
+ }
116
+ clearQueryCache() {
117
+ this._queryCache = undefined;
118
+ }
119
+ invalidateSelectorCacheRecursively() {
120
+ let current = this;
121
+ while (current) {
122
+ current.clearQueryCache();
123
+ current = current.parentNode;
124
+ }
125
+ }
106
126
  quoteAttribute(attr) {
107
127
  if (attr == null) {
108
128
  return 'null';
@@ -144,6 +164,7 @@ export default class HTMLElement extends Node {
144
164
  this.childNodes = this.childNodes.filter((child) => {
145
165
  return child !== node;
146
166
  });
167
+ this.invalidateSelectorCacheRecursively();
147
168
  return this;
148
169
  }
149
170
  exchangeChild(oldNode, newNode) {
@@ -154,6 +175,7 @@ export default class HTMLElement extends Node {
154
175
  }
155
176
  return child;
156
177
  });
178
+ this.invalidateSelectorCacheRecursively();
157
179
  return this;
158
180
  }
159
181
  get tagName() {
@@ -182,6 +204,7 @@ export default class HTMLElement extends Node {
182
204
  set textContent(val) {
183
205
  const content = [new TextNode(val, this)];
184
206
  this.childNodes = content;
207
+ this.invalidateSelectorCacheRecursively();
185
208
  }
186
209
  get text() {
187
210
  return decode(this.rawText);
@@ -249,6 +272,7 @@ export default class HTMLElement extends Node {
249
272
  resetParent(nodes, this);
250
273
  resetParent(this.childNodes, null);
251
274
  this.childNodes = nodes;
275
+ this.invalidateSelectorCacheRecursively();
252
276
  }
253
277
  set_content(content, options = {}) {
254
278
  if (content instanceof Node) {
@@ -264,6 +288,7 @@ export default class HTMLElement extends Node {
264
288
  resetParent(this.childNodes, null);
265
289
  resetParent(content, this);
266
290
  this.childNodes = content;
291
+ this.invalidateSelectorCacheRecursively();
267
292
  return this;
268
293
  }
269
294
  replaceWith(...nodes) {
@@ -293,6 +318,7 @@ export default class HTMLElement extends Node {
293
318
  ...resetParent(content, parent),
294
319
  ...parent.childNodes.slice(idx + 1),
295
320
  ];
321
+ parent.invalidateSelectorCacheRecursively();
296
322
  return this;
297
323
  }
298
324
  get outerHTML() {
@@ -312,6 +338,7 @@ export default class HTMLElement extends Node {
312
338
  }
313
339
  }
314
340
  }
341
+ this.invalidateSelectorCacheRecursively();
315
342
  return this;
316
343
  }
317
344
  get structure() {
@@ -357,6 +384,7 @@ export default class HTMLElement extends Node {
357
384
  this.childNodes[o++] = node;
358
385
  });
359
386
  this.childNodes.length = o;
387
+ this.invalidateSelectorCacheRecursively();
360
388
  const attrs = Object.keys(this.rawAttributes)
361
389
  .map((key) => {
362
390
  const val = this.rawAttributes[key];
@@ -368,16 +396,49 @@ export default class HTMLElement extends Node {
368
396
  return this;
369
397
  }
370
398
  querySelectorAll(selector) {
371
- return selectAll(selector, this, {
399
+ if (this.isSelectorCacheEnabled) {
400
+ const cached = this.getQueryCache().get(selector);
401
+ if (cached?.all) {
402
+ return cached.all.slice();
403
+ }
404
+ }
405
+ const nodes = selectAll(selector, this, {
372
406
  xmlMode: false,
373
407
  adapter: Matcher,
374
408
  });
409
+ if (this.isSelectorCacheEnabled) {
410
+ const cacheEntry = this.getQueryCache().get(selector) || {};
411
+ cacheEntry.all = nodes;
412
+ if (cacheEntry.first === undefined) {
413
+ cacheEntry.first = nodes[0] || null;
414
+ }
415
+ this.getQueryCache().set(selector, cacheEntry);
416
+ }
417
+ return nodes;
375
418
  }
376
419
  querySelector(selector) {
377
- return selectOne(selector, this, {
420
+ if (this.isSelectorCacheEnabled) {
421
+ const cached = this.getQueryCache().get(selector);
422
+ if (cached?.first !== undefined) {
423
+ return cached.first || null;
424
+ }
425
+ if (cached?.all) {
426
+ const first = cached.all[0] || null;
427
+ cached.first = first;
428
+ this.getQueryCache().set(selector, cached);
429
+ return first;
430
+ }
431
+ }
432
+ const result = selectOne(selector, this, {
378
433
  xmlMode: false,
379
434
  adapter: Matcher,
380
435
  });
436
+ if (this.isSelectorCacheEnabled) {
437
+ const cacheEntry = this.getQueryCache().get(selector) || {};
438
+ cacheEntry.first = result;
439
+ this.getQueryCache().set(selector, cacheEntry);
440
+ }
441
+ return result;
381
442
  }
382
443
  getElementsByTagName(tagName) {
383
444
  const upperCasedTagName = tagName.toUpperCase();
@@ -440,22 +501,6 @@ export default class HTMLElement extends Node {
440
501
  const mapChild = new Map();
441
502
  let el = this;
442
503
  let old = null;
443
- function findOne(test, elems) {
444
- let elem = null;
445
- for (let i = 0, l = elems.length; i < l && !elem; i++) {
446
- const el = elems[i];
447
- if (test(el)) {
448
- elem = el;
449
- }
450
- else {
451
- const child = mapChild.get(el);
452
- if (child) {
453
- elem = findOne(test, [child]);
454
- }
455
- }
456
- }
457
- return elem;
458
- }
459
504
  while (el) {
460
505
  if (old)
461
506
  mapChild.set(el, old);
@@ -545,6 +590,7 @@ export default class HTMLElement extends Node {
545
590
  if (key === 'id') {
546
591
  this.id = '';
547
592
  }
593
+ this.invalidateSelectorCacheRecursively();
548
594
  return this;
549
595
  }
550
596
  hasAttribute(key) {
@@ -580,6 +626,7 @@ export default class HTMLElement extends Node {
580
626
  if (key === 'id') {
581
627
  this.id = value;
582
628
  }
629
+ this.invalidateSelectorCacheRecursively();
583
630
  return this;
584
631
  }
585
632
  setAttributes(attributes) {
@@ -597,6 +644,7 @@ export default class HTMLElement extends Node {
597
644
  return `${name}=${this.quoteAttribute(String(val))}`;
598
645
  })
599
646
  .join(' ');
647
+ this.invalidateSelectorCacheRecursively();
600
648
  return this;
601
649
  }
602
650
  insertAdjacentHTML(where, html) {
@@ -625,11 +673,13 @@ export default class HTMLElement extends Node {
625
673
  const nodes = resolveInsertable(insertable, this._parseOptions);
626
674
  resetParent(nodes, this);
627
675
  this.childNodes.unshift(...nodes);
676
+ this.invalidateSelectorCacheRecursively();
628
677
  }
629
678
  append(...insertable) {
630
679
  const nodes = resolveInsertable(insertable, this._parseOptions);
631
680
  resetParent(nodes, this);
632
681
  this.childNodes.push(...nodes);
682
+ this.invalidateSelectorCacheRecursively();
633
683
  }
634
684
  before(...insertable) {
635
685
  if (!this.parentNode)
@@ -638,6 +688,7 @@ export default class HTMLElement extends Node {
638
688
  const siblings = this.parentNode.childNodes;
639
689
  resetParent(nodes, this.parentNode);
640
690
  siblings.splice(siblings.indexOf(this), 0, ...nodes);
691
+ this.parentNode.invalidateSelectorCacheRecursively();
641
692
  }
642
693
  after(...insertable) {
643
694
  if (!this.parentNode)
@@ -646,6 +697,7 @@ export default class HTMLElement extends Node {
646
697
  const siblings = this.parentNode.childNodes;
647
698
  resetParent(nodes, this.parentNode);
648
699
  siblings.splice(siblings.indexOf(this) + 1, 0, ...nodes);
700
+ this.parentNode.invalidateSelectorCacheRecursively();
649
701
  }
650
702
  get nextSibling() {
651
703
  if (this.parentNode) {
@@ -12,6 +12,7 @@ export default abstract class Node {
12
12
  abstract clone(): Node;
13
13
  constructor(parentNode?: HTMLElement | null, range?: [number, number]);
14
14
  remove(): this;
15
+ invalidateSelectorCacheRecursively(): void;
15
16
  get innerText(): string;
16
17
  get textContent(): string;
17
18
  set textContent(val: string);
@@ -11,6 +11,9 @@ export default class Node {
11
11
  }
12
12
  remove() {
13
13
  if (this.parentNode) {
14
+ if (typeof this.parentNode.invalidateSelectorCacheRecursively === 'function') {
15
+ this.parentNode.invalidateSelectorCacheRecursively();
16
+ }
14
17
  const children = this.parentNode.childNodes;
15
18
  this.parentNode.childNodes = children.filter((child) => {
16
19
  return this !== child;
@@ -19,6 +22,8 @@ export default class Node {
19
22
  }
20
23
  return this;
21
24
  }
25
+ invalidateSelectorCacheRecursively() {
26
+ }
22
27
  get innerText() {
23
28
  return this.rawText;
24
29
  }
@@ -123,6 +123,7 @@ export declare class Spider {
123
123
  private robotsData;
124
124
  private sitemapValidation;
125
125
  private robotsValidation;
126
+ private toHeaderRecord;
126
127
  constructor(options?: SpiderOptions);
127
128
  crawl(startUrl: string): Promise<SpiderResult>;
128
129
  private fetchRobotsTxt;
@@ -76,9 +76,6 @@ function shouldCrawl(url, baseHost, options) {
76
76
  return false;
77
77
  }
78
78
  }
79
- function sleep(ms) {
80
- return new Promise(resolve => setTimeout(resolve, ms));
81
- }
82
79
  function parseExtractSelectors(selectors) {
83
80
  const schema = {};
84
81
  for (const sel of selectors) {
@@ -115,6 +112,13 @@ export class Spider {
115
112
  robotsData = null;
116
113
  sitemapValidation = null;
117
114
  robotsValidation = null;
115
+ toHeaderRecord(headers) {
116
+ const headerRecord = {};
117
+ headers.forEach((value, key) => {
118
+ headerRecord[key] = value;
119
+ });
120
+ return headerRecord;
121
+ }
118
122
  constructor(options = {}) {
119
123
  let extractSchema;
120
124
  if (options.extract) {
@@ -194,7 +198,7 @@ export class Spider {
194
198
  await this.fetchSitemaps(baseUrl);
195
199
  }
196
200
  const pending = new Map();
197
- const scheduleUrl = (item, fromSitemap = false) => {
201
+ const scheduleUrl = (item) => {
198
202
  const normalized = normalizeUrl(item.url);
199
203
  if (this.visited.has(normalized))
200
204
  return;
@@ -230,7 +234,7 @@ export class Spider {
230
234
  try {
231
235
  const urlHost = new URL(sitemapUrl.loc).hostname;
232
236
  if (urlHost === this.baseHost) {
233
- scheduleUrl({ url: sitemapUrl.loc, depth: 1 }, true);
237
+ scheduleUrl({ url: sitemapUrl.loc, depth: 1 });
234
238
  }
235
239
  }
236
240
  catch {
@@ -303,7 +307,7 @@ export class Spider {
303
307
  return {
304
308
  status: response.status,
305
309
  text: await response.text(),
306
- headers: Object.fromEntries([...response.headers.entries()]),
310
+ headers: this.toHeaderRecord(response.headers),
307
311
  };
308
312
  };
309
313
  try {
@@ -351,40 +355,49 @@ export class Spider {
351
355
  }
352
356
  buildSitemapAnalysis() {
353
357
  const crawledUrls = new Set(this.results.map(r => normalizeUrl(r.url)));
354
- const crawledFromSitemap = this.sitemapUrls.filter(u => crawledUrls.has(normalizeUrl(u.loc))).length;
358
+ const sitemapUrlSet = this.sitemapUrlSet.size > 0
359
+ ? this.sitemapUrlSet
360
+ : new Set(this.sitemapUrls.map((u) => normalizeUrl(u.loc)));
361
+ const crawledFromSitemap = Array.from(sitemapUrlSet)
362
+ .filter(url => crawledUrls.has(url))
363
+ .length;
355
364
  const linkedUrls = new Set();
356
- for (const page of this.results) {
357
- for (const link of page.links) {
358
- if (link.href) {
359
- linkedUrls.add(normalizeUrl(link.href));
360
- }
361
- }
362
- }
363
- const orphanUrls = this.sitemapUrls
364
- .filter(u => {
365
- const normalized = normalizeUrl(u.loc);
366
- return !linkedUrls.has(normalized) && crawledUrls.has(normalized);
367
- })
368
- .map(u => u.loc);
369
- const missingFromSitemap = Array.from(crawledUrls)
370
- .filter(url => !this.sitemapUrlSet.has(url));
371
- const blockedBySitemapRobots = [];
365
+ const blockedBySitemapRobotsSet = new Set();
372
366
  if (this.robotsData) {
373
367
  for (const sitemapUrl of this.sitemapUrls) {
374
368
  try {
369
+ const normalized = normalizeUrl(sitemapUrl.loc);
375
370
  const urlPath = new URL(sitemapUrl.loc).pathname;
376
371
  if (!isPathAllowed(this.robotsData, urlPath, this.options.userAgent)) {
377
- blockedBySitemapRobots.push(sitemapUrl.loc);
372
+ blockedBySitemapRobotsSet.add(normalized);
378
373
  }
379
374
  }
380
375
  catch {
381
376
  }
382
377
  }
383
378
  }
379
+ for (const page of this.results) {
380
+ for (const link of page.links) {
381
+ if (link.href) {
382
+ linkedUrls.add(normalizeUrl(link.href));
383
+ }
384
+ }
385
+ }
386
+ const orphanUrlSet = new Set();
387
+ for (const u of this.sitemapUrls) {
388
+ const normalized = normalizeUrl(u.loc);
389
+ if (!linkedUrls.has(normalized) && !blockedBySitemapRobotsSet.has(normalized)) {
390
+ orphanUrlSet.add(normalized);
391
+ }
392
+ }
393
+ const orphanUrls = Array.from(orphanUrlSet);
394
+ const missingFromSitemap = Array.from(crawledUrls)
395
+ .filter(url => !sitemapUrlSet.has(url));
396
+ const blockedBySitemapRobots = Array.from(blockedBySitemapRobotsSet);
384
397
  return {
385
398
  found: this.sitemapUrls.length > 0,
386
- url: this.sitemapValidation?.parseResult ? undefined : undefined,
387
- totalUrls: this.sitemapUrls.length,
399
+ url: this.sitemapUrls[0]?.loc,
400
+ totalUrls: sitemapUrlSet.size,
388
401
  crawledFromSitemap,
389
402
  orphanUrls,
390
403
  missingFromSitemap,