recker 1.0.94 → 1.0.95-next.0a5359d

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,7 +17,11 @@ export interface CrawlQueueAdapter {
17
17
  }
18
18
  export declare class InMemoryCrawlQueue implements CrawlQueueAdapter {
19
19
  private queue;
20
+ private head;
21
+ private tail;
20
22
  private visited;
23
+ private mode;
24
+ constructor(mode?: 'fifo' | 'lifo');
21
25
  push(item: CrawlQueueItem): Promise<void>;
22
26
  pushBatch(items: CrawlQueueItem[]): Promise<void>;
23
27
  pop(): Promise<CrawlQueueItem | null>;
@@ -1,14 +1,45 @@
1
1
  export class InMemoryCrawlQueue {
2
2
  queue = [];
3
+ head = 0;
4
+ tail = 0;
3
5
  visited = new Set();
6
+ mode;
7
+ constructor(mode = 'fifo') {
8
+ this.mode = mode;
9
+ }
4
10
  async push(item) {
5
- this.queue.push(item);
11
+ this.queue[this.tail++] = item;
6
12
  }
7
13
  async pushBatch(items) {
8
- this.queue.push(...items);
14
+ for (const item of items) {
15
+ this.queue[this.tail++] = item;
16
+ }
9
17
  }
10
18
  async pop() {
11
- return this.queue.shift() ?? null;
19
+ if (this.mode === 'lifo') {
20
+ while (this.tail > this.head) {
21
+ this.tail--;
22
+ const item = this.queue[this.tail];
23
+ this.queue[this.tail] = undefined;
24
+ if (item)
25
+ return item;
26
+ }
27
+ return null;
28
+ }
29
+ while (this.head < this.tail) {
30
+ const item = this.queue[this.head];
31
+ this.queue[this.head] = undefined;
32
+ this.head++;
33
+ if (item) {
34
+ if (this.head > 1024 && this.head > this.tail / 2) {
35
+ this.queue = this.queue.slice(this.head);
36
+ this.tail -= this.head;
37
+ this.head = 0;
38
+ }
39
+ return item;
40
+ }
41
+ }
42
+ return null;
12
43
  }
13
44
  async hasVisited(url) {
14
45
  return this.visited.has(url);
@@ -25,10 +56,12 @@ export class InMemoryCrawlQueue {
25
56
  this.visited.add(url);
26
57
  }
27
58
  async size() {
28
- return this.queue.length;
59
+ return this.tail - this.head;
29
60
  }
30
61
  async clear() {
31
62
  this.queue = [];
63
+ this.head = 0;
64
+ this.tail = 0;
32
65
  this.visited.clear();
33
66
  }
34
67
  async close() {
@@ -11,7 +11,7 @@ type CaptchaProvider = CaptchaDetectionResult['provider'];
11
11
  export interface SpiderOptions {
12
12
  maxDepth?: number;
13
13
  maxPages?: number;
14
- sameDomain?: boolean;
14
+ sameDomain?: boolean | 'exact' | 'subdomain';
15
15
  concurrency?: number;
16
16
  timeout?: number;
17
17
  delay?: number;
@@ -63,8 +63,14 @@ export interface SpiderOptions {
63
63
  domainRateLimit?: {
64
64
  maxPerSecond?: number;
65
65
  };
66
+ autoThrottle?: boolean | {
67
+ targetMs?: number;
68
+ minDelay?: number;
69
+ maxDelay?: number;
70
+ };
66
71
  deduplicateContent?: boolean;
67
72
  resume?: boolean;
73
+ strategy?: 'bfs' | 'dfs';
68
74
  crawlQueue?: CrawlQueueAdapter;
69
75
  crawlStorage?: CrawlStorageAdapter;
70
76
  }
@@ -190,11 +196,14 @@ export declare class Spider {
190
196
  private _queueSize;
191
197
  private _resultCount;
192
198
  private baseHost;
199
+ private baseRootDomain;
193
200
  private running;
194
201
  private aborted;
202
+ private abortController;
195
203
  private pendingCount;
196
204
  private domainRequestTimestamps;
197
205
  private contentHashes;
206
+ private domainAvgResponseTime;
198
207
  private blockedDomains;
199
208
  private curlTransport;
200
209
  private curlAvailable;
@@ -223,6 +232,7 @@ export declare class Spider {
223
232
  private waitForDomainPenalty;
224
233
  private registerDomainBlock;
225
234
  private registerDomainSuccess;
235
+ private updateAutoThrottle;
226
236
  private getCaptchaRetryMultiplier;
227
237
  private registerDomainChallenge;
228
238
  private getRetryWait;
@@ -40,10 +40,15 @@ const SEC_CH_PLATFORM_HINTS = [
40
40
  function getHostname(url) {
41
41
  return new URL(url).hostname;
42
42
  }
43
/**
 * Wait for `ms` milliseconds, finishing early if `signal` is aborted.
 *
 * The returned promise always resolves (never rejects), so callers must
 * check their own abort flag after awaiting. A non-positive `ms` or an
 * already-aborted signal resolves immediately.
 *
 * @param ms - Delay in milliseconds.
 * @param signal - Optional AbortSignal that cuts the wait short.
 * @returns A promise that resolves once the delay elapses or the signal aborts.
 */
function sleep(ms, signal) {
    if (ms <= 0 || signal?.aborted)
        return Promise.resolve();
    return new Promise(resolve => {
        let timer;
        const onAbort = () => {
            clearTimeout(timer);
            resolve(undefined);
        };
        timer = setTimeout(() => {
            // Detach the listener on normal completion. The same signal is
            // reused for many sleeps, so a listener left behind by each one
            // would accumulate for as long as the signal is alive.
            signal?.removeEventListener('abort', onAbort);
            resolve(undefined);
        }, ms);
        signal?.addEventListener('abort', onAbort, { once: true });
    });
}
48
53
  function getRetryAfterDelay(response) {
49
54
  const retryAfter = response.headers.get('retry-after');
@@ -103,14 +108,30 @@ function normalizeUrl(urlStr) {
103
108
  return urlStr;
104
109
  }
105
110
  }
106
- function shouldCrawl(url, baseHost, options) {
111
/**
 * Best-effort guess of the registrable ("root") domain of a hostname.
 *
 * A leading `www.` is dropped. The last three labels are kept only when the
 * name ends in a two-letter TLD preceded by a well-known second-level
 * category label (e.g. `example.co.uk`, `example.com.br`); otherwise the
 * last two labels are kept. Judging by label length alone would misread
 * short brand names (`api.ibm.com`, `docs.x.com`) as multi-part suffixes.
 *
 * This is a heuristic, not a Public Suffix List lookup, so uncommon
 * suffixes are treated as ordinary two-label domains.
 *
 * @param hostname - Hostname without scheme or port.
 * @returns The guessed root domain; IP literals are returned unchanged.
 */
function getRootDomain(hostname) {
    const host = hostname.replace(/^www\./, '');
    // IP literals have no registrable-domain structure to trim.
    if (/^\d{1,3}(\.\d{1,3}){3}$/.test(host) || host.includes(':')) {
        return host;
    }
    const secondLevelLabels = [
        'ac', 'co', 'com', 'edu', 'go', 'gob', 'gov', 'ltd',
        'mil', 'ne', 'net', 'nom', 'or', 'org', 'plc', 'sch',
    ];
    const parts = host.split('.');
    if (parts.length >= 3) {
        const tld = parts[parts.length - 1];
        const secondLevel = parts[parts.length - 2];
        if (tld.length === 2 && secondLevelLabels.includes(secondLevel)) {
            return parts.slice(-3).join('.');
        }
    }
    return parts.slice(-2).join('.');
}
118
+ function shouldCrawl(url, baseHost, options, baseRootDomain) {
107
119
  try {
108
120
  const parsed = new URL(url);
109
121
  if (!['http:', 'https:'].includes(parsed.protocol)) {
110
122
  return false;
111
123
  }
112
- if (options.sameDomain !== false && parsed.hostname !== baseHost) {
113
- return false;
124
+ const hostname = parsed.hostname.replace(/^www\./, '');
125
+ const sameDomain = options.sameDomain;
126
+ if (sameDomain === 'subdomain') {
127
+ const pageRoot = getRootDomain(hostname);
128
+ const rootDomain = baseRootDomain ?? getRootDomain(baseHost);
129
+ if (pageRoot !== rootDomain)
130
+ return false;
131
+ }
132
+ else if (sameDomain !== false) {
133
+ if (hostname !== baseHost)
134
+ return false;
114
135
  }
115
136
  const skipExtensions = [
116
137
  '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico',
@@ -187,11 +208,14 @@ export class Spider {
187
208
  _queueSize = 0;
188
209
  _resultCount = 0;
189
210
  baseHost = '';
211
+ baseRootDomain = '';
190
212
  running = false;
191
213
  aborted = false;
214
+ abortController = new AbortController();
192
215
  pendingCount = 0;
193
216
  domainRequestTimestamps = new Map();
194
217
  contentHashes = new Map();
218
+ domainAvgResponseTime = new Map();
195
219
  blockedDomains = new Set();
196
220
  curlTransport = null;
197
221
  curlAvailable = false;
@@ -218,7 +242,9 @@ export class Spider {
218
242
  if (timestamps.length >= limit) {
219
243
  const waitMs = timestamps[0] + window - now;
220
244
  if (waitMs > 0)
221
- await sleep(waitMs);
245
+ await sleep(waitMs, this.abortController.signal);
246
+ if (this.aborted)
247
+ return;
222
248
  const afterWait = Date.now();
223
249
  while (timestamps.length > 0 && timestamps[0] <= afterWait - window) {
224
250
  timestamps.shift();
@@ -276,8 +302,10 @@ export class Spider {
276
302
  extract: extractSchema,
277
303
  parserOptions: options.parserOptions,
278
304
  domainRateLimit: options.domainRateLimit,
305
+ autoThrottle: options.autoThrottle ?? false,
279
306
  deduplicateContent: options.deduplicateContent ?? false,
280
307
  resume: options.resume ?? false,
308
+ strategy: options.strategy ?? 'bfs',
281
309
  };
282
310
  if (options.proxy) {
283
311
  if (typeof options.proxy === 'string') {
@@ -307,7 +335,7 @@ export class Spider {
307
335
  interval: this.options.delay,
308
336
  } : {}),
309
337
  });
310
- this.crawlQueue = options.crawlQueue ?? new InMemoryCrawlQueue();
338
+ this.crawlQueue = options.crawlQueue ?? new InMemoryCrawlQueue(this.options.strategy === 'dfs' ? 'lifo' : 'fifo');
311
339
  this.crawlStorage = options.crawlStorage ?? new InMemoryCrawlStorage();
312
340
  }
313
341
  async crawl(startUrl) {
@@ -315,7 +343,8 @@ export class Spider {
315
343
  const startTimestamp = Date.now();
316
344
  const normalizedStart = normalizeUrl(startUrl);
317
345
  const baseUrl = new URL(normalizedStart).origin;
318
- this.baseHost = new URL(normalizedStart).hostname;
346
+ this.baseHost = new URL(normalizedStart).hostname.replace(/^www\./, '');
347
+ this.baseRootDomain = getRootDomain(this.baseHost);
319
348
  if (!this.options.resume) {
320
349
  await this.crawlQueue.clear();
321
350
  await this.crawlStorage.clear();
@@ -324,6 +353,7 @@ export class Spider {
324
353
  this._resultCount = 0;
325
354
  this.domainRequestTimestamps.clear();
326
355
  this.contentHashes.clear();
356
+ this.domainAvgResponseTime.clear();
327
357
  }
328
358
  else {
329
359
  this._queueSize = await this.crawlQueue.size();
@@ -331,14 +361,25 @@ export class Spider {
331
361
  }
332
362
  this.running = true;
333
363
  this.aborted = false;
364
+ this.abortController = new AbortController();
334
365
  this.pendingCount = 0;
335
366
  this.sitemapUrls = [];
336
367
  this.sitemapUrlSet.clear();
337
368
  this.robotsData = null;
338
369
  this.sitemapValidation = null;
339
370
  this.robotsValidation = null;
340
- this.blockedDomains.clear();
341
- this.domainStates.clear();
371
+ if (!this.options.resume) {
372
+ this.blockedDomains.clear();
373
+ this.domainStates.clear();
374
+ }
375
+ if (this.options.resume && this.options.deduplicateContent) {
376
+ const existingResults = await this.crawlStorage.getResults();
377
+ for (const r of existingResults) {
378
+ if (r.contentHash) {
379
+ this.contentHashes.set(r.contentHash, r.url);
380
+ }
381
+ }
382
+ }
342
383
  if (this.options.transport !== 'undici') {
343
384
  this.curlAvailable = await hasImpersonate();
344
385
  if (this.options.transport === 'auto' && this.curlAvailable && isProtectedDomain(this.baseHost)) {
@@ -354,10 +395,12 @@ export class Spider {
354
395
  const pending = new Map();
355
396
  const scheduleUrl = async (item) => {
356
397
  const normalized = normalizeUrl(item.url);
357
- if (await this.crawlQueue.hasVisited(normalized))
358
- return;
359
398
  if (pending.has(normalized))
360
399
  return;
400
+ if (await this.crawlQueue.hasVisited(normalized))
401
+ return;
402
+ await this.crawlQueue.markVisited(normalized);
403
+ this._visitedCount++;
361
404
  if (item.depth > this.options.maxDepth)
362
405
  return;
363
406
  if (this._resultCount + pending.size >= this.options.maxPages)
@@ -373,8 +416,6 @@ export class Spider {
373
416
  return;
374
417
  }
375
418
  }
376
- await this.crawlQueue.markVisited(normalized);
377
- this._visitedCount++;
378
419
  this.pendingCount++;
379
420
  const promise = this.pool.run(() => this.crawlPage({ ...item, url: normalized }))
380
421
  .finally(() => {
@@ -416,14 +457,41 @@ export class Spider {
416
457
  if (pending.size > 0) {
417
458
  await Promise.all(pending.values());
418
459
  }
460
+ while (!this.aborted && this._resultCount < this.options.maxPages) {
461
+ const remaining = await this.crawlQueue.size();
462
+ if (remaining === 0 && pending.size === 0)
463
+ break;
464
+ this._queueSize = remaining;
465
+ let nextItem = await this.crawlQueue.pop();
466
+ while (nextItem && !this.aborted) {
467
+ this._queueSize = Math.max(0, this._queueSize - 1);
468
+ if (this._resultCount + pending.size >= this.options.maxPages)
469
+ break;
470
+ await scheduleUrl(nextItem);
471
+ nextItem = await this.crawlQueue.pop();
472
+ }
473
+ if (pending.size > 0) {
474
+ await Promise.all(pending.values());
475
+ }
476
+ }
419
477
  this.running = false;
420
478
  const pages = await this.crawlStorage.getResults();
421
479
  const errors = await this.crawlStorage.getErrors();
422
480
  const sitemapAnalysis = this.buildSitemapAnalysis(pages);
423
481
  const robotsAnalysis = this.buildRobotsAnalysis();
424
482
  const visited = this.crawlQueue instanceof InMemoryCrawlQueue
425
- ? this.crawlQueue.getVisited()
483
+ ? new Set(this.crawlQueue.getVisited())
426
484
  : new Set(pages.map(r => r.url));
485
+ await this.crawlQueue.close?.();
486
+ await this.crawlStorage.close?.();
487
+ for (const client of this.proxyClients.values()) {
488
+ if (typeof client.destroy === 'function') {
489
+ client.destroy();
490
+ }
491
+ }
492
+ this.proxyClients.clear();
493
+ await this.proxyAdapter?.close?.();
494
+ this.domainAvgResponseTime.clear();
427
495
  return {
428
496
  startUrl: normalizedStart,
429
497
  pages,
@@ -615,6 +683,8 @@ export class Spider {
615
683
  let lastRetryAfterMs = 0;
616
684
  const proxyUrl = this.proxyAdapter ? await this.proxyAdapter.getProxy() : null;
617
685
  const executeRequest = async (useCurl) => {
686
+ if (this.aborted)
687
+ throw new Error('Crawl aborted');
618
688
  if (useCurl && this.curlTransport) {
619
689
  const curlForRequest = proxyUrl
620
690
  ? new CurlTransport(proxyUrl)
@@ -655,6 +725,7 @@ export class Spider {
655
725
  const clientForRequest = this.getClientForProxy(proxyUrl);
656
726
  const response = await clientForRequest.get(url, {
657
727
  headers: this.buildRequestHeaders(url, false),
728
+ signal: this.abortController.signal,
658
729
  beforeRedirect: this.options.onRedirect
659
730
  ? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
660
731
  : undefined,
@@ -746,6 +817,9 @@ export class Spider {
746
817
  const isHighQualitySuccess = response.status < 400 && !detection.blocked && !hasCaptchaSignal;
747
818
  if (isHighQualitySuccess) {
748
819
  this.registerDomainSuccess(hostname);
820
+ if (timings?.total) {
821
+ this.updateAutoThrottle(hostname, timings.total);
822
+ }
749
823
  }
750
824
  if (!shouldRetry || attempt === maxAttempts - 1) {
751
825
  if (proxyUrl && this.proxyAdapter?.reportResult) {
@@ -792,7 +866,9 @@ export class Spider {
792
866
  timings,
793
867
  });
794
868
  }
795
- await sleep(waitMs);
869
+ if (this.aborted)
870
+ break;
871
+ await sleep(waitMs, this.abortController.signal);
796
872
  continue;
797
873
  }
798
874
  catch (error) {
@@ -823,8 +899,10 @@ export class Spider {
823
899
  forcedTransport = 'curl';
824
900
  }
825
901
  }
902
+ if (this.aborted)
903
+ break;
826
904
  const waitMs = this.getRetryWait(hostname, attempt + 1);
827
- await sleep(waitMs);
905
+ await sleep(waitMs, this.abortController.signal);
828
906
  }
829
907
  }
830
908
  if (lastResponse) {
@@ -1035,7 +1113,7 @@ export class Spider {
1035
1113
  if (!link.href)
1036
1114
  continue;
1037
1115
  const normalized = normalizeUrl(link.href);
1038
- if (!shouldCrawl(normalized, this.baseHost, this.options))
1116
+ if (!shouldCrawl(normalized, this.baseHost, this.options, this.baseRootDomain))
1039
1117
  continue;
1040
1118
  candidateUrls.push(normalized);
1041
1119
  candidates.push({ url: normalized, depth: item.depth + 1 });
@@ -1059,6 +1137,8 @@ export class Spider {
1059
1137
  }
1060
1138
  }
1061
1139
  catch (error) {
1140
+ if (this.aborted)
1141
+ return;
1062
1142
  const errAttempts = typeof error?.attempts === 'number' ? error.attempts : 1;
1063
1143
  const errRetryCount = typeof error?.retryCount === 'number' ? error.retryCount : Math.max(0, errAttempts - 1);
1064
1144
  const errRetryAfterMs = typeof error?.retryAfterMs === 'number' ? error?.retryAfterMs : 0;
@@ -1144,6 +1224,7 @@ export class Spider {
1144
1224
  consecutiveUndiciFailures: 0,
1145
1225
  lastTransport: 'undici',
1146
1226
  lastCaptchaConfidence: 0,
1227
+ autoThrottleDelay: 0,
1147
1228
  };
1148
1229
  this.domainStates.set(hostname, next);
1149
1230
  return next;
@@ -1246,9 +1327,11 @@ export class Spider {
1246
1327
  async waitForDomainPenalty(hostname) {
1247
1328
  const state = this.getOrCreateDomainState(hostname);
1248
1329
  const now = Date.now();
1249
- const delay = Math.max(state.penaltyUntil, state.challengeCooldownUntil) - now;
1330
+ const penaltyDelay = Math.max(state.penaltyUntil, state.challengeCooldownUntil) - now;
1331
+ const throttleDelay = state.autoThrottleDelay ?? 0;
1332
+ const delay = Math.max(penaltyDelay, throttleDelay);
1250
1333
  if (delay > 0) {
1251
- await sleep(delay);
1334
+ await sleep(delay, this.abortController.signal);
1252
1335
  }
1253
1336
  }
1254
1337
  registerDomainBlock(hostname) {
@@ -1274,6 +1357,20 @@ export class Spider {
1274
1357
  state.lastCaptchaProvider = undefined;
1275
1358
  }
1276
1359
  }
1360
+ updateAutoThrottle(hostname, responseTimeMs) {
1361
+ const config = this.options.autoThrottle;
1362
+ if (!config)
1363
+ return;
1364
+ const target = (typeof config === 'object' ? config.targetMs : undefined) ?? 500;
1365
+ const minDelay = (typeof config === 'object' ? config.minDelay : undefined) ?? 50;
1366
+ const maxDelay = (typeof config === 'object' ? config.maxDelay : undefined) ?? 30000;
1367
+ const prev = this.domainAvgResponseTime.get(hostname) ?? responseTimeMs;
1368
+ const avg = prev * 0.7 + responseTimeMs * 0.3;
1369
+ this.domainAvgResponseTime.set(hostname, avg);
1370
+ const ratio = avg / target;
1371
+ const state = this.getOrCreateDomainState(hostname);
1372
+ state.autoThrottleDelay = Math.max(minDelay, Math.min(maxDelay, Math.round(this.options.delay * ratio)));
1373
+ }
1277
1374
  getCaptchaRetryMultiplier(provider) {
1278
1375
  if (!provider)
1279
1376
  return 1.2;
@@ -1319,6 +1416,7 @@ export class Spider {
1319
1416
  }
1320
1417
  abort() {
1321
1418
  this.aborted = true;
1419
+ this.abortController.abort();
1322
1420
  }
1323
1421
  isRunning() {
1324
1422
  return this.running;
@@ -56,7 +56,7 @@ export class SeoSpider {
56
56
  });
57
57
  }
58
58
  async analyzePageDuringCrawl(pageResult, html) {
59
- if (pageResult.error || pageResult.status >= 400) {
59
+ if (pageResult.status >= 400 || !html) {
60
60
  const seoPage = { ...pageResult, seoReport: undefined };
61
61
  this.seoPages.push(seoPage);
62
62
  return;
@@ -141,7 +141,8 @@ export class SeoSpider {
141
141
  results.humans.content = await res.text();
142
142
  }
143
143
  }
144
- catch { }
144
+ catch {
145
+ }
145
146
  try {
146
147
  const res = await client.get(results.llms.url);
147
148
  if (res.status === 200) {
@@ -149,7 +150,8 @@ export class SeoSpider {
149
150
  results.llms.content = await res.text();
150
151
  }
151
152
  }
152
- catch { }
153
+ catch {
154
+ }
153
155
  try {
154
156
  const res = await client.get(results.sitemap.url);
155
157
  if (res.status === 200) {
@@ -159,7 +161,8 @@ export class SeoSpider {
159
161
  results.sitemap.urlCount = urlMatches ? urlMatches.length : 0;
160
162
  }
161
163
  }
162
- catch { }
164
+ catch {
165
+ }
163
166
  try {
164
167
  let res = await client.get(results.manifest.url);
165
168
  if (res.status !== 200) {
@@ -181,7 +184,8 @@ export class SeoSpider {
181
184
  }
182
185
  }
183
186
  }
184
- catch { }
187
+ catch {
188
+ }
185
189
  return results;
186
190
  }
187
191
  catch {
@@ -25,7 +25,8 @@ export async function discoverFeeds(baseUrl, html) {
25
25
  href = new URL(href, baseUrl).toString();
26
26
  candidateUrls.add(href);
27
27
  }
28
- catch { }
28
+ catch {
29
+ }
29
30
  }
30
31
  }
31
32
  }
@@ -35,7 +36,8 @@ export async function discoverFeeds(baseUrl, html) {
35
36
  const url = new URL(path, baseUrl).toString();
36
37
  candidateUrls.add(url);
37
38
  }
38
- catch { }
39
+ catch {
40
+ }
39
41
  }
40
42
  }
41
43
  const client = createClient({ timeout: 8000 });