recker 1.0.95 → 1.0.96

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,6 +20,8 @@ export declare class InMemoryCrawlQueue implements CrawlQueueAdapter {
20
20
  private head;
21
21
  private tail;
22
22
  private visited;
23
+ private mode;
24
+ constructor(mode?: 'fifo' | 'lifo');
23
25
  push(item: CrawlQueueItem): Promise<void>;
24
26
  pushBatch(items: CrawlQueueItem[]): Promise<void>;
25
27
  pop(): Promise<CrawlQueueItem | null>;
@@ -3,6 +3,10 @@ export class InMemoryCrawlQueue {
3
3
  head = 0;
4
4
  tail = 0;
5
5
  visited = new Set();
6
+ mode;
7
+ constructor(mode = 'fifo') {
8
+ this.mode = mode;
9
+ }
6
10
  async push(item) {
7
11
  this.queue[this.tail++] = item;
8
12
  }
@@ -12,6 +16,16 @@ export class InMemoryCrawlQueue {
12
16
  }
13
17
  }
14
18
  async pop() {
19
+ if (this.mode === 'lifo') {
20
+ while (this.tail > this.head) {
21
+ this.tail--;
22
+ const item = this.queue[this.tail];
23
+ this.queue[this.tail] = undefined;
24
+ if (item)
25
+ return item;
26
+ }
27
+ return null;
28
+ }
15
29
  while (this.head < this.tail) {
16
30
  const item = this.queue[this.head];
17
31
  this.queue[this.head] = undefined;
@@ -11,7 +11,7 @@ type CaptchaProvider = CaptchaDetectionResult['provider'];
11
11
  export interface SpiderOptions {
12
12
  maxDepth?: number;
13
13
  maxPages?: number;
14
- sameDomain?: boolean;
14
+ sameDomain?: boolean | 'exact' | 'subdomain';
15
15
  concurrency?: number;
16
16
  timeout?: number;
17
17
  delay?: number;
@@ -63,8 +63,14 @@ export interface SpiderOptions {
63
63
  domainRateLimit?: {
64
64
  maxPerSecond?: number;
65
65
  };
66
+ autoThrottle?: boolean | {
67
+ targetMs?: number;
68
+ minDelay?: number;
69
+ maxDelay?: number;
70
+ };
66
71
  deduplicateContent?: boolean;
67
72
  resume?: boolean;
73
+ strategy?: 'bfs' | 'dfs';
68
74
  crawlQueue?: CrawlQueueAdapter;
69
75
  crawlStorage?: CrawlStorageAdapter;
70
76
  }
@@ -190,12 +196,14 @@ export declare class Spider {
190
196
  private _queueSize;
191
197
  private _resultCount;
192
198
  private baseHost;
199
+ private baseRootDomain;
193
200
  private running;
194
201
  private aborted;
195
202
  private abortController;
196
203
  private pendingCount;
197
204
  private domainRequestTimestamps;
198
205
  private contentHashes;
206
+ private domainAvgResponseTime;
199
207
  private blockedDomains;
200
208
  private curlTransport;
201
209
  private curlAvailable;
@@ -224,6 +232,7 @@ export declare class Spider {
224
232
  private waitForDomainPenalty;
225
233
  private registerDomainBlock;
226
234
  private registerDomainSuccess;
235
+ private updateAutoThrottle;
227
236
  private getCaptchaRetryMultiplier;
228
237
  private registerDomainChallenge;
229
238
  private getRetryWait;
@@ -108,15 +108,30 @@ function normalizeUrl(urlStr) {
108
108
  return urlStr;
109
109
  }
110
110
  }
111
- function shouldCrawl(url, baseHost, options) {
111
/**
 * Returns the registrable ("root") domain of a hostname, used to decide
 * whether two hosts belong to the same site in `sameDomain: 'subdomain'` mode.
 *
 * A leading `www.` is stripped first. The last two labels are returned,
 * or the last three when the hostname ends in a country-code TLD preceded
 * by a well-known second-level suffix label (e.g. `example.co.uk`,
 * `example.com.br`).
 *
 * IPv4 literals are returned unchanged: slicing their octets would make
 * unrelated addresses (e.g. `10.0.0.1` and `99.0.0.1`) share a "root".
 *
 * NOTE: this is a heuristic, not a full Public Suffix List lookup; unusual
 * multi-label suffixes are treated as ordinary two-label domains.
 *
 * @param hostname Hostname such as `URL#hostname` produces.
 * @returns The root domain, or the input itself for IPv4 / single-label hosts.
 */
function getRootDomain(hostname) {
    const host = hostname.replace(/^www\./, '');
    // An IP address has no parent domain; it is its own root.
    if (/^\d{1,3}(?:\.\d{1,3}){3}$/.test(host)) {
        return host;
    }
    const parts = host.split('.');
    if (parts.length >= 3) {
        const tld = parts[parts.length - 1];
        const secondLevel = parts[parts.length - 2].toLowerCase();
        // Only a ccTLD (two letters) combined with a generic category label
        // forms a two-label public suffix. Label length alone is not enough:
        // "ibm", "bbc" or "cnn" are ordinary registrable names.
        const isSuffixLabel = /^(?:co|com|org|net|gov|edu|ac|or|ne|go|gv|mil|gob|nom|ltd|plc|sch)$/.test(secondLevel);
        if (tld.length === 2 && isSuffixLabel) {
            return parts.slice(-3).join('.');
        }
    }
    return parts.slice(-2).join('.');
}
118
+ function shouldCrawl(url, baseHost, options, baseRootDomain) {
112
119
  try {
113
120
  const parsed = new URL(url);
114
121
  if (!['http:', 'https:'].includes(parsed.protocol)) {
115
122
  return false;
116
123
  }
117
124
  const hostname = parsed.hostname.replace(/^www\./, '');
118
- if (options.sameDomain !== false && hostname !== baseHost) {
119
- return false;
125
+ const sameDomain = options.sameDomain;
126
+ if (sameDomain === 'subdomain') {
127
+ const pageRoot = getRootDomain(hostname);
128
+ const rootDomain = baseRootDomain ?? getRootDomain(baseHost);
129
+ if (pageRoot !== rootDomain)
130
+ return false;
131
+ }
132
+ else if (sameDomain !== false) {
133
+ if (hostname !== baseHost)
134
+ return false;
120
135
  }
121
136
  const skipExtensions = [
122
137
  '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico',
@@ -193,12 +208,14 @@ export class Spider {
193
208
  _queueSize = 0;
194
209
  _resultCount = 0;
195
210
  baseHost = '';
211
+ baseRootDomain = '';
196
212
  running = false;
197
213
  aborted = false;
198
214
  abortController = new AbortController();
199
215
  pendingCount = 0;
200
216
  domainRequestTimestamps = new Map();
201
217
  contentHashes = new Map();
218
+ domainAvgResponseTime = new Map();
202
219
  blockedDomains = new Set();
203
220
  curlTransport = null;
204
221
  curlAvailable = false;
@@ -285,8 +302,10 @@ export class Spider {
285
302
  extract: extractSchema,
286
303
  parserOptions: options.parserOptions,
287
304
  domainRateLimit: options.domainRateLimit,
305
+ autoThrottle: options.autoThrottle ?? false,
288
306
  deduplicateContent: options.deduplicateContent ?? false,
289
307
  resume: options.resume ?? false,
308
+ strategy: options.strategy ?? 'bfs',
290
309
  };
291
310
  if (options.proxy) {
292
311
  if (typeof options.proxy === 'string') {
@@ -316,7 +335,7 @@ export class Spider {
316
335
  interval: this.options.delay,
317
336
  } : {}),
318
337
  });
319
- this.crawlQueue = options.crawlQueue ?? new InMemoryCrawlQueue();
338
+ this.crawlQueue = options.crawlQueue ?? new InMemoryCrawlQueue(this.options.strategy === 'dfs' ? 'lifo' : 'fifo');
320
339
  this.crawlStorage = options.crawlStorage ?? new InMemoryCrawlStorage();
321
340
  }
322
341
  async crawl(startUrl) {
@@ -325,6 +344,7 @@ export class Spider {
325
344
  const normalizedStart = normalizeUrl(startUrl);
326
345
  const baseUrl = new URL(normalizedStart).origin;
327
346
  this.baseHost = new URL(normalizedStart).hostname.replace(/^www\./, '');
347
+ this.baseRootDomain = getRootDomain(this.baseHost);
328
348
  if (!this.options.resume) {
329
349
  await this.crawlQueue.clear();
330
350
  await this.crawlStorage.clear();
@@ -333,6 +353,7 @@ export class Spider {
333
353
  this._resultCount = 0;
334
354
  this.domainRequestTimestamps.clear();
335
355
  this.contentHashes.clear();
356
+ this.domainAvgResponseTime.clear();
336
357
  }
337
358
  else {
338
359
  this._queueSize = await this.crawlQueue.size();
@@ -463,6 +484,14 @@ export class Spider {
463
484
  : new Set(pages.map(r => r.url));
464
485
  await this.crawlQueue.close?.();
465
486
  await this.crawlStorage.close?.();
487
+ for (const client of this.proxyClients.values()) {
488
+ if (typeof client.destroy === 'function') {
489
+ client.destroy();
490
+ }
491
+ }
492
+ this.proxyClients.clear();
493
+ await this.proxyAdapter?.close?.();
494
+ this.domainAvgResponseTime.clear();
466
495
  return {
467
496
  startUrl: normalizedStart,
468
497
  pages,
@@ -788,6 +817,9 @@ export class Spider {
788
817
  const isHighQualitySuccess = response.status < 400 && !detection.blocked && !hasCaptchaSignal;
789
818
  if (isHighQualitySuccess) {
790
819
  this.registerDomainSuccess(hostname);
820
+ if (timings?.total) {
821
+ this.updateAutoThrottle(hostname, timings.total);
822
+ }
791
823
  }
792
824
  if (!shouldRetry || attempt === maxAttempts - 1) {
793
825
  if (proxyUrl && this.proxyAdapter?.reportResult) {
@@ -1081,7 +1113,7 @@ export class Spider {
1081
1113
  if (!link.href)
1082
1114
  continue;
1083
1115
  const normalized = normalizeUrl(link.href);
1084
- if (!shouldCrawl(normalized, this.baseHost, this.options))
1116
+ if (!shouldCrawl(normalized, this.baseHost, this.options, this.baseRootDomain))
1085
1117
  continue;
1086
1118
  candidateUrls.push(normalized);
1087
1119
  candidates.push({ url: normalized, depth: item.depth + 1 });
@@ -1192,6 +1224,7 @@ export class Spider {
1192
1224
  consecutiveUndiciFailures: 0,
1193
1225
  lastTransport: 'undici',
1194
1226
  lastCaptchaConfidence: 0,
1227
+ autoThrottleDelay: 0,
1195
1228
  };
1196
1229
  this.domainStates.set(hostname, next);
1197
1230
  return next;
@@ -1294,7 +1327,9 @@ export class Spider {
1294
1327
  async waitForDomainPenalty(hostname) {
1295
1328
  const state = this.getOrCreateDomainState(hostname);
1296
1329
  const now = Date.now();
1297
- const delay = Math.max(state.penaltyUntil, state.challengeCooldownUntil) - now;
1330
+ const penaltyDelay = Math.max(state.penaltyUntil, state.challengeCooldownUntil) - now;
1331
+ const throttleDelay = state.autoThrottleDelay ?? 0;
1332
+ const delay = Math.max(penaltyDelay, throttleDelay);
1298
1333
  if (delay > 0) {
1299
1334
  await sleep(delay, this.abortController.signal);
1300
1335
  }
@@ -1322,6 +1357,20 @@ export class Spider {
1322
1357
  state.lastCaptchaProvider = undefined;
1323
1358
  }
1324
1359
  }
1360
+ updateAutoThrottle(hostname, responseTimeMs) {
1361
+ const config = this.options.autoThrottle;
1362
+ if (!config)
1363
+ return;
1364
+ const target = (typeof config === 'object' ? config.targetMs : undefined) ?? 500;
1365
+ const minDelay = (typeof config === 'object' ? config.minDelay : undefined) ?? 50;
1366
+ const maxDelay = (typeof config === 'object' ? config.maxDelay : undefined) ?? 30000;
1367
+ const prev = this.domainAvgResponseTime.get(hostname) ?? responseTimeMs;
1368
+ const avg = prev * 0.7 + responseTimeMs * 0.3;
1369
+ this.domainAvgResponseTime.set(hostname, avg);
1370
+ const ratio = avg / target;
1371
+ const state = this.getOrCreateDomainState(hostname);
1372
+ state.autoThrottleDelay = Math.max(minDelay, Math.min(maxDelay, Math.round(this.options.delay * ratio)));
1373
+ }
1325
1374
  getCaptchaRetryMultiplier(provider) {
1326
1375
  if (!provider)
1327
1376
  return 1.2;
@@ -141,7 +141,8 @@ export class SeoSpider {
141
141
  results.humans.content = await res.text();
142
142
  }
143
143
  }
144
- catch { }
144
+ catch {
145
+ }
145
146
  try {
146
147
  const res = await client.get(results.llms.url);
147
148
  if (res.status === 200) {
@@ -149,7 +150,8 @@ export class SeoSpider {
149
150
  results.llms.content = await res.text();
150
151
  }
151
152
  }
152
- catch { }
153
+ catch {
154
+ }
153
155
  try {
154
156
  const res = await client.get(results.sitemap.url);
155
157
  if (res.status === 200) {
@@ -159,7 +161,8 @@ export class SeoSpider {
159
161
  results.sitemap.urlCount = urlMatches ? urlMatches.length : 0;
160
162
  }
161
163
  }
162
- catch { }
164
+ catch {
165
+ }
163
166
  try {
164
167
  let res = await client.get(results.manifest.url);
165
168
  if (res.status !== 200) {
@@ -181,7 +184,8 @@ export class SeoSpider {
181
184
  }
182
185
  }
183
186
  }
184
- catch { }
187
+ catch {
188
+ }
185
189
  return results;
186
190
  }
187
191
  catch {
@@ -25,7 +25,8 @@ export async function discoverFeeds(baseUrl, html) {
25
25
  href = new URL(href, baseUrl).toString();
26
26
  candidateUrls.add(href);
27
27
  }
28
- catch { }
28
+ catch {
29
+ }
29
30
  }
30
31
  }
31
32
  }
@@ -35,7 +36,8 @@ export async function discoverFeeds(baseUrl, html) {
35
36
  const url = new URL(path, baseUrl).toString();
36
37
  candidateUrls.add(url);
37
38
  }
38
- catch { }
39
+ catch {
40
+ }
39
41
  }
40
42
  }
41
43
  const client = createClient({ timeout: 8000 });
@@ -526,18 +526,42 @@ export function detectBlock(response, body) {
526
526
  if (body) {
527
527
  const isLongBody = body.length > 100_000;
528
528
  const checkBody = isLongBody ? body.slice(0, 30_000) : body;
529
- const challengeHint = /just a moment|attention required|access denied|checking your browser|security check|human verification|suspicious activity|captcha|recaptcha|hcaptcha|turnstile|datadome|cloudflare/i.test(checkBody.slice(0, 8000));
530
- for (const { pattern, reason, confidence, description } of BLOCK_PATTERNS) {
531
- if (isLongBody && response.status === 200 && confidence < 0.85 && !challengeHint) {
532
- continue;
533
- }
534
- if (pattern.test(checkBody)) {
535
- results.push({
536
- blocked: true,
537
- reason,
538
- confidence,
539
- description,
540
- });
529
+ const lowerBody = checkBody.slice(0, 8000).toLowerCase();
530
+ const hasAnyChallengeKeyword = lowerBody.includes('captcha') ||
531
+ lowerBody.includes('cloudflare') ||
532
+ lowerBody.includes('datadome') ||
533
+ lowerBody.includes('blocked') ||
534
+ lowerBody.includes('denied') ||
535
+ lowerBody.includes('forbidden') ||
536
+ lowerBody.includes('too many') ||
537
+ lowerBody.includes('rate limit') ||
538
+ lowerBody.includes('checking your browser') ||
539
+ lowerBody.includes('just a moment') ||
540
+ lowerBody.includes('security check') ||
541
+ lowerBody.includes('human verification') ||
542
+ lowerBody.includes('perimeterx') ||
543
+ lowerBody.includes('incapsula') ||
544
+ lowerBody.includes('imperva') ||
545
+ lowerBody.includes('akamai') ||
546
+ lowerBody.includes('httpservice') ||
547
+ lowerBody.includes('enablejs') ||
548
+ lowerBody.includes('verify') ||
549
+ lowerBody.includes('suspicious') ||
550
+ lowerBody.includes('bot') ||
551
+ lowerBody.includes('access denied');
552
+ if (hasAnyChallengeKeyword || response.status !== 200) {
553
+ for (const { pattern, reason, confidence, description } of BLOCK_PATTERNS) {
554
+ if (isLongBody && response.status === 200 && confidence < 0.85 && !hasAnyChallengeKeyword) {
555
+ continue;
556
+ }
557
+ if (pattern.test(checkBody)) {
558
+ results.push({
559
+ blocked: true,
560
+ reason,
561
+ confidence,
562
+ description,
563
+ });
564
+ }
541
565
  }
542
566
  }
543
567
  if (body.length < 5000 &&
@@ -603,8 +627,25 @@ export function detectCaptcha(response, body) {
603
627
  if (body) {
604
628
  const checkBody = body.length < 120_000 ? body : body.slice(0, 120_000);
605
629
  const isTinyBody = checkBody.length > 0 && checkBody.length < 12000;
630
+ const lowerSnippet = checkBody.slice(0, 8000).toLowerCase();
631
+ const hasAnyCaptchaKeyword = lowerSnippet.includes('captcha') ||
632
+ lowerSnippet.includes('recaptcha') ||
633
+ lowerSnippet.includes('hcaptcha') ||
634
+ lowerSnippet.includes('turnstile') ||
635
+ lowerSnippet.includes('challenge') ||
636
+ lowerSnippet.includes('cloudflare') ||
637
+ lowerSnippet.includes('datadome') ||
638
+ lowerSnippet.includes('perimeterx') ||
639
+ lowerSnippet.includes('funcaptcha') ||
640
+ lowerSnippet.includes('arkose') ||
641
+ lowerSnippet.includes('sitekey') ||
642
+ lowerSnippet.includes('just a moment') ||
643
+ lowerSnippet.includes('human verification');
606
644
  const hasHtmlTags = /<html|<head|<body|<script|<meta/i.test(checkBody);
607
- const challengeTitleOrText = /just a moment|attention required|human verification|verify you are human|security check/i.test(checkBody);
645
+ const challengeTitleOrText = hasAnyCaptchaKeyword ||
646
+ lowerSnippet.includes('attention required') ||
647
+ lowerSnippet.includes('verify you are human') ||
648
+ lowerSnippet.includes('security check');
608
649
  if (isTinyBody && challengeTitleOrText && hasHtmlTags) {
609
650
  addMatch(matches, 'generic', 0.6, 'Tiny HTML response with challenge-like text');
610
651
  }
@@ -616,24 +657,38 @@ export function detectCaptcha(response, body) {
616
657
  addMatch(matches, provider, confidence, description);
617
658
  }
618
659
  }
619
- for (const { pattern, provider, confidence, description } of CAPTCHA_SCRIPT_PATTERNS) {
620
- if (pattern.test(checkBody)) {
621
- addMatch(matches, provider, confidence, description);
660
+ if (hasAnyCaptchaKeyword || !(response.status >= 200 && response.status < 300)) {
661
+ for (const { pattern, provider, confidence, description } of CAPTCHA_SCRIPT_PATTERNS) {
662
+ if (pattern.test(checkBody)) {
663
+ addMatch(matches, provider, confidence, description);
664
+ }
622
665
  }
623
- }
624
- for (const { pattern, provider, confidence, description } of CAPTCHA_FORM_PATTERNS) {
625
- if (pattern.test(checkBody)) {
626
- addMatch(matches, provider, confidence, description);
666
+ for (const { pattern, provider, confidence, description } of CAPTCHA_FORM_PATTERNS) {
667
+ if (pattern.test(checkBody)) {
668
+ addMatch(matches, provider, confidence, description);
669
+ }
627
670
  }
628
- }
629
- for (const { pattern, provider, confidence, description } of CAPTCHA_DOM_MARKERS) {
630
- if (pattern.test(checkBody)) {
631
- addMatch(matches, provider, confidence, description);
671
+ for (const { pattern, provider, confidence, description } of CAPTCHA_DOM_MARKERS) {
672
+ if (pattern.test(checkBody)) {
673
+ addMatch(matches, provider, confidence, description);
674
+ }
632
675
  }
633
- }
634
- for (const { pattern, provider, confidence, description } of CAPTCHA_NOSCRIPT_PATTERNS) {
635
- if (pattern.test(checkBody)) {
636
- addMatch(matches, provider, confidence, description);
676
+ for (const { pattern, provider, confidence, description } of CAPTCHA_NOSCRIPT_PATTERNS) {
677
+ if (pattern.test(checkBody)) {
678
+ addMatch(matches, provider, confidence, description);
679
+ }
680
+ }
681
+ for (const { pattern, provider, confidence, description } of CAPTCHA_PATTERNS) {
682
+ if (pattern.test(checkBody)) {
683
+ addMatch(matches, provider, confidence, description);
684
+ }
685
+ }
686
+ if (/<script[^>]*src=["'][^"']*(?:challenges\.cloudflare\.com|recaptcha\/api\.js|hcaptcha\.com\/1\/api\.js|challenges\.perimeterx\.)[^"']*["'][^>]*>/i.test(checkBody.slice(0, 120_000))) {
687
+ addMatch(matches, 'cloudflare', 0.6, 'Known challenge script reference found in HTML');
688
+ }
689
+ if ((response.status >= 500 && response.status < 600) &&
690
+ /cloudflare|challenge|verification|blocked|security/i.test(checkBody.slice(0, 3000))) {
691
+ addMatch(matches, 'generic', 0.2, 'Server challenge-like text on error response');
637
692
  }
638
693
  }
639
694
  if (response.status === 403 || response.status === 503) {
@@ -653,18 +708,6 @@ export function detectCaptcha(response, body) {
653
708
  /refresh|challenge|verify|captcha|javascript challenge|cloudflare|bot/i.test((location || ''))) {
654
709
  addMatch(matches, 'generic', 0.67, 'Redirect location indicates bot challenge flow');
655
710
  }
656
- if (/<script[^>]*src=["'][^"']*(?:challenges\.cloudflare\.com|recaptcha\/api\.js|hcaptcha\.com\/1\/api\.js|challenges\.perimeterx\.)[^"']*["'][^>]*>/i.test(checkBody.slice(0, 120_000))) {
657
- addMatch(matches, 'cloudflare', 0.6, 'Known challenge script reference found in HTML');
658
- }
659
- if ((response.status >= 500 && response.status < 600) &&
660
- /cloudflare|challenge|verification|blocked|security/i.test(checkBody.slice(0, 3000))) {
661
- addMatch(matches, 'generic', 0.2, 'Server challenge-like text on error response');
662
- }
663
- for (const { pattern, provider, confidence, description } of CAPTCHA_PATTERNS) {
664
- if (pattern.test(checkBody)) {
665
- addMatch(matches, provider, confidence, description);
666
- }
667
- }
668
711
  }
669
712
  const result = scoreByProvider(matches);
670
713
  if (result.confidence >= 0.5 || hasCaptchaHeader) {
@@ -681,23 +724,33 @@ export function detectCaptcha(response, body) {
681
724
  };
682
725
  }
683
726
  export function isProtectedDomain(hostname) {
684
- const protectedPatterns = [
685
- /cloudflare/i,
686
- /\.gov$/i,
687
- /\.mil$/i,
688
- /linkedin\.com$/i,
689
- /twitter\.com$/i,
690
- /x\.com$/i,
691
- /instagram\.com$/i,
692
- /facebook\.com$/i,
693
- /amazon\./i,
694
- /google\./i,
695
- /microsoft\.com$/i,
696
- /apple\.com$/i,
697
- /netflix\.com$/i,
698
- /spotify\.com$/i,
727
+ const h = hostname.toLowerCase();
728
+ const protectedDomains = [
729
+ 'linkedin.com',
730
+ 'twitter.com',
731
+ 'x.com',
732
+ 'instagram.com',
733
+ 'facebook.com',
734
+ 'amazon.com',
735
+ 'amazon.co.uk',
736
+ 'amazon.de',
737
+ 'amazon.co.jp',
738
+ 'google.com',
739
+ 'microsoft.com',
740
+ 'apple.com',
741
+ 'netflix.com',
742
+ 'spotify.com',
699
743
  ];
700
- return protectedPatterns.some(p => p.test(hostname));
744
+ const protectedTlds = ['.gov', '.mil'];
745
+ for (const domain of protectedDomains) {
746
+ if (h === domain || h.endsWith('.' + domain))
747
+ return true;
748
+ }
749
+ for (const tld of protectedTlds) {
750
+ if (h.endsWith(tld))
751
+ return true;
752
+ }
753
+ return false;
701
754
  }
702
755
  export function isCloudflareChallenge(response, body) {
703
756
  const cfRay = response.headers.get('cf-ray');
@@ -20,6 +20,8 @@ export declare class InMemoryCrawlQueue implements CrawlQueueAdapter {
20
20
  private head;
21
21
  private tail;
22
22
  private visited;
23
+ private mode;
24
+ constructor(mode?: 'fifo' | 'lifo');
23
25
  push(item: CrawlQueueItem): Promise<void>;
24
26
  pushBatch(items: CrawlQueueItem[]): Promise<void>;
25
27
  pop(): Promise<CrawlQueueItem | null>;
@@ -3,6 +3,10 @@ export class InMemoryCrawlQueue {
3
3
  head = 0;
4
4
  tail = 0;
5
5
  visited = new Set();
6
+ mode;
7
+ constructor(mode = 'fifo') {
8
+ this.mode = mode;
9
+ }
6
10
  async push(item) {
7
11
  this.queue[this.tail++] = item;
8
12
  }
@@ -12,6 +16,16 @@ export class InMemoryCrawlQueue {
12
16
  }
13
17
  }
14
18
  async pop() {
19
+ if (this.mode === 'lifo') {
20
+ while (this.tail > this.head) {
21
+ this.tail--;
22
+ const item = this.queue[this.tail];
23
+ this.queue[this.tail] = undefined;
24
+ if (item)
25
+ return item;
26
+ }
27
+ return null;
28
+ }
15
29
  while (this.head < this.tail) {
16
30
  const item = this.queue[this.head];
17
31
  this.queue[this.head] = undefined;
@@ -11,7 +11,7 @@ type CaptchaProvider = CaptchaDetectionResult['provider'];
11
11
  export interface SpiderOptions {
12
12
  maxDepth?: number;
13
13
  maxPages?: number;
14
- sameDomain?: boolean;
14
+ sameDomain?: boolean | 'exact' | 'subdomain';
15
15
  concurrency?: number;
16
16
  timeout?: number;
17
17
  delay?: number;
@@ -63,8 +63,14 @@ export interface SpiderOptions {
63
63
  domainRateLimit?: {
64
64
  maxPerSecond?: number;
65
65
  };
66
+ autoThrottle?: boolean | {
67
+ targetMs?: number;
68
+ minDelay?: number;
69
+ maxDelay?: number;
70
+ };
66
71
  deduplicateContent?: boolean;
67
72
  resume?: boolean;
73
+ strategy?: 'bfs' | 'dfs';
68
74
  crawlQueue?: CrawlQueueAdapter;
69
75
  crawlStorage?: CrawlStorageAdapter;
70
76
  }
@@ -190,12 +196,14 @@ export declare class Spider {
190
196
  private _queueSize;
191
197
  private _resultCount;
192
198
  private baseHost;
199
+ private baseRootDomain;
193
200
  private running;
194
201
  private aborted;
195
202
  private abortController;
196
203
  private pendingCount;
197
204
  private domainRequestTimestamps;
198
205
  private contentHashes;
206
+ private domainAvgResponseTime;
199
207
  private blockedDomains;
200
208
  private curlTransport;
201
209
  private curlAvailable;
@@ -224,6 +232,7 @@ export declare class Spider {
224
232
  private waitForDomainPenalty;
225
233
  private registerDomainBlock;
226
234
  private registerDomainSuccess;
235
+ private updateAutoThrottle;
227
236
  private getCaptchaRetryMultiplier;
228
237
  private registerDomainChallenge;
229
238
  private getRetryWait;
@@ -108,15 +108,30 @@ function normalizeUrl(urlStr) {
108
108
  return urlStr;
109
109
  }
110
110
  }
111
- function shouldCrawl(url, baseHost, options) {
111
/**
 * Returns the registrable ("root") domain of a hostname, used to decide
 * whether two hosts belong to the same site in `sameDomain: 'subdomain'` mode.
 *
 * A leading `www.` is stripped first. The last two labels are returned,
 * or the last three when the hostname ends in a country-code TLD preceded
 * by a well-known second-level suffix label (e.g. `example.co.uk`,
 * `example.com.br`).
 *
 * IPv4 literals are returned unchanged: slicing their octets would make
 * unrelated addresses (e.g. `10.0.0.1` and `99.0.0.1`) share a "root".
 *
 * NOTE: this is a heuristic, not a full Public Suffix List lookup; unusual
 * multi-label suffixes are treated as ordinary two-label domains.
 *
 * @param hostname Hostname such as `URL#hostname` produces.
 * @returns The root domain, or the input itself for IPv4 / single-label hosts.
 */
function getRootDomain(hostname) {
    const host = hostname.replace(/^www\./, '');
    // An IP address has no parent domain; it is its own root.
    if (/^\d{1,3}(?:\.\d{1,3}){3}$/.test(host)) {
        return host;
    }
    const parts = host.split('.');
    if (parts.length >= 3) {
        const tld = parts[parts.length - 1];
        const secondLevel = parts[parts.length - 2].toLowerCase();
        // Only a ccTLD (two letters) combined with a generic category label
        // forms a two-label public suffix. Label length alone is not enough:
        // "ibm", "bbc" or "cnn" are ordinary registrable names.
        const isSuffixLabel = /^(?:co|com|org|net|gov|edu|ac|or|ne|go|gv|mil|gob|nom|ltd|plc|sch)$/.test(secondLevel);
        if (tld.length === 2 && isSuffixLabel) {
            return parts.slice(-3).join('.');
        }
    }
    return parts.slice(-2).join('.');
}
118
+ function shouldCrawl(url, baseHost, options, baseRootDomain) {
112
119
  try {
113
120
  const parsed = new URL(url);
114
121
  if (!['http:', 'https:'].includes(parsed.protocol)) {
115
122
  return false;
116
123
  }
117
124
  const hostname = parsed.hostname.replace(/^www\./, '');
118
- if (options.sameDomain !== false && hostname !== baseHost) {
119
- return false;
125
+ const sameDomain = options.sameDomain;
126
+ if (sameDomain === 'subdomain') {
127
+ const pageRoot = getRootDomain(hostname);
128
+ const rootDomain = baseRootDomain ?? getRootDomain(baseHost);
129
+ if (pageRoot !== rootDomain)
130
+ return false;
131
+ }
132
+ else if (sameDomain !== false) {
133
+ if (hostname !== baseHost)
134
+ return false;
120
135
  }
121
136
  const skipExtensions = [
122
137
  '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico',
@@ -193,12 +208,14 @@ export class Spider {
193
208
  _queueSize = 0;
194
209
  _resultCount = 0;
195
210
  baseHost = '';
211
+ baseRootDomain = '';
196
212
  running = false;
197
213
  aborted = false;
198
214
  abortController = new AbortController();
199
215
  pendingCount = 0;
200
216
  domainRequestTimestamps = new Map();
201
217
  contentHashes = new Map();
218
+ domainAvgResponseTime = new Map();
202
219
  blockedDomains = new Set();
203
220
  curlTransport = null;
204
221
  curlAvailable = false;
@@ -285,8 +302,10 @@ export class Spider {
285
302
  extract: extractSchema,
286
303
  parserOptions: options.parserOptions,
287
304
  domainRateLimit: options.domainRateLimit,
305
+ autoThrottle: options.autoThrottle ?? false,
288
306
  deduplicateContent: options.deduplicateContent ?? false,
289
307
  resume: options.resume ?? false,
308
+ strategy: options.strategy ?? 'bfs',
290
309
  };
291
310
  if (options.proxy) {
292
311
  if (typeof options.proxy === 'string') {
@@ -316,7 +335,7 @@ export class Spider {
316
335
  interval: this.options.delay,
317
336
  } : {}),
318
337
  });
319
- this.crawlQueue = options.crawlQueue ?? new InMemoryCrawlQueue();
338
+ this.crawlQueue = options.crawlQueue ?? new InMemoryCrawlQueue(this.options.strategy === 'dfs' ? 'lifo' : 'fifo');
320
339
  this.crawlStorage = options.crawlStorage ?? new InMemoryCrawlStorage();
321
340
  }
322
341
  async crawl(startUrl) {
@@ -325,6 +344,7 @@ export class Spider {
325
344
  const normalizedStart = normalizeUrl(startUrl);
326
345
  const baseUrl = new URL(normalizedStart).origin;
327
346
  this.baseHost = new URL(normalizedStart).hostname.replace(/^www\./, '');
347
+ this.baseRootDomain = getRootDomain(this.baseHost);
328
348
  if (!this.options.resume) {
329
349
  await this.crawlQueue.clear();
330
350
  await this.crawlStorage.clear();
@@ -333,6 +353,7 @@ export class Spider {
333
353
  this._resultCount = 0;
334
354
  this.domainRequestTimestamps.clear();
335
355
  this.contentHashes.clear();
356
+ this.domainAvgResponseTime.clear();
336
357
  }
337
358
  else {
338
359
  this._queueSize = await this.crawlQueue.size();
@@ -463,6 +484,14 @@ export class Spider {
463
484
  : new Set(pages.map(r => r.url));
464
485
  await this.crawlQueue.close?.();
465
486
  await this.crawlStorage.close?.();
487
+ for (const client of this.proxyClients.values()) {
488
+ if (typeof client.destroy === 'function') {
489
+ client.destroy();
490
+ }
491
+ }
492
+ this.proxyClients.clear();
493
+ await this.proxyAdapter?.close?.();
494
+ this.domainAvgResponseTime.clear();
466
495
  return {
467
496
  startUrl: normalizedStart,
468
497
  pages,
@@ -788,6 +817,9 @@ export class Spider {
788
817
  const isHighQualitySuccess = response.status < 400 && !detection.blocked && !hasCaptchaSignal;
789
818
  if (isHighQualitySuccess) {
790
819
  this.registerDomainSuccess(hostname);
820
+ if (timings?.total) {
821
+ this.updateAutoThrottle(hostname, timings.total);
822
+ }
791
823
  }
792
824
  if (!shouldRetry || attempt === maxAttempts - 1) {
793
825
  if (proxyUrl && this.proxyAdapter?.reportResult) {
@@ -1081,7 +1113,7 @@ export class Spider {
1081
1113
  if (!link.href)
1082
1114
  continue;
1083
1115
  const normalized = normalizeUrl(link.href);
1084
- if (!shouldCrawl(normalized, this.baseHost, this.options))
1116
+ if (!shouldCrawl(normalized, this.baseHost, this.options, this.baseRootDomain))
1085
1117
  continue;
1086
1118
  candidateUrls.push(normalized);
1087
1119
  candidates.push({ url: normalized, depth: item.depth + 1 });
@@ -1192,6 +1224,7 @@ export class Spider {
1192
1224
  consecutiveUndiciFailures: 0,
1193
1225
  lastTransport: 'undici',
1194
1226
  lastCaptchaConfidence: 0,
1227
+ autoThrottleDelay: 0,
1195
1228
  };
1196
1229
  this.domainStates.set(hostname, next);
1197
1230
  return next;
@@ -1294,7 +1327,9 @@ export class Spider {
1294
1327
  async waitForDomainPenalty(hostname) {
1295
1328
  const state = this.getOrCreateDomainState(hostname);
1296
1329
  const now = Date.now();
1297
- const delay = Math.max(state.penaltyUntil, state.challengeCooldownUntil) - now;
1330
+ const penaltyDelay = Math.max(state.penaltyUntil, state.challengeCooldownUntil) - now;
1331
+ const throttleDelay = state.autoThrottleDelay ?? 0;
1332
+ const delay = Math.max(penaltyDelay, throttleDelay);
1298
1333
  if (delay > 0) {
1299
1334
  await sleep(delay, this.abortController.signal);
1300
1335
  }
@@ -1322,6 +1357,20 @@ export class Spider {
1322
1357
  state.lastCaptchaProvider = undefined;
1323
1358
  }
1324
1359
  }
1360
+ updateAutoThrottle(hostname, responseTimeMs) {
1361
+ const config = this.options.autoThrottle;
1362
+ if (!config)
1363
+ return;
1364
+ const target = (typeof config === 'object' ? config.targetMs : undefined) ?? 500;
1365
+ const minDelay = (typeof config === 'object' ? config.minDelay : undefined) ?? 50;
1366
+ const maxDelay = (typeof config === 'object' ? config.maxDelay : undefined) ?? 30000;
1367
+ const prev = this.domainAvgResponseTime.get(hostname) ?? responseTimeMs;
1368
+ const avg = prev * 0.7 + responseTimeMs * 0.3;
1369
+ this.domainAvgResponseTime.set(hostname, avg);
1370
+ const ratio = avg / target;
1371
+ const state = this.getOrCreateDomainState(hostname);
1372
+ state.autoThrottleDelay = Math.max(minDelay, Math.min(maxDelay, Math.round(this.options.delay * ratio)));
1373
+ }
1325
1374
  getCaptchaRetryMultiplier(provider) {
1326
1375
  if (!provider)
1327
1376
  return 1.2;
@@ -141,7 +141,8 @@ export class SeoSpider {
141
141
  results.humans.content = await res.text();
142
142
  }
143
143
  }
144
- catch { }
144
+ catch {
145
+ }
145
146
  try {
146
147
  const res = await client.get(results.llms.url);
147
148
  if (res.status === 200) {
@@ -149,7 +150,8 @@ export class SeoSpider {
149
150
  results.llms.content = await res.text();
150
151
  }
151
152
  }
152
- catch { }
153
+ catch {
154
+ }
153
155
  try {
154
156
  const res = await client.get(results.sitemap.url);
155
157
  if (res.status === 200) {
@@ -159,7 +161,8 @@ export class SeoSpider {
159
161
  results.sitemap.urlCount = urlMatches ? urlMatches.length : 0;
160
162
  }
161
163
  }
162
- catch { }
164
+ catch {
165
+ }
163
166
  try {
164
167
  let res = await client.get(results.manifest.url);
165
168
  if (res.status !== 200) {
@@ -181,7 +184,8 @@ export class SeoSpider {
181
184
  }
182
185
  }
183
186
  }
184
- catch { }
187
+ catch {
188
+ }
185
189
  return results;
186
190
  }
187
191
  catch {
@@ -25,7 +25,8 @@ export async function discoverFeeds(baseUrl, html) {
25
25
  href = new URL(href, baseUrl).toString();
26
26
  candidateUrls.add(href);
27
27
  }
28
- catch { }
28
+ catch {
29
+ }
29
30
  }
30
31
  }
31
32
  }
@@ -35,7 +36,8 @@ export async function discoverFeeds(baseUrl, html) {
35
36
  const url = new URL(path, baseUrl).toString();
36
37
  candidateUrls.add(url);
37
38
  }
38
- catch { }
39
+ catch {
40
+ }
39
41
  }
40
42
  }
41
43
  const client = createClient({ timeout: 8000 });
@@ -526,18 +526,42 @@ export function detectBlock(response, body) {
526
526
  if (body) {
527
527
  const isLongBody = body.length > 100_000;
528
528
  const checkBody = isLongBody ? body.slice(0, 30_000) : body;
529
- const challengeHint = /just a moment|attention required|access denied|checking your browser|security check|human verification|suspicious activity|captcha|recaptcha|hcaptcha|turnstile|datadome|cloudflare/i.test(checkBody.slice(0, 8000));
530
- for (const { pattern, reason, confidence, description } of BLOCK_PATTERNS) {
531
- if (isLongBody && response.status === 200 && confidence < 0.85 && !challengeHint) {
532
- continue;
533
- }
534
- if (pattern.test(checkBody)) {
535
- results.push({
536
- blocked: true,
537
- reason,
538
- confidence,
539
- description,
540
- });
529
+ const lowerBody = checkBody.slice(0, 8000).toLowerCase();
530
+ const hasAnyChallengeKeyword = lowerBody.includes('captcha') ||
531
+ lowerBody.includes('cloudflare') ||
532
+ lowerBody.includes('datadome') ||
533
+ lowerBody.includes('blocked') ||
534
+ lowerBody.includes('denied') ||
535
+ lowerBody.includes('forbidden') ||
536
+ lowerBody.includes('too many') ||
537
+ lowerBody.includes('rate limit') ||
538
+ lowerBody.includes('checking your browser') ||
539
+ lowerBody.includes('just a moment') ||
540
+ lowerBody.includes('security check') ||
541
+ lowerBody.includes('human verification') ||
542
+ lowerBody.includes('perimeterx') ||
543
+ lowerBody.includes('incapsula') ||
544
+ lowerBody.includes('imperva') ||
545
+ lowerBody.includes('akamai') ||
546
+ lowerBody.includes('httpservice') ||
547
+ lowerBody.includes('enablejs') ||
548
+ lowerBody.includes('verify') ||
549
+ lowerBody.includes('suspicious') ||
550
+ lowerBody.includes('bot') ||
551
+ lowerBody.includes('access denied');
552
+ if (hasAnyChallengeKeyword || response.status !== 200) {
553
+ for (const { pattern, reason, confidence, description } of BLOCK_PATTERNS) {
554
+ if (isLongBody && response.status === 200 && confidence < 0.85 && !hasAnyChallengeKeyword) {
555
+ continue;
556
+ }
557
+ if (pattern.test(checkBody)) {
558
+ results.push({
559
+ blocked: true,
560
+ reason,
561
+ confidence,
562
+ description,
563
+ });
564
+ }
541
565
  }
542
566
  }
543
567
  if (body.length < 5000 &&
@@ -603,8 +627,25 @@ export function detectCaptcha(response, body) {
603
627
  if (body) {
604
628
  const checkBody = body.length < 120_000 ? body : body.slice(0, 120_000);
605
629
  const isTinyBody = checkBody.length > 0 && checkBody.length < 12000;
630
+ const lowerSnippet = checkBody.slice(0, 8000).toLowerCase();
631
+ const hasAnyCaptchaKeyword = lowerSnippet.includes('captcha') ||
632
+ lowerSnippet.includes('recaptcha') ||
633
+ lowerSnippet.includes('hcaptcha') ||
634
+ lowerSnippet.includes('turnstile') ||
635
+ lowerSnippet.includes('challenge') ||
636
+ lowerSnippet.includes('cloudflare') ||
637
+ lowerSnippet.includes('datadome') ||
638
+ lowerSnippet.includes('perimeterx') ||
639
+ lowerSnippet.includes('funcaptcha') ||
640
+ lowerSnippet.includes('arkose') ||
641
+ lowerSnippet.includes('sitekey') ||
642
+ lowerSnippet.includes('just a moment') ||
643
+ lowerSnippet.includes('human verification');
606
644
  const hasHtmlTags = /<html|<head|<body|<script|<meta/i.test(checkBody);
607
- const challengeTitleOrText = /just a moment|attention required|human verification|verify you are human|security check/i.test(checkBody);
645
+ const challengeTitleOrText = hasAnyCaptchaKeyword ||
646
+ lowerSnippet.includes('attention required') ||
647
+ lowerSnippet.includes('verify you are human') ||
648
+ lowerSnippet.includes('security check');
608
649
  if (isTinyBody && challengeTitleOrText && hasHtmlTags) {
609
650
  addMatch(matches, 'generic', 0.6, 'Tiny HTML response with challenge-like text');
610
651
  }
@@ -616,24 +657,38 @@ export function detectCaptcha(response, body) {
616
657
  addMatch(matches, provider, confidence, description);
617
658
  }
618
659
  }
619
- for (const { pattern, provider, confidence, description } of CAPTCHA_SCRIPT_PATTERNS) {
620
- if (pattern.test(checkBody)) {
621
- addMatch(matches, provider, confidence, description);
660
+ if (hasAnyCaptchaKeyword || !(response.status >= 200 && response.status < 300)) {
661
+ for (const { pattern, provider, confidence, description } of CAPTCHA_SCRIPT_PATTERNS) {
662
+ if (pattern.test(checkBody)) {
663
+ addMatch(matches, provider, confidence, description);
664
+ }
622
665
  }
623
- }
624
- for (const { pattern, provider, confidence, description } of CAPTCHA_FORM_PATTERNS) {
625
- if (pattern.test(checkBody)) {
626
- addMatch(matches, provider, confidence, description);
666
+ for (const { pattern, provider, confidence, description } of CAPTCHA_FORM_PATTERNS) {
667
+ if (pattern.test(checkBody)) {
668
+ addMatch(matches, provider, confidence, description);
669
+ }
627
670
  }
628
- }
629
- for (const { pattern, provider, confidence, description } of CAPTCHA_DOM_MARKERS) {
630
- if (pattern.test(checkBody)) {
631
- addMatch(matches, provider, confidence, description);
671
+ for (const { pattern, provider, confidence, description } of CAPTCHA_DOM_MARKERS) {
672
+ if (pattern.test(checkBody)) {
673
+ addMatch(matches, provider, confidence, description);
674
+ }
632
675
  }
633
- }
634
- for (const { pattern, provider, confidence, description } of CAPTCHA_NOSCRIPT_PATTERNS) {
635
- if (pattern.test(checkBody)) {
636
- addMatch(matches, provider, confidence, description);
676
+ for (const { pattern, provider, confidence, description } of CAPTCHA_NOSCRIPT_PATTERNS) {
677
+ if (pattern.test(checkBody)) {
678
+ addMatch(matches, provider, confidence, description);
679
+ }
680
+ }
681
+ for (const { pattern, provider, confidence, description } of CAPTCHA_PATTERNS) {
682
+ if (pattern.test(checkBody)) {
683
+ addMatch(matches, provider, confidence, description);
684
+ }
685
+ }
686
+ if (/<script[^>]*src=["'][^"']*(?:challenges\.cloudflare\.com|recaptcha\/api\.js|hcaptcha\.com\/1\/api\.js|challenges\.perimeterx\.)[^"']*["'][^>]*>/i.test(checkBody.slice(0, 120_000))) {
687
+ addMatch(matches, 'cloudflare', 0.6, 'Known challenge script reference found in HTML');
688
+ }
689
+ if ((response.status >= 500 && response.status < 600) &&
690
+ /cloudflare|challenge|verification|blocked|security/i.test(checkBody.slice(0, 3000))) {
691
+ addMatch(matches, 'generic', 0.2, 'Server challenge-like text on error response');
637
692
  }
638
693
  }
639
694
  if (response.status === 403 || response.status === 503) {
@@ -653,18 +708,6 @@ export function detectCaptcha(response, body) {
653
708
  /refresh|challenge|verify|captcha|javascript challenge|cloudflare|bot/i.test((location || ''))) {
654
709
  addMatch(matches, 'generic', 0.67, 'Redirect location indicates bot challenge flow');
655
710
  }
656
- if (/<script[^>]*src=["'][^"']*(?:challenges\.cloudflare\.com|recaptcha\/api\.js|hcaptcha\.com\/1\/api\.js|challenges\.perimeterx\.)[^"']*["'][^>]*>/i.test(checkBody.slice(0, 120_000))) {
657
- addMatch(matches, 'cloudflare', 0.6, 'Known challenge script reference found in HTML');
658
- }
659
- if ((response.status >= 500 && response.status < 600) &&
660
- /cloudflare|challenge|verification|blocked|security/i.test(checkBody.slice(0, 3000))) {
661
- addMatch(matches, 'generic', 0.2, 'Server challenge-like text on error response');
662
- }
663
- for (const { pattern, provider, confidence, description } of CAPTCHA_PATTERNS) {
664
- if (pattern.test(checkBody)) {
665
- addMatch(matches, provider, confidence, description);
666
- }
667
- }
668
711
  }
669
712
  const result = scoreByProvider(matches);
670
713
  if (result.confidence >= 0.5 || hasCaptchaHeader) {
@@ -681,23 +724,33 @@ export function detectCaptcha(response, body) {
681
724
  };
682
725
  }
683
726
  export function isProtectedDomain(hostname) {
684
- const protectedPatterns = [
685
- /cloudflare/i,
686
- /\.gov$/i,
687
- /\.mil$/i,
688
- /linkedin\.com$/i,
689
- /twitter\.com$/i,
690
- /x\.com$/i,
691
- /instagram\.com$/i,
692
- /facebook\.com$/i,
693
- /amazon\./i,
694
- /google\./i,
695
- /microsoft\.com$/i,
696
- /apple\.com$/i,
697
- /netflix\.com$/i,
698
- /spotify\.com$/i,
727
+ const h = hostname.toLowerCase();
728
+ const protectedDomains = [
729
+ 'linkedin.com',
730
+ 'twitter.com',
731
+ 'x.com',
732
+ 'instagram.com',
733
+ 'facebook.com',
734
+ 'amazon.com',
735
+ 'amazon.co.uk',
736
+ 'amazon.de',
737
+ 'amazon.co.jp',
738
+ 'google.com',
739
+ 'microsoft.com',
740
+ 'apple.com',
741
+ 'netflix.com',
742
+ 'spotify.com',
699
743
  ];
700
- return protectedPatterns.some(p => p.test(hostname));
744
+ const protectedTlds = ['.gov', '.mil'];
745
+ for (const domain of protectedDomains) {
746
+ if (h === domain || h.endsWith('.' + domain))
747
+ return true;
748
+ }
749
+ for (const tld of protectedTlds) {
750
+ if (h.endsWith(tld))
751
+ return true;
752
+ }
753
+ return false;
701
754
  }
702
755
  export function isCloudflareChallenge(response, body) {
703
756
  const cfRay = response.headers.get('cf-ray');
package/dist/version.js CHANGED
@@ -1,4 +1,4 @@
1
- const VERSION = '1.0.95';
1
+ const VERSION = '1.0.96';
2
2
  let _version = null;
3
3
  export async function getVersion() {
4
4
  if (_version)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "recker",
3
- "version": "1.0.95",
3
+ "version": "1.0.96",
4
4
  "description": "Multi-Protocol SDK for the AI Era - HTTP, WebSocket, DNS, FTP, SFTP, Telnet, HLS unified with AI providers and MCP tools",
5
5
  "main": "./dist/index.js",
6
6
  "types": "./dist/index.d.ts",