recker 1.0.95 → 1.0.96
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/scrape/crawl-queue.d.ts +2 -0
- package/dist/browser/scrape/crawl-queue.js +14 -0
- package/dist/browser/scrape/spider.d.ts +10 -1
- package/dist/browser/scrape/spider.js +55 -6
- package/dist/browser/seo/seo-spider.js +8 -4
- package/dist/browser/seo/validators/rss.js +4 -2
- package/dist/browser/utils/block-detector.js +109 -56
- package/dist/scrape/crawl-queue.d.ts +2 -0
- package/dist/scrape/crawl-queue.js +14 -0
- package/dist/scrape/spider.d.ts +10 -1
- package/dist/scrape/spider.js +55 -6
- package/dist/seo/seo-spider.js +8 -4
- package/dist/seo/validators/rss.js +4 -2
- package/dist/utils/block-detector.js +109 -56
- package/dist/version.js +1 -1
- package/package.json +1 -1
|
@@ -20,6 +20,8 @@ export declare class InMemoryCrawlQueue implements CrawlQueueAdapter {
|
|
|
20
20
|
private head;
|
|
21
21
|
private tail;
|
|
22
22
|
private visited;
|
|
23
|
+
private mode;
|
|
24
|
+
constructor(mode?: 'fifo' | 'lifo');
|
|
23
25
|
push(item: CrawlQueueItem): Promise<void>;
|
|
24
26
|
pushBatch(items: CrawlQueueItem[]): Promise<void>;
|
|
25
27
|
pop(): Promise<CrawlQueueItem | null>;
|
|
@@ -3,6 +3,10 @@ export class InMemoryCrawlQueue {
|
|
|
3
3
|
head = 0;
|
|
4
4
|
tail = 0;
|
|
5
5
|
visited = new Set();
|
|
6
|
+
mode;
|
|
7
|
+
constructor(mode = 'fifo') {
|
|
8
|
+
this.mode = mode;
|
|
9
|
+
}
|
|
6
10
|
async push(item) {
|
|
7
11
|
this.queue[this.tail++] = item;
|
|
8
12
|
}
|
|
@@ -12,6 +16,16 @@ export class InMemoryCrawlQueue {
|
|
|
12
16
|
}
|
|
13
17
|
}
|
|
14
18
|
async pop() {
|
|
19
|
+
if (this.mode === 'lifo') {
|
|
20
|
+
while (this.tail > this.head) {
|
|
21
|
+
this.tail--;
|
|
22
|
+
const item = this.queue[this.tail];
|
|
23
|
+
this.queue[this.tail] = undefined;
|
|
24
|
+
if (item)
|
|
25
|
+
return item;
|
|
26
|
+
}
|
|
27
|
+
return null;
|
|
28
|
+
}
|
|
15
29
|
while (this.head < this.tail) {
|
|
16
30
|
const item = this.queue[this.head];
|
|
17
31
|
this.queue[this.head] = undefined;
|
|
@@ -11,7 +11,7 @@ type CaptchaProvider = CaptchaDetectionResult['provider'];
|
|
|
11
11
|
export interface SpiderOptions {
|
|
12
12
|
maxDepth?: number;
|
|
13
13
|
maxPages?: number;
|
|
14
|
-
sameDomain?: boolean;
|
|
14
|
+
sameDomain?: boolean | 'exact' | 'subdomain';
|
|
15
15
|
concurrency?: number;
|
|
16
16
|
timeout?: number;
|
|
17
17
|
delay?: number;
|
|
@@ -63,8 +63,14 @@ export interface SpiderOptions {
|
|
|
63
63
|
domainRateLimit?: {
|
|
64
64
|
maxPerSecond?: number;
|
|
65
65
|
};
|
|
66
|
+
autoThrottle?: boolean | {
|
|
67
|
+
targetMs?: number;
|
|
68
|
+
minDelay?: number;
|
|
69
|
+
maxDelay?: number;
|
|
70
|
+
};
|
|
66
71
|
deduplicateContent?: boolean;
|
|
67
72
|
resume?: boolean;
|
|
73
|
+
strategy?: 'bfs' | 'dfs';
|
|
68
74
|
crawlQueue?: CrawlQueueAdapter;
|
|
69
75
|
crawlStorage?: CrawlStorageAdapter;
|
|
70
76
|
}
|
|
@@ -190,12 +196,14 @@ export declare class Spider {
|
|
|
190
196
|
private _queueSize;
|
|
191
197
|
private _resultCount;
|
|
192
198
|
private baseHost;
|
|
199
|
+
private baseRootDomain;
|
|
193
200
|
private running;
|
|
194
201
|
private aborted;
|
|
195
202
|
private abortController;
|
|
196
203
|
private pendingCount;
|
|
197
204
|
private domainRequestTimestamps;
|
|
198
205
|
private contentHashes;
|
|
206
|
+
private domainAvgResponseTime;
|
|
199
207
|
private blockedDomains;
|
|
200
208
|
private curlTransport;
|
|
201
209
|
private curlAvailable;
|
|
@@ -224,6 +232,7 @@ export declare class Spider {
|
|
|
224
232
|
private waitForDomainPenalty;
|
|
225
233
|
private registerDomainBlock;
|
|
226
234
|
private registerDomainSuccess;
|
|
235
|
+
private updateAutoThrottle;
|
|
227
236
|
private getCaptchaRetryMultiplier;
|
|
228
237
|
private registerDomainChallenge;
|
|
229
238
|
private getRetryWait;
|
|
@@ -108,15 +108,30 @@ function normalizeUrl(urlStr) {
|
|
|
108
108
|
return urlStr;
|
|
109
109
|
}
|
|
110
110
|
}
|
|
111
|
-
function
|
|
111
|
+
function getRootDomain(hostname) {
|
|
112
|
+
const parts = hostname.replace(/^www\./, '').split('.');
|
|
113
|
+
if (parts.length >= 3 && parts[parts.length - 2].length <= 3) {
|
|
114
|
+
return parts.slice(-3).join('.');
|
|
115
|
+
}
|
|
116
|
+
return parts.slice(-2).join('.');
|
|
117
|
+
}
|
|
118
|
+
function shouldCrawl(url, baseHost, options, baseRootDomain) {
|
|
112
119
|
try {
|
|
113
120
|
const parsed = new URL(url);
|
|
114
121
|
if (!['http:', 'https:'].includes(parsed.protocol)) {
|
|
115
122
|
return false;
|
|
116
123
|
}
|
|
117
124
|
const hostname = parsed.hostname.replace(/^www\./, '');
|
|
118
|
-
|
|
119
|
-
|
|
125
|
+
const sameDomain = options.sameDomain;
|
|
126
|
+
if (sameDomain === 'subdomain') {
|
|
127
|
+
const pageRoot = getRootDomain(hostname);
|
|
128
|
+
const rootDomain = baseRootDomain ?? getRootDomain(baseHost);
|
|
129
|
+
if (pageRoot !== rootDomain)
|
|
130
|
+
return false;
|
|
131
|
+
}
|
|
132
|
+
else if (sameDomain !== false) {
|
|
133
|
+
if (hostname !== baseHost)
|
|
134
|
+
return false;
|
|
120
135
|
}
|
|
121
136
|
const skipExtensions = [
|
|
122
137
|
'.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico',
|
|
@@ -193,12 +208,14 @@ export class Spider {
|
|
|
193
208
|
_queueSize = 0;
|
|
194
209
|
_resultCount = 0;
|
|
195
210
|
baseHost = '';
|
|
211
|
+
baseRootDomain = '';
|
|
196
212
|
running = false;
|
|
197
213
|
aborted = false;
|
|
198
214
|
abortController = new AbortController();
|
|
199
215
|
pendingCount = 0;
|
|
200
216
|
domainRequestTimestamps = new Map();
|
|
201
217
|
contentHashes = new Map();
|
|
218
|
+
domainAvgResponseTime = new Map();
|
|
202
219
|
blockedDomains = new Set();
|
|
203
220
|
curlTransport = null;
|
|
204
221
|
curlAvailable = false;
|
|
@@ -285,8 +302,10 @@ export class Spider {
|
|
|
285
302
|
extract: extractSchema,
|
|
286
303
|
parserOptions: options.parserOptions,
|
|
287
304
|
domainRateLimit: options.domainRateLimit,
|
|
305
|
+
autoThrottle: options.autoThrottle ?? false,
|
|
288
306
|
deduplicateContent: options.deduplicateContent ?? false,
|
|
289
307
|
resume: options.resume ?? false,
|
|
308
|
+
strategy: options.strategy ?? 'bfs',
|
|
290
309
|
};
|
|
291
310
|
if (options.proxy) {
|
|
292
311
|
if (typeof options.proxy === 'string') {
|
|
@@ -316,7 +335,7 @@ export class Spider {
|
|
|
316
335
|
interval: this.options.delay,
|
|
317
336
|
} : {}),
|
|
318
337
|
});
|
|
319
|
-
this.crawlQueue = options.crawlQueue ?? new InMemoryCrawlQueue();
|
|
338
|
+
this.crawlQueue = options.crawlQueue ?? new InMemoryCrawlQueue(this.options.strategy === 'dfs' ? 'lifo' : 'fifo');
|
|
320
339
|
this.crawlStorage = options.crawlStorage ?? new InMemoryCrawlStorage();
|
|
321
340
|
}
|
|
322
341
|
async crawl(startUrl) {
|
|
@@ -325,6 +344,7 @@ export class Spider {
|
|
|
325
344
|
const normalizedStart = normalizeUrl(startUrl);
|
|
326
345
|
const baseUrl = new URL(normalizedStart).origin;
|
|
327
346
|
this.baseHost = new URL(normalizedStart).hostname.replace(/^www\./, '');
|
|
347
|
+
this.baseRootDomain = getRootDomain(this.baseHost);
|
|
328
348
|
if (!this.options.resume) {
|
|
329
349
|
await this.crawlQueue.clear();
|
|
330
350
|
await this.crawlStorage.clear();
|
|
@@ -333,6 +353,7 @@ export class Spider {
|
|
|
333
353
|
this._resultCount = 0;
|
|
334
354
|
this.domainRequestTimestamps.clear();
|
|
335
355
|
this.contentHashes.clear();
|
|
356
|
+
this.domainAvgResponseTime.clear();
|
|
336
357
|
}
|
|
337
358
|
else {
|
|
338
359
|
this._queueSize = await this.crawlQueue.size();
|
|
@@ -463,6 +484,14 @@ export class Spider {
|
|
|
463
484
|
: new Set(pages.map(r => r.url));
|
|
464
485
|
await this.crawlQueue.close?.();
|
|
465
486
|
await this.crawlStorage.close?.();
|
|
487
|
+
for (const client of this.proxyClients.values()) {
|
|
488
|
+
if (typeof client.destroy === 'function') {
|
|
489
|
+
client.destroy();
|
|
490
|
+
}
|
|
491
|
+
}
|
|
492
|
+
this.proxyClients.clear();
|
|
493
|
+
await this.proxyAdapter?.close?.();
|
|
494
|
+
this.domainAvgResponseTime.clear();
|
|
466
495
|
return {
|
|
467
496
|
startUrl: normalizedStart,
|
|
468
497
|
pages,
|
|
@@ -788,6 +817,9 @@ export class Spider {
|
|
|
788
817
|
const isHighQualitySuccess = response.status < 400 && !detection.blocked && !hasCaptchaSignal;
|
|
789
818
|
if (isHighQualitySuccess) {
|
|
790
819
|
this.registerDomainSuccess(hostname);
|
|
820
|
+
if (timings?.total) {
|
|
821
|
+
this.updateAutoThrottle(hostname, timings.total);
|
|
822
|
+
}
|
|
791
823
|
}
|
|
792
824
|
if (!shouldRetry || attempt === maxAttempts - 1) {
|
|
793
825
|
if (proxyUrl && this.proxyAdapter?.reportResult) {
|
|
@@ -1081,7 +1113,7 @@ export class Spider {
|
|
|
1081
1113
|
if (!link.href)
|
|
1082
1114
|
continue;
|
|
1083
1115
|
const normalized = normalizeUrl(link.href);
|
|
1084
|
-
if (!shouldCrawl(normalized, this.baseHost, this.options))
|
|
1116
|
+
if (!shouldCrawl(normalized, this.baseHost, this.options, this.baseRootDomain))
|
|
1085
1117
|
continue;
|
|
1086
1118
|
candidateUrls.push(normalized);
|
|
1087
1119
|
candidates.push({ url: normalized, depth: item.depth + 1 });
|
|
@@ -1192,6 +1224,7 @@ export class Spider {
|
|
|
1192
1224
|
consecutiveUndiciFailures: 0,
|
|
1193
1225
|
lastTransport: 'undici',
|
|
1194
1226
|
lastCaptchaConfidence: 0,
|
|
1227
|
+
autoThrottleDelay: 0,
|
|
1195
1228
|
};
|
|
1196
1229
|
this.domainStates.set(hostname, next);
|
|
1197
1230
|
return next;
|
|
@@ -1294,7 +1327,9 @@ export class Spider {
|
|
|
1294
1327
|
async waitForDomainPenalty(hostname) {
|
|
1295
1328
|
const state = this.getOrCreateDomainState(hostname);
|
|
1296
1329
|
const now = Date.now();
|
|
1297
|
-
const
|
|
1330
|
+
const penaltyDelay = Math.max(state.penaltyUntil, state.challengeCooldownUntil) - now;
|
|
1331
|
+
const throttleDelay = state.autoThrottleDelay ?? 0;
|
|
1332
|
+
const delay = Math.max(penaltyDelay, throttleDelay);
|
|
1298
1333
|
if (delay > 0) {
|
|
1299
1334
|
await sleep(delay, this.abortController.signal);
|
|
1300
1335
|
}
|
|
@@ -1322,6 +1357,20 @@ export class Spider {
|
|
|
1322
1357
|
state.lastCaptchaProvider = undefined;
|
|
1323
1358
|
}
|
|
1324
1359
|
}
|
|
1360
|
+
updateAutoThrottle(hostname, responseTimeMs) {
|
|
1361
|
+
const config = this.options.autoThrottle;
|
|
1362
|
+
if (!config)
|
|
1363
|
+
return;
|
|
1364
|
+
const target = (typeof config === 'object' ? config.targetMs : undefined) ?? 500;
|
|
1365
|
+
const minDelay = (typeof config === 'object' ? config.minDelay : undefined) ?? 50;
|
|
1366
|
+
const maxDelay = (typeof config === 'object' ? config.maxDelay : undefined) ?? 30000;
|
|
1367
|
+
const prev = this.domainAvgResponseTime.get(hostname) ?? responseTimeMs;
|
|
1368
|
+
const avg = prev * 0.7 + responseTimeMs * 0.3;
|
|
1369
|
+
this.domainAvgResponseTime.set(hostname, avg);
|
|
1370
|
+
const ratio = avg / target;
|
|
1371
|
+
const state = this.getOrCreateDomainState(hostname);
|
|
1372
|
+
state.autoThrottleDelay = Math.max(minDelay, Math.min(maxDelay, Math.round(this.options.delay * ratio)));
|
|
1373
|
+
}
|
|
1325
1374
|
getCaptchaRetryMultiplier(provider) {
|
|
1326
1375
|
if (!provider)
|
|
1327
1376
|
return 1.2;
|
|
@@ -141,7 +141,8 @@ export class SeoSpider {
|
|
|
141
141
|
results.humans.content = await res.text();
|
|
142
142
|
}
|
|
143
143
|
}
|
|
144
|
-
catch {
|
|
144
|
+
catch {
|
|
145
|
+
}
|
|
145
146
|
try {
|
|
146
147
|
const res = await client.get(results.llms.url);
|
|
147
148
|
if (res.status === 200) {
|
|
@@ -149,7 +150,8 @@ export class SeoSpider {
|
|
|
149
150
|
results.llms.content = await res.text();
|
|
150
151
|
}
|
|
151
152
|
}
|
|
152
|
-
catch {
|
|
153
|
+
catch {
|
|
154
|
+
}
|
|
153
155
|
try {
|
|
154
156
|
const res = await client.get(results.sitemap.url);
|
|
155
157
|
if (res.status === 200) {
|
|
@@ -159,7 +161,8 @@ export class SeoSpider {
|
|
|
159
161
|
results.sitemap.urlCount = urlMatches ? urlMatches.length : 0;
|
|
160
162
|
}
|
|
161
163
|
}
|
|
162
|
-
catch {
|
|
164
|
+
catch {
|
|
165
|
+
}
|
|
163
166
|
try {
|
|
164
167
|
let res = await client.get(results.manifest.url);
|
|
165
168
|
if (res.status !== 200) {
|
|
@@ -181,7 +184,8 @@ export class SeoSpider {
|
|
|
181
184
|
}
|
|
182
185
|
}
|
|
183
186
|
}
|
|
184
|
-
catch {
|
|
187
|
+
catch {
|
|
188
|
+
}
|
|
185
189
|
return results;
|
|
186
190
|
}
|
|
187
191
|
catch {
|
|
@@ -25,7 +25,8 @@ export async function discoverFeeds(baseUrl, html) {
|
|
|
25
25
|
href = new URL(href, baseUrl).toString();
|
|
26
26
|
candidateUrls.add(href);
|
|
27
27
|
}
|
|
28
|
-
catch {
|
|
28
|
+
catch {
|
|
29
|
+
}
|
|
29
30
|
}
|
|
30
31
|
}
|
|
31
32
|
}
|
|
@@ -35,7 +36,8 @@ export async function discoverFeeds(baseUrl, html) {
|
|
|
35
36
|
const url = new URL(path, baseUrl).toString();
|
|
36
37
|
candidateUrls.add(url);
|
|
37
38
|
}
|
|
38
|
-
catch {
|
|
39
|
+
catch {
|
|
40
|
+
}
|
|
39
41
|
}
|
|
40
42
|
}
|
|
41
43
|
const client = createClient({ timeout: 8000 });
|
|
@@ -526,18 +526,42 @@ export function detectBlock(response, body) {
|
|
|
526
526
|
if (body) {
|
|
527
527
|
const isLongBody = body.length > 100_000;
|
|
528
528
|
const checkBody = isLongBody ? body.slice(0, 30_000) : body;
|
|
529
|
-
const
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
529
|
+
const lowerBody = checkBody.slice(0, 8000).toLowerCase();
|
|
530
|
+
const hasAnyChallengeKeyword = lowerBody.includes('captcha') ||
|
|
531
|
+
lowerBody.includes('cloudflare') ||
|
|
532
|
+
lowerBody.includes('datadome') ||
|
|
533
|
+
lowerBody.includes('blocked') ||
|
|
534
|
+
lowerBody.includes('denied') ||
|
|
535
|
+
lowerBody.includes('forbidden') ||
|
|
536
|
+
lowerBody.includes('too many') ||
|
|
537
|
+
lowerBody.includes('rate limit') ||
|
|
538
|
+
lowerBody.includes('checking your browser') ||
|
|
539
|
+
lowerBody.includes('just a moment') ||
|
|
540
|
+
lowerBody.includes('security check') ||
|
|
541
|
+
lowerBody.includes('human verification') ||
|
|
542
|
+
lowerBody.includes('perimeterx') ||
|
|
543
|
+
lowerBody.includes('incapsula') ||
|
|
544
|
+
lowerBody.includes('imperva') ||
|
|
545
|
+
lowerBody.includes('akamai') ||
|
|
546
|
+
lowerBody.includes('httpservice') ||
|
|
547
|
+
lowerBody.includes('enablejs') ||
|
|
548
|
+
lowerBody.includes('verify') ||
|
|
549
|
+
lowerBody.includes('suspicious') ||
|
|
550
|
+
lowerBody.includes('bot') ||
|
|
551
|
+
lowerBody.includes('access denied');
|
|
552
|
+
if (hasAnyChallengeKeyword || response.status !== 200) {
|
|
553
|
+
for (const { pattern, reason, confidence, description } of BLOCK_PATTERNS) {
|
|
554
|
+
if (isLongBody && response.status === 200 && confidence < 0.85 && !hasAnyChallengeKeyword) {
|
|
555
|
+
continue;
|
|
556
|
+
}
|
|
557
|
+
if (pattern.test(checkBody)) {
|
|
558
|
+
results.push({
|
|
559
|
+
blocked: true,
|
|
560
|
+
reason,
|
|
561
|
+
confidence,
|
|
562
|
+
description,
|
|
563
|
+
});
|
|
564
|
+
}
|
|
541
565
|
}
|
|
542
566
|
}
|
|
543
567
|
if (body.length < 5000 &&
|
|
@@ -603,8 +627,25 @@ export function detectCaptcha(response, body) {
|
|
|
603
627
|
if (body) {
|
|
604
628
|
const checkBody = body.length < 120_000 ? body : body.slice(0, 120_000);
|
|
605
629
|
const isTinyBody = checkBody.length > 0 && checkBody.length < 12000;
|
|
630
|
+
const lowerSnippet = checkBody.slice(0, 8000).toLowerCase();
|
|
631
|
+
const hasAnyCaptchaKeyword = lowerSnippet.includes('captcha') ||
|
|
632
|
+
lowerSnippet.includes('recaptcha') ||
|
|
633
|
+
lowerSnippet.includes('hcaptcha') ||
|
|
634
|
+
lowerSnippet.includes('turnstile') ||
|
|
635
|
+
lowerSnippet.includes('challenge') ||
|
|
636
|
+
lowerSnippet.includes('cloudflare') ||
|
|
637
|
+
lowerSnippet.includes('datadome') ||
|
|
638
|
+
lowerSnippet.includes('perimeterx') ||
|
|
639
|
+
lowerSnippet.includes('funcaptcha') ||
|
|
640
|
+
lowerSnippet.includes('arkose') ||
|
|
641
|
+
lowerSnippet.includes('sitekey') ||
|
|
642
|
+
lowerSnippet.includes('just a moment') ||
|
|
643
|
+
lowerSnippet.includes('human verification');
|
|
606
644
|
const hasHtmlTags = /<html|<head|<body|<script|<meta/i.test(checkBody);
|
|
607
|
-
const challengeTitleOrText =
|
|
645
|
+
const challengeTitleOrText = hasAnyCaptchaKeyword ||
|
|
646
|
+
lowerSnippet.includes('attention required') ||
|
|
647
|
+
lowerSnippet.includes('verify you are human') ||
|
|
648
|
+
lowerSnippet.includes('security check');
|
|
608
649
|
if (isTinyBody && challengeTitleOrText && hasHtmlTags) {
|
|
609
650
|
addMatch(matches, 'generic', 0.6, 'Tiny HTML response with challenge-like text');
|
|
610
651
|
}
|
|
@@ -616,24 +657,38 @@ export function detectCaptcha(response, body) {
|
|
|
616
657
|
addMatch(matches, provider, confidence, description);
|
|
617
658
|
}
|
|
618
659
|
}
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
660
|
+
if (hasAnyCaptchaKeyword || !(response.status >= 200 && response.status < 300)) {
|
|
661
|
+
for (const { pattern, provider, confidence, description } of CAPTCHA_SCRIPT_PATTERNS) {
|
|
662
|
+
if (pattern.test(checkBody)) {
|
|
663
|
+
addMatch(matches, provider, confidence, description);
|
|
664
|
+
}
|
|
622
665
|
}
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
666
|
+
for (const { pattern, provider, confidence, description } of CAPTCHA_FORM_PATTERNS) {
|
|
667
|
+
if (pattern.test(checkBody)) {
|
|
668
|
+
addMatch(matches, provider, confidence, description);
|
|
669
|
+
}
|
|
627
670
|
}
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
671
|
+
for (const { pattern, provider, confidence, description } of CAPTCHA_DOM_MARKERS) {
|
|
672
|
+
if (pattern.test(checkBody)) {
|
|
673
|
+
addMatch(matches, provider, confidence, description);
|
|
674
|
+
}
|
|
632
675
|
}
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
676
|
+
for (const { pattern, provider, confidence, description } of CAPTCHA_NOSCRIPT_PATTERNS) {
|
|
677
|
+
if (pattern.test(checkBody)) {
|
|
678
|
+
addMatch(matches, provider, confidence, description);
|
|
679
|
+
}
|
|
680
|
+
}
|
|
681
|
+
for (const { pattern, provider, confidence, description } of CAPTCHA_PATTERNS) {
|
|
682
|
+
if (pattern.test(checkBody)) {
|
|
683
|
+
addMatch(matches, provider, confidence, description);
|
|
684
|
+
}
|
|
685
|
+
}
|
|
686
|
+
if (/<script[^>]*src=["'][^"']*(?:challenges\.cloudflare\.com|recaptcha\/api\.js|hcaptcha\.com\/1\/api\.js|challenges\.perimeterx\.)[^"']*["'][^>]*>/i.test(checkBody.slice(0, 120_000))) {
|
|
687
|
+
addMatch(matches, 'cloudflare', 0.6, 'Known challenge script reference found in HTML');
|
|
688
|
+
}
|
|
689
|
+
if ((response.status >= 500 && response.status < 600) &&
|
|
690
|
+
/cloudflare|challenge|verification|blocked|security/i.test(checkBody.slice(0, 3000))) {
|
|
691
|
+
addMatch(matches, 'generic', 0.2, 'Server challenge-like text on error response');
|
|
637
692
|
}
|
|
638
693
|
}
|
|
639
694
|
if (response.status === 403 || response.status === 503) {
|
|
@@ -653,18 +708,6 @@ export function detectCaptcha(response, body) {
|
|
|
653
708
|
/refresh|challenge|verify|captcha|javascript challenge|cloudflare|bot/i.test((location || ''))) {
|
|
654
709
|
addMatch(matches, 'generic', 0.67, 'Redirect location indicates bot challenge flow');
|
|
655
710
|
}
|
|
656
|
-
if (/<script[^>]*src=["'][^"']*(?:challenges\.cloudflare\.com|recaptcha\/api\.js|hcaptcha\.com\/1\/api\.js|challenges\.perimeterx\.)[^"']*["'][^>]*>/i.test(checkBody.slice(0, 120_000))) {
|
|
657
|
-
addMatch(matches, 'cloudflare', 0.6, 'Known challenge script reference found in HTML');
|
|
658
|
-
}
|
|
659
|
-
if ((response.status >= 500 && response.status < 600) &&
|
|
660
|
-
/cloudflare|challenge|verification|blocked|security/i.test(checkBody.slice(0, 3000))) {
|
|
661
|
-
addMatch(matches, 'generic', 0.2, 'Server challenge-like text on error response');
|
|
662
|
-
}
|
|
663
|
-
for (const { pattern, provider, confidence, description } of CAPTCHA_PATTERNS) {
|
|
664
|
-
if (pattern.test(checkBody)) {
|
|
665
|
-
addMatch(matches, provider, confidence, description);
|
|
666
|
-
}
|
|
667
|
-
}
|
|
668
711
|
}
|
|
669
712
|
const result = scoreByProvider(matches);
|
|
670
713
|
if (result.confidence >= 0.5 || hasCaptchaHeader) {
|
|
@@ -681,23 +724,33 @@ export function detectCaptcha(response, body) {
|
|
|
681
724
|
};
|
|
682
725
|
}
|
|
683
726
|
export function isProtectedDomain(hostname) {
|
|
684
|
-
const
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
727
|
+
const h = hostname.toLowerCase();
|
|
728
|
+
const protectedDomains = [
|
|
729
|
+
'linkedin.com',
|
|
730
|
+
'twitter.com',
|
|
731
|
+
'x.com',
|
|
732
|
+
'instagram.com',
|
|
733
|
+
'facebook.com',
|
|
734
|
+
'amazon.com',
|
|
735
|
+
'amazon.co.uk',
|
|
736
|
+
'amazon.de',
|
|
737
|
+
'amazon.co.jp',
|
|
738
|
+
'google.com',
|
|
739
|
+
'microsoft.com',
|
|
740
|
+
'apple.com',
|
|
741
|
+
'netflix.com',
|
|
742
|
+
'spotify.com',
|
|
699
743
|
];
|
|
700
|
-
|
|
744
|
+
const protectedTlds = ['.gov', '.mil'];
|
|
745
|
+
for (const domain of protectedDomains) {
|
|
746
|
+
if (h === domain || h.endsWith('.' + domain))
|
|
747
|
+
return true;
|
|
748
|
+
}
|
|
749
|
+
for (const tld of protectedTlds) {
|
|
750
|
+
if (h.endsWith(tld))
|
|
751
|
+
return true;
|
|
752
|
+
}
|
|
753
|
+
return false;
|
|
701
754
|
}
|
|
702
755
|
export function isCloudflareChallenge(response, body) {
|
|
703
756
|
const cfRay = response.headers.get('cf-ray');
|
|
@@ -20,6 +20,8 @@ export declare class InMemoryCrawlQueue implements CrawlQueueAdapter {
|
|
|
20
20
|
private head;
|
|
21
21
|
private tail;
|
|
22
22
|
private visited;
|
|
23
|
+
private mode;
|
|
24
|
+
constructor(mode?: 'fifo' | 'lifo');
|
|
23
25
|
push(item: CrawlQueueItem): Promise<void>;
|
|
24
26
|
pushBatch(items: CrawlQueueItem[]): Promise<void>;
|
|
25
27
|
pop(): Promise<CrawlQueueItem | null>;
|
|
@@ -3,6 +3,10 @@ export class InMemoryCrawlQueue {
|
|
|
3
3
|
head = 0;
|
|
4
4
|
tail = 0;
|
|
5
5
|
visited = new Set();
|
|
6
|
+
mode;
|
|
7
|
+
constructor(mode = 'fifo') {
|
|
8
|
+
this.mode = mode;
|
|
9
|
+
}
|
|
6
10
|
async push(item) {
|
|
7
11
|
this.queue[this.tail++] = item;
|
|
8
12
|
}
|
|
@@ -12,6 +16,16 @@ export class InMemoryCrawlQueue {
|
|
|
12
16
|
}
|
|
13
17
|
}
|
|
14
18
|
async pop() {
|
|
19
|
+
if (this.mode === 'lifo') {
|
|
20
|
+
while (this.tail > this.head) {
|
|
21
|
+
this.tail--;
|
|
22
|
+
const item = this.queue[this.tail];
|
|
23
|
+
this.queue[this.tail] = undefined;
|
|
24
|
+
if (item)
|
|
25
|
+
return item;
|
|
26
|
+
}
|
|
27
|
+
return null;
|
|
28
|
+
}
|
|
15
29
|
while (this.head < this.tail) {
|
|
16
30
|
const item = this.queue[this.head];
|
|
17
31
|
this.queue[this.head] = undefined;
|
package/dist/scrape/spider.d.ts
CHANGED
|
@@ -11,7 +11,7 @@ type CaptchaProvider = CaptchaDetectionResult['provider'];
|
|
|
11
11
|
export interface SpiderOptions {
|
|
12
12
|
maxDepth?: number;
|
|
13
13
|
maxPages?: number;
|
|
14
|
-
sameDomain?: boolean;
|
|
14
|
+
sameDomain?: boolean | 'exact' | 'subdomain';
|
|
15
15
|
concurrency?: number;
|
|
16
16
|
timeout?: number;
|
|
17
17
|
delay?: number;
|
|
@@ -63,8 +63,14 @@ export interface SpiderOptions {
|
|
|
63
63
|
domainRateLimit?: {
|
|
64
64
|
maxPerSecond?: number;
|
|
65
65
|
};
|
|
66
|
+
autoThrottle?: boolean | {
|
|
67
|
+
targetMs?: number;
|
|
68
|
+
minDelay?: number;
|
|
69
|
+
maxDelay?: number;
|
|
70
|
+
};
|
|
66
71
|
deduplicateContent?: boolean;
|
|
67
72
|
resume?: boolean;
|
|
73
|
+
strategy?: 'bfs' | 'dfs';
|
|
68
74
|
crawlQueue?: CrawlQueueAdapter;
|
|
69
75
|
crawlStorage?: CrawlStorageAdapter;
|
|
70
76
|
}
|
|
@@ -190,12 +196,14 @@ export declare class Spider {
|
|
|
190
196
|
private _queueSize;
|
|
191
197
|
private _resultCount;
|
|
192
198
|
private baseHost;
|
|
199
|
+
private baseRootDomain;
|
|
193
200
|
private running;
|
|
194
201
|
private aborted;
|
|
195
202
|
private abortController;
|
|
196
203
|
private pendingCount;
|
|
197
204
|
private domainRequestTimestamps;
|
|
198
205
|
private contentHashes;
|
|
206
|
+
private domainAvgResponseTime;
|
|
199
207
|
private blockedDomains;
|
|
200
208
|
private curlTransport;
|
|
201
209
|
private curlAvailable;
|
|
@@ -224,6 +232,7 @@ export declare class Spider {
|
|
|
224
232
|
private waitForDomainPenalty;
|
|
225
233
|
private registerDomainBlock;
|
|
226
234
|
private registerDomainSuccess;
|
|
235
|
+
private updateAutoThrottle;
|
|
227
236
|
private getCaptchaRetryMultiplier;
|
|
228
237
|
private registerDomainChallenge;
|
|
229
238
|
private getRetryWait;
|
package/dist/scrape/spider.js
CHANGED
|
@@ -108,15 +108,30 @@ function normalizeUrl(urlStr) {
|
|
|
108
108
|
return urlStr;
|
|
109
109
|
}
|
|
110
110
|
}
|
|
111
|
-
function
|
|
111
|
+
function getRootDomain(hostname) {
|
|
112
|
+
const parts = hostname.replace(/^www\./, '').split('.');
|
|
113
|
+
if (parts.length >= 3 && parts[parts.length - 2].length <= 3) {
|
|
114
|
+
return parts.slice(-3).join('.');
|
|
115
|
+
}
|
|
116
|
+
return parts.slice(-2).join('.');
|
|
117
|
+
}
|
|
118
|
+
function shouldCrawl(url, baseHost, options, baseRootDomain) {
|
|
112
119
|
try {
|
|
113
120
|
const parsed = new URL(url);
|
|
114
121
|
if (!['http:', 'https:'].includes(parsed.protocol)) {
|
|
115
122
|
return false;
|
|
116
123
|
}
|
|
117
124
|
const hostname = parsed.hostname.replace(/^www\./, '');
|
|
118
|
-
|
|
119
|
-
|
|
125
|
+
const sameDomain = options.sameDomain;
|
|
126
|
+
if (sameDomain === 'subdomain') {
|
|
127
|
+
const pageRoot = getRootDomain(hostname);
|
|
128
|
+
const rootDomain = baseRootDomain ?? getRootDomain(baseHost);
|
|
129
|
+
if (pageRoot !== rootDomain)
|
|
130
|
+
return false;
|
|
131
|
+
}
|
|
132
|
+
else if (sameDomain !== false) {
|
|
133
|
+
if (hostname !== baseHost)
|
|
134
|
+
return false;
|
|
120
135
|
}
|
|
121
136
|
const skipExtensions = [
|
|
122
137
|
'.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico',
|
|
@@ -193,12 +208,14 @@ export class Spider {
|
|
|
193
208
|
_queueSize = 0;
|
|
194
209
|
_resultCount = 0;
|
|
195
210
|
baseHost = '';
|
|
211
|
+
baseRootDomain = '';
|
|
196
212
|
running = false;
|
|
197
213
|
aborted = false;
|
|
198
214
|
abortController = new AbortController();
|
|
199
215
|
pendingCount = 0;
|
|
200
216
|
domainRequestTimestamps = new Map();
|
|
201
217
|
contentHashes = new Map();
|
|
218
|
+
domainAvgResponseTime = new Map();
|
|
202
219
|
blockedDomains = new Set();
|
|
203
220
|
curlTransport = null;
|
|
204
221
|
curlAvailable = false;
|
|
@@ -285,8 +302,10 @@ export class Spider {
|
|
|
285
302
|
extract: extractSchema,
|
|
286
303
|
parserOptions: options.parserOptions,
|
|
287
304
|
domainRateLimit: options.domainRateLimit,
|
|
305
|
+
autoThrottle: options.autoThrottle ?? false,
|
|
288
306
|
deduplicateContent: options.deduplicateContent ?? false,
|
|
289
307
|
resume: options.resume ?? false,
|
|
308
|
+
strategy: options.strategy ?? 'bfs',
|
|
290
309
|
};
|
|
291
310
|
if (options.proxy) {
|
|
292
311
|
if (typeof options.proxy === 'string') {
|
|
@@ -316,7 +335,7 @@ export class Spider {
|
|
|
316
335
|
interval: this.options.delay,
|
|
317
336
|
} : {}),
|
|
318
337
|
});
|
|
319
|
-
this.crawlQueue = options.crawlQueue ?? new InMemoryCrawlQueue();
|
|
338
|
+
this.crawlQueue = options.crawlQueue ?? new InMemoryCrawlQueue(this.options.strategy === 'dfs' ? 'lifo' : 'fifo');
|
|
320
339
|
this.crawlStorage = options.crawlStorage ?? new InMemoryCrawlStorage();
|
|
321
340
|
}
|
|
322
341
|
async crawl(startUrl) {
|
|
@@ -325,6 +344,7 @@ export class Spider {
|
|
|
325
344
|
const normalizedStart = normalizeUrl(startUrl);
|
|
326
345
|
const baseUrl = new URL(normalizedStart).origin;
|
|
327
346
|
this.baseHost = new URL(normalizedStart).hostname.replace(/^www\./, '');
|
|
347
|
+
this.baseRootDomain = getRootDomain(this.baseHost);
|
|
328
348
|
if (!this.options.resume) {
|
|
329
349
|
await this.crawlQueue.clear();
|
|
330
350
|
await this.crawlStorage.clear();
|
|
@@ -333,6 +353,7 @@ export class Spider {
|
|
|
333
353
|
this._resultCount = 0;
|
|
334
354
|
this.domainRequestTimestamps.clear();
|
|
335
355
|
this.contentHashes.clear();
|
|
356
|
+
this.domainAvgResponseTime.clear();
|
|
336
357
|
}
|
|
337
358
|
else {
|
|
338
359
|
this._queueSize = await this.crawlQueue.size();
|
|
@@ -463,6 +484,14 @@ export class Spider {
|
|
|
463
484
|
: new Set(pages.map(r => r.url));
|
|
464
485
|
await this.crawlQueue.close?.();
|
|
465
486
|
await this.crawlStorage.close?.();
|
|
487
|
+
for (const client of this.proxyClients.values()) {
|
|
488
|
+
if (typeof client.destroy === 'function') {
|
|
489
|
+
client.destroy();
|
|
490
|
+
}
|
|
491
|
+
}
|
|
492
|
+
this.proxyClients.clear();
|
|
493
|
+
await this.proxyAdapter?.close?.();
|
|
494
|
+
this.domainAvgResponseTime.clear();
|
|
466
495
|
return {
|
|
467
496
|
startUrl: normalizedStart,
|
|
468
497
|
pages,
|
|
@@ -788,6 +817,9 @@ export class Spider {
|
|
|
788
817
|
const isHighQualitySuccess = response.status < 400 && !detection.blocked && !hasCaptchaSignal;
|
|
789
818
|
if (isHighQualitySuccess) {
|
|
790
819
|
this.registerDomainSuccess(hostname);
|
|
820
|
+
if (timings?.total) {
|
|
821
|
+
this.updateAutoThrottle(hostname, timings.total);
|
|
822
|
+
}
|
|
791
823
|
}
|
|
792
824
|
if (!shouldRetry || attempt === maxAttempts - 1) {
|
|
793
825
|
if (proxyUrl && this.proxyAdapter?.reportResult) {
|
|
@@ -1081,7 +1113,7 @@ export class Spider {
|
|
|
1081
1113
|
if (!link.href)
|
|
1082
1114
|
continue;
|
|
1083
1115
|
const normalized = normalizeUrl(link.href);
|
|
1084
|
-
if (!shouldCrawl(normalized, this.baseHost, this.options))
|
|
1116
|
+
if (!shouldCrawl(normalized, this.baseHost, this.options, this.baseRootDomain))
|
|
1085
1117
|
continue;
|
|
1086
1118
|
candidateUrls.push(normalized);
|
|
1087
1119
|
candidates.push({ url: normalized, depth: item.depth + 1 });
|
|
@@ -1192,6 +1224,7 @@ export class Spider {
|
|
|
1192
1224
|
consecutiveUndiciFailures: 0,
|
|
1193
1225
|
lastTransport: 'undici',
|
|
1194
1226
|
lastCaptchaConfidence: 0,
|
|
1227
|
+
autoThrottleDelay: 0,
|
|
1195
1228
|
};
|
|
1196
1229
|
this.domainStates.set(hostname, next);
|
|
1197
1230
|
return next;
|
|
@@ -1294,7 +1327,9 @@ export class Spider {
|
|
|
1294
1327
|
async waitForDomainPenalty(hostname) {
|
|
1295
1328
|
const state = this.getOrCreateDomainState(hostname);
|
|
1296
1329
|
const now = Date.now();
|
|
1297
|
-
const
|
|
1330
|
+
const penaltyDelay = Math.max(state.penaltyUntil, state.challengeCooldownUntil) - now;
|
|
1331
|
+
const throttleDelay = state.autoThrottleDelay ?? 0;
|
|
1332
|
+
const delay = Math.max(penaltyDelay, throttleDelay);
|
|
1298
1333
|
if (delay > 0) {
|
|
1299
1334
|
await sleep(delay, this.abortController.signal);
|
|
1300
1335
|
}
|
|
@@ -1322,6 +1357,20 @@ export class Spider {
|
|
|
1322
1357
|
state.lastCaptchaProvider = undefined;
|
|
1323
1358
|
}
|
|
1324
1359
|
}
|
|
1360
|
+
updateAutoThrottle(hostname, responseTimeMs) {
|
|
1361
|
+
const config = this.options.autoThrottle;
|
|
1362
|
+
if (!config)
|
|
1363
|
+
return;
|
|
1364
|
+
const target = (typeof config === 'object' ? config.targetMs : undefined) ?? 500;
|
|
1365
|
+
const minDelay = (typeof config === 'object' ? config.minDelay : undefined) ?? 50;
|
|
1366
|
+
const maxDelay = (typeof config === 'object' ? config.maxDelay : undefined) ?? 30000;
|
|
1367
|
+
const prev = this.domainAvgResponseTime.get(hostname) ?? responseTimeMs;
|
|
1368
|
+
const avg = prev * 0.7 + responseTimeMs * 0.3;
|
|
1369
|
+
this.domainAvgResponseTime.set(hostname, avg);
|
|
1370
|
+
const ratio = avg / target;
|
|
1371
|
+
const state = this.getOrCreateDomainState(hostname);
|
|
1372
|
+
state.autoThrottleDelay = Math.max(minDelay, Math.min(maxDelay, Math.round(this.options.delay * ratio)));
|
|
1373
|
+
}
|
|
1325
1374
|
getCaptchaRetryMultiplier(provider) {
|
|
1326
1375
|
if (!provider)
|
|
1327
1376
|
return 1.2;
|
package/dist/seo/seo-spider.js
CHANGED
|
@@ -141,7 +141,8 @@ export class SeoSpider {
|
|
|
141
141
|
results.humans.content = await res.text();
|
|
142
142
|
}
|
|
143
143
|
}
|
|
144
|
-
catch { }
|
|
144
|
+
catch {
|
|
145
|
+
}
|
|
145
146
|
try {
|
|
146
147
|
const res = await client.get(results.llms.url);
|
|
147
148
|
if (res.status === 200) {
|
|
@@ -149,7 +150,8 @@ export class SeoSpider {
|
|
|
149
150
|
results.llms.content = await res.text();
|
|
150
151
|
}
|
|
151
152
|
}
|
|
152
|
-
catch { }
|
|
153
|
+
catch {
|
|
154
|
+
}
|
|
153
155
|
try {
|
|
154
156
|
const res = await client.get(results.sitemap.url);
|
|
155
157
|
if (res.status === 200) {
|
|
@@ -159,7 +161,8 @@ export class SeoSpider {
|
|
|
159
161
|
results.sitemap.urlCount = urlMatches ? urlMatches.length : 0;
|
|
160
162
|
}
|
|
161
163
|
}
|
|
162
|
-
catch { }
|
|
164
|
+
catch {
|
|
165
|
+
}
|
|
163
166
|
try {
|
|
164
167
|
let res = await client.get(results.manifest.url);
|
|
165
168
|
if (res.status !== 200) {
|
|
@@ -181,7 +184,8 @@ export class SeoSpider {
|
|
|
181
184
|
}
|
|
182
185
|
}
|
|
183
186
|
}
|
|
184
|
-
catch { }
|
|
187
|
+
catch {
|
|
188
|
+
}
|
|
185
189
|
return results;
|
|
186
190
|
}
|
|
187
191
|
catch {
|
package/dist/seo/validators/rss.js
CHANGED
|
@@ -25,7 +25,8 @@ export async function discoverFeeds(baseUrl, html) {
|
|
|
25
25
|
href = new URL(href, baseUrl).toString();
|
|
26
26
|
candidateUrls.add(href);
|
|
27
27
|
}
|
|
28
|
-
catch { }
|
|
28
|
+
catch {
|
|
29
|
+
}
|
|
29
30
|
}
|
|
30
31
|
}
|
|
31
32
|
}
|
|
@@ -35,7 +36,8 @@ export async function discoverFeeds(baseUrl, html) {
|
|
|
35
36
|
const url = new URL(path, baseUrl).toString();
|
|
36
37
|
candidateUrls.add(url);
|
|
37
38
|
}
|
|
38
|
-
catch { }
|
|
39
|
+
catch {
|
|
40
|
+
}
|
|
39
41
|
}
|
|
40
42
|
}
|
|
41
43
|
const client = createClient({ timeout: 8000 });
|
package/dist/utils/block-detector.js
CHANGED
|
@@ -526,18 +526,42 @@ export function detectBlock(response, body) {
|
|
|
526
526
|
if (body) {
|
|
527
527
|
const isLongBody = body.length > 100_000;
|
|
528
528
|
const checkBody = isLongBody ? body.slice(0, 30_000) : body;
|
|
529
|
-
const
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
529
|
+
const lowerBody = checkBody.slice(0, 8000).toLowerCase();
|
|
530
|
+
const hasAnyChallengeKeyword = lowerBody.includes('captcha') ||
|
|
531
|
+
lowerBody.includes('cloudflare') ||
|
|
532
|
+
lowerBody.includes('datadome') ||
|
|
533
|
+
lowerBody.includes('blocked') ||
|
|
534
|
+
lowerBody.includes('denied') ||
|
|
535
|
+
lowerBody.includes('forbidden') ||
|
|
536
|
+
lowerBody.includes('too many') ||
|
|
537
|
+
lowerBody.includes('rate limit') ||
|
|
538
|
+
lowerBody.includes('checking your browser') ||
|
|
539
|
+
lowerBody.includes('just a moment') ||
|
|
540
|
+
lowerBody.includes('security check') ||
|
|
541
|
+
lowerBody.includes('human verification') ||
|
|
542
|
+
lowerBody.includes('perimeterx') ||
|
|
543
|
+
lowerBody.includes('incapsula') ||
|
|
544
|
+
lowerBody.includes('imperva') ||
|
|
545
|
+
lowerBody.includes('akamai') ||
|
|
546
|
+
lowerBody.includes('httpservice') ||
|
|
547
|
+
lowerBody.includes('enablejs') ||
|
|
548
|
+
lowerBody.includes('verify') ||
|
|
549
|
+
lowerBody.includes('suspicious') ||
|
|
550
|
+
lowerBody.includes('bot') ||
|
|
551
|
+
lowerBody.includes('access denied');
|
|
552
|
+
if (hasAnyChallengeKeyword || response.status !== 200) {
|
|
553
|
+
for (const { pattern, reason, confidence, description } of BLOCK_PATTERNS) {
|
|
554
|
+
if (isLongBody && response.status === 200 && confidence < 0.85 && !hasAnyChallengeKeyword) {
|
|
555
|
+
continue;
|
|
556
|
+
}
|
|
557
|
+
if (pattern.test(checkBody)) {
|
|
558
|
+
results.push({
|
|
559
|
+
blocked: true,
|
|
560
|
+
reason,
|
|
561
|
+
confidence,
|
|
562
|
+
description,
|
|
563
|
+
});
|
|
564
|
+
}
|
|
541
565
|
}
|
|
542
566
|
}
|
|
543
567
|
if (body.length < 5000 &&
|
|
@@ -603,8 +627,25 @@ export function detectCaptcha(response, body) {
|
|
|
603
627
|
if (body) {
|
|
604
628
|
const checkBody = body.length < 120_000 ? body : body.slice(0, 120_000);
|
|
605
629
|
const isTinyBody = checkBody.length > 0 && checkBody.length < 12000;
|
|
630
|
+
const lowerSnippet = checkBody.slice(0, 8000).toLowerCase();
|
|
631
|
+
const hasAnyCaptchaKeyword = lowerSnippet.includes('captcha') ||
|
|
632
|
+
lowerSnippet.includes('recaptcha') ||
|
|
633
|
+
lowerSnippet.includes('hcaptcha') ||
|
|
634
|
+
lowerSnippet.includes('turnstile') ||
|
|
635
|
+
lowerSnippet.includes('challenge') ||
|
|
636
|
+
lowerSnippet.includes('cloudflare') ||
|
|
637
|
+
lowerSnippet.includes('datadome') ||
|
|
638
|
+
lowerSnippet.includes('perimeterx') ||
|
|
639
|
+
lowerSnippet.includes('funcaptcha') ||
|
|
640
|
+
lowerSnippet.includes('arkose') ||
|
|
641
|
+
lowerSnippet.includes('sitekey') ||
|
|
642
|
+
lowerSnippet.includes('just a moment') ||
|
|
643
|
+
lowerSnippet.includes('human verification');
|
|
606
644
|
const hasHtmlTags = /<html|<head|<body|<script|<meta/i.test(checkBody);
|
|
607
|
-
const challengeTitleOrText =
|
|
645
|
+
const challengeTitleOrText = hasAnyCaptchaKeyword ||
|
|
646
|
+
lowerSnippet.includes('attention required') ||
|
|
647
|
+
lowerSnippet.includes('verify you are human') ||
|
|
648
|
+
lowerSnippet.includes('security check');
|
|
608
649
|
if (isTinyBody && challengeTitleOrText && hasHtmlTags) {
|
|
609
650
|
addMatch(matches, 'generic', 0.6, 'Tiny HTML response with challenge-like text');
|
|
610
651
|
}
|
|
@@ -616,24 +657,38 @@ export function detectCaptcha(response, body) {
|
|
|
616
657
|
addMatch(matches, provider, confidence, description);
|
|
617
658
|
}
|
|
618
659
|
}
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
660
|
+
if (hasAnyCaptchaKeyword || !(response.status >= 200 && response.status < 300)) {
|
|
661
|
+
for (const { pattern, provider, confidence, description } of CAPTCHA_SCRIPT_PATTERNS) {
|
|
662
|
+
if (pattern.test(checkBody)) {
|
|
663
|
+
addMatch(matches, provider, confidence, description);
|
|
664
|
+
}
|
|
622
665
|
}
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
666
|
+
for (const { pattern, provider, confidence, description } of CAPTCHA_FORM_PATTERNS) {
|
|
667
|
+
if (pattern.test(checkBody)) {
|
|
668
|
+
addMatch(matches, provider, confidence, description);
|
|
669
|
+
}
|
|
627
670
|
}
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
671
|
+
for (const { pattern, provider, confidence, description } of CAPTCHA_DOM_MARKERS) {
|
|
672
|
+
if (pattern.test(checkBody)) {
|
|
673
|
+
addMatch(matches, provider, confidence, description);
|
|
674
|
+
}
|
|
632
675
|
}
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
676
|
+
for (const { pattern, provider, confidence, description } of CAPTCHA_NOSCRIPT_PATTERNS) {
|
|
677
|
+
if (pattern.test(checkBody)) {
|
|
678
|
+
addMatch(matches, provider, confidence, description);
|
|
679
|
+
}
|
|
680
|
+
}
|
|
681
|
+
for (const { pattern, provider, confidence, description } of CAPTCHA_PATTERNS) {
|
|
682
|
+
if (pattern.test(checkBody)) {
|
|
683
|
+
addMatch(matches, provider, confidence, description);
|
|
684
|
+
}
|
|
685
|
+
}
|
|
686
|
+
if (/<script[^>]*src=["'][^"']*(?:challenges\.cloudflare\.com|recaptcha\/api\.js|hcaptcha\.com\/1\/api\.js|challenges\.perimeterx\.)[^"']*["'][^>]*>/i.test(checkBody.slice(0, 120_000))) {
|
|
687
|
+
addMatch(matches, 'cloudflare', 0.6, 'Known challenge script reference found in HTML');
|
|
688
|
+
}
|
|
689
|
+
if ((response.status >= 500 && response.status < 600) &&
|
|
690
|
+
/cloudflare|challenge|verification|blocked|security/i.test(checkBody.slice(0, 3000))) {
|
|
691
|
+
addMatch(matches, 'generic', 0.2, 'Server challenge-like text on error response');
|
|
637
692
|
}
|
|
638
693
|
}
|
|
639
694
|
if (response.status === 403 || response.status === 503) {
|
|
@@ -653,18 +708,6 @@ export function detectCaptcha(response, body) {
|
|
|
653
708
|
/refresh|challenge|verify|captcha|javascript challenge|cloudflare|bot/i.test((location || ''))) {
|
|
654
709
|
addMatch(matches, 'generic', 0.67, 'Redirect location indicates bot challenge flow');
|
|
655
710
|
}
|
|
656
|
-
if (/<script[^>]*src=["'][^"']*(?:challenges\.cloudflare\.com|recaptcha\/api\.js|hcaptcha\.com\/1\/api\.js|challenges\.perimeterx\.)[^"']*["'][^>]*>/i.test(checkBody.slice(0, 120_000))) {
|
|
657
|
-
addMatch(matches, 'cloudflare', 0.6, 'Known challenge script reference found in HTML');
|
|
658
|
-
}
|
|
659
|
-
if ((response.status >= 500 && response.status < 600) &&
|
|
660
|
-
/cloudflare|challenge|verification|blocked|security/i.test(checkBody.slice(0, 3000))) {
|
|
661
|
-
addMatch(matches, 'generic', 0.2, 'Server challenge-like text on error response');
|
|
662
|
-
}
|
|
663
|
-
for (const { pattern, provider, confidence, description } of CAPTCHA_PATTERNS) {
|
|
664
|
-
if (pattern.test(checkBody)) {
|
|
665
|
-
addMatch(matches, provider, confidence, description);
|
|
666
|
-
}
|
|
667
|
-
}
|
|
668
711
|
}
|
|
669
712
|
const result = scoreByProvider(matches);
|
|
670
713
|
if (result.confidence >= 0.5 || hasCaptchaHeader) {
|
|
@@ -681,23 +724,33 @@ export function detectCaptcha(response, body) {
|
|
|
681
724
|
};
|
|
682
725
|
}
|
|
683
726
|
export function isProtectedDomain(hostname) {
|
|
684
|
-
const
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
727
|
+
const h = hostname.toLowerCase();
|
|
728
|
+
const protectedDomains = [
|
|
729
|
+
'linkedin.com',
|
|
730
|
+
'twitter.com',
|
|
731
|
+
'x.com',
|
|
732
|
+
'instagram.com',
|
|
733
|
+
'facebook.com',
|
|
734
|
+
'amazon.com',
|
|
735
|
+
'amazon.co.uk',
|
|
736
|
+
'amazon.de',
|
|
737
|
+
'amazon.co.jp',
|
|
738
|
+
'google.com',
|
|
739
|
+
'microsoft.com',
|
|
740
|
+
'apple.com',
|
|
741
|
+
'netflix.com',
|
|
742
|
+
'spotify.com',
|
|
699
743
|
];
|
|
700
|
-
|
|
744
|
+
const protectedTlds = ['.gov', '.mil'];
|
|
745
|
+
for (const domain of protectedDomains) {
|
|
746
|
+
if (h === domain || h.endsWith('.' + domain))
|
|
747
|
+
return true;
|
|
748
|
+
}
|
|
749
|
+
for (const tld of protectedTlds) {
|
|
750
|
+
if (h.endsWith(tld))
|
|
751
|
+
return true;
|
|
752
|
+
}
|
|
753
|
+
return false;
|
|
701
754
|
}
|
|
702
755
|
export function isCloudflareChallenge(response, body) {
|
|
703
756
|
const cfRay = response.headers.get('cf-ray');
|
package/dist/version.js
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "recker",
|
|
3
|
-
"version": "1.0.95",
|
|
3
|
+
"version": "1.0.96",
|
|
4
4
|
"description": "Multi-Protocol SDK for the AI Era - HTTP, WebSocket, DNS, FTP, SFTP, Telnet, HLS unified with AI providers and MCP tools",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"types": "./dist/index.d.ts",
|