recker 1.0.94 → 1.0.95
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/scrape/crawl-queue.d.ts +2 -0
- package/dist/browser/scrape/crawl-queue.js +23 -4
- package/dist/browser/scrape/spider.d.ts +1 -0
- package/dist/browser/scrape/spider.js +64 -15
- package/dist/browser/seo/seo-spider.js +1 -1
- package/dist/browser/utils/block-detector.js +12 -0
- package/dist/scrape/crawl-queue.d.ts +2 -0
- package/dist/scrape/crawl-queue.js +23 -4
- package/dist/scrape/spider.d.ts +1 -0
- package/dist/scrape/spider.js +64 -15
- package/dist/seo/seo-spider.js +1 -1
- package/dist/utils/block-detector.js +12 -0
- package/dist/version.js +1 -1
- package/package.json +1 -1
|
@@ -17,6 +17,8 @@ export interface CrawlQueueAdapter {
|
|
|
17
17
|
}
|
|
18
18
|
export declare class InMemoryCrawlQueue implements CrawlQueueAdapter {
|
|
19
19
|
private queue;
|
|
20
|
+
private head;
|
|
21
|
+
private tail;
|
|
20
22
|
private visited;
|
|
21
23
|
push(item: CrawlQueueItem): Promise<void>;
|
|
22
24
|
pushBatch(items: CrawlQueueItem[]): Promise<void>;
|
|
@@ -1,14 +1,31 @@
|
|
|
1
1
|
export class InMemoryCrawlQueue {
|
|
2
2
|
queue = [];
|
|
3
|
+
head = 0;
|
|
4
|
+
tail = 0;
|
|
3
5
|
visited = new Set();
|
|
4
6
|
async push(item) {
|
|
5
|
-
this.queue.
|
|
7
|
+
this.queue[this.tail++] = item;
|
|
6
8
|
}
|
|
7
9
|
async pushBatch(items) {
|
|
8
|
-
|
|
10
|
+
for (const item of items) {
|
|
11
|
+
this.queue[this.tail++] = item;
|
|
12
|
+
}
|
|
9
13
|
}
|
|
10
14
|
async pop() {
|
|
11
|
-
|
|
15
|
+
while (this.head < this.tail) {
|
|
16
|
+
const item = this.queue[this.head];
|
|
17
|
+
this.queue[this.head] = undefined;
|
|
18
|
+
this.head++;
|
|
19
|
+
if (item) {
|
|
20
|
+
if (this.head > 1024 && this.head > this.tail / 2) {
|
|
21
|
+
this.queue = this.queue.slice(this.head);
|
|
22
|
+
this.tail -= this.head;
|
|
23
|
+
this.head = 0;
|
|
24
|
+
}
|
|
25
|
+
return item;
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
return null;
|
|
12
29
|
}
|
|
13
30
|
async hasVisited(url) {
|
|
14
31
|
return this.visited.has(url);
|
|
@@ -25,10 +42,12 @@ export class InMemoryCrawlQueue {
|
|
|
25
42
|
this.visited.add(url);
|
|
26
43
|
}
|
|
27
44
|
async size() {
|
|
28
|
-
return this.
|
|
45
|
+
return this.tail - this.head;
|
|
29
46
|
}
|
|
30
47
|
async clear() {
|
|
31
48
|
this.queue = [];
|
|
49
|
+
this.head = 0;
|
|
50
|
+
this.tail = 0;
|
|
32
51
|
this.visited.clear();
|
|
33
52
|
}
|
|
34
53
|
async close() {
|
|
@@ -40,10 +40,15 @@ const SEC_CH_PLATFORM_HINTS = [
|
|
|
40
40
|
function getHostname(url) {
|
|
41
41
|
return new URL(url).hostname;
|
|
42
42
|
}
|
|
43
|
-
function sleep(ms) {
|
|
43
|
+
function sleep(ms, signal) {
|
|
44
44
|
if (ms <= 0)
|
|
45
45
|
return Promise.resolve();
|
|
46
|
-
|
|
46
|
+
if (signal?.aborted)
|
|
47
|
+
return Promise.resolve();
|
|
48
|
+
return new Promise(resolve => {
|
|
49
|
+
const timer = setTimeout(resolve, ms);
|
|
50
|
+
signal?.addEventListener('abort', () => { clearTimeout(timer); resolve(); }, { once: true });
|
|
51
|
+
});
|
|
47
52
|
}
|
|
48
53
|
function getRetryAfterDelay(response) {
|
|
49
54
|
const retryAfter = response.headers.get('retry-after');
|
|
@@ -109,7 +114,8 @@ function shouldCrawl(url, baseHost, options) {
|
|
|
109
114
|
if (!['http:', 'https:'].includes(parsed.protocol)) {
|
|
110
115
|
return false;
|
|
111
116
|
}
|
|
112
|
-
|
|
117
|
+
const hostname = parsed.hostname.replace(/^www\./, '');
|
|
118
|
+
if (options.sameDomain !== false && hostname !== baseHost) {
|
|
113
119
|
return false;
|
|
114
120
|
}
|
|
115
121
|
const skipExtensions = [
|
|
@@ -189,6 +195,7 @@ export class Spider {
|
|
|
189
195
|
baseHost = '';
|
|
190
196
|
running = false;
|
|
191
197
|
aborted = false;
|
|
198
|
+
abortController = new AbortController();
|
|
192
199
|
pendingCount = 0;
|
|
193
200
|
domainRequestTimestamps = new Map();
|
|
194
201
|
contentHashes = new Map();
|
|
@@ -218,7 +225,9 @@ export class Spider {
|
|
|
218
225
|
if (timestamps.length >= limit) {
|
|
219
226
|
const waitMs = timestamps[0] + window - now;
|
|
220
227
|
if (waitMs > 0)
|
|
221
|
-
await sleep(waitMs);
|
|
228
|
+
await sleep(waitMs, this.abortController.signal);
|
|
229
|
+
if (this.aborted)
|
|
230
|
+
return;
|
|
222
231
|
const afterWait = Date.now();
|
|
223
232
|
while (timestamps.length > 0 && timestamps[0] <= afterWait - window) {
|
|
224
233
|
timestamps.shift();
|
|
@@ -315,7 +324,7 @@ export class Spider {
|
|
|
315
324
|
const startTimestamp = Date.now();
|
|
316
325
|
const normalizedStart = normalizeUrl(startUrl);
|
|
317
326
|
const baseUrl = new URL(normalizedStart).origin;
|
|
318
|
-
this.baseHost = new URL(normalizedStart).hostname;
|
|
327
|
+
this.baseHost = new URL(normalizedStart).hostname.replace(/^www\./, '');
|
|
319
328
|
if (!this.options.resume) {
|
|
320
329
|
await this.crawlQueue.clear();
|
|
321
330
|
await this.crawlStorage.clear();
|
|
@@ -331,14 +340,25 @@ export class Spider {
|
|
|
331
340
|
}
|
|
332
341
|
this.running = true;
|
|
333
342
|
this.aborted = false;
|
|
343
|
+
this.abortController = new AbortController();
|
|
334
344
|
this.pendingCount = 0;
|
|
335
345
|
this.sitemapUrls = [];
|
|
336
346
|
this.sitemapUrlSet.clear();
|
|
337
347
|
this.robotsData = null;
|
|
338
348
|
this.sitemapValidation = null;
|
|
339
349
|
this.robotsValidation = null;
|
|
340
|
-
this.
|
|
341
|
-
|
|
350
|
+
if (!this.options.resume) {
|
|
351
|
+
this.blockedDomains.clear();
|
|
352
|
+
this.domainStates.clear();
|
|
353
|
+
}
|
|
354
|
+
if (this.options.resume && this.options.deduplicateContent) {
|
|
355
|
+
const existingResults = await this.crawlStorage.getResults();
|
|
356
|
+
for (const r of existingResults) {
|
|
357
|
+
if (r.contentHash) {
|
|
358
|
+
this.contentHashes.set(r.contentHash, r.url);
|
|
359
|
+
}
|
|
360
|
+
}
|
|
361
|
+
}
|
|
342
362
|
if (this.options.transport !== 'undici') {
|
|
343
363
|
this.curlAvailable = await hasImpersonate();
|
|
344
364
|
if (this.options.transport === 'auto' && this.curlAvailable && isProtectedDomain(this.baseHost)) {
|
|
@@ -354,10 +374,12 @@ export class Spider {
|
|
|
354
374
|
const pending = new Map();
|
|
355
375
|
const scheduleUrl = async (item) => {
|
|
356
376
|
const normalized = normalizeUrl(item.url);
|
|
357
|
-
if (await this.crawlQueue.hasVisited(normalized))
|
|
358
|
-
return;
|
|
359
377
|
if (pending.has(normalized))
|
|
360
378
|
return;
|
|
379
|
+
if (await this.crawlQueue.hasVisited(normalized))
|
|
380
|
+
return;
|
|
381
|
+
await this.crawlQueue.markVisited(normalized);
|
|
382
|
+
this._visitedCount++;
|
|
361
383
|
if (item.depth > this.options.maxDepth)
|
|
362
384
|
return;
|
|
363
385
|
if (this._resultCount + pending.size >= this.options.maxPages)
|
|
@@ -373,8 +395,6 @@ export class Spider {
|
|
|
373
395
|
return;
|
|
374
396
|
}
|
|
375
397
|
}
|
|
376
|
-
await this.crawlQueue.markVisited(normalized);
|
|
377
|
-
this._visitedCount++;
|
|
378
398
|
this.pendingCount++;
|
|
379
399
|
const promise = this.pool.run(() => this.crawlPage({ ...item, url: normalized }))
|
|
380
400
|
.finally(() => {
|
|
@@ -416,14 +436,33 @@ export class Spider {
|
|
|
416
436
|
if (pending.size > 0) {
|
|
417
437
|
await Promise.all(pending.values());
|
|
418
438
|
}
|
|
439
|
+
while (!this.aborted && this._resultCount < this.options.maxPages) {
|
|
440
|
+
const remaining = await this.crawlQueue.size();
|
|
441
|
+
if (remaining === 0 && pending.size === 0)
|
|
442
|
+
break;
|
|
443
|
+
this._queueSize = remaining;
|
|
444
|
+
let nextItem = await this.crawlQueue.pop();
|
|
445
|
+
while (nextItem && !this.aborted) {
|
|
446
|
+
this._queueSize = Math.max(0, this._queueSize - 1);
|
|
447
|
+
if (this._resultCount + pending.size >= this.options.maxPages)
|
|
448
|
+
break;
|
|
449
|
+
await scheduleUrl(nextItem);
|
|
450
|
+
nextItem = await this.crawlQueue.pop();
|
|
451
|
+
}
|
|
452
|
+
if (pending.size > 0) {
|
|
453
|
+
await Promise.all(pending.values());
|
|
454
|
+
}
|
|
455
|
+
}
|
|
419
456
|
this.running = false;
|
|
420
457
|
const pages = await this.crawlStorage.getResults();
|
|
421
458
|
const errors = await this.crawlStorage.getErrors();
|
|
422
459
|
const sitemapAnalysis = this.buildSitemapAnalysis(pages);
|
|
423
460
|
const robotsAnalysis = this.buildRobotsAnalysis();
|
|
424
461
|
const visited = this.crawlQueue instanceof InMemoryCrawlQueue
|
|
425
|
-
? this.crawlQueue.getVisited()
|
|
462
|
+
? new Set(this.crawlQueue.getVisited())
|
|
426
463
|
: new Set(pages.map(r => r.url));
|
|
464
|
+
await this.crawlQueue.close?.();
|
|
465
|
+
await this.crawlStorage.close?.();
|
|
427
466
|
return {
|
|
428
467
|
startUrl: normalizedStart,
|
|
429
468
|
pages,
|
|
@@ -615,6 +654,8 @@ export class Spider {
|
|
|
615
654
|
let lastRetryAfterMs = 0;
|
|
616
655
|
const proxyUrl = this.proxyAdapter ? await this.proxyAdapter.getProxy() : null;
|
|
617
656
|
const executeRequest = async (useCurl) => {
|
|
657
|
+
if (this.aborted)
|
|
658
|
+
throw new Error('Crawl aborted');
|
|
618
659
|
if (useCurl && this.curlTransport) {
|
|
619
660
|
const curlForRequest = proxyUrl
|
|
620
661
|
? new CurlTransport(proxyUrl)
|
|
@@ -655,6 +696,7 @@ export class Spider {
|
|
|
655
696
|
const clientForRequest = this.getClientForProxy(proxyUrl);
|
|
656
697
|
const response = await clientForRequest.get(url, {
|
|
657
698
|
headers: this.buildRequestHeaders(url, false),
|
|
699
|
+
signal: this.abortController.signal,
|
|
658
700
|
beforeRedirect: this.options.onRedirect
|
|
659
701
|
? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
|
|
660
702
|
: undefined,
|
|
@@ -792,7 +834,9 @@ export class Spider {
|
|
|
792
834
|
timings,
|
|
793
835
|
});
|
|
794
836
|
}
|
|
795
|
-
|
|
837
|
+
if (this.aborted)
|
|
838
|
+
break;
|
|
839
|
+
await sleep(waitMs, this.abortController.signal);
|
|
796
840
|
continue;
|
|
797
841
|
}
|
|
798
842
|
catch (error) {
|
|
@@ -823,8 +867,10 @@ export class Spider {
|
|
|
823
867
|
forcedTransport = 'curl';
|
|
824
868
|
}
|
|
825
869
|
}
|
|
870
|
+
if (this.aborted)
|
|
871
|
+
break;
|
|
826
872
|
const waitMs = this.getRetryWait(hostname, attempt + 1);
|
|
827
|
-
await sleep(waitMs);
|
|
873
|
+
await sleep(waitMs, this.abortController.signal);
|
|
828
874
|
}
|
|
829
875
|
}
|
|
830
876
|
if (lastResponse) {
|
|
@@ -1059,6 +1105,8 @@ export class Spider {
|
|
|
1059
1105
|
}
|
|
1060
1106
|
}
|
|
1061
1107
|
catch (error) {
|
|
1108
|
+
if (this.aborted)
|
|
1109
|
+
return;
|
|
1062
1110
|
const errAttempts = typeof error?.attempts === 'number' ? error.attempts : 1;
|
|
1063
1111
|
const errRetryCount = typeof error?.retryCount === 'number' ? error.retryCount : Math.max(0, errAttempts - 1);
|
|
1064
1112
|
const errRetryAfterMs = typeof error?.retryAfterMs === 'number' ? error?.retryAfterMs : 0;
|
|
@@ -1248,7 +1296,7 @@ export class Spider {
|
|
|
1248
1296
|
const now = Date.now();
|
|
1249
1297
|
const delay = Math.max(state.penaltyUntil, state.challengeCooldownUntil) - now;
|
|
1250
1298
|
if (delay > 0) {
|
|
1251
|
-
await sleep(delay);
|
|
1299
|
+
await sleep(delay, this.abortController.signal);
|
|
1252
1300
|
}
|
|
1253
1301
|
}
|
|
1254
1302
|
registerDomainBlock(hostname) {
|
|
@@ -1319,6 +1367,7 @@ export class Spider {
|
|
|
1319
1367
|
}
|
|
1320
1368
|
abort() {
|
|
1321
1369
|
this.aborted = true;
|
|
1370
|
+
this.abortController.abort();
|
|
1322
1371
|
}
|
|
1323
1372
|
isRunning() {
|
|
1324
1373
|
return this.running;
|
|
@@ -56,7 +56,7 @@ export class SeoSpider {
|
|
|
56
56
|
});
|
|
57
57
|
}
|
|
58
58
|
async analyzePageDuringCrawl(pageResult, html) {
|
|
59
|
-
if (pageResult.
|
|
59
|
+
if (pageResult.status >= 400 || !html) {
|
|
60
60
|
const seoPage = { ...pageResult, seoReport: undefined };
|
|
61
61
|
this.seoPages.push(seoPage);
|
|
62
62
|
return;
|
|
@@ -517,6 +517,12 @@ export function detectBlock(response, body) {
|
|
|
517
517
|
description: 'DataDome headers detected',
|
|
518
518
|
});
|
|
519
519
|
}
|
|
520
|
+
if (response.status === 200 &&
|
|
521
|
+
results.length === 0 &&
|
|
522
|
+
!location &&
|
|
523
|
+
!body) {
|
|
524
|
+
return { blocked: false, confidence: 0 };
|
|
525
|
+
}
|
|
520
526
|
if (body) {
|
|
521
527
|
const isLongBody = body.length > 100_000;
|
|
522
528
|
const checkBody = isLongBody ? body.slice(0, 30_000) : body;
|
|
@@ -588,6 +594,12 @@ export function detectCaptcha(response, body) {
|
|
|
588
594
|
addMatch(matches, 'cloudflare', 0.7, 'Cloudflare challenge headers detected');
|
|
589
595
|
}
|
|
590
596
|
}
|
|
597
|
+
if (matches.length === 0 &&
|
|
598
|
+
response.status >= 200 && response.status < 300 &&
|
|
599
|
+
!location &&
|
|
600
|
+
!body) {
|
|
601
|
+
return { detected: false, confidence: 0 };
|
|
602
|
+
}
|
|
591
603
|
if (body) {
|
|
592
604
|
const checkBody = body.length < 120_000 ? body : body.slice(0, 120_000);
|
|
593
605
|
const isTinyBody = checkBody.length > 0 && checkBody.length < 12000;
|
|
@@ -17,6 +17,8 @@ export interface CrawlQueueAdapter {
|
|
|
17
17
|
}
|
|
18
18
|
export declare class InMemoryCrawlQueue implements CrawlQueueAdapter {
|
|
19
19
|
private queue;
|
|
20
|
+
private head;
|
|
21
|
+
private tail;
|
|
20
22
|
private visited;
|
|
21
23
|
push(item: CrawlQueueItem): Promise<void>;
|
|
22
24
|
pushBatch(items: CrawlQueueItem[]): Promise<void>;
|
|
@@ -1,14 +1,31 @@
|
|
|
1
1
|
export class InMemoryCrawlQueue {
|
|
2
2
|
queue = [];
|
|
3
|
+
head = 0;
|
|
4
|
+
tail = 0;
|
|
3
5
|
visited = new Set();
|
|
4
6
|
async push(item) {
|
|
5
|
-
this.queue.
|
|
7
|
+
this.queue[this.tail++] = item;
|
|
6
8
|
}
|
|
7
9
|
async pushBatch(items) {
|
|
8
|
-
|
|
10
|
+
for (const item of items) {
|
|
11
|
+
this.queue[this.tail++] = item;
|
|
12
|
+
}
|
|
9
13
|
}
|
|
10
14
|
async pop() {
|
|
11
|
-
|
|
15
|
+
while (this.head < this.tail) {
|
|
16
|
+
const item = this.queue[this.head];
|
|
17
|
+
this.queue[this.head] = undefined;
|
|
18
|
+
this.head++;
|
|
19
|
+
if (item) {
|
|
20
|
+
if (this.head > 1024 && this.head > this.tail / 2) {
|
|
21
|
+
this.queue = this.queue.slice(this.head);
|
|
22
|
+
this.tail -= this.head;
|
|
23
|
+
this.head = 0;
|
|
24
|
+
}
|
|
25
|
+
return item;
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
return null;
|
|
12
29
|
}
|
|
13
30
|
async hasVisited(url) {
|
|
14
31
|
return this.visited.has(url);
|
|
@@ -25,10 +42,12 @@ export class InMemoryCrawlQueue {
|
|
|
25
42
|
this.visited.add(url);
|
|
26
43
|
}
|
|
27
44
|
async size() {
|
|
28
|
-
return this.
|
|
45
|
+
return this.tail - this.head;
|
|
29
46
|
}
|
|
30
47
|
async clear() {
|
|
31
48
|
this.queue = [];
|
|
49
|
+
this.head = 0;
|
|
50
|
+
this.tail = 0;
|
|
32
51
|
this.visited.clear();
|
|
33
52
|
}
|
|
34
53
|
async close() {
|
package/dist/scrape/spider.d.ts
CHANGED
package/dist/scrape/spider.js
CHANGED
|
@@ -40,10 +40,15 @@ const SEC_CH_PLATFORM_HINTS = [
|
|
|
40
40
|
function getHostname(url) {
|
|
41
41
|
return new URL(url).hostname;
|
|
42
42
|
}
|
|
43
|
-
function sleep(ms) {
|
|
43
|
+
function sleep(ms, signal) {
|
|
44
44
|
if (ms <= 0)
|
|
45
45
|
return Promise.resolve();
|
|
46
|
-
|
|
46
|
+
if (signal?.aborted)
|
|
47
|
+
return Promise.resolve();
|
|
48
|
+
return new Promise(resolve => {
|
|
49
|
+
const timer = setTimeout(resolve, ms);
|
|
50
|
+
signal?.addEventListener('abort', () => { clearTimeout(timer); resolve(); }, { once: true });
|
|
51
|
+
});
|
|
47
52
|
}
|
|
48
53
|
function getRetryAfterDelay(response) {
|
|
49
54
|
const retryAfter = response.headers.get('retry-after');
|
|
@@ -109,7 +114,8 @@ function shouldCrawl(url, baseHost, options) {
|
|
|
109
114
|
if (!['http:', 'https:'].includes(parsed.protocol)) {
|
|
110
115
|
return false;
|
|
111
116
|
}
|
|
112
|
-
|
|
117
|
+
const hostname = parsed.hostname.replace(/^www\./, '');
|
|
118
|
+
if (options.sameDomain !== false && hostname !== baseHost) {
|
|
113
119
|
return false;
|
|
114
120
|
}
|
|
115
121
|
const skipExtensions = [
|
|
@@ -189,6 +195,7 @@ export class Spider {
|
|
|
189
195
|
baseHost = '';
|
|
190
196
|
running = false;
|
|
191
197
|
aborted = false;
|
|
198
|
+
abortController = new AbortController();
|
|
192
199
|
pendingCount = 0;
|
|
193
200
|
domainRequestTimestamps = new Map();
|
|
194
201
|
contentHashes = new Map();
|
|
@@ -218,7 +225,9 @@ export class Spider {
|
|
|
218
225
|
if (timestamps.length >= limit) {
|
|
219
226
|
const waitMs = timestamps[0] + window - now;
|
|
220
227
|
if (waitMs > 0)
|
|
221
|
-
await sleep(waitMs);
|
|
228
|
+
await sleep(waitMs, this.abortController.signal);
|
|
229
|
+
if (this.aborted)
|
|
230
|
+
return;
|
|
222
231
|
const afterWait = Date.now();
|
|
223
232
|
while (timestamps.length > 0 && timestamps[0] <= afterWait - window) {
|
|
224
233
|
timestamps.shift();
|
|
@@ -315,7 +324,7 @@ export class Spider {
|
|
|
315
324
|
const startTimestamp = Date.now();
|
|
316
325
|
const normalizedStart = normalizeUrl(startUrl);
|
|
317
326
|
const baseUrl = new URL(normalizedStart).origin;
|
|
318
|
-
this.baseHost = new URL(normalizedStart).hostname;
|
|
327
|
+
this.baseHost = new URL(normalizedStart).hostname.replace(/^www\./, '');
|
|
319
328
|
if (!this.options.resume) {
|
|
320
329
|
await this.crawlQueue.clear();
|
|
321
330
|
await this.crawlStorage.clear();
|
|
@@ -331,14 +340,25 @@ export class Spider {
|
|
|
331
340
|
}
|
|
332
341
|
this.running = true;
|
|
333
342
|
this.aborted = false;
|
|
343
|
+
this.abortController = new AbortController();
|
|
334
344
|
this.pendingCount = 0;
|
|
335
345
|
this.sitemapUrls = [];
|
|
336
346
|
this.sitemapUrlSet.clear();
|
|
337
347
|
this.robotsData = null;
|
|
338
348
|
this.sitemapValidation = null;
|
|
339
349
|
this.robotsValidation = null;
|
|
340
|
-
this.
|
|
341
|
-
|
|
350
|
+
if (!this.options.resume) {
|
|
351
|
+
this.blockedDomains.clear();
|
|
352
|
+
this.domainStates.clear();
|
|
353
|
+
}
|
|
354
|
+
if (this.options.resume && this.options.deduplicateContent) {
|
|
355
|
+
const existingResults = await this.crawlStorage.getResults();
|
|
356
|
+
for (const r of existingResults) {
|
|
357
|
+
if (r.contentHash) {
|
|
358
|
+
this.contentHashes.set(r.contentHash, r.url);
|
|
359
|
+
}
|
|
360
|
+
}
|
|
361
|
+
}
|
|
342
362
|
if (this.options.transport !== 'undici') {
|
|
343
363
|
this.curlAvailable = await hasImpersonate();
|
|
344
364
|
if (this.options.transport === 'auto' && this.curlAvailable && isProtectedDomain(this.baseHost)) {
|
|
@@ -354,10 +374,12 @@ export class Spider {
|
|
|
354
374
|
const pending = new Map();
|
|
355
375
|
const scheduleUrl = async (item) => {
|
|
356
376
|
const normalized = normalizeUrl(item.url);
|
|
357
|
-
if (await this.crawlQueue.hasVisited(normalized))
|
|
358
|
-
return;
|
|
359
377
|
if (pending.has(normalized))
|
|
360
378
|
return;
|
|
379
|
+
if (await this.crawlQueue.hasVisited(normalized))
|
|
380
|
+
return;
|
|
381
|
+
await this.crawlQueue.markVisited(normalized);
|
|
382
|
+
this._visitedCount++;
|
|
361
383
|
if (item.depth > this.options.maxDepth)
|
|
362
384
|
return;
|
|
363
385
|
if (this._resultCount + pending.size >= this.options.maxPages)
|
|
@@ -373,8 +395,6 @@ export class Spider {
|
|
|
373
395
|
return;
|
|
374
396
|
}
|
|
375
397
|
}
|
|
376
|
-
await this.crawlQueue.markVisited(normalized);
|
|
377
|
-
this._visitedCount++;
|
|
378
398
|
this.pendingCount++;
|
|
379
399
|
const promise = this.pool.run(() => this.crawlPage({ ...item, url: normalized }))
|
|
380
400
|
.finally(() => {
|
|
@@ -416,14 +436,33 @@ export class Spider {
|
|
|
416
436
|
if (pending.size > 0) {
|
|
417
437
|
await Promise.all(pending.values());
|
|
418
438
|
}
|
|
439
|
+
while (!this.aborted && this._resultCount < this.options.maxPages) {
|
|
440
|
+
const remaining = await this.crawlQueue.size();
|
|
441
|
+
if (remaining === 0 && pending.size === 0)
|
|
442
|
+
break;
|
|
443
|
+
this._queueSize = remaining;
|
|
444
|
+
let nextItem = await this.crawlQueue.pop();
|
|
445
|
+
while (nextItem && !this.aborted) {
|
|
446
|
+
this._queueSize = Math.max(0, this._queueSize - 1);
|
|
447
|
+
if (this._resultCount + pending.size >= this.options.maxPages)
|
|
448
|
+
break;
|
|
449
|
+
await scheduleUrl(nextItem);
|
|
450
|
+
nextItem = await this.crawlQueue.pop();
|
|
451
|
+
}
|
|
452
|
+
if (pending.size > 0) {
|
|
453
|
+
await Promise.all(pending.values());
|
|
454
|
+
}
|
|
455
|
+
}
|
|
419
456
|
this.running = false;
|
|
420
457
|
const pages = await this.crawlStorage.getResults();
|
|
421
458
|
const errors = await this.crawlStorage.getErrors();
|
|
422
459
|
const sitemapAnalysis = this.buildSitemapAnalysis(pages);
|
|
423
460
|
const robotsAnalysis = this.buildRobotsAnalysis();
|
|
424
461
|
const visited = this.crawlQueue instanceof InMemoryCrawlQueue
|
|
425
|
-
? this.crawlQueue.getVisited()
|
|
462
|
+
? new Set(this.crawlQueue.getVisited())
|
|
426
463
|
: new Set(pages.map(r => r.url));
|
|
464
|
+
await this.crawlQueue.close?.();
|
|
465
|
+
await this.crawlStorage.close?.();
|
|
427
466
|
return {
|
|
428
467
|
startUrl: normalizedStart,
|
|
429
468
|
pages,
|
|
@@ -615,6 +654,8 @@ export class Spider {
|
|
|
615
654
|
let lastRetryAfterMs = 0;
|
|
616
655
|
const proxyUrl = this.proxyAdapter ? await this.proxyAdapter.getProxy() : null;
|
|
617
656
|
const executeRequest = async (useCurl) => {
|
|
657
|
+
if (this.aborted)
|
|
658
|
+
throw new Error('Crawl aborted');
|
|
618
659
|
if (useCurl && this.curlTransport) {
|
|
619
660
|
const curlForRequest = proxyUrl
|
|
620
661
|
? new CurlTransport(proxyUrl)
|
|
@@ -655,6 +696,7 @@ export class Spider {
|
|
|
655
696
|
const clientForRequest = this.getClientForProxy(proxyUrl);
|
|
656
697
|
const response = await clientForRequest.get(url, {
|
|
657
698
|
headers: this.buildRequestHeaders(url, false),
|
|
699
|
+
signal: this.abortController.signal,
|
|
658
700
|
beforeRedirect: this.options.onRedirect
|
|
659
701
|
? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
|
|
660
702
|
: undefined,
|
|
@@ -792,7 +834,9 @@ export class Spider {
|
|
|
792
834
|
timings,
|
|
793
835
|
});
|
|
794
836
|
}
|
|
795
|
-
|
|
837
|
+
if (this.aborted)
|
|
838
|
+
break;
|
|
839
|
+
await sleep(waitMs, this.abortController.signal);
|
|
796
840
|
continue;
|
|
797
841
|
}
|
|
798
842
|
catch (error) {
|
|
@@ -823,8 +867,10 @@ export class Spider {
|
|
|
823
867
|
forcedTransport = 'curl';
|
|
824
868
|
}
|
|
825
869
|
}
|
|
870
|
+
if (this.aborted)
|
|
871
|
+
break;
|
|
826
872
|
const waitMs = this.getRetryWait(hostname, attempt + 1);
|
|
827
|
-
await sleep(waitMs);
|
|
873
|
+
await sleep(waitMs, this.abortController.signal);
|
|
828
874
|
}
|
|
829
875
|
}
|
|
830
876
|
if (lastResponse) {
|
|
@@ -1059,6 +1105,8 @@ export class Spider {
|
|
|
1059
1105
|
}
|
|
1060
1106
|
}
|
|
1061
1107
|
catch (error) {
|
|
1108
|
+
if (this.aborted)
|
|
1109
|
+
return;
|
|
1062
1110
|
const errAttempts = typeof error?.attempts === 'number' ? error.attempts : 1;
|
|
1063
1111
|
const errRetryCount = typeof error?.retryCount === 'number' ? error.retryCount : Math.max(0, errAttempts - 1);
|
|
1064
1112
|
const errRetryAfterMs = typeof error?.retryAfterMs === 'number' ? error?.retryAfterMs : 0;
|
|
@@ -1248,7 +1296,7 @@ export class Spider {
|
|
|
1248
1296
|
const now = Date.now();
|
|
1249
1297
|
const delay = Math.max(state.penaltyUntil, state.challengeCooldownUntil) - now;
|
|
1250
1298
|
if (delay > 0) {
|
|
1251
|
-
await sleep(delay);
|
|
1299
|
+
await sleep(delay, this.abortController.signal);
|
|
1252
1300
|
}
|
|
1253
1301
|
}
|
|
1254
1302
|
registerDomainBlock(hostname) {
|
|
@@ -1319,6 +1367,7 @@ export class Spider {
|
|
|
1319
1367
|
}
|
|
1320
1368
|
abort() {
|
|
1321
1369
|
this.aborted = true;
|
|
1370
|
+
this.abortController.abort();
|
|
1322
1371
|
}
|
|
1323
1372
|
isRunning() {
|
|
1324
1373
|
return this.running;
|
package/dist/seo/seo-spider.js
CHANGED
|
@@ -56,7 +56,7 @@ export class SeoSpider {
|
|
|
56
56
|
});
|
|
57
57
|
}
|
|
58
58
|
async analyzePageDuringCrawl(pageResult, html) {
|
|
59
|
-
if (pageResult.
|
|
59
|
+
if (pageResult.status >= 400 || !html) {
|
|
60
60
|
const seoPage = { ...pageResult, seoReport: undefined };
|
|
61
61
|
this.seoPages.push(seoPage);
|
|
62
62
|
return;
|
|
@@ -517,6 +517,12 @@ export function detectBlock(response, body) {
|
|
|
517
517
|
description: 'DataDome headers detected',
|
|
518
518
|
});
|
|
519
519
|
}
|
|
520
|
+
if (response.status === 200 &&
|
|
521
|
+
results.length === 0 &&
|
|
522
|
+
!location &&
|
|
523
|
+
!body) {
|
|
524
|
+
return { blocked: false, confidence: 0 };
|
|
525
|
+
}
|
|
520
526
|
if (body) {
|
|
521
527
|
const isLongBody = body.length > 100_000;
|
|
522
528
|
const checkBody = isLongBody ? body.slice(0, 30_000) : body;
|
|
@@ -588,6 +594,12 @@ export function detectCaptcha(response, body) {
|
|
|
588
594
|
addMatch(matches, 'cloudflare', 0.7, 'Cloudflare challenge headers detected');
|
|
589
595
|
}
|
|
590
596
|
}
|
|
597
|
+
if (matches.length === 0 &&
|
|
598
|
+
response.status >= 200 && response.status < 300 &&
|
|
599
|
+
!location &&
|
|
600
|
+
!body) {
|
|
601
|
+
return { detected: false, confidence: 0 };
|
|
602
|
+
}
|
|
591
603
|
if (body) {
|
|
592
604
|
const checkBody = body.length < 120_000 ? body : body.slice(0, 120_000);
|
|
593
605
|
const isTinyBody = checkBody.length > 0 && checkBody.length < 12000;
|
package/dist/version.js
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "recker",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.95",
|
|
4
4
|
"description": "Multi-Protocol SDK for the AI Era - HTTP, WebSocket, DNS, FTP, SFTP, Telnet, HLS unified with AI providers and MCP tools",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"types": "./dist/index.d.ts",
|