recker 1.0.94 → 1.0.95-next.0a5359d
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/scrape/crawl-queue.d.ts +4 -0
- package/dist/browser/scrape/crawl-queue.js +37 -4
- package/dist/browser/scrape/spider.d.ts +11 -1
- package/dist/browser/scrape/spider.js +118 -20
- package/dist/browser/seo/seo-spider.js +9 -5
- package/dist/browser/seo/validators/rss.js +4 -2
- package/dist/browser/utils/block-detector.js +121 -56
- package/dist/scrape/crawl-queue.d.ts +4 -0
- package/dist/scrape/crawl-queue.js +37 -4
- package/dist/scrape/spider.d.ts +11 -1
- package/dist/scrape/spider.js +118 -20
- package/dist/seo/seo-spider.js +9 -5
- package/dist/seo/validators/rss.js +4 -2
- package/dist/utils/block-detector.js +121 -56
- package/dist/version.js +1 -1
- package/package.json +1 -1
|
@@ -17,7 +17,11 @@ export interface CrawlQueueAdapter {
|
|
|
17
17
|
}
|
|
18
18
|
export declare class InMemoryCrawlQueue implements CrawlQueueAdapter {
|
|
19
19
|
private queue;
|
|
20
|
+
private head;
|
|
21
|
+
private tail;
|
|
20
22
|
private visited;
|
|
23
|
+
private mode;
|
|
24
|
+
constructor(mode?: 'fifo' | 'lifo');
|
|
21
25
|
push(item: CrawlQueueItem): Promise<void>;
|
|
22
26
|
pushBatch(items: CrawlQueueItem[]): Promise<void>;
|
|
23
27
|
pop(): Promise<CrawlQueueItem | null>;
|
|
@@ -1,14 +1,45 @@
|
|
|
1
1
|
export class InMemoryCrawlQueue {
|
|
2
2
|
queue = [];
|
|
3
|
+
head = 0;
|
|
4
|
+
tail = 0;
|
|
3
5
|
visited = new Set();
|
|
6
|
+
mode;
|
|
7
|
+
constructor(mode = 'fifo') {
|
|
8
|
+
this.mode = mode;
|
|
9
|
+
}
|
|
4
10
|
async push(item) {
|
|
5
|
-
this.queue.
|
|
11
|
+
this.queue[this.tail++] = item;
|
|
6
12
|
}
|
|
7
13
|
async pushBatch(items) {
|
|
8
|
-
|
|
14
|
+
for (const item of items) {
|
|
15
|
+
this.queue[this.tail++] = item;
|
|
16
|
+
}
|
|
9
17
|
}
|
|
10
18
|
async pop() {
|
|
11
|
-
|
|
19
|
+
if (this.mode === 'lifo') {
|
|
20
|
+
while (this.tail > this.head) {
|
|
21
|
+
this.tail--;
|
|
22
|
+
const item = this.queue[this.tail];
|
|
23
|
+
this.queue[this.tail] = undefined;
|
|
24
|
+
if (item)
|
|
25
|
+
return item;
|
|
26
|
+
}
|
|
27
|
+
return null;
|
|
28
|
+
}
|
|
29
|
+
while (this.head < this.tail) {
|
|
30
|
+
const item = this.queue[this.head];
|
|
31
|
+
this.queue[this.head] = undefined;
|
|
32
|
+
this.head++;
|
|
33
|
+
if (item) {
|
|
34
|
+
if (this.head > 1024 && this.head > this.tail / 2) {
|
|
35
|
+
this.queue = this.queue.slice(this.head);
|
|
36
|
+
this.tail -= this.head;
|
|
37
|
+
this.head = 0;
|
|
38
|
+
}
|
|
39
|
+
return item;
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
return null;
|
|
12
43
|
}
|
|
13
44
|
async hasVisited(url) {
|
|
14
45
|
return this.visited.has(url);
|
|
@@ -25,10 +56,12 @@ export class InMemoryCrawlQueue {
|
|
|
25
56
|
this.visited.add(url);
|
|
26
57
|
}
|
|
27
58
|
async size() {
|
|
28
|
-
return this.
|
|
59
|
+
return this.tail - this.head;
|
|
29
60
|
}
|
|
30
61
|
async clear() {
|
|
31
62
|
this.queue = [];
|
|
63
|
+
this.head = 0;
|
|
64
|
+
this.tail = 0;
|
|
32
65
|
this.visited.clear();
|
|
33
66
|
}
|
|
34
67
|
async close() {
|
|
@@ -11,7 +11,7 @@ type CaptchaProvider = CaptchaDetectionResult['provider'];
|
|
|
11
11
|
export interface SpiderOptions {
|
|
12
12
|
maxDepth?: number;
|
|
13
13
|
maxPages?: number;
|
|
14
|
-
sameDomain?: boolean;
|
|
14
|
+
sameDomain?: boolean | 'exact' | 'subdomain';
|
|
15
15
|
concurrency?: number;
|
|
16
16
|
timeout?: number;
|
|
17
17
|
delay?: number;
|
|
@@ -63,8 +63,14 @@ export interface SpiderOptions {
|
|
|
63
63
|
domainRateLimit?: {
|
|
64
64
|
maxPerSecond?: number;
|
|
65
65
|
};
|
|
66
|
+
autoThrottle?: boolean | {
|
|
67
|
+
targetMs?: number;
|
|
68
|
+
minDelay?: number;
|
|
69
|
+
maxDelay?: number;
|
|
70
|
+
};
|
|
66
71
|
deduplicateContent?: boolean;
|
|
67
72
|
resume?: boolean;
|
|
73
|
+
strategy?: 'bfs' | 'dfs';
|
|
68
74
|
crawlQueue?: CrawlQueueAdapter;
|
|
69
75
|
crawlStorage?: CrawlStorageAdapter;
|
|
70
76
|
}
|
|
@@ -190,11 +196,14 @@ export declare class Spider {
|
|
|
190
196
|
private _queueSize;
|
|
191
197
|
private _resultCount;
|
|
192
198
|
private baseHost;
|
|
199
|
+
private baseRootDomain;
|
|
193
200
|
private running;
|
|
194
201
|
private aborted;
|
|
202
|
+
private abortController;
|
|
195
203
|
private pendingCount;
|
|
196
204
|
private domainRequestTimestamps;
|
|
197
205
|
private contentHashes;
|
|
206
|
+
private domainAvgResponseTime;
|
|
198
207
|
private blockedDomains;
|
|
199
208
|
private curlTransport;
|
|
200
209
|
private curlAvailable;
|
|
@@ -223,6 +232,7 @@ export declare class Spider {
|
|
|
223
232
|
private waitForDomainPenalty;
|
|
224
233
|
private registerDomainBlock;
|
|
225
234
|
private registerDomainSuccess;
|
|
235
|
+
private updateAutoThrottle;
|
|
226
236
|
private getCaptchaRetryMultiplier;
|
|
227
237
|
private registerDomainChallenge;
|
|
228
238
|
private getRetryWait;
|
|
@@ -40,10 +40,15 @@ const SEC_CH_PLATFORM_HINTS = [
|
|
|
40
40
|
function getHostname(url) {
|
|
41
41
|
return new URL(url).hostname;
|
|
42
42
|
}
|
|
43
|
-
function sleep(ms) {
|
|
43
|
+
function sleep(ms, signal) {
|
|
44
44
|
if (ms <= 0)
|
|
45
45
|
return Promise.resolve();
|
|
46
|
-
|
|
46
|
+
if (signal?.aborted)
|
|
47
|
+
return Promise.resolve();
|
|
48
|
+
return new Promise(resolve => {
|
|
49
|
+
const timer = setTimeout(resolve, ms);
|
|
50
|
+
signal?.addEventListener('abort', () => { clearTimeout(timer); resolve(); }, { once: true });
|
|
51
|
+
});
|
|
47
52
|
}
|
|
48
53
|
function getRetryAfterDelay(response) {
|
|
49
54
|
const retryAfter = response.headers.get('retry-after');
|
|
@@ -103,14 +108,30 @@ function normalizeUrl(urlStr) {
|
|
|
103
108
|
return urlStr;
|
|
104
109
|
}
|
|
105
110
|
}
|
|
106
|
-
function
|
|
111
|
+
function getRootDomain(hostname) {
|
|
112
|
+
const parts = hostname.replace(/^www\./, '').split('.');
|
|
113
|
+
if (parts.length >= 3 && parts[parts.length - 2].length <= 3) {
|
|
114
|
+
return parts.slice(-3).join('.');
|
|
115
|
+
}
|
|
116
|
+
return parts.slice(-2).join('.');
|
|
117
|
+
}
|
|
118
|
+
function shouldCrawl(url, baseHost, options, baseRootDomain) {
|
|
107
119
|
try {
|
|
108
120
|
const parsed = new URL(url);
|
|
109
121
|
if (!['http:', 'https:'].includes(parsed.protocol)) {
|
|
110
122
|
return false;
|
|
111
123
|
}
|
|
112
|
-
|
|
113
|
-
|
|
124
|
+
const hostname = parsed.hostname.replace(/^www\./, '');
|
|
125
|
+
const sameDomain = options.sameDomain;
|
|
126
|
+
if (sameDomain === 'subdomain') {
|
|
127
|
+
const pageRoot = getRootDomain(hostname);
|
|
128
|
+
const rootDomain = baseRootDomain ?? getRootDomain(baseHost);
|
|
129
|
+
if (pageRoot !== rootDomain)
|
|
130
|
+
return false;
|
|
131
|
+
}
|
|
132
|
+
else if (sameDomain !== false) {
|
|
133
|
+
if (hostname !== baseHost)
|
|
134
|
+
return false;
|
|
114
135
|
}
|
|
115
136
|
const skipExtensions = [
|
|
116
137
|
'.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico',
|
|
@@ -187,11 +208,14 @@ export class Spider {
|
|
|
187
208
|
_queueSize = 0;
|
|
188
209
|
_resultCount = 0;
|
|
189
210
|
baseHost = '';
|
|
211
|
+
baseRootDomain = '';
|
|
190
212
|
running = false;
|
|
191
213
|
aborted = false;
|
|
214
|
+
abortController = new AbortController();
|
|
192
215
|
pendingCount = 0;
|
|
193
216
|
domainRequestTimestamps = new Map();
|
|
194
217
|
contentHashes = new Map();
|
|
218
|
+
domainAvgResponseTime = new Map();
|
|
195
219
|
blockedDomains = new Set();
|
|
196
220
|
curlTransport = null;
|
|
197
221
|
curlAvailable = false;
|
|
@@ -218,7 +242,9 @@ export class Spider {
|
|
|
218
242
|
if (timestamps.length >= limit) {
|
|
219
243
|
const waitMs = timestamps[0] + window - now;
|
|
220
244
|
if (waitMs > 0)
|
|
221
|
-
await sleep(waitMs);
|
|
245
|
+
await sleep(waitMs, this.abortController.signal);
|
|
246
|
+
if (this.aborted)
|
|
247
|
+
return;
|
|
222
248
|
const afterWait = Date.now();
|
|
223
249
|
while (timestamps.length > 0 && timestamps[0] <= afterWait - window) {
|
|
224
250
|
timestamps.shift();
|
|
@@ -276,8 +302,10 @@ export class Spider {
|
|
|
276
302
|
extract: extractSchema,
|
|
277
303
|
parserOptions: options.parserOptions,
|
|
278
304
|
domainRateLimit: options.domainRateLimit,
|
|
305
|
+
autoThrottle: options.autoThrottle ?? false,
|
|
279
306
|
deduplicateContent: options.deduplicateContent ?? false,
|
|
280
307
|
resume: options.resume ?? false,
|
|
308
|
+
strategy: options.strategy ?? 'bfs',
|
|
281
309
|
};
|
|
282
310
|
if (options.proxy) {
|
|
283
311
|
if (typeof options.proxy === 'string') {
|
|
@@ -307,7 +335,7 @@ export class Spider {
|
|
|
307
335
|
interval: this.options.delay,
|
|
308
336
|
} : {}),
|
|
309
337
|
});
|
|
310
|
-
this.crawlQueue = options.crawlQueue ?? new InMemoryCrawlQueue();
|
|
338
|
+
this.crawlQueue = options.crawlQueue ?? new InMemoryCrawlQueue(this.options.strategy === 'dfs' ? 'lifo' : 'fifo');
|
|
311
339
|
this.crawlStorage = options.crawlStorage ?? new InMemoryCrawlStorage();
|
|
312
340
|
}
|
|
313
341
|
async crawl(startUrl) {
|
|
@@ -315,7 +343,8 @@ export class Spider {
|
|
|
315
343
|
const startTimestamp = Date.now();
|
|
316
344
|
const normalizedStart = normalizeUrl(startUrl);
|
|
317
345
|
const baseUrl = new URL(normalizedStart).origin;
|
|
318
|
-
this.baseHost = new URL(normalizedStart).hostname;
|
|
346
|
+
this.baseHost = new URL(normalizedStart).hostname.replace(/^www\./, '');
|
|
347
|
+
this.baseRootDomain = getRootDomain(this.baseHost);
|
|
319
348
|
if (!this.options.resume) {
|
|
320
349
|
await this.crawlQueue.clear();
|
|
321
350
|
await this.crawlStorage.clear();
|
|
@@ -324,6 +353,7 @@ export class Spider {
|
|
|
324
353
|
this._resultCount = 0;
|
|
325
354
|
this.domainRequestTimestamps.clear();
|
|
326
355
|
this.contentHashes.clear();
|
|
356
|
+
this.domainAvgResponseTime.clear();
|
|
327
357
|
}
|
|
328
358
|
else {
|
|
329
359
|
this._queueSize = await this.crawlQueue.size();
|
|
@@ -331,14 +361,25 @@ export class Spider {
|
|
|
331
361
|
}
|
|
332
362
|
this.running = true;
|
|
333
363
|
this.aborted = false;
|
|
364
|
+
this.abortController = new AbortController();
|
|
334
365
|
this.pendingCount = 0;
|
|
335
366
|
this.sitemapUrls = [];
|
|
336
367
|
this.sitemapUrlSet.clear();
|
|
337
368
|
this.robotsData = null;
|
|
338
369
|
this.sitemapValidation = null;
|
|
339
370
|
this.robotsValidation = null;
|
|
340
|
-
this.
|
|
341
|
-
|
|
371
|
+
if (!this.options.resume) {
|
|
372
|
+
this.blockedDomains.clear();
|
|
373
|
+
this.domainStates.clear();
|
|
374
|
+
}
|
|
375
|
+
if (this.options.resume && this.options.deduplicateContent) {
|
|
376
|
+
const existingResults = await this.crawlStorage.getResults();
|
|
377
|
+
for (const r of existingResults) {
|
|
378
|
+
if (r.contentHash) {
|
|
379
|
+
this.contentHashes.set(r.contentHash, r.url);
|
|
380
|
+
}
|
|
381
|
+
}
|
|
382
|
+
}
|
|
342
383
|
if (this.options.transport !== 'undici') {
|
|
343
384
|
this.curlAvailable = await hasImpersonate();
|
|
344
385
|
if (this.options.transport === 'auto' && this.curlAvailable && isProtectedDomain(this.baseHost)) {
|
|
@@ -354,10 +395,12 @@ export class Spider {
|
|
|
354
395
|
const pending = new Map();
|
|
355
396
|
const scheduleUrl = async (item) => {
|
|
356
397
|
const normalized = normalizeUrl(item.url);
|
|
357
|
-
if (await this.crawlQueue.hasVisited(normalized))
|
|
358
|
-
return;
|
|
359
398
|
if (pending.has(normalized))
|
|
360
399
|
return;
|
|
400
|
+
if (await this.crawlQueue.hasVisited(normalized))
|
|
401
|
+
return;
|
|
402
|
+
await this.crawlQueue.markVisited(normalized);
|
|
403
|
+
this._visitedCount++;
|
|
361
404
|
if (item.depth > this.options.maxDepth)
|
|
362
405
|
return;
|
|
363
406
|
if (this._resultCount + pending.size >= this.options.maxPages)
|
|
@@ -373,8 +416,6 @@ export class Spider {
|
|
|
373
416
|
return;
|
|
374
417
|
}
|
|
375
418
|
}
|
|
376
|
-
await this.crawlQueue.markVisited(normalized);
|
|
377
|
-
this._visitedCount++;
|
|
378
419
|
this.pendingCount++;
|
|
379
420
|
const promise = this.pool.run(() => this.crawlPage({ ...item, url: normalized }))
|
|
380
421
|
.finally(() => {
|
|
@@ -416,14 +457,41 @@ export class Spider {
|
|
|
416
457
|
if (pending.size > 0) {
|
|
417
458
|
await Promise.all(pending.values());
|
|
418
459
|
}
|
|
460
|
+
while (!this.aborted && this._resultCount < this.options.maxPages) {
|
|
461
|
+
const remaining = await this.crawlQueue.size();
|
|
462
|
+
if (remaining === 0 && pending.size === 0)
|
|
463
|
+
break;
|
|
464
|
+
this._queueSize = remaining;
|
|
465
|
+
let nextItem = await this.crawlQueue.pop();
|
|
466
|
+
while (nextItem && !this.aborted) {
|
|
467
|
+
this._queueSize = Math.max(0, this._queueSize - 1);
|
|
468
|
+
if (this._resultCount + pending.size >= this.options.maxPages)
|
|
469
|
+
break;
|
|
470
|
+
await scheduleUrl(nextItem);
|
|
471
|
+
nextItem = await this.crawlQueue.pop();
|
|
472
|
+
}
|
|
473
|
+
if (pending.size > 0) {
|
|
474
|
+
await Promise.all(pending.values());
|
|
475
|
+
}
|
|
476
|
+
}
|
|
419
477
|
this.running = false;
|
|
420
478
|
const pages = await this.crawlStorage.getResults();
|
|
421
479
|
const errors = await this.crawlStorage.getErrors();
|
|
422
480
|
const sitemapAnalysis = this.buildSitemapAnalysis(pages);
|
|
423
481
|
const robotsAnalysis = this.buildRobotsAnalysis();
|
|
424
482
|
const visited = this.crawlQueue instanceof InMemoryCrawlQueue
|
|
425
|
-
? this.crawlQueue.getVisited()
|
|
483
|
+
? new Set(this.crawlQueue.getVisited())
|
|
426
484
|
: new Set(pages.map(r => r.url));
|
|
485
|
+
await this.crawlQueue.close?.();
|
|
486
|
+
await this.crawlStorage.close?.();
|
|
487
|
+
for (const client of this.proxyClients.values()) {
|
|
488
|
+
if (typeof client.destroy === 'function') {
|
|
489
|
+
client.destroy();
|
|
490
|
+
}
|
|
491
|
+
}
|
|
492
|
+
this.proxyClients.clear();
|
|
493
|
+
await this.proxyAdapter?.close?.();
|
|
494
|
+
this.domainAvgResponseTime.clear();
|
|
427
495
|
return {
|
|
428
496
|
startUrl: normalizedStart,
|
|
429
497
|
pages,
|
|
@@ -615,6 +683,8 @@ export class Spider {
|
|
|
615
683
|
let lastRetryAfterMs = 0;
|
|
616
684
|
const proxyUrl = this.proxyAdapter ? await this.proxyAdapter.getProxy() : null;
|
|
617
685
|
const executeRequest = async (useCurl) => {
|
|
686
|
+
if (this.aborted)
|
|
687
|
+
throw new Error('Crawl aborted');
|
|
618
688
|
if (useCurl && this.curlTransport) {
|
|
619
689
|
const curlForRequest = proxyUrl
|
|
620
690
|
? new CurlTransport(proxyUrl)
|
|
@@ -655,6 +725,7 @@ export class Spider {
|
|
|
655
725
|
const clientForRequest = this.getClientForProxy(proxyUrl);
|
|
656
726
|
const response = await clientForRequest.get(url, {
|
|
657
727
|
headers: this.buildRequestHeaders(url, false),
|
|
728
|
+
signal: this.abortController.signal,
|
|
658
729
|
beforeRedirect: this.options.onRedirect
|
|
659
730
|
? (info) => { this.options.onRedirect({ from: info.from, to: info.to, status: info.status }); }
|
|
660
731
|
: undefined,
|
|
@@ -746,6 +817,9 @@ export class Spider {
|
|
|
746
817
|
const isHighQualitySuccess = response.status < 400 && !detection.blocked && !hasCaptchaSignal;
|
|
747
818
|
if (isHighQualitySuccess) {
|
|
748
819
|
this.registerDomainSuccess(hostname);
|
|
820
|
+
if (timings?.total) {
|
|
821
|
+
this.updateAutoThrottle(hostname, timings.total);
|
|
822
|
+
}
|
|
749
823
|
}
|
|
750
824
|
if (!shouldRetry || attempt === maxAttempts - 1) {
|
|
751
825
|
if (proxyUrl && this.proxyAdapter?.reportResult) {
|
|
@@ -792,7 +866,9 @@ export class Spider {
|
|
|
792
866
|
timings,
|
|
793
867
|
});
|
|
794
868
|
}
|
|
795
|
-
|
|
869
|
+
if (this.aborted)
|
|
870
|
+
break;
|
|
871
|
+
await sleep(waitMs, this.abortController.signal);
|
|
796
872
|
continue;
|
|
797
873
|
}
|
|
798
874
|
catch (error) {
|
|
@@ -823,8 +899,10 @@ export class Spider {
|
|
|
823
899
|
forcedTransport = 'curl';
|
|
824
900
|
}
|
|
825
901
|
}
|
|
902
|
+
if (this.aborted)
|
|
903
|
+
break;
|
|
826
904
|
const waitMs = this.getRetryWait(hostname, attempt + 1);
|
|
827
|
-
await sleep(waitMs);
|
|
905
|
+
await sleep(waitMs, this.abortController.signal);
|
|
828
906
|
}
|
|
829
907
|
}
|
|
830
908
|
if (lastResponse) {
|
|
@@ -1035,7 +1113,7 @@ export class Spider {
|
|
|
1035
1113
|
if (!link.href)
|
|
1036
1114
|
continue;
|
|
1037
1115
|
const normalized = normalizeUrl(link.href);
|
|
1038
|
-
if (!shouldCrawl(normalized, this.baseHost, this.options))
|
|
1116
|
+
if (!shouldCrawl(normalized, this.baseHost, this.options, this.baseRootDomain))
|
|
1039
1117
|
continue;
|
|
1040
1118
|
candidateUrls.push(normalized);
|
|
1041
1119
|
candidates.push({ url: normalized, depth: item.depth + 1 });
|
|
@@ -1059,6 +1137,8 @@ export class Spider {
|
|
|
1059
1137
|
}
|
|
1060
1138
|
}
|
|
1061
1139
|
catch (error) {
|
|
1140
|
+
if (this.aborted)
|
|
1141
|
+
return;
|
|
1062
1142
|
const errAttempts = typeof error?.attempts === 'number' ? error.attempts : 1;
|
|
1063
1143
|
const errRetryCount = typeof error?.retryCount === 'number' ? error.retryCount : Math.max(0, errAttempts - 1);
|
|
1064
1144
|
const errRetryAfterMs = typeof error?.retryAfterMs === 'number' ? error?.retryAfterMs : 0;
|
|
@@ -1144,6 +1224,7 @@ export class Spider {
|
|
|
1144
1224
|
consecutiveUndiciFailures: 0,
|
|
1145
1225
|
lastTransport: 'undici',
|
|
1146
1226
|
lastCaptchaConfidence: 0,
|
|
1227
|
+
autoThrottleDelay: 0,
|
|
1147
1228
|
};
|
|
1148
1229
|
this.domainStates.set(hostname, next);
|
|
1149
1230
|
return next;
|
|
@@ -1246,9 +1327,11 @@ export class Spider {
|
|
|
1246
1327
|
async waitForDomainPenalty(hostname) {
|
|
1247
1328
|
const state = this.getOrCreateDomainState(hostname);
|
|
1248
1329
|
const now = Date.now();
|
|
1249
|
-
const
|
|
1330
|
+
const penaltyDelay = Math.max(state.penaltyUntil, state.challengeCooldownUntil) - now;
|
|
1331
|
+
const throttleDelay = state.autoThrottleDelay ?? 0;
|
|
1332
|
+
const delay = Math.max(penaltyDelay, throttleDelay);
|
|
1250
1333
|
if (delay > 0) {
|
|
1251
|
-
await sleep(delay);
|
|
1334
|
+
await sleep(delay, this.abortController.signal);
|
|
1252
1335
|
}
|
|
1253
1336
|
}
|
|
1254
1337
|
registerDomainBlock(hostname) {
|
|
@@ -1274,6 +1357,20 @@ export class Spider {
|
|
|
1274
1357
|
state.lastCaptchaProvider = undefined;
|
|
1275
1358
|
}
|
|
1276
1359
|
}
|
|
1360
|
+
updateAutoThrottle(hostname, responseTimeMs) {
|
|
1361
|
+
const config = this.options.autoThrottle;
|
|
1362
|
+
if (!config)
|
|
1363
|
+
return;
|
|
1364
|
+
const target = (typeof config === 'object' ? config.targetMs : undefined) ?? 500;
|
|
1365
|
+
const minDelay = (typeof config === 'object' ? config.minDelay : undefined) ?? 50;
|
|
1366
|
+
const maxDelay = (typeof config === 'object' ? config.maxDelay : undefined) ?? 30000;
|
|
1367
|
+
const prev = this.domainAvgResponseTime.get(hostname) ?? responseTimeMs;
|
|
1368
|
+
const avg = prev * 0.7 + responseTimeMs * 0.3;
|
|
1369
|
+
this.domainAvgResponseTime.set(hostname, avg);
|
|
1370
|
+
const ratio = avg / target;
|
|
1371
|
+
const state = this.getOrCreateDomainState(hostname);
|
|
1372
|
+
state.autoThrottleDelay = Math.max(minDelay, Math.min(maxDelay, Math.round(this.options.delay * ratio)));
|
|
1373
|
+
}
|
|
1277
1374
|
getCaptchaRetryMultiplier(provider) {
|
|
1278
1375
|
if (!provider)
|
|
1279
1376
|
return 1.2;
|
|
@@ -1319,6 +1416,7 @@ export class Spider {
|
|
|
1319
1416
|
}
|
|
1320
1417
|
abort() {
|
|
1321
1418
|
this.aborted = true;
|
|
1419
|
+
this.abortController.abort();
|
|
1322
1420
|
}
|
|
1323
1421
|
isRunning() {
|
|
1324
1422
|
return this.running;
|
|
@@ -56,7 +56,7 @@ export class SeoSpider {
|
|
|
56
56
|
});
|
|
57
57
|
}
|
|
58
58
|
async analyzePageDuringCrawl(pageResult, html) {
|
|
59
|
-
if (pageResult.
|
|
59
|
+
if (pageResult.status >= 400 || !html) {
|
|
60
60
|
const seoPage = { ...pageResult, seoReport: undefined };
|
|
61
61
|
this.seoPages.push(seoPage);
|
|
62
62
|
return;
|
|
@@ -141,7 +141,8 @@ export class SeoSpider {
|
|
|
141
141
|
results.humans.content = await res.text();
|
|
142
142
|
}
|
|
143
143
|
}
|
|
144
|
-
catch {
|
|
144
|
+
catch {
|
|
145
|
+
}
|
|
145
146
|
try {
|
|
146
147
|
const res = await client.get(results.llms.url);
|
|
147
148
|
if (res.status === 200) {
|
|
@@ -149,7 +150,8 @@ export class SeoSpider {
|
|
|
149
150
|
results.llms.content = await res.text();
|
|
150
151
|
}
|
|
151
152
|
}
|
|
152
|
-
catch {
|
|
153
|
+
catch {
|
|
154
|
+
}
|
|
153
155
|
try {
|
|
154
156
|
const res = await client.get(results.sitemap.url);
|
|
155
157
|
if (res.status === 200) {
|
|
@@ -159,7 +161,8 @@ export class SeoSpider {
|
|
|
159
161
|
results.sitemap.urlCount = urlMatches ? urlMatches.length : 0;
|
|
160
162
|
}
|
|
161
163
|
}
|
|
162
|
-
catch {
|
|
164
|
+
catch {
|
|
165
|
+
}
|
|
163
166
|
try {
|
|
164
167
|
let res = await client.get(results.manifest.url);
|
|
165
168
|
if (res.status !== 200) {
|
|
@@ -181,7 +184,8 @@ export class SeoSpider {
|
|
|
181
184
|
}
|
|
182
185
|
}
|
|
183
186
|
}
|
|
184
|
-
catch {
|
|
187
|
+
catch {
|
|
188
|
+
}
|
|
185
189
|
return results;
|
|
186
190
|
}
|
|
187
191
|
catch {
|
|
@@ -25,7 +25,8 @@ export async function discoverFeeds(baseUrl, html) {
|
|
|
25
25
|
href = new URL(href, baseUrl).toString();
|
|
26
26
|
candidateUrls.add(href);
|
|
27
27
|
}
|
|
28
|
-
catch {
|
|
28
|
+
catch {
|
|
29
|
+
}
|
|
29
30
|
}
|
|
30
31
|
}
|
|
31
32
|
}
|
|
@@ -35,7 +36,8 @@ export async function discoverFeeds(baseUrl, html) {
|
|
|
35
36
|
const url = new URL(path, baseUrl).toString();
|
|
36
37
|
candidateUrls.add(url);
|
|
37
38
|
}
|
|
38
|
-
catch {
|
|
39
|
+
catch {
|
|
40
|
+
}
|
|
39
41
|
}
|
|
40
42
|
}
|
|
41
43
|
const client = createClient({ timeout: 8000 });
|