@crawlee/basic 4.0.0-beta.12 → 4.0.0-beta.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/index.d.ts +1 -0
- package/index.d.ts.map +1 -1
- package/internals/basic-crawler.d.ts +84 -28
- package/internals/basic-crawler.d.ts.map +1 -1
- package/internals/basic-crawler.js +249 -115
- package/internals/basic-crawler.js.map +1 -1
- package/internals/send-request.d.ts +1 -3
- package/internals/send-request.d.ts.map +1 -1
- package/internals/send-request.js +2 -4
- package/internals/send-request.js.map +1 -1
- package/package.json +5 -5
- package/tsconfig.build.tsbuildinfo +0 -1
@@ -1,7 +1,7 @@
 import { writeFile } from 'node:fs/promises';
 import { dirname } from 'node:path';
-import { AutoscaledPool, Configuration, ContextPipeline, ContextPipelineCleanupError, ContextPipelineInitializationError, ContextPipelineInterruptedError, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestHandlerError, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
-import { RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
+import { AutoscaledPool, Configuration, ContextPipeline, ContextPipelineCleanupError, ContextPipelineInitializationError, ContextPipelineInterruptedError, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestHandlerError, RequestListAdapter, RequestManagerTandem, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
+import { getObjectType, isAsyncIterable, isIterable, RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
 import { stringify } from 'csv-stringify/sync';
 import { ensureDir, writeJSON } from 'fs-extra/esm';
 import ow from 'ow';
@@ -103,6 +103,10 @@ export class BasicCrawler {
 * Only available if used by the crawler.
 */
 requestQueue;
+/**
+* The main request-handling component of the crawler. It's initialized during the crawler startup.
+*/
+requestManager;
 /**
 * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session|sessions}.
 * Only available if used by the crawler.
@@ -143,10 +147,12 @@
 requestHandlerTimeoutMillis;
 internalTimeoutMillis;
 maxRequestRetries;
+maxCrawlDepth;
 sameDomainDelayMillis;
 domainAccessedTime;
 maxSessionRotations;
-
+maxRequestsPerCrawl;
+handledRequestsCount = 0;
 statusMessageLoggingInterval;
 statusMessageCallback;
 sessionPoolOptions;
@@ -158,6 +164,8 @@
 respectRobotsTxtFile;
 onSkippedRequest;
 _closeEvents;
+shouldLogMaxProcessedRequestsExceeded = true;
+shouldLogMaxEnqueuedRequestsExceeded = true;
 experiments;
 robotsTxtFileCache;
 _experimentWarnings = {};
@@ -177,6 +185,7 @@
 sameDomainDelaySecs: ow.optional.number,
 maxSessionRotations: ow.optional.number,
 maxRequestsPerCrawl: ow.optional.number,
+maxCrawlDepth: ow.optional.number,
 autoscaledPoolOptions: ow.optional.object,
 sessionPoolOptions: ow.optional.object,
 useSessionPool: ow.optional.boolean,
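The hunk above registers a new `maxCrawlDepth` option in the validation shape; later hunks read it in `addRequests` and `enqueueLinks` and skip requests whose `crawlDepth` exceeds it (skip reason `'depth'`). A minimal usage sketch, assuming the option and the `request.crawlDepth` field are exposed publicly the way this diff suggests:

```ts
import { BasicCrawler } from '@crawlee/basic';

// Sketch only: `maxCrawlDepth` and `request.crawlDepth` are taken from this diff;
// the exact public typings may differ in the final 4.0 release.
const crawler = new BasicCrawler({
    maxCrawlDepth: 2, // requests deeper than this are skipped with reason 'depth'
    async requestHandler({ request, log }) {
        log.info(`Processing ${request.url} at depth ${request.crawlDepth ?? 0}`);
    },
});

await crawler.run(['https://example.com']);
```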
@@ -184,7 +193,7 @@
 statusMessageLoggingInterval: ow.optional.number,
 statusMessageCallback: ow.optional.function,
 retryOnBlocked: ow.optional.boolean,
-respectRobotsTxtFile: ow.optional.boolean,
+respectRobotsTxtFile: ow.optional.any(ow.boolean, ow.object),
 onSkippedRequest: ow.optional.function,
 httpClient: ow.optional.object,
 // AutoscaledPool shorthands
@@ -204,7 +213,7 @@
 config = Configuration.getGlobalConfig()) {
 this.config = config;
 ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape));
-const { requestList, requestQueue, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true, proxyConfiguration,
+const { requestList, requestQueue, requestManager, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, maxCrawlDepth, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true, proxyConfiguration,
 // AutoscaledPool shorthands
 minConcurrency, maxConcurrency, maxRequestsPerMinute, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
 // internal
@@ -228,6 +237,7 @@
 this.log.debug(message);
 request.noRetry = true;
 request.state = RequestState.SKIPPED;
+await this.handleSkippedRequest({ url: request.url, reason: 'redirect' });
 throw new ContextPipelineInterruptedError(message);
 }
 return context;
@@ -235,8 +245,17 @@
 });
 return contextPipeline;
 };
-
-
+if (requestManager !== undefined) {
+if (requestList !== undefined || requestQueue !== undefined) {
+throw new Error('The `requestManager` option cannot be used in conjunction with `requestList` and/or `requestQueue`');
+}
+this.requestManager = requestManager;
+this.requestQueue = requestManager; // TODO(v4) - the cast is not fully legitimate here, but it's fine for internal usage by the BasicCrawler
+}
+else {
+this.requestList = requestList;
+this.requestQueue = requestQueue;
+}
 this.httpClient = httpClient ?? new GotScrapingHttpClient();
 this.proxyConfiguration = proxyConfiguration;
 this.log = log;
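The constructor now accepts a `requestManager` that is mutually exclusive with `requestList`/`requestQueue`, as the hunk above shows. A sketch of passing an explicitly opened queue through the new option; the option name comes from this diff, the surrounding usage is assumed:

```ts
import { BasicCrawler } from '@crawlee/basic';
import { RequestQueue } from '@crawlee/core';

// Sketch based on this diff: the object passed as `requestManager` is used both as the
// request manager and as `this.requestQueue`; combining it with `requestList` or
// `requestQueue` throws, per the new constructor check.
const queue = await RequestQueue.open('my-named-queue');
await queue.addRequests([{ url: 'https://example.com' }]);

const crawler = new BasicCrawler({
    requestManager: queue,
    async requestHandler({ request, log }) {
        log.info(`Handling ${request.url}`);
    },
});

await crawler.run();
```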
@@ -246,6 +265,7 @@
 this.domainAccessedTime = new Map();
 this.experiments = experiments;
 this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });
+this.handleSkippedRequest = this.handleSkippedRequest.bind(this);
 this.requestHandler = requestHandler ?? this.router;
 this.failedRequestHandler = failedRequestHandler;
 this.errorHandler = errorHandler;
@@ -270,9 +290,9 @@
 this.requestQueue.requestLockSecs = Math.max(this.requestHandlerTimeoutMillis / 1000 + 5, 60);
 }
 this.maxRequestRetries = maxRequestRetries;
+this.maxCrawlDepth = maxCrawlDepth;
 this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
 this.maxSessionRotations = maxSessionRotations;
-this.handledRequestsCount = 0;
 this.stats = new Statistics({
 logMessage: `${log.getOptions().prefix} request statistics:`,
 log,
@@ -297,8 +317,8 @@
 this.requestHandlerTimeoutMillis = maxSignedInteger;
 }
 this.internalTimeoutMillis = Math.min(this.internalTimeoutMillis, maxSignedInteger);
-
-const isMaxPagesExceeded = () => maxRequestsPerCrawl && maxRequestsPerCrawl <= this.handledRequestsCount;
+this.maxRequestsPerCrawl = maxRequestsPerCrawl;
+const isMaxPagesExceeded = () => this.maxRequestsPerCrawl && this.maxRequestsPerCrawl <= this.handledRequestsCount;
 // eslint-disable-next-line prefer-const
 let { isFinishedFunction, isTaskReadyFunction } = autoscaledPoolOptions;
 // override even if `isFinishedFunction` provided by user - `keepAlive` has higher priority
@@ -312,10 +332,10 @@
 runTaskFunction: this._runTaskFunction.bind(this),
 isTaskReadyFunction: async () => {
 if (isMaxPagesExceeded()) {
-if (
+if (this.shouldLogMaxProcessedRequestsExceeded) {
 log.info('Crawler reached the maxRequestsPerCrawl limit of ' +
-`${maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
-
+`${this.maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
+this.shouldLogMaxProcessedRequestsExceeded = false;
 }
 return false;
 }
@@ -323,7 +343,7 @@
 },
 isFinishedFunction: async () => {
 if (isMaxPagesExceeded()) {
-log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${maxRequestsPerCrawl} requests ` +
+log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${this.maxRequestsPerCrawl} requests ` +
 'and all requests that were in progress at that time have now finished. ' +
 `In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`);
 return true;
@@ -383,7 +403,7 @@
 message = `Experiencing problems, ${this.stats.state.requestsFailed - previousState.requestsFailed || this.stats.state.requestsFailed} failed requests in the past ${this.statusMessageLoggingInterval} seconds.`;
 }
 else {
-const total = this.
+const total = this.requestManager?.getTotalCount();
 message = `Crawled ${this.stats.state.requestsFinished}${total ? `/${total}` : ''} pages, ${this.stats.state.requestsFailed} failed requests, desired concurrency ${this.autoscaledPool?.desiredConcurrency ?? 0}.`;
 }
 if (this.statusMessageCallback) {
@@ -423,20 +443,30 @@
 if (this.requestQueue?.name === 'default' && purgeRequestQueue) {
 await this.requestQueue.drop();
 this.requestQueue = await this._getRequestQueue();
+this.requestManager = undefined;
+await this.initializeRequestManager();
+this.handledRequestsCount = 0; // This would've been reset by this._init() further down below, but at that point `handledRequestsCount` could prevent `addRequests` from adding the initial requests
 }
 this.stats.reset();
 await this.stats.resetStore();
 await this.sessionPool?.resetStore();
 }
 this.running = true;
-
+this.shouldLogMaxProcessedRequestsExceeded = true;
+this.shouldLogMaxEnqueuedRequestsExceeded = true;
+await purgeDefaultStorages({
+onlyPurgeOnce: true,
+client: this.config.getStorageClient(),
+config: this.config,
+});
 if (requests) {
 await this.addRequests(requests, addRequestsOptions);
 }
 await this._init();
 await this.stats.startCapturing();
 const periodicLogger = this.getPeriodicLogger();
-
+// Don't await, we don't want to block the execution
+void this.setStatusMessage('Starting the crawler.', { level: 'INFO' });
 const sigintHandler = async () => {
 this.log.warning('Pausing... Press CTRL+C again to force exit. To resume, do: CRAWLEE_PURGE_ON_START=0 npm start');
 await this._pauseOnMigration();
@@ -485,7 +515,8 @@
 finished = true;
 }
 periodicLogger.stop();
-
+// Don't await, we don't want to block the execution
+void this.setStatusMessage(`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${this.stats.state.requestsFinished} succeeded, ${this.stats.state.requestsFailed} failed.`, { isStatusMessageTerminal: true, level: 'INFO' });
 this.running = false;
 this.hasFinishedBefore = true;
 }
@@ -495,6 +526,8 @@
 * Gracefully stops the current run of the crawler.
 *
 * All the tasks active at the time of calling this method will be allowed to finish.
+*
+* To stop the crawler immediately, use {@link BasicCrawler.teardown|`crawler.teardown()`} instead.
 */
 stop(message = 'The crawler has been gracefully stopped.') {
 // Gracefully starve the this.autoscaledPool, so it doesn't start new tasks. Resolves once the pool is cleared.
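The updated JSDoc above cross-references `teardown()` from `stop()`. A short sketch of the graceful variant; the stop-condition URL is purely illustrative:

```ts
import { BasicCrawler } from '@crawlee/basic';

// Sketch: `stop()` lets requests that are already running finish, while `teardown()`
// (documented at the end of this diff) shuts the crawler down immediately.
const crawler = new BasicCrawler({
    async requestHandler({ request, log }) {
        log.info(`Processing ${request.url}`);
        if (request.url.includes('/stop-signal')) {
            // hypothetical condition for ending the crawl early
            crawler.stop('Stop signal found, letting in-flight requests finish.');
        }
    },
});

await crawler.run(['https://example.com']);
```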
@@ -511,13 +544,46 @@
 if (!this.requestQueue && this.requestList) {
 this.log.warningOnce('When using RequestList and RequestQueue at the same time, you should instantiate both explicitly and provide them in the crawler options, to ensure correctly handled restarts of the crawler.');
 }
-this.requestQueue
+if (!this.requestQueue) {
+this.requestQueue = await this._getRequestQueue();
+this.requestManager = undefined;
+}
+if (!this.requestManager) {
+this.requestManager =
+this.requestList === undefined
+? this.requestQueue
+: new RequestManagerTandem(this.requestList, this.requestQueue);
+}
 return this.requestQueue;
 }
 async useState(defaultValue = {}) {
 const kvs = await KeyValueStore.open(null, { config: this.config });
 return kvs.getAutoSavedValue(BasicCrawler.CRAWLEE_STATE_KEY, defaultValue);
 }
+get pendingRequestCountApproximation() {
+return this.requestManager?.getPendingCount() ?? 0;
+}
+calculateEnqueuedRequestLimit(explicitLimit) {
+if (this.maxRequestsPerCrawl === undefined) {
+return explicitLimit;
+}
+const limit = Math.max(0, this.maxRequestsPerCrawl - this.handledRequestsCount - this.pendingRequestCountApproximation);
+return Math.min(limit, explicitLimit ?? Infinity);
+}
+async handleSkippedRequest(options) {
+if (options.reason === 'limit' && this.shouldLogMaxEnqueuedRequestsExceeded) {
+this.log.info('The number of requests enqueued by the crawler reached the maxRequestsPerCrawl limit of ' +
+`${this.maxRequestsPerCrawl} requests and no further requests will be added.`);
+this.shouldLogMaxEnqueuedRequestsExceeded = false;
+}
+if (options.reason === 'enqueueLimit') {
+const enqueuedRequestLimit = this.calculateEnqueuedRequestLimit();
+if (enqueuedRequestLimit === undefined || enqueuedRequestLimit !== 0) {
+this.log.info('The number of requests enqueued by the crawler reached the enqueueLinks limit.');
+}
+}
+await this.onSkippedRequest?.(options);
+}
 /**
 * Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
 * adding the rest in background. You can configure the batch size via `batchSize` option and the sleep time in between
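The new `calculateEnqueuedRequestLimit` above derives the remaining enqueue budget from `maxRequestsPerCrawl`, the handled-request count, and the pending-count approximation, then caps it by an explicit limit if one is given. A standalone sketch of the same arithmetic (an illustration, not part of the package's public API):

```ts
// Mirrors the budget computation in `calculateEnqueuedRequestLimit` from this diff.
function calculateEnqueuedRequestLimit(
    maxRequestsPerCrawl: number | undefined,
    handledRequestsCount: number,
    pendingRequestCountApproximation: number,
    explicitLimit?: number,
): number | undefined {
    // No global cap configured: only the explicit limit (if any) applies.
    if (maxRequestsPerCrawl === undefined) return explicitLimit;
    // The remaining budget never goes below zero.
    const remaining = Math.max(0, maxRequestsPerCrawl - handledRequestsCount - pendingRequestCountApproximation);
    return Math.min(remaining, explicitLimit ?? Infinity);
}

// e.g. maxRequestsPerCrawl = 100, 40 handled, 25 pending, enqueueLinks limit 50 -> 35
console.log(calculateEnqueuedRequestLimit(100, 40, 25, 50)); // 35
```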
@@ -530,33 +596,57 @@
 * @param options Options for the request queue
 */
 async addRequests(requests, options = {}) {
-
-
-
-
-const
-const
-
-
-
-
-
-
-
-
+await this.getRequestQueue();
+const requestLimit = this.calculateEnqueuedRequestLimit();
+const skippedBecauseOfRobots = new Set();
+const skippedBecauseOfLimit = new Set();
+const skippedBecauseOfMaxCrawlDepth = new Set();
+const isAllowedBasedOnRobotsTxtFile = this.isAllowedBasedOnRobotsTxtFile.bind(this);
+const maxCrawlDepth = this.maxCrawlDepth;
+ow(requests, ow.object
+.is((value) => isIterable(value) || isAsyncIterable(value))
+.message((value) => `Expected an iterable or async iterable, got ${getObjectType(value)}`));
+async function* filteredRequests() {
+let yieldedRequestCount = 0;
+for await (const request of requests) {
+const url = typeof request === 'string' ? request : request.url;
+if (requestLimit !== undefined && yieldedRequestCount >= requestLimit) {
+skippedBecauseOfLimit.add(url);
+continue;
+}
+if (maxCrawlDepth !== undefined && request.crawlDepth > maxCrawlDepth) {
+skippedBecauseOfMaxCrawlDepth.add(url);
+continue;
+}
+if (await isAllowedBasedOnRobotsTxtFile(url)) {
+yield request;
+yieldedRequestCount += 1;
+}
+else {
+skippedBecauseOfRobots.add(url);
+}
 }
 }
-
+const result = await this.requestManager.addRequestsBatched(filteredRequests(), options);
+if (skippedBecauseOfRobots.size > 0) {
 this.log.warning(`Some requests were skipped because they were disallowed based on the robots.txt file`, {
-skipped: [...
+skipped: [...skippedBecauseOfRobots],
 });
-if (this.onSkippedRequest) {
-await Promise.all([...skipped].map((url) => {
-return this.onSkippedRequest({ url, reason: 'robotsTxt' });
-}));
-}
 }
-
+if (skippedBecauseOfRobots.size > 0 ||
+skippedBecauseOfLimit.size > 0 ||
+skippedBecauseOfMaxCrawlDepth.size > 0) {
+await Promise.all([...skippedBecauseOfRobots]
+.map((url) => {
+return this.handleSkippedRequest({ url, reason: 'robotsTxt' });
+})
+.concat([...skippedBecauseOfLimit].map((url) => {
+return this.handleSkippedRequest({ url, reason: 'limit' });
+}), [...skippedBecauseOfMaxCrawlDepth].map((url) => {
+return this.handleSkippedRequest({ url, reason: 'depth' });
+})));
+}
+return result;
 }
 /**
 * Pushes data to the specified {@link Dataset}, or the default crawler {@link Dataset} by calling {@link Dataset.pushData}.
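Per the rewritten `addRequests` above, the method now accepts any iterable or async iterable of requests and filters them against robots.txt, the remaining `maxRequestsPerCrawl` budget, and `maxCrawlDepth` before handing them to the request manager in batches. A sketch of feeding it a lazy async generator; the URLs and option values are illustrative:

```ts
import { BasicCrawler } from '@crawlee/basic';

// Illustrative stream of requests; only the remaining `maxRequestsPerCrawl` budget is
// enqueued, the rest are reported as skipped with reason 'limit'.
async function* generateRequests() {
    for (let page = 1; page <= 1000; page++) {
        yield { url: `https://example.com/list?page=${page}` };
    }
}

const crawler = new BasicCrawler({
    maxRequestsPerCrawl: 50,
    async requestHandler({ request, log }) {
        log.info(`Crawled ${request.url}`);
    },
});

// Sketch based on this diff: `addRequests` validates that it received an (async) iterable
// and filters it lazily; the exact public typings may differ in the final release.
await crawler.addRequests(generateRequests());
await crawler.run();
```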
@@ -596,7 +686,21 @@
 const dataset = await this.getDataset();
 const items = await dataset.export(options);
 if (format === 'csv') {
-
+let value;
+if (items.length === 0) {
+value = '';
+}
+else {
+const keys = options?.collectAllKeys
+? Array.from(new Set(items.flatMap(Object.keys)))
+: Object.keys(items[0]);
+value = stringify([
+keys,
+...items.map((item) => {
+return keys.map((k) => item[k]);
+}),
+]);
+}
 await ensureDir(dirname(path));
 await writeFile(path, value);
 this.log.info(`Export to ${path} finished!`);
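The CSV branch of `exportData` above now writes an empty file for an empty dataset and, when `collectAllKeys` is set, builds the header from the union of keys across all items instead of from the first item only. A usage sketch; the flag name is read from the diff (`options?.collectAllKeys`) and its public typing is assumed:

```ts
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    async requestHandler({ request, pushData }) {
        // Items with different key sets, to exercise the header union.
        if (request.url.endsWith('/a')) {
            await pushData({ url: request.url, title: 'A' });
        } else {
            await pushData({ url: request.url, statusText: 'OK' });
        }
    },
});

await crawler.run(['https://example.com/a', 'https://example.com/b']);
// Sketch: with `collectAllKeys`, the CSV header covers url, title and statusText.
await crawler.exportData('./storage/results.csv', 'csv', { collectAllKeys: true });
```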
@@ -608,6 +712,9 @@
 }
 return items;
 }
+/**
+* Initializes the crawler.
+*/
 async _init() {
 if (!this.events.isInitialized()) {
 await this.events.init();
@@ -622,6 +729,7 @@
 // Assuming there are not more than 20 browsers running at once;
 this.sessionPool.setMaxListeners(20);
 }
+await this.initializeRequestManager();
 await this._loadHandledRequestCount();
 }
 async runRequestHandler(crawlingContext) {
@@ -643,7 +751,8 @@
 return true;
 }
 const robotsTxtFile = await this.getRobotsTxtFileForUrl(url);
-
+const userAgent = typeof this.respectRobotsTxtFile === 'object' ? this.respectRobotsTxtFile?.userAgent : '*';
+return !robotsTxtFile || robotsTxtFile.isAllowed(url, userAgent);
 }
 async getRobotsTxtFileForUrl(url) {
 if (!this.respectRobotsTxtFile) {
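With the change above, `isAllowedBasedOnRobotsTxtFile` matches robots.txt rules against a specific user agent when `respectRobotsTxtFile` is given as an object, and against `'*'` for the plain boolean form (the option shape was relaxed earlier in this diff to `ow.optional.any(ow.boolean, ow.object)`). A sketch of the object form, assuming the `userAgent` field implied by `respectRobotsTxtFile?.userAgent`:

```ts
import { BasicCrawler } from '@crawlee/basic';

// Sketch based on this diff: the object form of `respectRobotsTxtFile` carries a
// `userAgent` used for robots.txt matching; `true` keeps the previous wildcard behaviour.
const crawler = new BasicCrawler({
    respectRobotsTxtFile: { userAgent: 'MyCrawler' },
    async requestHandler({ request, log }) {
        log.info(`robots.txt allows MyCrawler to fetch ${request.url}`);
    },
});

await crawler.run(['https://example.com']);
```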
@@ -697,30 +806,35 @@
 await Promise.all([requestListPersistPromise, this.stats.persistState()]);
 }
 /**
-*
-* and RequestQueue is present then enqueues it to the queue first.
+* Initializes the RequestManager based on the configured requestList and requestQueue.
 */
-async
-if (
-return
-}
-
-
-
-
-
-
-
+async initializeRequestManager() {
+if (this.requestManager !== undefined) {
+return;
+}
+if (this.requestList && this.requestQueue) {
+// Create a RequestManagerTandem if both RequestList and RequestQueue are provided
+this.requestManager = new RequestManagerTandem(this.requestList, this.requestQueue);
+}
+else if (this.requestQueue) {
+// Use RequestQueue directly if only it is provided
+this.requestManager = this.requestQueue;
+}
+else if (this.requestList) {
+// Use RequestList directly if only it is provided
+// Make it compatible with the IRequestManager interface
+this.requestManager = new RequestListAdapter(this.requestList);
 }
-
-
-
-
-
-
+// If neither RequestList nor RequestQueue is provided, leave the requestManager uninitialized until `getRequestQueue` is called
+}
+/**
+* Fetches the next request to process from the underlying request provider.
+*/
+async _fetchNextRequest() {
+if (this.requestManager === undefined) {
+throw new Error(`_fetchNextRequest called on an uninitialized crawler`);
 }
-
-return this.requestQueue.fetchNextRequest();
+return this.requestManager.fetchNextRequest();
 }
 /**
 * Delays processing of the request based on the `sameDomainDelaySecs` option,
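`initializeRequestManager` above resolves the crawler's request source: with both a `RequestList` and a `RequestQueue` configured it wraps them in a `RequestManagerTandem`, a lone queue is used directly, and a lone list is wrapped in a `RequestListAdapter`. A sketch of the tandem case from the caller's side (standard `RequestList`/`RequestQueue` usage; the wrapping itself stays internal):

```ts
import { BasicCrawler } from '@crawlee/basic';
import { RequestList, RequestQueue } from '@crawlee/core';

// Sketch: providing both sources makes this version consume them through a single
// RequestManagerTandem instead of juggling the list and the queue separately.
const requestList = await RequestList.open('start-urls', ['https://example.com']);
const requestQueue = await RequestQueue.open();

const crawler = new BasicCrawler({
    requestList,
    requestQueue,
    async requestHandler({ request, log }) {
        log.info(`Processing ${request.url}`);
    },
});

await crawler.run();
```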
@@ -759,23 +873,21 @@
 * then retries them in a case of an error, etc.
 */
 async _runTaskFunction() {
-const source = this.
-
-
-await this._timeoutAndRetry(
-request = await this._fetchNextRequest();
-}, this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
+const source = this.requestManager;
+if (!source)
+throw new Error('Request provider is not initialized!');
+const request = await this._timeoutAndRetry(this._fetchNextRequest.bind(this), this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
 tryCancel();
-
-await this._timeoutAndRetry(async () => {
-
+const session = this.useSessionPool
+? await this._timeoutAndRetry(async () => {
+return await this.sessionPool.newSession({
 proxyInfo: await this.proxyConfiguration?.newProxyInfo({
 request: request ?? undefined,
 }),
 maxUsageCount: 1,
 });
-}, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`)
-
+}, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`)
+: undefined;
 tryCancel();
 if (!request || this.delayRequest(request, source)) {
 return;
@@ -785,7 +897,7 @@
 request.state = RequestState.SKIPPED;
 request.noRetry = true;
 await source.markRequestHandled(request);
-await this.
+await this.handleSkippedRequest({
 url: request.url,
 reason: 'robotsTxt',
 });
@@ -803,16 +915,13 @@
 session,
 proxyInfo: session?.proxyInfo,
 enqueueLinks: async (options) => {
-
-
-requestQueue: await this.getRequestQueue(),
-robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
-onSkippedRequest: this.onSkippedRequest,
-...options,
-});
+const requestQueue = await this.getRequestQueue();
+return await this.enqueueLinksWithCrawlDepth(options, request, requestQueue);
 },
-addRequests: async (requests, options) => {
-
+addRequests: async (requests, options = {}) => {
+const newCrawlDepth = request.crawlDepth + 1;
+const requestsGenerator = this.addCrawlDepthRequestGenerator(requests, newCrawlDepth);
+await this.addRequests(requestsGenerator, options);
 },
 pushData: this.pushData.bind(this),
 useState: this.useState.bind(this),
@@ -875,19 +984,60 @@
 }
 }
 /**
-*
+* Wrapper around the crawling context's `enqueueLinks` method:
+* - Injects `crawlDepth` to each request being added based on the crawling context request.
+* - Provides defaults for the `enqueueLinks` options based on the crawler configuration.
+* - These options can be overridden by the user.
+* @internal
+*/
+async enqueueLinksWithCrawlDepth(options, request, requestQueue) {
+const transformRequestFunctionWrapper = (newRequest) => {
+newRequest.crawlDepth = request.crawlDepth + 1;
+if (this.maxCrawlDepth !== undefined && newRequest.crawlDepth > this.maxCrawlDepth) {
+newRequest.skippedReason = 'depth';
+return false;
+}
+// After injecting the crawlDepth, we call the user-provided transform function, if there is one.
+return options.transformRequestFunction?.(newRequest) ?? newRequest;
+};
+return await enqueueLinks({
+requestQueue,
+robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
+onSkippedRequest: this.handleSkippedRequest,
+limit: this.calculateEnqueuedRequestLimit(options.limit),
+// Allow user options to override defaults set above ⤴
+...options,
+transformRequestFunction: transformRequestFunctionWrapper,
+});
+}
+/**
+* Generator function that yields requests injected with the given crawl depth.
+* @internal
+*/
+async *addCrawlDepthRequestGenerator(requests, newRequestDepth) {
+for await (const request of requests) {
+if (typeof request === 'string') {
+yield { url: request, crawlDepth: newRequestDepth };
+}
+else {
+request.crawlDepth ??= newRequestDepth;
+yield request;
+}
+}
+}
+/**
+* Run async callback with given timeout and retry. Returns the result of the callback.
 * @ignore
 */
 async _timeoutAndRetry(handler, timeout, error, maxRetries = 3, retried = 1) {
 try {
-await addTimeoutToPromise(handler, timeout, error);
+return await addTimeoutToPromise(handler, timeout, error);
 }
 catch (e) {
 if (retried <= maxRetries) {
 // we retry on any error, not just timeout
 this.log.warning(`${e.message} (retrying ${retried}/${maxRetries})`);
-
-return;
+return this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1);
 }
 throw e;
 }
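`_timeoutAndRetry` above now returns the wrapped callback's result and re-enters itself on failure instead of returning nothing, which is what lets `_runTaskFunction` receive the fetched request and session directly. A minimal standalone sketch of the pattern using `Promise.race`; the crawler itself relies on its own timeout helper, so this is an illustration only:

```ts
// Generic retry-with-timeout helper that returns the handler's result, mirroring the
// shape of `_timeoutAndRetry` after this change. Simplification: the timeout timer is
// not cleared on success, which a production version would handle.
async function timeoutAndRetry<T>(
    handler: () => Promise<T>,
    timeoutMillis: number,
    errorMessage: string,
    maxRetries = 3,
    retried = 1,
): Promise<T> {
    const timeout = new Promise<never>((_, reject) => {
        setTimeout(() => reject(new Error(errorMessage)), timeoutMillis);
    });
    try {
        return await Promise.race([handler(), timeout]);
    } catch (e) {
        if (retried <= maxRetries) {
            // Retry on any error, not just the timeout, like the original helper.
            return timeoutAndRetry(handler, timeoutMillis, errorMessage, maxRetries, retried + 1);
        }
        throw e;
    }
}
```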
@@ -896,24 +1046,13 @@
 * Returns true if either RequestList or RequestQueue have a request ready for processing.
 */
 async _isTaskReadyFunction() {
-
-const isRequestListEmpty = this.requestList ? await this.requestList.isEmpty() : true;
-// If RequestList is not empty, task is ready, no reason to check RequestQueue.
-if (!isRequestListEmpty)
-return true;
-// If RequestQueue is not empty, task is ready, return true, otherwise false.
-return this.requestQueue ? !(await this.requestQueue.isEmpty()) : false;
+return this.requestManager !== undefined && !(await this.requestManager.isEmpty());
 }
 /**
 * Returns true if both RequestList and RequestQueue have all requests finished.
 */
 async _defaultIsFinishedFunction() {
-
-this.requestList ? this.requestList.isFinished() : true,
-this.requestQueue ? this.requestQueue.isFinished() : true,
-]);
-// If both are finished, return true, otherwise return false.
-return isRequestListFinished && isRequestQueueFinished;
+return !this.requestManager || (await this.requestManager.isFinished());
 }
 async _rotateSession(crawlingContext) {
 const { request } = crawlingContext;
@@ -1037,19 +1176,11 @@
 return request.retryCount < maxRequestRetries;
 }
 /**
-* Updates handledRequestsCount from possibly stored counts,
-* usually after worker migration. Since one of the stores
-* needs to have priority when both are present,
-* it is the request queue, because generally, the request
-* list will first be dumped into the queue and then left
-* empty.
+* Updates handledRequestsCount from possibly stored counts, usually after worker migration.
 */
 async _loadHandledRequestCount() {
-if (this.
-this.handledRequestsCount = await this.
-}
-else if (this.requestList) {
-this.handledRequestsCount = this.requestList.handledCount();
+if (this.requestManager) {
+this.handledRequestsCount = await this.requestManager.handledCount();
 }
 }
 async _executeHooks(hooks, ...args) {
@@ -1060,8 +1191,11 @@
 }
 }
 /**
-*
-*
+* Stops the crawler immediately.
+*
+* This method doesn't wait for currently active requests to finish.
+*
+* To stop the crawler gracefully (waiting for all running requests to finish), use {@link BasicCrawler.stop|`crawler.stop()`} instead.
 */
 async teardown() {
 this.events.emit("persistState" /* EventType.PERSIST_STATE */, { isMigrating: false });