@crawlee/basic 4.0.0-beta.13 → 4.0.0-beta.14

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -1,7 +1,7 @@
  import { writeFile } from 'node:fs/promises';
  import { dirname } from 'node:path';
- import { AutoscaledPool, Configuration, ContextPipeline, ContextPipelineCleanupError, ContextPipelineInitializationError, ContextPipelineInterruptedError, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestHandlerError, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
- import { RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
+ import { AutoscaledPool, Configuration, ContextPipeline, ContextPipelineCleanupError, ContextPipelineInitializationError, ContextPipelineInterruptedError, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestHandlerError, RequestListAdapter, RequestManagerTandem, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
+ import { getObjectType, isAsyncIterable, isIterable, RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
  import { stringify } from 'csv-stringify/sync';
  import { ensureDir, writeJSON } from 'fs-extra/esm';
  import ow from 'ow';
@@ -103,6 +103,10 @@ export class BasicCrawler {
  * Only available if used by the crawler.
  */
  requestQueue;
+ /**
+ * The main request-handling component of the crawler. It's initialized during the crawler startup.
+ */
+ requestManager;
  /**
  * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session|sessions}.
  * Only available if used by the crawler.
@@ -143,10 +147,12 @@ export class BasicCrawler {
  requestHandlerTimeoutMillis;
  internalTimeoutMillis;
  maxRequestRetries;
+ maxCrawlDepth;
  sameDomainDelayMillis;
  domainAccessedTime;
  maxSessionRotations;
- handledRequestsCount;
+ maxRequestsPerCrawl;
+ handledRequestsCount = 0;
  statusMessageLoggingInterval;
  statusMessageCallback;
  sessionPoolOptions;
@@ -158,6 +164,8 @@ export class BasicCrawler {
  respectRobotsTxtFile;
  onSkippedRequest;
  _closeEvents;
+ shouldLogMaxProcessedRequestsExceeded = true;
+ shouldLogMaxEnqueuedRequestsExceeded = true;
  experiments;
  robotsTxtFileCache;
  _experimentWarnings = {};
@@ -177,6 +185,7 @@ export class BasicCrawler {
  sameDomainDelaySecs: ow.optional.number,
  maxSessionRotations: ow.optional.number,
  maxRequestsPerCrawl: ow.optional.number,
+ maxCrawlDepth: ow.optional.number,
  autoscaledPoolOptions: ow.optional.object,
  sessionPoolOptions: ow.optional.object,
  useSessionPool: ow.optional.boolean,
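The new `maxCrawlDepth` option sits next to `maxRequestsPerCrawl` in the options shape above. A minimal usage sketch in plain JavaScript; the option names come straight from this diff, while the handler body and URLs are illustrative only:

    import { BasicCrawler } from '@crawlee/basic';

    const crawler = new BasicCrawler({
        // Existing option: stop once this many requests have been processed.
        maxRequestsPerCrawl: 100,
        // New option: skip any request whose crawlDepth would exceed this value
        // (the initial requests presumably start at depth 0).
        maxCrawlDepth: 3,
        async requestHandler({ request, enqueueLinks, log }) {
            log.info(`Processing ${request.url} at depth ${request.crawlDepth ?? 0}`);
            // BasicCrawler does not parse pages, so links are passed explicitly;
            // each one is enqueued with crawlDepth = request.crawlDepth + 1.
            await enqueueLinks({ urls: ['https://example.com/next'] });
        },
    });

    await crawler.run(['https://example.com']);
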
@@ -184,7 +193,7 @@ export class BasicCrawler {
  statusMessageLoggingInterval: ow.optional.number,
  statusMessageCallback: ow.optional.function,
  retryOnBlocked: ow.optional.boolean,
- respectRobotsTxtFile: ow.optional.boolean,
+ respectRobotsTxtFile: ow.optional.any(ow.boolean, ow.object),
  onSkippedRequest: ow.optional.function,
  httpClient: ow.optional.object,
  // AutoscaledPool shorthands
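`respectRobotsTxtFile` now passes validation either as a boolean or as an object. Judging by the `userAgent` lookup further down in this diff (in `isAllowedBasedOnRobotsTxtFile`), the object form selects which user agent the robots.txt rules are matched against; treat the exact shape as an assumption based on that code:

    import { BasicCrawler } from '@crawlee/basic';

    const crawler = new BasicCrawler({
        // Boolean form (unchanged): rules are matched against the wildcard ('*') agent.
        // respectRobotsTxtFile: true,

        // Object form (new): match the rules for a specific user agent instead.
        respectRobotsTxtFile: { userAgent: 'Googlebot' },
        async requestHandler({ request, log }) {
            log.info(`Allowed by robots.txt: ${request.url}`);
        },
    });
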
@@ -204,7 +213,7 @@ export class BasicCrawler {
  config = Configuration.getGlobalConfig()) {
  this.config = config;
  ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape));
- const { requestList, requestQueue, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true, proxyConfiguration,
+ const { requestList, requestQueue, requestManager, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, maxCrawlDepth, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true, proxyConfiguration,
  // AutoscaledPool shorthands
  minConcurrency, maxConcurrency, maxRequestsPerMinute, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
  // internal
@@ -228,6 +237,7 @@ export class BasicCrawler {
  this.log.debug(message);
  request.noRetry = true;
  request.state = RequestState.SKIPPED;
+ await this.handleSkippedRequest({ url: request.url, reason: 'redirect' });
  throw new ContextPipelineInterruptedError(message);
  }
  return context;
@@ -235,8 +245,17 @@ export class BasicCrawler {
  });
  return contextPipeline;
  };
- this.requestList = requestList;
- this.requestQueue = requestQueue;
+ if (requestManager !== undefined) {
+ if (requestList !== undefined || requestQueue !== undefined) {
+ throw new Error('The `requestManager` option cannot be used in conjunction with `requestList` and/or `requestQueue`');
+ }
+ this.requestManager = requestManager;
+ this.requestQueue = requestManager; // TODO(v4) - the cast is not fully legitimate here, but it's fine for internal usage by the BasicCrawler
+ }
+ else {
+ this.requestList = requestList;
+ this.requestQueue = requestQueue;
+ }
  this.httpClient = httpClient ?? new GotScrapingHttpClient();
  this.proxyConfiguration = proxyConfiguration;
  this.log = log;
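The constructor now accepts a `requestManager` option that replaces the `requestList`/`requestQueue` pair (combining them throws, as shown above); when the two older options are given instead, the crawler wraps them in a `RequestManagerTandem` during initialization (see `initializeRequestManager` further below). A sketch of the new option, assuming a `RequestQueue` can be passed directly as the manager, which the internal `this.requestQueue = requestManager` assignment suggests:

    import { BasicCrawler } from '@crawlee/basic';
    import { RequestQueue } from '@crawlee/core';

    const queue = await RequestQueue.open('my-named-queue');

    const crawler = new BasicCrawler({
        // Single source of requests; do not combine with `requestList`/`requestQueue`.
        requestManager: queue,
        async requestHandler({ request, log }) {
            log.info(`Processing ${request.url}`);
        },
    });

    await crawler.run(['https://example.com']);
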
@@ -246,6 +265,7 @@ export class BasicCrawler {
  this.domainAccessedTime = new Map();
  this.experiments = experiments;
  this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });
+ this.handleSkippedRequest = this.handleSkippedRequest.bind(this);
  this.requestHandler = requestHandler ?? this.router;
  this.failedRequestHandler = failedRequestHandler;
  this.errorHandler = errorHandler;
@@ -270,9 +290,9 @@ export class BasicCrawler {
  this.requestQueue.requestLockSecs = Math.max(this.requestHandlerTimeoutMillis / 1000 + 5, 60);
  }
  this.maxRequestRetries = maxRequestRetries;
+ this.maxCrawlDepth = maxCrawlDepth;
  this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
  this.maxSessionRotations = maxSessionRotations;
- this.handledRequestsCount = 0;
  this.stats = new Statistics({
  logMessage: `${log.getOptions().prefix} request statistics:`,
  log,
@@ -297,8 +317,8 @@ export class BasicCrawler {
  this.requestHandlerTimeoutMillis = maxSignedInteger;
  }
  this.internalTimeoutMillis = Math.min(this.internalTimeoutMillis, maxSignedInteger);
- let shouldLogMaxPagesExceeded = true;
- const isMaxPagesExceeded = () => maxRequestsPerCrawl && maxRequestsPerCrawl <= this.handledRequestsCount;
+ this.maxRequestsPerCrawl = maxRequestsPerCrawl;
+ const isMaxPagesExceeded = () => this.maxRequestsPerCrawl && this.maxRequestsPerCrawl <= this.handledRequestsCount;
  // eslint-disable-next-line prefer-const
  let { isFinishedFunction, isTaskReadyFunction } = autoscaledPoolOptions;
  // override even if `isFinishedFunction` provided by user - `keepAlive` has higher priority
@@ -312,10 +332,10 @@ export class BasicCrawler {
  runTaskFunction: this._runTaskFunction.bind(this),
  isTaskReadyFunction: async () => {
  if (isMaxPagesExceeded()) {
- if (shouldLogMaxPagesExceeded) {
+ if (this.shouldLogMaxProcessedRequestsExceeded) {
  log.info('Crawler reached the maxRequestsPerCrawl limit of ' +
- `${maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
- shouldLogMaxPagesExceeded = false;
+ `${this.maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
+ this.shouldLogMaxProcessedRequestsExceeded = false;
  }
  return false;
  }
@@ -323,7 +343,7 @@ export class BasicCrawler {
  },
  isFinishedFunction: async () => {
  if (isMaxPagesExceeded()) {
- log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${maxRequestsPerCrawl} requests ` +
+ log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${this.maxRequestsPerCrawl} requests ` +
  'and all requests that were in progress at that time have now finished. ' +
  `In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`);
  return true;
@@ -383,7 +403,7 @@ export class BasicCrawler {
  message = `Experiencing problems, ${this.stats.state.requestsFailed - previousState.requestsFailed || this.stats.state.requestsFailed} failed requests in the past ${this.statusMessageLoggingInterval} seconds.`;
  }
  else {
- const total = this.requestQueue?.getTotalCount() || this.requestList?.length();
+ const total = this.requestManager?.getTotalCount();
  message = `Crawled ${this.stats.state.requestsFinished}${total ? `/${total}` : ''} pages, ${this.stats.state.requestsFailed} failed requests, desired concurrency ${this.autoscaledPool?.desiredConcurrency ?? 0}.`;
  }
  if (this.statusMessageCallback) {
@@ -423,20 +443,30 @@ export class BasicCrawler {
  if (this.requestQueue?.name === 'default' && purgeRequestQueue) {
  await this.requestQueue.drop();
  this.requestQueue = await this._getRequestQueue();
+ this.requestManager = undefined;
+ await this.initializeRequestManager();
+ this.handledRequestsCount = 0; // This would've been reset by this._init() further down below, but at that point `handledRequestsCount` could prevent `addRequests` from adding the initial requests
  }
  this.stats.reset();
  await this.stats.resetStore();
  await this.sessionPool?.resetStore();
  }
  this.running = true;
- await purgeDefaultStorages({ onlyPurgeOnce: true });
+ this.shouldLogMaxProcessedRequestsExceeded = true;
+ this.shouldLogMaxEnqueuedRequestsExceeded = true;
+ await purgeDefaultStorages({
+ onlyPurgeOnce: true,
+ client: this.config.getStorageClient(),
+ config: this.config,
+ });
  if (requests) {
  await this.addRequests(requests, addRequestsOptions);
  }
  await this._init();
  await this.stats.startCapturing();
  const periodicLogger = this.getPeriodicLogger();
- await this.setStatusMessage('Starting the crawler.', { level: 'INFO' });
+ // Don't await, we don't want to block the execution
+ void this.setStatusMessage('Starting the crawler.', { level: 'INFO' });
  const sigintHandler = async () => {
  this.log.warning('Pausing... Press CTRL+C again to force exit. To resume, do: CRAWLEE_PURGE_ON_START=0 npm start');
  await this._pauseOnMigration();
@@ -485,7 +515,8 @@ export class BasicCrawler {
  finished = true;
  }
  periodicLogger.stop();
- await this.setStatusMessage(`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${this.stats.state.requestsFinished} succeeded, ${this.stats.state.requestsFailed} failed.`, { isStatusMessageTerminal: true, level: 'INFO' });
+ // Don't await, we don't want to block the execution
+ void this.setStatusMessage(`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${this.stats.state.requestsFinished} succeeded, ${this.stats.state.requestsFailed} failed.`, { isStatusMessageTerminal: true, level: 'INFO' });
  this.running = false;
  this.hasFinishedBefore = true;
  }
@@ -495,6 +526,8 @@ export class BasicCrawler {
  * Gracefully stops the current run of the crawler.
  *
  * All the tasks active at the time of calling this method will be allowed to finish.
+ *
+ * To stop the crawler immediately, use {@link BasicCrawler.teardown|`crawler.teardown()`} instead.
  */
  stop(message = 'The crawler has been gracefully stopped.') {
  // Gracefully starve the this.autoscaledPool, so it doesn't start new tasks. Resolves once the pool is cleared.
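The updated doc comments distinguish the two ways of ending a run: `stop()` lets in-flight requests finish, while `teardown()` (documented at the end of this diff) aborts immediately. An illustrative sketch, with the time budget and URLs made up for the example:

    import { BasicCrawler } from '@crawlee/basic';

    const crawler = new BasicCrawler({
        async requestHandler({ request, log }) {
            log.info(`Processing ${request.url}`);
        },
    });

    // Gracefully stop after one minute: requests already in progress are allowed to finish.
    const timer = setTimeout(() => crawler.stop('Time budget exhausted.'), 60_000);

    await crawler.run(['https://example.com']);
    clearTimeout(timer);

    // Or, to abort immediately without waiting for in-flight requests:
    // await crawler.teardown();
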
@@ -511,13 +544,46 @@ export class BasicCrawler {
  if (!this.requestQueue && this.requestList) {
  this.log.warningOnce('When using RequestList and RequestQueue at the same time, you should instantiate both explicitly and provide them in the crawler options, to ensure correctly handled restarts of the crawler.');
  }
- this.requestQueue ??= await this._getRequestQueue();
+ if (!this.requestQueue) {
+ this.requestQueue = await this._getRequestQueue();
+ this.requestManager = undefined;
+ }
+ if (!this.requestManager) {
+ this.requestManager =
+ this.requestList === undefined
+ ? this.requestQueue
+ : new RequestManagerTandem(this.requestList, this.requestQueue);
+ }
  return this.requestQueue;
  }
  async useState(defaultValue = {}) {
  const kvs = await KeyValueStore.open(null, { config: this.config });
  return kvs.getAutoSavedValue(BasicCrawler.CRAWLEE_STATE_KEY, defaultValue);
  }
+ get pendingRequestCountApproximation() {
+ return this.requestManager?.getPendingCount() ?? 0;
+ }
+ calculateEnqueuedRequestLimit(explicitLimit) {
+ if (this.maxRequestsPerCrawl === undefined) {
+ return explicitLimit;
+ }
+ const limit = Math.max(0, this.maxRequestsPerCrawl - this.handledRequestsCount - this.pendingRequestCountApproximation);
+ return Math.min(limit, explicitLimit ?? Infinity);
+ }
+ async handleSkippedRequest(options) {
+ if (options.reason === 'limit' && this.shouldLogMaxEnqueuedRequestsExceeded) {
+ this.log.info('The number of requests enqueued by the crawler reached the maxRequestsPerCrawl limit of ' +
+ `${this.maxRequestsPerCrawl} requests and no further requests will be added.`);
+ this.shouldLogMaxEnqueuedRequestsExceeded = false;
+ }
+ if (options.reason === 'enqueueLimit') {
+ const enqueuedRequestLimit = this.calculateEnqueuedRequestLimit();
+ if (enqueuedRequestLimit === undefined || enqueuedRequestLimit !== 0) {
+ this.log.info('The number of requests enqueued by the crawler reached the enqueueLinks limit.');
+ }
+ }
+ await this.onSkippedRequest?.(options);
+ }
  /**
  * Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
  * adding the rest in background. You can configure the batch size via `batchSize` option and the sleep time in between
@@ -530,33 +596,57 @@ export class BasicCrawler {
  * @param options Options for the request queue
  */
  async addRequests(requests, options = {}) {
- const requestQueue = await this.getRequestQueue();
- if (!this.respectRobotsTxtFile) {
- return requestQueue.addRequestsBatched(requests, options);
- }
- const allowedRequests = [];
- const skipped = new Set();
- for (const request of requests) {
- const url = typeof request === 'string' ? request : request.url;
- if (await this.isAllowedBasedOnRobotsTxtFile(url)) {
- allowedRequests.push(request);
- }
- else {
- skipped.add(url);
- await this.onSkippedRequest?.({ url, reason: 'robotsTxt' });
+ await this.getRequestQueue();
+ const requestLimit = this.calculateEnqueuedRequestLimit();
+ const skippedBecauseOfRobots = new Set();
+ const skippedBecauseOfLimit = new Set();
+ const skippedBecauseOfMaxCrawlDepth = new Set();
+ const isAllowedBasedOnRobotsTxtFile = this.isAllowedBasedOnRobotsTxtFile.bind(this);
+ const maxCrawlDepth = this.maxCrawlDepth;
+ ow(requests, ow.object
+ .is((value) => isIterable(value) || isAsyncIterable(value))
+ .message((value) => `Expected an iterable or async iterable, got ${getObjectType(value)}`));
+ async function* filteredRequests() {
+ let yieldedRequestCount = 0;
+ for await (const request of requests) {
+ const url = typeof request === 'string' ? request : request.url;
+ if (requestLimit !== undefined && yieldedRequestCount >= requestLimit) {
+ skippedBecauseOfLimit.add(url);
+ continue;
+ }
+ if (maxCrawlDepth !== undefined && request.crawlDepth > maxCrawlDepth) {
+ skippedBecauseOfMaxCrawlDepth.add(url);
+ continue;
+ }
+ if (await isAllowedBasedOnRobotsTxtFile(url)) {
+ yield request;
+ yieldedRequestCount += 1;
+ }
+ else {
+ skippedBecauseOfRobots.add(url);
+ }
  }
  }
- if (skipped.size > 0) {
+ const result = await this.requestManager.addRequestsBatched(filteredRequests(), options);
+ if (skippedBecauseOfRobots.size > 0) {
  this.log.warning(`Some requests were skipped because they were disallowed based on the robots.txt file`, {
- skipped: [...skipped],
+ skipped: [...skippedBecauseOfRobots],
  });
- if (this.onSkippedRequest) {
- await Promise.all([...skipped].map((url) => {
- return this.onSkippedRequest({ url, reason: 'robotsTxt' });
- }));
- }
  }
- return requestQueue.addRequestsBatched(allowedRequests, options);
+ if (skippedBecauseOfRobots.size > 0 ||
+ skippedBecauseOfLimit.size > 0 ||
+ skippedBecauseOfMaxCrawlDepth.size > 0) {
+ await Promise.all([...skippedBecauseOfRobots]
+ .map((url) => {
+ return this.handleSkippedRequest({ url, reason: 'robotsTxt' });
+ })
+ .concat([...skippedBecauseOfLimit].map((url) => {
+ return this.handleSkippedRequest({ url, reason: 'limit' });
+ }), [...skippedBecauseOfMaxCrawlDepth].map((url) => {
+ return this.handleSkippedRequest({ url, reason: 'depth' });
+ })));
+ }
+ return result;
  }
  /**
  * Pushes data to the specified {@link Dataset}, or the default crawler {@link Dataset} by calling {@link Dataset.pushData}.
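As rewritten above, `addRequests` validates its input with `ow` and consumes any iterable or async iterable, filtering it lazily against robots.txt, `maxCrawlDepth`, and the remaining `maxRequestsPerCrawl` budget (`calculateEnqueuedRequestLimit` caps it at `maxRequestsPerCrawl - handled - pending`). A sketch of feeding it from an async generator; the crawler setup and URLs are placeholders:

    import { BasicCrawler } from '@crawlee/basic';

    const crawler = new BasicCrawler({
        maxRequestsPerCrawl: 100,
        async requestHandler({ request, log }) {
            log.info(`Processing ${request.url}`);
        },
    });

    // Hypothetical source of start URLs, e.g. read page by page from a database or an API.
    async function* generateStartUrls() {
        for (let page = 1; page <= 5; page++) {
            yield `https://example.com/listing?page=${page}`;
        }
    }

    // The generator is consumed lazily and only as far as the remaining enqueue budget allows;
    // robots.txt-disallowed and over-limit URLs are reported through the skipped-request handling.
    await crawler.addRequests(generateStartUrls());
    await crawler.run();
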
@@ -596,7 +686,21 @@ export class BasicCrawler {
  const dataset = await this.getDataset();
  const items = await dataset.export(options);
  if (format === 'csv') {
- const value = stringify([Object.keys(items[0]), ...items.map((item) => Object.values(item))]);
+ let value;
+ if (items.length === 0) {
+ value = '';
+ }
+ else {
+ const keys = options?.collectAllKeys
+ ? Array.from(new Set(items.flatMap(Object.keys)))
+ : Object.keys(items[0]);
+ value = stringify([
+ keys,
+ ...items.map((item) => {
+ return keys.map((k) => item[k]);
+ }),
+ ]);
+ }
  await ensureDir(dirname(path));
  await writeFile(path, value);
  this.log.info(`Export to ${path} finished!`);
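The CSV branch of the export above no longer crashes on an empty dataset and can optionally build the header row from the union of keys across all items instead of only the first one. A sketch, assuming `exportData` keeps its existing `(path, format, options)` shape and that `collectAllKeys` is passed through those export options, as the `options?.collectAllKeys` read suggests:

    import { BasicCrawler } from '@crawlee/basic';

    const crawler = new BasicCrawler({
        async requestHandler({ request, pushData }) {
            // Items may have different shapes; that is what `collectAllKeys` is for.
            await pushData({ url: request.url, loadedAt: new Date().toISOString() });
        },
    });

    await crawler.run(['https://example.com']);

    // Default: the CSV header is built from the keys of the first item only.
    await crawler.exportData('./storage/results.csv', 'csv');

    // With collectAllKeys, the header is the union of keys across all items,
    // so items missing a field simply get an empty cell.
    await crawler.exportData('./storage/results.csv', 'csv', { collectAllKeys: true });
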
@@ -608,6 +712,9 @@ export class BasicCrawler {
  }
  return items;
  }
+ /**
+ * Initializes the crawler.
+ */
  async _init() {
  if (!this.events.isInitialized()) {
  await this.events.init();
@@ -622,6 +729,7 @@ export class BasicCrawler {
  // Assuming there are not more than 20 browsers running at once;
  this.sessionPool.setMaxListeners(20);
  }
+ await this.initializeRequestManager();
  await this._loadHandledRequestCount();
  }
  async runRequestHandler(crawlingContext) {
@@ -643,7 +751,8 @@ export class BasicCrawler {
  return true;
  }
  const robotsTxtFile = await this.getRobotsTxtFileForUrl(url);
- return !robotsTxtFile || robotsTxtFile.isAllowed(url);
+ const userAgent = typeof this.respectRobotsTxtFile === 'object' ? this.respectRobotsTxtFile?.userAgent : '*';
+ return !robotsTxtFile || robotsTxtFile.isAllowed(url, userAgent);
  }
  async getRobotsTxtFileForUrl(url) {
  if (!this.respectRobotsTxtFile) {
@@ -697,30 +806,35 @@ export class BasicCrawler {
  await Promise.all([requestListPersistPromise, this.stats.persistState()]);
  }
  /**
- * Fetches request from either RequestList or RequestQueue. If request comes from a RequestList
- * and RequestQueue is present then enqueues it to the queue first.
+ * Initializes the RequestManager based on the configured requestList and requestQueue.
  */
- async _fetchNextRequest() {
- if (!this.requestList || (await this.requestList.isFinished())) {
- return this.requestQueue?.fetchNextRequest();
- }
- const request = await this.requestList.fetchNextRequest();
- if (!this.requestQueue)
- return request;
- if (!request)
- return this.requestQueue.fetchNextRequest();
- try {
- await this.requestQueue.addRequest(request, { forefront: true });
+ async initializeRequestManager() {
+ if (this.requestManager !== undefined) {
+ return;
+ }
+ if (this.requestList && this.requestQueue) {
+ // Create a RequestManagerTandem if both RequestList and RequestQueue are provided
+ this.requestManager = new RequestManagerTandem(this.requestList, this.requestQueue);
+ }
+ else if (this.requestQueue) {
+ // Use RequestQueue directly if only it is provided
+ this.requestManager = this.requestQueue;
+ }
+ else if (this.requestList) {
+ // Use RequestList directly if only it is provided
+ // Make it compatible with the IRequestManager interface
+ this.requestManager = new RequestListAdapter(this.requestList);
  }
- catch (err) {
- // If requestQueue.addRequest() fails here then we must reclaim it back to
- // the RequestList because probably it's not yet in the queue!
- this.log.error('Adding of request from the RequestList to the RequestQueue failed, reclaiming request back to the list.', { request });
- await this.requestList.reclaimRequest(request);
- return null;
+ // If neither RequestList nor RequestQueue is provided, leave the requestManager uninitialized until `getRequestQueue` is called
+ }
+ /**
+ * Fetches the next request to process from the underlying request provider.
+ */
+ async _fetchNextRequest() {
+ if (this.requestManager === undefined) {
+ throw new Error(`_fetchNextRequest called on an uninitialized crawler`);
  }
- await this.requestList.markRequestHandled(request);
- return this.requestQueue.fetchNextRequest();
+ return this.requestManager.fetchNextRequest();
  }
  /**
  * Delays processing of the request based on the `sameDomainDelaySecs` option,
@@ -759,23 +873,21 @@ export class BasicCrawler {
  * then retries them in a case of an error, etc.
  */
  async _runTaskFunction() {
- const source = this.requestQueue || this.requestList || (await this.getRequestQueue());
- let request;
- let session;
- await this._timeoutAndRetry(async () => {
- request = await this._fetchNextRequest();
- }, this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
+ const source = this.requestManager;
+ if (!source)
+ throw new Error('Request provider is not initialized!');
+ const request = await this._timeoutAndRetry(this._fetchNextRequest.bind(this), this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
  tryCancel();
- if (this.useSessionPool) {
- await this._timeoutAndRetry(async () => {
- session = await this.sessionPool.newSession({
+ const session = this.useSessionPool
+ ? await this._timeoutAndRetry(async () => {
+ return await this.sessionPool.newSession({
  proxyInfo: await this.proxyConfiguration?.newProxyInfo({
  request: request ?? undefined,
  }),
  maxUsageCount: 1,
  });
- }, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
- }
+ }, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`)
+ : undefined;
  tryCancel();
  if (!request || this.delayRequest(request, source)) {
  return;
@@ -785,7 +897,7 @@ export class BasicCrawler {
  request.state = RequestState.SKIPPED;
  request.noRetry = true;
  await source.markRequestHandled(request);
- await this.onSkippedRequest?.({
+ await this.handleSkippedRequest({
  url: request.url,
  reason: 'robotsTxt',
  });
@@ -803,16 +915,13 @@ export class BasicCrawler {
  session,
  proxyInfo: session?.proxyInfo,
  enqueueLinks: async (options) => {
- return await enqueueLinks({
- // specify the RQ first to allow overriding it
- requestQueue: await this.getRequestQueue(),
- robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
- onSkippedRequest: this.onSkippedRequest,
- ...options,
- });
+ const requestQueue = await this.getRequestQueue();
+ return await this.enqueueLinksWithCrawlDepth(options, request, requestQueue);
  },
- addRequests: async (requests, options) => {
- await this.addRequests(requests, options);
+ addRequests: async (requests, options = {}) => {
+ const newCrawlDepth = request.crawlDepth + 1;
+ const requestsGenerator = this.addCrawlDepthRequestGenerator(requests, newCrawlDepth);
+ await this.addRequests(requestsGenerator, options);
  },
  pushData: this.pushData.bind(this),
  useState: this.useState.bind(this),
@@ -875,19 +984,60 @@ export class BasicCrawler {
  }
  }
  /**
- * Run async callback with given timeout and retry.
+ * Wrapper around the crawling context's `enqueueLinks` method:
+ * - Injects `crawlDepth` to each request being added based on the crawling context request.
+ * - Provides defaults for the `enqueueLinks` options based on the crawler configuration.
+ * - These options can be overridden by the user.
+ * @internal
+ */
+ async enqueueLinksWithCrawlDepth(options, request, requestQueue) {
+ const transformRequestFunctionWrapper = (newRequest) => {
+ newRequest.crawlDepth = request.crawlDepth + 1;
+ if (this.maxCrawlDepth !== undefined && newRequest.crawlDepth > this.maxCrawlDepth) {
+ newRequest.skippedReason = 'depth';
+ return false;
+ }
+ // After injecting the crawlDepth, we call the user-provided transform function, if there is one.
+ return options.transformRequestFunction?.(newRequest) ?? newRequest;
+ };
+ return await enqueueLinks({
+ requestQueue,
+ robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
+ onSkippedRequest: this.handleSkippedRequest,
+ limit: this.calculateEnqueuedRequestLimit(options.limit),
+ // Allow user options to override defaults set above ⤴
+ ...options,
+ transformRequestFunction: transformRequestFunctionWrapper,
+ });
+ }
+ /**
+ * Generator function that yields requests injected with the given crawl depth.
+ * @internal
+ */
+ async *addCrawlDepthRequestGenerator(requests, newRequestDepth) {
+ for await (const request of requests) {
+ if (typeof request === 'string') {
+ yield { url: request, crawlDepth: newRequestDepth };
+ }
+ else {
+ request.crawlDepth ??= newRequestDepth;
+ yield request;
+ }
+ }
+ }
+ /**
+ * Run async callback with given timeout and retry. Returns the result of the callback.
  * @ignore
  */
  async _timeoutAndRetry(handler, timeout, error, maxRetries = 3, retried = 1) {
  try {
- await addTimeoutToPromise(handler, timeout, error);
+ return await addTimeoutToPromise(handler, timeout, error);
  }
  catch (e) {
  if (retried <= maxRetries) {
  // we retry on any error, not just timeout
  this.log.warning(`${e.message} (retrying ${retried}/${maxRetries})`);
- void this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1);
- return;
+ return this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1);
  }
  throw e;
  }
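The wrapper above also wires the crawler-level skip handling (`handleSkippedRequest`) into `enqueueLinks`, so the `onSkippedRequest` callback from the crawler options now fires for every skipped URL with a reason such as 'robotsTxt', 'limit', 'depth' or 'redirect', as seen throughout this diff. A sketch of observing skipped requests; the exact callback payload beyond `url` and `reason` is an assumption:

    import { BasicCrawler } from '@crawlee/basic';

    const crawler = new BasicCrawler({
        maxRequestsPerCrawl: 50,
        maxCrawlDepth: 2,
        respectRobotsTxtFile: true,
        // Called for every request the crawler decides not to process or enqueue.
        async onSkippedRequest({ url, reason }) {
            // Reason values appearing in this diff: 'robotsTxt', 'limit', 'depth', 'redirect', 'enqueueLimit'.
            console.warn(`Skipped ${url} (${reason})`);
        },
        async requestHandler({ request, enqueueLinks }) {
            await enqueueLinks({ urls: ['https://example.com/next'] });
        },
    });

    await crawler.run(['https://example.com']);
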
@@ -896,24 +1046,13 @@ export class BasicCrawler {
  * Returns true if either RequestList or RequestQueue have a request ready for processing.
  */
  async _isTaskReadyFunction() {
- // First check RequestList, since it's only in memory.
- const isRequestListEmpty = this.requestList ? await this.requestList.isEmpty() : true;
- // If RequestList is not empty, task is ready, no reason to check RequestQueue.
- if (!isRequestListEmpty)
- return true;
- // If RequestQueue is not empty, task is ready, return true, otherwise false.
- return this.requestQueue ? !(await this.requestQueue.isEmpty()) : false;
+ return this.requestManager !== undefined && !(await this.requestManager.isEmpty());
  }
  /**
  * Returns true if both RequestList and RequestQueue have all requests finished.
  */
  async _defaultIsFinishedFunction() {
- const [isRequestListFinished, isRequestQueueFinished] = await Promise.all([
- this.requestList ? this.requestList.isFinished() : true,
- this.requestQueue ? this.requestQueue.isFinished() : true,
- ]);
- // If both are finished, return true, otherwise return false.
- return isRequestListFinished && isRequestQueueFinished;
+ return !this.requestManager || (await this.requestManager.isFinished());
  }
  async _rotateSession(crawlingContext) {
  const { request } = crawlingContext;
@@ -1037,19 +1176,11 @@ export class BasicCrawler {
  return request.retryCount < maxRequestRetries;
  }
  /**
- * Updates handledRequestsCount from possibly stored counts,
- * usually after worker migration. Since one of the stores
- * needs to have priority when both are present,
- * it is the request queue, because generally, the request
- * list will first be dumped into the queue and then left
- * empty.
+ * Updates handledRequestsCount from possibly stored counts, usually after worker migration.
  */
  async _loadHandledRequestCount() {
- if (this.requestQueue) {
- this.handledRequestsCount = await this.requestQueue.handledCount();
- }
- else if (this.requestList) {
- this.handledRequestsCount = this.requestList.handledCount();
+ if (this.requestManager) {
+ this.handledRequestsCount = await this.requestManager.handledCount();
  }
  }
  async _executeHooks(hooks, ...args) {
@@ -1060,8 +1191,11 @@ export class BasicCrawler {
  }
  }
  /**
- * Function for cleaning up after all request are processed.
- * @ignore
+ * Stops the crawler immediately.
+ *
+ * This method doesn't wait for currently active requests to finish.
+ *
+ * To stop the crawler gracefully (waiting for all running requests to finish), use {@link BasicCrawler.stop|`crawler.stop()`} instead.
  */
  async teardown() {
  this.events.emit("persistState" /* EventType.PERSIST_STATE */, { isMigrating: false });