@crawlee/basic 4.0.0-beta.12 → 4.0.0-beta.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/index.d.ts +1 -0
- package/index.d.ts.map +1 -1
- package/internals/basic-crawler.d.ts +84 -28
- package/internals/basic-crawler.d.ts.map +1 -1
- package/internals/basic-crawler.js +249 -115
- package/internals/basic-crawler.js.map +1 -1
- package/internals/send-request.d.ts +1 -3
- package/internals/send-request.d.ts.map +1 -1
- package/internals/send-request.js +2 -4
- package/internals/send-request.js.map +1 -1
- package/package.json +5 -5
- package/tsconfig.build.tsbuildinfo +0 -1
@@ -1,7 +1,7 @@
 import { writeFile } from 'node:fs/promises';
 import { dirname } from 'node:path';
-import { AutoscaledPool, Configuration, ContextPipeline, ContextPipelineCleanupError, ContextPipelineInitializationError, ContextPipelineInterruptedError, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestHandlerError, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
-import { RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
+import { AutoscaledPool, Configuration, ContextPipeline, ContextPipelineCleanupError, ContextPipelineInitializationError, ContextPipelineInterruptedError, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestHandlerError, RequestListAdapter, RequestManagerTandem, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
+import { getObjectType, isAsyncIterable, isIterable, RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
 import { stringify } from 'csv-stringify/sync';
 import { ensureDir, writeJSON } from 'fs-extra/esm';
 import ow from 'ow';
@@ -103,6 +103,10 @@ export class BasicCrawler {
 * Only available if used by the crawler.
 */
 requestQueue;
+/**
+* The main request-handling component of the crawler. It's initialized during the crawler startup.
+*/
+requestManager;
 /**
 * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session|sessions}.
 * Only available if used by the crawler.
@@ -143,10 +147,12 @@
 requestHandlerTimeoutMillis;
 internalTimeoutMillis;
 maxRequestRetries;
+maxCrawlDepth;
 sameDomainDelayMillis;
 domainAccessedTime;
 maxSessionRotations;
-
+maxRequestsPerCrawl;
+handledRequestsCount = 0;
 statusMessageLoggingInterval;
 statusMessageCallback;
 sessionPoolOptions;
@@ -158,6 +164,8 @@
 respectRobotsTxtFile;
 onSkippedRequest;
 _closeEvents;
+shouldLogMaxProcessedRequestsExceeded = true;
+shouldLogMaxEnqueuedRequestsExceeded = true;
 experiments;
 robotsTxtFileCache;
 _experimentWarnings = {};
@@ -177,6 +185,7 @@
 sameDomainDelaySecs: ow.optional.number,
 maxSessionRotations: ow.optional.number,
 maxRequestsPerCrawl: ow.optional.number,
+maxCrawlDepth: ow.optional.number,
 autoscaledPoolOptions: ow.optional.object,
 sessionPoolOptions: ow.optional.object,
 useSessionPool: ow.optional.boolean,
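The hunk above registers a new `maxCrawlDepth` option in the validation shape; later hunks read it in `addRequests` and `enqueueLinks` and skip requests whose `crawlDepth` exceeds it (skip reason `'depth'`). A minimal usage sketch, assuming the option and the `request.crawlDepth` field are exposed publicly the way this diff suggests:

```ts
import { BasicCrawler } from '@crawlee/basic';

// Sketch only: `maxCrawlDepth` and `request.crawlDepth` are taken from this diff;
// the exact public typings may differ in the final 4.0 release.
const crawler = new BasicCrawler({
    maxCrawlDepth: 2, // requests deeper than this are skipped with reason 'depth'
    async requestHandler({ request, log }) {
        log.info(`Processing ${request.url} at depth ${request.crawlDepth ?? 0}`);
    },
});

await crawler.run(['https://example.com']);
```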
@@ -184,7 +193,7 @@
 statusMessageLoggingInterval: ow.optional.number,
 statusMessageCallback: ow.optional.function,
 retryOnBlocked: ow.optional.boolean,
-respectRobotsTxtFile: ow.optional.boolean,
+respectRobotsTxtFile: ow.optional.any(ow.boolean, ow.object),
 onSkippedRequest: ow.optional.function,
 httpClient: ow.optional.object,
 // AutoscaledPool shorthands
@@ -204,7 +213,7 @@
 config = Configuration.getGlobalConfig()) {
 this.config = config;
 ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape));
-const { requestList, requestQueue, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true, proxyConfiguration,
+const { requestList, requestQueue, requestManager, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, maxCrawlDepth, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true, proxyConfiguration,
 // AutoscaledPool shorthands
 minConcurrency, maxConcurrency, maxRequestsPerMinute, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
 // internal
@@ -228,6 +237,7 @@
 this.log.debug(message);
 request.noRetry = true;
 request.state = RequestState.SKIPPED;
+await this.handleSkippedRequest({ url: request.url, reason: 'redirect' });
 throw new ContextPipelineInterruptedError(message);
 }
 return context;
@@ -235,8 +245,17 @@
 });
 return contextPipeline;
 };
-
-
+if (requestManager !== undefined) {
+if (requestList !== undefined || requestQueue !== undefined) {
+throw new Error('The `requestManager` option cannot be used in conjunction with `requestList` and/or `requestQueue`');
+}
+this.requestManager = requestManager;
+this.requestQueue = requestManager; // TODO(v4) - the cast is not fully legitimate here, but it's fine for internal usage by the BasicCrawler
+}
+else {
+this.requestList = requestList;
+this.requestQueue = requestQueue;
+}
 this.httpClient = httpClient ?? new GotScrapingHttpClient();
 this.proxyConfiguration = proxyConfiguration;
 this.log = log;
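The constructor now accepts a `requestManager` that is mutually exclusive with `requestList`/`requestQueue`, as the hunk above shows. A sketch of passing an explicitly opened queue through the new option; the option name comes from this diff, the surrounding usage is assumed:

```ts
import { BasicCrawler } from '@crawlee/basic';
import { RequestQueue } from '@crawlee/core';

// Sketch based on this diff: the object passed as `requestManager` is used both as the
// request manager and as `this.requestQueue`; combining it with `requestList` or
// `requestQueue` throws, per the new constructor check.
const queue = await RequestQueue.open('my-named-queue');
await queue.addRequests([{ url: 'https://example.com' }]);

const crawler = new BasicCrawler({
    requestManager: queue,
    async requestHandler({ request, log }) {
        log.info(`Handling ${request.url}`);
    },
});

await crawler.run();
```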
@@ -246,6 +265,7 @@
 this.domainAccessedTime = new Map();
 this.experiments = experiments;
 this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });
+this.handleSkippedRequest = this.handleSkippedRequest.bind(this);
 this.requestHandler = requestHandler ?? this.router;
 this.failedRequestHandler = failedRequestHandler;
 this.errorHandler = errorHandler;
@@ -270,9 +290,9 @@
 this.requestQueue.requestLockSecs = Math.max(this.requestHandlerTimeoutMillis / 1000 + 5, 60);
 }
 this.maxRequestRetries = maxRequestRetries;
+this.maxCrawlDepth = maxCrawlDepth;
 this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
 this.maxSessionRotations = maxSessionRotations;
-this.handledRequestsCount = 0;
 this.stats = new Statistics({
 logMessage: `${log.getOptions().prefix} request statistics:`,
 log,
@@ -297,8 +317,8 @@
 this.requestHandlerTimeoutMillis = maxSignedInteger;
 }
 this.internalTimeoutMillis = Math.min(this.internalTimeoutMillis, maxSignedInteger);
-
-const isMaxPagesExceeded = () => maxRequestsPerCrawl && maxRequestsPerCrawl <= this.handledRequestsCount;
+this.maxRequestsPerCrawl = maxRequestsPerCrawl;
+const isMaxPagesExceeded = () => this.maxRequestsPerCrawl && this.maxRequestsPerCrawl <= this.handledRequestsCount;
 // eslint-disable-next-line prefer-const
 let { isFinishedFunction, isTaskReadyFunction } = autoscaledPoolOptions;
 // override even if `isFinishedFunction` provided by user - `keepAlive` has higher priority
@@ -312,10 +332,10 @@
 runTaskFunction: this._runTaskFunction.bind(this),
 isTaskReadyFunction: async () => {
 if (isMaxPagesExceeded()) {
-if (
+if (this.shouldLogMaxProcessedRequestsExceeded) {
 log.info('Crawler reached the maxRequestsPerCrawl limit of ' +
-`${maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
-
+`${this.maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
+this.shouldLogMaxProcessedRequestsExceeded = false;
 }
 return false;
 }
@@ -323,7 +343,7 @@
 },
 isFinishedFunction: async () => {
 if (isMaxPagesExceeded()) {
-log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${maxRequestsPerCrawl} requests ` +
+log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${this.maxRequestsPerCrawl} requests ` +
 'and all requests that were in progress at that time have now finished. ' +
 `In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`);
 return true;
@@ -383,7 +403,7 @@
 message = `Experiencing problems, ${this.stats.state.requestsFailed - previousState.requestsFailed || this.stats.state.requestsFailed} failed requests in the past ${this.statusMessageLoggingInterval} seconds.`;
 }
 else {
-const total = this.
+const total = this.requestManager?.getTotalCount();
 message = `Crawled ${this.stats.state.requestsFinished}${total ? `/${total}` : ''} pages, ${this.stats.state.requestsFailed} failed requests, desired concurrency ${this.autoscaledPool?.desiredConcurrency ?? 0}.`;
 }
 if (this.statusMessageCallback) {
@@ -423,20 +443,30 @@
 if (this.requestQueue?.name === 'default' && purgeRequestQueue) {
 await this.requestQueue.drop();
 this.requestQueue = await this._getRequestQueue();
+this.requestManager = undefined;
+await this.initializeRequestManager();
+this.handledRequestsCount = 0; // This would've been reset by this._init() further down below, but at that point `handledRequestsCount` could prevent `addRequests` from adding the initial requests
 }
 this.stats.reset();
 await this.stats.resetStore();
 await this.sessionPool?.resetStore();
 }
 this.running = true;
-
+this.shouldLogMaxProcessedRequestsExceeded = true;
+this.shouldLogMaxEnqueuedRequestsExceeded = true;
+await purgeDefaultStorages({
+onlyPurgeOnce: true,
+client: this.config.getStorageClient(),
+config: this.config,
+});
 if (requests) {
 await this.addRequests(requests, addRequestsOptions);
 }
 await this._init();
 await this.stats.startCapturing();
 const periodicLogger = this.getPeriodicLogger();
-
+// Don't await, we don't want to block the execution
+void this.setStatusMessage('Starting the crawler.', { level: 'INFO' });
 const sigintHandler = async () => {
 this.log.warning('Pausing... Press CTRL+C again to force exit. To resume, do: CRAWLEE_PURGE_ON_START=0 npm start');
 await this._pauseOnMigration();
@@ -485,7 +515,8 @@
 finished = true;
 }
 periodicLogger.stop();
-
+// Don't await, we don't want to block the execution
+void this.setStatusMessage(`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${this.stats.state.requestsFinished} succeeded, ${this.stats.state.requestsFailed} failed.`, { isStatusMessageTerminal: true, level: 'INFO' });
 this.running = false;
 this.hasFinishedBefore = true;
 }
@@ -495,6 +526,8 @@
 * Gracefully stops the current run of the crawler.
 *
 * All the tasks active at the time of calling this method will be allowed to finish.
+*
+* To stop the crawler immediately, use {@link BasicCrawler.teardown|`crawler.teardown()`} instead.
 */
 stop(message = 'The crawler has been gracefully stopped.') {
 // Gracefully starve the this.autoscaledPool, so it doesn't start new tasks. Resolves once the pool is cleared.
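The updated JSDoc above cross-references `teardown()` from `stop()`. A short sketch of the graceful variant; the stop-condition URL is purely illustrative:

```ts
import { BasicCrawler } from '@crawlee/basic';

// Sketch: `stop()` lets requests that are already running finish, while `teardown()`
// (documented at the end of this diff) shuts the crawler down immediately.
const crawler = new BasicCrawler({
    async requestHandler({ request, log }) {
        log.info(`Processing ${request.url}`);
        if (request.url.includes('/stop-signal')) {
            // hypothetical condition for ending the crawl early
            crawler.stop('Stop signal found, letting in-flight requests finish.');
        }
    },
});

await crawler.run(['https://example.com']);
```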
@@ -511,13 +544,46 @@
 if (!this.requestQueue && this.requestList) {
 this.log.warningOnce('When using RequestList and RequestQueue at the same time, you should instantiate both explicitly and provide them in the crawler options, to ensure correctly handled restarts of the crawler.');
 }
-this.requestQueue
+if (!this.requestQueue) {
+this.requestQueue = await this._getRequestQueue();
+this.requestManager = undefined;
+}
+if (!this.requestManager) {
+this.requestManager =
+this.requestList === undefined
+? this.requestQueue
+: new RequestManagerTandem(this.requestList, this.requestQueue);
+}
 return this.requestQueue;
 }
 async useState(defaultValue = {}) {
 const kvs = await KeyValueStore.open(null, { config: this.config });
 return kvs.getAutoSavedValue(BasicCrawler.CRAWLEE_STATE_KEY, defaultValue);
 }
+get pendingRequestCountApproximation() {
+return this.requestManager?.getPendingCount() ?? 0;
+}
+calculateEnqueuedRequestLimit(explicitLimit) {
+if (this.maxRequestsPerCrawl === undefined) {
+return explicitLimit;
+}
+const limit = Math.max(0, this.maxRequestsPerCrawl - this.handledRequestsCount - this.pendingRequestCountApproximation);
+return Math.min(limit, explicitLimit ?? Infinity);
+}
+async handleSkippedRequest(options) {
+if (options.reason === 'limit' && this.shouldLogMaxEnqueuedRequestsExceeded) {
+this.log.info('The number of requests enqueued by the crawler reached the maxRequestsPerCrawl limit of ' +
+`${this.maxRequestsPerCrawl} requests and no further requests will be added.`);
+this.shouldLogMaxEnqueuedRequestsExceeded = false;
+}
+if (options.reason === 'enqueueLimit') {
+const enqueuedRequestLimit = this.calculateEnqueuedRequestLimit();
+if (enqueuedRequestLimit === undefined || enqueuedRequestLimit !== 0) {
+this.log.info('The number of requests enqueued by the crawler reached the enqueueLinks limit.');
+}
+}
+await this.onSkippedRequest?.(options);
+}
 /**
 * Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
 * adding the rest in background. You can configure the batch size via `batchSize` option and the sleep time in between
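The new `calculateEnqueuedRequestLimit` above derives the remaining enqueue budget from `maxRequestsPerCrawl`, the handled-request count, and the pending-count approximation, then caps it by an explicit limit if one is given. A standalone sketch of the same arithmetic (an illustration, not part of the package's public API):

```ts
// Mirrors the budget computation in `calculateEnqueuedRequestLimit` from this diff.
function calculateEnqueuedRequestLimit(
    maxRequestsPerCrawl: number | undefined,
    handledRequestsCount: number,
    pendingRequestCountApproximation: number,
    explicitLimit?: number,
): number | undefined {
    // No global cap configured: only the explicit limit (if any) applies.
    if (maxRequestsPerCrawl === undefined) return explicitLimit;
    // The remaining budget never goes below zero.
    const remaining = Math.max(0, maxRequestsPerCrawl - handledRequestsCount - pendingRequestCountApproximation);
    return Math.min(remaining, explicitLimit ?? Infinity);
}

// e.g. maxRequestsPerCrawl = 100, 40 handled, 25 pending, enqueueLinks limit 50 -> 35
console.log(calculateEnqueuedRequestLimit(100, 40, 25, 50)); // 35
```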
@@ -530,33 +596,57 @@
 * @param options Options for the request queue
 */
 async addRequests(requests, options = {}) {
-
-
-
-
-const
-const
-
-
-
-
-
-
-
-
+await this.getRequestQueue();
+const requestLimit = this.calculateEnqueuedRequestLimit();
+const skippedBecauseOfRobots = new Set();
+const skippedBecauseOfLimit = new Set();
+const skippedBecauseOfMaxCrawlDepth = new Set();
+const isAllowedBasedOnRobotsTxtFile = this.isAllowedBasedOnRobotsTxtFile.bind(this);
+const maxCrawlDepth = this.maxCrawlDepth;
+ow(requests, ow.object
+.is((value) => isIterable(value) || isAsyncIterable(value))
+.message((value) => `Expected an iterable or async iterable, got ${getObjectType(value)}`));
+async function* filteredRequests() {
+let yieldedRequestCount = 0;
+for await (const request of requests) {
+const url = typeof request === 'string' ? request : request.url;
+if (requestLimit !== undefined && yieldedRequestCount >= requestLimit) {
+skippedBecauseOfLimit.add(url);
+continue;
+}
+if (maxCrawlDepth !== undefined && request.crawlDepth > maxCrawlDepth) {
+skippedBecauseOfMaxCrawlDepth.add(url);
+continue;
+}
+if (await isAllowedBasedOnRobotsTxtFile(url)) {
+yield request;
+yieldedRequestCount += 1;
+}
+else {
+skippedBecauseOfRobots.add(url);
+}
 }
 }
-
+const result = await this.requestManager.addRequestsBatched(filteredRequests(), options);
+if (skippedBecauseOfRobots.size > 0) {
 this.log.warning(`Some requests were skipped because they were disallowed based on the robots.txt file`, {
-skipped: [...
+skipped: [...skippedBecauseOfRobots],
 });
-if (this.onSkippedRequest) {
-await Promise.all([...skipped].map((url) => {
-return this.onSkippedRequest({ url, reason: 'robotsTxt' });
-}));
-}
 }
-
+if (skippedBecauseOfRobots.size > 0 ||
+skippedBecauseOfLimit.size > 0 ||
+skippedBecauseOfMaxCrawlDepth.size > 0) {
+await Promise.all([...skippedBecauseOfRobots]
+.map((url) => {
+return this.handleSkippedRequest({ url, reason: 'robotsTxt' });
+})
+.concat([...skippedBecauseOfLimit].map((url) => {
+return this.handleSkippedRequest({ url, reason: 'limit' });
+}), [...skippedBecauseOfMaxCrawlDepth].map((url) => {
+return this.handleSkippedRequest({ url, reason: 'depth' });
+})));
+}
+return result;
 }
 /**
 * Pushes data to the specified {@link Dataset}, or the default crawler {@link Dataset} by calling {@link Dataset.pushData}.
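Per the rewritten `addRequests` above, the method now accepts any iterable or async iterable of requests and filters them against robots.txt, the remaining `maxRequestsPerCrawl` budget, and `maxCrawlDepth` before handing them to the request manager in batches. A sketch of feeding it a lazy async generator; the URLs and option values are illustrative:

```ts
import { BasicCrawler } from '@crawlee/basic';

// Illustrative stream of requests; only the remaining `maxRequestsPerCrawl` budget is
// enqueued, the rest are reported as skipped with reason 'limit'.
async function* generateRequests() {
    for (let page = 1; page <= 1000; page++) {
        yield { url: `https://example.com/list?page=${page}` };
    }
}

const crawler = new BasicCrawler({
    maxRequestsPerCrawl: 50,
    async requestHandler({ request, log }) {
        log.info(`Crawled ${request.url}`);
    },
});

// Sketch based on this diff: `addRequests` validates that it received an (async) iterable
// and filters it lazily; the exact public typings may differ in the final release.
await crawler.addRequests(generateRequests());
await crawler.run();
```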
@@ -596,7 +686,21 @@
 const dataset = await this.getDataset();
 const items = await dataset.export(options);
 if (format === 'csv') {
-
+let value;
+if (items.length === 0) {
+value = '';
+}
+else {
+const keys = options?.collectAllKeys
+? Array.from(new Set(items.flatMap(Object.keys)))
+: Object.keys(items[0]);
+value = stringify([
+keys,
+...items.map((item) => {
+return keys.map((k) => item[k]);
+}),
+]);
+}
 await ensureDir(dirname(path));
 await writeFile(path, value);
 this.log.info(`Export to ${path} finished!`);
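The CSV branch of `exportData` above now writes an empty file for an empty dataset and, when `collectAllKeys` is set, builds the header from the union of keys across all items instead of from the first item only. A usage sketch; the flag name is read from the diff (`options?.collectAllKeys`) and its public typing is assumed:

```ts
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    async requestHandler({ request, pushData }) {
        // Items with different key sets, to exercise the header union.
        if (request.url.endsWith('/a')) {
            await pushData({ url: request.url, title: 'A' });
        } else {
            await pushData({ url: request.url, statusText: 'OK' });
        }
    },
});

await crawler.run(['https://example.com/a', 'https://example.com/b']);
// Sketch: with `collectAllKeys`, the CSV header covers url, title and statusText.
await crawler.exportData('./storage/results.csv', 'csv', { collectAllKeys: true });
```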
@@ -608,6 +712,9 @@
 }
 return items;
 }
+/**
+* Initializes the crawler.
+*/
 async _init() {
 if (!this.events.isInitialized()) {
 await this.events.init();
@@ -622,6 +729,7 @@
 // Assuming there are not more than 20 browsers running at once;
 this.sessionPool.setMaxListeners(20);
 }
+await this.initializeRequestManager();
 await this._loadHandledRequestCount();
 }
 async runRequestHandler(crawlingContext) {
@@ -643,7 +751,8 @@
 return true;
 }
 const robotsTxtFile = await this.getRobotsTxtFileForUrl(url);
-
+const userAgent = typeof this.respectRobotsTxtFile === 'object' ? this.respectRobotsTxtFile?.userAgent : '*';
+return !robotsTxtFile || robotsTxtFile.isAllowed(url, userAgent);
 }
 async getRobotsTxtFileForUrl(url) {
 if (!this.respectRobotsTxtFile) {
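With the change above, `isAllowedBasedOnRobotsTxtFile` matches robots.txt rules against a specific user agent when `respectRobotsTxtFile` is given as an object, and against `'*'` for the plain boolean form (the option shape was relaxed earlier in this diff to `ow.optional.any(ow.boolean, ow.object)`). A sketch of the object form, assuming the `userAgent` field implied by `respectRobotsTxtFile?.userAgent`:

```ts
import { BasicCrawler } from '@crawlee/basic';

// Sketch based on this diff: the object form of `respectRobotsTxtFile` carries a
// `userAgent` used for robots.txt matching; `true` keeps the previous wildcard behaviour.
const crawler = new BasicCrawler({
    respectRobotsTxtFile: { userAgent: 'MyCrawler' },
    async requestHandler({ request, log }) {
        log.info(`robots.txt allows MyCrawler to fetch ${request.url}`);
    },
});

await crawler.run(['https://example.com']);
```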
@@ -697,30 +806,35 @@
 await Promise.all([requestListPersistPromise, this.stats.persistState()]);
 }
 /**
-*
-* and RequestQueue is present then enqueues it to the queue first.
+* Initializes the RequestManager based on the configured requestList and requestQueue.
 */
-async
-if (
-return
-}
-
-
-
-
-
-
-
+async initializeRequestManager() {
+if (this.requestManager !== undefined) {
+return;
+}
+if (this.requestList && this.requestQueue) {
+// Create a RequestManagerTandem if both RequestList and RequestQueue are provided
+this.requestManager = new RequestManagerTandem(this.requestList, this.requestQueue);
+}
+else if (this.requestQueue) {
+// Use RequestQueue directly if only it is provided
+this.requestManager = this.requestQueue;
+}
+else if (this.requestList) {
+// Use RequestList directly if only it is provided
+// Make it compatible with the IRequestManager interface
+this.requestManager = new RequestListAdapter(this.requestList);
 }
-
-
-
-
-
-
+// If neither RequestList nor RequestQueue is provided, leave the requestManager uninitialized until `getRequestQueue` is called
+}
+/**
+* Fetches the next request to process from the underlying request provider.
+*/
+async _fetchNextRequest() {
+if (this.requestManager === undefined) {
+throw new Error(`_fetchNextRequest called on an uninitialized crawler`);
 }
-
-return this.requestQueue.fetchNextRequest();
+return this.requestManager.fetchNextRequest();
 }
 /**
 * Delays processing of the request based on the `sameDomainDelaySecs` option,
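`initializeRequestManager` above resolves the crawler's request source: with both a `RequestList` and a `RequestQueue` configured it wraps them in a `RequestManagerTandem`, a lone queue is used directly, and a lone list is wrapped in a `RequestListAdapter`. A sketch of the tandem case from the caller's side (standard `RequestList`/`RequestQueue` usage; the wrapping itself stays internal):

```ts
import { BasicCrawler } from '@crawlee/basic';
import { RequestList, RequestQueue } from '@crawlee/core';

// Sketch: providing both sources makes this version consume them through a single
// RequestManagerTandem instead of juggling the list and the queue separately.
const requestList = await RequestList.open('start-urls', ['https://example.com']);
const requestQueue = await RequestQueue.open();

const crawler = new BasicCrawler({
    requestList,
    requestQueue,
    async requestHandler({ request, log }) {
        log.info(`Processing ${request.url}`);
    },
});

await crawler.run();
```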
@@ -759,23 +873,21 @@
 * then retries them in a case of an error, etc.
 */
 async _runTaskFunction() {
-const source = this.
-
-
-await this._timeoutAndRetry(
-request = await this._fetchNextRequest();
-}, this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
+const source = this.requestManager;
+if (!source)
+throw new Error('Request provider is not initialized!');
+const request = await this._timeoutAndRetry(this._fetchNextRequest.bind(this), this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
 tryCancel();
-
-await this._timeoutAndRetry(async () => {
-
+const session = this.useSessionPool
+? await this._timeoutAndRetry(async () => {
+return await this.sessionPool.newSession({
 proxyInfo: await this.proxyConfiguration?.newProxyInfo({
 request: request ?? undefined,
 }),
 maxUsageCount: 1,
 });
-}, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`)
-
+}, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`)
+: undefined;
 tryCancel();
 if (!request || this.delayRequest(request, source)) {
 return;
@@ -785,7 +897,7 @@
 request.state = RequestState.SKIPPED;
 request.noRetry = true;
 await source.markRequestHandled(request);
-await this.
+await this.handleSkippedRequest({
 url: request.url,
 reason: 'robotsTxt',
 });
@@ -803,16 +915,13 @@
 session,
 proxyInfo: session?.proxyInfo,
 enqueueLinks: async (options) => {
-
-
-requestQueue: await this.getRequestQueue(),
-robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
-onSkippedRequest: this.onSkippedRequest,
-...options,
-});
+const requestQueue = await this.getRequestQueue();
+return await this.enqueueLinksWithCrawlDepth(options, request, requestQueue);
 },
-addRequests: async (requests, options) => {
-
+addRequests: async (requests, options = {}) => {
+const newCrawlDepth = request.crawlDepth + 1;
+const requestsGenerator = this.addCrawlDepthRequestGenerator(requests, newCrawlDepth);
+await this.addRequests(requestsGenerator, options);
 },
 pushData: this.pushData.bind(this),
 useState: this.useState.bind(this),
@@ -875,19 +984,60 @@
 }
 }
 /**
-*
+* Wrapper around the crawling context's `enqueueLinks` method:
+* - Injects `crawlDepth` to each request being added based on the crawling context request.
+* - Provides defaults for the `enqueueLinks` options based on the crawler configuration.
+* - These options can be overridden by the user.
+* @internal
+*/
+async enqueueLinksWithCrawlDepth(options, request, requestQueue) {
+const transformRequestFunctionWrapper = (newRequest) => {
+newRequest.crawlDepth = request.crawlDepth + 1;
+if (this.maxCrawlDepth !== undefined && newRequest.crawlDepth > this.maxCrawlDepth) {
+newRequest.skippedReason = 'depth';
+return false;
+}
+// After injecting the crawlDepth, we call the user-provided transform function, if there is one.
+return options.transformRequestFunction?.(newRequest) ?? newRequest;
+};
+return await enqueueLinks({
+requestQueue,
+robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
+onSkippedRequest: this.handleSkippedRequest,
+limit: this.calculateEnqueuedRequestLimit(options.limit),
+// Allow user options to override defaults set above ⤴
+...options,
+transformRequestFunction: transformRequestFunctionWrapper,
+});
+}
+/**
+* Generator function that yields requests injected with the given crawl depth.
+* @internal
+*/
+async *addCrawlDepthRequestGenerator(requests, newRequestDepth) {
+for await (const request of requests) {
+if (typeof request === 'string') {
+yield { url: request, crawlDepth: newRequestDepth };
+}
+else {
+request.crawlDepth ??= newRequestDepth;
+yield request;
+}
+}
+}
+/**
+* Run async callback with given timeout and retry. Returns the result of the callback.
 * @ignore
 */
 async _timeoutAndRetry(handler, timeout, error, maxRetries = 3, retried = 1) {
 try {
-await addTimeoutToPromise(handler, timeout, error);
+return await addTimeoutToPromise(handler, timeout, error);
 }
 catch (e) {
 if (retried <= maxRetries) {
 // we retry on any error, not just timeout
 this.log.warning(`${e.message} (retrying ${retried}/${maxRetries})`);
-
-return;
+return this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1);
 }
 throw e;
 }
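`_timeoutAndRetry` above now returns the wrapped callback's result and re-enters itself on failure instead of returning nothing, which is what lets `_runTaskFunction` receive the fetched request and session directly. A minimal standalone sketch of the pattern using `Promise.race`; the crawler itself relies on its own timeout helper, so this is an illustration only:

```ts
// Generic retry-with-timeout helper that returns the handler's result, mirroring the
// shape of `_timeoutAndRetry` after this change. Simplification: the timeout timer is
// not cleared on success, which a production version would handle.
async function timeoutAndRetry<T>(
    handler: () => Promise<T>,
    timeoutMillis: number,
    errorMessage: string,
    maxRetries = 3,
    retried = 1,
): Promise<T> {
    const timeout = new Promise<never>((_, reject) => {
        setTimeout(() => reject(new Error(errorMessage)), timeoutMillis);
    });
    try {
        return await Promise.race([handler(), timeout]);
    } catch (e) {
        if (retried <= maxRetries) {
            // Retry on any error, not just the timeout, like the original helper.
            return timeoutAndRetry(handler, timeoutMillis, errorMessage, maxRetries, retried + 1);
        }
        throw e;
    }
}
```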
@@ -896,24 +1046,13 @@
 * Returns true if either RequestList or RequestQueue have a request ready for processing.
 */
 async _isTaskReadyFunction() {
-
-const isRequestListEmpty = this.requestList ? await this.requestList.isEmpty() : true;
-// If RequestList is not empty, task is ready, no reason to check RequestQueue.
-if (!isRequestListEmpty)
-return true;
-// If RequestQueue is not empty, task is ready, return true, otherwise false.
-return this.requestQueue ? !(await this.requestQueue.isEmpty()) : false;
+return this.requestManager !== undefined && !(await this.requestManager.isEmpty());
 }
 /**
 * Returns true if both RequestList and RequestQueue have all requests finished.
 */
 async _defaultIsFinishedFunction() {
-
-this.requestList ? this.requestList.isFinished() : true,
-this.requestQueue ? this.requestQueue.isFinished() : true,
-]);
-// If both are finished, return true, otherwise return false.
-return isRequestListFinished && isRequestQueueFinished;
+return !this.requestManager || (await this.requestManager.isFinished());
 }
 async _rotateSession(crawlingContext) {
 const { request } = crawlingContext;
@@ -1037,19 +1176,11 @@
 return request.retryCount < maxRequestRetries;
 }
 /**
-* Updates handledRequestsCount from possibly stored counts,
-* usually after worker migration. Since one of the stores
-* needs to have priority when both are present,
-* it is the request queue, because generally, the request
-* list will first be dumped into the queue and then left
-* empty.
+* Updates handledRequestsCount from possibly stored counts, usually after worker migration.
 */
 async _loadHandledRequestCount() {
-if (this.
-this.handledRequestsCount = await this.
-}
-else if (this.requestList) {
-this.handledRequestsCount = this.requestList.handledCount();
+if (this.requestManager) {
+this.handledRequestsCount = await this.requestManager.handledCount();
 }
 }
 async _executeHooks(hooks, ...args) {
@@ -1060,8 +1191,11 @@
 }
 }
 /**
-*
-*
+* Stops the crawler immediately.
+*
+* This method doesn't wait for currently active requests to finish.
+*
+* To stop the crawler gracefully (waiting for all running requests to finish), use {@link BasicCrawler.stop|`crawler.stop()`} instead.
 */
 async teardown() {
 this.events.emit("persistState" /* EventType.PERSIST_STATE */, { isMigrating: false });