npm - @crawlee/basic - Versions diffs - 4.0.0-beta.4 → 4.0.0-beta.40 - Mend

@crawlee/basic 4.0.0-beta.4 → 4.0.0-beta.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

package/README.md +9 -5
package/index.d.ts +1 -1
package/index.d.ts.map +1 -1
package/index.js +0 -1
package/index.js.map +1 -1
package/internals/basic-crawler.d.ts +270 -102
package/internals/basic-crawler.d.ts.map +1 -1
package/internals/basic-crawler.js +666 -330
package/internals/basic-crawler.js.map +1 -1
package/internals/send-request.d.ts +3 -5
package/internals/send-request.d.ts.map +1 -1
package/internals/send-request.js +21 -25
package/internals/send-request.js.map +1 -1
package/package.json +6 -6
package/internals/constants.d.ts +0 -7
package/internals/constants.d.ts.map +0 -1
package/internals/constants.js +0 -7
package/internals/constants.js.map +0 -1
package/tsconfig.build.tsbuildinfo +0 -1

package/internals/basic-crawler.js CHANGED

@@ -1,14 +1,14 @@
 import { writeFile } from 'node:fs/promises';
 import { dirname } from 'node:path';
-import { AutoscaledPool, Configuration, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
-import { RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
+import { AutoscaledPool, bindMethodsToServiceLocator, BLOCKED_STATUS_CODES, ContextPipeline, ContextPipelineCleanupError, ContextPipelineInitializationError, ContextPipelineInterruptedError, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, KeyValueStore, LogLevel, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestHandlerError, RequestListAdapter, RequestManagerTandem, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, ServiceLocator, serviceLocator, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
+import { GotScrapingHttpClient } from '@crawlee/got-scraping-client';
+import { getObjectType, isAsyncIterable, isIterable, RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
 import { stringify } from 'csv-stringify/sync';
 import { ensureDir, writeJSON } from 'fs-extra/esm';
 import ow from 'ow';
 import { getDomain } from 'tldts';
 import { LruCache } from '@apify/datastructures';
-import defaultLog, { LogLevel } from '@apify/log';
-import { addTimeoutToPromise, TimeoutError, tryCancel } from '@apify/timeout';
+import { addTimeoutToPromise, TimeoutError } from '@apify/timeout';
 import { cryptoRandomObjectId } from '@apify/utilities';
 import { createSendRequest } from './send-request.js';
 /**
@@ -21,6 +21,7 @@ import { createSendRequest } from './send-request.js';
  * @ignore
  */
 const SAFE_MIGRATION_WAIT_MILLIS = 20000;
+const deferredCleanupKey = Symbol('deferredCleanup');
 /**
  * Provides a simple framework for parallel crawling of web pages.
  * The URLs to crawl are fed either from a static list of URLs
@@ -86,8 +87,12 @@ const SAFE_MIGRATION_WAIT_MILLIS = 20000;
  * @category Crawlers
  */
 export class BasicCrawler {
-    config;
     static CRAWLEE_STATE_KEY = 'CRAWLEE_STATE';
+    /**
+     * Tracks crawler instances that accessed shared state without having an explicit id.
+     * Used to detect and warn about multiple crawlers sharing the same state.
+     */
+    static useStateCrawlerIds = new Set();
     /**
      * A reference to the underlying {@link Statistics} class that collects and logs run statistics for requests.
      */
@@ -103,9 +108,12 @@ export class BasicCrawler {
      * Only available if used by the crawler.
      */
     requestQueue;
+    /**
+     * The main request-handling component of the crawler. It's initialized during the crawler startup.
+     */
+    requestManager;
     /**
      * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session|sessions}.
-     * Only available if used by the crawler.
      */
     sessionPool;
     /**
@@ -116,40 +124,79 @@ export class BasicCrawler {
      * or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
      */
     autoscaledPool;
+    /**
+     * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
+     * Only available if used by the crawler.
+     */
+    proxyConfiguration;
     /**
      * Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
      * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
      */
     router = Router.create();
+    _basicContextPipeline;
+    /**
+     * The basic part of the context pipeline. Unlike the subclass pipeline, this
+     * part has no major side effects (e.g. launching a browser). It also makes typing more explicit, as subclass
+     * pipelines expect the basic crawler fields to already be present in the context at runtime.
+     *
+     * Context built with this pipeline can be passed into multiple crawler pipelines at once.
+     * This is used e.g. in the {@link AdaptivePlaywrightCrawler|`AdaptivePlaywrightCrawler`}.
+     */
+    get basicContextPipeline() {
+        if (this._basicContextPipeline === undefined) {
+            this._basicContextPipeline = this.buildBasicContextPipeline();
+        }
+        return this._basicContextPipeline;
+    }
+    _contextPipeline;
+    get contextPipeline() {
+        if (this._contextPipeline === undefined) {
+            this._contextPipeline = this.buildFinalContextPipeline();
+        }
+        return this._contextPipeline;
+    }
     running = false;
     hasFinishedBefore = false;
-    log;
+    unexpectedStop = false;
+    #log;
+    get log() {
+        return this.#log;
+    }
     requestHandler;
     errorHandler;
     failedRequestHandler;
     requestHandlerTimeoutMillis;
     internalTimeoutMillis;
     maxRequestRetries;
+    maxCrawlDepth;
     sameDomainDelayMillis;
     domainAccessedTime;
     maxSessionRotations;
-    handledRequestsCount;
+    maxRequestsPerCrawl;
+    handledRequestsCount = 0;
     statusMessageLoggingInterval;
     statusMessageCallback;
     sessionPoolOptions;
-    useSessionPool;
-    crawlingContexts = new Map();
+    blockedStatusCodes = new Set();
+    additionalHttpErrorStatusCodes;
+    ignoreHttpErrorStatusCodes;
     autoscaledPoolOptions;
-    events;
     httpClient;
     retryOnBlocked;
     respectRobotsTxtFile;
     onSkippedRequest;
     _closeEvents;
+    loggedPerRun = new Set();
     experiments;
     robotsTxtFileCache;
     _experimentWarnings = {};
+    crawlerId;
+    hasExplicitId;
+    contextPipelineOptions;
     static optionsShape = {
+        contextPipelineBuilder: ow.optional.object,
+        extendContext: ow.optional.function,
         requestList: ow.optional.object.validate(validators.requestList),
         requestQueue: ow.optional.object.validate(validators.requestQueue),
         // Subclasses override this function instead of passing it
@@ -163,144 +210,342 @@ export class BasicCrawler {
         sameDomainDelaySecs: ow.optional.number,
         maxSessionRotations: ow.optional.number,
         maxRequestsPerCrawl: ow.optional.number,
+        maxCrawlDepth: ow.optional.number,
         autoscaledPoolOptions: ow.optional.object,
         sessionPoolOptions: ow.optional.object,
-        useSessionPool: ow.optional.boolean,
+        proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
         statusMessageLoggingInterval: ow.optional.number,
         statusMessageCallback: ow.optional.function,
+        additionalHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
+        ignoreHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
+        blockedStatusCodes: ow.optional.array.ofType(ow.number),
         retryOnBlocked: ow.optional.boolean,
-        respectRobotsTxtFile: ow.optional.boolean,
+        respectRobotsTxtFile: ow.optional.any(ow.boolean, ow.object),
         onSkippedRequest: ow.optional.function,
         httpClient: ow.optional.object,
+        configuration: ow.optional.object,
+        storageClient: ow.optional.object,
+        eventManager: ow.optional.object,
+        logger: ow.optional.object,
         // AutoscaledPool shorthands
         minConcurrency: ow.optional.number,
         maxConcurrency: ow.optional.number,
         maxRequestsPerMinute: ow.optional.number.integerOrInfinite.positive.greaterThanOrEqual(1),
         keepAlive: ow.optional.boolean,
         // internal
-        log: ow.optional.object,
         experiments: ow.optional.object,
         statisticsOptions: ow.optional.object,
+        id: ow.optional.string,
     };
     /**
      * All `BasicCrawler` parameters are passed via an options object.
      */
-    constructor(options = {}, config = Configuration.getGlobalConfig()) {
-        this.config = config;
+    constructor(options = {}) {
         ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape));
-        const { requestList, requestQueue, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true,
+        const { requestList, requestQueue, requestManager, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, maxCrawlDepth, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, proxyConfiguration, additionalHttpErrorStatusCodes = [], ignoreHttpErrorStatusCodes = [],
+        // Service locator options
+        configuration, storageClient, eventManager, logger,
         // AutoscaledPool shorthands
-        minConcurrency, maxConcurrency, maxRequestsPerMinute, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
+        minConcurrency, maxConcurrency, maxRequestsPerMinute, blockedStatusCodes: blockedStatusCodesInput, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
         // internal
-        log = defaultLog.child({ prefix: this.constructor.name }), experiments = {}, } = options;
-        this.requestList = requestList;
-        this.requestQueue = requestQueue;
-        this.httpClient = httpClient ?? new GotScrapingHttpClient();
-        this.log = log;
-        this.statusMessageLoggingInterval = statusMessageLoggingInterval;
-        this.statusMessageCallback = statusMessageCallback;
-        this.events = config.getEventManager();
-        this.domainAccessedTime = new Map();
-        this.experiments = experiments;
-        this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });
-        // FIXME any
-        this.requestHandler = requestHandler ?? this.router;
-        this.failedRequestHandler = failedRequestHandler;
-        this.errorHandler = errorHandler;
-        if (requestHandlerTimeoutSecs) {
-            this.requestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
+        experiments = {}, id, } = options;
+        // Create per-crawler service locator if custom services were provided.
+        // This wraps every method on the crawler instance so that calls to the global `serviceLocator`
+        // (via AsyncLocalStorage) resolve to this scoped instance instead.
+        // We also enter the scope for the rest of the constructor body, so that any code below
+        // that accesses `serviceLocator` will see the correct (scoped) instance.
+        let serviceLocatorScope = { enterScope: () => { }, exitScope: () => { } };
+        if (storageClient ||
+            eventManager ||
+            logger ||
+            (configuration !== undefined && configuration !== serviceLocator.getConfiguration())) {
+            const scopedServiceLocator = new ServiceLocator(configuration, eventManager, storageClient, logger);
+            serviceLocatorScope = bindMethodsToServiceLocator(scopedServiceLocator, this);
         }
-        else {
-            this.requestHandlerTimeoutMillis = 60_000;
-        }
-        this.retryOnBlocked = retryOnBlocked;
-        this.respectRobotsTxtFile = respectRobotsTxtFile;
-        this.onSkippedRequest = onSkippedRequest;
-        const tryEnv = (val) => (val == null ? null : +val);
-        // allow at least 5min for internal timeouts
-        this.internalTimeoutMillis =
-            tryEnv(process.env.CRAWLEE_INTERNAL_TIMEOUT) ?? Math.max(this.requestHandlerTimeoutMillis * 2, 300e3);
-        // override the default internal timeout of request queue to respect `requestHandlerTimeoutMillis`
-        if (this.requestQueue) {
-            this.requestQueue.internalTimeoutMillis = this.internalTimeoutMillis;
-            // for request queue v2, we want to lock requests for slightly longer than the request handler timeout so that there is some padding for locking-related overhead,
-            // but never for less than a minute
-            this.requestQueue.requestLockSecs = Math.max(this.requestHandlerTimeoutMillis / 1000 + 5, 60);
-        }
-        this.maxRequestRetries = maxRequestRetries;
-        this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
-        this.maxSessionRotations = maxSessionRotations;
-        this.handledRequestsCount = 0;
-        this.stats = new Statistics({
-            logMessage: `${log.getOptions().prefix} request statistics:`,
-            log,
-            config,
-            ...statisticsOptions,
-        });
-        this.sessionPoolOptions = {
-            ...sessionPoolOptions,
-            log,
-        };
-        if (this.retryOnBlocked) {
-            this.sessionPoolOptions.blockedStatusCodes = sessionPoolOptions.blockedStatusCodes ?? [];
-            if (this.sessionPoolOptions.blockedStatusCodes.length !== 0) {
-                log.warning(`Both 'blockedStatusCodes' and 'retryOnBlocked' are set. Please note that the 'retryOnBlocked' feature might not work as expected.`);
+        try {
+            serviceLocatorScope.enterScope();
+            this.contextPipelineOptions = {
+                contextPipelineBuilder: options.contextPipelineBuilder,
+                extendContext: options.extendContext,
+            };
+            this.#log = serviceLocator.getLogger().child({ prefix: this.constructor.name });
+            // Store whether the user explicitly provided an ID
+            this.hasExplicitId = id !== undefined;
+            // Store the user-provided ID, or generate a unique one for tracking purposes (not for state key)
+            this.crawlerId = id ?? cryptoRandomObjectId();
+            if (requestManager !== undefined) {
+                if (requestList !== undefined || requestQueue !== undefined) {
+                    throw new Error('The `requestManager` option cannot be used in conjunction with `requestList` and/or `requestQueue`');
+                }
+                this.requestManager = requestManager;
+                this.requestQueue = requestManager; // TODO(v4) - the cast is not fully legitimate here, but it's fine for internal usage by the BasicCrawler
             }
-        }
-        this.useSessionPool = useSessionPool;
-        this.crawlingContexts = new Map();
-        const maxSignedInteger = 2 ** 31 - 1;
-        if (this.requestHandlerTimeoutMillis > maxSignedInteger) {
-            log.warning(`requestHandlerTimeoutMillis ${this.requestHandlerTimeoutMillis}` +
-                ` does not fit a signed 32-bit integer. Limiting the value to ${maxSignedInteger}`);
-            this.requestHandlerTimeoutMillis = maxSignedInteger;
-        }
-        this.internalTimeoutMillis = Math.min(this.internalTimeoutMillis, maxSignedInteger);
-        let shouldLogMaxPagesExceeded = true;
-        const isMaxPagesExceeded = () => maxRequestsPerCrawl && maxRequestsPerCrawl <= this.handledRequestsCount;
-        // eslint-disable-next-line prefer-const
-        let { isFinishedFunction, isTaskReadyFunction } = autoscaledPoolOptions;
-        // override even if `isFinishedFunction` provided by user - `keepAlive` has higher priority
-        if (keepAlive) {
-            isFinishedFunction = async () => false;
-        }
-        const basicCrawlerAutoscaledPoolConfiguration = {
-            minConcurrency: minConcurrency ?? autoscaledPoolOptions?.minConcurrency,
-            maxConcurrency: maxConcurrency ?? autoscaledPoolOptions?.maxConcurrency,
-            maxTasksPerMinute: maxRequestsPerMinute ?? autoscaledPoolOptions?.maxTasksPerMinute,
-            runTaskFunction: this._runTaskFunction.bind(this),
-            isTaskReadyFunction: async () => {
-                if (isMaxPagesExceeded()) {
-                    if (shouldLogMaxPagesExceeded) {
-                        log.info('Crawler reached the maxRequestsPerCrawl limit of ' +
-                            `${maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
-                        shouldLogMaxPagesExceeded = false;
+            else {
+                this.requestList = requestList;
+                this.requestQueue = requestQueue;
+            }
+            this.httpClient = httpClient ?? new GotScrapingHttpClient();
+            this.proxyConfiguration = proxyConfiguration;
+            this.statusMessageLoggingInterval = statusMessageLoggingInterval;
+            this.statusMessageCallback = statusMessageCallback;
+            this.domainAccessedTime = new Map();
+            this.experiments = experiments;
+            this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });
+            this.handleSkippedRequest = this.handleSkippedRequest.bind(this);
+            this.additionalHttpErrorStatusCodes = new Set([...additionalHttpErrorStatusCodes]);
+            this.ignoreHttpErrorStatusCodes = new Set([...ignoreHttpErrorStatusCodes]);
+            this.requestHandler = requestHandler ?? this.router;
+            this.failedRequestHandler = failedRequestHandler;
+            this.errorHandler = errorHandler;
+            if (requestHandlerTimeoutSecs) {
+                this.requestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
+            }
+            else {
+                this.requestHandlerTimeoutMillis = 60_000;
+            }
+            this.retryOnBlocked = retryOnBlocked;
+            this.respectRobotsTxtFile = respectRobotsTxtFile;
+            this.onSkippedRequest = onSkippedRequest;
+            const tryEnv = (val) => (val == null ? null : +val);
+            // allow at least 5min for internal timeouts
+            this.internalTimeoutMillis =
+                tryEnv(process.env.CRAWLEE_INTERNAL_TIMEOUT) ?? Math.max(this.requestHandlerTimeoutMillis * 2, 300e3);
+            // override the default internal timeout of request queue to respect `requestHandlerTimeoutMillis`
+            if (this.requestQueue) {
+                this.requestQueue.internalTimeoutMillis = this.internalTimeoutMillis;
+                // for request queue v2, we want to lock requests for slightly longer than the request handler timeout so that there is some padding for locking-related overhead,
+                // but never for less than a minute
+                this.requestQueue.requestLockSecs = Math.max(this.requestHandlerTimeoutMillis / 1000 + 5, 60);
+            }
+            this.maxRequestRetries = maxRequestRetries;
+            this.maxCrawlDepth = maxCrawlDepth;
+            this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
+            this.maxSessionRotations = maxSessionRotations;
+            this.stats = new Statistics({
+                logMessage: `${this.constructor.name} request statistics:`,
+                log: this.log,
+                ...(this.hasExplicitId ? { id: this.crawlerId } : {}),
+                ...statisticsOptions,
+            });
+            this.sessionPoolOptions = {
+                ...sessionPoolOptions,
+                log: this.log,
+            };
+            this.sessionPool = new SessionPool(this.sessionPoolOptions);
+            this.blockedStatusCodes = new Set(blockedStatusCodesInput ?? BLOCKED_STATUS_CODES);
+            const maxSignedInteger = 2 ** 31 - 1;
+            if (this.requestHandlerTimeoutMillis > maxSignedInteger) {
+                this.log.warning(`requestHandlerTimeoutMillis ${this.requestHandlerTimeoutMillis}` +
+                    ` does not fit a signed 32-bit integer. Limiting the value to ${maxSignedInteger}`);
+                this.requestHandlerTimeoutMillis = maxSignedInteger;
+            }
+            this.internalTimeoutMillis = Math.min(this.internalTimeoutMillis, maxSignedInteger);
+            this.maxRequestsPerCrawl = maxRequestsPerCrawl;
+            const isMaxPagesExceeded = () => this.maxRequestsPerCrawl && this.maxRequestsPerCrawl <= this.handledRequestsCount;
+            // eslint-disable-next-line prefer-const
+            let { isFinishedFunction, isTaskReadyFunction } = autoscaledPoolOptions;
+            // override even if `isFinishedFunction` provided by user - `keepAlive` has higher priority
+            if (keepAlive) {
+                isFinishedFunction = async () => false;
+            }
+            const basicCrawlerAutoscaledPoolConfiguration = {
+                minConcurrency: minConcurrency ?? autoscaledPoolOptions?.minConcurrency,
+                maxConcurrency: maxConcurrency ?? autoscaledPoolOptions?.maxConcurrency,
+                maxTasksPerMinute: maxRequestsPerMinute ?? autoscaledPoolOptions?.maxTasksPerMinute,
+                runTaskFunction: async () => {
+                    const source = this.requestManager;
+                    if (!source)
+                        throw new Error('Request provider is not initialized!');
+                    const request = await this.resolveRequest();
+                    if (!request || this.delayRequest(request, source)) {
+                        return;
                     }
-                    return false;
-                }
-                return isTaskReadyFunction ? await isTaskReadyFunction() : await this._isTaskReadyFunction();
+                    const crawlingContext = { request };
+                    try {
+                        await this.basicContextPipeline
+                            .chain(this.contextPipeline)
+                            .call(crawlingContext, (ctx) => this.handleRequest(ctx, source));
+                    }
+                    catch (error) {
+                        // ContextPipelineInterruptedError means the request was intentionally skipped
+                        // (e.g., doesn't match enqueue strategy after redirect). Just return gracefully.
+                        if (error instanceof ContextPipelineInterruptedError) {
+                            await this._timeoutAndRetry(async () => this.requestManager?.markRequestHandled(crawlingContext.request), this.internalTimeoutMillis, `Marking request ${crawlingContext.request.url} (${crawlingContext.request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
+                            return;
+                        }
+                        // If the error happened during pipeline initialization (e.g., navigation timeout, session/proxy error,
+                        // i.e. not in user's requestHandler), handle it through the normal error flow.
+                        const isPipelineError = error instanceof ContextPipelineInitializationError || error instanceof SessionError;
+                        if (isPipelineError) {
+                            const unwrappedError = this.unwrapError(error);
+                            await this._requestFunctionErrorHandler(unwrappedError, crawlingContext, this.requestManager);
+                            crawlingContext.session?.markBad();
+                            return;
+                        }
+                        throw this.unwrapError(error);
+                    }
+                },
+                isTaskReadyFunction: async () => {
+                    if (isMaxPagesExceeded()) {
+                        this.logOncePerRun('shuttingDown', 'Crawler reached the maxRequestsPerCrawl limit of ' +
+                            `${this.maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
+                        return false;
+                    }
+                    if (this.unexpectedStop) {
+                        this.logOncePerRun('shuttingDown', 'No new requests are allowed because the `stop()` method has been called. ' +
+                            'Ongoing requests will be allowed to complete.');
+                        return false;
+                    }
+                    return isTaskReadyFunction ? await isTaskReadyFunction() : await this._isTaskReadyFunction();
+                },
+                isFinishedFunction: async () => {
+                    if (isMaxPagesExceeded()) {
+                        this.log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${this.maxRequestsPerCrawl} requests ` +
+                            'and all requests that were in progress at that time have now finished. ' +
+                            `In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`);
+                        return true;
+                    }
+                    if (this.unexpectedStop) {
+                        this.log.info('The crawler has finished all the remaining ongoing requests and will shut down now.');
+                        return true;
+                    }
+                    const isFinished = isFinishedFunction
+                        ? await isFinishedFunction()
+                        : await this._defaultIsFinishedFunction();
+                    if (isFinished) {
+                        const reason = isFinishedFunction
+                            ? "Crawler's custom isFinishedFunction() returned true, the crawler will shut down."
+                            : 'All requests from the queue have been processed, the crawler will shut down.';
+                        this.log.info(reason);
+                    }
+                    return isFinished;
+                },
+                log: this.log,
+            };
+            this.autoscaledPoolOptions = { ...autoscaledPoolOptions, ...basicCrawlerAutoscaledPoolConfiguration };
+        }
+        finally {
+            serviceLocatorScope.exitScope();
+        }
+    }
+    /**
+     * Determines if the given HTTP status code is an error status code given
+     * the default behaviour and user-set preferences.
+     * @param status
+     * @returns `true` if the status code is considered an error, `false` otherwise
+     */
+    isErrorStatusCode(status) {
+        const excludeError = this.ignoreHttpErrorStatusCodes.has(status);
+        const includeError = this.additionalHttpErrorStatusCodes.has(status);
+        return (status >= 500 && !excludeError) || includeError;
+    }
+    /**
+     * Builds the basic context pipeline that transforms `{ request }` into a full `CrawlingContext`.
+     * This handles base context creation, session resolution, and context helpers.
+     */
+    buildBasicContextPipeline() {
+        return ContextPipeline.create()
+            .compose({ action: this.checkRobotsTxt.bind(this) })
+            .compose({
+            action: () => this.createBaseContext(),
+            cleanup: async (context) => {
+                await Promise.all(context[deferredCleanupKey].map((fn) => fn()));
             },
-            isFinishedFunction: async () => {
-                if (isMaxPagesExceeded()) {
-                    log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${maxRequestsPerCrawl} requests ` +
-                        'and all requests that were in progress at that time have now finished. ' +
-                        `In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`);
-                    return true;
-                }
-                const isFinished = isFinishedFunction
-                    ? await isFinishedFunction()
-                    : await this._defaultIsFinishedFunction();
-                if (isFinished) {
-                    const reason = isFinishedFunction
-                        ? "Crawler's custom isFinishedFunction() returned true, the crawler will shut down."
-                        : 'All requests from the queue have been processed, the crawler will shut down.';
-                    log.info(reason);
-                }
-                return isFinished;
+        })
+            .compose({ action: this.resolveSession.bind(this) })
+            .compose({ action: this.createContextHelpers.bind(this) });
+    }
+    async checkRobotsTxt({ request }) {
+        if (!(await this.isAllowedBasedOnRobotsTxtFile(request.url))) {
+            this.log.warning(`Skipping request ${request.url} (${request.id}) because it is disallowed based on robots.txt`);
+            request.state = RequestState.SKIPPED;
+            request.noRetry = true;
+            await this.handleSkippedRequest({
+                url: request.url,
+                reason: 'robotsTxt',
+            });
+            throw new ContextPipelineInterruptedError(`Skipping request ${request.url} as disallowed by robots.txt`);
+        }
+        return {};
+    }
+    /**
+     * Builds the subclass-specific context pipeline that transforms a `CrawlingContext` into the crawler's target context type.
+     * Subclasses should override this to add their own pipeline stages.
+     */
+    buildContextPipeline() {
+        return ContextPipeline.create();
+    }
+    createBaseContext() {
+        const deferredCleanup = [];
+        return {
+            id: cryptoRandomObjectId(10),
+            log: this.log,
+            pushData: this.pushData.bind(this),
+            useState: this.useState.bind(this),
+            getKeyValueStore: async (idOrName) => KeyValueStore.open(idOrName),
+            registerDeferredCleanup: (cleanup) => {
+                deferredCleanup.push(cleanup);
             },
-            log,
+            [deferredCleanupKey]: deferredCleanup,
         };
-        this.autoscaledPoolOptions = { ...autoscaledPoolOptions, ...basicCrawlerAutoscaledPoolConfiguration };
+    }
+    async resolveRequest() {
+        const request = await this._timeoutAndRetry(this._fetchNextRequest.bind(this), this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
+        // Reset loadedUrl so an old one is not carried over to retries.
+        if (request) {
+            request.loadedUrl = undefined;
+        }
+        return request;
+    }
+    async resolveSession({ request }) {
+        const session = await this._timeoutAndRetry(async () => {
+            return await this.sessionPool.newSession({
+                proxyInfo: await this.proxyConfiguration?.newProxyInfo({
+                    request: request ?? undefined,
+                }),
+                maxUsageCount: 1,
+            });
+        }, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
+        return { session, proxyInfo: session.proxyInfo };
+    }
+    async createContextHelpers({ request, session }) {
+        const enqueueLinksWrapper = async (options) => {
+            const requestQueue = await this.getRequestQueue();
+            return await this.enqueueLinksWithCrawlDepth(options, request, requestQueue);
+        };
+        const addRequests = async (requests, options = {}) => {
+            const newCrawlDepth = request.crawlDepth + 1;
+            const requestsGenerator = this.addCrawlDepthRequestGenerator(requests, newCrawlDepth);
+            await this.addRequests(requestsGenerator, options);
+        };
+        const sendRequest = createSendRequest(this.httpClient, request, session);
+        return { enqueueLinks: enqueueLinksWrapper, addRequests, sendRequest };
+    }
+    buildFinalContextPipeline() {
+        let contextPipeline = (this.contextPipelineOptions.contextPipelineBuilder?.() ??
+            this.buildContextPipeline());
+        const { extendContext } = this.contextPipelineOptions;
+        if (extendContext !== undefined) {
+            contextPipeline = contextPipeline.compose({
+                action: async (context) => await extendContext(context),
+            });
+        }
+        contextPipeline = contextPipeline.compose({
+            action: async (context) => {
+                const { request } = context;
+                if (request && !this.requestMatchesEnqueueStrategy(request)) {
+                    // eslint-disable-next-line dot-notation
+                    const message = `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`;
+                    this.log.debug(message);
+                    request.noRetry = true;
+                    request.state = RequestState.SKIPPED;
+                    await this.handleSkippedRequest({ url: request.url, reason: 'redirect' });
+                    throw new ContextPipelineInterruptedError(message);
+                }
+                return context;
+            },
+        });
+        return contextPipeline;
     }
     /**
      * Checks if the given error is a proxy error by comparing its message to a list of known proxy error messages.
@@ -311,21 +556,13 @@ export class BasicCrawler {
     isProxyError(error) {
         return ROTATE_PROXY_ERRORS.some((x) => this._getMessageFromError(error)?.includes(x));
     }
-    /**
-     * Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
-     * Returns `false` if the request is not blocked, otherwise returns a string with a description of the block reason.
-     * @param _crawlingContext The crawling context to check.
-     */
-    async isRequestBlocked(_crawlingContext) {
-        throw new Error('the "isRequestBlocked" method is not implemented in this crawler.');
-    }
     /**
      * This method is periodically called by the crawler, every `statusMessageLoggingInterval` seconds.
      */
     async setStatusMessage(message, options = {}) {
         const data = options.isStatusMessageTerminal != null ? { terminal: options.isStatusMessageTerminal } : undefined;
-        this.log.internal(LogLevel[options.level ?? 'DEBUG'], message, data);
-        const client = this.config.getStorageClient();
+        this.log.logWithLevel(LogLevel[options.level ?? 'DEBUG'], message, data);
+        const client = serviceLocator.getStorageClient();
         if (!client.setStatusMessage) {
             return;
         }
@@ -350,7 +587,7 @@ export class BasicCrawler {
                 message = `Experiencing problems, ${this.stats.state.requestsFailed - previousState.requestsFailed || this.stats.state.requestsFailed} failed requests in the past ${this.statusMessageLoggingInterval} seconds.`;
             }
             else {
-                const total = this.requestQueue?.getTotalCount() || this.requestList?.length();
+                const total = this.requestManager?.getTotalCount();
                 message = `Crawled ${this.stats.state.requestsFinished}${total ? `/${total}` : ''} pages, ${this.stats.state.requestsFailed} failed requests, desired concurrency ${this.autoscaledPool?.desiredConcurrency ?? 0}.`;
             }
             if (this.statusMessageCallback) {
@@ -390,20 +627,30 @@ export class BasicCrawler {
             if (this.requestQueue?.name === 'default' && purgeRequestQueue) {
                 await this.requestQueue.drop();
                 this.requestQueue = await this._getRequestQueue();
+                this.requestManager = undefined;
+                await this.initializeRequestManager();
+                this.handledRequestsCount = 0; // This would've been reset by this._init() further down below, but at that point `handledRequestsCount` could prevent `addRequests` from adding the initial requests
             }
             this.stats.reset();
             await this.stats.resetStore();
             await this.sessionPool?.resetStore();
         }
+        this.unexpectedStop = false;
         this.running = true;
-        await purgeDefaultStorages({ onlyPurgeOnce: true });
+        this.loggedPerRun.clear();
+        await purgeDefaultStorages({
+            onlyPurgeOnce: true,
+            client: serviceLocator.getStorageClient(),
+            config: serviceLocator.getConfiguration(),
+        });
         if (requests) {
             await this.addRequests(requests, addRequestsOptions);
         }
         await this._init();
         await this.stats.startCapturing();
         const periodicLogger = this.getPeriodicLogger();
-        await this.setStatusMessage('Starting the crawler.', { level: 'INFO' });
+        // Don't await, we don't want to block the execution
+        void this.setStatusMessage('Starting the crawler.', { level: 'INFO' });
         const sigintHandler = async () => {
             this.log.warning('Pausing... Press CTRL+C again to force exit. To resume, do: CRAWLEE_PURGE_ON_START=0 npm start');
             await this._pauseOnMigration();
@@ -412,8 +659,9 @@ export class BasicCrawler {
         // Attach a listener to handle migration and aborting events gracefully.
         const boundPauseOnMigration = this._pauseOnMigration.bind(this);
         process.once('SIGINT', sigintHandler);
-        this.events.on("migrating" /* EventType.MIGRATING */, boundPauseOnMigration);
-        this.events.on("aborting" /* EventType.ABORTING */, boundPauseOnMigration);
+        const eventManager = serviceLocator.getEventManager();
+        eventManager.on("migrating" /* EventType.MIGRATING */, boundPauseOnMigration);
+        eventManager.on("aborting" /* EventType.ABORTING */, boundPauseOnMigration);
         let stats = {};
         try {
             await this.autoscaledPool.run();
@@ -422,8 +670,8 @@ export class BasicCrawler {
             await this.teardown();
             await this.stats.stopCapturing();
             process.off('SIGINT', sigintHandler);
-            this.events.off("migrating" /* EventType.MIGRATING */, boundPauseOnMigration);
-            this.events.off("aborting" /* EventType.ABORTING */, boundPauseOnMigration);
+            eventManager.off("migrating" /* EventType.MIGRATING */, boundPauseOnMigration);
+            eventManager.off("aborting" /* EventType.ABORTING */, boundPauseOnMigration);
             const finalStats = this.stats.calculate();
             stats = {
                 requestsFinished: this.stats.state.requestsFinished,
@@ -440,7 +688,7 @@ export class BasicCrawler {
                     mostCommonErrors: this.stats.errorTracker.getMostPopularErrors(3).map(prettify),
                 });
             }
-            const client = this.config.getStorageClient();
+            const client = serviceLocator.getStorageClient();
             if (client.teardown) {
                 let finished = false;
                 setTimeout(() => {
@@ -452,7 +700,8 @@ export class BasicCrawler {
                 finished = true;
             }
             periodicLogger.stop();
-            await this.setStatusMessage(`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${this.stats.state.requestsFinished} succeeded, ${this.stats.state.requestsFailed} failed.`, { isStatusMessageTerminal: true, level: 'INFO' });
+            // Don't await, we don't want to block the execution
+            void this.setStatusMessage(`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${this.stats.state.requestsFinished} succeeded, ${this.stats.state.requestsFailed} failed.`, { isStatusMessageTerminal: true, level: 'INFO' });
             this.running = false;
             this.hasFinishedBefore = true;
         }
@@ -462,29 +711,75 @@ export class BasicCrawler {
      * Gracefully stops the current run of the crawler.
      *
      * All the tasks active at the time of calling this method will be allowed to finish.
+     *
+     * To stop the crawler immediately, use {@link BasicCrawler.teardown|`crawler.teardown()`} instead.
      */
-    stop(message = 'The crawler has been gracefully stopped.') {
-        // Gracefully starve the this.autoscaledPool, so it doesn't start new tasks. Resolves once the pool is cleared.
-        this.autoscaledPool
-            ?.pause()
-            // Resolves the `autoscaledPool.run()` promise in the `BasicCrawler.run()` method. Since the pool is already paused, it resolves immediately and doesn't kill any tasks.
-            .then(async () => this.autoscaledPool?.abort())
-            .then(() => this.log.info(message))
-            .catch((err) => {
-            this.log.error('An error occurred when stopping the crawler:', err);
-        });
+    stop(reason = 'The crawler has been gracefully stopped.') {
+        if (this.unexpectedStop) {
+            return;
+        }
+        this.log.info(reason);
+        this.unexpectedStop = true;
     }
     async getRequestQueue() {
         if (!this.requestQueue && this.requestList) {
             this.log.warningOnce('When using RequestList and RequestQueue at the same time, you should instantiate both explicitly and provide them in the crawler options, to ensure correctly handled restarts of the crawler.');
         }
-        this.requestQueue ??= await this._getRequestQueue();
+        if (!this.requestQueue) {
+            this.requestQueue = await this._getRequestQueue();
+            this.requestManager = undefined;
+        }
+        if (!this.requestManager) {
+            this.requestManager =
+                this.requestList === undefined
+                    ? this.requestQueue
+                    : new RequestManagerTandem(this.requestList, this.requestQueue);
+        }
         return this.requestQueue;
     }
     async useState(defaultValue = {}) {
-        const kvs = await KeyValueStore.open(null, { config: this.config });
+        const kvs = await KeyValueStore.open(null, { config: serviceLocator.getConfiguration() });
+        if (this.hasExplicitId) {
+            const stateKey = `${BasicCrawler.CRAWLEE_STATE_KEY}_${this.crawlerId}`;
+            return kvs.getAutoSavedValue(stateKey, defaultValue);
+        }
+        BasicCrawler.useStateCrawlerIds.add(this.crawlerId);
+        if (BasicCrawler.useStateCrawlerIds.size > 1) {
+            serviceLocator
+                .getLogger()
+                .warningOnce('Multiple crawler instances are calling useState() without an explicit `id` option. \n' +
+                'This means they will share the same state object, which is likely unintended. \n' +
+                'To fix this, provide a unique `id` option to each crawler instance. \n' +
+                'Example: new BasicCrawler({ id: "my-crawler-1", ... })');
+        }
         return kvs.getAutoSavedValue(BasicCrawler.CRAWLEE_STATE_KEY, defaultValue);
     }
+    get pendingRequestCountApproximation() {
+        return this.requestManager?.getPendingCount() ?? 0;
+    }
+    calculateEnqueuedRequestLimit(explicitLimit) {
+        if (this.maxRequestsPerCrawl === undefined) {
+            return explicitLimit;
+        }
+        const limit = Math.max(0, this.maxRequestsPerCrawl - this.handledRequestsCount - this.pendingRequestCountApproximation);
+        return Math.min(limit, explicitLimit ?? Infinity);
+    }
+    async handleSkippedRequest(options) {
+        if (options.reason === 'limit') {
+            this.logOncePerRun('maxRequestsPerCrawl', 'The number of requests enqueued by the crawler reached the maxRequestsPerCrawl limit of ' +
+                `${this.maxRequestsPerCrawl} requests and no further requests will be added.`);
+        }
+        if (options.reason === 'depth') {
+            this.logOncePerRun('maxCrawlDepth', `The crawler reached the maxCrawlDepth limit of ${this.maxCrawlDepth} and no further requests will be enqueued.`);
+        }
+        await this.onSkippedRequest?.(options);
+    }
+    logOncePerRun(key, message) {
+        if (!this.loggedPerRun.has(key)) {
+            this.log.info(message);
+            this.loggedPerRun.add(key);
+        }
+    }
     /**
      * Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
      * adding the rest in background. You can configure the batch size via `batchSize` option and the sleep time in between
@@ -497,33 +792,57 @@ export class BasicCrawler {
      * @param options Options for the request queue
      */
     async addRequests(requests, options = {}) {
-        const requestQueue = await this.getRequestQueue();
-        if (!this.respectRobotsTxtFile) {
-            return requestQueue.addRequestsBatched(requests, options);
-        }
-        const allowedRequests = [];
-        const skipped = new Set();
-        for (const request of requests) {
-            const url = typeof request === 'string' ? request : request.url;
-            if (await this.isAllowedBasedOnRobotsTxtFile(url)) {
-                allowedRequests.push(request);
-            }
-            else {
-                skipped.add(url);
-                await this.onSkippedRequest?.({ url, reason: 'robotsTxt' });
+        await this.getRequestQueue();
+        const requestLimit = this.calculateEnqueuedRequestLimit();
+        const skippedBecauseOfRobots = new Set();
+        const skippedBecauseOfLimit = new Set();
+        const skippedBecauseOfMaxCrawlDepth = new Set();
+        const isAllowedBasedOnRobotsTxtFile = this.isAllowedBasedOnRobotsTxtFile.bind(this);
+        const maxCrawlDepth = this.maxCrawlDepth;
+        ow(requests, ow.object
+            .is((value) => isIterable(value) || isAsyncIterable(value))
+            .message((value) => `Expected an iterable or async iterable, got ${getObjectType(value)}`));
+        async function* filteredRequests() {
+            let yieldedRequestCount = 0;
+            for await (const request of requests) {
+                const url = typeof request === 'string' ? request : request.url;
+                if (requestLimit !== undefined && yieldedRequestCount >= requestLimit) {
+                    skippedBecauseOfLimit.add(url);
+                    continue;
+                }
+                if (maxCrawlDepth !== undefined && request.crawlDepth > maxCrawlDepth) {
+                    skippedBecauseOfMaxCrawlDepth.add(url);
+                    continue;
+                }
+                if (await isAllowedBasedOnRobotsTxtFile(url)) {
+                    yield request;
+                    yieldedRequestCount += 1;
+                }
+                else {
+                    skippedBecauseOfRobots.add(url);
+                }
             }
         }
-        if (skipped.size > 0) {
+        const result = await this.requestManager.addRequestsBatched(filteredRequests(), options);
+        if (skippedBecauseOfRobots.size > 0) {
             this.log.warning(`Some requests were skipped because they were disallowed based on the robots.txt file`, {
-                skipped: [...skipped],
+                skipped: [...skippedBecauseOfRobots],
             });
-            if (this.onSkippedRequest) {
-                await Promise.all([...skipped].map((url) => {
-                    return this.onSkippedRequest({ url, reason: 'robotsTxt' });
-                }));
-            }
         }
-        return requestQueue.addRequestsBatched(allowedRequests, options);
+        if (skippedBecauseOfRobots.size > 0 ||
+            skippedBecauseOfLimit.size > 0 ||
+            skippedBecauseOfMaxCrawlDepth.size > 0) {
+            await Promise.all([...skippedBecauseOfRobots]
+                .map((url) => {
+                return this.handleSkippedRequest({ url, reason: 'robotsTxt' });
+            })
+                .concat([...skippedBecauseOfLimit].map((url) => {
+                return this.handleSkippedRequest({ url, reason: 'limit' });
+            }), [...skippedBecauseOfMaxCrawlDepth].map((url) => {
+                return this.handleSkippedRequest({ url, reason: 'depth' });
+            })));
+        }
+        return result;
     }
     /**
      * Pushes data to the specified {@link Dataset}, or the default crawler {@link Dataset} by calling {@link Dataset.pushData}.
@@ -536,7 +855,7 @@ export class BasicCrawler {
      * Retrieves the specified {@link Dataset}, or the default crawler {@link Dataset}.
      */
     async getDataset(idOrName) {
-        return Dataset.open(idOrName, { config: this.config });
+        return Dataset.open(idOrName, { config: serviceLocator.getConfiguration() });
     }
     /**
      * Retrieves data from the default crawler {@link Dataset} by calling {@link Dataset.getData}.
@@ -563,7 +882,21 @@ export class BasicCrawler {
         const dataset = await this.getDataset();
         const items = await dataset.export(options);
         if (format === 'csv') {
-            const value = stringify([Object.keys(items[0]), ...items.map((item) => Object.values(item))]);
+            let value;
+            if (items.length === 0) {
+                value = '';
+            }
+            else {
+                const keys = options?.collectAllKeys
+                    ? Array.from(new Set(items.flatMap(Object.keys)))
+                    : Object.keys(items[0]);
+                value = stringify([
+                    keys,
+                    ...items.map((item) => {
+                        return keys.map((k) => item[k]);
+                    }),
+                ]);
+            }
             await ensureDir(dirname(path));
             await writeFile(path, value);
             this.log.info(`Export to ${path} finished!`);
@@ -575,32 +908,34 @@ export class BasicCrawler {
         }
         return items;
     }
+    /**
+     * Initializes the crawler.
+     */
     async _init() {
-        if (!this.events.isInitialized()) {
-            await this.events.init();
+        const eventManager = serviceLocator.getEventManager();
+        if (!eventManager.isInitialized()) {
+            await eventManager.init();
             this._closeEvents = true;
         }
         // Initialize AutoscaledPool before awaiting _loadHandledRequestCount(),
         // so that the caller can get a reference to it before awaiting the promise returned from run()
         // (otherwise there would be no way)
-        this.autoscaledPool = new AutoscaledPool(this.autoscaledPoolOptions, this.config);
-        if (this.useSessionPool) {
-            this.sessionPool = await SessionPool.open(this.sessionPoolOptions, this.config);
-            // Assuming there are not more than 20 browsers running at once;
-            this.sessionPool.setMaxListeners(20);
-        }
+        this.autoscaledPool = new AutoscaledPool(this.autoscaledPoolOptions);
+        this.sessionPool.setMaxListeners(20);
+        await this.initializeRequestManager();
         await this._loadHandledRequestCount();
     }
-    async _runRequestHandler(crawlingContext) {
-        await this.requestHandler(crawlingContext);
+    async runRequestHandler(crawlingContext) {
+        await addTimeoutToPromise(async () => this.requestHandler(crawlingContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${crawlingContext.request.id}).`);
     }
     /**
      * Handles blocked request
      */
-    _throwOnBlockedRequest(session, statusCode) {
-        const isBlocked = session.retireOnBlockedStatusCodes(statusCode);
-        if (isBlocked) {
-            throw new Error(`Request blocked - received ${statusCode} status code.`);
+    _throwOnBlockedRequest(statusCode) {
+        if (this.retryOnBlocked)
+            return;
+        if (this.blockedStatusCodes.has(statusCode)) {
+            throw new SessionError(`Request blocked - received ${statusCode} status code.`);
         }
     }
     async isAllowedBasedOnRobotsTxtFile(url) {
@@ -608,7 +943,8 @@ export class BasicCrawler {
             return true;
         }
         const robotsTxtFile = await this.getRobotsTxtFileForUrl(url);
-        return !robotsTxtFile || robotsTxtFile.isAllowed(url);
+        const userAgent = typeof this.respectRobotsTxtFile === 'object' ? this.respectRobotsTxtFile?.userAgent : '*';
+        return !robotsTxtFile || robotsTxtFile.isAllowed(url, userAgent);
     }
     async getRobotsTxtFileForUrl(url) {
         if (!this.respectRobotsTxtFile) {
@@ -662,36 +998,36 @@ export class BasicCrawler {
         await Promise.all([requestListPersistPromise, this.stats.persistState()]);
     }
     /**
-     * Fetches request from either RequestList or RequestQueue. If request comes from a RequestList
-     * and RequestQueue is present then enqueues it to the queue first.
+     * Initializes the RequestManager based on the configured requestList and requestQueue.
      */
-    async _fetchNextRequest() {
-        if (!this.requestList || (await this.requestList.isFinished())) {
-            return this.requestQueue?.fetchNextRequest();
-        }
-        const request = await this.requestList.fetchNextRequest();
-        if (!this.requestQueue)
-            return request;
-        if (!request)
-            return this.requestQueue.fetchNextRequest();
-        try {
-            await this.requestQueue.addRequest(request, { forefront: true });
+    async initializeRequestManager() {
+        if (this.requestManager !== undefined) {
+            return;
+        }
+        if (this.requestList && this.requestQueue) {
+            // Create a RequestManagerTandem if both RequestList and RequestQueue are provided
+            this.requestManager = new RequestManagerTandem(this.requestList, this.requestQueue);
+        }
+        else if (this.requestQueue) {
+            // Use RequestQueue directly if only it is provided
+            this.requestManager = this.requestQueue;
         }
-        catch (err) {
-            // If requestQueue.addRequest() fails here then we must reclaim it back to
-            // the RequestList because probably it's not yet in the queue!
-            this.log.error('Adding of request from the RequestList to the RequestQueue failed, reclaiming request back to the list.', { request });
-            await this.requestList.reclaimRequest(request);
-            return null;
+        else if (this.requestList) {
+            // Use RequestList directly if only it is provided
+            // Make it compatible with the IRequestManager interface
+            this.requestManager = new RequestListAdapter(this.requestList);
         }
-        await this.requestList.markRequestHandled(request);
-        return this.requestQueue.fetchNextRequest();
+        // If neither RequestList nor RequestQueue is provided, leave the requestManager uninitialized until `getRequestQueue` is called
     }
     /**
-     * Executed when `errorHandler` finishes or the request is successful.
-     * Can be used to clean up orphaned browser pages.
+     * Fetches the next request to process from the underlying request provider.
      */
-    async _cleanupContext(_crawlingContext) { }
+    async _fetchNextRequest() {
+        if (this.requestManager === undefined) {
+            throw new Error(`_fetchNextRequest called on an uninitialized crawler`);
+        }
+        return this.requestManager.fetchNextRequest();
+    }
     /**
      * Delays processing of the request based on the `sameDomainDelaySecs` option,
      * adding it back to the queue after the timeout passes. Returns `true` if the request
@@ -724,112 +1060,55 @@ export class BasicCrawler {
         }, delay);
         return true;
     }
-    /**
-     * Wrapper around requestHandler that fetches requests from RequestList/RequestQueue
-     * then retries them in a case of an error, etc.
-     */
-    async _runTaskFunction() {
-        const source = this.requestQueue || this.requestList || (await this.getRequestQueue());
-        let request;
-        let session;
-        await this._timeoutAndRetry(async () => {
-            request = await this._fetchNextRequest();
-        }, this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
-        tryCancel();
-        if (this.useSessionPool) {
-            await this._timeoutAndRetry(async () => {
-                session = await this.sessionPool.getSession();
-            }, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
-        }
-        tryCancel();
-        if (!request || this.delayRequest(request, source)) {
-            return;
-        }
-        if (!(await this.isAllowedBasedOnRobotsTxtFile(request.url))) {
-            this.log.warning(`Skipping request ${request.url} (${request.id}) because it is disallowed based on robots.txt`);
-            request.state = RequestState.SKIPPED;
-            request.noRetry = true;
-            await source.markRequestHandled(request);
-            await this.onSkippedRequest?.({
-                url: request.url,
-                reason: 'robotsTxt',
-            });
-            return;
-        }
-        // Reset loadedUrl so an old one is not carried over to retries.
-        request.loadedUrl = undefined;
+    /** Handles a single request - runs the request handler with retries, error handling, and lifecycle management. */
+    async handleRequest(crawlingContext, requestSource) {
+        const { request } = crawlingContext;
         const statisticsId = request.id || request.uniqueKey;
         this.stats.startJob(statisticsId);
-        // Shared crawling context
-        // @ts-expect-error
-        // All missing properties (that extend CrawlingContext) are set dynamically,
-        // but TS does not know that, so otherwise it would throw when compiling.
-        const crawlingContext = {
-            id: cryptoRandomObjectId(10),
-            crawler: this,
-            log: this.log,
-            request,
-            session,
-            enqueueLinks: async (options) => {
-                return enqueueLinks({
-                    // specify the RQ first to allow overriding it
-                    requestQueue: await this.getRequestQueue(),
-                    robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
-                    onSkippedRequest: this.onSkippedRequest,
-                    ...options,
-                });
-            },
-            addRequests: this.addRequests.bind(this),
-            pushData: this.pushData.bind(this),
-            useState: this.useState.bind(this),
-            sendRequest: createSendRequest(this.httpClient, request, session, () => crawlingContext.proxyInfo?.url),
-            getKeyValueStore: async (idOrName) => KeyValueStore.open(idOrName, { config: this.config }),
-        };
-        this.crawlingContexts.set(crawlingContext.id, crawlingContext);
         let isRequestLocked = true;
         try {
             request.state = RequestState.REQUEST_HANDLER;
-            await addTimeoutToPromise(async () => this._runRequestHandler(crawlingContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${request.id}).`);
-            await this._timeoutAndRetry(async () => source.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
+            await this.runRequestHandler(crawlingContext);
+            await this._timeoutAndRetry(async () => requestSource.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
             isRequestLocked = false; // markRequestHandled succeeded and unlocked the request
             this.stats.finishJob(statisticsId, request.retryCount);
             this.handledRequestsCount++;
             // reclaim session if request finishes successfully
             request.state = RequestState.DONE;
-            crawlingContext.session?.markGood();
+            crawlingContext.session.markGood();
         }
-        catch (err) {
+        catch (rawError) {
+            const err = this.unwrapError(rawError);
             try {
                 request.state = RequestState.ERROR_HANDLER;
-                await addTimeoutToPromise(async () => this._requestFunctionErrorHandler(err, crawlingContext, source), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
+                await addTimeoutToPromise(async () => this._requestFunctionErrorHandler(err, crawlingContext, requestSource), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
                 if (!(err instanceof CriticalError)) {
                     isRequestLocked = false; // _requestFunctionErrorHandler calls either markRequestHandled or reclaimRequest
                 }
                 request.state = RequestState.DONE;
             }
             catch (secondaryError) {
-                if (!secondaryError.triggeredFromUserHandler &&
+                const unwrappedSecondaryError = this.unwrapError(secondaryError);
+                if (!unwrappedSecondaryError.triggeredFromUserHandler &&
                     // avoid reprinting the same critical error multiple times, as it will be printed by Nodejs at the end anyway
-                    !(secondaryError instanceof CriticalError)) {
+                    !(unwrappedSecondaryError instanceof CriticalError)) {
                     const apifySpecific = process.env.APIFY_IS_AT_HOME
                         ? `This may have happened due to an internal error of Apify's API or due to a misconfigured crawler.`
                         : '';
-                    this.log.exception(secondaryError, 'An exception occurred during handling of failed request. ' +
+                    this.log.exception(unwrappedSecondaryError, 'An exception occurred during handling of failed request. ' +
                         `This places the crawler and its underlying storages into an unknown state and crawling will be terminated. ${apifySpecific}`);
                 }
                 request.state = RequestState.ERROR;
-                throw secondaryError;
+                throw unwrappedSecondaryError;
             }
             // decrease the session score if the request fails (but the error handler did not throw)
-            crawlingContext.session?.markBad();
+            crawlingContext.session.markBad();
         }
         finally {
-            await this._cleanupContext(crawlingContext);
-            this.crawlingContexts.delete(crawlingContext.id);
             // Safety net - release the lock if nobody managed to do it before
-            if (isRequestLocked && source instanceof RequestProvider) {
+            if (isRequestLocked && requestSource instanceof RequestProvider) {
                 try {
-                    await source.client.deleteRequestLock(request.id);
+                    await requestSource.client.deleteRequestLock(request.id);
                 }
                 catch {
                     // We don't have the lock, or the request was never locked. Either way it's fine
@@ -838,19 +1117,75 @@ export class BasicCrawler {
         }
     }
     /**
-     * Run async callback with given timeout and retry.
+     * Wrapper around the crawling context's `enqueueLinks` method:
+     * - Injects `crawlDepth` to each request being added based on the crawling context request.
+     * - Provides defaults for the `enqueueLinks` options based on the crawler configuration.
+     *      - These options can be overridden by the user.
+     *
+     * Because `...options` is spread after the defaults, any `requestQueue`, `robotsTxtFile`,
+     * `onSkippedRequest` or `limit` key present in `options` replaces the value computed here.
+     * `transformRequestFunction` is assigned after the spread, so the depth-injecting wrapper is
+     * always the one passed on, and the user's transform (if any) is only called from inside it.
+     * NOTE(review): an explicit `options.limit` therefore also replaces the result of
+     * `calculateEnqueuedRequestLimit(options.limit)` - confirm that this is intended.
+     *
+     * @param options User-supplied `enqueueLinks` options; `limit` and `transformRequestFunction` are read here.
+     * @param request The request being handled; its `crawlDepth` and `url` are read.
+     * @param requestQueue Passed to `enqueueLinks` as the default `requestQueue`.
+     * @returns Whatever the `enqueueLinks` call resolves to.
+     * @internal
+     */
+    async enqueueLinksWithCrawlDepth(options, request, requestQueue) {
+        const transformRequestFunctionWrapper = (requestOptions) => {
+            requestOptions.crawlDepth = request.crawlDepth + 1; // new requests sit one level below the request being handled
+            if (this.maxCrawlDepth !== undefined && requestOptions.crawlDepth > this.maxCrawlDepth) {
+                // Setting `skippedReason` before returning `false` ensures that `reportSkippedRequests`
+                // reports `'depth'` as the reason (via `request.skippedReason ?? reason` fallback),
+                // rather than the generic `'transform'` reason.
+                requestOptions.skippedReason = 'depth';
+                return false;
+            }
+            // After injecting the crawlDepth, we call the user-provided transform function, if there is one. A nullish result (or no transform at all) falls back to `requestOptions`.
+            return options.transformRequestFunction?.(requestOptions) ?? requestOptions;
+        };
+        // Create a request-scoped callback that logs enqueueLimit once per request handler call
+        // Only log if an explicit limit was passed to enqueueLinks (not the internal maxRequestsPerCrawl-derived limit)
+        let loggedEnqueueLimitForThisRequest = false;
+        const onSkippedRequest = async (skippedOptions) => {
+            if (skippedOptions.reason === 'enqueueLimit') {
+                if (!loggedEnqueueLimitForThisRequest && options.limit !== undefined) {
+                    this.log.info(`Skipping URLs in the handler for ${request.url} due to the enqueueLinks limit of ${options.limit}.`);
+                    loggedEnqueueLimitForThisRequest = true;
+                }
+            }
+            await this.handleSkippedRequest(skippedOptions); // every skipped request is forwarded, whatever its reason
+        };
+        return await enqueueLinks({
+            requestQueue,
+            robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
+            onSkippedRequest,
+            limit: this.calculateEnqueuedRequestLimit(options.limit),
+            // Allow user options to override defaults set above ⤴
+            ...options,
+            transformRequestFunction: transformRequestFunctionWrapper, // assigned after the spread so the wrapper cannot be overridden
+        });
+    }
+    /**
+     * Generator function that yields requests injected with the given crawl depth.
+     *
+     * String items are yielded as a new `{ url, crawlDepth }` object. Any other item is yielded as the
+     * same object it came in as: its `crawlDepth` is assigned in place, and only when it is currently
+     * `null` or `undefined`, so a depth already set on the item is kept.
+     *
+     * @param requests Source of URLs (strings) or request-like objects; consumed with `for await`.
+     * @param newRequestDepth Depth to use for items that do not carry one yet.
+     * @internal
+     */
+    async *addCrawlDepthRequestGenerator(requests, newRequestDepth) {
+        for await (const request of requests) {
+            if (typeof request === 'string') {
+                yield { url: request, crawlDepth: newRequestDepth };
+            }
+            else {
+                request.crawlDepth ??= newRequestDepth; // mutates the caller's object; an existing depth wins
+                yield request;
+            }
+        }
+    }
+    /**
+     * Run async callback with given timeout and retry. Returns the result of the callback.
      * @ignore
      */
     async _timeoutAndRetry(handler, timeout, error, maxRetries = 3, retried = 1) {
         try {
-            await addTimeoutToPromise(handler, timeout, error);
+            return await addTimeoutToPromise(handler, timeout, error);
         }
         catch (e) {
             if (retried <= maxRetries) {
                 // we retry on any error, not just timeout
                 this.log.warning(`${e.message} (retrying ${retried}/${maxRetries})`);
-                void this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1);
-                return;
+                return this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1);
             }
             throw e;
         }
@@ -859,30 +1194,31 @@ export class BasicCrawler {
      * Returns true if either RequestList or RequestQueue have a request ready for processing.
      */
     async _isTaskReadyFunction() {
-        // First check RequestList, since it's only in memory.
-        const isRequestListEmpty = this.requestList ? await this.requestList.isEmpty() : true;
-        // If RequestList is not empty, task is ready, no reason to check RequestQueue.
-        if (!isRequestListEmpty)
-            return true;
-        // If RequestQueue is not empty, task is ready, return true, otherwise false.
-        return this.requestQueue ? !(await this.requestQueue.isEmpty()) : false;
+        return this.requestManager !== undefined && !(await this.requestManager.isEmpty());
     }
     /**
      * Returns true if both RequestList and RequestQueue have all requests finished.
      */
     async _defaultIsFinishedFunction() {
-        const [isRequestListFinished, isRequestQueueFinished] = await Promise.all([
-            this.requestList ? this.requestList.isFinished() : true,
-            this.requestQueue ? this.requestQueue.isFinished() : true,
-        ]);
-        // If both are finished, return true, otherwise return false.
-        return isRequestListFinished && isRequestQueueFinished;
+        return !this.requestManager || (await this.requestManager.isFinished());
     }
     async _rotateSession(crawlingContext) {
         const { request } = crawlingContext;
         request.sessionRotationCount ??= 0;
         request.sessionRotationCount++;
-        crawlingContext.session?.retire();
+        crawlingContext.session.retire();
+    }
+    /**
+     * Unwraps errors thrown by the context pipeline to get the actual user error.
+     * RequestHandlerError, ContextPipelineInitializationError and ContextPipelineCleanupError wrap the
+     * actual error in their `cause`; nested wrappers are unwrapped recursively until a value that is
+     * not one of these three classes is reached.
+     * NOTE(review): a wrapper whose `cause` is undefined unwraps to `undefined` - confirm that these
+     * wrappers are always constructed with a cause.
+     *
+     * @param error The value to unwrap.
+     * @returns `error` itself when it is not one of the wrapper classes, otherwise the unwrapped `cause`.
+     */
+    unwrapError(error) {
+        if (error instanceof RequestHandlerError ||
+            error instanceof ContextPipelineInitializationError ||
+            error instanceof ContextPipelineCleanupError) {
+            return this.unwrapError(error.cause);
+        }
+        return error;
     }
     /**
      * Handles errors thrown by user provided requestHandler()
@@ -896,12 +1232,15 @@ export class BasicCrawler {
         const shouldRetryRequest = this._canRequestBeRetried(request, error);
         if (shouldRetryRequest) {
             await this.stats.errorTrackerRetry.addAsync(error, crawlingContext);
-            await this.errorHandler?.(crawlingContext, error);
+            await this.errorHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
+            error);
             if (error instanceof SessionError) {
                 await this._rotateSession(crawlingContext);
             }
             if (!request.noRetry) {
-                request.retryCount++;
+                if (!(error instanceof SessionError)) {
+                    request.retryCount++;
+                }
                 const { url, retryCount, id } = request;
                 // We don't want to see the stack trace in the logs by default, when we are going to retry the request.
                 // Thus, we print the full stack trace only when CRAWLEE_VERBOSE_LOG environment variable is set to true.
@@ -915,6 +1254,9 @@ export class BasicCrawler {
                 return;
             }
         }
+        if (error instanceof SessionError) {
+            crawlingContext.session?.retire();
+        }
         // If the request is non-retryable, the error and snapshot aren't saved in the errorTrackerRetry object.
         // Therefore, we pass the crawlingContext to the errorTracker.add method, enabling snapshot capture.
         // This is to make sure the error snapshot is not duplicated in the errorTrackerRetry and errorTracker objects.
@@ -948,7 +1290,8 @@ export class BasicCrawler {
         const message = this._getMessageFromError(error, true);
         this.log.error(`Request failed and reached maximum retries. ${message}`, { id, url, method, uniqueKey });
         if (this.failedRequestHandler) {
-            await this.failedRequestHandler?.(crawlingContext, error);
+            await this.failedRequestHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
+            error);
         }
     }
     /**
@@ -986,19 +1329,11 @@ export class BasicCrawler {
         return request.retryCount < maxRequestRetries;
     }
     /**
-     * Updates handledRequestsCount from possibly stored counts,
-     * usually after worker migration. Since one of the stores
-     * needs to have priority when both are present,
-     * it is the request queue, because generally, the request
-     * list will first be dumped into the queue and then left
-     * empty.
+     * Updates handledRequestsCount from possibly stored counts, usually after worker migration.
      */
     async _loadHandledRequestCount() {
-        if (this.requestQueue) {
-            this.handledRequestsCount = await this.requestQueue.handledCount();
-        }
-        else if (this.requestList) {
-            this.handledRequestsCount = this.requestList.handledCount();
+        if (this.requestManager) {
+            this.handledRequestsCount = await this.requestManager.handledCount();
         }
     }
     async _executeHooks(hooks, ...args) {
@@ -1009,16 +1344,17 @@ export class BasicCrawler {
         }
     }
     /**
-     * Function for cleaning up after all request are processed.
-     * @ignore
+     * Stops the crawler immediately.
+     *
+     * This method doesn't wait for currently active requests to finish.
+     *
+     * To stop the crawler gracefully (waiting for all running requests to finish), use {@link BasicCrawler.stop|`crawler.stop()`} instead.
      */
     async teardown() {
-        this.events.emit("persistState" /* EventType.PERSIST_STATE */, { isMigrating: false });
-        if (this.useSessionPool) {
-            await this.sessionPool.teardown();
-        }
+        serviceLocator.getEventManager().emit("persistState" /* EventType.PERSIST_STATE */, { isMigrating: false });
+        await this.sessionPool?.teardown();
         if (this._closeEvents) {
-            await this.events.close();
+            await serviceLocator.getEventManager().close();
         }
         await this.autoscaledPool?.abort();
     }
@@ -1036,9 +1372,9 @@ export class BasicCrawler {
                 this.log.info('Using the old RequestQueue implementation without request locking.');
                 this._experimentWarnings.requestLocking = true;
             }
-            return RequestQueueV1.open(null, { config: this.config });
+            return RequestQueueV1.open(null, { config: serviceLocator.getConfiguration() });
         }
-        return RequestQueue.open(null, { config: this.config });
+        return RequestQueue.open(null, { config: serviceLocator.getConfiguration() });
     }
     requestMatchesEnqueueStrategy(request) {
         const { url, loadedUrl } = request;