npm - @crawlee/basic - Versions diffs - 4.0.0-beta.5 → 4.0.0-beta.50 - Mend

@crawlee/basic 4.0.0-beta.5 → 4.0.0-beta.50

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

package/README.md +9 -5
package/index.d.ts +1 -1
package/index.d.ts.map +1 -1
package/index.js +0 -1
package/index.js.map +1 -1
package/internals/basic-crawler.d.ts +282 -107
package/internals/basic-crawler.d.ts.map +1 -1
package/internals/basic-crawler.js +705 -340
package/internals/basic-crawler.js.map +1 -1
package/internals/send-request.d.ts +3 -5
package/internals/send-request.d.ts.map +1 -1
package/internals/send-request.js +21 -25
package/internals/send-request.js.map +1 -1
package/package.json +7 -8
package/internals/constants.d.ts +0 -7
package/internals/constants.d.ts.map +0 -1
package/internals/constants.js +0 -7
package/internals/constants.js.map +0 -1
package/tsconfig.build.tsbuildinfo +0 -1

package/internals/basic-crawler.d.ts CHANGED Viewed

@@ -1,38 +1,13 @@
-import type { AddRequestsBatchedOptions, AddRequestsBatchedResult, AutoscaledPoolOptions, BaseHttpClient, CrawlingContext, DatasetExportOptions, EnqueueLinksOptions, EventManager, FinalStatistics, GetUserDataFromRequest, IRequestList, LoadedContext, ProxyInfo, Request, RequestOptions, RestrictedCrawlingContext, RouterHandler, RouterRoutes, Session, SessionPoolOptions, SkippedRequestCallback, Source, StatisticsOptions, StatisticState } from '@crawlee/core';
-import { AutoscaledPool, Configuration, Dataset, RequestProvider, SessionPool, Statistics } from '@crawlee/core';
-import type { Awaitable, BatchAddRequestsResult, Dictionary, SetStatusMessageOptions } from '@crawlee/types';
+import type { AddRequestsBatchedOptions, AddRequestsBatchedResult, AutoscaledPoolOptions, Configuration, CrawleeLogger, CrawlingContext, DatasetExportOptions, EnqueueLinksOptions, EventManager, FinalStatistics, GetUserDataFromRequest, IRequestList, IRequestManager, ProxyConfiguration, Request, RequestsLike, RouterHandler, RouterRoutes, Session, SkippedRequestCallback, Source, StatisticsOptions, StatisticState, StorageIdentifier } from '@crawlee/core';
+import { AutoscaledPool, ContextPipeline, Dataset, RequestProvider, SessionPool, Statistics } from '@crawlee/core';
+import type { Awaitable, BaseHttpClient, BatchAddRequestsResult, Dictionary, ProxyInfo, SetStatusMessageOptions, StorageClient } from '@crawlee/types';
 import { RobotsTxtFile } from '@crawlee/utils';
-import type { SetRequired } from 'type-fest';
-import type { Log } from '@apify/log';
+import type { ReadonlyDeep, SetRequired } from 'type-fest';
 import { TimeoutError } from '@apify/timeout';
-export interface BasicCrawlingContext<UserData extends Dictionary = Dictionary> extends CrawlingContext<BasicCrawler, UserData> {
-    /**
-     * This function automatically finds and enqueues links from the current page, adding them to the {@link RequestQueue}
-     * currently used by the crawler.
-     *
-     * Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions
-     * and override settings of the enqueued {@link Request} objects.
-     *
-     * Check out the [Crawl a website with relative links](https://crawlee.dev/js/docs/examples/crawl-relative-links) example
-     * for more details regarding its usage.
-     *
-     * **Example usage**
-     *
-     * ```ts
-     * async requestHandler({ enqueueLinks }) {
-     *     await enqueueLinks({
-     *       urls: [...],
-     *     });
-     * },
-     * ```
-     *
-     * @param [options] All `enqueueLinks()` parameters are passed via an options object.
-     * @returns Promise that resolves to {@link BatchAddRequestsResult} object.
-     */
-    enqueueLinks(options?: SetRequired<EnqueueLinksOptions, 'urls'>): Promise<BatchAddRequestsResult>;
+export interface BasicCrawlingContext<UserData extends Dictionary = Dictionary> extends CrawlingContext<UserData> {
 }
-export type RequestHandler<Context extends CrawlingContext = LoadedContext<BasicCrawlingContext & RestrictedCrawlingContext>> = (inputs: LoadedContext<Context>) => Awaitable<void>;
-export type ErrorHandler<Context extends CrawlingContext = LoadedContext<BasicCrawlingContext & RestrictedCrawlingContext>> = (inputs: LoadedContext<Context>, error: Error) => Awaitable<void>;
+export type RequestHandler<Context extends CrawlingContext = CrawlingContext> = (inputs: Context) => Awaitable<void>;
+export type ErrorHandler<Context extends CrawlingContext = CrawlingContext, ExtendedContext extends Context = Context> = (inputs: Context & Partial<ExtendedContext>, error: Error) => Awaitable<void>;
 export interface StatusMessageCallbackParams<Context extends CrawlingContext = BasicCrawlingContext, Crawler extends BasicCrawler<any> = BasicCrawler<Context>> {
     state: StatisticState;
     crawler: Crawler;
@@ -40,7 +15,10 @@ export interface StatusMessageCallbackParams<Context extends CrawlingContext = B
     message: string;
 }
 export type StatusMessageCallback<Context extends CrawlingContext = BasicCrawlingContext, Crawler extends BasicCrawler<any> = BasicCrawler<Context>> = (params: StatusMessageCallbackParams<Context, Crawler>) => Awaitable<void>;
-export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCrawlingContext> {
+export type RequireContextPipeline<DefaultContextType extends CrawlingContext, FinalContextType extends DefaultContextType> = DefaultContextType extends FinalContextType ? {} : {
+    contextPipelineBuilder: () => ContextPipeline<CrawlingContext, FinalContextType>;
+};
+export interface BasicCrawlerOptions<Context extends CrawlingContext = CrawlingContext, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension> {
     /**
      * User-provided function that performs the logic of the crawler. It is called for each URL to crawl.
      *
@@ -58,7 +36,35 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      * The exceptions are logged to the request using the
      * {@link Request.pushErrorMessage|`Request.pushErrorMessage()`} function.
      */
-    requestHandler?: RequestHandler<Context>;
+    requestHandler?: RequestHandler<ExtendedContext>;
+    /**
+     * Allows the user to extend the crawling context passed to the request handler with custom functionality.
+     *
+     * **Example usage:**
+     *
+     * ```javascript
+     * import { BasicCrawler } from 'crawlee';
+     *
+     * // Create a crawler instance
+     * const crawler = new BasicCrawler({
+     *     extendContext: (context) => ({
+     *         async customHelper() {
+     *             await context.pushData({ url: context.request.url })
+     *         }
+     *     }),
+     *     async requestHandler(context) {
+     *         await context.customHelper();
+     *     },
+     * });
+     * ```
+     */
+    extendContext?: (context: Context) => Awaitable<ContextExtension>;
+    /**
+     * *Intended for BasicCrawler subclasses*. Prepares a context pipeline that transforms the initial crawling context into the shape given by the `Context` type parameter.
+     *
+     * The option is not required if your crawler subclass does not extend the crawling context with custom information or helpers.
+     */
+    contextPipelineBuilder?: () => ContextPipeline<CrawlingContext, Context>;
     /**
      * Static list of URLs to be processed.
      * If not provided, the crawler will open the default request queue when the {@link BasicCrawler.addRequests|`crawler.addRequests()`} function is called.
@@ -73,6 +79,13 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      * it is a shortcut for running `crawler.addRequests()` before the `crawler.run()`.
      */
     requestQueue?: RequestProvider;
+    /**
+     * Allows explicitly configuring a request manager. Mutually exclusive with the `requestQueue` and `requestList` options.
+     *
+     * This enables explicitly configuring the crawler to use `RequestManagerTandem`, for instance.
+     * If using this, the type of `BasicCrawler.requestQueue` may not be fully compatible with the `RequestProvider` class.
+     */
+    requestManager?: IRequestManager;
     /**
      * Timeout in which the function passed as {@link BasicCrawlerOptions.requestHandler|`requestHandler`} needs to finish, in seconds.
      * @default 60
@@ -87,7 +100,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      * Second argument is the `Error` instance that
      * represents the last error thrown during processing of the request.
      */
-    errorHandler?: ErrorHandler<Context>;
+    errorHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
     /**
      * A function to handle requests that failed more than {@link BasicCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
      *
@@ -96,7 +109,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      * Second argument is the `Error` instance that
      * represents the last error thrown during processing of the request.
      */
-    failedRequestHandler?: ErrorHandler<Context>;
+    failedRequestHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
     /**
      * Specifies the maximum number of retries allowed for a request if its processing fails.
      * This includes retries due to navigation errors or errors thrown from user-supplied functions
@@ -126,12 +139,18 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      * > *NOTE:* In cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.
      */
     maxRequestsPerCrawl?: number;
+    /**
+     * Maximum depth of the crawl. If not set, the crawl will continue until all requests are processed.
+     * Setting this to `0` will only process the initial requests, skipping all links enqueued by `crawlingContext.enqueueLinks` and `crawlingContext.addRequests`.
+     * Passing `1` will process the initial requests and all links enqueued by `crawlingContext.enqueueLinks` and `crawlingContext.addRequests` in the handler for initial requests.
+     */
+    maxCrawlDepth?: number;
     /**
      * Custom options passed to the underlying {@link AutoscaledPool} constructor.
      * > *NOTE:* The {@link AutoscaledPoolOptions.runTaskFunction|`runTaskFunction`}
-     * and {@link AutoscaledPoolOptions.isTaskReadyFunction|`isTaskReadyFunction`} options
-     * are provided by the crawler and cannot be overridden.
-     * However, we can provide a custom implementation of {@link AutoscaledPoolOptions.isFinishedFunction|`isFinishedFunction`}.
+     * option is provided by the crawler and cannot be overridden.
+     * However, we can provide custom implementations of {@link AutoscaledPoolOptions.isFinishedFunction|`isFinishedFunction`}
+     * and {@link AutoscaledPoolOptions.isTaskReadyFunction|`isTaskReadyFunction`}.
      */
     autoscaledPoolOptions?: AutoscaledPoolOptions;
     /**
@@ -159,14 +178,11 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      */
     keepAlive?: boolean;
     /**
-     * Basic crawler will initialize the {@link SessionPool} with the corresponding {@link SessionPoolOptions|`sessionPoolOptions`}.
-     * The session instance will be than available in the {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
-     */
-    useSessionPool?: boolean;
-    /**
-     * The configuration options for {@link SessionPool} to use.
+     * An existing {@link SessionPool} instance to use. When provided, the crawler will use this
+     * pool directly instead of creating a new one, enabling session sharing across multiple crawlers.
+     * The crawler will not tear down a shared pool — the caller is responsible for its lifecycle.
      */
-    sessionPoolOptions?: SessionPoolOptions;
+    sessionPool?: SessionPool;
     /**
      * Defines the length of the interval for calling the `setStatusMessage` in seconds.
      */
@@ -188,6 +204,11 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      * ```
      */
     statusMessageCallback?: StatusMessageCallback;
+    /**
+     * HTTP status codes that indicate the session should be retired.
+     * @default [401, 403, 429]
+     */
+    blockedStatusCodes?: number[];
     /**
      * If set to `true`, the crawler will automatically try to bypass any detected bot protection.
      *
@@ -199,15 +220,22 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
     /**
      * If set to `true`, the crawler will automatically try to fetch the robots.txt file for each domain,
      * and skip those that are not allowed. This also prevents disallowed URLs from being added via `enqueueLinks`.
+     *
+     * If an object is provided, it may contain a `userAgent` property to specify which user-agent
+     * should be used when checking the robots.txt file. If not provided, the default user-agent `*` will be used.
      */
-    respectRobotsTxtFile?: boolean;
+    respectRobotsTxtFile?: boolean | {
+        userAgent?: string;
+    };
     /**
      * When a request is skipped for some reason, you can use this callback to act on it.
-     * This is currently fired only for requests skipped based on robots.txt file.
+     * This is currently fired for requests skipped
+     * 1. based on robots.txt file,
+     * 2. because they don't match enqueueLinks filters,
+     * 3. because they are redirected to a URL that doesn't match the enqueueLinks strategy,
+     * 4. or because the {@link BasicCrawlerOptions.maxRequestsPerCrawl|`maxRequestsPerCrawl`} limit has been reached
      */
     onSkippedRequest?: SkippedRequestCallback;
-    /** @internal */
-    log?: Log;
     /**
      * Enables experimental features of Crawlee, which can alter the behavior of the crawler.
      * WARNING: these options are not guaranteed to be stable and may change or be removed at any time.
@@ -223,6 +251,53 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      * Defaults to a new instance of {@link GotScrapingHttpClient}
      */
     httpClient?: BaseHttpClient;
+    /**
+     * If set, the crawler will be configured for all connections to use
+     * the Proxy URLs provided and rotated according to the configuration.
+     */
+    proxyConfiguration?: ProxyConfiguration;
+    /**
+     * Custom configuration to use for this crawler.
+     * If provided, the crawler will use its own ServiceLocator instance instead of the global one.
+     */
+    configuration?: Configuration;
+    /**
+     * Custom storage client to use for this crawler.
+     * If provided, the crawler will use its own ServiceLocator instance instead of the global one.
+     */
+    storageClient?: StorageClient;
+    /**
+     * Custom event manager to use for this crawler.
+     * If provided, the crawler will use its own ServiceLocator instance instead of the global one.
+     */
+    eventManager?: EventManager;
+    /**
+     * Custom logger to use for this crawler.
+     * If provided, the crawler will use its own ServiceLocator instance instead of the global one.
+     */
+    logger?: CrawleeLogger;
+    /**
+     * A unique identifier for the crawler instance. This ID is used to isolate the state returned by
+     * {@link BasicCrawler.useState|`crawler.useState()`} from other crawler instances.
+     *
+     * When multiple crawler instances use `useState()` without an explicit `id`, they will share the same
+     * state object for backward compatibility. A warning will be logged in this case.
+     *
+     * To ensure each crawler has its own isolated state that also persists across script restarts
+     * (e.g., during Apify migrations), provide a stable, unique ID for each crawler instance.
+     *
+     */
+    id?: string;
+    /**
+     * An array of HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be excluded from error consideration.
+     * By default, status codes >= 500 trigger errors.
+     */
+    ignoreHttpErrorStatusCodes?: number[];
+    /**
+     * An array of additional HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be treated as errors.
+     * By default, status codes >= 500 trigger errors.
+     */
+    additionalHttpErrorStatusCodes?: number[];
 }
 /**
  * A set of options that you can toggle to enable experimental features in Crawlee.
@@ -303,9 +378,14 @@ export interface CrawlerExperiments {
  * ```
  * @category Crawlers
  */
-export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext> {
-    readonly config: Configuration;
+export declare class BasicCrawler<Context extends CrawlingContext = CrawlingContext, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension> {
+    #private;
     protected static readonly CRAWLEE_STATE_KEY = "CRAWLEE_STATE";
+    /**
+     * Tracks crawler instances that accessed shared state without having an explicit id.
+     * Used to detect and warn about multiple crawlers sharing the same state.
+     */
+    private static useStateCrawlerIds;
     /**
      * A reference to the underlying {@link Statistics} class that collects and logs run statistics for requests.
      */
@@ -321,11 +401,18 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      * Only available if used by the crawler.
      */
     requestQueue?: RequestProvider;
+    /**
+     * The main request-handling component of the crawler. It's initialized during the crawler startup.
+     */
+    protected requestManager?: IRequestManager;
     /**
      * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session|sessions}.
-     * Only available if used by the crawler.
      */
-    sessionPool?: SessionPool;
+    sessionPool: SessionPool;
+    /**
+     * Indicates whether the crawler owns the session pool (it was not passed from the outside using the `sessionPool` constructor option).
+     */
+    private ownsSessionPool;
     /**
      * A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
      * > *NOTE:* This property is only initialized after calling the {@link BasicCrawler.run|`crawler.run()`} function.
@@ -334,40 +421,71 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      * or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
      */
     autoscaledPool?: AutoscaledPool;
+    /**
+     * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
+     * Only available if used by the crawler.
+     */
+    proxyConfiguration?: ProxyConfiguration;
     /**
      * Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
      * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
      */
-    readonly router: RouterHandler<LoadedContext<Context>>;
+    readonly router: RouterHandler<Context>;
+    private _basicContextPipeline?;
+    /**
+     * The basic part of the context pipeline. Unlike the subclass pipeline, this
+     * part has no major side effects (e.g. launching a browser). It also makes typing more explicit, as subclass
+     * pipelines expect the basic crawler fields to already be present in the context at runtime.
+     *
+     * Context built with this pipeline can be passed into multiple crawler pipelines at once.
+     * This is used e.g. in the {@link AdaptivePlaywrightCrawler|`AdaptivePlaywrightCrawler`}.
+     */
+    get basicContextPipeline(): ContextPipeline<{
+        request: Request;
+    }, CrawlingContext>;
+    private _contextPipeline?;
+    get contextPipeline(): ContextPipeline<CrawlingContext, ExtendedContext>;
     running: boolean;
     hasFinishedBefore: boolean;
-    readonly log: Log;
-    protected requestHandler: RequestHandler<Context>;
-    protected errorHandler?: ErrorHandler<Context>;
-    protected failedRequestHandler?: ErrorHandler<Context>;
+    protected unexpectedStop: boolean;
+    get log(): CrawleeLogger;
+    protected requestHandler: RequestHandler<ExtendedContext>;
+    protected errorHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
+    protected failedRequestHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
     protected requestHandlerTimeoutMillis: number;
     protected internalTimeoutMillis: number;
     protected maxRequestRetries: number;
+    protected maxCrawlDepth?: number;
     protected sameDomainDelayMillis: number;
     protected domainAccessedTime: Map<string, number>;
     protected maxSessionRotations: number;
+    protected maxRequestsPerCrawl?: number;
     protected handledRequestsCount: number;
     protected statusMessageLoggingInterval: number;
     protected statusMessageCallback?: StatusMessageCallback;
-    protected sessionPoolOptions: SessionPoolOptions;
-    protected useSessionPool: boolean;
-    protected crawlingContexts: Map<string, Context>;
+    protected blockedStatusCodes: Set<number>;
+    protected additionalHttpErrorStatusCodes: Set<number>;
+    protected ignoreHttpErrorStatusCodes: Set<number>;
     protected autoscaledPoolOptions: AutoscaledPoolOptions;
-    protected events: EventManager;
     protected httpClient: BaseHttpClient;
     protected retryOnBlocked: boolean;
-    protected respectRobotsTxtFile: boolean;
+    protected respectRobotsTxtFile: boolean | {
+        userAgent?: string;
+    };
     protected onSkippedRequest?: SkippedRequestCallback;
     private _closeEvents?;
+    private loggedPerRun;
     private experiments;
     private readonly robotsTxtFileCache;
     private _experimentWarnings;
+    private readonly crawlerId;
+    private readonly hasExplicitId;
+    private readonly contextPipelineOptions;
     protected static optionsShape: {
+// @ts-ignore optional peer dependency or compatibility with es2022
+        contextPipelineBuilder: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
+// @ts-ignore optional peer dependency or compatibility with es2022
+        extendContext: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
 // @ts-ignore optional peer dependency or compatibility with es2022
         requestList: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
 // @ts-ignore optional peer dependency or compatibility with es2022
@@ -388,24 +506,40 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
         maxSessionRotations: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
 // @ts-ignore optional peer dependency or compatibility with es2022
         maxRequestsPerCrawl: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
+// @ts-ignore optional peer dependency or compatibility with es2022
+        maxCrawlDepth: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
 // @ts-ignore optional peer dependency or compatibility with es2022
         autoscaledPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
 // @ts-ignore optional peer dependency or compatibility with es2022
-        sessionPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
+        sessionPool: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
 // @ts-ignore optional peer dependency or compatibility with es2022
-        useSessionPool: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
+        proxyConfiguration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
 // @ts-ignore optional peer dependency or compatibility with es2022
         statusMessageLoggingInterval: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
 // @ts-ignore optional peer dependency or compatibility with es2022
         statusMessageCallback: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
+// @ts-ignore optional peer dependency or compatibility with es2022
+        additionalHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
+// @ts-ignore optional peer dependency or compatibility with es2022
+        ignoreHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
+// @ts-ignore optional peer dependency or compatibility with es2022
+        blockedStatusCodes: import("ow").ArrayPredicate<number>;
 // @ts-ignore optional peer dependency or compatibility with es2022
         retryOnBlocked: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
 // @ts-ignore optional peer dependency or compatibility with es2022
-        respectRobotsTxtFile: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
+        respectRobotsTxtFile: import("ow").AnyPredicate<boolean | object>;
 // @ts-ignore optional peer dependency or compatibility with es2022
         onSkippedRequest: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
 // @ts-ignore optional peer dependency or compatibility with es2022
         httpClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
+// @ts-ignore optional peer dependency or compatibility with es2022
+        configuration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
+// @ts-ignore optional peer dependency or compatibility with es2022
+        storageClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
+// @ts-ignore optional peer dependency or compatibility with es2022
+        eventManager: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
+// @ts-ignore optional peer dependency or compatibility with es2022
+        logger: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
 // @ts-ignore optional peer dependency or compatibility with es2022
         minConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
 // @ts-ignore optional peer dependency or compatibility with es2022
@@ -414,17 +548,42 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
         maxRequestsPerMinute: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
 // @ts-ignore optional peer dependency or compatibility with es2022
         keepAlive: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
-// @ts-ignore optional peer dependency or compatibility with es2022
-        log: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
 // @ts-ignore optional peer dependency or compatibility with es2022
         experiments: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
 // @ts-ignore optional peer dependency or compatibility with es2022
         statisticsOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
+// @ts-ignore optional peer dependency or compatibility with es2022
+        id: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
     };
     /**
      * All `BasicCrawler` parameters are passed via an options object.
      */
-    constructor(options?: BasicCrawlerOptions<Context>, config?: Configuration);
+    constructor(options?: BasicCrawlerOptions<Context, ContextExtension, ExtendedContext> & RequireContextPipeline<CrawlingContext, Context>);
+    /**
+     * Determines if the given HTTP status code is an error status code given
+     * the default behaviour and user-set preferences.
+     * @param status
+     * @returns `true` if the status code is considered an error, `false` otherwise
+     */
+    protected isErrorStatusCode(status: number): boolean;
+    /**
+     * Builds the basic context pipeline that transforms `{ request }` into a full `CrawlingContext`.
+     * This handles base context creation, session resolution, and context helpers.
+     */
+    protected buildBasicContextPipeline(): ContextPipeline<{
+        request: Request;
+    }, CrawlingContext>;
+    private checkRobotsTxt;
+    /**
+     * Builds the subclass-specific context pipeline that transforms a `CrawlingContext` into the crawler's target context type.
+     * Subclasses should override this to add their own pipeline stages.
+     */
+    protected buildContextPipeline(): ContextPipeline<CrawlingContext, CrawlingContext>;
+    private createBaseContext;
+    private resolveRequest;
+    private resolveSession;
+    private createContextHelpers;
+    private buildFinalContextPipeline;
     /**
      * Checks if the given error is a proxy error by comparing its message to a list of known proxy error messages.
      * Used for retrying requests that failed due to proxy errors.
@@ -432,12 +591,6 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      * @param error The error to check.
      */
     protected isProxyError(error: Error): boolean;
-    /**
-     * Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
-     * Returns `false` if the request is not blocked, otherwise returns a string with a description of the block reason.
-     * @param _crawlingContext The crawling context to check.
-     */
-    protected isRequestBlocked(_crawlingContext: Context): Promise<string | false>;
     /**
      * This method is periodically called by the crawler, every `statusMessageLoggingInterval` seconds.
      */
@@ -453,15 +606,21 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      * @param [requests] The requests to add.
      * @param [options] Options for the request queue.
      */
-    run(requests?: (string | Request | RequestOptions)[], options?: CrawlerRunOptions): Promise<FinalStatistics>;
+    run(requests?: RequestsLike, options?: CrawlerRunOptions): Promise<FinalStatistics>;
     /**
      * Gracefully stops the current run of the crawler.
      *
      * All the tasks active at the time of calling this method will be allowed to finish.
+     *
+     * To stop the crawler immediately, use {@link BasicCrawler.teardown|`crawler.teardown()`} instead.
      */
-    stop(message?: string): void;
+    stop(reason?: string): void;
     getRequestQueue(): Promise<RequestProvider>;
     useState<State extends Dictionary = Dictionary>(defaultValue?: State): Promise<State>;
+    protected get pendingRequestCountApproximation(): number;
+    protected calculateEnqueuedRequestLimit(explicitLimit?: number): number | undefined;
+    protected handleSkippedRequest(options: Parameters<SkippedRequestCallback>[0]): Promise<void>;
+    private logOncePerRun;
     /**
      * Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
      * adding the rest in background. You can configure the batch size via `batchSize` option and the sleep time in between
@@ -473,15 +632,15 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      * @param requests The requests to add
      * @param options Options for the request queue
      */
-    addRequests(requests: (string | Source)[], options?: CrawlerAddRequestsOptions): Promise<CrawlerAddRequestsResult>;
+    addRequests(requests: ReadonlyDeep<RequestsLike>, options?: CrawlerAddRequestsOptions): Promise<CrawlerAddRequestsResult>;
     /**
      * Pushes data to the specified {@link Dataset}, or the default crawler {@link Dataset} by calling {@link Dataset.pushData}.
      */
-    pushData(data: Parameters<Dataset['pushData']>[0], datasetIdOrName?: string): Promise<void>;
+    pushData(data: Parameters<Dataset['pushData']>[0], datasetIdentifier?: string | StorageIdentifier): Promise<void>;
     /**
      * Retrieves the specified {@link Dataset}, or the default crawler {@link Dataset}.
      */
-    getDataset(idOrName?: string): Promise<Dataset>;
+    getDataset(identifier?: string | StorageIdentifier): Promise<Dataset>;
     /**
      * Retrieves data from the default crawler {@link Dataset} by calling {@link Dataset.getData}.
      */
@@ -491,41 +650,52 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      * Supported formats are currently 'json' and 'csv', and will be inferred from the `path` automatically.
      */
     exportData<Data>(path: string, format?: 'json' | 'csv', options?: DatasetExportOptions): Promise<Data[]>;
+    /**
+     * Initializes the crawler.
+     */
     protected _init(): Promise<void>;
-    protected _runRequestHandler(crawlingContext: Context): Promise<void>;
+    protected runRequestHandler(crawlingContext: ExtendedContext): Promise<void>;
     /**
      * Handles a blocked request.
      */
-    protected _throwOnBlockedRequest(session: Session, statusCode: number): void;
+    protected _throwOnBlockedRequest(statusCode: number): void;
     private isAllowedBasedOnRobotsTxtFile;
     protected getRobotsTxtFileForUrl(url: string): Promise<RobotsTxtFile | undefined>;
     protected _pauseOnMigration(): Promise<void>;
     /**
-     * Fetches request from either RequestList or RequestQueue. If request comes from a RequestList
-     * and RequestQueue is present then enqueues it to the queue first.
+     * Initializes the RequestManager based on the configured requestList and requestQueue.
      */
-    protected _fetchNextRequest(): Promise<Request<Dictionary> | null | undefined>;
+    private initializeRequestManager;
     /**
-     * Executed when `errorHandler` finishes or the request is successful.
-     * Can be used to clean up orphaned browser pages.
+     * Fetches the next request to process from the underlying request provider.
      */
-    protected _cleanupContext(_crawlingContext: Context): Promise<void>;
+    protected _fetchNextRequest(): Promise<Request<Dictionary> | null>;
     /**
      * Delays processing of the request based on the `sameDomainDelaySecs` option,
      * adding it back to the queue after the timeout passes. Returns `true` if the request
      * should be ignored and will be reclaimed to the queue once ready.
      */
-    protected delayRequest(request: Request, source: IRequestList | RequestProvider): boolean;
+    protected delayRequest(request: Request, source: IRequestList | RequestProvider | IRequestManager): boolean;
+    /** Handles a single request - runs the request handler with retries, error handling, and lifecycle management. */
+    protected handleRequest(crawlingContext: ExtendedContext, requestSource: IRequestManager, request: Request): Promise<void>;
+    /**
+     * Wrapper around the crawling context's `enqueueLinks` method:
+     * - Injects `crawlDepth` into each request being added based on the crawling context request.
+     * - Provides defaults for the `enqueueLinks` options based on the crawler configuration.
+     *      - These options can be overridden by the user.
+     * @internal
+     */
+    protected enqueueLinksWithCrawlDepth(options: SetRequired<EnqueueLinksOptions, 'urls'>, request: Request<Dictionary>, requestQueue: RequestProvider): Promise<BatchAddRequestsResult>;
     /**
-     * Wrapper around requestHandler that fetches requests from RequestList/RequestQueue
-     * then retries them in a case of an error, etc.
+     * Generator function that yields requests injected with the given crawl depth.
+     * @internal
      */
-    protected _runTaskFunction(): Promise<void>;
+    protected addCrawlDepthRequestGenerator(requests: RequestsLike, newRequestDepth: number): AsyncGenerator<Source, void, undefined>;
     /**
-     * Run async callback with given timeout and retry.
+     * Run async callback with given timeout and retry. Returns the result of the callback.
      * @ignore
      */
-    protected _timeoutAndRetry(handler: () => Promise<unknown>, timeout: number, error: Error | string, maxRetries?: number, retried?: number): Promise<void>;
+    protected _timeoutAndRetry<T>(handler: () => Promise<T>, timeout: number, error: Error | string, maxRetries?: number, retried?: number): Promise<T>;
     /**
      * Returns true if either RequestList or RequestQueue has a request ready for processing.
      */
@@ -535,12 +705,19 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
      */
     protected _defaultIsFinishedFunction(): Promise<boolean>;
     private _rotateSession;
+    /**
+     * Unwraps errors thrown by the context pipeline to get the actual user error.
+     * RequestHandlerError and ContextPipelineInitializationError wrap the actual error.
+     */
+    private unwrapError;
     /**
      * Handles errors thrown by the user-provided requestHandler()
+     *
+     * @param request The request object, passed separately to circumvent potential dynamic logic in crawlingContext.request
      */
-    protected _requestFunctionErrorHandler(error: Error, crawlingContext: Context, source: IRequestList | RequestProvider): Promise<void>;
+    protected _requestFunctionErrorHandler(error: Error, crawlingContext: CrawlingContext, request: Request, source: IRequestList | IRequestManager): Promise<void>;
     protected _tagUserHandlerError<T>(cb: () => unknown): Promise<T>;
-    protected _handleFailedRequestHandler(crawlingContext: Context, error: Error): Promise<void>;
+    protected _handleFailedRequestHandler(crawlingContext: CrawlingContext, error: Error): Promise<void>;
     /**
      * Resolves the most verbose error message from a thrown error
      * @param error The error received
@@ -549,27 +726,25 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
     protected _getMessageFromError(error: Error, forceStack?: boolean): string | TimeoutError | undefined;
     protected _canRequestBeRetried(request: Request, error: Error): boolean;
     /**
-     * Updates handledRequestsCount from possibly stored counts,
-     * usually after worker migration. Since one of the stores
-     * needs to have priority when both are present,
-     * it is the request queue, because generally, the request
-     * list will first be dumped into the queue and then left
-     * empty.
+     * Updates handledRequestsCount from possibly stored counts, usually after worker migration.
      */
     protected _loadHandledRequestCount(): Promise<void>;
     protected _executeHooks<HookLike extends (...args: any[]) => Awaitable<void>>(hooks: HookLike[], ...args: Parameters<HookLike>): Promise<void>;
     /**
-     * Function for cleaning up after all request are processed.
-     * @ignore
+     * Stops the crawler immediately.
+     *
+     * This method doesn't wait for currently active requests to finish.
+     *
+     * To stop the crawler gracefully (waiting for all running requests to finish), use {@link BasicCrawler.stop|`crawler.stop()`} instead.
      */
     teardown(): Promise<void>;
     protected _getCookieHeaderFromRequest(request: Request): string;
     private _getRequestQueue;
-    protected requestMatchesEnqueueStrategy(request: Request): boolean;
+    private requestMatchesEnqueueStrategy;
 }
 export interface CreateContextOptions {
     request: Request;
-    session?: Session;
+    session: Session;
     proxyInfo?: ProxyInfo;
 }
 export interface CrawlerAddRequestsOptions extends AddRequestsBatchedOptions {