@crawlee/basic 4.0.0-beta.5 → 4.0.0-beta.50

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,38 +1,13 @@
- import type { AddRequestsBatchedOptions, AddRequestsBatchedResult, AutoscaledPoolOptions, BaseHttpClient, CrawlingContext, DatasetExportOptions, EnqueueLinksOptions, EventManager, FinalStatistics, GetUserDataFromRequest, IRequestList, LoadedContext, ProxyInfo, Request, RequestOptions, RestrictedCrawlingContext, RouterHandler, RouterRoutes, Session, SessionPoolOptions, SkippedRequestCallback, Source, StatisticsOptions, StatisticState } from '@crawlee/core';
- import { AutoscaledPool, Configuration, Dataset, RequestProvider, SessionPool, Statistics } from '@crawlee/core';
- import type { Awaitable, BatchAddRequestsResult, Dictionary, SetStatusMessageOptions } from '@crawlee/types';
+ import type { AddRequestsBatchedOptions, AddRequestsBatchedResult, AutoscaledPoolOptions, Configuration, CrawleeLogger, CrawlingContext, DatasetExportOptions, EnqueueLinksOptions, EventManager, FinalStatistics, GetUserDataFromRequest, IRequestList, IRequestManager, ProxyConfiguration, Request, RequestsLike, RouterHandler, RouterRoutes, Session, SkippedRequestCallback, Source, StatisticsOptions, StatisticState, StorageIdentifier } from '@crawlee/core';
+ import { AutoscaledPool, ContextPipeline, Dataset, RequestProvider, SessionPool, Statistics } from '@crawlee/core';
+ import type { Awaitable, BaseHttpClient, BatchAddRequestsResult, Dictionary, ProxyInfo, SetStatusMessageOptions, StorageClient } from '@crawlee/types';
  import { RobotsTxtFile } from '@crawlee/utils';
- import type { SetRequired } from 'type-fest';
- import type { Log } from '@apify/log';
+ import type { ReadonlyDeep, SetRequired } from 'type-fest';
  import { TimeoutError } from '@apify/timeout';
- export interface BasicCrawlingContext<UserData extends Dictionary = Dictionary> extends CrawlingContext<BasicCrawler, UserData> {
- /**
- * This function automatically finds and enqueues links from the current page, adding them to the {@link RequestQueue}
- * currently used by the crawler.
- *
- * Optionally, the function allows you to filter the target links' URLs using an array of globs or regular expressions
- * and override settings of the enqueued {@link Request} objects.
- *
- * Check out the [Crawl a website with relative links](https://crawlee.dev/js/docs/examples/crawl-relative-links) example
- * for more details regarding its usage.
- *
- * **Example usage**
- *
- * ```ts
- * async requestHandler({ enqueueLinks }) {
- * await enqueueLinks({
- * urls: [...],
- * });
- * },
- * ```
- *
- * @param [options] All `enqueueLinks()` parameters are passed via an options object.
- * @returns Promise that resolves to {@link BatchAddRequestsResult} object.
- */
- enqueueLinks(options?: SetRequired<EnqueueLinksOptions, 'urls'>): Promise<BatchAddRequestsResult>;
+ export interface BasicCrawlingContext<UserData extends Dictionary = Dictionary> extends CrawlingContext<UserData> {
  }
- export type RequestHandler<Context extends CrawlingContext = LoadedContext<BasicCrawlingContext & RestrictedCrawlingContext>> = (inputs: LoadedContext<Context>) => Awaitable<void>;
- export type ErrorHandler<Context extends CrawlingContext = LoadedContext<BasicCrawlingContext & RestrictedCrawlingContext>> = (inputs: LoadedContext<Context>, error: Error) => Awaitable<void>;
+ export type RequestHandler<Context extends CrawlingContext = CrawlingContext> = (inputs: Context) => Awaitable<void>;
+ export type ErrorHandler<Context extends CrawlingContext = CrawlingContext, ExtendedContext extends Context = Context> = (inputs: Context & Partial<ExtendedContext>, error: Error) => Awaitable<void>;
  export interface StatusMessageCallbackParams<Context extends CrawlingContext = BasicCrawlingContext, Crawler extends BasicCrawler<any> = BasicCrawler<Context>> {
  state: StatisticState;
  crawler: Crawler;
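With the `BasicCrawlingContext`-specific `enqueueLinks` override removed, `RequestHandler` and `ErrorHandler` now default to the plain `CrawlingContext`. A minimal sketch of a handler typed against the new signature, assuming `RequestHandler` and `BasicCrawler` remain exported from `@crawlee/basic` as before:

```ts
import { BasicCrawler, type RequestHandler } from '@crawlee/basic';

// Standalone handler typed against the simplified default context.
const requestHandler: RequestHandler = async ({ request, log }) => {
    log.info(`Processing ${request.url}`);
};

const crawler = new BasicCrawler({ requestHandler });
await crawler.run(['https://crawlee.dev']);
```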
@@ -40,7 +15,10 @@ export interface StatusMessageCallbackParams<Context extends CrawlingContext = B
  message: string;
  }
  export type StatusMessageCallback<Context extends CrawlingContext = BasicCrawlingContext, Crawler extends BasicCrawler<any> = BasicCrawler<Context>> = (params: StatusMessageCallbackParams<Context, Crawler>) => Awaitable<void>;
- export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCrawlingContext> {
+ export type RequireContextPipeline<DefaultContextType extends CrawlingContext, FinalContextType extends DefaultContextType> = DefaultContextType extends FinalContextType ? {} : {
+ contextPipelineBuilder: () => ContextPipeline<CrawlingContext, FinalContextType>;
+ };
+ export interface BasicCrawlerOptions<Context extends CrawlingContext = CrawlingContext, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension> {
  /**
  * User-provided function that performs the logic of the crawler. It is called for each URL to crawl.
  *
@@ -58,7 +36,35 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
  * The exceptions are logged to the request using the
  * {@link Request.pushErrorMessage|`Request.pushErrorMessage()`} function.
  */
- requestHandler?: RequestHandler<Context>;
+ requestHandler?: RequestHandler<ExtendedContext>;
+ /**
+ * Allows the user to extend the crawling context passed to the request handler with custom functionality.
+ *
+ * **Example usage:**
+ *
+ * ```javascript
+ * import { BasicCrawler } from 'crawlee';
+ *
+ * // Create a crawler instance
+ * const crawler = new BasicCrawler({
+ * extendContext: (context) => ({
+ * async customHelper() {
+ * await context.pushData({ url: context.request.url })
+ * }
+ * }),
+ * async requestHandler(context) {
+ * await context.customHelper();
+ * },
+ * });
+ * ```
+ */
+ extendContext?: (context: Context) => Awaitable<ContextExtension>;
+ /**
+ * *Intended for BasicCrawler subclasses*. Prepares a context pipeline that transforms the initial crawling context into the shape given by the `Context` type parameter.
+ *
+ * The option is not required if your crawler subclass does not extend the crawling context with custom information or helpers.
+ */
+ contextPipelineBuilder?: () => ContextPipeline<CrawlingContext, Context>;
  /**
  * Static list of URLs to be processed.
  * If not provided, the crawler will open the default request queue when the {@link BasicCrawler.addRequests|`crawler.addRequests()`} function is called.
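The `extendContext` hook documented in the hunk above is the new way to attach custom helpers to the crawling context; the returned object is merged into the context that `requestHandler` receives and its type feeds the `ExtendedContext` parameter. A typed sketch of the same idea (whether the extension type is inferred automatically or needs an explicit type argument may still vary in the beta):

```ts
import { BasicCrawler } from 'crawlee';

const crawler = new BasicCrawler({
    // Everything returned here becomes part of the request handler's context.
    extendContext: (context) => ({
        async saveUrl() {
            await context.pushData({ url: context.request.url });
        },
    }),
    async requestHandler(context) {
        // The helper added above is available (and type-checked) here.
        await context.saveUrl();
    },
});

await crawler.run(['https://crawlee.dev']);
```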
@@ -73,6 +79,13 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
  * it is a shortcut for running `crawler.addRequests()` before the `crawler.run()`.
  */
  requestQueue?: RequestProvider;
+ /**
+ * Allows explicitly configuring a request manager. Mutually exclusive with the `requestQueue` and `requestList` options.
+ *
+ * This enables explicitly configuring the crawler to use `RequestManagerTandem`, for instance.
+ * If using this, the type of `BasicCrawler.requestQueue` may not be fully compatible with the `RequestProvider` class.
+ */
+ requestManager?: IRequestManager;
  /**
  * Timeout in which the function passed as {@link BasicCrawlerOptions.requestHandler|`requestHandler`} needs to finish, in seconds.
  * @default 60
@@ -87,7 +100,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
  * Second argument is the `Error` instance that
  * represents the last error thrown during processing of the request.
  */
- errorHandler?: ErrorHandler<Context>;
+ errorHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
  /**
  * A function to handle requests that failed more than {@link BasicCrawlerOptions.maxRequestRetries|`maxRequestRetries`} times.
  *
@@ -96,7 +109,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
  * Second argument is the `Error` instance that
  * represents the last error thrown during processing of the request.
  */
- failedRequestHandler?: ErrorHandler<Context>;
+ failedRequestHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
  /**
  * Specifies the maximum number of retries allowed for a request if its processing fails.
  * This includes retries due to navigation errors or errors thrown from user-supplied functions
@@ -126,12 +139,18 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
  * > *NOTE:* In cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.
  */
  maxRequestsPerCrawl?: number;
+ /**
+ * Maximum depth of the crawl. If not set, the crawl will continue until all requests are processed.
+ * Setting this to `0` will only process the initial requests, skipping all links enqueued by `crawlingContext.enqueueLinks` and `crawlingContext.addRequests`.
+ * Passing `1` will process the initial requests and all links enqueued by `crawlingContext.enqueueLinks` and `crawlingContext.addRequests` in the handler for initial requests.
+ */
+ maxCrawlDepth?: number;
  /**
  * Custom options passed to the underlying {@link AutoscaledPool} constructor.
  * > *NOTE:* The {@link AutoscaledPoolOptions.runTaskFunction|`runTaskFunction`}
- * and {@link AutoscaledPoolOptions.isTaskReadyFunction|`isTaskReadyFunction`} options
- * are provided by the crawler and cannot be overridden.
- * However, we can provide a custom implementation of {@link AutoscaledPoolOptions.isFinishedFunction|`isFinishedFunction`}.
+ * option is provided by the crawler and cannot be overridden.
+ * However, we can provide custom implementations of {@link AutoscaledPoolOptions.isFinishedFunction|`isFinishedFunction`}
+ * and {@link AutoscaledPoolOptions.isTaskReadyFunction|`isTaskReadyFunction`}.
  */
  autoscaledPoolOptions?: AutoscaledPoolOptions;
  /**
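A short sketch of the new `maxCrawlDepth` option documented above — depth `1` means the start URLs plus whatever their handlers enqueue, and nothing deeper (the URLs are placeholders):

```ts
import { BasicCrawler } from 'crawlee';

const crawler = new BasicCrawler({
    // Process the start URLs (depth 0) and anything they enqueue (depth 1), then stop.
    maxCrawlDepth: 1,
    async requestHandler({ request, addRequests, log }) {
        log.info(`Visiting ${request.url}`);
        // Requests added here get the parent's depth + 1 and are skipped once the limit is exceeded.
        await addRequests(['https://example.com/next-page']);
    },
});

await crawler.run(['https://example.com']);
```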
@@ -159,14 +178,11 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
  */
  keepAlive?: boolean;
  /**
- * Basic crawler will initialize the {@link SessionPool} with the corresponding {@link SessionPoolOptions|`sessionPoolOptions`}.
- * The session instance will be than available in the {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
- */
- useSessionPool?: boolean;
- /**
- * The configuration options for {@link SessionPool} to use.
+ * An existing {@link SessionPool} instance to use. When provided, the crawler will use this
+ * pool directly instead of creating a new one, enabling session sharing across multiple crawlers.
+ * The crawler will not tear down a shared pool — the caller is responsible for its lifecycle.
  */
- sessionPoolOptions?: SessionPoolOptions;
+ sessionPool?: SessionPool;
  /**
  * Defines the length of the interval for calling the `setStatusMessage` in seconds.
  */
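The `useSessionPool`/`sessionPoolOptions` pair is replaced by a single `sessionPool` option that accepts an existing pool, which lets several crawlers share sessions. A sketch under the assumption that `SessionPool.open()` is still the way to obtain a pool instance and that `maxPoolSize` remains a valid pool option:

```ts
import { BasicCrawler, SessionPool } from 'crawlee';

// One pool shared by both crawlers; the caller owns its lifecycle.
const sessionPool = await SessionPool.open({ maxPoolSize: 50 });

const crawlerA = new BasicCrawler({
    sessionPool,
    async requestHandler({ session, log }) {
        log.info(`Crawler A uses session ${session?.id}`);
    },
});

const crawlerB = new BasicCrawler({
    sessionPool,
    async requestHandler({ session, log }) {
        log.info(`Crawler B uses session ${session?.id}`);
    },
});

await Promise.all([crawlerA.run(['https://example.com/a']), crawlerB.run(['https://example.com/b'])]);

// Shared pools are not torn down by the crawlers, so clean up explicitly.
await sessionPool.teardown();
```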
@@ -188,6 +204,11 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
  * ```
  */
  statusMessageCallback?: StatusMessageCallback;
+ /**
+ * HTTP status codes that indicate the session should be retired.
+ * @default [401, 403, 429]
+ */
+ blockedStatusCodes?: number[];
  /**
  * If set to `true`, the crawler will automatically try to bypass any detected bot protection.
  *
@@ -199,15 +220,22 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
  /**
  * If set to `true`, the crawler will automatically try to fetch the robots.txt file for each domain,
  * and skip those that are not allowed. This also prevents disallowed URLs to be added via `enqueueLinks`.
+ *
+ * If an object is provided, it may contain a `userAgent` property to specify which user-agent
+ * should be used when checking the robots.txt file. If not provided, the default user-agent `*` will be used.
  */
- respectRobotsTxtFile?: boolean;
+ respectRobotsTxtFile?: boolean | {
+ userAgent?: string;
+ };
  /**
  * When a request is skipped for some reason, you can use this callback to act on it.
- * This is currently fired only for requests skipped based on robots.txt file.
+ * This is currently fired for requests skipped
+ * 1. based on robots.txt file,
+ * 2. because they don't match enqueueLinks filters,
+ * 3. because they are redirected to a URL that doesn't match the enqueueLinks strategy,
+ * 4. or because the {@link BasicCrawlerOptions.maxRequestsPerCrawl|`maxRequestsPerCrawl`} limit has been reached
  */
  onSkippedRequest?: SkippedRequestCallback;
- /** @internal */
- log?: Log;
  /**
  * Enables experimental features of Crawlee, which can alter the behavior of the crawler.
  * WARNING: these options are not guaranteed to be stable and may change or be removed at any time.
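A sketch combining the new object form of `respectRobotsTxtFile` with an `onSkippedRequest` callback, assuming the callback payload still exposes the skipped `url` and a `reason` as in the current `SkippedRequestCallback` typings:

```ts
import { BasicCrawler } from 'crawlee';

const crawler = new BasicCrawler({
    // Check robots.txt rules against a specific user-agent instead of the default `*`.
    respectRobotsTxtFile: { userAgent: 'MyCrawler' },
    // Fired for robots.txt exclusions, enqueueLinks filter/strategy mismatches,
    // and requests dropped by the maxRequestsPerCrawl limit.
    onSkippedRequest: async ({ url, reason }) => {
        console.warn(`Skipped ${url}: ${reason}`);
    },
    async requestHandler({ request, log }) {
        log.info(`Allowed: ${request.url}`);
    },
});
```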
@@ -223,6 +251,53 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
  * Defaults to a new instance of {@link GotScrapingHttpClient}
  */
  httpClient?: BaseHttpClient;
+ /**
+ * If set, the crawler will be configured for all connections to use
+ * the Proxy URLs provided and rotated according to the configuration.
+ */
+ proxyConfiguration?: ProxyConfiguration;
+ /**
+ * Custom configuration to use for this crawler.
+ * If provided, the crawler will use its own ServiceLocator instance instead of the global one.
+ */
+ configuration?: Configuration;
+ /**
+ * Custom storage client to use for this crawler.
+ * If provided, the crawler will use its own ServiceLocator instance instead of the global one.
+ */
+ storageClient?: StorageClient;
+ /**
+ * Custom event manager to use for this crawler.
+ * If provided, the crawler will use its own ServiceLocator instance instead of the global one.
+ */
+ eventManager?: EventManager;
+ /**
+ * Custom logger to use for this crawler.
+ * If provided, the crawler will use its own ServiceLocator instance instead of the global one.
+ */
+ logger?: CrawleeLogger;
+ /**
+ * A unique identifier for the crawler instance. This ID is used to isolate the state returned by
+ * {@link BasicCrawler.useState|`crawler.useState()`} from other crawler instances.
+ *
+ * When multiple crawler instances use `useState()` without an explicit `id`, they will share the same
+ * state object for backward compatibility. A warning will be logged in this case.
+ *
+ * To ensure each crawler has its own isolated state that also persists across script restarts
+ * (e.g., during Apify migrations), provide a stable, unique ID for each crawler instance.
+ *
+ */
+ id?: string;
+ /**
+ * An array of HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be excluded from error consideration.
+ * By default, status codes >= 500 trigger errors.
+ */
+ ignoreHttpErrorStatusCodes?: number[];
+ /**
+ * An array of additional HTTP response [Status Codes](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status) to be treated as errors.
+ * By default, status codes >= 500 trigger errors.
+ */
+ additionalHttpErrorStatusCodes?: number[];
  }
  /**
  * A set of options that you can toggle to enable experimental features in Crawlee.
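Several of these additions — `configuration`, `storageClient`, `eventManager`, `logger` — swap the global service locator for a per-crawler one, while `id` isolates `useState()` and the status-code options tune what counts as a blocked or failed response. A sketch with illustrative values only:

```ts
import { BasicCrawler } from 'crawlee';

const crawler = new BasicCrawler({
    // A stable id keeps useState() isolated from other crawler instances
    // and lets the state survive restarts/migrations.
    id: 'products-crawler',
    // Retire the session on these responses.
    blockedStatusCodes: [403, 429],
    // Treat 404 as a regular result rather than an error...
    ignoreHttpErrorStatusCodes: [404],
    // ...but fail on 420 in addition to the >= 500 default.
    additionalHttpErrorStatusCodes: [420],
    async requestHandler({ request }) {
        const state = await crawler.useState<{ processed: number }>({ processed: 0 });
        state.processed += 1;
        console.log(`${request.url} -> ${state.processed} processed so far`);
    },
});
```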
@@ -303,9 +378,14 @@ export interface CrawlerExperiments {
  * ```
  * @category Crawlers
  */
- export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext> {
- readonly config: Configuration;
+ export declare class BasicCrawler<Context extends CrawlingContext = CrawlingContext, ContextExtension = Dictionary<never>, ExtendedContext extends Context = Context & ContextExtension> {
+ #private;
  protected static readonly CRAWLEE_STATE_KEY = "CRAWLEE_STATE";
+ /**
+ * Tracks crawler instances that accessed shared state without having an explicit id.
+ * Used to detect and warn about multiple crawlers sharing the same state.
+ */
+ private static useStateCrawlerIds;
  /**
  * A reference to the underlying {@link Statistics} class that collects and logs run statistics for requests.
  */
@@ -321,11 +401,18 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
  * Only available if used by the crawler.
  */
  requestQueue?: RequestProvider;
+ /**
+ * The main request-handling component of the crawler. It's initialized during the crawler startup.
+ */
+ protected requestManager?: IRequestManager;
  /**
  * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session|sessions}.
- * Only available if used by the crawler.
  */
- sessionPool?: SessionPool;
+ sessionPool: SessionPool;
+ /**
+ * Indicates whether the crawler owns the session pool (it was not passed from the outside using the `sessionPool` constructor option).
+ */
+ private ownsSessionPool;
  /**
  * A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
  * > *NOTE:* This property is only initialized after calling the {@link BasicCrawler.run|`crawler.run()`} function.
@@ -334,40 +421,71 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
  * or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
  */
  autoscaledPool?: AutoscaledPool;
+ /**
+ * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
+ * Only available if used by the crawler.
+ */
+ proxyConfiguration?: ProxyConfiguration;
  /**
  * Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
  * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
  */
- readonly router: RouterHandler<LoadedContext<Context>>;
+ readonly router: RouterHandler<Context>;
+ private _basicContextPipeline?;
+ /**
+ * The basic part of the context pipeline. Unlike the subclass pipeline, this
+ * part has no major side effects (e.g. launching a browser). It also makes typing more explicit, as subclass
+ * pipelines expect the basic crawler fields to already be present in the context at runtime.
+ *
+ * Context built with this pipeline can be passed into multiple crawler pipelines at once.
+ * This is used e.g. in the {@link AdaptivePlaywrightCrawler|`AdaptivePlaywrightCrawler`}.
+ */
+ get basicContextPipeline(): ContextPipeline<{
+ request: Request;
+ }, CrawlingContext>;
+ private _contextPipeline?;
+ get contextPipeline(): ContextPipeline<CrawlingContext, ExtendedContext>;
  running: boolean;
  hasFinishedBefore: boolean;
- readonly log: Log;
- protected requestHandler: RequestHandler<Context>;
- protected errorHandler?: ErrorHandler<Context>;
- protected failedRequestHandler?: ErrorHandler<Context>;
+ protected unexpectedStop: boolean;
+ get log(): CrawleeLogger;
+ protected requestHandler: RequestHandler<ExtendedContext>;
+ protected errorHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
+ protected failedRequestHandler?: ErrorHandler<CrawlingContext, ExtendedContext>;
  protected requestHandlerTimeoutMillis: number;
  protected internalTimeoutMillis: number;
  protected maxRequestRetries: number;
+ protected maxCrawlDepth?: number;
  protected sameDomainDelayMillis: number;
  protected domainAccessedTime: Map<string, number>;
  protected maxSessionRotations: number;
+ protected maxRequestsPerCrawl?: number;
  protected handledRequestsCount: number;
  protected statusMessageLoggingInterval: number;
  protected statusMessageCallback?: StatusMessageCallback;
- protected sessionPoolOptions: SessionPoolOptions;
- protected useSessionPool: boolean;
- protected crawlingContexts: Map<string, Context>;
+ protected blockedStatusCodes: Set<number>;
+ protected additionalHttpErrorStatusCodes: Set<number>;
+ protected ignoreHttpErrorStatusCodes: Set<number>;
  protected autoscaledPoolOptions: AutoscaledPoolOptions;
- protected events: EventManager;
  protected httpClient: BaseHttpClient;
  protected retryOnBlocked: boolean;
- protected respectRobotsTxtFile: boolean;
+ protected respectRobotsTxtFile: boolean | {
+ userAgent?: string;
+ };
  protected onSkippedRequest?: SkippedRequestCallback;
  private _closeEvents?;
+ private loggedPerRun;
  private experiments;
  private readonly robotsTxtFileCache;
  private _experimentWarnings;
+ private readonly crawlerId;
+ private readonly hasExplicitId;
+ private readonly contextPipelineOptions;
  protected static optionsShape: {
+ // @ts-ignore optional peer dependency or compatibility with es2022
+ contextPipelineBuilder: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
+ // @ts-ignore optional peer dependency or compatibility with es2022
+ extendContext: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
  // @ts-ignore optional peer dependency or compatibility with es2022
  requestList: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
  // @ts-ignore optional peer dependency or compatibility with es2022
@@ -388,24 +506,40 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
  maxSessionRotations: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
  // @ts-ignore optional peer dependency or compatibility with es2022
  maxRequestsPerCrawl: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
+ // @ts-ignore optional peer dependency or compatibility with es2022
+ maxCrawlDepth: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
  // @ts-ignore optional peer dependency or compatibility with es2022
  autoscaledPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
  // @ts-ignore optional peer dependency or compatibility with es2022
- sessionPoolOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
+ sessionPool: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
  // @ts-ignore optional peer dependency or compatibility with es2022
- useSessionPool: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
+ proxyConfiguration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
  // @ts-ignore optional peer dependency or compatibility with es2022
  statusMessageLoggingInterval: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
  // @ts-ignore optional peer dependency or compatibility with es2022
  statusMessageCallback: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
+ // @ts-ignore optional peer dependency or compatibility with es2022
+ additionalHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
+ // @ts-ignore optional peer dependency or compatibility with es2022
+ ignoreHttpErrorStatusCodes: import("ow").ArrayPredicate<number>;
+ // @ts-ignore optional peer dependency or compatibility with es2022
+ blockedStatusCodes: import("ow").ArrayPredicate<number>;
  // @ts-ignore optional peer dependency or compatibility with es2022
  retryOnBlocked: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
  // @ts-ignore optional peer dependency or compatibility with es2022
- respectRobotsTxtFile: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
+ respectRobotsTxtFile: import("ow").AnyPredicate<boolean | object>;
  // @ts-ignore optional peer dependency or compatibility with es2022
  onSkippedRequest: import("ow").Predicate<Function> & import("ow").BasePredicate<Function | undefined>;
  // @ts-ignore optional peer dependency or compatibility with es2022
  httpClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
+ // @ts-ignore optional peer dependency or compatibility with es2022
+ configuration: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
+ // @ts-ignore optional peer dependency or compatibility with es2022
+ storageClient: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
+ // @ts-ignore optional peer dependency or compatibility with es2022
+ eventManager: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
+ // @ts-ignore optional peer dependency or compatibility with es2022
+ logger: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
  // @ts-ignore optional peer dependency or compatibility with es2022
  minConcurrency: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
  // @ts-ignore optional peer dependency or compatibility with es2022
@@ -414,17 +548,42 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
  maxRequestsPerMinute: import("ow").NumberPredicate & import("ow").BasePredicate<number | undefined>;
  // @ts-ignore optional peer dependency or compatibility with es2022
  keepAlive: import("ow").BooleanPredicate & import("ow").BasePredicate<boolean | undefined>;
- // @ts-ignore optional peer dependency or compatibility with es2022
- log: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
  // @ts-ignore optional peer dependency or compatibility with es2022
  experiments: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
  // @ts-ignore optional peer dependency or compatibility with es2022
  statisticsOptions: import("ow").ObjectPredicate<object> & import("ow").BasePredicate<object | undefined>;
+ // @ts-ignore optional peer dependency or compatibility with es2022
+ id: import("ow").StringPredicate & import("ow").BasePredicate<string | undefined>;
  };
  /**
  * All `BasicCrawler` parameters are passed via an options object.
  */
- constructor(options?: BasicCrawlerOptions<Context>, config?: Configuration);
+ constructor(options?: BasicCrawlerOptions<Context, ContextExtension, ExtendedContext> & RequireContextPipeline<CrawlingContext, Context>);
+ /**
+ * Determines if the given HTTP status code is an error status code given
+ * the default behaviour and user-set preferences.
+ * @param status
+ * @returns `true` if the status code is considered an error, `false` otherwise
+ */
+ protected isErrorStatusCode(status: number): boolean;
+ /**
+ * Builds the basic context pipeline that transforms `{ request }` into a full `CrawlingContext`.
+ * This handles base context creation, session resolution, and context helpers.
+ */
+ protected buildBasicContextPipeline(): ContextPipeline<{
+ request: Request;
+ }, CrawlingContext>;
+ private checkRobotsTxt;
+ /**
+ * Builds the subclass-specific context pipeline that transforms a `CrawlingContext` into the crawler's target context type.
+ * Subclasses should override this to add their own pipeline stages.
+ */
+ protected buildContextPipeline(): ContextPipeline<CrawlingContext, CrawlingContext>;
+ private createBaseContext;
+ private resolveRequest;
+ private resolveSession;
+ private createContextHelpers;
+ private buildFinalContextPipeline;
  /**
  * Checks if the given error is a proxy error by comparing its message to a list of known proxy error messages.
  * Used for retrying requests that failed due to proxy errors.
@@ -432,12 +591,6 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
  * @param error The error to check.
  */
  protected isProxyError(error: Error): boolean;
- /**
- * Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
- * Returns `false` if the request is not blocked, otherwise returns a string with a description of the block reason.
- * @param _crawlingContext The crawling context to check.
- */
- protected isRequestBlocked(_crawlingContext: Context): Promise<string | false>;
  /**
  * This method is periodically called by the crawler, every `statusMessageLoggingInterval` seconds.
  */
@@ -453,15 +606,21 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
  * @param [requests] The requests to add.
  * @param [options] Options for the request queue.
  */
- run(requests?: (string | Request | RequestOptions)[], options?: CrawlerRunOptions): Promise<FinalStatistics>;
+ run(requests?: RequestsLike, options?: CrawlerRunOptions): Promise<FinalStatistics>;
  /**
  * Gracefully stops the current run of the crawler.
  *
  * All the tasks active at the time of calling this method will be allowed to finish.
+ *
+ * To stop the crawler immediately, use {@link BasicCrawler.teardown|`crawler.teardown()`} instead.
  */
- stop(message?: string): void;
+ stop(reason?: string): void;
  getRequestQueue(): Promise<RequestProvider>;
  useState<State extends Dictionary = Dictionary>(defaultValue?: State): Promise<State>;
+ protected get pendingRequestCountApproximation(): number;
+ protected calculateEnqueuedRequestLimit(explicitLimit?: number): number | undefined;
+ protected handleSkippedRequest(options: Parameters<SkippedRequestCallback>[0]): Promise<void>;
+ private logOncePerRun;
  /**
  * Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
  * adding the rest in background. You can configure the batch size via `batchSize` option and the sleep time in between
@@ -473,15 +632,15 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
  * @param requests The requests to add
  * @param options Options for the request queue
  */
- addRequests(requests: (string | Source)[], options?: CrawlerAddRequestsOptions): Promise<CrawlerAddRequestsResult>;
+ addRequests(requests: ReadonlyDeep<RequestsLike>, options?: CrawlerAddRequestsOptions): Promise<CrawlerAddRequestsResult>;
  /**
  * Pushes data to the specified {@link Dataset}, or the default crawler {@link Dataset} by calling {@link Dataset.pushData}.
  */
- pushData(data: Parameters<Dataset['pushData']>[0], datasetIdOrName?: string): Promise<void>;
+ pushData(data: Parameters<Dataset['pushData']>[0], datasetIdentifier?: string | StorageIdentifier): Promise<void>;
  /**
  * Retrieves the specified {@link Dataset}, or the default crawler {@link Dataset}.
  */
- getDataset(idOrName?: string): Promise<Dataset>;
+ getDataset(identifier?: string | StorageIdentifier): Promise<Dataset>;
  /**
  * Retrieves data from the default crawler {@link Dataset} by calling {@link Dataset.getData}.
  */
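Pulling the changed public surface together — `run()` and `addRequests()` now accept any `RequestsLike`, `stop()` takes a `reason`, and the data helpers accept a dataset identifier. A small end-to-end sketch:

```ts
import { BasicCrawler } from 'crawlee';

const crawler = new BasicCrawler({
    async requestHandler({ request, pushData }) {
        // Goes into the crawler's default dataset.
        await pushData({ url: request.url });
    },
});

// Gracefully stop after a time budget; active requests are allowed to finish.
setTimeout(() => crawler.stop('time budget exhausted'), 60_000);

await crawler.run(['https://example.com', { url: 'https://example.com/about', label: 'ABOUT' }]);

// Format is inferred from the file extension.
await crawler.exportData('results.csv');
```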
@@ -491,41 +650,52 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
  * Supported formats are currently 'json' and 'csv', and will be inferred from the `path` automatically.
  */
  exportData<Data>(path: string, format?: 'json' | 'csv', options?: DatasetExportOptions): Promise<Data[]>;
+ /**
+ * Initializes the crawler.
+ */
  protected _init(): Promise<void>;
- protected _runRequestHandler(crawlingContext: Context): Promise<void>;
+ protected runRequestHandler(crawlingContext: ExtendedContext): Promise<void>;
  /**
  * Handles blocked request
  */
- protected _throwOnBlockedRequest(session: Session, statusCode: number): void;
+ protected _throwOnBlockedRequest(statusCode: number): void;
  private isAllowedBasedOnRobotsTxtFile;
  protected getRobotsTxtFileForUrl(url: string): Promise<RobotsTxtFile | undefined>;
  protected _pauseOnMigration(): Promise<void>;
  /**
- * Fetches request from either RequestList or RequestQueue. If request comes from a RequestList
- * and RequestQueue is present then enqueues it to the queue first.
+ * Initializes the RequestManager based on the configured requestList and requestQueue.
  */
- protected _fetchNextRequest(): Promise<Request<Dictionary> | null | undefined>;
+ private initializeRequestManager;
  /**
- * Executed when `errorHandler` finishes or the request is successful.
- * Can be used to clean up orphaned browser pages.
+ * Fetches the next request to process from the underlying request provider.
  */
- protected _cleanupContext(_crawlingContext: Context): Promise<void>;
+ protected _fetchNextRequest(): Promise<Request<Dictionary> | null>;
  /**
  * Delays processing of the request based on the `sameDomainDelaySecs` option,
  * adding it back to the queue after the timeout passes. Returns `true` if the request
  * should be ignored and will be reclaimed to the queue once ready.
  */
- protected delayRequest(request: Request, source: IRequestList | RequestProvider): boolean;
+ protected delayRequest(request: Request, source: IRequestList | RequestProvider | IRequestManager): boolean;
+ /** Handles a single request - runs the request handler with retries, error handling, and lifecycle management. */
+ protected handleRequest(crawlingContext: ExtendedContext, requestSource: IRequestManager, request: Request): Promise<void>;
+ /**
+ * Wrapper around the crawling context's `enqueueLinks` method:
+ * - Injects `crawlDepth` to each request being added based on the crawling context request.
+ * - Provides defaults for the `enqueueLinks` options based on the crawler configuration.
+ * - These options can be overridden by the user.
+ * @internal
+ */
+ protected enqueueLinksWithCrawlDepth(options: SetRequired<EnqueueLinksOptions, 'urls'>, request: Request<Dictionary>, requestQueue: RequestProvider): Promise<BatchAddRequestsResult>;
  /**
- * Wrapper around requestHandler that fetches requests from RequestList/RequestQueue
- * then retries them in a case of an error, etc.
+ * Generator function that yields requests injected with the given crawl depth.
+ * @internal
  */
- protected _runTaskFunction(): Promise<void>;
+ protected addCrawlDepthRequestGenerator(requests: RequestsLike, newRequestDepth: number): AsyncGenerator<Source, void, undefined>;
  /**
- * Run async callback with given timeout and retry.
+ * Run async callback with given timeout and retry. Returns the result of the callback.
  * @ignore
  */
- protected _timeoutAndRetry(handler: () => Promise<unknown>, timeout: number, error: Error | string, maxRetries?: number, retried?: number): Promise<void>;
+ protected _timeoutAndRetry<T>(handler: () => Promise<T>, timeout: number, error: Error | string, maxRetries?: number, retried?: number): Promise<T>;
  /**
  * Returns true if either RequestList or RequestQueue have a request ready for processing.
  */
@@ -535,12 +705,19 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
  */
  protected _defaultIsFinishedFunction(): Promise<boolean>;
  private _rotateSession;
+ /**
+ * Unwraps errors thrown by the context pipeline to get the actual user error.
+ * RequestHandlerError and ContextPipelineInitializationError wrap the actual error.
+ */
+ private unwrapError;
  /**
  * Handles errors thrown by user provided requestHandler()
+ *
+ * @param request The request object, passed separately to circumvent potential dynamic logic in crawlingContext.request
  */
- protected _requestFunctionErrorHandler(error: Error, crawlingContext: Context, source: IRequestList | RequestProvider): Promise<void>;
+ protected _requestFunctionErrorHandler(error: Error, crawlingContext: CrawlingContext, request: Request, source: IRequestList | IRequestManager): Promise<void>;
  protected _tagUserHandlerError<T>(cb: () => unknown): Promise<T>;
- protected _handleFailedRequestHandler(crawlingContext: Context, error: Error): Promise<void>;
+ protected _handleFailedRequestHandler(crawlingContext: CrawlingContext, error: Error): Promise<void>;
  /**
  * Resolves the most verbose error message from a thrown error
  * @param error The error received
@@ -549,27 +726,25 @@ export declare class BasicCrawler<Context extends CrawlingContext = BasicCrawlin
  protected _getMessageFromError(error: Error, forceStack?: boolean): string | TimeoutError | undefined;
  protected _canRequestBeRetried(request: Request, error: Error): boolean;
  /**
- * Updates handledRequestsCount from possibly stored counts,
- * usually after worker migration. Since one of the stores
- * needs to have priority when both are present,
- * it is the request queue, because generally, the request
- * list will first be dumped into the queue and then left
- * empty.
+ * Updates handledRequestsCount from possibly stored counts, usually after worker migration.
  */
  protected _loadHandledRequestCount(): Promise<void>;
  protected _executeHooks<HookLike extends (...args: any[]) => Awaitable<void>>(hooks: HookLike[], ...args: Parameters<HookLike>): Promise<void>;
  /**
- * Function for cleaning up after all request are processed.
- * @ignore
+ * Stops the crawler immediately.
+ *
+ * This method doesn't wait for currently active requests to finish.
+ *
+ * To stop the crawler gracefully (waiting for all running requests to finish), use {@link BasicCrawler.stop|`crawler.stop()`} instead.
  */
  teardown(): Promise<void>;
  protected _getCookieHeaderFromRequest(request: Request): string;
  private _getRequestQueue;
- protected requestMatchesEnqueueStrategy(request: Request): boolean;
+ private requestMatchesEnqueueStrategy;
  }
  export interface CreateContextOptions {
  request: Request;
- session?: Session;
+ session: Session;
  proxyInfo?: ProxyInfo;
  }
  export interface CrawlerAddRequestsOptions extends AddRequestsBatchedOptions {