@crawlee/basic 4.0.0-beta.5 → 4.0.0-beta.50

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
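The most visible API change in this range is the constructor: `BasicCrawler` no longer accepts a `Configuration` as a second argument. Custom services (`configuration`, `storageClient`, `eventManager`, `logger`) now travel through the options object and are wired into a per-crawler `ServiceLocator`, the `useSessionPool`/`sessionPoolOptions` pair is replaced by an optional `sessionPool` instance, and an optional `id` namespaces per-crawler state. A minimal before/after sketch in TypeScript, inferred from the option shapes in the diff below (the handler bodies and the `maxPoolSize` value are placeholders):

    import { BasicCrawler } from '@crawlee/basic';
    import { Configuration, SessionPool } from '@crawlee/core';

    // 4.0.0-beta.5: services came in via a second Configuration argument,
    // and the session pool was configured through options.
    const before = new BasicCrawler(
        {
            useSessionPool: true,
            sessionPoolOptions: { maxPoolSize: 50 },
            requestHandler: async ({ request }) => { /* ... */ },
        },
        Configuration.getGlobalConfig(),
    );

    // 4.0.0-beta.50: everything goes through the options object; a SessionPool
    // instance is passed directly and `id` keys the crawler's persisted state.
    const after = new BasicCrawler({
        id: 'my-crawler-1',
        sessionPool: new SessionPool(),
        requestHandler: async ({ request }) => { /* ... */ },
    });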
@@ -1,14 +1,14 @@
  import { writeFile } from 'node:fs/promises';
  import { dirname } from 'node:path';
- import { AutoscaledPool, Configuration, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
- import { RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
+ import { AutoscaledPool, bindMethodsToServiceLocator, BLOCKED_STATUS_CODES, ContextPipeline, ContextPipelineCleanupError, ContextPipelineInitializationError, ContextPipelineInterruptedError, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, KeyValueStore, LogLevel, mergeCookies, MissingSessionError, NavigationSkippedError, NonRetryableError, purgeDefaultStorages, RequestHandlerError, RequestListAdapter, RequestManagerTandem, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, ServiceLocator, serviceLocator, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
+ import { GotScrapingHttpClient } from '@crawlee/got-scraping-client';
+ import { getObjectType, isAsyncIterable, isIterable, RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
  import { stringify } from 'csv-stringify/sync';
  import { ensureDir, writeJSON } from 'fs-extra/esm';
  import ow from 'ow';
  import { getDomain } from 'tldts';
  import { LruCache } from '@apify/datastructures';
- import defaultLog, { LogLevel } from '@apify/log';
- import { addTimeoutToPromise, TimeoutError, tryCancel } from '@apify/timeout';
+ import { addTimeoutToPromise, TimeoutError } from '@apify/timeout';
  import { cryptoRandomObjectId } from '@apify/utilities';
  import { createSendRequest } from './send-request.js';
  /**
@@ -21,6 +21,7 @@ import { createSendRequest } from './send-request.js';
  * @ignore
  */
  const SAFE_MIGRATION_WAIT_MILLIS = 20000;
+ const deferredCleanupKey = Symbol('deferredCleanup');
  /**
  * Provides a simple framework for parallel crawling of web pages.
  * The URLs to crawl are fed either from a static list of URLs
@@ -86,8 +87,12 @@ const SAFE_MIGRATION_WAIT_MILLIS = 20000;
  * @category Crawlers
  */
  export class BasicCrawler {
- config;
  static CRAWLEE_STATE_KEY = 'CRAWLEE_STATE';
+ /**
+ * Tracks crawler instances that accessed shared state without having an explicit id.
+ * Used to detect and warn about multiple crawlers sharing the same state.
+ */
+ static useStateCrawlerIds = new Set();
  /**
  * A reference to the underlying {@link Statistics} class that collects and logs run statistics for requests.
  */
@@ -103,11 +108,18 @@ export class BasicCrawler {
  * Only available if used by the crawler.
  */
  requestQueue;
+ /**
+ * The main request-handling component of the crawler. It's initialized during the crawler startup.
+ */
+ requestManager;
  /**
  * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session|sessions}.
- * Only available if used by the crawler.
  */
  sessionPool;
+ /**
+ * Indicates whether the crawler owns the session pool (it was not passed from the outside using the `sessionPool` constructor option).
+ */
+ ownsSessionPool;
  /**
  * A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
  * > *NOTE:* This property is only initialized after calling the {@link BasicCrawler.run|`crawler.run()`} function.
@@ -116,40 +128,78 @@ export class BasicCrawler {
  * or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
  */
  autoscaledPool;
+ /**
+ * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
+ * Only available if used by the crawler.
+ */
+ proxyConfiguration;
  /**
  * Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
  * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
  */
  router = Router.create();
+ _basicContextPipeline;
+ /**
+ * The basic part of the context pipeline. Unlike the subclass pipeline, this
+ * part has no major side effects (e.g. launching a browser). It also makes typing more explicit, as subclass
+ * pipelines expect the basic crawler fields to already be present in the context at runtime.
+ *
+ * Context built with this pipeline can be passed into multiple crawler pipelines at once.
+ * This is used e.g. in the {@link AdaptivePlaywrightCrawler|`AdaptivePlaywrightCrawler`}.
+ */
+ get basicContextPipeline() {
+ if (this._basicContextPipeline === undefined) {
+ this._basicContextPipeline = this.buildBasicContextPipeline();
+ }
+ return this._basicContextPipeline;
+ }
+ _contextPipeline;
+ get contextPipeline() {
+ if (this._contextPipeline === undefined) {
+ this._contextPipeline = this.buildFinalContextPipeline();
+ }
+ return this._contextPipeline;
+ }
  running = false;
  hasFinishedBefore = false;
- log;
+ unexpectedStop = false;
+ #log;
+ get log() {
+ return this.#log;
+ }
  requestHandler;
  errorHandler;
  failedRequestHandler;
  requestHandlerTimeoutMillis;
  internalTimeoutMillis;
  maxRequestRetries;
+ maxCrawlDepth;
  sameDomainDelayMillis;
  domainAccessedTime;
  maxSessionRotations;
- handledRequestsCount;
+ maxRequestsPerCrawl;
+ handledRequestsCount = 0;
  statusMessageLoggingInterval;
  statusMessageCallback;
- sessionPoolOptions;
- useSessionPool;
- crawlingContexts = new Map();
+ blockedStatusCodes = new Set();
+ additionalHttpErrorStatusCodes;
+ ignoreHttpErrorStatusCodes;
  autoscaledPoolOptions;
- events;
  httpClient;
  retryOnBlocked;
  respectRobotsTxtFile;
  onSkippedRequest;
  _closeEvents;
+ loggedPerRun = new Set();
  experiments;
  robotsTxtFileCache;
  _experimentWarnings = {};
+ crawlerId;
+ hasExplicitId;
+ contextPipelineOptions;
  static optionsShape = {
+ contextPipelineBuilder: ow.optional.object,
+ extendContext: ow.optional.function,
  requestList: ow.optional.object.validate(validators.requestList),
  requestQueue: ow.optional.object.validate(validators.requestQueue),
  // Subclasses override this function instead of passing it
@@ -163,143 +213,347 @@ export class BasicCrawler {
  sameDomainDelaySecs: ow.optional.number,
  maxSessionRotations: ow.optional.number,
  maxRequestsPerCrawl: ow.optional.number,
+ maxCrawlDepth: ow.optional.number,
  autoscaledPoolOptions: ow.optional.object,
- sessionPoolOptions: ow.optional.object,
- useSessionPool: ow.optional.boolean,
+ sessionPool: ow.optional.object.instanceOf(SessionPool),
+ proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
  statusMessageLoggingInterval: ow.optional.number,
  statusMessageCallback: ow.optional.function,
+ additionalHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
+ ignoreHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
+ blockedStatusCodes: ow.optional.array.ofType(ow.number),
  retryOnBlocked: ow.optional.boolean,
- respectRobotsTxtFile: ow.optional.boolean,
+ respectRobotsTxtFile: ow.optional.any(ow.boolean, ow.object),
  onSkippedRequest: ow.optional.function,
  httpClient: ow.optional.object,
+ configuration: ow.optional.object,
+ storageClient: ow.optional.object,
+ eventManager: ow.optional.object,
+ logger: ow.optional.object,
  // AutoscaledPool shorthands
  minConcurrency: ow.optional.number,
  maxConcurrency: ow.optional.number,
  maxRequestsPerMinute: ow.optional.number.integerOrInfinite.positive.greaterThanOrEqual(1),
  keepAlive: ow.optional.boolean,
  // internal
- log: ow.optional.object,
  experiments: ow.optional.object,
  statisticsOptions: ow.optional.object,
+ id: ow.optional.string,
  };
  /**
  * All `BasicCrawler` parameters are passed via an options object.
  */
- constructor(options = {}, config = Configuration.getGlobalConfig()) {
- this.config = config;
+ constructor(options = {}) {
  ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape));
- const { requestList, requestQueue, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true,
+ const { requestList, requestQueue, requestManager, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, maxCrawlDepth, autoscaledPoolOptions = {}, keepAlive, sessionPool, proxyConfiguration, additionalHttpErrorStatusCodes = [], ignoreHttpErrorStatusCodes = [],
+ // Service locator options
+ configuration, storageClient, eventManager, logger,
  // AutoscaledPool shorthands
- minConcurrency, maxConcurrency, maxRequestsPerMinute, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
+ minConcurrency, maxConcurrency, maxRequestsPerMinute, blockedStatusCodes: blockedStatusCodesInput, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
  // internal
- log = defaultLog.child({ prefix: this.constructor.name }), experiments = {}, } = options;
- this.requestList = requestList;
- this.requestQueue = requestQueue;
- this.httpClient = httpClient ?? new GotScrapingHttpClient();
- this.log = log;
- this.statusMessageLoggingInterval = statusMessageLoggingInterval;
- this.statusMessageCallback = statusMessageCallback;
- this.events = config.getEventManager();
- this.domainAccessedTime = new Map();
- this.experiments = experiments;
- this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });
- this.requestHandler = requestHandler ?? this.router;
- this.failedRequestHandler = failedRequestHandler;
- this.errorHandler = errorHandler;
- if (requestHandlerTimeoutSecs) {
- this.requestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
+ experiments = {}, id, } = options;
+ // Create per-crawler service locator if custom services were provided.
+ // This wraps every method on the crawler instance so that calls to the global `serviceLocator`
+ // (via AsyncLocalStorage) resolve to this scoped instance instead.
+ // We also enter the scope for the rest of the constructor body, so that any code below
+ // that accesses `serviceLocator` will see the correct (scoped) instance.
+ let serviceLocatorScope = { enterScope: () => { }, exitScope: () => { } };
+ if (storageClient ||
+ eventManager ||
+ logger ||
+ (configuration !== undefined && configuration !== serviceLocator.getConfiguration())) {
+ const scopedServiceLocator = new ServiceLocator(configuration, eventManager, storageClient, logger);
+ serviceLocatorScope = bindMethodsToServiceLocator(scopedServiceLocator, this);
  }
- else {
- this.requestHandlerTimeoutMillis = 60_000;
- }
- this.retryOnBlocked = retryOnBlocked;
- this.respectRobotsTxtFile = respectRobotsTxtFile;
- this.onSkippedRequest = onSkippedRequest;
- const tryEnv = (val) => (val == null ? null : +val);
- // allow at least 5min for internal timeouts
- this.internalTimeoutMillis =
- tryEnv(process.env.CRAWLEE_INTERNAL_TIMEOUT) ?? Math.max(this.requestHandlerTimeoutMillis * 2, 300e3);
- // override the default internal timeout of request queue to respect `requestHandlerTimeoutMillis`
- if (this.requestQueue) {
- this.requestQueue.internalTimeoutMillis = this.internalTimeoutMillis;
- // for request queue v2, we want to lock requests for slightly longer than the request handler timeout so that there is some padding for locking-related overhead,
- // but never for less than a minute
- this.requestQueue.requestLockSecs = Math.max(this.requestHandlerTimeoutMillis / 1000 + 5, 60);
- }
- this.maxRequestRetries = maxRequestRetries;
- this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
- this.maxSessionRotations = maxSessionRotations;
- this.handledRequestsCount = 0;
- this.stats = new Statistics({
- logMessage: `${log.getOptions().prefix} request statistics:`,
- log,
- config,
- ...statisticsOptions,
- });
- this.sessionPoolOptions = {
- ...sessionPoolOptions,
- log,
- };
- if (this.retryOnBlocked) {
- this.sessionPoolOptions.blockedStatusCodes = sessionPoolOptions.blockedStatusCodes ?? [];
- if (this.sessionPoolOptions.blockedStatusCodes.length !== 0) {
- log.warning(`Both 'blockedStatusCodes' and 'retryOnBlocked' are set. Please note that the 'retryOnBlocked' feature might not work as expected.`);
+ try {
+ serviceLocatorScope.enterScope();
+ this.contextPipelineOptions = {
+ contextPipelineBuilder: options.contextPipelineBuilder,
+ extendContext: options.extendContext,
+ };
+ this.#log = serviceLocator.getLogger().child({ prefix: this.constructor.name });
+ // Store whether the user explicitly provided an ID
+ this.hasExplicitId = id !== undefined;
+ // Store the user-provided ID, or generate a unique one for tracking purposes (not for state key)
+ this.crawlerId = id ?? cryptoRandomObjectId();
+ if (requestManager !== undefined) {
+ if (requestList !== undefined || requestQueue !== undefined) {
+ throw new Error('The `requestManager` option cannot be used in conjunction with `requestList` and/or `requestQueue`');
+ }
+ this.requestManager = requestManager;
+ this.requestQueue = requestManager; // TODO(v4) - the cast is not fully legitimate here, but it's fine for internal usage by the BasicCrawler
  }
- }
- this.useSessionPool = useSessionPool;
- this.crawlingContexts = new Map();
- const maxSignedInteger = 2 ** 31 - 1;
- if (this.requestHandlerTimeoutMillis > maxSignedInteger) {
- log.warning(`requestHandlerTimeoutMillis ${this.requestHandlerTimeoutMillis}` +
- ` does not fit a signed 32-bit integer. Limiting the value to ${maxSignedInteger}`);
- this.requestHandlerTimeoutMillis = maxSignedInteger;
- }
- this.internalTimeoutMillis = Math.min(this.internalTimeoutMillis, maxSignedInteger);
- let shouldLogMaxPagesExceeded = true;
- const isMaxPagesExceeded = () => maxRequestsPerCrawl && maxRequestsPerCrawl <= this.handledRequestsCount;
- // eslint-disable-next-line prefer-const
- let { isFinishedFunction, isTaskReadyFunction } = autoscaledPoolOptions;
- // override even if `isFinishedFunction` provided by user - `keepAlive` has higher priority
- if (keepAlive) {
- isFinishedFunction = async () => false;
- }
- const basicCrawlerAutoscaledPoolConfiguration = {
- minConcurrency: minConcurrency ?? autoscaledPoolOptions?.minConcurrency,
- maxConcurrency: maxConcurrency ?? autoscaledPoolOptions?.maxConcurrency,
- maxTasksPerMinute: maxRequestsPerMinute ?? autoscaledPoolOptions?.maxTasksPerMinute,
- runTaskFunction: this._runTaskFunction.bind(this),
- isTaskReadyFunction: async () => {
- if (isMaxPagesExceeded()) {
- if (shouldLogMaxPagesExceeded) {
- log.info('Crawler reached the maxRequestsPerCrawl limit of ' +
- `${maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
- shouldLogMaxPagesExceeded = false;
+ else {
+ this.requestList = requestList;
+ this.requestQueue = requestQueue;
+ }
+ this.httpClient = httpClient ?? new GotScrapingHttpClient({ logger: this.log });
+ this.proxyConfiguration = proxyConfiguration;
+ this.statusMessageLoggingInterval = statusMessageLoggingInterval;
+ this.statusMessageCallback = statusMessageCallback;
+ this.domainAccessedTime = new Map();
+ this.experiments = experiments;
+ this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });
+ this.handleSkippedRequest = this.handleSkippedRequest.bind(this);
+ this.additionalHttpErrorStatusCodes = new Set([...additionalHttpErrorStatusCodes]);
+ this.ignoreHttpErrorStatusCodes = new Set([...ignoreHttpErrorStatusCodes]);
+ this.requestHandler = requestHandler ?? this.router;
+ this.failedRequestHandler = failedRequestHandler;
+ this.errorHandler = errorHandler;
+ if (requestHandlerTimeoutSecs) {
+ this.requestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
+ }
+ else {
+ this.requestHandlerTimeoutMillis = 60_000;
+ }
+ this.retryOnBlocked = retryOnBlocked;
+ this.respectRobotsTxtFile = respectRobotsTxtFile;
+ this.onSkippedRequest = onSkippedRequest;
+ const tryEnv = (val) => (val == null ? null : +val);
+ // allow at least 5min for internal timeouts
+ this.internalTimeoutMillis =
+ tryEnv(process.env.CRAWLEE_INTERNAL_TIMEOUT) ?? Math.max(this.requestHandlerTimeoutMillis * 2, 300e3);
+ // override the default internal timeout of request queue to respect `requestHandlerTimeoutMillis`
+ if (this.requestQueue) {
+ this.requestQueue.internalTimeoutMillis = this.internalTimeoutMillis;
+ // for request queue v2, we want to lock requests for slightly longer than the request handler timeout so that there is some padding for locking-related overhead,
+ // but never for less than a minute
+ this.requestQueue.requestLockSecs = Math.max(this.requestHandlerTimeoutMillis / 1000 + 5, 60);
+ }
+ this.maxRequestRetries = maxRequestRetries;
+ this.maxCrawlDepth = maxCrawlDepth;
+ this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
+ this.maxSessionRotations = maxSessionRotations;
+ this.stats = new Statistics({
+ logMessage: `${this.constructor.name} request statistics:`,
+ log: this.log,
+ ...(this.hasExplicitId ? { id: this.crawlerId } : {}),
+ ...statisticsOptions,
+ });
+ this.sessionPool = sessionPool ?? new SessionPool();
+ this.sessionPool.setMaxListeners(20);
+ this.ownsSessionPool = !sessionPool;
+ this.blockedStatusCodes = new Set(blockedStatusCodesInput ?? BLOCKED_STATUS_CODES);
+ const maxSignedInteger = 2 ** 31 - 1;
+ if (this.requestHandlerTimeoutMillis > maxSignedInteger) {
+ this.log.warning(`requestHandlerTimeoutMillis ${this.requestHandlerTimeoutMillis}` +
+ ` does not fit a signed 32-bit integer. Limiting the value to ${maxSignedInteger}`);
+ this.requestHandlerTimeoutMillis = maxSignedInteger;
+ }
+ this.internalTimeoutMillis = Math.min(this.internalTimeoutMillis, maxSignedInteger);
+ this.maxRequestsPerCrawl = maxRequestsPerCrawl;
+ const isMaxPagesExceeded = () => this.maxRequestsPerCrawl && this.maxRequestsPerCrawl <= this.handledRequestsCount;
+ // eslint-disable-next-line prefer-const
+ let { isFinishedFunction, isTaskReadyFunction } = autoscaledPoolOptions;
+ // override even if `isFinishedFunction` provided by user - `keepAlive` has higher priority
+ if (keepAlive) {
+ isFinishedFunction = async () => false;
+ }
+ const basicCrawlerAutoscaledPoolConfiguration = {
+ minConcurrency: minConcurrency ?? autoscaledPoolOptions?.minConcurrency,
+ maxConcurrency: maxConcurrency ?? autoscaledPoolOptions?.maxConcurrency,
+ maxTasksPerMinute: maxRequestsPerMinute ?? autoscaledPoolOptions?.maxTasksPerMinute,
+ runTaskFunction: async () => {
+ const source = this.requestManager;
+ if (!source)
+ throw new Error('Request provider is not initialized!');
+ const request = await this.resolveRequest();
+ if (!request || this.delayRequest(request, source)) {
+ return;
  }
- return false;
- }
- return isTaskReadyFunction ? await isTaskReadyFunction() : await this._isTaskReadyFunction();
+ const crawlingContext = { request };
+ try {
+ await this.basicContextPipeline
+ .chain(this.contextPipeline)
+ .call(crawlingContext, (ctx) => this.handleRequest(ctx, source, request));
+ }
+ catch (error) {
+ // ContextPipelineInterruptedError means the request was intentionally skipped
+ // (e.g., doesn't match enqueue strategy after redirect). Just return gracefully.
+ if (error instanceof ContextPipelineInterruptedError) {
+ await this._timeoutAndRetry(async () => this.requestManager?.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${crawlingContext.request.url} (${crawlingContext.request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
+ return;
+ }
+ // If the error happened during pipeline initialization (e.g., navigation timeout, session/proxy error,
+ // i.e. not in user's requestHandler), handle it through the normal error flow.
+ const isPipelineError = error instanceof ContextPipelineInitializationError || error instanceof SessionError;
+ if (isPipelineError) {
+ const unwrappedError = this.unwrapError(error);
+ await this._requestFunctionErrorHandler(unwrappedError, crawlingContext, request, this.requestManager);
+ crawlingContext.session?.markBad();
+ return;
+ }
+ throw this.unwrapError(error);
+ }
+ },
+ isTaskReadyFunction: async () => {
+ if (isMaxPagesExceeded()) {
+ this.logOncePerRun('shuttingDown', 'Crawler reached the maxRequestsPerCrawl limit of ' +
+ `${this.maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
+ return false;
+ }
+ if (this.unexpectedStop) {
+ this.logOncePerRun('shuttingDown', 'No new requests are allowed because the `stop()` method has been called. ' +
+ 'Ongoing requests will be allowed to complete.');
+ return false;
+ }
+ return isTaskReadyFunction ? await isTaskReadyFunction() : await this._isTaskReadyFunction();
+ },
+ isFinishedFunction: async () => {
+ if (isMaxPagesExceeded()) {
+ this.log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${this.maxRequestsPerCrawl} requests ` +
+ 'and all requests that were in progress at that time have now finished. ' +
+ `In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`);
+ return true;
+ }
+ if (this.unexpectedStop) {
+ this.log.info('The crawler has finished all the remaining ongoing requests and will shut down now.');
+ return true;
+ }
+ const isFinished = isFinishedFunction
+ ? await isFinishedFunction()
+ : await this._defaultIsFinishedFunction();
+ if (isFinished) {
+ const reason = isFinishedFunction
+ ? "Crawler's custom isFinishedFunction() returned true, the crawler will shut down."
+ : 'All requests from the queue have been processed, the crawler will shut down.';
+ this.log.info(reason);
+ }
+ return isFinished;
+ },
+ log: this.log,
+ };
+ this.autoscaledPoolOptions = { ...autoscaledPoolOptions, ...basicCrawlerAutoscaledPoolConfiguration };
+ }
+ finally {
+ serviceLocatorScope.exitScope();
+ }
+ }
+ /**
+ * Determines if the given HTTP status code is an error status code given
+ * the default behaviour and user-set preferences.
+ * @param status
+ * @returns `true` if the status code is considered an error, `false` otherwise
+ */
+ isErrorStatusCode(status) {
+ const excludeError = this.ignoreHttpErrorStatusCodes.has(status);
+ const includeError = this.additionalHttpErrorStatusCodes.has(status);
+ return (status >= 500 && !excludeError) || includeError;
+ }
+ /**
+ * Builds the basic context pipeline that transforms `{ request }` into a full `CrawlingContext`.
+ * This handles base context creation, session resolution, and context helpers.
+ */
+ buildBasicContextPipeline() {
+ return ContextPipeline.create()
+ .compose({ action: this.checkRobotsTxt.bind(this) })
+ .compose({
+ action: () => this.createBaseContext(),
+ cleanup: async (context) => {
+ await Promise.all(context[deferredCleanupKey].map((fn) => fn()));
  },
- isFinishedFunction: async () => {
- if (isMaxPagesExceeded()) {
- log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${maxRequestsPerCrawl} requests ` +
- 'and all requests that were in progress at that time have now finished. ' +
- `In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`);
- return true;
+ })
+ .compose({ action: this.resolveSession.bind(this) })
+ .compose({ action: this.createContextHelpers.bind(this) });
+ }
+ async checkRobotsTxt({ request }) {
+ if (!(await this.isAllowedBasedOnRobotsTxtFile(request.url))) {
+ this.log.warning(`Skipping request ${request.url} (${request.id}) because it is disallowed based on robots.txt`);
+ request.state = RequestState.SKIPPED;
+ request.noRetry = true;
+ await this.handleSkippedRequest({
+ url: request.url,
+ reason: 'robotsTxt',
+ });
+ throw new ContextPipelineInterruptedError(`Skipping request ${request.url} as disallowed by robots.txt`);
+ }
+ return {};
+ }
+ /**
+ * Builds the subclass-specific context pipeline that transforms a `CrawlingContext` into the crawler's target context type.
+ * Subclasses should override this to add their own pipeline stages.
+ */
+ buildContextPipeline() {
+ return ContextPipeline.create();
+ }
+ createBaseContext() {
+ const deferredCleanup = [];
+ return {
+ id: cryptoRandomObjectId(10),
+ log: this.log,
+ pushData: this.pushData.bind(this),
+ useState: this.useState.bind(this),
+ getKeyValueStore: async (identifier) => KeyValueStore.open(identifier),
+ registerDeferredCleanup: (cleanup) => {
+ deferredCleanup.push(cleanup);
+ },
+ [deferredCleanupKey]: deferredCleanup,
+ };
+ }
+ async resolveRequest() {
+ const request = await this._timeoutAndRetry(this._fetchNextRequest.bind(this), this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
+ // Reset loadedUrl so an old one is not carried over to retries.
+ if (request) {
+ request.loadedUrl = undefined;
+ }
+ return request;
+ }
+ async resolveSession({ request }) {
+ const session = await this._timeoutAndRetry(async () => {
+ if (request.sessionId) {
+ const existingSession = await this.sessionPool.getSession(request.sessionId);
+ if (!existingSession) {
+ throw new ContextPipelineInitializationError(new MissingSessionError(request.sessionId));
  }
- const isFinished = isFinishedFunction
- ? await isFinishedFunction()
- : await this._defaultIsFinishedFunction();
- if (isFinished) {
- const reason = isFinishedFunction
- ? "Crawler's custom isFinishedFunction() returned true, the crawler will shut down."
- : 'All requests from the queue have been processed, the crawler will shut down.';
- log.info(reason);
+ return existingSession;
+ }
+ return await this.sessionPool.newSession({
+ proxyInfo: await this.proxyConfiguration?.newProxyInfo({
+ request: request ?? undefined,
+ }),
+ maxUsageCount: 1,
+ });
+ }, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
+ return { session, proxyInfo: session.proxyInfo };
+ }
+ async createContextHelpers({ request, session }) {
+ const enqueueLinksWrapper = async (options) => {
+ const requestQueue = await this.getRequestQueue();
+ return await this.enqueueLinksWithCrawlDepth(options, request, requestQueue);
+ };
+ const addRequests = async (requests, options = {}) => {
+ const newCrawlDepth = request.crawlDepth + 1;
+ const requestsGenerator = this.addCrawlDepthRequestGenerator(requests, newCrawlDepth);
+ await this.addRequests(requestsGenerator, options);
+ };
+ const sendRequest = createSendRequest(this.httpClient, request, session);
+ return { enqueueLinks: enqueueLinksWrapper, addRequests, sendRequest };
+ }
+ buildFinalContextPipeline() {
+ let contextPipeline = (this.contextPipelineOptions.contextPipelineBuilder?.() ??
+ this.buildContextPipeline());
+ const { extendContext } = this.contextPipelineOptions;
+ if (extendContext !== undefined) {
+ contextPipeline = contextPipeline.compose({
+ action: async (context) => await extendContext(context),
+ });
+ }
+ contextPipeline = contextPipeline.compose({
+ action: async (context) => {
+ const { request } = context;
+ if (request && !this.requestMatchesEnqueueStrategy(request)) {
+ // eslint-disable-next-line dot-notation
+ const message = `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`;
+ this.log.debug(message);
+ request.noRetry = true;
+ request.state = RequestState.SKIPPED;
+ await this.handleSkippedRequest({ url: request.url, reason: 'redirect' });
+ throw new ContextPipelineInterruptedError(message);
  }
- return isFinished;
+ return context;
  },
- log,
- };
- this.autoscaledPoolOptions = { ...autoscaledPoolOptions, ...basicCrawlerAutoscaledPoolConfiguration };
+ });
+ return contextPipeline;
  }
  /**
  * Checks if the given error is a proxy error by comparing its message to a list of known proxy error messages.
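The pipeline methods introduced above (`ContextPipeline.create()`, `.compose({ action, cleanup })`, `.chain()`, `.call()`) replace the old monolithic `_runTaskFunction`. A rough usage sketch, inferred only from the call sites visible in this diff (stage names and returned fields are illustrative, not part of the API):

    import { ContextPipeline } from '@crawlee/core';

    const request = { url: 'https://crawlee.dev' };

    // Each compose() stage receives the context built so far and returns the
    // fields it contributes; cleanup runs after the consumer finishes.
    const pipeline = ContextPipeline.create()
        .compose({ action: async () => ({ startedAt: Date.now() }) })
        .compose({
            action: async () => ({ resource: 'acquired' }),
            cleanup: async (ctx) => { /* release ctx.resource here */ },
        });

    // call() assembles the full context and hands it to the consumer,
    // mirroring basicContextPipeline.chain(contextPipeline).call(...) above.
    await pipeline.call({ request }, async (ctx) => {
        // ctx now contains request, startedAt and resource
    });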
@@ -310,21 +564,13 @@
  isProxyError(error) {
  return ROTATE_PROXY_ERRORS.some((x) => this._getMessageFromError(error)?.includes(x));
  }
- /**
- * Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
- * Returns `false` if the request is not blocked, otherwise returns a string with a description of the block reason.
- * @param _crawlingContext The crawling context to check.
- */
- async isRequestBlocked(_crawlingContext) {
- throw new Error('the "isRequestBlocked" method is not implemented in this crawler.');
- }
  /**
  * This method is periodically called by the crawler, every `statusMessageLoggingInterval` seconds.
  */
  async setStatusMessage(message, options = {}) {
  const data = options.isStatusMessageTerminal != null ? { terminal: options.isStatusMessageTerminal } : undefined;
- this.log.internal(LogLevel[options.level ?? 'DEBUG'], message, data);
- const client = this.config.getStorageClient();
+ this.log.logWithLevel(LogLevel[options.level ?? 'DEBUG'], message, data);
+ const client = serviceLocator.getStorageClient();
  if (!client.setStatusMessage) {
  return;
  }
@@ -349,7 +595,7 @@
  message = `Experiencing problems, ${this.stats.state.requestsFailed - previousState.requestsFailed || this.stats.state.requestsFailed} failed requests in the past ${this.statusMessageLoggingInterval} seconds.`;
  }
  else {
- const total = this.requestQueue?.getTotalCount() || this.requestList?.length();
+ const total = this.requestManager?.getTotalCount();
  message = `Crawled ${this.stats.state.requestsFinished}${total ? `/${total}` : ''} pages, ${this.stats.state.requestsFailed} failed requests, desired concurrency ${this.autoscaledPool?.desiredConcurrency ?? 0}.`;
  }
  if (this.statusMessageCallback) {
@@ -389,20 +635,32 @@
  if (this.requestQueue?.name === 'default' && purgeRequestQueue) {
  await this.requestQueue.drop();
  this.requestQueue = await this._getRequestQueue();
+ this.requestManager = undefined;
+ await this.initializeRequestManager();
+ this.handledRequestsCount = 0; // This would've been reset by this._init() further down below, but at that point `handledRequestsCount` could prevent `addRequests` from adding the initial requests
  }
  this.stats.reset();
  await this.stats.resetStore();
- await this.sessionPool?.resetStore();
+ if (this.ownsSessionPool) {
+ await this.sessionPool.resetStore();
+ }
  }
+ this.unexpectedStop = false;
  this.running = true;
- await purgeDefaultStorages({ onlyPurgeOnce: true });
+ this.loggedPerRun.clear();
+ await purgeDefaultStorages({
+ onlyPurgeOnce: true,
+ client: serviceLocator.getStorageClient(),
+ config: serviceLocator.getConfiguration(),
+ });
  if (requests) {
  await this.addRequests(requests, addRequestsOptions);
  }
  await this._init();
  await this.stats.startCapturing();
  const periodicLogger = this.getPeriodicLogger();
- await this.setStatusMessage('Starting the crawler.', { level: 'INFO' });
+ // Don't await, we don't want to block the execution
+ void this.setStatusMessage('Starting the crawler.', { level: 'INFO' });
  const sigintHandler = async () => {
  this.log.warning('Pausing... Press CTRL+C again to force exit. To resume, do: CRAWLEE_PURGE_ON_START=0 npm start');
  await this._pauseOnMigration();
@@ -411,8 +669,9 @@
  // Attach a listener to handle migration and aborting events gracefully.
  const boundPauseOnMigration = this._pauseOnMigration.bind(this);
  process.once('SIGINT', sigintHandler);
- this.events.on("migrating" /* EventType.MIGRATING */, boundPauseOnMigration);
- this.events.on("aborting" /* EventType.ABORTING */, boundPauseOnMigration);
+ const eventManager = serviceLocator.getEventManager();
+ eventManager.on("migrating" /* EventType.MIGRATING */, boundPauseOnMigration);
+ eventManager.on("aborting" /* EventType.ABORTING */, boundPauseOnMigration);
  let stats = {};
  try {
  await this.autoscaledPool.run();
@@ -421,8 +680,8 @@
  await this.teardown();
  await this.stats.stopCapturing();
  process.off('SIGINT', sigintHandler);
- this.events.off("migrating" /* EventType.MIGRATING */, boundPauseOnMigration);
- this.events.off("aborting" /* EventType.ABORTING */, boundPauseOnMigration);
+ eventManager.off("migrating" /* EventType.MIGRATING */, boundPauseOnMigration);
+ eventManager.off("aborting" /* EventType.ABORTING */, boundPauseOnMigration);
  const finalStats = this.stats.calculate();
  stats = {
  requestsFinished: this.stats.state.requestsFinished,
@@ -439,7 +698,7 @@
  mostCommonErrors: this.stats.errorTracker.getMostPopularErrors(3).map(prettify),
  });
  }
- const client = this.config.getStorageClient();
+ const client = serviceLocator.getStorageClient();
  if (client.teardown) {
  let finished = false;
  setTimeout(() => {
@@ -451,7 +710,8 @@
  finished = true;
  }
  periodicLogger.stop();
- await this.setStatusMessage(`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${this.stats.state.requestsFinished} succeeded, ${this.stats.state.requestsFailed} failed.`, { isStatusMessageTerminal: true, level: 'INFO' });
+ // Don't await, we don't want to block the execution
+ void this.setStatusMessage(`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${this.stats.state.requestsFinished} succeeded, ${this.stats.state.requestsFailed} failed.`, { isStatusMessageTerminal: true, level: 'INFO' });
  this.running = false;
  this.hasFinishedBefore = true;
  }
@@ -461,29 +721,75 @@
  * Gracefully stops the current run of the crawler.
  *
  * All the tasks active at the time of calling this method will be allowed to finish.
+ *
+ * To stop the crawler immediately, use {@link BasicCrawler.teardown|`crawler.teardown()`} instead.
  */
- stop(message = 'The crawler has been gracefully stopped.') {
- // Gracefully starve the this.autoscaledPool, so it doesn't start new tasks. Resolves once the pool is cleared.
- this.autoscaledPool
- ?.pause()
- // Resolves the `autoscaledPool.run()` promise in the `BasicCrawler.run()` method. Since the pool is already paused, it resolves immediately and doesn't kill any tasks.
- .then(async () => this.autoscaledPool?.abort())
- .then(() => this.log.info(message))
- .catch((err) => {
- this.log.error('An error occurred when stopping the crawler:', err);
- });
+ stop(reason = 'The crawler has been gracefully stopped.') {
+ if (this.unexpectedStop) {
+ return;
+ }
+ this.log.info(reason);
+ this.unexpectedStop = true;
  }
  async getRequestQueue() {
  if (!this.requestQueue && this.requestList) {
  this.log.warningOnce('When using RequestList and RequestQueue at the same time, you should instantiate both explicitly and provide them in the crawler options, to ensure correctly handled restarts of the crawler.');
  }
- this.requestQueue ??= await this._getRequestQueue();
+ if (!this.requestQueue) {
+ this.requestQueue = await this._getRequestQueue();
+ this.requestManager = undefined;
+ }
+ if (!this.requestManager) {
+ this.requestManager =
+ this.requestList === undefined
+ ? this.requestQueue
+ : new RequestManagerTandem(this.requestList, this.requestQueue);
+ }
  return this.requestQueue;
  }
  async useState(defaultValue = {}) {
- const kvs = await KeyValueStore.open(null, { config: this.config });
+ const kvs = await KeyValueStore.open(null, { config: serviceLocator.getConfiguration() });
+ if (this.hasExplicitId) {
+ const stateKey = `${BasicCrawler.CRAWLEE_STATE_KEY}_${this.crawlerId}`;
+ return kvs.getAutoSavedValue(stateKey, defaultValue);
+ }
+ BasicCrawler.useStateCrawlerIds.add(this.crawlerId);
+ if (BasicCrawler.useStateCrawlerIds.size > 1) {
+ serviceLocator
+ .getLogger()
+ .warningOnce('Multiple crawler instances are calling useState() without an explicit `id` option. \n' +
+ 'This means they will share the same state object, which is likely unintended. \n' +
+ 'To fix this, provide a unique `id` option to each crawler instance. \n' +
+ 'Example: new BasicCrawler({ id: "my-crawler-1", ... })');
+ }
  return kvs.getAutoSavedValue(BasicCrawler.CRAWLEE_STATE_KEY, defaultValue);
  }
+ get pendingRequestCountApproximation() {
+ return this.requestManager?.getPendingCount() ?? 0;
+ }
+ calculateEnqueuedRequestLimit(explicitLimit) {
+ if (this.maxRequestsPerCrawl === undefined) {
+ return explicitLimit;
+ }
+ const limit = Math.max(0, this.maxRequestsPerCrawl - this.handledRequestsCount - this.pendingRequestCountApproximation);
+ return Math.min(limit, explicitLimit ?? Infinity);
+ }
+ async handleSkippedRequest(options) {
+ if (options.reason === 'limit') {
+ this.logOncePerRun('maxRequestsPerCrawl', 'The number of requests enqueued by the crawler reached the maxRequestsPerCrawl limit of ' +
+ `${this.maxRequestsPerCrawl} requests and no further requests will be added.`);
+ }
+ if (options.reason === 'depth') {
+ this.logOncePerRun('maxCrawlDepth', `The crawler reached the maxCrawlDepth limit of ${this.maxCrawlDepth} and no further requests will be enqueued.`);
+ }
+ await this.onSkippedRequest?.(options);
+ }
+ logOncePerRun(key, message) {
+ if (!this.loggedPerRun.has(key)) {
+ this.log.info(message);
+ this.loggedPerRun.add(key);
+ }
+ }
  /**
  * Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
  * adding the rest in background. You can configure the batch size via `batchSize` option and the sleep time in between
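The `useState()` change above is behavioral: with an explicit `id`, each crawler gets its own auto-saved state record instead of the shared `CRAWLEE_STATE` key. A short sketch of the difference (the state shape is illustrative):

    import { BasicCrawler } from '@crawlee/basic';

    const products = new BasicCrawler({ id: 'products', requestHandler: async () => {} });
    const reviews = new BasicCrawler({ id: 'reviews', requestHandler: async () => {} });

    // Persisted under CRAWLEE_STATE_products and CRAWLEE_STATE_reviews
    // respectively, per the stateKey template in useState() above.
    const productState = await products.useState({ counter: 0 });
    const reviewState = await reviews.useState({ counter: 0 });

    // Without an id, both crawlers would share the CRAWLEE_STATE record and
    // the second useState() call would trigger the warningOnce() above.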
@@ -496,46 +802,72 @@
  * @param options Options for the request queue
  */
  async addRequests(requests, options = {}) {
- const requestQueue = await this.getRequestQueue();
- if (!this.respectRobotsTxtFile) {
- return requestQueue.addRequestsBatched(requests, options);
- }
- const allowedRequests = [];
- const skipped = new Set();
- for (const request of requests) {
- const url = typeof request === 'string' ? request : request.url;
- if (await this.isAllowedBasedOnRobotsTxtFile(url)) {
- allowedRequests.push(request);
- }
- else {
- skipped.add(url);
- await this.onSkippedRequest?.({ url, reason: 'robotsTxt' });
+ await this.getRequestQueue();
+ const requestLimit = this.calculateEnqueuedRequestLimit();
+ const skippedBecauseOfRobots = new Set();
+ const skippedBecauseOfLimit = new Set();
+ const skippedBecauseOfMaxCrawlDepth = new Set();
+ const isAllowedBasedOnRobotsTxtFile = this.isAllowedBasedOnRobotsTxtFile.bind(this);
+ const maxCrawlDepth = this.maxCrawlDepth;
+ ow(requests, ow.object
+ .is((value) => isIterable(value) || isAsyncIterable(value))
+ .message((value) => `Expected an iterable or async iterable, got ${getObjectType(value)}`));
+ async function* filteredRequests() {
+ let yieldedRequestCount = 0;
+ for await (const request of requests) {
+ const url = typeof request === 'string' ? request : request.url;
+ if (requestLimit !== undefined && yieldedRequestCount >= requestLimit) {
+ skippedBecauseOfLimit.add(url);
+ continue;
+ }
+ if (maxCrawlDepth !== undefined && request.crawlDepth > maxCrawlDepth) {
+ skippedBecauseOfMaxCrawlDepth.add(url);
+ continue;
+ }
+ if (await isAllowedBasedOnRobotsTxtFile(url)) {
+ yield request;
+ yieldedRequestCount += 1;
+ }
+ else {
+ skippedBecauseOfRobots.add(url);
+ }
  }
  }
- if (skipped.size > 0) {
+ const result = await this.requestManager.addRequestsBatched(filteredRequests(), options);
+ if (skippedBecauseOfRobots.size > 0) {
  this.log.warning(`Some requests were skipped because they were disallowed based on the robots.txt file`, {
- skipped: [...skipped],
+ skipped: [...skippedBecauseOfRobots],
  });
- if (this.onSkippedRequest) {
- await Promise.all([...skipped].map((url) => {
- return this.onSkippedRequest({ url, reason: 'robotsTxt' });
- }));
- }
  }
- return requestQueue.addRequestsBatched(allowedRequests, options);
+ if (skippedBecauseOfRobots.size > 0 ||
+ skippedBecauseOfLimit.size > 0 ||
+ skippedBecauseOfMaxCrawlDepth.size > 0) {
+ await Promise.all([...skippedBecauseOfRobots]
+ .map((url) => {
+ return this.handleSkippedRequest({ url, reason: 'robotsTxt' });
+ })
+ .concat([...skippedBecauseOfLimit].map((url) => {
+ return this.handleSkippedRequest({ url, reason: 'limit' });
+ }), [...skippedBecauseOfMaxCrawlDepth].map((url) => {
+ return this.handleSkippedRequest({ url, reason: 'depth' });
+ })));
+ }
+ return result;
  }
  /**
  * Pushes data to the specified {@link Dataset}, or the default crawler {@link Dataset} by calling {@link Dataset.pushData}.
  */
- async pushData(data, datasetIdOrName) {
- const dataset = await this.getDataset(datasetIdOrName);
+ async pushData(data, datasetIdentifier) {
+ const dataset = await this.getDataset(datasetIdentifier);
  return dataset.pushData(data);
  }
  /**
  * Retrieves the specified {@link Dataset}, or the default crawler {@link Dataset}.
  */
- async getDataset(idOrName) {
- return Dataset.open(idOrName, { config: this.config });
+ async getDataset(identifier) {
+ return Dataset.open(identifier, {
+ config: serviceLocator.getConfiguration(),
+ });
  }
  /**
  * Retrieves data from the default crawler {@link Dataset} by calling {@link Dataset.getData}.
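The filtering generator in `addRequests()` above caps enqueued requests using `calculateEnqueuedRequestLimit()`. The arithmetic is worth spelling out; a standalone restatement with a worked example (the function name and sample numbers are illustrative):

    // Restates calculateEnqueuedRequestLimit() from this diff.
    function enqueuedRequestLimit(
        maxRequestsPerCrawl: number | undefined,
        handled: number,
        pending: number,
        explicitLimit?: number,
    ): number | undefined {
        if (maxRequestsPerCrawl === undefined) return explicitLimit;
        const limit = Math.max(0, maxRequestsPerCrawl - handled - pending);
        return Math.min(limit, explicitLimit ?? Infinity);
    }

    // With maxRequestsPerCrawl = 100, 60 requests handled and ~15 pending,
    // at most min(max(0, 100 - 60 - 15), Infinity) = 25 more requests are
    // accepted; the rest are reported via handleSkippedRequest() with
    // reason 'limit'.
    enqueuedRequestLimit(100, 60, 15); // => 25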
@@ -550,8 +882,9 @@
  */
  async exportData(path, format, options) {
  const supportedFormats = ['json', 'csv'];
- if (!format && path.match(/\.(json|csv)$/i)) {
- format = path.toLowerCase().match(/\.(json|csv)$/)[1];
+ const formatMatch = /\.(json|csv)$/i.exec(path);
+ if (!format && formatMatch) {
+ format = formatMatch[1].toLowerCase();
  }
  if (!format) {
  throw new Error(`Failed to infer format from the path: '${path}'. Supported formats: ${supportedFormats.join(', ')}`);
@@ -562,7 +895,21 @@
  const dataset = await this.getDataset();
  const items = await dataset.export(options);
  if (format === 'csv') {
- const value = stringify([Object.keys(items[0]), ...items.map((item) => Object.values(item))]);
+ let value;
+ if (items.length === 0) {
+ value = '';
+ }
+ else {
+ const keys = options?.collectAllKeys
+ ? Array.from(new Set(items.flatMap(Object.keys)))
+ : Object.keys(items[0]);
+ value = stringify([
+ keys,
+ ...items.map((item) => {
+ return keys.map((k) => item[k]);
+ }),
+ ]);
+ }
  await ensureDir(dirname(path));
  await writeFile(path, value);
  this.log.info(`Export to ${path} finished!`);
@@ -574,32 +921,33 @@
  }
  return items;
  }
+ /**
+ * Initializes the crawler.
+ */
  async _init() {
- if (!this.events.isInitialized()) {
- await this.events.init();
+ const eventManager = serviceLocator.getEventManager();
+ if (!eventManager.isInitialized()) {
+ await eventManager.init();
  this._closeEvents = true;
  }
  // Initialize AutoscaledPool before awaiting _loadHandledRequestCount(),
  // so that the caller can get a reference to it before awaiting the promise returned from run()
  // (otherwise there would be no way)
- this.autoscaledPool = new AutoscaledPool(this.autoscaledPoolOptions, this.config);
- if (this.useSessionPool) {
- this.sessionPool = await SessionPool.open(this.sessionPoolOptions, this.config);
- // Assuming there are not more than 20 browsers running at once;
- this.sessionPool.setMaxListeners(20);
- }
+ this.autoscaledPool = new AutoscaledPool(this.autoscaledPoolOptions);
+ await this.initializeRequestManager();
  await this._loadHandledRequestCount();
  }
- async _runRequestHandler(crawlingContext) {
- await this.requestHandler(crawlingContext);
+ async runRequestHandler(crawlingContext) {
+ await addTimeoutToPromise(async () => this.requestHandler(crawlingContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${crawlingContext.request.id}).`);
  }
  /**
  * Handles blocked request
  */
- _throwOnBlockedRequest(session, statusCode) {
- const isBlocked = session.retireOnBlockedStatusCodes(statusCode);
- if (isBlocked) {
- throw new Error(`Request blocked - received ${statusCode} status code.`);
+ _throwOnBlockedRequest(statusCode) {
+ if (this.retryOnBlocked)
+ return;
+ if (this.blockedStatusCodes.has(statusCode)) {
+ throw new SessionError(`Request blocked - received ${statusCode} status code.`);
  }
  }
  async isAllowedBasedOnRobotsTxtFile(url) {
@@ -607,7 +955,8 @@
  return true;
  }
  const robotsTxtFile = await this.getRobotsTxtFileForUrl(url);
- return !robotsTxtFile || robotsTxtFile.isAllowed(url);
+ const userAgent = typeof this.respectRobotsTxtFile === 'object' ? this.respectRobotsTxtFile?.userAgent : '*';
+ return !robotsTxtFile || robotsTxtFile.isAllowed(url, userAgent);
  }
  async getRobotsTxtFileForUrl(url) {
  if (!this.respectRobotsTxtFile) {
@@ -619,7 +968,7 @@
  if (cachedRobotsTxtFile) {
  return cachedRobotsTxtFile;
  }
- const robotsTxtFile = await RobotsTxtFile.find(url);
+ const robotsTxtFile = await RobotsTxtFile.find(url, { logger: this.log });
  this.robotsTxtFileCache.add(origin, robotsTxtFile);
  return robotsTxtFile;
  }
@@ -661,36 +1010,36 @@
  await Promise.all([requestListPersistPromise, this.stats.persistState()]);
  }
  /**
- * Fetches request from either RequestList or RequestQueue. If request comes from a RequestList
- * and RequestQueue is present then enqueues it to the queue first.
+ * Initializes the RequestManager based on the configured requestList and requestQueue.
  */
- async _fetchNextRequest() {
- if (!this.requestList || (await this.requestList.isFinished())) {
- return this.requestQueue?.fetchNextRequest();
- }
- const request = await this.requestList.fetchNextRequest();
- if (!this.requestQueue)
- return request;
- if (!request)
- return this.requestQueue.fetchNextRequest();
- try {
- await this.requestQueue.addRequest(request, { forefront: true });
+ async initializeRequestManager() {
+ if (this.requestManager !== undefined) {
+ return;
  }
- catch (err) {
- // If requestQueue.addRequest() fails here then we must reclaim it back to
- // the RequestList because probably it's not yet in the queue!
- this.log.error('Adding of request from the RequestList to the RequestQueue failed, reclaiming request back to the list.', { request });
- await this.requestList.reclaimRequest(request);
- return null;
- }
- await this.requestList.markRequestHandled(request);
- return this.requestQueue.fetchNextRequest();
+ if (this.requestList && this.requestQueue) {
+ // Create a RequestManagerTandem if both RequestList and RequestQueue are provided
+ this.requestManager = new RequestManagerTandem(this.requestList, this.requestQueue);
+ }
+ else if (this.requestQueue) {
+ // Use RequestQueue directly if only it is provided
+ this.requestManager = this.requestQueue;
+ }
+ else if (this.requestList) {
+ // Use RequestList directly if only it is provided
+ // Make it compatible with the IRequestManager interface
+ this.requestManager = new RequestListAdapter(this.requestList);
+ }
+ // If neither RequestList nor RequestQueue is provided, leave the requestManager uninitialized until `getRequestQueue` is called
  }
  /**
- * Executed when `errorHandler` finishes or the request is successful.
- * Can be used to clean up orphaned browser pages.
+ * Fetches the next request to process from the underlying request provider.
  */
- async _cleanupContext(_crawlingContext) { }
+ async _fetchNextRequest() {
+ if (this.requestManager === undefined) {
+ throw new Error(`_fetchNextRequest called on an uninitialized crawler`);
+ }
+ return this.requestManager.fetchNextRequest();
+ }
  /**
  * Delays processing of the request based on the `sameDomainDelaySecs` option,
  * adding it back to the queue after the timeout passes. Returns `true` if the request
@@ -723,112 +1072,54 @@ export class BasicCrawler {
723
1072
  }, delay);
724
1073
  return true;
725
1074
  }
726
- /**
727
- * Wrapper around requestHandler that fetches requests from RequestList/RequestQueue
728
- * then retries them in a case of an error, etc.
729
- */
730
- async _runTaskFunction() {
731
- const source = this.requestQueue || this.requestList || (await this.getRequestQueue());
732
- let request;
733
- let session;
734
- await this._timeoutAndRetry(async () => {
735
- request = await this._fetchNextRequest();
736
- }, this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
737
- tryCancel();
738
- if (this.useSessionPool) {
739
- await this._timeoutAndRetry(async () => {
740
- session = await this.sessionPool.getSession();
741
- }, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
742
- }
743
- tryCancel();
744
- if (!request || this.delayRequest(request, source)) {
745
- return;
746
- }
747
- if (!(await this.isAllowedBasedOnRobotsTxtFile(request.url))) {
748
- this.log.warning(`Skipping request ${request.url} (${request.id}) because it is disallowed based on robots.txt`);
749
- request.state = RequestState.SKIPPED;
750
- request.noRetry = true;
751
- await source.markRequestHandled(request);
752
- await this.onSkippedRequest?.({
753
- url: request.url,
754
- reason: 'robotsTxt',
755
- });
756
- return;
757
- }
758
- // Reset loadedUrl so an old one is not carried over to retries.
759
- request.loadedUrl = undefined;
1075
+ /** Handles a single request - runs the request handler with retries, error handling, and lifecycle management. */
1076
+ async handleRequest(crawlingContext, requestSource, request) {
760
1077
  const statisticsId = request.id || request.uniqueKey;
761
1078
  this.stats.startJob(statisticsId);
762
- // Shared crawling context
763
- // @ts-expect-error
764
- // All missing properties (that extend CrawlingContext) are set dynamically,
765
- // but TS does not know that, so otherwise it would throw when compiling.
766
- const crawlingContext = {
767
- id: cryptoRandomObjectId(10),
768
- crawler: this,
769
- log: this.log,
770
- request,
771
- session,
772
- enqueueLinks: async (options) => {
773
- return enqueueLinks({
774
- // specify the RQ first to allow overriding it
775
- requestQueue: await this.getRequestQueue(),
776
- robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
777
- onSkippedRequest: this.onSkippedRequest,
778
- ...options,
779
- });
780
- },
781
- addRequests: this.addRequests.bind(this),
782
- pushData: this.pushData.bind(this),
783
- useState: this.useState.bind(this),
784
- sendRequest: createSendRequest(this.httpClient, request, session, () => crawlingContext.proxyInfo?.url),
785
- getKeyValueStore: async (idOrName) => KeyValueStore.open(idOrName, { config: this.config }),
786
- };
787
- this.crawlingContexts.set(crawlingContext.id, crawlingContext);
788
1079
  let isRequestLocked = true;
789
1080
  try {
790
1081
  request.state = RequestState.REQUEST_HANDLER;
791
- await addTimeoutToPromise(async () => this._runRequestHandler(crawlingContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${request.id}).`);
792
- await this._timeoutAndRetry(async () => source.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
1082
+ await this.runRequestHandler(crawlingContext);
1083
+ await this._timeoutAndRetry(async () => requestSource.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
793
1084
  isRequestLocked = false; // markRequestHandled succeeded and unlocked the request
794
1085
  this.stats.finishJob(statisticsId, request.retryCount);
795
1086
  this.handledRequestsCount++;
796
1087
  // reclaim session if request finishes successfully
797
1088
  request.state = RequestState.DONE;
798
- crawlingContext.session?.markGood();
1089
+ crawlingContext.session.markGood();
799
1090
  }
800
- catch (err) {
1091
+ catch (rawError) {
1092
+ const err = this.unwrapError(rawError);
801
1093
  try {
802
1094
  request.state = RequestState.ERROR_HANDLER;
803
- await addTimeoutToPromise(async () => this._requestFunctionErrorHandler(err, crawlingContext, source), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
1095
+ await addTimeoutToPromise(async () => this._requestFunctionErrorHandler(err, crawlingContext, request, requestSource), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
804
1096
  if (!(err instanceof CriticalError)) {
805
1097
  isRequestLocked = false; // _requestFunctionErrorHandler calls either markRequestHandled or reclaimRequest
806
1098
  }
807
1099
  request.state = RequestState.DONE;
808
1100
  }
809
1101
  catch (secondaryError) {
810
- if (!secondaryError.triggeredFromUserHandler &&
1102
+ const unwrappedSecondaryError = this.unwrapError(secondaryError);
1103
+ if (!unwrappedSecondaryError.triggeredFromUserHandler &&
811
1104
  // avoid reprinting the same critical error multiple times, as it will be printed by Node.js at the end anyway
812
- !(secondaryError instanceof CriticalError)) {
1105
+ !(unwrappedSecondaryError instanceof CriticalError)) {
813
1106
  const apifySpecific = process.env.APIFY_IS_AT_HOME
814
1107
  ? `This may have happened due to an internal error of Apify's API or due to a misconfigured crawler.`
815
1108
  : '';
816
- this.log.exception(secondaryError, 'An exception occurred during handling of failed request. ' +
1109
+ this.log.exception(unwrappedSecondaryError, 'An exception occurred during handling of failed request. ' +
817
1110
  `This places the crawler and its underlying storages into an unknown state and crawling will be terminated. ${apifySpecific}`);
818
1111
  }
819
1112
  request.state = RequestState.ERROR;
820
- throw secondaryError;
1113
+ throw unwrappedSecondaryError;
821
1114
  }
822
1115
  // decrease the session score if the request fails (but the error handler did not throw)
823
- crawlingContext.session?.markBad();
1116
+ crawlingContext.session.markBad();
824
1117
  }
825
1118
  finally {
826
- await this._cleanupContext(crawlingContext);
827
- this.crawlingContexts.delete(crawlingContext.id);
828
1119
  // Safety net - release the lock if nobody managed to do it before
829
- if (isRequestLocked && source instanceof RequestProvider) {
1120
+ if (isRequestLocked && requestSource instanceof RequestProvider) {
830
1121
  try {
831
- await source.client.deleteRequestLock(request.id);
1122
+ await requestSource.client.deleteRequestLock(request.id);
832
1123
  }
833
1124
  catch {
834
1125
  // We don't have the lock, or the request was never locked. Either way, it's fine.
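Context note: the removed `_runTaskFunction` built the crawling context inline; in the new code the context arrives via the `ContextPipeline`, and `handleRequest` only deals with retries, statistics and lock management. The helpers exposed to user code stay conceptually the same. A hedged sketch from the handler's side (field names taken from the removed context literal above; exact beta shapes may differ):

import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    async requestHandler({ request, sendRequest, pushData, useState }) {
        // sendRequest goes through the crawler's HTTP client, session cookies and proxy.
        const { body } = await sendRequest({ url: request.url });
        // pushData appends to the default Dataset.
        await pushData({ url: request.url, bytes: body.length });
        // useState returns a persisted, crawler-wide mutable state object.
        const state = await useState({ pages: 0 });
        state.pages += 1;
    },
});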
@@ -837,19 +1128,75 @@ export class BasicCrawler {
837
1128
  }
838
1129
  }
839
1130
  /**
840
- * Run async callback with given timeout and retry.
1131
+ * Wrapper around the crawling context's `enqueueLinks` method:
1132
+ * - Injects `crawlDepth` into each request being added, based on the crawling context's request.
1133
+ * - Provides defaults for the `enqueueLinks` options based on the crawler configuration.
1134
+ * - These options can be overridden by the user.
1135
+ * @internal
1136
+ */
1137
+ async enqueueLinksWithCrawlDepth(options, request, requestQueue) {
1138
+ const transformRequestFunctionWrapper = (requestOptions) => {
1139
+ requestOptions.crawlDepth = request.crawlDepth + 1;
1140
+ if (this.maxCrawlDepth !== undefined && requestOptions.crawlDepth > this.maxCrawlDepth) {
1141
+ // Setting `skippedReason` before returning `false` ensures that `reportSkippedRequests`
1142
+ // reports `'depth'` as the reason (via `request.skippedReason ?? reason` fallback),
1143
+ // rather than the generic `'transform'` reason.
1144
+ requestOptions.skippedReason = 'depth';
1145
+ return false;
1146
+ }
1147
+ // After injecting the crawlDepth, we call the user-provided transform function, if there is one.
1148
+ return options.transformRequestFunction?.(requestOptions) ?? requestOptions;
1149
+ };
1150
+ // Create a request-scoped callback that logs enqueueLimit once per request handler call
1151
+ // Only log if an explicit limit was passed to enqueueLinks (not the internal maxRequestsPerCrawl-derived limit)
1152
+ let loggedEnqueueLimitForThisRequest = false;
1153
+ const onSkippedRequest = async (skippedOptions) => {
1154
+ if (skippedOptions.reason === 'enqueueLimit') {
1155
+ if (!loggedEnqueueLimitForThisRequest && options.limit !== undefined) {
1156
+ this.log.info(`Skipping URLs in the handler for ${request.url} due to the enqueueLinks limit of ${options.limit}.`);
1157
+ loggedEnqueueLimitForThisRequest = true;
1158
+ }
1159
+ }
1160
+ await this.handleSkippedRequest(skippedOptions);
1161
+ };
1162
+ return await enqueueLinks({
1163
+ requestQueue,
1164
+ robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
1165
+ onSkippedRequest,
1166
+ limit: this.calculateEnqueuedRequestLimit(options.limit),
1167
+ // Allow user options to override defaults set above ⤴
1168
+ ...options,
1169
+ transformRequestFunction: transformRequestFunctionWrapper,
1170
+ });
1171
+ }
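This wrapper is what gives `maxCrawlDepth` its semantics: the depth is injected before the user transform runs, and over-deep requests are skipped with reason 'depth'. A sketch, assuming the public option mirrors the internal `maxCrawlDepth` field and that start requests default to depth 0:

const crawler = new BasicCrawler({
    maxCrawlDepth: 2,
    async requestHandler({ request, enqueueLinks, log }) {
        log.info(`Depth ${request.crawlDepth}: ${request.url}`);
        await enqueueLinks({
            urls: ['https://example.com/next'], // BasicCrawler has no parsed page, so URLs are explicit
            // A user-supplied transform still runs after the depth injection above.
            transformRequestFunction: (req) => req,
        });
    },
});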
1172
+ /**
1173
+ * Generator that yields the given requests, injecting the given crawl depth where one is not already set.
1174
+ * @internal
1175
+ */
1176
+ async *addCrawlDepthRequestGenerator(requests, newRequestDepth) {
1177
+ for await (const request of requests) {
1178
+ if (typeof request === 'string') {
1179
+ yield { url: request, crawlDepth: newRequestDepth };
1180
+ }
1181
+ else {
1182
+ request.crawlDepth ??= newRequestDepth;
1183
+ yield request;
1184
+ }
1185
+ }
1186
+ }
1187
+ /**
1188
+ * Run async callback with given timeout and retry. Returns the result of the callback.
841
1189
  * @ignore
842
1190
  */
843
1191
  async _timeoutAndRetry(handler, timeout, error, maxRetries = 3, retried = 1) {
844
1192
  try {
845
- await addTimeoutToPromise(handler, timeout, error);
1193
+ return await addTimeoutToPromise(handler, timeout, error);
846
1194
  }
847
1195
  catch (e) {
848
1196
  if (retried <= maxRetries) {
849
1197
  // we retry on any error, not just timeout
850
1198
  this.log.warning(`${e.message} (retrying ${retried}/${maxRetries})`);
851
- void this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1);
852
- return;
1199
+ return this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1);
853
1200
  }
854
1201
  throw e;
855
1202
  }
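Worth calling out: the old `_timeoutAndRetry` fired the retry with `void` and returned immediately, so callers resumed before the retry finished and its result was lost; the new version awaits the recursive call and propagates the result. A standalone sketch of the corrected pattern:

import { addTimeoutToPromise } from '@apify/timeout';

// Retries the handler on any error (not just timeouts), up to maxRetries,
// and, unlike the old fire-and-forget version, returns the handler's result.
async function timeoutAndRetry(handler, timeoutMillis, message, maxRetries = 3, attempt = 1) {
    try {
        return await addTimeoutToPromise(handler, timeoutMillis, message);
    } catch (error) {
        if (attempt <= maxRetries) {
            return timeoutAndRetry(handler, timeoutMillis, message, maxRetries, attempt + 1);
        }
        throw error;
    }
}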
@@ -858,36 +1205,38 @@ export class BasicCrawler {
858
1205
  * Returns true if either RequestList or RequestQueue have a request ready for processing.
859
1206
  */
860
1207
  async _isTaskReadyFunction() {
861
- // First check RequestList, since it's only in memory.
862
- const isRequestListEmpty = this.requestList ? await this.requestList.isEmpty() : true;
863
- // If RequestList is not empty, task is ready, no reason to check RequestQueue.
864
- if (!isRequestListEmpty)
865
- return true;
866
- // If RequestQueue is not empty, task is ready, return true, otherwise false.
867
- return this.requestQueue ? !(await this.requestQueue.isEmpty()) : false;
1208
+ return this.requestManager !== undefined && !(await this.requestManager.isEmpty());
868
1209
  }
869
1210
  /**
870
1211
  * Returns true if both RequestList and RequestQueue have all requests finished.
871
1212
  */
872
1213
  async _defaultIsFinishedFunction() {
873
- const [isRequestListFinished, isRequestQueueFinished] = await Promise.all([
874
- this.requestList ? this.requestList.isFinished() : true,
875
- this.requestQueue ? this.requestQueue.isFinished() : true,
876
- ]);
877
- // If both are finished, return true, otherwise return false.
878
- return isRequestListFinished && isRequestQueueFinished;
1214
+ return !this.requestManager || (await this.requestManager.isFinished());
879
1215
  }
880
1216
  async _rotateSession(crawlingContext) {
881
1217
  const { request } = crawlingContext;
882
1218
  request.sessionRotationCount ??= 0;
883
1219
  request.sessionRotationCount++;
884
- crawlingContext.session?.retire();
1220
+ crawlingContext.session.retire();
1221
+ }
1222
+ /**
1223
+ * Unwraps errors thrown by the context pipeline to get the actual user error.
1224
+ * RequestHandlerError and ContextPipelineInitializationError wrap the actual error.
1225
+ */
1226
+ unwrapError(error) {
1227
+ if (error instanceof RequestHandlerError ||
1228
+ error instanceof ContextPipelineInitializationError ||
1229
+ error instanceof ContextPipelineCleanupError) {
1230
+ return this.unwrapError(error.cause);
1231
+ }
1232
+ return error;
885
1233
  }
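The unwrapping matters because the context pipeline wraps user errors, so the `instanceof` checks driving retries (CriticalError, SessionError, and so on) would otherwise always miss. An equivalent standalone sketch of the same cause-chain walk:

import { ContextPipelineCleanupError, ContextPipelineInitializationError, RequestHandlerError } from '@crawlee/core';

// Recursively strips the known pipeline wrappers to reach the user's original error.
function unwrap(error) {
    const wrappers = [RequestHandlerError, ContextPipelineInitializationError, ContextPipelineCleanupError];
    return wrappers.some((W) => error instanceof W) ? unwrap(error.cause) : error;
}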
886
1234
  /**
887
1235
  * Handles errors thrown by the user-provided requestHandler()
1236
+ *
1237
+ * @param request The request object, passed separately to bypass any dynamic getter logic behind crawlingContext.request
888
1238
  */
889
- async _requestFunctionErrorHandler(error, crawlingContext, source) {
890
- const { request } = crawlingContext;
1239
+ async _requestFunctionErrorHandler(error, crawlingContext, request, source) {
891
1240
  request.pushErrorMessage(error);
892
1241
  if (error instanceof CriticalError) {
893
1242
  throw error;
@@ -895,12 +1244,15 @@ export class BasicCrawler {
895
1244
  const shouldRetryRequest = this._canRequestBeRetried(request, error);
896
1245
  if (shouldRetryRequest) {
897
1246
  await this.stats.errorTrackerRetry.addAsync(error, crawlingContext);
898
- await this.errorHandler?.(crawlingContext, error);
1247
+ await this.errorHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
1248
+ error);
899
1249
  if (error instanceof SessionError) {
900
1250
  await this._rotateSession(crawlingContext);
901
1251
  }
902
1252
  if (!request.noRetry) {
903
- request.retryCount++;
1253
+ if (!(error instanceof SessionError)) {
1254
+ request.retryCount++;
1255
+ }
904
1256
  const { url, retryCount, id } = request;
905
1257
  // We don't want to see the stack trace in the logs by default, when we are going to retry the request.
906
1258
  // Thus, we print the full stack trace only when CRAWLEE_VERBOSE_LOG environment variable is set to true.
@@ -914,6 +1266,9 @@ export class BasicCrawler {
914
1266
  return;
915
1267
  }
916
1268
  }
1269
+ if (error instanceof SessionError) {
1270
+ crawlingContext.session?.retire();
1271
+ }
917
1272
  // If the request is non-retryable, the error and snapshot aren't saved in the errorTrackerRetry object.
918
1273
  // Therefore, we pass the crawlingContext to the errorTracker.add method, enabling snapshot capture.
919
1274
  // This is to make sure the error snapshot is not duplicated in the errorTrackerRetry and errorTracker objects.
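Behavioral change worth noting: a `SessionError` now rotates the session without incrementing `request.retryCount`, so being blocked no longer consumes ordinary retries; rotations are counted separately via `request.sessionRotationCount` (see `_rotateSession` above). A sketch of the two budgets, assuming the option names carry over from stable Crawlee:

const crawler = new BasicCrawler({
    useSessionPool: true,
    maxRequestRetries: 3,    // consumed by ordinary handler errors
    maxSessionRotations: 10, // consumed by SessionError-driven session rotations
    async requestHandler({ session }) {
        // Throwing a SessionError here retires the session and re-enqueues the
        // request without touching request.retryCount.
    },
});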
@@ -947,7 +1302,8 @@ export class BasicCrawler {
947
1302
  const message = this._getMessageFromError(error, true);
948
1303
  this.log.error(`Request failed and reached maximum retries. ${message}`, { id, url, method, uniqueKey });
949
1304
  if (this.failedRequestHandler) {
950
- await this.failedRequestHandler?.(crawlingContext, error);
1305
+ await this.failedRequestHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
1306
+ error);
951
1307
  }
952
1308
  }
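For reference, the two hooks invoked by this error path are regular crawler options. A minimal sketch (option names as in stable Crawlee):

const crawler = new BasicCrawler({
    maxRequestRetries: 2,
    async requestHandler({ request }) {
        throw new Error(`Handler failed for ${request.url}`);
    },
    // Runs before each retry (called from _requestFunctionErrorHandler above).
    errorHandler({ request, log }, error) {
        log.warning(`Attempt ${request.retryCount + 1} failed: ${error.message}`);
    },
    // Runs once retries are exhausted.
    failedRequestHandler({ request, log }, error) {
        log.error(`Request ${request.url} failed permanently: ${error.message}`);
    },
});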
953
1309
  /**
@@ -985,19 +1341,11 @@ export class BasicCrawler {
985
1341
  return request.retryCount < maxRequestRetries;
986
1342
  }
987
1343
  /**
988
- * Updates handledRequestsCount from possibly stored counts,
989
- * usually after worker migration. Since one of the stores
990
- * needs to have priority when both are present,
991
- * it is the request queue, because generally, the request
992
- * list will first be dumped into the queue and then left
993
- * empty.
1344
+ * Updates handledRequestsCount from possibly stored counts, usually after worker migration.
994
1345
  */
995
1346
  async _loadHandledRequestCount() {
996
- if (this.requestQueue) {
997
- this.handledRequestsCount = await this.requestQueue.handledCount();
998
- }
999
- else if (this.requestList) {
1000
- this.handledRequestsCount = this.requestList.handledCount();
1347
+ if (this.requestManager) {
1348
+ this.handledRequestsCount = await this.requestManager.handledCount();
1001
1349
  }
1002
1350
  }
1003
1351
  async _executeHooks(hooks, ...args) {
@@ -1008,16 +1356,19 @@ export class BasicCrawler {
1008
1356
  }
1009
1357
  }
1010
1358
  /**
1011
- * Function for cleaning up after all request are processed.
1012
- * @ignore
1359
+ * Stops the crawler immediately.
1360
+ *
1361
+ * This method doesn't wait for currently active requests to finish.
1362
+ *
1363
+ * To stop the crawler gracefully (waiting for all running requests to finish), use {@link BasicCrawler.stop|`crawler.stop()`} instead.
1013
1364
  */
1014
1365
  async teardown() {
1015
- this.events.emit("persistState" /* EventType.PERSIST_STATE */, { isMigrating: false });
1016
- if (this.useSessionPool) {
1017
- await this.sessionPool.teardown();
1018
- }
1366
+ serviceLocator.getEventManager().emit("persistState" /* EventType.PERSIST_STATE */, { isMigrating: false });
1019
1367
  if (this._closeEvents) {
1020
- await this.events.close();
1368
+ await serviceLocator.getEventManager().close();
1369
+ }
1370
+ if (this.ownsSessionPool) {
1371
+ await this.sessionPool.teardown();
1021
1372
  }
1022
1373
  await this.autoscaledPool?.abort();
1023
1374
  }
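The reworded doc comment draws the stop/teardown distinction; a sketch of both shutdown paths (timings illustrative):

const runPromise = crawler.run(['https://example.com']);

// Graceful: stop accepting new requests, let in-flight ones finish, then run() resolves.
setTimeout(() => crawler.stop(), 60_000);

// Immediate alternative: teardown() persists state, closes events and aborts the pool right away.
// setTimeout(() => crawler.teardown(), 60_000);

await runPromise;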
@@ -1030,16 +1381,30 @@ export class BasicCrawler {
1030
1381
  }
1031
1382
  async _getRequestQueue() {
1032
1383
  // Check if it's explicitly disabled
1384
+ // oxlint-disable-next-line typescript/no-deprecated -- still honored for opt-out until the flag is removed
1033
1385
  if (this.experiments.requestLocking === false) {
1386
+ // oxlint-disable-next-line typescript/no-deprecated
1034
1387
  if (!this._experimentWarnings.requestLocking) {
1035
1388
  this.log.info('Using the old RequestQueue implementation without request locking.');
1389
+ // oxlint-disable-next-line typescript/no-deprecated
1036
1390
  this._experimentWarnings.requestLocking = true;
1037
1391
  }
1038
- return RequestQueueV1.open(null, { config: this.config });
1392
+ return RequestQueueV1.open(null, { config: serviceLocator.getConfiguration() });
1039
1393
  }
1040
- return RequestQueue.open(null, { config: this.config });
1394
+ return RequestQueue.open(null, { config: serviceLocator.getConfiguration() });
1041
1395
  }
1042
1396
  requestMatchesEnqueueStrategy(request) {
1397
+ // If `skipNavigation` was used, just return `true`
1398
+ try {
1399
+ // eslint-disable-next-line @typescript-eslint/no-unused-expressions
1400
+ request.loadedUrl;
1401
+ }
1402
+ catch (err) {
1403
+ if (err instanceof NavigationSkippedError) {
1404
+ return true;
1405
+ }
1406
+ throw err;
1407
+ }
1043
1408
  const { url, loadedUrl } = request;
1044
1409
  // eslint-disable-next-line dot-notation -- private access
1045
1410
  const strategy = request['enqueueStrategy'];