@crawlee/basic 4.0.0-beta.3 → 4.0.0-beta.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,8 @@
1
1
  import { writeFile } from 'node:fs/promises';
2
2
  import { dirname } from 'node:path';
3
- import { AutoscaledPool, Configuration, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
4
- import { RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
3
+ import { AutoscaledPool, bindMethodsToServiceLocator, ContextPipeline, ContextPipelineCleanupError, ContextPipelineInitializationError, ContextPipelineInterruptedError, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestHandlerError, RequestListAdapter, RequestManagerTandem, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, ServiceLocator, serviceLocator, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
4
+ import { GotScrapingHttpClient } from '@crawlee/got-scraping-client';
5
+ import { getObjectType, isAsyncIterable, isIterable, RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
5
6
  import { stringify } from 'csv-stringify/sync';
6
7
  import { ensureDir, writeJSON } from 'fs-extra/esm';
7
8
  import ow from 'ow';
@@ -86,8 +87,12 @@ const SAFE_MIGRATION_WAIT_MILLIS = 20000;
86
87
  * @category Crawlers
87
88
  */
88
89
  export class BasicCrawler {
89
- config;
90
90
  static CRAWLEE_STATE_KEY = 'CRAWLEE_STATE';
91
+ /**
92
+ * Tracks crawler instances that accessed shared state without having an explicit id.
93
+ * Used to detect and warn about multiple crawlers sharing the same state.
94
+ */
95
+ static useStateCrawlerIds = new Set();
91
96
  /**
92
97
  * A reference to the underlying {@link Statistics} class that collects and logs run statistics for requests.
93
98
  */
@@ -103,6 +108,10 @@ export class BasicCrawler {
103
108
  * Only available if used by the crawler.
104
109
  */
105
110
  requestQueue;
111
+ /**
112
+ * The main request-handling component of the crawler. It's initialized during the crawler startup.
113
+ */
114
+ requestManager;
106
115
  /**
107
116
  * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session|sessions}.
108
117
  * Only available if used by the crawler.
@@ -116,11 +125,24 @@ export class BasicCrawler {
116
125
  * or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
117
126
  */
118
127
  autoscaledPool;
128
+ /**
129
+ * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
130
+ * Only available if used by the crawler.
131
+ */
132
+ proxyConfiguration;
119
133
  /**
120
134
  * Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
121
135
  * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
122
136
  */
123
137
  router = Router.create();
138
+ contextPipelineBuilder;
139
+ _contextPipeline;
140
+ get contextPipeline() {
141
+ if (this._contextPipeline === undefined) {
142
+ this._contextPipeline = this.contextPipelineBuilder();
143
+ }
144
+ return this._contextPipeline;
145
+ }
124
146
  running = false;
125
147
  hasFinishedBefore = false;
126
148
  log;
@@ -130,26 +152,32 @@ export class BasicCrawler {
130
152
  requestHandlerTimeoutMillis;
131
153
  internalTimeoutMillis;
132
154
  maxRequestRetries;
155
+ maxCrawlDepth;
133
156
  sameDomainDelayMillis;
134
157
  domainAccessedTime;
135
158
  maxSessionRotations;
136
- handledRequestsCount;
159
+ maxRequestsPerCrawl;
160
+ handledRequestsCount = 0;
137
161
  statusMessageLoggingInterval;
138
162
  statusMessageCallback;
139
163
  sessionPoolOptions;
140
164
  useSessionPool;
141
- crawlingContexts = new Map();
142
165
  autoscaledPoolOptions;
143
- events;
144
166
  httpClient;
145
167
  retryOnBlocked;
146
168
  respectRobotsTxtFile;
147
169
  onSkippedRequest;
148
170
  _closeEvents;
171
+ shouldLogMaxProcessedRequestsExceeded = true;
172
+ shouldLogMaxEnqueuedRequestsExceeded = true;
149
173
  experiments;
150
174
  robotsTxtFileCache;
151
175
  _experimentWarnings = {};
176
+ crawlerId;
177
+ hasExplicitId;
152
178
  static optionsShape = {
179
+ contextPipelineBuilder: ow.optional.object,
180
+ extendContext: ow.optional.function,
153
181
  requestList: ow.optional.object.validate(validators.requestList),
154
182
  requestQueue: ow.optional.object.validate(validators.requestQueue),
155
183
  // Subclasses override this function instead of passing it
@@ -163,15 +191,20 @@ export class BasicCrawler {
163
191
  sameDomainDelaySecs: ow.optional.number,
164
192
  maxSessionRotations: ow.optional.number,
165
193
  maxRequestsPerCrawl: ow.optional.number,
194
+ maxCrawlDepth: ow.optional.number,
166
195
  autoscaledPoolOptions: ow.optional.object,
167
196
  sessionPoolOptions: ow.optional.object,
168
197
  useSessionPool: ow.optional.boolean,
198
+ proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
169
199
  statusMessageLoggingInterval: ow.optional.number,
170
200
  statusMessageCallback: ow.optional.function,
171
201
  retryOnBlocked: ow.optional.boolean,
172
- respectRobotsTxtFile: ow.optional.boolean,
202
+ respectRobotsTxtFile: ow.optional.any(ow.boolean, ow.object),
173
203
  onSkippedRequest: ow.optional.function,
174
204
  httpClient: ow.optional.object,
205
+ configuration: ow.optional.object,
206
+ storageClient: ow.optional.object,
207
+ eventManager: ow.optional.object,
175
208
  // AutoscaledPool shorthands
176
209
  minConcurrency: ow.optional.number,
177
210
  maxConcurrency: ow.optional.number,
@@ -181,126 +214,185 @@ export class BasicCrawler {
181
214
  log: ow.optional.object,
182
215
  experiments: ow.optional.object,
183
216
  statisticsOptions: ow.optional.object,
217
+ id: ow.optional.string,
184
218
  };
185
219
  /**
186
220
  * All `BasicCrawler` parameters are passed via an options object.
187
221
  */
188
- constructor(options = {}, config = Configuration.getGlobalConfig()) {
189
- this.config = config;
222
+ constructor(options = {}) {
190
223
  ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape));
191
- const { requestList, requestQueue, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true,
224
+ const { requestList, requestQueue, requestManager, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, maxCrawlDepth, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true, proxyConfiguration,
225
+ // Service locator options
226
+ configuration, storageClient, eventManager,
192
227
  // AutoscaledPool shorthands
193
228
  minConcurrency, maxConcurrency, maxRequestsPerMinute, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
194
229
  // internal
195
- log = defaultLog.child({ prefix: this.constructor.name }), experiments = {}, } = options;
196
- this.requestList = requestList;
197
- this.requestQueue = requestQueue;
198
- this.httpClient = httpClient ?? new GotScrapingHttpClient();
199
- this.log = log;
200
- this.statusMessageLoggingInterval = statusMessageLoggingInterval;
201
- this.statusMessageCallback = statusMessageCallback;
202
- this.events = config.getEventManager();
203
- this.domainAccessedTime = new Map();
204
- this.experiments = experiments;
205
- this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });
206
- // FIXME any
207
- this.requestHandler = requestHandler ?? this.router;
208
- this.failedRequestHandler = failedRequestHandler;
209
- this.errorHandler = errorHandler;
210
- if (requestHandlerTimeoutSecs) {
211
- this.requestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
212
- }
213
- else {
214
- this.requestHandlerTimeoutMillis = 60_000;
215
- }
216
- this.retryOnBlocked = retryOnBlocked;
217
- this.respectRobotsTxtFile = respectRobotsTxtFile;
218
- this.onSkippedRequest = onSkippedRequest;
219
- const tryEnv = (val) => (val == null ? null : +val);
220
- // allow at least 5min for internal timeouts
221
- this.internalTimeoutMillis =
222
- tryEnv(process.env.CRAWLEE_INTERNAL_TIMEOUT) ?? Math.max(this.requestHandlerTimeoutMillis * 2, 300e3);
223
- // override the default internal timeout of request queue to respect `requestHandlerTimeoutMillis`
224
- if (this.requestQueue) {
225
- this.requestQueue.internalTimeoutMillis = this.internalTimeoutMillis;
226
- // for request queue v2, we want to lock requests for slightly longer than the request handler timeout so that there is some padding for locking-related overhead,
227
- // but never for less than a minute
228
- this.requestQueue.requestLockSecs = Math.max(this.requestHandlerTimeoutMillis / 1000 + 5, 60);
229
- }
230
- this.maxRequestRetries = maxRequestRetries;
231
- this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
232
- this.maxSessionRotations = maxSessionRotations;
233
- this.handledRequestsCount = 0;
234
- this.stats = new Statistics({
235
- logMessage: `${log.getOptions().prefix} request statistics:`,
236
- log,
237
- config,
238
- ...statisticsOptions,
239
- });
240
- this.sessionPoolOptions = {
241
- ...sessionPoolOptions,
242
- log,
243
- };
244
- if (this.retryOnBlocked) {
245
- this.sessionPoolOptions.blockedStatusCodes = sessionPoolOptions.blockedStatusCodes ?? [];
246
- if (this.sessionPoolOptions.blockedStatusCodes.length !== 0) {
247
- log.warning(`Both 'blockedStatusCodes' and 'retryOnBlocked' are set. Please note that the 'retryOnBlocked' feature might not work as expected.`);
248
- }
230
+ log = defaultLog.child({ prefix: this.constructor.name }), experiments = {}, id, } = options;
231
+ // Create per-crawler service locator if custom services were provided.
232
+ // This wraps every method on the crawler instance so that calls to the global `serviceLocator`
233
+ // (via AsyncLocalStorage) resolve to this scoped instance instead.
234
+ // We also enter the scope for the rest of the constructor body, so that any code below
235
+ // that accesses `serviceLocator` will see the correct (scoped) instance.
236
+ let serviceLocatorScope = { enterScope: () => { }, exitScope: () => { } };
237
+ if (storageClient ||
238
+ eventManager ||
239
+ (configuration !== undefined && configuration !== serviceLocator.getConfiguration())) {
240
+ const scopedServiceLocator = new ServiceLocator(configuration, eventManager, storageClient);
241
+ serviceLocatorScope = bindMethodsToServiceLocator(scopedServiceLocator, this);
249
242
  }
250
- this.useSessionPool = useSessionPool;
251
- this.crawlingContexts = new Map();
252
- const maxSignedInteger = 2 ** 31 - 1;
253
- if (this.requestHandlerTimeoutMillis > maxSignedInteger) {
254
- log.warning(`requestHandlerTimeoutMillis ${this.requestHandlerTimeoutMillis}` +
255
- ` does not fit a signed 32-bit integer. Limiting the value to ${maxSignedInteger}`);
256
- this.requestHandlerTimeoutMillis = maxSignedInteger;
257
- }
258
- this.internalTimeoutMillis = Math.min(this.internalTimeoutMillis, maxSignedInteger);
259
- let shouldLogMaxPagesExceeded = true;
260
- const isMaxPagesExceeded = () => maxRequestsPerCrawl && maxRequestsPerCrawl <= this.handledRequestsCount;
261
- // eslint-disable-next-line prefer-const
262
- let { isFinishedFunction, isTaskReadyFunction } = autoscaledPoolOptions;
263
- // override even if `isFinishedFunction` provided by user - `keepAlive` has higher priority
264
- if (keepAlive) {
265
- isFinishedFunction = async () => false;
266
- }
267
- const basicCrawlerAutoscaledPoolConfiguration = {
268
- minConcurrency: minConcurrency ?? autoscaledPoolOptions?.minConcurrency,
269
- maxConcurrency: maxConcurrency ?? autoscaledPoolOptions?.maxConcurrency,
270
- maxTasksPerMinute: maxRequestsPerMinute ?? autoscaledPoolOptions?.maxTasksPerMinute,
271
- runTaskFunction: this._runTaskFunction.bind(this),
272
- isTaskReadyFunction: async () => {
273
- if (isMaxPagesExceeded()) {
274
- if (shouldLogMaxPagesExceeded) {
275
- log.info('Crawler reached the maxRequestsPerCrawl limit of ' +
276
- `${maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
277
- shouldLogMaxPagesExceeded = false;
278
- }
279
- return false;
243
+ try {
244
+ serviceLocatorScope.enterScope();
245
+ // Store whether the user explicitly provided an ID
246
+ this.hasExplicitId = id !== undefined;
247
+ // Store the user-provided ID, or generate a unique one for tracking purposes (not for state key)
248
+ this.crawlerId = id ?? cryptoRandomObjectId();
249
+ // Store the builder so that it can be run when the contextPipeline is needed.
250
+ // Invoking it immediately would cause problems with parent constructor call order.
251
+ this.contextPipelineBuilder = () => {
252
+ let contextPipeline = (options.contextPipelineBuilder?.() ??
253
+ ContextPipeline.create()); // Thanks to the RequireContextPipeline, contextPipeline will only be undefined if InitialContextType is CrawlingContext
254
+ if (options.extendContext !== undefined) {
255
+ contextPipeline = contextPipeline.compose({
256
+ action: async (context) => await options.extendContext(context),
257
+ });
280
258
  }
281
- return isTaskReadyFunction ? await isTaskReadyFunction() : await this._isTaskReadyFunction();
282
- },
283
- isFinishedFunction: async () => {
284
- if (isMaxPagesExceeded()) {
285
- log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${maxRequestsPerCrawl} requests ` +
286
- 'and all requests that were in progress at that time have now finished. ' +
287
- `In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`);
288
- return true;
259
+ contextPipeline = contextPipeline.compose({
260
+ action: async (context) => {
261
+ const { request } = context;
262
+ if (!this.requestMatchesEnqueueStrategy(request)) {
263
+ // eslint-disable-next-line dot-notation
264
+ const message = `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`;
265
+ this.log.debug(message);
266
+ request.noRetry = true;
267
+ request.state = RequestState.SKIPPED;
268
+ await this.handleSkippedRequest({ url: request.url, reason: 'redirect' });
269
+ throw new ContextPipelineInterruptedError(message);
270
+ }
271
+ return context;
272
+ },
273
+ });
274
+ return contextPipeline;
275
+ };
276
+ if (requestManager !== undefined) {
277
+ if (requestList !== undefined || requestQueue !== undefined) {
278
+ throw new Error('The `requestManager` option cannot be used in conjunction with `requestList` and/or `requestQueue`');
289
279
  }
290
- const isFinished = isFinishedFunction
291
- ? await isFinishedFunction()
292
- : await this._defaultIsFinishedFunction();
293
- if (isFinished) {
294
- const reason = isFinishedFunction
295
- ? "Crawler's custom isFinishedFunction() returned true, the crawler will shut down."
296
- : 'All requests from the queue have been processed, the crawler will shut down.';
297
- log.info(reason);
280
+ this.requestManager = requestManager;
281
+ this.requestQueue = requestManager; // TODO(v4) - the cast is not fully legitimate here, but it's fine for internal usage by the BasicCrawler
282
+ }
283
+ else {
284
+ this.requestList = requestList;
285
+ this.requestQueue = requestQueue;
286
+ }
287
+ this.httpClient = httpClient ?? new GotScrapingHttpClient();
288
+ this.proxyConfiguration = proxyConfiguration;
289
+ this.log = log;
290
+ this.statusMessageLoggingInterval = statusMessageLoggingInterval;
291
+ this.statusMessageCallback = statusMessageCallback;
292
+ this.domainAccessedTime = new Map();
293
+ this.experiments = experiments;
294
+ this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });
295
+ this.handleSkippedRequest = this.handleSkippedRequest.bind(this);
296
+ this.requestHandler = requestHandler ?? this.router;
297
+ this.failedRequestHandler = failedRequestHandler;
298
+ this.errorHandler = errorHandler;
299
+ if (requestHandlerTimeoutSecs) {
300
+ this.requestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
301
+ }
302
+ else {
303
+ this.requestHandlerTimeoutMillis = 60_000;
304
+ }
305
+ this.retryOnBlocked = retryOnBlocked;
306
+ this.respectRobotsTxtFile = respectRobotsTxtFile;
307
+ this.onSkippedRequest = onSkippedRequest;
308
+ const tryEnv = (val) => (val == null ? null : +val);
309
+ // allow at least 5min for internal timeouts
310
+ this.internalTimeoutMillis =
311
+ tryEnv(process.env.CRAWLEE_INTERNAL_TIMEOUT) ?? Math.max(this.requestHandlerTimeoutMillis * 2, 300e3);
312
+ // override the default internal timeout of request queue to respect `requestHandlerTimeoutMillis`
313
+ if (this.requestQueue) {
314
+ this.requestQueue.internalTimeoutMillis = this.internalTimeoutMillis;
315
+ // for request queue v2, we want to lock requests for slightly longer than the request handler timeout so that there is some padding for locking-related overhead,
316
+ // but never for less than a minute
317
+ this.requestQueue.requestLockSecs = Math.max(this.requestHandlerTimeoutMillis / 1000 + 5, 60);
318
+ }
319
+ this.maxRequestRetries = maxRequestRetries;
320
+ this.maxCrawlDepth = maxCrawlDepth;
321
+ this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
322
+ this.maxSessionRotations = maxSessionRotations;
323
+ this.stats = new Statistics({
324
+ logMessage: `${log.getOptions().prefix} request statistics:`,
325
+ log,
326
+ ...(this.hasExplicitId ? { id: this.crawlerId } : {}),
327
+ ...statisticsOptions,
328
+ });
329
+ this.sessionPoolOptions = {
330
+ ...sessionPoolOptions,
331
+ log,
332
+ };
333
+ if (this.retryOnBlocked) {
334
+ this.sessionPoolOptions.blockedStatusCodes = sessionPoolOptions.blockedStatusCodes ?? [];
335
+ if (this.sessionPoolOptions.blockedStatusCodes.length !== 0) {
336
+ log.warning(`Both 'blockedStatusCodes' and 'retryOnBlocked' are set. Please note that the 'retryOnBlocked' feature might not work as expected.`);
298
337
  }
299
- return isFinished;
300
- },
301
- log,
302
- };
303
- this.autoscaledPoolOptions = { ...autoscaledPoolOptions, ...basicCrawlerAutoscaledPoolConfiguration };
338
+ }
339
+ this.useSessionPool = useSessionPool;
340
+ const maxSignedInteger = 2 ** 31 - 1;
341
+ if (this.requestHandlerTimeoutMillis > maxSignedInteger) {
342
+ log.warning(`requestHandlerTimeoutMillis ${this.requestHandlerTimeoutMillis}` +
343
+ ` does not fit a signed 32-bit integer. Limiting the value to ${maxSignedInteger}`);
344
+ this.requestHandlerTimeoutMillis = maxSignedInteger;
345
+ }
346
+ this.internalTimeoutMillis = Math.min(this.internalTimeoutMillis, maxSignedInteger);
347
+ this.maxRequestsPerCrawl = maxRequestsPerCrawl;
348
+ const isMaxPagesExceeded = () => this.maxRequestsPerCrawl && this.maxRequestsPerCrawl <= this.handledRequestsCount;
349
+ // eslint-disable-next-line prefer-const
350
+ let { isFinishedFunction, isTaskReadyFunction } = autoscaledPoolOptions;
351
+ // override even if `isFinishedFunction` provided by user - `keepAlive` has higher priority
352
+ if (keepAlive) {
353
+ isFinishedFunction = async () => false;
354
+ }
355
+ const basicCrawlerAutoscaledPoolConfiguration = {
356
+ minConcurrency: minConcurrency ?? autoscaledPoolOptions?.minConcurrency,
357
+ maxConcurrency: maxConcurrency ?? autoscaledPoolOptions?.maxConcurrency,
358
+ maxTasksPerMinute: maxRequestsPerMinute ?? autoscaledPoolOptions?.maxTasksPerMinute,
359
+ runTaskFunction: this._runTaskFunction.bind(this),
360
+ isTaskReadyFunction: async () => {
361
+ if (isMaxPagesExceeded()) {
362
+ if (this.shouldLogMaxProcessedRequestsExceeded) {
363
+ log.info('Crawler reached the maxRequestsPerCrawl limit of ' +
364
+ `${this.maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
365
+ this.shouldLogMaxProcessedRequestsExceeded = false;
366
+ }
367
+ return false;
368
+ }
369
+ return isTaskReadyFunction ? await isTaskReadyFunction() : await this._isTaskReadyFunction();
370
+ },
371
+ isFinishedFunction: async () => {
372
+ if (isMaxPagesExceeded()) {
373
+ log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${this.maxRequestsPerCrawl} requests ` +
374
+ 'and all requests that were in progress at that time have now finished. ' +
375
+ `In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`);
376
+ return true;
377
+ }
378
+ const isFinished = isFinishedFunction
379
+ ? await isFinishedFunction()
380
+ : await this._defaultIsFinishedFunction();
381
+ if (isFinished) {
382
+ const reason = isFinishedFunction
383
+ ? "Crawler's custom isFinishedFunction() returned true, the crawler will shut down."
384
+ : 'All requests from the queue have been processed, the crawler will shut down.';
385
+ log.info(reason);
386
+ }
387
+ return isFinished;
388
+ },
389
+ log,
390
+ };
391
+ this.autoscaledPoolOptions = { ...autoscaledPoolOptions, ...basicCrawlerAutoscaledPoolConfiguration };
392
+ }
393
+ finally {
394
+ serviceLocatorScope.exitScope();
395
+ }
304
396
  }
305
397
  /**
306
398
  * Checks if the given error is a proxy error by comparing its message to a list of known proxy error messages.
@@ -311,21 +403,13 @@ export class BasicCrawler {
311
403
  isProxyError(error) {
312
404
  return ROTATE_PROXY_ERRORS.some((x) => this._getMessageFromError(error)?.includes(x));
313
405
  }
314
- /**
315
- * Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
316
- * Returns `false` if the request is not blocked, otherwise returns a string with a description of the block reason.
317
- * @param _crawlingContext The crawling context to check.
318
- */
319
- async isRequestBlocked(_crawlingContext) {
320
- throw new Error('the "isRequestBlocked" method is not implemented in this crawler.');
321
- }
322
406
  /**
323
407
  * This method is periodically called by the crawler, every `statusMessageLoggingInterval` seconds.
324
408
  */
325
409
  async setStatusMessage(message, options = {}) {
326
410
  const data = options.isStatusMessageTerminal != null ? { terminal: options.isStatusMessageTerminal } : undefined;
327
411
  this.log.internal(LogLevel[options.level ?? 'DEBUG'], message, data);
328
- const client = this.config.getStorageClient();
412
+ const client = serviceLocator.getStorageClient();
329
413
  if (!client.setStatusMessage) {
330
414
  return;
331
415
  }
@@ -350,7 +434,7 @@ export class BasicCrawler {
350
434
  message = `Experiencing problems, ${this.stats.state.requestsFailed - previousState.requestsFailed || this.stats.state.requestsFailed} failed requests in the past ${this.statusMessageLoggingInterval} seconds.`;
351
435
  }
352
436
  else {
353
- const total = this.requestQueue?.getTotalCount() || this.requestList?.length();
437
+ const total = this.requestManager?.getTotalCount();
354
438
  message = `Crawled ${this.stats.state.requestsFinished}${total ? `/${total}` : ''} pages, ${this.stats.state.requestsFailed} failed requests, desired concurrency ${this.autoscaledPool?.desiredConcurrency ?? 0}.`;
355
439
  }
356
440
  if (this.statusMessageCallback) {
@@ -390,20 +474,30 @@ export class BasicCrawler {
390
474
  if (this.requestQueue?.name === 'default' && purgeRequestQueue) {
391
475
  await this.requestQueue.drop();
392
476
  this.requestQueue = await this._getRequestQueue();
477
+ this.requestManager = undefined;
478
+ await this.initializeRequestManager();
479
+ this.handledRequestsCount = 0; // This would've been reset by this._init() further down below, but at that point `handledRequestsCount` could prevent `addRequests` from adding the initial requests
393
480
  }
394
481
  this.stats.reset();
395
482
  await this.stats.resetStore();
396
483
  await this.sessionPool?.resetStore();
397
484
  }
398
485
  this.running = true;
399
- await purgeDefaultStorages({ onlyPurgeOnce: true });
486
+ this.shouldLogMaxProcessedRequestsExceeded = true;
487
+ this.shouldLogMaxEnqueuedRequestsExceeded = true;
488
+ await purgeDefaultStorages({
489
+ onlyPurgeOnce: true,
490
+ client: serviceLocator.getStorageClient(),
491
+ config: serviceLocator.getConfiguration(),
492
+ });
400
493
  if (requests) {
401
494
  await this.addRequests(requests, addRequestsOptions);
402
495
  }
403
496
  await this._init();
404
497
  await this.stats.startCapturing();
405
498
  const periodicLogger = this.getPeriodicLogger();
406
- await this.setStatusMessage('Starting the crawler.', { level: 'INFO' });
499
+ // Don't await, we don't want to block the execution
500
+ void this.setStatusMessage('Starting the crawler.', { level: 'INFO' });
407
501
  const sigintHandler = async () => {
408
502
  this.log.warning('Pausing... Press CTRL+C again to force exit. To resume, do: CRAWLEE_PURGE_ON_START=0 npm start');
409
503
  await this._pauseOnMigration();
@@ -412,8 +506,9 @@ export class BasicCrawler {
412
506
  // Attach a listener to handle migration and aborting events gracefully.
413
507
  const boundPauseOnMigration = this._pauseOnMigration.bind(this);
414
508
  process.once('SIGINT', sigintHandler);
415
- this.events.on("migrating" /* EventType.MIGRATING */, boundPauseOnMigration);
416
- this.events.on("aborting" /* EventType.ABORTING */, boundPauseOnMigration);
509
+ const eventManager = serviceLocator.getEventManager();
510
+ eventManager.on("migrating" /* EventType.MIGRATING */, boundPauseOnMigration);
511
+ eventManager.on("aborting" /* EventType.ABORTING */, boundPauseOnMigration);
417
512
  let stats = {};
418
513
  try {
419
514
  await this.autoscaledPool.run();
@@ -422,8 +517,8 @@ export class BasicCrawler {
422
517
  await this.teardown();
423
518
  await this.stats.stopCapturing();
424
519
  process.off('SIGINT', sigintHandler);
425
- this.events.off("migrating" /* EventType.MIGRATING */, boundPauseOnMigration);
426
- this.events.off("aborting" /* EventType.ABORTING */, boundPauseOnMigration);
520
+ eventManager.off("migrating" /* EventType.MIGRATING */, boundPauseOnMigration);
521
+ eventManager.off("aborting" /* EventType.ABORTING */, boundPauseOnMigration);
427
522
  const finalStats = this.stats.calculate();
428
523
  stats = {
429
524
  requestsFinished: this.stats.state.requestsFinished,
@@ -440,7 +535,7 @@ export class BasicCrawler {
440
535
  mostCommonErrors: this.stats.errorTracker.getMostPopularErrors(3).map(prettify),
441
536
  });
442
537
  }
443
- const client = this.config.getStorageClient();
538
+ const client = serviceLocator.getStorageClient();
444
539
  if (client.teardown) {
445
540
  let finished = false;
446
541
  setTimeout(() => {
@@ -452,7 +547,8 @@ export class BasicCrawler {
452
547
  finished = true;
453
548
  }
454
549
  periodicLogger.stop();
455
- await this.setStatusMessage(`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${this.stats.state.requestsFinished} succeeded, ${this.stats.state.requestsFailed} failed.`, { isStatusMessageTerminal: true, level: 'INFO' });
550
+ // Don't await, we don't want to block the execution
551
+ void this.setStatusMessage(`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${this.stats.state.requestsFinished} succeeded, ${this.stats.state.requestsFailed} failed.`, { isStatusMessageTerminal: true, level: 'INFO' });
456
552
  this.running = false;
457
553
  this.hasFinishedBefore = true;
458
554
  }
@@ -462,6 +558,8 @@ export class BasicCrawler {
462
558
  * Gracefully stops the current run of the crawler.
463
559
  *
464
560
  * All the tasks active at the time of calling this method will be allowed to finish.
561
+ *
562
+ * To stop the crawler immediately, use {@link BasicCrawler.teardown|`crawler.teardown()`} instead.
465
563
  */
466
564
  stop(message = 'The crawler has been gracefully stopped.') {
467
565
  // Gracefully starve the this.autoscaledPool, so it doesn't start new tasks. Resolves once the pool is cleared.
@@ -478,13 +576,57 @@ export class BasicCrawler {
478
576
  if (!this.requestQueue && this.requestList) {
479
577
  this.log.warningOnce('When using RequestList and RequestQueue at the same time, you should instantiate both explicitly and provide them in the crawler options, to ensure correctly handled restarts of the crawler.');
480
578
  }
481
- this.requestQueue ??= await this._getRequestQueue();
579
+ if (!this.requestQueue) {
580
+ this.requestQueue = await this._getRequestQueue();
581
+ this.requestManager = undefined;
582
+ }
583
+ if (!this.requestManager) {
584
+ this.requestManager =
585
+ this.requestList === undefined
586
+ ? this.requestQueue
587
+ : new RequestManagerTandem(this.requestList, this.requestQueue);
588
+ }
482
589
  return this.requestQueue;
483
590
  }
484
591
  async useState(defaultValue = {}) {
485
- const kvs = await KeyValueStore.open(null, { config: this.config });
592
+ const kvs = await KeyValueStore.open(null, { config: serviceLocator.getConfiguration() });
593
+ if (this.hasExplicitId) {
594
+ const stateKey = `${BasicCrawler.CRAWLEE_STATE_KEY}_${this.crawlerId}`;
595
+ return kvs.getAutoSavedValue(stateKey, defaultValue);
596
+ }
597
+ BasicCrawler.useStateCrawlerIds.add(this.crawlerId);
598
+ if (BasicCrawler.useStateCrawlerIds.size > 1) {
599
+ defaultLog.warningOnce('Multiple crawler instances are calling useState() without an explicit `id` option. \n' +
600
+ 'This means they will share the same state object, which is likely unintended. \n' +
601
+ 'To fix this, provide a unique `id` option to each crawler instance. \n' +
602
+ 'Example: new BasicCrawler({ id: "my-crawler-1", ... })');
603
+ }
486
604
  return kvs.getAutoSavedValue(BasicCrawler.CRAWLEE_STATE_KEY, defaultValue);
487
605
  }
606
+ get pendingRequestCountApproximation() {
607
+ return this.requestManager?.getPendingCount() ?? 0;
608
+ }
609
+ calculateEnqueuedRequestLimit(explicitLimit) {
610
+ if (this.maxRequestsPerCrawl === undefined) {
611
+ return explicitLimit;
612
+ }
613
+ const limit = Math.max(0, this.maxRequestsPerCrawl - this.handledRequestsCount - this.pendingRequestCountApproximation);
614
+ return Math.min(limit, explicitLimit ?? Infinity);
615
+ }
616
+ async handleSkippedRequest(options) {
617
+ if (options.reason === 'limit' && this.shouldLogMaxEnqueuedRequestsExceeded) {
618
+ this.log.info('The number of requests enqueued by the crawler reached the maxRequestsPerCrawl limit of ' +
619
+ `${this.maxRequestsPerCrawl} requests and no further requests will be added.`);
620
+ this.shouldLogMaxEnqueuedRequestsExceeded = false;
621
+ }
622
+ if (options.reason === 'enqueueLimit') {
623
+ const enqueuedRequestLimit = this.calculateEnqueuedRequestLimit();
624
+ if (enqueuedRequestLimit === undefined || enqueuedRequestLimit !== 0) {
625
+ this.log.info('The number of requests enqueued by the crawler reached the enqueueLinks limit.');
626
+ }
627
+ }
628
+ await this.onSkippedRequest?.(options);
629
+ }
488
630
  /**
489
631
  * Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
490
632
  * adding the rest in background. You can configure the batch size via `batchSize` option and the sleep time in between
@@ -497,33 +639,57 @@ export class BasicCrawler {
497
639
  * @param options Options for the request queue
498
640
  */
499
641
  async addRequests(requests, options = {}) {
500
- const requestQueue = await this.getRequestQueue();
501
- if (!this.respectRobotsTxtFile) {
502
- return requestQueue.addRequestsBatched(requests, options);
503
- }
504
- const allowedRequests = [];
505
- const skipped = new Set();
506
- for (const request of requests) {
507
- const url = typeof request === 'string' ? request : request.url;
508
- if (await this.isAllowedBasedOnRobotsTxtFile(url)) {
509
- allowedRequests.push(request);
510
- }
511
- else {
512
- skipped.add(url);
513
- await this.onSkippedRequest?.({ url, reason: 'robotsTxt' });
642
+ await this.getRequestQueue();
643
+ const requestLimit = this.calculateEnqueuedRequestLimit();
644
+ const skippedBecauseOfRobots = new Set();
645
+ const skippedBecauseOfLimit = new Set();
646
+ const skippedBecauseOfMaxCrawlDepth = new Set();
647
+ const isAllowedBasedOnRobotsTxtFile = this.isAllowedBasedOnRobotsTxtFile.bind(this);
648
+ const maxCrawlDepth = this.maxCrawlDepth;
649
+ ow(requests, ow.object
650
+ .is((value) => isIterable(value) || isAsyncIterable(value))
651
+ .message((value) => `Expected an iterable or async iterable, got ${getObjectType(value)}`));
652
+ async function* filteredRequests() {
653
+ let yieldedRequestCount = 0;
654
+ for await (const request of requests) {
655
+ const url = typeof request === 'string' ? request : request.url;
656
+ if (requestLimit !== undefined && yieldedRequestCount >= requestLimit) {
657
+ skippedBecauseOfLimit.add(url);
658
+ continue;
659
+ }
660
+ if (maxCrawlDepth !== undefined && request.crawlDepth > maxCrawlDepth) {
661
+ skippedBecauseOfMaxCrawlDepth.add(url);
662
+ continue;
663
+ }
664
+ if (await isAllowedBasedOnRobotsTxtFile(url)) {
665
+ yield request;
666
+ yieldedRequestCount += 1;
667
+ }
668
+ else {
669
+ skippedBecauseOfRobots.add(url);
670
+ }
514
671
  }
515
672
  }
516
- if (skipped.size > 0) {
673
+ const result = await this.requestManager.addRequestsBatched(filteredRequests(), options);
674
+ if (skippedBecauseOfRobots.size > 0) {
517
675
  this.log.warning(`Some requests were skipped because they were disallowed based on the robots.txt file`, {
518
- skipped: [...skipped],
676
+ skipped: [...skippedBecauseOfRobots],
519
677
  });
520
- if (this.onSkippedRequest) {
521
- await Promise.all([...skipped].map((url) => {
522
- return this.onSkippedRequest({ url, reason: 'robotsTxt' });
523
- }));
524
- }
525
678
  }
526
- return requestQueue.addRequestsBatched(allowedRequests, options);
679
+ if (skippedBecauseOfRobots.size > 0 ||
680
+ skippedBecauseOfLimit.size > 0 ||
681
+ skippedBecauseOfMaxCrawlDepth.size > 0) {
682
+ await Promise.all([...skippedBecauseOfRobots]
683
+ .map((url) => {
684
+ return this.handleSkippedRequest({ url, reason: 'robotsTxt' });
685
+ })
686
+ .concat([...skippedBecauseOfLimit].map((url) => {
687
+ return this.handleSkippedRequest({ url, reason: 'limit' });
688
+ }), [...skippedBecauseOfMaxCrawlDepth].map((url) => {
689
+ return this.handleSkippedRequest({ url, reason: 'depth' });
690
+ })));
691
+ }
692
+ return result;
527
693
  }
528
694
  /**
529
695
  * Pushes data to the specified {@link Dataset}, or the default crawler {@link Dataset} by calling {@link Dataset.pushData}.
@@ -536,7 +702,7 @@ export class BasicCrawler {
536
702
  * Retrieves the specified {@link Dataset}, or the default crawler {@link Dataset}.
537
703
  */
538
704
  async getDataset(idOrName) {
539
- return Dataset.open(idOrName, { config: this.config });
705
+ return Dataset.open(idOrName, { config: serviceLocator.getConfiguration() });
540
706
  }
541
707
  /**
542
708
  * Retrieves data from the default crawler {@link Dataset} by calling {@link Dataset.getData}.
@@ -563,7 +729,21 @@ export class BasicCrawler {
563
729
  const dataset = await this.getDataset();
564
730
  const items = await dataset.export(options);
565
731
  if (format === 'csv') {
566
- const value = stringify([Object.keys(items[0]), ...items.map((item) => Object.values(item))]);
732
+ let value;
733
+ if (items.length === 0) {
734
+ value = '';
735
+ }
736
+ else {
737
+ const keys = options?.collectAllKeys
738
+ ? Array.from(new Set(items.flatMap(Object.keys)))
739
+ : Object.keys(items[0]);
740
+ value = stringify([
741
+ keys,
742
+ ...items.map((item) => {
743
+ return keys.map((k) => item[k]);
744
+ }),
745
+ ]);
746
+ }
567
747
  await ensureDir(dirname(path));
568
748
  await writeFile(path, value);
569
749
  this.log.info(`Export to ${path} finished!`);
@@ -575,24 +755,31 @@ export class BasicCrawler {
575
755
  }
576
756
  return items;
577
757
  }
758
+ /**
759
+ * Initializes the crawler.
760
+ */
578
761
  async _init() {
579
- if (!this.events.isInitialized()) {
580
- await this.events.init();
762
+ const eventManager = serviceLocator.getEventManager();
763
+ if (!eventManager.isInitialized()) {
764
+ await eventManager.init();
581
765
  this._closeEvents = true;
582
766
  }
583
767
  // Initialize AutoscaledPool before awaiting _loadHandledRequestCount(),
584
768
  // so that the caller can get a reference to it before awaiting the promise returned from run()
585
769
  // (otherwise there would be no way)
586
- this.autoscaledPool = new AutoscaledPool(this.autoscaledPoolOptions, this.config);
770
+ this.autoscaledPool = new AutoscaledPool(this.autoscaledPoolOptions);
587
771
  if (this.useSessionPool) {
588
- this.sessionPool = await SessionPool.open(this.sessionPoolOptions, this.config);
772
+ this.sessionPool = await SessionPool.open(this.sessionPoolOptions);
589
773
  // Assuming there are not more than 20 browsers running at once;
590
774
  this.sessionPool.setMaxListeners(20);
591
775
  }
776
+ await this.initializeRequestManager();
592
777
  await this._loadHandledRequestCount();
593
778
  }
594
- async _runRequestHandler(crawlingContext) {
595
- await this.requestHandler(crawlingContext);
779
+ async runRequestHandler(crawlingContext) {
780
+ await this.contextPipeline.call(crawlingContext, async (finalContext) => {
781
+ await addTimeoutToPromise(async () => this.requestHandler(finalContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${finalContext.request.id}).`);
782
+ });
596
783
  }
597
784
  /**
598
785
  * Handles blocked request
@@ -608,7 +795,8 @@ export class BasicCrawler {
608
795
  return true;
609
796
  }
610
797
  const robotsTxtFile = await this.getRobotsTxtFileForUrl(url);
611
- return !robotsTxtFile || robotsTxtFile.isAllowed(url);
798
+ const userAgent = typeof this.respectRobotsTxtFile === 'object' ? this.respectRobotsTxtFile?.userAgent : '*';
799
+ return !robotsTxtFile || robotsTxtFile.isAllowed(url, userAgent);
612
800
  }
613
801
  async getRobotsTxtFileForUrl(url) {
614
802
  if (!this.respectRobotsTxtFile) {
@@ -662,36 +850,36 @@ export class BasicCrawler {
662
850
  await Promise.all([requestListPersistPromise, this.stats.persistState()]);
663
851
  }
664
852
  /**
665
- * Fetches request from either RequestList or RequestQueue. If request comes from a RequestList
666
- * and RequestQueue is present then enqueues it to the queue first.
853
+ * Initializes the RequestManager based on the configured requestList and requestQueue.
667
854
  */
668
- async _fetchNextRequest() {
669
- if (!this.requestList || (await this.requestList.isFinished())) {
670
- return this.requestQueue?.fetchNextRequest();
671
- }
672
- const request = await this.requestList.fetchNextRequest();
673
- if (!this.requestQueue)
674
- return request;
675
- if (!request)
676
- return this.requestQueue.fetchNextRequest();
677
- try {
678
- await this.requestQueue.addRequest(request, { forefront: true });
855
+ async initializeRequestManager() {
856
+ if (this.requestManager !== undefined) {
857
+ return;
858
+ }
859
+ if (this.requestList && this.requestQueue) {
860
+ // Create a RequestManagerTandem if both RequestList and RequestQueue are provided
861
+ this.requestManager = new RequestManagerTandem(this.requestList, this.requestQueue);
679
862
  }
680
- catch (err) {
681
- // If requestQueue.addRequest() fails here then we must reclaim it back to
682
- // the RequestList because probably it's not yet in the queue!
683
- this.log.error('Adding of request from the RequestList to the RequestQueue failed, reclaiming request back to the list.', { request });
684
- await this.requestList.reclaimRequest(request);
685
- return null;
863
+ else if (this.requestQueue) {
864
+ // Use RequestQueue directly if only it is provided
865
+ this.requestManager = this.requestQueue;
686
866
  }
687
- await this.requestList.markRequestHandled(request);
688
- return this.requestQueue.fetchNextRequest();
867
+ else if (this.requestList) {
868
+ // Use RequestList directly if only it is provided
869
+ // Make it compatible with the IRequestManager interface
870
+ this.requestManager = new RequestListAdapter(this.requestList);
871
+ }
872
+ // If neither RequestList nor RequestQueue is provided, leave the requestManager uninitialized until `getRequestQueue` is called
689
873
  }
690
874
  /**
691
- * Executed when `errorHandler` finishes or the request is successful.
692
- * Can be used to clean up orphaned browser pages.
875
+ * Fetches the next request to process from the underlying request provider.
693
876
  */
694
- async _cleanupContext(_crawlingContext) { }
877
+ async _fetchNextRequest() {
878
+ if (this.requestManager === undefined) {
879
+ throw new Error(`_fetchNextRequest called on an uninitialized crawler`);
880
+ }
881
+ return this.requestManager.fetchNextRequest();
882
+ }
695
883
  /**
696
884
  * Delays processing of the request based on the `sameDomainDelaySecs` option,
697
885
  * adding it back to the queue after the timeout passes. Returns `true` if the request
@@ -729,18 +917,21 @@ export class BasicCrawler {
729
917
  * then retries them in a case of an error, etc.
730
918
  */
731
919
  async _runTaskFunction() {
732
- const source = this.requestQueue || this.requestList || (await this.getRequestQueue());
733
- let request;
734
- let session;
735
- await this._timeoutAndRetry(async () => {
736
- request = await this._fetchNextRequest();
737
- }, this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
920
+ const source = this.requestManager;
921
+ if (!source)
922
+ throw new Error('Request provider is not initialized!');
923
+ const request = await this._timeoutAndRetry(this._fetchNextRequest.bind(this), this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
738
924
  tryCancel();
739
- if (this.useSessionPool) {
740
- await this._timeoutAndRetry(async () => {
741
- session = await this.sessionPool.getSession();
742
- }, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
743
- }
925
+ const session = this.useSessionPool
926
+ ? await this._timeoutAndRetry(async () => {
927
+ return await this.sessionPool.newSession({
928
+ proxyInfo: await this.proxyConfiguration?.newProxyInfo({
929
+ request: request ?? undefined,
930
+ }),
931
+ maxUsageCount: 1,
932
+ });
933
+ }, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`)
934
+ : undefined;
744
935
  tryCancel();
745
936
  if (!request || this.delayRequest(request, source)) {
746
937
  return;
@@ -750,7 +941,7 @@ export class BasicCrawler {
750
941
  request.state = RequestState.SKIPPED;
751
942
  request.noRetry = true;
752
943
  await source.markRequestHandled(request);
753
- await this.onSkippedRequest?.({
944
+ await this.handleSkippedRequest({
754
945
  url: request.url,
755
946
  reason: 'robotsTxt',
756
947
  });
@@ -760,36 +951,34 @@ export class BasicCrawler {
760
951
  request.loadedUrl = undefined;
761
952
  const statisticsId = request.id || request.uniqueKey;
762
953
  this.stats.startJob(statisticsId);
763
- // Shared crawling context
764
- // @ts-expect-error
765
- // All missing properties (that extend CrawlingContext) are set dynamically,
766
- // but TS does not know that, so otherwise it would throw when compiling.
954
+ const deferredCleanup = [];
767
955
  const crawlingContext = {
768
956
  id: cryptoRandomObjectId(10),
769
- crawler: this,
770
957
  log: this.log,
771
958
  request,
772
959
  session,
960
+ proxyInfo: session?.proxyInfo,
773
961
  enqueueLinks: async (options) => {
774
- return enqueueLinks({
775
- // specify the RQ first to allow overriding it
776
- requestQueue: await this.getRequestQueue(),
777
- robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
778
- onSkippedRequest: this.onSkippedRequest,
779
- ...options,
780
- });
962
+ const requestQueue = await this.getRequestQueue();
963
+ return await this.enqueueLinksWithCrawlDepth(options, request, requestQueue);
964
+ },
965
+ addRequests: async (requests, options = {}) => {
966
+ const newCrawlDepth = request.crawlDepth + 1;
967
+ const requestsGenerator = this.addCrawlDepthRequestGenerator(requests, newCrawlDepth);
968
+ await this.addRequests(requestsGenerator, options);
781
969
  },
782
- addRequests: this.addRequests.bind(this),
783
970
  pushData: this.pushData.bind(this),
784
971
  useState: this.useState.bind(this),
785
- sendRequest: createSendRequest(this.httpClient, request, session, () => crawlingContext.proxyInfo?.url),
786
- getKeyValueStore: async (idOrName) => KeyValueStore.open(idOrName, { config: this.config }),
972
+ sendRequest: createSendRequest(this.httpClient, request, session),
973
+ getKeyValueStore: async (idOrName) => KeyValueStore.open(idOrName, { config: serviceLocator.getConfiguration() }),
974
+ registerDeferredCleanup: (cleanup) => {
975
+ deferredCleanup.push(cleanup);
976
+ },
787
977
  };
788
- this.crawlingContexts.set(crawlingContext.id, crawlingContext);
789
978
  let isRequestLocked = true;
790
979
  try {
791
980
  request.state = RequestState.REQUEST_HANDLER;
792
- await addTimeoutToPromise(async () => this._runRequestHandler(crawlingContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${request.id}).`);
981
+ await this.runRequestHandler(crawlingContext);
793
982
  await this._timeoutAndRetry(async () => source.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
794
983
  isRequestLocked = false; // markRequestHandled succeeded and unlocked the request
795
984
  this.stats.finishJob(statisticsId, request.retryCount);
@@ -798,7 +987,8 @@ export class BasicCrawler {
798
987
  request.state = RequestState.DONE;
799
988
  crawlingContext.session?.markGood();
800
989
  }
801
- catch (err) {
990
+ catch (rawError) {
991
+ const err = this.unwrapError(rawError);
802
992
  try {
803
993
  request.state = RequestState.ERROR_HANDLER;
804
994
  await addTimeoutToPromise(async () => this._requestFunctionErrorHandler(err, crawlingContext, source), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
@@ -808,24 +998,24 @@ export class BasicCrawler {
808
998
  request.state = RequestState.DONE;
809
999
  }
810
1000
  catch (secondaryError) {
811
- if (!secondaryError.triggeredFromUserHandler &&
1001
+ const unwrappedSecondaryError = this.unwrapError(secondaryError);
1002
+ if (!unwrappedSecondaryError.triggeredFromUserHandler &&
812
1003
  // avoid reprinting the same critical error multiple times, as it will be printed by Nodejs at the end anyway
813
- !(secondaryError instanceof CriticalError)) {
1004
+ !(unwrappedSecondaryError instanceof CriticalError)) {
814
1005
  const apifySpecific = process.env.APIFY_IS_AT_HOME
815
1006
  ? `This may have happened due to an internal error of Apify's API or due to a misconfigured crawler.`
816
1007
  : '';
817
- this.log.exception(secondaryError, 'An exception occurred during handling of failed request. ' +
1008
+ this.log.exception(unwrappedSecondaryError, 'An exception occurred during handling of failed request. ' +
818
1009
  `This places the crawler and its underlying storages into an unknown state and crawling will be terminated. ${apifySpecific}`);
819
1010
  }
820
1011
  request.state = RequestState.ERROR;
821
- throw secondaryError;
1012
+ throw unwrappedSecondaryError;
822
1013
  }
823
1014
  // decrease the session score if the request fails (but the error handler did not throw)
824
1015
  crawlingContext.session?.markBad();
825
1016
  }
826
1017
  finally {
827
- await this._cleanupContext(crawlingContext);
828
- this.crawlingContexts.delete(crawlingContext.id);
1018
+ await Promise.all(deferredCleanup.map((cleanup) => cleanup()));
829
1019
  // Safety net - release the lock if nobody managed to do it before
830
1020
  if (isRequestLocked && source instanceof RequestProvider) {
831
1021
  try {
@@ -838,19 +1028,63 @@ export class BasicCrawler {
838
1028
  }
839
1029
  }
840
1030
  /**
841
- * Run async callback with given timeout and retry.
1031
+ * Wrapper around the crawling context's `enqueueLinks` method:
1032
+ * - Injects `crawlDepth` to each request being added based on the crawling context request.
1033
+ * - Provides defaults for the `enqueueLinks` options based on the crawler configuration.
1034
+ * - These options can be overridden by the user.
1035
+ * @internal
1036
+ */
1037
+ async enqueueLinksWithCrawlDepth(options, request, requestQueue) {
1038
+ const transformRequestFunctionWrapper = (requestOptions) => {
1039
+ requestOptions.crawlDepth = request.crawlDepth + 1;
1040
+ if (this.maxCrawlDepth !== undefined && requestOptions.crawlDepth > this.maxCrawlDepth) {
1041
+ // Setting `skippedReason` before returning `false` ensures that `reportSkippedRequests`
1042
+ // reports `'depth'` as the reason (via `request.skippedReason ?? reason` fallback),
1043
+ // rather than the generic `'transform'` reason.
1044
+ requestOptions.skippedReason = 'depth';
1045
+ return false;
1046
+ }
1047
+ // After injecting the crawlDepth, we call the user-provided transform function, if there is one.
1048
+ return options.transformRequestFunction?.(requestOptions) ?? requestOptions;
1049
+ };
1050
+ return await enqueueLinks({
1051
+ requestQueue,
1052
+ robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
1053
+ onSkippedRequest: this.handleSkippedRequest,
1054
+ limit: this.calculateEnqueuedRequestLimit(options.limit),
1055
+ // Allow user options to override defaults set above ⤴
1056
+ ...options,
1057
+ transformRequestFunction: transformRequestFunctionWrapper,
1058
+ });
1059
+ }
1060
+ /**
1061
+ * Generator function that yields requests injected with the given crawl depth.
1062
+ * @internal
1063
+ */
1064
+ async *addCrawlDepthRequestGenerator(requests, newRequestDepth) {
1065
+ for await (const request of requests) {
1066
+ if (typeof request === 'string') {
1067
+ yield { url: request, crawlDepth: newRequestDepth };
1068
+ }
1069
+ else {
1070
+ request.crawlDepth ??= newRequestDepth;
1071
+ yield request;
1072
+ }
1073
+ }
1074
+ }
1075
+ /**
1076
+ * Run async callback with given timeout and retry. Returns the result of the callback.
842
1077
  * @ignore
843
1078
  */
844
1079
  async _timeoutAndRetry(handler, timeout, error, maxRetries = 3, retried = 1) {
845
1080
  try {
846
- await addTimeoutToPromise(handler, timeout, error);
1081
+ return await addTimeoutToPromise(handler, timeout, error);
847
1082
  }
848
1083
  catch (e) {
849
1084
  if (retried <= maxRetries) {
850
1085
  // we retry on any error, not just timeout
851
1086
  this.log.warning(`${e.message} (retrying ${retried}/${maxRetries})`);
852
- void this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1);
853
- return;
1087
+ return this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1);
854
1088
  }
855
1089
  throw e;
856
1090
  }
@@ -859,24 +1093,13 @@ export class BasicCrawler {
859
1093
  * Returns true if either RequestList or RequestQueue have a request ready for processing.
860
1094
  */
861
1095
  async _isTaskReadyFunction() {
862
- // First check RequestList, since it's only in memory.
863
- const isRequestListEmpty = this.requestList ? await this.requestList.isEmpty() : true;
864
- // If RequestList is not empty, task is ready, no reason to check RequestQueue.
865
- if (!isRequestListEmpty)
866
- return true;
867
- // If RequestQueue is not empty, task is ready, return true, otherwise false.
868
- return this.requestQueue ? !(await this.requestQueue.isEmpty()) : false;
1096
+ return this.requestManager !== undefined && !(await this.requestManager.isEmpty());
869
1097
  }
870
1098
  /**
871
1099
  * Returns true if both RequestList and RequestQueue have all requests finished.
872
1100
  */
873
1101
  async _defaultIsFinishedFunction() {
874
- const [isRequestListFinished, isRequestQueueFinished] = await Promise.all([
875
- this.requestList ? this.requestList.isFinished() : true,
876
- this.requestQueue ? this.requestQueue.isFinished() : true,
877
- ]);
878
- // If both are finished, return true, otherwise return false.
879
- return isRequestListFinished && isRequestQueueFinished;
1102
+ return !this.requestManager || (await this.requestManager.isFinished());
880
1103
  }
881
1104
  async _rotateSession(crawlingContext) {
882
1105
  const { request } = crawlingContext;
@@ -884,6 +1107,18 @@ export class BasicCrawler {
884
1107
  request.sessionRotationCount++;
885
1108
  crawlingContext.session?.retire();
886
1109
  }
1110
+ /**
1111
+ * Unwraps errors thrown by the context pipeline to get the actual user error.
1112
+ * RequestHandlerError and ContextPipelineInitializationError wrap the actual error.
1113
+ */
1114
+ unwrapError(error) {
1115
+ if (error instanceof RequestHandlerError ||
1116
+ error instanceof ContextPipelineInitializationError ||
1117
+ error instanceof ContextPipelineCleanupError) {
1118
+ return this.unwrapError(error.cause);
1119
+ }
1120
+ return error;
1121
+ }
887
1122
  /**
888
1123
  * Handles errors thrown by user provided requestHandler()
889
1124
  */
@@ -896,7 +1131,8 @@ export class BasicCrawler {
896
1131
  const shouldRetryRequest = this._canRequestBeRetried(request, error);
897
1132
  if (shouldRetryRequest) {
898
1133
  await this.stats.errorTrackerRetry.addAsync(error, crawlingContext);
899
- await this.errorHandler?.(crawlingContext, error);
1134
+ await this.errorHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
1135
+ error);
900
1136
  if (error instanceof SessionError) {
901
1137
  await this._rotateSession(crawlingContext);
902
1138
  }
@@ -948,7 +1184,8 @@ export class BasicCrawler {
948
1184
  const message = this._getMessageFromError(error, true);
949
1185
  this.log.error(`Request failed and reached maximum retries. ${message}`, { id, url, method, uniqueKey });
950
1186
  if (this.failedRequestHandler) {
951
- await this.failedRequestHandler?.(crawlingContext, error);
1187
+ await this.failedRequestHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
1188
+ error);
952
1189
  }
953
1190
  }
954
1191
  /**
@@ -986,19 +1223,11 @@ export class BasicCrawler {
986
1223
  return request.retryCount < maxRequestRetries;
987
1224
  }
988
1225
  /**
989
- * Updates handledRequestsCount from possibly stored counts,
990
- * usually after worker migration. Since one of the stores
991
- * needs to have priority when both are present,
992
- * it is the request queue, because generally, the request
993
- * list will first be dumped into the queue and then left
994
- * empty.
1226
+ * Updates handledRequestsCount from possibly stored counts, usually after worker migration.
995
1227
  */
996
1228
  async _loadHandledRequestCount() {
997
- if (this.requestQueue) {
998
- this.handledRequestsCount = await this.requestQueue.handledCount();
999
- }
1000
- else if (this.requestList) {
1001
- this.handledRequestsCount = this.requestList.handledCount();
1229
+ if (this.requestManager) {
1230
+ this.handledRequestsCount = await this.requestManager.handledCount();
1002
1231
  }
1003
1232
  }
1004
1233
  async _executeHooks(hooks, ...args) {
@@ -1009,16 +1238,17 @@ export class BasicCrawler {
1009
1238
  }
1010
1239
  }
1011
1240
  /**
1012
- * Function for cleaning up after all request are processed.
1013
- * @ignore
1241
+ * Stops the crawler immediately.
1242
+ *
1243
+ * This method doesn't wait for currently active requests to finish.
1244
+ *
1245
+ * To stop the crawler gracefully (waiting for all running requests to finish), use {@link BasicCrawler.stop|`crawler.stop()`} instead.
1014
1246
  */
1015
1247
  async teardown() {
1016
- this.events.emit("persistState" /* EventType.PERSIST_STATE */, { isMigrating: false });
1017
- if (this.useSessionPool) {
1018
- await this.sessionPool.teardown();
1019
- }
1248
+ serviceLocator.getEventManager().emit("persistState" /* EventType.PERSIST_STATE */, { isMigrating: false });
1249
+ await this.sessionPool?.teardown();
1020
1250
  if (this._closeEvents) {
1021
- await this.events.close();
1251
+ await serviceLocator.getEventManager().close();
1022
1252
  }
1023
1253
  await this.autoscaledPool?.abort();
1024
1254
  }
@@ -1036,9 +1266,9 @@ export class BasicCrawler {
1036
1266
  this.log.info('Using the old RequestQueue implementation without request locking.');
1037
1267
  this._experimentWarnings.requestLocking = true;
1038
1268
  }
1039
- return RequestQueueV1.open(null, { config: this.config });
1269
+ return RequestQueueV1.open(null, { config: serviceLocator.getConfiguration() });
1040
1270
  }
1041
- return RequestQueue.open(null, { config: this.config });
1271
+ return RequestQueue.open(null, { config: serviceLocator.getConfiguration() });
1042
1272
  }
1043
1273
  requestMatchesEnqueueStrategy(request) {
1044
1274
  const { url, loadedUrl } = request;