@crawlee/basic 4.0.0-beta.2 → 4.0.0-beta.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
  import { writeFile } from 'node:fs/promises';
  import { dirname } from 'node:path';
- import { AutoscaledPool, Configuration, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
- import { RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
+ import { AutoscaledPool, Configuration, ContextPipeline, ContextPipelineCleanupError, ContextPipelineInitializationError, ContextPipelineInterruptedError, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestHandlerError, RequestListAdapter, RequestManagerTandem, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
+ import { getObjectType, isAsyncIterable, isIterable, RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
  import { stringify } from 'csv-stringify/sync';
  import { ensureDir, writeJSON } from 'fs-extra/esm';
  import ow from 'ow';
@@ -103,6 +103,10 @@ export class BasicCrawler {
  * Only available if used by the crawler.
  */
  requestQueue;
+ /**
+ * The main request-handling component of the crawler. It's initialized during the crawler startup.
+ */
+ requestManager;
  /**
  * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session|sessions}.
  * Only available if used by the crawler.
@@ -116,11 +120,24 @@ export class BasicCrawler {
  * or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
  */
  autoscaledPool;
+ /**
+ * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
+ * Only available if used by the crawler.
+ */
+ proxyConfiguration;
  /**
  * Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
  * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
  */
  router = Router.create();
+ contextPipelineBuilder;
+ _contextPipeline;
+ get contextPipeline() {
+ if (this._contextPipeline === undefined) {
+ this._contextPipeline = this.contextPipelineBuilder();
+ }
+ return this._contextPipeline;
+ }
  running = false;
  hasFinishedBefore = false;
  log;
@@ -130,15 +147,16 @@ export class BasicCrawler {
  requestHandlerTimeoutMillis;
  internalTimeoutMillis;
  maxRequestRetries;
+ maxCrawlDepth;
  sameDomainDelayMillis;
  domainAccessedTime;
  maxSessionRotations;
- handledRequestsCount;
+ maxRequestsPerCrawl;
+ handledRequestsCount = 0;
  statusMessageLoggingInterval;
  statusMessageCallback;
  sessionPoolOptions;
  useSessionPool;
- crawlingContexts = new Map();
  autoscaledPoolOptions;
  events;
  httpClient;
@@ -146,10 +164,14 @@ export class BasicCrawler {
  respectRobotsTxtFile;
  onSkippedRequest;
  _closeEvents;
+ shouldLogMaxProcessedRequestsExceeded = true;
+ shouldLogMaxEnqueuedRequestsExceeded = true;
  experiments;
  robotsTxtFileCache;
  _experimentWarnings = {};
  static optionsShape = {
+ contextPipelineBuilder: ow.optional.object,
+ extendContext: ow.optional.function,
  requestList: ow.optional.object.validate(validators.requestList),
  requestQueue: ow.optional.object.validate(validators.requestQueue),
  // Subclasses override this function instead of passing it
@@ -163,13 +185,15 @@ export class BasicCrawler {
  sameDomainDelaySecs: ow.optional.number,
  maxSessionRotations: ow.optional.number,
  maxRequestsPerCrawl: ow.optional.number,
+ maxCrawlDepth: ow.optional.number,
  autoscaledPoolOptions: ow.optional.object,
  sessionPoolOptions: ow.optional.object,
  useSessionPool: ow.optional.boolean,
+ proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
  statusMessageLoggingInterval: ow.optional.number,
  statusMessageCallback: ow.optional.function,
  retryOnBlocked: ow.optional.boolean,
- respectRobotsTxtFile: ow.optional.boolean,
+ respectRobotsTxtFile: ow.optional.any(ow.boolean, ow.object),
  onSkippedRequest: ow.optional.function,
  httpClient: ow.optional.object,
  // AutoscaledPool shorthands
@@ -185,17 +209,55 @@ export class BasicCrawler {
  /**
  * All `BasicCrawler` parameters are passed via an options object.
  */
- constructor(options = {}, config = Configuration.getGlobalConfig()) {
+ constructor(options = {}, // cast because the constructor logic handles missing `contextPipelineBuilder` - the type is just for DX
+ config = Configuration.getGlobalConfig()) {
  this.config = config;
  ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape));
- const { requestList, requestQueue, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true,
+ const { requestList, requestQueue, requestManager, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, maxCrawlDepth, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true, proxyConfiguration,
  // AutoscaledPool shorthands
  minConcurrency, maxConcurrency, maxRequestsPerMinute, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
  // internal
  log = defaultLog.child({ prefix: this.constructor.name }), experiments = {}, } = options;
- this.requestList = requestList;
- this.requestQueue = requestQueue;
+ // Store the builder so that it can be run when the contextPipeline is needed.
+ // Invoking it immediately would cause problems with parent constructor call order.
+ this.contextPipelineBuilder = () => {
+ let contextPipeline = (options.contextPipelineBuilder?.() ??
+ ContextPipeline.create()); // Thanks to the RequireContextPipeline, contextPipeline will only be undefined if InitialContextType is CrawlingContext
+ if (options.extendContext !== undefined) {
+ contextPipeline = contextPipeline.compose({
+ action: async (context) => await options.extendContext(context),
+ });
+ }
+ contextPipeline = contextPipeline.compose({
+ action: async (context) => {
+ const { request } = context;
+ if (!this.requestMatchesEnqueueStrategy(request)) {
+ // eslint-disable-next-line dot-notation
+ const message = `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`;
+ this.log.debug(message);
+ request.noRetry = true;
+ request.state = RequestState.SKIPPED;
+ await this.handleSkippedRequest({ url: request.url, reason: 'redirect' });
+ throw new ContextPipelineInterruptedError(message);
+ }
+ return context;
+ },
+ });
+ return contextPipeline;
+ };
+ if (requestManager !== undefined) {
+ if (requestList !== undefined || requestQueue !== undefined) {
+ throw new Error('The `requestManager` option cannot be used in conjunction with `requestList` and/or `requestQueue`');
+ }
+ this.requestManager = requestManager;
+ this.requestQueue = requestManager; // TODO(v4) - the cast is not fully legitimate here, but it's fine for internal usage by the BasicCrawler
+ }
+ else {
+ this.requestList = requestList;
+ this.requestQueue = requestQueue;
+ }
  this.httpClient = httpClient ?? new GotScrapingHttpClient();
+ this.proxyConfiguration = proxyConfiguration;
  this.log = log;
  this.statusMessageLoggingInterval = statusMessageLoggingInterval;
  this.statusMessageCallback = statusMessageCallback;
@@ -203,7 +265,7 @@ export class BasicCrawler {
  this.domainAccessedTime = new Map();
  this.experiments = experiments;
  this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });
- // FIXME any
+ this.handleSkippedRequest = this.handleSkippedRequest.bind(this);
  this.requestHandler = requestHandler ?? this.router;
  this.failedRequestHandler = failedRequestHandler;
  this.errorHandler = errorHandler;
@@ -228,9 +290,9 @@ export class BasicCrawler {
  this.requestQueue.requestLockSecs = Math.max(this.requestHandlerTimeoutMillis / 1000 + 5, 60);
  }
  this.maxRequestRetries = maxRequestRetries;
+ this.maxCrawlDepth = maxCrawlDepth;
  this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
  this.maxSessionRotations = maxSessionRotations;
- this.handledRequestsCount = 0;
  this.stats = new Statistics({
  logMessage: `${log.getOptions().prefix} request statistics:`,
  log,
@@ -248,7 +310,6 @@ export class BasicCrawler {
  }
  }
  this.useSessionPool = useSessionPool;
- this.crawlingContexts = new Map();
  const maxSignedInteger = 2 ** 31 - 1;
  if (this.requestHandlerTimeoutMillis > maxSignedInteger) {
  log.warning(`requestHandlerTimeoutMillis ${this.requestHandlerTimeoutMillis}` +
@@ -256,8 +317,8 @@ export class BasicCrawler {
  this.requestHandlerTimeoutMillis = maxSignedInteger;
  }
  this.internalTimeoutMillis = Math.min(this.internalTimeoutMillis, maxSignedInteger);
- let shouldLogMaxPagesExceeded = true;
- const isMaxPagesExceeded = () => maxRequestsPerCrawl && maxRequestsPerCrawl <= this.handledRequestsCount;
+ this.maxRequestsPerCrawl = maxRequestsPerCrawl;
+ const isMaxPagesExceeded = () => this.maxRequestsPerCrawl && this.maxRequestsPerCrawl <= this.handledRequestsCount;
  // eslint-disable-next-line prefer-const
  let { isFinishedFunction, isTaskReadyFunction } = autoscaledPoolOptions;
  // override even if `isFinishedFunction` provided by user - `keepAlive` has higher priority
@@ -271,10 +332,10 @@ export class BasicCrawler {
  runTaskFunction: this._runTaskFunction.bind(this),
  isTaskReadyFunction: async () => {
  if (isMaxPagesExceeded()) {
- if (shouldLogMaxPagesExceeded) {
+ if (this.shouldLogMaxProcessedRequestsExceeded) {
  log.info('Crawler reached the maxRequestsPerCrawl limit of ' +
- `${maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
- shouldLogMaxPagesExceeded = false;
+ `${this.maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
+ this.shouldLogMaxProcessedRequestsExceeded = false;
  }
  return false;
  }
@@ -282,7 +343,7 @@ export class BasicCrawler {
  },
  isFinishedFunction: async () => {
  if (isMaxPagesExceeded()) {
- log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${maxRequestsPerCrawl} requests ` +
+ log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${this.maxRequestsPerCrawl} requests ` +
  'and all requests that were in progress at that time have now finished. ' +
  `In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`);
  return true;
@@ -311,14 +372,6 @@ export class BasicCrawler {
  isProxyError(error) {
  return ROTATE_PROXY_ERRORS.some((x) => this._getMessageFromError(error)?.includes(x));
  }
- /**
- * Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
- * Returns `false` if the request is not blocked, otherwise returns a string with a description of the block reason.
- * @param _crawlingContext The crawling context to check.
- */
- async isRequestBlocked(_crawlingContext) {
- throw new Error('the "isRequestBlocked" method is not implemented in this crawler.');
- }
  /**
  * This method is periodically called by the crawler, every `statusMessageLoggingInterval` seconds.
  */
@@ -350,7 +403,7 @@ export class BasicCrawler {
  message = `Experiencing problems, ${this.stats.state.requestsFailed - previousState.requestsFailed || this.stats.state.requestsFailed} failed requests in the past ${this.statusMessageLoggingInterval} seconds.`;
  }
  else {
- const total = this.requestQueue?.getTotalCount() || this.requestList?.length();
+ const total = this.requestManager?.getTotalCount();
  message = `Crawled ${this.stats.state.requestsFinished}${total ? `/${total}` : ''} pages, ${this.stats.state.requestsFailed} failed requests, desired concurrency ${this.autoscaledPool?.desiredConcurrency ?? 0}.`;
  }
  if (this.statusMessageCallback) {
@@ -390,20 +443,30 @@ export class BasicCrawler {
  if (this.requestQueue?.name === 'default' && purgeRequestQueue) {
  await this.requestQueue.drop();
  this.requestQueue = await this._getRequestQueue();
+ this.requestManager = undefined;
+ await this.initializeRequestManager();
+ this.handledRequestsCount = 0; // This would've been reset by this._init() further down below, but at that point `handledRequestsCount` could prevent `addRequests` from adding the initial requests
  }
  this.stats.reset();
  await this.stats.resetStore();
  await this.sessionPool?.resetStore();
  }
  this.running = true;
- await purgeDefaultStorages({ onlyPurgeOnce: true });
+ this.shouldLogMaxProcessedRequestsExceeded = true;
+ this.shouldLogMaxEnqueuedRequestsExceeded = true;
+ await purgeDefaultStorages({
+ onlyPurgeOnce: true,
+ client: this.config.getStorageClient(),
+ config: this.config,
+ });
  if (requests) {
  await this.addRequests(requests, addRequestsOptions);
  }
  await this._init();
  await this.stats.startCapturing();
  const periodicLogger = this.getPeriodicLogger();
- await this.setStatusMessage('Starting the crawler.', { level: 'INFO' });
+ // Don't await, we don't want to block the execution
+ void this.setStatusMessage('Starting the crawler.', { level: 'INFO' });
  const sigintHandler = async () => {
  this.log.warning('Pausing... Press CTRL+C again to force exit. To resume, do: CRAWLEE_PURGE_ON_START=0 npm start');
  await this._pauseOnMigration();
@@ -452,7 +515,8 @@ export class BasicCrawler {
  finished = true;
  }
  periodicLogger.stop();
- await this.setStatusMessage(`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${this.stats.state.requestsFinished} succeeded, ${this.stats.state.requestsFailed} failed.`, { isStatusMessageTerminal: true, level: 'INFO' });
+ // Don't await, we don't want to block the execution
+ void this.setStatusMessage(`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${this.stats.state.requestsFinished} succeeded, ${this.stats.state.requestsFailed} failed.`, { isStatusMessageTerminal: true, level: 'INFO' });
  this.running = false;
  this.hasFinishedBefore = true;
  }
@@ -462,6 +526,8 @@ export class BasicCrawler {
  * Gracefully stops the current run of the crawler.
  *
  * All the tasks active at the time of calling this method will be allowed to finish.
+ *
+ * To stop the crawler immediately, use {@link BasicCrawler.teardown|`crawler.teardown()`} instead.
  */
  stop(message = 'The crawler has been gracefully stopped.') {
  // Gracefully starve the this.autoscaledPool, so it doesn't start new tasks. Resolves once the pool is cleared.
@@ -478,13 +544,46 @@ export class BasicCrawler {
  if (!this.requestQueue && this.requestList) {
  this.log.warningOnce('When using RequestList and RequestQueue at the same time, you should instantiate both explicitly and provide them in the crawler options, to ensure correctly handled restarts of the crawler.');
  }
- this.requestQueue ??= await this._getRequestQueue();
+ if (!this.requestQueue) {
+ this.requestQueue = await this._getRequestQueue();
+ this.requestManager = undefined;
+ }
+ if (!this.requestManager) {
+ this.requestManager =
+ this.requestList === undefined
+ ? this.requestQueue
+ : new RequestManagerTandem(this.requestList, this.requestQueue);
+ }
  return this.requestQueue;
  }
  async useState(defaultValue = {}) {
  const kvs = await KeyValueStore.open(null, { config: this.config });
  return kvs.getAutoSavedValue(BasicCrawler.CRAWLEE_STATE_KEY, defaultValue);
  }
+ get pendingRequestCountApproximation() {
+ return this.requestManager?.getPendingCount() ?? 0;
+ }
+ calculateEnqueuedRequestLimit(explicitLimit) {
+ if (this.maxRequestsPerCrawl === undefined) {
+ return explicitLimit;
+ }
+ const limit = Math.max(0, this.maxRequestsPerCrawl - this.handledRequestsCount - this.pendingRequestCountApproximation);
+ return Math.min(limit, explicitLimit ?? Infinity);
+ }
+ async handleSkippedRequest(options) {
+ if (options.reason === 'limit' && this.shouldLogMaxEnqueuedRequestsExceeded) {
+ this.log.info('The number of requests enqueued by the crawler reached the maxRequestsPerCrawl limit of ' +
+ `${this.maxRequestsPerCrawl} requests and no further requests will be added.`);
+ this.shouldLogMaxEnqueuedRequestsExceeded = false;
+ }
+ if (options.reason === 'enqueueLimit') {
+ const enqueuedRequestLimit = this.calculateEnqueuedRequestLimit();
+ if (enqueuedRequestLimit === undefined || enqueuedRequestLimit !== 0) {
+ this.log.info('The number of requests enqueued by the crawler reached the enqueueLinks limit.');
+ }
+ }
+ await this.onSkippedRequest?.(options);
+ }
  /**
  * Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
  * adding the rest in background. You can configure the batch size via `batchSize` option and the sleep time in between
@@ -497,33 +596,57 @@ export class BasicCrawler {
  * @param options Options for the request queue
  */
  async addRequests(requests, options = {}) {
- const requestQueue = await this.getRequestQueue();
- if (!this.respectRobotsTxtFile) {
- return requestQueue.addRequestsBatched(requests, options);
- }
- const allowedRequests = [];
- const skipped = new Set();
- for (const request of requests) {
- const url = typeof request === 'string' ? request : request.url;
- if (await this.isAllowedBasedOnRobotsTxtFile(url)) {
- allowedRequests.push(request);
- }
- else {
- skipped.add(url);
- await this.onSkippedRequest?.({ url, reason: 'robotsTxt' });
+ await this.getRequestQueue();
+ const requestLimit = this.calculateEnqueuedRequestLimit();
+ const skippedBecauseOfRobots = new Set();
+ const skippedBecauseOfLimit = new Set();
+ const skippedBecauseOfMaxCrawlDepth = new Set();
+ const isAllowedBasedOnRobotsTxtFile = this.isAllowedBasedOnRobotsTxtFile.bind(this);
+ const maxCrawlDepth = this.maxCrawlDepth;
+ ow(requests, ow.object
+ .is((value) => isIterable(value) || isAsyncIterable(value))
+ .message((value) => `Expected an iterable or async iterable, got ${getObjectType(value)}`));
+ async function* filteredRequests() {
+ let yieldedRequestCount = 0;
+ for await (const request of requests) {
+ const url = typeof request === 'string' ? request : request.url;
+ if (requestLimit !== undefined && yieldedRequestCount >= requestLimit) {
+ skippedBecauseOfLimit.add(url);
+ continue;
+ }
+ if (maxCrawlDepth !== undefined && request.crawlDepth > maxCrawlDepth) {
+ skippedBecauseOfMaxCrawlDepth.add(url);
+ continue;
+ }
+ if (await isAllowedBasedOnRobotsTxtFile(url)) {
+ yield request;
+ yieldedRequestCount += 1;
+ }
+ else {
+ skippedBecauseOfRobots.add(url);
+ }
  }
  }
- if (skipped.size > 0) {
+ const result = await this.requestManager.addRequestsBatched(filteredRequests(), options);
+ if (skippedBecauseOfRobots.size > 0) {
  this.log.warning(`Some requests were skipped because they were disallowed based on the robots.txt file`, {
- skipped: [...skipped],
+ skipped: [...skippedBecauseOfRobots],
  });
- if (this.onSkippedRequest) {
- await Promise.all([...skipped].map((url) => {
- return this.onSkippedRequest({ url, reason: 'robotsTxt' });
- }));
- }
  }
- return requestQueue.addRequestsBatched(allowedRequests, options);
+ if (skippedBecauseOfRobots.size > 0 ||
+ skippedBecauseOfLimit.size > 0 ||
+ skippedBecauseOfMaxCrawlDepth.size > 0) {
+ await Promise.all([...skippedBecauseOfRobots]
+ .map((url) => {
+ return this.handleSkippedRequest({ url, reason: 'robotsTxt' });
+ })
+ .concat([...skippedBecauseOfLimit].map((url) => {
+ return this.handleSkippedRequest({ url, reason: 'limit' });
+ }), [...skippedBecauseOfMaxCrawlDepth].map((url) => {
+ return this.handleSkippedRequest({ url, reason: 'depth' });
+ })));
+ }
+ return result;
  }
  /**
  * Pushes data to the specified {@link Dataset}, or the default crawler {@link Dataset} by calling {@link Dataset.pushData}.
@@ -563,7 +686,21 @@ export class BasicCrawler {
  const dataset = await this.getDataset();
  const items = await dataset.export(options);
  if (format === 'csv') {
- const value = stringify([Object.keys(items[0]), ...items.map((item) => Object.values(item))]);
+ let value;
+ if (items.length === 0) {
+ value = '';
+ }
+ else {
+ const keys = options?.collectAllKeys
+ ? Array.from(new Set(items.flatMap(Object.keys)))
+ : Object.keys(items[0]);
+ value = stringify([
+ keys,
+ ...items.map((item) => {
+ return keys.map((k) => item[k]);
+ }),
+ ]);
+ }
  await ensureDir(dirname(path));
  await writeFile(path, value);
  this.log.info(`Export to ${path} finished!`);
@@ -575,6 +712,9 @@ export class BasicCrawler {
  }
  return items;
  }
+ /**
+ * Initializes the crawler.
+ */
  async _init() {
  if (!this.events.isInitialized()) {
  await this.events.init();
@@ -589,10 +729,13 @@ export class BasicCrawler {
  // Assuming there are not more than 20 browsers running at once;
  this.sessionPool.setMaxListeners(20);
  }
+ await this.initializeRequestManager();
  await this._loadHandledRequestCount();
  }
- async _runRequestHandler(crawlingContext) {
- await this.requestHandler(crawlingContext);
+ async runRequestHandler(crawlingContext) {
+ await this.contextPipeline.call(crawlingContext, async (finalContext) => {
+ await addTimeoutToPromise(async () => this.requestHandler(finalContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${finalContext.request.id}).`);
+ });
  }
  /**
  * Handles blocked request
@@ -608,7 +751,8 @@ export class BasicCrawler {
  return true;
  }
  const robotsTxtFile = await this.getRobotsTxtFileForUrl(url);
- return !robotsTxtFile || robotsTxtFile.isAllowed(url);
+ const userAgent = typeof this.respectRobotsTxtFile === 'object' ? this.respectRobotsTxtFile?.userAgent : '*';
+ return !robotsTxtFile || robotsTxtFile.isAllowed(url, userAgent);
  }
  async getRobotsTxtFileForUrl(url) {
  if (!this.respectRobotsTxtFile) {
@@ -662,36 +806,36 @@ export class BasicCrawler {
  await Promise.all([requestListPersistPromise, this.stats.persistState()]);
  }
  /**
- * Fetches request from either RequestList or RequestQueue. If request comes from a RequestList
- * and RequestQueue is present then enqueues it to the queue first.
+ * Initializes the RequestManager based on the configured requestList and requestQueue.
  */
- async _fetchNextRequest() {
- if (!this.requestList || (await this.requestList.isFinished())) {
- return this.requestQueue?.fetchNextRequest();
- }
- const request = await this.requestList.fetchNextRequest();
- if (!this.requestQueue)
- return request;
- if (!request)
- return this.requestQueue.fetchNextRequest();
- try {
- await this.requestQueue.addRequest(request, { forefront: true });
+ async initializeRequestManager() {
+ if (this.requestManager !== undefined) {
+ return;
+ }
+ if (this.requestList && this.requestQueue) {
+ // Create a RequestManagerTandem if both RequestList and RequestQueue are provided
+ this.requestManager = new RequestManagerTandem(this.requestList, this.requestQueue);
  }
- catch (err) {
- // If requestQueue.addRequest() fails here then we must reclaim it back to
- // the RequestList because probably it's not yet in the queue!
- this.log.error('Adding of request from the RequestList to the RequestQueue failed, reclaiming request back to the list.', { request });
- await this.requestList.reclaimRequest(request);
- return null;
+ else if (this.requestQueue) {
+ // Use RequestQueue directly if only it is provided
+ this.requestManager = this.requestQueue;
  }
- await this.requestList.markRequestHandled(request);
- return this.requestQueue.fetchNextRequest();
+ else if (this.requestList) {
+ // Use RequestList directly if only it is provided
+ // Make it compatible with the IRequestManager interface
+ this.requestManager = new RequestListAdapter(this.requestList);
+ }
+ // If neither RequestList nor RequestQueue is provided, leave the requestManager uninitialized until `getRequestQueue` is called
  }
  /**
- * Executed when `errorHandler` finishes or the request is successful.
- * Can be used to clean up orphaned browser pages.
+ * Fetches the next request to process from the underlying request provider.
  */
- async _cleanupContext(_crawlingContext) { }
+ async _fetchNextRequest() {
+ if (this.requestManager === undefined) {
+ throw new Error(`_fetchNextRequest called on an uninitialized crawler`);
+ }
+ return this.requestManager.fetchNextRequest();
+ }
  /**
  * Delays processing of the request based on the `sameDomainDelaySecs` option,
  * adding it back to the queue after the timeout passes. Returns `true` if the request
@@ -729,18 +873,21 @@ export class BasicCrawler {
  * then retries them in a case of an error, etc.
  */
  async _runTaskFunction() {
- const source = this.requestQueue || this.requestList || (await this.getRequestQueue());
- let request;
- let session;
- await this._timeoutAndRetry(async () => {
- request = await this._fetchNextRequest();
- }, this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
+ const source = this.requestManager;
+ if (!source)
+ throw new Error('Request provider is not initialized!');
+ const request = await this._timeoutAndRetry(this._fetchNextRequest.bind(this), this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
  tryCancel();
- if (this.useSessionPool) {
- await this._timeoutAndRetry(async () => {
- session = await this.sessionPool.getSession();
- }, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
- }
+ const session = this.useSessionPool
+ ? await this._timeoutAndRetry(async () => {
+ return await this.sessionPool.newSession({
+ proxyInfo: await this.proxyConfiguration?.newProxyInfo({
+ request: request ?? undefined,
+ }),
+ maxUsageCount: 1,
+ });
+ }, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`)
+ : undefined;
  tryCancel();
  if (!request || this.delayRequest(request, source)) {
  return;
@@ -750,7 +897,7 @@ export class BasicCrawler {
  request.state = RequestState.SKIPPED;
  request.noRetry = true;
  await source.markRequestHandled(request);
- await this.onSkippedRequest?.({
+ await this.handleSkippedRequest({
  url: request.url,
  reason: 'robotsTxt',
  });
@@ -760,36 +907,34 @@ export class BasicCrawler {
  request.loadedUrl = undefined;
  const statisticsId = request.id || request.uniqueKey;
  this.stats.startJob(statisticsId);
- // Shared crawling context
- // @ts-expect-error
- // All missing properties (that extend CrawlingContext) are set dynamically,
- // but TS does not know that, so otherwise it would throw when compiling.
+ const deferredCleanup = [];
  const crawlingContext = {
  id: cryptoRandomObjectId(10),
- crawler: this,
  log: this.log,
  request,
  session,
+ proxyInfo: session?.proxyInfo,
  enqueueLinks: async (options) => {
- return enqueueLinks({
- // specify the RQ first to allow overriding it
- requestQueue: await this.getRequestQueue(),
- robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
- onSkippedRequest: this.onSkippedRequest,
- ...options,
- });
+ const requestQueue = await this.getRequestQueue();
+ return await this.enqueueLinksWithCrawlDepth(options, request, requestQueue);
+ },
+ addRequests: async (requests, options = {}) => {
+ const newCrawlDepth = request.crawlDepth + 1;
+ const requestsGenerator = this.addCrawlDepthRequestGenerator(requests, newCrawlDepth);
+ await this.addRequests(requestsGenerator, options);
  },
- addRequests: this.addRequests.bind(this),
  pushData: this.pushData.bind(this),
  useState: this.useState.bind(this),
- sendRequest: createSendRequest(this.httpClient, request, session, () => crawlingContext.proxyInfo?.url),
+ sendRequest: createSendRequest(this.httpClient, request, session),
  getKeyValueStore: async (idOrName) => KeyValueStore.open(idOrName, { config: this.config }),
+ registerDeferredCleanup: (cleanup) => {
+ deferredCleanup.push(cleanup);
+ },
  };
- this.crawlingContexts.set(crawlingContext.id, crawlingContext);
  let isRequestLocked = true;
  try {
  request.state = RequestState.REQUEST_HANDLER;
- await addTimeoutToPromise(async () => this._runRequestHandler(crawlingContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${request.id}).`);
+ await this.runRequestHandler(crawlingContext);
  await this._timeoutAndRetry(async () => source.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
  isRequestLocked = false; // markRequestHandled succeeded and unlocked the request
  this.stats.finishJob(statisticsId, request.retryCount);
@@ -798,7 +943,8 @@ export class BasicCrawler {
  request.state = RequestState.DONE;
  crawlingContext.session?.markGood();
  }
- catch (err) {
+ catch (rawError) {
+ const err = this.unwrapError(rawError);
  try {
  request.state = RequestState.ERROR_HANDLER;
  await addTimeoutToPromise(async () => this._requestFunctionErrorHandler(err, crawlingContext, source), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
@@ -808,24 +954,24 @@ export class BasicCrawler {
  request.state = RequestState.DONE;
  }
  catch (secondaryError) {
- if (!secondaryError.triggeredFromUserHandler &&
+ const unwrappedSecondaryError = this.unwrapError(secondaryError);
+ if (!unwrappedSecondaryError.triggeredFromUserHandler &&
  // avoid reprinting the same critical error multiple times, as it will be printed by Nodejs at the end anyway
- !(secondaryError instanceof CriticalError)) {
+ !(unwrappedSecondaryError instanceof CriticalError)) {
  const apifySpecific = process.env.APIFY_IS_AT_HOME
  ? `This may have happened due to an internal error of Apify's API or due to a misconfigured crawler.`
  : '';
- this.log.exception(secondaryError, 'An exception occurred during handling of failed request. ' +
+ this.log.exception(unwrappedSecondaryError, 'An exception occurred during handling of failed request. ' +
  `This places the crawler and its underlying storages into an unknown state and crawling will be terminated. ${apifySpecific}`);
  }
  request.state = RequestState.ERROR;
- throw secondaryError;
+ throw unwrappedSecondaryError;
  }
  // decrease the session score if the request fails (but the error handler did not throw)
  crawlingContext.session?.markBad();
  }
  finally {
- await this._cleanupContext(crawlingContext);
- this.crawlingContexts.delete(crawlingContext.id);
+ await Promise.all(deferredCleanup.map((cleanup) => cleanup()));
  // Safety net - release the lock if nobody managed to do it before
  if (isRequestLocked && source instanceof RequestProvider) {
  try {
@@ -838,19 +984,60 @@ export class BasicCrawler {
  }
  }
  /**
- * Run async callback with given timeout and retry.
+ * Wrapper around the crawling context's `enqueueLinks` method:
+ * - Injects `crawlDepth` to each request being added based on the crawling context request.
+ * - Provides defaults for the `enqueueLinks` options based on the crawler configuration.
+ * - These options can be overridden by the user.
+ * @internal
+ */
+ async enqueueLinksWithCrawlDepth(options, request, requestQueue) {
+ const transformRequestFunctionWrapper = (newRequest) => {
+ newRequest.crawlDepth = request.crawlDepth + 1;
+ if (this.maxCrawlDepth !== undefined && newRequest.crawlDepth > this.maxCrawlDepth) {
+ newRequest.skippedReason = 'depth';
+ return false;
+ }
+ // After injecting the crawlDepth, we call the user-provided transform function, if there is one.
+ return options.transformRequestFunction?.(newRequest) ?? newRequest;
+ };
+ return await enqueueLinks({
+ requestQueue,
+ robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
+ onSkippedRequest: this.handleSkippedRequest,
+ limit: this.calculateEnqueuedRequestLimit(options.limit),
+ // Allow user options to override defaults set above ⤴
+ ...options,
+ transformRequestFunction: transformRequestFunctionWrapper,
+ });
+ }
+ /**
+ * Generator function that yields requests injected with the given crawl depth.
+ * @internal
+ */
+ async *addCrawlDepthRequestGenerator(requests, newRequestDepth) {
+ for await (const request of requests) {
+ if (typeof request === 'string') {
+ yield { url: request, crawlDepth: newRequestDepth };
+ }
+ else {
+ request.crawlDepth ??= newRequestDepth;
+ yield request;
+ }
+ }
+ }
+ /**
+ * Run async callback with given timeout and retry. Returns the result of the callback.
  * @ignore
  */
  async _timeoutAndRetry(handler, timeout, error, maxRetries = 3, retried = 1) {
  try {
- await addTimeoutToPromise(handler, timeout, error);
+ return await addTimeoutToPromise(handler, timeout, error);
  }
  catch (e) {
  if (retried <= maxRetries) {
  // we retry on any error, not just timeout
  this.log.warning(`${e.message} (retrying ${retried}/${maxRetries})`);
- void this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1);
- return;
+ return this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1);
  }
  throw e;
  }
@@ -859,24 +1046,13 @@ export class BasicCrawler {
  * Returns true if either RequestList or RequestQueue have a request ready for processing.
  */
  async _isTaskReadyFunction() {
- // First check RequestList, since it's only in memory.
- const isRequestListEmpty = this.requestList ? await this.requestList.isEmpty() : true;
- // If RequestList is not empty, task is ready, no reason to check RequestQueue.
- if (!isRequestListEmpty)
- return true;
- // If RequestQueue is not empty, task is ready, return true, otherwise false.
- return this.requestQueue ? !(await this.requestQueue.isEmpty()) : false;
+ return this.requestManager !== undefined && !(await this.requestManager.isEmpty());
  }
  /**
  * Returns true if both RequestList and RequestQueue have all requests finished.
  */
  async _defaultIsFinishedFunction() {
- const [isRequestListFinished, isRequestQueueFinished] = await Promise.all([
- this.requestList ? this.requestList.isFinished() : true,
- this.requestQueue ? this.requestQueue.isFinished() : true,
- ]);
- // If both are finished, return true, otherwise return false.
- return isRequestListFinished && isRequestQueueFinished;
+ return !this.requestManager || (await this.requestManager.isFinished());
  }
  async _rotateSession(crawlingContext) {
  const { request } = crawlingContext;
@@ -884,6 +1060,18 @@ export class BasicCrawler {
  request.sessionRotationCount++;
  crawlingContext.session?.retire();
  }
+ /**
+ * Unwraps errors thrown by the context pipeline to get the actual user error.
+ * RequestHandlerError and ContextPipelineInitializationError wrap the actual error.
+ */
+ unwrapError(error) {
+ if (error instanceof RequestHandlerError ||
+ error instanceof ContextPipelineInitializationError ||
+ error instanceof ContextPipelineCleanupError) {
+ return this.unwrapError(error.cause);
+ }
+ return error;
+ }
  /**
  * Handles errors thrown by user provided requestHandler()
  */
@@ -896,7 +1084,8 @@ export class BasicCrawler {
  const shouldRetryRequest = this._canRequestBeRetried(request, error);
  if (shouldRetryRequest) {
  await this.stats.errorTrackerRetry.addAsync(error, crawlingContext);
- await this.errorHandler?.(crawlingContext, error);
+ await this.errorHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
+ error);
  if (error instanceof SessionError) {
  await this._rotateSession(crawlingContext);
  }
@@ -948,7 +1137,8 @@ export class BasicCrawler {
  const message = this._getMessageFromError(error, true);
  this.log.error(`Request failed and reached maximum retries. ${message}`, { id, url, method, uniqueKey });
  if (this.failedRequestHandler) {
- await this.failedRequestHandler?.(crawlingContext, error);
+ await this.failedRequestHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
+ error);
  }
  }
  /**
@@ -986,19 +1176,11 @@ export class BasicCrawler {
  return request.retryCount < maxRequestRetries;
  }
  /**
- * Updates handledRequestsCount from possibly stored counts,
- * usually after worker migration. Since one of the stores
- * needs to have priority when both are present,
- * it is the request queue, because generally, the request
- * list will first be dumped into the queue and then left
- * empty.
+ * Updates handledRequestsCount from possibly stored counts, usually after worker migration.
  */
  async _loadHandledRequestCount() {
- if (this.requestQueue) {
- this.handledRequestsCount = await this.requestQueue.handledCount();
- }
- else if (this.requestList) {
- this.handledRequestsCount = this.requestList.handledCount();
+ if (this.requestManager) {
+ this.handledRequestsCount = await this.requestManager.handledCount();
  }
  }
  async _executeHooks(hooks, ...args) {
@@ -1009,14 +1191,15 @@ export class BasicCrawler {
  }
  }
  /**
- * Function for cleaning up after all request are processed.
- * @ignore
+ * Stops the crawler immediately.
+ *
+ * This method doesn't wait for currently active requests to finish.
+ *
+ * To stop the crawler gracefully (waiting for all running requests to finish), use {@link BasicCrawler.stop|`crawler.stop()`} instead.
  */
  async teardown() {
  this.events.emit("persistState" /* EventType.PERSIST_STATE */, { isMigrating: false });
- if (this.useSessionPool) {
- await this.sessionPool.teardown();
- }
+ await this.sessionPool?.teardown();
  if (this._closeEvents) {
  await this.events.close();
  }
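
Taken together, the hunks above add new BasicCrawler surface: a requestManager option (mutually exclusive with requestList/requestQueue per the constructor hunk), a proxyConfiguration option, maxCrawlDepth, extendContext, an object form of respectRobotsTxtFile, and context helpers such as registerDeferredCleanup. A minimal usage sketch in TypeScript, assuming these option names behave as the diff suggests and the rest of the API matches Crawlee 3.x; exact signatures in 4.0.0-beta.20 may differ, and the URLs and user-agent string are illustrative only:

import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    // Options introduced in this beta (names taken from the diff; semantics are assumptions):
    maxCrawlDepth: 2,                                  // requests nested deeper than two hops are skipped with reason 'depth'
    respectRobotsTxtFile: { userAgent: 'my-crawler' }, // object form selects the robots.txt user agent to match against
    maxRequestsPerCrawl: 100,                          // now also caps how many requests addRequests/enqueueLinks will accept
    async requestHandler({ request, pushData, addRequests }) {
        await pushData({ url: request.url, loadedUrl: request.loadedUrl });
        // Requests added from a handler inherit request.crawlDepth + 1 via the addRequests wrapper shown in the diff.
        await addRequests([`${request.url}?page=2`]);
    },
});

await crawler.run(['https://example.com']);

The proxyConfiguration and requestManager options are omitted from the sketch; per the constructor hunk, requestManager cannot be combined with requestList or requestQueue.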