@crawlee/basic 3.13.6-beta.0 → 4.0.0-beta.0

This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
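The headline change in this major version is the switch from transpiled CommonJS to native ESM: the `tslib` interop helpers, `exports` assignments, and `(0, ns.fn)(...)` call sites disappear, relative imports gain explicit `.js` extensions, `fs-extra` is consumed through its `fs-extra/esm` entry point, and `writeFile` now comes from `node:fs/promises`. For consumers, the practical effect is in how the package is loaded; a minimal sketch (the handler body is illustrative, and whether `require()` of an ESM package works depends on your Node.js version):

```js
// v3 shipped CommonJS, so this worked from any module system:
// const { BasicCrawler } = require('@crawlee/basic');

// v4 ships native ESM; use import, or dynamic import() from CJS code.
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    requestHandler: async ({ request, log }) => {
        log.info(`Processing ${request.url}`);
    },
});

await crawler.run(['https://example.com']);
```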
@@ -1,20 +1,16 @@
- "use strict";
- Object.defineProperty(exports, "__esModule", { value: true });
- exports.BasicCrawler = void 0;
- exports.createBasicRouter = createBasicRouter;
- const tslib_1 = require("tslib");
- const node_path_1 = require("node:path");
- const core_1 = require("@crawlee/core");
- const utils_1 = require("@crawlee/utils");
- const sync_1 = require("csv-stringify/sync");
- const fs_extra_1 = require("fs-extra");
- const ow_1 = tslib_1.__importStar(require("ow"));
- const tldts_1 = require("tldts");
- const datastructures_1 = require("@apify/datastructures");
- const log_1 = tslib_1.__importStar(require("@apify/log"));
- const timeout_1 = require("@apify/timeout");
- const utilities_1 = require("@apify/utilities");
- const send_request_1 = require("./send-request");
+ import { writeFile } from 'node:fs/promises';
+ import { dirname } from 'node:path';
+ import { AutoscaledPool, Configuration, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
+ import { RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
+ import { stringify } from 'csv-stringify/sync';
+ import { ensureDir, writeJSON } from 'fs-extra/esm';
+ import ow from 'ow';
+ import { getDomain } from 'tldts';
+ import { LruCache } from '@apify/datastructures';
+ import defaultLog, { LogLevel } from '@apify/log';
+ import { addTimeoutToPromise, TimeoutError, tryCancel } from '@apify/timeout';
+ import { cryptoRandomObjectId } from '@apify/utilities';
+ import { createSendRequest } from './send-request.js';
  /**
   * Since there's no set number of seconds before the container is terminated after
   * a migration event, we need some reasonable number to use for RequestList persistence.
@@ -89,308 +85,137 @@ const SAFE_MIGRATION_WAIT_MILLIS = 20000;
  * ```
  * @category Crawlers
  */
- class BasicCrawler {
+ export class BasicCrawler {
+ config;
+ static CRAWLEE_STATE_KEY = 'CRAWLEE_STATE';
+ /**
+ * A reference to the underlying {@link Statistics} class that collects and logs run statistics for requests.
+ */
+ stats;
+ /**
+ * A reference to the underlying {@link RequestList} class that manages the crawler's {@link Request|requests}.
+ * Only available if used by the crawler.
+ */
+ requestList;
+ /**
+ * Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
+ * A reference to the underlying {@link RequestQueue} class that manages the crawler's {@link Request|requests}.
+ * Only available if used by the crawler.
+ */
+ requestQueue;
+ /**
+ * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session|sessions}.
+ * Only available if used by the crawler.
+ */
+ sessionPool;
+ /**
+ * A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
+ * > *NOTE:* This property is only initialized after calling the {@link BasicCrawler.run|`crawler.run()`} function.
+ * We can use it to change the concurrency settings on the fly,
+ * to pause the crawler by calling {@link AutoscaledPool.pause|`autoscaledPool.pause()`}
+ * or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
+ */
+ autoscaledPool;
+ /**
+ * Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
+ * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
+ */
+ router = Router.create();
+ running = false;
+ hasFinishedBefore = false;
+ log;
+ requestHandler;
+ errorHandler;
+ failedRequestHandler;
+ requestHandlerTimeoutMillis;
+ internalTimeoutMillis;
+ maxRequestRetries;
+ sameDomainDelayMillis;
+ domainAccessedTime;
+ maxSessionRotations;
+ handledRequestsCount;
+ statusMessageLoggingInterval;
+ statusMessageCallback;
+ sessionPoolOptions;
+ useSessionPool;
+ crawlingContexts = new Map();
+ autoscaledPoolOptions;
+ events;
+ httpClient;
+ retryOnBlocked;
+ respectRobotsTxtFile;
+ onSkippedRequest;
+ _closeEvents;
+ experiments;
+ robotsTxtFileCache;
+ _experimentWarnings = {};
+ static optionsShape = {
+ requestList: ow.optional.object.validate(validators.requestList),
+ requestQueue: ow.optional.object.validate(validators.requestQueue),
+ // Subclasses override this function instead of passing it
+ // in constructor, so this validation needs to apply only
+ // if the user creates an instance of BasicCrawler directly.
+ requestHandler: ow.optional.function,
+ requestHandlerTimeoutSecs: ow.optional.number,
+ errorHandler: ow.optional.function,
+ failedRequestHandler: ow.optional.function,
+ maxRequestRetries: ow.optional.number,
+ sameDomainDelaySecs: ow.optional.number,
+ maxSessionRotations: ow.optional.number,
+ maxRequestsPerCrawl: ow.optional.number,
+ autoscaledPoolOptions: ow.optional.object,
+ sessionPoolOptions: ow.optional.object,
+ useSessionPool: ow.optional.boolean,
+ statusMessageLoggingInterval: ow.optional.number,
+ statusMessageCallback: ow.optional.function,
+ retryOnBlocked: ow.optional.boolean,
+ respectRobotsTxtFile: ow.optional.boolean,
+ onSkippedRequest: ow.optional.function,
+ httpClient: ow.optional.object,
+ // AutoscaledPool shorthands
+ minConcurrency: ow.optional.number,
+ maxConcurrency: ow.optional.number,
+ maxRequestsPerMinute: ow.optional.number.integerOrInfinite.positive.greaterThanOrEqual(1),
+ keepAlive: ow.optional.boolean,
+ // internal
+ log: ow.optional.object,
+ experiments: ow.optional.object,
+ statisticsOptions: ow.optional.object,
+ };
  /**
   * All `BasicCrawler` parameters are passed via an options object.
   */
- constructor(options = {}, config = core_1.Configuration.getGlobalConfig()) {
- Object.defineProperty(this, "config", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: config
- });
- /**
- * A reference to the underlying {@link Statistics} class that collects and logs run statistics for requests.
- */
- Object.defineProperty(this, "stats", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- /**
- * A reference to the underlying {@link RequestList} class that manages the crawler's {@link Request|requests}.
- * Only available if used by the crawler.
- */
- Object.defineProperty(this, "requestList", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- /**
- * Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
- * A reference to the underlying {@link RequestQueue} class that manages the crawler's {@link Request|requests}.
- * Only available if used by the crawler.
- */
- Object.defineProperty(this, "requestQueue", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- /**
- * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session|sessions}.
- * Only available if used by the crawler.
- */
- Object.defineProperty(this, "sessionPool", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- /**
- * A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
- * > *NOTE:* This property is only initialized after calling the {@link BasicCrawler.run|`crawler.run()`} function.
- * We can use it to change the concurrency settings on the fly,
- * to pause the crawler by calling {@link AutoscaledPool.pause|`autoscaledPool.pause()`}
- * or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
- */
- Object.defineProperty(this, "autoscaledPool", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- /**
- * Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
- * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
- */
- Object.defineProperty(this, "router", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: core_1.Router.create()
- });
- Object.defineProperty(this, "running", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: false
- });
- Object.defineProperty(this, "hasFinishedBefore", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: false
- });
- Object.defineProperty(this, "log", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "requestHandler", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "errorHandler", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "failedRequestHandler", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "requestHandlerTimeoutMillis", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "internalTimeoutMillis", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "maxRequestRetries", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "sameDomainDelayMillis", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "domainAccessedTime", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "maxSessionRotations", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "handledRequestsCount", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "statusMessageLoggingInterval", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "statusMessageCallback", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "sessionPoolOptions", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "useSessionPool", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "crawlingContexts", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: new Map()
- });
- Object.defineProperty(this, "autoscaledPoolOptions", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "events", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "httpClient", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "retryOnBlocked", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "respectRobotsTxtFile", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "onSkippedRequest", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "_closeEvents", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "experiments", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "robotsTxtFileCache", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: void 0
- });
- Object.defineProperty(this, "_experimentWarnings", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: {}
- });
- (0, ow_1.default)(options, 'BasicCrawlerOptions', ow_1.default.object.exactShape(BasicCrawler.optionsShape));
+ constructor(options = {}, config = Configuration.getGlobalConfig()) {
+ this.config = config;
+ ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape));
  const { requestList, requestQueue, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true,
  // AutoscaledPool shorthands
- minConcurrency, maxConcurrency, maxRequestsPerMinute, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest,
+ minConcurrency, maxConcurrency, maxRequestsPerMinute, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
  // internal
- log = log_1.default.child({ prefix: this.constructor.name }), experiments = {},
- // Old and new request handler methods
- handleRequestFunction, requestHandler, handleRequestTimeoutSecs, requestHandlerTimeoutSecs, errorHandler, handleFailedRequestFunction, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient, } = options;
+ log = defaultLog.child({ prefix: this.constructor.name }), experiments = {}, } = options;
  this.requestList = requestList;
  this.requestQueue = requestQueue;
- this.httpClient = httpClient ?? new core_1.GotScrapingHttpClient();
+ this.httpClient = httpClient ?? new GotScrapingHttpClient();
  this.log = log;
  this.statusMessageLoggingInterval = statusMessageLoggingInterval;
  this.statusMessageCallback = statusMessageCallback;
  this.events = config.getEventManager();
  this.domainAccessedTime = new Map();
  this.experiments = experiments;
- this.robotsTxtFileCache = new datastructures_1.LruCache({ maxLength: 1000 });
- this._handlePropertyNameChange({
- newName: 'requestHandler',
- oldName: 'handleRequestFunction',
- propertyKey: 'requestHandler',
- newProperty: requestHandler,
- oldProperty: handleRequestFunction,
- allowUndefined: true, // fallback to the default router
- });
- if (!this.requestHandler) {
- this.requestHandler = this.router;
- }
+ this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });
+ // FIXME any
+ this.requestHandler = requestHandler ?? this.router;
+ this.failedRequestHandler = failedRequestHandler;
  this.errorHandler = errorHandler;
- this._handlePropertyNameChange({
- newName: 'failedRequestHandler',
- oldName: 'handleFailedRequestFunction',
- propertyKey: 'failedRequestHandler',
- newProperty: failedRequestHandler,
- oldProperty: handleFailedRequestFunction,
- allowUndefined: true,
- });
- let newRequestHandlerTimeout;
- if (!handleRequestTimeoutSecs) {
- if (!requestHandlerTimeoutSecs) {
- newRequestHandlerTimeout = 60000;
- }
- else {
- newRequestHandlerTimeout = requestHandlerTimeoutSecs * 1000;
- }
+ if (requestHandlerTimeoutSecs) {
+ this.requestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
  }
- else if (requestHandlerTimeoutSecs) {
- newRequestHandlerTimeout = requestHandlerTimeoutSecs * 1000;
+ else {
+ this.requestHandlerTimeoutMillis = 60_000;
  }
  this.retryOnBlocked = retryOnBlocked;
  this.respectRobotsTxtFile = respectRobotsTxtFile;
  this.onSkippedRequest = onSkippedRequest;
- this._handlePropertyNameChange({
- newName: 'requestHandlerTimeoutSecs',
- oldName: 'handleRequestTimeoutSecs',
- propertyKey: 'requestHandlerTimeoutMillis',
- newProperty: newRequestHandlerTimeout,
- oldProperty: handleRequestTimeoutSecs ? handleRequestTimeoutSecs * 1000 : undefined,
- });
  const tryEnv = (val) => (val == null ? null : +val);
  // allow at least 5min for internal timeouts
  this.internalTimeoutMillis =
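
The bulk of the removal above is mechanical: every `Object.defineProperty(this, "x", { ... })` block that the down-leveled v3 output emitted in the constructor collapses into a native class field declaration on the class body (`stats;`, `router = Router.create();`, and so on). The semantics are equivalent, because class fields are also defined as enumerable, configurable, writable own properties; a minimal sketch of the correspondence, using one field from this diff:

```js
// What the v3 (transpiled) output did for each field:
class Before {
    constructor() {
        Object.defineProperty(this, "running", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: false,
        });
    }
}

// What the v4 output declares instead; a class field desugars
// to the same property descriptor on the instance.
class After {
    running = false;
}
```
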
@@ -406,7 +231,7 @@ class BasicCrawler {
  this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
  this.maxSessionRotations = maxSessionRotations;
  this.handledRequestsCount = 0;
- this.stats = new core_1.Statistics({
+ this.stats = new Statistics({
  logMessage: `${log.getOptions().prefix} request statistics:`,
  log,
  config,
@@ -484,7 +309,7 @@ class BasicCrawler {
  * @param error The error to check.
  */
  isProxyError(error) {
- return utils_1.ROTATE_PROXY_ERRORS.some((x) => this._getMessageFromError(error)?.includes(x));
+ return ROTATE_PROXY_ERRORS.some((x) => this._getMessageFromError(error)?.includes(x));
  }
  /**
  * Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
@@ -499,13 +324,13 @@ class BasicCrawler {
  */
  async setStatusMessage(message, options = {}) {
  const data = options.isStatusMessageTerminal != null ? { terminal: options.isStatusMessageTerminal } : undefined;
- this.log.internal(log_1.LogLevel[options.level ?? 'DEBUG'], message, data);
+ this.log.internal(LogLevel[options.level ?? 'DEBUG'], message, data);
  const client = this.config.getStorageClient();
  if (!client.setStatusMessage) {
  return;
  }
  // just to be sure, this should be fast
- await (0, timeout_1.addTimeoutToPromise)(async () => client.setStatusMessage(message, options), 1000, 'Setting status message timed out after 1s').catch((e) => this.log.debug(e.message));
+ await addTimeoutToPromise(async () => client.setStatusMessage(message, options), 1000, 'Setting status message timed out after 1s').catch((e) => this.log.debug(e.message));
  }
  getPeriodicLogger() {
  let previousState = { ...this.stats.state };
@@ -571,7 +396,7 @@ class BasicCrawler {
  await this.sessionPool?.resetStore();
  }
  this.running = true;
- await (0, core_1.purgeDefaultStorages)({ onlyPurgeOnce: true });
+ await purgeDefaultStorages({ onlyPurgeOnce: true });
  if (requests) {
  await this.addRequests(requests, addRequestsOptions);
  }
@@ -653,11 +478,11 @@ class BasicCrawler {
  if (!this.requestQueue && this.requestList) {
  this.log.warningOnce('When using RequestList and RequestQueue at the same time, you should instantiate both explicitly and provide them in the crawler options, to ensure correctly handled restarts of the crawler.');
  }
- this.requestQueue ?? (this.requestQueue = await this._getRequestQueue());
+ this.requestQueue ??= await this._getRequestQueue();
  return this.requestQueue;
  }
  async useState(defaultValue = {}) {
- const kvs = await core_1.KeyValueStore.open(null, { config: this.config });
+ const kvs = await KeyValueStore.open(null, { config: this.config });
  return kvs.getAutoSavedValue(BasicCrawler.CRAWLEE_STATE_KEY, defaultValue);
  }
  /**
@@ -711,7 +536,7 @@ class BasicCrawler {
  * Retrieves the specified {@link Dataset}, or the default crawler {@link Dataset}.
  */
  async getDataset(idOrName) {
- return core_1.Dataset.open(idOrName, { config: this.config });
+ return Dataset.open(idOrName, { config: this.config });
  }
  /**
  * Retrieves data from the default crawler {@link Dataset} by calling {@link Dataset.getData}.
@@ -738,14 +563,14 @@ class BasicCrawler {
  const dataset = await this.getDataset();
  const items = await dataset.export(options);
  if (format === 'csv') {
- const value = (0, sync_1.stringify)([Object.keys(items[0]), ...items.map((item) => Object.values(item))]);
- await (0, fs_extra_1.ensureDir)((0, node_path_1.dirname)(path));
- await (0, fs_extra_1.writeFile)(path, value);
+ const value = stringify([Object.keys(items[0]), ...items.map((item) => Object.values(item))]);
+ await ensureDir(dirname(path));
+ await writeFile(path, value);
  this.log.info(`Export to ${path} finished!`);
  }
  if (format === 'json') {
- await (0, fs_extra_1.ensureDir)((0, node_path_1.dirname)(path));
- await (0, fs_extra_1.writeJSON)(path, items, { spaces: 4 });
+ await ensureDir(dirname(path));
+ await writeJSON(path, items, { spaces: 4 });
  this.log.info(`Export to ${path} finished!`);
  }
  return items;
@@ -758,9 +583,9 @@ class BasicCrawler {
  // Initialize AutoscaledPool before awaiting _loadHandledRequestCount(),
  // so that the caller can get a reference to it before awaiting the promise returned from run()
  // (otherwise there would be no way)
- this.autoscaledPool = new core_1.AutoscaledPool(this.autoscaledPoolOptions, this.config);
+ this.autoscaledPool = new AutoscaledPool(this.autoscaledPoolOptions, this.config);
  if (this.useSessionPool) {
- this.sessionPool = await core_1.SessionPool.open(this.sessionPoolOptions, this.config);
+ this.sessionPool = await SessionPool.open(this.sessionPoolOptions, this.config);
  // Assuming there are not more than 20 browsers running at once;
  this.sessionPool.setMaxListeners(20);
  }
@@ -795,7 +620,7 @@ class BasicCrawler {
  if (cachedRobotsTxtFile) {
  return cachedRobotsTxtFile;
  }
- const robotsTxtFile = await utils_1.RobotsTxtFile.find(url);
+ const robotsTxtFile = await RobotsTxtFile.find(url);
  this.robotsTxtFileCache.add(origin, robotsTxtFile);
  return robotsTxtFile;
  }
@@ -873,7 +698,7 @@ class BasicCrawler {
  * should be ignored and will be reclaimed to the queue once ready.
  */
  delayRequest(request, source) {
- const domain = (0, tldts_1.getDomain)(request.url);
+ const domain = getDomain(request.url);
  if (!domain || !request) {
  return false;
  }
@@ -883,7 +708,7 @@ class BasicCrawler {
  this.domainAccessedTime.set(domain, now);
  return false;
  }
- if (source instanceof core_1.RequestQueueV1) {
+ if (source instanceof RequestQueueV1) {
  // eslint-disable-next-line dot-notation
  source['inProgress']?.delete(request.id);
  }
@@ -891,7 +716,7 @@ class BasicCrawler {
  this.log.debug(`Request ${request.url} (${request.id}) will be reclaimed after ${delay} milliseconds due to same domain delay`);
  setTimeout(async () => {
  this.log.debug(`Adding request ${request.url} (${request.id}) back to the queue`);
- if (source instanceof core_1.RequestQueueV1) {
+ if (source instanceof RequestQueueV1) {
  // eslint-disable-next-line dot-notation
  source['inProgress'].add(request.id);
  }
@@ -910,19 +735,19 @@ class BasicCrawler {
  await this._timeoutAndRetry(async () => {
  request = await this._fetchNextRequest();
  }, this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
- (0, timeout_1.tryCancel)();
+ tryCancel();
  if (this.useSessionPool) {
  await this._timeoutAndRetry(async () => {
  session = await this.sessionPool.getSession();
  }, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
  }
- (0, timeout_1.tryCancel)();
+ tryCancel();
  if (!request || this.delayRequest(request, source)) {
  return;
  }
  if (!(await this.isAllowedBasedOnRobotsTxtFile(request.url))) {
  this.log.warning(`Skipping request ${request.url} (${request.id}) because it is disallowed based on robots.txt`);
- request.state = core_1.RequestState.SKIPPED;
+ request.state = RequestState.SKIPPED;
  request.noRetry = true;
  await source.markRequestHandled(request);
  await this.onSkippedRequest?.({
@@ -940,13 +765,13 @@ class BasicCrawler {
  // All missing properties (that extend CrawlingContext) are set dynamically,
  // but TS does not know that, so otherwise it would throw when compiling.
  const crawlingContext = {
- id: (0, utilities_1.cryptoRandomObjectId)(10),
+ id: cryptoRandomObjectId(10),
  crawler: this,
  log: this.log,
  request,
  session,
  enqueueLinks: async (options) => {
- return (0, core_1.enqueueLinks)({
+ return enqueueLinks({
  // specify the RQ first to allow overriding it
  requestQueue: await this.getRequestQueue(),
  robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
@@ -957,42 +782,42 @@ class BasicCrawler {
  addRequests: this.addRequests.bind(this),
  pushData: this.pushData.bind(this),
  useState: this.useState.bind(this),
- sendRequest: (0, send_request_1.createSendRequest)(this.httpClient, request, session, () => crawlingContext.proxyInfo?.url),
- getKeyValueStore: async (idOrName) => core_1.KeyValueStore.open(idOrName, { config: this.config }),
+ sendRequest: createSendRequest(this.httpClient, request, session, () => crawlingContext.proxyInfo?.url),
+ getKeyValueStore: async (idOrName) => KeyValueStore.open(idOrName, { config: this.config }),
  };
  this.crawlingContexts.set(crawlingContext.id, crawlingContext);
  let isRequestLocked = true;
  try {
- request.state = core_1.RequestState.REQUEST_HANDLER;
- await (0, timeout_1.addTimeoutToPromise)(async () => this._runRequestHandler(crawlingContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${request.id}).`);
+ request.state = RequestState.REQUEST_HANDLER;
+ await addTimeoutToPromise(async () => this._runRequestHandler(crawlingContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${request.id}).`);
  await this._timeoutAndRetry(async () => source.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
  isRequestLocked = false; // markRequestHandled succeeded and unlocked the request
  this.stats.finishJob(statisticsId, request.retryCount);
  this.handledRequestsCount++;
  // reclaim session if request finishes successfully
- request.state = core_1.RequestState.DONE;
+ request.state = RequestState.DONE;
  crawlingContext.session?.markGood();
  }
  catch (err) {
  try {
- request.state = core_1.RequestState.ERROR_HANDLER;
- await (0, timeout_1.addTimeoutToPromise)(async () => this._requestFunctionErrorHandler(err, crawlingContext, source), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
- if (!(err instanceof core_1.CriticalError)) {
+ request.state = RequestState.ERROR_HANDLER;
+ await addTimeoutToPromise(async () => this._requestFunctionErrorHandler(err, crawlingContext, source), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
+ if (!(err instanceof CriticalError)) {
  isRequestLocked = false; // _requestFunctionErrorHandler calls either markRequestHandled or reclaimRequest
  }
- request.state = core_1.RequestState.DONE;
+ request.state = RequestState.DONE;
  }
  catch (secondaryError) {
  if (!secondaryError.triggeredFromUserHandler &&
  // avoid reprinting the same critical error multiple times, as it will be printed by Nodejs at the end anyway
- !(secondaryError instanceof core_1.CriticalError)) {
+ !(secondaryError instanceof CriticalError)) {
  const apifySpecific = process.env.APIFY_IS_AT_HOME
  ? `This may have happened due to an internal error of Apify's API or due to a misconfigured crawler.`
  : '';
  this.log.exception(secondaryError, 'An exception occurred during handling of failed request. ' +
  `This places the crawler and its underlying storages into an unknown state and crawling will be terminated. ${apifySpecific}`);
  }
- request.state = core_1.RequestState.ERROR;
+ request.state = RequestState.ERROR;
  throw secondaryError;
  }
  // decrease the session score if the request fails (but the error handler did not throw)
@@ -1002,7 +827,7 @@ class BasicCrawler {
  await this._cleanupContext(crawlingContext);
  this.crawlingContexts.delete(crawlingContext.id);
  // Safety net - release the lock if nobody managed to do it before
- if (isRequestLocked && source instanceof core_1.RequestProvider) {
+ if (isRequestLocked && source instanceof RequestProvider) {
  try {
  await source.client.deleteRequestLock(request.id);
  }
@@ -1018,7 +843,7 @@ class BasicCrawler {
  */
  async _timeoutAndRetry(handler, timeout, error, maxRetries = 3, retried = 1) {
  try {
- await (0, timeout_1.addTimeoutToPromise)(handler, timeout, error);
+ await addTimeoutToPromise(handler, timeout, error);
  }
  catch (e) {
  if (retried <= maxRetries) {
@@ -1055,7 +880,7 @@ class BasicCrawler {
  }
  async _rotateSession(crawlingContext) {
  const { request } = crawlingContext;
- request.sessionRotationCount ?? (request.sessionRotationCount = 0);
+ request.sessionRotationCount ??= 0;
  request.sessionRotationCount++;
  crawlingContext.session?.retire();
  }
@@ -1065,14 +890,14 @@ class BasicCrawler {
  async _requestFunctionErrorHandler(error, crawlingContext, source) {
  const { request } = crawlingContext;
  request.pushErrorMessage(error);
- if (error instanceof core_1.CriticalError) {
+ if (error instanceof CriticalError) {
  throw error;
  }
  const shouldRetryRequest = this._canRequestBeRetried(request, error);
  if (shouldRetryRequest) {
  await this.stats.errorTrackerRetry.addAsync(error, crawlingContext);
- await this._tagUserHandlerError(() => this.errorHandler?.(this._augmentContextWithDeprecatedError(crawlingContext, error), error));
- if (error instanceof core_1.SessionError) {
+ await this.errorHandler?.(crawlingContext, error);
+ if (error instanceof SessionError) {
  await this._rotateSession(crawlingContext);
  }
  if (!request.noRetry) {
@@ -1123,7 +948,7 @@ class BasicCrawler {
  const message = this._getMessageFromError(error, true);
  this.log.error(`Request failed and reached maximum retries. ${message}`, { id, url, method, uniqueKey });
  if (this.failedRequestHandler) {
- await this._tagUserHandlerError(() => this.failedRequestHandler?.(this._augmentContextWithDeprecatedError(crawlingContext, error), error));
+ await this.failedRequestHandler?.(crawlingContext, error);
  }
  }
  /**
@@ -1138,7 +963,7 @@ class BasicCrawler {
  const stackLines = error?.stack ? error.stack.split('\n') : new Error().stack.split('\n').slice(2);
  const baseDir = process.cwd();
  const userLine = stackLines.find((line) => line.includes(baseDir) && !line.includes('node_modules'));
- if (error instanceof timeout_1.TimeoutError) {
+ if (error instanceof TimeoutError) {
  return process.env.CRAWLEE_VERBOSE_LOG ? error.stack : error.message || error; // stack in timeout errors does not really help
  }
  return process.env.CRAWLEE_VERBOSE_LOG || forceStack
@@ -1148,28 +973,18 @@ class BasicCrawler {
  _canRequestBeRetried(request, error) {
  // Request should never be retried, or the error encountered makes it not able to be retried, or the session rotation limit has been reached
  if (request.noRetry ||
- error instanceof core_1.NonRetryableError ||
- (error instanceof core_1.SessionError && this.maxSessionRotations <= (request.sessionRotationCount ?? 0))) {
+ error instanceof NonRetryableError ||
+ (error instanceof SessionError && this.maxSessionRotations <= (request.sessionRotationCount ?? 0))) {
  return false;
  }
  // User requested retry (we ignore retry count here as its explicitly told by the user to retry)
- if (error instanceof core_1.RetryRequestError) {
+ if (error instanceof RetryRequestError) {
  return true;
  }
  // Ensure there are more retries available for the request
  const maxRequestRetries = request.maxRetries ?? this.maxRequestRetries;
  return request.retryCount < maxRequestRetries;
  }
- _augmentContextWithDeprecatedError(context, error) {
- Object.defineProperty(context, 'error', {
- get: () => {
- this.log.deprecated("The 'error' property of the crawling context is deprecated, and it is now passed as the second parameter in 'errorHandler' and 'failedRequestHandler'. Please update your code, as this property will be removed in a future version.");
- return error;
- },
- configurable: true,
- });
- return context;
- }
  /**
  * Updates handledRequestsCount from possibly stored counts,
  * usually after worker migration. Since one of the stores
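
Two behavioral removals land around this hunk: `errorHandler` and `failedRequestHandler` are now awaited directly rather than wrapped in `_tagUserHandlerError`, and `_augmentContextWithDeprecatedError` is deleted, so the deprecated `context.error` getter (and its deprecation warning) no longer exists. The error now reaches user code only as the second handler argument; a hedged sketch of the v4 handler shape (the bodies are illustrative):

```js
const crawler = new BasicCrawler({
    requestHandler: async ({ request }) => {
        // process the request; throwing here routes the error to errorHandler
    },
    // v4: read the error from the second parameter, not from context.error
    errorHandler: async ({ request, log }, error) => {
        log.warning(`Retrying ${request.url}: ${error.message}`);
    },
    failedRequestHandler: async ({ request, log }, error) => {
        log.error(`Giving up on ${request.url}: ${error.message}`);
    },
});
```
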
@@ -1207,36 +1022,10 @@ class BasicCrawler {
  }
  await this.autoscaledPool?.abort();
  }
- _handlePropertyNameChange({ newProperty, newName, oldProperty, oldName, propertyKey, allowUndefined = false, }) {
- if (newProperty && oldProperty) {
- this.log.warning([
- `Both "${newName}" and "${oldName}" were provided in the crawler options.`,
- `"${oldName}" has been renamed to "${newName}", and will be removed in a future version.`,
- `As such, "${newName}" will be used instead.`,
- ].join('\n'));
- // @ts-expect-error Assigning to possibly readonly properties
- this[propertyKey] = newProperty;
- }
- else if (oldProperty) {
- this.log.warning([
- `"${oldName}" has been renamed to "${newName}", and will be removed in a future version.`,
- `The provided value will be used, but you should rename "${oldName}" to "${newName}" in your crawler options.`,
- ].join('\n'));
- // @ts-expect-error Assigning to possibly readonly properties
- this[propertyKey] = oldProperty;
- }
- else if (newProperty) {
- // @ts-expect-error Assigning to possibly readonly properties
- this[propertyKey] = newProperty;
- }
- else if (!allowUndefined) {
- throw new ow_1.ArgumentError(`"${newName}" must be provided in the crawler options`, this.constructor);
- }
- }
  _getCookieHeaderFromRequest(request) {
  if (request.headers?.Cookie && request.headers?.cookie) {
  this.log.warning(`Encountered mixed casing for the cookie headers for request ${request.url} (${request.id}). Their values will be merged.`);
- return (0, core_1.mergeCookies)(request.url, [request.headers.cookie, request.headers.Cookie]);
+ return mergeCookies(request.url, [request.headers.cookie, request.headers.Cookie]);
  }
  return request.headers?.Cookie || request.headers?.cookie || '';
  }
@@ -1247,9 +1036,9 @@ class BasicCrawler {
  this.log.info('Using the old RequestQueue implementation without request locking.');
  this._experimentWarnings.requestLocking = true;
  }
- return core_1.RequestQueueV1.open(null, { config: this.config });
+ return RequestQueueV1.open(null, { config: this.config });
  }
- return core_1.RequestQueue.open(null, { config: this.config });
+ return RequestQueue.open(null, { config: this.config });
  }
  requestMatchesEnqueueStrategy(request) {
  const { url, loadedUrl } = request;
@@ -1266,80 +1055,29 @@ class BasicCrawler {
  const baseUrl = new URL(url);
  const loadedBaseUrl = new URL(loadedUrl);
  switch (strategy) {
- case core_1.EnqueueStrategy.SameHostname: {
+ case EnqueueStrategy.SameHostname: {
  return baseUrl.hostname === loadedBaseUrl.hostname;
  }
- case core_1.EnqueueStrategy.SameDomain: {
- const baseUrlHostname = (0, tldts_1.getDomain)(baseUrl.hostname, { mixedInputs: false });
+ case EnqueueStrategy.SameDomain: {
+ const baseUrlHostname = getDomain(baseUrl.hostname, { mixedInputs: false });
  if (baseUrlHostname) {
- const loadedBaseUrlHostname = (0, tldts_1.getDomain)(loadedBaseUrl.hostname, { mixedInputs: false });
+ const loadedBaseUrlHostname = getDomain(loadedBaseUrl.hostname, { mixedInputs: false });
  return baseUrlHostname === loadedBaseUrlHostname;
  }
  // Can happen for IPs, we just check like same origin
  return baseUrl.origin === loadedBaseUrl.origin;
  }
- case core_1.EnqueueStrategy.SameOrigin: {
+ case EnqueueStrategy.SameOrigin: {
  // Same as hostname, but also checks protocol
  return baseUrl.origin === loadedBaseUrl.origin;
  }
- case core_1.EnqueueStrategy.All:
+ case EnqueueStrategy.All:
  default: {
  return baseUrl.protocol === 'http:' || baseUrl.protocol === 'https:';
  }
  }
  }
  }
- exports.BasicCrawler = BasicCrawler;
- Object.defineProperty(BasicCrawler, "CRAWLEE_STATE_KEY", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: 'CRAWLEE_STATE'
- });
- Object.defineProperty(BasicCrawler, "optionsShape", {
- enumerable: true,
- configurable: true,
- writable: true,
- value: {
- requestList: ow_1.default.optional.object.validate(core_1.validators.requestList),
- requestQueue: ow_1.default.optional.object.validate(core_1.validators.requestQueue),
- // Subclasses override this function instead of passing it
- // in constructor, so this validation needs to apply only
- // if the user creates an instance of BasicCrawler directly.
- requestHandler: ow_1.default.optional.function,
- // TODO: remove in a future release
- handleRequestFunction: ow_1.default.optional.function,
- requestHandlerTimeoutSecs: ow_1.default.optional.number,
- // TODO: remove in a future release
- handleRequestTimeoutSecs: ow_1.default.optional.number,
- errorHandler: ow_1.default.optional.function,
- failedRequestHandler: ow_1.default.optional.function,
- // TODO: remove in a future release
- handleFailedRequestFunction: ow_1.default.optional.function,
- maxRequestRetries: ow_1.default.optional.number,
- sameDomainDelaySecs: ow_1.default.optional.number,
- maxSessionRotations: ow_1.default.optional.number,
- maxRequestsPerCrawl: ow_1.default.optional.number,
- autoscaledPoolOptions: ow_1.default.optional.object,
- sessionPoolOptions: ow_1.default.optional.object,
- useSessionPool: ow_1.default.optional.boolean,
- statusMessageLoggingInterval: ow_1.default.optional.number,
- statusMessageCallback: ow_1.default.optional.function,
- retryOnBlocked: ow_1.default.optional.boolean,
- respectRobotsTxtFile: ow_1.default.optional.boolean,
- onSkippedRequest: ow_1.default.optional.function,
- httpClient: ow_1.default.optional.object,
- // AutoscaledPool shorthands
- minConcurrency: ow_1.default.optional.number,
- maxConcurrency: ow_1.default.optional.number,
- maxRequestsPerMinute: ow_1.default.optional.number.integerOrInfinite.positive.greaterThanOrEqual(1),
- keepAlive: ow_1.default.optional.boolean,
- // internal
- log: ow_1.default.optional.object,
- experiments: ow_1.default.optional.object,
- statisticsOptions: ow_1.default.optional.object,
- }
- });
  /**
  * Creates new {@link Router} instance that works based on request labels.
  * This instance can then serve as a {@link BasicCrawlerOptions.requestHandler|`requestHandler`} of our {@link BasicCrawler}.
@@ -1364,7 +1102,7 @@ Object.defineProperty(BasicCrawler, "optionsShape", {
  * await crawler.run();
  * ```
  */
- function createBasicRouter(routes) {
- return core_1.Router.create(routes);
+ export function createBasicRouter(routes) {
+ return Router.create(routes);
  }
  //# sourceMappingURL=basic-crawler.js.map
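
Taken together with the removal of `_handlePropertyNameChange` and the slimmed-down `optionsShape`, the long-deprecated option aliases (`handleRequestFunction`, `handleRequestTimeoutSecs`, `handleFailedRequestFunction`) are gone from 4.0.0-beta.0. Since options are validated with `ow.object.exactShape(...)`, passing an old name should now fail validation outright instead of logging a rename warning. A hedged before/after migration sketch:

```js
// v3: the old names were accepted, with a deprecation warning
// steering you toward the new ones.
const legacyOptions = {
    handleRequestFunction: async (ctx) => { /* ... */ },
    handleRequestTimeoutSecs: 30,
    handleFailedRequestFunction: async (ctx) => { /* ... */ },
};

// v4: only the new names are part of optionsShape; unknown keys
// are rejected by the exactShape validation.
const crawler = new BasicCrawler({
    requestHandler: async (ctx) => { /* ... */ },
    requestHandlerTimeoutSecs: 30,
    failedRequestHandler: async (ctx, error) => { /* ... */ },
});
```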