@crawlee/basic 4.0.0-beta.2 → 4.0.0-beta.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/index.d.ts +1 -1
- package/index.d.ts.map +1 -1
- package/index.js +0 -1
- package/index.js.map +1 -1
- package/internals/basic-crawler.d.ts +175 -78
- package/internals/basic-crawler.d.ts.map +1 -1
- package/internals/basic-crawler.js +359 -152
- package/internals/basic-crawler.js.map +1 -1
- package/internals/send-request.d.ts +3 -4
- package/internals/send-request.d.ts.map +1 -1
- package/internals/send-request.js +3 -18
- package/internals/send-request.js.map +1 -1
- package/package.json +5 -5
- package/internals/constants.d.ts +0 -7
- package/internals/constants.d.ts.map +0 -1
- package/internals/constants.js +0 -7
- package/internals/constants.js.map +0 -1
- package/tsconfig.build.tsbuildinfo +0 -1
package/internals/basic-crawler.js

@@ -1,7 +1,7 @@
 import { writeFile } from 'node:fs/promises';
 import { dirname } from 'node:path';
-import { AutoscaledPool, Configuration, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
-import { RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
+import { AutoscaledPool, Configuration, ContextPipeline, ContextPipelineCleanupError, ContextPipelineInitializationError, ContextPipelineInterruptedError, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestHandlerError, RequestListAdapter, RequestManagerTandem, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
+import { getObjectType, isAsyncIterable, isIterable, RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
 import { stringify } from 'csv-stringify/sync';
 import { ensureDir, writeJSON } from 'fs-extra/esm';
 import ow from 'ow';
@@ -88,6 +88,11 @@ const SAFE_MIGRATION_WAIT_MILLIS = 20000;
 export class BasicCrawler {
     config;
     static CRAWLEE_STATE_KEY = 'CRAWLEE_STATE';
+    /**
+     * Tracks crawler instances that accessed shared state without having an explicit id.
+     * Used to detect and warn about multiple crawlers sharing the same state.
+     */
+    static useStateCrawlerIds = new Set();
     /**
      * A reference to the underlying {@link Statistics} class that collects and logs run statistics for requests.
      */
@@ -103,6 +108,10 @@
      * Only available if used by the crawler.
      */
     requestQueue;
+    /**
+     * The main request-handling component of the crawler. It's initialized during the crawler startup.
+     */
+    requestManager;
     /**
      * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session|sessions}.
      * Only available if used by the crawler.
@@ -116,11 +125,24 @@
      * or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
      */
     autoscaledPool;
+    /**
+     * A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
+     * Only available if used by the crawler.
+     */
+    proxyConfiguration;
     /**
      * Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
      * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
      */
     router = Router.create();
+    contextPipelineBuilder;
+    _contextPipeline;
+    get contextPipeline() {
+        if (this._contextPipeline === undefined) {
+            this._contextPipeline = this.contextPipelineBuilder();
+        }
+        return this._contextPipeline;
+    }
     running = false;
     hasFinishedBefore = false;
     log;
@@ -130,15 +152,16 @@
     requestHandlerTimeoutMillis;
     internalTimeoutMillis;
     maxRequestRetries;
+    maxCrawlDepth;
     sameDomainDelayMillis;
     domainAccessedTime;
     maxSessionRotations;
-
+    maxRequestsPerCrawl;
+    handledRequestsCount = 0;
     statusMessageLoggingInterval;
     statusMessageCallback;
     sessionPoolOptions;
     useSessionPool;
-    crawlingContexts = new Map();
     autoscaledPoolOptions;
     events;
     httpClient;
@@ -146,10 +169,16 @@
     respectRobotsTxtFile;
     onSkippedRequest;
     _closeEvents;
+    shouldLogMaxProcessedRequestsExceeded = true;
+    shouldLogMaxEnqueuedRequestsExceeded = true;
     experiments;
     robotsTxtFileCache;
     _experimentWarnings = {};
+    crawlerId;
+    hasExplicitId;
     static optionsShape = {
+        contextPipelineBuilder: ow.optional.object,
+        extendContext: ow.optional.function,
         requestList: ow.optional.object.validate(validators.requestList),
         requestQueue: ow.optional.object.validate(validators.requestQueue),
         // Subclasses override this function instead of passing it
@@ -163,13 +192,15 @@
         sameDomainDelaySecs: ow.optional.number,
         maxSessionRotations: ow.optional.number,
         maxRequestsPerCrawl: ow.optional.number,
+        maxCrawlDepth: ow.optional.number,
         autoscaledPoolOptions: ow.optional.object,
         sessionPoolOptions: ow.optional.object,
         useSessionPool: ow.optional.boolean,
+        proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
         statusMessageLoggingInterval: ow.optional.number,
         statusMessageCallback: ow.optional.function,
         retryOnBlocked: ow.optional.boolean,
-        respectRobotsTxtFile: ow.optional.boolean,
+        respectRobotsTxtFile: ow.optional.any(ow.boolean, ow.object),
        onSkippedRequest: ow.optional.function,
        httpClient: ow.optional.object,
        // AutoscaledPool shorthands
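The options shape above now also validates `maxCrawlDepth`, a crawler-level `proxyConfiguration`, and an object form of `respectRobotsTxtFile`. A minimal sketch of passing the new options (option names taken from `optionsShape`; the handler body and proxy URL are illustrative only):

```ts
import { BasicCrawler } from '@crawlee/basic';
import { ProxyConfiguration } from '@crawlee/core';

const crawler = new BasicCrawler({
    maxRequestsPerCrawl: 100,
    // New in the beta: requests nested deeper than this are skipped (reason: 'depth').
    maxCrawlDepth: 3,
    // New in the beta: the proxy configuration can be passed to BasicCrawler directly.
    proxyConfiguration: new ProxyConfiguration({ proxyUrls: ['http://localhost:8000'] }),
    async requestHandler({ request, log }) {
        log.info(`Processing ${request.url}`);
    },
});
```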
@@ -181,21 +212,64 @@
         log: ow.optional.object,
         experiments: ow.optional.object,
         statisticsOptions: ow.optional.object,
+        id: ow.optional.string,
     };
     /**
      * All `BasicCrawler` parameters are passed via an options object.
      */
-    constructor(options = {},
+    constructor(options = {}, // cast because the constructor logic handles missing `contextPipelineBuilder` - the type is just for DX
+    config = Configuration.getGlobalConfig()) {
         this.config = config;
         ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape));
-        const { requestList, requestQueue, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true,
+        const { requestList, requestQueue, requestManager, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, maxCrawlDepth, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true, proxyConfiguration,
         // AutoscaledPool shorthands
         minConcurrency, maxConcurrency, maxRequestsPerMinute, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
         // internal
-        log = defaultLog.child({ prefix: this.constructor.name }), experiments = {}, } = options;
-
-        this.
+        log = defaultLog.child({ prefix: this.constructor.name }), experiments = {}, id, } = options;
+        // Store whether the user explicitly provided an ID
+        this.hasExplicitId = id !== undefined;
+        // Store the user-provided ID, or generate a unique one for tracking purposes (not for state key)
+        this.crawlerId = id ?? cryptoRandomObjectId();
+        // Store the builder so that it can be run when the contextPipeline is needed.
+        // Invoking it immediately would cause problems with parent constructor call order.
+        this.contextPipelineBuilder = () => {
+            let contextPipeline = (options.contextPipelineBuilder?.() ??
+                ContextPipeline.create()); // Thanks to the RequireContextPipeline, contextPipeline will only be undefined if InitialContextType is CrawlingContext
+            if (options.extendContext !== undefined) {
+                contextPipeline = contextPipeline.compose({
+                    action: async (context) => await options.extendContext(context),
+                });
+            }
+            contextPipeline = contextPipeline.compose({
+                action: async (context) => {
+                    const { request } = context;
+                    if (!this.requestMatchesEnqueueStrategy(request)) {
+                        // eslint-disable-next-line dot-notation
+                        const message = `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`;
+                        this.log.debug(message);
+                        request.noRetry = true;
+                        request.state = RequestState.SKIPPED;
+                        await this.handleSkippedRequest({ url: request.url, reason: 'redirect' });
+                        throw new ContextPipelineInterruptedError(message);
+                    }
+                    return context;
+                },
+            });
+            return contextPipeline;
+        };
+        if (requestManager !== undefined) {
+            if (requestList !== undefined || requestQueue !== undefined) {
+                throw new Error('The `requestManager` option cannot be used in conjunction with `requestList` and/or `requestQueue`');
+            }
+            this.requestManager = requestManager;
+            this.requestQueue = requestManager; // TODO(v4) - the cast is not fully legitimate here, but it's fine for internal usage by the BasicCrawler
+        }
+        else {
+            this.requestList = requestList;
+            this.requestQueue = requestQueue;
+        }
         this.httpClient = httpClient ?? new GotScrapingHttpClient();
+        this.proxyConfiguration = proxyConfiguration;
         this.log = log;
         this.statusMessageLoggingInterval = statusMessageLoggingInterval;
         this.statusMessageCallback = statusMessageCallback;
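As the constructor above shows, the beta introduces a `requestManager` option that replaces the `requestList`/`requestQueue` pair (passing both together throws), plus `extendContext` and `contextPipelineBuilder` hooks that get composed into the new context pipeline. A rough sketch of the `requestManager` option, assuming a standard `RequestQueue` is used as the manager:

```ts
import { BasicCrawler } from '@crawlee/basic';
import { RequestQueue } from '@crawlee/core';

const queue = await RequestQueue.open('my-queue');

const crawler = new BasicCrawler({
    // Either pass `requestManager`, or `requestList`/`requestQueue` - never both,
    // otherwise the constructor throws.
    requestManager: queue,
    async requestHandler({ request, log }) {
        log.info(`Handled ${request.url}`);
    },
});
```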
@@ -203,7 +277,7 @@
         this.domainAccessedTime = new Map();
         this.experiments = experiments;
         this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });
-
+        this.handleSkippedRequest = this.handleSkippedRequest.bind(this);
         this.requestHandler = requestHandler ?? this.router;
         this.failedRequestHandler = failedRequestHandler;
         this.errorHandler = errorHandler;
@@ -228,13 +302,14 @@
             this.requestQueue.requestLockSecs = Math.max(this.requestHandlerTimeoutMillis / 1000 + 5, 60);
         }
         this.maxRequestRetries = maxRequestRetries;
+        this.maxCrawlDepth = maxCrawlDepth;
         this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
         this.maxSessionRotations = maxSessionRotations;
-        this.handledRequestsCount = 0;
         this.stats = new Statistics({
             logMessage: `${log.getOptions().prefix} request statistics:`,
             log,
             config,
+            ...(this.hasExplicitId ? { id: this.crawlerId } : {}),
             ...statisticsOptions,
         });
         this.sessionPoolOptions = {
@@ -248,7 +323,6 @@
             }
         }
         this.useSessionPool = useSessionPool;
-        this.crawlingContexts = new Map();
         const maxSignedInteger = 2 ** 31 - 1;
         if (this.requestHandlerTimeoutMillis > maxSignedInteger) {
             log.warning(`requestHandlerTimeoutMillis ${this.requestHandlerTimeoutMillis}` +
@@ -256,8 +330,8 @@
             this.requestHandlerTimeoutMillis = maxSignedInteger;
         }
         this.internalTimeoutMillis = Math.min(this.internalTimeoutMillis, maxSignedInteger);
-
-        const isMaxPagesExceeded = () => maxRequestsPerCrawl && maxRequestsPerCrawl <= this.handledRequestsCount;
+        this.maxRequestsPerCrawl = maxRequestsPerCrawl;
+        const isMaxPagesExceeded = () => this.maxRequestsPerCrawl && this.maxRequestsPerCrawl <= this.handledRequestsCount;
         // eslint-disable-next-line prefer-const
         let { isFinishedFunction, isTaskReadyFunction } = autoscaledPoolOptions;
         // override even if `isFinishedFunction` provided by user - `keepAlive` has higher priority
@@ -271,10 +345,10 @@
             runTaskFunction: this._runTaskFunction.bind(this),
             isTaskReadyFunction: async () => {
                 if (isMaxPagesExceeded()) {
-                    if (
+                    if (this.shouldLogMaxProcessedRequestsExceeded) {
                         log.info('Crawler reached the maxRequestsPerCrawl limit of ' +
-                            `${maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
-
+                            `${this.maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
+                        this.shouldLogMaxProcessedRequestsExceeded = false;
                     }
                     return false;
                 }
@@ -282,7 +356,7 @@
             },
             isFinishedFunction: async () => {
                 if (isMaxPagesExceeded()) {
-                    log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${maxRequestsPerCrawl} requests ` +
+                    log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${this.maxRequestsPerCrawl} requests ` +
                         'and all requests that were in progress at that time have now finished. ' +
                         `In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`);
                     return true;
@@ -311,14 +385,6 @@
     isProxyError(error) {
         return ROTATE_PROXY_ERRORS.some((x) => this._getMessageFromError(error)?.includes(x));
     }
-    /**
-     * Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
-     * Returns `false` if the request is not blocked, otherwise returns a string with a description of the block reason.
-     * @param _crawlingContext The crawling context to check.
-     */
-    async isRequestBlocked(_crawlingContext) {
-        throw new Error('the "isRequestBlocked" method is not implemented in this crawler.');
-    }
     /**
      * This method is periodically called by the crawler, every `statusMessageLoggingInterval` seconds.
      */
@@ -350,7 +416,7 @@
             message = `Experiencing problems, ${this.stats.state.requestsFailed - previousState.requestsFailed || this.stats.state.requestsFailed} failed requests in the past ${this.statusMessageLoggingInterval} seconds.`;
         }
         else {
-            const total = this.
+            const total = this.requestManager?.getTotalCount();
             message = `Crawled ${this.stats.state.requestsFinished}${total ? `/${total}` : ''} pages, ${this.stats.state.requestsFailed} failed requests, desired concurrency ${this.autoscaledPool?.desiredConcurrency ?? 0}.`;
         }
         if (this.statusMessageCallback) {
@@ -390,20 +456,30 @@
             if (this.requestQueue?.name === 'default' && purgeRequestQueue) {
                 await this.requestQueue.drop();
                 this.requestQueue = await this._getRequestQueue();
+                this.requestManager = undefined;
+                await this.initializeRequestManager();
+                this.handledRequestsCount = 0; // This would've been reset by this._init() further down below, but at that point `handledRequestsCount` could prevent `addRequests` from adding the initial requests
             }
             this.stats.reset();
             await this.stats.resetStore();
             await this.sessionPool?.resetStore();
         }
         this.running = true;
-
+        this.shouldLogMaxProcessedRequestsExceeded = true;
+        this.shouldLogMaxEnqueuedRequestsExceeded = true;
+        await purgeDefaultStorages({
+            onlyPurgeOnce: true,
+            client: this.config.getStorageClient(),
+            config: this.config,
+        });
         if (requests) {
             await this.addRequests(requests, addRequestsOptions);
         }
         await this._init();
         await this.stats.startCapturing();
         const periodicLogger = this.getPeriodicLogger();
-
+        // Don't await, we don't want to block the execution
+        void this.setStatusMessage('Starting the crawler.', { level: 'INFO' });
         const sigintHandler = async () => {
             this.log.warning('Pausing... Press CTRL+C again to force exit. To resume, do: CRAWLEE_PURGE_ON_START=0 npm start');
             await this._pauseOnMigration();
@@ -452,7 +528,8 @@
             finished = true;
         }
         periodicLogger.stop();
-
+        // Don't await, we don't want to block the execution
+        void this.setStatusMessage(`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${this.stats.state.requestsFinished} succeeded, ${this.stats.state.requestsFailed} failed.`, { isStatusMessageTerminal: true, level: 'INFO' });
         this.running = false;
         this.hasFinishedBefore = true;
     }
@@ -462,6 +539,8 @@
      * Gracefully stops the current run of the crawler.
      *
      * All the tasks active at the time of calling this method will be allowed to finish.
+     *
+     * To stop the crawler immediately, use {@link BasicCrawler.teardown|`crawler.teardown()`} instead.
      */
     stop(message = 'The crawler has been gracefully stopped.') {
         // Gracefully starve the this.autoscaledPool, so it doesn't start new tasks. Resolves once the pool is cleared.
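The updated doc comments now cross-reference `stop()` and `teardown()`. An illustrative sketch of when to use which (the URLs and stop condition are made up):

```ts
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    async requestHandler({ request, log }) {
        log.info(`Processed ${request.url}`);
        if (request.url.endsWith('/last-page')) {
            // Graceful: requests that are already running are allowed to finish.
            crawler.stop('Found the last page, stopping.');
        }
    },
});

await crawler.run(['https://example.com/start']);

// Hard stop (e.g. from a shutdown hook): does not wait for in-flight requests.
// await crawler.teardown();
```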
@@ -478,13 +557,57 @@
         if (!this.requestQueue && this.requestList) {
             this.log.warningOnce('When using RequestList and RequestQueue at the same time, you should instantiate both explicitly and provide them in the crawler options, to ensure correctly handled restarts of the crawler.');
         }
-        this.requestQueue
+        if (!this.requestQueue) {
+            this.requestQueue = await this._getRequestQueue();
+            this.requestManager = undefined;
+        }
+        if (!this.requestManager) {
+            this.requestManager =
+                this.requestList === undefined
+                    ? this.requestQueue
+                    : new RequestManagerTandem(this.requestList, this.requestQueue);
+        }
         return this.requestQueue;
     }
     async useState(defaultValue = {}) {
         const kvs = await KeyValueStore.open(null, { config: this.config });
+        if (this.hasExplicitId) {
+            const stateKey = `${BasicCrawler.CRAWLEE_STATE_KEY}_${this.crawlerId}`;
+            return kvs.getAutoSavedValue(stateKey, defaultValue);
+        }
+        BasicCrawler.useStateCrawlerIds.add(this.crawlerId);
+        if (BasicCrawler.useStateCrawlerIds.size > 1) {
+            defaultLog.warningOnce('Multiple crawler instances are calling useState() without an explicit `id` option. \n' +
+                'This means they will share the same state object, which is likely unintended. \n' +
+                'To fix this, provide a unique `id` option to each crawler instance. \n' +
+                'Example: new BasicCrawler({ id: "my-crawler-1", ... })');
+        }
         return kvs.getAutoSavedValue(BasicCrawler.CRAWLEE_STATE_KEY, defaultValue);
     }
+    get pendingRequestCountApproximation() {
+        return this.requestManager?.getPendingCount() ?? 0;
+    }
+    calculateEnqueuedRequestLimit(explicitLimit) {
+        if (this.maxRequestsPerCrawl === undefined) {
+            return explicitLimit;
+        }
+        const limit = Math.max(0, this.maxRequestsPerCrawl - this.handledRequestsCount - this.pendingRequestCountApproximation);
+        return Math.min(limit, explicitLimit ?? Infinity);
+    }
+    async handleSkippedRequest(options) {
+        if (options.reason === 'limit' && this.shouldLogMaxEnqueuedRequestsExceeded) {
+            this.log.info('The number of requests enqueued by the crawler reached the maxRequestsPerCrawl limit of ' +
+                `${this.maxRequestsPerCrawl} requests and no further requests will be added.`);
+            this.shouldLogMaxEnqueuedRequestsExceeded = false;
+        }
+        if (options.reason === 'enqueueLimit') {
+            const enqueuedRequestLimit = this.calculateEnqueuedRequestLimit();
+            if (enqueuedRequestLimit === undefined || enqueuedRequestLimit !== 0) {
+                this.log.info('The number of requests enqueued by the crawler reached the enqueueLinks limit.');
+            }
+        }
+        await this.onSkippedRequest?.(options);
+    }
     /**
      * Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
      * adding the rest in background. You can configure the batch size via `batchSize` option and the sleep time in between
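The `useState()` change above keys the auto-saved state by the crawler's `id` when one is given, and warns when several crawlers without an `id` end up sharing the default `CRAWLEE_STATE` record. A sketch of how the new option isolates state (the ids are examples):

```ts
import { BasicCrawler } from '@crawlee/basic';

const productsCrawler = new BasicCrawler({
    id: 'products-crawler', // state is stored under CRAWLEE_STATE_products-crawler
    async requestHandler({ useState }) {
        const state = await useState({ processed: 0 });
        state.processed += 1;
    },
});

const categoriesCrawler = new BasicCrawler({
    id: 'categories-crawler', // separate auto-saved state record
    async requestHandler({ useState }) {
        const state = await useState({ processed: 0 });
        state.processed += 1;
    },
});
```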
@@ -497,33 +620,57 @@
      * @param options Options for the request queue
      */
     async addRequests(requests, options = {}) {
-
-
-
-
-        const
-        const
-
-
-
-
-
-
-
-
+        await this.getRequestQueue();
+        const requestLimit = this.calculateEnqueuedRequestLimit();
+        const skippedBecauseOfRobots = new Set();
+        const skippedBecauseOfLimit = new Set();
+        const skippedBecauseOfMaxCrawlDepth = new Set();
+        const isAllowedBasedOnRobotsTxtFile = this.isAllowedBasedOnRobotsTxtFile.bind(this);
+        const maxCrawlDepth = this.maxCrawlDepth;
+        ow(requests, ow.object
+            .is((value) => isIterable(value) || isAsyncIterable(value))
+            .message((value) => `Expected an iterable or async iterable, got ${getObjectType(value)}`));
+        async function* filteredRequests() {
+            let yieldedRequestCount = 0;
+            for await (const request of requests) {
+                const url = typeof request === 'string' ? request : request.url;
+                if (requestLimit !== undefined && yieldedRequestCount >= requestLimit) {
+                    skippedBecauseOfLimit.add(url);
+                    continue;
+                }
+                if (maxCrawlDepth !== undefined && request.crawlDepth > maxCrawlDepth) {
+                    skippedBecauseOfMaxCrawlDepth.add(url);
+                    continue;
+                }
+                if (await isAllowedBasedOnRobotsTxtFile(url)) {
+                    yield request;
+                    yieldedRequestCount += 1;
+                }
+                else {
+                    skippedBecauseOfRobots.add(url);
+                }
             }
         }
-
+        const result = await this.requestManager.addRequestsBatched(filteredRequests(), options);
+        if (skippedBecauseOfRobots.size > 0) {
             this.log.warning(`Some requests were skipped because they were disallowed based on the robots.txt file`, {
-                skipped: [...
+                skipped: [...skippedBecauseOfRobots],
             });
-            if (this.onSkippedRequest) {
-                await Promise.all([...skipped].map((url) => {
-                    return this.onSkippedRequest({ url, reason: 'robotsTxt' });
-                }));
-            }
         }
-
+        if (skippedBecauseOfRobots.size > 0 ||
+            skippedBecauseOfLimit.size > 0 ||
+            skippedBecauseOfMaxCrawlDepth.size > 0) {
+            await Promise.all([...skippedBecauseOfRobots]
+                .map((url) => {
+                return this.handleSkippedRequest({ url, reason: 'robotsTxt' });
+            })
+                .concat([...skippedBecauseOfLimit].map((url) => {
+                return this.handleSkippedRequest({ url, reason: 'limit' });
+            }), [...skippedBecauseOfMaxCrawlDepth].map((url) => {
+                return this.handleSkippedRequest({ url, reason: 'depth' });
+            })));
+        }
+        return result;
     }
     /**
      * Pushes data to the specified {@link Dataset}, or the default crawler {@link Dataset} by calling {@link Dataset.pushData}.
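As reconstructed above, `addRequests()` now accepts any iterable or async iterable of requests and streams it through the robots.txt, `maxRequestsPerCrawl`, and `maxCrawlDepth` filters before handing it to the request manager. A sketch of feeding it from an async generator (URLs are placeholders):

```ts
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    maxRequestsPerCrawl: 500,
    async requestHandler({ request, log }) {
        log.info(`Fetched ${request.url}`);
    },
});

// Requests are produced lazily; anything beyond the computed limit or disallowed
// by robots.txt is reported through the skipped-request handling instead of being enqueued.
async function* listingPages() {
    for (let page = 1; page <= 10_000; page++) {
        yield { url: `https://example.com/listing?page=${page}` };
    }
}

await crawler.addRequests(listingPages());
await crawler.run();
```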
@@ -563,7 +710,21 @@
         const dataset = await this.getDataset();
         const items = await dataset.export(options);
         if (format === 'csv') {
-
+            let value;
+            if (items.length === 0) {
+                value = '';
+            }
+            else {
+                const keys = options?.collectAllKeys
+                    ? Array.from(new Set(items.flatMap(Object.keys)))
+                    : Object.keys(items[0]);
+                value = stringify([
+                    keys,
+                    ...items.map((item) => {
+                        return keys.map((k) => item[k]);
+                    }),
+                ]);
+            }
             await ensureDir(dirname(path));
             await writeFile(path, value);
             this.log.info(`Export to ${path} finished!`);
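The CSV export above now builds its header either from the first item or, with `collectAllKeys`, from the union of keys across all items, and writes an empty file for an empty dataset. A standalone illustration of that header logic (the sample data is made up):

```ts
import { stringify } from 'csv-stringify/sync';

const items: Record<string, unknown>[] = [
    { url: 'https://a.example' },
    { url: 'https://b.example', title: 'Page B' },
];
const collectAllKeys = true;

// Union of keys across all items vs. keys of the first item only.
const keys = collectAllKeys
    ? Array.from(new Set(items.flatMap(Object.keys))) // ['url', 'title']
    : Object.keys(items[0]);                          // ['url']

const csv = stringify([keys, ...items.map((item) => keys.map((k) => item[k]))]);
console.log(csv);
```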
@@ -575,6 +736,9 @@
         }
         return items;
     }
+    /**
+     * Initializes the crawler.
+     */
     async _init() {
         if (!this.events.isInitialized()) {
             await this.events.init();
@@ -589,10 +753,13 @@
             // Assuming there are not more than 20 browsers running at once;
             this.sessionPool.setMaxListeners(20);
         }
+        await this.initializeRequestManager();
         await this._loadHandledRequestCount();
     }
-    async
-        await this.
+    async runRequestHandler(crawlingContext) {
+        await this.contextPipeline.call(crawlingContext, async (finalContext) => {
+            await addTimeoutToPromise(async () => this.requestHandler(finalContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${finalContext.request.id}).`);
+        });
     }
     /**
      * Handles blocked request
@@ -608,7 +775,8 @@
             return true;
         }
         const robotsTxtFile = await this.getRobotsTxtFileForUrl(url);
-
+        const userAgent = typeof this.respectRobotsTxtFile === 'object' ? this.respectRobotsTxtFile?.userAgent : '*';
+        return !robotsTxtFile || robotsTxtFile.isAllowed(url, userAgent);
     }
     async getRobotsTxtFileForUrl(url) {
         if (!this.respectRobotsTxtFile) {
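The check above shows that `respectRobotsTxtFile` can now carry a `userAgent` in its object form, which is used when evaluating robots.txt rules (the boolean form keeps matching against `*`). A sketch of the object form (the user-agent string is an example):

```ts
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    // robots.txt rules are evaluated for this agent instead of the default '*'.
    respectRobotsTxtFile: { userAgent: 'MyCrawler/1.0' },
    async requestHandler({ request, log }) {
        log.info(`Allowed by robots.txt: ${request.url}`);
    },
});
```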
@@ -662,36 +830,36 @@
         await Promise.all([requestListPersistPromise, this.stats.persistState()]);
     }
     /**
-     *
-     * and RequestQueue is present then enqueues it to the queue first.
+     * Initializes the RequestManager based on the configured requestList and requestQueue.
      */
-    async
-        if (
-            return
-        }
-        const request = await this.requestList.fetchNextRequest();
-        if (!this.requestQueue)
-            return request;
-        if (!request)
-            return this.requestQueue.fetchNextRequest();
-        try {
-            await this.requestQueue.addRequest(request, { forefront: true });
+    async initializeRequestManager() {
+        if (this.requestManager !== undefined) {
+            return;
         }
-
-        //
-
-            this.log.error('Adding of request from the RequestList to the RequestQueue failed, reclaiming request back to the list.', { request });
-            await this.requestList.reclaimRequest(request);
-            return null;
+        if (this.requestList && this.requestQueue) {
+            // Create a RequestManagerTandem if both RequestList and RequestQueue are provided
+            this.requestManager = new RequestManagerTandem(this.requestList, this.requestQueue);
         }
-
-
+        else if (this.requestQueue) {
+            // Use RequestQueue directly if only it is provided
+            this.requestManager = this.requestQueue;
+        }
+        else if (this.requestList) {
+            // Use RequestList directly if only it is provided
+            // Make it compatible with the IRequestManager interface
+            this.requestManager = new RequestListAdapter(this.requestList);
+        }
+        // If neither RequestList nor RequestQueue is provided, leave the requestManager uninitialized until `getRequestQueue` is called
     }
     /**
-     *
-     * Can be used to clean up orphaned browser pages.
+     * Fetches the next request to process from the underlying request provider.
      */
-    async
+    async _fetchNextRequest() {
+        if (this.requestManager === undefined) {
+            throw new Error(`_fetchNextRequest called on an uninitialized crawler`);
+        }
+        return this.requestManager.fetchNextRequest();
+    }
     /**
      * Delays processing of the request based on the `sameDomainDelaySecs` option,
      * adding it back to the queue after the timeout passes. Returns `true` if the request
@@ -729,18 +897,21 @@
      * then retries them in a case of an error, etc.
      */
     async _runTaskFunction() {
-        const source = this.
-
-
-        await this._timeoutAndRetry(
-            request = await this._fetchNextRequest();
-        }, this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
+        const source = this.requestManager;
+        if (!source)
+            throw new Error('Request provider is not initialized!');
+        const request = await this._timeoutAndRetry(this._fetchNextRequest.bind(this), this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
         tryCancel();
-
-        await this._timeoutAndRetry(async () => {
-
-
-
+        const session = this.useSessionPool
+            ? await this._timeoutAndRetry(async () => {
+                return await this.sessionPool.newSession({
+                    proxyInfo: await this.proxyConfiguration?.newProxyInfo({
+                        request: request ?? undefined,
+                    }),
+                    maxUsageCount: 1,
+                });
+            }, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`)
+            : undefined;
         tryCancel();
         if (!request || this.delayRequest(request, source)) {
             return;
@@ -750,7 +921,7 @@
             request.state = RequestState.SKIPPED;
             request.noRetry = true;
             await source.markRequestHandled(request);
-            await this.
+            await this.handleSkippedRequest({
                 url: request.url,
                 reason: 'robotsTxt',
             });
@@ -760,36 +931,34 @@
         request.loadedUrl = undefined;
         const statisticsId = request.id || request.uniqueKey;
         this.stats.startJob(statisticsId);
-
-        // @ts-expect-error
-        // All missing properties (that extend CrawlingContext) are set dynamically,
-        // but TS does not know that, so otherwise it would throw when compiling.
+        const deferredCleanup = [];
         const crawlingContext = {
             id: cryptoRandomObjectId(10),
-            crawler: this,
             log: this.log,
             request,
             session,
+            proxyInfo: session?.proxyInfo,
             enqueueLinks: async (options) => {
-
-
-
-
-
-
-
+                const requestQueue = await this.getRequestQueue();
+                return await this.enqueueLinksWithCrawlDepth(options, request, requestQueue);
+            },
+            addRequests: async (requests, options = {}) => {
+                const newCrawlDepth = request.crawlDepth + 1;
+                const requestsGenerator = this.addCrawlDepthRequestGenerator(requests, newCrawlDepth);
+                await this.addRequests(requestsGenerator, options);
             },
-            addRequests: this.addRequests.bind(this),
             pushData: this.pushData.bind(this),
             useState: this.useState.bind(this),
-            sendRequest: createSendRequest(this.httpClient, request, session
+            sendRequest: createSendRequest(this.httpClient, request, session),
             getKeyValueStore: async (idOrName) => KeyValueStore.open(idOrName, { config: this.config }),
+            registerDeferredCleanup: (cleanup) => {
+                deferredCleanup.push(cleanup);
+            },
         };
-        this.crawlingContexts.set(crawlingContext.id, crawlingContext);
         let isRequestLocked = true;
         try {
             request.state = RequestState.REQUEST_HANDLER;
-            await
+            await this.runRequestHandler(crawlingContext);
             await this._timeoutAndRetry(async () => source.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
             isRequestLocked = false; // markRequestHandled succeeded and unlocked the request
             this.stats.finishJob(statisticsId, request.retryCount);
@@ -798,7 +967,8 @@
             request.state = RequestState.DONE;
             crawlingContext.session?.markGood();
         }
-        catch (
+        catch (rawError) {
+            const err = this.unwrapError(rawError);
             try {
                 request.state = RequestState.ERROR_HANDLER;
                 await addTimeoutToPromise(async () => this._requestFunctionErrorHandler(err, crawlingContext, source), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
@@ -808,24 +978,24 @@
                 request.state = RequestState.DONE;
             }
             catch (secondaryError) {
-
+                const unwrappedSecondaryError = this.unwrapError(secondaryError);
+                if (!unwrappedSecondaryError.triggeredFromUserHandler &&
                     // avoid reprinting the same critical error multiple times, as it will be printed by Nodejs at the end anyway
-                    !(
+                    !(unwrappedSecondaryError instanceof CriticalError)) {
                     const apifySpecific = process.env.APIFY_IS_AT_HOME
                         ? `This may have happened due to an internal error of Apify's API or due to a misconfigured crawler.`
                        : '';
-                    this.log.exception(
+                    this.log.exception(unwrappedSecondaryError, 'An exception occurred during handling of failed request. ' +
                        `This places the crawler and its underlying storages into an unknown state and crawling will be terminated. ${apifySpecific}`);
                }
                request.state = RequestState.ERROR;
-                throw
+                throw unwrappedSecondaryError;
            }
            // decrease the session score if the request fails (but the error handler did not throw)
            crawlingContext.session?.markBad();
        }
        finally {
-            await
-            this.crawlingContexts.delete(crawlingContext.id);
+            await Promise.all(deferredCleanup.map((cleanup) => cleanup()));
            // Safety net - release the lock if nobody managed to do it before
            if (isRequestLocked && source instanceof RequestProvider) {
                try {
@@ -838,19 +1008,60 @@
         }
     }
     /**
-     *
+     * Wrapper around the crawling context's `enqueueLinks` method:
+     * - Injects `crawlDepth` to each request being added based on the crawling context request.
+     * - Provides defaults for the `enqueueLinks` options based on the crawler configuration.
+     * - These options can be overridden by the user.
+     * @internal
+     */
+    async enqueueLinksWithCrawlDepth(options, request, requestQueue) {
+        const transformRequestFunctionWrapper = (newRequest) => {
+            newRequest.crawlDepth = request.crawlDepth + 1;
+            if (this.maxCrawlDepth !== undefined && newRequest.crawlDepth > this.maxCrawlDepth) {
+                newRequest.skippedReason = 'depth';
+                return false;
+            }
+            // After injecting the crawlDepth, we call the user-provided transform function, if there is one.
+            return options.transformRequestFunction?.(newRequest) ?? newRequest;
+        };
+        return await enqueueLinks({
+            requestQueue,
+            robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
+            onSkippedRequest: this.handleSkippedRequest,
+            limit: this.calculateEnqueuedRequestLimit(options.limit),
+            // Allow user options to override defaults set above ⤴
+            ...options,
+            transformRequestFunction: transformRequestFunctionWrapper,
+        });
+    }
+    /**
+     * Generator function that yields requests injected with the given crawl depth.
+     * @internal
+     */
+    async *addCrawlDepthRequestGenerator(requests, newRequestDepth) {
+        for await (const request of requests) {
+            if (typeof request === 'string') {
+                yield { url: request, crawlDepth: newRequestDepth };
+            }
+            else {
+                request.crawlDepth ??= newRequestDepth;
+                yield request;
+            }
+        }
+    }
+    /**
+     * Run async callback with given timeout and retry. Returns the result of the callback.
      * @ignore
      */
     async _timeoutAndRetry(handler, timeout, error, maxRetries = 3, retried = 1) {
         try {
-            await addTimeoutToPromise(handler, timeout, error);
+            return await addTimeoutToPromise(handler, timeout, error);
         }
         catch (e) {
             if (retried <= maxRetries) {
                 // we retry on any error, not just timeout
                 this.log.warning(`${e.message} (retrying ${retried}/${maxRetries})`);
-
-                return;
+                return this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1);
             }
             throw e;
         }
@@ -859,24 +1070,13 @@
      * Returns true if either RequestList or RequestQueue have a request ready for processing.
      */
     async _isTaskReadyFunction() {
-
-        const isRequestListEmpty = this.requestList ? await this.requestList.isEmpty() : true;
-        // If RequestList is not empty, task is ready, no reason to check RequestQueue.
-        if (!isRequestListEmpty)
-            return true;
-        // If RequestQueue is not empty, task is ready, return true, otherwise false.
-        return this.requestQueue ? !(await this.requestQueue.isEmpty()) : false;
+        return this.requestManager !== undefined && !(await this.requestManager.isEmpty());
     }
     /**
      * Returns true if both RequestList and RequestQueue have all requests finished.
      */
     async _defaultIsFinishedFunction() {
-
-            this.requestList ? this.requestList.isFinished() : true,
-            this.requestQueue ? this.requestQueue.isFinished() : true,
-        ]);
-        // If both are finished, return true, otherwise return false.
-        return isRequestListFinished && isRequestQueueFinished;
+        return !this.requestManager || (await this.requestManager.isFinished());
     }
     async _rotateSession(crawlingContext) {
         const { request } = crawlingContext;
@@ -884,6 +1084,18 @@
         request.sessionRotationCount++;
         crawlingContext.session?.retire();
     }
+    /**
+     * Unwraps errors thrown by the context pipeline to get the actual user error.
+     * RequestHandlerError and ContextPipelineInitializationError wrap the actual error.
+     */
+    unwrapError(error) {
+        if (error instanceof RequestHandlerError ||
+            error instanceof ContextPipelineInitializationError ||
+            error instanceof ContextPipelineCleanupError) {
+            return this.unwrapError(error.cause);
+        }
+        return error;
+    }
     /**
      * Handles errors thrown by user provided requestHandler()
      */
@@ -896,7 +1108,8 @@
         const shouldRetryRequest = this._canRequestBeRetried(request, error);
         if (shouldRetryRequest) {
             await this.stats.errorTrackerRetry.addAsync(error, crawlingContext);
-            await this.errorHandler?.(crawlingContext,
+            await this.errorHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
+            error);
             if (error instanceof SessionError) {
                 await this._rotateSession(crawlingContext);
             }
@@ -948,7 +1161,8 @@
         const message = this._getMessageFromError(error, true);
         this.log.error(`Request failed and reached maximum retries. ${message}`, { id, url, method, uniqueKey });
         if (this.failedRequestHandler) {
-            await this.failedRequestHandler?.(crawlingContext,
+            await this.failedRequestHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
+            error);
         }
     }
     /**
@@ -986,19 +1200,11 @@
         return request.retryCount < maxRequestRetries;
     }
     /**
-     * Updates handledRequestsCount from possibly stored counts,
-     * usually after worker migration. Since one of the stores
-     * needs to have priority when both are present,
-     * it is the request queue, because generally, the request
-     * list will first be dumped into the queue and then left
-     * empty.
+     * Updates handledRequestsCount from possibly stored counts, usually after worker migration.
      */
     async _loadHandledRequestCount() {
-        if (this.
-            this.handledRequestsCount = await this.
-        }
-        else if (this.requestList) {
-            this.handledRequestsCount = this.requestList.handledCount();
+        if (this.requestManager) {
+            this.handledRequestsCount = await this.requestManager.handledCount();
         }
     }
     async _executeHooks(hooks, ...args) {
@@ -1009,14 +1215,15 @@
         }
     }
     /**
-     *
-     *
+     * Stops the crawler immediately.
+     *
+     * This method doesn't wait for currently active requests to finish.
+     *
+     * To stop the crawler gracefully (waiting for all running requests to finish), use {@link BasicCrawler.stop|`crawler.stop()`} instead.
     */
    async teardown() {
        this.events.emit("persistState" /* EventType.PERSIST_STATE */, { isMigrating: false });
-
-        await this.sessionPool.teardown();
-        }
+        await this.sessionPool?.teardown();
        if (this._closeEvents) {
            await this.events.close();
        }