@crawlee/basic 4.0.0-beta.4 → 4.0.0-beta.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -5
- package/index.d.ts +1 -1
- package/index.d.ts.map +1 -1
- package/index.js +0 -1
- package/index.js.map +1 -1
- package/internals/basic-crawler.d.ts +270 -102
- package/internals/basic-crawler.d.ts.map +1 -1
- package/internals/basic-crawler.js +666 -330
- package/internals/basic-crawler.js.map +1 -1
- package/internals/send-request.d.ts +3 -5
- package/internals/send-request.d.ts.map +1 -1
- package/internals/send-request.js +21 -25
- package/internals/send-request.js.map +1 -1
- package/package.json +6 -6
- package/internals/constants.d.ts +0 -7
- package/internals/constants.d.ts.map +0 -1
- package/internals/constants.js +0 -7
- package/internals/constants.js.map +0 -1
- package/tsconfig.build.tsbuildinfo +0 -1
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
import { writeFile } from 'node:fs/promises';
|
|
2
2
|
import { dirname } from 'node:path';
|
|
3
|
-
import { AutoscaledPool,
|
|
4
|
-
import {
|
|
3
|
+
import { AutoscaledPool, bindMethodsToServiceLocator, BLOCKED_STATUS_CODES, ContextPipeline, ContextPipelineCleanupError, ContextPipelineInitializationError, ContextPipelineInterruptedError, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, KeyValueStore, LogLevel, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestHandlerError, RequestListAdapter, RequestManagerTandem, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, ServiceLocator, serviceLocator, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
|
|
4
|
+
import { GotScrapingHttpClient } from '@crawlee/got-scraping-client';
|
|
5
|
+
import { getObjectType, isAsyncIterable, isIterable, RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
|
|
5
6
|
import { stringify } from 'csv-stringify/sync';
|
|
6
7
|
import { ensureDir, writeJSON } from 'fs-extra/esm';
|
|
7
8
|
import ow from 'ow';
|
|
8
9
|
import { getDomain } from 'tldts';
|
|
9
10
|
import { LruCache } from '@apify/datastructures';
|
|
10
|
-
import
|
|
11
|
-
import { addTimeoutToPromise, TimeoutError, tryCancel } from '@apify/timeout';
|
|
11
|
+
import { addTimeoutToPromise, TimeoutError } from '@apify/timeout';
|
|
12
12
|
import { cryptoRandomObjectId } from '@apify/utilities';
|
|
13
13
|
import { createSendRequest } from './send-request.js';
|
|
14
14
|
/**
|
|
@@ -21,6 +21,7 @@ import { createSendRequest } from './send-request.js';
|
|
|
21
21
|
* @ignore
|
|
22
22
|
*/
|
|
23
23
|
const SAFE_MIGRATION_WAIT_MILLIS = 20000;
|
|
24
|
+
const deferredCleanupKey = Symbol('deferredCleanup');
|
|
24
25
|
/**
|
|
25
26
|
* Provides a simple framework for parallel crawling of web pages.
|
|
26
27
|
* The URLs to crawl are fed either from a static list of URLs
|
|
@@ -86,8 +87,12 @@ const SAFE_MIGRATION_WAIT_MILLIS = 20000;
|
|
|
86
87
|
* @category Crawlers
|
|
87
88
|
*/
|
|
88
89
|
export class BasicCrawler {
|
|
89
|
-
config;
|
|
90
90
|
static CRAWLEE_STATE_KEY = 'CRAWLEE_STATE';
|
|
91
|
+
/**
|
|
92
|
+
* Tracks crawler instances that accessed shared state without having an explicit id.
|
|
93
|
+
* Used to detect and warn about multiple crawlers sharing the same state.
|
|
94
|
+
*/
|
|
95
|
+
static useStateCrawlerIds = new Set();
|
|
91
96
|
/**
|
|
92
97
|
* A reference to the underlying {@link Statistics} class that collects and logs run statistics for requests.
|
|
93
98
|
*/
|
|
@@ -103,9 +108,12 @@ export class BasicCrawler {
|
|
|
103
108
|
* Only available if used by the crawler.
|
|
104
109
|
*/
|
|
105
110
|
requestQueue;
|
|
111
|
+
/**
|
|
112
|
+
* The main request-handling component of the crawler. It's initialized during the crawler startup.
|
|
113
|
+
*/
|
|
114
|
+
requestManager;
|
|
106
115
|
/**
|
|
107
116
|
* A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session|sessions}.
|
|
108
|
-
* Only available if used by the crawler.
|
|
109
117
|
*/
|
|
110
118
|
sessionPool;
|
|
111
119
|
/**
|
|
@@ -116,40 +124,79 @@ export class BasicCrawler {
|
|
|
116
124
|
* or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
|
|
117
125
|
*/
|
|
118
126
|
autoscaledPool;
|
|
127
|
+
/**
|
|
128
|
+
* A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
|
|
129
|
+
* Only available if used by the crawler.
|
|
130
|
+
*/
|
|
131
|
+
proxyConfiguration;
|
|
119
132
|
/**
|
|
120
133
|
* Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
|
|
121
134
|
* See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
|
|
122
135
|
*/
|
|
123
136
|
router = Router.create();
|
|
137
|
+
_basicContextPipeline;
|
|
138
|
+
/**
|
|
139
|
+
* The basic part of the context pipeline. Unlike the subclass pipeline, this
|
|
140
|
+
* part has no major side effects (e.g. launching a browser). It also makes typing more explicit, as subclass
|
|
141
|
+
* pipelines expect the basic crawler fields to already be present in the context at runtime.
|
|
142
|
+
*
|
|
143
|
+
* Context built with this pipeline can be passed into multiple crawler pipelines at once.
|
|
144
|
+
* This is used e.g. in the {@link AdaptivePlaywrightCrawler|`AdaptivePlaywrightCrawler`}.
|
|
145
|
+
*/
|
|
146
|
+
get basicContextPipeline() {
|
|
147
|
+
if (this._basicContextPipeline === undefined) {
|
|
148
|
+
this._basicContextPipeline = this.buildBasicContextPipeline();
|
|
149
|
+
}
|
|
150
|
+
return this._basicContextPipeline;
|
|
151
|
+
}
|
|
152
|
+
_contextPipeline;
|
|
153
|
+
get contextPipeline() {
|
|
154
|
+
if (this._contextPipeline === undefined) {
|
|
155
|
+
this._contextPipeline = this.buildFinalContextPipeline();
|
|
156
|
+
}
|
|
157
|
+
return this._contextPipeline;
|
|
158
|
+
}
|
|
124
159
|
running = false;
|
|
125
160
|
hasFinishedBefore = false;
|
|
126
|
-
|
|
161
|
+
unexpectedStop = false;
|
|
162
|
+
#log;
|
|
163
|
+
get log() {
|
|
164
|
+
return this.#log;
|
|
165
|
+
}
|
|
127
166
|
requestHandler;
|
|
128
167
|
errorHandler;
|
|
129
168
|
failedRequestHandler;
|
|
130
169
|
requestHandlerTimeoutMillis;
|
|
131
170
|
internalTimeoutMillis;
|
|
132
171
|
maxRequestRetries;
|
|
172
|
+
maxCrawlDepth;
|
|
133
173
|
sameDomainDelayMillis;
|
|
134
174
|
domainAccessedTime;
|
|
135
175
|
maxSessionRotations;
|
|
136
|
-
|
|
176
|
+
maxRequestsPerCrawl;
|
|
177
|
+
handledRequestsCount = 0;
|
|
137
178
|
statusMessageLoggingInterval;
|
|
138
179
|
statusMessageCallback;
|
|
139
180
|
sessionPoolOptions;
|
|
140
|
-
|
|
141
|
-
|
|
181
|
+
blockedStatusCodes = new Set();
|
|
182
|
+
additionalHttpErrorStatusCodes;
|
|
183
|
+
ignoreHttpErrorStatusCodes;
|
|
142
184
|
autoscaledPoolOptions;
|
|
143
|
-
events;
|
|
144
185
|
httpClient;
|
|
145
186
|
retryOnBlocked;
|
|
146
187
|
respectRobotsTxtFile;
|
|
147
188
|
onSkippedRequest;
|
|
148
189
|
_closeEvents;
|
|
190
|
+
loggedPerRun = new Set();
|
|
149
191
|
experiments;
|
|
150
192
|
robotsTxtFileCache;
|
|
151
193
|
_experimentWarnings = {};
|
|
194
|
+
crawlerId;
|
|
195
|
+
hasExplicitId;
|
|
196
|
+
contextPipelineOptions;
|
|
152
197
|
static optionsShape = {
|
|
198
|
+
contextPipelineBuilder: ow.optional.object,
|
|
199
|
+
extendContext: ow.optional.function,
|
|
153
200
|
requestList: ow.optional.object.validate(validators.requestList),
|
|
154
201
|
requestQueue: ow.optional.object.validate(validators.requestQueue),
|
|
155
202
|
// Subclasses override this function instead of passing it
|
|
@@ -163,144 +210,342 @@ export class BasicCrawler {
|
|
|
163
210
|
sameDomainDelaySecs: ow.optional.number,
|
|
164
211
|
maxSessionRotations: ow.optional.number,
|
|
165
212
|
maxRequestsPerCrawl: ow.optional.number,
|
|
213
|
+
maxCrawlDepth: ow.optional.number,
|
|
166
214
|
autoscaledPoolOptions: ow.optional.object,
|
|
167
215
|
sessionPoolOptions: ow.optional.object,
|
|
168
|
-
|
|
216
|
+
proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
|
|
169
217
|
statusMessageLoggingInterval: ow.optional.number,
|
|
170
218
|
statusMessageCallback: ow.optional.function,
|
|
219
|
+
additionalHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
|
|
220
|
+
ignoreHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
|
|
221
|
+
blockedStatusCodes: ow.optional.array.ofType(ow.number),
|
|
171
222
|
retryOnBlocked: ow.optional.boolean,
|
|
172
|
-
respectRobotsTxtFile: ow.optional.boolean,
|
|
223
|
+
respectRobotsTxtFile: ow.optional.any(ow.boolean, ow.object),
|
|
173
224
|
onSkippedRequest: ow.optional.function,
|
|
174
225
|
httpClient: ow.optional.object,
|
|
226
|
+
configuration: ow.optional.object,
|
|
227
|
+
storageClient: ow.optional.object,
|
|
228
|
+
eventManager: ow.optional.object,
|
|
229
|
+
logger: ow.optional.object,
|
|
175
230
|
// AutoscaledPool shorthands
|
|
176
231
|
minConcurrency: ow.optional.number,
|
|
177
232
|
maxConcurrency: ow.optional.number,
|
|
178
233
|
maxRequestsPerMinute: ow.optional.number.integerOrInfinite.positive.greaterThanOrEqual(1),
|
|
179
234
|
keepAlive: ow.optional.boolean,
|
|
180
235
|
// internal
|
|
181
|
-
log: ow.optional.object,
|
|
182
236
|
experiments: ow.optional.object,
|
|
183
237
|
statisticsOptions: ow.optional.object,
|
|
238
|
+
id: ow.optional.string,
|
|
184
239
|
};
|
|
185
240
|
/**
|
|
186
241
|
* All `BasicCrawler` parameters are passed via an options object.
|
|
187
242
|
*/
|
|
188
|
-
constructor(options = {}
|
|
189
|
-
this.config = config;
|
|
243
|
+
constructor(options = {}) {
|
|
190
244
|
ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape));
|
|
191
|
-
const { requestList, requestQueue, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {},
|
|
245
|
+
const { requestList, requestQueue, requestManager, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, maxCrawlDepth, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, proxyConfiguration, additionalHttpErrorStatusCodes = [], ignoreHttpErrorStatusCodes = [],
|
|
246
|
+
// Service locator options
|
|
247
|
+
configuration, storageClient, eventManager, logger,
|
|
192
248
|
// AutoscaledPool shorthands
|
|
193
|
-
minConcurrency, maxConcurrency, maxRequestsPerMinute, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
|
|
249
|
+
minConcurrency, maxConcurrency, maxRequestsPerMinute, blockedStatusCodes: blockedStatusCodesInput, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
|
|
194
250
|
// internal
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
this.failedRequestHandler = failedRequestHandler;
|
|
209
|
-
this.errorHandler = errorHandler;
|
|
210
|
-
if (requestHandlerTimeoutSecs) {
|
|
211
|
-
this.requestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
|
|
251
|
+
experiments = {}, id, } = options;
|
|
252
|
+
// Create per-crawler service locator if custom services were provided.
|
|
253
|
+
// This wraps every method on the crawler instance so that calls to the global `serviceLocator`
|
|
254
|
+
// (via AsyncLocalStorage) resolve to this scoped instance instead.
|
|
255
|
+
// We also enter the scope for the rest of the constructor body, so that any code below
|
|
256
|
+
// that accesses `serviceLocator` will see the correct (scoped) instance.
|
|
257
|
+
let serviceLocatorScope = { enterScope: () => { }, exitScope: () => { } };
|
|
258
|
+
if (storageClient ||
|
|
259
|
+
eventManager ||
|
|
260
|
+
logger ||
|
|
261
|
+
(configuration !== undefined && configuration !== serviceLocator.getConfiguration())) {
|
|
262
|
+
const scopedServiceLocator = new ServiceLocator(configuration, eventManager, storageClient, logger);
|
|
263
|
+
serviceLocatorScope = bindMethodsToServiceLocator(scopedServiceLocator, this);
|
|
212
264
|
}
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
this.maxRequestRetries = maxRequestRetries;
|
|
231
|
-
this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
|
|
232
|
-
this.maxSessionRotations = maxSessionRotations;
|
|
233
|
-
this.handledRequestsCount = 0;
|
|
234
|
-
this.stats = new Statistics({
|
|
235
|
-
logMessage: `${log.getOptions().prefix} request statistics:`,
|
|
236
|
-
log,
|
|
237
|
-
config,
|
|
238
|
-
...statisticsOptions,
|
|
239
|
-
});
|
|
240
|
-
this.sessionPoolOptions = {
|
|
241
|
-
...sessionPoolOptions,
|
|
242
|
-
log,
|
|
243
|
-
};
|
|
244
|
-
if (this.retryOnBlocked) {
|
|
245
|
-
this.sessionPoolOptions.blockedStatusCodes = sessionPoolOptions.blockedStatusCodes ?? [];
|
|
246
|
-
if (this.sessionPoolOptions.blockedStatusCodes.length !== 0) {
|
|
247
|
-
log.warning(`Both 'blockedStatusCodes' and 'retryOnBlocked' are set. Please note that the 'retryOnBlocked' feature might not work as expected.`);
|
|
265
|
+
try {
|
|
266
|
+
serviceLocatorScope.enterScope();
|
|
267
|
+
this.contextPipelineOptions = {
|
|
268
|
+
contextPipelineBuilder: options.contextPipelineBuilder,
|
|
269
|
+
extendContext: options.extendContext,
|
|
270
|
+
};
|
|
271
|
+
this.#log = serviceLocator.getLogger().child({ prefix: this.constructor.name });
|
|
272
|
+
// Store whether the user explicitly provided an ID
|
|
273
|
+
this.hasExplicitId = id !== undefined;
|
|
274
|
+
// Store the user-provided ID, or generate a unique one for tracking purposes (not for state key)
|
|
275
|
+
this.crawlerId = id ?? cryptoRandomObjectId();
|
|
276
|
+
if (requestManager !== undefined) {
|
|
277
|
+
if (requestList !== undefined || requestQueue !== undefined) {
|
|
278
|
+
throw new Error('The `requestManager` option cannot be used in conjunction with `requestList` and/or `requestQueue`');
|
|
279
|
+
}
|
|
280
|
+
this.requestManager = requestManager;
|
|
281
|
+
this.requestQueue = requestManager; // TODO(v4) - the cast is not fully legitimate here, but it's fine for internal usage by the BasicCrawler
|
|
248
282
|
}
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
this.
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
283
|
+
else {
|
|
284
|
+
this.requestList = requestList;
|
|
285
|
+
this.requestQueue = requestQueue;
|
|
286
|
+
}
|
|
287
|
+
this.httpClient = httpClient ?? new GotScrapingHttpClient();
|
|
288
|
+
this.proxyConfiguration = proxyConfiguration;
|
|
289
|
+
this.statusMessageLoggingInterval = statusMessageLoggingInterval;
|
|
290
|
+
this.statusMessageCallback = statusMessageCallback;
|
|
291
|
+
this.domainAccessedTime = new Map();
|
|
292
|
+
this.experiments = experiments;
|
|
293
|
+
this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });
|
|
294
|
+
this.handleSkippedRequest = this.handleSkippedRequest.bind(this);
|
|
295
|
+
this.additionalHttpErrorStatusCodes = new Set([...additionalHttpErrorStatusCodes]);
|
|
296
|
+
this.ignoreHttpErrorStatusCodes = new Set([...ignoreHttpErrorStatusCodes]);
|
|
297
|
+
this.requestHandler = requestHandler ?? this.router;
|
|
298
|
+
this.failedRequestHandler = failedRequestHandler;
|
|
299
|
+
this.errorHandler = errorHandler;
|
|
300
|
+
if (requestHandlerTimeoutSecs) {
|
|
301
|
+
this.requestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
|
|
302
|
+
}
|
|
303
|
+
else {
|
|
304
|
+
this.requestHandlerTimeoutMillis = 60_000;
|
|
305
|
+
}
|
|
306
|
+
this.retryOnBlocked = retryOnBlocked;
|
|
307
|
+
this.respectRobotsTxtFile = respectRobotsTxtFile;
|
|
308
|
+
this.onSkippedRequest = onSkippedRequest;
|
|
309
|
+
const tryEnv = (val) => (val == null ? null : +val);
|
|
310
|
+
// allow at least 5min for internal timeouts
|
|
311
|
+
this.internalTimeoutMillis =
|
|
312
|
+
tryEnv(process.env.CRAWLEE_INTERNAL_TIMEOUT) ?? Math.max(this.requestHandlerTimeoutMillis * 2, 300e3);
|
|
313
|
+
// override the default internal timeout of request queue to respect `requestHandlerTimeoutMillis`
|
|
314
|
+
if (this.requestQueue) {
|
|
315
|
+
this.requestQueue.internalTimeoutMillis = this.internalTimeoutMillis;
|
|
316
|
+
// for request queue v2, we want to lock requests for slightly longer than the request handler timeout so that there is some padding for locking-related overhead,
|
|
317
|
+
// but never for less than a minute
|
|
318
|
+
this.requestQueue.requestLockSecs = Math.max(this.requestHandlerTimeoutMillis / 1000 + 5, 60);
|
|
319
|
+
}
|
|
320
|
+
this.maxRequestRetries = maxRequestRetries;
|
|
321
|
+
this.maxCrawlDepth = maxCrawlDepth;
|
|
322
|
+
this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
|
|
323
|
+
this.maxSessionRotations = maxSessionRotations;
|
|
324
|
+
this.stats = new Statistics({
|
|
325
|
+
logMessage: `${this.constructor.name} request statistics:`,
|
|
326
|
+
log: this.log,
|
|
327
|
+
...(this.hasExplicitId ? { id: this.crawlerId } : {}),
|
|
328
|
+
...statisticsOptions,
|
|
329
|
+
});
|
|
330
|
+
this.sessionPoolOptions = {
|
|
331
|
+
...sessionPoolOptions,
|
|
332
|
+
log: this.log,
|
|
333
|
+
};
|
|
334
|
+
this.sessionPool = new SessionPool(this.sessionPoolOptions);
|
|
335
|
+
this.blockedStatusCodes = new Set(blockedStatusCodesInput ?? BLOCKED_STATUS_CODES);
|
|
336
|
+
const maxSignedInteger = 2 ** 31 - 1;
|
|
337
|
+
if (this.requestHandlerTimeoutMillis > maxSignedInteger) {
|
|
338
|
+
this.log.warning(`requestHandlerTimeoutMillis ${this.requestHandlerTimeoutMillis}` +
|
|
339
|
+
` does not fit a signed 32-bit integer. Limiting the value to ${maxSignedInteger}`);
|
|
340
|
+
this.requestHandlerTimeoutMillis = maxSignedInteger;
|
|
341
|
+
}
|
|
342
|
+
this.internalTimeoutMillis = Math.min(this.internalTimeoutMillis, maxSignedInteger);
|
|
343
|
+
this.maxRequestsPerCrawl = maxRequestsPerCrawl;
|
|
344
|
+
const isMaxPagesExceeded = () => this.maxRequestsPerCrawl && this.maxRequestsPerCrawl <= this.handledRequestsCount;
|
|
345
|
+
// eslint-disable-next-line prefer-const
|
|
346
|
+
let { isFinishedFunction, isTaskReadyFunction } = autoscaledPoolOptions;
|
|
347
|
+
// override even if `isFinishedFunction` provided by user - `keepAlive` has higher priority
|
|
348
|
+
if (keepAlive) {
|
|
349
|
+
isFinishedFunction = async () => false;
|
|
350
|
+
}
|
|
351
|
+
const basicCrawlerAutoscaledPoolConfiguration = {
|
|
352
|
+
minConcurrency: minConcurrency ?? autoscaledPoolOptions?.minConcurrency,
|
|
353
|
+
maxConcurrency: maxConcurrency ?? autoscaledPoolOptions?.maxConcurrency,
|
|
354
|
+
maxTasksPerMinute: maxRequestsPerMinute ?? autoscaledPoolOptions?.maxTasksPerMinute,
|
|
355
|
+
runTaskFunction: async () => {
|
|
356
|
+
const source = this.requestManager;
|
|
357
|
+
if (!source)
|
|
358
|
+
throw new Error('Request provider is not initialized!');
|
|
359
|
+
const request = await this.resolveRequest();
|
|
360
|
+
if (!request || this.delayRequest(request, source)) {
|
|
361
|
+
return;
|
|
278
362
|
}
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
363
|
+
const crawlingContext = { request };
|
|
364
|
+
try {
|
|
365
|
+
await this.basicContextPipeline
|
|
366
|
+
.chain(this.contextPipeline)
|
|
367
|
+
.call(crawlingContext, (ctx) => this.handleRequest(ctx, source));
|
|
368
|
+
}
|
|
369
|
+
catch (error) {
|
|
370
|
+
// ContextPipelineInterruptedError means the request was intentionally skipped
|
|
371
|
+
// (e.g., doesn't match enqueue strategy after redirect). Just return gracefully.
|
|
372
|
+
if (error instanceof ContextPipelineInterruptedError) {
|
|
373
|
+
await this._timeoutAndRetry(async () => this.requestManager?.markRequestHandled(crawlingContext.request), this.internalTimeoutMillis, `Marking request ${crawlingContext.request.url} (${crawlingContext.request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
|
|
374
|
+
return;
|
|
375
|
+
}
|
|
376
|
+
// If the error happened during pipeline initialization (e.g., navigation timeout, session/proxy error,
|
|
377
|
+
// i.e. not in user's requestHandler), handle it through the normal error flow.
|
|
378
|
+
const isPipelineError = error instanceof ContextPipelineInitializationError || error instanceof SessionError;
|
|
379
|
+
if (isPipelineError) {
|
|
380
|
+
const unwrappedError = this.unwrapError(error);
|
|
381
|
+
await this._requestFunctionErrorHandler(unwrappedError, crawlingContext, this.requestManager);
|
|
382
|
+
crawlingContext.session?.markBad();
|
|
383
|
+
return;
|
|
384
|
+
}
|
|
385
|
+
throw this.unwrapError(error);
|
|
386
|
+
}
|
|
387
|
+
},
|
|
388
|
+
isTaskReadyFunction: async () => {
|
|
389
|
+
if (isMaxPagesExceeded()) {
|
|
390
|
+
this.logOncePerRun('shuttingDown', 'Crawler reached the maxRequestsPerCrawl limit of ' +
|
|
391
|
+
`${this.maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
|
|
392
|
+
return false;
|
|
393
|
+
}
|
|
394
|
+
if (this.unexpectedStop) {
|
|
395
|
+
this.logOncePerRun('shuttingDown', 'No new requests are allowed because the `stop()` method has been called. ' +
|
|
396
|
+
'Ongoing requests will be allowed to complete.');
|
|
397
|
+
return false;
|
|
398
|
+
}
|
|
399
|
+
return isTaskReadyFunction ? await isTaskReadyFunction() : await this._isTaskReadyFunction();
|
|
400
|
+
},
|
|
401
|
+
isFinishedFunction: async () => {
|
|
402
|
+
if (isMaxPagesExceeded()) {
|
|
403
|
+
this.log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${this.maxRequestsPerCrawl} requests ` +
|
|
404
|
+
'and all requests that were in progress at that time have now finished. ' +
|
|
405
|
+
`In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`);
|
|
406
|
+
return true;
|
|
407
|
+
}
|
|
408
|
+
if (this.unexpectedStop) {
|
|
409
|
+
this.log.info('The crawler has finished all the remaining ongoing requests and will shut down now.');
|
|
410
|
+
return true;
|
|
411
|
+
}
|
|
412
|
+
const isFinished = isFinishedFunction
|
|
413
|
+
? await isFinishedFunction()
|
|
414
|
+
: await this._defaultIsFinishedFunction();
|
|
415
|
+
if (isFinished) {
|
|
416
|
+
const reason = isFinishedFunction
|
|
417
|
+
? "Crawler's custom isFinishedFunction() returned true, the crawler will shut down."
|
|
418
|
+
: 'All requests from the queue have been processed, the crawler will shut down.';
|
|
419
|
+
this.log.info(reason);
|
|
420
|
+
}
|
|
421
|
+
return isFinished;
|
|
422
|
+
},
|
|
423
|
+
log: this.log,
|
|
424
|
+
};
|
|
425
|
+
this.autoscaledPoolOptions = { ...autoscaledPoolOptions, ...basicCrawlerAutoscaledPoolConfiguration };
|
|
426
|
+
}
|
|
427
|
+
finally {
|
|
428
|
+
serviceLocatorScope.exitScope();
|
|
429
|
+
}
|
|
430
|
+
}
|
|
431
|
+
/**
|
|
432
|
+
* Determines if the given HTTP status code is an error status code given
|
|
433
|
+
* the default behaviour and user-set preferences.
|
|
434
|
+
* @param status
|
|
435
|
+
* @returns `true` if the status code is considered an error, `false` otherwise
|
|
436
|
+
*/
|
|
437
|
+
isErrorStatusCode(status) {
|
|
438
|
+
const excludeError = this.ignoreHttpErrorStatusCodes.has(status);
|
|
439
|
+
const includeError = this.additionalHttpErrorStatusCodes.has(status);
|
|
440
|
+
return (status >= 500 && !excludeError) || includeError;
|
|
441
|
+
}
|
|
442
|
+
/**
|
|
443
|
+
* Builds the basic context pipeline that transforms `{ request }` into a full `CrawlingContext`.
|
|
444
|
+
* This handles base context creation, session resolution, and context helpers.
|
|
445
|
+
*/
|
|
446
|
+
buildBasicContextPipeline() {
|
|
447
|
+
return ContextPipeline.create()
|
|
448
|
+
.compose({ action: this.checkRobotsTxt.bind(this) })
|
|
449
|
+
.compose({
|
|
450
|
+
action: () => this.createBaseContext(),
|
|
451
|
+
cleanup: async (context) => {
|
|
452
|
+
await Promise.all(context[deferredCleanupKey].map((fn) => fn()));
|
|
282
453
|
},
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
454
|
+
})
|
|
455
|
+
.compose({ action: this.resolveSession.bind(this) })
|
|
456
|
+
.compose({ action: this.createContextHelpers.bind(this) });
|
|
457
|
+
}
|
|
458
|
+
async checkRobotsTxt({ request }) {
|
|
459
|
+
if (!(await this.isAllowedBasedOnRobotsTxtFile(request.url))) {
|
|
460
|
+
this.log.warning(`Skipping request ${request.url} (${request.id}) because it is disallowed based on robots.txt`);
|
|
461
|
+
request.state = RequestState.SKIPPED;
|
|
462
|
+
request.noRetry = true;
|
|
463
|
+
await this.handleSkippedRequest({
|
|
464
|
+
url: request.url,
|
|
465
|
+
reason: 'robotsTxt',
|
|
466
|
+
});
|
|
467
|
+
throw new ContextPipelineInterruptedError(`Skipping request ${request.url} as disallowed by robots.txt`);
|
|
468
|
+
}
|
|
469
|
+
return {};
|
|
470
|
+
}
|
|
471
|
+
/**
|
|
472
|
+
* Builds the subclass-specific context pipeline that transforms a `CrawlingContext` into the crawler's target context type.
|
|
473
|
+
* Subclasses should override this to add their own pipeline stages.
|
|
474
|
+
*/
|
|
475
|
+
buildContextPipeline() {
|
|
476
|
+
return ContextPipeline.create();
|
|
477
|
+
}
|
|
478
|
+
createBaseContext() {
|
|
479
|
+
const deferredCleanup = [];
|
|
480
|
+
return {
|
|
481
|
+
id: cryptoRandomObjectId(10),
|
|
482
|
+
log: this.log,
|
|
483
|
+
pushData: this.pushData.bind(this),
|
|
484
|
+
useState: this.useState.bind(this),
|
|
485
|
+
getKeyValueStore: async (idOrName) => KeyValueStore.open(idOrName),
|
|
486
|
+
registerDeferredCleanup: (cleanup) => {
|
|
487
|
+
deferredCleanup.push(cleanup);
|
|
300
488
|
},
|
|
301
|
-
|
|
489
|
+
[deferredCleanupKey]: deferredCleanup,
|
|
302
490
|
};
|
|
303
|
-
|
|
491
|
+
}
|
|
492
|
+
async resolveRequest() {
|
|
493
|
+
const request = await this._timeoutAndRetry(this._fetchNextRequest.bind(this), this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
|
|
494
|
+
// Reset loadedUrl so an old one is not carried over to retries.
|
|
495
|
+
if (request) {
|
|
496
|
+
request.loadedUrl = undefined;
|
|
497
|
+
}
|
|
498
|
+
return request;
|
|
499
|
+
}
|
|
500
|
+
async resolveSession({ request }) {
|
|
501
|
+
const session = await this._timeoutAndRetry(async () => {
|
|
502
|
+
return await this.sessionPool.newSession({
|
|
503
|
+
proxyInfo: await this.proxyConfiguration?.newProxyInfo({
|
|
504
|
+
request: request ?? undefined,
|
|
505
|
+
}),
|
|
506
|
+
maxUsageCount: 1,
|
|
507
|
+
});
|
|
508
|
+
}, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
|
|
509
|
+
return { session, proxyInfo: session.proxyInfo };
|
|
510
|
+
}
|
|
511
|
+
async createContextHelpers({ request, session }) {
|
|
512
|
+
const enqueueLinksWrapper = async (options) => {
|
|
513
|
+
const requestQueue = await this.getRequestQueue();
|
|
514
|
+
return await this.enqueueLinksWithCrawlDepth(options, request, requestQueue);
|
|
515
|
+
};
|
|
516
|
+
const addRequests = async (requests, options = {}) => {
|
|
517
|
+
const newCrawlDepth = request.crawlDepth + 1;
|
|
518
|
+
const requestsGenerator = this.addCrawlDepthRequestGenerator(requests, newCrawlDepth);
|
|
519
|
+
await this.addRequests(requestsGenerator, options);
|
|
520
|
+
};
|
|
521
|
+
const sendRequest = createSendRequest(this.httpClient, request, session);
|
|
522
|
+
return { enqueueLinks: enqueueLinksWrapper, addRequests, sendRequest };
|
|
523
|
+
}
|
|
524
|
+
buildFinalContextPipeline() {
|
|
525
|
+
let contextPipeline = (this.contextPipelineOptions.contextPipelineBuilder?.() ??
|
|
526
|
+
this.buildContextPipeline());
|
|
527
|
+
const { extendContext } = this.contextPipelineOptions;
|
|
528
|
+
if (extendContext !== undefined) {
|
|
529
|
+
contextPipeline = contextPipeline.compose({
|
|
530
|
+
action: async (context) => await extendContext(context),
|
|
531
|
+
});
|
|
532
|
+
}
|
|
533
|
+
contextPipeline = contextPipeline.compose({
|
|
534
|
+
action: async (context) => {
|
|
535
|
+
const { request } = context;
|
|
536
|
+
if (request && !this.requestMatchesEnqueueStrategy(request)) {
|
|
537
|
+
// eslint-disable-next-line dot-notation
|
|
538
|
+
const message = `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`;
|
|
539
|
+
this.log.debug(message);
|
|
540
|
+
request.noRetry = true;
|
|
541
|
+
request.state = RequestState.SKIPPED;
|
|
542
|
+
await this.handleSkippedRequest({ url: request.url, reason: 'redirect' });
|
|
543
|
+
throw new ContextPipelineInterruptedError(message);
|
|
544
|
+
}
|
|
545
|
+
return context;
|
|
546
|
+
},
|
|
547
|
+
});
|
|
548
|
+
return contextPipeline;
|
|
304
549
|
}
|
|
305
550
|
/**
|
|
306
551
|
* Checks if the given error is a proxy error by comparing its message to a list of known proxy error messages.
|
|
@@ -311,21 +556,13 @@ export class BasicCrawler {
|
|
|
311
556
|
isProxyError(error) {
|
|
312
557
|
return ROTATE_PROXY_ERRORS.some((x) => this._getMessageFromError(error)?.includes(x));
|
|
313
558
|
}
|
|
314
|
-
/**
|
|
315
|
-
* Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
|
|
316
|
-
* Returns `false` if the request is not blocked, otherwise returns a string with a description of the block reason.
|
|
317
|
-
* @param _crawlingContext The crawling context to check.
|
|
318
|
-
*/
|
|
319
|
-
async isRequestBlocked(_crawlingContext) {
|
|
320
|
-
throw new Error('the "isRequestBlocked" method is not implemented in this crawler.');
|
|
321
|
-
}
|
|
322
559
|
/**
|
|
323
560
|
* This method is periodically called by the crawler, every `statusMessageLoggingInterval` seconds.
|
|
324
561
|
*/
|
|
325
562
|
async setStatusMessage(message, options = {}) {
|
|
326
563
|
const data = options.isStatusMessageTerminal != null ? { terminal: options.isStatusMessageTerminal } : undefined;
|
|
327
|
-
this.log.
|
|
328
|
-
const client =
|
|
564
|
+
this.log.logWithLevel(LogLevel[options.level ?? 'DEBUG'], message, data);
|
|
565
|
+
const client = serviceLocator.getStorageClient();
|
|
329
566
|
if (!client.setStatusMessage) {
|
|
330
567
|
return;
|
|
331
568
|
}
|
|
@@ -350,7 +587,7 @@ export class BasicCrawler {
|
|
|
350
587
|
message = `Experiencing problems, ${this.stats.state.requestsFailed - previousState.requestsFailed || this.stats.state.requestsFailed} failed requests in the past ${this.statusMessageLoggingInterval} seconds.`;
|
|
351
588
|
}
|
|
352
589
|
else {
|
|
353
|
-
const total = this.
|
|
590
|
+
const total = this.requestManager?.getTotalCount();
|
|
354
591
|
message = `Crawled ${this.stats.state.requestsFinished}${total ? `/${total}` : ''} pages, ${this.stats.state.requestsFailed} failed requests, desired concurrency ${this.autoscaledPool?.desiredConcurrency ?? 0}.`;
|
|
355
592
|
}
|
|
356
593
|
if (this.statusMessageCallback) {
|
|
@@ -390,20 +627,30 @@ export class BasicCrawler {
|
|
|
390
627
|
if (this.requestQueue?.name === 'default' && purgeRequestQueue) {
|
|
391
628
|
await this.requestQueue.drop();
|
|
392
629
|
this.requestQueue = await this._getRequestQueue();
|
|
630
|
+
this.requestManager = undefined;
|
|
631
|
+
await this.initializeRequestManager();
|
|
632
|
+
this.handledRequestsCount = 0; // This would've been reset by this._init() further down below, but at that point `handledRequestsCount` could prevent `addRequests` from adding the initial requests
|
|
393
633
|
}
|
|
394
634
|
this.stats.reset();
|
|
395
635
|
await this.stats.resetStore();
|
|
396
636
|
await this.sessionPool?.resetStore();
|
|
397
637
|
}
|
|
638
|
+
this.unexpectedStop = false;
|
|
398
639
|
this.running = true;
|
|
399
|
-
|
|
640
|
+
this.loggedPerRun.clear();
|
|
641
|
+
await purgeDefaultStorages({
|
|
642
|
+
onlyPurgeOnce: true,
|
|
643
|
+
client: serviceLocator.getStorageClient(),
|
|
644
|
+
config: serviceLocator.getConfiguration(),
|
|
645
|
+
});
|
|
400
646
|
if (requests) {
|
|
401
647
|
await this.addRequests(requests, addRequestsOptions);
|
|
402
648
|
}
|
|
403
649
|
await this._init();
|
|
404
650
|
await this.stats.startCapturing();
|
|
405
651
|
const periodicLogger = this.getPeriodicLogger();
|
|
406
|
-
|
|
652
|
+
// Don't await, we don't want to block the execution
|
|
653
|
+
void this.setStatusMessage('Starting the crawler.', { level: 'INFO' });
|
|
407
654
|
const sigintHandler = async () => {
|
|
408
655
|
this.log.warning('Pausing... Press CTRL+C again to force exit. To resume, do: CRAWLEE_PURGE_ON_START=0 npm start');
|
|
409
656
|
await this._pauseOnMigration();
|
|
@@ -412,8 +659,9 @@ export class BasicCrawler {
|
|
|
412
659
|
// Attach a listener to handle migration and aborting events gracefully.
|
|
413
660
|
const boundPauseOnMigration = this._pauseOnMigration.bind(this);
|
|
414
661
|
process.once('SIGINT', sigintHandler);
|
|
415
|
-
|
|
416
|
-
|
|
662
|
+
const eventManager = serviceLocator.getEventManager();
|
|
663
|
+
eventManager.on("migrating" /* EventType.MIGRATING */, boundPauseOnMigration);
|
|
664
|
+
eventManager.on("aborting" /* EventType.ABORTING */, boundPauseOnMigration);
|
|
417
665
|
let stats = {};
|
|
418
666
|
try {
|
|
419
667
|
await this.autoscaledPool.run();
|
|
@@ -422,8 +670,8 @@ export class BasicCrawler {
|
|
|
422
670
|
await this.teardown();
|
|
423
671
|
await this.stats.stopCapturing();
|
|
424
672
|
process.off('SIGINT', sigintHandler);
|
|
425
|
-
|
|
426
|
-
|
|
673
|
+
eventManager.off("migrating" /* EventType.MIGRATING */, boundPauseOnMigration);
|
|
674
|
+
eventManager.off("aborting" /* EventType.ABORTING */, boundPauseOnMigration);
|
|
427
675
|
const finalStats = this.stats.calculate();
|
|
428
676
|
stats = {
|
|
429
677
|
requestsFinished: this.stats.state.requestsFinished,
|
|
@@ -440,7 +688,7 @@ export class BasicCrawler {
|
|
|
440
688
|
mostCommonErrors: this.stats.errorTracker.getMostPopularErrors(3).map(prettify),
|
|
441
689
|
});
|
|
442
690
|
}
|
|
443
|
-
const client =
|
|
691
|
+
const client = serviceLocator.getStorageClient();
|
|
444
692
|
if (client.teardown) {
|
|
445
693
|
let finished = false;
|
|
446
694
|
setTimeout(() => {
|
|
@@ -452,7 +700,8 @@ export class BasicCrawler {
|
|
|
452
700
|
finished = true;
|
|
453
701
|
}
|
|
454
702
|
periodicLogger.stop();
|
|
455
|
-
|
|
703
|
+
// Don't await, we don't want to block the execution
|
|
704
|
+
void this.setStatusMessage(`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${this.stats.state.requestsFinished} succeeded, ${this.stats.state.requestsFailed} failed.`, { isStatusMessageTerminal: true, level: 'INFO' });
|
|
456
705
|
this.running = false;
|
|
457
706
|
this.hasFinishedBefore = true;
|
|
458
707
|
}
|
|
@@ -462,29 +711,75 @@ export class BasicCrawler {
|
|
|
462
711
|
* Gracefully stops the current run of the crawler.
|
|
463
712
|
*
|
|
464
713
|
* All the tasks active at the time of calling this method will be allowed to finish.
|
|
714
|
+
*
|
|
715
|
+
* To stop the crawler immediately, use {@link BasicCrawler.teardown|`crawler.teardown()`} instead.
|
|
465
716
|
*/
|
|
466
|
-
stop(
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
.then(() => this.log.info(message))
|
|
473
|
-
.catch((err) => {
|
|
474
|
-
this.log.error('An error occurred when stopping the crawler:', err);
|
|
475
|
-
});
|
|
717
|
+
stop(reason = 'The crawler has been gracefully stopped.') {
|
|
718
|
+
if (this.unexpectedStop) {
|
|
719
|
+
return;
|
|
720
|
+
}
|
|
721
|
+
this.log.info(reason);
|
|
722
|
+
this.unexpectedStop = true;
|
|
476
723
|
}
|
|
477
724
|
async getRequestQueue() {
|
|
478
725
|
if (!this.requestQueue && this.requestList) {
|
|
479
726
|
this.log.warningOnce('When using RequestList and RequestQueue at the same time, you should instantiate both explicitly and provide them in the crawler options, to ensure correctly handled restarts of the crawler.');
|
|
480
727
|
}
|
|
481
|
-
this.requestQueue
|
|
728
|
+
if (!this.requestQueue) {
|
|
729
|
+
this.requestQueue = await this._getRequestQueue();
|
|
730
|
+
this.requestManager = undefined;
|
|
731
|
+
}
|
|
732
|
+
if (!this.requestManager) {
|
|
733
|
+
this.requestManager =
|
|
734
|
+
this.requestList === undefined
|
|
735
|
+
? this.requestQueue
|
|
736
|
+
: new RequestManagerTandem(this.requestList, this.requestQueue);
|
|
737
|
+
}
|
|
482
738
|
return this.requestQueue;
|
|
483
739
|
}
|
|
484
740
|
async useState(defaultValue = {}) {
|
|
485
|
-
const kvs = await KeyValueStore.open(null, { config:
|
|
741
|
+
const kvs = await KeyValueStore.open(null, { config: serviceLocator.getConfiguration() });
|
|
742
|
+
if (this.hasExplicitId) {
|
|
743
|
+
const stateKey = `${BasicCrawler.CRAWLEE_STATE_KEY}_${this.crawlerId}`;
|
|
744
|
+
return kvs.getAutoSavedValue(stateKey, defaultValue);
|
|
745
|
+
}
|
|
746
|
+
BasicCrawler.useStateCrawlerIds.add(this.crawlerId);
|
|
747
|
+
if (BasicCrawler.useStateCrawlerIds.size > 1) {
|
|
748
|
+
serviceLocator
|
|
749
|
+
.getLogger()
|
|
750
|
+
.warningOnce('Multiple crawler instances are calling useState() without an explicit `id` option. \n' +
|
|
751
|
+
'This means they will share the same state object, which is likely unintended. \n' +
|
|
752
|
+
'To fix this, provide a unique `id` option to each crawler instance. \n' +
|
|
753
|
+
'Example: new BasicCrawler({ id: "my-crawler-1", ... })');
|
|
754
|
+
}
|
|
486
755
|
return kvs.getAutoSavedValue(BasicCrawler.CRAWLEE_STATE_KEY, defaultValue);
|
|
487
756
|
}
|
|
757
|
+
get pendingRequestCountApproximation() {
|
|
758
|
+
return this.requestManager?.getPendingCount() ?? 0;
|
|
759
|
+
}
|
|
760
|
+
calculateEnqueuedRequestLimit(explicitLimit) {
|
|
761
|
+
if (this.maxRequestsPerCrawl === undefined) {
|
|
762
|
+
return explicitLimit;
|
|
763
|
+
}
|
|
764
|
+
const limit = Math.max(0, this.maxRequestsPerCrawl - this.handledRequestsCount - this.pendingRequestCountApproximation);
|
|
765
|
+
return Math.min(limit, explicitLimit ?? Infinity);
|
|
766
|
+
}
|
|
767
|
+
async handleSkippedRequest(options) {
|
|
768
|
+
if (options.reason === 'limit') {
|
|
769
|
+
this.logOncePerRun('maxRequestsPerCrawl', 'The number of requests enqueued by the crawler reached the maxRequestsPerCrawl limit of ' +
|
|
770
|
+
`${this.maxRequestsPerCrawl} requests and no further requests will be added.`);
|
|
771
|
+
}
|
|
772
|
+
if (options.reason === 'depth') {
|
|
773
|
+
this.logOncePerRun('maxCrawlDepth', `The crawler reached the maxCrawlDepth limit of ${this.maxCrawlDepth} and no further requests will be enqueued.`);
|
|
774
|
+
}
|
|
775
|
+
await this.onSkippedRequest?.(options);
|
|
776
|
+
}
|
|
777
|
+
logOncePerRun(key, message) {
|
|
778
|
+
if (!this.loggedPerRun.has(key)) {
|
|
779
|
+
this.log.info(message);
|
|
780
|
+
this.loggedPerRun.add(key);
|
|
781
|
+
}
|
|
782
|
+
}
|
|
488
783
|
/**
|
|
489
784
|
* Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
|
|
490
785
|
* adding the rest in background. You can configure the batch size via `batchSize` option and the sleep time in between
|
|
@@ -497,33 +792,57 @@ export class BasicCrawler {
|
|
|
497
792
|
* @param options Options for the request queue
|
|
498
793
|
*/
|
|
499
794
|
async addRequests(requests, options = {}) {
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
const
|
|
505
|
-
const
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
795
|
+
await this.getRequestQueue();
|
|
796
|
+
const requestLimit = this.calculateEnqueuedRequestLimit();
|
|
797
|
+
const skippedBecauseOfRobots = new Set();
|
|
798
|
+
const skippedBecauseOfLimit = new Set();
|
|
799
|
+
const skippedBecauseOfMaxCrawlDepth = new Set();
|
|
800
|
+
const isAllowedBasedOnRobotsTxtFile = this.isAllowedBasedOnRobotsTxtFile.bind(this);
|
|
801
|
+
const maxCrawlDepth = this.maxCrawlDepth;
|
|
802
|
+
ow(requests, ow.object
|
|
803
|
+
.is((value) => isIterable(value) || isAsyncIterable(value))
|
|
804
|
+
.message((value) => `Expected an iterable or async iterable, got ${getObjectType(value)}`));
|
|
805
|
+
async function* filteredRequests() {
|
|
806
|
+
let yieldedRequestCount = 0;
|
|
807
|
+
for await (const request of requests) {
|
|
808
|
+
const url = typeof request === 'string' ? request : request.url;
|
|
809
|
+
if (requestLimit !== undefined && yieldedRequestCount >= requestLimit) {
|
|
810
|
+
skippedBecauseOfLimit.add(url);
|
|
811
|
+
continue;
|
|
812
|
+
}
|
|
813
|
+
if (maxCrawlDepth !== undefined && request.crawlDepth > maxCrawlDepth) {
|
|
814
|
+
skippedBecauseOfMaxCrawlDepth.add(url);
|
|
815
|
+
continue;
|
|
816
|
+
}
|
|
817
|
+
if (await isAllowedBasedOnRobotsTxtFile(url)) {
|
|
818
|
+
yield request;
|
|
819
|
+
yieldedRequestCount += 1;
|
|
820
|
+
}
|
|
821
|
+
else {
|
|
822
|
+
skippedBecauseOfRobots.add(url);
|
|
823
|
+
}
|
|
514
824
|
}
|
|
515
825
|
}
|
|
516
|
-
|
|
826
|
+
const result = await this.requestManager.addRequestsBatched(filteredRequests(), options);
|
|
827
|
+
if (skippedBecauseOfRobots.size > 0) {
|
|
517
828
|
this.log.warning(`Some requests were skipped because they were disallowed based on the robots.txt file`, {
|
|
518
|
-
skipped: [...
|
|
829
|
+
skipped: [...skippedBecauseOfRobots],
|
|
519
830
|
});
|
|
520
|
-
if (this.onSkippedRequest) {
|
|
521
|
-
await Promise.all([...skipped].map((url) => {
|
|
522
|
-
return this.onSkippedRequest({ url, reason: 'robotsTxt' });
|
|
523
|
-
}));
|
|
524
|
-
}
|
|
525
831
|
}
|
|
526
|
-
|
|
832
|
+
if (skippedBecauseOfRobots.size > 0 ||
|
|
833
|
+
skippedBecauseOfLimit.size > 0 ||
|
|
834
|
+
skippedBecauseOfMaxCrawlDepth.size > 0) {
|
|
835
|
+
await Promise.all([...skippedBecauseOfRobots]
|
|
836
|
+
.map((url) => {
|
|
837
|
+
return this.handleSkippedRequest({ url, reason: 'robotsTxt' });
|
|
838
|
+
})
|
|
839
|
+
.concat([...skippedBecauseOfLimit].map((url) => {
|
|
840
|
+
return this.handleSkippedRequest({ url, reason: 'limit' });
|
|
841
|
+
}), [...skippedBecauseOfMaxCrawlDepth].map((url) => {
|
|
842
|
+
return this.handleSkippedRequest({ url, reason: 'depth' });
|
|
843
|
+
})));
|
|
844
|
+
}
|
|
845
|
+
return result;
|
|
527
846
|
}
|
|
528
847
|
/**
|
|
529
848
|
* Pushes data to the specified {@link Dataset}, or the default crawler {@link Dataset} by calling {@link Dataset.pushData}.
|
|
@@ -536,7 +855,7 @@ export class BasicCrawler {
|
|
|
536
855
|
* Retrieves the specified {@link Dataset}, or the default crawler {@link Dataset}.
|
|
537
856
|
*/
|
|
538
857
|
async getDataset(idOrName) {
|
|
539
|
-
return Dataset.open(idOrName, { config:
|
|
858
|
+
return Dataset.open(idOrName, { config: serviceLocator.getConfiguration() });
|
|
540
859
|
}
|
|
541
860
|
/**
|
|
542
861
|
* Retrieves data from the default crawler {@link Dataset} by calling {@link Dataset.getData}.
|
|
@@ -563,7 +882,21 @@ export class BasicCrawler {
|
|
|
563
882
|
const dataset = await this.getDataset();
|
|
564
883
|
const items = await dataset.export(options);
|
|
565
884
|
if (format === 'csv') {
|
|
566
|
-
|
|
885
|
+
let value;
|
|
886
|
+
if (items.length === 0) {
|
|
887
|
+
value = '';
|
|
888
|
+
}
|
|
889
|
+
else {
|
|
890
|
+
const keys = options?.collectAllKeys
|
|
891
|
+
? Array.from(new Set(items.flatMap(Object.keys)))
|
|
892
|
+
: Object.keys(items[0]);
|
|
893
|
+
value = stringify([
|
|
894
|
+
keys,
|
|
895
|
+
...items.map((item) => {
|
|
896
|
+
return keys.map((k) => item[k]);
|
|
897
|
+
}),
|
|
898
|
+
]);
|
|
899
|
+
}
|
|
567
900
|
await ensureDir(dirname(path));
|
|
568
901
|
await writeFile(path, value);
|
|
569
902
|
this.log.info(`Export to ${path} finished!`);
|
|
@@ -575,32 +908,34 @@ export class BasicCrawler {
|
|
|
575
908
|
}
|
|
576
909
|
return items;
|
|
577
910
|
}
|
|
911
|
+
/**
|
|
912
|
+
* Initializes the crawler.
|
|
913
|
+
*/
|
|
578
914
|
async _init() {
|
|
579
|
-
|
|
580
|
-
|
|
915
|
+
const eventManager = serviceLocator.getEventManager();
|
|
916
|
+
if (!eventManager.isInitialized()) {
|
|
917
|
+
await eventManager.init();
|
|
581
918
|
this._closeEvents = true;
|
|
582
919
|
}
|
|
583
920
|
// Initialize AutoscaledPool before awaiting _loadHandledRequestCount(),
|
|
584
921
|
// so that the caller can get a reference to it before awaiting the promise returned from run()
|
|
585
922
|
// (otherwise there would be no way)
|
|
586
|
-
this.autoscaledPool = new AutoscaledPool(this.autoscaledPoolOptions
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
// Assuming there are not more than 20 browsers running at once;
|
|
590
|
-
this.sessionPool.setMaxListeners(20);
|
|
591
|
-
}
|
|
923
|
+
this.autoscaledPool = new AutoscaledPool(this.autoscaledPoolOptions);
|
|
924
|
+
this.sessionPool.setMaxListeners(20);
|
|
925
|
+
await this.initializeRequestManager();
|
|
592
926
|
await this._loadHandledRequestCount();
|
|
593
927
|
}
|
|
594
|
-
async
|
|
595
|
-
await this.requestHandler(crawlingContext);
|
|
928
|
+
async runRequestHandler(crawlingContext) {
|
|
929
|
+
await addTimeoutToPromise(async () => this.requestHandler(crawlingContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${crawlingContext.request.id}).`);
|
|
596
930
|
}
|
|
597
931
|
/**
|
|
598
932
|
* Handles blocked request
|
|
599
933
|
*/
|
|
600
|
-
_throwOnBlockedRequest(
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
934
|
+
_throwOnBlockedRequest(statusCode) {
|
|
935
|
+
if (this.retryOnBlocked)
|
|
936
|
+
return;
|
|
937
|
+
if (this.blockedStatusCodes.has(statusCode)) {
|
|
938
|
+
throw new SessionError(`Request blocked - received ${statusCode} status code.`);
|
|
604
939
|
}
|
|
605
940
|
}
|
|
606
941
|
async isAllowedBasedOnRobotsTxtFile(url) {
|
|
@@ -608,7 +943,8 @@ export class BasicCrawler {
|
|
|
608
943
|
return true;
|
|
609
944
|
}
|
|
610
945
|
const robotsTxtFile = await this.getRobotsTxtFileForUrl(url);
|
|
611
|
-
|
|
946
|
+
const userAgent = typeof this.respectRobotsTxtFile === 'object' ? this.respectRobotsTxtFile?.userAgent : '*';
|
|
947
|
+
return !robotsTxtFile || robotsTxtFile.isAllowed(url, userAgent);
|
|
612
948
|
}
|
|
613
949
|
async getRobotsTxtFileForUrl(url) {
|
|
614
950
|
if (!this.respectRobotsTxtFile) {
|
|
@@ -662,36 +998,36 @@ export class BasicCrawler {
|
|
|
662
998
|
await Promise.all([requestListPersistPromise, this.stats.persistState()]);
|
|
663
999
|
}
|
|
664
1000
|
/**
|
|
665
|
-
*
|
|
666
|
-
* and RequestQueue is present then enqueues it to the queue first.
|
|
1001
|
+
* Initializes the RequestManager based on the configured requestList and requestQueue.
|
|
667
1002
|
*/
|
|
668
|
-
async
|
|
669
|
-
if (
|
|
670
|
-
return
|
|
671
|
-
}
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
1003
|
+
async initializeRequestManager() {
|
|
1004
|
+
if (this.requestManager !== undefined) {
|
|
1005
|
+
return;
|
|
1006
|
+
}
|
|
1007
|
+
if (this.requestList && this.requestQueue) {
|
|
1008
|
+
// Create a RequestManagerTandem if both RequestList and RequestQueue are provided
|
|
1009
|
+
this.requestManager = new RequestManagerTandem(this.requestList, this.requestQueue);
|
|
1010
|
+
}
|
|
1011
|
+
else if (this.requestQueue) {
|
|
1012
|
+
// Use RequestQueue directly if only it is provided
|
|
1013
|
+
this.requestManager = this.requestQueue;
|
|
679
1014
|
}
|
|
680
|
-
|
|
681
|
-
//
|
|
682
|
-
//
|
|
683
|
-
this.
|
|
684
|
-
await this.requestList.reclaimRequest(request);
|
|
685
|
-
return null;
|
|
1015
|
+
else if (this.requestList) {
|
|
1016
|
+
// Use RequestList directly if only it is provided
|
|
1017
|
+
// Make it compatible with the IRequestManager interface
|
|
1018
|
+
this.requestManager = new RequestListAdapter(this.requestList);
|
|
686
1019
|
}
|
|
687
|
-
|
|
688
|
-
return this.requestQueue.fetchNextRequest();
|
|
1020
|
+
// If neither RequestList nor RequestQueue is provided, leave the requestManager uninitialized until `getRequestQueue` is called
|
|
689
1021
|
}
|
|
690
1022
|
/**
|
|
691
|
-
*
|
|
692
|
-
* Can be used to clean up orphaned browser pages.
|
|
1023
|
+
* Fetches the next request to process from the underlying request provider.
|
|
693
1024
|
*/
|
|
694
|
-
async
|
|
1025
|
+
async _fetchNextRequest() {
|
|
1026
|
+
if (this.requestManager === undefined) {
|
|
1027
|
+
throw new Error(`_fetchNextRequest called on an uninitialized crawler`);
|
|
1028
|
+
}
|
|
1029
|
+
return this.requestManager.fetchNextRequest();
|
|
1030
|
+
}
|
|
695
1031
|
/**
|
|
696
1032
|
* Delays processing of the request based on the `sameDomainDelaySecs` option,
|
|
697
1033
|
* adding it back to the queue after the timeout passes. Returns `true` if the request
|
|
@@ -724,112 +1060,55 @@ export class BasicCrawler {
|
|
|
724
1060
|
}, delay);
|
|
725
1061
|
return true;
|
|
726
1062
|
}
|
|
727
|
-
/**
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
*/
|
|
731
|
-
async _runTaskFunction() {
|
|
732
|
-
const source = this.requestQueue || this.requestList || (await this.getRequestQueue());
|
|
733
|
-
let request;
|
|
734
|
-
let session;
|
|
735
|
-
await this._timeoutAndRetry(async () => {
|
|
736
|
-
request = await this._fetchNextRequest();
|
|
737
|
-
}, this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
|
|
738
|
-
tryCancel();
|
|
739
|
-
if (this.useSessionPool) {
|
|
740
|
-
await this._timeoutAndRetry(async () => {
|
|
741
|
-
session = await this.sessionPool.getSession();
|
|
742
|
-
}, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
|
|
743
|
-
}
|
|
744
|
-
tryCancel();
|
|
745
|
-
if (!request || this.delayRequest(request, source)) {
|
|
746
|
-
return;
|
|
747
|
-
}
|
|
748
|
-
if (!(await this.isAllowedBasedOnRobotsTxtFile(request.url))) {
|
|
749
|
-
this.log.warning(`Skipping request ${request.url} (${request.id}) because it is disallowed based on robots.txt`);
|
|
750
|
-
request.state = RequestState.SKIPPED;
|
|
751
|
-
request.noRetry = true;
|
|
752
|
-
await source.markRequestHandled(request);
|
|
753
|
-
await this.onSkippedRequest?.({
|
|
754
|
-
url: request.url,
|
|
755
|
-
reason: 'robotsTxt',
|
|
756
|
-
});
|
|
757
|
-
return;
|
|
758
|
-
}
|
|
759
|
-
// Reset loadedUrl so an old one is not carried over to retries.
|
|
760
|
-
request.loadedUrl = undefined;
|
|
1063
|
+
/** Handles a single request - runs the request handler with retries, error handling, and lifecycle management. */
|
|
1064
|
+
async handleRequest(crawlingContext, requestSource) {
|
|
1065
|
+
const { request } = crawlingContext;
|
|
761
1066
|
const statisticsId = request.id || request.uniqueKey;
|
|
762
1067
|
this.stats.startJob(statisticsId);
|
|
763
|
-
// Shared crawling context
|
|
764
|
-
// @ts-expect-error
|
|
765
|
-
// All missing properties (that extend CrawlingContext) are set dynamically,
|
|
766
|
-
// but TS does not know that, so otherwise it would throw when compiling.
|
|
767
|
-
const crawlingContext = {
|
|
768
|
-
id: cryptoRandomObjectId(10),
|
|
769
|
-
crawler: this,
|
|
770
|
-
log: this.log,
|
|
771
|
-
request,
|
|
772
|
-
session,
|
|
773
|
-
enqueueLinks: async (options) => {
|
|
774
|
-
return enqueueLinks({
|
|
775
|
-
// specify the RQ first to allow overriding it
|
|
776
|
-
requestQueue: await this.getRequestQueue(),
|
|
777
|
-
robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
|
|
778
|
-
onSkippedRequest: this.onSkippedRequest,
|
|
779
|
-
...options,
|
|
780
|
-
});
|
|
781
|
-
},
|
|
782
|
-
addRequests: this.addRequests.bind(this),
|
|
783
|
-
pushData: this.pushData.bind(this),
|
|
784
|
-
useState: this.useState.bind(this),
|
|
785
|
-
sendRequest: createSendRequest(this.httpClient, request, session, () => crawlingContext.proxyInfo?.url),
|
|
786
|
-
getKeyValueStore: async (idOrName) => KeyValueStore.open(idOrName, { config: this.config }),
|
|
787
|
-
};
|
|
788
|
-
this.crawlingContexts.set(crawlingContext.id, crawlingContext);
|
|
789
1068
|
let isRequestLocked = true;
|
|
790
1069
|
try {
|
|
791
1070
|
request.state = RequestState.REQUEST_HANDLER;
|
|
792
|
-
await
|
|
793
|
-
await this._timeoutAndRetry(async () =>
|
|
1071
|
+
await this.runRequestHandler(crawlingContext);
|
|
1072
|
+
await this._timeoutAndRetry(async () => requestSource.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
|
|
794
1073
|
isRequestLocked = false; // markRequestHandled succeeded and unlocked the request
|
|
795
1074
|
this.stats.finishJob(statisticsId, request.retryCount);
|
|
796
1075
|
this.handledRequestsCount++;
|
|
797
1076
|
// reclaim session if request finishes successfully
|
|
798
1077
|
request.state = RequestState.DONE;
|
|
799
|
-
crawlingContext.session
|
|
1078
|
+
crawlingContext.session.markGood();
|
|
800
1079
|
}
|
|
801
|
-
catch (
|
|
1080
|
+
catch (rawError) {
|
|
1081
|
+
const err = this.unwrapError(rawError);
|
|
802
1082
|
try {
|
|
803
1083
|
request.state = RequestState.ERROR_HANDLER;
|
|
804
|
-
await addTimeoutToPromise(async () => this._requestFunctionErrorHandler(err, crawlingContext,
|
|
1084
|
+
await addTimeoutToPromise(async () => this._requestFunctionErrorHandler(err, crawlingContext, requestSource), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
|
|
805
1085
|
if (!(err instanceof CriticalError)) {
|
|
806
1086
|
isRequestLocked = false; // _requestFunctionErrorHandler calls either markRequestHandled or reclaimRequest
|
|
807
1087
|
}
|
|
808
1088
|
request.state = RequestState.DONE;
|
|
809
1089
|
}
|
|
810
1090
|
catch (secondaryError) {
|
|
811
|
-
|
|
1091
|
+
const unwrappedSecondaryError = this.unwrapError(secondaryError);
|
|
1092
|
+
if (!unwrappedSecondaryError.triggeredFromUserHandler &&
|
|
812
1093
|
// avoid reprinting the same critical error multiple times, as it will be printed by Nodejs at the end anyway
|
|
813
|
-
!(
|
|
1094
|
+
!(unwrappedSecondaryError instanceof CriticalError)) {
|
|
814
1095
|
const apifySpecific = process.env.APIFY_IS_AT_HOME
|
|
815
1096
|
? `This may have happened due to an internal error of Apify's API or due to a misconfigured crawler.`
|
|
816
1097
|
: '';
|
|
817
|
-
this.log.exception(
|
|
1098
|
+
this.log.exception(unwrappedSecondaryError, 'An exception occurred during handling of failed request. ' +
|
|
818
1099
|
`This places the crawler and its underlying storages into an unknown state and crawling will be terminated. ${apifySpecific}`);
|
|
819
1100
|
}
|
|
820
1101
|
request.state = RequestState.ERROR;
|
|
821
|
-
throw
|
|
1102
|
+
throw unwrappedSecondaryError;
|
|
822
1103
|
}
|
|
823
1104
|
// decrease the session score if the request fails (but the error handler did not throw)
|
|
824
|
-
crawlingContext.session
|
|
1105
|
+
crawlingContext.session.markBad();
|
|
825
1106
|
}
|
|
826
1107
|
finally {
|
|
827
|
-
await this._cleanupContext(crawlingContext);
|
|
828
|
-
this.crawlingContexts.delete(crawlingContext.id);
|
|
829
1108
|
// Safety net - release the lock if nobody managed to do it before
|
|
830
|
-
if (isRequestLocked &&
|
|
1109
|
+
if (isRequestLocked && requestSource instanceof RequestProvider) {
|
|
831
1110
|
try {
|
|
832
|
-
await
|
|
1111
|
+
await requestSource.client.deleteRequestLock(request.id);
|
|
833
1112
|
}
|
|
834
1113
|
catch {
|
|
835
1114
|
// We don't have the lock, or the request was never locked. Either way it's fine
|
|
@@ -838,19 +1117,75 @@ export class BasicCrawler {
|
|
|
838
1117
|
}
|
|
839
1118
|
}
|
|
840
1119
|
/**
|
|
841
|
-
*
|
|
1120
|
+
* Wrapper around the crawling context's `enqueueLinks` method:
|
|
1121
|
+
* - Injects `crawlDepth` to each request being added based on the crawling context request.
|
|
1122
|
+
* - Provides defaults for the `enqueueLinks` options based on the crawler configuration.
|
|
1123
|
+
* - These options can be overridden by the user.
|
|
1124
|
+
* @internal
|
|
1125
|
+
*/
|
|
1126
|
+
async enqueueLinksWithCrawlDepth(options, request, requestQueue) {
|
|
1127
|
+
const transformRequestFunctionWrapper = (requestOptions) => {
|
|
1128
|
+
requestOptions.crawlDepth = request.crawlDepth + 1;
|
|
1129
|
+
if (this.maxCrawlDepth !== undefined && requestOptions.crawlDepth > this.maxCrawlDepth) {
|
|
1130
|
+
// Setting `skippedReason` before returning `false` ensures that `reportSkippedRequests`
|
|
1131
|
+
// reports `'depth'` as the reason (via `request.skippedReason ?? reason` fallback),
|
|
1132
|
+
// rather than the generic `'transform'` reason.
|
|
1133
|
+
requestOptions.skippedReason = 'depth';
|
|
1134
|
+
return false;
|
|
1135
|
+
}
|
|
1136
|
+
// After injecting the crawlDepth, we call the user-provided transform function, if there is one.
|
|
1137
|
+
return options.transformRequestFunction?.(requestOptions) ?? requestOptions;
|
|
1138
|
+
};
|
|
1139
|
+
// Create a request-scoped callback that logs enqueueLimit once per request handler call
|
|
1140
|
+
// Only log if an explicit limit was passed to enqueueLinks (not the internal maxRequestsPerCrawl-derived limit)
|
|
1141
|
+
let loggedEnqueueLimitForThisRequest = false;
|
|
1142
|
+
const onSkippedRequest = async (skippedOptions) => {
|
|
1143
|
+
if (skippedOptions.reason === 'enqueueLimit') {
|
|
1144
|
+
if (!loggedEnqueueLimitForThisRequest && options.limit !== undefined) {
|
|
1145
|
+
this.log.info(`Skipping URLs in the handler for ${request.url} due to the enqueueLinks limit of ${options.limit}.`);
|
|
1146
|
+
loggedEnqueueLimitForThisRequest = true;
|
|
1147
|
+
}
|
|
1148
|
+
}
|
|
1149
|
+
await this.handleSkippedRequest(skippedOptions);
|
|
1150
|
+
};
|
|
1151
|
+
return await enqueueLinks({
|
|
1152
|
+
requestQueue,
|
|
1153
|
+
robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
|
|
1154
|
+
onSkippedRequest,
|
|
1155
|
+
limit: this.calculateEnqueuedRequestLimit(options.limit),
|
|
1156
|
+
// Allow user options to override defaults set above ⤴
|
|
1157
|
+
...options,
|
|
1158
|
+
transformRequestFunction: transformRequestFunctionWrapper,
|
|
1159
|
+
});
|
|
1160
|
+
}
|
|
1161
|
+
/**
|
|
1162
|
+
* Generator function that yields requests injected with the given crawl depth.
|
|
1163
|
+
* @internal
|
|
1164
|
+
*/
|
|
1165
|
+
async *addCrawlDepthRequestGenerator(requests, newRequestDepth) {
|
|
1166
|
+
for await (const request of requests) {
|
|
1167
|
+
if (typeof request === 'string') {
|
|
1168
|
+
yield { url: request, crawlDepth: newRequestDepth };
|
|
1169
|
+
}
|
|
1170
|
+
else {
|
|
1171
|
+
request.crawlDepth ??= newRequestDepth;
|
|
1172
|
+
yield request;
|
|
1173
|
+
}
|
|
1174
|
+
}
|
|
1175
|
+
}
|
|
1176
|
+
/**
|
|
1177
|
+
* Run async callback with given timeout and retry. Returns the result of the callback.
|
|
842
1178
|
* @ignore
|
|
843
1179
|
*/
|
|
844
1180
|
async _timeoutAndRetry(handler, timeout, error, maxRetries = 3, retried = 1) {
|
|
845
1181
|
try {
|
|
846
|
-
await addTimeoutToPromise(handler, timeout, error);
|
|
1182
|
+
return await addTimeoutToPromise(handler, timeout, error);
|
|
847
1183
|
}
|
|
848
1184
|
catch (e) {
|
|
849
1185
|
if (retried <= maxRetries) {
|
|
850
1186
|
// we retry on any error, not just timeout
|
|
851
1187
|
this.log.warning(`${e.message} (retrying ${retried}/${maxRetries})`);
|
|
852
|
-
|
|
853
|
-
return;
|
|
1188
|
+
return this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1);
|
|
854
1189
|
}
|
|
855
1190
|
throw e;
|
|
856
1191
|
}
|
|
@@ -859,30 +1194,31 @@ export class BasicCrawler {
|
|
|
859
1194
|
* Returns true if either RequestList or RequestQueue have a request ready for processing.
|
|
860
1195
|
*/
|
|
861
1196
|
async _isTaskReadyFunction() {
|
|
862
|
-
|
|
863
|
-
const isRequestListEmpty = this.requestList ? await this.requestList.isEmpty() : true;
|
|
864
|
-
// If RequestList is not empty, task is ready, no reason to check RequestQueue.
|
|
865
|
-
if (!isRequestListEmpty)
|
|
866
|
-
return true;
|
|
867
|
-
// If RequestQueue is not empty, task is ready, return true, otherwise false.
|
|
868
|
-
return this.requestQueue ? !(await this.requestQueue.isEmpty()) : false;
|
|
1197
|
+
return this.requestManager !== undefined && !(await this.requestManager.isEmpty());
|
|
869
1198
|
}
|
|
870
1199
|
/**
|
|
871
1200
|
* Returns true if both RequestList and RequestQueue have all requests finished.
|
|
872
1201
|
*/
|
|
873
1202
|
async _defaultIsFinishedFunction() {
|
|
874
|
-
|
|
875
|
-
this.requestList ? this.requestList.isFinished() : true,
|
|
876
|
-
this.requestQueue ? this.requestQueue.isFinished() : true,
|
|
877
|
-
]);
|
|
878
|
-
// If both are finished, return true, otherwise return false.
|
|
879
|
-
return isRequestListFinished && isRequestQueueFinished;
|
|
1203
|
+
return !this.requestManager || (await this.requestManager.isFinished());
|
|
880
1204
|
}
|
|
881
1205
|
async _rotateSession(crawlingContext) {
|
|
882
1206
|
const { request } = crawlingContext;
|
|
883
1207
|
request.sessionRotationCount ??= 0;
|
|
884
1208
|
request.sessionRotationCount++;
|
|
885
|
-
crawlingContext.session
|
|
1209
|
+
crawlingContext.session.retire();
|
|
1210
|
+
}
|
|
1211
|
+
/**
|
|
1212
|
+
* Unwraps errors thrown by the context pipeline to get the actual user error.
|
|
1213
|
+
* RequestHandlerError and ContextPipelineInitializationError wrap the actual error.
|
|
1214
|
+
*/
|
|
1215
|
+
unwrapError(error) {
|
|
1216
|
+
if (error instanceof RequestHandlerError ||
|
|
1217
|
+
error instanceof ContextPipelineInitializationError ||
|
|
1218
|
+
error instanceof ContextPipelineCleanupError) {
|
|
1219
|
+
return this.unwrapError(error.cause);
|
|
1220
|
+
}
|
|
1221
|
+
return error;
|
|
886
1222
|
}
|
|
887
1223
|
/**
|
|
888
1224
|
* Handles errors thrown by user provided requestHandler()
|
|
@@ -896,12 +1232,15 @@ export class BasicCrawler {
|
|
|
896
1232
|
const shouldRetryRequest = this._canRequestBeRetried(request, error);
|
|
897
1233
|
if (shouldRetryRequest) {
|
|
898
1234
|
await this.stats.errorTrackerRetry.addAsync(error, crawlingContext);
|
|
899
|
-
await this.errorHandler?.(crawlingContext,
|
|
1235
|
+
await this.errorHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
|
|
1236
|
+
error);
|
|
900
1237
|
if (error instanceof SessionError) {
|
|
901
1238
|
await this._rotateSession(crawlingContext);
|
|
902
1239
|
}
|
|
903
1240
|
if (!request.noRetry) {
|
|
904
|
-
|
|
1241
|
+
if (!(error instanceof SessionError)) {
|
|
1242
|
+
request.retryCount++;
|
|
1243
|
+
}
|
|
905
1244
|
const { url, retryCount, id } = request;
|
|
906
1245
|
// We don't want to see the stack trace in the logs by default, when we are going to retry the request.
|
|
907
1246
|
// Thus, we print the full stack trace only when CRAWLEE_VERBOSE_LOG environment variable is set to true.
|
|
@@ -915,6 +1254,9 @@ export class BasicCrawler {
|
|
|
915
1254
|
return;
|
|
916
1255
|
}
|
|
917
1256
|
}
|
|
1257
|
+
if (error instanceof SessionError) {
|
|
1258
|
+
crawlingContext.session?.retire();
|
|
1259
|
+
}
|
|
918
1260
|
// If the request is non-retryable, the error and snapshot aren't saved in the errorTrackerRetry object.
|
|
919
1261
|
// Therefore, we pass the crawlingContext to the errorTracker.add method, enabling snapshot capture.
|
|
920
1262
|
// This is to make sure the error snapshot is not duplicated in the errorTrackerRetry and errorTracker objects.
|
|
@@ -948,7 +1290,8 @@ export class BasicCrawler {
|
|
|
948
1290
|
const message = this._getMessageFromError(error, true);
|
|
949
1291
|
this.log.error(`Request failed and reached maximum retries. ${message}`, { id, url, method, uniqueKey });
|
|
950
1292
|
if (this.failedRequestHandler) {
|
|
951
|
-
await this.failedRequestHandler?.(crawlingContext,
|
|
1293
|
+
await this.failedRequestHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
|
|
1294
|
+
error);
|
|
952
1295
|
}
|
|
953
1296
|
}
|
|
954
1297
|
/**
|
|
@@ -986,19 +1329,11 @@ export class BasicCrawler {
|
|
|
986
1329
|
return request.retryCount < maxRequestRetries;
|
|
987
1330
|
}
|
|
988
1331
|
/**
|
|
989
|
-
* Updates handledRequestsCount from possibly stored counts,
|
|
990
|
-
* usually after worker migration. Since one of the stores
|
|
991
|
-
* needs to have priority when both are present,
|
|
992
|
-
* it is the request queue, because generally, the request
|
|
993
|
-
* list will first be dumped into the queue and then left
|
|
994
|
-
* empty.
|
|
1332
|
+
* Updates handledRequestsCount from possibly stored counts, usually after worker migration.
|
|
995
1333
|
*/
|
|
996
1334
|
async _loadHandledRequestCount() {
|
|
997
|
-
if (this.
|
|
998
|
-
this.handledRequestsCount = await this.
|
|
999
|
-
}
|
|
1000
|
-
else if (this.requestList) {
|
|
1001
|
-
this.handledRequestsCount = this.requestList.handledCount();
|
|
1335
|
+
if (this.requestManager) {
|
|
1336
|
+
this.handledRequestsCount = await this.requestManager.handledCount();
|
|
1002
1337
|
}
|
|
1003
1338
|
}
|
|
1004
1339
|
async _executeHooks(hooks, ...args) {
|
|
@@ -1009,16 +1344,17 @@ export class BasicCrawler {
|
|
|
1009
1344
|
}
|
|
1010
1345
|
}
|
|
1011
1346
|
/**
|
|
1012
|
-
*
|
|
1013
|
-
*
|
|
1347
|
+
* Stops the crawler immediately.
|
|
1348
|
+
*
|
|
1349
|
+
* This method doesn't wait for currently active requests to finish.
|
|
1350
|
+
*
|
|
1351
|
+
* To stop the crawler gracefully (waiting for all running requests to finish), use {@link BasicCrawler.stop|`crawler.stop()`} instead.
|
|
1014
1352
|
*/
|
|
1015
1353
|
async teardown() {
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
await this.sessionPool.teardown();
|
|
1019
|
-
}
|
|
1354
|
+
serviceLocator.getEventManager().emit("persistState" /* EventType.PERSIST_STATE */, { isMigrating: false });
|
|
1355
|
+
await this.sessionPool?.teardown();
|
|
1020
1356
|
if (this._closeEvents) {
|
|
1021
|
-
await
|
|
1357
|
+
await serviceLocator.getEventManager().close();
|
|
1022
1358
|
}
|
|
1023
1359
|
await this.autoscaledPool?.abort();
|
|
1024
1360
|
}
|
|
@@ -1036,9 +1372,9 @@ export class BasicCrawler {
|
|
|
1036
1372
|
this.log.info('Using the old RequestQueue implementation without request locking.');
|
|
1037
1373
|
this._experimentWarnings.requestLocking = true;
|
|
1038
1374
|
}
|
|
1039
|
-
return RequestQueueV1.open(null, { config:
|
|
1375
|
+
return RequestQueueV1.open(null, { config: serviceLocator.getConfiguration() });
|
|
1040
1376
|
}
|
|
1041
|
-
return RequestQueue.open(null, { config:
|
|
1377
|
+
return RequestQueue.open(null, { config: serviceLocator.getConfiguration() });
|
|
1042
1378
|
}
|
|
1043
1379
|
requestMatchesEnqueueStrategy(request) {
|
|
1044
1380
|
const { url, loadedUrl } = request;
|