@crawlee/basic 4.0.0-beta.6 → 4.0.0-beta.61
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -5
- package/index.d.ts +1 -1
- package/index.d.ts.map +1 -1
- package/index.js +0 -1
- package/index.js.map +1 -1
- package/internals/basic-crawler.d.ts +292 -125
- package/internals/basic-crawler.d.ts.map +1 -1
- package/internals/basic-crawler.js +721 -354
- package/internals/basic-crawler.js.map +1 -1
- package/internals/send-request.d.ts +3 -5
- package/internals/send-request.d.ts.map +1 -1
- package/internals/send-request.js +21 -25
- package/internals/send-request.js.map +1 -1
- package/package.json +7 -8
- package/internals/constants.d.ts +0 -7
- package/internals/constants.d.ts.map +0 -1
- package/internals/constants.js +0 -7
- package/internals/constants.js.map +0 -1
- package/tsconfig.build.tsbuildinfo +0 -1
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
import { writeFile } from 'node:fs/promises';
|
|
2
2
|
import { dirname } from 'node:path';
|
|
3
|
-
import { AutoscaledPool,
|
|
4
|
-
import {
|
|
3
|
+
import { AutoscaledPool, bindMethodsToServiceLocator, BLOCKED_STATUS_CODES, ContextPipeline, ContextPipelineCleanupError, ContextPipelineInitializationError, ContextPipelineInterruptedError, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, KeyValueStore, LogLevel, mergeCookies, MissingSessionError, NavigationSkippedError, NonRetryableError, purgeDefaultStorages, RequestHandlerError, RequestListAdapter, RequestManagerTandem, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, ServiceLocator, serviceLocator, Session, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
|
|
4
|
+
import { GotScrapingHttpClient } from '@crawlee/got-scraping-client';
|
|
5
|
+
import { getObjectType, isAsyncIterable, isIterable, RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
|
|
5
6
|
import { stringify } from 'csv-stringify/sync';
|
|
6
7
|
import { ensureDir, writeJSON } from 'fs-extra/esm';
|
|
7
8
|
import ow from 'ow';
|
|
8
9
|
import { getDomain } from 'tldts';
|
|
9
10
|
import { LruCache } from '@apify/datastructures';
|
|
10
|
-
import
|
|
11
|
-
import { addTimeoutToPromise, TimeoutError, tryCancel } from '@apify/timeout';
|
|
11
|
+
import { addTimeoutToPromise, TimeoutError } from '@apify/timeout';
|
|
12
12
|
import { cryptoRandomObjectId } from '@apify/utilities';
|
|
13
13
|
import { createSendRequest } from './send-request.js';
|
|
14
14
|
/**
|
|
@@ -21,6 +21,7 @@ import { createSendRequest } from './send-request.js';
|
|
|
21
21
|
* @ignore
|
|
22
22
|
*/
|
|
23
23
|
const SAFE_MIGRATION_WAIT_MILLIS = 20000;
|
|
24
|
+
const deferredCleanupKey = Symbol('deferredCleanup');
|
|
24
25
|
/**
|
|
25
26
|
* Provides a simple framework for parallel crawling of web pages.
|
|
26
27
|
* The URLs to crawl are fed either from a static list of URLs
|
|
@@ -86,8 +87,12 @@ const SAFE_MIGRATION_WAIT_MILLIS = 20000;
|
|
|
86
87
|
* @category Crawlers
|
|
87
88
|
*/
|
|
88
89
|
export class BasicCrawler {
|
|
89
|
-
config;
|
|
90
90
|
static CRAWLEE_STATE_KEY = 'CRAWLEE_STATE';
|
|
91
|
+
/**
|
|
92
|
+
* Tracks crawler instances that accessed shared state without having an explicit id.
|
|
93
|
+
* Used to detect and warn about multiple crawlers sharing the same state.
|
|
94
|
+
*/
|
|
95
|
+
static useStateCrawlerIds = new Set();
|
|
91
96
|
/**
|
|
92
97
|
* A reference to the underlying {@link Statistics} class that collects and logs run statistics for requests.
|
|
93
98
|
*/
|
|
@@ -104,10 +109,21 @@ export class BasicCrawler {
|
|
|
104
109
|
*/
|
|
105
110
|
requestQueue;
|
|
106
111
|
/**
|
|
107
|
-
*
|
|
108
|
-
|
|
112
|
+
* The main request-handling component of the crawler. It's initialized during the crawler startup.
|
|
113
|
+
*/
|
|
114
|
+
requestManager;
|
|
115
|
+
/**
|
|
116
|
+
* A reference to the underlying session pool that manages the crawler's {@link Session|sessions}. Typed as
|
|
117
|
+
* {@link ISessionPool} so custom implementations can be plugged in via the `sessionPool` constructor option.
|
|
109
118
|
*/
|
|
110
119
|
sessionPool;
|
|
120
|
+
/**
|
|
121
|
+
* Set when the crawler constructed its own {@link SessionPool} (no `sessionPool` option was provided).
|
|
122
|
+
* Holds the same instance as `sessionPool`, but typed as the concrete class so the crawler can call
|
|
123
|
+
* lifecycle methods (`resetStore`, `teardown`) that aren't part of {@link ISessionPool}. A user-supplied
|
|
124
|
+
* pool is never owned and never torn down by the crawler.
|
|
125
|
+
*/
|
|
126
|
+
ownedSessionPool;
|
|
111
127
|
/**
|
|
112
128
|
* A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
|
|
113
129
|
* > *NOTE:* This property is only initialized after calling the {@link BasicCrawler.run|`crawler.run()`} function.
|
|
@@ -116,40 +132,77 @@ export class BasicCrawler {
|
|
|
116
132
|
* or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
|
|
117
133
|
*/
|
|
118
134
|
autoscaledPool;
|
|
135
|
+
/**
|
|
136
|
+
* A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
|
|
137
|
+
* Only available if used by the crawler.
|
|
138
|
+
*/
|
|
139
|
+
proxyConfiguration;
|
|
119
140
|
/**
|
|
120
141
|
* Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
|
|
121
142
|
* See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
|
|
122
143
|
*/
|
|
123
144
|
router = Router.create();
|
|
145
|
+
_basicContextPipeline;
|
|
146
|
+
/**
|
|
147
|
+
* The basic part of the context pipeline. Unlike the subclass pipeline, this
|
|
148
|
+
* part has no major side effects (e.g. launching a browser). It also makes typing more explicit, as subclass
|
|
149
|
+
* pipelines expect the basic crawler fields to already be present in the context at runtime.
|
|
150
|
+
*
|
|
151
|
+
* Context built with this pipeline can be passed into multiple crawler pipelines at once.
|
|
152
|
+
* This is used e.g. in the {@link AdaptivePlaywrightCrawler|`AdaptivePlaywrightCrawler`}.
|
|
153
|
+
*/
|
|
154
|
+
get basicContextPipeline() {
|
|
155
|
+
if (this._basicContextPipeline === undefined) {
|
|
156
|
+
this._basicContextPipeline = this.buildBasicContextPipeline();
|
|
157
|
+
}
|
|
158
|
+
return this._basicContextPipeline;
|
|
159
|
+
}
|
|
160
|
+
_contextPipeline;
|
|
161
|
+
get contextPipeline() {
|
|
162
|
+
if (this._contextPipeline === undefined) {
|
|
163
|
+
this._contextPipeline = this.buildFinalContextPipeline();
|
|
164
|
+
}
|
|
165
|
+
return this._contextPipeline;
|
|
166
|
+
}
|
|
124
167
|
running = false;
|
|
125
168
|
hasFinishedBefore = false;
|
|
126
|
-
|
|
169
|
+
unexpectedStop = false;
|
|
170
|
+
#log;
|
|
171
|
+
get log() {
|
|
172
|
+
return this.#log;
|
|
173
|
+
}
|
|
127
174
|
requestHandler;
|
|
128
175
|
errorHandler;
|
|
129
176
|
failedRequestHandler;
|
|
130
177
|
requestHandlerTimeoutMillis;
|
|
131
178
|
internalTimeoutMillis;
|
|
132
179
|
maxRequestRetries;
|
|
180
|
+
maxCrawlDepth;
|
|
133
181
|
sameDomainDelayMillis;
|
|
134
182
|
domainAccessedTime;
|
|
135
|
-
|
|
136
|
-
handledRequestsCount;
|
|
183
|
+
maxRequestsPerCrawl;
|
|
184
|
+
handledRequestsCount = 0;
|
|
137
185
|
statusMessageLoggingInterval;
|
|
138
186
|
statusMessageCallback;
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
187
|
+
blockedStatusCodes = new Set();
|
|
188
|
+
additionalHttpErrorStatusCodes;
|
|
189
|
+
ignoreHttpErrorStatusCodes;
|
|
142
190
|
autoscaledPoolOptions;
|
|
143
|
-
events;
|
|
144
191
|
httpClient;
|
|
145
192
|
retryOnBlocked;
|
|
146
193
|
respectRobotsTxtFile;
|
|
147
194
|
onSkippedRequest;
|
|
148
195
|
_closeEvents;
|
|
196
|
+
loggedPerRun = new Set();
|
|
149
197
|
experiments;
|
|
150
198
|
robotsTxtFileCache;
|
|
151
199
|
_experimentWarnings = {};
|
|
200
|
+
crawlerId;
|
|
201
|
+
hasExplicitId;
|
|
202
|
+
contextPipelineOptions;
|
|
152
203
|
static optionsShape = {
|
|
204
|
+
contextPipelineBuilder: ow.optional.object,
|
|
205
|
+
extendContext: ow.optional.function,
|
|
153
206
|
requestList: ow.optional.object.validate(validators.requestList),
|
|
154
207
|
requestQueue: ow.optional.object.validate(validators.requestQueue),
|
|
155
208
|
// Subclasses override this function instead of passing it
|
|
@@ -161,145 +214,358 @@ export class BasicCrawler {
|
|
|
161
214
|
failedRequestHandler: ow.optional.function,
|
|
162
215
|
maxRequestRetries: ow.optional.number,
|
|
163
216
|
sameDomainDelaySecs: ow.optional.number,
|
|
164
|
-
maxSessionRotations: ow.optional.number,
|
|
165
217
|
maxRequestsPerCrawl: ow.optional.number,
|
|
218
|
+
maxCrawlDepth: ow.optional.number,
|
|
166
219
|
autoscaledPoolOptions: ow.optional.object,
|
|
167
|
-
|
|
168
|
-
|
|
220
|
+
sessionPool: ow.optional.object.validate(validators.sessionPool),
|
|
221
|
+
proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
|
|
169
222
|
statusMessageLoggingInterval: ow.optional.number,
|
|
170
223
|
statusMessageCallback: ow.optional.function,
|
|
224
|
+
additionalHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
|
|
225
|
+
ignoreHttpErrorStatusCodes: ow.optional.array.ofType(ow.number),
|
|
226
|
+
blockedStatusCodes: ow.optional.array.ofType(ow.number),
|
|
171
227
|
retryOnBlocked: ow.optional.boolean,
|
|
172
|
-
respectRobotsTxtFile: ow.optional.boolean,
|
|
228
|
+
respectRobotsTxtFile: ow.optional.any(ow.boolean, ow.object),
|
|
173
229
|
onSkippedRequest: ow.optional.function,
|
|
174
230
|
httpClient: ow.optional.object,
|
|
231
|
+
configuration: ow.optional.object,
|
|
232
|
+
storageClient: ow.optional.object,
|
|
233
|
+
eventManager: ow.optional.object,
|
|
234
|
+
logger: ow.optional.object,
|
|
175
235
|
// AutoscaledPool shorthands
|
|
176
236
|
minConcurrency: ow.optional.number,
|
|
177
237
|
maxConcurrency: ow.optional.number,
|
|
178
238
|
maxRequestsPerMinute: ow.optional.number.integerOrInfinite.positive.greaterThanOrEqual(1),
|
|
179
239
|
keepAlive: ow.optional.boolean,
|
|
180
240
|
// internal
|
|
181
|
-
log: ow.optional.object,
|
|
182
241
|
experiments: ow.optional.object,
|
|
183
242
|
statisticsOptions: ow.optional.object,
|
|
243
|
+
id: ow.optional.string,
|
|
184
244
|
};
|
|
185
245
|
/**
|
|
186
246
|
* All `BasicCrawler` parameters are passed via an options object.
|
|
187
247
|
*/
|
|
188
|
-
constructor(options = {}
|
|
189
|
-
this.config = config;
|
|
248
|
+
constructor(options = {}) {
|
|
190
249
|
ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape));
|
|
191
|
-
const { requestList, requestQueue, maxRequestRetries = 3, sameDomainDelaySecs = 0,
|
|
250
|
+
const { requestList, requestQueue, requestManager, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxRequestsPerCrawl, maxCrawlDepth, autoscaledPoolOptions = {}, keepAlive, sessionPool, proxyConfiguration, additionalHttpErrorStatusCodes = [], ignoreHttpErrorStatusCodes = [],
|
|
251
|
+
// Service locator options
|
|
252
|
+
configuration, storageClient, eventManager, logger,
|
|
192
253
|
// AutoscaledPool shorthands
|
|
193
|
-
minConcurrency, maxConcurrency, maxRequestsPerMinute, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
|
|
254
|
+
minConcurrency, maxConcurrency, maxRequestsPerMinute, blockedStatusCodes: blockedStatusCodesInput, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
|
|
194
255
|
// internal
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
this.errorHandler = errorHandler;
|
|
209
|
-
if (requestHandlerTimeoutSecs) {
|
|
210
|
-
this.requestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
|
|
256
|
+
experiments = {}, id, } = options;
|
|
257
|
+
// Create per-crawler service locator if custom services were provided.
|
|
258
|
+
// This wraps every method on the crawler instance so that calls to the global `serviceLocator`
|
|
259
|
+
// (via AsyncLocalStorage) resolve to this scoped instance instead.
|
|
260
|
+
// We also enter the scope for the rest of the constructor body, so that any code below
|
|
261
|
+
// that accesses `serviceLocator` will see the correct (scoped) instance.
|
|
262
|
+
let serviceLocatorScope = { enterScope: () => { }, exitScope: () => { } };
|
|
263
|
+
if (storageClient ||
|
|
264
|
+
eventManager ||
|
|
265
|
+
logger ||
|
|
266
|
+
(configuration !== undefined && configuration !== serviceLocator.getConfiguration())) {
|
|
267
|
+
const scopedServiceLocator = new ServiceLocator(configuration, eventManager, storageClient, logger);
|
|
268
|
+
serviceLocatorScope = bindMethodsToServiceLocator(scopedServiceLocator, this);
|
|
211
269
|
}
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
this.maxRequestRetries = maxRequestRetries;
|
|
230
|
-
this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
|
|
231
|
-
this.maxSessionRotations = maxSessionRotations;
|
|
232
|
-
this.handledRequestsCount = 0;
|
|
233
|
-
this.stats = new Statistics({
|
|
234
|
-
logMessage: `${log.getOptions().prefix} request statistics:`,
|
|
235
|
-
log,
|
|
236
|
-
config,
|
|
237
|
-
...statisticsOptions,
|
|
238
|
-
});
|
|
239
|
-
this.sessionPoolOptions = {
|
|
240
|
-
...sessionPoolOptions,
|
|
241
|
-
log,
|
|
242
|
-
};
|
|
243
|
-
if (this.retryOnBlocked) {
|
|
244
|
-
this.sessionPoolOptions.blockedStatusCodes = sessionPoolOptions.blockedStatusCodes ?? [];
|
|
245
|
-
if (this.sessionPoolOptions.blockedStatusCodes.length !== 0) {
|
|
246
|
-
log.warning(`Both 'blockedStatusCodes' and 'retryOnBlocked' are set. Please note that the 'retryOnBlocked' feature might not work as expected.`);
|
|
270
|
+
try {
|
|
271
|
+
serviceLocatorScope.enterScope();
|
|
272
|
+
this.contextPipelineOptions = {
|
|
273
|
+
contextPipelineBuilder: options.contextPipelineBuilder,
|
|
274
|
+
extendContext: options.extendContext,
|
|
275
|
+
};
|
|
276
|
+
this.#log = serviceLocator.getLogger().child({ prefix: this.constructor.name });
|
|
277
|
+
// Store whether the user explicitly provided an ID
|
|
278
|
+
this.hasExplicitId = id !== undefined;
|
|
279
|
+
// Store the user-provided ID, or generate a unique one for tracking purposes (not for state key)
|
|
280
|
+
this.crawlerId = id ?? cryptoRandomObjectId();
|
|
281
|
+
if (requestManager !== undefined) {
|
|
282
|
+
if (requestList !== undefined || requestQueue !== undefined) {
|
|
283
|
+
throw new Error('The `requestManager` option cannot be used in conjunction with `requestList` and/or `requestQueue`');
|
|
284
|
+
}
|
|
285
|
+
this.requestManager = requestManager;
|
|
286
|
+
this.requestQueue = requestManager; // TODO(v4) - the cast is not fully legitimate here, but it's fine for internal usage by the BasicCrawler
|
|
247
287
|
}
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
this.
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
288
|
+
else {
|
|
289
|
+
this.requestList = requestList;
|
|
290
|
+
this.requestQueue = requestQueue;
|
|
291
|
+
}
|
|
292
|
+
this.httpClient = httpClient ?? new GotScrapingHttpClient({ logger: this.log });
|
|
293
|
+
this.proxyConfiguration = proxyConfiguration;
|
|
294
|
+
this.statusMessageLoggingInterval = statusMessageLoggingInterval;
|
|
295
|
+
this.statusMessageCallback = statusMessageCallback;
|
|
296
|
+
this.domainAccessedTime = new Map();
|
|
297
|
+
this.experiments = experiments;
|
|
298
|
+
this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });
|
|
299
|
+
this.handleSkippedRequest = this.handleSkippedRequest.bind(this);
|
|
300
|
+
this.additionalHttpErrorStatusCodes = new Set([...additionalHttpErrorStatusCodes]);
|
|
301
|
+
this.ignoreHttpErrorStatusCodes = new Set([...ignoreHttpErrorStatusCodes]);
|
|
302
|
+
this.requestHandler = requestHandler ?? this.router;
|
|
303
|
+
this.failedRequestHandler = failedRequestHandler;
|
|
304
|
+
this.errorHandler = errorHandler;
|
|
305
|
+
if (requestHandlerTimeoutSecs) {
|
|
306
|
+
this.requestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
|
|
307
|
+
}
|
|
308
|
+
else {
|
|
309
|
+
this.requestHandlerTimeoutMillis = 60_000;
|
|
310
|
+
}
|
|
311
|
+
this.retryOnBlocked = retryOnBlocked;
|
|
312
|
+
this.respectRobotsTxtFile = respectRobotsTxtFile;
|
|
313
|
+
this.onSkippedRequest = onSkippedRequest;
|
|
314
|
+
const tryEnv = (val) => (val == null ? null : +val);
|
|
315
|
+
// allow at least 5min for internal timeouts
|
|
316
|
+
this.internalTimeoutMillis =
|
|
317
|
+
tryEnv(process.env.CRAWLEE_INTERNAL_TIMEOUT) ?? Math.max(this.requestHandlerTimeoutMillis * 2, 300e3);
|
|
318
|
+
// override the default internal timeout of request queue to respect `requestHandlerTimeoutMillis`
|
|
319
|
+
if (this.requestQueue) {
|
|
320
|
+
this.requestQueue.internalTimeoutMillis = this.internalTimeoutMillis;
|
|
321
|
+
// for request queue v2, we want to lock requests for slightly longer than the request handler timeout so that there is some padding for locking-related overhead,
|
|
322
|
+
// but never for less than a minute
|
|
323
|
+
this.requestQueue.requestLockSecs = Math.max(this.requestHandlerTimeoutMillis / 1000 + 5, 60);
|
|
324
|
+
}
|
|
325
|
+
this.maxRequestRetries = maxRequestRetries;
|
|
326
|
+
this.maxCrawlDepth = maxCrawlDepth;
|
|
327
|
+
this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
|
|
328
|
+
this.stats = new Statistics({
|
|
329
|
+
logMessage: `${this.constructor.name} request statistics:`,
|
|
330
|
+
log: this.log,
|
|
331
|
+
...(this.hasExplicitId ? { id: this.crawlerId } : {}),
|
|
332
|
+
...statisticsOptions,
|
|
333
|
+
});
|
|
334
|
+
if (sessionPool && proxyConfiguration) {
|
|
335
|
+
this.log.warning('Both `sessionPool` and `proxyConfiguration` were provided to the crawler. ' +
|
|
336
|
+
'The `proxyConfiguration` is ignored - sessions from the supplied pool keep whatever ' +
|
|
337
|
+
'`proxyInfo` they were created with. Configure proxies on the pool instead, ' +
|
|
338
|
+
'e.g. via `addSession({ proxyInfo })` or a custom `createSessionFunction`.');
|
|
339
|
+
}
|
|
340
|
+
if (sessionPool) {
|
|
341
|
+
this.sessionPool = sessionPool;
|
|
342
|
+
}
|
|
343
|
+
else {
|
|
344
|
+
this.ownedSessionPool = new SessionPool({
|
|
345
|
+
createSessionFunction: async (opts) => new Session({
|
|
346
|
+
...opts?.sessionOptions,
|
|
347
|
+
proxyInfo: opts?.sessionOptions?.proxyInfo ?? (await this.proxyConfiguration?.newProxyInfo()),
|
|
348
|
+
}),
|
|
349
|
+
});
|
|
350
|
+
this.sessionPool = this.ownedSessionPool;
|
|
351
|
+
}
|
|
352
|
+
this.blockedStatusCodes = new Set(blockedStatusCodesInput ?? BLOCKED_STATUS_CODES);
|
|
353
|
+
const maxSignedInteger = 2 ** 31 - 1;
|
|
354
|
+
if (this.requestHandlerTimeoutMillis > maxSignedInteger) {
|
|
355
|
+
this.log.warning(`requestHandlerTimeoutMillis ${this.requestHandlerTimeoutMillis}` +
|
|
356
|
+
` does not fit a signed 32-bit integer. Limiting the value to ${maxSignedInteger}`);
|
|
357
|
+
this.requestHandlerTimeoutMillis = maxSignedInteger;
|
|
358
|
+
}
|
|
359
|
+
this.internalTimeoutMillis = Math.min(this.internalTimeoutMillis, maxSignedInteger);
|
|
360
|
+
this.maxRequestsPerCrawl = maxRequestsPerCrawl;
|
|
361
|
+
const isMaxPagesExceeded = () => this.maxRequestsPerCrawl && this.maxRequestsPerCrawl <= this.handledRequestsCount;
|
|
362
|
+
// eslint-disable-next-line prefer-const
|
|
363
|
+
let { isFinishedFunction, isTaskReadyFunction } = autoscaledPoolOptions;
|
|
364
|
+
// override even if `isFinishedFunction` provided by user - `keepAlive` has higher priority
|
|
365
|
+
if (keepAlive) {
|
|
366
|
+
isFinishedFunction = async () => false;
|
|
367
|
+
}
|
|
368
|
+
const basicCrawlerAutoscaledPoolConfiguration = {
|
|
369
|
+
minConcurrency: minConcurrency ?? autoscaledPoolOptions?.minConcurrency,
|
|
370
|
+
maxConcurrency: maxConcurrency ?? autoscaledPoolOptions?.maxConcurrency,
|
|
371
|
+
maxTasksPerMinute: maxRequestsPerMinute ?? autoscaledPoolOptions?.maxTasksPerMinute,
|
|
372
|
+
runTaskFunction: async () => {
|
|
373
|
+
const source = this.requestManager;
|
|
374
|
+
if (!source)
|
|
375
|
+
throw new Error('Request provider is not initialized!');
|
|
376
|
+
const request = await this.resolveRequest();
|
|
377
|
+
if (!request || this.delayRequest(request, source)) {
|
|
378
|
+
return;
|
|
277
379
|
}
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
380
|
+
const crawlingContext = { request };
|
|
381
|
+
try {
|
|
382
|
+
await this.basicContextPipeline
|
|
383
|
+
.chain(this.contextPipeline)
|
|
384
|
+
.call(crawlingContext, (ctx) => this.handleRequest(ctx, source, request));
|
|
385
|
+
}
|
|
386
|
+
catch (error) {
|
|
387
|
+
// ContextPipelineInterruptedError means the request was intentionally skipped
|
|
388
|
+
// (e.g., doesn't match enqueue strategy after redirect). Just return gracefully.
|
|
389
|
+
if (error instanceof ContextPipelineInterruptedError) {
|
|
390
|
+
await this._timeoutAndRetry(async () => this.requestManager?.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${crawlingContext.request.url} (${crawlingContext.request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
|
|
391
|
+
return;
|
|
392
|
+
}
|
|
393
|
+
// If the error happened during pipeline initialization (e.g., navigation timeout, session/proxy error,
|
|
394
|
+
// i.e. not in user's requestHandler), handle it through the normal error flow.
|
|
395
|
+
const isPipelineError = error instanceof ContextPipelineInitializationError || error instanceof SessionError;
|
|
396
|
+
if (isPipelineError) {
|
|
397
|
+
const unwrappedError = this.unwrapError(error);
|
|
398
|
+
await this._requestFunctionErrorHandler(unwrappedError, crawlingContext, request, this.requestManager);
|
|
399
|
+
// SessionError already retired the session in `_requestFunctionErrorHandler`;
|
|
400
|
+
// skip `markBad` to avoid double-counting usage/error score.
|
|
401
|
+
if (!(unwrappedError instanceof SessionError)) {
|
|
402
|
+
crawlingContext.session?.markBad();
|
|
403
|
+
}
|
|
404
|
+
return;
|
|
405
|
+
}
|
|
406
|
+
throw this.unwrapError(error);
|
|
407
|
+
}
|
|
408
|
+
},
|
|
409
|
+
isTaskReadyFunction: async () => {
|
|
410
|
+
if (isMaxPagesExceeded()) {
|
|
411
|
+
this.logOncePerRun('shuttingDown', 'Crawler reached the maxRequestsPerCrawl limit of ' +
|
|
412
|
+
`${this.maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
|
|
413
|
+
return false;
|
|
414
|
+
}
|
|
415
|
+
if (this.unexpectedStop) {
|
|
416
|
+
this.logOncePerRun('shuttingDown', 'No new requests are allowed because the `stop()` method has been called. ' +
|
|
417
|
+
'Ongoing requests will be allowed to complete.');
|
|
418
|
+
return false;
|
|
419
|
+
}
|
|
420
|
+
return isTaskReadyFunction ? await isTaskReadyFunction() : await this._isTaskReadyFunction();
|
|
421
|
+
},
|
|
422
|
+
isFinishedFunction: async () => {
|
|
423
|
+
if (isMaxPagesExceeded()) {
|
|
424
|
+
this.log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${this.maxRequestsPerCrawl} requests ` +
|
|
425
|
+
'and all requests that were in progress at that time have now finished. ' +
|
|
426
|
+
`In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`);
|
|
427
|
+
return true;
|
|
428
|
+
}
|
|
429
|
+
if (this.unexpectedStop) {
|
|
430
|
+
this.log.info('The crawler has finished all the remaining ongoing requests and will shut down now.');
|
|
431
|
+
return true;
|
|
432
|
+
}
|
|
433
|
+
const isFinished = isFinishedFunction
|
|
434
|
+
? await isFinishedFunction()
|
|
435
|
+
: await this._defaultIsFinishedFunction();
|
|
436
|
+
if (isFinished) {
|
|
437
|
+
const reason = isFinishedFunction
|
|
438
|
+
? "Crawler's custom isFinishedFunction() returned true, the crawler will shut down."
|
|
439
|
+
: 'All requests from the queue have been processed, the crawler will shut down.';
|
|
440
|
+
this.log.info(reason);
|
|
441
|
+
}
|
|
442
|
+
return isFinished;
|
|
443
|
+
},
|
|
444
|
+
log: this.log,
|
|
445
|
+
};
|
|
446
|
+
this.autoscaledPoolOptions = { ...autoscaledPoolOptions, ...basicCrawlerAutoscaledPoolConfiguration };
|
|
447
|
+
}
|
|
448
|
+
finally {
|
|
449
|
+
serviceLocatorScope.exitScope();
|
|
450
|
+
}
|
|
451
|
+
}
|
|
452
|
+
/**
|
|
453
|
+
* Determines if the given HTTP status code is an error status code given
|
|
454
|
+
* the default behaviour and user-set preferences.
|
|
455
|
+
* @param status
|
|
456
|
+
* @returns `true` if the status code is considered an error, `false` otherwise
|
|
457
|
+
*/
|
|
458
|
+
isErrorStatusCode(status) {
|
|
459
|
+
const excludeError = this.ignoreHttpErrorStatusCodes.has(status);
|
|
460
|
+
const includeError = this.additionalHttpErrorStatusCodes.has(status);
|
|
461
|
+
return (status >= 500 && !excludeError) || includeError;
|
|
462
|
+
}
|
|
463
|
+
/**
|
|
464
|
+
* Builds the basic context pipeline that transforms `{ request }` into a full `CrawlingContext`.
|
|
465
|
+
* This handles base context creation, session resolution, and context helpers.
|
|
466
|
+
*/
|
|
467
|
+
buildBasicContextPipeline() {
|
|
468
|
+
return ContextPipeline.create()
|
|
469
|
+
.compose({ action: this.checkRobotsTxt.bind(this) })
|
|
470
|
+
.compose({
|
|
471
|
+
action: () => this.createBaseContext(),
|
|
472
|
+
cleanup: async (context) => {
|
|
473
|
+
await Promise.all(context[deferredCleanupKey].map((fn) => fn()));
|
|
281
474
|
},
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
475
|
+
})
|
|
476
|
+
.compose({ action: this.resolveSession.bind(this) })
|
|
477
|
+
.compose({ action: this.createContextHelpers.bind(this) });
|
|
478
|
+
}
|
|
479
|
+
async checkRobotsTxt({ request }) {
|
|
480
|
+
if (!(await this.isAllowedBasedOnRobotsTxtFile(request.url))) {
|
|
481
|
+
this.log.warning(`Skipping request ${request.url} (${request.id}) because it is disallowed based on robots.txt`);
|
|
482
|
+
request.state = RequestState.SKIPPED;
|
|
483
|
+
request.noRetry = true;
|
|
484
|
+
await this.handleSkippedRequest({
|
|
485
|
+
url: request.url,
|
|
486
|
+
reason: 'robotsTxt',
|
|
487
|
+
});
|
|
488
|
+
throw new ContextPipelineInterruptedError(`Skipping request ${request.url} as disallowed by robots.txt`);
|
|
489
|
+
}
|
|
490
|
+
return {};
|
|
491
|
+
}
|
|
492
|
+
/**
|
|
493
|
+
* Builds the subclass-specific context pipeline that transforms a `CrawlingContext` into the crawler's target context type.
|
|
494
|
+
* Subclasses should override this to add their own pipeline stages.
|
|
495
|
+
*/
|
|
496
|
+
buildContextPipeline() {
|
|
497
|
+
return ContextPipeline.create();
|
|
498
|
+
}
|
|
499
|
+
createBaseContext() {
|
|
500
|
+
const deferredCleanup = [];
|
|
501
|
+
return {
|
|
502
|
+
id: cryptoRandomObjectId(10),
|
|
503
|
+
log: this.log,
|
|
504
|
+
pushData: this.pushData.bind(this),
|
|
505
|
+
useState: this.useState.bind(this),
|
|
506
|
+
getKeyValueStore: async (identifier) => KeyValueStore.open(identifier),
|
|
507
|
+
registerDeferredCleanup: (cleanup) => {
|
|
508
|
+
deferredCleanup.push(cleanup);
|
|
299
509
|
},
|
|
300
|
-
|
|
510
|
+
[deferredCleanupKey]: deferredCleanup,
|
|
511
|
+
};
|
|
512
|
+
}
|
|
513
|
+
async resolveRequest() {
|
|
514
|
+
const request = await this._timeoutAndRetry(this._fetchNextRequest.bind(this), this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
|
|
515
|
+
// Reset loadedUrl so an old one is not carried over to retries.
|
|
516
|
+
if (request) {
|
|
517
|
+
request.loadedUrl = undefined;
|
|
518
|
+
}
|
|
519
|
+
return request;
|
|
520
|
+
}
|
|
521
|
+
async resolveSession({ request }) {
|
|
522
|
+
const session = await this._timeoutAndRetry(async () => {
|
|
523
|
+
const existingSession = await this.sessionPool.getSession(request.sessionId);
|
|
524
|
+
if (!existingSession) {
|
|
525
|
+
throw new ContextPipelineInitializationError(new MissingSessionError(request.sessionId));
|
|
526
|
+
}
|
|
527
|
+
return existingSession;
|
|
528
|
+
}, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
|
|
529
|
+
return { session, proxyInfo: session?.proxyInfo };
|
|
530
|
+
}
|
|
531
|
+
async createContextHelpers({ request, session }) {
|
|
532
|
+
const enqueueLinksWrapper = async (options) => {
|
|
533
|
+
const requestQueue = await this.getRequestQueue();
|
|
534
|
+
return await this.enqueueLinksWithCrawlDepth(options, request, requestQueue);
|
|
535
|
+
};
|
|
536
|
+
const addRequests = async (requests, options = {}) => {
|
|
537
|
+
const newCrawlDepth = request.crawlDepth + 1;
|
|
538
|
+
const requestsGenerator = this.addCrawlDepthRequestGenerator(requests, newCrawlDepth);
|
|
539
|
+
await this.addRequests(requestsGenerator, options);
|
|
301
540
|
};
|
|
302
|
-
|
|
541
|
+
const sendRequest = createSendRequest(this.httpClient, request, session);
|
|
542
|
+
return { enqueueLinks: enqueueLinksWrapper, addRequests, sendRequest };
|
|
543
|
+
}
|
|
544
|
+
buildFinalContextPipeline() {
|
|
545
|
+
let contextPipeline = (this.contextPipelineOptions.contextPipelineBuilder?.() ??
|
|
546
|
+
this.buildContextPipeline());
|
|
547
|
+
const { extendContext } = this.contextPipelineOptions;
|
|
548
|
+
if (extendContext !== undefined) {
|
|
549
|
+
contextPipeline = contextPipeline.compose({
|
|
550
|
+
action: async (context) => await extendContext(context),
|
|
551
|
+
});
|
|
552
|
+
}
|
|
553
|
+
contextPipeline = contextPipeline.compose({
|
|
554
|
+
action: async (context) => {
|
|
555
|
+
const { request } = context;
|
|
556
|
+
if (request && !this.requestMatchesEnqueueStrategy(request)) {
|
|
557
|
+
// eslint-disable-next-line dot-notation
|
|
558
|
+
const message = `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`;
|
|
559
|
+
this.log.debug(message);
|
|
560
|
+
request.noRetry = true;
|
|
561
|
+
request.state = RequestState.SKIPPED;
|
|
562
|
+
await this.handleSkippedRequest({ url: request.url, reason: 'redirect' });
|
|
563
|
+
throw new ContextPipelineInterruptedError(message);
|
|
564
|
+
}
|
|
565
|
+
return context;
|
|
566
|
+
},
|
|
567
|
+
});
|
|
568
|
+
return contextPipeline;
|
|
303
569
|
}
|
|
304
570
|
/**
|
|
305
571
|
* Checks if the given error is a proxy error by comparing its message to a list of known proxy error messages.
|
|
@@ -310,21 +576,13 @@ export class BasicCrawler {
|
|
|
310
576
|
isProxyError(error) {
|
|
311
577
|
return ROTATE_PROXY_ERRORS.some((x) => this._getMessageFromError(error)?.includes(x));
|
|
312
578
|
}
|
|
313
|
-
/**
|
|
314
|
-
* Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
|
|
315
|
-
* Returns `false` if the request is not blocked, otherwise returns a string with a description of the block reason.
|
|
316
|
-
* @param _crawlingContext The crawling context to check.
|
|
317
|
-
*/
|
|
318
|
-
async isRequestBlocked(_crawlingContext) {
|
|
319
|
-
throw new Error('the "isRequestBlocked" method is not implemented in this crawler.');
|
|
320
|
-
}
|
|
321
579
|
/**
|
|
322
580
|
* This method is periodically called by the crawler, every `statusMessageLoggingInterval` seconds.
|
|
323
581
|
*/
|
|
324
582
|
async setStatusMessage(message, options = {}) {
|
|
325
583
|
const data = options.isStatusMessageTerminal != null ? { terminal: options.isStatusMessageTerminal } : undefined;
|
|
326
|
-
this.log.
|
|
327
|
-
const client =
|
|
584
|
+
this.log.logWithLevel(LogLevel[options.level ?? 'DEBUG'], message, data);
|
|
585
|
+
const client = serviceLocator.getStorageClient();
|
|
328
586
|
if (!client.setStatusMessage) {
|
|
329
587
|
return;
|
|
330
588
|
}
|
|
@@ -349,7 +607,7 @@ export class BasicCrawler {
|
|
|
349
607
|
message = `Experiencing problems, ${this.stats.state.requestsFailed - previousState.requestsFailed || this.stats.state.requestsFailed} failed requests in the past ${this.statusMessageLoggingInterval} seconds.`;
|
|
350
608
|
}
|
|
351
609
|
else {
|
|
352
|
-
const total = this.
|
|
610
|
+
const total = this.requestManager?.getTotalCount();
|
|
353
611
|
message = `Crawled ${this.stats.state.requestsFinished}${total ? `/${total}` : ''} pages, ${this.stats.state.requestsFailed} failed requests, desired concurrency ${this.autoscaledPool?.desiredConcurrency ?? 0}.`;
|
|
354
612
|
}
|
|
355
613
|
if (this.statusMessageCallback) {
|
|
@@ -386,23 +644,34 @@ export class BasicCrawler {
|
|
|
386
644
|
// we need to purge the default RQ to allow processing the same requests again - this is important so users can
|
|
387
645
|
// pass in failed requests back to the `crawler.run()`, otherwise they would be considered as handled and
|
|
388
646
|
// ignored - as a failed requests is still handled.
|
|
389
|
-
|
|
647
|
+
const isDefaultQueue = this.requestQueue?.name === 'default';
|
|
648
|
+
if (isDefaultQueue && purgeRequestQueue && this.requestQueue) {
|
|
390
649
|
await this.requestQueue.drop();
|
|
391
650
|
this.requestQueue = await this._getRequestQueue();
|
|
651
|
+
this.requestManager = undefined;
|
|
652
|
+
await this.initializeRequestManager();
|
|
653
|
+
this.handledRequestsCount = 0; // This would've been reset by this._init() further down below, but at that point `handledRequestsCount` could prevent `addRequests` from adding the initial requests
|
|
392
654
|
}
|
|
393
655
|
this.stats.reset();
|
|
394
656
|
await this.stats.resetStore();
|
|
395
|
-
await this.
|
|
657
|
+
await this.ownedSessionPool?.resetStore();
|
|
396
658
|
}
|
|
659
|
+
this.unexpectedStop = false;
|
|
397
660
|
this.running = true;
|
|
398
|
-
|
|
661
|
+
this.loggedPerRun.clear();
|
|
662
|
+
await purgeDefaultStorages({
|
|
663
|
+
onlyPurgeOnce: true,
|
|
664
|
+
client: serviceLocator.getStorageClient(),
|
|
665
|
+
config: serviceLocator.getConfiguration(),
|
|
666
|
+
});
|
|
399
667
|
if (requests) {
|
|
400
668
|
await this.addRequests(requests, addRequestsOptions);
|
|
401
669
|
}
|
|
402
670
|
await this._init();
|
|
403
671
|
await this.stats.startCapturing();
|
|
404
672
|
const periodicLogger = this.getPeriodicLogger();
|
|
405
|
-
|
|
673
|
+
// Don't await, we don't want to block the execution
|
|
674
|
+
void this.setStatusMessage('Starting the crawler.', { level: 'INFO' });
|
|
406
675
|
const sigintHandler = async () => {
|
|
407
676
|
this.log.warning('Pausing... Press CTRL+C again to force exit. To resume, do: CRAWLEE_PURGE_ON_START=0 npm start');
|
|
408
677
|
await this._pauseOnMigration();
|
|
@@ -411,8 +680,9 @@ export class BasicCrawler {
|
|
|
411
680
|
// Attach a listener to handle migration and aborting events gracefully.
|
|
412
681
|
const boundPauseOnMigration = this._pauseOnMigration.bind(this);
|
|
413
682
|
process.once('SIGINT', sigintHandler);
|
|
414
|
-
|
|
415
|
-
|
|
683
|
+
const eventManager = serviceLocator.getEventManager();
|
|
684
|
+
eventManager.on("migrating" /* EventType.MIGRATING */, boundPauseOnMigration);
|
|
685
|
+
eventManager.on("aborting" /* EventType.ABORTING */, boundPauseOnMigration);
|
|
416
686
|
let stats = {};
|
|
417
687
|
try {
|
|
418
688
|
await this.autoscaledPool.run();
|
|
@@ -421,8 +691,8 @@ export class BasicCrawler {
|
|
|
421
691
|
await this.teardown();
|
|
422
692
|
await this.stats.stopCapturing();
|
|
423
693
|
process.off('SIGINT', sigintHandler);
|
|
424
|
-
|
|
425
|
-
|
|
694
|
+
eventManager.off("migrating" /* EventType.MIGRATING */, boundPauseOnMigration);
|
|
695
|
+
eventManager.off("aborting" /* EventType.ABORTING */, boundPauseOnMigration);
|
|
426
696
|
const finalStats = this.stats.calculate();
|
|
427
697
|
stats = {
|
|
428
698
|
requestsFinished: this.stats.state.requestsFinished,
|
|
@@ -439,7 +709,7 @@ export class BasicCrawler {
|
|
|
439
709
|
mostCommonErrors: this.stats.errorTracker.getMostPopularErrors(3).map(prettify),
|
|
440
710
|
});
|
|
441
711
|
}
|
|
442
|
-
const client =
|
|
712
|
+
const client = serviceLocator.getStorageClient();
|
|
443
713
|
if (client.teardown) {
|
|
444
714
|
let finished = false;
|
|
445
715
|
setTimeout(() => {
|
|
@@ -451,7 +721,8 @@ export class BasicCrawler {
|
|
|
451
721
|
finished = true;
|
|
452
722
|
}
|
|
453
723
|
periodicLogger.stop();
|
|
454
|
-
|
|
724
|
+
// Don't await, we don't want to block the execution
|
|
725
|
+
void this.setStatusMessage(`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${this.stats.state.requestsFinished} succeeded, ${this.stats.state.requestsFailed} failed.`, { isStatusMessageTerminal: true, level: 'INFO' });
|
|
455
726
|
this.running = false;
|
|
456
727
|
this.hasFinishedBefore = true;
|
|
457
728
|
}
|
|
@@ -461,29 +732,75 @@ export class BasicCrawler {
|
|
|
461
732
|
* Gracefully stops the current run of the crawler.
|
|
462
733
|
*
|
|
463
734
|
* All the tasks active at the time of calling this method will be allowed to finish.
|
|
735
|
+
*
|
|
736
|
+
* To stop the crawler immediately, use {@link BasicCrawler.teardown|`crawler.teardown()`} instead.
|
|
464
737
|
*/
|
|
465
|
-
stop(
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
.then(() => this.log.info(message))
|
|
472
|
-
.catch((err) => {
|
|
473
|
-
this.log.error('An error occurred when stopping the crawler:', err);
|
|
474
|
-
});
|
|
738
|
+
stop(reason = 'The crawler has been gracefully stopped.') {
|
|
739
|
+
if (this.unexpectedStop) {
|
|
740
|
+
return;
|
|
741
|
+
}
|
|
742
|
+
this.log.info(reason);
|
|
743
|
+
this.unexpectedStop = true;
|
|
475
744
|
}
|
|
476
745
|
async getRequestQueue() {
|
|
477
746
|
if (!this.requestQueue && this.requestList) {
|
|
478
747
|
this.log.warningOnce('When using RequestList and RequestQueue at the same time, you should instantiate both explicitly and provide them in the crawler options, to ensure correctly handled restarts of the crawler.');
|
|
479
748
|
}
|
|
480
|
-
this.requestQueue
|
|
749
|
+
if (!this.requestQueue) {
|
|
750
|
+
this.requestQueue = await this._getRequestQueue();
|
|
751
|
+
this.requestManager = undefined;
|
|
752
|
+
}
|
|
753
|
+
if (!this.requestManager) {
|
|
754
|
+
this.requestManager =
|
|
755
|
+
this.requestList === undefined
|
|
756
|
+
? this.requestQueue
|
|
757
|
+
: new RequestManagerTandem(this.requestList, this.requestQueue);
|
|
758
|
+
}
|
|
481
759
|
return this.requestQueue;
|
|
482
760
|
}
|
|
483
761
|
async useState(defaultValue = {}) {
|
|
484
|
-
const kvs = await KeyValueStore.open(null, { config:
|
|
762
|
+
const kvs = await KeyValueStore.open(null, { config: serviceLocator.getConfiguration() });
|
|
763
|
+
if (this.hasExplicitId) {
|
|
764
|
+
const stateKey = `${BasicCrawler.CRAWLEE_STATE_KEY}_${this.crawlerId}`;
|
|
765
|
+
return kvs.getAutoSavedValue(stateKey, defaultValue);
|
|
766
|
+
}
|
|
767
|
+
BasicCrawler.useStateCrawlerIds.add(this.crawlerId);
|
|
768
|
+
if (BasicCrawler.useStateCrawlerIds.size > 1) {
|
|
769
|
+
serviceLocator
|
|
770
|
+
.getLogger()
|
|
771
|
+
.warningOnce('Multiple crawler instances are calling useState() without an explicit `id` option. \n' +
|
|
772
|
+
'This means they will share the same state object, which is likely unintended. \n' +
|
|
773
|
+
'To fix this, provide a unique `id` option to each crawler instance. \n' +
|
|
774
|
+
'Example: new BasicCrawler({ id: "my-crawler-1", ... })');
|
|
775
|
+
}
|
|
485
776
|
return kvs.getAutoSavedValue(BasicCrawler.CRAWLEE_STATE_KEY, defaultValue);
|
|
486
777
|
}
|
|
778
|
+
get pendingRequestCountApproximation() {
|
|
779
|
+
return this.requestManager?.getPendingCount() ?? 0;
|
|
780
|
+
}
|
|
781
|
+
calculateEnqueuedRequestLimit(explicitLimit) {
|
|
782
|
+
if (this.maxRequestsPerCrawl === undefined) {
|
|
783
|
+
return explicitLimit;
|
|
784
|
+
}
|
|
785
|
+
const limit = Math.max(0, this.maxRequestsPerCrawl - this.handledRequestsCount - this.pendingRequestCountApproximation);
|
|
786
|
+
return Math.min(limit, explicitLimit ?? Infinity);
|
|
787
|
+
}
|
|
788
|
+
async handleSkippedRequest(options) {
|
|
789
|
+
if (options.reason === 'limit') {
|
|
790
|
+
this.logOncePerRun('maxRequestsPerCrawl', 'The number of requests enqueued by the crawler reached the maxRequestsPerCrawl limit of ' +
|
|
791
|
+
`${this.maxRequestsPerCrawl} requests and no further requests will be added.`);
|
|
792
|
+
}
|
|
793
|
+
if (options.reason === 'depth') {
|
|
794
|
+
this.logOncePerRun('maxCrawlDepth', `The crawler reached the maxCrawlDepth limit of ${this.maxCrawlDepth} and no further requests will be enqueued.`);
|
|
795
|
+
}
|
|
796
|
+
await this.onSkippedRequest?.(options);
|
|
797
|
+
}
|
|
798
|
+
logOncePerRun(key, message) {
|
|
799
|
+
if (!this.loggedPerRun.has(key)) {
|
|
800
|
+
this.log.info(message);
|
|
801
|
+
this.loggedPerRun.add(key);
|
|
802
|
+
}
|
|
803
|
+
}
|
|
487
804
|
/**
|
|
488
805
|
* Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
|
|
489
806
|
* adding the rest in background. You can configure the batch size via `batchSize` option and the sleep time in between
|
|
@@ -496,46 +813,72 @@ export class BasicCrawler {
|
|
|
496
813
|
* @param options Options for the request queue
|
|
497
814
|
*/
|
|
498
815
|
async addRequests(requests, options = {}) {
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
const
|
|
504
|
-
const
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
816
|
+
await this.getRequestQueue();
|
|
817
|
+
const requestLimit = this.calculateEnqueuedRequestLimit();
|
|
818
|
+
const skippedBecauseOfRobots = new Set();
|
|
819
|
+
const skippedBecauseOfLimit = new Set();
|
|
820
|
+
const skippedBecauseOfMaxCrawlDepth = new Set();
|
|
821
|
+
const isAllowedBasedOnRobotsTxtFile = this.isAllowedBasedOnRobotsTxtFile.bind(this);
|
|
822
|
+
const maxCrawlDepth = this.maxCrawlDepth;
|
|
823
|
+
ow(requests, ow.object
|
|
824
|
+
.is((value) => isIterable(value) || isAsyncIterable(value))
|
|
825
|
+
.message((value) => `Expected an iterable or async iterable, got ${getObjectType(value)}`));
|
|
826
|
+
async function* filteredRequests() {
|
|
827
|
+
let yieldedRequestCount = 0;
|
|
828
|
+
for await (const request of requests) {
|
|
829
|
+
const url = typeof request === 'string' ? request : request.url;
|
|
830
|
+
if (requestLimit !== undefined && yieldedRequestCount >= requestLimit) {
|
|
831
|
+
skippedBecauseOfLimit.add(url);
|
|
832
|
+
continue;
|
|
833
|
+
}
|
|
834
|
+
if (maxCrawlDepth !== undefined && request.crawlDepth > maxCrawlDepth) {
|
|
835
|
+
skippedBecauseOfMaxCrawlDepth.add(url);
|
|
836
|
+
continue;
|
|
837
|
+
}
|
|
838
|
+
if (await isAllowedBasedOnRobotsTxtFile(url)) {
|
|
839
|
+
yield request;
|
|
840
|
+
yieldedRequestCount += 1;
|
|
841
|
+
}
|
|
842
|
+
else {
|
|
843
|
+
skippedBecauseOfRobots.add(url);
|
|
844
|
+
}
|
|
513
845
|
}
|
|
514
846
|
}
|
|
515
|
-
|
|
847
|
+
const result = await this.requestManager.addRequestsBatched(filteredRequests(), options);
|
|
848
|
+
if (skippedBecauseOfRobots.size > 0) {
|
|
516
849
|
this.log.warning(`Some requests were skipped because they were disallowed based on the robots.txt file`, {
|
|
517
|
-
skipped: [...
|
|
850
|
+
skipped: [...skippedBecauseOfRobots],
|
|
518
851
|
});
|
|
519
|
-
if (this.onSkippedRequest) {
|
|
520
|
-
await Promise.all([...skipped].map((url) => {
|
|
521
|
-
return this.onSkippedRequest({ url, reason: 'robotsTxt' });
|
|
522
|
-
}));
|
|
523
|
-
}
|
|
524
852
|
}
|
|
525
|
-
|
|
853
|
+
if (skippedBecauseOfRobots.size > 0 ||
|
|
854
|
+
skippedBecauseOfLimit.size > 0 ||
|
|
855
|
+
skippedBecauseOfMaxCrawlDepth.size > 0) {
|
|
856
|
+
await Promise.all([...skippedBecauseOfRobots]
|
|
857
|
+
.map((url) => {
|
|
858
|
+
return this.handleSkippedRequest({ url, reason: 'robotsTxt' });
|
|
859
|
+
})
|
|
860
|
+
.concat([...skippedBecauseOfLimit].map((url) => {
|
|
861
|
+
return this.handleSkippedRequest({ url, reason: 'limit' });
|
|
862
|
+
}), [...skippedBecauseOfMaxCrawlDepth].map((url) => {
|
|
863
|
+
return this.handleSkippedRequest({ url, reason: 'depth' });
|
|
864
|
+
})));
|
|
865
|
+
}
|
|
866
|
+
return result;
|
|
526
867
|
}
|
|
527
868
|
/**
|
|
528
869
|
* Pushes data to the specified {@link Dataset}, or the default crawler {@link Dataset} by calling {@link Dataset.pushData}.
|
|
529
870
|
*/
|
|
530
|
-
async pushData(data,
|
|
531
|
-
const dataset = await this.getDataset(
|
|
871
|
+
async pushData(data, datasetIdentifier) {
|
|
872
|
+
const dataset = await this.getDataset(datasetIdentifier);
|
|
532
873
|
return dataset.pushData(data);
|
|
533
874
|
}
|
|
534
875
|
/**
|
|
535
876
|
* Retrieves the specified {@link Dataset}, or the default crawler {@link Dataset}.
|
|
536
877
|
*/
|
|
537
|
-
async getDataset(
|
|
538
|
-
return Dataset.open(
|
|
878
|
+
async getDataset(identifier) {
|
|
879
|
+
return Dataset.open(identifier, {
|
|
880
|
+
config: serviceLocator.getConfiguration(),
|
|
881
|
+
});
|
|
539
882
|
}
|
|
540
883
|
/**
|
|
541
884
|
* Retrieves data from the default crawler {@link Dataset} by calling {@link Dataset.getData}.
|
|
@@ -550,8 +893,9 @@ export class BasicCrawler {
|
|
|
550
893
|
*/
|
|
551
894
|
async exportData(path, format, options) {
|
|
552
895
|
const supportedFormats = ['json', 'csv'];
|
|
553
|
-
|
|
554
|
-
|
|
896
|
+
const formatMatch = /\.(json|csv)$/i.exec(path);
|
|
897
|
+
if (!format && formatMatch) {
|
|
898
|
+
format = formatMatch[1].toLowerCase();
|
|
555
899
|
}
|
|
556
900
|
if (!format) {
|
|
557
901
|
throw new Error(`Failed to infer format from the path: '${path}'. Supported formats: ${supportedFormats.join(', ')}`);
|
|
@@ -562,7 +906,21 @@ export class BasicCrawler {
|
|
|
562
906
|
const dataset = await this.getDataset();
|
|
563
907
|
const items = await dataset.export(options);
|
|
564
908
|
if (format === 'csv') {
|
|
565
|
-
|
|
909
|
+
let value;
|
|
910
|
+
if (items.length === 0) {
|
|
911
|
+
value = '';
|
|
912
|
+
}
|
|
913
|
+
else {
|
|
914
|
+
const keys = options?.collectAllKeys
|
|
915
|
+
? Array.from(new Set(items.flatMap(Object.keys)))
|
|
916
|
+
: Object.keys(items[0]);
|
|
917
|
+
value = stringify([
|
|
918
|
+
keys,
|
|
919
|
+
...items.map((item) => {
|
|
920
|
+
return keys.map((k) => item[k]);
|
|
921
|
+
}),
|
|
922
|
+
]);
|
|
923
|
+
}
|
|
566
924
|
await ensureDir(dirname(path));
|
|
567
925
|
await writeFile(path, value);
|
|
568
926
|
this.log.info(`Export to ${path} finished!`);
|
|
@@ -574,32 +932,33 @@ export class BasicCrawler {
|
|
|
574
932
|
}
|
|
575
933
|
return items;
|
|
576
934
|
}
|
|
935
|
+
/**
|
|
936
|
+
* Initializes the crawler.
|
|
937
|
+
*/
|
|
577
938
|
async _init() {
|
|
578
|
-
|
|
579
|
-
|
|
939
|
+
const eventManager = serviceLocator.getEventManager();
|
|
940
|
+
if (!eventManager.isInitialized()) {
|
|
941
|
+
await eventManager.init();
|
|
580
942
|
this._closeEvents = true;
|
|
581
943
|
}
|
|
582
944
|
// Initialize AutoscaledPool before awaiting _loadHandledRequestCount(),
|
|
583
945
|
// so that the caller can get a reference to it before awaiting the promise returned from run()
|
|
584
946
|
// (otherwise there would be no way)
|
|
585
|
-
this.autoscaledPool = new AutoscaledPool(this.autoscaledPoolOptions
|
|
586
|
-
|
|
587
|
-
this.sessionPool = await SessionPool.open(this.sessionPoolOptions, this.config);
|
|
588
|
-
// Assuming there are not more than 20 browsers running at once;
|
|
589
|
-
this.sessionPool.setMaxListeners(20);
|
|
590
|
-
}
|
|
947
|
+
this.autoscaledPool = new AutoscaledPool(this.autoscaledPoolOptions);
|
|
948
|
+
await this.initializeRequestManager();
|
|
591
949
|
await this._loadHandledRequestCount();
|
|
592
950
|
}
|
|
593
|
-
async
|
|
594
|
-
await this.requestHandler(crawlingContext);
|
|
951
|
+
async runRequestHandler(crawlingContext) {
|
|
952
|
+
await addTimeoutToPromise(async () => this.requestHandler(crawlingContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${crawlingContext.request.id}).`);
|
|
595
953
|
}
|
|
596
954
|
/**
|
|
597
955
|
* Handles blocked request
|
|
598
956
|
*/
|
|
599
|
-
_throwOnBlockedRequest(
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
957
|
+
_throwOnBlockedRequest(statusCode) {
|
|
958
|
+
if (this.retryOnBlocked)
|
|
959
|
+
return;
|
|
960
|
+
if (this.blockedStatusCodes.has(statusCode)) {
|
|
961
|
+
throw new SessionError(`Request blocked - received ${statusCode} status code.`);
|
|
603
962
|
}
|
|
604
963
|
}
|
|
605
964
|
async isAllowedBasedOnRobotsTxtFile(url) {
|
|
@@ -607,7 +966,8 @@ export class BasicCrawler {
|
|
|
607
966
|
return true;
|
|
608
967
|
}
|
|
609
968
|
const robotsTxtFile = await this.getRobotsTxtFileForUrl(url);
|
|
610
|
-
|
|
969
|
+
const userAgent = typeof this.respectRobotsTxtFile === 'object' ? this.respectRobotsTxtFile?.userAgent : '*';
|
|
970
|
+
return !robotsTxtFile || robotsTxtFile.isAllowed(url, userAgent);
|
|
611
971
|
}
|
|
612
972
|
async getRobotsTxtFileForUrl(url) {
|
|
613
973
|
if (!this.respectRobotsTxtFile) {
|
|
@@ -619,7 +979,7 @@ export class BasicCrawler {
|
|
|
619
979
|
if (cachedRobotsTxtFile) {
|
|
620
980
|
return cachedRobotsTxtFile;
|
|
621
981
|
}
|
|
622
|
-
const robotsTxtFile = await RobotsTxtFile.find(url);
|
|
982
|
+
const robotsTxtFile = await RobotsTxtFile.find(url, { logger: this.log });
|
|
623
983
|
this.robotsTxtFileCache.add(origin, robotsTxtFile);
|
|
624
984
|
return robotsTxtFile;
|
|
625
985
|
}
|
|
@@ -661,36 +1021,36 @@ export class BasicCrawler {
|
|
|
661
1021
|
await Promise.all([requestListPersistPromise, this.stats.persistState()]);
|
|
662
1022
|
}
|
|
663
1023
|
/**
|
|
664
|
-
*
|
|
665
|
-
* and RequestQueue is present then enqueues it to the queue first.
|
|
1024
|
+
* Initializes the RequestManager based on the configured requestList and requestQueue.
|
|
666
1025
|
*/
|
|
667
|
-
async
|
|
668
|
-
if (
|
|
669
|
-
return
|
|
670
|
-
}
|
|
671
|
-
const request = await this.requestList.fetchNextRequest();
|
|
672
|
-
if (!this.requestQueue)
|
|
673
|
-
return request;
|
|
674
|
-
if (!request)
|
|
675
|
-
return this.requestQueue.fetchNextRequest();
|
|
676
|
-
try {
|
|
677
|
-
await this.requestQueue.addRequest(request, { forefront: true });
|
|
1026
|
+
async initializeRequestManager() {
|
|
1027
|
+
if (this.requestManager !== undefined) {
|
|
1028
|
+
return;
|
|
678
1029
|
}
|
|
679
|
-
|
|
680
|
-
//
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
1030
|
+
if (this.requestList && this.requestQueue) {
|
|
1031
|
+
// Create a RequestManagerTandem if both RequestList and RequestQueue are provided
|
|
1032
|
+
this.requestManager = new RequestManagerTandem(this.requestList, this.requestQueue);
|
|
1033
|
+
}
|
|
1034
|
+
else if (this.requestQueue) {
|
|
1035
|
+
// Use RequestQueue directly if only it is provided
|
|
1036
|
+
this.requestManager = this.requestQueue;
|
|
1037
|
+
}
|
|
1038
|
+
else if (this.requestList) {
|
|
1039
|
+
// Use RequestList directly if only it is provided
|
|
1040
|
+
// Make it compatible with the IRequestManager interface
|
|
1041
|
+
this.requestManager = new RequestListAdapter(this.requestList);
|
|
1042
|
+
}
|
|
1043
|
+
// If neither RequestList nor RequestQueue is provided, leave the requestManager uninitialized until `getRequestQueue` is called
|
|
688
1044
|
}
|
|
689
1045
|
/**
|
|
690
|
-
*
|
|
691
|
-
* Can be used to clean up orphaned browser pages.
|
|
1046
|
+
* Fetches the next request to process from the underlying request provider.
|
|
692
1047
|
*/
|
|
693
|
-
async
|
|
1048
|
+
async _fetchNextRequest() {
|
|
1049
|
+
if (this.requestManager === undefined) {
|
|
1050
|
+
throw new Error(`_fetchNextRequest called on an uninitialized crawler`);
|
|
1051
|
+
}
|
|
1052
|
+
return this.requestManager.fetchNextRequest();
|
|
1053
|
+
}
|
|
694
1054
|
/**
|
|
695
1055
|
* Delays processing of the request based on the `sameDomainDelaySecs` option,
|
|
696
1056
|
* adding it back to the queue after the timeout passes. Returns `true` if the request
|
|
@@ -723,112 +1083,57 @@ export class BasicCrawler {
|
|
|
723
1083
|
}, delay);
|
|
724
1084
|
return true;
|
|
725
1085
|
}
|
|
726
|
-
/**
|
|
727
|
-
|
|
728
|
-
* then retries them in a case of an error, etc.
|
|
729
|
-
*/
|
|
730
|
-
async _runTaskFunction() {
|
|
731
|
-
const source = this.requestQueue || this.requestList || (await this.getRequestQueue());
|
|
732
|
-
let request;
|
|
733
|
-
let session;
|
|
734
|
-
await this._timeoutAndRetry(async () => {
|
|
735
|
-
request = await this._fetchNextRequest();
|
|
736
|
-
}, this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
|
|
737
|
-
tryCancel();
|
|
738
|
-
if (this.useSessionPool) {
|
|
739
|
-
await this._timeoutAndRetry(async () => {
|
|
740
|
-
session = await this.sessionPool.getSession();
|
|
741
|
-
}, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
|
|
742
|
-
}
|
|
743
|
-
tryCancel();
|
|
744
|
-
if (!request || this.delayRequest(request, source)) {
|
|
745
|
-
return;
|
|
746
|
-
}
|
|
747
|
-
if (!(await this.isAllowedBasedOnRobotsTxtFile(request.url))) {
|
|
748
|
-
this.log.warning(`Skipping request ${request.url} (${request.id}) because it is disallowed based on robots.txt`);
|
|
749
|
-
request.state = RequestState.SKIPPED;
|
|
750
|
-
request.noRetry = true;
|
|
751
|
-
await source.markRequestHandled(request);
|
|
752
|
-
await this.onSkippedRequest?.({
|
|
753
|
-
url: request.url,
|
|
754
|
-
reason: 'robotsTxt',
|
|
755
|
-
});
|
|
756
|
-
return;
|
|
757
|
-
}
|
|
758
|
-
// Reset loadedUrl so an old one is not carried over to retries.
|
|
759
|
-
request.loadedUrl = undefined;
|
|
1086
|
+
/** Handles a single request - runs the request handler with retries, error handling, and lifecycle management. */
|
|
1087
|
+
async handleRequest(crawlingContext, requestSource, request) {
|
|
760
1088
|
const statisticsId = request.id || request.uniqueKey;
|
|
761
1089
|
this.stats.startJob(statisticsId);
|
|
762
|
-
// Shared crawling context
|
|
763
|
-
// @ts-expect-error
|
|
764
|
-
// All missing properties (that extend CrawlingContext) are set dynamically,
|
|
765
|
-
// but TS does not know that, so otherwise it would throw when compiling.
|
|
766
|
-
const crawlingContext = {
|
|
767
|
-
id: cryptoRandomObjectId(10),
|
|
768
|
-
crawler: this,
|
|
769
|
-
log: this.log,
|
|
770
|
-
request,
|
|
771
|
-
session,
|
|
772
|
-
enqueueLinks: async (options) => {
|
|
773
|
-
return enqueueLinks({
|
|
774
|
-
// specify the RQ first to allow overriding it
|
|
775
|
-
requestQueue: await this.getRequestQueue(),
|
|
776
|
-
robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
|
|
777
|
-
onSkippedRequest: this.onSkippedRequest,
|
|
778
|
-
...options,
|
|
779
|
-
});
|
|
780
|
-
},
|
|
781
|
-
addRequests: this.addRequests.bind(this),
|
|
782
|
-
pushData: this.pushData.bind(this),
|
|
783
|
-
useState: this.useState.bind(this),
|
|
784
|
-
sendRequest: createSendRequest(this.httpClient, request, session, () => crawlingContext.proxyInfo?.url),
|
|
785
|
-
getKeyValueStore: async (idOrName) => KeyValueStore.open(idOrName, { config: this.config }),
|
|
786
|
-
};
|
|
787
|
-
this.crawlingContexts.set(crawlingContext.id, crawlingContext);
|
|
788
1090
|
let isRequestLocked = true;
|
|
789
1091
|
try {
|
|
790
1092
|
request.state = RequestState.REQUEST_HANDLER;
|
|
791
|
-
await
|
|
792
|
-
await this._timeoutAndRetry(async () =>
|
|
1093
|
+
await this.runRequestHandler(crawlingContext);
|
|
1094
|
+
await this._timeoutAndRetry(async () => requestSource.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
|
|
793
1095
|
isRequestLocked = false; // markRequestHandled succeeded and unlocked the request
|
|
794
1096
|
this.stats.finishJob(statisticsId, request.retryCount);
|
|
795
1097
|
this.handledRequestsCount++;
|
|
796
1098
|
// reclaim session if request finishes successfully
|
|
797
1099
|
request.state = RequestState.DONE;
|
|
798
|
-
crawlingContext.session
|
|
1100
|
+
crawlingContext.session.markGood();
|
|
799
1101
|
}
|
|
800
|
-
catch (
|
|
1102
|
+
catch (rawError) {
|
|
1103
|
+
const err = this.unwrapError(rawError);
|
|
801
1104
|
try {
|
|
802
1105
|
request.state = RequestState.ERROR_HANDLER;
|
|
803
|
-
await addTimeoutToPromise(async () => this._requestFunctionErrorHandler(err, crawlingContext,
|
|
1106
|
+
await addTimeoutToPromise(async () => this._requestFunctionErrorHandler(err, crawlingContext, request, requestSource), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
|
|
804
1107
|
if (!(err instanceof CriticalError)) {
|
|
805
1108
|
isRequestLocked = false; // _requestFunctionErrorHandler calls either markRequestHandled or reclaimRequest
|
|
806
1109
|
}
|
|
807
1110
|
request.state = RequestState.DONE;
|
|
808
1111
|
}
|
|
809
1112
|
catch (secondaryError) {
|
|
810
|
-
|
|
1113
|
+
const unwrappedSecondaryError = this.unwrapError(secondaryError);
|
|
1114
|
+
if (!unwrappedSecondaryError.triggeredFromUserHandler &&
|
|
811
1115
|
// avoid reprinting the same critical error multiple times, as it will be printed by Nodejs at the end anyway
|
|
812
|
-
!(
|
|
1116
|
+
!(unwrappedSecondaryError instanceof CriticalError)) {
|
|
813
1117
|
const apifySpecific = process.env.APIFY_IS_AT_HOME
|
|
814
1118
|
? `This may have happened due to an internal error of Apify's API or due to a misconfigured crawler.`
|
|
815
1119
|
: '';
|
|
816
|
-
this.log.exception(
|
|
1120
|
+
this.log.exception(unwrappedSecondaryError, 'An exception occurred during handling of failed request. ' +
|
|
817
1121
|
`This places the crawler and its underlying storages into an unknown state and crawling will be terminated. ${apifySpecific}`);
|
|
818
1122
|
}
|
|
819
1123
|
request.state = RequestState.ERROR;
|
|
820
|
-
throw
|
|
1124
|
+
throw unwrappedSecondaryError;
|
|
1125
|
+
}
|
|
1126
|
+
// decrease the session score if the request fails (but the error handler did not throw);
|
|
1127
|
+
// skip when the error is a SessionError, which already retired the session
|
|
1128
|
+
if (!(err instanceof SessionError)) {
|
|
1129
|
+
crawlingContext.session.markBad();
|
|
821
1130
|
}
|
|
822
|
-
// decrease the session score if the request fails (but the error handler did not throw)
|
|
823
|
-
crawlingContext.session?.markBad();
|
|
824
1131
|
}
|
|
825
1132
|
finally {
|
|
826
|
-
await this._cleanupContext(crawlingContext);
|
|
827
|
-
this.crawlingContexts.delete(crawlingContext.id);
|
|
828
1133
|
// Safety net - release the lock if nobody managed to do it before
|
|
829
|
-
if (isRequestLocked &&
|
|
1134
|
+
if (isRequestLocked && requestSource instanceof RequestProvider) {
|
|
830
1135
|
try {
|
|
831
|
-
await
|
|
1136
|
+
await requestSource.client.deleteRequestLock(request.id);
|
|
832
1137
|
}
|
|
833
1138
|
catch {
|
|
834
1139
|
// We don't have the lock, or the request was never locked. Either way it's fine
|
|
@@ -837,19 +1142,75 @@ export class BasicCrawler {
|
|
|
837
1142
|
}
|
|
838
1143
|
}
|
|
839
1144
|
/**
|
|
840
|
-
*
|
|
1145
|
+
* Wrapper around the crawling context's `enqueueLinks` method:
|
|
1146
|
+
* - Injects `crawlDepth` to each request being added based on the crawling context request.
|
|
1147
|
+
* - Provides defaults for the `enqueueLinks` options based on the crawler configuration.
|
|
1148
|
+
* - These options can be overridden by the user.
|
|
1149
|
+
* @internal
|
|
1150
|
+
*/
|
|
1151
|
+
async enqueueLinksWithCrawlDepth(options, request, requestQueue) {
|
|
1152
|
+
const transformRequestFunctionWrapper = (requestOptions) => {
|
|
1153
|
+
requestOptions.crawlDepth = request.crawlDepth + 1;
|
|
1154
|
+
if (this.maxCrawlDepth !== undefined && requestOptions.crawlDepth > this.maxCrawlDepth) {
|
|
1155
|
+
// Setting `skippedReason` before returning `false` ensures that `reportSkippedRequests`
|
|
1156
|
+
// reports `'depth'` as the reason (via `request.skippedReason ?? reason` fallback),
|
|
1157
|
+
// rather than the generic `'transform'` reason.
|
|
1158
|
+
requestOptions.skippedReason = 'depth';
|
|
1159
|
+
return false;
|
|
1160
|
+
}
|
|
1161
|
+
// After injecting the crawlDepth, we call the user-provided transform function, if there is one.
|
|
1162
|
+
return options.transformRequestFunction?.(requestOptions) ?? requestOptions;
|
|
1163
|
+
};
|
|
1164
|
+
// Create a request-scoped callback that logs enqueueLimit once per request handler call
|
|
1165
|
+
// Only log if an explicit limit was passed to enqueueLinks (not the internal maxRequestsPerCrawl-derived limit)
|
|
1166
|
+
let loggedEnqueueLimitForThisRequest = false;
|
|
1167
|
+
const onSkippedRequest = async (skippedOptions) => {
|
|
1168
|
+
if (skippedOptions.reason === 'enqueueLimit') {
|
|
1169
|
+
if (!loggedEnqueueLimitForThisRequest && options.limit !== undefined) {
|
|
1170
|
+
this.log.info(`Skipping URLs in the handler for ${request.url} due to the enqueueLinks limit of ${options.limit}.`);
|
|
1171
|
+
loggedEnqueueLimitForThisRequest = true;
|
|
1172
|
+
}
|
|
1173
|
+
}
|
|
1174
|
+
await this.handleSkippedRequest(skippedOptions);
|
|
1175
|
+
};
|
|
1176
|
+
return await enqueueLinks({
|
|
1177
|
+
requestQueue,
|
|
1178
|
+
robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
|
|
1179
|
+
onSkippedRequest,
|
|
1180
|
+
limit: this.calculateEnqueuedRequestLimit(options.limit),
|
|
1181
|
+
// Allow user options to override defaults set above ⤴
|
|
1182
|
+
...options,
|
|
1183
|
+
transformRequestFunction: transformRequestFunctionWrapper,
|
|
1184
|
+
});
|
|
1185
|
+
}
|
|
1186
|
+
/**
|
|
1187
|
+
* Generator function that yields requests injected with the given crawl depth.
|
|
1188
|
+
* @internal
|
|
1189
|
+
*/
|
|
1190
|
+
async *addCrawlDepthRequestGenerator(requests, newRequestDepth) {
|
|
1191
|
+
for await (const request of requests) {
|
|
1192
|
+
if (typeof request === 'string') {
|
|
1193
|
+
yield { url: request, crawlDepth: newRequestDepth };
|
|
1194
|
+
}
|
|
1195
|
+
else {
|
|
1196
|
+
request.crawlDepth ??= newRequestDepth;
|
|
1197
|
+
yield request;
|
|
1198
|
+
}
|
|
1199
|
+
}
|
|
1200
|
+
}
|
|
1201
|
+
/**
|
|
1202
|
+
* Run async callback with given timeout and retry. Returns the result of the callback.
|
|
841
1203
|
* @ignore
|
|
842
1204
|
*/
|
|
843
1205
|
async _timeoutAndRetry(handler, timeout, error, maxRetries = 3, retried = 1) {
|
|
844
1206
|
try {
|
|
845
|
-
await addTimeoutToPromise(handler, timeout, error);
|
|
1207
|
+
return await addTimeoutToPromise(handler, timeout, error);
|
|
846
1208
|
}
|
|
847
1209
|
catch (e) {
|
|
848
1210
|
if (retried <= maxRetries) {
|
|
849
1211
|
// we retry on any error, not just timeout
|
|
850
1212
|
this.log.warning(`${e.message} (retrying ${retried}/${maxRetries})`);
|
|
851
|
-
|
|
852
|
-
return;
|
|
1213
|
+
return this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1);
|
|
853
1214
|
}
|
|
854
1215
|
throw e;
|
|
855
1216
|
}
|
|
@@ -858,36 +1219,32 @@ export class BasicCrawler {
|
|
|
858
1219
|
* Returns true if either RequestList or RequestQueue have a request ready for processing.
|
|
859
1220
|
*/
|
|
860
1221
|
async _isTaskReadyFunction() {
|
|
861
|
-
|
|
862
|
-
const isRequestListEmpty = this.requestList ? await this.requestList.isEmpty() : true;
|
|
863
|
-
// If RequestList is not empty, task is ready, no reason to check RequestQueue.
|
|
864
|
-
if (!isRequestListEmpty)
|
|
865
|
-
return true;
|
|
866
|
-
// If RequestQueue is not empty, task is ready, return true, otherwise false.
|
|
867
|
-
return this.requestQueue ? !(await this.requestQueue.isEmpty()) : false;
|
|
1222
|
+
return this.requestManager !== undefined && !(await this.requestManager.isEmpty());
|
|
868
1223
|
}
|
|
869
1224
|
/**
|
|
870
1225
|
* Returns true if both RequestList and RequestQueue have all requests finished.
|
|
871
1226
|
*/
|
|
872
1227
|
async _defaultIsFinishedFunction() {
|
|
873
|
-
|
|
874
|
-
this.requestList ? this.requestList.isFinished() : true,
|
|
875
|
-
this.requestQueue ? this.requestQueue.isFinished() : true,
|
|
876
|
-
]);
|
|
877
|
-
// If both are finished, return true, otherwise return false.
|
|
878
|
-
return isRequestListFinished && isRequestQueueFinished;
|
|
1228
|
+
return !this.requestManager || (await this.requestManager.isFinished());
|
|
879
1229
|
}
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
1230
|
+
/**
|
|
1231
|
+
* Unwraps errors thrown by the context pipeline to get the actual user error.
|
|
1232
|
+
* RequestHandlerError and ContextPipelineInitializationError wrap the actual error.
|
|
1233
|
+
*/
|
|
1234
|
+
unwrapError(error) {
|
|
1235
|
+
if (error instanceof RequestHandlerError ||
|
|
1236
|
+
error instanceof ContextPipelineInitializationError ||
|
|
1237
|
+
error instanceof ContextPipelineCleanupError) {
|
|
1238
|
+
return this.unwrapError(error.cause);
|
|
1239
|
+
}
|
|
1240
|
+
return error;
|
|
885
1241
|
}
|
|
886
1242
|
/**
|
|
887
1243
|
* Handles errors thrown by user provided requestHandler()
|
|
1244
|
+
*
|
|
1245
|
+
* @param request The request object, passed separately to circumvent potential dynamic logic in crawlingContext.request
|
|
888
1246
|
*/
|
|
889
|
-
async _requestFunctionErrorHandler(error, crawlingContext, source) {
|
|
890
|
-
const { request } = crawlingContext;
|
|
1247
|
+
async _requestFunctionErrorHandler(error, crawlingContext, request, source) {
|
|
891
1248
|
request.pushErrorMessage(error);
|
|
892
1249
|
if (error instanceof CriticalError) {
|
|
893
1250
|
throw error;
|
|
@@ -895,9 +1252,10 @@ export class BasicCrawler {
|
|
|
895
1252
|
const shouldRetryRequest = this._canRequestBeRetried(request, error);
|
|
896
1253
|
if (shouldRetryRequest) {
|
|
897
1254
|
await this.stats.errorTrackerRetry.addAsync(error, crawlingContext);
|
|
898
|
-
await this.errorHandler?.(crawlingContext,
|
|
1255
|
+
await this.errorHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
|
|
1256
|
+
error);
|
|
899
1257
|
if (error instanceof SessionError) {
|
|
900
|
-
|
|
1258
|
+
crawlingContext.session?.retire();
|
|
901
1259
|
}
|
|
902
1260
|
if (!request.noRetry) {
|
|
903
1261
|
request.retryCount++;
|
|
@@ -914,6 +1272,9 @@ export class BasicCrawler {
|
|
|
914
1272
|
return;
|
|
915
1273
|
}
|
|
916
1274
|
}
|
|
1275
|
+
if (error instanceof SessionError) {
|
|
1276
|
+
crawlingContext.session?.retire();
|
|
1277
|
+
}
|
|
917
1278
|
// If the request is non-retryable, the error and snapshot aren't saved in the errorTrackerRetry object.
|
|
918
1279
|
// Therefore, we pass the crawlingContext to the errorTracker.add method, enabling snapshot capture.
|
|
919
1280
|
// This is to make sure the error snapshot is not duplicated in the errorTrackerRetry and errorTracker objects.
|
|
@@ -947,7 +1308,8 @@ export class BasicCrawler {
|
|
|
947
1308
|
const message = this._getMessageFromError(error, true);
|
|
948
1309
|
this.log.error(`Request failed and reached maximum retries. ${message}`, { id, url, method, uniqueKey });
|
|
949
1310
|
if (this.failedRequestHandler) {
|
|
950
|
-
await this.failedRequestHandler?.(crawlingContext,
|
|
1311
|
+
await this.failedRequestHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
|
|
1312
|
+
error);
|
|
951
1313
|
}
|
|
952
1314
|
}
|
|
953
1315
|
/**
|
|
@@ -970,10 +1332,8 @@ export class BasicCrawler {
|
|
|
970
1332
|
: [error.message || error, userLine].join('\n');
|
|
971
1333
|
}
|
|
972
1334
|
_canRequestBeRetried(request, error) {
|
|
973
|
-
// Request should never be retried, or the error encountered makes it not able to be retried
|
|
974
|
-
if (request.noRetry ||
|
|
975
|
-
error instanceof NonRetryableError ||
|
|
976
|
-
(error instanceof SessionError && this.maxSessionRotations <= (request.sessionRotationCount ?? 0))) {
|
|
1335
|
+
// Request should never be retried, or the error encountered makes it not able to be retried.
|
|
1336
|
+
if (request.noRetry || error instanceof NonRetryableError) {
|
|
977
1337
|
return false;
|
|
978
1338
|
}
|
|
979
1339
|
// User requested retry (we ignore retry count here as its explicitly told by the user to retry)
|
|
@@ -985,19 +1345,11 @@ export class BasicCrawler {
|
|
|
985
1345
|
return request.retryCount < maxRequestRetries;
|
|
986
1346
|
}
|
|
987
1347
|
/**
|
|
988
|
-
* Updates handledRequestsCount from possibly stored counts,
|
|
989
|
-
* usually after worker migration. Since one of the stores
|
|
990
|
-
* needs to have priority when both are present,
|
|
991
|
-
* it is the request queue, because generally, the request
|
|
992
|
-
* list will first be dumped into the queue and then left
|
|
993
|
-
* empty.
|
|
1348
|
+
* Updates handledRequestsCount from possibly stored counts, usually after worker migration.
|
|
994
1349
|
*/
|
|
995
1350
|
async _loadHandledRequestCount() {
|
|
996
|
-
if (this.
|
|
997
|
-
this.handledRequestsCount = await this.
|
|
998
|
-
}
|
|
999
|
-
else if (this.requestList) {
|
|
1000
|
-
this.handledRequestsCount = this.requestList.handledCount();
|
|
1351
|
+
if (this.requestManager) {
|
|
1352
|
+
this.handledRequestsCount = await this.requestManager.handledCount();
|
|
1001
1353
|
}
|
|
1002
1354
|
}
|
|
1003
1355
|
async _executeHooks(hooks, ...args) {
|
|
@@ -1008,17 +1360,18 @@ export class BasicCrawler {
|
|
|
1008
1360
|
}
|
|
1009
1361
|
}
|
|
1010
1362
|
/**
|
|
1011
|
-
*
|
|
1012
|
-
*
|
|
1363
|
+
* Stops the crawler immediately.
|
|
1364
|
+
*
|
|
1365
|
+
* This method doesn't wait for currently active requests to finish.
|
|
1366
|
+
*
|
|
1367
|
+
* To stop the crawler gracefully (waiting for all running requests to finish), use {@link BasicCrawler.stop|`crawler.stop()`} instead.
|
|
1013
1368
|
*/
|
|
1014
1369
|
async teardown() {
|
|
1015
|
-
|
|
1016
|
-
if (this.useSessionPool) {
|
|
1017
|
-
await this.sessionPool.teardown();
|
|
1018
|
-
}
|
|
1370
|
+
serviceLocator.getEventManager().emit("persistState" /* EventType.PERSIST_STATE */, { isMigrating: false });
|
|
1019
1371
|
if (this._closeEvents) {
|
|
1020
|
-
await
|
|
1372
|
+
await serviceLocator.getEventManager().close();
|
|
1021
1373
|
}
|
|
1374
|
+
await this.ownedSessionPool?.teardown();
|
|
1022
1375
|
await this.autoscaledPool?.abort();
|
|
1023
1376
|
}
|
|
1024
1377
|
_getCookieHeaderFromRequest(request) {
|
|
@@ -1030,16 +1383,30 @@ export class BasicCrawler {
|
|
|
1030
1383
|
}
|
|
1031
1384
|
async _getRequestQueue() {
|
|
1032
1385
|
// Check if it's explicitly disabled
|
|
1386
|
+
// oxlint-disable-next-line typescript/no-deprecated -- still honored for opt-out until the flag is removed
|
|
1033
1387
|
if (this.experiments.requestLocking === false) {
|
|
1388
|
+
// oxlint-disable-next-line typescript/no-deprecated
|
|
1034
1389
|
if (!this._experimentWarnings.requestLocking) {
|
|
1035
1390
|
this.log.info('Using the old RequestQueue implementation without request locking.');
|
|
1391
|
+
// oxlint-disable-next-line typescript/no-deprecated
|
|
1036
1392
|
this._experimentWarnings.requestLocking = true;
|
|
1037
1393
|
}
|
|
1038
|
-
return RequestQueueV1.open(null, { config:
|
|
1394
|
+
return RequestQueueV1.open(null, { config: serviceLocator.getConfiguration() });
|
|
1039
1395
|
}
|
|
1040
|
-
return RequestQueue.open(null, { config:
|
|
1396
|
+
return RequestQueue.open(null, { config: serviceLocator.getConfiguration() });
|
|
1041
1397
|
}
|
|
1042
1398
|
requestMatchesEnqueueStrategy(request) {
|
|
1399
|
+
// If `skipNavigation` was used, just return `true`
|
|
1400
|
+
try {
|
|
1401
|
+
// eslint-disable-next-line @typescript-eslint/no-unused-expressions
|
|
1402
|
+
request.loadedUrl;
|
|
1403
|
+
}
|
|
1404
|
+
catch (err) {
|
|
1405
|
+
if (err instanceof NavigationSkippedError) {
|
|
1406
|
+
return true;
|
|
1407
|
+
}
|
|
1408
|
+
throw err;
|
|
1409
|
+
}
|
|
1043
1410
|
const { url, loadedUrl } = request;
|
|
1044
1411
|
// eslint-disable-next-line dot-notation -- private access
|
|
1045
1412
|
const strategy = request['enqueueStrategy'];
|