@crawlee/basic 4.0.0-beta.3 → 4.0.0-beta.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/index.d.ts +1 -1
- package/index.d.ts.map +1 -1
- package/index.js +0 -1
- package/index.js.map +1 -1
- package/internals/basic-crawler.d.ts +204 -86
- package/internals/basic-crawler.d.ts.map +1 -1
- package/internals/basic-crawler.js +504 -269
- package/internals/basic-crawler.js.map +1 -1
- package/internals/send-request.d.ts +3 -5
- package/internals/send-request.d.ts.map +1 -1
- package/internals/send-request.js +20 -25
- package/internals/send-request.js.map +1 -1
- package/package.json +6 -6
- package/internals/constants.d.ts +0 -7
- package/internals/constants.d.ts.map +0 -1
- package/internals/constants.js +0 -7
- package/internals/constants.js.map +0 -1
- package/tsconfig.build.tsbuildinfo +0 -1
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import { writeFile } from 'node:fs/promises';
|
|
2
2
|
import { dirname } from 'node:path';
|
|
3
|
-
import { AutoscaledPool,
|
|
4
|
-
import {
|
|
3
|
+
import { AutoscaledPool, bindMethodsToServiceLocator, ContextPipeline, ContextPipelineCleanupError, ContextPipelineInitializationError, ContextPipelineInterruptedError, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, KeyValueStore, LogLevel, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestHandlerError, RequestListAdapter, RequestManagerTandem, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, ServiceLocator, serviceLocator, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
|
|
4
|
+
import { GotScrapingHttpClient } from '@crawlee/got-scraping-client';
|
|
5
|
+
import { getObjectType, isAsyncIterable, isIterable, RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
|
|
5
6
|
import { stringify } from 'csv-stringify/sync';
|
|
6
7
|
import { ensureDir, writeJSON } from 'fs-extra/esm';
|
|
7
8
|
import ow from 'ow';
|
|
8
9
|
import { getDomain } from 'tldts';
|
|
9
10
|
import { LruCache } from '@apify/datastructures';
|
|
10
|
-
import defaultLog, { LogLevel } from '@apify/log';
|
|
11
11
|
import { addTimeoutToPromise, TimeoutError, tryCancel } from '@apify/timeout';
|
|
12
12
|
import { cryptoRandomObjectId } from '@apify/utilities';
|
|
13
13
|
import { createSendRequest } from './send-request.js';
|
|
@@ -86,8 +86,12 @@ const SAFE_MIGRATION_WAIT_MILLIS = 20000;
|
|
|
86
86
|
* @category Crawlers
|
|
87
87
|
*/
|
|
88
88
|
export class BasicCrawler {
|
|
89
|
-
config;
|
|
90
89
|
static CRAWLEE_STATE_KEY = 'CRAWLEE_STATE';
|
|
90
|
+
/**
|
|
91
|
+
* Tracks crawler instances that accessed shared state without having an explicit id.
|
|
92
|
+
* Used to detect and warn about multiple crawlers sharing the same state.
|
|
93
|
+
*/
|
|
94
|
+
static useStateCrawlerIds = new Set();
|
|
91
95
|
/**
|
|
92
96
|
* A reference to the underlying {@link Statistics} class that collects and logs run statistics for requests.
|
|
93
97
|
*/
|
|
@@ -103,6 +107,10 @@ export class BasicCrawler {
|
|
|
103
107
|
* Only available if used by the crawler.
|
|
104
108
|
*/
|
|
105
109
|
requestQueue;
|
|
110
|
+
/**
|
|
111
|
+
* The main request-handling component of the crawler. It's initialized during the crawler startup.
|
|
112
|
+
*/
|
|
113
|
+
requestManager;
|
|
106
114
|
/**
|
|
107
115
|
* A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session|sessions}.
|
|
108
116
|
* Only available if used by the crawler.
|
|
@@ -116,40 +124,62 @@ export class BasicCrawler {
|
|
|
116
124
|
* or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
|
|
117
125
|
*/
|
|
118
126
|
autoscaledPool;
|
|
127
|
+
/**
|
|
128
|
+
* A reference to the underlying {@link ProxyConfiguration} class that manages the crawler's proxies.
|
|
129
|
+
* Only available if used by the crawler.
|
|
130
|
+
*/
|
|
131
|
+
proxyConfiguration;
|
|
119
132
|
/**
|
|
120
133
|
* Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
|
|
121
134
|
* See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
|
|
122
135
|
*/
|
|
123
136
|
router = Router.create();
|
|
137
|
+
contextPipelineBuilder;
|
|
138
|
+
_contextPipeline;
|
|
139
|
+
get contextPipeline() {
|
|
140
|
+
if (this._contextPipeline === undefined) {
|
|
141
|
+
this._contextPipeline = this.contextPipelineBuilder();
|
|
142
|
+
}
|
|
143
|
+
return this._contextPipeline;
|
|
144
|
+
}
|
|
124
145
|
running = false;
|
|
125
146
|
hasFinishedBefore = false;
|
|
126
|
-
log;
|
|
147
|
+
#log;
|
|
148
|
+
get log() {
|
|
149
|
+
return this.#log;
|
|
150
|
+
}
|
|
127
151
|
requestHandler;
|
|
128
152
|
errorHandler;
|
|
129
153
|
failedRequestHandler;
|
|
130
154
|
requestHandlerTimeoutMillis;
|
|
131
155
|
internalTimeoutMillis;
|
|
132
156
|
maxRequestRetries;
|
|
157
|
+
maxCrawlDepth;
|
|
133
158
|
sameDomainDelayMillis;
|
|
134
159
|
domainAccessedTime;
|
|
135
160
|
maxSessionRotations;
|
|
136
|
-
|
|
161
|
+
maxRequestsPerCrawl;
|
|
162
|
+
handledRequestsCount = 0;
|
|
137
163
|
statusMessageLoggingInterval;
|
|
138
164
|
statusMessageCallback;
|
|
139
165
|
sessionPoolOptions;
|
|
140
166
|
useSessionPool;
|
|
141
|
-
crawlingContexts = new Map();
|
|
142
167
|
autoscaledPoolOptions;
|
|
143
|
-
events;
|
|
144
168
|
httpClient;
|
|
145
169
|
retryOnBlocked;
|
|
146
170
|
respectRobotsTxtFile;
|
|
147
171
|
onSkippedRequest;
|
|
148
172
|
_closeEvents;
|
|
173
|
+
shouldLogMaxProcessedRequestsExceeded = true;
|
|
174
|
+
shouldLogMaxEnqueuedRequestsExceeded = true;
|
|
149
175
|
experiments;
|
|
150
176
|
robotsTxtFileCache;
|
|
151
177
|
_experimentWarnings = {};
|
|
178
|
+
crawlerId;
|
|
179
|
+
hasExplicitId;
|
|
152
180
|
static optionsShape = {
|
|
181
|
+
contextPipelineBuilder: ow.optional.object,
|
|
182
|
+
extendContext: ow.optional.function,
|
|
153
183
|
requestList: ow.optional.object.validate(validators.requestList),
|
|
154
184
|
requestQueue: ow.optional.object.validate(validators.requestQueue),
|
|
155
185
|
// Subclasses override this function instead of passing it
|
|
@@ -163,144 +193,209 @@ export class BasicCrawler {
|
|
|
163
193
|
sameDomainDelaySecs: ow.optional.number,
|
|
164
194
|
maxSessionRotations: ow.optional.number,
|
|
165
195
|
maxRequestsPerCrawl: ow.optional.number,
|
|
196
|
+
maxCrawlDepth: ow.optional.number,
|
|
166
197
|
autoscaledPoolOptions: ow.optional.object,
|
|
167
198
|
sessionPoolOptions: ow.optional.object,
|
|
168
199
|
useSessionPool: ow.optional.boolean,
|
|
200
|
+
proxyConfiguration: ow.optional.object.validate(validators.proxyConfiguration),
|
|
169
201
|
statusMessageLoggingInterval: ow.optional.number,
|
|
170
202
|
statusMessageCallback: ow.optional.function,
|
|
171
203
|
retryOnBlocked: ow.optional.boolean,
|
|
172
|
-
respectRobotsTxtFile: ow.optional.boolean,
|
|
204
|
+
respectRobotsTxtFile: ow.optional.any(ow.boolean, ow.object),
|
|
173
205
|
onSkippedRequest: ow.optional.function,
|
|
174
206
|
httpClient: ow.optional.object,
|
|
207
|
+
configuration: ow.optional.object,
|
|
208
|
+
storageClient: ow.optional.object,
|
|
209
|
+
eventManager: ow.optional.object,
|
|
210
|
+
logger: ow.optional.object,
|
|
175
211
|
// AutoscaledPool shorthands
|
|
176
212
|
minConcurrency: ow.optional.number,
|
|
177
213
|
maxConcurrency: ow.optional.number,
|
|
178
214
|
maxRequestsPerMinute: ow.optional.number.integerOrInfinite.positive.greaterThanOrEqual(1),
|
|
179
215
|
keepAlive: ow.optional.boolean,
|
|
180
216
|
// internal
|
|
181
|
-
log: ow.optional.object,
|
|
182
217
|
experiments: ow.optional.object,
|
|
183
218
|
statisticsOptions: ow.optional.object,
|
|
219
|
+
id: ow.optional.string,
|
|
184
220
|
};
|
|
185
221
|
/**
|
|
186
222
|
* All `BasicCrawler` parameters are passed via an options object.
|
|
187
223
|
*/
|
|
188
|
-
constructor(options = {}
|
|
189
|
-
this.config = config;
|
|
224
|
+
constructor(options = {}) {
|
|
190
225
|
ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape));
|
|
191
|
-
const { requestList, requestQueue, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true,
|
|
226
|
+
const { requestList, requestQueue, requestManager, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, maxCrawlDepth, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true, proxyConfiguration,
|
|
227
|
+
// Service locator options
|
|
228
|
+
configuration, storageClient, eventManager, logger,
|
|
192
229
|
// AutoscaledPool shorthands
|
|
193
230
|
minConcurrency, maxConcurrency, maxRequestsPerMinute, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
|
|
194
231
|
// internal
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
this.failedRequestHandler = failedRequestHandler;
|
|
209
|
-
this.errorHandler = errorHandler;
|
|
210
|
-
if (requestHandlerTimeoutSecs) {
|
|
211
|
-
this.requestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
|
|
212
|
-
}
|
|
213
|
-
else {
|
|
214
|
-
this.requestHandlerTimeoutMillis = 60_000;
|
|
215
|
-
}
|
|
216
|
-
this.retryOnBlocked = retryOnBlocked;
|
|
217
|
-
this.respectRobotsTxtFile = respectRobotsTxtFile;
|
|
218
|
-
this.onSkippedRequest = onSkippedRequest;
|
|
219
|
-
const tryEnv = (val) => (val == null ? null : +val);
|
|
220
|
-
// allow at least 5min for internal timeouts
|
|
221
|
-
this.internalTimeoutMillis =
|
|
222
|
-
tryEnv(process.env.CRAWLEE_INTERNAL_TIMEOUT) ?? Math.max(this.requestHandlerTimeoutMillis * 2, 300e3);
|
|
223
|
-
// override the default internal timeout of request queue to respect `requestHandlerTimeoutMillis`
|
|
224
|
-
if (this.requestQueue) {
|
|
225
|
-
this.requestQueue.internalTimeoutMillis = this.internalTimeoutMillis;
|
|
226
|
-
// for request queue v2, we want to lock requests for slightly longer than the request handler timeout so that there is some padding for locking-related overhead,
|
|
227
|
-
// but never for less than a minute
|
|
228
|
-
this.requestQueue.requestLockSecs = Math.max(this.requestHandlerTimeoutMillis / 1000 + 5, 60);
|
|
229
|
-
}
|
|
230
|
-
this.maxRequestRetries = maxRequestRetries;
|
|
231
|
-
this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
|
|
232
|
-
this.maxSessionRotations = maxSessionRotations;
|
|
233
|
-
this.handledRequestsCount = 0;
|
|
234
|
-
this.stats = new Statistics({
|
|
235
|
-
logMessage: `${log.getOptions().prefix} request statistics:`,
|
|
236
|
-
log,
|
|
237
|
-
config,
|
|
238
|
-
...statisticsOptions,
|
|
239
|
-
});
|
|
240
|
-
this.sessionPoolOptions = {
|
|
241
|
-
...sessionPoolOptions,
|
|
242
|
-
log,
|
|
243
|
-
};
|
|
244
|
-
if (this.retryOnBlocked) {
|
|
245
|
-
this.sessionPoolOptions.blockedStatusCodes = sessionPoolOptions.blockedStatusCodes ?? [];
|
|
246
|
-
if (this.sessionPoolOptions.blockedStatusCodes.length !== 0) {
|
|
247
|
-
log.warning(`Both 'blockedStatusCodes' and 'retryOnBlocked' are set. Please note that the 'retryOnBlocked' feature might not work as expected.`);
|
|
248
|
-
}
|
|
232
|
+
experiments = {}, id, } = options;
|
|
233
|
+
// Create per-crawler service locator if custom services were provided.
|
|
234
|
+
// This wraps every method on the crawler instance so that calls to the global `serviceLocator`
|
|
235
|
+
// (via AsyncLocalStorage) resolve to this scoped instance instead.
|
|
236
|
+
// We also enter the scope for the rest of the constructor body, so that any code below
|
|
237
|
+
// that accesses `serviceLocator` will see the correct (scoped) instance.
|
|
238
|
+
let serviceLocatorScope = { enterScope: () => { }, exitScope: () => { } };
|
|
239
|
+
if (storageClient ||
|
|
240
|
+
eventManager ||
|
|
241
|
+
logger ||
|
|
242
|
+
(configuration !== undefined && configuration !== serviceLocator.getConfiguration())) {
|
|
243
|
+
const scopedServiceLocator = new ServiceLocator(configuration, eventManager, storageClient, logger);
|
|
244
|
+
serviceLocatorScope = bindMethodsToServiceLocator(scopedServiceLocator, this);
|
|
249
245
|
}
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
this.
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
}
|
|
267
|
-
const basicCrawlerAutoscaledPoolConfiguration = {
|
|
268
|
-
minConcurrency: minConcurrency ?? autoscaledPoolOptions?.minConcurrency,
|
|
269
|
-
maxConcurrency: maxConcurrency ?? autoscaledPoolOptions?.maxConcurrency,
|
|
270
|
-
maxTasksPerMinute: maxRequestsPerMinute ?? autoscaledPoolOptions?.maxTasksPerMinute,
|
|
271
|
-
runTaskFunction: this._runTaskFunction.bind(this),
|
|
272
|
-
isTaskReadyFunction: async () => {
|
|
273
|
-
if (isMaxPagesExceeded()) {
|
|
274
|
-
if (shouldLogMaxPagesExceeded) {
|
|
275
|
-
log.info('Crawler reached the maxRequestsPerCrawl limit of ' +
|
|
276
|
-
`${maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
|
|
277
|
-
shouldLogMaxPagesExceeded = false;
|
|
278
|
-
}
|
|
279
|
-
return false;
|
|
246
|
+
try {
|
|
247
|
+
serviceLocatorScope.enterScope();
|
|
248
|
+
this.#log = serviceLocator.getLogger().child({ prefix: this.constructor.name });
|
|
249
|
+
// Store whether the user explicitly provided an ID
|
|
250
|
+
this.hasExplicitId = id !== undefined;
|
|
251
|
+
// Store the user-provided ID, or generate a unique one for tracking purposes (not for state key)
|
|
252
|
+
this.crawlerId = id ?? cryptoRandomObjectId();
|
|
253
|
+
// Store the builder so that it can be run when the contextPipeline is needed.
|
|
254
|
+
// Invoking it immediately would cause problems with parent constructor call order.
|
|
255
|
+
this.contextPipelineBuilder = () => {
|
|
256
|
+
let contextPipeline = (options.contextPipelineBuilder?.() ??
|
|
257
|
+
ContextPipeline.create()); // Thanks to the RequireContextPipeline, contextPipeline will only be undefined if InitialContextType is CrawlingContext
|
|
258
|
+
if (options.extendContext !== undefined) {
|
|
259
|
+
contextPipeline = contextPipeline.compose({
|
|
260
|
+
action: async (context) => await options.extendContext(context),
|
|
261
|
+
});
|
|
280
262
|
}
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
263
|
+
contextPipeline = contextPipeline.compose({
|
|
264
|
+
action: async (context) => {
|
|
265
|
+
const { request } = context;
|
|
266
|
+
if (!this.requestMatchesEnqueueStrategy(request)) {
|
|
267
|
+
// eslint-disable-next-line dot-notation
|
|
268
|
+
const message = `Skipping request ${request.id} (starting url: ${request.url} -> loaded url: ${request.loadedUrl}) because it does not match the enqueue strategy (${request['enqueueStrategy']}).`;
|
|
269
|
+
this.log.debug(message);
|
|
270
|
+
request.noRetry = true;
|
|
271
|
+
request.state = RequestState.SKIPPED;
|
|
272
|
+
await this.handleSkippedRequest({ url: request.url, reason: 'redirect' });
|
|
273
|
+
throw new ContextPipelineInterruptedError(message);
|
|
274
|
+
}
|
|
275
|
+
return context;
|
|
276
|
+
},
|
|
277
|
+
});
|
|
278
|
+
return contextPipeline;
|
|
279
|
+
};
|
|
280
|
+
if (requestManager !== undefined) {
|
|
281
|
+
if (requestList !== undefined || requestQueue !== undefined) {
|
|
282
|
+
throw new Error('The `requestManager` option cannot be used in conjunction with `requestList` and/or `requestQueue`');
|
|
289
283
|
}
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
284
|
+
this.requestManager = requestManager;
|
|
285
|
+
this.requestQueue = requestManager; // TODO(v4) - the cast is not fully legitimate here, but it's fine for internal usage by the BasicCrawler
|
|
286
|
+
}
|
|
287
|
+
else {
|
|
288
|
+
this.requestList = requestList;
|
|
289
|
+
this.requestQueue = requestQueue;
|
|
290
|
+
}
|
|
291
|
+
this.httpClient = httpClient ?? new GotScrapingHttpClient();
|
|
292
|
+
this.proxyConfiguration = proxyConfiguration;
|
|
293
|
+
this.statusMessageLoggingInterval = statusMessageLoggingInterval;
|
|
294
|
+
this.statusMessageCallback = statusMessageCallback;
|
|
295
|
+
this.domainAccessedTime = new Map();
|
|
296
|
+
this.experiments = experiments;
|
|
297
|
+
this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });
|
|
298
|
+
this.handleSkippedRequest = this.handleSkippedRequest.bind(this);
|
|
299
|
+
this.requestHandler = requestHandler ?? this.router;
|
|
300
|
+
this.failedRequestHandler = failedRequestHandler;
|
|
301
|
+
this.errorHandler = errorHandler;
|
|
302
|
+
if (requestHandlerTimeoutSecs) {
|
|
303
|
+
this.requestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
|
|
304
|
+
}
|
|
305
|
+
else {
|
|
306
|
+
this.requestHandlerTimeoutMillis = 60_000;
|
|
307
|
+
}
|
|
308
|
+
this.retryOnBlocked = retryOnBlocked;
|
|
309
|
+
this.respectRobotsTxtFile = respectRobotsTxtFile;
|
|
310
|
+
this.onSkippedRequest = onSkippedRequest;
|
|
311
|
+
const tryEnv = (val) => (val == null ? null : +val);
|
|
312
|
+
// allow at least 5min for internal timeouts
|
|
313
|
+
this.internalTimeoutMillis =
|
|
314
|
+
tryEnv(process.env.CRAWLEE_INTERNAL_TIMEOUT) ?? Math.max(this.requestHandlerTimeoutMillis * 2, 300e3);
|
|
315
|
+
// override the default internal timeout of request queue to respect `requestHandlerTimeoutMillis`
|
|
316
|
+
if (this.requestQueue) {
|
|
317
|
+
this.requestQueue.internalTimeoutMillis = this.internalTimeoutMillis;
|
|
318
|
+
// for request queue v2, we want to lock requests for slightly longer than the request handler timeout so that there is some padding for locking-related overhead,
|
|
319
|
+
// but never for less than a minute
|
|
320
|
+
this.requestQueue.requestLockSecs = Math.max(this.requestHandlerTimeoutMillis / 1000 + 5, 60);
|
|
321
|
+
}
|
|
322
|
+
this.maxRequestRetries = maxRequestRetries;
|
|
323
|
+
this.maxCrawlDepth = maxCrawlDepth;
|
|
324
|
+
this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
|
|
325
|
+
this.maxSessionRotations = maxSessionRotations;
|
|
326
|
+
this.stats = new Statistics({
|
|
327
|
+
logMessage: `${this.constructor.name} request statistics:`,
|
|
328
|
+
log: this.log,
|
|
329
|
+
...(this.hasExplicitId ? { id: this.crawlerId } : {}),
|
|
330
|
+
...statisticsOptions,
|
|
331
|
+
});
|
|
332
|
+
this.sessionPoolOptions = {
|
|
333
|
+
...sessionPoolOptions,
|
|
334
|
+
log: this.log,
|
|
335
|
+
};
|
|
336
|
+
if (this.retryOnBlocked) {
|
|
337
|
+
this.sessionPoolOptions.blockedStatusCodes = sessionPoolOptions.blockedStatusCodes ?? [];
|
|
338
|
+
if (this.sessionPoolOptions.blockedStatusCodes.length !== 0) {
|
|
339
|
+
this.log.warning(`Both 'blockedStatusCodes' and 'retryOnBlocked' are set. Please note that the 'retryOnBlocked' feature might not work as expected.`);
|
|
298
340
|
}
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
341
|
+
}
|
|
342
|
+
this.useSessionPool = useSessionPool;
|
|
343
|
+
const maxSignedInteger = 2 ** 31 - 1;
|
|
344
|
+
if (this.requestHandlerTimeoutMillis > maxSignedInteger) {
|
|
345
|
+
this.log.warning(`requestHandlerTimeoutMillis ${this.requestHandlerTimeoutMillis}` +
|
|
346
|
+
` does not fit a signed 32-bit integer. Limiting the value to ${maxSignedInteger}`);
|
|
347
|
+
this.requestHandlerTimeoutMillis = maxSignedInteger;
|
|
348
|
+
}
|
|
349
|
+
this.internalTimeoutMillis = Math.min(this.internalTimeoutMillis, maxSignedInteger);
|
|
350
|
+
this.maxRequestsPerCrawl = maxRequestsPerCrawl;
|
|
351
|
+
const isMaxPagesExceeded = () => this.maxRequestsPerCrawl && this.maxRequestsPerCrawl <= this.handledRequestsCount;
|
|
352
|
+
// eslint-disable-next-line prefer-const
|
|
353
|
+
let { isFinishedFunction, isTaskReadyFunction } = autoscaledPoolOptions;
|
|
354
|
+
// override even if `isFinishedFunction` provided by user - `keepAlive` has higher priority
|
|
355
|
+
if (keepAlive) {
|
|
356
|
+
isFinishedFunction = async () => false;
|
|
357
|
+
}
|
|
358
|
+
const basicCrawlerAutoscaledPoolConfiguration = {
|
|
359
|
+
minConcurrency: minConcurrency ?? autoscaledPoolOptions?.minConcurrency,
|
|
360
|
+
maxConcurrency: maxConcurrency ?? autoscaledPoolOptions?.maxConcurrency,
|
|
361
|
+
maxTasksPerMinute: maxRequestsPerMinute ?? autoscaledPoolOptions?.maxTasksPerMinute,
|
|
362
|
+
runTaskFunction: this._runTaskFunction.bind(this),
|
|
363
|
+
isTaskReadyFunction: async () => {
|
|
364
|
+
if (isMaxPagesExceeded()) {
|
|
365
|
+
if (this.shouldLogMaxProcessedRequestsExceeded) {
|
|
366
|
+
this.log.info('Crawler reached the maxRequestsPerCrawl limit of ' +
|
|
367
|
+
`${this.maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`);
|
|
368
|
+
this.shouldLogMaxProcessedRequestsExceeded = false;
|
|
369
|
+
}
|
|
370
|
+
return false;
|
|
371
|
+
}
|
|
372
|
+
return isTaskReadyFunction ? await isTaskReadyFunction() : await this._isTaskReadyFunction();
|
|
373
|
+
},
|
|
374
|
+
isFinishedFunction: async () => {
|
|
375
|
+
if (isMaxPagesExceeded()) {
|
|
376
|
+
this.log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${this.maxRequestsPerCrawl} requests ` +
|
|
377
|
+
'and all requests that were in progress at that time have now finished. ' +
|
|
378
|
+
`In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`);
|
|
379
|
+
return true;
|
|
380
|
+
}
|
|
381
|
+
const isFinished = isFinishedFunction
|
|
382
|
+
? await isFinishedFunction()
|
|
383
|
+
: await this._defaultIsFinishedFunction();
|
|
384
|
+
if (isFinished) {
|
|
385
|
+
const reason = isFinishedFunction
|
|
386
|
+
? "Crawler's custom isFinishedFunction() returned true, the crawler will shut down."
|
|
387
|
+
: 'All requests from the queue have been processed, the crawler will shut down.';
|
|
388
|
+
this.log.info(reason);
|
|
389
|
+
}
|
|
390
|
+
return isFinished;
|
|
391
|
+
},
|
|
392
|
+
log: this.log,
|
|
393
|
+
};
|
|
394
|
+
this.autoscaledPoolOptions = { ...autoscaledPoolOptions, ...basicCrawlerAutoscaledPoolConfiguration };
|
|
395
|
+
}
|
|
396
|
+
finally {
|
|
397
|
+
serviceLocatorScope.exitScope();
|
|
398
|
+
}
|
|
304
399
|
}
|
|
305
400
|
/**
|
|
306
401
|
* Checks if the given error is a proxy error by comparing its message to a list of known proxy error messages.
|
|
@@ -311,21 +406,13 @@ export class BasicCrawler {
|
|
|
311
406
|
isProxyError(error) {
|
|
312
407
|
return ROTATE_PROXY_ERRORS.some((x) => this._getMessageFromError(error)?.includes(x));
|
|
313
408
|
}
|
|
314
|
-
/**
|
|
315
|
-
* Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
|
|
316
|
-
* Returns `false` if the request is not blocked, otherwise returns a string with a description of the block reason.
|
|
317
|
-
* @param _crawlingContext The crawling context to check.
|
|
318
|
-
*/
|
|
319
|
-
async isRequestBlocked(_crawlingContext) {
|
|
320
|
-
throw new Error('the "isRequestBlocked" method is not implemented in this crawler.');
|
|
321
|
-
}
|
|
322
409
|
/**
|
|
323
410
|
* This method is periodically called by the crawler, every `statusMessageLoggingInterval` seconds.
|
|
324
411
|
*/
|
|
325
412
|
async setStatusMessage(message, options = {}) {
|
|
326
413
|
const data = options.isStatusMessageTerminal != null ? { terminal: options.isStatusMessageTerminal } : undefined;
|
|
327
|
-
this.log.
|
|
328
|
-
const client =
|
|
414
|
+
this.log.logWithLevel(LogLevel[options.level ?? 'DEBUG'], message, data);
|
|
415
|
+
const client = serviceLocator.getStorageClient();
|
|
329
416
|
if (!client.setStatusMessage) {
|
|
330
417
|
return;
|
|
331
418
|
}
|
|
@@ -350,7 +437,7 @@ export class BasicCrawler {
|
|
|
350
437
|
message = `Experiencing problems, ${this.stats.state.requestsFailed - previousState.requestsFailed || this.stats.state.requestsFailed} failed requests in the past ${this.statusMessageLoggingInterval} seconds.`;
|
|
351
438
|
}
|
|
352
439
|
else {
|
|
353
|
-
const total = this.
|
|
440
|
+
const total = this.requestManager?.getTotalCount();
|
|
354
441
|
message = `Crawled ${this.stats.state.requestsFinished}${total ? `/${total}` : ''} pages, ${this.stats.state.requestsFailed} failed requests, desired concurrency ${this.autoscaledPool?.desiredConcurrency ?? 0}.`;
|
|
355
442
|
}
|
|
356
443
|
if (this.statusMessageCallback) {
|
|
@@ -390,20 +477,30 @@ export class BasicCrawler {
|
|
|
390
477
|
if (this.requestQueue?.name === 'default' && purgeRequestQueue) {
|
|
391
478
|
await this.requestQueue.drop();
|
|
392
479
|
this.requestQueue = await this._getRequestQueue();
|
|
480
|
+
this.requestManager = undefined;
|
|
481
|
+
await this.initializeRequestManager();
|
|
482
|
+
this.handledRequestsCount = 0; // This would've been reset by this._init() further down below, but at that point `handledRequestsCount` could prevent `addRequests` from adding the initial requests
|
|
393
483
|
}
|
|
394
484
|
this.stats.reset();
|
|
395
485
|
await this.stats.resetStore();
|
|
396
486
|
await this.sessionPool?.resetStore();
|
|
397
487
|
}
|
|
398
488
|
this.running = true;
|
|
399
|
-
|
|
489
|
+
this.shouldLogMaxProcessedRequestsExceeded = true;
|
|
490
|
+
this.shouldLogMaxEnqueuedRequestsExceeded = true;
|
|
491
|
+
await purgeDefaultStorages({
|
|
492
|
+
onlyPurgeOnce: true,
|
|
493
|
+
client: serviceLocator.getStorageClient(),
|
|
494
|
+
config: serviceLocator.getConfiguration(),
|
|
495
|
+
});
|
|
400
496
|
if (requests) {
|
|
401
497
|
await this.addRequests(requests, addRequestsOptions);
|
|
402
498
|
}
|
|
403
499
|
await this._init();
|
|
404
500
|
await this.stats.startCapturing();
|
|
405
501
|
const periodicLogger = this.getPeriodicLogger();
|
|
406
|
-
|
|
502
|
+
// Don't await, we don't want to block the execution
|
|
503
|
+
void this.setStatusMessage('Starting the crawler.', { level: 'INFO' });
|
|
407
504
|
const sigintHandler = async () => {
|
|
408
505
|
this.log.warning('Pausing... Press CTRL+C again to force exit. To resume, do: CRAWLEE_PURGE_ON_START=0 npm start');
|
|
409
506
|
await this._pauseOnMigration();
|
|
@@ -412,8 +509,9 @@ export class BasicCrawler {
|
|
|
412
509
|
// Attach a listener to handle migration and aborting events gracefully.
|
|
413
510
|
const boundPauseOnMigration = this._pauseOnMigration.bind(this);
|
|
414
511
|
process.once('SIGINT', sigintHandler);
|
|
415
|
-
|
|
416
|
-
|
|
512
|
+
const eventManager = serviceLocator.getEventManager();
|
|
513
|
+
eventManager.on("migrating" /* EventType.MIGRATING */, boundPauseOnMigration);
|
|
514
|
+
eventManager.on("aborting" /* EventType.ABORTING */, boundPauseOnMigration);
|
|
417
515
|
let stats = {};
|
|
418
516
|
try {
|
|
419
517
|
await this.autoscaledPool.run();
|
|
@@ -422,8 +520,8 @@ export class BasicCrawler {
|
|
|
422
520
|
await this.teardown();
|
|
423
521
|
await this.stats.stopCapturing();
|
|
424
522
|
process.off('SIGINT', sigintHandler);
|
|
425
|
-
|
|
426
|
-
|
|
523
|
+
eventManager.off("migrating" /* EventType.MIGRATING */, boundPauseOnMigration);
|
|
524
|
+
eventManager.off("aborting" /* EventType.ABORTING */, boundPauseOnMigration);
|
|
427
525
|
const finalStats = this.stats.calculate();
|
|
428
526
|
stats = {
|
|
429
527
|
requestsFinished: this.stats.state.requestsFinished,
|
|
@@ -440,7 +538,7 @@ export class BasicCrawler {
|
|
|
440
538
|
mostCommonErrors: this.stats.errorTracker.getMostPopularErrors(3).map(prettify),
|
|
441
539
|
});
|
|
442
540
|
}
|
|
443
|
-
const client =
|
|
541
|
+
const client = serviceLocator.getStorageClient();
|
|
444
542
|
if (client.teardown) {
|
|
445
543
|
let finished = false;
|
|
446
544
|
setTimeout(() => {
|
|
@@ -452,7 +550,8 @@ export class BasicCrawler {
|
|
|
452
550
|
finished = true;
|
|
453
551
|
}
|
|
454
552
|
periodicLogger.stop();
|
|
455
|
-
|
|
553
|
+
// Don't await, we don't want to block the execution
|
|
554
|
+
void this.setStatusMessage(`Finished! Total ${this.stats.state.requestsFinished + this.stats.state.requestsFailed} requests: ${this.stats.state.requestsFinished} succeeded, ${this.stats.state.requestsFailed} failed.`, { isStatusMessageTerminal: true, level: 'INFO' });
|
|
456
555
|
this.running = false;
|
|
457
556
|
this.hasFinishedBefore = true;
|
|
458
557
|
}
|
|
@@ -462,6 +561,8 @@ export class BasicCrawler {
|
|
|
462
561
|
* Gracefully stops the current run of the crawler.
|
|
463
562
|
*
|
|
464
563
|
* All the tasks active at the time of calling this method will be allowed to finish.
|
|
564
|
+
*
|
|
565
|
+
* To stop the crawler immediately, use {@link BasicCrawler.teardown|`crawler.teardown()`} instead.
|
|
465
566
|
*/
|
|
466
567
|
stop(message = 'The crawler has been gracefully stopped.') {
|
|
467
568
|
// Gracefully starve the this.autoscaledPool, so it doesn't start new tasks. Resolves once the pool is cleared.
|
|
@@ -478,13 +579,59 @@ export class BasicCrawler {
|
|
|
478
579
|
if (!this.requestQueue && this.requestList) {
|
|
479
580
|
this.log.warningOnce('When using RequestList and RequestQueue at the same time, you should instantiate both explicitly and provide them in the crawler options, to ensure correctly handled restarts of the crawler.');
|
|
480
581
|
}
|
|
481
|
-
this.requestQueue
|
|
582
|
+
if (!this.requestQueue) {
|
|
583
|
+
this.requestQueue = await this._getRequestQueue();
|
|
584
|
+
this.requestManager = undefined;
|
|
585
|
+
}
|
|
586
|
+
if (!this.requestManager) {
|
|
587
|
+
this.requestManager =
|
|
588
|
+
this.requestList === undefined
|
|
589
|
+
? this.requestQueue
|
|
590
|
+
: new RequestManagerTandem(this.requestList, this.requestQueue);
|
|
591
|
+
}
|
|
482
592
|
return this.requestQueue;
|
|
483
593
|
}
|
|
484
594
|
async useState(defaultValue = {}) {
|
|
485
|
-
const kvs = await KeyValueStore.open(null, { config:
|
|
595
|
+
const kvs = await KeyValueStore.open(null, { config: serviceLocator.getConfiguration() });
|
|
596
|
+
if (this.hasExplicitId) {
|
|
597
|
+
const stateKey = `${BasicCrawler.CRAWLEE_STATE_KEY}_${this.crawlerId}`;
|
|
598
|
+
return kvs.getAutoSavedValue(stateKey, defaultValue);
|
|
599
|
+
}
|
|
600
|
+
BasicCrawler.useStateCrawlerIds.add(this.crawlerId);
|
|
601
|
+
if (BasicCrawler.useStateCrawlerIds.size > 1) {
|
|
602
|
+
serviceLocator
|
|
603
|
+
.getLogger()
|
|
604
|
+
.warningOnce('Multiple crawler instances are calling useState() without an explicit `id` option. \n' +
|
|
605
|
+
'This means they will share the same state object, which is likely unintended. \n' +
|
|
606
|
+
'To fix this, provide a unique `id` option to each crawler instance. \n' +
|
|
607
|
+
'Example: new BasicCrawler({ id: "my-crawler-1", ... })');
|
|
608
|
+
}
|
|
486
609
|
return kvs.getAutoSavedValue(BasicCrawler.CRAWLEE_STATE_KEY, defaultValue);
|
|
487
610
|
}
|
|
611
|
+
get pendingRequestCountApproximation() {
|
|
612
|
+
return this.requestManager?.getPendingCount() ?? 0;
|
|
613
|
+
}
|
|
614
|
+
calculateEnqueuedRequestLimit(explicitLimit) {
|
|
615
|
+
if (this.maxRequestsPerCrawl === undefined) {
|
|
616
|
+
return explicitLimit;
|
|
617
|
+
}
|
|
618
|
+
const limit = Math.max(0, this.maxRequestsPerCrawl - this.handledRequestsCount - this.pendingRequestCountApproximation);
|
|
619
|
+
return Math.min(limit, explicitLimit ?? Infinity);
|
|
620
|
+
}
|
|
621
|
+
async handleSkippedRequest(options) {
|
|
622
|
+
if (options.reason === 'limit' && this.shouldLogMaxEnqueuedRequestsExceeded) {
|
|
623
|
+
this.log.info('The number of requests enqueued by the crawler reached the maxRequestsPerCrawl limit of ' +
|
|
624
|
+
`${this.maxRequestsPerCrawl} requests and no further requests will be added.`);
|
|
625
|
+
this.shouldLogMaxEnqueuedRequestsExceeded = false;
|
|
626
|
+
}
|
|
627
|
+
if (options.reason === 'enqueueLimit') {
|
|
628
|
+
const enqueuedRequestLimit = this.calculateEnqueuedRequestLimit();
|
|
629
|
+
if (enqueuedRequestLimit === undefined || enqueuedRequestLimit !== 0) {
|
|
630
|
+
this.log.info('The number of requests enqueued by the crawler reached the enqueueLinks limit.');
|
|
631
|
+
}
|
|
632
|
+
}
|
|
633
|
+
await this.onSkippedRequest?.(options);
|
|
634
|
+
}
|
|
488
635
|
/**
|
|
489
636
|
* Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
|
|
490
637
|
* adding the rest in background. You can configure the batch size via `batchSize` option and the sleep time in between
|
|
@@ -497,33 +644,57 @@ export class BasicCrawler {
|
|
|
497
644
|
* @param options Options for the request queue
|
|
498
645
|
*/
|
|
499
646
|
async addRequests(requests, options = {}) {
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
const
|
|
505
|
-
const
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
647
|
+
await this.getRequestQueue();
|
|
648
|
+
const requestLimit = this.calculateEnqueuedRequestLimit();
|
|
649
|
+
const skippedBecauseOfRobots = new Set();
|
|
650
|
+
const skippedBecauseOfLimit = new Set();
|
|
651
|
+
const skippedBecauseOfMaxCrawlDepth = new Set();
|
|
652
|
+
const isAllowedBasedOnRobotsTxtFile = this.isAllowedBasedOnRobotsTxtFile.bind(this);
|
|
653
|
+
const maxCrawlDepth = this.maxCrawlDepth;
|
|
654
|
+
ow(requests, ow.object
|
|
655
|
+
.is((value) => isIterable(value) || isAsyncIterable(value))
|
|
656
|
+
.message((value) => `Expected an iterable or async iterable, got ${getObjectType(value)}`));
|
|
657
|
+
async function* filteredRequests() {
|
|
658
|
+
let yieldedRequestCount = 0;
|
|
659
|
+
for await (const request of requests) {
|
|
660
|
+
const url = typeof request === 'string' ? request : request.url;
|
|
661
|
+
if (requestLimit !== undefined && yieldedRequestCount >= requestLimit) {
|
|
662
|
+
skippedBecauseOfLimit.add(url);
|
|
663
|
+
continue;
|
|
664
|
+
}
|
|
665
|
+
if (maxCrawlDepth !== undefined && request.crawlDepth > maxCrawlDepth) {
|
|
666
|
+
skippedBecauseOfMaxCrawlDepth.add(url);
|
|
667
|
+
continue;
|
|
668
|
+
}
|
|
669
|
+
if (await isAllowedBasedOnRobotsTxtFile(url)) {
|
|
670
|
+
yield request;
|
|
671
|
+
yieldedRequestCount += 1;
|
|
672
|
+
}
|
|
673
|
+
else {
|
|
674
|
+
skippedBecauseOfRobots.add(url);
|
|
675
|
+
}
|
|
514
676
|
}
|
|
515
677
|
}
|
|
516
|
-
|
|
678
|
+
const result = await this.requestManager.addRequestsBatched(filteredRequests(), options);
|
|
679
|
+
if (skippedBecauseOfRobots.size > 0) {
|
|
517
680
|
this.log.warning(`Some requests were skipped because they were disallowed based on the robots.txt file`, {
|
|
518
|
-
skipped: [...
|
|
681
|
+
skipped: [...skippedBecauseOfRobots],
|
|
519
682
|
});
|
|
520
|
-
if (this.onSkippedRequest) {
|
|
521
|
-
await Promise.all([...skipped].map((url) => {
|
|
522
|
-
return this.onSkippedRequest({ url, reason: 'robotsTxt' });
|
|
523
|
-
}));
|
|
524
|
-
}
|
|
525
683
|
}
|
|
526
|
-
|
|
684
|
+
if (skippedBecauseOfRobots.size > 0 ||
|
|
685
|
+
skippedBecauseOfLimit.size > 0 ||
|
|
686
|
+
skippedBecauseOfMaxCrawlDepth.size > 0) {
|
|
687
|
+
await Promise.all([...skippedBecauseOfRobots]
|
|
688
|
+
.map((url) => {
|
|
689
|
+
return this.handleSkippedRequest({ url, reason: 'robotsTxt' });
|
|
690
|
+
})
|
|
691
|
+
.concat([...skippedBecauseOfLimit].map((url) => {
|
|
692
|
+
return this.handleSkippedRequest({ url, reason: 'limit' });
|
|
693
|
+
}), [...skippedBecauseOfMaxCrawlDepth].map((url) => {
|
|
694
|
+
return this.handleSkippedRequest({ url, reason: 'depth' });
|
|
695
|
+
})));
|
|
696
|
+
}
|
|
697
|
+
return result;
|
|
527
698
|
}
|
|
528
699
|
/**
|
|
529
700
|
* Pushes data to the specified {@link Dataset}, or the default crawler {@link Dataset} by calling {@link Dataset.pushData}.
|
|
@@ -536,7 +707,7 @@ export class BasicCrawler {
|
|
|
536
707
|
* Retrieves the specified {@link Dataset}, or the default crawler {@link Dataset}.
|
|
537
708
|
*/
|
|
538
709
|
async getDataset(idOrName) {
|
|
539
|
-
return Dataset.open(idOrName, { config:
|
|
710
|
+
return Dataset.open(idOrName, { config: serviceLocator.getConfiguration() });
|
|
540
711
|
}
|
|
541
712
|
/**
|
|
542
713
|
* Retrieves data from the default crawler {@link Dataset} by calling {@link Dataset.getData}.
|
|
@@ -563,7 +734,21 @@ export class BasicCrawler {
|
|
|
563
734
|
const dataset = await this.getDataset();
|
|
564
735
|
const items = await dataset.export(options);
|
|
565
736
|
if (format === 'csv') {
|
|
566
|
-
|
|
737
|
+
let value;
|
|
738
|
+
if (items.length === 0) {
|
|
739
|
+
value = '';
|
|
740
|
+
}
|
|
741
|
+
else {
|
|
742
|
+
const keys = options?.collectAllKeys
|
|
743
|
+
? Array.from(new Set(items.flatMap(Object.keys)))
|
|
744
|
+
: Object.keys(items[0]);
|
|
745
|
+
value = stringify([
|
|
746
|
+
keys,
|
|
747
|
+
...items.map((item) => {
|
|
748
|
+
return keys.map((k) => item[k]);
|
|
749
|
+
}),
|
|
750
|
+
]);
|
|
751
|
+
}
|
|
567
752
|
await ensureDir(dirname(path));
|
|
568
753
|
await writeFile(path, value);
|
|
569
754
|
this.log.info(`Export to ${path} finished!`);
|
|
@@ -575,24 +760,31 @@ export class BasicCrawler {
|
|
|
575
760
|
}
|
|
576
761
|
return items;
|
|
577
762
|
}
|
|
763
|
+
/**
|
|
764
|
+
* Initializes the crawler.
|
|
765
|
+
*/
|
|
578
766
|
async _init() {
|
|
579
|
-
|
|
580
|
-
|
|
767
|
+
const eventManager = serviceLocator.getEventManager();
|
|
768
|
+
if (!eventManager.isInitialized()) {
|
|
769
|
+
await eventManager.init();
|
|
581
770
|
this._closeEvents = true;
|
|
582
771
|
}
|
|
583
772
|
// Initialize AutoscaledPool before awaiting _loadHandledRequestCount(),
|
|
584
773
|
// so that the caller can get a reference to it before awaiting the promise returned from run()
|
|
585
774
|
// (otherwise there would be no way)
|
|
586
|
-
this.autoscaledPool = new AutoscaledPool(this.autoscaledPoolOptions
|
|
775
|
+
this.autoscaledPool = new AutoscaledPool(this.autoscaledPoolOptions);
|
|
587
776
|
if (this.useSessionPool) {
|
|
588
|
-
this.sessionPool = await SessionPool.open(this.sessionPoolOptions
|
|
777
|
+
this.sessionPool = await SessionPool.open(this.sessionPoolOptions);
|
|
589
778
|
// Assuming there are not more than 20 browsers running at once;
|
|
590
779
|
this.sessionPool.setMaxListeners(20);
|
|
591
780
|
}
|
|
781
|
+
await this.initializeRequestManager();
|
|
592
782
|
await this._loadHandledRequestCount();
|
|
593
783
|
}
|
|
594
|
-
async
|
|
595
|
-
await this.
|
|
784
|
+
async runRequestHandler(crawlingContext) {
|
|
785
|
+
await this.contextPipeline.call(crawlingContext, async (finalContext) => {
|
|
786
|
+
await addTimeoutToPromise(async () => this.requestHandler(finalContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${finalContext.request.id}).`);
|
|
787
|
+
});
|
|
596
788
|
}
|
|
597
789
|
/**
|
|
598
790
|
* Handles blocked request
|
|
@@ -608,7 +800,8 @@ export class BasicCrawler {
|
|
|
608
800
|
return true;
|
|
609
801
|
}
|
|
610
802
|
const robotsTxtFile = await this.getRobotsTxtFileForUrl(url);
|
|
611
|
-
|
|
803
|
+
const userAgent = typeof this.respectRobotsTxtFile === 'object' ? this.respectRobotsTxtFile?.userAgent : '*';
|
|
804
|
+
return !robotsTxtFile || robotsTxtFile.isAllowed(url, userAgent);
|
|
612
805
|
}
|
|
613
806
|
async getRobotsTxtFileForUrl(url) {
|
|
614
807
|
if (!this.respectRobotsTxtFile) {
|
|
@@ -662,36 +855,36 @@ export class BasicCrawler {
|
|
|
662
855
|
await Promise.all([requestListPersistPromise, this.stats.persistState()]);
|
|
663
856
|
}
|
|
664
857
|
/**
|
|
665
|
-
*
|
|
666
|
-
* and RequestQueue is present then enqueues it to the queue first.
|
|
858
|
+
* Initializes the RequestManager based on the configured requestList and requestQueue.
|
|
667
859
|
*/
|
|
668
|
-
async
|
|
669
|
-
if (
|
|
670
|
-
return
|
|
671
|
-
}
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
if (!request)
|
|
676
|
-
return this.requestQueue.fetchNextRequest();
|
|
677
|
-
try {
|
|
678
|
-
await this.requestQueue.addRequest(request, { forefront: true });
|
|
860
|
+
async initializeRequestManager() {
|
|
861
|
+
if (this.requestManager !== undefined) {
|
|
862
|
+
return;
|
|
863
|
+
}
|
|
864
|
+
if (this.requestList && this.requestQueue) {
|
|
865
|
+
// Create a RequestManagerTandem if both RequestList and RequestQueue are provided
|
|
866
|
+
this.requestManager = new RequestManagerTandem(this.requestList, this.requestQueue);
|
|
679
867
|
}
|
|
680
|
-
|
|
681
|
-
//
|
|
682
|
-
|
|
683
|
-
this.log.error('Adding of request from the RequestList to the RequestQueue failed, reclaiming request back to the list.', { request });
|
|
684
|
-
await this.requestList.reclaimRequest(request);
|
|
685
|
-
return null;
|
|
868
|
+
else if (this.requestQueue) {
|
|
869
|
+
// Use RequestQueue directly if only it is provided
|
|
870
|
+
this.requestManager = this.requestQueue;
|
|
686
871
|
}
|
|
687
|
-
|
|
688
|
-
|
|
872
|
+
else if (this.requestList) {
|
|
873
|
+
// Use RequestList directly if only it is provided
|
|
874
|
+
// Make it compatible with the IRequestManager interface
|
|
875
|
+
this.requestManager = new RequestListAdapter(this.requestList);
|
|
876
|
+
}
|
|
877
|
+
// If neither RequestList nor RequestQueue is provided, leave the requestManager uninitialized until `getRequestQueue` is called
|
|
689
878
|
}
|
|
690
879
|
/**
|
|
691
|
-
*
|
|
692
|
-
* Can be used to clean up orphaned browser pages.
|
|
880
|
+
* Fetches the next request to process from the underlying request provider.
|
|
693
881
|
*/
|
|
694
|
-
async
|
|
882
|
+
async _fetchNextRequest() {
|
|
883
|
+
if (this.requestManager === undefined) {
|
|
884
|
+
throw new Error(`_fetchNextRequest called on an uninitialized crawler`);
|
|
885
|
+
}
|
|
886
|
+
return this.requestManager.fetchNextRequest();
|
|
887
|
+
}
|
|
695
888
|
/**
|
|
696
889
|
* Delays processing of the request based on the `sameDomainDelaySecs` option,
|
|
697
890
|
* adding it back to the queue after the timeout passes. Returns `true` if the request
|
|
@@ -729,18 +922,21 @@ export class BasicCrawler {
|
|
|
729
922
|
* then retries them in a case of an error, etc.
|
|
730
923
|
*/
|
|
731
924
|
async _runTaskFunction() {
|
|
732
|
-
const source = this.
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
await this._timeoutAndRetry(
|
|
736
|
-
request = await this._fetchNextRequest();
|
|
737
|
-
}, this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
|
|
925
|
+
const source = this.requestManager;
|
|
926
|
+
if (!source)
|
|
927
|
+
throw new Error('Request provider is not initialized!');
|
|
928
|
+
const request = await this._timeoutAndRetry(this._fetchNextRequest.bind(this), this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
|
|
738
929
|
tryCancel();
|
|
739
|
-
|
|
740
|
-
await this._timeoutAndRetry(async () => {
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
930
|
+
const session = this.useSessionPool
|
|
931
|
+
? await this._timeoutAndRetry(async () => {
|
|
932
|
+
return await this.sessionPool.newSession({
|
|
933
|
+
proxyInfo: await this.proxyConfiguration?.newProxyInfo({
|
|
934
|
+
request: request ?? undefined,
|
|
935
|
+
}),
|
|
936
|
+
maxUsageCount: 1,
|
|
937
|
+
});
|
|
938
|
+
}, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`)
|
|
939
|
+
: undefined;
|
|
744
940
|
tryCancel();
|
|
745
941
|
if (!request || this.delayRequest(request, source)) {
|
|
746
942
|
return;
|
|
@@ -750,7 +946,7 @@ export class BasicCrawler {
|
|
|
750
946
|
request.state = RequestState.SKIPPED;
|
|
751
947
|
request.noRetry = true;
|
|
752
948
|
await source.markRequestHandled(request);
|
|
753
|
-
await this.
|
|
949
|
+
await this.handleSkippedRequest({
|
|
754
950
|
url: request.url,
|
|
755
951
|
reason: 'robotsTxt',
|
|
756
952
|
});
|
|
@@ -760,36 +956,34 @@ export class BasicCrawler {
|
|
|
760
956
|
request.loadedUrl = undefined;
|
|
761
957
|
const statisticsId = request.id || request.uniqueKey;
|
|
762
958
|
this.stats.startJob(statisticsId);
|
|
763
|
-
|
|
764
|
-
// @ts-expect-error
|
|
765
|
-
// All missing properties (that extend CrawlingContext) are set dynamically,
|
|
766
|
-
// but TS does not know that, so otherwise it would throw when compiling.
|
|
959
|
+
const deferredCleanup = [];
|
|
767
960
|
const crawlingContext = {
|
|
768
961
|
id: cryptoRandomObjectId(10),
|
|
769
|
-
crawler: this,
|
|
770
962
|
log: this.log,
|
|
771
963
|
request,
|
|
772
964
|
session,
|
|
965
|
+
proxyInfo: session?.proxyInfo,
|
|
773
966
|
enqueueLinks: async (options) => {
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
967
|
+
const requestQueue = await this.getRequestQueue();
|
|
968
|
+
return await this.enqueueLinksWithCrawlDepth(options, request, requestQueue);
|
|
969
|
+
},
|
|
970
|
+
addRequests: async (requests, options = {}) => {
|
|
971
|
+
const newCrawlDepth = request.crawlDepth + 1;
|
|
972
|
+
const requestsGenerator = this.addCrawlDepthRequestGenerator(requests, newCrawlDepth);
|
|
973
|
+
await this.addRequests(requestsGenerator, options);
|
|
781
974
|
},
|
|
782
|
-
addRequests: this.addRequests.bind(this),
|
|
783
975
|
pushData: this.pushData.bind(this),
|
|
784
976
|
useState: this.useState.bind(this),
|
|
785
|
-
sendRequest: createSendRequest(this.httpClient, request, session
|
|
786
|
-
getKeyValueStore: async (idOrName) => KeyValueStore.open(idOrName, { config:
|
|
977
|
+
sendRequest: createSendRequest(this.httpClient, request, session),
|
|
978
|
+
getKeyValueStore: async (idOrName) => KeyValueStore.open(idOrName, { config: serviceLocator.getConfiguration() }),
|
|
979
|
+
registerDeferredCleanup: (cleanup) => {
|
|
980
|
+
deferredCleanup.push(cleanup);
|
|
981
|
+
},
|
|
787
982
|
};
|
|
788
|
-
this.crawlingContexts.set(crawlingContext.id, crawlingContext);
|
|
789
983
|
let isRequestLocked = true;
|
|
790
984
|
try {
|
|
791
985
|
request.state = RequestState.REQUEST_HANDLER;
|
|
792
|
-
await
|
|
986
|
+
await this.runRequestHandler(crawlingContext);
|
|
793
987
|
await this._timeoutAndRetry(async () => source.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
|
|
794
988
|
isRequestLocked = false; // markRequestHandled succeeded and unlocked the request
|
|
795
989
|
this.stats.finishJob(statisticsId, request.retryCount);
|
|
@@ -798,7 +992,8 @@ export class BasicCrawler {
|
|
|
798
992
|
request.state = RequestState.DONE;
|
|
799
993
|
crawlingContext.session?.markGood();
|
|
800
994
|
}
|
|
801
|
-
catch (
|
|
995
|
+
catch (rawError) {
|
|
996
|
+
const err = this.unwrapError(rawError);
|
|
802
997
|
try {
|
|
803
998
|
request.state = RequestState.ERROR_HANDLER;
|
|
804
999
|
await addTimeoutToPromise(async () => this._requestFunctionErrorHandler(err, crawlingContext, source), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
|
|
@@ -808,24 +1003,24 @@ export class BasicCrawler {
|
|
|
808
1003
|
request.state = RequestState.DONE;
|
|
809
1004
|
}
|
|
810
1005
|
catch (secondaryError) {
|
|
811
|
-
|
|
1006
|
+
const unwrappedSecondaryError = this.unwrapError(secondaryError);
|
|
1007
|
+
if (!unwrappedSecondaryError.triggeredFromUserHandler &&
|
|
812
1008
|
// avoid reprinting the same critical error multiple times, as it will be printed by Nodejs at the end anyway
|
|
813
|
-
!(
|
|
1009
|
+
!(unwrappedSecondaryError instanceof CriticalError)) {
|
|
814
1010
|
const apifySpecific = process.env.APIFY_IS_AT_HOME
|
|
815
1011
|
? `This may have happened due to an internal error of Apify's API or due to a misconfigured crawler.`
|
|
816
1012
|
: '';
|
|
817
|
-
this.log.exception(
|
|
1013
|
+
this.log.exception(unwrappedSecondaryError, 'An exception occurred during handling of failed request. ' +
|
|
818
1014
|
`This places the crawler and its underlying storages into an unknown state and crawling will be terminated. ${apifySpecific}`);
|
|
819
1015
|
}
|
|
820
1016
|
request.state = RequestState.ERROR;
|
|
821
|
-
throw
|
|
1017
|
+
throw unwrappedSecondaryError;
|
|
822
1018
|
}
|
|
823
1019
|
// decrease the session score if the request fails (but the error handler did not throw)
|
|
824
1020
|
crawlingContext.session?.markBad();
|
|
825
1021
|
}
|
|
826
1022
|
finally {
|
|
827
|
-
await
|
|
828
|
-
this.crawlingContexts.delete(crawlingContext.id);
|
|
1023
|
+
await Promise.all(deferredCleanup.map((cleanup) => cleanup()));
|
|
829
1024
|
// Safety net - release the lock if nobody managed to do it before
|
|
830
1025
|
if (isRequestLocked && source instanceof RequestProvider) {
|
|
831
1026
|
try {
|
|
@@ -838,19 +1033,63 @@ export class BasicCrawler {
|
|
|
838
1033
|
}
|
|
839
1034
|
}
|
|
840
1035
|
/**
|
|
841
|
-
*
|
|
1036
|
+
* Wrapper around the crawling context's `enqueueLinks` method:
|
|
1037
|
+
* - Injects `crawlDepth` to each request being added based on the crawling context request.
|
|
1038
|
+
* - Provides defaults for the `enqueueLinks` options based on the crawler configuration.
|
|
1039
|
+
* - These options can be overridden by the user.
|
|
1040
|
+
* @internal
|
|
1041
|
+
*/
|
|
1042
|
+
async enqueueLinksWithCrawlDepth(options, request, requestQueue) {
|
|
1043
|
+
const transformRequestFunctionWrapper = (requestOptions) => {
|
|
1044
|
+
requestOptions.crawlDepth = request.crawlDepth + 1;
|
|
1045
|
+
if (this.maxCrawlDepth !== undefined && requestOptions.crawlDepth > this.maxCrawlDepth) {
|
|
1046
|
+
// Setting `skippedReason` before returning `false` ensures that `reportSkippedRequests`
|
|
1047
|
+
// reports `'depth'` as the reason (via `request.skippedReason ?? reason` fallback),
|
|
1048
|
+
// rather than the generic `'transform'` reason.
|
|
1049
|
+
requestOptions.skippedReason = 'depth';
|
|
1050
|
+
return false;
|
|
1051
|
+
}
|
|
1052
|
+
// After injecting the crawlDepth, we call the user-provided transform function, if there is one.
|
|
1053
|
+
return options.transformRequestFunction?.(requestOptions) ?? requestOptions;
|
|
1054
|
+
};
|
|
1055
|
+
return await enqueueLinks({
|
|
1056
|
+
requestQueue,
|
|
1057
|
+
robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
|
|
1058
|
+
onSkippedRequest: this.handleSkippedRequest,
|
|
1059
|
+
limit: this.calculateEnqueuedRequestLimit(options.limit),
|
|
1060
|
+
// Allow user options to override defaults set above ⤴
|
|
1061
|
+
...options,
|
|
1062
|
+
transformRequestFunction: transformRequestFunctionWrapper,
|
|
1063
|
+
});
|
|
1064
|
+
}
|
|
1065
|
+
/**
|
|
1066
|
+
* Generator function that yields requests injected with the given crawl depth.
|
|
1067
|
+
* @internal
|
|
1068
|
+
*/
|
|
1069
|
+
async *addCrawlDepthRequestGenerator(requests, newRequestDepth) {
|
|
1070
|
+
for await (const request of requests) {
|
|
1071
|
+
if (typeof request === 'string') {
|
|
1072
|
+
yield { url: request, crawlDepth: newRequestDepth };
|
|
1073
|
+
}
|
|
1074
|
+
else {
|
|
1075
|
+
request.crawlDepth ??= newRequestDepth;
|
|
1076
|
+
yield request;
|
|
1077
|
+
}
|
|
1078
|
+
}
|
|
1079
|
+
}
|
|
1080
|
+
/**
|
|
1081
|
+
* Run async callback with given timeout and retry. Returns the result of the callback.
|
|
842
1082
|
* @ignore
|
|
843
1083
|
*/
|
|
844
1084
|
async _timeoutAndRetry(handler, timeout, error, maxRetries = 3, retried = 1) {
|
|
845
1085
|
try {
|
|
846
|
-
await addTimeoutToPromise(handler, timeout, error);
|
|
1086
|
+
return await addTimeoutToPromise(handler, timeout, error);
|
|
847
1087
|
}
|
|
848
1088
|
catch (e) {
|
|
849
1089
|
if (retried <= maxRetries) {
|
|
850
1090
|
// we retry on any error, not just timeout
|
|
851
1091
|
this.log.warning(`${e.message} (retrying ${retried}/${maxRetries})`);
|
|
852
|
-
|
|
853
|
-
return;
|
|
1092
|
+
return this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1);
|
|
854
1093
|
}
|
|
855
1094
|
throw e;
|
|
856
1095
|
}
|
|
@@ -859,24 +1098,13 @@ export class BasicCrawler {
|
|
|
859
1098
|
* Returns true if either RequestList or RequestQueue have a request ready for processing.
|
|
860
1099
|
*/
|
|
861
1100
|
async _isTaskReadyFunction() {
|
|
862
|
-
|
|
863
|
-
const isRequestListEmpty = this.requestList ? await this.requestList.isEmpty() : true;
|
|
864
|
-
// If RequestList is not empty, task is ready, no reason to check RequestQueue.
|
|
865
|
-
if (!isRequestListEmpty)
|
|
866
|
-
return true;
|
|
867
|
-
// If RequestQueue is not empty, task is ready, return true, otherwise false.
|
|
868
|
-
return this.requestQueue ? !(await this.requestQueue.isEmpty()) : false;
|
|
1101
|
+
return this.requestManager !== undefined && !(await this.requestManager.isEmpty());
|
|
869
1102
|
}
|
|
870
1103
|
/**
|
|
871
1104
|
* Returns true if both RequestList and RequestQueue have all requests finished.
|
|
872
1105
|
*/
|
|
873
1106
|
async _defaultIsFinishedFunction() {
|
|
874
|
-
|
|
875
|
-
this.requestList ? this.requestList.isFinished() : true,
|
|
876
|
-
this.requestQueue ? this.requestQueue.isFinished() : true,
|
|
877
|
-
]);
|
|
878
|
-
// If both are finished, return true, otherwise return false.
|
|
879
|
-
return isRequestListFinished && isRequestQueueFinished;
|
|
1107
|
+
return !this.requestManager || (await this.requestManager.isFinished());
|
|
880
1108
|
}
|
|
881
1109
|
async _rotateSession(crawlingContext) {
|
|
882
1110
|
const { request } = crawlingContext;
|
|
@@ -884,6 +1112,18 @@ export class BasicCrawler {
|
|
|
884
1112
|
request.sessionRotationCount++;
|
|
885
1113
|
crawlingContext.session?.retire();
|
|
886
1114
|
}
|
|
1115
|
+
/**
|
|
1116
|
+
* Unwraps errors thrown by the context pipeline to get the actual user error.
|
|
1117
|
+
* RequestHandlerError and ContextPipelineInitializationError wrap the actual error.
|
|
1118
|
+
*/
|
|
1119
|
+
unwrapError(error) {
|
|
1120
|
+
if (error instanceof RequestHandlerError ||
|
|
1121
|
+
error instanceof ContextPipelineInitializationError ||
|
|
1122
|
+
error instanceof ContextPipelineCleanupError) {
|
|
1123
|
+
return this.unwrapError(error.cause);
|
|
1124
|
+
}
|
|
1125
|
+
return error;
|
|
1126
|
+
}
|
|
887
1127
|
/**
|
|
888
1128
|
* Handles errors thrown by user provided requestHandler()
|
|
889
1129
|
*/
|
|
@@ -896,7 +1136,8 @@ export class BasicCrawler {
|
|
|
896
1136
|
const shouldRetryRequest = this._canRequestBeRetried(request, error);
|
|
897
1137
|
if (shouldRetryRequest) {
|
|
898
1138
|
await this.stats.errorTrackerRetry.addAsync(error, crawlingContext);
|
|
899
|
-
await this.errorHandler?.(crawlingContext,
|
|
1139
|
+
await this.errorHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
|
|
1140
|
+
error);
|
|
900
1141
|
if (error instanceof SessionError) {
|
|
901
1142
|
await this._rotateSession(crawlingContext);
|
|
902
1143
|
}
|
|
@@ -948,7 +1189,8 @@ export class BasicCrawler {
|
|
|
948
1189
|
const message = this._getMessageFromError(error, true);
|
|
949
1190
|
this.log.error(`Request failed and reached maximum retries. ${message}`, { id, url, method, uniqueKey });
|
|
950
1191
|
if (this.failedRequestHandler) {
|
|
951
|
-
await this.failedRequestHandler?.(crawlingContext,
|
|
1192
|
+
await this.failedRequestHandler?.(crawlingContext, // valid cast - ExtendedContext transitively extends CrawlingContext
|
|
1193
|
+
error);
|
|
952
1194
|
}
|
|
953
1195
|
}
|
|
954
1196
|
/**
|
|
@@ -986,19 +1228,11 @@ export class BasicCrawler {
|
|
|
986
1228
|
return request.retryCount < maxRequestRetries;
|
|
987
1229
|
}
|
|
988
1230
|
/**
|
|
989
|
-
* Updates handledRequestsCount from possibly stored counts,
|
|
990
|
-
* usually after worker migration. Since one of the stores
|
|
991
|
-
* needs to have priority when both are present,
|
|
992
|
-
* it is the request queue, because generally, the request
|
|
993
|
-
* list will first be dumped into the queue and then left
|
|
994
|
-
* empty.
|
|
1231
|
+
* Updates handledRequestsCount from possibly stored counts, usually after worker migration.
|
|
995
1232
|
*/
|
|
996
1233
|
async _loadHandledRequestCount() {
|
|
997
|
-
if (this.
|
|
998
|
-
this.handledRequestsCount = await this.
|
|
999
|
-
}
|
|
1000
|
-
else if (this.requestList) {
|
|
1001
|
-
this.handledRequestsCount = this.requestList.handledCount();
|
|
1234
|
+
if (this.requestManager) {
|
|
1235
|
+
this.handledRequestsCount = await this.requestManager.handledCount();
|
|
1002
1236
|
}
|
|
1003
1237
|
}
|
|
1004
1238
|
async _executeHooks(hooks, ...args) {
|
|
@@ -1009,16 +1243,17 @@ export class BasicCrawler {
|
|
|
1009
1243
|
}
|
|
1010
1244
|
}
|
|
1011
1245
|
/**
|
|
1012
|
-
*
|
|
1013
|
-
*
|
|
1246
|
+
* Stops the crawler immediately.
|
|
1247
|
+
*
|
|
1248
|
+
* This method doesn't wait for currently active requests to finish.
|
|
1249
|
+
*
|
|
1250
|
+
* To stop the crawler gracefully (waiting for all running requests to finish), use {@link BasicCrawler.stop|`crawler.stop()`} instead.
|
|
1014
1251
|
*/
|
|
1015
1252
|
async teardown() {
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
await this.sessionPool.teardown();
|
|
1019
|
-
}
|
|
1253
|
+
serviceLocator.getEventManager().emit("persistState" /* EventType.PERSIST_STATE */, { isMigrating: false });
|
|
1254
|
+
await this.sessionPool?.teardown();
|
|
1020
1255
|
if (this._closeEvents) {
|
|
1021
|
-
await
|
|
1256
|
+
await serviceLocator.getEventManager().close();
|
|
1022
1257
|
}
|
|
1023
1258
|
await this.autoscaledPool?.abort();
|
|
1024
1259
|
}
|
|
@@ -1036,9 +1271,9 @@ export class BasicCrawler {
|
|
|
1036
1271
|
this.log.info('Using the old RequestQueue implementation without request locking.');
|
|
1037
1272
|
this._experimentWarnings.requestLocking = true;
|
|
1038
1273
|
}
|
|
1039
|
-
return RequestQueueV1.open(null, { config:
|
|
1274
|
+
return RequestQueueV1.open(null, { config: serviceLocator.getConfiguration() });
|
|
1040
1275
|
}
|
|
1041
|
-
return RequestQueue.open(null, { config:
|
|
1276
|
+
return RequestQueue.open(null, { config: serviceLocator.getConfiguration() });
|
|
1042
1277
|
}
|
|
1043
1278
|
requestMatchesEnqueueStrategy(request) {
|
|
1044
1279
|
const { url, loadedUrl } = request;
|