@crawlee/basic 3.13.5 → 4.0.0-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.d.ts +2 -2
- package/index.d.ts.map +1 -1
- package/index.js +3 -6
- package/index.js.map +1 -1
- package/internals/basic-crawler.d.ts +0 -57
- package/internals/basic-crawler.d.ts.map +1 -1
- package/internals/basic-crawler.js +181 -443
- package/internals/basic-crawler.js.map +1 -1
- package/internals/constants.js +1 -4
- package/internals/constants.js.map +1 -1
- package/internals/send-request.d.ts +1 -1
- package/internals/send-request.d.ts.map +1 -1
- package/internals/send-request.js +4 -7
- package/internals/send-request.js.map +1 -1
- package/package.json +19 -25
- package/tsconfig.build.tsbuildinfo +1 -1
- package/index.mjs +0 -91
--- package/internals/basic-crawler.js (3.13.5)
+++ package/internals/basic-crawler.js (4.0.0-beta.0)
@@ -1,20 +1,16 @@
- … (removed lines 1-13, the CommonJS prologue and require statements, are not captured in this diff view)
-const log_1 = tslib_1.__importStar(require("@apify/log"));
-const timeout_1 = require("@apify/timeout");
-const utilities_1 = require("@apify/utilities");
-const send_request_1 = require("./send-request");
+import { writeFile } from 'node:fs/promises';
+import { dirname } from 'node:path';
+import { AutoscaledPool, Configuration, CriticalError, Dataset, enqueueLinks, EnqueueStrategy, GotScrapingHttpClient, KeyValueStore, mergeCookies, NonRetryableError, purgeDefaultStorages, RequestProvider, RequestQueue, RequestQueueV1, RequestState, RetryRequestError, Router, SessionError, SessionPool, Statistics, validators, } from '@crawlee/core';
+import { RobotsTxtFile, ROTATE_PROXY_ERRORS } from '@crawlee/utils';
+import { stringify } from 'csv-stringify/sync';
+import { ensureDir, writeJSON } from 'fs-extra/esm';
+import ow from 'ow';
+import { getDomain } from 'tldts';
+import { LruCache } from '@apify/datastructures';
+import defaultLog, { LogLevel } from '@apify/log';
+import { addTimeoutToPromise, TimeoutError, tryCancel } from '@apify/timeout';
+import { cryptoRandomObjectId } from '@apify/utilities';
+import { createSendRequest } from './send-request.js';
 /**
  * Since there's no set number of seconds before the container is terminated after
  * a migration event, we need some reasonable number to use for RequestList persistence.
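
Together with the deletion of `package/index.mjs` above, this hunk shows the compiled output switching from CommonJS to native ESM (note the `.js` specifier on the relative import, required in ESM). A minimal consumer-side sketch of what that implies, assuming v4 no longer ships a CJS entry point (which this diff suggests, though the package.json diff would confirm it):

// v3.x: CommonJS consumers could require the package directly.
// const { BasicCrawler } = require('@crawlee/basic');

// v4.0.0-beta.0: the package is ESM, so use import
// (a CJS project would need a dynamic import() instead).
import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    async requestHandler({ request, log }) {
        log.info(`Processing ${request.url}`);
    },
});
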
@@ -89,308 +85,137 @@ const SAFE_MIGRATION_WAIT_MILLIS = 20000;
  * ```
  * @category Crawlers
  */
-class BasicCrawler {
+export class BasicCrawler {
+    config;
+    static CRAWLEE_STATE_KEY = 'CRAWLEE_STATE';
+    /**
+     * A reference to the underlying {@link Statistics} class that collects and logs run statistics for requests.
+     */
+    stats;
+    /**
+     * A reference to the underlying {@link RequestList} class that manages the crawler's {@link Request|requests}.
+     * Only available if used by the crawler.
+     */
+    requestList;
+    /**
+     * Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
+     * A reference to the underlying {@link RequestQueue} class that manages the crawler's {@link Request|requests}.
+     * Only available if used by the crawler.
+     */
+    requestQueue;
+    /**
+     * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session|sessions}.
+     * Only available if used by the crawler.
+     */
+    sessionPool;
+    /**
+     * A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
+     * > *NOTE:* This property is only initialized after calling the {@link BasicCrawler.run|`crawler.run()`} function.
+     * We can use it to change the concurrency settings on the fly,
+     * to pause the crawler by calling {@link AutoscaledPool.pause|`autoscaledPool.pause()`}
+     * or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
+     */
+    autoscaledPool;
+    /**
+     * Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
+     * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
+     */
+    router = Router.create();
+    running = false;
+    hasFinishedBefore = false;
+    log;
+    requestHandler;
+    errorHandler;
+    failedRequestHandler;
+    requestHandlerTimeoutMillis;
+    internalTimeoutMillis;
+    maxRequestRetries;
+    sameDomainDelayMillis;
+    domainAccessedTime;
+    maxSessionRotations;
+    handledRequestsCount;
+    statusMessageLoggingInterval;
+    statusMessageCallback;
+    sessionPoolOptions;
+    useSessionPool;
+    crawlingContexts = new Map();
+    autoscaledPoolOptions;
+    events;
+    httpClient;
+    retryOnBlocked;
+    respectRobotsTxtFile;
+    onSkippedRequest;
+    _closeEvents;
+    experiments;
+    robotsTxtFileCache;
+    _experimentWarnings = {};
+    static optionsShape = {
+        requestList: ow.optional.object.validate(validators.requestList),
+        requestQueue: ow.optional.object.validate(validators.requestQueue),
+        // Subclasses override this function instead of passing it
+        // in constructor, so this validation needs to apply only
+        // if the user creates an instance of BasicCrawler directly.
+        requestHandler: ow.optional.function,
+        requestHandlerTimeoutSecs: ow.optional.number,
+        errorHandler: ow.optional.function,
+        failedRequestHandler: ow.optional.function,
+        maxRequestRetries: ow.optional.number,
+        sameDomainDelaySecs: ow.optional.number,
+        maxSessionRotations: ow.optional.number,
+        maxRequestsPerCrawl: ow.optional.number,
+        autoscaledPoolOptions: ow.optional.object,
+        sessionPoolOptions: ow.optional.object,
+        useSessionPool: ow.optional.boolean,
+        statusMessageLoggingInterval: ow.optional.number,
+        statusMessageCallback: ow.optional.function,
+        retryOnBlocked: ow.optional.boolean,
+        respectRobotsTxtFile: ow.optional.boolean,
+        onSkippedRequest: ow.optional.function,
+        httpClient: ow.optional.object,
+        // AutoscaledPool shorthands
+        minConcurrency: ow.optional.number,
+        maxConcurrency: ow.optional.number,
+        maxRequestsPerMinute: ow.optional.number.integerOrInfinite.positive.greaterThanOrEqual(1),
+        keepAlive: ow.optional.boolean,
+        // internal
+        log: ow.optional.object,
+        experiments: ow.optional.object,
+        statisticsOptions: ow.optional.object,
+    };
     /**
      * All `BasicCrawler` parameters are passed via an options object.
      */
-    constructor(options = {}, config = core_1.Configuration.getGlobalConfig()) {
-        Object.defineProperty(this, "config", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: config
-        });
-        /**
-         * A reference to the underlying {@link Statistics} class that collects and logs run statistics for requests.
-         */
-        Object.defineProperty(this, "stats", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        /**
-         * A reference to the underlying {@link RequestList} class that manages the crawler's {@link Request|requests}.
-         * Only available if used by the crawler.
-         */
-        Object.defineProperty(this, "requestList", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        /**
-         * Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
-         * A reference to the underlying {@link RequestQueue} class that manages the crawler's {@link Request|requests}.
-         * Only available if used by the crawler.
-         */
-        Object.defineProperty(this, "requestQueue", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        /**
-         * A reference to the underlying {@link SessionPool} class that manages the crawler's {@link Session|sessions}.
-         * Only available if used by the crawler.
-         */
-        Object.defineProperty(this, "sessionPool", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        /**
-         * A reference to the underlying {@link AutoscaledPool} class that manages the concurrency of the crawler.
-         * > *NOTE:* This property is only initialized after calling the {@link BasicCrawler.run|`crawler.run()`} function.
-         * We can use it to change the concurrency settings on the fly,
-         * to pause the crawler by calling {@link AutoscaledPool.pause|`autoscaledPool.pause()`}
-         * or to abort it by calling {@link AutoscaledPool.abort|`autoscaledPool.abort()`}.
-         */
-        Object.defineProperty(this, "autoscaledPool", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        /**
-         * Default {@link Router} instance that will be used if we don't specify any {@link BasicCrawlerOptions.requestHandler|`requestHandler`}.
-         * See {@link Router.addHandler|`router.addHandler()`} and {@link Router.addDefaultHandler|`router.addDefaultHandler()`}.
-         */
-        Object.defineProperty(this, "router", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: core_1.Router.create()
-        });
-        Object.defineProperty(this, "running", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: false
-        });
-        Object.defineProperty(this, "hasFinishedBefore", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: false
-        });
-        Object.defineProperty(this, "log", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "requestHandler", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "errorHandler", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "failedRequestHandler", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "requestHandlerTimeoutMillis", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "internalTimeoutMillis", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "maxRequestRetries", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "sameDomainDelayMillis", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "domainAccessedTime", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "maxSessionRotations", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "handledRequestsCount", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "statusMessageLoggingInterval", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "statusMessageCallback", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "sessionPoolOptions", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "useSessionPool", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "crawlingContexts", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: new Map()
-        });
-        Object.defineProperty(this, "autoscaledPoolOptions", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "events", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "httpClient", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "retryOnBlocked", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "respectRobotsTxtFile", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "onSkippedRequest", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "_closeEvents", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "experiments", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "robotsTxtFileCache", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "_experimentWarnings", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: {}
-        });
-        (0, ow_1.default)(options, 'BasicCrawlerOptions', ow_1.default.object.exactShape(BasicCrawler.optionsShape));
+    constructor(options = {}, config = Configuration.getGlobalConfig()) {
+        this.config = config;
+        ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape));
         const { requestList, requestQueue, maxRequestRetries = 3, sameDomainDelaySecs = 0, maxSessionRotations = 10, maxRequestsPerCrawl, autoscaledPoolOptions = {}, keepAlive, sessionPoolOptions = {}, useSessionPool = true,
         // AutoscaledPool shorthands
-        minConcurrency, maxConcurrency, maxRequestsPerMinute, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest,
+        minConcurrency, maxConcurrency, maxRequestsPerMinute, retryOnBlocked = false, respectRobotsTxtFile = false, onSkippedRequest, requestHandler, requestHandlerTimeoutSecs, errorHandler, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient,
        // internal
-        log = log_1.default.child({ prefix: this.constructor.name }), experiments = {},
-        // Old and new request handler methods
-        handleRequestFunction, requestHandler, handleRequestTimeoutSecs, requestHandlerTimeoutSecs, errorHandler, handleFailedRequestFunction, failedRequestHandler, statusMessageLoggingInterval = 10, statusMessageCallback, statisticsOptions, httpClient, } = options;
+        log = defaultLog.child({ prefix: this.constructor.name }), experiments = {}, } = options;
         this.requestList = requestList;
         this.requestQueue = requestQueue;
-        this.httpClient = httpClient ?? new core_1.GotScrapingHttpClient();
+        this.httpClient = httpClient ?? new GotScrapingHttpClient();
         this.log = log;
         this.statusMessageLoggingInterval = statusMessageLoggingInterval;
         this.statusMessageCallback = statusMessageCallback;
         this.events = config.getEventManager();
         this.domainAccessedTime = new Map();
         this.experiments = experiments;
-        this.robotsTxtFileCache = new datastructures_1.LruCache({ maxLength: 1000 });
-        this._handlePropertyNameChange({
-            newName: 'requestHandler',
-            oldName: 'handleRequestFunction',
-            propertyKey: 'requestHandler',
-            newProperty: requestHandler,
-            oldProperty: handleRequestFunction,
-            allowUndefined: true, // fallback to the default router
-        });
-        if (!this.requestHandler) {
-            this.requestHandler = this.router;
-        }
+        this.robotsTxtFileCache = new LruCache({ maxLength: 1000 });
+        // FIXME any
+        this.requestHandler = requestHandler ?? this.router;
+        this.failedRequestHandler = failedRequestHandler;
         this.errorHandler = errorHandler;
-        this._handlePropertyNameChange({
-            newName: 'failedRequestHandler',
-            oldName: 'handleFailedRequestFunction',
-            propertyKey: 'failedRequestHandler',
-            newProperty: failedRequestHandler,
-            oldProperty: handleFailedRequestFunction,
-            allowUndefined: true,
-        });
-        let newRequestHandlerTimeout;
-        if (!handleRequestTimeoutSecs) {
-            if (!requestHandlerTimeoutSecs) {
-                newRequestHandlerTimeout = 60000;
-            }
-            else {
-                newRequestHandlerTimeout = requestHandlerTimeoutSecs * 1000;
-            }
+        if (requestHandlerTimeoutSecs) {
+            this.requestHandlerTimeoutMillis = requestHandlerTimeoutSecs * 1000;
         }
-        else if (requestHandlerTimeoutSecs) {
-            newRequestHandlerTimeout = requestHandlerTimeoutSecs * 1000;
+        else {
+            this.requestHandlerTimeoutMillis = 60_000;
         }
         this.retryOnBlocked = retryOnBlocked;
         this.respectRobotsTxtFile = respectRobotsTxtFile;
         this.onSkippedRequest = onSkippedRequest;
-        this._handlePropertyNameChange({
-            newName: 'requestHandlerTimeoutSecs',
-            oldName: 'handleRequestTimeoutSecs',
-            propertyKey: 'requestHandlerTimeoutMillis',
-            newProperty: newRequestHandlerTimeout,
-            oldProperty: handleRequestTimeoutSecs ? handleRequestTimeoutSecs * 1000 : undefined,
-        });
         const tryEnv = (val) => (val == null ? null : +val);
         // allow at least 5min for internal timeouts
         this.internalTimeoutMillis =
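
Two things happen in this hunk: the old `Object.defineProperty`-based class-member emit collapses into native class fields (which is most of the file's -443/+181 line delta), and the long-deprecated option aliases disappear from both the destructuring and `optionsShape`. A migration sketch for the removed aliases, assuming v4 now rejects them outright since `ow.object.exactShape` no longer lists them:

// 3.x accepted both spellings and warned via _handlePropertyNameChange:
const oldOptions = {
    handleRequestFunction: async (ctx) => { /* ... */ },        // removed in 4.0
    handleRequestTimeoutSecs: 120,                              // removed in 4.0
    handleFailedRequestFunction: async (ctx) => { /* ... */ },  // removed in 4.0
};

// 4.0.0-beta.0 keeps only the current names:
const newOptions = {
    requestHandler: async (ctx) => { /* ... */ },
    requestHandlerTimeoutSecs: 120,
    failedRequestHandler: async (ctx, error) => { /* ... */ },
};
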
@@ -406,7 +231,7 @@ class BasicCrawler {
         this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
         this.maxSessionRotations = maxSessionRotations;
         this.handledRequestsCount = 0;
-        this.stats = new core_1.Statistics({
+        this.stats = new Statistics({
             logMessage: `${log.getOptions().prefix} request statistics:`,
             log,
             config,
@@ -484,7 +309,7 @@ class BasicCrawler {
      * @param error The error to check.
      */
    isProxyError(error) {
-        return utils_1.ROTATE_PROXY_ERRORS.some((x) => this._getMessageFromError(error)?.includes(x));
+        return ROTATE_PROXY_ERRORS.some((x) => this._getMessageFromError(error)?.includes(x));
    }
    /**
     * Checks whether the given crawling context is getting blocked by anti-bot protection using several heuristics.
@@ -499,13 +324,13 @@ class BasicCrawler {
      */
    async setStatusMessage(message, options = {}) {
        const data = options.isStatusMessageTerminal != null ? { terminal: options.isStatusMessageTerminal } : undefined;
-        this.log.internal(log_1.LogLevel[options.level ?? 'DEBUG'], message, data);
+        this.log.internal(LogLevel[options.level ?? 'DEBUG'], message, data);
        const client = this.config.getStorageClient();
        if (!client.setStatusMessage) {
            return;
        }
        // just to be sure, this should be fast
-        await (0, timeout_1.addTimeoutToPromise)(async () => client.setStatusMessage(message, options), 1000, 'Setting status message timed out after 1s').catch((e) => this.log.debug(e.message));
+        await addTimeoutToPromise(async () => client.setStatusMessage(message, options), 1000, 'Setting status message timed out after 1s').catch((e) => this.log.debug(e.message));
    }
    getPeriodicLogger() {
        let previousState = { ...this.stats.state };
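
`setStatusMessage` guards the storage-client call with `addTimeoutToPromise` from `@apify/timeout`. Based on how it is invoked here (a promise-returning function, a millisecond budget, an error message), a standalone usage sketch:

import { addTimeoutToPromise, TimeoutError } from '@apify/timeout';

try {
    // Rejects with a TimeoutError if the wrapped call takes longer than 5 seconds.
    const text = await addTimeoutToPromise(
        async () => (await fetch('https://example.com')).text(),
        5_000,
        'Fetching example.com timed out after 5 seconds.',
    );
    console.log(text.length);
} catch (err) {
    // The crawler swallows this failure for status messages; here we just log it.
    if (err instanceof TimeoutError) console.warn(err.message);
    else throw err;
}
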
@@ -571,7 +396,7 @@ class BasicCrawler {
            await this.sessionPool?.resetStore();
        }
        this.running = true;
-        await (0, core_1.purgeDefaultStorages)({ onlyPurgeOnce: true });
+        await purgeDefaultStorages({ onlyPurgeOnce: true });
        if (requests) {
            await this.addRequests(requests, addRequestsOptions);
        }
@@ -653,11 +478,11 @@ class BasicCrawler {
        if (!this.requestQueue && this.requestList) {
            this.log.warningOnce('When using RequestList and RequestQueue at the same time, you should instantiate both explicitly and provide them in the crawler options, to ensure correctly handled restarts of the crawler.');
        }
-        this.requestQueue …
+        this.requestQueue ??= await this._getRequestQueue();
        return this.requestQueue;
    }
    async useState(defaultValue = {}) {
-        const kvs = await core_1.KeyValueStore.open(null, { config: this.config });
+        const kvs = await KeyValueStore.open(null, { config: this.config });
        return kvs.getAutoSavedValue(BasicCrawler.CRAWLEE_STATE_KEY, defaultValue);
    }
    /**
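
`useState()` is backed by `KeyValueStore.getAutoSavedValue()` under the static `CRAWLEE_STATE` key, so the value persists across crawler restarts and migrations. A sketch of using it from a handler (the crawling context exposes a bound `useState`, as the context construction further down shows):

const crawler = new BasicCrawler({
    async requestHandler({ request, useState }) {
        // First call creates the value; later calls return the same auto-saved object.
        const state = await useState({ processedCount: 0 });
        state.processedCount += 1; // mutations are persisted automatically
    },
});
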
@@ -711,7 +536,7 @@ class BasicCrawler {
     * Retrieves the specified {@link Dataset}, or the default crawler {@link Dataset}.
     */
    async getDataset(idOrName) {
-        return core_1.Dataset.open(idOrName, { config: this.config });
+        return Dataset.open(idOrName, { config: this.config });
    }
    /**
     * Retrieves data from the default crawler {@link Dataset} by calling {@link Dataset.getData}.
@@ -738,14 +563,14 @@ class BasicCrawler {
        const dataset = await this.getDataset();
        const items = await dataset.export(options);
        if (format === 'csv') {
-            const value = …
-            await …
-            await …
+            const value = stringify([Object.keys(items[0]), ...items.map((item) => Object.values(item))]);
+            await ensureDir(dirname(path));
+            await writeFile(path, value);
            this.log.info(`Export to ${path} finished!`);
        }
        if (format === 'json') {
-            await …
-            await …
+            await ensureDir(dirname(path));
+            await writeJSON(path, items, { spaces: 4 });
            this.log.info(`Export to ${path} finished!`);
        }
        return items;
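
`exportData()` writes CSV via csv-stringify (the header row comes from `Object.keys()` of the first item) or JSON via fs-extra's `writeJSON` with 4-space indentation, creating parent directories first. A usage sketch, assuming the format is inferred from the file extension as in current Crawlee docs:

const crawler = new BasicCrawler({
    async requestHandler({ request, pushData }) {
        await pushData({ url: request.url });
    },
});
await crawler.run(['https://example.com']);

// CSV columns come from the first item's keys, so keep items homogeneous.
const items = await crawler.exportData('./storage/results.csv');
console.log(`Exported ${items.length} items`);
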
@@ -758,9 +583,9 @@ class BasicCrawler {
        // Initialize AutoscaledPool before awaiting _loadHandledRequestCount(),
        // so that the caller can get a reference to it before awaiting the promise returned from run()
        // (otherwise there would be no way)
-        this.autoscaledPool = new core_1.AutoscaledPool(this.autoscaledPoolOptions, this.config);
+        this.autoscaledPool = new AutoscaledPool(this.autoscaledPoolOptions, this.config);
        if (this.useSessionPool) {
-            this.sessionPool = await core_1.SessionPool.open(this.sessionPoolOptions, this.config);
+            this.sessionPool = await SessionPool.open(this.sessionPoolOptions, this.config);
            // Assuming there are not more than 20 browsers running at once;
            this.sessionPool.setMaxListeners(20);
        }
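
`run()` instantiates the `AutoscaledPool` eagerly, so callers can grab the reference before awaiting the returned promise, and opens a `SessionPool` whenever `useSessionPool` is on. Tuning goes through `sessionPoolOptions`; a sketch with commonly used options:

const crawler = new BasicCrawler({
    useSessionPool: true, // the default
    sessionPoolOptions: {
        maxPoolSize: 50,                        // rotate among up to 50 sessions
        sessionOptions: { maxUsageCount: 30 },  // retire a session after 30 uses
    },
    async requestHandler({ session, log }) {
        log.info(`Request served by session ${session?.id}`);
    },
});
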
@@ -795,7 +620,7 @@ class BasicCrawler {
        if (cachedRobotsTxtFile) {
            return cachedRobotsTxtFile;
        }
-        const robotsTxtFile = await utils_1.RobotsTxtFile.find(url);
+        const robotsTxtFile = await RobotsTxtFile.find(url);
        this.robotsTxtFileCache.add(origin, robotsTxtFile);
        return robotsTxtFile;
    }
@@ -873,7 +698,7 @@ class BasicCrawler {
     * should be ignored and will be reclaimed to the queue once ready.
     */
    delayRequest(request, source) {
-        const domain = (0, tldts_1.getDomain)(request.url);
+        const domain = getDomain(request.url);
        if (!domain || !request) {
            return false;
        }
@@ -883,7 +708,7 @@ class BasicCrawler {
            this.domainAccessedTime.set(domain, now);
            return false;
        }
-        if (source instanceof core_1.RequestQueueV1) {
+        if (source instanceof RequestQueueV1) {
            // eslint-disable-next-line dot-notation
            source['inProgress']?.delete(request.id);
        }
@@ -891,7 +716,7 @@ class BasicCrawler {
        this.log.debug(`Request ${request.url} (${request.id}) will be reclaimed after ${delay} milliseconds due to same domain delay`);
        setTimeout(async () => {
            this.log.debug(`Adding request ${request.url} (${request.id}) back to the queue`);
-            if (source instanceof core_1.RequestQueueV1) {
+            if (source instanceof RequestQueueV1) {
                // eslint-disable-next-line dot-notation
                source['inProgress'].add(request.id);
            }
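
These two hunks are the `sameDomainDelaySecs` machinery: a request that hits a recently accessed domain is put back and re-added on a timer, with special bookkeeping for the old non-locking `RequestQueueV1`. Enabling it is a single option:

const crawler = new BasicCrawler({
    // Leave at least 2 seconds between requests to the same domain
    // (domain resolution uses tldts' getDomain(), per the code above).
    sameDomainDelaySecs: 2,
    async requestHandler({ request, log }) {
        log.info(`Fetched ${request.url}`);
    },
});
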
@@ -910,19 +735,19 @@ class BasicCrawler {
        await this._timeoutAndRetry(async () => {
            request = await this._fetchNextRequest();
        }, this.internalTimeoutMillis, `Fetching next request timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
-        (0, timeout_1.tryCancel)();
+        tryCancel();
        if (this.useSessionPool) {
            await this._timeoutAndRetry(async () => {
                session = await this.sessionPool.getSession();
            }, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
        }
-        (0, timeout_1.tryCancel)();
+        tryCancel();
        if (!request || this.delayRequest(request, source)) {
            return;
        }
        if (!(await this.isAllowedBasedOnRobotsTxtFile(request.url))) {
            this.log.warning(`Skipping request ${request.url} (${request.id}) because it is disallowed based on robots.txt`);
-            request.state = core_1.RequestState.SKIPPED;
+            request.state = RequestState.SKIPPED;
            request.noRetry = true;
            await source.markRequestHandled(request);
            await this.onSkippedRequest?.({
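
With `respectRobotsTxtFile` enabled, a disallowed request is marked `SKIPPED`, flagged `noRetry`, handled away, and reported through `onSkippedRequest`. The callback payload is cut off in this hunk; the sketch below assumes the `{ url, reason }` shape from current Crawlee documentation:

const crawler = new BasicCrawler({
    respectRobotsTxtFile: true,
    async onSkippedRequest({ url, reason }) {
        console.log(`Skipped ${url} (${reason})`);
    },
    async requestHandler({ request, log }) {
        log.info(`Allowed by robots.txt: ${request.url}`);
    },
});
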
@@ -940,13 +765,13 @@ class BasicCrawler {
        // All missing properties (that extend CrawlingContext) are set dynamically,
        // but TS does not know that, so otherwise it would throw when compiling.
        const crawlingContext = {
-            id: (0, utilities_1.cryptoRandomObjectId)(10),
+            id: cryptoRandomObjectId(10),
            crawler: this,
            log: this.log,
            request,
            session,
            enqueueLinks: async (options) => {
-                return (0, core_1.enqueueLinks)({
+                return enqueueLinks({
                    // specify the RQ first to allow overriding it
                    requestQueue: await this.getRequestQueue(),
                    robotsTxtFile: await this.getRobotsTxtFileForUrl(request.url),
@@ -957,42 +782,42 @@ class BasicCrawler {
            addRequests: this.addRequests.bind(this),
            pushData: this.pushData.bind(this),
            useState: this.useState.bind(this),
-            sendRequest: …
-            getKeyValueStore: async (idOrName) => core_1.KeyValueStore.open(idOrName, { config: this.config }),
+            sendRequest: createSendRequest(this.httpClient, request, session, () => crawlingContext.proxyInfo?.url),
+            getKeyValueStore: async (idOrName) => KeyValueStore.open(idOrName, { config: this.config }),
        };
        this.crawlingContexts.set(crawlingContext.id, crawlingContext);
        let isRequestLocked = true;
        try {
-            request.state = core_1.RequestState.REQUEST_HANDLER;
-            await (0, timeout_1.addTimeoutToPromise)(async () => this._runRequestHandler(crawlingContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${request.id}).`);
+            request.state = RequestState.REQUEST_HANDLER;
+            await addTimeoutToPromise(async () => this._runRequestHandler(crawlingContext), this.requestHandlerTimeoutMillis, `requestHandler timed out after ${this.requestHandlerTimeoutMillis / 1000} seconds (${request.id}).`);
            await this._timeoutAndRetry(async () => source.markRequestHandled(request), this.internalTimeoutMillis, `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
            isRequestLocked = false; // markRequestHandled succeeded and unlocked the request
            this.stats.finishJob(statisticsId, request.retryCount);
            this.handledRequestsCount++;
            // reclaim session if request finishes successfully
-            request.state = core_1.RequestState.DONE;
+            request.state = RequestState.DONE;
            crawlingContext.session?.markGood();
        }
        catch (err) {
            try {
-                request.state = core_1.RequestState.ERROR_HANDLER;
-                await (0, timeout_1.addTimeoutToPromise)(async () => this._requestFunctionErrorHandler(err, crawlingContext, source), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
-                if (!(err instanceof core_1.CriticalError)) {
+                request.state = RequestState.ERROR_HANDLER;
+                await addTimeoutToPromise(async () => this._requestFunctionErrorHandler(err, crawlingContext, source), this.internalTimeoutMillis, `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`);
+                if (!(err instanceof CriticalError)) {
                    isRequestLocked = false; // _requestFunctionErrorHandler calls either markRequestHandled or reclaimRequest
                }
-                request.state = core_1.RequestState.DONE;
+                request.state = RequestState.DONE;
            }
            catch (secondaryError) {
                if (!secondaryError.triggeredFromUserHandler &&
                    // avoid reprinting the same critical error multiple times, as it will be printed by Nodejs at the end anyway
-                    !(secondaryError instanceof core_1.CriticalError)) {
+                    !(secondaryError instanceof CriticalError)) {
                    const apifySpecific = process.env.APIFY_IS_AT_HOME
                        ? `This may have happened due to an internal error of Apify's API or due to a misconfigured crawler.`
                        : '';
                    this.log.exception(secondaryError, 'An exception occurred during handling of failed request. ' +
                        `This places the crawler and its underlying storages into an unknown state and crawling will be terminated. ${apifySpecific}`);
                }
-                request.state = core_1.RequestState.ERROR;
+                request.state = RequestState.ERROR;
                throw secondaryError;
            }
            // decrease the session score if the request fails (but the error handler did not throw)
@@ -1002,7 +827,7 @@ class BasicCrawler {
        await this._cleanupContext(crawlingContext);
        this.crawlingContexts.delete(crawlingContext.id);
        // Safety net - release the lock if nobody managed to do it before
-        if (isRequestLocked && source instanceof core_1.RequestProvider) {
+        if (isRequestLocked && source instanceof RequestProvider) {
            try {
                await source.client.deleteRequestLock(request.id);
            }
@@ -1018,7 +843,7 @@ class BasicCrawler {
     */
    async _timeoutAndRetry(handler, timeout, error, maxRetries = 3, retried = 1) {
        try {
-            await (0, timeout_1.addTimeoutToPromise)(handler, timeout, error);
+            await addTimeoutToPromise(handler, timeout, error);
        }
        catch (e) {
            if (retried <= maxRetries) {
@@ -1055,7 +880,7 @@ class BasicCrawler {
    }
    async _rotateSession(crawlingContext) {
        const { request } = crawlingContext;
-        request.sessionRotationCount …
+        request.sessionRotationCount ??= 0;
        request.sessionRotationCount++;
        crawlingContext.session?.retire();
    }
@@ -1065,14 +890,14 @@ class BasicCrawler {
    async _requestFunctionErrorHandler(error, crawlingContext, source) {
        const { request } = crawlingContext;
        request.pushErrorMessage(error);
-        if (error instanceof core_1.CriticalError) {
+        if (error instanceof CriticalError) {
            throw error;
        }
        const shouldRetryRequest = this._canRequestBeRetried(request, error);
        if (shouldRetryRequest) {
            await this.stats.errorTrackerRetry.addAsync(error, crawlingContext);
-            await this.errorHandler?.(this._augmentContextWithDeprecatedError(crawlingContext, error), error);
-            if (error instanceof core_1.SessionError) {
+            await this.errorHandler?.(crawlingContext, error);
+            if (error instanceof SessionError) {
                await this._rotateSession(crawlingContext);
            }
            if (!request.noRetry) {
@@ -1123,7 +948,7 @@ class BasicCrawler {
        const message = this._getMessageFromError(error, true);
        this.log.error(`Request failed and reached maximum retries. ${message}`, { id, url, method, uniqueKey });
        if (this.failedRequestHandler) {
-            await this.failedRequestHandler?.(this._augmentContextWithDeprecatedError(crawlingContext, error), error);
+            await this.failedRequestHandler?.(crawlingContext, error);
        }
    }
    /**
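
Both handlers now receive the error strictly as the second argument; the context is no longer wrapped by `_augmentContextWithDeprecatedError`, which used to re-expose it as a deprecated `context.error` getter (that helper is removed in a hunk below). Migration sketch:

const crawler = new BasicCrawler({
    maxRequestRetries: 2,
    // Runs before each retry of a failed request.
    errorHandler({ request, log }, error) {
        log.warning(`Will retry ${request.url}: ${error.message}`);
    },
    // Runs once a request has exhausted its retries.
    failedRequestHandler({ request, log }, error) {
        // Read the second parameter; context.error is gone in 4.0.
        log.error(`Gave up on ${request.url}: ${error.message}`);
    },
});
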
@@ -1138,7 +963,7 @@ class BasicCrawler {
        const stackLines = error?.stack ? error.stack.split('\n') : new Error().stack.split('\n').slice(2);
        const baseDir = process.cwd();
        const userLine = stackLines.find((line) => line.includes(baseDir) && !line.includes('node_modules'));
-        if (error instanceof timeout_1.TimeoutError) {
+        if (error instanceof TimeoutError) {
            return process.env.CRAWLEE_VERBOSE_LOG ? error.stack : error.message || error; // stack in timeout errors does not really help
        }
        return process.env.CRAWLEE_VERBOSE_LOG || forceStack
@@ -1148,28 +973,18 @@ class BasicCrawler {
    _canRequestBeRetried(request, error) {
        // Request should never be retried, or the error encountered makes it not able to be retried, or the session rotation limit has been reached
        if (request.noRetry ||
-            error instanceof core_1.NonRetryableError ||
-            (error instanceof core_1.SessionError && this.maxSessionRotations <= (request.sessionRotationCount ?? 0))) {
+            error instanceof NonRetryableError ||
+            (error instanceof SessionError && this.maxSessionRotations <= (request.sessionRotationCount ?? 0))) {
            return false;
        }
        // User requested retry (we ignore retry count here as its explicitly told by the user to retry)
-        if (error instanceof core_1.RetryRequestError) {
+        if (error instanceof RetryRequestError) {
            return true;
        }
        // Ensure there are more retries available for the request
        const maxRequestRetries = request.maxRetries ?? this.maxRequestRetries;
        return request.retryCount < maxRequestRetries;
    }
-    _augmentContextWithDeprecatedError(context, error) {
-        Object.defineProperty(context, 'error', {
-            get: () => {
-                this.log.deprecated("The 'error' property of the crawling context is deprecated, and it is now passed as the second parameter in 'errorHandler' and 'failedRequestHandler'. Please update your code, as this property will be removed in a future version.");
-                return error;
-            },
-            configurable: true,
-        });
-        return context;
-    }
    /**
     * Updates handledRequestsCount from possibly stored counts,
     * usually after worker migration. Since one of the stores
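
`_canRequestBeRetried()` gives a request handler three levers: `NonRetryableError` (never retry), `RetryRequestError` (retry regardless of `retryCount`), and the per-request `maxRetries` falling back to the crawler-wide `maxRequestRetries`. A sketch:

import { NonRetryableError, RetryRequestError } from '@crawlee/core';

const crawler = new BasicCrawler({
    maxRequestRetries: 3,
    async requestHandler({ request }) {
        const response = await fetch(request.url);
        // Permanent failure: skip the remaining retries entirely.
        if (response.status === 404) throw new NonRetryableError('Gone for good.');
        // Transient state: force another attempt even past the retry budget.
        if (response.status === 429) throw new RetryRequestError('Rate limited, try again.');
    },
});
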
@@ -1207,36 +1022,10 @@ class BasicCrawler {
        }
        await this.autoscaledPool?.abort();
    }
-    _handlePropertyNameChange({ newProperty, newName, oldProperty, oldName, propertyKey, allowUndefined = false, }) {
-        if (newProperty && oldProperty) {
-            this.log.warning([
-                `Both "${newName}" and "${oldName}" were provided in the crawler options.`,
-                `"${oldName}" has been renamed to "${newName}", and will be removed in a future version.`,
-                `As such, "${newName}" will be used instead.`,
-            ].join('\n'));
-            // @ts-expect-error Assigning to possibly readonly properties
-            this[propertyKey] = newProperty;
-        }
-        else if (oldProperty) {
-            this.log.warning([
-                `"${oldName}" has been renamed to "${newName}", and will be removed in a future version.`,
-                `The provided value will be used, but you should rename "${oldName}" to "${newName}" in your crawler options.`,
-            ].join('\n'));
-            // @ts-expect-error Assigning to possibly readonly properties
-            this[propertyKey] = oldProperty;
-        }
-        else if (newProperty) {
-            // @ts-expect-error Assigning to possibly readonly properties
-            this[propertyKey] = newProperty;
-        }
-        else if (!allowUndefined) {
-            throw new ow_1.ArgumentError(`"${newName}" must be provided in the crawler options`, this.constructor);
-        }
-    }
    _getCookieHeaderFromRequest(request) {
        if (request.headers?.Cookie && request.headers?.cookie) {
            this.log.warning(`Encountered mixed casing for the cookie headers for request ${request.url} (${request.id}). Their values will be merged.`);
-            return (0, core_1.mergeCookies)(request.url, [request.headers.cookie, request.headers.Cookie]);
+            return mergeCookies(request.url, [request.headers.cookie, request.headers.Cookie]);
        }
        return request.headers?.Cookie || request.headers?.cookie || '';
    }
@@ -1247,9 +1036,9 @@ class BasicCrawler {
            this.log.info('Using the old RequestQueue implementation without request locking.');
            this._experimentWarnings.requestLocking = true;
        }
-            return core_1.RequestQueueV1.open(null, { config: this.config });
+            return RequestQueueV1.open(null, { config: this.config });
        }
-        return core_1.RequestQueue.open(null, { config: this.config });
+        return RequestQueue.open(null, { config: this.config });
    }
    requestMatchesEnqueueStrategy(request) {
        const { url, loadedUrl } = request;
@@ -1266,80 +1055,29 @@ class BasicCrawler {
        const baseUrl = new URL(url);
        const loadedBaseUrl = new URL(loadedUrl);
        switch (strategy) {
-            case core_1.EnqueueStrategy.SameHostname: {
+            case EnqueueStrategy.SameHostname: {
                return baseUrl.hostname === loadedBaseUrl.hostname;
            }
-            case core_1.EnqueueStrategy.SameDomain: {
-                const baseUrlHostname = (0, tldts_1.getDomain)(baseUrl.hostname, { mixedInputs: false });
+            case EnqueueStrategy.SameDomain: {
+                const baseUrlHostname = getDomain(baseUrl.hostname, { mixedInputs: false });
                if (baseUrlHostname) {
-                    const loadedBaseUrlHostname = (0, tldts_1.getDomain)(loadedBaseUrl.hostname, { mixedInputs: false });
+                    const loadedBaseUrlHostname = getDomain(loadedBaseUrl.hostname, { mixedInputs: false });
                    return baseUrlHostname === loadedBaseUrlHostname;
                }
                // Can happen for IPs, we just check like same origin
                return baseUrl.origin === loadedBaseUrl.origin;
            }
-            case core_1.EnqueueStrategy.SameOrigin: {
+            case EnqueueStrategy.SameOrigin: {
                // Same as hostname, but also checks protocol
                return baseUrl.origin === loadedBaseUrl.origin;
            }
-            case core_1.EnqueueStrategy.All:
+            case EnqueueStrategy.All:
            default: {
                return baseUrl.protocol === 'http:' || baseUrl.protocol === 'https:';
            }
        }
    }
 }
-exports.BasicCrawler = BasicCrawler;
-Object.defineProperty(BasicCrawler, "CRAWLEE_STATE_KEY", {
-    enumerable: true,
-    configurable: true,
-    writable: true,
-    value: 'CRAWLEE_STATE'
-});
-Object.defineProperty(BasicCrawler, "optionsShape", {
-    enumerable: true,
-    configurable: true,
-    writable: true,
-    value: {
-        requestList: ow_1.default.optional.object.validate(core_1.validators.requestList),
-        requestQueue: ow_1.default.optional.object.validate(core_1.validators.requestQueue),
-        // Subclasses override this function instead of passing it
-        // in constructor, so this validation needs to apply only
-        // if the user creates an instance of BasicCrawler directly.
-        requestHandler: ow_1.default.optional.function,
-        // TODO: remove in a future release
-        handleRequestFunction: ow_1.default.optional.function,
-        requestHandlerTimeoutSecs: ow_1.default.optional.number,
-        // TODO: remove in a future release
-        handleRequestTimeoutSecs: ow_1.default.optional.number,
-        errorHandler: ow_1.default.optional.function,
-        failedRequestHandler: ow_1.default.optional.function,
-        // TODO: remove in a future release
-        handleFailedRequestFunction: ow_1.default.optional.function,
-        maxRequestRetries: ow_1.default.optional.number,
-        sameDomainDelaySecs: ow_1.default.optional.number,
-        maxSessionRotations: ow_1.default.optional.number,
-        maxRequestsPerCrawl: ow_1.default.optional.number,
-        autoscaledPoolOptions: ow_1.default.optional.object,
-        sessionPoolOptions: ow_1.default.optional.object,
-        useSessionPool: ow_1.default.optional.boolean,
-        statusMessageLoggingInterval: ow_1.default.optional.number,
-        statusMessageCallback: ow_1.default.optional.function,
-        retryOnBlocked: ow_1.default.optional.boolean,
-        respectRobotsTxtFile: ow_1.default.optional.boolean,
-        onSkippedRequest: ow_1.default.optional.function,
-        httpClient: ow_1.default.optional.object,
-        // AutoscaledPool shorthands
-        minConcurrency: ow_1.default.optional.number,
-        maxConcurrency: ow_1.default.optional.number,
-        maxRequestsPerMinute: ow_1.default.optional.number.integerOrInfinite.positive.greaterThanOrEqual(1),
-        keepAlive: ow_1.default.optional.boolean,
-        // internal
-        log: ow_1.default.optional.object,
-        experiments: ow_1.default.optional.object,
-        statisticsOptions: ow_1.default.optional.object,
-    }
-});
 /**
  * Creates new {@link Router} instance that works based on request labels.
  * This instance can then serve as a {@link BasicCrawlerOptions.requestHandler|`requestHandler`} of our {@link BasicCrawler}.
@@ -1364,7 +1102,7 @@ Object.defineProperty(BasicCrawler, "optionsShape", {
  * await crawler.run();
  * ```
  */
-function createBasicRouter(routes) {
-    return core_1.Router.create(routes);
+export function createBasicRouter(routes) {
+    return Router.create(routes);
 }
 //# sourceMappingURL=basic-crawler.js.map
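
`createBasicRouter` becomes a named ESM export wrapping `Router.create()`, and `requestMatchesEnqueueStrategy` above shows the four `EnqueueStrategy` values the crawler enforces. A combined sketch of label-based routing plus a strategy-constrained `enqueueLinks` call; the `LIST`/`DETAIL` labels and the URL are made up for illustration, and `urls` is passed explicitly since `BasicCrawler` has no page to extract links from:

import { BasicCrawler, createBasicRouter } from '@crawlee/basic';
import { EnqueueStrategy } from '@crawlee/core';

const router = createBasicRouter();

router.addHandler('LIST', async ({ enqueueLinks }) => {
    // Only URLs on the same registrable domain (per tldts) pass the strategy check.
    await enqueueLinks({
        urls: ['https://example.com/item/1'],
        strategy: EnqueueStrategy.SameDomain,
        label: 'DETAIL',
    });
});

router.addDefaultHandler(async ({ request, log }) => {
    log.info(`Unlabeled request: ${request.url}`);
});

const crawler = new BasicCrawler({ requestHandler: router });
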