@crawlee/core 3.5.5-beta.8 → 3.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/enqueue_links/enqueue_links.d.ts +2 -2
- package/enqueue_links/enqueue_links.d.ts.map +1 -1
- package/index.mjs +2 -0
- package/package.json +5 -5
- package/request.d.ts +2 -0
- package/request.d.ts.map +1 -1
- package/request.js +5 -6
- package/request.js.map +1 -1
- package/storages/index.d.ts +2 -0
- package/storages/index.d.ts.map +1 -1
- package/storages/index.js +2 -0
- package/storages/index.js.map +1 -1
- package/storages/request_provider.d.ts +262 -0
- package/storages/request_provider.d.ts.map +1 -0
- package/storages/request_provider.js +602 -0
- package/storages/request_provider.js.map +1 -0
- package/storages/request_queue.d.ts +17 -299
- package/storages/request_queue.d.ts.map +1 -1
- package/storages/request_queue.js +62 -645
- package/storages/request_queue.js.map +1 -1
- package/storages/request_queue_v2.d.ts +41 -0
- package/storages/request_queue_v2.d.ts.map +1 -0
- package/storages/request_queue_v2.js +250 -0
- package/storages/request_queue_v2.js.map +1 -0
- package/storages/storage_manager.d.ts.map +1 -1
- package/storages/storage_manager.js.map +1 -1
- package/storages/utils.d.ts +34 -0
- package/storages/utils.d.ts.map +1 -1
- package/storages/utils.js +45 -1
- package/storages/utils.js.map +1 -1
- package/tsconfig.build.tsbuildinfo +1 -1
|
@@ -1,68 +1,18 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.RequestQueue =
|
|
4
|
-
const tslib_1 = require("tslib");
|
|
5
|
-
const node_crypto_1 = tslib_1.__importDefault(require("node:crypto"));
|
|
3
|
+
exports.RequestQueue = void 0;
|
|
6
4
|
const promises_1 = require("node:timers/promises");
|
|
7
5
|
const consts_1 = require("@apify/consts");
|
|
8
|
-
const datastructures_1 = require("@apify/datastructures");
|
|
9
|
-
const utilities_1 = require("@apify/utilities");
|
|
10
|
-
const utils_1 = require("@crawlee/utils");
|
|
11
|
-
const ow_1 = tslib_1.__importDefault(require("ow"));
|
|
12
|
-
const storage_manager_1 = require("./storage_manager");
|
|
13
|
-
const utils_2 = require("./utils");
|
|
6
|
+
const request_provider_1 = require("./request_provider");
|
|
7
|
+
const utils_1 = require("./utils");
|
|
14
8
|
const configuration_1 = require("../configuration");
|
|
15
|
-
const log_1 = require("../log");
|
|
16
|
-
const request_1 = require("../request");
|
|
17
9
|
const MAX_CACHED_REQUESTS = 1000000;
|
|
18
|
-
/**
|
|
19
|
-
* When requesting queue head we always fetch requestsInProgressCount * QUERY_HEAD_BUFFER number of requests.
|
|
20
|
-
* @internal
|
|
21
|
-
*/
|
|
22
|
-
exports.QUERY_HEAD_MIN_LENGTH = 100;
|
|
23
|
-
/** @internal */
|
|
24
|
-
exports.QUERY_HEAD_BUFFER = 3;
|
|
25
|
-
/**
|
|
26
|
-
* If queue was modified (request added/updated/deleted) before more than API_PROCESSED_REQUESTS_DELAY_MILLIS
|
|
27
|
-
* then we assume the get head operation to be consistent.
|
|
28
|
-
* @internal
|
|
29
|
-
*/
|
|
30
|
-
exports.API_PROCESSED_REQUESTS_DELAY_MILLIS = 10000;
|
|
31
|
-
/**
|
|
32
|
-
* How many times we try to get queue head with queueModifiedAt older than API_PROCESSED_REQUESTS_DELAY_MILLIS.
|
|
33
|
-
* @internal
|
|
34
|
-
*/
|
|
35
|
-
exports.MAX_QUERIES_FOR_CONSISTENCY = 6;
|
|
36
10
|
/**
|
|
37
11
|
* This number must be large enough so that processing of all these requests cannot be done in
|
|
38
12
|
* a time lower than expected maximum latency of DynamoDB, but low enough not to waste too much memory.
|
|
39
13
|
* @internal
|
|
40
14
|
*/
|
|
41
15
|
const RECENTLY_HANDLED_CACHE_SIZE = 1000;
|
|
42
|
-
/**
|
|
43
|
-
* Indicates how long it usually takes for the underlying storage to propagate all writes
|
|
44
|
-
* to be available to subsequent reads.
|
|
45
|
-
* @internal
|
|
46
|
-
*/
|
|
47
|
-
exports.STORAGE_CONSISTENCY_DELAY_MILLIS = 3000;
|
|
48
|
-
/**
|
|
49
|
-
* Helper function that creates ID from uniqueKey for local emulation of request queue.
|
|
50
|
-
* It's also used for local cache of remote request queue.
|
|
51
|
-
*
|
|
52
|
-
* This function may not exactly match how requestId is created server side.
|
|
53
|
-
* So we never pass requestId created by this to server and use it only for local cache.
|
|
54
|
-
*
|
|
55
|
-
* @internal
|
|
56
|
-
*/
|
|
57
|
-
function getRequestId(uniqueKey) {
|
|
58
|
-
const str = node_crypto_1.default
|
|
59
|
-
.createHash('sha256')
|
|
60
|
-
.update(uniqueKey)
|
|
61
|
-
.digest('base64')
|
|
62
|
-
.replace(/[+/=]/g, '');
|
|
63
|
-
return str.substr(0, 15);
|
|
64
|
-
}
|
|
65
|
-
exports.getRequestId = getRequestId;
|
|
66
16
|
/**
|
|
67
17
|
* Represents a queue of URLs to crawl, which is used for deep crawling of websites
|
|
68
18
|
* where you start with several URLs and then recursively
|
|
@@ -109,361 +59,29 @@ exports.getRequestId = getRequestId;
|
|
|
109
59
|
* ```
|
|
110
60
|
* @category Sources
|
|
111
61
|
*/
|
|
112
|
-
class RequestQueue {
|
|
62
|
+
class RequestQueue extends request_provider_1.RequestProvider {
|
|
113
63
|
/**
|
|
114
64
|
* @internal
|
|
115
65
|
*/
|
|
116
66
|
constructor(options, config = configuration_1.Configuration.getGlobalConfig()) {
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
});
|
|
123
|
-
Object.defineProperty(this, "log", {
|
|
124
|
-
enumerable: true,
|
|
125
|
-
configurable: true,
|
|
126
|
-
writable: true,
|
|
127
|
-
value: log_1.log.child({ prefix: 'RequestQueue' })
|
|
128
|
-
});
|
|
129
|
-
Object.defineProperty(this, "id", {
|
|
130
|
-
enumerable: true,
|
|
131
|
-
configurable: true,
|
|
132
|
-
writable: true,
|
|
133
|
-
value: void 0
|
|
134
|
-
});
|
|
135
|
-
Object.defineProperty(this, "name", {
|
|
136
|
-
enumerable: true,
|
|
137
|
-
configurable: true,
|
|
138
|
-
writable: true,
|
|
139
|
-
value: void 0
|
|
140
|
-
});
|
|
141
|
-
Object.defineProperty(this, "timeoutSecs", {
|
|
142
|
-
enumerable: true,
|
|
143
|
-
configurable: true,
|
|
144
|
-
writable: true,
|
|
145
|
-
value: 30
|
|
146
|
-
});
|
|
147
|
-
Object.defineProperty(this, "clientKey", {
|
|
148
|
-
enumerable: true,
|
|
149
|
-
configurable: true,
|
|
150
|
-
writable: true,
|
|
151
|
-
value: (0, utilities_1.cryptoRandomObjectId)()
|
|
152
|
-
});
|
|
153
|
-
Object.defineProperty(this, "client", {
|
|
154
|
-
enumerable: true,
|
|
155
|
-
configurable: true,
|
|
156
|
-
writable: true,
|
|
157
|
-
value: void 0
|
|
158
|
-
});
|
|
159
|
-
Object.defineProperty(this, "proxyConfiguration", {
|
|
160
|
-
enumerable: true,
|
|
161
|
-
configurable: true,
|
|
162
|
-
writable: true,
|
|
163
|
-
value: void 0
|
|
164
|
-
});
|
|
165
|
-
/**
|
|
166
|
-
* Contains a cached list of request IDs from the head of the queue,
|
|
167
|
-
* as obtained in the last query. Both key and value is the request ID.
|
|
168
|
-
* Need to apply a type here to the generated TS types don't try to use types-apify
|
|
169
|
-
*/
|
|
170
|
-
Object.defineProperty(this, "queueHeadDict", {
|
|
171
|
-
enumerable: true,
|
|
172
|
-
configurable: true,
|
|
173
|
-
writable: true,
|
|
174
|
-
value: new datastructures_1.ListDictionary()
|
|
175
|
-
});
|
|
67
|
+
super({
|
|
68
|
+
...options,
|
|
69
|
+
logPrefix: 'RequestQueue',
|
|
70
|
+
recentlyHandledRequestsMaxSize: RECENTLY_HANDLED_CACHE_SIZE,
|
|
71
|
+
requestCacheMaxSize: MAX_CACHED_REQUESTS,
|
|
72
|
+
}, config);
|
|
176
73
|
Object.defineProperty(this, "queryQueueHeadPromise", {
|
|
177
74
|
enumerable: true,
|
|
178
75
|
configurable: true,
|
|
179
76
|
writable: true,
|
|
180
77
|
value: null
|
|
181
78
|
});
|
|
182
|
-
// A set of all request IDs that are currently being handled,
|
|
183
|
-
// i.e. which were returned by fetchNextRequest() but not markRequestHandled()
|
|
184
|
-
Object.defineProperty(this, "inProgress", {
|
|
185
|
-
enumerable: true,
|
|
186
|
-
configurable: true,
|
|
187
|
-
writable: true,
|
|
188
|
-
value: new Set()
|
|
189
|
-
});
|
|
190
|
-
// To track whether the queue gets stuck, and we need to reset it
|
|
191
|
-
// `lastActivity` tracks the time when we either added, processed or reclaimed a request,
|
|
192
|
-
// or when we add new request to in-progress cache
|
|
193
79
|
Object.defineProperty(this, "lastActivity", {
|
|
194
80
|
enumerable: true,
|
|
195
81
|
configurable: true,
|
|
196
82
|
writable: true,
|
|
197
83
|
value: new Date()
|
|
198
84
|
});
|
|
199
|
-
Object.defineProperty(this, "internalTimeoutMillis", {
|
|
200
|
-
enumerable: true,
|
|
201
|
-
configurable: true,
|
|
202
|
-
writable: true,
|
|
203
|
-
value: 5 * 60e3
|
|
204
|
-
}); // defaults to 5 minutes, will be overridden by BasicCrawler
|
|
205
|
-
// Contains a list of recently handled requests. It is used to avoid inconsistencies
|
|
206
|
-
// caused by delays in the underlying DynamoDB storage.
|
|
207
|
-
// Keys are request IDs, values are true.
|
|
208
|
-
Object.defineProperty(this, "recentlyHandled", {
|
|
209
|
-
enumerable: true,
|
|
210
|
-
configurable: true,
|
|
211
|
-
writable: true,
|
|
212
|
-
value: new datastructures_1.LruCache({ maxLength: RECENTLY_HANDLED_CACHE_SIZE })
|
|
213
|
-
});
|
|
214
|
-
// We can trust these numbers only in a case that queue is used by a single client.
|
|
215
|
-
// This information is returned by getHead() under the hadMultipleClients property.
|
|
216
|
-
Object.defineProperty(this, "assumedTotalCount", {
|
|
217
|
-
enumerable: true,
|
|
218
|
-
configurable: true,
|
|
219
|
-
writable: true,
|
|
220
|
-
value: 0
|
|
221
|
-
});
|
|
222
|
-
Object.defineProperty(this, "assumedHandledCount", {
|
|
223
|
-
enumerable: true,
|
|
224
|
-
configurable: true,
|
|
225
|
-
writable: true,
|
|
226
|
-
value: 0
|
|
227
|
-
});
|
|
228
|
-
// Caching requests to avoid redundant addRequest() calls.
|
|
229
|
-
// Key is computed using getRequestId() and value is { id, isHandled }.
|
|
230
|
-
Object.defineProperty(this, "requestsCache", {
|
|
231
|
-
enumerable: true,
|
|
232
|
-
configurable: true,
|
|
233
|
-
writable: true,
|
|
234
|
-
value: new datastructures_1.LruCache({ maxLength: MAX_CACHED_REQUESTS })
|
|
235
|
-
});
|
|
236
|
-
this.id = options.id;
|
|
237
|
-
this.name = options.name;
|
|
238
|
-
this.client = options.client.requestQueue(this.id, {
|
|
239
|
-
clientKey: this.clientKey,
|
|
240
|
-
timeoutSecs: this.timeoutSecs,
|
|
241
|
-
});
|
|
242
|
-
this.proxyConfiguration = options.proxyConfiguration;
|
|
243
|
-
}
|
|
244
|
-
/**
|
|
245
|
-
* @ignore
|
|
246
|
-
*/
|
|
247
|
-
inProgressCount() {
|
|
248
|
-
return this.inProgress.size;
|
|
249
|
-
}
|
|
250
|
-
/**
|
|
251
|
-
* Adds a request to the queue.
|
|
252
|
-
*
|
|
253
|
-
* If a request with the same `uniqueKey` property is already present in the queue,
|
|
254
|
-
* it will not be updated. You can find out whether this happened from the resulting
|
|
255
|
-
* {@apilink QueueOperationInfo} object.
|
|
256
|
-
*
|
|
257
|
-
* To add multiple requests to the queue by extracting links from a webpage,
|
|
258
|
-
* see the {@apilink enqueueLinks} helper function.
|
|
259
|
-
*
|
|
260
|
-
* @param requestLike {@apilink Request} object or vanilla object with request data.
|
|
261
|
-
* Note that the function sets the `uniqueKey` and `id` fields to the passed Request.
|
|
262
|
-
* @param [options] Request queue operation options.
|
|
263
|
-
*/
|
|
264
|
-
async addRequest(requestLike, options = {}) {
|
|
265
|
-
(0, ow_1.default)(requestLike, ow_1.default.object);
|
|
266
|
-
(0, ow_1.default)(options, ow_1.default.object.exactShape({
|
|
267
|
-
forefront: ow_1.default.optional.boolean,
|
|
268
|
-
}));
|
|
269
|
-
this.lastActivity = new Date();
|
|
270
|
-
const { forefront = false } = options;
|
|
271
|
-
if ('requestsFromUrl' in requestLike) {
|
|
272
|
-
const requests = await this._fetchRequestsFromUrl(requestLike);
|
|
273
|
-
const processedRequests = await this._addFetchedRequests(requestLike, requests, options);
|
|
274
|
-
return processedRequests[0];
|
|
275
|
-
}
|
|
276
|
-
(0, ow_1.default)(requestLike, ow_1.default.object.partialShape({
|
|
277
|
-
url: ow_1.default.string,
|
|
278
|
-
id: ow_1.default.undefined,
|
|
279
|
-
}));
|
|
280
|
-
const request = requestLike instanceof request_1.Request
|
|
281
|
-
? requestLike
|
|
282
|
-
: new request_1.Request(requestLike);
|
|
283
|
-
const cacheKey = getRequestId(request.uniqueKey);
|
|
284
|
-
const cachedInfo = this.requestsCache.get(cacheKey);
|
|
285
|
-
if (cachedInfo) {
|
|
286
|
-
request.id = cachedInfo.id;
|
|
287
|
-
return {
|
|
288
|
-
wasAlreadyPresent: true,
|
|
289
|
-
// We may assume that if request is in local cache then also the information if the
|
|
290
|
-
// request was already handled is there because just one client should be using one queue.
|
|
291
|
-
wasAlreadyHandled: cachedInfo.isHandled,
|
|
292
|
-
requestId: cachedInfo.id,
|
|
293
|
-
uniqueKey: cachedInfo.uniqueKey,
|
|
294
|
-
};
|
|
295
|
-
}
|
|
296
|
-
const queueOperationInfo = await this.client.addRequest(request, { forefront });
|
|
297
|
-
queueOperationInfo.uniqueKey = request.uniqueKey;
|
|
298
|
-
const { requestId, wasAlreadyPresent } = queueOperationInfo;
|
|
299
|
-
this._cacheRequest(cacheKey, queueOperationInfo);
|
|
300
|
-
if (!wasAlreadyPresent && !this.inProgress.has(requestId) && !this.recentlyHandled.get(requestId)) {
|
|
301
|
-
this.assumedTotalCount++;
|
|
302
|
-
// Performance optimization: add request straight to head if possible
|
|
303
|
-
this._maybeAddRequestToQueueHead(requestId, forefront);
|
|
304
|
-
}
|
|
305
|
-
return queueOperationInfo;
|
|
306
|
-
}
|
|
307
|
-
/**
|
|
308
|
-
* Adds requests to the queue in batches of 25.
|
|
309
|
-
*
|
|
310
|
-
* If a request that is passed in is already present due to its `uniqueKey` property being the same,
|
|
311
|
-
* it will not be updated. You can find out whether this happened by finding the request in the resulting
|
|
312
|
-
* {@apilink BatchAddRequestsResult} object.
|
|
313
|
-
*
|
|
314
|
-
* @param requestsLike {@apilink Request} objects or vanilla objects with request data.
|
|
315
|
-
* Note that the function sets the `uniqueKey` and `id` fields to the passed requests if missing.
|
|
316
|
-
* @param [options] Request queue operation options.
|
|
317
|
-
*/
|
|
318
|
-
async addRequests(requestsLike, options = {}) {
|
|
319
|
-
(0, ow_1.default)(requestsLike, ow_1.default.array);
|
|
320
|
-
(0, ow_1.default)(options, ow_1.default.object.exactShape({
|
|
321
|
-
forefront: ow_1.default.optional.boolean,
|
|
322
|
-
}));
|
|
323
|
-
const { forefront = false } = options;
|
|
324
|
-
const uniqueKeyToCacheKey = new Map();
|
|
325
|
-
const getCachedRequestId = (uniqueKey) => {
|
|
326
|
-
const cached = uniqueKeyToCacheKey.get(uniqueKey);
|
|
327
|
-
if (cached)
|
|
328
|
-
return cached;
|
|
329
|
-
const newCacheKey = getRequestId(uniqueKey);
|
|
330
|
-
uniqueKeyToCacheKey.set(uniqueKey, newCacheKey);
|
|
331
|
-
return newCacheKey;
|
|
332
|
-
};
|
|
333
|
-
const results = {
|
|
334
|
-
processedRequests: [],
|
|
335
|
-
unprocessedRequests: [],
|
|
336
|
-
};
|
|
337
|
-
for (const requestLike of requestsLike) {
|
|
338
|
-
if ('requestsFromUrl' in requestLike) {
|
|
339
|
-
const requests = await this._fetchRequestsFromUrl(requestLike);
|
|
340
|
-
await this._addFetchedRequests(requestLike, requests, options);
|
|
341
|
-
}
|
|
342
|
-
}
|
|
343
|
-
const requests = requestsLike
|
|
344
|
-
.filter((requestLike) => !('requestsFromUrl' in requestLike))
|
|
345
|
-
.map((requestLike) => {
|
|
346
|
-
return requestLike instanceof request_1.Request ? requestLike : new request_1.Request(requestLike);
|
|
347
|
-
});
|
|
348
|
-
const requestsToAdd = new Map();
|
|
349
|
-
for (const request of requests) {
|
|
350
|
-
const cacheKey = getCachedRequestId(request.uniqueKey);
|
|
351
|
-
const cachedInfo = this.requestsCache.get(cacheKey);
|
|
352
|
-
if (cachedInfo) {
|
|
353
|
-
request.id = cachedInfo.id;
|
|
354
|
-
results.processedRequests.push({
|
|
355
|
-
wasAlreadyPresent: true,
|
|
356
|
-
// We may assume that if request is in local cache then also the information if the
|
|
357
|
-
// request was already handled is there because just one client should be using one queue.
|
|
358
|
-
wasAlreadyHandled: cachedInfo.isHandled,
|
|
359
|
-
requestId: cachedInfo.id,
|
|
360
|
-
uniqueKey: cachedInfo.uniqueKey,
|
|
361
|
-
});
|
|
362
|
-
}
|
|
363
|
-
else if (!requestsToAdd.has(request.uniqueKey)) {
|
|
364
|
-
requestsToAdd.set(request.uniqueKey, request);
|
|
365
|
-
}
|
|
366
|
-
}
|
|
367
|
-
// Early exit if all provided requests were already added
|
|
368
|
-
if (!requestsToAdd.size) {
|
|
369
|
-
return results;
|
|
370
|
-
}
|
|
371
|
-
const apiResults = await this.client.batchAddRequests([...requestsToAdd.values()], { forefront });
|
|
372
|
-
// Report unprocessed requests
|
|
373
|
-
results.unprocessedRequests = apiResults.unprocessedRequests;
|
|
374
|
-
// Add all new requests to the queue head
|
|
375
|
-
for (const newRequest of apiResults.processedRequests) {
|
|
376
|
-
// Add the new request to the processed list
|
|
377
|
-
results.processedRequests.push(newRequest);
|
|
378
|
-
const cacheKey = getCachedRequestId(newRequest.uniqueKey);
|
|
379
|
-
const { requestId, wasAlreadyPresent } = newRequest;
|
|
380
|
-
this._cacheRequest(cacheKey, newRequest);
|
|
381
|
-
if (!wasAlreadyPresent && !this.inProgress.has(requestId) && !this.recentlyHandled.get(requestId)) {
|
|
382
|
-
this.assumedTotalCount++;
|
|
383
|
-
// Performance optimization: add request straight to head if possible
|
|
384
|
-
this._maybeAddRequestToQueueHead(requestId, forefront);
|
|
385
|
-
}
|
|
386
|
-
}
|
|
387
|
-
return results;
|
|
388
|
-
}
|
|
389
|
-
/**
|
|
390
|
-
* Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
|
|
391
|
-
* adding the rest in background. You can configure the batch size via `batchSize` option and the sleep time in between
|
|
392
|
-
* the batches via `waitBetweenBatchesMillis`. If you want to wait for all batches to be added to the queue, you can use
|
|
393
|
-
* the `waitForAllRequestsToBeAdded` promise you get in the response object.
|
|
394
|
-
*
|
|
395
|
-
* @param requests The requests to add
|
|
396
|
-
* @param options Options for the request queue
|
|
397
|
-
*/
|
|
398
|
-
async addRequestsBatched(requests, options = {}) {
|
|
399
|
-
(0, ow_1.default)(requests, ow_1.default.array.ofType(ow_1.default.any(ow_1.default.string, ow_1.default.object.partialShape({ url: ow_1.default.string, id: ow_1.default.undefined }), ow_1.default.object.partialShape({ requestsFromUrl: ow_1.default.string, regex: ow_1.default.optional.regExp }))));
|
|
400
|
-
(0, ow_1.default)(options, ow_1.default.object.exactShape({
|
|
401
|
-
forefront: ow_1.default.optional.boolean,
|
|
402
|
-
waitForAllRequestsToBeAdded: ow_1.default.optional.boolean,
|
|
403
|
-
batchSize: ow_1.default.optional.number,
|
|
404
|
-
waitBetweenBatchesMillis: ow_1.default.optional.number,
|
|
405
|
-
}));
|
|
406
|
-
const { batchSize = 1000, waitBetweenBatchesMillis = 1000, } = options;
|
|
407
|
-
const builtRequests = [];
|
|
408
|
-
for (const opts of requests) {
|
|
409
|
-
if (opts && typeof opts === 'object' && 'requestsFromUrl' in opts) {
|
|
410
|
-
await this.addRequest(opts, { forefront: options.forefront });
|
|
411
|
-
}
|
|
412
|
-
else {
|
|
413
|
-
builtRequests.push(new request_1.Request(typeof opts === 'string' ? { url: opts } : opts));
|
|
414
|
-
}
|
|
415
|
-
}
|
|
416
|
-
const attemptToAddToQueueAndAddAnyUnprocessed = async (providedRequests) => {
|
|
417
|
-
const resultsToReturn = [];
|
|
418
|
-
const apiResult = await this.addRequests(providedRequests, { forefront: options.forefront });
|
|
419
|
-
resultsToReturn.push(...apiResult.processedRequests);
|
|
420
|
-
if (apiResult.unprocessedRequests.length) {
|
|
421
|
-
await (0, promises_1.setTimeout)(waitBetweenBatchesMillis);
|
|
422
|
-
resultsToReturn.push(...await attemptToAddToQueueAndAddAnyUnprocessed(providedRequests.filter((r) => !apiResult.processedRequests.some((pr) => pr.uniqueKey === r.uniqueKey))));
|
|
423
|
-
}
|
|
424
|
-
return resultsToReturn;
|
|
425
|
-
};
|
|
426
|
-
const initialChunk = builtRequests.splice(0, batchSize);
|
|
427
|
-
// Add initial batch of `batchSize` to process them right away
|
|
428
|
-
const addedRequests = await attemptToAddToQueueAndAddAnyUnprocessed(initialChunk);
|
|
429
|
-
// If we have no more requests to add, return early
|
|
430
|
-
if (!builtRequests.length) {
|
|
431
|
-
return {
|
|
432
|
-
addedRequests,
|
|
433
|
-
waitForAllRequestsToBeAdded: Promise.resolve([]),
|
|
434
|
-
};
|
|
435
|
-
}
|
|
436
|
-
// eslint-disable-next-line no-async-promise-executor
|
|
437
|
-
const promise = new Promise(async (resolve) => {
|
|
438
|
-
const chunks = (0, utils_1.chunk)(builtRequests, batchSize);
|
|
439
|
-
const finalAddedRequests = [];
|
|
440
|
-
for (const requestChunk of chunks) {
|
|
441
|
-
finalAddedRequests.push(...await attemptToAddToQueueAndAddAnyUnprocessed(requestChunk));
|
|
442
|
-
await (0, promises_1.setTimeout)(waitBetweenBatchesMillis);
|
|
443
|
-
}
|
|
444
|
-
resolve(finalAddedRequests);
|
|
445
|
-
});
|
|
446
|
-
// If the user wants to wait for all the requests to be added, we wait for the promise to resolve for them
|
|
447
|
-
if (options.waitForAllRequestsToBeAdded) {
|
|
448
|
-
addedRequests.push(...await promise);
|
|
449
|
-
}
|
|
450
|
-
return {
|
|
451
|
-
addedRequests,
|
|
452
|
-
waitForAllRequestsToBeAdded: promise,
|
|
453
|
-
};
|
|
454
|
-
}
|
|
455
|
-
/**
|
|
456
|
-
* Gets the request from the queue specified by ID.
|
|
457
|
-
*
|
|
458
|
-
* @param id ID of the request.
|
|
459
|
-
* @returns Returns the request object, or `null` if it was not found.
|
|
460
|
-
*/
|
|
461
|
-
async getRequest(id) {
|
|
462
|
-
(0, ow_1.default)(id, ow_1.default.string);
|
|
463
|
-
const requestOptions = await this.client.getRequest(id);
|
|
464
|
-
if (!requestOptions)
|
|
465
|
-
return null;
|
|
466
|
-
return new request_1.Request(requestOptions);
|
|
467
85
|
}
|
|
468
86
|
/**
|
|
469
87
|
* Returns a next request in the queue to be processed, or `null` if there are no more pending requests.
|
|
@@ -483,17 +101,17 @@ class RequestQueue {
|
|
|
483
101
|
* Returns the request object or `null` if there are no more pending requests.
|
|
484
102
|
*/
|
|
485
103
|
async fetchNextRequest() {
|
|
486
|
-
await this._ensureHeadIsNonEmpty();
|
|
487
|
-
const nextRequestId = this.queueHeadDict.removeFirst();
|
|
104
|
+
await this.ensureHeadIsNonEmpty();
|
|
105
|
+
const nextRequestId = this.queueHeadIds.removeFirst();
|
|
488
106
|
// We are likely done at this point.
|
|
489
107
|
if (!nextRequestId)
|
|
490
108
|
return null;
|
|
491
109
|
// This should never happen, but...
|
|
492
|
-
if (this.inProgress.has(nextRequestId) || this.recentlyHandled.get(nextRequestId)) {
|
|
110
|
+
if (this.inProgress.has(nextRequestId) || this.recentlyHandledRequestsCache.get(nextRequestId)) {
|
|
493
111
|
this.log.warning('Queue head returned a request that is already in progress?!', {
|
|
494
112
|
nextRequestId,
|
|
495
113
|
inProgress: this.inProgress.has(nextRequestId),
|
|
496
|
-
recentlyHandled: !!this.recentlyHandled.get(nextRequestId),
|
|
114
|
+
recentlyHandled: !!this.recentlyHandledRequestsCache.get(nextRequestId),
|
|
497
115
|
});
|
|
498
116
|
return null;
|
|
499
117
|
}
|
|
@@ -518,7 +136,7 @@ class RequestQueue {
|
|
|
518
136
|
this.log.debug('Cannot find a request from the beginning of queue, will be retried later', { nextRequestId });
|
|
519
137
|
setTimeout(() => {
|
|
520
138
|
this.inProgress.delete(nextRequestId);
|
|
521
|
-
}, exports.STORAGE_CONSISTENCY_DELAY_MILLIS);
|
|
139
|
+
}, utils_1.STORAGE_CONSISTENCY_DELAY_MILLIS);
|
|
522
140
|
return null;
|
|
523
141
|
}
|
|
524
142
|
// 2) Queue head index is behind the main table and the underlying request was already handled
|
|
@@ -527,125 +145,14 @@ class RequestQueue {
|
|
|
527
145
|
// will not put the request again to queueHeadDict.
|
|
528
146
|
if (request.handledAt) {
|
|
529
147
|
this.log.debug('Request fetched from the beginning of queue was already handled', { nextRequestId });
|
|
530
|
-
this.recentlyHandled.add(nextRequestId, true);
|
|
148
|
+
this.recentlyHandledRequestsCache.add(nextRequestId, true);
|
|
531
149
|
return null;
|
|
532
150
|
}
|
|
533
151
|
return request;
|
|
534
152
|
}
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
* {@apilink RequestQueue.fetchNextRequest}
|
|
538
|
-
* function as handled after successful processing.
|
|
539
|
-
* Handled requests will never again be returned by the `fetchNextRequest` function.
|
|
540
|
-
*/
|
|
541
|
-
async markRequestHandled(request) {
|
|
542
|
-
this.lastActivity = new Date();
|
|
543
|
-
(0, ow_1.default)(request, ow_1.default.object.partialShape({
|
|
544
|
-
id: ow_1.default.string,
|
|
545
|
-
uniqueKey: ow_1.default.string,
|
|
546
|
-
handledAt: ow_1.default.optional.string,
|
|
547
|
-
}));
|
|
548
|
-
if (!this.inProgress.has(request.id)) {
|
|
549
|
-
this.log.debug(`Cannot mark request ${request.id} as handled, because it is not in progress!`, { requestId: request.id });
|
|
550
|
-
return null;
|
|
551
|
-
}
|
|
552
|
-
const handledAt = request.handledAt ?? new Date().toISOString();
|
|
553
|
-
const queueOperationInfo = await this.client.updateRequest({ ...request, handledAt });
|
|
554
|
-
request.handledAt = handledAt;
|
|
555
|
-
queueOperationInfo.uniqueKey = request.uniqueKey;
|
|
556
|
-
this.inProgress.delete(request.id);
|
|
557
|
-
this.recentlyHandled.add(request.id, true);
|
|
558
|
-
if (!queueOperationInfo.wasAlreadyHandled) {
|
|
559
|
-
this.assumedHandledCount++;
|
|
560
|
-
}
|
|
561
|
-
this._cacheRequest(getRequestId(request.uniqueKey), queueOperationInfo);
|
|
562
|
-
return queueOperationInfo;
|
|
563
|
-
}
|
|
564
|
-
/**
|
|
565
|
-
* Reclaims a failed request back to the queue, so that it can be returned for processing later again
|
|
566
|
-
* by another call to {@apilink RequestQueue.fetchNextRequest}.
|
|
567
|
-
* The request record in the queue is updated using the provided `request` parameter.
|
|
568
|
-
* For example, this lets you store the number of retries or error messages for the request.
|
|
569
|
-
*/
|
|
570
|
-
async reclaimRequest(request, options = {}) {
|
|
571
|
-
this.lastActivity = new Date();
|
|
572
|
-
(0, ow_1.default)(request, ow_1.default.object.partialShape({
|
|
573
|
-
id: ow_1.default.string,
|
|
574
|
-
uniqueKey: ow_1.default.string,
|
|
575
|
-
}));
|
|
576
|
-
(0, ow_1.default)(options, ow_1.default.object.exactShape({
|
|
577
|
-
forefront: ow_1.default.optional.boolean,
|
|
578
|
-
}));
|
|
579
|
-
const { forefront = false } = options;
|
|
580
|
-
if (!this.inProgress.has(request.id)) {
|
|
581
|
-
this.log.debug(`Cannot reclaim request ${request.id}, because it is not in progress!`, { requestId: request.id });
|
|
582
|
-
return null;
|
|
583
|
-
}
|
|
584
|
-
// TODO: If request hasn't been changed since the last getRequest(),
|
|
585
|
-
// we don't need to call updateRequest() and thus improve performance.
|
|
586
|
-
const queueOperationInfo = await this.client.updateRequest(request, { forefront });
|
|
587
|
-
queueOperationInfo.uniqueKey = request.uniqueKey;
|
|
588
|
-
this._cacheRequest(getRequestId(request.uniqueKey), queueOperationInfo);
|
|
589
|
-
// Wait a little to increase a chance that the next call to fetchNextRequest() will return the request with updated data.
|
|
590
|
-
// This is to compensate for the limitation of DynamoDB, where writes might not be immediately visible to subsequent reads.
|
|
591
|
-
setTimeout(() => {
|
|
592
|
-
if (!this.inProgress.has(request.id)) {
|
|
593
|
-
this.log.debug('The request is no longer marked as in progress in the queue?!', { requestId: request.id });
|
|
594
|
-
return;
|
|
595
|
-
}
|
|
596
|
-
this.inProgress.delete(request.id);
|
|
597
|
-
// Performance optimization: add request straight to head if possible
|
|
598
|
-
this._maybeAddRequestToQueueHead(request.id, forefront);
|
|
599
|
-
}, exports.STORAGE_CONSISTENCY_DELAY_MILLIS);
|
|
600
|
-
return queueOperationInfo;
|
|
601
|
-
}
|
|
602
|
-
/**
|
|
603
|
-
* Resolves to `true` if the next call to {@apilink RequestQueue.fetchNextRequest}
|
|
604
|
-
* would return `null`, otherwise it resolves to `false`.
|
|
605
|
-
* Note that even if the queue is empty, there might be some pending requests currently being processed.
|
|
606
|
-
* If you need to ensure that there is no activity in the queue, use {@apilink RequestQueue.isFinished}.
|
|
607
|
-
*/
|
|
608
|
-
async isEmpty() {
|
|
153
|
+
async ensureHeadIsNonEmpty() {
|
|
154
|
+
// Alias for backwards compatibility
|
|
609
155
|
await this._ensureHeadIsNonEmpty();
|
|
610
|
-
return this.queueHeadDict.length() === 0;
|
|
611
|
-
}
|
|
612
|
-
/**
|
|
613
|
-
* Resolves to `true` if all requests were already handled and there are no more left.
|
|
614
|
-
* Due to the nature of distributed storage used by the queue,
|
|
615
|
-
* the function might occasionally return a false negative,
|
|
616
|
-
* but it will never return a false positive.
|
|
617
|
-
*/
|
|
618
|
-
async isFinished() {
|
|
619
|
-
if ((Date.now() - +this.lastActivity) > this.internalTimeoutMillis) {
|
|
620
|
-
const message = `The request queue seems to be stuck for ${this.internalTimeoutMillis / 1e3}s, resetting internal state.`;
|
|
621
|
-
this.log.warning(message, { inProgress: [...this.inProgress] });
|
|
622
|
-
this._reset();
|
|
623
|
-
}
|
|
624
|
-
if (this.queueHeadDict.length() > 0 || this.inProgressCount() > 0)
|
|
625
|
-
return false;
|
|
626
|
-
const isHeadConsistent = await this._ensureHeadIsNonEmpty(true);
|
|
627
|
-
return isHeadConsistent && this.queueHeadDict.length() === 0 && this.inProgressCount() === 0;
|
|
628
|
-
}
|
|
629
|
-
_reset() {
|
|
630
|
-
this.queueHeadDict.clear();
|
|
631
|
-
this.queryQueueHeadPromise = null;
|
|
632
|
-
this.inProgress.clear();
|
|
633
|
-
this.recentlyHandled.clear();
|
|
634
|
-
this.assumedTotalCount = 0;
|
|
635
|
-
this.assumedHandledCount = 0;
|
|
636
|
-
this.requestsCache.clear();
|
|
637
|
-
this.lastActivity = new Date();
|
|
638
|
-
}
|
|
639
|
-
/**
|
|
640
|
-
* Caches information about request to beware of unneeded addRequest() calls.
|
|
641
|
-
*/
|
|
642
|
-
_cacheRequest(cacheKey, queueOperationInfo) {
|
|
643
|
-
this.requestsCache.add(cacheKey, {
|
|
644
|
-
id: queueOperationInfo.requestId,
|
|
645
|
-
isHandled: queueOperationInfo.wasAlreadyHandled,
|
|
646
|
-
uniqueKey: queueOperationInfo.uniqueKey,
|
|
647
|
-
wasAlreadyHandled: queueOperationInfo.wasAlreadyHandled,
|
|
648
|
-
});
|
|
649
156
|
}
|
|
650
157
|
/**
|
|
651
158
|
* We always request more items than is in progress to ensure that something falls into head.
|
|
@@ -658,10 +165,15 @@ class RequestQueue {
|
|
|
658
165
|
* @param [iteration] Used when this function is called recursively to limit the recursion.
|
|
659
166
|
* @returns Indicates if queue head is consistent (true) or inconsistent (false).
|
|
660
167
|
*/
|
|
661
|
-
async _ensureHeadIsNonEmpty(ensureConsistency = false, limit = Math.max(this.inProgressCount() * exports.QUERY_HEAD_BUFFER, exports.QUERY_HEAD_MIN_LENGTH), iteration = 0) {
|
|
168
|
+
async _ensureHeadIsNonEmpty(ensureConsistency = false, limit = Math.max(this.inProgressCount() * utils_1.QUERY_HEAD_BUFFER, utils_1.QUERY_HEAD_MIN_LENGTH), iteration = 0) {
|
|
169
|
+
// If we are paused for migration, resolve immediately.
|
|
170
|
+
if (this.queuePausedForMigration) {
|
|
171
|
+
return true;
|
|
172
|
+
}
|
|
662
173
|
// If is nonempty resolve immediately.
|
|
663
|
-
if (this.queueHeadDict.length() > 0)
|
|
174
|
+
if (this.queueHeadIds.length() > 0) {
|
|
664
175
|
return true;
|
|
176
|
+
}
|
|
665
177
|
if (!this.queryQueueHeadPromise) {
|
|
666
178
|
const queryStartedAt = new Date();
|
|
667
179
|
this.queryQueueHeadPromise = this.client
|
|
@@ -669,10 +181,10 @@ class RequestQueue {
|
|
|
669
181
|
.then(({ items, queueModifiedAt, hadMultipleClients }) => {
|
|
670
182
|
items.forEach(({ id: requestId, uniqueKey }) => {
|
|
671
183
|
// Queue head index might be behind the main table, so ensure we don't recycle requests
|
|
672
|
-
if (!requestId || !uniqueKey || this.inProgress.has(requestId) || this.
|
|
184
|
+
if (!requestId || !uniqueKey || this.inProgress.has(requestId) || this.recentlyHandledRequestsCache.get(requestId))
|
|
673
185
|
return;
|
|
674
|
-
this.
|
|
675
|
-
this._cacheRequest(getRequestId(uniqueKey), {
|
|
186
|
+
this.queueHeadIds.add(requestId, requestId, false);
|
|
187
|
+
this._cacheRequest((0, utils_1.getRequestId)(uniqueKey), {
|
|
676
188
|
requestId,
|
|
677
189
|
wasAlreadyHandled: false,
|
|
678
190
|
wasAlreadyPresent: true,
|
|
@@ -701,13 +213,13 @@ class RequestQueue {
|
|
|
701
213
|
if (prevLimit >= consts_1.REQUEST_QUEUE_HEAD_MAX_LIMIT) {
|
|
702
214
|
this.log.warning(`Reached the maximum number of requests in progress: ${consts_1.REQUEST_QUEUE_HEAD_MAX_LIMIT}.`);
|
|
703
215
|
}
|
|
704
|
-
const shouldRepeatWithHigherLimit = this.
|
|
216
|
+
const shouldRepeatWithHigherLimit = this.queueHeadIds.length() === 0
|
|
705
217
|
&& wasLimitReached
|
|
706
218
|
&& prevLimit < consts_1.REQUEST_QUEUE_HEAD_MAX_LIMIT;
|
|
707
219
|
// If ensureConsistency=true then we must ensure that either:
|
|
708
220
|
// - queueModifiedAt is older than queryStartedAt by at least API_PROCESSED_REQUESTS_DELAY_MILLIS
|
|
709
221
|
// - hadMultipleClients=false and this.assumedTotalCount<=this.assumedHandledCount
|
|
710
|
-
const isDatabaseConsistent = +queryStartedAt - +queueModifiedAt >=
|
|
222
|
+
const isDatabaseConsistent = +queryStartedAt - +queueModifiedAt >= utils_1.API_PROCESSED_REQUESTS_DELAY_MILLIS;
|
|
711
223
|
const isLocallyConsistent = !hadMultipleClients && this.assumedTotalCount <= this.assumedHandledCount;
|
|
712
224
|
// Consistent information from one source is enough to consider request queue finished.
|
|
713
225
|
const shouldRepeatForConsistency = ensureConsistency && !isDatabaseConsistent && !isLocallyConsistent;
|
|
@@ -716,152 +228,57 @@ class RequestQueue {
|
|
|
716
228
|
return true;
|
|
717
229
|
// If we are querying for consistency then we limit the number of queries to MAX_QUERIES_FOR_CONSISTENCY.
|
|
718
230
|
// If this is reached then we return false so that empty() and finished() returns possibly false negative.
|
|
719
|
-
if (!shouldRepeatWithHigherLimit && iteration >
|
|
231
|
+
if (!shouldRepeatWithHigherLimit && iteration > utils_1.MAX_QUERIES_FOR_CONSISTENCY)
|
|
720
232
|
return false;
|
|
721
233
|
const nextLimit = shouldRepeatWithHigherLimit
|
|
722
234
|
? Math.round(prevLimit * 1.5)
|
|
723
235
|
: prevLimit;
|
|
724
236
|
// If we are repeating for consistency then wait required time.
|
|
725
237
|
if (shouldRepeatForConsistency) {
|
|
726
|
-
const delayMillis =
|
|
238
|
+
const delayMillis = utils_1.API_PROCESSED_REQUESTS_DELAY_MILLIS - (Date.now() - +queueModifiedAt);
|
|
727
239
|
this.log.info(`Waiting for ${delayMillis}ms before considering the queue as finished to ensure that the data is consistent.`);
|
|
728
240
|
await (0, promises_1.setTimeout)(delayMillis);
|
|
729
241
|
}
|
|
730
242
|
return this._ensureHeadIsNonEmpty(ensureConsistency, nextLimit, iteration + 1);
|
|
731
243
|
}
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
this.
|
|
738
|
-
}
|
|
739
|
-
else if (this.assumedTotalCount < exports.QUERY_HEAD_MIN_LENGTH) {
|
|
740
|
-
this.queueHeadDict.add(requestId, requestId, false);
|
|
244
|
+
// RequestQueue v1 behavior overrides below
|
|
245
|
+
async isFinished() {
|
|
246
|
+
if ((Date.now() - +this.lastActivity) > this.internalTimeoutMillis) {
|
|
247
|
+
const message = `The request queue seems to be stuck for ${this.internalTimeoutMillis / 1e3}s, resetting internal state.`;
|
|
248
|
+
this.log.warning(message, { inProgress: [...this.inProgress] });
|
|
249
|
+
this._reset();
|
|
741
250
|
}
|
|
251
|
+
if (this.queueHeadIds.length() > 0 || this.inProgressCount() > 0)
|
|
252
|
+
return false;
|
|
253
|
+
const isHeadConsistent = await this._ensureHeadIsNonEmpty(true);
|
|
254
|
+
return isHeadConsistent && this.queueHeadIds.length() === 0 && this.inProgressCount() === 0;
|
|
742
255
|
}
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
*/
|
|
747
|
-
async drop() {
|
|
748
|
-
await this.client.delete();
|
|
749
|
-
const manager = storage_manager_1.StorageManager.getManager(RequestQueue, this.config);
|
|
750
|
-
manager.closeStorage(this);
|
|
256
|
+
async addRequest(...args) {
|
|
257
|
+
this.lastActivity = new Date();
|
|
258
|
+
return super.addRequest(...args);
|
|
751
259
|
}
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
* This function is just a convenient shortcut for:
|
|
756
|
-
*
|
|
757
|
-
* ```javascript
|
|
758
|
-
* const { handledRequestCount } = await queue.getInfo();
|
|
759
|
-
* ```
|
|
760
|
-
*/
|
|
761
|
-
async handledCount() {
|
|
762
|
-
// NOTE: We keep this function for compatibility with RequestList.handledCount()
|
|
763
|
-
const { handledRequestCount } = await this.getInfo() ?? {};
|
|
764
|
-
return handledRequestCount ?? 0;
|
|
260
|
+
async addRequests(...args) {
|
|
261
|
+
this.lastActivity = new Date();
|
|
262
|
+
return super.addRequests(...args);
|
|
765
263
|
}
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
* The function returns the same object as the Apify API Client's
|
|
770
|
-
* [getQueue](https://docs.apify.com/api/apify-client-js/latest#ApifyClient-requestQueues)
|
|
771
|
-
* function, which in turn calls the
|
|
772
|
-
* [Get request queue](https://apify.com/docs/api/v2#/reference/request-queues/queue/get-request-queue)
|
|
773
|
-
* API endpoint.
|
|
774
|
-
*
|
|
775
|
-
* **Example:**
|
|
776
|
-
* ```
|
|
777
|
-
* {
|
|
778
|
-
* id: "WkzbQMuFYuamGv3YF",
|
|
779
|
-
* name: "my-queue",
|
|
780
|
-
* userId: "wRsJZtadYvn4mBZmm",
|
|
781
|
-
* createdAt: new Date("2015-12-12T07:34:14.202Z"),
|
|
782
|
-
* modifiedAt: new Date("2015-12-13T08:36:13.202Z"),
|
|
783
|
-
* accessedAt: new Date("2015-12-14T08:36:13.202Z"),
|
|
784
|
-
* totalRequestCount: 25,
|
|
785
|
-
* handledRequestCount: 5,
|
|
786
|
-
* pendingRequestCount: 20,
|
|
787
|
-
* }
|
|
788
|
-
* ```
|
|
789
|
-
*/
|
|
790
|
-
async getInfo() {
|
|
791
|
-
return this.client.get();
|
|
264
|
+
async addRequestsBatched(...args) {
|
|
265
|
+
this.lastActivity = new Date();
|
|
266
|
+
return super.addRequestsBatched(...args);
|
|
792
267
|
}
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
async _fetchRequestsFromUrl(source) {
|
|
797
|
-
const { requestsFromUrl, regex, ...sharedOpts } = source;
|
|
798
|
-
// Download remote resource and parse URLs.
|
|
799
|
-
let urlsArr;
|
|
800
|
-
try {
|
|
801
|
-
urlsArr = await this._downloadListOfUrls({ url: requestsFromUrl, urlRegExp: regex, proxyUrl: await this.proxyConfiguration?.newUrl() });
|
|
802
|
-
}
|
|
803
|
-
catch (err) {
|
|
804
|
-
throw new Error(`Cannot fetch a request list from ${requestsFromUrl}: ${err}`);
|
|
805
|
-
}
|
|
806
|
-
// Skip if resource contained no URLs.
|
|
807
|
-
if (!urlsArr.length) {
|
|
808
|
-
this.log.warning('list fetched, but it is empty.', { requestsFromUrl, regex });
|
|
809
|
-
return [];
|
|
810
|
-
}
|
|
811
|
-
return urlsArr.map((url) => ({ url, ...sharedOpts }));
|
|
268
|
+
async markRequestHandled(...args) {
|
|
269
|
+
this.lastActivity = new Date();
|
|
270
|
+
return super.markRequestHandled(...args);
|
|
812
271
|
}
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
async _addFetchedRequests(source, fetchedRequests, options) {
|
|
817
|
-
const { requestsFromUrl, regex } = source;
|
|
818
|
-
const { addedRequests } = await this.addRequestsBatched(fetchedRequests, options);
|
|
819
|
-
this.log.info('Fetched and loaded Requests from a remote resource.', {
|
|
820
|
-
requestsFromUrl,
|
|
821
|
-
regex,
|
|
822
|
-
fetchedCount: fetchedRequests.length,
|
|
823
|
-
importedCount: addedRequests.length,
|
|
824
|
-
duplicateCount: fetchedRequests.length - addedRequests.length,
|
|
825
|
-
sample: JSON.stringify(fetchedRequests.slice(0, 5)),
|
|
826
|
-
});
|
|
827
|
-
return addedRequests;
|
|
272
|
+
async reclaimRequest(...args) {
|
|
273
|
+
this.lastActivity = new Date();
|
|
274
|
+
return super.reclaimRequest(...args);
|
|
828
275
|
}
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
async _downloadListOfUrls(options) {
|
|
833
|
-
return (0, utils_1.downloadListOfUrls)(options);
|
|
276
|
+
_reset() {
|
|
277
|
+
super._reset();
|
|
278
|
+
this.lastActivity = new Date();
|
|
834
279
|
}
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
* of the {@apilink RequestQueue} class.
|
|
838
|
-
*
|
|
839
|
-
* {@apilink RequestQueue} represents a queue of URLs to crawl, which is stored either on local filesystem or in the cloud.
|
|
840
|
-
* The queue is used for deep crawling of websites, where you start with several URLs and then
|
|
841
|
-
* recursively follow links to other pages. The data structure supports both breadth-first
|
|
842
|
-
* and depth-first crawling orders.
|
|
843
|
-
*
|
|
844
|
-
* For more details and code examples, see the {@apilink RequestQueue} class.
|
|
845
|
-
*
|
|
846
|
-
* @param [queueIdOrName]
|
|
847
|
-
* ID or name of the request queue to be opened. If `null` or `undefined`,
|
|
848
|
-
* the function returns the default request queue associated with the crawler run.
|
|
849
|
-
* @param [options] Open Request Queue options.
|
|
850
|
-
*/
|
|
851
|
-
static async open(queueIdOrName, options = {}) {
|
|
852
|
-
(0, ow_1.default)(queueIdOrName, ow_1.default.optional.any(ow_1.default.string, ow_1.default.null));
|
|
853
|
-
(0, ow_1.default)(options, ow_1.default.object.exactShape({
|
|
854
|
-
config: ow_1.default.optional.object.instanceOf(configuration_1.Configuration),
|
|
855
|
-
storageClient: ow_1.default.optional.object,
|
|
856
|
-
proxyConfiguration: ow_1.default.optional.object,
|
|
857
|
-
}));
|
|
858
|
-
options.config ?? (options.config = configuration_1.Configuration.getGlobalConfig());
|
|
859
|
-
options.storageClient ?? (options.storageClient = options.config.getStorageClient());
|
|
860
|
-
await (0, utils_2.purgeDefaultStorages)({ onlyPurgeOnce: true, client: options.storageClient, config: options.config });
|
|
861
|
-
const manager = storage_manager_1.StorageManager.getManager(this, options.config);
|
|
862
|
-
const queue = await manager.openStorage(queueIdOrName, options.storageClient);
|
|
863
|
-
queue.proxyConfiguration = options.proxyConfiguration;
|
|
864
|
-
return queue;
|
|
280
|
+
static open(...args) {
|
|
281
|
+
return super.open(...args);
|
|
865
282
|
}
|
|
866
283
|
}
|
|
867
284
|
exports.RequestQueue = RequestQueue;
|