@crawlee/core 3.5.5-beta.8 → 3.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,68 +1,18 @@
1
1
  "use strict";
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
- exports.RequestQueue = exports.getRequestId = exports.STORAGE_CONSISTENCY_DELAY_MILLIS = exports.MAX_QUERIES_FOR_CONSISTENCY = exports.API_PROCESSED_REQUESTS_DELAY_MILLIS = exports.QUERY_HEAD_BUFFER = exports.QUERY_HEAD_MIN_LENGTH = void 0;
4
- const tslib_1 = require("tslib");
5
- const node_crypto_1 = tslib_1.__importDefault(require("node:crypto"));
3
+ exports.RequestQueue = void 0;
6
4
  const promises_1 = require("node:timers/promises");
7
5
  const consts_1 = require("@apify/consts");
8
- const datastructures_1 = require("@apify/datastructures");
9
- const utilities_1 = require("@apify/utilities");
10
- const utils_1 = require("@crawlee/utils");
11
- const ow_1 = tslib_1.__importDefault(require("ow"));
12
- const storage_manager_1 = require("./storage_manager");
13
- const utils_2 = require("./utils");
6
+ const request_provider_1 = require("./request_provider");
7
+ const utils_1 = require("./utils");
14
8
  const configuration_1 = require("../configuration");
15
- const log_1 = require("../log");
16
- const request_1 = require("../request");
17
9
  const MAX_CACHED_REQUESTS = 1000000;
18
- /**
19
- * When requesting queue head we always fetch requestsInProgressCount * QUERY_HEAD_BUFFER number of requests.
20
- * @internal
21
- */
22
- exports.QUERY_HEAD_MIN_LENGTH = 100;
23
- /** @internal */
24
- exports.QUERY_HEAD_BUFFER = 3;
25
- /**
26
- * If queue was modified (request added/updated/deleted) before more than API_PROCESSED_REQUESTS_DELAY_MILLIS
27
- * then we assume the get head operation to be consistent.
28
- * @internal
29
- */
30
- exports.API_PROCESSED_REQUESTS_DELAY_MILLIS = 10000;
31
- /**
32
- * How many times we try to get queue head with queueModifiedAt older than API_PROCESSED_REQUESTS_DELAY_MILLIS.
33
- * @internal
34
- */
35
- exports.MAX_QUERIES_FOR_CONSISTENCY = 6;
36
10
  /**
37
11
  * This number must be large enough so that processing of all these requests cannot be done in
38
12
  * a time lower than expected maximum latency of DynamoDB, but low enough not to waste too much memory.
39
13
  * @internal
40
14
  */
41
15
  const RECENTLY_HANDLED_CACHE_SIZE = 1000;
42
- /**
43
- * Indicates how long it usually takes for the underlying storage to propagate all writes
44
- * to be available to subsequent reads.
45
- * @internal
46
- */
47
- exports.STORAGE_CONSISTENCY_DELAY_MILLIS = 3000;
48
- /**
49
- * Helper function that creates ID from uniqueKey for local emulation of request queue.
50
- * It's also used for local cache of remote request queue.
51
- *
52
- * This function may not exactly match how requestId is created server side.
53
- * So we never pass requestId created by this to server and use it only for local cache.
54
- *
55
- * @internal
56
- */
57
- function getRequestId(uniqueKey) {
58
- const str = node_crypto_1.default
59
- .createHash('sha256')
60
- .update(uniqueKey)
61
- .digest('base64')
62
- .replace(/[+/=]/g, '');
63
- return str.substr(0, 15);
64
- }
65
- exports.getRequestId = getRequestId;
66
16
  /**
67
17
  * Represents a queue of URLs to crawl, which is used for deep crawling of websites
68
18
  * where you start with several URLs and then recursively
@@ -109,361 +59,29 @@ exports.getRequestId = getRequestId;
109
59
  * ```
110
60
  * @category Sources
111
61
  */
112
- class RequestQueue {
62
+ class RequestQueue extends request_provider_1.RequestProvider {
113
63
  /**
114
64
  * @internal
115
65
  */
116
66
  constructor(options, config = configuration_1.Configuration.getGlobalConfig()) {
117
- Object.defineProperty(this, "config", {
118
- enumerable: true,
119
- configurable: true,
120
- writable: true,
121
- value: config
122
- });
123
- Object.defineProperty(this, "log", {
124
- enumerable: true,
125
- configurable: true,
126
- writable: true,
127
- value: log_1.log.child({ prefix: 'RequestQueue' })
128
- });
129
- Object.defineProperty(this, "id", {
130
- enumerable: true,
131
- configurable: true,
132
- writable: true,
133
- value: void 0
134
- });
135
- Object.defineProperty(this, "name", {
136
- enumerable: true,
137
- configurable: true,
138
- writable: true,
139
- value: void 0
140
- });
141
- Object.defineProperty(this, "timeoutSecs", {
142
- enumerable: true,
143
- configurable: true,
144
- writable: true,
145
- value: 30
146
- });
147
- Object.defineProperty(this, "clientKey", {
148
- enumerable: true,
149
- configurable: true,
150
- writable: true,
151
- value: (0, utilities_1.cryptoRandomObjectId)()
152
- });
153
- Object.defineProperty(this, "client", {
154
- enumerable: true,
155
- configurable: true,
156
- writable: true,
157
- value: void 0
158
- });
159
- Object.defineProperty(this, "proxyConfiguration", {
160
- enumerable: true,
161
- configurable: true,
162
- writable: true,
163
- value: void 0
164
- });
165
- /**
166
- * Contains a cached list of request IDs from the head of the queue,
167
- * as obtained in the last query. Both key and value is the request ID.
168
- * Need to apply a type here to the generated TS types don't try to use types-apify
169
- */
170
- Object.defineProperty(this, "queueHeadDict", {
171
- enumerable: true,
172
- configurable: true,
173
- writable: true,
174
- value: new datastructures_1.ListDictionary()
175
- });
67
+ super({
68
+ ...options,
69
+ logPrefix: 'RequestQueue',
70
+ recentlyHandledRequestsMaxSize: RECENTLY_HANDLED_CACHE_SIZE,
71
+ requestCacheMaxSize: MAX_CACHED_REQUESTS,
72
+ }, config);
176
73
  Object.defineProperty(this, "queryQueueHeadPromise", {
177
74
  enumerable: true,
178
75
  configurable: true,
179
76
  writable: true,
180
77
  value: null
181
78
  });
182
- // A set of all request IDs that are currently being handled,
183
- // i.e. which were returned by fetchNextRequest() but not markRequestHandled()
184
- Object.defineProperty(this, "inProgress", {
185
- enumerable: true,
186
- configurable: true,
187
- writable: true,
188
- value: new Set()
189
- });
190
- // To track whether the queue gets stuck, and we need to reset it
191
- // `lastActivity` tracks the time when we either added, processed or reclaimed a request,
192
- // or when we add new request to in-progress cache
193
79
  Object.defineProperty(this, "lastActivity", {
194
80
  enumerable: true,
195
81
  configurable: true,
196
82
  writable: true,
197
83
  value: new Date()
198
84
  });
199
- Object.defineProperty(this, "internalTimeoutMillis", {
200
- enumerable: true,
201
- configurable: true,
202
- writable: true,
203
- value: 5 * 60e3
204
- }); // defaults to 5 minutes, will be overridden by BasicCrawler
205
- // Contains a list of recently handled requests. It is used to avoid inconsistencies
206
- // caused by delays in the underlying DynamoDB storage.
207
- // Keys are request IDs, values are true.
208
- Object.defineProperty(this, "recentlyHandled", {
209
- enumerable: true,
210
- configurable: true,
211
- writable: true,
212
- value: new datastructures_1.LruCache({ maxLength: RECENTLY_HANDLED_CACHE_SIZE })
213
- });
214
- // We can trust these numbers only in a case that queue is used by a single client.
215
- // This information is returned by getHead() under the hadMultipleClients property.
216
- Object.defineProperty(this, "assumedTotalCount", {
217
- enumerable: true,
218
- configurable: true,
219
- writable: true,
220
- value: 0
221
- });
222
- Object.defineProperty(this, "assumedHandledCount", {
223
- enumerable: true,
224
- configurable: true,
225
- writable: true,
226
- value: 0
227
- });
228
- // Caching requests to avoid redundant addRequest() calls.
229
- // Key is computed using getRequestId() and value is { id, isHandled }.
230
- Object.defineProperty(this, "requestsCache", {
231
- enumerable: true,
232
- configurable: true,
233
- writable: true,
234
- value: new datastructures_1.LruCache({ maxLength: MAX_CACHED_REQUESTS })
235
- });
236
- this.id = options.id;
237
- this.name = options.name;
238
- this.client = options.client.requestQueue(this.id, {
239
- clientKey: this.clientKey,
240
- timeoutSecs: this.timeoutSecs,
241
- });
242
- this.proxyConfiguration = options.proxyConfiguration;
243
- }
244
- /**
245
- * @ignore
246
- */
247
- inProgressCount() {
248
- return this.inProgress.size;
249
- }
250
- /**
251
- * Adds a request to the queue.
252
- *
253
- * If a request with the same `uniqueKey` property is already present in the queue,
254
- * it will not be updated. You can find out whether this happened from the resulting
255
- * {@apilink QueueOperationInfo} object.
256
- *
257
- * To add multiple requests to the queue by extracting links from a webpage,
258
- * see the {@apilink enqueueLinks} helper function.
259
- *
260
- * @param requestLike {@apilink Request} object or vanilla object with request data.
261
- * Note that the function sets the `uniqueKey` and `id` fields to the passed Request.
262
- * @param [options] Request queue operation options.
263
- */
264
- async addRequest(requestLike, options = {}) {
265
- (0, ow_1.default)(requestLike, ow_1.default.object);
266
- (0, ow_1.default)(options, ow_1.default.object.exactShape({
267
- forefront: ow_1.default.optional.boolean,
268
- }));
269
- this.lastActivity = new Date();
270
- const { forefront = false } = options;
271
- if ('requestsFromUrl' in requestLike) {
272
- const requests = await this._fetchRequestsFromUrl(requestLike);
273
- const processedRequests = await this._addFetchedRequests(requestLike, requests, options);
274
- return processedRequests[0];
275
- }
276
- (0, ow_1.default)(requestLike, ow_1.default.object.partialShape({
277
- url: ow_1.default.string,
278
- id: ow_1.default.undefined,
279
- }));
280
- const request = requestLike instanceof request_1.Request
281
- ? requestLike
282
- : new request_1.Request(requestLike);
283
- const cacheKey = getRequestId(request.uniqueKey);
284
- const cachedInfo = this.requestsCache.get(cacheKey);
285
- if (cachedInfo) {
286
- request.id = cachedInfo.id;
287
- return {
288
- wasAlreadyPresent: true,
289
- // We may assume that if request is in local cache then also the information if the
290
- // request was already handled is there because just one client should be using one queue.
291
- wasAlreadyHandled: cachedInfo.isHandled,
292
- requestId: cachedInfo.id,
293
- uniqueKey: cachedInfo.uniqueKey,
294
- };
295
- }
296
- const queueOperationInfo = await this.client.addRequest(request, { forefront });
297
- queueOperationInfo.uniqueKey = request.uniqueKey;
298
- const { requestId, wasAlreadyPresent } = queueOperationInfo;
299
- this._cacheRequest(cacheKey, queueOperationInfo);
300
- if (!wasAlreadyPresent && !this.inProgress.has(requestId) && !this.recentlyHandled.get(requestId)) {
301
- this.assumedTotalCount++;
302
- // Performance optimization: add request straight to head if possible
303
- this._maybeAddRequestToQueueHead(requestId, forefront);
304
- }
305
- return queueOperationInfo;
306
- }
307
- /**
308
- * Adds requests to the queue in batches of 25.
309
- *
310
- * If a request that is passed in is already present due to its `uniqueKey` property being the same,
311
- * it will not be updated. You can find out whether this happened by finding the request in the resulting
312
- * {@apilink BatchAddRequestsResult} object.
313
- *
314
- * @param requestsLike {@apilink Request} objects or vanilla objects with request data.
315
- * Note that the function sets the `uniqueKey` and `id` fields to the passed requests if missing.
316
- * @param [options] Request queue operation options.
317
- */
318
- async addRequests(requestsLike, options = {}) {
319
- (0, ow_1.default)(requestsLike, ow_1.default.array);
320
- (0, ow_1.default)(options, ow_1.default.object.exactShape({
321
- forefront: ow_1.default.optional.boolean,
322
- }));
323
- const { forefront = false } = options;
324
- const uniqueKeyToCacheKey = new Map();
325
- const getCachedRequestId = (uniqueKey) => {
326
- const cached = uniqueKeyToCacheKey.get(uniqueKey);
327
- if (cached)
328
- return cached;
329
- const newCacheKey = getRequestId(uniqueKey);
330
- uniqueKeyToCacheKey.set(uniqueKey, newCacheKey);
331
- return newCacheKey;
332
- };
333
- const results = {
334
- processedRequests: [],
335
- unprocessedRequests: [],
336
- };
337
- for (const requestLike of requestsLike) {
338
- if ('requestsFromUrl' in requestLike) {
339
- const requests = await this._fetchRequestsFromUrl(requestLike);
340
- await this._addFetchedRequests(requestLike, requests, options);
341
- }
342
- }
343
- const requests = requestsLike
344
- .filter((requestLike) => !('requestsFromUrl' in requestLike))
345
- .map((requestLike) => {
346
- return requestLike instanceof request_1.Request ? requestLike : new request_1.Request(requestLike);
347
- });
348
- const requestsToAdd = new Map();
349
- for (const request of requests) {
350
- const cacheKey = getCachedRequestId(request.uniqueKey);
351
- const cachedInfo = this.requestsCache.get(cacheKey);
352
- if (cachedInfo) {
353
- request.id = cachedInfo.id;
354
- results.processedRequests.push({
355
- wasAlreadyPresent: true,
356
- // We may assume that if request is in local cache then also the information if the
357
- // request was already handled is there because just one client should be using one queue.
358
- wasAlreadyHandled: cachedInfo.isHandled,
359
- requestId: cachedInfo.id,
360
- uniqueKey: cachedInfo.uniqueKey,
361
- });
362
- }
363
- else if (!requestsToAdd.has(request.uniqueKey)) {
364
- requestsToAdd.set(request.uniqueKey, request);
365
- }
366
- }
367
- // Early exit if all provided requests were already added
368
- if (!requestsToAdd.size) {
369
- return results;
370
- }
371
- const apiResults = await this.client.batchAddRequests([...requestsToAdd.values()], { forefront });
372
- // Report unprocessed requests
373
- results.unprocessedRequests = apiResults.unprocessedRequests;
374
- // Add all new requests to the queue head
375
- for (const newRequest of apiResults.processedRequests) {
376
- // Add the new request to the processed list
377
- results.processedRequests.push(newRequest);
378
- const cacheKey = getCachedRequestId(newRequest.uniqueKey);
379
- const { requestId, wasAlreadyPresent } = newRequest;
380
- this._cacheRequest(cacheKey, newRequest);
381
- if (!wasAlreadyPresent && !this.inProgress.has(requestId) && !this.recentlyHandled.get(requestId)) {
382
- this.assumedTotalCount++;
383
- // Performance optimization: add request straight to head if possible
384
- this._maybeAddRequestToQueueHead(requestId, forefront);
385
- }
386
- }
387
- return results;
388
- }
389
- /**
390
- * Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
391
- * adding the rest in background. You can configure the batch size via `batchSize` option and the sleep time in between
392
- * the batches via `waitBetweenBatchesMillis`. If you want to wait for all batches to be added to the queue, you can use
393
- * the `waitForAllRequestsToBeAdded` promise you get in the response object.
394
- *
395
- * @param requests The requests to add
396
- * @param options Options for the request queue
397
- */
398
- async addRequestsBatched(requests, options = {}) {
399
- (0, ow_1.default)(requests, ow_1.default.array.ofType(ow_1.default.any(ow_1.default.string, ow_1.default.object.partialShape({ url: ow_1.default.string, id: ow_1.default.undefined }), ow_1.default.object.partialShape({ requestsFromUrl: ow_1.default.string, regex: ow_1.default.optional.regExp }))));
400
- (0, ow_1.default)(options, ow_1.default.object.exactShape({
401
- forefront: ow_1.default.optional.boolean,
402
- waitForAllRequestsToBeAdded: ow_1.default.optional.boolean,
403
- batchSize: ow_1.default.optional.number,
404
- waitBetweenBatchesMillis: ow_1.default.optional.number,
405
- }));
406
- const { batchSize = 1000, waitBetweenBatchesMillis = 1000, } = options;
407
- const builtRequests = [];
408
- for (const opts of requests) {
409
- if (opts && typeof opts === 'object' && 'requestsFromUrl' in opts) {
410
- await this.addRequest(opts, { forefront: options.forefront });
411
- }
412
- else {
413
- builtRequests.push(new request_1.Request(typeof opts === 'string' ? { url: opts } : opts));
414
- }
415
- }
416
- const attemptToAddToQueueAndAddAnyUnprocessed = async (providedRequests) => {
417
- const resultsToReturn = [];
418
- const apiResult = await this.addRequests(providedRequests, { forefront: options.forefront });
419
- resultsToReturn.push(...apiResult.processedRequests);
420
- if (apiResult.unprocessedRequests.length) {
421
- await (0, promises_1.setTimeout)(waitBetweenBatchesMillis);
422
- resultsToReturn.push(...await attemptToAddToQueueAndAddAnyUnprocessed(providedRequests.filter((r) => !apiResult.processedRequests.some((pr) => pr.uniqueKey === r.uniqueKey))));
423
- }
424
- return resultsToReturn;
425
- };
426
- const initialChunk = builtRequests.splice(0, batchSize);
427
- // Add initial batch of `batchSize` to process them right away
428
- const addedRequests = await attemptToAddToQueueAndAddAnyUnprocessed(initialChunk);
429
- // If we have no more requests to add, return early
430
- if (!builtRequests.length) {
431
- return {
432
- addedRequests,
433
- waitForAllRequestsToBeAdded: Promise.resolve([]),
434
- };
435
- }
436
- // eslint-disable-next-line no-async-promise-executor
437
- const promise = new Promise(async (resolve) => {
438
- const chunks = (0, utils_1.chunk)(builtRequests, batchSize);
439
- const finalAddedRequests = [];
440
- for (const requestChunk of chunks) {
441
- finalAddedRequests.push(...await attemptToAddToQueueAndAddAnyUnprocessed(requestChunk));
442
- await (0, promises_1.setTimeout)(waitBetweenBatchesMillis);
443
- }
444
- resolve(finalAddedRequests);
445
- });
446
- // If the user wants to wait for all the requests to be added, we wait for the promise to resolve for them
447
- if (options.waitForAllRequestsToBeAdded) {
448
- addedRequests.push(...await promise);
449
- }
450
- return {
451
- addedRequests,
452
- waitForAllRequestsToBeAdded: promise,
453
- };
454
- }
455
- /**
456
- * Gets the request from the queue specified by ID.
457
- *
458
- * @param id ID of the request.
459
- * @returns Returns the request object, or `null` if it was not found.
460
- */
461
- async getRequest(id) {
462
- (0, ow_1.default)(id, ow_1.default.string);
463
- const requestOptions = await this.client.getRequest(id);
464
- if (!requestOptions)
465
- return null;
466
- return new request_1.Request(requestOptions);
467
85
  }
468
86
  /**
469
87
  * Returns a next request in the queue to be processed, or `null` if there are no more pending requests.
@@ -483,17 +101,17 @@ class RequestQueue {
483
101
  * Returns the request object or `null` if there are no more pending requests.
484
102
  */
485
103
  async fetchNextRequest() {
486
- await this._ensureHeadIsNonEmpty();
487
- const nextRequestId = this.queueHeadDict.removeFirst();
104
+ await this.ensureHeadIsNonEmpty();
105
+ const nextRequestId = this.queueHeadIds.removeFirst();
488
106
  // We are likely done at this point.
489
107
  if (!nextRequestId)
490
108
  return null;
491
109
  // This should never happen, but...
492
- if (this.inProgress.has(nextRequestId) || this.recentlyHandled.get(nextRequestId)) {
110
+ if (this.inProgress.has(nextRequestId) || this.recentlyHandledRequestsCache.get(nextRequestId)) {
493
111
  this.log.warning('Queue head returned a request that is already in progress?!', {
494
112
  nextRequestId,
495
113
  inProgress: this.inProgress.has(nextRequestId),
496
- recentlyHandled: !!this.recentlyHandled.get(nextRequestId),
114
+ recentlyHandled: !!this.recentlyHandledRequestsCache.get(nextRequestId),
497
115
  });
498
116
  return null;
499
117
  }
@@ -518,7 +136,7 @@ class RequestQueue {
518
136
  this.log.debug('Cannot find a request from the beginning of queue, will be retried later', { nextRequestId });
519
137
  setTimeout(() => {
520
138
  this.inProgress.delete(nextRequestId);
521
- }, exports.STORAGE_CONSISTENCY_DELAY_MILLIS);
139
+ }, utils_1.STORAGE_CONSISTENCY_DELAY_MILLIS);
522
140
  return null;
523
141
  }
524
142
  // 2) Queue head index is behind the main table and the underlying request was already handled
@@ -527,125 +145,14 @@ class RequestQueue {
527
145
  // will not put the request again to queueHeadDict.
528
146
  if (request.handledAt) {
529
147
  this.log.debug('Request fetched from the beginning of queue was already handled', { nextRequestId });
530
- this.recentlyHandled.add(nextRequestId, true);
148
+ this.recentlyHandledRequestsCache.add(nextRequestId, true);
531
149
  return null;
532
150
  }
533
151
  return request;
534
152
  }
535
- /**
536
- * Marks a request that was previously returned by the
537
- * {@apilink RequestQueue.fetchNextRequest}
538
- * function as handled after successful processing.
539
- * Handled requests will never again be returned by the `fetchNextRequest` function.
540
- */
541
- async markRequestHandled(request) {
542
- this.lastActivity = new Date();
543
- (0, ow_1.default)(request, ow_1.default.object.partialShape({
544
- id: ow_1.default.string,
545
- uniqueKey: ow_1.default.string,
546
- handledAt: ow_1.default.optional.string,
547
- }));
548
- if (!this.inProgress.has(request.id)) {
549
- this.log.debug(`Cannot mark request ${request.id} as handled, because it is not in progress!`, { requestId: request.id });
550
- return null;
551
- }
552
- const handledAt = request.handledAt ?? new Date().toISOString();
553
- const queueOperationInfo = await this.client.updateRequest({ ...request, handledAt });
554
- request.handledAt = handledAt;
555
- queueOperationInfo.uniqueKey = request.uniqueKey;
556
- this.inProgress.delete(request.id);
557
- this.recentlyHandled.add(request.id, true);
558
- if (!queueOperationInfo.wasAlreadyHandled) {
559
- this.assumedHandledCount++;
560
- }
561
- this._cacheRequest(getRequestId(request.uniqueKey), queueOperationInfo);
562
- return queueOperationInfo;
563
- }
564
- /**
565
- * Reclaims a failed request back to the queue, so that it can be returned for processing later again
566
- * by another call to {@apilink RequestQueue.fetchNextRequest}.
567
- * The request record in the queue is updated using the provided `request` parameter.
568
- * For example, this lets you store the number of retries or error messages for the request.
569
- */
570
- async reclaimRequest(request, options = {}) {
571
- this.lastActivity = new Date();
572
- (0, ow_1.default)(request, ow_1.default.object.partialShape({
573
- id: ow_1.default.string,
574
- uniqueKey: ow_1.default.string,
575
- }));
576
- (0, ow_1.default)(options, ow_1.default.object.exactShape({
577
- forefront: ow_1.default.optional.boolean,
578
- }));
579
- const { forefront = false } = options;
580
- if (!this.inProgress.has(request.id)) {
581
- this.log.debug(`Cannot reclaim request ${request.id}, because it is not in progress!`, { requestId: request.id });
582
- return null;
583
- }
584
- // TODO: If request hasn't been changed since the last getRequest(),
585
- // we don't need to call updateRequest() and thus improve performance.
586
- const queueOperationInfo = await this.client.updateRequest(request, { forefront });
587
- queueOperationInfo.uniqueKey = request.uniqueKey;
588
- this._cacheRequest(getRequestId(request.uniqueKey), queueOperationInfo);
589
- // Wait a little to increase a chance that the next call to fetchNextRequest() will return the request with updated data.
590
- // This is to compensate for the limitation of DynamoDB, where writes might not be immediately visible to subsequent reads.
591
- setTimeout(() => {
592
- if (!this.inProgress.has(request.id)) {
593
- this.log.debug('The request is no longer marked as in progress in the queue?!', { requestId: request.id });
594
- return;
595
- }
596
- this.inProgress.delete(request.id);
597
- // Performance optimization: add request straight to head if possible
598
- this._maybeAddRequestToQueueHead(request.id, forefront);
599
- }, exports.STORAGE_CONSISTENCY_DELAY_MILLIS);
600
- return queueOperationInfo;
601
- }
602
- /**
603
- * Resolves to `true` if the next call to {@apilink RequestQueue.fetchNextRequest}
604
- * would return `null`, otherwise it resolves to `false`.
605
- * Note that even if the queue is empty, there might be some pending requests currently being processed.
606
- * If you need to ensure that there is no activity in the queue, use {@apilink RequestQueue.isFinished}.
607
- */
608
- async isEmpty() {
153
+ async ensureHeadIsNonEmpty() {
154
+ // Alias for backwards compatibility
609
155
  await this._ensureHeadIsNonEmpty();
610
- return this.queueHeadDict.length() === 0;
611
- }
612
- /**
613
- * Resolves to `true` if all requests were already handled and there are no more left.
614
- * Due to the nature of distributed storage used by the queue,
615
- * the function might occasionally return a false negative,
616
- * but it will never return a false positive.
617
- */
618
- async isFinished() {
619
- if ((Date.now() - +this.lastActivity) > this.internalTimeoutMillis) {
620
- const message = `The request queue seems to be stuck for ${this.internalTimeoutMillis / 1e3}s, resetting internal state.`;
621
- this.log.warning(message, { inProgress: [...this.inProgress] });
622
- this._reset();
623
- }
624
- if (this.queueHeadDict.length() > 0 || this.inProgressCount() > 0)
625
- return false;
626
- const isHeadConsistent = await this._ensureHeadIsNonEmpty(true);
627
- return isHeadConsistent && this.queueHeadDict.length() === 0 && this.inProgressCount() === 0;
628
- }
629
- _reset() {
630
- this.queueHeadDict.clear();
631
- this.queryQueueHeadPromise = null;
632
- this.inProgress.clear();
633
- this.recentlyHandled.clear();
634
- this.assumedTotalCount = 0;
635
- this.assumedHandledCount = 0;
636
- this.requestsCache.clear();
637
- this.lastActivity = new Date();
638
- }
639
- /**
640
- * Caches information about request to beware of unneeded addRequest() calls.
641
- */
642
- _cacheRequest(cacheKey, queueOperationInfo) {
643
- this.requestsCache.add(cacheKey, {
644
- id: queueOperationInfo.requestId,
645
- isHandled: queueOperationInfo.wasAlreadyHandled,
646
- uniqueKey: queueOperationInfo.uniqueKey,
647
- wasAlreadyHandled: queueOperationInfo.wasAlreadyHandled,
648
- });
649
156
  }
650
157
  /**
651
158
  * We always request more items than is in progress to ensure that something falls into head.
@@ -658,10 +165,15 @@ class RequestQueue {
658
165
  * @param [iteration] Used when this function is called recursively to limit the recursion.
659
166
  * @returns Indicates if queue head is consistent (true) or inconsistent (false).
660
167
  */
661
- async _ensureHeadIsNonEmpty(ensureConsistency = false, limit = Math.max(this.inProgressCount() * exports.QUERY_HEAD_BUFFER, exports.QUERY_HEAD_MIN_LENGTH), iteration = 0) {
168
+ async _ensureHeadIsNonEmpty(ensureConsistency = false, limit = Math.max(this.inProgressCount() * utils_1.QUERY_HEAD_BUFFER, utils_1.QUERY_HEAD_MIN_LENGTH), iteration = 0) {
169
+ // If we are paused for migration, resolve immediately.
170
+ if (this.queuePausedForMigration) {
171
+ return true;
172
+ }
662
173
  // If is nonempty resolve immediately.
663
- if (this.queueHeadDict.length() > 0)
174
+ if (this.queueHeadIds.length() > 0) {
664
175
  return true;
176
+ }
665
177
  if (!this.queryQueueHeadPromise) {
666
178
  const queryStartedAt = new Date();
667
179
  this.queryQueueHeadPromise = this.client
@@ -669,10 +181,10 @@ class RequestQueue {
669
181
  .then(({ items, queueModifiedAt, hadMultipleClients }) => {
670
182
  items.forEach(({ id: requestId, uniqueKey }) => {
671
183
  // Queue head index might be behind the main table, so ensure we don't recycle requests
672
- if (!requestId || !uniqueKey || this.inProgress.has(requestId) || this.recentlyHandled.get(requestId))
184
+ if (!requestId || !uniqueKey || this.inProgress.has(requestId) || this.recentlyHandledRequestsCache.get(requestId))
673
185
  return;
674
- this.queueHeadDict.add(requestId, requestId, false);
675
- this._cacheRequest(getRequestId(uniqueKey), {
186
+ this.queueHeadIds.add(requestId, requestId, false);
187
+ this._cacheRequest((0, utils_1.getRequestId)(uniqueKey), {
676
188
  requestId,
677
189
  wasAlreadyHandled: false,
678
190
  wasAlreadyPresent: true,
@@ -701,13 +213,13 @@ class RequestQueue {
701
213
  if (prevLimit >= consts_1.REQUEST_QUEUE_HEAD_MAX_LIMIT) {
702
214
  this.log.warning(`Reached the maximum number of requests in progress: ${consts_1.REQUEST_QUEUE_HEAD_MAX_LIMIT}.`);
703
215
  }
704
- const shouldRepeatWithHigherLimit = this.queueHeadDict.length() === 0
216
+ const shouldRepeatWithHigherLimit = this.queueHeadIds.length() === 0
705
217
  && wasLimitReached
706
218
  && prevLimit < consts_1.REQUEST_QUEUE_HEAD_MAX_LIMIT;
707
219
  // If ensureConsistency=true then we must ensure that either:
708
220
  // - queueModifiedAt is older than queryStartedAt by at least API_PROCESSED_REQUESTS_DELAY_MILLIS
709
221
  // - hadMultipleClients=false and this.assumedTotalCount<=this.assumedHandledCount
710
- const isDatabaseConsistent = +queryStartedAt - +queueModifiedAt >= exports.API_PROCESSED_REQUESTS_DELAY_MILLIS;
222
+ const isDatabaseConsistent = +queryStartedAt - +queueModifiedAt >= utils_1.API_PROCESSED_REQUESTS_DELAY_MILLIS;
711
223
  const isLocallyConsistent = !hadMultipleClients && this.assumedTotalCount <= this.assumedHandledCount;
712
224
  // Consistent information from one source is enough to consider request queue finished.
713
225
  const shouldRepeatForConsistency = ensureConsistency && !isDatabaseConsistent && !isLocallyConsistent;
@@ -716,152 +228,57 @@ class RequestQueue {
716
228
  return true;
717
229
  // If we are querying for consistency then we limit the number of queries to MAX_QUERIES_FOR_CONSISTENCY.
718
230
  // If this is reached then we return false so that empty() and finished() returns possibly false negative.
719
- if (!shouldRepeatWithHigherLimit && iteration > exports.MAX_QUERIES_FOR_CONSISTENCY)
231
+ if (!shouldRepeatWithHigherLimit && iteration > utils_1.MAX_QUERIES_FOR_CONSISTENCY)
720
232
  return false;
721
233
  const nextLimit = shouldRepeatWithHigherLimit
722
234
  ? Math.round(prevLimit * 1.5)
723
235
  : prevLimit;
724
236
  // If we are repeating for consistency then wait required time.
725
237
  if (shouldRepeatForConsistency) {
726
- const delayMillis = exports.API_PROCESSED_REQUESTS_DELAY_MILLIS - (Date.now() - +queueModifiedAt);
238
+ const delayMillis = utils_1.API_PROCESSED_REQUESTS_DELAY_MILLIS - (Date.now() - +queueModifiedAt);
727
239
  this.log.info(`Waiting for ${delayMillis}ms before considering the queue as finished to ensure that the data is consistent.`);
728
240
  await (0, promises_1.setTimeout)(delayMillis);
729
241
  }
730
242
  return this._ensureHeadIsNonEmpty(ensureConsistency, nextLimit, iteration + 1);
731
243
  }
732
- /**
733
- * Adds a request straight to the queueHeadDict, to improve performance.
734
- */
735
- _maybeAddRequestToQueueHead(requestId, forefront) {
736
- if (forefront) {
737
- this.queueHeadDict.add(requestId, requestId, true);
738
- }
739
- else if (this.assumedTotalCount < exports.QUERY_HEAD_MIN_LENGTH) {
740
- this.queueHeadDict.add(requestId, requestId, false);
244
+ // RequestQueue v1 behavior overrides below
245
+ async isFinished() {
246
+ if ((Date.now() - +this.lastActivity) > this.internalTimeoutMillis) {
247
+ const message = `The request queue seems to be stuck for ${this.internalTimeoutMillis / 1e3}s, resetting internal state.`;
248
+ this.log.warning(message, { inProgress: [...this.inProgress] });
249
+ this._reset();
741
250
  }
251
+ if (this.queueHeadIds.length() > 0 || this.inProgressCount() > 0)
252
+ return false;
253
+ const isHeadConsistent = await this._ensureHeadIsNonEmpty(true);
254
+ return isHeadConsistent && this.queueHeadIds.length() === 0 && this.inProgressCount() === 0;
742
255
  }
743
- /**
744
- * Removes the queue either from the Apify Cloud storage or from the local database,
745
- * depending on the mode of operation.
746
- */
747
- async drop() {
748
- await this.client.delete();
749
- const manager = storage_manager_1.StorageManager.getManager(RequestQueue, this.config);
750
- manager.closeStorage(this);
256
+ async addRequest(...args) {
257
+ this.lastActivity = new Date();
258
+ return super.addRequest(...args);
751
259
  }
752
- /**
753
- * Returns the number of handled requests.
754
- *
755
- * This function is just a convenient shortcut for:
756
- *
757
- * ```javascript
758
- * const { handledRequestCount } = await queue.getInfo();
759
- * ```
760
- */
761
- async handledCount() {
762
- // NOTE: We keep this function for compatibility with RequestList.handledCount()
763
- const { handledRequestCount } = await this.getInfo() ?? {};
764
- return handledRequestCount ?? 0;
260
+ async addRequests(...args) {
261
+ this.lastActivity = new Date();
262
+ return super.addRequests(...args);
765
263
  }
766
- /**
767
- * Returns an object containing general information about the request queue.
768
- *
769
- * The function returns the same object as the Apify API Client's
770
- * [getQueue](https://docs.apify.com/api/apify-client-js/latest#ApifyClient-requestQueues)
771
- * function, which in turn calls the
772
- * [Get request queue](https://apify.com/docs/api/v2#/reference/request-queues/queue/get-request-queue)
773
- * API endpoint.
774
- *
775
- * **Example:**
776
- * ```
777
- * {
778
- * id: "WkzbQMuFYuamGv3YF",
779
- * name: "my-queue",
780
- * userId: "wRsJZtadYvn4mBZmm",
781
- * createdAt: new Date("2015-12-12T07:34:14.202Z"),
782
- * modifiedAt: new Date("2015-12-13T08:36:13.202Z"),
783
- * accessedAt: new Date("2015-12-14T08:36:13.202Z"),
784
- * totalRequestCount: 25,
785
- * handledRequestCount: 5,
786
- * pendingRequestCount: 20,
787
- * }
788
- * ```
789
- */
790
- async getInfo() {
791
- return this.client.get();
264
+ async addRequestsBatched(...args) {
265
+ this.lastActivity = new Date();
266
+ return super.addRequestsBatched(...args);
792
267
  }
793
- /**
794
- * Fetches URLs from requestsFromUrl and returns them in format of list of requests
795
- */
796
- async _fetchRequestsFromUrl(source) {
797
- const { requestsFromUrl, regex, ...sharedOpts } = source;
798
- // Download remote resource and parse URLs.
799
- let urlsArr;
800
- try {
801
- urlsArr = await this._downloadListOfUrls({ url: requestsFromUrl, urlRegExp: regex, proxyUrl: await this.proxyConfiguration?.newUrl() });
802
- }
803
- catch (err) {
804
- throw new Error(`Cannot fetch a request list from ${requestsFromUrl}: ${err}`);
805
- }
806
- // Skip if resource contained no URLs.
807
- if (!urlsArr.length) {
808
- this.log.warning('list fetched, but it is empty.', { requestsFromUrl, regex });
809
- return [];
810
- }
811
- return urlsArr.map((url) => ({ url, ...sharedOpts }));
268
+ async markRequestHandled(...args) {
269
+ this.lastActivity = new Date();
270
+ return super.markRequestHandled(...args);
812
271
  }
813
- /**
814
- * Adds all fetched requests from a URL from a remote resource.
815
- */
816
- async _addFetchedRequests(source, fetchedRequests, options) {
817
- const { requestsFromUrl, regex } = source;
818
- const { addedRequests } = await this.addRequestsBatched(fetchedRequests, options);
819
- this.log.info('Fetched and loaded Requests from a remote resource.', {
820
- requestsFromUrl,
821
- regex,
822
- fetchedCount: fetchedRequests.length,
823
- importedCount: addedRequests.length,
824
- duplicateCount: fetchedRequests.length - addedRequests.length,
825
- sample: JSON.stringify(fetchedRequests.slice(0, 5)),
826
- });
827
- return addedRequests;
272
+ async reclaimRequest(...args) {
273
+ this.lastActivity = new Date();
274
+ return super.reclaimRequest(...args);
828
275
  }
829
- /**
830
- * @internal wraps public utility for mocking purposes
831
- */
832
- async _downloadListOfUrls(options) {
833
- return (0, utils_1.downloadListOfUrls)(options);
276
+ _reset() {
277
+ super._reset();
278
+ this.lastActivity = new Date();
834
279
  }
835
- /**
836
- * Opens a request queue and returns a promise resolving to an instance
837
- * of the {@apilink RequestQueue} class.
838
- *
839
- * {@apilink RequestQueue} represents a queue of URLs to crawl, which is stored either on local filesystem or in the cloud.
840
- * The queue is used for deep crawling of websites, where you start with several URLs and then
841
- * recursively follow links to other pages. The data structure supports both breadth-first
842
- * and depth-first crawling orders.
843
- *
844
- * For more details and code examples, see the {@apilink RequestQueue} class.
845
- *
846
- * @param [queueIdOrName]
847
- * ID or name of the request queue to be opened. If `null` or `undefined`,
848
- * the function returns the default request queue associated with the crawler run.
849
- * @param [options] Open Request Queue options.
850
- */
851
- static async open(queueIdOrName, options = {}) {
852
- (0, ow_1.default)(queueIdOrName, ow_1.default.optional.any(ow_1.default.string, ow_1.default.null));
853
- (0, ow_1.default)(options, ow_1.default.object.exactShape({
854
- config: ow_1.default.optional.object.instanceOf(configuration_1.Configuration),
855
- storageClient: ow_1.default.optional.object,
856
- proxyConfiguration: ow_1.default.optional.object,
857
- }));
858
- options.config ?? (options.config = configuration_1.Configuration.getGlobalConfig());
859
- options.storageClient ?? (options.storageClient = options.config.getStorageClient());
860
- await (0, utils_2.purgeDefaultStorages)({ onlyPurgeOnce: true, client: options.storageClient, config: options.config });
861
- const manager = storage_manager_1.StorageManager.getManager(this, options.config);
862
- const queue = await manager.openStorage(queueIdOrName, options.storageClient);
863
- queue.proxyConfiguration = options.proxyConfiguration;
864
- return queue;
280
+ static open(...args) {
281
+ return super.open(...args);
865
282
  }
866
283
  }
867
284
  exports.RequestQueue = RequestQueue;