@crawlee/core 4.0.0-beta.64 → 4.0.0-beta.66

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53):
  1. package/crawlers/crawler_commons.d.ts +3 -3
  2. package/crawlers/crawler_commons.d.ts.map +1 -1
  3. package/enqueue_links/enqueue_links.d.ts +7 -6
  4. package/enqueue_links/enqueue_links.d.ts.map +1 -1
  5. package/enqueue_links/enqueue_links.js +4 -4
  6. package/enqueue_links/enqueue_links.js.map +1 -1
  7. package/package.json +5 -5
  8. package/storages/index.d.ts +4 -6
  9. package/storages/index.d.ts.map +1 -1
  10. package/storages/index.js +2 -6
  11. package/storages/index.js.map +1 -1
  12. package/storages/request_list.d.ts +23 -72
  13. package/storages/request_list.d.ts.map +1 -1
  14. package/storages/request_list.js +34 -29
  15. package/storages/request_list.js.map +1 -1
  16. package/storages/request_loader.d.ts +97 -0
  17. package/storages/request_loader.d.ts.map +1 -0
  18. package/storages/request_loader.js +2 -0
  19. package/storages/request_loader.js.map +1 -0
  20. package/storages/request_manager.d.ts +34 -0
  21. package/storages/request_manager.d.ts.map +1 -0
  22. package/storages/request_manager.js +2 -0
  23. package/storages/request_manager.js.map +1 -0
  24. package/storages/request_manager_tandem.d.ts +56 -17
  25. package/storages/request_manager_tandem.d.ts.map +1 -1
  26. package/storages/request_manager_tandem.js +114 -41
  27. package/storages/request_manager_tandem.js.map +1 -1
  28. package/storages/request_queue.d.ts +276 -44
  29. package/storages/request_queue.d.ts.map +1 -1
  30. package/storages/request_queue.js +576 -212
  31. package/storages/request_queue.js.map +1 -1
  32. package/storages/{sitemap_request_list.d.ts → sitemap_request_loader.d.ts} +24 -19
  33. package/storages/sitemap_request_loader.d.ts.map +1 -0
  34. package/storages/{sitemap_request_list.js → sitemap_request_loader.js} +41 -40
  35. package/storages/sitemap_request_loader.js.map +1 -0
  36. package/validators.d.ts +4 -0
  37. package/validators.d.ts.map +1 -1
  38. package/validators.js +4 -0
  39. package/validators.js.map +1 -1
  40. package/storages/request_list_adapter.d.ts +0 -58
  41. package/storages/request_list_adapter.d.ts.map +0 -1
  42. package/storages/request_list_adapter.js +0 -81
  43. package/storages/request_list_adapter.js.map +0 -1
  44. package/storages/request_provider.d.ts +0 -384
  45. package/storages/request_provider.d.ts.map +0 -1
  46. package/storages/request_provider.js +0 -624
  47. package/storages/request_provider.js.map +0 -1
  48. package/storages/request_queue_v2.d.ts +0 -87
  49. package/storages/request_queue_v2.d.ts.map +0 -1
  50. package/storages/request_queue_v2.js +0 -437
  51. package/storages/request_queue_v2.js.map +0 -1
  52. package/storages/sitemap_request_list.d.ts.map +0 -1
  53. package/storages/sitemap_request_list.js.map +0 -1
@@ -1,16 +1,19 @@
1
- import { setTimeout as sleep } from 'node:timers/promises';
2
- import { REQUEST_QUEUE_HEAD_MAX_LIMIT } from '@apify/consts';
1
+ import { inspect } from 'node:util';
2
+ import { chunkedAsyncIterable, downloadListOfUrls, getObjectType, isAsyncIterable, isIterable, peekableAsyncIterable, sleep, } from '@crawlee/utils';
3
+ import ow from 'ow';
4
+ import { LruCache } from '@apify/datastructures';
5
+ import { cryptoRandomObjectId } from '@apify/utilities';
6
+ import { Configuration } from '../configuration.js';
7
+ import { Request } from '../request.js';
3
8
  import { serviceLocator } from '../service_locator.js';
4
9
  import { checkStorageAccess } from './access_checking.js';
5
- import { RequestProvider } from './request_provider.js';
6
- import { API_PROCESSED_REQUESTS_DELAY_MILLIS, getRequestId, MAX_QUERIES_FOR_CONSISTENCY, QUERY_HEAD_BUFFER, QUERY_HEAD_MIN_LENGTH, STORAGE_CONSISTENCY_DELAY_MILLIS, } from './utils.js';
7
- const MAX_CACHED_REQUESTS = 1_000_000;
10
+ import { resolveStorageIdentifier } from './storage_instance_manager.js';
11
+ import { getRequestId, purgeDefaultStorages } from './utils.js';
8
12
  /**
9
- * This number must be large enough so that processing of all these requests cannot be done in
10
- * a time lower than expected maximum latency of DynamoDB, but low enough not to waste too much memory.
13
+ * The maximum number of requests cached locally to avoid redundant calls to the storage client.
11
14
  * @internal
12
15
  */
13
- const RECENTLY_HANDLED_CACHE_SIZE = 1000;
16
+ const MAX_CACHED_REQUESTS = 2_000_000;
14
17
  /**
15
18
  * Represents a queue of URLs to crawl, which is used for deep crawling of websites
16
19
  * where you start with several URLs and then recursively
@@ -29,18 +32,6 @@ const RECENTLY_HANDLED_CACHE_SIZE = 1000;
29
32
  * Unlike {@link RequestList}, `RequestQueue` supports dynamic adding and removing of requests.
30
33
  * On the other hand, the queue is not optimized for operations that add or remove a large number of URLs in a batch.
31
34
  *
32
- * `RequestQueue` stores its data either on local disk or in the Apify Cloud,
33
- * depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variable is set.
34
- *
35
- * If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the queue data is stored in
36
- * that directory in an SQLite database file.
37
- *
38
- * If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the
39
- * [Apify Request Queue](https://docs.apify.com/storage/request-queue)
40
- * cloud storage. Note that you can force usage of the cloud storage also by passing the `forceCloud`
41
- * option to {@link RequestQueue.open} function,
42
- * even if the `APIFY_LOCAL_STORAGE_DIR` variable is set.
43
- *
44
35
  * **Example usage:**
45
36
  *
46
37
  * ```javascript
@@ -57,31 +48,316 @@ const RECENTLY_HANDLED_CACHE_SIZE = 1000;
57
48
  * ```
58
49
  * @category Sources
59
50
  */
60
- class RequestQueue extends RequestProvider {
61
- queryQueueHeadPromise = null;
62
- inProgress = new Set();
51
+ export class RequestQueue {
52
+ config;
53
+ id;
54
+ name;
55
+ timeoutSecs = 30;
56
+ clientKey = cryptoRandomObjectId();
57
+ client;
58
+ proxyConfiguration;
59
+ log;
60
+ isInitialized = false;
61
+ requestCache;
62
+ queuePausedForMigration = false;
63
+ inProgressRequestBatchCount = 0;
64
+ /**
65
+ * The largest expected request-processing time (in seconds) seen so far via
66
+ * {@link setExpectedRequestProcessingTimeSecs}. Used to ensure that value is only ever raised, never
67
+ * lowered, before being forwarded to the storage client.
68
+ */
69
+ expectedRequestProcessingSecs = 0;
70
+ httpClient;
71
+ events;
63
72
  /**
64
73
  * @internal
65
74
  */
66
- constructor(options, config = serviceLocator.getConfiguration()) {
67
- super({
68
- ...options,
69
- logPrefix: 'RequestQueue',
70
- recentlyHandledRequestsMaxSize: RECENTLY_HANDLED_CACHE_SIZE,
71
- requestCacheMaxSize: MAX_CACHED_REQUESTS,
72
- }, config);
75
+ constructor(options, config = Configuration.getGlobalConfig()) {
76
+ this.config = config;
77
+ this.id = options.id;
78
+ this.name = options.name;
79
+ this.events = serviceLocator.getEventManager();
80
+ this.client = options.client;
81
+ this.proxyConfiguration = options.proxyConfiguration;
82
+ this.requestCache = new LruCache({ maxLength: MAX_CACHED_REQUESTS });
83
+ this.log = serviceLocator.getLogger().child({ prefix: `RequestQueue(${this.id}, ${this.name ?? 'no-name'})` });
84
+ this.events.on("migrating" /* EventType.MIGRATING */, async () => {
85
+ this.queuePausedForMigration = true;
86
+ });
73
87
  }
74
88
  /**
75
- * @internal
89
+ * Returns the total number of requests in the queue (i.e. pending + handled).
90
+ *
91
+ * Survives restarts and actor migrations.
92
+ */
93
+ async getTotalCount() {
94
+ const { totalRequestCount } = await this.getInfo();
95
+ return totalRequestCount;
96
+ }
97
+ /**
98
+ * Returns the total number of pending requests in the queue.
99
+ *
100
+ * Survives restarts and Actor migrations.
101
+ */
102
+ async getPendingCount() {
103
+ const { totalRequestCount, handledRequestCount } = await this.getInfo();
104
+ return totalRequestCount - handledRequestCount;
105
+ }
106
+ /**
107
+ * Adds a request to the queue.
108
+ *
109
+ * If a request with the same `uniqueKey` property is already present in the queue,
110
+ * it will not be updated. You can find out whether this happened from the resulting
111
+ * {@link QueueOperationInfo} object.
112
+ *
113
+ * To add multiple requests to the queue by extracting links from a webpage,
114
+ * see the {@link enqueueLinks} helper function.
115
+ *
116
+ * @param requestLike {@link Request} object or vanilla object with request data.
117
+ * Note that the function sets the `uniqueKey` and `id` fields to the passed Request.
118
+ * @param [options] Request queue operation options.
119
+ */
120
+ async addRequest(requestLike, options = {}) {
121
+ checkStorageAccess();
122
+ ow(requestLike, ow.object);
123
+ ow(options, ow.object.exactShape({
124
+ forefront: ow.optional.boolean,
125
+ }));
126
+ const { forefront = false } = options;
127
+ if ('requestsFromUrl' in requestLike) {
128
+ const requests = await this._fetchRequestsFromUrl(requestLike);
129
+ const processedRequests = await this._addFetchedRequests(requestLike, requests, options);
130
+ return { ...processedRequests[0], forefront };
131
+ }
132
+ ow(requestLike, ow.object.partialShape({
133
+ url: ow.string,
134
+ id: ow.undefined,
135
+ }));
136
+ const request = requestLike instanceof Request ? requestLike : new Request(requestLike);
137
+ const cacheKey = getRequestId(request.uniqueKey);
138
+ const cachedInfo = this.requestCache.get(cacheKey);
139
+ if (cachedInfo) {
140
+ request.id = cachedInfo.id;
141
+ return {
142
+ wasAlreadyPresent: true,
143
+ // We may assume that if request is in local cache then also the information if the
144
+ // request was already handled is there because just one client should be using one queue.
145
+ wasAlreadyHandled: cachedInfo.isHandled,
146
+ requestId: cachedInfo.id,
147
+ uniqueKey: cachedInfo.uniqueKey,
148
+ forefront,
149
+ };
150
+ }
151
+ const { processedRequests } = await this.client.addBatchOfRequests([request], { forefront });
152
+ const queueOperationInfo = {
153
+ ...processedRequests[0],
154
+ uniqueKey: request.uniqueKey,
155
+ forefront,
156
+ };
157
+ this._cacheRequest(cacheKey, queueOperationInfo);
158
+ return queueOperationInfo;
159
+ }
160
+ /**
161
+ * Adds requests to the queue in batches of 25. This method will wait till all the requests are added
162
+ * to the queue before resolving. You should prefer using `queue.addRequestsBatched()` or `crawler.addRequests()`
163
+ * if you don't want to block the processing, as those methods will only wait for the initial 1000 requests,
164
+ * start processing right after that happens, and continue adding more in the background.
165
+ *
166
+ * If a request passed in is already present due to its `uniqueKey` property being the same,
167
+ * it will not be updated. You can find out whether this happened by finding the request in the resulting
168
+ * {@link BatchAddRequestsResult} object.
169
+ *
170
+ * @param requestsLike {@link Request} objects or vanilla objects with request data.
171
+ * Note that the function sets the `uniqueKey` and `id` fields to the passed requests if missing.
172
+ * @param [options] Request queue operation options.
173
+ */
174
+ async addRequests(requestsLike, options = {}) {
175
+ checkStorageAccess();
176
+ ow(requestsLike, ow.object
177
+ .is((value) => isIterable(value) || isAsyncIterable(value))
178
+ .message((value) => `Expected an iterable or async iterable, got ${getObjectType(value)}`));
179
+ ow(options, ow.object.exactShape({
180
+ forefront: ow.optional.boolean,
181
+ cache: ow.optional.boolean,
182
+ }));
183
+ const { forefront = false, cache = true } = options;
184
+ const uniqueKeyToCacheKey = new Map();
185
+ const getCachedRequestId = (uniqueKey) => {
186
+ const cached = uniqueKeyToCacheKey.get(uniqueKey);
187
+ if (cached)
188
+ return cached;
189
+ const newCacheKey = getRequestId(uniqueKey);
190
+ uniqueKeyToCacheKey.set(uniqueKey, newCacheKey);
191
+ return newCacheKey;
192
+ };
193
+ const results = {
194
+ processedRequests: [],
195
+ unprocessedRequests: [],
196
+ };
197
+ const requests = [];
198
+ for await (const requestLike of requestsLike) {
199
+ if (typeof requestLike === 'string') {
200
+ requests.push(new Request({ url: requestLike }));
201
+ }
202
+ else if ('requestsFromUrl' in requestLike) {
203
+ const fetchedRequests = await this._fetchRequestsFromUrl(requestLike);
204
+ await this._addFetchedRequests(requestLike, fetchedRequests, options);
205
+ }
206
+ else {
207
+ requests.push(requestLike instanceof Request ? requestLike : new Request(requestLike));
208
+ }
209
+ }
210
+ const requestsToAdd = new Map();
211
+ for (const request of requests) {
212
+ const cacheKey = getCachedRequestId(request.uniqueKey);
213
+ const cachedInfo = this.requestCache.get(cacheKey);
214
+ if (cachedInfo) {
215
+ request.id = cachedInfo.id;
216
+ results.processedRequests.push({
217
+ wasAlreadyPresent: true,
218
+ // We may assume that if request is in local cache then also the information if the
219
+ // request was already handled is there because just one client should be using one queue.
220
+ wasAlreadyHandled: cachedInfo.isHandled,
221
+ requestId: cachedInfo.id,
222
+ uniqueKey: cachedInfo.uniqueKey,
223
+ });
224
+ }
225
+ else if (!requestsToAdd.has(request.uniqueKey)) {
226
+ requestsToAdd.set(request.uniqueKey, request);
227
+ }
228
+ }
229
+ // Early exit if all provided requests were already added
230
+ if (!requestsToAdd.size) {
231
+ return results;
232
+ }
233
+ const apiResults = await this.client.addBatchOfRequests([...requestsToAdd.values()], { forefront });
234
+ // Report unprocessed requests
235
+ results.unprocessedRequests = apiResults.unprocessedRequests;
236
+ // Add all new requests to the requestCache
237
+ for (const newRequest of apiResults.processedRequests) {
238
+ // Add the new request to the processed list
239
+ results.processedRequests.push(newRequest);
240
+ const cacheKey = getCachedRequestId(newRequest.uniqueKey);
241
+ if (cache) {
242
+ this._cacheRequest(cacheKey, { ...newRequest, forefront });
243
+ }
244
+ }
245
+ return results;
246
+ }
247
+ /**
248
+ * Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
249
+ * adding the rest in the background. You can configure the batch size via `batchSize` option and the sleep time in between
250
+ * the batches via `waitBetweenBatchesMillis`. If you want to wait for all batches to be added to the queue, you can use
251
+ * the `waitForAllRequestsToBeAdded` promise you get in the response object.
252
+ *
253
+ * @param requests The requests to add
254
+ * @param options Options for the request queue
76
255
  */
77
- inProgressCount() {
78
- return this.inProgress.size;
256
+ async addRequestsBatched(requests, options = {}) {
257
+ checkStorageAccess();
258
+ ow(requests, ow.object
259
+ .is((value) => isIterable(value) || isAsyncIterable(value))
260
+ .message((value) => `Expected an iterable or async iterable, got ${getObjectType(value)}`));
261
+ ow(options, ow.object.exactShape({
262
+ forefront: ow.optional.boolean,
263
+ waitForAllRequestsToBeAdded: ow.optional.boolean,
264
+ batchSize: ow.optional.number,
265
+ waitBetweenBatchesMillis: ow.optional.number,
266
+ }));
267
+ const addRequest = this.addRequest.bind(this);
268
+ async function* generateRequests() {
269
+ for await (const opts of requests) {
270
+ // Validate the input
271
+ if (typeof opts === 'object' && opts !== null) {
272
+ if (opts.url !== undefined && typeof opts.url !== 'string') {
273
+ throw new Error(`Request options are not valid, the 'url' property is not a string. Input: ${inspect(opts)}`);
274
+ }
275
+ if (opts.id !== undefined) {
276
+ throw new Error(`Request options are not valid, the 'id' property must not be present. Input: ${inspect(opts)}`);
277
+ }
278
+ if (opts.requestsFromUrl !== undefined &&
279
+ typeof opts.requestsFromUrl !== 'string') {
280
+ throw new Error(`Request options are not valid, the 'requestsFromUrl' property is not a string. Input: ${inspect(opts)}`);
281
+ }
282
+ }
283
+ if (opts && typeof opts === 'object' && 'requestsFromUrl' in opts) {
284
+ // Handle URL lists right away
285
+ await addRequest(opts, { forefront: options.forefront });
286
+ }
287
+ else {
288
+ // Yield valid requests
289
+ yield typeof opts === 'string' ? { url: opts } : opts;
290
+ }
291
+ }
292
+ }
293
+ const { batchSize = 1000, waitBetweenBatchesMillis = 1000 } = options;
294
+ const chunks = peekableAsyncIterable(chunkedAsyncIterable(generateRequests(), batchSize));
295
+ const chunksIterator = chunks[Symbol.asyncIterator]();
296
+ const attemptToAddToQueueAndAddAnyUnprocessed = async (providedRequests, cache = true) => {
297
+ const resultsToReturn = [];
298
+ const apiResult = await this.addRequests(providedRequests, { forefront: options.forefront, cache });
299
+ resultsToReturn.push(...apiResult.processedRequests);
300
+ if (apiResult.unprocessedRequests.length) {
301
+ await sleep(waitBetweenBatchesMillis);
302
+ resultsToReturn.push(...(await attemptToAddToQueueAndAddAnyUnprocessed(providedRequests.filter((r) => !apiResult.processedRequests.some((pr) => pr.uniqueKey === r.uniqueKey)), false)));
303
+ }
304
+ return resultsToReturn;
305
+ };
306
+ // Add initial batch of `batchSize` to process them right away
307
+ const initialChunk = await chunksIterator.peek();
308
+ if (initialChunk === undefined) {
309
+ return { addedRequests: [], waitForAllRequestsToBeAdded: Promise.resolve([]) };
310
+ }
311
+ const addedRequests = await attemptToAddToQueueAndAddAnyUnprocessed(initialChunk);
312
+ await chunksIterator.next();
313
+ // If we have no more requests to add, return immediately
314
+ if ((await chunksIterator.peek()) === undefined) {
315
+ return {
316
+ addedRequests,
317
+ waitForAllRequestsToBeAdded: Promise.resolve([]),
318
+ };
319
+ }
320
+ // eslint-disable-next-line no-async-promise-executor
321
+ const promise = new Promise(async (resolve) => {
322
+ const finalAddedRequests = [];
323
+ for await (const requestChunk of chunks) {
324
+ finalAddedRequests.push(...(await attemptToAddToQueueAndAddAnyUnprocessed(requestChunk, false)));
325
+ await sleep(waitBetweenBatchesMillis);
326
+ }
327
+ resolve(finalAddedRequests);
328
+ });
329
+ this.inProgressRequestBatchCount += 1;
330
+ void promise.finally(() => {
331
+ this.inProgressRequestBatchCount -= 1;
332
+ });
333
+ // If the user wants to wait for all the requests to be added, we wait for the promise to resolve for them
334
+ if (options.waitForAllRequestsToBeAdded) {
335
+ addedRequests.push(...(await promise));
336
+ }
337
+ return {
338
+ addedRequests,
339
+ waitForAllRequestsToBeAdded: promise,
340
+ };
341
+ }
342
+ /**
343
+ * Gets the request from the queue specified by its `uniqueKey`.
344
+ *
345
+ * @param uniqueKey Unique key of the request.
346
+ * @returns Returns the request object, or `null` if it was not found.
347
+ */
348
+ async getRequest(uniqueKey) {
349
+ checkStorageAccess();
350
+ ow(uniqueKey, ow.string);
351
+ const requestOptions = await this.client.getRequest(uniqueKey);
352
+ if (!requestOptions)
353
+ return null;
354
+ return new Request(requestOptions);
79
355
  }
80
356
  /**
81
357
  * Returns a next request in the queue to be processed, or `null` if there are no more pending requests.
82
358
  *
83
359
  * Once you successfully finish processing of the request, you need to call
84
- * {@link RequestQueue.markRequestHandled}
360
+ * {@link RequestQueue.markRequestAsHandled}
85
361
  * to mark the request as handled in the queue. If there was some error in processing the request,
86
362
  * call {@link RequestQueue.reclaimRequest} instead,
87
363
  * so that the queue will give the request to some other consumer in another call to the `fetchNextRequest` function.
@@ -96,203 +372,258 @@ class RequestQueue extends RequestProvider {
96
372
  */
97
373
  async fetchNextRequest() {
98
374
  checkStorageAccess();
99
- this.lastActivity = new Date();
100
- await this.ensureHeadIsNonEmpty();
101
- const nextRequestId = this.queueHeadIds.removeFirst();
102
- // We are likely done at this point.
103
- if (!nextRequestId)
104
- return null;
105
- // This should never happen, but...
106
- if (this.inProgress.has(nextRequestId) || this.recentlyHandledRequestsCache.get(nextRequestId)) {
107
- this.log.warning('Queue head returned a request that is already in progress?!', {
108
- nextRequestId,
109
- inProgress: this.inProgress.has(nextRequestId),
110
- recentlyHandled: !!this.recentlyHandledRequestsCache.get(nextRequestId),
111
- });
375
+ if (this.queuePausedForMigration) {
112
376
  return null;
113
377
  }
114
- this.inProgress.add(nextRequestId);
115
- this.lastActivity = new Date();
116
- let request;
117
- try {
118
- request = await this.getRequest(nextRequestId);
119
- }
120
- catch (e) {
121
- // On error, remove the request from in progress, otherwise it would be there forever
122
- this.inProgress.delete(nextRequestId);
123
- throw e;
124
- }
125
- // NOTE: It can happen that the queue head index is inconsistent with the main queue table. This can occur in two situations:
126
- // 1) Queue head index is ahead of the main table and the request is not present in the main table yet (i.e. getRequest() returned null).
127
- // In this case, keep the request marked as in progress for a short while,
128
- // so that isFinished() doesn't return true and _ensureHeadIsNonEmpty() doesn't not load the request
129
- // into the queueHeadDict straight again. After the interval expires, fetchNextRequest()
130
- // will try to fetch this request again, until it eventually appears in the main table.
131
- if (!request) {
132
- this.log.debug('Cannot find a request from the beginning of queue, will be retried later', {
133
- nextRequestId,
134
- });
135
- setTimeout(() => {
136
- this.inProgress.delete(nextRequestId);
137
- }, STORAGE_CONSISTENCY_DELAY_MILLIS);
378
+ const requestOptions = await this.client.fetchNextRequest();
379
+ if (!requestOptions)
138
380
  return null;
139
- }
140
- // 2) Queue head index is behind the main table and the underlying request was already handled
141
- // (by some other client, since we keep the track of handled requests in recentlyHandled dictionary).
142
- // We just add the request to the recentlyHandled dictionary so that next call to _ensureHeadIsNonEmpty()
143
- // will not put the request again to queueHeadDict.
144
- if (request.handledAt) {
145
- this.log.debug('Request fetched from the beginning of queue was already handled', { nextRequestId });
146
- this.recentlyHandledRequestsCache.add(nextRequestId, true);
381
+ return new Request(requestOptions);
382
+ }
383
+ /**
384
+ * Marks a request that was previously returned by the
385
+ * {@link RequestQueue.fetchNextRequest}
386
+ * function as handled after successful processing.
387
+ * Handled requests will never again be returned by the `fetchNextRequest` function.
388
+ */
389
+ async markRequestAsHandled(request) {
390
+ checkStorageAccess();
391
+ ow(request, ow.object.partialShape({
392
+ id: ow.string,
393
+ uniqueKey: ow.string,
394
+ handledAt: ow.optional.string,
395
+ }));
396
+ const forefront = this.requestCache.get(getRequestId(request.uniqueKey))?.forefront ?? false;
397
+ const handledAt = request.handledAt ?? new Date().toISOString();
398
+ const processedRequest = await this.client.markRequestAsHandled({
399
+ ...request,
400
+ handledAt,
401
+ });
402
+ // The request was not in progress (e.g. already handled) — nothing to do.
403
+ if (!processedRequest) {
147
404
  return null;
148
405
  }
149
- return request;
406
+ request.handledAt = handledAt;
407
+ const queueOperationInfo = {
408
+ ...processedRequest,
409
+ uniqueKey: request.uniqueKey,
410
+ forefront,
411
+ };
412
+ this._cacheRequest(getRequestId(request.uniqueKey), queueOperationInfo);
413
+ return queueOperationInfo;
150
414
  }
151
- async ensureHeadIsNonEmpty() {
152
- // Alias for backwards compatibility
153
- await this._ensureHeadIsNonEmpty();
415
+ /**
416
+ * Reclaims a failed request back to the queue, so that it can be returned for processing later again
417
+ * by another call to {@link RequestQueue.fetchNextRequest}.
418
+ * The request record in the queue is updated using the provided `request` parameter.
419
+ * For example, this lets you store the number of retries or error messages for the request.
420
+ */
421
+ async reclaimRequest(request, options = {}) {
422
+ checkStorageAccess();
423
+ ow(request, ow.object.partialShape({
424
+ id: ow.string,
425
+ uniqueKey: ow.string,
426
+ }));
427
+ ow(options, ow.object.exactShape({
428
+ forefront: ow.optional.boolean,
429
+ }));
430
+ const { forefront = false } = options;
431
+ const processedRequest = await this.client.reclaimRequest(request, { forefront });
432
+ // The request was not in progress — nothing to reclaim.
433
+ if (!processedRequest) {
434
+ return null;
435
+ }
436
+ const queueOperationInfo = {
437
+ ...processedRequest,
438
+ uniqueKey: request.uniqueKey,
439
+ forefront,
440
+ };
441
+ this._cacheRequest(getRequestId(request.uniqueKey), queueOperationInfo);
442
+ return queueOperationInfo;
154
443
  }
155
444
  /**
156
- * We always request more items than is in progress to ensure that something falls into head.
445
+ * Resolves to `true` if the next call to {@link RequestQueue.fetchNextRequest} would return
446
+ * `null`, i.e. there are no pending requests to fetch right now. Otherwise it resolves to `false`.
157
447
  *
158
- * @param [ensureConsistency] If true then query for queue head is retried until queueModifiedAt
159
- * is older than queryStartedAt by at least API_PROCESSED_REQUESTS_DELAY_MILLIS to ensure that queue
160
- * head is consistent.
161
- * @default false
162
- * @param [limit] How many queue head items will be fetched.
163
- * @param [iteration] Used when this function is called recursively to limit the recursion.
164
- * @returns Indicates if queue head is consistent (true) or inconsistent (false).
448
+ * Note that even if the queue is empty, there might be some requests currently being processed
449
+ * (fetched but not yet handled or reclaimed). An empty queue therefore does not mean crawling is
450
+ * finished — those in-progress requests may still be reclaimed, and background tasks may still be
451
+ * adding more requests. To check whether all activity in the queue has finished, use
452
+ * {@link RequestQueue.isFinished}.
165
453
  */
166
- async _ensureHeadIsNonEmpty(ensureConsistency = false, limit = Math.max(this.inProgressCount() * QUERY_HEAD_BUFFER, QUERY_HEAD_MIN_LENGTH), iteration = 0) {
167
- // If we are paused for migration, resolve immediately.
168
- if (this.queuePausedForMigration) {
169
- return true;
170
- }
171
- // If is nonempty resolve immediately.
172
- if (this.queueHeadIds.length() > 0) {
173
- return true;
174
- }
175
- if (!this.queryQueueHeadPromise) {
176
- const queryStartedAt = new Date();
177
- this.queryQueueHeadPromise = this.client
178
- .listHead({ limit })
179
- .then(({ items, queueModifiedAt, hadMultipleClients }) => {
180
- items.forEach(({ id: requestId, uniqueKey }) => {
181
- // Queue head index might be behind the main table, so ensure we don't recycle requests
182
- if (!requestId ||
183
- !uniqueKey ||
184
- this.inProgress.has(requestId) ||
185
- this.recentlyHandledRequestsCache.get(requestId))
186
- return;
187
- this.queueHeadIds.add(requestId, requestId, false);
188
- const forefront = this.requestCache.get(getRequestId(uniqueKey))?.forefront ?? false;
189
- this._cacheRequest(getRequestId(uniqueKey), {
190
- requestId,
191
- wasAlreadyHandled: false,
192
- wasAlreadyPresent: true,
193
- uniqueKey,
194
- forefront,
195
- });
196
- });
197
- // This is needed so that the next call to _ensureHeadIsNonEmpty() will fetch the queue head again.
198
- this.queryQueueHeadPromise = null;
199
- return {
200
- wasLimitReached: items.length >= limit,
201
- prevLimit: limit,
202
- queueModifiedAt: new Date(queueModifiedAt),
203
- queryStartedAt,
204
- hadMultipleClients,
205
- };
206
- });
207
- }
208
- const { queueModifiedAt, wasLimitReached, prevLimit, queryStartedAt, hadMultipleClients } = await this.queryQueueHeadPromise;
209
- // TODO: I feel this code below can be greatly simplified...
210
- // If queue is still empty then one of the following holds:
211
- // - the other calls waiting for this promise already consumed all the returned requests
212
- // - the limit was too low and contained only requests in progress
213
- // - the writes from other clients were not propagated yet
214
- // - the whole queue was processed and we are done
215
- // If limit was not reached in the call then there are no more requests to be returned.
216
- if (prevLimit >= REQUEST_QUEUE_HEAD_MAX_LIMIT) {
217
- this.log.warning(`Reached the maximum number of requests in progress: ${REQUEST_QUEUE_HEAD_MAX_LIMIT}.`);
218
- }
219
- const shouldRepeatWithHigherLimit = this.queueHeadIds.length() === 0 && wasLimitReached && prevLimit < REQUEST_QUEUE_HEAD_MAX_LIMIT;
220
- // If ensureConsistency=true then we must ensure that either:
221
- // - queueModifiedAt is older than queryStartedAt by at least API_PROCESSED_REQUESTS_DELAY_MILLIS
222
- // - hadMultipleClients=false and this.assumedTotalCount<=this.assumedHandledCount
223
- const isDatabaseConsistent = +queryStartedAt - +queueModifiedAt >= API_PROCESSED_REQUESTS_DELAY_MILLIS;
224
- const isLocallyConsistent = !hadMultipleClients && this.assumedTotalCount <= this.assumedHandledCount;
225
- // Consistent information from one source is enough to consider request queue finished.
226
- const shouldRepeatForConsistency = ensureConsistency && !isDatabaseConsistent && !isLocallyConsistent;
227
- // If both are false then head is consistent and we may exit.
228
- if (!shouldRepeatWithHigherLimit && !shouldRepeatForConsistency)
229
- return true;
230
- // If we are querying for consistency then we limit the number of queries to MAX_QUERIES_FOR_CONSISTENCY.
231
- // If this is reached then we return false so that empty() and finished() returns possibly false negative.
232
- if (!shouldRepeatWithHigherLimit && iteration > MAX_QUERIES_FOR_CONSISTENCY)
233
- return false;
234
- const nextLimit = shouldRepeatWithHigherLimit ? Math.round(prevLimit * 1.5) : prevLimit;
235
- // If we are repeating for consistency then wait required time.
236
- if (shouldRepeatForConsistency) {
237
- const delayMillis = API_PROCESSED_REQUESTS_DELAY_MILLIS - (Date.now() - +queueModifiedAt);
238
- this.log.info(`Waiting for ${delayMillis}ms before considering the queue as finished to ensure that the data is consistent.`);
239
- await sleep(delayMillis);
240
- }
241
- return this._ensureHeadIsNonEmpty(ensureConsistency, nextLimit, iteration + 1);
454
+ async isEmpty() {
455
+ checkStorageAccess();
456
+ return this.client.isEmpty();
242
457
  }
243
- // RequestQueue v1 behavior overrides below
458
+ /**
459
+ * Resolves to `true` if all requests were already handled and there are no more left — including no
460
+ * requests currently in progress (fetched but not yet handled or reclaimed, including requests
461
+ * locked by other clients sharing the same queue) and no background add operations still in flight.
462
+ *
463
+ * Due to the nature of distributed storage used by the queue, the function may occasionally return
464
+ * a false negative, but it shall never return a false positive.
465
+ */
244
466
  async isFinished() {
245
467
  checkStorageAccess();
246
- if (Date.now() - +this.lastActivity > this.internalTimeoutMillis) {
247
- const message = `The request queue seems to be stuck for ${this.internalTimeoutMillis / 1e3}s, resetting internal state.`;
248
- this.log.warning(message, { inProgress: [...this.inProgress] });
249
- this._reset();
250
- }
468
+ // We are not finished if we're still adding new requests in the background.
251
469
  if (this.inProgressRequestBatchCount > 0) {
252
470
  return false;
253
471
  }
254
- if (this.queueHeadIds.length() > 0 || this.inProgressCount() > 0)
255
- return false;
256
- const isHeadConsistent = await this._ensureHeadIsNonEmpty(true);
257
- return isHeadConsistent && this.queueHeadIds.length() === 0 && this.inProgressCount() === 0;
472
+ return this.client.isFinished();
258
473
  }
259
474
  /**
260
- * Reclaims a failed request back to the queue, so that it can be returned for processing later again
261
- * by another call to {@link RequestQueue.fetchNextRequest}.
262
- * The request record in the queue is updated using the provided `request` parameter.
263
- * For example, this lets you store the number of retries or error messages for the request.
475
+ * Tells the queue how long a consumer expects to hold a fetched request before marking it handled
476
+ * or reclaiming it (typically the request-handler timeout plus padding), so that a storage client
477
+ * that reserves requests via locking does not hand the same request out again while it is still
478
+ * being processed.
479
+ *
480
+ * Several consumers may share one queue (and therefore one client) in a single process, so we only
481
+ * ever raise the reservation duration, never lower it — otherwise a short-lived consumer could cut
482
+ * short the reservation of a long-lived one and have its in-flight request stolen.
264
483
  */
265
- async reclaimRequest(...args) {
484
+ setExpectedRequestProcessingTimeSecs(secs) {
485
+ if (secs <= this.expectedRequestProcessingSecs) {
486
+ return;
487
+ }
488
+ this.expectedRequestProcessingSecs = secs;
489
+ this.client.setExpectedRequestProcessingTimeSecs?.(secs);
490
+ }
491
+ /**
492
+ * Caches information about request to beware of unneeded addRequest() calls.
493
+ */
494
+ _cacheRequest(cacheKey, queueOperationInfo) {
495
+ // Remove the previous entry, as otherwise our cache will never update 👀
496
+ this.requestCache.remove(cacheKey);
497
+ this.requestCache.add(cacheKey, {
498
+ id: queueOperationInfo.requestId,
499
+ isHandled: queueOperationInfo.wasAlreadyHandled,
500
+ uniqueKey: queueOperationInfo.uniqueKey,
501
+ hydrated: null,
502
+ lockExpiresAt: null,
503
+ forefront: queueOperationInfo.forefront,
504
+ });
505
+ }
506
+ /**
507
+ * Removes the queue either from the Apify Cloud storage or from the local database,
508
+ * depending on the mode of operation.
509
+ */
510
+ async drop() {
266
511
  checkStorageAccess();
267
- const [request, options] = args;
268
- const forefront = options?.forefront ?? false;
269
- const result = await super.reclaimRequest(...args);
270
- // Wait a little to increase a chance that the next call to fetchNextRequest() will return the request with updated data.
271
- // This is to compensate for the limitation of DynamoDB, where writes might not be immediately visible to subsequent reads.
272
- setTimeout(() => {
273
- if (!this.inProgress.has(request.id)) {
274
- this.log.debug('The request is no longer marked as in progress in the queue?!', {
275
- requestId: request.id,
276
- });
277
- return;
278
- }
279
- this.inProgress.delete(request.id);
280
- // Performance optimization: add request straight to head if possible
281
- this._maybeAddRequestToQueueHead(request.id, forefront);
282
- }, STORAGE_CONSISTENCY_DELAY_MILLIS);
283
- return result;
512
+ await this.client.drop();
513
+ serviceLocator.getStorageInstanceManager().removeFromCache(this);
514
+ }
515
+ /**
516
+ * Remove all requests from the queue but keep the queue itself, resetting it
517
+ * so it can be reused (e.g. across multiple `crawler.run()` calls).
518
+ */
519
+ async purge() {
520
+ checkStorageAccess();
521
+ await this.client.purge();
522
+ // Reset in-memory bookkeeping so the queue behaves as if freshly opened.
523
+ this.requestCache.clear();
524
+ this.inProgressRequestBatchCount = 0;
525
+ // Reset the expected-processing-time high-water mark too, otherwise the monotonic-raise guard
526
+ // in `setExpectedRequestProcessingTimeSecs` would let a value raised in an earlier run leak into a
527
+ // later one and silently swallow a lower hint (the queue is meant to be reusable across runs).
528
+ this.expectedRequestProcessingSecs = 0;
529
+ }
530
+ /**
531
+ * @inheritdoc
532
+ */
533
+ async *[Symbol.asyncIterator]() {
534
+ while (true) {
535
+ const req = await this.fetchNextRequest();
536
+ if (!req)
537
+ break;
538
+ yield req;
539
+ }
284
540
  }
285
541
  /**
542
+ * Returns the number of handled requests.
543
+ *
544
+ * This function is just a convenient shortcut for:
545
+ *
546
+ * ```javascript
547
+ * const { handledRequestCount } = await queue.getInfo();
548
+ * ```
286
549
  * @inheritdoc
287
550
  */
288
- async markRequestHandled(request) {
289
- const res = await super.markRequestHandled(request);
290
- this.inProgress.delete(request.id);
291
- return res;
551
+ async getHandledCount() {
552
+ // NOTE: We keep this function for compatibility with RequestList.getHandledCount()
553
+ const { handledRequestCount } = await this.getInfo();
554
+ return handledRequestCount;
292
555
  }
293
- _reset() {
294
- super._reset();
295
- this.inProgress.clear();
556
+ /**
557
+ * Returns an object containing general information about the request queue.
558
+ *
559
+ * **Example:**
560
+ * ```
561
+ * {
562
+ * id: "WkzbQMuFYuamGv3YF",
563
+ * name: "my-queue",
564
+ * createdAt: new Date("2015-12-12T07:34:14.202Z"),
565
+ * modifiedAt: new Date("2015-12-13T08:36:13.202Z"),
566
+ * accessedAt: new Date("2015-12-14T08:36:13.202Z"),
567
+ * totalRequestCount: 25,
568
+ * handledRequestCount: 5,
569
+ * pendingRequestCount: 20,
570
+ * }
571
+ * ```
572
+ *
573
+ * @throws If the underlying storage no longer exists (e.g. it was deleted externally).
574
+ */
575
+ async getInfo() {
576
+ checkStorageAccess();
577
+ return this.client.getMetadata();
578
+ }
579
+ /**
580
+ * Fetches URLs from requestsFromUrl and returns them in format of list of requests
581
+ */
582
+ async _fetchRequestsFromUrl(source) {
583
+ const { requestsFromUrl, regex, ...sharedOpts } = source;
584
+ // Download remote resource and parse URLs.
585
+ let urlsArr;
586
+ try {
587
+ urlsArr = await this._downloadListOfUrls({
588
+ url: requestsFromUrl,
589
+ urlRegExp: regex,
590
+ proxyUrl: await this.proxyConfiguration?.newUrl(),
591
+ });
592
+ }
593
+ catch (err) {
594
+ throw new Error(`Cannot fetch a request list from ${requestsFromUrl}: ${err}`);
595
+ }
596
+ // Skip if resource contained no URLs.
597
+ if (!urlsArr.length) {
598
+ this.log.warning('The fetched list contains no valid URLs.', { requestsFromUrl, regex });
599
+ return [];
600
+ }
601
+ return urlsArr.map((url) => ({ url, ...sharedOpts }));
602
+ }
603
+ /**
604
+ * Adds all fetched requests from a URL from a remote resource.
605
+ */
606
+ async _addFetchedRequests(source, fetchedRequests, options) {
607
+ const { requestsFromUrl, regex } = source;
608
+ const { addedRequests } = await this.addRequestsBatched(fetchedRequests, options);
609
+ this.log.info('Fetched and loaded Requests from a remote resource.', {
610
+ requestsFromUrl,
611
+ regex,
612
+ fetchedCount: fetchedRequests.length,
613
+ importedCount: addedRequests.length,
614
+ duplicateCount: fetchedRequests.length - addedRequests.length,
615
+ sample: JSON.stringify(fetchedRequests.slice(0, 5)),
616
+ });
617
+ return addedRequests;
618
+ }
619
+ /**
620
+ * @internal wraps public utility for mocking purposes
621
+ */
622
+ async _downloadListOfUrls(options) {
623
+ return downloadListOfUrls({
624
+ ...options,
625
+ httpClient: this.httpClient,
626
+ });
296
627
  }
297
628
  /**
298
629
  * Opens a request queue and returns a promise resolving to an instance
@@ -305,14 +636,47 @@ class RequestQueue extends RequestProvider {
305
636
  *
306
637
  * For more details and code examples, see the {@link RequestQueue} class.
307
638
  *
308
- * @param [queueIdOrName]
309
- * ID or name of the request queue to be opened. If `null` or `undefined`,
310
- * the function returns the default request queue associated with the crawler run.
639
+ * @param [identifier]
640
+ * ID or name of the request queue to be opened. If a string is provided, it will first be
641
+ * looked up as an ID; if no such storage exists, it will be treated as a name.
642
+ * If `null` or `undefined`, the function returns the default request queue associated with the crawler run.
311
643
  * @param [options] Open Request Queue options.
312
644
  */
313
- static async open(...args) {
314
- return super.open(...args);
645
+ static async open(identifier, options = {}) {
646
+ checkStorageAccess();
647
+ ow(options, ow.object.exactShape({
648
+ config: ow.optional.object.instanceOf(Configuration),
649
+ storageClient: ow.optional.object,
650
+ proxyConfiguration: ow.optional.object,
651
+ httpClient: ow.optional.object,
652
+ }));
653
+ const client = options.storageClient ?? serviceLocator.getStorageClient();
654
+ const config = options.config ?? serviceLocator.getConfiguration();
655
+ await purgeDefaultStorages({ onlyPurgeOnce: true, client, config });
656
+ const resolved = await resolveStorageIdentifier(identifier, client, 'RequestQueue');
657
+ const queue = await serviceLocator
658
+ .getStorageInstanceManager()
659
+ .openStorage(this, {
660
+ ...resolved,
661
+ clientOpener: () => client.createRequestQueueClient(resolved),
662
+ clientCacheKey: client.getStorageClientCacheKey?.() ?? client.constructor.name,
663
+ });
664
+ queue.proxyConfiguration = options.proxyConfiguration;
665
+ queue.httpClient = options.httpClient;
666
+ if (!queue.isInitialized) {
667
+ // Re-create the request queue client with clientKey and timeoutSecs so that
668
+ // request locking works correctly for API-backed implementations.
669
+ // TODO: clientKey/timeoutSecs are Apify-platform concerns and should eventually be pushed
670
+ // down into the Apify SDK's client implementation, aligning with crawlee-python's approach
671
+ // where locking is handled internally by the client (see crawlee-python PR #1194).
672
+ queue.client = await client.createRequestQueueClient({
673
+ id: queue.id,
674
+ clientKey: queue.clientKey,
675
+ timeoutSecs: queue.timeoutSecs,
676
+ });
677
+ queue.isInitialized = true;
678
+ }
679
+ return queue;
315
680
  }
316
681
  }
317
- export { RequestQueue as RequestQueueV1 };
318
682
  //# sourceMappingURL=request_queue.js.map