@crawlee/core 4.0.0-beta.64 → 4.0.0-beta.66
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/crawlers/crawler_commons.d.ts +3 -3
- package/crawlers/crawler_commons.d.ts.map +1 -1
- package/enqueue_links/enqueue_links.d.ts +7 -6
- package/enqueue_links/enqueue_links.d.ts.map +1 -1
- package/enqueue_links/enqueue_links.js +4 -4
- package/enqueue_links/enqueue_links.js.map +1 -1
- package/package.json +5 -5
- package/storages/index.d.ts +4 -6
- package/storages/index.d.ts.map +1 -1
- package/storages/index.js +2 -6
- package/storages/index.js.map +1 -1
- package/storages/request_list.d.ts +23 -72
- package/storages/request_list.d.ts.map +1 -1
- package/storages/request_list.js +34 -29
- package/storages/request_list.js.map +1 -1
- package/storages/request_loader.d.ts +97 -0
- package/storages/request_loader.d.ts.map +1 -0
- package/storages/request_loader.js +2 -0
- package/storages/request_loader.js.map +1 -0
- package/storages/request_manager.d.ts +34 -0
- package/storages/request_manager.d.ts.map +1 -0
- package/storages/request_manager.js +2 -0
- package/storages/request_manager.js.map +1 -0
- package/storages/request_manager_tandem.d.ts +56 -17
- package/storages/request_manager_tandem.d.ts.map +1 -1
- package/storages/request_manager_tandem.js +114 -41
- package/storages/request_manager_tandem.js.map +1 -1
- package/storages/request_queue.d.ts +276 -44
- package/storages/request_queue.d.ts.map +1 -1
- package/storages/request_queue.js +576 -212
- package/storages/request_queue.js.map +1 -1
- package/storages/{sitemap_request_list.d.ts → sitemap_request_loader.d.ts} +24 -19
- package/storages/sitemap_request_loader.d.ts.map +1 -0
- package/storages/{sitemap_request_list.js → sitemap_request_loader.js} +41 -40
- package/storages/sitemap_request_loader.js.map +1 -0
- package/validators.d.ts +4 -0
- package/validators.d.ts.map +1 -1
- package/validators.js +4 -0
- package/validators.js.map +1 -1
- package/storages/request_list_adapter.d.ts +0 -58
- package/storages/request_list_adapter.d.ts.map +0 -1
- package/storages/request_list_adapter.js +0 -81
- package/storages/request_list_adapter.js.map +0 -1
- package/storages/request_provider.d.ts +0 -384
- package/storages/request_provider.d.ts.map +0 -1
- package/storages/request_provider.js +0 -624
- package/storages/request_provider.js.map +0 -1
- package/storages/request_queue_v2.d.ts +0 -87
- package/storages/request_queue_v2.d.ts.map +0 -1
- package/storages/request_queue_v2.js +0 -437
- package/storages/request_queue_v2.js.map +0 -1
- package/storages/sitemap_request_list.d.ts.map +0 -1
- package/storages/sitemap_request_list.js.map +0 -1
|
@@ -1,16 +1,19 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
1
|
+
import { inspect } from 'node:util';
|
|
2
|
+
import { chunkedAsyncIterable, downloadListOfUrls, getObjectType, isAsyncIterable, isIterable, peekableAsyncIterable, sleep, } from '@crawlee/utils';
|
|
3
|
+
import ow from 'ow';
|
|
4
|
+
import { LruCache } from '@apify/datastructures';
|
|
5
|
+
import { cryptoRandomObjectId } from '@apify/utilities';
|
|
6
|
+
import { Configuration } from '../configuration.js';
|
|
7
|
+
import { Request } from '../request.js';
|
|
3
8
|
import { serviceLocator } from '../service_locator.js';
|
|
4
9
|
import { checkStorageAccess } from './access_checking.js';
|
|
5
|
-
import {
|
|
6
|
-
import {
|
|
7
|
-
const MAX_CACHED_REQUESTS = 1_000_000;
|
|
10
|
+
import { resolveStorageIdentifier } from './storage_instance_manager.js';
|
|
11
|
+
import { getRequestId, purgeDefaultStorages } from './utils.js';
|
|
8
12
|
/**
|
|
9
|
-
*
|
|
10
|
-
* a time lower than expected maximum latency of DynamoDB, but low enough not to waste too much memory.
|
|
13
|
+
* The maximum number of requests cached locally to avoid redundant calls to the storage client.
|
|
11
14
|
* @internal
|
|
12
15
|
*/
|
|
13
|
-
const
|
|
16
|
+
const MAX_CACHED_REQUESTS = 2_000_000;
|
|
14
17
|
/**
|
|
15
18
|
* Represents a queue of URLs to crawl, which is used for deep crawling of websites
|
|
16
19
|
* where you start with several URLs and then recursively
|
|
@@ -29,18 +32,6 @@ const RECENTLY_HANDLED_CACHE_SIZE = 1000;
|
|
|
29
32
|
* Unlike {@link RequestList}, `RequestQueue` supports dynamic adding and removing of requests.
|
|
30
33
|
* On the other hand, the queue is not optimized for operations that add or remove a large number of URLs in a batch.
|
|
31
34
|
*
|
|
32
|
-
* `RequestQueue` stores its data either on local disk or in the Apify Cloud,
|
|
33
|
-
* depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variable is set.
|
|
34
|
-
*
|
|
35
|
-
* If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the queue data is stored in
|
|
36
|
-
* that directory in an SQLite database file.
|
|
37
|
-
*
|
|
38
|
-
* If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the
|
|
39
|
-
* [Apify Request Queue](https://docs.apify.com/storage/request-queue)
|
|
40
|
-
* cloud storage. Note that you can force usage of the cloud storage also by passing the `forceCloud`
|
|
41
|
-
* option to {@link RequestQueue.open} function,
|
|
42
|
-
* even if the `APIFY_LOCAL_STORAGE_DIR` variable is set.
|
|
43
|
-
*
|
|
44
35
|
* **Example usage:**
|
|
45
36
|
*
|
|
46
37
|
* ```javascript
|
|
@@ -57,31 +48,316 @@ const RECENTLY_HANDLED_CACHE_SIZE = 1000;
|
|
|
57
48
|
* ```
|
|
58
49
|
* @category Sources
|
|
59
50
|
*/
|
|
60
|
-
class RequestQueue
|
|
61
|
-
|
|
62
|
-
|
|
51
|
+
export class RequestQueue {
|
|
52
|
+
/** Configuration instance received by the constructor. */
config;
/** Queue identifier, copied from the constructor options. */
id;
/** Queue name, copied from the constructor options; may be undefined. */
name;
// NOTE(review): presumably a timeout in seconds; it is not read in the visible part of the class — confirm.
timeoutSecs = 30;
/** Value produced by `cryptoRandomObjectId()` when the instance is created. */
clientKey = cryptoRandomObjectId();
/** Storage client that every queue operation is delegated to. */
client;
/** Proxy configuration, copied from the constructor options. */
proxyConfiguration;
/** Child logger whose prefix carries the queue id and name. */
log;
// NOTE(review): not read or written in the visible part of the class — confirm its purpose.
isInitialized = false;
/** LRU cache of queue operation info, looked up by `getRequestId(uniqueKey)`. */
requestCache;
/** Set to `true` by the "migrating" event listener; `fetchNextRequest` returns `null` while it is set. */
queuePausedForMigration = false;
/** Number of `addRequestsBatched` background runs still adding requests; `isFinished` is `false` while above zero. */
inProgressRequestBatchCount = 0;
/**
 * The largest expected request-processing time (in seconds) seen so far via
 * {@link setExpectedRequestProcessingTimeSecs}. Used to ensure that value is only ever raised, never
 * lowered, before being forwarded to the storage client.
 */
expectedRequestProcessingSecs = 0;
// NOTE(review): not assigned in the visible constructor — confirm where it is set.
httpClient;
/** Event manager obtained from the service locator. */
events;
|
|
63
72
|
/**
|
|
64
73
|
* @internal
|
|
65
74
|
*/
|
|
66
|
-
constructor(options, config =
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
75
|
+
constructor(options, config = Configuration.getGlobalConfig()) {
|
|
76
|
+
this.config = config;
|
|
77
|
+
this.id = options.id;
|
|
78
|
+
this.name = options.name;
|
|
79
|
+
this.events = serviceLocator.getEventManager();
|
|
80
|
+
this.client = options.client;
|
|
81
|
+
this.proxyConfiguration = options.proxyConfiguration;
|
|
82
|
+
this.requestCache = new LruCache({ maxLength: MAX_CACHED_REQUESTS });
|
|
83
|
+
this.log = serviceLocator.getLogger().child({ prefix: `RequestQueue(${this.id}, ${this.name ?? 'no-name'})` });
|
|
84
|
+
this.events.on("migrating" /* EventType.MIGRATING */, async () => {
|
|
85
|
+
this.queuePausedForMigration = true;
|
|
86
|
+
});
|
|
73
87
|
}
|
|
74
88
|
/**
|
|
75
|
-
*
|
|
89
|
+
* Returns the total number of requests in the queue (i.e. pending + handled).
|
|
90
|
+
*
|
|
91
|
+
* Survives restarts and Actor migrations.
|
|
92
|
+
*/
|
|
93
|
+
async getTotalCount() {
|
|
94
|
+
const { totalRequestCount } = await this.getInfo();
|
|
95
|
+
return totalRequestCount;
|
|
96
|
+
}
|
|
97
|
+
/**
|
|
98
|
+
* Returns the total number of pending requests in the queue.
|
|
99
|
+
*
|
|
100
|
+
* Survives restarts and Actor migrations.
|
|
101
|
+
*/
|
|
102
|
+
async getPendingCount() {
|
|
103
|
+
const { totalRequestCount, handledRequestCount } = await this.getInfo();
|
|
104
|
+
return totalRequestCount - handledRequestCount;
|
|
105
|
+
}
|
|
106
|
+
/**
|
|
107
|
+
* Adds a request to the queue.
|
|
108
|
+
*
|
|
109
|
+
* If a request with the same `uniqueKey` property is already present in the queue,
|
|
110
|
+
* it will not be updated. You can find out whether this happened from the resulting
|
|
111
|
+
* {@link QueueOperationInfo} object.
|
|
112
|
+
*
|
|
113
|
+
* To add multiple requests to the queue by extracting links from a webpage,
|
|
114
|
+
* see the {@link enqueueLinks} helper function.
|
|
115
|
+
*
|
|
116
|
+
* @param requestLike {@link Request} object or vanilla object with request data.
|
|
117
|
+
* Note that the function sets the `uniqueKey` and `id` fields to the passed Request.
|
|
118
|
+
* @param [options] Request queue operation options.
|
|
119
|
+
*/
|
|
120
|
+
async addRequest(requestLike, options = {}) {
|
|
121
|
+
checkStorageAccess();
|
|
122
|
+
ow(requestLike, ow.object);
|
|
123
|
+
ow(options, ow.object.exactShape({
|
|
124
|
+
forefront: ow.optional.boolean,
|
|
125
|
+
}));
|
|
126
|
+
const { forefront = false } = options;
|
|
127
|
+
if ('requestsFromUrl' in requestLike) {
|
|
128
|
+
const requests = await this._fetchRequestsFromUrl(requestLike);
|
|
129
|
+
const processedRequests = await this._addFetchedRequests(requestLike, requests, options);
|
|
130
|
+
return { ...processedRequests[0], forefront };
|
|
131
|
+
}
|
|
132
|
+
ow(requestLike, ow.object.partialShape({
|
|
133
|
+
url: ow.string,
|
|
134
|
+
id: ow.undefined,
|
|
135
|
+
}));
|
|
136
|
+
const request = requestLike instanceof Request ? requestLike : new Request(requestLike);
|
|
137
|
+
const cacheKey = getRequestId(request.uniqueKey);
|
|
138
|
+
const cachedInfo = this.requestCache.get(cacheKey);
|
|
139
|
+
if (cachedInfo) {
|
|
140
|
+
request.id = cachedInfo.id;
|
|
141
|
+
return {
|
|
142
|
+
wasAlreadyPresent: true,
|
|
143
|
+
// We may assume that if request is in local cache then also the information if the
|
|
144
|
+
// request was already handled is there because just one client should be using one queue.
|
|
145
|
+
wasAlreadyHandled: cachedInfo.isHandled,
|
|
146
|
+
requestId: cachedInfo.id,
|
|
147
|
+
uniqueKey: cachedInfo.uniqueKey,
|
|
148
|
+
forefront,
|
|
149
|
+
};
|
|
150
|
+
}
|
|
151
|
+
const { processedRequests } = await this.client.addBatchOfRequests([request], { forefront });
|
|
152
|
+
const queueOperationInfo = {
|
|
153
|
+
...processedRequests[0],
|
|
154
|
+
uniqueKey: request.uniqueKey,
|
|
155
|
+
forefront,
|
|
156
|
+
};
|
|
157
|
+
this._cacheRequest(cacheKey, queueOperationInfo);
|
|
158
|
+
return queueOperationInfo;
|
|
159
|
+
}
|
|
160
|
+
/**
|
|
161
|
+
* Adds requests to the queue in batches of 25. This method will wait till all the requests are added
|
|
162
|
+
* to the queue before resolving. You should prefer using `queue.addRequestsBatched()` or `crawler.addRequests()`
|
|
163
|
+
* if you don't want to block the processing, as those methods will only wait for the initial 1000 requests,
|
|
164
|
+
* start processing right after that happens, and continue adding more in the background.
|
|
165
|
+
*
|
|
166
|
+
* If a request passed in is already present due to its `uniqueKey` property being the same,
|
|
167
|
+
* it will not be updated. You can find out whether this happened by finding the request in the resulting
|
|
168
|
+
* {@link BatchAddRequestsResult} object.
|
|
169
|
+
*
|
|
170
|
+
* @param requestsLike {@link Request} objects or vanilla objects with request data.
|
|
171
|
+
* Note that the function sets the `uniqueKey` and `id` fields to the passed requests if missing.
|
|
172
|
+
* @param [options] Request queue operation options.
|
|
173
|
+
*/
|
|
174
|
+
async addRequests(requestsLike, options = {}) {
|
|
175
|
+
checkStorageAccess();
|
|
176
|
+
ow(requestsLike, ow.object
|
|
177
|
+
.is((value) => isIterable(value) || isAsyncIterable(value))
|
|
178
|
+
.message((value) => `Expected an iterable or async iterable, got ${getObjectType(value)}`));
|
|
179
|
+
ow(options, ow.object.exactShape({
|
|
180
|
+
forefront: ow.optional.boolean,
|
|
181
|
+
cache: ow.optional.boolean,
|
|
182
|
+
}));
|
|
183
|
+
const { forefront = false, cache = true } = options;
|
|
184
|
+
const uniqueKeyToCacheKey = new Map();
|
|
185
|
+
const getCachedRequestId = (uniqueKey) => {
|
|
186
|
+
const cached = uniqueKeyToCacheKey.get(uniqueKey);
|
|
187
|
+
if (cached)
|
|
188
|
+
return cached;
|
|
189
|
+
const newCacheKey = getRequestId(uniqueKey);
|
|
190
|
+
uniqueKeyToCacheKey.set(uniqueKey, newCacheKey);
|
|
191
|
+
return newCacheKey;
|
|
192
|
+
};
|
|
193
|
+
const results = {
|
|
194
|
+
processedRequests: [],
|
|
195
|
+
unprocessedRequests: [],
|
|
196
|
+
};
|
|
197
|
+
const requests = [];
|
|
198
|
+
for await (const requestLike of requestsLike) {
|
|
199
|
+
if (typeof requestLike === 'string') {
|
|
200
|
+
requests.push(new Request({ url: requestLike }));
|
|
201
|
+
}
|
|
202
|
+
else if ('requestsFromUrl' in requestLike) {
|
|
203
|
+
const fetchedRequests = await this._fetchRequestsFromUrl(requestLike);
|
|
204
|
+
await this._addFetchedRequests(requestLike, fetchedRequests, options);
|
|
205
|
+
}
|
|
206
|
+
else {
|
|
207
|
+
requests.push(requestLike instanceof Request ? requestLike : new Request(requestLike));
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
const requestsToAdd = new Map();
|
|
211
|
+
for (const request of requests) {
|
|
212
|
+
const cacheKey = getCachedRequestId(request.uniqueKey);
|
|
213
|
+
const cachedInfo = this.requestCache.get(cacheKey);
|
|
214
|
+
if (cachedInfo) {
|
|
215
|
+
request.id = cachedInfo.id;
|
|
216
|
+
results.processedRequests.push({
|
|
217
|
+
wasAlreadyPresent: true,
|
|
218
|
+
// We may assume that if request is in local cache then also the information if the
|
|
219
|
+
// request was already handled is there because just one client should be using one queue.
|
|
220
|
+
wasAlreadyHandled: cachedInfo.isHandled,
|
|
221
|
+
requestId: cachedInfo.id,
|
|
222
|
+
uniqueKey: cachedInfo.uniqueKey,
|
|
223
|
+
});
|
|
224
|
+
}
|
|
225
|
+
else if (!requestsToAdd.has(request.uniqueKey)) {
|
|
226
|
+
requestsToAdd.set(request.uniqueKey, request);
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
// Early exit if all provided requests were already added
|
|
230
|
+
if (!requestsToAdd.size) {
|
|
231
|
+
return results;
|
|
232
|
+
}
|
|
233
|
+
const apiResults = await this.client.addBatchOfRequests([...requestsToAdd.values()], { forefront });
|
|
234
|
+
// Report unprocessed requests
|
|
235
|
+
results.unprocessedRequests = apiResults.unprocessedRequests;
|
|
236
|
+
// Add all new requests to the requestCache
|
|
237
|
+
for (const newRequest of apiResults.processedRequests) {
|
|
238
|
+
// Add the new request to the processed list
|
|
239
|
+
results.processedRequests.push(newRequest);
|
|
240
|
+
const cacheKey = getCachedRequestId(newRequest.uniqueKey);
|
|
241
|
+
if (cache) {
|
|
242
|
+
this._cacheRequest(cacheKey, { ...newRequest, forefront });
|
|
243
|
+
}
|
|
244
|
+
}
|
|
245
|
+
return results;
|
|
246
|
+
}
|
|
247
|
+
/**
|
|
248
|
+
* Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
|
|
249
|
+
* adding the rest in the background. You can configure the batch size via `batchSize` option and the sleep time in between
|
|
250
|
+
* the batches via `waitBetweenBatchesMillis`. If you want to wait for all batches to be added to the queue, you can use
|
|
251
|
+
* the `waitForAllRequestsToBeAdded` promise you get in the response object.
|
|
252
|
+
*
|
|
253
|
+
* @param requests The requests to add
|
|
254
|
+
* @param options Options for the request queue
|
|
76
255
|
*/
|
|
77
|
-
|
|
78
|
-
|
|
256
|
+
async addRequestsBatched(requests, options = {}) {
|
|
257
|
+
checkStorageAccess();
|
|
258
|
+
ow(requests, ow.object
|
|
259
|
+
.is((value) => isIterable(value) || isAsyncIterable(value))
|
|
260
|
+
.message((value) => `Expected an iterable or async iterable, got ${getObjectType(value)}`));
|
|
261
|
+
ow(options, ow.object.exactShape({
|
|
262
|
+
forefront: ow.optional.boolean,
|
|
263
|
+
waitForAllRequestsToBeAdded: ow.optional.boolean,
|
|
264
|
+
batchSize: ow.optional.number,
|
|
265
|
+
waitBetweenBatchesMillis: ow.optional.number,
|
|
266
|
+
}));
|
|
267
|
+
const addRequest = this.addRequest.bind(this);
|
|
268
|
+
async function* generateRequests() {
|
|
269
|
+
for await (const opts of requests) {
|
|
270
|
+
// Validate the input
|
|
271
|
+
if (typeof opts === 'object' && opts !== null) {
|
|
272
|
+
if (opts.url !== undefined && typeof opts.url !== 'string') {
|
|
273
|
+
throw new Error(`Request options are not valid, the 'url' property is not a string. Input: ${inspect(opts)}`);
|
|
274
|
+
}
|
|
275
|
+
if (opts.id !== undefined) {
|
|
276
|
+
throw new Error(`Request options are not valid, the 'id' property must not be present. Input: ${inspect(opts)}`);
|
|
277
|
+
}
|
|
278
|
+
if (opts.requestsFromUrl !== undefined &&
|
|
279
|
+
typeof opts.requestsFromUrl !== 'string') {
|
|
280
|
+
throw new Error(`Request options are not valid, the 'requestsFromUrl' property is not a string. Input: ${inspect(opts)}`);
|
|
281
|
+
}
|
|
282
|
+
}
|
|
283
|
+
if (opts && typeof opts === 'object' && 'requestsFromUrl' in opts) {
|
|
284
|
+
// Handle URL lists right away
|
|
285
|
+
await addRequest(opts, { forefront: options.forefront });
|
|
286
|
+
}
|
|
287
|
+
else {
|
|
288
|
+
// Yield valid requests
|
|
289
|
+
yield typeof opts === 'string' ? { url: opts } : opts;
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
const { batchSize = 1000, waitBetweenBatchesMillis = 1000 } = options;
|
|
294
|
+
const chunks = peekableAsyncIterable(chunkedAsyncIterable(generateRequests(), batchSize));
|
|
295
|
+
const chunksIterator = chunks[Symbol.asyncIterator]();
|
|
296
|
+
const attemptToAddToQueueAndAddAnyUnprocessed = async (providedRequests, cache = true) => {
|
|
297
|
+
const resultsToReturn = [];
|
|
298
|
+
const apiResult = await this.addRequests(providedRequests, { forefront: options.forefront, cache });
|
|
299
|
+
resultsToReturn.push(...apiResult.processedRequests);
|
|
300
|
+
if (apiResult.unprocessedRequests.length) {
|
|
301
|
+
await sleep(waitBetweenBatchesMillis);
|
|
302
|
+
resultsToReturn.push(...(await attemptToAddToQueueAndAddAnyUnprocessed(providedRequests.filter((r) => !apiResult.processedRequests.some((pr) => pr.uniqueKey === r.uniqueKey)), false)));
|
|
303
|
+
}
|
|
304
|
+
return resultsToReturn;
|
|
305
|
+
};
|
|
306
|
+
// Add initial batch of `batchSize` to process them right away
|
|
307
|
+
const initialChunk = await chunksIterator.peek();
|
|
308
|
+
if (initialChunk === undefined) {
|
|
309
|
+
return { addedRequests: [], waitForAllRequestsToBeAdded: Promise.resolve([]) };
|
|
310
|
+
}
|
|
311
|
+
const addedRequests = await attemptToAddToQueueAndAddAnyUnprocessed(initialChunk);
|
|
312
|
+
await chunksIterator.next();
|
|
313
|
+
// If we have no more requests to add, return immediately
|
|
314
|
+
if ((await chunksIterator.peek()) === undefined) {
|
|
315
|
+
return {
|
|
316
|
+
addedRequests,
|
|
317
|
+
waitForAllRequestsToBeAdded: Promise.resolve([]),
|
|
318
|
+
};
|
|
319
|
+
}
|
|
320
|
+
// eslint-disable-next-line no-async-promise-executor
|
|
321
|
+
const promise = new Promise(async (resolve) => {
|
|
322
|
+
const finalAddedRequests = [];
|
|
323
|
+
for await (const requestChunk of chunks) {
|
|
324
|
+
finalAddedRequests.push(...(await attemptToAddToQueueAndAddAnyUnprocessed(requestChunk, false)));
|
|
325
|
+
await sleep(waitBetweenBatchesMillis);
|
|
326
|
+
}
|
|
327
|
+
resolve(finalAddedRequests);
|
|
328
|
+
});
|
|
329
|
+
this.inProgressRequestBatchCount += 1;
|
|
330
|
+
void promise.finally(() => {
|
|
331
|
+
this.inProgressRequestBatchCount -= 1;
|
|
332
|
+
});
|
|
333
|
+
// If the user wants to wait for all the requests to be added, we wait for the promise to resolve for them
|
|
334
|
+
if (options.waitForAllRequestsToBeAdded) {
|
|
335
|
+
addedRequests.push(...(await promise));
|
|
336
|
+
}
|
|
337
|
+
return {
|
|
338
|
+
addedRequests,
|
|
339
|
+
waitForAllRequestsToBeAdded: promise,
|
|
340
|
+
};
|
|
341
|
+
}
|
|
342
|
+
/**
|
|
343
|
+
* Gets the request from the queue specified by its `uniqueKey`.
|
|
344
|
+
*
|
|
345
|
+
* @param uniqueKey Unique key of the request.
|
|
346
|
+
* @returns Returns the request object, or `null` if it was not found.
|
|
347
|
+
*/
|
|
348
|
+
async getRequest(uniqueKey) {
|
|
349
|
+
checkStorageAccess();
|
|
350
|
+
ow(uniqueKey, ow.string);
|
|
351
|
+
const requestOptions = await this.client.getRequest(uniqueKey);
|
|
352
|
+
if (!requestOptions)
|
|
353
|
+
return null;
|
|
354
|
+
return new Request(requestOptions);
|
|
79
355
|
}
|
|
80
356
|
/**
|
|
81
357
|
* Returns a next request in the queue to be processed, or `null` if there are no more pending requests.
|
|
82
358
|
*
|
|
83
359
|
* Once you successfully finish processing of the request, you need to call
|
|
84
|
-
* {@link RequestQueue.
|
|
360
|
+
* {@link RequestQueue.markRequestAsHandled}
|
|
85
361
|
* to mark the request as handled in the queue. If there was some error in processing the request,
|
|
86
362
|
* call {@link RequestQueue.reclaimRequest} instead,
|
|
87
363
|
* so that the queue will give the request to some other consumer in another call to the `fetchNextRequest` function.
|
|
@@ -96,203 +372,258 @@ class RequestQueue extends RequestProvider {
|
|
|
96
372
|
*/
|
|
97
373
|
async fetchNextRequest() {
|
|
98
374
|
checkStorageAccess();
|
|
99
|
-
this.
|
|
100
|
-
await this.ensureHeadIsNonEmpty();
|
|
101
|
-
const nextRequestId = this.queueHeadIds.removeFirst();
|
|
102
|
-
// We are likely done at this point.
|
|
103
|
-
if (!nextRequestId)
|
|
104
|
-
return null;
|
|
105
|
-
// This should never happen, but...
|
|
106
|
-
if (this.inProgress.has(nextRequestId) || this.recentlyHandledRequestsCache.get(nextRequestId)) {
|
|
107
|
-
this.log.warning('Queue head returned a request that is already in progress?!', {
|
|
108
|
-
nextRequestId,
|
|
109
|
-
inProgress: this.inProgress.has(nextRequestId),
|
|
110
|
-
recentlyHandled: !!this.recentlyHandledRequestsCache.get(nextRequestId),
|
|
111
|
-
});
|
|
375
|
+
if (this.queuePausedForMigration) {
|
|
112
376
|
return null;
|
|
113
377
|
}
|
|
114
|
-
this.
|
|
115
|
-
|
|
116
|
-
let request;
|
|
117
|
-
try {
|
|
118
|
-
request = await this.getRequest(nextRequestId);
|
|
119
|
-
}
|
|
120
|
-
catch (e) {
|
|
121
|
-
// On error, remove the request from in progress, otherwise it would be there forever
|
|
122
|
-
this.inProgress.delete(nextRequestId);
|
|
123
|
-
throw e;
|
|
124
|
-
}
|
|
125
|
-
// NOTE: It can happen that the queue head index is inconsistent with the main queue table. This can occur in two situations:
|
|
126
|
-
// 1) Queue head index is ahead of the main table and the request is not present in the main table yet (i.e. getRequest() returned null).
|
|
127
|
-
// In this case, keep the request marked as in progress for a short while,
|
|
128
|
-
// so that isFinished() doesn't return true and _ensureHeadIsNonEmpty() doesn't not load the request
|
|
129
|
-
// into the queueHeadDict straight again. After the interval expires, fetchNextRequest()
|
|
130
|
-
// will try to fetch this request again, until it eventually appears in the main table.
|
|
131
|
-
if (!request) {
|
|
132
|
-
this.log.debug('Cannot find a request from the beginning of queue, will be retried later', {
|
|
133
|
-
nextRequestId,
|
|
134
|
-
});
|
|
135
|
-
setTimeout(() => {
|
|
136
|
-
this.inProgress.delete(nextRequestId);
|
|
137
|
-
}, STORAGE_CONSISTENCY_DELAY_MILLIS);
|
|
378
|
+
const requestOptions = await this.client.fetchNextRequest();
|
|
379
|
+
if (!requestOptions)
|
|
138
380
|
return null;
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
381
|
+
return new Request(requestOptions);
|
|
382
|
+
}
|
|
383
|
+
/**
|
|
384
|
+
* Marks a request that was previously returned by the
|
|
385
|
+
* {@link RequestQueue.fetchNextRequest}
|
|
386
|
+
* function as handled after successful processing.
|
|
387
|
+
* Handled requests will never again be returned by the `fetchNextRequest` function.
|
|
388
|
+
*/
|
|
389
|
+
async markRequestAsHandled(request) {
|
|
390
|
+
checkStorageAccess();
|
|
391
|
+
ow(request, ow.object.partialShape({
|
|
392
|
+
id: ow.string,
|
|
393
|
+
uniqueKey: ow.string,
|
|
394
|
+
handledAt: ow.optional.string,
|
|
395
|
+
}));
|
|
396
|
+
const forefront = this.requestCache.get(getRequestId(request.uniqueKey))?.forefront ?? false;
|
|
397
|
+
const handledAt = request.handledAt ?? new Date().toISOString();
|
|
398
|
+
const processedRequest = await this.client.markRequestAsHandled({
|
|
399
|
+
...request,
|
|
400
|
+
handledAt,
|
|
401
|
+
});
|
|
402
|
+
// The request was not in progress (e.g. already handled) — nothing to do.
|
|
403
|
+
if (!processedRequest) {
|
|
147
404
|
return null;
|
|
148
405
|
}
|
|
149
|
-
|
|
406
|
+
request.handledAt = handledAt;
|
|
407
|
+
const queueOperationInfo = {
|
|
408
|
+
...processedRequest,
|
|
409
|
+
uniqueKey: request.uniqueKey,
|
|
410
|
+
forefront,
|
|
411
|
+
};
|
|
412
|
+
this._cacheRequest(getRequestId(request.uniqueKey), queueOperationInfo);
|
|
413
|
+
return queueOperationInfo;
|
|
150
414
|
}
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
415
|
+
/**
|
|
416
|
+
* Reclaims a failed request back to the queue, so that it can be returned for processing later again
|
|
417
|
+
* by another call to {@link RequestQueue.fetchNextRequest}.
|
|
418
|
+
* The request record in the queue is updated using the provided `request` parameter.
|
|
419
|
+
* For example, this lets you store the number of retries or error messages for the request.
|
|
420
|
+
*/
|
|
421
|
+
async reclaimRequest(request, options = {}) {
|
|
422
|
+
checkStorageAccess();
|
|
423
|
+
ow(request, ow.object.partialShape({
|
|
424
|
+
id: ow.string,
|
|
425
|
+
uniqueKey: ow.string,
|
|
426
|
+
}));
|
|
427
|
+
ow(options, ow.object.exactShape({
|
|
428
|
+
forefront: ow.optional.boolean,
|
|
429
|
+
}));
|
|
430
|
+
const { forefront = false } = options;
|
|
431
|
+
const processedRequest = await this.client.reclaimRequest(request, { forefront });
|
|
432
|
+
// The request was not in progress — nothing to reclaim.
|
|
433
|
+
if (!processedRequest) {
|
|
434
|
+
return null;
|
|
435
|
+
}
|
|
436
|
+
const queueOperationInfo = {
|
|
437
|
+
...processedRequest,
|
|
438
|
+
uniqueKey: request.uniqueKey,
|
|
439
|
+
forefront,
|
|
440
|
+
};
|
|
441
|
+
this._cacheRequest(getRequestId(request.uniqueKey), queueOperationInfo);
|
|
442
|
+
return queueOperationInfo;
|
|
154
443
|
}
|
|
155
444
|
/**
|
|
156
|
-
*
|
|
445
|
+
* Resolves to `true` if the next call to {@link RequestQueue.fetchNextRequest} would return
|
|
446
|
+
* `null`, i.e. there are no pending requests to fetch right now. Otherwise it resolves to `false`.
|
|
157
447
|
*
|
|
158
|
-
*
|
|
159
|
-
*
|
|
160
|
-
*
|
|
161
|
-
*
|
|
162
|
-
* @
|
|
163
|
-
* @param [iteration] Used when this function is called recursively to limit the recursion.
|
|
164
|
-
* @returns Indicates if queue head is consistent (true) or inconsistent (false).
|
|
448
|
+
* Note that even if the queue is empty, there might be some requests currently being processed
|
|
449
|
+
* (fetched but not yet handled or reclaimed). An empty queue therefore does not mean crawling is
|
|
450
|
+
* finished — those in-progress requests may still be reclaimed, and background tasks may still be
|
|
451
|
+
* adding more requests. To check whether all activity in the queue has finished, use
|
|
452
|
+
* {@link RequestQueue.isFinished}.
|
|
165
453
|
*/
|
|
166
|
-
async
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
return true;
|
|
170
|
-
}
|
|
171
|
-
// If is nonempty resolve immediately.
|
|
172
|
-
if (this.queueHeadIds.length() > 0) {
|
|
173
|
-
return true;
|
|
174
|
-
}
|
|
175
|
-
if (!this.queryQueueHeadPromise) {
|
|
176
|
-
const queryStartedAt = new Date();
|
|
177
|
-
this.queryQueueHeadPromise = this.client
|
|
178
|
-
.listHead({ limit })
|
|
179
|
-
.then(({ items, queueModifiedAt, hadMultipleClients }) => {
|
|
180
|
-
items.forEach(({ id: requestId, uniqueKey }) => {
|
|
181
|
-
// Queue head index might be behind the main table, so ensure we don't recycle requests
|
|
182
|
-
if (!requestId ||
|
|
183
|
-
!uniqueKey ||
|
|
184
|
-
this.inProgress.has(requestId) ||
|
|
185
|
-
this.recentlyHandledRequestsCache.get(requestId))
|
|
186
|
-
return;
|
|
187
|
-
this.queueHeadIds.add(requestId, requestId, false);
|
|
188
|
-
const forefront = this.requestCache.get(getRequestId(uniqueKey))?.forefront ?? false;
|
|
189
|
-
this._cacheRequest(getRequestId(uniqueKey), {
|
|
190
|
-
requestId,
|
|
191
|
-
wasAlreadyHandled: false,
|
|
192
|
-
wasAlreadyPresent: true,
|
|
193
|
-
uniqueKey,
|
|
194
|
-
forefront,
|
|
195
|
-
});
|
|
196
|
-
});
|
|
197
|
-
// This is needed so that the next call to _ensureHeadIsNonEmpty() will fetch the queue head again.
|
|
198
|
-
this.queryQueueHeadPromise = null;
|
|
199
|
-
return {
|
|
200
|
-
wasLimitReached: items.length >= limit,
|
|
201
|
-
prevLimit: limit,
|
|
202
|
-
queueModifiedAt: new Date(queueModifiedAt),
|
|
203
|
-
queryStartedAt,
|
|
204
|
-
hadMultipleClients,
|
|
205
|
-
};
|
|
206
|
-
});
|
|
207
|
-
}
|
|
208
|
-
const { queueModifiedAt, wasLimitReached, prevLimit, queryStartedAt, hadMultipleClients } = await this.queryQueueHeadPromise;
|
|
209
|
-
// TODO: I feel this code below can be greatly simplified...
|
|
210
|
-
// If queue is still empty then one of the following holds:
|
|
211
|
-
// - the other calls waiting for this promise already consumed all the returned requests
|
|
212
|
-
// - the limit was too low and contained only requests in progress
|
|
213
|
-
// - the writes from other clients were not propagated yet
|
|
214
|
-
// - the whole queue was processed and we are done
|
|
215
|
-
// If limit was not reached in the call then there are no more requests to be returned.
|
|
216
|
-
if (prevLimit >= REQUEST_QUEUE_HEAD_MAX_LIMIT) {
|
|
217
|
-
this.log.warning(`Reached the maximum number of requests in progress: ${REQUEST_QUEUE_HEAD_MAX_LIMIT}.`);
|
|
218
|
-
}
|
|
219
|
-
const shouldRepeatWithHigherLimit = this.queueHeadIds.length() === 0 && wasLimitReached && prevLimit < REQUEST_QUEUE_HEAD_MAX_LIMIT;
|
|
220
|
-
// If ensureConsistency=true then we must ensure that either:
|
|
221
|
-
// - queueModifiedAt is older than queryStartedAt by at least API_PROCESSED_REQUESTS_DELAY_MILLIS
|
|
222
|
-
// - hadMultipleClients=false and this.assumedTotalCount<=this.assumedHandledCount
|
|
223
|
-
const isDatabaseConsistent = +queryStartedAt - +queueModifiedAt >= API_PROCESSED_REQUESTS_DELAY_MILLIS;
|
|
224
|
-
const isLocallyConsistent = !hadMultipleClients && this.assumedTotalCount <= this.assumedHandledCount;
|
|
225
|
-
// Consistent information from one source is enough to consider request queue finished.
|
|
226
|
-
const shouldRepeatForConsistency = ensureConsistency && !isDatabaseConsistent && !isLocallyConsistent;
|
|
227
|
-
// If both are false then head is consistent and we may exit.
|
|
228
|
-
if (!shouldRepeatWithHigherLimit && !shouldRepeatForConsistency)
|
|
229
|
-
return true;
|
|
230
|
-
// If we are querying for consistency then we limit the number of queries to MAX_QUERIES_FOR_CONSISTENCY.
|
|
231
|
-
// If this is reached then we return false so that empty() and finished() returns possibly false negative.
|
|
232
|
-
if (!shouldRepeatWithHigherLimit && iteration > MAX_QUERIES_FOR_CONSISTENCY)
|
|
233
|
-
return false;
|
|
234
|
-
const nextLimit = shouldRepeatWithHigherLimit ? Math.round(prevLimit * 1.5) : prevLimit;
|
|
235
|
-
// If we are repeating for consistency then wait required time.
|
|
236
|
-
if (shouldRepeatForConsistency) {
|
|
237
|
-
const delayMillis = API_PROCESSED_REQUESTS_DELAY_MILLIS - (Date.now() - +queueModifiedAt);
|
|
238
|
-
this.log.info(`Waiting for ${delayMillis}ms before considering the queue as finished to ensure that the data is consistent.`);
|
|
239
|
-
await sleep(delayMillis);
|
|
240
|
-
}
|
|
241
|
-
return this._ensureHeadIsNonEmpty(ensureConsistency, nextLimit, iteration + 1);
|
|
454
|
+
async isEmpty() {
|
|
455
|
+
checkStorageAccess();
|
|
456
|
+
return this.client.isEmpty();
|
|
242
457
|
}
|
|
243
|
-
|
|
458
|
+
/**
|
|
459
|
+
* Resolves to `true` if all requests were already handled and there are no more left — including no
|
|
460
|
+
* requests currently in progress (fetched but not yet handled or reclaimed, including requests
|
|
461
|
+
* locked by other clients sharing the same queue) and no background add operations still in flight.
|
|
462
|
+
*
|
|
463
|
+
* Due to the nature of distributed storage used by the queue, the function may occasionally return
|
|
464
|
+
* a false negative, but it shall never return a false positive.
|
|
465
|
+
*/
|
|
244
466
|
async isFinished() {
|
|
245
467
|
checkStorageAccess();
|
|
246
|
-
if
|
|
247
|
-
const message = `The request queue seems to be stuck for ${this.internalTimeoutMillis / 1e3}s, resetting internal state.`;
|
|
248
|
-
this.log.warning(message, { inProgress: [...this.inProgress] });
|
|
249
|
-
this._reset();
|
|
250
|
-
}
|
|
468
|
+
// We are not finished if we're still adding new requests in the background.
|
|
251
469
|
if (this.inProgressRequestBatchCount > 0) {
|
|
252
470
|
return false;
|
|
253
471
|
}
|
|
254
|
-
|
|
255
|
-
return false;
|
|
256
|
-
const isHeadConsistent = await this._ensureHeadIsNonEmpty(true);
|
|
257
|
-
return isHeadConsistent && this.queueHeadIds.length() === 0 && this.inProgressCount() === 0;
|
|
472
|
+
return this.client.isFinished();
|
|
258
473
|
}
|
|
259
474
|
/**
|
|
260
|
-
*
|
|
261
|
-
*
|
|
262
|
-
*
|
|
263
|
-
*
|
|
475
|
+
* Tells the queue how long a consumer expects to hold a fetched request before marking it handled
|
|
476
|
+
* or reclaiming it (typically the request-handler timeout plus padding), so that a storage client
|
|
477
|
+
* that reserves requests via locking does not hand the same request out again while it is still
|
|
478
|
+
* being processed.
|
|
479
|
+
*
|
|
480
|
+
* Several consumers may share one queue (and therefore one client) in a single process, so we only
|
|
481
|
+
* ever raise the reservation duration, never lower it — otherwise a short-lived consumer could cut
|
|
482
|
+
* short the reservation of a long-lived one and have its in-flight request stolen.
|
|
264
483
|
*/
|
|
265
|
-
|
|
484
|
+
setExpectedRequestProcessingTimeSecs(secs) {
|
|
485
|
+
if (secs <= this.expectedRequestProcessingSecs) {
|
|
486
|
+
return;
|
|
487
|
+
}
|
|
488
|
+
this.expectedRequestProcessingSecs = secs;
|
|
489
|
+
this.client.setExpectedRequestProcessingTimeSecs?.(secs);
|
|
490
|
+
}
|
|
491
|
+
/**
|
|
492
|
+
* Caches information about request to beware of unneeded addRequest() calls.
|
|
493
|
+
*/
|
|
494
|
+
_cacheRequest(cacheKey, queueOperationInfo) {
|
|
495
|
+
// Remove the previous entry, as otherwise our cache will never update 👀
|
|
496
|
+
this.requestCache.remove(cacheKey);
|
|
497
|
+
this.requestCache.add(cacheKey, {
|
|
498
|
+
id: queueOperationInfo.requestId,
|
|
499
|
+
isHandled: queueOperationInfo.wasAlreadyHandled,
|
|
500
|
+
uniqueKey: queueOperationInfo.uniqueKey,
|
|
501
|
+
hydrated: null,
|
|
502
|
+
lockExpiresAt: null,
|
|
503
|
+
forefront: queueOperationInfo.forefront,
|
|
504
|
+
});
|
|
505
|
+
}
|
|
506
|
+
/**
|
|
507
|
+
* Removes the queue either from the Apify Cloud storage or from the local database,
|
|
508
|
+
* depending on the mode of operation.
|
|
509
|
+
*/
|
|
510
|
+
async drop() {
|
|
266
511
|
checkStorageAccess();
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
512
|
+
await this.client.drop();
|
|
513
|
+
serviceLocator.getStorageInstanceManager().removeFromCache(this);
|
|
514
|
+
}
|
|
515
|
+
/**
|
|
516
|
+
* Remove all requests from the queue but keep the queue itself, resetting it
|
|
517
|
+
* so it can be reused (e.g. across multiple `crawler.run()` calls).
|
|
518
|
+
*/
|
|
519
|
+
async purge() {
|
|
520
|
+
checkStorageAccess();
|
|
521
|
+
await this.client.purge();
|
|
522
|
+
// Reset in-memory bookkeeping so the queue behaves as if freshly opened.
|
|
523
|
+
this.requestCache.clear();
|
|
524
|
+
this.inProgressRequestBatchCount = 0;
|
|
525
|
+
// Reset the expected-processing-time high-water mark too, otherwise the monotonic-raise guard
|
|
526
|
+
// in `setExpectedRequestProcessingTimeSecs` would let a value raised in an earlier run leak into a
|
|
527
|
+
// later one and silently swallow a lower hint (the queue is meant to be reusable across runs).
|
|
528
|
+
this.expectedRequestProcessingSecs = 0;
|
|
529
|
+
}
|
|
530
|
+
/**
|
|
531
|
+
* @inheritdoc
|
|
532
|
+
*/
|
|
533
|
+
async *[Symbol.asyncIterator]() {
|
|
534
|
+
while (true) {
|
|
535
|
+
const req = await this.fetchNextRequest();
|
|
536
|
+
if (!req)
|
|
537
|
+
break;
|
|
538
|
+
yield req;
|
|
539
|
+
}
|
|
284
540
|
}
|
|
285
541
|
/**
|
|
542
|
+
* Returns the number of handled requests.
|
|
543
|
+
*
|
|
544
|
+
* This function is just a convenient shortcut for:
|
|
545
|
+
*
|
|
546
|
+
* ```javascript
|
|
547
|
+
* const { handledRequestCount } = await queue.getInfo();
|
|
548
|
+
* ```
|
|
286
549
|
* @inheritdoc
|
|
287
550
|
*/
|
|
288
|
-
async
|
|
289
|
-
|
|
290
|
-
this.
|
|
291
|
-
return
|
|
551
|
+
async getHandledCount() {
|
|
552
|
+
// NOTE: We keep this function for compatibility with RequestList.getHandledCount()
|
|
553
|
+
const { handledRequestCount } = await this.getInfo();
|
|
554
|
+
return handledRequestCount;
|
|
292
555
|
}
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
556
|
+
/**
|
|
557
|
+
* Returns an object containing general information about the request queue.
|
|
558
|
+
*
|
|
559
|
+
* **Example:**
|
|
560
|
+
* ```
|
|
561
|
+
* {
|
|
562
|
+
* id: "WkzbQMuFYuamGv3YF",
|
|
563
|
+
* name: "my-queue",
|
|
564
|
+
* createdAt: new Date("2015-12-12T07:34:14.202Z"),
|
|
565
|
+
* modifiedAt: new Date("2015-12-13T08:36:13.202Z"),
|
|
566
|
+
* accessedAt: new Date("2015-12-14T08:36:13.202Z"),
|
|
567
|
+
* totalRequestCount: 25,
|
|
568
|
+
* handledRequestCount: 5,
|
|
569
|
+
* pendingRequestCount: 20,
|
|
570
|
+
* }
|
|
571
|
+
* ```
|
|
572
|
+
*
|
|
573
|
+
* @throws If the underlying storage no longer exists (e.g. it was deleted externally).
|
|
574
|
+
*/
|
|
575
|
+
async getInfo() {
|
|
576
|
+
checkStorageAccess();
|
|
577
|
+
return this.client.getMetadata();
|
|
578
|
+
}
|
|
579
|
+
/**
|
|
580
|
+
* Fetches URLs from requestsFromUrl and returns them in format of list of requests
|
|
581
|
+
*/
|
|
582
|
+
async _fetchRequestsFromUrl(source) {
|
|
583
|
+
const { requestsFromUrl, regex, ...sharedOpts } = source;
|
|
584
|
+
// Download remote resource and parse URLs.
|
|
585
|
+
let urlsArr;
|
|
586
|
+
try {
|
|
587
|
+
urlsArr = await this._downloadListOfUrls({
|
|
588
|
+
url: requestsFromUrl,
|
|
589
|
+
urlRegExp: regex,
|
|
590
|
+
proxyUrl: await this.proxyConfiguration?.newUrl(),
|
|
591
|
+
});
|
|
592
|
+
}
|
|
593
|
+
catch (err) {
|
|
594
|
+
throw new Error(`Cannot fetch a request list from ${requestsFromUrl}: ${err}`);
|
|
595
|
+
}
|
|
596
|
+
// Skip if resource contained no URLs.
|
|
597
|
+
if (!urlsArr.length) {
|
|
598
|
+
this.log.warning('The fetched list contains no valid URLs.', { requestsFromUrl, regex });
|
|
599
|
+
return [];
|
|
600
|
+
}
|
|
601
|
+
return urlsArr.map((url) => ({ url, ...sharedOpts }));
|
|
602
|
+
}
|
|
603
|
+
/**
|
|
604
|
+
* Adds all fetched requests from a URL from a remote resource.
|
|
605
|
+
*/
|
|
606
|
+
async _addFetchedRequests(source, fetchedRequests, options) {
|
|
607
|
+
const { requestsFromUrl, regex } = source;
|
|
608
|
+
const { addedRequests } = await this.addRequestsBatched(fetchedRequests, options);
|
|
609
|
+
this.log.info('Fetched and loaded Requests from a remote resource.', {
|
|
610
|
+
requestsFromUrl,
|
|
611
|
+
regex,
|
|
612
|
+
fetchedCount: fetchedRequests.length,
|
|
613
|
+
importedCount: addedRequests.length,
|
|
614
|
+
duplicateCount: fetchedRequests.length - addedRequests.length,
|
|
615
|
+
sample: JSON.stringify(fetchedRequests.slice(0, 5)),
|
|
616
|
+
});
|
|
617
|
+
return addedRequests;
|
|
618
|
+
}
|
|
619
|
+
/**
|
|
620
|
+
* @internal wraps public utility for mocking purposes
|
|
621
|
+
*/
|
|
622
|
+
async _downloadListOfUrls(options) {
|
|
623
|
+
return downloadListOfUrls({
|
|
624
|
+
...options,
|
|
625
|
+
httpClient: this.httpClient,
|
|
626
|
+
});
|
|
296
627
|
}
|
|
297
628
|
/**
|
|
298
629
|
* Opens a request queue and returns a promise resolving to an instance
|
|
@@ -305,14 +636,47 @@ class RequestQueue extends RequestProvider {
|
|
|
305
636
|
*
|
|
306
637
|
* For more details and code examples, see the {@link RequestQueue} class.
|
|
307
638
|
*
|
|
308
|
-
* @param [
|
|
309
|
-
* ID or name of the request queue to be opened. If
|
|
310
|
-
*
|
|
639
|
+
* @param [identifier]
|
|
640
|
+
* ID or name of the request queue to be opened. If a string is provided, it will first be
|
|
641
|
+
* looked up as an ID; if no such storage exists, it will be treated as a name.
|
|
642
|
+
* If `null` or `undefined`, the function returns the default request queue associated with the crawler run.
|
|
311
643
|
* @param [options] Open Request Queue options.
|
|
312
644
|
*/
|
|
313
|
-
static async open(
|
|
314
|
-
|
|
645
|
+
static async open(identifier, options = {}) {
|
|
646
|
+
checkStorageAccess();
|
|
647
|
+
ow(options, ow.object.exactShape({
|
|
648
|
+
config: ow.optional.object.instanceOf(Configuration),
|
|
649
|
+
storageClient: ow.optional.object,
|
|
650
|
+
proxyConfiguration: ow.optional.object,
|
|
651
|
+
httpClient: ow.optional.object,
|
|
652
|
+
}));
|
|
653
|
+
const client = options.storageClient ?? serviceLocator.getStorageClient();
|
|
654
|
+
const config = options.config ?? serviceLocator.getConfiguration();
|
|
655
|
+
await purgeDefaultStorages({ onlyPurgeOnce: true, client, config });
|
|
656
|
+
const resolved = await resolveStorageIdentifier(identifier, client, 'RequestQueue');
|
|
657
|
+
const queue = await serviceLocator
|
|
658
|
+
.getStorageInstanceManager()
|
|
659
|
+
.openStorage(this, {
|
|
660
|
+
...resolved,
|
|
661
|
+
clientOpener: () => client.createRequestQueueClient(resolved),
|
|
662
|
+
clientCacheKey: client.getStorageClientCacheKey?.() ?? client.constructor.name,
|
|
663
|
+
});
|
|
664
|
+
queue.proxyConfiguration = options.proxyConfiguration;
|
|
665
|
+
queue.httpClient = options.httpClient;
|
|
666
|
+
if (!queue.isInitialized) {
|
|
667
|
+
// Re-create the request queue client with clientKey and timeoutSecs so that
|
|
668
|
+
// request locking works correctly for API-backed implementations.
|
|
669
|
+
// TODO: clientKey/timeoutSecs are Apify-platform concerns and should eventually be pushed
|
|
670
|
+
// down into the Apify SDK's client implementation, aligning with crawlee-python's approach
|
|
671
|
+
// where locking is handled internally by the client (see crawlee-python PR #1194).
|
|
672
|
+
queue.client = await client.createRequestQueueClient({
|
|
673
|
+
id: queue.id,
|
|
674
|
+
clientKey: queue.clientKey,
|
|
675
|
+
timeoutSecs: queue.timeoutSecs,
|
|
676
|
+
});
|
|
677
|
+
queue.isInitialized = true;
|
|
678
|
+
}
|
|
679
|
+
return queue;
|
|
315
680
|
}
|
|
316
681
|
}
|
|
317
|
-
export { RequestQueue as RequestQueueV1 };
|
|
318
682
|
//# sourceMappingURL=request_queue.js.map
|