@crawlee/core 4.0.0-beta.6 → 4.0.0-beta.61
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -5
- package/autoscaling/autoscaled_pool.d.ts +3 -5
- package/autoscaling/autoscaled_pool.d.ts.map +1 -1
- package/autoscaling/autoscaled_pool.js +3 -9
- package/autoscaling/autoscaled_pool.js.map +1 -1
- package/autoscaling/snapshotter.d.ts +3 -13
- package/autoscaling/snapshotter.d.ts.map +1 -1
- package/autoscaling/snapshotter.js +18 -29
- package/autoscaling/snapshotter.js.map +1 -1
- package/autoscaling/system_status.d.ts +0 -3
- package/autoscaling/system_status.d.ts.map +1 -1
- package/autoscaling/system_status.js +2 -3
- package/autoscaling/system_status.js.map +1 -1
- package/configuration.d.ts +85 -227
- package/configuration.d.ts.map +1 -1
- package/configuration.js +159 -223
- package/configuration.js.map +1 -1
- package/cookie_utils.d.ts +4 -2
- package/cookie_utils.d.ts.map +1 -1
- package/cookie_utils.js +18 -12
- package/cookie_utils.js.map +1 -1
- package/crawlers/context_pipeline.d.ts +71 -0
- package/crawlers/context_pipeline.d.ts.map +1 -0
- package/crawlers/context_pipeline.js +123 -0
- package/crawlers/context_pipeline.js.map +1 -0
- package/crawlers/crawler_commons.d.ts +19 -28
- package/crawlers/crawler_commons.d.ts.map +1 -1
- package/crawlers/crawler_commons.js +12 -20
- package/crawlers/crawler_commons.js.map +1 -1
- package/crawlers/crawler_utils.d.ts +2 -2
- package/crawlers/crawler_utils.d.ts.map +1 -1
- package/crawlers/crawler_utils.js +1 -1
- package/crawlers/crawler_utils.js.map +1 -1
- package/crawlers/error_snapshotter.d.ts +3 -2
- package/crawlers/error_snapshotter.d.ts.map +1 -1
- package/crawlers/error_snapshotter.js +2 -2
- package/crawlers/error_snapshotter.js.map +1 -1
- package/crawlers/error_tracker.d.ts +2 -1
- package/crawlers/error_tracker.d.ts.map +1 -1
- package/crawlers/error_tracker.js.map +1 -1
- package/crawlers/index.d.ts +1 -1
- package/crawlers/index.d.ts.map +1 -1
- package/crawlers/index.js +1 -1
- package/crawlers/index.js.map +1 -1
- package/crawlers/internals/types.d.ts +8 -0
- package/crawlers/internals/types.d.ts.map +1 -0
- package/crawlers/internals/types.js +2 -0
- package/crawlers/internals/types.js.map +1 -0
- package/crawlers/statistics.d.ts +15 -15
- package/crawlers/statistics.d.ts.map +1 -1
- package/crawlers/statistics.js +21 -24
- package/crawlers/statistics.js.map +1 -1
- package/enqueue_links/enqueue_links.d.ts +32 -18
- package/enqueue_links/enqueue_links.d.ts.map +1 -1
- package/enqueue_links/enqueue_links.js +45 -24
- package/enqueue_links/enqueue_links.js.map +1 -1
- package/enqueue_links/shared.d.ts +25 -8
- package/enqueue_links/shared.d.ts.map +1 -1
- package/enqueue_links/shared.js +69 -37
- package/enqueue_links/shared.js.map +1 -1
- package/errors.d.ts +33 -3
- package/errors.d.ts.map +1 -1
- package/errors.js +48 -4
- package/errors.js.map +1 -1
- package/events/event_manager.d.ts +8 -5
- package/events/event_manager.d.ts.map +1 -1
- package/events/event_manager.js +7 -9
- package/events/event_manager.js.map +1 -1
- package/events/local_event_manager.d.ts +14 -4
- package/events/local_event_manager.d.ts.map +1 -1
- package/events/local_event_manager.js +33 -39
- package/events/local_event_manager.js.map +1 -1
- package/index.d.ts +3 -2
- package/index.d.ts.map +1 -1
- package/index.js +2 -1
- package/index.js.map +1 -1
- package/log.d.ts +82 -2
- package/log.d.ts.map +1 -1
- package/log.js +102 -0
- package/log.js.map +1 -1
- package/package.json +9 -10
- package/proxy_configuration.d.ts +14 -148
- package/proxy_configuration.d.ts.map +1 -1
- package/proxy_configuration.js +19 -167
- package/proxy_configuration.js.map +1 -1
- package/recoverable_state.d.ts +121 -0
- package/recoverable_state.d.ts.map +1 -0
- package/recoverable_state.js +142 -0
- package/recoverable_state.js.map +1 -0
- package/request.d.ts +74 -10
- package/request.d.ts.map +1 -1
- package/request.js +85 -23
- package/request.js.map +1 -1
- package/router.d.ts.map +1 -1
- package/router.js.map +1 -1
- package/serialization.js +1 -1
- package/serialization.js.map +1 -1
- package/service_locator.d.ts +157 -0
- package/service_locator.d.ts.map +1 -0
- package/service_locator.js +234 -0
- package/service_locator.js.map +1 -0
- package/session_pool/index.d.ts +0 -1
- package/session_pool/index.d.ts.map +1 -1
- package/session_pool/index.js +0 -1
- package/session_pool/index.js.map +1 -1
- package/session_pool/session.d.ts +26 -72
- package/session_pool/session.d.ts.map +1 -1
- package/session_pool/session.js +36 -98
- package/session_pool/session.js.map +1 -1
- package/session_pool/session_pool.d.ts +65 -71
- package/session_pool/session_pool.d.ts.map +1 -1
- package/session_pool/session_pool.js +101 -100
- package/session_pool/session_pool.js.map +1 -1
- package/storages/dataset.d.ts +90 -46
- package/storages/dataset.d.ts.map +1 -1
- package/storages/dataset.js +149 -121
- package/storages/dataset.js.map +1 -1
- package/storages/index.d.ts +3 -1
- package/storages/index.d.ts.map +1 -1
- package/storages/index.js +3 -1
- package/storages/index.js.map +1 -1
- package/storages/key_value_store.d.ts +104 -22
- package/storages/key_value_store.d.ts.map +1 -1
- package/storages/key_value_store.js +166 -51
- package/storages/key_value_store.js.map +1 -1
- package/storages/request_list.d.ts +9 -9
- package/storages/request_list.d.ts.map +1 -1
- package/storages/request_list.js +13 -8
- package/storages/request_list.js.map +1 -1
- package/storages/request_list_adapter.d.ts +58 -0
- package/storages/request_list_adapter.d.ts.map +1 -0
- package/storages/request_list_adapter.js +81 -0
- package/storages/request_list_adapter.js.map +1 -0
- package/storages/request_manager_tandem.d.ts +68 -0
- package/storages/request_manager_tandem.d.ts.map +1 -0
- package/storages/request_manager_tandem.js +124 -0
- package/storages/request_manager_tandem.js.map +1 -0
- package/storages/request_provider.d.ts +87 -22
- package/storages/request_provider.d.ts.map +1 -1
- package/storages/request_provider.js +127 -77
- package/storages/request_provider.js.map +1 -1
- package/storages/request_queue.d.ts +1 -3
- package/storages/request_queue.d.ts.map +1 -1
- package/storages/request_queue.js +2 -4
- package/storages/request_queue.js.map +1 -1
- package/storages/request_queue_v2.d.ts +3 -3
- package/storages/request_queue_v2.d.ts.map +1 -1
- package/storages/request_queue_v2.js +4 -5
- package/storages/request_queue_v2.js.map +1 -1
- package/storages/sitemap_request_list.d.ts +5 -5
- package/storages/sitemap_request_list.d.ts.map +1 -1
- package/storages/sitemap_request_list.js +10 -7
- package/storages/sitemap_request_list.js.map +1 -1
- package/storages/storage_instance_manager.d.ts +91 -0
- package/storages/storage_instance_manager.d.ts.map +1 -0
- package/storages/storage_instance_manager.js +236 -0
- package/storages/storage_instance_manager.js.map +1 -0
- package/storages/utils.d.ts +47 -1
- package/storages/utils.d.ts.map +1 -1
- package/storages/utils.js +57 -5
- package/storages/utils.js.map +1 -1
- package/typedefs.d.ts +1 -1
- package/typedefs.d.ts.map +1 -1
- package/validators.d.ts +4 -0
- package/validators.d.ts.map +1 -1
- package/validators.js +4 -0
- package/validators.js.map +1 -1
- package/crawlers/crawler_extension.d.ts +0 -12
- package/crawlers/crawler_extension.d.ts.map +0 -1
- package/crawlers/crawler_extension.js +0 -14
- package/crawlers/crawler_extension.js.map +0 -1
- package/http_clients/base-http-client.d.ts +0 -134
- package/http_clients/base-http-client.d.ts.map +0 -1
- package/http_clients/base-http-client.js +0 -33
- package/http_clients/base-http-client.js.map +0 -1
- package/http_clients/form-data-like.d.ts +0 -67
- package/http_clients/form-data-like.d.ts.map +0 -1
- package/http_clients/form-data-like.js +0 -5
- package/http_clients/form-data-like.js.map +0 -1
- package/http_clients/got-scraping-http-client.d.ts +0 -15
- package/http_clients/got-scraping-http-client.d.ts.map +0 -1
- package/http_clients/got-scraping-http-client.js +0 -69
- package/http_clients/got-scraping-http-client.js.map +0 -1
- package/http_clients/index.d.ts +0 -3
- package/http_clients/index.d.ts.map +0 -1
- package/http_clients/index.js +0 -3
- package/http_clients/index.js.map +0 -1
- package/session_pool/events.d.ts +0 -3
- package/session_pool/events.d.ts.map +0 -1
- package/session_pool/events.js +0 -3
- package/session_pool/events.js.map +0 -1
- package/storages/storage_manager.d.ts +0 -58
- package/storages/storage_manager.d.ts.map +0 -1
- package/storages/storage_manager.js +0 -105
- package/storages/storage_manager.js.map +0 -1
- package/tsconfig.build.tsbuildinfo +0 -1
|
@@ -1,25 +1,81 @@
|
|
|
1
|
-
import type { BatchAddRequestsResult, Dictionary, ProcessedRequest, QueueOperationInfo, RequestQueueClient, RequestQueueInfo
|
|
1
|
+
import type { BaseHttpClient, BatchAddRequestsResult, Dictionary, ProcessedRequest, QueueOperationInfo, RequestQueueClient, RequestQueueInfo } from '@crawlee/types';
|
|
2
|
+
import type { ReadonlyDeep } from 'type-fest';
|
|
2
3
|
import { ListDictionary, LruCache } from '@apify/datastructures';
|
|
3
|
-
import type { Log } from '@apify/log';
|
|
4
4
|
import { Configuration } from '../configuration.js';
|
|
5
|
+
import type { EventManager } from '../events/event_manager.js';
|
|
6
|
+
import type { CrawleeLogger } from '../log.js';
|
|
5
7
|
import type { ProxyConfiguration } from '../proxy_configuration.js';
|
|
6
8
|
import type { InternalSource, RequestOptions, Source } from '../request.js';
|
|
7
9
|
import { Request } from '../request.js';
|
|
8
|
-
import type { IStorage,
|
|
9
|
-
|
|
10
|
-
|
|
10
|
+
import type { IStorage, StorageIdentifier } from './storage_instance_manager.js';
|
|
11
|
+
import type { StorageOpenOptions } from './utils.js';
|
|
12
|
+
export type RequestsLike = AsyncIterable<Source | string> | Iterable<Source | string> | (Source | string)[];
|
|
13
|
+
/**
|
|
14
|
+
* Represents a provider of requests/URLs to crawl.
|
|
15
|
+
*/
|
|
16
|
+
export interface IRequestManager {
|
|
17
|
+
/**
|
|
18
|
+
* Returns `true` if all requests were already handled and there are no more left.
|
|
19
|
+
*/
|
|
20
|
+
isFinished(): Promise<boolean>;
|
|
21
|
+
/**
|
|
22
|
+
* Resolves to `true` if the next call to {@link IRequestManager.fetchNextRequest} function
|
|
23
|
+
* would return `null`, otherwise it resolves to `false`.
|
|
24
|
+
* Note that even if the provider is empty, there might be some pending requests currently being processed.
|
|
25
|
+
*/
|
|
26
|
+
isEmpty(): Promise<boolean>;
|
|
27
|
+
/**
|
|
28
|
+
* Returns number of handled requests.
|
|
29
|
+
*/
|
|
30
|
+
handledCount(): Promise<number>;
|
|
31
|
+
/**
|
|
32
|
+
* Get the total number of requests known to the request manager.
|
|
33
|
+
*/
|
|
34
|
+
getTotalCount(): number;
|
|
35
|
+
/**
|
|
36
|
+
* Get an offline approximation of the number of pending requests.
|
|
37
|
+
*/
|
|
38
|
+
getPendingCount(): number;
|
|
39
|
+
/**
|
|
40
|
+
* Gets the next {@link Request} to process.
|
|
41
|
+
*
|
|
42
|
+
* The function's `Promise` resolves to `null` if there are no more
|
|
43
|
+
* requests to process.
|
|
44
|
+
*/
|
|
45
|
+
fetchNextRequest<T extends Dictionary = Dictionary>(): Promise<Request<T> | null>;
|
|
46
|
+
/**
|
|
47
|
+
* Can be used to iterate over the `RequestManager` instance in a `for await .. of` loop.
|
|
48
|
+
* Provides an alternative for the repeated use of `fetchNextRequest`.
|
|
49
|
+
*/
|
|
50
|
+
[Symbol.asyncIterator](): AsyncGenerator<Request>;
|
|
51
|
+
/**
|
|
52
|
+
* Marks request as handled after successful processing.
|
|
53
|
+
*/
|
|
54
|
+
markRequestHandled(request: Request): Promise<RequestQueueOperationInfo | void | null>;
|
|
55
|
+
/**
|
|
56
|
+
* Reclaims request to the provider if its processing failed.
|
|
57
|
+
* The request will become available in the next `fetchNextRequest()`.
|
|
58
|
+
*/
|
|
59
|
+
reclaimRequest(request: Request, options?: RequestQueueOperationOptions): Promise<RequestQueueOperationInfo | null>;
|
|
60
|
+
addRequest(requestLike: Source, options?: RequestQueueOperationOptions): Promise<RequestQueueOperationInfo>;
|
|
61
|
+
addRequestsBatched(requests: RequestsLike, options?: AddRequestsBatchedOptions): Promise<AddRequestsBatchedResult>;
|
|
62
|
+
}
|
|
63
|
+
export declare abstract class RequestProvider implements IStorage, IRequestManager {
|
|
64
|
+
protected readonly config: Configuration;
|
|
11
65
|
id: string;
|
|
12
66
|
name?: string;
|
|
13
67
|
timeoutSecs: number;
|
|
14
68
|
clientKey: string;
|
|
15
69
|
client: RequestQueueClient;
|
|
16
70
|
protected proxyConfiguration?: ProxyConfiguration;
|
|
17
|
-
log:
|
|
71
|
+
log: CrawleeLogger;
|
|
18
72
|
internalTimeoutMillis: number;
|
|
19
73
|
requestLockSecs: number;
|
|
20
74
|
assumedTotalCount: number;
|
|
21
75
|
assumedHandledCount: number;
|
|
22
76
|
private initialCount;
|
|
77
|
+
private initialHandledCount;
|
|
78
|
+
private isInitialized;
|
|
23
79
|
protected queueHeadIds: ListDictionary<string>;
|
|
24
80
|
protected requestCache: LruCache<RequestLruItem>;
|
|
25
81
|
protected recentlyHandledRequestsCache: LruCache<boolean>;
|
|
@@ -27,6 +83,8 @@ export declare abstract class RequestProvider implements IStorage {
|
|
|
27
83
|
protected lastActivity: Date;
|
|
28
84
|
protected isFinishedCalledWhileHeadWasNotEmpty: number;
|
|
29
85
|
protected inProgressRequestBatchCount: number;
|
|
86
|
+
protected httpClient?: BaseHttpClient;
|
|
87
|
+
protected readonly events: EventManager;
|
|
30
88
|
constructor(options: InternalRequestProviderOptions, config?: Configuration);
|
|
31
89
|
/**
|
|
32
90
|
* Returns an offline approximation of the total number of requests in the queue (i.e. pending + handled).
|
|
@@ -34,6 +92,12 @@ export declare abstract class RequestProvider implements IStorage {
|
|
|
34
92
|
* Survives restarts and actor migrations.
|
|
35
93
|
*/
|
|
36
94
|
getTotalCount(): number;
|
|
95
|
+
/**
|
|
96
|
+
* Returns an offline approximation of the total number of pending requests in the queue.
|
|
97
|
+
*
|
|
98
|
+
* Survives restarts and Actor migrations.
|
|
99
|
+
*/
|
|
100
|
+
getPendingCount(): number;
|
|
37
101
|
/**
|
|
38
102
|
* Adds a request to the queue.
|
|
39
103
|
*
|
|
@@ -63,7 +127,7 @@ export declare abstract class RequestProvider implements IStorage {
|
|
|
63
127
|
* Note that the function sets the `uniqueKey` and `id` fields to the passed requests if missing.
|
|
64
128
|
* @param [options] Request queue operation options.
|
|
65
129
|
*/
|
|
66
|
-
addRequests(requestsLike:
|
|
130
|
+
addRequests(requestsLike: RequestsLike, options?: RequestQueueOperationOptions): Promise<BatchAddRequestsResult>;
|
|
67
131
|
/**
|
|
68
132
|
* Adds requests to the queue in batches. By default, it will resolve after the initial batch is added, and continue
|
|
69
133
|
* adding the rest in the background. You can configure the batch size via `batchSize` option and the sleep time in between
|
|
@@ -73,7 +137,7 @@ export declare abstract class RequestProvider implements IStorage {
|
|
|
73
137
|
* @param requests The requests to add
|
|
74
138
|
* @param options Options for the request queue
|
|
75
139
|
*/
|
|
76
|
-
addRequestsBatched(requests:
|
|
140
|
+
addRequestsBatched(requests: ReadonlyDeep<RequestsLike>, options?: AddRequestsBatchedOptions): Promise<AddRequestsBatchedResult>;
|
|
77
141
|
/**
|
|
78
142
|
* Gets the request from the queue specified by ID.
|
|
79
143
|
*
|
|
@@ -98,7 +162,7 @@ export declare abstract class RequestProvider implements IStorage {
|
|
|
98
162
|
* @returns
|
|
99
163
|
* Returns the request object or `null` if there are no more pending requests.
|
|
100
164
|
*/
|
|
101
|
-
abstract fetchNextRequest<T extends Dictionary = Dictionary>(
|
|
165
|
+
abstract fetchNextRequest<T extends Dictionary = Dictionary>(): Promise<Request<T> | null>;
|
|
102
166
|
/**
|
|
103
167
|
* Marks a request that was previously returned by the
|
|
104
168
|
* {@link RequestQueue.fetchNextRequest}
|
|
@@ -142,6 +206,10 @@ export declare abstract class RequestProvider implements IStorage {
|
|
|
142
206
|
* depending on the mode of operation.
|
|
143
207
|
*/
|
|
144
208
|
drop(): Promise<void>;
|
|
209
|
+
/**
|
|
210
|
+
* @inheritdoc
|
|
211
|
+
*/
|
|
212
|
+
[Symbol.asyncIterator](): AsyncGenerator<Request<Dictionary>, void, unknown>;
|
|
145
213
|
/**
|
|
146
214
|
* Returns the number of handled requests.
|
|
147
215
|
*
|
|
@@ -150,23 +218,17 @@ export declare abstract class RequestProvider implements IStorage {
|
|
|
150
218
|
* ```javascript
|
|
151
219
|
* const { handledRequestCount } = await queue.getInfo();
|
|
152
220
|
* ```
|
|
221
|
+
* @inheritdoc
|
|
153
222
|
*/
|
|
154
223
|
handledCount(): Promise<number>;
|
|
155
224
|
/**
|
|
156
225
|
* Returns an object containing general information about the request queue.
|
|
157
226
|
*
|
|
158
|
-
* The function returns the same object as the Apify API Client's
|
|
159
|
-
* [getQueue](https://docs.apify.com/api/apify-client-js/latest#ApifyClient-requestQueues)
|
|
160
|
-
* function, which in turn calls the
|
|
161
|
-
* [Get request queue](https://apify.com/docs/api/v2#/reference/request-queues/queue/get-request-queue)
|
|
162
|
-
* API endpoint.
|
|
163
|
-
*
|
|
164
227
|
* **Example:**
|
|
165
228
|
* ```
|
|
166
229
|
* {
|
|
167
230
|
* id: "WkzbQMuFYuamGv3YF",
|
|
168
231
|
* name: "my-queue",
|
|
169
|
-
* userId: "wRsJZtadYvn4mBZmm",
|
|
170
232
|
* createdAt: new Date("2015-12-12T07:34:14.202Z"),
|
|
171
233
|
* modifiedAt: new Date("2015-12-13T08:36:13.202Z"),
|
|
172
234
|
* accessedAt: new Date("2015-12-14T08:36:13.202Z"),
|
|
@@ -175,8 +237,10 @@ export declare abstract class RequestProvider implements IStorage {
|
|
|
175
237
|
* pendingRequestCount: 20,
|
|
176
238
|
* }
|
|
177
239
|
* ```
|
|
240
|
+
*
|
|
241
|
+
* @throws If the underlying storage no longer exists (e.g. it was deleted externally).
|
|
178
242
|
*/
|
|
179
|
-
getInfo(): Promise<RequestQueueInfo
|
|
243
|
+
getInfo(): Promise<RequestQueueInfo>;
|
|
180
244
|
/**
|
|
181
245
|
* Fetches URLs from requestsFromUrl and returns them in format of list of requests
|
|
182
246
|
*/
|
|
@@ -200,12 +264,13 @@ export declare abstract class RequestProvider implements IStorage {
|
|
|
200
264
|
*
|
|
201
265
|
* For more details and code examples, see the {@link RequestQueue} class.
|
|
202
266
|
*
|
|
203
|
-
* @param [
|
|
204
|
-
* ID or name of the request queue to be opened. If
|
|
205
|
-
*
|
|
267
|
+
* @param [identifier]
|
|
268
|
+
* ID or name of the request queue to be opened. If a string is provided, it will first be
|
|
269
|
+
* looked up as an ID; if no such storage exists, it will be treated as a name.
|
|
270
|
+
* If `null` or `undefined`, the function returns the default request queue associated with the crawler run.
|
|
206
271
|
* @param [options] Open Request Queue options.
|
|
207
272
|
*/
|
|
208
|
-
static open(
|
|
273
|
+
static open(identifier?: string | StorageIdentifier | null, options?: StorageOpenOptions): Promise<RequestProvider>;
|
|
209
274
|
}
|
|
210
275
|
interface RequestLruItem {
|
|
211
276
|
uniqueKey: string;
|
|
@@ -218,7 +283,7 @@ interface RequestLruItem {
|
|
|
218
283
|
export interface RequestProviderOptions {
|
|
219
284
|
id: string;
|
|
220
285
|
name?: string;
|
|
221
|
-
client:
|
|
286
|
+
client: RequestQueueClient;
|
|
222
287
|
/**
|
|
223
288
|
* Used to pass the proxy configuration for the `requestsFromUrl` objects.
|
|
224
289
|
* Takes advantage of the internal address rotation and authentication process.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"request_provider.d.ts","sourceRoot":"","sources":["../../src/storages/request_provider.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EACR,sBAAsB,EACtB,UAAU,EACV,gBAAgB,EAChB,kBAAkB,EAClB,kBAAkB,EAClB,gBAAgB,
|
|
1
|
+
{"version":3,"file":"request_provider.d.ts","sourceRoot":"","sources":["../../src/storages/request_provider.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EACR,cAAc,EACd,sBAAsB,EACtB,UAAU,EACV,gBAAgB,EAChB,kBAAkB,EAClB,kBAAkB,EAClB,gBAAgB,EACnB,MAAM,gBAAgB,CAAC;AAWxB,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,WAAW,CAAC;AAE9C,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,MAAM,uBAAuB,CAAC;AAGjE,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AACpD,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,4BAA4B,CAAC;AAE/D,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,WAAW,CAAC;AAC/C,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,2BAA2B,CAAC;AACpE,OAAO,KAAK,EAAE,cAAc,EAAE,cAAc,EAAE,MAAM,EAAE,MAAM,eAAe,CAAC;AAC5E,OAAO,EAAE,OAAO,EAAE,MAAM,eAAe,CAAC;AAGxC,OAAO,KAAK,EAAE,QAAQ,EAAE,iBAAiB,EAAE,MAAM,+BAA+B,CAAC;AACjF,OAAO,KAAK,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AAIrD,MAAM,MAAM,YAAY,GAAG,aAAa,CAAC,MAAM,GAAG,MAAM,CAAC,GAAG,QAAQ,CAAC,MAAM,GAAG,MAAM,CAAC,GAAG,CAAC,MAAM,GAAG,MAAM,CAAC,EAAE,CAAC;AAE5G;;GAEG;AACH,MAAM,WAAW,eAAe;IAC5B;;OAEG;IACH,UAAU,IAAI,OAAO,CAAC,OAAO,CAAC,CAAC;IAE/B;;;;OAIG;IACH,OAAO,IAAI,OAAO,CAAC,OAAO,CAAC,CAAC;IAE5B;;OAEG;IACH,YAAY,IAAI,OAAO,CAAC,MAAM,CAAC,CAAC;IAEhC;;OAEG;IACH,aAAa,IAAI,MAAM,CAAC;IAExB;;OAEG;IACH,eAAe,IAAI,MAAM,CAAC;IAE1B;;;;;OAKG;IACH,gBAAgB,CAAC,CAAC,SAAS,UAAU,GAAG,UAAU,KAAK,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC;IAElF;;;OAGG;IACH,CAAC,MAAM,CAAC,aAAa,CAAC,IAAI,cAAc,CAAC,OAAO,CAAC,CAAC;IAElD;;OAEG;IACH,kBAAkB,CAAC,OAAO,EAAE,OAAO,GAAG,OAAO,CAAC,yBAAyB,GAAG,IAAI,GAAG,IAAI,CAAC,CAAC;IAEvF;;;OAGG;IACH,cAAc,CAAC,OAAO,EAAE,OAAO,EAAE,OAAO,CAAC,EAAE,4BAA4B,GAAG,OAAO,CAAC,yBAAyB,GAAG,IAAI,CAAC,CAAC;IAEpH,UAAU,CAAC,WAAW,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,4BAA4B,GAAG,OAAO,CAAC,yBAAyB,CAAC,CAAC;IAE5G,kBAAkB,CAAC,QAAQ,EAAE,YAAY,EAAE,OAAO,CAAC,EAAE,yBAAyB,GAAG,OAAO,CAAC,wBAAwB,CAAC,CAAC;CACtH;AAED,8BAAsB,eAAgB,YAAW,QAAQ,EAAE,eAAe;IAwClE,SAAS,CAAC,QAAQ,CAAC,MAAM,EAAE,aAAa;IAvC5C,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,WAAW,SAAM;IACjB,SAAS,SAA0B;IACnC,MAAM,EAAE,kBAAkB,CAAC;IAC3B,SAAS,CAAC,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;IAElD,GAAG,EAAE,aAAa,CAAC;IACnB,qBAAqB,SAAc;IACnC,eAAe,SAAU;IAIzB,iBAAiB,SAAK;IACtB,mBAAmB,SAAK;IAExB,OAAO,CAAC,YAAY,CAAK;IACzB,OAAO,CAAC,mBAAmB,CAAK;IAChC,OAAO,CAAC,aAAa,CAAS;IAE9B,SAAS,CAAC,YAAY,yBAAgC;IACtD,SAAS,CAAC,YAAY,EAAE,QAAQ,CAAC,cAAc,CAAC,CAAC;IAEjD,SAAS,CAAC,4BAA4B,EAAE,QAAQ,CAAC,OAAO,CAAC,CAAC;IAE1D,SAAS,CAAC,uBAAuB,UAAS;IAE1C,SAAS,CAAC,YAAY,OAAc;IAEpC,SAAS,CAAC,oCAAoC,SAAK;IAEnD,SAAS,CAAC,2BAA2B,SAAK;IAE1C,SAAS,CAAC,UAAU,CAAC,EAAE,cAAc,CAAC;IAEtC,SAAS,CAAC,QAAQ,CAAC,MAAM,EAAE,YAAY,CAAC;gBAGpC,OAAO,EAAE,8BAA8B,EACpB,MAAM,GAAE,aAA+C;IAoB9E;;;;OAIG;IACH,aAAa;IAIb;;;;OAIG;IACH,eAAe;IAIf;;;;;;;;;;;;;OAaG;IACG,UAAU,CACZ,WAAW,EAAE,MAAM,EACnB,OAAO,GAAE,4BAAiC,GAC3C,OAAO,CAAC,yBAAyB,CAAC;IAmErC;;;;;;;;;;;;;OAaG;IACG,WAAW,CACb,YAAY,EAAE,YAAY,EAC1B,OAAO,GAAE,4BAAiC,GAC3C,OAAO,CAAC,sBAAsB,CAAC;IA4GlC;;;;;;;;OAQG;IACG,kBAAkB,CACpB,QAAQ,EAAE,YAAY,CAAC,YAAY,CAAC,EACpC,OAAO,GAAE,yBAA8B,GACxC,OAAO,CAAC,wBAAwB,CAAC;IAmIpC;;;;;OAKG;IACG,UAAU,CAAC,CAAC,SAAS,UAAU,GAAG,UAAU,EAAE,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC;IAW3F;;;;;;;;;;;;;;;;OAgBG;IACH,QAAQ,CAAC,gBAAgB,CAAC,CAAC,SAAS,UAAU,GAAG,UAAU,KAAK,OAAO,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC;IAE1F;;;;;OAKG;IACG,kBAAkB,CAAC,OAAO,EAAE,OAAO,GAAG,OAAO,CAAC,yBAAyB,GAAG,IAAI,CAAC;IAwCrF;;;;;OAKG;IACG,cAAc,CAChB,OAAO,EAAE,OAAO,EAChB,OAAO,GAAE,4BAAiC,GAC3C,OAAO,CAAC,yBAAyB,GAAG,IAAI,CAAC;IAmC5C,SAAS,CAAC,QAAQ,CAAC,oBAAoB,IAAI,OAAO,CAAC,IAAI,CAAC;IAExD;;;;;OAKG;IACG,OAAO,IAAI,OAAO,CAAC,OAAO,CAAC;IAKjC;;;;;OAKG;IACH,QAAQ,CAAC,UAAU,IAAI,OAAO,CAAC,OAAO,CAAC;IAEvC,SAAS,CAAC,MAAM;IAShB;;OAEG;IACH,SAAS,CAAC,aAAa,CAAC,QAAQ,EAAE,MAAM,EAAE,kBAAkB,EAAE,yBAAyB,GAAG,IAAI;IAc9F;;OAEG;IACH,SAAS,CAAC,2BAA2B,CAAC,SAAS,EAAE,MAAM,EAAE,SAAS,EAAE,OAAO,GAAG,IAAI;IAQlF;;;OAGG;IACG,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAO3B;;OAEG;IACI,CAAC,MAAM,CAAC,aAAa,CAAC;IAQ7B;;;;;;;;;OASG;IACG,YAAY,IAAI,OAAO,CAAC,MAAM,CAAC;IAMrC;;;;;;;;;;;;;;;;;;OAkBG;IACG,OAAO,IAAI,OAAO,CAAC,gBAAgB,CAAC;IAM1C;;OAEG;cACa,qBAAqB,CAAC,MAAM,EAAE,cAAc,GAAG,OAAO,CAAC,cAAc,EAAE,CAAC;IAwBxF;;OAEG;cACa,mBAAmB,CAC/B,MAAM,EAAE,cAAc,EACtB,eAAe,EAAE,cAAc,EAAE,EACjC,OAAO,EAAE,4BAA4B;IAiBzC;;OAEG;YACW,mBAAmB;IAWjC;;;;;;;;;;;;;;;;OAgBG;WACU,IAAI,CACb,UAAU,CAAC,EAAE,MAAM,GAAG,iBAAiB,GAAG,IAAI,EAC9C,OAAO,GAAE,kBAAuB,GACjC,OAAO,CAAC,eAAe,CAAC;CAmD9B;AAYD,UAAU,cAAc;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,OAAO,CAAC;IACnB,EAAE,EAAE,MAAM,CAAC;IACX,QAAQ,EAAE,OAAO,GAAG,IAAI,CAAC;IACzB,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,SAAS,EAAE,OAAO,CAAC;CACtB;AAED,MAAM,WAAW,sBAAsB;IACnC,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,kBAAkB,CAAC;IAE3B;;;;OAIG;IACH,kBAAkB,CAAC,EAAE,kBAAkB,CAAC;CAC3C;AAED;;GAEG;AACH,MAAM,WAAW,mBAAoB,SAAQ,sBAAsB;CAAG;AAEtE;;GAEG;AACH,MAAM,WAAW,8BAA+B,SAAQ,sBAAsB;IAC1E,SAAS,EAAE,MAAM,CAAC;IAClB,mBAAmB,EAAE,MAAM,CAAC;IAC5B,8BAA8B,EAAE,MAAM,CAAC;CAC1C;AAED,MAAM,WAAW,4BAA4B;IACzC;;;;;;;;;;;;OAYG;IACH,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB;;;;OAIG;IACH,KAAK,CAAC,EAAE,OAAO,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,yBAA0B,SAAQ,kBAAkB;IACjE,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,EAAE,OAAO,CAAC;CACtB;AAED,MAAM,WAAW,yBAA0B,SAAQ,4BAA4B;IAC3E;;;OAGG;IACH,2BAA2B,CAAC,EAAE,OAAO,CAAC;IAEtC;;OAEG;IACH,SAAS,CAAC,EAAE,MAAM,CAAC;IAEnB;;OAEG;IACH,wBAAwB,CAAC,EAAE,MAAM,CAAC;CACrC;AAED,MAAM,WAAW,wBAAwB;IACrC,aAAa,EAAE,gBAAgB,EAAE,CAAC;IAClC;;;;;;;;;;;;;;;OAeG;IACH,2BAA2B,EAAE,OAAO,CAAC,gBAAgB,EAAE,CAAC,CAAC;CAC5D"}
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import { inspect } from 'node:util';
|
|
2
|
-
import {
|
|
2
|
+
import { chunkedAsyncIterable, downloadListOfUrls, getObjectType, isAsyncIterable, isIterable, peekableAsyncIterable, sleep, } from '@crawlee/utils';
|
|
3
3
|
import ow from 'ow';
|
|
4
4
|
import { ListDictionary, LruCache } from '@apify/datastructures';
|
|
5
5
|
import { cryptoRandomObjectId } from '@apify/utilities';
|
|
6
6
|
import { Configuration } from '../configuration.js';
|
|
7
|
-
import { log } from '../log.js';
|
|
8
7
|
import { Request } from '../request.js';
|
|
8
|
+
import { serviceLocator } from '../service_locator.js';
|
|
9
9
|
import { checkStorageAccess } from './access_checking.js';
|
|
10
|
-
import {
|
|
10
|
+
import { resolveStorageIdentifier } from './storage_instance_manager.js';
|
|
11
11
|
import { getRequestId, purgeDefaultStorages, QUERY_HEAD_MIN_LENGTH } from './utils.js';
|
|
12
12
|
export class RequestProvider {
|
|
13
13
|
config;
|
|
@@ -25,6 +25,8 @@ export class RequestProvider {
|
|
|
25
25
|
assumedTotalCount = 0;
|
|
26
26
|
assumedHandledCount = 0;
|
|
27
27
|
initialCount = 0;
|
|
28
|
+
initialHandledCount = 0; // We track this separately from `assumedHandledCount` which is used non-trivially by RequestQueueV1
|
|
29
|
+
isInitialized = false;
|
|
28
30
|
queueHeadIds = new ListDictionary();
|
|
29
31
|
requestCache;
|
|
30
32
|
recentlyHandledRequestsCache;
|
|
@@ -32,20 +34,21 @@ export class RequestProvider {
|
|
|
32
34
|
lastActivity = new Date();
|
|
33
35
|
isFinishedCalledWhileHeadWasNotEmpty = 0;
|
|
34
36
|
inProgressRequestBatchCount = 0;
|
|
37
|
+
httpClient;
|
|
38
|
+
events;
|
|
35
39
|
constructor(options, config = Configuration.getGlobalConfig()) {
|
|
36
40
|
this.config = config;
|
|
37
41
|
this.id = options.id;
|
|
38
42
|
this.name = options.name;
|
|
39
|
-
this.
|
|
40
|
-
|
|
41
|
-
timeoutSecs: this.timeoutSecs,
|
|
42
|
-
});
|
|
43
|
+
this.events = serviceLocator.getEventManager();
|
|
44
|
+
this.client = options.client;
|
|
43
45
|
this.proxyConfiguration = options.proxyConfiguration;
|
|
44
46
|
this.requestCache = new LruCache({ maxLength: options.requestCacheMaxSize });
|
|
45
47
|
this.recentlyHandledRequestsCache = new LruCache({ maxLength: options.recentlyHandledRequestsMaxSize });
|
|
46
|
-
this.log =
|
|
47
|
-
|
|
48
|
-
|
|
48
|
+
this.log = serviceLocator
|
|
49
|
+
.getLogger()
|
|
50
|
+
.child({ prefix: `${options.logPrefix}(${this.id}, ${this.name ?? 'no-name'})` });
|
|
51
|
+
this.events.on("migrating" /* EventType.MIGRATING */, async () => {
|
|
49
52
|
this.queuePausedForMigration = true;
|
|
50
53
|
});
|
|
51
54
|
}
|
|
@@ -57,6 +60,14 @@ export class RequestProvider {
|
|
|
57
60
|
getTotalCount() {
|
|
58
61
|
return this.assumedTotalCount + this.initialCount;
|
|
59
62
|
}
|
|
63
|
+
/**
|
|
64
|
+
* Returns an offline approximation of the total number of pending requests in the queue.
|
|
65
|
+
*
|
|
66
|
+
* Survives restarts and Actor migrations.
|
|
67
|
+
*/
|
|
68
|
+
getPendingCount() {
|
|
69
|
+
return this.getTotalCount() - this.initialHandledCount - this.assumedHandledCount;
|
|
70
|
+
}
|
|
60
71
|
/**
|
|
61
72
|
* Adds a request to the queue.
|
|
62
73
|
*
|
|
@@ -134,7 +145,9 @@ export class RequestProvider {
|
|
|
134
145
|
async addRequests(requestsLike, options = {}) {
|
|
135
146
|
checkStorageAccess();
|
|
136
147
|
this.lastActivity = new Date();
|
|
137
|
-
ow(requestsLike, ow.
|
|
148
|
+
ow(requestsLike, ow.object
|
|
149
|
+
.is((value) => isIterable(value) || isAsyncIterable(value))
|
|
150
|
+
.message((value) => `Expected an iterable or async iterable, got ${getObjectType(value)}`));
|
|
138
151
|
ow(options, ow.object.exactShape({
|
|
139
152
|
forefront: ow.optional.boolean,
|
|
140
153
|
cache: ow.optional.boolean,
|
|
@@ -153,17 +166,19 @@ export class RequestProvider {
|
|
|
153
166
|
processedRequests: [],
|
|
154
167
|
unprocessedRequests: [],
|
|
155
168
|
};
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
169
|
+
const requests = [];
|
|
170
|
+
for await (const requestLike of requestsLike) {
|
|
171
|
+
if (typeof requestLike === 'string') {
|
|
172
|
+
requests.push(new Request({ url: requestLike }));
|
|
173
|
+
}
|
|
174
|
+
else if ('requestsFromUrl' in requestLike) {
|
|
175
|
+
const fetchedRequests = await this._fetchRequestsFromUrl(requestLike);
|
|
176
|
+
await this._addFetchedRequests(requestLike, fetchedRequests, options);
|
|
177
|
+
}
|
|
178
|
+
else {
|
|
179
|
+
requests.push(requestLike instanceof Request ? requestLike : new Request(requestLike));
|
|
160
180
|
}
|
|
161
181
|
}
|
|
162
|
-
const requests = requestsLike
|
|
163
|
-
.filter((requestLike) => !('requestsFromUrl' in requestLike))
|
|
164
|
-
.map((requestLike) => {
|
|
165
|
-
return requestLike instanceof Request ? requestLike : new Request(requestLike);
|
|
166
|
-
});
|
|
167
182
|
const requestsToAdd = new Map();
|
|
168
183
|
for (const request of requests) {
|
|
169
184
|
const cacheKey = getCachedRequestId(request.uniqueKey);
|
|
@@ -219,43 +234,44 @@ export class RequestProvider {
|
|
|
219
234
|
async addRequestsBatched(requests, options = {}) {
|
|
220
235
|
checkStorageAccess();
|
|
221
236
|
this.lastActivity = new Date();
|
|
237
|
+
ow(requests, ow.object
|
|
238
|
+
.is((value) => isIterable(value) || isAsyncIterable(value))
|
|
239
|
+
.message((value) => `Expected an iterable or async iterable, got ${getObjectType(value)}`));
|
|
222
240
|
ow(options, ow.object.exactShape({
|
|
223
241
|
forefront: ow.optional.boolean,
|
|
224
242
|
waitForAllRequestsToBeAdded: ow.optional.boolean,
|
|
225
243
|
batchSize: ow.optional.number,
|
|
226
244
|
waitBetweenBatchesMillis: ow.optional.number,
|
|
227
245
|
}));
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
246
|
+
const addRequest = this.addRequest.bind(this);
|
|
247
|
+
async function* generateRequests() {
|
|
248
|
+
for await (const opts of requests) {
|
|
249
|
+
// Validate the input
|
|
250
|
+
if (typeof opts === 'object' && opts !== null) {
|
|
251
|
+
if (opts.url !== undefined && typeof opts.url !== 'string') {
|
|
252
|
+
throw new Error(`Request options are not valid, the 'url' property is not a string. Input: ${inspect(opts)}`);
|
|
253
|
+
}
|
|
254
|
+
if (opts.id !== undefined) {
|
|
255
|
+
throw new Error(`Request options are not valid, the 'id' property must not be present. Input: ${inspect(opts)}`);
|
|
256
|
+
}
|
|
257
|
+
if (opts.requestsFromUrl !== undefined &&
|
|
258
|
+
typeof opts.requestsFromUrl !== 'string') {
|
|
259
|
+
throw new Error(`Request options are not valid, the 'requestsFromUrl' property is not a string. Input: ${inspect(opts)}`);
|
|
260
|
+
}
|
|
242
261
|
}
|
|
243
|
-
if (typeof
|
|
244
|
-
|
|
262
|
+
if (opts && typeof opts === 'object' && 'requestsFromUrl' in opts) {
|
|
263
|
+
// Handle URL lists right away
|
|
264
|
+
await addRequest(opts, { forefront: options.forefront });
|
|
265
|
+
}
|
|
266
|
+
else {
|
|
267
|
+
// Yield valid requests
|
|
268
|
+
yield typeof opts === 'string' ? { url: opts } : opts;
|
|
245
269
|
}
|
|
246
270
|
}
|
|
247
|
-
throw new Error(`Request options are not valid, provide either a URL or an object with 'url' property (but without 'id' property), or an object with 'requestsFromUrl' property. Input: ${inspect(request)}`);
|
|
248
271
|
}
|
|
249
272
|
const { batchSize = 1000, waitBetweenBatchesMillis = 1000 } = options;
|
|
250
|
-
const
|
|
251
|
-
|
|
252
|
-
if (opts && typeof opts === 'object' && 'requestsFromUrl' in opts) {
|
|
253
|
-
await this.addRequest(opts, { forefront: options.forefront });
|
|
254
|
-
}
|
|
255
|
-
else {
|
|
256
|
-
sources.push(typeof opts === 'string' ? { url: opts } : opts);
|
|
257
|
-
}
|
|
258
|
-
}
|
|
273
|
+
const chunks = peekableAsyncIterable(chunkedAsyncIterable(generateRequests(), batchSize));
|
|
274
|
+
const chunksIterator = chunks[Symbol.asyncIterator]();
|
|
259
275
|
const attemptToAddToQueueAndAddAnyUnprocessed = async (providedRequests, cache = true) => {
|
|
260
276
|
const resultsToReturn = [];
|
|
261
277
|
const apiResult = await this.addRequests(providedRequests, { forefront: options.forefront, cache });
|
|
@@ -266,11 +282,15 @@ export class RequestProvider {
|
|
|
266
282
|
}
|
|
267
283
|
return resultsToReturn;
|
|
268
284
|
};
|
|
269
|
-
const initialChunk = sources.splice(0, batchSize);
|
|
270
285
|
// Add initial batch of `batchSize` to process them right away
|
|
286
|
+
const initialChunk = await chunksIterator.peek();
|
|
287
|
+
if (initialChunk === undefined) {
|
|
288
|
+
return { addedRequests: [], waitForAllRequestsToBeAdded: Promise.resolve([]) };
|
|
289
|
+
}
|
|
271
290
|
const addedRequests = await attemptToAddToQueueAndAddAnyUnprocessed(initialChunk);
|
|
272
|
-
|
|
273
|
-
|
|
291
|
+
await chunksIterator.next();
|
|
292
|
+
// If we have no more requests to add, return immediately
|
|
293
|
+
if ((await chunksIterator.peek()) === undefined) {
|
|
274
294
|
return {
|
|
275
295
|
addedRequests,
|
|
276
296
|
waitForAllRequestsToBeAdded: Promise.resolve([]),
|
|
@@ -278,9 +298,8 @@ export class RequestProvider {
|
|
|
278
298
|
}
|
|
279
299
|
// eslint-disable-next-line no-async-promise-executor
|
|
280
300
|
const promise = new Promise(async (resolve) => {
|
|
281
|
-
const chunks = chunk(sources, batchSize);
|
|
282
301
|
const finalAddedRequests = [];
|
|
283
|
-
for (const requestChunk of chunks) {
|
|
302
|
+
for await (const requestChunk of chunks) {
|
|
284
303
|
finalAddedRequests.push(...(await attemptToAddToQueueAndAddAnyUnprocessed(requestChunk, false)));
|
|
285
304
|
await sleep(waitBetweenBatchesMillis);
|
|
286
305
|
}
|
|
@@ -425,9 +444,19 @@ export class RequestProvider {
|
|
|
425
444
|
*/
|
|
426
445
|
async drop() {
|
|
427
446
|
checkStorageAccess();
|
|
428
|
-
await this.client.
|
|
429
|
-
|
|
430
|
-
|
|
447
|
+
await this.client.drop();
|
|
448
|
+
serviceLocator.getStorageInstanceManager().removeFromCache(this);
|
|
449
|
+
}
|
|
450
|
+
/**
|
|
451
|
+
* @inheritdoc
|
|
452
|
+
*/
|
|
453
|
+
async *[Symbol.asyncIterator]() {
|
|
454
|
+
while (true) {
|
|
455
|
+
const req = await this.fetchNextRequest();
|
|
456
|
+
if (!req)
|
|
457
|
+
break;
|
|
458
|
+
yield req;
|
|
459
|
+
}
|
|
431
460
|
}
|
|
432
461
|
/**
|
|
433
462
|
* Returns the number of handled requests.
|
|
@@ -437,27 +466,21 @@ export class RequestProvider {
|
|
|
437
466
|
* ```javascript
|
|
438
467
|
* const { handledRequestCount } = await queue.getInfo();
|
|
439
468
|
* ```
|
|
469
|
+
* @inheritdoc
|
|
440
470
|
*/
|
|
441
471
|
async handledCount() {
|
|
442
472
|
// NOTE: We keep this function for compatibility with RequestList.handledCount()
|
|
443
|
-
const { handledRequestCount } =
|
|
444
|
-
return handledRequestCount
|
|
473
|
+
const { handledRequestCount } = await this.getInfo();
|
|
474
|
+
return handledRequestCount;
|
|
445
475
|
}
|
|
446
476
|
/**
|
|
447
477
|
* Returns an object containing general information about the request queue.
|
|
448
478
|
*
|
|
449
|
-
* The function returns the same object as the Apify API Client's
|
|
450
|
-
* [getQueue](https://docs.apify.com/api/apify-client-js/latest#ApifyClient-requestQueues)
|
|
451
|
-
* function, which in turn calls the
|
|
452
|
-
* [Get request queue](https://apify.com/docs/api/v2#/reference/request-queues/queue/get-request-queue)
|
|
453
|
-
* API endpoint.
|
|
454
|
-
*
|
|
455
479
|
* **Example:**
|
|
456
480
|
* ```
|
|
457
481
|
* {
|
|
458
482
|
* id: "WkzbQMuFYuamGv3YF",
|
|
459
483
|
* name: "my-queue",
|
|
460
|
-
* userId: "wRsJZtadYvn4mBZmm",
|
|
461
484
|
* createdAt: new Date("2015-12-12T07:34:14.202Z"),
|
|
462
485
|
* modifiedAt: new Date("2015-12-13T08:36:13.202Z"),
|
|
463
486
|
* accessedAt: new Date("2015-12-14T08:36:13.202Z"),
|
|
@@ -466,10 +489,12 @@ export class RequestProvider {
|
|
|
466
489
|
* pendingRequestCount: 20,
|
|
467
490
|
* }
|
|
468
491
|
* ```
|
|
492
|
+
*
|
|
493
|
+
* @throws If the underlying storage no longer exists (e.g. it was deleted externally).
|
|
469
494
|
*/
|
|
470
495
|
async getInfo() {
|
|
471
496
|
checkStorageAccess();
|
|
472
|
-
return this.client.
|
|
497
|
+
return this.client.getMetadata();
|
|
473
498
|
}
|
|
474
499
|
/**
|
|
475
500
|
* Fetches URLs from requestsFromUrl and returns them in format of list of requests
|
|
@@ -515,7 +540,10 @@ export class RequestProvider {
|
|
|
515
540
|
* @internal wraps public utility for mocking purposes
|
|
516
541
|
*/
|
|
517
542
|
async _downloadListOfUrls(options) {
|
|
518
|
-
return downloadListOfUrls(
|
|
543
|
+
return downloadListOfUrls({
|
|
544
|
+
...options,
|
|
545
|
+
httpClient: this.httpClient,
|
|
546
|
+
});
|
|
519
547
|
}
|
|
520
548
|
/**
|
|
521
549
|
* Opens a request queue and returns a promise resolving to an instance
|
|
@@ -528,27 +556,49 @@ export class RequestProvider {
|
|
|
528
556
|
*
|
|
529
557
|
* For more details and code examples, see the {@link RequestQueue} class.
|
|
530
558
|
*
|
|
531
|
-
* @param [
|
|
532
|
-
* ID or name of the request queue to be opened. If
|
|
533
|
-
*
|
|
559
|
+
* @param [identifier]
|
|
560
|
+
* ID or name of the request queue to be opened. If a string is provided, it will first be
|
|
561
|
+
* looked up as an ID; if no such storage exists, it will be treated as a name.
|
|
562
|
+
* If `null` or `undefined`, the function returns the default request queue associated with the crawler run.
|
|
534
563
|
* @param [options] Open Request Queue options.
|
|
535
564
|
*/
|
|
536
|
-
static async open(
|
|
565
|
+
static async open(identifier, options = {}) {
|
|
537
566
|
checkStorageAccess();
|
|
538
|
-
ow(queueIdOrName, ow.optional.any(ow.string, ow.null));
|
|
539
567
|
ow(options, ow.object.exactShape({
|
|
540
568
|
config: ow.optional.object.instanceOf(Configuration),
|
|
541
569
|
storageClient: ow.optional.object,
|
|
542
570
|
proxyConfiguration: ow.optional.object,
|
|
571
|
+
httpClient: ow.optional.object,
|
|
543
572
|
}));
|
|
544
|
-
options.
|
|
545
|
-
|
|
546
|
-
await purgeDefaultStorages({ onlyPurgeOnce: true, client
|
|
547
|
-
const
|
|
548
|
-
const queue = await
|
|
573
|
+
const client = options.storageClient ?? serviceLocator.getStorageClient();
|
|
574
|
+
const config = options.config ?? serviceLocator.getConfiguration();
|
|
575
|
+
await purgeDefaultStorages({ onlyPurgeOnce: true, client, config });
|
|
576
|
+
const resolved = await resolveStorageIdentifier(identifier, client, 'RequestQueue');
|
|
577
|
+
const queue = await serviceLocator
|
|
578
|
+
.getStorageInstanceManager()
|
|
579
|
+
.openStorage(this, {
|
|
580
|
+
...resolved,
|
|
581
|
+
clientOpener: () => client.createRequestQueueClient(resolved),
|
|
582
|
+
clientCacheKey: client.getStorageClientCacheKey?.() ?? client.constructor.name,
|
|
583
|
+
});
|
|
549
584
|
queue.proxyConfiguration = options.proxyConfiguration;
|
|
550
|
-
|
|
551
|
-
|
|
585
|
+
queue.httpClient = options.httpClient;
|
|
586
|
+
if (!queue.isInitialized) {
|
|
587
|
+
// Re-create the request queue client with clientKey and timeoutSecs so that
|
|
588
|
+
// request locking works correctly for API-backed implementations.
|
|
589
|
+
// TODO: clientKey/timeoutSecs are Apify-platform concerns and should eventually be pushed
|
|
590
|
+
// down into the Apify SDK's client implementation, aligning with crawlee-python's approach
|
|
591
|
+
// where locking is handled internally by the client (see crawlee-python PR #1194).
|
|
592
|
+
queue.client = await client.createRequestQueueClient({
|
|
593
|
+
id: queue.id,
|
|
594
|
+
clientKey: queue.clientKey,
|
|
595
|
+
timeoutSecs: queue.timeoutSecs,
|
|
596
|
+
});
|
|
597
|
+
const queueInfo = await queue.client.getMetadata();
|
|
598
|
+
queue.initialCount = queueInfo.totalRequestCount;
|
|
599
|
+
queue.initialHandledCount = queueInfo.handledRequestCount;
|
|
600
|
+
queue.isInitialized = true;
|
|
601
|
+
}
|
|
552
602
|
return queue;
|
|
553
603
|
}
|
|
554
604
|
}
|