@crawlee/core 3.13.3-beta.11 → 3.13.3-beta.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/autoscaling/autoscaled_pool.d.ts +16 -16
- package/autoscaling/autoscaled_pool.js +13 -13
- package/autoscaling/snapshotter.d.ts +1 -1
- package/autoscaling/snapshotter.js +1 -1
- package/autoscaling/system_status.d.ts +12 -12
- package/autoscaling/system_status.js +11 -11
- package/configuration.d.ts +10 -10
- package/configuration.js +4 -4
- package/crawlers/crawler_commons.d.ts +12 -12
- package/crawlers/crawler_commons.js +4 -4
- package/crawlers/statistics.d.ts +2 -2
- package/crawlers/statistics.js +1 -1
- package/enqueue_links/enqueue_links.d.ts +14 -14
- package/enqueue_links/enqueue_links.js +5 -5
- package/enqueue_links/shared.d.ts +2 -2
- package/http_clients/base-http-client.d.ts +7 -7
- package/http_clients/base-http-client.js +1 -1
- package/package.json +5 -5
- package/proxy_configuration.d.ts +11 -11
- package/proxy_configuration.js +8 -8
- package/request.d.ts +3 -3
- package/request.js +2 -2
- package/session_pool/session.d.ts +1 -1
- package/session_pool/session_pool.d.ts +12 -12
- package/session_pool/session_pool.js +10 -10
- package/storages/dataset.d.ts +15 -15
- package/storages/dataset.js +9 -9
- package/storages/key_value_store.d.ts +32 -32
- package/storages/key_value_store.js +22 -22
- package/storages/request_list.d.ts +35 -35
- package/storages/request_list.js +19 -19
- package/storages/request_provider.d.ts +19 -19
- package/storages/request_provider.js +12 -12
- package/storages/request_queue.d.ts +16 -16
- package/storages/request_queue.js +16 -16
- package/storages/request_queue_v2.d.ts +7 -7
- package/storages/request_queue_v2.js +7 -7
- package/storages/utils.d.ts +2 -2
|
@@ -19,7 +19,7 @@ export interface IRequestList {
|
|
|
19
19
|
*/
|
|
20
20
|
isFinished(): Promise<boolean>;
|
|
21
21
|
/**
|
|
22
|
-
* Resolves to `true` if the next call to {@
|
|
22
|
+
* Resolves to `true` if the next call to {@link IRequestList.fetchNextRequest} function
|
|
23
23
|
* would return `null`, otherwise it resolves to `false`.
|
|
24
24
|
* Note that even if the list is empty, there might be some pending requests currently being processed.
|
|
25
25
|
*/
|
|
@@ -29,7 +29,7 @@ export interface IRequestList {
|
|
|
29
29
|
*/
|
|
30
30
|
handledCount(): number;
|
|
31
31
|
/**
|
|
32
|
-
* Persists the current state of the `IRequestList` into the default {@
|
|
32
|
+
* Persists the current state of the `IRequestList` into the default {@link KeyValueStore}.
|
|
33
33
|
* The state is persisted automatically in regular intervals, but calling this method manually
|
|
34
34
|
* is useful in cases where you want to have the most current state available after you pause
|
|
35
35
|
* or stop fetching its requests. For example after you pause or abort a crawl. Or just before
|
|
@@ -37,8 +37,8 @@ export interface IRequestList {
|
|
|
37
37
|
*/
|
|
38
38
|
persistState(): Promise<void>;
|
|
39
39
|
/**
|
|
40
|
-
* Gets the next {@
|
|
41
|
-
* using the {@
|
|
40
|
+
* Gets the next {@link Request} to process. First, the function gets a request previously reclaimed
|
|
41
|
+
* using the {@link RequestList.reclaimRequest} function, if there is any.
|
|
42
42
|
* Otherwise it gets the next request from sources.
|
|
43
43
|
*
|
|
44
44
|
* The function's `Promise` resolves to `null` if there are no more
|
|
@@ -46,8 +46,8 @@ export interface IRequestList {
|
|
|
46
46
|
*/
|
|
47
47
|
fetchNextRequest(): Promise<Request | null>;
|
|
48
48
|
/**
|
|
49
|
-
* Gets the next {@
|
|
50
|
-
* using the {@
|
|
49
|
+
* Gets the next {@link Request} to process. First, the function gets a request previously reclaimed
|
|
50
|
+
* using the {@link RequestList.reclaimRequest} function, if there is any.
|
|
51
51
|
* Otherwise it gets the next request from sources.
|
|
52
52
|
*
|
|
53
53
|
* The function resolves to `null` if there are no more requests to process.
|
|
@@ -72,8 +72,8 @@ export interface IRequestList {
|
|
|
72
72
|
}
|
|
73
73
|
export interface RequestListOptions {
|
|
74
74
|
/**
|
|
75
|
-
* An array of sources of URLs for the {@
|
|
76
|
-
* plain objects that define at least the `url` property, or an array of {@
|
|
75
|
+
* An array of sources of URLs for the {@link RequestList}. It can be either an array of strings,
|
|
76
|
+
* plain objects that define at least the `url` property, or an array of {@link Request} instances.
|
|
77
77
|
*
|
|
78
78
|
* **IMPORTANT:** The `sources` array will be consumed (left empty) after `RequestList` initializes.
|
|
79
79
|
* This is a measure to prevent memory leaks in situations when millions of sources are
|
|
@@ -107,15 +107,15 @@ export interface RequestListOptions {
|
|
|
107
107
|
sources?: RequestListSource[];
|
|
108
108
|
/**
|
|
109
109
|
* A function that will be called to get the sources for the `RequestList`, but only if `RequestList`
|
|
110
|
-
* was not able to fetch their persisted version (see {@
|
|
111
|
-
* It must return an `Array` of {@
|
|
110
|
+
* was not able to fetch their persisted version (see {@link RequestListOptions.persistRequestsKey}).
|
|
111
|
+
* It must return an `Array` of {@link Request} or {@link RequestOptions}.
|
|
112
112
|
*
|
|
113
113
|
* This is very useful in a scenario when getting the sources is a resource intensive or time consuming
|
|
114
114
|
* task, such as fetching URLs from multiple sitemaps or parsing URLs from large datasets. Using the
|
|
115
115
|
* `sourcesFunction` in combination with `persistStateKey` and `persistRequestsKey` will allow you to
|
|
116
116
|
* fetch and parse those URLs only once, saving valuable time when your crawler migrates or restarts.
|
|
117
117
|
*
|
|
118
|
-
* If both {@
|
|
118
|
+
* If both {@link RequestListOptions.sources} and {@link RequestListOptions.sourcesFunction} are provided,
|
|
119
119
|
* the sources returned by the function will be added after the `sources`.
|
|
120
120
|
*
|
|
121
121
|
* **Example:**
|
|
@@ -162,12 +162,12 @@ export interface RequestListOptions {
|
|
|
162
162
|
persistStateKey?: string;
|
|
163
163
|
/**
|
|
164
164
|
* Identifies the key in the default key-value store under which the `RequestList` persists its
|
|
165
|
-
* Requests during the {@
|
|
165
|
+
* Requests during the {@link RequestList.initialize} call.
|
|
166
166
|
* This is necessary if `persistStateKey` is set and the source URLs might potentially change,
|
|
167
167
|
* to ensure consistency of the source URLs and state object. However, it comes with some
|
|
168
168
|
* storage and performance overheads.
|
|
169
169
|
*
|
|
170
|
-
* If `persistRequestsKey` is not set, {@
|
|
170
|
+
* If `persistRequestsKey` is not set, {@link RequestList.initialize} will always fetch the sources
|
|
171
171
|
* from their origin, check that they are consistent with the restored state (if any)
|
|
172
172
|
* and throw an error if they are not.
|
|
173
173
|
*/
|
|
@@ -193,7 +193,7 @@ export interface RequestListOptions {
|
|
|
193
193
|
state?: RequestListState;
|
|
194
194
|
/**
|
|
195
195
|
* By default, `RequestList` will deduplicate the provided URLs. Default deduplication is based
|
|
196
|
-
* on the `uniqueKey` property of passed source {@
|
|
196
|
+
* on the `uniqueKey` property of passed source {@link Request} objects.
|
|
197
197
|
*
|
|
198
198
|
* If the property is not present, it is generated by normalizing the URL. If present, it is kept intact.
|
|
199
199
|
* In any case, only one request per `uniqueKey` is added to the `RequestList` resulting in removal
|
|
@@ -213,19 +213,19 @@ export interface RequestListOptions {
|
|
|
213
213
|
/**
|
|
214
214
|
* Represents a static list of URLs to crawl.
|
|
215
215
|
* The URLs can be provided either in code or parsed from a text file hosted on the web.
|
|
216
|
-
* `RequestList` is used by {@
|
|
217
|
-
* and {@
|
|
216
|
+
* `RequestList` is used by {@link BasicCrawler}, {@link CheerioCrawler}, {@link PuppeteerCrawler}
|
|
217
|
+
* and {@link PlaywrightCrawler} as a source of URLs to crawl.
|
|
218
218
|
*
|
|
219
|
-
* Each URL is represented using an instance of the {@
|
|
219
|
+
* Each URL is represented using an instance of the {@link Request} class.
|
|
220
220
|
* The list can only contain unique URLs. More precisely, it can only contain `Request` instances
|
|
221
221
|
* with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden.
|
|
222
|
-
* To add a single URL to the list multiple times, corresponding {@
|
|
222
|
+
* To add a single URL to the list multiple times, corresponding {@link Request} objects will need to have different
|
|
223
223
|
* `uniqueKey` properties. You can use the `keepDuplicateUrls` option to do this for you when initializing the
|
|
224
224
|
* `RequestList` from sources.
|
|
225
225
|
*
|
|
226
|
-
* `RequestList` doesn't have a public constructor, you need to create it with the asynchronous {@
|
|
226
|
+
* `RequestList` doesn't have a public constructor, you need to create it with the asynchronous {@link RequestList.open} function. After
|
|
227
227
|
* the request list is created, no more URLs can be added to it.
|
|
228
|
-
* Unlike {@
|
|
228
|
+
* Unlike {@link RequestQueue}, `RequestList` is static but it can contain even millions of URLs.
|
|
229
229
|
* > Note that `RequestList` can be used together with `RequestQueue` by the same crawler.
|
|
230
230
|
* > In such cases, each request from `RequestList` is enqueued into `RequestQueue` first and then consumed from the latter.
|
|
231
231
|
* > This is necessary to avoid the same URL being processed more than once (from the list first and then possibly from the queue).
|
|
@@ -234,9 +234,9 @@ export interface RequestListOptions {
|
|
|
234
234
|
*
|
|
235
235
|
* `RequestList` has an internal state where it stores information about which requests were already handled,
|
|
236
236
|
* which are in progress and which were reclaimed. The state may be automatically persisted to the default
|
|
237
|
-
* {@
|
|
237
|
+
* {@link KeyValueStore} by setting the `persistStateKey` option so that if the Node.js process is restarted,
|
|
238
238
|
* the crawling can continue where it left off. The automated persisting is launched upon receiving the `persistState`
|
|
239
|
-
* event that is periodically emitted by {@
|
|
239
|
+
* event that is periodically emitted by {@link EventManager}.
|
|
240
240
|
*
|
|
241
241
|
* The internal state is closely tied to the provided sources (URLs). If the sources change on crawler restart, the state will become corrupted and
|
|
242
242
|
* `RequestList` will raise an exception. This typically happens when the sources is a list of URLs downloaded from the web.
|
|
@@ -425,14 +425,14 @@ export declare class RequestList implements IRequestList {
|
|
|
425
425
|
handledCount(): number;
|
|
426
426
|
/**
|
|
427
427
|
* Opens a request list and returns a promise resolving to an instance
|
|
428
|
-
* of the {@
|
|
428
|
+
* of the {@link RequestList} class that is already initialized.
|
|
429
429
|
*
|
|
430
|
-
* {@
|
|
430
|
+
* {@link RequestList} represents a list of URLs to crawl, which is always stored in memory.
|
|
431
431
|
* To enable picking up where left off after a process restart, the request list sources
|
|
432
432
|
* are persisted to the key-value store at initialization of the list. Then, while crawling,
|
|
433
433
|
* a small state object is regularly persisted to keep track of the crawling status.
|
|
434
434
|
*
|
|
435
|
-
* For more details and code examples, see the {@
|
|
435
|
+
* For more details and code examples, see the {@link RequestList} class.
|
|
436
436
|
*
|
|
437
437
|
* **Example usage:**
|
|
438
438
|
*
|
|
@@ -458,23 +458,23 @@ export declare class RequestList implements IRequestList {
|
|
|
458
458
|
* If `null`, the list will not be persisted and will only be stored in memory. Process restart
|
|
459
459
|
* will then cause the list to be crawled again from the beginning. We suggest always using a name.
|
|
460
460
|
* @param [sources]
|
|
461
|
-
* An array of sources of URLs for the {@
|
|
462
|
-
* plain objects that define at least the `url` property, or an array of {@
|
|
461
|
+
* An array of sources of URLs for the {@link RequestList}. It can be either an array of strings,
|
|
462
|
+
* plain objects that define at least the `url` property, or an array of {@link Request} instances.
|
|
463
463
|
*
|
|
464
|
-
* **IMPORTANT:** The `sources` array will be consumed (left empty) after {@
|
|
464
|
+
* **IMPORTANT:** The `sources` array will be consumed (left empty) after {@link RequestList} initializes.
|
|
465
465
|
* This is a measure to prevent memory leaks in situations when millions of sources are
|
|
466
466
|
* added.
|
|
467
467
|
*
|
|
468
468
|
* Additionally, the `requestsFromUrl` property may be used instead of `url`,
|
|
469
|
-
* which will instruct {@
|
|
469
|
+
* which will instruct {@link RequestList} to download the source URLs from a given remote location.
|
|
470
470
|
* The URLs will be parsed from the received response. In this case you can limit the URLs
|
|
471
471
|
* using `regex` parameter containing regular expression pattern for URLs to be included.
|
|
472
472
|
*
|
|
473
|
-
* For details, see the {@
|
|
473
|
+
* For details, see the {@link RequestListOptions.sources}
|
|
474
474
|
* @param [options]
|
|
475
|
-
* The {@
|
|
476
|
-
* the {@
|
|
477
|
-
* options and the `sources` parameter supersedes the {@
|
|
475
|
+
* The {@link RequestList} options. Note that the `listName` parameter supersedes
|
|
476
|
+
* the {@link RequestListOptions.persistStateKey} and {@link RequestListOptions.persistRequestsKey}
|
|
477
|
+
* options and the `sources` parameter supersedes the {@link RequestListOptions.sources} option.
|
|
478
478
|
*/
|
|
479
479
|
static open(listNameOrOptions: string | null | RequestListOptions, sources?: RequestListSource[], options?: RequestListOptions): Promise<RequestList>;
|
|
480
480
|
/**
|
|
@@ -483,8 +483,8 @@ export declare class RequestList implements IRequestList {
|
|
|
483
483
|
private _downloadListOfUrls;
|
|
484
484
|
}
|
|
485
485
|
/**
|
|
486
|
-
* Represents state of a {@
|
|
487
|
-
* You can obtain the state by calling {@
|
|
486
|
+
* Represents state of a {@link RequestList}. It can be used to resume a {@link RequestList} which has been previously processed.
|
|
487
|
+
* You can obtain the state by calling {@link RequestList.getState} and receive an object with
|
|
488
488
|
* the following structure:
|
|
489
489
|
*
|
|
490
490
|
* ```
|
package/storages/request_list.js
CHANGED
|
@@ -18,19 +18,19 @@ const CONTENT_TYPE_BINARY = 'application/octet-stream';
|
|
|
18
18
|
/**
|
|
19
19
|
* Represents a static list of URLs to crawl.
|
|
20
20
|
* The URLs can be provided either in code or parsed from a text file hosted on the web.
|
|
21
|
-
* `RequestList` is used by {@
|
|
22
|
-
* and {@
|
|
21
|
+
* `RequestList` is used by {@link BasicCrawler}, {@link CheerioCrawler}, {@link PuppeteerCrawler}
|
|
22
|
+
* and {@link PlaywrightCrawler} as a source of URLs to crawl.
|
|
23
23
|
*
|
|
24
|
-
* Each URL is represented using an instance of the {@
|
|
24
|
+
* Each URL is represented using an instance of the {@link Request} class.
|
|
25
25
|
* The list can only contain unique URLs. More precisely, it can only contain `Request` instances
|
|
26
26
|
* with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden.
|
|
27
|
-
* To add a single URL to the list multiple times, corresponding {@
|
|
27
|
+
* To add a single URL to the list multiple times, corresponding {@link Request} objects will need to have different
|
|
28
28
|
* `uniqueKey` properties. You can use the `keepDuplicateUrls` option to do this for you when initializing the
|
|
29
29
|
* `RequestList` from sources.
|
|
30
30
|
*
|
|
31
|
-
* `RequestList` doesn't have a public constructor, you need to create it with the asynchronous {@
|
|
31
|
+
* `RequestList` doesn't have a public constructor, you need to create it with the asynchronous {@link RequestList.open} function. After
|
|
32
32
|
* the request list is created, no more URLs can be added to it.
|
|
33
|
-
* Unlike {@
|
|
33
|
+
* Unlike {@link RequestQueue}, `RequestList` is static but it can contain even millions of URLs.
|
|
34
34
|
* > Note that `RequestList` can be used together with `RequestQueue` by the same crawler.
|
|
35
35
|
* > In such cases, each request from `RequestList` is enqueued into `RequestQueue` first and then consumed from the latter.
|
|
36
36
|
* > This is necessary to avoid the same URL being processed more than once (from the list first and then possibly from the queue).
|
|
@@ -39,9 +39,9 @@ const CONTENT_TYPE_BINARY = 'application/octet-stream';
|
|
|
39
39
|
*
|
|
40
40
|
* `RequestList` has an internal state where it stores information about which requests were already handled,
|
|
41
41
|
* which are in progress and which were reclaimed. The state may be automatically persisted to the default
|
|
42
|
-
* {@
|
|
42
|
+
* {@link KeyValueStore} by setting the `persistStateKey` option so that if the Node.js process is restarted,
|
|
43
43
|
* the crawling can continue where it left off. The automated persisting is launched upon receiving the `persistState`
|
|
44
|
-
* event that is periodically emitted by {@
|
|
44
|
+
* event that is periodically emitted by {@link EventManager}.
|
|
45
45
|
*
|
|
46
46
|
* The internal state is closely tied to the provided sources (URLs). If the sources change on crawler restart, the state will become corrupted and
|
|
47
47
|
* `RequestList` will raise an exception. This typically happens when the sources is a list of URLs downloaded from the web.
|
|
@@ -657,14 +657,14 @@ class RequestList {
|
|
|
657
657
|
}
|
|
658
658
|
/**
|
|
659
659
|
* Opens a request list and returns a promise resolving to an instance
|
|
660
|
-
* of the {@
|
|
660
|
+
* of the {@link RequestList} class that is already initialized.
|
|
661
661
|
*
|
|
662
|
-
* {@
|
|
662
|
+
* {@link RequestList} represents a list of URLs to crawl, which is always stored in memory.
|
|
663
663
|
* To enable picking up where left off after a process restart, the request list sources
|
|
664
664
|
* are persisted to the key-value store at initialization of the list. Then, while crawling,
|
|
665
665
|
* a small state object is regularly persisted to keep track of the crawling status.
|
|
666
666
|
*
|
|
667
|
-
* For more details and code examples, see the {@
|
|
667
|
+
* For more details and code examples, see the {@link RequestList} class.
|
|
668
668
|
*
|
|
669
669
|
* **Example usage:**
|
|
670
670
|
*
|
|
@@ -690,23 +690,23 @@ class RequestList {
|
|
|
690
690
|
* If `null`, the list will not be persisted and will only be stored in memory. Process restart
|
|
691
691
|
* will then cause the list to be crawled again from the beginning. We suggest always using a name.
|
|
692
692
|
* @param [sources]
|
|
693
|
-
* An array of sources of URLs for the {@
|
|
694
|
-
* plain objects that define at least the `url` property, or an array of {@
|
|
693
|
+
* An array of sources of URLs for the {@link RequestList}. It can be either an array of strings,
|
|
694
|
+
* plain objects that define at least the `url` property, or an array of {@link Request} instances.
|
|
695
695
|
*
|
|
696
|
-
* **IMPORTANT:** The `sources` array will be consumed (left empty) after {@
|
|
696
|
+
* **IMPORTANT:** The `sources` array will be consumed (left empty) after {@link RequestList} initializes.
|
|
697
697
|
* This is a measure to prevent memory leaks in situations when millions of sources are
|
|
698
698
|
* added.
|
|
699
699
|
*
|
|
700
700
|
* Additionally, the `requestsFromUrl` property may be used instead of `url`,
|
|
701
|
-
* which will instruct {@
|
|
701
|
+
* which will instruct {@link RequestList} to download the source URLs from a given remote location.
|
|
702
702
|
* The URLs will be parsed from the received response. In this case you can limit the URLs
|
|
703
703
|
* using `regex` parameter containing regular expression pattern for URLs to be included.
|
|
704
704
|
*
|
|
705
|
-
* For details, see the {@
|
|
705
|
+
* For details, see the {@link RequestListOptions.sources}
|
|
706
706
|
* @param [options]
|
|
707
|
-
* The {@
|
|
708
|
-
* the {@
|
|
709
|
-
* options and the `sources` parameter supersedes the {@
|
|
707
|
+
* The {@link RequestList} options. Note that the `listName` parameter supersedes
|
|
708
|
+
* the {@link RequestListOptions.persistStateKey} and {@link RequestListOptions.persistRequestsKey}
|
|
709
|
+
* options and the `sources` parameter supersedes the {@link RequestListOptions.sources} option.
|
|
710
710
|
*/
|
|
711
711
|
static async open(listNameOrOptions, sources, options = {}) {
|
|
712
712
|
if (listNameOrOptions != null && typeof listNameOrOptions === 'object') {
|
|
@@ -39,12 +39,12 @@ export declare abstract class RequestProvider implements IStorage {
|
|
|
39
39
|
*
|
|
40
40
|
* If a request with the same `uniqueKey` property is already present in the queue,
|
|
41
41
|
* it will not be updated. You can find out whether this happened from the resulting
|
|
42
|
-
* {@
|
|
42
|
+
* {@link QueueOperationInfo} object.
|
|
43
43
|
*
|
|
44
44
|
* To add multiple requests to the queue by extracting links from a webpage,
|
|
45
|
-
* see the {@
|
|
45
|
+
* see the {@link enqueueLinks} helper function.
|
|
46
46
|
*
|
|
47
|
-
* @param requestLike {@
|
|
47
|
+
* @param requestLike {@link Request} object or vanilla object with request data.
|
|
48
48
|
* Note that the function sets the `uniqueKey` and `id` fields to the passed Request.
|
|
49
49
|
* @param [options] Request queue operation options.
|
|
50
50
|
*/
|
|
@@ -57,9 +57,9 @@ export declare abstract class RequestProvider implements IStorage {
|
|
|
57
57
|
*
|
|
58
58
|
* If a request passed in is already present due to its `uniqueKey` property being the same,
|
|
59
59
|
* it will not be updated. You can find out whether this happened by finding the request in the resulting
|
|
60
|
-
* {@
|
|
60
|
+
* {@link BatchAddRequestsResult} object.
|
|
61
61
|
*
|
|
62
|
-
* @param requestsLike {@
|
|
62
|
+
* @param requestsLike {@link Request} objects or vanilla objects with request data.
|
|
63
63
|
* Note that the function sets the `uniqueKey` and `id` fields to the passed requests if missing.
|
|
64
64
|
* @param [options] Request queue operation options.
|
|
65
65
|
*/
|
|
@@ -85,15 +85,15 @@ export declare abstract class RequestProvider implements IStorage {
|
|
|
85
85
|
* Returns a next request in the queue to be processed, or `null` if there are no more pending requests.
|
|
86
86
|
*
|
|
87
87
|
* Once you successfully finish processing of the request, you need to call
|
|
88
|
-
* {@
|
|
88
|
+
* {@link RequestQueue.markRequestHandled}
|
|
89
89
|
* to mark the request as handled in the queue. If there was some error in processing the request,
|
|
90
|
-
* call {@
|
|
90
|
+
* call {@link RequestQueue.reclaimRequest} instead,
|
|
91
91
|
* so that the queue will give the request to some other consumer in another call to the `fetchNextRequest` function.
|
|
92
92
|
*
|
|
93
93
|
* Note that the `null` return value doesn't mean the queue processing finished,
|
|
94
94
|
* it means there are currently no pending requests.
|
|
95
95
|
* To check whether all requests in queue were finished,
|
|
96
|
-
* use {@
|
|
96
|
+
* use {@link RequestQueue.isFinished} instead.
|
|
97
97
|
*
|
|
98
98
|
* @returns
|
|
99
99
|
* Returns the request object or `null` if there are no more pending requests.
|
|
@@ -101,24 +101,24 @@ export declare abstract class RequestProvider implements IStorage {
|
|
|
101
101
|
abstract fetchNextRequest<T extends Dictionary = Dictionary>(options?: RequestOptions): Promise<Request<T> | null>;
|
|
102
102
|
/**
|
|
103
103
|
* Marks a request that was previously returned by the
|
|
104
|
-
* {@
|
|
104
|
+
* {@link RequestQueue.fetchNextRequest}
|
|
105
105
|
* function as handled after successful processing.
|
|
106
106
|
* Handled requests will never again be returned by the `fetchNextRequest` function.
|
|
107
107
|
*/
|
|
108
108
|
markRequestHandled(request: Request): Promise<RequestQueueOperationInfo | null>;
|
|
109
109
|
/**
|
|
110
110
|
* Reclaims a failed request back to the queue, so that it can be returned for processing later again
|
|
111
|
-
* by another call to {@
|
|
111
|
+
* by another call to {@link RequestQueue.fetchNextRequest}.
|
|
112
112
|
* The request record in the queue is updated using the provided `request` parameter.
|
|
113
113
|
* For example, this lets you store the number of retries or error messages for the request.
|
|
114
114
|
*/
|
|
115
115
|
reclaimRequest(request: Request, options?: RequestQueueOperationOptions): Promise<RequestQueueOperationInfo | null>;
|
|
116
116
|
protected abstract ensureHeadIsNonEmpty(): Promise<void>;
|
|
117
117
|
/**
|
|
118
|
-
* Resolves to `true` if the next call to {@
|
|
118
|
+
* Resolves to `true` if the next call to {@link RequestQueue.fetchNextRequest}
|
|
119
119
|
* would return `null`, otherwise it resolves to `false`.
|
|
120
120
|
* Note that even if the queue is empty, there might be some pending requests currently being processed.
|
|
121
|
-
* If you need to ensure that there is no activity in the queue, use {@
|
|
121
|
+
* If you need to ensure that there is no activity in the queue, use {@link RequestQueue.isFinished}.
|
|
122
122
|
*/
|
|
123
123
|
isEmpty(): Promise<boolean>;
|
|
124
124
|
/**
|
|
@@ -191,14 +191,14 @@ export declare abstract class RequestProvider implements IStorage {
|
|
|
191
191
|
private _downloadListOfUrls;
|
|
192
192
|
/**
|
|
193
193
|
* Opens a request queue and returns a promise resolving to an instance
|
|
194
|
-
* of the {@
|
|
194
|
+
* of the {@link RequestQueue} class.
|
|
195
195
|
*
|
|
196
|
-
* {@
|
|
196
|
+
* {@link RequestQueue} represents a queue of URLs to crawl, which is stored either on local filesystem or in the cloud.
|
|
197
197
|
* The queue is used for deep crawling of websites, where you start with several URLs and then
|
|
198
198
|
* recursively follow links to other pages. The data structure supports both breadth-first
|
|
199
199
|
* and depth-first crawling orders.
|
|
200
200
|
*
|
|
201
|
-
* For more details and code examples, see the {@
|
|
201
|
+
* For more details and code examples, see the {@link RequestQueue} class.
|
|
202
202
|
*
|
|
203
203
|
* @param [queueIdOrName]
|
|
204
204
|
* ID or name of the request queue to be opened. If `null` or `undefined`,
|
|
@@ -227,7 +227,7 @@ export interface RequestProviderOptions {
|
|
|
227
227
|
proxyConfiguration?: ProxyConfiguration;
|
|
228
228
|
}
|
|
229
229
|
/**
|
|
230
|
-
* @deprecated Use {@
|
|
230
|
+
* @deprecated Use {@link RequestProviderOptions} instead.
|
|
231
231
|
*/
|
|
232
232
|
export interface RequestQueueOptions extends RequestProviderOptions {
|
|
233
233
|
}
|
|
@@ -244,7 +244,7 @@ export interface RequestQueueOperationOptions {
|
|
|
244
244
|
* If set to `true`:
|
|
245
245
|
* - while adding the request to the queue: the request will be added to the foremost position in the queue.
|
|
246
246
|
* - while reclaiming the request: the request will be placed to the beginning of the queue, so that it's returned
|
|
247
|
-
* in the next call to {@
|
|
247
|
+
* in the next call to {@link RequestQueue.fetchNextRequest}.
|
|
248
248
|
* By default, it's put to the end of the queue.
|
|
249
249
|
*
|
|
250
250
|
* In case the request is already present in the queue, this option has no effect.
|
|
@@ -288,8 +288,8 @@ export interface AddRequestsBatchedResult {
|
|
|
288
288
|
/**
|
|
289
289
|
* A promise which will resolve with the rest of the requests that were added to the queue.
|
|
290
290
|
*
|
|
291
|
-
* Alternatively, we can set {@
|
|
292
|
-
* in the {@
|
|
291
|
+
* Alternatively, we can set {@link AddRequestsBatchedOptions.waitForAllRequestsToBeAdded|`waitForAllRequestsToBeAdded`} to `true`
|
|
292
|
+
* in the {@link BasicCrawler.addRequests|`crawler.addRequests()`} options.
|
|
293
293
|
*
|
|
294
294
|
* **Example:**
|
|
295
295
|
*
|
|
@@ -165,12 +165,12 @@ class RequestProvider {
|
|
|
165
165
|
*
|
|
166
166
|
* If a request with the same `uniqueKey` property is already present in the queue,
|
|
167
167
|
* it will not be updated. You can find out whether this happened from the resulting
|
|
168
|
-
* {@
|
|
168
|
+
* {@link QueueOperationInfo} object.
|
|
169
169
|
*
|
|
170
170
|
* To add multiple requests to the queue by extracting links from a webpage,
|
|
171
|
-
* see the {@
|
|
171
|
+
* see the {@link enqueueLinks} helper function.
|
|
172
172
|
*
|
|
173
|
-
* @param requestLike {@
|
|
173
|
+
* @param requestLike {@link Request} object or vanilla object with request data.
|
|
174
174
|
* Note that the function sets the `uniqueKey` and `id` fields to the passed Request.
|
|
175
175
|
* @param [options] Request queue operation options.
|
|
176
176
|
*/
|
|
@@ -228,9 +228,9 @@ class RequestProvider {
|
|
|
228
228
|
*
|
|
229
229
|
* If a request passed in is already present due to its `uniqueKey` property being the same,
|
|
230
230
|
* it will not be updated. You can find out whether this happened by finding the request in the resulting
|
|
231
|
-
* {@
|
|
231
|
+
* {@link BatchAddRequestsResult} object.
|
|
232
232
|
*
|
|
233
|
-
* @param requestsLike {@
|
|
233
|
+
* @param requestsLike {@link Request} objects or vanilla objects with request data.
|
|
234
234
|
* Note that the function sets the `uniqueKey` and `id` fields to the passed requests if missing.
|
|
235
235
|
* @param [options] Request queue operation options.
|
|
236
236
|
*/
|
|
@@ -418,7 +418,7 @@ class RequestProvider {
|
|
|
418
418
|
}
|
|
419
419
|
/**
|
|
420
420
|
* Marks a request that was previously returned by the
|
|
421
|
-
* {@
|
|
421
|
+
* {@link RequestQueue.fetchNextRequest}
|
|
422
422
|
* function as handled after successful processing.
|
|
423
423
|
* Handled requests will never again be returned by the `fetchNextRequest` function.
|
|
424
424
|
*/
|
|
@@ -451,7 +451,7 @@ class RequestProvider {
|
|
|
451
451
|
}
|
|
452
452
|
/**
|
|
453
453
|
* Reclaims a failed request back to the queue, so that it can be returned for processing later again
|
|
454
|
-
* by another call to {@
|
|
454
|
+
* by another call to {@link RequestQueue.fetchNextRequest}.
|
|
455
455
|
* The request record in the queue is updated using the provided `request` parameter.
|
|
456
456
|
* For example, this lets you store the number of retries or error messages for the request.
|
|
457
457
|
*/
|
|
@@ -479,10 +479,10 @@ class RequestProvider {
|
|
|
479
479
|
return queueOperationInfo;
|
|
480
480
|
}
|
|
481
481
|
/**
|
|
482
|
-
* Resolves to `true` if the next call to {@
|
|
482
|
+
* Resolves to `true` if the next call to {@link RequestQueue.fetchNextRequest}
|
|
483
483
|
* would return `null`, otherwise it resolves to `false`.
|
|
484
484
|
* Note that even if the queue is empty, there might be some pending requests currently being processed.
|
|
485
|
-
* If you need to ensure that there is no activity in the queue, use {@
|
|
485
|
+
* If you need to ensure that there is no activity in the queue, use {@link RequestQueue.isFinished}.
|
|
486
486
|
*/
|
|
487
487
|
async isEmpty() {
|
|
488
488
|
await this.ensureHeadIsNonEmpty();
|
|
@@ -622,14 +622,14 @@ class RequestProvider {
|
|
|
622
622
|
}
|
|
623
623
|
/**
|
|
624
624
|
* Opens a request queue and returns a promise resolving to an instance
|
|
625
|
-
* of the {@
|
|
625
|
+
* of the {@link RequestQueue} class.
|
|
626
626
|
*
|
|
627
|
-
* {@
|
|
627
|
+
* {@link RequestQueue} represents a queue of URLs to crawl, which is stored either on local filesystem or in the cloud.
|
|
628
628
|
* The queue is used for deep crawling of websites, where you start with several URLs and then
|
|
629
629
|
* recursively follow links to other pages. The data structure supports both breadth-first
|
|
630
630
|
* and depth-first crawling orders.
|
|
631
631
|
*
|
|
632
|
-
* For more details and code examples, see the {@
|
|
632
|
+
* For more details and code examples, see the {@link RequestQueue} class.
|
|
633
633
|
*
|
|
634
634
|
* @param [queueIdOrName]
|
|
635
635
|
* ID or name of the request queue to be opened. If `null` or `undefined`,
|
|
@@ -8,17 +8,17 @@ import { RequestProvider } from './request_provider';
|
|
|
8
8
|
* where you start with several URLs and then recursively
|
|
9
9
|
* follow links to other pages. The data structure supports both breadth-first and depth-first crawling orders.
|
|
10
10
|
*
|
|
11
|
-
* Each URL is represented using an instance of the {@
|
|
12
|
-
* The queue can only contain unique URLs. More precisely, it can only contain {@
|
|
11
|
+
* Each URL is represented using an instance of the {@link Request} class.
|
|
12
|
+
* The queue can only contain unique URLs. More precisely, it can only contain {@link Request} instances
|
|
13
13
|
* with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden.
|
|
14
14
|
* To add a single URL multiple times to the queue,
|
|
15
|
-
* corresponding {@
|
|
15
|
+
* corresponding {@link Request} objects will need to have different `uniqueKey` properties.
|
|
16
16
|
*
|
|
17
|
-
* Do not instantiate this class directly, use the {@
|
|
17
|
+
* Do not instantiate this class directly, use the {@link RequestQueue.open} function instead.
|
|
18
18
|
*
|
|
19
|
-
* `RequestQueue` is used by {@
|
|
20
|
-
* and {@
|
|
21
|
-
* Unlike {@
|
|
19
|
+
* `RequestQueue` is used by {@link BasicCrawler}, {@link CheerioCrawler}, {@link PuppeteerCrawler}
|
|
20
|
+
* and {@link PlaywrightCrawler} as a source of URLs to crawl.
|
|
21
|
+
* Unlike {@link RequestList}, `RequestQueue` supports dynamic adding and removing of requests.
|
|
22
22
|
* On the other hand, the queue is not optimized for operations that add or remove a large number of URLs in a batch.
|
|
23
23
|
*
|
|
24
24
|
* `RequestQueue` stores its data either on local disk or in the Apify Cloud,
|
|
@@ -30,7 +30,7 @@ import { RequestProvider } from './request_provider';
|
|
|
30
30
|
* If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the
|
|
31
31
|
* [Apify Request Queue](https://docs.apify.com/storage/request-queue)
|
|
32
32
|
* cloud storage. Note that you can force usage of the cloud storage also by passing the `forceCloud`
|
|
33
|
-
* option to {@
|
|
33
|
+
* option to {@link RequestQueue.open} function,
|
|
34
34
|
* even if the `APIFY_LOCAL_STORAGE_DIR` variable is set.
|
|
35
35
|
*
|
|
36
36
|
* **Example usage:**
|
|
@@ -49,7 +49,7 @@ import { RequestProvider } from './request_provider';
|
|
|
49
49
|
* ```
|
|
50
50
|
* @category Sources
|
|
51
51
|
*
|
|
52
|
-
* @deprecated RequestQueue v1 is deprecated and will be removed in the future. Please use {@
|
|
52
|
+
* @deprecated RequestQueue v1 is deprecated and will be removed in the future. Please use {@link RequestQueue} instead.
|
|
53
53
|
*/
|
|
54
54
|
declare class RequestQueue extends RequestProvider {
|
|
55
55
|
private queryQueueHeadPromise?;
|
|
@@ -66,15 +66,15 @@ declare class RequestQueue extends RequestProvider {
|
|
|
66
66
|
* Returns a next request in the queue to be processed, or `null` if there are no more pending requests.
|
|
67
67
|
*
|
|
68
68
|
* Once you successfully finish processing of the request, you need to call
|
|
69
|
-
* {@
|
|
69
|
+
* {@link RequestQueue.markRequestHandled}
|
|
70
70
|
* to mark the request as handled in the queue. If there was some error in processing the request,
|
|
71
|
-
* call {@
|
|
71
|
+
* call {@link RequestQueue.reclaimRequest} instead,
|
|
72
72
|
* so that the queue will give the request to some other consumer in another call to the `fetchNextRequest` function.
|
|
73
73
|
*
|
|
74
74
|
* Note that the `null` return value doesn't mean the queue processing finished,
|
|
75
75
|
* it means there are currently no pending requests.
|
|
76
76
|
* To check whether all requests in queue were finished,
|
|
77
|
-
* use {@
|
|
77
|
+
* use {@link RequestQueue.isFinished} instead.
|
|
78
78
|
*
|
|
79
79
|
* @returns
|
|
80
80
|
* Returns the request object or `null` if there are no more pending requests.
|
|
@@ -96,7 +96,7 @@ declare class RequestQueue extends RequestProvider {
|
|
|
96
96
|
isFinished(): Promise<boolean>;
|
|
97
97
|
/**
|
|
98
98
|
* Reclaims a failed request back to the queue, so that it can be returned for processing later again
|
|
99
|
-
* by another call to {@
|
|
99
|
+
* by another call to {@link RequestQueue.fetchNextRequest}.
|
|
100
100
|
* The request record in the queue is updated using the provided `request` parameter.
|
|
101
101
|
* For example, this lets you store the number of retries or error messages for the request.
|
|
102
102
|
*/
|
|
@@ -108,14 +108,14 @@ declare class RequestQueue extends RequestProvider {
|
|
|
108
108
|
protected _reset(): void;
|
|
109
109
|
/**
|
|
110
110
|
* Opens a request queue and returns a promise resolving to an instance
|
|
111
|
-
* of the {@
|
|
111
|
+
* of the {@link RequestQueue} class.
|
|
112
112
|
*
|
|
113
|
-
* {@
|
|
113
|
+
* {@link RequestQueue} represents a queue of URLs to crawl, which is stored either on local filesystem or in the cloud.
|
|
114
114
|
* The queue is used for deep crawling of websites, where you start with several URLs and then
|
|
115
115
|
* recursively follow links to other pages. The data structure supports both breadth-first
|
|
116
116
|
* and depth-first crawling orders.
|
|
117
117
|
*
|
|
118
|
-
* For more details and code examples, see the {@
|
|
118
|
+
* For more details and code examples, see the {@link RequestQueue} class.
|
|
119
119
|
*
|
|
120
120
|
* @param [queueIdOrName]
|
|
121
121
|
* ID or name of the request queue to be opened. If `null` or `undefined`,
|