@crawlee/core 3.13.3-beta.11 → 3.13.3-beta.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/autoscaling/autoscaled_pool.d.ts +16 -16
- package/autoscaling/autoscaled_pool.js +13 -13
- package/autoscaling/snapshotter.d.ts +1 -1
- package/autoscaling/snapshotter.js +1 -1
- package/autoscaling/system_status.d.ts +12 -12
- package/autoscaling/system_status.js +11 -11
- package/configuration.d.ts +10 -10
- package/configuration.js +4 -4
- package/crawlers/crawler_commons.d.ts +12 -12
- package/crawlers/crawler_commons.js +4 -4
- package/crawlers/statistics.d.ts +2 -2
- package/crawlers/statistics.js +1 -1
- package/enqueue_links/enqueue_links.d.ts +14 -14
- package/enqueue_links/enqueue_links.js +5 -5
- package/enqueue_links/shared.d.ts +2 -2
- package/http_clients/base-http-client.d.ts +7 -7
- package/http_clients/base-http-client.js +1 -1
- package/package.json +5 -5
- package/proxy_configuration.d.ts +11 -11
- package/proxy_configuration.js +8 -8
- package/request.d.ts +3 -3
- package/request.js +2 -2
- package/session_pool/session.d.ts +1 -1
- package/session_pool/session_pool.d.ts +12 -12
- package/session_pool/session_pool.js +10 -10
- package/storages/dataset.d.ts +15 -15
- package/storages/dataset.js +9 -9
- package/storages/key_value_store.d.ts +32 -32
- package/storages/key_value_store.js +22 -22
- package/storages/request_list.d.ts +35 -35
- package/storages/request_list.js +19 -19
- package/storages/request_provider.d.ts +19 -19
- package/storages/request_provider.js +12 -12
- package/storages/request_queue.d.ts +16 -16
- package/storages/request_queue.js +16 -16
- package/storages/request_queue_v2.d.ts +7 -7
- package/storages/request_queue_v2.js +7 -7
- package/storages/utils.d.ts +2 -2
|
package/storages/request_queue.js
CHANGED
|
@@ -19,17 +19,17 @@ const RECENTLY_HANDLED_CACHE_SIZE = 1000;
|
|
|
19
19
|
* where you start with several URLs and then recursively
|
|
20
20
|
* follow links to other pages. The data structure supports both breadth-first and depth-first crawling orders.
|
|
21
21
|
*
|
|
22
|
-
* Each URL is represented using an instance of the {@
|
|
23
|
-
* The queue can only contain unique URLs. More precisely, it can only contain {@
|
|
22
|
+
* Each URL is represented using an instance of the {@link Request} class.
|
|
23
|
+
* The queue can only contain unique URLs. More precisely, it can only contain {@link Request} instances
|
|
24
24
|
* with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden.
|
|
25
25
|
* To add a single URL multiple times to the queue,
|
|
26
|
-
* corresponding {@
|
|
26
|
+
* corresponding {@link Request} objects will need to have different `uniqueKey` properties.
|
|
27
27
|
*
|
|
28
|
-
* Do not instantiate this class directly, use the {@
|
|
28
|
+
* Do not instantiate this class directly, use the {@link RequestQueue.open} function instead.
|
|
29
29
|
*
|
|
30
|
-
* `RequestQueue` is used by {@
|
|
31
|
-
* and {@
|
|
32
|
-
* Unlike {@
|
|
30
|
+
* `RequestQueue` is used by {@link BasicCrawler}, {@link CheerioCrawler}, {@link PuppeteerCrawler}
|
|
31
|
+
* and {@link PlaywrightCrawler} as a source of URLs to crawl.
|
|
32
|
+
* Unlike {@link RequestList}, `RequestQueue` supports dynamic adding and removing of requests.
|
|
33
33
|
* On the other hand, the queue is not optimized for operations that add or remove a large number of URLs in a batch.
|
|
34
34
|
*
|
|
35
35
|
* `RequestQueue` stores its data either on local disk or in the Apify Cloud,
|
|
@@ -41,7 +41,7 @@ const RECENTLY_HANDLED_CACHE_SIZE = 1000;
|
|
|
41
41
|
* If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the
|
|
42
42
|
* [Apify Request Queue](https://docs.apify.com/storage/request-queue)
|
|
43
43
|
* cloud storage. Note that you can force usage of the cloud storage also by passing the `forceCloud`
|
|
44
|
-
* option to {@
|
|
44
|
+
* option to {@link RequestQueue.open} function,
|
|
45
45
|
* even if the `APIFY_LOCAL_STORAGE_DIR` variable is set.
|
|
46
46
|
*
|
|
47
47
|
* **Example usage:**
|
|
@@ -60,7 +60,7 @@ const RECENTLY_HANDLED_CACHE_SIZE = 1000;
|
|
|
60
60
|
* ```
|
|
61
61
|
* @category Sources
|
|
62
62
|
*
|
|
63
|
-
* @deprecated RequestQueue v1 is deprecated and will be removed in the future. Please use {@
|
|
63
|
+
* @deprecated RequestQueue v1 is deprecated and will be removed in the future. Please use {@link RequestQueue} instead.
|
|
64
64
|
*/
|
|
65
65
|
class RequestQueue extends request_provider_1.RequestProvider {
|
|
66
66
|
/**
|
|
@@ -96,15 +96,15 @@ class RequestQueue extends request_provider_1.RequestProvider {
|
|
|
96
96
|
* Returns a next request in the queue to be processed, or `null` if there are no more pending requests.
|
|
97
97
|
*
|
|
98
98
|
* Once you successfully finish processing of the request, you need to call
|
|
99
|
-
* {@
|
|
99
|
+
* {@link RequestQueue.markRequestHandled}
|
|
100
100
|
* to mark the request as handled in the queue. If there was some error in processing the request,
|
|
101
|
-
* call {@
|
|
101
|
+
* call {@link RequestQueue.reclaimRequest} instead,
|
|
102
102
|
* so that the queue will give the request to some other consumer in another call to the `fetchNextRequest` function.
|
|
103
103
|
*
|
|
104
104
|
* Note that the `null` return value doesn't mean the queue processing finished,
|
|
105
105
|
* it means there are currently no pending requests.
|
|
106
106
|
* To check whether all requests in queue were finished,
|
|
107
|
-
* use {@
|
|
107
|
+
* use {@link RequestQueue.isFinished} instead.
|
|
108
108
|
*
|
|
109
109
|
* @returns
|
|
110
110
|
* Returns the request object or `null` if there are no more pending requests.
|
|
@@ -273,7 +273,7 @@ class RequestQueue extends request_provider_1.RequestProvider {
|
|
|
273
273
|
}
|
|
274
274
|
/**
|
|
275
275
|
* Reclaims a failed request back to the queue, so that it can be returned for processing later again
|
|
276
|
-
* by another call to {@
|
|
276
|
+
* by another call to {@link RequestQueue.fetchNextRequest}.
|
|
277
277
|
* The request record in the queue is updated using the provided `request` parameter.
|
|
278
278
|
* For example, this lets you store the number of retries or error messages for the request.
|
|
279
279
|
*/
|
|
@@ -311,14 +311,14 @@ class RequestQueue extends request_provider_1.RequestProvider {
|
|
|
311
311
|
}
|
|
312
312
|
/**
|
|
313
313
|
* Opens a request queue and returns a promise resolving to an instance
|
|
314
|
-
* of the {@
|
|
314
|
+
* of the {@link RequestQueue} class.
|
|
315
315
|
*
|
|
316
|
-
* {@
|
|
316
|
+
* {@link RequestQueue} represents a queue of URLs to crawl, which is stored either on local filesystem or in the cloud.
|
|
317
317
|
* The queue is used for deep crawling of websites, where you start with several URLs and then
|
|
318
318
|
* recursively follow links to other pages. The data structure supports both breadth-first
|
|
319
319
|
* and depth-first crawling orders.
|
|
320
320
|
*
|
|
321
|
-
* For more details and code examples, see the {@
|
|
321
|
+
* For more details and code examples, see the {@link RequestQueue} class.
|
|
322
322
|
*
|
|
323
323
|
* @param [queueIdOrName]
|
|
324
324
|
* ID or name of the request queue to be opened. If `null` or `undefined`,
|
|
package/storages/request_queue_v2.d.ts
CHANGED
|
@@ -8,17 +8,17 @@ import { RequestProvider } from './request_provider';
|
|
|
8
8
|
* where you start with several URLs and then recursively
|
|
9
9
|
* follow links to other pages. The data structure supports both breadth-first and depth-first crawling orders.
|
|
10
10
|
*
|
|
11
|
-
* Each URL is represented using an instance of the {@
|
|
12
|
-
* The queue can only contain unique URLs. More precisely, it can only contain {@
|
|
11
|
+
* Each URL is represented using an instance of the {@link Request} class.
|
|
12
|
+
* The queue can only contain unique URLs. More precisely, it can only contain {@link Request} instances
|
|
13
13
|
* with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden.
|
|
14
14
|
* To add a single URL multiple times to the queue,
|
|
15
|
-
* corresponding {@
|
|
15
|
+
* corresponding {@link Request} objects will need to have different `uniqueKey` properties.
|
|
16
16
|
*
|
|
17
|
-
* Do not instantiate this class directly, use the {@
|
|
17
|
+
* Do not instantiate this class directly, use the {@link RequestQueue.open} function instead.
|
|
18
18
|
*
|
|
19
|
-
* `RequestQueue` is used by {@
|
|
20
|
-
* and {@
|
|
21
|
-
* Unlike {@
|
|
19
|
+
* `RequestQueue` is used by {@link BasicCrawler}, {@link CheerioCrawler}, {@link PuppeteerCrawler}
|
|
20
|
+
* and {@link PlaywrightCrawler} as a source of URLs to crawl.
|
|
21
|
+
* Unlike {@link RequestList}, `RequestQueue` supports dynamic adding and removing of requests.
|
|
22
22
|
* On the other hand, the queue is not optimized for operations that add or remove a large number of URLs in a batch.
|
|
23
23
|
*
|
|
24
24
|
* **Example usage:**
|
|
package/storages/request_queue_v2.js
CHANGED
|
@@ -18,17 +18,17 @@ const RECENTLY_HANDLED_CACHE_SIZE = 1000;
|
|
|
18
18
|
* where you start with several URLs and then recursively
|
|
19
19
|
* follow links to other pages. The data structure supports both breadth-first and depth-first crawling orders.
|
|
20
20
|
*
|
|
21
|
-
* Each URL is represented using an instance of the {@
|
|
22
|
-
* The queue can only contain unique URLs. More precisely, it can only contain {@
|
|
21
|
+
* Each URL is represented using an instance of the {@link Request} class.
|
|
22
|
+
* The queue can only contain unique URLs. More precisely, it can only contain {@link Request} instances
|
|
23
23
|
* with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden.
|
|
24
24
|
* To add a single URL multiple times to the queue,
|
|
25
|
-
* corresponding {@
|
|
25
|
+
* corresponding {@link Request} objects will need to have different `uniqueKey` properties.
|
|
26
26
|
*
|
|
27
|
-
* Do not instantiate this class directly, use the {@
|
|
27
|
+
* Do not instantiate this class directly, use the {@link RequestQueue.open} function instead.
|
|
28
28
|
*
|
|
29
|
-
* `RequestQueue` is used by {@
|
|
30
|
-
* and {@
|
|
31
|
-
* Unlike {@
|
|
29
|
+
* `RequestQueue` is used by {@link BasicCrawler}, {@link CheerioCrawler}, {@link PuppeteerCrawler}
|
|
30
|
+
* and {@link PlaywrightCrawler} as a source of URLs to crawl.
|
|
31
|
+
* Unlike {@link RequestList}, `RequestQueue` supports dynamic adding and removing of requests.
|
|
32
32
|
* On the other hand, the queue is not optimized for operations that add or remove a large number of URLs in a batch.
|
|
33
33
|
*
|
|
34
34
|
* **Example usage:**
|
package/storages/utils.d.ts
CHANGED
|
@@ -16,7 +16,7 @@ interface PurgeDefaultStorageOptions {
|
|
|
16
16
|
* Purging will remove all the files in all storages except for INPUT.json in the default KV store.
|
|
17
17
|
*
|
|
18
18
|
* Purging of storages is happening automatically when we run our crawler (or when we open some storage
|
|
19
|
-
* explicitly, e.g. via `RequestList.open()`). We can disable that via `purgeOnStart` {@
|
|
19
|
+
* explicitly, e.g. via `RequestList.open()`). We can disable that via `purgeOnStart` {@link Configuration}
|
|
20
20
|
* option or by setting `CRAWLEE_PURGE_ON_START` environment variable to `0` or `false`.
|
|
21
21
|
*
|
|
22
22
|
* This is a shortcut for running (optional) `purge` method on the StorageClient interface, in other words
|
|
@@ -30,7 +30,7 @@ export declare function purgeDefaultStorages(options?: PurgeDefaultStorageOption
|
|
|
30
30
|
* Purging will remove all the files in all storages except for INPUT.json in the default KV store.
|
|
31
31
|
*
|
|
32
32
|
* Purging of storages is happening automatically when we run our crawler (or when we open some storage
|
|
33
|
-
* explicitly, e.g. via `RequestList.open()`). We can disable that via `purgeOnStart` {@
|
|
33
|
+
* explicitly, e.g. via `RequestList.open()`). We can disable that via `purgeOnStart` {@link Configuration}
|
|
34
34
|
* option or by setting `CRAWLEE_PURGE_ON_START` environment variable to `0` or `false`.
|
|
35
35
|
*
|
|
36
36
|
* This is a shortcut for running (optional) `purge` method on the StorageClient interface, in other words
|