@crawlee/core 4.0.0-beta.6 → 4.0.0-beta.61
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -5
- package/autoscaling/autoscaled_pool.d.ts +3 -5
- package/autoscaling/autoscaled_pool.d.ts.map +1 -1
- package/autoscaling/autoscaled_pool.js +3 -9
- package/autoscaling/autoscaled_pool.js.map +1 -1
- package/autoscaling/snapshotter.d.ts +3 -13
- package/autoscaling/snapshotter.d.ts.map +1 -1
- package/autoscaling/snapshotter.js +18 -29
- package/autoscaling/snapshotter.js.map +1 -1
- package/autoscaling/system_status.d.ts +0 -3
- package/autoscaling/system_status.d.ts.map +1 -1
- package/autoscaling/system_status.js +2 -3
- package/autoscaling/system_status.js.map +1 -1
- package/configuration.d.ts +85 -227
- package/configuration.d.ts.map +1 -1
- package/configuration.js +159 -223
- package/configuration.js.map +1 -1
- package/cookie_utils.d.ts +4 -2
- package/cookie_utils.d.ts.map +1 -1
- package/cookie_utils.js +18 -12
- package/cookie_utils.js.map +1 -1
- package/crawlers/context_pipeline.d.ts +71 -0
- package/crawlers/context_pipeline.d.ts.map +1 -0
- package/crawlers/context_pipeline.js +123 -0
- package/crawlers/context_pipeline.js.map +1 -0
- package/crawlers/crawler_commons.d.ts +19 -28
- package/crawlers/crawler_commons.d.ts.map +1 -1
- package/crawlers/crawler_commons.js +12 -20
- package/crawlers/crawler_commons.js.map +1 -1
- package/crawlers/crawler_utils.d.ts +2 -2
- package/crawlers/crawler_utils.d.ts.map +1 -1
- package/crawlers/crawler_utils.js +1 -1
- package/crawlers/crawler_utils.js.map +1 -1
- package/crawlers/error_snapshotter.d.ts +3 -2
- package/crawlers/error_snapshotter.d.ts.map +1 -1
- package/crawlers/error_snapshotter.js +2 -2
- package/crawlers/error_snapshotter.js.map +1 -1
- package/crawlers/error_tracker.d.ts +2 -1
- package/crawlers/error_tracker.d.ts.map +1 -1
- package/crawlers/error_tracker.js.map +1 -1
- package/crawlers/index.d.ts +1 -1
- package/crawlers/index.d.ts.map +1 -1
- package/crawlers/index.js +1 -1
- package/crawlers/index.js.map +1 -1
- package/crawlers/internals/types.d.ts +8 -0
- package/crawlers/internals/types.d.ts.map +1 -0
- package/crawlers/internals/types.js +2 -0
- package/crawlers/internals/types.js.map +1 -0
- package/crawlers/statistics.d.ts +15 -15
- package/crawlers/statistics.d.ts.map +1 -1
- package/crawlers/statistics.js +21 -24
- package/crawlers/statistics.js.map +1 -1
- package/enqueue_links/enqueue_links.d.ts +32 -18
- package/enqueue_links/enqueue_links.d.ts.map +1 -1
- package/enqueue_links/enqueue_links.js +45 -24
- package/enqueue_links/enqueue_links.js.map +1 -1
- package/enqueue_links/shared.d.ts +25 -8
- package/enqueue_links/shared.d.ts.map +1 -1
- package/enqueue_links/shared.js +69 -37
- package/enqueue_links/shared.js.map +1 -1
- package/errors.d.ts +33 -3
- package/errors.d.ts.map +1 -1
- package/errors.js +48 -4
- package/errors.js.map +1 -1
- package/events/event_manager.d.ts +8 -5
- package/events/event_manager.d.ts.map +1 -1
- package/events/event_manager.js +7 -9
- package/events/event_manager.js.map +1 -1
- package/events/local_event_manager.d.ts +14 -4
- package/events/local_event_manager.d.ts.map +1 -1
- package/events/local_event_manager.js +33 -39
- package/events/local_event_manager.js.map +1 -1
- package/index.d.ts +3 -2
- package/index.d.ts.map +1 -1
- package/index.js +2 -1
- package/index.js.map +1 -1
- package/log.d.ts +82 -2
- package/log.d.ts.map +1 -1
- package/log.js +102 -0
- package/log.js.map +1 -1
- package/package.json +9 -10
- package/proxy_configuration.d.ts +14 -148
- package/proxy_configuration.d.ts.map +1 -1
- package/proxy_configuration.js +19 -167
- package/proxy_configuration.js.map +1 -1
- package/recoverable_state.d.ts +121 -0
- package/recoverable_state.d.ts.map +1 -0
- package/recoverable_state.js +142 -0
- package/recoverable_state.js.map +1 -0
- package/request.d.ts +74 -10
- package/request.d.ts.map +1 -1
- package/request.js +85 -23
- package/request.js.map +1 -1
- package/router.d.ts.map +1 -1
- package/router.js.map +1 -1
- package/serialization.js +1 -1
- package/serialization.js.map +1 -1
- package/service_locator.d.ts +157 -0
- package/service_locator.d.ts.map +1 -0
- package/service_locator.js +234 -0
- package/service_locator.js.map +1 -0
- package/session_pool/index.d.ts +0 -1
- package/session_pool/index.d.ts.map +1 -1
- package/session_pool/index.js +0 -1
- package/session_pool/index.js.map +1 -1
- package/session_pool/session.d.ts +26 -72
- package/session_pool/session.d.ts.map +1 -1
- package/session_pool/session.js +36 -98
- package/session_pool/session.js.map +1 -1
- package/session_pool/session_pool.d.ts +65 -71
- package/session_pool/session_pool.d.ts.map +1 -1
- package/session_pool/session_pool.js +101 -100
- package/session_pool/session_pool.js.map +1 -1
- package/storages/dataset.d.ts +90 -46
- package/storages/dataset.d.ts.map +1 -1
- package/storages/dataset.js +149 -121
- package/storages/dataset.js.map +1 -1
- package/storages/index.d.ts +3 -1
- package/storages/index.d.ts.map +1 -1
- package/storages/index.js +3 -1
- package/storages/index.js.map +1 -1
- package/storages/key_value_store.d.ts +104 -22
- package/storages/key_value_store.d.ts.map +1 -1
- package/storages/key_value_store.js +166 -51
- package/storages/key_value_store.js.map +1 -1
- package/storages/request_list.d.ts +9 -9
- package/storages/request_list.d.ts.map +1 -1
- package/storages/request_list.js +13 -8
- package/storages/request_list.js.map +1 -1
- package/storages/request_list_adapter.d.ts +58 -0
- package/storages/request_list_adapter.d.ts.map +1 -0
- package/storages/request_list_adapter.js +81 -0
- package/storages/request_list_adapter.js.map +1 -0
- package/storages/request_manager_tandem.d.ts +68 -0
- package/storages/request_manager_tandem.d.ts.map +1 -0
- package/storages/request_manager_tandem.js +124 -0
- package/storages/request_manager_tandem.js.map +1 -0
- package/storages/request_provider.d.ts +87 -22
- package/storages/request_provider.d.ts.map +1 -1
- package/storages/request_provider.js +127 -77
- package/storages/request_provider.js.map +1 -1
- package/storages/request_queue.d.ts +1 -3
- package/storages/request_queue.d.ts.map +1 -1
- package/storages/request_queue.js +2 -4
- package/storages/request_queue.js.map +1 -1
- package/storages/request_queue_v2.d.ts +3 -3
- package/storages/request_queue_v2.d.ts.map +1 -1
- package/storages/request_queue_v2.js +4 -5
- package/storages/request_queue_v2.js.map +1 -1
- package/storages/sitemap_request_list.d.ts +5 -5
- package/storages/sitemap_request_list.d.ts.map +1 -1
- package/storages/sitemap_request_list.js +10 -7
- package/storages/sitemap_request_list.js.map +1 -1
- package/storages/storage_instance_manager.d.ts +91 -0
- package/storages/storage_instance_manager.d.ts.map +1 -0
- package/storages/storage_instance_manager.js +236 -0
- package/storages/storage_instance_manager.js.map +1 -0
- package/storages/utils.d.ts +47 -1
- package/storages/utils.d.ts.map +1 -1
- package/storages/utils.js +57 -5
- package/storages/utils.js.map +1 -1
- package/typedefs.d.ts +1 -1
- package/typedefs.d.ts.map +1 -1
- package/validators.d.ts +4 -0
- package/validators.d.ts.map +1 -1
- package/validators.js +4 -0
- package/validators.js.map +1 -1
- package/crawlers/crawler_extension.d.ts +0 -12
- package/crawlers/crawler_extension.d.ts.map +0 -1
- package/crawlers/crawler_extension.js +0 -14
- package/crawlers/crawler_extension.js.map +0 -1
- package/http_clients/base-http-client.d.ts +0 -134
- package/http_clients/base-http-client.d.ts.map +0 -1
- package/http_clients/base-http-client.js +0 -33
- package/http_clients/base-http-client.js.map +0 -1
- package/http_clients/form-data-like.d.ts +0 -67
- package/http_clients/form-data-like.d.ts.map +0 -1
- package/http_clients/form-data-like.js +0 -5
- package/http_clients/form-data-like.js.map +0 -1
- package/http_clients/got-scraping-http-client.d.ts +0 -15
- package/http_clients/got-scraping-http-client.d.ts.map +0 -1
- package/http_clients/got-scraping-http-client.js +0 -69
- package/http_clients/got-scraping-http-client.js.map +0 -1
- package/http_clients/index.d.ts +0 -3
- package/http_clients/index.d.ts.map +0 -1
- package/http_clients/index.js +0 -3
- package/http_clients/index.js.map +0 -1
- package/session_pool/events.d.ts +0 -3
- package/session_pool/events.d.ts.map +0 -1
- package/session_pool/events.js +0 -3
- package/session_pool/events.js.map +0 -1
- package/storages/storage_manager.d.ts +0 -58
- package/storages/storage_manager.d.ts.map +0 -1
- package/storages/storage_manager.js +0 -105
- package/storages/storage_manager.js.map +0 -1
- package/tsconfig.build.tsbuildinfo +0 -1
package/proxy_configuration.d.ts
CHANGED
|
@@ -1,129 +1,28 @@
|
|
|
1
|
+
import type { ProxyInfo } from '@crawlee/types';
|
|
1
2
|
import type { Request } from './request.js';
|
|
2
3
|
export interface ProxyConfigurationFunction {
|
|
3
|
-
(
|
|
4
|
+
(options?: {
|
|
4
5
|
request?: Request;
|
|
5
6
|
}): string | null | Promise<string | null>;
|
|
6
7
|
}
|
|
8
|
+
type UrlList = (string | null)[];
|
|
7
9
|
export interface ProxyConfigurationOptions {
|
|
8
10
|
/**
|
|
9
11
|
* An array of custom proxy URLs to be rotated.
|
|
10
12
|
* Custom proxies are not compatible with Apify Proxy and an attempt to use both
|
|
11
13
|
* configuration options will cause an error to be thrown on initialize.
|
|
12
14
|
*/
|
|
13
|
-
proxyUrls?:
|
|
15
|
+
proxyUrls?: UrlList;
|
|
14
16
|
/**
|
|
15
|
-
* Custom function that allows you to generate the new proxy URL dynamically. It gets
|
|
17
|
+
* Custom function that allows you to generate the new proxy URL dynamically. It gets an optional parameter with the `Request` object when applicable.
|
|
16
18
|
* Can return either stringified proxy URL or `null` if the proxy should not be used. Can be asynchronous.
|
|
17
19
|
*
|
|
18
20
|
* This function is used to generate the URL when {@link ProxyConfiguration.newUrl} or {@link ProxyConfiguration.newProxyInfo} is called.
|
|
19
21
|
*/
|
|
20
22
|
newUrlFunction?: ProxyConfigurationFunction;
|
|
21
|
-
/**
|
|
22
|
-
* An array of custom proxy URLs to be rotated stratified in tiers.
|
|
23
|
-
* This is a more advanced version of `proxyUrls` that allows you to define a hierarchy of proxy URLs
|
|
24
|
-
* If everything goes well, all the requests will be sent through the first proxy URL in the list.
|
|
25
|
-
* Whenever the crawler encounters a problem with the current proxy on the given domain, it will switch to the higher tier for this domain.
|
|
26
|
-
* The crawler probes lower-level proxies at intervals to check if it can make the tier downshift.
|
|
27
|
-
*
|
|
28
|
-
* This feature is useful when you have a set of proxies with different performance characteristics (speed, price, antibot performance etc.) and you want to use the best one for each domain.
|
|
29
|
-
*
|
|
30
|
-
* Use `null` as a proxy URL to disable the proxy for the given tier.
|
|
31
|
-
*/
|
|
32
|
-
tieredProxyUrls?: (string | null)[][];
|
|
33
23
|
}
|
|
34
|
-
|
|
35
|
-
proxyUrl: string | null;
|
|
36
|
-
proxyTier?: number;
|
|
37
|
-
}
|
|
38
|
-
/**
|
|
39
|
-
* The main purpose of the ProxyInfo object is to provide information
|
|
40
|
-
* about the current proxy connection used by the crawler for the request.
|
|
41
|
-
* Outside of crawlers, you can get this object by calling {@link ProxyConfiguration.newProxyInfo}.
|
|
42
|
-
*
|
|
43
|
-
* **Example usage:**
|
|
44
|
-
*
|
|
45
|
-
* ```javascript
|
|
46
|
-
* const proxyConfiguration = new ProxyConfiguration({
|
|
47
|
-
* proxyUrls: ['...', '...'] // List of Proxy URLs to rotate
|
|
48
|
-
* });
|
|
49
|
-
*
|
|
50
|
-
* // Getting proxyInfo object by calling class method directly
|
|
51
|
-
* const proxyInfo = await proxyConfiguration.newProxyInfo();
|
|
52
|
-
*
|
|
53
|
-
* // In crawler
|
|
54
|
-
* const crawler = new CheerioCrawler({
|
|
55
|
-
* // ...
|
|
56
|
-
* proxyConfiguration,
|
|
57
|
-
* requestHandler({ proxyInfo }) {
|
|
58
|
-
* // Getting used proxy URL
|
|
59
|
-
* const proxyUrl = proxyInfo.url;
|
|
60
|
-
*
|
|
61
|
-
* // Getting ID of used Session
|
|
62
|
-
* const sessionIdentifier = proxyInfo.sessionId;
|
|
63
|
-
* }
|
|
64
|
-
* })
|
|
65
|
-
*
|
|
66
|
-
* ```
|
|
67
|
-
*/
|
|
68
|
-
export interface ProxyInfo {
|
|
69
|
-
/**
|
|
70
|
-
* The identifier of used {@link Session}, if used.
|
|
71
|
-
*/
|
|
72
|
-
sessionId?: string;
|
|
73
|
-
/**
|
|
74
|
-
* The URL of the proxy.
|
|
75
|
-
*/
|
|
76
|
-
url: string;
|
|
77
|
-
/**
|
|
78
|
-
* Username for the proxy.
|
|
79
|
-
*/
|
|
80
|
-
username?: string;
|
|
81
|
-
/**
|
|
82
|
-
* User's password for the proxy.
|
|
83
|
-
*/
|
|
84
|
-
password: string;
|
|
85
|
-
/**
|
|
86
|
-
* Hostname of your proxy.
|
|
87
|
-
*/
|
|
88
|
-
hostname: string;
|
|
89
|
-
/**
|
|
90
|
-
* Proxy port.
|
|
91
|
-
*/
|
|
92
|
-
port: number | string;
|
|
93
|
-
/**
|
|
94
|
-
* Proxy tier for the current proxy, if applicable (only for `tieredProxyUrls`).
|
|
95
|
-
*/
|
|
96
|
-
proxyTier?: number;
|
|
97
|
-
}
|
|
98
|
-
interface TieredProxyOptions {
|
|
24
|
+
interface NewUrlOptions {
|
|
99
25
|
request?: Request;
|
|
100
|
-
proxyTier?: number;
|
|
101
|
-
}
|
|
102
|
-
/**
|
|
103
|
-
* Internal class for tracking the proxy tier history for a specific domain.
|
|
104
|
-
*
|
|
105
|
-
* Predicts the best proxy tier for the next request based on the error history for different proxy tiers.
|
|
106
|
-
*/
|
|
107
|
-
declare class ProxyTierTracker {
|
|
108
|
-
private histogram;
|
|
109
|
-
private currentTier;
|
|
110
|
-
constructor(tieredProxyUrls: (string | null)[][]);
|
|
111
|
-
/**
|
|
112
|
-
* Processes a single step of the algorithm and updates the current tier prediction based on the error history.
|
|
113
|
-
*/
|
|
114
|
-
private processStep;
|
|
115
|
-
/**
|
|
116
|
-
* Increases the error score for the given proxy tier. This raises the chance of picking a different proxy tier for the subsequent requests.
|
|
117
|
-
*
|
|
118
|
-
* The error score is increased by 10 for the given tier. This means that this tier will be disadvantaged for the next 10 requests (every new request prediction decreases the error score by 1).
|
|
119
|
-
* @param tier The proxy tier to mark as problematic.
|
|
120
|
-
*/
|
|
121
|
-
addError(tier: number): void;
|
|
122
|
-
/**
|
|
123
|
-
* Returns the best proxy tier for the next request based on the error history for different proxy tiers.
|
|
124
|
-
* @returns The proxy tier prediction
|
|
125
|
-
*/
|
|
126
|
-
predictTier(): number;
|
|
127
26
|
}
|
|
128
27
|
/**
|
|
129
28
|
* Configures connection to a proxy server with the provided options. Proxy servers are used to prevent target websites from blocking
|
|
@@ -156,13 +55,11 @@ declare class ProxyTierTracker {
|
|
|
156
55
|
export declare class ProxyConfiguration {
|
|
157
56
|
isManInTheMiddle: boolean;
|
|
158
57
|
protected nextCustomUrlIndex: number;
|
|
159
|
-
protected proxyUrls?:
|
|
160
|
-
protected
|
|
161
|
-
protected usedProxyUrls: Map<string, string>;
|
|
58
|
+
protected proxyUrls?: UrlList;
|
|
59
|
+
protected usedProxyUrls: Map<string, string | null>;
|
|
162
60
|
protected newUrlFunction?: ProxyConfigurationFunction;
|
|
163
61
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
164
|
-
protected log: import("@
|
|
165
|
-
protected domainTiers: Map<string, ProxyTierTracker>;
|
|
62
|
+
protected log: import("@crawlee/types").CrawleeLogger;
|
|
166
63
|
/**
|
|
167
64
|
* Creates a {@link ProxyConfiguration} instance based on the provided options. Proxy servers are used to prevent target websites from
|
|
168
65
|
* blocking your crawlers based on IP address rate limits or blacklists. Setting proxy configuration in your crawlers automatically configures
|
|
@@ -190,53 +87,22 @@ export declare class ProxyConfiguration {
|
|
|
190
87
|
* the currently used proxy via the requestHandler parameter `proxyInfo`.
|
|
191
88
|
* Use it if you want to work with a rich representation of a proxy URL.
|
|
192
89
|
* If you need the URL string only, use {@link ProxyConfiguration.newUrl}.
|
|
193
|
-
* @param [sessionId]
|
|
194
|
-
* Represents the identifier of user {@link Session} that can be managed by the {@link SessionPool} or
|
|
195
|
-
* you can use the Apify Proxy [Session](https://docs.apify.com/proxy#sessions) identifier.
|
|
196
|
-
* When the provided sessionId is a number, it's converted to a string. Property sessionId of
|
|
197
|
-
* {@link ProxyInfo} is always returned as a type string.
|
|
198
90
|
*
|
|
199
|
-
* All the HTTP requests going through the proxy with the same session identifier
|
|
200
|
-
* will use the same target proxy server (i.e. the same IP address).
|
|
201
|
-
* The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`.
|
|
202
91
|
* @return Represents information about used proxy and its configuration.
|
|
203
92
|
*/
|
|
204
|
-
newProxyInfo(
|
|
93
|
+
newProxyInfo(options?: NewUrlOptions): Promise<ProxyInfo | undefined>;
|
|
205
94
|
/**
|
|
206
|
-
*
|
|
207
|
-
* @param _sessionId Session identifier
|
|
208
|
-
* @param options Options for the tiered proxy rotation
|
|
209
|
-
* @returns An object with the proxy URL and the proxy tier used.
|
|
210
|
-
*/
|
|
211
|
-
protected _handleTieredUrl(_sessionId: string, options?: TieredProxyOptions): TieredProxy;
|
|
212
|
-
/**
|
|
213
|
-
* Given a `Request` object, this function returns the tier of the proxy that should be used for the request.
|
|
95
|
+
* Returns a new proxy URL based on provided configuration options.
|
|
214
96
|
*
|
|
215
|
-
* This returns `null` if `tieredProxyUrls` option is not set.
|
|
216
|
-
*/
|
|
217
|
-
protected predictProxyTier(request: Request): number | null;
|
|
218
|
-
/**
|
|
219
|
-
* Returns a new proxy URL based on provided configuration options and the `sessionId` parameter.
|
|
220
|
-
* @param [sessionId]
|
|
221
|
-
* Represents the identifier of user {@link Session} that can be managed by the {@link SessionPool} or
|
|
222
|
-
* you can use the Apify Proxy [Session](https://docs.apify.com/proxy#sessions) identifier.
|
|
223
|
-
* When the provided sessionId is a number, it's converted to a string.
|
|
224
|
-
*
|
|
225
|
-
* All the HTTP requests going through the proxy with the same session identifier
|
|
226
|
-
* will use the same target proxy server (i.e. the same IP address).
|
|
227
|
-
* The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`.
|
|
228
97
|
* @return A string with a proxy URL, including authentication credentials and port number.
|
|
229
98
|
* For example, `http://bob:password123@proxy.example.com:8000`
|
|
230
99
|
*/
|
|
231
|
-
newUrl(
|
|
232
|
-
|
|
233
|
-
* Handles custom url rotation with session
|
|
234
|
-
*/
|
|
235
|
-
protected _handleCustomUrl(sessionId?: string): string;
|
|
100
|
+
newUrl(options?: NewUrlOptions): Promise<string | undefined>;
|
|
101
|
+
protected _handleProxyUrlsList(): string | null;
|
|
236
102
|
/**
|
|
237
103
|
* Calls the custom newUrlFunction and checks format of its return value
|
|
238
104
|
*/
|
|
239
|
-
protected _callNewUrlFunction(
|
|
105
|
+
protected _callNewUrlFunction(options?: {
|
|
240
106
|
request?: Request;
|
|
241
107
|
}): Promise<string | null>;
|
|
242
108
|
protected _throwCannotCombineCustomMethods(): never;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"proxy_configuration.d.ts","sourceRoot":"","sources":["../src/proxy_configuration.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"proxy_configuration.d.ts","sourceRoot":"","sources":["../src/proxy_configuration.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAc,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAG5D,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,cAAc,CAAC;AAG5C,MAAM,WAAW,0BAA0B;IACvC,CAAC,OAAO,CAAC,EAAE;QAAE,OAAO,CAAC,EAAE,OAAO,CAAA;KAAE,GAAG,MAAM,GAAG,IAAI,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC;CAC7E;AAED,KAAK,OAAO,GAAG,CAAC,MAAM,GAAG,IAAI,CAAC,EAAE,CAAC;AAEjC,MAAM,WAAW,yBAAyB;IACtC;;;;OAIG;IACH,SAAS,CAAC,EAAE,OAAO,CAAC;IAEpB;;;;;OAKG;IACH,cAAc,CAAC,EAAE,0BAA0B,CAAC;CAC/C;AAED,UAAU,aAAa;IACnB,OAAO,CAAC,EAAE,OAAO,CAAC;CACrB;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AACH,qBAAa,kBAAkB;IAC3B,gBAAgB,UAAS;IACzB,SAAS,CAAC,kBAAkB,SAAK;IACjC,SAAS,CAAC,SAAS,CAAC,EAAE,OAAO,CAAC;IAC9B,SAAS,CAAC,aAAa,6BAAoC;IAC3D,SAAS,CAAC,cAAc,CAAC,EAAE,0BAA0B,CAAC;IACtD,SAAS,CAAC,GAAG,yCAAsE;IAEnF;;;;;;;;;;;;;;;;;;;OAmBG;gBACS,OAAO,GAAE,yBAA8B;IA2BnD;;;;;;;;OAQG;IACG,YAAY,CAAC,OAAO,CAAC,EAAE,aAAa,GAAG,OAAO,CAAC,SAAS,GAAG,SAAS,CAAC;IAe3E;;;;;OAKG;IACG,MAAM,CAAC,OAAO,CAAC,EAAE,aAAa,GAAG,OAAO,CAAC,MAAM,GAAG,SAAS,CAAC;IAQlE,SAAS,CAAC,oBAAoB,IAAI,MAAM,GAAG,IAAI;IAI/C;;OAEG;cACa,mBAAmB,CAAC,OAAO,CAAC,EAAE;QAAE,OAAO,CAAC,EAAE,OAAO,CAAA;KAAE;IAcnE,SAAS,CAAC,gCAAgC,IAAI,KAAK;IAMnD,SAAS,CAAC,uBAAuB,IAAI,KAAK;CAG7C"}
|
package/proxy_configuration.js
CHANGED
|
@@ -1,55 +1,5 @@
|
|
|
1
1
|
import ow from 'ow';
|
|
2
|
-
import
|
|
3
|
-
import { cryptoRandomObjectId } from '@apify/utilities';
|
|
4
|
-
/**
|
|
5
|
-
* Internal class for tracking the proxy tier history for a specific domain.
|
|
6
|
-
*
|
|
7
|
-
* Predicts the best proxy tier for the next request based on the error history for different proxy tiers.
|
|
8
|
-
*/
|
|
9
|
-
class ProxyTierTracker {
|
|
10
|
-
histogram;
|
|
11
|
-
currentTier;
|
|
12
|
-
constructor(tieredProxyUrls) {
|
|
13
|
-
this.histogram = tieredProxyUrls.map(() => 0);
|
|
14
|
-
this.currentTier = 0;
|
|
15
|
-
}
|
|
16
|
-
/**
|
|
17
|
-
* Processes a single step of the algorithm and updates the current tier prediction based on the error history.
|
|
18
|
-
*/
|
|
19
|
-
processStep() {
|
|
20
|
-
this.histogram.forEach((x, i) => {
|
|
21
|
-
if (this.currentTier === i)
|
|
22
|
-
return;
|
|
23
|
-
if (x > 0)
|
|
24
|
-
this.histogram[i]--;
|
|
25
|
-
});
|
|
26
|
-
const left = this.currentTier > 0 ? this.histogram[this.currentTier - 1] : Infinity;
|
|
27
|
-
const right = this.currentTier < this.histogram.length - 1 ? this.histogram[this.currentTier + 1] : Infinity;
|
|
28
|
-
if (this.histogram[this.currentTier] > Math.min(left, right)) {
|
|
29
|
-
this.currentTier = left <= right ? this.currentTier - 1 : this.currentTier + 1;
|
|
30
|
-
}
|
|
31
|
-
else if (this.histogram[this.currentTier] === left) {
|
|
32
|
-
this.currentTier--;
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
/**
|
|
36
|
-
* Increases the error score for the given proxy tier. This raises the chance of picking a different proxy tier for the subsequent requests.
|
|
37
|
-
*
|
|
38
|
-
* The error score is increased by 10 for the given tier. This means that this tier will be disadvantaged for the next 10 requests (every new request prediction decreases the error score by 1).
|
|
39
|
-
* @param tier The proxy tier to mark as problematic.
|
|
40
|
-
*/
|
|
41
|
-
addError(tier) {
|
|
42
|
-
this.histogram[tier] += 10;
|
|
43
|
-
}
|
|
44
|
-
/**
|
|
45
|
-
* Returns the best proxy tier for the next request based on the error history for different proxy tiers.
|
|
46
|
-
* @returns The proxy tier prediction
|
|
47
|
-
*/
|
|
48
|
-
predictTier() {
|
|
49
|
-
this.processStep();
|
|
50
|
-
return this.currentTier;
|
|
51
|
-
}
|
|
52
|
-
}
|
|
2
|
+
import { serviceLocator } from './service_locator.js';
|
|
53
3
|
/**
|
|
54
4
|
* Configures connection to a proxy server with the provided options. Proxy servers are used to prevent target websites from blocking
|
|
55
5
|
* your crawlers based on IP address rate limits or blacklists. Setting proxy configuration in your crawlers automatically configures
|
|
@@ -82,11 +32,9 @@ export class ProxyConfiguration {
|
|
|
82
32
|
isManInTheMiddle = false;
|
|
83
33
|
nextCustomUrlIndex = 0;
|
|
84
34
|
proxyUrls;
|
|
85
|
-
tieredProxyUrls;
|
|
86
35
|
usedProxyUrls = new Map();
|
|
87
36
|
newUrlFunction;
|
|
88
|
-
log =
|
|
89
|
-
domainTiers = new Map();
|
|
37
|
+
log = serviceLocator.getLogger().child({ prefix: 'ProxyConfiguration' });
|
|
90
38
|
/**
|
|
91
39
|
* Creates a {@link ProxyConfiguration} instance based on the provided options. Proxy servers are used to prevent target websites from
|
|
92
40
|
* blocking your crawlers based on IP address rate limits or blacklists. Setting proxy configuration in your crawlers automatically configures
|
|
@@ -109,19 +57,21 @@ export class ProxyConfiguration {
|
|
|
109
57
|
*/
|
|
110
58
|
constructor(options = {}) {
|
|
111
59
|
const { validateRequired, ...rest } = options;
|
|
60
|
+
if ('tieredProxyUrls' in rest) {
|
|
61
|
+
throw new Error('The `tieredProxyUrls` option has been removed in Crawlee v4. ' +
|
|
62
|
+
'See the v4 upgrading guide for the recommended migration to named sessions.');
|
|
63
|
+
}
|
|
112
64
|
ow(rest, ow.object.exactShape({
|
|
113
|
-
proxyUrls: ow.optional.array.nonEmpty.ofType(ow.string.url),
|
|
65
|
+
proxyUrls: ow.optional.array.nonEmpty.ofType(ow.any(ow.string.url, ow.null)),
|
|
114
66
|
newUrlFunction: ow.optional.function,
|
|
115
|
-
tieredProxyUrls: ow.optional.array.nonEmpty.ofType(ow.array.nonEmpty.ofType(ow.any(ow.string.url, ow.null))),
|
|
116
67
|
}));
|
|
117
|
-
const { proxyUrls, newUrlFunction
|
|
118
|
-
if (
|
|
68
|
+
const { proxyUrls, newUrlFunction } = options;
|
|
69
|
+
if (proxyUrls && newUrlFunction)
|
|
119
70
|
this._throwCannotCombineCustomMethods();
|
|
120
71
|
if (!proxyUrls && !newUrlFunction && validateRequired)
|
|
121
72
|
this._throwNoOptionsProvided();
|
|
122
73
|
this.proxyUrls = proxyUrls;
|
|
123
74
|
this.newUrlFunction = newUrlFunction;
|
|
124
|
-
this.tieredProxyUrls = tieredProxyUrls;
|
|
125
75
|
}
|
|
126
76
|
/**
|
|
127
77
|
* This function creates a new {@link ProxyInfo} info object.
|
|
@@ -129,140 +79,42 @@ export class ProxyConfiguration {
|
|
|
129
79
|
* the currently used proxy via the requestHandler parameter `proxyInfo`.
|
|
130
80
|
* Use it if you want to work with a rich representation of a proxy URL.
|
|
131
81
|
* If you need the URL string only, use {@link ProxyConfiguration.newUrl}.
|
|
132
|
-
* @param [sessionId]
|
|
133
|
-
* Represents the identifier of user {@link Session} that can be managed by the {@link SessionPool} or
|
|
134
|
-
* you can use the Apify Proxy [Session](https://docs.apify.com/proxy#sessions) identifier.
|
|
135
|
-
* When the provided sessionId is a number, it's converted to a string. Property sessionId of
|
|
136
|
-
* {@link ProxyInfo} is always returned as a type string.
|
|
137
82
|
*
|
|
138
|
-
* All the HTTP requests going through the proxy with the same session identifier
|
|
139
|
-
* will use the same target proxy server (i.e. the same IP address).
|
|
140
|
-
* The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`.
|
|
141
83
|
* @return Represents information about used proxy and its configuration.
|
|
142
84
|
*/
|
|
143
|
-
async newProxyInfo(
|
|
144
|
-
|
|
145
|
-
sessionId = `${sessionId}`;
|
|
146
|
-
let url;
|
|
147
|
-
let tier;
|
|
148
|
-
if (this.tieredProxyUrls) {
|
|
149
|
-
const { proxyUrl, proxyTier } = this._handleTieredUrl(sessionId ?? cryptoRandomObjectId(6), options);
|
|
150
|
-
url = proxyUrl ?? undefined;
|
|
151
|
-
tier = proxyTier;
|
|
152
|
-
}
|
|
153
|
-
else {
|
|
154
|
-
url = await this.newUrl(sessionId, options);
|
|
155
|
-
}
|
|
85
|
+
async newProxyInfo(options) {
|
|
86
|
+
const url = await this.newUrl(options);
|
|
156
87
|
if (!url)
|
|
157
88
|
return undefined;
|
|
158
89
|
const { username, password, port, hostname } = new URL(url);
|
|
159
90
|
return {
|
|
160
|
-
sessionId,
|
|
161
91
|
url,
|
|
162
92
|
username: decodeURIComponent(username),
|
|
163
93
|
password: decodeURIComponent(password),
|
|
164
94
|
hostname,
|
|
165
95
|
port: port,
|
|
166
|
-
proxyTier: tier,
|
|
167
96
|
};
|
|
168
97
|
}
|
|
169
98
|
/**
|
|
170
|
-
*
|
|
171
|
-
* @param _sessionId Session identifier
|
|
172
|
-
* @param options Options for the tiered proxy rotation
|
|
173
|
-
* @returns An object with the proxy URL and the proxy tier used.
|
|
174
|
-
*/
|
|
175
|
-
_handleTieredUrl(_sessionId, options) {
|
|
176
|
-
if (!this.tieredProxyUrls)
|
|
177
|
-
throw new Error('Tiered proxy URLs are not set');
|
|
178
|
-
if (!options || (!options?.request && options?.proxyTier === undefined)) {
|
|
179
|
-
const allProxyUrls = this.tieredProxyUrls.flat();
|
|
180
|
-
return {
|
|
181
|
-
proxyUrl: allProxyUrls[this.nextCustomUrlIndex++ % allProxyUrls.length],
|
|
182
|
-
};
|
|
183
|
-
}
|
|
184
|
-
let tierPrediction = options.proxyTier;
|
|
185
|
-
if (typeof tierPrediction !== 'number') {
|
|
186
|
-
tierPrediction = this.predictProxyTier(options.request);
|
|
187
|
-
}
|
|
188
|
-
const proxyTier = this.tieredProxyUrls[tierPrediction];
|
|
189
|
-
return {
|
|
190
|
-
proxyUrl: proxyTier[this.nextCustomUrlIndex++ % proxyTier.length],
|
|
191
|
-
proxyTier: tierPrediction,
|
|
192
|
-
};
|
|
193
|
-
}
|
|
194
|
-
/**
|
|
195
|
-
* Given a `Request` object, this function returns the tier of the proxy that should be used for the request.
|
|
196
|
-
*
|
|
197
|
-
* This returns `null` if `tieredProxyUrls` option is not set.
|
|
198
|
-
*/
|
|
199
|
-
predictProxyTier(request) {
|
|
200
|
-
if (!this.tieredProxyUrls)
|
|
201
|
-
return null;
|
|
202
|
-
const domain = new URL(request.url).hostname;
|
|
203
|
-
if (!this.domainTiers.has(domain)) {
|
|
204
|
-
this.domainTiers.set(domain, new ProxyTierTracker(this.tieredProxyUrls));
|
|
205
|
-
}
|
|
206
|
-
request.userData.__crawlee ??= {};
|
|
207
|
-
const tracker = this.domainTiers.get(domain);
|
|
208
|
-
if (typeof request.userData.__crawlee.lastProxyTier === 'number') {
|
|
209
|
-
tracker.addError(request.userData.__crawlee.lastProxyTier);
|
|
210
|
-
}
|
|
211
|
-
const tierPrediction = tracker.predictTier();
|
|
212
|
-
if (typeof request.userData.__crawlee.lastProxyTier === 'number' &&
|
|
213
|
-
request.userData.__crawlee.lastProxyTier !== tierPrediction) {
|
|
214
|
-
log.debug(`Changing proxy tier for domain "${domain}" from ${request.userData.__crawlee.lastProxyTier} to ${tierPrediction}.`);
|
|
215
|
-
}
|
|
216
|
-
request.userData.__crawlee.lastProxyTier = tierPrediction;
|
|
217
|
-
request.userData.__crawlee.forefront = true;
|
|
218
|
-
return tierPrediction;
|
|
219
|
-
}
|
|
220
|
-
/**
|
|
221
|
-
* Returns a new proxy URL based on provided configuration options and the `sessionId` parameter.
|
|
222
|
-
* @param [sessionId]
|
|
223
|
-
* Represents the identifier of user {@link Session} that can be managed by the {@link SessionPool} or
|
|
224
|
-
* you can use the Apify Proxy [Session](https://docs.apify.com/proxy#sessions) identifier.
|
|
225
|
-
* When the provided sessionId is a number, it's converted to a string.
|
|
99
|
+
* Returns a new proxy URL based on provided configuration options.
|
|
226
100
|
*
|
|
227
|
-
* All the HTTP requests going through the proxy with the same session identifier
|
|
228
|
-
* will use the same target proxy server (i.e. the same IP address).
|
|
229
|
-
* The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`.
|
|
230
101
|
* @return A string with a proxy URL, including authentication credentials and port number.
|
|
231
102
|
* For example, `http://bob:password123@proxy.example.com:8000`
|
|
232
103
|
*/
|
|
233
|
-
async newUrl(
|
|
234
|
-
if (typeof sessionId === 'number')
|
|
235
|
-
sessionId = `${sessionId}`;
|
|
104
|
+
async newUrl(options) {
|
|
236
105
|
if (this.newUrlFunction) {
|
|
237
|
-
return (await this._callNewUrlFunction(
|
|
238
|
-
}
|
|
239
|
-
if (this.tieredProxyUrls) {
|
|
240
|
-
return this._handleTieredUrl(sessionId ?? cryptoRandomObjectId(6), options).proxyUrl ?? undefined;
|
|
106
|
+
return (await this._callNewUrlFunction({ request: options?.request })) ?? undefined;
|
|
241
107
|
}
|
|
242
|
-
return this.
|
|
108
|
+
return this._handleProxyUrlsList() ?? undefined;
|
|
243
109
|
}
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
*/
|
|
247
|
-
_handleCustomUrl(sessionId) {
|
|
248
|
-
let customUrlToUse;
|
|
249
|
-
if (!sessionId) {
|
|
250
|
-
return this.proxyUrls[this.nextCustomUrlIndex++ % this.proxyUrls.length];
|
|
251
|
-
}
|
|
252
|
-
if (this.usedProxyUrls.has(sessionId)) {
|
|
253
|
-
customUrlToUse = this.usedProxyUrls.get(sessionId);
|
|
254
|
-
}
|
|
255
|
-
else {
|
|
256
|
-
customUrlToUse = this.proxyUrls[this.nextCustomUrlIndex++ % this.proxyUrls.length];
|
|
257
|
-
this.usedProxyUrls.set(sessionId, customUrlToUse);
|
|
258
|
-
}
|
|
259
|
-
return customUrlToUse;
|
|
110
|
+
_handleProxyUrlsList() {
|
|
111
|
+
return this.proxyUrls[this.nextCustomUrlIndex++ % this.proxyUrls.length];
|
|
260
112
|
}
|
|
261
113
|
/**
|
|
262
114
|
* Calls the custom newUrlFunction and checks format of its return value
|
|
263
115
|
*/
|
|
264
|
-
async _callNewUrlFunction(
|
|
265
|
-
const proxyUrl = await this.newUrlFunction(
|
|
116
|
+
async _callNewUrlFunction(options) {
|
|
117
|
+
const proxyUrl = await this.newUrlFunction(options);
|
|
266
118
|
try {
|
|
267
119
|
if (proxyUrl) {
|
|
268
120
|
new URL(proxyUrl); // eslint-disable-line no-new
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"proxy_configuration.js","sourceRoot":"","sources":["../src/proxy_configuration.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,MAAM,IAAI,CAAC;
|
|
1
|
+
{"version":3,"file":"proxy_configuration.js","sourceRoot":"","sources":["../src/proxy_configuration.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,MAAM,IAAI,CAAC;AAGpB,OAAO,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AA6BtD;;;;;;;;;;;;;;;;;;;;;;;;;;;GA2BG;AACH,MAAM,OAAO,kBAAkB;IAC3B,gBAAgB,GAAG,KAAK,CAAC;IACf,kBAAkB,GAAG,CAAC,CAAC;IACvB,SAAS,CAAW;IACpB,aAAa,GAAG,IAAI,GAAG,EAAyB,CAAC;IACjD,cAAc,CAA8B;IAC5C,GAAG,GAAG,cAAc,CAAC,SAAS,EAAE,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,oBAAoB,EAAE,CAAC,CAAC;IAEnF;;;;;;;;;;;;;;;;;;;OAmBG;IACH,YAAY,UAAqC,EAAE;QAC/C,MAAM,EAAE,gBAAgB,EAAE,GAAG,IAAI,EAAE,GAAG,OAAqB,CAAC;QAE5D,IAAI,iBAAiB,IAAI,IAAI,EAAE,CAAC;YAC5B,MAAM,IAAI,KAAK,CACX,+DAA+D;gBAC3D,6EAA6E,CACpF,CAAC;QACN,CAAC;QAED,EAAE,CACE,IAAI,EACJ,EAAE,CAAC,MAAM,CAAC,UAAU,CAAC;YACjB,SAAS,EAAE,EAAE,CAAC,QAAQ,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,MAAM,CAAC,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC;YAC5E,cAAc,EAAE,EAAE,CAAC,QAAQ,CAAC,QAAQ;SACvC,CAAC,CACL,CAAC;QAEF,MAAM,EAAE,SAAS,EAAE,cAAc,EAAE,GAAG,OAAO,CAAC;QAE9C,IAAI,SAAS,IAAI,cAAc;YAAE,IAAI,CAAC,gCAAgC,EAAE,CAAC;QACzE,IAAI,CAAC,SAAS,IAAI,CAAC,cAAc,IAAI,gBAAgB;YAAE,IAAI,CAAC,uBAAuB,EAAE,CAAC;QAEtF,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;QAC3B,IAAI,CAAC,cAAc,GAAG,cAAc,CAAC;IACzC,CAAC;IAED;;;;;;;;OAQG;IACH,KAAK,CAAC,YAAY,CAAC,OAAuB;QACtC,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QACvC,IAAI,CAAC,GAAG;YAAE,OAAO,SAAS,CAAC;QAE3B,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,IAAI,EAAE,QAAQ,EAAE,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC;QAE5D,OAAO;YACH,GAAG;YACH,QAAQ,EAAE,kBAAkB,CAAC,QAAQ,CAAC;YACtC,QAAQ,EAAE,kBAAkB,CAAC,QAAQ,CAAC;YACtC,QAAQ;YACR,IAAI,EAAE,IAAK;SACd,CAAC;IACN,CAAC;IAED;;;;;OAKG;IACH,KAAK,CAAC,MAAM,CAAC,OAAuB;QAChC,IAAI,IAAI,CAAC,cAAc,EAAE,CAAC;YACtB,OAAO,CAAC,MAAM,IAAI,CAAC,mBAAmB,CAAC,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,CAAC,CAAC,IAAI,SAAS,CAAC;QACxF,CAAC;QAED,OAAO,IAAI,CAAC,oBAAoB,EAAE,IAAI,SAAS,CAAC;IACpD,CAAC;IAES,oBAAoB;QAC1B,OAAO,IAAI,CAAC,SAAU,CAAC,IAAI,CAAC,kBAAkB,EAAE,GAAG,IAAI,CAAC,SAAU,CAAC,MAAM,CAAC,CAAC;IAC/E,CAAC;IAED;;OAEG;IACO,KAAK,CAAC,mBAAmB,CAAC,OAA+B;QAC/D,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,cAAe,CAAC,OAAO,CAAC,CAAC;QACrD,IAAI,CAAC;YACD,IAAI,QAAQ,EAAE,CAAC;gBACX,IAAI,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,6BAA6B;YACpD,CAAC;YACD,OAAO,QAAQ,CAAC;QACpB,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACX,MAAM,IAAI,KAAK,CACX,mEAAoE,GAAa,CAAC,OAAO,EAAE,CAC9F,CAAC;QACN,CAAC;IACL,CAAC;IAES,gCAAgC;QACtC,MAAM,IAAI,KAAK,CACX,6GAA6G,CAChH,CAAC;IACN,CAAC;IAES,uBAAuB;QAC7B,MAAM,IAAI,KAAK,CAAC,8EAA8E,CAAC,CAAC;IACpG,CAAC;CACJ"}
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import type { Configuration, CrawleeLogger } from '@crawlee/core';
|
|
2
|
+
export interface RecoverableStatePersistenceOptions {
|
|
3
|
+
/**
|
|
4
|
+
* The key under which the state is stored in the KeyValueStore
|
|
5
|
+
*/
|
|
6
|
+
persistStateKey: string;
|
|
7
|
+
/**
|
|
8
|
+
* Flag to enable or disable state persistence
|
|
9
|
+
*/
|
|
10
|
+
persistenceEnabled?: boolean;
|
|
11
|
+
/**
|
|
12
|
+
* The name of the KeyValueStore to use for persistence.
|
|
13
|
+
* If neither a name nor an id are supplied, the default store will be used.
|
|
14
|
+
*/
|
|
15
|
+
persistStateKvsName?: string;
|
|
16
|
+
/**
|
|
17
|
+
* The identifier of the KeyValueStore to use for persistence.
|
|
18
|
+
* If neither a name nor an id are supplied, the default store will be used.
|
|
19
|
+
*/
|
|
20
|
+
persistStateKvsId?: string;
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* Options for configuring the RecoverableState
|
|
24
|
+
*/
|
|
25
|
+
export interface RecoverableStateOptions<TStateModel = Record<string, unknown>> extends RecoverableStatePersistenceOptions {
|
|
26
|
+
/**
|
|
27
|
+
* The default state used if no persisted state is found.
|
|
28
|
+
* A deep copy is made each time the state is used.
|
|
29
|
+
*/
|
|
30
|
+
defaultState: TStateModel;
|
|
31
|
+
/**
|
|
32
|
+
* A logger instance for logging operations related to state persistence
|
|
33
|
+
*/
|
|
34
|
+
logger?: CrawleeLogger;
|
|
35
|
+
/**
|
|
36
|
+
* Configuration instance to use
|
|
37
|
+
*/
|
|
38
|
+
config?: Configuration;
|
|
39
|
+
/**
|
|
40
|
+
* Optional function to transform the state to a JSON string before persistence.
|
|
41
|
+
* If not provided, JSON.stringify will be used.
|
|
42
|
+
*/
|
|
43
|
+
serialize?: (state: TStateModel) => string;
|
|
44
|
+
/**
|
|
45
|
+
* Optional function to transform a JSON-serialized object back to the state model.
|
|
46
|
+
* If not provided, JSON.parse is used.
|
|
47
|
+
* It is advisable to perform validation in this function and to throw an exception if it fails.
|
|
48
|
+
*/
|
|
49
|
+
deserialize?: (serializedState: string) => TStateModel;
|
|
50
|
+
}
|
|
51
|
+
/**
|
|
52
|
+
* A class for managing persistent recoverable state using a plain JavaScript object.
|
|
53
|
+
*
|
|
54
|
+
* This class facilitates state persistence to a `KeyValueStore`, allowing data to be saved and retrieved
|
|
55
|
+
* across migrations or restarts. It manages the loading, saving, and resetting of state data,
|
|
56
|
+
* with optional persistence capabilities.
|
|
57
|
+
*
|
|
58
|
+
* The state is represented by a plain JavaScript object that can be serialized to and deserialized from JSON.
|
|
59
|
+
* The class automatically hooks into the event system to persist state when needed.
|
|
60
|
+
*/
|
|
61
|
+
export declare class RecoverableState<TStateModel = Record<string, unknown>> {
|
|
62
|
+
private readonly defaultState;
|
|
63
|
+
private state;
|
|
64
|
+
private readonly persistenceEnabled;
|
|
65
|
+
private readonly persistStateKey;
|
|
66
|
+
private readonly persistStateKvsName?;
|
|
67
|
+
private readonly persistStateKvsId?;
|
|
68
|
+
private keyValueStore;
|
|
69
|
+
private readonly log;
|
|
70
|
+
private readonly serialize;
|
|
71
|
+
private readonly deserialize;
|
|
72
|
+
/**
|
|
73
|
+
* Initialize a new recoverable state object.
|
|
74
|
+
*
|
|
75
|
+
* @param options Configuration options for the recoverable state
|
|
76
|
+
*/
|
|
77
|
+
constructor(options: RecoverableStateOptions<TStateModel>);
|
|
78
|
+
/**
|
|
79
|
+
* Initialize the recoverable state.
|
|
80
|
+
*
|
|
81
|
+
* This method must be called before using the recoverable state. It loads the saved state
|
|
82
|
+
* if persistence is enabled and registers the object to listen for PERSIST_STATE events.
|
|
83
|
+
*
|
|
84
|
+
* @returns The loaded state object
|
|
85
|
+
*/
|
|
86
|
+
initialize(): Promise<TStateModel>;
|
|
87
|
+
/**
|
|
88
|
+
* Clean up resources used by the recoverable state.
|
|
89
|
+
*
|
|
90
|
+
* If persistence is enabled, this method deregisters the object from PERSIST_STATE events
|
|
91
|
+
* and persists the current state one last time.
|
|
92
|
+
*/
|
|
93
|
+
teardown(): Promise<void>;
|
|
94
|
+
/**
|
|
95
|
+
* Get the current state.
|
|
96
|
+
*/
|
|
97
|
+
get currentValue(): TStateModel;
|
|
98
|
+
/**
|
|
99
|
+
* Reset the state to the default values and clear any persisted state.
|
|
100
|
+
*
|
|
101
|
+
* Resets the current state to the default state and, if persistence is enabled,
|
|
102
|
+
* clears the persisted state from the KeyValueStore.
|
|
103
|
+
*/
|
|
104
|
+
reset(): Promise<void>;
|
|
105
|
+
/**
|
|
106
|
+
* Persist the current state to the KeyValueStore.
|
|
107
|
+
*
|
|
108
|
+
* This method is typically called in response to a PERSIST_STATE event, but can also be called
|
|
109
|
+
* directly when needed.
|
|
110
|
+
*
|
|
111
|
+
* @param eventData Optional data associated with a PERSIST_STATE event
|
|
112
|
+
*/
|
|
113
|
+
persistState(eventData?: {
|
|
114
|
+
isMigrating: boolean;
|
|
115
|
+
}): Promise<void>;
|
|
116
|
+
/**
|
|
117
|
+
* Load the saved state from the KeyValueStore
|
|
118
|
+
*/
|
|
119
|
+
private loadSavedState;
|
|
120
|
+
}
|
|
121
|
+
//# sourceMappingURL=recoverable_state.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"recoverable_state.d.ts","sourceRoot":"","sources":["../src/recoverable_state.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAGlE,MAAM,WAAW,kCAAkC;IAC/C;;OAEG;IACH,eAAe,EAAE,MAAM,CAAC;IAExB;;OAEG;IACH,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAE7B;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAE7B;;;OAGG;IACH,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC9B;AAED;;GAEG;AACH,MAAM,WAAW,uBAAuB,CACpC,WAAW,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CACvC,SAAQ,kCAAkC;IACxC;;;OAGG;IACH,YAAY,EAAE,WAAW,CAAC;IAE1B;;OAEG;IACH,MAAM,CAAC,EAAE,aAAa,CAAC;IAEvB;;OAEG;IACH,MAAM,CAAC,EAAE,aAAa,CAAC;IAEvB;;;OAGG;IACH,SAAS,CAAC,EAAE,CAAC,KAAK,EAAE,WAAW,KAAK,MAAM,CAAC;IAE3C;;;;OAIG;IACH,WAAW,CAAC,EAAE,CAAC,eAAe,EAAE,MAAM,KAAK,WAAW,CAAC;CAC1D;AAED;;;;;;;;;GASG;AACH,qBAAa,gBAAgB,CAAC,WAAW,GAAG,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC;IAC/D,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAc;IAC3C,OAAO,CAAC,KAAK,CAA4B;IACzC,OAAO,CAAC,QAAQ,CAAC,kBAAkB,CAAU;IAC7C,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAS;IACzC,OAAO,CAAC,QAAQ,CAAC,mBAAmB,CAAC,CAAS;IAC9C,OAAO,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAS;IAC5C,OAAO,CAAC,aAAa,CAA8B;IACnD,OAAO,CAAC,QAAQ,CAAC,GAAG,CAAgB;IACpC,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAiC;IAC3D,OAAO,CAAC,QAAQ,CAAC,WAAW,CAA2C;IAEvE;;;;OAIG;gBACS,OAAO,EAAE,uBAAuB,CAAC,WAAW,CAAC;IAazD;;;;;;;OAOG;IACG,UAAU,IAAI,OAAO,CAAC,WAAW,CAAC;IA6BxC;;;;;OAKG;IACG,QAAQ,IAAI,OAAO,CAAC,IAAI,CAAC;IAU/B;;OAEG;IACH,IAAI,YAAY,IAAI,WAAW,CAM9B;IAED;;;;;OAKG;IACG,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAY5B;;;;;;;OAOG;IACG,YAAY,CAAC,SAAS,CAAC,EAAE;QAAE,WAAW,EAAE,OAAO,CAAA;KAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAcvE;;OAEG;YACW,cAAc;CAY/B"}
|