apify 4.0.0-beta.12 → 4.0.0-beta.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -48
- package/dist/actor.d.ts +157 -61
- package/dist/actor.js +278 -91
- package/dist/apify_storage_client.d.ts +54 -0
- package/dist/apify_storage_client.js +152 -0
- package/dist/charging.d.ts +43 -2
- package/dist/charging.js +196 -54
- package/dist/configuration.d.ts +79 -132
- package/dist/configuration.js +114 -141
- package/dist/index.d.ts +2 -2
- package/dist/index.js +1 -2
- package/dist/input-schemas.d.ts +7 -0
- package/dist/input-schemas.js +58 -0
- package/dist/key_value_store.d.ts +8 -4
- package/dist/key_value_store.js +19 -11
- package/dist/platform_event_manager.d.ts +0 -1
- package/dist/platform_event_manager.js +5 -5
- package/dist/proxy_configuration.d.ts +41 -44
- package/dist/proxy_configuration.js +65 -103
- package/dist/storage.d.ts +58 -0
- package/dist/storage.js +79 -0
- package/dist/utils.d.ts +0 -1
- package/dist/utils.js +2 -4
- package/package.json +123 -73
- package/.turbo/turbo-build.log +0 -26
- package/.turbo/turbo-copy.log +0 -4
- package/dist/LICENSE.md +0 -201
- package/dist/README.md +0 -98
- package/dist/actor.d.ts.map +0 -1
- package/dist/actor.js.map +0 -1
- package/dist/charging.d.ts.map +0 -1
- package/dist/charging.js.map +0 -1
- package/dist/configuration.d.ts.map +0 -1
- package/dist/configuration.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/key_value_store.d.ts.map +0 -1
- package/dist/key_value_store.js.map +0 -1
- package/dist/package.json +0 -75
- package/dist/platform_event_manager.d.ts.map +0 -1
- package/dist/platform_event_manager.js.map +0 -1
- package/dist/proxy_configuration.d.ts.map +0 -1
- package/dist/proxy_configuration.js.map +0 -1
- package/dist/utils.d.ts.map +0 -1
- package/dist/utils.js.map +0 -1
package/dist/key_value_store.js
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
import { KeyValueStore as CoreKeyValueStore } from '@crawlee/core';
|
|
2
|
+
import { KeyValueStoreClient as RemoteKeyValueStoreClient } from 'apify-client';
|
|
2
3
|
import { createHmacSignature } from '@apify/utilities';
|
|
3
|
-
// @ts-ignore newer crawlee versions already declare this method in core
|
|
4
|
-
const { getPublicUrl } = CoreKeyValueStore.prototype;
|
|
5
4
|
/**
|
|
6
5
|
* @inheritDoc
|
|
7
6
|
*/
|
|
@@ -9,15 +8,27 @@ export class KeyValueStore extends CoreKeyValueStore {
|
|
|
9
8
|
/**
|
|
10
9
|
* Returns a URL for the given key that may be used to publicly
|
|
11
10
|
* access the value in the remote key-value store.
|
|
11
|
+
*
|
|
12
|
+
* On the Apify platform the URL is signed with the store's
|
|
13
|
+
* `urlSigningSecretKey` so that anyone with the URL can read the record
|
|
14
|
+
* without authentication. Locally we delegate to crawlee's default
|
|
15
|
+
* implementation (which produces a `file://` URL or returns `undefined`).
|
|
12
16
|
*/
|
|
13
|
-
getPublicUrl(key) {
|
|
17
|
+
async getPublicUrl(key) {
|
|
14
18
|
const config = this.config;
|
|
15
|
-
|
|
16
|
-
|
|
19
|
+
// Detect a remote (Apify) store by its client type rather than by
|
|
20
|
+
// `isAtHome`, so that a `forceCloud` store opened locally still gets a
|
|
21
|
+
// signed Apify URL (matching the platform behaviour). `client` is
|
|
22
|
+
// `private` on `CoreKeyValueStore`, so bypass the visibility check.
|
|
23
|
+
const { client } = this;
|
|
24
|
+
const isLocalStore = !(client instanceof RemoteKeyValueStoreClient);
|
|
25
|
+
if (isLocalStore) {
|
|
26
|
+
return super.getPublicUrl(key);
|
|
17
27
|
}
|
|
18
|
-
const publicUrl = new URL(`${config.
|
|
19
|
-
|
|
20
|
-
|
|
28
|
+
const publicUrl = new URL(`${config.apiPublicBaseUrl}/v2/key-value-stores/${this.id}/records/${key}`);
|
|
29
|
+
const metadata = (await client.getMetadata());
|
|
30
|
+
if (metadata?.urlSigningSecretKey) {
|
|
31
|
+
publicUrl.searchParams.append('signature', createHmacSignature(metadata.urlSigningSecretKey, key));
|
|
21
32
|
}
|
|
22
33
|
return publicUrl.toString();
|
|
23
34
|
}
|
|
@@ -28,6 +39,3 @@ export class KeyValueStore extends CoreKeyValueStore {
|
|
|
28
39
|
return super.open(storeIdOrName, options);
|
|
29
40
|
}
|
|
30
41
|
}
|
|
31
|
-
// @ts-ignore newer crawlee versions already declare this method in core
|
|
32
|
-
CoreKeyValueStore.prototype.getPublicUrl = KeyValueStore.prototype.getPublicUrl;
|
|
33
|
-
//# sourceMappingURL=key_value_store.js.map
|
|
package/dist/platform_event_manager.js
CHANGED
|
@@ -46,7 +46,9 @@ export class PlatformEventManager extends EventManager {
|
|
|
46
46
|
/** Websocket connection to Actor events. */
|
|
47
47
|
eventsWs;
|
|
48
48
|
constructor(config = Configuration.getGlobalConfig()) {
|
|
49
|
-
super(
|
|
49
|
+
super({
|
|
50
|
+
persistStateIntervalMillis: config.persistStateIntervalMillis,
|
|
51
|
+
});
|
|
50
52
|
this.config = config;
|
|
51
53
|
}
|
|
52
54
|
/**
|
|
@@ -58,7 +60,7 @@ export class PlatformEventManager extends EventManager {
|
|
|
58
60
|
return;
|
|
59
61
|
}
|
|
60
62
|
await super.init();
|
|
61
|
-
const eventsWsUrl = this.config.
|
|
63
|
+
const eventsWsUrl = this.config.actorEventsWsUrl;
|
|
62
64
|
// Locally there is no web socket to connect, so just print a log message.
|
|
63
65
|
if (!eventsWsUrl) {
|
|
64
66
|
this.log.debug(`Environment variable ${ACTOR_ENV_VARS.EVENTS_WEBSOCKET_URL} is not set, no events from Apify platform will be emitted.`);
|
|
@@ -87,8 +89,7 @@ export class PlatformEventManager extends EventManager {
|
|
|
87
89
|
});
|
|
88
90
|
this.eventsWs.on('error', (err) => {
|
|
89
91
|
// Don't print this error as this happens in the case of very short Actor.main().
|
|
90
|
-
if (err.message ===
|
|
91
|
-
'WebSocket was closed before the connection was established')
|
|
92
|
+
if (err.message === 'WebSocket was closed before the connection was established')
|
|
92
93
|
return;
|
|
93
94
|
this.log.exception(err, 'web socket connection failed');
|
|
94
95
|
});
|
|
@@ -110,4 +111,3 @@ export class PlatformEventManager extends EventManager {
|
|
|
110
111
|
this.eventsWs?.close();
|
|
111
112
|
}
|
|
112
113
|
}
|
|
113
|
-
//# sourceMappingURL=platform_event_manager.js.map
|
|
package/dist/proxy_configuration.d.ts
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
|
-
import type { ProxyConfigurationOptions as CoreProxyConfigurationOptions
|
|
1
|
+
import type { ProxyConfigurationOptions as CoreProxyConfigurationOptions } from '@crawlee/core';
|
|
2
2
|
import { ProxyConfiguration as CoreProxyConfiguration } from '@crawlee/core';
|
|
3
|
+
import type { ProxyInfo as CoreProxyInfo } from '@crawlee/types';
|
|
3
4
|
import { Configuration } from './configuration.js';
|
|
5
|
+
type NewUrlOptions = Parameters<CoreProxyConfiguration['newProxyInfo']>[0];
|
|
4
6
|
export interface ProxyConfigurationOptions extends CoreProxyConfigurationOptions {
|
|
5
7
|
/**
|
|
6
8
|
* User's password for the proxy. By default, it is taken from the `APIFY_PROXY_PASSWORD`
|
|
@@ -24,6 +26,12 @@ export interface ProxyConfigurationOptions extends CoreProxyConfigurationOptions
|
|
|
24
26
|
* on the Apify cloud, or when using the [Apify CLI](https://github.com/apify/apify-cli).
|
|
25
27
|
*/
|
|
26
28
|
countryCode?: string;
|
|
29
|
+
/**
|
|
30
|
+
* If set, all proxied requests will use IP addresses geolocated to the specified subdivision (e.g. US state).
|
|
31
|
+
* Requires `countryCode` to be set. The value must follow the ISO 3166-2 subdivision code format,
|
|
32
|
+
* e.g. `'CA'` for California when `countryCode` is `'US'`.
|
|
33
|
+
*/
|
|
34
|
+
subdivisionCode?: string;
|
|
27
35
|
/**
|
|
28
36
|
* Same option as `groups` which can be used to
|
|
29
37
|
* configurate the proxy by UI input schema. You should use the `groups` option in your crawler code.
|
|
@@ -35,10 +43,15 @@ export interface ProxyConfigurationOptions extends CoreProxyConfigurationOptions
|
|
|
35
43
|
*/
|
|
36
44
|
apifyProxyCountry?: string;
|
|
37
45
|
/**
|
|
38
|
-
*
|
|
39
|
-
*
|
|
46
|
+
* Same option as `subdivisionCode` which can be used to
|
|
47
|
+
* configurate the proxy by UI input schema. You should use the `subdivisionCode` option in your crawler code.
|
|
48
|
+
*/
|
|
49
|
+
apifyProxySubdivision?: string;
|
|
50
|
+
/**
|
|
51
|
+
* As part of the init process, we verify the configuration by checking the proxy status endpoint.
|
|
52
|
+
* This can make the init slower, to opt-out of this, use `checkAccess: false` (defaults to `true`).
|
|
40
53
|
*/
|
|
41
|
-
|
|
54
|
+
checkAccess?: boolean;
|
|
42
55
|
}
|
|
43
56
|
/**
|
|
44
57
|
* The main purpose of the ProxyInfo object is to provide information
|
|
@@ -64,9 +77,6 @@ export interface ProxyConfigurationOptions extends CoreProxyConfigurationOptions
|
|
|
64
77
|
* requestHandler({ proxyInfo }) {
|
|
65
78
|
* // Getting used proxy URL
|
|
66
79
|
* const proxyUrl = proxyInfo.url;
|
|
67
|
-
*
|
|
68
|
-
* // Getting ID of used Session
|
|
69
|
-
* const sessionIdentifier = proxyInfo.sessionId;
|
|
70
80
|
* }
|
|
71
81
|
* })
|
|
72
82
|
*
|
|
@@ -77,7 +87,7 @@ export interface ProxyInfo extends CoreProxyInfo {
|
|
|
77
87
|
* An array of proxy groups to be used by the [Apify Proxy](https://docs.apify.com/proxy).
|
|
78
88
|
* If not provided, the proxy will select the groups automatically.
|
|
79
89
|
*/
|
|
80
|
-
groups
|
|
90
|
+
groups?: string[];
|
|
81
91
|
/**
|
|
82
92
|
* If set and relevant proxies are available in your Apify account, all proxied requests will
|
|
83
93
|
* use IP addresses that are geolocated to the specified country. For example `GB` for IPs
|
|
@@ -89,6 +99,11 @@ export interface ProxyInfo extends CoreProxyInfo {
|
|
|
89
99
|
* This parameter is optional, by default, the proxy uses all available proxy servers from all countries.
|
|
90
100
|
*/
|
|
91
101
|
countryCode?: string;
|
|
102
|
+
/**
|
|
103
|
+
* If set, all proxied requests use IP addresses geolocated to the specified subdivision (e.g. US state).
|
|
104
|
+
* ISO 3166-2 subdivision code, e.g. `'CA'` when `countryCode` is `'US'`.
|
|
105
|
+
*/
|
|
106
|
+
subdivisionCode?: string;
|
|
92
107
|
/**
|
|
93
108
|
* User's password for the proxy. By default, it is taken from the `APIFY_PROXY_PASSWORD`
|
|
94
109
|
* environment variable, which is automatically set by the system when running the Actors
|
|
@@ -133,6 +148,7 @@ export declare class ProxyConfiguration extends CoreProxyConfiguration {
|
|
|
133
148
|
readonly config: Configuration;
|
|
134
149
|
private groups;
|
|
135
150
|
private countryCode?;
|
|
151
|
+
private subdivisionCode?;
|
|
136
152
|
private password?;
|
|
137
153
|
private hostname;
|
|
138
154
|
private port?;
|
|
@@ -149,45 +165,26 @@ export declare class ProxyConfiguration extends CoreProxyConfiguration {
|
|
|
149
165
|
* You should use the {@link createProxyConfiguration} function to create a pre-initialized
|
|
150
166
|
* `ProxyConfiguration` instance instead of calling this manually.
|
|
151
167
|
*/
|
|
152
|
-
initialize(
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
*
|
|
157
|
-
*
|
|
158
|
-
*
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
*
|
|
163
|
-
*
|
|
164
|
-
*
|
|
165
|
-
* All the HTTP requests going through the proxy with the same session identifier
|
|
166
|
-
* will use the same target proxy server (i.e. the same IP address).
|
|
167
|
-
* The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`.
|
|
168
|
-
* @return Represents information about used proxy and its configuration.
|
|
169
|
-
*/
|
|
170
|
-
newProxyInfo(sessionId?: string | number, options?: Parameters<CoreProxyConfiguration['newProxyInfo']>[1]): Promise<ProxyInfo | undefined>;
|
|
171
|
-
/**
|
|
172
|
-
* Returns a new proxy URL based on provided configuration options and the `sessionId` parameter.
|
|
173
|
-
* @param [sessionId]
|
|
174
|
-
* Represents the identifier of user {@link Session} that can be managed by the {@link SessionPool} or
|
|
175
|
-
* you can use the Apify Proxy [Session](https://docs.apify.com/proxy#sessions) identifier.
|
|
176
|
-
* When the provided sessionId is a number, it's converted to a string.
|
|
177
|
-
*
|
|
178
|
-
* All the HTTP requests going through the proxy with the same session identifier
|
|
179
|
-
* will use the same target proxy server (i.e. the same IP address).
|
|
180
|
-
* The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`.
|
|
181
|
-
* @return A string with a proxy URL, including authentication credentials and port number.
|
|
182
|
-
* For example, `http://bob:password123@proxy.example.com:8000`
|
|
168
|
+
initialize(options?: {
|
|
169
|
+
checkAccess?: boolean;
|
|
170
|
+
}): Promise<boolean>;
|
|
171
|
+
/**
|
|
172
|
+
* Returns a new {@link ProxyInfo} object with a fresh proxy URL. Each call mints an
|
|
173
|
+
* independent URL; for Apify Proxy a random session id is embedded so consecutive
|
|
174
|
+
* calls resolve to different IPs.
|
|
175
|
+
*/
|
|
176
|
+
newProxyInfo(options?: NewUrlOptions): Promise<ProxyInfo | undefined>;
|
|
177
|
+
/**
|
|
178
|
+
* Returns a new proxy URL. For Apify Proxy, each call generates a URL with a fresh
|
|
179
|
+
* random session id, so consecutive calls return independent URLs. For custom
|
|
180
|
+
* `proxyUrls`, the URLs are rotated round-robin.
|
|
183
181
|
*/
|
|
184
|
-
newUrl(
|
|
185
|
-
protected _generateTieredProxyUrls(tieredProxyConfig: NonNullable<ProxyConfigurationOptions['tieredProxyConfig']>, globalOptions: ProxyConfigurationOptions): string[][];
|
|
182
|
+
newUrl(options?: NewUrlOptions): Promise<string | undefined>;
|
|
186
183
|
/**
|
|
187
184
|
* Returns proxy username.
|
|
188
185
|
*/
|
|
189
|
-
protected _getUsername(sessionId
|
|
190
|
-
protected composeDefaultUrl(sessionId
|
|
186
|
+
protected _getUsername(sessionId: string): string;
|
|
187
|
+
protected composeDefaultUrl(sessionId: string): string;
|
|
191
188
|
/**
|
|
192
189
|
* Fetch & set the proxy password from Apify API if an Apify token is provided.
|
|
193
190
|
*/
|
|
@@ -212,4 +209,4 @@ export declare class ProxyConfiguration extends CoreProxyConfiguration {
|
|
|
212
209
|
*/
|
|
213
210
|
protected _throwCannotCombineCustomWithApify(): void;
|
|
214
211
|
}
|
|
215
|
-
|
|
212
|
+
export {};
|
|
package/dist/proxy_configuration.js
CHANGED
|
@@ -5,11 +5,15 @@ import { APIFY_ENV_VARS, APIFY_PROXY_VALUE_REGEX } from '@apify/consts';
|
|
|
5
5
|
import { cryptoRandomObjectId } from '@apify/utilities';
|
|
6
6
|
import { Actor } from './actor.js';
|
|
7
7
|
import { Configuration } from './configuration.js';
|
|
8
|
-
// https://docs.apify.com/proxy/datacenter-proxy#username-parameters
|
|
9
|
-
const MAX_SESSION_ID_LENGTH = 50;
|
|
10
8
|
const CHECK_ACCESS_REQUEST_TIMEOUT_MILLIS = 4_000;
|
|
11
9
|
const CHECK_ACCESS_MAX_ATTEMPTS = 2;
|
|
12
10
|
const COUNTRY_CODE_REGEX = /^[A-Z]{2}$/;
|
|
11
|
+
// ISO 3166-2 subdivision codes are 1–3 uppercase alphanumeric characters, e.g. 'CA' (California), 'NSW' (New South Wales), '9' (Wien, AT-9)
|
|
12
|
+
const SUBDIVISION_CODE_REGEX = /^[A-Z0-9]{1,3}$/;
|
|
13
|
+
// Apify Proxy session identifier embedded in the proxy username — opaque to
|
|
14
|
+
// users; a fresh one is minted for every URL the SDK hands out so that the
|
|
15
|
+
// returned proxy URLs are independent.
|
|
16
|
+
const SESSION_ID_LENGTH = 12;
|
|
13
17
|
/**
|
|
14
18
|
* Configures connection to a proxy server with the provided options. Proxy servers are used to prevent target websites from blocking
|
|
15
19
|
* your crawlers based on IP address rate limits or blacklists. Setting proxy configuration in your crawlers automatically configures
|
|
@@ -47,6 +51,7 @@ export class ProxyConfiguration extends CoreProxyConfiguration {
|
|
|
47
51
|
config;
|
|
48
52
|
groups;
|
|
49
53
|
countryCode;
|
|
54
|
+
subdivisionCode;
|
|
50
55
|
password;
|
|
51
56
|
hostname;
|
|
52
57
|
port;
|
|
@@ -67,35 +72,37 @@ export class ProxyConfiguration extends CoreProxyConfiguration {
|
|
|
67
72
|
apifyProxyGroups: ow.optional.array.ofType(ow.string.matches(APIFY_PROXY_VALUE_REGEX)),
|
|
68
73
|
countryCode: ow.optional.string.matches(COUNTRY_CODE_REGEX),
|
|
69
74
|
apifyProxyCountry: ow.optional.string.matches(COUNTRY_CODE_REGEX),
|
|
75
|
+
subdivisionCode: ow.optional.string.matches(SUBDIVISION_CODE_REGEX),
|
|
76
|
+
apifyProxySubdivision: ow.optional.string.matches(SUBDIVISION_CODE_REGEX),
|
|
70
77
|
password: ow.optional.string,
|
|
71
|
-
tieredProxyUrls: ow.optional.array.ofType(ow.array.ofType(ow.string)),
|
|
72
|
-
tieredProxyConfig: ow.optional.array.ofType(ow.object),
|
|
73
78
|
}));
|
|
74
|
-
const { groups = [], apifyProxyGroups = [], countryCode, apifyProxyCountry, password = config.
|
|
75
|
-
this.tieredProxyUrls ??= tieredProxyUrls;
|
|
76
|
-
if (tieredProxyConfig) {
|
|
77
|
-
this.tieredProxyUrls = this._generateTieredProxyUrls(tieredProxyConfig, options);
|
|
78
|
-
}
|
|
79
|
+
const { groups = [], apifyProxyGroups = [], countryCode, apifyProxyCountry, subdivisionCode, apifyProxySubdivision, password = config.proxyPassword, } = options;
|
|
79
80
|
const groupsToUse = groups.length ? groups : apifyProxyGroups;
|
|
80
81
|
const countryCodeToUse = countryCode || apifyProxyCountry;
|
|
81
|
-
const
|
|
82
|
-
const
|
|
82
|
+
const subdivisionCodeToUse = subdivisionCode || apifyProxySubdivision;
|
|
83
|
+
const hostname = config.proxyHostname;
|
|
84
|
+
const port = config.proxyPort;
|
|
85
|
+
// The Apify Proxy subdivision is expressed as part of the country
|
|
86
|
+
// username parameter (`country-US_CA`), so a country is required.
|
|
87
|
+
if (subdivisionCodeToUse && !countryCodeToUse) {
|
|
88
|
+
throw new Error('ProxyConfiguration: "subdivisionCode" requires "countryCode" to be set.');
|
|
89
|
+
}
|
|
83
90
|
// Validation
|
|
84
|
-
if ((proxyUrls || newUrlFunction) &&
|
|
85
|
-
(groupsToUse.length || countryCodeToUse)) {
|
|
91
|
+
if ((proxyUrls || newUrlFunction) && (groupsToUse.length || countryCodeToUse || subdivisionCodeToUse)) {
|
|
86
92
|
this._throwCannotCombineCustomWithApify();
|
|
87
93
|
}
|
|
88
94
|
if (proxyUrls && newUrlFunction)
|
|
89
95
|
this._throwCannotCombineCustomMethods();
|
|
90
96
|
this.groups = groupsToUse;
|
|
91
97
|
this.countryCode = countryCodeToUse;
|
|
98
|
+
this.subdivisionCode = subdivisionCodeToUse;
|
|
92
99
|
this.password = password;
|
|
93
100
|
this.hostname = hostname;
|
|
94
101
|
this.port = port;
|
|
95
102
|
this.usesApifyProxy = !this.proxyUrls && !this.newUrlFunction;
|
|
96
|
-
if (proxyUrls && proxyUrls.some((url) => url
|
|
103
|
+
if (proxyUrls && proxyUrls.some((url) => url?.includes('apify.com'))) {
|
|
97
104
|
this.log.warning('Some Apify proxy features may work incorrectly. Please consider setting up Apify properties instead of `proxyUrls`.\n' +
|
|
98
|
-
'See https://
|
|
105
|
+
'See https://docs.apify.com/sdk/js/docs/concepts/proxy-management#apify-proxy-configuration');
|
|
99
106
|
}
|
|
100
107
|
}
|
|
101
108
|
/**
|
|
@@ -106,7 +113,7 @@ export class ProxyConfiguration extends CoreProxyConfiguration {
|
|
|
106
113
|
* You should use the {@link createProxyConfiguration} function to create a pre-initialized
|
|
107
114
|
* `ProxyConfiguration` instance instead of calling this manually.
|
|
108
115
|
*/
|
|
109
|
-
async initialize() {
|
|
116
|
+
async initialize(options) {
|
|
110
117
|
if (this.usesApifyProxy) {
|
|
111
118
|
if (!this.password) {
|
|
112
119
|
await this._setPasswordIfToken();
|
|
@@ -124,111 +131,66 @@ export class ProxyConfiguration extends CoreProxyConfiguration {
|
|
|
124
131
|
`so that the SDK can fetch the proxy password from Apify API, when ${APIFY_ENV_VARS.PROXY_PASSWORD} is not defined`);
|
|
125
132
|
}
|
|
126
133
|
}
|
|
127
|
-
|
|
134
|
+
if (options?.checkAccess !== false) {
|
|
135
|
+
return this._checkAccess();
|
|
136
|
+
}
|
|
128
137
|
}
|
|
129
138
|
return true;
|
|
130
139
|
}
|
|
131
140
|
/**
|
|
132
|
-
*
|
|
133
|
-
*
|
|
134
|
-
*
|
|
135
|
-
* Use it if you want to work with a rich representation of a proxy URL.
|
|
136
|
-
* If you need the URL string only, use {@link ProxyConfiguration.newUrl}.
|
|
137
|
-
* @param [sessionId]
|
|
138
|
-
* Represents the identifier of user {@link Session} that can be managed by the {@link SessionPool} or
|
|
139
|
-
* you can use the Apify Proxy [Session](https://docs.apify.com/proxy#sessions) identifier.
|
|
140
|
-
* When the provided sessionId is a number, it's converted to a string. Property sessionId of
|
|
141
|
-
* {@link ProxyInfo} is always returned as a type string.
|
|
142
|
-
*
|
|
143
|
-
* All the HTTP requests going through the proxy with the same session identifier
|
|
144
|
-
* will use the same target proxy server (i.e. the same IP address).
|
|
145
|
-
* The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`.
|
|
146
|
-
* @return Represents information about used proxy and its configuration.
|
|
141
|
+
* Returns a new {@link ProxyInfo} object with a fresh proxy URL. Each call mints an
|
|
142
|
+
* independent URL; for Apify Proxy a random session id is embedded so consecutive
|
|
143
|
+
* calls resolve to different IPs.
|
|
147
144
|
*/
|
|
148
|
-
async newProxyInfo(
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
...proxyInfo,
|
|
160
|
-
sessionId,
|
|
161
|
-
groups,
|
|
162
|
-
countryCode,
|
|
163
|
-
// this.password is not encoded, but the password from the URL will be, we need to normalize
|
|
164
|
-
password: this.usesApifyProxy
|
|
165
|
-
? (password ?? '')
|
|
166
|
-
: decodeURIComponent(password),
|
|
167
|
-
hostname,
|
|
168
|
-
port: port,
|
|
145
|
+
async newProxyInfo(options) {
|
|
146
|
+
const url = await this.newUrl(options);
|
|
147
|
+
if (!url)
|
|
148
|
+
return undefined;
|
|
149
|
+
const parsed = new URL(url);
|
|
150
|
+
const result = {
|
|
151
|
+
url,
|
|
152
|
+
username: decodeURIComponent(parsed.username),
|
|
153
|
+
password: decodeURIComponent(parsed.password),
|
|
154
|
+
hostname: parsed.hostname,
|
|
155
|
+
port: parsed.port,
|
|
169
156
|
};
|
|
157
|
+
if (this.usesApifyProxy) {
|
|
158
|
+
result.groups = this.groups;
|
|
159
|
+
if (this.countryCode !== undefined)
|
|
160
|
+
result.countryCode = this.countryCode;
|
|
161
|
+
if (this.subdivisionCode !== undefined)
|
|
162
|
+
result.subdivisionCode = this.subdivisionCode;
|
|
163
|
+
}
|
|
164
|
+
return result;
|
|
170
165
|
}
|
|
171
166
|
/**
|
|
172
|
-
* Returns a new proxy URL
|
|
173
|
-
*
|
|
174
|
-
*
|
|
175
|
-
* you can use the Apify Proxy [Session](https://docs.apify.com/proxy#sessions) identifier.
|
|
176
|
-
* When the provided sessionId is a number, it's converted to a string.
|
|
177
|
-
*
|
|
178
|
-
* All the HTTP requests going through the proxy with the same session identifier
|
|
179
|
-
* will use the same target proxy server (i.e. the same IP address).
|
|
180
|
-
* The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`.
|
|
181
|
-
* @return A string with a proxy URL, including authentication credentials and port number.
|
|
182
|
-
* For example, `http://bob:password123@proxy.example.com:8000`
|
|
167
|
+
* Returns a new proxy URL. For Apify Proxy, each call generates a URL with a fresh
|
|
168
|
+
* random session id, so consecutive calls return independent URLs. For custom
|
|
169
|
+
* `proxyUrls`, the URLs are rotated round-robin.
|
|
183
170
|
*/
|
|
184
|
-
async newUrl(
|
|
185
|
-
if (
|
|
186
|
-
|
|
187
|
-
ow(sessionId, ow.optional.string
|
|
188
|
-
.maxLength(MAX_SESSION_ID_LENGTH)
|
|
189
|
-
.matches(APIFY_PROXY_VALUE_REGEX));
|
|
190
|
-
if (this.newUrlFunction) {
|
|
191
|
-
return ((await this._callNewUrlFunction(sessionId, {
|
|
192
|
-
request: options?.request,
|
|
193
|
-
})) ?? undefined);
|
|
194
|
-
}
|
|
195
|
-
if (this.proxyUrls) {
|
|
196
|
-
return this._handleCustomUrl(sessionId);
|
|
171
|
+
async newUrl(options) {
|
|
172
|
+
if (this.newUrlFunction || this.proxyUrls) {
|
|
173
|
+
return super.newUrl(options);
|
|
197
174
|
}
|
|
198
|
-
|
|
199
|
-
return (this._handleTieredUrl(sessionId ?? cryptoRandomObjectId(6), options).proxyUrl ?? undefined);
|
|
200
|
-
}
|
|
201
|
-
return this.composeDefaultUrl(sessionId);
|
|
202
|
-
}
|
|
203
|
-
_generateTieredProxyUrls(tieredProxyConfig, globalOptions) {
|
|
204
|
-
return tieredProxyConfig.map((config) => [
|
|
205
|
-
new ProxyConfiguration({
|
|
206
|
-
...globalOptions,
|
|
207
|
-
...config,
|
|
208
|
-
tieredProxyConfig: undefined,
|
|
209
|
-
}).composeDefaultUrl(),
|
|
210
|
-
]);
|
|
175
|
+
return this.composeDefaultUrl(cryptoRandomObjectId(SESSION_ID_LENGTH));
|
|
211
176
|
}
|
|
212
177
|
/**
|
|
213
178
|
* Returns proxy username.
|
|
214
179
|
*/
|
|
215
180
|
_getUsername(sessionId) {
|
|
216
|
-
|
|
217
|
-
const { groups, countryCode } = this;
|
|
181
|
+
const { groups, countryCode, subdivisionCode } = this;
|
|
218
182
|
const parts = [];
|
|
219
183
|
if (groups && groups.length) {
|
|
220
184
|
parts.push(`groups-${groups.join('+')}`);
|
|
221
185
|
}
|
|
222
|
-
|
|
223
|
-
|
|
186
|
+
parts.push(`session-${sessionId}`);
|
|
187
|
+
if (subdivisionCode) {
|
|
188
|
+
parts.push(`country-${countryCode}_${subdivisionCode}`);
|
|
224
189
|
}
|
|
225
|
-
if (countryCode) {
|
|
190
|
+
else if (countryCode) {
|
|
226
191
|
parts.push(`country-${countryCode}`);
|
|
227
192
|
}
|
|
228
|
-
|
|
229
|
-
if (parts.length === 0)
|
|
230
|
-
username = 'auto';
|
|
231
|
-
return username;
|
|
193
|
+
return parts.join(',');
|
|
232
194
|
}
|
|
233
195
|
composeDefaultUrl(sessionId) {
|
|
234
196
|
const username = this._getUsername(sessionId);
|
|
@@ -243,7 +205,7 @@ export class ProxyConfiguration extends CoreProxyConfiguration {
|
|
|
243
205
|
*/
|
|
244
206
|
// TODO: Make this private
|
|
245
207
|
async _setPasswordIfToken() {
|
|
246
|
-
const token = this.config
|
|
208
|
+
const { token } = this.config;
|
|
247
209
|
if (!token)
|
|
248
210
|
return;
|
|
249
211
|
try {
|
|
@@ -291,7 +253,7 @@ export class ProxyConfiguration extends CoreProxyConfiguration {
|
|
|
291
253
|
* Apify Proxy can be down for a second or a minute, but this should not crash processes.
|
|
292
254
|
*/
|
|
293
255
|
async _fetchStatus() {
|
|
294
|
-
const proxyStatusUrl = this.config
|
|
256
|
+
const { proxyStatusUrl } = this.config;
|
|
295
257
|
const requestOpts = {
|
|
296
258
|
url: `${proxyStatusUrl}/?format=json`,
|
|
297
259
|
proxyUrl: await this.newUrl(),
|
|
@@ -316,7 +278,7 @@ export class ProxyConfiguration extends CoreProxyConfiguration {
|
|
|
316
278
|
_throwCannotCombineCustomWithApify() {
|
|
317
279
|
throw new Error('Cannot combine custom proxies with Apify Proxy! ' +
|
|
318
280
|
'It is not allowed to set "options.proxyUrls" or "options.newUrlFunction" combined with ' +
|
|
319
|
-
'"options.groups"
|
|
281
|
+
'"options.groups", "options.apifyProxyGroups", "options.countryCode", "options.apifyProxyCountry", ' +
|
|
282
|
+
'"options.subdivisionCode" or "options.apifyProxySubdivision".');
|
|
320
283
|
}
|
|
321
284
|
}
|
|
322
|
-
//# sourceMappingURL=proxy_configuration.js.map
|
|
package/dist/storage.d.ts
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import type { Constructor, IStorage, StorageOpenOptions } from '@crawlee/core';
|
|
2
|
+
import type { StorageClient } from '@crawlee/types';
|
|
3
|
+
import type { Configuration } from './configuration.js';
|
|
4
|
+
export interface OpenStorageOptions {
|
|
5
|
+
/**
|
|
6
|
+
* If set to `true` then the cloud storage is used even if the `CRAWLEE_STORAGE_DIR`
|
|
7
|
+
* environment variable is set. This way it is possible to combine local and cloud storage.
|
|
8
|
+
* @default false
|
|
9
|
+
*/
|
|
10
|
+
forceCloud?: boolean;
|
|
11
|
+
}
|
|
12
|
+
/**
|
|
13
|
+
* Identifies a storage by its alias from the Actor's schema storages
|
|
14
|
+
* (resolved via the `ACTOR_STORAGES_JSON` environment variable).
|
|
15
|
+
*/
|
|
16
|
+
export interface StorageAlias {
|
|
17
|
+
alias: string;
|
|
18
|
+
}
|
|
19
|
+
/**
|
|
20
|
+
* Identifies a storage by its platform ID.
|
|
21
|
+
*/
|
|
22
|
+
export interface StorageId {
|
|
23
|
+
id: string;
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Identifies a storage by its name.
|
|
27
|
+
*/
|
|
28
|
+
export interface StorageName {
|
|
29
|
+
name: string;
|
|
30
|
+
}
|
|
31
|
+
/**
|
|
32
|
+
* Identifies a storage to open. Can be:
|
|
33
|
+
* - A plain `string` for backward compatibility (treated as ID or name)
|
|
34
|
+
* - `{ alias: string }` to resolve from the Actor's schema storages (`ACTOR_STORAGES_JSON`)
|
|
35
|
+
* - `{ id: string }` to open by explicit platform ID
|
|
36
|
+
* - `{ name: string }` to open by explicit name
|
|
37
|
+
*/
|
|
38
|
+
export type StorageIdentifier = string | StorageAlias | StorageId | StorageName;
|
|
39
|
+
/**
|
|
40
|
+
* Identifies a storage to open, without alias support.
|
|
41
|
+
* Used for key-value stores and request queues, which do not support aliases.
|
|
42
|
+
* Can be:
|
|
43
|
+
* - A plain `string` for backward compatibility (treated as ID or name)
|
|
44
|
+
* - `{ id: string }` to open by explicit platform ID
|
|
45
|
+
* - `{ name: string }` to open by explicit name
|
|
46
|
+
*/
|
|
47
|
+
export type StorageIdentifierWithoutAlias = string | StorageId | StorageName;
|
|
48
|
+
export interface OpenStorageContext {
|
|
49
|
+
config: Configuration;
|
|
50
|
+
client?: StorageClient;
|
|
51
|
+
purgedStorageAliases: Set<string>;
|
|
52
|
+
}
|
|
53
|
+
/**
|
|
54
|
+
* Opens a storage by its identifier, handling Apify alias resolution and local purging.
|
|
55
|
+
*/
|
|
56
|
+
export declare function openStorage<T extends IStorage>(storageClass: Constructor<T> & {
|
|
57
|
+
open(id?: string | null, options?: StorageOpenOptions): Promise<T>;
|
|
58
|
+
}, identifier: StorageIdentifier | null | undefined, context: OpenStorageContext): Promise<T>;
|
package/dist/storage.js
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import { ApifyStorageClient } from './apify_storage_client.js';
|
|
2
|
+
const STORAGE_TYPE_KEYS = {
|
|
3
|
+
Dataset: 'datasets',
|
|
4
|
+
KeyValueStore: 'keyValueStores',
|
|
5
|
+
RequestQueue: 'requestQueues',
|
|
6
|
+
};
|
|
7
|
+
const parsedStoragesJson = new Map();
|
|
8
|
+
/**
|
|
9
|
+
* Resolves a {@link StorageIdentifier} to a plain string ID or name
|
|
10
|
+
* that can be passed to crawlee v4's `<Storage>.open()`.
|
|
11
|
+
*/
|
|
12
|
+
function resolveStorageIdentifier(storageType, identifier, config) {
|
|
13
|
+
if (identifier === null || identifier === undefined) {
|
|
14
|
+
return undefined;
|
|
15
|
+
}
|
|
16
|
+
if (typeof identifier === 'string') {
|
|
17
|
+
return identifier;
|
|
18
|
+
}
|
|
19
|
+
if ('id' in identifier) {
|
|
20
|
+
return identifier.id;
|
|
21
|
+
}
|
|
22
|
+
if ('name' in identifier) {
|
|
23
|
+
return identifier.name;
|
|
24
|
+
}
|
|
25
|
+
// { alias: string }
|
|
26
|
+
const storagesJson = config.actorStoragesJson;
|
|
27
|
+
if (config.isAtHome && storagesJson) {
|
|
28
|
+
let storages;
|
|
29
|
+
try {
|
|
30
|
+
if (!parsedStoragesJson.has(storagesJson)) {
|
|
31
|
+
parsedStoragesJson.set(storagesJson, JSON.parse(storagesJson));
|
|
32
|
+
}
|
|
33
|
+
storages = parsedStoragesJson.get(storagesJson);
|
|
34
|
+
}
|
|
35
|
+
catch {
|
|
36
|
+
throw new Error(`Failed to parse ACTOR_STORAGES_JSON environment variable: ${storagesJson}`);
|
|
37
|
+
}
|
|
38
|
+
const typeKey = STORAGE_TYPE_KEYS[storageType];
|
|
39
|
+
const resolvedId = storages[typeKey]?.[identifier.alias];
|
|
40
|
+
if (resolvedId) {
|
|
41
|
+
return resolvedId;
|
|
42
|
+
}
|
|
43
|
+
throw new Error(`Storage alias "${identifier.alias}" not found in ACTOR_STORAGES_JSON for storage type "${storageType}". ` +
|
|
44
|
+
`Available aliases: ${Object.keys(storages[typeKey] ?? {}).join(', ') || '(none)'}`);
|
|
45
|
+
}
|
|
46
|
+
// When using local storage, just use the alias as a name.
|
|
47
|
+
// When using platform storage, we can't just make up a name — the alias must be
|
|
48
|
+
// in ACTOR_STORAGES_JSON.
|
|
49
|
+
if (config.isAtHome) {
|
|
50
|
+
throw new Error(`Storage alias "${identifier.alias}" cannot be resolved because ACTOR_STORAGES_JSON is not set. ` +
|
|
51
|
+
`Aliases are only available for storages declared in the Actor's schema.`);
|
|
52
|
+
}
|
|
53
|
+
return identifier.alias;
|
|
54
|
+
}
|
|
55
|
+
/**
|
|
56
|
+
* Opens a storage by its identifier, handling Apify alias resolution and local purging.
|
|
57
|
+
*/
|
|
58
|
+
export async function openStorage(storageClass, identifier, context) {
|
|
59
|
+
const isAlias = identifier !== null && identifier !== undefined && typeof identifier === 'object' && 'alias' in identifier;
|
|
60
|
+
if (isAlias && !context.config.isAtHome && context.client instanceof ApifyStorageClient) {
|
|
61
|
+
throw new Error('The `alias` option is not allowed for Apify-based storages running outside of Apify');
|
|
62
|
+
}
|
|
63
|
+
const resolvedIdOrName = resolveStorageIdentifier(storageClass.name, identifier, context.config);
|
|
64
|
+
// When running locally, purge aliased storages on first open
|
|
65
|
+
// (similar to how crawlee purges default storages on start).
|
|
66
|
+
if (isAlias &&
|
|
67
|
+
!context.config.isAtHome &&
|
|
68
|
+
context.config.purgeOnStart &&
|
|
69
|
+
!context.purgedStorageAliases.has(identifier.alias)) {
|
|
70
|
+
context.purgedStorageAliases.add(identifier.alias);
|
|
71
|
+
const existingStorage = await storageClass.open(resolvedIdOrName ?? null, {
|
|
72
|
+
storageClient: context.client,
|
|
73
|
+
});
|
|
74
|
+
await existingStorage.drop();
|
|
75
|
+
}
|
|
76
|
+
return storageClass.open(resolvedIdOrName ?? null, {
|
|
77
|
+
storageClient: context.client,
|
|
78
|
+
});
|
|
79
|
+
}
|
package/dist/utils.d.ts
CHANGED