apify 3.7.3-beta.9 → 4.0.0-beta.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/actor.d.ts +19 -4
- package/dist/actor.js +207 -239
- package/dist/apify_storage_client.d.ts +54 -0
- package/dist/apify_storage_client.js +152 -0
- package/dist/charging.js +45 -122
- package/dist/configuration.d.ts +79 -141
- package/dist/configuration.js +117 -171
- package/dist/index.js +8 -22
- package/dist/input-schemas.js +12 -18
- package/dist/key_value_store.d.ts +8 -3
- package/dist/key_value_store.js +22 -21
- package/dist/platform_event_manager.d.ts +0 -5
- package/dist/platform_event_manager.js +18 -34
- package/dist/proxy_configuration.d.ts +26 -55
- package/dist/proxy_configuration.js +80 -174
- package/dist/storage.d.ts +6 -4
- package/dist/storage.js +17 -17
- package/dist/utils.d.ts +5 -0
- package/dist/utils.js +39 -23
- package/package.json +16 -15
- package/dist/index.mjs +0 -19
- package/dist/patched_apify_client.d.ts +0 -25
- package/dist/patched_apify_client.js +0 -70
|
@@ -1,21 +1,19 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
const
|
|
9
|
-
const utilities_1 = require("@apify/utilities");
|
|
10
|
-
const actor_js_1 = require("./actor.js");
|
|
11
|
-
const configuration_js_1 = require("./configuration.js");
|
|
12
|
-
// https://docs.apify.com/proxy/datacenter-proxy#username-parameters
|
|
13
|
-
const MAX_SESSION_ID_LENGTH = 50;
|
|
14
|
-
const CHECK_ACCESS_REQUEST_TIMEOUT_MILLIS = 4000;
|
|
1
|
+
import { ProxyConfiguration as CoreProxyConfiguration } from '@crawlee/core';
|
|
2
|
+
import { gotScraping } from 'got-scraping';
|
|
3
|
+
import ow from 'ow';
|
|
4
|
+
import { APIFY_ENV_VARS, APIFY_PROXY_VALUE_REGEX } from '@apify/consts';
|
|
5
|
+
import { cryptoRandomObjectId } from '@apify/utilities';
|
|
6
|
+
import { Actor } from './actor.js';
|
|
7
|
+
import { Configuration } from './configuration.js';
|
|
8
|
+
const CHECK_ACCESS_REQUEST_TIMEOUT_MILLIS = 4_000;
|
|
15
9
|
const CHECK_ACCESS_MAX_ATTEMPTS = 2;
|
|
16
10
|
const COUNTRY_CODE_REGEX = /^[A-Z]{2}$/;
|
|
17
11
|
// ISO 3166-2 subdivision codes are 1–3 uppercase alphanumeric characters, e.g. 'CA' (California), 'NSW' (New South Wales), '9' (Wien, AT-9)
|
|
18
12
|
const SUBDIVISION_CODE_REGEX = /^[A-Z0-9]{1,3}$/;
|
|
13
|
+
// Apify Proxy session identifier embedded in the proxy username — opaque to
|
|
14
|
+
// users; a fresh one is minted for every URL the SDK hands out so that the
|
|
15
|
+
// returned proxy URLs are independent.
|
|
16
|
+
const SESSION_ID_LENGTH = 12;
|
|
19
17
|
/**
|
|
20
18
|
* Configures connection to a proxy server with the provided options. Proxy servers are used to prevent target websites from blocking
|
|
21
19
|
* your crawlers based on IP address rate limits or blacklists. Setting proxy configuration in your crawlers automatically configures
|
|
@@ -49,86 +47,43 @@ const SUBDIVISION_CODE_REGEX = /^[A-Z0-9]{1,3}$/;
|
|
|
49
47
|
* ```
|
|
50
48
|
* @category Scaling
|
|
51
49
|
*/
|
|
52
|
-
class ProxyConfiguration extends
|
|
50
|
+
export class ProxyConfiguration extends CoreProxyConfiguration {
|
|
51
|
+
config;
|
|
52
|
+
groups;
|
|
53
|
+
countryCode;
|
|
54
|
+
subdivisionCode;
|
|
55
|
+
password;
|
|
56
|
+
hostname;
|
|
57
|
+
port;
|
|
58
|
+
usesApifyProxy;
|
|
53
59
|
/**
|
|
54
60
|
* @internal
|
|
55
61
|
*/
|
|
56
|
-
constructor(options = {}, config =
|
|
62
|
+
constructor(options = {}, config = Configuration.getGlobalConfig()) {
|
|
57
63
|
const { proxyUrls, newUrlFunction, ...rest } = options;
|
|
58
64
|
super({
|
|
59
65
|
proxyUrls,
|
|
60
66
|
newUrlFunction,
|
|
61
67
|
['validateRequired']: false,
|
|
62
68
|
});
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
writable: true,
|
|
73
|
-
value: void 0
|
|
74
|
-
});
|
|
75
|
-
Object.defineProperty(this, "countryCode", {
|
|
76
|
-
enumerable: true,
|
|
77
|
-
configurable: true,
|
|
78
|
-
writable: true,
|
|
79
|
-
value: void 0
|
|
80
|
-
});
|
|
81
|
-
Object.defineProperty(this, "subdivisionCode", {
|
|
82
|
-
enumerable: true,
|
|
83
|
-
configurable: true,
|
|
84
|
-
writable: true,
|
|
85
|
-
value: void 0
|
|
86
|
-
});
|
|
87
|
-
Object.defineProperty(this, "password", {
|
|
88
|
-
enumerable: true,
|
|
89
|
-
configurable: true,
|
|
90
|
-
writable: true,
|
|
91
|
-
value: void 0
|
|
92
|
-
});
|
|
93
|
-
Object.defineProperty(this, "hostname", {
|
|
94
|
-
enumerable: true,
|
|
95
|
-
configurable: true,
|
|
96
|
-
writable: true,
|
|
97
|
-
value: void 0
|
|
98
|
-
});
|
|
99
|
-
Object.defineProperty(this, "port", {
|
|
100
|
-
enumerable: true,
|
|
101
|
-
configurable: true,
|
|
102
|
-
writable: true,
|
|
103
|
-
value: void 0
|
|
104
|
-
});
|
|
105
|
-
Object.defineProperty(this, "usesApifyProxy", {
|
|
106
|
-
enumerable: true,
|
|
107
|
-
configurable: true,
|
|
108
|
-
writable: true,
|
|
109
|
-
value: void 0
|
|
110
|
-
});
|
|
111
|
-
(0, ow_1.default)(rest, ow_1.default.object.partialShape({
|
|
112
|
-
groups: ow_1.default.optional.array.ofType(ow_1.default.string.matches(consts_1.APIFY_PROXY_VALUE_REGEX)),
|
|
113
|
-
apifyProxyGroups: ow_1.default.optional.array.ofType(ow_1.default.string.matches(consts_1.APIFY_PROXY_VALUE_REGEX)),
|
|
114
|
-
countryCode: ow_1.default.optional.string.matches(COUNTRY_CODE_REGEX),
|
|
115
|
-
apifyProxyCountry: ow_1.default.optional.string.matches(COUNTRY_CODE_REGEX),
|
|
116
|
-
subdivisionCode: ow_1.default.optional.string.matches(SUBDIVISION_CODE_REGEX),
|
|
117
|
-
apifyProxySubdivision: ow_1.default.optional.string.matches(SUBDIVISION_CODE_REGEX),
|
|
118
|
-
password: ow_1.default.optional.string,
|
|
119
|
-
tieredProxyUrls: ow_1.default.optional.array.ofType(ow_1.default.array.ofType(ow_1.default.string)),
|
|
120
|
-
tieredProxyConfig: ow_1.default.optional.array.ofType(ow_1.default.object),
|
|
69
|
+
this.config = config;
|
|
70
|
+
ow(rest, ow.object.exactShape({
|
|
71
|
+
groups: ow.optional.array.ofType(ow.string.matches(APIFY_PROXY_VALUE_REGEX)),
|
|
72
|
+
apifyProxyGroups: ow.optional.array.ofType(ow.string.matches(APIFY_PROXY_VALUE_REGEX)),
|
|
73
|
+
countryCode: ow.optional.string.matches(COUNTRY_CODE_REGEX),
|
|
74
|
+
apifyProxyCountry: ow.optional.string.matches(COUNTRY_CODE_REGEX),
|
|
75
|
+
subdivisionCode: ow.optional.string.matches(SUBDIVISION_CODE_REGEX),
|
|
76
|
+
apifyProxySubdivision: ow.optional.string.matches(SUBDIVISION_CODE_REGEX),
|
|
77
|
+
password: ow.optional.string,
|
|
121
78
|
}));
|
|
122
|
-
const { groups = [], apifyProxyGroups = [], countryCode, apifyProxyCountry, subdivisionCode, apifyProxySubdivision, password = config.
|
|
123
|
-
this.tieredProxyUrls ?? (this.tieredProxyUrls = tieredProxyUrls);
|
|
124
|
-
if (tieredProxyConfig) {
|
|
125
|
-
this.tieredProxyUrls = this._generateTieredProxyUrls(tieredProxyConfig, options);
|
|
126
|
-
}
|
|
79
|
+
const { groups = [], apifyProxyGroups = [], countryCode, apifyProxyCountry, subdivisionCode, apifyProxySubdivision, password = config.proxyPassword, } = options;
|
|
127
80
|
const groupsToUse = groups.length ? groups : apifyProxyGroups;
|
|
128
81
|
const countryCodeToUse = countryCode || apifyProxyCountry;
|
|
129
82
|
const subdivisionCodeToUse = subdivisionCode || apifyProxySubdivision;
|
|
130
|
-
const hostname = config.
|
|
131
|
-
const port = config.
|
|
83
|
+
const hostname = config.proxyHostname;
|
|
84
|
+
const port = config.proxyPort;
|
|
85
|
+
// The Apify Proxy subdivision is expressed as part of the country
|
|
86
|
+
// username parameter (`country-US_CA`), so a country is required.
|
|
132
87
|
if (subdivisionCodeToUse && !countryCodeToUse) {
|
|
133
88
|
throw new Error('ProxyConfiguration: "subdivisionCode" requires "countryCode" to be set.');
|
|
134
89
|
}
|
|
@@ -157,9 +112,6 @@ class ProxyConfiguration extends core_1.ProxyConfiguration {
|
|
|
157
112
|
*
|
|
158
113
|
* You should use the {@link createProxyConfiguration} function to create a pre-initialized
|
|
159
114
|
* `ProxyConfiguration` instance instead of calling this manually.
|
|
160
|
-
*
|
|
161
|
-
* As part of the init process, we verify the configuration by checking the proxy status endpoint.
|
|
162
|
-
* This can make the init slower, to opt-out of this, use `checkAccess: false`.
|
|
163
115
|
*/
|
|
164
116
|
async initialize(options) {
|
|
165
117
|
if (this.usesApifyProxy) {
|
|
@@ -167,16 +119,16 @@ class ProxyConfiguration extends core_1.ProxyConfiguration {
|
|
|
167
119
|
await this._setPasswordIfToken();
|
|
168
120
|
}
|
|
169
121
|
if (!this.password) {
|
|
170
|
-
if (
|
|
171
|
-
throw new Error(`Apify Proxy password must be provided using options.password or the "${
|
|
172
|
-
`You can also provide your Apify token via the "${
|
|
173
|
-
`so that the SDK can fetch the proxy password from Apify API, when ${
|
|
122
|
+
if (Actor.isAtHome()) {
|
|
123
|
+
throw new Error(`Apify Proxy password must be provided using options.password or the "${APIFY_ENV_VARS.PROXY_PASSWORD}" environment variable. ` +
|
|
124
|
+
`You can also provide your Apify token via the "${APIFY_ENV_VARS.TOKEN}" environment variable, ` +
|
|
125
|
+
`so that the SDK can fetch the proxy password from Apify API, when ${APIFY_ENV_VARS.PROXY_PASSWORD} is not defined`);
|
|
174
126
|
}
|
|
175
127
|
else {
|
|
176
128
|
this.log.warning(`No proxy password or token detected, running without proxy. To use Apify Proxy locally, ` +
|
|
177
|
-
`provide options.password or "${
|
|
178
|
-
`You can also provide your Apify token via the "${
|
|
179
|
-
`so that the SDK can fetch the proxy password from Apify API, when ${
|
|
129
|
+
`provide options.password or "${APIFY_ENV_VARS.PROXY_PASSWORD}" environment variable. ` +
|
|
130
|
+
`You can also provide your Apify token via the "${APIFY_ENV_VARS.TOKEN}" environment variable, ` +
|
|
131
|
+
`so that the SDK can fetch the proxy password from Apify API, when ${APIFY_ENV_VARS.PROXY_PASSWORD} is not defined`);
|
|
180
132
|
}
|
|
181
133
|
}
|
|
182
134
|
if (options?.checkAccess !== false) {
|
|
@@ -186,104 +138,59 @@ class ProxyConfiguration extends core_1.ProxyConfiguration {
|
|
|
186
138
|
return true;
|
|
187
139
|
}
|
|
188
140
|
/**
|
|
189
|
-
*
|
|
190
|
-
*
|
|
191
|
-
*
|
|
192
|
-
* Use it if you want to work with a rich representation of a proxy URL.
|
|
193
|
-
* If you need the URL string only, use {@link ProxyConfiguration.newUrl}.
|
|
194
|
-
* @param [sessionId]
|
|
195
|
-
* Represents the identifier of user {@link Session} that can be managed by the {@link SessionPool} or
|
|
196
|
-
* you can use the Apify Proxy [Session](https://docs.apify.com/proxy#sessions) identifier.
|
|
197
|
-
* When the provided sessionId is a number, it's converted to a string. Property sessionId of
|
|
198
|
-
* {@link ProxyInfo} is always returned as a type string.
|
|
199
|
-
*
|
|
200
|
-
* All the HTTP requests going through the proxy with the same session identifier
|
|
201
|
-
* will use the same target proxy server (i.e. the same IP address).
|
|
202
|
-
* The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`.
|
|
203
|
-
* @return Represents information about used proxy and its configuration.
|
|
141
|
+
* Returns a new {@link ProxyInfo} object with a fresh proxy URL. Each call mints an
|
|
142
|
+
* independent URL; for Apify Proxy a random session id is embedded so consecutive
|
|
143
|
+
* calls resolve to different IPs.
|
|
204
144
|
*/
|
|
205
|
-
async newProxyInfo(
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
const
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
groups,
|
|
217
|
-
countryCode,
|
|
218
|
-
subdivisionCode,
|
|
219
|
-
// this.password is not encoded, but the password from the URL will be, we need to normalize
|
|
220
|
-
password: this.usesApifyProxy ? (password ?? '') : decodeURIComponent(password),
|
|
221
|
-
hostname,
|
|
222
|
-
port: port,
|
|
145
|
+
async newProxyInfo(options) {
|
|
146
|
+
const url = await this.newUrl(options);
|
|
147
|
+
if (!url)
|
|
148
|
+
return undefined;
|
|
149
|
+
const parsed = new URL(url);
|
|
150
|
+
const result = {
|
|
151
|
+
url,
|
|
152
|
+
username: decodeURIComponent(parsed.username),
|
|
153
|
+
password: decodeURIComponent(parsed.password),
|
|
154
|
+
hostname: parsed.hostname,
|
|
155
|
+
port: parsed.port,
|
|
223
156
|
};
|
|
157
|
+
if (this.usesApifyProxy) {
|
|
158
|
+
result.groups = this.groups;
|
|
159
|
+
if (this.countryCode !== undefined)
|
|
160
|
+
result.countryCode = this.countryCode;
|
|
161
|
+
if (this.subdivisionCode !== undefined)
|
|
162
|
+
result.subdivisionCode = this.subdivisionCode;
|
|
163
|
+
}
|
|
164
|
+
return result;
|
|
224
165
|
}
|
|
225
166
|
/**
|
|
226
|
-
* Returns a new proxy URL
|
|
227
|
-
*
|
|
228
|
-
*
|
|
229
|
-
* you can use the Apify Proxy [Session](https://docs.apify.com/proxy#sessions) identifier.
|
|
230
|
-
* When the provided sessionId is a number, it's converted to a string.
|
|
231
|
-
*
|
|
232
|
-
* All the HTTP requests going through the proxy with the same session identifier
|
|
233
|
-
* will use the same target proxy server (i.e. the same IP address).
|
|
234
|
-
* The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`.
|
|
235
|
-
* @return A string with a proxy URL, including authentication credentials and port number.
|
|
236
|
-
* For example, `http://bob:password123@proxy.example.com:8000`
|
|
167
|
+
* Returns a new proxy URL. For Apify Proxy, each call generates a URL with a fresh
|
|
168
|
+
* random session id, so consecutive calls return independent URLs. For custom
|
|
169
|
+
* `proxyUrls`, the URLs are rotated round-robin.
|
|
237
170
|
*/
|
|
238
|
-
async newUrl(
|
|
239
|
-
if (
|
|
240
|
-
|
|
241
|
-
(0, ow_1.default)(sessionId, ow_1.default.optional.string.maxLength(MAX_SESSION_ID_LENGTH).matches(consts_1.APIFY_PROXY_VALUE_REGEX));
|
|
242
|
-
if (this.newUrlFunction) {
|
|
243
|
-
return ((await this._callNewUrlFunction(sessionId, {
|
|
244
|
-
request: options?.request,
|
|
245
|
-
})) ?? undefined);
|
|
171
|
+
async newUrl(options) {
|
|
172
|
+
if (this.newUrlFunction || this.proxyUrls) {
|
|
173
|
+
return super.newUrl(options);
|
|
246
174
|
}
|
|
247
|
-
|
|
248
|
-
return this._handleCustomUrl(sessionId) ?? undefined;
|
|
249
|
-
}
|
|
250
|
-
if (this.tieredProxyUrls) {
|
|
251
|
-
return this._handleTieredUrl(sessionId ?? (0, utilities_1.cryptoRandomObjectId)(6), options).proxyUrl ?? undefined;
|
|
252
|
-
}
|
|
253
|
-
return this.composeDefaultUrl(sessionId);
|
|
254
|
-
}
|
|
255
|
-
_generateTieredProxyUrls(tieredProxyConfig, globalOptions) {
|
|
256
|
-
return tieredProxyConfig.map((config) => [
|
|
257
|
-
new ProxyConfiguration({
|
|
258
|
-
...globalOptions,
|
|
259
|
-
...config,
|
|
260
|
-
tieredProxyConfig: undefined,
|
|
261
|
-
}).composeDefaultUrl(),
|
|
262
|
-
]);
|
|
175
|
+
return this.composeDefaultUrl(cryptoRandomObjectId(SESSION_ID_LENGTH));
|
|
263
176
|
}
|
|
264
177
|
/**
|
|
265
178
|
* Returns proxy username.
|
|
266
179
|
*/
|
|
267
180
|
_getUsername(sessionId) {
|
|
268
|
-
let username;
|
|
269
181
|
const { groups, countryCode, subdivisionCode } = this;
|
|
270
182
|
const parts = [];
|
|
271
183
|
if (groups && groups.length) {
|
|
272
184
|
parts.push(`groups-${groups.join('+')}`);
|
|
273
185
|
}
|
|
274
|
-
|
|
275
|
-
parts.push(`session-${sessionId}`);
|
|
276
|
-
}
|
|
186
|
+
parts.push(`session-${sessionId}`);
|
|
277
187
|
if (subdivisionCode) {
|
|
278
188
|
parts.push(`country-${countryCode}_${subdivisionCode}`);
|
|
279
189
|
}
|
|
280
190
|
else if (countryCode) {
|
|
281
191
|
parts.push(`country-${countryCode}`);
|
|
282
192
|
}
|
|
283
|
-
|
|
284
|
-
if (parts.length === 0)
|
|
285
|
-
username = 'auto';
|
|
286
|
-
return username;
|
|
193
|
+
return parts.join(',');
|
|
287
194
|
}
|
|
288
195
|
composeDefaultUrl(sessionId) {
|
|
289
196
|
const username = this._getUsername(sessionId);
|
|
@@ -298,15 +205,15 @@ class ProxyConfiguration extends core_1.ProxyConfiguration {
|
|
|
298
205
|
*/
|
|
299
206
|
// TODO: Make this private
|
|
300
207
|
async _setPasswordIfToken() {
|
|
301
|
-
const token = this.config
|
|
208
|
+
const { token } = this.config;
|
|
302
209
|
if (!token)
|
|
303
210
|
return;
|
|
304
211
|
try {
|
|
305
|
-
const user = await
|
|
212
|
+
const user = await Actor.apifyClient.user().get();
|
|
306
213
|
this.password = user.proxy?.password;
|
|
307
214
|
}
|
|
308
215
|
catch (error) {
|
|
309
|
-
if (
|
|
216
|
+
if (Actor.isAtHome()) {
|
|
310
217
|
throw error;
|
|
311
218
|
}
|
|
312
219
|
else {
|
|
@@ -336,7 +243,7 @@ class ProxyConfiguration extends core_1.ProxyConfiguration {
|
|
|
336
243
|
// Throw only on the platform, locally we just print a warning and run requests without the proxy.
|
|
337
244
|
// This is because the user might not have set up things correctly yet.
|
|
338
245
|
// It still fails on the platform, where we don't want to allow this behavior.
|
|
339
|
-
if (
|
|
246
|
+
if (Actor.isAtHome()) {
|
|
340
247
|
throw new Error(connectionError);
|
|
341
248
|
}
|
|
342
249
|
this.log.warning(connectionError);
|
|
@@ -346,7 +253,7 @@ class ProxyConfiguration extends core_1.ProxyConfiguration {
|
|
|
346
253
|
* Apify Proxy can be down for a second or a minute, but this should not crash processes.
|
|
347
254
|
*/
|
|
348
255
|
async _fetchStatus() {
|
|
349
|
-
const proxyStatusUrl = this.config
|
|
256
|
+
const { proxyStatusUrl } = this.config;
|
|
350
257
|
const requestOpts = {
|
|
351
258
|
url: `${proxyStatusUrl}/?format=json`,
|
|
352
259
|
proxyUrl: await this.newUrl(),
|
|
@@ -355,7 +262,7 @@ class ProxyConfiguration extends core_1.ProxyConfiguration {
|
|
|
355
262
|
};
|
|
356
263
|
for (let attempt = 1; attempt <= CHECK_ACCESS_MAX_ATTEMPTS; attempt++) {
|
|
357
264
|
try {
|
|
358
|
-
const response = await
|
|
265
|
+
const response = await gotScraping(requestOpts);
|
|
359
266
|
return response.body;
|
|
360
267
|
}
|
|
361
268
|
catch {
|
|
@@ -375,4 +282,3 @@ class ProxyConfiguration extends core_1.ProxyConfiguration {
|
|
|
375
282
|
'"options.subdivisionCode" or "options.apifyProxySubdivision".');
|
|
376
283
|
}
|
|
377
284
|
}
|
|
378
|
-
exports.ProxyConfiguration = ProxyConfiguration;
|
package/dist/storage.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import type { IStorage } from '@crawlee/core';
|
|
2
|
-
import type {
|
|
1
|
+
import type { Constructor, IStorage, StorageOpenOptions } from '@crawlee/core';
|
|
2
|
+
import type { StorageClient } from '@crawlee/types';
|
|
3
3
|
import type { Configuration } from './configuration.js';
|
|
4
4
|
export interface OpenStorageOptions {
|
|
5
5
|
/**
|
|
@@ -51,6 +51,8 @@ export interface OpenStorageContext {
|
|
|
51
51
|
purgedStorageAliases: Set<string>;
|
|
52
52
|
}
|
|
53
53
|
/**
|
|
54
|
-
* Opens a storage by its identifier, handling alias resolution and local purging.
|
|
54
|
+
* Opens a storage by its identifier, handling Apify alias resolution and local purging.
|
|
55
55
|
*/
|
|
56
|
-
export declare function openStorage<T extends IStorage>(storageClass: Constructor<T
|
|
56
|
+
export declare function openStorage<T extends IStorage>(storageClass: Constructor<T> & {
|
|
57
|
+
open(id?: string | null, options?: StorageOpenOptions): Promise<T>;
|
|
58
|
+
}, identifier: StorageIdentifier | null | undefined, context: OpenStorageContext): Promise<T>;
|
package/dist/storage.js
CHANGED
|
@@ -1,8 +1,4 @@
|
|
|
1
|
-
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.openStorage = openStorage;
|
|
4
|
-
const core_1 = require("@crawlee/core");
|
|
5
|
-
const apify_client_1 = require("apify-client");
|
|
1
|
+
import { ApifyStorageClient } from './apify_storage_client.js';
|
|
6
2
|
const STORAGE_TYPE_KEYS = {
|
|
7
3
|
Dataset: 'datasets',
|
|
8
4
|
KeyValueStore: 'keyValueStores',
|
|
@@ -11,7 +7,7 @@ const STORAGE_TYPE_KEYS = {
|
|
|
11
7
|
const parsedStoragesJson = new Map();
|
|
12
8
|
/**
|
|
13
9
|
* Resolves a {@link StorageIdentifier} to a plain string ID or name
|
|
14
|
-
* that can be passed to
|
|
10
|
+
* that can be passed to crawlee v4's `<Storage>.open()`.
|
|
15
11
|
*/
|
|
16
12
|
function resolveStorageIdentifier(storageType, identifier, config) {
|
|
17
13
|
if (identifier === null || identifier === undefined) {
|
|
@@ -27,8 +23,8 @@ function resolveStorageIdentifier(storageType, identifier, config) {
|
|
|
27
23
|
return identifier.name;
|
|
28
24
|
}
|
|
29
25
|
// { alias: string }
|
|
30
|
-
const storagesJson = config.
|
|
31
|
-
if (config.
|
|
26
|
+
const storagesJson = config.actorStoragesJson;
|
|
27
|
+
if (config.isAtHome && storagesJson) {
|
|
32
28
|
let storages;
|
|
33
29
|
try {
|
|
34
30
|
if (!parsedStoragesJson.has(storagesJson)) {
|
|
@@ -50,30 +46,34 @@ function resolveStorageIdentifier(storageType, identifier, config) {
|
|
|
50
46
|
// When using local storage, just use the alias as a name.
|
|
51
47
|
// When using platform storage, we can't just make up a name — the alias must be
|
|
52
48
|
// in ACTOR_STORAGES_JSON.
|
|
53
|
-
if (config.
|
|
49
|
+
if (config.isAtHome) {
|
|
54
50
|
throw new Error(`Storage alias "${identifier.alias}" cannot be resolved because ACTOR_STORAGES_JSON is not set. ` +
|
|
55
51
|
`Aliases are only available for storages declared in the Actor's schema.`);
|
|
56
52
|
}
|
|
57
53
|
return identifier.alias;
|
|
58
54
|
}
|
|
59
55
|
/**
|
|
60
|
-
* Opens a storage by its identifier, handling alias resolution and local purging.
|
|
56
|
+
* Opens a storage by its identifier, handling Apify alias resolution and local purging.
|
|
61
57
|
*/
|
|
62
|
-
async function openStorage(storageClass, identifier, context) {
|
|
58
|
+
export async function openStorage(storageClass, identifier, context) {
|
|
63
59
|
const isAlias = identifier !== null && identifier !== undefined && typeof identifier === 'object' && 'alias' in identifier;
|
|
64
|
-
if (isAlias && !context.config.
|
|
60
|
+
if (isAlias && !context.config.isAtHome && context.client instanceof ApifyStorageClient) {
|
|
65
61
|
throw new Error('The `alias` option is not allowed for Apify-based storages running outside of Apify');
|
|
66
62
|
}
|
|
67
63
|
const resolvedIdOrName = resolveStorageIdentifier(storageClass.name, identifier, context.config);
|
|
68
64
|
// When running locally, purge aliased storages on first open
|
|
69
|
-
// (similar to how
|
|
65
|
+
// (similar to how crawlee purges default storages on start).
|
|
70
66
|
if (isAlias &&
|
|
71
|
-
!context.config.
|
|
72
|
-
context.config.
|
|
67
|
+
!context.config.isAtHome &&
|
|
68
|
+
context.config.purgeOnStart &&
|
|
73
69
|
!context.purgedStorageAliases.has(identifier.alias)) {
|
|
74
70
|
context.purgedStorageAliases.add(identifier.alias);
|
|
75
|
-
const existingStorage = await
|
|
71
|
+
const existingStorage = await storageClass.open(resolvedIdOrName ?? null, {
|
|
72
|
+
storageClient: context.client,
|
|
73
|
+
});
|
|
76
74
|
await existingStorage.drop();
|
|
77
75
|
}
|
|
78
|
-
return
|
|
76
|
+
return storageClass.open(resolvedIdOrName ?? null, {
|
|
77
|
+
storageClient: context.client,
|
|
78
|
+
});
|
|
79
79
|
}
|
package/dist/utils.d.ts
CHANGED
|
@@ -13,3 +13,8 @@ export declare function getSystemInfo(): {
|
|
|
13
13
|
* @internal
|
|
14
14
|
*/
|
|
15
15
|
export declare function checkCrawleeVersion(): void;
|
|
16
|
+
/**
|
|
17
|
+
* Prints a warning if this version of Apify SDK is outdated.
|
|
18
|
+
* @ignore
|
|
19
|
+
*/
|
|
20
|
+
export declare function printOutdatedSdkWarning(): void;
|
package/dist/utils.js
CHANGED
|
@@ -1,33 +1,36 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
const node_os_1 = require("node:os");
|
|
6
|
-
const node_path_1 = require("node:path");
|
|
1
|
+
import { existsSync } from 'node:fs';
|
|
2
|
+
import { createRequire } from 'node:module';
|
|
3
|
+
import { type } from 'node:os';
|
|
4
|
+
import { normalize } from 'node:path';
|
|
7
5
|
// @ts-ignore if we enable resolveJsonModule, we end up with `src` folder in `dist`
|
|
8
|
-
|
|
6
|
+
import crawleePkgJson from '@crawlee/core/package.json' with { type: 'json' };
|
|
9
7
|
// @ts-ignore if we enable resolveJsonModule, we end up with `src` folder in `dist`
|
|
10
|
-
|
|
11
|
-
|
|
8
|
+
import apifyClientPkgJson from 'apify-client/package.json' with { type: 'json' };
|
|
9
|
+
// eslint-disable-next-line import/extensions
|
|
10
|
+
import { readJSONSync } from 'fs-extra/esm';
|
|
11
|
+
import semver from 'semver';
|
|
12
|
+
import { APIFY_ENV_VARS } from '@apify/consts';
|
|
13
|
+
import log from '@apify/log';
|
|
12
14
|
// @ts-ignore if we enable resolveJsonModule, we end up with `src` folder in `dist`
|
|
13
|
-
|
|
15
|
+
import apifyPkgJson from '../package.json' with { type: 'json' };
|
|
16
|
+
const require = createRequire(import.meta.url);
|
|
14
17
|
/**
|
|
15
18
|
* Gets info about system, node version and apify package version.
|
|
16
19
|
* @internal
|
|
17
20
|
*/
|
|
18
|
-
function getSystemInfo() {
|
|
21
|
+
export function getSystemInfo() {
|
|
19
22
|
return {
|
|
20
|
-
apifyVersion:
|
|
21
|
-
apifyClientVersion:
|
|
22
|
-
crawleeVersion:
|
|
23
|
-
osType:
|
|
23
|
+
apifyVersion: apifyPkgJson.version,
|
|
24
|
+
apifyClientVersion: apifyClientPkgJson.version,
|
|
25
|
+
crawleeVersion: crawleePkgJson.version,
|
|
26
|
+
osType: type(),
|
|
24
27
|
nodeVersion: process.version,
|
|
25
28
|
};
|
|
26
29
|
}
|
|
27
30
|
/**
|
|
28
31
|
* @internal
|
|
29
32
|
*/
|
|
30
|
-
function checkCrawleeVersion() {
|
|
33
|
+
export function checkCrawleeVersion() {
|
|
31
34
|
const resolve = (name) => {
|
|
32
35
|
try {
|
|
33
36
|
return require.resolve(name);
|
|
@@ -38,28 +41,41 @@ function checkCrawleeVersion() {
|
|
|
38
41
|
};
|
|
39
42
|
const paths = [
|
|
40
43
|
// when users install `crawlee` package, we need to check its core dependency
|
|
41
|
-
|
|
44
|
+
normalize(`${process.cwd()}/node_modules/crawlee/node_modules/@crawlee/core/package.json`),
|
|
42
45
|
// when users install `@crawlee/cheerio` or other crawler package, we need to check the dependency under basic crawler package
|
|
43
|
-
|
|
46
|
+
normalize(`${process.cwd()}/node_modules/@crawlee/basic/node_modules/@crawlee/core/package.json`),
|
|
44
47
|
// also check paths via `require.resolve` to support pnpm
|
|
45
48
|
resolve('crawlee/package.json'),
|
|
46
49
|
resolve('@crawlee/basic/package.json'),
|
|
47
50
|
];
|
|
48
51
|
for (const path of paths) {
|
|
49
52
|
// ignore unresolved paths or paths that are not in the project directory
|
|
50
|
-
if (!(
|
|
53
|
+
if (!existsSync(path) || !path.startsWith(process.cwd())) {
|
|
51
54
|
continue;
|
|
52
55
|
}
|
|
53
56
|
let version;
|
|
54
57
|
try {
|
|
55
|
-
version =
|
|
58
|
+
version = readJSONSync(path).version;
|
|
56
59
|
}
|
|
57
60
|
catch {
|
|
58
61
|
//
|
|
59
62
|
}
|
|
60
|
-
if (version != null && version !==
|
|
61
|
-
const details = `User installed version (${version}) found in ${path}.\nSDK uses ${
|
|
62
|
-
throw new Error(`Detected incompatible Crawlee version used by the SDK. User installed ${version} but the SDK uses ${
|
|
63
|
+
if (version != null && version !== crawleePkgJson.version) {
|
|
64
|
+
const details = `User installed version (${version}) found in ${path}.\nSDK uses ${crawleePkgJson.version} from ${require.resolve('@crawlee/core')}`;
|
|
65
|
+
throw new Error(`Detected incompatible Crawlee version used by the SDK. User installed ${version} but the SDK uses ${crawleePkgJson.version}.\n\n${details}`);
|
|
63
66
|
}
|
|
64
67
|
}
|
|
65
68
|
}
|
|
69
|
+
/**
|
|
70
|
+
* Prints a warning if this version of Apify SDK is outdated.
|
|
71
|
+
* @ignore
|
|
72
|
+
*/
|
|
73
|
+
export function printOutdatedSdkWarning() {
|
|
74
|
+
if (process.env[APIFY_ENV_VARS.DISABLE_OUTDATED_WARNING])
|
|
75
|
+
return;
|
|
76
|
+
const latestApifyVersion = process.env[APIFY_ENV_VARS.SDK_LATEST_VERSION];
|
|
77
|
+
if (!latestApifyVersion || !semver.lt(apifyPkgJson.version, latestApifyVersion))
|
|
78
|
+
return;
|
|
79
|
+
log.warning(`You are using an outdated version (${apifyPkgJson.version}) of Apify SDK. We recommend you to update to the latest version (${latestApifyVersion}).
|
|
80
|
+
Read more about Apify SDK versioning at: https://help.apify.com/en/articles/3184510-updates-and-versioning-of-apify-sdk`);
|
|
81
|
+
}
|
package/package.json
CHANGED
|
@@ -1,18 +1,17 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "apify",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "4.0.0-beta.13",
|
|
4
4
|
"description": "The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.",
|
|
5
5
|
"engines": {
|
|
6
|
-
"node": ">=
|
|
6
|
+
"node": ">=22.0.0"
|
|
7
7
|
},
|
|
8
|
+
"type": "module",
|
|
8
9
|
"main": "./dist/index.js",
|
|
9
|
-
"module": "./dist/index.mjs",
|
|
10
10
|
"types": "./dist/index.d.ts",
|
|
11
11
|
"exports": {
|
|
12
12
|
".": {
|
|
13
|
-
"
|
|
14
|
-
"
|
|
15
|
-
"types": "./dist/index.d.ts"
|
|
13
|
+
"types": "./dist/index.d.ts",
|
|
14
|
+
"default": "./dist/index.js"
|
|
16
15
|
},
|
|
17
16
|
"./package.json": "./package.json"
|
|
18
17
|
},
|
|
@@ -59,20 +58,23 @@
|
|
|
59
58
|
"@apify/log": "^2.4.3",
|
|
60
59
|
"@apify/timeout": "^0.3.0",
|
|
61
60
|
"@apify/utilities": "^2.13.0",
|
|
62
|
-
"@crawlee/core": "^
|
|
63
|
-
"@crawlee/types": "^
|
|
64
|
-
"@crawlee/utils": "^
|
|
61
|
+
"@crawlee/core": "^4.0.0-beta.61",
|
|
62
|
+
"@crawlee/types": "^4.0.0-beta.61",
|
|
63
|
+
"@crawlee/utils": "^4.0.0-beta.61",
|
|
65
64
|
"apify-client": "^2.17.0",
|
|
66
65
|
"fs-extra": "^11.2.0",
|
|
67
|
-
"
|
|
66
|
+
"got-scraping": "^4.2.1",
|
|
67
|
+
"ow": "^2.0.0",
|
|
68
68
|
"semver": "^7.5.4",
|
|
69
69
|
"tslib": "^2.6.2",
|
|
70
|
-
"ws": "^8.18.0"
|
|
70
|
+
"ws": "^8.18.0",
|
|
71
|
+
"zod": "^3.24.0 || ^4.0.0"
|
|
71
72
|
},
|
|
72
73
|
"devDependencies": {
|
|
73
74
|
"@apify/oxlint-config": "^0.2.5",
|
|
74
75
|
"@apify/tsconfig": "^0.1.2",
|
|
75
76
|
"@commitlint/config-conventional": "^21.0.0",
|
|
77
|
+
"@crawlee/memory-storage": "^4.0.0-beta.61",
|
|
76
78
|
"@playwright/browser-chromium": "^1.60.0",
|
|
77
79
|
"@types/content-type": "^1.1.8",
|
|
78
80
|
"@types/fs-extra": "^11.0.4",
|
|
@@ -81,13 +83,12 @@
|
|
|
81
83
|
"@types/tough-cookie": "^4.0.5",
|
|
82
84
|
"@types/ws": "^8.5.12",
|
|
83
85
|
"commitlint": "^21.0.0",
|
|
84
|
-
"crawlee": "^
|
|
85
|
-
"gen-esm-wrapper": "^1.1.3",
|
|
86
|
+
"crawlee": "^4.0.0-beta.61",
|
|
86
87
|
"globby": "^16.0.0",
|
|
87
88
|
"husky": "^9.1.7",
|
|
88
89
|
"lint-staged": "^17.0.0",
|
|
89
90
|
"oxfmt": "0.52.0",
|
|
90
|
-
"oxlint": "1.
|
|
91
|
+
"oxlint": "1.67.0",
|
|
91
92
|
"oxlint-tsgolint": "0.22.0",
|
|
92
93
|
"playwright": "^1.60.0",
|
|
93
94
|
"puppeteer": "^25.0.0",
|
|
@@ -106,7 +107,7 @@
|
|
|
106
107
|
},
|
|
107
108
|
"scripts": {
|
|
108
109
|
"clean": "rimraf ./dist ./tsconfig.build.tsbuildinfo",
|
|
109
|
-
"compile": "tsc -p tsconfig.build.json
|
|
110
|
+
"compile": "tsc -p tsconfig.build.json",
|
|
110
111
|
"fixApifyExport": "node ./scripts/temp_fix_apify_exports.mjs",
|
|
111
112
|
"build": "pnpm clean && pnpm compile && pnpm fixApifyExport",
|
|
112
113
|
"ci:build": "pnpm build",
|