apify 3.1.2-beta.44 → 3.1.2-beta.63

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,395 +0,0 @@
1
- import ow from 'ow';
2
- import { APIFY_PROXY_VALUE_REGEX, ENV_VARS } from '@apify/consts';
3
- import type {
4
- ProxyConfigurationOptions as CoreProxyConfigurationOptions,
5
- ProxyInfo as CoreProxyInfo,
6
- } from '@crawlee/core';
7
- import {
8
- ProxyConfiguration as CoreProxyConfiguration,
9
- } from '@crawlee/core';
10
- import { gotScraping } from 'got-scraping';
11
- import { Actor } from './actor';
12
- import { Configuration } from './configuration';
13
-
14
- // https://docs.apify.com/proxy/datacenter-proxy#username-parameters
15
- const MAX_SESSION_ID_LENGTH = 50;
16
- const CHECK_ACCESS_REQUEST_TIMEOUT_MILLIS = 4_000;
17
- const CHECK_ACCESS_MAX_ATTEMPTS = 2;
18
- const COUNTRY_CODE_REGEX = /^[A-Z]{2}$/;
19
-
20
- export interface ProxyConfigurationOptions extends CoreProxyConfigurationOptions {
21
- /**
22
- * User's password for the proxy. By default, it is taken from the `APIFY_PROXY_PASSWORD`
23
- * environment variable, which is automatically set by the system when running the actors on the Apify cloud, or when using the [Apify CLI](https://github.com/apify/apify-cli).
24
- */
25
- password?: string;
26
-
27
- /**
28
- * An array of proxy groups to be used by the [Apify Proxy](https://docs.apify.com/proxy).
29
- * If not provided, the proxy will select the groups automatically.
30
- */
31
- groups?: string[];
32
-
33
- /**
34
- * If set and relevant proxies are available in your Apify account, all proxied requests will
35
- * use IP addresses that are geolocated to the specified country. For example `GB` for IPs
36
- * from Great Britain. Note that online services often have their own rules for handling
37
- * geolocation and thus the country selection is a best attempt at geolocation, rather than
38
- * a guaranteed hit. This parameter is optional, by default, each proxied request is assigned
39
- * an IP address from a random country. The country code needs to be a two letter ISO country code. See the
40
- * [full list of available country codes](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Officially_assigned_code_elements).
41
- * This parameter is optional, by default, the proxy uses all available proxy servers from all countries.
42
- * The value is validated and must consist of exactly two uppercase letters (for example `US`).
43
- */
44
- countryCode?: string;
45
-
46
- /**
47
- * Same option as `groups` which can be used to
48
- * configure the proxy via the UI input schema. You should use the `groups` option in your crawler code.
49
- */
50
- apifyProxyGroups?: string[];
51
-
52
- /**
53
- * Same option as `countryCode` which can be used to
54
- * configure the proxy via the UI input schema. You should use the `countryCode` option in your crawler code.
55
- */
56
- apifyProxyCountry?: string;
57
- }
58
-
59
- /**
60
- * The main purpose of the ProxyInfo object is to provide information
61
- * about the current proxy connection used by the crawler for the request.
62
- * Outside of crawlers, you can get this object by calling {@apilink ProxyConfiguration.newProxyInfo}.
63
- *
64
- * **Example usage:**
65
- *
66
- * ```javascript
67
- *
68
- * const proxyConfiguration = await Actor.createProxyConfiguration({
69
- * groups: ['GROUP1', 'GROUP2'], // List of Apify Proxy groups
70
- * countryCode: 'US',
71
- * });
72
- *
73
- * // Getting proxyInfo object by calling class method directly
74
- * const proxyInfo = proxyConfiguration.newProxyInfo();
75
- *
76
- * // In crawler
77
- * const crawler = new CheerioCrawler({
78
- * // ...
79
- * proxyConfiguration,
80
- * requestHandler({ proxyInfo }) {
81
- * // Getting used proxy URL
82
- * const proxyUrl = proxyInfo.url;
83
- *
84
- * // Getting ID of used Session
85
- * const sessionIdentifier = proxyInfo.sessionId;
86
- * }
87
- * })
88
- *
89
- * ```
90
- */
91
- export interface ProxyInfo extends CoreProxyInfo {
92
- /**
93
- * An array of proxy groups to be used by the [Apify Proxy](https://docs.apify.com/proxy).
94
- * If not provided, the proxy will select the groups automatically.
95
- */
96
- groups: string[];
97
-
98
- /**
99
- * If set and relevant proxies are available in your Apify account, all proxied requests will
100
- * use IP addresses that are geolocated to the specified country. For example `GB` for IPs
101
- * from Great Britain. Note that online services often have their own rules for handling
102
- * geolocation and thus the country selection is a best attempt at geolocation, rather than
103
- * a guaranteed hit. This parameter is optional, by default, each proxied request is assigned
104
- * an IP address from a random country. The country code needs to be a two letter ISO country code. See the
105
- * [full list of available country codes](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Officially_assigned_code_elements).
106
- * This parameter is optional, by default, the proxy uses all available proxy servers from all countries.
107
- */
108
- countryCode?: string;
109
-
110
- /**
111
- * User's password for the proxy. By default, it is taken from the `APIFY_PROXY_PASSWORD`
112
- * environment variable, which is automatically set by the system when running the actors
113
- * on the Apify cloud, or when using the [Apify CLI](https://github.com/apify/apify-cli).
114
- */
115
- password: string;
116
- }
117
-
118
- /**
119
- * Configures connection to a proxy server with the provided options. Proxy servers are used to prevent target websites from blocking
120
- * your crawlers based on IP address rate limits or blacklists. Setting proxy configuration in your crawlers automatically configures
121
- * them to use the selected proxies for all connections. You can get information about the currently used proxy by inspecting
122
- * the {@apilink ProxyInfo} property in your crawler's page function. There, you can inspect the proxy's URL and other attributes.
123
- *
124
- * The proxy servers are managed by [Apify Proxy](https://docs.apify.com/proxy). To be able to use Apify Proxy,
125
- * you need an Apify account and access to the selected proxies. If you provide no configuration option,
126
- * the proxies will be managed automatically using a smart algorithm.
127
- *
128
- * If you want to use your own proxies, use the {@apilink ProxyConfigurationOptions.proxyUrls} option. Your list of proxy URLs will
129
- * be rotated by the configuration if this option is provided.
130
- *
131
- * **Example usage:**
132
- *
133
- * ```javascript
134
- *
135
- * const proxyConfiguration = await Actor.createProxyConfiguration({
136
- * groups: ['GROUP1', 'GROUP2'], // List of Apify Proxy groups
137
- * countryCode: 'US',
138
- * });
139
- *
140
- * const crawler = new CheerioCrawler({
141
- * // ...
142
- * proxyConfiguration,
143
- * requestHandler({ proxyInfo }) {
144
- * const usedProxyUrl = proxyInfo.url; // Getting the proxy URL
145
- * }
146
- * })
147
- *
148
- * ```
149
- * @category Scaling
150
- */
151
- export class ProxyConfiguration extends CoreProxyConfiguration {
152
- private groups: string[];
153
- private countryCode?: string;
154
- private password?: string;
155
- private hostname: string;
156
- private port?: number;
157
- private usesApifyProxy?: boolean;
158
-
159
- /**
160
- * @internal
161
- */
162
- constructor(options: ProxyConfigurationOptions = {}, readonly config = Configuration.getGlobalConfig()) {
163
- const { proxyUrls, newUrlFunction, ...rest } = options;
164
- super({ proxyUrls, newUrlFunction, ['validateRequired' as string]: false });
165
- ow(rest, ow.object.exactShape({
166
- groups: ow.optional.array.ofType(ow.string.matches(APIFY_PROXY_VALUE_REGEX)),
167
- apifyProxyGroups: ow.optional.array.ofType(ow.string.matches(APIFY_PROXY_VALUE_REGEX)),
168
- countryCode: ow.optional.string.matches(COUNTRY_CODE_REGEX),
169
- apifyProxyCountry: ow.optional.string.matches(COUNTRY_CODE_REGEX),
170
- password: ow.optional.string,
171
- }));
172
-
173
- const {
174
- groups = [],
175
- apifyProxyGroups = [],
176
- countryCode,
177
- apifyProxyCountry,
178
- password = config.get('proxyPassword'),
179
- } = options;
180
-
181
- const groupsToUse = groups.length ? groups : apifyProxyGroups;
182
- const countryCodeToUse = countryCode || apifyProxyCountry;
183
- const hostname = config.get('proxyHostname');
184
- const port = config.get('proxyPort');
185
-
186
- // Validation
187
- if (((proxyUrls || newUrlFunction) && ((groupsToUse.length) || countryCodeToUse))) {
188
- this._throwCannotCombineCustomWithApify();
189
- }
190
- if (proxyUrls && newUrlFunction) this._throwCannotCombineCustomMethods();
191
-
192
- this.groups = groupsToUse;
193
- this.countryCode = countryCodeToUse;
194
- this.password = password;
195
- this.hostname = hostname!;
196
- this.port = port;
197
- this.usesApifyProxy = !this.proxyUrls && !this.newUrlFunction;
198
-
199
- if (proxyUrls && proxyUrls.some((url) => url.includes('apify.com'))) {
200
- this.log.warning(
201
- 'Some Apify proxy features may work incorrectly. Please consider setting up Apify properties instead of `proxyUrls`.\n'
202
- + 'See https://sdk.apify.com/docs/guides/proxy-management#apify-proxy-configuration',
203
- );
204
- }
205
- }
206
-
207
- /**
208
- * Loads proxy password if token is provided and checks access to Apify Proxy and provided proxy groups
209
- * if Apify Proxy configuration is used.
210
- * Also checks if country has access to Apify Proxy groups if the country code is provided.
211
- *
212
- * You should use the {@apilink createProxyConfiguration} function to create a pre-initialized
213
- * `ProxyConfiguration` instance instead of calling this manually.
214
- */
215
- async initialize(): Promise<void> {
216
- if (this.usesApifyProxy) {
217
- await this._setPasswordIfToken();
218
- await this._checkAccess();
219
- }
220
- }
221
-
222
- /**
223
- * This function creates a new {@apilink ProxyInfo} object.
224
- * It is used by CheerioCrawler and PuppeteerCrawler to generate proxy URLs and also to allow the user to inspect
225
- * the currently used proxy via the requestHandler parameter `proxyInfo`.
226
- * Use it if you want to work with a rich representation of a proxy URL.
227
- * If you need the URL string only, use {@apilink ProxyConfiguration.newUrl}.
228
- * @param [sessionId]
229
- * Represents the identifier of user {@apilink Session} that can be managed by the {@apilink SessionPool} or
230
- * you can use the Apify Proxy [Session](https://docs.apify.com/proxy#sessions) identifier.
231
- * When the provided sessionId is a number, it's converted to a string. Property sessionId of
232
- * {@apilink ProxyInfo} is always returned as a type string.
233
- *
234
- * All the HTTP requests going through the proxy with the same session identifier
235
- * will use the same target proxy server (i.e. the same IP address).
236
- * The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`.
237
- * @return Represents information about used proxy and its configuration.
238
- */
239
- override async newProxyInfo(sessionId?: string | number): Promise<ProxyInfo> {
240
- if (typeof sessionId === 'number') sessionId = `${sessionId}`;
241
- ow(sessionId, ow.optional.string.maxLength(MAX_SESSION_ID_LENGTH).matches(APIFY_PROXY_VALUE_REGEX));
242
- const url = await this.newUrl(sessionId);
243
-
244
- const { groups, countryCode, password, port, hostname } = (this.usesApifyProxy ? this : new URL(url)) as ProxyConfiguration;
245
-
246
- return {
247
- sessionId,
248
- url,
249
- groups,
250
- countryCode,
251
- password: password ?? '',
252
- hostname,
253
- port: port!,
254
- };
255
- }
256
-
257
- /**
258
- * Returns a new proxy URL based on provided configuration options and the `sessionId` parameter.
259
- * @param [sessionId]
260
- * Represents the identifier of user {@apilink Session} that can be managed by the {@apilink SessionPool} or
261
- * you can use the Apify Proxy [Session](https://docs.apify.com/proxy#sessions) identifier.
262
- * When the provided sessionId is a number, it's converted to a string.
263
- *
264
- * All the HTTP requests going through the proxy with the same session identifier
265
- * will use the same target proxy server (i.e. the same IP address).
266
- * The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`.
267
- * @return A string with a proxy URL, including authentication credentials and port number.
268
- * For example, `http://bob:password123@proxy.example.com:8000`
269
- */
270
- override async newUrl(sessionId?: string | number): Promise<string> {
271
- if (typeof sessionId === 'number') sessionId = `${sessionId}`;
272
- ow(sessionId, ow.optional.string.maxLength(MAX_SESSION_ID_LENGTH).matches(APIFY_PROXY_VALUE_REGEX));
273
- if (this.newUrlFunction) {
274
- return this._callNewUrlFunction(sessionId)!;
275
- }
276
- if (this.proxyUrls) {
277
- return this._handleCustomUrl(sessionId);
278
- }
279
- const username = this._getUsername(sessionId);
280
- const { password, hostname, port } = this;
281
-
282
- return `http://${username}:${password}@${hostname}:${port}`;
283
- }
284
-
285
- /**
286
- * Returns proxy username.
287
- */
288
- protected _getUsername(sessionId?: string): string {
289
- let username;
290
- const { groups, countryCode } = this;
291
- const parts: string[] = [];
292
-
293
- if (groups && groups.length) {
294
- parts.push(`groups-${groups.join('+')}`);
295
- }
296
- if (sessionId) {
297
- parts.push(`session-${sessionId}`);
298
- }
299
- if (countryCode) {
300
- parts.push(`country-${countryCode}`);
301
- }
302
-
303
- username = parts.join(',');
304
-
305
- if (parts.length === 0) username = 'auto';
306
-
307
- return username;
308
- }
309
-
310
- /**
311
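- * If an Apify token is available in the configuration, fetches the proxy password via the API and stores it on this instance when no password was provided (a warning is logged when a provided password differs). Throws if no password is available afterwards.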
312
- */
313
- protected async _setPasswordIfToken(): Promise<void> {
314
- const token = this.config.get('token');
315
-
316
- if (token) {
317
- const { proxy } = await Actor.apifyClient.user().get();
318
- const { password } = proxy!;
319
-
320
- if (this.password) {
321
- if (this.password !== password) {
322
- this.log.warning('The Apify Proxy password you provided belongs to'
323
- + ' a different user than the Apify token you are using. Are you sure this is correct?');
324
- }
325
- } else {
326
- this.password = password;
327
- }
328
- }
329
-
330
- if (!this.password) {
331
- throw new Error(`Apify Proxy password must be provided using options.password or the "${ENV_VARS.PROXY_PASSWORD}" environment variable. `
332
- + `If you add the "${ENV_VARS.TOKEN}" environment variable, the password will be automatically inferred.`);
333
- }
334
- }
335
-
336
- /**
337
- * Checks whether the user has access to the proxies specified in the provided ProxyConfigurationOptions.
338
- * If the check can not be made, it only prints a warning and allows the program to continue. This is to
339
- * prevent program crashes caused by short downtimes of Proxy.
340
- */
341
- protected async _checkAccess(): Promise<void> {
342
- const status = await this._fetchStatus();
343
- if (status) {
344
- const { connected, connectionError, isManInTheMiddle } = status;
345
- this.isManInTheMiddle = isManInTheMiddle;
346
-
347
- if (!connected) this._throwApifyProxyConnectionError(connectionError);
348
- } else {
349
- this.log.warning('Apify Proxy access check timed out. Watch out for errors with status code 407. '
350
- + 'If you see some, it most likely means you don\'t have access to either all or some of the proxies you\'re trying to use.');
351
- }
352
- }
353
-
354
- /**
355
- * Apify Proxy can be down for a second or a minute, but this should not crash processes.
356
- */
357
- protected async _fetchStatus(): Promise<{ connected: boolean; connectionError: string; isManInTheMiddle: boolean } | undefined> {
358
- const proxyStatusUrl = this.config.get('proxyStatusUrl', 'http://proxy.apify.com');
359
- const requestOpts = {
360
- url: `${proxyStatusUrl}/?format=json`,
361
- proxyUrl: await this.newUrl(),
362
- timeout: { request: CHECK_ACCESS_REQUEST_TIMEOUT_MILLIS },
363
- responseType: 'json',
364
- } as const;
365
-
366
- for (let attempt = 1; attempt <= CHECK_ACCESS_MAX_ATTEMPTS; attempt++) {
367
- try {
368
- const response = await gotScraping<{ connected: boolean; connectionError: string; isManInTheMiddle: boolean }>(requestOpts);
369
- return response.body;
370
- } catch {
371
- // retry connection errors
372
- }
373
- }
374
-
375
- return undefined;
376
- }
377
-
378
- /**
379
- * Throws an error with the given message when Apify Proxy is not connected.
380
- * @internal
381
- */
382
- protected _throwApifyProxyConnectionError(errorMessage: string) {
383
- throw new Error(errorMessage);
384
- }
385
-
386
- /**
387
- * Throws an error stating that custom proxies cannot be combined with Apify Proxy options.
388
- * @internal
389
- */
390
- protected _throwCannotCombineCustomWithApify() {
391
- throw new Error('Cannot combine custom proxies with Apify Proxy!'
392
- + 'It is not allowed to set "options.proxyUrls" or "options.newUrlFunction" combined with '
393
- + '"options.groups" or "options.apifyProxyGroups" and "options.countryCode" or "options.apifyProxyCountry".');
394
- }
395
- }
package/src/utils.ts DELETED
@@ -1,38 +0,0 @@
1
- import log from '@apify/log';
2
- import { ENV_VARS } from '@apify/consts';
3
- import { type } from 'node:os';
4
- import semver from 'semver';
5
-
6
- // @ts-expect-error if we enable resolveJsonModule, we end up with `src` folder in `dist`
7
- import { version as apifyClientVersion } from 'apify-client/package.json';
8
- // @ts-expect-error if we enable resolveJsonModule, we end up with `src` folder in `dist`
9
- import { version as crawleeVersion } from '@crawlee/core/package.json';
10
- // @ts-expect-error if we enable resolveJsonModule, we end up with `src` folder in `dist`
11
- import { version as apifyVersion } from '../package.json';
12
-
13
- /**
14
- * Logs info about system, node version and apify package version.
15
- * @internal
16
- */
17
- export function logSystemInfo() {
18
- log.info('System info', {
19
- apifyVersion,
20
- apifyClientVersion,
21
- crawleeVersion,
22
- osType: type(),
23
- nodeVersion: process.version,
24
- });
25
- }
26
-
27
- /**
28
- * Prints a warning if this version of Apify SDK is outdated.
29
- * @ignore
30
- */
31
- export function printOutdatedSdkWarning() {
32
- if (process.env[ENV_VARS.DISABLE_OUTDATED_WARNING]) return;
33
- const latestApifyVersion = process.env[ENV_VARS.SDK_LATEST_VERSION];
34
- if (!latestApifyVersion || !semver.lt(apifyVersion, latestApifyVersion)) return;
35
-
36
- log.warning(`You are using an outdated version (${apifyVersion}) of Apify SDK. We recommend you to update to the latest version (${latestApifyVersion}).
37
- Read more about Apify SDK versioning at: https://help.apify.com/en/articles/3184510-updates-and-versioning-of-apify-sdk`);
38
- }