apify 3.1.1 → 3.1.2-beta.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/.turbo/turbo-build.log +20 -0
  2. package/.turbo/turbo-copy.log +4 -0
  3. package/dist/LICENSE.md +201 -0
  4. package/dist/README.md +98 -0
  5. package/{actor.d.ts → dist/actor.d.ts} +1 -1
  6. package/{actor.d.ts.map → dist/actor.d.ts.map} +1 -1
  7. package/{actor.js → dist/actor.js} +0 -0
  8. package/{actor.js.map → dist/actor.js.map} +0 -0
  9. package/dist/cli.d.ts +3 -0
  10. package/dist/cli.d.ts.map +1 -0
  11. package/dist/cli.js +9 -0
  12. package/dist/cli.js.map +1 -0
  13. package/{configuration.d.ts → dist/configuration.d.ts} +0 -0
  14. package/{configuration.d.ts.map → dist/configuration.d.ts.map} +0 -0
  15. package/{configuration.js → dist/configuration.js} +0 -0
  16. package/{configuration.js.map → dist/configuration.js.map} +0 -0
  17. package/{index.d.ts → dist/index.d.ts} +0 -0
  18. package/{index.d.ts.map → dist/index.d.ts.map} +0 -0
  19. package/{index.js → dist/index.js} +0 -0
  20. package/{index.js.map → dist/index.js.map} +0 -0
  21. package/{index.mjs → dist/index.mjs} +0 -0
  22. package/{key_value_store.d.ts → dist/key_value_store.d.ts} +0 -0
  23. package/{key_value_store.d.ts.map → dist/key_value_store.d.ts.map} +0 -0
  24. package/{key_value_store.js → dist/key_value_store.js} +0 -0
  25. package/{key_value_store.js.map → dist/key_value_store.js.map} +0 -0
  26. package/dist/package.json +76 -0
  27. package/{platform_event_manager.d.ts → dist/platform_event_manager.d.ts} +0 -0
  28. package/{platform_event_manager.d.ts.map → dist/platform_event_manager.d.ts.map} +0 -0
  29. package/{platform_event_manager.js → dist/platform_event_manager.js} +0 -0
  30. package/{platform_event_manager.js.map → dist/platform_event_manager.js.map} +0 -0
  31. package/{proxy_configuration.d.ts → dist/proxy_configuration.d.ts} +0 -0
  32. package/{proxy_configuration.d.ts.map → dist/proxy_configuration.d.ts.map} +0 -0
  33. package/{proxy_configuration.js → dist/proxy_configuration.js} +0 -0
  34. package/{proxy_configuration.js.map → dist/proxy_configuration.js.map} +0 -0
  35. package/{utils.d.ts → dist/utils.d.ts} +0 -0
  36. package/{utils.d.ts.map → dist/utils.d.ts.map} +0 -0
  37. package/{utils.js → dist/utils.js} +0 -0
  38. package/{utils.js.map → dist/utils.js.map} +0 -0
  39. package/package.json +14 -9
  40. package/src/actor.ts +1614 -0
  41. package/src/cli.ts +9 -0
  42. package/src/configuration.ts +202 -0
  43. package/src/index.ts +11 -0
  44. package/src/key_value_store.ts +25 -0
  45. package/src/platform_event_manager.ts +118 -0
  46. package/src/proxy_configuration.ts +395 -0
  47. package/src/utils.ts +38 -0
  48. package/tsconfig.build.tsbuildinfo +0 -1
package/src/cli.ts ADDED
@@ -0,0 +1,9 @@
#!/usr/bin/env node

// CLI bootstrap: try to hand execution over to a project-local copy of this
// package first, and fall back to the `apify-cli` package otherwise.
// NOTE(review): per the `import-local` README the call returns a truthy value
// when a local copy was found and loaded - confirm against the installed version.

// eslint-disable-next-line
const importLocal = require('import-local');

const handledByLocalCopy = importLocal(__filename);

if (!handledByLocalCopy) {
    // No local copy took over, so load the CLI implementation directly.
    // eslint-disable-next-line
    require('apify-cli');
}
package/src/configuration.ts ADDED
@@ -0,0 +1,202 @@
1
+ import { ENV_VARS, LOCAL_ENV_VARS } from '@apify/consts';
2
+ import { Configuration as CoreConfiguration } from '@crawlee/core';
3
+ import type { ConfigurationOptions as CoreConfigurationOptions } from '@crawlee/core';
4
+
/**
 * Options understood by the Apify `Configuration` class. It adds the platform-specific
 * keys below on top of the core Crawlee options; every key is optional.
 * Each key can also be provided through the `APIFY_*` environment variable named in its comment.
 */
export interface ConfigurationOptions extends CoreConfigurationOptions {
    // --- API access and identity ---

    /** `APIFY_TOKEN` */
    token?: string;
    /** `APIFY_API_BASE_URL` (defaults to `'https://api.apify.com'`) */
    apiBaseUrl?: string;
    /** `APIFY_USER_ID` */
    userId?: string;

    // --- Current actor run ---

    /** `APIFY_ACTOR_ID` */
    actorId?: string;
    /** `APIFY_ACTOR_RUN_ID` */
    actorRunId?: string;
    /** `APIFY_ACTOR_TASK_ID` */
    actorTaskId?: string;
    /** `APIFY_IS_AT_HOME` (parsed as a boolean) */
    isAtHome?: boolean;
    /** `APIFY_ACTOR_EVENTS_WS_URL` */
    actorEventsWsUrl?: string;
    /** `APIFY_METAMORPH_AFTER_SLEEP_MILLIS` (parsed as an integer, defaults to `300e3`) */
    metamorphAfterSleepMillis?: number;

    // --- Container ---

    /** `APIFY_CONTAINER_PORT` (parsed as an integer) */
    containerPort?: number;
    /** `APIFY_CONTAINER_URL` */
    containerUrl?: string;

    // --- Proxy ---

    /** `APIFY_PROXY_HOSTNAME` */
    proxyHostname?: string;
    /** `APIFY_PROXY_PORT` (parsed as an integer) */
    proxyPort?: number;
    /** `APIFY_PROXY_PASSWORD` */
    proxyPassword?: string;
    /** `APIFY_PROXY_STATUS_URL` (defaults to `'http://proxy.apify.com'`) */
    proxyStatusUrl?: string;

    // --- Input secrets ---

    /** `APIFY_INPUT_SECRETS_PRIVATE_KEY_FILE` */
    inputSecretsPrivateKeyFile?: string;
    /** `APIFY_INPUT_SECRETS_PRIVATE_KEY_PASSPHRASE` */
    inputSecretsPrivateKeyPassphrase?: string;
}
24
+
/**
 * `Configuration` is a value object that holds the SDK configuration. There are two ways to work with it:
 *
 * 1. Through an `Actor` instance, whose configuration is exposed as `sdk.config`:
 *
 * ```javascript
 * import { Actor } from 'apify';
 * import { BasicCrawler } from 'crawlee';
 *
 * const sdk = new Actor({ token: '123' });
 * console.log(sdk.config.get('token')); // '123'
 *
 * const crawler = new BasicCrawler({
 *     // ... crawler options
 * }, sdk.config);
 * ```
 *
 * 2. Through the global configuration (a singleton instance), which respects the environment variables:
 *
 * ```javascript
 * import { BasicCrawler, Configuration } from 'crawlee';
 *
 * // Get the global configuration
 * const config = Configuration.getGlobalConfig();
 * // Set the 'persistStateIntervalMillis' option
 * // of global configuration to 30 seconds
 * config.set('persistStateIntervalMillis', 30_000);
 *
 * // No need to pass the configuration to the crawler,
 * // as it's using the global configuration by default
 * const crawler = new BasicCrawler();
 * ```
 *
 * ## Supported Configuration Options
 *
 * Key | Environment Variable | Default Value
 * ---|---|---
 * `memoryMbytes` | `APIFY_MEMORY_MBYTES` | -
 * `headless` | `APIFY_HEADLESS` | -
 * `persistStateIntervalMillis` | `APIFY_PERSIST_STATE_INTERVAL_MILLIS` | `60e3`
 * `token` | `APIFY_TOKEN` | -
 * `isAtHome` | `APIFY_IS_AT_HOME` | -
 * `defaultDatasetId` | `APIFY_DEFAULT_DATASET_ID` | `'default'`
 * `defaultKeyValueStoreId` | `APIFY_DEFAULT_KEY_VALUE_STORE_ID` | `'default'`
 * `defaultRequestQueueId` | `APIFY_DEFAULT_REQUEST_QUEUE_ID` | `'default'`
 *
 * ## Advanced Configuration Options
 *
 * Key | Environment Variable | Default Value
 * ---|---|---
 * `actorEventsWsUrl` | `APIFY_ACTOR_EVENTS_WS_URL` | -
 * `actorId` | `APIFY_ACTOR_ID` | -
 * `actorRunId` | `APIFY_ACTOR_RUN_ID` | -
 * `actorTaskId` | `APIFY_ACTOR_TASK_ID` | -
 * `apiBaseUrl` | `APIFY_API_BASE_URL` | `'https://api.apify.com'`
 * `containerPort` | `APIFY_CONTAINER_PORT` | `4321`
 * `containerUrl` | `APIFY_CONTAINER_URL` | `'http://localhost:4321'`
 * `inputKey` | `APIFY_INPUT_KEY` | `'INPUT'`
 * `metamorphAfterSleepMillis` | `APIFY_METAMORPH_AFTER_SLEEP_MILLIS` | `300e3`
 * `proxyHostname` | `APIFY_PROXY_HOSTNAME` | `'proxy.apify.com'`
 * `proxyPassword` | `APIFY_PROXY_PASSWORD` | -
 * `proxyPort` | `APIFY_PROXY_PORT` | `8000`
 * `proxyStatusUrl` | `APIFY_PROXY_STATUS_URL` | `'http://proxy.apify.com'`
 * `userId` | `APIFY_USER_ID` | -
 * `xvfb` | `APIFY_XVFB` | -
 * `chromeExecutablePath` | `APIFY_CHROME_EXECUTABLE_PATH` | -
 * `defaultBrowserPath` | `APIFY_DEFAULT_BROWSER_PATH` | -
 */
export class Configuration extends CoreConfiguration {
    /** @inheritDoc */
    static override globalConfig?: Configuration;

    /**
     * Lookup table from environment variable name to configuration key
     * (for example `APIFY_MEMORY_MBYTES` -> `memoryMbytes`).
     * Key order is kept as is, since several variables point at the same option.
     */
    protected static override ENV_MAP = {
        // everything the core class already understands stays supported
        ...super.ENV_MAP,

        // `APIFY_`-prefixed aliases of the core options
        APIFY_AVAILABLE_MEMORY_RATIO: 'availableMemoryRatio',
        APIFY_PURGE_ON_START: 'purgeOnStart',
        APIFY_MEMORY_MBYTES: 'memoryMbytes',
        APIFY_DEFAULT_DATASET_ID: 'defaultDatasetId',
        APIFY_DEFAULT_KEY_VALUE_STORE_ID: 'defaultKeyValueStoreId',
        APIFY_DEFAULT_REQUEST_QUEUE_ID: 'defaultRequestQueueId',
        APIFY_INPUT_KEY: 'inputKey',
        APIFY_PERSIST_STATE_INTERVAL_MILLIS: 'persistStateIntervalMillis',
        APIFY_HEADLESS: 'headless',
        APIFY_XVFB: 'xvfb',
        APIFY_CHROME_EXECUTABLE_PATH: 'chromeExecutablePath',
        APIFY_DEFAULT_BROWSER_PATH: 'defaultBrowserPath',
        APIFY_DISABLE_BROWSER_SANDBOX: 'disableBrowserSandbox',

        // options that exist only in the Apify SDK
        APIFY_TOKEN: 'token',
        APIFY_METAMORPH_AFTER_SLEEP_MILLIS: 'metamorphAfterSleepMillis',
        APIFY_TEST_PERSIST_INTERVAL_MILLIS: 'persistStateIntervalMillis', // for BC, seems to be unused
        APIFY_ACTOR_EVENTS_WS_URL: 'actorEventsWsUrl',
        APIFY_ACTOR_ID: 'actorId',
        APIFY_API_BASE_URL: 'apiBaseUrl',
        APIFY_IS_AT_HOME: 'isAtHome',
        APIFY_ACTOR_RUN_ID: 'actorRunId',
        APIFY_ACTOR_TASK_ID: 'actorTaskId',
        APIFY_CONTAINER_PORT: 'containerPort',
        APIFY_CONTAINER_URL: 'containerUrl',
        APIFY_USER_ID: 'userId',
        APIFY_PROXY_HOSTNAME: 'proxyHostname',
        APIFY_PROXY_PASSWORD: 'proxyPassword',
        APIFY_PROXY_STATUS_URL: 'proxyStatusUrl',
        APIFY_PROXY_PORT: 'proxyPort',
        APIFY_INPUT_SECRETS_PRIVATE_KEY_FILE: 'inputSecretsPrivateKeyFile',
        APIFY_INPUT_SECRETS_PRIVATE_KEY_PASSPHRASE: 'inputSecretsPrivateKeyPassphrase',
    };

    /** Core integer-typed keys plus the Apify-specific ones. */
    protected static override INTEGER_VARS = [
        ...super.INTEGER_VARS,
        'proxyPort',
        'containerPort',
        'metamorphAfterSleepMillis',
    ];

    /** Core boolean-typed keys plus `isAtHome`. */
    protected static override BOOLEAN_VARS = [
        ...super.BOOLEAN_VARS,
        'isAtHome',
    ];

    /** Default values, layered on top of the core defaults. */
    protected static override DEFAULTS = {
        ...super.DEFAULTS,
        defaultKeyValueStoreId: LOCAL_ENV_VARS[ENV_VARS.DEFAULT_KEY_VALUE_STORE_ID],
        defaultDatasetId: LOCAL_ENV_VARS[ENV_VARS.DEFAULT_DATASET_ID],
        defaultRequestQueueId: LOCAL_ENV_VARS[ENV_VARS.DEFAULT_REQUEST_QUEUE_ID],
        inputKey: 'INPUT',
        apiBaseUrl: 'https://api.apify.com',
        proxyStatusUrl: 'http://proxy.apify.com',
        proxyHostname: LOCAL_ENV_VARS[ENV_VARS.PROXY_HOSTNAME],
        // unary plus converts the values looked up in `LOCAL_ENV_VARS` to numbers
        proxyPort: +LOCAL_ENV_VARS[ENV_VARS.PROXY_PORT],
        containerPort: +LOCAL_ENV_VARS[ENV_VARS.CONTAINER_PORT],
        containerUrl: LOCAL_ENV_VARS[ENV_VARS.CONTAINER_URL],
        metamorphAfterSleepMillis: 300e3,
        persistStateIntervalMillis: 60e3, // This value is mentioned in jsdoc in `events.js`, if you update it here, update it there too.
    };

    /**
     * @inheritDoc
     */
    override get<T extends keyof ConfigurationOptions, U extends ConfigurationOptions[T]>(key: T, defaultValue?: U): U {
        // The core signature only knows the core keys, hence the cast.
        const coreKey = key as keyof CoreConfigurationOptions;
        return super.get(coreKey, defaultValue);
    }

    /**
     * @inheritDoc
     */
    override set(key: keyof ConfigurationOptions, value?: any) {
        const coreKey = key as keyof CoreConfigurationOptions;
        super.set(coreKey, value);
    }

    /**
     * @inheritDoc
     */
    static override getGlobalConfig(): Configuration {
        // An instance held in `Configuration.storage` takes precedence over the singleton.
        const scopedConfig = Configuration.storage.getStore();

        if (scopedConfig) {
            return scopedConfig as Configuration;
        }

        // Create the singleton lazily on first access.
        Configuration.globalConfig ??= new Configuration();
        return Configuration.globalConfig as Configuration;
    }

    /**
     * Drops the global configuration instance. The default instance is built from the environment variables,
     * so the global state has to be reset before changed variables can take effect. Mainly useful in tests.
     */
    static override resetGlobalState(): void {
        delete this.globalConfig;
    }
}
192
+
// Monkey patch the core class so that it picks up the Apify-specific options as well.
// The lookup tables are protected statics, hence the `@ts-expect-error` directives.

// @ts-expect-error protected property
CoreConfiguration.DEFAULTS = Configuration.DEFAULTS;
// @ts-expect-error protected property
CoreConfiguration.BOOLEAN_VARS = Configuration.BOOLEAN_VARS;
// @ts-expect-error protected property
CoreConfiguration.INTEGER_VARS = Configuration.INTEGER_VARS;
// @ts-expect-error protected property
CoreConfiguration.ENV_MAP = Configuration.ENV_MAP;

// The core accessor for the global configuration now returns the Apify `Configuration`.
CoreConfiguration.getGlobalConfig = Configuration.getGlobalConfig;
package/src/index.ts ADDED
@@ -0,0 +1,11 @@
1
+ export * from './actor';
2
+ export * from './configuration';
3
+ export * from './proxy_configuration';
4
+ export * from './platform_event_manager';
5
+ export * from './key_value_store';
6
+ export {
7
+ Dataset, DatasetDataOptions, DatasetIteratorOptions, DatasetConsumer, DatasetMapper, DatasetReducer, DatasetOptions, DatasetContent,
8
+ RequestQueue, QueueOperationInfo, RequestQueueOperationOptions, RequestQueueOptions, QueueOperationInfoOptions,
9
+ KeyConsumer, KeyValueStoreOptions, RecordOptions, KeyValueStoreIteratorOptions, log, Log, LoggerOptions, LogLevel, Logger, LoggerJson, LoggerText,
10
+ } from '@crawlee/core';
11
+ export { ApifyClient, ApifyClientOptions } from 'apify-client';
package/src/key_value_store.ts ADDED
@@ -0,0 +1,25 @@
1
+ import type { StorageManagerOptions } from '@crawlee/core';
2
+ import { KeyValueStore as CoreKeyValueStore } from '@crawlee/core';
3
+
/**
 * @inheritDoc
 */
export class KeyValueStore extends CoreKeyValueStore {
    /**
     * Returns a URL for the given key that may be used to publicly
     * access the value in the remote key-value store.
     *
     * The key is percent-encoded before it is placed into the URL path, so a key
     * containing characters such as `/`, `?`, `#` or a space cannot change which
     * resource the URL points to.
     *
     * @param key Key of the record in this store.
     * @returns URL of the record on the Apify API (`https://api.apify.com`).
     */
    getPublicUrl(key: string): string {
        const encodedKey = encodeURIComponent(key);
        return `https://api.apify.com/v2/key-value-stores/${this.id}/records/${encodedKey}`;
    }

    /**
     * @inheritDoc
     */
    static override async open(storeIdOrName?: string | null, options: StorageManagerOptions = {}): Promise<KeyValueStore> {
        // The core `open()` is typed to return the core class, hence the double cast.
        return super.open(storeIdOrName, options) as unknown as KeyValueStore;
    }
}

// Copy the method onto the core prototype as well, so that instances of the core class gain it too.
// @ts-expect-error extension of the core class to make this only a type-issue
CoreKeyValueStore.prototype.getPublicUrl = KeyValueStore.prototype.getPublicUrl;
package/src/platform_event_manager.ts ADDED
@@ -0,0 +1,118 @@
1
+ import { ACTOR_EVENT_NAMES, ENV_VARS } from '@apify/consts';
2
+ import WebSocket from 'ws';
3
+ import { EventType, EventManager } from '@crawlee/core';
4
+ import { betterClearInterval } from '@apify/utilities';
5
+ import { Configuration } from './configuration';
6
+
/**
 * Gets an instance of a Node.js'
 * [EventEmitter](https://nodejs.org/api/events.html#events_class_eventemitter)
 * class that emits various events from the SDK or the Apify platform.
 * The event emitter is initialized by calling the {@apilink Actor.main} function.
 *
 * **Example usage:**
 *
 * ```javascript
 * Actor.on('cpuInfo', (data) => {
 *     if (data.isCpuOverloaded) console.log('Oh no, the CPU is overloaded!');
 * });
 * ```
 *
 * The following events are emitted:
 *
 * - `cpuInfo`: `{ "isCpuOverloaded": Boolean }`
 *   Emitted approximately every second. It tells whether the actor is using
 *   the maximum of available CPU resources; if so, the actor should not add more workload.
 *   For example, this event is used by the {@apilink AutoscaledPool} class.
 * - `migrating`: `void`
 *   Emitted when the actor running on the Apify platform is going to be migrated to another worker server soon.
 *   You can use it to persist the state of the actor and abort the run, to speed up migration.
 *   For example, this is used by the {@apilink RequestList} class.
 * - `aborting`: `void`
 *   When a user aborts an actor run on the Apify platform, they can choose to abort gracefully to allow
 *   the actor some time before getting killed. This graceful abort emits the `aborting` event which the SDK
 *   uses to gracefully stop running crawls and you can use it to do your own cleanup as well.
 * - `persistState`: `{ "isMigrating": Boolean }`
 *   Emitted in regular intervals (by default 60 seconds) to notify all components of Apify SDK that it is time
 *   to persist their state, in order to avoid repeating all work when the actor restarts.
 *   This event is also emitted together with the `migrating` event,
 *   in which case the `isMigrating` flag is set to `true`. Otherwise the flag is `false`.
 *   Note that the `persistState` event is provided merely for user convenience,
 *   you can achieve the same effect using `setInterval()` and listening for the `migrating` event.
 */
export class PlatformEventManager extends EventManager {
    /** Websocket connection to actor events. */
    private eventsWs?: WebSocket;

    constructor(override readonly config = Configuration.getGlobalConfig()) {
        super();
    }

    /**
     * Initializes `Actor.events` event emitter by opening a websocket connection that delivers the platform events.
     * This is an internal function that is automatically called by `Actor.main()`.
     * Does nothing when the manager is already initialized.
     */
    override async init() {
        if (this.initialized) return;

        await super.init();

        const platformEventsUrl = this.config.get('actorEventsWsUrl');

        if (platformEventsUrl) {
            this.createWebSocketConnection(platformEventsUrl);
            return;
        }

        // Locally there is no web socket to connect to, so only a log message is printed.
        this.log.debug(`Environment variable ${ENV_VARS.ACTOR_EVENTS_WS_URL} is not set, no events from Apify platform will be emitted.`);
    }

    /**
     * Opens the websocket and wires up its `message`, `error` and `close` handlers.
     */
    private createWebSocketConnection(eventsWsUrl: string) {
        const socket = new WebSocket(eventsWsUrl);
        this.eventsWs = socket;

        socket.on('message', (message) => {
            if (!message) return;

            try {
                const parsedEvent = JSON.parse(String(message));
                const { name, data } = parsedEvent;

                // Forward the platform event to the local listeners.
                this.events.emit(name, data);

                if (name !== ACTOR_EVENT_NAMES.MIGRATING) return;

                // On migration, stop the periodic persist state events and emit one last one, flagged as migrating.
                betterClearInterval(this.intervals.persistState!);
                this.events.emit(EventType.PERSIST_STATE, { isMigrating: true });
            } catch (err) {
                this.log.exception(err as Error, 'Cannot parse actor event');
            }
        });

        socket.on('error', (err) => {
            // This error happens in the case of a very short Actor.main(), so it is not reported.
            const closedBeforeConnecting = err.message === 'WebSocket was closed before the connection was established';

            if (!closedBeforeConnecting) {
                this.log.exception(err, 'web socket connection failed');
            }
        });

        socket.on('close', () => {
            this.log.debug('web socket has been closed');
            this.eventsWs = undefined;
        });
    }

    /**
     * Closes the websocket that provides events from the Actor infrastructure and also stops sending
     * internal events of the Apify package such as `persistState`.
     * This is automatically called at the end of `Actor.main()`.
     * Does nothing when the manager has not been initialized.
     */
    override async close() {
        if (this.initialized) {
            await super.close();
            this.eventsWs?.close();
        }
    }
}