apify 1.7.1b1__py3-none-any.whl → 2.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of apify might be problematic. Click here for more details.

Files changed (62) hide show
  1. apify/__init__.py +33 -4
  2. apify/_actor.py +1074 -0
  3. apify/_configuration.py +370 -0
  4. apify/_consts.py +10 -0
  5. apify/_crypto.py +31 -27
  6. apify/_models.py +117 -0
  7. apify/_platform_event_manager.py +231 -0
  8. apify/_proxy_configuration.py +320 -0
  9. apify/_utils.py +18 -484
  10. apify/apify_storage_client/__init__.py +3 -0
  11. apify/apify_storage_client/_apify_storage_client.py +68 -0
  12. apify/apify_storage_client/_dataset_client.py +190 -0
  13. apify/apify_storage_client/_dataset_collection_client.py +51 -0
  14. apify/apify_storage_client/_key_value_store_client.py +94 -0
  15. apify/apify_storage_client/_key_value_store_collection_client.py +51 -0
  16. apify/apify_storage_client/_request_queue_client.py +176 -0
  17. apify/apify_storage_client/_request_queue_collection_client.py +51 -0
  18. apify/apify_storage_client/py.typed +0 -0
  19. apify/log.py +22 -105
  20. apify/scrapy/__init__.py +11 -3
  21. apify/scrapy/middlewares/__init__.py +3 -1
  22. apify/scrapy/middlewares/apify_proxy.py +29 -27
  23. apify/scrapy/middlewares/py.typed +0 -0
  24. apify/scrapy/pipelines/__init__.py +3 -1
  25. apify/scrapy/pipelines/actor_dataset_push.py +6 -3
  26. apify/scrapy/pipelines/py.typed +0 -0
  27. apify/scrapy/py.typed +0 -0
  28. apify/scrapy/requests.py +60 -58
  29. apify/scrapy/scheduler.py +28 -19
  30. apify/scrapy/utils.py +10 -32
  31. apify/storages/__init__.py +4 -10
  32. apify/storages/_request_list.py +150 -0
  33. apify/storages/py.typed +0 -0
  34. apify-2.2.1.dist-info/METADATA +211 -0
  35. apify-2.2.1.dist-info/RECORD +38 -0
  36. {apify-1.7.1b1.dist-info → apify-2.2.1.dist-info}/WHEEL +1 -2
  37. apify/_memory_storage/__init__.py +0 -3
  38. apify/_memory_storage/file_storage_utils.py +0 -71
  39. apify/_memory_storage/memory_storage_client.py +0 -219
  40. apify/_memory_storage/resource_clients/__init__.py +0 -19
  41. apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
  42. apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
  43. apify/_memory_storage/resource_clients/dataset.py +0 -452
  44. apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
  45. apify/_memory_storage/resource_clients/key_value_store.py +0 -533
  46. apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
  47. apify/_memory_storage/resource_clients/request_queue.py +0 -466
  48. apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
  49. apify/actor.py +0 -1351
  50. apify/config.py +0 -127
  51. apify/consts.py +0 -67
  52. apify/event_manager.py +0 -236
  53. apify/proxy_configuration.py +0 -365
  54. apify/storages/base_storage.py +0 -181
  55. apify/storages/dataset.py +0 -494
  56. apify/storages/key_value_store.py +0 -257
  57. apify/storages/request_queue.py +0 -602
  58. apify/storages/storage_client_manager.py +0 -72
  59. apify-1.7.1b1.dist-info/METADATA +0 -149
  60. apify-1.7.1b1.dist-info/RECORD +0 -41
  61. apify-1.7.1b1.dist-info/top_level.txt +0 -1
  62. {apify-1.7.1b1.dist-info → apify-2.2.1.dist-info}/LICENSE +0 -0
apify/_actor.py ADDED
@@ -0,0 +1,1074 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import os
5
+ import sys
6
+ from datetime import timedelta
7
+ from typing import TYPE_CHECKING, Any, Callable, Literal, TypeVar, cast, overload
8
+
9
+ from lazy_object_proxy import Proxy
10
+ from more_itertools import flatten
11
+ from pydantic import AliasChoices
12
+
13
+ from apify_client import ApifyClientAsync
14
+ from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
15
+ from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value
16
+ from crawlee import service_locator
17
+ from crawlee.events import (
18
+ Event,
19
+ EventAbortingData,
20
+ EventExitData,
21
+ EventListener,
22
+ EventMigratingData,
23
+ EventPersistStateData,
24
+ EventSystemInfoData,
25
+ )
26
+
27
+ from apify._configuration import Configuration
28
+ from apify._consts import EVENT_LISTENERS_TIMEOUT
29
+ from apify._crypto import decrypt_input_secrets, load_private_key
30
+ from apify._models import ActorRun
31
+ from apify._platform_event_manager import EventManager, LocalEventManager, PlatformEventManager
32
+ from apify._proxy_configuration import ProxyConfiguration
33
+ from apify._utils import docs_group, docs_name, get_system_info, is_running_in_ipython
34
+ from apify.apify_storage_client import ApifyStorageClient
35
+ from apify.log import _configure_logging, logger
36
+ from apify.storages import Dataset, KeyValueStore, RequestQueue
37
+
38
+ if TYPE_CHECKING:
39
+ import logging
40
+ from types import TracebackType
41
+
42
+ from typing_extensions import Self
43
+
44
+ from crawlee.proxy_configuration import _NewUrlFunction
45
+ from crawlee.storage_clients import BaseStorageClient
46
+
47
+ from apify._models import Webhook
48
+
49
+
50
+ MainReturnType = TypeVar('MainReturnType')
51
+
52
+
53
+ @docs_name('Actor')
54
+ @docs_group('Classes')
55
+ class _ActorType:
56
+ """The class of `Actor`. Only make a new instance if you're absolutely sure you need to."""
57
+
58
+ _apify_client: ApifyClientAsync
59
+ _configuration: Configuration
60
+ _is_exiting = False
61
+ _is_rebooting = False
62
+
63
def __init__(
    self,
    configuration: Configuration | None = None,
    *,
    configure_logging: bool = True,
) -> None:
    """Create an Actor instance.

    Creating an instance yourself is rarely necessary; the default instance exposes the same
    functionality (e.g. `Actor.open_dataset()`).

    Args:
        configuration: The Actor configuration to be used. If not passed, the configuration returned by
            `Configuration.get_global_configuration()` is used.
        configure_logging: Whether the default logging configuration should be applied when the Actor
            is initialized.
    """
    self._configuration = configuration or Configuration.get_global_configuration()
    self._configure_logging = configure_logging
    self._apify_client = self.new_client()

    # Only the cloud storage client is built here; the local one is looked up
    # through the service locator whenever it is needed.
    self._cloud_storage_client = ApifyStorageClient.from_config(config=self._configuration)

    # Pick the event manager that matches the environment: platform run vs. local run.
    event_manager: EventManager
    if self.is_at_home():
        event_manager = PlatformEventManager(
            config=self._configuration,
            persist_state_interval=self._configuration.persist_state_interval,
        )
    else:
        event_manager = LocalEventManager(
            system_info_interval=self._configuration.system_info_interval,
            persist_state_interval=self._configuration.persist_state_interval,
        )
    self._event_manager = event_manager

    self._is_initialized = False
101
+
102
@ignore_docs
async def __aenter__(self) -> Self:
    """Enter an `async with` block by initializing the Actor.

    Awaits `init()` and hands back this very instance, so `async with Actor as actor:` binds the
    initialized Actor. Leaving the block is handled by `__aexit__`.
    """
    await self.init()
    return self
113
+
114
@ignore_docs
async def __aexit__(
    self,
    _exc_type: type[BaseException] | None,
    exc_value: BaseException | None,
    _exc_traceback: TracebackType | None,
) -> None:
    """Leave an `async with` block, shutting the Actor down.

    Does nothing if the Actor is already exiting. Otherwise `Actor.fail()` is awaited when the block
    raised an exception, and `Actor.exit()` is awaited when it finished cleanly.
    """
    # An exit is already in progress, so there is nothing left to do here.
    if self._is_exiting:
        return

    if not exc_value:
        await self.exit()
        return

    await self.fail(
        exit_code=ActorExitCodes.ERROR_USER_FUNCTION_THREW.value,
        exception=exc_value,
    )
134
+
135
def __repr__(self) -> str:
    """Return `'<apify.Actor>'` for the instance wrapped by the `Actor` proxy, the default repr otherwise."""
    default_instance = cast(Proxy, Actor).__wrapped__
    if self is not default_instance:
        return super().__repr__()

    return '<apify.Actor>'
140
+
141
def __call__(self, configuration: Configuration | None = None, *, configure_logging: bool = True) -> Self:
    """Build a fresh instance of this Actor's class, forwarding the given configuration and logging flag."""
    actor_class = self.__class__
    return actor_class(configuration=configuration, configure_logging=configure_logging)
144
+
145
@property
def apify_client(self) -> ApifyClientAsync:
    """Return the `ApifyClientAsync` instance stored on this Actor instance."""
    return self._apify_client
149
+
150
@property
def configuration(self) -> Configuration:
    """Return the `Configuration` instance stored on this Actor instance."""
    return self._configuration
154
+
155
@property
def config(self) -> Configuration:
    """Return the `Configuration` instance stored on this Actor instance (short form of `configuration`)."""
    return self._configuration
159
+
160
@property
def event_manager(self) -> EventManager:
    """Return the `EventManager` instance stored on this Actor instance."""
    return self._event_manager
164
+
165
@property
def log(self) -> logging.Logger:
    """Return the module-level `logger` that the Actor writes its log records to."""
    return logger
169
+
170
@property
def _local_storage_client(self) -> BaseStorageClient:
    """Return the storage client currently registered in the service locator."""
    return service_locator.get_storage_client()
174
+
175
def _raise_if_not_initialized(self) -> None:
    """Raise `RuntimeError` unless the Actor has been initialized."""
    if self._is_initialized:
        return

    raise RuntimeError('The Actor was not initialized!')
178
+
179
def _raise_if_cloud_requested_but_not_configured(self, *, force_cloud: bool) -> None:
    """Raise `RuntimeError` when cloud storage is forced but cannot be used.

    The error is raised only if `force_cloud` is true, the Actor is not running on the Apify platform,
    and the configuration holds no API token.
    """
    if force_cloud and not self.is_at_home() and self.config.token is None:
        raise RuntimeError(
            'In order to use the Apify cloud storage from your computer, '
            'you need to provide an Apify token using the APIFY_TOKEN environment variable.'
        )
188
+
189
async def init(self) -> None:
    """Initialize the Actor instance.

    Registers the Actor's services in the service locator (the cloud storage client when running on
    the Apify platform, plus the event manager and the configuration), optionally configures logging,
    and enters the event manager so that Actor events start being processed.

    Call this once, before performing any other Actor action.

    Raises:
        RuntimeError: If the Actor has already been initialized.
    """
    if self._is_initialized:
        raise RuntimeError('The Actor was already initialized!')

    # Reset the per-run state flags.
    self._is_exiting = False
    self._was_final_persist_state_emitted = False

    # On the Apify platform the cloud storage client replaces the default one.
    if self.is_at_home():
        service_locator.set_storage_client(self._cloud_storage_client)

    service_locator.set_event_manager(self.event_manager)
    service_locator.set_configuration(self.configuration)

    # Logging must be configured only after every service_locator setter has run.
    if self._configure_logging:
        _configure_logging()

    log = self.log
    log.info('Initializing Actor...')
    log.info('System info', extra=get_system_info())

    # TODO: Print outdated SDK version warning (we need a new env var for this)
    # https://github.com/apify/apify-sdk-python/issues/146

    await self._event_manager.__aenter__()

    self._is_initialized = True
226
+
227
async def exit(
    self,
    *,
    exit_code: int = 0,
    event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT,
    status_message: str | None = None,
    cleanup_timeout: timedelta = timedelta(seconds=30),
) -> None:
    """Exit the Actor instance.

    Optionally sets a terminal status message, gives event listeners a moment to trigger, waits for
    them to complete, and leaves the event manager. Afterwards the process is terminated with
    `sys.exit(exit_code)`, unless the Actor runs in IPython, in a unit test (the `PYTEST_CURRENT_TEST`
    environment variable is set) or in a patched nested event loop.

    Args:
        exit_code: The exit code with which the Actor should exit (defaults to `0`).
        event_listeners_timeout: How long should the Actor wait for Actor event listeners to finish
            before exiting. A falsy value skips the wait.
        status_message: The final status message that the Actor should display.
        cleanup_timeout: Upper bound for the whole cleanup phase (status message, waiting for listeners
            and closing the event manager).

    Raises:
        RuntimeError: If the Actor was not initialized.
    """
    self._raise_if_not_initialized()

    self._is_exiting = True

    exit_code = maybe_extract_enum_member_value(exit_code)

    self.log.info('Exiting Actor', extra={'exit_code': exit_code})

    async def cleanup() -> None:
        if status_message is not None:
            await self.set_status_message(status_message, is_terminal=True)

        # Yield briefly so that pending listeners get a chance to fire.
        await asyncio.sleep(0.1)

        if event_listeners_timeout:
            await self._event_manager.wait_for_all_listeners_to_complete(timeout=event_listeners_timeout)

        await self._event_manager.__aexit__(None, None, None)

    await asyncio.wait_for(cleanup(), cleanup_timeout.total_seconds())
    self._is_initialized = False

    # Terminating the interpreter would break interactive sessions, test runs and nested loops,
    # so in those environments only a debug message is logged.
    if is_running_in_ipython():
        skip_message = f'Not calling sys.exit({exit_code}) because Actor is running in IPython'
    elif os.getenv('PYTEST_CURRENT_TEST', default=False):  # noqa: PLW1508
        skip_message = f'Not calling sys.exit({exit_code}) because Actor is running in an unit test'
    elif hasattr(asyncio, '_nest_patched'):
        skip_message = f'Not calling sys.exit({exit_code}) because Actor is running in a nested event loop'
    else:
        sys.exit(exit_code)

    self.log.debug(skip_message)
277
+
278
async def fail(
    self,
    *,
    exit_code: int = 1,
    exception: BaseException | None = None,
    status_message: str | None = None,
) -> None:
    """Fail the Actor instance.

    Logs the given exception (outside of IPython) and then delegates to `Actor.exit()` with the
    requested exit code, which defaults to `1`.

    Args:
        exit_code: The exit code with which the Actor should fail (defaults to `1`).
        exception: The exception with which the Actor failed.
        status_message: The final status message that the Actor should display.

    Raises:
        RuntimeError: If the Actor was not initialized.
    """
    self._raise_if_not_initialized()

    # IPython never reaches `sys.exit()` on Actor exit, so the traceback gets printed
    # there on its own and logging it here would duplicate it.
    if exception and not is_running_in_ipython():
        self.log.exception('Actor failed with an exception', exc_info=exception)

    await self.exit(exit_code=exit_code, status_message=status_message)
302
+
303
def new_client(
    self,
    *,
    token: str | None = None,
    api_url: str | None = None,
    max_retries: int | None = None,
    min_delay_between_retries: timedelta | None = None,
    timeout: timedelta | None = None,
) -> ApifyClientAsync:
    """Return a new instance of the Apify API client.

    The `ApifyClientAsync` class comes from the [apify-client](https://github.com/apify/apify-client-python)
    package. The token and API URL fall back to the values held by the Actor configuration when they
    are not passed explicitly, which lets you act as a different Apify user than the SDK internals.

    Args:
        token: The Apify API token.
        api_url: The URL of the Apify API server to connect to.
        max_retries: How many times to retry a failed request at most.
        min_delay_between_retries: How long will the client wait between retrying requests
            (increases exponentially from this value). Passed on in whole milliseconds.
        timeout: The socket timeout of the HTTP requests sent to the Apify API. Passed on in whole
            seconds; a falsy value leaves the client's own default in place.
    """
    configuration = self._configuration

    if min_delay_between_retries is None:
        min_delay_millis = None
    else:
        min_delay_millis = int(min_delay_between_retries.total_seconds() * 1000)

    timeout_secs = int(timeout.total_seconds()) if timeout else None

    return ApifyClientAsync(
        token=token or configuration.token,
        api_url=api_url or configuration.api_base_url,
        max_retries=max_retries,
        min_delay_between_retries_millis=min_delay_millis,
        timeout_secs=timeout_secs,
    )
340
+
341
async def open_dataset(
    self,
    *,
    id: str | None = None,
    name: str | None = None,
    force_cloud: bool = False,
) -> Dataset:
    """Open a dataset.

    Datasets hold structured data in which every stored object has the same attributes, for example
    online store products or real estate offers. The data lives either on the local filesystem or in
    the Apify cloud.

    Args:
        id: ID of the dataset to be opened. If neither `id` nor `name` are provided, the method returns
            the default dataset associated with the Actor run.
        name: Name of the dataset to be opened. If neither `id` nor `name` are provided, the method returns
            the default dataset associated with the Actor run.
        force_cloud: If set to `True` then the Apify cloud storage is always used. This way it is possible
            to combine local and cloud storage.

    Returns:
        An instance of the `Dataset` class for the given ID or name.

    Raises:
        RuntimeError: If the Actor was not initialized, or if cloud storage is forced outside the
            platform without an API token.
    """
    self._raise_if_not_initialized()
    self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)

    if force_cloud:
        storage_client = self._cloud_storage_client
    else:
        storage_client = self._local_storage_client

    return await Dataset.open(
        id=id,
        name=name,
        configuration=self._configuration,
        storage_client=storage_client,
    )
376
+
377
async def open_key_value_store(
    self,
    *,
    id: str | None = None,
    name: str | None = None,
    force_cloud: bool = False,
) -> KeyValueStore:
    """Open a key-value store.

    Key-value stores hold records or files together with their MIME content type; each record is
    stored and retrieved under a unique key. The data lives either on a local filesystem or in
    the Apify cloud.

    Args:
        id: ID of the key-value store to be opened. If neither `id` nor `name` are provided, the method returns
            the default key-value store associated with the Actor run.
        name: Name of the key-value store to be opened. If neither `id` nor `name` are provided, the method
            returns the default key-value store associated with the Actor run.
        force_cloud: If set to `True` then the Apify cloud storage is always used. This way it is possible
            to combine local and cloud storage.

    Returns:
        An instance of the `KeyValueStore` class for the given ID or name.

    Raises:
        RuntimeError: If the Actor was not initialized, or if cloud storage is forced outside the
            platform without an API token.
    """
    self._raise_if_not_initialized()
    self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)

    if force_cloud:
        storage_client = self._cloud_storage_client
    else:
        storage_client = self._local_storage_client

    return await KeyValueStore.open(
        id=id,
        name=name,
        configuration=self._configuration,
        storage_client=storage_client,
    )
410
+
411
async def open_request_queue(
    self,
    *,
    id: str | None = None,
    name: str | None = None,
    force_cloud: bool = False,
) -> RequestQueue:
    """Open a request queue.

    A request queue represents a queue of URLs to crawl, stored either on the local filesystem or in
    the Apify cloud. It serves deep crawling of websites, where you start with several URLs and then
    recursively follow links to other pages. Both breadth-first and depth-first crawling orders are
    supported.

    Args:
        id: ID of the request queue to be opened. If neither `id` nor `name` are provided, the method returns
            the default request queue associated with the Actor run.
        name: Name of the request queue to be opened. If neither `id` nor `name` are provided, the method returns
            the default request queue associated with the Actor run.
        force_cloud: If set to `True` then the Apify cloud storage is always used. This way it is possible
            to combine local and cloud storage.

    Returns:
        An instance of the `RequestQueue` class for the given ID or name.

    Raises:
        RuntimeError: If the Actor was not initialized, or if cloud storage is forced outside the
            platform without an API token.
    """
    self._raise_if_not_initialized()
    self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)

    if force_cloud:
        storage_client = self._cloud_storage_client
    else:
        storage_client = self._local_storage_client

    return await RequestQueue.open(
        id=id,
        name=name,
        configuration=self._configuration,
        storage_client=storage_client,
    )
447
+
448
async def push_data(self, data: dict | list[dict]) -> None:
    """Store an object or a list of objects to the default dataset of the current Actor run.

    Empty input (a falsy `data`) is ignored and the dataset is not opened at all.

    Args:
        data: The data to push to the default dataset.

    Raises:
        RuntimeError: If the Actor was not initialized.
    """
    self._raise_if_not_initialized()

    if data:
        default_dataset = await self.open_dataset()
        await default_dataset.push_data(data)
461
+
462
async def get_input(self) -> Any:
    """Get the Actor input value from the default key-value store associated with the current Actor run.

    The input is read under the configured input key. When both the private key file and its
    passphrase are configured, the input is passed through `decrypt_input_secrets` before being
    returned.

    Raises:
        RuntimeError: If the Actor was not initialized.
    """
    self._raise_if_not_initialized()

    configuration = self._configuration
    raw_input = await self.get_value(configuration.input_key)

    key_file = configuration.input_secrets_private_key_file
    key_passphrase = configuration.input_secrets_private_key_passphrase
    if not (key_file and key_passphrase):
        return raw_input

    private_key = load_private_key(key_file, key_passphrase)
    return decrypt_input_secrets(private_key, raw_input)
477
+
478
async def get_value(self, key: str, default_value: Any = None) -> Any:
    """Get a value from the default key-value store associated with the current Actor run.

    Args:
        key: The key of the record which to retrieve.
        default_value: Default value returned in case the record does not exist.

    Raises:
        RuntimeError: If the Actor was not initialized.
    """
    self._raise_if_not_initialized()

    default_store = await self.open_key_value_store()
    stored_value = await default_store.get_value(key, default_value)
    return stored_value
489
+
490
async def set_value(
    self,
    key: str,
    value: Any,
    *,
    content_type: str | None = None,
) -> None:
    """Set or delete a value in the default key-value store associated with the current Actor run.

    Args:
        key: The key of the record which to set.
        value: The value of the record which to set, or None, if the record should be deleted.
        content_type: The content type which should be set to the value.

    Raises:
        RuntimeError: If the Actor was not initialized.
    """
    self._raise_if_not_initialized()

    default_store = await self.open_key_value_store()
    outcome = await default_store.set_value(key, value, content_type=content_type)
    return outcome
508
+
509
@overload
def on(
    self,
    event_name: Literal[Event.PERSIST_STATE],
    listener: EventListener[EventPersistStateData],
) -> EventListener[EventPersistStateData]: ...
@overload
def on(
    self,
    event_name: Literal[Event.SYSTEM_INFO],
    listener: EventListener[EventSystemInfoData],
) -> EventListener[EventSystemInfoData]: ...
@overload
def on(
    self,
    event_name: Literal[Event.MIGRATING],
    listener: EventListener[EventMigratingData],
) -> EventListener[EventMigratingData]: ...
@overload
def on(
    self,
    event_name: Literal[Event.ABORTING],
    listener: EventListener[EventAbortingData],
) -> EventListener[EventAbortingData]: ...
@overload
def on(
    self,
    event_name: Literal[Event.EXIT],
    listener: EventListener[EventExitData],
) -> EventListener[EventExitData]: ...
@overload
def on(self, event_name: Event, listener: EventListener[None]) -> EventListener[Any]: ...

def on(self, event_name: Event, listener: EventListener[Any]) -> EventListener[Any]:
    """Add an event listener to the Actor's event manager.

    The following events can be emitted:

    - `Event.SYSTEM_INFO`: Emitted every minute; the event data contains information about the Actor's
      resource usage.

    - `Event.MIGRATING`: Emitted when the Actor on the Apify platform is about to be migrated to another
      worker server. Use this event to persist the Actor's state and gracefully stop in-progress tasks,
      preventing disruption.

    - `Event.PERSIST_STATE`: Emitted regularly (default: 60 seconds) to notify the Actor to persist its
      state, preventing work repetition after a restart. This event is emitted together with the
      `MIGRATING` event, where the `isMigrating` flag in the event data is `True`; otherwise, the flag is
      `False`. This event is for convenience; the same effect can be achieved by setting an interval and
      listening for the `MIGRATING` event.

    - `Event.ABORTING`: Emitted when a user aborts an Actor run on the Apify platform, allowing the Actor
      time to clean up its state if the abort is graceful.

    Args:
        event_name: The Actor event to listen for.
        listener: The function to be called when the event is emitted (can be async).

    Returns:
        The very `listener` that was passed in.

    Raises:
        RuntimeError: If the Actor was not initialized.
    """
    self._raise_if_not_initialized()

    manager = self._event_manager
    manager.on(event=event_name, listener=listener)
    return listener
560
+
561
@overload
def off(self, event_name: Literal[Event.PERSIST_STATE], listener: EventListener[EventPersistStateData]) -> None: ...
@overload
def off(self, event_name: Literal[Event.SYSTEM_INFO], listener: EventListener[EventSystemInfoData]) -> None: ...
@overload
def off(self, event_name: Literal[Event.MIGRATING], listener: EventListener[EventMigratingData]) -> None: ...
@overload
def off(self, event_name: Literal[Event.ABORTING], listener: EventListener[EventAbortingData]) -> None: ...
@overload
def off(self, event_name: Literal[Event.EXIT], listener: EventListener[EventExitData]) -> None: ...
# The fallback overload keeps `listener` optional so that the documented
# "remove all listeners" call form, `off(event_name)`, also passes type checking.
@overload
def off(self, event_name: Event, listener: EventListener[None] | None = None) -> None: ...

def off(self, event_name: Event, listener: Callable | None = None) -> None:
    """Remove a listener, or all listeners, from an Actor event.

    Args:
        event_name: The Actor event for which to remove listeners.
        listener: The listener which is supposed to be removed. If not passed, all listeners of this event
            are removed.

    Raises:
        RuntimeError: If the Actor was not initialized.
    """
    self._raise_if_not_initialized()

    self._event_manager.off(event=event_name, listener=listener)
585
+
586
def is_at_home(self) -> bool:
    """Tell whether the Actor runs on the Apify platform (`True`) or elsewhere, e.g. locally (`False`)."""
    configuration = self._configuration
    return configuration.is_at_home
589
+
590
def get_env(self) -> dict:
    """Return a dictionary with information parsed from all the `APIFY_XXX` environment variables.

    For a list of all the environment variables, see the
    [Actor documentation](https://docs.apify.com/actors/development/environment-variables). If some variables
    are not defined or are invalid, the corresponding value in the resulting dictionary will be None.

    Raises:
        RuntimeError: If the Actor was not initialized.
    """
    self._raise_if_not_initialized()

    # Step 1: map every alias of each non-deprecated configuration field to its current value.
    values_by_alias = dict[str, Any]()
    for field_name, field in Configuration.model_fields.items():
        if field.deprecated:
            continue

        validation_alias = field.validation_alias
        if field.alias:
            aliases = [field.alias]
        elif isinstance(validation_alias, str):
            aliases = [validation_alias]
        elif isinstance(validation_alias, AliasChoices):
            aliases = cast(list[str], validation_alias.choices)
        else:
            # No alias declared, so the field is known under its own name.
            aliases = [field_name]

        values_by_alias.update({alias: getattr(self._configuration, field_name) for alias in aliases})

    # Step 2: translate lower-cased env var values into lower-cased enum member names
    # and keep only those the configuration actually knows about.
    env_vars = {env_var.value.lower(): env_var.name.lower() for env_var in [*ActorEnvVars, *ApifyEnvVars]}

    env: dict = {}
    for env_var, option_name in env_vars.items():
        if env_var in values_by_alias:
            env[option_name] = values_by_alias[env_var]
    return env
618
+
619
async def start(
    self,
    actor_id: str,
    run_input: Any = None,
    *,
    token: str | None = None,
    content_type: str | None = None,
    build: str | None = None,
    memory_mbytes: int | None = None,
    timeout: timedelta | None = None,
    wait_for_finish: int | None = None,
    webhooks: list[Webhook] | None = None,
) -> ActorRun:
    """Run an Actor on the Apify platform.

    Unlike `Actor.call`, this method just starts the run without waiting for finish.

    Args:
        actor_id: The ID of the Actor to be run.
        run_input: The input to pass to the Actor run.
        token: The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable).
        content_type: The content type of the input.
        build: Specifies the Actor build to run. It can be either a build tag or build number. By default,
            the run uses the build specified in the default run configuration for the Actor (typically latest).
        memory_mbytes: Memory limit for the run, in megabytes. By default, the run uses a memory limit specified
            in the default run configuration for the Actor.
        timeout: Optional timeout for the run, given as a `timedelta` and sent to the API in whole seconds.
            By default, the run uses timeout specified in the default run configuration for the Actor.
        wait_for_finish: The maximum number of seconds the server waits for the run to finish. By default,
            it is 0, the maximum value is 300.
        webhooks: Optional ad-hoc webhooks (https://docs.apify.com/webhooks/ad-hoc-webhooks) associated with
            the Actor run which can be used to receive a notification, e.g. when the Actor finished or failed.
            If you already have a webhook set up for the Actor or task, you do not have to add it again here.

    Returns:
        Info about the started Actor run

    Raises:
        RuntimeError: If the Actor was not initialized.
    """
    self._raise_if_not_initialized()

    # A dedicated client is created only when a custom token is supplied.
    client = self.new_client(token=token) if token else self._apify_client

    serialized_webhooks = (
        [hook.model_dump(by_alias=True, exclude_unset=True, exclude_defaults=True) for hook in webhooks]
        if webhooks
        else None
    )

    timeout_secs = None if timeout is None else int(timeout.total_seconds())

    api_result = await client.actor(actor_id).start(
        run_input=run_input,
        content_type=content_type,
        build=build,
        memory_mbytes=memory_mbytes,
        timeout_secs=timeout_secs,
        wait_for_finish=wait_for_finish,
        webhooks=serialized_webhooks,
    )

    return ActorRun.model_validate(api_result)
678
+
679
async def abort(
    self,
    run_id: str,
    *,
    token: str | None = None,
    status_message: str | None = None,
    gracefully: bool | None = None,
) -> ActorRun:
    """Abort given Actor run on the Apify platform using the current user account.

    The user account is determined by the `APIFY_TOKEN` environment variable.

    Args:
        run_id: The ID of the Actor run to be aborted.
        token: The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable).
        status_message: Status message of the Actor to be set on the platform. It is written to the run
            before the abort request is sent.
        gracefully: If True, the Actor run will abort gracefully. It will send `aborting` and `persistState`
            events into the run and force-stop the run after 30 seconds. It is helpful in cases where you plan
            to resurrect the run later.

    Returns:
        Info about the aborted Actor run.

    Raises:
        RuntimeError: If the Actor was not initialized.
    """
    self._raise_if_not_initialized()

    # A dedicated client is created only when a custom token is supplied.
    if token:
        client = self.new_client(token=token)
    else:
        client = self._apify_client

    if status_message:
        await client.run(run_id).update(status_message=status_message)

    api_result = await client.run(run_id).abort(gracefully=gracefully)

    return ActorRun.model_validate(api_result)
712
+
713
+ async def call(
714
+ self,
715
+ actor_id: str,
716
+ run_input: Any = None,
717
+ *,
718
+ token: str | None = None,
719
+ content_type: str | None = None,
720
+ build: str | None = None,
721
+ memory_mbytes: int | None = None,
722
+ timeout: timedelta | None = None,
723
+ webhooks: list[Webhook] | None = None,
724
+ wait: timedelta | None = None,
725
+ ) -> ActorRun | None:
726
+ """Start an Actor on the Apify Platform and wait for it to finish before returning.
727
+
728
+ It waits indefinitely, unless the wait argument is provided.
729
+
730
+ Args:
731
+ actor_id: The ID of the Actor to be run.
732
+ run_input: The input to pass to the Actor run.
733
+ token: The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable).
734
+ content_type: The content type of the input.
735
+ build: Specifies the Actor build to run. It can be either a build tag or build number. By default,
736
+ the run uses the build specified in the default run configuration for the Actor (typically latest).
737
+ memory_mbytes: Memory limit for the run, in megabytes. By default, the run uses a memory limit specified
738
+ in the default run configuration for the Actor.
739
+ timeout: Optional timeout for the run, given as a `timedelta`. By default, the run uses the timeout specified in
740
+ the default run configuration for the Actor.
741
+ webhooks: Optional webhooks (https://docs.apify.com/webhooks) associated with the Actor run, which can
742
+ be used to receive a notification, e.g. when the Actor finished or failed. If you already have
743
+ a webhook set up for the Actor, you do not have to add it again here.
744
+ wait: The maximum time (a `timedelta`) the server waits for the run to finish. If not provided,
745
+ waits indefinitely.
746
+
747
+ Returns:
748
+ Info about the started Actor run.
749
+ """
750
+ self._raise_if_not_initialized()
751
+
752
+ client = self.new_client(token=token) if token else self._apify_client
753
+
754
+ if webhooks:
755
+ serialized_webhooks = [
756
+ hook.model_dump(by_alias=True, exclude_unset=True, exclude_defaults=True) for hook in webhooks
757
+ ]
758
+ else:
759
+ serialized_webhooks = None
760
+
761
+ api_result = await client.actor(actor_id).call(
762
+ run_input=run_input,
763
+ content_type=content_type,
764
+ build=build,
765
+ memory_mbytes=memory_mbytes,
766
+ timeout_secs=int(timeout.total_seconds()) if timeout is not None else None,
767
+ webhooks=serialized_webhooks,
768
+ wait_secs=int(wait.total_seconds()) if wait is not None else None,
769
+ )
770
+
771
+ return ActorRun.model_validate(api_result)
772
+
773
+ async def call_task(
774
+ self,
775
+ task_id: str,
776
+ task_input: dict | None = None,
777
+ *,
778
+ build: str | None = None,
779
+ memory_mbytes: int | None = None,
780
+ timeout: timedelta | None = None,
781
+ webhooks: list[Webhook] | None = None,
782
+ wait: timedelta | None = None,
783
+ token: str | None = None,
784
+ ) -> ActorRun | None:
785
+ """Start an Actor task on the Apify Platform and wait for it to finish before returning.
786
+
787
+ It waits indefinitely, unless the wait argument is provided.
788
+
789
+ Note that an Actor task is a saved input configuration and options for an Actor. If you want to run an Actor
790
+ directly rather than an Actor task, please use `Actor.call` instead.
791
+
792
+ Args:
793
+ task_id: The ID of the Actor task to be run.
794
+ task_input: Overrides the input to pass to the Actor run.
795
+ token: The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable).
796
+ content_type: The content type of the input.
797
+ build: Specifies the Actor build to run. It can be either a build tag or build number. By default,
798
+ the run uses the build specified in the default run configuration for the Actor (typically latest).
799
+ memory_mbytes: Memory limit for the run, in megabytes. By default, the run uses a memory limit specified
800
+ in the default run configuration for the Actor.
801
+ timeout: Optional timeout for the run, given as a `timedelta`. By default, the run uses the timeout specified in
802
+ the default run configuration for the Actor.
803
+ webhooks: Optional webhooks (https://docs.apify.com/webhooks) associated with the Actor run, which can
804
+ be used to receive a notification, e.g. when the Actor finished or failed. If you already have
805
+ a webhook set up for the Actor, you do not have to add it again here.
806
+ wait: The maximum time (a `timedelta`) the server waits for the run to finish. If not provided, waits
807
+ indefinitely.
808
+
809
+ Returns:
810
+ Info about the started Actor run.
811
+ """
812
+ self._raise_if_not_initialized()
813
+
814
+ client = self.new_client(token=token) if token else self._apify_client
815
+
816
+ if webhooks:
817
+ serialized_webhooks = [
818
+ hook.model_dump(by_alias=True, exclude_unset=True, exclude_defaults=True) for hook in webhooks
819
+ ]
820
+ else:
821
+ serialized_webhooks = None
822
+
823
+ api_result = await client.task(task_id).call(
824
+ task_input=task_input,
825
+ build=build,
826
+ memory_mbytes=memory_mbytes,
827
+ timeout_secs=int(timeout.total_seconds()) if timeout is not None else None,
828
+ webhooks=serialized_webhooks,
829
+ wait_secs=int(wait.total_seconds()) if wait is not None else None,
830
+ )
831
+
832
+ return ActorRun.model_validate(api_result)
833
+
834
+ async def metamorph(
835
+ self,
836
+ target_actor_id: str,
837
+ run_input: Any = None,
838
+ *,
839
+ target_actor_build: str | None = None,
840
+ content_type: str | None = None,
841
+ custom_after_sleep: timedelta | None = None,
842
+ ) -> None:
843
+ """Transform this Actor run to an Actor run of a different Actor.
844
+
845
+ The platform stops the current Actor container and starts a new container with the new Actor instead. All
846
+ the default storages are preserved, and the new input is stored under the `INPUT-METAMORPH-1` key in the same
847
+ default key-value store.
848
+
849
+ Args:
850
+ target_actor_id: ID of the target Actor that the run should be transformed into
851
+ run_input: The input to pass to the new run.
852
+ target_actor_build: The build of the target Actor. It can be either a build tag or build number.
853
+ By default, the run uses the build specified in the default run configuration for the target Actor
854
+ (typically the latest build).
855
+ content_type: The content type of the input.
856
+ custom_after_sleep: How long to sleep for after the metamorph, to wait for the container to be stopped.
857
+ """
858
+ self._raise_if_not_initialized()
859
+
860
+ if not self.is_at_home():
861
+ self.log.error('Actor.metamorph() is only supported when running on the Apify platform.')
862
+ return
863
+
864
+ if not custom_after_sleep:
865
+ custom_after_sleep = self._configuration.metamorph_after_sleep
866
+
867
+ # If is_at_home() is True, config.actor_run_id is always set
868
+ if not self._configuration.actor_run_id:
869
+ raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.')
870
+
871
+ await self._apify_client.run(self._configuration.actor_run_id).metamorph(
872
+ target_actor_id=target_actor_id,
873
+ run_input=run_input,
874
+ target_actor_build=target_actor_build,
875
+ content_type=content_type,
876
+ )
877
+
878
+ if custom_after_sleep:
879
+ await asyncio.sleep(custom_after_sleep.total_seconds())
880
+
881
+ async def reboot(
882
+ self,
883
+ *,
884
+ event_listeners_timeout: timedelta | None = EVENT_LISTENERS_TIMEOUT, # noqa: ARG002
885
+ custom_after_sleep: timedelta | None = None,
886
+ ) -> None:
887
+ """Internally reboot this Actor.
888
+
889
+ The system stops the current container and starts a new one, with the same run ID and default storages.
890
+
891
+ Args:
892
+ event_listeners_timeout: How long should the Actor wait for Actor event listeners to finish before exiting
893
+ custom_after_sleep: How long to sleep for after the reboot, to wait for the container to be stopped.
894
+ """
895
+ self._raise_if_not_initialized()
896
+
897
+ if not self.is_at_home():
898
+ self.log.error('Actor.reboot() is only supported when running on the Apify platform.')
899
+ return
900
+
901
+ if self._is_rebooting:
902
+ self.log.debug('Actor is already rebooting, skipping the additional reboot call.')
903
+ return
904
+
905
+ self._is_rebooting = True
906
+
907
+ if not custom_after_sleep:
908
+ custom_after_sleep = self._configuration.metamorph_after_sleep
909
+
910
+ # Call all the listeners for the PERSIST_STATE and MIGRATING events, and wait for them to finish.
911
+ # PERSIST_STATE listeners are called to allow the Actor to persist its state before the reboot.
912
+ # MIGRATING listeners are called to allow the Actor to gracefully stop in-progress tasks before the reboot.
913
+ # Typically, crawlers are listening for the MIGRATING event to stop processing new requests.
914
+ # We can't just emit the events and wait for all listeners to finish,
915
+ # because this method might be called from an event listener itself, and we would deadlock.
916
+ persist_state_listeners = flatten(
917
+ (self._event_manager._listeners_to_wrappers[Event.PERSIST_STATE] or {}).values() # noqa: SLF001
918
+ )
919
+ migrating_listeners = flatten(
920
+ (self._event_manager._listeners_to_wrappers[Event.MIGRATING] or {}).values() # noqa: SLF001
921
+ )
922
+
923
+ await asyncio.gather(
924
+ *[listener(EventPersistStateData(is_migrating=True)) for listener in persist_state_listeners],
925
+ *[listener(EventMigratingData()) for listener in migrating_listeners],
926
+ )
927
+
928
+ if not self._configuration.actor_run_id:
929
+ raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.')
930
+
931
+ await self._apify_client.run(self._configuration.actor_run_id).reboot()
932
+
933
+ if custom_after_sleep:
934
+ await asyncio.sleep(custom_after_sleep.total_seconds())
935
+
936
+ async def add_webhook(
937
+ self,
938
+ webhook: Webhook,
939
+ *,
940
+ ignore_ssl_errors: bool | None = None,
941
+ do_not_retry: bool | None = None,
942
+ idempotency_key: str | None = None,
943
+ ) -> None:
944
+ """Create an ad-hoc webhook for the current Actor run.
945
+
946
+ This webhook lets you receive a notification when the Actor run finished or failed.
947
+
948
+ Note that webhooks are only supported for Actors running on the Apify platform. When running the Actor locally,
949
+ the function will log an error and have no effect.
950
+
951
+ For more information about Apify Actor webhooks, please see the [documentation](https://docs.apify.com/webhooks).
952
+
953
+ Args:
954
+ webhook: The webhook to be added
955
+ ignore_ssl_errors: Whether the webhook should ignore SSL errors returned by request_url
956
+ do_not_retry: Whether the webhook should not retry sending the payload to request_url upon failure.
957
+ idempotency_key: A unique identifier of a webhook. You can use it to ensure that you won't create
958
+ the same webhook multiple times.
959
+
960
+ Returns:
961
+ Nothing; the webhook is created through the API client but is not returned to the caller.
962
+ """
963
+ self._raise_if_not_initialized()
964
+
965
+ if not self.is_at_home():
966
+ self.log.error('Actor.add_webhook() is only supported when running on the Apify platform.')
967
+ return
968
+
969
+ # If is_at_home() is True, config.actor_run_id is always set
970
+ if not self._configuration.actor_run_id:
971
+ raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.')
972
+
973
+ await self._apify_client.webhooks().create(
974
+ actor_run_id=self._configuration.actor_run_id,
975
+ event_types=webhook.event_types,
976
+ request_url=webhook.request_url,
977
+ payload_template=webhook.payload_template,
978
+ ignore_ssl_errors=ignore_ssl_errors,
979
+ do_not_retry=do_not_retry,
980
+ idempotency_key=idempotency_key,
981
+ )
982
+
983
+ async def set_status_message(
984
+ self,
985
+ status_message: str,
986
+ *,
987
+ is_terminal: bool | None = None,
988
+ ) -> ActorRun | None:
989
+ """Set the status message for the current Actor run.
990
+
991
+ Args:
992
+ status_message: The status message to set to the run.
993
+ is_terminal: Set this flag to True if this is the final status message of the Actor run.
994
+
995
+ Returns:
996
+ The updated Actor run object.
997
+ """
998
+ self._raise_if_not_initialized()
999
+
1000
+ if not self.is_at_home():
1001
+ title = 'Terminal status message' if is_terminal else 'Status message'
1002
+ self.log.info(f'[{title}]: {status_message}')
1003
+ return None
1004
+
1005
+ # If is_at_home() is True, config.actor_run_id is always set
1006
+ if not self._configuration.actor_run_id:
1007
+ raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.')
1008
+
1009
+ api_result = await self._apify_client.run(self._configuration.actor_run_id).update(
1010
+ status_message=status_message, is_status_message_terminal=is_terminal
1011
+ )
1012
+
1013
+ return ActorRun.model_validate(api_result)
1014
+
1015
+ async def create_proxy_configuration(
1016
+ self,
1017
+ *,
1018
+ actor_proxy_input: dict
1019
+ | None = None, # this is the raw proxy input from the actor run input, it is not spread or snake_cased in here
1020
+ password: str | None = None,
1021
+ groups: list[str] | None = None,
1022
+ country_code: str | None = None,
1023
+ proxy_urls: list[str | None] | None = None,
1024
+ new_url_function: _NewUrlFunction | None = None,
1025
+ ) -> ProxyConfiguration | None:
1026
+ """Create a ProxyConfiguration object with the passed proxy configuration.
1027
+
1028
+ Configures connection to a proxy server with the provided options. Proxy servers are used to prevent target
1029
+ websites from blocking your crawlers based on IP address rate limits or blacklists.
1030
+
1031
+ For more details and code examples, see the `ProxyConfiguration` class.
1032
+
1033
+ Args:
1034
+ actor_proxy_input: Proxy configuration field from the Actor input, if the input has such a field. If you
1035
+ pass this argument, all the other arguments will be inferred from it.
1036
+ password: Password for the Apify Proxy. If not provided, will use os.environ['APIFY_PROXY_PASSWORD'],
1037
+ if available.
1038
+ groups: Proxy groups which the Apify Proxy should use, if provided.
1039
+ country_code: Country which the Apify Proxy should use, if provided.
1040
+ proxy_urls: Custom proxy server URLs which should be rotated through.
1041
+ new_url_function: Function which returns a custom proxy URL to be used.
1042
+
1043
+ Returns:
1044
+ ProxyConfiguration object with the passed configuration, or None, if no proxy should be used based
1045
+ on the configuration.
1046
+ """
1047
+ self._raise_if_not_initialized()
1048
+
1049
+ if actor_proxy_input is not None:
1050
+ if actor_proxy_input.get('useApifyProxy', False):
1051
+ country_code = country_code or actor_proxy_input.get('apifyProxyCountry')
1052
+ groups = groups or actor_proxy_input.get('apifyProxyGroups')
1053
+ else:
1054
+ proxy_urls = actor_proxy_input.get('proxyUrls', [])
1055
+ if not proxy_urls:
1056
+ return None
1057
+
1058
+ proxy_configuration = ProxyConfiguration(
1059
+ password=password,
1060
+ groups=groups,
1061
+ country_code=country_code,
1062
+ proxy_urls=proxy_urls,
1063
+ new_url_function=new_url_function,
1064
+ _actor_config=self._configuration,
1065
+ _apify_client=self._apify_client,
1066
+ )
1067
+
1068
+ await proxy_configuration.initialize()
1069
+
1070
+ return proxy_configuration
1071
+
1072
+
1073
+ Actor = cast(_ActorType, Proxy(_ActorType))
1074
+ """The entry point of the SDK, through which all the Actor operations should be done."""