apify 2.7.2__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of apify might be problematic. Click here for more details.

Files changed (51)
  1. apify/_actor.py +194 -126
  2. apify/_charging.py +34 -9
  3. apify/_configuration.py +79 -6
  4. apify/_crypto.py +0 -6
  5. apify/_models.py +7 -7
  6. apify/_proxy_configuration.py +10 -10
  7. apify/_utils.py +25 -2
  8. apify/events/__init__.py +5 -0
  9. apify/events/_apify_event_manager.py +140 -0
  10. apify/events/_types.py +102 -0
  11. apify/log.py +0 -9
  12. apify/request_loaders/__init__.py +18 -0
  13. apify/{storages/_request_list.py → request_loaders/_apify_request_list.py} +25 -18
  14. apify/request_loaders/py.typed +0 -0
  15. apify/scrapy/_logging_config.py +1 -4
  16. apify/scrapy/extensions/_httpcache.py +9 -5
  17. apify/scrapy/requests.py +3 -3
  18. apify/scrapy/scheduler.py +8 -5
  19. apify/storage_clients/__init__.py +12 -0
  20. apify/storage_clients/_apify/__init__.py +11 -0
  21. apify/storage_clients/_apify/_dataset_client.py +328 -0
  22. apify/storage_clients/_apify/_key_value_store_client.py +265 -0
  23. apify/storage_clients/_apify/_models.py +131 -0
  24. apify/storage_clients/_apify/_request_queue_client.py +327 -0
  25. apify/storage_clients/_apify/_request_queue_shared_client.py +527 -0
  26. apify/storage_clients/_apify/_request_queue_single_client.py +399 -0
  27. apify/storage_clients/_apify/_storage_client.py +106 -0
  28. apify/storage_clients/_apify/_utils.py +194 -0
  29. apify/storage_clients/_apify/py.typed +0 -0
  30. apify/storage_clients/_file_system/__init__.py +2 -0
  31. apify/storage_clients/_file_system/_key_value_store_client.py +57 -0
  32. apify/storage_clients/_file_system/_storage_client.py +41 -0
  33. apify/storage_clients/_smart_apify/__init__.py +1 -0
  34. apify/storage_clients/_smart_apify/_storage_client.py +117 -0
  35. apify/storage_clients/py.typed +0 -0
  36. apify/storages/__init__.py +1 -3
  37. {apify-2.7.2.dist-info → apify-3.0.0.dist-info}/METADATA +25 -9
  38. apify-3.0.0.dist-info/RECORD +57 -0
  39. apify/_platform_event_manager.py +0 -231
  40. apify/apify_storage_client/__init__.py +0 -3
  41. apify/apify_storage_client/_apify_storage_client.py +0 -72
  42. apify/apify_storage_client/_dataset_client.py +0 -190
  43. apify/apify_storage_client/_dataset_collection_client.py +0 -51
  44. apify/apify_storage_client/_key_value_store_client.py +0 -109
  45. apify/apify_storage_client/_key_value_store_collection_client.py +0 -51
  46. apify/apify_storage_client/_request_queue_client.py +0 -176
  47. apify/apify_storage_client/_request_queue_collection_client.py +0 -51
  48. apify-2.7.2.dist-info/RECORD +0 -44
  49. /apify/{apify_storage_client → events}/py.typed +0 -0
  50. {apify-2.7.2.dist-info → apify-3.0.0.dist-info}/WHEEL +0 -0
  51. {apify-2.7.2.dist-info → apify-3.0.0.dist-info}/licenses/LICENSE +0 -0
apify/_charging.py CHANGED
@@ -4,11 +4,10 @@ import math
4
4
  from dataclasses import dataclass
5
5
  from datetime import datetime, timezone
6
6
  from decimal import Decimal
7
- from typing import TYPE_CHECKING, Protocol, Union
7
+ from typing import TYPE_CHECKING, Protocol
8
8
 
9
9
  from pydantic import TypeAdapter
10
10
 
11
- from apify_shared.utils import ignore_docs
12
11
  from crawlee._utils.context import ensure_context
13
12
 
14
13
  from apify._models import ActorRun, PricingModel
@@ -23,13 +22,21 @@ if TYPE_CHECKING:
23
22
 
24
23
  from apify._configuration import Configuration
25
24
 
25
+ run_validator = TypeAdapter[ActorRun | None](ActorRun | None)
26
26
 
27
- run_validator: TypeAdapter[ActorRun | None] = TypeAdapter(Union[ActorRun, None])
28
27
 
29
-
30
- @docs_group('Interfaces')
28
+ @docs_group('Charging')
31
29
  class ChargingManager(Protocol):
32
- """Provides fine-grained access to pay-per-event functionality."""
30
+ """Provides fine-grained access to pay-per-event functionality.
31
+
32
+ The ChargingManager allows you to charge for specific events in your Actor when using
33
+ the pay-per-event pricing model. This enables precise cost control and transparent
34
+ billing for different operations within your Actor.
35
+
36
+ ### References
37
+
38
+ - Apify platform documentation: https://docs.apify.com/platform/actors/publishing/monetize
39
+ """
33
40
 
34
41
  async def charge(self, event_name: str, count: int = 1) -> ChargeResult:
35
42
  """Charge for a specified number of events - sub-operations of the Actor.
@@ -57,8 +64,18 @@ class ChargingManager(Protocol):
57
64
  This can be used for instance when your code needs to support multiple pricing models in transition periods.
58
65
  """
59
66
 
67
+ def get_charged_event_count(self, event_name: str) -> int:
68
+ """Get the number of events with the given name that were charged so far.
69
+
70
+ Args:
71
+ event_name: Name of the inspected event.
72
+ """
73
+
74
+ def get_max_total_charge_usd(self) -> Decimal:
75
+ """Get the configured maximum total charge for this Actor run."""
60
76
 
61
- @docs_group('Data structures')
77
+
78
+ @docs_group('Charging')
62
79
  @dataclass(frozen=True)
63
80
  class ChargeResult:
64
81
  """Result of the `ChargingManager.charge` method."""
@@ -73,7 +90,7 @@ class ChargeResult:
73
90
  """How many events of each known type can still be charged within the limit."""
74
91
 
75
92
 
76
- @docs_group('Data structures')
93
+ @docs_group('Charging')
77
94
  @dataclass
78
95
  class ActorPricingInfo:
79
96
  """Result of the `ChargingManager.get_pricing_info` method."""
@@ -91,7 +108,6 @@ class ActorPricingInfo:
91
108
  """Price of every known event type."""
92
109
 
93
110
 
94
- @ignore_docs
95
111
  class ChargingManagerImplementation(ChargingManager):
96
112
  """Implementation of the `ChargingManager` Protocol - this is only meant to be instantiated internally."""
97
113
 
@@ -303,6 +319,15 @@ class ChargingManagerImplementation(ChargingManager):
303
319
  },
304
320
  )
305
321
 
322
+ @ensure_context
323
+ def get_charged_event_count(self, event_name: str) -> int:
324
+ item = self._charging_state.get(event_name)
325
+ return item.charge_count if item is not None else 0
326
+
327
+ @ensure_context
328
+ def get_max_total_charge_usd(self) -> Decimal:
329
+ return self._max_total_charge_usd
330
+
306
331
 
307
332
  @dataclass
308
333
  class ChargingStateItem:
apify/_configuration.py CHANGED
@@ -8,6 +8,7 @@ from typing import Annotated, Any
8
8
  from pydantic import AliasChoices, BeforeValidator, Field, model_validator
9
9
  from typing_extensions import Self, deprecated
10
10
 
11
+ from crawlee import service_locator
11
12
  from crawlee._utils.models import timedelta_ms
12
13
  from crawlee._utils.urls import validate_http_url
13
14
  from crawlee.configuration import Configuration as CrawleeConfiguration
@@ -25,7 +26,7 @@ def _transform_to_list(value: Any) -> list[str] | None:
25
26
  return value if isinstance(value, list) else str(value).split(',')
26
27
 
27
28
 
28
- @docs_group('Classes')
29
+ @docs_group('Configuration')
29
30
  class Configuration(CrawleeConfiguration):
30
31
  """A class for specifying the configuration of an Actor.
31
32
 
@@ -140,6 +141,39 @@ class Configuration(CrawleeConfiguration):
140
141
  ),
141
142
  ] = None
142
143
 
144
+ default_dataset_id: Annotated[
145
+ str | None,
146
+ Field(
147
+ validation_alias=AliasChoices(
148
+ 'actor_default_dataset_id',
149
+ 'apify_default_dataset_id',
150
+ ),
151
+ description='Default dataset ID used by the Apify storage client when no ID or name is provided.',
152
+ ),
153
+ ] = None
154
+
155
+ default_key_value_store_id: Annotated[
156
+ str | None,
157
+ Field(
158
+ validation_alias=AliasChoices(
159
+ 'actor_default_key_value_store_id',
160
+ 'apify_default_key_value_store_id',
161
+ ),
162
+ description='Default key-value store ID for the Apify storage client when no ID or name is provided.',
163
+ ),
164
+ ] = None
165
+
166
+ default_request_queue_id: Annotated[
167
+ str | None,
168
+ Field(
169
+ validation_alias=AliasChoices(
170
+ 'actor_default_request_queue_id',
171
+ 'apify_default_request_queue_id',
172
+ ),
173
+ description='Default request queue ID for the Apify storage client when no ID or name is provided.',
174
+ ),
175
+ ] = None
176
+
143
177
  disable_outdated_warning: Annotated[
144
178
  bool,
145
179
  Field(
@@ -334,6 +368,15 @@ class Configuration(CrawleeConfiguration):
334
368
  ),
335
369
  ] = None
336
370
 
371
+ user_is_paying: Annotated[
372
+ bool,
373
+ Field(
374
+ alias='apify_user_is_paying',
375
+ description='True if the user calling the Actor is paying user',
376
+ ),
377
+ BeforeValidator(lambda val: False if val == '' else val),
378
+ ] = False
379
+
337
380
  web_server_port: Annotated[
338
381
  int,
339
382
  Field(
@@ -382,11 +425,41 @@ class Configuration(CrawleeConfiguration):
382
425
  def get_global_configuration(cls) -> Configuration:
383
426
  """Retrieve the global instance of the configuration.
384
427
 
385
- Mostly for the backwards compatibility. It is recommended to use the `service_locator.get_configuration()`
386
- instead.
428
+ This method ensures that ApifyConfigration is returned, even if CrawleeConfiguration was set in the
429
+ service locator.
430
+ """
431
+ global_configuration = service_locator.get_configuration()
432
+
433
+ if isinstance(global_configuration, Configuration):
434
+ # If Apify configuration was already stored in service locator, return it.
435
+ return global_configuration
436
+
437
+ logger.warning(
438
+ 'Non Apify Configration is set in the `service_locator` in the SDK context. '
439
+ 'It is recommended to set `apify.Configuration` explicitly as early as possible by using '
440
+ 'service_locator.set_configuration'
441
+ )
442
+
443
+ return cls.from_configuration(global_configuration)
444
+
445
+ @classmethod
446
+ def from_configuration(cls, configuration: CrawleeConfiguration) -> Configuration:
447
+ """Create Apify Configuration from existing Crawlee Configuration.
448
+
449
+ Args:
450
+ configuration: The existing Crawlee Configuration.
451
+
452
+ Returns:
453
+ The created Apify Configuration.
387
454
  """
388
- return cls()
455
+ apify_configuration = cls()
389
456
 
457
+ # Ensure the returned configuration is of type Apify Configuration.
458
+ # Most likely crawlee configuration was already set. Create Apify configuration from it.
459
+ # Due to known Pydantic issue https://github.com/pydantic/pydantic/issues/9516, creating new instance of
460
+ # Configuration from existing one in situation where environment can have some fields set by alias is very
461
+ # unpredictable. Use the stable workaround.
462
+ for name in configuration.model_fields:
463
+ setattr(apify_configuration, name, getattr(configuration, name))
390
464
 
391
- # Monkey-patch the base class so that it works with the extended configuration
392
- CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration # type: ignore[method-assign]
465
+ return apify_configuration
apify/_crypto.py CHANGED
@@ -12,7 +12,6 @@ from cryptography.hazmat.primitives import hashes, serialization
12
12
  from cryptography.hazmat.primitives.asymmetric import padding, rsa
13
13
  from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
14
14
 
15
- from apify_shared.utils import ignore_docs
16
15
  from crawlee._utils.crypto import crypto_random_object_id
17
16
 
18
17
  from apify._consts import ENCRYPTED_INPUT_VALUE_REGEXP, ENCRYPTED_JSON_VALUE_PREFIX, ENCRYPTED_STRING_VALUE_PREFIX
@@ -22,7 +21,6 @@ ENCRYPTION_IV_LENGTH = 16
22
21
  ENCRYPTION_AUTH_TAG_LENGTH = 16
23
22
 
24
23
 
25
- @ignore_docs
26
24
  def public_encrypt(value: str, *, public_key: rsa.RSAPublicKey) -> dict:
27
25
  """Encrypts the given value using AES cipher and the password for encryption using the public key.
28
26
 
@@ -66,7 +64,6 @@ def public_encrypt(value: str, *, public_key: rsa.RSAPublicKey) -> dict:
66
64
  }
67
65
 
68
66
 
69
- @ignore_docs
70
67
  def private_decrypt(
71
68
  encrypted_password: str,
72
69
  encrypted_value: str,
@@ -118,7 +115,6 @@ def private_decrypt(
118
115
  return decipher_bytes.decode('utf-8')
119
116
 
120
117
 
121
- @ignore_docs
122
118
  def load_private_key(private_key_file_base64: str, private_key_password: str) -> rsa.RSAPrivateKey:
123
119
  private_key = serialization.load_pem_private_key(
124
120
  base64.b64decode(private_key_file_base64.encode('utf-8')),
@@ -138,7 +134,6 @@ def _load_public_key(public_key_file_base64: str) -> rsa.RSAPublicKey:
138
134
  return public_key
139
135
 
140
136
 
141
- @ignore_docs
142
137
  def decrypt_input_secrets(private_key: rsa.RSAPrivateKey, input_data: Any) -> Any:
143
138
  """Decrypt input secrets."""
144
139
  if not isinstance(input_data, dict):
@@ -180,7 +175,6 @@ def encode_base62(num: int) -> str:
180
175
  return res
181
176
 
182
177
 
183
- @ignore_docs
184
178
  def create_hmac_signature(secret_key: str, message: str) -> str:
185
179
  """Generate an HMAC signature and encodes it using Base62. Base62 encoding reduces the signature length.
186
180
 
apify/_models.py CHANGED
@@ -13,10 +13,10 @@ from crawlee._utils.urls import validate_http_url
13
13
  from apify._utils import docs_group
14
14
 
15
15
  if TYPE_CHECKING:
16
- from typing_extensions import TypeAlias
16
+ from typing import TypeAlias
17
17
 
18
18
 
19
- @docs_group('Data structures')
19
+ @docs_group('Actor')
20
20
  class Webhook(BaseModel):
21
21
  __model_config__ = ConfigDict(populate_by_name=True)
22
22
 
@@ -35,14 +35,14 @@ class Webhook(BaseModel):
35
35
  ] = None
36
36
 
37
37
 
38
- @docs_group('Data structures')
38
+ @docs_group('Actor')
39
39
  class ActorRunMeta(BaseModel):
40
40
  __model_config__ = ConfigDict(populate_by_name=True)
41
41
 
42
42
  origin: Annotated[MetaOrigin, Field()]
43
43
 
44
44
 
45
- @docs_group('Data structures')
45
+ @docs_group('Actor')
46
46
  class ActorRunStats(BaseModel):
47
47
  __model_config__ = ConfigDict(populate_by_name=True)
48
48
 
@@ -63,7 +63,7 @@ class ActorRunStats(BaseModel):
63
63
  compute_units: Annotated[float, Field(alias='computeUnits')]
64
64
 
65
65
 
66
- @docs_group('Data structures')
66
+ @docs_group('Actor')
67
67
  class ActorRunOptions(BaseModel):
68
68
  __model_config__ = ConfigDict(populate_by_name=True)
69
69
 
@@ -74,7 +74,7 @@ class ActorRunOptions(BaseModel):
74
74
  max_total_charge_usd: Annotated[Decimal | None, Field(alias='maxTotalChargeUsd')] = None
75
75
 
76
76
 
77
- @docs_group('Data structures')
77
+ @docs_group('Actor')
78
78
  class ActorRunUsage(BaseModel):
79
79
  __model_config__ = ConfigDict(populate_by_name=True)
80
80
 
@@ -92,7 +92,7 @@ class ActorRunUsage(BaseModel):
92
92
  proxy_serps: Annotated[float | None, Field(alias='PROXY_SERPS')] = None
93
93
 
94
94
 
95
- @docs_group('Data structures')
95
+ @docs_group('Actor')
96
96
  class ActorRun(BaseModel):
97
97
  __model_config__ = ConfigDict(populate_by_name=True)
98
98
 
apify/_proxy_configuration.py CHANGED
@@ -1,16 +1,17 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import ipaddress
4
+ import json
4
5
  import re
5
6
  from dataclasses import dataclass, field
6
7
  from re import Pattern
7
8
  from typing import TYPE_CHECKING, Any
8
9
  from urllib.parse import urljoin, urlparse
9
10
 
10
- import httpx
11
+ import impit
12
+ from yarl import URL
11
13
 
12
14
  from apify_shared.consts import ApifyEnvVars
13
- from apify_shared.utils import ignore_docs
14
15
  from crawlee.proxy_configuration import ProxyConfiguration as CrawleeProxyConfiguration
15
16
  from crawlee.proxy_configuration import ProxyInfo as CrawleeProxyInfo
16
17
  from crawlee.proxy_configuration import _NewUrlFunction
@@ -21,14 +22,14 @@ from apify.log import logger
21
22
 
22
23
  if TYPE_CHECKING:
23
24
  from apify_client import ApifyClientAsync
24
- from crawlee import Request
25
+
26
+ from apify import Request
25
27
 
26
28
  APIFY_PROXY_VALUE_REGEX = re.compile(r'^[\w._~]+$')
27
29
  COUNTRY_CODE_REGEX = re.compile(r'^[A-Z]{2}$')
28
30
  SESSION_ID_MAX_LENGTH = 50
29
31
 
30
32
 
31
- @ignore_docs
32
33
  def is_url(url: str) -> bool:
33
34
  """Check if the given string is a valid URL."""
34
35
  try:
@@ -69,7 +70,7 @@ def _check(
69
70
  raise ValueError(f'{error_str} does not match pattern {pattern.pattern!r}')
70
71
 
71
72
 
72
- @docs_group('Classes')
73
+ @docs_group('Configuration')
73
74
  @dataclass
74
75
  class ProxyInfo(CrawleeProxyInfo):
75
76
  """Provides information about a proxy connection that is used for requests."""
@@ -89,7 +90,7 @@ class ProxyInfo(CrawleeProxyInfo):
89
90
  """
90
91
 
91
92
 
92
- @docs_group('Classes')
93
+ @docs_group('Configuration')
93
94
  class ProxyConfiguration(CrawleeProxyConfiguration):
94
95
  """Configures a connection to a proxy server with the provided options.
95
96
 
@@ -104,7 +105,6 @@ class ProxyConfiguration(CrawleeProxyConfiguration):
104
105
 
105
106
  _configuration: Configuration
106
107
 
107
- @ignore_docs
108
108
  def __init__(
109
109
  self,
110
110
  *,
@@ -233,7 +233,7 @@ class ProxyConfiguration(CrawleeProxyConfiguration):
233
233
  return None
234
234
 
235
235
  if self._uses_apify_proxy:
236
- parsed_url = httpx.URL(proxy_info.url)
236
+ parsed_url = URL(proxy_info.url)
237
237
  username = self._get_username(session_id)
238
238
 
239
239
  return ProxyInfo(
@@ -277,11 +277,11 @@ class ProxyConfiguration(CrawleeProxyConfiguration):
277
277
  return
278
278
 
279
279
  status = None
280
- async with httpx.AsyncClient(proxy=proxy_info.url, timeout=10) as client:
280
+ async with impit.AsyncClient(proxy=proxy_info.url, timeout=10) as client:
281
281
  for _ in range(2):
282
282
  try:
283
283
  response = await client.get(proxy_status_url)
284
- status = response.json()
284
+ status = json.loads(response.text)
285
285
  break
286
286
  except Exception: # noqa: S110
287
287
  # retry on connection errors
apify/_utils.py CHANGED
@@ -2,8 +2,12 @@ from __future__ import annotations
2
2
 
3
3
  import builtins
4
4
  import sys
5
+ from enum import Enum
5
6
  from importlib import metadata
6
- from typing import Callable, Literal
7
+ from typing import TYPE_CHECKING, Any, Literal
8
+
9
+ if TYPE_CHECKING:
10
+ from collections.abc import Callable
7
11
 
8
12
 
9
13
  def get_system_info() -> dict:
@@ -27,7 +31,19 @@ def is_running_in_ipython() -> bool:
27
31
  return getattr(builtins, '__IPYTHON__', False)
28
32
 
29
33
 
30
- GroupName = Literal['Classes', 'Abstract classes', 'Interfaces', 'Data structures', 'Errors', 'Functions']
34
+ # The order of the rendered API groups is defined in the website/docusaurus.config.js file.
35
+ GroupName = Literal[
36
+ 'Actor',
37
+ 'Charging',
38
+ 'Configuration',
39
+ 'Event data',
40
+ 'Event managers',
41
+ 'Events',
42
+ 'Request loaders',
43
+ 'Storage clients',
44
+ 'Storage data',
45
+ 'Storages',
46
+ ]
31
47
 
32
48
 
33
49
  def docs_group(group_name: GroupName) -> Callable: # noqa: ARG001
@@ -66,3 +82,10 @@ def docs_name(symbol_name: str) -> Callable: # noqa: ARG001
66
82
  return func
67
83
 
68
84
  return wrapper
85
+
86
+
87
+ def maybe_extract_enum_member_value(maybe_enum_member: Any) -> Any:
88
+ """Extract the value of an enumeration member if it is an Enum, otherwise return the original value."""
89
+ if isinstance(maybe_enum_member, Enum):
90
+ return maybe_enum_member.value
91
+ return maybe_enum_member
apify/events/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from crawlee.events import Event, EventManager, LocalEventManager
2
+
3
+ from ._apify_event_manager import ApifyEventManager
4
+
5
+ __all__ = ['ApifyEventManager', 'Event', 'EventManager', 'LocalEventManager']
apify/events/_apify_event_manager.py ADDED
@@ -0,0 +1,140 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import contextlib
5
+ from typing import TYPE_CHECKING, Annotated
6
+
7
+ import websockets.asyncio.client
8
+ from pydantic import Discriminator, TypeAdapter
9
+ from typing_extensions import Self, Unpack, override
10
+
11
+ from crawlee.events import EventManager
12
+ from crawlee.events._types import Event, EventPersistStateData
13
+
14
+ from apify._utils import docs_group
15
+ from apify.events._types import DeprecatedEvent, EventMessage, SystemInfoEventData, UnknownEvent
16
+ from apify.log import logger
17
+
18
+ if TYPE_CHECKING:
19
+ from types import TracebackType
20
+
21
+ from crawlee.events._event_manager import EventManagerOptions
22
+
23
+ from apify._configuration import Configuration
24
+
25
+
26
+ event_data_adapter = TypeAdapter[EventMessage | DeprecatedEvent | UnknownEvent](
27
+ Annotated[EventMessage, Discriminator('name')] | DeprecatedEvent | UnknownEvent
28
+ )
29
+
30
+
31
+ @docs_group('Event managers')
32
+ class ApifyEventManager(EventManager):
33
+ """Event manager for the Apify platform.
34
+
35
+ This class extends Crawlee's `EventManager` to provide Apify-specific functionality, including websocket
36
+ connectivity to the Apify platform for receiving platform events.
37
+
38
+ The event manager handles:
39
+ - Registration and emission of events and their listeners.
40
+ - Websocket connection to Apify platform events.
41
+ - Processing and validation of platform messages.
42
+ - Automatic event forwarding from the platform to local event listeners.
43
+
44
+ This class should not be used directly. Use the `Actor.on` and `Actor.off` methods to interact
45
+ with the event system.
46
+ """
47
+
48
+ def __init__(self, configuration: Configuration, **kwargs: Unpack[EventManagerOptions]) -> None:
49
+ """Initialize a new instance.
50
+
51
+ Args:
52
+ configuration: The Actor configuration for the event manager.
53
+ **kwargs: Additional event manager options passed to the parent class.
54
+ """
55
+ super().__init__(**kwargs)
56
+
57
+ self._configuration = configuration
58
+ """The Actor configuration for the event manager."""
59
+
60
+ self._platform_events_websocket: websockets.asyncio.client.ClientConnection | None = None
61
+ """WebSocket connection to the platform events."""
62
+
63
+ self._process_platform_messages_task: asyncio.Task | None = None
64
+ """Task for processing messages from the platform websocket."""
65
+
66
+ self._connected_to_platform_websocket: asyncio.Future[bool] | None = None
67
+ """Future that resolves when the connection to the platform websocket is established."""
68
+
69
+ @override
70
+ async def __aenter__(self) -> Self:
71
+ await super().__aenter__()
72
+ self._connected_to_platform_websocket = asyncio.Future()
73
+
74
+ # Run tasks but don't await them
75
+ if self._configuration.actor_events_ws_url:
76
+ self._process_platform_messages_task = asyncio.create_task(
77
+ self._process_platform_messages(self._configuration.actor_events_ws_url)
78
+ )
79
+ is_connected = await self._connected_to_platform_websocket
80
+ if not is_connected:
81
+ raise RuntimeError('Error connecting to platform events websocket!')
82
+ else:
83
+ logger.debug('APIFY_ACTOR_EVENTS_WS_URL env var not set, no events from Apify platform will be emitted.')
84
+
85
+ return self
86
+
87
+ @override
88
+ async def __aexit__(
89
+ self,
90
+ exc_type: type[BaseException] | None,
91
+ exc_value: BaseException | None,
92
+ exc_traceback: TracebackType | None,
93
+ ) -> None:
94
+ if self._platform_events_websocket:
95
+ await self._platform_events_websocket.close()
96
+
97
+ if self._process_platform_messages_task and not self._process_platform_messages_task.done():
98
+ self._process_platform_messages_task.cancel()
99
+ with contextlib.suppress(asyncio.CancelledError):
100
+ await self._process_platform_messages_task
101
+
102
+ await super().__aexit__(exc_type, exc_value, exc_traceback)
103
+
104
+ async def _process_platform_messages(self, ws_url: str) -> None:
105
+ try:
106
+ async with websockets.asyncio.client.connect(ws_url) as websocket:
107
+ self._platform_events_websocket = websocket
108
+ if self._connected_to_platform_websocket is not None:
109
+ self._connected_to_platform_websocket.set_result(True)
110
+
111
+ async for message in websocket:
112
+ try:
113
+ parsed_message = event_data_adapter.validate_json(message)
114
+
115
+ if isinstance(parsed_message, DeprecatedEvent):
116
+ continue
117
+
118
+ if isinstance(parsed_message, UnknownEvent):
119
+ logger.info(
120
+ f'Unknown message received: event_name={parsed_message.name}, '
121
+ f'event_data={parsed_message.data}'
122
+ )
123
+ continue
124
+
125
+ self.emit(
126
+ event=parsed_message.name,
127
+ event_data=parsed_message.data
128
+ if not isinstance(parsed_message.data, SystemInfoEventData)
129
+ else parsed_message.data.to_crawlee_format(self._configuration.dedicated_cpus or 1),
130
+ )
131
+
132
+ if parsed_message.name == Event.MIGRATING:
133
+ await self._emit_persist_state_event_rec_task.stop()
134
+ self.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=True))
135
+ except Exception:
136
+ logger.exception('Cannot parse Actor event', extra={'message': message})
137
+ except Exception:
138
+ logger.exception('Error in websocket connection')
139
+ if self._connected_to_platform_websocket is not None:
140
+ self._connected_to_platform_websocket.set_result(False)
apify/events/_types.py ADDED
@@ -0,0 +1,102 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime
4
+ from typing import Annotated, Any, Literal
5
+
6
+ from pydantic import BaseModel, Field
7
+
8
+ from crawlee.events._types import (
9
+ Event,
10
+ EventAbortingData,
11
+ EventExitData,
12
+ EventMigratingData,
13
+ EventPersistStateData,
14
+ EventSystemInfoData,
15
+ )
16
+
17
+ from apify._utils import docs_group
18
+
19
+
20
+ @docs_group('Event data')
21
+ class SystemInfoEventData(BaseModel):
22
+ mem_avg_bytes: Annotated[float, Field(alias='memAvgBytes')]
23
+ mem_current_bytes: Annotated[float, Field(alias='memCurrentBytes')]
24
+ mem_max_bytes: Annotated[float, Field(alias='memMaxBytes')]
25
+ cpu_avg_usage: Annotated[float, Field(alias='cpuAvgUsage')]
26
+ cpu_max_usage: Annotated[float, Field(alias='cpuMaxUsage')]
27
+ cpu_current_usage: Annotated[float, Field(alias='cpuCurrentUsage')]
28
+ is_cpu_overloaded: Annotated[bool, Field(alias='isCpuOverloaded')]
29
+ created_at: Annotated[datetime, Field(alias='createdAt')]
30
+
31
+ def to_crawlee_format(self, dedicated_cpus: float) -> EventSystemInfoData:
32
+ return EventSystemInfoData.model_validate(
33
+ {
34
+ 'cpu_info': {
35
+ 'used_ratio': (self.cpu_current_usage / 100) / dedicated_cpus,
36
+ 'created_at': self.created_at,
37
+ },
38
+ 'memory_info': {
39
+ 'total_size': self.mem_max_bytes,
40
+ 'current_size': self.mem_current_bytes,
41
+ 'created_at': self.created_at,
42
+ },
43
+ }
44
+ )
45
+
46
+
47
+ @docs_group('Events')
48
+ class PersistStateEvent(BaseModel):
49
+ name: Literal[Event.PERSIST_STATE]
50
+ data: Annotated[EventPersistStateData, Field(default_factory=lambda: EventPersistStateData(is_migrating=False))]
51
+
52
+
53
+ @docs_group('Events')
54
+ class SystemInfoEvent(BaseModel):
55
+ name: Literal[Event.SYSTEM_INFO]
56
+ data: SystemInfoEventData
57
+
58
+
59
+ @docs_group('Events')
60
+ class MigratingEvent(BaseModel):
61
+ name: Literal[Event.MIGRATING]
62
+ data: Annotated[EventMigratingData, Field(default_factory=EventMigratingData)]
63
+
64
+
65
+ @docs_group('Events')
66
+ class AbortingEvent(BaseModel):
67
+ name: Literal[Event.ABORTING]
68
+ data: Annotated[EventAbortingData, Field(default_factory=EventAbortingData)]
69
+
70
+
71
+ @docs_group('Events')
72
+ class ExitEvent(BaseModel):
73
+ name: Literal[Event.EXIT]
74
+ data: Annotated[EventExitData, Field(default_factory=EventExitData)]
75
+
76
+
77
+ @docs_group('Events')
78
+ class EventWithoutData(BaseModel):
79
+ name: Literal[
80
+ Event.SESSION_RETIRED,
81
+ Event.BROWSER_LAUNCHED,
82
+ Event.BROWSER_RETIRED,
83
+ Event.BROWSER_CLOSED,
84
+ Event.PAGE_CREATED,
85
+ Event.PAGE_CLOSED,
86
+ ]
87
+ data: Any = None
88
+
89
+
90
+ @docs_group('Events')
91
+ class DeprecatedEvent(BaseModel):
92
+ name: Literal['cpuInfo']
93
+ data: Annotated[dict[str, Any], Field(default_factory=dict)]
94
+
95
+
96
+ @docs_group('Events')
97
+ class UnknownEvent(BaseModel):
98
+ name: str
99
+ data: Annotated[dict[str, Any], Field(default_factory=dict)]
100
+
101
+
102
+ EventMessage = PersistStateEvent | SystemInfoEvent | MigratingEvent | AbortingEvent | ExitEvent | EventWithoutData