apify 3.4.2b22__tar.gz → 3.4.2b23__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. {apify-3.4.2b22 → apify-3.4.2b23}/CHANGELOG.md +1 -0
  2. {apify-3.4.2b22 → apify-3.4.2b23}/PKG-INFO +1 -1
  3. {apify-3.4.2b22 → apify-3.4.2b23}/pyproject.toml +1 -1
  4. apify-3.4.2b23/src/apify/events/_apify_event_manager.py +242 -0
  5. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_apify/_alias_resolving.py +29 -27
  6. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_apify/_api_client_creation.py +2 -0
  7. apify-3.4.2b22/src/apify/events/_apify_event_manager.py +0 -142
  8. {apify-3.4.2b22 → apify-3.4.2b23}/.gitignore +0 -0
  9. {apify-3.4.2b22 → apify-3.4.2b23}/CONTRIBUTING.md +0 -0
  10. {apify-3.4.2b22 → apify-3.4.2b23}/LICENSE +0 -0
  11. {apify-3.4.2b22 → apify-3.4.2b23}/README.md +0 -0
  12. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/__init__.py +0 -0
  13. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/_actor.py +0 -0
  14. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/_charging.py +0 -0
  15. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/_configuration.py +0 -0
  16. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/_consts.py +0 -0
  17. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/_crypto.py +0 -0
  18. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/_proxy_configuration.py +0 -0
  19. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/_utils.py +0 -0
  20. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/_webhook.py +0 -0
  21. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/events/__init__.py +0 -0
  22. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/events/_types.py +0 -0
  23. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/events/py.typed +0 -0
  24. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/log.py +0 -0
  25. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/py.typed +0 -0
  26. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/request_loaders/__init__.py +0 -0
  27. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/request_loaders/_apify_request_list.py +0 -0
  28. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/request_loaders/py.typed +0 -0
  29. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/scrapy/__init__.py +0 -0
  30. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/scrapy/_actor_runner.py +0 -0
  31. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/scrapy/_async_thread.py +0 -0
  32. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/scrapy/_logging_config.py +0 -0
  33. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/scrapy/_serialization.py +0 -0
  34. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/scrapy/extensions/__init__.py +0 -0
  35. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/scrapy/extensions/_httpcache.py +0 -0
  36. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/scrapy/middlewares/__init__.py +0 -0
  37. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/scrapy/middlewares/apify_proxy.py +0 -0
  38. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/scrapy/middlewares/py.typed +0 -0
  39. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/scrapy/pipelines/__init__.py +0 -0
  40. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/scrapy/pipelines/actor_dataset_push.py +0 -0
  41. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/scrapy/pipelines/py.typed +0 -0
  42. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/scrapy/py.typed +0 -0
  43. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/scrapy/requests.py +0 -0
  44. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/scrapy/scheduler.py +0 -0
  45. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/scrapy/utils.py +0 -0
  46. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/__init__.py +0 -0
  47. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_apify/__init__.py +0 -0
  48. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_apify/_dataset_client.py +0 -0
  49. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_apify/_key_value_store_client.py +0 -0
  50. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_apify/_models.py +0 -0
  51. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_apify/_request_queue_client.py +0 -0
  52. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_apify/_request_queue_shared_client.py +0 -0
  53. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_apify/_request_queue_single_client.py +0 -0
  54. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_apify/_storage_client.py +0 -0
  55. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_apify/_utils.py +0 -0
  56. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_apify/py.typed +0 -0
  57. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_file_system/__init__.py +0 -0
  58. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_file_system/_dataset_client.py +0 -0
  59. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_file_system/_key_value_store_client.py +0 -0
  60. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_file_system/_storage_client.py +0 -0
  61. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_ppe_dataset_mixin.py +0 -0
  62. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_smart_apify/__init__.py +0 -0
  63. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/_smart_apify/_storage_client.py +0 -0
  64. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storage_clients/py.typed +0 -0
  65. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storages/__init__.py +0 -0
  66. {apify-3.4.2b22 → apify-3.4.2b23}/src/apify/storages/py.typed +0 -0
@@ -26,6 +26,7 @@ All notable changes to this project will be documented in this file.
26
26
  - **scrapy:** Drop deprecated spider arg from Scrapy proxy middleware methods ([#977](https://github.com/apify/apify-sdk-python/pull/977)) ([49dd836](https://github.com/apify/apify-sdk-python/commit/49dd836c986b0e3bda4bb6485be8ad95d286cd82)) by [@vdusek](https://github.com/vdusek)
27
27
  - Redirect input key in all file-system key-value store operations ([#976](https://github.com/apify/apify-sdk-python/pull/976)) ([1fbdce2](https://github.com/apify/apify-sdk-python/commit/1fbdce27452f1cdc9580a53edd6de3ce9fe36a1d)) by [@vdusek](https://github.com/vdusek)
28
28
  - Respect explicit zero custom_after_sleep in metamorph and reboot ([#971](https://github.com/apify/apify-sdk-python/pull/971)) ([e8cda0a](https://github.com/apify/apify-sdk-python/commit/e8cda0a80339a21ce94cc382bd042509052ce1bf)) by [@vdusek](https://github.com/vdusek)
29
+ - Reconnect to platform events websocket after connection drop ([#967](https://github.com/apify/apify-sdk-python/pull/967)) ([5653a22](https://github.com/apify/apify-sdk-python/commit/5653a222d976919cc7388a88242a488ba503647e)) by [@vdusek](https://github.com/vdusek)
29
30
 
30
31
  ### 🚜 Refactor
31
32
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: apify
3
- Version: 3.4.2b22
3
+ Version: 3.4.2b23
4
4
  Summary: Apify SDK for Python
5
5
  Project-URL: Apify Homepage, https://apify.com
6
6
  Project-URL: Changelog, https://docs.apify.com/sdk/python/docs/changelog
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "apify"
7
- version = "3.4.2b22"
7
+ version = "3.4.2b23"
8
8
  description = "Apify SDK for Python"
9
9
  authors = [{ name = "Apify Technologies s.r.o.", email = "support@apify.com" }]
10
10
  license = { file = "LICENSE" }
@@ -0,0 +1,242 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import contextlib
5
+ import time
6
+ from typing import TYPE_CHECKING, Annotated, Self
7
+
8
+ import websockets.asyncio.client
9
+ import websockets.client
10
+ import websockets.exceptions
11
+ from pydantic import Discriminator, TypeAdapter
12
+ from typing_extensions import Unpack, override
13
+ from websockets.frames import CloseCode
14
+
15
+ from crawlee.events import EventManager
16
+ from crawlee.events._types import Event, EventPersistStateData
17
+
18
+ from apify._utils import docs_group
19
+ from apify.events._types import DeprecatedEvent, EventMessage, SystemInfoEventData, UnknownEvent
20
+ from apify.log import logger
21
+
22
+ if TYPE_CHECKING:
23
+ from collections.abc import Generator
24
+ from types import TracebackType
25
+
26
+ from crawlee.events._event_manager import EventManagerOptions
27
+
28
+ from apify._configuration import Configuration
29
+
30
+
31
+ event_data_adapter = TypeAdapter[EventMessage | DeprecatedEvent | UnknownEvent](
32
+ Annotated[EventMessage, Discriminator('name')] | DeprecatedEvent | UnknownEvent
33
+ )
34
+
35
+
36
+ @docs_group('Event managers')
37
+ class ApifyEventManager(EventManager):
38
+ """Event manager for the Apify platform.
39
+
40
+ This class extends Crawlee's `EventManager` to provide Apify-specific functionality, including websocket
41
+ connectivity to the Apify platform for receiving platform events.
42
+
43
+ The event manager handles:
44
+ - Registration and emission of events and their listeners.
45
+ - Websocket connection to Apify platform events.
46
+ - Processing and validation of platform messages.
47
+ - Automatic event forwarding from the platform to local event listeners.
48
+
49
+ This class should not be used directly. Use the `Actor.on` and `Actor.off` methods to interact
50
+ with the event system.
51
+ """
52
+
53
+ _NON_RETRYABLE_CLOSE_CODES = frozenset(
54
+ {
55
+ CloseCode.PROTOCOL_ERROR,
56
+ CloseCode.UNSUPPORTED_DATA,
57
+ CloseCode.INVALID_DATA,
58
+ CloseCode.POLICY_VIOLATION,
59
+ CloseCode.MANDATORY_EXTENSION,
60
+ }
61
+ )
62
+ """WebSocket close codes for a permanent condition, on which the connection is not re-established.
63
+
64
+ The platform sends `POLICY_VIOLATION` (1008) for an unknown/missing run ID or an exceeded per-run
65
+ connection limit. The rest are protocol, data, or mandatory-extension failures that reconnecting
66
+ cannot resolve.
67
+ """
68
+
69
+ _HEALTHY_CONNECTION_MIN_DURATION = 1.0
70
+ """Seconds a connection must stay open to count as healthy, after which a drop reconnects without backoff."""
71
+
72
+ def __init__(self, configuration: Configuration, **kwargs: Unpack[EventManagerOptions]) -> None:
73
+ """Initialize a new instance.
74
+
75
+ Args:
76
+ configuration: The Actor configuration for the event manager.
77
+ **kwargs: Additional event manager options passed to the parent class.
78
+ """
79
+ super().__init__(**kwargs)
80
+
81
+ self._configuration = configuration
82
+ """The Actor configuration for the event manager."""
83
+
84
+ self._platform_events_websocket: websockets.asyncio.client.ClientConnection | None = None
85
+ """WebSocket connection to the platform events."""
86
+
87
+ self._process_platform_messages_task: asyncio.Task | None = None
88
+ """Task for processing messages from the platform websocket."""
89
+
90
+ self._connected_to_platform_websocket: asyncio.Future[bool] | None = None
91
+ """Future that resolves when the connection to the platform websocket is established."""
92
+
93
+ @override
94
+ async def __aenter__(self) -> Self:
95
+ await super().__aenter__()
96
+ self._connected_to_platform_websocket = asyncio.Future()
97
+
98
+ # Run tasks but don't await them
99
+ if self._configuration.actor_events_ws_url:
100
+ self._process_platform_messages_task = asyncio.create_task(
101
+ self._process_platform_messages(self._configuration.actor_events_ws_url)
102
+ )
103
+ is_connected = await self._connected_to_platform_websocket
104
+ if not is_connected:
105
+ # Exit the already-entered parent so the recurring persist state task does not leak.
106
+ await self.__aexit__(None, None, None)
107
+ raise RuntimeError('Error connecting to platform events websocket!')
108
+ else:
109
+ logger.debug('APIFY_ACTOR_EVENTS_WS_URL env var not set, no events from Apify platform will be emitted.')
110
+
111
+ return self
112
+
113
+ @override
114
+ async def __aexit__(
115
+ self,
116
+ exc_type: type[BaseException] | None,
117
+ exc_value: BaseException | None,
118
+ exc_traceback: TracebackType | None,
119
+ ) -> None:
120
+ # Cancel the task before closing the websocket so that the closed connection is not treated as a drop
121
+ # and followed by a reconnect attempt.
122
+ if self._process_platform_messages_task and not self._process_platform_messages_task.done():
123
+ self._process_platform_messages_task.cancel()
124
+ with contextlib.suppress(asyncio.CancelledError):
125
+ await self._process_platform_messages_task
126
+
127
+ if self._platform_events_websocket:
128
+ await self._platform_events_websocket.close()
129
+
130
+ await super().__aexit__(exc_type, exc_value, exc_traceback)
131
+
132
+ def _process_connection_exception(self, exc: Exception) -> Exception | None:
133
+ """Decide whether a failed connection attempt to the platform websocket should be retried.
134
+
135
+ Before the first successful connection, every error is fatal so that `__aenter__` fails fast. After that,
136
+ the default `websockets` behavior decides which errors are transient and retried with exponential backoff.
137
+ """
138
+ if self._connected_to_platform_websocket and self._connected_to_platform_websocket.done():
139
+ return websockets.asyncio.client.process_exception(exc)
140
+ return exc
141
+
142
+ async def _process_platform_messages(self, ws_url: str) -> None:
143
+ # The `websockets` reconnect iterator only backs off on failed connection *attempts*, not on a connection
144
+ # that opens and is then closed. Track our own backoff here so a server that keeps accepting and immediately
145
+ # closing is not hammered; it is reset after a healthy connection so a healthy drop reconnects immediately.
146
+ backoff_delays: Generator[float] | None = None
147
+
148
+ try:
149
+ # Used as an async iterator, `connect` reconnects with exponential backoff on failed connection attempts.
150
+ async for websocket in websockets.asyncio.client.connect(
151
+ ws_url, process_exception=self._process_connection_exception
152
+ ):
153
+ self._platform_events_websocket = websocket
154
+ if self._connected_to_platform_websocket and not self._connected_to_platform_websocket.done():
155
+ self._connected_to_platform_websocket.set_result(True)
156
+ else:
157
+ logger.info('Reconnected to the platform events websocket.')
158
+
159
+ connection_opened_at = time.monotonic()
160
+ connection_lost = await self._consume_messages(websocket)
161
+
162
+ if not self._should_reconnect_after_close(websocket, connection_lost=connection_lost):
163
+ break
164
+
165
+ # Reconnect a healthy connection immediately; back off only on repeated rapid drops. The first
166
+ # rapid drop reconnects once without delay (it only primes the backoff generator), and each
167
+ # subsequent consecutive rapid drop then sleeps for the next backoff interval. A healthy
168
+ # connection resets the generator, so the next rapid drop again gets that one free retry.
169
+ if time.monotonic() - connection_opened_at >= self._HEALTHY_CONNECTION_MIN_DURATION:
170
+ backoff_delays = None
171
+ elif backoff_delays is None:
172
+ backoff_delays = websockets.client.backoff()
173
+ else:
174
+ await asyncio.sleep(next(backoff_delays))
175
+ except Exception:
176
+ logger.exception('Error in websocket connection')
177
+ if self._connected_to_platform_websocket is not None and not self._connected_to_platform_websocket.done():
178
+ self._connected_to_platform_websocket.set_result(False)
179
+
180
+ async def _consume_messages(self, websocket: websockets.asyncio.client.ClientConnection) -> bool:
181
+ """Handle platform messages until the connection closes; return whether it was lost vs. closed cleanly."""
182
+ try:
183
+ async for message in websocket:
184
+ await self._handle_platform_message(message)
185
+ except websockets.exceptions.ConnectionClosed:
186
+ return True
187
+ return False
188
+
189
+ async def _handle_platform_message(self, message: str | bytes) -> None:
190
+ """Parse a single platform message and emit the matching local event."""
191
+ try:
192
+ parsed_message = event_data_adapter.validate_json(message)
193
+
194
+ if isinstance(parsed_message, DeprecatedEvent):
195
+ return
196
+
197
+ if isinstance(parsed_message, UnknownEvent):
198
+ logger.info(
199
+ f'Unknown message received: event_name={parsed_message.name}, event_data={parsed_message.data}'
200
+ )
201
+ return
202
+
203
+ self.emit(
204
+ event=parsed_message.name,
205
+ event_data=parsed_message.data
206
+ if not isinstance(parsed_message.data, SystemInfoEventData)
207
+ else parsed_message.data.to_crawlee_format(self._configuration.dedicated_cpus or 1),
208
+ )
209
+
210
+ if parsed_message.name == Event.MIGRATING:
211
+ await self._emit_persist_state_event_rec_task.stop()
212
+ self.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=True))
213
+ except Exception:
214
+ logger.exception('Cannot parse Actor event', extra={'raw_message': message})
215
+
216
+ def _should_reconnect_after_close(
217
+ self,
218
+ websocket: websockets.asyncio.client.ClientConnection,
219
+ *,
220
+ connection_lost: bool,
221
+ ) -> bool:
222
+ """Log the websocket close and report whether to reconnect (`False` on a non-retryable close code)."""
223
+ if websocket.close_code in self._NON_RETRYABLE_CLOSE_CODES:
224
+ logger.error(
225
+ f'Connection to platform events websocket was closed with a non-retryable code '
226
+ f'(code={websocket.close_code}, reason={websocket.close_reason!r}); not reconnecting. '
227
+ f'No further platform events will be received for the rest of this run, so migration and '
228
+ f'abort handling (e.g. persisting state before a migration) is now disabled.'
229
+ )
230
+ return False
231
+
232
+ if connection_lost:
233
+ logger.warning(
234
+ f'Connection to platform events websocket was lost '
235
+ f'(code={websocket.close_code}, reason={websocket.close_reason!r}), reconnecting...'
236
+ )
237
+ else:
238
+ logger.info(
239
+ f'Connection to platform events websocket was closed '
240
+ f'(code={websocket.close_code}, reason={websocket.close_reason!r}), reconnecting...'
241
+ )
242
+ return True
@@ -6,14 +6,13 @@ from functools import cached_property
6
6
  from logging import getLogger
7
7
  from typing import TYPE_CHECKING, ClassVar, Literal, overload
8
8
 
9
- from apify_client import ApifyClientAsync
10
-
11
9
  from ._utils import hash_api_public_base_url_and_token
12
10
 
13
11
  if TYPE_CHECKING:
14
12
  from collections.abc import Callable
15
13
  from types import TracebackType
16
14
 
15
+ from apify_client import ApifyClientAsync
17
16
  from apify_client._resource_clients import (
18
17
  DatasetClientAsync,
19
18
  DatasetCollectionClientAsync,
@@ -35,6 +34,7 @@ async def open_by_alias(
35
34
  storage_type: Literal['Dataset'],
36
35
  collection_client: DatasetCollectionClientAsync,
37
36
  get_resource_client_by_id: Callable[[str], DatasetClientAsync],
37
+ api_client: ApifyClientAsync,
38
38
  configuration: Configuration,
39
39
  ) -> DatasetClientAsync: ...
40
40
 
@@ -46,6 +46,7 @@ async def open_by_alias(
46
46
  storage_type: Literal['KeyValueStore'],
47
47
  collection_client: KeyValueStoreCollectionClientAsync,
48
48
  get_resource_client_by_id: Callable[[str], KeyValueStoreClientAsync],
49
+ api_client: ApifyClientAsync,
49
50
  configuration: Configuration,
50
51
  ) -> KeyValueStoreClientAsync: ...
51
52
 
@@ -57,6 +58,7 @@ async def open_by_alias(
57
58
  storage_type: Literal['RequestQueue'],
58
59
  collection_client: RequestQueueCollectionClientAsync,
59
60
  get_resource_client_by_id: Callable[[str], RequestQueueClientAsync],
61
+ api_client: ApifyClientAsync,
60
62
  configuration: Configuration,
61
63
  ) -> RequestQueueClientAsync: ...
62
64
 
@@ -69,6 +71,7 @@ async def open_by_alias(
69
71
  KeyValueStoreCollectionClientAsync | RequestQueueCollectionClientAsync | DatasetCollectionClientAsync
70
72
  ),
71
73
  get_resource_client_by_id: Callable[[str], KeyValueStoreClientAsync | RequestQueueClientAsync | DatasetClientAsync],
74
+ api_client: ApifyClientAsync,
72
75
  configuration: Configuration,
73
76
  ) -> KeyValueStoreClientAsync | RequestQueueClientAsync | DatasetClientAsync:
74
77
  """Open storage by alias, creating it if necessary.
@@ -81,6 +84,8 @@ async def open_by_alias(
81
84
  storage_type: The type of storage to open.
82
85
  collection_client: The Apify API collection client for the storage type.
83
86
  get_resource_client_by_id: A callable that takes a storage ID and returns the resource client.
87
+ api_client: The Apify API client used for the storage operation. Reused to access the default KVS that
88
+ holds the alias mapping, so alias resolution does not spin up its own client.
84
89
  configuration: Configuration object containing API credentials and settings.
85
90
 
86
91
  Returns:
@@ -94,6 +99,7 @@ async def open_by_alias(
94
99
  storage_type=storage_type,
95
100
  alias=alias,
96
101
  configuration=configuration,
102
+ api_client=api_client,
97
103
  ) as alias_resolver:
98
104
  storage_id = await alias_resolver.resolve_id()
99
105
 
@@ -142,10 +148,12 @@ class AliasResolver:
142
148
  storage_type: Literal['Dataset', 'KeyValueStore', 'RequestQueue'],
143
149
  alias: str,
144
150
  configuration: Configuration,
151
+ api_client: ApifyClientAsync,
145
152
  ) -> None:
146
153
  self._storage_type = storage_type
147
154
  self._alias = alias
148
155
  self._configuration = configuration
156
+ self._api_client = api_client
149
157
 
150
158
  async def __aenter__(self) -> AliasResolver:
151
159
  """Context manager to prevent race condition in alias creation."""
@@ -173,26 +181,22 @@ class AliasResolver:
173
181
  cls._alias_init_lock = Lock()
174
182
  return cls._alias_init_lock
175
183
 
176
- @classmethod
177
- async def _get_alias_map(cls, configuration: Configuration) -> dict[str, str]:
184
+ async def _get_alias_map(self) -> dict[str, str]:
178
185
  """Get the aliases and storage ids mapping from the default kvs.
179
186
 
180
- Mapping is loaded from kvs only once and is shared for all instances of the AliasResolver class.
181
-
182
- Args:
183
- configuration: Configuration object to use for accessing the default KVS.
187
+ Mapping is loaded from kvs only once and is shared for all instances of the `AliasResolver` class.
184
188
 
185
189
  Returns:
186
190
  Map of aliases and storage ids.
187
191
  """
188
- if not cls._alias_map_loaded and configuration.is_at_home:
189
- default_kvs_client = await cls._get_default_kvs_client(configuration)
192
+ if not AliasResolver._alias_map_loaded and self._configuration.is_at_home:
193
+ default_kvs_client = self._get_default_kvs_client()
190
194
 
191
- record = await default_kvs_client.get_record(cls._ALIAS_MAPPING_KEY)
192
- cls._alias_map = record.get('value', {}) if record else {}
193
- cls._alias_map_loaded = True
195
+ record = await default_kvs_client.get_record(self._ALIAS_MAPPING_KEY)
196
+ AliasResolver._alias_map = record.get('value', {}) if record else {}
197
+ AliasResolver._alias_map_loaded = True
194
198
 
195
- return cls._alias_map
199
+ return AliasResolver._alias_map
196
200
 
197
201
  async def resolve_id(self) -> str | None:
198
202
  """Get id of the aliased storage.
@@ -212,12 +216,12 @@ class AliasResolver:
212
216
  return storage_id
213
217
 
214
218
  # Fallback to the mapping saved in the default KVS
215
- return (await self._get_alias_map(self._configuration)).get(self._storage_key, None)
219
+ return (await self._get_alias_map()).get(self._storage_key, None)
216
220
 
217
221
  async def store_mapping(self, storage_id: str) -> None:
218
222
  """Add alias and related storage id to the mapping in default kvs and local in-memory mapping."""
219
223
  # Update in-memory mapping
220
- alias_map = await self._get_alias_map(self._configuration)
224
+ alias_map = await self._get_alias_map()
221
225
  alias_map[self._storage_key] = storage_id
222
226
 
223
227
  if not self._configuration.is_at_home:
@@ -226,7 +230,7 @@ class AliasResolver:
226
230
  )
227
231
  return
228
232
 
229
- default_kvs_client = await self._get_default_kvs_client(self._configuration)
233
+ default_kvs_client = self._get_default_kvs_client()
230
234
 
231
235
  try:
232
236
  record = await default_kvs_client.get_record(self._ALIAS_MAPPING_KEY)
@@ -249,16 +253,14 @@ class AliasResolver:
249
253
  ]
250
254
  )
251
255
 
252
- @staticmethod
253
- async def _get_default_kvs_client(configuration: Configuration) -> KeyValueStoreClientAsync:
254
- """Get a client for the default key-value store."""
255
- apify_client_async = ApifyClientAsync(
256
- token=configuration.token,
257
- api_url=configuration.api_base_url,
258
- max_retries=8,
259
- )
256
+ def _get_default_kvs_client(self) -> KeyValueStoreClientAsync:
257
+ """Get a client for the default key-value store.
260
258
 
261
- if not configuration.default_key_value_store_id:
259
+ Derived from the injected `ApifyClientAsync`, so alias resolution shares the same HTTP client (and its
260
+ connection pool and event loop affinity) as the storage operation that triggered it, instead of creating
261
+ and leaking its own.
262
+ """
263
+ if not self._configuration.default_key_value_store_id:
262
264
  raise ValueError("'Configuration.default_key_value_store_id' must be set.")
263
265
 
264
- return apify_client_async.key_value_store(key_value_store_id=configuration.default_key_value_store_id)
266
+ return self._api_client.key_value_store(key_value_store_id=self._configuration.default_key_value_store_id)
@@ -117,6 +117,7 @@ async def create_storage_api_client(
117
117
  storage_type=storage_type,
118
118
  collection_client=collection_client,
119
119
  get_resource_client_by_id=get_resource_client,
120
+ api_client=apify_client,
120
121
  configuration=configuration,
121
122
  ) # ty:ignore[no-matching-overload]
122
123
 
@@ -127,6 +128,7 @@ async def create_storage_api_client(
127
128
  storage_type=storage_type,
128
129
  collection_client=collection_client,
129
130
  get_resource_client_by_id=get_resource_client,
131
+ api_client=apify_client,
130
132
  configuration=configuration,
131
133
  ) # ty:ignore[no-matching-overload]
132
134
 
@@ -1,142 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import asyncio
4
- import contextlib
5
- from typing import TYPE_CHECKING, Annotated, Self
6
-
7
- import websockets.asyncio.client
8
- from pydantic import Discriminator, TypeAdapter
9
- from typing_extensions import Unpack, override
10
-
11
- from crawlee.events import EventManager
12
- from crawlee.events._types import Event, EventPersistStateData
13
-
14
- from apify._utils import docs_group
15
- from apify.events._types import DeprecatedEvent, EventMessage, SystemInfoEventData, UnknownEvent
16
- from apify.log import logger
17
-
18
- if TYPE_CHECKING:
19
- from types import TracebackType
20
-
21
- from crawlee.events._event_manager import EventManagerOptions
22
-
23
- from apify._configuration import Configuration
24
-
25
-
26
- event_data_adapter = TypeAdapter[EventMessage | DeprecatedEvent | UnknownEvent](
27
- Annotated[EventMessage, Discriminator('name')] | DeprecatedEvent | UnknownEvent
28
- )
29
-
30
-
31
- @docs_group('Event managers')
32
- class ApifyEventManager(EventManager):
33
- """Event manager for the Apify platform.
34
-
35
- This class extends Crawlee's `EventManager` to provide Apify-specific functionality, including websocket
36
- connectivity to the Apify platform for receiving platform events.
37
-
38
- The event manager handles:
39
- - Registration and emission of events and their listeners.
40
- - Websocket connection to Apify platform events.
41
- - Processing and validation of platform messages.
42
- - Automatic event forwarding from the platform to local event listeners.
43
-
44
- This class should not be used directly. Use the `Actor.on` and `Actor.off` methods to interact
45
- with the event system.
46
- """
47
-
48
- def __init__(self, configuration: Configuration, **kwargs: Unpack[EventManagerOptions]) -> None:
49
- """Initialize a new instance.
50
-
51
- Args:
52
- configuration: The Actor configuration for the event manager.
53
- **kwargs: Additional event manager options passed to the parent class.
54
- """
55
- super().__init__(**kwargs)
56
-
57
- self._configuration = configuration
58
- """The Actor configuration for the event manager."""
59
-
60
- self._platform_events_websocket: websockets.asyncio.client.ClientConnection | None = None
61
- """WebSocket connection to the platform events."""
62
-
63
- self._process_platform_messages_task: asyncio.Task | None = None
64
- """Task for processing messages from the platform websocket."""
65
-
66
- self._connected_to_platform_websocket: asyncio.Future[bool] | None = None
67
- """Future that resolves when the connection to the platform websocket is established."""
68
-
69
- @override
70
- async def __aenter__(self) -> Self:
71
- await super().__aenter__()
72
- self._connected_to_platform_websocket = asyncio.Future()
73
-
74
- # Run tasks but don't await them
75
- if self._configuration.actor_events_ws_url:
76
- self._process_platform_messages_task = asyncio.create_task(
77
- self._process_platform_messages(self._configuration.actor_events_ws_url)
78
- )
79
- is_connected = await self._connected_to_platform_websocket
80
- if not is_connected:
81
- # Exit the already-entered parent so the recurring persist state task does not leak.
82
- await self.__aexit__(None, None, None)
83
- raise RuntimeError('Error connecting to platform events websocket!')
84
- else:
85
- logger.debug('APIFY_ACTOR_EVENTS_WS_URL env var not set, no events from Apify platform will be emitted.')
86
-
87
- return self
88
-
89
- @override
90
- async def __aexit__(
91
- self,
92
- exc_type: type[BaseException] | None,
93
- exc_value: BaseException | None,
94
- exc_traceback: TracebackType | None,
95
- ) -> None:
96
- if self._platform_events_websocket:
97
- await self._platform_events_websocket.close()
98
-
99
- if self._process_platform_messages_task and not self._process_platform_messages_task.done():
100
- self._process_platform_messages_task.cancel()
101
- with contextlib.suppress(asyncio.CancelledError):
102
- await self._process_platform_messages_task
103
-
104
- await super().__aexit__(exc_type, exc_value, exc_traceback)
105
-
106
- async def _process_platform_messages(self, ws_url: str) -> None:
107
- try:
108
- async with websockets.asyncio.client.connect(ws_url) as websocket:
109
- self._platform_events_websocket = websocket
110
- if self._connected_to_platform_websocket is not None:
111
- self._connected_to_platform_websocket.set_result(True)
112
-
113
- async for message in websocket:
114
- try:
115
- parsed_message = event_data_adapter.validate_json(message)
116
-
117
- if isinstance(parsed_message, DeprecatedEvent):
118
- continue
119
-
120
- if isinstance(parsed_message, UnknownEvent):
121
- logger.info(
122
- f'Unknown message received: event_name={parsed_message.name}, '
123
- f'event_data={parsed_message.data}'
124
- )
125
- continue
126
-
127
- self.emit(
128
- event=parsed_message.name,
129
- event_data=parsed_message.data
130
- if not isinstance(parsed_message.data, SystemInfoEventData)
131
- else parsed_message.data.to_crawlee_format(self._configuration.dedicated_cpus or 1),
132
- )
133
-
134
- if parsed_message.name == Event.MIGRATING:
135
- await self._emit_persist_state_event_rec_task.stop()
136
- self.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=True))
137
- except Exception:
138
- logger.exception('Cannot parse Actor event', extra={'raw_message': message})
139
- except Exception:
140
- logger.exception('Error in websocket connection')
141
- if self._connected_to_platform_websocket is not None and not self._connected_to_platform_websocket.done():
142
- self._connected_to_platform_websocket.set_result(False)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes