apify 3.4.2b22__tar.gz → 3.4.2b24__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {apify-3.4.2b22 → apify-3.4.2b24}/CHANGELOG.md +2 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/PKG-INFO +1 -1
- {apify-3.4.2b22 → apify-3.4.2b24}/pyproject.toml +1 -1
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/_actor.py +9 -7
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/_webhook.py +8 -10
- apify-3.4.2b24/src/apify/events/_apify_event_manager.py +242 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_apify/_alias_resolving.py +29 -27
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_apify/_api_client_creation.py +2 -0
- apify-3.4.2b22/src/apify/events/_apify_event_manager.py +0 -142
- {apify-3.4.2b22 → apify-3.4.2b24}/.gitignore +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/CONTRIBUTING.md +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/LICENSE +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/README.md +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/__init__.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/_charging.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/_configuration.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/_consts.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/_crypto.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/_proxy_configuration.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/_utils.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/events/__init__.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/events/_types.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/events/py.typed +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/log.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/py.typed +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/request_loaders/__init__.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/request_loaders/_apify_request_list.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/request_loaders/py.typed +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/scrapy/__init__.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/scrapy/_actor_runner.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/scrapy/_async_thread.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/scrapy/_logging_config.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/scrapy/_serialization.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/scrapy/extensions/__init__.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/scrapy/extensions/_httpcache.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/scrapy/middlewares/__init__.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/scrapy/middlewares/apify_proxy.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/scrapy/middlewares/py.typed +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/scrapy/pipelines/__init__.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/scrapy/pipelines/actor_dataset_push.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/scrapy/pipelines/py.typed +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/scrapy/py.typed +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/scrapy/requests.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/scrapy/scheduler.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/scrapy/utils.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/__init__.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_apify/__init__.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_apify/_dataset_client.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_apify/_key_value_store_client.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_apify/_models.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_apify/_request_queue_client.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_apify/_request_queue_shared_client.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_apify/_request_queue_single_client.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_apify/_storage_client.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_apify/_utils.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_apify/py.typed +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_file_system/__init__.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_file_system/_dataset_client.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_file_system/_key_value_store_client.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_file_system/_storage_client.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_ppe_dataset_mixin.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_smart_apify/__init__.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_smart_apify/_storage_client.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/py.typed +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storages/__init__.py +0 -0
- {apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storages/py.typed +0 -0
|
@@ -26,6 +26,8 @@ All notable changes to this project will be documented in this file.
|
|
|
26
26
|
- **scrapy:** Drop deprecated spider arg from Scrapy proxy middleware methods ([#977](https://github.com/apify/apify-sdk-python/pull/977)) ([49dd836](https://github.com/apify/apify-sdk-python/commit/49dd836c986b0e3bda4bb6485be8ad95d286cd82)) by [@vdusek](https://github.com/vdusek)
|
|
27
27
|
- Redirect input key in all file-system key-value store operations ([#976](https://github.com/apify/apify-sdk-python/pull/976)) ([1fbdce2](https://github.com/apify/apify-sdk-python/commit/1fbdce27452f1cdc9580a53edd6de3ce9fe36a1d)) by [@vdusek](https://github.com/vdusek)
|
|
28
28
|
- Respect explicit zero custom_after_sleep in metamorph and reboot ([#971](https://github.com/apify/apify-sdk-python/pull/971)) ([e8cda0a](https://github.com/apify/apify-sdk-python/commit/e8cda0a80339a21ce94cc382bd042509052ce1bf)) by [@vdusek](https://github.com/vdusek)
|
|
29
|
+
- Reconnect to platform events websocket after connection drop ([#967](https://github.com/apify/apify-sdk-python/pull/967)) ([5653a22](https://github.com/apify/apify-sdk-python/commit/5653a222d976919cc7388a88242a488ba503647e)) by [@vdusek](https://github.com/vdusek)
|
|
30
|
+
- Forward all `Webhook` fields to ad-hoc webhooks ([#963](https://github.com/apify/apify-sdk-python/pull/963)) ([726620b](https://github.com/apify/apify-sdk-python/commit/726620be25da85b74b3f0d1e4f8c1f8f1b29d9b1)) by [@vdusek](https://github.com/vdusek)
|
|
29
31
|
|
|
30
32
|
### 🚜 Refactor
|
|
31
33
|
|
|
@@ -4,6 +4,7 @@ import asyncio
|
|
|
4
4
|
import sys
|
|
5
5
|
import warnings
|
|
6
6
|
from contextlib import suppress
|
|
7
|
+
from dataclasses import asdict
|
|
7
8
|
from datetime import UTC, datetime, timedelta
|
|
8
9
|
from functools import cached_property
|
|
9
10
|
from typing import TYPE_CHECKING, Any, Literal, TypeVar, cast, overload
|
|
@@ -1283,15 +1284,16 @@ class _ActorType:
|
|
|
1283
1284
|
if not self.configuration.actor_run_id:
|
|
1284
1285
|
raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.')
|
|
1285
1286
|
|
|
1287
|
+
# `Webhook`'s field names match `webhooks().create()`'s parameters, so we forward them by name rather
|
|
1288
|
+
# than listing each one.
|
|
1289
|
+
webhook_fields = asdict(webhook)
|
|
1290
|
+
|
|
1291
|
+
if idempotency_key is not None:
|
|
1292
|
+
webhook_fields['idempotency_key'] = idempotency_key
|
|
1293
|
+
|
|
1286
1294
|
await self.apify_client.webhooks().create(
|
|
1295
|
+
**webhook_fields,
|
|
1287
1296
|
actor_run_id=self.configuration.actor_run_id,
|
|
1288
|
-
event_types=webhook.event_types,
|
|
1289
|
-
request_url=webhook.request_url,
|
|
1290
|
-
payload_template=webhook.payload_template,
|
|
1291
|
-
headers_template=webhook.headers_template,
|
|
1292
|
-
ignore_ssl_errors=webhook.ignore_ssl_errors,
|
|
1293
|
-
do_not_retry=webhook.do_not_retry,
|
|
1294
|
-
idempotency_key=idempotency_key if idempotency_key is not None else webhook.idempotency_key,
|
|
1295
1297
|
is_ad_hoc=True,
|
|
1296
1298
|
)
|
|
1297
1299
|
|
|
@@ -48,15 +48,13 @@ class Webhook:
|
|
|
48
48
|
|
|
49
49
|
|
|
50
50
|
def to_client_representations(webhooks: list[Webhook] | None) -> list[WebhookRepresentation] | None:
|
|
51
|
-
"""
|
|
51
|
+
"""Convert SDK webhooks to the ad-hoc representation accepted by the client's `start()` / `call()`.
|
|
52
|
+
|
|
53
|
+
`Webhook`'s field names match `WebhookRepresentation`'s, so we let pydantic read them straight off the
|
|
54
|
+
instance rather than listing each one. This forwards any field the representation declares without changes
|
|
55
|
+
here, and never emits an undeclared field as a malformed snake_case extra.
|
|
56
|
+
"""
|
|
52
57
|
if not webhooks:
|
|
53
58
|
return None
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
event_types=w.event_types,
|
|
57
|
-
request_url=w.request_url,
|
|
58
|
-
payload_template=w.payload_template,
|
|
59
|
-
headers_template=w.headers_template,
|
|
60
|
-
)
|
|
61
|
-
for w in webhooks
|
|
62
|
-
]
|
|
59
|
+
|
|
60
|
+
return [WebhookRepresentation.model_validate(webhook, from_attributes=True) for webhook in webhooks]
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import contextlib
|
|
5
|
+
import time
|
|
6
|
+
from typing import TYPE_CHECKING, Annotated, Self
|
|
7
|
+
|
|
8
|
+
import websockets.asyncio.client
|
|
9
|
+
import websockets.client
|
|
10
|
+
import websockets.exceptions
|
|
11
|
+
from pydantic import Discriminator, TypeAdapter
|
|
12
|
+
from typing_extensions import Unpack, override
|
|
13
|
+
from websockets.frames import CloseCode
|
|
14
|
+
|
|
15
|
+
from crawlee.events import EventManager
|
|
16
|
+
from crawlee.events._types import Event, EventPersistStateData
|
|
17
|
+
|
|
18
|
+
from apify._utils import docs_group
|
|
19
|
+
from apify.events._types import DeprecatedEvent, EventMessage, SystemInfoEventData, UnknownEvent
|
|
20
|
+
from apify.log import logger
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from collections.abc import Generator
|
|
24
|
+
from types import TracebackType
|
|
25
|
+
|
|
26
|
+
from crawlee.events._event_manager import EventManagerOptions
|
|
27
|
+
|
|
28
|
+
from apify._configuration import Configuration
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
event_data_adapter = TypeAdapter[EventMessage | DeprecatedEvent | UnknownEvent](
|
|
32
|
+
Annotated[EventMessage, Discriminator('name')] | DeprecatedEvent | UnknownEvent
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@docs_group('Event managers')
|
|
37
|
+
class ApifyEventManager(EventManager):
|
|
38
|
+
"""Event manager for the Apify platform.
|
|
39
|
+
|
|
40
|
+
This class extends Crawlee's `EventManager` to provide Apify-specific functionality, including websocket
|
|
41
|
+
connectivity to the Apify platform for receiving platform events.
|
|
42
|
+
|
|
43
|
+
The event manager handles:
|
|
44
|
+
- Registration and emission of events and their listeners.
|
|
45
|
+
- Websocket connection to Apify platform events.
|
|
46
|
+
- Processing and validation of platform messages.
|
|
47
|
+
- Automatic event forwarding from the platform to local event listeners.
|
|
48
|
+
|
|
49
|
+
This class should not be used directly. Use the `Actor.on` and `Actor.off` methods to interact
|
|
50
|
+
with the event system.
|
|
51
|
+
"""
|
|
52
|
+
|
|
53
|
+
_NON_RETRYABLE_CLOSE_CODES = frozenset(
|
|
54
|
+
{
|
|
55
|
+
CloseCode.PROTOCOL_ERROR,
|
|
56
|
+
CloseCode.UNSUPPORTED_DATA,
|
|
57
|
+
CloseCode.INVALID_DATA,
|
|
58
|
+
CloseCode.POLICY_VIOLATION,
|
|
59
|
+
CloseCode.MANDATORY_EXTENSION,
|
|
60
|
+
}
|
|
61
|
+
)
|
|
62
|
+
"""WebSocket close codes for a permanent condition, on which the connection is not re-established.
|
|
63
|
+
|
|
64
|
+
The platform sends `POLICY_VIOLATION` (1008) for an unknown/missing run ID or an exceeded per-run
|
|
65
|
+
connection limit. The rest are protocol, data, or mandatory-extension failures that reconnecting
|
|
66
|
+
cannot resolve.
|
|
67
|
+
"""
|
|
68
|
+
|
|
69
|
+
_HEALTHY_CONNECTION_MIN_DURATION = 1.0
|
|
70
|
+
"""Seconds a connection must stay open to count as healthy, after which a drop reconnects without backoff."""
|
|
71
|
+
|
|
72
|
+
def __init__(self, configuration: Configuration, **kwargs: Unpack[EventManagerOptions]) -> None:
|
|
73
|
+
"""Initialize a new instance.
|
|
74
|
+
|
|
75
|
+
Args:
|
|
76
|
+
configuration: The Actor configuration for the event manager.
|
|
77
|
+
**kwargs: Additional event manager options passed to the parent class.
|
|
78
|
+
"""
|
|
79
|
+
super().__init__(**kwargs)
|
|
80
|
+
|
|
81
|
+
self._configuration = configuration
|
|
82
|
+
"""The Actor configuration for the event manager."""
|
|
83
|
+
|
|
84
|
+
self._platform_events_websocket: websockets.asyncio.client.ClientConnection | None = None
|
|
85
|
+
"""WebSocket connection to the platform events."""
|
|
86
|
+
|
|
87
|
+
self._process_platform_messages_task: asyncio.Task | None = None
|
|
88
|
+
"""Task for processing messages from the platform websocket."""
|
|
89
|
+
|
|
90
|
+
self._connected_to_platform_websocket: asyncio.Future[bool] | None = None
|
|
91
|
+
"""Future that resolves when the connection to the platform websocket is established."""
|
|
92
|
+
|
|
93
|
+
@override
|
|
94
|
+
async def __aenter__(self) -> Self:
|
|
95
|
+
await super().__aenter__()
|
|
96
|
+
self._connected_to_platform_websocket = asyncio.Future()
|
|
97
|
+
|
|
98
|
+
# Run tasks but don't await them
|
|
99
|
+
if self._configuration.actor_events_ws_url:
|
|
100
|
+
self._process_platform_messages_task = asyncio.create_task(
|
|
101
|
+
self._process_platform_messages(self._configuration.actor_events_ws_url)
|
|
102
|
+
)
|
|
103
|
+
is_connected = await self._connected_to_platform_websocket
|
|
104
|
+
if not is_connected:
|
|
105
|
+
# Exit the already-entered parent so the recurring persist state task does not leak.
|
|
106
|
+
await self.__aexit__(None, None, None)
|
|
107
|
+
raise RuntimeError('Error connecting to platform events websocket!')
|
|
108
|
+
else:
|
|
109
|
+
logger.debug('APIFY_ACTOR_EVENTS_WS_URL env var not set, no events from Apify platform will be emitted.')
|
|
110
|
+
|
|
111
|
+
return self
|
|
112
|
+
|
|
113
|
+
@override
|
|
114
|
+
async def __aexit__(
|
|
115
|
+
self,
|
|
116
|
+
exc_type: type[BaseException] | None,
|
|
117
|
+
exc_value: BaseException | None,
|
|
118
|
+
exc_traceback: TracebackType | None,
|
|
119
|
+
) -> None:
|
|
120
|
+
# Cancel the task before closing the websocket so that the closed connection is not treated as a drop
|
|
121
|
+
# and followed by a reconnect attempt.
|
|
122
|
+
if self._process_platform_messages_task and not self._process_platform_messages_task.done():
|
|
123
|
+
self._process_platform_messages_task.cancel()
|
|
124
|
+
with contextlib.suppress(asyncio.CancelledError):
|
|
125
|
+
await self._process_platform_messages_task
|
|
126
|
+
|
|
127
|
+
if self._platform_events_websocket:
|
|
128
|
+
await self._platform_events_websocket.close()
|
|
129
|
+
|
|
130
|
+
await super().__aexit__(exc_type, exc_value, exc_traceback)
|
|
131
|
+
|
|
132
|
+
def _process_connection_exception(self, exc: Exception) -> Exception | None:
|
|
133
|
+
"""Decide whether a failed connection attempt to the platform websocket should be retried.
|
|
134
|
+
|
|
135
|
+
Before the first successful connection, every error is fatal so that `__aenter__` fails fast. After that,
|
|
136
|
+
the default `websockets` behavior decides which errors are transient and retried with exponential backoff.
|
|
137
|
+
"""
|
|
138
|
+
if self._connected_to_platform_websocket and self._connected_to_platform_websocket.done():
|
|
139
|
+
return websockets.asyncio.client.process_exception(exc)
|
|
140
|
+
return exc
|
|
141
|
+
|
|
142
|
+
async def _process_platform_messages(self, ws_url: str) -> None:
|
|
143
|
+
# The `websockets` reconnect iterator only backs off on failed connection *attempts*, not on a connection
|
|
144
|
+
# that opens and is then closed. Track our own backoff here so a server that keeps accepting and immediately
|
|
145
|
+
# closing is not hammered; it is reset after a healthy connection so a healthy drop reconnects immediately.
|
|
146
|
+
backoff_delays: Generator[float] | None = None
|
|
147
|
+
|
|
148
|
+
try:
|
|
149
|
+
# Used as an async iterator, `connect` reconnects with exponential backoff on failed connection attempts.
|
|
150
|
+
async for websocket in websockets.asyncio.client.connect(
|
|
151
|
+
ws_url, process_exception=self._process_connection_exception
|
|
152
|
+
):
|
|
153
|
+
self._platform_events_websocket = websocket
|
|
154
|
+
if self._connected_to_platform_websocket and not self._connected_to_platform_websocket.done():
|
|
155
|
+
self._connected_to_platform_websocket.set_result(True)
|
|
156
|
+
else:
|
|
157
|
+
logger.info('Reconnected to the platform events websocket.')
|
|
158
|
+
|
|
159
|
+
connection_opened_at = time.monotonic()
|
|
160
|
+
connection_lost = await self._consume_messages(websocket)
|
|
161
|
+
|
|
162
|
+
if not self._should_reconnect_after_close(websocket, connection_lost=connection_lost):
|
|
163
|
+
break
|
|
164
|
+
|
|
165
|
+
# Reconnect a healthy connection immediately; back off only on repeated rapid drops. The first
|
|
166
|
+
# rapid drop reconnects once without delay (it only primes the backoff generator), and each
|
|
167
|
+
# subsequent consecutive rapid drop then sleeps for the next backoff interval. A healthy
|
|
168
|
+
# connection resets the generator, so the next rapid drop again gets that one free retry.
|
|
169
|
+
if time.monotonic() - connection_opened_at >= self._HEALTHY_CONNECTION_MIN_DURATION:
|
|
170
|
+
backoff_delays = None
|
|
171
|
+
elif backoff_delays is None:
|
|
172
|
+
backoff_delays = websockets.client.backoff()
|
|
173
|
+
else:
|
|
174
|
+
await asyncio.sleep(next(backoff_delays))
|
|
175
|
+
except Exception:
|
|
176
|
+
logger.exception('Error in websocket connection')
|
|
177
|
+
if self._connected_to_platform_websocket is not None and not self._connected_to_platform_websocket.done():
|
|
178
|
+
self._connected_to_platform_websocket.set_result(False)
|
|
179
|
+
|
|
180
|
+
async def _consume_messages(self, websocket: websockets.asyncio.client.ClientConnection) -> bool:
|
|
181
|
+
"""Handle platform messages until the connection closes; return whether it was lost vs. closed cleanly."""
|
|
182
|
+
try:
|
|
183
|
+
async for message in websocket:
|
|
184
|
+
await self._handle_platform_message(message)
|
|
185
|
+
except websockets.exceptions.ConnectionClosed:
|
|
186
|
+
return True
|
|
187
|
+
return False
|
|
188
|
+
|
|
189
|
+
async def _handle_platform_message(self, message: str | bytes) -> None:
|
|
190
|
+
"""Parse a single platform message and emit the matching local event."""
|
|
191
|
+
try:
|
|
192
|
+
parsed_message = event_data_adapter.validate_json(message)
|
|
193
|
+
|
|
194
|
+
if isinstance(parsed_message, DeprecatedEvent):
|
|
195
|
+
return
|
|
196
|
+
|
|
197
|
+
if isinstance(parsed_message, UnknownEvent):
|
|
198
|
+
logger.info(
|
|
199
|
+
f'Unknown message received: event_name={parsed_message.name}, event_data={parsed_message.data}'
|
|
200
|
+
)
|
|
201
|
+
return
|
|
202
|
+
|
|
203
|
+
self.emit(
|
|
204
|
+
event=parsed_message.name,
|
|
205
|
+
event_data=parsed_message.data
|
|
206
|
+
if not isinstance(parsed_message.data, SystemInfoEventData)
|
|
207
|
+
else parsed_message.data.to_crawlee_format(self._configuration.dedicated_cpus or 1),
|
|
208
|
+
)
|
|
209
|
+
|
|
210
|
+
if parsed_message.name == Event.MIGRATING:
|
|
211
|
+
await self._emit_persist_state_event_rec_task.stop()
|
|
212
|
+
self.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=True))
|
|
213
|
+
except Exception:
|
|
214
|
+
logger.exception('Cannot parse Actor event', extra={'raw_message': message})
|
|
215
|
+
|
|
216
|
+
def _should_reconnect_after_close(
|
|
217
|
+
self,
|
|
218
|
+
websocket: websockets.asyncio.client.ClientConnection,
|
|
219
|
+
*,
|
|
220
|
+
connection_lost: bool,
|
|
221
|
+
) -> bool:
|
|
222
|
+
"""Log the websocket close and report whether to reconnect (`False` on a non-retryable close code)."""
|
|
223
|
+
if websocket.close_code in self._NON_RETRYABLE_CLOSE_CODES:
|
|
224
|
+
logger.error(
|
|
225
|
+
f'Connection to platform events websocket was closed with a non-retryable code '
|
|
226
|
+
f'(code={websocket.close_code}, reason={websocket.close_reason!r}); not reconnecting. '
|
|
227
|
+
f'No further platform events will be received for the rest of this run, so migration and '
|
|
228
|
+
f'abort handling (e.g. persisting state before a migration) is now disabled.'
|
|
229
|
+
)
|
|
230
|
+
return False
|
|
231
|
+
|
|
232
|
+
if connection_lost:
|
|
233
|
+
logger.warning(
|
|
234
|
+
f'Connection to platform events websocket was lost '
|
|
235
|
+
f'(code={websocket.close_code}, reason={websocket.close_reason!r}), reconnecting...'
|
|
236
|
+
)
|
|
237
|
+
else:
|
|
238
|
+
logger.info(
|
|
239
|
+
f'Connection to platform events websocket was closed '
|
|
240
|
+
f'(code={websocket.close_code}, reason={websocket.close_reason!r}), reconnecting...'
|
|
241
|
+
)
|
|
242
|
+
return True
|
|
@@ -6,14 +6,13 @@ from functools import cached_property
|
|
|
6
6
|
from logging import getLogger
|
|
7
7
|
from typing import TYPE_CHECKING, ClassVar, Literal, overload
|
|
8
8
|
|
|
9
|
-
from apify_client import ApifyClientAsync
|
|
10
|
-
|
|
11
9
|
from ._utils import hash_api_public_base_url_and_token
|
|
12
10
|
|
|
13
11
|
if TYPE_CHECKING:
|
|
14
12
|
from collections.abc import Callable
|
|
15
13
|
from types import TracebackType
|
|
16
14
|
|
|
15
|
+
from apify_client import ApifyClientAsync
|
|
17
16
|
from apify_client._resource_clients import (
|
|
18
17
|
DatasetClientAsync,
|
|
19
18
|
DatasetCollectionClientAsync,
|
|
@@ -35,6 +34,7 @@ async def open_by_alias(
|
|
|
35
34
|
storage_type: Literal['Dataset'],
|
|
36
35
|
collection_client: DatasetCollectionClientAsync,
|
|
37
36
|
get_resource_client_by_id: Callable[[str], DatasetClientAsync],
|
|
37
|
+
api_client: ApifyClientAsync,
|
|
38
38
|
configuration: Configuration,
|
|
39
39
|
) -> DatasetClientAsync: ...
|
|
40
40
|
|
|
@@ -46,6 +46,7 @@ async def open_by_alias(
|
|
|
46
46
|
storage_type: Literal['KeyValueStore'],
|
|
47
47
|
collection_client: KeyValueStoreCollectionClientAsync,
|
|
48
48
|
get_resource_client_by_id: Callable[[str], KeyValueStoreClientAsync],
|
|
49
|
+
api_client: ApifyClientAsync,
|
|
49
50
|
configuration: Configuration,
|
|
50
51
|
) -> KeyValueStoreClientAsync: ...
|
|
51
52
|
|
|
@@ -57,6 +58,7 @@ async def open_by_alias(
|
|
|
57
58
|
storage_type: Literal['RequestQueue'],
|
|
58
59
|
collection_client: RequestQueueCollectionClientAsync,
|
|
59
60
|
get_resource_client_by_id: Callable[[str], RequestQueueClientAsync],
|
|
61
|
+
api_client: ApifyClientAsync,
|
|
60
62
|
configuration: Configuration,
|
|
61
63
|
) -> RequestQueueClientAsync: ...
|
|
62
64
|
|
|
@@ -69,6 +71,7 @@ async def open_by_alias(
|
|
|
69
71
|
KeyValueStoreCollectionClientAsync | RequestQueueCollectionClientAsync | DatasetCollectionClientAsync
|
|
70
72
|
),
|
|
71
73
|
get_resource_client_by_id: Callable[[str], KeyValueStoreClientAsync | RequestQueueClientAsync | DatasetClientAsync],
|
|
74
|
+
api_client: ApifyClientAsync,
|
|
72
75
|
configuration: Configuration,
|
|
73
76
|
) -> KeyValueStoreClientAsync | RequestQueueClientAsync | DatasetClientAsync:
|
|
74
77
|
"""Open storage by alias, creating it if necessary.
|
|
@@ -81,6 +84,8 @@ async def open_by_alias(
|
|
|
81
84
|
storage_type: The type of storage to open.
|
|
82
85
|
collection_client: The Apify API collection client for the storage type.
|
|
83
86
|
get_resource_client_by_id: A callable that takes a storage ID and returns the resource client.
|
|
87
|
+
api_client: The Apify API client used for the storage operation. Reused to access the default KVS that
|
|
88
|
+
holds the alias mapping, so alias resolution does not spin up its own client.
|
|
84
89
|
configuration: Configuration object containing API credentials and settings.
|
|
85
90
|
|
|
86
91
|
Returns:
|
|
@@ -94,6 +99,7 @@ async def open_by_alias(
|
|
|
94
99
|
storage_type=storage_type,
|
|
95
100
|
alias=alias,
|
|
96
101
|
configuration=configuration,
|
|
102
|
+
api_client=api_client,
|
|
97
103
|
) as alias_resolver:
|
|
98
104
|
storage_id = await alias_resolver.resolve_id()
|
|
99
105
|
|
|
@@ -142,10 +148,12 @@ class AliasResolver:
|
|
|
142
148
|
storage_type: Literal['Dataset', 'KeyValueStore', 'RequestQueue'],
|
|
143
149
|
alias: str,
|
|
144
150
|
configuration: Configuration,
|
|
151
|
+
api_client: ApifyClientAsync,
|
|
145
152
|
) -> None:
|
|
146
153
|
self._storage_type = storage_type
|
|
147
154
|
self._alias = alias
|
|
148
155
|
self._configuration = configuration
|
|
156
|
+
self._api_client = api_client
|
|
149
157
|
|
|
150
158
|
async def __aenter__(self) -> AliasResolver:
|
|
151
159
|
"""Context manager to prevent race condition in alias creation."""
|
|
@@ -173,26 +181,22 @@ class AliasResolver:
|
|
|
173
181
|
cls._alias_init_lock = Lock()
|
|
174
182
|
return cls._alias_init_lock
|
|
175
183
|
|
|
176
|
-
|
|
177
|
-
async def _get_alias_map(cls, configuration: Configuration) -> dict[str, str]:
|
|
184
|
+
async def _get_alias_map(self) -> dict[str, str]:
|
|
178
185
|
"""Get the aliases and storage ids mapping from the default kvs.
|
|
179
186
|
|
|
180
|
-
Mapping is loaded from kvs only once and is shared for all instances of the AliasResolver class.
|
|
181
|
-
|
|
182
|
-
Args:
|
|
183
|
-
configuration: Configuration object to use for accessing the default KVS.
|
|
187
|
+
Mapping is loaded from kvs only once and is shared for all instances of the `AliasResolver` class.
|
|
184
188
|
|
|
185
189
|
Returns:
|
|
186
190
|
Map of aliases and storage ids.
|
|
187
191
|
"""
|
|
188
|
-
if not
|
|
189
|
-
default_kvs_client =
|
|
192
|
+
if not AliasResolver._alias_map_loaded and self._configuration.is_at_home:
|
|
193
|
+
default_kvs_client = self._get_default_kvs_client()
|
|
190
194
|
|
|
191
|
-
record = await default_kvs_client.get_record(
|
|
192
|
-
|
|
193
|
-
|
|
195
|
+
record = await default_kvs_client.get_record(self._ALIAS_MAPPING_KEY)
|
|
196
|
+
AliasResolver._alias_map = record.get('value', {}) if record else {}
|
|
197
|
+
AliasResolver._alias_map_loaded = True
|
|
194
198
|
|
|
195
|
-
return
|
|
199
|
+
return AliasResolver._alias_map
|
|
196
200
|
|
|
197
201
|
async def resolve_id(self) -> str | None:
|
|
198
202
|
"""Get id of the aliased storage.
|
|
@@ -212,12 +216,12 @@ class AliasResolver:
|
|
|
212
216
|
return storage_id
|
|
213
217
|
|
|
214
218
|
# Fallback to the mapping saved in the default KVS
|
|
215
|
-
return (await self._get_alias_map(
|
|
219
|
+
return (await self._get_alias_map()).get(self._storage_key, None)
|
|
216
220
|
|
|
217
221
|
async def store_mapping(self, storage_id: str) -> None:
|
|
218
222
|
"""Add alias and related storage id to the mapping in default kvs and local in-memory mapping."""
|
|
219
223
|
# Update in-memory mapping
|
|
220
|
-
alias_map = await self._get_alias_map(
|
|
224
|
+
alias_map = await self._get_alias_map()
|
|
221
225
|
alias_map[self._storage_key] = storage_id
|
|
222
226
|
|
|
223
227
|
if not self._configuration.is_at_home:
|
|
@@ -226,7 +230,7 @@ class AliasResolver:
|
|
|
226
230
|
)
|
|
227
231
|
return
|
|
228
232
|
|
|
229
|
-
default_kvs_client =
|
|
233
|
+
default_kvs_client = self._get_default_kvs_client()
|
|
230
234
|
|
|
231
235
|
try:
|
|
232
236
|
record = await default_kvs_client.get_record(self._ALIAS_MAPPING_KEY)
|
|
@@ -249,16 +253,14 @@ class AliasResolver:
|
|
|
249
253
|
]
|
|
250
254
|
)
|
|
251
255
|
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
"""Get a client for the default key-value store."""
|
|
255
|
-
apify_client_async = ApifyClientAsync(
|
|
256
|
-
token=configuration.token,
|
|
257
|
-
api_url=configuration.api_base_url,
|
|
258
|
-
max_retries=8,
|
|
259
|
-
)
|
|
256
|
+
def _get_default_kvs_client(self) -> KeyValueStoreClientAsync:
|
|
257
|
+
"""Get a client for the default key-value store.
|
|
260
258
|
|
|
261
|
-
|
|
259
|
+
Derived from the injected `ApifyClientAsync`, so alias resolution shares the same HTTP client (and its
|
|
260
|
+
connection pool and event loop affinity) as the storage operation that triggered it, instead of creating
|
|
261
|
+
and leaking its own.
|
|
262
|
+
"""
|
|
263
|
+
if not self._configuration.default_key_value_store_id:
|
|
262
264
|
raise ValueError("'Configuration.default_key_value_store_id' must be set.")
|
|
263
265
|
|
|
264
|
-
return
|
|
266
|
+
return self._api_client.key_value_store(key_value_store_id=self._configuration.default_key_value_store_id)
|
|
@@ -117,6 +117,7 @@ async def create_storage_api_client(
|
|
|
117
117
|
storage_type=storage_type,
|
|
118
118
|
collection_client=collection_client,
|
|
119
119
|
get_resource_client_by_id=get_resource_client,
|
|
120
|
+
api_client=apify_client,
|
|
120
121
|
configuration=configuration,
|
|
121
122
|
) # ty:ignore[no-matching-overload]
|
|
122
123
|
|
|
@@ -127,6 +128,7 @@ async def create_storage_api_client(
|
|
|
127
128
|
storage_type=storage_type,
|
|
128
129
|
collection_client=collection_client,
|
|
129
130
|
get_resource_client_by_id=get_resource_client,
|
|
131
|
+
api_client=apify_client,
|
|
130
132
|
configuration=configuration,
|
|
131
133
|
) # ty:ignore[no-matching-overload]
|
|
132
134
|
|
|
@@ -1,142 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import asyncio
|
|
4
|
-
import contextlib
|
|
5
|
-
from typing import TYPE_CHECKING, Annotated, Self
|
|
6
|
-
|
|
7
|
-
import websockets.asyncio.client
|
|
8
|
-
from pydantic import Discriminator, TypeAdapter
|
|
9
|
-
from typing_extensions import Unpack, override
|
|
10
|
-
|
|
11
|
-
from crawlee.events import EventManager
|
|
12
|
-
from crawlee.events._types import Event, EventPersistStateData
|
|
13
|
-
|
|
14
|
-
from apify._utils import docs_group
|
|
15
|
-
from apify.events._types import DeprecatedEvent, EventMessage, SystemInfoEventData, UnknownEvent
|
|
16
|
-
from apify.log import logger
|
|
17
|
-
|
|
18
|
-
if TYPE_CHECKING:
|
|
19
|
-
from types import TracebackType
|
|
20
|
-
|
|
21
|
-
from crawlee.events._event_manager import EventManagerOptions
|
|
22
|
-
|
|
23
|
-
from apify._configuration import Configuration
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
event_data_adapter = TypeAdapter[EventMessage | DeprecatedEvent | UnknownEvent](
|
|
27
|
-
Annotated[EventMessage, Discriminator('name')] | DeprecatedEvent | UnknownEvent
|
|
28
|
-
)
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
@docs_group('Event managers')
class ApifyEventManager(EventManager):
    """Event manager for the Apify platform.

    This class extends Crawlee's `EventManager` to provide Apify-specific functionality, including websocket
    connectivity to the Apify platform for receiving platform events.

    The event manager handles:
    - Registration and emission of events and their listeners.
    - Websocket connection to Apify platform events.
    - Processing and validation of platform messages.
    - Automatic event forwarding from the platform to local event listeners.

    This class should not be used directly. Use the `Actor.on` and `Actor.off` methods to interact
    with the event system.
    """

    def __init__(self, configuration: Configuration, **kwargs: Unpack[EventManagerOptions]) -> None:
        """Initialize a new instance.

        Args:
            configuration: The Actor configuration for the event manager.
            **kwargs: Additional event manager options passed to the parent class.
        """
        super().__init__(**kwargs)

        self._configuration = configuration
        """The Actor configuration for the event manager."""

        self._platform_events_websocket: websockets.asyncio.client.ClientConnection | None = None
        """WebSocket connection to the platform events."""

        self._process_platform_messages_task: asyncio.Task | None = None
        """Task for processing messages from the platform websocket."""

        self._connected_to_platform_websocket: asyncio.Future[bool] | None = None
        """Future that resolves when the connection to the platform websocket is established."""

    @override
    async def __aenter__(self) -> Self:
        """Enter the parent event manager and, if configured, connect to the platform events websocket.

        When `actor_events_ws_url` is set in the configuration, a background task processing the platform
        messages is started and this method waits until the connection attempt has finished.

        Returns:
            This event manager instance.

        Raises:
            RuntimeError: If the connection to the platform events websocket could not be established.
        """
        await super().__aenter__()
        self._connected_to_platform_websocket = asyncio.Future()

        # Run tasks but don't await them
        if self._configuration.actor_events_ws_url:
            self._process_platform_messages_task = asyncio.create_task(
                self._process_platform_messages(self._configuration.actor_events_ws_url)
            )
            is_connected = await self._connected_to_platform_websocket
            if not is_connected:
                # Exit the already-entered parent so the recurring persist state task does not leak.
                await self.__aexit__(None, None, None)
                raise RuntimeError('Error connecting to platform events websocket!')
        else:
            logger.debug('APIFY_ACTOR_EVENTS_WS_URL env var not set, no events from Apify platform will be emitted.')

        return self

    @override
    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Close the platform connection, stop the message processing task and exit the parent event manager.

        The parent's `__aexit__` is always invoked, even when closing the websocket or stopping the message
        processing task raises, so that resources owned by the parent are not leaked.

        Args:
            exc_type: Type of the exception that caused the context to be exited, if any.
            exc_value: The exception that caused the context to be exited, if any.
            exc_traceback: Traceback of the exception that caused the context to be exited, if any.
        """
        try:
            await self._close_platform_connection()
        finally:
            await super().__aexit__(exc_type, exc_value, exc_traceback)

    async def _close_platform_connection(self) -> None:
        """Close the platform websocket (if any) and cancel the message processing task (if still running)."""
        try:
            if self._platform_events_websocket:
                await self._platform_events_websocket.close()
        finally:
            # Cancel the task even if closing the websocket failed, otherwise it would keep running in background.
            task = self._process_platform_messages_task
            if task and not task.done():
                task.cancel()
                with contextlib.suppress(asyncio.CancelledError):
                    await task

    async def _process_platform_messages(self, ws_url: str) -> None:
        """Connect to the platform events websocket and forward received messages to local listeners.

        The outcome of the connection attempt is reported through `self._connected_to_platform_websocket`
        (`True` once connected, `False` when an error occurred before the connection was reported).

        For each received message:
        - `DeprecatedEvent` messages are skipped.
        - `UnknownEvent` messages are logged and skipped.
        - Other messages are emitted under their `name`; `SystemInfoEventData` payloads are first converted
          with `to_crawlee_format`.
        - On `Event.MIGRATING`, the recurring persist state task is stopped and a final `Event.PERSIST_STATE`
          event with `is_migrating=True` is emitted.

        Errors raised while handling a single message are logged and do not stop the processing of further
        messages. Errors of the connection itself are logged and end the processing.

        Args:
            ws_url: URL of the platform events websocket.
        """
        try:
            async with websockets.asyncio.client.connect(ws_url) as websocket:
                self._platform_events_websocket = websocket

                connected = self._connected_to_platform_websocket
                # The future may already be done (e.g. cancelled together with a cancelled `__aenter__`);
                # setting a result then would raise `InvalidStateError` and tear down the fresh connection.
                if connected is not None and not connected.done():
                    connected.set_result(True)

                async for message in websocket:
                    try:
                        parsed_message = event_data_adapter.validate_json(message)

                        if isinstance(parsed_message, DeprecatedEvent):
                            continue

                        if isinstance(parsed_message, UnknownEvent):
                            logger.info(
                                f'Unknown message received: event_name={parsed_message.name}, '
                                f'event_data={parsed_message.data}'
                            )
                            continue

                        self.emit(
                            event=parsed_message.name,
                            event_data=parsed_message.data
                            if not isinstance(parsed_message.data, SystemInfoEventData)
                            else parsed_message.data.to_crawlee_format(self._configuration.dedicated_cpus or 1),
                        )

                        if parsed_message.name == Event.MIGRATING:
                            await self._emit_persist_state_event_rec_task.stop()
                            self.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=True))
                    except Exception:
                        logger.exception('Cannot parse Actor event', extra={'raw_message': message})
        except Exception:
            logger.exception('Error in websocket connection')
            if self._connected_to_platform_websocket is not None and not self._connected_to_platform_websocket.done():
                self._connected_to_platform_websocket.set_result(False)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_apify/_key_value_store_client.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_apify/_request_queue_shared_client.py
RENAMED
|
File without changes
|
{apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_apify/_request_queue_single_client.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{apify-3.4.2b22 → apify-3.4.2b24}/src/apify/storage_clients/_file_system/_key_value_store_client.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|