apify 2.7.3__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of apify might be problematic. Click here for more details.
- apify/_actor.py +194 -126
- apify/_charging.py +34 -9
- apify/_configuration.py +70 -6
- apify/_crypto.py +0 -6
- apify/_models.py +7 -7
- apify/_proxy_configuration.py +10 -10
- apify/_utils.py +25 -2
- apify/events/__init__.py +5 -0
- apify/events/_apify_event_manager.py +140 -0
- apify/events/_types.py +102 -0
- apify/log.py +0 -9
- apify/request_loaders/__init__.py +18 -0
- apify/{storages/_request_list.py → request_loaders/_apify_request_list.py} +25 -18
- apify/request_loaders/py.typed +0 -0
- apify/scrapy/_logging_config.py +1 -4
- apify/scrapy/extensions/_httpcache.py +9 -5
- apify/scrapy/requests.py +3 -3
- apify/scrapy/scheduler.py +8 -5
- apify/storage_clients/__init__.py +12 -0
- apify/storage_clients/_apify/__init__.py +11 -0
- apify/storage_clients/_apify/_dataset_client.py +328 -0
- apify/storage_clients/_apify/_key_value_store_client.py +265 -0
- apify/storage_clients/_apify/_models.py +131 -0
- apify/storage_clients/_apify/_request_queue_client.py +327 -0
- apify/storage_clients/_apify/_request_queue_shared_client.py +527 -0
- apify/storage_clients/_apify/_request_queue_single_client.py +399 -0
- apify/storage_clients/_apify/_storage_client.py +106 -0
- apify/storage_clients/_apify/_utils.py +194 -0
- apify/storage_clients/_apify/py.typed +0 -0
- apify/storage_clients/_file_system/__init__.py +2 -0
- apify/storage_clients/_file_system/_key_value_store_client.py +57 -0
- apify/storage_clients/_file_system/_storage_client.py +41 -0
- apify/storage_clients/_smart_apify/__init__.py +1 -0
- apify/storage_clients/_smart_apify/_storage_client.py +117 -0
- apify/storage_clients/py.typed +0 -0
- apify/storages/__init__.py +1 -3
- {apify-2.7.3.dist-info → apify-3.0.0.dist-info}/METADATA +25 -9
- apify-3.0.0.dist-info/RECORD +57 -0
- apify/_platform_event_manager.py +0 -231
- apify/apify_storage_client/__init__.py +0 -3
- apify/apify_storage_client/_apify_storage_client.py +0 -72
- apify/apify_storage_client/_dataset_client.py +0 -190
- apify/apify_storage_client/_dataset_collection_client.py +0 -51
- apify/apify_storage_client/_key_value_store_client.py +0 -109
- apify/apify_storage_client/_key_value_store_collection_client.py +0 -51
- apify/apify_storage_client/_request_queue_client.py +0 -176
- apify/apify_storage_client/_request_queue_collection_client.py +0 -51
- apify-2.7.3.dist-info/RECORD +0 -44
- /apify/{apify_storage_client → events}/py.typed +0 -0
- {apify-2.7.3.dist-info → apify-3.0.0.dist-info}/WHEEL +0 -0
- {apify-2.7.3.dist-info → apify-3.0.0.dist-info}/licenses/LICENSE +0 -0
apify/_charging.py
CHANGED
|
@@ -4,11 +4,10 @@ import math
|
|
|
4
4
|
from dataclasses import dataclass
|
|
5
5
|
from datetime import datetime, timezone
|
|
6
6
|
from decimal import Decimal
|
|
7
|
-
from typing import TYPE_CHECKING, Protocol, Union
|
|
7
|
+
from typing import TYPE_CHECKING, Protocol
|
|
8
8
|
|
|
9
9
|
from pydantic import TypeAdapter
|
|
10
10
|
|
|
11
|
-
from apify_shared.utils import ignore_docs
|
|
12
11
|
from crawlee._utils.context import ensure_context
|
|
13
12
|
|
|
14
13
|
from apify._models import ActorRun, PricingModel
|
|
@@ -23,13 +22,21 @@ if TYPE_CHECKING:
|
|
|
23
22
|
|
|
24
23
|
from apify._configuration import Configuration
|
|
25
24
|
|
|
25
|
+
run_validator = TypeAdapter[ActorRun | None](ActorRun | None)
|
|
26
26
|
|
|
27
|
-
run_validator: TypeAdapter[ActorRun | None] = TypeAdapter(Union[ActorRun, None])
|
|
28
27
|
|
|
29
|
-
|
|
30
|
-
@docs_group('Interfaces')
|
|
28
|
+
@docs_group('Charging')
|
|
31
29
|
class ChargingManager(Protocol):
|
|
32
|
-
"""Provides fine-grained access to pay-per-event functionality."""
|
|
30
|
+
"""Provides fine-grained access to pay-per-event functionality.
|
|
31
|
+
|
|
32
|
+
The ChargingManager allows you to charge for specific events in your Actor when using
|
|
33
|
+
the pay-per-event pricing model. This enables precise cost control and transparent
|
|
34
|
+
billing for different operations within your Actor.
|
|
35
|
+
|
|
36
|
+
### References
|
|
37
|
+
|
|
38
|
+
- Apify platform documentation: https://docs.apify.com/platform/actors/publishing/monetize
|
|
39
|
+
"""
|
|
33
40
|
|
|
34
41
|
async def charge(self, event_name: str, count: int = 1) -> ChargeResult:
|
|
35
42
|
"""Charge for a specified number of events - sub-operations of the Actor.
|
|
@@ -57,8 +64,18 @@ class ChargingManager(Protocol):
|
|
|
57
64
|
This can be used for instance when your code needs to support multiple pricing models in transition periods.
|
|
58
65
|
"""
|
|
59
66
|
|
|
67
|
+
def get_charged_event_count(self, event_name: str) -> int:
|
|
68
|
+
"""Get the number of events with the given name that were charged so far.
|
|
69
|
+
|
|
70
|
+
Args:
|
|
71
|
+
event_name: Name of the inspected event.
|
|
72
|
+
"""
|
|
73
|
+
|
|
74
|
+
def get_max_total_charge_usd(self) -> Decimal:
|
|
75
|
+
"""Get the configured maximum total charge for this Actor run."""
|
|
60
76
|
|
|
61
|
-
|
|
77
|
+
|
|
78
|
+
@docs_group('Charging')
|
|
62
79
|
@dataclass(frozen=True)
|
|
63
80
|
class ChargeResult:
|
|
64
81
|
"""Result of the `ChargingManager.charge` method."""
|
|
@@ -73,7 +90,7 @@ class ChargeResult:
|
|
|
73
90
|
"""How many events of each known type can still be charged within the limit."""
|
|
74
91
|
|
|
75
92
|
|
|
76
|
-
@docs_group('
|
|
93
|
+
@docs_group('Charging')
|
|
77
94
|
@dataclass
|
|
78
95
|
class ActorPricingInfo:
|
|
79
96
|
"""Result of the `ChargingManager.get_pricing_info` method."""
|
|
@@ -91,7 +108,6 @@ class ActorPricingInfo:
|
|
|
91
108
|
"""Price of every known event type."""
|
|
92
109
|
|
|
93
110
|
|
|
94
|
-
@ignore_docs
|
|
95
111
|
class ChargingManagerImplementation(ChargingManager):
|
|
96
112
|
"""Implementation of the `ChargingManager` Protocol - this is only meant to be instantiated internally."""
|
|
97
113
|
|
|
@@ -303,6 +319,15 @@ class ChargingManagerImplementation(ChargingManager):
|
|
|
303
319
|
},
|
|
304
320
|
)
|
|
305
321
|
|
|
322
|
+
@ensure_context
|
|
323
|
+
def get_charged_event_count(self, event_name: str) -> int:
|
|
324
|
+
item = self._charging_state.get(event_name)
|
|
325
|
+
return item.charge_count if item is not None else 0
|
|
326
|
+
|
|
327
|
+
@ensure_context
|
|
328
|
+
def get_max_total_charge_usd(self) -> Decimal:
|
|
329
|
+
return self._max_total_charge_usd
|
|
330
|
+
|
|
306
331
|
|
|
307
332
|
@dataclass
|
|
308
333
|
class ChargingStateItem:
|
apify/_configuration.py
CHANGED
|
@@ -8,6 +8,7 @@ from typing import Annotated, Any
|
|
|
8
8
|
from pydantic import AliasChoices, BeforeValidator, Field, model_validator
|
|
9
9
|
from typing_extensions import Self, deprecated
|
|
10
10
|
|
|
11
|
+
from crawlee import service_locator
|
|
11
12
|
from crawlee._utils.models import timedelta_ms
|
|
12
13
|
from crawlee._utils.urls import validate_http_url
|
|
13
14
|
from crawlee.configuration import Configuration as CrawleeConfiguration
|
|
@@ -25,7 +26,7 @@ def _transform_to_list(value: Any) -> list[str] | None:
|
|
|
25
26
|
return value if isinstance(value, list) else str(value).split(',')
|
|
26
27
|
|
|
27
28
|
|
|
28
|
-
@docs_group('
|
|
29
|
+
@docs_group('Configuration')
|
|
29
30
|
class Configuration(CrawleeConfiguration):
|
|
30
31
|
"""A class for specifying the configuration of an Actor.
|
|
31
32
|
|
|
@@ -140,6 +141,39 @@ class Configuration(CrawleeConfiguration):
|
|
|
140
141
|
),
|
|
141
142
|
] = None
|
|
142
143
|
|
|
144
|
+
default_dataset_id: Annotated[
|
|
145
|
+
str | None,
|
|
146
|
+
Field(
|
|
147
|
+
validation_alias=AliasChoices(
|
|
148
|
+
'actor_default_dataset_id',
|
|
149
|
+
'apify_default_dataset_id',
|
|
150
|
+
),
|
|
151
|
+
description='Default dataset ID used by the Apify storage client when no ID or name is provided.',
|
|
152
|
+
),
|
|
153
|
+
] = None
|
|
154
|
+
|
|
155
|
+
default_key_value_store_id: Annotated[
|
|
156
|
+
str | None,
|
|
157
|
+
Field(
|
|
158
|
+
validation_alias=AliasChoices(
|
|
159
|
+
'actor_default_key_value_store_id',
|
|
160
|
+
'apify_default_key_value_store_id',
|
|
161
|
+
),
|
|
162
|
+
description='Default key-value store ID for the Apify storage client when no ID or name is provided.',
|
|
163
|
+
),
|
|
164
|
+
] = None
|
|
165
|
+
|
|
166
|
+
default_request_queue_id: Annotated[
|
|
167
|
+
str | None,
|
|
168
|
+
Field(
|
|
169
|
+
validation_alias=AliasChoices(
|
|
170
|
+
'actor_default_request_queue_id',
|
|
171
|
+
'apify_default_request_queue_id',
|
|
172
|
+
),
|
|
173
|
+
description='Default request queue ID for the Apify storage client when no ID or name is provided.',
|
|
174
|
+
),
|
|
175
|
+
] = None
|
|
176
|
+
|
|
143
177
|
disable_outdated_warning: Annotated[
|
|
144
178
|
bool,
|
|
145
179
|
Field(
|
|
@@ -391,11 +425,41 @@ class Configuration(CrawleeConfiguration):
|
|
|
391
425
|
def get_global_configuration(cls) -> Configuration:
|
|
392
426
|
"""Retrieve the global instance of the configuration.
|
|
393
427
|
|
|
394
|
-
|
|
395
|
-
|
|
428
|
+
This method ensures that ApifyConfigration is returned, even if CrawleeConfiguration was set in the
|
|
429
|
+
service locator.
|
|
430
|
+
"""
|
|
431
|
+
global_configuration = service_locator.get_configuration()
|
|
432
|
+
|
|
433
|
+
if isinstance(global_configuration, Configuration):
|
|
434
|
+
# If Apify configuration was already stored in service locator, return it.
|
|
435
|
+
return global_configuration
|
|
436
|
+
|
|
437
|
+
logger.warning(
|
|
438
|
+
'Non Apify Configration is set in the `service_locator` in the SDK context. '
|
|
439
|
+
'It is recommended to set `apify.Configuration` explicitly as early as possible by using '
|
|
440
|
+
'service_locator.set_configuration'
|
|
441
|
+
)
|
|
442
|
+
|
|
443
|
+
return cls.from_configuration(global_configuration)
|
|
444
|
+
|
|
445
|
+
@classmethod
|
|
446
|
+
def from_configuration(cls, configuration: CrawleeConfiguration) -> Configuration:
|
|
447
|
+
"""Create Apify Configuration from existing Crawlee Configuration.
|
|
448
|
+
|
|
449
|
+
Args:
|
|
450
|
+
configuration: The existing Crawlee Configuration.
|
|
451
|
+
|
|
452
|
+
Returns:
|
|
453
|
+
The created Apify Configuration.
|
|
396
454
|
"""
|
|
397
|
-
|
|
455
|
+
apify_configuration = cls()
|
|
398
456
|
|
|
457
|
+
# Ensure the returned configuration is of type Apify Configuration.
|
|
458
|
+
# Most likely crawlee configuration was already set. Create Apify configuration from it.
|
|
459
|
+
# Due to known Pydantic issue https://github.com/pydantic/pydantic/issues/9516, creating new instance of
|
|
460
|
+
# Configuration from existing one in situation where environment can have some fields set by alias is very
|
|
461
|
+
# unpredictable. Use the stable workaround.
|
|
462
|
+
for name in configuration.model_fields:
|
|
463
|
+
setattr(apify_configuration, name, getattr(configuration, name))
|
|
399
464
|
|
|
400
|
-
|
|
401
|
-
CrawleeConfiguration.get_global_configuration = Configuration.get_global_configuration # type: ignore[method-assign]
|
|
465
|
+
return apify_configuration
|
apify/_crypto.py
CHANGED
|
@@ -12,7 +12,6 @@ from cryptography.hazmat.primitives import hashes, serialization
|
|
|
12
12
|
from cryptography.hazmat.primitives.asymmetric import padding, rsa
|
|
13
13
|
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
|
|
14
14
|
|
|
15
|
-
from apify_shared.utils import ignore_docs
|
|
16
15
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
17
16
|
|
|
18
17
|
from apify._consts import ENCRYPTED_INPUT_VALUE_REGEXP, ENCRYPTED_JSON_VALUE_PREFIX, ENCRYPTED_STRING_VALUE_PREFIX
|
|
@@ -22,7 +21,6 @@ ENCRYPTION_IV_LENGTH = 16
|
|
|
22
21
|
ENCRYPTION_AUTH_TAG_LENGTH = 16
|
|
23
22
|
|
|
24
23
|
|
|
25
|
-
@ignore_docs
|
|
26
24
|
def public_encrypt(value: str, *, public_key: rsa.RSAPublicKey) -> dict:
|
|
27
25
|
"""Encrypts the given value using AES cipher and the password for encryption using the public key.
|
|
28
26
|
|
|
@@ -66,7 +64,6 @@ def public_encrypt(value: str, *, public_key: rsa.RSAPublicKey) -> dict:
|
|
|
66
64
|
}
|
|
67
65
|
|
|
68
66
|
|
|
69
|
-
@ignore_docs
|
|
70
67
|
def private_decrypt(
|
|
71
68
|
encrypted_password: str,
|
|
72
69
|
encrypted_value: str,
|
|
@@ -118,7 +115,6 @@ def private_decrypt(
|
|
|
118
115
|
return decipher_bytes.decode('utf-8')
|
|
119
116
|
|
|
120
117
|
|
|
121
|
-
@ignore_docs
|
|
122
118
|
def load_private_key(private_key_file_base64: str, private_key_password: str) -> rsa.RSAPrivateKey:
|
|
123
119
|
private_key = serialization.load_pem_private_key(
|
|
124
120
|
base64.b64decode(private_key_file_base64.encode('utf-8')),
|
|
@@ -138,7 +134,6 @@ def _load_public_key(public_key_file_base64: str) -> rsa.RSAPublicKey:
|
|
|
138
134
|
return public_key
|
|
139
135
|
|
|
140
136
|
|
|
141
|
-
@ignore_docs
|
|
142
137
|
def decrypt_input_secrets(private_key: rsa.RSAPrivateKey, input_data: Any) -> Any:
|
|
143
138
|
"""Decrypt input secrets."""
|
|
144
139
|
if not isinstance(input_data, dict):
|
|
@@ -180,7 +175,6 @@ def encode_base62(num: int) -> str:
|
|
|
180
175
|
return res
|
|
181
176
|
|
|
182
177
|
|
|
183
|
-
@ignore_docs
|
|
184
178
|
def create_hmac_signature(secret_key: str, message: str) -> str:
|
|
185
179
|
"""Generate an HMAC signature and encodes it using Base62. Base62 encoding reduces the signature length.
|
|
186
180
|
|
apify/_models.py
CHANGED
|
@@ -13,10 +13,10 @@ from crawlee._utils.urls import validate_http_url
|
|
|
13
13
|
from apify._utils import docs_group
|
|
14
14
|
|
|
15
15
|
if TYPE_CHECKING:
|
|
16
|
-
from
|
|
16
|
+
from typing import TypeAlias
|
|
17
17
|
|
|
18
18
|
|
|
19
|
-
@docs_group('
|
|
19
|
+
@docs_group('Actor')
|
|
20
20
|
class Webhook(BaseModel):
|
|
21
21
|
__model_config__ = ConfigDict(populate_by_name=True)
|
|
22
22
|
|
|
@@ -35,14 +35,14 @@ class Webhook(BaseModel):
|
|
|
35
35
|
] = None
|
|
36
36
|
|
|
37
37
|
|
|
38
|
-
@docs_group('
|
|
38
|
+
@docs_group('Actor')
|
|
39
39
|
class ActorRunMeta(BaseModel):
|
|
40
40
|
__model_config__ = ConfigDict(populate_by_name=True)
|
|
41
41
|
|
|
42
42
|
origin: Annotated[MetaOrigin, Field()]
|
|
43
43
|
|
|
44
44
|
|
|
45
|
-
@docs_group('
|
|
45
|
+
@docs_group('Actor')
|
|
46
46
|
class ActorRunStats(BaseModel):
|
|
47
47
|
__model_config__ = ConfigDict(populate_by_name=True)
|
|
48
48
|
|
|
@@ -63,7 +63,7 @@ class ActorRunStats(BaseModel):
|
|
|
63
63
|
compute_units: Annotated[float, Field(alias='computeUnits')]
|
|
64
64
|
|
|
65
65
|
|
|
66
|
-
@docs_group('
|
|
66
|
+
@docs_group('Actor')
|
|
67
67
|
class ActorRunOptions(BaseModel):
|
|
68
68
|
__model_config__ = ConfigDict(populate_by_name=True)
|
|
69
69
|
|
|
@@ -74,7 +74,7 @@ class ActorRunOptions(BaseModel):
|
|
|
74
74
|
max_total_charge_usd: Annotated[Decimal | None, Field(alias='maxTotalChargeUsd')] = None
|
|
75
75
|
|
|
76
76
|
|
|
77
|
-
@docs_group('
|
|
77
|
+
@docs_group('Actor')
|
|
78
78
|
class ActorRunUsage(BaseModel):
|
|
79
79
|
__model_config__ = ConfigDict(populate_by_name=True)
|
|
80
80
|
|
|
@@ -92,7 +92,7 @@ class ActorRunUsage(BaseModel):
|
|
|
92
92
|
proxy_serps: Annotated[float | None, Field(alias='PROXY_SERPS')] = None
|
|
93
93
|
|
|
94
94
|
|
|
95
|
-
@docs_group('
|
|
95
|
+
@docs_group('Actor')
|
|
96
96
|
class ActorRun(BaseModel):
|
|
97
97
|
__model_config__ = ConfigDict(populate_by_name=True)
|
|
98
98
|
|
apify/_proxy_configuration.py
CHANGED
|
@@ -1,16 +1,17 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import ipaddress
|
|
4
|
+
import json
|
|
4
5
|
import re
|
|
5
6
|
from dataclasses import dataclass, field
|
|
6
7
|
from re import Pattern
|
|
7
8
|
from typing import TYPE_CHECKING, Any
|
|
8
9
|
from urllib.parse import urljoin, urlparse
|
|
9
10
|
|
|
10
|
-
import
|
|
11
|
+
import impit
|
|
12
|
+
from yarl import URL
|
|
11
13
|
|
|
12
14
|
from apify_shared.consts import ApifyEnvVars
|
|
13
|
-
from apify_shared.utils import ignore_docs
|
|
14
15
|
from crawlee.proxy_configuration import ProxyConfiguration as CrawleeProxyConfiguration
|
|
15
16
|
from crawlee.proxy_configuration import ProxyInfo as CrawleeProxyInfo
|
|
16
17
|
from crawlee.proxy_configuration import _NewUrlFunction
|
|
@@ -21,14 +22,14 @@ from apify.log import logger
|
|
|
21
22
|
|
|
22
23
|
if TYPE_CHECKING:
|
|
23
24
|
from apify_client import ApifyClientAsync
|
|
24
|
-
|
|
25
|
+
|
|
26
|
+
from apify import Request
|
|
25
27
|
|
|
26
28
|
APIFY_PROXY_VALUE_REGEX = re.compile(r'^[\w._~]+$')
|
|
27
29
|
COUNTRY_CODE_REGEX = re.compile(r'^[A-Z]{2}$')
|
|
28
30
|
SESSION_ID_MAX_LENGTH = 50
|
|
29
31
|
|
|
30
32
|
|
|
31
|
-
@ignore_docs
|
|
32
33
|
def is_url(url: str) -> bool:
|
|
33
34
|
"""Check if the given string is a valid URL."""
|
|
34
35
|
try:
|
|
@@ -69,7 +70,7 @@ def _check(
|
|
|
69
70
|
raise ValueError(f'{error_str} does not match pattern {pattern.pattern!r}')
|
|
70
71
|
|
|
71
72
|
|
|
72
|
-
@docs_group('
|
|
73
|
+
@docs_group('Configuration')
|
|
73
74
|
@dataclass
|
|
74
75
|
class ProxyInfo(CrawleeProxyInfo):
|
|
75
76
|
"""Provides information about a proxy connection that is used for requests."""
|
|
@@ -89,7 +90,7 @@ class ProxyInfo(CrawleeProxyInfo):
|
|
|
89
90
|
"""
|
|
90
91
|
|
|
91
92
|
|
|
92
|
-
@docs_group('
|
|
93
|
+
@docs_group('Configuration')
|
|
93
94
|
class ProxyConfiguration(CrawleeProxyConfiguration):
|
|
94
95
|
"""Configures a connection to a proxy server with the provided options.
|
|
95
96
|
|
|
@@ -104,7 +105,6 @@ class ProxyConfiguration(CrawleeProxyConfiguration):
|
|
|
104
105
|
|
|
105
106
|
_configuration: Configuration
|
|
106
107
|
|
|
107
|
-
@ignore_docs
|
|
108
108
|
def __init__(
|
|
109
109
|
self,
|
|
110
110
|
*,
|
|
@@ -233,7 +233,7 @@ class ProxyConfiguration(CrawleeProxyConfiguration):
|
|
|
233
233
|
return None
|
|
234
234
|
|
|
235
235
|
if self._uses_apify_proxy:
|
|
236
|
-
parsed_url =
|
|
236
|
+
parsed_url = URL(proxy_info.url)
|
|
237
237
|
username = self._get_username(session_id)
|
|
238
238
|
|
|
239
239
|
return ProxyInfo(
|
|
@@ -277,11 +277,11 @@ class ProxyConfiguration(CrawleeProxyConfiguration):
|
|
|
277
277
|
return
|
|
278
278
|
|
|
279
279
|
status = None
|
|
280
|
-
async with
|
|
280
|
+
async with impit.AsyncClient(proxy=proxy_info.url, timeout=10) as client:
|
|
281
281
|
for _ in range(2):
|
|
282
282
|
try:
|
|
283
283
|
response = await client.get(proxy_status_url)
|
|
284
|
-
status = response.
|
|
284
|
+
status = json.loads(response.text)
|
|
285
285
|
break
|
|
286
286
|
except Exception: # noqa: S110
|
|
287
287
|
# retry on connection errors
|
apify/_utils.py
CHANGED
|
@@ -2,8 +2,12 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import builtins
|
|
4
4
|
import sys
|
|
5
|
+
from enum import Enum
|
|
5
6
|
from importlib import metadata
|
|
6
|
-
from typing import
|
|
7
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from collections.abc import Callable
|
|
7
11
|
|
|
8
12
|
|
|
9
13
|
def get_system_info() -> dict:
|
|
@@ -27,7 +31,19 @@ def is_running_in_ipython() -> bool:
|
|
|
27
31
|
return getattr(builtins, '__IPYTHON__', False)
|
|
28
32
|
|
|
29
33
|
|
|
30
|
-
|
|
34
|
+
# The order of the rendered API groups is defined in the website/docusaurus.config.js file.
|
|
35
|
+
GroupName = Literal[
|
|
36
|
+
'Actor',
|
|
37
|
+
'Charging',
|
|
38
|
+
'Configuration',
|
|
39
|
+
'Event data',
|
|
40
|
+
'Event managers',
|
|
41
|
+
'Events',
|
|
42
|
+
'Request loaders',
|
|
43
|
+
'Storage clients',
|
|
44
|
+
'Storage data',
|
|
45
|
+
'Storages',
|
|
46
|
+
]
|
|
31
47
|
|
|
32
48
|
|
|
33
49
|
def docs_group(group_name: GroupName) -> Callable: # noqa: ARG001
|
|
@@ -66,3 +82,10 @@ def docs_name(symbol_name: str) -> Callable: # noqa: ARG001
|
|
|
66
82
|
return func
|
|
67
83
|
|
|
68
84
|
return wrapper
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def maybe_extract_enum_member_value(maybe_enum_member: Any) -> Any:
|
|
88
|
+
"""Extract the value of an enumeration member if it is an Enum, otherwise return the original value."""
|
|
89
|
+
if isinstance(maybe_enum_member, Enum):
|
|
90
|
+
return maybe_enum_member.value
|
|
91
|
+
return maybe_enum_member
|
apify/events/_apify_event_manager.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import contextlib
|
|
5
|
+
from typing import TYPE_CHECKING, Annotated
|
|
6
|
+
|
|
7
|
+
import websockets.asyncio.client
|
|
8
|
+
from pydantic import Discriminator, TypeAdapter
|
|
9
|
+
from typing_extensions import Self, Unpack, override
|
|
10
|
+
|
|
11
|
+
from crawlee.events import EventManager
|
|
12
|
+
from crawlee.events._types import Event, EventPersistStateData
|
|
13
|
+
|
|
14
|
+
from apify._utils import docs_group
|
|
15
|
+
from apify.events._types import DeprecatedEvent, EventMessage, SystemInfoEventData, UnknownEvent
|
|
16
|
+
from apify.log import logger
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from types import TracebackType
|
|
20
|
+
|
|
21
|
+
from crawlee.events._event_manager import EventManagerOptions
|
|
22
|
+
|
|
23
|
+
from apify._configuration import Configuration
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
event_data_adapter = TypeAdapter[EventMessage | DeprecatedEvent | UnknownEvent](
|
|
27
|
+
Annotated[EventMessage, Discriminator('name')] | DeprecatedEvent | UnknownEvent
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@docs_group('Event managers')
|
|
32
|
+
class ApifyEventManager(EventManager):
|
|
33
|
+
"""Event manager for the Apify platform.
|
|
34
|
+
|
|
35
|
+
This class extends Crawlee's `EventManager` to provide Apify-specific functionality, including websocket
|
|
36
|
+
connectivity to the Apify platform for receiving platform events.
|
|
37
|
+
|
|
38
|
+
The event manager handles:
|
|
39
|
+
- Registration and emission of events and their listeners.
|
|
40
|
+
- Websocket connection to Apify platform events.
|
|
41
|
+
- Processing and validation of platform messages.
|
|
42
|
+
- Automatic event forwarding from the platform to local event listeners.
|
|
43
|
+
|
|
44
|
+
This class should not be used directly. Use the `Actor.on` and `Actor.off` methods to interact
|
|
45
|
+
with the event system.
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
def __init__(self, configuration: Configuration, **kwargs: Unpack[EventManagerOptions]) -> None:
|
|
49
|
+
"""Initialize a new instance.
|
|
50
|
+
|
|
51
|
+
Args:
|
|
52
|
+
configuration: The Actor configuration for the event manager.
|
|
53
|
+
**kwargs: Additional event manager options passed to the parent class.
|
|
54
|
+
"""
|
|
55
|
+
super().__init__(**kwargs)
|
|
56
|
+
|
|
57
|
+
self._configuration = configuration
|
|
58
|
+
"""The Actor configuration for the event manager."""
|
|
59
|
+
|
|
60
|
+
self._platform_events_websocket: websockets.asyncio.client.ClientConnection | None = None
|
|
61
|
+
"""WebSocket connection to the platform events."""
|
|
62
|
+
|
|
63
|
+
self._process_platform_messages_task: asyncio.Task | None = None
|
|
64
|
+
"""Task for processing messages from the platform websocket."""
|
|
65
|
+
|
|
66
|
+
self._connected_to_platform_websocket: asyncio.Future[bool] | None = None
|
|
67
|
+
"""Future that resolves when the connection to the platform websocket is established."""
|
|
68
|
+
|
|
69
|
+
@override
|
|
70
|
+
async def __aenter__(self) -> Self:
|
|
71
|
+
await super().__aenter__()
|
|
72
|
+
self._connected_to_platform_websocket = asyncio.Future()
|
|
73
|
+
|
|
74
|
+
# Run tasks but don't await them
|
|
75
|
+
if self._configuration.actor_events_ws_url:
|
|
76
|
+
self._process_platform_messages_task = asyncio.create_task(
|
|
77
|
+
self._process_platform_messages(self._configuration.actor_events_ws_url)
|
|
78
|
+
)
|
|
79
|
+
is_connected = await self._connected_to_platform_websocket
|
|
80
|
+
if not is_connected:
|
|
81
|
+
raise RuntimeError('Error connecting to platform events websocket!')
|
|
82
|
+
else:
|
|
83
|
+
logger.debug('APIFY_ACTOR_EVENTS_WS_URL env var not set, no events from Apify platform will be emitted.')
|
|
84
|
+
|
|
85
|
+
return self
|
|
86
|
+
|
|
87
|
+
@override
|
|
88
|
+
async def __aexit__(
|
|
89
|
+
self,
|
|
90
|
+
exc_type: type[BaseException] | None,
|
|
91
|
+
exc_value: BaseException | None,
|
|
92
|
+
exc_traceback: TracebackType | None,
|
|
93
|
+
) -> None:
|
|
94
|
+
if self._platform_events_websocket:
|
|
95
|
+
await self._platform_events_websocket.close()
|
|
96
|
+
|
|
97
|
+
if self._process_platform_messages_task and not self._process_platform_messages_task.done():
|
|
98
|
+
self._process_platform_messages_task.cancel()
|
|
99
|
+
with contextlib.suppress(asyncio.CancelledError):
|
|
100
|
+
await self._process_platform_messages_task
|
|
101
|
+
|
|
102
|
+
await super().__aexit__(exc_type, exc_value, exc_traceback)
|
|
103
|
+
|
|
104
|
+
async def _process_platform_messages(self, ws_url: str) -> None:
|
|
105
|
+
try:
|
|
106
|
+
async with websockets.asyncio.client.connect(ws_url) as websocket:
|
|
107
|
+
self._platform_events_websocket = websocket
|
|
108
|
+
if self._connected_to_platform_websocket is not None:
|
|
109
|
+
self._connected_to_platform_websocket.set_result(True)
|
|
110
|
+
|
|
111
|
+
async for message in websocket:
|
|
112
|
+
try:
|
|
113
|
+
parsed_message = event_data_adapter.validate_json(message)
|
|
114
|
+
|
|
115
|
+
if isinstance(parsed_message, DeprecatedEvent):
|
|
116
|
+
continue
|
|
117
|
+
|
|
118
|
+
if isinstance(parsed_message, UnknownEvent):
|
|
119
|
+
logger.info(
|
|
120
|
+
f'Unknown message received: event_name={parsed_message.name}, '
|
|
121
|
+
f'event_data={parsed_message.data}'
|
|
122
|
+
)
|
|
123
|
+
continue
|
|
124
|
+
|
|
125
|
+
self.emit(
|
|
126
|
+
event=parsed_message.name,
|
|
127
|
+
event_data=parsed_message.data
|
|
128
|
+
if not isinstance(parsed_message.data, SystemInfoEventData)
|
|
129
|
+
else parsed_message.data.to_crawlee_format(self._configuration.dedicated_cpus or 1),
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
if parsed_message.name == Event.MIGRATING:
|
|
133
|
+
await self._emit_persist_state_event_rec_task.stop()
|
|
134
|
+
self.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=True))
|
|
135
|
+
except Exception:
|
|
136
|
+
logger.exception('Cannot parse Actor event', extra={'message': message})
|
|
137
|
+
except Exception:
|
|
138
|
+
logger.exception('Error in websocket connection')
|
|
139
|
+
if self._connected_to_platform_websocket is not None:
|
|
140
|
+
self._connected_to_platform_websocket.set_result(False)
|
apify/events/_types.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Annotated, Any, Literal
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
|
+
from crawlee.events._types import (
|
|
9
|
+
Event,
|
|
10
|
+
EventAbortingData,
|
|
11
|
+
EventExitData,
|
|
12
|
+
EventMigratingData,
|
|
13
|
+
EventPersistStateData,
|
|
14
|
+
EventSystemInfoData,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
from apify._utils import docs_group
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@docs_group('Event data')
|
|
21
|
+
class SystemInfoEventData(BaseModel):
|
|
22
|
+
mem_avg_bytes: Annotated[float, Field(alias='memAvgBytes')]
|
|
23
|
+
mem_current_bytes: Annotated[float, Field(alias='memCurrentBytes')]
|
|
24
|
+
mem_max_bytes: Annotated[float, Field(alias='memMaxBytes')]
|
|
25
|
+
cpu_avg_usage: Annotated[float, Field(alias='cpuAvgUsage')]
|
|
26
|
+
cpu_max_usage: Annotated[float, Field(alias='cpuMaxUsage')]
|
|
27
|
+
cpu_current_usage: Annotated[float, Field(alias='cpuCurrentUsage')]
|
|
28
|
+
is_cpu_overloaded: Annotated[bool, Field(alias='isCpuOverloaded')]
|
|
29
|
+
created_at: Annotated[datetime, Field(alias='createdAt')]
|
|
30
|
+
|
|
31
|
+
def to_crawlee_format(self, dedicated_cpus: float) -> EventSystemInfoData:
|
|
32
|
+
return EventSystemInfoData.model_validate(
|
|
33
|
+
{
|
|
34
|
+
'cpu_info': {
|
|
35
|
+
'used_ratio': (self.cpu_current_usage / 100) / dedicated_cpus,
|
|
36
|
+
'created_at': self.created_at,
|
|
37
|
+
},
|
|
38
|
+
'memory_info': {
|
|
39
|
+
'total_size': self.mem_max_bytes,
|
|
40
|
+
'current_size': self.mem_current_bytes,
|
|
41
|
+
'created_at': self.created_at,
|
|
42
|
+
},
|
|
43
|
+
}
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@docs_group('Events')
|
|
48
|
+
class PersistStateEvent(BaseModel):
|
|
49
|
+
name: Literal[Event.PERSIST_STATE]
|
|
50
|
+
data: Annotated[EventPersistStateData, Field(default_factory=lambda: EventPersistStateData(is_migrating=False))]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@docs_group('Events')
|
|
54
|
+
class SystemInfoEvent(BaseModel):
|
|
55
|
+
name: Literal[Event.SYSTEM_INFO]
|
|
56
|
+
data: SystemInfoEventData
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@docs_group('Events')
|
|
60
|
+
class MigratingEvent(BaseModel):
|
|
61
|
+
name: Literal[Event.MIGRATING]
|
|
62
|
+
data: Annotated[EventMigratingData, Field(default_factory=EventMigratingData)]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@docs_group('Events')
|
|
66
|
+
class AbortingEvent(BaseModel):
|
|
67
|
+
name: Literal[Event.ABORTING]
|
|
68
|
+
data: Annotated[EventAbortingData, Field(default_factory=EventAbortingData)]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@docs_group('Events')
|
|
72
|
+
class ExitEvent(BaseModel):
|
|
73
|
+
name: Literal[Event.EXIT]
|
|
74
|
+
data: Annotated[EventExitData, Field(default_factory=EventExitData)]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@docs_group('Events')
|
|
78
|
+
class EventWithoutData(BaseModel):
|
|
79
|
+
name: Literal[
|
|
80
|
+
Event.SESSION_RETIRED,
|
|
81
|
+
Event.BROWSER_LAUNCHED,
|
|
82
|
+
Event.BROWSER_RETIRED,
|
|
83
|
+
Event.BROWSER_CLOSED,
|
|
84
|
+
Event.PAGE_CREATED,
|
|
85
|
+
Event.PAGE_CLOSED,
|
|
86
|
+
]
|
|
87
|
+
data: Any = None
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@docs_group('Events')
|
|
91
|
+
class DeprecatedEvent(BaseModel):
|
|
92
|
+
name: Literal['cpuInfo']
|
|
93
|
+
data: Annotated[dict[str, Any], Field(default_factory=dict)]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@docs_group('Events')
|
|
97
|
+
class UnknownEvent(BaseModel):
|
|
98
|
+
name: str
|
|
99
|
+
data: Annotated[dict[str, Any], Field(default_factory=dict)]
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
EventMessage = PersistStateEvent | SystemInfoEvent | MigratingEvent | AbortingEvent | ExitEvent | EventWithoutData
|