apify 2.7.3__py3-none-any.whl → 3.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of apify might be problematic. Click here for more details.
- apify/_actor.py +47 -12
- apify/_charging.py +15 -9
- apify/_configuration.py +34 -1
- apify/_crypto.py +0 -6
- apify/_models.py +7 -7
- apify/_proxy_configuration.py +10 -10
- apify/_utils.py +25 -2
- apify/events/__init__.py +5 -0
- apify/events/_apify_event_manager.py +140 -0
- apify/events/_types.py +102 -0
- apify/log.py +0 -9
- apify/request_loaders/__init__.py +18 -0
- apify/{storages/_request_list.py → request_loaders/_apify_request_list.py} +25 -18
- apify/request_loaders/py.typed +0 -0
- apify/scrapy/_logging_config.py +1 -4
- apify/scrapy/extensions/_httpcache.py +9 -5
- apify/scrapy/requests.py +3 -3
- apify/scrapy/scheduler.py +8 -5
- apify/storage_clients/__init__.py +10 -0
- apify/storage_clients/_apify/__init__.py +11 -0
- apify/storage_clients/_apify/_dataset_client.py +304 -0
- apify/storage_clients/_apify/_key_value_store_client.py +241 -0
- apify/storage_clients/_apify/_models.py +107 -0
- apify/storage_clients/_apify/_request_queue_client.py +787 -0
- apify/storage_clients/_apify/_storage_client.py +80 -0
- apify/storage_clients/_apify/py.typed +0 -0
- apify/storage_clients/_file_system/__init__.py +2 -0
- apify/storage_clients/_file_system/_key_value_store_client.py +36 -0
- apify/storage_clients/_file_system/_storage_client.py +35 -0
- apify/storage_clients/py.typed +0 -0
- apify/storages/__init__.py +1 -3
- {apify-2.7.3.dist-info → apify-3.0.0rc1.dist-info}/METADATA +8 -7
- apify-3.0.0rc1.dist-info/RECORD +52 -0
- apify/_platform_event_manager.py +0 -231
- apify/apify_storage_client/__init__.py +0 -3
- apify/apify_storage_client/_apify_storage_client.py +0 -72
- apify/apify_storage_client/_dataset_client.py +0 -190
- apify/apify_storage_client/_dataset_collection_client.py +0 -51
- apify/apify_storage_client/_key_value_store_client.py +0 -109
- apify/apify_storage_client/_key_value_store_collection_client.py +0 -51
- apify/apify_storage_client/_request_queue_client.py +0 -176
- apify/apify_storage_client/_request_queue_collection_client.py +0 -51
- apify-2.7.3.dist-info/RECORD +0 -44
- /apify/{apify_storage_client → events}/py.typed +0 -0
- {apify-2.7.3.dist-info → apify-3.0.0rc1.dist-info}/WHEEL +0 -0
- {apify-2.7.3.dist-info → apify-3.0.0rc1.dist-info}/licenses/LICENSE +0 -0
apify/_actor.py
CHANGED
|
@@ -5,7 +5,7 @@ import os
|
|
|
5
5
|
import sys
|
|
6
6
|
from contextlib import suppress
|
|
7
7
|
from datetime import datetime, timedelta, timezone
|
|
8
|
-
from typing import TYPE_CHECKING, Any,
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Literal, TypeVar, cast, overload
|
|
9
9
|
|
|
10
10
|
from lazy_object_proxy import Proxy
|
|
11
11
|
from more_itertools import flatten
|
|
@@ -13,7 +13,6 @@ from pydantic import AliasChoices
|
|
|
13
13
|
|
|
14
14
|
from apify_client import ApifyClientAsync
|
|
15
15
|
from apify_shared.consts import ActorEnvVars, ActorExitCodes, ApifyEnvVars
|
|
16
|
-
from apify_shared.utils import ignore_docs, maybe_extract_enum_member_value
|
|
17
16
|
from crawlee import service_locator
|
|
18
17
|
from crawlee.events import (
|
|
19
18
|
Event,
|
|
@@ -30,15 +29,16 @@ from apify._configuration import Configuration
|
|
|
30
29
|
from apify._consts import EVENT_LISTENERS_TIMEOUT
|
|
31
30
|
from apify._crypto import decrypt_input_secrets, load_private_key
|
|
32
31
|
from apify._models import ActorRun
|
|
33
|
-
from apify._platform_event_manager import EventManager, LocalEventManager, PlatformEventManager
|
|
34
32
|
from apify._proxy_configuration import ProxyConfiguration
|
|
35
|
-
from apify._utils import docs_group, docs_name, get_system_info, is_running_in_ipython
|
|
36
|
-
from apify.
|
|
33
|
+
from apify._utils import docs_group, docs_name, get_system_info, is_running_in_ipython, maybe_extract_enum_member_value
|
|
34
|
+
from apify.events import ApifyEventManager, EventManager, LocalEventManager
|
|
37
35
|
from apify.log import _configure_logging, logger
|
|
36
|
+
from apify.storage_clients import ApifyStorageClient
|
|
38
37
|
from apify.storages import Dataset, KeyValueStore, RequestQueue
|
|
39
38
|
|
|
40
39
|
if TYPE_CHECKING:
|
|
41
40
|
import logging
|
|
41
|
+
from collections.abc import Callable
|
|
42
42
|
from types import TracebackType
|
|
43
43
|
|
|
44
44
|
from typing_extensions import Self
|
|
@@ -53,9 +53,46 @@ MainReturnType = TypeVar('MainReturnType')
|
|
|
53
53
|
|
|
54
54
|
|
|
55
55
|
@docs_name('Actor')
|
|
56
|
-
@docs_group('
|
|
56
|
+
@docs_group('Actor')
|
|
57
57
|
class _ActorType:
|
|
58
|
-
"""The class
|
|
58
|
+
"""The core class for building Actors on the Apify platform.
|
|
59
|
+
|
|
60
|
+
Actors are serverless programs running in the cloud that can perform anything from simple actions
|
|
61
|
+
(such as filling out a web form or sending an email) to complex operations (such as crawling an
|
|
62
|
+
entire website or removing duplicates from a large dataset). They are packaged as Docker containers
|
|
63
|
+
which accept well-defined JSON input, perform an action, and optionally produce well-defined output.
|
|
64
|
+
|
|
65
|
+
### References
|
|
66
|
+
|
|
67
|
+
- Apify platform documentation: https://docs.apify.com/platform/actors
|
|
68
|
+
- Actor whitepaper: https://whitepaper.actor/
|
|
69
|
+
|
|
70
|
+
### Usage
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
import asyncio
|
|
74
|
+
|
|
75
|
+
import httpx
|
|
76
|
+
from apify import Actor
|
|
77
|
+
from bs4 import BeautifulSoup
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
async def main() -> None:
|
|
81
|
+
async with Actor:
|
|
82
|
+
actor_input = await Actor.get_input()
|
|
83
|
+
async with httpx.AsyncClient() as client:
|
|
84
|
+
response = await client.get(actor_input['url'])
|
|
85
|
+
soup = BeautifulSoup(response.content, 'html.parser')
|
|
86
|
+
data = {
|
|
87
|
+
'url': actor_input['url'],
|
|
88
|
+
'title': soup.title.string if soup.title else None,
|
|
89
|
+
}
|
|
90
|
+
await Actor.push_data(data)
|
|
91
|
+
|
|
92
|
+
if __name__ == '__main__':
|
|
93
|
+
asyncio.run(main())
|
|
94
|
+
```
|
|
95
|
+
"""
|
|
59
96
|
|
|
60
97
|
_is_rebooting = False
|
|
61
98
|
_is_any_instance_initialized = False
|
|
@@ -88,12 +125,12 @@ class _ActorType:
|
|
|
88
125
|
|
|
89
126
|
# Create an instance of the cloud storage client, the local storage client is obtained
|
|
90
127
|
# from the service locator.
|
|
91
|
-
self._cloud_storage_client = ApifyStorageClient
|
|
128
|
+
self._cloud_storage_client = ApifyStorageClient()
|
|
92
129
|
|
|
93
130
|
# Set the event manager based on whether the Actor is running on the platform or locally.
|
|
94
131
|
self._event_manager = (
|
|
95
|
-
|
|
96
|
-
|
|
132
|
+
ApifyEventManager(
|
|
133
|
+
configuration=self._configuration,
|
|
97
134
|
persist_state_interval=self._configuration.persist_state_interval,
|
|
98
135
|
)
|
|
99
136
|
if self.is_at_home()
|
|
@@ -107,7 +144,6 @@ class _ActorType:
|
|
|
107
144
|
|
|
108
145
|
self._is_initialized = False
|
|
109
146
|
|
|
110
|
-
@ignore_docs
|
|
111
147
|
async def __aenter__(self) -> Self:
|
|
112
148
|
"""Initialize the Actor.
|
|
113
149
|
|
|
@@ -119,7 +155,6 @@ class _ActorType:
|
|
|
119
155
|
await self.init()
|
|
120
156
|
return self
|
|
121
157
|
|
|
122
|
-
@ignore_docs
|
|
123
158
|
async def __aexit__(
|
|
124
159
|
self,
|
|
125
160
|
_exc_type: type[BaseException] | None,
|
apify/_charging.py
CHANGED
|
@@ -4,11 +4,10 @@ import math
|
|
|
4
4
|
from dataclasses import dataclass
|
|
5
5
|
from datetime import datetime, timezone
|
|
6
6
|
from decimal import Decimal
|
|
7
|
-
from typing import TYPE_CHECKING, Protocol
|
|
7
|
+
from typing import TYPE_CHECKING, Protocol
|
|
8
8
|
|
|
9
9
|
from pydantic import TypeAdapter
|
|
10
10
|
|
|
11
|
-
from apify_shared.utils import ignore_docs
|
|
12
11
|
from crawlee._utils.context import ensure_context
|
|
13
12
|
|
|
14
13
|
from apify._models import ActorRun, PricingModel
|
|
@@ -23,13 +22,21 @@ if TYPE_CHECKING:
|
|
|
23
22
|
|
|
24
23
|
from apify._configuration import Configuration
|
|
25
24
|
|
|
25
|
+
run_validator = TypeAdapter[ActorRun | None](ActorRun | None)
|
|
26
26
|
|
|
27
|
-
run_validator: TypeAdapter[ActorRun | None] = TypeAdapter(Union[ActorRun, None])
|
|
28
27
|
|
|
29
|
-
|
|
30
|
-
@docs_group('Interfaces')
|
|
28
|
+
@docs_group('Charging')
|
|
31
29
|
class ChargingManager(Protocol):
|
|
32
|
-
"""Provides fine-grained access to pay-per-event functionality.
|
|
30
|
+
"""Provides fine-grained access to pay-per-event functionality.
|
|
31
|
+
|
|
32
|
+
The ChargingManager allows you to charge for specific events in your Actor when using
|
|
33
|
+
the pay-per-event pricing model. This enables precise cost control and transparent
|
|
34
|
+
billing for different operations within your Actor.
|
|
35
|
+
|
|
36
|
+
### References
|
|
37
|
+
|
|
38
|
+
- Apify platform documentation: https://docs.apify.com/platform/actors/publishing/monetize
|
|
39
|
+
"""
|
|
33
40
|
|
|
34
41
|
async def charge(self, event_name: str, count: int = 1) -> ChargeResult:
|
|
35
42
|
"""Charge for a specified number of events - sub-operations of the Actor.
|
|
@@ -58,7 +65,7 @@ class ChargingManager(Protocol):
|
|
|
58
65
|
"""
|
|
59
66
|
|
|
60
67
|
|
|
61
|
-
@docs_group('
|
|
68
|
+
@docs_group('Charging')
|
|
62
69
|
@dataclass(frozen=True)
|
|
63
70
|
class ChargeResult:
|
|
64
71
|
"""Result of the `ChargingManager.charge` method."""
|
|
@@ -73,7 +80,7 @@ class ChargeResult:
|
|
|
73
80
|
"""How many events of each known type can still be charged within the limit."""
|
|
74
81
|
|
|
75
82
|
|
|
76
|
-
@docs_group('
|
|
83
|
+
@docs_group('Charging')
|
|
77
84
|
@dataclass
|
|
78
85
|
class ActorPricingInfo:
|
|
79
86
|
"""Result of the `ChargingManager.get_pricing_info` method."""
|
|
@@ -91,7 +98,6 @@ class ActorPricingInfo:
|
|
|
91
98
|
"""Price of every known event type."""
|
|
92
99
|
|
|
93
100
|
|
|
94
|
-
@ignore_docs
|
|
95
101
|
class ChargingManagerImplementation(ChargingManager):
|
|
96
102
|
"""Implementation of the `ChargingManager` Protocol - this is only meant to be instantiated internally."""
|
|
97
103
|
|
apify/_configuration.py
CHANGED
|
@@ -25,7 +25,7 @@ def _transform_to_list(value: Any) -> list[str] | None:
|
|
|
25
25
|
return value if isinstance(value, list) else str(value).split(',')
|
|
26
26
|
|
|
27
27
|
|
|
28
|
-
@docs_group('
|
|
28
|
+
@docs_group('Configuration')
|
|
29
29
|
class Configuration(CrawleeConfiguration):
|
|
30
30
|
"""A class for specifying the configuration of an Actor.
|
|
31
31
|
|
|
@@ -140,6 +140,39 @@ class Configuration(CrawleeConfiguration):
|
|
|
140
140
|
),
|
|
141
141
|
] = None
|
|
142
142
|
|
|
143
|
+
default_dataset_id: Annotated[
|
|
144
|
+
str,
|
|
145
|
+
Field(
|
|
146
|
+
validation_alias=AliasChoices(
|
|
147
|
+
'actor_default_dataset_id',
|
|
148
|
+
'apify_default_dataset_id',
|
|
149
|
+
),
|
|
150
|
+
description='Default dataset ID used by the Apify storage client when no ID or name is provided.',
|
|
151
|
+
),
|
|
152
|
+
] = 'default'
|
|
153
|
+
|
|
154
|
+
default_key_value_store_id: Annotated[
|
|
155
|
+
str,
|
|
156
|
+
Field(
|
|
157
|
+
validation_alias=AliasChoices(
|
|
158
|
+
'actor_default_key_value_store_id',
|
|
159
|
+
'apify_default_key_value_store_id',
|
|
160
|
+
),
|
|
161
|
+
description='Default key-value store ID for the Apify storage client when no ID or name is provided.',
|
|
162
|
+
),
|
|
163
|
+
] = 'default'
|
|
164
|
+
|
|
165
|
+
default_request_queue_id: Annotated[
|
|
166
|
+
str,
|
|
167
|
+
Field(
|
|
168
|
+
validation_alias=AliasChoices(
|
|
169
|
+
'actor_default_request_queue_id',
|
|
170
|
+
'apify_default_request_queue_id',
|
|
171
|
+
),
|
|
172
|
+
description='Default request queue ID for the Apify storage client when no ID or name is provided.',
|
|
173
|
+
),
|
|
174
|
+
] = 'default'
|
|
175
|
+
|
|
143
176
|
disable_outdated_warning: Annotated[
|
|
144
177
|
bool,
|
|
145
178
|
Field(
|
apify/_crypto.py
CHANGED
|
@@ -12,7 +12,6 @@ from cryptography.hazmat.primitives import hashes, serialization
|
|
|
12
12
|
from cryptography.hazmat.primitives.asymmetric import padding, rsa
|
|
13
13
|
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
|
|
14
14
|
|
|
15
|
-
from apify_shared.utils import ignore_docs
|
|
16
15
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
17
16
|
|
|
18
17
|
from apify._consts import ENCRYPTED_INPUT_VALUE_REGEXP, ENCRYPTED_JSON_VALUE_PREFIX, ENCRYPTED_STRING_VALUE_PREFIX
|
|
@@ -22,7 +21,6 @@ ENCRYPTION_IV_LENGTH = 16
|
|
|
22
21
|
ENCRYPTION_AUTH_TAG_LENGTH = 16
|
|
23
22
|
|
|
24
23
|
|
|
25
|
-
@ignore_docs
|
|
26
24
|
def public_encrypt(value: str, *, public_key: rsa.RSAPublicKey) -> dict:
|
|
27
25
|
"""Encrypts the given value using AES cipher and the password for encryption using the public key.
|
|
28
26
|
|
|
@@ -66,7 +64,6 @@ def public_encrypt(value: str, *, public_key: rsa.RSAPublicKey) -> dict:
|
|
|
66
64
|
}
|
|
67
65
|
|
|
68
66
|
|
|
69
|
-
@ignore_docs
|
|
70
67
|
def private_decrypt(
|
|
71
68
|
encrypted_password: str,
|
|
72
69
|
encrypted_value: str,
|
|
@@ -118,7 +115,6 @@ def private_decrypt(
|
|
|
118
115
|
return decipher_bytes.decode('utf-8')
|
|
119
116
|
|
|
120
117
|
|
|
121
|
-
@ignore_docs
|
|
122
118
|
def load_private_key(private_key_file_base64: str, private_key_password: str) -> rsa.RSAPrivateKey:
|
|
123
119
|
private_key = serialization.load_pem_private_key(
|
|
124
120
|
base64.b64decode(private_key_file_base64.encode('utf-8')),
|
|
@@ -138,7 +134,6 @@ def _load_public_key(public_key_file_base64: str) -> rsa.RSAPublicKey:
|
|
|
138
134
|
return public_key
|
|
139
135
|
|
|
140
136
|
|
|
141
|
-
@ignore_docs
|
|
142
137
|
def decrypt_input_secrets(private_key: rsa.RSAPrivateKey, input_data: Any) -> Any:
|
|
143
138
|
"""Decrypt input secrets."""
|
|
144
139
|
if not isinstance(input_data, dict):
|
|
@@ -180,7 +175,6 @@ def encode_base62(num: int) -> str:
|
|
|
180
175
|
return res
|
|
181
176
|
|
|
182
177
|
|
|
183
|
-
@ignore_docs
|
|
184
178
|
def create_hmac_signature(secret_key: str, message: str) -> str:
|
|
185
179
|
"""Generate an HMAC signature and encodes it using Base62. Base62 encoding reduces the signature length.
|
|
186
180
|
|
apify/_models.py
CHANGED
|
@@ -13,10 +13,10 @@ from crawlee._utils.urls import validate_http_url
|
|
|
13
13
|
from apify._utils import docs_group
|
|
14
14
|
|
|
15
15
|
if TYPE_CHECKING:
|
|
16
|
-
from
|
|
16
|
+
from typing import TypeAlias
|
|
17
17
|
|
|
18
18
|
|
|
19
|
-
@docs_group('
|
|
19
|
+
@docs_group('Actor')
|
|
20
20
|
class Webhook(BaseModel):
|
|
21
21
|
__model_config__ = ConfigDict(populate_by_name=True)
|
|
22
22
|
|
|
@@ -35,14 +35,14 @@ class Webhook(BaseModel):
|
|
|
35
35
|
] = None
|
|
36
36
|
|
|
37
37
|
|
|
38
|
-
@docs_group('
|
|
38
|
+
@docs_group('Actor')
|
|
39
39
|
class ActorRunMeta(BaseModel):
|
|
40
40
|
__model_config__ = ConfigDict(populate_by_name=True)
|
|
41
41
|
|
|
42
42
|
origin: Annotated[MetaOrigin, Field()]
|
|
43
43
|
|
|
44
44
|
|
|
45
|
-
@docs_group('
|
|
45
|
+
@docs_group('Actor')
|
|
46
46
|
class ActorRunStats(BaseModel):
|
|
47
47
|
__model_config__ = ConfigDict(populate_by_name=True)
|
|
48
48
|
|
|
@@ -63,7 +63,7 @@ class ActorRunStats(BaseModel):
|
|
|
63
63
|
compute_units: Annotated[float, Field(alias='computeUnits')]
|
|
64
64
|
|
|
65
65
|
|
|
66
|
-
@docs_group('
|
|
66
|
+
@docs_group('Actor')
|
|
67
67
|
class ActorRunOptions(BaseModel):
|
|
68
68
|
__model_config__ = ConfigDict(populate_by_name=True)
|
|
69
69
|
|
|
@@ -74,7 +74,7 @@ class ActorRunOptions(BaseModel):
|
|
|
74
74
|
max_total_charge_usd: Annotated[Decimal | None, Field(alias='maxTotalChargeUsd')] = None
|
|
75
75
|
|
|
76
76
|
|
|
77
|
-
@docs_group('
|
|
77
|
+
@docs_group('Actor')
|
|
78
78
|
class ActorRunUsage(BaseModel):
|
|
79
79
|
__model_config__ = ConfigDict(populate_by_name=True)
|
|
80
80
|
|
|
@@ -92,7 +92,7 @@ class ActorRunUsage(BaseModel):
|
|
|
92
92
|
proxy_serps: Annotated[float | None, Field(alias='PROXY_SERPS')] = None
|
|
93
93
|
|
|
94
94
|
|
|
95
|
-
@docs_group('
|
|
95
|
+
@docs_group('Actor')
|
|
96
96
|
class ActorRun(BaseModel):
|
|
97
97
|
__model_config__ = ConfigDict(populate_by_name=True)
|
|
98
98
|
|
apify/_proxy_configuration.py
CHANGED
|
@@ -1,16 +1,17 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import ipaddress
|
|
4
|
+
import json
|
|
4
5
|
import re
|
|
5
6
|
from dataclasses import dataclass, field
|
|
6
7
|
from re import Pattern
|
|
7
8
|
from typing import TYPE_CHECKING, Any
|
|
8
9
|
from urllib.parse import urljoin, urlparse
|
|
9
10
|
|
|
10
|
-
import
|
|
11
|
+
import impit
|
|
12
|
+
from yarl import URL
|
|
11
13
|
|
|
12
14
|
from apify_shared.consts import ApifyEnvVars
|
|
13
|
-
from apify_shared.utils import ignore_docs
|
|
14
15
|
from crawlee.proxy_configuration import ProxyConfiguration as CrawleeProxyConfiguration
|
|
15
16
|
from crawlee.proxy_configuration import ProxyInfo as CrawleeProxyInfo
|
|
16
17
|
from crawlee.proxy_configuration import _NewUrlFunction
|
|
@@ -21,14 +22,14 @@ from apify.log import logger
|
|
|
21
22
|
|
|
22
23
|
if TYPE_CHECKING:
|
|
23
24
|
from apify_client import ApifyClientAsync
|
|
24
|
-
|
|
25
|
+
|
|
26
|
+
from apify import Request
|
|
25
27
|
|
|
26
28
|
APIFY_PROXY_VALUE_REGEX = re.compile(r'^[\w._~]+$')
|
|
27
29
|
COUNTRY_CODE_REGEX = re.compile(r'^[A-Z]{2}$')
|
|
28
30
|
SESSION_ID_MAX_LENGTH = 50
|
|
29
31
|
|
|
30
32
|
|
|
31
|
-
@ignore_docs
|
|
32
33
|
def is_url(url: str) -> bool:
|
|
33
34
|
"""Check if the given string is a valid URL."""
|
|
34
35
|
try:
|
|
@@ -69,7 +70,7 @@ def _check(
|
|
|
69
70
|
raise ValueError(f'{error_str} does not match pattern {pattern.pattern!r}')
|
|
70
71
|
|
|
71
72
|
|
|
72
|
-
@docs_group('
|
|
73
|
+
@docs_group('Configuration')
|
|
73
74
|
@dataclass
|
|
74
75
|
class ProxyInfo(CrawleeProxyInfo):
|
|
75
76
|
"""Provides information about a proxy connection that is used for requests."""
|
|
@@ -89,7 +90,7 @@ class ProxyInfo(CrawleeProxyInfo):
|
|
|
89
90
|
"""
|
|
90
91
|
|
|
91
92
|
|
|
92
|
-
@docs_group('
|
|
93
|
+
@docs_group('Configuration')
|
|
93
94
|
class ProxyConfiguration(CrawleeProxyConfiguration):
|
|
94
95
|
"""Configures a connection to a proxy server with the provided options.
|
|
95
96
|
|
|
@@ -104,7 +105,6 @@ class ProxyConfiguration(CrawleeProxyConfiguration):
|
|
|
104
105
|
|
|
105
106
|
_configuration: Configuration
|
|
106
107
|
|
|
107
|
-
@ignore_docs
|
|
108
108
|
def __init__(
|
|
109
109
|
self,
|
|
110
110
|
*,
|
|
@@ -233,7 +233,7 @@ class ProxyConfiguration(CrawleeProxyConfiguration):
|
|
|
233
233
|
return None
|
|
234
234
|
|
|
235
235
|
if self._uses_apify_proxy:
|
|
236
|
-
parsed_url =
|
|
236
|
+
parsed_url = URL(proxy_info.url)
|
|
237
237
|
username = self._get_username(session_id)
|
|
238
238
|
|
|
239
239
|
return ProxyInfo(
|
|
@@ -277,11 +277,11 @@ class ProxyConfiguration(CrawleeProxyConfiguration):
|
|
|
277
277
|
return
|
|
278
278
|
|
|
279
279
|
status = None
|
|
280
|
-
async with
|
|
280
|
+
async with impit.AsyncClient(proxy=proxy_info.url, timeout=10) as client:
|
|
281
281
|
for _ in range(2):
|
|
282
282
|
try:
|
|
283
283
|
response = await client.get(proxy_status_url)
|
|
284
|
-
status = response.
|
|
284
|
+
status = json.loads(response.text)
|
|
285
285
|
break
|
|
286
286
|
except Exception: # noqa: S110
|
|
287
287
|
# retry on connection errors
|
apify/_utils.py
CHANGED
|
@@ -2,8 +2,12 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import builtins
|
|
4
4
|
import sys
|
|
5
|
+
from enum import Enum
|
|
5
6
|
from importlib import metadata
|
|
6
|
-
from typing import
|
|
7
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from collections.abc import Callable
|
|
7
11
|
|
|
8
12
|
|
|
9
13
|
def get_system_info() -> dict:
|
|
@@ -27,7 +31,19 @@ def is_running_in_ipython() -> bool:
|
|
|
27
31
|
return getattr(builtins, '__IPYTHON__', False)
|
|
28
32
|
|
|
29
33
|
|
|
30
|
-
|
|
34
|
+
# The order of the rendered API groups is defined in the website/docusaurus.config.js file.
|
|
35
|
+
GroupName = Literal[
|
|
36
|
+
'Actor',
|
|
37
|
+
'Charging',
|
|
38
|
+
'Configuration',
|
|
39
|
+
'Event data',
|
|
40
|
+
'Event managers',
|
|
41
|
+
'Events',
|
|
42
|
+
'Request loaders',
|
|
43
|
+
'Storage clients',
|
|
44
|
+
'Storage data',
|
|
45
|
+
'Storages',
|
|
46
|
+
]
|
|
31
47
|
|
|
32
48
|
|
|
33
49
|
def docs_group(group_name: GroupName) -> Callable: # noqa: ARG001
|
|
@@ -66,3 +82,10 @@ def docs_name(symbol_name: str) -> Callable: # noqa: ARG001
|
|
|
66
82
|
return func
|
|
67
83
|
|
|
68
84
|
return wrapper
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def maybe_extract_enum_member_value(maybe_enum_member: Any) -> Any:
|
|
88
|
+
"""Extract the value of an enumeration member if it is an Enum, otherwise return the original value."""
|
|
89
|
+
if isinstance(maybe_enum_member, Enum):
|
|
90
|
+
return maybe_enum_member.value
|
|
91
|
+
return maybe_enum_member
|
apify/events/_apify_event_manager.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import contextlib
|
|
5
|
+
from typing import TYPE_CHECKING, Annotated
|
|
6
|
+
|
|
7
|
+
import websockets.asyncio.client
|
|
8
|
+
from pydantic import Discriminator, TypeAdapter
|
|
9
|
+
from typing_extensions import Self, Unpack, override
|
|
10
|
+
|
|
11
|
+
from crawlee.events import EventManager
|
|
12
|
+
from crawlee.events._types import Event, EventPersistStateData
|
|
13
|
+
|
|
14
|
+
from apify._utils import docs_group
|
|
15
|
+
from apify.events._types import DeprecatedEvent, EventMessage, SystemInfoEventData, UnknownEvent
|
|
16
|
+
from apify.log import logger
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from types import TracebackType
|
|
20
|
+
|
|
21
|
+
from crawlee.events._event_manager import EventManagerOptions
|
|
22
|
+
|
|
23
|
+
from apify._configuration import Configuration
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
event_data_adapter = TypeAdapter[EventMessage | DeprecatedEvent | UnknownEvent](
|
|
27
|
+
Annotated[EventMessage, Discriminator('name')] | DeprecatedEvent | UnknownEvent
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@docs_group('Event managers')
|
|
32
|
+
class ApifyEventManager(EventManager):
|
|
33
|
+
"""Event manager for the Apify platform.
|
|
34
|
+
|
|
35
|
+
This class extends Crawlee's `EventManager` to provide Apify-specific functionality, including websocket
|
|
36
|
+
connectivity to the Apify platform for receiving platform events.
|
|
37
|
+
|
|
38
|
+
The event manager handles:
|
|
39
|
+
- Registration and emission of events and their listeners.
|
|
40
|
+
- Websocket connection to Apify platform events.
|
|
41
|
+
- Processing and validation of platform messages.
|
|
42
|
+
- Automatic event forwarding from the platform to local event listeners.
|
|
43
|
+
|
|
44
|
+
This class should not be used directly. Use the `Actor.on` and `Actor.off` methods to interact
|
|
45
|
+
with the event system.
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
def __init__(self, configuration: Configuration, **kwargs: Unpack[EventManagerOptions]) -> None:
|
|
49
|
+
"""Initialize a new instance.
|
|
50
|
+
|
|
51
|
+
Args:
|
|
52
|
+
configuration: The Actor configuration for the event manager.
|
|
53
|
+
**kwargs: Additional event manager options passed to the parent class.
|
|
54
|
+
"""
|
|
55
|
+
super().__init__(**kwargs)
|
|
56
|
+
|
|
57
|
+
self._configuration = configuration
|
|
58
|
+
"""The Actor configuration for the event manager."""
|
|
59
|
+
|
|
60
|
+
self._platform_events_websocket: websockets.asyncio.client.ClientConnection | None = None
|
|
61
|
+
"""WebSocket connection to the platform events."""
|
|
62
|
+
|
|
63
|
+
self._process_platform_messages_task: asyncio.Task | None = None
|
|
64
|
+
"""Task for processing messages from the platform websocket."""
|
|
65
|
+
|
|
66
|
+
self._connected_to_platform_websocket: asyncio.Future[bool] | None = None
|
|
67
|
+
"""Future that resolves when the connection to the platform websocket is established."""
|
|
68
|
+
|
|
69
|
+
@override
|
|
70
|
+
async def __aenter__(self) -> Self:
|
|
71
|
+
await super().__aenter__()
|
|
72
|
+
self._connected_to_platform_websocket = asyncio.Future()
|
|
73
|
+
|
|
74
|
+
# Run tasks but don't await them
|
|
75
|
+
if self._configuration.actor_events_ws_url:
|
|
76
|
+
self._process_platform_messages_task = asyncio.create_task(
|
|
77
|
+
self._process_platform_messages(self._configuration.actor_events_ws_url)
|
|
78
|
+
)
|
|
79
|
+
is_connected = await self._connected_to_platform_websocket
|
|
80
|
+
if not is_connected:
|
|
81
|
+
raise RuntimeError('Error connecting to platform events websocket!')
|
|
82
|
+
else:
|
|
83
|
+
logger.debug('APIFY_ACTOR_EVENTS_WS_URL env var not set, no events from Apify platform will be emitted.')
|
|
84
|
+
|
|
85
|
+
return self
|
|
86
|
+
|
|
87
|
+
@override
|
|
88
|
+
async def __aexit__(
|
|
89
|
+
self,
|
|
90
|
+
exc_type: type[BaseException] | None,
|
|
91
|
+
exc_value: BaseException | None,
|
|
92
|
+
exc_traceback: TracebackType | None,
|
|
93
|
+
) -> None:
|
|
94
|
+
if self._platform_events_websocket:
|
|
95
|
+
await self._platform_events_websocket.close()
|
|
96
|
+
|
|
97
|
+
if self._process_platform_messages_task and not self._process_platform_messages_task.done():
|
|
98
|
+
self._process_platform_messages_task.cancel()
|
|
99
|
+
with contextlib.suppress(asyncio.CancelledError):
|
|
100
|
+
await self._process_platform_messages_task
|
|
101
|
+
|
|
102
|
+
await super().__aexit__(exc_type, exc_value, exc_traceback)
|
|
103
|
+
|
|
104
|
+
async def _process_platform_messages(self, ws_url: str) -> None:
|
|
105
|
+
try:
|
|
106
|
+
async with websockets.asyncio.client.connect(ws_url) as websocket:
|
|
107
|
+
self._platform_events_websocket = websocket
|
|
108
|
+
if self._connected_to_platform_websocket is not None:
|
|
109
|
+
self._connected_to_platform_websocket.set_result(True)
|
|
110
|
+
|
|
111
|
+
async for message in websocket:
|
|
112
|
+
try:
|
|
113
|
+
parsed_message = event_data_adapter.validate_json(message)
|
|
114
|
+
|
|
115
|
+
if isinstance(parsed_message, DeprecatedEvent):
|
|
116
|
+
continue
|
|
117
|
+
|
|
118
|
+
if isinstance(parsed_message, UnknownEvent):
|
|
119
|
+
logger.info(
|
|
120
|
+
f'Unknown message received: event_name={parsed_message.name}, '
|
|
121
|
+
f'event_data={parsed_message.data}'
|
|
122
|
+
)
|
|
123
|
+
continue
|
|
124
|
+
|
|
125
|
+
self.emit(
|
|
126
|
+
event=parsed_message.name,
|
|
127
|
+
event_data=parsed_message.data
|
|
128
|
+
if not isinstance(parsed_message.data, SystemInfoEventData)
|
|
129
|
+
else parsed_message.data.to_crawlee_format(self._configuration.dedicated_cpus or 1),
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
if parsed_message.name == Event.MIGRATING:
|
|
133
|
+
await self._emit_persist_state_event_rec_task.stop()
|
|
134
|
+
self.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=True))
|
|
135
|
+
except Exception:
|
|
136
|
+
logger.exception('Cannot parse Actor event', extra={'message': message})
|
|
137
|
+
except Exception:
|
|
138
|
+
logger.exception('Error in websocket connection')
|
|
139
|
+
if self._connected_to_platform_websocket is not None:
|
|
140
|
+
self._connected_to_platform_websocket.set_result(False)
|