apify 3.0.3b1__py3-none-any.whl → 3.0.4b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of apify might be problematic. Click here for more details.
- apify/_actor.py +0 -2
- apify/storage_clients/_apify/_models.py +1 -1
- apify/storage_clients/_apify/_storage_client.py +59 -33
- apify/storage_clients/_smart_apify/_storage_client.py +40 -33
- {apify-3.0.3b1.dist-info → apify-3.0.4b1.dist-info}/METADATA +1 -1
- {apify-3.0.3b1.dist-info → apify-3.0.4b1.dist-info}/RECORD +8 -8
- {apify-3.0.3b1.dist-info → apify-3.0.4b1.dist-info}/WHEEL +0 -0
- {apify-3.0.3b1.dist-info → apify-3.0.4b1.dist-info}/licenses/LICENSE +0 -0
apify/_actor.py
CHANGED
|
@@ -48,7 +48,6 @@ if TYPE_CHECKING:
|
|
|
48
48
|
from typing_extensions import Self
|
|
49
49
|
|
|
50
50
|
from crawlee.proxy_configuration import _NewUrlFunction
|
|
51
|
-
from crawlee.storage_clients import StorageClient
|
|
52
51
|
|
|
53
52
|
from apify._models import Webhook
|
|
54
53
|
|
|
@@ -140,7 +139,6 @@ class _ActorType:
|
|
|
140
139
|
# `__init__` method should not be considered final.
|
|
141
140
|
|
|
142
141
|
self._apify_client: ApifyClientAsync | None = None
|
|
143
|
-
self._local_storage_client: StorageClient | None = None
|
|
144
142
|
|
|
145
143
|
self._is_exiting = False
|
|
146
144
|
self._is_initialized = False
|
|
@@ -120,7 +120,7 @@ class RequestQueueStats(BaseModel):
|
|
|
120
120
|
"""The number of request queue reads."""
|
|
121
121
|
|
|
122
122
|
storage_bytes: Annotated[int, Field(alias='storageBytes', default=0)]
|
|
123
|
-
"""Storage size in
|
|
123
|
+
"""Storage size in bytes."""
|
|
124
124
|
|
|
125
125
|
write_count: Annotated[int, Field(alias='writeCount', default=0)]
|
|
126
126
|
"""The number of request queue writes."""
|
|
@@ -21,43 +21,59 @@ if TYPE_CHECKING:
|
|
|
21
21
|
|
|
22
22
|
@docs_group('Storage clients')
|
|
23
23
|
class ApifyStorageClient(StorageClient):
|
|
24
|
-
"""Apify storage client.
|
|
24
|
+
"""Apify platform implementation of the storage client.
|
|
25
|
+
|
|
26
|
+
This storage client provides access to datasets, key-value stores, and request queues that persist data
|
|
27
|
+
to the Apify platform. Each storage type is implemented with its own specific Apify client that stores data
|
|
28
|
+
in the cloud, making it accessible from anywhere.
|
|
29
|
+
|
|
30
|
+
The communication with the Apify platform is handled via the Apify API client for Python, which is an HTTP API
|
|
31
|
+
wrapper. For maximum efficiency and performance of the storage clients, various caching mechanisms are used to
|
|
32
|
+
minimize the number of API calls made to the Apify platform. Data can be inspected and manipulated through
|
|
33
|
+
the Apify console web interface or via the Apify API.
|
|
34
|
+
|
|
35
|
+
The request queue client supports two access modes controlled by the `request_queue_access` parameter:
|
|
36
|
+
|
|
37
|
+
### Single mode
|
|
38
|
+
|
|
39
|
+
The `single` mode is optimized for scenarios with only one consumer. It minimizes API calls, making it faster
|
|
40
|
+
and more cost-efficient compared to the `shared` mode. This option is ideal when a single Actor is responsible
|
|
41
|
+
for consuming the entire request queue. Using multiple consumers simultaneously may lead to inconsistencies
|
|
42
|
+
or unexpected behavior.
|
|
43
|
+
|
|
44
|
+
In this mode, multiple producers can safely add new requests, but forefront requests may not be processed
|
|
45
|
+
immediately, as the client relies on local head estimation instead of frequent forefront fetching. Requests can
|
|
46
|
+
also be added or marked as handled by other clients, but they must not be deleted or modified, since such changes
|
|
47
|
+
would not be reflected in the local cache. If a request is already fully cached locally, marking it as handled
|
|
48
|
+
by another client will be ignored by this client. This does not cause errors but can occasionally result in
|
|
49
|
+
reprocessing a request that was already handled elsewhere. If the request was not yet cached locally, marking
|
|
50
|
+
it as handled poses no issue.
|
|
51
|
+
|
|
52
|
+
### Shared mode
|
|
53
|
+
|
|
54
|
+
The `shared` mode is designed for scenarios with multiple concurrent consumers. It ensures proper synchronization
|
|
55
|
+
and consistency across clients, at the cost of higher API usage and slightly worse performance. This mode is safe
|
|
56
|
+
for concurrent access from multiple processes, including Actors running in parallel on the Apify platform. It
|
|
57
|
+
should be used when multiple consumers need to process requests from the same queue simultaneously.
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
_LSP_ERROR_MSG = 'Expected "configuration" to be an instance of "apify.Configuration", but got {} instead.'
|
|
61
|
+
"""This class (intentionally) violates the Liskov Substitution Principle.
|
|
62
|
+
|
|
63
|
+
It requires a specialized `Configuration` instance compared to its parent class.
|
|
64
|
+
"""
|
|
25
65
|
|
|
26
66
|
def __init__(self, *, request_queue_access: Literal['single', 'shared'] = 'single') -> None:
|
|
27
|
-
"""Initialize
|
|
67
|
+
"""Initialize a new instance.
|
|
28
68
|
|
|
29
69
|
Args:
|
|
30
|
-
request_queue_access:
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
- Only one client is consuming the request queue at the time.
|
|
35
|
-
- Multiple producers can put requests to the queue, but their forefront requests are not guaranteed to
|
|
36
|
-
be handled so quickly as this client does not aggressively fetch the forefront and relies on local
|
|
37
|
-
head estimation.
|
|
38
|
-
- Requests are only added to the queue, never deleted by other clients. (Marking as handled is ok.)
|
|
39
|
-
- Other producers can add new requests, but not modify existing ones.
|
|
40
|
-
(Modifications would not be included in local cache)
|
|
70
|
+
request_queue_access: Defines how the request queue client behaves. Use `single` mode for a single
|
|
71
|
+
consumer. It has fewer API calls, meaning better performance and lower costs. If you need multiple
|
|
72
|
+
concurrent consumers use `shared` mode, but expect worse performance and higher costs due to
|
|
73
|
+
the additional overhead.
|
|
41
74
|
"""
|
|
42
75
|
self._request_queue_access = request_queue_access
|
|
43
76
|
|
|
44
|
-
# This class breaches Liskov Substitution Principle. It requires specialized Configuration compared to its parent.
|
|
45
|
-
_lsp_violation_error_message_template = (
|
|
46
|
-
'Expected "configuration" to be an instance of "apify.Configuration", but got {} instead.'
|
|
47
|
-
)
|
|
48
|
-
|
|
49
|
-
@override
|
|
50
|
-
def get_storage_client_cache_key(self, configuration: CrawleeConfiguration) -> Hashable:
|
|
51
|
-
if isinstance(configuration, ApifyConfiguration):
|
|
52
|
-
# It is not supported to open exactly same queue with 'single' and 'shared' client at the same time.
|
|
53
|
-
# Whichever client variation gets used first, wins.
|
|
54
|
-
return super().get_storage_client_cache_key(configuration), hash_api_base_url_and_token(configuration)
|
|
55
|
-
|
|
56
|
-
config_class = type(configuration)
|
|
57
|
-
raise TypeError(
|
|
58
|
-
self._lsp_violation_error_message_template.format(f'{config_class.__module__}.{config_class.__name__}')
|
|
59
|
-
)
|
|
60
|
-
|
|
61
77
|
@override
|
|
62
78
|
async def create_dataset_client(
|
|
63
79
|
self,
|
|
@@ -71,7 +87,7 @@ class ApifyStorageClient(StorageClient):
|
|
|
71
87
|
if isinstance(configuration, ApifyConfiguration):
|
|
72
88
|
return await ApifyDatasetClient.open(id=id, name=name, alias=alias, configuration=configuration)
|
|
73
89
|
|
|
74
|
-
raise TypeError(self.
|
|
90
|
+
raise TypeError(self._LSP_ERROR_MSG.format(type(configuration).__name__))
|
|
75
91
|
|
|
76
92
|
@override
|
|
77
93
|
async def create_kvs_client(
|
|
@@ -86,7 +102,7 @@ class ApifyStorageClient(StorageClient):
|
|
|
86
102
|
if isinstance(configuration, ApifyConfiguration):
|
|
87
103
|
return await ApifyKeyValueStoreClient.open(id=id, name=name, alias=alias, configuration=configuration)
|
|
88
104
|
|
|
89
|
-
raise TypeError(self.
|
|
105
|
+
raise TypeError(self._LSP_ERROR_MSG.format(type(configuration).__name__))
|
|
90
106
|
|
|
91
107
|
@override
|
|
92
108
|
async def create_rq_client(
|
|
@@ -103,4 +119,14 @@ class ApifyStorageClient(StorageClient):
|
|
|
103
119
|
id=id, name=name, alias=alias, configuration=configuration, access=self._request_queue_access
|
|
104
120
|
)
|
|
105
121
|
|
|
106
|
-
raise TypeError(self.
|
|
122
|
+
raise TypeError(self._LSP_ERROR_MSG.format(type(configuration).__name__))
|
|
123
|
+
|
|
124
|
+
@override
|
|
125
|
+
def get_storage_client_cache_key(self, configuration: CrawleeConfiguration) -> Hashable:
|
|
126
|
+
if isinstance(configuration, ApifyConfiguration):
|
|
127
|
+
# It is not supported to open exactly same queue with 'single' and 'shared' client at the same time.
|
|
128
|
+
# Whichever client variation gets used first, wins.
|
|
129
|
+
return super().get_storage_client_cache_key(configuration), hash_api_base_url_and_token(configuration)
|
|
130
|
+
|
|
131
|
+
config_class = type(configuration)
|
|
132
|
+
raise TypeError(self._LSP_ERROR_MSG.format(f'{config_class.__module__}.{config_class.__name__}'))
|
|
@@ -8,8 +8,7 @@ from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, Re
|
|
|
8
8
|
|
|
9
9
|
from apify._configuration import Configuration as ApifyConfiguration
|
|
10
10
|
from apify._utils import docs_group
|
|
11
|
-
from apify.storage_clients import ApifyStorageClient
|
|
12
|
-
from apify.storage_clients._file_system import ApifyFileSystemStorageClient
|
|
11
|
+
from apify.storage_clients import ApifyStorageClient, FileSystemStorageClient
|
|
13
12
|
|
|
14
13
|
if TYPE_CHECKING:
|
|
15
14
|
from collections.abc import Hashable
|
|
@@ -19,28 +18,36 @@ if TYPE_CHECKING:
|
|
|
19
18
|
|
|
20
19
|
@docs_group('Storage clients')
|
|
21
20
|
class SmartApifyStorageClient(StorageClient):
|
|
22
|
-
"""
|
|
21
|
+
"""Storage client that automatically selects cloud or local storage client based on the environment.
|
|
23
22
|
|
|
24
|
-
|
|
25
|
-
|
|
23
|
+
This storage client provides access to datasets, key-value stores, and request queues by intelligently
|
|
24
|
+
delegating to either the cloud or local storage client based on the execution environment and configuration.
|
|
25
|
+
|
|
26
|
+
When running on the Apify platform (which is detected via environment variables), this client automatically
|
|
27
|
+
uses the `cloud_storage_client` to store storage data there. When running locally, it uses the
|
|
28
|
+
`local_storage_client` to store storage data there. You can also force cloud storage usage from your
|
|
29
|
+
local machine by using the `force_cloud` argument.
|
|
30
|
+
|
|
31
|
+
This storage client is designed to work specifically in `Actor` context and provides a seamless development
|
|
32
|
+
experience where the same code works both locally and on the Apify platform without any changes.
|
|
26
33
|
"""
|
|
27
34
|
|
|
28
35
|
def __init__(
|
|
29
36
|
self,
|
|
30
37
|
*,
|
|
31
|
-
cloud_storage_client:
|
|
38
|
+
cloud_storage_client: StorageClient | None = None,
|
|
32
39
|
local_storage_client: StorageClient | None = None,
|
|
33
40
|
) -> None:
|
|
34
|
-
"""Initialize
|
|
41
|
+
"""Initialize a new instance.
|
|
35
42
|
|
|
36
43
|
Args:
|
|
37
|
-
cloud_storage_client:
|
|
38
|
-
`force_cloud` argument
|
|
39
|
-
local_storage_client:
|
|
40
|
-
|
|
44
|
+
cloud_storage_client: Storage client used when an Actor is running on the Apify platform, or when
|
|
45
|
+
explicitly enabled via the `force_cloud` argument. Defaults to `ApifyStorageClient`.
|
|
46
|
+
local_storage_client: Storage client used when an Actor is not running on the Apify platform and when
|
|
47
|
+
`force_cloud` flag is not set. Defaults to `FileSystemStorageClient`.
|
|
41
48
|
"""
|
|
42
|
-
self._cloud_storage_client = cloud_storage_client or ApifyStorageClient(
|
|
43
|
-
self._local_storage_client = local_storage_client or
|
|
49
|
+
self._cloud_storage_client = cloud_storage_client or ApifyStorageClient()
|
|
50
|
+
self._local_storage_client = local_storage_client or FileSystemStorageClient()
|
|
44
51
|
|
|
45
52
|
def __str__(self) -> str:
|
|
46
53
|
return (
|
|
@@ -48,26 +55,6 @@ class SmartApifyStorageClient(StorageClient):
|
|
|
48
55
|
f' local_storage_client={self._local_storage_client.__class__.__name__})'
|
|
49
56
|
)
|
|
50
57
|
|
|
51
|
-
def get_suitable_storage_client(self, *, force_cloud: bool = False) -> StorageClient:
|
|
52
|
-
"""Get a suitable storage client based on the global configuration and the value of the force_cloud flag.
|
|
53
|
-
|
|
54
|
-
Args:
|
|
55
|
-
force_cloud: If True, return `cloud_storage_client`.
|
|
56
|
-
"""
|
|
57
|
-
if ApifyConfiguration.get_global_configuration().is_at_home:
|
|
58
|
-
return self._cloud_storage_client
|
|
59
|
-
|
|
60
|
-
configuration = ApifyConfiguration.get_global_configuration()
|
|
61
|
-
if force_cloud:
|
|
62
|
-
if configuration.token is None:
|
|
63
|
-
raise RuntimeError(
|
|
64
|
-
'In order to use the Apify cloud storage from your computer, '
|
|
65
|
-
'you need to provide an Apify token using the APIFY_TOKEN environment variable.'
|
|
66
|
-
)
|
|
67
|
-
return self._cloud_storage_client
|
|
68
|
-
|
|
69
|
-
return self._local_storage_client
|
|
70
|
-
|
|
71
58
|
@override
|
|
72
59
|
def get_storage_client_cache_key(self, configuration: CrawleeConfiguration) -> Hashable:
|
|
73
60
|
if ApifyConfiguration.get_global_configuration().is_at_home:
|
|
@@ -115,3 +102,23 @@ class SmartApifyStorageClient(StorageClient):
|
|
|
115
102
|
return await self.get_suitable_storage_client().create_rq_client(
|
|
116
103
|
id=id, name=id, alias=alias, configuration=configuration
|
|
117
104
|
)
|
|
105
|
+
|
|
106
|
+
def get_suitable_storage_client(self, *, force_cloud: bool = False) -> StorageClient:
|
|
107
|
+
"""Get a suitable storage client based on the global configuration and the value of the force_cloud flag.
|
|
108
|
+
|
|
109
|
+
Args:
|
|
110
|
+
force_cloud: If True, return `cloud_storage_client`.
|
|
111
|
+
"""
|
|
112
|
+
if ApifyConfiguration.get_global_configuration().is_at_home:
|
|
113
|
+
return self._cloud_storage_client
|
|
114
|
+
|
|
115
|
+
configuration = ApifyConfiguration.get_global_configuration()
|
|
116
|
+
if force_cloud:
|
|
117
|
+
if configuration.token is None:
|
|
118
|
+
raise RuntimeError(
|
|
119
|
+
'In order to use the Apify cloud storage from your computer, '
|
|
120
|
+
'you need to provide an Apify token using the APIFY_TOKEN environment variable.'
|
|
121
|
+
)
|
|
122
|
+
return self._cloud_storage_client
|
|
123
|
+
|
|
124
|
+
return self._local_storage_client
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
apify/__init__.py,sha256=HpgKg2FZWJuSPfDygzJ62psylhw4NN4tKFnoYUIhcd4,838
|
|
2
|
-
apify/_actor.py,sha256=
|
|
2
|
+
apify/_actor.py,sha256=eJzlRicWkVuO-151ikjJnRMRl-UgJ2QKg6wahtoc2Rc,57252
|
|
3
3
|
apify/_charging.py,sha256=KjZ2DnEMS0Tt8ibizmmt0RwBq8FOAsD1z-hKFgdazcY,13143
|
|
4
4
|
apify/_configuration.py,sha256=7ZHhgRp98kr35zx4k4EB2aImq7Dq1FJjPg7r5bucv_M,14984
|
|
5
5
|
apify/_consts.py,sha256=CjhyEJ4Mi0lcIrzfqz8dN7nPJWGjCeBrrXQy1PZ6zRI,440
|
|
@@ -37,21 +37,21 @@ apify/storage_clients/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF
|
|
|
37
37
|
apify/storage_clients/_apify/__init__.py,sha256=mtbVDsxqWL3kx30elnh0kAn2kZ4s3BBsWa15Y5e7RMU,347
|
|
38
38
|
apify/storage_clients/_apify/_dataset_client.py,sha256=qmCJyL1MN83tYRXmc31P6yMIXVZMyRrGjr7R6-86FSE,11869
|
|
39
39
|
apify/storage_clients/_apify/_key_value_store_client.py,sha256=994a5bM_BGHIeirnny6QlXjy5CzMU2I9SmMksCbHCUY,9357
|
|
40
|
-
apify/storage_clients/_apify/_models.py,sha256=
|
|
40
|
+
apify/storage_clients/_apify/_models.py,sha256=szYdJOvWQ6hrmwUp7y7QsoQrRh5TlIYNxmO7nOe-M14,4556
|
|
41
41
|
apify/storage_clients/_apify/_request_queue_client.py,sha256=tAyap34gpxvPiQ0McDjX5ojq1ZIZc4EI3PrW8VQqS4k,13292
|
|
42
42
|
apify/storage_clients/_apify/_request_queue_shared_client.py,sha256=pWmd6aPxM-eZ6PC1MfsfTcjD2mGGpCDS3ZZ3cG_2MEA,20971
|
|
43
43
|
apify/storage_clients/_apify/_request_queue_single_client.py,sha256=d2txMwxW1nlYnvjdOH8xpxhcOYNeyc1ousGHRE7jsPg,17468
|
|
44
|
-
apify/storage_clients/_apify/_storage_client.py,sha256=
|
|
44
|
+
apify/storage_clients/_apify/_storage_client.py,sha256=TcmMzbEMgyndxaT6lIMl2fTG4oQR_lpLSHTmwwH0tXs,6515
|
|
45
45
|
apify/storage_clients/_apify/_utils.py,sha256=375gk_TJyMWIIgRbE9SS0hQup0h6sA3mzpTG53XIjkM,8769
|
|
46
46
|
apify/storage_clients/_apify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
47
47
|
apify/storage_clients/_file_system/__init__.py,sha256=rDbXatXV9wHKPhKTrXDzWnexhTm7sIJQWucMi-P-SD4,130
|
|
48
48
|
apify/storage_clients/_file_system/_key_value_store_client.py,sha256=gxM3ap67PnY80Rd7P3onPAf2pksYpU0LoAlJdayEMdU,4179
|
|
49
49
|
apify/storage_clients/_file_system/_storage_client.py,sha256=rcwpKYlrWzvlSA2xoxftg-EZAi_iGZ3vOCbu0C5lKDE,1396
|
|
50
50
|
apify/storage_clients/_smart_apify/__init__.py,sha256=614B2AaWY-dx6RQ6mod7VVR8gFh75-_jnq5BeDD7hSc,53
|
|
51
|
-
apify/storage_clients/_smart_apify/_storage_client.py,sha256=
|
|
51
|
+
apify/storage_clients/_smart_apify/_storage_client.py,sha256=ZNNY4Qm9Cx_UFqBaforT28gC4hhOnCcKWpUYCIvzj48,5218
|
|
52
52
|
apify/storages/__init__.py,sha256=-9tEYJVabVs_eRVhUehxN58GH0UG8OfuGjGwuDieP2M,122
|
|
53
53
|
apify/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
54
|
-
apify-3.0.
|
|
55
|
-
apify-3.0.
|
|
56
|
-
apify-3.0.
|
|
57
|
-
apify-3.0.
|
|
54
|
+
apify-3.0.4b1.dist-info/METADATA,sha256=Y3LBVJMWeWazp-foQTEHw-5kD1AGlTkmOPRwJZ2KAcU,22582
|
|
55
|
+
apify-3.0.4b1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
56
|
+
apify-3.0.4b1.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
|
|
57
|
+
apify-3.0.4b1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|