apify 2.7.2__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of apify has been flagged as possibly problematic. Click here for more details.
- apify/_actor.py +194 -126
- apify/_charging.py +34 -9
- apify/_configuration.py +79 -6
- apify/_crypto.py +0 -6
- apify/_models.py +7 -7
- apify/_proxy_configuration.py +10 -10
- apify/_utils.py +25 -2
- apify/events/__init__.py +5 -0
- apify/events/_apify_event_manager.py +140 -0
- apify/events/_types.py +102 -0
- apify/log.py +0 -9
- apify/request_loaders/__init__.py +18 -0
- apify/{storages/_request_list.py → request_loaders/_apify_request_list.py} +25 -18
- apify/request_loaders/py.typed +0 -0
- apify/scrapy/_logging_config.py +1 -4
- apify/scrapy/extensions/_httpcache.py +9 -5
- apify/scrapy/requests.py +3 -3
- apify/scrapy/scheduler.py +8 -5
- apify/storage_clients/__init__.py +12 -0
- apify/storage_clients/_apify/__init__.py +11 -0
- apify/storage_clients/_apify/_dataset_client.py +328 -0
- apify/storage_clients/_apify/_key_value_store_client.py +265 -0
- apify/storage_clients/_apify/_models.py +131 -0
- apify/storage_clients/_apify/_request_queue_client.py +327 -0
- apify/storage_clients/_apify/_request_queue_shared_client.py +527 -0
- apify/storage_clients/_apify/_request_queue_single_client.py +399 -0
- apify/storage_clients/_apify/_storage_client.py +106 -0
- apify/storage_clients/_apify/_utils.py +194 -0
- apify/storage_clients/_apify/py.typed +0 -0
- apify/storage_clients/_file_system/__init__.py +2 -0
- apify/storage_clients/_file_system/_key_value_store_client.py +57 -0
- apify/storage_clients/_file_system/_storage_client.py +41 -0
- apify/storage_clients/_smart_apify/__init__.py +1 -0
- apify/storage_clients/_smart_apify/_storage_client.py +117 -0
- apify/storage_clients/py.typed +0 -0
- apify/storages/__init__.py +1 -3
- {apify-2.7.2.dist-info → apify-3.0.0.dist-info}/METADATA +25 -9
- apify-3.0.0.dist-info/RECORD +57 -0
- apify/_platform_event_manager.py +0 -231
- apify/apify_storage_client/__init__.py +0 -3
- apify/apify_storage_client/_apify_storage_client.py +0 -72
- apify/apify_storage_client/_dataset_client.py +0 -190
- apify/apify_storage_client/_dataset_collection_client.py +0 -51
- apify/apify_storage_client/_key_value_store_client.py +0 -109
- apify/apify_storage_client/_key_value_store_collection_client.py +0 -51
- apify/apify_storage_client/_request_queue_client.py +0 -176
- apify/apify_storage_client/_request_queue_collection_client.py +0 -51
- apify-2.7.2.dist-info/RECORD +0 -44
- /apify/{apify_storage_client → events}/py.typed +0 -0
- {apify-2.7.2.dist-info → apify-3.0.0.dist-info}/WHEEL +0 -0
- {apify-2.7.2.dist-info → apify-3.0.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from logging import getLogger
|
|
4
|
+
from typing import TYPE_CHECKING, Final, Literal
|
|
5
|
+
|
|
6
|
+
from typing_extensions import override
|
|
7
|
+
|
|
8
|
+
from apify_client import ApifyClientAsync
|
|
9
|
+
from crawlee._utils.crypto import crypto_random_object_id
|
|
10
|
+
from crawlee.storage_clients._base import RequestQueueClient
|
|
11
|
+
from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
|
|
12
|
+
from crawlee.storages import RequestQueue
|
|
13
|
+
|
|
14
|
+
from ._models import ApifyRequestQueueMetadata, RequestQueueStats
|
|
15
|
+
from ._request_queue_shared_client import _ApifyRequestQueueSharedClient
|
|
16
|
+
from ._request_queue_single_client import _ApifyRequestQueueSingleClient
|
|
17
|
+
from ._utils import AliasResolver
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from collections.abc import Sequence
|
|
21
|
+
|
|
22
|
+
from apify_client.clients import RequestQueueClientAsync
|
|
23
|
+
from crawlee import Request
|
|
24
|
+
|
|
25
|
+
from apify import Configuration
|
|
26
|
+
|
|
27
|
+
# Module-level logger, named after the module per the stdlib `logging` convention.
logger = getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ApifyRequestQueueClient(RequestQueueClient):
    """Base class for Apify platform implementations of the request queue client."""

    _MAX_CACHED_REQUESTS: Final[int] = 1_000_000
    """Maximum number of requests that can be cached."""

    def __init__(
        self,
        *,
        api_client: RequestQueueClientAsync,
        metadata: RequestQueueMetadata,
        access: Literal['single', 'shared'] = 'single',
    ) -> None:
        """Initialize a new instance.

        Preferably use the `ApifyRequestQueueClient.open` class method to create a new instance.

        Args:
            api_client: Low-level Apify API client bound to a concrete request queue.
            metadata: Initial metadata of the queue, as fetched from the API.
            access: Selects the internal implementation ('single' consumer vs. 'shared' consumers).
        """
        self._api_client = api_client
        """The Apify request queue client for API operations."""

        self._implementation: _ApifyRequestQueueSingleClient | _ApifyRequestQueueSharedClient
        """Internal implementation used to communicate with the Apify platform based Request Queue."""
        # Pick the strategy object up front; all public methods below delegate to it.
        if access == 'single':
            self._implementation = _ApifyRequestQueueSingleClient(
                api_client=self._api_client, metadata=metadata, cache_size=self._MAX_CACHED_REQUESTS
            )
        elif access == 'shared':
            self._implementation = _ApifyRequestQueueSharedClient(
                api_client=self._api_client,
                metadata=metadata,
                cache_size=self._MAX_CACHED_REQUESTS,
                # The shared client refreshes metadata through this callback (see get_metadata below).
                metadata_getter=self.get_metadata,
            )
        else:
            raise RuntimeError(f"Unsupported access type: {access}. Allowed values are 'single' or 'shared'.")

    @property
    def _metadata(self) -> RequestQueueMetadata:
        """Locally tracked queue metadata, maintained by the active implementation."""
        return self._implementation.metadata

    @override
    async def add_batch_of_requests(
        self,
        requests: Sequence[Request],
        *,
        forefront: bool = False,
    ) -> AddRequestsResponse:
        """Add a batch of requests to the queue.

        Args:
            requests: The requests to add.
            forefront: Whether to add the requests to the beginning of the queue.

        Returns:
            Response containing information about the added requests.
        """
        return await self._implementation.add_batch_of_requests(requests, forefront=forefront)

    @override
    async def fetch_next_request(self) -> Request | None:
        """Return the next request in the queue to be processed.

        Once you successfully finish processing of the request, you need to call `mark_request_as_handled`
        to mark the request as handled in the queue. If there was some error in processing the request, call
        `reclaim_request` instead, so that the queue will give the request to some other consumer
        in another call to the `fetch_next_request` method.

        Returns:
            The request or `None` if there are no more pending requests.
        """
        return await self._implementation.fetch_next_request()

    @override
    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
        """Mark a request as handled after successful processing.

        Handled requests will never again be returned by the `fetch_next_request` method.

        Args:
            request: The request to mark as handled.

        Returns:
            Information about the queue operation. `None` if the given request was not in progress.
        """
        return await self._implementation.mark_request_as_handled(request)

    @override
    async def get_request(self, unique_key: str) -> Request | None:
        """Get a request by unique key.

        Args:
            unique_key: Unique key of the request to get.

        Returns:
            The request or None if not found.
        """
        return await self._implementation.get_request(unique_key)

    @override
    async def reclaim_request(
        self,
        request: Request,
        *,
        forefront: bool = False,
    ) -> ProcessedRequest | None:
        """Reclaim a failed request back to the queue.

        The request will be returned for processing later again by another call to `fetch_next_request`.

        Args:
            request: The request to return to the queue.
            forefront: Whether to add the request to the head or the end of the queue.

        Returns:
            Information about the queue operation. `None` if the given request was not in progress.
        """
        return await self._implementation.reclaim_request(request, forefront=forefront)

    @override
    async def is_empty(self) -> bool:
        """Check if the queue is empty.

        Returns:
            True if the queue is empty, False otherwise.
        """
        return await self._implementation.is_empty()

    @override
    async def get_metadata(self) -> ApifyRequestQueueMetadata:
        """Get metadata about the request queue.

        Returns:
            Metadata from the API, merged with local estimation, because in some cases, the data from the API can
            be delayed.

        Raises:
            ValueError: If the API returns no metadata for this queue.
        """
        response = await self._api_client.get()
        if response is None:
            raise ValueError('Failed to fetch request queue metadata from the API.')
        # Enhance API response by local estimations (API can be delayed few seconds, while local estimation not.)
        # max/min pick whichever side (API vs. local tracking) is further along for each counter/timestamp.
        return ApifyRequestQueueMetadata(
            id=response['id'],
            name=response['name'],
            total_request_count=max(response['totalRequestCount'], self._metadata.total_request_count),
            handled_request_count=max(response['handledRequestCount'], self._metadata.handled_request_count),
            pending_request_count=response['pendingRequestCount'],
            created_at=min(response['createdAt'], self._metadata.created_at),
            modified_at=max(response['modifiedAt'], self._metadata.modified_at),
            accessed_at=max(response['accessedAt'], self._metadata.accessed_at),
            had_multiple_clients=response['hadMultipleClients'] or self._metadata.had_multiple_clients,
            stats=RequestQueueStats.model_validate(response['stats'], by_alias=True),
        )

    @classmethod
    async def open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
        configuration: Configuration,
        access: Literal['single', 'shared'] = 'single',
    ) -> ApifyRequestQueueClient:
        """Open an Apify request queue client.

        This method creates and initializes a new instance of the Apify request queue client. It handles
        authentication, storage lookup/creation, and metadata retrieval, and sets up internal caching and queue
        management structures.

        Args:
            id: The ID of the RQ to open. If provided, searches for existing RQ by ID.
                Mutually exclusive with name and alias.
            name: The name of the RQ to open (global scope, persists across runs).
                Mutually exclusive with id and alias.
            alias: The alias of the RQ to open (run scope, creates unnamed storage).
                Mutually exclusive with id and name.
            configuration: The configuration object containing API credentials and settings. Must include a valid
                `token` and `api_base_url`. May also contain a `default_request_queue_id` for fallback when neither
                `id`, `name`, nor `alias` is provided.
            access: Controls the implementation of the request queue client based on expected scenario:
                - 'single' is suitable for single consumer scenarios. It makes less API calls, is cheaper and faster.
                - 'shared' is suitable for multiple consumers scenarios at the cost of higher API usage.
                Detailed constraints for the 'single' access type:
                - Only one client is consuming the request queue at the time.
                - Multiple producers can put requests to the queue, but their forefront requests are not guaranteed to
                  be handled so quickly as this client does not aggressively fetch the forefront and relies on local
                  head estimation.
                - Requests are only added to the queue, never deleted by other clients. (Marking as handled is ok.)
                - Other producers can add new requests, but not modify existing ones.
                  (Modifications would not be included in local cache)

        Returns:
            An instance for the opened or created storage client.

        Raises:
            ValueError: If the configuration is missing required fields (token, api_base_url), if more than one of
                `id`, `name`, or `alias` is provided, or if none are provided and no default storage ID is available
                in the configuration.
        """
        if sum(1 for param in [id, name, alias] if param is not None) > 1:
            raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')

        token = configuration.token
        if not token:
            raise ValueError(f'Apify storage client requires a valid token in Configuration (token={token}).')

        api_url = configuration.api_base_url
        if not api_url:
            raise ValueError(f'Apify storage client requires a valid API URL in Configuration (api_url={api_url}).')

        api_public_base_url = configuration.api_public_base_url
        if not api_public_base_url:
            raise ValueError(
                'Apify storage client requires a valid API public base URL in Configuration '
                f'(api_public_base_url={api_public_base_url}).'
            )

        # Create Apify client with the provided token and API URL.
        apify_client_async = ApifyClientAsync(
            token=token,
            api_url=api_url,
            max_retries=8,
            min_delay_between_retries_millis=500,
            timeout_secs=360,
        )
        apify_rqs_client = apify_client_async.request_queues()

        # Normalize unnamed default storage in cases where not defined in `configuration.default_request_queue_id` to
        # unnamed storage aliased as `__default__`
        if not any([alias, name, id, configuration.default_request_queue_id]):
            alias = '__default__'

        if alias:
            # Check if there is pre-existing alias mapping in the default KVS.
            async with AliasResolver(storage_type=RequestQueue, alias=alias, configuration=configuration) as _alias:
                id = await _alias.resolve_id()

                # There was no pre-existing alias in the mapping.
                # Create a new unnamed storage and store the mapping.
                if id is None:
                    new_storage_metadata = RequestQueueMetadata.model_validate(
                        await apify_rqs_client.get_or_create(),
                    )
                    id = new_storage_metadata.id
                    await _alias.store_mapping(storage_id=id)

        # If name is provided, get or create the storage by name.
        elif name:
            id = RequestQueueMetadata.model_validate(
                await apify_rqs_client.get_or_create(name=name),
            ).id

        # If none are provided, try to get the default storage ID from environment variables.
        elif id is None:
            id = configuration.default_request_queue_id
            if not id:
                raise ValueError(
                    'RequestQueue "id", "name", or "alias" must be specified, '
                    'or a default default_request_queue_id ID must be set in the configuration.'
                )

        # Use suitable client_key to make `hadMultipleClients` response of Apify API useful.
        # It should persist across migrated or resurrected Actor runs on the Apify platform.
        _api_max_client_key_length = 32
        # Prefer the stable actor run ID; fall back to a random ID. Either way, truncate to the API limit.
        client_key = (configuration.actor_run_id or crypto_random_object_id(length=_api_max_client_key_length))[
            :_api_max_client_key_length
        ]

        apify_rq_client = apify_client_async.request_queue(request_queue_id=id, client_key=client_key)

        # Fetch its metadata.
        metadata = await apify_rq_client.get()

        # If metadata is None, it means the storage does not exist, so we create it.
        if metadata is None:
            id = RequestQueueMetadata.model_validate(
                await apify_rqs_client.get_or_create(),
            ).id
            apify_rq_client = apify_client_async.request_queue(request_queue_id=id, client_key=client_key)

        # Verify that the storage exists by fetching its metadata again.
        metadata = await apify_rq_client.get()
        if metadata is None:
            raise ValueError(f'Opening request queue with id={id}, name={name}, and alias={alias} failed.')

        metadata_model = RequestQueueMetadata.model_validate(metadata)

        return cls(api_client=apify_rq_client, metadata=metadata_model, access=access)

    @override
    async def purge(self) -> None:
        """Purge is not supported for Apify platform request queues.

        Raises:
            NotImplementedError: Always; use `drop` to delete the queue instead.
        """
        raise NotImplementedError(
            'Purging the request queue is not supported in the Apify platform. '
            'Use the `drop` method to delete the request queue instead.'
        )

    @override
    async def drop(self) -> None:
        """Delete the request queue on the Apify platform via the API client."""
        await self._api_client.delete()
|