apify 2.7.1b8__py3-none-any.whl → 2.7.1b10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of apify might be problematic. Click here for more details.
- apify/storage_clients/_apify/_request_queue_client.py +70 -72
- {apify-2.7.1b8.dist-info → apify-2.7.1b10.dist-info}/METADATA +1 -1
- {apify-2.7.1b8.dist-info → apify-2.7.1b10.dist-info}/RECORD +5 -5
- {apify-2.7.1b8.dist-info → apify-2.7.1b10.dist-info}/WHEEL +0 -0
- {apify-2.7.1b8.dist-info → apify-2.7.1b10.dist-info}/licenses/LICENSE +0 -0
|
@@ -13,6 +13,7 @@ from cachetools import LRUCache
|
|
|
13
13
|
from typing_extensions import override
|
|
14
14
|
|
|
15
15
|
from apify_client import ApifyClientAsync
|
|
16
|
+
from crawlee._utils.crypto import crypto_random_object_id
|
|
16
17
|
from crawlee.storage_clients._base import RequestQueueClient
|
|
17
18
|
from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
|
|
18
19
|
|
|
@@ -65,10 +66,7 @@ class ApifyRequestQueueClient(RequestQueueClient):
|
|
|
65
66
|
self,
|
|
66
67
|
*,
|
|
67
68
|
api_client: RequestQueueClientAsync,
|
|
68
|
-
|
|
69
|
-
name: str | None,
|
|
70
|
-
total_request_count: int,
|
|
71
|
-
handled_request_count: int,
|
|
69
|
+
metadata: RequestQueueMetadata,
|
|
72
70
|
) -> None:
|
|
73
71
|
"""Initialize a new instance.
|
|
74
72
|
|
|
@@ -77,11 +75,8 @@ class ApifyRequestQueueClient(RequestQueueClient):
|
|
|
77
75
|
self._api_client = api_client
|
|
78
76
|
"""The Apify request queue client for API operations."""
|
|
79
77
|
|
|
80
|
-
self.
|
|
81
|
-
"""
|
|
82
|
-
|
|
83
|
-
self._name = name
|
|
84
|
-
"""The name of the request queue."""
|
|
78
|
+
self._metadata = metadata
|
|
79
|
+
"""Additional data related to the RequestQueue."""
|
|
85
80
|
|
|
86
81
|
self._queue_head = deque[str]()
|
|
87
82
|
"""A deque to store request unique keys in the queue head."""
|
|
@@ -95,40 +90,43 @@ class ApifyRequestQueueClient(RequestQueueClient):
|
|
|
95
90
|
self._should_check_for_forefront_requests = False
|
|
96
91
|
"""Whether to check for forefront requests in the next list_head call."""
|
|
97
92
|
|
|
98
|
-
self._had_multiple_clients = False
|
|
99
|
-
"""Whether the request queue has been accessed by multiple clients."""
|
|
100
|
-
|
|
101
|
-
self._initial_total_count = total_request_count
|
|
102
|
-
"""The initial total request count (from the API) when the queue was opened."""
|
|
103
|
-
|
|
104
|
-
self._initial_handled_count = handled_request_count
|
|
105
|
-
"""The initial handled request count (from the API) when the queue was opened."""
|
|
106
|
-
|
|
107
|
-
self._assumed_total_count = 0
|
|
108
|
-
"""The number of requests we assume are in the queue (tracked manually for this instance)."""
|
|
109
|
-
|
|
110
|
-
self._assumed_handled_count = 0
|
|
111
|
-
"""The number of requests we assume have been handled (tracked manually for this instance)."""
|
|
112
|
-
|
|
113
93
|
self._fetch_lock = asyncio.Lock()
|
|
114
94
|
"""Fetch lock to minimize race conditions when communicating with API."""
|
|
115
95
|
|
|
96
|
+
async def _get_metadata_estimate(self) -> RequestQueueMetadata:
|
|
97
|
+
"""Try to get cached metadata first. If multiple clients, fuse with global metadata.
|
|
98
|
+
|
|
99
|
+
This method is used internally to avoid an unnecessary API call unless one is needed (multiple clients).
|
|
100
|
+
The local estimate of the metadata is available without delay, unlike metadata from the API. In a situation where there is only one
|
|
101
|
+
client, it is the better choice.
|
|
102
|
+
"""
|
|
103
|
+
if self._metadata.had_multiple_clients:
|
|
104
|
+
return await self.get_metadata()
|
|
105
|
+
# Get local estimation (will not include changes done by another client)
|
|
106
|
+
return self._metadata
|
|
107
|
+
|
|
116
108
|
@override
|
|
117
109
|
async def get_metadata(self) -> RequestQueueMetadata:
|
|
118
|
-
|
|
119
|
-
handled_count = self._initial_handled_count + self._assumed_handled_count
|
|
120
|
-
pending_count = total_count - handled_count
|
|
110
|
+
"""Get metadata about the request queue.
|
|
121
111
|
|
|
112
|
+
Returns:
|
|
113
|
+
Metadata from the API, merged with local estimation, because in some cases, the data from the API can
|
|
114
|
+
be delayed.
|
|
115
|
+
"""
|
|
116
|
+
response = await self._api_client.get()
|
|
117
|
+
if response is None:
|
|
118
|
+
raise ValueError('Failed to fetch request queue metadata from the API.')
|
|
119
|
+
# Enhance the API response with local estimations (the API can be delayed by a few seconds, while the local estimation is not).
|
|
122
120
|
return RequestQueueMetadata(
|
|
123
|
-
id=
|
|
124
|
-
name=
|
|
125
|
-
total_request_count=
|
|
126
|
-
handled_request_count=
|
|
127
|
-
pending_request_count=
|
|
128
|
-
created_at=
|
|
129
|
-
modified_at=
|
|
130
|
-
accessed_at=
|
|
131
|
-
had_multiple_clients=self.
|
|
121
|
+
id=response['id'],
|
|
122
|
+
name=response['name'],
|
|
123
|
+
total_request_count=max(response['totalRequestCount'], self._metadata.total_request_count),
|
|
124
|
+
handled_request_count=max(response['handledRequestCount'], self._metadata.handled_request_count),
|
|
125
|
+
pending_request_count=response['pendingRequestCount'],
|
|
126
|
+
created_at=min(response['createdAt'], self._metadata.created_at),
|
|
127
|
+
modified_at=max(response['modifiedAt'], self._metadata.modified_at),
|
|
128
|
+
accessed_at=max(response['accessedAt'], self._metadata.accessed_at),
|
|
129
|
+
had_multiple_clients=response['hadMultipleClients'] or self._metadata.had_multiple_clients,
|
|
132
130
|
)
|
|
133
131
|
|
|
134
132
|
@classmethod
|
|
@@ -187,27 +185,34 @@ class ApifyRequestQueueClient(RequestQueueClient):
|
|
|
187
185
|
)
|
|
188
186
|
apify_rqs_client = apify_client_async.request_queues()
|
|
189
187
|
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
188
|
+
match (id, name):
|
|
189
|
+
case (None, None):
|
|
190
|
+
# If both id and name are None, try to get the default storage ID from environment variables.
|
|
191
|
+
# The default storage ID environment variable is set by the Apify platform. It also contains
|
|
192
|
+
# a new storage ID after Actor's reboot or migration.
|
|
193
|
+
id = configuration.default_request_queue_id
|
|
194
|
+
case (None, name):
|
|
195
|
+
# If only name is provided, get or create the storage by name.
|
|
196
|
+
id = RequestQueueMetadata.model_validate(
|
|
197
|
+
await apify_rqs_client.get_or_create(name=name),
|
|
198
|
+
).id
|
|
199
|
+
case (_, None):
|
|
200
|
+
# If only id is provided, use it.
|
|
201
|
+
pass
|
|
202
|
+
case (_, _):
|
|
203
|
+
# If both id and name are provided, raise an error.
|
|
204
|
+
raise ValueError('Only one of "id" or "name" can be specified, not both.')
|
|
205
|
+
if id is None:
|
|
206
|
+
raise RuntimeError('Unreachable code')
|
|
197
207
|
|
|
198
|
-
#
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
208
|
+
# Use suitable client_key to make `hadMultipleClients` response of Apify API useful.
|
|
209
|
+
# It should persist across migrated or resurrected Actor runs on the Apify platform.
|
|
210
|
+
_api_max_client_key_length = 32
|
|
211
|
+
client_key = (configuration.actor_run_id or crypto_random_object_id(length=_api_max_client_key_length))[
|
|
212
|
+
:_api_max_client_key_length
|
|
213
|
+
]
|
|
204
214
|
|
|
205
|
-
|
|
206
|
-
# The default storage ID environment variable is set by the Apify platform. It also contains
|
|
207
|
-
# a new storage ID after Actor's reboot or migration.
|
|
208
|
-
if id is None and name is None:
|
|
209
|
-
id = configuration.default_request_queue_id
|
|
210
|
-
apify_rq_client = apify_client_async.request_queue(request_queue_id=id)
|
|
215
|
+
apify_rq_client = apify_client_async.request_queue(request_queue_id=id, client_key=client_key)
|
|
211
216
|
|
|
212
217
|
# Fetch its metadata.
|
|
213
218
|
metadata = await apify_rq_client.get()
|
|
@@ -217,27 +222,18 @@ class ApifyRequestQueueClient(RequestQueueClient):
|
|
|
217
222
|
id = RequestQueueMetadata.model_validate(
|
|
218
223
|
await apify_rqs_client.get_or_create(),
|
|
219
224
|
).id
|
|
220
|
-
apify_rq_client = apify_client_async.request_queue(request_queue_id=id)
|
|
225
|
+
apify_rq_client = apify_client_async.request_queue(request_queue_id=id, client_key=client_key)
|
|
221
226
|
|
|
222
227
|
# Verify that the storage exists by fetching its metadata again.
|
|
223
228
|
metadata = await apify_rq_client.get()
|
|
224
229
|
if metadata is None:
|
|
225
230
|
raise ValueError(f'Opening request queue with id={id} and name={name} failed.')
|
|
226
231
|
|
|
227
|
-
metadata_model = RequestQueueMetadata.model_validate(
|
|
228
|
-
await apify_rqs_client.get_or_create(),
|
|
229
|
-
)
|
|
230
|
-
|
|
231
|
-
# Ensure we have a valid ID.
|
|
232
|
-
if id is None:
|
|
233
|
-
raise ValueError('Request queue ID cannot be None.')
|
|
232
|
+
metadata_model = RequestQueueMetadata.model_validate(metadata)
|
|
234
233
|
|
|
235
234
|
return cls(
|
|
236
235
|
api_client=apify_rq_client,
|
|
237
|
-
|
|
238
|
-
name=name,
|
|
239
|
-
total_request_count=metadata_model.total_request_count,
|
|
240
|
-
handled_request_count=metadata_model.handled_request_count,
|
|
236
|
+
metadata=metadata_model,
|
|
241
237
|
)
|
|
242
238
|
|
|
243
239
|
@override
|
|
@@ -341,7 +337,7 @@ class ApifyRequestQueueClient(RequestQueueClient):
|
|
|
341
337
|
if not processed_request.was_already_present and not processed_request.was_already_handled:
|
|
342
338
|
new_request_count += 1
|
|
343
339
|
|
|
344
|
-
self.
|
|
340
|
+
self._metadata.total_request_count += new_request_count
|
|
345
341
|
|
|
346
342
|
return api_response
|
|
347
343
|
|
|
@@ -439,7 +435,7 @@ class ApifyRequestQueueClient(RequestQueueClient):
|
|
|
439
435
|
|
|
440
436
|
# Update assumed handled count if this wasn't already handled
|
|
441
437
|
if not processed_request.was_already_handled:
|
|
442
|
-
self.
|
|
438
|
+
self._metadata.handled_request_count += 1
|
|
443
439
|
|
|
444
440
|
# Update the cache with the handled request
|
|
445
441
|
cache_key = request.unique_key
|
|
@@ -487,7 +483,7 @@ class ApifyRequestQueueClient(RequestQueueClient):
|
|
|
487
483
|
# If the request was previously handled, decrement our handled count since
|
|
488
484
|
# we're putting it back for processing.
|
|
489
485
|
if request.was_already_handled and not processed_request.was_already_handled:
|
|
490
|
-
self.
|
|
486
|
+
self._metadata.handled_request_count -= 1
|
|
491
487
|
|
|
492
488
|
# Update the cache
|
|
493
489
|
cache_key = request.unique_key
|
|
@@ -539,7 +535,7 @@ class ApifyRequestQueueClient(RequestQueueClient):
|
|
|
539
535
|
"""Get a request by unique key, either from cache or by fetching from API.
|
|
540
536
|
|
|
541
537
|
Args:
|
|
542
|
-
unique_key: Unique
|
|
538
|
+
unique_key: Unique key of the request to get.
|
|
543
539
|
|
|
544
540
|
Returns:
|
|
545
541
|
The request if found and valid, otherwise None.
|
|
@@ -645,7 +641,7 @@ class ApifyRequestQueueClient(RequestQueueClient):
|
|
|
645
641
|
if cached_request and cached_request.hydrated:
|
|
646
642
|
items.append(cached_request.hydrated)
|
|
647
643
|
|
|
648
|
-
metadata = await self.
|
|
644
|
+
metadata = await self._get_metadata_estimate()
|
|
649
645
|
|
|
650
646
|
return RequestQueueHead(
|
|
651
647
|
limit=limit,
|
|
@@ -672,6 +668,8 @@ class ApifyRequestQueueClient(RequestQueueClient):
|
|
|
672
668
|
|
|
673
669
|
# Update the queue head cache
|
|
674
670
|
self._queue_has_locked_requests = response.get('queueHasLockedRequests', False)
|
|
671
|
+
# Check if there is another client working with the RequestQueue
|
|
672
|
+
self._metadata.had_multiple_clients = response.get('hadMultipleClients', False)
|
|
675
673
|
|
|
676
674
|
for request_data in response.get('items', []):
|
|
677
675
|
request = Request.model_validate(request_data)
|
|
@@ -38,7 +38,7 @@ apify/storage_clients/_apify/__init__.py,sha256=mtbVDsxqWL3kx30elnh0kAn2kZ4s3BBs
|
|
|
38
38
|
apify/storage_clients/_apify/_dataset_client.py,sha256=8ZQvbtXZm54-V0Ukio0Z4jVI2gGkfqzZ59GlBQJXGUU,11485
|
|
39
39
|
apify/storage_clients/_apify/_key_value_store_client.py,sha256=WbyzDCFmJS2hd_7ddYL3JEO9zvjUAAE1D_F4kohiim4,9455
|
|
40
40
|
apify/storage_clients/_apify/_models.py,sha256=C6FpXswtO6kXE5RUumazm_conzJJS6PrXAGF9XBuDb8,3651
|
|
41
|
-
apify/storage_clients/_apify/_request_queue_client.py,sha256=
|
|
41
|
+
apify/storage_clients/_apify/_request_queue_client.py,sha256=LuKH_7Y9TMU1qtSagWRPsrb5aKcAIp3dkupS9W4615o,32117
|
|
42
42
|
apify/storage_clients/_apify/_storage_client.py,sha256=5me6gHOeNAG3JaHxKRdzsZaa3FsqLDbObjhECGGWrr4,2890
|
|
43
43
|
apify/storage_clients/_apify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
44
44
|
apify/storage_clients/_file_system/__init__.py,sha256=rDbXatXV9wHKPhKTrXDzWnexhTm7sIJQWucMi-P-SD4,130
|
|
@@ -46,7 +46,7 @@ apify/storage_clients/_file_system/_key_value_store_client.py,sha256=DHDv_e0kFwh
|
|
|
46
46
|
apify/storage_clients/_file_system/_storage_client.py,sha256=UwxuSvhbyQ7zR1db1hTmZ1h38yH7btHNp82X7e8MWWE,1290
|
|
47
47
|
apify/storages/__init__.py,sha256=-9tEYJVabVs_eRVhUehxN58GH0UG8OfuGjGwuDieP2M,122
|
|
48
48
|
apify/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
49
|
-
apify-2.7.
|
|
50
|
-
apify-2.7.
|
|
51
|
-
apify-2.7.
|
|
52
|
-
apify-2.7.
|
|
49
|
+
apify-2.7.1b10.dist-info/METADATA,sha256=BA5QmkDxZHnUjb0ZPJxj4lXO7lP6vFdcj3NGYKvGwJc,21801
|
|
50
|
+
apify-2.7.1b10.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
51
|
+
apify-2.7.1b10.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
|
|
52
|
+
apify-2.7.1b10.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|