apify 2.7.1b8__py3-none-any.whl → 2.7.1b9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of apify might be problematic. Click here for more details.

@@ -13,6 +13,7 @@ from cachetools import LRUCache
13
13
  from typing_extensions import override
14
14
 
15
15
  from apify_client import ApifyClientAsync
16
+ from crawlee._utils.crypto import crypto_random_object_id
16
17
  from crawlee.storage_clients._base import RequestQueueClient
17
18
  from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
18
19
 
@@ -65,10 +66,7 @@ class ApifyRequestQueueClient(RequestQueueClient):
65
66
  self,
66
67
  *,
67
68
  api_client: RequestQueueClientAsync,
68
- id: str,
69
- name: str | None,
70
- total_request_count: int,
71
- handled_request_count: int,
69
+ metadata: RequestQueueMetadata,
72
70
  ) -> None:
73
71
  """Initialize a new instance.
74
72
 
@@ -77,11 +75,8 @@ class ApifyRequestQueueClient(RequestQueueClient):
77
75
  self._api_client = api_client
78
76
  """The Apify request queue client for API operations."""
79
77
 
80
- self._id = id
81
- """The ID of the request queue."""
82
-
83
- self._name = name
84
- """The name of the request queue."""
78
+ self._metadata = metadata
79
+ """Additional data related to the RequestQueue."""
85
80
 
86
81
  self._queue_head = deque[str]()
87
82
  """A deque to store request unique keys in the queue head."""
@@ -95,40 +90,43 @@ class ApifyRequestQueueClient(RequestQueueClient):
95
90
  self._should_check_for_forefront_requests = False
96
91
  """Whether to check for forefront requests in the next list_head call."""
97
92
 
98
- self._had_multiple_clients = False
99
- """Whether the request queue has been accessed by multiple clients."""
100
-
101
- self._initial_total_count = total_request_count
102
- """The initial total request count (from the API) when the queue was opened."""
103
-
104
- self._initial_handled_count = handled_request_count
105
- """The initial handled request count (from the API) when the queue was opened."""
106
-
107
- self._assumed_total_count = 0
108
- """The number of requests we assume are in the queue (tracked manually for this instance)."""
109
-
110
- self._assumed_handled_count = 0
111
- """The number of requests we assume have been handled (tracked manually for this instance)."""
112
-
113
93
  self._fetch_lock = asyncio.Lock()
114
94
  """Fetch lock to minimize race conditions when communicating with API."""
115
95
 
96
+ async def _get_metadata_estimate(self) -> RequestQueueMetadata:
97
+ """Try to get cached metadata first. If multiple clients, fuse with global metadata.
98
+
99
+ This method is used internally to avoid an unnecessary API call unless one is needed (multiple clients).
100
+ The local metadata estimate is available without delay, unlike metadata from the API. In a situation where there is only one
101
+ client, it is the better choice.
102
+ """
103
+ if self._metadata.had_multiple_clients:
104
+ return await self.get_metadata()
105
+ # Get local estimation (will not include changes done by another client)
106
+ return self._metadata
107
+
116
108
  @override
117
109
  async def get_metadata(self) -> RequestQueueMetadata:
118
- total_count = self._initial_total_count + self._assumed_total_count
119
- handled_count = self._initial_handled_count + self._assumed_handled_count
120
- pending_count = total_count - handled_count
110
+ """Get metadata about the request queue.
121
111
 
112
+ Returns:
113
+ Metadata from the API, merged with local estimation, because in some cases, the data from the API can
114
+ be delayed.
115
+ """
116
+ response = await self._api_client.get()
117
+ if response is None:
118
+ raise ValueError('Failed to fetch request queue metadata from the API.')
119
+ # Enhance the API response with local estimations (the API can be delayed by a few seconds, while the local estimation is not).
122
120
  return RequestQueueMetadata(
123
- id=self._id,
124
- name=self._name,
125
- total_request_count=total_count,
126
- handled_request_count=handled_count,
127
- pending_request_count=pending_count,
128
- created_at=datetime.now(timezone.utc),
129
- modified_at=datetime.now(timezone.utc),
130
- accessed_at=datetime.now(timezone.utc),
131
- had_multiple_clients=self._had_multiple_clients,
121
+ id=response['id'],
122
+ name=response['name'],
123
+ total_request_count=max(response['totalRequestCount'], self._metadata.total_request_count),
124
+ handled_request_count=max(response['handledRequestCount'], self._metadata.handled_request_count),
125
+ pending_request_count=response['pendingRequestCount'],
126
+ created_at=min(response['createdAt'], self._metadata.created_at),
127
+ modified_at=max(response['modifiedAt'], self._metadata.modified_at),
128
+ accessed_at=max(response['accessedAt'], self._metadata.accessed_at),
129
+ had_multiple_clients=response['hadMultipleClients'] or self._metadata.had_multiple_clients,
132
130
  )
133
131
 
134
132
  @classmethod
@@ -187,27 +185,34 @@ class ApifyRequestQueueClient(RequestQueueClient):
187
185
  )
188
186
  apify_rqs_client = apify_client_async.request_queues()
189
187
 
190
- # If both id and name are provided, raise an error.
191
- if id and name:
192
- raise ValueError('Only one of "id" or "name" can be specified, not both.')
193
-
194
- # If id is provided, get the storage by ID.
195
- if id and name is None:
196
- apify_rq_client = apify_client_async.request_queue(request_queue_id=id)
188
+ match (id, name):
189
+ case (None, None):
190
+ # If both id and name are None, try to get the default storage ID from environment variables.
191
+ # The default storage ID environment variable is set by the Apify platform. It also contains
192
+ # a new storage ID after Actor's reboot or migration.
193
+ id = configuration.default_request_queue_id
194
+ case (None, name):
195
+ # If only name is provided, get or create the storage by name.
196
+ id = RequestQueueMetadata.model_validate(
197
+ await apify_rqs_client.get_or_create(name=name),
198
+ ).id
199
+ case (_, None):
200
+ # If only id is provided, use it.
201
+ pass
202
+ case (_, _):
203
+ # If both id and name are provided, raise an error.
204
+ raise ValueError('Only one of "id" or "name" can be specified, not both.')
205
+ if id is None:
206
+ raise RuntimeError('Unreachable code')
197
207
 
198
- # If name is provided, get or create the storage by name.
199
- if name and id is None:
200
- id = RequestQueueMetadata.model_validate(
201
- await apify_rqs_client.get_or_create(name=name),
202
- ).id
203
- apify_rq_client = apify_client_async.request_queue(request_queue_id=id)
208
+ # Use suitable client_key to make `hadMultipleClients` response of Apify API useful.
209
+ # It should persist across migrated or resurrected Actor runs on the Apify platform.
210
+ _api_max_client_key_length = 32
211
+ client_key = (configuration.actor_run_id or crypto_random_object_id(length=_api_max_client_key_length))[
212
+ :_api_max_client_key_length
213
+ ]
204
214
 
205
- # If both id and name are None, try to get the default storage ID from environment variables.
206
- # The default storage ID environment variable is set by the Apify platform. It also contains
207
- # a new storage ID after Actor's reboot or migration.
208
- if id is None and name is None:
209
- id = configuration.default_request_queue_id
210
- apify_rq_client = apify_client_async.request_queue(request_queue_id=id)
215
+ apify_rq_client = apify_client_async.request_queue(request_queue_id=id, client_key=client_key)
211
216
 
212
217
  # Fetch its metadata.
213
218
  metadata = await apify_rq_client.get()
@@ -217,27 +222,18 @@ class ApifyRequestQueueClient(RequestQueueClient):
217
222
  id = RequestQueueMetadata.model_validate(
218
223
  await apify_rqs_client.get_or_create(),
219
224
  ).id
220
- apify_rq_client = apify_client_async.request_queue(request_queue_id=id)
225
+ apify_rq_client = apify_client_async.request_queue(request_queue_id=id, client_key=client_key)
221
226
 
222
227
  # Verify that the storage exists by fetching its metadata again.
223
228
  metadata = await apify_rq_client.get()
224
229
  if metadata is None:
225
230
  raise ValueError(f'Opening request queue with id={id} and name={name} failed.')
226
231
 
227
- metadata_model = RequestQueueMetadata.model_validate(
228
- await apify_rqs_client.get_or_create(),
229
- )
230
-
231
- # Ensure we have a valid ID.
232
- if id is None:
233
- raise ValueError('Request queue ID cannot be None.')
232
+ metadata_model = RequestQueueMetadata.model_validate(metadata)
234
233
 
235
234
  return cls(
236
235
  api_client=apify_rq_client,
237
- id=id,
238
- name=name,
239
- total_request_count=metadata_model.total_request_count,
240
- handled_request_count=metadata_model.handled_request_count,
236
+ metadata=metadata_model,
241
237
  )
242
238
 
243
239
  @override
@@ -341,7 +337,7 @@ class ApifyRequestQueueClient(RequestQueueClient):
341
337
  if not processed_request.was_already_present and not processed_request.was_already_handled:
342
338
  new_request_count += 1
343
339
 
344
- self._assumed_total_count += new_request_count
340
+ self._metadata.total_request_count += new_request_count
345
341
 
346
342
  return api_response
347
343
 
@@ -439,7 +435,7 @@ class ApifyRequestQueueClient(RequestQueueClient):
439
435
 
440
436
  # Update assumed handled count if this wasn't already handled
441
437
  if not processed_request.was_already_handled:
442
- self._assumed_handled_count += 1
438
+ self._metadata.handled_request_count += 1
443
439
 
444
440
  # Update the cache with the handled request
445
441
  cache_key = request.unique_key
@@ -487,7 +483,7 @@ class ApifyRequestQueueClient(RequestQueueClient):
487
483
  # If the request was previously handled, decrement our handled count since
488
484
  # we're putting it back for processing.
489
485
  if request.was_already_handled and not processed_request.was_already_handled:
490
- self._assumed_handled_count -= 1
486
+ self._metadata.handled_request_count -= 1
491
487
 
492
488
  # Update the cache
493
489
  cache_key = request.unique_key
@@ -539,7 +535,7 @@ class ApifyRequestQueueClient(RequestQueueClient):
539
535
  """Get a request by unique key, either from cache or by fetching from API.
540
536
 
541
537
  Args:
542
- unique_key: Unique keu of the request to get.
538
+ unique_key: Unique key of the request to get.
543
539
 
544
540
  Returns:
545
541
  The request if found and valid, otherwise None.
@@ -645,7 +641,7 @@ class ApifyRequestQueueClient(RequestQueueClient):
645
641
  if cached_request and cached_request.hydrated:
646
642
  items.append(cached_request.hydrated)
647
643
 
648
- metadata = await self.get_metadata()
644
+ metadata = await self._get_metadata_estimate()
649
645
 
650
646
  return RequestQueueHead(
651
647
  limit=limit,
@@ -672,6 +668,8 @@ class ApifyRequestQueueClient(RequestQueueClient):
672
668
 
673
669
  # Update the queue head cache
674
670
  self._queue_has_locked_requests = response.get('queueHasLockedRequests', False)
671
+ # Check if there is another client working with the RequestQueue
672
+ self._metadata.had_multiple_clients = response.get('hadMultipleClients', False)
675
673
 
676
674
  for request_data in response.get('items', []):
677
675
  request = Request.model_validate(request_data)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: apify
3
- Version: 2.7.1b8
3
+ Version: 2.7.1b9
4
4
  Summary: Apify SDK for Python
5
5
  Project-URL: Apify Homepage, https://apify.com
6
6
  Project-URL: Changelog, https://docs.apify.com/sdk/python/docs/changelog
@@ -38,7 +38,7 @@ apify/storage_clients/_apify/__init__.py,sha256=mtbVDsxqWL3kx30elnh0kAn2kZ4s3BBs
38
38
  apify/storage_clients/_apify/_dataset_client.py,sha256=8ZQvbtXZm54-V0Ukio0Z4jVI2gGkfqzZ59GlBQJXGUU,11485
39
39
  apify/storage_clients/_apify/_key_value_store_client.py,sha256=WbyzDCFmJS2hd_7ddYL3JEO9zvjUAAE1D_F4kohiim4,9455
40
40
  apify/storage_clients/_apify/_models.py,sha256=C6FpXswtO6kXE5RUumazm_conzJJS6PrXAGF9XBuDb8,3651
41
- apify/storage_clients/_apify/_request_queue_client.py,sha256=sMinB02V9236PH7fQTQ0AIwH6oObnZv2ivzKBEgaLOk,31372
41
+ apify/storage_clients/_apify/_request_queue_client.py,sha256=LuKH_7Y9TMU1qtSagWRPsrb5aKcAIp3dkupS9W4615o,32117
42
42
  apify/storage_clients/_apify/_storage_client.py,sha256=5me6gHOeNAG3JaHxKRdzsZaa3FsqLDbObjhECGGWrr4,2890
43
43
  apify/storage_clients/_apify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
44
  apify/storage_clients/_file_system/__init__.py,sha256=rDbXatXV9wHKPhKTrXDzWnexhTm7sIJQWucMi-P-SD4,130
@@ -46,7 +46,7 @@ apify/storage_clients/_file_system/_key_value_store_client.py,sha256=DHDv_e0kFwh
46
46
  apify/storage_clients/_file_system/_storage_client.py,sha256=UwxuSvhbyQ7zR1db1hTmZ1h38yH7btHNp82X7e8MWWE,1290
47
47
  apify/storages/__init__.py,sha256=-9tEYJVabVs_eRVhUehxN58GH0UG8OfuGjGwuDieP2M,122
48
48
  apify/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
49
- apify-2.7.1b8.dist-info/METADATA,sha256=9OY09tUOf00U-K6_kvD63Vs8DKsDEGJs4SPTaPhwzq0,21800
50
- apify-2.7.1b8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
51
- apify-2.7.1b8.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
52
- apify-2.7.1b8.dist-info/RECORD,,
49
+ apify-2.7.1b9.dist-info/METADATA,sha256=bPUGmcC2s7uUbJoz7SGfiMRCpYoiuc16FPNBlD-5f2k,21800
50
+ apify-2.7.1b9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
51
+ apify-2.7.1b9.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
52
+ apify-2.7.1b9.dist-info/RECORD,,