apify 3.0.0rc1__py3-none-any.whl → 3.0.1b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -0,0 +1,399 @@
+ from __future__ import annotations
+
+ from collections import deque
+ from datetime import datetime, timezone
+ from logging import getLogger
+ from typing import TYPE_CHECKING, Final
+
+ from cachetools import LRUCache
+
+ from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
+
+ from apify import Request
+ from apify.storage_clients._apify._utils import unique_key_to_request_id
+
+ if TYPE_CHECKING:
+     from collections.abc import Sequence
+
+     from apify_client.clients import RequestQueueClientAsync
+
+
+ logger = getLogger(__name__)
+
+
+ class _ApifyRequestQueueSingleClient:
+     """An Apify platform implementation of the request queue client with limited capability.
+
+     This client is designed to use as few resources as possible, but it has to be used in a constrained context.
+     Constraints:
+     - Only one client consumes the request queue at a time.
+     - Multiple producers can put requests into the queue, but their forefront requests are not guaranteed to be handled
+       as quickly, because this client does not aggressively fetch the forefront and relies on local head estimation.
+     - Requests are only added to the queue, never deleted. (Marking as handled is ok.)
+     - Other producers can add new requests, but not modify existing ones (otherwise caching can miss the updates).
+
+     If the constraints are not met, the client might behave in an unpredictable way.
+     """
+
+     _MAX_HEAD_ITEMS: Final[int] = 1000
+     """The maximum number of head items that can be read, as limited by the API."""
+
+     def __init__(
+         self,
+         *,
+         api_client: RequestQueueClientAsync,
+         metadata: RequestQueueMetadata,
+         cache_size: int,
+     ) -> None:
+         """Initialize a new instance.
+
+         Preferably use the `ApifyRequestQueueClient.open` class method to create a new instance.
+         """
+         self.metadata = metadata
+         """Additional data related to the RequestQueue."""
+
+         self._api_client = api_client
+         """The Apify request queue client for API operations."""
+
+         self._requests_cache: LRUCache[str, Request] = LRUCache(maxsize=cache_size)
+         """A cache to store request objects. Request unique key is used as the cache key."""
+
+         self._head_requests: deque[str] = deque()
+         """Ordered unique keys of requests that represent the queue head."""
+
+         self._requests_already_handled: set[str] = set()
+         """Local estimate of unique keys of requests that are already present and handled on the platform.
+
+         - To enhance local deduplication.
+         - To reduce the _requests_cache size. Already handled requests are most likely not going to be needed again,
+           so there is no need to cache more than their unique_key.
+         """
+
+         self._requests_in_progress: set[str] = set()
+         """Set of unique keys of requests that are being processed locally.
+
+         - To help decide if the RQ is finished or not. This is the only consumer, so it can be tracked locally.
+         """
+
+         self._initialized_caches = False
+         """This flag indicates whether the local caches were already initialized.
+
+         Initialization is done lazily, only if deduplication is needed (when calling `add_batch_of_requests`).
+         """
+
+     async def add_batch_of_requests(
+         self,
+         requests: Sequence[Request],
+         *,
+         forefront: bool = False,
+     ) -> AddRequestsResponse:
+         """Add a batch of requests to the queue.
+
+         Args:
+             requests: The requests to add.
+             forefront: Whether to add the requests to the beginning of the queue.
+
+         Returns:
+             Response containing information about the added requests.
+         """
+         if not self._initialized_caches:
+             # One time process to initialize local caches for existing request queues.
+             await self._init_caches()
+             self._initialized_caches = True
+
+         new_requests: list[Request] = []
+         already_present_requests: list[ProcessedRequest] = []
+
+         for request in requests:
+             # Check if request is known to be already handled (it has to be present as well.)
+             if request.unique_key in self._requests_already_handled:
+                 already_present_requests.append(
+                     ProcessedRequest.model_validate(
+                         {
+                             'uniqueKey': request.unique_key,
+                             'wasAlreadyPresent': True,
+                             'wasAlreadyHandled': True,
+                         }
+                     )
+                 )
+             # Check if request is known to be already present, but unhandled
+             elif self._requests_cache.get(request.unique_key):
+                 already_present_requests.append(
+                     ProcessedRequest.model_validate(
+                         {
+                             'uniqueKey': request.unique_key,
+                             'wasAlreadyPresent': True,
+                             'wasAlreadyHandled': request.was_already_handled,
+                         }
+                     )
+                 )
+             else:
+                 # Push the request to the platform. Probably not there, or we are not aware of it
+                 new_requests.append(request)
+
+                 # Update local caches
+                 self._requests_cache[request.unique_key] = request
+                 if forefront:
+                     self._head_requests.append(request.unique_key)
+                 else:
+                     self._head_requests.appendleft(request.unique_key)
+
+         if new_requests:
+             # Prepare requests for API by converting to dictionaries.
+             requests_dict = [
+                 request.model_dump(
+                     by_alias=True,
+                 )
+                 for request in new_requests
+             ]
+
+             # Send requests to API.
+             api_response = AddRequestsResponse.model_validate(
+                 await self._api_client.batch_add_requests(requests=requests_dict, forefront=forefront)
+             )
+             # Add the locally known already present processed requests based on the local cache.
+             api_response.processed_requests.extend(already_present_requests)
+             # Remove unprocessed requests from the cache
+             for unprocessed_request in api_response.unprocessed_requests:
+                 self._requests_cache.pop(unprocessed_request.unique_key, None)
+
+         else:
+             api_response = AddRequestsResponse.model_validate(
+                 {'unprocessedRequests': [], 'processedRequests': already_present_requests}
+             )
+
+         # Update assumed total count for newly added requests.
+         new_request_count = 0
+         for processed_request in api_response.processed_requests:
+             if not processed_request.was_already_present and not processed_request.was_already_handled:
+                 new_request_count += 1
+         self.metadata.total_request_count += new_request_count
+         self.metadata.pending_request_count += new_request_count
+
+         return api_response
+
+     async def get_request(self, unique_key: str) -> Request | None:
+         """Get a request by unique key.
+
+         Args:
+             unique_key: Unique key of the request to get.
+
+         Returns:
+             The request or None if not found.
+         """
+         if unique_key in self._requests_cache:
+             return self._requests_cache[unique_key]
+
+         response = await self._api_client.get_request(unique_key_to_request_id(unique_key))
+
+         if response is None:
+             return None
+
+         return Request.model_validate(response)
+
+     async def fetch_next_request(self) -> Request | None:
+         """Return the next request in the queue to be processed.
+
+         Once you successfully finish processing of the request, you need to call `mark_request_as_handled`
+         to mark the request as handled in the queue. If there was some error in processing the request, call
+         `reclaim_request` instead, so that the queue will give the request to some other consumer
+         in another call to the `fetch_next_request` method.
+
+         Returns:
+             The request or `None` if there are no more pending requests.
+         """
+         await self._ensure_head_is_non_empty()
+
+         while self._head_requests:
+             request_unique_key = self._head_requests.pop()
+             if (
+                 request_unique_key not in self._requests_in_progress
+                 and request_unique_key not in self._requests_already_handled
+             ):
+                 self._requests_in_progress.add(request_unique_key)
+                 return await self.get_request(request_unique_key)
+         # No requests locally, and the ones returned from the platform are already in progress.
+         return None
+
+     async def _ensure_head_is_non_empty(self) -> None:
+         """Ensure that the queue head has requests if they are available in the queue."""
+         if len(self._head_requests) <= 1:
+             await self._list_head()
+
+     async def _list_head(self) -> None:
+         desired_new_head_items = 200
+         # The head will contain in progress requests as well, so we need to fetch more, to get some new ones.
+         requested_head_items = min(self._MAX_HEAD_ITEMS, desired_new_head_items + len(self._requests_in_progress))
+         response = await self._api_client.list_head(limit=requested_head_items)
+
+         # Update metadata
+         # Check if there is another client working with the RequestQueue
+         self.metadata.had_multiple_clients = response.get('hadMultipleClients', False)
+         # Should warn once? This might be outside the expected context if another consumer consumes at the same time.
+
+         if modified_at := response.get('queueModifiedAt'):
+             self.metadata.modified_at = max(self.metadata.modified_at, modified_at)
+
+         # Update the cached data
+         for request_data in response.get('items', []):
+             request = Request.model_validate(request_data)
+
+             if request.unique_key in self._requests_in_progress:
+                 # Ignore requests that are already in progress, we will not process them again.
+                 continue
+             if request.was_already_handled:
+                 # Do not cache fully handled requests, we do not need them. Just cache their unique_key.
+                 self._requests_already_handled.add(request.unique_key)
+             else:
+                 # Only fetch the request if we do not know it yet.
+                 if request.unique_key not in self._requests_cache:
+                     request = Request.model_validate(
+                         await self._api_client.get_request(unique_key_to_request_id(request.unique_key))
+                     )
+                     self._requests_cache[request.unique_key] = request
+
+                 # Add new requests to the end of the head, unless already present in the head
+                 if request.unique_key not in self._head_requests:
+                     self._head_requests.appendleft(request.unique_key)
+
+     async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
+         """Mark a request as handled after successful processing.
+
+         Handled requests will never again be returned by the `fetch_next_request` method.
+
+         Args:
+             request: The request to mark as handled.
+
+         Returns:
+             Information about the queue operation. `None` if the given request was not in progress.
+         """
+         # Set the handled_at timestamp if not already set
+
+         if request.handled_at is None:
+             request.handled_at = datetime.now(tz=timezone.utc)
+             self.metadata.handled_request_count += 1
+             self.metadata.pending_request_count -= 1
+
+         if cached_request := self._requests_cache.get(request.unique_key):
+             cached_request.handled_at = request.handled_at
+
+         try:
+             # Update the request in the API
+             # Works as upsert - adds the request if it does not exist yet. (Local request that was handled before
+             # adding to the queue.)
+             processed_request = await self._update_request(request)
+             # Remember that we handled this request, to optimize local deduplication.
+             self._requests_already_handled.add(request.unique_key)
+             # Remove request from cache. It will most likely not be needed.
+             self._requests_cache.pop(request.unique_key, None)
+             self._requests_in_progress.discard(request.unique_key)
+
+         except Exception as exc:
+             logger.debug(f'Error marking request {request.unique_key} as handled: {exc!s}')
+             return None
+         else:
+             return processed_request
+
+     async def reclaim_request(
+         self,
+         request: Request,
+         *,
+         forefront: bool = False,
+     ) -> ProcessedRequest | None:
+         """Reclaim a failed request back to the queue.
+
+         The request will be returned for processing later again by another call to `fetch_next_request`.
+
+         Args:
+             request: The request to return to the queue.
+             forefront: Whether to add the request to the head or the end of the queue.
+
+         Returns:
+             Information about the queue operation. `None` if the given request was not in progress.
+         """
+         # Check if the request was marked as handled and clear it. When reclaiming,
+         # we want to put the request back for processing.
+         if request.was_already_handled:
+             request.handled_at = None
+
+         try:
+             # Make sure request is in the local cache. We might need it.
+             self._requests_cache[request.unique_key] = request
+
+             # No longer in progress
+             self._requests_in_progress.discard(request.unique_key)
+             # No longer handled
+             self._requests_already_handled.discard(request.unique_key)
+
+             if forefront:
+                 # Append to top of the local head estimation
+                 self._head_requests.append(request.unique_key)
+
+             processed_request = await self._update_request(request, forefront=forefront)
+             processed_request.unique_key = request.unique_key
+             # If the request was previously handled, decrement our handled count since
+             # we're putting it back for processing.
+             if request.was_already_handled and not processed_request.was_already_handled:
+                 self.metadata.handled_request_count -= 1
+                 self.metadata.pending_request_count += 1
+
+         except Exception as exc:
+             logger.debug(f'Error reclaiming request {request.unique_key}: {exc!s}')
+             return None
+         else:
+             return processed_request
+
+     async def is_empty(self) -> bool:
+         """Check if the queue is empty.
+
+         Returns:
+             True if the queue is empty, False otherwise.
+         """
+         # Without a lock, `is_empty` is prone to falsely reporting True due to a low-probability race condition.
+         await self._ensure_head_is_non_empty()
+         return not self._head_requests and not self._requests_in_progress
+
+     async def _update_request(
+         self,
+         request: Request,
+         *,
+         forefront: bool = False,
+     ) -> ProcessedRequest:
+         """Update a request in the queue.
+
+         Args:
+             request: The updated request.
+             forefront: Whether to put the updated request at the beginning or the end of the queue.
+
+         Returns:
+             Information about the updated request.
+         """
+         request_dict = request.model_dump(by_alias=True)
+         request_dict['id'] = unique_key_to_request_id(request.unique_key)
+         response = await self._api_client.update_request(
+             request=request_dict,
+             forefront=forefront,
+         )
+
+         return ProcessedRequest.model_validate(
+             {'uniqueKey': request.unique_key} | response,
+         )
+
+     async def _init_caches(self) -> None:
+         """Initialize the local caches by getting requests from the existing queue.
+
+         This is mainly done to improve the local deduplication capability. Listing requests can return up to 10k
+         requests, but their order is an implementation detail and respects neither head order nor insertion order.
+
+         Deduplication on the platform is expensive: it takes 1 API call per request and 1 write operation per request.
+         Local deduplication is cheaper: it takes 1 API call for the whole cache and 1 read operation per request.
+         """
+         response = await self._api_client.list_requests(limit=10_000)
+         for request_data in response.get('items', []):
+             request = Request.model_validate(request_data)
+             if request.was_already_handled:
+                 # Cache just the unique_key for deduplication
+                 self._requests_already_handled.add(request.unique_key)
+             else:
+                 # Cache the full request
+                 self._requests_cache[request.unique_key] = request
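
For orientation, the following is a minimal sketch (not part of the diff) of the single-consumer loop this client is designed for. The `rq` instance (an already opened request queue client of this type) and the `process(request)` coroutine are hypothetical stand-ins, not names from the SDK.

    # Hypothetical single-consumer loop; `rq` and `process` are illustrative stand-ins.
    async def drain_queue(rq, process) -> None:
        while not await rq.is_empty():
            request = await rq.fetch_next_request()
            if request is None:
                # The local head estimate is momentarily empty or contains only in-progress items.
                continue
            try:
                await process(request)
            except Exception:
                # Processing failed: return the request to the queue so it can be fetched again later.
                await rq.reclaim_request(request)
            else:
                # Success: the request will never be returned by fetch_next_request again.
                await rq.mark_request_as_handled(request)
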
@@ -1,6 +1,6 @@
  from __future__ import annotations

- from typing import TYPE_CHECKING
+ from typing import TYPE_CHECKING, Literal

  from typing_extensions import override

@@ -9,35 +9,69 @@ from crawlee.storage_clients._base import StorageClient
  from ._dataset_client import ApifyDatasetClient
  from ._key_value_store_client import ApifyKeyValueStoreClient
  from ._request_queue_client import ApifyRequestQueueClient
+ from ._utils import hash_api_base_url_and_token
+ from apify._configuration import Configuration as ApifyConfiguration
  from apify._utils import docs_group

  if TYPE_CHECKING:
-     from crawlee.configuration import Configuration
+     from collections.abc import Hashable
+
+     from crawlee.configuration import Configuration as CrawleeConfiguration


  @docs_group('Storage clients')
  class ApifyStorageClient(StorageClient):
      """Apify storage client."""

+     def __init__(self, *, request_queue_access: Literal['single', 'shared'] = 'single') -> None:
+         """Initialize the Apify storage client.
+
+         Args:
+             request_queue_access: Controls the implementation of the request queue client based on the expected scenario:
+                 - 'single' is suitable for single-consumer scenarios. It makes fewer API calls and is cheaper and faster.
+                 - 'shared' is suitable for multiple-consumer scenarios, at the cost of higher API usage.
+                 Detailed constraints for the 'single' access type:
+                 - Only one client consumes the request queue at a time.
+                 - Multiple producers can put requests into the queue, but their forefront requests are not guaranteed
+                   to be handled as quickly, because this client does not aggressively fetch the forefront and relies
+                   on local head estimation.
+                 - Requests are only added to the queue, never deleted by other clients. (Marking as handled is ok.)
+                 - Other producers can add new requests, but not modify existing ones.
+                   (Modifications would not be included in the local cache.)
+         """
+         self._request_queue_access = request_queue_access
+
+     # This class breaches the Liskov Substitution Principle: it requires a specialized Configuration compared to its parent.
+     _lsp_violation_error_message_template = (
+         'Expected "configuration" to be an instance of "apify.Configuration", but got {} instead.'
+     )
+
+     @override
+     def get_storage_client_cache_key(self, configuration: CrawleeConfiguration) -> Hashable:
+         if isinstance(configuration, ApifyConfiguration):
+             # It is not supported to open exactly the same queue with a 'single' and a 'shared' client at the same time.
+             # Whichever client variation gets used first wins.
+             return super().get_storage_client_cache_key(configuration), hash_api_base_url_and_token(configuration)
+
+         config_class = type(configuration)
+         raise TypeError(
+             self._lsp_violation_error_message_template.format(f'{config_class.__module__}.{config_class.__name__}')
+         )
+
      @override
      async def create_dataset_client(
          self,
          *,
          id: str | None = None,
          name: str | None = None,
-         configuration: Configuration | None = None,
+         alias: str | None = None,
+         configuration: CrawleeConfiguration | None = None,
      ) -> ApifyDatasetClient:
-         # Import here to avoid circular imports.
-         from apify import Configuration as ApifyConfiguration # noqa: PLC0415
-
          configuration = configuration or ApifyConfiguration.get_global_configuration()
          if isinstance(configuration, ApifyConfiguration):
-             return await ApifyDatasetClient.open(id=id, name=name, configuration=configuration)
+             return await ApifyDatasetClient.open(id=id, name=name, alias=alias, configuration=configuration)

-         raise TypeError(
-             f'Expected "configuration" to be an instance of "apify.Configuration", '
-             f'but got {type(configuration).__name__} instead.'
-         )
+         raise TypeError(self._lsp_violation_error_message_template.format(type(configuration).__name__))

      @override
      async def create_kvs_client(
@@ -45,19 +79,14 @@ class ApifyStorageClient(StorageClient):
          *,
          id: str | None = None,
          name: str | None = None,
-         configuration: Configuration | None = None,
+         alias: str | None = None,
+         configuration: CrawleeConfiguration | None = None,
      ) -> ApifyKeyValueStoreClient:
-         # Import here to avoid circular imports.
-         from apify import Configuration as ApifyConfiguration # noqa: PLC0415
-
          configuration = configuration or ApifyConfiguration.get_global_configuration()
          if isinstance(configuration, ApifyConfiguration):
-             return await ApifyKeyValueStoreClient.open(id=id, name=name, configuration=configuration)
+             return await ApifyKeyValueStoreClient.open(id=id, name=name, alias=alias, configuration=configuration)

-         raise TypeError(
-             f'Expected "configuration" to be an instance of "apify.Configuration", '
-             f'but got {type(configuration).__name__} instead.'
-         )
+         raise TypeError(self._lsp_violation_error_message_template.format(type(configuration).__name__))

      @override
      async def create_rq_client(
@@ -65,16 +94,13 @@ class ApifyStorageClient(StorageClient):
          *,
          id: str | None = None,
          name: str | None = None,
-         configuration: Configuration | None = None,
+         alias: str | None = None,
+         configuration: CrawleeConfiguration | None = None,
      ) -> ApifyRequestQueueClient:
-         # Import here to avoid circular imports.
-         from apify import Configuration as ApifyConfiguration # noqa: PLC0415
-
          configuration = configuration or ApifyConfiguration.get_global_configuration()
          if isinstance(configuration, ApifyConfiguration):
-             return await ApifyRequestQueueClient.open(id=id, name=name, configuration=configuration)
+             return await ApifyRequestQueueClient.open(
+                 id=id, name=name, alias=alias, configuration=configuration, access=self._request_queue_access
+             )

-         raise TypeError(
-             f'Expected "configuration" to be an instance of "apify.Configuration", '
-             f'but got {type(configuration).__name__} instead.'
-         )
+         raise TypeError(self._lsp_violation_error_message_template.format(type(configuration).__name__))
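
To illustrate the new `request_queue_access` option, here is a hedged usage sketch (not part of the diff). It assumes the public import path `apify.storage_clients.ApifyStorageClient`, that `apify.Configuration` is the global configuration (for example when running on the Apify platform), and an illustrative queue name 'my-queue'.

    from apify.storage_clients import ApifyStorageClient

    async def open_rq_client():
        # 'single' (the default) minimizes API calls for a lone consumer; 'shared' tolerates multiple consumers.
        storage_client = ApifyStorageClient(request_queue_access='shared')
        # The chosen access mode is forwarded to ApifyRequestQueueClient.open() via create_rq_client().
        return await storage_client.create_rq_client(name='my-queue')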