apify 2.7.2__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in those public registries.

Potentially problematic release.


Files changed (51)
  1. apify/_actor.py +194 -126
  2. apify/_charging.py +34 -9
  3. apify/_configuration.py +79 -6
  4. apify/_crypto.py +0 -6
  5. apify/_models.py +7 -7
  6. apify/_proxy_configuration.py +10 -10
  7. apify/_utils.py +25 -2
  8. apify/events/__init__.py +5 -0
  9. apify/events/_apify_event_manager.py +140 -0
  10. apify/events/_types.py +102 -0
  11. apify/log.py +0 -9
  12. apify/request_loaders/__init__.py +18 -0
  13. apify/{storages/_request_list.py → request_loaders/_apify_request_list.py} +25 -18
  14. apify/request_loaders/py.typed +0 -0
  15. apify/scrapy/_logging_config.py +1 -4
  16. apify/scrapy/extensions/_httpcache.py +9 -5
  17. apify/scrapy/requests.py +3 -3
  18. apify/scrapy/scheduler.py +8 -5
  19. apify/storage_clients/__init__.py +12 -0
  20. apify/storage_clients/_apify/__init__.py +11 -0
  21. apify/storage_clients/_apify/_dataset_client.py +328 -0
  22. apify/storage_clients/_apify/_key_value_store_client.py +265 -0
  23. apify/storage_clients/_apify/_models.py +131 -0
  24. apify/storage_clients/_apify/_request_queue_client.py +327 -0
  25. apify/storage_clients/_apify/_request_queue_shared_client.py +527 -0
  26. apify/storage_clients/_apify/_request_queue_single_client.py +399 -0
  27. apify/storage_clients/_apify/_storage_client.py +106 -0
  28. apify/storage_clients/_apify/_utils.py +194 -0
  29. apify/storage_clients/_apify/py.typed +0 -0
  30. apify/storage_clients/_file_system/__init__.py +2 -0
  31. apify/storage_clients/_file_system/_key_value_store_client.py +57 -0
  32. apify/storage_clients/_file_system/_storage_client.py +41 -0
  33. apify/storage_clients/_smart_apify/__init__.py +1 -0
  34. apify/storage_clients/_smart_apify/_storage_client.py +117 -0
  35. apify/storage_clients/py.typed +0 -0
  36. apify/storages/__init__.py +1 -3
  37. {apify-2.7.2.dist-info → apify-3.0.0.dist-info}/METADATA +25 -9
  38. apify-3.0.0.dist-info/RECORD +57 -0
  39. apify/_platform_event_manager.py +0 -231
  40. apify/apify_storage_client/__init__.py +0 -3
  41. apify/apify_storage_client/_apify_storage_client.py +0 -72
  42. apify/apify_storage_client/_dataset_client.py +0 -190
  43. apify/apify_storage_client/_dataset_collection_client.py +0 -51
  44. apify/apify_storage_client/_key_value_store_client.py +0 -109
  45. apify/apify_storage_client/_key_value_store_collection_client.py +0 -51
  46. apify/apify_storage_client/_request_queue_client.py +0 -176
  47. apify/apify_storage_client/_request_queue_collection_client.py +0 -51
  48. apify-2.7.2.dist-info/RECORD +0 -44
  49. /apify/{apify_storage_client → events}/py.typed +0 -0
  50. {apify-2.7.2.dist-info → apify-3.0.0.dist-info}/WHEEL +0 -0
  51. {apify-2.7.2.dist-info → apify-3.0.0.dist-info}/licenses/LICENSE +0 -0
--- /dev/null
+++ b/apify/storage_clients/_apify/_request_queue_shared_client.py
@@ -0,0 +1,527 @@
+from __future__ import annotations
+
+import asyncio
+from collections import deque
+from datetime import datetime, timedelta, timezone
+from logging import getLogger
+from typing import TYPE_CHECKING, Any, Final
+
+from cachetools import LRUCache
+
+from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
+
+from ._models import ApifyRequestQueueMetadata, CachedRequest, RequestQueueHead
+from ._utils import unique_key_to_request_id
+from apify import Request
+
+if TYPE_CHECKING:
+    from collections.abc import Callable, Coroutine, Sequence
+
+    from apify_client.clients import RequestQueueClientAsync
+
+
+logger = getLogger(__name__)
+
+
+class _ApifyRequestQueueSharedClient:
+    """An Apify platform implementation of the request queue client.
+
+    This implementation supports multiple producers and multiple consumers scenario.
+    """
+
+    _DEFAULT_LOCK_TIME: Final[timedelta] = timedelta(minutes=3)
+    """The default lock time for requests in the queue."""
+
+    def __init__(
+        self,
+        *,
+        api_client: RequestQueueClientAsync,
+        metadata: RequestQueueMetadata,
+        cache_size: int,
+        metadata_getter: Callable[[], Coroutine[Any, Any, ApifyRequestQueueMetadata]],
+    ) -> None:
+        """Initialize a new instance.
+
+        Preferably use the `ApifyRequestQueueClient.open` class method to create a new instance.
+        """
+        self.metadata = metadata
+        """Additional data related to the RequestQueue."""
+
+        self._metadata_getter = metadata_getter
+        """Async function to get metadata from API."""
+
+        self._api_client = api_client
+        """The Apify request queue client for API operations."""
+
+        self._queue_head = deque[str]()
+        """A deque to store request unique keys in the queue head."""
+
+        self._requests_cache: LRUCache[str, CachedRequest] = LRUCache(maxsize=cache_size)
+        """A cache to store request objects. Request unique key is used as the cache key."""
+
+        self._queue_has_locked_requests: bool | None = None
+        """Whether the queue has requests locked by another client."""
+
+        self._should_check_for_forefront_requests = False
+        """Whether to check for forefront requests in the next list_head call."""
+
+        self._fetch_lock = asyncio.Lock()
+        """Fetch lock to minimize race conditions when communicating with API."""
+
+    async def _get_metadata_estimate(self) -> RequestQueueMetadata:
+        """Try to get cached metadata first. If multiple clients, fuse with global metadata.
+
+        This method is used internally to avoid unnecessary API call unless needed (multiple clients).
+        Local estimation of metadata is without delay, unlike metadata from API. In situation where there is only one
+        client, it is the better choice.
+        """
+        if self.metadata.had_multiple_clients:
+            return await self._metadata_getter()
+        # Get local estimation (will not include changes done by another client)
+        return self.metadata
+
+    async def add_batch_of_requests(
+        self,
+        requests: Sequence[Request],
+        *,
+        forefront: bool = False,
+    ) -> AddRequestsResponse:
+        """Add a batch of requests to the queue.
+
+        Args:
+            requests: The requests to add.
+            forefront: Whether to add the requests to the beginning of the queue.
+
+        Returns:
+            Response containing information about the added requests.
+        """
+        # Do not try to add previously added requests to avoid pointless expensive calls to API
+
+        new_requests: list[Request] = []
+        already_present_requests: list[ProcessedRequest] = []
+
+        for request in requests:
+            if self._requests_cache.get(request.unique_key):
+                # We are not sure if it was already handled at this point, and it is not worth calling API for it.
+                # It could have been handled by another client in the meantime, so cached information about
+                # `request.was_already_handled` is not reliable.
+                already_present_requests.append(
+                    ProcessedRequest.model_validate(
+                        {
+                            'uniqueKey': request.unique_key,
+                            'wasAlreadyPresent': True,
+                            'wasAlreadyHandled': request.was_already_handled,
+                        }
+                    )
+                )
+
+            else:
+                # Add new request to the cache.
+                processed_request = ProcessedRequest.model_validate(
+                    {
+                        'uniqueKey': request.unique_key,
+                        'wasAlreadyPresent': True,
+                        'wasAlreadyHandled': request.was_already_handled,
+                    }
+                )
+                self._cache_request(
+                    request.unique_key,
+                    processed_request,
+                )
+                new_requests.append(request)
+
+        if new_requests:
+            # Prepare requests for API by converting to dictionaries.
+            requests_dict = [
+                request.model_dump(
+                    by_alias=True,
+                    exclude={'id'},  # Exclude ID fields from requests since the API doesn't accept them.
+                )
+                for request in new_requests
+            ]
+
+            # Send requests to API.
+            api_response = AddRequestsResponse.model_validate(
+                await self._api_client.batch_add_requests(requests=requests_dict, forefront=forefront)
+            )
+
+            # Add the locally known already present processed requests based on the local cache.
+            api_response.processed_requests.extend(already_present_requests)
+
+            # Remove unprocessed requests from the cache
+            for unprocessed_request in api_response.unprocessed_requests:
+                self._requests_cache.pop(unprocessed_request.unique_key, None)
+
+        else:
+            api_response = AddRequestsResponse.model_validate(
+                {'unprocessedRequests': [], 'processedRequests': already_present_requests}
+            )
+
+        logger.debug(
+            f'Tried to add new requests: {len(new_requests)}, '
+            f'succeeded to add new requests: {len(api_response.processed_requests) - len(already_present_requests)}, '
+            f'skipped already present requests: {len(already_present_requests)}'
+        )
+
+        # Update assumed total count for newly added requests.
+        new_request_count = 0
+        for processed_request in api_response.processed_requests:
+            if not processed_request.was_already_present and not processed_request.was_already_handled:
+                new_request_count += 1
+
+        self.metadata.total_request_count += new_request_count
+        self.metadata.pending_request_count += new_request_count
+
+        return api_response
+
+    async def get_request(self, unique_key: str) -> Request | None:
+        """Get a request by unique key.
+
+        Args:
+            unique_key: Unique key of the request to get.
+
+        Returns:
+            The request or None if not found.
+        """
+        response = await self._api_client.get_request(unique_key_to_request_id(unique_key))
+
+        if response is None:
+            return None
+
+        return Request.model_validate(response)
+
+    async def fetch_next_request(self) -> Request | None:
+        """Return the next request in the queue to be processed.
+
+        Once you successfully finish processing of the request, you need to call `mark_request_as_handled`
+        to mark the request as handled in the queue. If there was some error in processing the request, call
+        `reclaim_request` instead, so that the queue will give the request to some other consumer
+        in another call to the `fetch_next_request` method.
+
+        Returns:
+            The request or `None` if there are no more pending requests.
+        """
+        # Ensure the queue head has requests if available. Fetching the head with lock to prevent race conditions.
+        async with self._fetch_lock:
+            await self._ensure_head_is_non_empty()
+
+            # If queue head is empty after ensuring, there are no requests
+            if not self._queue_head:
+                return None
+
+            # Get the next request ID from the queue head
+            next_unique_key = self._queue_head.popleft()
+
+        request = await self._get_or_hydrate_request(next_unique_key)
+
+        # Handle potential inconsistency where request might not be in the main table yet
+        if request is None:
+            logger.debug(
+                'Cannot find a request from the beginning of queue, will be retried later',
+                extra={'nextRequestUniqueKey': next_unique_key},
+            )
+            return None
+
+        # If the request was already handled, skip it
+        if request.handled_at is not None:
+            logger.debug(
+                'Request fetched from the beginning of queue was already handled',
+                extra={'nextRequestUniqueKey': next_unique_key},
+            )
+            return None
+
+        # Use get request to ensure we have the full request object.
+        request = await self.get_request(request.unique_key)
+        if request is None:
+            logger.debug(
+                'Request fetched from the beginning of queue was not found in the RQ',
+                extra={'nextRequestUniqueKey': next_unique_key},
+            )
+            return None
+
+        return request
+
+    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
+        """Mark a request as handled after successful processing.
+
+        Handled requests will never again be returned by the `fetch_next_request` method.
+
+        Args:
+            request: The request to mark as handled.
+
+        Returns:
+            Information about the queue operation. `None` if the given request was not in progress.
+        """
+        # Set the handled_at timestamp if not already set
+        if request.handled_at is None:
+            request.handled_at = datetime.now(tz=timezone.utc)
+
+        if cached_request := self._requests_cache[request.unique_key]:
+            cached_request.was_already_handled = request.was_already_handled
+        try:
+            # Update the request in the API
+            processed_request = await self._update_request(request)
+            processed_request.unique_key = request.unique_key
+
+            # Update assumed handled count if this wasn't already handled
+            if not processed_request.was_already_handled:
+                self.metadata.handled_request_count += 1
+                self.metadata.pending_request_count -= 1
+
+            # Update the cache with the handled request
+            cache_key = request.unique_key
+            self._cache_request(
+                cache_key,
+                processed_request,
+                hydrated_request=request,
+            )
+        except Exception as exc:
+            logger.debug(f'Error marking request {request.unique_key} as handled: {exc!s}')
+            return None
+        else:
+            return processed_request
+
+    async def reclaim_request(
+        self,
+        request: Request,
+        *,
+        forefront: bool = False,
+    ) -> ProcessedRequest | None:
+        """Reclaim a failed request back to the queue.
+
+        The request will be returned for processing later again by another call to `fetch_next_request`.
+
+        Args:
+            request: The request to return to the queue.
+            forefront: Whether to add the request to the head or the end of the queue.
+
+        Returns:
+            Information about the queue operation. `None` if the given request was not in progress.
+        """
+        # Check if the request was marked as handled and clear it. When reclaiming,
+        # we want to put the request back for processing.
+        if request.was_already_handled:
+            request.handled_at = None
+
+        # Reclaim with lock to prevent race conditions that could lead to double processing of the same request.
+        async with self._fetch_lock:
+            try:
+                # Update the request in the API.
+                processed_request = await self._update_request(request, forefront=forefront)
+                processed_request.unique_key = request.unique_key
+
+                # If the request was previously handled, decrement our handled count since
+                # we're putting it back for processing.
+                if request.was_already_handled and not processed_request.was_already_handled:
+                    self.metadata.handled_request_count -= 1
+                    self.metadata.pending_request_count += 1
+
+                # Update the cache
+                cache_key = request.unique_key
+                self._cache_request(
+                    cache_key,
+                    processed_request,
+                    hydrated_request=request,
+                )
+
+                # If we're adding to the forefront, we need to check for forefront requests
+                # in the next list_head call
+                if forefront:
+                    self._should_check_for_forefront_requests = True
+
+            except Exception as exc:
+                logger.debug(f'Error reclaiming request {request.unique_key}: {exc!s}')
+                return None
+            else:
+                return processed_request
+
+    async def is_empty(self) -> bool:
+        """Check if the queue is empty.
+
+        Returns:
+            True if the queue is empty, False otherwise.
+        """
+        # Check _list_head.
+        # Without the lock the `is_empty` is prone to falsely report True with some low probability race condition.
+        async with self._fetch_lock:
+            head = await self._list_head(limit=1)
+            return len(head.items) == 0 and not self._queue_has_locked_requests
+
+    async def _ensure_head_is_non_empty(self) -> None:
+        """Ensure that the queue head has requests if they are available in the queue."""
+        # If queue head has adequate requests, skip fetching more
+        if len(self._queue_head) > 1 and not self._should_check_for_forefront_requests:
+            return
+
+        # Fetch requests from the API and populate the queue head
+        await self._list_head()
+
+    async def _get_or_hydrate_request(self, unique_key: str) -> Request | None:
+        """Get a request by unique key, either from cache or by fetching from API.
+
+        Args:
+            unique_key: Unique key of the request to get.
+
+        Returns:
+            The request if found and valid, otherwise None.
+        """
+        # First check if the request is in our cache
+        cached_entry = self._requests_cache.get(unique_key)
+
+        if cached_entry and cached_entry.hydrated:
+            # If we have the request hydrated in cache, return it
+            return cached_entry.hydrated
+
+        # If not in cache or not hydrated, fetch the request
+        try:
+            # Fetch the request data
+            request = await self.get_request(unique_key)
+
+            # If request is not found, return None
+            if not request:
+                return None
+
+            # Update cache with hydrated request
+            cache_key = request.unique_key
+            self._cache_request(
+                cache_key,
+                ProcessedRequest(
+                    unique_key=request.unique_key,
+                    was_already_present=True,
+                    was_already_handled=request.handled_at is not None,
+                ),
+                hydrated_request=request,
+            )
+        except Exception as exc:
+            logger.debug(f'Error fetching request {unique_key}: {exc!s}')
+            return None
+        else:
+            return request
+
+    async def _update_request(
+        self,
+        request: Request,
+        *,
+        forefront: bool = False,
+    ) -> ProcessedRequest:
+        """Update a request in the queue.
+
+        Args:
+            request: The updated request.
+            forefront: Whether to put the updated request in the beginning or the end of the queue.
+
+        Returns:
+            The updated request
+        """
+        request_dict = request.model_dump(by_alias=True)
+        request_dict['id'] = unique_key_to_request_id(request.unique_key)
+        response = await self._api_client.update_request(
+            request=request_dict,
+            forefront=forefront,
+        )
+
+        return ProcessedRequest.model_validate(
+            {'uniqueKey': request.unique_key} | response,
+        )
+
+    async def _list_head(
+        self,
+        *,
+        limit: int = 25,
+    ) -> RequestQueueHead:
+        """Retrieve requests from the beginning of the queue.
+
+        Args:
+            limit: Maximum number of requests to retrieve.
+
+        Returns:
+            A collection of requests from the beginning of the queue.
+        """
+        # Return from cache if available and we're not checking for new forefront requests
+        if self._queue_head and not self._should_check_for_forefront_requests:
+            logger.debug(f'Using cached queue head with {len(self._queue_head)} requests')
+            # Create a list of requests from the cached queue head
+            items = []
+            for unique_key in list(self._queue_head)[:limit]:
+                cached_request = self._requests_cache.get(unique_key)
+                if cached_request and cached_request.hydrated:
+                    items.append(cached_request.hydrated)
+
+            metadata = await self._get_metadata_estimate()
+
+            return RequestQueueHead(
+                limit=limit,
+                had_multiple_clients=metadata.had_multiple_clients,
+                queue_modified_at=metadata.modified_at,
+                items=items,
+                lock_time=None,
+                queue_has_locked_requests=self._queue_has_locked_requests,
+            )
+        leftover_buffer = list[str]()
+        if self._should_check_for_forefront_requests:
+            leftover_buffer = list(self._queue_head)
+            self._queue_head.clear()
+            self._should_check_for_forefront_requests = False
+
+        # Otherwise fetch from API
+        response = await self._api_client.list_and_lock_head(
+            lock_secs=int(self._DEFAULT_LOCK_TIME.total_seconds()),
+            limit=limit,
+        )
+
+        # Update the queue head cache
+        self._queue_has_locked_requests = response.get('queueHasLockedRequests', False)
+        # Check if there is another client working with the RequestQueue
+        self.metadata.had_multiple_clients = response.get('hadMultipleClients', False)
+
+        for request_data in response.get('items', []):
+            request = Request.model_validate(request_data)
+
+            # Skip requests without ID or unique key
+            if not request.unique_key:
+                logger.debug(
+                    'Skipping request from queue head, missing unique key',
+                    extra={
+                        'unique_key': request.unique_key,
+                    },
+                )
+                continue
+
+            # Cache the request
+            self._cache_request(
+                request.unique_key,
+                ProcessedRequest(
+                    unique_key=request.unique_key,
+                    was_already_present=True,
+                    was_already_handled=False,
+                ),
+                hydrated_request=request,
+            )
+            self._queue_head.append(request.unique_key)
+
+        for leftover_unique_key in leftover_buffer:
+            # After adding new requests to the forefront, any existing leftover locked request is kept in the end.
+            self._queue_head.append(leftover_unique_key)
+        return RequestQueueHead.model_validate(response)
+
+    def _cache_request(
+        self,
+        cache_key: str,
+        processed_request: ProcessedRequest,
+        *,
+        hydrated_request: Request | None = None,
+    ) -> None:
+        """Cache a request for future use.
+
+        Args:
+            cache_key: The key to use for caching the request. It should be request ID.
+            processed_request: The processed request information.
+            forefront: Whether the request was added to the forefront of the queue.
+            hydrated_request: The hydrated request object, if available.
+        """
+        self._requests_cache[cache_key] = CachedRequest(
+            unique_key=processed_request.unique_key,
+            was_already_handled=processed_request.was_already_handled,
+            hydrated=hydrated_request,
+            lock_expires_at=None,
+        )
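
For orientation, the consumer side of this new shared client is the fetch / mark-handled / reclaim cycle described in its docstrings. The sketch below is illustrative only and not part of the package; it assumes `queue_client` is an already-initialized _ApifyRequestQueueSharedClient (per the docstring, `ApifyRequestQueueClient.open` is the preferred way to obtain one) and `process` is a hypothetical user-supplied coroutine.

    # Illustrative sketch only - not part of the apify package.
    async def consume(queue_client, process) -> None:
        """Drain the queue using the fetch / handle / reclaim cycle."""
        while not await queue_client.is_empty():
            request = await queue_client.fetch_next_request()
            if request is None:
                # Head may be temporarily empty or inconsistent; try again later.
                continue
            try:
                await process(request)
            except Exception:
                # Processing failed: return the request so another consumer can retry it.
                await queue_client.reclaim_request(request)
            else:
                # Processing succeeded: the request will not be served again.
                await queue_client.mark_request_as_handled(request)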