apify 1.7.1b1__py3-none-any.whl → 2.2.1b1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of apify has been flagged as possibly problematic.

Files changed (62)
  1. apify/__init__.py +33 -4
  2. apify/_actor.py +1074 -0
  3. apify/_configuration.py +370 -0
  4. apify/_consts.py +10 -0
  5. apify/_crypto.py +31 -27
  6. apify/_models.py +117 -0
  7. apify/_platform_event_manager.py +231 -0
  8. apify/_proxy_configuration.py +320 -0
  9. apify/_utils.py +18 -484
  10. apify/apify_storage_client/__init__.py +3 -0
  11. apify/apify_storage_client/_apify_storage_client.py +68 -0
  12. apify/apify_storage_client/_dataset_client.py +190 -0
  13. apify/apify_storage_client/_dataset_collection_client.py +51 -0
  14. apify/apify_storage_client/_key_value_store_client.py +94 -0
  15. apify/apify_storage_client/_key_value_store_collection_client.py +51 -0
  16. apify/apify_storage_client/_request_queue_client.py +176 -0
  17. apify/apify_storage_client/_request_queue_collection_client.py +51 -0
  18. apify/apify_storage_client/py.typed +0 -0
  19. apify/log.py +22 -105
  20. apify/scrapy/__init__.py +11 -3
  21. apify/scrapy/middlewares/__init__.py +3 -1
  22. apify/scrapy/middlewares/apify_proxy.py +29 -27
  23. apify/scrapy/middlewares/py.typed +0 -0
  24. apify/scrapy/pipelines/__init__.py +3 -1
  25. apify/scrapy/pipelines/actor_dataset_push.py +6 -3
  26. apify/scrapy/pipelines/py.typed +0 -0
  27. apify/scrapy/py.typed +0 -0
  28. apify/scrapy/requests.py +60 -58
  29. apify/scrapy/scheduler.py +28 -19
  30. apify/scrapy/utils.py +10 -32
  31. apify/storages/__init__.py +4 -10
  32. apify/storages/_request_list.py +150 -0
  33. apify/storages/py.typed +0 -0
  34. apify-2.2.1b1.dist-info/METADATA +211 -0
  35. apify-2.2.1b1.dist-info/RECORD +38 -0
  36. {apify-1.7.1b1.dist-info → apify-2.2.1b1.dist-info}/WHEEL +1 -2
  37. apify/_memory_storage/__init__.py +0 -3
  38. apify/_memory_storage/file_storage_utils.py +0 -71
  39. apify/_memory_storage/memory_storage_client.py +0 -219
  40. apify/_memory_storage/resource_clients/__init__.py +0 -19
  41. apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
  42. apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
  43. apify/_memory_storage/resource_clients/dataset.py +0 -452
  44. apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
  45. apify/_memory_storage/resource_clients/key_value_store.py +0 -533
  46. apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
  47. apify/_memory_storage/resource_clients/request_queue.py +0 -466
  48. apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
  49. apify/actor.py +0 -1351
  50. apify/config.py +0 -127
  51. apify/consts.py +0 -67
  52. apify/event_manager.py +0 -236
  53. apify/proxy_configuration.py +0 -365
  54. apify/storages/base_storage.py +0 -181
  55. apify/storages/dataset.py +0 -494
  56. apify/storages/key_value_store.py +0 -257
  57. apify/storages/request_queue.py +0 -602
  58. apify/storages/storage_client_manager.py +0 -72
  59. apify-1.7.1b1.dist-info/METADATA +0 -149
  60. apify-1.7.1b1.dist-info/RECORD +0 -41
  61. apify-1.7.1b1.dist-info/top_level.txt +0 -1
  62. {apify-1.7.1b1.dist-info → apify-2.2.1b1.dist-info}/LICENSE +0 -0
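
The list amounts to a wholesale restructure rather than an incremental update: every 1.x public module (`apify/actor.py`, `apify/config.py`, `apify/consts.py`, `apify/event_manager.py`, the `apify/storages/*` implementations) is deleted, underscore-prefixed private modules take their place, the bundled `apify/_memory_storage` backend is dropped in favor of `apify/apify_storage_client`, and `py.typed` markers are added across the Scrapy subpackages. Code that imports only from the package root should survive the upgrade; here is a minimal sketch, assuming the `Actor` re-export in `apify/__init__.py` covers your usage:

```python
from apify import Actor  # re-exported from apify/__init__.py in both 1.x and 2.x


async def main() -> None:
    # The Actor context manager performs actor init/exit and event handling.
    async with Actor:
        dataset = await Actor.open_dataset()
        await dataset.push_data({'status': 'ok'})

# Imports of internals, e.g. `from apify.actor import Actor` or
# `from apify._memory_storage import MemoryStorageClient`, break in 2.x:
# those modules no longer exist (see the removals listed above).
```
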
--- a/apify/storages/request_queue.py
+++ /dev/null
@@ -1,602 +0,0 @@
- from __future__ import annotations
-
- import asyncio
- from collections import OrderedDict
- from datetime import datetime, timezone
- from typing import TYPE_CHECKING
- from typing import OrderedDict as OrderedDictType
-
- from apify_shared.utils import ignore_docs
-
- from apify._crypto import crypto_random_object_id
- from apify._utils import LRUCache, budget_ow, compute_unique_key, unique_key_to_request_id
- from apify.consts import REQUEST_QUEUE_HEAD_MAX_LIMIT
- from apify.log import logger
- from apify.storages.base_storage import BaseStorage
-
- if TYPE_CHECKING:
-     from apify_client import ApifyClientAsync
-     from apify_client.clients import RequestQueueClientAsync, RequestQueueCollectionClientAsync
-
-     from apify._memory_storage import MemoryStorageClient
-     from apify._memory_storage.resource_clients import RequestQueueClient, RequestQueueCollectionClient
-     from apify.config import Configuration
-
-
- MAX_CACHED_REQUESTS = 1_000_000
-
- # When requesting the queue head, we always fetch requestsInProgressCount * QUERY_HEAD_BUFFER requests.
- QUERY_HEAD_MIN_LENGTH = 100
-
- QUERY_HEAD_BUFFER = 3
-
- # If the queue was modified (a request was added/updated/deleted) more than API_PROCESSED_REQUESTS_DELAY_MILLIS ago,
- # then we assume the get head operation to be consistent.
- API_PROCESSED_REQUESTS_DELAY_MILLIS = 10_000
-
- # How many times we try to get queue head with queueModifiedAt older than API_PROCESSED_REQUESTS_DELAY_MILLIS.
- MAX_QUERIES_FOR_CONSISTENCY = 6
-
- # This number must be large enough so that processing of all these requests cannot be done in
- # a time lower than expected maximum latency of DynamoDB, but low enough not to waste too much memory.
- RECENTLY_HANDLED_CACHE_SIZE = 1000
-
- # Indicates how long it usually takes for the underlying storage to propagate all writes
- # to be available to subsequent reads.
- STORAGE_CONSISTENCY_DELAY_MILLIS = 3000
-
-
- class RequestQueue(BaseStorage):
-     """Represents a queue of URLs to crawl.
-
-     Can be used for deep crawling of websites where you start with several URLs and then recursively
-     follow links to other pages. The data structure supports both breadth-first and depth-first crawling orders.
-
-     Each URL is represented using an instance of the {@apilink Request} class.
-     The queue can only contain unique URLs. More precisely, it can only contain request dictionaries
-     with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden.
-     To add a single URL to the queue multiple times, the corresponding request dictionaries
-     need to have different `uniqueKey` properties.
-
-     Do not instantiate this class directly, use the `Actor.open_request_queue()` function instead.
-
-     `RequestQueue` stores its data either on local disk or in the Apify cloud,
-     depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variables are set.
-
-     If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the data is stored in
-     the local directory in the following files:
-     ```
-     {APIFY_LOCAL_STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json
-     ```
-     Note that `{QUEUE_ID}` is the name or ID of the request queue. The default request queue has ID: `default`,
-     unless you override it by setting the `APIFY_DEFAULT_REQUEST_QUEUE_ID` environment variable.
-     The `{REQUEST_ID}` is the ID of the request.
-
-     If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the
-     [Apify Request Queue](https://docs.apify.com/storage/request-queue)
-     cloud storage.
-     """
-
-     _request_queue_client: RequestQueueClientAsync | RequestQueueClient
-     _client_key = crypto_random_object_id()
-     _queue_head_dict: OrderedDictType[str, str]
-     _query_queue_head_task: asyncio.Task | None
-     _in_progress: set[str]
-     _last_activity: datetime
-     _internal_timeout_seconds = 5 * 60
-     _recently_handled: LRUCache[bool]
-     _assumed_total_count = 0
-     _assumed_handled_count = 0
-     _requests_cache: LRUCache[dict]
-
-     @ignore_docs
-     def __init__(
-         self: RequestQueue,
-         id: str,  # noqa: A002
-         name: str | None,
-         client: ApifyClientAsync | MemoryStorageClient,
-         config: Configuration,
-     ) -> None:
-         """Create a `RequestQueue` instance.
-
-         Do not use the constructor directly, use the `Actor.open_request_queue()` function instead.
-
-         Args:
-             id (str): ID of the request queue.
-             name (str, optional): Name of the request queue.
-             client (ApifyClientAsync or MemoryStorageClient): The storage client which should be used.
-             config (Configuration): The configuration which should be used.
-         """
-         super().__init__(id=id, name=name, client=client, config=config)
-
-         self._request_queue_client = client.request_queue(self._id, client_key=self._client_key)
-         self._queue_head_dict = OrderedDict()
-         self._query_queue_head_task = None
-         self._in_progress = set()
-         self._last_activity = datetime.now(timezone.utc)
-         self._recently_handled = LRUCache[bool](max_length=RECENTLY_HANDLED_CACHE_SIZE)
-         self._requests_cache = LRUCache(max_length=MAX_CACHED_REQUESTS)
-
-     @classmethod
-     def _get_human_friendly_label(cls: type[RequestQueue]) -> str:
-         return 'Request queue'
-
-     @classmethod
-     def _get_default_id(cls: type[RequestQueue], config: Configuration) -> str:
-         return config.default_request_queue_id
-
-     @classmethod
-     def _get_single_storage_client(
-         cls: type[RequestQueue],
-         id: str,  # noqa: A002
-         client: ApifyClientAsync | MemoryStorageClient,
-     ) -> RequestQueueClientAsync | RequestQueueClient:
-         return client.request_queue(id)
-
-     @classmethod
-     def _get_storage_collection_client(
-         cls: type[RequestQueue],
-         client: ApifyClientAsync | MemoryStorageClient,
-     ) -> RequestQueueCollectionClientAsync | RequestQueueCollectionClient:
-         return client.request_queues()
-
-     async def add_request(
-         self: RequestQueue,
-         request: dict,
-         *,
-         forefront: bool = False,
-         keep_url_fragment: bool = False,
-         use_extended_unique_key: bool = False,
-     ) -> dict:
-         """Add a request to the `RequestQueue` while managing deduplication and positioning within the queue.
-
-         The deduplication of requests relies on the `uniqueKey` field within the request dictionary. If `uniqueKey`
-         exists, it remains unchanged; if it does not, it is generated based on the request's `url`, `method`,
-         and `payload` fields. The generation of `uniqueKey` can be influenced by the `keep_url_fragment` and
-         `use_extended_unique_key` flags, which dictate whether to include the URL fragment and the request's method
-         and payload, respectively, in its computation.
-
-         The request can be added to the forefront (beginning) or the back of the queue based on the `forefront`
-         parameter. Information about the request's addition to the queue, including whether it was already present or
-         handled, is returned in an output dictionary.
-
-         Args:
-             request: The request object to be added to the queue. Must include at least the `url` key.
-                 Optionally it can include the `method`, `payload` and `uniqueKey` keys.
-
-             forefront: If True, adds the request to the forefront of the queue; otherwise, adds it to the end.
-
-             keep_url_fragment: Determines whether the URL fragment (the part of the URL after '#') should be retained
-                 in the unique key computation.
-
-             use_extended_unique_key: Determines whether to use an extended unique key, incorporating the request's
-                 method and payload into the unique key computation.
-
-         Returns: A dictionary containing information about the operation, including:
-             - `requestId` (str): The ID of the request.
-             - `uniqueKey` (str): The unique key associated with the request.
-             - `wasAlreadyPresent` (bool): Indicates whether the request was already in the queue.
-             - `wasAlreadyHandled` (bool): Indicates whether the request was already processed.
-         """
-         budget_ow(
-             request,
-             {
-                 'url': (str, True),
-             },
-         )
-         self._last_activity = datetime.now(timezone.utc)
-
-         if request.get('uniqueKey') is None:
-             request['uniqueKey'] = compute_unique_key(
-                 url=request['url'],
-                 method=request.get('method', 'GET'),
-                 payload=request.get('payload'),
-                 keep_url_fragment=keep_url_fragment,
-                 use_extended_unique_key=use_extended_unique_key,
-             )
-
-         cache_key = unique_key_to_request_id(request['uniqueKey'])
-         cached_info = self._requests_cache.get(cache_key)
-
-         if cached_info:
-             request['id'] = cached_info['id']
-             return {
-                 'wasAlreadyPresent': True,
-                 # We may assume that if the request is in the local cache, then the information about whether it
-                 # was already handled is there as well, because just one client should be using one queue.
-                 'wasAlreadyHandled': cached_info['isHandled'],
-                 'requestId': cached_info['id'],
-                 'uniqueKey': cached_info['uniqueKey'],
-             }
-
-         queue_operation_info = await self._request_queue_client.add_request(request, forefront=forefront)
-         queue_operation_info['uniqueKey'] = request['uniqueKey']
-
-         self._cache_request(cache_key, queue_operation_info)
-
-         request_id, was_already_present = queue_operation_info['requestId'], queue_operation_info['wasAlreadyPresent']
-         is_handled = request.get('handledAt') is not None
-         if not is_handled and not was_already_present and request_id not in self._in_progress and self._recently_handled.get(request_id) is None:
-             self._assumed_total_count += 1
-
-             self._maybe_add_request_to_queue_head(request_id, forefront)
-
-         return queue_operation_info
-
-     async def get_request(self: RequestQueue, request_id: str) -> dict | None:
-         """Retrieve a request from the queue.
-
-         Args:
-             request_id (str): ID of the request to retrieve.
-
-         Returns:
-             dict, optional: The retrieved request, or `None`, if it does not exist.
-         """
-         budget_ow(request_id, (str, True), 'request_id')
-         return await self._request_queue_client.get_request(request_id)
-
-     async def fetch_next_request(self: RequestQueue) -> dict | None:
-         """Return the next request in the queue to be processed.
-
-         Once you successfully finish processing of the request, you need to call
-         `RequestQueue.mark_request_as_handled` to mark the request as handled in the queue.
-         If there was some error in processing the request, call `RequestQueue.reclaim_request` instead,
-         so that the queue will give the request to some other consumer in another call to the `fetch_next_request` method.
-
-         Note that the `None` return value does not mean that the queue processing has finished; it means there are currently no pending requests.
-         To check whether all requests in the queue were finished, use `RequestQueue.is_finished` instead.
-
-         Returns:
-             dict, optional: The request or `None` if there are no more pending requests.
-         """
-         await self._ensure_head_is_non_empty()
-
-         # We are likely done at this point.
-         if len(self._queue_head_dict) == 0:
-             return None
-
-         next_request_id, _ = self._queue_head_dict.popitem(last=False)  # ~removeFirst()
-
-         # This should never happen, but...
-         if next_request_id in self._in_progress or self._recently_handled.get(next_request_id):
-             logger.warning(
-                 'Queue head returned a request that is already in progress?!',
-                 extra={
-                     'nextRequestId': next_request_id,
-                     'inProgress': next_request_id in self._in_progress,
-                     'recentlyHandled': next_request_id in self._recently_handled,
-                 },
-             )
-             return None
-         self._in_progress.add(next_request_id)
-         self._last_activity = datetime.now(timezone.utc)
-
-         try:
-             request = await self.get_request(next_request_id)
-         except Exception:
-             # On error, remove the request from in progress, otherwise it would be there forever
-             self._in_progress.remove(next_request_id)
-             raise
-
-         # NOTE: It can happen that the queue head index is inconsistent with the main queue table. This can occur in two situations:
-
-         """ 1) Queue head index is ahead of the main table and the request is not present in the main table yet (i.e. getRequest() returned null).
-         In this case, keep the request marked as in progress for a short while,
-         so that isFinished() doesn't return true and _ensureHeadIsNonEmpty() doesn't load the request
-         into the queueHeadDict again right away. After the interval expires, fetchNextRequest()
-         will try to fetch this request again, until it eventually appears in the main table.
-         """
-         if request is None:
-             logger.debug('Cannot find a request from the beginning of queue, will be retried later', extra={'nextRequestId': next_request_id})
-             asyncio.get_running_loop().call_later(STORAGE_CONSISTENCY_DELAY_MILLIS // 1000, lambda: self._in_progress.remove(next_request_id))
-             return None
-
-         """ 2) Queue head index is behind the main table and the underlying request was already handled
-         (by some other client, since we keep track of handled requests in the recentlyHandled dictionary).
-         We just add the request to the recentlyHandled dictionary so that the next call to _ensureHeadIsNonEmpty()
-         will not put the request into queueHeadDict again.
-         """
-         if request.get('handledAt') is not None:
-             logger.debug('Request fetched from the beginning of queue was already handled', extra={'nextRequestId': next_request_id})
-             self._recently_handled[next_request_id] = True
-             return None
-
-         return request
-
-     async def mark_request_as_handled(self: RequestQueue, request: dict) -> dict | None:
-         """Mark a request as handled after successful processing.
-
-         Handled requests will never again be returned by the `RequestQueue.fetch_next_request` method.
-
-         Args:
-             request (dict): The request to mark as handled.
-
-         Returns:
-             dict, optional: Information about the queue operation with keys `requestId`, `uniqueKey`, `wasAlreadyPresent`, `wasAlreadyHandled`.
-                 `None` if the given request was not in progress.
-         """
-         budget_ow(
-             request,
-             {
-                 'id': (str, True),
-                 'uniqueKey': (str, True),
-                 'handledAt': (datetime, False),
-             },
-         )
-         self._last_activity = datetime.now(timezone.utc)
-         if request['id'] not in self._in_progress:
-             logger.debug('Cannot mark request as handled, because it is not in progress!', extra={'requestId': request['id']})
-             return None
-
-         request['handledAt'] = request.get('handledAt', datetime.now(timezone.utc))
-         queue_operation_info = await self._request_queue_client.update_request({**request})
-         queue_operation_info['uniqueKey'] = request['uniqueKey']
-
-         self._in_progress.remove(request['id'])
-         self._recently_handled[request['id']] = True
-
-         if not queue_operation_info['wasAlreadyHandled']:
-             self._assumed_handled_count += 1
-
-         self._cache_request(unique_key_to_request_id(request['uniqueKey']), queue_operation_info)
-
-         return queue_operation_info
-
-     async def reclaim_request(
-         self: RequestQueue,
-         request: dict,
-         forefront: bool = False,  # noqa: FBT001, FBT002
-     ) -> dict | None:
-         """Reclaim a failed request back to the queue.
-
-         The request will be returned for processing later again
-         by another call to `RequestQueue.fetchNextRequest`.
-
-         Args:
-             request (dict): The request to return to the queue.
-             forefront (bool, optional): Whether to add the request to the head or the end of the queue.
-         Returns:
-             dict, optional: Information about the queue operation with keys `requestId`, `uniqueKey`, `wasAlreadyPresent`, `wasAlreadyHandled`.
-                 `None` if the given request was not in progress.
-         """
-         budget_ow(
-             request,
-             {
-                 'id': (str, True),
-                 'uniqueKey': (str, True),
-             },
-         )
-         self._last_activity = datetime.now(timezone.utc)
-
-         if request['id'] not in self._in_progress:
-             logger.debug('Cannot reclaim request, because it is not in progress!', extra={'requestId': request['id']})
-             return None
-
-         # TODO: If request hasn't been changed since the last getRequest(), we don't need to call updateRequest()
-         # and thus improve performance.
-         # https://github.com/apify/apify-sdk-python/issues/143
-         queue_operation_info = await self._request_queue_client.update_request(request, forefront=forefront)
-         queue_operation_info['uniqueKey'] = request['uniqueKey']
-         self._cache_request(unique_key_to_request_id(request['uniqueKey']), queue_operation_info)
-
-         # Wait a little to increase the chance that the next call to fetchNextRequest() will return the request with updated data.
-         # This is to compensate for the limitation of DynamoDB, where writes might not be immediately visible to subsequent reads.
-         def callback() -> None:
-             if request['id'] not in self._in_progress:
-                 logger.debug('The request is no longer marked as in progress in the queue?!', {'requestId': request['id']})
-                 return
-
-             self._in_progress.remove(request['id'])
-
-             # Performance optimization: add request straight to head if possible
-             self._maybe_add_request_to_queue_head(request['id'], forefront)
-
-         asyncio.get_running_loop().call_later(STORAGE_CONSISTENCY_DELAY_MILLIS // 1000, callback)
-
-         return queue_operation_info
-
-     def _in_progress_count(self: RequestQueue) -> int:
-         return len(self._in_progress)
-
-     async def is_empty(self: RequestQueue) -> bool:
-         """Check whether the queue is empty.
-
-         Returns:
-             bool: `True` if the next call to `RequestQueue.fetchNextRequest` would return `None`, otherwise `False`.
-         """
-         await self._ensure_head_is_non_empty()
-         return len(self._queue_head_dict) == 0
-
-     async def is_finished(self: RequestQueue) -> bool:
-         """Check whether the queue is finished.
-
-         Due to the nature of distributed storage used by the queue,
-         the function might occasionally return a false negative,
-         but it will never return a false positive.
-
-         Returns:
-             bool: `True` if all requests were already handled and there are no more left. `False` otherwise.
-         """
-         seconds_since_last_activity = (datetime.now(timezone.utc) - self._last_activity).seconds
-         if self._in_progress_count() > 0 and seconds_since_last_activity > self._internal_timeout_seconds:
-             message = f'The request queue seems to be stuck for {self._internal_timeout_seconds}s, resetting internal state.'
-             logger.warning(message)
-             self._reset()
-
-         if len(self._queue_head_dict) > 0 or self._in_progress_count() > 0:
-             return False
-
-         is_head_consistent = await self._ensure_head_is_non_empty(ensure_consistency=True)
-         return is_head_consistent and len(self._queue_head_dict) == 0 and self._in_progress_count() == 0
-
-     def _reset(self: RequestQueue) -> None:
-         self._queue_head_dict.clear()
-         self._query_queue_head_task = None
-         self._in_progress.clear()
-         self._recently_handled.clear()
-         self._assumed_total_count = 0
-         self._assumed_handled_count = 0
-         self._requests_cache.clear()
-         self._last_activity = datetime.now(timezone.utc)
-
-     def _cache_request(self: RequestQueue, cache_key: str, queue_operation_info: dict) -> None:
-         self._requests_cache[cache_key] = {
-             'id': queue_operation_info['requestId'],
-             'isHandled': queue_operation_info['wasAlreadyHandled'],
-             'uniqueKey': queue_operation_info['uniqueKey'],
-             'wasAlreadyHandled': queue_operation_info['wasAlreadyHandled'],
-         }
-
-     async def _queue_query_head(self: RequestQueue, limit: int) -> dict:
-         query_started_at = datetime.now(timezone.utc)
-
-         list_head = await self._request_queue_client.list_head(limit=limit)
-         for request in list_head['items']:
-             # Queue head index might be behind the main table, so ensure we don't recycle requests
-             if not request['id'] or not request['uniqueKey'] or request['id'] in self._in_progress or self._recently_handled.get(request['id']):
-                 continue
-             self._queue_head_dict[request['id']] = request['id']
-             self._cache_request(
-                 unique_key_to_request_id(request['uniqueKey']),
-                 {
-                     'requestId': request['id'],
-                     'wasAlreadyHandled': False,
-                     'wasAlreadyPresent': True,
-                     'uniqueKey': request['uniqueKey'],
-                 },
-             )
-
-         # This is needed so that the next call to _ensureHeadIsNonEmpty() will fetch the queue head again.
-         self._query_queue_head_task = None
-
-         return {
-             'wasLimitReached': len(list_head['items']) >= limit,
-             'prevLimit': limit,
-             'queueModifiedAt': list_head['queueModifiedAt'],
-             'queryStartedAt': query_started_at,
-             'hadMultipleClients': list_head['hadMultipleClients'],
-         }
-
-     async def _ensure_head_is_non_empty(
-         self: RequestQueue,
-         ensure_consistency: bool = False,  # noqa: FBT001, FBT002
-         limit: int | None = None,
-         iteration: int = 0,
-     ) -> bool:
-         # If the head is non-empty, resolve immediately.
-         if len(self._queue_head_dict) > 0:
-             return True
-
-         if limit is None:
-             limit = max(self._in_progress_count() * QUERY_HEAD_BUFFER, QUERY_HEAD_MIN_LENGTH)
-
-         if self._query_queue_head_task is None:
-             self._query_queue_head_task = asyncio.Task(self._queue_query_head(limit))
-
-         queue_head = await self._query_queue_head_task
-
-         # TODO: I feel this code below can be greatly simplified... (comes from TS implementation *wink*)
-         # https://github.com/apify/apify-sdk-python/issues/142
-
-         # If the queue is still empty, then one of the following holds:
-         # - the other calls waiting for this task already consumed all the returned requests
-         # - the limit was too low and contained only requests in progress
-         # - the writes from other clients were not propagated yet
-         # - the whole queue was processed and we are done
-
-         # If the limit was not reached in the call, then there are no more requests to be returned.
-         if queue_head['prevLimit'] >= REQUEST_QUEUE_HEAD_MAX_LIMIT:
-             logger.warning('Reached the maximum number of requests in progress', extra={'limit': REQUEST_QUEUE_HEAD_MAX_LIMIT})
-
-         should_repeat_with_higher_limit = (
-             len(self._queue_head_dict) == 0 and queue_head['wasLimitReached'] and queue_head['prevLimit'] < REQUEST_QUEUE_HEAD_MAX_LIMIT
-         )
-
-         # If ensureConsistency=true then we must ensure that either:
-         # - queueModifiedAt is older than queryStartedAt by at least API_PROCESSED_REQUESTS_DELAY_MILLIS
-         # - hadMultipleClients=false and this.assumedTotalCount<=this.assumedHandledCount
-         is_database_consistent = (queue_head['queryStartedAt'] - queue_head['queueModifiedAt'].replace(tzinfo=timezone.utc)).seconds >= (
-             API_PROCESSED_REQUESTS_DELAY_MILLIS // 1000
-         )
-         is_locally_consistent = not queue_head['hadMultipleClients'] and self._assumed_total_count <= self._assumed_handled_count
-         # Consistent information from one source is enough to consider request queue finished.
-         should_repeat_for_consistency = ensure_consistency and not is_database_consistent and not is_locally_consistent
-
-         # If both are false then the head is consistent and we may exit.
-         if not should_repeat_with_higher_limit and not should_repeat_for_consistency:
-             return True
-
-         # If we are querying for consistency, we limit the number of queries to MAX_QUERIES_FOR_CONSISTENCY.
-         # If this limit is reached, we return false so that empty() and finished() may return a false negative.
-         if not should_repeat_with_higher_limit and iteration > MAX_QUERIES_FOR_CONSISTENCY:
-             return False
-
-         next_limit = round(queue_head['prevLimit'] * 1.5) if should_repeat_with_higher_limit else queue_head['prevLimit']
-
-         # If we are repeating for consistency, then wait the required time.
-         if should_repeat_for_consistency:
-             delay_seconds = (API_PROCESSED_REQUESTS_DELAY_MILLIS // 1000) - (datetime.now(timezone.utc) - queue_head['queueModifiedAt']).seconds
-             logger.info(f'Waiting for {delay_seconds}s before considering the queue as finished to ensure that the data is consistent.')
-             await asyncio.sleep(delay_seconds)
-
-         return await self._ensure_head_is_non_empty(ensure_consistency, next_limit, iteration + 1)
-
-     def _maybe_add_request_to_queue_head(
-         self: RequestQueue,
-         request_id: str,
-         forefront: bool,  # noqa: FBT001
-     ) -> None:
-         if forefront:
-             self._queue_head_dict[request_id] = request_id
-             # Move to start, i.e. forefront of the queue
-             self._queue_head_dict.move_to_end(request_id, last=False)
-         elif self._assumed_total_count < QUERY_HEAD_MIN_LENGTH:
-             # OrderedDict puts the item to the end of the queue by default
-             self._queue_head_dict[request_id] = request_id
-
-     async def drop(self: RequestQueue) -> None:
-         """Remove the request queue either from the Apify cloud storage or from the local directory."""
-         await self._request_queue_client.delete()
-         self._remove_from_cache()
-
-     async def get_info(self: RequestQueue) -> dict | None:
-         """Get an object containing general information about the request queue.
-
-         Returns:
-             dict: Object returned by calling the GET request queue API endpoint.
-         """
-         return await self._request_queue_client.get()
-
-     @classmethod
-     async def open(
-         cls: type[RequestQueue],
-         *,
-         id: str | None = None,  # noqa: A002
-         name: str | None = None,
-         force_cloud: bool = False,
-         config: Configuration | None = None,
-     ) -> RequestQueue:
-         """Open a request queue.
-
-         Request queue represents a queue of URLs to crawl, which is stored either on local filesystem or in the Apify cloud.
-         The queue is used for deep crawling of websites, where you start with several URLs and then
-         recursively follow links to other pages. The data structure supports both breadth-first
-         and depth-first crawling orders.
-
-         Args:
-             id (str, optional): ID of the request queue to be opened.
-                 If neither `id` nor `name` are provided, the method returns the default request queue associated with the actor run.
-                 If the request queue with the given ID does not exist, it raises an error.
-             name (str, optional): Name of the request queue to be opened.
-                 If neither `id` nor `name` are provided, the method returns the default request queue associated with the actor run.
-                 If the request queue with the given name does not exist, it is created.
-             force_cloud (bool, optional): If set to True, it will open a request queue on the Apify Platform even when running the actor locally.
-                 Defaults to False.
-             config (Configuration, optional): A `Configuration` instance, uses global configuration if omitted.
-
-         Returns:
-             RequestQueue: An instance of the `RequestQueue` class for the given ID or name.
-         """
-         queue = await super().open(id=id, name=name, force_cloud=force_cloud, config=config)
-         await queue._ensure_head_is_non_empty()  # type: ignore
-         return queue  # type: ignore
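
Taken together, the docstrings in the deleted file describe the 1.x consumer protocol: `fetch_next_request` leases out a request, `mark_request_as_handled` retires it, `reclaim_request` returns a failed one to the queue, `is_finished` (not a `None` fetch result) signals completion, and `add_request` deduplicates on `uniqueKey`. A minimal sketch of that loop against the removed API; the processing step is a placeholder:

```python
import asyncio

from apify import Actor


async def main() -> None:
    async with Actor:
        queue = await Actor.open_request_queue()

        # Deduplicated on uniqueKey, derived from url/method/payload
        # unless supplied explicitly (see the add_request docstring above).
        await queue.add_request({'url': 'https://example.com'})

        while not await queue.is_finished():
            request = await queue.fetch_next_request()
            if request is None:
                # Nothing pending right now, but the queue may still grow.
                await asyncio.sleep(1)
                continue

            try:
                ...  # process request['url'] (placeholder)
            except Exception:
                # Failed requests go back and will be fetched again later.
                await queue.reclaim_request(request)
            else:
                await queue.mark_request_as_handled(request)
```
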
--- a/apify/storages/storage_client_manager.py
+++ /dev/null
@@ -1,72 +0,0 @@
- from __future__ import annotations
-
- from typing import TYPE_CHECKING
-
- from apify_shared.utils import ignore_docs
-
- from apify._memory_storage import MemoryStorageClient
- from apify.config import Configuration
-
- if TYPE_CHECKING:
-     from apify_client import ApifyClientAsync
-
-
- @ignore_docs
- class StorageClientManager:
-     """A class for managing storage clients."""
-
-     _config: Configuration
-
-     _local_client: MemoryStorageClient | None = None
-     _cloud_client: ApifyClientAsync | None = None
-
-     _default_instance: StorageClientManager | None = None
-
-     def __init__(self: StorageClientManager) -> None:
-         """Create a `StorageClientManager` instance."""
-         self._config = Configuration.get_global_configuration()
-
-     @classmethod
-     def set_config(cls: type[StorageClientManager], config: Configuration) -> None:
-         """Set the config for the StorageClientManager.
-
-         Args:
-             config (Configuration): The configuration this StorageClientManager should use.
-         """
-         cls._get_default_instance()._config = config
-
-     @classmethod
-     def get_storage_client(
-         cls: type[StorageClientManager],
-         force_cloud: bool = False,  # noqa: FBT001, FBT002
-     ) -> ApifyClientAsync | MemoryStorageClient:
-         """Get the current storage client instance.
-
-         Returns:
-             ApifyClientAsync or MemoryStorageClient: The current storage client instance.
-         """
-         default_instance = cls._get_default_instance()
-         if not default_instance._local_client:
-             default_instance._local_client = MemoryStorageClient(persist_storage=default_instance._config.persist_storage, write_metadata=True)
-
-         if default_instance._config.is_at_home or force_cloud:
-             assert default_instance._cloud_client is not None  # noqa: S101
-             return default_instance._cloud_client
-
-         return default_instance._local_client
-
-     @classmethod
-     def set_cloud_client(cls: type[StorageClientManager], client: ApifyClientAsync) -> None:
-         """Set the cloud storage client.
-
-         Args:
-             client (ApifyClientAsync): The instance of a storage client.
-         """
-         cls._get_default_instance()._cloud_client = client
-
-     @classmethod
-     def _get_default_instance(cls: type[StorageClientManager]) -> StorageClientManager:
-         if cls._default_instance is None:
-             cls._default_instance = cls()
-
-         return cls._default_instance
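
For context, the removed manager encoded a simple selection rule: lazily create a `MemoryStorageClient` for local runs, but hand out the `ApifyClientAsync` registered via `set_cloud_client` whenever the configuration reports `is_at_home` or the caller passes `force_cloud`. A sketch of how 1.x SDK code exercised it (the token value is a placeholder):

```python
from apify_client import ApifyClientAsync

from apify.storages.storage_client_manager import StorageClientManager

# Register the cloud client once, e.g. during actor initialization.
StorageClientManager.set_cloud_client(ApifyClientAsync(token='<APIFY_TOKEN>'))

# On the platform (is_at_home) or with force_cloud=True: the ApifyClientAsync.
cloud = StorageClientManager.get_storage_client(force_cloud=True)

# Locally: a lazily created MemoryStorageClient persisting to local files.
local = StorageClientManager.get_storage_client()
```

In 2.2.1b1 this global singleton is gone, replaced by the new `apify.apify_storage_client` package added above.
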