apify 1.7.1b1__py3-none-any.whl → 2.2.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Potentially problematic release.
This version of apify has been flagged as potentially problematic; see the release advisory for details.
- apify/__init__.py +33 -4
- apify/_actor.py +1074 -0
- apify/_configuration.py +370 -0
- apify/_consts.py +10 -0
- apify/_crypto.py +31 -27
- apify/_models.py +117 -0
- apify/_platform_event_manager.py +231 -0
- apify/_proxy_configuration.py +320 -0
- apify/_utils.py +18 -484
- apify/apify_storage_client/__init__.py +3 -0
- apify/apify_storage_client/_apify_storage_client.py +68 -0
- apify/apify_storage_client/_dataset_client.py +190 -0
- apify/apify_storage_client/_dataset_collection_client.py +51 -0
- apify/apify_storage_client/_key_value_store_client.py +94 -0
- apify/apify_storage_client/_key_value_store_collection_client.py +51 -0
- apify/apify_storage_client/_request_queue_client.py +176 -0
- apify/apify_storage_client/_request_queue_collection_client.py +51 -0
- apify/apify_storage_client/py.typed +0 -0
- apify/log.py +22 -105
- apify/scrapy/__init__.py +11 -3
- apify/scrapy/middlewares/__init__.py +3 -1
- apify/scrapy/middlewares/apify_proxy.py +29 -27
- apify/scrapy/middlewares/py.typed +0 -0
- apify/scrapy/pipelines/__init__.py +3 -1
- apify/scrapy/pipelines/actor_dataset_push.py +6 -3
- apify/scrapy/pipelines/py.typed +0 -0
- apify/scrapy/py.typed +0 -0
- apify/scrapy/requests.py +60 -58
- apify/scrapy/scheduler.py +28 -19
- apify/scrapy/utils.py +10 -32
- apify/storages/__init__.py +4 -10
- apify/storages/_request_list.py +150 -0
- apify/storages/py.typed +0 -0
- apify-2.2.1.dist-info/METADATA +211 -0
- apify-2.2.1.dist-info/RECORD +38 -0
- {apify-1.7.1b1.dist-info → apify-2.2.1.dist-info}/WHEEL +1 -2
- apify/_memory_storage/__init__.py +0 -3
- apify/_memory_storage/file_storage_utils.py +0 -71
- apify/_memory_storage/memory_storage_client.py +0 -219
- apify/_memory_storage/resource_clients/__init__.py +0 -19
- apify/_memory_storage/resource_clients/base_resource_client.py +0 -141
- apify/_memory_storage/resource_clients/base_resource_collection_client.py +0 -114
- apify/_memory_storage/resource_clients/dataset.py +0 -452
- apify/_memory_storage/resource_clients/dataset_collection.py +0 -48
- apify/_memory_storage/resource_clients/key_value_store.py +0 -533
- apify/_memory_storage/resource_clients/key_value_store_collection.py +0 -48
- apify/_memory_storage/resource_clients/request_queue.py +0 -466
- apify/_memory_storage/resource_clients/request_queue_collection.py +0 -48
- apify/actor.py +0 -1351
- apify/config.py +0 -127
- apify/consts.py +0 -67
- apify/event_manager.py +0 -236
- apify/proxy_configuration.py +0 -365
- apify/storages/base_storage.py +0 -181
- apify/storages/dataset.py +0 -494
- apify/storages/key_value_store.py +0 -257
- apify/storages/request_queue.py +0 -602
- apify/storages/storage_client_manager.py +0 -72
- apify-1.7.1b1.dist-info/METADATA +0 -149
- apify-1.7.1b1.dist-info/RECORD +0 -41
- apify-1.7.1b1.dist-info/top_level.txt +0 -1
- {apify-1.7.1b1.dist-info → apify-2.2.1.dist-info}/LICENSE +0 -0
apify/storages/request_queue.py
DELETED
|
@@ -1,602 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import asyncio
|
|
4
|
-
from collections import OrderedDict
|
|
5
|
-
from datetime import datetime, timezone
|
|
6
|
-
from typing import TYPE_CHECKING
|
|
7
|
-
from typing import OrderedDict as OrderedDictType
|
|
8
|
-
|
|
9
|
-
from apify_shared.utils import ignore_docs
|
|
10
|
-
|
|
11
|
-
from apify._crypto import crypto_random_object_id
|
|
12
|
-
from apify._utils import LRUCache, budget_ow, compute_unique_key, unique_key_to_request_id
|
|
13
|
-
from apify.consts import REQUEST_QUEUE_HEAD_MAX_LIMIT
|
|
14
|
-
from apify.log import logger
|
|
15
|
-
from apify.storages.base_storage import BaseStorage
|
|
16
|
-
|
|
17
|
-
if TYPE_CHECKING:
|
|
18
|
-
from apify_client import ApifyClientAsync
|
|
19
|
-
from apify_client.clients import RequestQueueClientAsync, RequestQueueCollectionClientAsync
|
|
20
|
-
|
|
21
|
-
from apify._memory_storage import MemoryStorageClient
|
|
22
|
-
from apify._memory_storage.resource_clients import RequestQueueClient, RequestQueueCollectionClient
|
|
23
|
-
from apify.config import Configuration
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
# Maximum number of requests kept in the local LRU cache used for deduplication.
MAX_CACHED_REQUESTS = 1_000_000

# When requesting queue head we always fetch requestsInProgressCount * QUERY_HEAD_BUFFER number of requests.
# QUERY_HEAD_MIN_LENGTH is the minimum number of head items fetched in a single call.
QUERY_HEAD_MIN_LENGTH = 100

# Multiplier applied to the in-progress count when sizing a head fetch.
QUERY_HEAD_BUFFER = 3

# If queue was modified (request added/updated/deleted) before more than API_PROCESSED_REQUESTS_DELAY_MILLIS
# then we assume the get head operation to be consistent.
API_PROCESSED_REQUESTS_DELAY_MILLIS = 10_000

# How many times we try to get queue head with queueModifiedAt older than API_PROCESSED_REQUESTS_DELAY_MILLIS.
MAX_QUERIES_FOR_CONSISTENCY = 6

# This number must be large enough so that processing of all these requests cannot be done in
# a time lower than expected maximum latency of DynamoDB, but low enough not to waste too much memory.
RECENTLY_HANDLED_CACHE_SIZE = 1000

# Indicates how long it usually takes for the underlying storage to propagate all writes
# to be available to subsequent reads.
STORAGE_CONSISTENCY_DELAY_MILLIS = 3000
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
class RequestQueue(BaseStorage):
    """Represents a queue of URLs to crawl.

    Can be used for deep crawling of websites where you start with several URLs and then recursively
    follow links to other pages. The data structure supports both breadth-first and depth-first crawling orders.

    Each URL is represented using an instance of the {@apilink Request} class.
    The queue can only contain unique URLs. More precisely, it can only contain request dictionaries
    with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden.
    To add a single URL multiple times to the queue,
    corresponding request dictionary will need to have different `uniqueKey` properties.

    Do not instantiate this class directly, use the `Actor.open_request_queue()` function instead.

    `RequestQueue` stores its data either on local disk or in the Apify cloud,
    depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variables are set.

    If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the data is stored in
    the local directory in the following files:
    ```
    {APIFY_LOCAL_STORAGE_DIR}/request_queues/{QUEUE_ID}/{REQUEST_ID}.json
    ```
    Note that `{QUEUE_ID}` is the name or ID of the request queue. The default request queue has ID: `default`,
    unless you override it by setting the `APIFY_DEFAULT_REQUEST_QUEUE_ID` environment variable.
    The `{REQUEST_ID}` is the id of the request.

    If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the
    [Apify Request Queue](https://docs.apify.com/storage/request-queue)
    cloud storage.
    """

    # Backend client: API-backed on the platform, memory-backed when running locally.
    _request_queue_client: RequestQueueClientAsync | RequestQueueClient
    # Random client key generated once at class-definition time (shared by the process);
    # passed to the request queue client when it is created.
    _client_key = crypto_random_object_id()
    # Locally cached head of the queue: request ID -> request ID (insertion-ordered).
    _queue_head_dict: OrderedDictType[str, str]
    # In-flight task fetching the queue head, or None when no fetch is running.
    _query_queue_head_task: asyncio.Task | None
    # IDs of requests fetched by this client and not yet handled/reclaimed.
    _in_progress: set[str]
    # Timestamp of the last queue operation; used to detect a stuck queue.
    _last_activity: datetime
    # After this many seconds of inactivity with requests in progress, internal state is reset.
    _internal_timeout_seconds = 5 * 60
    # Recently handled request IDs, kept so the head fetcher does not recycle them.
    _recently_handled: LRUCache[bool]
    # Local estimates of total/handled request counts, used by the consistency checks.
    _assumed_total_count = 0
    _assumed_handled_count = 0
    # Cache of queue operation results keyed by request-ID derived from uniqueKey.
    _requests_cache: LRUCache[dict]
|
|
91
|
-
|
|
92
|
-
@ignore_docs
def __init__(
    self: RequestQueue,
    id: str, # noqa: A002
    name: str | None,
    client: ApifyClientAsync | MemoryStorageClient,
    config: Configuration,
) -> None:
    """Create a `RequestQueue` instance.

    Do not use the constructor directly, use the `Actor.open_request_queue()` function instead.

    Args:
        id (str): ID of the request queue.
        name (str, optional): Name of the request queue.
        client (ApifyClientAsync or MemoryStorageClient): The storage client which should be used.
        config (Configuration): The configuration which should be used.
    """
    super().__init__(id=id, name=name, client=client, config=config)

    # Bind the backend client for this particular queue; the client key identifies this consumer.
    self._request_queue_client = client.request_queue(self._id, client_key=self._client_key)

    # Initialize the local bookkeeping structures.
    self._queue_head_dict = OrderedDict()
    self._in_progress = set()
    self._query_queue_head_task = None
    self._recently_handled = LRUCache[bool](max_length=RECENTLY_HANDLED_CACHE_SIZE)
    self._requests_cache = LRUCache(max_length=MAX_CACHED_REQUESTS)
    self._last_activity = datetime.now(timezone.utc)
|
|
119
|
-
|
|
120
|
-
@classmethod
def _get_human_friendly_label(cls: type[RequestQueue]) -> str:
    """Return the human-readable storage label used in log and error messages."""
    label = 'Request queue'
    return label
|
|
123
|
-
|
|
124
|
-
@classmethod
def _get_default_id(cls: type[RequestQueue], config: Configuration) -> str:
    """Return the ID of the default request queue taken from the configuration."""
    default_id = config.default_request_queue_id
    return default_id
|
|
127
|
-
|
|
128
|
-
@classmethod
def _get_single_storage_client(
    cls: type[RequestQueue],
    id: str, # noqa: A002
    client: ApifyClientAsync | MemoryStorageClient,
) -> RequestQueueClientAsync | RequestQueueClient:
    """Return a client scoped to the single request queue with the given ID."""
    single_client = client.request_queue(id)
    return single_client
|
|
135
|
-
|
|
136
|
-
@classmethod
def _get_storage_collection_client(
    cls: type[RequestQueue],
    client: ApifyClientAsync | MemoryStorageClient,
) -> RequestQueueCollectionClientAsync | RequestQueueCollectionClient:
    """Return the client for the collection of all request queues."""
    collection_client = client.request_queues()
    return collection_client
|
|
142
|
-
|
|
143
|
-
async def add_request(
    self: RequestQueue,
    request: dict,
    *,
    forefront: bool = False,
    keep_url_fragment: bool = False,
    use_extended_unique_key: bool = False,
) -> dict:
    """Adds a request to the `RequestQueue` while managing deduplication and positioning within the queue.

    The deduplication of requests relies on the `uniqueKey` field within the request dictionary. If `uniqueKey`
    exists, it remains unchanged; if it does not, it is generated based on the request's `url`, `method`,
    and `payload` fields. The generation of `uniqueKey` can be influenced by the `keep_url_fragment` and
    `use_extended_unique_key` flags, which dictate whether to include the URL fragment and the request's method
    and payload, respectively, in its computation.

    The request can be added to the forefront (beginning) or the back of the queue based on the `forefront`
    parameter. Information about the request's addition to the queue, including whether it was already present or
    handled, is returned in an output dictionary.

    Args:
        request: The request object to be added to the queue. Must include at least the `url` key.
            Optionally it can include the `method`, `payload` and `uniqueKey` keys.

        forefront: If True, adds the request to the forefront of the queue; otherwise, adds it to the end.

        keep_url_fragment: Determines whether the URL fragment (the part of the URL after '#') should be retained
            in the unique key computation.

        use_extended_unique_key: Determines whether to use an extended unique key, incorporating the request's
            method and payload into the unique key computation.

    Returns: A dictionary containing information about the operation, including:
        - `requestId` (str): The ID of the request.
        - `uniqueKey` (str): The unique key associated with the request.
        - `wasAlreadyPresent` (bool): Indicates whether the request was already in the queue.
        - `wasAlreadyHandled` (bool): Indicates whether the request was already processed.
    """
    # Validate that the request carries at least a `url` string.
    budget_ow(
        request,
        {
            'url': (str, True),
        },
    )
    self._last_activity = datetime.now(timezone.utc)

    # Derive `uniqueKey` from url/method/payload when the caller did not supply one.
    if request.get('uniqueKey') is None:
        request['uniqueKey'] = compute_unique_key(
            url=request['url'],
            method=request.get('method', 'GET'),
            payload=request.get('payload'),
            keep_url_fragment=keep_url_fragment,
            use_extended_unique_key=use_extended_unique_key,
        )

    # Check the local cache first to avoid a round trip for duplicates.
    cache_key = unique_key_to_request_id(request['uniqueKey'])
    cached_info = self._requests_cache.get(cache_key)

    if cached_info:
        request['id'] = cached_info['id']
        return {
            'wasAlreadyPresent': True,
            # We may assume that if request is in local cache then also the information if the
            # request was already handled is there because just one client should be using one queue.
            'wasAlreadyHandled': cached_info['isHandled'],
            'requestId': cached_info['id'],
            'uniqueKey': cached_info['uniqueKey'],
        }

    # Not cached locally - add via the backend client and cache the result.
    queue_operation_info = await self._request_queue_client.add_request(request, forefront=forefront)
    queue_operation_info['uniqueKey'] = request['uniqueKey']

    self._cache_request(cache_key, queue_operation_info)

    # Update the local total estimate and possibly the cached head, but only for
    # requests that are genuinely new and not already being processed or handled.
    request_id, was_already_present = queue_operation_info['requestId'], queue_operation_info['wasAlreadyPresent']
    is_handled = request.get('handledAt') is not None
    if not is_handled and not was_already_present and request_id not in self._in_progress and self._recently_handled.get(request_id) is None:
        self._assumed_total_count += 1

        self._maybe_add_request_to_queue_head(request_id, forefront)

    return queue_operation_info
|
|
225
|
-
|
|
226
|
-
async def get_request(self: RequestQueue, request_id: str) -> dict | None:
    """Retrieve a single request from the queue by its ID.

    Args:
        request_id (str): ID of the request to retrieve.

    Returns:
        dict, optional: The retrieved request, or `None`, if it does not exist.
    """
    # Validate the argument before delegating to the backend client.
    budget_ow(request_id, (str, True), 'request_id')
    result = await self._request_queue_client.get_request(request_id)
    return result
|
|
237
|
-
|
|
238
|
-
async def fetch_next_request(self: RequestQueue) -> dict | None:
    """Return the next request in the queue to be processed.

    Once you successfully finish processing of the request, you need to call
    `RequestQueue.mark_request_as_handled` to mark the request as handled in the queue.
    If there was some error in processing the request, call `RequestQueue.reclaim_request` instead,
    so that the queue will give the request to some other consumer in another call to the `fetch_next_request` method.

    Note that the `None` return value does not mean the queue processing finished, it means there are currently no pending requests.
    To check whether all requests in queue were finished, use `RequestQueue.is_finished` instead.

    Returns:
        dict, optional: The request or `None` if there are no more pending requests.
    """
    await self._ensure_head_is_non_empty()

    # We are likely done at this point.
    if len(self._queue_head_dict) == 0:
        return None

    next_request_id, _ = self._queue_head_dict.popitem(last=False)  # ~removeFirst()

    # This should never happen, but...
    if next_request_id in self._in_progress or self._recently_handled.get(next_request_id):
        logger.warning(
            'Queue head returned a request that is already in progress?!',
            extra={
                'nextRequestId': next_request_id,
                'inProgress': next_request_id in self._in_progress,
                'recentlyHandled': next_request_id in self._recently_handled,
            },
        )
        return None
    self._in_progress.add(next_request_id)
    self._last_activity = datetime.now(timezone.utc)

    try:
        request = await self.get_request(next_request_id)
    except Exception:
        # On error, remove the request from in progress, otherwise it would be there forever
        self._in_progress.remove(next_request_id)
        raise

    # NOTE: It can happen that the queue head index is inconsistent with the main queue table. This can occur in two situations:

    # 1) Queue head index is ahead of the main table and the request is not present in the main table yet
    #    (i.e. getRequest() returned null). In this case, keep the request marked as in progress for a short
    #    while, so that isFinished() doesn't return true and _ensureHeadIsNonEmpty() doesn't load the request
    #    into the queueHeadDict straight again. After the interval expires, fetchNextRequest() will try to
    #    fetch this request again, until it eventually appears in the main table.
    if request is None:
        logger.debug('Cannot find a request from the beginning of queue, will be retried later', extra={'nextRequestId': next_request_id})
        # BUGFIX: use `discard` instead of `remove` - by the time this delayed callback fires,
        # the request may have been handled or reclaimed already and thus removed from
        # `_in_progress`; `remove` would then raise a KeyError inside the event-loop callback.
        asyncio.get_running_loop().call_later(STORAGE_CONSISTENCY_DELAY_MILLIS // 1000, lambda: self._in_progress.discard(next_request_id))
        return None

    # 2) Queue head index is behind the main table and the underlying request was already handled
    #    (by some other client, since we keep the track of handled requests in recentlyHandled dictionary).
    #    We just add the request to the recentlyHandled dictionary so that next call to _ensureHeadIsNonEmpty()
    #    will not put the request again to queueHeadDict.
    if request.get('handledAt') is not None:
        logger.debug('Request fetched from the beginning of queue was already handled', extra={'nextRequestId': next_request_id})
        self._recently_handled[next_request_id] = True
        return None

    return request
|
|
305
|
-
|
|
306
|
-
async def mark_request_as_handled(self: RequestQueue, request: dict) -> dict | None:
    """Mark a request as handled after successful processing.

    Handled requests will never again be returned by the `RequestQueue.fetch_next_request` method.

    Args:
        request (dict): The request to mark as handled.

    Returns:
        dict, optional: Information about the queue operation with keys `requestId`, `uniqueKey`, `wasAlreadyPresent`, `wasAlreadyHandled`.
            `None` if the given request was not in progress.
    """
    # Validate required fields; `handledAt` is optional at this point.
    budget_ow(
        request,
        {
            'id': (str, True),
            'uniqueKey': (str, True),
            'handledAt': (datetime, False),
        },
    )
    self._last_activity = datetime.now(timezone.utc)
    # Only requests previously fetched by this client can be marked as handled.
    if request['id'] not in self._in_progress:
        logger.debug('Cannot mark request as handled, because it is not in progress!', extra={'requestId': request['id']})
        return None

    # Keep an existing `handledAt` timestamp, otherwise stamp the request now.
    request['handledAt'] = request.get('handledAt', datetime.now(timezone.utc))
    queue_operation_info = await self._request_queue_client.update_request({**request})
    queue_operation_info['uniqueKey'] = request['uniqueKey']

    # No longer in progress; remember the ID so the head fetcher does not recycle it.
    self._in_progress.remove(request['id'])
    self._recently_handled[request['id']] = True

    # Only bump the local handled estimate the first time this request is handled.
    if not queue_operation_info['wasAlreadyHandled']:
        self._assumed_handled_count += 1

    self._cache_request(unique_key_to_request_id(request['uniqueKey']), queue_operation_info)

    return queue_operation_info
|
|
344
|
-
|
|
345
|
-
async def reclaim_request(
    self: RequestQueue,
    request: dict,
    forefront: bool = False, # noqa: FBT001, FBT002
) -> dict | None:
    """Reclaim a failed request back to the queue.

    The request will be returned for processing later again
    by another call to `RequestQueue.fetchNextRequest`.

    Args:
        request (dict): The request to return to the queue.
        forefront (bool, optional): Whether to add the request to the head or the end of the queue
    Returns:
        dict, optional: Information about the queue operation with keys `requestId`, `uniqueKey`, `wasAlreadyPresent`, `wasAlreadyHandled`.
            `None` if the given request was not in progress.
    """
    # Validate the required identifying fields.
    budget_ow(
        request,
        {
            'id': (str, True),
            'uniqueKey': (str, True),
        },
    )
    self._last_activity = datetime.now(timezone.utc)

    # Only requests previously fetched by this client can be reclaimed.
    if request['id'] not in self._in_progress:
        logger.debug('Cannot reclaim request, because it is not in progress!', extra={'requestId': request['id']})
        return None

    # TODO: If request hasn't been changed since the last getRequest(), we don't need to call updateRequest()
    # and thus improve performance.
    # https://github.com/apify/apify-sdk-python/issues/143
    queue_operation_info = await self._request_queue_client.update_request(request, forefront=forefront)
    queue_operation_info['uniqueKey'] = request['uniqueKey']
    self._cache_request(unique_key_to_request_id(request['uniqueKey']), queue_operation_info)

    # Wait a little to increase a chance that the next call to fetchNextRequest() will return the request with updated data.
    # This is to compensate for the limitation of DynamoDB, where writes might not be immediately visible to subsequent reads.
    def callback() -> None:
        # The membership check guards the `remove` below against a race where the
        # request was handled (and removed from `_in_progress`) during the delay.
        if request['id'] not in self._in_progress:
            # NOTE(review): the dict here is passed as a %-format argument, not as `extra=`;
            # it is harmless but likely intended as `extra=` - confirm before relying on it.
            logger.debug('The request is no longer marked as in progress in the queue?!', {'requestId': request['id']})
            return

        self._in_progress.remove(request['id'])

        # Performance optimization: add request straight to head if possible
        self._maybe_add_request_to_queue_head(request['id'], forefront)

    asyncio.get_running_loop().call_later(STORAGE_CONSISTENCY_DELAY_MILLIS // 1000, callback)

    return queue_operation_info
|
|
397
|
-
|
|
398
|
-
def _in_progress_count(self: RequestQueue) -> int:
    """Return the number of requests currently being processed by this client."""
    count = len(self._in_progress)
    return count
|
|
400
|
-
|
|
401
|
-
async def is_empty(self: RequestQueue) -> bool:
    """Check whether the queue is empty.

    Returns:
        bool: `True` if the next call to `RequestQueue.fetchNextRequest` would return `None`, otherwise `False`.
    """
    # Refresh the cached head first so the emptiness check reflects the backend state.
    await self._ensure_head_is_non_empty()
    return not self._queue_head_dict
|
|
409
|
-
|
|
410
|
-
async def is_finished(self: RequestQueue) -> bool:
    """Check whether the queue is finished.

    Due to the nature of distributed storage used by the queue,
    the function might occasionally return a false negative,
    but it will never return a false positive.

    Returns:
        bool: `True` if all requests were already handled and there are no more left. `False` otherwise.
    """
    # BUGFIX: `timedelta.seconds` only returns the seconds component (0-86399) and ignores
    # whole days, so a queue stuck for more than 24 hours would wrap around and evade this
    # staleness check. `total_seconds()` returns the full elapsed time.
    seconds_since_last_activity = (datetime.now(timezone.utc) - self._last_activity).total_seconds()
    if self._in_progress_count() > 0 and seconds_since_last_activity > self._internal_timeout_seconds:
        message = f'The request queue seems to be stuck for {self._internal_timeout_seconds}s, resetting internal state.'
        logger.warning(message)
        self._reset()

    # Anything cached locally or still being processed means we are not done yet.
    if len(self._queue_head_dict) > 0 or self._in_progress_count() > 0:
        return False

    # Ask for a consistent head; only an empty, consistent head means the queue is finished.
    is_head_consistent = await self._ensure_head_is_non_empty(ensure_consistency=True)
    return is_head_consistent and len(self._queue_head_dict) == 0 and self._in_progress_count() == 0
|
|
431
|
-
|
|
432
|
-
def _reset(self: RequestQueue) -> None:
    """Reset all internal state to a freshly-opened queue (used when the queue appears stuck)."""
    # Drop all local caches and counters.
    self._requests_cache.clear()
    self._recently_handled.clear()
    self._queue_head_dict.clear()
    self._in_progress.clear()
    self._assumed_total_count = 0
    self._assumed_handled_count = 0
    # Forget any pending head fetch and restart the inactivity clock.
    self._query_queue_head_task = None
    self._last_activity = datetime.now(timezone.utc)
|
|
441
|
-
|
|
442
|
-
def _cache_request(self: RequestQueue, cache_key: str, queue_operation_info: dict) -> None:
    """Store the result of a queue operation in the local request cache for deduplication."""
    entry = {
        'id': queue_operation_info['requestId'],
        'isHandled': queue_operation_info['wasAlreadyHandled'],
        'uniqueKey': queue_operation_info['uniqueKey'],
        'wasAlreadyHandled': queue_operation_info['wasAlreadyHandled'],
    }
    self._requests_cache[cache_key] = entry
|
|
449
|
-
|
|
450
|
-
async def _queue_query_head(self: RequestQueue, limit: int) -> dict:
    """Fetch up to `limit` head items from the backend and merge them into the local head cache.

    Returns a summary dict with `wasLimitReached`, `prevLimit`, `queueModifiedAt`,
    `queryStartedAt` and `hadMultipleClients`, consumed by `_ensure_head_is_non_empty`.
    """
    query_started_at = datetime.now(timezone.utc)

    list_head = await self._request_queue_client.list_head(limit=limit)
    for request in list_head['items']:
        # Queue head index might be behind the main table, so ensure we don't recycle requests
        if not request['id'] or not request['uniqueKey'] or request['id'] in self._in_progress or self._recently_handled.get(request['id']):
            continue
        self._queue_head_dict[request['id']] = request['id']
        # Cache head items as present-but-unhandled so add_request() can dedupe against them.
        self._cache_request(
            unique_key_to_request_id(request['uniqueKey']),
            {
                'requestId': request['id'],
                'wasAlreadyHandled': False,
                'wasAlreadyPresent': True,
                'uniqueKey': request['uniqueKey'],
            },
        )

    # This is needed so that the next call to _ensureHeadIsNonEmpty() will fetch the queue head again.
    self._query_queue_head_task = None

    return {
        'wasLimitReached': len(list_head['items']) >= limit,
        'prevLimit': limit,
        'queueModifiedAt': list_head['queueModifiedAt'],
        'queryStartedAt': query_started_at,
        'hadMultipleClients': list_head['hadMultipleClients'],
    }
|
|
479
|
-
|
|
480
|
-
async def _ensure_head_is_non_empty(
    self: RequestQueue,
    ensure_consistency: bool = False, # noqa: FBT001, FBT002
    limit: int | None = None,
    iteration: int = 0,
) -> bool:
    """Try to populate the local queue-head cache from the backend.

    Args:
        ensure_consistency: When True, keep retrying until the head can be considered
            consistent (or MAX_QUERIES_FOR_CONSISTENCY is exhausted).
        limit: How many head items to request; defaults to a size based on the in-progress count.
        iteration: Internal recursion counter used to cap consistency retries.

    Returns:
        bool: True if the head is non-empty or consistently empty; False when consistency
        could not be established (callers may then report a false negative).
    """
    # If is nonempty resolve immediately.
    if len(self._queue_head_dict) > 0:
        return True

    if limit is None:
        limit = max(self._in_progress_count() * QUERY_HEAD_BUFFER, QUERY_HEAD_MIN_LENGTH)

    # Share a single in-flight head fetch between concurrent callers.
    if self._query_queue_head_task is None:
        self._query_queue_head_task = asyncio.Task(self._queue_query_head(limit))

    queue_head = await self._query_queue_head_task

    # TODO: I feel this code below can be greatly simplified... (comes from TS implementation *wink*)
    # https://github.com/apify/apify-sdk-python/issues/142

    # If queue is still empty then one of the following holds:
    # - the other calls waiting for this task already consumed all the returned requests
    # - the limit was too low and contained only requests in progress
    # - the writes from other clients were not propagated yet
    # - the whole queue was processed and we are done

    # If limit was not reached in the call then there are no more requests to be returned.
    if queue_head['prevLimit'] >= REQUEST_QUEUE_HEAD_MAX_LIMIT:
        logger.warning('Reached the maximum number of requests in progress', extra={'limit': REQUEST_QUEUE_HEAD_MAX_LIMIT})

    should_repeat_with_higher_limit = (
        len(self._queue_head_dict) == 0 and queue_head['wasLimitReached'] and queue_head['prevLimit'] < REQUEST_QUEUE_HEAD_MAX_LIMIT
    )

    # If ensureConsistency=true then we must ensure that either:
    # - queueModifiedAt is older than queryStartedAt by at least API_PROCESSED_REQUESTS_DELAY_MILLIS
    # - hadMultipleClients=false and this.assumedTotalCount<=this.assumedHandledCount
    # BUGFIX: `timedelta.seconds` returns only the seconds component and for negative deltas
    # (queue modified *after* the query started) yields a large positive number, wrongly
    # classifying the head as consistent. `total_seconds()` handles both cases correctly.
    is_database_consistent = (queue_head['queryStartedAt'] - queue_head['queueModifiedAt'].replace(tzinfo=timezone.utc)).total_seconds() >= (
        API_PROCESSED_REQUESTS_DELAY_MILLIS // 1000
    )
    is_locally_consistent = not queue_head['hadMultipleClients'] and self._assumed_total_count <= self._assumed_handled_count
    # Consistent information from one source is enough to consider request queue finished.
    should_repeat_for_consistency = ensure_consistency and not is_database_consistent and not is_locally_consistent

    # If both are false then head is consistent and we may exit.
    if not should_repeat_with_higher_limit and not should_repeat_for_consistency:
        return True

    # If we are querying for consistency then we limit the number of queries to MAX_QUERIES_FOR_CONSISTENCY.
    # If this is reached then we return false so that empty() and finished() returns possibly false negative.
    if not should_repeat_with_higher_limit and iteration > MAX_QUERIES_FOR_CONSISTENCY:
        return False

    next_limit = round(queue_head['prevLimit'] * 1.5) if should_repeat_with_higher_limit else queue_head['prevLimit']

    # If we are repeating for consistency then wait required time.
    if should_repeat_for_consistency:
        # BUGFIX: use `total_seconds()` (see above) and normalize `queueModifiedAt` to UTC the
        # same way as in the consistency check; clamp the delay so we never sleep a negative time.
        delay_seconds = (API_PROCESSED_REQUESTS_DELAY_MILLIS // 1000) - (
            datetime.now(timezone.utc) - queue_head['queueModifiedAt'].replace(tzinfo=timezone.utc)
        ).total_seconds()
        logger.info(f'Waiting for {delay_seconds}s before considering the queue as finished to ensure that the data is consistent.')
        await asyncio.sleep(max(delay_seconds, 0))

    return await self._ensure_head_is_non_empty(ensure_consistency, next_limit, iteration + 1)
|
|
543
|
-
|
|
544
|
-
def _maybe_add_request_to_queue_head(
    self: RequestQueue,
    request_id: str,
    forefront: bool,  # noqa: FBT001
) -> None:
    """Insert a request ID into the in-memory queue head, honoring the forefront flag.

    Forefront requests are shifted to the beginning of the head; other requests
    are appended only while the assumed total count is still below the minimum
    head-query length, so the cached head does not grow unboundedly.
    """
    head = self._queue_head_dict
    if forefront:
        head[request_id] = request_id
        # OrderedDict appends new keys at the end by default; shift this one to the front.
        head.move_to_end(request_id, last=False)
        return
    if self._assumed_total_count < QUERY_HEAD_MIN_LENGTH:
        # The default OrderedDict insertion position (the end) is exactly what we want here.
        head[request_id] = request_id
|
|
556
|
-
|
|
557
|
-
async def drop(self: RequestQueue) -> None:
    """Remove the request queue either from the Apify cloud storage or from the local directory."""
    # Delete the underlying storage first, then remove this instance from the
    # in-memory cache — the order matters so the cache never points at a queue
    # whose backing storage still exists.
    await self._request_queue_client.delete()
    self._remove_from_cache()
|
|
561
|
-
|
|
562
|
-
async def get_info(self: RequestQueue) -> dict | None:
    """Fetch general information about the request queue.

    Returns:
        dict, optional: The object returned by the GET request queue API endpoint,
            or `None` when no record is available.
    """
    info = await self._request_queue_client.get()
    return info
|
|
569
|
-
|
|
570
|
-
@classmethod
async def open(
    cls: type[RequestQueue],
    *,
    id: str | None = None,  # noqa: A002
    name: str | None = None,
    force_cloud: bool = False,
    config: Configuration | None = None,
) -> RequestQueue:
    """Open a request queue.

    A request queue holds URLs scheduled for crawling and lives either on the
    local filesystem or in the Apify cloud. It is intended for deep crawls that
    start from a handful of URLs and recursively enqueue discovered links; the
    structure supports both breadth-first and depth-first crawling orders.

    Args:
        id (str, optional): ID of the request queue to be opened.
            If the request queue with the given ID does not exist, it raises an error.
            When neither `id` nor `name` is provided, the default request queue
            associated with the actor run is returned.
        name (str, optional): Name of the request queue to be opened.
            If the request queue with the given name does not exist, it is created.
            When neither `id` nor `name` is provided, the default request queue
            associated with the actor run is returned.
        force_cloud (bool, optional): If set to True, it will open a request queue
            on the Apify Platform even when running the actor locally. Defaults to False.
        config (Configuration, optional): A `Configuration` instance, uses global configuration if omitted.

    Returns:
        RequestQueue: An instance of the `RequestQueue` class for the given ID or name.
    """
    queue = await super().open(id=id, name=name, force_cloud=force_cloud, config=config)
    # Warm up the cached queue head so the first fetch does not start cold.
    await queue._ensure_head_is_non_empty()  # type: ignore
    return queue  # type: ignore
|
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import TYPE_CHECKING
|
|
4
|
-
|
|
5
|
-
from apify_shared.utils import ignore_docs
|
|
6
|
-
|
|
7
|
-
from apify._memory_storage import MemoryStorageClient
|
|
8
|
-
from apify.config import Configuration
|
|
9
|
-
|
|
10
|
-
if TYPE_CHECKING:
|
|
11
|
-
from apify_client import ApifyClientAsync
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
@ignore_docs
class StorageClientManager:
    """A class for managing storage clients."""

    # Configuration driving client selection (cloud vs. local) and persistence.
    _config: Configuration

    # Lazily-created local (in-memory) storage client.
    _local_client: MemoryStorageClient | None = None
    # Cloud client; must be registered via `set_cloud_client` before cloud use.
    _cloud_client: ApifyClientAsync | None = None

    # Process-wide singleton instance, created on first access.
    _default_instance: StorageClientManager | None = None

    def __init__(self: StorageClientManager) -> None:
        """Create a `StorageClientManager` instance."""
        self._config = Configuration.get_global_configuration()

    @classmethod
    def set_config(cls: type[StorageClientManager], config: Configuration) -> None:
        """Set the config for the StorageClientManager.

        Args:
            config (Configuration): The configuration this StorageClientManager should use.
        """
        cls._get_default_instance()._config = config

    @classmethod
    def get_storage_client(
        cls: type[StorageClientManager],
        force_cloud: bool = False,  # noqa: FBT001, FBT002
    ) -> ApifyClientAsync | MemoryStorageClient:
        """Get the current storage client instance.

        Returns the cloud client when running on the Apify platform or when
        `force_cloud` is set; otherwise returns (and lazily creates) the local
        in-memory client.

        Returns:
            ApifyClientAsync or MemoryStorageClient: The current storage client instance.
        """
        default_instance = cls._get_default_instance()

        if default_instance._config.is_at_home or force_cloud:
            # The cloud client must have been registered via `set_cloud_client` beforehand.
            assert default_instance._cloud_client is not None  # noqa: S101
            return default_instance._cloud_client

        # Create the local client lazily, and only on the code path that actually
        # returns it — previously it was constructed even when the cloud client
        # was about to be returned.
        if not default_instance._local_client:
            default_instance._local_client = MemoryStorageClient(persist_storage=default_instance._config.persist_storage, write_metadata=True)

        return default_instance._local_client

    @classmethod
    def set_cloud_client(cls: type[StorageClientManager], client: ApifyClientAsync) -> None:
        """Set the storage client.

        Args:
            client (ApifyClientAsync or MemoryStorageClient): The instance of a storage client.
        """
        cls._get_default_instance()._cloud_client = client

    @classmethod
    def _get_default_instance(cls: type[StorageClientManager]) -> StorageClientManager:
        # Lazily create and cache the process-wide singleton.
        if cls._default_instance is None:
            cls._default_instance = cls()

        return cls._default_instance
|