crawlee 1.0.3b6__py3-none-any.whl → 1.2.2b24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/__init__.py +2 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +32 -13
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +44 -5
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +13 -6
- crawlee/_utils/system.py +27 -11
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +1 -1
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +156 -131
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/request_loaders/_sitemap_request_loader.py +23 -5
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +3 -3
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +51 -9
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +4 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
- crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
- crawlee/storage_clients/_file_system/_request_queue_client.py +29 -10
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/_client_mixin.py +1 -1
- crawlee/storage_clients/_sql/_db_models.py +1 -2
- crawlee/storage_clients/models.py +8 -3
- crawlee/storages/_key_value_store.py +5 -2
- crawlee/storages/_storage_instance_manager.py +103 -44
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +14 -16
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +82 -69
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,583 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from collections import deque
|
|
5
|
+
from datetime import datetime, timedelta, timezone
|
|
6
|
+
from logging import getLogger
|
|
7
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
8
|
+
|
|
9
|
+
from typing_extensions import NotRequired, override
|
|
10
|
+
|
|
11
|
+
from crawlee import Request
|
|
12
|
+
from crawlee._utils.crypto import crypto_random_object_id
|
|
13
|
+
from crawlee.storage_clients._base import RequestQueueClient
|
|
14
|
+
from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
|
|
15
|
+
|
|
16
|
+
from ._client_mixin import MetadataUpdateParams, RedisClientMixin
|
|
17
|
+
from ._utils import await_redis_response
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from collections.abc import Sequence
|
|
21
|
+
|
|
22
|
+
from redis.asyncio import Redis
|
|
23
|
+
from redis.asyncio.client import Pipeline
|
|
24
|
+
from redis.commands.core import AsyncScript
|
|
25
|
+
|
|
26
|
+
# Module-level logger for this Redis storage client.
logger = getLogger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class _QueueMetadataUpdateParams(MetadataUpdateParams):
    """Parameters for updating queue metadata.

    The `new_*` fields set the corresponding counter to an absolute value, while the
    `delta_*` fields adjust it by a relative amount. When both are supplied for the same
    counter, the absolute value wins (see `_specific_update_metadata`).
    """

    # Absolute counter values (overwrite the stored value).
    new_handled_request_count: NotRequired[int]
    new_pending_request_count: NotRequired[int]
    new_total_request_count: NotRequired[int]
    # Relative counter adjustments (applied via JSON.NUMINCRBY).
    delta_handled_request_count: NotRequired[int]
    delta_pending_request_count: NotRequired[int]
    delta_total_request_count: NotRequired[int]
    # NOTE(review): `recalculate` is not read anywhere in this module — presumably
    # consumed by the RedisClientMixin; confirm before removing.
    recalculate: NotRequired[bool]
    # When True, flips the `had_multiple_clients` metadata flag to True.
    update_had_multiple_clients: NotRequired[bool]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class RedisRequestQueueClient(RequestQueueClient, RedisClientMixin):
    """Redis implementation of the request queue client.

    This client persists requests to Redis using multiple data structures for efficient queue operations,
    deduplication, and concurrent access safety. Requests are stored with FIFO ordering and support
    both regular and forefront (high-priority) insertion modes.

    The implementation uses Bloom filters for efficient request deduplication and Redis lists for
    queue operations. Request blocking and client coordination is handled through Redis hashes
    with timestamp-based expiration for stale request recovery.

    The request queue data is stored in Redis using the following key patterns:
    - `request_queues:{name}:queue` - Redis list for FIFO request ordering
    - `request_queues:{name}:data` - Redis hash storing serialized Request objects by unique_key
    - `request_queues:{name}:in_progress` - Redis hash tracking requests currently being processed
    - `request_queues:{name}:added_bloom_filter` - Bloom filter for added request deduplication (`bloom` dedup_strategy)
    - `request_queues:{name}:handled_bloom_filter` - Bloom filter for completed request tracking (`bloom`
      dedup_strategy)
    - `request_queues:{name}:pending_set` - Redis set for added request deduplication (`default` dedup_strategy)
    - `request_queues:{name}:handled_set` - Redis set for completed request tracking (`default` dedup_strategy)
    - `request_queues:{name}:metadata` - Redis JSON object containing queue metadata

    Requests are serialized to JSON for storage and maintain proper FIFO ordering through Redis list
    operations. The implementation provides concurrent access safety through atomic Lua scripts,
    Bloom filter operations, and Redis's built-in atomicity guarantees for individual operations.
    """

    _DEFAULT_NAME = 'default'
    """Default Request Queue name key prefix when none provided."""

    _MAIN_KEY = 'request_queues'
    """Main Redis key prefix for Request Queue."""

    _CLIENT_TYPE = 'Request queue'
    """Human-readable client type for error messages."""

    _MAX_BATCH_FETCH_SIZE = 10
    """Maximum number of requests to fetch in a single batch operation."""

    _BLOCK_REQUEST_TIME = 300_000  # milliseconds
    """Time in milliseconds to block a fetched request for other clients before it can be autoreclaimed."""

    _RECLAIM_INTERVAL = timedelta(seconds=30)
    """Interval to check for stale requests to reclaim."""

    def __init__(
        self,
        storage_name: str,
        storage_id: str,
        redis: Redis,
        dedup_strategy: Literal['default', 'bloom'] = 'default',
        bloom_error_rate: float = 1e-7,
    ) -> None:
        """Initialize a new instance.

        Preferably use the `RedisRequestQueueClient.open` class method to create a new instance.

        Args:
            storage_name: Name used to build all Redis keys for this queue.
            storage_id: Unique ID of the storage.
            redis: Redis client instance.
            dedup_strategy: 'default' (exact, Redis sets) or 'bloom' (probabilistic, Bloom filters).
            bloom_error_rate: Desired false positive rate; only used with the 'bloom' strategy.
        """
        super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis)

        self._dedup_strategy = dedup_strategy
        """Deduplication strategy for the queue."""

        self._bloom_error_rate = bloom_error_rate
        """Desired false positive rate for Bloom filters."""

        self._pending_fetch_cache: deque[Request] = deque()
        """Cache for requests: ordered by sequence number."""

        self.client_key = crypto_random_object_id(length=32)[:32]
        """Unique identifier for this client instance."""

        # Lua scripts for atomic operations; populated lazily by `_load_scripts`.
        self._fetch_script: AsyncScript | None = None
        self._reclaim_stale_script: AsyncScript | None = None
        self._add_requests_script: AsyncScript | None = None

        # Next time stale in-progress requests should be reclaimed; None until first `is_empty` call.
        self._next_reclaim_stale: None | datetime = None

    @property
    def _added_filter_key(self) -> str:
        """Return the Redis key for the added requests Bloom filter."""
        # Guard: this key only exists under the 'bloom' strategy.
        if self._dedup_strategy != 'bloom':
            raise RuntimeError('The added requests filter is only available with the bloom deduplication strategy.')
        return f'{self._MAIN_KEY}:{self._storage_name}:added_bloom_filter'

    @property
    def _handled_filter_key(self) -> str:
        """Return the Redis key for the handled requests Bloom filter."""
        # Guard: this key only exists under the 'bloom' strategy.
        if self._dedup_strategy != 'bloom':
            raise RuntimeError('The handled requests filter is only available with the bloom deduplication strategy.')
        return f'{self._MAIN_KEY}:{self._storage_name}:handled_bloom_filter'

    @property
    def _pending_set_key(self) -> str:
        """Return the Redis key for the pending requests set."""
        # Guard: this key only exists under the 'default' strategy.
        if self._dedup_strategy != 'default':
            raise RuntimeError('The pending requests set is only available with the default deduplication strategy.')
        return f'{self._MAIN_KEY}:{self._storage_name}:pending_set'

    @property
    def _handled_set_key(self) -> str:
        """Return the Redis key for the handled requests set."""
        # Guard: this key only exists under the 'default' strategy.
        if self._dedup_strategy != 'default':
            raise RuntimeError('The handled requests set is only available with the default deduplication strategy.')
        return f'{self._MAIN_KEY}:{self._storage_name}:handled_set'

    @property
    def _queue_key(self) -> str:
        """Return the Redis key for the request queue (FIFO list of unique keys)."""
        return f'{self._MAIN_KEY}:{self._storage_name}:queue'

    @property
    def _data_key(self) -> str:
        """Return the Redis key for the request data hash (unique_key -> serialized Request)."""
        return f'{self._MAIN_KEY}:{self._storage_name}:data'

    @property
    def _in_progress_key(self) -> str:
        """Return the Redis key for the in-progress requests hash."""
        return f'{self._MAIN_KEY}:{self._storage_name}:in_progress'

    @classmethod
    async def open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
        redis: Redis,
        dedup_strategy: Literal['default', 'bloom'] = 'default',
        bloom_error_rate: float = 1e-7,
    ) -> RedisRequestQueueClient:
        """Open or create a new Redis request queue client.

        This method attempts to open an existing request queue from the Redis database. If a queue with the specified
        ID or name exists, it loads the metadata from the database. If no existing queue is found, a new one
        is created.

        Args:
            id: The ID of the request queue. If not provided, a random ID will be generated.
            name: The name of the dataset for named (global scope) storages.
            alias: The alias of the dataset for unnamed (run scope) storages.
            redis: Redis client instance.
            dedup_strategy: Strategy for request queue deduplication. Options are:
                - 'default': Uses Redis sets for exact deduplication.
                - 'bloom': Uses Redis Bloom filters for probabilistic deduplication with lower memory usage. When
                  using this approach, there is a small probability (`bloom_error_rate`, 1e-7 by default) that a
                  request will be falsely considered a duplicate and skipped in the queue.
            bloom_error_rate: Desired false positive rate for Bloom filter deduplication. Only relevant if
                `dedup_strategy` is set to 'bloom'.

        Returns:
            An instance for the opened or created storage client.
        """
        return await cls._open(
            id=id,
            name=name,
            alias=alias,
            redis=redis,
            metadata_model=RequestQueueMetadata,
            # Queue-specific counters initialized to zero for a brand-new queue.
            extra_metadata_fields={
                'had_multiple_clients': False,
                'handled_request_count': 0,
                'pending_request_count': 0,
                'total_request_count': 0,
            },
            instance_kwargs={'dedup_strategy': dedup_strategy, 'bloom_error_rate': bloom_error_rate},
        )

    @override
    async def get_metadata(self) -> RequestQueueMetadata:
        """Return the current queue metadata loaded from Redis."""
        return await self._get_metadata(RequestQueueMetadata)

    @override
    async def drop(self) -> None:
        """Delete the queue and all of its Redis keys."""
        # The dedup structures differ per strategy and must be dropped with the shared keys.
        if self._dedup_strategy == 'bloom':
            extra_keys = [self._added_filter_key, self._handled_filter_key]
        elif self._dedup_strategy == 'default':
            extra_keys = [self._pending_set_key, self._handled_set_key]
        else:
            raise RuntimeError(f'Unknown deduplication strategy: {self._dedup_strategy}')
        extra_keys.extend([self._queue_key, self._data_key, self._in_progress_key])
        await self._drop(extra_keys=extra_keys)

    @override
    async def purge(self) -> None:
        """Remove all requests from the queue while keeping the queue itself (metadata preserved)."""
        # Same strategy-specific key selection as in `drop`.
        if self._dedup_strategy == 'bloom':
            extra_keys = [self._added_filter_key, self._handled_filter_key]
        elif self._dedup_strategy == 'default':
            extra_keys = [self._pending_set_key, self._handled_set_key]
        else:
            raise RuntimeError(f'Unknown deduplication strategy: {self._dedup_strategy}')
        extra_keys.extend([self._queue_key, self._data_key, self._in_progress_key])
        await self._purge(
            extra_keys=extra_keys,
            # After a purge nothing is pending; total/handled counts are left untouched.
            metadata_kwargs=_QueueMetadataUpdateParams(
                update_accessed_at=True,
                update_modified_at=True,
                new_pending_request_count=0,
            ),
        )

    @override
    async def add_batch_of_requests(
        self,
        requests: Sequence[Request],
        *,
        forefront: bool = False,
    ) -> AddRequestsResponse:
        """Add a batch of requests to the queue, skipping ones already added or handled.

        Args:
            requests: Requests to enqueue; duplicates (by `unique_key`) within the batch collapse to one.
            forefront: If True, new requests are inserted at the head of the queue.

        Returns:
            Per-request processing results; `unprocessed_requests` is always empty here.
        """
        if self._add_requests_script is None:
            # NOTE(review): message names `_ensure_scripts_loaded()` but the loader defined in this
            # class is `_load_scripts()` — presumably wrapped by the mixin; confirm the public name.
            raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.')

        processed_requests = []

        delta_pending = 0
        delta_total = 0

        # Deduplicate within the batch: later requests with the same unique_key win.
        requests_by_unique_key = {req.unique_key: req for req in requests}
        unique_keys = list(requests_by_unique_key.keys())
        # Check which requests are already added or handled. Two pipelined commands are issued in a
        # fixed order, so pipe_results[0] is the "added/pending" flags and [1] the "handled" flags.
        async with self._get_pipeline(with_execute=False) as pipe:
            if self._dedup_strategy == 'default':
                await await_redis_response(pipe.smismember(self._pending_set_key, unique_keys))
                await await_redis_response(pipe.smismember(self._handled_set_key, unique_keys))
            elif self._dedup_strategy == 'bloom':
                await await_redis_response(pipe.bf().mexists(self._added_filter_key, *unique_keys))
                await await_redis_response(pipe.bf().mexists(self._handled_filter_key, *unique_keys))

            pipe_results = await pipe.execute()

        added_pending_flags = pipe_results[0]
        handled_flags = pipe_results[1]

        new_unique_keys = []
        new_request_data = {}
        # NOTE(review): delta_pending/delta_total are re-initialized here (also set above) —
        # redundant but harmless.
        delta_pending = 0
        delta_total = 0

        for i, unique_key in enumerate(unique_keys):
            # Already handled - skip
            if handled_flags[i]:
                processed_requests.append(
                    ProcessedRequest(
                        unique_key=unique_key,
                        was_already_present=True,
                        was_already_handled=True,
                    )
                )
                continue

            # Already in queue - skip
            if added_pending_flags[i]:
                processed_requests.append(
                    ProcessedRequest(
                        unique_key=unique_key,
                        was_already_present=True,
                        was_already_handled=False,
                    )
                )
                continue

            # New request - will add to queue
            request = requests_by_unique_key[unique_key]

            new_unique_keys.append(unique_key)
            new_request_data[unique_key] = request.model_dump_json()

        if new_unique_keys:
            # Add new requests to the queue atomically, get back which were actually added.
            # The Lua script re-checks membership, so a concurrent client adding the same key
            # between the check above and this call is still handled correctly.
            script_results = await self._add_requests_script(
                keys=[
                    self._added_filter_key if self._dedup_strategy == 'bloom' else self._pending_set_key,
                    self._queue_key,
                    self._data_key,
                ],
                args=[int(forefront), json.dumps(new_unique_keys), json.dumps(new_request_data)],
            )
            actually_added = set(json.loads(script_results))

            delta_pending = len(actually_added)
            delta_total = len(actually_added)

            processed_requests.extend(
                [
                    ProcessedRequest(
                        unique_key=unique_key,
                        was_already_present=unique_key not in actually_added,
                        was_already_handled=False,
                    )
                    for unique_key in new_unique_keys
                ]
            )

        async with self._get_pipeline() as pipe:
            await self._update_metadata(
                pipe,
                **_QueueMetadataUpdateParams(
                    update_accessed_at=True,
                    update_modified_at=True,
                    delta_pending_request_count=delta_pending,
                    delta_total_request_count=delta_total,
                ),
            )

        return AddRequestsResponse(
            processed_requests=processed_requests,
            unprocessed_requests=[],
        )

    @override
    async def fetch_next_request(self) -> Request | None:
        """Fetch the next request to process, or None if the queue has nothing available.

        Fetches up to `_MAX_BATCH_FETCH_SIZE` requests at once and serves the surplus from a
        local cache on subsequent calls.
        """
        # Serve from the local cache first; these requests are already blocked for this client.
        if self._pending_fetch_cache:
            return self._pending_fetch_cache.popleft()

        if self._fetch_script is None:
            # NOTE(review): see `add_batch_of_requests` — loader is `_load_scripts()`; confirm name.
            raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.')

        # Block fetched requests for this client until now + _BLOCK_REQUEST_TIME (epoch millis).
        blocked_until_timestamp = int(datetime.now(tz=timezone.utc).timestamp() * 1000) + self._BLOCK_REQUEST_TIME

        # The script retrieves requests from the queue and places them in the in_progress hash.
        requests_json = await self._fetch_script(
            keys=[self._queue_key, self._in_progress_key, self._data_key],
            args=[self.client_key, blocked_until_timestamp, self._MAX_BATCH_FETCH_SIZE],
        )

        async with self._get_pipeline() as pipe:
            await self._update_metadata(pipe, **_QueueMetadataUpdateParams(update_accessed_at=True))

        if not requests_json:
            return None

        requests = [Request.model_validate_json(req_json) for req_json in requests_json]

        # Keep the surplus of the batch locally; return the first request immediately.
        self._pending_fetch_cache.extend(requests[1:])

        return requests[0]

    @override
    async def get_request(self, unique_key: str) -> Request | None:
        """Return the stored request for `unique_key`, or None if it is not in the data hash."""
        request_data = await await_redis_response(self._redis.hget(self._data_key, unique_key))

        # hget may return str or bytes depending on client decode settings; both parse fine.
        if isinstance(request_data, (str, bytes, bytearray)):
            return Request.model_validate_json(request_data)

        return None

    @override
    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
        """Mark an in-progress request as handled, removing it from the queue's working data.

        Returns None (with a warning) if the request is not currently in progress.
        """
        # Check if the request is in progress.
        check_in_progress = await await_redis_response(self._redis.hexists(self._in_progress_key, request.unique_key))
        if not check_in_progress:
            logger.warning(f'Marking request {request.unique_key} as handled that is not in progress.')
            return None

        async with self._get_pipeline() as pipe:
            # Record the request as handled in the strategy-specific dedup structure.
            if self._dedup_strategy == 'default':
                await await_redis_response(pipe.sadd(self._handled_set_key, request.unique_key))
                await await_redis_response(pipe.srem(self._pending_set_key, request.unique_key))
            elif self._dedup_strategy == 'bloom':
                await await_redis_response(pipe.bf().add(self._handled_filter_key, request.unique_key))

            # Remove the request from the in-progress hash and drop its serialized payload.
            await await_redis_response(pipe.hdel(self._in_progress_key, request.unique_key))
            await await_redis_response(pipe.hdel(self._data_key, request.unique_key))

            await self._update_metadata(
                pipe,
                **_QueueMetadataUpdateParams(
                    update_accessed_at=True,
                    update_modified_at=True,
                    delta_handled_request_count=1,
                    delta_pending_request_count=-1,
                ),
            )

        return ProcessedRequest(
            unique_key=request.unique_key,
            was_already_present=True,
            was_already_handled=True,
        )

    @override
    async def reclaim_request(
        self,
        request: Request,
        *,
        forefront: bool = False,
    ) -> ProcessedRequest | None:
        """Return an in-progress request to the queue so it can be processed again.

        Args:
            request: The request to reclaim; must currently be in progress.
            forefront: If True, the request stays blocked for this client and is served next from
                the local cache; otherwise it is pushed to the back of the shared queue.

        Returns:
            The processing result, or None if the request was not in progress.
        """
        check_in_progress = await await_redis_response(self._redis.hexists(self._in_progress_key, request.unique_key))
        if not check_in_progress:
            logger.info(f'Reclaiming request {request.unique_key} that is not in progress.')
            return None

        async with self._get_pipeline() as pipe:
            if forefront:
                # Re-block the request for this client and serve it first from the local cache.
                blocked_until_timestamp = (
                    int(datetime.now(tz=timezone.utc).timestamp() * 1000) + self._BLOCK_REQUEST_TIME
                )

                await await_redis_response(
                    pipe.hset(
                        self._in_progress_key,
                        request.unique_key,
                        f'{{"client_id":"{self.client_key}","blocked_until_timestamp":{blocked_until_timestamp}}}',
                    )
                )
                self._pending_fetch_cache.appendleft(request)
            else:
                # Push back to the shared queue tail and release the in-progress claim.
                await await_redis_response(pipe.rpush(self._queue_key, request.unique_key))
                await await_redis_response(pipe.hset(self._data_key, request.unique_key, request.model_dump_json()))
                await await_redis_response(pipe.hdel(self._in_progress_key, request.unique_key))
            await self._update_metadata(
                pipe,
                **_QueueMetadataUpdateParams(
                    update_modified_at=True,
                    update_accessed_at=True,
                ),
            )

        return ProcessedRequest(
            unique_key=request.unique_key,
            was_already_present=True,
            was_already_handled=False,
        )

    @override
    async def is_empty(self) -> bool:
        """Check if the queue is empty.

        Returns:
            True if the queue is empty, False otherwise.
        """
        # Locally cached requests are still pending work for this client.
        if self._pending_fetch_cache:
            return False

        # Reclaim stale requests if needed (throttled to once per _RECLAIM_INTERVAL).
        if self._next_reclaim_stale is None or datetime.now(tz=timezone.utc) >= self._next_reclaim_stale:
            await self._reclaim_stale_requests()
            self._next_reclaim_stale = datetime.now(tz=timezone.utc) + self._RECLAIM_INTERVAL

        metadata = await self.get_metadata()

        return metadata.pending_request_count == 0

    async def _load_scripts(self) -> None:
        """Ensure Lua scripts are loaded in Redis."""
        self._fetch_script = await self._create_script('atomic_fetch_request.lua')
        self._reclaim_stale_script = await self._create_script('reclaim_stale_requests.lua')
        # The add-requests script variant depends on the dedup strategy.
        if self._dedup_strategy == 'bloom':
            self._add_requests_script = await self._create_script('atomic_bloom_add_requests.lua')
        elif self._dedup_strategy == 'default':
            self._add_requests_script = await self._create_script('atomic_set_add_requests.lua')

    @override
    async def _create_storage(self, pipeline: Pipeline) -> None:
        """Create strategy-specific Redis structures for a new queue.

        Only the 'bloom' strategy needs explicit creation; plain Redis sets used by the
        'default' strategy are created implicitly on first write.
        """
        # Create Bloom filters for added and handled requests
        if self._dedup_strategy == 'bloom':
            await await_redis_response(
                pipeline.bf().create(
                    self._added_filter_key, errorRate=self._bloom_error_rate, capacity=100000, expansion=10
                )
            )
            await await_redis_response(
                pipeline.bf().create(
                    self._handled_filter_key, errorRate=self._bloom_error_rate, capacity=100000, expansion=10
                )
            )

    async def _reclaim_stale_requests(self) -> None:
        """Reclaim requests that have been in progress for too long."""
        if self._reclaim_stale_script is None:
            # NOTE(review): see `add_batch_of_requests` — loader is `_load_scripts()`; confirm name.
            raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.')

        # Current time in epoch milliseconds; requests blocked past this point are reclaimed.
        current_time = int(datetime.now(tz=timezone.utc).timestamp() * 1000)

        await self._reclaim_stale_script(
            keys=[self._in_progress_key, self._queue_key, self._data_key], args=[current_time]
        )

    @override
    async def _specific_update_metadata(
        self,
        pipeline: Pipeline,
        *,
        delta_handled_request_count: int | None = None,
        new_handled_request_count: int | None = None,
        delta_pending_request_count: int | None = None,
        new_pending_request_count: int | None = None,
        delta_total_request_count: int | None = None,
        new_total_request_count: int | None = None,
        update_had_multiple_clients: bool = False,
        **_kwargs: Any,
    ) -> None:
        """Update the request queue metadata with current information.

        For each counter, an absolute `new_*` value takes precedence over a relative
        `delta_*` adjustment. All writes go through the supplied pipeline, so they are
        applied atomically with the caller's other commands.

        Args:
            pipeline: The Redis pipeline to use for the update.
            new_handled_request_count: If provided, update the handled_request_count to this value.
            new_pending_request_count: If provided, update the pending_request_count to this value.
            new_total_request_count: If provided, update the total_request_count to this value.
            delta_handled_request_count: If provided, add this value to the handled_request_count.
            delta_pending_request_count: If provided, add this value to the pending_request_count.
            delta_total_request_count: If provided, add this value to the total_request_count.
            update_had_multiple_clients: If True, set had_multiple_clients to True.
        """
        # xx=True: only update an existing metadata document, never create one here.
        if new_pending_request_count is not None:
            await await_redis_response(
                pipeline.json().set(
                    self.metadata_key, '$.pending_request_count', new_pending_request_count, nx=False, xx=True
                )
            )
        elif delta_pending_request_count is not None:
            await await_redis_response(
                pipeline.json().numincrby(self.metadata_key, '$.pending_request_count', delta_pending_request_count)
            )

        if new_handled_request_count is not None:
            await await_redis_response(
                pipeline.json().set(
                    self.metadata_key, '$.handled_request_count', new_handled_request_count, nx=False, xx=True
                )
            )
        elif delta_handled_request_count is not None:
            await await_redis_response(
                pipeline.json().numincrby(self.metadata_key, '$.handled_request_count', delta_handled_request_count)
            )

        if new_total_request_count is not None:
            await await_redis_response(
                pipeline.json().set(
                    self.metadata_key, '$.total_request_count', new_total_request_count, nx=False, xx=True
                )
            )
        elif delta_total_request_count is not None:
            await await_redis_response(
                pipeline.json().numincrby(self.metadata_key, '$.total_request_count', delta_total_request_count)
            )

        if update_had_multiple_clients:
            # Guarded by the flag, so this only ever writes True.
            await await_redis_response(
                pipeline.json().set(
                    self.metadata_key, '$.had_multiple_clients', update_had_multiple_clients, nx=False, xx=True
                )
            )
|