crawlee 0.6.13b17__py3-none-any.whl → 1.1.2b7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlee might be problematic.

Files changed (102)
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +35 -33
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +106 -34
  5. crawlee/_utils/context.py +2 -2
  6. crawlee/_utils/file.py +7 -0
  7. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  8. crawlee/_utils/recoverable_state.py +32 -8
  9. crawlee/_utils/recurring_task.py +17 -1
  10. crawlee/_utils/requests.py +0 -26
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +4 -2
  13. crawlee/_utils/system.py +3 -3
  14. crawlee/_utils/time.py +120 -0
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +4 -1
  17. crawlee/browsers/_playwright_browser_controller.py +21 -15
  18. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  19. crawlee/browsers/_types.py +1 -1
  20. crawlee/configuration.py +2 -0
  21. crawlee/crawlers/__init__.py +2 -1
  22. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  23. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +50 -12
  24. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  25. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  28. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  29. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  30. crawlee/crawlers/_basic/_basic_crawler.py +219 -126
  31. crawlee/crawlers/_basic/_logging_utils.py +5 -1
  32. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +60 -11
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/events/_event_manager.py +4 -4
  39. crawlee/events/_types.py +6 -6
  40. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/fingerprint_suite/_types.py +2 -2
  43. crawlee/http_clients/_base.py +4 -0
  44. crawlee/http_clients/_curl_impersonate.py +12 -0
  45. crawlee/http_clients/_httpx.py +16 -6
  46. crawlee/http_clients/_impit.py +25 -10
  47. crawlee/otel/crawler_instrumentor.py +3 -3
  48. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  49. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  50. crawlee/request_loaders/_request_list.py +3 -3
  51. crawlee/request_loaders/_request_loader.py +5 -1
  52. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  53. crawlee/sessions/_models.py +2 -2
  54. crawlee/sessions/_session_pool.py +1 -1
  55. crawlee/statistics/_error_snapshotter.py +1 -1
  56. crawlee/statistics/_models.py +43 -4
  57. crawlee/statistics/_statistics.py +24 -33
  58. crawlee/storage_clients/__init__.py +16 -0
  59. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  60. crawlee/storage_clients/_base/_storage_client.py +13 -0
  61. crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
  62. crawlee/storage_clients/_file_system/_key_value_store_client.py +30 -26
  63. crawlee/storage_clients/_file_system/_request_queue_client.py +169 -153
  64. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  65. crawlee/storage_clients/_file_system/_utils.py +0 -0
  66. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  67. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  68. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  69. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  70. crawlee/storage_clients/_redis/__init__.py +6 -0
  71. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  72. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  73. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  74. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  75. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  76. crawlee/storage_clients/_redis/_utils.py +23 -0
  77. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  78. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  79. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  80. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  81. crawlee/storage_clients/_redis/py.typed +0 -0
  82. crawlee/storage_clients/_sql/__init__.py +6 -0
  83. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  84. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  85. crawlee/storage_clients/_sql/_db_models.py +268 -0
  86. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  87. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  88. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  89. crawlee/storage_clients/_sql/py.typed +0 -0
  90. crawlee/storage_clients/models.py +13 -11
  91. crawlee/storages/_base.py +5 -1
  92. crawlee/storages/_dataset.py +12 -2
  93. crawlee/storages/_key_value_store.py +17 -4
  94. crawlee/storages/_request_queue.py +13 -5
  95. crawlee/storages/_storage_instance_manager.py +133 -71
  96. crawlee/storages/_utils.py +11 -0
  97. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +18 -6
  98. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +101 -78
  99. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
  100. crawlee/_utils/measure_time.py +0 -31
  101. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
  102. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
crawlee/storage_clients/_redis/_request_queue_client.py
@@ -0,0 +1,586 @@
+from __future__ import annotations
+
+import json
+from collections import deque
+from datetime import datetime, timedelta, timezone
+from logging import getLogger
+from typing import TYPE_CHECKING, Any, Literal
+
+from typing_extensions import NotRequired, override
+
+from crawlee import Request
+from crawlee._utils.crypto import crypto_random_object_id
+from crawlee.storage_clients._base import RequestQueueClient
+from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
+
+from ._client_mixin import MetadataUpdateParams, RedisClientMixin
+from ._utils import await_redis_response
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+    from redis.asyncio import Redis
+    from redis.asyncio.client import Pipeline
+    from redis.commands.core import AsyncScript
+
+logger = getLogger(__name__)
+
+
+class _QueueMetadataUpdateParams(MetadataUpdateParams):
+    """Parameters for updating queue metadata."""
+
+    new_handled_request_count: NotRequired[int]
+    new_pending_request_count: NotRequired[int]
+    new_total_request_count: NotRequired[int]
+    delta_handled_request_count: NotRequired[int]
+    delta_pending_request_count: NotRequired[int]
+    delta_total_request_count: NotRequired[int]
+    recalculate: NotRequired[bool]
+    update_had_multiple_clients: NotRequired[bool]
+
+
+class RedisRequestQueueClient(RequestQueueClient, RedisClientMixin):
+    """Redis implementation of the request queue client.
+
+    This client persists requests to Redis using multiple data structures for efficient queue operations,
+    deduplication, and concurrent access safety. Requests are stored with FIFO ordering and support
+    both regular and forefront (high-priority) insertion modes.
+
+    The implementation uses Bloom filters for efficient request deduplication and Redis lists for
+    queue operations. Request blocking and client coordination is handled through Redis hashes
+    with timestamp-based expiration for stale request recovery.
+
+    The request queue data is stored in Redis using the following key patterns:
+    - `request_queues:{name}:queue` - Redis list for FIFO request ordering
+    - `request_queues:{name}:data` - Redis hash storing serialized Request objects by unique_key
+    - `request_queues:{name}:in_progress` - Redis hash tracking requests currently being processed
+    - `request_queues:{name}:added_bloom_filter` - Bloom filter for added request deduplication (`bloom` dedup_strategy)
+    - `request_queues:{name}:handled_bloom_filter` - Bloom filter for completed request tracking (`bloom`
+      dedup_strategy)
+    - `request_queues:{name}:pending_set` - Redis set for added request deduplication (`default` dedup_strategy)
+    - `request_queues:{name}:handled_set` - Redis set for completed request tracking (`default` dedup_strategy)
+    - `request_queues:{name}:metadata` - Redis JSON object containing queue metadata
+
+    Requests are serialized to JSON for storage and maintain proper FIFO ordering through Redis list
+    operations. The implementation provides concurrent access safety through atomic Lua scripts,
+    Bloom filter operations, and Redis's built-in atomicity guarantees for individual operations.
+    """
+
+    _DEFAULT_NAME = 'default'
+    """Default Request Queue name key prefix when none provided."""
+
+    _MAIN_KEY = 'request_queues'
+    """Main Redis key prefix for Request Queue."""
+
+    _CLIENT_TYPE = 'Request queue'
+    """Human-readable client type for error messages."""
+
+    _MAX_BATCH_FETCH_SIZE = 10
+    """Maximum number of requests to fetch in a single batch operation."""
+
+    _BLOCK_REQUEST_TIME = 300_000  # milliseconds
+    """Time in milliseconds to block a fetched request for other clients before it can be autoreclaimed."""
+
+    _RECLAIM_INTERVAL = timedelta(seconds=30)
+    """Interval to check for stale requests to reclaim."""
+
+    def __init__(
+        self,
+        storage_name: str,
+        storage_id: str,
+        redis: Redis,
+        dedup_strategy: Literal['default', 'bloom'] = 'default',
+        bloom_error_rate: float = 1e-7,
+    ) -> None:
+        """Initialize a new instance.
+
+        Preferably use the `RedisRequestQueueClient.open` class method to create a new instance.
+        """
+        super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis)
+
+        self._dedup_strategy = dedup_strategy
+        """Deduplication strategy for the queue."""
+
+        self._bloom_error_rate = bloom_error_rate
+        """Desired false positive rate for Bloom filters."""
+
+        self._pending_fetch_cache: deque[Request] = deque()
+        """Cache for requests: ordered by sequence number."""
+
+        self.client_key = crypto_random_object_id(length=32)[:32]
+        """Unique identifier for this client instance."""
+
+        # Lua scripts for atomic operations
+        self._fetch_script: AsyncScript | None = None
+        self._reclaim_stale_script: AsyncScript | None = None
+        self._add_requests_script: AsyncScript | None = None
+
+        self._next_reclaim_stale: None | datetime = None
+
+    @property
+    def _added_filter_key(self) -> str:
+        """Return the Redis key for the added requests Bloom filter."""
+        if self._dedup_strategy != 'bloom':
+            raise RuntimeError('The added requests filter is only available with the bloom deduplication strategy.')
+        return f'{self._MAIN_KEY}:{self._storage_name}:added_bloom_filter'
+
+    @property
+    def _handled_filter_key(self) -> str:
+        """Return the Redis key for the handled requests Bloom filter."""
+        if self._dedup_strategy != 'bloom':
+            raise RuntimeError('The handled requests filter is only available with the bloom deduplication strategy.')
+        return f'{self._MAIN_KEY}:{self._storage_name}:handled_bloom_filter'
+
+    @property
+    def _pending_set_key(self) -> str:
+        """Return the Redis key for the pending requests set."""
+        if self._dedup_strategy != 'default':
+            raise RuntimeError('The pending requests set is only available with the default deduplication strategy.')
+        return f'{self._MAIN_KEY}:{self._storage_name}:pending_set'
+
+    @property
+    def _handled_set_key(self) -> str:
+        """Return the Redis key for the handled requests set."""
+        if self._dedup_strategy != 'default':
+            raise RuntimeError('The handled requests set is only available with the default deduplication strategy.')
+        return f'{self._MAIN_KEY}:{self._storage_name}:handled_set'
+
+    @property
+    def _queue_key(self) -> str:
+        """Return the Redis key for the request queue."""
+        return f'{self._MAIN_KEY}:{self._storage_name}:queue'
+
+    @property
+    def _data_key(self) -> str:
+        """Return the Redis key for the request data hash."""
+        return f'{self._MAIN_KEY}:{self._storage_name}:data'
+
+    @property
+    def _in_progress_key(self) -> str:
+        """Return the Redis key for the in-progress requests hash."""
+        return f'{self._MAIN_KEY}:{self._storage_name}:in_progress'
+
+    @classmethod
+    async def open(
+        cls,
+        *,
+        id: str | None,
+        name: str | None,
+        alias: str | None,
+        redis: Redis,
+        dedup_strategy: Literal['default', 'bloom'] = 'default',
+        bloom_error_rate: float = 1e-7,
+    ) -> RedisRequestQueueClient:
+        """Open or create a new Redis request queue client.
+
+        This method attempts to open an existing request queue from the Redis database. If a queue with the specified
+        ID or name exists, it loads the metadata from the database. If no existing queue is found, a new one
+        is created.
+
+        Args:
+            id: The ID of the request queue. If not provided, a random ID will be generated.
+            name: The name of the request queue for named (global scope) storages.
+            alias: The alias of the request queue for unnamed (run scope) storages.
+            redis: Redis client instance.
+            dedup_strategy: Strategy for request queue deduplication. Options are:
+                - 'default': Uses Redis sets for exact deduplication.
+                - 'bloom': Uses Redis Bloom filters for probabilistic deduplication with lower memory usage. With
+                  this approach, there is a small probability (given by `bloom_error_rate`) that a duplicate check
+                  yields a false positive and a request is skipped.
+            bloom_error_rate: Desired false positive rate for Bloom filter deduplication. Only relevant if
+                `dedup_strategy` is set to 'bloom'.
+
+        Returns:
+            An instance for the opened or created storage client.
+        """
+        return await cls._open(
+            id=id,
+            name=name,
+            alias=alias,
+            redis=redis,
+            metadata_model=RequestQueueMetadata,
+            extra_metadata_fields={
+                'had_multiple_clients': False,
+                'handled_request_count': 0,
+                'pending_request_count': 0,
+                'total_request_count': 0,
+            },
+            instance_kwargs={'dedup_strategy': dedup_strategy, 'bloom_error_rate': bloom_error_rate},
+        )
+
+    @override
+    async def get_metadata(self) -> RequestQueueMetadata:
+        return await self._get_metadata(RequestQueueMetadata)
+
+    @override
+    async def drop(self) -> None:
+        if self._dedup_strategy == 'bloom':
+            extra_keys = [self._added_filter_key, self._handled_filter_key]
+        elif self._dedup_strategy == 'default':
+            extra_keys = [self._pending_set_key, self._handled_set_key]
+        else:
+            raise RuntimeError(f'Unknown deduplication strategy: {self._dedup_strategy}')
+        extra_keys.extend([self._queue_key, self._data_key, self._in_progress_key])
+        await self._drop(extra_keys=extra_keys)
+
+    @override
+    async def purge(self) -> None:
+        if self._dedup_strategy == 'bloom':
+            extra_keys = [self._added_filter_key, self._handled_filter_key]
+        elif self._dedup_strategy == 'default':
+            extra_keys = [self._pending_set_key, self._handled_set_key]
+        else:
+            raise RuntimeError(f'Unknown deduplication strategy: {self._dedup_strategy}')
+        extra_keys.extend([self._queue_key, self._data_key, self._in_progress_key])
+        await self._purge(
+            extra_keys=extra_keys,
+            metadata_kwargs=_QueueMetadataUpdateParams(
+                update_accessed_at=True,
+                update_modified_at=True,
+                new_pending_request_count=0,
+            ),
+        )
+
+    @override
+    async def add_batch_of_requests(
+        self,
+        requests: Sequence[Request],
+        *,
+        forefront: bool = False,
+    ) -> AddRequestsResponse:
+        # Mypy workaround
+        if self._add_requests_script is None:
+            raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.')
+
+        processed_requests = []
+
+        delta_pending = 0
+        delta_total = 0
+
+        requests_by_unique_key = {req.unique_key: req for req in requests}
+        unique_keys = list(requests_by_unique_key.keys())
+        # Check which requests are already added or handled
+        async with self._get_pipeline(with_execute=False) as pipe:
+            if self._dedup_strategy == 'default':
+                await await_redis_response(pipe.smismember(self._pending_set_key, unique_keys))
+                await await_redis_response(pipe.smismember(self._handled_set_key, unique_keys))
+            elif self._dedup_strategy == 'bloom':
+                await await_redis_response(pipe.bf().mexists(self._added_filter_key, *unique_keys))  # type: ignore[no-untyped-call]
+                await await_redis_response(pipe.bf().mexists(self._handled_filter_key, *unique_keys))  # type: ignore[no-untyped-call]
+
+            pipe_results = await pipe.execute()
+
+        added_pending_flags = pipe_results[0]
+        handled_flags = pipe_results[1]
+
+        new_unique_keys = []
+        new_request_data = {}
+        delta_pending = 0
+        delta_total = 0
+
+        for i, unique_key in enumerate(unique_keys):
+            # Already handled - skip
+            if handled_flags[i]:
+                processed_requests.append(
+                    ProcessedRequest(
+                        unique_key=unique_key,
+                        was_already_present=True,
+                        was_already_handled=True,
+                    )
+                )
+                continue
+
+            # Already in queue - skip
+            if added_pending_flags[i]:
+                processed_requests.append(
+                    ProcessedRequest(
+                        unique_key=unique_key,
+                        was_already_present=True,
+                        was_already_handled=False,
+                    )
+                )
+                continue
+
+            # New request - will add to queue
+            request = requests_by_unique_key[unique_key]
+
+            new_unique_keys.append(unique_key)
+            new_request_data[unique_key] = request.model_dump_json()
+
+        if new_unique_keys:
+            # Add new requests to the queue atomically, get back which were actually added
+            script_results = await self._add_requests_script(
+                keys=[
+                    self._added_filter_key if self._dedup_strategy == 'bloom' else self._pending_set_key,
+                    self._queue_key,
+                    self._data_key,
+                ],
+                args=[int(forefront), json.dumps(new_unique_keys), json.dumps(new_request_data)],
+            )
+            actually_added = set(json.loads(script_results))
+
+            delta_pending = len(actually_added)
+            delta_total = len(actually_added)
+
+            processed_requests.extend(
+                [
+                    ProcessedRequest(
+                        unique_key=unique_key,
+                        was_already_present=unique_key not in actually_added,
+                        was_already_handled=False,
+                    )
+                    for unique_key in new_unique_keys
+                ]
+            )
+
+        async with self._get_pipeline() as pipe:
+            await self._update_metadata(
+                pipe,
+                **_QueueMetadataUpdateParams(
+                    update_accessed_at=True,
+                    update_modified_at=True,
+                    delta_pending_request_count=delta_pending,
+                    delta_total_request_count=delta_total,
+                ),
+            )
+
+        return AddRequestsResponse(
+            processed_requests=processed_requests,
+            unprocessed_requests=[],
+        )
+
+    @override
+    async def fetch_next_request(self) -> Request | None:
+        if self._pending_fetch_cache:
+            return self._pending_fetch_cache.popleft()
+
+        # Mypy workaround
+        if self._fetch_script is None:
+            raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.')
+
+        blocked_until_timestamp = int(datetime.now(tz=timezone.utc).timestamp() * 1000) + self._BLOCK_REQUEST_TIME
+
+        # The script retrieves requests from the queue and places them in the in_progress hash.
+        requests_json = await self._fetch_script(
+            keys=[self._queue_key, self._in_progress_key, self._data_key],
+            args=[self.client_key, blocked_until_timestamp, self._MAX_BATCH_FETCH_SIZE],
+        )
+
+        async with self._get_pipeline() as pipe:
+            await self._update_metadata(pipe, **_QueueMetadataUpdateParams(update_accessed_at=True))
+
+        if not requests_json:
+            return None
+
+        requests = [Request.model_validate_json(req_json) for req_json in requests_json]
+
+        self._pending_fetch_cache.extend(requests[1:])
+
+        return requests[0]
+
+    @override
+    async def get_request(self, unique_key: str) -> Request | None:
+        request_data = await await_redis_response(self._redis.hget(self._data_key, unique_key))
+
+        if isinstance(request_data, (str, bytes, bytearray)):
+            return Request.model_validate_json(request_data)
+
+        return None
+
+    @override
+    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
+        # Check if the request is in progress.
+        check_in_progress = await await_redis_response(self._redis.hexists(self._in_progress_key, request.unique_key))
+        if not check_in_progress:
+            logger.warning(f'Marking request {request.unique_key} as handled that is not in progress.')
+            return None
+
+        async with self._get_pipeline() as pipe:
+            if self._dedup_strategy == 'default':
+                await await_redis_response(pipe.sadd(self._handled_set_key, request.unique_key))
+                await await_redis_response(pipe.srem(self._pending_set_key, request.unique_key))
+            elif self._dedup_strategy == 'bloom':
+                await await_redis_response(pipe.bf().add(self._handled_filter_key, request.unique_key))  # type: ignore[no-untyped-call]
+
+            await await_redis_response(pipe.hdel(self._in_progress_key, request.unique_key))
+            await await_redis_response(pipe.hdel(self._data_key, request.unique_key))
+
+            await self._update_metadata(
+                pipe,
+                **_QueueMetadataUpdateParams(
+                    update_accessed_at=True,
+                    update_modified_at=True,
+                    delta_handled_request_count=1,
+                    delta_pending_request_count=-1,
+                ),
+            )
+
+        return ProcessedRequest(
+            unique_key=request.unique_key,
+            was_already_present=True,
+            was_already_handled=True,
+        )
+
+    @override
+    async def reclaim_request(
+        self,
+        request: Request,
+        *,
+        forefront: bool = False,
+    ) -> ProcessedRequest | None:
+        check_in_progress = await await_redis_response(self._redis.hexists(self._in_progress_key, request.unique_key))
+        if not check_in_progress:
+            logger.info(f'Reclaiming request {request.unique_key} that is not in progress.')
+            return None
+
+        async with self._get_pipeline() as pipe:
+            if forefront:
+                blocked_until_timestamp = (
+                    int(datetime.now(tz=timezone.utc).timestamp() * 1000) + self._BLOCK_REQUEST_TIME
+                )
+
+                await await_redis_response(
+                    pipe.hset(
+                        self._in_progress_key,
+                        request.unique_key,
+                        f'{{"client_id":"{self.client_key}","blocked_until_timestamp":{blocked_until_timestamp}}}',
+                    )
+                )
+                self._pending_fetch_cache.appendleft(request)
+            else:
+                await await_redis_response(pipe.rpush(self._queue_key, request.unique_key))
+                await await_redis_response(pipe.hset(self._data_key, request.unique_key, request.model_dump_json()))
+                await await_redis_response(pipe.hdel(self._in_progress_key, request.unique_key))
+            await self._update_metadata(
+                pipe,
+                **_QueueMetadataUpdateParams(
+                    update_modified_at=True,
+                    update_accessed_at=True,
+                ),
+            )
+
+        return ProcessedRequest(
+            unique_key=request.unique_key,
+            was_already_present=True,
+            was_already_handled=False,
+        )
+
+    @override
+    async def is_empty(self) -> bool:
+        """Check if the queue is empty.
+
+        Returns:
+            True if the queue is empty, False otherwise.
+        """
+        if self._pending_fetch_cache:
+            return False
+
+        # Reclaim stale requests if needed
+        if self._next_reclaim_stale is None or datetime.now(tz=timezone.utc) >= self._next_reclaim_stale:
+            await self._reclaim_stale_requests()
+            self._next_reclaim_stale = datetime.now(tz=timezone.utc) + self._RECLAIM_INTERVAL
+
+        metadata = await self.get_metadata()
+
+        return metadata.pending_request_count == 0
+
+    async def _load_scripts(self) -> None:
+        """Ensure Lua scripts are loaded in Redis."""
+        self._fetch_script = await self._create_script('atomic_fetch_request.lua')
+        self._reclaim_stale_script = await self._create_script('reclaim_stale_requests.lua')
+        if self._dedup_strategy == 'bloom':
+            self._add_requests_script = await self._create_script('atomic_bloom_add_requests.lua')
+        elif self._dedup_strategy == 'default':
+            self._add_requests_script = await self._create_script('atomic_set_add_requests.lua')
+
+    @override
+    async def _create_storage(self, pipeline: Pipeline) -> None:
+        # Create Bloom filters for added and handled requests
+        if self._dedup_strategy == 'bloom':
+            await await_redis_response(
+                pipeline.bf().create(
+                    self._added_filter_key, errorRate=self._bloom_error_rate, capacity=100000, expansion=10
+                )  # type: ignore[no-untyped-call]
+            )
+            await await_redis_response(
+                pipeline.bf().create(
+                    self._handled_filter_key, errorRate=self._bloom_error_rate, capacity=100000, expansion=10
+                )  # type: ignore[no-untyped-call]
+            )
+
+    async def _reclaim_stale_requests(self) -> None:
+        """Reclaim requests that have been in progress for too long."""
+        # Mypy workaround
+        if self._reclaim_stale_script is None:
+            raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.')
+
+        current_time = int(datetime.now(tz=timezone.utc).timestamp() * 1000)
+
+        await self._reclaim_stale_script(
+            keys=[self._in_progress_key, self._queue_key, self._data_key], args=[current_time]
+        )
+
+    @override
+    async def _specific_update_metadata(
+        self,
+        pipeline: Pipeline,
+        *,
+        delta_handled_request_count: int | None = None,
+        new_handled_request_count: int | None = None,
+        delta_pending_request_count: int | None = None,
+        new_pending_request_count: int | None = None,
+        delta_total_request_count: int | None = None,
+        new_total_request_count: int | None = None,
+        update_had_multiple_clients: bool = False,
+        **_kwargs: Any,
+    ) -> None:
+        """Update the request queue metadata with current information.
+
+        Args:
+            pipeline: The Redis pipeline to use for the update.
+            new_handled_request_count: If provided, update the handled_request_count to this value.
+            new_pending_request_count: If provided, update the pending_request_count to this value.
+            new_total_request_count: If provided, update the total_request_count to this value.
+            delta_handled_request_count: If provided, add this value to the handled_request_count.
+            delta_pending_request_count: If provided, add this value to the pending_request_count.
+            delta_total_request_count: If provided, add this value to the total_request_count.
+            update_had_multiple_clients: If True, set had_multiple_clients to True.
+        """
+        if new_pending_request_count is not None:
+            await await_redis_response(
+                pipeline.json().set(
+                    self.metadata_key, '$.pending_request_count', new_pending_request_count, nx=False, xx=True
+                )
+            )
+        elif delta_pending_request_count is not None:
+            await await_redis_response(
+                pipeline.json().numincrby(self.metadata_key, '$.pending_request_count', delta_pending_request_count)
+            )
+
+        if new_handled_request_count is not None:
+            await await_redis_response(
+                pipeline.json().set(
+                    self.metadata_key, '$.handled_request_count', new_handled_request_count, nx=False, xx=True
+                )
+            )
+        elif delta_handled_request_count is not None:
+            await await_redis_response(
+                pipeline.json().numincrby(self.metadata_key, '$.handled_request_count', delta_handled_request_count)
+            )
+
+        if new_total_request_count is not None:
+            await await_redis_response(
+                pipeline.json().set(
+                    self.metadata_key, '$.total_request_count', new_total_request_count, nx=False, xx=True
+                )
+            )
+        elif delta_total_request_count is not None:
+            await await_redis_response(
+                pipeline.json().numincrby(self.metadata_key, '$.total_request_count', delta_total_request_count)
+            )
+
+        if update_had_multiple_clients:
+            await await_redis_response(
+                pipeline.json().set(
+                    self.metadata_key, '$.had_multiple_clients', update_had_multiple_clients, nx=False, xx=True
+                )
+            )
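
For orientation, a minimal usage sketch of the new client follows. It is an illustration assembled from the signatures visible in this diff, not documentation of the release: the import path crawlee.storage_clients._redis is inferred from the file layout above, and the sketch assumes a local Redis instance and that RedisRequestQueueClient.open (via the mixin's _open helper) takes care of creating the storage and loading the Lua scripts. Request.from_url and Redis.from_url are the standard crawlee and redis-py constructors.

import asyncio

from redis.asyncio import Redis

from crawlee import Request
from crawlee.storage_clients._redis import RedisRequestQueueClient  # assumed export


async def main() -> None:
    redis = Redis.from_url('redis://localhost:6379')

    # Open (or create) a named queue; 'bloom' would trade exactness for memory.
    queue = await RedisRequestQueueClient.open(
        id=None,
        name='my-crawl',
        alias=None,
        redis=redis,
        dedup_strategy='default',
    )

    # Enqueue a batch; duplicates are reported via `was_already_present`.
    response = await queue.add_batch_of_requests(
        [Request.from_url('https://example.com'), Request.from_url('https://example.org')]
    )
    print([p.was_already_present for p in response.processed_requests])

    # Fetch-process-handle loop; failures go back through `reclaim_request`.
    while not await queue.is_empty():
        request = await queue.fetch_next_request()
        if request is None:
            break
        try:
            ...  # process the request here
        except Exception:
            await queue.reclaim_request(request)  # re-enqueue (or local cache, if forefront=True)
        else:
            await queue.mark_request_as_handled(request)

    await redis.aclose()


asyncio.run(main())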
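
The dedup_strategy option comes down to which Redis primitive answers "has this unique_key been seen?". The sketch below exercises both primitives directly with redis-py, mirroring the smismember and bf().mexists calls in add_batch_of_requests; the demo:* key names are made up, and the bf() commands require the RedisBloom module (e.g. Redis Stack).

import asyncio

from redis.asyncio import Redis


async def main() -> None:
    r = Redis.from_url('redis://localhost:6379')
    keys = ['https://a.example', 'https://b.example']

    # 'default' strategy: a plain set gives exact membership answers.
    await r.sadd('demo:pending_set', keys[0])
    print(await r.smismember('demo:pending_set', keys))  # [1, 0] - exact

    # 'bloom' strategy: a Bloom filter answers probabilistically in less memory;
    # a false positive (rate ~ errorRate) would make the queue skip a request.
    await r.bf().create('demo:added_bloom_filter', errorRate=1e-7, capacity=100_000)
    await r.bf().add('demo:added_bloom_filter', keys[0])
    print(await r.bf().mexists('demo:added_bloom_filter', *keys))  # [1, 0] - probabilistic

    await r.delete('demo:pending_set', 'demo:added_bloom_filter')
    await r.aclose()


asyncio.run(main())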
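
Queue counters live in a RedisJSON document, and _specific_update_metadata updates them in place: absolute values via JSON.SET with xx=True (write only if the path already exists) and deltas via JSON.NUMINCRBY. A standalone sketch of that pattern, using a hypothetical key and requiring the RedisJSON module:

import asyncio

from redis.asyncio import Redis


async def main() -> None:
    r = Redis.from_url('redis://localhost:6379')
    meta_key = 'demo:metadata'  # hypothetical, mirrors request_queues:{name}:metadata

    # Seed the JSON document once.
    await r.json().set(meta_key, '$', {'pending_request_count': 0, 'handled_request_count': 0})

    # Delta update: atomically increment a counter inside the document.
    await r.json().numincrby(meta_key, '$.pending_request_count', 5)

    # Absolute update: xx=True only writes if the path already exists.
    await r.json().set(meta_key, '$.pending_request_count', 0, xx=True)

    print(await r.json().get(meta_key))
    await r.delete(meta_key)
    await r.aclose()


asyncio.run(main())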