crawlee 0.6.13b17__py3-none-any.whl → 1.1.2b7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crawlee has been flagged as potentially problematic.

Files changed (102)
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +35 -33
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +106 -34
  5. crawlee/_utils/context.py +2 -2
  6. crawlee/_utils/file.py +7 -0
  7. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  8. crawlee/_utils/recoverable_state.py +32 -8
  9. crawlee/_utils/recurring_task.py +17 -1
  10. crawlee/_utils/requests.py +0 -26
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +4 -2
  13. crawlee/_utils/system.py +3 -3
  14. crawlee/_utils/time.py +120 -0
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +4 -1
  17. crawlee/browsers/_playwright_browser_controller.py +21 -15
  18. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  19. crawlee/browsers/_types.py +1 -1
  20. crawlee/configuration.py +2 -0
  21. crawlee/crawlers/__init__.py +2 -1
  22. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  23. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +50 -12
  24. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  25. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  28. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  29. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  30. crawlee/crawlers/_basic/_basic_crawler.py +219 -126
  31. crawlee/crawlers/_basic/_logging_utils.py +5 -1
  32. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +60 -11
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/events/_event_manager.py +4 -4
  39. crawlee/events/_types.py +6 -6
  40. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/fingerprint_suite/_types.py +2 -2
  43. crawlee/http_clients/_base.py +4 -0
  44. crawlee/http_clients/_curl_impersonate.py +12 -0
  45. crawlee/http_clients/_httpx.py +16 -6
  46. crawlee/http_clients/_impit.py +25 -10
  47. crawlee/otel/crawler_instrumentor.py +3 -3
  48. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  49. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  50. crawlee/request_loaders/_request_list.py +3 -3
  51. crawlee/request_loaders/_request_loader.py +5 -1
  52. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  53. crawlee/sessions/_models.py +2 -2
  54. crawlee/sessions/_session_pool.py +1 -1
  55. crawlee/statistics/_error_snapshotter.py +1 -1
  56. crawlee/statistics/_models.py +43 -4
  57. crawlee/statistics/_statistics.py +24 -33
  58. crawlee/storage_clients/__init__.py +16 -0
  59. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  60. crawlee/storage_clients/_base/_storage_client.py +13 -0
  61. crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
  62. crawlee/storage_clients/_file_system/_key_value_store_client.py +30 -26
  63. crawlee/storage_clients/_file_system/_request_queue_client.py +169 -153
  64. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  65. crawlee/storage_clients/_file_system/_utils.py +0 -0
  66. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  67. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  68. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  69. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  70. crawlee/storage_clients/_redis/__init__.py +6 -0
  71. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  72. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  73. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  74. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  75. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  76. crawlee/storage_clients/_redis/_utils.py +23 -0
  77. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  78. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  79. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  80. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  81. crawlee/storage_clients/_redis/py.typed +0 -0
  82. crawlee/storage_clients/_sql/__init__.py +6 -0
  83. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  84. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  85. crawlee/storage_clients/_sql/_db_models.py +268 -0
  86. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  87. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  88. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  89. crawlee/storage_clients/_sql/py.typed +0 -0
  90. crawlee/storage_clients/models.py +13 -11
  91. crawlee/storages/_base.py +5 -1
  92. crawlee/storages/_dataset.py +12 -2
  93. crawlee/storages/_key_value_store.py +17 -4
  94. crawlee/storages/_request_queue.py +13 -5
  95. crawlee/storages/_storage_instance_manager.py +133 -71
  96. crawlee/storages/_utils.py +11 -0
  97. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +18 -6
  98. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +101 -78
  99. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
  100. crawlee/_utils/measure_time.py +0 -31
  101. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
  102. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
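
The diff below reproduces the largest new file in this release, crawlee/storage_clients/_sql/_request_queue_client.py (+720 lines, item 87 in the list above); a few short sketches illustrating its internals follow the diff. To set the scene, here is a minimal sketch of how the new SQL backend might be enabled. It assumes that SqlStorageClient is re-exported from crawlee.storage_clients (its module is added in this diff) and that it accepts a connection_string argument; neither detail is confirmed by the diff itself, and the connection URL is a placeholder.

import asyncio

from crawlee import service_locator
from crawlee.storage_clients import SqlStorageClient
from crawlee.storages import RequestQueue


async def main() -> None:
    # Assumed constructor signature; if correct, any async SQLAlchemy URL works here.
    service_locator.set_storage_client(
        SqlStorageClient(connection_string='sqlite+aiosqlite:///crawlee.db')
    )

    # Standard crawlee storage API; the queue is now backed by the SQL tables
    # described in the class docstring below.
    rq = await RequestQueue.open(name='demo')
    await rq.add_request('https://crawlee.dev')

    request = await rq.fetch_next_request()
    if request is not None:
        await rq.mark_request_as_handled(request)


if __name__ == '__main__':
    asyncio.run(main())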
@@ -0,0 +1,720 @@
+from __future__ import annotations
+
+from collections import deque
+from datetime import datetime, timedelta, timezone
+from functools import lru_cache
+from hashlib import sha256
+from logging import getLogger
+from typing import TYPE_CHECKING, Any, cast
+
+from sqlalchemy import CursorResult, func, or_, select, update
+from sqlalchemy.exc import SQLAlchemyError
+from sqlalchemy.orm import load_only
+from typing_extensions import NotRequired, Self, override
+
+from crawlee import Request
+from crawlee._utils.crypto import crypto_random_object_id
+from crawlee.storage_clients._base import RequestQueueClient
+from crawlee.storage_clients.models import (
+    AddRequestsResponse,
+    ProcessedRequest,
+    RequestQueueMetadata,
+    UnprocessedRequest,
+)
+
+from ._client_mixin import MetadataUpdateParams, SqlClientMixin
+from ._db_models import RequestDb, RequestQueueMetadataDb, RequestQueueStateDb
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+    from sqlalchemy.ext.asyncio import AsyncSession
+
+    from ._storage_client import SqlStorageClient
+
+
+logger = getLogger(__name__)
+
+
+class _QueueMetadataUpdateParams(MetadataUpdateParams):
+    """Parameters for updating queue metadata."""
+
+    new_handled_request_count: NotRequired[int]
+    new_pending_request_count: NotRequired[int]
+    new_total_request_count: NotRequired[int]
+    delta_handled_request_count: NotRequired[int]
+    delta_pending_request_count: NotRequired[int]
+    recalculate: NotRequired[bool]
+    update_had_multiple_clients: NotRequired[bool]
+
+
+class SqlRequestQueueClient(RequestQueueClient, SqlClientMixin):
+    """SQL implementation of the request queue client.
+
+    This client persists requests to a SQL database with transaction handling and
+    concurrent access safety. Requests are stored with sequence-based ordering and
+    efficient querying capabilities.
+
+    The implementation uses negative sequence numbers for forefront (high-priority) requests
+    and positive sequence numbers for regular requests, allowing for efficient single-query
+    ordering. A cache mechanism reduces database queries.
+
+    The request queue data is stored in SQL database tables following the pattern:
+    - `request_queues` table: Contains queue metadata (id, name, timestamps, request counts, multi-client flag)
+    - `request_queue_records` table: Contains individual requests with JSON data, unique keys for deduplication,
+      sequence numbers for ordering, and processing status flags
+    - `request_queue_state` table: Maintains counters for sequence numbers to ensure proper ordering of requests
+
+    Requests are serialized to JSON for storage and maintain proper ordering through sequence
+    numbers. The implementation provides concurrent access safety through transaction
+    handling, locking mechanisms, and optimized database indexes for efficient querying.
+    """
+
+    _DEFAULT_NAME = 'default'
+    """Default request queue name used when no name is provided."""
+
+    _MAX_BATCH_FETCH_SIZE = 10
+    """Maximum number of requests to fetch from the database in a single batch operation.
+
+    Used to limit the number of requests loaded and locked for processing at once (improves efficiency and reduces
+    database load).
+    """
+
+    _METADATA_TABLE = RequestQueueMetadataDb
+    """SQLAlchemy model for request queue metadata."""
+
+    _ITEM_TABLE = RequestDb
+    """SQLAlchemy model for request items."""
+
+    _CLIENT_TYPE = 'Request queue'
+    """Human-readable client type for error messages."""
+
+    _BLOCK_REQUEST_TIME = 300
+    """Number of seconds for which a request is considered blocked in the database after being fetched for processing.
+    """
+
+    def __init__(
+        self,
+        *,
+        id: str,
+        storage_client: SqlStorageClient,
+    ) -> None:
+        """Initialize a new instance.
+
+        Preferably use the `SqlRequestQueueClient.open` class method to create a new instance.
+        """
+        super().__init__(id=id, storage_client=storage_client)
+
+        self._pending_fetch_cache: deque[Request] = deque()
+        """Cache for requests, ordered by sequence number."""
+
+        self.client_key = crypto_random_object_id(length=32)[:32]
+        """Unique identifier for this client instance."""
+
+    @classmethod
+    async def open(
+        cls,
+        *,
+        id: str | None,
+        name: str | None,
+        alias: str | None,
+        storage_client: SqlStorageClient,
+    ) -> Self:
+        """Open an existing request queue or create a new one.
+
+        This method first tries to find an existing queue by ID or name.
+        If found, it returns a client for that queue. If not found, it creates
+        a new queue with the specified parameters.
+
+        Args:
+            id: The ID of the request queue to open. Takes precedence over name.
+            name: The name of the request queue for named (global scope) storages.
+            alias: The alias of the request queue for unnamed (run scope) storages.
+            storage_client: The SQL storage client used to access the database.
+
+        Returns:
+            An instance for the opened or created request queue.
+
+        Raises:
+            ValueError: If a queue with the specified ID is not found.
+        """
+        return await cls._safely_open(
+            id=id,
+            name=name,
+            alias=alias,
+            storage_client=storage_client,
+            metadata_model=RequestQueueMetadata,
+            extra_metadata_fields={
+                'had_multiple_clients': False,
+                'handled_request_count': 0,
+                'pending_request_count': 0,
+                'total_request_count': 0,
+            },
+        )
+
+    @override
+    async def get_metadata(self) -> RequestQueueMetadata:
+        # The database is the single source of truth.
+        return await self._get_metadata(RequestQueueMetadata)
+
+    @override
+    async def drop(self) -> None:
+        """Delete this request queue and all its records from the database.
+
+        This operation is irreversible. Uses CASCADE deletion to remove all related records.
+        """
+        await self._drop()
+
+        self._pending_fetch_cache.clear()
+
+    @override
+    async def purge(self) -> None:
+        """Remove all items from this request queue while keeping the queue itself.
+
+        Resets pending_request_count and handled_request_count to 0 and deletes all records from the
+        request_queue_records table.
+        """
+        await self._purge(
+            metadata_kwargs=_QueueMetadataUpdateParams(
+                update_accessed_at=True,
+                update_modified_at=True,
+                new_pending_request_count=0,
+                force=True,
+            )
+        )
+
+        # Clear recoverable state
+        self._pending_fetch_cache.clear()
+
+    @override
+    async def add_batch_of_requests(
+        self,
+        requests: Sequence[Request],
+        *,
+        forefront: bool = False,
+    ) -> AddRequestsResponse:
+        if not requests:
+            return AddRequestsResponse(processed_requests=[], unprocessed_requests=[])
+
+        processed_requests = []
+        unprocessed_requests = []
+        transaction_processed_requests = []
+        transaction_processed_requests_unique_keys = set()
+
+        metadata_recalculate = False
+
+        # Deduplicate requests by unique_key upfront
+        unique_requests = {}
+        unique_key_by_request_id = {}
+        for req in requests:
+            if req.unique_key not in unique_requests:
+                request_id = self._get_int_id_from_unique_key(req.unique_key)
+                unique_requests[request_id] = req
+                unique_key_by_request_id[request_id] = req.unique_key
+
+        # Get existing requests by unique keys
+        stmt = (
+            select(self._ITEM_TABLE)
+            .where(
+                self._ITEM_TABLE.request_queue_id == self._id,
+                self._ITEM_TABLE.request_id.in_(set(unique_requests.keys())),
+            )
+            .options(
+                load_only(
+                    self._ITEM_TABLE.request_id,
+                    self._ITEM_TABLE.is_handled,
+                    self._ITEM_TABLE.time_blocked_until,
+                )
+            )
+        )
+
+        async with self.get_session() as session:
+            result = await session.execute(stmt)
+            result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result
+            existing_requests = {req.request_id: req for req in result.scalars()}
+            state = await self._get_state(session)
+            insert_values: list[dict] = []
+
+            for request_id, request in sorted(unique_requests.items()):
+                existing_req_db = existing_requests.get(request_id)
+                # New request, add it
+                if existing_req_db is None:
+                    value = {
+                        'request_id': request_id,
+                        'request_queue_id': self._id,
+                        'data': request.model_dump_json(),
+                        'is_handled': False,
+                    }
+                    if forefront:
+                        value['sequence_number'] = state.forefront_sequence_counter
+                        state.forefront_sequence_counter -= 1
+                    else:
+                        value['sequence_number'] = state.sequence_counter
+                        state.sequence_counter += 1
+
+                    insert_values.append(value)
+                    metadata_recalculate = True
+                    transaction_processed_requests.append(
+                        ProcessedRequest(
+                            unique_key=request.unique_key,
+                            was_already_present=False,
+                            was_already_handled=False,
+                        )
+                    )
+                    transaction_processed_requests_unique_keys.add(request.unique_key)
+                # Already handled request, skip adding
+                elif existing_req_db and existing_req_db.is_handled:
+                    processed_requests.append(
+                        ProcessedRequest(
+                            unique_key=request.unique_key,
+                            was_already_present=True,
+                            was_already_handled=True,
+                        )
+                    )
+                # Already in progress in one of the clients
+                elif existing_req_db and existing_req_db.time_blocked_until:
+                    processed_requests.append(
+                        ProcessedRequest(
+                            unique_key=request.unique_key,
+                            was_already_present=True,
+                            was_already_handled=False,
+                        )
+                    )
+                # Request in database but not yet handled and not in progress
+                elif existing_req_db and not existing_req_db.is_handled and not existing_req_db.time_blocked_until:
+                    # Forefront request, update its sequence number
+                    if forefront:
+                        insert_values.append(
+                            {
+                                'request_queue_id': self._id,
+                                'request_id': request_id,
+                                'sequence_number': state.forefront_sequence_counter,
+                                'data': request.model_dump_json(),
+                                'is_handled': False,
+                            }
+                        )
+                        state.forefront_sequence_counter -= 1
+                        transaction_processed_requests.append(
+                            ProcessedRequest(
+                                unique_key=request.unique_key,
+                                was_already_present=True,
+                                was_already_handled=False,
+                            )
+                        )
+                        transaction_processed_requests_unique_keys.add(request.unique_key)
+                    # Regular request, keep its position
+                    else:
+                        processed_requests.append(
+                            ProcessedRequest(
+                                unique_key=request.unique_key,
+                                was_already_present=True,
+                                was_already_handled=False,
+                            )
+                        )
+                # Unexpected condition
+                else:
+                    unprocessed_requests.append(
+                        UnprocessedRequest(
+                            unique_key=request.unique_key,
+                            url=request.url,
+                            method=request.method,
+                        )
+                    )
+
+            if insert_values:
+                if forefront:
+                    # If the request already exists in the database, we update the sequence_number by shifting the
+                    # request to the left.
+                    upsert_stmt = self._build_upsert_stmt(
+                        self._ITEM_TABLE,
+                        insert_values,
+                        update_columns=['sequence_number'],
+                        conflict_cols=['request_id', 'request_queue_id'],
+                    )
+                    await session.execute(upsert_stmt)
+                else:
+                    # If the request already exists in the database, we ignore this request when inserting.
+                    insert_stmt_with_ignore = self._build_insert_stmt_with_ignore(self._ITEM_TABLE, insert_values)
+                    await session.execute(insert_stmt_with_ignore)
+
+            await self._update_metadata(
+                session,
+                **_QueueMetadataUpdateParams(
+                    recalculate=metadata_recalculate,
+                    update_modified_at=True,
+                    update_accessed_at=True,
+                    force=metadata_recalculate,
+                ),
+            )
+
+            try:
+                await session.commit()
+                processed_requests.extend(transaction_processed_requests)
+            except SQLAlchemyError as e:
+                await session.rollback()
+                logger.warning(f'Failed to commit session: {e}')
+                await self._update_metadata(
+                    session, recalculate=True, update_modified_at=True, update_accessed_at=True, force=True
+                )
+                await session.commit()
+                transaction_processed_requests.clear()
+                unprocessed_requests.extend(
+                    [
+                        UnprocessedRequest(
+                            unique_key=request.unique_key,
+                            url=request.url,
+                            method=request.method,
+                        )
+                        for request in requests
+                        if request.unique_key in transaction_processed_requests_unique_keys
+                    ]
+                )
+
+        return AddRequestsResponse(
+            processed_requests=processed_requests,
+            unprocessed_requests=unprocessed_requests,
+        )
+
+    @override
+    async def get_request(self, unique_key: str) -> Request | None:
+        request_id = self._get_int_id_from_unique_key(unique_key)
+
+        stmt = select(self._ITEM_TABLE).where(
+            self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.request_id == request_id
+        )
+        async with self.get_session() as session:
+            result = await session.execute(stmt)
+            request_db = result.scalar_one_or_none()
+
+            if request_db is None:
+                logger.warning(f'Request with ID "{unique_key}" not found in the queue.')
+                return None
+
+            updated = await self._update_metadata(session, update_accessed_at=True)
+
+            # Commit updates to the metadata
+            if updated:
+                await session.commit()
+
+        return Request.model_validate_json(request_db.data)
+
+    @override
+    async def fetch_next_request(self) -> Request | None:
+        if self._pending_fetch_cache:
+            return self._pending_fetch_cache.popleft()
+
+        now = datetime.now(timezone.utc)
+        block_until = now + timedelta(seconds=self._BLOCK_REQUEST_TIME)
+        dialect = self._storage_client.get_dialect_name()
+
+        # Get available requests not blocked by another client
+        stmt = (
+            select(self._ITEM_TABLE)
+            .where(
+                self._ITEM_TABLE.request_queue_id == self._id,
+                self._ITEM_TABLE.is_handled.is_(False),
+                or_(self._ITEM_TABLE.time_blocked_until.is_(None), self._ITEM_TABLE.time_blocked_until < now),
+            )
+            .order_by(self._ITEM_TABLE.sequence_number.asc())
+            .limit(self._MAX_BATCH_FETCH_SIZE)
+        )
+
+        async with self.get_session() as session:
+            # We use the `skip_locked` database mechanism to prevent the 'interception' of requests by another client
+            if dialect == 'postgresql':
+                stmt = stmt.with_for_update(skip_locked=True)
+                result = await session.execute(stmt)
+                requests_db = result.scalars().all()
+
+                if not requests_db:
+                    return None
+
+                # All requests received have already been reserved for update with the help of `skip_locked`.
+                request_ids = {r.request_id for r in requests_db}
+
+                update_stmt = (
+                    update(self._ITEM_TABLE)
+                    .where(self._ITEM_TABLE.request_id.in_(request_ids))
+                    .values(time_blocked_until=block_until, client_key=self.client_key)
+                )
+                await session.execute(update_stmt)
+
+                blocked_ids = request_ids
+            else:
+                # For other databases, we first select the requests, then try to update them to be blocked.
+                result = await session.execute(stmt)
+                requests_db = result.scalars().all()
+
+                if not requests_db:
+                    return None
+
+                request_ids = {r.request_id for r in requests_db}
+
+                update_stmt = (
+                    update(self._ITEM_TABLE)
+                    .where(
+                        self._ITEM_TABLE.request_queue_id == self._id,
+                        self._ITEM_TABLE.request_id.in_(request_ids),
+                        self._ITEM_TABLE.is_handled.is_(False),
+                        or_(self._ITEM_TABLE.time_blocked_until.is_(None), self._ITEM_TABLE.time_blocked_until < now),
+                    )
+                    .values(time_blocked_until=block_until, client_key=self.client_key)
+                    .returning(self._ITEM_TABLE.request_id)
+                )
+
+                update_result = await session.execute(update_stmt)
+                blocked_ids = {row[0] for row in update_result.fetchall()}
+
+                if not blocked_ids:
+                    await session.rollback()
+                    return None
+
+            await self._update_metadata(session, **_QueueMetadataUpdateParams(update_accessed_at=True))
+
+            await session.commit()
+
+        requests = [Request.model_validate_json(r.data) for r in requests_db if r.request_id in blocked_ids]
+
+        if not requests:
+            return None
+
+        self._pending_fetch_cache.extend(requests[1:])
+
+        return requests[0]
+
+    @override
+    async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
+        request_id = self._get_int_id_from_unique_key(request.unique_key)
+
+        # Update the request's handled_at timestamp.
+        if request.handled_at is None:
+            request.handled_at = datetime.now(timezone.utc)
+
+        # Update the request in the database
+        stmt = (
+            update(self._ITEM_TABLE)
+            .where(self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.request_id == request_id)
+            .values(is_handled=True, time_blocked_until=None, client_key=None, data=request.model_dump_json())
+        )
+        async with self.get_session() as session:
+            result = await session.execute(stmt)
+            result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result
+
+            if result.rowcount == 0:
+                logger.warning(f'Request {request.unique_key} not found in database.')
+                return None
+
+            await self._update_metadata(
+                session,
+                **_QueueMetadataUpdateParams(
+                    delta_handled_request_count=1,
+                    delta_pending_request_count=-1,
+                    update_modified_at=True,
+                    update_accessed_at=True,
+                    force=True,
+                ),
+            )
+            await session.commit()
+
+        return ProcessedRequest(
+            unique_key=request.unique_key,
+            was_already_present=True,
+            was_already_handled=True,
+        )
+
+    @override
+    async def reclaim_request(
+        self,
+        request: Request,
+        *,
+        forefront: bool = False,
+    ) -> ProcessedRequest | None:
+        request_id = self._get_int_id_from_unique_key(request.unique_key)
+
+        stmt = update(self._ITEM_TABLE).where(
+            self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.request_id == request_id
+        )
+
+        async with self.get_session(with_simple_commit=True) as session:
+            state = await self._get_state(session)
+
+            # Update sequence number if changing priority
+            if forefront:
+                new_sequence = state.forefront_sequence_counter
+                state.forefront_sequence_counter -= 1
+                now = datetime.now(timezone.utc)
+                block_until = now + timedelta(seconds=self._BLOCK_REQUEST_TIME)
+                # Extend blocking for the forefront request; it is considered blocked by the current client.
+                stmt = stmt.values(
+                    sequence_number=new_sequence,
+                    time_blocked_until=block_until,
+                    client_key=self.client_key,
+                    data=request.model_dump_json(),
+                )
+            else:
+                new_sequence = state.sequence_counter
+                state.sequence_counter += 1
+                stmt = stmt.values(
+                    sequence_number=new_sequence,
+                    time_blocked_until=None,
+                    client_key=None,
+                    data=request.model_dump_json(),
+                )
+
+            result = await session.execute(stmt)
+            result = cast('CursorResult', result) if not isinstance(result, CursorResult) else result
+
+            if result.rowcount == 0:
+                logger.warning(f'Request {request.unique_key} not found in database.')
+                return None
+            await self._update_metadata(
+                session, **_QueueMetadataUpdateParams(update_modified_at=True, update_accessed_at=True)
+            )
+
+        # Put the forefront request at the beginning of the cache
+        if forefront:
+            self._pending_fetch_cache.appendleft(request)
+
+        return ProcessedRequest(
+            unique_key=request.unique_key,
+            was_already_present=True,
+            was_already_handled=False,
+        )
+
+    @override
+    async def is_empty(self) -> bool:
+        # Check in-memory cache for requests
+        if self._pending_fetch_cache:
+            return False
+
+        # Check database for unhandled requests
+        async with self.get_session() as session:
+            metadata_orm = await session.get(self._METADATA_TABLE, self._id)
+            if not metadata_orm:
+                raise ValueError(f'Request queue with ID "{self._id}" not found.')
+
+            empty = metadata_orm.pending_request_count == 0
+
+            updated = await self._update_metadata(
+                session,
+                **_QueueMetadataUpdateParams(
+                    update_accessed_at=True,
+                    # With multi-client access, counters may become out of sync.
+                    # If the queue is not empty, we recalculate to synchronize the counters in the metadata.
+                    recalculate=not empty,
+                    update_modified_at=not empty,
+                ),
+            )
+
+            # Commit updates to the metadata
+            if updated:
+                await session.commit()
+
+        return empty
+
+    async def _get_state(self, session: AsyncSession) -> RequestQueueStateDb:
+        """Get the current state of the request queue."""
+        orm_state: RequestQueueStateDb | None = await session.get(RequestQueueStateDb, self._id)
+        if not orm_state:
+            insert_values = {'request_queue_id': self._id}
+            # Create a new state if it doesn't exist.
+            # This is a safeguard against race conditions where multiple clients might try to create the state
+            # simultaneously.
+            insert_stmt = self._build_insert_stmt_with_ignore(RequestQueueStateDb, insert_values)
+            await session.execute(insert_stmt)
+            await session.flush()
+            orm_state = await session.get(RequestQueueStateDb, self._id)
+            if not orm_state:
+                raise RuntimeError(f'Failed to create or retrieve state for queue {self._id}')
+        return orm_state
+
+    def _specific_update_metadata(
+        self,
+        new_handled_request_count: int | None = None,
+        new_pending_request_count: int | None = None,
+        new_total_request_count: int | None = None,
+        delta_handled_request_count: int | None = None,
+        delta_pending_request_count: int | None = None,
+        *,
+        recalculate: bool = False,
+        update_had_multiple_clients: bool = False,
+        **_kwargs: dict[str, Any],
+    ) -> dict[str, Any]:
+        """Build the update values for the request queue metadata.
+
+        Args:
+            new_handled_request_count: If provided, update the handled_request_count to this value.
+            new_pending_request_count: If provided, update the pending_request_count to this value.
+            new_total_request_count: If provided, update the total_request_count to this value.
+            delta_handled_request_count: If provided, add this value to the handled_request_count.
+            delta_pending_request_count: If provided, add this value to the pending_request_count.
+            recalculate: If True, recalculate pending_request_count, handled_request_count, and
+                total_request_count from the request table.
+            update_had_multiple_clients: If True, set had_multiple_clients to True.
+        """
+        values_to_set: dict[str, Any] = {}
+
+        if update_had_multiple_clients:
+            values_to_set['had_multiple_clients'] = True
+
+        if new_handled_request_count is not None:
+            values_to_set['handled_request_count'] = new_handled_request_count
+        elif delta_handled_request_count is not None:
+            values_to_set['handled_request_count'] = (
+                self._METADATA_TABLE.handled_request_count + delta_handled_request_count
+            )
+
+        if new_pending_request_count is not None:
+            values_to_set['pending_request_count'] = new_pending_request_count
+        elif delta_pending_request_count is not None:
+            values_to_set['pending_request_count'] = (
+                self._METADATA_TABLE.pending_request_count + delta_pending_request_count
+            )
+
+        if new_total_request_count is not None:
+            values_to_set['total_request_count'] = new_total_request_count
+
+        if recalculate:
+            stmt = (
+                update(self._METADATA_TABLE)
+                .where(self._METADATA_TABLE.request_queue_id == self._id)
+                .values(
+                    pending_request_count=(
+                        select(func.count())
+                        .select_from(self._ITEM_TABLE)
+                        .where(self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.is_handled.is_(False))
+                        .scalar_subquery()
+                    ),
+                    total_request_count=(
+                        select(func.count())
+                        .select_from(self._ITEM_TABLE)
+                        .where(self._ITEM_TABLE.request_queue_id == self._id)
+                        .scalar_subquery()
+                    ),
+                    handled_request_count=(
+                        select(func.count())
+                        .select_from(self._ITEM_TABLE)
+                        .where(self._ITEM_TABLE.request_queue_id == self._id, self._ITEM_TABLE.is_handled.is_(True))
+                        .scalar_subquery()
+                    ),
+                )
+            )
+
+            values_to_set['custom_stmt'] = stmt
+
+        return values_to_set
+
+    @staticmethod
+    @lru_cache(maxsize=10000)
+    def _get_int_id_from_unique_key(unique_key: str) -> int:
+        """Generate a deterministic integer ID for a unique_key.
+
+        Args:
+            unique_key: Unique key to be used to generate the ID.
+
+        Returns:
+            An integer ID based on the unique_key.
+        """
+        hashed_key = sha256(unique_key.encode('utf-8')).hexdigest()
+        name_length = 15
+        return int(hashed_key[:name_length], 16)
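
The deduplication scheme above can be made concrete with a standalone reproduction of _get_int_id_from_unique_key: the request ID is the integer value of the first 15 hex digits (60 bits) of the SHA-256 hash of the unique key, so equal unique keys always collapse to the same database row. The function name below is local to this sketch.

from hashlib import sha256


def int_id_from_unique_key(unique_key: str) -> int:
    # Mirrors the diff: first 15 hex characters of SHA-256, parsed as an integer.
    return int(sha256(unique_key.encode('utf-8')).hexdigest()[:15], 16)


# Deterministic: duplicate unique keys map to the same ID.
assert int_id_from_unique_key('https://crawlee.dev') == int_id_from_unique_key('https://crawlee.dev')
# Bounded: 15 hex digits is 60 bits, so the ID fits a signed 64-bit integer column.
assert int_id_from_unique_key('anything') < 2**60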
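The two-counter ordering described in the class docstring also works without a database: forefront additions take ever more negative sequence numbers while regular additions take ever larger positive ones, so a single ORDER BY sequence_number ASC returns the newest forefront request first, followed by regular requests in FIFO order. A minimal in-memory model (the counters' starting values are illustrative, not taken from the diff):

# Toy model of the sequence-number scheme; all names are local to this sketch.
forefront_counter = -1  # decremented for each forefront (high-priority) request
regular_counter = 1  # incremented for each regular request

queue: dict[str, int] = {}  # unique_key -> sequence_number


def add(unique_key: str, *, forefront: bool = False) -> None:
    global forefront_counter, regular_counter
    if forefront:
        queue[unique_key] = forefront_counter
        forefront_counter -= 1
    else:
        queue[unique_key] = regular_counter
        regular_counter += 1


add('a')
add('b')
add('urgent-1', forefront=True)
add('urgent-2', forefront=True)

# Equivalent of the ORDER BY in fetch_next_request: forefront LIFO, then regular FIFO.
assert sorted(queue, key=queue.get) == ['urgent-2', 'urgent-1', 'a', 'b']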
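Finally, the PostgreSQL branch of fetch_next_request leans on SQLAlchemy's with_for_update(skip_locked=True), which emits SELECT ... FOR UPDATE SKIP LOCKED: rows already claimed by a concurrent transaction are skipped rather than waited on, so multiple clients can drain one queue without blocking or double-fetching. A reduced sketch of the pattern, using a hypothetical ORM model in place of the diff's RequestDb:

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from my_models import RequestRow  # hypothetical stand-in for the diff's RequestDb


async def claim_batch(session: AsyncSession, queue_id: str, limit: int = 10) -> list[RequestRow]:
    # Rows locked by a concurrent transaction are silently skipped instead of blocking the SELECT.
    stmt = (
        select(RequestRow)
        .where(RequestRow.request_queue_id == queue_id, RequestRow.is_handled.is_(False))
        .order_by(RequestRow.sequence_number.asc())
        .limit(limit)
        .with_for_update(skip_locked=True)  # -> FOR UPDATE SKIP LOCKED on PostgreSQL
    )
    result = await session.execute(stmt)
    return list(result.scalars().all())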