crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlee might be problematic.

Files changed (116)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_autoscaling/snapshotter.py +1 -1
  3. crawlee/_browserforge_workaround.py +7 -3
  4. crawlee/_request.py +64 -43
  5. crawlee/_service_locator.py +44 -24
  6. crawlee/_types.py +128 -36
  7. crawlee/_utils/context.py +3 -3
  8. crawlee/_utils/file.py +8 -1
  9. crawlee/_utils/globs.py +4 -4
  10. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  11. crawlee/_utils/recoverable_state.py +32 -8
  12. crawlee/_utils/recurring_task.py +27 -3
  13. crawlee/_utils/requests.py +0 -26
  14. crawlee/_utils/robots.py +17 -5
  15. crawlee/_utils/sitemap.py +16 -7
  16. crawlee/_utils/system.py +30 -14
  17. crawlee/_utils/time.py +120 -0
  18. crawlee/_utils/urls.py +9 -2
  19. crawlee/browsers/_browser_pool.py +5 -2
  20. crawlee/browsers/_playwright_browser.py +2 -1
  21. crawlee/browsers/_playwright_browser_controller.py +21 -15
  22. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  23. crawlee/browsers/_types.py +1 -1
  24. crawlee/configuration.py +2 -0
  25. crawlee/crawlers/__init__.py +5 -1
  26. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  27. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
  28. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  29. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  30. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  31. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
  32. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  33. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  34. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  35. crawlee/crawlers/_basic/_basic_crawler.py +254 -148
  36. crawlee/crawlers/_basic/_context_utils.py +24 -0
  37. crawlee/crawlers/_basic/_logging_utils.py +27 -4
  38. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  39. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  40. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  41. crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
  42. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  43. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  44. crawlee/crawlers/_playwright/_types.py +12 -2
  45. crawlee/errors.py +4 -0
  46. crawlee/events/_event_manager.py +12 -6
  47. crawlee/events/_types.py +6 -6
  48. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  49. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  50. crawlee/fingerprint_suite/_header_generator.py +2 -2
  51. crawlee/fingerprint_suite/_types.py +2 -2
  52. crawlee/http_clients/_base.py +4 -0
  53. crawlee/http_clients/_curl_impersonate.py +68 -14
  54. crawlee/http_clients/_httpx.py +16 -6
  55. crawlee/http_clients/_impit.py +32 -11
  56. crawlee/otel/crawler_instrumentor.py +4 -6
  57. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  58. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  59. crawlee/request_loaders/_request_list.py +3 -3
  60. crawlee/request_loaders/_request_loader.py +5 -1
  61. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  62. crawlee/router.py +13 -3
  63. crawlee/sessions/_cookies.py +13 -8
  64. crawlee/sessions/_models.py +5 -5
  65. crawlee/sessions/_session_pool.py +1 -1
  66. crawlee/statistics/_error_snapshotter.py +1 -1
  67. crawlee/statistics/_models.py +62 -12
  68. crawlee/statistics/_statistics.py +24 -33
  69. crawlee/storage_clients/__init__.py +16 -0
  70. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  71. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  72. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  73. crawlee/storage_clients/_base/_storage_client.py +13 -0
  74. crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
  75. crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
  76. crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
  77. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  78. crawlee/storage_clients/_file_system/_utils.py +0 -0
  79. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  80. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  81. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  82. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  83. crawlee/storage_clients/_redis/__init__.py +6 -0
  84. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  85. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  86. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  87. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  88. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  89. crawlee/storage_clients/_redis/_utils.py +23 -0
  90. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  91. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  92. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  93. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  94. crawlee/storage_clients/_redis/py.typed +0 -0
  95. crawlee/storage_clients/_sql/__init__.py +6 -0
  96. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  97. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  98. crawlee/storage_clients/_sql/_db_models.py +268 -0
  99. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  100. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  101. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  102. crawlee/storage_clients/_sql/py.typed +0 -0
  103. crawlee/storage_clients/models.py +21 -14
  104. crawlee/storages/_base.py +5 -1
  105. crawlee/storages/_dataset.py +12 -2
  106. crawlee/storages/_key_value_store.py +17 -4
  107. crawlee/storages/_request_queue.py +13 -5
  108. crawlee/storages/_storage_instance_manager.py +196 -75
  109. crawlee/storages/_utils.py +11 -0
  110. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
  111. crawlee-1.3.1b3.dist-info/RECORD +207 -0
  112. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
  113. crawlee/_utils/measure_time.py +0 -31
  114. crawlee-0.6.13b15.dist-info/RECORD +0 -183
  115. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
  116. {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
@@ -1,21 +1,24 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import json
 import shutil
 from collections import deque
 from datetime import datetime, timezone
+from hashlib import sha256
 from logging import getLogger
 from pathlib import Path
 from typing import TYPE_CHECKING
 
 from pydantic import BaseModel, ValidationError
-from typing_extensions import override
+from typing_extensions import Self, override
 
 from crawlee import Request
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import atomic_write, json_dumps
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee._utils.recoverable_state import RecoverableState
 from crawlee.storage_clients._base import RequestQueueClient
 from crawlee.storage_clients.models import (
@@ -29,6 +32,7 @@ if TYPE_CHECKING:
     from collections.abc import Sequence
 
     from crawlee.configuration import Configuration
+    from crawlee.storages import KeyValueStore
 
 logger = getLogger(__name__)
 
@@ -43,16 +47,16 @@ class RequestQueueState(BaseModel):
     """Counter for forefront request ordering."""
 
     forefront_requests: dict[str, int] = {}
-    """Mapping of forefront request IDs to their sequence numbers."""
+    """Mapping of forefront request unique keys to their sequence numbers."""
 
     regular_requests: dict[str, int] = {}
-    """Mapping of regular request IDs to their sequence numbers."""
+    """Mapping of regular request unique keys to their sequence numbers."""
 
     in_progress_requests: set[str] = set()
-    """Set of request IDs currently being processed."""
+    """Set of request unique keys currently being processed."""
 
     handled_requests: set[str] = set()
-    """Set of request IDs that have been handled."""
+    """Set of request unique keys that have been handled."""
 
 
 class FileSystemRequestQueueClient(RequestQueueClient):
@@ -88,8 +92,9 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         self,
         *,
         metadata: RequestQueueMetadata,
-        storage_dir: Path,
+        path_to_rq: Path,
         lock: asyncio.Lock,
+        recoverable_state: RecoverableState[RequestQueueState],
     ) -> None:
         """Initialize a new instance.
 
@@ -97,8 +102,8 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """
         self._metadata = metadata
 
-        self._storage_dir = storage_dir
-        """The base directory where the storage data are being persisted."""
+        self._path_to_rq = path_to_rq
+        """The full path to the request queue directory."""
 
         self._lock = lock
         """A lock to ensure that only one operation is performed at a time."""
@@ -112,13 +117,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         self._is_empty_cache: bool | None = None
         """Cache for is_empty result: None means unknown, True/False is cached state."""
 
-        self._state = RecoverableState[RequestQueueState](
-            default_state=RequestQueueState(),
-            persist_state_key='request_queue_state',
-            persistence_enabled=True,
-            persist_state_kvs_name=f'__RQ_STATE_{self._metadata.id}',
-            logger=logger,
-        )
+        self._state = recoverable_state
         """Recoverable state to maintain request ordering, in-progress status, and handled status."""
 
     @override
@@ -128,24 +127,38 @@ class FileSystemRequestQueueClient(RequestQueueClient):
     @property
     def path_to_rq(self) -> Path:
         """The full path to the request queue directory."""
-        if self._metadata.name is None:
-            return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
-
-        return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name
+        return self._path_to_rq
 
     @property
     def path_to_metadata(self) -> Path:
         """The full path to the request queue metadata file."""
         return self.path_to_rq / METADATA_FILENAME
 
+    @classmethod
+    async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState:
+        async def kvs_factory() -> KeyValueStore:
+            from crawlee.storage_clients import FileSystemStorageClient  # noqa: PLC0415 avoid circular import
+            from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import
+
+            return await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=configuration)
+
+        return RecoverableState[RequestQueueState](
+            default_state=RequestQueueState(),
+            persist_state_key=f'__RQ_STATE_{id}',
+            persist_state_kvs_factory=kvs_factory,
+            persistence_enabled=True,
+            logger=logger,
+        )
+
     @classmethod
     async def open(
         cls,
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
         configuration: Configuration,
-    ) -> FileSystemRequestQueueClient:
+    ) -> Self:
         """Open or create a file system request queue client.
 
         This method attempts to open an existing request queue from the file system. If a queue with the specified
@@ -154,17 +167,21 @@ class FileSystemRequestQueueClient(RequestQueueClient):
 
         Args:
             id: The ID of the request queue to open. If provided, searches for existing queue by ID.
-            name: The name of the request queue to open. If not provided, uses the default queue.
+            name: The name of the request queue for named (global scope) storages.
+            alias: The alias of the request queue for unnamed (run scope) storages.
             configuration: The configuration object containing storage directory settings.
 
         Returns:
             An instance for the opened or created storage client.
 
         Raises:
-            ValueError: If a queue with the specified ID is not found, or if metadata is invalid.
+            ValueError: If a queue with the specified ID is not found, if metadata is invalid,
+                or if both name and alias are provided.
         """
-        storage_dir = Path(configuration.storage_dir)
-        rq_base_path = storage_dir / cls._STORAGE_SUBDIR
+        # Validate input parameters.
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+        rq_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
 
         if not rq_base_path.exists():
             await asyncio.to_thread(rq_base_path.mkdir, parents=True, exist_ok=True)
@@ -176,12 +193,12 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 if not rq_dir.is_dir():
                     continue
 
-                metadata_path = rq_dir / METADATA_FILENAME
-                if not metadata_path.exists():
+                path_to_metadata = rq_dir / METADATA_FILENAME
+                if not path_to_metadata.exists():
                     continue
 
                 try:
-                    file = await asyncio.to_thread(metadata_path.open)
+                    file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                     try:
                         file_content = json.load(file)
                         metadata = RequestQueueMetadata(**file_content)
@@ -189,8 +206,11 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                         if metadata.id == id:
                             client = cls(
                                 metadata=metadata,
-                                storage_dir=storage_dir,
+                                path_to_rq=rq_base_path / rq_dir,
                                 lock=asyncio.Lock(),
+                                recoverable_state=await cls._create_recoverable_state(
+                                    id=id, configuration=configuration
+                                ),
                             )
                             await client._state.initialize()
                             await client._discover_existing_requests()
@@ -205,14 +225,15 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             if not found:
                 raise ValueError(f'Request queue with ID "{id}" not found')
 
-        # Open an existing RQ by its name, or create a new one if not found.
+        # Open an existing RQ by its name or alias, or create a new one if not found.
         else:
-            rq_path = rq_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT if name is None else rq_base_path / name
-            metadata_path = rq_path / METADATA_FILENAME
+            rq_dir = Path(name) if name else Path(alias) if alias else Path('default')
+            path_to_rq = rq_base_path / rq_dir
+            path_to_metadata = path_to_rq / METADATA_FILENAME
 
             # If the RQ directory exists, reconstruct the client from the metadata file.
-            if rq_path.exists() and metadata_path.exists():
-                file = await asyncio.to_thread(open, metadata_path)
+            if path_to_rq.exists() and path_to_metadata.exists():
+                file = await asyncio.to_thread(path_to_metadata.open, encoding='utf-8')
                 try:
                     file_content = json.load(file)
                 finally:
@@ -220,14 +241,13 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 try:
                     metadata = RequestQueueMetadata(**file_content)
                 except ValidationError as exc:
-                    raise ValueError(f'Invalid metadata file for request queue "{name}"') from exc
-
-                metadata.name = name
+                    raise ValueError(f'Invalid metadata file for request queue "{name or alias}"') from exc
 
                 client = cls(
                     metadata=metadata,
-                    storage_dir=storage_dir,
+                    path_to_rq=path_to_rq,
                     lock=asyncio.Lock(),
+                    recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
                 )
 
                 await client._state.initialize()
@@ -250,8 +270,9 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 )
                 client = cls(
                     metadata=metadata,
-                    storage_dir=storage_dir,
+                    path_to_rq=path_to_rq,
                     lock=asyncio.Lock(),
+                    recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
                 )
                 await client._state.initialize()
                 await client._update_metadata()
@@ -311,37 +332,52 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             unprocessed_requests = list[UnprocessedRequest]()
             state = self._state.current_value
 
-            # Prepare a dictionary to track existing requests by their unique keys.
-            existing_unique_keys: dict[str, Path] = {}
-            existing_request_files = await self._get_request_files(self.path_to_rq)
+            all_requests = state.forefront_requests | state.regular_requests
 
-            for request_file in existing_request_files:
-                existing_request = await self._parse_request_file(request_file)
-                if existing_request is not None:
-                    existing_unique_keys[existing_request.unique_key] = request_file
+            requests_to_enqueue = {}
 
-            # Process each request in the batch.
+            # Determine which requests can be added or are modified.
             for request in requests:
-                existing_request_file = existing_unique_keys.get(request.unique_key)
-                existing_request = None
-
-                # Only load the full request from disk if we found a duplicate
-                if existing_request_file is not None:
-                    existing_request = await self._parse_request_file(existing_request_file)
-
-                # If there is no existing request with the same unique key, add the new request.
-                if existing_request is None:
-                    request_path = self._get_request_path(request.id)
+                # Check if the request has already been handled.
+                if request.unique_key in state.handled_requests:
+                    processed_requests.append(
+                        ProcessedRequest(
+                            unique_key=request.unique_key,
+                            was_already_present=True,
+                            was_already_handled=True,
+                        )
+                    )
+                # Check if the request is already in progress.
+                # Or if the request is already in the queue and the `forefront` flag is not used, we do not change the
+                # position of the request.
+                elif (request.unique_key in state.in_progress_requests) or (
+                    request.unique_key in all_requests and not forefront
+                ):
+                    processed_requests.append(
+                        ProcessedRequest(
+                            unique_key=request.unique_key,
+                            was_already_present=True,
+                            was_already_handled=False,
+                        )
+                    )
+                # These requests must either be added or update their position.
+                else:
+                    requests_to_enqueue[request.unique_key] = request
 
+            # Process each request in the batch.
+            for request in requests_to_enqueue.values():
+                # If the request is not already in the RQ, this is a new request.
+                if request.unique_key not in all_requests:
+                    request_path = self._get_request_path(request.unique_key)
                     # Add sequence number to ensure FIFO ordering using state.
                     if forefront:
                         sequence_number = state.forefront_sequence_counter
                         state.forefront_sequence_counter += 1
-                        state.forefront_requests[request.id] = sequence_number
+                        state.forefront_requests[request.unique_key] = sequence_number
                     else:
                         sequence_number = state.sequence_counter
                         state.sequence_counter += 1
-                        state.regular_requests[request.id] = sequence_number
+                        state.regular_requests[request.unique_key] = sequence_number
 
                     # Save the clean request without extra fields
                     request_data = await json_dumps(request.model_dump())
@@ -351,71 +387,41 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                     new_total_request_count += 1
                     new_pending_request_count += 1
 
-                    # Add to our index for subsequent requests in this batch
-                    existing_unique_keys[request.unique_key] = self._get_request_path(request.id)
-
                     processed_requests.append(
                         ProcessedRequest(
-                            id=request.id,
                             unique_key=request.unique_key,
                             was_already_present=False,
                             was_already_handled=False,
                         )
                     )
 
-                # If the request already exists in the RQ, just update it if needed.
-                else:
-                    # Set the processed request flags.
-                    was_already_present = existing_request is not None
-                    was_already_handled = existing_request.id in state.handled_requests
-
-                    # If the request is already in the RQ and handled, just continue with the next one.
-                    if was_already_present and was_already_handled:
-                        processed_requests.append(
-                            ProcessedRequest(
-                                id=existing_request.id,
-                                unique_key=request.unique_key,
-                                was_already_present=True,
-                                was_already_handled=True,
-                            )
-                        )
+                # If the request already exists in the RQ and use the forefront flag to update its position
+                elif forefront:
+                    # If the request is among `regular`, remove it from its current position.
+                    if request.unique_key in state.regular_requests:
+                        state.regular_requests.pop(request.unique_key)
 
-                    # If the request is already in the RQ but not handled yet, update it.
-                    elif was_already_present and not was_already_handled:
-                        # Update request type (forefront vs regular) in state
-                        if forefront:
-                            # Move from regular to forefront if needed
-                            if existing_request.id in state.regular_requests:
-                                state.regular_requests.pop(existing_request.id)
-                            if existing_request.id not in state.forefront_requests:
-                                state.forefront_requests[existing_request.id] = state.forefront_sequence_counter
-                                state.forefront_sequence_counter += 1
-                        elif (
-                            existing_request.id not in state.forefront_requests
-                            and existing_request.id not in state.regular_requests
-                        ):
-                            # Keep as regular if not already forefront
-                            state.regular_requests[existing_request.id] = state.sequence_counter
-                            state.sequence_counter += 1
-
-                        processed_requests.append(
-                            ProcessedRequest(
-                                id=existing_request.id,
-                                unique_key=request.unique_key,
-                                was_already_present=True,
-                                was_already_handled=False,
-                            )
+                    # If the request is already in `forefront`, we just need to update its position.
+                    state.forefront_requests[request.unique_key] = state.forefront_sequence_counter
+                    state.forefront_sequence_counter += 1
+
+                    processed_requests.append(
+                        ProcessedRequest(
+                            unique_key=request.unique_key,
+                            was_already_present=True,
+                            was_already_handled=False,
                         )
+                    )
 
-                    else:
-                        logger.warning(f'Request with unique key "{request.unique_key}" could not be processed.')
-                        unprocessed_requests.append(
-                            UnprocessedRequest(
-                                unique_key=request.unique_key,
-                                url=request.url,
-                                method=request.method,
-                            )
+                else:
+                    logger.warning(f'Request with unique key "{request.unique_key}" could not be processed.')
+                    unprocessed_requests.append(
+                        UnprocessedRequest(
+                            unique_key=request.unique_key,
+                            url=request.url,
+                            method=request.method,
                         )
+                    )
 
             await self._update_metadata(
                 update_modified_at=True,
@@ -437,17 +443,17 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         )
 
     @override
-    async def get_request(self, request_id: str) -> Request | None:
+    async def get_request(self, unique_key: str) -> Request | None:
         async with self._lock:
-            request_path = self._get_request_path(request_id)
+            request_path = self._get_request_path(unique_key)
             request = await self._parse_request_file(request_path)
 
             if request is None:
-                logger.warning(f'Request with ID "{request_id}" not found in the queue.')
+                logger.warning(f'Request with unique key "{unique_key}" not found in the queue.')
                 return None
 
             state = self._state.current_value
-            state.in_progress_requests.add(request.id)
+            state.in_progress_requests.add(request.unique_key)
             await self._update_metadata(update_accessed_at=True)
             return request
 
@@ -466,11 +472,11 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 candidate = self._request_cache.popleft()
 
                 # Skip requests that are already in progress, however this should not happen.
-                if candidate.id not in state.in_progress_requests:
+                if candidate.unique_key not in state.in_progress_requests:
                     next_request = candidate
 
             if next_request is not None:
-                state.in_progress_requests.add(next_request.id)
+                state.in_progress_requests.add(next_request.unique_key)
 
             return next_request
 
@@ -481,8 +487,8 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             state = self._state.current_value
 
             # Check if the request is in progress.
-            if request.id not in state.in_progress_requests:
-                logger.warning(f'Marking request {request.id} as handled that is not in progress.')
+            if request.unique_key not in state.in_progress_requests:
+                logger.warning(f'Marking request {request.unique_key} as handled that is not in progress.')
                 return None
 
             # Update the request's handled_at timestamp.
@@ -490,18 +496,18 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 request.handled_at = datetime.now(timezone.utc)
 
             # Dump the updated request to the file.
-            request_path = self._get_request_path(request.id)
+            request_path = self._get_request_path(request.unique_key)
 
             if not await asyncio.to_thread(request_path.exists):
-                logger.warning(f'Request file for {request.id} does not exist, cannot mark as handled.')
+                logger.warning(f'Request file for {request.unique_key} does not exist, cannot mark as handled.')
                 return None
 
             request_data = await json_dumps(request.model_dump())
             await atomic_write(request_path, request_data)
 
             # Update state: remove from in-progress and add to handled.
-            state.in_progress_requests.discard(request.id)
-            state.handled_requests.add(request.id)
+            state.in_progress_requests.discard(request.unique_key)
+            state.handled_requests.add(request.unique_key)
 
             # Update RQ metadata.
             await self._update_metadata(
@@ -512,7 +518,6 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             )
 
             return ProcessedRequest(
-                id=request.id,
                 unique_key=request.unique_key,
                 was_already_present=True,
                 was_already_handled=True,
@@ -530,36 +535,36 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             state = self._state.current_value
 
             # Check if the request is in progress.
-            if request.id not in state.in_progress_requests:
-                logger.info(f'Reclaiming request {request.id} that is not in progress.')
+            if request.unique_key not in state.in_progress_requests:
+                logger.info(f'Reclaiming request {request.unique_key} that is not in progress.')
                 return None
 
-            request_path = self._get_request_path(request.id)
+            request_path = self._get_request_path(request.unique_key)
 
             if not await asyncio.to_thread(request_path.exists):
-                logger.warning(f'Request file for {request.id} does not exist, cannot reclaim.')
+                logger.warning(f'Request file for {request.unique_key} does not exist, cannot reclaim.')
                 return None
 
             # Update sequence number and state to ensure proper ordering.
             if forefront:
                 # Remove from regular requests if it was there
-                state.regular_requests.pop(request.id, None)
+                state.regular_requests.pop(request.unique_key, None)
                 sequence_number = state.forefront_sequence_counter
                 state.forefront_sequence_counter += 1
-                state.forefront_requests[request.id] = sequence_number
+                state.forefront_requests[request.unique_key] = sequence_number
             else:
                 # Remove from forefront requests if it was there
-                state.forefront_requests.pop(request.id, None)
+                state.forefront_requests.pop(request.unique_key, None)
                 sequence_number = state.sequence_counter
                 state.sequence_counter += 1
-                state.regular_requests[request.id] = sequence_number
+                state.regular_requests[request.unique_key] = sequence_number
 
             # Save the clean request without extra fields
             request_data = await json_dumps(request.model_dump())
             await atomic_write(request_path, request_data)
 
             # Remove from in-progress.
-            state.in_progress_requests.discard(request.id)
+            state.in_progress_requests.discard(request.unique_key)
 
             # Update RQ metadata.
             await self._update_metadata(
@@ -574,7 +579,6 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             self._request_cache.append(request)
 
             return ProcessedRequest(
-                id=request.id,
                 unique_key=request.unique_key,
                 was_already_present=True,
                 was_already_handled=False,
@@ -597,7 +601,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             # If we have a cached requests, check them first (fast path).
             if self._request_cache:
                 for req in self._request_cache:
-                    if req.id not in state.handled_requests:
+                    if req.unique_key not in state.handled_requests:
                         self._is_empty_cache = False
                         return False
                 self._is_empty_cache = True
@@ -617,16 +621,16 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             self._is_empty_cache = True
             return True
 
-    def _get_request_path(self, request_id: str) -> Path:
+    def _get_request_path(self, unique_key: str) -> Path:
         """Get the path to a specific request file.
 
         Args:
-            request_id: The ID of the request.
+            unique_key: Unique key of the request.
 
         Returns:
             The path to the request file.
         """
-        return self.path_to_rq / f'{request_id}.json'
+        return self.path_to_rq / f'{self._get_file_base_name_from_unique_key(unique_key)}.json'
 
     async def _update_metadata(
         self,
@@ -699,23 +703,23 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 continue
 
             # Skip handled requests
-            if request.id in state.handled_requests:
+            if request.unique_key in state.handled_requests:
                 continue
 
             # Skip in-progress requests
-            if request.id in state.in_progress_requests:
+            if request.unique_key in state.in_progress_requests:
                 continue
 
             # Determine if request is forefront or regular based on state
-            if request.id in state.forefront_requests:
-                sequence = state.forefront_requests[request.id]
+            if request.unique_key in state.forefront_requests:
+                sequence = state.forefront_requests[request.unique_key]
                 forefront_requests.append((request, sequence))
-            elif request.id in state.regular_requests:
-                sequence = state.regular_requests[request.id]
+            elif request.unique_key in state.regular_requests:
+                sequence = state.regular_requests[request.unique_key]
                 regular_requests.append((request, sequence))
             else:
                 # Request not in state, skip it (might be orphaned)
-                logger.warning(f'Request {request.id} not found in state, skipping.')
+                logger.warning(f'Request {request.unique_key} not found in state, skipping.')
                 continue
 
         # Sort forefront requests by sequence (newest first for LIFO behavior).
@@ -753,13 +757,10 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         await asyncio.to_thread(path_to_rq.mkdir, parents=True, exist_ok=True)
 
         # List all the json files.
-        files = await asyncio.to_thread(list, path_to_rq.glob('*.json'))
+        files = await asyncio.to_thread(lambda: list(path_to_rq.glob('*.json')))
 
         # Filter out metadata file and non-file entries.
-        filtered = filter(
-            lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME,
-            files,
-        )
+        filtered = filter(lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME, files)
 
         return list(filtered)
 
@@ -775,7 +776,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """
         # Open the request file.
         try:
-            file = await asyncio.to_thread(functools.partial(file_path.open, mode='r', encoding='utf-8'))
+            file = await asyncio.to_thread(functools.partial(file_path.open, mode='r', encoding='utf-8'))
         except FileNotFoundError:
             logger.warning(f'Request file "{file_path}" not found.')
             return None
@@ -807,11 +808,27 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 continue
 
             # Add request to state as regular request (assign sequence numbers)
-            if request.id not in state.regular_requests and request.id not in state.forefront_requests:
+            if request.unique_key not in state.regular_requests and request.unique_key not in state.forefront_requests:
                 # Assign as regular request with current sequence counter
-                state.regular_requests[request.id] = state.sequence_counter
+                state.regular_requests[request.unique_key] = state.sequence_counter
                 state.sequence_counter += 1
 
                 # Check if request was already handled
                 if request.handled_at is not None:
-                    state.handled_requests.add(request.id)
+                    state.handled_requests.add(request.unique_key)
+
+    @staticmethod
+    def _get_file_base_name_from_unique_key(unique_key: str) -> str:
+        """Generate a deterministic file name for a unique_key.
+
+        Args:
+            unique_key: Unique key to be used to generate filename.
+
+        Returns:
+            A file name based on the unique_key.
+        """
+        # hexdigest produces filenames compliant strings
+        hashed_key = sha256(unique_key.encode('utf-8')).hexdigest()
+        name_length = 15
+        # Truncate the key to the desired length
+        return hashed_key[:name_length]
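
The most consequential change in this file is visible in the last hunk: request files are now named by a truncated SHA-256 hash of the request's unique key rather than by request ID, so the same unique key always resolves to the same file on disk. A minimal sketch of that mapping, mirroring _get_file_base_name_from_unique_key and _get_request_path above (the example unique key is hypothetical, not taken from this diff):

    from hashlib import sha256

    def request_file_name(unique_key: str, name_length: int = 15) -> str:
        # hexdigest yields only [0-9a-f] characters, so the result is always a
        # filesystem-safe name; truncating to 15 hex chars keeps names short.
        hashed_key = sha256(unique_key.encode('utf-8')).hexdigest()
        return f'{hashed_key[:name_length]}.json'

    # A request with unique_key 'https://example.com/' (hypothetical) would be
    # stored as <queue_dir>/<15 hex chars>.json inside the request queue directory.
    print(request_file_name('https://example.com/'))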