crawlee 0.6.13b17__py3-none-any.whl → 1.1.2b7__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.

Potentially problematic release: this version of crawlee might be problematic.

Files changed (102)
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +35 -33
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +106 -34
  5. crawlee/_utils/context.py +2 -2
  6. crawlee/_utils/file.py +7 -0
  7. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  8. crawlee/_utils/recoverable_state.py +32 -8
  9. crawlee/_utils/recurring_task.py +17 -1
  10. crawlee/_utils/requests.py +0 -26
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +4 -2
  13. crawlee/_utils/system.py +3 -3
  14. crawlee/_utils/time.py +120 -0
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +4 -1
  17. crawlee/browsers/_playwright_browser_controller.py +21 -15
  18. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  19. crawlee/browsers/_types.py +1 -1
  20. crawlee/configuration.py +2 -0
  21. crawlee/crawlers/__init__.py +2 -1
  22. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  23. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +50 -12
  24. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  25. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  28. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  29. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  30. crawlee/crawlers/_basic/_basic_crawler.py +219 -126
  31. crawlee/crawlers/_basic/_logging_utils.py +5 -1
  32. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +60 -11
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/events/_event_manager.py +4 -4
  39. crawlee/events/_types.py +6 -6
  40. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/fingerprint_suite/_types.py +2 -2
  43. crawlee/http_clients/_base.py +4 -0
  44. crawlee/http_clients/_curl_impersonate.py +12 -0
  45. crawlee/http_clients/_httpx.py +16 -6
  46. crawlee/http_clients/_impit.py +25 -10
  47. crawlee/otel/crawler_instrumentor.py +3 -3
  48. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  49. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  50. crawlee/request_loaders/_request_list.py +3 -3
  51. crawlee/request_loaders/_request_loader.py +5 -1
  52. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  53. crawlee/sessions/_models.py +2 -2
  54. crawlee/sessions/_session_pool.py +1 -1
  55. crawlee/statistics/_error_snapshotter.py +1 -1
  56. crawlee/statistics/_models.py +43 -4
  57. crawlee/statistics/_statistics.py +24 -33
  58. crawlee/storage_clients/__init__.py +16 -0
  59. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  60. crawlee/storage_clients/_base/_storage_client.py +13 -0
  61. crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
  62. crawlee/storage_clients/_file_system/_key_value_store_client.py +30 -26
  63. crawlee/storage_clients/_file_system/_request_queue_client.py +169 -153
  64. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  65. crawlee/storage_clients/_file_system/_utils.py +0 -0
  66. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  67. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  68. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  69. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  70. crawlee/storage_clients/_redis/__init__.py +6 -0
  71. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  72. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  73. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  74. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  75. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  76. crawlee/storage_clients/_redis/_utils.py +23 -0
  77. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  78. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  79. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  80. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  81. crawlee/storage_clients/_redis/py.typed +0 -0
  82. crawlee/storage_clients/_sql/__init__.py +6 -0
  83. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  84. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  85. crawlee/storage_clients/_sql/_db_models.py +268 -0
  86. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  87. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  88. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  89. crawlee/storage_clients/_sql/py.typed +0 -0
  90. crawlee/storage_clients/models.py +13 -11
  91. crawlee/storages/_base.py +5 -1
  92. crawlee/storages/_dataset.py +12 -2
  93. crawlee/storages/_key_value_store.py +17 -4
  94. crawlee/storages/_request_queue.py +13 -5
  95. crawlee/storages/_storage_instance_manager.py +133 -71
  96. crawlee/storages/_utils.py +11 -0
  97. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +18 -6
  98. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +101 -78
  99. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
  100. crawlee/_utils/measure_time.py +0 -31
  101. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
  102. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
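
Note: The detailed diff below covers crawlee/storage_clients/_file_system/_request_queue_client.py (entry 63 above, +169 -153). Two changes run through it: request files and state are now keyed by the request's unique_key instead of its id, and a queue can be opened by name (named, global-scope storage) or alias (unnamed, run-scope storage). A minimal usage sketch based only on the open() signature shown in this diff; the private import path is an assumption:

    import asyncio

    from crawlee.configuration import Configuration
    from crawlee.storage_clients._file_system import FileSystemRequestQueueClient  # assumed import path


    async def main() -> None:
        # Exactly one of id/name/alias may be set; setting more than one
        # now raises a ValueError via raise_if_too_many_kwargs.
        rq_client = await FileSystemRequestQueueClient.open(
            id=None,
            name='products',  # hypothetical queue name
            alias=None,
            configuration=Configuration(),
        )
        print(rq_client.path_to_rq)


    asyncio.run(main())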
crawlee/storage_clients/_file_system/_request_queue_client.py
@@ -5,17 +5,19 @@ import json
 import shutil
 from collections import deque
 from datetime import datetime, timezone
+from hashlib import sha256
 from logging import getLogger
 from pathlib import Path
 from typing import TYPE_CHECKING
 
 from pydantic import BaseModel, ValidationError
-from typing_extensions import override
+from typing_extensions import Self, override
 
 from crawlee import Request
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import atomic_write, json_dumps
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee._utils.recoverable_state import RecoverableState
 from crawlee.storage_clients._base import RequestQueueClient
 from crawlee.storage_clients.models import (
@@ -29,6 +31,7 @@ if TYPE_CHECKING:
     from collections.abc import Sequence
 
     from crawlee.configuration import Configuration
+    from crawlee.storages import KeyValueStore
 
 logger = getLogger(__name__)
 
@@ -43,16 +46,16 @@ class RequestQueueState(BaseModel):
     """Counter for forefront request ordering."""
 
     forefront_requests: dict[str, int] = {}
-    """Mapping of forefront request IDs to their sequence numbers."""
+    """Mapping of forefront request unique keys to their sequence numbers."""
 
     regular_requests: dict[str, int] = {}
-    """Mapping of regular request IDs to their sequence numbers."""
+    """Mapping of regular request unique keys to their sequence numbers."""
 
     in_progress_requests: set[str] = set()
-    """Set of request IDs currently being processed."""
+    """Set of request unique keys currently being processed."""
 
     handled_requests: set[str] = set()
-    """Set of request IDs that have been handled."""
+    """Set of request unique keys that have been handled."""
 
 
 class FileSystemRequestQueueClient(RequestQueueClient):
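
Note: RequestQueueState above is the persisted source of truth for ordering and status; after this change every mapping and set in it is keyed by unique_key. An illustrative instance (field names come from this diff, values are invented):

    from crawlee.storage_clients._file_system._request_queue_client import RequestQueueState  # assumed private path

    # Two regular requests and one forefront request, one of them in progress.
    state = RequestQueueState(
        sequence_counter=2,
        forefront_sequence_counter=1,
        forefront_requests={'https://example.com/priority': 0},
        regular_requests={'https://example.com/a': 0, 'https://example.com/b': 1},
        in_progress_requests={'https://example.com/a'},
        handled_requests=set(),
    )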
@@ -88,8 +91,9 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         self,
         *,
         metadata: RequestQueueMetadata,
-        storage_dir: Path,
+        path_to_rq: Path,
         lock: asyncio.Lock,
+        recoverable_state: RecoverableState[RequestQueueState],
     ) -> None:
         """Initialize a new instance.
 
@@ -97,8 +101,8 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """
         self._metadata = metadata
 
-        self._storage_dir = storage_dir
-        """The base directory where the storage data are being persisted."""
+        self._path_to_rq = path_to_rq
+        """The full path to the request queue directory."""
 
         self._lock = lock
         """A lock to ensure that only one operation is performed at a time."""
@@ -112,13 +116,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         self._is_empty_cache: bool | None = None
         """Cache for is_empty result: None means unknown, True/False is cached state."""
 
-        self._state = RecoverableState[RequestQueueState](
-            default_state=RequestQueueState(),
-            persist_state_key='request_queue_state',
-            persistence_enabled=True,
-            persist_state_kvs_name=f'__RQ_STATE_{self._metadata.id}',
-            logger=logger,
-        )
+        self._state = recoverable_state
         """Recoverable state to maintain request ordering, in-progress status, and handled status."""
 
     @override
@@ -128,24 +126,38 @@ class FileSystemRequestQueueClient(RequestQueueClient):
     @property
     def path_to_rq(self) -> Path:
         """The full path to the request queue directory."""
-        if self._metadata.name is None:
-            return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
-
-        return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name
+        return self._path_to_rq
 
     @property
     def path_to_metadata(self) -> Path:
         """The full path to the request queue metadata file."""
         return self.path_to_rq / METADATA_FILENAME
 
+    @classmethod
+    async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState:
+        async def kvs_factory() -> KeyValueStore:
+            from crawlee.storage_clients import FileSystemStorageClient  # noqa: PLC0415 avoid circular import
+            from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import
+
+            return await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=configuration)
+
+        return RecoverableState[RequestQueueState](
+            default_state=RequestQueueState(),
+            persist_state_key=f'__RQ_STATE_{id}',
+            persist_state_kvs_factory=kvs_factory,
+            persistence_enabled=True,
+            logger=logger,
+        )
+
     @classmethod
     async def open(
         cls,
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
         configuration: Configuration,
-    ) -> FileSystemRequestQueueClient:
+    ) -> Self:
         """Open or create a file system request queue client.
 
         This method attempts to open an existing request queue from the file system. If a queue with the specified
@@ -154,17 +166,21 @@ class FileSystemRequestQueueClient(RequestQueueClient):
 
         Args:
             id: The ID of the request queue to open. If provided, searches for existing queue by ID.
-            name: The name of the request queue to open. If not provided, uses the default queue.
+            name: The name of the request queue for named (global scope) storages.
+            alias: The alias of the request queue for unnamed (run scope) storages.
             configuration: The configuration object containing storage directory settings.
 
         Returns:
             An instance for the opened or created storage client.
 
         Raises:
-            ValueError: If a queue with the specified ID is not found, or if metadata is invalid.
+            ValueError: If a queue with the specified ID is not found, if metadata is invalid,
+                or if both name and alias are provided.
         """
-        storage_dir = Path(configuration.storage_dir)
-        rq_base_path = storage_dir / cls._STORAGE_SUBDIR
+        # Validate input parameters.
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+        rq_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
 
         if not rq_base_path.exists():
             await asyncio.to_thread(rq_base_path.mkdir, parents=True, exist_ok=True)
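
Note: raise_if_too_many_kwargs is new in this release (crawlee/_utils/raise_if_too_many_kwargs.py, +12 lines in the file list) and its body is not part of this diff. Judging from the call site above, a helper of roughly this shape would fit; this is a hypothetical sketch, not the shipped code:

    def raise_if_too_many_kwargs(**kwargs: object) -> None:
        """Raise ValueError if more than one of the given kwargs is set (not None)."""
        set_kwargs = [name for name, value in kwargs.items() if value is not None]
        if len(set_kwargs) > 1:
            raise ValueError(f'Only one of {", ".join(set_kwargs)} can be specified.')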
@@ -176,12 +192,12 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 if not rq_dir.is_dir():
                     continue
 
-                metadata_path = rq_dir / METADATA_FILENAME
-                if not metadata_path.exists():
+                path_to_metadata = rq_dir / METADATA_FILENAME
+                if not path_to_metadata.exists():
                     continue
 
                 try:
-                    file = await asyncio.to_thread(metadata_path.open)
+                    file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
                     try:
                         file_content = json.load(file)
                         metadata = RequestQueueMetadata(**file_content)
@@ -189,8 +205,11 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                         if metadata.id == id:
                             client = cls(
                                 metadata=metadata,
-                                storage_dir=storage_dir,
+                                path_to_rq=rq_base_path / rq_dir,
                                 lock=asyncio.Lock(),
+                                recoverable_state=await cls._create_recoverable_state(
+                                    id=id, configuration=configuration
+                                ),
                             )
                             await client._state.initialize()
                             await client._discover_existing_requests()
@@ -205,14 +224,15 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             if not found:
                 raise ValueError(f'Request queue with ID "{id}" not found')
 
-        # Open an existing RQ by its name, or create a new one if not found.
+        # Open an existing RQ by its name or alias, or create a new one if not found.
         else:
-            rq_path = rq_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT if name is None else rq_base_path / name
-            metadata_path = rq_path / METADATA_FILENAME
+            rq_dir = Path(name) if name else Path(alias) if alias else Path('default')
+            path_to_rq = rq_base_path / rq_dir
+            path_to_metadata = path_to_rq / METADATA_FILENAME
 
             # If the RQ directory exists, reconstruct the client from the metadata file.
-            if rq_path.exists() and metadata_path.exists():
-                file = await asyncio.to_thread(open, metadata_path)
+            if path_to_rq.exists() and path_to_metadata.exists():
+                file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                 finally:
@@ -220,14 +240,13 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 try:
                     metadata = RequestQueueMetadata(**file_content)
                 except ValidationError as exc:
-                    raise ValueError(f'Invalid metadata file for request queue "{name}"') from exc
-
-                metadata.name = name
+                    raise ValueError(f'Invalid metadata file for request queue "{name or alias}"') from exc
 
                 client = cls(
                     metadata=metadata,
-                    storage_dir=storage_dir,
+                    path_to_rq=path_to_rq,
                     lock=asyncio.Lock(),
+                    recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
                 )
 
                 await client._state.initialize()
@@ -250,8 +269,9 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 )
                 client = cls(
                     metadata=metadata,
-                    storage_dir=storage_dir,
+                    path_to_rq=path_to_rq,
                     lock=asyncio.Lock(),
+                    recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
                 )
                 await client._state.initialize()
                 await client._update_metadata()
@@ -311,37 +331,52 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         unprocessed_requests = list[UnprocessedRequest]()
         state = self._state.current_value
 
-        # Prepare a dictionary to track existing requests by their unique keys.
-        existing_unique_keys: dict[str, Path] = {}
-        existing_request_files = await self._get_request_files(self.path_to_rq)
+        all_requests = state.forefront_requests | state.regular_requests
 
-        for request_file in existing_request_files:
-            existing_request = await self._parse_request_file(request_file)
-            if existing_request is not None:
-                existing_unique_keys[existing_request.unique_key] = request_file
+        requests_to_enqueue = {}
 
-        # Process each request in the batch.
+        # Determine which requests can be added or are modified.
         for request in requests:
-            existing_request_file = existing_unique_keys.get(request.unique_key)
-            existing_request = None
-
-            # Only load the full request from disk if we found a duplicate
-            if existing_request_file is not None:
-                existing_request = await self._parse_request_file(existing_request_file)
-
-            # If there is no existing request with the same unique key, add the new request.
-            if existing_request is None:
-                request_path = self._get_request_path(request.id)
+            # Check if the request has already been handled.
+            if request.unique_key in state.handled_requests:
+                processed_requests.append(
+                    ProcessedRequest(
+                        unique_key=request.unique_key,
+                        was_already_present=True,
+                        was_already_handled=True,
+                    )
+                )
+            # Check if the request is already in progress.
+            # Or if the request is already in the queue and the `forefront` flag is not used, we do not change the
+            # position of the request.
+            elif (request.unique_key in state.in_progress_requests) or (
+                request.unique_key in all_requests and not forefront
+            ):
+                processed_requests.append(
+                    ProcessedRequest(
+                        unique_key=request.unique_key,
+                        was_already_present=True,
+                        was_already_handled=False,
+                    )
+                )
+            # These requests must either be added or update their position.
+            else:
+                requests_to_enqueue[request.unique_key] = request
 
+        # Process each request in the batch.
+        for request in requests_to_enqueue.values():
+            # If the request is not already in the RQ, this is a new request.
+            if request.unique_key not in all_requests:
+                request_path = self._get_request_path(request.unique_key)
                 # Add sequence number to ensure FIFO ordering using state.
                 if forefront:
                     sequence_number = state.forefront_sequence_counter
                     state.forefront_sequence_counter += 1
-                    state.forefront_requests[request.id] = sequence_number
+                    state.forefront_requests[request.unique_key] = sequence_number
                 else:
                     sequence_number = state.sequence_counter
                     state.sequence_counter += 1
-                    state.regular_requests[request.id] = sequence_number
+                    state.regular_requests[request.unique_key] = sequence_number
 
                 # Save the clean request without extra fields
                 request_data = await json_dumps(request.model_dump())
@@ -351,71 +386,41 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 new_total_request_count += 1
                 new_pending_request_count += 1
 
-                # Add to our index for subsequent requests in this batch
-                existing_unique_keys[request.unique_key] = self._get_request_path(request.id)
-
                 processed_requests.append(
                     ProcessedRequest(
-                        id=request.id,
                         unique_key=request.unique_key,
                         was_already_present=False,
                         was_already_handled=False,
                     )
                 )
 
-            # If the request already exists in the RQ, just update it if needed.
-            else:
-                # Set the processed request flags.
-                was_already_present = existing_request is not None
-                was_already_handled = existing_request.id in state.handled_requests
-
-                # If the request is already in the RQ and handled, just continue with the next one.
-                if was_already_present and was_already_handled:
-                    processed_requests.append(
-                        ProcessedRequest(
-                            id=existing_request.id,
-                            unique_key=request.unique_key,
-                            was_already_present=True,
-                            was_already_handled=True,
-                        )
-                    )
+            # If the request already exists in the RQ and use the forefront flag to update its position
+            elif forefront:
+                # If the request is among `regular`, remove it from its current position.
+                if request.unique_key in state.regular_requests:
+                    state.regular_requests.pop(request.unique_key)
 
-                # If the request is already in the RQ but not handled yet, update it.
-                elif was_already_present and not was_already_handled:
-                    # Update request type (forefront vs regular) in state
-                    if forefront:
-                        # Move from regular to forefront if needed
-                        if existing_request.id in state.regular_requests:
-                            state.regular_requests.pop(existing_request.id)
-                        if existing_request.id not in state.forefront_requests:
-                            state.forefront_requests[existing_request.id] = state.forefront_sequence_counter
-                            state.forefront_sequence_counter += 1
-                    elif (
-                        existing_request.id not in state.forefront_requests
-                        and existing_request.id not in state.regular_requests
-                    ):
-                        # Keep as regular if not already forefront
-                        state.regular_requests[existing_request.id] = state.sequence_counter
-                        state.sequence_counter += 1
-
-                    processed_requests.append(
-                        ProcessedRequest(
-                            id=existing_request.id,
-                            unique_key=request.unique_key,
-                            was_already_present=True,
-                            was_already_handled=False,
-                        )
+                # If the request is already in `forefront`, we just need to update its position.
+                state.forefront_requests[request.unique_key] = state.forefront_sequence_counter
+                state.forefront_sequence_counter += 1
+
+                processed_requests.append(
+                    ProcessedRequest(
+                        unique_key=request.unique_key,
+                        was_already_present=True,
+                        was_already_handled=False,
                     )
+                )
 
-                else:
-                    logger.warning(f'Request with unique key "{request.unique_key}" could not be processed.')
-                    unprocessed_requests.append(
-                        UnprocessedRequest(
-                            unique_key=request.unique_key,
-                            url=request.url,
-                            method=request.method,
-                        )
+            else:
+                logger.warning(f'Request with unique key "{request.unique_key}" could not be processed.')
+                unprocessed_requests.append(
+                    UnprocessedRequest(
+                        unique_key=request.unique_key,
+                        url=request.url,
+                        method=request.method,
                     )
+                )
 
         await self._update_metadata(
             update_modified_at=True,
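
Note on the hunk above: the old implementation re-read every request file from disk to build existing_unique_keys before deduplicating; the new one answers the same question from the in-memory recoverable state, merging the two ordering maps with a dict union. For reference, the union used above has plain PEP 584 semantics (Python 3.9+):

    # Keys from both mappings; the right-hand operand wins on key clashes.
    forefront_requests = {'key-a': 0}
    regular_requests = {'key-b': 0, 'key-c': 1}
    all_requests = forefront_requests | regular_requests
    assert set(all_requests) == {'key-a', 'key-b', 'key-c'}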
@@ -437,17 +442,17 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         )
 
     @override
-    async def get_request(self, request_id: str) -> Request | None:
+    async def get_request(self, unique_key: str) -> Request | None:
         async with self._lock:
-            request_path = self._get_request_path(request_id)
+            request_path = self._get_request_path(unique_key)
             request = await self._parse_request_file(request_path)
 
             if request is None:
-                logger.warning(f'Request with ID "{request_id}" not found in the queue.')
+                logger.warning(f'Request with unique key "{unique_key}" not found in the queue.')
                 return None
 
             state = self._state.current_value
-            state.in_progress_requests.add(request.id)
+            state.in_progress_requests.add(request.unique_key)
             await self._update_metadata(update_accessed_at=True)
             return request
 
@@ -466,11 +471,11 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 candidate = self._request_cache.popleft()
 
                 # Skip requests that are already in progress, however this should not happen.
-                if candidate.id not in state.in_progress_requests:
+                if candidate.unique_key not in state.in_progress_requests:
                     next_request = candidate
 
             if next_request is not None:
-                state.in_progress_requests.add(next_request.id)
+                state.in_progress_requests.add(next_request.unique_key)
 
             return next_request
 
@@ -481,8 +486,8 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             state = self._state.current_value
 
             # Check if the request is in progress.
-            if request.id not in state.in_progress_requests:
-                logger.warning(f'Marking request {request.id} as handled that is not in progress.')
+            if request.unique_key not in state.in_progress_requests:
+                logger.warning(f'Marking request {request.unique_key} as handled that is not in progress.')
                 return None
 
             # Update the request's handled_at timestamp.
@@ -490,18 +495,18 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             request.handled_at = datetime.now(timezone.utc)
 
             # Dump the updated request to the file.
-            request_path = self._get_request_path(request.id)
+            request_path = self._get_request_path(request.unique_key)
 
             if not await asyncio.to_thread(request_path.exists):
-                logger.warning(f'Request file for {request.id} does not exist, cannot mark as handled.')
+                logger.warning(f'Request file for {request.unique_key} does not exist, cannot mark as handled.')
                 return None
 
             request_data = await json_dumps(request.model_dump())
             await atomic_write(request_path, request_data)
 
             # Update state: remove from in-progress and add to handled.
-            state.in_progress_requests.discard(request.id)
-            state.handled_requests.add(request.id)
+            state.in_progress_requests.discard(request.unique_key)
+            state.handled_requests.add(request.unique_key)
 
             # Update RQ metadata.
             await self._update_metadata(
@@ -512,7 +517,6 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             )
 
             return ProcessedRequest(
-                id=request.id,
                 unique_key=request.unique_key,
                 was_already_present=True,
                 was_already_handled=True,
@@ -530,36 +534,36 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             state = self._state.current_value
 
             # Check if the request is in progress.
-            if request.id not in state.in_progress_requests:
-                logger.info(f'Reclaiming request {request.id} that is not in progress.')
+            if request.unique_key not in state.in_progress_requests:
+                logger.info(f'Reclaiming request {request.unique_key} that is not in progress.')
                 return None
 
-            request_path = self._get_request_path(request.id)
+            request_path = self._get_request_path(request.unique_key)
 
             if not await asyncio.to_thread(request_path.exists):
-                logger.warning(f'Request file for {request.id} does not exist, cannot reclaim.')
+                logger.warning(f'Request file for {request.unique_key} does not exist, cannot reclaim.')
                 return None
 
             # Update sequence number and state to ensure proper ordering.
             if forefront:
                 # Remove from regular requests if it was there
-                state.regular_requests.pop(request.id, None)
+                state.regular_requests.pop(request.unique_key, None)
                 sequence_number = state.forefront_sequence_counter
                 state.forefront_sequence_counter += 1
-                state.forefront_requests[request.id] = sequence_number
+                state.forefront_requests[request.unique_key] = sequence_number
             else:
                 # Remove from forefront requests if it was there
-                state.forefront_requests.pop(request.id, None)
+                state.forefront_requests.pop(request.unique_key, None)
                 sequence_number = state.sequence_counter
                 state.sequence_counter += 1
-                state.regular_requests[request.id] = sequence_number
+                state.regular_requests[request.unique_key] = sequence_number
 
             # Save the clean request without extra fields
             request_data = await json_dumps(request.model_dump())
             await atomic_write(request_path, request_data)
 
             # Remove from in-progress.
-            state.in_progress_requests.discard(request.id)
+            state.in_progress_requests.discard(request.unique_key)
 
             # Update RQ metadata.
             await self._update_metadata(
@@ -574,7 +578,6 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 self._request_cache.append(request)
 
             return ProcessedRequest(
-                id=request.id,
                 unique_key=request.unique_key,
                 was_already_present=True,
                 was_already_handled=False,
@@ -597,7 +600,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             # If we have a cached requests, check them first (fast path).
             if self._request_cache:
                 for req in self._request_cache:
-                    if req.id not in state.handled_requests:
+                    if req.unique_key not in state.handled_requests:
                         self._is_empty_cache = False
                         return False
                 self._is_empty_cache = True
@@ -617,16 +620,16 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             self._is_empty_cache = True
             return True
 
-    def _get_request_path(self, request_id: str) -> Path:
+    def _get_request_path(self, unique_key: str) -> Path:
         """Get the path to a specific request file.
 
         Args:
-            request_id: The ID of the request.
+            unique_key: Unique key of the request.
 
         Returns:
             The path to the request file.
         """
-        return self.path_to_rq / f'{request_id}.json'
+        return self.path_to_rq / f'{self._get_file_base_name_from_unique_key(unique_key)}.json'
 
     async def _update_metadata(
         self,
@@ -699,23 +702,23 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 continue
 
             # Skip handled requests
-            if request.id in state.handled_requests:
+            if request.unique_key in state.handled_requests:
                 continue
 
             # Skip in-progress requests
-            if request.id in state.in_progress_requests:
+            if request.unique_key in state.in_progress_requests:
                 continue
 
             # Determine if request is forefront or regular based on state
-            if request.id in state.forefront_requests:
-                sequence = state.forefront_requests[request.id]
+            if request.unique_key in state.forefront_requests:
+                sequence = state.forefront_requests[request.unique_key]
                 forefront_requests.append((request, sequence))
-            elif request.id in state.regular_requests:
-                sequence = state.regular_requests[request.id]
+            elif request.unique_key in state.regular_requests:
+                sequence = state.regular_requests[request.unique_key]
                 regular_requests.append((request, sequence))
             else:
                 # Request not in state, skip it (might be orphaned)
-                logger.warning(f'Request {request.id} not found in state, skipping.')
+                logger.warning(f'Request {request.unique_key} not found in state, skipping.')
                 continue
 
         # Sort forefront requests by sequence (newest first for LIFO behavior).
@@ -756,10 +759,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         files = await asyncio.to_thread(list, path_to_rq.glob('*.json'))
 
         # Filter out metadata file and non-file entries.
-        filtered = filter(
-            lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME,
-            files,
-        )
+        filtered = filter(lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME, files)
 
         return list(filtered)
 
@@ -775,7 +775,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """
         # Open the request file.
         try:
-            file = await asyncio.to_thread(open, file_path)
+            file = await asyncio.to_thread(open, file_path, 'r', encoding='utf-8')
         except FileNotFoundError:
             logger.warning(f'Request file "{file_path}" not found.')
             return None
@@ -807,11 +807,27 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 continue
 
             # Add request to state as regular request (assign sequence numbers)
-            if request.id not in state.regular_requests and request.id not in state.forefront_requests:
+            if request.unique_key not in state.regular_requests and request.unique_key not in state.forefront_requests:
                 # Assign as regular request with current sequence counter
-                state.regular_requests[request.id] = state.sequence_counter
+                state.regular_requests[request.unique_key] = state.sequence_counter
                 state.sequence_counter += 1
 
                 # Check if request was already handled
                 if request.handled_at is not None:
-                    state.handled_requests.add(request.id)
+                    state.handled_requests.add(request.unique_key)
+
+    @staticmethod
+    def _get_file_base_name_from_unique_key(unique_key: str) -> str:
+        """Generate a deterministic file name for a unique_key.
+
+        Args:
+            unique_key: Unique key to be used to generate filename.
+
+        Returns:
+            A file name based on the unique_key.
+        """
+        # hexdigest produces filename-compliant strings
+        hashed_key = sha256(unique_key.encode('utf-8')).hexdigest()
+        name_length = 15
+        # Truncate the key to the desired length
+        return hashed_key[:name_length]
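
Note: with file names now derived from unique_key rather than a generated id, _get_file_base_name_from_unique_key guarantees a deterministic, filesystem-safe name even for keys containing slashes or exceeding path-length limits. Truncating the SHA-256 digest to 15 hex characters (60 bits) keeps names short at a small collision risk for very large queues. A standalone reproduction of the scheme; the example key is invented:

    from hashlib import sha256

    unique_key = 'https://example.com/some/long/path?page=2'
    base_name = sha256(unique_key.encode('utf-8')).hexdigest()[:15]
    print(f'{base_name}.json')  # deterministic request file name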