crawlee 0.6.13b17__py3-none-any.whl → 1.1.2b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +35 -33
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +106 -34
- crawlee/_utils/context.py +2 -2
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +17 -1
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +4 -2
- crawlee/_utils/system.py +3 -3
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +50 -12
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +219 -126
- crawlee/crawlers/_basic/_logging_utils.py +5 -1
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +60 -11
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/events/_event_manager.py +4 -4
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/sessions/_models.py +2 -2
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +43 -4
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
- crawlee/storage_clients/_file_system/_key_value_store_client.py +30 -26
- crawlee/storage_clients/_file_system/_request_queue_client.py +169 -153
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +13 -11
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +133 -71
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +18 -6
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +101 -78
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
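The largest additions are the new `crawlee/storage_clients/_redis` and `crawlee/storage_clients/_sql` packages, which sit alongside the existing file-system and memory backends behind the common storage client interface. The sketch below shows the general pattern for plugging a storage client into a storage, using only calls that appear in the hunks below (`FileSystemStorageClient` and `KeyValueStore.open`); the constructor arguments of the new Redis and SQL clients are not visible in this diff, so swapping them in is an assumption.

```python
# Minimal sketch based on the KeyValueStore.open(storage_client=..., configuration=...)
# call visible in the request queue hunks below; the new backends are assumed
# to follow the same pattern.
import asyncio

from crawlee.configuration import Configuration
from crawlee.storage_clients import FileSystemStorageClient
from crawlee.storages import KeyValueStore


async def main() -> None:
    # Open a key-value store backed by an explicitly chosen storage client.
    kvs = await KeyValueStore.open(
        storage_client=FileSystemStorageClient(),
        configuration=Configuration(),
    )
    await kvs.set_value('greeting', 'hello')


asyncio.run(main())
```

The detailed hunks below are from crawlee/storage_clients/_file_system/_request_queue_client.py (+169 -153).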
```diff
@@ -5,17 +5,19 @@ import json
 import shutil
 from collections import deque
 from datetime import datetime, timezone
+from hashlib import sha256
 from logging import getLogger
 from pathlib import Path
 from typing import TYPE_CHECKING
 
 from pydantic import BaseModel, ValidationError
-from typing_extensions import override
+from typing_extensions import Self, override
 
 from crawlee import Request
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import atomic_write, json_dumps
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee._utils.recoverable_state import RecoverableState
 from crawlee.storage_clients._base import RequestQueueClient
 from crawlee.storage_clients.models import (
```
```diff
@@ -29,6 +31,7 @@ if TYPE_CHECKING:
     from collections.abc import Sequence
 
     from crawlee.configuration import Configuration
+    from crawlee.storages import KeyValueStore
 
 logger = getLogger(__name__)
 
```
```diff
@@ -43,16 +46,16 @@ class RequestQueueState(BaseModel):
     """Counter for forefront request ordering."""
 
     forefront_requests: dict[str, int] = {}
-    """Mapping of forefront request IDs to their sequence numbers."""
+    """Mapping of forefront request unique keys to their sequence numbers."""
 
     regular_requests: dict[str, int] = {}
-    """Mapping of regular request IDs to their sequence numbers."""
+    """Mapping of regular request unique keys to their sequence numbers."""
 
    in_progress_requests: set[str] = set()
-    """Set of request IDs currently being processed."""
+    """Set of request unique keys currently being processed."""
 
     handled_requests: set[str] = set()
-    """Set of request IDs that have been handled."""
+    """Set of request unique keys that have been handled."""
 
 
 class FileSystemRequestQueueClient(RequestQueueClient):
```
```diff
@@ -88,8 +91,9 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         self,
         *,
         metadata: RequestQueueMetadata,
-        storage_dir: Path,
+        path_to_rq: Path,
         lock: asyncio.Lock,
+        recoverable_state: RecoverableState[RequestQueueState],
     ) -> None:
         """Initialize a new instance.
 
```
```diff
@@ -97,8 +101,8 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """
         self._metadata = metadata
 
-        self._storage_dir = storage_dir
-        """The base directory where the storage data are being persisted."""
+        self._path_to_rq = path_to_rq
+        """The full path to the request queue directory."""
 
         self._lock = lock
         """A lock to ensure that only one operation is performed at a time."""
```
```diff
@@ -112,13 +116,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         self._is_empty_cache: bool | None = None
         """Cache for is_empty result: None means unknown, True/False is cached state."""
 
-        self._state = RecoverableState[RequestQueueState](
-            default_state=RequestQueueState(),
-            persist_state_key='request_queue_state',
-            persistence_enabled=True,
-            persist_state_kvs_name=f'__RQ_STATE_{self._metadata.id}',
-            logger=logger,
-        )
+        self._state = recoverable_state
         """Recoverable state to maintain request ordering, in-progress status, and handled status."""
 
     @override
```
```diff
@@ -128,24 +126,38 @@ class FileSystemRequestQueueClient(RequestQueueClient):
     @property
     def path_to_rq(self) -> Path:
         """The full path to the request queue directory."""
-        if self._metadata.name is None:
-            return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
-
-        return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name
+        return self._path_to_rq
 
     @property
     def path_to_metadata(self) -> Path:
         """The full path to the request queue metadata file."""
         return self.path_to_rq / METADATA_FILENAME
 
+    @classmethod
+    async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState:
+        async def kvs_factory() -> KeyValueStore:
+            from crawlee.storage_clients import FileSystemStorageClient  # noqa: PLC0415 avoid circular import
+            from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import
+
+            return await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=configuration)
+
+        return RecoverableState[RequestQueueState](
+            default_state=RequestQueueState(),
+            persist_state_key=f'__RQ_STATE_{id}',
+            persist_state_kvs_factory=kvs_factory,
+            persistence_enabled=True,
+            logger=logger,
+        )
+
     @classmethod
     async def open(
         cls,
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
         configuration: Configuration,
-    ) -> FileSystemRequestQueueClient:
+    ) -> Self:
         """Open or create a file system request queue client.
 
         This method attempts to open an existing request queue from the file system. If a queue with the specified
```
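The new `_create_recoverable_state` factory replaces the `RecoverableState` that was previously built eagerly in `__init__`: the key-value store that persists the state is now produced by an async `kvs_factory`, invoked only when the state machinery actually needs it, which is also why the `FileSystemStorageClient` and `KeyValueStore` imports are deferred into the function body (note the `# noqa: PLC0415 avoid circular import` comments). A generic sketch of the same lazy-factory pattern, with hypothetical names rather than crawlee's `RecoverableState` itself:

```python
# Illustration of the lazy-factory pattern used above (hypothetical class,
# not crawlee API): the expensive or circularly-imported dependency is
# created on first use instead of at construction time.
from collections.abc import Awaitable, Callable
from typing import Generic, TypeVar

T = TypeVar('T')


class Lazy(Generic[T]):
    def __init__(self, factory: Callable[[], Awaitable[T]]) -> None:
        self._factory = factory
        self._value: T | None = None

    async def get(self) -> T:
        # The factory runs at most once, on first access.
        if self._value is None:
            self._value = await self._factory()
        return self._value
```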
```diff
@@ -154,17 +166,21 @@ class FileSystemRequestQueueClient(RequestQueueClient):
 
         Args:
             id: The ID of the request queue to open. If provided, searches for existing queue by ID.
-            name: The name of the request queue
+            name: The name of the request queue for named (global scope) storages.
+            alias: The alias of the request queue for unnamed (run scope) storages.
             configuration: The configuration object containing storage directory settings.
 
         Returns:
             An instance for the opened or created storage client.
 
         Raises:
-            ValueError: If a queue with the specified ID is not found,
+            ValueError: If a queue with the specified ID is not found, if metadata is invalid,
+                or if both name and alias are provided.
         """
-
-
+        # Validate input parameters.
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+        rq_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
 
         if not rq_base_path.exists():
             await asyncio.to_thread(rq_base_path.mkdir, parents=True, exist_ok=True)
```
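`raise_if_too_many_kwargs` is a new 12-line helper (see `crawlee/_utils/raise_if_too_many_kwargs.py` in the file list); its body is not part of this diff. From the call site `raise_if_too_many_kwargs(id=id, name=name, alias=alias)` and the documented `ValueError` for passing both `name` and `alias`, a plausible contract is that at most one of the given keyword arguments may be non-None. A sketch under that assumption:

```python
# Guessed implementation, inferred only from the call site above; the real
# helper in crawlee/_utils/raise_if_too_many_kwargs.py may differ.
def raise_if_too_many_kwargs(**kwargs: object) -> None:
    set_kwargs = [key for key, value in kwargs.items() if value is not None]
    if len(set_kwargs) > 1:
        raise ValueError(
            f'Only one of {", ".join(kwargs)} can be specified, but got: {", ".join(set_kwargs)}'
        )
```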
```diff
@@ -176,12 +192,12 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             if not rq_dir.is_dir():
                 continue
 
-            metadata_path = rq_dir / METADATA_FILENAME
-            if not metadata_path.exists():
+            path_to_metadata = rq_dir / METADATA_FILENAME
+            if not path_to_metadata.exists():
                 continue
 
             try:
-                file = await asyncio.to_thread(open, metadata_path)
+                file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = RequestQueueMetadata(**file_content)
```
```diff
@@ -189,8 +205,11 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                     if metadata.id == id:
                         client = cls(
                             metadata=metadata,
-                            storage_dir=Path(configuration.storage_dir),
+                            path_to_rq=rq_base_path / rq_dir,
                             lock=asyncio.Lock(),
+                            recoverable_state=await cls._create_recoverable_state(
+                                id=id, configuration=configuration
+                            ),
                         )
                         await client._state.initialize()
                         await client._discover_existing_requests()
```
```diff
@@ -205,14 +224,15 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             if not found:
                 raise ValueError(f'Request queue with ID "{id}" not found')
 
-        # Open an existing RQ by its name, or create a new one if not found.
+        # Open an existing RQ by its name or alias, or create a new one if not found.
         else:
-
-
+            rq_dir = Path(name) if name else Path(alias) if alias else Path('default')
+            path_to_rq = rq_base_path / rq_dir
+            path_to_metadata = path_to_rq / METADATA_FILENAME
 
             # If the RQ directory exists, reconstruct the client from the metadata file.
-            if
-                file = await asyncio.to_thread(open,
+            if path_to_rq.exists() and path_to_metadata.exists():
+                file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                 finally:
```
```diff
@@ -220,14 +240,13 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 try:
                     metadata = RequestQueueMetadata(**file_content)
                 except ValidationError as exc:
-                    raise ValueError(f'Invalid metadata file for request queue "{name}"') from exc
-
-                metadata.name = name
+                    raise ValueError(f'Invalid metadata file for request queue "{name or alias}"') from exc
 
                 client = cls(
                     metadata=metadata,
-                    storage_dir=Path(configuration.storage_dir),
+                    path_to_rq=path_to_rq,
                     lock=asyncio.Lock(),
+                    recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
                 )
 
                 await client._state.initialize()
```
```diff
@@ -250,8 +269,9 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 )
                 client = cls(
                     metadata=metadata,
-                    storage_dir=Path(configuration.storage_dir),
+                    path_to_rq=path_to_rq,
                     lock=asyncio.Lock(),
+                    recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
                 )
                 await client._state.initialize()
                 await client._update_metadata()
```
```diff
@@ -311,37 +331,52 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             unprocessed_requests = list[UnprocessedRequest]()
             state = self._state.current_value
 
-
-            existing_unique_keys: dict[str, Path] = {}
-            existing_request_files = await self._get_request_files(self.path_to_rq)
+            all_requests = state.forefront_requests | state.regular_requests
 
-            for request_file in existing_request_files:
-                existing_request = await self._parse_request_file(request_file)
-                if existing_request is not None:
-                    existing_unique_keys[existing_request.unique_key] = request_file
+            requests_to_enqueue = {}
 
-            #
+            # Determine which requests can be added or are modified.
             for request in requests:
-
-
-
-
-
-
-
-
-
-
+                # Check if the request has already been handled.
+                if request.unique_key in state.handled_requests:
+                    processed_requests.append(
+                        ProcessedRequest(
+                            unique_key=request.unique_key,
+                            was_already_present=True,
+                            was_already_handled=True,
+                        )
+                    )
+                # Check if the request is already in progress.
+                # Or if the request is already in the queue and the `forefront` flag is not used, we do not change the
+                # position of the request.
+                elif (request.unique_key in state.in_progress_requests) or (
+                    request.unique_key in all_requests and not forefront
+                ):
+                    processed_requests.append(
+                        ProcessedRequest(
+                            unique_key=request.unique_key,
+                            was_already_present=True,
+                            was_already_handled=False,
+                        )
+                    )
+                # These requests must either be added or update their position.
+                else:
+                    requests_to_enqueue[request.unique_key] = request
 
+            # Process each request in the batch.
+            for request in requests_to_enqueue.values():
+                # If the request is not already in the RQ, this is a new request.
+                if request.unique_key not in all_requests:
+                    request_path = self._get_request_path(request.unique_key)
                     # Add sequence number to ensure FIFO ordering using state.
                     if forefront:
                         sequence_number = state.forefront_sequence_counter
                         state.forefront_sequence_counter += 1
-                        state.forefront_requests[request.id] = sequence_number
+                        state.forefront_requests[request.unique_key] = sequence_number
                     else:
                         sequence_number = state.sequence_counter
                         state.sequence_counter += 1
-                        state.regular_requests[request.id] = sequence_number
+                        state.regular_requests[request.unique_key] = sequence_number
 
                     # Save the clean request without extra fields
                     request_data = await json_dumps(request.model_dump())
```
```diff
@@ -351,71 +386,41 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                     new_total_request_count += 1
                     new_pending_request_count += 1
 
-                    # Add to our index for subsequent requests in this batch
-                    existing_unique_keys[request.unique_key] = self._get_request_path(request.id)
-
                     processed_requests.append(
                         ProcessedRequest(
-                            id=request.id,
                             unique_key=request.unique_key,
                             was_already_present=False,
                             was_already_handled=False,
                         )
                     )
 
-                # If the request already exists in the RQ
-
-                #
-
-
-
-                # If the request is already in the RQ and handled, just continue with the next one.
-                if was_already_present and was_already_handled:
-                    processed_requests.append(
-                        ProcessedRequest(
-                            id=existing_request.id,
-                            unique_key=request.unique_key,
-                            was_already_present=True,
-                            was_already_handled=True,
-                        )
-                    )
+                # If the request already exists in the RQ and use the forefront flag to update its position
+                elif forefront:
+                    # If the request is among `regular`, remove it from its current position.
+                    if request.unique_key in state.regular_requests:
+                        state.regular_requests.pop(request.unique_key)
 
-                # If the request is already in
-
-
-
-
-
-
-
-
-                        state.forefront_sequence_counter += 1
-                    elif (
-                        existing_request.id not in state.forefront_requests
-                        and existing_request.id not in state.regular_requests
-                    ):
-                        # Keep as regular if not already forefront
-                        state.regular_requests[existing_request.id] = state.sequence_counter
-                        state.sequence_counter += 1
-
-                    processed_requests.append(
-                        ProcessedRequest(
-                            id=existing_request.id,
-                            unique_key=request.unique_key,
-                            was_already_present=True,
-                            was_already_handled=False,
-                        )
+                    # If the request is already in `forefront`, we just need to update its position.
+                    state.forefront_requests[request.unique_key] = state.forefront_sequence_counter
+                    state.forefront_sequence_counter += 1
+
+                    processed_requests.append(
+                        ProcessedRequest(
+                            unique_key=request.unique_key,
+                            was_already_present=True,
+                            was_already_handled=False,
                         )
+                    )
 
-
-
-
-
-
-
-
-                    )
+                else:
+                    logger.warning(f'Request with unique key "{request.unique_key}" could not be processed.')
+                    unprocessed_requests.append(
+                        UnprocessedRequest(
+                            unique_key=request.unique_key,
+                            url=request.url,
+                            method=request.method,
                         )
+                    )
 
             await self._update_metadata(
                 update_modified_at=True,
```
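Taken together, the two loops in the batch-add path now classify every incoming request against the in-memory `RequestQueueState` before anything touches the disk. A condensed restatement of the branching order (illustrative only; names mirror the hunks above, this is not the actual method):

```python
# Compact restatement of the two-pass classification shown above.
def classify(unique_key: str, state, *, forefront: bool) -> str:
    all_requests = state.forefront_requests | state.regular_requests
    # Pass 1: report duplicates without changing anything.
    if unique_key in state.handled_requests:
        return 'already handled'
    if unique_key in state.in_progress_requests or (unique_key in all_requests and not forefront):
        return 'already present, position unchanged'
    # Pass 2: everything else is enqueued or repositioned.
    if unique_key not in all_requests:
        return 'new: write request file, assign next sequence number'
    if forefront:
        return 'existing: move to the forefront'
    return 'fallback: reported as unprocessed'
```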
```diff
@@ -437,17 +442,17 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             )
 
     @override
-    async def get_request(self, request_id: str) -> Request | None:
+    async def get_request(self, unique_key: str) -> Request | None:
         async with self._lock:
-            request_path = self._get_request_path(request_id)
+            request_path = self._get_request_path(unique_key)
             request = await self._parse_request_file(request_path)
 
             if request is None:
-                logger.warning(f'Request with ID "{request_id}" not found in the queue.')
+                logger.warning(f'Request with unique key "{unique_key}" not found in the queue.')
                 return None
 
             state = self._state.current_value
-            state.in_progress_requests.add(request.id)
+            state.in_progress_requests.add(request.unique_key)
             await self._update_metadata(update_accessed_at=True)
             return request
 
```
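`get_request` is now keyed by `unique_key` instead of the old request ID, matching the removal of the `id` field from `ProcessedRequest` elsewhere in this diff. A hedged usage fragment:

```python
# Sketch of the new call shape; `rq_client` is assumed to be an opened
# FileSystemRequestQueueClient, and the unique key is whatever was used when
# the request was enqueued (by default derived from the URL).
async def inspect(rq_client, unique_key: str) -> None:
    request = await rq_client.get_request(unique_key)
    if request is not None:
        print(request.url)  # the request is now tracked as in progress
```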
```diff
@@ -466,11 +471,11 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 candidate = self._request_cache.popleft()
 
                 # Skip requests that are already in progress, however this should not happen.
-                if candidate.id not in state.in_progress_requests:
+                if candidate.unique_key not in state.in_progress_requests:
                     next_request = candidate
 
             if next_request is not None:
-                state.in_progress_requests.add(next_request.id)
+                state.in_progress_requests.add(next_request.unique_key)
 
             return next_request
 
```
```diff
@@ -481,8 +486,8 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             state = self._state.current_value
 
             # Check if the request is in progress.
-            if request.id not in state.in_progress_requests:
-                logger.warning(f'Marking request {request.id} as handled that is not in progress.')
+            if request.unique_key not in state.in_progress_requests:
+                logger.warning(f'Marking request {request.unique_key} as handled that is not in progress.')
                 return None
 
             # Update the request's handled_at timestamp.
```
```diff
@@ -490,18 +495,18 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 request.handled_at = datetime.now(timezone.utc)
 
             # Dump the updated request to the file.
-            request_path = self._get_request_path(request.id)
+            request_path = self._get_request_path(request.unique_key)
 
             if not await asyncio.to_thread(request_path.exists):
-                logger.warning(f'Request file for {request.id} does not exist, cannot mark as handled.')
+                logger.warning(f'Request file for {request.unique_key} does not exist, cannot mark as handled.')
                 return None
 
             request_data = await json_dumps(request.model_dump())
             await atomic_write(request_path, request_data)
 
             # Update state: remove from in-progress and add to handled.
-            state.in_progress_requests.discard(request.id)
-            state.handled_requests.add(request.id)
+            state.in_progress_requests.discard(request.unique_key)
+            state.handled_requests.add(request.unique_key)
 
             # Update RQ metadata.
             await self._update_metadata(
```
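The handled flag is persisted through `atomic_write` (imported at the top of the file from `crawlee._utils.file`; its implementation is not part of this diff). For context only, the standard pattern behind such helpers:

```python
# Generic atomic-write pattern (not crawlee's code): write to a temporary
# file in the same directory, then rename over the target, so a concurrent
# reader never observes a half-written request file.
import os
import tempfile
from pathlib import Path


def atomic_write_sketch(path: Path, data: str) -> None:
    fd, tmp_name = tempfile.mkstemp(dir=path.parent, suffix='.tmp')
    try:
        with os.fdopen(fd, 'w', encoding='utf-8') as tmp_file:
            tmp_file.write(data)
        os.replace(tmp_name, path)  # atomic on both POSIX and Windows
    except BaseException:
        os.unlink(tmp_name)
        raise
```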
```diff
@@ -512,7 +517,6 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             )
 
             return ProcessedRequest(
-                id=request.id,
                 unique_key=request.unique_key,
                 was_already_present=True,
                 was_already_handled=True,
```
```diff
@@ -530,36 +534,36 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             state = self._state.current_value
 
             # Check if the request is in progress.
-            if request.id not in state.in_progress_requests:
-                logger.info(f'Reclaiming request {request.id} that is not in progress.')
+            if request.unique_key not in state.in_progress_requests:
+                logger.info(f'Reclaiming request {request.unique_key} that is not in progress.')
                 return None
 
-            request_path = self._get_request_path(request.id)
+            request_path = self._get_request_path(request.unique_key)
 
             if not await asyncio.to_thread(request_path.exists):
-                logger.warning(f'Request file for {request.id} does not exist, cannot reclaim.')
+                logger.warning(f'Request file for {request.unique_key} does not exist, cannot reclaim.')
                 return None
 
             # Update sequence number and state to ensure proper ordering.
             if forefront:
                 # Remove from regular requests if it was there
-                state.regular_requests.pop(request.id, None)
+                state.regular_requests.pop(request.unique_key, None)
                 sequence_number = state.forefront_sequence_counter
                 state.forefront_sequence_counter += 1
-                state.forefront_requests[request.id] = sequence_number
+                state.forefront_requests[request.unique_key] = sequence_number
             else:
                 # Remove from forefront requests if it was there
-                state.forefront_requests.pop(request.id, None)
+                state.forefront_requests.pop(request.unique_key, None)
                 sequence_number = state.sequence_counter
                 state.sequence_counter += 1
-                state.regular_requests[request.id] = sequence_number
+                state.regular_requests[request.unique_key] = sequence_number
 
             # Save the clean request without extra fields
             request_data = await json_dumps(request.model_dump())
             await atomic_write(request_path, request_data)
 
             # Remove from in-progress.
-            state.in_progress_requests.discard(request.id)
+            state.in_progress_requests.discard(request.unique_key)
 
             # Update RQ metadata.
             await self._update_metadata(
```
```diff
@@ -574,7 +578,6 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             self._request_cache.append(request)
 
             return ProcessedRequest(
-                id=request.id,
                 unique_key=request.unique_key,
                 was_already_present=True,
                 was_already_handled=False,
```
```diff
@@ -597,7 +600,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         # If we have a cached requests, check them first (fast path).
         if self._request_cache:
             for req in self._request_cache:
-                if req.id not in state.handled_requests:
+                if req.unique_key not in state.handled_requests:
                     self._is_empty_cache = False
                     return False
             self._is_empty_cache = True
```
```diff
@@ -617,16 +620,16 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             self._is_empty_cache = True
             return True
 
-    def _get_request_path(self, request_id: str) -> Path:
+    def _get_request_path(self, unique_key: str) -> Path:
         """Get the path to a specific request file.
 
         Args:
-            request_id: The ID of the request.
+            unique_key: Unique key of the request.
 
         Returns:
             The path to the request file.
         """
-        return self.path_to_rq / f'{request_id}.json'
+        return self.path_to_rq / f'{self._get_file_base_name_from_unique_key(unique_key)}.json'
 
     async def _update_metadata(
         self,
```
```diff
@@ -699,23 +702,23 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 continue
 
             # Skip handled requests
-            if request.id in state.handled_requests:
+            if request.unique_key in state.handled_requests:
                 continue
 
             # Skip in-progress requests
-            if request.id in state.in_progress_requests:
+            if request.unique_key in state.in_progress_requests:
                 continue
 
             # Determine if request is forefront or regular based on state
-            if request.id in state.forefront_requests:
-                sequence = state.forefront_requests[request.id]
+            if request.unique_key in state.forefront_requests:
+                sequence = state.forefront_requests[request.unique_key]
                 forefront_requests.append((request, sequence))
-            elif request.id in state.regular_requests:
-                sequence = state.regular_requests[request.id]
+            elif request.unique_key in state.regular_requests:
+                sequence = state.regular_requests[request.unique_key]
                 regular_requests.append((request, sequence))
             else:
                 # Request not in state, skip it (might be orphaned)
-                logger.warning(f'Request {request.id} not found in state, skipping.')
+                logger.warning(f'Request {request.unique_key} not found in state, skipping.')
                 continue
 
         # Sort forefront requests by sequence (newest first for LIFO behavior).
```
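The comment in this hunk spells out the ordering contract the sequence numbers implement: forefront entries are consumed newest-first (LIFO) while regular entries stay oldest-first (FIFO), and forefront always precedes regular. A small self-contained check of that contract:

```python
# Illustration of the ordering described above, using (request, sequence) pairs.
forefront = [('b', 5), ('a', 7)]  # 'a' was forefronted last
regular = [('d', 2), ('c', 1)]    # 'c' was enqueued first

forefront.sort(key=lambda pair: pair[1], reverse=True)  # newest first (LIFO)
regular.sort(key=lambda pair: pair[1])                  # oldest first (FIFO)

order = [request for request, _ in forefront + regular]
assert order == ['a', 'b', 'c', 'd']
```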
```diff
@@ -756,10 +759,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         files = await asyncio.to_thread(list, path_to_rq.glob('*.json'))
 
         # Filter out metadata file and non-file entries.
-        filtered = filter(
-            lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME,
-            files,
-        )
+        filtered = filter(lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME, files)
 
         return list(filtered)
 
```
```diff
@@ -775,7 +775,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """
         # Open the request file.
         try:
-            file = await asyncio.to_thread(open, file_path)
+            file = await asyncio.to_thread(open, file_path, 'r', encoding='utf-8')
         except FileNotFoundError:
             logger.warning(f'Request file "{file_path}" not found.')
             return None
```
```diff
@@ -807,11 +807,27 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 continue
 
             # Add request to state as regular request (assign sequence numbers)
-            if request.id not in state.regular_requests and request.id not in state.forefront_requests:
+            if request.unique_key not in state.regular_requests and request.unique_key not in state.forefront_requests:
                 # Assign as regular request with current sequence counter
-                state.regular_requests[request.id] = state.sequence_counter
+                state.regular_requests[request.unique_key] = state.sequence_counter
                 state.sequence_counter += 1
 
             # Check if request was already handled
             if request.handled_at is not None:
-                state.handled_requests.add(request.id)
+                state.handled_requests.add(request.unique_key)
+
+    @staticmethod
+    def _get_file_base_name_from_unique_key(unique_key: str) -> str:
+        """Generate a deterministic file name for a unique_key.
+
+        Args:
+            unique_key: Unique key to be used to generate filename.
+
+        Returns:
+            A file name based on the unique_key.
+        """
+        # hexdigest produces filenames compliant strings
+        hashed_key = sha256(unique_key.encode('utf-8')).hexdigest()
+        name_length = 15
+        # Truncate the key to the desired length
+        return hashed_key[:name_length]
```