crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlee has been flagged as potentially problematic.
- crawlee/__init__.py +2 -1
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +64 -43
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +128 -36
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +16 -7
- crawlee/_utils/system.py +30 -14
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +254 -148
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +27 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +32 -11
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +5 -5
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +62 -12
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
- crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
- crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +21 -14
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +196 -75
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
- crawlee-1.3.1b3.dist-info/RECORD +207 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- crawlee-0.6.13b15.dist-info/RECORD +0 -183
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
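Note: the diff body below covers one file, crawlee/storage_clients/_file_system/_request_queue_client.py (+171 -154). Its central change is that the file-system request queue stops addressing requests by request.id and addresses them by request.unique_key, deriving on-disk file names from a hash of that key. A minimal usage sketch of the storage this client backs (assuming the 1.x high-level API implied by the call sites below; not code taken from the package):

import asyncio

from crawlee.storage_clients import FileSystemStorageClient
from crawlee.storages import RequestQueue


async def main() -> None:
    # Named (global scope) storage; 1.x also accepts `alias=` for run-scoped queues.
    rq = await RequestQueue.open(name='products', storage_client=FileSystemStorageClient())
    await rq.add_request('https://crawlee.dev')
    request = await rq.fetch_next_request()
    if request is not None:
        await rq.mark_request_as_handled(request)


asyncio.run(main())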
crawlee/storage_clients/_file_system/_request_queue_client.py
@@ -1,21 +1,24 @@
 from __future__ import annotations

 import asyncio
+import functools
 import json
 import shutil
 from collections import deque
 from datetime import datetime, timezone
+from hashlib import sha256
 from logging import getLogger
 from pathlib import Path
 from typing import TYPE_CHECKING

 from pydantic import BaseModel, ValidationError
-from typing_extensions import override
+from typing_extensions import Self, override

 from crawlee import Request
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import atomic_write, json_dumps
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee._utils.recoverable_state import RecoverableState
 from crawlee.storage_clients._base import RequestQueueClient
 from crawlee.storage_clients.models import (
@@ -29,6 +32,7 @@ if TYPE_CHECKING:
     from collections.abc import Sequence

     from crawlee.configuration import Configuration
+    from crawlee.storages import KeyValueStore

 logger = getLogger(__name__)

@@ -43,16 +47,16 @@ class RequestQueueState(BaseModel):
     """Counter for forefront request ordering."""

     forefront_requests: dict[str, int] = {}
-    """Mapping of forefront request IDs to their sequence numbers."""
+    """Mapping of forefront request unique keys to their sequence numbers."""

     regular_requests: dict[str, int] = {}
-    """Mapping of regular request IDs to their sequence numbers."""
+    """Mapping of regular request unique keys to their sequence numbers."""

     in_progress_requests: set[str] = set()
-    """Set of request IDs currently being processed."""
+    """Set of request unique keys currently being processed."""

     handled_requests: set[str] = set()
-    """Set of request IDs that have been handled."""
+    """Set of request unique keys that have been handled."""


 class FileSystemRequestQueueClient(RequestQueueClient):
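Note: the RequestQueueState hunk above is where the id-to-unique-key migration starts: every bookkeeping structure is now keyed by the request's unique key. A self-contained sketch of how the two counters order the queue (a simplified stand-in, not the package's model):

from pydantic import BaseModel


class QueueState(BaseModel):
    # Simplified stand-in for RequestQueueState above.
    sequence_counter: int = 0
    forefront_sequence_counter: int = 0
    regular_requests: dict[str, int] = {}
    forefront_requests: dict[str, int] = {}

    def enqueue(self, unique_key: str, *, forefront: bool = False) -> None:
        """Record the queue position; forefront items get their own counter."""
        if forefront:
            self.forefront_requests[unique_key] = self.forefront_sequence_counter
            self.forefront_sequence_counter += 1
        else:
            self.regular_requests[unique_key] = self.sequence_counter
            self.sequence_counter += 1


# Regular requests replay in ascending sequence (FIFO); forefront requests
# replay descending (LIFO), per the sort comment later in this diff.
state = QueueState()
state.enqueue('https://a.example')
state.enqueue('https://b.example', forefront=True)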
@@ -88,8 +92,9 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         self,
         *,
         metadata: RequestQueueMetadata,
-        storage_dir: Path,
+        path_to_rq: Path,
         lock: asyncio.Lock,
+        recoverable_state: RecoverableState[RequestQueueState],
     ) -> None:
         """Initialize a new instance.

@@ -97,8 +102,8 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """
         self._metadata = metadata

-        self._storage_dir = storage_dir
-        """The base directory where the storage data are being persisted."""
+        self._path_to_rq = path_to_rq
+        """The full path to the request queue directory."""

         self._lock = lock
         """A lock to ensure that only one operation is performed at a time."""
@@ -112,13 +117,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         self._is_empty_cache: bool | None = None
         """Cache for is_empty result: None means unknown, True/False is cached state."""

-        self._state = RecoverableState[RequestQueueState](
-            default_state=RequestQueueState(),
-            persist_state_key='request_queue_state',
-            persistence_enabled=True,
-            persist_state_kvs_name=f'__RQ_STATE_{self._metadata.id}',
-            logger=logger,
-        )
+        self._state = recoverable_state
         """Recoverable state to maintain request ordering, in-progress status, and handled status."""

     @override
@@ -128,24 +127,38 @@ class FileSystemRequestQueueClient(RequestQueueClient):
     @property
     def path_to_rq(self) -> Path:
         """The full path to the request queue directory."""
-        if self._metadata.name is None:
-            return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
-
-        return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name
+        return self._path_to_rq

     @property
     def path_to_metadata(self) -> Path:
         """The full path to the request queue metadata file."""
         return self.path_to_rq / METADATA_FILENAME

+    @classmethod
+    async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState:
+        async def kvs_factory() -> KeyValueStore:
+            from crawlee.storage_clients import FileSystemStorageClient  # noqa: PLC0415 avoid circular import
+            from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import
+
+            return await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=configuration)
+
+        return RecoverableState[RequestQueueState](
+            default_state=RequestQueueState(),
+            persist_state_key=f'__RQ_STATE_{id}',
+            persist_state_kvs_factory=kvs_factory,
+            persistence_enabled=True,
+            logger=logger,
+        )
+
     @classmethod
     async def open(
         cls,
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
         configuration: Configuration,
-    ) -> FileSystemRequestQueueClient:
+    ) -> Self:
         """Open or create a file system request queue client.

         This method attempts to open an existing request queue from the file system. If a queue with the specified
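Note: _create_recoverable_state above replaces the eager construction that __init__ used to perform, and the key-value store persisting the state now comes from an async factory whose imports are deferred to call time (the `# noqa: PLC0415` comments mark the circular-import workaround). A generic sketch of the pattern, with illustrative names rather than crawlee's API:

from __future__ import annotations

from collections.abc import Awaitable, Callable


class Store:
    """Stand-in for the key-value store that persists recoverable state."""


class StateHolder:
    def __init__(self, store_factory: Callable[[], Awaitable[Store]]) -> None:
        # Keep only the factory: creating the store eagerly would require the
        # storage client, which itself constructs this holder (a cycle).
        self._store_factory = store_factory
        self._store: Store | None = None

    async def initialize(self) -> None:
        # The store is materialized on first use, mirroring the
        # `await client._state.initialize()` calls in this diff.
        if self._store is None:
            self._store = await self._store_factory()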
@@ -154,17 +167,21 @@ class FileSystemRequestQueueClient(RequestQueueClient):

         Args:
             id: The ID of the request queue to open. If provided, searches for existing queue by ID.
-            name: The name of the request queue
+            name: The name of the request queue for named (global scope) storages.
+            alias: The alias of the request queue for unnamed (run scope) storages.
             configuration: The configuration object containing storage directory settings.

         Returns:
             An instance for the opened or created storage client.

         Raises:
-            ValueError: If a queue with the specified ID is not found, or if metadata is invalid.
+            ValueError: If a queue with the specified ID is not found, if metadata is invalid,
+                or if both name and alias are provided.
         """
-
-        rq_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
+        # Validate input parameters.
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+        rq_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR

         if not rq_base_path.exists():
             await asyncio.to_thread(rq_base_path.mkdir, parents=True, exist_ok=True)
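Note: raise_if_too_many_kwargs is new in this release (crawlee/_utils/raise_if_too_many_kwargs.py, +12 lines in the file list above); its body is not shown in this diff. A hypothetical re-implementation inferred from the call site and the documented ValueError, for illustration only:

def raise_if_too_many_kwargs(**kwargs: object) -> None:
    """Raise if more than one of the given keyword arguments is set (not None)."""
    set_kwargs = [name for name, value in kwargs.items() if value is not None]
    if len(set_kwargs) > 1:
        raise ValueError(f'Only one of ({", ".join(set_kwargs)}) can be provided.')


# The call site from the hunk above:
raise_if_too_many_kwargs(id=None, name='products', alias=None)  # passes; two set values would raise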
@@ -176,12 +193,12 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             if not rq_dir.is_dir():
                 continue

-            metadata_path = rq_dir / METADATA_FILENAME
-            if not metadata_path.exists():
+            path_to_metadata = rq_dir / METADATA_FILENAME
+            if not path_to_metadata.exists():
                 continue

             try:
-                file = await asyncio.to_thread(open, metadata_path)
+                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = RequestQueueMetadata(**file_content)
@@ -189,8 +206,11 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             if metadata.id == id:
                 client = cls(
                     metadata=metadata,
-                    storage_dir=Path(configuration.storage_dir),
+                    path_to_rq=rq_base_path / rq_dir,
                     lock=asyncio.Lock(),
+                    recoverable_state=await cls._create_recoverable_state(
+                        id=id, configuration=configuration
+                    ),
                 )
                 await client._state.initialize()
                 await client._discover_existing_requests()
@@ -205,14 +225,15 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             if not found:
                 raise ValueError(f'Request queue with ID "{id}" not found')

-        # Open an existing RQ by its name, or create a new one if not found.
+        # Open an existing RQ by its name or alias, or create a new one if not found.
         else:
-            rq_path = rq_base_path / (name or cls._STORAGE_SUBSUBDIR_DEFAULT)
-            metadata_path = rq_path / METADATA_FILENAME
+            rq_dir = Path(name) if name else Path(alias) if alias else Path('default')
+            path_to_rq = rq_base_path / rq_dir
+            path_to_metadata = path_to_rq / METADATA_FILENAME

             # If the RQ directory exists, reconstruct the client from the metadata file.
-            if rq_path.exists() and metadata_path.exists():
-                file = await asyncio.to_thread(open, metadata_path)
+            if path_to_rq.exists() and path_to_metadata.exists():
+                file = await asyncio.to_thread(path_to_metadata.open, encoding='utf-8')
                 try:
                     file_content = json.load(file)
                 finally:
@@ -220,14 +241,13 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 try:
                     metadata = RequestQueueMetadata(**file_content)
                 except ValidationError as exc:
-                    raise ValueError(f'Invalid metadata file for request queue "{name}"') from exc
-
-                metadata.name = name
+                    raise ValueError(f'Invalid metadata file for request queue "{name or alias}"') from exc

                 client = cls(
                     metadata=metadata,
-                    storage_dir=Path(configuration.storage_dir),
+                    path_to_rq=path_to_rq,
                     lock=asyncio.Lock(),
+                    recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
                 )

                 await client._state.initialize()
@@ -250,8 +270,9 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 )
                 client = cls(
                     metadata=metadata,
-                    storage_dir=Path(configuration.storage_dir),
+                    path_to_rq=path_to_rq,
                     lock=asyncio.Lock(),
+                    recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
                 )
                 await client._state.initialize()
                 await client._update_metadata()
@@ -311,37 +332,52 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             unprocessed_requests = list[UnprocessedRequest]()
             state = self._state.current_value

-
-            existing_unique_keys: dict[str, Path] = {}
-            existing_request_files = await self._get_request_files(self.path_to_rq)
+            all_requests = state.forefront_requests | state.regular_requests

-            for request_file in existing_request_files:
-                existing_request = await self._parse_request_file(request_file)
-                if existing_request is not None:
-                    existing_unique_keys[existing_request.unique_key] = request_file
+            requests_to_enqueue = {}

-            # Process each request in the batch.
+            # Determine which requests can be added or are modified.
             for request in requests:
-
-
-
-
-
-
-
-
-
-
+                # Check if the request has already been handled.
+                if request.unique_key in state.handled_requests:
+                    processed_requests.append(
+                        ProcessedRequest(
+                            unique_key=request.unique_key,
+                            was_already_present=True,
+                            was_already_handled=True,
+                        )
+                    )
+                # Check if the request is already in progress.
+                # Or if the request is already in the queue and the `forefront` flag is not used, we do not change the
+                # position of the request.
+                elif (request.unique_key in state.in_progress_requests) or (
+                    request.unique_key in all_requests and not forefront
+                ):
+                    processed_requests.append(
+                        ProcessedRequest(
+                            unique_key=request.unique_key,
+                            was_already_present=True,
+                            was_already_handled=False,
+                        )
+                    )
+                # These requests must either be added or update their position.
+                else:
+                    requests_to_enqueue[request.unique_key] = request

+            # Process each request in the batch.
+            for request in requests_to_enqueue.values():
+                # If the request is not already in the RQ, this is a new request.
+                if request.unique_key not in all_requests:
+                    request_path = self._get_request_path(request.unique_key)
                     # Add sequence number to ensure FIFO ordering using state.
                     if forefront:
                         sequence_number = state.forefront_sequence_counter
                         state.forefront_sequence_counter += 1
-                        state.forefront_requests[request.id] = sequence_number
+                        state.forefront_requests[request.unique_key] = sequence_number
                     else:
                         sequence_number = state.sequence_counter
                         state.sequence_counter += 1
-                        state.regular_requests[request.id] = sequence_number
+                        state.regular_requests[request.unique_key] = sequence_number

                     # Save the clean request without extra fields
                     request_data = await json_dumps(request.model_dump())
@@ -351,71 +387,41 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                     new_total_request_count += 1
                     new_pending_request_count += 1

-                    # Add to our index for subsequent requests in this batch
-                    existing_unique_keys[request.unique_key] = self._get_request_path(request.id)
-
                     processed_requests.append(
                         ProcessedRequest(
-                            id=request.id,
                             unique_key=request.unique_key,
                             was_already_present=False,
                             was_already_handled=False,
                         )
                     )

-                # If the request already exists in the RQ
-
-                #
-
-
-
-                # If the request is already in the RQ and handled, just continue with the next one.
-                if was_already_present and was_already_handled:
-                    processed_requests.append(
-                        ProcessedRequest(
-                            id=existing_request.id,
-                            unique_key=request.unique_key,
-                            was_already_present=True,
-                            was_already_handled=True,
-                        )
-                    )
+                # If the request already exists in the RQ and the forefront flag is used, update its position.
+                elif forefront:
+                    # If the request is among `regular`, remove it from its current position.
+                    if request.unique_key in state.regular_requests:
+                        state.regular_requests.pop(request.unique_key)

-                    # If the request is already in
-
-
-
-
-
-
-
-
-                        state.forefront_sequence_counter += 1
-                    elif (
-                        existing_request.id not in state.forefront_requests
-                        and existing_request.id not in state.regular_requests
-                    ):
-                        # Keep as regular if not already forefront
-                        state.regular_requests[existing_request.id] = state.sequence_counter
-                        state.sequence_counter += 1
-
-                    processed_requests.append(
-                        ProcessedRequest(
-                            id=existing_request.id,
-                            unique_key=request.unique_key,
-                            was_already_present=True,
-                            was_already_handled=False,
-                        )
+                    # If the request is already in `forefront`, we just need to update its position.
+                    state.forefront_requests[request.unique_key] = state.forefront_sequence_counter
+                    state.forefront_sequence_counter += 1
+
+                    processed_requests.append(
+                        ProcessedRequest(
+                            unique_key=request.unique_key,
+                            was_already_present=True,
+                            was_already_handled=False,
                         )
+                    )

-
-
-
-
-
-
-
-                            )
+                else:
+                    logger.warning(f'Request with unique key "{request.unique_key}" could not be processed.')
+                    unprocessed_requests.append(
+                        UnprocessedRequest(
+                            unique_key=request.unique_key,
+                            url=request.url,
+                            method=request.method,
                         )
+                    )

             await self._update_metadata(
                 update_modified_at=True,
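Note: the rewritten add_batch_of_requests above first partitions the batch, then writes only the requests that survive deduplication. The per-key decision reduces to three outcomes; a condensed, self-contained sketch (illustrative, not the package code):

def classify(
    unique_key: str,
    *,
    forefront: bool,
    queued: set[str],       # keys of forefront_requests | regular_requests
    in_progress: set[str],  # state.in_progress_requests
    handled: set[str],      # state.handled_requests
) -> str:
    """Mirror the three-way branch of the first loop in the hunk above."""
    if unique_key in handled:
        return 'already handled'    # reported back, nothing written
    if unique_key in in_progress or (unique_key in queued and not forefront):
        return 'already present'    # position left unchanged
    return 'enqueue'                # new request, or forefront reordering


assert classify('k', forefront=True, queued={'k'}, in_progress=set(), handled=set()) == 'enqueue'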
@@ -437,17 +443,17 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         )

     @override
-    async def get_request(self, request_id: str) -> Request | None:
+    async def get_request(self, unique_key: str) -> Request | None:
         async with self._lock:
-            request_path = self._get_request_path(request_id)
+            request_path = self._get_request_path(unique_key)
             request = await self._parse_request_file(request_path)

             if request is None:
-                logger.warning(f'Request with ID "{request_id}" not found in the queue.')
+                logger.warning(f'Request with unique key "{unique_key}" not found in the queue.')
                 return None

             state = self._state.current_value
-            state.in_progress_requests.add(request.id)
+            state.in_progress_requests.add(request.unique_key)
             await self._update_metadata(update_accessed_at=True)
             return request

@@ -466,11 +472,11 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 candidate = self._request_cache.popleft()

                 # Skip requests that are already in progress, however this should not happen.
-                if candidate.id not in state.in_progress_requests:
+                if candidate.unique_key not in state.in_progress_requests:
                     next_request = candidate

             if next_request is not None:
-                state.in_progress_requests.add(next_request.id)
+                state.in_progress_requests.add(next_request.unique_key)

             return next_request

@@ -481,8 +487,8 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             state = self._state.current_value

             # Check if the request is in progress.
-            if request.id not in state.in_progress_requests:
-                logger.warning(f'Marking request {request.id} as handled that is not in progress.')
+            if request.unique_key not in state.in_progress_requests:
+                logger.warning(f'Marking request {request.unique_key} as handled that is not in progress.')
                 return None

             # Update the request's handled_at timestamp.
@@ -490,18 +496,18 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 request.handled_at = datetime.now(timezone.utc)

             # Dump the updated request to the file.
-            request_path = self._get_request_path(request.id)
+            request_path = self._get_request_path(request.unique_key)

             if not await asyncio.to_thread(request_path.exists):
-                logger.warning(f'Request file for {request.id} does not exist, cannot mark as handled.')
+                logger.warning(f'Request file for {request.unique_key} does not exist, cannot mark as handled.')
                 return None

             request_data = await json_dumps(request.model_dump())
             await atomic_write(request_path, request_data)

             # Update state: remove from in-progress and add to handled.
-            state.in_progress_requests.discard(request.id)
-            state.handled_requests.add(request.id)
+            state.in_progress_requests.discard(request.unique_key)
+            state.handled_requests.add(request.unique_key)

             # Update RQ metadata.
             await self._update_metadata(
@@ -512,7 +518,6 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             )

             return ProcessedRequest(
-                id=request.id,
                 unique_key=request.unique_key,
                 was_already_present=True,
                 was_already_handled=True,
@@ -530,36 +535,36 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             state = self._state.current_value

             # Check if the request is in progress.
-            if request.id not in state.in_progress_requests:
-                logger.info(f'Reclaiming request {request.id} that is not in progress.')
+            if request.unique_key not in state.in_progress_requests:
+                logger.info(f'Reclaiming request {request.unique_key} that is not in progress.')
                 return None

-            request_path = self._get_request_path(request.id)
+            request_path = self._get_request_path(request.unique_key)

             if not await asyncio.to_thread(request_path.exists):
-                logger.warning(f'Request file for {request.id} does not exist, cannot reclaim.')
+                logger.warning(f'Request file for {request.unique_key} does not exist, cannot reclaim.')
                 return None

             # Update sequence number and state to ensure proper ordering.
             if forefront:
                 # Remove from regular requests if it was there
-                state.regular_requests.pop(request.id, None)
+                state.regular_requests.pop(request.unique_key, None)
                 sequence_number = state.forefront_sequence_counter
                 state.forefront_sequence_counter += 1
-                state.forefront_requests[request.id] = sequence_number
+                state.forefront_requests[request.unique_key] = sequence_number
             else:
                 # Remove from forefront requests if it was there
-                state.forefront_requests.pop(request.id, None)
+                state.forefront_requests.pop(request.unique_key, None)
                 sequence_number = state.sequence_counter
                 state.sequence_counter += 1
-                state.regular_requests[request.id] = sequence_number
+                state.regular_requests[request.unique_key] = sequence_number

             # Save the clean request without extra fields
             request_data = await json_dumps(request.model_dump())
             await atomic_write(request_path, request_data)

             # Remove from in-progress.
-            state.in_progress_requests.discard(request.id)
+            state.in_progress_requests.discard(request.unique_key)

             # Update RQ metadata.
             await self._update_metadata(
@@ -574,7 +579,6 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 self._request_cache.append(request)

             return ProcessedRequest(
-                id=request.id,
                 unique_key=request.unique_key,
                 was_already_present=True,
                 was_already_handled=False,
@@ -597,7 +601,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             # If we have cached requests, check them first (fast path).
             if self._request_cache:
                 for req in self._request_cache:
-                    if req.id not in state.handled_requests:
+                    if req.unique_key not in state.handled_requests:
                         self._is_empty_cache = False
                         return False
                 self._is_empty_cache = True
@@ -617,16 +621,16 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             self._is_empty_cache = True
             return True

-    def _get_request_path(self, request_id: str) -> Path:
+    def _get_request_path(self, unique_key: str) -> Path:
         """Get the path to a specific request file.

         Args:
-            request_id: The ID of the request.
+            unique_key: Unique key of the request.

         Returns:
             The path to the request file.
         """
-        return self.path_to_rq / f'{request_id}.json'
+        return self.path_to_rq / f'{self._get_file_base_name_from_unique_key(unique_key)}.json'

     async def _update_metadata(
         self,
@@ -699,23 +703,23 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 continue

             # Skip handled requests
-            if request.id in state.handled_requests:
+            if request.unique_key in state.handled_requests:
                 continue

             # Skip in-progress requests
-            if request.id in state.in_progress_requests:
+            if request.unique_key in state.in_progress_requests:
                 continue

             # Determine if request is forefront or regular based on state
-            if request.id in state.forefront_requests:
-                sequence = state.forefront_requests[request.id]
+            if request.unique_key in state.forefront_requests:
+                sequence = state.forefront_requests[request.unique_key]
                 forefront_requests.append((request, sequence))
-            elif request.id in state.regular_requests:
-                sequence = state.regular_requests[request.id]
+            elif request.unique_key in state.regular_requests:
+                sequence = state.regular_requests[request.unique_key]
                 regular_requests.append((request, sequence))
             else:
                 # Request not in state, skip it (might be orphaned)
-                logger.warning(f'Request {request.id} not found in state, skipping.')
+                logger.warning(f'Request {request.unique_key} not found in state, skipping.')
                 continue

         # Sort forefront requests by sequence (newest first for LIFO behavior).
@@ -753,13 +757,10 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             await asyncio.to_thread(path_to_rq.mkdir, parents=True, exist_ok=True)

         # List all the json files.
-        files = await asyncio.to_thread(list, path_to_rq.glob('*.json'))
+        files = await asyncio.to_thread(lambda: list(path_to_rq.glob('*.json')))

         # Filter out metadata file and non-file entries.
-        filtered = filter(
-            lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME,
-            files,
-        )
+        filtered = filter(lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME, files)

         return list(filtered)

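Note: Path.glob returns a lazy generator, so the rewritten _get_request_files wraps both the glob and its consumption in one callable; the whole directory scan then runs inside the worker thread that asyncio.to_thread spawns. The same idea in isolation:

import asyncio
from pathlib import Path


async def list_json_files(directory: Path) -> list[Path]:
    # Materialize the lazy glob inside the thread so the blocking
    # filesystem walk stays off the event loop.
    return await asyncio.to_thread(lambda: list(directory.glob('*.json')))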
@@ -775,7 +776,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """
         # Open the request file.
         try:
-            file = await asyncio.to_thread(open, file_path)
+            file = await asyncio.to_thread(functools.partial(file_path.open, mode='r', encoding='utf-8'))
         except FileNotFoundError:
             logger.warning(f'Request file "{file_path}" not found.')
             return None
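Note: functools.partial here pre-binds keyword arguments for the callable handed to asyncio.to_thread. Since to_thread also forwards **kwargs itself (the spelling used in the open() hunk earlier in this diff), the two forms are equivalent:

import asyncio
import functools
from pathlib import Path
from typing import IO


async def open_for_read(path: Path) -> IO[str]:
    # Spelling 1: let to_thread forward the keyword arguments.
    f = await asyncio.to_thread(path.open, mode='r', encoding='utf-8')
    f.close()
    # Spelling 2: pre-bind them with functools.partial, as in the hunk above.
    return await asyncio.to_thread(functools.partial(path.open, mode='r', encoding='utf-8'))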
@@ -807,11 +808,27 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 continue

             # Add request to state as regular request (assign sequence numbers)
-            if request.id not in state.regular_requests and request.id not in state.forefront_requests:
+            if request.unique_key not in state.regular_requests and request.unique_key not in state.forefront_requests:
                 # Assign as regular request with current sequence counter
-                state.regular_requests[request.id] = state.sequence_counter
+                state.regular_requests[request.unique_key] = state.sequence_counter
                 state.sequence_counter += 1

             # Check if request was already handled
             if request.handled_at is not None:
-                state.handled_requests.add(request.id)
+                state.handled_requests.add(request.unique_key)
+
+    @staticmethod
+    def _get_file_base_name_from_unique_key(unique_key: str) -> str:
+        """Generate a deterministic file name for a unique_key.
+
+        Args:
+            unique_key: Unique key to be used to generate filename.
+
+        Returns:
+            A file name based on the unique_key.
+        """
+        # hexdigest produces filename-compliant strings.
+        hashed_key = sha256(unique_key.encode('utf-8')).hexdigest()
+        name_length = 15
+        # Truncate the key to the desired length.
+        return hashed_key[:name_length]