crawlee 1.0.3b6__py3-none-any.whl → 1.2.2b24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/__init__.py +2 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +32 -13
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +44 -5
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +13 -6
- crawlee/_utils/system.py +27 -11
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +1 -1
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +156 -131
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/request_loaders/_sitemap_request_loader.py +23 -5
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +3 -3
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +51 -9
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +4 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
- crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
- crawlee/storage_clients/_file_system/_request_queue_client.py +29 -10
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/_client_mixin.py +1 -1
- crawlee/storage_clients/_sql/_db_models.py +1 -2
- crawlee/storage_clients/models.py +8 -3
- crawlee/storages/_key_value_store.py +5 -2
- crawlee/storages/_storage_instance_manager.py +103 -44
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +14 -16
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +82 -69
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0

crawlee/statistics/_statistics.py

@@ -1,6 +1,7 @@
 # Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/core/src/crawlers/statistics.ts
 from __future__ import annotations
 
+import asyncio
 import math
 import time
 from datetime import datetime, timedelta, timezone
@@ -17,8 +18,11 @@ from crawlee.statistics import FinalStatistics, StatisticsState
 from crawlee.statistics._error_tracker import ErrorTracker
 
 if TYPE_CHECKING:
+    from collections.abc import Callable, Coroutine
     from types import TracebackType
 
+    from crawlee.storages import KeyValueStore
+
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TNewStatisticsState = TypeVar('TNewStatisticsState', bound=StatisticsState, default=StatisticsState)
 logger = getLogger(__name__)
@@ -70,6 +74,7 @@ class Statistics(Generic[TStatisticsState]):
         persistence_enabled: bool | Literal['explicit_only'] = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -80,8 +85,6 @@ class Statistics(Generic[TStatisticsState]):
         self._id = Statistics.__next_id
         Statistics.__next_id += 1
 
-        self._instance_start: datetime | None = None
-
         self.error_tracker = ErrorTracker(
             save_error_snapshots=save_error_snapshots,
             snapshot_kvs_name=persist_state_kvs_name,
@@ -92,9 +95,10 @@ class Statistics(Generic[TStatisticsState]):
 
         self._state = RecoverableState(
             default_state=state_model(stats_id=self._id),
-            persist_state_key=persist_state_key or f'
+            persist_state_key=persist_state_key or f'__CRAWLER_STATISTICS_{self._id}',
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             logger=logger,
         )
 
@@ -110,8 +114,8 @@ class Statistics(Generic[TStatisticsState]):
         """Create near copy of the `Statistics` with replaced `state_model`."""
         new_statistics: Statistics[TNewStatisticsState] = Statistics(
             persistence_enabled=self._state._persistence_enabled,  # noqa: SLF001
-            persist_state_kvs_name=self._state._persist_state_kvs_name,  # noqa: SLF001
             persist_state_key=self._state._persist_state_key,  # noqa: SLF001
+            persist_state_kvs_factory=self._state._persist_state_kvs_factory,  # noqa: SLF001
             log_message=self._log_message,
             periodic_message_logger=self._periodic_message_logger,
             state_model=state_model,
@@ -125,6 +129,7 @@ class Statistics(Generic[TStatisticsState]):
         persistence_enabled: bool = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -136,6 +141,7 @@ class Statistics(Generic[TStatisticsState]):
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
             persist_state_key=persist_state_key,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             log_message=log_message,
             periodic_message_logger=periodic_message_logger,
             log_interval=log_interval,
@@ -158,14 +164,17 @@ class Statistics(Generic[TStatisticsState]):
         if self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is already active.')
 
-        self._active = True
-        self._instance_start = datetime.now(timezone.utc)
-
         await self._state.initialize()
-
+        # Reset `crawler_finished_at` to indicate a new run in progress.
+        self.state.crawler_finished_at = None
 
+        # Start periodic logging and let it print initial state before activation.
         self._periodic_logger.start()
+        await asyncio.sleep(0.01)
+        self._active = True
 
+        self.state.crawler_last_started_at = datetime.now(timezone.utc)
+        self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at
         return self
 
     async def __aexit__(
@@ -182,13 +191,14 @@ class Statistics(Generic[TStatisticsState]):
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')
 
-
-
-        await self._state.teardown()
+        if not self.state.crawler_last_started_at:
+            raise RuntimeError('Statistics.state.crawler_last_started_at not set.')
 
+        # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
         await self._periodic_logger.stop()
-
+        self.state.crawler_finished_at = datetime.now(timezone.utc)
         self._active = False
+        await self._state.teardown()
 
     @property
     def state(self) -> TStatisticsState:
@@ -247,11 +257,7 @@ class Statistics(Generic[TStatisticsState]):
 
     def calculate(self) -> FinalStatistics:
         """Calculate the current statistics."""
-        if self._instance_start is None:
-            raise RuntimeError('The Statistics object is not initialized')
-
-        crawler_runtime = datetime.now(timezone.utc) - self._instance_start
-        total_minutes = crawler_runtime.total_seconds() / 60
+        total_minutes = self.state.crawler_runtime.total_seconds() / 60
         state = self._state.current_value
         serialized_state = state.model_dump(by_alias=False)
 
@@ -262,7 +268,7 @@ class Statistics(Generic[TStatisticsState]):
             requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,
             request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,
             requests_total=state.requests_failed + state.requests_finished,
-            crawler_runtime=crawler_runtime,
+            crawler_runtime=state.crawler_runtime,
             requests_finished=state.requests_finished,
             requests_failed=state.requests_failed,
             retry_histogram=serialized_state['request_retry_histogram'],
@@ -282,21 +288,6 @@ class Statistics(Generic[TStatisticsState]):
         else:
             self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())
 
-    def _after_initialize(self) -> None:
-        state = self._state.current_value
-
-        if state.crawler_started_at is None:
-            state.crawler_started_at = datetime.now(timezone.utc)
-
-        if state.stats_persisted_at is not None and state.crawler_last_started_at:
-            self._instance_start = datetime.now(timezone.utc) - (
-                state.stats_persisted_at - state.crawler_last_started_at
-            )
-        elif state.crawler_last_started_at:
-            self._instance_start = state.crawler_last_started_at
-
-        state.crawler_last_started_at = self._instance_start
-
     def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:
         retry_count = record.retry_count
         state = self._state.current_value
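
The statistics hunks above replace the in-memory `_instance_start` bookkeeping with timestamps kept on the persisted state (`crawler_started_at`, `crawler_last_started_at`, `crawler_finished_at`, `crawler_runtime`) and add a `persist_state_kvs_factory` parameter typed `Callable[[], Coroutine[None, None, KeyValueStore]]`, i.e. an async zero-argument factory producing the key-value store used for persistence. A minimal sketch of how a caller might supply it, reusing the `KeyValueStore.open(storage_client=...)` call that appears later in this diff; the `Statistics` import path and the use of `FileSystemStorageClient` here are assumptions, not part of the diff:

    from crawlee.statistics import Statistics
    from crawlee.storage_clients import FileSystemStorageClient
    from crawlee.storages import KeyValueStore


    async def stats_kvs_factory() -> KeyValueStore:
        # Deferred, async creation of the key-value store that will hold the persisted statistics state.
        return await KeyValueStore.open(storage_client=FileSystemStorageClient())


    statistics = Statistics(
        persistence_enabled=True,
        persist_state_kvs_factory=stats_kvs_factory,
    )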

crawlee/storage_clients/__init__.py

@@ -13,9 +13,13 @@ _install_import_hook(__name__)
 with _try_import(__name__, 'SqlStorageClient'):
     from ._sql import SqlStorageClient
 
+with _try_import(__name__, 'RedisStorageClient'):
+    from ._redis import RedisStorageClient
+
 __all__ = [
     'FileSystemStorageClient',
     'MemoryStorageClient',
+    'RedisStorageClient',
     'SqlStorageClient',
     'StorageClient',
 ]

crawlee/storage_clients/_base/_dataset_client.py

@@ -87,8 +87,8 @@ class DatasetClient(ABC):
 
         The backend method for the `Dataset.iterate_items` call.
         """
-        # This syntax is to make
+        # This syntax is to make type checker properly work with abstract AsyncIterator.
         # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
         raise NotImplementedError
-        if False:
+        if False:
             yield 0

crawlee/storage_clients/_base/_key_value_store_client.py

@@ -72,10 +72,10 @@ class KeyValueStoreClient(ABC):
 
         The backend method for the `KeyValueStore.iterate_keys` call.
         """
-        # This syntax is to make
+        # This syntax is to make type checker properly work with abstract AsyncIterator.
         # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators
         raise NotImplementedError
-        if False:
+        if False:
             yield 0
 
     @abstractmethod

crawlee/storage_clients/_file_system/_dataset_client.py

@@ -134,7 +134,7 @@ class FileSystemDatasetClient(DatasetClient):
                 continue
 
             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = DatasetMetadata(**file_content)
@@ -163,7 +163,7 @@ class FileSystemDatasetClient(DatasetClient):
 
         # If the dataset directory exists, reconstruct the client from the metadata file.
         if path_to_dataset.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open,
+            file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -473,9 +473,10 @@ class FileSystemDatasetClient(DatasetClient):
         """
         # Retrieve and sort all JSON files in the dataset directory numerically.
         files = await asyncio.to_thread(
-            sorted
-
-
+            lambda: sorted(
+                self.path_to_dataset.glob('*.json'),
+                key=lambda f: int(f.stem) if f.stem.isdigit() else 0,
+            )
         )
 
         # Remove the metadata file from the list if present.
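
The last hunk above sorts dataset item files by the numeric value of their stem (`key=lambda f: int(f.stem) if f.stem.isdigit() else 0`) instead of plain lexicographic order, so `10.json` no longer sorts before `2.json`. A small standalone illustration of the difference (file names are made up for the example):

    from pathlib import Path

    files = [Path('10.json'), Path('2.json'), Path('1.json')]

    # Lexicographic order puts '10.json' before '2.json' - wrong for sequentially numbered items.
    print(sorted(files))

    # Numeric order on the stem, matching the key function used in the diff.
    print(sorted(files, key=lambda f: int(f.stem) if f.stem.isdigit() else 0))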

crawlee/storage_clients/_file_system/_key_value_store_client.py

@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import json
 import shutil
 import urllib.parse
@@ -133,7 +134,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
                 continue
 
             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = KeyValueStoreMetadata(**file_content)
@@ -162,7 +163,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
 
         # If the key-value store directory exists, reconstruct the client from the metadata file.
        if path_to_kvs.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open,
+            file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -239,7 +240,9 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         # Read the metadata file
         async with self._lock:
             try:
-                file = await asyncio.to_thread(
+                file = await asyncio.to_thread(
+                    functools.partial(record_metadata_filepath.open, mode='r', encoding='utf-8'),
+                )
             except FileNotFoundError:
                 logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value')
                 return None
@@ -373,7 +376,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
 
         # List and sort all files *inside* a brief lock, then release it immediately:
         async with self._lock:
-            files = sorted(await asyncio.to_thread(list
+            files = sorted(await asyncio.to_thread(lambda: list(self.path_to_kvs.glob('*'))))
 
         count = 0
 
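
The file-system client hunks above consistently move blocking file operations off the event loop with `asyncio.to_thread`, passing the open mode and encoding either as keyword arguments (which `asyncio.to_thread` forwards to the callable), via `functools.partial`, or through a zero-argument lambda so that lazy calls such as `Path.glob` also execute inside the worker thread. A small self-contained sketch of the three equivalent patterns (the file name is illustrative only):

    import asyncio
    import functools
    from pathlib import Path


    async def main() -> None:
        path = Path('example.json')  # illustrative file, created here so the snippet runs on its own
        path.write_text('{}', encoding='utf-8')

        # Keyword arguments are forwarded by asyncio.to_thread to the callable running in the worker thread.
        file = await asyncio.to_thread(path.open, mode='r', encoding='utf-8')
        file.close()

        # Equivalent form with functools.partial, as some call sites in the diff use.
        file = await asyncio.to_thread(functools.partial(path.open, mode='r', encoding='utf-8'))
        file.close()

        # A zero-argument lambda keeps the lazy Path.glob() call itself inside the worker thread.
        files = await asyncio.to_thread(lambda: list(Path('.').glob('*.json')))
        print(files)


    asyncio.run(main())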

crawlee/storage_clients/_file_system/_request_queue_client.py

@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import json
 import shutil
 from collections import deque
@@ -31,6 +32,7 @@ if TYPE_CHECKING:
     from collections.abc import Sequence
 
     from crawlee.configuration import Configuration
+    from crawlee.storages import KeyValueStore
 
 logger = getLogger(__name__)
 
@@ -92,6 +94,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         metadata: RequestQueueMetadata,
         path_to_rq: Path,
         lock: asyncio.Lock,
+        recoverable_state: RecoverableState[RequestQueueState],
     ) -> None:
         """Initialize a new instance.
 
@@ -114,12 +117,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         self._is_empty_cache: bool | None = None
         """Cache for is_empty result: None means unknown, True/False is cached state."""
 
-        self._state =
-            default_state=RequestQueueState(),
-            persist_state_key=f'__RQ_STATE_{self._metadata.id}',
-            persistence_enabled=True,
-            logger=logger,
-        )
+        self._state = recoverable_state
         """Recoverable state to maintain request ordering, in-progress status, and handled status."""
 
     @override
@@ -136,6 +134,22 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """The full path to the request queue metadata file."""
         return self.path_to_rq / METADATA_FILENAME
 
+    @classmethod
+    async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState:
+        async def kvs_factory() -> KeyValueStore:
+            from crawlee.storage_clients import FileSystemStorageClient  # noqa: PLC0415 avoid circular import
+            from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import
+
+            return await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=configuration)
+
+        return RecoverableState[RequestQueueState](
+            default_state=RequestQueueState(),
+            persist_state_key=f'__RQ_STATE_{id}',
+            persist_state_kvs_factory=kvs_factory,
+            persistence_enabled=True,
+            logger=logger,
+        )
+
     @classmethod
     async def open(
         cls,
@@ -184,7 +198,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 continue
 
             try:
-                file = await asyncio.to_thread(path_to_metadata.open)
+                file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8')
                 try:
                     file_content = json.load(file)
                     metadata = RequestQueueMetadata(**file_content)
@@ -194,6 +208,9 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                     metadata=metadata,
                     path_to_rq=rq_base_path / rq_dir,
                     lock=asyncio.Lock(),
+                    recoverable_state=await cls._create_recoverable_state(
+                        id=id, configuration=configuration
+                    ),
                 )
                 await client._state.initialize()
                 await client._discover_existing_requests()
@@ -216,7 +233,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
 
         # If the RQ directory exists, reconstruct the client from the metadata file.
         if path_to_rq.exists() and path_to_metadata.exists():
-            file = await asyncio.to_thread(open,
+            file = await asyncio.to_thread(path_to_metadata.open, encoding='utf-8')
             try:
                 file_content = json.load(file)
             finally:
@@ -230,6 +247,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 metadata=metadata,
                 path_to_rq=path_to_rq,
                 lock=asyncio.Lock(),
+                recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
             )
 
             await client._state.initialize()
@@ -254,6 +272,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             metadata=metadata,
             path_to_rq=path_to_rq,
             lock=asyncio.Lock(),
+            recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
         )
         await client._state.initialize()
         await client._update_metadata()
@@ -738,7 +757,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         await asyncio.to_thread(path_to_rq.mkdir, parents=True, exist_ok=True)
 
         # List all the json files.
-        files = await asyncio.to_thread(list
+        files = await asyncio.to_thread(lambda: list(path_to_rq.glob('*.json')))
 
         # Filter out metadata file and non-file entries.
         filtered = filter(lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME, files)
@@ -757,7 +776,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """
         # Open the request file.
         try:
-            file = await asyncio.to_thread(open,
+            file = await asyncio.to_thread(functools.partial(file_path.open, mode='r', encoding='utf-8'))
        except FileNotFoundError:
            logger.warning(f'Request file "{file_path}" not found.')
            return None

crawlee/storage_clients/_redis/__init__.py (new file)

@@ -0,0 +1,6 @@
+from ._dataset_client import RedisDatasetClient
+from ._key_value_store_client import RedisKeyValueStoreClient
+from ._request_queue_client import RedisRequestQueueClient
+from ._storage_client import RedisStorageClient
+
+__all__ = ['RedisDatasetClient', 'RedisKeyValueStoreClient', 'RedisRequestQueueClient', 'RedisStorageClient']