crawlee 0.6.13b43__py3-none-any.whl → 1.1.1b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlee might be problematic. Click here for more details.
- crawlee/_request.py +32 -21
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +67 -24
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +15 -0
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +38 -14
- crawlee/crawlers/_basic/_basic_crawler.py +51 -14
- crawlee/crawlers/_playwright/_playwright_crawler.py +16 -4
- crawlee/events/_event_manager.py +3 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_sitemap_request_loader.py +22 -4
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +32 -1
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_storage_client.py +5 -4
- crawlee/storage_clients/_file_system/_dataset_client.py +4 -5
- crawlee/storage_clients/_file_system/_key_value_store_client.py +4 -5
- crawlee/storage_clients/_file_system/_request_queue_client.py +28 -12
- crawlee/storage_clients/_file_system/_storage_client.py +2 -2
- crawlee/storage_clients/_memory/_dataset_client.py +4 -5
- crawlee/storage_clients/_memory/_key_value_store_client.py +4 -5
- crawlee/storage_clients/_memory/_request_queue_client.py +4 -5
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +291 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +5 -3
- crawlee/storages/_key_value_store.py +11 -6
- crawlee/storages/_request_queue.py +5 -3
- crawlee/storages/_storage_instance_manager.py +54 -68
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +16 -5
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +69 -47
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/core/src/crawlers/statistics.ts
|
|
2
2
|
from __future__ import annotations
|
|
3
3
|
|
|
4
|
+
import asyncio
|
|
4
5
|
import math
|
|
5
6
|
import time
|
|
6
7
|
from datetime import datetime, timedelta, timezone
|
|
@@ -17,8 +18,11 @@ from crawlee.statistics import FinalStatistics, StatisticsState
|
|
|
17
18
|
from crawlee.statistics._error_tracker import ErrorTracker
|
|
18
19
|
|
|
19
20
|
if TYPE_CHECKING:
|
|
21
|
+
from collections.abc import Callable, Coroutine
|
|
20
22
|
from types import TracebackType
|
|
21
23
|
|
|
24
|
+
from crawlee.storages import KeyValueStore
|
|
25
|
+
|
|
22
26
|
TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
|
|
23
27
|
TNewStatisticsState = TypeVar('TNewStatisticsState', bound=StatisticsState, default=StatisticsState)
|
|
24
28
|
logger = getLogger(__name__)
|
|
@@ -70,6 +74,7 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
70
74
|
persistence_enabled: bool | Literal['explicit_only'] = False,
|
|
71
75
|
persist_state_kvs_name: str | None = None,
|
|
72
76
|
persist_state_key: str | None = None,
|
|
77
|
+
persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
|
|
73
78
|
log_message: str = 'Statistics',
|
|
74
79
|
periodic_message_logger: Logger | None = None,
|
|
75
80
|
log_interval: timedelta = timedelta(minutes=1),
|
|
@@ -80,8 +85,6 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
80
85
|
self._id = Statistics.__next_id
|
|
81
86
|
Statistics.__next_id += 1
|
|
82
87
|
|
|
83
|
-
self._instance_start: datetime | None = None
|
|
84
|
-
|
|
85
88
|
self.error_tracker = ErrorTracker(
|
|
86
89
|
save_error_snapshots=save_error_snapshots,
|
|
87
90
|
snapshot_kvs_name=persist_state_kvs_name,
|
|
@@ -92,9 +95,10 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
92
95
|
|
|
93
96
|
self._state = RecoverableState(
|
|
94
97
|
default_state=state_model(stats_id=self._id),
|
|
95
|
-
persist_state_key=persist_state_key or f'
|
|
98
|
+
persist_state_key=persist_state_key or f'__CRAWLER_STATISTICS_{self._id}',
|
|
96
99
|
persistence_enabled=persistence_enabled,
|
|
97
100
|
persist_state_kvs_name=persist_state_kvs_name,
|
|
101
|
+
persist_state_kvs_factory=persist_state_kvs_factory,
|
|
98
102
|
logger=logger,
|
|
99
103
|
)
|
|
100
104
|
|
|
@@ -110,8 +114,8 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
110
114
|
"""Create near copy of the `Statistics` with replaced `state_model`."""
|
|
111
115
|
new_statistics: Statistics[TNewStatisticsState] = Statistics(
|
|
112
116
|
persistence_enabled=self._state._persistence_enabled, # noqa: SLF001
|
|
113
|
-
persist_state_kvs_name=self._state._persist_state_kvs_name, # noqa: SLF001
|
|
114
117
|
persist_state_key=self._state._persist_state_key, # noqa: SLF001
|
|
118
|
+
persist_state_kvs_factory=self._state._persist_state_kvs_factory, # noqa: SLF001
|
|
115
119
|
log_message=self._log_message,
|
|
116
120
|
periodic_message_logger=self._periodic_message_logger,
|
|
117
121
|
state_model=state_model,
|
|
@@ -125,6 +129,7 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
125
129
|
persistence_enabled: bool = False,
|
|
126
130
|
persist_state_kvs_name: str | None = None,
|
|
127
131
|
persist_state_key: str | None = None,
|
|
132
|
+
persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
|
|
128
133
|
log_message: str = 'Statistics',
|
|
129
134
|
periodic_message_logger: Logger | None = None,
|
|
130
135
|
log_interval: timedelta = timedelta(minutes=1),
|
|
@@ -136,6 +141,7 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
136
141
|
persistence_enabled=persistence_enabled,
|
|
137
142
|
persist_state_kvs_name=persist_state_kvs_name,
|
|
138
143
|
persist_state_key=persist_state_key,
|
|
144
|
+
persist_state_kvs_factory=persist_state_kvs_factory,
|
|
139
145
|
log_message=log_message,
|
|
140
146
|
periodic_message_logger=periodic_message_logger,
|
|
141
147
|
log_interval=log_interval,
|
|
@@ -158,14 +164,17 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
158
164
|
if self._active:
|
|
159
165
|
raise RuntimeError(f'The {self.__class__.__name__} is already active.')
|
|
160
166
|
|
|
161
|
-
self._active = True
|
|
162
|
-
self._instance_start = datetime.now(timezone.utc)
|
|
163
|
-
|
|
164
167
|
await self._state.initialize()
|
|
165
|
-
|
|
168
|
+
# Reset `crawler_finished_at` to indicate a new run in progress.
|
|
169
|
+
self.state.crawler_finished_at = None
|
|
166
170
|
|
|
171
|
+
# Start periodic logging and let it print initial state before activation.
|
|
167
172
|
self._periodic_logger.start()
|
|
173
|
+
await asyncio.sleep(0.01)
|
|
174
|
+
self._active = True
|
|
168
175
|
|
|
176
|
+
self.state.crawler_last_started_at = datetime.now(timezone.utc)
|
|
177
|
+
self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at
|
|
169
178
|
return self
|
|
170
179
|
|
|
171
180
|
async def __aexit__(
|
|
@@ -182,13 +191,14 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
182
191
|
if not self._active:
|
|
183
192
|
raise RuntimeError(f'The {self.__class__.__name__} is not active.')
|
|
184
193
|
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
await self._state.teardown()
|
|
194
|
+
if not self.state.crawler_last_started_at:
|
|
195
|
+
raise RuntimeError('Statistics.state.crawler_last_started_at not set.')
|
|
188
196
|
|
|
197
|
+
# Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
|
|
189
198
|
await self._periodic_logger.stop()
|
|
190
|
-
|
|
199
|
+
self.state.crawler_finished_at = datetime.now(timezone.utc)
|
|
191
200
|
self._active = False
|
|
201
|
+
await self._state.teardown()
|
|
192
202
|
|
|
193
203
|
@property
|
|
194
204
|
def state(self) -> TStatisticsState:
|
|
@@ -247,11 +257,7 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
247
257
|
|
|
248
258
|
def calculate(self) -> FinalStatistics:
|
|
249
259
|
"""Calculate the current statistics."""
|
|
250
|
-
|
|
251
|
-
raise RuntimeError('The Statistics object is not initialized')
|
|
252
|
-
|
|
253
|
-
crawler_runtime = datetime.now(timezone.utc) - self._instance_start
|
|
254
|
-
total_minutes = crawler_runtime.total_seconds() / 60
|
|
260
|
+
total_minutes = self.state.crawler_runtime.total_seconds() / 60
|
|
255
261
|
state = self._state.current_value
|
|
256
262
|
serialized_state = state.model_dump(by_alias=False)
|
|
257
263
|
|
|
@@ -262,7 +268,7 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
262
268
|
requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,
|
|
263
269
|
request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,
|
|
264
270
|
requests_total=state.requests_failed + state.requests_finished,
|
|
265
|
-
crawler_runtime=crawler_runtime,
|
|
271
|
+
crawler_runtime=state.crawler_runtime,
|
|
266
272
|
requests_finished=state.requests_finished,
|
|
267
273
|
requests_failed=state.requests_failed,
|
|
268
274
|
retry_histogram=serialized_state['request_retry_histogram'],
|
|
@@ -282,21 +288,6 @@ class Statistics(Generic[TStatisticsState]):
|
|
|
282
288
|
else:
|
|
283
289
|
self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())
|
|
284
290
|
|
|
285
|
-
def _after_initialize(self) -> None:
|
|
286
|
-
state = self._state.current_value
|
|
287
|
-
|
|
288
|
-
if state.crawler_started_at is None:
|
|
289
|
-
state.crawler_started_at = datetime.now(timezone.utc)
|
|
290
|
-
|
|
291
|
-
if state.stats_persisted_at is not None and state.crawler_last_started_at:
|
|
292
|
-
self._instance_start = datetime.now(timezone.utc) - (
|
|
293
|
-
state.stats_persisted_at - state.crawler_last_started_at
|
|
294
|
-
)
|
|
295
|
-
elif state.crawler_last_started_at:
|
|
296
|
-
self._instance_start = state.crawler_last_started_at
|
|
297
|
-
|
|
298
|
-
state.crawler_last_started_at = self._instance_start
|
|
299
|
-
|
|
300
291
|
def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:
|
|
301
292
|
retry_count = record.retry_count
|
|
302
293
|
state = self._state.current_value
|
|
@@ -1,9 +1,25 @@
|
|
|
1
|
+
from crawlee._utils.try_import import install_import_hook as _install_import_hook
|
|
2
|
+
from crawlee._utils.try_import import try_import as _try_import
|
|
3
|
+
|
|
4
|
+
# These imports have only mandatory dependencies, so they are imported directly.
|
|
1
5
|
from ._base import StorageClient
|
|
2
6
|
from ._file_system import FileSystemStorageClient
|
|
3
7
|
from ._memory import MemoryStorageClient
|
|
4
8
|
|
|
9
|
+
_install_import_hook(__name__)
|
|
10
|
+
|
|
11
|
+
# The following imports are wrapped in try_import to handle optional dependencies,
|
|
12
|
+
# ensuring the module can still function even if these dependencies are missing.
|
|
13
|
+
with _try_import(__name__, 'SqlStorageClient'):
|
|
14
|
+
from ._sql import SqlStorageClient
|
|
15
|
+
|
|
16
|
+
with _try_import(__name__, 'RedisStorageClient'):
|
|
17
|
+
from ._redis import RedisStorageClient
|
|
18
|
+
|
|
5
19
|
__all__ = [
|
|
6
20
|
'FileSystemStorageClient',
|
|
7
21
|
'MemoryStorageClient',
|
|
22
|
+
'RedisStorageClient',
|
|
23
|
+
'SqlStorageClient',
|
|
8
24
|
'StorageClient',
|
|
9
25
|
]
|
|
@@ -30,12 +30,13 @@ class StorageClient(ABC):
|
|
|
30
30
|
(where applicable), and consistent access patterns across all storage types it supports.
|
|
31
31
|
"""
|
|
32
32
|
|
|
33
|
-
def
|
|
34
|
-
"""Return a cache key that can differentiate between different storages of this
|
|
33
|
+
def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable: # noqa: ARG002
|
|
34
|
+
"""Return a cache key that can differentiate between different storages of this and other clients.
|
|
35
35
|
|
|
36
|
-
Can be based on configuration or on the client itself. By default, returns
|
|
36
|
+
Can be based on configuration or on the client itself. By default, returns a module and name of the client
|
|
37
|
+
class.
|
|
37
38
|
"""
|
|
38
|
-
return ''
|
|
39
|
+
return f'{self.__class__.__module__}.{self.__class__.__name__}'
|
|
39
40
|
|
|
40
41
|
@abstractmethod
|
|
41
42
|
async def create_dataset_client(
|
|
@@ -9,11 +9,12 @@ from pathlib import Path
|
|
|
9
9
|
from typing import TYPE_CHECKING, Any
|
|
10
10
|
|
|
11
11
|
from pydantic import ValidationError
|
|
12
|
-
from typing_extensions import override
|
|
12
|
+
from typing_extensions import Self, override
|
|
13
13
|
|
|
14
14
|
from crawlee._consts import METADATA_FILENAME
|
|
15
15
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
16
16
|
from crawlee._utils.file import atomic_write, json_dumps
|
|
17
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
17
18
|
from crawlee.storage_clients._base import DatasetClient
|
|
18
19
|
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
|
|
19
20
|
|
|
@@ -93,7 +94,7 @@ class FileSystemDatasetClient(DatasetClient):
|
|
|
93
94
|
name: str | None,
|
|
94
95
|
alias: str | None,
|
|
95
96
|
configuration: Configuration,
|
|
96
|
-
) ->
|
|
97
|
+
) -> Self:
|
|
97
98
|
"""Open or create a file system dataset client.
|
|
98
99
|
|
|
99
100
|
This method attempts to open an existing dataset from the file system. If a dataset with the specified ID
|
|
@@ -114,9 +115,7 @@ class FileSystemDatasetClient(DatasetClient):
|
|
|
114
115
|
or if both name and alias are provided.
|
|
115
116
|
"""
|
|
116
117
|
# Validate input parameters.
|
|
117
|
-
|
|
118
|
-
if specified_params > 1:
|
|
119
|
-
raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
|
|
118
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
120
119
|
|
|
121
120
|
dataset_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
|
|
122
121
|
|
|
@@ -10,11 +10,12 @@ from pathlib import Path
|
|
|
10
10
|
from typing import TYPE_CHECKING, Any
|
|
11
11
|
|
|
12
12
|
from pydantic import ValidationError
|
|
13
|
-
from typing_extensions import override
|
|
13
|
+
from typing_extensions import Self, override
|
|
14
14
|
|
|
15
15
|
from crawlee._consts import METADATA_FILENAME
|
|
16
16
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
17
17
|
from crawlee._utils.file import atomic_write, infer_mime_type, json_dumps
|
|
18
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
18
19
|
from crawlee.storage_clients._base import KeyValueStoreClient
|
|
19
20
|
from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
|
|
20
21
|
|
|
@@ -92,7 +93,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
|
|
|
92
93
|
name: str | None,
|
|
93
94
|
alias: str | None,
|
|
94
95
|
configuration: Configuration,
|
|
95
|
-
) ->
|
|
96
|
+
) -> Self:
|
|
96
97
|
"""Open or create a file system key-value store client.
|
|
97
98
|
|
|
98
99
|
This method attempts to open an existing key-value store from the file system. If a KVS with the specified
|
|
@@ -113,9 +114,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
|
|
|
113
114
|
or if both name and alias are provided.
|
|
114
115
|
"""
|
|
115
116
|
# Validate input parameters.
|
|
116
|
-
|
|
117
|
-
if specified_params > 1:
|
|
118
|
-
raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
|
|
117
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
119
118
|
|
|
120
119
|
kvs_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
|
|
121
120
|
|
|
@@ -11,12 +11,13 @@ from pathlib import Path
|
|
|
11
11
|
from typing import TYPE_CHECKING
|
|
12
12
|
|
|
13
13
|
from pydantic import BaseModel, ValidationError
|
|
14
|
-
from typing_extensions import override
|
|
14
|
+
from typing_extensions import Self, override
|
|
15
15
|
|
|
16
16
|
from crawlee import Request
|
|
17
17
|
from crawlee._consts import METADATA_FILENAME
|
|
18
18
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
19
19
|
from crawlee._utils.file import atomic_write, json_dumps
|
|
20
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
20
21
|
from crawlee._utils.recoverable_state import RecoverableState
|
|
21
22
|
from crawlee.storage_clients._base import RequestQueueClient
|
|
22
23
|
from crawlee.storage_clients.models import (
|
|
@@ -30,6 +31,7 @@ if TYPE_CHECKING:
|
|
|
30
31
|
from collections.abc import Sequence
|
|
31
32
|
|
|
32
33
|
from crawlee.configuration import Configuration
|
|
34
|
+
from crawlee.storages import KeyValueStore
|
|
33
35
|
|
|
34
36
|
logger = getLogger(__name__)
|
|
35
37
|
|
|
@@ -91,6 +93,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
91
93
|
metadata: RequestQueueMetadata,
|
|
92
94
|
path_to_rq: Path,
|
|
93
95
|
lock: asyncio.Lock,
|
|
96
|
+
recoverable_state: RecoverableState[RequestQueueState],
|
|
94
97
|
) -> None:
|
|
95
98
|
"""Initialize a new instance.
|
|
96
99
|
|
|
@@ -113,13 +116,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
113
116
|
self._is_empty_cache: bool | None = None
|
|
114
117
|
"""Cache for is_empty result: None means unknown, True/False is cached state."""
|
|
115
118
|
|
|
116
|
-
self._state =
|
|
117
|
-
default_state=RequestQueueState(),
|
|
118
|
-
persist_state_key='request_queue_state',
|
|
119
|
-
persistence_enabled=True,
|
|
120
|
-
persist_state_kvs_name=f'__RQ_STATE_{self._metadata.id}',
|
|
121
|
-
logger=logger,
|
|
122
|
-
)
|
|
119
|
+
self._state = recoverable_state
|
|
123
120
|
"""Recoverable state to maintain request ordering, in-progress status, and handled status."""
|
|
124
121
|
|
|
125
122
|
@override
|
|
@@ -136,6 +133,22 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
136
133
|
"""The full path to the request queue metadata file."""
|
|
137
134
|
return self.path_to_rq / METADATA_FILENAME
|
|
138
135
|
|
|
136
|
+
@classmethod
|
|
137
|
+
async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState:
|
|
138
|
+
async def kvs_factory() -> KeyValueStore:
|
|
139
|
+
from crawlee.storage_clients import FileSystemStorageClient # noqa: PLC0415 avoid circular import
|
|
140
|
+
from crawlee.storages import KeyValueStore # noqa: PLC0415 avoid circular import
|
|
141
|
+
|
|
142
|
+
return await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=configuration)
|
|
143
|
+
|
|
144
|
+
return RecoverableState[RequestQueueState](
|
|
145
|
+
default_state=RequestQueueState(),
|
|
146
|
+
persist_state_key=f'__RQ_STATE_{id}',
|
|
147
|
+
persist_state_kvs_factory=kvs_factory,
|
|
148
|
+
persistence_enabled=True,
|
|
149
|
+
logger=logger,
|
|
150
|
+
)
|
|
151
|
+
|
|
139
152
|
@classmethod
|
|
140
153
|
async def open(
|
|
141
154
|
cls,
|
|
@@ -144,7 +157,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
144
157
|
name: str | None,
|
|
145
158
|
alias: str | None,
|
|
146
159
|
configuration: Configuration,
|
|
147
|
-
) ->
|
|
160
|
+
) -> Self:
|
|
148
161
|
"""Open or create a file system request queue client.
|
|
149
162
|
|
|
150
163
|
This method attempts to open an existing request queue from the file system. If a queue with the specified
|
|
@@ -165,9 +178,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
165
178
|
or if both name and alias are provided.
|
|
166
179
|
"""
|
|
167
180
|
# Validate input parameters.
|
|
168
|
-
|
|
169
|
-
if specified_params > 1:
|
|
170
|
-
raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
|
|
181
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
171
182
|
|
|
172
183
|
rq_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
|
|
173
184
|
|
|
@@ -196,6 +207,9 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
196
207
|
metadata=metadata,
|
|
197
208
|
path_to_rq=rq_base_path / rq_dir,
|
|
198
209
|
lock=asyncio.Lock(),
|
|
210
|
+
recoverable_state=await cls._create_recoverable_state(
|
|
211
|
+
id=id, configuration=configuration
|
|
212
|
+
),
|
|
199
213
|
)
|
|
200
214
|
await client._state.initialize()
|
|
201
215
|
await client._discover_existing_requests()
|
|
@@ -232,6 +246,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
232
246
|
metadata=metadata,
|
|
233
247
|
path_to_rq=path_to_rq,
|
|
234
248
|
lock=asyncio.Lock(),
|
|
249
|
+
recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
|
|
235
250
|
)
|
|
236
251
|
|
|
237
252
|
await client._state.initialize()
|
|
@@ -256,6 +271,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
|
|
|
256
271
|
metadata=metadata,
|
|
257
272
|
path_to_rq=path_to_rq,
|
|
258
273
|
lock=asyncio.Lock(),
|
|
274
|
+
recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
|
|
259
275
|
)
|
|
260
276
|
await client._state.initialize()
|
|
261
277
|
await client._update_metadata()
|
|
@@ -35,9 +35,9 @@ class FileSystemStorageClient(StorageClient):
|
|
|
35
35
|
"""
|
|
36
36
|
|
|
37
37
|
@override
|
|
38
|
-
def
|
|
38
|
+
def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:
|
|
39
39
|
# Even different client instances should return same storage if the storage_dir is the same.
|
|
40
|
-
return configuration.storage_dir
|
|
40
|
+
return super().get_storage_client_cache_key(configuration), configuration.storage_dir
|
|
41
41
|
|
|
42
42
|
@override
|
|
43
43
|
async def create_dataset_client(
|
|
@@ -4,9 +4,10 @@ from datetime import datetime, timezone
|
|
|
4
4
|
from logging import getLogger
|
|
5
5
|
from typing import TYPE_CHECKING, Any
|
|
6
6
|
|
|
7
|
-
from typing_extensions import override
|
|
7
|
+
from typing_extensions import Self, override
|
|
8
8
|
|
|
9
9
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
10
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
10
11
|
from crawlee.storage_clients._base import DatasetClient
|
|
11
12
|
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
|
|
12
13
|
|
|
@@ -54,7 +55,7 @@ class MemoryDatasetClient(DatasetClient):
|
|
|
54
55
|
id: str | None,
|
|
55
56
|
name: str | None,
|
|
56
57
|
alias: str | None,
|
|
57
|
-
) ->
|
|
58
|
+
) -> Self:
|
|
58
59
|
"""Open or create a new memory dataset client.
|
|
59
60
|
|
|
60
61
|
This method creates a new in-memory dataset instance. Unlike persistent storage implementations, memory
|
|
@@ -76,9 +77,7 @@ class MemoryDatasetClient(DatasetClient):
|
|
|
76
77
|
ValueError: If both name and alias are provided, or if neither id, name, nor alias is provided.
|
|
77
78
|
"""
|
|
78
79
|
# Validate input parameters.
|
|
79
|
-
|
|
80
|
-
if specified_params > 1:
|
|
81
|
-
raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
|
|
80
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
82
81
|
|
|
83
82
|
# Create a new dataset
|
|
84
83
|
dataset_id = id or crypto_random_object_id()
|
|
@@ -4,10 +4,11 @@ import sys
|
|
|
4
4
|
from datetime import datetime, timezone
|
|
5
5
|
from typing import TYPE_CHECKING, Any
|
|
6
6
|
|
|
7
|
-
from typing_extensions import override
|
|
7
|
+
from typing_extensions import Self, override
|
|
8
8
|
|
|
9
9
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
10
10
|
from crawlee._utils.file import infer_mime_type
|
|
11
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
11
12
|
from crawlee.storage_clients._base import KeyValueStoreClient
|
|
12
13
|
from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
|
|
13
14
|
|
|
@@ -52,7 +53,7 @@ class MemoryKeyValueStoreClient(KeyValueStoreClient):
|
|
|
52
53
|
id: str | None,
|
|
53
54
|
name: str | None,
|
|
54
55
|
alias: str | None,
|
|
55
|
-
) ->
|
|
56
|
+
) -> Self:
|
|
56
57
|
"""Open or create a new memory key-value store client.
|
|
57
58
|
|
|
58
59
|
This method creates a new in-memory key-value store instance. Unlike persistent storage implementations,
|
|
@@ -74,9 +75,7 @@ class MemoryKeyValueStoreClient(KeyValueStoreClient):
|
|
|
74
75
|
ValueError: If both name and alias are provided.
|
|
75
76
|
"""
|
|
76
77
|
# Validate input parameters.
|
|
77
|
-
|
|
78
|
-
if specified_params > 1:
|
|
79
|
-
raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
|
|
78
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
80
79
|
|
|
81
80
|
# Create a new key-value store
|
|
82
81
|
store_id = id or crypto_random_object_id()
|
|
@@ -6,10 +6,11 @@ from datetime import datetime, timezone
|
|
|
6
6
|
from logging import getLogger
|
|
7
7
|
from typing import TYPE_CHECKING
|
|
8
8
|
|
|
9
|
-
from typing_extensions import override
|
|
9
|
+
from typing_extensions import Self, override
|
|
10
10
|
|
|
11
11
|
from crawlee import Request
|
|
12
12
|
from crawlee._utils.crypto import crypto_random_object_id
|
|
13
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
13
14
|
from crawlee.storage_clients._base import RequestQueueClient
|
|
14
15
|
from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata
|
|
15
16
|
|
|
@@ -64,7 +65,7 @@ class MemoryRequestQueueClient(RequestQueueClient):
|
|
|
64
65
|
id: str | None,
|
|
65
66
|
name: str | None,
|
|
66
67
|
alias: str | None,
|
|
67
|
-
) ->
|
|
68
|
+
) -> Self:
|
|
68
69
|
"""Open or create a new memory request queue client.
|
|
69
70
|
|
|
70
71
|
This method creates a new in-memory request queue instance. Unlike persistent storage implementations,
|
|
@@ -86,9 +87,7 @@ class MemoryRequestQueueClient(RequestQueueClient):
|
|
|
86
87
|
ValueError: If both name and alias are provided.
|
|
87
88
|
"""
|
|
88
89
|
# Validate input parameters.
|
|
89
|
-
|
|
90
|
-
if specified_params > 1:
|
|
91
|
-
raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
|
|
90
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
92
91
|
|
|
93
92
|
# Create a new queue
|
|
94
93
|
queue_id = id or crypto_random_object_id()
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
from ._dataset_client import RedisDatasetClient
|
|
2
|
+
from ._key_value_store_client import RedisKeyValueStoreClient
|
|
3
|
+
from ._request_queue_client import RedisRequestQueueClient
|
|
4
|
+
from ._storage_client import RedisStorageClient
|
|
5
|
+
|
|
6
|
+
__all__ = ['RedisDatasetClient', 'RedisKeyValueStoreClient', 'RedisRequestQueueClient', 'RedisStorageClient']
|