crawlee-0.6.13b43-py3-none-any.whl → crawlee-1.1.1b1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69)
  1. crawlee/_request.py +32 -21
  2. crawlee/_service_locator.py +4 -4
  3. crawlee/_types.py +67 -24
  4. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  5. crawlee/_utils/recoverable_state.py +32 -8
  6. crawlee/_utils/recurring_task.py +15 -0
  7. crawlee/_utils/robots.py +17 -5
  8. crawlee/_utils/sitemap.py +1 -1
  9. crawlee/_utils/urls.py +9 -2
  10. crawlee/browsers/_browser_pool.py +4 -1
  11. crawlee/browsers/_playwright_browser_controller.py +21 -15
  12. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  13. crawlee/browsers/_types.py +1 -1
  14. crawlee/configuration.py +3 -1
  15. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
  16. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +38 -14
  17. crawlee/crawlers/_basic/_basic_crawler.py +51 -14
  18. crawlee/crawlers/_playwright/_playwright_crawler.py +16 -4
  19. crawlee/events/_event_manager.py +3 -1
  20. crawlee/fingerprint_suite/_header_generator.py +2 -2
  21. crawlee/otel/crawler_instrumentor.py +3 -3
  22. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  23. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  24. crawlee/request_loaders/_sitemap_request_loader.py +22 -4
  25. crawlee/sessions/_session_pool.py +1 -1
  26. crawlee/statistics/_error_snapshotter.py +1 -1
  27. crawlee/statistics/_models.py +32 -1
  28. crawlee/statistics/_statistics.py +24 -33
  29. crawlee/storage_clients/__init__.py +16 -0
  30. crawlee/storage_clients/_base/_storage_client.py +5 -4
  31. crawlee/storage_clients/_file_system/_dataset_client.py +4 -5
  32. crawlee/storage_clients/_file_system/_key_value_store_client.py +4 -5
  33. crawlee/storage_clients/_file_system/_request_queue_client.py +28 -12
  34. crawlee/storage_clients/_file_system/_storage_client.py +2 -2
  35. crawlee/storage_clients/_memory/_dataset_client.py +4 -5
  36. crawlee/storage_clients/_memory/_key_value_store_client.py +4 -5
  37. crawlee/storage_clients/_memory/_request_queue_client.py +4 -5
  38. crawlee/storage_clients/_redis/__init__.py +6 -0
  39. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  40. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  41. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  42. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  43. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  44. crawlee/storage_clients/_redis/_utils.py +23 -0
  45. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  46. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  47. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  48. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  49. crawlee/storage_clients/_redis/py.typed +0 -0
  50. crawlee/storage_clients/_sql/__init__.py +6 -0
  51. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  52. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  53. crawlee/storage_clients/_sql/_db_models.py +268 -0
  54. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  55. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  56. crawlee/storage_clients/_sql/_storage_client.py +291 -0
  57. crawlee/storage_clients/_sql/py.typed +0 -0
  58. crawlee/storage_clients/models.py +10 -10
  59. crawlee/storages/_base.py +3 -1
  60. crawlee/storages/_dataset.py +5 -3
  61. crawlee/storages/_key_value_store.py +11 -6
  62. crawlee/storages/_request_queue.py +5 -3
  63. crawlee/storages/_storage_instance_manager.py +54 -68
  64. crawlee/storages/_utils.py +11 -0
  65. {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +16 -5
  66. {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +69 -47
  67. {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
  68. {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
  69. {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0
--- a/crawlee/statistics/_statistics.py
+++ b/crawlee/statistics/_statistics.py
@@ -1,6 +1,7 @@
 # Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/core/src/crawlers/statistics.ts
 from __future__ import annotations

+import asyncio
 import math
 import time
 from datetime import datetime, timedelta, timezone
@@ -17,8 +18,11 @@ from crawlee.statistics import FinalStatistics, StatisticsState
 from crawlee.statistics._error_tracker import ErrorTracker

 if TYPE_CHECKING:
+    from collections.abc import Callable, Coroutine
     from types import TracebackType

+    from crawlee.storages import KeyValueStore
+
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TNewStatisticsState = TypeVar('TNewStatisticsState', bound=StatisticsState, default=StatisticsState)
 logger = getLogger(__name__)
@@ -70,6 +74,7 @@ class Statistics(Generic[TStatisticsState]):
         persistence_enabled: bool | Literal['explicit_only'] = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
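The new `persist_state_kvs_factory` parameter lets callers control which key-value store backs the persisted statistics, instead of relying solely on a store name. A minimal sketch of a caller-supplied factory (illustrative only; the same parameter is mirrored in `with_default_state` in a later hunk):

    from crawlee.statistics import Statistics
    from crawlee.storages import KeyValueStore

    async def kvs_factory() -> KeyValueStore:
        # Any coroutine that returns a KeyValueStore works; this one opens a
        # named store using whatever storage client is configured globally.
        return await KeyValueStore.open(name='statistics-store')

    stats = Statistics.with_default_state(
        persistence_enabled=True,
        persist_state_kvs_factory=kvs_factory,
    )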
@@ -80,8 +85,6 @@ class Statistics(Generic[TStatisticsState]):
         self._id = Statistics.__next_id
         Statistics.__next_id += 1

-        self._instance_start: datetime | None = None
-
         self.error_tracker = ErrorTracker(
             save_error_snapshots=save_error_snapshots,
             snapshot_kvs_name=persist_state_kvs_name,
@@ -92,9 +95,10 @@ class Statistics(Generic[TStatisticsState]):

         self._state = RecoverableState(
             default_state=state_model(stats_id=self._id),
-            persist_state_key=persist_state_key or f'SDK_CRAWLER_STATISTICS_{self._id}',
+            persist_state_key=persist_state_key or f'__CRAWLER_STATISTICS_{self._id}',
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             logger=logger,
         )

@@ -110,8 +114,8 @@ class Statistics(Generic[TStatisticsState]):
         """Create near copy of the `Statistics` with replaced `state_model`."""
         new_statistics: Statistics[TNewStatisticsState] = Statistics(
             persistence_enabled=self._state._persistence_enabled,  # noqa: SLF001
-            persist_state_kvs_name=self._state._persist_state_kvs_name,  # noqa: SLF001
             persist_state_key=self._state._persist_state_key,  # noqa: SLF001
+            persist_state_kvs_factory=self._state._persist_state_kvs_factory,  # noqa: SLF001
             log_message=self._log_message,
             periodic_message_logger=self._periodic_message_logger,
             state_model=state_model,
@@ -125,6 +129,7 @@ class Statistics(Generic[TStatisticsState]):
         persistence_enabled: bool = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -136,6 +141,7 @@ class Statistics(Generic[TStatisticsState]):
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
             persist_state_key=persist_state_key,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             log_message=log_message,
             periodic_message_logger=periodic_message_logger,
             log_interval=log_interval,
@@ -158,14 +164,17 @@ class Statistics(Generic[TStatisticsState]):
         if self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is already active.')

-        self._active = True
-        self._instance_start = datetime.now(timezone.utc)
-
         await self._state.initialize()
-        self._after_initialize()
+        # Reset `crawler_finished_at` to indicate a new run in progress.
+        self.state.crawler_finished_at = None

+        # Start periodic logging and let it print initial state before activation.
         self._periodic_logger.start()
+        await asyncio.sleep(0.01)
+        self._active = True

+        self.state.crawler_last_started_at = datetime.now(timezone.utc)
+        self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at
         return self

     async def __aexit__(
@@ -182,13 +191,14 @@ class Statistics(Generic[TStatisticsState]):
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')

-        self._state.current_value.crawler_finished_at = datetime.now(timezone.utc)
-
-        await self._state.teardown()
+        if not self.state.crawler_last_started_at:
+            raise RuntimeError('Statistics.state.crawler_last_started_at not set.')

+        # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
         await self._periodic_logger.stop()
-
+        self.state.crawler_finished_at = datetime.now(timezone.utc)
         self._active = False
+        await self._state.teardown()

     @property
     def state(self) -> TStatisticsState:
@@ -247,11 +257,7 @@ class Statistics(Generic[TStatisticsState]):

     def calculate(self) -> FinalStatistics:
         """Calculate the current statistics."""
-        if self._instance_start is None:
-            raise RuntimeError('The Statistics object is not initialized')
-
-        crawler_runtime = datetime.now(timezone.utc) - self._instance_start
-        total_minutes = crawler_runtime.total_seconds() / 60
+        total_minutes = self.state.crawler_runtime.total_seconds() / 60
         state = self._state.current_value
         serialized_state = state.model_dump(by_alias=False)

@@ -262,7 +268,7 @@ class Statistics(Generic[TStatisticsState]):
             requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,
             request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,
             requests_total=state.requests_failed + state.requests_finished,
-            crawler_runtime=crawler_runtime,
+            crawler_runtime=state.crawler_runtime,
             requests_finished=state.requests_finished,
             requests_failed=state.requests_failed,
             retry_histogram=serialized_state['request_retry_histogram'],
@@ -282,21 +288,6 @@ class Statistics(Generic[TStatisticsState]):
         else:
             self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())

-    def _after_initialize(self) -> None:
-        state = self._state.current_value
-
-        if state.crawler_started_at is None:
-            state.crawler_started_at = datetime.now(timezone.utc)
-
-        if state.stats_persisted_at is not None and state.crawler_last_started_at:
-            self._instance_start = datetime.now(timezone.utc) - (
-                state.stats_persisted_at - state.crawler_last_started_at
-            )
-        elif state.crawler_last_started_at:
-            self._instance_start = state.crawler_last_started_at
-
-        state.crawler_last_started_at = self._instance_start
-
     def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:
         retry_count = record.retry_count
         state = self._state.current_value
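With `_after_initialize` removed, runtime bookkeeping lives entirely in the persisted state: `crawler_started_at` survives across runs, `crawler_last_started_at` marks the current run, and `calculate()` reads `state.crawler_runtime` instead of an in-memory start timestamp. A short sketch of the resulting behavior, assuming `StatisticsState.crawler_runtime` accumulates time across persisted runs (per the `crawlee/statistics/_models.py` change not shown here):

    import asyncio
    from crawlee.statistics import Statistics

    async def main() -> None:
        stats = Statistics.with_default_state(persistence_enabled=True)
        async with stats:
            ...  # crawl work happens here
        # Includes runtime recovered from previously persisted runs, not just
        # the wall-clock time of this context-manager block.
        print(stats.calculate().crawler_runtime)

    asyncio.run(main())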
--- a/crawlee/storage_clients/__init__.py
+++ b/crawlee/storage_clients/__init__.py
@@ -1,9 +1,25 @@
+from crawlee._utils.try_import import install_import_hook as _install_import_hook
+from crawlee._utils.try_import import try_import as _try_import
+
+# These imports have only mandatory dependencies, so they are imported directly.
 from ._base import StorageClient
 from ._file_system import FileSystemStorageClient
 from ._memory import MemoryStorageClient

+_install_import_hook(__name__)
+
+# The following imports are wrapped in try_import to handle optional dependencies,
+# ensuring the module can still function even if these dependencies are missing.
+with _try_import(__name__, 'SqlStorageClient'):
+    from ._sql import SqlStorageClient
+
+with _try_import(__name__, 'RedisStorageClient'):
+    from ._redis import RedisStorageClient
+
 __all__ = [
     'FileSystemStorageClient',
     'MemoryStorageClient',
+    'RedisStorageClient',
+    'SqlStorageClient',
     'StorageClient',
 ]
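`SqlStorageClient` and `RedisStorageClient` ship as optional extras. With the import hook installed, referencing a client whose dependencies are missing should fail with an informative `ImportError` rather than breaking `import crawlee.storage_clients` itself. A sketch of the expected caller-side behavior (the exact error message comes from the hook and is not shown in this diff):

    try:
        from crawlee.storage_clients import RedisStorageClient
    except ImportError as exc:
        # Raised only when the optional Redis dependencies are absent;
        # FileSystemStorageClient and MemoryStorageClient stay importable.
        print(f'Redis support not installed: {exc}')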
--- a/crawlee/storage_clients/_base/_storage_client.py
+++ b/crawlee/storage_clients/_base/_storage_client.py
@@ -30,12 +30,13 @@ class StorageClient(ABC):
     (where applicable), and consistent access patterns across all storage types it supports.
     """

-    def get_additional_cache_key(self, configuration: Configuration) -> Hashable:  # noqa: ARG002
-        """Return a cache key that can differentiate between different storages of this client.
+    def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:  # noqa: ARG002
+        """Return a cache key that can differentiate between different storages of this and other clients.

-        Can be based on configuration or on the client itself. By default, returns an empty string.
+        Can be based on configuration or on the client itself. By default, returns a module and name of the client
+        class.
         """
-        return ''
+        return f'{self.__class__.__module__}.{self.__class__.__name__}'

     @abstractmethod
     async def create_dataset_client(
--- a/crawlee/storage_clients/_file_system/_dataset_client.py
+++ b/crawlee/storage_clients/_file_system/_dataset_client.py
@@ -9,11 +9,12 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any

 from pydantic import ValidationError
-from typing_extensions import override
+from typing_extensions import Self, override

 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import atomic_write, json_dumps
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import DatasetClient
 from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata

@@ -93,7 +94,7 @@ class FileSystemDatasetClient(DatasetClient):
         name: str | None,
         alias: str | None,
         configuration: Configuration,
-    ) -> FileSystemDatasetClient:
+    ) -> Self:
         """Open or create a file system dataset client.

         This method attempts to open an existing dataset from the file system. If a dataset with the specified ID
@@ -114,9 +115,7 @@ class FileSystemDatasetClient(DatasetClient):
                 or if both name and alias are provided.
         """
         # Validate input parameters.
-        specified_params = sum(1 for param in [id, name, alias] if param is not None)
-        if specified_params > 1:
-            raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)

         dataset_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR

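The repeated three-line validation collapses into the new `raise_if_too_many_kwargs` helper (`crawlee/_utils/raise_if_too_many_kwargs.py`, +12 lines, not included in this diff). Based on its call sites here, a plausible reconstruction:

    # Hypothetical reconstruction -- the real helper is not shown in this diff.
    from typing import Any

    def raise_if_too_many_kwargs(**kwargs: Any) -> None:
        """Raise ValueError if more than one of the given kwargs is not None."""
        specified = [name for name, value in kwargs.items() if value is not None]
        if len(specified) > 1:
            names = ', '.join(f'"{name}"' for name in kwargs)
            raise ValueError(f'Only one of {names} can be specified, not multiple.')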
--- a/crawlee/storage_clients/_file_system/_key_value_store_client.py
+++ b/crawlee/storage_clients/_file_system/_key_value_store_client.py
@@ -10,11 +10,12 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any

 from pydantic import ValidationError
-from typing_extensions import override
+from typing_extensions import Self, override

 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import atomic_write, infer_mime_type, json_dumps
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import KeyValueStoreClient
 from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata

@@ -92,7 +93,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         name: str | None,
         alias: str | None,
         configuration: Configuration,
-    ) -> FileSystemKeyValueStoreClient:
+    ) -> Self:
         """Open or create a file system key-value store client.

         This method attempts to open an existing key-value store from the file system. If a KVS with the specified
@@ -113,9 +114,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
                 or if both name and alias are provided.
         """
         # Validate input parameters.
-        specified_params = sum(1 for param in [id, name, alias] if param is not None)
-        if specified_params > 1:
-            raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)

         kvs_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR

--- a/crawlee/storage_clients/_file_system/_request_queue_client.py
+++ b/crawlee/storage_clients/_file_system/_request_queue_client.py
@@ -11,12 +11,13 @@ from pathlib import Path
 from typing import TYPE_CHECKING

 from pydantic import BaseModel, ValidationError
-from typing_extensions import override
+from typing_extensions import Self, override

 from crawlee import Request
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import atomic_write, json_dumps
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee._utils.recoverable_state import RecoverableState
 from crawlee.storage_clients._base import RequestQueueClient
 from crawlee.storage_clients.models import (
@@ -30,6 +31,7 @@ if TYPE_CHECKING:
     from collections.abc import Sequence

     from crawlee.configuration import Configuration
+    from crawlee.storages import KeyValueStore

 logger = getLogger(__name__)

@@ -91,6 +93,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         metadata: RequestQueueMetadata,
         path_to_rq: Path,
         lock: asyncio.Lock,
+        recoverable_state: RecoverableState[RequestQueueState],
     ) -> None:
         """Initialize a new instance.

@@ -113,13 +116,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         self._is_empty_cache: bool | None = None
         """Cache for is_empty result: None means unknown, True/False is cached state."""

-        self._state = RecoverableState[RequestQueueState](
-            default_state=RequestQueueState(),
-            persist_state_key='request_queue_state',
-            persistence_enabled=True,
-            persist_state_kvs_name=f'__RQ_STATE_{self._metadata.id}',
-            logger=logger,
-        )
+        self._state = recoverable_state
         """Recoverable state to maintain request ordering, in-progress status, and handled status."""

     @override
@@ -136,6 +133,22 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """The full path to the request queue metadata file."""
         return self.path_to_rq / METADATA_FILENAME

+    @classmethod
+    async def _create_recoverable_state(cls, id: str, configuration: Configuration) -> RecoverableState:
+        async def kvs_factory() -> KeyValueStore:
+            from crawlee.storage_clients import FileSystemStorageClient  # noqa: PLC0415 avoid circular import
+            from crawlee.storages import KeyValueStore  # noqa: PLC0415 avoid circular import
+
+            return await KeyValueStore.open(storage_client=FileSystemStorageClient(), configuration=configuration)
+
+        return RecoverableState[RequestQueueState](
+            default_state=RequestQueueState(),
+            persist_state_key=f'__RQ_STATE_{id}',
+            persist_state_kvs_factory=kvs_factory,
+            persistence_enabled=True,
+            logger=logger,
+        )
+
     @classmethod
     async def open(
         cls,
@@ -144,7 +157,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         name: str | None,
         alias: str | None,
         configuration: Configuration,
-    ) -> FileSystemRequestQueueClient:
+    ) -> Self:
         """Open or create a file system request queue client.

         This method attempts to open an existing request queue from the file system. If a queue with the specified
@@ -165,9 +178,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 or if both name and alias are provided.
         """
         # Validate input parameters.
-        specified_params = sum(1 for param in [id, name, alias] if param is not None)
-        if specified_params > 1:
-            raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)

         rq_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR

@@ -196,6 +207,9 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 metadata=metadata,
                 path_to_rq=rq_base_path / rq_dir,
                 lock=asyncio.Lock(),
+                recoverable_state=await cls._create_recoverable_state(
+                    id=id, configuration=configuration
+                ),
             )
             await client._state.initialize()
             await client._discover_existing_requests()
@@ -232,6 +246,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             metadata=metadata,
             path_to_rq=path_to_rq,
             lock=asyncio.Lock(),
+            recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
         )

         await client._state.initialize()
@@ -256,6 +271,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             metadata=metadata,
             path_to_rq=path_to_rq,
             lock=asyncio.Lock(),
+            recoverable_state=await cls._create_recoverable_state(id=metadata.id, configuration=configuration),
         )
         await client._state.initialize()
         await client._update_metadata()
--- a/crawlee/storage_clients/_file_system/_storage_client.py
+++ b/crawlee/storage_clients/_file_system/_storage_client.py
@@ -35,9 +35,9 @@ class FileSystemStorageClient(StorageClient):
     """

     @override
-    def get_additional_cache_key(self, configuration: Configuration) -> Hashable:
+    def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:
         # Even different client instances should return same storage if the storage_dir is the same.
-        return configuration.storage_dir
+        return super().get_storage_client_cache_key(configuration), configuration.storage_dir

     @override
     async def create_dataset_client(
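Together with the base-class change above, every storage client now contributes its class identity to the cache key, so two different client types with the same configuration can no longer collide in the storage instance cache. A quick illustration of the resulting keys, assuming `MemoryStorageClient` keeps the base implementation (module paths and the `<storage_dir>` placeholder are illustrative):

    from crawlee.configuration import Configuration
    from crawlee.storage_clients import FileSystemStorageClient, MemoryStorageClient

    config = Configuration()
    # ('crawlee.storage_clients._file_system._storage_client.FileSystemStorageClient', '<storage_dir>')
    print(FileSystemStorageClient().get_storage_client_cache_key(config))
    # 'crawlee.storage_clients._memory._storage_client.MemoryStorageClient'
    print(MemoryStorageClient().get_storage_client_cache_key(config))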
--- a/crawlee/storage_clients/_memory/_dataset_client.py
+++ b/crawlee/storage_clients/_memory/_dataset_client.py
@@ -4,9 +4,10 @@ from datetime import datetime, timezone
 from logging import getLogger
 from typing import TYPE_CHECKING, Any

-from typing_extensions import override
+from typing_extensions import Self, override

 from crawlee._utils.crypto import crypto_random_object_id
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import DatasetClient
 from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata

@@ -54,7 +55,7 @@ class MemoryDatasetClient(DatasetClient):
         id: str | None,
         name: str | None,
         alias: str | None,
-    ) -> MemoryDatasetClient:
+    ) -> Self:
         """Open or create a new memory dataset client.

         This method creates a new in-memory dataset instance. Unlike persistent storage implementations, memory
@@ -76,9 +77,7 @@ class MemoryDatasetClient(DatasetClient):
             ValueError: If both name and alias are provided, or if neither id, name, nor alias is provided.
         """
         # Validate input parameters.
-        specified_params = sum(1 for param in [id, name, alias] if param is not None)
-        if specified_params > 1:
-            raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)

         # Create a new dataset
         dataset_id = id or crypto_random_object_id()
--- a/crawlee/storage_clients/_memory/_key_value_store_client.py
+++ b/crawlee/storage_clients/_memory/_key_value_store_client.py
@@ -4,10 +4,11 @@ import sys
 from datetime import datetime, timezone
 from typing import TYPE_CHECKING, Any

-from typing_extensions import override
+from typing_extensions import Self, override

 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import infer_mime_type
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import KeyValueStoreClient
 from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata

@@ -52,7 +53,7 @@ class MemoryKeyValueStoreClient(KeyValueStoreClient):
         id: str | None,
         name: str | None,
         alias: str | None,
-    ) -> MemoryKeyValueStoreClient:
+    ) -> Self:
         """Open or create a new memory key-value store client.

         This method creates a new in-memory key-value store instance. Unlike persistent storage implementations,
@@ -74,9 +75,7 @@ class MemoryKeyValueStoreClient(KeyValueStoreClient):
             ValueError: If both name and alias are provided.
         """
         # Validate input parameters.
-        specified_params = sum(1 for param in [id, name, alias] if param is not None)
-        if specified_params > 1:
-            raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)

         # Create a new key-value store
         store_id = id or crypto_random_object_id()
--- a/crawlee/storage_clients/_memory/_request_queue_client.py
+++ b/crawlee/storage_clients/_memory/_request_queue_client.py
@@ -6,10 +6,11 @@ from datetime import datetime, timezone
 from logging import getLogger
 from typing import TYPE_CHECKING

-from typing_extensions import override
+from typing_extensions import Self, override

 from crawlee import Request
 from crawlee._utils.crypto import crypto_random_object_id
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import RequestQueueClient
 from crawlee.storage_clients.models import AddRequestsResponse, ProcessedRequest, RequestQueueMetadata

@@ -64,7 +65,7 @@ class MemoryRequestQueueClient(RequestQueueClient):
         id: str | None,
         name: str | None,
         alias: str | None,
-    ) -> MemoryRequestQueueClient:
+    ) -> Self:
         """Open or create a new memory request queue client.

         This method creates a new in-memory request queue instance. Unlike persistent storage implementations,
@@ -86,9 +87,7 @@ class MemoryRequestQueueClient(RequestQueueClient):
             ValueError: If both name and alias are provided.
         """
         # Validate input parameters.
-        specified_params = sum(1 for param in [id, name, alias] if param is not None)
-        if specified_params > 1:
-            raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)

         # Create a new queue
         queue_id = id or crypto_random_object_id()
--- /dev/null
+++ b/crawlee/storage_clients/_redis/__init__.py
@@ -0,0 +1,6 @@
+from ._dataset_client import RedisDatasetClient
+from ._key_value_store_client import RedisKeyValueStoreClient
+from ._request_queue_client import RedisRequestQueueClient
+from ._storage_client import RedisStorageClient
+
+__all__ = ['RedisDatasetClient', 'RedisKeyValueStoreClient', 'RedisRequestQueueClient', 'RedisStorageClient']
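The new Redis backend mirrors the file-system and memory clients: one `RedisStorageClient` fronting dataset, key-value store, and request-queue clients, with the Lua scripts listed above providing atomic queue operations. A hypothetical wiring sketch; the constructor arguments are not shown in this diff, so treat them as assumptions (crawlers do accept a `storage_client` argument):

    from crawlee.crawlers import HttpCrawler
    from crawlee.storage_clients import RedisStorageClient

    # Assumed constructor parameter; consult the real signature in
    # crawlee/storage_clients/_redis/_storage_client.py for the actual options.
    storage_client = RedisStorageClient(connection_string='redis://localhost:6379')
    crawler = HttpCrawler(storage_client=storage_client)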