crawlee 0.6.13b31__py3-none-any.whl → 1.1.1b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlee might be problematic.

Files changed (82)
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +34 -22
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +86 -33
  5. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  6. crawlee/_utils/recoverable_state.py +32 -8
  7. crawlee/_utils/recurring_task.py +15 -0
  8. crawlee/_utils/robots.py +17 -5
  9. crawlee/_utils/sitemap.py +1 -1
  10. crawlee/_utils/system.py +3 -3
  11. crawlee/_utils/urls.py +9 -2
  12. crawlee/browsers/_browser_pool.py +4 -1
  13. crawlee/browsers/_playwright_browser_controller.py +21 -15
  14. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  15. crawlee/browsers/_types.py +1 -1
  16. crawlee/configuration.py +2 -0
  17. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +6 -2
  18. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  19. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  20. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
  21. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  22. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
  23. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  24. crawlee/crawlers/_basic/_basic_crawler.py +124 -37
  25. crawlee/crawlers/_playwright/_playwright_crawler.py +17 -5
  26. crawlee/events/_event_manager.py +3 -1
  27. crawlee/events/_types.py +6 -6
  28. crawlee/fingerprint_suite/_header_generator.py +2 -2
  29. crawlee/fingerprint_suite/_types.py +2 -2
  30. crawlee/otel/crawler_instrumentor.py +3 -3
  31. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  32. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  33. crawlee/request_loaders/_request_list.py +1 -1
  34. crawlee/request_loaders/_sitemap_request_loader.py +23 -5
  35. crawlee/sessions/_models.py +2 -2
  36. crawlee/sessions/_session_pool.py +1 -1
  37. crawlee/statistics/_error_snapshotter.py +1 -1
  38. crawlee/statistics/_models.py +33 -2
  39. crawlee/statistics/_statistics.py +24 -33
  40. crawlee/storage_clients/__init__.py +16 -0
  41. crawlee/storage_clients/_base/_storage_client.py +13 -0
  42. crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
  43. crawlee/storage_clients/_file_system/_key_value_store_client.py +29 -25
  44. crawlee/storage_clients/_file_system/_request_queue_client.py +53 -34
  45. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  46. crawlee/storage_clients/_file_system/_utils.py +0 -0
  47. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  48. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  49. crawlee/storage_clients/_memory/_request_queue_client.py +16 -4
  50. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  51. crawlee/storage_clients/_redis/__init__.py +6 -0
  52. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  53. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  54. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  55. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  56. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  57. crawlee/storage_clients/_redis/_utils.py +23 -0
  58. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  59. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  60. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  61. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  62. crawlee/storage_clients/_redis/py.typed +0 -0
  63. crawlee/storage_clients/_sql/__init__.py +6 -0
  64. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  65. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  66. crawlee/storage_clients/_sql/_db_models.py +268 -0
  67. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  68. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  69. crawlee/storage_clients/_sql/_storage_client.py +291 -0
  70. crawlee/storage_clients/_sql/py.typed +0 -0
  71. crawlee/storage_clients/models.py +10 -10
  72. crawlee/storages/_base.py +5 -1
  73. crawlee/storages/_dataset.py +12 -2
  74. crawlee/storages/_key_value_store.py +17 -4
  75. crawlee/storages/_request_queue.py +10 -2
  76. crawlee/storages/_storage_instance_manager.py +133 -71
  77. crawlee/storages/_utils.py +11 -0
  78. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +17 -6
  79. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +82 -59
  80. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
  81. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
  82. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0
crawlee/sessions/_models.py

@@ -20,7 +20,7 @@ from ._session import Session
 class SessionModel(BaseModel):
     """Model for a Session object."""
 
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     id: Annotated[str, Field(alias='id')]
     max_age: Annotated[timedelta, Field(alias='maxAge')]
@@ -38,7 +38,7 @@ class SessionModel(BaseModel):
 class SessionPoolModel(BaseModel):
     """Model for a SessionPool object."""
 
-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
     max_pool_size: Annotated[int, Field(alias='maxPoolSize')]
 
crawlee/sessions/_session_pool.py

@@ -163,7 +163,7 @@ class SessionPool:
     def add_session(self, session: Session) -> None:
         """Add an externally created session to the pool.
 
-        This is intened only for the cases when you want to add a session that was created outside of the pool.
+        This is intended only for the cases when you want to add a session that was created outside of the pool.
         Otherwise, the pool will create new sessions automatically.
 
         Args:
crawlee/statistics/_error_snapshotter.py

@@ -32,7 +32,7 @@ class ErrorSnapshotter:
     """Capture error snapshot and save it to key value store.
 
     It saves the error snapshot directly to a key value store. It can't use `context.get_key_value_store` because
-    it returns `KeyValueStoreChangeRecords` which is commited to the key value store only if the `RequestHandler`
+    it returns `KeyValueStoreChangeRecords` which is committed to the key value store only if the `RequestHandler`
     returned without an exception. ErrorSnapshotter is on the contrary active only when `RequestHandler` fails with
     an exception.
 
crawlee/statistics/_models.py

@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import json
+import warnings
 from dataclasses import asdict, dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Annotated, Any
@@ -57,7 +58,7 @@ class FinalStatistics:
 class StatisticsState(BaseModel):
     """Statistic data about a crawler run."""
 
-    model_config = ConfigDict(populate_by_name=True, ser_json_inf_nan='constants')
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants')
     stats_id: Annotated[int | None, Field(alias='statsId')] = None
 
     requests_finished: Annotated[int, Field(alias='requestsFinished')] = 0
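The `populate_by_name` to `validate_by_name`/`validate_by_alias` switch above (also applied to the session models earlier in this diff) uses the newer Pydantic config flags. A minimal sketch, assuming Pydantic 2.11 or later; the `ExampleState` model is illustrative, not part of the package:

    from typing import Annotated
    from pydantic import BaseModel, ConfigDict, Field

    class ExampleState(BaseModel):
        # Accept both the Python field name and the camelCase alias on input.
        model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
        requests_finished: Annotated[int, Field(alias='requestsFinished')] = 0

    # Both spellings validate; serializing by alias still requires model_dump(by_alias=True).
    assert ExampleState(requests_finished=5).requests_finished == 5
    assert ExampleState(requestsFinished=5).requests_finished == 5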
@@ -76,7 +77,6 @@ class StatisticsState(BaseModel):
     crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None
     crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None
     crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None
-    crawler_runtime: Annotated[timedelta_ms, Field(alias='crawlerRuntimeMillis')] = timedelta()
     errors: dict[str, Any] = Field(default_factory=dict)
     retry_errors: dict[str, Any] = Field(alias='retryErrors', default_factory=dict)
     requests_with_status_code: dict[str, int] = Field(alias='requestsWithStatusCode', default_factory=dict)
@@ -93,6 +93,37 @@ class StatisticsState(BaseModel):
         ),
     ] = {}
 
+    # Used to track the crawler runtime, that had already been persisted. This is the runtime from previous runs.
+    _runtime_offset: Annotated[timedelta, Field(exclude=True)] = timedelta()
+
+    def model_post_init(self, /, __context: Any) -> None:
+        self._runtime_offset = self.crawler_runtime or self._runtime_offset
+
+    @property
+    def crawler_runtime(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
+    @crawler_runtime.setter
+    def crawler_runtime(self, value: timedelta) -> None:
+        # Setter for backwards compatibility only, the crawler_runtime is now computed_field, and cant be set manually.
+        # To be removed in v2 release https://github.com/apify/crawlee-python/issues/1567
+        warnings.warn(
+            f"Setting 'crawler_runtime' is deprecated and will be removed in a future version."
+            f' Value {value} will not be used.',
+            DeprecationWarning,
+            stacklevel=2,
+        )
+
+    @computed_field(alias='crawlerRuntimeMillis')
+    def crawler_runtime_for_serialization(self) -> timedelta:
+        if self.crawler_last_started_at:
+            finished_at = self.crawler_finished_at or datetime.now(timezone.utc)
+            return self._runtime_offset + finished_at - self.crawler_last_started_at
+        return self._runtime_offset
+
     @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms)  # type: ignore[prop-decorator]
     @property
     def request_total_duration(self) -> timedelta:
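A short worked example (hypothetical values, not from the diff) of how the new runtime bookkeeping behaves: runtime persisted by earlier runs becomes the offset, and the live runtime is that offset plus the time elapsed in the current run, exactly as the `crawler_runtime` property computes it:

    from datetime import datetime, timedelta, timezone

    runtime_offset = timedelta(minutes=5)        # restored from a previous run in model_post_init
    last_started_at = datetime.now(timezone.utc)  # set when the new run starts
    finished_at = last_started_at + timedelta(minutes=2)  # pretend the run ends two minutes later

    # Mirrors the property: offset + (finished_at or now) - last_started_at.
    crawler_runtime = runtime_offset + finished_at - last_started_at
    assert crawler_runtime == timedelta(minutes=7)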
crawlee/statistics/_statistics.py

@@ -1,6 +1,7 @@
 # Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/core/src/crawlers/statistics.ts
 from __future__ import annotations
 
+import asyncio
 import math
 import time
 from datetime import datetime, timedelta, timezone
@@ -17,8 +18,11 @@ from crawlee.statistics import FinalStatistics, StatisticsState
 from crawlee.statistics._error_tracker import ErrorTracker
 
 if TYPE_CHECKING:
+    from collections.abc import Callable, Coroutine
     from types import TracebackType
 
+    from crawlee.storages import KeyValueStore
+
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TNewStatisticsState = TypeVar('TNewStatisticsState', bound=StatisticsState, default=StatisticsState)
 logger = getLogger(__name__)
@@ -70,6 +74,7 @@ class Statistics(Generic[TStatisticsState]):
         persistence_enabled: bool | Literal['explicit_only'] = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -80,8 +85,6 @@
         self._id = Statistics.__next_id
         Statistics.__next_id += 1
 
-        self._instance_start: datetime | None = None
-
         self.error_tracker = ErrorTracker(
             save_error_snapshots=save_error_snapshots,
             snapshot_kvs_name=persist_state_kvs_name,
@@ -92,9 +95,10 @@
 
         self._state = RecoverableState(
             default_state=state_model(stats_id=self._id),
-            persist_state_key=persist_state_key or f'SDK_CRAWLER_STATISTICS_{self._id}',
+            persist_state_key=persist_state_key or f'__CRAWLER_STATISTICS_{self._id}',
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             logger=logger,
         )
 
@@ -110,8 +114,8 @@
         """Create near copy of the `Statistics` with replaced `state_model`."""
         new_statistics: Statistics[TNewStatisticsState] = Statistics(
             persistence_enabled=self._state._persistence_enabled,  # noqa: SLF001
-            persist_state_kvs_name=self._state._persist_state_kvs_name,  # noqa: SLF001
             persist_state_key=self._state._persist_state_key,  # noqa: SLF001
+            persist_state_kvs_factory=self._state._persist_state_kvs_factory,  # noqa: SLF001
             log_message=self._log_message,
             periodic_message_logger=self._periodic_message_logger,
             state_model=state_model,
@@ -125,6 +129,7 @@
         persistence_enabled: bool = False,
         persist_state_kvs_name: str | None = None,
         persist_state_key: str | None = None,
+        persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
@@ -136,6 +141,7 @@
             persistence_enabled=persistence_enabled,
             persist_state_kvs_name=persist_state_kvs_name,
             persist_state_key=persist_state_key,
+            persist_state_kvs_factory=persist_state_kvs_factory,
             log_message=log_message,
             periodic_message_logger=periodic_message_logger,
             log_interval=log_interval,
@@ -158,14 +164,17 @@
         if self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is already active.')
 
-        self._active = True
-        self._instance_start = datetime.now(timezone.utc)
-
         await self._state.initialize()
-        self._after_initialize()
+        # Reset `crawler_finished_at` to indicate a new run in progress.
+        self.state.crawler_finished_at = None
 
+        # Start periodic logging and let it print initial state before activation.
         self._periodic_logger.start()
+        await asyncio.sleep(0.01)
+        self._active = True
 
+        self.state.crawler_last_started_at = datetime.now(timezone.utc)
+        self.state.crawler_started_at = self.state.crawler_started_at or self.state.crawler_last_started_at
         return self
 
     async def __aexit__(
@@ -182,13 +191,14 @@
         if not self._active:
             raise RuntimeError(f'The {self.__class__.__name__} is not active.')
 
-        self._state.current_value.crawler_finished_at = datetime.now(timezone.utc)
-
-        await self._state.teardown()
+        if not self.state.crawler_last_started_at:
+            raise RuntimeError('Statistics.state.crawler_last_started_at not set.')
 
+        # Stop logging and deactivate the statistics to prevent further changes to crawler_runtime
         await self._periodic_logger.stop()
-
+        self.state.crawler_finished_at = datetime.now(timezone.utc)
         self._active = False
+        await self._state.teardown()
 
     @property
     def state(self) -> TStatisticsState:
@@ -247,11 +257,7 @@
 
     def calculate(self) -> FinalStatistics:
         """Calculate the current statistics."""
-        if self._instance_start is None:
-            raise RuntimeError('The Statistics object is not initialized')
-
-        crawler_runtime = datetime.now(timezone.utc) - self._instance_start
-        total_minutes = crawler_runtime.total_seconds() / 60
+        total_minutes = self.state.crawler_runtime.total_seconds() / 60
         state = self._state.current_value
         serialized_state = state.model_dump(by_alias=False)
 
@@ -262,7 +268,7 @@
             requests_failed_per_minute=math.floor(state.requests_failed / total_minutes) if total_minutes else 0,
             request_total_duration=state.request_total_finished_duration + state.request_total_failed_duration,
             requests_total=state.requests_failed + state.requests_finished,
-            crawler_runtime=crawler_runtime,
+            crawler_runtime=state.crawler_runtime,
             requests_finished=state.requests_finished,
             requests_failed=state.requests_failed,
             retry_histogram=serialized_state['request_retry_histogram'],
@@ -282,21 +288,6 @@
         else:
             self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())
 
-    def _after_initialize(self) -> None:
-        state = self._state.current_value
-
-        if state.crawler_started_at is None:
-            state.crawler_started_at = datetime.now(timezone.utc)
-
-        if state.stats_persisted_at is not None and state.crawler_last_started_at:
-            self._instance_start = datetime.now(timezone.utc) - (
-                state.stats_persisted_at - state.crawler_last_started_at
-            )
-        elif state.crawler_last_started_at:
-            self._instance_start = state.crawler_last_started_at
-
-        state.crawler_last_started_at = self._instance_start
-
     def _save_retry_count_for_request(self, record: RequestProcessingRecord) -> None:
         retry_count = record.retry_count
         state = self._state.current_value
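A minimal sketch of how the new persist_state_kvs_factory hook might be supplied, assuming crawlee's public KeyValueStore.open and Statistics.with_default_state APIs; the store name is illustrative and the factory is simply passed through to the RecoverableState shown above:

    from crawlee.statistics import Statistics
    from crawlee.storages import KeyValueStore

    async def open_statistics_kvs() -> KeyValueStore:
        # The factory coroutine returns the store in which statistics are persisted.
        return await KeyValueStore.open(name='my-statistics-store')

    statistics = Statistics.with_default_state(
        persistence_enabled=True,
        persist_state_kvs_factory=open_statistics_kvs,
    )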
crawlee/storage_clients/__init__.py

@@ -1,9 +1,25 @@
+from crawlee._utils.try_import import install_import_hook as _install_import_hook
+from crawlee._utils.try_import import try_import as _try_import
+
+# These imports have only mandatory dependencies, so they are imported directly.
 from ._base import StorageClient
 from ._file_system import FileSystemStorageClient
 from ._memory import MemoryStorageClient
 
+_install_import_hook(__name__)
+
+# The following imports are wrapped in try_import to handle optional dependencies,
+# ensuring the module can still function even if these dependencies are missing.
+with _try_import(__name__, 'SqlStorageClient'):
+    from ._sql import SqlStorageClient
+
+with _try_import(__name__, 'RedisStorageClient'):
+    from ._redis import RedisStorageClient
+
 __all__ = [
     'FileSystemStorageClient',
     'MemoryStorageClient',
+    'RedisStorageClient',
+    'SqlStorageClient',
     'StorageClient',
 ]
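The try_import wrappers make SqlStorageClient and RedisStorageClient importable only when their optional dependencies are installed. A hedged usage sketch; the extra name and the connection_string argument are assumptions, not taken from this diff:

    # e.g. pip install 'crawlee[sql]'  (extra name assumed)
    from crawlee import service_locator
    from crawlee.storage_clients import SqlStorageClient

    # Register the client globally so crawlers and storages resolve it by default.
    service_locator.set_storage_client(
        SqlStorageClient(connection_string='sqlite+aiosqlite:///crawlee.db')  # argument assumed
    )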
crawlee/storage_clients/_base/_storage_client.py

@@ -6,6 +6,8 @@ from typing import TYPE_CHECKING
 from crawlee._utils.docs import docs_group
 
 if TYPE_CHECKING:
+    from collections.abc import Hashable
+
     from crawlee.configuration import Configuration
 
     from ._dataset_client import DatasetClient
@@ -28,12 +30,21 @@ class StorageClient(ABC):
     (where applicable), and consistent access patterns across all storage types it supports.
     """
 
+    def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:  # noqa: ARG002
+        """Return a cache key that can differentiate between different storages of this and other clients.
+
+        Can be based on configuration or on the client itself. By default, returns a module and name of the client
+        class.
+        """
+        return f'{self.__class__.__module__}.{self.__class__.__name__}'
+
     @abstractmethod
     async def create_dataset_client(
         self,
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
     ) -> DatasetClient:
         """Create a dataset client."""
@@ -44,6 +55,7 @@
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
     ) -> KeyValueStoreClient:
         """Create a key-value store client."""
@@ -54,6 +66,7 @@
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
     ) -> RequestQueueClient:
         """Create a request queue client."""
crawlee/storage_clients/_file_system/_dataset_client.py

@@ -9,11 +9,12 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
 from pydantic import ValidationError
-from typing_extensions import override
+from typing_extensions import Self, override
 
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import atomic_write, json_dumps
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import DatasetClient
 from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
 
@@ -56,7 +57,7 @@ class FileSystemDatasetClient(DatasetClient):
         self,
         *,
         metadata: DatasetMetadata,
-        storage_dir: Path,
+        path_to_dataset: Path,
         lock: asyncio.Lock,
     ) -> None:
         """Initialize a new instance.
@@ -65,8 +66,8 @@ class FileSystemDatasetClient(DatasetClient):
         """
         self._metadata = metadata
 
-        self._storage_dir = storage_dir
-        """The base directory where the storage data are being persisted."""
+        self._path_to_dataset = path_to_dataset
+        """The full path to the dataset directory."""
 
         self._lock = lock
         """A lock to ensure that only one operation is performed at a time."""
@@ -78,10 +79,7 @@ class FileSystemDatasetClient(DatasetClient):
     @property
     def path_to_dataset(self) -> Path:
         """The full path to the dataset directory."""
-        if self._metadata.name is None:
-            return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
-
-        return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name
+        return self._path_to_dataset
 
     @property
     def path_to_metadata(self) -> Path:
@@ -94,8 +92,9 @@ class FileSystemDatasetClient(DatasetClient):
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
         configuration: Configuration,
-    ) -> FileSystemDatasetClient:
+    ) -> Self:
         """Open or create a file system dataset client.
 
         This method attempts to open an existing dataset from the file system. If a dataset with the specified ID
@@ -104,17 +103,21 @@ class FileSystemDatasetClient(DatasetClient):
 
         Args:
             id: The ID of the dataset to open. If provided, searches for existing dataset by ID.
-            name: The name of the dataset to open. If not provided, uses the default dataset.
+            name: The name of the dataset for named (global scope) storages.
+            alias: The alias of the dataset for unnamed (run scope) storages.
             configuration: The configuration object containing storage directory settings.
 
         Returns:
             An instance for the opened or created storage client.
 
         Raises:
-            ValueError: If a dataset with the specified ID is not found, or if metadata is invalid.
+            ValueError: If a dataset with the specified ID is not found, if metadata is invalid,
+                or if both name and alias are provided.
         """
-        storage_dir = Path(configuration.storage_dir)
-        dataset_base_path = storage_dir / cls._STORAGE_SUBDIR
+        # Validate input parameters.
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+        dataset_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
 
         if not dataset_base_path.exists():
            await asyncio.to_thread(dataset_base_path.mkdir, parents=True, exist_ok=True)
@@ -126,19 +129,19 @@ class FileSystemDatasetClient(DatasetClient):
                 if not dataset_dir.is_dir():
                     continue
 
-                metadata_path = dataset_dir / METADATA_FILENAME
-                if not metadata_path.exists():
+                path_to_metadata = dataset_dir / METADATA_FILENAME
+                if not path_to_metadata.exists():
                     continue
 
                 try:
-                    file = await asyncio.to_thread(metadata_path.open)
+                    file = await asyncio.to_thread(path_to_metadata.open)
                     try:
                         file_content = json.load(file)
                         metadata = DatasetMetadata(**file_content)
                         if metadata.id == id:
                             client = cls(
                                 metadata=metadata,
-                                storage_dir=storage_dir,
+                                path_to_dataset=dataset_base_path / dataset_dir,
                                 lock=asyncio.Lock(),
                             )
                             await client._update_metadata(update_accessed_at=True)
@@ -152,16 +155,15 @@ class FileSystemDatasetClient(DatasetClient):
             if not found:
                 raise ValueError(f'Dataset with ID "{id}" not found')
 
-        # Get a new instance by name.
+        # Get a new instance by name or alias.
         else:
-            dataset_path = (
-                dataset_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT if name is None else dataset_base_path / name
-            )
-            metadata_path = dataset_path / METADATA_FILENAME
+            dataset_dir = Path(name) if name else Path(alias) if alias else Path('default')
+            path_to_dataset = dataset_base_path / dataset_dir
+            path_to_metadata = path_to_dataset / METADATA_FILENAME
 
             # If the dataset directory exists, reconstruct the client from the metadata file.
-            if dataset_path.exists() and metadata_path.exists():
-                file = await asyncio.to_thread(open, metadata_path)
+            if path_to_dataset.exists() and path_to_metadata.exists():
+                file = await asyncio.to_thread(open, path_to_metadata)
                 try:
                     file_content = json.load(file)
                 finally:
@@ -169,11 +171,11 @@ class FileSystemDatasetClient(DatasetClient):
                 try:
                     metadata = DatasetMetadata(**file_content)
                 except ValidationError as exc:
-                    raise ValueError(f'Invalid metadata file for dataset "{name}"') from exc
+                    raise ValueError(f'Invalid metadata file for dataset "{name or alias}"') from exc
 
                 client = cls(
                     metadata=metadata,
-                    storage_dir=storage_dir,
+                    path_to_dataset=path_to_dataset,
                     lock=asyncio.Lock(),
                 )
 
@@ -192,7 +194,7 @@ class FileSystemDatasetClient(DatasetClient):
             )
             client = cls(
                 metadata=metadata,
-                storage_dir=storage_dir,
+                path_to_dataset=path_to_dataset,
                 lock=asyncio.Lock(),
             )
             await client._update_metadata()
crawlee/storage_clients/_file_system/_key_value_store_client.py

@@ -10,11 +10,12 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
 from pydantic import ValidationError
-from typing_extensions import override
+from typing_extensions import Self, override
 
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import atomic_write, infer_mime_type, json_dumps
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import KeyValueStoreClient
 from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
 
@@ -55,7 +56,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         self,
         *,
         metadata: KeyValueStoreMetadata,
-        storage_dir: Path,
+        path_to_kvs: Path,
         lock: asyncio.Lock,
     ) -> None:
         """Initialize a new instance.
@@ -64,8 +65,8 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         """
         self._metadata = metadata
 
-        self._storage_dir = storage_dir
-        """The base directory where the storage data are being persisted."""
+        self._path_to_kvs = path_to_kvs
+        """The full path to the key-value store directory."""
 
         self._lock = lock
         """A lock to ensure that only one operation is performed at a time."""
@@ -77,10 +78,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
     @property
     def path_to_kvs(self) -> Path:
         """The full path to the key-value store directory."""
-        if self._metadata.name is None:
-            return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
-
-        return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name
+        return self._path_to_kvs
 
     @property
     def path_to_metadata(self) -> Path:
@@ -93,8 +91,9 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
         configuration: Configuration,
-    ) -> FileSystemKeyValueStoreClient:
+    ) -> Self:
         """Open or create a file system key-value store client.
 
         This method attempts to open an existing key-value store from the file system. If a KVS with the specified
@@ -103,17 +102,21 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
 
         Args:
             id: The ID of the key-value store to open. If provided, searches for existing store by ID.
-            name: The name of the key-value store to open. If not provided, uses the default store.
+            name: The name of the key-value store for named (global scope) storages.
+            alias: The alias of the key-value store for unnamed (run scope) storages.
             configuration: The configuration object containing storage directory settings.
 
         Returns:
             An instance for the opened or created storage client.
 
         Raises:
-            ValueError: If a store with the specified ID is not found, or if metadata is invalid.
+            ValueError: If a store with the specified ID is not found, if metadata is invalid,
+                or if both name and alias are provided.
         """
-        storage_dir = Path(configuration.storage_dir)
-        kvs_base_path = storage_dir / cls._STORAGE_SUBDIR
+        # Validate input parameters.
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+        kvs_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
 
         if not kvs_base_path.exists():
             await asyncio.to_thread(kvs_base_path.mkdir, parents=True, exist_ok=True)
@@ -125,19 +128,19 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
                 if not kvs_dir.is_dir():
                     continue
 
-                metadata_path = kvs_dir / METADATA_FILENAME
-                if not metadata_path.exists():
+                path_to_metadata = kvs_dir / METADATA_FILENAME
+                if not path_to_metadata.exists():
                     continue
 
                 try:
-                    file = await asyncio.to_thread(metadata_path.open)
+                    file = await asyncio.to_thread(path_to_metadata.open)
                     try:
                         file_content = json.load(file)
                         metadata = KeyValueStoreMetadata(**file_content)
                         if metadata.id == id:
                             client = cls(
                                 metadata=metadata,
-                                storage_dir=storage_dir,
+                                path_to_kvs=kvs_base_path / kvs_dir,
                                 lock=asyncio.Lock(),
                             )
                             await client._update_metadata(update_accessed_at=True)
@@ -151,14 +154,15 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
             if not found:
                 raise ValueError(f'Key-value store with ID "{id}" not found.')
 
-        # Get a new instance by name.
+        # Get a new instance by name or alias.
         else:
-            kvs_path = kvs_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT if name is None else kvs_base_path / name
-            metadata_path = kvs_path / METADATA_FILENAME
+            kvs_dir = Path(name) if name else Path(alias) if alias else Path('default')
+            path_to_kvs = kvs_base_path / kvs_dir
+            path_to_metadata = path_to_kvs / METADATA_FILENAME
 
             # If the key-value store directory exists, reconstruct the client from the metadata file.
-            if kvs_path.exists() and metadata_path.exists():
-                file = await asyncio.to_thread(open, metadata_path)
+            if path_to_kvs.exists() and path_to_metadata.exists():
+                file = await asyncio.to_thread(open, path_to_metadata)
                 try:
                     file_content = json.load(file)
                 finally:
@@ -166,11 +170,11 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
                 try:
                     metadata = KeyValueStoreMetadata(**file_content)
                 except ValidationError as exc:
-                    raise ValueError(f'Invalid metadata file for key-value store "{name}"') from exc
+                    raise ValueError(f'Invalid metadata file for key-value store "{name or alias}"') from exc
 
                 client = cls(
                     metadata=metadata,
-                    storage_dir=storage_dir,
+                    path_to_kvs=path_to_kvs,
                     lock=asyncio.Lock(),
                 )
 
@@ -188,7 +192,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
             )
             client = cls(
                 metadata=metadata,
-                storage_dir=storage_dir,
+                path_to_kvs=path_to_kvs,
                 lock=asyncio.Lock(),
            )
             await client._update_metadata()
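The raise_if_too_many_kwargs helper imported by both file-system clients above lives in the new crawlee/_utils/raise_if_too_many_kwargs.py (12 lines, not shown in this diff). A plausible sketch, reconstructed only from the call sites raise_if_too_many_kwargs(id=id, name=name, alias=alias); the actual signature and message may differ:

    def raise_if_too_many_kwargs(**kwargs: object) -> None:
        """Raise if more than one of the given keyword arguments is set (not None)."""
        provided = [key for key, value in kwargs.items() if value is not None]
        if len(provided) > 1:
            raise ValueError(f'Only one of {", ".join(kwargs)} can be provided, got: {", ".join(provided)}.')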