crawlee-1.0.0rc1-py3-none-any.whl → crawlee-1.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +2 -1
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +76 -17
  5. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  6. crawlee/_utils/sitemap.py +3 -1
  7. crawlee/_utils/system.py +3 -3
  8. crawlee/browsers/_playwright_browser_controller.py +20 -14
  9. crawlee/configuration.py +1 -1
  10. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
  11. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  12. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  13. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +6 -2
  14. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  15. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
  16. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  17. crawlee/crawlers/_basic/_basic_crawler.py +107 -27
  18. crawlee/crawlers/_basic/_logging_utils.py +5 -1
  19. crawlee/crawlers/_playwright/_playwright_crawler.py +6 -1
  20. crawlee/events/_types.py +6 -6
  21. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  22. crawlee/fingerprint_suite/_types.py +2 -2
  23. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  24. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  25. crawlee/request_loaders/_request_list.py +1 -1
  26. crawlee/request_loaders/_request_loader.py +5 -1
  27. crawlee/request_loaders/_sitemap_request_loader.py +228 -48
  28. crawlee/sessions/_models.py +2 -2
  29. crawlee/statistics/_models.py +1 -1
  30. crawlee/storage_clients/__init__.py +12 -0
  31. crawlee/storage_clients/_base/_storage_client.py +13 -0
  32. crawlee/storage_clients/_file_system/_dataset_client.py +27 -25
  33. crawlee/storage_clients/_file_system/_key_value_store_client.py +27 -23
  34. crawlee/storage_clients/_file_system/_request_queue_client.py +84 -98
  35. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  36. crawlee/storage_clients/_file_system/_utils.py +0 -0
  37. crawlee/storage_clients/_memory/_dataset_client.py +14 -2
  38. crawlee/storage_clients/_memory/_key_value_store_client.py +14 -2
  39. crawlee/storage_clients/_memory/_request_queue_client.py +43 -12
  40. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  41. crawlee/storage_clients/_sql/__init__.py +6 -0
  42. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  43. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  44. crawlee/storage_clients/_sql/_db_models.py +269 -0
  45. crawlee/storage_clients/_sql/_key_value_store_client.py +299 -0
  46. crawlee/storage_clients/_sql/_request_queue_client.py +706 -0
  47. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  48. crawlee/storage_clients/_sql/py.typed +0 -0
  49. crawlee/storage_clients/models.py +10 -10
  50. crawlee/storages/_base.py +3 -1
  51. crawlee/storages/_dataset.py +9 -2
  52. crawlee/storages/_key_value_store.py +9 -2
  53. crawlee/storages/_request_queue.py +7 -2
  54. crawlee/storages/_storage_instance_manager.py +126 -72
  55. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/METADATA +12 -5
  56. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/RECORD +59 -49
  57. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/WHEEL +0 -0
  58. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/entry_points.txt +0 -0
  59. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/licenses/LICENSE +0 -0
crawlee/request_loaders/_sitemap_request_loader.py

@@ -1,19 +1,25 @@
  from __future__ import annotations
 
  import asyncio
+ from collections import deque
  from contextlib import suppress
  from logging import getLogger
- from typing import TYPE_CHECKING, Any
+ from typing import TYPE_CHECKING, Annotated, Any
+
+ from pydantic import BaseModel, ConfigDict, Field
+ from typing_extensions import override
 
  from crawlee import Request
  from crawlee._utils.docs import docs_group
  from crawlee._utils.globs import Glob
- from crawlee._utils.sitemap import ParseSitemapOptions, SitemapSource, SitemapUrl, parse_sitemap
+ from crawlee._utils.recoverable_state import RecoverableState
+ from crawlee._utils.sitemap import NestedSitemap, ParseSitemapOptions, SitemapSource, SitemapUrl, parse_sitemap
  from crawlee.request_loaders._request_loader import RequestLoader
 
  if TYPE_CHECKING:
      import re
      from collections.abc import Sequence
+     from types import TracebackType
 
      from crawlee.http_clients import HttpClient
      from crawlee.proxy_configuration import ProxyInfo
@@ -23,12 +29,72 @@ if TYPE_CHECKING:
  logger = getLogger(__name__)
 
 
+ class SitemapRequestLoaderState(BaseModel):
+     """State model for persisting sitemap request loader data.
+
+     The crawler processes one sitemap at a time. The current sitemap is stored in `in_progress_sitemap_url`.
+     The `parse_sitemap` function parses the sitemap and returns elements as an async iterator. Each element retrieved
+     from the iterator is processed based on its type. If the element is a `NestedSitemap`, its URL is added to
+     `pending_sitemap_urls` if it hasn't been processed yet (not in `processed_sitemap_urls`). If the element is a
+     `SitemapUrl`, the system checks whether it already exists in `current_sitemap_processed_urls`. If it exists,
+     the loader was restarted from a saved state and the URL is skipped.
+
+     If the URL is new, it is first added to `url_queue`, then to `current_sitemap_processed_urls`, and `total_count` is
+     incremented by 1. When all elements from the current sitemap iterator have been processed, `in_progress_sitemap_url`
+     is set to `None`, the sitemap URL is added to `processed_sitemap_urls`, and `current_sitemap_processed_urls` is
+     cleared. The next sitemap is retrieved from `pending_sitemap_urls`, skipping any URLs that already exist in
+     `processed_sitemap_urls`. If `pending_sitemap_urls` is empty, `completed` is set to `True`.
+
+     When `fetch_next_request` is called, a URL is extracted from `url_queue` and placed in `in_progress`.
+     When `mark_request_as_handled` is called for the extracted URL, it is removed from `in_progress` and
+     `handled_count` is incremented by 1.
+
+     During initial startup or restart after persistence, state validation occurs in `_get_state`. If both
+     `pending_sitemap_urls` and `in_progress_sitemap_url` are empty and `completed` is False, this indicates a
+     fresh start. In this case, `self._sitemap_urls` are moved to `pending_sitemap_urls`. Otherwise, the system is
+     restarting from a persisted state. If `in_progress` contains any URLs, they are moved back to `url_queue` and
+     `in_progress` is cleared.
+     """
+
+     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
+
+     url_queue: Annotated[deque[str], Field(alias='urlQueue')]
+     """Queue of URLs extracted from sitemaps and ready for processing."""
+
+     in_progress: Annotated[set[str], Field(alias='inProgress')] = set()
+     """Set of request URLs currently being processed."""
+
+     pending_sitemap_urls: Annotated[deque[str], Field(alias='pendingSitemapUrls')]
+     """Queue of sitemap URLs that need to be fetched and processed."""
+
+     in_progress_sitemap_url: Annotated[str | None, Field(alias='inProgressSitemapUrl')] = None
+     """The sitemap URL currently being processed."""
+
+     current_sitemap_processed_urls: Annotated[set[str], Field(alias='currentSitemapProcessedUrls')] = set()
+     """URLs from the current sitemap that have been added to the queue."""
+
+     processed_sitemap_urls: Annotated[set[str], Field(alias='processedSitemapUrls')] = set()
+     """Set of processed sitemap URLs."""
+
+     completed: Annotated[bool, Field(alias='sitemapCompleted')] = False
+     """Whether all sitemaps have been fully processed."""
+
+     total_count: Annotated[int, Field(alias='totalCount')] = 0
+     """Total number of URLs found and added to the queue from all processed sitemaps."""
+
+     handled_count: Annotated[int, Field(alias='handledCount')] = 0
+     """Number of URLs that have been successfully handled."""
+
+
  @docs_group('Request loaders')
  class SitemapRequestLoader(RequestLoader):
      """A request loader that reads URLs from sitemap(s).
 
      The loader fetches and parses sitemaps in the background, allowing crawling to start
      before all URLs are loaded. It supports filtering URLs using glob and regex patterns.
+
+     The loader supports state persistence, allowing it to resume from where it left off
+     after interruption when a `persist_state_key` is provided during initialization.
      """
 
      def __init__(
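
The persisted state is just this Pydantic model, so the record stored by `RecoverableState` follows the camelCase aliases above. A minimal round-trip sketch (my example, not package code; the exact on-disk record layout used by `RecoverableState` is an assumption):

from collections import deque

from crawlee.request_loaders._sitemap_request_loader import SitemapRequestLoaderState

# Build a state with one pending sitemap and one buffered URL.
state = SitemapRequestLoaderState(
    url_queue=deque(['https://example.com/page-1']),
    pending_sitemap_urls=deque(['https://example.com/sitemap.xml']),
)

# Serialized with aliases: {"urlQueue": [...], "pendingSitemapUrls": [...], ...}
payload = state.model_dump_json(by_alias=True)

# validate_by_alias=True lets the aliased payload be loaded straight back.
restored = SitemapRequestLoaderState.model_validate_json(payload)
assert restored.url_queue == deque(['https://example.com/page-1'])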
@@ -40,7 +106,7 @@ class SitemapRequestLoader(RequestLoader):
          include: list[re.Pattern[Any] | Glob] | None = None,
          exclude: list[re.Pattern[Any] | Glob] | None = None,
          max_buffer_size: int = 200,
-         parse_sitemap_options: ParseSitemapOptions | None = None,
+         persist_state_key: str | None = None,
      ) -> None:
          """Initialize the sitemap request loader.
 
@@ -50,27 +116,64 @@ class SitemapRequestLoader(RequestLoader):
              include: List of glob or regex patterns to include URLs.
              exclude: List of glob or regex patterns to exclude URLs.
              max_buffer_size: Maximum number of URLs to buffer in memory.
-             parse_sitemap_options: Options for parsing sitemaps, such as `SitemapSource` and `max_urls`.
              http_client: the instance of `HttpClient` to use for fetching sitemaps.
+             persist_state_key: A key for persisting the loader's state in the KeyValueStore.
+                 When provided, allows resuming from where it left off after interruption.
+                 If None, no state persistence occurs.
          """
          self._http_client = http_client
-
          self._sitemap_urls = sitemap_urls
          self._include = include
          self._exclude = exclude
          self._proxy_info = proxy_info
-         self._parse_sitemap_options = parse_sitemap_options or ParseSitemapOptions()
+         self._max_buffer_size = max_buffer_size
+
+         # Synchronization for queue operations
+         self._queue_has_capacity = asyncio.Event()
+         self._queue_has_capacity.set()
+         self._queue_lock = asyncio.Lock()
+
+         # Initialize recoverable state
+         self._state = RecoverableState(
+             default_state=SitemapRequestLoaderState(
+                 url_queue=deque(),
+                 pending_sitemap_urls=deque(),
+             ),
+             persistence_enabled=bool(persist_state_key),
+             persist_state_key=persist_state_key or '',
+             logger=logger,
+         )
+
+         # Start background loading
+         self._loading_task = asyncio.create_task(self._load_sitemaps())
 
-         self._handled_count = 0
-         self._total_count = 0
+     async def _get_state(self) -> SitemapRequestLoaderState:
+         """Initialize and return the current state."""
+         async with self._queue_lock:
+             if self._state.is_initialized:
+                 return self._state.current_value
 
-         # URL queue and tracking
-         self._url_queue: asyncio.Queue[str] = asyncio.Queue(maxsize=max_buffer_size)
-         self._in_progress: set[str] = set()
-         self._processed_urls: set[str] = set()
+             await self._state.initialize()
 
-         # Loading state
-         self._loading_task = asyncio.create_task(self._load_sitemaps())
+             # Initialize pending sitemaps on first run
+             has_sitemap_for_processing = (
+                 self._state.current_value.pending_sitemap_urls or self._state.current_value.in_progress_sitemap_url
+             )
+             if not has_sitemap_for_processing and not self._state.current_value.completed:
+                 self._state.current_value.pending_sitemap_urls.extend(self._sitemap_urls)
+
+             if self._state.current_value.in_progress:
+                 self._state.current_value.url_queue.extendleft(self._state.current_value.in_progress)
+                 self._state.current_value.in_progress.clear()
+
+             if (
+                 self._state.current_value.url_queue
+                 and len(self._state.current_value.url_queue) >= self._max_buffer_size
+             ):
+                 # Notify that the queue is full
+                 self._queue_has_capacity.clear()
+
+             return self._state.current_value
 
      def _check_url_patterns(
          self,
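
With `parse_sitemap_options` removed from the signature, a caller now opts into persistence via `persist_state_key`. A hedged construction sketch (mine, not package code; assumes the `httpx` extra so that `HttpxHttpClient` is importable, though any `HttpClient` implementation works):

import asyncio
import re

from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
    # The constructor schedules the background loading task, so it must run inside an event loop.
    loader = SitemapRequestLoader(
        ['https://crawlee.dev/sitemap.xml'],
        http_client=HttpxHttpClient(),
        include=[re.compile(r'.*/docs/.*')],
        max_buffer_size=100,
        persist_state_key='sitemap-loader-state',  # enables resuming after interruption
    )
    print('URLs discovered so far:', await loader.get_total_count())  # may still be 0 here
    await loader.close()


asyncio.run(main())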
@@ -105,73 +208,150 @@ class SitemapRequestLoader(RequestLoader):
      async def _load_sitemaps(self) -> None:
          """Load URLs from sitemaps in the background."""
          try:
-             async for item in parse_sitemap(
-                 [SitemapSource(type='url', url=url) for url in self._sitemap_urls],
-                 self._http_client,
-                 proxy_info=self._proxy_info,
-                 options=self._parse_sitemap_options,
-             ):
-                 # Only process URL items (not nested sitemaps)
-                 if isinstance(item, SitemapUrl):
-                     url = item.loc
-
-                     # Skip if already processed
-                     if url in self._processed_urls:
+             # Get actual state
+             while (state := await self._get_state()) and (state.pending_sitemap_urls or state.in_progress_sitemap_url):
+                 # Get sitemap URL for parsing
+                 sitemap_url = state.in_progress_sitemap_url
+                 if not sitemap_url:
+                     sitemap_url = state.pending_sitemap_urls.popleft()
+                     # Skip processed urls
+                     if sitemap_url in state.processed_sitemap_urls:
                          continue
-
-                     # Check if URL should be included
-                     if not self._check_url_patterns(url, self._include, self._exclude):
+                 state.in_progress_sitemap_url = sitemap_url
+
+                 parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True)
+
+                 async for item in parse_sitemap(
+                     [SitemapSource(type='url', url=sitemap_url)],
+                     self._http_client,
+                     proxy_info=self._proxy_info,
+                     options=parse_options,
+                 ):
+                     if isinstance(item, NestedSitemap):
+                         # Add nested sitemap to queue
+                         if item.loc not in state.pending_sitemap_urls and item.loc not in state.processed_sitemap_urls:
+                             state.pending_sitemap_urls.append(item.loc)
                          continue
 
-                     await self._url_queue.put(url)
-                     self._processed_urls.add(url)
-                     self._total_count += 1
+                     if isinstance(item, SitemapUrl):
+                         url = item.loc
+
+                         state = await self._get_state()
+
+                         # Skip if already processed
+                         if url in state.current_sitemap_processed_urls:
+                             continue
+
+                         # Check if URL should be included
+                         if not self._check_url_patterns(url, self._include, self._exclude):
+                             continue
+
+                         # Check if we have capacity in the queue
+                         await self._queue_has_capacity.wait()
+
+                         state = await self._get_state()
+                         async with self._queue_lock:
+                             state.url_queue.append(url)
+                             state.current_sitemap_processed_urls.add(url)
+                             state.total_count += 1
+                             if len(state.url_queue) >= self._max_buffer_size:
+                                 # Notify that the queue is full
+                                 self._queue_has_capacity.clear()
+
+                 # Clear current sitemap after processing
+                 state = await self._get_state()
+                 current_sitemap_url = state.in_progress_sitemap_url
+                 state.in_progress_sitemap_url = None
+                 if current_sitemap_url:
+                     state.processed_sitemap_urls.add(current_sitemap_url)
+                     state.current_sitemap_processed_urls.clear()
+
+             # Mark as completed after processing all sitemap urls
+             state.completed = True
 
          except Exception:
              logger.exception('Error loading sitemaps')
              raise
 
+     @override
      async def get_total_count(self) -> int:
          """Return the total number of URLs found so far."""
-         return self._total_count
+         state = await self._get_state()
+         return state.total_count
 
+     @override
+     async def get_handled_count(self) -> int:
+         """Return the number of URLs that have been handled."""
+         state = await self._get_state()
+         return state.handled_count
+
+     @override
      async def is_empty(self) -> bool:
          """Check if there are no more URLs to process."""
-         return self._url_queue.empty() and self._loading_task.done()
+         state = await self._get_state()
+         return not state.url_queue
 
+     @override
      async def is_finished(self) -> bool:
          """Check if all URLs have been processed."""
-         return self._url_queue.empty() and len(self._in_progress) == 0 and self._loading_task.done()
+         state = await self._get_state()
+         return not state.url_queue and len(state.in_progress) == 0 and self._loading_task.done()
 
+     @override
      async def fetch_next_request(self) -> Request | None:
          """Fetch the next request to process."""
-         while not (self._loading_task.done() and self._url_queue.empty()):
-             if self._url_queue.empty():
-                 await asyncio.sleep(0.5)
+         while not (await self.is_finished()):
+             state = await self._get_state()
+             if not state.url_queue:
+                 await asyncio.sleep(0.1)
                  continue
 
-             url = await self._url_queue.get()
+             async with self._queue_lock:
+                 url = state.url_queue.popleft()
+
+                 request = Request.from_url(url)
+                 state.in_progress.add(request.url)
+                 if len(state.url_queue) < self._max_buffer_size:
+                     self._queue_has_capacity.set()
 
-             request = Request.from_url(url)
-             self._in_progress.add(request.unique_key)
              return request
 
          return None
 
+     @override
      async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
          """Mark a request as successfully handled."""
-         if request.unique_key in self._in_progress:
-             self._in_progress.remove(request.unique_key)
-             self._handled_count += 1
+         state = await self._get_state()
+         if request.url in state.in_progress:
+             state.in_progress.remove(request.url)
+             state.handled_count += 1
          return None
 
-     async def get_handled_count(self) -> int:
-         """Return the number of handled requests."""
-         return self._handled_count
-
      async def abort_loading(self) -> None:
          """Abort the sitemap loading process."""
          if self._loading_task and not self._loading_task.done():
              self._loading_task.cancel()
              with suppress(asyncio.CancelledError):
                  await self._loading_task
+
+     async def start(self) -> None:
+         """Start the sitemap loading process."""
+         if self._loading_task and not self._loading_task.done():
+             return
+         self._loading_task = asyncio.create_task(self._load_sitemaps())
+
+     async def close(self) -> None:
+         """Close the request loader."""
+         await self.abort_loading()
+         await self._state.teardown()
+
+     async def __aenter__(self) -> SitemapRequestLoader:
+         """Enter the context manager."""
+         await self.start()
+         return self
+
+     async def __aexit__(
+         self, exc_type: type[BaseException] | None, exc_value: BaseException | None, exc_traceback: TracebackType | None
+     ) -> None:
+         """Exit the context manager."""
+         await self.close()
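
The loader is now an async context manager: `__aenter__` calls `start()` and `__aexit__` calls `close()`, which aborts loading and tears down the recoverable state. An end-to-end consumption sketch built only from the methods shown above (mine, not package code; `HttpxHttpClient` assumes the `httpx` extra):

import asyncio

from crawlee.http_clients import HttpxHttpClient
from crawlee.request_loaders import SitemapRequestLoader


async def main() -> None:
    async with SitemapRequestLoader(
        ['https://crawlee.dev/sitemap.xml'],
        http_client=HttpxHttpClient(),
        persist_state_key='sitemap-loader-state',
    ) as loader:
        while not await loader.is_finished():
            request = await loader.fetch_next_request()
            if request is None:
                break
            # ... process the request here ...
            await loader.mark_request_as_handled(request)
        print('Handled:', await loader.get_handled_count())


asyncio.run(main())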
crawlee/sessions/_models.py

@@ -20,7 +20,7 @@ from ._session import Session
  class SessionModel(BaseModel):
      """Model for a Session object."""
 
-     model_config = ConfigDict(populate_by_name=True)
+     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
      id: Annotated[str, Field(alias='id')]
      max_age: Annotated[timedelta, Field(alias='maxAge')]
@@ -38,7 +38,7 @@ class SessionModel(BaseModel):
  class SessionPoolModel(BaseModel):
      """Model for a SessionPool object."""
 
-     model_config = ConfigDict(populate_by_name=True)
+     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
 
      max_pool_size: Annotated[int, Field(alias='maxPoolSize')]
 
crawlee/statistics/_models.py

@@ -57,7 +57,7 @@ class FinalStatistics:
  class StatisticsState(BaseModel):
      """Statistic data about a crawler run."""
 
-     model_config = ConfigDict(populate_by_name=True, ser_json_inf_nan='constants')
+     model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants')
      stats_id: Annotated[int | None, Field(alias='statsId')] = None
 
      requests_finished: Annotated[int, Field(alias='requestsFinished')] = 0
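
These models switch from `populate_by_name=True` to the `validate_by_name=True, validate_by_alias=True` pair introduced in Pydantic 2.11, which accepts input under either the field name or its alias. A standalone illustration with a hypothetical model (mine, not package code):

from typing import Annotated

from pydantic import BaseModel, ConfigDict, Field


class StatsSketch(BaseModel):
    # Mirrors the config style now used by SessionModel, SessionPoolModel and StatisticsState.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    requests_finished: Annotated[int, Field(alias='requestsFinished')] = 0


# Both the camelCase alias and the snake_case field name validate.
assert StatsSketch(requestsFinished=3).requests_finished == 3
assert StatsSketch(requests_finished=3).requests_finished == 3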
crawlee/storage_clients/__init__.py

@@ -1,9 +1,21 @@
+ from crawlee._utils.try_import import install_import_hook as _install_import_hook
+ from crawlee._utils.try_import import try_import as _try_import
+
+ # These imports have only mandatory dependencies, so they are imported directly.
  from ._base import StorageClient
  from ._file_system import FileSystemStorageClient
  from ._memory import MemoryStorageClient
 
+ _install_import_hook(__name__)
+
+ # The following imports are wrapped in try_import to handle optional dependencies,
+ # ensuring the module can still function even if these dependencies are missing.
+ with _try_import(__name__, 'SqlStorageClient'):
+     from ._sql import SqlStorageClient
+
  __all__ = [
      'FileSystemStorageClient',
      'MemoryStorageClient',
+     'SqlStorageClient',
      'StorageClient',
  ]
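
The new `SqlStorageClient` is re-exported through crawlee's lazy import hook, so the name only fails when the optional SQL dependencies are missing. A consumer-side sketch (mine; it assumes the hook surfaces the failure as an `ImportError`, and the extra name in the comment is illustrative):

try:
    from crawlee.storage_clients import SqlStorageClient
except ImportError:
    # Optional SQL dependencies not installed (e.g. something like `pip install 'crawlee[sql]'`).
    SqlStorageClient = None

print('SQL storage client available:', SqlStorageClient is not None)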
crawlee/storage_clients/_base/_storage_client.py

@@ -6,6 +6,8 @@ from typing import TYPE_CHECKING
  from crawlee._utils.docs import docs_group
 
  if TYPE_CHECKING:
+     from collections.abc import Hashable
+
      from crawlee.configuration import Configuration
 
      from ._dataset_client import DatasetClient
@@ -28,12 +30,21 @@ class StorageClient(ABC):
      (where applicable), and consistent access patterns across all storage types it supports.
      """
 
+     def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:  # noqa: ARG002
+         """Return a cache key that can differentiate between different storages of this and other clients.
+
+         Can be based on configuration or on the client itself. By default, returns a module and name of the client
+         class.
+         """
+         return f'{self.__class__.__module__}.{self.__class__.__name__}'
+
      @abstractmethod
      async def create_dataset_client(
          self,
          *,
          id: str | None = None,
          name: str | None = None,
+         alias: str | None = None,
          configuration: Configuration | None = None,
      ) -> DatasetClient:
          """Create a dataset client."""
@@ -44,6 +55,7 @@ class StorageClient(ABC):
          *,
          id: str | None = None,
          name: str | None = None,
+         alias: str | None = None,
          configuration: Configuration | None = None,
      ) -> KeyValueStoreClient:
          """Create a key-value store client."""
@@ -54,6 +66,7 @@ class StorageClient(ABC):
          *,
          id: str | None = None,
          name: str | None = None,
+         alias: str | None = None,
          configuration: Configuration | None = None,
      ) -> RequestQueueClient:
          """Create a request queue client."""
crawlee/storage_clients/_file_system/_dataset_client.py

@@ -14,6 +14,7 @@ from typing_extensions import override
  from crawlee._consts import METADATA_FILENAME
  from crawlee._utils.crypto import crypto_random_object_id
  from crawlee._utils.file import atomic_write, json_dumps
+ from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
  from crawlee.storage_clients._base import DatasetClient
  from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
 
@@ -56,7 +57,7 @@ class FileSystemDatasetClient(DatasetClient):
          self,
          *,
          metadata: DatasetMetadata,
-         storage_dir: Path,
+         path_to_dataset: Path,
          lock: asyncio.Lock,
      ) -> None:
          """Initialize a new instance.
@@ -65,8 +66,8 @@ class FileSystemDatasetClient(DatasetClient):
          """
          self._metadata = metadata
 
-         self._storage_dir = storage_dir
-         """The base directory where the storage data are being persisted."""
+         self._path_to_dataset = path_to_dataset
+         """The full path to the dataset directory."""
 
          self._lock = lock
          """A lock to ensure that only one operation is performed at a time."""
@@ -78,10 +79,7 @@ class FileSystemDatasetClient(DatasetClient):
      @property
      def path_to_dataset(self) -> Path:
          """The full path to the dataset directory."""
-         if self._metadata.name is None:
-             return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
-
-         return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name
+         return self._path_to_dataset
 
      @property
      def path_to_metadata(self) -> Path:
@@ -94,6 +92,7 @@ class FileSystemDatasetClient(DatasetClient):
          *,
          id: str | None,
          name: str | None,
+         alias: str | None,
          configuration: Configuration,
      ) -> FileSystemDatasetClient:
          """Open or create a file system dataset client.
@@ -104,17 +103,21 @@ class FileSystemDatasetClient(DatasetClient):
 
          Args:
              id: The ID of the dataset to open. If provided, searches for existing dataset by ID.
-             name: The name of the dataset to open. If not provided, uses the default dataset.
+             name: The name of the dataset for named (global scope) storages.
+             alias: The alias of the dataset for unnamed (run scope) storages.
              configuration: The configuration object containing storage directory settings.
 
          Returns:
              An instance for the opened or created storage client.
 
          Raises:
-             ValueError: If a dataset with the specified ID is not found, or if metadata is invalid.
+             ValueError: If a dataset with the specified ID is not found, if metadata is invalid,
+                 or if both name and alias are provided.
          """
-         storage_dir = Path(configuration.storage_dir)
-         dataset_base_path = storage_dir / cls._STORAGE_SUBDIR
+         # Validate input parameters.
+         raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+         dataset_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
 
          if not dataset_base_path.exists():
              await asyncio.to_thread(dataset_base_path.mkdir, parents=True, exist_ok=True)
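
The helper `raise_if_too_many_kwargs` (a new 12-line utility module in this release) appears here only through its call site. A plausible sketch of its behaviour, written as an assumption rather than the actual implementation:

# Hypothetical re-implementation for illustration; the real helper lives in
# crawlee/_utils/raise_if_too_many_kwargs.py and may differ in signature and wording.
def raise_if_too_many_kwargs(**kwargs: object) -> None:
    """Raise ValueError when more than one of the given keyword arguments is not None."""
    provided = [key for key, value in kwargs.items() if value is not None]
    if len(provided) > 1:
        raise ValueError(f'Expected at most one of ({", ".join(kwargs)}), got: {", ".join(provided)}')


raise_if_too_many_kwargs(id=None, name='results', alias=None)  # passes
# raise_if_too_many_kwargs(id=None, name='results', alias='scratch')  # would raise ValueError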
@@ -126,19 +129,19 @@ class FileSystemDatasetClient(DatasetClient):
                  if not dataset_dir.is_dir():
                      continue
 
-                 metadata_path = dataset_dir / METADATA_FILENAME
-                 if not metadata_path.exists():
+                 path_to_metadata = dataset_dir / METADATA_FILENAME
+                 if not path_to_metadata.exists():
                      continue
 
                  try:
-                     file = await asyncio.to_thread(metadata_path.open)
+                     file = await asyncio.to_thread(path_to_metadata.open)
                      try:
                          file_content = json.load(file)
                          metadata = DatasetMetadata(**file_content)
                          if metadata.id == id:
                              client = cls(
                                  metadata=metadata,
-                                 storage_dir=storage_dir,
+                                 path_to_dataset=dataset_base_path / dataset_dir,
                                  lock=asyncio.Lock(),
                              )
                              await client._update_metadata(update_accessed_at=True)
@@ -152,16 +155,15 @@ class FileSystemDatasetClient(DatasetClient):
              if not found:
                  raise ValueError(f'Dataset with ID "{id}" not found')
 
-         # Get a new instance by name.
+         # Get a new instance by name or alias.
          else:
-             dataset_path = (
-                 dataset_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT if name is None else dataset_base_path / name
-             )
-             metadata_path = dataset_path / METADATA_FILENAME
+             dataset_dir = Path(name) if name else Path(alias) if alias else Path('default')
+             path_to_dataset = dataset_base_path / dataset_dir
+             path_to_metadata = path_to_dataset / METADATA_FILENAME
 
              # If the dataset directory exists, reconstruct the client from the metadata file.
-             if dataset_path.exists() and metadata_path.exists():
-                 file = await asyncio.to_thread(open, metadata_path)
+             if path_to_dataset.exists() and path_to_metadata.exists():
+                 file = await asyncio.to_thread(open, path_to_metadata)
                  try:
                      file_content = json.load(file)
                  finally:
@@ -169,11 +171,11 @@ class FileSystemDatasetClient(DatasetClient):
                  try:
                      metadata = DatasetMetadata(**file_content)
                  except ValidationError as exc:
-                     raise ValueError(f'Invalid metadata file for dataset "{name}"') from exc
+                     raise ValueError(f'Invalid metadata file for dataset "{name or alias}"') from exc
 
                  client = cls(
                      metadata=metadata,
-                     storage_dir=storage_dir,
+                     path_to_dataset=path_to_dataset,
                      lock=asyncio.Lock(),
                  )
 
@@ -192,7 +194,7 @@ class FileSystemDatasetClient(DatasetClient):
                  )
                  client = cls(
                      metadata=metadata,
-                     storage_dir=storage_dir,
+                     path_to_dataset=path_to_dataset,
                      lock=asyncio.Lock(),
                  )
                  await client._update_metadata()