crawlee 1.0.0rc1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +2 -1
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +76 -17
  5. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  6. crawlee/_utils/sitemap.py +3 -1
  7. crawlee/_utils/system.py +3 -3
  8. crawlee/browsers/_playwright_browser_controller.py +20 -14
  9. crawlee/configuration.py +1 -1
  10. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
  11. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  12. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  13. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +6 -2
  14. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  15. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
  16. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  17. crawlee/crawlers/_basic/_basic_crawler.py +107 -27
  18. crawlee/crawlers/_basic/_logging_utils.py +5 -1
  19. crawlee/crawlers/_playwright/_playwright_crawler.py +6 -1
  20. crawlee/events/_types.py +6 -6
  21. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  22. crawlee/fingerprint_suite/_types.py +2 -2
  23. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  24. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  25. crawlee/request_loaders/_request_list.py +1 -1
  26. crawlee/request_loaders/_request_loader.py +5 -1
  27. crawlee/request_loaders/_sitemap_request_loader.py +228 -48
  28. crawlee/sessions/_models.py +2 -2
  29. crawlee/statistics/_models.py +1 -1
  30. crawlee/storage_clients/__init__.py +12 -0
  31. crawlee/storage_clients/_base/_storage_client.py +13 -0
  32. crawlee/storage_clients/_file_system/_dataset_client.py +27 -25
  33. crawlee/storage_clients/_file_system/_key_value_store_client.py +27 -23
  34. crawlee/storage_clients/_file_system/_request_queue_client.py +84 -98
  35. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  36. crawlee/storage_clients/_file_system/_utils.py +0 -0
  37. crawlee/storage_clients/_memory/_dataset_client.py +14 -2
  38. crawlee/storage_clients/_memory/_key_value_store_client.py +14 -2
  39. crawlee/storage_clients/_memory/_request_queue_client.py +43 -12
  40. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  41. crawlee/storage_clients/_sql/__init__.py +6 -0
  42. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  43. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  44. crawlee/storage_clients/_sql/_db_models.py +269 -0
  45. crawlee/storage_clients/_sql/_key_value_store_client.py +299 -0
  46. crawlee/storage_clients/_sql/_request_queue_client.py +706 -0
  47. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  48. crawlee/storage_clients/_sql/py.typed +0 -0
  49. crawlee/storage_clients/models.py +10 -10
  50. crawlee/storages/_base.py +3 -1
  51. crawlee/storages/_dataset.py +9 -2
  52. crawlee/storages/_key_value_store.py +9 -2
  53. crawlee/storages/_request_queue.py +7 -2
  54. crawlee/storages/_storage_instance_manager.py +126 -72
  55. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/METADATA +12 -5
  56. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/RECORD +59 -49
  57. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/WHEEL +0 -0
  58. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/entry_points.txt +0 -0
  59. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/licenses/LICENSE +0 -0
crawlee/storage_clients/_file_system/_key_value_store_client.py

@@ -15,6 +15,7 @@ from typing_extensions import override
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import atomic_write, infer_mime_type, json_dumps
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import KeyValueStoreClient
 from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
 
@@ -55,7 +56,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         self,
         *,
         metadata: KeyValueStoreMetadata,
-        storage_dir: Path,
+        path_to_kvs: Path,
         lock: asyncio.Lock,
     ) -> None:
         """Initialize a new instance.
@@ -64,8 +65,8 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         """
         self._metadata = metadata
 
-        self._storage_dir = storage_dir
-        """The base directory where the storage data are being persisted."""
+        self._path_to_kvs = path_to_kvs
+        """The full path to the key-value store directory."""
 
         self._lock = lock
         """A lock to ensure that only one operation is performed at a time."""
@@ -77,10 +78,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
     @property
     def path_to_kvs(self) -> Path:
         """The full path to the key-value store directory."""
-        if self._metadata.name is None:
-            return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
-
-        return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name
+        return self._path_to_kvs
 
     @property
     def path_to_metadata(self) -> Path:
@@ -93,6 +91,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
         configuration: Configuration,
     ) -> FileSystemKeyValueStoreClient:
         """Open or create a file system key-value store client.
@@ -103,17 +102,21 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
 
         Args:
             id: The ID of the key-value store to open. If provided, searches for existing store by ID.
-            name: The name of the key-value store to open. If not provided, uses the default store.
+            name: The name of the key-value store for named (global scope) storages.
+            alias: The alias of the key-value store for unnamed (run scope) storages.
             configuration: The configuration object containing storage directory settings.
 
         Returns:
             An instance for the opened or created storage client.
 
         Raises:
-            ValueError: If a store with the specified ID is not found, or if metadata is invalid.
+            ValueError: If a store with the specified ID is not found, if metadata is invalid,
+                or if both name and alias are provided.
         """
-        storage_dir = Path(configuration.storage_dir)
-        kvs_base_path = storage_dir / cls._STORAGE_SUBDIR
+        # Validate input parameters.
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+        kvs_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
 
         if not kvs_base_path.exists():
             await asyncio.to_thread(kvs_base_path.mkdir, parents=True, exist_ok=True)
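The helper `raise_if_too_many_kwargs` comes from the new 12-line module `crawlee/_utils/raise_if_too_many_kwargs.py` (entry 5 in the file list), whose body is not included in this diff. Judging only from its call sites and the updated `Raises:` docstrings, it should reject calls where more than one of the given keyword arguments is set; a minimal sketch under that assumption, not the released code:

from typing import Any


def raise_if_too_many_kwargs(**kwargs: Any) -> None:
    # Hypothetical sketch: allow at most one non-None keyword argument.
    provided = [key for key, value in kwargs.items() if value is not None]
    if len(provided) > 1:
        raise ValueError(f'Only one of ({", ".join(provided)}) can be specified.')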
@@ -125,19 +128,19 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
             if not kvs_dir.is_dir():
                 continue
 
-            metadata_path = kvs_dir / METADATA_FILENAME
-            if not metadata_path.exists():
+            path_to_metadata = kvs_dir / METADATA_FILENAME
+            if not path_to_metadata.exists():
                 continue
 
             try:
-                file = await asyncio.to_thread(metadata_path.open)
+                file = await asyncio.to_thread(path_to_metadata.open)
                 try:
                     file_content = json.load(file)
                     metadata = KeyValueStoreMetadata(**file_content)
                     if metadata.id == id:
                         client = cls(
                             metadata=metadata,
-                            storage_dir=storage_dir,
+                            path_to_kvs=kvs_base_path / kvs_dir,
                             lock=asyncio.Lock(),
                         )
                         await client._update_metadata(update_accessed_at=True)
@@ -151,14 +154,15 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
            if not found:
                raise ValueError(f'Key-value store with ID "{id}" not found.')
 
-        # Get a new instance by name.
+        # Get a new instance by name or alias.
         else:
-            kvs_path = kvs_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT if name is None else kvs_base_path / name
-            metadata_path = kvs_path / METADATA_FILENAME
+            kvs_dir = Path(name) if name else Path(alias) if alias else Path('default')
+            path_to_kvs = kvs_base_path / kvs_dir
+            path_to_metadata = path_to_kvs / METADATA_FILENAME
 
             # If the key-value store directory exists, reconstruct the client from the metadata file.
-            if kvs_path.exists() and metadata_path.exists():
-                file = await asyncio.to_thread(open, metadata_path)
+            if path_to_kvs.exists() and path_to_metadata.exists():
+                file = await asyncio.to_thread(open, path_to_metadata)
                 try:
                     file_content = json.load(file)
                 finally:
@@ -166,11 +170,11 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
                 try:
                     metadata = KeyValueStoreMetadata(**file_content)
                 except ValidationError as exc:
-                    raise ValueError(f'Invalid metadata file for key-value store "{name}"') from exc
+                    raise ValueError(f'Invalid metadata file for key-value store "{name or alias}"') from exc
 
                 client = cls(
                     metadata=metadata,
-                    storage_dir=storage_dir,
+                    path_to_kvs=path_to_kvs,
                     lock=asyncio.Lock(),
                 )
 
@@ -188,7 +192,7 @@ class FileSystemKeyValueStoreClient(KeyValueStoreClient):
             )
             client = cls(
                 metadata=metadata,
-                storage_dir=storage_dir,
+                path_to_kvs=path_to_kvs,
                 lock=asyncio.Lock(),
             )
             await client._update_metadata()
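With this change, `open()` resolves the storage directory up front, from `name`, then `alias`, then the `'default'` fallback, and passes the fully resolved `path_to_kvs` to the constructor instead of re-deriving it from metadata. A self-contained illustration of that resolution (the base path below is an assumed example; the real one comes from `configuration.storage_dir` and `cls._STORAGE_SUBDIR`):

from __future__ import annotations

from pathlib import Path


def resolve_storage_dir(base: Path, name: str | None, alias: str | None) -> Path:
    # Mirrors the expression in FileSystemKeyValueStoreClient.open().
    return base / (Path(name) if name else Path(alias) if alias else Path('default'))


base = Path('./storage/key_value_stores')  # assumed layout for illustration
assert resolve_storage_dir(base, 'products', None) == base / 'products'
assert resolve_storage_dir(base, None, 'scratch') == base / 'scratch'
assert resolve_storage_dir(base, None, None) == base / 'default'

Note that a name and an alias with the same string map to the same directory under this scheme; the `raise_if_too_many_kwargs` guard ensures a single call never supplies both.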
crawlee/storage_clients/_file_system/_request_queue_client.py

@@ -17,6 +17,7 @@ from crawlee import Request
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import atomic_write, json_dumps
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee._utils.recoverable_state import RecoverableState
 from crawlee.storage_clients._base import RequestQueueClient
 from crawlee.storage_clients.models import (
@@ -89,7 +90,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         self,
         *,
         metadata: RequestQueueMetadata,
-        storage_dir: Path,
+        path_to_rq: Path,
         lock: asyncio.Lock,
     ) -> None:
         """Initialize a new instance.
@@ -98,8 +99,8 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         """
         self._metadata = metadata
 
-        self._storage_dir = storage_dir
-        """The base directory where the storage data are being persisted."""
+        self._path_to_rq = path_to_rq
+        """The full path to the request queue directory."""
 
         self._lock = lock
         """A lock to ensure that only one operation is performed at a time."""
@@ -115,9 +116,8 @@ class FileSystemRequestQueueClient(RequestQueueClient):
 
         self._state = RecoverableState[RequestQueueState](
             default_state=RequestQueueState(),
-            persist_state_key='request_queue_state',
+            persist_state_key=f'__RQ_STATE_{self._metadata.id}',
             persistence_enabled=True,
-            persist_state_kvs_name=f'__RQ_STATE_{self._metadata.id}',
             logger=logger,
         )
         """Recoverable state to maintain request ordering, in-progress status, and handled status."""
@@ -129,10 +129,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
     @property
     def path_to_rq(self) -> Path:
         """The full path to the request queue directory."""
-        if self._metadata.name is None:
-            return self._storage_dir / self._STORAGE_SUBDIR / self._STORAGE_SUBSUBDIR_DEFAULT
-
-        return self._storage_dir / self._STORAGE_SUBDIR / self._metadata.name
+        return self._path_to_rq
 
     @property
     def path_to_metadata(self) -> Path:
@@ -145,6 +142,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
         configuration: Configuration,
     ) -> FileSystemRequestQueueClient:
         """Open or create a file system request queue client.
@@ -155,17 +153,21 @@ class FileSystemRequestQueueClient(RequestQueueClient):
 
         Args:
             id: The ID of the request queue to open. If provided, searches for existing queue by ID.
-            name: The name of the request queue to open. If not provided, uses the default queue.
+            name: The name of the request queue for named (global scope) storages.
+            alias: The alias of the request queue for unnamed (run scope) storages.
             configuration: The configuration object containing storage directory settings.
 
         Returns:
             An instance for the opened or created storage client.
 
         Raises:
-            ValueError: If a queue with the specified ID is not found, or if metadata is invalid.
+            ValueError: If a queue with the specified ID is not found, if metadata is invalid,
+                or if both name and alias are provided.
         """
-        storage_dir = Path(configuration.storage_dir)
-        rq_base_path = storage_dir / cls._STORAGE_SUBDIR
+        # Validate input parameters.
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+        rq_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR
 
         if not rq_base_path.exists():
             await asyncio.to_thread(rq_base_path.mkdir, parents=True, exist_ok=True)
@@ -177,12 +179,12 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             if not rq_dir.is_dir():
                 continue
 
-            metadata_path = rq_dir / METADATA_FILENAME
-            if not metadata_path.exists():
+            path_to_metadata = rq_dir / METADATA_FILENAME
+            if not path_to_metadata.exists():
                 continue
 
             try:
-                file = await asyncio.to_thread(metadata_path.open)
+                file = await asyncio.to_thread(path_to_metadata.open)
                 try:
                     file_content = json.load(file)
                     metadata = RequestQueueMetadata(**file_content)
@@ -190,7 +192,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                     if metadata.id == id:
                         client = cls(
                             metadata=metadata,
-                            storage_dir=storage_dir,
+                            path_to_rq=rq_base_path / rq_dir,
                             lock=asyncio.Lock(),
                         )
                         await client._state.initialize()
@@ -206,14 +208,15 @@ class FileSystemRequestQueueClient(RequestQueueClient):
            if not found:
                raise ValueError(f'Request queue with ID "{id}" not found')
 
-        # Open an existing RQ by its name, or create a new one if not found.
+        # Open an existing RQ by its name or alias, or create a new one if not found.
         else:
-            rq_path = rq_base_path / cls._STORAGE_SUBSUBDIR_DEFAULT if name is None else rq_base_path / name
-            metadata_path = rq_path / METADATA_FILENAME
+            rq_dir = Path(name) if name else Path(alias) if alias else Path('default')
+            path_to_rq = rq_base_path / rq_dir
+            path_to_metadata = path_to_rq / METADATA_FILENAME
 
             # If the RQ directory exists, reconstruct the client from the metadata file.
-            if rq_path.exists() and metadata_path.exists():
-                file = await asyncio.to_thread(open, metadata_path)
+            if path_to_rq.exists() and path_to_metadata.exists():
+                file = await asyncio.to_thread(open, path_to_metadata)
                 try:
                     file_content = json.load(file)
                 finally:
@@ -221,13 +224,11 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 try:
                     metadata = RequestQueueMetadata(**file_content)
                 except ValidationError as exc:
-                    raise ValueError(f'Invalid metadata file for request queue "{name}"') from exc
-
-                metadata.name = name
+                    raise ValueError(f'Invalid metadata file for request queue "{name or alias}"') from exc
 
                 client = cls(
                     metadata=metadata,
-                    storage_dir=storage_dir,
+                    path_to_rq=path_to_rq,
                     lock=asyncio.Lock(),
                 )
 
@@ -251,7 +252,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
             )
             client = cls(
                 metadata=metadata,
-                storage_dir=storage_dir,
+                path_to_rq=path_to_rq,
                 lock=asyncio.Lock(),
             )
             await client._state.initialize()
@@ -312,28 +313,43 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         unprocessed_requests = list[UnprocessedRequest]()
         state = self._state.current_value
 
-        # Prepare a dictionary to track existing requests by their unique keys.
-        existing_unique_keys: dict[str, Path] = {}
-        existing_request_files = await self._get_request_files(self.path_to_rq)
+        all_requests = state.forefront_requests | state.regular_requests
 
-        for request_file in existing_request_files:
-            existing_request = await self._parse_request_file(request_file)
-            if existing_request is not None:
-                existing_unique_keys[existing_request.unique_key] = request_file
+        requests_to_enqueue = {}
 
-        # Process each request in the batch.
+        # Determine which requests can be added or are modified.
         for request in requests:
-            existing_request_file = existing_unique_keys.get(request.unique_key)
-            existing_request = None
-
-            # Only load the full request from disk if we found a duplicate
-            if existing_request_file is not None:
-                existing_request = await self._parse_request_file(existing_request_file)
+            # Check if the request has already been handled.
+            if request.unique_key in state.handled_requests:
+                processed_requests.append(
+                    ProcessedRequest(
+                        unique_key=request.unique_key,
+                        was_already_present=True,
+                        was_already_handled=True,
+                    )
+                )
+            # Check if the request is already in progress.
+            # Or if the request is already in the queue and the `forefront` flag is not used, we do not change the
+            # position of the request.
+            elif (request.unique_key in state.in_progress_requests) or (
+                request.unique_key in all_requests and not forefront
+            ):
+                processed_requests.append(
+                    ProcessedRequest(
+                        unique_key=request.unique_key,
+                        was_already_present=True,
+                        was_already_handled=False,
+                    )
+                )
+            # These requests must either be added or update their position.
+            else:
+                requests_to_enqueue[request.unique_key] = request
 
-            # If there is no existing request with the same unique key, add the new request.
-            if existing_request is None:
+        # Process each request in the batch.
+        for request in requests_to_enqueue.values():
+            # If the request is not already in the RQ, this is a new request.
+            if request.unique_key not in all_requests:
                 request_path = self._get_request_path(request.unique_key)
-
                 # Add sequence number to ensure FIFO ordering using state.
                 if forefront:
                     sequence_number = state.forefront_sequence_counter
@@ -352,9 +368,6 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                 new_total_request_count += 1
                 new_pending_request_count += 1
 
-                # Add to our index for subsequent requests in this batch
-                existing_unique_keys[request.unique_key] = self._get_request_path(request.unique_key)
-
                 processed_requests.append(
                     ProcessedRequest(
                         unique_key=request.unique_key,
@@ -363,57 +376,33 @@ class FileSystemRequestQueueClient(RequestQueueClient):
                     )
                 )
 
-            # If the request already exists in the RQ, just update it if needed.
-            else:
-                # Set the processed request flags.
-                was_already_present = existing_request is not None
-                was_already_handled = existing_request.unique_key in state.handled_requests
-
-                # If the request is already in the RQ and handled, just continue with the next one.
-                if was_already_present and was_already_handled:
-                    processed_requests.append(
-                        ProcessedRequest(
-                            unique_key=request.unique_key,
-                            was_already_present=True,
-                            was_already_handled=True,
-                        )
-                    )
-
-                # If the request is already in the RQ but not handled yet, update it.
-                elif was_already_present and not was_already_handled:
-                    # Update request type (forefront vs regular) in state
-                    if forefront:
-                        # Move from regular to forefront if needed
-                        if existing_request.unique_key in state.regular_requests:
-                            state.regular_requests.pop(existing_request.unique_key)
-                        if existing_request.unique_key not in state.forefront_requests:
-                            state.forefront_requests[existing_request.unique_key] = state.forefront_sequence_counter
-                            state.forefront_sequence_counter += 1
-                    elif (
-                        existing_request.unique_key not in state.forefront_requests
-                        and existing_request.unique_key not in state.regular_requests
-                    ):
-                        # Keep as regular if not already forefront
-                        state.regular_requests[existing_request.unique_key] = state.sequence_counter
-                        state.sequence_counter += 1
-
-                    processed_requests.append(
-                        ProcessedRequest(
-                            unique_key=request.unique_key,
-                            was_already_present=True,
-                            was_already_handled=False,
-                        )
-                    )
-
-                else:
-                    logger.warning(f'Request with unique key "{request.unique_key}" could not be processed.')
-                    unprocessed_requests.append(
-                        UnprocessedRequest(
-                            unique_key=request.unique_key,
-                            url=request.url,
-                            method=request.method,
-                        )
-                    )
+            # If the request already exists in the RQ, use the forefront flag to update its position.
+            elif forefront:
+                # If the request is among `regular`, remove it from its current position.
+                if request.unique_key in state.regular_requests:
+                    state.regular_requests.pop(request.unique_key)
+
+                # If the request is already in `forefront`, we just need to update its position.
+                state.forefront_requests[request.unique_key] = state.forefront_sequence_counter
+                state.forefront_sequence_counter += 1
+
+                processed_requests.append(
+                    ProcessedRequest(
+                        unique_key=request.unique_key,
+                        was_already_present=True,
+                        was_already_handled=False,
+                    )
+                )
+
+            else:
+                logger.warning(f'Request with unique key "{request.unique_key}" could not be processed.')
+                unprocessed_requests.append(
+                    UnprocessedRequest(
+                        unique_key=request.unique_key,
+                        url=request.url,
+                        method=request.method,
+                    )
+                )
 
         await self._update_metadata(
             update_modified_at=True,
@@ -752,10 +741,7 @@ class FileSystemRequestQueueClient(RequestQueueClient):
         files = await asyncio.to_thread(list, path_to_rq.glob('*.json'))
 
         # Filter out metadata file and non-file entries.
-        filtered = filter(
-            lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME,
-            files,
-        )
+        filtered = filter(lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME, files)
 
         return list(filtered)
 
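The rewritten `add_batch_of_requests` no longer lists and parses request files on disk to detect duplicates; each incoming request is classified purely against the recoverable in-memory state. A simplified, self-contained sketch of that triage (the `State` class below is a stand-in for the real `RequestQueueState` model, whose exact field types are not shown in this diff):

from dataclasses import dataclass, field


@dataclass
class State:
    handled_requests: set[str] = field(default_factory=set)
    in_progress_requests: set[str] = field(default_factory=set)
    forefront_requests: dict[str, int] = field(default_factory=dict)
    regular_requests: dict[str, int] = field(default_factory=dict)


def triage(unique_key: str, state: State, *, forefront: bool) -> str:
    all_requests = state.forefront_requests | state.regular_requests
    if unique_key in state.handled_requests:
        return 'already handled'  # reported, nothing to write
    if unique_key in state.in_progress_requests or (unique_key in all_requests and not forefront):
        return 'already present'  # position unchanged
    return 'enqueue or move'  # new request, or existing one promoted to forefront


state = State(handled_requests={'a'}, regular_requests={'b': 0})
assert triage('a', state, forefront=False) == 'already handled'
assert triage('b', state, forefront=False) == 'already present'
assert triage('b', state, forefront=True) == 'enqueue or move'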
crawlee/storage_clients/_file_system/_storage_client.py

@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 from typing_extensions import override
 
 from crawlee._utils.docs import docs_group
@@ -10,6 +12,9 @@ from ._dataset_client import FileSystemDatasetClient
 from ._key_value_store_client import FileSystemKeyValueStoreClient
 from ._request_queue_client import FileSystemRequestQueueClient
 
+if TYPE_CHECKING:
+    from collections.abc import Hashable
+
 
 @docs_group('Storage clients')
 class FileSystemStorageClient(StorageClient):
@@ -29,16 +34,22 @@ class FileSystemStorageClient(StorageClient):
     Use it only when running a single crawler process at a time.
     """
 
+    @override
+    def get_storage_client_cache_key(self, configuration: Configuration) -> Hashable:
+        # Even different client instances should return same storage if the storage_dir is the same.
+        return super().get_storage_client_cache_key(configuration), configuration.storage_dir
+
     @override
     async def create_dataset_client(
         self,
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
     ) -> FileSystemDatasetClient:
         configuration = configuration or Configuration.get_global_configuration()
-        client = await FileSystemDatasetClient.open(id=id, name=name, configuration=configuration)
+        client = await FileSystemDatasetClient.open(id=id, name=name, alias=alias, configuration=configuration)
         await self._purge_if_needed(client, configuration)
         return client
 
@@ -48,10 +59,11 @@ class FileSystemStorageClient(StorageClient):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
     ) -> FileSystemKeyValueStoreClient:
         configuration = configuration or Configuration.get_global_configuration()
-        client = await FileSystemKeyValueStoreClient.open(id=id, name=name, configuration=configuration)
+        client = await FileSystemKeyValueStoreClient.open(id=id, name=name, alias=alias, configuration=configuration)
         await self._purge_if_needed(client, configuration)
         return client
 
@@ -61,9 +73,10 @@ class FileSystemStorageClient(StorageClient):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
     ) -> FileSystemRequestQueueClient:
         configuration = configuration or Configuration.get_global_configuration()
-        client = await FileSystemRequestQueueClient.open(id=id, name=name, configuration=configuration)
+        client = await FileSystemRequestQueueClient.open(id=id, name=name, alias=alias, configuration=configuration)
         await self._purge_if_needed(client, configuration)
         return client
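The `get_storage_client_cache_key` override extends the base key with `configuration.storage_dir`, so two separately constructed file-system clients pointing at the same directory map to the same cached storage instances. A usage sketch, assuming the base implementation keys on the client type rather than on object identity, and that `Configuration` accepts `storage_dir` as a keyword:

from crawlee.configuration import Configuration
from crawlee.storage_clients import FileSystemStorageClient

config = Configuration(storage_dir='./my_storage')  # example directory

client_a = FileSystemStorageClient()
client_b = FileSystemStorageClient()

# Distinct instances, same storage_dir -> equal cache keys, so the storage
# instance manager can hand back the same Dataset/KeyValueStore/RequestQueue.
assert client_a.get_storage_client_cache_key(config) == client_b.get_storage_client_cache_key(config)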
crawlee/storage_clients/_file_system/_utils.py (file without changes)
crawlee/storage_clients/_memory/_dataset_client.py

@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any
 from typing_extensions import override
 
 from crawlee._utils.crypto import crypto_random_object_id
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import DatasetClient
 from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
 
@@ -53,6 +54,7 @@ class MemoryDatasetClient(DatasetClient):
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
     ) -> MemoryDatasetClient:
         """Open or create a new memory dataset client.
 
@@ -60,14 +62,24 @@ class MemoryDatasetClient(DatasetClient):
         datasets don't check for existing datasets with the same name or ID since all data exists only in memory
         and is lost when the process terminates.
 
+        Alias does not have any effect on the memory storage client implementation, because unnamed storages
+        are supported by default, since data are not persisted.
+
         Args:
             id: The ID of the dataset. If not provided, a random ID will be generated.
-            name: The name of the dataset. If not provided, the dataset will be unnamed.
+            name: The name of the dataset for named (global scope) storages.
+            alias: The alias of the dataset for unnamed (run scope) storages.
 
         Returns:
             An instance for the opened or created storage client.
+
+        Raises:
+            ValueError: If both name and alias are provided, or if neither id, name, nor alias is provided.
         """
-        # Otherwise create a new dataset
+        # Validate input parameters.
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+        # Create a new dataset
         dataset_id = id or crypto_random_object_id()
         now = datetime.now(timezone.utc)
crawlee/storage_clients/_memory/_key_value_store_client.py

@@ -8,6 +8,7 @@ from typing_extensions import override
 
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.file import infer_mime_type
+from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
 from crawlee.storage_clients._base import KeyValueStoreClient
 from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
 
@@ -51,6 +52,7 @@ class MemoryKeyValueStoreClient(KeyValueStoreClient):
         *,
         id: str | None,
         name: str | None,
+        alias: str | None,
     ) -> MemoryKeyValueStoreClient:
         """Open or create a new memory key-value store client.
 
@@ -58,14 +60,24 @@ class MemoryKeyValueStoreClient(KeyValueStoreClient):
         memory KVS don't check for existing stores with the same name or ID since all data exists only in memory
         and is lost when the process terminates.
 
+        Alias does not have any effect on the memory storage client implementation, because unnamed storages
+        are supported by default, since data are not persisted.
+
         Args:
             id: The ID of the key-value store. If not provided, a random ID will be generated.
-            name: The name of the key-value store. If not provided, the store will be unnamed.
+            name: The name of the key-value store for named (global scope) storages.
+            alias: The alias of the key-value store for unnamed (run scope) storages.
 
         Returns:
             An instance for the opened or created storage client.
+
+        Raises:
+            ValueError: If both name and alias are provided.
         """
-        # Otherwise create a new key-value store
+        # Validate input parameters.
+        raise_if_too_many_kwargs(id=id, name=name, alias=alias)
+
+        # Create a new key-value store
         store_id = id or crypto_random_object_id()
         now = datetime.now(timezone.utc)