crawlee 0.6.13b31__py3-none-any.whl → 1.1.1b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlee might be problematic. Click here for more details.

Files changed (82) hide show
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +34 -22
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +86 -33
  5. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  6. crawlee/_utils/recoverable_state.py +32 -8
  7. crawlee/_utils/recurring_task.py +15 -0
  8. crawlee/_utils/robots.py +17 -5
  9. crawlee/_utils/sitemap.py +1 -1
  10. crawlee/_utils/system.py +3 -3
  11. crawlee/_utils/urls.py +9 -2
  12. crawlee/browsers/_browser_pool.py +4 -1
  13. crawlee/browsers/_playwright_browser_controller.py +21 -15
  14. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  15. crawlee/browsers/_types.py +1 -1
  16. crawlee/configuration.py +2 -0
  17. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +6 -2
  18. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  19. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  20. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
  21. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  22. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
  23. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  24. crawlee/crawlers/_basic/_basic_crawler.py +124 -37
  25. crawlee/crawlers/_playwright/_playwright_crawler.py +17 -5
  26. crawlee/events/_event_manager.py +3 -1
  27. crawlee/events/_types.py +6 -6
  28. crawlee/fingerprint_suite/_header_generator.py +2 -2
  29. crawlee/fingerprint_suite/_types.py +2 -2
  30. crawlee/otel/crawler_instrumentor.py +3 -3
  31. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  32. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  33. crawlee/request_loaders/_request_list.py +1 -1
  34. crawlee/request_loaders/_sitemap_request_loader.py +23 -5
  35. crawlee/sessions/_models.py +2 -2
  36. crawlee/sessions/_session_pool.py +1 -1
  37. crawlee/statistics/_error_snapshotter.py +1 -1
  38. crawlee/statistics/_models.py +33 -2
  39. crawlee/statistics/_statistics.py +24 -33
  40. crawlee/storage_clients/__init__.py +16 -0
  41. crawlee/storage_clients/_base/_storage_client.py +13 -0
  42. crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
  43. crawlee/storage_clients/_file_system/_key_value_store_client.py +29 -25
  44. crawlee/storage_clients/_file_system/_request_queue_client.py +53 -34
  45. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  46. crawlee/storage_clients/_file_system/_utils.py +0 -0
  47. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  48. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  49. crawlee/storage_clients/_memory/_request_queue_client.py +16 -4
  50. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  51. crawlee/storage_clients/_redis/__init__.py +6 -0
  52. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  53. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  54. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  55. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  56. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  57. crawlee/storage_clients/_redis/_utils.py +23 -0
  58. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  59. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  60. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  61. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  62. crawlee/storage_clients/_redis/py.typed +0 -0
  63. crawlee/storage_clients/_sql/__init__.py +6 -0
  64. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  65. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  66. crawlee/storage_clients/_sql/_db_models.py +268 -0
  67. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  68. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  69. crawlee/storage_clients/_sql/_storage_client.py +291 -0
  70. crawlee/storage_clients/_sql/py.typed +0 -0
  71. crawlee/storage_clients/models.py +10 -10
  72. crawlee/storages/_base.py +5 -1
  73. crawlee/storages/_dataset.py +12 -2
  74. crawlee/storages/_key_value_store.py +17 -4
  75. crawlee/storages/_request_queue.py +10 -2
  76. crawlee/storages/_storage_instance_manager.py +133 -71
  77. crawlee/storages/_utils.py +11 -0
  78. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +17 -6
  79. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +82 -59
  80. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
  81. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
  82. {crawlee-0.6.13b31.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,291 @@
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ import warnings
5
+ from datetime import timedelta
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING
8
+
9
+ from sqlalchemy.exc import IntegrityError, OperationalError
10
+ from sqlalchemy.ext.asyncio import AsyncEngine, async_sessionmaker, create_async_engine
11
+ from sqlalchemy.sql import insert, select, text
12
+ from typing_extensions import override
13
+
14
+ from crawlee._utils.docs import docs_group
15
+ from crawlee.configuration import Configuration
16
+ from crawlee.storage_clients._base import StorageClient
17
+
18
+ from ._dataset_client import SqlDatasetClient
19
+ from ._db_models import Base, VersionDb
20
+ from ._key_value_store_client import SqlKeyValueStoreClient
21
+ from ._request_queue_client import SqlRequestQueueClient
22
+
23
+ if TYPE_CHECKING:
24
+ from types import TracebackType
25
+
26
+ from sqlalchemy.ext.asyncio import AsyncSession
27
+
28
+
29
@docs_group('Storage clients')
class SqlStorageClient(StorageClient):
    """SQL implementation of the storage client.

    This storage client provides access to datasets, key-value stores, and request queues that persist data
    to a SQL database using SQLAlchemy 2+. Each storage type uses two tables: one for metadata and one for
    records.

    The client accepts either a database connection string or a pre-configured AsyncEngine. If neither is
    provided, it creates a default SQLite database 'crawlee.db' in the storage directory.

    Database schema is automatically created during initialization. SQLite databases receive performance
    optimizations including WAL mode and increased cache size.

    Warning:
        This is an experimental feature. The behavior and interface may change in future versions.
    """

    _DEFAULT_DB_NAME = 'crawlee.db'
    """Default database name if not specified in connection string."""

    def __init__(
        self,
        *,
        connection_string: str | None = None,
        engine: AsyncEngine | None = None,
    ) -> None:
        """Initialize the SQL storage client.

        Args:
            connection_string: Database connection string (e.g., "sqlite+aiosqlite:///crawlee.db").
                If not provided, defaults to SQLite database in the storage directory.
            engine: Pre-configured AsyncEngine instance. If provided, connection_string is ignored.

        Raises:
            ValueError: If both `connection_string` and `engine` are provided.
        """
        if engine is not None and connection_string is not None:
            raise ValueError('Either connection_string or engine must be provided, not both.')

        self._connection_string = connection_string
        self._engine = engine
        self._initialized = False
        self.session_maker: None | async_sessionmaker[AsyncSession] = None

        # Minimum interval to reduce database load from frequent concurrent metadata updates.
        self._accessed_modified_update_interval = timedelta(seconds=1)

        # True only when neither an engine nor a connection string was supplied; the SQLite
        # PRAGMA optimizations in `initialize` are applied solely to that default database.
        self._default_flag = self._engine is None and self._connection_string is None
        self._dialect_name: str | None = None

        # Emit the experimental-feature notice once per client instance.
        warnings.warn(
            'The SqlStorageClient is experimental and may change or be removed in future releases.',
            category=UserWarning,
            stacklevel=2,
        )

    async def __aenter__(self) -> SqlStorageClient:
        """Async context manager entry."""
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Async context manager exit; disposes of the engine's connection pool."""
        await self.close()

    @property
    def engine(self) -> AsyncEngine:
        """Get the SQLAlchemy AsyncEngine instance.

        Raises:
            ValueError: If the engine has not been created yet (call `initialize` first).
        """
        if self._engine is None:
            raise ValueError('Engine is not initialized. Call initialize() before accessing the engine.')
        return self._engine

    def get_dialect_name(self) -> str | None:
        """Get the database dialect name ('sqlite' or 'postgresql'), or None before initialization."""
        return self._dialect_name

    def get_accessed_modified_update_interval(self) -> timedelta:
        """Get the minimum interval between accessed/modified metadata timestamp updates."""
        return self._accessed_modified_update_interval

    async def initialize(self, configuration: Configuration) -> None:
        """Initialize the database schema.

        This method creates all necessary tables if they don't exist.
        Should be called before using the storage client.

        Args:
            configuration: Configuration used to locate the storage directory for the default
                SQLite database.

        Raises:
            ValueError: If the database dialect is not SQLite or PostgreSQL.
        """
        if self._initialized:
            return

        engine = self._get_or_create_engine(configuration)
        async with engine.begin() as conn:
            self._dialect_name = engine.dialect.name

            if self._dialect_name not in ('sqlite', 'postgresql'):
                raise ValueError(
                    f'Unsupported database dialect: {self._dialect_name}. Supported: sqlite, postgresql. '
                    'Consider using a different database.',
                )

            # Create tables if they don't exist.
            # Roll back the transaction when an exception occurs — this is most likely a race
            # between several parallel processes creating the schema at the same time.
            try:
                # Set SQLite pragmas for performance and consistency (default database only).
                if self._default_flag:
                    await conn.execute(text('PRAGMA journal_mode=WAL'))  # Better concurrency
                    await conn.execute(text('PRAGMA synchronous=NORMAL'))  # Balanced safety/speed
                    await conn.execute(text('PRAGMA cache_size=100000'))  # 100MB cache
                    await conn.execute(text('PRAGMA temp_store=MEMORY'))  # Memory temp storage
                    await conn.execute(text('PRAGMA mmap_size=268435456'))  # 256MB memory mapping
                    await conn.execute(text('PRAGMA foreign_keys=ON'))  # Enforce constraints
                    await conn.execute(text('PRAGMA busy_timeout=30000'))  # 30s busy timeout

                await conn.run_sync(Base.metadata.create_all, checkfirst=True)

                from crawlee import __version__  # noqa: PLC0415

                # Select the version *column* (not the ORM entity) so the value compares
                # correctly against the `__version__` string below.
                db_version = (await conn.execute(select(VersionDb.version))).scalar_one_or_none()

                # Warn if the schema was created by a different library version — a newer
                # version may have introduced breaking changes in the database schema.
                if db_version and db_version != __version__:
                    warnings.warn(
                        f'Database version {db_version} does not match library version {__version__}. '
                        'This may lead to unexpected behavior. Drop the db if you want to make sure that '
                        'everything will work fine.',
                        category=UserWarning,
                        stacklevel=2,
                    )
                elif not db_version:
                    await conn.execute(insert(VersionDb).values(version=__version__))

            except (IntegrityError, OperationalError):
                await conn.rollback()

        self._initialized = True

    async def close(self) -> None:
        """Close the database connection pool."""
        if self._engine is not None:
            await self._engine.dispose()
            self._engine = None

    def create_session(self) -> AsyncSession:
        """Create a new database session.

        Returns:
            A new AsyncSession instance.

        Raises:
            ValueError: If the engine has not been initialized yet.
        """
        if self.session_maker is None:
            # Go through the `engine` property so a missing engine raises a clear error here
            # instead of silently producing an unbound sessionmaker that fails later.
            self.session_maker = async_sessionmaker(self.engine, expire_on_commit=False, autoflush=False)
        return self.session_maker()

    @override
    async def create_dataset_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> SqlDatasetClient:
        configuration = configuration or Configuration.get_global_configuration()
        await self.initialize(configuration)

        client = await SqlDatasetClient.open(
            id=id,
            name=name,
            alias=alias,
            storage_client=self,
        )

        await self._purge_if_needed(client, configuration)
        return client

    @override
    async def create_kvs_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> SqlKeyValueStoreClient:
        configuration = configuration or Configuration.get_global_configuration()
        await self.initialize(configuration)

        client = await SqlKeyValueStoreClient.open(
            id=id,
            name=name,
            alias=alias,
            storage_client=self,
        )

        await self._purge_if_needed(client, configuration)
        return client

    @override
    async def create_rq_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> SqlRequestQueueClient:
        configuration = configuration or Configuration.get_global_configuration()
        await self.initialize(configuration)

        client = await SqlRequestQueueClient.open(
            id=id,
            name=name,
            alias=alias,
            storage_client=self,
        )

        await self._purge_if_needed(client, configuration)
        return client

    def _get_or_create_engine(self, configuration: Configuration) -> AsyncEngine:
        """Get or create the database engine based on configuration.

        Raises:
            ValueError: If the connection string targets an unsupported database, or if
                PostgreSQL is requested on Python 3.14+ (asyncpg limitation).
        """
        if self._engine is not None:
            return self._engine

        if self._connection_string is not None:
            connection_string = self._connection_string
        else:
            # Create SQLite database in the storage directory.
            storage_dir = Path(configuration.storage_dir)
            if not storage_dir.exists():
                storage_dir.mkdir(parents=True, exist_ok=True)

            db_path = storage_dir / self._DEFAULT_DB_NAME

            # Create connection string with path to default database.
            connection_string = f'sqlite+aiosqlite:///{db_path}'

        if 'sqlite' not in connection_string and 'postgresql' not in connection_string:
            raise ValueError(
                'Unsupported database. Supported: sqlite, postgresql. Consider using a different database.'
            )

        # TODO: https://github.com/apify/crawlee-python/issues/1555
        if 'postgresql' in connection_string and sys.version_info >= (3, 14):
            raise ValueError(
                'SqlStorageClient cannot use PostgreSQL with Python 3.14 '
                'due to asyncpg compatibility limitations. '
                'Please use Python 3.13 or earlier, or switch to SQLite.'
            )

        self._engine = create_async_engine(
            connection_string,
            future=True,
            pool_size=5,
            max_overflow=10,
            pool_timeout=30,
            pool_recycle=600,
            pool_pre_ping=True,
            echo=False,
            connect_args={'timeout': 30},
        )
        return self._engine
File without changes
@@ -20,7 +20,7 @@ class StorageMetadata(BaseModel):
20
20
  It contains common fields shared across all specific storage types.
21
21
  """
22
22
 
23
- model_config = ConfigDict(populate_by_name=True, extra='allow')
23
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, extra='allow', from_attributes=True)
24
24
 
25
25
  id: Annotated[str, Field(alias='id')]
26
26
  """The unique identifier of the storage."""
@@ -42,7 +42,7 @@ class StorageMetadata(BaseModel):
42
42
  class DatasetMetadata(StorageMetadata):
43
43
  """Model for a dataset metadata."""
44
44
 
45
- model_config = ConfigDict(populate_by_name=True)
45
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
46
46
 
47
47
  item_count: Annotated[int, Field(alias='itemCount')]
48
48
  """The number of items in the dataset."""
@@ -52,14 +52,14 @@ class DatasetMetadata(StorageMetadata):
52
52
  class KeyValueStoreMetadata(StorageMetadata):
53
53
  """Model for a key-value store metadata."""
54
54
 
55
- model_config = ConfigDict(populate_by_name=True)
55
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
56
56
 
57
57
 
58
58
  @docs_group('Storage data')
59
59
  class RequestQueueMetadata(StorageMetadata):
60
60
  """Model for a request queue metadata."""
61
61
 
62
- model_config = ConfigDict(populate_by_name=True)
62
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
63
63
 
64
64
  had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')]
65
65
  """Indicates whether the queue has been accessed by multiple clients (consumers)."""
@@ -78,7 +78,7 @@ class RequestQueueMetadata(StorageMetadata):
78
78
  class KeyValueStoreRecordMetadata(BaseModel):
79
79
  """Model for a key-value store record metadata."""
80
80
 
81
- model_config = ConfigDict(populate_by_name=True)
81
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
82
82
 
83
83
  key: Annotated[str, Field(alias='key')]
84
84
  """The key of the record.
@@ -100,7 +100,7 @@ class KeyValueStoreRecordMetadata(BaseModel):
100
100
  class KeyValueStoreRecord(KeyValueStoreRecordMetadata, Generic[KvsValueType]):
101
101
  """Model for a key-value store record."""
102
102
 
103
- model_config = ConfigDict(populate_by_name=True)
103
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
104
104
 
105
105
  value: Annotated[KvsValueType, Field(alias='value')]
106
106
  """The value of the record."""
@@ -110,7 +110,7 @@ class KeyValueStoreRecord(KeyValueStoreRecordMetadata, Generic[KvsValueType]):
110
110
  class DatasetItemsListPage(BaseModel):
111
111
  """Model for a single page of dataset items returned from a collection list method."""
112
112
 
113
- model_config = ConfigDict(populate_by_name=True)
113
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
114
114
 
115
115
  count: Annotated[int, Field(default=0)]
116
116
  """The number of objects returned on this page."""
@@ -135,7 +135,7 @@ class DatasetItemsListPage(BaseModel):
135
135
  class ProcessedRequest(BaseModel):
136
136
  """Represents a processed request."""
137
137
 
138
- model_config = ConfigDict(populate_by_name=True)
138
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
139
139
 
140
140
  id: Annotated[str | None, Field(alias='requestId', default=None)] = None
141
141
  """Internal representation of the request by the storage client. Only some clients use id."""
@@ -149,7 +149,7 @@ class ProcessedRequest(BaseModel):
149
149
  class UnprocessedRequest(BaseModel):
150
150
  """Represents an unprocessed request."""
151
151
 
152
- model_config = ConfigDict(populate_by_name=True)
152
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
153
153
 
154
154
  unique_key: Annotated[str, Field(alias='uniqueKey')]
155
155
  url: Annotated[str, BeforeValidator(validate_http_url), Field()]
@@ -165,7 +165,7 @@ class AddRequestsResponse(BaseModel):
165
165
  encountered issues during processing.
166
166
  """
167
167
 
168
- model_config = ConfigDict(populate_by_name=True)
168
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
169
169
 
170
170
  processed_requests: Annotated[list[ProcessedRequest], Field(alias='processedRequests')]
171
171
  """Successfully processed requests, including information about whether they were
crawlee/storages/_base.py CHANGED
@@ -36,6 +36,7 @@ class Storage(ABC):
36
36
  *,
37
37
  id: str | None = None,
38
38
  name: str | None = None,
39
+ alias: str | None = None,
39
40
  configuration: Configuration | None = None,
40
41
  storage_client: StorageClient | None = None,
41
42
  ) -> Storage:
@@ -43,7 +44,10 @@ class Storage(ABC):
43
44
 
44
45
  Args:
45
46
  id: The storage ID.
46
- name: The storage name.
47
+ name: The storage name (global scope, persists across runs). Name can only contain letters "a" through "z",
48
+ the digits "0" through "9", and the hyphen ("-") but only in the middle of the string
49
+ (e.g. "my-value-1").
50
+ alias: The storage alias (run scope, creates unnamed storage).
47
51
  configuration: Configuration object used during the storage creation or restoration process.
48
52
  storage_client: Underlying storage client to use. If not provided, the default global storage client
49
53
  from the service locator will be used.
@@ -12,6 +12,7 @@ from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
12
12
 
13
13
  from ._base import Storage
14
14
  from ._key_value_store import KeyValueStore
15
+ from ._utils import validate_storage_name
15
16
 
16
17
  if TYPE_CHECKING:
17
18
  from collections.abc import AsyncIterator
@@ -75,6 +76,8 @@ class Dataset(Storage):
75
76
  id: The unique identifier of the storage.
76
77
  name: The name of the storage, if available.
77
78
  """
79
+ validate_storage_name(name)
80
+
78
81
  self._client = client
79
82
  self._id = id
80
83
  self._name = name
@@ -100,18 +103,25 @@ class Dataset(Storage):
100
103
  *,
101
104
  id: str | None = None,
102
105
  name: str | None = None,
106
+ alias: str | None = None,
103
107
  configuration: Configuration | None = None,
104
108
  storage_client: StorageClient | None = None,
105
109
  ) -> Dataset:
106
110
  configuration = service_locator.get_configuration() if configuration is None else configuration
107
111
  storage_client = service_locator.get_storage_client() if storage_client is None else storage_client
108
112
 
113
+ client_opener_coro = storage_client.create_dataset_client(
114
+ id=id, name=name, alias=alias, configuration=configuration
115
+ )
116
+ storage_client_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)
117
+
109
118
  return await service_locator.storage_instance_manager.open_storage_instance(
110
119
  cls,
111
120
  id=id,
112
121
  name=name,
113
- configuration=configuration,
114
- client_opener=storage_client.create_dataset_client,
122
+ alias=alias,
123
+ client_opener_coro=client_opener_coro,
124
+ storage_client_cache_key=storage_client_cache_key,
115
125
  )
116
126
 
117
127
  @override
@@ -15,6 +15,7 @@ from crawlee._utils.recoverable_state import RecoverableState
15
15
  from crawlee.storage_clients.models import KeyValueStoreMetadata
16
16
 
17
17
  from ._base import Storage
18
+ from ._utils import validate_storage_name
18
19
 
19
20
  if TYPE_CHECKING:
20
21
  from collections.abc import AsyncIterator
@@ -84,6 +85,8 @@ class KeyValueStore(Storage):
84
85
  id: The unique identifier of the storage.
85
86
  name: The name of the storage, if available.
86
87
  """
88
+ validate_storage_name(name)
89
+
87
90
  self._client = client
88
91
  self._id = id
89
92
  self._name = name
@@ -112,18 +115,25 @@ class KeyValueStore(Storage):
112
115
  *,
113
116
  id: str | None = None,
114
117
  name: str | None = None,
118
+ alias: str | None = None,
115
119
  configuration: Configuration | None = None,
116
120
  storage_client: StorageClient | None = None,
117
121
  ) -> KeyValueStore:
118
122
  configuration = service_locator.get_configuration() if configuration is None else configuration
119
123
  storage_client = service_locator.get_storage_client() if storage_client is None else storage_client
120
124
 
125
+ client_opener_coro = storage_client.create_kvs_client(
126
+ id=id, name=name, alias=alias, configuration=configuration
127
+ )
128
+ additional_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)
129
+
121
130
  return await service_locator.storage_instance_manager.open_storage_instance(
122
131
  cls,
123
132
  id=id,
124
133
  name=name,
125
- configuration=configuration,
126
- client_opener=storage_client.create_kvs_client,
134
+ alias=alias,
135
+ client_opener_coro=client_opener_coro,
136
+ storage_client_cache_key=additional_cache_key,
127
137
  )
128
138
 
129
139
  @override
@@ -271,11 +281,14 @@ class KeyValueStore(Storage):
271
281
  if key in cache:
272
282
  return cache[key].current_value.root
273
283
 
284
+ async def kvs_factory() -> KeyValueStore:
285
+ return self
286
+
274
287
  cache[key] = recoverable_state = RecoverableState(
275
288
  default_state=AutosavedValue(default_value),
276
- persistence_enabled=True,
277
- persist_state_kvs_id=self.id,
278
289
  persist_state_key=key,
290
+ persistence_enabled=True,
291
+ persist_state_kvs_factory=kvs_factory,
279
292
  logger=logger,
280
293
  )
281
294
 
@@ -13,6 +13,7 @@ from crawlee._utils.wait import wait_for_all_tasks_for_finish
13
13
  from crawlee.request_loaders import RequestManager
14
14
 
15
15
  from ._base import Storage
16
+ from ._utils import validate_storage_name
16
17
 
17
18
  if TYPE_CHECKING:
18
19
  from collections.abc import Sequence
@@ -80,6 +81,8 @@ class RequestQueue(Storage, RequestManager):
80
81
  id: The unique identifier of the storage.
81
82
  name: The name of the storage, if available.
82
83
  """
84
+ validate_storage_name(name)
85
+
83
86
  self._client = client
84
87
  self._id = id
85
88
  self._name = name
@@ -118,18 +121,23 @@ class RequestQueue(Storage, RequestManager):
118
121
  *,
119
122
  id: str | None = None,
120
123
  name: str | None = None,
124
+ alias: str | None = None,
121
125
  configuration: Configuration | None = None,
122
126
  storage_client: StorageClient | None = None,
123
127
  ) -> RequestQueue:
124
128
  configuration = service_locator.get_configuration() if configuration is None else configuration
125
129
  storage_client = service_locator.get_storage_client() if storage_client is None else storage_client
126
130
 
131
+ client_opener_coro = storage_client.create_rq_client(id=id, name=name, alias=alias, configuration=configuration)
132
+ additional_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)
133
+
127
134
  return await service_locator.storage_instance_manager.open_storage_instance(
128
135
  cls,
129
136
  id=id,
130
137
  name=name,
131
- configuration=configuration,
132
- client_opener=storage_client.create_rq_client,
138
+ alias=alias,
139
+ client_opener_coro=client_opener_coro,
140
+ storage_client_cache_key=additional_cache_key,
133
141
  )
134
142
 
135
143
  @override