crawlee 0.6.13b17__py3-none-any.whl → 1.1.2b7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlee might be problematic. Click here for more details.

Files changed (102) hide show
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +35 -33
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +106 -34
  5. crawlee/_utils/context.py +2 -2
  6. crawlee/_utils/file.py +7 -0
  7. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  8. crawlee/_utils/recoverable_state.py +32 -8
  9. crawlee/_utils/recurring_task.py +17 -1
  10. crawlee/_utils/requests.py +0 -26
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +4 -2
  13. crawlee/_utils/system.py +3 -3
  14. crawlee/_utils/time.py +120 -0
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +4 -1
  17. crawlee/browsers/_playwright_browser_controller.py +21 -15
  18. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  19. crawlee/browsers/_types.py +1 -1
  20. crawlee/configuration.py +2 -0
  21. crawlee/crawlers/__init__.py +2 -1
  22. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  23. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +50 -12
  24. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  25. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  28. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  29. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  30. crawlee/crawlers/_basic/_basic_crawler.py +219 -126
  31. crawlee/crawlers/_basic/_logging_utils.py +5 -1
  32. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +60 -11
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/events/_event_manager.py +4 -4
  39. crawlee/events/_types.py +6 -6
  40. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/fingerprint_suite/_types.py +2 -2
  43. crawlee/http_clients/_base.py +4 -0
  44. crawlee/http_clients/_curl_impersonate.py +12 -0
  45. crawlee/http_clients/_httpx.py +16 -6
  46. crawlee/http_clients/_impit.py +25 -10
  47. crawlee/otel/crawler_instrumentor.py +3 -3
  48. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  49. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  50. crawlee/request_loaders/_request_list.py +3 -3
  51. crawlee/request_loaders/_request_loader.py +5 -1
  52. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  53. crawlee/sessions/_models.py +2 -2
  54. crawlee/sessions/_session_pool.py +1 -1
  55. crawlee/statistics/_error_snapshotter.py +1 -1
  56. crawlee/statistics/_models.py +43 -4
  57. crawlee/statistics/_statistics.py +24 -33
  58. crawlee/storage_clients/__init__.py +16 -0
  59. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  60. crawlee/storage_clients/_base/_storage_client.py +13 -0
  61. crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
  62. crawlee/storage_clients/_file_system/_key_value_store_client.py +30 -26
  63. crawlee/storage_clients/_file_system/_request_queue_client.py +169 -153
  64. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  65. crawlee/storage_clients/_file_system/_utils.py +0 -0
  66. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  67. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  68. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  69. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  70. crawlee/storage_clients/_redis/__init__.py +6 -0
  71. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  72. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  73. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  74. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  75. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  76. crawlee/storage_clients/_redis/_utils.py +23 -0
  77. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  78. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  79. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  80. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  81. crawlee/storage_clients/_redis/py.typed +0 -0
  82. crawlee/storage_clients/_sql/__init__.py +6 -0
  83. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  84. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  85. crawlee/storage_clients/_sql/_db_models.py +268 -0
  86. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  87. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  88. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  89. crawlee/storage_clients/_sql/py.typed +0 -0
  90. crawlee/storage_clients/models.py +13 -11
  91. crawlee/storages/_base.py +5 -1
  92. crawlee/storages/_dataset.py +12 -2
  93. crawlee/storages/_key_value_store.py +17 -4
  94. crawlee/storages/_request_queue.py +13 -5
  95. crawlee/storages/_storage_instance_manager.py +133 -71
  96. crawlee/storages/_utils.py +11 -0
  97. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +18 -6
  98. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +101 -78
  99. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
  100. crawlee/_utils/measure_time.py +0 -31
  101. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
  102. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,282 @@
1
+ from __future__ import annotations
2
+
3
+ import warnings
4
+ from datetime import timedelta
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING
7
+
8
+ from sqlalchemy.exc import IntegrityError, OperationalError
9
+ from sqlalchemy.ext.asyncio import AsyncEngine, async_sessionmaker, create_async_engine
10
+ from sqlalchemy.sql import insert, select, text
11
+ from typing_extensions import override
12
+
13
+ from crawlee._utils.docs import docs_group
14
+ from crawlee.configuration import Configuration
15
+ from crawlee.storage_clients._base import StorageClient
16
+
17
+ from ._dataset_client import SqlDatasetClient
18
+ from ._db_models import Base, VersionDb
19
+ from ._key_value_store_client import SqlKeyValueStoreClient
20
+ from ._request_queue_client import SqlRequestQueueClient
21
+
22
+ if TYPE_CHECKING:
23
+ from types import TracebackType
24
+
25
+ from sqlalchemy.ext.asyncio import AsyncSession
26
+
27
+
28
@docs_group('Storage clients')
class SqlStorageClient(StorageClient):
    """SQL implementation of the storage client.

    This storage client provides access to datasets, key-value stores, and request queues that persist data
    to a SQL database using SQLAlchemy 2+. Each storage type uses two tables: one for metadata and one for
    records.

    The client accepts either a database connection string or a pre-configured AsyncEngine. If neither is
    provided, it creates a default SQLite database 'crawlee.db' in the storage directory.

    Database schema is automatically created during initialization. SQLite databases receive performance
    optimizations including WAL mode and increased cache size.

    Warning:
        This is an experimental feature. The behavior and interface may change in future versions.
    """

    _DEFAULT_DB_NAME = 'crawlee.db'
    """Default database name if not specified in connection string."""

    def __init__(
        self,
        *,
        connection_string: str | None = None,
        engine: AsyncEngine | None = None,
    ) -> None:
        """Initialize the SQL storage client.

        Args:
            connection_string: Database connection string (e.g., "sqlite+aiosqlite:///crawlee.db").
                If not provided, defaults to SQLite database in the storage directory.
            engine: Pre-configured AsyncEngine instance. Mutually exclusive with `connection_string`.

        Raises:
            ValueError: If both `connection_string` and `engine` are provided.
        """
        if engine is not None and connection_string is not None:
            # Both arguments are optional (omitting both falls back to the default SQLite database),
            # so the message must not imply that one of them is required.
            raise ValueError('Only one of connection_string or engine may be provided, not both.')

        self._connection_string = connection_string
        self._engine = engine
        self._initialized = False
        self.session_maker: None | async_sessionmaker[AsyncSession] = None

        # Minimum interval to reduce database load from frequent concurrent metadata updates.
        self._accessed_modified_update_interval = timedelta(seconds=1)

        # SQLite pragmas are applied only when we created the default database ourselves.
        self._default_flag = self._engine is None and self._connection_string is None
        self._dialect_name: str | None = None

        # Emit the experimental-feature notice once per client instance.
        warnings.warn(
            'The SqlStorageClient is experimental and may change or be removed in future releases.',
            category=UserWarning,
            stacklevel=2,
        )

    async def __aenter__(self) -> SqlStorageClient:
        """Async context manager entry."""
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Async context manager exit."""
        await self.close()

    @property
    def engine(self) -> AsyncEngine:
        """Get the SQLAlchemy AsyncEngine instance.

        Raises:
            ValueError: If the engine has not been created yet.
        """
        if self._engine is None:
            raise ValueError('Engine is not initialized. Call initialize() before accessing the engine.')
        return self._engine

    def get_dialect_name(self) -> str | None:
        """Get the database dialect name, or `None` before `initialize()` has run."""
        return self._dialect_name

    def get_accessed_modified_update_interval(self) -> timedelta:
        """Get the minimum interval between accessed/modified metadata updates."""
        return self._accessed_modified_update_interval

    async def initialize(self, configuration: Configuration) -> None:
        """Initialize the database schema.

        This method creates all necessary tables if they don't exist.
        Should be called before using the storage client.

        Args:
            configuration: Used to locate the storage directory when the default SQLite database is created.

        Raises:
            ValueError: If the database dialect is neither sqlite nor postgresql.
        """
        if not self._initialized:
            engine = self._get_or_create_engine(configuration)
            async with engine.begin() as conn:
                self._dialect_name = engine.dialect.name

                if self._dialect_name not in ('sqlite', 'postgresql'):
                    raise ValueError(
                        f'Unsupported database dialect: {self._dialect_name}. Supported: sqlite, postgresql. '
                        'Consider using a different database.',
                    )

                # Create tables if they don't exist.
                # Rollback the transaction when an exception occurs.
                # This is likely an attempt to create a database from several parallel processes.
                try:
                    # Set SQLite pragmas for performance and consistency.
                    if self._default_flag:
                        await conn.execute(text('PRAGMA journal_mode=WAL'))  # Better concurrency
                        await conn.execute(text('PRAGMA synchronous=NORMAL'))  # Balanced safety/speed
                        await conn.execute(text('PRAGMA cache_size=100000'))  # 100MB cache
                        await conn.execute(text('PRAGMA temp_store=MEMORY'))  # Memory temp storage
                        await conn.execute(text('PRAGMA mmap_size=268435456'))  # 256MB memory mapping
                        await conn.execute(text('PRAGMA foreign_keys=ON'))  # Enforce constraints
                        await conn.execute(text('PRAGMA busy_timeout=30000'))  # 30s busy timeout

                    await conn.run_sync(Base.metadata.create_all, checkfirst=True)

                    from crawlee import __version__  # noqa: PLC0415

                    # Fix: select the version column, not the ORM entity. The original selected
                    # `VersionDb`, so `scalar_one_or_none()` returned a `VersionDb` instance and the
                    # comparison with the `__version__` string was always unequal, warning on every run.
                    db_version = (await conn.execute(select(VersionDb.version))).scalar_one_or_none()

                    # Warn (not raise) if the stored schema version does not match the library version,
                    # since a newer library may introduce breaking changes in the database schema.
                    if db_version and db_version != __version__:
                        warnings.warn(
                            f'Database version {db_version} does not match library version {__version__}. '
                            'This may lead to unexpected behavior. Drop the db if you want to make sure that '
                            'everything will work fine.',
                            category=UserWarning,
                            stacklevel=2,
                        )
                    elif not db_version:
                        await conn.execute(insert(VersionDb).values(version=__version__))

                except (IntegrityError, OperationalError):
                    # Another process most likely created the schema concurrently; its tables are usable.
                    await conn.rollback()

            self._initialized = True

    async def close(self) -> None:
        """Close the database connection pool."""
        if self._engine is not None:
            await self._engine.dispose()
            self._engine = None

    def create_session(self) -> AsyncSession:
        """Create a new database session.

        Returns:
            A new AsyncSession instance.

        Raises:
            ValueError: If the engine has not been initialized yet.
        """
        if self.session_maker is None:
            # Use the `engine` property so a missing engine fails fast with a clear error
            # instead of silently creating a sessionmaker bound to None.
            self.session_maker = async_sessionmaker(self.engine, expire_on_commit=False, autoflush=False)
        return self.session_maker()

    @override
    async def create_dataset_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> SqlDatasetClient:
        configuration = configuration or Configuration.get_global_configuration()
        await self.initialize(configuration)

        client = await SqlDatasetClient.open(
            id=id,
            name=name,
            alias=alias,
            storage_client=self,
        )

        await self._purge_if_needed(client, configuration)
        return client

    @override
    async def create_kvs_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> SqlKeyValueStoreClient:
        configuration = configuration or Configuration.get_global_configuration()
        await self.initialize(configuration)

        client = await SqlKeyValueStoreClient.open(
            id=id,
            name=name,
            alias=alias,
            storage_client=self,
        )

        await self._purge_if_needed(client, configuration)
        return client

    @override
    async def create_rq_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> SqlRequestQueueClient:
        configuration = configuration or Configuration.get_global_configuration()
        await self.initialize(configuration)

        client = await SqlRequestQueueClient.open(
            id=id,
            name=name,
            alias=alias,
            storage_client=self,
        )

        await self._purge_if_needed(client, configuration)
        return client

    def _get_or_create_engine(self, configuration: Configuration) -> AsyncEngine:
        """Get or create the database engine based on configuration.

        Raises:
            ValueError: If the connection string targets a dialect other than sqlite or postgresql.
        """
        if self._engine is not None:
            return self._engine

        if self._connection_string is not None:
            connection_string = self._connection_string
        else:
            # Create SQLite database in the storage directory.
            # mkdir with exist_ok=True is race-safe; no need for a separate exists() check.
            storage_dir = Path(configuration.storage_dir)
            storage_dir.mkdir(parents=True, exist_ok=True)

            db_path = storage_dir / self._DEFAULT_DB_NAME

            # Create connection string with path to default database.
            connection_string = f'sqlite+aiosqlite:///{db_path}'

        if 'sqlite' not in connection_string and 'postgresql' not in connection_string:
            raise ValueError(
                'Unsupported database. Supported: sqlite, postgresql. Consider using a different database.'
            )

        self._engine = create_async_engine(
            connection_string,
            future=True,
            pool_size=5,
            max_overflow=10,
            pool_timeout=30,
            pool_recycle=600,
            pool_pre_ping=True,
            echo=False,
            # 30s connect timeout; accepted by both aiosqlite and asyncpg drivers — TODO confirm
            # if additional drivers are ever supported.
            connect_args={'timeout': 30},
        )
        return self._engine
File without changes
@@ -20,7 +20,7 @@ class StorageMetadata(BaseModel):
20
20
  It contains common fields shared across all specific storage types.
21
21
  """
22
22
 
23
- model_config = ConfigDict(populate_by_name=True, extra='allow')
23
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, extra='allow', from_attributes=True)
24
24
 
25
25
  id: Annotated[str, Field(alias='id')]
26
26
  """The unique identifier of the storage."""
@@ -42,7 +42,7 @@ class StorageMetadata(BaseModel):
42
42
  class DatasetMetadata(StorageMetadata):
43
43
  """Model for a dataset metadata."""
44
44
 
45
- model_config = ConfigDict(populate_by_name=True)
45
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
46
46
 
47
47
  item_count: Annotated[int, Field(alias='itemCount')]
48
48
  """The number of items in the dataset."""
@@ -52,14 +52,14 @@ class DatasetMetadata(StorageMetadata):
52
52
  class KeyValueStoreMetadata(StorageMetadata):
53
53
  """Model for a key-value store metadata."""
54
54
 
55
- model_config = ConfigDict(populate_by_name=True)
55
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
56
56
 
57
57
 
58
58
  @docs_group('Storage data')
59
59
  class RequestQueueMetadata(StorageMetadata):
60
60
  """Model for a request queue metadata."""
61
61
 
62
- model_config = ConfigDict(populate_by_name=True)
62
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
63
63
 
64
64
  had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')]
65
65
  """Indicates whether the queue has been accessed by multiple clients (consumers)."""
@@ -78,7 +78,7 @@ class RequestQueueMetadata(StorageMetadata):
78
78
  class KeyValueStoreRecordMetadata(BaseModel):
79
79
  """Model for a key-value store record metadata."""
80
80
 
81
- model_config = ConfigDict(populate_by_name=True)
81
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
82
82
 
83
83
  key: Annotated[str, Field(alias='key')]
84
84
  """The key of the record.
@@ -100,7 +100,7 @@ class KeyValueStoreRecordMetadata(BaseModel):
100
100
  class KeyValueStoreRecord(KeyValueStoreRecordMetadata, Generic[KvsValueType]):
101
101
  """Model for a key-value store record."""
102
102
 
103
- model_config = ConfigDict(populate_by_name=True)
103
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
104
104
 
105
105
  value: Annotated[KvsValueType, Field(alias='value')]
106
106
  """The value of the record."""
@@ -110,7 +110,7 @@ class KeyValueStoreRecord(KeyValueStoreRecordMetadata, Generic[KvsValueType]):
110
110
  class DatasetItemsListPage(BaseModel):
111
111
  """Model for a single page of dataset items returned from a collection list method."""
112
112
 
113
- model_config = ConfigDict(populate_by_name=True)
113
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
114
114
 
115
115
  count: Annotated[int, Field(default=0)]
116
116
  """The number of objects returned on this page."""
@@ -135,9 +135,11 @@ class DatasetItemsListPage(BaseModel):
135
135
  class ProcessedRequest(BaseModel):
136
136
  """Represents a processed request."""
137
137
 
138
- model_config = ConfigDict(populate_by_name=True)
138
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
139
+
140
+ id: Annotated[str | None, Field(alias='requestId', default=None)] = None
141
+ """Internal representation of the request by the storage client. Only some clients use id."""
139
142
 
140
- id: Annotated[str, Field(alias='requestId')]
141
143
  unique_key: Annotated[str, Field(alias='uniqueKey')]
142
144
  was_already_present: Annotated[bool, Field(alias='wasAlreadyPresent')]
143
145
  was_already_handled: Annotated[bool, Field(alias='wasAlreadyHandled')]
@@ -147,7 +149,7 @@ class ProcessedRequest(BaseModel):
147
149
  class UnprocessedRequest(BaseModel):
148
150
  """Represents an unprocessed request."""
149
151
 
150
- model_config = ConfigDict(populate_by_name=True)
152
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
151
153
 
152
154
  unique_key: Annotated[str, Field(alias='uniqueKey')]
153
155
  url: Annotated[str, BeforeValidator(validate_http_url), Field()]
@@ -163,7 +165,7 @@ class AddRequestsResponse(BaseModel):
163
165
  encountered issues during processing.
164
166
  """
165
167
 
166
- model_config = ConfigDict(populate_by_name=True)
168
+ model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
167
169
 
168
170
  processed_requests: Annotated[list[ProcessedRequest], Field(alias='processedRequests')]
169
171
  """Successfully processed requests, including information about whether they were
crawlee/storages/_base.py CHANGED
@@ -36,6 +36,7 @@ class Storage(ABC):
36
36
  *,
37
37
  id: str | None = None,
38
38
  name: str | None = None,
39
+ alias: str | None = None,
39
40
  configuration: Configuration | None = None,
40
41
  storage_client: StorageClient | None = None,
41
42
  ) -> Storage:
@@ -43,7 +44,10 @@ class Storage(ABC):
43
44
 
44
45
  Args:
45
46
  id: The storage ID.
46
- name: The storage name.
47
+ name: The storage name (global scope, persists across runs). Name can only contain letters "a" through "z",
48
+ the digits "0" through "9", and the hyphen ("-") but only in the middle of the string
49
+ (e.g. "my-value-1").
50
+ alias: The storage alias (run scope, creates unnamed storage).
47
51
  configuration: Configuration object used during the storage creation or restoration process.
48
52
  storage_client: Underlying storage client to use. If not provided, the default global storage client
49
53
  from the service locator will be used.
@@ -12,6 +12,7 @@ from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
12
12
 
13
13
  from ._base import Storage
14
14
  from ._key_value_store import KeyValueStore
15
+ from ._utils import validate_storage_name
15
16
 
16
17
  if TYPE_CHECKING:
17
18
  from collections.abc import AsyncIterator
@@ -75,6 +76,8 @@ class Dataset(Storage):
75
76
  id: The unique identifier of the storage.
76
77
  name: The name of the storage, if available.
77
78
  """
79
+ validate_storage_name(name)
80
+
78
81
  self._client = client
79
82
  self._id = id
80
83
  self._name = name
@@ -100,18 +103,25 @@ class Dataset(Storage):
100
103
  *,
101
104
  id: str | None = None,
102
105
  name: str | None = None,
106
+ alias: str | None = None,
103
107
  configuration: Configuration | None = None,
104
108
  storage_client: StorageClient | None = None,
105
109
  ) -> Dataset:
106
110
  configuration = service_locator.get_configuration() if configuration is None else configuration
107
111
  storage_client = service_locator.get_storage_client() if storage_client is None else storage_client
108
112
 
113
+ client_opener_coro = storage_client.create_dataset_client(
114
+ id=id, name=name, alias=alias, configuration=configuration
115
+ )
116
+ storage_client_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)
117
+
109
118
  return await service_locator.storage_instance_manager.open_storage_instance(
110
119
  cls,
111
120
  id=id,
112
121
  name=name,
113
- configuration=configuration,
114
- client_opener=storage_client.create_dataset_client,
122
+ alias=alias,
123
+ client_opener_coro=client_opener_coro,
124
+ storage_client_cache_key=storage_client_cache_key,
115
125
  )
116
126
 
117
127
  @override
@@ -15,6 +15,7 @@ from crawlee._utils.recoverable_state import RecoverableState
15
15
  from crawlee.storage_clients.models import KeyValueStoreMetadata
16
16
 
17
17
  from ._base import Storage
18
+ from ._utils import validate_storage_name
18
19
 
19
20
  if TYPE_CHECKING:
20
21
  from collections.abc import AsyncIterator
@@ -84,6 +85,8 @@ class KeyValueStore(Storage):
84
85
  id: The unique identifier of the storage.
85
86
  name: The name of the storage, if available.
86
87
  """
88
+ validate_storage_name(name)
89
+
87
90
  self._client = client
88
91
  self._id = id
89
92
  self._name = name
@@ -112,18 +115,25 @@ class KeyValueStore(Storage):
112
115
  *,
113
116
  id: str | None = None,
114
117
  name: str | None = None,
118
+ alias: str | None = None,
115
119
  configuration: Configuration | None = None,
116
120
  storage_client: StorageClient | None = None,
117
121
  ) -> KeyValueStore:
118
122
  configuration = service_locator.get_configuration() if configuration is None else configuration
119
123
  storage_client = service_locator.get_storage_client() if storage_client is None else storage_client
120
124
 
125
+ client_opener_coro = storage_client.create_kvs_client(
126
+ id=id, name=name, alias=alias, configuration=configuration
127
+ )
128
+ additional_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)
129
+
121
130
  return await service_locator.storage_instance_manager.open_storage_instance(
122
131
  cls,
123
132
  id=id,
124
133
  name=name,
125
- configuration=configuration,
126
- client_opener=storage_client.create_kvs_client,
134
+ alias=alias,
135
+ client_opener_coro=client_opener_coro,
136
+ storage_client_cache_key=additional_cache_key,
127
137
  )
128
138
 
129
139
  @override
@@ -271,11 +281,14 @@ class KeyValueStore(Storage):
271
281
  if key in cache:
272
282
  return cache[key].current_value.root
273
283
 
284
+ async def kvs_factory() -> KeyValueStore:
285
+ return self
286
+
274
287
  cache[key] = recoverable_state = RecoverableState(
275
288
  default_state=AutosavedValue(default_value),
276
- persistence_enabled=True,
277
- persist_state_kvs_id=self.id,
278
289
  persist_state_key=key,
290
+ persistence_enabled=True,
291
+ persist_state_kvs_factory=kvs_factory,
279
292
  logger=logger,
280
293
  )
281
294
 
@@ -13,6 +13,7 @@ from crawlee._utils.wait import wait_for_all_tasks_for_finish
13
13
  from crawlee.request_loaders import RequestManager
14
14
 
15
15
  from ._base import Storage
16
+ from ._utils import validate_storage_name
16
17
 
17
18
  if TYPE_CHECKING:
18
19
  from collections.abc import Sequence
@@ -80,6 +81,8 @@ class RequestQueue(Storage, RequestManager):
80
81
  id: The unique identifier of the storage.
81
82
  name: The name of the storage, if available.
82
83
  """
84
+ validate_storage_name(name)
85
+
83
86
  self._client = client
84
87
  self._id = id
85
88
  self._name = name
@@ -118,18 +121,23 @@ class RequestQueue(Storage, RequestManager):
118
121
  *,
119
122
  id: str | None = None,
120
123
  name: str | None = None,
124
+ alias: str | None = None,
121
125
  configuration: Configuration | None = None,
122
126
  storage_client: StorageClient | None = None,
123
127
  ) -> RequestQueue:
124
128
  configuration = service_locator.get_configuration() if configuration is None else configuration
125
129
  storage_client = service_locator.get_storage_client() if storage_client is None else storage_client
126
130
 
131
+ client_opener_coro = storage_client.create_rq_client(id=id, name=name, alias=alias, configuration=configuration)
132
+ additional_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)
133
+
127
134
  return await service_locator.storage_instance_manager.open_storage_instance(
128
135
  cls,
129
136
  id=id,
130
137
  name=name,
131
- configuration=configuration,
132
- client_opener=storage_client.create_rq_client,
138
+ alias=alias,
139
+ client_opener_coro=client_opener_coro,
140
+ storage_client_cache_key=additional_cache_key,
133
141
  )
134
142
 
135
143
  @override
@@ -223,16 +231,16 @@ class RequestQueue(Storage, RequestManager):
223
231
  """
224
232
  return await self._client.fetch_next_request()
225
233
 
226
- async def get_request(self, request_id: str) -> Request | None:
234
+ async def get_request(self, unique_key: str) -> Request | None:
227
235
  """Retrieve a specific request from the queue by its ID.
228
236
 
229
237
  Args:
230
- request_id: The ID of the request to retrieve.
238
+ unique_key: Unique key of the request to retrieve.
231
239
 
232
240
  Returns:
233
241
  The request with the specified ID, or `None` if no such request exists.
234
242
  """
235
- return await self._client.get_request(request_id)
243
+ return await self._client.get_request(unique_key)
236
244
 
237
245
  async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
238
246
  """Mark a request as handled after successful processing.