crawlee 1.0.0rc1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +2 -1
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +76 -17
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/sitemap.py +3 -1
- crawlee/_utils/system.py +3 -3
- crawlee/browsers/_playwright_browser_controller.py +20 -14
- crawlee/configuration.py +1 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +6 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +107 -27
- crawlee/crawlers/_basic/_logging_utils.py +5 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +6 -1
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +1 -1
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +228 -48
- crawlee/sessions/_models.py +2 -2
- crawlee/statistics/_models.py +1 -1
- crawlee/storage_clients/__init__.py +12 -0
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +27 -25
- crawlee/storage_clients/_file_system/_key_value_store_client.py +27 -23
- crawlee/storage_clients/_file_system/_request_queue_client.py +84 -98
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +14 -2
- crawlee/storage_clients/_memory/_key_value_store_client.py +14 -2
- crawlee/storage_clients/_memory/_request_queue_client.py +43 -12
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +269 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +299 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +706 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +9 -2
- crawlee/storages/_key_value_store.py +9 -2
- crawlee/storages/_request_queue.py +7 -2
- crawlee/storages/_storage_instance_manager.py +126 -72
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/METADATA +12 -5
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/RECORD +59 -49
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/WHEEL +0 -0
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import warnings
|
|
4
|
+
from datetime import timedelta
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
from sqlalchemy.exc import IntegrityError, OperationalError
|
|
9
|
+
from sqlalchemy.ext.asyncio import AsyncEngine, async_sessionmaker, create_async_engine
|
|
10
|
+
from sqlalchemy.sql import insert, select, text
|
|
11
|
+
from typing_extensions import override
|
|
12
|
+
|
|
13
|
+
from crawlee._utils.docs import docs_group
|
|
14
|
+
from crawlee.configuration import Configuration
|
|
15
|
+
from crawlee.storage_clients._base import StorageClient
|
|
16
|
+
|
|
17
|
+
from ._dataset_client import SqlDatasetClient
|
|
18
|
+
from ._db_models import Base, VersionDb
|
|
19
|
+
from ._key_value_store_client import SqlKeyValueStoreClient
|
|
20
|
+
from ._request_queue_client import SqlRequestQueueClient
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from types import TracebackType
|
|
24
|
+
|
|
25
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@docs_group('Storage clients')
class SqlStorageClient(StorageClient):
    """SQL implementation of the storage client.

    This storage client provides access to datasets, key-value stores, and request queues that persist data
    to a SQL database using SQLAlchemy 2+. Each storage type uses two tables: one for metadata and one for
    records.

    The client accepts either a database connection string or a pre-configured AsyncEngine. If neither is
    provided, it creates a default SQLite database 'crawlee.db' in the storage directory.

    Database schema is automatically created during initialization. SQLite databases receive performance
    optimizations including WAL mode and increased cache size.

    Warning:
        This is an experimental feature. The behavior and interface may change in future versions.
    """

    _DEFAULT_DB_NAME = 'crawlee.db'
    """Default database name if not specified in connection string."""

    def __init__(
        self,
        *,
        connection_string: str | None = None,
        engine: AsyncEngine | None = None,
    ) -> None:
        """Initialize the SQL storage client.

        Args:
            connection_string: Database connection string (e.g., "sqlite+aiosqlite:///crawlee.db").
                If not provided, defaults to SQLite database in the storage directory.
            engine: Pre-configured AsyncEngine instance. If provided, connection_string is ignored.

        Raises:
            ValueError: If both `connection_string` and `engine` are provided.
        """
        if engine is not None and connection_string is not None:
            # Fixed message: the previous wording ('Either ... must be provided') wrongly implied
            # one of the two arguments is mandatory; passing neither is the valid default.
            raise ValueError('Only one of connection_string or engine can be provided, not both.')

        self._connection_string = connection_string
        self._engine = engine
        self._initialized = False
        self.session_maker: None | async_sessionmaker[AsyncSession] = None

        # Minimum interval to reduce database load from frequent concurrent metadata updates
        self._accessed_modified_update_interval = timedelta(seconds=1)

        # Flag needed to apply optimizations only for default database
        self._default_flag = self._engine is None and self._connection_string is None
        self._dialect_name: str | None = None

        # Call the notification only once
        warnings.warn(
            'The SqlStorageClient is experimental and may change or be removed in future releases.',
            category=UserWarning,
            stacklevel=2,
        )

    async def __aenter__(self) -> SqlStorageClient:
        """Async context manager entry."""
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Async context manager exit."""
        await self.close()

    @property
    def engine(self) -> AsyncEngine:
        """Get the SQLAlchemy AsyncEngine instance."""
        if self._engine is None:
            raise ValueError('Engine is not initialized. Call initialize() before accessing the engine.')
        return self._engine

    def get_dialect_name(self) -> str | None:
        """Get the database dialect name."""
        return self._dialect_name

    def get_accessed_modified_update_interval(self) -> timedelta:
        """Get the interval for accessed and modified updates."""
        return self._accessed_modified_update_interval

    async def initialize(self, configuration: Configuration) -> None:
        """Initialize the database schema.

        This method creates all necessary tables if they don't exist.
        Should be called before using the storage client.
        """
        if not self._initialized:
            engine = self._get_or_create_engine(configuration)
            async with engine.begin() as conn:
                self._dialect_name = engine.dialect.name

                if self._dialect_name not in ('sqlite', 'postgresql'):
                    raise ValueError(
                        f'Unsupported database dialect: {self._dialect_name}. Supported: sqlite, postgresql. '
                        'Consider using a different database.',
                    )

                # Create tables if they don't exist.
                # Rollback the transaction when an exception occurs.
                # This is likely an attempt to create a database from several parallel processes.
                try:
                    # Set SQLite pragmas for performance and consistency
                    if self._default_flag:
                        await conn.execute(text('PRAGMA journal_mode=WAL'))  # Better concurrency
                        await conn.execute(text('PRAGMA synchronous=NORMAL'))  # Balanced safety/speed
                        await conn.execute(text('PRAGMA cache_size=100000'))  # Larger page cache (~100MB, depends on page size)
                        await conn.execute(text('PRAGMA temp_store=MEMORY'))  # Memory temp storage
                        await conn.execute(text('PRAGMA mmap_size=268435456'))  # 256MB memory mapping
                        await conn.execute(text('PRAGMA foreign_keys=ON'))  # Enforce constraints
                        await conn.execute(text('PRAGMA busy_timeout=30000'))  # 30s busy timeout

                    await conn.run_sync(Base.metadata.create_all, checkfirst=True)

                    # Lowercase `noqa` fix: linters only honor the lowercase directive, so the
                    # previous `# Noqa: PLC0415` suppression was never applied.
                    from crawlee import __version__  # noqa: PLC0415

                    db_version = (await conn.execute(select(VersionDb))).scalar_one_or_none()

                    # Warn when the stored schema version does not match the library version.
                    # Fix: compare the stored version *string* (`db_version.version`) with
                    # `__version__`. The previous `db_version != __version__` compared an ORM row
                    # object to a string, which is always True, so the warning fired on every run.
                    if db_version and db_version.version != __version__:
                        warnings.warn(
                            f'Database version {db_version.version} does not match library version {__version__}. '
                            'This may lead to unexpected behavior. Drop the db if you want to make sure that '
                            'everything will work fine.',
                            category=UserWarning,
                            stacklevel=2,
                        )
                    elif not db_version:
                        await conn.execute(insert(VersionDb).values(version=__version__))

                except (IntegrityError, OperationalError):
                    await conn.rollback()

            self._initialized = True

    async def close(self) -> None:
        """Close the database connection pool."""
        if self._engine is not None:
            await self._engine.dispose()
            self._engine = None
            # Fix: drop the cached session maker and reset the initialization flag so a reused
            # client does not hand out sessions bound to the disposed engine and can be
            # re-initialized cleanly. Previously `create_session()` after `close()` silently
            # produced sessions over a disposed/None engine.
            self.session_maker = None
            self._initialized = False

    def create_session(self) -> AsyncSession:
        """Create a new database session.

        Returns:
            A new AsyncSession instance.
        """
        if self.session_maker is None:
            self.session_maker = async_sessionmaker(self._engine, expire_on_commit=False, autoflush=False)
        return self.session_maker()

    @override
    async def create_dataset_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> SqlDatasetClient:
        configuration = configuration or Configuration.get_global_configuration()
        await self.initialize(configuration)

        client = await SqlDatasetClient.open(
            id=id,
            name=name,
            alias=alias,
            storage_client=self,
        )

        await self._purge_if_needed(client, configuration)
        return client

    @override
    async def create_kvs_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> SqlKeyValueStoreClient:
        configuration = configuration or Configuration.get_global_configuration()
        await self.initialize(configuration)

        client = await SqlKeyValueStoreClient.open(
            id=id,
            name=name,
            alias=alias,
            storage_client=self,
        )

        await self._purge_if_needed(client, configuration)
        return client

    @override
    async def create_rq_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> SqlRequestQueueClient:
        configuration = configuration or Configuration.get_global_configuration()
        await self.initialize(configuration)

        client = await SqlRequestQueueClient.open(
            id=id,
            name=name,
            alias=alias,
            storage_client=self,
        )

        await self._purge_if_needed(client, configuration)
        return client

    def _get_or_create_engine(self, configuration: Configuration) -> AsyncEngine:
        """Get or create the database engine based on configuration."""
        if self._engine is not None:
            return self._engine

        if self._connection_string is not None:
            connection_string = self._connection_string
        else:
            # Create SQLite database in the storage directory
            storage_dir = Path(configuration.storage_dir)
            if not storage_dir.exists():
                storage_dir.mkdir(parents=True, exist_ok=True)

            db_path = storage_dir / self._DEFAULT_DB_NAME

            # Create connection string with path to default database
            connection_string = f'sqlite+aiosqlite:///{db_path}'

        if 'sqlite' not in connection_string and 'postgresql' not in connection_string:
            raise ValueError(
                'Unsupported database. Supported: sqlite, postgresql. Consider using a different database.'
            )

        self._engine = create_async_engine(
            connection_string,
            future=True,
            pool_size=5,
            max_overflow=10,
            pool_timeout=30,
            pool_recycle=600,
            pool_pre_ping=True,
            echo=False,
            connect_args={'timeout': 30},
        )
        return self._engine
|
|
File without changes
|
|
@@ -20,7 +20,7 @@ class StorageMetadata(BaseModel):
|
|
|
20
20
|
It contains common fields shared across all specific storage types.
|
|
21
21
|
"""
|
|
22
22
|
|
|
23
|
-
model_config = ConfigDict(
|
|
23
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, extra='allow', from_attributes=True)
|
|
24
24
|
|
|
25
25
|
id: Annotated[str, Field(alias='id')]
|
|
26
26
|
"""The unique identifier of the storage."""
|
|
@@ -42,7 +42,7 @@ class StorageMetadata(BaseModel):
|
|
|
42
42
|
class DatasetMetadata(StorageMetadata):
|
|
43
43
|
"""Model for a dataset metadata."""
|
|
44
44
|
|
|
45
|
-
model_config = ConfigDict(
|
|
45
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
|
|
46
46
|
|
|
47
47
|
item_count: Annotated[int, Field(alias='itemCount')]
|
|
48
48
|
"""The number of items in the dataset."""
|
|
@@ -52,14 +52,14 @@ class DatasetMetadata(StorageMetadata):
|
|
|
52
52
|
class KeyValueStoreMetadata(StorageMetadata):
|
|
53
53
|
"""Model for a key-value store metadata."""
|
|
54
54
|
|
|
55
|
-
model_config = ConfigDict(
|
|
55
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
|
|
56
56
|
|
|
57
57
|
|
|
58
58
|
@docs_group('Storage data')
|
|
59
59
|
class RequestQueueMetadata(StorageMetadata):
|
|
60
60
|
"""Model for a request queue metadata."""
|
|
61
61
|
|
|
62
|
-
model_config = ConfigDict(
|
|
62
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
|
|
63
63
|
|
|
64
64
|
had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')]
|
|
65
65
|
"""Indicates whether the queue has been accessed by multiple clients (consumers)."""
|
|
@@ -78,7 +78,7 @@ class RequestQueueMetadata(StorageMetadata):
|
|
|
78
78
|
class KeyValueStoreRecordMetadata(BaseModel):
|
|
79
79
|
"""Model for a key-value store record metadata."""
|
|
80
80
|
|
|
81
|
-
model_config = ConfigDict(
|
|
81
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
|
|
82
82
|
|
|
83
83
|
key: Annotated[str, Field(alias='key')]
|
|
84
84
|
"""The key of the record.
|
|
@@ -100,7 +100,7 @@ class KeyValueStoreRecordMetadata(BaseModel):
|
|
|
100
100
|
class KeyValueStoreRecord(KeyValueStoreRecordMetadata, Generic[KvsValueType]):
|
|
101
101
|
"""Model for a key-value store record."""
|
|
102
102
|
|
|
103
|
-
model_config = ConfigDict(
|
|
103
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
|
|
104
104
|
|
|
105
105
|
value: Annotated[KvsValueType, Field(alias='value')]
|
|
106
106
|
"""The value of the record."""
|
|
@@ -110,7 +110,7 @@ class KeyValueStoreRecord(KeyValueStoreRecordMetadata, Generic[KvsValueType]):
|
|
|
110
110
|
class DatasetItemsListPage(BaseModel):
|
|
111
111
|
"""Model for a single page of dataset items returned from a collection list method."""
|
|
112
112
|
|
|
113
|
-
model_config = ConfigDict(
|
|
113
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
|
|
114
114
|
|
|
115
115
|
count: Annotated[int, Field(default=0)]
|
|
116
116
|
"""The number of objects returned on this page."""
|
|
@@ -135,7 +135,7 @@ class DatasetItemsListPage(BaseModel):
|
|
|
135
135
|
class ProcessedRequest(BaseModel):
|
|
136
136
|
"""Represents a processed request."""
|
|
137
137
|
|
|
138
|
-
model_config = ConfigDict(
|
|
138
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
|
|
139
139
|
|
|
140
140
|
id: Annotated[str | None, Field(alias='requestId', default=None)] = None
|
|
141
141
|
"""Internal representation of the request by the storage client. Only some clients use id."""
|
|
@@ -149,7 +149,7 @@ class ProcessedRequest(BaseModel):
|
|
|
149
149
|
class UnprocessedRequest(BaseModel):
|
|
150
150
|
"""Represents an unprocessed request."""
|
|
151
151
|
|
|
152
|
-
model_config = ConfigDict(
|
|
152
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
|
|
153
153
|
|
|
154
154
|
unique_key: Annotated[str, Field(alias='uniqueKey')]
|
|
155
155
|
url: Annotated[str, BeforeValidator(validate_http_url), Field()]
|
|
@@ -165,7 +165,7 @@ class AddRequestsResponse(BaseModel):
|
|
|
165
165
|
encountered issues during processing.
|
|
166
166
|
"""
|
|
167
167
|
|
|
168
|
-
model_config = ConfigDict(
|
|
168
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
|
|
169
169
|
|
|
170
170
|
processed_requests: Annotated[list[ProcessedRequest], Field(alias='processedRequests')]
|
|
171
171
|
"""Successfully processed requests, including information about whether they were
|
crawlee/storages/_base.py
CHANGED
|
@@ -36,6 +36,7 @@ class Storage(ABC):
|
|
|
36
36
|
*,
|
|
37
37
|
id: str | None = None,
|
|
38
38
|
name: str | None = None,
|
|
39
|
+
alias: str | None = None,
|
|
39
40
|
configuration: Configuration | None = None,
|
|
40
41
|
storage_client: StorageClient | None = None,
|
|
41
42
|
) -> Storage:
|
|
@@ -43,7 +44,8 @@ class Storage(ABC):
|
|
|
43
44
|
|
|
44
45
|
Args:
|
|
45
46
|
id: The storage ID.
|
|
46
|
-
name: The storage name.
|
|
47
|
+
name: The storage name (global scope, persists across runs).
|
|
48
|
+
alias: The storage alias (run scope, creates unnamed storage).
|
|
47
49
|
configuration: Configuration object used during the storage creation or restoration process.
|
|
48
50
|
storage_client: Underlying storage client to use. If not provided, the default global storage client
|
|
49
51
|
from the service locator will be used.
|
crawlee/storages/_dataset.py
CHANGED
|
@@ -100,18 +100,25 @@ class Dataset(Storage):
|
|
|
100
100
|
*,
|
|
101
101
|
id: str | None = None,
|
|
102
102
|
name: str | None = None,
|
|
103
|
+
alias: str | None = None,
|
|
103
104
|
configuration: Configuration | None = None,
|
|
104
105
|
storage_client: StorageClient | None = None,
|
|
105
106
|
) -> Dataset:
|
|
106
107
|
configuration = service_locator.get_configuration() if configuration is None else configuration
|
|
107
108
|
storage_client = service_locator.get_storage_client() if storage_client is None else storage_client
|
|
108
109
|
|
|
110
|
+
client_opener_coro = storage_client.create_dataset_client(
|
|
111
|
+
id=id, name=name, alias=alias, configuration=configuration
|
|
112
|
+
)
|
|
113
|
+
storage_client_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)
|
|
114
|
+
|
|
109
115
|
return await service_locator.storage_instance_manager.open_storage_instance(
|
|
110
116
|
cls,
|
|
111
117
|
id=id,
|
|
112
118
|
name=name,
|
|
113
|
-
|
|
114
|
-
|
|
119
|
+
alias=alias,
|
|
120
|
+
client_opener_coro=client_opener_coro,
|
|
121
|
+
storage_client_cache_key=storage_client_cache_key,
|
|
115
122
|
)
|
|
116
123
|
|
|
117
124
|
@override
|
|
@@ -112,18 +112,25 @@ class KeyValueStore(Storage):
|
|
|
112
112
|
*,
|
|
113
113
|
id: str | None = None,
|
|
114
114
|
name: str | None = None,
|
|
115
|
+
alias: str | None = None,
|
|
115
116
|
configuration: Configuration | None = None,
|
|
116
117
|
storage_client: StorageClient | None = None,
|
|
117
118
|
) -> KeyValueStore:
|
|
118
119
|
configuration = service_locator.get_configuration() if configuration is None else configuration
|
|
119
120
|
storage_client = service_locator.get_storage_client() if storage_client is None else storage_client
|
|
120
121
|
|
|
122
|
+
client_opener_coro = storage_client.create_kvs_client(
|
|
123
|
+
id=id, name=name, alias=alias, configuration=configuration
|
|
124
|
+
)
|
|
125
|
+
additional_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)
|
|
126
|
+
|
|
121
127
|
return await service_locator.storage_instance_manager.open_storage_instance(
|
|
122
128
|
cls,
|
|
123
129
|
id=id,
|
|
124
130
|
name=name,
|
|
125
|
-
|
|
126
|
-
|
|
131
|
+
alias=alias,
|
|
132
|
+
client_opener_coro=client_opener_coro,
|
|
133
|
+
storage_client_cache_key=additional_cache_key,
|
|
127
134
|
)
|
|
128
135
|
|
|
129
136
|
@override
|
|
@@ -118,18 +118,23 @@ class RequestQueue(Storage, RequestManager):
|
|
|
118
118
|
*,
|
|
119
119
|
id: str | None = None,
|
|
120
120
|
name: str | None = None,
|
|
121
|
+
alias: str | None = None,
|
|
121
122
|
configuration: Configuration | None = None,
|
|
122
123
|
storage_client: StorageClient | None = None,
|
|
123
124
|
) -> RequestQueue:
|
|
124
125
|
configuration = service_locator.get_configuration() if configuration is None else configuration
|
|
125
126
|
storage_client = service_locator.get_storage_client() if storage_client is None else storage_client
|
|
126
127
|
|
|
128
|
+
client_opener_coro = storage_client.create_rq_client(id=id, name=name, alias=alias, configuration=configuration)
|
|
129
|
+
additional_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)
|
|
130
|
+
|
|
127
131
|
return await service_locator.storage_instance_manager.open_storage_instance(
|
|
128
132
|
cls,
|
|
129
133
|
id=id,
|
|
130
134
|
name=name,
|
|
131
|
-
|
|
132
|
-
|
|
135
|
+
alias=alias,
|
|
136
|
+
client_opener_coro=client_opener_coro,
|
|
137
|
+
storage_client_cache_key=additional_cache_key,
|
|
133
138
|
)
|
|
134
139
|
|
|
135
140
|
@override
|