crawlee 0.6.13b43__py3-none-any.whl → 1.1.1b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlee might be problematic. Click here for more details.
- crawlee/_request.py +32 -21
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +67 -24
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +15 -0
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +38 -14
- crawlee/crawlers/_basic/_basic_crawler.py +51 -14
- crawlee/crawlers/_playwright/_playwright_crawler.py +16 -4
- crawlee/events/_event_manager.py +3 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_sitemap_request_loader.py +22 -4
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +32 -1
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_storage_client.py +5 -4
- crawlee/storage_clients/_file_system/_dataset_client.py +4 -5
- crawlee/storage_clients/_file_system/_key_value_store_client.py +4 -5
- crawlee/storage_clients/_file_system/_request_queue_client.py +28 -12
- crawlee/storage_clients/_file_system/_storage_client.py +2 -2
- crawlee/storage_clients/_memory/_dataset_client.py +4 -5
- crawlee/storage_clients/_memory/_key_value_store_client.py +4 -5
- crawlee/storage_clients/_memory/_request_queue_client.py +4 -5
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +291 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +5 -3
- crawlee/storages/_key_value_store.py +11 -6
- crawlee/storages/_request_queue.py +5 -3
- crawlee/storages/_storage_instance_manager.py +54 -68
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +16 -5
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +69 -47
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import warnings
|
|
5
|
+
from datetime import timedelta
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
from sqlalchemy.exc import IntegrityError, OperationalError
|
|
10
|
+
from sqlalchemy.ext.asyncio import AsyncEngine, async_sessionmaker, create_async_engine
|
|
11
|
+
from sqlalchemy.sql import insert, select, text
|
|
12
|
+
from typing_extensions import override
|
|
13
|
+
|
|
14
|
+
from crawlee._utils.docs import docs_group
|
|
15
|
+
from crawlee.configuration import Configuration
|
|
16
|
+
from crawlee.storage_clients._base import StorageClient
|
|
17
|
+
|
|
18
|
+
from ._dataset_client import SqlDatasetClient
|
|
19
|
+
from ._db_models import Base, VersionDb
|
|
20
|
+
from ._key_value_store_client import SqlKeyValueStoreClient
|
|
21
|
+
from ._request_queue_client import SqlRequestQueueClient
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from types import TracebackType
|
|
25
|
+
|
|
26
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@docs_group('Storage clients')
class SqlStorageClient(StorageClient):
    """SQL implementation of the storage client.

    This storage client provides access to datasets, key-value stores, and request queues that persist data
    to a SQL database using SQLAlchemy 2+. Each storage type uses two tables: one for metadata and one for
    records.

    The client accepts either a database connection string or a pre-configured AsyncEngine (but not both).
    If neither is provided, it creates a default SQLite database 'crawlee.db' in the storage directory.

    Database schema is automatically created during initialization. Only the default SQLite database
    (no connection string and no engine supplied) additionally receives performance pragmas such as
    WAL mode and an increased cache size. Supported dialects are SQLite and PostgreSQL.

    Warning:
        This is an experimental feature. The behavior and interface may change in future versions.
    """

    _DEFAULT_DB_NAME = 'crawlee.db'
    """Default database name if not specified in connection string."""

    def __init__(
        self,
        *,
        connection_string: str | None = None,
        engine: AsyncEngine | None = None,
    ) -> None:
        """Initialize the SQL storage client.

        Args:
            connection_string: Database connection string (e.g., "sqlite+aiosqlite:///crawlee.db").
                If not provided, defaults to SQLite database in the storage directory.
            engine: Pre-configured AsyncEngine instance. Mutually exclusive with `connection_string`.

        Raises:
            ValueError: If both `connection_string` and `engine` are provided.
        """
        if engine is not None and connection_string is not None:
            raise ValueError('Either connection_string or engine must be provided, not both.')

        self._connection_string = connection_string
        self._engine = engine
        self._initialized = False
        self.session_maker: None | async_sessionmaker[AsyncSession] = None

        # Minimum interval to reduce database load from frequent concurrent metadata updates
        self._accessed_modified_update_interval = timedelta(seconds=1)

        # Flag needed to apply optimizations only for default database
        self._default_flag = self._engine is None and self._connection_string is None
        self._dialect_name: str | None = None

        # Emit the experimental-feature warning; `stacklevel=2` attributes it to the code
        # that constructs the client rather than to this module.
        warnings.warn(
            'The SqlStorageClient is experimental and may change or be removed in future releases.',
            category=UserWarning,
            stacklevel=2,
        )

    async def __aenter__(self) -> SqlStorageClient:
        """Async context manager entry.

        Returns the client itself; no initialization is performed here.
        """
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Async context manager exit. Closes the client via `close()`."""
        await self.close()

    @property
    def engine(self) -> AsyncEngine:
        """Get the SQLAlchemy AsyncEngine instance.

        Raises:
            ValueError: If no engine is currently set (not yet initialized, or already closed).
        """
        if self._engine is None:
            raise ValueError('Engine is not initialized. Call initialize() before accessing the engine.')
        return self._engine

    def get_dialect_name(self) -> str | None:
        """Get the database dialect name, or `None` if `initialize()` has not determined it yet."""
        return self._dialect_name

    def get_accessed_modified_update_interval(self) -> timedelta:
        """Get the interval for accessed and modified updates."""
        return self._accessed_modified_update_interval

    async def initialize(self, configuration: Configuration) -> None:
        """Initialize the database schema.

        This method creates all necessary tables if they don't exist and records the library version
        in the database. It is a no-op after the first successful call.

        Args:
            configuration: Configuration used to locate the storage directory for the default database.

        Raises:
            ValueError: If the engine's dialect is neither SQLite nor PostgreSQL.
        """
        if self._initialized:
            return

        engine = self._get_or_create_engine(configuration)

        # The dialect is known without connecting, so reject unsupported databases
        # before a connection/transaction is opened.
        self._dialect_name = engine.dialect.name
        if self._dialect_name not in ('sqlite', 'postgresql'):
            raise ValueError(
                f'Unsupported database dialect: {self._dialect_name}. Supported: sqlite, postgresql. '
                'Consider using a different database.',
            )

        async with engine.begin() as conn:
            # Create tables if they don't exist.
            # Rollback the transaction when an exception occurs.
            # This is likely an attempt to create a database from several parallel processes.
            try:
                # Set SQLite pragmas for performance and consistency (default database only).
                # NOTE(review): these are issued on this initialization connection only; confirm
                # whether per-connection pragmas need to be re-applied to other pooled connections.
                if self._default_flag:
                    await conn.execute(text('PRAGMA journal_mode=WAL'))  # Better concurrency
                    await conn.execute(text('PRAGMA synchronous=NORMAL'))  # Balanced safety/speed
                    # A positive cache_size is a number of pages, not bytes/KiB
                    # (about 400 MB with SQLite's default 4096-byte page size).
                    await conn.execute(text('PRAGMA cache_size=100000'))
                    await conn.execute(text('PRAGMA temp_store=MEMORY'))  # Memory temp storage
                    await conn.execute(text('PRAGMA mmap_size=268435456'))  # 256MB memory mapping
                    await conn.execute(text('PRAGMA foreign_keys=ON'))  # Enforce constraints
                    await conn.execute(text('PRAGMA busy_timeout=30000'))  # 30s busy timeout

                await conn.run_sync(Base.metadata.create_all, checkfirst=True)

                # Imported lazily inside the method, as in the original implementation.
                from crawlee import __version__  # noqa: PLC0415

                db_version = (await conn.execute(select(VersionDb))).scalar_one_or_none()

                # Warn (do not fail) when the stored version differs from the running library
                # version; store the current version when the database has none recorded yet.
                if db_version and db_version != __version__:
                    warnings.warn(
                        f'Database version {db_version} does not match library version {__version__}. '
                        'This may lead to unexpected behavior. Drop the db if you want to make sure that '
                        'everything will work fine.',
                        category=UserWarning,
                        stacklevel=2,
                    )
                elif not db_version:
                    await conn.execute(insert(VersionDb).values(version=__version__))

            except (IntegrityError, OperationalError):
                await conn.rollback()

        self._initialized = True

    async def close(self) -> None:
        """Close the database connection pool.

        Disposes the engine (if any) and clears the reference to it.
        """
        # NOTE(review): `_initialized` and `session_maker` are intentionally left untouched here to
        # preserve existing behavior; a client reused after `close()` will not re-run `initialize()`.
        if self._engine is not None:
            await self._engine.dispose()
            self._engine = None

    def create_session(self) -> AsyncSession:
        """Create a new database session.

        The session factory is created lazily on first use and cached in `session_maker`.

        Returns:
            A new AsyncSession instance.

        Raises:
            ValueError: If the session factory has to be created but no engine is available yet.
        """
        if self.session_maker is None:
            # Go through the `engine` property so a missing engine fails fast with a clear error
            # instead of producing sessions that are not bound to any engine.
            self.session_maker = async_sessionmaker(self.engine, expire_on_commit=False, autoflush=False)
        return self.session_maker()

    @override
    async def create_dataset_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> SqlDatasetClient:
        # Fall back to the global configuration and make sure the schema exists before opening.
        configuration = configuration or Configuration.get_global_configuration()
        await self.initialize(configuration)

        client = await SqlDatasetClient.open(
            id=id,
            name=name,
            alias=alias,
            storage_client=self,
        )

        await self._purge_if_needed(client, configuration)
        return client

    @override
    async def create_kvs_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> SqlKeyValueStoreClient:
        # Fall back to the global configuration and make sure the schema exists before opening.
        configuration = configuration or Configuration.get_global_configuration()
        await self.initialize(configuration)

        client = await SqlKeyValueStoreClient.open(
            id=id,
            name=name,
            alias=alias,
            storage_client=self,
        )

        await self._purge_if_needed(client, configuration)
        return client

    @override
    async def create_rq_client(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        alias: str | None = None,
        configuration: Configuration | None = None,
    ) -> SqlRequestQueueClient:
        # Fall back to the global configuration and make sure the schema exists before opening.
        configuration = configuration or Configuration.get_global_configuration()
        await self.initialize(configuration)

        client = await SqlRequestQueueClient.open(
            id=id,
            name=name,
            alias=alias,
            storage_client=self,
        )

        await self._purge_if_needed(client, configuration)
        return client

    def _get_or_create_engine(self, configuration: Configuration) -> AsyncEngine:
        """Get or create the database engine based on configuration.

        Returns the existing engine when one is set. Otherwise builds one from the stored connection
        string, or from a default SQLite file inside `configuration.storage_dir`.

        Raises:
            ValueError: If the connection string mentions neither 'sqlite' nor 'postgresql', or if
                PostgreSQL is requested on Python 3.14 or newer.
        """
        if self._engine is not None:
            return self._engine

        if self._connection_string is not None:
            connection_string = self._connection_string
        else:
            # Create SQLite database in the storage directory. `exist_ok=True` already covers the
            # "directory exists" case, so no separate (race-prone) existence check is needed.
            storage_dir = Path(configuration.storage_dir)
            storage_dir.mkdir(parents=True, exist_ok=True)

            db_path = storage_dir / self._DEFAULT_DB_NAME

            # Create connection string with path to default database
            connection_string = f'sqlite+aiosqlite:///{db_path}'

        # Substring check on the connection string; the dialect is validated again in `initialize()`.
        if 'sqlite' not in connection_string and 'postgresql' not in connection_string:
            raise ValueError(
                'Unsupported database. Supported: sqlite, postgresql. Consider using a different database.'
            )

        # TODO: https://github.com/apify/crawlee-python/issues/1555
        if 'postgresql' in connection_string and sys.version_info >= (3, 14):
            raise ValueError(
                'SqlStorageClient cannot use PostgreSQL with Python 3.14 '
                'due to asyncpg compatibility limitations. '
                'Please use Python 3.13 or earlier, or switch to SQLite.'
            )

        self._engine = create_async_engine(
            connection_string,
            future=True,
            pool_size=5,
            max_overflow=10,
            pool_timeout=30,
            pool_recycle=600,
            pool_pre_ping=True,
            echo=False,
            connect_args={'timeout': 30},
        )
        return self._engine
|
|
File without changes
|
|
@@ -20,7 +20,7 @@ class StorageMetadata(BaseModel):
|
|
|
20
20
|
It contains common fields shared across all specific storage types.
|
|
21
21
|
"""
|
|
22
22
|
|
|
23
|
-
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, extra='allow')
|
|
23
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, extra='allow', from_attributes=True)
|
|
24
24
|
|
|
25
25
|
id: Annotated[str, Field(alias='id')]
|
|
26
26
|
"""The unique identifier of the storage."""
|
|
@@ -42,7 +42,7 @@ class StorageMetadata(BaseModel):
|
|
|
42
42
|
class DatasetMetadata(StorageMetadata):
|
|
43
43
|
"""Model for a dataset metadata."""
|
|
44
44
|
|
|
45
|
-
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
|
|
45
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
|
|
46
46
|
|
|
47
47
|
item_count: Annotated[int, Field(alias='itemCount')]
|
|
48
48
|
"""The number of items in the dataset."""
|
|
@@ -52,14 +52,14 @@ class DatasetMetadata(StorageMetadata):
|
|
|
52
52
|
class KeyValueStoreMetadata(StorageMetadata):
|
|
53
53
|
"""Model for a key-value store metadata."""
|
|
54
54
|
|
|
55
|
-
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
|
|
55
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
|
|
56
56
|
|
|
57
57
|
|
|
58
58
|
@docs_group('Storage data')
|
|
59
59
|
class RequestQueueMetadata(StorageMetadata):
|
|
60
60
|
"""Model for a request queue metadata."""
|
|
61
61
|
|
|
62
|
-
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
|
|
62
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
|
|
63
63
|
|
|
64
64
|
had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')]
|
|
65
65
|
"""Indicates whether the queue has been accessed by multiple clients (consumers)."""
|
|
@@ -78,7 +78,7 @@ class RequestQueueMetadata(StorageMetadata):
|
|
|
78
78
|
class KeyValueStoreRecordMetadata(BaseModel):
|
|
79
79
|
"""Model for a key-value store record metadata."""
|
|
80
80
|
|
|
81
|
-
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
|
|
81
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
|
|
82
82
|
|
|
83
83
|
key: Annotated[str, Field(alias='key')]
|
|
84
84
|
"""The key of the record.
|
|
@@ -100,7 +100,7 @@ class KeyValueStoreRecordMetadata(BaseModel):
|
|
|
100
100
|
class KeyValueStoreRecord(KeyValueStoreRecordMetadata, Generic[KvsValueType]):
|
|
101
101
|
"""Model for a key-value store record."""
|
|
102
102
|
|
|
103
|
-
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
|
|
103
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
|
|
104
104
|
|
|
105
105
|
value: Annotated[KvsValueType, Field(alias='value')]
|
|
106
106
|
"""The value of the record."""
|
|
@@ -110,7 +110,7 @@ class KeyValueStoreRecord(KeyValueStoreRecordMetadata, Generic[KvsValueType]):
|
|
|
110
110
|
class DatasetItemsListPage(BaseModel):
|
|
111
111
|
"""Model for a single page of dataset items returned from a collection list method."""
|
|
112
112
|
|
|
113
|
-
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
|
|
113
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
|
|
114
114
|
|
|
115
115
|
count: Annotated[int, Field(default=0)]
|
|
116
116
|
"""The number of objects returned on this page."""
|
|
@@ -135,7 +135,7 @@ class DatasetItemsListPage(BaseModel):
|
|
|
135
135
|
class ProcessedRequest(BaseModel):
|
|
136
136
|
"""Represents a processed request."""
|
|
137
137
|
|
|
138
|
-
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
|
|
138
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
|
|
139
139
|
|
|
140
140
|
id: Annotated[str | None, Field(alias='requestId', default=None)] = None
|
|
141
141
|
"""Internal representation of the request by the storage client. Only some clients use id."""
|
|
@@ -149,7 +149,7 @@ class ProcessedRequest(BaseModel):
|
|
|
149
149
|
class UnprocessedRequest(BaseModel):
|
|
150
150
|
"""Represents an unprocessed request."""
|
|
151
151
|
|
|
152
|
-
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
|
|
152
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
|
|
153
153
|
|
|
154
154
|
unique_key: Annotated[str, Field(alias='uniqueKey')]
|
|
155
155
|
url: Annotated[str, BeforeValidator(validate_http_url), Field()]
|
|
@@ -165,7 +165,7 @@ class AddRequestsResponse(BaseModel):
|
|
|
165
165
|
encountered issues during processing.
|
|
166
166
|
"""
|
|
167
167
|
|
|
168
|
-
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
|
|
168
|
+
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
|
|
169
169
|
|
|
170
170
|
processed_requests: Annotated[list[ProcessedRequest], Field(alias='processedRequests')]
|
|
171
171
|
"""Successfully processed requests, including information about whether they were
|
crawlee/storages/_base.py
CHANGED
|
@@ -44,7 +44,9 @@ class Storage(ABC):
|
|
|
44
44
|
|
|
45
45
|
Args:
|
|
46
46
|
id: The storage ID.
|
|
47
|
-
name: The storage name (global scope, persists across runs).
|
|
47
|
+
name: The storage name (global scope, persists across runs). Name can only contain letters "a" through "z",
|
|
48
|
+
the digits "0" through "9", and the hyphen ("-") but only in the middle of the string
|
|
49
|
+
(e.g. "my-value-1").
|
|
48
50
|
alias: The storage alias (run scope, creates unnamed storage).
|
|
49
51
|
configuration: Configuration object used during the storage creation or restoration process.
|
|
50
52
|
storage_client: Underlying storage client to use. If not provided, the default global storage client
|
crawlee/storages/_dataset.py
CHANGED
|
@@ -12,6 +12,7 @@ from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
|
|
|
12
12
|
|
|
13
13
|
from ._base import Storage
|
|
14
14
|
from ._key_value_store import KeyValueStore
|
|
15
|
+
from ._utils import validate_storage_name
|
|
15
16
|
|
|
16
17
|
if TYPE_CHECKING:
|
|
17
18
|
from collections.abc import AsyncIterator
|
|
@@ -75,6 +76,8 @@ class Dataset(Storage):
|
|
|
75
76
|
id: The unique identifier of the storage.
|
|
76
77
|
name: The name of the storage, if available.
|
|
77
78
|
"""
|
|
79
|
+
validate_storage_name(name)
|
|
80
|
+
|
|
78
81
|
self._client = client
|
|
79
82
|
self._id = id
|
|
80
83
|
self._name = name
|
|
@@ -110,7 +113,7 @@ class Dataset(Storage):
|
|
|
110
113
|
client_opener_coro = storage_client.create_dataset_client(
|
|
111
114
|
id=id, name=name, alias=alias, configuration=configuration
|
|
112
115
|
)
|
|
113
|
-
|
|
116
|
+
storage_client_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)
|
|
114
117
|
|
|
115
118
|
return await service_locator.storage_instance_manager.open_storage_instance(
|
|
116
119
|
cls,
|
|
@@ -118,8 +121,7 @@ class Dataset(Storage):
|
|
|
118
121
|
name=name,
|
|
119
122
|
alias=alias,
|
|
120
123
|
client_opener_coro=client_opener_coro,
|
|
121
|
-
|
|
122
|
-
additional_cache_key=additional_cache_key,
|
|
124
|
+
storage_client_cache_key=storage_client_cache_key,
|
|
123
125
|
)
|
|
124
126
|
|
|
125
127
|
@override
|
|
@@ -15,6 +15,7 @@ from crawlee._utils.recoverable_state import RecoverableState
|
|
|
15
15
|
from crawlee.storage_clients.models import KeyValueStoreMetadata
|
|
16
16
|
|
|
17
17
|
from ._base import Storage
|
|
18
|
+
from ._utils import validate_storage_name
|
|
18
19
|
|
|
19
20
|
if TYPE_CHECKING:
|
|
20
21
|
from collections.abc import AsyncIterator
|
|
@@ -84,6 +85,8 @@ class KeyValueStore(Storage):
|
|
|
84
85
|
id: The unique identifier of the storage.
|
|
85
86
|
name: The name of the storage, if available.
|
|
86
87
|
"""
|
|
88
|
+
validate_storage_name(name)
|
|
89
|
+
|
|
87
90
|
self._client = client
|
|
88
91
|
self._id = id
|
|
89
92
|
self._name = name
|
|
@@ -122,16 +125,15 @@ class KeyValueStore(Storage):
|
|
|
122
125
|
client_opener_coro = storage_client.create_kvs_client(
|
|
123
126
|
id=id, name=name, alias=alias, configuration=configuration
|
|
124
127
|
)
|
|
125
|
-
additional_cache_key = storage_client.
|
|
128
|
+
additional_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)
|
|
126
129
|
|
|
127
130
|
return await service_locator.storage_instance_manager.open_storage_instance(
|
|
128
131
|
cls,
|
|
129
132
|
id=id,
|
|
130
133
|
name=name,
|
|
131
|
-
client_opener_coro=client_opener_coro,
|
|
132
134
|
alias=alias,
|
|
133
|
-
|
|
134
|
-
|
|
135
|
+
client_opener_coro=client_opener_coro,
|
|
136
|
+
storage_client_cache_key=additional_cache_key,
|
|
135
137
|
)
|
|
136
138
|
|
|
137
139
|
@override
|
|
@@ -279,11 +281,14 @@ class KeyValueStore(Storage):
|
|
|
279
281
|
if key in cache:
|
|
280
282
|
return cache[key].current_value.root
|
|
281
283
|
|
|
284
|
+
async def kvs_factory() -> KeyValueStore:
|
|
285
|
+
return self
|
|
286
|
+
|
|
282
287
|
cache[key] = recoverable_state = RecoverableState(
|
|
283
288
|
default_state=AutosavedValue(default_value),
|
|
284
|
-
persistence_enabled=True,
|
|
285
|
-
persist_state_kvs_id=self.id,
|
|
286
289
|
persist_state_key=key,
|
|
290
|
+
persistence_enabled=True,
|
|
291
|
+
persist_state_kvs_factory=kvs_factory,
|
|
287
292
|
logger=logger,
|
|
288
293
|
)
|
|
289
294
|
|
|
@@ -13,6 +13,7 @@ from crawlee._utils.wait import wait_for_all_tasks_for_finish
|
|
|
13
13
|
from crawlee.request_loaders import RequestManager
|
|
14
14
|
|
|
15
15
|
from ._base import Storage
|
|
16
|
+
from ._utils import validate_storage_name
|
|
16
17
|
|
|
17
18
|
if TYPE_CHECKING:
|
|
18
19
|
from collections.abc import Sequence
|
|
@@ -80,6 +81,8 @@ class RequestQueue(Storage, RequestManager):
|
|
|
80
81
|
id: The unique identifier of the storage.
|
|
81
82
|
name: The name of the storage, if available.
|
|
82
83
|
"""
|
|
84
|
+
validate_storage_name(name)
|
|
85
|
+
|
|
83
86
|
self._client = client
|
|
84
87
|
self._id = id
|
|
85
88
|
self._name = name
|
|
@@ -126,7 +129,7 @@ class RequestQueue(Storage, RequestManager):
|
|
|
126
129
|
storage_client = service_locator.get_storage_client() if storage_client is None else storage_client
|
|
127
130
|
|
|
128
131
|
client_opener_coro = storage_client.create_rq_client(id=id, name=name, alias=alias, configuration=configuration)
|
|
129
|
-
additional_cache_key = storage_client.
|
|
132
|
+
additional_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)
|
|
130
133
|
|
|
131
134
|
return await service_locator.storage_instance_manager.open_storage_instance(
|
|
132
135
|
cls,
|
|
@@ -134,8 +137,7 @@ class RequestQueue(Storage, RequestManager):
|
|
|
134
137
|
name=name,
|
|
135
138
|
alias=alias,
|
|
136
139
|
client_opener_coro=client_opener_coro,
|
|
137
|
-
|
|
138
|
-
additional_cache_key=additional_cache_key,
|
|
140
|
+
storage_client_cache_key=additional_cache_key,
|
|
139
141
|
)
|
|
140
142
|
|
|
141
143
|
@override
|