crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/__init__.py +2 -1
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +64 -43
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +128 -36
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +16 -7
- crawlee/_utils/system.py +30 -14
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +254 -148
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +27 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +32 -11
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +5 -5
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +62 -12
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
- crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
- crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +21 -14
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +196 -75
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
- crawlee-1.3.1b3.dist-info/RECORD +207 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- crawlee-0.6.13b15.dist-info/RECORD +0 -183
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/storage_clients/_sql/_storage_client.py
ADDED

@@ -0,0 +1,282 @@
+from __future__ import annotations
+
+import warnings
+from datetime import timedelta
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from sqlalchemy.exc import IntegrityError, OperationalError
+from sqlalchemy.ext.asyncio import AsyncEngine, async_sessionmaker, create_async_engine
+from sqlalchemy.sql import insert, select, text
+from typing_extensions import override
+
+from crawlee._utils.docs import docs_group
+from crawlee.configuration import Configuration
+from crawlee.storage_clients._base import StorageClient
+
+from ._dataset_client import SqlDatasetClient
+from ._db_models import Base, VersionDb
+from ._key_value_store_client import SqlKeyValueStoreClient
+from ._request_queue_client import SqlRequestQueueClient
+
+if TYPE_CHECKING:
+    from types import TracebackType
+
+    from sqlalchemy.ext.asyncio import AsyncSession
+
+
+@docs_group('Storage clients')
+class SqlStorageClient(StorageClient):
+    """SQL implementation of the storage client.
+
+    This storage client provides access to datasets, key-value stores, and request queues that persist data
+    to a SQL database using SQLAlchemy 2+. Each storage type uses two tables: one for metadata and one for
+    records.
+
+    The client accepts either a database connection string or a pre-configured AsyncEngine. If neither is
+    provided, it creates a default SQLite database 'crawlee.db' in the storage directory.
+
+    Database schema is automatically created during initialization. SQLite databases receive performance
+    optimizations including WAL mode and increased cache size.
+
+    Warning:
+        This is an experimental feature. The behavior and interface may change in future versions.
+    """
+
+    _DEFAULT_DB_NAME = 'crawlee.db'
+    """Default database name if not specified in connection string."""
+
+    def __init__(
+        self,
+        *,
+        connection_string: str | None = None,
+        engine: AsyncEngine | None = None,
+    ) -> None:
+        """Initialize the SQL storage client.
+
+        Args:
+            connection_string: Database connection string (e.g., "sqlite+aiosqlite:///crawlee.db").
+                If not provided, defaults to SQLite database in the storage directory.
+            engine: Pre-configured AsyncEngine instance. If provided, connection_string is ignored.
+        """
+        if engine is not None and connection_string is not None:
+            raise ValueError('Either connection_string or engine must be provided, not both.')
+
+        self._connection_string = connection_string
+        self._engine = engine
+        self._initialized = False
+        self.session_maker: None | async_sessionmaker[AsyncSession] = None
+
+        # Minimum interval to reduce database load from frequent concurrent metadata updates
+        self._accessed_modified_update_interval = timedelta(seconds=1)
+
+        # Flag needed to apply optimizations only for default database
+        self._default_flag = self._engine is None and self._connection_string is None
+        self._dialect_name: str | None = None
+
+        # Call the notification only once
+        warnings.warn(
+            'The SqlStorageClient is experimental and may change or be removed in future releases.',
+            category=UserWarning,
+            stacklevel=2,
+        )
+
+    async def __aenter__(self) -> SqlStorageClient:
+        """Async context manager entry."""
+        return self
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
+    ) -> None:
+        """Async context manager exit."""
+        await self.close()
+
+    @property
+    def engine(self) -> AsyncEngine:
+        """Get the SQLAlchemy AsyncEngine instance."""
+        if self._engine is None:
+            raise ValueError('Engine is not initialized. Call initialize() before accessing the engine.')
+        return self._engine
+
+    def get_dialect_name(self) -> str | None:
+        """Get the database dialect name."""
+        return self._dialect_name
+
+    def get_accessed_modified_update_interval(self) -> timedelta:
+        """Get the interval for accessed and modified updates."""
+        return self._accessed_modified_update_interval
+
+    async def initialize(self, configuration: Configuration) -> None:
+        """Initialize the database schema.
+
+        This method creates all necessary tables if they don't exist.
+        Should be called before using the storage client.
+        """
+        if not self._initialized:
+            engine = self._get_or_create_engine(configuration)
+            async with engine.begin() as conn:
+                self._dialect_name = engine.dialect.name
+
+                if self._dialect_name not in ('sqlite', 'postgresql'):
+                    raise ValueError(
+                        f'Unsupported database dialect: {self._dialect_name}. Supported: sqlite, postgresql. '
+                        'Consider using a different database.',
+                    )
+
+                # Create tables if they don't exist.
+                # Rollback the transaction when an exception occurs.
+                # This is likely an attempt to create a database from several parallel processes.
+                try:
+                    # Set SQLite pragmas for performance and consistency
+                    if self._default_flag:
+                        await conn.execute(text('PRAGMA journal_mode=WAL'))  # Better concurrency
+                        await conn.execute(text('PRAGMA synchronous=NORMAL'))  # Balanced safety/speed
+                        await conn.execute(text('PRAGMA cache_size=100000'))  # 100MB cache
+                        await conn.execute(text('PRAGMA temp_store=MEMORY'))  # Memory temp storage
+                        await conn.execute(text('PRAGMA mmap_size=268435456'))  # 256MB memory mapping
+                        await conn.execute(text('PRAGMA foreign_keys=ON'))  # Enforce constraints
+                        await conn.execute(text('PRAGMA busy_timeout=30000'))  # 30s busy timeout
+
+                    await conn.run_sync(Base.metadata.create_all, checkfirst=True)
+
+                    from crawlee import __version__  # noqa: PLC0415
+
+                    db_version = (await conn.execute(select(VersionDb))).scalar_one_or_none()
+
+                    # Raise an error if the new version creates breaking changes in the database schema.
+                    if db_version and db_version != __version__:
+                        warnings.warn(
+                            f'Database version {db_version} does not match library version {__version__}. '
+                            'This may lead to unexpected behavior. Drop the db if you want to make sure that '
+                            'everything will work fine.',
+                            category=UserWarning,
+                            stacklevel=2,
+                        )
+                    elif not db_version:
+                        await conn.execute(insert(VersionDb).values(version=__version__))
+
+                except (IntegrityError, OperationalError):
+                    await conn.rollback()
+
+            self._initialized = True
+
+    async def close(self) -> None:
+        """Close the database connection pool."""
+        if self._engine is not None:
+            await self._engine.dispose()
+            self._engine = None
+
+    def create_session(self) -> AsyncSession:
+        """Create a new database session.
+
+        Returns:
+            A new AsyncSession instance.
+        """
+        if self.session_maker is None:
+            self.session_maker = async_sessionmaker(self._engine, expire_on_commit=False, autoflush=False)
+        return self.session_maker()
+
+    @override
+    async def create_dataset_client(
+        self,
+        *,
+        id: str | None = None,
+        name: str | None = None,
+        alias: str | None = None,
+        configuration: Configuration | None = None,
+    ) -> SqlDatasetClient:
+        configuration = configuration or Configuration.get_global_configuration()
+        await self.initialize(configuration)
+
+        client = await SqlDatasetClient.open(
+            id=id,
+            name=name,
+            alias=alias,
+            storage_client=self,
+        )
+
+        await self._purge_if_needed(client, configuration)
+        return client
+
+    @override
+    async def create_kvs_client(
+        self,
+        *,
+        id: str | None = None,
+        name: str | None = None,
+        alias: str | None = None,
+        configuration: Configuration | None = None,
+    ) -> SqlKeyValueStoreClient:
+        configuration = configuration or Configuration.get_global_configuration()
+        await self.initialize(configuration)
+
+        client = await SqlKeyValueStoreClient.open(
+            id=id,
+            name=name,
+            alias=alias,
+            storage_client=self,
+        )
+
+        await self._purge_if_needed(client, configuration)
+        return client
+
+    @override
+    async def create_rq_client(
+        self,
+        *,
+        id: str | None = None,
+        name: str | None = None,
+        alias: str | None = None,
+        configuration: Configuration | None = None,
+    ) -> SqlRequestQueueClient:
+        configuration = configuration or Configuration.get_global_configuration()
+        await self.initialize(configuration)
+
+        client = await SqlRequestQueueClient.open(
+            id=id,
+            name=name,
+            alias=alias,
+            storage_client=self,
+        )
+
+        await self._purge_if_needed(client, configuration)
+        return client
+
+    def _get_or_create_engine(self, configuration: Configuration) -> AsyncEngine:
+        """Get or create the database engine based on configuration."""
+        if self._engine is not None:
+            return self._engine
+
+        if self._connection_string is not None:
+            connection_string = self._connection_string
+        else:
+            # Create SQLite database in the storage directory
+            storage_dir = Path(configuration.storage_dir)
+            if not storage_dir.exists():
+                storage_dir.mkdir(parents=True, exist_ok=True)
+
+            db_path = storage_dir / self._DEFAULT_DB_NAME
+
+            # Create connection string with path to default database
+            connection_string = f'sqlite+aiosqlite:///{db_path}'
+
+        if 'sqlite' not in connection_string and 'postgresql' not in connection_string:
+            raise ValueError(
+                'Unsupported database. Supported: sqlite, postgresql. Consider using a different database.'
+            )
+
+        self._engine = create_async_engine(
+            connection_string,
+            future=True,
+            pool_size=5,
+            max_overflow=10,
+            pool_timeout=30,
+            pool_recycle=600,
+            pool_pre_ping=True,
+            echo=False,
+            connect_args={'timeout': 30},
+        )
+        return self._engine
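For orientation, here is a minimal usage sketch of the new client (not part of the diff). It assumes `SqlStorageClient` is re-exported from `crawlee.storage_clients`, which the expanded `crawlee/storage_clients/__init__.py` (+16 lines) suggests:

```python
import asyncio

from crawlee.storage_clients import SqlStorageClient
from crawlee.storages import Dataset


async def main() -> None:
    # The client is experimental and emits a UserWarning on construction.
    async with SqlStorageClient(connection_string='sqlite+aiosqlite:///crawlee.db') as storage_client:
        # Storage.open() accepts the storage client directly (see the storages diffs below).
        dataset = await Dataset.open(storage_client=storage_client)
        await dataset.push_data({'url': 'https://example.com'})


asyncio.run(main())
```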
crawlee/storage_clients/_sql/py.typed
File without changes

crawlee/storage_clients/models.py
CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from datetime import datetime
-from typing import Annotated, Any, Generic
+from typing import TYPE_CHECKING, Annotated, Any, Generic
 
 from pydantic import BaseModel, BeforeValidator, ConfigDict, Field
 from typing_extensions import TypeVar
@@ -20,7 +20,7 @@ class StorageMetadata(BaseModel):
     It contains common fields shared across all specific storage types.
     """
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, extra='allow', from_attributes=True)
 
     id: Annotated[str, Field(alias='id')]
     """The unique identifier of the storage."""
@@ -42,7 +42,7 @@ class StorageMetadata(BaseModel):
 class DatasetMetadata(StorageMetadata):
     """Model for a dataset metadata."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
 
     item_count: Annotated[int, Field(alias='itemCount')]
     """The number of items in the dataset."""
@@ -52,14 +52,14 @@ class DatasetMetadata(StorageMetadata):
 class KeyValueStoreMetadata(StorageMetadata):
     """Model for a key-value store metadata."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
 
 
 @docs_group('Storage data')
 class RequestQueueMetadata(StorageMetadata):
     """Model for a request queue metadata."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
 
     had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')]
     """Indicates whether the queue has been accessed by multiple clients (consumers)."""
@@ -78,7 +78,7 @@ class RequestQueueMetadata(StorageMetadata):
 class KeyValueStoreRecordMetadata(BaseModel):
     """Model for a key-value store record metadata."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
 
     key: Annotated[str, Field(alias='key')]
     """The key of the record.
@@ -100,7 +100,7 @@ class KeyValueStoreRecordMetadata(BaseModel):
 class KeyValueStoreRecord(KeyValueStoreRecordMetadata, Generic[KvsValueType]):
     """Model for a key-value store record."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
 
     value: Annotated[KvsValueType, Field(alias='value')]
     """The value of the record."""
@@ -110,7 +110,7 @@ class KeyValueStoreRecord(KeyValueStoreRecordMetadata, Generic[KvsValueType]):
 class DatasetItemsListPage(BaseModel):
     """Model for a single page of dataset items returned from a collection list method."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
 
     count: Annotated[int, Field(default=0)]
     """The number of objects returned on this page."""
@@ -127,17 +127,24 @@ class DatasetItemsListPage(BaseModel):
     desc: Annotated[bool, Field(default=False)]
     """Indicates if the returned list is in descending order."""
 
-    items: Annotated[list[dict], Field(default_factory=list)]
-    """The list of dataset items returned on this page."""
+    # Workaround for Pydantic and type checkers when using Annotated with default_factory
+    if TYPE_CHECKING:
+        items: list[dict] = []
+        """The list of dataset items returned on this page."""
+    else:
+        items: Annotated[list[dict], Field(default_factory=list)]
+        """The list of dataset items returned on this page."""
 
 
 @docs_group('Storage data')
 class ProcessedRequest(BaseModel):
     """Represents a processed request."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
+
+    id: Annotated[str | None, Field(alias='requestId', default=None)] = None
+    """Internal representation of the request by the storage client. Only some clients use id."""
 
-    id: Annotated[str, Field(alias='requestId')]
     unique_key: Annotated[str, Field(alias='uniqueKey')]
     was_already_present: Annotated[bool, Field(alias='wasAlreadyPresent')]
     was_already_handled: Annotated[bool, Field(alias='wasAlreadyHandled')]
@@ -147,7 +154,7 @@ class ProcessedRequest(BaseModel):
 class UnprocessedRequest(BaseModel):
     """Represents an unprocessed request."""
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
 
     unique_key: Annotated[str, Field(alias='uniqueKey')]
     url: Annotated[str, BeforeValidator(validate_http_url), Field()]
@@ -163,7 +170,7 @@ class AddRequestsResponse(BaseModel):
     encountered issues during processing.
     """
 
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, from_attributes=True)
 
     processed_requests: Annotated[list[ProcessedRequest], Field(alias='processedRequests')]
     """Successfully processed requests, including information about whether they were
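The repeated `model_config` change is the Pydantic v2.11+ idiom: `validate_by_name` and `validate_by_alias` replace the older `populate_by_name`, and `from_attributes=True` lets the models be validated directly from attribute-bearing objects such as the SQLAlchemy rows used by the new SQL storage client. A small sketch of what this permits (not part of the diff):

```python
from crawlee.storage_clients.models import ProcessedRequest

# The camelCase API aliases validate...
by_alias = ProcessedRequest.model_validate(
    {'uniqueKey': 'https://example.com', 'wasAlreadyPresent': False, 'wasAlreadyHandled': False}
)

# ...and so do the snake_case field names; the now-optional `id` can be omitted entirely.
by_name = ProcessedRequest(unique_key='https://example.com', was_already_present=False, was_already_handled=False)

assert by_alias == by_name
```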
crawlee/storages/_base.py
CHANGED
@@ -36,6 +36,7 @@ class Storage(ABC):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
         storage_client: StorageClient | None = None,
     ) -> Storage:
@@ -43,7 +44,10 @@ class Storage(ABC):
 
         Args:
             id: The storage ID.
-            name: The storage name.
+            name: The storage name (global scope, persists across runs). Name can only contain letters "a" through "z",
+                the digits "0" through "9", and the hyphen ("-") but only in the middle of the string
+                (e.g. "my-value-1").
+            alias: The storage alias (run scope, creates unnamed storage).
             configuration: Configuration object used during the storage creation or restoration process.
             storage_client: Underlying storage client to use. If not provided, the default global storage client
                 from the service locator will be used.
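A sketch of the name/alias distinction documented above, using the key-value store as an example (not part of the diff):

```python
from crawlee.storages import KeyValueStore


async def example() -> None:
    # A named store is global: the name must follow the rules above and the
    # storage persists across runs.
    reports = await KeyValueStore.open(name='my-reports-1')

    # An alias store is scoped to the current run; it is created as an unnamed storage.
    scratch = await KeyValueStore.open(alias='scratch')

    assert reports is not scratch
```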
crawlee/storages/_dataset.py
CHANGED
@@ -12,6 +12,7 @@ from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
 
 from ._base import Storage
 from ._key_value_store import KeyValueStore
+from ._utils import validate_storage_name
 
 if TYPE_CHECKING:
     from collections.abc import AsyncIterator
@@ -75,6 +76,8 @@ class Dataset(Storage):
             id: The unique identifier of the storage.
             name: The name of the storage, if available.
         """
+        validate_storage_name(name)
+
         self._client = client
         self._id = id
         self._name = name
@@ -100,18 +103,25 @@ class Dataset(Storage):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
         storage_client: StorageClient | None = None,
     ) -> Dataset:
         configuration = service_locator.get_configuration() if configuration is None else configuration
         storage_client = service_locator.get_storage_client() if storage_client is None else storage_client
 
+        client_opener_coro = storage_client.create_dataset_client(
+            id=id, name=name, alias=alias, configuration=configuration
+        )
+        storage_client_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)
+
         return await service_locator.storage_instance_manager.open_storage_instance(
             cls,
             id=id,
             name=name,
-
-
+            alias=alias,
+            client_opener_coro=client_opener_coro,
+            storage_client_cache_key=storage_client_cache_key,
         )
 
     @override
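`validate_storage_name` lives in the new `crawlee/storages/_utils.py` (+11 lines, not shown in this excerpt). A hypothetical sketch of the rule described in the `_base.py` docstring above; the real helper may differ in pattern and message:

```python
import re

# Hypothetical reconstruction of crawlee/storages/_utils.py.
_STORAGE_NAME_RE = re.compile(r'^[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?$')


def validate_storage_name(name: str | None) -> None:
    # None is allowed: unnamed (default or alias-scoped) storages skip the check.
    if name is not None and not _STORAGE_NAME_RE.fullmatch(name):
        raise ValueError(
            f'Invalid storage name "{name}": names may contain only letters, digits, '
            'and hyphens, and a hyphen may not appear at the start or end.'
        )
```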
crawlee/storages/_key_value_store.py
CHANGED
@@ -15,6 +15,7 @@ from crawlee._utils.recoverable_state import RecoverableState
 from crawlee.storage_clients.models import KeyValueStoreMetadata
 
 from ._base import Storage
+from ._utils import validate_storage_name
 
 if TYPE_CHECKING:
     from collections.abc import AsyncIterator
@@ -84,6 +85,8 @@ class KeyValueStore(Storage):
             id: The unique identifier of the storage.
             name: The name of the storage, if available.
         """
+        validate_storage_name(name)
+
         self._client = client
         self._id = id
         self._name = name
@@ -112,18 +115,25 @@ class KeyValueStore(Storage):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
         storage_client: StorageClient | None = None,
     ) -> KeyValueStore:
         configuration = service_locator.get_configuration() if configuration is None else configuration
         storage_client = service_locator.get_storage_client() if storage_client is None else storage_client
 
+        client_opener_coro = storage_client.create_kvs_client(
+            id=id, name=name, alias=alias, configuration=configuration
+        )
+        additional_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)
+
         return await service_locator.storage_instance_manager.open_storage_instance(
             cls,
             id=id,
             name=name,
-
-
+            alias=alias,
+            client_opener_coro=client_opener_coro,
+            storage_client_cache_key=additional_cache_key,
         )
 
     @override
@@ -271,11 +281,14 @@ class KeyValueStore(Storage):
         if key in cache:
             return cache[key].current_value.root
 
+        async def kvs_factory() -> KeyValueStore:
+            return self
+
         cache[key] = recoverable_state = RecoverableState(
             default_state=AutosavedValue(default_value),
-            persistence_enabled=True,
-            persist_state_kvs_id=self.id,
             persist_state_key=key,
+            persistence_enabled=True,
+            persist_state_kvs_factory=kvs_factory,
             logger=logger,
         )
 
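The change above swaps `persist_state_kvs_id` for a `persist_state_kvs_factory`, so the auto-saved value is persisted through this exact `KeyValueStore` instance instead of re-opening the store by id. The public API is unchanged; a usage sketch (not part of the diff):

```python
from crawlee.storages import KeyValueStore


async def example() -> None:
    kvs = await KeyValueStore.open()

    # Returns a mutable mapping that crawlee periodically persists back into
    # this same store under the given key.
    state = await kvs.get_auto_saved_value('my-state', default_value={'counter': 0})
    state['counter'] += 1
```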
crawlee/storages/_request_queue.py
CHANGED
@@ -13,6 +13,7 @@ from crawlee._utils.wait import wait_for_all_tasks_for_finish
 from crawlee.request_loaders import RequestManager
 
 from ._base import Storage
+from ._utils import validate_storage_name
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -80,6 +81,8 @@ class RequestQueue(Storage, RequestManager):
             id: The unique identifier of the storage.
             name: The name of the storage, if available.
         """
+        validate_storage_name(name)
+
         self._client = client
         self._id = id
         self._name = name
@@ -118,18 +121,23 @@ class RequestQueue(Storage, RequestManager):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
         configuration: Configuration | None = None,
         storage_client: StorageClient | None = None,
     ) -> RequestQueue:
         configuration = service_locator.get_configuration() if configuration is None else configuration
         storage_client = service_locator.get_storage_client() if storage_client is None else storage_client
 
+        client_opener_coro = storage_client.create_rq_client(id=id, name=name, alias=alias, configuration=configuration)
+        additional_cache_key = storage_client.get_storage_client_cache_key(configuration=configuration)
+
         return await service_locator.storage_instance_manager.open_storage_instance(
             cls,
             id=id,
             name=name,
-
-
+            alias=alias,
+            client_opener_coro=client_opener_coro,
+            storage_client_cache_key=additional_cache_key,
        )
 
     @override
@@ -223,16 +231,16 @@ class RequestQueue(Storage, RequestManager):
         """
         return await self._client.fetch_next_request()
 
-    async def get_request(self,
+    async def get_request(self, unique_key: str) -> Request | None:
         """Retrieve a specific request from the queue by its ID.
 
         Args:
-
+            unique_key: Unique key of the request to retrieve.
 
         Returns:
             The request with the specified ID, or `None` if no such request exists.
         """
-        return await self._client.get_request(
+        return await self._client.get_request(unique_key)
 
     async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | None:
         """Mark a request as handled after successful processing.