crawlee 0.6.13b17__py3-none-any.whl → 1.1.2b7__py3-none-any.whl
This diff covers the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published.
Potentially problematic release: this version of crawlee might be problematic.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +35 -33
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +106 -34
- crawlee/_utils/context.py +2 -2
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +17 -1
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +4 -2
- crawlee/_utils/system.py +3 -3
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +50 -12
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +219 -126
- crawlee/crawlers/_basic/_logging_utils.py +5 -1
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +60 -11
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/events/_event_manager.py +4 -4
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/sessions/_models.py +2 -2
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +43 -4
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
- crawlee/storage_clients/_file_system/_key_value_store_client.py +30 -26
- crawlee/storage_clients/_file_system/_request_queue_client.py +169 -153
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +13 -11
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +133 -71
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +18 -6
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +101 -78
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
crawlee/storage_clients/_sql/_client_mixin.py (new file)

@@ -0,0 +1,385 @@

```python
from __future__ import annotations

from abc import ABC, abstractmethod
from contextlib import asynccontextmanager
from datetime import datetime, timezone
from logging import getLogger
from typing import TYPE_CHECKING, Any, ClassVar, TypedDict, cast, overload

from sqlalchemy import delete, select, text, update
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.dialects.sqlite import insert as lite_insert
from sqlalchemy.exc import SQLAlchemyError

from crawlee._utils.crypto import crypto_random_object_id

if TYPE_CHECKING:
    from collections.abc import AsyncIterator

    from sqlalchemy import Insert
    from sqlalchemy.ext.asyncio import AsyncSession
    from sqlalchemy.orm import DeclarativeBase
    from typing_extensions import NotRequired, Self

    from crawlee.storage_clients.models import DatasetMetadata, KeyValueStoreMetadata, RequestQueueMetadata

    from ._db_models import (
        DatasetItemDb,
        DatasetMetadataDb,
        KeyValueStoreMetadataDb,
        KeyValueStoreRecordDb,
        RequestDb,
        RequestQueueMetadataDb,
    )
    from ._storage_client import SqlStorageClient


logger = getLogger(__name__)


class MetadataUpdateParams(TypedDict, total=False):
    """Parameters for updating metadata."""

    update_accessed_at: NotRequired[bool]
    update_modified_at: NotRequired[bool]
    force: NotRequired[bool]


class SqlClientMixin(ABC):
    """Mixin class for SQL clients.

    This mixin provides common SQL operations and basic methods for SQL storage clients.
    """

    _DEFAULT_NAME: ClassVar[str]
    """Default name when none provided."""

    _METADATA_TABLE: ClassVar[type[DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb]]
    """SQLAlchemy model for metadata."""

    _ITEM_TABLE: ClassVar[type[DatasetItemDb | KeyValueStoreRecordDb | RequestDb]]
    """SQLAlchemy model for items."""

    _CLIENT_TYPE: ClassVar[str]
    """Human-readable client type for error messages."""

    def __init__(self, *, id: str, storage_client: SqlStorageClient) -> None:
        self._id = id
        self._storage_client = storage_client

        # Time tracking to reduce database writes during frequent operation
        self._accessed_at_allow_update_after: datetime | None = None
        self._modified_at_allow_update_after: datetime | None = None
        self._accessed_modified_update_interval = storage_client.get_accessed_modified_update_interval()

    @classmethod
    async def _open(
        cls,
        *,
        id: str | None,
        name: str | None,
        internal_name: str,
        storage_client: SqlStorageClient,
        metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata],
        session: AsyncSession,
        extra_metadata_fields: dict[str, Any],
    ) -> Self:
        """Open existing storage or create new one.

        Internal method used by _safely_open.

        Args:
            id: Storage ID to open (takes precedence over name).
            name: The name of the storage.
            internal_name: The database name for the storage based on name or alias.
            storage_client: SQL storage client instance.
            metadata_model: Pydantic model for metadata validation.
            session: Active database session.
            extra_metadata_fields: Storage-specific metadata fields.
        """
        orm_metadata: DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb | None = None
        if id:
            orm_metadata = await session.get(cls._METADATA_TABLE, id)
            if not orm_metadata:
                raise ValueError(f'{cls._CLIENT_TYPE} with ID "{id}" not found.')
        else:
            stmt = select(cls._METADATA_TABLE).where(cls._METADATA_TABLE.internal_name == internal_name)
            result = await session.execute(stmt)
            orm_metadata = result.scalar_one_or_none()  # type: ignore[assignment]

        if orm_metadata:
            client = cls(id=orm_metadata.id, storage_client=storage_client)
            await client._update_metadata(session, update_accessed_at=True)
        else:
            now = datetime.now(timezone.utc)
            metadata = metadata_model(
                id=crypto_random_object_id(),
                name=name,
                created_at=now,
                accessed_at=now,
                modified_at=now,
                **extra_metadata_fields,
            )
            client = cls(id=metadata.id, storage_client=storage_client)
            client._accessed_at_allow_update_after = now + client._accessed_modified_update_interval
            client._modified_at_allow_update_after = now + client._accessed_modified_update_interval
            session.add(cls._METADATA_TABLE(**metadata.model_dump(), internal_name=internal_name))

        return client

    @classmethod
    async def _safely_open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None = None,
        storage_client: SqlStorageClient,
        metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata],
        extra_metadata_fields: dict[str, Any],
    ) -> Self:
        """Safely open storage with transaction handling.

        Args:
            id: Storage ID to open (takes precedence over name).
            name: The name of the storage for named (global scope) storages.
            alias: The alias of the storage for unnamed (run scope) storages.
            storage_client: SQL storage client instance.
            client_class: Concrete client class to instantiate.
            metadata_model: Pydantic model for metadata validation.
            extra_metadata_fields: Storage-specific metadata fields.
        """
        # Validate input parameters.
        specified_params = sum(1 for param in [id, name, alias] if param is not None)
        if specified_params > 1:
            raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')

        internal_name = name or alias or cls._DEFAULT_NAME

        async with storage_client.create_session() as session:
            try:
                client = await cls._open(
                    id=id,
                    name=name,
                    internal_name=internal_name,
                    storage_client=storage_client,
                    metadata_model=metadata_model,
                    session=session,
                    extra_metadata_fields=extra_metadata_fields,
                )
                await session.commit()
            except SQLAlchemyError:
                await session.rollback()

                stmt = select(cls._METADATA_TABLE).where(cls._METADATA_TABLE.internal_name == internal_name)
                result = await session.execute(stmt)
                orm_metadata: DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb | None
                orm_metadata = cast(
                    'DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb | None',
                    result.scalar_one_or_none(),
                )

                if not orm_metadata:
                    raise ValueError(f'{cls._CLIENT_TYPE} with Name "{internal_name}" not found.') from None

                client = cls(id=orm_metadata.id, storage_client=storage_client)

        return client

    @asynccontextmanager
    async def get_session(self, *, with_simple_commit: bool = False) -> AsyncIterator[AsyncSession]:
        """Create a new SQLAlchemy session for this storage."""
        async with self._storage_client.create_session() as session:
            # For operations where a final commit is mandatory and does not require specific processing conditions
            if with_simple_commit:
                try:
                    yield session
                    await session.commit()
                except SQLAlchemyError as e:
                    logger.warning(f'Error occurred during session transaction: {e}')
                    await session.rollback()
            else:
                yield session

    def _build_insert_stmt_with_ignore(
        self, table_model: type[DeclarativeBase], insert_values: dict[str, Any] | list[dict[str, Any]]
    ) -> Insert:
        """Build an insert statement with ignore for the SQL dialect.

        Args:
            table_model: SQLAlchemy table model.
            insert_values: Single dict or list of dicts to insert.
        """
        if isinstance(insert_values, dict):
            insert_values = [insert_values]

        dialect = self._storage_client.get_dialect_name()

        if dialect == 'postgresql':
            return pg_insert(table_model).values(insert_values).on_conflict_do_nothing()

        if dialect == 'sqlite':
            return lite_insert(table_model).values(insert_values).on_conflict_do_nothing()

        raise NotImplementedError(f'Insert with ignore not supported for dialect: {dialect}')

    def _build_upsert_stmt(
        self,
        table_model: type[DeclarativeBase],
        insert_values: dict[str, Any] | list[dict[str, Any]],
        update_columns: list[str],
        conflict_cols: list[str] | None = None,
    ) -> Insert:
        """Build an upsert statement for the SQL dialect.

        Args:
            table_model: SQLAlchemy table model.
            insert_values: Single dict or list of dicts to upsert.
            update_columns: Column names to update on conflict.
            conflict_cols: Column names that define uniqueness (for PostgreSQL/SQLite).

        """
        if isinstance(insert_values, dict):
            insert_values = [insert_values]

        dialect = self._storage_client.get_dialect_name()

        if dialect == 'postgresql':
            pg_stmt = pg_insert(table_model).values(insert_values)
            set_ = {col: getattr(pg_stmt.excluded, col) for col in update_columns}
            return pg_stmt.on_conflict_do_update(index_elements=conflict_cols, set_=set_)

        if dialect == 'sqlite':
            lite_stmt = lite_insert(table_model).values(insert_values)
            set_ = {col: getattr(lite_stmt.excluded, col) for col in update_columns}
            return lite_stmt.on_conflict_do_update(index_elements=conflict_cols, set_=set_)

        raise NotImplementedError(f'Upsert not supported for dialect: {dialect}')

    async def _purge(self, metadata_kwargs: MetadataUpdateParams) -> None:
        """Drop all items in storage and update metadata.

        Args:
            metadata_kwargs: Arguments to pass to _update_metadata.
        """
        stmt = delete(self._ITEM_TABLE).where(self._ITEM_TABLE.storage_id == self._id)
        async with self.get_session(with_simple_commit=True) as session:
            await session.execute(stmt)
            await self._update_metadata(session, **metadata_kwargs)

    async def _drop(self) -> None:
        """Delete this storage and all its data.

        This operation is irreversible. Uses CASCADE deletion to remove all related items.
        """
        stmt = delete(self._METADATA_TABLE).where(self._METADATA_TABLE.id == self._id)
        async with self.get_session(with_simple_commit=True) as session:
            if self._storage_client.get_dialect_name() == 'sqlite':
                # foreign_keys=ON is set at the connection level. Required for cascade deletion.
                await session.execute(text('PRAGMA foreign_keys=ON'))
            await session.execute(stmt)

    @overload
    async def _get_metadata(self, metadata_model: type[DatasetMetadata]) -> DatasetMetadata: ...
    @overload
    async def _get_metadata(self, metadata_model: type[KeyValueStoreMetadata]) -> KeyValueStoreMetadata: ...
    @overload
    async def _get_metadata(self, metadata_model: type[RequestQueueMetadata]) -> RequestQueueMetadata: ...

    async def _get_metadata(
        self, metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata]
    ) -> DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata:
        """Retrieve client metadata."""
        async with self.get_session() as session:
            orm_metadata = await session.get(self._METADATA_TABLE, self._id)
            if not orm_metadata:
                raise ValueError(f'{self._CLIENT_TYPE} with ID "{self._id}" not found.')

            return metadata_model.model_validate(orm_metadata)

    def _default_update_metadata(
        self, *, update_accessed_at: bool = False, update_modified_at: bool = False, force: bool = False
    ) -> dict[str, Any]:
        """Prepare common metadata updates with rate limiting.

        Args:
            update_accessed_at: Whether to update accessed_at timestamp.
            update_modified_at: Whether to update modified_at timestamp.
            force: Whether to force the update regardless of rate limiting.
        """
        values_to_set: dict[str, Any] = {}
        now = datetime.now(timezone.utc)

        # If the record must be updated (for example, when updating counters), we update timestamps and shift the time.
        if force:
            if update_modified_at:
                values_to_set['modified_at'] = now
                self._modified_at_allow_update_after = now + self._accessed_modified_update_interval
            if update_accessed_at:
                values_to_set['accessed_at'] = now
                self._accessed_at_allow_update_after = now + self._accessed_modified_update_interval

        elif update_modified_at and (
            self._modified_at_allow_update_after is None or now >= self._modified_at_allow_update_after
        ):
            values_to_set['modified_at'] = now
            self._modified_at_allow_update_after = now + self._accessed_modified_update_interval
            # The record will be updated, we can update `accessed_at` and shift the time.
            if update_accessed_at:
                values_to_set['accessed_at'] = now
                self._accessed_at_allow_update_after = now + self._accessed_modified_update_interval

        elif update_accessed_at and (
            self._accessed_at_allow_update_after is None or now >= self._accessed_at_allow_update_after
        ):
            values_to_set['accessed_at'] = now
            self._accessed_at_allow_update_after = now + self._accessed_modified_update_interval

        return values_to_set

    @abstractmethod
    def _specific_update_metadata(self, **kwargs: Any) -> dict[str, Any]:
        """Prepare storage-specific metadata updates.

        Must be implemented by concrete classes.

        Args:
            **kwargs: Storage-specific update parameters.
        """

    async def _update_metadata(
        self,
        session: AsyncSession,
        *,
        update_accessed_at: bool = False,
        update_modified_at: bool = False,
        force: bool = False,
        **kwargs: Any,
    ) -> bool:
        """Update storage metadata combining common and specific fields.

        Args:
            session: Active database session.
            update_accessed_at: Whether to update accessed_at timestamp.
            update_modified_at: Whether to update modified_at timestamp.
            force: Whether to force the update timestamps regardless of rate limiting.
            **kwargs: Additional arguments for _specific_update_metadata.

        Returns:
            True if any updates were made, False otherwise
        """
        values_to_set = self._default_update_metadata(
            update_accessed_at=update_accessed_at, update_modified_at=update_modified_at, force=force
        )

        values_to_set.update(self._specific_update_metadata(**kwargs))

        if values_to_set:
            if (stmt := values_to_set.pop('custom_stmt', None)) is None:
                stmt = update(self._METADATA_TABLE).where(self._METADATA_TABLE.id == self._id)

            stmt = stmt.values(**values_to_set)
            await session.execute(stmt)
            return True

        return False
```
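The mixin picks the dialect-specific `insert` construct at runtime, so "insert or ignore" and upsert statements work the same way on SQLite and PostgreSQL. As a hedged illustration of the pattern `_build_upsert_stmt` wraps (not part of the diff; the `kv_records` table below is a hypothetical stand-in for the real `_db_models` tables):

```python
# Minimal sketch of the SQLite-dialect upsert that SqlClientMixin._build_upsert_stmt builds.
from sqlalchemy import Column, MetaData, String, Table
from sqlalchemy.dialects import sqlite
from sqlalchemy.dialects.sqlite import insert as lite_insert

metadata = MetaData()
kv_records = Table(  # hypothetical table standing in for the diff's ORM models
    'kv_records',
    metadata,
    Column('key', String, primary_key=True),
    Column('value', String),
)

stmt = lite_insert(kv_records).values([{'key': 'a', 'value': '1'}])
# On a primary-key conflict, overwrite only the listed columns with the incoming
# ("excluded") row values, the same mapping the mixin derives from update_columns.
stmt = stmt.on_conflict_do_update(
    index_elements=['key'],
    set_={'value': stmt.excluded.value},
)
print(stmt.compile(dialect=sqlite.dialect()))
# roughly: INSERT INTO kv_records (key, value) VALUES (?, ?)
#          ON CONFLICT (key) DO UPDATE SET value = excluded.value
```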
crawlee/storage_clients/_sql/_dataset_client.py (new file)

@@ -0,0 +1,310 @@

```python
from __future__ import annotations

from logging import getLogger
from typing import TYPE_CHECKING, Any

from sqlalchemy import Select, insert, select
from typing_extensions import Self, override

from crawlee.storage_clients._base import DatasetClient
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata

from ._client_mixin import MetadataUpdateParams, SqlClientMixin
from ._db_models import DatasetItemDb, DatasetMetadataDb

if TYPE_CHECKING:
    from collections.abc import AsyncIterator

    from sqlalchemy import Select
    from typing_extensions import NotRequired

    from ._storage_client import SqlStorageClient


logger = getLogger(__name__)


class _DatasetMetadataUpdateParams(MetadataUpdateParams):
    """Parameters for updating dataset metadata."""

    new_item_count: NotRequired[int]
    delta_item_count: NotRequired[int]


class SqlDatasetClient(DatasetClient, SqlClientMixin):
    """SQL implementation of the dataset client.

    This client persists dataset items to a SQL database using two tables for storage
    and retrieval. Items are stored as JSON with automatic ordering preservation.

    The dataset data is stored in SQL database tables following the pattern:
    - `datasets` table: Contains dataset metadata (id, name, timestamps, item_count)
    - `dataset_records` table: Contains individual items with JSON data and auto-increment ordering

    Items are stored as a JSON object in SQLite and as JSONB in PostgreSQL. These objects must be JSON-serializable.
    The `item_id` auto-increment primary key ensures insertion order is preserved.
    All operations are wrapped in database transactions with CASCADE deletion support.
    """

    _DEFAULT_NAME = 'default'
    """Default dataset name used when no name is provided."""

    _METADATA_TABLE = DatasetMetadataDb
    """SQLAlchemy model for dataset metadata."""

    _ITEM_TABLE = DatasetItemDb
    """SQLAlchemy model for dataset items."""

    _CLIENT_TYPE = 'Dataset'
    """Human-readable client type for error messages."""

    def __init__(
        self,
        *,
        id: str,
        storage_client: SqlStorageClient,
    ) -> None:
        """Initialize a new instance.

        Preferably use the `SqlDatasetClient.open` class method to create a new instance.
        """
        super().__init__(id=id, storage_client=storage_client)

    @classmethod
    async def open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
        storage_client: SqlStorageClient,
    ) -> Self:
        """Open an existing dataset or create a new one.

        Args:
            id: The ID of the dataset to open. If provided, searches for existing dataset by ID.
            name: The name of the dataset for named (global scope) storages.
            alias: The alias of the dataset for unnamed (run scope) storages.
            storage_client: The SQL storage client instance.

        Returns:
            An instance for the opened or created storage client.

        Raises:
            ValueError: If a dataset with the specified ID is not found.
        """
        return await cls._safely_open(
            id=id,
            name=name,
            alias=alias,
            storage_client=storage_client,
            metadata_model=DatasetMetadata,
            extra_metadata_fields={'item_count': 0},
        )

    @override
    async def get_metadata(self) -> DatasetMetadata:
        # The database is a single place of truth
        return await self._get_metadata(DatasetMetadata)

    @override
    async def drop(self) -> None:
        """Delete this dataset and all its items from the database.

        This operation is irreversible. Uses CASCADE deletion to remove all related items.
        """
        await self._drop()

    @override
    async def purge(self) -> None:
        """Remove all items from this dataset while keeping the dataset structure.

        Resets item_count to 0 and deletes all records from dataset_records table.
        """
        await self._purge(
            metadata_kwargs=_DatasetMetadataUpdateParams(
                new_item_count=0,
                update_accessed_at=True,
                update_modified_at=True,
                force=True,
            )
        )

    @override
    async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
        if not isinstance(data, list):
            data = [data]

        db_items: list[dict[str, Any]] = []
        db_items = [{'dataset_id': self._id, 'data': item} for item in data]
        stmt = insert(self._ITEM_TABLE).values(db_items)

        async with self.get_session(with_simple_commit=True) as session:
            await session.execute(stmt)

            await self._update_metadata(
                session,
                **_DatasetMetadataUpdateParams(
                    update_accessed_at=True,
                    update_modified_at=True,
                    delta_item_count=len(data),
                    new_item_count=len(data),
                    force=True,
                ),
            )

    @override
    async def get_data(
        self,
        *,
        offset: int = 0,
        limit: int | None = 999_999_999_999,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
        flatten: list[str] | None = None,
        view: str | None = None,
    ) -> DatasetItemsListPage:
        stmt = self._prepare_get_stmt(
            offset=offset,
            limit=limit,
            clean=clean,
            desc=desc,
            fields=fields,
            omit=omit,
            unwind=unwind,
            skip_empty=skip_empty,
            skip_hidden=skip_hidden,
            flatten=flatten,
            view=view,
        )

        async with self.get_session() as session:
            result = await session.execute(stmt)
            db_items = result.scalars().all()

            updated = await self._update_metadata(session, **_DatasetMetadataUpdateParams(update_accessed_at=True))

            # Commit updates to the metadata
            if updated:
                await session.commit()

        items = [db_item.data for db_item in db_items]
        metadata = await self.get_metadata()
        return DatasetItemsListPage(
            items=items,
            count=len(items),
            desc=desc,
            limit=limit or 0,
            offset=offset or 0,
            total=metadata.item_count,
        )

    @override
    async def iterate_items(
        self,
        *,
        offset: int = 0,
        limit: int | None = None,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
    ) -> AsyncIterator[dict[str, Any]]:
        stmt = self._prepare_get_stmt(
            offset=offset,
            limit=limit,
            clean=clean,
            desc=desc,
            fields=fields,
            omit=omit,
            unwind=unwind,
            skip_empty=skip_empty,
            skip_hidden=skip_hidden,
        )

        async with self.get_session() as session:
            db_items = await session.stream_scalars(stmt)

            async for db_item in db_items:
                yield db_item.data

            updated = await self._update_metadata(session, **_DatasetMetadataUpdateParams(update_accessed_at=True))

            # Commit updates to the metadata
            if updated:
                await session.commit()

    def _prepare_get_stmt(
        self,
        *,
        offset: int = 0,
        limit: int | None = 999_999_999_999,
        clean: bool = False,
        desc: bool = False,
        fields: list[str] | None = None,
        omit: list[str] | None = None,
        unwind: list[str] | None = None,
        skip_empty: bool = False,
        skip_hidden: bool = False,
        flatten: list[str] | None = None,
        view: str | None = None,
    ) -> Select:
        # Check for unsupported arguments and log a warning if found.
        unsupported_args: dict[str, Any] = {
            'clean': clean,
            'fields': fields,
            'omit': omit,
            'unwind': unwind,
            'skip_hidden': skip_hidden,
            'flatten': flatten,
            'view': view,
        }
        unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}

        if unsupported:
            logger.warning(
                f'The arguments {list(unsupported.keys())} of get_data are not supported by the '
                f'{self.__class__.__name__} client.'
            )

        stmt = select(self._ITEM_TABLE).where(self._ITEM_TABLE.dataset_id == self._id)

        if skip_empty:
            # Skip items that are empty JSON objects
            stmt = stmt.where(self._ITEM_TABLE.data != {})

        # Apply ordering by insertion order (item_id)
        stmt = stmt.order_by(self._ITEM_TABLE.item_id.desc()) if desc else stmt.order_by(self._ITEM_TABLE.item_id.asc())

        return stmt.offset(offset).limit(limit)

    def _specific_update_metadata(
        self,
        new_item_count: int | None = None,
        delta_item_count: int | None = None,
        **_kwargs: dict[str, Any],
    ) -> dict[str, Any]:
        """Update the dataset metadata in the database.

        Args:
            session: The SQLAlchemy AsyncSession to use for the update.
            new_item_count: If provided, set item count to this value.
            delta_item_count: If provided, add this value to the current item count.
        """
        values_to_set: dict[str, Any] = {}

        if new_item_count is not None:
            values_to_set['item_count'] = new_item_count
        elif delta_item_count:
            # Use database-level for atomic updates
            values_to_set['item_count'] = self._METADATA_TABLE.item_count + delta_item_count

        return values_to_set
```
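For orientation, a hedged usage sketch of the new dataset client, based only on the signatures visible in this diff. The public import of `SqlStorageClient`, its `connection_string` argument, and any schema-initialization step it may require are assumptions rather than facts taken from the diff:

```python
# Hedged sketch: open an aliased dataset through the new SQL clients and read it back.
import asyncio

from crawlee.storage_clients import SqlStorageClient  # import path assumed
from crawlee.storage_clients._sql import SqlDatasetClient  # internal module added by this diff


async def main() -> None:
    # Assumption: the storage client accepts a SQLAlchemy-style connection string and
    # creates the `datasets` / `dataset_records` tables on first use.
    storage_client = SqlStorageClient(connection_string='sqlite+aiosqlite:///crawlee.db')

    # Per the diff, at most one of `id`, `name`, or `alias` may be set.
    dataset = await SqlDatasetClient.open(
        id=None,
        name=None,
        alias='default',
        storage_client=storage_client,
    )

    await dataset.push_data({'url': 'https://example.com', 'title': 'Example'})
    page = await dataset.get_data(limit=10)
    print(page.count, page.items)


asyncio.run(main())
```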