crawlee 0.6.13b17__py3-none-any.whl → 1.1.2b7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlee might be problematic.

Files changed (102)
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +35 -33
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +106 -34
  5. crawlee/_utils/context.py +2 -2
  6. crawlee/_utils/file.py +7 -0
  7. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  8. crawlee/_utils/recoverable_state.py +32 -8
  9. crawlee/_utils/recurring_task.py +17 -1
  10. crawlee/_utils/requests.py +0 -26
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +4 -2
  13. crawlee/_utils/system.py +3 -3
  14. crawlee/_utils/time.py +120 -0
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +4 -1
  17. crawlee/browsers/_playwright_browser_controller.py +21 -15
  18. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  19. crawlee/browsers/_types.py +1 -1
  20. crawlee/configuration.py +2 -0
  21. crawlee/crawlers/__init__.py +2 -1
  22. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  23. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +50 -12
  24. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  25. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  28. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
  29. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  30. crawlee/crawlers/_basic/_basic_crawler.py +219 -126
  31. crawlee/crawlers/_basic/_logging_utils.py +5 -1
  32. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +60 -11
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/events/_event_manager.py +4 -4
  39. crawlee/events/_types.py +6 -6
  40. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/fingerprint_suite/_types.py +2 -2
  43. crawlee/http_clients/_base.py +4 -0
  44. crawlee/http_clients/_curl_impersonate.py +12 -0
  45. crawlee/http_clients/_httpx.py +16 -6
  46. crawlee/http_clients/_impit.py +25 -10
  47. crawlee/otel/crawler_instrumentor.py +3 -3
  48. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  49. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  50. crawlee/request_loaders/_request_list.py +3 -3
  51. crawlee/request_loaders/_request_loader.py +5 -1
  52. crawlee/request_loaders/_sitemap_request_loader.py +248 -50
  53. crawlee/sessions/_models.py +2 -2
  54. crawlee/sessions/_session_pool.py +1 -1
  55. crawlee/statistics/_error_snapshotter.py +1 -1
  56. crawlee/statistics/_models.py +43 -4
  57. crawlee/statistics/_statistics.py +24 -33
  58. crawlee/storage_clients/__init__.py +16 -0
  59. crawlee/storage_clients/_base/_request_queue_client.py +2 -2
  60. crawlee/storage_clients/_base/_storage_client.py +13 -0
  61. crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
  62. crawlee/storage_clients/_file_system/_key_value_store_client.py +30 -26
  63. crawlee/storage_clients/_file_system/_request_queue_client.py +169 -153
  64. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  65. crawlee/storage_clients/_file_system/_utils.py +0 -0
  66. crawlee/storage_clients/_memory/_dataset_client.py +16 -4
  67. crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
  68. crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
  69. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  70. crawlee/storage_clients/_redis/__init__.py +6 -0
  71. crawlee/storage_clients/_redis/_client_mixin.py +295 -0
  72. crawlee/storage_clients/_redis/_dataset_client.py +325 -0
  73. crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
  74. crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
  75. crawlee/storage_clients/_redis/_storage_client.py +146 -0
  76. crawlee/storage_clients/_redis/_utils.py +23 -0
  77. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  78. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  79. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  80. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  81. crawlee/storage_clients/_redis/py.typed +0 -0
  82. crawlee/storage_clients/_sql/__init__.py +6 -0
  83. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  84. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  85. crawlee/storage_clients/_sql/_db_models.py +268 -0
  86. crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
  87. crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
  88. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  89. crawlee/storage_clients/_sql/py.typed +0 -0
  90. crawlee/storage_clients/models.py +13 -11
  91. crawlee/storages/_base.py +5 -1
  92. crawlee/storages/_dataset.py +12 -2
  93. crawlee/storages/_key_value_store.py +17 -4
  94. crawlee/storages/_request_queue.py +13 -5
  95. crawlee/storages/_storage_instance_manager.py +133 -71
  96. crawlee/storages/_utils.py +11 -0
  97. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +18 -6
  98. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +101 -78
  99. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
  100. crawlee/_utils/measure_time.py +0 -31
  101. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
  102. {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
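The most substantial additions in this release are the new SQL (crawlee/storage_clients/_sql/) and Redis (crawlee/storage_clients/_redis/) storage backends; the two largest new files, _client_mixin.py and _dataset_client.py, are reproduced in full below. As a rough orientation, the following sketch shows how such a storage client might be wired into a crawler. It assumes that SqlStorageClient is re-exported from crawlee.storage_clients (suggested by the +16 lines in storage_clients/__init__.py), that it can be constructed without arguments, and that crawlers accept a storage_client option as in recent crawlee releases; none of these details are visible in this diff, so treat the construction line as a placeholder.

import asyncio

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext
from crawlee.storage_clients import SqlStorageClient  # assumed re-export, not shown in this diff


async def main() -> None:
    # Placeholder construction; the real SqlStorageClient arguments (connection string,
    # engine, pool options) live in _sql/_storage_client.py, which is not shown here.
    storage_client = SqlStorageClient()

    # Assumes crawlers accept a storage_client option that backs their datasets,
    # key-value stores and request queues.
    crawler = ParselCrawler(storage_client=storage_client)

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})

    await crawler.run(['https://example.com'])


asyncio.run(main())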
crawlee/storage_clients/_sql/_client_mixin.py
@@ -0,0 +1,385 @@
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from contextlib import asynccontextmanager
+ from datetime import datetime, timezone
+ from logging import getLogger
+ from typing import TYPE_CHECKING, Any, ClassVar, TypedDict, cast, overload
+
+ from sqlalchemy import delete, select, text, update
+ from sqlalchemy.dialects.postgresql import insert as pg_insert
+ from sqlalchemy.dialects.sqlite import insert as lite_insert
+ from sqlalchemy.exc import SQLAlchemyError
+
+ from crawlee._utils.crypto import crypto_random_object_id
+
+ if TYPE_CHECKING:
+     from collections.abc import AsyncIterator
+
+     from sqlalchemy import Insert
+     from sqlalchemy.ext.asyncio import AsyncSession
+     from sqlalchemy.orm import DeclarativeBase
+     from typing_extensions import NotRequired, Self
+
+     from crawlee.storage_clients.models import DatasetMetadata, KeyValueStoreMetadata, RequestQueueMetadata
+
+     from ._db_models import (
+         DatasetItemDb,
+         DatasetMetadataDb,
+         KeyValueStoreMetadataDb,
+         KeyValueStoreRecordDb,
+         RequestDb,
+         RequestQueueMetadataDb,
+     )
+     from ._storage_client import SqlStorageClient
+
+
+ logger = getLogger(__name__)
+
+
+ class MetadataUpdateParams(TypedDict, total=False):
+     """Parameters for updating metadata."""
+
+     update_accessed_at: NotRequired[bool]
+     update_modified_at: NotRequired[bool]
+     force: NotRequired[bool]
+
+
+ class SqlClientMixin(ABC):
+     """Mixin class for SQL clients.
+
+     This mixin provides common SQL operations and basic methods for SQL storage clients.
+     """
+
+     _DEFAULT_NAME: ClassVar[str]
+     """Default name when none provided."""
+
+     _METADATA_TABLE: ClassVar[type[DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb]]
+     """SQLAlchemy model for metadata."""
+
+     _ITEM_TABLE: ClassVar[type[DatasetItemDb | KeyValueStoreRecordDb | RequestDb]]
+     """SQLAlchemy model for items."""
+
+     _CLIENT_TYPE: ClassVar[str]
+     """Human-readable client type for error messages."""
+
+     def __init__(self, *, id: str, storage_client: SqlStorageClient) -> None:
+         self._id = id
+         self._storage_client = storage_client
+
+         # Time tracking to reduce database writes during frequent operation
+         self._accessed_at_allow_update_after: datetime | None = None
+         self._modified_at_allow_update_after: datetime | None = None
+         self._accessed_modified_update_interval = storage_client.get_accessed_modified_update_interval()
+
+     @classmethod
+     async def _open(
+         cls,
+         *,
+         id: str | None,
+         name: str | None,
+         internal_name: str,
+         storage_client: SqlStorageClient,
+         metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata],
+         session: AsyncSession,
+         extra_metadata_fields: dict[str, Any],
+     ) -> Self:
+         """Open existing storage or create new one.
+
+         Internal method used by _safely_open.
+
+         Args:
+             id: Storage ID to open (takes precedence over name).
+             name: The name of the storage.
+             internal_name: The database name for the storage based on name or alias.
+             storage_client: SQL storage client instance.
+             metadata_model: Pydantic model for metadata validation.
+             session: Active database session.
+             extra_metadata_fields: Storage-specific metadata fields.
+         """
+         orm_metadata: DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb | None = None
+         if id:
+             orm_metadata = await session.get(cls._METADATA_TABLE, id)
+             if not orm_metadata:
+                 raise ValueError(f'{cls._CLIENT_TYPE} with ID "{id}" not found.')
+         else:
+             stmt = select(cls._METADATA_TABLE).where(cls._METADATA_TABLE.internal_name == internal_name)
+             result = await session.execute(stmt)
+             orm_metadata = result.scalar_one_or_none()  # type: ignore[assignment]
+
+         if orm_metadata:
+             client = cls(id=orm_metadata.id, storage_client=storage_client)
+             await client._update_metadata(session, update_accessed_at=True)
+         else:
+             now = datetime.now(timezone.utc)
+             metadata = metadata_model(
+                 id=crypto_random_object_id(),
+                 name=name,
+                 created_at=now,
+                 accessed_at=now,
+                 modified_at=now,
+                 **extra_metadata_fields,
+             )
+             client = cls(id=metadata.id, storage_client=storage_client)
+             client._accessed_at_allow_update_after = now + client._accessed_modified_update_interval
+             client._modified_at_allow_update_after = now + client._accessed_modified_update_interval
+             session.add(cls._METADATA_TABLE(**metadata.model_dump(), internal_name=internal_name))
+
+         return client
+
+     @classmethod
+     async def _safely_open(
+         cls,
+         *,
+         id: str | None,
+         name: str | None,
+         alias: str | None = None,
+         storage_client: SqlStorageClient,
+         metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata],
+         extra_metadata_fields: dict[str, Any],
+     ) -> Self:
+         """Safely open storage with transaction handling.
+
+         Args:
+             id: Storage ID to open (takes precedence over name).
+             name: The name of the storage for named (global scope) storages.
+             alias: The alias of the storage for unnamed (run scope) storages.
+             storage_client: SQL storage client instance.
+             client_class: Concrete client class to instantiate.
+             metadata_model: Pydantic model for metadata validation.
+             extra_metadata_fields: Storage-specific metadata fields.
+         """
+         # Validate input parameters.
+         specified_params = sum(1 for param in [id, name, alias] if param is not None)
+         if specified_params > 1:
+             raise ValueError('Only one of "id", "name", or "alias" can be specified, not multiple.')
+
+         internal_name = name or alias or cls._DEFAULT_NAME
+
+         async with storage_client.create_session() as session:
+             try:
+                 client = await cls._open(
+                     id=id,
+                     name=name,
+                     internal_name=internal_name,
+                     storage_client=storage_client,
+                     metadata_model=metadata_model,
+                     session=session,
+                     extra_metadata_fields=extra_metadata_fields,
+                 )
+                 await session.commit()
+             except SQLAlchemyError:
+                 await session.rollback()
+
+                 stmt = select(cls._METADATA_TABLE).where(cls._METADATA_TABLE.internal_name == internal_name)
+                 result = await session.execute(stmt)
+                 orm_metadata: DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb | None
+                 orm_metadata = cast(
+                     'DatasetMetadataDb | KeyValueStoreMetadataDb | RequestQueueMetadataDb | None',
+                     result.scalar_one_or_none(),
+                 )
+
+                 if not orm_metadata:
+                     raise ValueError(f'{cls._CLIENT_TYPE} with Name "{internal_name}" not found.') from None
+
+                 client = cls(id=orm_metadata.id, storage_client=storage_client)
+
+         return client
+
+     @asynccontextmanager
+     async def get_session(self, *, with_simple_commit: bool = False) -> AsyncIterator[AsyncSession]:
+         """Create a new SQLAlchemy session for this storage."""
+         async with self._storage_client.create_session() as session:
+             # For operations where a final commit is mandatory and does not require specific processing conditions
+             if with_simple_commit:
+                 try:
+                     yield session
+                     await session.commit()
+                 except SQLAlchemyError as e:
+                     logger.warning(f'Error occurred during session transaction: {e}')
+                     await session.rollback()
+             else:
+                 yield session
+
+     def _build_insert_stmt_with_ignore(
+         self, table_model: type[DeclarativeBase], insert_values: dict[str, Any] | list[dict[str, Any]]
+     ) -> Insert:
+         """Build an insert statement with ignore for the SQL dialect.
+
+         Args:
+             table_model: SQLAlchemy table model.
+             insert_values: Single dict or list of dicts to insert.
+         """
+         if isinstance(insert_values, dict):
+             insert_values = [insert_values]
+
+         dialect = self._storage_client.get_dialect_name()
+
+         if dialect == 'postgresql':
+             return pg_insert(table_model).values(insert_values).on_conflict_do_nothing()
+
+         if dialect == 'sqlite':
+             return lite_insert(table_model).values(insert_values).on_conflict_do_nothing()
+
+         raise NotImplementedError(f'Insert with ignore not supported for dialect: {dialect}')
+
+     def _build_upsert_stmt(
+         self,
+         table_model: type[DeclarativeBase],
+         insert_values: dict[str, Any] | list[dict[str, Any]],
+         update_columns: list[str],
+         conflict_cols: list[str] | None = None,
+     ) -> Insert:
+         """Build an upsert statement for the SQL dialect.
+
+         Args:
+             table_model: SQLAlchemy table model.
+             insert_values: Single dict or list of dicts to upsert.
+             update_columns: Column names to update on conflict.
+             conflict_cols: Column names that define uniqueness (for PostgreSQL/SQLite).
+
+         """
+         if isinstance(insert_values, dict):
+             insert_values = [insert_values]
+
+         dialect = self._storage_client.get_dialect_name()
+
+         if dialect == 'postgresql':
+             pg_stmt = pg_insert(table_model).values(insert_values)
+             set_ = {col: getattr(pg_stmt.excluded, col) for col in update_columns}
+             return pg_stmt.on_conflict_do_update(index_elements=conflict_cols, set_=set_)
+
+         if dialect == 'sqlite':
+             lite_stmt = lite_insert(table_model).values(insert_values)
+             set_ = {col: getattr(lite_stmt.excluded, col) for col in update_columns}
+             return lite_stmt.on_conflict_do_update(index_elements=conflict_cols, set_=set_)
+
+         raise NotImplementedError(f'Upsert not supported for dialect: {dialect}')
+
+     async def _purge(self, metadata_kwargs: MetadataUpdateParams) -> None:
+         """Drop all items in storage and update metadata.
+
+         Args:
+             metadata_kwargs: Arguments to pass to _update_metadata.
+         """
+         stmt = delete(self._ITEM_TABLE).where(self._ITEM_TABLE.storage_id == self._id)
+         async with self.get_session(with_simple_commit=True) as session:
+             await session.execute(stmt)
+             await self._update_metadata(session, **metadata_kwargs)
+
+     async def _drop(self) -> None:
+         """Delete this storage and all its data.
+
+         This operation is irreversible. Uses CASCADE deletion to remove all related items.
+         """
+         stmt = delete(self._METADATA_TABLE).where(self._METADATA_TABLE.id == self._id)
+         async with self.get_session(with_simple_commit=True) as session:
+             if self._storage_client.get_dialect_name() == 'sqlite':
+                 # foreign_keys=ON is set at the connection level. Required for cascade deletion.
+                 await session.execute(text('PRAGMA foreign_keys=ON'))
+             await session.execute(stmt)
+
+     @overload
+     async def _get_metadata(self, metadata_model: type[DatasetMetadata]) -> DatasetMetadata: ...
+     @overload
+     async def _get_metadata(self, metadata_model: type[KeyValueStoreMetadata]) -> KeyValueStoreMetadata: ...
+     @overload
+     async def _get_metadata(self, metadata_model: type[RequestQueueMetadata]) -> RequestQueueMetadata: ...
+
+     async def _get_metadata(
+         self, metadata_model: type[DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata]
+     ) -> DatasetMetadata | KeyValueStoreMetadata | RequestQueueMetadata:
+         """Retrieve client metadata."""
+         async with self.get_session() as session:
+             orm_metadata = await session.get(self._METADATA_TABLE, self._id)
+             if not orm_metadata:
+                 raise ValueError(f'{self._CLIENT_TYPE} with ID "{self._id}" not found.')
+
+             return metadata_model.model_validate(orm_metadata)
+
+     def _default_update_metadata(
+         self, *, update_accessed_at: bool = False, update_modified_at: bool = False, force: bool = False
+     ) -> dict[str, Any]:
+         """Prepare common metadata updates with rate limiting.
+
+         Args:
+             update_accessed_at: Whether to update accessed_at timestamp.
+             update_modified_at: Whether to update modified_at timestamp.
+             force: Whether to force the update regardless of rate limiting.
+         """
+         values_to_set: dict[str, Any] = {}
+         now = datetime.now(timezone.utc)
+
+         # If the record must be updated (for example, when updating counters), we update timestamps and shift the time.
+         if force:
+             if update_modified_at:
+                 values_to_set['modified_at'] = now
+                 self._modified_at_allow_update_after = now + self._accessed_modified_update_interval
+             if update_accessed_at:
+                 values_to_set['accessed_at'] = now
+                 self._accessed_at_allow_update_after = now + self._accessed_modified_update_interval
+
+         elif update_modified_at and (
+             self._modified_at_allow_update_after is None or now >= self._modified_at_allow_update_after
+         ):
+             values_to_set['modified_at'] = now
+             self._modified_at_allow_update_after = now + self._accessed_modified_update_interval
+             # The record will be updated, we can update `accessed_at` and shift the time.
+             if update_accessed_at:
+                 values_to_set['accessed_at'] = now
+                 self._accessed_at_allow_update_after = now + self._accessed_modified_update_interval
+
+         elif update_accessed_at and (
+             self._accessed_at_allow_update_after is None or now >= self._accessed_at_allow_update_after
+         ):
+             values_to_set['accessed_at'] = now
+             self._accessed_at_allow_update_after = now + self._accessed_modified_update_interval
+
+         return values_to_set
+
+     @abstractmethod
+     def _specific_update_metadata(self, **kwargs: Any) -> dict[str, Any]:
+         """Prepare storage-specific metadata updates.
+
+         Must be implemented by concrete classes.
+
+         Args:
+             **kwargs: Storage-specific update parameters.
+         """
+
+     async def _update_metadata(
+         self,
+         session: AsyncSession,
+         *,
+         update_accessed_at: bool = False,
+         update_modified_at: bool = False,
+         force: bool = False,
+         **kwargs: Any,
+     ) -> bool:
+         """Update storage metadata combining common and specific fields.
+
+         Args:
+             session: Active database session.
+             update_accessed_at: Whether to update accessed_at timestamp.
+             update_modified_at: Whether to update modified_at timestamp.
+             force: Whether to force the update timestamps regardless of rate limiting.
+             **kwargs: Additional arguments for _specific_update_metadata.
+
+         Returns:
+             True if any updates were made, False otherwise
+         """
+         values_to_set = self._default_update_metadata(
+             update_accessed_at=update_accessed_at, update_modified_at=update_modified_at, force=force
+         )
+
+         values_to_set.update(self._specific_update_metadata(**kwargs))
+
+         if values_to_set:
+             if (stmt := values_to_set.pop('custom_stmt', None)) is None:
+                 stmt = update(self._METADATA_TABLE).where(self._METADATA_TABLE.id == self._id)
+
+             stmt = stmt.values(**values_to_set)
+             await session.execute(stmt)
+             return True
+
+         return False
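_build_insert_stmt_with_ignore and _build_upsert_stmt above select the dialect-specific insert construct so that concrete clients can express "insert if new, otherwise update" without caring whether the backend is PostgreSQL or SQLite. The standalone SQLAlchemy sketch below is not crawlee code; it uses a hypothetical Record table purely to show what the generated statement amounts to for the SQLite branch.

from sqlalchemy.dialects.sqlite import insert as lite_insert
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class Base(DeclarativeBase):
    pass


class Record(Base):
    # Hypothetical table used only for this illustration.
    __tablename__ = 'records'
    key: Mapped[str] = mapped_column(primary_key=True)
    value: Mapped[int]


insert_stmt = lite_insert(Record).values([{'key': 'a', 'value': 1}])
upsert_stmt = insert_stmt.on_conflict_do_update(
    index_elements=['key'],                      # plays the role of conflict_cols in the mixin
    set_={'value': insert_stmt.excluded.value},  # plays the role of update_columns in the mixin
)
# For SQLite this compiles to roughly:
#   INSERT INTO records (key, value) VALUES (?, ?)
#   ON CONFLICT (key) DO UPDATE SET value = excluded.value
# The PostgreSQL branch is identical except that it uses
# sqlalchemy.dialects.postgresql.insert, and on_conflict_do_nothing() is the
# analogous call behind _build_insert_stmt_with_ignore.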
crawlee/storage_clients/_sql/_dataset_client.py
@@ -0,0 +1,310 @@
+ from __future__ import annotations
+
+ from logging import getLogger
+ from typing import TYPE_CHECKING, Any
+
+ from sqlalchemy import Select, insert, select
+ from typing_extensions import Self, override
+
+ from crawlee.storage_clients._base import DatasetClient
+ from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
+
+ from ._client_mixin import MetadataUpdateParams, SqlClientMixin
+ from ._db_models import DatasetItemDb, DatasetMetadataDb
+
+ if TYPE_CHECKING:
+     from collections.abc import AsyncIterator
+
+     from sqlalchemy import Select
+     from typing_extensions import NotRequired
+
+     from ._storage_client import SqlStorageClient
+
+
+ logger = getLogger(__name__)
+
+
+ class _DatasetMetadataUpdateParams(MetadataUpdateParams):
+     """Parameters for updating dataset metadata."""
+
+     new_item_count: NotRequired[int]
+     delta_item_count: NotRequired[int]
+
+
+ class SqlDatasetClient(DatasetClient, SqlClientMixin):
+     """SQL implementation of the dataset client.
+
+     This client persists dataset items to a SQL database using two tables for storage
+     and retrieval. Items are stored as JSON with automatic ordering preservation.
+
+     The dataset data is stored in SQL database tables following the pattern:
+     - `datasets` table: Contains dataset metadata (id, name, timestamps, item_count)
+     - `dataset_records` table: Contains individual items with JSON data and auto-increment ordering
+
+     Items are stored as a JSON object in SQLite and as JSONB in PostgreSQL. These objects must be JSON-serializable.
+     The `item_id` auto-increment primary key ensures insertion order is preserved.
+     All operations are wrapped in database transactions with CASCADE deletion support.
+     """
+
+     _DEFAULT_NAME = 'default'
+     """Default dataset name used when no name is provided."""
+
+     _METADATA_TABLE = DatasetMetadataDb
+     """SQLAlchemy model for dataset metadata."""
+
+     _ITEM_TABLE = DatasetItemDb
+     """SQLAlchemy model for dataset items."""
+
+     _CLIENT_TYPE = 'Dataset'
+     """Human-readable client type for error messages."""
+
+     def __init__(
+         self,
+         *,
+         id: str,
+         storage_client: SqlStorageClient,
+     ) -> None:
+         """Initialize a new instance.
+
+         Preferably use the `SqlDatasetClient.open` class method to create a new instance.
+         """
+         super().__init__(id=id, storage_client=storage_client)
+
+     @classmethod
+     async def open(
+         cls,
+         *,
+         id: str | None,
+         name: str | None,
+         alias: str | None,
+         storage_client: SqlStorageClient,
+     ) -> Self:
+         """Open an existing dataset or create a new one.
+
+         Args:
+             id: The ID of the dataset to open. If provided, searches for existing dataset by ID.
+             name: The name of the dataset for named (global scope) storages.
+             alias: The alias of the dataset for unnamed (run scope) storages.
+             storage_client: The SQL storage client instance.
+
+         Returns:
+             An instance for the opened or created storage client.
+
+         Raises:
+             ValueError: If a dataset with the specified ID is not found.
+         """
+         return await cls._safely_open(
+             id=id,
+             name=name,
+             alias=alias,
+             storage_client=storage_client,
+             metadata_model=DatasetMetadata,
+             extra_metadata_fields={'item_count': 0},
+         )
+
+     @override
+     async def get_metadata(self) -> DatasetMetadata:
+         # The database is a single place of truth
+         return await self._get_metadata(DatasetMetadata)
+
+     @override
+     async def drop(self) -> None:
+         """Delete this dataset and all its items from the database.
+
+         This operation is irreversible. Uses CASCADE deletion to remove all related items.
+         """
+         await self._drop()
+
+     @override
+     async def purge(self) -> None:
+         """Remove all items from this dataset while keeping the dataset structure.
+
+         Resets item_count to 0 and deletes all records from dataset_records table.
+         """
+         await self._purge(
+             metadata_kwargs=_DatasetMetadataUpdateParams(
+                 new_item_count=0,
+                 update_accessed_at=True,
+                 update_modified_at=True,
+                 force=True,
+             )
+         )
+
+     @override
+     async def push_data(self, data: list[dict[str, Any]] | dict[str, Any]) -> None:
+         if not isinstance(data, list):
+             data = [data]
+
+         db_items: list[dict[str, Any]] = []
+         db_items = [{'dataset_id': self._id, 'data': item} for item in data]
+         stmt = insert(self._ITEM_TABLE).values(db_items)
+
+         async with self.get_session(with_simple_commit=True) as session:
+             await session.execute(stmt)
+
+             await self._update_metadata(
+                 session,
+                 **_DatasetMetadataUpdateParams(
+                     update_accessed_at=True,
+                     update_modified_at=True,
+                     delta_item_count=len(data),
+                     new_item_count=len(data),
+                     force=True,
+                 ),
+             )
+
+     @override
+     async def get_data(
+         self,
+         *,
+         offset: int = 0,
+         limit: int | None = 999_999_999_999,
+         clean: bool = False,
+         desc: bool = False,
+         fields: list[str] | None = None,
+         omit: list[str] | None = None,
+         unwind: list[str] | None = None,
+         skip_empty: bool = False,
+         skip_hidden: bool = False,
+         flatten: list[str] | None = None,
+         view: str | None = None,
+     ) -> DatasetItemsListPage:
+         stmt = self._prepare_get_stmt(
+             offset=offset,
+             limit=limit,
+             clean=clean,
+             desc=desc,
+             fields=fields,
+             omit=omit,
+             unwind=unwind,
+             skip_empty=skip_empty,
+             skip_hidden=skip_hidden,
+             flatten=flatten,
+             view=view,
+         )
+
+         async with self.get_session() as session:
+             result = await session.execute(stmt)
+             db_items = result.scalars().all()
+
+             updated = await self._update_metadata(session, **_DatasetMetadataUpdateParams(update_accessed_at=True))
+
+             # Commit updates to the metadata
+             if updated:
+                 await session.commit()
+
+         items = [db_item.data for db_item in db_items]
+         metadata = await self.get_metadata()
+         return DatasetItemsListPage(
+             items=items,
+             count=len(items),
+             desc=desc,
+             limit=limit or 0,
+             offset=offset or 0,
+             total=metadata.item_count,
+         )
+
+     @override
+     async def iterate_items(
+         self,
+         *,
+         offset: int = 0,
+         limit: int | None = None,
+         clean: bool = False,
+         desc: bool = False,
+         fields: list[str] | None = None,
+         omit: list[str] | None = None,
+         unwind: list[str] | None = None,
+         skip_empty: bool = False,
+         skip_hidden: bool = False,
+     ) -> AsyncIterator[dict[str, Any]]:
+         stmt = self._prepare_get_stmt(
+             offset=offset,
+             limit=limit,
+             clean=clean,
+             desc=desc,
+             fields=fields,
+             omit=omit,
+             unwind=unwind,
+             skip_empty=skip_empty,
+             skip_hidden=skip_hidden,
+         )
+
+         async with self.get_session() as session:
+             db_items = await session.stream_scalars(stmt)
+
+             async for db_item in db_items:
+                 yield db_item.data
+
+             updated = await self._update_metadata(session, **_DatasetMetadataUpdateParams(update_accessed_at=True))
+
+             # Commit updates to the metadata
+             if updated:
+                 await session.commit()
+
+     def _prepare_get_stmt(
+         self,
+         *,
+         offset: int = 0,
+         limit: int | None = 999_999_999_999,
+         clean: bool = False,
+         desc: bool = False,
+         fields: list[str] | None = None,
+         omit: list[str] | None = None,
+         unwind: list[str] | None = None,
+         skip_empty: bool = False,
+         skip_hidden: bool = False,
+         flatten: list[str] | None = None,
+         view: str | None = None,
+     ) -> Select:
+         # Check for unsupported arguments and log a warning if found.
+         unsupported_args: dict[str, Any] = {
+             'clean': clean,
+             'fields': fields,
+             'omit': omit,
+             'unwind': unwind,
+             'skip_hidden': skip_hidden,
+             'flatten': flatten,
+             'view': view,
+         }
+         unsupported = {k: v for k, v in unsupported_args.items() if v not in (False, None)}
+
+         if unsupported:
+             logger.warning(
+                 f'The arguments {list(unsupported.keys())} of get_data are not supported by the '
+                 f'{self.__class__.__name__} client.'
+             )
+
+         stmt = select(self._ITEM_TABLE).where(self._ITEM_TABLE.dataset_id == self._id)
+
+         if skip_empty:
+             # Skip items that are empty JSON objects
+             stmt = stmt.where(self._ITEM_TABLE.data != {})
+
+         # Apply ordering by insertion order (item_id)
+         stmt = stmt.order_by(self._ITEM_TABLE.item_id.desc()) if desc else stmt.order_by(self._ITEM_TABLE.item_id.asc())
+
+         return stmt.offset(offset).limit(limit)
+
+     def _specific_update_metadata(
+         self,
+         new_item_count: int | None = None,
+         delta_item_count: int | None = None,
+         **_kwargs: dict[str, Any],
+     ) -> dict[str, Any]:
+         """Update the dataset metadata in the database.
+
+         Args:
+             session: The SQLAlchemy AsyncSession to use for the update.
+             new_item_count: If provided, set item count to this value.
+             delta_item_count: If provided, add this value to the current item count.
+         """
+         values_to_set: dict[str, Any] = {}
+
+         if new_item_count is not None:
+             values_to_set['item_count'] = new_item_count
+         elif delta_item_count:
+             # Use database-level for atomic updates
+             values_to_set['item_count'] = self._METADATA_TABLE.item_count + delta_item_count
+
+         return values_to_set
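A minimal usage sketch for the client above, matching the open(), push_data(), get_data() and iterate_items() signatures shown in this diff. How SqlStorageClient is constructed and initialized is not part of this file, so that line is a placeholder assumption.

import asyncio

from crawlee.storage_clients import SqlStorageClient  # assumed re-export, not shown in this diff
from crawlee.storage_clients._sql._dataset_client import SqlDatasetClient


async def main() -> None:
    storage_client = SqlStorageClient()  # placeholder; real constructor arguments may differ

    # Open (or create) a named dataset backed by the SQL storage.
    dataset = await SqlDatasetClient.open(
        id=None,
        name='products',
        alias=None,
        storage_client=storage_client,
    )

    # push_data accepts a single item or a list of JSON-serializable dicts.
    await dataset.push_data([{'sku': 1}, {'sku': 2}])

    # get_data returns a DatasetItemsListPage; unsupported filters only log a warning.
    page = await dataset.get_data(limit=10)
    print(page.count, page.total)

    # iterate_items streams rows in insertion order (item_id), newest first with desc=True.
    async for item in dataset.iterate_items(desc=True):
        print(item)


asyncio.run(main())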