crawlee 1.0.0rc1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. crawlee/_autoscaling/snapshotter.py +1 -1
  2. crawlee/_request.py +2 -1
  3. crawlee/_service_locator.py +44 -24
  4. crawlee/_types.py +76 -17
  5. crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
  6. crawlee/_utils/sitemap.py +3 -1
  7. crawlee/_utils/system.py +3 -3
  8. crawlee/browsers/_playwright_browser_controller.py +20 -14
  9. crawlee/configuration.py +1 -1
  10. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
  11. crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
  12. crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
  13. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +6 -2
  14. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
  15. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
  16. crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
  17. crawlee/crawlers/_basic/_basic_crawler.py +107 -27
  18. crawlee/crawlers/_basic/_logging_utils.py +5 -1
  19. crawlee/crawlers/_playwright/_playwright_crawler.py +6 -1
  20. crawlee/events/_types.py +6 -6
  21. crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
  22. crawlee/fingerprint_suite/_types.py +2 -2
  23. crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
  24. crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
  25. crawlee/request_loaders/_request_list.py +1 -1
  26. crawlee/request_loaders/_request_loader.py +5 -1
  27. crawlee/request_loaders/_sitemap_request_loader.py +228 -48
  28. crawlee/sessions/_models.py +2 -2
  29. crawlee/statistics/_models.py +1 -1
  30. crawlee/storage_clients/__init__.py +12 -0
  31. crawlee/storage_clients/_base/_storage_client.py +13 -0
  32. crawlee/storage_clients/_file_system/_dataset_client.py +27 -25
  33. crawlee/storage_clients/_file_system/_key_value_store_client.py +27 -23
  34. crawlee/storage_clients/_file_system/_request_queue_client.py +84 -98
  35. crawlee/storage_clients/_file_system/_storage_client.py +16 -3
  36. crawlee/storage_clients/_file_system/_utils.py +0 -0
  37. crawlee/storage_clients/_memory/_dataset_client.py +14 -2
  38. crawlee/storage_clients/_memory/_key_value_store_client.py +14 -2
  39. crawlee/storage_clients/_memory/_request_queue_client.py +43 -12
  40. crawlee/storage_clients/_memory/_storage_client.py +6 -3
  41. crawlee/storage_clients/_sql/__init__.py +6 -0
  42. crawlee/storage_clients/_sql/_client_mixin.py +385 -0
  43. crawlee/storage_clients/_sql/_dataset_client.py +310 -0
  44. crawlee/storage_clients/_sql/_db_models.py +269 -0
  45. crawlee/storage_clients/_sql/_key_value_store_client.py +299 -0
  46. crawlee/storage_clients/_sql/_request_queue_client.py +706 -0
  47. crawlee/storage_clients/_sql/_storage_client.py +282 -0
  48. crawlee/storage_clients/_sql/py.typed +0 -0
  49. crawlee/storage_clients/models.py +10 -10
  50. crawlee/storages/_base.py +3 -1
  51. crawlee/storages/_dataset.py +9 -2
  52. crawlee/storages/_key_value_store.py +9 -2
  53. crawlee/storages/_request_queue.py +7 -2
  54. crawlee/storages/_storage_instance_manager.py +126 -72
  55. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/METADATA +12 -5
  56. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/RECORD +59 -49
  57. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/WHEEL +0 -0
  58. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/entry_points.txt +0 -0
  59. {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,299 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from logging import getLogger
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ from sqlalchemy import delete, select
8
+ from typing_extensions import override
9
+
10
+ from crawlee._utils.file import infer_mime_type
11
+ from crawlee.storage_clients._base import KeyValueStoreClient
12
+ from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
13
+
14
+ from ._client_mixin import MetadataUpdateParams, SqlClientMixin
15
+ from ._db_models import KeyValueStoreMetadataDb, KeyValueStoreRecordDb
16
+
17
+ if TYPE_CHECKING:
18
+ from collections.abc import AsyncIterator
19
+
20
+ from ._storage_client import SqlStorageClient
21
+
22
+
23
+ logger = getLogger(__name__)
24
+
25
+
26
class SqlKeyValueStoreClient(KeyValueStoreClient, SqlClientMixin):
    """SQL implementation of the key-value store client.

    This client persists key-value data to a SQL database with transaction support and
    concurrent access safety. Keys are mapped to rows in database tables with proper indexing
    for efficient retrieval.

    The key-value store data is stored in SQL database tables following the pattern:
    - `key_value_stores` table: Contains store metadata (id, name, timestamps)
    - `key_value_store_records` table: Contains individual key-value pairs with binary value storage, content type,
    and size information

    Values are serialized based on their type: JSON objects are stored as formatted JSON,
    text values as UTF-8 encoded strings, and binary data as-is in the `LargeBinary` column.
    The implementation automatically handles content type detection and maintains metadata
    about each record including size and MIME type information.

    All database operations are wrapped in transactions with proper error handling and rollback
    mechanisms. The client supports atomic upsert operations and handles race conditions when
    multiple clients access the same store using composite primary keys (key_value_store_id, key).
    """

    _DEFAULT_NAME = 'default'
    """Default key-value store name used when no name is provided."""

    _METADATA_TABLE = KeyValueStoreMetadataDb
    """SQLAlchemy model for key-value store metadata."""

    _ITEM_TABLE = KeyValueStoreRecordDb
    """SQLAlchemy model for key-value store items."""

    _CLIENT_TYPE = 'Key-value store'
    """Human-readable client type for error messages."""

    def __init__(
        self,
        *,
        storage_client: SqlStorageClient,
        id: str,
    ) -> None:
        """Initialize a new instance.

        Preferably use the `SqlKeyValueStoreClient.open` class method to create a new instance.
        """
        super().__init__(id=id, storage_client=storage_client)

    @classmethod
    async def open(
        cls,
        *,
        id: str | None,
        name: str | None,
        alias: str | None,
        storage_client: SqlStorageClient,
    ) -> SqlKeyValueStoreClient:
        """Open or create a SQL key-value store client.

        This method attempts to open an existing key-value store from the SQL database. If a KVS with the specified
        ID or name exists, it loads the metadata from the database. If no existing store is found, a new one
        is created.

        Args:
            id: The ID of the key-value store to open. If provided, searches for existing store by ID.
            name: The name of the key-value store for named (global scope) storages.
            alias: The alias of the key-value store for unnamed (run scope) storages.
            storage_client: The SQL storage client used to access the database.

        Returns:
            An instance for the opened or created storage client.

        Raises:
            ValueError: If a store with the specified ID is not found, or if metadata is invalid.
        """
        return await cls._safely_open(
            id=id,
            name=name,
            alias=alias,
            storage_client=storage_client,
            metadata_model=KeyValueStoreMetadata,
            extra_metadata_fields={},
        )

    @override
    async def get_metadata(self) -> KeyValueStoreMetadata:
        # The database is a single place of truth
        return await self._get_metadata(KeyValueStoreMetadata)

    @override
    async def drop(self) -> None:
        """Delete this key-value store and all its records from the database.

        This operation is irreversible. Uses CASCADE deletion to remove all related records.
        """
        await self._drop()

    @override
    async def purge(self) -> None:
        """Remove all items from this key-value store while keeping the key-value store structure.

        Remove all records from key_value_store_records table.
        """
        await self._purge(metadata_kwargs=MetadataUpdateParams(update_accessed_at=True, update_modified_at=True))

    @override
    async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:
        """Insert or overwrite the record stored under `key`.

        Args:
            key: The record key within this store.
            value: The value to store. `None` is stored as an empty payload tagged with the
                `application/x-none` content type.
            content_type: MIME type of the value. When omitted it is inferred from the value.
        """
        # Special handling for None values
        if value is None:
            content_type = 'application/x-none'  # Special content type to identify None values
            value_bytes = b''
        else:
            content_type = content_type or infer_mime_type(value)

            # Serialize the value to bytes.
            if 'application/json' in content_type:
                value_bytes = json.dumps(value, default=str, ensure_ascii=False).encode('utf-8')
            elif isinstance(value, str):
                value_bytes = value.encode('utf-8')
            elif isinstance(value, (bytes, bytearray)):
                value_bytes = value
            else:
                # Fallback: attempt to convert to string and encode.
                value_bytes = str(value).encode('utf-8')

        size = len(value_bytes)
        insert_values = {
            'key_value_store_id': self._id,
            'key': key,
            'value': value_bytes,
            'content_type': content_type,
            'size': size,
        }

        upsert_stmt = self._build_upsert_stmt(
            self._ITEM_TABLE,
            insert_values=insert_values,
            update_columns=['value', 'content_type', 'size'],
            conflict_cols=['key_value_store_id', 'key'],
        )

        # No explicit commit here: the write relies on the `with_simple_commit=True` session mode.
        async with self.get_session(with_simple_commit=True) as session:
            await session.execute(upsert_stmt)

            await self._update_metadata(
                session, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True)
            )

    @override
    async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
        """Fetch the record stored under `key` and deserialize its value.

        Args:
            key: The record key within this store.

        Returns:
            The record with its value decoded according to the stored content type, or `None`
            when no record exists or when the stored bytes cannot be decoded as JSON/text
            (a warning is logged in that case).
        """
        # Query the record by key
        stmt = select(self._ITEM_TABLE).where(
            self._ITEM_TABLE.key_value_store_id == self._id, self._ITEM_TABLE.key == key
        )
        async with self.get_session() as session:
            result = await session.execute(stmt)
            record_db = result.scalar_one_or_none()

            updated = await self._update_metadata(session, **MetadataUpdateParams(update_accessed_at=True))

            # Commit updates to the metadata
            if updated:
                await session.commit()

        if not record_db:
            return None

        # Deserialize the value based on content type
        value_bytes = record_db.value

        # Handle None values
        if record_db.content_type == 'application/x-none':
            value = None
        # Handle JSON values
        elif 'application/json' in record_db.content_type:
            try:
                value = json.loads(value_bytes.decode('utf-8'))
            except (json.JSONDecodeError, UnicodeDecodeError):
                logger.warning(f'Failed to decode JSON value for key "{key}"')
                return None
        # Handle text values
        elif record_db.content_type.startswith('text/'):
            try:
                value = value_bytes.decode('utf-8')
            except UnicodeDecodeError:
                logger.warning(f'Failed to decode text value for key "{key}"')
                return None
        # Handle binary values
        else:
            value = value_bytes

        return KeyValueStoreRecord(
            key=record_db.key,
            value=value,
            content_type=record_db.content_type,
            size=record_db.size,
        )

    @override
    async def delete_value(self, *, key: str) -> None:
        """Delete the record stored under `key`, if it exists.

        Store metadata is only touched when a row was actually removed.

        Args:
            key: The record key within this store.
        """
        stmt = delete(self._ITEM_TABLE).where(
            self._ITEM_TABLE.key_value_store_id == self._id, self._ITEM_TABLE.key == key
        )
        # The session is opened with `with_simple_commit=True`, the same mode `set_value` relies on
        # to persist its write, so no additional explicit commit is issued here.
        async with self.get_session(with_simple_commit=True) as session:
            # Delete the record if it exists
            result = await session.execute(stmt)

            # Update metadata if we actually deleted something
            if result.rowcount > 0:
                await self._update_metadata(
                    session, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True)
                )

    @override
    async def iterate_keys(
        self,
        *,
        exclusive_start_key: str | None = None,
        limit: int | None = None,
    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:
        """Yield metadata (key, content type, size) of the stored records, ordered by key.

        The accessed-at metadata update runs only after the consumer has exhausted the iterator.

        Args:
            exclusive_start_key: When given, only keys strictly greater than this one are yielded.
            limit: When given, the maximum number of records to yield.

        Yields:
            Metadata of each matching record.
        """
        # Build query for record metadata
        stmt = (
            select(self._ITEM_TABLE.key, self._ITEM_TABLE.content_type, self._ITEM_TABLE.size)
            .where(self._ITEM_TABLE.key_value_store_id == self._id)
            .order_by(self._ITEM_TABLE.key)
        )

        # Apply exclusive_start_key filter
        if exclusive_start_key is not None:
            stmt = stmt.where(self._ITEM_TABLE.key > exclusive_start_key)

        # Apply limit
        if limit is not None:
            stmt = stmt.limit(limit)

        async with self.get_session() as session:
            result = await session.stream(stmt.execution_options(stream_results=True))

            async for row in result:
                yield KeyValueStoreRecordMetadata(
                    key=row.key,
                    content_type=row.content_type,
                    size=row.size,
                )

            updated = await self._update_metadata(session, **MetadataUpdateParams(update_accessed_at=True))

            # Commit updates to the metadata
            if updated:
                await session.commit()

    @override
    async def record_exists(self, *, key: str) -> bool:
        """Check whether a record is stored under `key`.

        Args:
            key: The record key within this store.

        Returns:
            `True` if a record with the given key exists in this store, `False` otherwise.
        """
        stmt = select(self._ITEM_TABLE.key).where(
            self._ITEM_TABLE.key_value_store_id == self._id, self._ITEM_TABLE.key == key
        )
        async with self.get_session() as session:
            # Check if record exists. The result is consumed right away, while the session is
            # still open and before any further statement or commit runs on it.
            result = await session.execute(stmt)
            exists = result.scalar_one_or_none() is not None

            updated = await self._update_metadata(session, **MetadataUpdateParams(update_accessed_at=True))

            # Commit updates to the metadata
            if updated:
                await session.commit()

        return exists

    @override
    async def get_public_url(self, *, key: str) -> str:
        """Public URLs are not available for this client.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError('Public URLs are not supported for SQL key-value stores.')

    def _specific_update_metadata(self, **_kwargs: Any) -> dict[str, Any]:
        """Return the store-specific metadata values to update; always an empty dict here.

        Any keyword arguments are accepted and ignored.
        NOTE(review): presumably a hook called by `SqlClientMixin._update_metadata` - confirm against the mixin.
        """
        return {}