crawlee 1.0.3b6__py3-none-any.whl → 1.2.2b24__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (82)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_browserforge_workaround.py +7 -3
  3. crawlee/_request.py +32 -13
  4. crawlee/_service_locator.py +4 -4
  5. crawlee/_types.py +44 -5
  6. crawlee/_utils/context.py +3 -3
  7. crawlee/_utils/file.py +8 -1
  8. crawlee/_utils/globs.py +4 -4
  9. crawlee/_utils/recoverable_state.py +32 -8
  10. crawlee/_utils/recurring_task.py +27 -3
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +13 -6
  13. crawlee/_utils/system.py +27 -11
  14. crawlee/_utils/time.py +41 -1
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +5 -2
  17. crawlee/browsers/_playwright_browser.py +2 -1
  18. crawlee/browsers/_playwright_browser_controller.py +1 -1
  19. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  20. crawlee/browsers/_types.py +1 -1
  21. crawlee/configuration.py +3 -1
  22. crawlee/crawlers/__init__.py +5 -1
  23. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  24. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
  25. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  28. crawlee/crawlers/_basic/_basic_crawler.py +156 -131
  29. crawlee/crawlers/_basic/_context_utils.py +24 -0
  30. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  31. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  32. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/errors.py +4 -0
  39. crawlee/events/_event_manager.py +12 -6
  40. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/http_clients/_base.py +4 -0
  43. crawlee/http_clients/_curl_impersonate.py +68 -14
  44. crawlee/http_clients/_httpx.py +16 -6
  45. crawlee/http_clients/_impit.py +25 -10
  46. crawlee/otel/crawler_instrumentor.py +4 -6
  47. crawlee/request_loaders/_sitemap_request_loader.py +23 -5
  48. crawlee/router.py +13 -3
  49. crawlee/sessions/_cookies.py +13 -8
  50. crawlee/sessions/_models.py +3 -3
  51. crawlee/sessions/_session_pool.py +1 -1
  52. crawlee/statistics/_error_snapshotter.py +1 -1
  53. crawlee/statistics/_models.py +51 -9
  54. crawlee/statistics/_statistics.py +24 -33
  55. crawlee/storage_clients/__init__.py +4 -0
  56. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  57. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  58. crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
  59. crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
  60. crawlee/storage_clients/_file_system/_request_queue_client.py +29 -10
  61. crawlee/storage_clients/_redis/__init__.py +6 -0
  62. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  63. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  64. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  65. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  66. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  67. crawlee/storage_clients/_redis/_utils.py +23 -0
  68. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  69. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  70. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  71. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  72. crawlee/storage_clients/_redis/py.typed +0 -0
  73. crawlee/storage_clients/_sql/_client_mixin.py +1 -1
  74. crawlee/storage_clients/_sql/_db_models.py +1 -2
  75. crawlee/storage_clients/models.py +8 -3
  76. crawlee/storages/_key_value_store.py +5 -2
  77. crawlee/storages/_storage_instance_manager.py +103 -44
  78. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +14 -16
  79. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +82 -69
  80. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
  81. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
  82. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
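The headline addition in this range is the new Redis storage backend (entries 61-72 above: a shared client mixin, dataset, key-value store and request queue clients, a storage client, and the Lua scripts backing the atomic queue operations). Below is a minimal wiring sketch; the `RedisStorageClient` name and its constructor argument are assumptions inferred from the file layout and the `crawlee/storage_clients/__init__.py` change (entry 55), not confirmed by this diff.

import asyncio

from redis.asyncio import Redis

from crawlee import service_locator
from crawlee.storage_clients import RedisStorageClient  # assumed export, see entry 55 above


async def main() -> None:
    # Route all storages (datasets, key-value stores, request queues) to Redis
    # instead of the default file-system backend.
    redis = Redis.from_url('redis://localhost:6379/0')  # placeholder connection URL
    service_locator.set_storage_client(RedisStorageClient(redis=redis))  # constructor argument is an assumption

    # ... build and run a crawler as usual; storages now persist to Redis ...


if __name__ == '__main__':
    asyncio.run(main())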
crawlee/storage_clients/_redis/_key_value_store_client.py (new file)
@@ -0,0 +1,262 @@
+ from __future__ import annotations
+
+ import json
+ from logging import getLogger
+ from typing import TYPE_CHECKING, Any
+
+ from typing_extensions import override
+
+ from crawlee._utils.file import infer_mime_type
+ from crawlee.storage_clients._base import KeyValueStoreClient
+ from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
+
+ from ._client_mixin import MetadataUpdateParams, RedisClientMixin
+ from ._utils import await_redis_response
+
+ if TYPE_CHECKING:
+     from collections.abc import AsyncIterator
+
+     from redis.asyncio import Redis
+
+ logger = getLogger(__name__)
+
+
+ class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin):
+     """Redis implementation of the key-value store client.
+
+     This client persists key-value data to Redis using hash data structures for efficient storage and retrieval.
+     Keys are mapped to values with automatic content type detection and size tracking for metadata management.
+
+     The key-value store data is stored in Redis using the following key pattern:
+     - `key_value_stores:{name}:items` - Redis hash containing key-value pairs (values stored as binary data).
+     - `key_value_stores:{name}:metadata_items` - Redis hash containing metadata for each key.
+     - `key_value_stores:{name}:metadata` - Redis JSON object containing store metadata.
+
+     Values are serialized based on their type: JSON objects are stored as UTF-8 encoded JSON strings,
+     text values as UTF-8 encoded strings, and binary data as-is. The implementation automatically handles
+     content type detection and maintains metadata about each record including size and MIME type information.
+
+     All operations are atomic through Redis hash operations and pipeline transactions. The client supports
+     concurrent access through Redis's built-in atomic operations for hash fields.
+     """
+
+     _DEFAULT_NAME = 'default'
+     """Default Key-Value Store name key prefix when none provided."""
+
+     _MAIN_KEY = 'key_value_stores'
+     """Main Redis key prefix for Key-Value Store."""
+
+     _CLIENT_TYPE = 'Key-value store'
+     """Human-readable client type for error messages."""
+
+     def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None:
+         """Initialize a new instance.
+
+         Preferably use the `RedisKeyValueStoreClient.open` class method to create a new instance.
+         """
+         super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis)
+
+     @property
+     def _items_key(self) -> str:
+         """Return the Redis key for the items of KVS."""
+         return f'{self._MAIN_KEY}:{self._storage_name}:items'
+
+     @property
+     def _metadata_items_key(self) -> str:
+         """Return the Redis key for the items metadata of KVS."""
+         return f'{self._MAIN_KEY}:{self._storage_name}:metadata_items'
+
+     @classmethod
+     async def open(
+         cls,
+         *,
+         id: str | None,
+         name: str | None,
+         alias: str | None,
+         redis: Redis,
+     ) -> RedisKeyValueStoreClient:
+         """Open or create a new Redis key-value store client.
+
+         This method attempts to open an existing key-value store from the Redis database. If a store with the specified
+         ID or name exists, it loads the metadata from the database. If no existing store is found, a new one
+         is created.
+
+         Args:
+             id: The ID of the key-value store. If not provided, a random ID will be generated.
+             name: The name of the key-value store for named (global scope) storages.
+             alias: The alias of the key-value store for unnamed (run scope) storages.
+             redis: Redis client instance.
+
+         Returns:
+             An instance for the opened or created storage client.
+         """
+         return await cls._open(
+             id=id,
+             name=name,
+             alias=alias,
+             redis=redis,
+             metadata_model=KeyValueStoreMetadata,
+             extra_metadata_fields={},
+             instance_kwargs={},
+         )
+
+     @override
+     async def get_metadata(self) -> KeyValueStoreMetadata:
+         return await self._get_metadata(KeyValueStoreMetadata)
+
+     @override
+     async def drop(self) -> None:
+         await self._drop(extra_keys=[self._items_key, self._metadata_items_key])
+
+     @override
+     async def purge(self) -> None:
+         await self._purge(
+             extra_keys=[self._items_key, self._metadata_items_key],
+             metadata_kwargs=MetadataUpdateParams(update_accessed_at=True, update_modified_at=True),
+         )
+
+     @override
+     async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:
+         # Special handling for None values
+         if value is None:
+             content_type = 'application/x-none'  # Special content type to identify None values
+             value_bytes = b''
+         else:
+             content_type = content_type or infer_mime_type(value)
+
+             # Serialize the value to bytes.
+             if 'application/json' in content_type:
+                 value_bytes = json.dumps(value, default=str, ensure_ascii=False).encode('utf-8')
+             elif isinstance(value, str):
+                 value_bytes = value.encode('utf-8')
+             elif isinstance(value, (bytes, bytearray)):
+                 value_bytes = value
+             else:
+                 # Fallback: attempt to convert to string and encode.
+                 value_bytes = str(value).encode('utf-8')
+
+         size = len(value_bytes)
+         item_metadata = KeyValueStoreRecordMetadata(
+             key=key,
+             content_type=content_type,
+             size=size,
+         )
+
+         async with self._get_pipeline() as pipe:
+             # redis-py typing issue
+             await await_redis_response(pipe.hset(self._items_key, key, value_bytes))  # ty: ignore[invalid-argument-type]
+
+             await await_redis_response(
+                 pipe.hset(
+                     self._metadata_items_key,
+                     key,
+                     item_metadata.model_dump_json(),
+                 )
+             )
+             await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True))
+
+     @override
+     async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
+         serialized_metadata_item = await await_redis_response(self._redis.hget(self._metadata_items_key, key))
+
+         async with self._get_pipeline() as pipe:
+             await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True))
+
+         if not isinstance(serialized_metadata_item, (str, bytes, bytearray)):
+             logger.warning(f'Metadata for key "{key}" is missing or invalid.')
+             return None
+
+         metadata_item = KeyValueStoreRecordMetadata.model_validate_json(serialized_metadata_item)
+
+         # Handle None values
+         if metadata_item.content_type == 'application/x-none':
+             return KeyValueStoreRecord(value=None, **metadata_item.model_dump())
+
+         # Query the record by key
+         # redis-py typing issue
+         value_bytes: bytes | None = await await_redis_response(self._redis.hget(self._items_key, key))  # ty: ignore[invalid-assignment]
+
+         if value_bytes is None:
+             logger.warning(f'Value for key "{key}" is missing.')
+             return None
+
+         # Handle JSON values
+         if 'application/json' in metadata_item.content_type:
+             try:
+                 value = json.loads(value_bytes.decode('utf-8'))
+             except (json.JSONDecodeError, UnicodeDecodeError):
+                 logger.warning(f'Failed to decode JSON value for key "{key}"')
+                 return None
+         # Handle text values
+         elif metadata_item.content_type.startswith('text/'):
+             try:
+                 value = value_bytes.decode('utf-8')
+             except UnicodeDecodeError:
+                 logger.warning(f'Failed to decode text value for key "{key}"')
+                 return None
+         # Handle binary values
+         else:
+             value = value_bytes
+
+         return KeyValueStoreRecord(value=value, **metadata_item.model_dump())
+
+     @override
+     async def delete_value(self, *, key: str) -> None:
+         async with self._get_pipeline() as pipe:
+             await await_redis_response(pipe.hdel(self._items_key, key))
+             await await_redis_response(pipe.hdel(self._metadata_items_key, key))
+             await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True))
+
+     @override
+     async def iterate_keys(
+         self,
+         *,
+         exclusive_start_key: str | None = None,
+         limit: int | None = None,
+     ) -> AsyncIterator[KeyValueStoreRecordMetadata]:
+         items_data = await await_redis_response(self._redis.hgetall(self._metadata_items_key))
+
+         if not items_data:
+             return  # No items to iterate over
+
+         if not isinstance(items_data, dict):
+             raise TypeError('The items data was received in an incorrect format.')
+
+         # Get all keys, sorted alphabetically
+         keys = sorted(items_data.keys())  # ty: ignore[invalid-argument-type]
+
+         # Apply exclusive_start_key filter if provided
+         if exclusive_start_key is not None:
+             bytes_exclusive_start_key = exclusive_start_key.encode()
+             keys = [k for k in keys if k > bytes_exclusive_start_key]
+
+         # Apply limit if provided
+         if limit is not None:
+             keys = keys[:limit]
+
+         # Yield metadata for each key
+         for key in keys:
+             record = items_data[key]
+             yield KeyValueStoreRecordMetadata.model_validate_json(record)
+
+         async with self._get_pipeline() as pipe:
+             await self._update_metadata(
+                 pipe,
+                 **MetadataUpdateParams(update_accessed_at=True),
+             )
+
+     @override
+     async def get_public_url(self, *, key: str) -> str:
+         raise NotImplementedError('Public URLs are not supported for Redis key-value stores.')
+
+     @override
+     async def record_exists(self, *, key: str) -> bool:
+         async with self._get_pipeline(with_execute=False) as pipe:
+             await await_redis_response(pipe.hexists(self._items_key, key))
+             await self._update_metadata(
+                 pipe,
+                 **MetadataUpdateParams(update_accessed_at=True),
+             )
+             results = await pipe.execute()
+
+         return bool(results[0])
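
For reference, a short usage sketch of the key-value store client added above, exercising only the methods visible in this diff (`open`, `set_value`, `get_value`, `iterate_keys`, `record_exists`). The import path and connection URL are placeholders for illustration; per the class docstring, store metadata is kept as a Redis JSON value, so a Redis deployment with JSON support is assumed.

import asyncio

from redis.asyncio import Redis

from crawlee.storage_clients._redis import RedisKeyValueStoreClient  # internal module path, shown for illustration only


async def main() -> None:
    redis = Redis.from_url('redis://localhost:6379/0')  # placeholder connection URL

    # Open (or create) a named store; its data lives under 'key_value_stores:my-store:*'.
    kvs = await RedisKeyValueStoreClient.open(id=None, name='my-store', alias=None, redis=redis)

    # Per set_value() above: dicts are stored as UTF-8 JSON, strings as UTF-8 text, bytes as-is.
    await kvs.set_value(key='config', value={'retries': 3})
    await kvs.set_value(key='note', value='plain text', content_type='text/plain')

    record = await kvs.get_value(key='config')
    if record is not None:
        print(record.value, record.content_type, record.size)

    # iterate_keys() yields record metadata in alphabetical key order.
    async for item in kvs.iterate_keys():
        print(item.key, item.content_type)

    print(await kvs.record_exists(key='note'))

    await redis.aclose()


if __name__ == '__main__':
    asyncio.run(main())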