crawlee 1.0.3b6__py3-none-any.whl → 1.2.2b24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/__init__.py +2 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +32 -13
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +44 -5
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +13 -6
- crawlee/_utils/system.py +27 -11
- crawlee/_utils/time.py +41 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +1 -1
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
- crawlee/crawlers/_basic/_basic_crawler.py +156 -131
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +23 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/request_loaders/_sitemap_request_loader.py +23 -5
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +3 -3
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +51 -9
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +4 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
- crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
- crawlee/storage_clients/_file_system/_request_queue_client.py +29 -10
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/_client_mixin.py +1 -1
- crawlee/storage_clients/_sql/_db_models.py +1 -2
- crawlee/storage_clients/models.py +8 -3
- crawlee/storages/_key_value_store.py +5 -2
- crawlee/storages/_storage_instance_manager.py +103 -44
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +14 -16
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +82 -69
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/storage_clients/_redis/_key_value_store_client.py (new file)
@@ -0,0 +1,262 @@
+from __future__ import annotations
+
+import json
+from logging import getLogger
+from typing import TYPE_CHECKING, Any
+
+from typing_extensions import override
+
+from crawlee._utils.file import infer_mime_type
+from crawlee.storage_clients._base import KeyValueStoreClient
+from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
+
+from ._client_mixin import MetadataUpdateParams, RedisClientMixin
+from ._utils import await_redis_response
+
+if TYPE_CHECKING:
+    from collections.abc import AsyncIterator
+
+    from redis.asyncio import Redis
+
+logger = getLogger(__name__)
+
+
+class RedisKeyValueStoreClient(KeyValueStoreClient, RedisClientMixin):
+    """Redis implementation of the key-value store client.
+
+    This client persists key-value data to Redis using hash data structures for efficient storage and retrieval.
+    Keys are mapped to values with automatic content type detection and size tracking for metadata management.
+
+    The key-value store data is stored in Redis using the following key pattern:
+    - `key_value_stores:{name}:items` - Redis hash containing key-value pairs (values stored as binary data).
+    - `key_value_stores:{name}:metadata_items` - Redis hash containing metadata for each key.
+    - `key_value_stores:{name}:metadata` - Redis JSON object containing store metadata.
+
+    Values are serialized based on their type: JSON objects are stored as UTF-8 encoded JSON strings,
+    text values as UTF-8 encoded strings, and binary data as-is. The implementation automatically handles
+    content type detection and maintains metadata about each record including size and MIME type information.
+
+    All operations are atomic through Redis hash operations and pipeline transactions. The client supports
+    concurrent access through Redis's built-in atomic operations for hash fields.
+    """
+
+    _DEFAULT_NAME = 'default'
+    """Default key-value store name used when none is provided."""
+
+    _MAIN_KEY = 'key_value_stores'
+    """Main Redis key prefix for key-value stores."""
+
+    _CLIENT_TYPE = 'Key-value store'
+    """Human-readable client type for error messages."""
+
+    def __init__(self, storage_name: str, storage_id: str, redis: Redis) -> None:
+        """Initialize a new instance.
+
+        Preferably use the `RedisKeyValueStoreClient.open` class method to create a new instance.
+        """
+        super().__init__(storage_name=storage_name, storage_id=storage_id, redis=redis)
+
+    @property
+    def _items_key(self) -> str:
+        """Return the Redis key for the KVS items hash."""
+        return f'{self._MAIN_KEY}:{self._storage_name}:items'
+
+    @property
+    def _metadata_items_key(self) -> str:
+        """Return the Redis key for the KVS item metadata hash."""
+        return f'{self._MAIN_KEY}:{self._storage_name}:metadata_items'
+
+    @classmethod
+    async def open(
+        cls,
+        *,
+        id: str | None,
+        name: str | None,
+        alias: str | None,
+        redis: Redis,
+    ) -> RedisKeyValueStoreClient:
+        """Open or create a new Redis key-value store client.
+
+        This method attempts to open an existing key-value store from the Redis database. If a store with the
+        specified ID or name exists, it loads the metadata from the database. If no existing store is found,
+        a new one is created.
+
+        Args:
+            id: The ID of the key-value store. If not provided, a random ID will be generated.
+            name: The name of the key-value store for named (global scope) storages.
+            alias: The alias of the key-value store for unnamed (run scope) storages.
+            redis: Redis client instance.
+
+        Returns:
+            An instance of the opened or created storage client.
+        """
+        return await cls._open(
+            id=id,
+            name=name,
+            alias=alias,
+            redis=redis,
+            metadata_model=KeyValueStoreMetadata,
+            extra_metadata_fields={},
+            instance_kwargs={},
+        )
+
+    @override
+    async def get_metadata(self) -> KeyValueStoreMetadata:
+        return await self._get_metadata(KeyValueStoreMetadata)
+
+    @override
+    async def drop(self) -> None:
+        await self._drop(extra_keys=[self._items_key, self._metadata_items_key])
+
+    @override
+    async def purge(self) -> None:
+        await self._purge(
+            extra_keys=[self._items_key, self._metadata_items_key],
+            metadata_kwargs=MetadataUpdateParams(update_accessed_at=True, update_modified_at=True),
+        )
+
+    @override
+    async def set_value(self, *, key: str, value: Any, content_type: str | None = None) -> None:
+        # Special handling for None values
+        if value is None:
+            content_type = 'application/x-none'  # Special content type to identify None values
+            value_bytes = b''
+        else:
+            content_type = content_type or infer_mime_type(value)
+
+            # Serialize the value to bytes.
+            if 'application/json' in content_type:
+                value_bytes = json.dumps(value, default=str, ensure_ascii=False).encode('utf-8')
+            elif isinstance(value, str):
+                value_bytes = value.encode('utf-8')
+            elif isinstance(value, (bytes, bytearray)):
+                value_bytes = value
+            else:
+                # Fallback: attempt to convert to string and encode.
+                value_bytes = str(value).encode('utf-8')
+
+        size = len(value_bytes)
+        item_metadata = KeyValueStoreRecordMetadata(
+            key=key,
+            content_type=content_type,
+            size=size,
+        )
+
+        async with self._get_pipeline() as pipe:
+            # redis-py typing issue
+            await await_redis_response(pipe.hset(self._items_key, key, value_bytes))  # ty: ignore[invalid-argument-type]
+
+            await await_redis_response(
+                pipe.hset(
+                    self._metadata_items_key,
+                    key,
+                    item_metadata.model_dump_json(),
+                )
+            )
+            await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True))
+
+    @override
+    async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
+        serialized_metadata_item = await await_redis_response(self._redis.hget(self._metadata_items_key, key))
+
+        async with self._get_pipeline() as pipe:
+            await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True))
+
+        if not isinstance(serialized_metadata_item, (str, bytes, bytearray)):
+            logger.warning(f'Metadata for key "{key}" is missing or invalid.')
+            return None
+
+        metadata_item = KeyValueStoreRecordMetadata.model_validate_json(serialized_metadata_item)
+
+        # Handle None values
+        if metadata_item.content_type == 'application/x-none':
+            return KeyValueStoreRecord(value=None, **metadata_item.model_dump())
+
+        # Query the record by key
+        # redis-py typing issue
+        value_bytes: bytes | None = await await_redis_response(self._redis.hget(self._items_key, key))  # ty: ignore[invalid-assignment]
+
+        if value_bytes is None:
+            logger.warning(f'Value for key "{key}" is missing.')
+            return None
+
+        # Handle JSON values
+        if 'application/json' in metadata_item.content_type:
+            try:
+                value = json.loads(value_bytes.decode('utf-8'))
+            except (json.JSONDecodeError, UnicodeDecodeError):
+                logger.warning(f'Failed to decode JSON value for key "{key}"')
+                return None
+        # Handle text values
+        elif metadata_item.content_type.startswith('text/'):
+            try:
+                value = value_bytes.decode('utf-8')
+            except UnicodeDecodeError:
+                logger.warning(f'Failed to decode text value for key "{key}"')
+                return None
+        # Handle binary values
+        else:
+            value = value_bytes
+
+        return KeyValueStoreRecord(value=value, **metadata_item.model_dump())
+
+    @override
+    async def delete_value(self, *, key: str) -> None:
+        async with self._get_pipeline() as pipe:
+            await await_redis_response(pipe.hdel(self._items_key, key))
+            await await_redis_response(pipe.hdel(self._metadata_items_key, key))
+            await self._update_metadata(pipe, **MetadataUpdateParams(update_accessed_at=True, update_modified_at=True))
+
+    @override
+    async def iterate_keys(
+        self,
+        *,
+        exclusive_start_key: str | None = None,
+        limit: int | None = None,
+    ) -> AsyncIterator[KeyValueStoreRecordMetadata]:
+        items_data = await await_redis_response(self._redis.hgetall(self._metadata_items_key))
+
+        if not items_data:
+            return  # No items to iterate over
+
+        if not isinstance(items_data, dict):
+            raise TypeError('The items data was received in an incorrect format.')
+
+        # Get all keys, sorted alphabetically
+        keys = sorted(items_data.keys())  # ty: ignore[invalid-argument-type]
+
+        # Apply exclusive_start_key filter if provided
+        if exclusive_start_key is not None:
+            bytes_exclusive_start_key = exclusive_start_key.encode()
+            keys = [k for k in keys if k > bytes_exclusive_start_key]
+
+        # Apply limit if provided
+        if limit is not None:
+            keys = keys[:limit]
+
+        # Yield metadata for each key
+        for key in keys:
+            record = items_data[key]
+            yield KeyValueStoreRecordMetadata.model_validate_json(record)
+
+        async with self._get_pipeline() as pipe:
+            await self._update_metadata(
+                pipe,
+                **MetadataUpdateParams(update_accessed_at=True),
+            )
+
+    @override
+    async def get_public_url(self, *, key: str) -> str:
+        raise NotImplementedError('Public URLs are not supported for Redis key-value stores.')
+
+    @override
+    async def record_exists(self, *, key: str) -> bool:
+        async with self._get_pipeline(with_execute=False) as pipe:
+            await await_redis_response(pipe.hexists(self._items_key, key))
+            await self._update_metadata(
+                pipe,
+                **MetadataUpdateParams(update_accessed_at=True),
+            )
+            results = await pipe.execute()
+
+        return bool(results[0])
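For reference, a minimal usage sketch of the client above follows. It is a sketch only: the import path assumes the class is re-exported from `crawlee.storage_clients._redis` (that package's `__init__.py` is added in this release, per the file list), and the Redis URL is a placeholder. The method signatures themselves are taken from the hunk above.

```python
# Minimal sketch, not a verified example: the import path and Redis URL are assumptions;
# the method signatures (open, set_value, get_value, record_exists, iterate_keys) come
# from the diff hunk above.
import asyncio

from redis.asyncio import Redis

from crawlee.storage_clients._redis import RedisKeyValueStoreClient  # assumed export


async def main() -> None:
    redis = Redis.from_url('redis://localhost:6379')

    # Open (or create) a named store; all `open` arguments are keyword-only.
    kvs = await RedisKeyValueStoreClient.open(id=None, name='example', alias=None, redis=redis)

    # Dict values are typically detected as JSON and stored as UTF-8 encoded JSON strings.
    await kvs.set_value(key='config', value={'retries': 3})

    record = await kvs.get_value(key='config')
    if record is not None:
        print(record.content_type, record.value)

    print(await kvs.record_exists(key='config'))

    # Keys are iterated in alphabetical order, yielding per-key metadata.
    async for item in kvs.iterate_keys():
        print(item.key, item.content_type, item.size)

    await redis.aclose()  # redis-py >= 5


asyncio.run(main())
```

Note that `None` values round-trip through the special `application/x-none` content type, so `get_value` returns a record whose `value` is `None` rather than treating the key as missing.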
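The key layout described in the class docstring can also be inspected directly. The sketch below relies only on the documented key pattern (`key_value_stores:{name}:items` and `key_value_stores:{name}:metadata_items`); the store name `example` matches the sketch above and is otherwise arbitrary.

```python
# Sketch: reading the raw Redis structures behind a store, per the documented key pattern.
from redis.asyncio import Redis


async def inspect(redis: Redis, store_name: str = 'example') -> None:
    items_key = f'key_value_stores:{store_name}:items'
    metadata_items_key = f'key_value_stores:{store_name}:metadata_items'

    # One hash field per stored key: raw value bytes in `items`,
    # a JSON blob with key/content_type/size in `metadata_items`.
    items = await redis.hgetall(items_key)
    metadata = await redis.hgetall(metadata_items_key)

    for field, raw_value in items.items():
        print(field, len(raw_value), metadata.get(field))
```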