crawlee 0.6.13b15__py3-none-any.whl → 1.3.1b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlee might be problematic. Click here for more details.
- crawlee/__init__.py +2 -1
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_browserforge_workaround.py +7 -3
- crawlee/_request.py +64 -43
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +128 -36
- crawlee/_utils/context.py +3 -3
- crawlee/_utils/file.py +8 -1
- crawlee/_utils/globs.py +4 -4
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +27 -3
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +16 -7
- crawlee/_utils/system.py +30 -14
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +5 -2
- crawlee/browsers/_playwright_browser.py +2 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +5 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +57 -17
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +27 -32
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +254 -148
- crawlee/crawlers/_basic/_context_utils.py +24 -0
- crawlee/crawlers/_basic/_logging_utils.py +27 -4
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +73 -23
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/errors.py +4 -0
- crawlee/events/_event_manager.py +12 -6
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +68 -14
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +32 -11
- crawlee/otel/crawler_instrumentor.py +4 -6
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/router.py +13 -3
- crawlee/sessions/_cookies.py +13 -8
- crawlee/sessions/_models.py +5 -5
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +62 -12
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_dataset_client.py +2 -2
- crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +33 -30
- crawlee/storage_clients/_file_system/_key_value_store_client.py +34 -27
- crawlee/storage_clients/_file_system/_request_queue_client.py +171 -154
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +292 -0
- crawlee/storage_clients/_redis/_dataset_client.py +329 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
- crawlee/storage_clients/_redis/_storage_client.py +149 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +21 -14
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +196 -75
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +24 -19
- crawlee-1.3.1b3.dist-info/RECORD +207 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- crawlee-0.6.13b15.dist-info/RECORD +0 -183
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b15.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,21 +1,68 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from
|
|
4
|
-
from
|
|
5
|
-
|
|
3
|
+
from asyncio import Lock
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from collections.abc import Coroutine, Hashable
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import TYPE_CHECKING, TypeVar
|
|
8
|
+
from weakref import WeakValueDictionary
|
|
9
|
+
|
|
10
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
6
11
|
from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient
|
|
7
12
|
|
|
8
|
-
from .
|
|
13
|
+
from ._utils import validate_storage_name
|
|
9
14
|
|
|
10
15
|
if TYPE_CHECKING:
|
|
11
|
-
from
|
|
16
|
+
from ._base import Storage
|
|
12
17
|
|
|
13
18
|
T = TypeVar('T', bound='Storage')
|
|
14
19
|
|
|
15
|
-
StorageClientType = DatasetClient | KeyValueStoreClient | RequestQueueClient
|
|
16
|
-
"""Type alias for the storage client types."""
|
|
17
20
|
|
|
18
|
-
|
|
21
|
+
@dataclass
|
|
22
|
+
class _StorageCache:
|
|
23
|
+
"""Cache for storage instances."""
|
|
24
|
+
|
|
25
|
+
by_id: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(
|
|
26
|
+
default_factory=lambda: defaultdict(lambda: defaultdict(lambda: defaultdict()))
|
|
27
|
+
)
|
|
28
|
+
"""Cache for storage instances by ID. Example: by_id[Dataset]['some_id']['some_additional_cache_key']."""
|
|
29
|
+
|
|
30
|
+
by_name: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(
|
|
31
|
+
default_factory=lambda: defaultdict(lambda: defaultdict(lambda: defaultdict()))
|
|
32
|
+
)
|
|
33
|
+
"""Cache for storage instances by name. Example: by_name[Dataset]['some_name']['some_additional_cache_key']"""
|
|
34
|
+
|
|
35
|
+
by_alias: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(
|
|
36
|
+
default_factory=lambda: defaultdict(lambda: defaultdict(lambda: defaultdict()))
|
|
37
|
+
)
|
|
38
|
+
"""Cache for storage instances by alias. Example: by_alias[Dataset]['some_alias']['some_additional_cache_key']"""
|
|
39
|
+
|
|
40
|
+
def remove_from_cache(self, storage_instance: Storage) -> None:
|
|
41
|
+
"""Remove a storage instance from the cache.
|
|
42
|
+
|
|
43
|
+
Args:
|
|
44
|
+
storage_instance: The storage instance to remove.
|
|
45
|
+
"""
|
|
46
|
+
storage_type = type(storage_instance)
|
|
47
|
+
|
|
48
|
+
# Remove from ID cache
|
|
49
|
+
for additional_key in self.by_id[storage_type][storage_instance.id]:
|
|
50
|
+
del self.by_id[storage_type][storage_instance.id][additional_key]
|
|
51
|
+
break
|
|
52
|
+
|
|
53
|
+
# Remove from name cache or alias cache. It can never be in both.
|
|
54
|
+
if storage_instance.name is not None:
|
|
55
|
+
for additional_key in self.by_name[storage_type][storage_instance.name]:
|
|
56
|
+
del self.by_name[storage_type][storage_instance.name][additional_key]
|
|
57
|
+
break
|
|
58
|
+
else:
|
|
59
|
+
for alias_key in self.by_alias[storage_type]:
|
|
60
|
+
for additional_key in self.by_alias[storage_type][alias_key]:
|
|
61
|
+
del self.by_alias[storage_type][alias_key][additional_key]
|
|
62
|
+
break
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
ClientOpenerCoro = Coroutine[None, None, DatasetClient | KeyValueStoreClient | RequestQueueClient]
|
|
19
66
|
"""Type alias for the client opener function."""
|
|
20
67
|
|
|
21
68
|
|
|
@@ -26,15 +73,12 @@ class StorageInstanceManager:
|
|
|
26
73
|
and provides a unified interface for opening and managing storage instances.
|
|
27
74
|
"""
|
|
28
75
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
"""Cache for storage instances by ID, separated by storage type."""
|
|
32
|
-
|
|
33
|
-
self._cache_by_name = dict[type[Storage], dict[str, Storage]]()
|
|
34
|
-
"""Cache for storage instances by name, separated by storage type."""
|
|
76
|
+
_DEFAULT_STORAGE_ALIAS = '__default__'
|
|
77
|
+
"""Reserved alias for default unnamed storage."""
|
|
35
78
|
|
|
36
|
-
|
|
37
|
-
|
|
79
|
+
def __init__(self) -> None:
|
|
80
|
+
self._cache: _StorageCache = _StorageCache()
|
|
81
|
+
self._opener_locks: WeakValueDictionary[tuple, Lock] = WeakValueDictionary()
|
|
38
82
|
|
|
39
83
|
async def open_storage_instance(
|
|
40
84
|
self,
|
|
@@ -42,66 +86,112 @@ class StorageInstanceManager:
|
|
|
42
86
|
*,
|
|
43
87
|
id: str | None,
|
|
44
88
|
name: str | None,
|
|
45
|
-
|
|
46
|
-
|
|
89
|
+
alias: str | None,
|
|
90
|
+
client_opener_coro: ClientOpenerCoro,
|
|
91
|
+
storage_client_cache_key: Hashable = '',
|
|
47
92
|
) -> T:
|
|
48
93
|
"""Open a storage instance with caching support.
|
|
49
94
|
|
|
50
95
|
Args:
|
|
51
96
|
cls: The storage class to instantiate.
|
|
52
97
|
id: Storage ID.
|
|
53
|
-
name: Storage name.
|
|
54
|
-
|
|
55
|
-
|
|
98
|
+
name: Storage name (global scope, persists across runs). Name can only contain letters "a" through "z",
|
|
99
|
+
the digits "0" through "9", and the hyphen ("-") but only in the middle of the string
|
|
100
|
+
(e.g. "my-value-1").
|
|
101
|
+
alias: Storage alias (run scope, creates unnamed storage).
|
|
102
|
+
client_opener_coro: Coroutine to open the storage client when storage instance not found in cache.
|
|
103
|
+
storage_client_cache_key: Additional optional key from storage client to differentiate cache entries.
|
|
56
104
|
|
|
57
105
|
Returns:
|
|
58
106
|
The storage instance.
|
|
59
107
|
|
|
60
108
|
Raises:
|
|
61
|
-
ValueError: If
|
|
109
|
+
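ValueError: If multiple parameters out of `id`, `name`, and `alias` are specified, if `name` is invalid or equals the reserved default alias, or if a named storage and an alias storage would share the same name.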
|
|
62
110
|
"""
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
111
|
+
try:
|
|
112
|
+
if name == self._DEFAULT_STORAGE_ALIAS:
|
|
113
|
+
raise ValueError(
|
|
114
|
+
f'Storage name cannot be "{self._DEFAULT_STORAGE_ALIAS}" as it is reserved for default alias.'
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
# Validate input parameters.
|
|
118
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
119
|
+
|
|
120
|
+
# Auto-set alias to the reserved default alias ('__default__') when no parameters are specified.
|
|
121
|
+
# The default unnamed storage is the same storage as the one opened with the reserved default alias.
|
|
122
|
+
if not any([name, alias, id]):
|
|
123
|
+
alias = self._DEFAULT_STORAGE_ALIAS
|
|
124
|
+
|
|
125
|
+
# Check cache without lock first for performance.
|
|
126
|
+
if cached_instance := self._get_from_cache(
|
|
127
|
+
cls,
|
|
128
|
+
id=id,
|
|
129
|
+
name=name,
|
|
130
|
+
alias=alias,
|
|
131
|
+
storage_client_cache_key=storage_client_cache_key,
|
|
132
|
+
):
|
|
133
|
+
return cached_instance
|
|
134
|
+
|
|
135
|
+
# Validate storage name
|
|
136
|
+
if name is not None:
|
|
137
|
+
validate_storage_name(name)
|
|
138
|
+
|
|
139
|
+
# Acquire lock for this opener
|
|
140
|
+
opener_lock_key = (cls, str(id or name or alias), storage_client_cache_key)
|
|
141
|
+
if not (lock := self._opener_locks.get(opener_lock_key)):
|
|
142
|
+
lock = Lock()
|
|
143
|
+
self._opener_locks[opener_lock_key] = lock
|
|
144
|
+
|
|
145
|
+
async with lock:
|
|
146
|
+
# Another task could have created the storage while we were waiting for the lock - check if that
|
|
147
|
+
# happened
|
|
148
|
+
if cached_instance := self._get_from_cache(
|
|
149
|
+
cls,
|
|
150
|
+
id=id,
|
|
151
|
+
name=name,
|
|
152
|
+
alias=alias,
|
|
153
|
+
storage_client_cache_key=storage_client_cache_key,
|
|
154
|
+
):
|
|
76
155
|
return cached_instance
|
|
77
156
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
157
|
+
# Check for conflicts between named and alias storages
|
|
158
|
+
self._check_name_alias_conflict(
|
|
159
|
+
cls,
|
|
160
|
+
name=name,
|
|
161
|
+
alias=alias,
|
|
162
|
+
storage_client_cache_key=storage_client_cache_key,
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
# Create new instance
|
|
166
|
+
client: KeyValueStoreClient | DatasetClient | RequestQueueClient
|
|
167
|
+
client = await client_opener_coro
|
|
168
|
+
|
|
169
|
+
metadata = await client.get_metadata()
|
|
84
170
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
metadata = await client.get_metadata()
|
|
171
|
+
instance = cls(client, metadata.id, metadata.name) # type: ignore[call-arg]
|
|
172
|
+
instance_name = getattr(instance, 'name', None)
|
|
88
173
|
|
|
89
|
-
|
|
90
|
-
|
|
174
|
+
# Cache the instance.
|
|
175
|
+
# Note: No awaits in this section. All cache entries must be written
|
|
176
|
+
# atomically to ensure pre-checks outside the lock see consistent state.
|
|
91
177
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
type_cache_by_name = self._cache_by_name.setdefault(cls, {})
|
|
178
|
+
# Always cache by id.
|
|
179
|
+
self._cache.by_id[cls][instance.id][storage_client_cache_key] = instance
|
|
95
180
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
181
|
+
# Cache named storage.
|
|
182
|
+
if instance_name is not None:
|
|
183
|
+
self._cache.by_name[cls][instance_name][storage_client_cache_key] = instance
|
|
99
184
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
185
|
+
# Cache unnamed storage.
|
|
186
|
+
if alias is not None:
|
|
187
|
+
self._cache.by_alias[cls][alias][storage_client_cache_key] = instance
|
|
103
188
|
|
|
104
|
-
|
|
189
|
+
return instance
|
|
190
|
+
|
|
191
|
+
finally:
|
|
192
|
+
# Make sure the client opener is closed.
|
|
193
|
+
# If it was awaited, closing it is a no-op; if it was not awaited, this is the cleanup.
|
|
194
|
+
client_opener_coro.close()
|
|
105
195
|
|
|
106
196
|
def remove_from_cache(self, storage_instance: Storage) -> None:
|
|
107
197
|
"""Remove a storage instance from the cache.
|
|
@@ -109,25 +199,56 @@ class StorageInstanceManager:
|
|
|
109
199
|
Args:
|
|
110
200
|
storage_instance: The storage instance to remove.
|
|
111
201
|
"""
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
# Remove from ID cache
|
|
115
|
-
type_cache_by_id = self._cache_by_id.get(storage_type, {})
|
|
116
|
-
if storage_instance.id in type_cache_by_id:
|
|
117
|
-
del type_cache_by_id[storage_instance.id]
|
|
118
|
-
|
|
119
|
-
# Remove from name cache
|
|
120
|
-
if storage_instance.name is not None:
|
|
121
|
-
type_cache_by_name = self._cache_by_name.get(storage_type, {})
|
|
122
|
-
if storage_instance.name in type_cache_by_name:
|
|
123
|
-
del type_cache_by_name[storage_instance.name]
|
|
124
|
-
|
|
125
|
-
# Remove from default instances
|
|
126
|
-
if storage_type in self._default_instances and self._default_instances[storage_type] is storage_instance:
|
|
127
|
-
del self._default_instances[storage_type]
|
|
202
|
+
self._cache.remove_from_cache(storage_instance)
|
|
128
203
|
|
|
129
204
|
def clear_cache(self) -> None:
|
|
130
205
|
"""Clear all cached storage instances."""
|
|
131
|
-
self.
|
|
132
|
-
|
|
133
|
-
|
|
206
|
+
self._cache = _StorageCache()
|
|
207
|
+
|
|
208
|
+
def _get_from_cache(
|
|
209
|
+
self,
|
|
210
|
+
cls: type[T],
|
|
211
|
+
*,
|
|
212
|
+
id: str | None = None,
|
|
213
|
+
name: str | None = None,
|
|
214
|
+
alias: str | None = None,
|
|
215
|
+
storage_client_cache_key: Hashable = '',
|
|
216
|
+
) -> T | None:
|
|
217
|
+
"""Get a storage instance from the cache."""
|
|
218
|
+
if id is not None and (cached_instance := self._cache.by_id[cls][id].get(storage_client_cache_key)):
|
|
219
|
+
if isinstance(cached_instance, cls):
|
|
220
|
+
return cached_instance
|
|
221
|
+
raise RuntimeError('Cached instance type mismatch.')
|
|
222
|
+
|
|
223
|
+
if name is not None and (cached_instance := self._cache.by_name[cls][name].get(storage_client_cache_key)):
|
|
224
|
+
if isinstance(cached_instance, cls):
|
|
225
|
+
return cached_instance
|
|
226
|
+
raise RuntimeError('Cached instance type mismatch.')
|
|
227
|
+
|
|
228
|
+
if alias is not None and (cached_instance := self._cache.by_alias[cls][alias].get(storage_client_cache_key)):
|
|
229
|
+
if isinstance(cached_instance, cls):
|
|
230
|
+
return cached_instance
|
|
231
|
+
raise RuntimeError('Cached instance type mismatch.')
|
|
232
|
+
|
|
233
|
+
return None
|
|
234
|
+
|
|
235
|
+
def _check_name_alias_conflict(
|
|
236
|
+
self,
|
|
237
|
+
cls: type[T],
|
|
238
|
+
*,
|
|
239
|
+
name: str | None = None,
|
|
240
|
+
alias: str | None = None,
|
|
241
|
+
storage_client_cache_key: Hashable = '',
|
|
242
|
+
) -> None:
|
|
243
|
+
"""Check for conflicts between named and alias storages."""
|
|
244
|
+
if alias and (self._cache.by_name[cls][alias].get(storage_client_cache_key)):
|
|
245
|
+
raise ValueError(
|
|
246
|
+
f'Cannot create alias storage "{alias}" because a named storage with the same name already exists. '
|
|
247
|
+
f'Use a different alias or drop the existing named storage first.'
|
|
248
|
+
)
|
|
249
|
+
|
|
250
|
+
if name and (self._cache.by_alias[cls][name].get(storage_client_cache_key)):
|
|
251
|
+
raise ValueError(
|
|
252
|
+
f'Cannot create named storage "{name}" because an alias storage with the same name already exists. '
|
|
253
|
+
f'Use a different name or drop the existing alias storage first.'
|
|
254
|
+
)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
NAME_REGEX = re.compile(r'^([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])$')
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def validate_storage_name(name: str | None) -> None:
|
|
7
|
+
if name and not NAME_REGEX.match(name):
|
|
8
|
+
raise ValueError(
|
|
9
|
+
f'Invalid storage name "{name}". Name can only contain letters "a" through "z", the digits "0" through'
|
|
10
|
+
'"9", and the hyphen ("-") but only in the middle of the string (e.g. "my-value-1")'
|
|
11
|
+
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: crawlee
|
|
3
|
-
Version:
|
|
3
|
+
Version: 1.3.1b3
|
|
4
4
|
Summary: Crawlee for Python
|
|
5
5
|
Project-URL: Apify Homepage, https://apify.com
|
|
6
6
|
Project-URL: Changelog, https://crawlee.dev/python/docs/changelog
|
|
@@ -223,16 +223,18 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
223
223
|
Classifier: Programming Language :: Python :: 3.11
|
|
224
224
|
Classifier: Programming Language :: Python :: 3.12
|
|
225
225
|
Classifier: Programming Language :: Python :: 3.13
|
|
226
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
226
227
|
Classifier: Topic :: Software Development :: Libraries
|
|
227
228
|
Requires-Python: >=3.10
|
|
229
|
+
Requires-Dist: async-timeout>=5.0.1
|
|
228
230
|
Requires-Dist: cachetools>=5.5.0
|
|
229
231
|
Requires-Dist: colorama>=0.4.0
|
|
230
|
-
Requires-Dist: impit>=0.
|
|
232
|
+
Requires-Dist: impit>=0.8.0
|
|
231
233
|
Requires-Dist: more-itertools>=10.2.0
|
|
232
234
|
Requires-Dist: protego>=0.5.0
|
|
233
235
|
Requires-Dist: psutil>=6.0.0
|
|
234
|
-
Requires-Dist: pydantic
|
|
235
|
-
Requires-Dist: pydantic
|
|
236
|
+
Requires-Dist: pydantic-settings>=2.12.0
|
|
237
|
+
Requires-Dist: pydantic>=2.11.0
|
|
236
238
|
Requires-Dist: pyee>=9.0.0
|
|
237
239
|
Requires-Dist: tldextract>=5.1.0
|
|
238
240
|
Requires-Dist: typing-extensions>=4.1.0
|
|
@@ -244,7 +246,9 @@ Requires-Dist: jaro-winkler>=2.0.3; extra == 'adaptive-crawler'
|
|
|
244
246
|
Requires-Dist: playwright>=1.27.0; extra == 'adaptive-crawler'
|
|
245
247
|
Requires-Dist: scikit-learn>=1.6.0; extra == 'adaptive-crawler'
|
|
246
248
|
Provides-Extra: all
|
|
249
|
+
Requires-Dist: aiosqlite>=0.21.0; extra == 'all'
|
|
247
250
|
Requires-Dist: apify-fingerprint-datapoints>=0.0.2; extra == 'all'
|
|
251
|
+
Requires-Dist: asyncpg>=0.24.0; extra == 'all'
|
|
248
252
|
Requires-Dist: beautifulsoup4[lxml]>=4.12.0; extra == 'all'
|
|
249
253
|
Requires-Dist: browserforge>=1.2.3; extra == 'all'
|
|
250
254
|
Requires-Dist: cookiecutter>=2.6.0; extra == 'all'
|
|
@@ -261,8 +265,10 @@ Requires-Dist: opentelemetry-sdk>=1.34.1; extra == 'all'
|
|
|
261
265
|
Requires-Dist: opentelemetry-semantic-conventions>=0.54; extra == 'all'
|
|
262
266
|
Requires-Dist: parsel>=1.10.0; extra == 'all'
|
|
263
267
|
Requires-Dist: playwright>=1.27.0; extra == 'all'
|
|
268
|
+
Requires-Dist: redis[hiredis]>=7.0.0; extra == 'all'
|
|
264
269
|
Requires-Dist: rich>=13.9.0; extra == 'all'
|
|
265
270
|
Requires-Dist: scikit-learn>=1.6.0; extra == 'all'
|
|
271
|
+
Requires-Dist: sqlalchemy[asyncio]<3.0.0,>=2.0.0; extra == 'all'
|
|
266
272
|
Requires-Dist: typer>=0.12.0; extra == 'all'
|
|
267
273
|
Requires-Dist: wrapt>=1.17.0; extra == 'all'
|
|
268
274
|
Provides-Extra: beautifulsoup
|
|
@@ -293,6 +299,14 @@ Provides-Extra: playwright
|
|
|
293
299
|
Requires-Dist: apify-fingerprint-datapoints>=0.0.2; extra == 'playwright'
|
|
294
300
|
Requires-Dist: browserforge>=1.2.3; extra == 'playwright'
|
|
295
301
|
Requires-Dist: playwright>=1.27.0; extra == 'playwright'
|
|
302
|
+
Provides-Extra: redis
|
|
303
|
+
Requires-Dist: redis[hiredis]>=7.0.0; extra == 'redis'
|
|
304
|
+
Provides-Extra: sql-postgres
|
|
305
|
+
Requires-Dist: asyncpg>=0.24.0; extra == 'sql-postgres'
|
|
306
|
+
Requires-Dist: sqlalchemy[asyncio]<3.0.0,>=2.0.0; extra == 'sql-postgres'
|
|
307
|
+
Provides-Extra: sql-sqlite
|
|
308
|
+
Requires-Dist: aiosqlite>=0.21.0; extra == 'sql-sqlite'
|
|
309
|
+
Requires-Dist: sqlalchemy[asyncio]<3.0.0,>=2.0.0; extra == 'sql-sqlite'
|
|
296
310
|
Description-Content-Type: text/markdown
|
|
297
311
|
|
|
298
312
|
<h1 align="center">
|
|
@@ -310,25 +324,16 @@ Description-Content-Type: text/markdown
|
|
|
310
324
|
<a href="https://trendshift.io/repositories/11169" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11169" alt="apify%2Fcrawlee-python | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
|
311
325
|
</p>
|
|
312
326
|
|
|
313
|
-
<p align=center>
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
</a>
|
|
320
|
-
<a href="https://pypi.org/project/crawlee/" rel="nofollow">
|
|
321
|
-
<img src="https://img.shields.io/pypi/pyversions/crawlee" alt="PyPI - Python Version" style="max-width: 100%;">
|
|
322
|
-
</a>
|
|
323
|
-
<a href="https://discord.gg/jyEM2PRvMU" rel="nofollow">
|
|
324
|
-
<img src="https://img.shields.io/discord/801163717915574323?label=discord" alt="Chat on discord" style="max-width: 100%;">
|
|
325
|
-
</a>
|
|
327
|
+
<p align="center">
|
|
328
|
+
<a href="https://badge.fury.io/py/crawlee" rel="nofollow"><img src="https://badge.fury.io/py/crawlee.svg" alt="PyPI package version"></a>
|
|
329
|
+
<a href="https://pypi.org/project/crawlee/" rel="nofollow"><img src="https://img.shields.io/pypi/dm/crawlee" alt="PyPI package downloads"></a>
|
|
330
|
+
<a href="https://codecov.io/gh/apify/crawlee-python"><img src="https://codecov.io/gh/apify/crawlee-python/graph/badge.svg?token=cCju61iPQG" alt="Codecov report"></a>
|
|
331
|
+
<a href="https://pypi.org/project/crawlee/" rel="nofollow"><img src="https://img.shields.io/pypi/pyversions/crawlee" alt="PyPI Python version"></a>
|
|
332
|
+
<a href="https://discord.gg/jyEM2PRvMU" rel="nofollow"><img src="https://img.shields.io/discord/801163717915574323?label=discord" alt="Chat on Discord"></a>
|
|
326
333
|
</p>
|
|
327
334
|
|
|
328
335
|
Crawlee covers your crawling and scraping end-to-end and **helps you build reliable scrapers. Fast.**
|
|
329
336
|
|
|
330
|
-
> 🚀 Crawlee for Python is open to early adopters!
|
|
331
|
-
|
|
332
337
|
Your crawlers will appear almost human-like and fly under the radar of modern bot protections even with the default configuration. Crawlee gives you the tools to crawl the web for links, scrape data and persistently store it in machine-readable formats, without having to worry about the technical details. And thanks to rich configuration options, you can tweak almost any aspect of Crawlee to suit your project's needs if the default settings don't cut it.
|
|
333
338
|
|
|
334
339
|
> 👉 **View full documentation, guides and examples on the [Crawlee project website](https://crawlee.dev/python/)** 👈
|