crawlee 0.6.13b17__py3-none-any.whl → 1.1.2b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlee might be problematic. Click here for more details.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +35 -33
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +106 -34
- crawlee/_utils/context.py +2 -2
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +17 -1
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +4 -2
- crawlee/_utils/system.py +3 -3
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +50 -12
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +219 -126
- crawlee/crawlers/_basic/_logging_utils.py +5 -1
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +60 -11
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/events/_event_manager.py +4 -4
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/sessions/_models.py +2 -2
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +43 -4
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
- crawlee/storage_clients/_file_system/_key_value_store_client.py +30 -26
- crawlee/storage_clients/_file_system/_request_queue_client.py +169 -153
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +13 -11
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +133 -71
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +18 -6
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +101 -78
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,21 +1,66 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from collections
|
|
4
|
-
from
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from collections.abc import Coroutine, Hashable
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import TYPE_CHECKING, TypeVar
|
|
5
7
|
|
|
8
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
6
9
|
from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient
|
|
7
10
|
|
|
8
|
-
from .
|
|
11
|
+
from ._utils import validate_storage_name
|
|
9
12
|
|
|
10
13
|
if TYPE_CHECKING:
|
|
11
|
-
from
|
|
14
|
+
from ._base import Storage
|
|
12
15
|
|
|
13
16
|
T = TypeVar('T', bound='Storage')
|
|
14
17
|
|
|
15
|
-
StorageClientType = DatasetClient | KeyValueStoreClient | RequestQueueClient
|
|
16
|
-
"""Type alias for the storage client types."""
|
|
17
18
|
|
|
18
|
-
|
|
19
|
+
@dataclass
|
|
20
|
+
class _StorageCache:
|
|
21
|
+
"""Cache for storage instances."""
|
|
22
|
+
|
|
23
|
+
by_id: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(
|
|
24
|
+
default_factory=lambda: defaultdict(lambda: defaultdict(lambda: defaultdict()))
|
|
25
|
+
)
|
|
26
|
+
"""Cache for storage instances by ID. Example: by_id[Dataset]['some_id']['some_additional_cache_key']."""
|
|
27
|
+
|
|
28
|
+
by_name: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(
|
|
29
|
+
default_factory=lambda: defaultdict(lambda: defaultdict(lambda: defaultdict()))
|
|
30
|
+
)
|
|
31
|
+
"""Cache for storage instances by name. Example: by_name[Dataset]['some_name']['some_additional_cache_key']"""
|
|
32
|
+
|
|
33
|
+
by_alias: defaultdict[type[Storage], defaultdict[str, defaultdict[Hashable, Storage]]] = field(
|
|
34
|
+
default_factory=lambda: defaultdict(lambda: defaultdict(lambda: defaultdict()))
|
|
35
|
+
)
|
|
36
|
+
"""Cache for storage instances by alias. Example: by_alias[Dataset]['some_alias']['some_additional_cache_key']"""
|
|
37
|
+
|
|
38
|
+
def remove_from_cache(self, storage_instance: Storage) -> None:
|
|
39
|
+
"""Remove a storage instance from the cache.
|
|
40
|
+
|
|
41
|
+
Args:
|
|
42
|
+
storage_instance: The storage instance to remove.
|
|
43
|
+
"""
|
|
44
|
+
storage_type = type(storage_instance)
|
|
45
|
+
|
|
46
|
+
# Remove from ID cache
|
|
47
|
+
for additional_key in self.by_id[storage_type][storage_instance.id]:
|
|
48
|
+
del self.by_id[storage_type][storage_instance.id][additional_key]
|
|
49
|
+
break
|
|
50
|
+
|
|
51
|
+
# Remove from name cache or alias cache. It can never be in both.
|
|
52
|
+
if storage_instance.name is not None:
|
|
53
|
+
for additional_key in self.by_name[storage_type][storage_instance.name]:
|
|
54
|
+
del self.by_name[storage_type][storage_instance.name][additional_key]
|
|
55
|
+
break
|
|
56
|
+
else:
|
|
57
|
+
for alias_key in self.by_alias[storage_type]:
|
|
58
|
+
for additional_key in self.by_alias[storage_type][alias_key]:
|
|
59
|
+
del self.by_alias[storage_type][alias_key][additional_key]
|
|
60
|
+
break
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
ClientOpenerCoro = Coroutine[None, None, DatasetClient | KeyValueStoreClient | RequestQueueClient]
|
|
19
64
|
"""Type alias for the client opener function."""
|
|
20
65
|
|
|
21
66
|
|
|
@@ -26,15 +71,11 @@ class StorageInstanceManager:
|
|
|
26
71
|
and provides a unified interface for opening and managing storage instances.
|
|
27
72
|
"""
|
|
28
73
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
"""Cache for storage instances by ID, separated by storage type."""
|
|
32
|
-
|
|
33
|
-
self._cache_by_name = dict[type[Storage], dict[str, Storage]]()
|
|
34
|
-
"""Cache for storage instances by name, separated by storage type."""
|
|
74
|
+
_DEFAULT_STORAGE_ALIAS = '__default__'
|
|
75
|
+
"""Reserved alias for default unnamed storage."""
|
|
35
76
|
|
|
36
|
-
|
|
37
|
-
|
|
77
|
+
def __init__(self) -> None:
|
|
78
|
+
self._cache: _StorageCache = _StorageCache()
|
|
38
79
|
|
|
39
80
|
async def open_storage_instance(
|
|
40
81
|
self,
|
|
@@ -42,66 +83,104 @@ class StorageInstanceManager:
|
|
|
42
83
|
*,
|
|
43
84
|
id: str | None,
|
|
44
85
|
name: str | None,
|
|
45
|
-
|
|
46
|
-
|
|
86
|
+
alias: str | None,
|
|
87
|
+
client_opener_coro: ClientOpenerCoro,
|
|
88
|
+
storage_client_cache_key: Hashable = '',
|
|
47
89
|
) -> T:
|
|
48
90
|
"""Open a storage instance with caching support.
|
|
49
91
|
|
|
50
92
|
Args:
|
|
51
93
|
cls: The storage class to instantiate.
|
|
52
94
|
id: Storage ID.
|
|
53
|
-
name: Storage name.
|
|
54
|
-
|
|
55
|
-
|
|
95
|
+
name: Storage name. (global scope, persists across runs). Name can only contain letters "a" through "z",
|
|
96
|
+
the digits "0" through "9", and the hyphen ("-") but only in the middle of the string
|
|
97
|
+
(e.g. "my-value-1").
|
|
98
|
+
alias: Storage alias (run scope, creates unnamed storage).
|
|
99
|
+
client_opener_coro: Coroutine to open the storage client when storage instance not found in cache.
|
|
100
|
+
storage_client_cache_key: Additional optional key from storage client to differentiate cache entries.
|
|
56
101
|
|
|
57
102
|
Returns:
|
|
58
103
|
The storage instance.
|
|
59
104
|
|
|
60
105
|
Raises:
|
|
61
|
-
ValueError: If
|
|
106
|
+
ValueError: If multiple parameters out of `id`, `name`, and `alias` are specified.
|
|
62
107
|
"""
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
108
|
+
try:
|
|
109
|
+
if name == self._DEFAULT_STORAGE_ALIAS:
|
|
110
|
+
raise ValueError(
|
|
111
|
+
f'Storage name cannot be "{self._DEFAULT_STORAGE_ALIAS}" as it is reserved for default alias.'
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
# Validate input parameters.
|
|
115
|
+
raise_if_too_many_kwargs(id=id, name=name, alias=alias)
|
|
116
|
+
|
|
117
|
+
# Auto-set alias='default' when no parameters are specified.
|
|
118
|
+
# Default unnamed storage is equal to alias=default unnamed storage.
|
|
119
|
+
if not any([name, alias, id]):
|
|
120
|
+
alias = self._DEFAULT_STORAGE_ALIAS
|
|
121
|
+
|
|
122
|
+
# Check cache
|
|
123
|
+
if id is not None and (cached_instance := self._cache.by_id[cls][id].get(storage_client_cache_key)):
|
|
75
124
|
if isinstance(cached_instance, cls):
|
|
76
125
|
return cached_instance
|
|
126
|
+
raise RuntimeError('Cached instance type mismatch.')
|
|
77
127
|
|
|
78
|
-
|
|
79
|
-
type_cache_by_name = self._cache_by_name.get(cls, {})
|
|
80
|
-
if name in type_cache_by_name:
|
|
81
|
-
cached_instance = type_cache_by_name[name]
|
|
128
|
+
if name is not None and (cached_instance := self._cache.by_name[cls][name].get(storage_client_cache_key)):
|
|
82
129
|
if isinstance(cached_instance, cls):
|
|
83
130
|
return cached_instance
|
|
131
|
+
raise RuntimeError('Cached instance type mismatch.')
|
|
132
|
+
|
|
133
|
+
if alias is not None and (
|
|
134
|
+
cached_instance := self._cache.by_alias[cls][alias].get(storage_client_cache_key)
|
|
135
|
+
):
|
|
136
|
+
if isinstance(cached_instance, cls):
|
|
137
|
+
return cached_instance
|
|
138
|
+
raise RuntimeError('Cached instance type mismatch.')
|
|
139
|
+
|
|
140
|
+
# Check for conflicts between named and alias storages
|
|
141
|
+
if alias and (self._cache.by_name[cls][alias].get(storage_client_cache_key)):
|
|
142
|
+
raise ValueError(
|
|
143
|
+
f'Cannot create alias storage "{alias}" because a named storage with the same name already exists. '
|
|
144
|
+
f'Use a different alias or drop the existing named storage first.'
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
if name and (self._cache.by_alias[cls][name].get(storage_client_cache_key)):
|
|
148
|
+
raise ValueError(
|
|
149
|
+
f'Cannot create named storage "{name}" because an alias storage with the same name already exists. '
|
|
150
|
+
f'Use a different name or drop the existing alias storage first.'
|
|
151
|
+
)
|
|
152
|
+
|
|
153
|
+
# Validate storage name
|
|
154
|
+
if name is not None:
|
|
155
|
+
validate_storage_name(name)
|
|
84
156
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
157
|
+
# Create new instance
|
|
158
|
+
client: KeyValueStoreClient | DatasetClient | RequestQueueClient
|
|
159
|
+
client = await client_opener_coro
|
|
88
160
|
|
|
89
|
-
|
|
90
|
-
instance_name = getattr(instance, 'name', None)
|
|
161
|
+
metadata = await client.get_metadata()
|
|
91
162
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
type_cache_by_name = self._cache_by_name.setdefault(cls, {})
|
|
163
|
+
instance = cls(client, metadata.id, metadata.name) # type: ignore[call-arg]
|
|
164
|
+
instance_name = getattr(instance, 'name', None)
|
|
95
165
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
166
|
+
# Cache the instance.
|
|
167
|
+
# Always cache by id.
|
|
168
|
+
self._cache.by_id[cls][instance.id][storage_client_cache_key] = instance
|
|
99
169
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
170
|
+
# Cache named storage.
|
|
171
|
+
if instance_name is not None:
|
|
172
|
+
self._cache.by_name[cls][instance_name][storage_client_cache_key] = instance
|
|
103
173
|
|
|
104
|
-
|
|
174
|
+
# Cache unnamed storage.
|
|
175
|
+
if alias is not None:
|
|
176
|
+
self._cache.by_alias[cls][alias][storage_client_cache_key] = instance
|
|
177
|
+
|
|
178
|
+
return instance
|
|
179
|
+
|
|
180
|
+
finally:
|
|
181
|
+
# Make sure the client opener is closed.
|
|
182
|
+
# If it was awaited, then closing is no operation, if it was not awaited, this is the cleanup.
|
|
183
|
+
client_opener_coro.close()
|
|
105
184
|
|
|
106
185
|
def remove_from_cache(self, storage_instance: Storage) -> None:
|
|
107
186
|
"""Remove a storage instance from the cache.
|
|
@@ -109,25 +188,8 @@ class StorageInstanceManager:
|
|
|
109
188
|
Args:
|
|
110
189
|
storage_instance: The storage instance to remove.
|
|
111
190
|
"""
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
# Remove from ID cache
|
|
115
|
-
type_cache_by_id = self._cache_by_id.get(storage_type, {})
|
|
116
|
-
if storage_instance.id in type_cache_by_id:
|
|
117
|
-
del type_cache_by_id[storage_instance.id]
|
|
118
|
-
|
|
119
|
-
# Remove from name cache
|
|
120
|
-
if storage_instance.name is not None:
|
|
121
|
-
type_cache_by_name = self._cache_by_name.get(storage_type, {})
|
|
122
|
-
if storage_instance.name in type_cache_by_name:
|
|
123
|
-
del type_cache_by_name[storage_instance.name]
|
|
124
|
-
|
|
125
|
-
# Remove from default instances
|
|
126
|
-
if storage_type in self._default_instances and self._default_instances[storage_type] is storage_instance:
|
|
127
|
-
del self._default_instances[storage_type]
|
|
191
|
+
self._cache.remove_from_cache(storage_instance)
|
|
128
192
|
|
|
129
193
|
def clear_cache(self) -> None:
|
|
130
194
|
"""Clear all cached storage instances."""
|
|
131
|
-
self._cache_by_id.clear()
|
|
132
|
-
self._cache_by_name.clear()
|
|
133
|
-
self._default_instances.clear()
|
|
195
|
+
self._cache = _StorageCache()
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
NAME_REGEX = re.compile(r'^([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])$')
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def validate_storage_name(name: str | None) -> None:
|
|
7
|
+
if name and not NAME_REGEX.match(name):
|
|
8
|
+
raise ValueError(
|
|
9
|
+
f'Invalid storage name "{name}". Name can only contain letters "a" through "z", the digits "0" through'
|
|
10
|
+
'"9", and the hyphen ("-") but only in the middle of the string (e.g. "my-value-1")'
|
|
11
|
+
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: crawlee
|
|
3
|
-
Version: 0.6.13b17
|
|
3
|
+
Version: 1.1.2b7
|
|
4
4
|
Summary: Crawlee for Python
|
|
5
5
|
Project-URL: Apify Homepage, https://apify.com
|
|
6
6
|
Project-URL: Changelog, https://crawlee.dev/python/docs/changelog
|
|
@@ -223,16 +223,18 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
223
223
|
Classifier: Programming Language :: Python :: 3.11
|
|
224
224
|
Classifier: Programming Language :: Python :: 3.12
|
|
225
225
|
Classifier: Programming Language :: Python :: 3.13
|
|
226
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
226
227
|
Classifier: Topic :: Software Development :: Libraries
|
|
227
228
|
Requires-Python: >=3.10
|
|
229
|
+
Requires-Dist: async-timeout>=5.0.1
|
|
228
230
|
Requires-Dist: cachetools>=5.5.0
|
|
229
231
|
Requires-Dist: colorama>=0.4.0
|
|
230
|
-
Requires-Dist: impit>=0.
|
|
232
|
+
Requires-Dist: impit>=0.8.0
|
|
231
233
|
Requires-Dist: more-itertools>=10.2.0
|
|
232
234
|
Requires-Dist: protego>=0.5.0
|
|
233
235
|
Requires-Dist: psutil>=6.0.0
|
|
234
|
-
Requires-Dist: pydantic
|
|
235
|
-
Requires-Dist: pydantic
|
|
236
|
+
Requires-Dist: pydantic-settings>=2.12.0
|
|
237
|
+
Requires-Dist: pydantic>=2.11.0
|
|
236
238
|
Requires-Dist: pyee>=9.0.0
|
|
237
239
|
Requires-Dist: tldextract>=5.1.0
|
|
238
240
|
Requires-Dist: typing-extensions>=4.1.0
|
|
@@ -244,7 +246,9 @@ Requires-Dist: jaro-winkler>=2.0.3; extra == 'adaptive-crawler'
|
|
|
244
246
|
Requires-Dist: playwright>=1.27.0; extra == 'adaptive-crawler'
|
|
245
247
|
Requires-Dist: scikit-learn>=1.6.0; extra == 'adaptive-crawler'
|
|
246
248
|
Provides-Extra: all
|
|
249
|
+
Requires-Dist: aiosqlite>=0.21.0; extra == 'all'
|
|
247
250
|
Requires-Dist: apify-fingerprint-datapoints>=0.0.2; extra == 'all'
|
|
251
|
+
Requires-Dist: asyncpg>=0.24.0; extra == 'all'
|
|
248
252
|
Requires-Dist: beautifulsoup4[lxml]>=4.12.0; extra == 'all'
|
|
249
253
|
Requires-Dist: browserforge>=1.2.3; extra == 'all'
|
|
250
254
|
Requires-Dist: cookiecutter>=2.6.0; extra == 'all'
|
|
@@ -261,8 +265,10 @@ Requires-Dist: opentelemetry-sdk>=1.34.1; extra == 'all'
|
|
|
261
265
|
Requires-Dist: opentelemetry-semantic-conventions>=0.54; extra == 'all'
|
|
262
266
|
Requires-Dist: parsel>=1.10.0; extra == 'all'
|
|
263
267
|
Requires-Dist: playwright>=1.27.0; extra == 'all'
|
|
268
|
+
Requires-Dist: redis[hiredis]>=7.0.0; extra == 'all'
|
|
264
269
|
Requires-Dist: rich>=13.9.0; extra == 'all'
|
|
265
270
|
Requires-Dist: scikit-learn>=1.6.0; extra == 'all'
|
|
271
|
+
Requires-Dist: sqlalchemy[asyncio]<3.0.0,>=2.0.0; extra == 'all'
|
|
266
272
|
Requires-Dist: typer>=0.12.0; extra == 'all'
|
|
267
273
|
Requires-Dist: wrapt>=1.17.0; extra == 'all'
|
|
268
274
|
Provides-Extra: beautifulsoup
|
|
@@ -293,6 +299,14 @@ Provides-Extra: playwright
|
|
|
293
299
|
Requires-Dist: apify-fingerprint-datapoints>=0.0.2; extra == 'playwright'
|
|
294
300
|
Requires-Dist: browserforge>=1.2.3; extra == 'playwright'
|
|
295
301
|
Requires-Dist: playwright>=1.27.0; extra == 'playwright'
|
|
302
|
+
Provides-Extra: redis
|
|
303
|
+
Requires-Dist: redis[hiredis]>=7.0.0; extra == 'redis'
|
|
304
|
+
Provides-Extra: sql-postgres
|
|
305
|
+
Requires-Dist: asyncpg>=0.24.0; extra == 'sql-postgres'
|
|
306
|
+
Requires-Dist: sqlalchemy[asyncio]<3.0.0,>=2.0.0; extra == 'sql-postgres'
|
|
307
|
+
Provides-Extra: sql-sqlite
|
|
308
|
+
Requires-Dist: aiosqlite>=0.21.0; extra == 'sql-sqlite'
|
|
309
|
+
Requires-Dist: sqlalchemy[asyncio]<3.0.0,>=2.0.0; extra == 'sql-sqlite'
|
|
296
310
|
Description-Content-Type: text/markdown
|
|
297
311
|
|
|
298
312
|
<h1 align="center">
|
|
@@ -327,8 +341,6 @@ Description-Content-Type: text/markdown
|
|
|
327
341
|
|
|
328
342
|
Crawlee covers your crawling and scraping end-to-end and **helps you build reliable scrapers. Fast.**
|
|
329
343
|
|
|
330
|
-
> 🚀 Crawlee for Python is open to early adopters!
|
|
331
|
-
|
|
332
344
|
Your crawlers will appear almost human-like and fly under the radar of modern bot protections even with the default configuration. Crawlee gives you the tools to crawl the web for links, scrape data and persistently store it in machine-readable formats, without having to worry about the technical details. And thanks to rich configuration options, you can tweak almost any aspect of Crawlee to suit your project's needs if the default settings don't cut it.
|
|
333
345
|
|
|
334
346
|
> 👉 **View full documentation, guides and examples on the [Crawlee project website](https://crawlee.dev/python/)** 👈
|