crawlee 0.6.13b17__py3-none-any.whl → 1.1.2b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlee might be problematic.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +35 -33
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +106 -34
- crawlee/_utils/context.py +2 -2
- crawlee/_utils/file.py +7 -0
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +17 -1
- crawlee/_utils/requests.py +0 -26
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +4 -2
- crawlee/_utils/system.py +3 -3
- crawlee/_utils/time.py +120 -0
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +2 -0
- crawlee/crawlers/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/__init__.py +2 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +50 -12
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +39 -15
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +8 -3
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +219 -126
- crawlee/crawlers/_basic/_logging_utils.py +5 -1
- crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
- crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
- crawlee/crawlers/_playwright/_playwright_crawler.py +60 -11
- crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
- crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
- crawlee/crawlers/_playwright/_types.py +12 -2
- crawlee/events/_event_manager.py +4 -4
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/http_clients/_base.py +4 -0
- crawlee/http_clients/_curl_impersonate.py +12 -0
- crawlee/http_clients/_httpx.py +16 -6
- crawlee/http_clients/_impit.py +25 -10
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +3 -3
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +248 -50
- crawlee/sessions/_models.py +2 -2
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +43 -4
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_request_queue_client.py +2 -2
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +29 -27
- crawlee/storage_clients/_file_system/_key_value_store_client.py +30 -26
- crawlee/storage_clients/_file_system/_request_queue_client.py +169 -153
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +16 -4
- crawlee/storage_clients/_memory/_key_value_store_client.py +16 -4
- crawlee/storage_clients/_memory/_request_queue_client.py +55 -36
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +13 -11
- crawlee/storages/_base.py +5 -1
- crawlee/storages/_dataset.py +12 -2
- crawlee/storages/_key_value_store.py +17 -4
- crawlee/storages/_request_queue.py +13 -5
- crawlee/storages/_storage_instance_manager.py +133 -71
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/METADATA +18 -6
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/RECORD +101 -78
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/WHEEL +1 -1
- crawlee/_utils/measure_time.py +0 -31
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b17.dist-info → crawlee-1.1.2b7.dist-info}/licenses/LICENSE +0 -0
crawlee/_autoscaling/snapshotter.py
CHANGED
@@ -113,7 +113,7 @@ class Snapshotter:
         Args:
             config: The `Configuration` instance. Uses the global (default) one if not provided.
         """
-        config = service_locator.get_configuration()
+        config = config or service_locator.get_configuration()

         # Compute the maximum memory size based on the provided configuration. If `memory_mbytes` is provided,
         # it uses that value. Otherwise, it calculates the `max_memory_size` as a proportion of the system's
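The fix is the `config or ...` fallback: previously an explicitly passed `Configuration` was silently ignored in favor of the global one. A minimal standalone sketch of the pattern (stand-in `Config` class and `resolve_config` function, not crawlee's actual API):

from __future__ import annotations

from dataclasses import dataclass


@dataclass
class Config:
    memory_mbytes: int = 1024


GLOBAL_CONFIG = Config()


def resolve_config(config: Config | None = None) -> Config:
    # Old behavior: the explicit argument was ignored, the global default always won.
    # New behavior: an explicitly passed config takes precedence over the global one.
    return config or GLOBAL_CONFIG


assert resolve_config().memory_mbytes == 1024
assert resolve_config(Config(memory_mbytes=2048)).memory_mbytes == 2048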
crawlee/_request.py
CHANGED
@@ -11,7 +11,7 @@ from yarl import URL
 from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, JsonSerializable
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.docs import docs_group
-from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
+from crawlee._utils.requests import compute_unique_key
 from crawlee._utils.urls import validate_http_url

 if TYPE_CHECKING:
@@ -117,6 +117,7 @@ class UserData(BaseModel, MutableMapping[str, JsonSerializable]):
 user_data_adapter = TypeAdapter(UserData)


+@docs_group('Other')
 class RequestOptions(TypedDict):
     """Options that can be used to customize request creation.

@@ -163,11 +164,7 @@ class Request(BaseModel):
     ```
     """

-    model_config = ConfigDict(populate_by_name=True)
-
-    id: str
-    """A unique identifier for the request. Note that this is not used for deduplication, and should not be confused
-    with `unique_key`."""
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     unique_key: Annotated[str, Field(alias='uniqueKey')]
     """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
@@ -188,9 +185,6 @@ class Request(BaseModel):
     method: HttpMethod = 'GET'
     """HTTP request method."""

-    headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] = HttpHeaders()
-    """HTTP request headers."""
-
     payload: Annotated[
         HttpPayload | None,
         BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
@@ -198,23 +192,37 @@ class Request(BaseModel):
     ] = None
     """HTTP request payload."""

-    user_data: Annotated[
-        dict[str, JsonSerializable],  # Internally, the model contains `UserData`, this is just for convenience
-        Field(alias='userData', default_factory=lambda: UserData()),
-        PlainValidator(user_data_adapter.validate_python),
-        PlainSerializer(
-            lambda instance: user_data_adapter.dump_python(
-                instance,
-                by_alias=True,
-                exclude_none=True,
-                exclude_unset=True,
-                exclude_defaults=True,
-            )
-        ),
-    ]
-    """Custom user data assigned to the request. Use this to save any request related data to the
-    request's scope, keeping them accessible on retries, failures etc.
-    """
+    # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
+    if TYPE_CHECKING:
+        headers: HttpHeaders = HttpHeaders()
+        """HTTP request headers."""
+
+        user_data: dict[str, JsonSerializable] = {}
+        """Custom user data assigned to the request. Use this to save any request related data to the
+        request's scope, keeping them accessible on retries, failures etc.
+        """
+
+    else:
+        headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)]
+        """HTTP request headers."""
+
+        user_data: Annotated[
+            dict[str, JsonSerializable],  # Internally, the model contains `UserData`, this is just for convenience
+            Field(alias='userData', default_factory=lambda: UserData()),
+            PlainValidator(user_data_adapter.validate_python),
+            PlainSerializer(
+                lambda instance: user_data_adapter.dump_python(
+                    instance,
+                    by_alias=True,
+                    exclude_none=True,
+                    exclude_unset=True,
+                    exclude_defaults=True,
+                )
+            ),
+        ]
+        """Custom user data assigned to the request. Use this to save any request related data to the
+        request's scope, keeping them accessible on retries, failures etc.
+        """

     retry_count: Annotated[int, Field(alias='retryCount')] = 0
     """Number of times the request has been retried."""
@@ -239,7 +247,6 @@ class Request(BaseModel):
         label: str | None = None,
         session_id: str | None = None,
         unique_key: str | None = None,
-        id: str | None = None,
         keep_url_fragment: bool = False,
         use_extended_unique_key: bool = False,
         always_enqueue: bool = False,
@@ -264,8 +271,6 @@ class Request(BaseModel):
                 raised.
             unique_key: A unique key identifying the request. If not provided, it is automatically computed based on
                 the URL and other parameters. Requests with the same `unique_key` are treated as identical.
-            id: A unique identifier for the request. If not provided, it is automatically generated from the
-                `unique_key`.
             keep_url_fragment: Determines whether the URL fragment (e.g., `#section`) should be included in
                 the `unique_key` computation. This is only relevant when `unique_key` is not provided.
             use_extended_unique_key: Determines whether to include the HTTP method, ID Session and payload in the
@@ -294,14 +299,11 @@ class Request(BaseModel):
         )

         if always_enqueue:
-            unique_key = f'{
-
-        id = id or unique_key_to_request_id(unique_key)
+            unique_key = f'{crypto_random_object_id()}|{unique_key}'

         request = cls(
             url=url,
             unique_key=unique_key,
-            id=id,
             method=method,
             headers=headers,
             payload=payload,
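Two user-visible effects of this diff: the `id` field is removed from `Request`, and `always_enqueue=True` now salts the `unique_key` with a random id instead of generating a separate request id. A short usage sketch based on the signatures above (the URL is illustrative):

from crawlee import Request

# Same URL -> same unique_key, so duplicates are filtered out by default.
first = Request.from_url('https://example.com')
second = Request.from_url('https://example.com')
assert first.unique_key == second.unique_key

# always_enqueue=True prefixes the unique_key with a random id,
# so the same URL can be enqueued repeatedly.
third = Request.from_url('https://example.com', always_enqueue=True)
fourth = Request.from_url('https://example.com', always_enqueue=True)
assert third.unique_key != fourth.unique_key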
crawlee/_service_locator.py
CHANGED
@@ -11,6 +11,10 @@ from crawlee.storage_clients import FileSystemStorageClient, StorageClient
 if TYPE_CHECKING:
     from crawlee.storages._storage_instance_manager import StorageInstanceManager

+from logging import getLogger
+
+logger = getLogger(__name__)
+

 @docs_group('Configuration')
 class ServiceLocator:
@@ -19,23 +23,24 @@ class ServiceLocator:
     All services are initialized to its default value lazily.
     """

-    def __init__(self) -> None:
-        self._configuration: Configuration | None = None
-        self._event_manager: EventManager | None = None
-        self._storage_client: StorageClient | None = None
-        self._storage_instance_manager: StorageInstanceManager | None = None
+    global_storage_instance_manager: StorageInstanceManager | None = None

-
-        self._configuration_was_retrieved = False
-        self._event_manager_was_retrieved = False
-        self._storage_client_was_retrieved = False
+    def __init__(
+        self,
+        configuration: Configuration | None = None,
+        event_manager: EventManager | None = None,
+        storage_client: StorageClient | None = None,
+    ) -> None:
+        self._configuration = configuration
+        self._event_manager = event_manager
+        self._storage_client = storage_client

     def get_configuration(self) -> Configuration:
         """Get the configuration."""
         if self._configuration is None:
+            logger.debug('No configuration set, implicitly creating and using default Configuration.')
             self._configuration = Configuration()

-        self._configuration_was_retrieved = True
         return self._configuration

     def set_configuration(self, configuration: Configuration) -> None:
@@ -47,7 +52,10 @@ class ServiceLocator:
         Raises:
             ServiceConflictError: If the configuration has already been retrieved before.
         """
-        if self._configuration_was_retrieved:
+        if self._configuration is configuration:
+            # Same instance, no need to anything
+            return
+        if self._configuration:
             raise ServiceConflictError(Configuration, configuration, self._configuration)

         self._configuration = configuration
@@ -55,13 +63,14 @@ class ServiceLocator:
     def get_event_manager(self) -> EventManager:
         """Get the event manager."""
         if self._event_manager is None:
-
-
-
-
-
+            logger.debug('No event manager set, implicitly creating and using default LocalEventManager.')
+            if self._configuration is None:
+                logger.debug(
+                    'Implicit creation of event manager will implicitly set configuration as side effect. '
+                    'It is advised to explicitly first set the configuration instead.'
+                )
+            self._event_manager = LocalEventManager().from_config(config=self._configuration)

-        self._event_manager_was_retrieved = True
         return self._event_manager

     def set_event_manager(self, event_manager: EventManager) -> None:
@@ -73,7 +82,10 @@ class ServiceLocator:
         Raises:
             ServiceConflictError: If the event manager has already been retrieved before.
         """
-        if self._event_manager_was_retrieved:
+        if self._event_manager is event_manager:
+            # Same instance, no need to anything
+            return
+        if self._event_manager:
             raise ServiceConflictError(EventManager, event_manager, self._event_manager)

         self._event_manager = event_manager
@@ -81,9 +93,14 @@ class ServiceLocator:
     def get_storage_client(self) -> StorageClient:
         """Get the storage client."""
         if self._storage_client is None:
+            logger.debug('No storage client set, implicitly creating and using default FileSystemStorageClient.')
+            if self._configuration is None:
+                logger.warning(
+                    'Implicit creation of storage client will implicitly set configuration as side effect. '
+                    'It is advised to explicitly first set the configuration instead.'
+                )
             self._storage_client = FileSystemStorageClient()

-        self._storage_client_was_retrieved = True
         return self._storage_client

     def set_storage_client(self, storage_client: StorageClient) -> None:
@@ -95,21 +112,24 @@ class ServiceLocator:
         Raises:
             ServiceConflictError: If the storage client has already been retrieved before.
         """
-        if self._storage_client_was_retrieved:
+        if self._storage_client is storage_client:
+            # Same instance, no need to anything
+            return
+        if self._storage_client:
            raise ServiceConflictError(StorageClient, storage_client, self._storage_client)

         self._storage_client = storage_client

     @property
     def storage_instance_manager(self) -> StorageInstanceManager:
-        """Get the storage instance manager."""
-        if self._storage_instance_manager is None:
+        """Get the storage instance manager. It is global manager shared by all instances of ServiceLocator."""
+        if ServiceLocator.global_storage_instance_manager is None:
             # Import here to avoid circular imports.
             from crawlee.storages._storage_instance_manager import StorageInstanceManager  # noqa: PLC0415

-            self._storage_instance_manager = StorageInstanceManager()
+            ServiceLocator.global_storage_instance_manager = StorageInstanceManager()

-        return self._storage_instance_manager
+        return ServiceLocator.global_storage_instance_manager


 service_locator = ServiceLocator()
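The `_was_retrieved` flags are gone: services can now be injected through the constructor, re-setting the same instance is a no-op, and a conflict is only raised when a different instance is already set. A behavior sketch grounded in the diff above (import paths follow crawlee's public layout; treat them as assumptions):

from crawlee import service_locator
from crawlee.configuration import Configuration

config = Configuration()
service_locator.set_configuration(config)
service_locator.set_configuration(config)  # same instance: no-op instead of an error

assert service_locator.get_configuration() is config

# A different instance still conflicts once one is set.
try:
    service_locator.set_configuration(Configuration())
except Exception as exc:  # ServiceConflictError in crawlee
    print(type(exc).__name__)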
crawlee/_types.py
CHANGED
@@ -3,17 +3,7 @@ from __future__ import annotations
 import dataclasses
 from collections.abc import Callable, Iterator, Mapping
 from dataclasses import dataclass
-from typing import (
-    TYPE_CHECKING,
-    Annotated,
-    Any,
-    Literal,
-    Protocol,
-    TypedDict,
-    TypeVar,
-    cast,
-    overload,
-)
+from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload

 from pydantic import ConfigDict, Field, PlainValidator, RootModel

@@ -25,7 +15,7 @@ if TYPE_CHECKING:
     import re
     from collections.abc import Callable, Coroutine, Sequence

-    from typing_extensions import NotRequired, Required, Unpack
+    from typing_extensions import NotRequired, Required, Self, Unpack

     from crawlee import Glob, Request
     from crawlee._request import RequestOptions
@@ -69,13 +59,17 @@ def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
 class HttpHeaders(RootModel, Mapping[str, str]):
     """A dictionary-like object representing HTTP headers."""

-    model_config = ConfigDict(populate_by_name=True)
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

-    root: Annotated[
-        dict[str, str],
-        PlainValidator(lambda value: _normalize_headers(value)),
-        Field(default_factory=dict),
-    ]
+    # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
+    if TYPE_CHECKING:
+        root: dict[str, str] = {}
+    else:
+        root: Annotated[
+            dict[str, str],
+            PlainValidator(lambda value: _normalize_headers(value)),
+            Field(default_factory=dict),
+        ]

     def __getitem__(self, key: str) -> str:
         return self.root[key.lower()]
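The `if TYPE_CHECKING:` split exists only to appease mypy under pydantic 2.12: statically the field looks like a plain attribute with a default, while at runtime the `Annotated[..., Field(default_factory=...)]` form keeps validation intact. A minimal standalone sketch of the same pattern:

from typing import TYPE_CHECKING, Annotated

from pydantic import BaseModel, Field


class Headers(BaseModel):
    # mypy sees a plain attribute with a default; pydantic never executes this branch.
    if TYPE_CHECKING:
        root: dict[str, str] = {}
    # At runtime the default comes from Field(default_factory=dict) inside Annotated.
    else:
        root: Annotated[dict[str, str], Field(default_factory=dict)]


assert Headers().root == {}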
@@ -110,9 +104,9 @@ class ConcurrencySettings:
     def __init__(
         self,
         min_concurrency: int = 1,
-        max_concurrency: int = 200,
+        max_concurrency: int = 100,
         max_tasks_per_minute: float = float('inf'),
-        desired_concurrency: int | None = None,
+        desired_concurrency: int = 10,
     ) -> None:
         """Initialize a new instance.

@@ -125,21 +119,24 @@ class ConcurrencySettings:
             desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,
                 if there is a large enough supply of them. By default, it is `min_concurrency`.
         """
-        if desired_concurrency is not None and desired_concurrency < 1:
-            raise ValueError('desired_concurrency must be 1 or larger')
-
         if min_concurrency < 1:
             raise ValueError('min_concurrency must be 1 or larger')

         if max_concurrency < min_concurrency:
             raise ValueError('max_concurrency cannot be less than min_concurrency')

+        if desired_concurrency < min_concurrency:
+            raise ValueError('desired_concurrency cannot be less than min_concurrency')
+
+        if desired_concurrency > max_concurrency:
+            raise ValueError('desired_concurrency cannot be greater than max_concurrency')
+
         if max_tasks_per_minute <= 0:
             raise ValueError('max_tasks_per_minute must be positive')

         self.min_concurrency = min_concurrency
         self.max_concurrency = max_concurrency
-        self.desired_concurrency = desired_concurrency or min_concurrency
+        self.desired_concurrency = desired_concurrency
         self.max_tasks_per_minute = max_tasks_per_minute
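`desired_concurrency` changes from an optional value (falling back to `min_concurrency`) to a concrete default of 10, and it is now validated against both bounds. A behavior sketch, assuming `ConcurrencySettings` stays exported from the top-level package:

from crawlee import ConcurrencySettings

# Valid: min <= desired <= max.
settings = ConcurrencySettings(min_concurrency=2, desired_concurrency=5, max_concurrency=20)
assert settings.desired_concurrency == 5

# Now rejected: the default desired_concurrency (10) exceeds max_concurrency.
try:
    ConcurrencySettings(max_concurrency=4)
except ValueError as exc:
    print(exc)  # desired_concurrency cannot be greater than max_concurrency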
@@ -180,6 +177,17 @@ class AddRequestsKwargs(EnqueueLinksKwargs):
     requests: Sequence[str | Request]
     """Requests to be added to the `RequestManager`."""

+    rq_id: str | None
+    """ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided."""
+
+    rq_name: str | None
+    """Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
+    """
+
+    rq_alias: str | None
+    """Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
+    """
+

 class PushDataKwargs(TypedDict):
     """Keyword arguments for dataset's `push_data` method."""
@@ -189,6 +197,7 @@ class PushDataFunctionCall(PushDataKwargs):
     data: list[dict[str, Any]] | dict[str, Any]
     dataset_id: str | None
     dataset_name: str | None
+    dataset_alias: str | None


 class KeyValueStoreInterface(Protocol):
@@ -255,21 +264,30 @@ class RequestHandlerRunResult:
         self._key_value_store_getter = key_value_store_getter
         self.add_requests_calls = list[AddRequestsKwargs]()
         self.push_data_calls = list[PushDataFunctionCall]()
-        self.key_value_store_changes = dict[tuple[str | None, str | None], KeyValueStoreChangeRecords]()
+        self.key_value_store_changes = dict[tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords]()

     async def add_requests(
         self,
         requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> None:
         """Track a call to the `add_requests` context helper."""
-        self.add_requests_calls.append(AddRequestsKwargs(requests=requests, **kwargs))
+        specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
+        if specified_params > 1:
+            raise ValueError('Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.')
+        self.add_requests_calls.append(
+            AddRequestsKwargs(requests=requests, rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs)
+        )

     async def push_data(
         self,
         data: list[dict[str, Any]] | dict[str, Any],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
         **kwargs: Unpack[PushDataKwargs],
     ) -> None:
         """Track a call to the `push_data` context helper."""
@@ -278,6 +296,7 @@ class RequestHandlerRunResult:
                 data=data,
                 dataset_id=dataset_id,
                 dataset_name=dataset_name,
+                dataset_alias=dataset_alias,
                 **kwargs,
             )
         )
@@ -287,13 +306,14 @@ class RequestHandlerRunResult:
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> KeyValueStoreInterface:
-        if (id, name) not in self.key_value_store_changes:
-            self.key_value_store_changes[id, name] = KeyValueStoreChangeRecords(
-                await self._key_value_store_getter(id=id, name=name)
+        if (id, name, alias) not in self.key_value_store_changes:
+            self.key_value_store_changes[id, name, alias] = KeyValueStoreChangeRecords(
+                await self._key_value_store_getter(id=id, name=name, alias=alias)
             )

-        return self.key_value_store_changes[id, name]
+        return self.key_value_store_changes[id, name, alias]


 @docs_group('Functions')
@@ -307,12 +327,21 @@ class AddRequestsFunction(Protocol):
     def __call__(
         self,
         requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]:
         """Call dunder method.

         Args:
             requests: Requests to be added to the `RequestManager`.
+            rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
+                provided.
+            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
+            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
             **kwargs: Additional keyword arguments.
         """

@@ -340,12 +369,21 @@ class EnqueueLinksFunction(Protocol):
         label: str | None = None,
         user_data: dict[str, Any] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]: ...

     @overload
     def __call__(
-        self, *, requests: Sequence[str | Request] | None = None, **kwargs: Unpack[EnqueueLinksKwargs]
+        self,
+        *,
+        requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]: ...

     def __call__(
@@ -356,6 +394,9 @@ class EnqueueLinksFunction(Protocol):
         user_data: dict[str, Any] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
         requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]:
         """Call enqueue links function.
@@ -373,6 +414,12 @@ class EnqueueLinksFunction(Protocol):
                 - `'skip'` to exclude the request from being enqueued,
                 - `'unchanged'` to use the original request options without modification.
             requests: Requests to be added to the `RequestManager`.
+            rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
+                provided.
+            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
+            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
             **kwargs: Additional keyword arguments.
         """

@@ -424,12 +471,14 @@ class GetKeyValueStoreFunction(Protocol):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> Coroutine[None, None, KeyValueStore]:
         """Call dunder method.

         Args:
             id: The ID of the `KeyValueStore` to get.
-            name: The name of the `KeyValueStore` to get.
+            name: The name of the `KeyValueStore` to get (global scope, named storage).
+            alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).
         """

@@ -444,12 +493,14 @@ class GetKeyValueStoreFromRequestHandlerFunction(Protocol):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> Coroutine[None, None, KeyValueStoreInterface]:
         """Call dunder method.

         Args:
             id: The ID of the `KeyValueStore` to get.
-            name: The name of the `KeyValueStore` to get.
+            name: The name of the `KeyValueStore` to get (global scope, named storage).
+            alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).
         """

@@ -466,6 +517,7 @@ class PushDataFunction(Protocol):
         data: list[dict[str, Any]] | dict[str, Any],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
         **kwargs: Unpack[PushDataKwargs],
     ) -> Coroutine[None, None, None]:
         """Call dunder method.
@@ -473,7 +525,8 @@ class PushDataFunction(Protocol):
         Args:
             data: The data to push to the `Dataset`.
             dataset_id: The ID of the `Dataset` to push the data to.
-            dataset_name: The name of the `Dataset` to push the data to.
+            dataset_name: The name of the `Dataset` to push the data to (global scope, named storage).
+            dataset_alias: The alias of the `Dataset` to push the data to (run scope, unnamed storage).
             **kwargs: Additional keyword arguments.
         """
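All request-producing helpers gain the mutually exclusive `rq_id` / `rq_name` / `rq_alias` targeting options. A handler sketch using the new parameters (the crawler class, selector, and alias value are illustrative):

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

crawler = ParselCrawler()


@crawler.router.default_handler
async def handler(context: ParselCrawlingContext) -> None:
    # Send detail-page links to a separate queue addressed by alias;
    # passing more than one of rq_id / rq_name / rq_alias raises ValueError.
    await context.enqueue_links(selector='a.detail', rq_alias='details')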
@@ -590,6 +643,25 @@ class BasicCrawlingContext:
         """Return hash of the context. Each context is considered unique."""
         return id(self)

+    def create_modified_copy(
+        self,
+        push_data: PushDataFunction | None = None,
+        add_requests: AddRequestsFunction | None = None,
+        get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
+    ) -> Self:
+        """Create a modified copy of the crawling context with specified changes."""
+        original_fields = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
+        modified_fields = {
+            key: value
+            for key, value in {
+                'push_data': push_data,
+                'add_requests': add_requests,
+                'get_key_value_store': get_key_value_store,
+            }.items()
+            if value
+        }
+        return self.__class__(**{**original_fields, **modified_fields})
+

 class GetDataKwargs(TypedDict):
     """Keyword arguments for dataset's `get_data` method."""
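`create_modified_copy` is a copy-with-overrides helper: it snapshots every dataclass field, overlays only the overrides that were actually provided, and builds a new instance, leaving the original context untouched. The same pattern on a stand-in dataclass (not crawlee's context class):

from __future__ import annotations

import dataclasses
from dataclasses import dataclass


@dataclass(frozen=True)
class Ctx:
    url: str
    label: str | None = None

    def create_modified_copy(self, label: str | None = None) -> Ctx:
        original = {f.name: getattr(self, f.name) for f in dataclasses.fields(self)}
        overrides = {k: v for k, v in {'label': label}.items() if v}
        return self.__class__(**{**original, **overrides})


ctx = Ctx(url='https://example.com')
copy = ctx.create_modified_copy(label='detail')
assert copy.label == 'detail' and ctx.label is None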
crawlee/_utils/context.py
CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations

-import asyncio
+import inspect
 from collections.abc import Callable
 from functools import wraps
 from typing import Any, TypeVar
@@ -44,4 +44,4 @@ def ensure_context(method: T) -> T:

         return await method(self, *args, **kwargs)

-    return async_wrapper if asyncio.iscoroutinefunction(method) else sync_wrapper  # type: ignore[return-value]
+    return async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper  # type: ignore[return-value]
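The swap from `asyncio.iscoroutinefunction` to `inspect.iscoroutinefunction` is forward-compatibility work: the asyncio variant is deprecated as of Python 3.14, and the inspect variant is the documented replacement. A quick check of the behavior the decorator relies on:

import inspect


def sync_fn() -> None: ...


async def async_fn() -> None: ...


# The decorator picks its async wrapper only for real coroutine functions.
assert not inspect.iscoroutinefunction(sync_fn)
assert inspect.iscoroutinefunction(async_fn)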
crawlee/_utils/file.py
CHANGED
@@ -163,6 +163,13 @@ async def export_csv_to_stream(
     dst: TextIO,
     **kwargs: Unpack[ExportDataCsvKwargs],
 ) -> None:
+    # Set lineterminator to '\n' if not explicitly provided. This prevents double line endings on Windows.
+    # The csv.writer default is '\r\n', which when written to a file in text mode on Windows gets converted
+    # to '\r\r\n' due to newline translation. By using '\n', we let the platform handle the line ending
+    # conversion: '\n' stays as '\n' on Unix, and becomes '\r\n' on Windows.
+    if 'lineterminator' not in kwargs:
+        kwargs['lineterminator'] = '\n'
+
     writer = csv.writer(dst, **kwargs)  # type: ignore[arg-type]
     write_header = True
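The effect described in the comment is easy to reproduce with the standard library alone. With the csv module's default terminator a Windows text-mode stream translates the trailing newline a second time, producing a stray carriage return; emitting bare '\n' lets the stream apply exactly one translation:

import csv
import io

buffer = io.StringIO()
writer = csv.writer(buffer, lineterminator='\n')
writer.writerow(['url', 'status'])
writer.writerow(['https://example.com', 200])

# '\n' endings here; a Windows text-mode file would turn each into '\r\n' exactly once.
assert buffer.getvalue() == 'url,status\nhttps://example.com,200\n'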
crawlee/_utils/raise_if_too_many_kwargs.py
ADDED
@@ -0,0 +1,12 @@
+from typing import Any
+
+
+def raise_if_too_many_kwargs(max_kwargs: int = 1, **kwargs: Any) -> None:
+    """Raise ValueError if there are more non-None kwargs then max_kwargs."""
+    none_kwargs_names = [f'"{kwarg_name}"' for kwarg_name, value in kwargs.items() if value is not None]
+    if len(none_kwargs_names) > max_kwargs:
+        all_kwargs_names = [f'"{kwarg_name}"' for kwarg_name in kwargs]
+        raise ValueError(
+            f'Only one of {", ".join(all_kwargs_names)} can be specified, but following arguments were '
+            f'specified: {", ".join(none_kwargs_names)}.'
+        )