crawlee 1.0.0rc1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlee/_autoscaling/snapshotter.py +1 -1
- crawlee/_request.py +2 -1
- crawlee/_service_locator.py +44 -24
- crawlee/_types.py +76 -17
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/sitemap.py +3 -1
- crawlee/_utils/system.py +3 -3
- crawlee/browsers/_playwright_browser_controller.py +20 -14
- crawlee/configuration.py +1 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
- crawlee/crawlers/_abstract_http/_abstract_http_parser.py +1 -1
- crawlee/crawlers/_abstract_http/_http_crawling_context.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +6 -2
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py +1 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +2 -1
- crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py +1 -1
- crawlee/crawlers/_basic/_basic_crawler.py +107 -27
- crawlee/crawlers/_basic/_logging_utils.py +5 -1
- crawlee/crawlers/_playwright/_playwright_crawler.py +6 -1
- crawlee/events/_types.py +6 -6
- crawlee/fingerprint_suite/_fingerprint_generator.py +3 -0
- crawlee/fingerprint_suite/_types.py +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_request_list.py +1 -1
- crawlee/request_loaders/_request_loader.py +5 -1
- crawlee/request_loaders/_sitemap_request_loader.py +228 -48
- crawlee/sessions/_models.py +2 -2
- crawlee/statistics/_models.py +1 -1
- crawlee/storage_clients/__init__.py +12 -0
- crawlee/storage_clients/_base/_storage_client.py +13 -0
- crawlee/storage_clients/_file_system/_dataset_client.py +27 -25
- crawlee/storage_clients/_file_system/_key_value_store_client.py +27 -23
- crawlee/storage_clients/_file_system/_request_queue_client.py +84 -98
- crawlee/storage_clients/_file_system/_storage_client.py +16 -3
- crawlee/storage_clients/_file_system/_utils.py +0 -0
- crawlee/storage_clients/_memory/_dataset_client.py +14 -2
- crawlee/storage_clients/_memory/_key_value_store_client.py +14 -2
- crawlee/storage_clients/_memory/_request_queue_client.py +43 -12
- crawlee/storage_clients/_memory/_storage_client.py +6 -3
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +269 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +299 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +706 -0
- crawlee/storage_clients/_sql/_storage_client.py +282 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +9 -2
- crawlee/storages/_key_value_store.py +9 -2
- crawlee/storages/_request_queue.py +7 -2
- crawlee/storages/_storage_instance_manager.py +126 -72
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/METADATA +12 -5
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/RECORD +59 -49
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/WHEEL +0 -0
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/entry_points.txt +0 -0
- {crawlee-1.0.0rc1.dist-info → crawlee-1.0.1.dist-info}/licenses/LICENSE +0 -0
crawlee/_autoscaling/snapshotter.py
CHANGED

@@ -113,7 +113,7 @@ class Snapshotter:
         Args:
             config: The `Configuration` instance. Uses the global (default) one if not provided.
         """
-        config = service_locator.get_configuration()
+        config = config or service_locator.get_configuration()

         # Compute the maximum memory size based on the provided configuration. If `memory_mbytes` is provided,
         # it uses that value. Otherwise, it calculates the `max_memory_size` as a proportion of the system's
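The fix is the classic explicit-argument-wins fallback: before it, a `Configuration` passed by the caller was silently discarded in favor of the global one. A minimal sketch of the corrected behavior (the `resolve_config` helper is illustrative, not Crawlee code; only `service_locator` and `Configuration` are real APIs):

```python
from crawlee import service_locator
from crawlee.configuration import Configuration

def resolve_config(config: Configuration | None = None) -> Configuration:
    # The 1.0.1 pattern: an explicit Configuration wins; the global
    # one from the service locator is only a fallback.
    return config or service_locator.get_configuration()

explicit = Configuration(memory_mbytes=4096)
assert resolve_config(explicit) is explicit                      # explicit instance honored
assert resolve_config() is service_locator.get_configuration()  # fallback to the global one
```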
crawlee/_request.py
CHANGED

@@ -117,6 +117,7 @@ class UserData(BaseModel, MutableMapping[str, JsonSerializable]):
 user_data_adapter = TypeAdapter(UserData)


+@docs_group('Other')
 class RequestOptions(TypedDict):
     """Options that can be used to customize request creation.

@@ -163,7 +164,7 @@ class Request(BaseModel):
     ```
     """

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     unique_key: Annotated[str, Field(alias='uniqueKey')]
     """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
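With `validate_by_name` and `validate_by_alias` both enabled, `Request` validates from either the Python field name or the camelCase alias. A quick round-trip sketch using only public `Request` APIs:

```python
from crawlee import Request

request = Request.from_url('https://example.com')
dumped = request.model_dump(by_alias=True)   # camelCase keys such as 'uniqueKey'
restored = Request.model_validate(dumped)    # validate_by_alias accepts them back
assert restored.unique_key == request.unique_key
```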
crawlee/_service_locator.py
CHANGED

@@ -11,6 +11,10 @@ from crawlee.storage_clients import FileSystemStorageClient, StorageClient
 if TYPE_CHECKING:
     from crawlee.storages._storage_instance_manager import StorageInstanceManager

+from logging import getLogger
+
+logger = getLogger(__name__)
+

 @docs_group('Configuration')
 class ServiceLocator:
@@ -19,23 +23,24 @@ class ServiceLocator:
     All services are initialized to its default value lazily.
     """

-
-        self._configuration: Configuration | None = None
-        self._event_manager: EventManager | None = None
-        self._storage_client: StorageClient | None = None
-        self._storage_instance_manager: StorageInstanceManager | None = None
+    global_storage_instance_manager: StorageInstanceManager | None = None

-
-        self
-
-
+    def __init__(
+        self,
+        configuration: Configuration | None = None,
+        event_manager: EventManager | None = None,
+        storage_client: StorageClient | None = None,
+    ) -> None:
+        self._configuration = configuration
+        self._event_manager = event_manager
+        self._storage_client = storage_client

     def get_configuration(self) -> Configuration:
         """Get the configuration."""
         if self._configuration is None:
+            logger.warning('No configuration set, implicitly creating and using default Configuration.')
             self._configuration = Configuration()

-        self._configuration_was_retrieved = True
         return self._configuration

     def set_configuration(self, configuration: Configuration) -> None:
@@ -47,7 +52,10 @@ class ServiceLocator:
         Raises:
             ServiceConflictError: If the configuration has already been retrieved before.
         """
-        if self.
+        if self._configuration is configuration:
+            # Same instance, no need to anything
+            return
+        if self._configuration:
             raise ServiceConflictError(Configuration, configuration, self._configuration)

         self._configuration = configuration
@@ -55,13 +63,14 @@ class ServiceLocator:
     def get_event_manager(self) -> EventManager:
         """Get the event manager."""
         if self._event_manager is None:
-
-
-
-
-
+            logger.warning('No event manager set, implicitly creating and using default LocalEventManager.')
+            if self._configuration is None:
+                logger.warning(
+                    'Implicit creation of event manager will implicitly set configuration as side effect. '
+                    'It is advised to explicitly first set the configuration instead.'
+                )
+            self._event_manager = LocalEventManager().from_config(config=self._configuration)

-        self._event_manager_was_retrieved = True
         return self._event_manager

     def set_event_manager(self, event_manager: EventManager) -> None:
@@ -73,7 +82,10 @@ class ServiceLocator:
         Raises:
             ServiceConflictError: If the event manager has already been retrieved before.
         """
-        if self.
+        if self._event_manager is event_manager:
+            # Same instance, no need to anything
+            return
+        if self._event_manager:
             raise ServiceConflictError(EventManager, event_manager, self._event_manager)

         self._event_manager = event_manager
@@ -81,9 +93,14 @@ class ServiceLocator:
     def get_storage_client(self) -> StorageClient:
         """Get the storage client."""
         if self._storage_client is None:
+            logger.warning('No storage client set, implicitly creating and using default FileSystemStorageClient.')
+            if self._configuration is None:
+                logger.warning(
+                    'Implicit creation of storage client will implicitly set configuration as side effect. '
+                    'It is advised to explicitly first set the configuration instead.'
+                )
             self._storage_client = FileSystemStorageClient()

-        self._storage_client_was_retrieved = True
         return self._storage_client

     def set_storage_client(self, storage_client: StorageClient) -> None:
@@ -95,21 +112,24 @@ class ServiceLocator:
         Raises:
             ServiceConflictError: If the storage client has already been retrieved before.
         """
-        if self.
+        if self._storage_client is storage_client:
+            # Same instance, no need to anything
+            return
+        if self._storage_client:
             raise ServiceConflictError(StorageClient, storage_client, self._storage_client)

         self._storage_client = storage_client

     @property
     def storage_instance_manager(self) -> StorageInstanceManager:
-        """Get the storage instance manager."""
-        if
+        """Get the storage instance manager. It is global manager shared by all instances of ServiceLocator."""
+        if ServiceLocator.global_storage_instance_manager is None:
             # Import here to avoid circular imports.
             from crawlee.storages._storage_instance_manager import StorageInstanceManager  # noqa: PLC0415

-
+            ServiceLocator.global_storage_instance_manager = StorageInstanceManager()

-        return
+        return ServiceLocator.global_storage_instance_manager


 service_locator = ServiceLocator()
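Two behavioral changes are visible here: `set_*` is now a no-op when re-setting the identical instance (only a *different* instance still raises `ServiceConflictError`), and implicit fallbacks now log a warning instead of happening silently. A short sketch against the public API:

```python
from crawlee import service_locator
from crawlee.configuration import Configuration

config = Configuration()
service_locator.set_configuration(config)
# Re-setting the very same instance no longer raises ServiceConflictError:
service_locator.set_configuration(config)

# Reading an unset service logs e.g. 'No storage client set, implicitly
# creating and using default FileSystemStorageClient.' and falls back.
storage_client = service_locator.get_storage_client()
```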
crawlee/_types.py
CHANGED

@@ -69,7 +69,7 @@ def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
 class HttpHeaders(RootModel, Mapping[str, str]):
     """A dictionary-like object representing HTTP headers."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     root: Annotated[
         dict[str, str],
@@ -110,9 +110,9 @@ class ConcurrencySettings:
     def __init__(
         self,
         min_concurrency: int = 1,
-        max_concurrency: int =
+        max_concurrency: int = 100,
         max_tasks_per_minute: float = float('inf'),
-        desired_concurrency: int
+        desired_concurrency: int = 10,
     ) -> None:
         """Initialize a new instance.

@@ -125,21 +125,24 @@ class ConcurrencySettings:
             desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,
                 if there is a large enough supply of them. By default, it is `min_concurrency`.
         """
-        if desired_concurrency is not None and desired_concurrency < 1:
-            raise ValueError('desired_concurrency must be 1 or larger')
-
         if min_concurrency < 1:
             raise ValueError('min_concurrency must be 1 or larger')

         if max_concurrency < min_concurrency:
             raise ValueError('max_concurrency cannot be less than min_concurrency')

+        if desired_concurrency < min_concurrency:
+            raise ValueError('desired_concurrency cannot be less than min_concurrency')
+
+        if desired_concurrency > max_concurrency:
+            raise ValueError('desired_concurrency cannot be greater than max_concurrency')
+
         if max_tasks_per_minute <= 0:
             raise ValueError('max_tasks_per_minute must be positive')

         self.min_concurrency = min_concurrency
         self.max_concurrency = max_concurrency
-        self.desired_concurrency = desired_concurrency
+        self.desired_concurrency = desired_concurrency
         self.max_tasks_per_minute = max_tasks_per_minute

@@ -180,6 +183,17 @@ class AddRequestsKwargs(EnqueueLinksKwargs):
     requests: Sequence[str | Request]
     """Requests to be added to the `RequestManager`."""

+    rq_id: str | None
+    """ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided."""
+
+    rq_name: str | None
+    """Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
+    """
+
+    rq_alias: str | None
+    """Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
+    """
+

 class PushDataKwargs(TypedDict):
     """Keyword arguments for dataset's `push_data` method."""
@@ -189,6 +203,7 @@ class PushDataFunctionCall(PushDataKwargs):
     data: list[dict[str, Any]] | dict[str, Any]
     dataset_id: str | None
     dataset_name: str | None
+    dataset_alias: str | None


 class KeyValueStoreInterface(Protocol):
@@ -255,21 +270,30 @@ class RequestHandlerRunResult:
         self._key_value_store_getter = key_value_store_getter
         self.add_requests_calls = list[AddRequestsKwargs]()
         self.push_data_calls = list[PushDataFunctionCall]()
-        self.key_value_store_changes = dict[tuple[str | None, str | None], KeyValueStoreChangeRecords]()
+        self.key_value_store_changes = dict[tuple[str | None, str | None, str | None], KeyValueStoreChangeRecords]()

     async def add_requests(
         self,
         requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> None:
         """Track a call to the `add_requests` context helper."""
-
+        specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
+        if specified_params > 1:
+            raise ValueError('Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.')
+        self.add_requests_calls.append(
+            AddRequestsKwargs(requests=requests, rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs)
+        )

     async def push_data(
         self,
         data: list[dict[str, Any]] | dict[str, Any],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
         **kwargs: Unpack[PushDataKwargs],
     ) -> None:
         """Track a call to the `push_data` context helper."""
@@ -278,6 +302,7 @@ class RequestHandlerRunResult:
                 data=data,
                 dataset_id=dataset_id,
                 dataset_name=dataset_name,
+                dataset_alias=dataset_alias,
                 **kwargs,
             )
         )
@@ -287,13 +312,14 @@ class RequestHandlerRunResult:
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> KeyValueStoreInterface:
-        if (id, name) not in self.key_value_store_changes:
-            self.key_value_store_changes[id, name] = KeyValueStoreChangeRecords(
-                await self._key_value_store_getter(id=id, name=name)
+        if (id, name, alias) not in self.key_value_store_changes:
+            self.key_value_store_changes[id, name, alias] = KeyValueStoreChangeRecords(
+                await self._key_value_store_getter(id=id, name=name, alias=alias)
             )

-        return self.key_value_store_changes[id, name]
+        return self.key_value_store_changes[id, name, alias]


 @docs_group('Functions')
@@ -307,12 +333,21 @@ class AddRequestsFunction(Protocol):
     def __call__(
         self,
         requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]:
         """Call dunder method.

         Args:
             requests: Requests to be added to the `RequestManager`.
+            rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
+                provided.
+            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
+            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
             **kwargs: Additional keyword arguments.
         """

@@ -340,12 +375,21 @@ class EnqueueLinksFunction(Protocol):
         label: str | None = None,
         user_data: dict[str, Any] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]: ...

     @overload
     def __call__(
-        self,
+        self,
+        *,
+        requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]: ...

     def __call__(
@@ -356,6 +400,9 @@ class EnqueueLinksFunction(Protocol):
         user_data: dict[str, Any] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
         requests: Sequence[str | Request] | None = None,
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]:
         """Call enqueue links function.
@@ -373,6 +420,12 @@ class EnqueueLinksFunction(Protocol):
                 - `'skip'` to exclude the request from being enqueued,
                 - `'unchanged'` to use the original request options without modification.
             requests: Requests to be added to the `RequestManager`.
+            rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
+                provided.
+            rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
+            rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
+                can be provided.
             **kwargs: Additional keyword arguments.
         """

@@ -424,12 +477,14 @@ class GetKeyValueStoreFunction(Protocol):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> Coroutine[None, None, KeyValueStore]:
         """Call dunder method.

         Args:
             id: The ID of the `KeyValueStore` to get.
-            name: The name of the `KeyValueStore` to get.
+            name: The name of the `KeyValueStore` to get (global scope, named storage).
+            alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).
         """

@@ -444,12 +499,14 @@ class GetKeyValueStoreFromRequestHandlerFunction(Protocol):
         *,
         id: str | None = None,
         name: str | None = None,
+        alias: str | None = None,
     ) -> Coroutine[None, None, KeyValueStoreInterface]:
         """Call dunder method.

         Args:
             id: The ID of the `KeyValueStore` to get.
-            name: The name of the `KeyValueStore` to get.
+            name: The name of the `KeyValueStore` to get (global scope, named storage).
+            alias: The alias of the `KeyValueStore` to get (run scope, unnamed storage).
         """

@@ -466,6 +523,7 @@ class PushDataFunction(Protocol):
         data: list[dict[str, Any]] | dict[str, Any],
         dataset_id: str | None = None,
         dataset_name: str | None = None,
+        dataset_alias: str | None = None,
         **kwargs: Unpack[PushDataKwargs],
     ) -> Coroutine[None, None, None]:
         """Call dunder method.
@@ -473,7 +531,8 @@ class PushDataFunction(Protocol):
         Args:
             data: The data to push to the `Dataset`.
             dataset_id: The ID of the `Dataset` to push the data to.
-            dataset_name: The name of the `Dataset` to push the data to.
+            dataset_name: The name of the `Dataset` to push the data to (global scope, named storage).
+            dataset_alias: The alias of the `Dataset` to push the data to (run scope, unnamed storage).
             **kwargs: Additional keyword arguments.
         """
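Two user-facing changes stand out in this file. First, `ConcurrencySettings` now ships explicit defaults and range-checks `desired_concurrency` against the min/max band instead of only checking it against 1. A quick check:

```python
from crawlee import ConcurrencySettings

settings = ConcurrencySettings()
assert settings.max_concurrency == 100
assert settings.desired_concurrency == 10

# desired_concurrency must now lie within [min_concurrency, max_concurrency]:
try:
    ConcurrencySettings(min_concurrency=5, desired_concurrency=2)
except ValueError as exc:
    print(exc)  # desired_concurrency cannot be less than min_concurrency
```

Second, the new `rq_id`/`rq_name`/`rq_alias` trio lets context helpers target a `RequestQueue` other than the crawler's default. A hedged sketch of handler code (the queue names here are made up; passing more than one selector raises `ValueError`):

```python
async def handler(context) -> None:
    # Route discovered links into a run-scoped queue known by alias...
    await context.enqueue_links(rq_alias='details')
    # ...or add requests to a named, globally persisted queue.
    await context.add_requests(['https://example.com/next'], rq_name='secondary')
```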
crawlee/_utils/raise_if_too_many_kwargs.py
ADDED

@@ -0,0 +1,12 @@
+from typing import Any
+
+
+def raise_if_too_many_kwargs(max_kwargs: int = 1, **kwargs: Any) -> None:
+    """Raise ValueError if there are more non-None kwargs then max_kwargs."""
+    none_kwargs_names = [f'"{kwarg_name}"' for kwarg_name, value in kwargs.items() if value is not None]
+    if len(none_kwargs_names) > max_kwargs:
+        all_kwargs_names = [f'"{kwarg_name}"' for kwarg_name in kwargs]
+        raise ValueError(
+            f'Only one of {", ".join(all_kwargs_names)} can be specified, but following arguments were '
+            f'specified: {", ".join(none_kwargs_names)}.'
+        )
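This new helper backs the "only one of id/name/alias" checks used throughout the storage APIs above. Usage, straight from the function's own contract:

```python
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs

# At most one selector may be non-None; this passes silently:
raise_if_too_many_kwargs(id=None, name='my-store', alias=None)

# Two non-None selectors raise:
try:
    raise_if_too_many_kwargs(id='abc', name='my-store', alias=None)
except ValueError as exc:
    print(exc)  # Only one of "id", "name", "alias" can be specified, ...
```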
crawlee/_utils/sitemap.py
CHANGED

@@ -9,6 +9,7 @@ from datetime import datetime, timedelta
 from hashlib import sha256
 from logging import getLogger
 from typing import TYPE_CHECKING, Literal, TypedDict
+from xml.sax import SAXParseException
 from xml.sax.expatreader import ExpatParser
 from xml.sax.handler import ContentHandler

@@ -192,7 +193,8 @@ class _XmlSitemapParser:

     def close(self) -> None:
         """Clean up resources."""
-
+        with suppress(SAXParseException):
+            self._parser.close()


 def _get_parser(content_type: str = '', url: str | None = None) -> _XmlSitemapParser | _TxtSitemapParser:
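The underlying expat parser raises when a document is closed before any well-formed XML was fed, which is exactly what happens for empty or truncated sitemaps. A standalone reproduction of that behavior (not Crawlee code) showing the exception being suppressed:

```python
from contextlib import suppress
from xml.sax import SAXParseException, make_parser

parser = make_parser()
# Closing before a complete document was parsed raises
# SAXParseException ('no element found'); 1.0.1 suppresses it in close().
with suppress(SAXParseException):
    parser.close()
print('cleanup finished without propagating the parse error')
```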
crawlee/_utils/system.py
CHANGED

@@ -36,7 +36,7 @@ else:
 class CpuInfo(BaseModel):
     """Information about the CPU usage."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     used_ratio: Annotated[float, Field(alias='usedRatio')]
     """The ratio of CPU currently in use, represented as a float between 0 and 1."""
@@ -51,7 +51,7 @@ class CpuInfo(BaseModel):
 class MemoryUsageInfo(BaseModel):
     """Information about the memory usage."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     current_size: Annotated[
         ByteSize,
@@ -71,7 +71,7 @@ class MemoryUsageInfo(BaseModel):
 class MemoryInfo(MemoryUsageInfo):
     """Information about system memory."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     total_size: Annotated[
         ByteSize, PlainValidator(ByteSize.validate), PlainSerializer(lambda size: size.bytes), Field(alias='totalSize')
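All three models get the same `model_config` treatment. A self-contained stand-in (not the Crawlee classes themselves) showing what `validate_by_name=True, validate_by_alias=True` buys, assuming pydantic ≥ 2.11 where these keys exist:

```python
from typing import Annotated

from pydantic import BaseModel, ConfigDict, Field

class CpuInfoLike(BaseModel):
    # Mirrors the model_config used above; illustration only.
    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

    used_ratio: Annotated[float, Field(alias='usedRatio')]

assert CpuInfoLike(used_ratio=0.25).used_ratio == 0.25                     # by name
assert CpuInfoLike.model_validate({'usedRatio': 0.25}).used_ratio == 0.25  # by alias
```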
crawlee/browsers/_playwright_browser_controller.py
CHANGED

@@ -2,6 +2,7 @@

 from __future__ import annotations

+from asyncio import Lock
 from datetime import datetime, timedelta, timezone
 from typing import TYPE_CHECKING, Any, cast

@@ -77,6 +78,19 @@ class PlaywrightBrowserController(BrowserController):

         self._total_opened_pages = 0

+        self._context_creation_lock: Lock | None = None
+
+    async def _get_context_creation_lock(self) -> Lock:
+        """Get context checking and creation lock.
+
+        It should be done with lock to prevent multiple concurrent attempts to create context, which could lead to
+        memory leak as one of the two concurrently created contexts will become orphaned and not properly closed.
+        """
+        if self._context_creation_lock:
+            return self._context_creation_lock
+        self._context_creation_lock = Lock()
+        return self._context_creation_lock
+
     @property
     @override
     def pages(self) -> list[Page]:
@@ -137,12 +151,6 @@ class PlaywrightBrowserController(BrowserController):
         Raises:
             ValueError: If the browser has reached the maximum number of open pages.
         """
-        if not self._browser_context:
-            self._browser_context = await self._create_browser_context(
-                browser_new_context_options=browser_new_context_options,
-                proxy_info=proxy_info,
-            )
-
         if not self.has_free_capacity:
             raise ValueError('Cannot open more pages in this browser.')

@@ -154,11 +162,12 @@ class PlaywrightBrowserController(BrowserController):
             )
             page = await new_context.new_page()
         else:
-
-
-
-
-
+            async with await self._get_context_creation_lock():
+                if not self._browser_context:
+                    self._browser_context = await self._create_browser_context(
+                        browser_new_context_options=browser_new_context_options,
+                        proxy_info=proxy_info,
+                    )
             page = await self._browser_context.new_page()

         # Handle page close event
@@ -169,7 +178,6 @@ class PlaywrightBrowserController(BrowserController):
         self._last_page_opened_at = datetime.now(timezone.utc)

         self._total_opened_pages += 1
-
         return page

     @override
@@ -206,7 +214,6 @@ class PlaywrightBrowserController(BrowserController):
         `self._fingerprint_generator` is available.
         """
         browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
-
         if proxy_info:
             if browser_new_context_options.get('proxy'):
                 logger.warning("browser_new_context_options['proxy'] overriden by explicit `proxy_info` argument.")
@@ -244,5 +251,4 @@ class PlaywrightBrowserController(BrowserController):
         browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
             'extra_http_headers', extra_http_headers
         )
-
         return await self._browser.new_context(**browser_new_context_options)
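The hunk replaces an unguarded lazy initialization with double-checked creation under an `asyncio.Lock`, so two coroutines opening pages at once can no longer each create a browser context and leak one. A self-contained model of the pattern (plain objects stand in for Playwright contexts):

```python
import asyncio

class LazyContextHolder:
    def __init__(self) -> None:
        self._context: object | None = None
        self._lock = asyncio.Lock()

    async def get_context(self) -> object:
        # Check-and-create happens atomically with respect to other tasks,
        # mirroring _get_context_creation_lock() plus the guarded branch above.
        async with self._lock:
            if self._context is None:
                self._context = object()  # stand-in for browser.new_context()
        return self._context

async def main() -> None:
    holder = LazyContextHolder()
    contexts = await asyncio.gather(*(holder.get_context() for _ in range(16)))
    assert len({id(ctx) for ctx in contexts}) == 1  # exactly one context ever created

asyncio.run(main())
```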
crawlee/configuration.py
CHANGED

@@ -28,7 +28,7 @@ class Configuration(BaseSettings):
     Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
     """

-    model_config = SettingsConfigDict(
+    model_config = SettingsConfigDict(validate_by_name=True, validate_by_alias=True)

     internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
     """Timeout for the internal asynchronous operations."""
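Here the field aliases double as the `CRAWLEE_`-prefixed environment variable names, so both validation paths matter. A sketch, assuming pydantic's usual numeric-seconds coercion for `timedelta`:

```python
from crawlee.configuration import Configuration

# validate_by_alias keeps alias-keyed input (the env-var spelling) working,
# while validate_by_name keeps plain keyword construction working too.
via_alias = Configuration.model_validate({'crawlee_internal_timeout': 30})
via_name = Configuration(internal_timeout=via_alias.internal_timeout)
assert via_name.internal_timeout == via_alias.internal_timeout  # 0:00:30
```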
crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
CHANGED

@@ -34,7 +34,9 @@ TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=St

 @docs_group('Crawlers')
 class AbstractHttpCrawler(
-
+    BasicCrawler[TCrawlingContext, StatisticsState],
+    ABC,
+    Generic[TCrawlingContext, TParseResult, TSelectResult],
 ):
     """A web crawler for performing HTTP requests.
crawlee/crawlers/_abstract_http/_abstract_http_parser.py
CHANGED

@@ -16,7 +16,7 @@ if TYPE_CHECKING:


 @docs_group('HTTP parsers')
-class AbstractHttpParser(Generic[TParseResult, TSelectResult]
+class AbstractHttpParser(ABC, Generic[TParseResult, TSelectResult]):
     """Parser used for parsing HTTP response and inspecting parsed result to find links or detect blocking."""

     @abstractmethod
crawlee/crawlers/_abstract_http/_http_crawling_context.py
CHANGED

@@ -31,7 +31,7 @@ class HttpCrawlingContext(BasicCrawlingContext, HttpCrawlingResult):

 @dataclass(frozen=True)
 @docs_group('Crawling contexts')
-class ParsedHttpCrawlingContext(Generic[TParseResult]
+class ParsedHttpCrawlingContext(HttpCrawlingContext, Generic[TParseResult]):
     """The crawling context used by `AbstractHttpCrawler`.

     It provides access to key objects as well as utility functions for handling crawling tasks.
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py
CHANGED

@@ -12,7 +12,7 @@ from bs4 import BeautifulSoup, Tag
 from parsel import Selector
 from typing_extensions import Self, TypeVar, override

-from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
 from crawlee._utils.docs import docs_group
 from crawlee._utils.wait import wait_for
 from crawlee.crawlers import (
@@ -85,8 +85,8 @@ class _NonPersistentStatistics(Statistics):

 @docs_group('Crawlers')
 class AdaptivePlaywrightCrawler(
-    Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],
     BasicCrawler[AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightCrawlerStatisticState],
+    Generic[TStaticCrawlingContext, TStaticParseResult, TStaticSelectResult],
 ):
     """An adaptive web crawler capable of using both static HTTP request based crawling and browser based crawling.

@@ -158,6 +158,10 @@ class AdaptivePlaywrightCrawler(
         self.result_checker = result_checker or (lambda _: True)
         self.result_comparator = result_comparator or create_default_comparator(result_checker)

+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(statistics=statistics, **kwargs)

         # Sub crawlers related.
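In practice this means an `AdaptivePlaywrightCrawler` now starts at a conservative `desired_concurrency=1` when the caller passes nothing, while an explicit value is passed through untouched. A sketch (the parser choice is arbitrary):

```python
from crawlee import ConcurrencySettings
from crawlee.crawlers import AdaptivePlaywrightCrawler

# Without concurrency_settings, 1.0.1 injects ConcurrencySettings(desired_concurrency=1):
default_crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser()

# An explicit value overrides the injected default:
eager_crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
    concurrency_settings=ConcurrencySettings(desired_concurrency=4),
)
```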
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler_statistics.py
CHANGED

@@ -12,7 +12,7 @@ from crawlee.statistics import StatisticsState
 class AdaptivePlaywrightCrawlerStatisticState(StatisticsState):
     """Statistic data about a crawler run with additional information related to adaptive crawling."""

-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True, ser_json_inf_nan='constants')

     http_only_request_handler_runs: Annotated[int, Field(alias='http_only_request_handler_runs')] = 0
     """Number representing how many times static http based crawling was used."""
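The extra `ser_json_inf_nan='constants'` key matters because statistics values can legitimately be infinite (e.g. a rate computed over zero elapsed time). A standalone pydantic demonstration, not Crawlee code:

```python
from pydantic import BaseModel, ConfigDict

class StatsLike(BaseModel):
    model_config = ConfigDict(ser_json_inf_nan='constants')
    requests_per_minute: float

# 'constants' emits the JSON constants Infinity/NaN instead of null (the
# pydantic default), so the value survives a save/load cycle recognizably.
print(StatsLike(requests_per_minute=float('inf')).model_dump_json())
```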
crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py
CHANGED

@@ -31,7 +31,8 @@ class AdaptiveContextError(RuntimeError):

 @dataclass(frozen=True)
 @docs_group('Crawling contexts')
 class AdaptivePlaywrightCrawlingContext(
-
+    ParsedHttpCrawlingContext[TStaticParseResult],
+    Generic[TStaticParseResult, TStaticSelectResult],
 ):
     _static_parser: AbstractHttpParser[TStaticParseResult, TStaticSelectResult]
     """The crawling context used by `AdaptivePlaywrightCrawler`.
crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py
CHANGED

@@ -32,7 +32,7 @@ FeatureVector = tuple[float, float]


 class RenderingTypePredictorState(BaseModel):
-    model_config = ConfigDict(
+    model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

     model: Annotated[
         LogisticRegression,