crawlee 0.6.13b43__py3-none-any.whl → 1.1.1b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlee might be problematic. Click here for more details.
- crawlee/_request.py +32 -21
- crawlee/_service_locator.py +4 -4
- crawlee/_types.py +67 -24
- crawlee/_utils/raise_if_too_many_kwargs.py +12 -0
- crawlee/_utils/recoverable_state.py +32 -8
- crawlee/_utils/recurring_task.py +15 -0
- crawlee/_utils/robots.py +17 -5
- crawlee/_utils/sitemap.py +1 -1
- crawlee/_utils/urls.py +9 -2
- crawlee/browsers/_browser_pool.py +4 -1
- crawlee/browsers/_playwright_browser_controller.py +21 -15
- crawlee/browsers/_playwright_browser_plugin.py +17 -3
- crawlee/browsers/_types.py +1 -1
- crawlee/configuration.py +3 -1
- crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +3 -1
- crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +38 -14
- crawlee/crawlers/_basic/_basic_crawler.py +51 -14
- crawlee/crawlers/_playwright/_playwright_crawler.py +16 -4
- crawlee/events/_event_manager.py +3 -1
- crawlee/fingerprint_suite/_header_generator.py +2 -2
- crawlee/otel/crawler_instrumentor.py +3 -3
- crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml +2 -2
- crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt +3 -0
- crawlee/request_loaders/_sitemap_request_loader.py +22 -4
- crawlee/sessions/_session_pool.py +1 -1
- crawlee/statistics/_error_snapshotter.py +1 -1
- crawlee/statistics/_models.py +32 -1
- crawlee/statistics/_statistics.py +24 -33
- crawlee/storage_clients/__init__.py +16 -0
- crawlee/storage_clients/_base/_storage_client.py +5 -4
- crawlee/storage_clients/_file_system/_dataset_client.py +4 -5
- crawlee/storage_clients/_file_system/_key_value_store_client.py +4 -5
- crawlee/storage_clients/_file_system/_request_queue_client.py +28 -12
- crawlee/storage_clients/_file_system/_storage_client.py +2 -2
- crawlee/storage_clients/_memory/_dataset_client.py +4 -5
- crawlee/storage_clients/_memory/_key_value_store_client.py +4 -5
- crawlee/storage_clients/_memory/_request_queue_client.py +4 -5
- crawlee/storage_clients/_redis/__init__.py +6 -0
- crawlee/storage_clients/_redis/_client_mixin.py +295 -0
- crawlee/storage_clients/_redis/_dataset_client.py +325 -0
- crawlee/storage_clients/_redis/_key_value_store_client.py +264 -0
- crawlee/storage_clients/_redis/_request_queue_client.py +586 -0
- crawlee/storage_clients/_redis/_storage_client.py +146 -0
- crawlee/storage_clients/_redis/_utils.py +23 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
- crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
- crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
- crawlee/storage_clients/_redis/py.typed +0 -0
- crawlee/storage_clients/_sql/__init__.py +6 -0
- crawlee/storage_clients/_sql/_client_mixin.py +385 -0
- crawlee/storage_clients/_sql/_dataset_client.py +310 -0
- crawlee/storage_clients/_sql/_db_models.py +268 -0
- crawlee/storage_clients/_sql/_key_value_store_client.py +300 -0
- crawlee/storage_clients/_sql/_request_queue_client.py +720 -0
- crawlee/storage_clients/_sql/_storage_client.py +291 -0
- crawlee/storage_clients/_sql/py.typed +0 -0
- crawlee/storage_clients/models.py +10 -10
- crawlee/storages/_base.py +3 -1
- crawlee/storages/_dataset.py +5 -3
- crawlee/storages/_key_value_store.py +11 -6
- crawlee/storages/_request_queue.py +5 -3
- crawlee/storages/_storage_instance_manager.py +54 -68
- crawlee/storages/_utils.py +11 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/METADATA +16 -5
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/RECORD +69 -47
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/WHEEL +0 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/entry_points.txt +0 -0
- {crawlee-0.6.13b43.dist-info → crawlee-1.1.1b1.dist-info}/licenses/LICENSE +0 -0
crawlee/_request.py
CHANGED
|
@@ -185,9 +185,6 @@ class Request(BaseModel):
|
|
|
185
185
|
method: HttpMethod = 'GET'
|
|
186
186
|
"""HTTP request method."""
|
|
187
187
|
|
|
188
|
-
headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)] = HttpHeaders()
|
|
189
|
-
"""HTTP request headers."""
|
|
190
|
-
|
|
191
188
|
payload: Annotated[
|
|
192
189
|
HttpPayload | None,
|
|
193
190
|
BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
|
|
@@ -195,23 +192,37 @@ class Request(BaseModel):
|
|
|
195
192
|
] = None
|
|
196
193
|
"""HTTP request payload."""
|
|
197
194
|
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
195
|
+
# Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
|
|
196
|
+
if TYPE_CHECKING:
|
|
197
|
+
headers: HttpHeaders = HttpHeaders()
|
|
198
|
+
"""HTTP request headers."""
|
|
199
|
+
|
|
200
|
+
user_data: dict[str, JsonSerializable] = {}
|
|
201
|
+
"""Custom user data assigned to the request. Use this to save any request related data to the
|
|
202
|
+
request's scope, keeping them accessible on retries, failures etc.
|
|
203
|
+
"""
|
|
204
|
+
|
|
205
|
+
else:
|
|
206
|
+
headers: Annotated[HttpHeaders, Field(default_factory=HttpHeaders)]
|
|
207
|
+
"""HTTP request headers."""
|
|
208
|
+
|
|
209
|
+
user_data: Annotated[
|
|
210
|
+
dict[str, JsonSerializable], # Internally, the model contains `UserData`, this is just for convenience
|
|
211
|
+
Field(alias='userData', default_factory=lambda: UserData()),
|
|
212
|
+
PlainValidator(user_data_adapter.validate_python),
|
|
213
|
+
PlainSerializer(
|
|
214
|
+
lambda instance: user_data_adapter.dump_python(
|
|
215
|
+
instance,
|
|
216
|
+
by_alias=True,
|
|
217
|
+
exclude_none=True,
|
|
218
|
+
exclude_unset=True,
|
|
219
|
+
exclude_defaults=True,
|
|
220
|
+
)
|
|
221
|
+
),
|
|
222
|
+
]
|
|
223
|
+
"""Custom user data assigned to the request. Use this to save any request related data to the
|
|
224
|
+
request's scope, keeping them accessible on retries, failures etc.
|
|
225
|
+
"""
|
|
215
226
|
|
|
216
227
|
retry_count: Annotated[int, Field(alias='retryCount')] = 0
|
|
217
228
|
"""Number of times the request has been retried."""
|
|
@@ -288,7 +299,7 @@ class Request(BaseModel):
|
|
|
288
299
|
)
|
|
289
300
|
|
|
290
301
|
if always_enqueue:
|
|
291
|
-
unique_key = f'{
|
|
302
|
+
unique_key = f'{crypto_random_object_id()}|{unique_key}'
|
|
292
303
|
|
|
293
304
|
request = cls(
|
|
294
305
|
url=url,
|
crawlee/_service_locator.py
CHANGED
|
@@ -38,7 +38,7 @@ class ServiceLocator:
|
|
|
38
38
|
def get_configuration(self) -> Configuration:
|
|
39
39
|
"""Get the configuration."""
|
|
40
40
|
if self._configuration is None:
|
|
41
|
-
logger.
|
|
41
|
+
logger.debug('No configuration set, implicitly creating and using default Configuration.')
|
|
42
42
|
self._configuration = Configuration()
|
|
43
43
|
|
|
44
44
|
return self._configuration
|
|
@@ -63,9 +63,9 @@ class ServiceLocator:
|
|
|
63
63
|
def get_event_manager(self) -> EventManager:
|
|
64
64
|
"""Get the event manager."""
|
|
65
65
|
if self._event_manager is None:
|
|
66
|
-
logger.
|
|
66
|
+
logger.debug('No event manager set, implicitly creating and using default LocalEventManager.')
|
|
67
67
|
if self._configuration is None:
|
|
68
|
-
logger.
|
|
68
|
+
logger.debug(
|
|
69
69
|
'Implicit creation of event manager will implicitly set configuration as side effect. '
|
|
70
70
|
'It is advised to explicitly first set the configuration instead.'
|
|
71
71
|
)
|
|
@@ -93,7 +93,7 @@ class ServiceLocator:
|
|
|
93
93
|
def get_storage_client(self) -> StorageClient:
|
|
94
94
|
"""Get the storage client."""
|
|
95
95
|
if self._storage_client is None:
|
|
96
|
-
logger.
|
|
96
|
+
logger.debug('No storage client set, implicitly creating and using default FileSystemStorageClient.')
|
|
97
97
|
if self._configuration is None:
|
|
98
98
|
logger.warning(
|
|
99
99
|
'Implicit creation of storage client will implicitly set configuration as side effect. '
|
crawlee/_types.py
CHANGED
|
@@ -3,17 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
import dataclasses
|
|
4
4
|
from collections.abc import Callable, Iterator, Mapping
|
|
5
5
|
from dataclasses import dataclass
|
|
6
|
-
from typing import
|
|
7
|
-
TYPE_CHECKING,
|
|
8
|
-
Annotated,
|
|
9
|
-
Any,
|
|
10
|
-
Literal,
|
|
11
|
-
Protocol,
|
|
12
|
-
TypedDict,
|
|
13
|
-
TypeVar,
|
|
14
|
-
cast,
|
|
15
|
-
overload,
|
|
16
|
-
)
|
|
6
|
+
from typing import TYPE_CHECKING, Annotated, Any, Literal, Protocol, TypedDict, TypeVar, cast, overload
|
|
17
7
|
|
|
18
8
|
from pydantic import ConfigDict, Field, PlainValidator, RootModel
|
|
19
9
|
|
|
@@ -71,11 +61,15 @@ class HttpHeaders(RootModel, Mapping[str, str]):
|
|
|
71
61
|
|
|
72
62
|
model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)
|
|
73
63
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
64
|
+
# Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
|
|
65
|
+
if TYPE_CHECKING:
|
|
66
|
+
root: dict[str, str] = {}
|
|
67
|
+
else:
|
|
68
|
+
root: Annotated[
|
|
69
|
+
dict[str, str],
|
|
70
|
+
PlainValidator(lambda value: _normalize_headers(value)),
|
|
71
|
+
Field(default_factory=dict),
|
|
72
|
+
]
|
|
79
73
|
|
|
80
74
|
def __getitem__(self, key: str) -> str:
|
|
81
75
|
return self.root[key.lower()]
|
|
@@ -110,9 +104,9 @@ class ConcurrencySettings:
|
|
|
110
104
|
def __init__(
|
|
111
105
|
self,
|
|
112
106
|
min_concurrency: int = 1,
|
|
113
|
-
max_concurrency: int =
|
|
107
|
+
max_concurrency: int = 100,
|
|
114
108
|
max_tasks_per_minute: float = float('inf'),
|
|
115
|
-
desired_concurrency: int
|
|
109
|
+
desired_concurrency: int = 10,
|
|
116
110
|
) -> None:
|
|
117
111
|
"""Initialize a new instance.
|
|
118
112
|
|
|
@@ -125,21 +119,24 @@ class ConcurrencySettings:
|
|
|
125
119
|
desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,
|
|
126
120
|
if there is a large enough supply of them. By default, it is `min_concurrency`.
|
|
127
121
|
"""
|
|
128
|
-
if desired_concurrency is not None and desired_concurrency < 1:
|
|
129
|
-
raise ValueError('desired_concurrency must be 1 or larger')
|
|
130
|
-
|
|
131
122
|
if min_concurrency < 1:
|
|
132
123
|
raise ValueError('min_concurrency must be 1 or larger')
|
|
133
124
|
|
|
134
125
|
if max_concurrency < min_concurrency:
|
|
135
126
|
raise ValueError('max_concurrency cannot be less than min_concurrency')
|
|
136
127
|
|
|
128
|
+
if desired_concurrency < min_concurrency:
|
|
129
|
+
raise ValueError('desired_concurrency cannot be less than min_concurrency')
|
|
130
|
+
|
|
131
|
+
if desired_concurrency > max_concurrency:
|
|
132
|
+
raise ValueError('desired_concurrency cannot be greater than max_concurrency')
|
|
133
|
+
|
|
137
134
|
if max_tasks_per_minute <= 0:
|
|
138
135
|
raise ValueError('max_tasks_per_minute must be positive')
|
|
139
136
|
|
|
140
137
|
self.min_concurrency = min_concurrency
|
|
141
138
|
self.max_concurrency = max_concurrency
|
|
142
|
-
self.desired_concurrency = desired_concurrency
|
|
139
|
+
self.desired_concurrency = desired_concurrency
|
|
143
140
|
self.max_tasks_per_minute = max_tasks_per_minute
|
|
144
141
|
|
|
145
142
|
|
|
@@ -180,6 +177,17 @@ class AddRequestsKwargs(EnqueueLinksKwargs):
|
|
|
180
177
|
requests: Sequence[str | Request]
|
|
181
178
|
"""Requests to be added to the `RequestManager`."""
|
|
182
179
|
|
|
180
|
+
rq_id: str | None
|
|
181
|
+
"""ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided."""
|
|
182
|
+
|
|
183
|
+
rq_name: str | None
|
|
184
|
+
"""Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
|
|
185
|
+
"""
|
|
186
|
+
|
|
187
|
+
rq_alias: str | None
|
|
188
|
+
"""Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.
|
|
189
|
+
"""
|
|
190
|
+
|
|
183
191
|
|
|
184
192
|
class PushDataKwargs(TypedDict):
|
|
185
193
|
"""Keyword arguments for dataset's `push_data` method."""
|
|
@@ -261,10 +269,18 @@ class RequestHandlerRunResult:
|
|
|
261
269
|
async def add_requests(
|
|
262
270
|
self,
|
|
263
271
|
requests: Sequence[str | Request],
|
|
272
|
+
rq_id: str | None = None,
|
|
273
|
+
rq_name: str | None = None,
|
|
274
|
+
rq_alias: str | None = None,
|
|
264
275
|
**kwargs: Unpack[EnqueueLinksKwargs],
|
|
265
276
|
) -> None:
|
|
266
277
|
"""Track a call to the `add_requests` context helper."""
|
|
267
|
-
|
|
278
|
+
specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
|
|
279
|
+
if specified_params > 1:
|
|
280
|
+
raise ValueError('Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.')
|
|
281
|
+
self.add_requests_calls.append(
|
|
282
|
+
AddRequestsKwargs(requests=requests, rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs)
|
|
283
|
+
)
|
|
268
284
|
|
|
269
285
|
async def push_data(
|
|
270
286
|
self,
|
|
@@ -311,12 +327,21 @@ class AddRequestsFunction(Protocol):
|
|
|
311
327
|
def __call__(
|
|
312
328
|
self,
|
|
313
329
|
requests: Sequence[str | Request],
|
|
330
|
+
rq_id: str | None = None,
|
|
331
|
+
rq_name: str | None = None,
|
|
332
|
+
rq_alias: str | None = None,
|
|
314
333
|
**kwargs: Unpack[EnqueueLinksKwargs],
|
|
315
334
|
) -> Coroutine[None, None, None]:
|
|
316
335
|
"""Call dunder method.
|
|
317
336
|
|
|
318
337
|
Args:
|
|
319
338
|
requests: Requests to be added to the `RequestManager`.
|
|
339
|
+
rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
|
|
340
|
+
provided.
|
|
341
|
+
rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
|
|
342
|
+
can be provided.
|
|
343
|
+
rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
|
|
344
|
+
can be provided.
|
|
320
345
|
**kwargs: Additional keyword arguments.
|
|
321
346
|
"""
|
|
322
347
|
|
|
@@ -344,12 +369,21 @@ class EnqueueLinksFunction(Protocol):
|
|
|
344
369
|
label: str | None = None,
|
|
345
370
|
user_data: dict[str, Any] | None = None,
|
|
346
371
|
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
|
|
372
|
+
rq_id: str | None = None,
|
|
373
|
+
rq_name: str | None = None,
|
|
374
|
+
rq_alias: str | None = None,
|
|
347
375
|
**kwargs: Unpack[EnqueueLinksKwargs],
|
|
348
376
|
) -> Coroutine[None, None, None]: ...
|
|
349
377
|
|
|
350
378
|
@overload
|
|
351
379
|
def __call__(
|
|
352
|
-
self,
|
|
380
|
+
self,
|
|
381
|
+
*,
|
|
382
|
+
requests: Sequence[str | Request] | None = None,
|
|
383
|
+
rq_id: str | None = None,
|
|
384
|
+
rq_name: str | None = None,
|
|
385
|
+
rq_alias: str | None = None,
|
|
386
|
+
**kwargs: Unpack[EnqueueLinksKwargs],
|
|
353
387
|
) -> Coroutine[None, None, None]: ...
|
|
354
388
|
|
|
355
389
|
def __call__(
|
|
@@ -360,6 +394,9 @@ class EnqueueLinksFunction(Protocol):
|
|
|
360
394
|
user_data: dict[str, Any] | None = None,
|
|
361
395
|
transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
|
|
362
396
|
requests: Sequence[str | Request] | None = None,
|
|
397
|
+
rq_id: str | None = None,
|
|
398
|
+
rq_name: str | None = None,
|
|
399
|
+
rq_alias: str | None = None,
|
|
363
400
|
**kwargs: Unpack[EnqueueLinksKwargs],
|
|
364
401
|
) -> Coroutine[None, None, None]:
|
|
365
402
|
"""Call enqueue links function.
|
|
@@ -377,6 +414,12 @@ class EnqueueLinksFunction(Protocol):
|
|
|
377
414
|
- `'skip'` to exclude the request from being enqueued,
|
|
378
415
|
- `'unchanged'` to use the original request options without modification.
|
|
379
416
|
requests: Requests to be added to the `RequestManager`.
|
|
417
|
+
rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be
|
|
418
|
+
provided.
|
|
419
|
+
rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
|
|
420
|
+
can be provided.
|
|
421
|
+
rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`
|
|
422
|
+
can be provided.
|
|
380
423
|
**kwargs: Additional keyword arguments.
|
|
381
424
|
"""
|
|
382
425
|
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def raise_if_too_many_kwargs(max_kwargs: int = 1, **kwargs: Any) -> None:
|
|
5
|
+
"""Raise ValueError if there are more non-None kwargs than max_kwargs."""
|
|
6
|
+
none_kwargs_names = [f'"{kwarg_name}"' for kwarg_name, value in kwargs.items() if value is not None]
|
|
7
|
+
if len(none_kwargs_names) > max_kwargs:
|
|
8
|
+
all_kwargs_names = [f'"{kwarg_name}"' for kwarg_name in kwargs]
|
|
9
|
+
raise ValueError(
|
|
10
|
+
f'Only one of {", ".join(all_kwargs_names)} can be specified, but following arguments were '
|
|
11
|
+
f'specified: {", ".join(none_kwargs_names)}.'
|
|
12
|
+
)
|
|
@@ -4,12 +4,14 @@ from typing import TYPE_CHECKING, Generic, Literal, TypeVar
|
|
|
4
4
|
|
|
5
5
|
from pydantic import BaseModel
|
|
6
6
|
|
|
7
|
+
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
|
|
7
8
|
from crawlee.events._types import Event, EventPersistStateData
|
|
8
9
|
|
|
9
10
|
if TYPE_CHECKING:
|
|
10
11
|
import logging
|
|
12
|
+
from collections.abc import Callable, Coroutine
|
|
11
13
|
|
|
12
|
-
from crawlee.storages
|
|
14
|
+
from crawlee.storages import KeyValueStore
|
|
13
15
|
|
|
14
16
|
TStateModel = TypeVar('TStateModel', bound=BaseModel)
|
|
15
17
|
|
|
@@ -37,6 +39,7 @@ class RecoverableState(Generic[TStateModel]):
|
|
|
37
39
|
persistence_enabled: Literal[True, False, 'explicit_only'] = False,
|
|
38
40
|
persist_state_kvs_name: str | None = None,
|
|
39
41
|
persist_state_kvs_id: str | None = None,
|
|
42
|
+
persist_state_kvs_factory: Callable[[], Coroutine[None, None, KeyValueStore]] | None = None,
|
|
40
43
|
logger: logging.Logger,
|
|
41
44
|
) -> None:
|
|
42
45
|
"""Initialize a new recoverable state object.
|
|
@@ -51,16 +54,40 @@ class RecoverableState(Generic[TStateModel]):
|
|
|
51
54
|
If neither a name nor an id is supplied, the default store will be used.
|
|
52
55
|
persist_state_kvs_id: The identifier of the KeyValueStore to use for persistence.
|
|
53
56
|
If neither a name nor an id is supplied, the default store will be used.
|
|
57
|
+
persist_state_kvs_factory: Factory that can be awaited to create KeyValueStore to use for persistence. If
|
|
58
|
+
not provided, a system-wide KeyValueStore will be used, based on service locator configuration.
|
|
54
59
|
logger: A logger instance for logging operations related to state persistence
|
|
55
60
|
"""
|
|
61
|
+
raise_if_too_many_kwargs(
|
|
62
|
+
persist_state_kvs_name=persist_state_kvs_name,
|
|
63
|
+
persist_state_kvs_id=persist_state_kvs_id,
|
|
64
|
+
persist_state_kvs_factory=persist_state_kvs_factory,
|
|
65
|
+
)
|
|
66
|
+
if not persist_state_kvs_factory:
|
|
67
|
+
logger.debug(
|
|
68
|
+
'No explicit key_value_store set for recoverable state. Recovery will use a system-wide KeyValueStore '
|
|
69
|
+
'based on service_locator configuration, potentially calling service_locator.set_storage_client in the '
|
|
70
|
+
'process. It is recommended to initialize RecoverableState with explicit key_value_store to avoid '
|
|
71
|
+
'global side effects.'
|
|
72
|
+
)
|
|
73
|
+
|
|
56
74
|
self._default_state = default_state
|
|
57
75
|
self._state_type: type[TStateModel] = self._default_state.__class__
|
|
58
76
|
self._state: TStateModel | None = None
|
|
59
77
|
self._persistence_enabled = persistence_enabled
|
|
60
78
|
self._persist_state_key = persist_state_key
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
79
|
+
if persist_state_kvs_factory is None:
|
|
80
|
+
|
|
81
|
+
async def kvs_factory() -> KeyValueStore:
|
|
82
|
+
from crawlee.storages import KeyValueStore # noqa: PLC0415 avoid circular import
|
|
83
|
+
|
|
84
|
+
return await KeyValueStore.open(name=persist_state_kvs_name, id=persist_state_kvs_id)
|
|
85
|
+
|
|
86
|
+
self._persist_state_kvs_factory = kvs_factory
|
|
87
|
+
else:
|
|
88
|
+
self._persist_state_kvs_factory = persist_state_kvs_factory
|
|
89
|
+
|
|
90
|
+
self._key_value_store: KeyValueStore | None = None
|
|
64
91
|
self._log = logger
|
|
65
92
|
|
|
66
93
|
async def initialize(self) -> TStateModel:
|
|
@@ -77,11 +104,8 @@ class RecoverableState(Generic[TStateModel]):
|
|
|
77
104
|
return self.current_value
|
|
78
105
|
|
|
79
106
|
# Import here to avoid circular imports.
|
|
80
|
-
from crawlee.storages._key_value_store import KeyValueStore # noqa: PLC0415
|
|
81
107
|
|
|
82
|
-
self._key_value_store = await
|
|
83
|
-
name=self._persist_state_kvs_name, id=self._persist_state_kvs_id
|
|
84
|
-
)
|
|
108
|
+
self._key_value_store = await self._persist_state_kvs_factory()
|
|
85
109
|
|
|
86
110
|
await self._load_saved_state()
|
|
87
111
|
|
crawlee/_utils/recurring_task.py
CHANGED
|
@@ -7,6 +7,9 @@ from typing import TYPE_CHECKING
|
|
|
7
7
|
if TYPE_CHECKING:
|
|
8
8
|
from collections.abc import Callable
|
|
9
9
|
from datetime import timedelta
|
|
10
|
+
from types import TracebackType
|
|
11
|
+
|
|
12
|
+
from typing_extensions import Self
|
|
10
13
|
|
|
11
14
|
logger = getLogger(__name__)
|
|
12
15
|
|
|
@@ -26,6 +29,18 @@ class RecurringTask:
|
|
|
26
29
|
self.delay = delay
|
|
27
30
|
self.task: asyncio.Task | None = None
|
|
28
31
|
|
|
32
|
+
async def __aenter__(self) -> Self:
|
|
33
|
+
self.start()
|
|
34
|
+
return self
|
|
35
|
+
|
|
36
|
+
async def __aexit__(
|
|
37
|
+
self,
|
|
38
|
+
exc_type: type[BaseException] | None,
|
|
39
|
+
exc_value: BaseException | None,
|
|
40
|
+
exc_traceback: TracebackType | None,
|
|
41
|
+
) -> None:
|
|
42
|
+
await self.stop()
|
|
43
|
+
|
|
29
44
|
async def _wrapper(self) -> None:
|
|
30
45
|
"""Continuously execute the provided function with the specified delay.
|
|
31
46
|
|
crawlee/_utils/robots.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from logging import getLogger
|
|
3
4
|
from typing import TYPE_CHECKING
|
|
4
5
|
|
|
5
6
|
from protego import Protego
|
|
@@ -15,6 +16,9 @@ if TYPE_CHECKING:
|
|
|
15
16
|
from crawlee.proxy_configuration import ProxyInfo
|
|
16
17
|
|
|
17
18
|
|
|
19
|
+
logger = getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
18
22
|
class RobotsTxtFile:
|
|
19
23
|
def __init__(
|
|
20
24
|
self, url: str, robots: Protego, http_client: HttpClient | None = None, proxy_info: ProxyInfo | None = None
|
|
@@ -56,12 +60,20 @@ class RobotsTxtFile:
|
|
|
56
60
|
http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
|
|
57
61
|
proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
|
|
58
62
|
"""
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
+
try:
|
|
64
|
+
response = await http_client.send_request(url, proxy_info=proxy_info)
|
|
65
|
+
|
|
66
|
+
body = (
|
|
67
|
+
b'User-agent: *\nAllow: /'
|
|
68
|
+
if is_status_code_client_error(response.status_code)
|
|
69
|
+
else await response.read()
|
|
70
|
+
)
|
|
71
|
+
robots = Protego.parse(body.decode('utf-8'))
|
|
72
|
+
|
|
73
|
+
except Exception as e:
|
|
74
|
+
logger.warning(f'Failed to fetch from robots.txt from "{url}" with error: "{e}"')
|
|
63
75
|
|
|
64
|
-
|
|
76
|
+
robots = Protego.parse('User-agent: *\nAllow: /')
|
|
65
77
|
|
|
66
78
|
return cls(url, robots, http_client=http_client, proxy_info=proxy_info)
|
|
67
79
|
|
crawlee/_utils/sitemap.py
CHANGED
|
@@ -335,7 +335,7 @@ async def _fetch_and_process_sitemap(
|
|
|
335
335
|
# Check if the first chunk is a valid gzip header
|
|
336
336
|
if first_chunk and raw_chunk.startswith(b'\x1f\x8b'):
|
|
337
337
|
decompressor = zlib.decompressobj(zlib.MAX_WBITS | 16)
|
|
338
|
-
|
|
338
|
+
first_chunk = False
|
|
339
339
|
|
|
340
340
|
chunk = decompressor.decompress(raw_chunk) if decompressor else raw_chunk
|
|
341
341
|
text_chunk = decoder.decode(chunk)
|
crawlee/_utils/urls.py
CHANGED
|
@@ -7,6 +7,7 @@ from yarl import URL
|
|
|
7
7
|
|
|
8
8
|
if TYPE_CHECKING:
|
|
9
9
|
from collections.abc import Iterator
|
|
10
|
+
from logging import Logger
|
|
10
11
|
|
|
11
12
|
|
|
12
13
|
def is_url_absolute(url: str) -> bool:
|
|
@@ -22,13 +23,19 @@ def convert_to_absolute_url(base_url: str, relative_url: str) -> str:
|
|
|
22
23
|
return str(URL(base_url).join(URL(relative_url)))
|
|
23
24
|
|
|
24
25
|
|
|
25
|
-
def to_absolute_url_iterator(base_url: str, urls: Iterator[str]) -> Iterator[str]:
|
|
26
|
+
def to_absolute_url_iterator(base_url: str, urls: Iterator[str], logger: Logger | None = None) -> Iterator[str]:
|
|
26
27
|
"""Convert an iterator of relative URLs to absolute URLs using a base URL."""
|
|
27
28
|
for url in urls:
|
|
28
29
|
if is_url_absolute(url):
|
|
29
30
|
yield url
|
|
30
31
|
else:
|
|
31
|
-
|
|
32
|
+
converted_url = convert_to_absolute_url(base_url, url)
|
|
33
|
+
# Skip the URL if conversion fails, probably due to an incorrect format, such as 'mailto:'.
|
|
34
|
+
if not is_url_absolute(converted_url):
|
|
35
|
+
if logger:
|
|
36
|
+
logger.debug(f'Could not convert URL "{url}" to absolute using base URL "{base_url}". Skipping it.')
|
|
37
|
+
continue
|
|
38
|
+
yield converted_url
|
|
32
39
|
|
|
33
40
|
|
|
34
41
|
_http_url_adapter = TypeAdapter(AnyHttpUrl)
|
|
@@ -118,7 +118,10 @@ class BrowserPool:
|
|
|
118
118
|
"""Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.
|
|
119
119
|
|
|
120
120
|
Args:
|
|
121
|
-
browser_type: The type of browser to launch
|
|
121
|
+
browser_type: The type of browser to launch:
|
|
122
|
+
- 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
|
|
123
|
+
- 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
|
|
124
|
+
the system.
|
|
122
125
|
user_data_dir: Path to a user data directory, which stores browser session data like cookies
|
|
123
126
|
and local storage.
|
|
124
127
|
browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
from asyncio import Lock
|
|
5
6
|
from datetime import datetime, timedelta, timezone
|
|
6
7
|
from typing import TYPE_CHECKING, Any, cast
|
|
7
8
|
|
|
@@ -77,6 +78,19 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
77
78
|
|
|
78
79
|
self._total_opened_pages = 0
|
|
79
80
|
|
|
81
|
+
self._context_creation_lock: Lock | None = None
|
|
82
|
+
|
|
83
|
+
async def _get_context_creation_lock(self) -> Lock:
|
|
84
|
+
"""Get context checking and creation lock.
|
|
85
|
+
|
|
86
|
+
It should be done with lock to prevent multiple concurrent attempts to create context, which could lead to
|
|
87
|
+
memory leak as one of the two concurrently created contexts will become orphaned and not properly closed.
|
|
88
|
+
"""
|
|
89
|
+
if self._context_creation_lock:
|
|
90
|
+
return self._context_creation_lock
|
|
91
|
+
self._context_creation_lock = Lock()
|
|
92
|
+
return self._context_creation_lock
|
|
93
|
+
|
|
80
94
|
@property
|
|
81
95
|
@override
|
|
82
96
|
def pages(self) -> list[Page]:
|
|
@@ -137,12 +151,6 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
137
151
|
Raises:
|
|
138
152
|
ValueError: If the browser has reached the maximum number of open pages.
|
|
139
153
|
"""
|
|
140
|
-
if not self._browser_context:
|
|
141
|
-
self._browser_context = await self._create_browser_context(
|
|
142
|
-
browser_new_context_options=browser_new_context_options,
|
|
143
|
-
proxy_info=proxy_info,
|
|
144
|
-
)
|
|
145
|
-
|
|
146
154
|
if not self.has_free_capacity:
|
|
147
155
|
raise ValueError('Cannot open more pages in this browser.')
|
|
148
156
|
|
|
@@ -154,11 +162,12 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
154
162
|
)
|
|
155
163
|
page = await new_context.new_page()
|
|
156
164
|
else:
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
165
|
+
async with await self._get_context_creation_lock():
|
|
166
|
+
if not self._browser_context:
|
|
167
|
+
self._browser_context = await self._create_browser_context(
|
|
168
|
+
browser_new_context_options=browser_new_context_options,
|
|
169
|
+
proxy_info=proxy_info,
|
|
170
|
+
)
|
|
162
171
|
page = await self._browser_context.new_page()
|
|
163
172
|
|
|
164
173
|
# Handle page close event
|
|
@@ -169,7 +178,6 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
169
178
|
self._last_page_opened_at = datetime.now(timezone.utc)
|
|
170
179
|
|
|
171
180
|
self._total_opened_pages += 1
|
|
172
|
-
|
|
173
181
|
return page
|
|
174
182
|
|
|
175
183
|
@override
|
|
@@ -206,10 +214,9 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
206
214
|
`self._fingerprint_generator` is available.
|
|
207
215
|
"""
|
|
208
216
|
browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
|
|
209
|
-
|
|
210
217
|
if proxy_info:
|
|
211
218
|
if browser_new_context_options.get('proxy'):
|
|
212
|
-
logger.warning("browser_new_context_options['proxy']
|
|
219
|
+
logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")
|
|
213
220
|
|
|
214
221
|
browser_new_context_options['proxy'] = ProxySettings(
|
|
215
222
|
server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
|
|
@@ -244,5 +251,4 @@ class PlaywrightBrowserController(BrowserController):
|
|
|
244
251
|
browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
|
|
245
252
|
'extra_http_headers', extra_http_headers
|
|
246
253
|
)
|
|
247
|
-
|
|
248
254
|
return await self._browser.new_context(**browser_new_context_options)
|
|
@@ -34,8 +34,8 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
|
|
|
34
34
|
|
|
35
35
|
It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory
|
|
36
36
|
for creating new browser instances and provides a unified interface for interacting with different browser types
|
|
37
|
-
(chromium, firefox, and
|
|
38
|
-
executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
|
|
37
|
+
(chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless
|
|
38
|
+
mode, executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each
|
|
39
39
|
browser instance, ensuring that resource limits are respected.
|
|
40
40
|
"""
|
|
41
41
|
|
|
@@ -55,7 +55,10 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
|
|
|
55
55
|
"""Initialize a new instance.
|
|
56
56
|
|
|
57
57
|
Args:
|
|
58
|
-
browser_type: The type of browser to launch
|
|
58
|
+
browser_type: The type of browser to launch:
|
|
59
|
+
- 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers
|
|
60
|
+
- 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on
|
|
61
|
+
the system.
|
|
59
62
|
user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local
|
|
60
63
|
storage.
|
|
61
64
|
browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
|
|
@@ -80,6 +83,17 @@ class PlaywrightBrowserPlugin(BrowserPlugin):
|
|
|
80
83
|
'chromium_sandbox': not config.disable_browser_sandbox,
|
|
81
84
|
}
|
|
82
85
|
|
|
86
|
+
if browser_type == 'chrome' and default_launch_browser_options['executable_path']:
|
|
87
|
+
raise ValueError(
|
|
88
|
+
'Cannot use browser_type `chrome` with `Configuration.default_browser_path` or `executable_path` set.'
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
# Map 'chrome' to 'chromium' with the 'chrome' channel.
|
|
92
|
+
if browser_type == 'chrome':
|
|
93
|
+
browser_type = 'chromium'
|
|
94
|
+
# Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome.
|
|
95
|
+
default_launch_browser_options['channel'] = 'chrome'
|
|
96
|
+
|
|
83
97
|
self._browser_type: BrowserType = browser_type
|
|
84
98
|
self._browser_launch_options: dict[str, Any] = default_launch_browser_options | (browser_launch_options or {})
|
|
85
99
|
self._browser_new_context_options = browser_new_context_options or {}
|
crawlee/browsers/_types.py
CHANGED
crawlee/configuration.py
CHANGED
|
@@ -28,7 +28,9 @@ class Configuration(BaseSettings):
|
|
|
28
28
|
Settings can also be configured via environment variables, prefixed with `CRAWLEE_`.
|
|
29
29
|
"""
|
|
30
30
|
|
|
31
|
-
|
|
31
|
+
# TODO: https://github.com/pydantic/pydantic-settings/issues/706
|
|
32
|
+
# Use `SettingsConfigDict(validate_by_name=True, validate_by_alias=True)` when issue is resolved.
|
|
33
|
+
model_config = SettingsConfigDict(populate_by_name=True)
|
|
32
34
|
|
|
33
35
|
internal_timeout: Annotated[timedelta | None, Field(alias='crawlee_internal_timeout')] = None
|
|
34
36
|
"""Timeout for the internal asynchronous operations."""
|