crawlee 1.1.1b2__py3-none-any.whl → 1.1.1b3__py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the two published versions.
- crawlee/_types.py +20 -1
- crawlee/crawlers/_basic/_basic_crawler.py +60 -44
- {crawlee-1.1.1b2.dist-info → crawlee-1.1.1b3.dist-info}/METADATA +1 -1
- {crawlee-1.1.1b2.dist-info → crawlee-1.1.1b3.dist-info}/RECORD +7 -7
- {crawlee-1.1.1b2.dist-info → crawlee-1.1.1b3.dist-info}/WHEEL +0 -0
- {crawlee-1.1.1b2.dist-info → crawlee-1.1.1b3.dist-info}/entry_points.txt +0 -0
- {crawlee-1.1.1b2.dist-info → crawlee-1.1.1b3.dist-info}/licenses/LICENSE +0 -0
crawlee/_types.py
CHANGED
@@ -15,7 +15,7 @@ if TYPE_CHECKING:
     import re
     from collections.abc import Callable, Coroutine, Sequence
 
-    from typing_extensions import NotRequired, Required, Unpack
+    from typing_extensions import NotRequired, Required, Self, Unpack
 
     from crawlee import Glob, Request
     from crawlee._request import RequestOptions
@@ -643,6 +643,25 @@ class BasicCrawlingContext:
         """Return hash of the context. Each context is considered unique."""
         return id(self)
 
+    def create_modified_copy(
+        self,
+        push_data: PushDataFunction | None = None,
+        add_requests: AddRequestsFunction | None = None,
+        get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
+    ) -> Self:
+        """Create a modified copy of the crawling context with specified changes."""
+        original_fields = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
+        modified_fields = {
+            key: value
+            for key, value in {
+                'push_data': push_data,
+                'add_requests': add_requests,
+                'get_key_value_store': get_key_value_store,
+            }.items()
+            if value
+        }
+        return self.__class__(**{**original_fields, **modified_fields})
+
 
 class GetDataKwargs(TypedDict):
     """Keyword arguments for dataset's `get_data` method."""
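The new helper copies every dataclass field of the context and overrides only the helpers that were passed in. A minimal sketch of the behaviour, assuming an existing BasicCrawlingContext instance named `context` (for example the argument of any request handler) and a hypothetical replacement helper:

    async def push_directly(data, **kwargs) -> None:
        # Hypothetical stand-in for a helper that writes straight to a dataset.
        ...

    copy = context.create_modified_copy(push_data=push_directly)
    assert copy.request is context.request    # untouched fields are carried over
    assert copy.push_data is push_directly    # only the supplied helper is replaced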
crawlee/crawlers/_basic/_basic_crawler.py
CHANGED

@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import logging
 import signal
 import sys
@@ -14,7 +15,7 @@ from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary
 
@@ -96,6 +97,9 @@ if TYPE_CHECKING:
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TRequestIterator = TypeVar('TRequestIterator', str, Request)
+TParams = ParamSpec('TParams')
+T = TypeVar('T')
+
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
 SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -520,6 +524,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
         self._unexpected_stop = True
 
+    def _wrap_handler_with_error_context(
+        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+        """Decorate error handlers to make their context helpers usable."""
+
+        @functools.wraps(handler)
+        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+            # Original context helpers that are from `RequestHandlerRunResult` will not be commited as the request
+            # failed. Modified context provides context helpers with direct access to the storages.
+            error_context = context.create_modified_copy(
+                push_data=self._push_data,
+                get_key_value_store=self.get_key_value_store,
+                add_requests=functools.partial(self._add_requests, context),
+            )
+            return await handler(error_context, exception)
+
+        return wrapped_handler
+
     def _stop_if_max_requests_count_exceeded(self) -> None:
         """Call `stop` when the maximum number of requests to crawl has been reached."""
         if self._max_requests_per_crawl is None:
@@ -618,7 +640,7 @@
 
         The error handler is invoked after a request handler error occurs and before a retry attempt.
         """
-        self._error_handler = handler
+        self._error_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def failed_request_handler(
@@ -628,7 +650,7 @@
 
         The failed request handler is invoked when a request has failed all retry attempts.
         """
-        self._failed_request_handler = handler
+        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
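The practical effect of wrapping both registration points is that context helpers keep working inside error and failed-request handlers: instead of being recorded on a RequestHandlerRunResult that is discarded when the request fails, they write straight to the storages. A minimal sketch of what this enables, assuming the usual public imports (from crawlee.crawlers import BasicCrawler, BasicCrawlingContext); the handler body is illustrative, not part of the package:

    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

    crawler = BasicCrawler(max_request_retries=2)

    @crawler.failed_request_handler
    async def record_failure(context: BasicCrawlingContext, error: Exception) -> None:
        # With this release, push_data commits directly to the dataset even though
        # the request failed and its handler result was never committed.
        await context.push_data({'url': context.request.url, 'error': str(error)})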
@@ -1256,52 +1278,46 @@
         else:
             yield Request.from_url(url)
 
-    async def …
-    …
-                    storage_client=self._service_locator.get_storage_client(),
-                    configuration=self._service_locator.get_configuration(),
-                )
-            else:
-                request_manager = base_request_manager
-
-            requests = list[Request]()
-
-            base_url = url if (url := add_requests_call.get('base_url')) else origin
-
-            requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
-            …
-            )
-
-                # Update the crawl depth of the request.
-                dst_request.crawl_depth = context.request.crawl_depth + 1
-            …
+    async def _add_requests(
+        self,
+        context: BasicCrawlingContext,
+        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> None:
+        """Add requests method aware of the crawling context."""
+        if rq_id or rq_name or rq_alias:
+            request_manager: RequestManager = await RequestQueue.open(
+                id=rq_id,
+                name=rq_name,
+                alias=rq_alias,
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
+        else:
+            request_manager = await self.get_request_manager()
+
+        context_aware_requests = list[Request]()
+        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+        for dst_request in filter_requests_iterator:
+            # Update the crawl depth of the request.
+            dst_request.crawl_depth = context.request.crawl_depth + 1
+
+            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                context_aware_requests.append(dst_request)
+
+        return await request_manager.add_requests(context_aware_requests)
+
+    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+        result = self._context_result_map[context]
+
+        for add_requests_call in result.add_requests_calls:
+            await self._add_requests(context, **add_requests_call)
 
         for push_data_call in result.push_data_calls:
             await self._push_data(**push_data_call)
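Because the extracted `_add_requests` is also what the error-context wrapper binds to `context.add_requests`, links enqueued from an error or failed-request handler now reach the request manager directly, with the same URL resolution and crawl-depth checks as in regular handlers. Continuing the crawler sketch above; the base_url fallback to the request's loaded_url or url is taken from the diff, while the handler itself is illustrative:

    from crawlee import Request

    @crawler.error_handler
    async def enqueue_fallback(context: BasicCrawlingContext, error: Exception) -> Request | None:
        # '/fallback' is resolved against base_url here; without base_url it would be
        # resolved against context.request.loaded_url or context.request.url.
        await context.add_requests(['/fallback'], base_url='https://example.com')
        return None  # keep the default retry behaviour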
{crawlee-1.1.1b2.dist-info → crawlee-1.1.1b3.dist-info}/RECORD
CHANGED

@@ -5,7 +5,7 @@ crawlee/_consts.py,sha256=RQ96gx7V-WPH91cVsMUz76X5UZUNDNhCudtlyGkxFVk,133
 crawlee/_log_config.py,sha256=VyxoEfWCq_9fyicmmJbjiZ5KC91onMcAtX2L4oKX4m4,5999
 crawlee/_request.py,sha256=fnUofyFMV3HJwfcLjYr2BCZ5K9mEwl6vZd8Pr309wCE,16458
 crawlee/_service_locator.py,sha256=SJ8ABYtclBl7rz8kfZ2jZkIgKq5oNIoGT7WmN8ApTzo,5058
-crawlee/_types.py,sha256=…
+crawlee/_types.py,sha256=93yoGr_KqMDIkq3__3QYpIAJmEzZvDoilHAF7_X4J4A,29933
 crawlee/configuration.py,sha256=DWS2z1FC6Ua93W2tStK3R1ZKZbZjVQYWGiGFbZFaRtA,8064
 crawlee/errors.py,sha256=RhFNA_uT615nVBHf9TylpX5YWwtDuHUUEV8LPT4CYa4,3878
 crawlee/proxy_configuration.py,sha256=rqf67yerXvLvraBaAHW04nvf5ECze3wMQbK7LlqXucM,10386

@@ -65,7 +65,7 @@ crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py,sha256=TM4mkb…
 crawlee/crawlers/_adaptive_playwright/_result_comparator.py,sha256=NAfw5VKzTnkvARtLr_zrZj6UGeMp05Voc6Oi8oPxU3w,1747
 crawlee/crawlers/_adaptive_playwright/_utils.py,sha256=EUYVz5i2YkLpL_gbVRp9BAD5u6w1xJ_AFzc_qB9bdDQ,1102
 crawlee/crawlers/_basic/__init__.py,sha256=LPln8SiBBXSMqrApiFUfpqz3hvqxN5HUa1cHQXMVKgU,280
-crawlee/crawlers/_basic/_basic_crawler.py,sha256=…
+crawlee/crawlers/_basic/_basic_crawler.py,sha256=9N8qPh-k2b6_oywL5taMV-kfMlcs_Q7NgwzISQjzX2I,73869
 crawlee/crawlers/_basic/_basic_crawling_context.py,sha256=fjxm2RQXMDkDlWu38dQ3xn5rrGUOhJXkXiqkgbFJFk4,155
 crawlee/crawlers/_basic/_context_pipeline.py,sha256=vM8EEvnCoguERjRV3oyrxUq2Ln2F9DzY7P5dAEiuMHo,5869
 crawlee/crawlers/_basic/_logging_utils.py,sha256=jp5mEwSq5a_BgzUhNPJ9WrIDcoIeYGbeHstcRqCcP0s,3093

@@ -199,8 +199,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp…
 crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
 crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
 crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-crawlee-1.1.…
-crawlee-1.1.…
-crawlee-1.1.…
-crawlee-1.1.…
-crawlee-1.1.…
+crawlee-1.1.1b3.dist-info/METADATA,sha256=2Xebj8j2r7TzYlzBEqMu7W2uYfTVHKJK68yO7kEd5Ws,29532
+crawlee-1.1.1b3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+crawlee-1.1.1b3.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
+crawlee-1.1.1b3.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+crawlee-1.1.1b3.dist-info/RECORD,,
{crawlee-1.1.1b2.dist-info → crawlee-1.1.1b3.dist-info}/WHEEL
File without changes

{crawlee-1.1.1b2.dist-info → crawlee-1.1.1b3.dist-info}/entry_points.txt
File without changes

{crawlee-1.1.1b2.dist-info → crawlee-1.1.1b3.dist-info}/licenses/LICENSE
File without changes