crawlee 1.1.1b2__py3-none-any.whl → 1.1.1b4__py3-none-any.whl

crawlee/_types.py CHANGED
@@ -15,7 +15,7 @@ if TYPE_CHECKING:
     import re
     from collections.abc import Callable, Coroutine, Sequence
 
-    from typing_extensions import NotRequired, Required, Unpack
+    from typing_extensions import NotRequired, Required, Self, Unpack
 
     from crawlee import Glob, Request
     from crawlee._request import RequestOptions
@@ -643,6 +643,25 @@ class BasicCrawlingContext:
         """Return hash of the context. Each context is considered unique."""
         return id(self)
 
+    def create_modified_copy(
+        self,
+        push_data: PushDataFunction | None = None,
+        add_requests: AddRequestsFunction | None = None,
+        get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
+    ) -> Self:
+        """Create a modified copy of the crawling context with specified changes."""
+        original_fields = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
+        modified_fields = {
+            key: value
+            for key, value in {
+                'push_data': push_data,
+                'add_requests': add_requests,
+                'get_key_value_store': get_key_value_store,
+            }.items()
+            if value
+        }
+        return self.__class__(**{**original_fields, **modified_fields})
+
 
 class GetDataKwargs(TypedDict):
     """Keyword arguments for dataset's `get_data` method."""
crawlee/crawlers/_basic/_basic_crawler.py CHANGED
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import logging
 import signal
 import sys
@@ -14,7 +15,7 @@ from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary
 
@@ -96,6 +97,9 @@ if TYPE_CHECKING:
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TRequestIterator = TypeVar('TRequestIterator', str, Request)
+TParams = ParamSpec('TParams')
+T = TypeVar('T')
+
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
 SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -520,6 +524,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
         self._unexpected_stop = True
 
+    def _wrap_handler_with_error_context(
+        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+        """Decorate error handlers to make their context helpers usable."""
+
+        @functools.wraps(handler)
+        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+            # Original context helpers that are from `RequestHandlerRunResult` will not be commited as the request
+            # failed. Modified context provides context helpers with direct access to the storages.
+            error_context = context.create_modified_copy(
+                push_data=self._push_data,
+                get_key_value_store=self.get_key_value_store,
+                add_requests=functools.partial(self._add_requests, context),
+            )
+            return await handler(error_context, exception)
+
+        return wrapped_handler
+
     def _stop_if_max_requests_count_exceeded(self) -> None:
         """Call `stop` when the maximum number of requests to crawl has been reached."""
         if self._max_requests_per_crawl is None:
@@ -618,7 +640,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The error handler is invoked after a request handler error occurs and before a retry attempt.
         """
-        self._error_handler = handler
+        self._error_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def failed_request_handler(
@@ -628,7 +650,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The failed request handler is invoked when a request has failed all retry attempts.
         """
-        self._failed_request_handler = handler
+        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
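Because `error_handler` and `failed_request_handler` now route the registered handler through `_wrap_handler_with_error_context`, those handlers receive a context whose `push_data`, `add_requests`, and `get_key_value_store` write to the storages directly instead of going through the failed request's discarded `RequestHandlerRunResult`. A usage sketch; the handler body, dataset fields, and URL are illustrative, not taken from the diff:

```python
from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

crawler = BasicCrawler(max_request_retries=1)


@crawler.failed_request_handler
async def record_failure(context: BasicCrawlingContext, error: Exception) -> None:
    # With 1.1.1b4 these calls are persisted even though the request itself failed.
    await context.push_data({'url': context.request.url, 'error': str(error)})
    await context.add_requests(['https://example.com/fallback'])
```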
@@ -1256,52 +1278,46 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         else:
             yield Request.from_url(url)
 
-    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
-        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
-        result = self._context_result_map[context]
-
-        base_request_manager = await self.get_request_manager()
-
-        origin = context.request.loaded_url or context.request.url
-
-        for add_requests_call in result.add_requests_calls:
-            rq_id = add_requests_call.get('rq_id')
-            rq_name = add_requests_call.get('rq_name')
-            rq_alias = add_requests_call.get('rq_alias')
-            specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
-            if specified_params > 1:
-                raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.')
-            if rq_id or rq_name or rq_alias:
-                request_manager: RequestManager | RequestQueue = await RequestQueue.open(
-                    id=rq_id,
-                    name=rq_name,
-                    alias=rq_alias,
-                    storage_client=self._service_locator.get_storage_client(),
-                    configuration=self._service_locator.get_configuration(),
-                )
-            else:
-                request_manager = base_request_manager
-
-            requests = list[Request]()
-
-            base_url = url if (url := add_requests_call.get('base_url')) else origin
-
-            requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
+    async def _add_requests(
+        self,
+        context: BasicCrawlingContext,
+        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> None:
+        """Add requests method aware of the crawling context."""
+        if rq_id or rq_name or rq_alias:
+            request_manager: RequestManager = await RequestQueue.open(
+                id=rq_id,
+                name=rq_name,
+                alias=rq_alias,
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
+        else:
+            request_manager = await self.get_request_manager()
 
-            enqueue_links_kwargs: EnqueueLinksKwargs = {k: v for k, v in add_requests_call.items() if k != 'requests'}  # type: ignore[assignment]
+        context_aware_requests = list[Request]()
+        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+        for dst_request in filter_requests_iterator:
+            # Update the crawl depth of the request.
+            dst_request.crawl_depth = context.request.crawl_depth + 1
 
-            filter_requests_iterator = self._enqueue_links_filter_iterator(
-                requests_iterator, context.request.url, **enqueue_links_kwargs
-            )
+            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                context_aware_requests.append(dst_request)
 
-            for dst_request in filter_requests_iterator:
-                # Update the crawl depth of the request.
-                dst_request.crawl_depth = context.request.crawl_depth + 1
+        return await request_manager.add_requests(context_aware_requests)
 
-                if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
-                    requests.append(dst_request)
+    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+        result = self._context_result_map[context]
 
-            await request_manager.add_requests(requests)
+        for add_requests_call in result.add_requests_calls:
+            await self._add_requests(context, **add_requests_call)
 
         for push_data_call in result.push_data_calls:
             await self._push_data(**push_data_call)
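The refactor moves request-queue resolution, enqueue filtering, and crawl-depth handling out of `_commit_request_handler_result` into the reusable `_add_requests`, which both committed handler results and the wrapped error-handler contexts now call. The context's `add_requests` call shape stays the same: relative URLs resolve against the request's loaded URL unless `base_url` is given, one of `rq_id`/`rq_name`/`rq_alias` selects a non-default request queue, and requests past `max_crawl_depth` are dropped. A sketch with an illustrative queue name and URLs:

```python
from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

crawler = BasicCrawler(max_crawl_depth=2)


@crawler.router.default_handler
async def handler(context: BasicCrawlingContext) -> None:
    # './page-2' resolves against context.request.loaded_url (or .url);
    # rq_name='follow-ups' routes both requests to a named RequestQueue
    # instead of the crawler's default request manager.
    await context.add_requests(
        ['https://example.com/detail', './page-2'],
        rq_name='follow-ups',
    )
```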
crawlee-1.1.1b2.dist-info/METADATA → crawlee-1.1.1b4.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: crawlee
-Version: 1.1.1b2
+Version: 1.1.1b4
 Summary: Crawlee for Python
 Project-URL: Apify Homepage, https://apify.com
 Project-URL: Changelog, https://crawlee.dev/python/docs/changelog
crawlee-1.1.1b2.dist-info/RECORD → crawlee-1.1.1b4.dist-info/RECORD
@@ -5,7 +5,7 @@ crawlee/_consts.py,sha256=RQ96gx7V-WPH91cVsMUz76X5UZUNDNhCudtlyGkxFVk,133
 crawlee/_log_config.py,sha256=VyxoEfWCq_9fyicmmJbjiZ5KC91onMcAtX2L4oKX4m4,5999
 crawlee/_request.py,sha256=fnUofyFMV3HJwfcLjYr2BCZ5K9mEwl6vZd8Pr309wCE,16458
 crawlee/_service_locator.py,sha256=SJ8ABYtclBl7rz8kfZ2jZkIgKq5oNIoGT7WmN8ApTzo,5058
-crawlee/_types.py,sha256=DAmfSv5W1dt3nJhJ8z-02gDaE06fdEizNKUlHpsd2_A,29129
+crawlee/_types.py,sha256=93yoGr_KqMDIkq3__3QYpIAJmEzZvDoilHAF7_X4J4A,29933
 crawlee/configuration.py,sha256=DWS2z1FC6Ua93W2tStK3R1ZKZbZjVQYWGiGFbZFaRtA,8064
 crawlee/errors.py,sha256=RhFNA_uT615nVBHf9TylpX5YWwtDuHUUEV8LPT4CYa4,3878
 crawlee/proxy_configuration.py,sha256=rqf67yerXvLvraBaAHW04nvf5ECze3wMQbK7LlqXucM,10386
@@ -65,7 +65,7 @@ crawlee/crawlers/_adaptive_playwright/_rendering_type_predictor.py,sha256=TM4mkb
 crawlee/crawlers/_adaptive_playwright/_result_comparator.py,sha256=NAfw5VKzTnkvARtLr_zrZj6UGeMp05Voc6Oi8oPxU3w,1747
 crawlee/crawlers/_adaptive_playwright/_utils.py,sha256=EUYVz5i2YkLpL_gbVRp9BAD5u6w1xJ_AFzc_qB9bdDQ,1102
 crawlee/crawlers/_basic/__init__.py,sha256=LPln8SiBBXSMqrApiFUfpqz3hvqxN5HUa1cHQXMVKgU,280
-crawlee/crawlers/_basic/_basic_crawler.py,sha256=w8JSPF1zw2QG_1Phek5dENL_d5BC2N4dSZ6oR0MoUSY,73052
+crawlee/crawlers/_basic/_basic_crawler.py,sha256=9N8qPh-k2b6_oywL5taMV-kfMlcs_Q7NgwzISQjzX2I,73869
 crawlee/crawlers/_basic/_basic_crawling_context.py,sha256=fjxm2RQXMDkDlWu38dQ3xn5rrGUOhJXkXiqkgbFJFk4,155
 crawlee/crawlers/_basic/_context_pipeline.py,sha256=vM8EEvnCoguERjRV3oyrxUq2Ln2F9DzY7P5dAEiuMHo,5869
 crawlee/crawlers/_basic/_logging_utils.py,sha256=jp5mEwSq5a_BgzUhNPJ9WrIDcoIeYGbeHstcRqCcP0s,3093
@@ -199,8 +199,8 @@ crawlee/storages/_request_queue.py,sha256=bjBOGbpMaGUsqJPVB-JD2VShziPAYMI-GvWKKp
 crawlee/storages/_storage_instance_manager.py,sha256=72n0YlPwNpSQDJSPf4TxnI2GvIK6L-ZiTmHRbFcoVU0,8164
 crawlee/storages/_utils.py,sha256=Yz-5tEBYKYCFJemYT29--uGJqoJLApLDLgPcsnbifRw,439
 crawlee/storages/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-crawlee-1.1.1b2.dist-info/METADATA,sha256=LMwCP8_OXNKMiOwWfP1tF-6gy2OTkjJdrjtfKHRfdLs,29532
-crawlee-1.1.1b2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-crawlee-1.1.1b2.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
-crawlee-1.1.1b2.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
-crawlee-1.1.1b2.dist-info/RECORD,,
+crawlee-1.1.1b4.dist-info/METADATA,sha256=sSul_ezKS7jN5cvWq6i6o9npC8egHrVXAnEXfOWW0P0,29532
+crawlee-1.1.1b4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+crawlee-1.1.1b4.dist-info/entry_points.txt,sha256=1p65X3dA-cYvzjtlxLL6Kn1wpY-3uEDVqJLp53uNPeo,45
+crawlee-1.1.1b4.dist-info/licenses/LICENSE,sha256=AsFjHssKjj4LGd2ZCqXn6FBzMqcWdjQre1byPPSypVw,11355
+crawlee-1.1.1b4.dist-info/RECORD,,