crawlee 1.0.5b18__py3-none-any.whl → 1.2.2b24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_browserforge_workaround.py +7 -3
  3. crawlee/_request.py +32 -13
  4. crawlee/_types.py +44 -5
  5. crawlee/_utils/context.py +3 -3
  6. crawlee/_utils/file.py +8 -1
  7. crawlee/_utils/globs.py +4 -4
  8. crawlee/_utils/recurring_task.py +12 -3
  9. crawlee/_utils/sitemap.py +12 -5
  10. crawlee/_utils/system.py +27 -11
  11. crawlee/_utils/time.py +41 -1
  12. crawlee/browsers/_browser_pool.py +1 -1
  13. crawlee/browsers/_playwright_browser.py +2 -1
  14. crawlee/crawlers/__init__.py +5 -1
  15. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  16. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +53 -17
  17. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  18. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +20 -49
  19. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  20. crawlee/crawlers/_basic/_basic_crawler.py +138 -124
  21. crawlee/crawlers/_basic/_context_utils.py +24 -0
  22. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  23. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  24. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  25. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  26. crawlee/crawlers/_playwright/_playwright_crawler.py +60 -22
  27. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  28. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  29. crawlee/crawlers/_playwright/_types.py +12 -2
  30. crawlee/errors.py +4 -0
  31. crawlee/events/_event_manager.py +12 -6
  32. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  33. crawlee/http_clients/_base.py +4 -0
  34. crawlee/http_clients/_curl_impersonate.py +68 -14
  35. crawlee/http_clients/_httpx.py +16 -6
  36. crawlee/http_clients/_impit.py +25 -10
  37. crawlee/otel/crawler_instrumentor.py +1 -3
  38. crawlee/request_loaders/_sitemap_request_loader.py +18 -5
  39. crawlee/router.py +13 -3
  40. crawlee/sessions/_cookies.py +13 -8
  41. crawlee/sessions/_models.py +3 -3
  42. crawlee/statistics/_models.py +51 -9
  43. crawlee/statistics/_statistics.py +2 -21
  44. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  45. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  46. crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
  47. crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
  48. crawlee/storage_clients/_file_system/_request_queue_client.py +5 -4
  49. crawlee/storage_clients/_redis/_client_mixin.py +1 -4
  50. crawlee/storage_clients/_redis/_dataset_client.py +6 -2
  51. crawlee/storage_clients/_redis/_key_value_store_client.py +3 -5
  52. crawlee/storage_clients/_redis/_request_queue_client.py +5 -8
  53. crawlee/storage_clients/_redis/_storage_client.py +12 -9
  54. crawlee/storage_clients/_redis/_utils.py +1 -1
  55. crawlee/storage_clients/_sql/_client_mixin.py +1 -1
  56. crawlee/storage_clients/_sql/_storage_client.py +0 -9
  57. crawlee/storage_clients/models.py +8 -3
  58. crawlee/storages/_storage_instance_manager.py +103 -44
  59. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +10 -16
  60. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +63 -62
  61. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
  62. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
  63. {crawlee-1.0.5b18.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_basic/_basic_crawler.py

@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import logging
 import signal
 import sys
@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Seque
 from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
+from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary
 
@@ -31,6 +33,8 @@ from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
+    ExportDataCsvKwargs,
+    ExportDataJsonKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,
@@ -40,7 +44,7 @@ from crawlee._types import (
     SkippedReason,
 )
 from crawlee._utils.docs import docs_group
-from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -55,8 +59,9 @@ from crawlee.errors import (
     RequestHandlerError,
     SessionError,
     UserDefinedErrorHandlerError,
+    UserHandlerTimeoutError,
 )
-from crawlee.events._types import Event, EventCrawlerStatusData, EventPersistStateData
+from crawlee.events._types import Event, EventCrawlerStatusData
 from crawlee.http_clients import ImpitHttpClient
 from crawlee.router import Router
 from crawlee.sessions import SessionPool
@@ -64,6 +69,7 @@ from crawlee.statistics import Statistics, StatisticsState
 from crawlee.storages import Dataset, KeyValueStore, RequestQueue
 
 from ._context_pipeline import ContextPipeline
+from ._context_utils import swapped_context
 from ._logging_utils import (
     get_one_line_error_summary_if_possible,
     reduce_asyncio_timeout_error_to_relevant_traceback_parts,
@@ -96,6 +102,9 @@ if TYPE_CHECKING:
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TRequestIterator = TypeVar('TRequestIterator', str, Request)
+TParams = ParamSpec('TParams')
+T = TypeVar('T')
+
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
 SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -401,7 +410,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]()
 
         # Context pipeline
-        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
+        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)  # ty: ignore[invalid-argument-type]
 
         # Crawl settings
         self._max_request_retries = max_request_retries
@@ -520,6 +529,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
         self._unexpected_stop = True
 
+    def _wrap_handler_with_error_context(
+        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+        """Decorate error handlers to make their context helpers usable."""
+
+        @functools.wraps(handler)
+        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+            # Original context helpers that are from `RequestHandlerRunResult` will not be committed as the request
+            # failed. Modified context provides context helpers with direct access to the storages.
+            error_context = context.create_modified_copy(
+                push_data=self._push_data,
+                get_key_value_store=self.get_key_value_store,
+                add_requests=functools.partial(self._add_requests, context),
+            )
+            return await handler(error_context, exception)
+
+        return wrapped_handler
+
     def _stop_if_max_requests_count_exceeded(self) -> None:
         """Call `stop` when the maximum number of requests to crawl has been reached."""
         if self._max_requests_per_crawl is None:
@@ -618,7 +645,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The error handler is invoked after a request handler error occurs and before a retry attempt.
         """
-        self._error_handler = handler
+        self._error_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def failed_request_handler(
@@ -628,7 +655,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The failed request handler is invoked when a request has failed all retry attempts.
         """
-        self._failed_request_handler = handler
+        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
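The two hunks above also change what error handlers receive at run time: callbacks registered with `@crawler.error_handler` or `@crawler.failed_request_handler` now get a context whose `push_data`, `get_key_value_store` and `add_requests` helpers write directly to the storages, since the failed request's buffered `RequestHandlerRunResult` is discarded. A minimal sketch of relying on that (the handler body is illustrative, not part of the diff):

    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

    crawler = BasicCrawler()

    @crawler.failed_request_handler
    async def record_failure(context: BasicCrawlingContext, error: Exception) -> None:
        # With the wrapping above, this write still reaches the dataset even
        # though the request failed and its handler result was thrown away.
        await context.push_data({'failed_url': context.request.url, 'error': str(error)})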
@@ -747,13 +774,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         async with AsyncExitStack() as exit_stack:
             for context in contexts_to_enter:
-                await exit_stack.enter_async_context(context)  # type: ignore[arg-type]
+                await exit_stack.enter_async_context(context)  # ty: ignore[invalid-argument-type]
 
             await self._autoscaled_pool.run()
 
-        # Emit PERSIST_STATE event when crawler is finishing to allow listeners to persist their state if needed
-        event_manager.emit(event=Event.PERSIST_STATE, event_data=EventPersistStateData(is_migrating=False))
-
     async def add_requests(
         self,
         requests: Sequence[str | Request],
@@ -849,6 +873,7 @@
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
+        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.
 
@@ -861,6 +886,7 @@
             dataset_id: The ID of the Dataset to export from.
             dataset_name: The name of the Dataset to export from (global scope, named storage).
             dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
         """
         dataset = await Dataset.open(
             id=dataset_id,
@@ -870,13 +896,18 @@
             configuration=self._service_locator.get_configuration(),
         )
 
-        path = path if isinstance(path, Path) else Path(path)
-        dst = path.open('w', newline='')
+        path = Path(path)
 
         if path.suffix == '.csv':
-            await export_csv_to_stream(dataset.iterate_items(), dst)
+            dst = StringIO()
+            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+            await atomic_write(path, dst.getvalue())
         elif path.suffix == '.json':
-            await export_json_to_stream(dataset.iterate_items(), dst)
+            dst = StringIO()
+            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+            await atomic_write(path, dst.getvalue())
         else:
             raise ValueError(f'Unsupported file extension: {path.suffix}')
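The new `**additional_kwargs` parameter above lets format-specific options flow through `export_data`, and the output is now written atomically from an in-memory buffer instead of an open file handle. A usage sketch, assuming `ExportDataCsvKwargs` and `ExportDataJsonKwargs` expose the usual `csv.writer` / `json.dump` options such as `delimiter` and `indent` (the exact accepted keys live in `crawlee._types`):

    # Inside an async function, after `await crawler.run([...])` has filled the dataset.
    await crawler.export_data('results.csv', delimiter=';')  # forwarded to the CSV exporter
    await crawler.export_data('results.json', indent=2)      # forwarded to the JSON exporter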
 
@@ -1008,7 +1039,12 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         warning_flag = True
 
         for request in request_iterator:
-            target_url = request.url if isinstance(request, Request) else request
+            if isinstance(request, Request):
+                if request.enqueue_strategy != strategy:
+                    request.enqueue_strategy = strategy
+                target_url = request.url
+            else:
+                target_url = request
             parsed_target_url = urlparse(target_url)
 
             if warning_flag and strategy != 'all' and not parsed_target_url.hostname:
@@ -1046,8 +1082,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             return target_url.hostname == origin_url.hostname
 
         if strategy == 'same-domain':
-            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).domain
-            target_domain = self._tld_extractor.extract_str(target_url.hostname).domain
+            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
             return origin_domain == target_domain
 
         if strategy == 'same-origin':
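The switch from `.domain` to `.top_domain_under_public_suffix` matters for the `same-domain` strategy: `.domain` ignores the public suffix, so unrelated registrable domains could compare as equal. A small illustration, assuming a tldextract release that provides `top_domain_under_public_suffix` (the successor of `registered_domain`):

    import tldextract

    extractor = tldextract.TLDExtract()
    a = extractor.extract_str('shop.example.co.uk')
    b = extractor.extract_str('example.com')

    print(a.domain, b.domain)                # 'example' 'example' -> would wrongly match
    print(a.top_domain_under_public_suffix)  # 'example.co.uk'
    print(b.top_domain_under_public_suffix)  # 'example.com' -> correctly treated as different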
@@ -1105,7 +1141,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             request.retry_count += 1
             reduced_error = str(error).split('\n')[0]
             self.log.warning(
-                f'Retrying request to {context.request.url} due to: {reduced_error}'
+                f'Retrying request to {context.request.url} due to: {reduced_error}. '
                 f'{get_one_line_error_summary_if_possible(error)}'
             )
             await self._statistics.error_tracker.add(error=error, context=context)
@@ -1116,19 +1152,15 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 except Exception as e:
                     raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
                 else:
-                    if new_request is not None:
-                        request = new_request
+                    if new_request is not None and new_request != request:
+                        await request_manager.add_request(new_request)
+                        await self._mark_request_as_handled(request)
+                        return
 
             await request_manager.reclaim_request(request)
         else:
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            request.state = RequestState.ERROR
+            await self._mark_request_as_handled(request)
             await self._handle_failed_request(context, error)
             self._statistics.record_request_processing_failure(request.unique_key)
 
@@ -1143,8 +1175,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 f'{self._internal_timeout.total_seconds()} seconds',
                 logger=self._logger,
             )
-
-            context.request.state = RequestState.DONE
         except UserDefinedErrorHandlerError:
             context.request.state = RequestState.ERROR
             raise
@@ -1177,17 +1207,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
     ) -> None:
         if need_mark and isinstance(request, Request):
-            request_manager = await self.get_request_manager()
-
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
             request.state = RequestState.SKIPPED
+            await self._mark_request_as_handled(request)
 
         url = request.url if isinstance(request, Request) else request
 
@@ -1207,10 +1228,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         if (
             isinstance(error, asyncio.exceptions.TimeoutError)
+            and traceback_parts
             and self._request_handler_timeout_text in traceback_parts[-1]
-        ):
+        ) or isinstance(error, UserHandlerTimeoutError):
             used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
-            used_traceback_parts.append(traceback_parts[-1])
+            used_traceback_parts.extend(traceback_parts[-1:])
 
         return ''.join(used_traceback_parts).strip('\n')
 
@@ -1259,58 +1281,54 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             else:
                 yield Request.from_url(url)
 
-    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
-        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
-        result = self._context_result_map[context]
-
-        base_request_manager = await self.get_request_manager()
-
-        origin = context.request.loaded_url or context.request.url
-
-        for add_requests_call in result.add_requests_calls:
-            rq_id = add_requests_call.get('rq_id')
-            rq_name = add_requests_call.get('rq_name')
-            rq_alias = add_requests_call.get('rq_alias')
-            specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
-            if specified_params > 1:
-                raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.')
-            if rq_id or rq_name or rq_alias:
-                request_manager: RequestManager | RequestQueue = await RequestQueue.open(
-                    id=rq_id,
-                    name=rq_name,
-                    alias=rq_alias,
-                    storage_client=self._service_locator.get_storage_client(),
-                    configuration=self._service_locator.get_configuration(),
-                )
-            else:
-                request_manager = base_request_manager
-
-            requests = list[Request]()
-
-            base_url = url if (url := add_requests_call.get('base_url')) else origin
-
-            requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
+    async def _add_requests(
+        self,
+        context: BasicCrawlingContext,
+        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> None:
+        """Add requests method aware of the crawling context."""
+        if rq_id or rq_name or rq_alias:
+            request_manager: RequestManager = await RequestQueue.open(
+                id=rq_id,
+                name=rq_name,
+                alias=rq_alias,
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
+        else:
+            request_manager = await self.get_request_manager()
 
-            enqueue_links_kwargs: EnqueueLinksKwargs = {k: v for k, v in add_requests_call.items() if k != 'requests'}  # type: ignore[assignment]
+        context_aware_requests = list[Request]()
+        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+        for dst_request in filter_requests_iterator:
+            # Update the crawl depth of the request.
+            dst_request.crawl_depth = context.request.crawl_depth + 1
 
-            filter_requests_iterator = self._enqueue_links_filter_iterator(
-                requests_iterator, context.request.url, **enqueue_links_kwargs
-            )
+            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                context_aware_requests.append(dst_request)
 
-            for dst_request in filter_requests_iterator:
-                # Update the crawl depth of the request.
-                dst_request.crawl_depth = context.request.crawl_depth + 1
+        return await request_manager.add_requests(context_aware_requests)
 
-                if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
-                    requests.append(dst_request)
+    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+        result = self._context_result_map[context]
 
-            await request_manager.add_requests(requests)
+        for add_requests_call in result.add_requests_calls:
+            await self._add_requests(context, **add_requests_call)
 
         for push_data_call in result.push_data_calls:
            await self._push_data(**push_data_call)
 
         await self._commit_key_value_store_changes(result, get_kvs=self.get_key_value_store)
 
+        result.apply_request_changes(target=context.request)
+
     @staticmethod
     async def _commit_key_value_store_changes(
         result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
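The extracted `_add_requests` helper above is what backs `add_requests` / `enqueue_links` calls recorded by a handler: relative URLs are resolved against the loaded (or original) URL, every enqueued request gets the parent's `crawl_depth` plus one, and anything beyond `max_crawl_depth` is dropped before it reaches the queue. A sketch of the caller side (the handler body is hypothetical):

    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

    crawler = BasicCrawler(max_crawl_depth=2)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # '/docs' is resolved against context.request.loaded_url (or .url);
        # both new requests inherit context.request.crawl_depth + 1.
        await context.add_requests(['/docs', 'https://crawlee.dev/api'])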
@@ -1376,10 +1394,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         else:
             session = await self._get_session()
         proxy_info = await self._get_proxy_info(request, session)
-        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request)
 
         context = BasicCrawlingContext(
-            request=request,
+            request=result.request,
             session=session,
             proxy_info=proxy_info,
             send_request=self._prepare_send_request_function(session, proxy_info),
@@ -1396,32 +1414,26 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         try:
             request.state = RequestState.REQUEST_HANDLER
 
-            self._check_request_collision(context.request, context.session)
-
             try:
-                await self._run_request_handler(context=context)
+                with swapped_context(context, request):
+                    self._check_request_collision(request, session)
+                    await self._run_request_handler(context=context)
             except asyncio.TimeoutError as e:
                 raise RequestHandlerError(e, context) from e
 
             await self._commit_request_handler_result(context)
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
 
             request.state = RequestState.DONE
 
-            if context.session and context.session.is_usable:
-                context.session.mark_good()
+            await self._mark_request_as_handled(request)
+
+            if session and session.is_usable:
+                session.mark_good()
 
             self._statistics.record_request_processing_finish(request.unique_key)
 
         except RequestCollisionError as request_error:
-            context.request.no_retry = True
+            request.no_retry = True
             await self._handle_request_error(context, request_error)
 
         except RequestHandlerError as primary_error:
@@ -1436,7 +1448,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             await self._handle_request_error(primary_error.crawling_context, primary_error.wrapped_exception)
 
         except SessionError as session_error:
-            if not context.session:
+            if not session:
                 raise RuntimeError('SessionError raised in a crawling context without a session') from session_error
 
             if self._error_handler:
@@ -1446,22 +1458,16 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 exc_only = ''.join(traceback.format_exception_only(session_error)).strip()
                 self._logger.warning('Encountered "%s", rotating session and retrying...', exc_only)
 
-                context.session.retire()
+                if session:
+                    session.retire()
 
                 # Increment session rotation count.
-                context.request.session_rotation_count = (context.request.session_rotation_count or 0) + 1
+                request.session_rotation_count = (request.session_rotation_count or 0) + 1
 
                 await request_manager.reclaim_request(request)
                 await self._statistics.error_tracker_retry.add(error=session_error, context=context)
             else:
-                await wait_for(
-                    lambda: request_manager.mark_request_as_handled(context.request),
-                    timeout=self._internal_timeout,
-                    timeout_message='Marking request as handled timed out after '
-                    f'{self._internal_timeout.total_seconds()} seconds',
-                    logger=self._logger,
-                    max_retries=3,
-                )
+                await self._mark_request_as_handled(request)
 
                 await self._handle_failed_request(context, session_error)
                 self._statistics.record_request_processing_failure(request.unique_key)
@@ -1469,14 +1475,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         except ContextPipelineInterruptedError as interrupted_error:
             self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)
 
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
 
         except ContextPipelineInitializationError as initialization_error:
             self._logger.debug(
@@ -1494,12 +1493,16 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             raise
 
     async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-        await wait_for(
-            lambda: self._context_pipeline(context, self.router),
-            timeout=self._request_handler_timeout,
-            timeout_message=f'{self._request_handler_timeout_text}'
-            f' {self._request_handler_timeout.total_seconds()} seconds',
-            logger=self._logger,
+        context.request.state = RequestState.BEFORE_NAV
+        await self._context_pipeline(
+            context,
+            lambda final_context: wait_for(
+                lambda: self.router(final_context),
+                timeout=self._request_handler_timeout,
+                timeout_message=f'{self._request_handler_timeout_text}'
+                f' {self._request_handler_timeout.total_seconds()} seconds',
+                logger=self._logger,
+            ),
         )
 
     def _raise_for_error_status_code(self, status_code: int) -> None:
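Note the behavioural shift in `_run_request_handler`: the request-handler timeout now wraps only the router call, so time spent in the rest of the context pipeline (for example page navigation in the HTTP or browser crawlers) no longer counts against it. Configuration is unchanged; a sketch, assuming the public constructor keyword is `request_handler_timeout` as in current releases:

    from datetime import timedelta
    from crawlee.crawlers import BasicCrawler

    # Bounds only the user-defined handler body, not the whole context pipeline.
    crawler = BasicCrawler(request_handler_timeout=timedelta(seconds=30))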
@@ -1647,3 +1650,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             )
 
         self._previous_crawler_state = current_state
+
+    async def _mark_request_as_handled(self, request: Request) -> None:
+        request_manager = await self.get_request_manager()
+        await wait_for(
+            lambda: request_manager.mark_request_as_handled(request),
+            timeout=self._internal_timeout,
+            timeout_message='Marking request as handled timed out after '
+            f'{self._internal_timeout.total_seconds()} seconds',
+            logger=self._logger,
+            max_retries=3,
+        )
crawlee/crawlers/_basic/_context_utils.py

@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from crawlee._request import Request
+
+    from ._basic_crawling_context import BasicCrawlingContext
+
+
+@contextmanager
+def swapped_context(
+    context: BasicCrawlingContext,
+    request: Request,
+) -> Iterator[None]:
+    """Replace context's isolated copies with originals after handler execution."""
+    try:
+        yield
+    finally:
+        # Restore original context state to avoid side effects between different handlers.
+        object.__setattr__(context, 'request', request)
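`swapped_context` restores the original `Request` with `object.__setattr__` because the crawling context is a frozen dataclass, where plain attribute assignment raises. A standalone illustration of the same pattern (not crawlee code):

    from dataclasses import dataclass

    @dataclass(frozen=True)
    class Ctx:
        request: str

    ctx = Ctx(request='original')
    # ctx.request = 'copy' would raise dataclasses.FrozenInstanceError.
    object.__setattr__(ctx, 'request', 'copy')  # bypasses the frozen guard
    assert ctx.request == 'copy'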
crawlee/crawlers/_basic/_logging_utils.py

@@ -2,9 +2,21 @@ import asyncio
 import re
 import traceback
 
+import crawlee.errors
+
 
 def _get_only_innermost_exception(error: BaseException) -> BaseException:
-    """Get innermost exception by following __cause__ and __context__ attributes of exception."""
+    """Get innermost exception by following __cause__ and __context__ attributes of exception.
+
+    If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible.
+    """
+    if type(error) is crawlee.errors.UserHandlerTimeoutError:
+        if error.__cause__:
+            return error.__cause__
+        if error.__context__:
+            return error.__context__
+        return error
+
     if error.__cause__:
         return _get_only_innermost_exception(error.__cause__)
     if error.__context__:
@@ -34,7 +46,7 @@ def _strip_pep657_highlighting(traceback_part: str) -> str:
 
 
 def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
-    timeout_error: asyncio.exceptions.TimeoutError,
+    timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,
 ) -> list[str]:
     innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error)
     return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)
@@ -43,13 +55,20 @@ def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
 def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
     innermost_error = _get_only_innermost_exception(error)
     return traceback.format_exception(
-        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=True
+        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False
     )
 
 
 def get_one_line_error_summary_if_possible(error: Exception) -> str:
     if isinstance(error, asyncio.exceptions.TimeoutError):
-        most_relevant_part = ',' + reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+        relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
+        most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else ''
+    elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):
+        # Error is user defined handler. First two lines should be location of the `UserHandlerTimeoutError` in crawlee
+        # code and third line the topmost user error
+        traceback_parts = _get_traceback_parts_for_innermost_exception(error)
+        relevant_index_from_start = 3
+        most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else ''
     elif 'playwright._impl._errors.Error' in str(error.__class__):
         # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
         # point to deep internals.
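For orientation: `__cause__` is set by explicit chaining (`raise ... from ...`) and `__context__` by implicit chaining when an exception is raised while another is being handled, which is why the helpers above follow both attributes. A tiny standalone illustration (not crawlee code):

    try:
        try:
            1 / 0
        except ZeroDivisionError as inner:
            raise ValueError('wrapped') from inner  # sets __cause__ (and __context__)
    except ValueError as outer:
        assert isinstance(outer.__cause__, ZeroDivisionError)
        assert outer.__context__ is outer.__cause__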
crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py

@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from bs4 import BeautifulSoup, Tag
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
 from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Bea
         self,
         *,
         parser: BeautifulSoupParserType = 'lxml',
-        **kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
+        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
 
crawlee/crawlers/_parsel/_parsel_crawler.py

@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from parsel import Selector
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._parsel_crawling_context import ParselCrawlingContext
 from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selecto
 
     def __init__(
         self,
-        **kwargs: Unpack[BasicCrawlerOptions[ParselCrawlingContext]],
+        **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
 
crawlee/crawlers/_parsel/_parsel_parser.py

@@ -22,7 +22,7 @@ class ParselParser(AbstractHttpParser[Selector, Selector]):
     @override
     async def parse(self, response: HttpResponse) -> Selector:
         response_body = await response.read()
-        return await asyncio.to_thread(lambda: Selector(body=response_body))
+        return await asyncio.to_thread(Selector, body=response_body)
 
     @override
     async def parse_text(self, text: str) -> Selector: