crawlee 1.1.1b1 → 1.2.1b7 (py3-none-any.whl)

This diff shows the changes between these publicly released package versions as they appear in their public registry and is provided for informational purposes only.

This release of crawlee has been flagged as potentially problematic.

Files changed (37)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_request.py +29 -10
  3. crawlee/_types.py +42 -2
  4. crawlee/_utils/context.py +2 -2
  5. crawlee/_utils/file.py +7 -0
  6. crawlee/_utils/recurring_task.py +2 -1
  7. crawlee/_utils/time.py +41 -1
  8. crawlee/crawlers/__init__.py +2 -1
  9. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  10. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +52 -14
  11. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +10 -33
  12. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  13. crawlee/crawlers/_basic/_basic_crawler.py +135 -118
  14. crawlee/crawlers/_basic/_context_utils.py +24 -0
  15. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  16. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  17. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  18. crawlee/crawlers/_playwright/_playwright_crawler.py +58 -17
  19. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  20. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  21. crawlee/crawlers/_playwright/_types.py +12 -2
  22. crawlee/errors.py +4 -0
  23. crawlee/events/_event_manager.py +1 -3
  24. crawlee/http_clients/_base.py +4 -0
  25. crawlee/http_clients/_curl_impersonate.py +12 -0
  26. crawlee/http_clients/_httpx.py +16 -6
  27. crawlee/http_clients/_impit.py +25 -10
  28. crawlee/router.py +13 -3
  29. crawlee/storage_clients/_file_system/_dataset_client.py +2 -2
  30. crawlee/storage_clients/_file_system/_key_value_store_client.py +3 -3
  31. crawlee/storage_clients/_file_system/_request_queue_client.py +3 -3
  32. crawlee/storage_clients/_sql/_storage_client.py +0 -9
  33. {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/METADATA +10 -16
  34. {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/RECORD +37 -36
  35. {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/WHEEL +1 -1
  36. {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/entry_points.txt +0 -0
  37. {crawlee-1.1.1b1.dist-info → crawlee-1.2.1b7.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_basic/_basic_crawler.py

@@ -2,6 +2,7 @@
  from __future__ import annotations

  import asyncio
+ import functools
  import logging
  import signal
  import sys
@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Seque
  from contextlib import AsyncExitStack, suppress
  from datetime import timedelta
  from functools import partial
+ from io import StringIO
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+ from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
  from urllib.parse import ParseResult, urlparse
  from weakref import WeakKeyDictionary

@@ -31,6 +33,8 @@ from crawlee._service_locator import ServiceLocator
  from crawlee._types import (
      BasicCrawlingContext,
      EnqueueLinksKwargs,
+     ExportDataCsvKwargs,
+     ExportDataJsonKwargs,
      GetKeyValueStoreFromRequestHandlerFunction,
      HttpHeaders,
      HttpPayload,
@@ -40,7 +44,7 @@ from crawlee._types import (
      SkippedReason,
  )
  from crawlee._utils.docs import docs_group
- from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+ from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
  from crawlee._utils.recurring_task import RecurringTask
  from crawlee._utils.robots import RobotsTxtFile
  from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -55,6 +59,7 @@ from crawlee.errors import (
      RequestHandlerError,
      SessionError,
      UserDefinedErrorHandlerError,
+     UserHandlerTimeoutError,
  )
  from crawlee.events._types import Event, EventCrawlerStatusData
  from crawlee.http_clients import ImpitHttpClient
@@ -64,6 +69,7 @@ from crawlee.statistics import Statistics, StatisticsState
  from crawlee.storages import Dataset, KeyValueStore, RequestQueue

  from ._context_pipeline import ContextPipeline
+ from ._context_utils import swaped_context
  from ._logging_utils import (
      get_one_line_error_summary_if_possible,
      reduce_asyncio_timeout_error_to_relevant_traceback_parts,
@@ -96,6 +102,9 @@ if TYPE_CHECKING:
  TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
  TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
  TRequestIterator = TypeVar('TRequestIterator', str, Request)
+ TParams = ParamSpec('TParams')
+ T = TypeVar('T')
+
  ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
  FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
  SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -520,6 +529,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
          self._unexpected_stop = True

+     def _wrap_handler_with_error_context(
+         self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+     ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+         """Decorate error handlers to make their context helpers usable."""
+
+         @functools.wraps(handler)
+         async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+             # Original context helpers that are from `RequestHandlerRunResult` will not be commited as the request
+             # failed. Modified context provides context helpers with direct access to the storages.
+             error_context = context.create_modified_copy(
+                 push_data=self._push_data,
+                 get_key_value_store=self.get_key_value_store,
+                 add_requests=functools.partial(self._add_requests, context),
+             )
+             return await handler(error_context, exception)
+
+         return wrapped_handler
+
      def _stop_if_max_requests_count_exceeded(self) -> None:
          """Call `stop` when the maximum number of requests to crawl has been reached."""
          if self._max_requests_per_crawl is None:
@@ -618,7 +645,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

          The error handler is invoked after a request handler error occurs and before a retry attempt.
          """
-         self._error_handler = handler
+         self._error_handler = self._wrap_handler_with_error_context(handler)
          return handler

      def failed_request_handler(
@@ -628,7 +655,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

          The failed request handler is invoked when a request has failed all retry attempts.
          """
-         self._failed_request_handler = handler
+         self._failed_request_handler = self._wrap_handler_with_error_context(handler)
          return handler

      def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
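
A practical consequence of the `_wrap_handler_with_error_context` wrapper above: storage helpers now work inside error and failed-request handlers even though the failed request's own handler result is never committed. A minimal sketch (the handler body and the dataset fields are illustrative, not taken from this diff):

    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

    crawler = BasicCrawler()

    @crawler.failed_request_handler
    async def record_failure(context: BasicCrawlingContext, error: Exception) -> None:
        # push_data() writes straight to the dataset here, because the wrapped handler
        # receives a modified context with direct storage access.
        await context.push_data({'url': context.request.url, 'error': str(error)})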
@@ -846,6 +873,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          dataset_id: str | None = None,
          dataset_name: str | None = None,
          dataset_alias: str | None = None,
+         **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],  # type: ignore[misc]
      ) -> None:
          """Export all items from a Dataset to a JSON or CSV file.

@@ -858,6 +886,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
              dataset_id: The ID of the Dataset to export from.
              dataset_name: The name of the Dataset to export from (global scope, named storage).
              dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+             additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
          """
          dataset = await Dataset.open(
              id=dataset_id,
@@ -867,13 +896,18 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
              configuration=self._service_locator.get_configuration(),
          )

-         path = path if isinstance(path, Path) else Path(path)
-         dst = path.open('w', newline='')
+         path = Path(path)

          if path.suffix == '.csv':
-             await export_csv_to_stream(dataset.iterate_items(), dst)
+             dst = StringIO()
+             csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+             await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+             await atomic_write(path, dst.getvalue())
          elif path.suffix == '.json':
-             await export_json_to_stream(dataset.iterate_items(), dst)
+             dst = StringIO()
+             json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+             await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+             await atomic_write(path, dst.getvalue())
          else:
              raise ValueError(f'Unsupported file extension: {path.suffix}')

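With `**additional_kwargs`, format-specific options now pass straight through `export_data`, and the output is assembled in a `StringIO` buffer and written via `atomic_write`. A hedged sketch, assuming the kwargs mirror the usual `csv.writer` / `json.dump` options implied by the `ExportDataCsvKwargs` / `ExportDataJsonKwargs` names:

    import asyncio

    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


    async def main() -> None:
        crawler = BasicCrawler()

        @crawler.router.default_handler
        async def handler(context: BasicCrawlingContext) -> None:
            await context.push_data({'url': context.request.url})

        await crawler.run(['https://example.com'])
        # Keyword arguments beyond the path go to the exporter picked by the file suffix.
        await crawler.export_data('results.csv', delimiter=';')  # assumed ExportDataCsvKwargs field
        await crawler.export_data('results.json', indent=2)      # assumed ExportDataJsonKwargs field


    asyncio.run(main())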
@@ -1005,7 +1039,12 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          warning_flag = True

          for request in request_iterator:
-             target_url = request.url if isinstance(request, Request) else request
+             if isinstance(request, Request):
+                 if request.enqueue_strategy != strategy:
+                     request.enqueue_strategy = strategy
+                 target_url = request.url
+             else:
+                 target_url = request
              parsed_target_url = urlparse(target_url)

              if warning_flag and strategy != 'all' and not parsed_target_url.hostname:
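
The hunk above also stamps the active `strategy` onto explicit `Request` objects, so hand-built requests are filtered the same way as plain URL strings. An illustrative sketch (handler body and URLs are made up):

    from crawlee import Request
    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # Both the bare URL and the prebuilt Request end up with enqueue_strategy='same-domain'.
        await context.add_requests(
            ['https://example.com/a', Request.from_url('https://example.com/b')],
            strategy='same-domain',
        )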
@@ -1043,8 +1082,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
              return target_url.hostname == origin_url.hostname

          if strategy == 'same-domain':
-             origin_domain = self._tld_extractor.extract_str(origin_url.hostname).domain
-             target_domain = self._tld_extractor.extract_str(target_url.hostname).domain
+             origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+             target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
              return origin_domain == target_domain

          if strategy == 'same-origin':
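
The `same-domain` strategy now compares the registrable domain (domain plus public suffix) instead of the bare `domain` label, so for example `example.co.uk` and `example.com` are no longer treated as the same site. A small illustration of the underlying tldextract behaviour, assuming a tldextract version in which `top_domain_under_public_suffix` supersedes the older `registered_domain` property:

    from tldextract import TLDExtract

    extractor = TLDExtract()

    # The bare .domain label drops the suffix, so unrelated sites can collide:
    assert extractor.extract_str('shop.example.co.uk').domain == 'example'
    assert extractor.extract_str('www.example.com').domain == 'example'

    # The registrable domain keeps the suffix and tells them apart:
    assert extractor.extract_str('shop.example.co.uk').top_domain_under_public_suffix == 'example.co.uk'
    assert extractor.extract_str('www.example.com').top_domain_under_public_suffix == 'example.com'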
@@ -1102,7 +1141,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
              request.retry_count += 1
              reduced_error = str(error).split('\n')[0]
              self.log.warning(
-                 f'Retrying request to {context.request.url} due to: {reduced_error}'
+                 f'Retrying request to {context.request.url} due to: {reduced_error}. '
                  f'{get_one_line_error_summary_if_possible(error)}'
              )
              await self._statistics.error_tracker.add(error=error, context=context)
@@ -1113,19 +1152,15 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                  except Exception as e:
                      raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
                  else:
-                     if new_request is not None:
-                         request = new_request
+                     if new_request is not None and new_request != request:
+                         await request_manager.add_request(new_request)
+                         await self._mark_request_as_handled(request)
+                         return

              await request_manager.reclaim_request(request)
          else:
-             await wait_for(
-                 lambda: request_manager.mark_request_as_handled(context.request),
-                 timeout=self._internal_timeout,
-                 timeout_message='Marking request as handled timed out after '
-                 f'{self._internal_timeout.total_seconds()} seconds',
-                 logger=self._logger,
-                 max_retries=3,
-             )
+             request.state = RequestState.ERROR
+             await self._mark_request_as_handled(request)
              await self._handle_failed_request(context, error)
              self._statistics.record_request_processing_failure(request.unique_key)

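Per the `ErrorHandler` alias earlier in this file, an error handler may return `Request | None`. With the change above, a returned request that differs from the current one is enqueued as a new request and the original is marked as handled, rather than the returned object being reclaimed in place; returning `None` keeps the normal retry of the original. A hedged sketch with an illustrative handler:

    from crawlee import Request
    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

    crawler = BasicCrawler()

    @crawler.error_handler
    async def retry_with_cache_buster(context: BasicCrawlingContext, error: Exception) -> Request | None:
        if 'blocked' in str(error).lower():
            # A different request: added to the queue, the original is marked as handled.
            return Request.from_url(f'{context.request.url}?cache_bust=1')
        # None: the original request is reclaimed and retried as before.
        return None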
@@ -1140,8 +1175,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                  f'{self._internal_timeout.total_seconds()} seconds',
                  logger=self._logger,
              )
-
-             context.request.state = RequestState.DONE

          except UserDefinedErrorHandlerError:
              context.request.state = RequestState.ERROR
@@ -1174,17 +1207,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
      ) -> None:
          if need_mark and isinstance(request, Request):
-             request_manager = await self.get_request_manager()
-
-             await wait_for(
-                 lambda: request_manager.mark_request_as_handled(request),
-                 timeout=self._internal_timeout,
-                 timeout_message='Marking request as handled timed out after '
-                 f'{self._internal_timeout.total_seconds()} seconds',
-                 logger=self._logger,
-                 max_retries=3,
-             )
              request.state = RequestState.SKIPPED
+             await self._mark_request_as_handled(request)

          url = request.url if isinstance(request, Request) else request

@@ -1204,10 +1228,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):

          if (
              isinstance(error, asyncio.exceptions.TimeoutError)
+             and traceback_parts
              and self._request_handler_timeout_text in traceback_parts[-1]
-         ):
+         ) or isinstance(error, UserHandlerTimeoutError):
              used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
-             used_traceback_parts.append(traceback_parts[-1])
+             used_traceback_parts.extend(traceback_parts[-1:])

          return ''.join(used_traceback_parts).strip('\n')

@@ -1256,58 +1281,54 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          else:
              yield Request.from_url(url)

-     async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
-         """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
-         result = self._context_result_map[context]
-
-         base_request_manager = await self.get_request_manager()
-
-         origin = context.request.loaded_url or context.request.url
-
-         for add_requests_call in result.add_requests_calls:
-             rq_id = add_requests_call.get('rq_id')
-             rq_name = add_requests_call.get('rq_name')
-             rq_alias = add_requests_call.get('rq_alias')
-             specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
-             if specified_params > 1:
-                 raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.')
-             if rq_id or rq_name or rq_alias:
-                 request_manager: RequestManager | RequestQueue = await RequestQueue.open(
-                     id=rq_id,
-                     name=rq_name,
-                     alias=rq_alias,
-                     storage_client=self._service_locator.get_storage_client(),
-                     configuration=self._service_locator.get_configuration(),
-                 )
-             else:
-                 request_manager = base_request_manager
-
-             requests = list[Request]()
-
-             base_url = url if (url := add_requests_call.get('base_url')) else origin
-
-             requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
+     async def _add_requests(
+         self,
+         context: BasicCrawlingContext,
+         requests: Sequence[str | Request],
+         rq_id: str | None = None,
+         rq_name: str | None = None,
+         rq_alias: str | None = None,
+         **kwargs: Unpack[EnqueueLinksKwargs],
+     ) -> None:
+         """Add requests method aware of the crawling context."""
+         if rq_id or rq_name or rq_alias:
+             request_manager: RequestManager = await RequestQueue.open(
+                 id=rq_id,
+                 name=rq_name,
+                 alias=rq_alias,
+                 storage_client=self._service_locator.get_storage_client(),
+                 configuration=self._service_locator.get_configuration(),
+             )
+         else:
+             request_manager = await self.get_request_manager()

-             enqueue_links_kwargs: EnqueueLinksKwargs = {k: v for k, v in add_requests_call.items() if k != 'requests'}  # type: ignore[assignment]
+         context_aware_requests = list[Request]()
+         base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+         requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+         filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+         for dst_request in filter_requests_iterator:
+             # Update the crawl depth of the request.
+             dst_request.crawl_depth = context.request.crawl_depth + 1

-             filter_requests_iterator = self._enqueue_links_filter_iterator(
-                 requests_iterator, context.request.url, **enqueue_links_kwargs
-             )
+             if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                 context_aware_requests.append(dst_request)

-             for dst_request in filter_requests_iterator:
-                 # Update the crawl depth of the request.
-                 dst_request.crawl_depth = context.request.crawl_depth + 1
+         return await request_manager.add_requests(context_aware_requests)

-                 if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
-                     requests.append(dst_request)
+     async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+         """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+         result = self._context_result_map[context]

-             await request_manager.add_requests(requests)
+         for add_requests_call in result.add_requests_calls:
+             await self._add_requests(context, **add_requests_call)

          for push_data_call in result.push_data_calls:
              await self._push_data(**push_data_call)

          await self._commit_key_value_store_changes(result, get_kvs=self.get_key_value_store)

+         result.apply_request_changes(target=context.request)
+
      @staticmethod
      async def _commit_key_value_store_changes(
          result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
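
Factoring the logic into `_add_requests` lets the same code path serve both committed handler results and the direct `add_requests` helper handed to error handlers, including routing to another request queue via `rq_id`, `rq_name` or `rq_alias` (only one of which should be given, as the removed validation suggests). A sketch of the corresponding user-facing call, assuming the public helper forwards these arguments as the removed code implies:

    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # Send follow-up requests to a separate, aliased request queue instead of the default one.
        await context.add_requests(['https://example.com/archive'], rq_alias='secondary')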
@@ -1373,10 +1394,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          else:
              session = await self._get_session()
              proxy_info = await self._get_proxy_info(request, session)
-             result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+             result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request)

          context = BasicCrawlingContext(
-             request=request,
+             request=result.request,
              session=session,
              proxy_info=proxy_info,
              send_request=self._prepare_send_request_function(session, proxy_info),
@@ -1393,32 +1414,26 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          try:
              request.state = RequestState.REQUEST_HANDLER

-             self._check_request_collision(context.request, context.session)
-
              try:
-                 await self._run_request_handler(context=context)
+                 with swaped_context(context, request):
+                     self._check_request_collision(request, session)
+                     await self._run_request_handler(context=context)
              except asyncio.TimeoutError as e:
                  raise RequestHandlerError(e, context) from e

              await self._commit_request_handler_result(context)
-             await wait_for(
-                 lambda: request_manager.mark_request_as_handled(context.request),
-                 timeout=self._internal_timeout,
-                 timeout_message='Marking request as handled timed out after '
-                 f'{self._internal_timeout.total_seconds()} seconds',
-                 logger=self._logger,
-                 max_retries=3,
-             )

              request.state = RequestState.DONE

-             if context.session and context.session.is_usable:
-                 context.session.mark_good()
+             await self._mark_request_as_handled(request)
+
+             if session and session.is_usable:
+                 session.mark_good()

              self._statistics.record_request_processing_finish(request.unique_key)

          except RequestCollisionError as request_error:
-             context.request.no_retry = True
+             request.no_retry = True
              await self._handle_request_error(context, request_error)

          except RequestHandlerError as primary_error:
@@ -1433,7 +1448,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
              await self._handle_request_error(primary_error.crawling_context, primary_error.wrapped_exception)

          except SessionError as session_error:
-             if not context.session:
+             if not session:
                  raise RuntimeError('SessionError raised in a crawling context without a session') from session_error

              if self._error_handler:
@@ -1443,22 +1458,16 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                  exc_only = ''.join(traceback.format_exception_only(session_error)).strip()
                  self._logger.warning('Encountered "%s", rotating session and retrying...', exc_only)

-                 context.session.retire()
+                 if session:
+                     session.retire()

                  # Increment session rotation count.
-                 context.request.session_rotation_count = (context.request.session_rotation_count or 0) + 1
+                 request.session_rotation_count = (request.session_rotation_count or 0) + 1

                  await request_manager.reclaim_request(request)
                  await self._statistics.error_tracker_retry.add(error=session_error, context=context)
              else:
-                 await wait_for(
-                     lambda: request_manager.mark_request_as_handled(context.request),
-                     timeout=self._internal_timeout,
-                     timeout_message='Marking request as handled timed out after '
-                     f'{self._internal_timeout.total_seconds()} seconds',
-                     logger=self._logger,
-                     max_retries=3,
-                 )
+                 await self._mark_request_as_handled(request)

                  await self._handle_failed_request(context, session_error)
                  self._statistics.record_request_processing_failure(request.unique_key)
@@ -1466,14 +1475,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
          except ContextPipelineInterruptedError as interrupted_error:
              self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)

-             await wait_for(
-                 lambda: request_manager.mark_request_as_handled(context.request),
-                 timeout=self._internal_timeout,
-                 timeout_message='Marking request as handled timed out after '
-                 f'{self._internal_timeout.total_seconds()} seconds',
-                 logger=self._logger,
-                 max_retries=3,
-             )
+             await self._mark_request_as_handled(request)

          except ContextPipelineInitializationError as initialization_error:
              self._logger.debug(
@@ -1491,12 +1493,16 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
              raise

      async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-         await wait_for(
-             lambda: self._context_pipeline(context, self.router),
-             timeout=self._request_handler_timeout,
-             timeout_message=f'{self._request_handler_timeout_text}'
-             f' {self._request_handler_timeout.total_seconds()} seconds',
-             logger=self._logger,
+         context.request.state = RequestState.BEFORE_NAV
+         await self._context_pipeline(
+             context,
+             lambda final_context: wait_for(
+                 lambda: self.router(final_context),
+                 timeout=self._request_handler_timeout,
+                 timeout_message=f'{self._request_handler_timeout_text}'
+                 f' {self._request_handler_timeout.total_seconds()} seconds',
+                 logger=self._logger,
+             ),
          )

      def _raise_for_error_status_code(self, status_code: int) -> None:
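
`_run_request_handler` no longer wraps the whole context pipeline in the timeout; `request_handler_timeout` now applies only to the final router call, so time spent in pipeline middleware (for example page navigation in subclasses) does not eat into the handler budget. The configuration surface is unchanged; a minimal sketch:

    from datetime import timedelta

    from crawlee.crawlers import BasicCrawler

    # The timeout budget is spent on the user request handler (the router call),
    # not on the context pipeline wrapped around it.
    crawler = BasicCrawler(request_handler_timeout=timedelta(seconds=30))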
@@ -1644,3 +1650,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
              )

          self._previous_crawler_state = current_state
+
+     async def _mark_request_as_handled(self, request: Request) -> None:
+         request_manager = await self.get_request_manager()
+         await wait_for(
+             lambda: request_manager.mark_request_as_handled(request),
+             timeout=self._internal_timeout,
+             timeout_message='Marking request as handled timed out after '
+             f'{self._internal_timeout.total_seconds()} seconds',
+             logger=self._logger,
+             max_retries=3,
+         )
crawlee/crawlers/_basic/_context_utils.py (new file)

@@ -0,0 +1,24 @@
+ from __future__ import annotations
+
+ from contextlib import contextmanager
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from collections.abc import Iterator
+
+     from crawlee._request import Request
+
+     from ._basic_crawling_context import BasicCrawlingContext
+
+
+ @contextmanager
+ def swaped_context(
+     context: BasicCrawlingContext,
+     request: Request,
+ ) -> Iterator[None]:
+     """Replace context's isolated copies with originals after handler execution."""
+     try:
+         yield
+     finally:
+         # Restore original context state to avoid side effects between different handlers.
+         object.__setattr__(context, 'request', request)
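
The helper restores a swapped field on the crawling context via `object.__setattr__`, which is needed because, assuming `BasicCrawlingContext` is a frozen dataclass, plain attribute assignment would raise `FrozenInstanceError`. A generic sketch of the same pattern outside crawlee:

    from collections.abc import Iterator
    from contextlib import contextmanager
    from dataclasses import dataclass


    @dataclass(frozen=True)
    class Ctx:
        request: str


    @contextmanager
    def swapped(ctx: Ctx, original: str) -> Iterator[None]:
        try:
            yield
        finally:
            # Frozen dataclasses reject `ctx.request = ...`, so bypass __setattr__ directly.
            object.__setattr__(ctx, 'request', original)


    ctx = Ctx(request='isolated-copy')
    with swapped(ctx, 'original'):
        pass  # the handler would run against the isolated copy here
    assert ctx.request == 'original'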
crawlee/crawlers/_basic/_logging_utils.py

@@ -2,9 +2,21 @@ import asyncio
  import re
  import traceback

+ import crawlee.errors
+

  def _get_only_innermost_exception(error: BaseException) -> BaseException:
-     """Get innermost exception by following __cause__ and __context__ attributes of exception."""
+     """Get innermost exception by following __cause__ and __context__ attributes of exception.
+
+     If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible.
+     """
+     if type(error) is crawlee.errors.UserHandlerTimeoutError:
+         if error.__cause__:
+             return error.__cause__
+         if error.__context__:
+             return error.__context__
+         return error
+
      if error.__cause__:
          return _get_only_innermost_exception(error.__cause__)
      if error.__context__:
@@ -34,7 +46,7 @@ def _strip_pep657_highlighting(traceback_part: str) -> str:


  def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
-     timeout_error: asyncio.exceptions.TimeoutError,
+     timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,
  ) -> list[str]:
      innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error)
      return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)
@@ -43,13 +55,20 @@ def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
  def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
      innermost_error = _get_only_innermost_exception(error)
      return traceback.format_exception(
-         type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=True
+         type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False
      )


  def get_one_line_error_summary_if_possible(error: Exception) -> str:
      if isinstance(error, asyncio.exceptions.TimeoutError):
-         most_relevant_part = ',' + reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+         relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
+         most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else ''
+     elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):
+         # Error is user defined handler. First two lines should be location of the `UserHandlerTimeoutError` in crawlee
+         # code and third line the topmost user error
+         traceback_parts = _get_traceback_parts_for_innermost_exception(error)
+         relevant_index_from_start = 3
+         most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else ''
      elif 'playwright._impl._errors.Error' in str(error.__class__):
          # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
          # point to deep internals.
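
The summary logic walks `__cause__` / `__context__` down to the innermost exception, and for `UserHandlerTimeoutError` it now stops one level early so the user's own error is surfaced instead of crawlee's timeout wrapper. A self-contained illustration of the chain attributes this traversal relies on:

    def innermost(error: BaseException) -> BaseException:
        # Same traversal as above: `raise ... from ...` sets __cause__, while raising
        # inside an except block sets __context__.
        if error.__cause__:
            return innermost(error.__cause__)
        if error.__context__:
            return innermost(error.__context__)
        return error


    try:
        try:
            raise ValueError('root cause inside the user handler')
        except ValueError as inner:
            raise TimeoutError('handler timed out') from inner
    except TimeoutError as outer:
        assert isinstance(innermost(outer), ValueError)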
crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py

@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
  from bs4 import BeautifulSoup, Tag

  from crawlee._utils.docs import docs_group
- from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+ from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions

  from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
  from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Bea
          self,
          *,
          parser: BeautifulSoupParserType = 'lxml',
-         **kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
+         **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
      ) -> None:
          """Initialize a new instance.

crawlee/crawlers/_parsel/_parsel_crawler.py

@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
  from parsel import Selector

  from crawlee._utils.docs import docs_group
- from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+ from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions

  from ._parsel_crawling_context import ParselCrawlingContext
  from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ class ParselCrawler(AbstractHttpCrawler[ParselCrawlingContext, Selector, Selecto

      def __init__(
          self,
-         **kwargs: Unpack[BasicCrawlerOptions[ParselCrawlingContext]],
+         **kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
      ) -> None:
          """Initialize a new instance.

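Both crawlers now type their `**kwargs` as `HttpCrawlerOptions` rather than `BasicCrawlerOptions`; judging by the accompanying `_abstract_http_crawler` changes, this is presumably a superset that adds HTTP-specific options while keeping the existing ones. A hedged sketch showing that ordinary construction is unaffected:

    from crawlee.crawlers import ParselCrawler

    # Existing BasicCrawler options such as max_request_retries are still accepted;
    # the narrower TypedDict mainly tightens static checking of the crawler kwargs.
    crawler = ParselCrawler(max_request_retries=2)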