crawlee 1.0.3b6__py3-none-any.whl → 1.2.2b24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_browserforge_workaround.py +7 -3
  3. crawlee/_request.py +32 -13
  4. crawlee/_service_locator.py +4 -4
  5. crawlee/_types.py +44 -5
  6. crawlee/_utils/context.py +3 -3
  7. crawlee/_utils/file.py +8 -1
  8. crawlee/_utils/globs.py +4 -4
  9. crawlee/_utils/recoverable_state.py +32 -8
  10. crawlee/_utils/recurring_task.py +27 -3
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +13 -6
  13. crawlee/_utils/system.py +27 -11
  14. crawlee/_utils/time.py +41 -1
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +5 -2
  17. crawlee/browsers/_playwright_browser.py +2 -1
  18. crawlee/browsers/_playwright_browser_controller.py +1 -1
  19. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  20. crawlee/browsers/_types.py +1 -1
  21. crawlee/configuration.py +3 -1
  22. crawlee/crawlers/__init__.py +5 -1
  23. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  24. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
  25. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  28. crawlee/crawlers/_basic/_basic_crawler.py +156 -131
  29. crawlee/crawlers/_basic/_context_utils.py +24 -0
  30. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  31. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  32. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/errors.py +4 -0
  39. crawlee/events/_event_manager.py +12 -6
  40. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/http_clients/_base.py +4 -0
  43. crawlee/http_clients/_curl_impersonate.py +68 -14
  44. crawlee/http_clients/_httpx.py +16 -6
  45. crawlee/http_clients/_impit.py +25 -10
  46. crawlee/otel/crawler_instrumentor.py +4 -6
  47. crawlee/request_loaders/_sitemap_request_loader.py +23 -5
  48. crawlee/router.py +13 -3
  49. crawlee/sessions/_cookies.py +13 -8
  50. crawlee/sessions/_models.py +3 -3
  51. crawlee/sessions/_session_pool.py +1 -1
  52. crawlee/statistics/_error_snapshotter.py +1 -1
  53. crawlee/statistics/_models.py +51 -9
  54. crawlee/statistics/_statistics.py +24 -33
  55. crawlee/storage_clients/__init__.py +4 -0
  56. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  57. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  58. crawlee/storage_clients/_file_system/_dataset_client.py +6 -5
  59. crawlee/storage_clients/_file_system/_key_value_store_client.py +7 -4
  60. crawlee/storage_clients/_file_system/_request_queue_client.py +29 -10
  61. crawlee/storage_clients/_redis/__init__.py +6 -0
  62. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  63. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  64. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  65. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  66. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  67. crawlee/storage_clients/_redis/_utils.py +23 -0
  68. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  69. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  70. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  71. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  72. crawlee/storage_clients/_redis/py.typed +0 -0
  73. crawlee/storage_clients/_sql/_client_mixin.py +1 -1
  74. crawlee/storage_clients/_sql/_db_models.py +1 -2
  75. crawlee/storage_clients/models.py +8 -3
  76. crawlee/storages/_key_value_store.py +5 -2
  77. crawlee/storages/_storage_instance_manager.py +103 -44
  78. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/METADATA +14 -16
  79. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/RECORD +82 -69
  80. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/WHEEL +1 -1
  81. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/entry_points.txt +0 -0
  82. {crawlee-1.0.3b6.dist-info → crawlee-1.2.2b24.dist-info}/licenses/LICENSE +0 -0
crawlee/crawlers/_basic/_basic_crawler.py

@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import logging
 import signal
 import sys
@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Seque
 from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
+from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary
 
@@ -31,6 +33,8 @@ from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
+    ExportDataCsvKwargs,
+    ExportDataJsonKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,
@@ -40,7 +44,7 @@ from crawlee._types import (
     SkippedReason,
 )
 from crawlee._utils.docs import docs_group
-from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -55,6 +59,7 @@ from crawlee.errors import (
     RequestHandlerError,
     SessionError,
     UserDefinedErrorHandlerError,
+    UserHandlerTimeoutError,
 )
 from crawlee.events._types import Event, EventCrawlerStatusData
 from crawlee.http_clients import ImpitHttpClient
@@ -64,6 +69,7 @@ from crawlee.statistics import Statistics, StatisticsState
 from crawlee.storages import Dataset, KeyValueStore, RequestQueue
 
 from ._context_pipeline import ContextPipeline
+from ._context_utils import swapped_context
 from ._logging_utils import (
     get_one_line_error_summary_if_possible,
     reduce_asyncio_timeout_error_to_relevant_traceback_parts,
@@ -96,6 +102,9 @@ if TYPE_CHECKING:
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TRequestIterator = TypeVar('TRequestIterator', str, Request)
+TParams = ParamSpec('TParams')
+T = TypeVar('T')
+
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
 SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -401,7 +410,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]()
 
         # Context pipeline
-        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
+        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)  # ty: ignore[invalid-argument-type]
 
         # Crawl settings
         self._max_request_retries = max_request_retries
@@ -437,14 +446,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._statistics_log_format = statistics_log_format
 
         # Statistics
-        self._statistics = statistics or cast(
-            'Statistics[TStatisticsState]',
-            Statistics.with_default_state(
-                periodic_message_logger=self._logger,
-                statistics_log_format=self._statistics_log_format,
-                log_message='Current request statistics:',
-            ),
-        )
+        if statistics:
+            self._statistics = statistics
+        else:
+
+            async def persist_state_factory() -> KeyValueStore:
+                return await self.get_key_value_store()
+
+            self._statistics = cast(
+                'Statistics[TStatisticsState]',
+                Statistics.with_default_state(
+                    persistence_enabled=True,
+                    periodic_message_logger=self._logger,
+                    statistics_log_format=self._statistics_log_format,
+                    log_message='Current request statistics:',
+                    persist_state_kvs_factory=persist_state_factory,
+                ),
+            )
 
         # Additional context managers to enter and exit
         self._additional_context_managers = _additional_context_managers or []
@@ -511,6 +529,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
         self._unexpected_stop = True
 
+    def _wrap_handler_with_error_context(
+        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+        """Decorate error handlers to make their context helpers usable."""
+
+        @functools.wraps(handler)
+        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+            # Original context helpers that are from `RequestHandlerRunResult` will not be committed as the request
+            # failed. Modified context provides context helpers with direct access to the storages.
+            error_context = context.create_modified_copy(
+                push_data=self._push_data,
+                get_key_value_store=self.get_key_value_store,
+                add_requests=functools.partial(self._add_requests, context),
+            )
+            return await handler(error_context, exception)
+
+        return wrapped_handler
+
     def _stop_if_max_requests_count_exceeded(self) -> None:
         """Call `stop` when the maximum number of requests to crawl has been reached."""
         if self._max_requests_per_crawl is None:
@@ -609,7 +645,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The error handler is invoked after a request handler error occurs and before a retry attempt.
         """
-        self._error_handler = handler
+        self._error_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def failed_request_handler(
@@ -619,7 +655,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The failed request handler is invoked when a request has failed all retry attempts.
         """
-        self._failed_request_handler = handler
+        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
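
The two hunks above route user-registered error and failed-request handlers through _wrap_handler_with_error_context, so the context helpers (push_data, get_key_value_store, add_requests) passed to those handlers write directly to storage instead of going through the RequestHandlerRunResult that is discarded when a request fails. A minimal sketch of a handler relying on this behaviour; the simulated failure and the recorded fields are illustrative assumptions, not taken from this diff:

    import asyncio

    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


    async def main() -> None:
        crawler = BasicCrawler(max_request_retries=1)

        @crawler.router.default_handler
        async def default_handler(context: BasicCrawlingContext) -> None:
            raise RuntimeError('simulated handler failure')  # drive the request into the failed path

        @crawler.failed_request_handler
        async def on_failed(context: BasicCrawlingContext, error: Exception) -> None:
            # With the wrapped context, this write is persisted even though the request itself failed.
            await context.push_data({'url': context.request.url, 'error': str(error)})

        await crawler.run(['https://example.com'])


    asyncio.run(main())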
@@ -689,7 +725,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         except CancelledError:
             pass
         finally:
-            await self._crawler_state_rec_task.stop()
             if threading.current_thread() is threading.main_thread():
                 with suppress(NotImplementedError):
                     asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -721,8 +756,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
     async def _run_crawler(self) -> None:
         event_manager = self._service_locator.get_event_manager()
 
-        self._crawler_state_rec_task.start()
-
         # Collect the context managers to be entered. Context managers that are already active are excluded,
         # as they were likely entered by the caller, who will also be responsible for exiting them.
         contexts_to_enter = [
@@ -733,6 +766,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 self._statistics,
                 self._session_pool if self._use_session_pool else None,
                 self._http_client,
+                self._crawler_state_rec_task,
                 *self._additional_context_managers,
             )
             if cm and getattr(cm, 'active', False) is False
@@ -740,7 +774,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         async with AsyncExitStack() as exit_stack:
             for context in contexts_to_enter:
-                await exit_stack.enter_async_context(context)  # type: ignore[arg-type]
+                await exit_stack.enter_async_context(context)  # ty: ignore[invalid-argument-type]
 
             await self._autoscaled_pool.run()
 
@@ -839,6 +873,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
+        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.
 
@@ -851,6 +886,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             dataset_id: The ID of the Dataset to export from.
             dataset_name: The name of the Dataset to export from (global scope, named storage).
             dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
         """
         dataset = await Dataset.open(
             id=dataset_id,
@@ -860,13 +896,18 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             configuration=self._service_locator.get_configuration(),
         )
 
-        path = path if isinstance(path, Path) else Path(path)
-        dst = path.open('w', newline='')
+        path = Path(path)
 
         if path.suffix == '.csv':
-            await export_csv_to_stream(dataset.iterate_items(), dst)
+            dst = StringIO()
+            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+            await atomic_write(path, dst.getvalue())
         elif path.suffix == '.json':
-            await export_json_to_stream(dataset.iterate_items(), dst)
+            dst = StringIO()
+            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+            await atomic_write(path, dst.getvalue())
         else:
             raise ValueError(f'Unsupported file extension: {path.suffix}')
 
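
For context, export_data now renders the whole export into an in-memory StringIO buffer, writes it with atomic_write, and forwards any extra keyword arguments to the CSV or JSON serializer via ExportDataCsvKwargs / ExportDataJsonKwargs. A hedged usage sketch; the specific keyword names used below (delimiter, indent) are assumed to mirror the standard csv/json module options and are not confirmed by this diff:

    from crawlee.crawlers import BasicCrawler


    async def export_results(crawler: BasicCrawler) -> None:
        # The file suffix selects the exporter; extra kwargs go to the matching serializer.
        await crawler.export_data('results.csv', delimiter=';')
        await crawler.export_data('results.json', indent=2)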
@@ -972,6 +1013,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 label=label,
                 user_data=user_data,
                 transform_request_function=transform_request_function,
+                **kwargs,
             ),
             rq_id=rq_id,
             rq_name=rq_name,
@@ -997,7 +1039,12 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             warning_flag = True
 
             for request in request_iterator:
-                target_url = request.url if isinstance(request, Request) else request
+                if isinstance(request, Request):
+                    if request.enqueue_strategy != strategy:
+                        request.enqueue_strategy = strategy
+                    target_url = request.url
+                else:
+                    target_url = request
                 parsed_target_url = urlparse(target_url)
 
                 if warning_flag and strategy != 'all' and not parsed_target_url.hostname:
@@ -1035,8 +1082,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             return target_url.hostname == origin_url.hostname
 
         if strategy == 'same-domain':
-            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).domain
-            target_domain = self._tld_extractor.extract_str(target_url.hostname).domain
+            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
             return origin_domain == target_domain
 
         if strategy == 'same-origin':
@@ -1094,7 +1141,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             request.retry_count += 1
             reduced_error = str(error).split('\n')[0]
             self.log.warning(
-                f'Retrying request to {context.request.url} due to: {reduced_error}'
+                f'Retrying request to {context.request.url} due to: {reduced_error}. '
                 f'{get_one_line_error_summary_if_possible(error)}'
             )
             await self._statistics.error_tracker.add(error=error, context=context)
@@ -1105,19 +1152,15 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 except Exception as e:
                     raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
                 else:
-                    if new_request is not None:
-                        request = new_request
+                    if new_request is not None and new_request != request:
+                        await request_manager.add_request(new_request)
+                        await self._mark_request_as_handled(request)
+                        return
 
             await request_manager.reclaim_request(request)
         else:
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            request.state = RequestState.ERROR
+            await self._mark_request_as_handled(request)
             await self._handle_failed_request(context, error)
             self._statistics.record_request_processing_failure(request.unique_key)
 
@@ -1132,8 +1175,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 f'{self._internal_timeout.total_seconds()} seconds',
                 logger=self._logger,
             )
-
-            context.request.state = RequestState.DONE
         except UserDefinedErrorHandlerError:
             context.request.state = RequestState.ERROR
             raise
@@ -1166,17 +1207,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
     ) -> None:
         if need_mark and isinstance(request, Request):
-            request_manager = await self.get_request_manager()
-
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
             request.state = RequestState.SKIPPED
+            await self._mark_request_as_handled(request)
 
         url = request.url if isinstance(request, Request) else request
 
@@ -1196,10 +1228,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         if (
             isinstance(error, asyncio.exceptions.TimeoutError)
+            and traceback_parts
             and self._request_handler_timeout_text in traceback_parts[-1]
-        ):
+        ) or isinstance(error, UserHandlerTimeoutError):
             used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
-            used_traceback_parts.append(traceback_parts[-1])
+            used_traceback_parts.extend(traceback_parts[-1:])
 
         return ''.join(used_traceback_parts).strip('\n')
 
@@ -1248,58 +1281,54 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         else:
             yield Request.from_url(url)
 
-    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
-        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
-        result = self._context_result_map[context]
-
-        base_request_manager = await self.get_request_manager()
-
-        origin = context.request.loaded_url or context.request.url
-
-        for add_requests_call in result.add_requests_calls:
-            rq_id = add_requests_call.get('rq_id')
-            rq_name = add_requests_call.get('rq_name')
-            rq_alias = add_requests_call.get('rq_alias')
-            specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
-            if specified_params > 1:
-                raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.')
-            if rq_id or rq_name or rq_alias:
-                request_manager: RequestManager | RequestQueue = await RequestQueue.open(
-                    id=rq_id,
-                    name=rq_name,
-                    alias=rq_alias,
-                    storage_client=self._service_locator.get_storage_client(),
-                    configuration=self._service_locator.get_configuration(),
-                )
-            else:
-                request_manager = base_request_manager
-
-            requests = list[Request]()
-
-            base_url = url if (url := add_requests_call.get('base_url')) else origin
-
-            requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
+    async def _add_requests(
+        self,
+        context: BasicCrawlingContext,
+        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> None:
+        """Add requests method aware of the crawling context."""
+        if rq_id or rq_name or rq_alias:
+            request_manager: RequestManager = await RequestQueue.open(
+                id=rq_id,
+                name=rq_name,
+                alias=rq_alias,
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
+        else:
+            request_manager = await self.get_request_manager()
 
-            enqueue_links_kwargs: EnqueueLinksKwargs = {k: v for k, v in add_requests_call.items() if k != 'requests'}  # type: ignore[assignment]
+        context_aware_requests = list[Request]()
+        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+        for dst_request in filter_requests_iterator:
+            # Update the crawl depth of the request.
+            dst_request.crawl_depth = context.request.crawl_depth + 1
 
-            filter_requests_iterator = self._enqueue_links_filter_iterator(
-                requests_iterator, context.request.url, **enqueue_links_kwargs
-            )
+            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                context_aware_requests.append(dst_request)
 
-            for dst_request in filter_requests_iterator:
-                # Update the crawl depth of the request.
-                dst_request.crawl_depth = context.request.crawl_depth + 1
+        return await request_manager.add_requests(context_aware_requests)
 
-                if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
-                    requests.append(dst_request)
+    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+        result = self._context_result_map[context]
 
-            await request_manager.add_requests(requests)
+        for add_requests_call in result.add_requests_calls:
+            await self._add_requests(context, **add_requests_call)
 
         for push_data_call in result.push_data_calls:
             await self._push_data(**push_data_call)
 
         await self._commit_key_value_store_changes(result, get_kvs=self.get_key_value_store)
 
+        result.apply_request_changes(target=context.request)
+
     @staticmethod
     async def _commit_key_value_store_changes(
         result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
@@ -1365,10 +1394,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         else:
             session = await self._get_session()
             proxy_info = await self._get_proxy_info(request, session)
-        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request)
 
         context = BasicCrawlingContext(
-            request=request,
+            request=result.request,
             session=session,
             proxy_info=proxy_info,
             send_request=self._prepare_send_request_function(session, proxy_info),
@@ -1385,32 +1414,26 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         try:
             request.state = RequestState.REQUEST_HANDLER
 
-            self._check_request_collision(context.request, context.session)
-
             try:
-                await self._run_request_handler(context=context)
+                with swapped_context(context, request):
+                    self._check_request_collision(request, session)
+                    await self._run_request_handler(context=context)
             except asyncio.TimeoutError as e:
                 raise RequestHandlerError(e, context) from e
 
             await self._commit_request_handler_result(context)
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
 
             request.state = RequestState.DONE
 
-            if context.session and context.session.is_usable:
-                context.session.mark_good()
+            await self._mark_request_as_handled(request)
+
+            if session and session.is_usable:
+                session.mark_good()
 
             self._statistics.record_request_processing_finish(request.unique_key)
 
         except RequestCollisionError as request_error:
-            context.request.no_retry = True
+            request.no_retry = True
            await self._handle_request_error(context, request_error)
 
         except RequestHandlerError as primary_error:
@@ -1425,7 +1448,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             await self._handle_request_error(primary_error.crawling_context, primary_error.wrapped_exception)
 
         except SessionError as session_error:
-            if not context.session:
+            if not session:
                 raise RuntimeError('SessionError raised in a crawling context without a session') from session_error
 
             if self._error_handler:
@@ -1435,22 +1458,16 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             exc_only = ''.join(traceback.format_exception_only(session_error)).strip()
             self._logger.warning('Encountered "%s", rotating session and retrying...', exc_only)
 
-            context.session.retire()
+            if session:
+                session.retire()
 
             # Increment session rotation count.
-            context.request.session_rotation_count = (context.request.session_rotation_count or 0) + 1
+            request.session_rotation_count = (request.session_rotation_count or 0) + 1
 
             await request_manager.reclaim_request(request)
             await self._statistics.error_tracker_retry.add(error=session_error, context=context)
         else:
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
 
             await self._handle_failed_request(context, session_error)
             self._statistics.record_request_processing_failure(request.unique_key)
@@ -1458,14 +1475,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         except ContextPipelineInterruptedError as interrupted_error:
             self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)
 
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
 
         except ContextPipelineInitializationError as initialization_error:
             self._logger.debug(
@@ -1483,12 +1493,16 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             raise
 
     async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-        await wait_for(
-            lambda: self._context_pipeline(context, self.router),
-            timeout=self._request_handler_timeout,
-            timeout_message=f'{self._request_handler_timeout_text}'
-            f' {self._request_handler_timeout.total_seconds()} seconds',
-            logger=self._logger,
+        context.request.state = RequestState.BEFORE_NAV
+        await self._context_pipeline(
+            context,
+            lambda final_context: wait_for(
+                lambda: self.router(final_context),
+                timeout=self._request_handler_timeout,
+                timeout_message=f'{self._request_handler_timeout_text}'
+                f' {self._request_handler_timeout.total_seconds()} seconds',
+                logger=self._logger,
+            ),
         )
 
     def _raise_for_error_status_code(self, status_code: int) -> None:
@@ -1636,3 +1650,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         )
 
         self._previous_crawler_state = current_state
+
+    async def _mark_request_as_handled(self, request: Request) -> None:
+        request_manager = await self.get_request_manager()
+        await wait_for(
+            lambda: request_manager.mark_request_as_handled(request),
+            timeout=self._internal_timeout,
+            timeout_message='Marking request as handled timed out after '
+            f'{self._internal_timeout.total_seconds()} seconds',
+            logger=self._logger,
+            max_retries=3,
+        )

crawlee/crawlers/_basic/_context_utils.py (new file)

@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from crawlee._request import Request
+
+    from ._basic_crawling_context import BasicCrawlingContext
+
+
+@contextmanager
+def swapped_context(
+    context: BasicCrawlingContext,
+    request: Request,
+) -> Iterator[None]:
+    """Replace context's isolated copies with originals after handler execution."""
+    try:
+        yield
+    finally:
+        # Restore original context state to avoid side effects between different handlers.
+        object.__setattr__(context, 'request', request)

crawlee/crawlers/_basic/_logging_utils.py

@@ -2,9 +2,21 @@ import asyncio
 import re
 import traceback
 
+import crawlee.errors
+
 
 def _get_only_innermost_exception(error: BaseException) -> BaseException:
-    """Get innermost exception by following __cause__ and __context__ attributes of exception."""
+    """Get innermost exception by following __cause__ and __context__ attributes of exception.
+
+    If the innermost exception is UserHandlerTimeoutError, return whatever caused that if possible.
+    """
+    if type(error) is crawlee.errors.UserHandlerTimeoutError:
+        if error.__cause__:
+            return error.__cause__
+        if error.__context__:
+            return error.__context__
+        return error
+
     if error.__cause__:
         return _get_only_innermost_exception(error.__cause__)
     if error.__context__:
@@ -34,7 +46,7 @@ def _strip_pep657_highlighting(traceback_part: str) -> str:
 
 
 def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
-    timeout_error: asyncio.exceptions.TimeoutError,
+    timeout_error: asyncio.exceptions.TimeoutError | crawlee.errors.UserHandlerTimeoutError,
 ) -> list[str]:
     innermost_error_traceback_parts = _get_traceback_parts_for_innermost_exception(timeout_error)
     return _get_filtered_traceback_parts_for_asyncio_timeout_error(innermost_error_traceback_parts)
@@ -43,13 +55,20 @@ def reduce_asyncio_timeout_error_to_relevant_traceback_parts(
 def _get_traceback_parts_for_innermost_exception(error: Exception) -> list[str]:
     innermost_error = _get_only_innermost_exception(error)
     return traceback.format_exception(
-        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=True
+        type(innermost_error), value=innermost_error, tb=innermost_error.__traceback__, chain=False
     )
 
 
 def get_one_line_error_summary_if_possible(error: Exception) -> str:
     if isinstance(error, asyncio.exceptions.TimeoutError):
-        most_relevant_part = ',' + reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)[-1]
+        relevant_part = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
+        most_relevant_part = (',' + relevant_part[-1]) if len(relevant_part) else ''
+    elif isinstance(error, crawlee.errors.UserHandlerTimeoutError):
+        # Error is user defined handler. First two lines should be location of the `UserHandlerTimeoutError` in crawlee
+        # code and third line the topmost user error
+        traceback_parts = _get_traceback_parts_for_innermost_exception(error)
+        relevant_index_from_start = 3
+        most_relevant_part = traceback_parts[2] if len(traceback_parts) >= relevant_index_from_start else ''
     elif 'playwright._impl._errors.Error' in str(error.__class__):
         # Playwright autogenerated errors are often very long, so we do not try to summarize them at all as they anyway
         # point to deep internals.

crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py

@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING
 from bs4 import BeautifulSoup, Tag
 
 from crawlee._utils.docs import docs_group
-from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions
+from crawlee.crawlers import AbstractHttpCrawler, HttpCrawlerOptions
 
 from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
 from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ class BeautifulSoupCrawler(AbstractHttpCrawler[BeautifulSoupCrawlingContext, Bea
         self,
         *,
         parser: BeautifulSoupParserType = 'lxml',
-        **kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
+        **kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
     ) -> None:
         """Initialize a new instance.
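
The last hunk types BeautifulSoupCrawler's **kwargs as HttpCrawlerOptions instead of BasicCrawlerOptions, so HTTP-layer options are visible to type checkers alongside the generic crawler options. A small sketch, assuming the option names used below are unchanged in this release:

    from crawlee.crawlers import BeautifulSoupCrawler

    crawler = BeautifulSoupCrawler(
        parser='lxml',                              # parser selection, unchanged by this diff
        max_request_retries=2,                      # generic BasicCrawler option
        additional_http_error_status_codes=[403],   # HTTP-layer option covered by HttpCrawlerOptions
    )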