crawlee 1.0.1b9__py3-none-any.whl → 1.3.1b3__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of crawlee might be problematic.
Files changed (93)
  1. crawlee/__init__.py +2 -1
  2. crawlee/_browserforge_workaround.py +7 -3
  3. crawlee/_request.py +62 -32
  4. crawlee/_service_locator.py +4 -4
  5. crawlee/_types.py +52 -19
  6. crawlee/_utils/context.py +3 -3
  7. crawlee/_utils/file.py +8 -1
  8. crawlee/_utils/globs.py +4 -4
  9. crawlee/_utils/recoverable_state.py +32 -8
  10. crawlee/_utils/recurring_task.py +27 -3
  11. crawlee/_utils/robots.py +17 -5
  12. crawlee/_utils/sitemap.py +13 -6
  13. crawlee/_utils/system.py +27 -11
  14. crawlee/_utils/time.py +41 -1
  15. crawlee/_utils/urls.py +9 -2
  16. crawlee/browsers/_browser_pool.py +5 -2
  17. crawlee/browsers/_playwright_browser.py +2 -1
  18. crawlee/browsers/_playwright_browser_controller.py +1 -1
  19. crawlee/browsers/_playwright_browser_plugin.py +17 -3
  20. crawlee/browsers/_types.py +1 -1
  21. crawlee/configuration.py +3 -1
  22. crawlee/crawlers/__init__.py +5 -1
  23. crawlee/crawlers/_abstract_http/__init__.py +2 -1
  24. crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +54 -16
  25. crawlee/crawlers/_adaptive_playwright/__init__.py +5 -2
  26. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +21 -30
  27. crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py +6 -2
  28. crawlee/crawlers/_basic/_basic_crawler.py +160 -134
  29. crawlee/crawlers/_basic/_context_utils.py +24 -0
  30. crawlee/crawlers/_basic/_logging_utils.py +23 -4
  31. crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py +2 -2
  32. crawlee/crawlers/_parsel/_parsel_crawler.py +2 -2
  33. crawlee/crawlers/_parsel/_parsel_parser.py +1 -1
  34. crawlee/crawlers/_playwright/_playwright_crawler.py +68 -23
  35. crawlee/crawlers/_playwright/_playwright_http_client.py +7 -1
  36. crawlee/crawlers/_playwright/_playwright_pre_nav_crawling_context.py +4 -1
  37. crawlee/crawlers/_playwright/_types.py +12 -2
  38. crawlee/errors.py +4 -0
  39. crawlee/events/_event_manager.py +12 -6
  40. crawlee/fingerprint_suite/_browserforge_adapter.py +1 -1
  41. crawlee/fingerprint_suite/_header_generator.py +2 -2
  42. crawlee/http_clients/_base.py +4 -0
  43. crawlee/http_clients/_curl_impersonate.py +68 -14
  44. crawlee/http_clients/_httpx.py +16 -6
  45. crawlee/http_clients/_impit.py +25 -10
  46. crawlee/otel/crawler_instrumentor.py +4 -6
  47. crawlee/request_loaders/_sitemap_request_loader.py +23 -5
  48. crawlee/router.py +13 -3
  49. crawlee/sessions/_cookies.py +13 -8
  50. crawlee/sessions/_models.py +3 -3
  51. crawlee/sessions/_session_pool.py +1 -1
  52. crawlee/statistics/_error_snapshotter.py +1 -1
  53. crawlee/statistics/_models.py +51 -9
  54. crawlee/statistics/_statistics.py +24 -33
  55. crawlee/storage_clients/__init__.py +4 -0
  56. crawlee/storage_clients/_base/_dataset_client.py +2 -2
  57. crawlee/storage_clients/_base/_key_value_store_client.py +2 -2
  58. crawlee/storage_clients/_file_system/_dataset_client.py +8 -7
  59. crawlee/storage_clients/_file_system/_key_value_store_client.py +9 -6
  60. crawlee/storage_clients/_file_system/_request_queue_client.py +31 -12
  61. crawlee/storage_clients/_memory/_dataset_client.py +2 -2
  62. crawlee/storage_clients/_memory/_key_value_store_client.py +2 -2
  63. crawlee/storage_clients/_memory/_request_queue_client.py +2 -2
  64. crawlee/storage_clients/_redis/__init__.py +6 -0
  65. crawlee/storage_clients/_redis/_client_mixin.py +292 -0
  66. crawlee/storage_clients/_redis/_dataset_client.py +329 -0
  67. crawlee/storage_clients/_redis/_key_value_store_client.py +262 -0
  68. crawlee/storage_clients/_redis/_request_queue_client.py +583 -0
  69. crawlee/storage_clients/_redis/_storage_client.py +149 -0
  70. crawlee/storage_clients/_redis/_utils.py +23 -0
  71. crawlee/storage_clients/_redis/lua_scripts/atomic_bloom_add_requests.lua +36 -0
  72. crawlee/storage_clients/_redis/lua_scripts/atomic_fetch_request.lua +49 -0
  73. crawlee/storage_clients/_redis/lua_scripts/atomic_set_add_requests.lua +37 -0
  74. crawlee/storage_clients/_redis/lua_scripts/reclaim_stale_requests.lua +34 -0
  75. crawlee/storage_clients/_redis/py.typed +0 -0
  76. crawlee/storage_clients/_sql/_client_mixin.py +1 -1
  77. crawlee/storage_clients/_sql/_dataset_client.py +2 -2
  78. crawlee/storage_clients/_sql/_db_models.py +1 -2
  79. crawlee/storage_clients/_sql/_key_value_store_client.py +5 -4
  80. crawlee/storage_clients/_sql/_request_queue_client.py +20 -6
  81. crawlee/storage_clients/_sql/_storage_client.py +1 -1
  82. crawlee/storage_clients/models.py +8 -3
  83. crawlee/storages/_base.py +3 -1
  84. crawlee/storages/_dataset.py +3 -0
  85. crawlee/storages/_key_value_store.py +8 -2
  86. crawlee/storages/_request_queue.py +3 -0
  87. crawlee/storages/_storage_instance_manager.py +109 -42
  88. crawlee/storages/_utils.py +11 -0
  89. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/METADATA +14 -16
  90. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/RECORD +93 -79
  91. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/WHEEL +1 -1
  92. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/entry_points.txt +0 -0
  93. {crawlee-1.0.1b9.dist-info → crawlee-1.3.1b3.dist-info}/licenses/LICENSE +0 -0
--- a/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py
+++ b/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawling_context.py
@@ -17,7 +17,7 @@ if TYPE_CHECKING:
     from playwright.async_api import Page, Response
     from typing_extensions import Self
 
-    from crawlee.crawlers._playwright._types import BlockRequestsFunction
+    from crawlee.crawlers._playwright._types import BlockRequestsFunction, GotoOptions
 
 
 TStaticParseResult = TypeVar('TStaticParseResult')
@@ -190,8 +190,9 @@ class AdaptivePlaywrightCrawlingContext(
         http_response = await PlaywrightHttpResponse.from_playwright_response(
             response=context.response, protocol=protocol_guess or ''
         )
-        # block_requests is useful only on pre-navigation contexts. It is useless here.
+        # block_requests and goto_options are useful only on pre-navigation contexts. It is useless here.
         context_kwargs.pop('block_requests')
+        context_kwargs.pop('goto_options')
         return cls(
             parsed_content=await parser.parse(http_response),
             http_response=http_response,
@@ -212,6 +213,9 @@ class AdaptivePlaywrightPreNavCrawlingContext(BasicCrawlingContext):
     block_requests: BlockRequestsFunction | None = None
     """Blocks network requests matching specified URL patterns."""
 
+    goto_options: GotoOptions | None = None
+    """Additional options to pass to Playwright's `Page.goto()` method. The `timeout` option is not supported."""
+
     @property
     def page(self) -> Page:
         """The Playwright `Page` object for the current page.
--- a/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/crawlee/crawlers/_basic/_basic_crawler.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import asyncio
+import functools
 import logging
 import signal
 import sys
@@ -13,8 +14,9 @@ from collections.abc import AsyncGenerator, Awaitable, Callable, Iterable, Sequence
 from contextlib import AsyncExitStack, suppress
 from datetime import timedelta
 from functools import partial
+from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generic, Literal, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary
 
@@ -31,6 +33,8 @@ from crawlee._service_locator import ServiceLocator
 from crawlee._types import (
     BasicCrawlingContext,
     EnqueueLinksKwargs,
+    ExportDataCsvKwargs,
+    ExportDataJsonKwargs,
     GetKeyValueStoreFromRequestHandlerFunction,
     HttpHeaders,
     HttpPayload,
@@ -40,7 +44,7 @@ from crawlee._types import (
     SkippedReason,
 )
 from crawlee._utils.docs import docs_group
-from crawlee._utils.file import export_csv_to_stream, export_json_to_stream
+from crawlee._utils.file import atomic_write, export_csv_to_stream, export_json_to_stream
 from crawlee._utils.recurring_task import RecurringTask
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
@@ -55,6 +59,7 @@ from crawlee.errors import (
     RequestHandlerError,
     SessionError,
     UserDefinedErrorHandlerError,
+    UserHandlerTimeoutError,
 )
 from crawlee.events._types import Event, EventCrawlerStatusData
 from crawlee.http_clients import ImpitHttpClient
@@ -64,6 +69,7 @@ from crawlee.statistics import Statistics, StatisticsState
 from crawlee.storages import Dataset, KeyValueStore, RequestQueue
 
 from ._context_pipeline import ContextPipeline
+from ._context_utils import swapped_context
 from ._logging_utils import (
     get_one_line_error_summary_if_possible,
     reduce_asyncio_timeout_error_to_relevant_traceback_parts,
@@ -96,6 +102,9 @@ if TYPE_CHECKING:
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
 TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
 TRequestIterator = TypeVar('TRequestIterator', str, Request)
+TParams = ParamSpec('TParams')
+T = TypeVar('T')
+
 ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]]
 FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
 SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]
@@ -401,7 +410,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]()
 
         # Context pipeline
-        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)
+        self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects)  # ty: ignore[invalid-argument-type]
 
         # Crawl settings
         self._max_request_retries = max_request_retries
@@ -437,14 +446,23 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._statistics_log_format = statistics_log_format
 
         # Statistics
-        self._statistics = statistics or cast(
-            'Statistics[TStatisticsState]',
-            Statistics.with_default_state(
-                periodic_message_logger=self._logger,
-                statistics_log_format=self._statistics_log_format,
-                log_message='Current request statistics:',
-            ),
-        )
+        if statistics:
+            self._statistics = statistics
+        else:
+
+            async def persist_state_factory() -> KeyValueStore:
+                return await self.get_key_value_store()
+
+            self._statistics = cast(
+                'Statistics[TStatisticsState]',
+                Statistics.with_default_state(
+                    persistence_enabled=True,
+                    periodic_message_logger=self._logger,
+                    statistics_log_format=self._statistics_log_format,
+                    log_message='Current request statistics:',
+                    persist_state_kvs_factory=persist_state_factory,
+                ),
+            )
 
         # Additional context managers to enter and exit
         self._additional_context_managers = _additional_context_managers or []
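The default `Statistics` object is now created only when the caller does not supply one, and it persists its state through the crawler's own key-value store. A sketch of the two construction paths, assuming only the parameters visible in this hunk; the logger name is illustrative:

    import logging

    from crawlee.crawlers import BasicCrawler
    from crawlee.statistics import Statistics

    # Default path: the crawler builds Statistics itself and wires persistence
    # to its own key-value store via persist_state_factory (see above).
    crawler = BasicCrawler()

    # Explicit path: a supplied Statistics instance is used as-is.
    custom_statistics = Statistics.with_default_state(
        periodic_message_logger=logging.getLogger('my_crawler'),
        log_message='Current request statistics:',
    )
    crawler_with_custom_stats = BasicCrawler(statistics=custom_statistics)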
@@ -511,6 +529,24 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         self._logger.info(f'Crawler.stop() was called with following reason: {reason}.')
         self._unexpected_stop = True
 
+    def _wrap_handler_with_error_context(
+        self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]
+    ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]:
+        """Decorate error handlers to make their context helpers usable."""
+
+        @functools.wraps(handler)
+        async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T:
+            # Original context helpers that are from `RequestHandlerRunResult` will not be committed as the request
+            # failed. Modified context provides context helpers with direct access to the storages.
+            error_context = context.create_modified_copy(
+                push_data=self._push_data,
+                get_key_value_store=self.get_key_value_store,
+                add_requests=functools.partial(self._add_requests, context),
+            )
+            return await handler(error_context, exception)
+
+        return wrapped_handler
+
     def _stop_if_max_requests_count_exceeded(self) -> None:
         """Call `stop` when the maximum number of requests to crawl has been reached."""
         if self._max_requests_per_crawl is None:
@@ -609,7 +645,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The error handler is invoked after a request handler error occurs and before a retry attempt.
         """
-        self._error_handler = handler
+        self._error_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def failed_request_handler(
@@ -619,7 +655,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         The failed request handler is invoked when a request has failed all retry attempts.
         """
-        self._failed_request_handler = handler
+        self._failed_request_handler = self._wrap_handler_with_error_context(handler)
         return handler
 
     def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
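Because both decorators now pass the user handler through `_wrap_handler_with_error_context`, context helpers such as `push_data` write straight to the underlying storage even though the failed request's buffered results are never committed. A hedged sketch of a failed-request handler that records failures to the dataset; the `BasicCrawlingContext` import path mirrors the one used in this module and the handler body is illustrative:

    from crawlee._types import BasicCrawlingContext
    from crawlee.crawlers import BasicCrawler

    crawler = BasicCrawler()

    @crawler.failed_request_handler
    async def record_failure(context: BasicCrawlingContext, error: Exception) -> None:
        # With the wrapper above, push_data bypasses the per-request result buffer
        # and writes directly to the dataset.
        await context.push_data({'url': context.request.url, 'error': str(error)})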
@@ -689,7 +725,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         except CancelledError:
             pass
         finally:
-            await self._crawler_state_rec_task.stop()
             if threading.current_thread() is threading.main_thread():
                 with suppress(NotImplementedError):
                     asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
@@ -721,8 +756,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
     async def _run_crawler(self) -> None:
         event_manager = self._service_locator.get_event_manager()
 
-        self._crawler_state_rec_task.start()
-
         # Collect the context managers to be entered. Context managers that are already active are excluded,
         # as they were likely entered by the caller, who will also be responsible for exiting them.
         contexts_to_enter = [
@@ -733,6 +766,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                 self._statistics,
                 self._session_pool if self._use_session_pool else None,
                 self._http_client,
+                self._crawler_state_rec_task,
                 *self._additional_context_managers,
             )
             if cm and getattr(cm, 'active', False) is False
@@ -740,7 +774,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
         async with AsyncExitStack() as exit_stack:
             for context in contexts_to_enter:
-                await exit_stack.enter_async_context(context)  # type: ignore[arg-type]
+                await exit_stack.enter_async_context(context)  # ty: ignore[invalid-argument-type]
             await self._autoscaled_pool.run()
 
@@ -839,6 +873,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         dataset_alias: str | None = None,
+        **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs],
     ) -> None:
         """Export all items from a Dataset to a JSON or CSV file.
 
@@ -851,6 +886,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             dataset_id: The ID of the Dataset to export from.
             dataset_name: The name of the Dataset to export from (global scope, named storage).
             dataset_alias: The alias of the Dataset to export from (run scope, unnamed storage).
+            additional_kwargs: Extra keyword arguments forwarded to the JSON/CSV exporter depending on the file format.
         """
         dataset = await Dataset.open(
             id=dataset_id,
@@ -860,13 +896,18 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             configuration=self._service_locator.get_configuration(),
         )
 
-        path = path if isinstance(path, Path) else Path(path)
-        dst = path.open('w', newline='')
+        path = Path(path)
 
         if path.suffix == '.csv':
-            await export_csv_to_stream(dataset.iterate_items(), dst)
+            dst = StringIO()
+            csv_kwargs = cast('ExportDataCsvKwargs', additional_kwargs)
+            await export_csv_to_stream(dataset.iterate_items(), dst, **csv_kwargs)
+            await atomic_write(path, dst.getvalue())
        elif path.suffix == '.json':
-            await export_json_to_stream(dataset.iterate_items(), dst)
+            dst = StringIO()
+            json_kwargs = cast('ExportDataJsonKwargs', additional_kwargs)
+            await export_json_to_stream(dataset.iterate_items(), dst, **json_kwargs)
+            await atomic_write(path, dst.getvalue())
         else:
             raise ValueError(f'Unsupported file extension: {path.suffix}')
 
@@ -972,6 +1013,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                     label=label,
                     user_data=user_data,
                     transform_request_function=transform_request_function,
+                    **kwargs,
                 ),
                 rq_id=rq_id,
                 rq_name=rq_name,
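Taken together, the `export_data()` changes above buffer the export in memory, forward extra keyword arguments to the CSV or JSON serializer, and write the file atomically. A sketch of the call site, assuming the forwarded options mirror Python's `csv.writer` and `json.dump` parameters (the exact members of `ExportDataCsvKwargs`/`ExportDataJsonKwargs` are not shown in this diff):

    from crawlee.crawlers import BasicCrawler

    async def export_results(crawler: BasicCrawler) -> None:
        # 'delimiter' is assumed to be among the forwarded csv.writer options.
        await crawler.export_data('results.csv', delimiter=';')
        # 'indent' is assumed to be among the forwarded json.dump options.
        await crawler.export_data('results.json', indent=2)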
@@ -997,7 +1039,12 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             warning_flag = True
 
         for request in request_iterator:
-            target_url = request.url if isinstance(request, Request) else request
+            if isinstance(request, Request):
+                if request.enqueue_strategy != strategy:
+                    request.enqueue_strategy = strategy
+                target_url = request.url
+            else:
+                target_url = request
             parsed_target_url = urlparse(target_url)
 
             if warning_flag and strategy != 'all' and not parsed_target_url.hostname:
@@ -1009,9 +1056,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             ) and self._check_url_patterns(target_url, kwargs.get('include'), kwargs.get('exclude')):
                 yield request
 
-            limit = limit - 1 if limit is not None else None
-            if limit and limit <= 0:
-                break
+            if limit is not None:
+                limit -= 1
+                if limit <= 0:
+                    break
 
     def _check_enqueue_strategy(
         self,
@@ -1035,8 +1083,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
             return target_url.hostname == origin_url.hostname
 
         if strategy == 'same-domain':
-            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).domain
-            target_domain = self._tld_extractor.extract_str(target_url.hostname).domain
+            origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
+            target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
             return origin_domain == target_domain
 
         if strategy == 'same-origin':
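The `same-domain` strategy now compares `top_domain_under_public_suffix` (the registrable domain) instead of the bare `domain` label, so subdomains of one site still match while unrelated sites that merely share a second-level label no longer do. A small illustration using a standalone `tldextract.TLDExtract` instance, which is assumed to behave like the crawler's `_tld_extractor` (requires a tldextract release that provides this property):

    import tldextract

    extractor = tldextract.TLDExtract()

    a = extractor.extract_str('blog.example.co.uk')
    b = extractor.extract_str('shop.example.co.uk')
    c = extractor.extract_str('example.org')

    # Old comparison: only the bare label, so unrelated sites could be treated as equal.
    assert a.domain == b.domain == c.domain == 'example'

    # New comparison: the registrable domain keeps subdomains together while
    # separating different registrable domains.
    assert a.top_domain_under_public_suffix == b.top_domain_under_public_suffix == 'example.co.uk'
    assert c.top_domain_under_public_suffix == 'example.org'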
@@ -1094,7 +1142,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            request.retry_count += 1
            reduced_error = str(error).split('\n')[0]
            self.log.warning(
-                f'Retrying request to {context.request.url} due to: {reduced_error}'
+                f'Retrying request to {context.request.url} due to: {reduced_error}. '
                f'{get_one_line_error_summary_if_possible(error)}'
            )
            await self._statistics.error_tracker.add(error=error, context=context)
@@ -1105,19 +1153,15 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                except Exception as e:
                    raise UserDefinedErrorHandlerError('Exception thrown in user-defined request error handler') from e
                else:
-                    if new_request is not None:
-                        request = new_request
+                    if new_request is not None and new_request != request:
+                        await request_manager.add_request(new_request)
+                        await self._mark_request_as_handled(request)
+                        return
 
            await request_manager.reclaim_request(request)
        else:
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            request.state = RequestState.ERROR
+            await self._mark_request_as_handled(request)
            await self._handle_failed_request(context, error)
            self._statistics.record_request_processing_failure(request.unique_key)
 
@@ -1132,8 +1176,6 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                f'{self._internal_timeout.total_seconds()} seconds',
                logger=self._logger,
            )
-
-            context.request.state = RequestState.DONE
        except UserDefinedErrorHandlerError:
            context.request.state = RequestState.ERROR
            raise
@@ -1166,17 +1208,8 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
    ) -> None:
        if need_mark and isinstance(request, Request):
-            request_manager = await self.get_request_manager()
-
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
            request.state = RequestState.SKIPPED
+            await self._mark_request_as_handled(request)
 
        url = request.url if isinstance(request, Request) else request
 
@@ -1196,10 +1229,11 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
 
        if (
            isinstance(error, asyncio.exceptions.TimeoutError)
+            and traceback_parts
            and self._request_handler_timeout_text in traceback_parts[-1]
-        ):
+        ) or isinstance(error, UserHandlerTimeoutError):
            used_traceback_parts = reduce_asyncio_timeout_error_to_relevant_traceback_parts(error)
-            used_traceback_parts.append(traceback_parts[-1])
+            used_traceback_parts.extend(traceback_parts[-1:])
 
        return ''.join(used_traceback_parts).strip('\n')
 
@@ -1248,58 +1282,54 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            else:
                yield Request.from_url(url)
 
-    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
-        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
-        result = self._context_result_map[context]
-
-        base_request_manager = await self.get_request_manager()
-
-        origin = context.request.loaded_url or context.request.url
-
-        for add_requests_call in result.add_requests_calls:
-            rq_id = add_requests_call.get('rq_id')
-            rq_name = add_requests_call.get('rq_name')
-            rq_alias = add_requests_call.get('rq_alias')
-            specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None)
-            if specified_params > 1:
-                raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.')
-            if rq_id or rq_name or rq_alias:
-                request_manager: RequestManager | RequestQueue = await RequestQueue.open(
-                    id=rq_id,
-                    name=rq_name,
-                    alias=rq_alias,
-                    storage_client=self._service_locator.get_storage_client(),
-                    configuration=self._service_locator.get_configuration(),
-                )
-            else:
-                request_manager = base_request_manager
-
-            requests = list[Request]()
-
-            base_url = url if (url := add_requests_call.get('base_url')) else origin
-
-            requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url)
+    async def _add_requests(
+        self,
+        context: BasicCrawlingContext,
+        requests: Sequence[str | Request],
+        rq_id: str | None = None,
+        rq_name: str | None = None,
+        rq_alias: str | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> None:
+        """Add requests method aware of the crawling context."""
+        if rq_id or rq_name or rq_alias:
+            request_manager: RequestManager = await RequestQueue.open(
+                id=rq_id,
+                name=rq_name,
+                alias=rq_alias,
+                storage_client=self._service_locator.get_storage_client(),
+                configuration=self._service_locator.get_configuration(),
+            )
+        else:
+            request_manager = await self.get_request_manager()
 
-            enqueue_links_kwargs: EnqueueLinksKwargs = {k: v for k, v in add_requests_call.items() if k != 'requests'}  # type: ignore[assignment]
+        context_aware_requests = list[Request]()
+        base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url
+        requests_iterator = self._convert_url_to_request_iterator(requests, base_url)
+        filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs)
+        for dst_request in filter_requests_iterator:
+            # Update the crawl depth of the request.
+            dst_request.crawl_depth = context.request.crawl_depth + 1
 
-            filter_requests_iterator = self._enqueue_links_filter_iterator(
-                requests_iterator, context.request.url, **enqueue_links_kwargs
-            )
+            if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
+                context_aware_requests.append(dst_request)
 
-            for dst_request in filter_requests_iterator:
-                # Update the crawl depth of the request.
-                dst_request.crawl_depth = context.request.crawl_depth + 1
+        return await request_manager.add_requests(context_aware_requests)
 
-                if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth:
-                    requests.append(dst_request)
+    async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None:
+        """Commit request handler result for the input `context`. Result is taken from `_context_result_map`."""
+        result = self._context_result_map[context]
 
-            await request_manager.add_requests(requests)
+        for add_requests_call in result.add_requests_calls:
+            await self._add_requests(context, **add_requests_call)
 
        for push_data_call in result.push_data_calls:
            await self._push_data(**push_data_call)
 
        await self._commit_key_value_store_changes(result, get_kvs=self.get_key_value_store)
 
+        result.apply_request_changes(target=context.request)
+
    @staticmethod
    async def _commit_key_value_store_changes(
        result: RequestHandlerRunResult, get_kvs: GetKeyValueStoreFromRequestHandlerFunction
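Link enqueuing is now funnelled through the single `_add_requests()` helper, which both the result commit above and the error-handler wrapper reuse; a different request queue can be targeted via `rq_id`, `rq_name`, or `rq_alias`. A sketch of the corresponding call from a request handler, assuming the public `context.add_requests()` helper forwards these keyword arguments unchanged:

    from crawlee._types import BasicCrawlingContext
    from crawlee.crawlers import BasicCrawler

    crawler = BasicCrawler()

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # Relative URLs are resolved against the loaded page; rq_alias routes the
        # requests to a secondary, run-scoped queue instead of the default one.
        await context.add_requests(
            ['/category/1', 'https://example.com/category/2'],
            rq_alias='follow-up',
        )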
@@ -1365,10 +1395,10 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        else:
            session = await self._get_session()
        proxy_info = await self._get_proxy_info(request, session)
-        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store)
+        result = RequestHandlerRunResult(key_value_store_getter=self.get_key_value_store, request=request)
 
        context = BasicCrawlingContext(
-            request=request,
+            request=result.request,
            session=session,
            proxy_info=proxy_info,
            send_request=self._prepare_send_request_function(session, proxy_info),
@@ -1385,32 +1415,26 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        try:
            request.state = RequestState.REQUEST_HANDLER
 
-            self._check_request_collision(context.request, context.session)
-
            try:
-                await self._run_request_handler(context=context)
+                with swapped_context(context, request):
+                    self._check_request_collision(request, session)
+                    await self._run_request_handler(context=context)
            except asyncio.TimeoutError as e:
                raise RequestHandlerError(e, context) from e
 
            await self._commit_request_handler_result(context)
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
 
            request.state = RequestState.DONE
 
-            if context.session and context.session.is_usable:
-                context.session.mark_good()
+            await self._mark_request_as_handled(request)
+
+            if session and session.is_usable:
+                session.mark_good()
 
            self._statistics.record_request_processing_finish(request.unique_key)
 
        except RequestCollisionError as request_error:
-            context.request.no_retry = True
+            request.no_retry = True
            await self._handle_request_error(context, request_error)
 
        except RequestHandlerError as primary_error:
@@ -1425,7 +1449,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            await self._handle_request_error(primary_error.crawling_context, primary_error.wrapped_exception)
 
        except SessionError as session_error:
-            if not context.session:
+            if not session:
                raise RuntimeError('SessionError raised in a crawling context without a session') from session_error
 
            if self._error_handler:
@@ -1435,22 +1459,16 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
                exc_only = ''.join(traceback.format_exception_only(session_error)).strip()
                self._logger.warning('Encountered "%s", rotating session and retrying...', exc_only)
 
-                context.session.retire()
+                if session:
+                    session.retire()
 
                # Increment session rotation count.
-                context.request.session_rotation_count = (context.request.session_rotation_count or 0) + 1
+                request.session_rotation_count = (request.session_rotation_count or 0) + 1
 
                await request_manager.reclaim_request(request)
                await self._statistics.error_tracker_retry.add(error=session_error, context=context)
            else:
-                await wait_for(
-                    lambda: request_manager.mark_request_as_handled(context.request),
-                    timeout=self._internal_timeout,
-                    timeout_message='Marking request as handled timed out after '
-                    f'{self._internal_timeout.total_seconds()} seconds',
-                    logger=self._logger,
-                    max_retries=3,
-                )
+                await self._mark_request_as_handled(request)
 
                await self._handle_failed_request(context, session_error)
                self._statistics.record_request_processing_failure(request.unique_key)
@@ -1458,14 +1476,7 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        except ContextPipelineInterruptedError as interrupted_error:
            self._logger.debug('The context pipeline was interrupted', exc_info=interrupted_error)
 
-            await wait_for(
-                lambda: request_manager.mark_request_as_handled(context.request),
-                timeout=self._internal_timeout,
-                timeout_message='Marking request as handled timed out after '
-                f'{self._internal_timeout.total_seconds()} seconds',
-                logger=self._logger,
-                max_retries=3,
-            )
+            await self._mark_request_as_handled(request)
 
        except ContextPipelineInitializationError as initialization_error:
            self._logger.debug(
@@ -1483,12 +1494,16 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
            raise
 
    async def _run_request_handler(self, context: BasicCrawlingContext) -> None:
-        await wait_for(
-            lambda: self._context_pipeline(context, self.router),
-            timeout=self._request_handler_timeout,
-            timeout_message=f'{self._request_handler_timeout_text}'
-            f' {self._request_handler_timeout.total_seconds()} seconds',
-            logger=self._logger,
+        context.request.state = RequestState.BEFORE_NAV
+        await self._context_pipeline(
+            context,
+            lambda final_context: wait_for(
+                lambda: self.router(final_context),
+                timeout=self._request_handler_timeout,
+                timeout_message=f'{self._request_handler_timeout_text}'
+                f' {self._request_handler_timeout.total_seconds()} seconds',
+                logger=self._logger,
+            ),
        )
 
    def _raise_for_error_status_code(self, status_code: int) -> None:
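`_run_request_handler` now marks the request as `BEFORE_NAV` and applies `request_handler_timeout` only to the router call at the end of the context pipeline, instead of to the whole pipeline run. A sketch of the knob this affects, assuming the constructor parameter keeps its existing name:

    from datetime import timedelta

    from crawlee.crawlers import BasicCrawler

    # The timeout now bounds only the user request handler; earlier pipeline steps
    # such as navigation are no longer counted against it.
    crawler = BasicCrawler(request_handler_timeout=timedelta(seconds=30))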
@@ -1636,3 +1651,14 @@ class BasicCrawler(Generic[TCrawlingContext, TStatisticsState]):
        )
 
        self._previous_crawler_state = current_state
+
+    async def _mark_request_as_handled(self, request: Request) -> None:
+        request_manager = await self.get_request_manager()
+        await wait_for(
+            lambda: request_manager.mark_request_as_handled(request),
+            timeout=self._internal_timeout,
+            timeout_message='Marking request as handled timed out after '
+            f'{self._internal_timeout.total_seconds()} seconds',
+            logger=self._logger,
+            max_retries=3,
+        )
--- /dev/null
+++ b/crawlee/crawlers/_basic/_context_utils.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from crawlee._request import Request
+
+    from ._basic_crawling_context import BasicCrawlingContext
+
+
+@contextmanager
+def swapped_context(
+    context: BasicCrawlingContext,
+    request: Request,
+) -> Iterator[None]:
+    """Replace context's isolated copies with originals after handler execution."""
+    try:
+        yield
+    finally:
+        # Restore original context state to avoid side effects between different handlers.
+        object.__setattr__(context, 'request', request)